crawlo 1.4.4__py3-none-any.whl → 1.4.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic. Click here for more details.

Files changed (85):
  1. crawlo/__init__.py +11 -15
  2. crawlo/__version__.py +1 -1
  3. crawlo/commands/startproject.py +24 -0
  4. crawlo/core/engine.py +2 -2
  5. crawlo/core/scheduler.py +4 -4
  6. crawlo/crawler.py +8 -7
  7. crawlo/downloader/__init__.py +5 -2
  8. crawlo/extension/__init__.py +2 -2
  9. crawlo/filters/aioredis_filter.py +8 -1
  10. crawlo/filters/memory_filter.py +8 -1
  11. crawlo/initialization/built_in.py +13 -4
  12. crawlo/initialization/core.py +5 -4
  13. crawlo/interfaces.py +24 -0
  14. crawlo/middleware/__init__.py +7 -4
  15. crawlo/middleware/middleware_manager.py +15 -8
  16. crawlo/mode_manager.py +45 -11
  17. crawlo/network/response.py +374 -69
  18. crawlo/pipelines/mysql_pipeline.py +6 -6
  19. crawlo/pipelines/pipeline_manager.py +2 -2
  20. crawlo/project.py +2 -4
  21. crawlo/settings/default_settings.py +4 -0
  22. crawlo/task_manager.py +2 -2
  23. crawlo/templates/project/items.py.tmpl +2 -2
  24. crawlo/templates/project/middlewares.py.tmpl +9 -89
  25. crawlo/templates/project/pipelines.py.tmpl +8 -68
  26. crawlo/tools/__init__.py +0 -11
  27. crawlo/utils/__init__.py +17 -1
  28. crawlo/utils/db_helper.py +220 -319
  29. crawlo/utils/error_handler.py +313 -67
  30. crawlo/utils/fingerprint.py +3 -4
  31. crawlo/utils/misc.py +82 -0
  32. crawlo/utils/request.py +55 -66
  33. crawlo/utils/selector_helper.py +138 -0
  34. crawlo/utils/spider_loader.py +185 -45
  35. crawlo/utils/text_helper.py +95 -0
  36. crawlo-1.4.5.dist-info/METADATA +329 -0
  37. {crawlo-1.4.4.dist-info → crawlo-1.4.5.dist-info}/RECORD +76 -49
  38. tests/bug_check_test.py +251 -0
  39. tests/direct_selector_helper_test.py +97 -0
  40. tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -0
  41. tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -0
  42. tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -0
  43. tests/ofweek_scrapy/ofweek_scrapy/settings.py +85 -0
  44. tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -0
  45. tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +162 -0
  46. tests/ofweek_scrapy/scrapy.cfg +11 -0
  47. tests/performance_comparison.py +4 -5
  48. tests/simple_crawlo_test.py +1 -2
  49. tests/simple_follow_test.py +39 -0
  50. tests/simple_response_selector_test.py +95 -0
  51. tests/simple_selector_helper_test.py +155 -0
  52. tests/simple_selector_test.py +208 -0
  53. tests/simple_url_test.py +74 -0
  54. tests/test_crawler_process_import.py +39 -0
  55. tests/test_crawler_process_spider_modules.py +48 -0
  56. tests/test_edge_cases.py +7 -5
  57. tests/test_encoding_core.py +57 -0
  58. tests/test_encoding_detection.py +127 -0
  59. tests/test_factory_compatibility.py +197 -0
  60. tests/test_optimized_selector_naming.py +101 -0
  61. tests/test_priority_behavior.py +18 -18
  62. tests/test_response_follow.py +105 -0
  63. tests/test_response_selector_methods.py +93 -0
  64. tests/test_response_url_methods.py +71 -0
  65. tests/test_response_urljoin.py +87 -0
  66. tests/test_scrapy_style_encoding.py +113 -0
  67. tests/test_selector_helper.py +101 -0
  68. tests/test_selector_optimizations.py +147 -0
  69. tests/test_spider_loader.py +50 -0
  70. tests/test_spider_loader_comprehensive.py +70 -0
  71. tests/test_spiders/__init__.py +1 -0
  72. tests/test_spiders/test_spider.py +10 -0
  73. crawlo/tools/anti_crawler.py +0 -269
  74. crawlo/utils/class_loader.py +0 -26
  75. crawlo/utils/enhanced_error_handler.py +0 -357
  76. crawlo-1.4.4.dist-info/METADATA +0 -190
  77. tests/simple_log_test.py +0 -58
  78. tests/simple_test.py +0 -48
  79. tests/test_framework_logger.py +0 -67
  80. tests/test_framework_startup.py +0 -65
  81. tests/test_mode_change.py +0 -73
  82. {crawlo-1.4.4.dist-info → crawlo-1.4.5.dist-info}/WHEEL +0 -0
  83. {crawlo-1.4.4.dist-info → crawlo-1.4.5.dist-info}/entry_points.txt +0 -0
  84. {crawlo-1.4.4.dist-info → crawlo-1.4.5.dist-info}/top_level.txt +0 -0
  85. /tests/{final_command_test_report.md → ofweek_scrapy/ofweek_scrapy/__init__.py} +0 -0
@@ -1,54 +1,142 @@
1
1
  #!/usr/bin/python
2
2
  # -*- coding:UTF-8 -*-
3
3
  """
4
- 统一错误处理工具
5
- 提供一致的错误处理和日志记录机制
4
+ 错误处理工具
5
+ 提供详细、一致的错误处理和日志记录机制
6
6
  """
7
- import time
8
7
  import traceback
9
8
  from functools import wraps
10
- from typing import Callable, Any
9
+ from datetime import datetime
10
+ from typing import Optional, Callable, Any, Dict, List
11
11
 
12
12
  from crawlo.utils.log import get_logger
13
13
 
14
14
 
15
- class ErrorHandler:
16
- """统一错误处理器(简化版,避免循环依赖)"""
15
class ErrorContext:
    """Describes where an error occurred.

    Attributes:
        context: Free-form description of what was being done.
        module: Name of the module involved (may be empty).
        function: Name of the function involved (may be empty).
        timestamp: ``datetime.now()`` captured when the instance is created.
    """

    def __init__(self, context: str = "", module: str = "", function: str = ""):
        self.timestamp = datetime.now()
        self.function = function
        self.module = module
        self.context = context

    def __str__(self):
        # Empty fields are left out; the timestamp is always the last segment.
        labelled = (
            ("Module", self.module),
            ("Function", self.function),
            ("Context", self.context),
        )
        segments = [f"{label}: {value}" for label, value in labelled if value]
        segments.append(f"Time: {self.timestamp.strftime('%Y-%m-%d %H:%M:%S')}")
        return " | ".join(segments)
34
+
35
+
36
class DetailedException(Exception):
    """Base exception that carries structured diagnostic information.

    Args:
        message: Human-readable error message.
        context: Optional ``ErrorContext`` describing where the error occurred.
        error_code: Optional application-defined error code.
        **kwargs: Arbitrary extra details, stored as-is in ``self.details``.
    """

    def __init__(self, message: str, context: Optional[ErrorContext] = None,
                 error_code: Optional[str] = None, **kwargs):
        super().__init__(message)
        self.timestamp = datetime.now()
        self.details = kwargs
        self.error_code = error_code
        self.context = context

    def __str__(self):
        # Append the context in parentheses only when one is attached.
        text = super().__str__()
        return f"{text} ({self.context})" if self.context else text

    def get_full_details(self) -> Dict:
        """Return every recorded detail of this error as a dictionary."""
        context_text = None
        if self.context:
            context_text = str(self.context)
        return dict(
            message=str(self),
            error_code=self.error_code,
            context=context_text,
            details=self.details,
            timestamp=self.timestamp.isoformat(),
            exception_type=self.__class__.__name__,
        )
63
+
17
64
 
65
+ class EnhancedErrorHandler:
66
+ """增强版错误处理器"""
67
+
18
68
  def __init__(self, logger_name: str = __name__, log_level: str = 'ERROR'):
19
- # 延迟初始化logger避免循环依赖
20
- self._logger = None
21
- self.logger_name = logger_name
22
- self.log_level = log_level
23
-
24
- @property
25
- def logger(self):
26
- if self._logger is None:
27
- self._logger = get_logger(self.logger_name)
28
- return self._logger
29
-
30
- def handle_error(self, exception: Exception, context: str = "",
31
- raise_error: bool = True, log_error: bool = True) -> None:
69
+ self.logger = get_logger(logger_name, log_level)
70
+ self.error_history: List[Dict] = [] # 错误历史记录
71
+ self.max_history_size = 100 # 最大历史记录数
72
+
73
+ def handle_error(self, exception: Exception, context: Optional[ErrorContext] = None,
74
+ raise_error: bool = True, log_error: bool = True,
75
+ extra_info: Optional[Dict] = None) -> Dict:
32
76
  """
33
- 统一处理错误
77
+ 增强版错误处理
34
78
 
35
79
  Args:
36
80
  exception: 异常对象
37
- context: 错误上下文描述
81
+ context: 错误上下文信息
38
82
  raise_error: 是否重新抛出异常
39
83
  log_error: 是否记录错误日志
84
+ extra_info: 额外的错误信息
85
+
86
+ Returns:
87
+ 包含错误详情的字典
40
88
  """
89
+ # 构建错误详情
90
+ error_details = {
91
+ "exception": exception,
92
+ "exception_type": type(exception).__name__,
93
+ "message": str(exception),
94
+ "context": str(context) if context else None,
95
+ "timestamp": datetime.now().isoformat(),
96
+ "traceback": traceback.format_exc() if log_error else None,
97
+ "extra_info": extra_info or {}
98
+ }
99
+
100
+ # 记录到历史
101
+ self._record_error(error_details)
102
+
103
+ # 记录日志
41
104
  if log_error:
42
- error_msg = f"Error in {context}: {str(exception)}" if context else str(exception)
43
- self.logger.error(error_msg)
44
- # 在DEBUG级别记录详细的堆栈跟踪
45
- self.logger.debug(f"详细错误信息:\n{traceback.format_exc()}")
46
-
105
+ self._log_error(error_details)
106
+
107
+ # 重新抛出异常
47
108
  if raise_error:
48
109
  raise exception
49
-
50
- def safe_call(self, func: Callable, *args, default_return=None,
51
- context: str = "", **kwargs) -> Any:
110
+
111
+ return error_details
112
+
113
+ def _log_error(self, error_details: Dict):
114
+ """记录错误日志"""
115
+ # 基本错误信息
116
+ context_info = error_details.get("context", "")
117
+ message = error_details["message"]
118
+ error_msg = f"{message} [{context_info}]" if context_info else message
119
+
120
+ # 记录错误
121
+ self.logger.error(error_msg)
122
+
123
+ # 记录详细信息
124
+ if error_details.get("traceback"):
125
+ self.logger.debug(f"详细错误信息:\n{error_details['traceback']}")
126
+
127
+ # 记录额外信息
128
+ if error_details.get("extra_info"):
129
+ self.logger.debug(f"额外信息: {error_details['extra_info']}")
130
+
131
+ def _record_error(self, error_details: Dict):
132
+ """记录错误到历史"""
133
+ self.error_history.append(error_details)
134
+ # 限制历史记录大小
135
+ if len(self.error_history) > self.max_history_size:
136
+ self.error_history.pop(0)
137
+
138
+ def safe_call(self, func: Callable, *args, default_return=None,
139
+ context: Optional[ErrorContext] = None, **kwargs) -> Any:
52
140
  """
53
141
  安全调用函数,捕获并处理异常
54
142
 
@@ -56,7 +144,7 @@ class ErrorHandler:
56
144
  func: 要调用的函数
57
145
  *args: 函数参数
58
146
  default_return: 默认返回值
59
- context: 错误上下文描述
147
+ context: 错误上下文
60
148
  **kwargs: 函数关键字参数
61
149
 
62
150
  Returns:
@@ -67,77 +155,234 @@ class ErrorHandler:
67
155
  except Exception as e:
68
156
  self.handle_error(e, context=context, raise_error=False)
69
157
  return default_return
70
-
71
- def retry_on_failure(self, max_retries: int = 3, delay: float = 1.0,
72
- exceptions: tuple = (Exception,)):
158
+
159
+ def retry_on_failure(self, max_retries: int = 3, delay: float = 1.0,
160
+ exceptions: tuple = (Exception,), backoff_factor: float = 1.0,
161
+ context: Optional[ErrorContext] = None):
73
162
  """
74
- 装饰器:失败时重试
163
+ 装饰器:失败时重试(增强版)
75
164
 
76
165
  Args:
77
166
  max_retries: 最大重试次数
78
- delay: 重试间隔(秒)
167
+ delay: 初始重试间隔(秒)
79
168
  exceptions: 需要重试的异常类型
169
+ backoff_factor: 退避因子(每次重试间隔乘以此因子)
170
+ context: 错误上下文
80
171
  """
81
-
82
172
  def decorator(func):
83
173
  @wraps(func)
84
- def wrapper(*args, **kwargs):
174
+ async def async_wrapper(*args, **kwargs):
175
+ last_exception = None
176
+ current_delay = delay
177
+
178
+ for attempt in range(max_retries + 1):
179
+ try:
180
+ return await func(*args, **kwargs)
181
+ except exceptions as e:
182
+ last_exception = e
183
+ if attempt < max_retries:
184
+ # 记录重试信息
185
+ retry_context = ErrorContext(
186
+ context=f"函数 {func.__name__} 执行失败 (尝试 {attempt + 1}/{max_retries + 1})",
187
+ module=context.module if context else "",
188
+ function=func.__name__
189
+ ) if context else None
190
+
191
+ self.logger.warning(
192
+ f"函数 {func.__name__} 执行失败 (尝试 {attempt + 1}/{max_retries + 1}): {e}"
193
+ )
194
+
195
+ import asyncio
196
+ await asyncio.sleep(current_delay)
197
+ current_delay *= backoff_factor # 指数退避
198
+ else:
199
+ # 最后一次尝试失败
200
+ final_context = ErrorContext(
201
+ context=f"函数 {func.__name__} 执行失败,已达到最大重试次数",
202
+ module=context.module if context else "",
203
+ function=func.__name__
204
+ ) if context else None
205
+
206
+ self.logger.error(
207
+ f"函数 {func.__name__} 执行失败,已达到最大重试次数: {e}"
208
+ )
209
+ raise last_exception
210
+
211
+ @wraps(func)
212
+ def sync_wrapper(*args, **kwargs):
85
213
  last_exception = None
214
+ current_delay = delay
215
+
86
216
  for attempt in range(max_retries + 1):
87
217
  try:
88
218
  return func(*args, **kwargs)
89
219
  except exceptions as e:
90
220
  last_exception = e
91
221
  if attempt < max_retries:
92
- self.logger.warning(f"Attempt {attempt + 1} failed, retrying in {delay}s: {e}")
93
- time.sleep(delay)
222
+ # 记录重试信息
223
+ retry_context = ErrorContext(
224
+ context=f"函数 {func.__name__} 执行失败 (尝试 {attempt + 1}/{max_retries + 1})",
225
+ module=context.module if context else "",
226
+ function=func.__name__
227
+ ) if context else None
228
+
229
+ self.logger.warning(
230
+ f"函数 {func.__name__} 执行失败 (尝试 {attempt + 1}/{max_retries + 1}): {e}"
231
+ )
232
+
233
+ import time
234
+ time.sleep(current_delay)
235
+ current_delay *= backoff_factor # 指数退避
94
236
  else:
95
- self.logger.error(f"All {max_retries + 1} attempts failed")
96
- raise e
97
- return None
98
-
99
- return wrapper
100
-
237
+ # 最后一次尝试失败
238
+ final_context = ErrorContext(
239
+ context=f"函数 {func.__name__} 执行失败,已达到最大重试次数",
240
+ module=context.module if context else "",
241
+ function=func.__name__
242
+ ) if context else None
243
+
244
+ self.logger.error(
245
+ f"函数 {func.__name__} 执行失败,已达到最大重试次数: {e}"
246
+ )
247
+ raise last_exception
248
+
249
+ # 根据函数是否为异步函数返回相应的包装器
250
+ import inspect
251
+ if inspect.iscoroutinefunction(func):
252
+ return async_wrapper
253
+ else:
254
+ return sync_wrapper
255
+
101
256
  return decorator
257
+
258
+ def get_error_history(self) -> List[Dict]:
259
+ """获取错误历史记录"""
260
+ return self.error_history.copy()
261
+
262
+ def clear_error_history(self):
263
+ """清空错误历史记录"""
264
+ self.error_history.clear()
102
265
 
103
266
 
104
- # 全局错误处理器实例(延迟初始化)
105
- _default_error_handler = None
106
-
107
-
108
- def get_default_error_handler():
109
- """Get the default error handler with lazy initialization"""
110
- global _default_error_handler
111
- if _default_error_handler is None:
112
- _default_error_handler = ErrorHandler()
113
- return _default_error_handler
267
+ # 为了向后兼容,提供与旧版error_handler.py相同的接口
114
268
 
269
+ # 别名,保持与旧版接口一致
270
+ ErrorHandler = EnhancedErrorHandler
115
271
 
116
- # 为了向后兼容,保留老的接口
117
- def __getattr__(name):
118
- if name == 'default_error_handler':
119
- return get_default_error_handler()
120
- raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
272
+ # 全局增强错误处理器实例
273
+ enhanced_error_handler = EnhancedErrorHandler()
121
274
 
275
+ # 为了向后兼容,提供默认错误处理器实例
276
+ default_error_handler = enhanced_error_handler
122
277
 
123
- def handle_exception(context: str = "", raise_error: bool = True, log_error: bool = True):
278
def handle_exception(context: str = "", module: str = "", function: str = "",
                     raise_error: bool = True, log_error: bool = True,
                     error_code: Optional[str] = None):
    """Decorator factory that routes exceptions through ``enhanced_error_handler``.

    Works for both plain and ``async`` functions; the matching wrapper is
    chosen with ``inspect.iscoroutinefunction``.

    Args:
        context: Description of the operation; combined with the decorated
            function's name in the reported context.
        module: Module name recorded in the error context.
        function: Function name recorded in the error context. Defaults to
            the decorated function's ``__name__`` when left empty.
        raise_error: Passed to ``handle_error``; when true the reported
            exception is raised, otherwise the wrapper returns ``None``.
        log_error: Passed to ``handle_error``; whether the error is logged.
        error_code: Error code attached when a non-``DetailedException`` is
            wrapped into a ``DetailedException``.

    Returns:
        A decorator that wraps the target function.
    """
    def decorator(func):
        def _report(exc: Exception) -> None:
            # Shared failure path for the sync and async wrappers. It must be
            # called from inside the ``except`` block so the handler's
            # ``traceback.format_exc()`` still sees the active exception.
            label = f"{context} - {func.__name__}" if context else func.__name__
            error_context = ErrorContext(
                context=label,
                module=module,
                function=function or func.__name__,
            )

            if isinstance(exc, DetailedException):
                # Keep the information the exception already carries and
                # only fill in a missing context.
                if not exc.context:
                    exc.context = error_context
                reported = exc
                reported_context = exc.context
            else:
                # Wrap as a DetailedException, chained explicitly so the
                # original exception stays reachable via ``__cause__``.
                reported = DetailedException(
                    str(exc), context=error_context, error_code=error_code
                )
                reported.__cause__ = exc
                reported_context = error_context

            enhanced_error_handler.handle_error(
                reported, context=reported_context,
                raise_error=raise_error, log_error=log_error
            )

        @wraps(func)
        async def async_wrapper(*args, **kwargs):
            try:
                return await func(*args, **kwargs)
            except Exception as e:
                _report(e)  # raises when raise_error is true
                return None

        @wraps(func)
        def sync_wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                _report(e)  # raises when raise_error is true
                return None

        # Return the wrapper that matches the decorated function's kind.
        import inspect
        if inspect.iscoroutinefunction(func):
            return async_wrapper
        return sync_wrapper

    return decorator
132
365
 
366
+ # 为了完全向后兼容,也提供旧版的handle_exception接口
367
+ # 这个版本与error_handler.py中的接口完全一致
368
+ def handle_exception_simple(context: str = "", raise_error: bool = True, log_error: bool = True):
369
+ """
370
+ 装饰器:处理函数异常(简化版,与旧版接口兼容)
371
+
372
+ Args:
373
+ context: 错误上下文描述
374
+ raise_error: 是否重新抛出异常
375
+ log_error: 是否记录错误日志
376
+ """
133
377
  def decorator(func):
134
378
  @wraps(func)
135
379
  async def async_wrapper(*args, **kwargs):
136
380
  try:
137
381
  return await func(*args, **kwargs)
138
382
  except Exception as e:
139
- default_error_handler.handle_error(
140
- e, context=f"{context} - {func.__name__}",
383
+ error_context = ErrorContext(context=f"{context} - {func.__name__}")
384
+ enhanced_error_handler.handle_error(
385
+ e, context=error_context,
141
386
  raise_error=raise_error, log_error=log_error
142
387
  )
143
388
  if not raise_error:
@@ -148,8 +393,9 @@ def handle_exception(context: str = "", raise_error: bool = True, log_error: boo
148
393
  try:
149
394
  return func(*args, **kwargs)
150
395
  except Exception as e:
151
- default_error_handler.handle_error(
152
- e, context=f"{context} - {func.__name__}",
396
+ error_context = ErrorContext(context=f"{context} - {func.__name__}")
397
+ enhanced_error_handler.handle_error(
398
+ e, context=error_context,
153
399
  raise_error=raise_error, log_error=log_error
154
400
  )
155
401
  if not raise_error:
@@ -162,4 +408,4 @@ def handle_exception(context: str = "", raise_error: bool = True, log_error: boo
162
408
  else:
163
409
  return sync_wrapper
164
410
 
165
- return decorator
411
+ return decorator
@@ -13,9 +13,8 @@
13
13
  """
14
14
 
15
15
  import hashlib
16
- import json
17
- from typing import Any, Dict, Union
18
- from collections import namedtuple
16
+ from typing import Any, Dict
17
+ from w3lib.url import canonicalize_url
19
18
 
20
19
 
21
20
  def generate_data_fingerprint(data: Any) -> str:
@@ -73,7 +72,7 @@ def generate_request_fingerprint(
73
72
 
74
73
  # 基本字段
75
74
  hash_func.update(method.encode('utf-8'))
76
- hash_func.update(url.encode('utf-8'))
75
+ hash_func.update(canonicalize_url(url).encode('utf-8'))
77
76
  hash_func.update(body or b'')
78
77
 
79
78
  # 可选的 headers
crawlo/utils/misc.py ADDED
@@ -0,0 +1,82 @@
1
+ import importlib
2
+ import pkgutil
3
+ from typing import Iterator, Any, Type
4
+
5
+ from crawlo.spider import Spider
6
+
7
+
8
def walk_modules(module_path: str) -> Iterator[Any]:
    """Import ``module_path`` and yield it, then each importable submodule once.

    Traversal is depth-first: a package is yielded before its children.
    Submodules that fail to import are skipped.

    Args:
        module_path: Dotted path of the module or package to walk.

    Yields:
        The imported module objects, each exactly once.

    Raises:
        ImportError: If ``module_path`` itself cannot be imported.
    """
    module = importlib.import_module(module_path)
    yield module

    # Only packages expose ``__path__``; a plain module has nothing to walk.
    search_path = getattr(module, '__path__', None)
    if search_path is None:
        return

    # iter_modules lists direct children only and imports nothing itself.
    # The recursive call yields each child (and, for packages, its
    # descendants), so a sub-package is never emitted twice.
    for _finder, child_name, _is_pkg in pkgutil.iter_modules(search_path):
        child_path = f"{module_path}.{child_name}"
        try:
            yield from walk_modules(child_path)
        except ImportError:
            # Skip children that cannot be imported.
            continue
39
+
40
+
41
def iter_spider_classes(module) -> Iterator[Type[Spider]]:
    """Yield the ``Spider`` subclasses exposed as attributes of ``module``.

    Args:
        module: The module object to inspect.

    Yields:
        Each attribute that is a class, a subclass of ``Spider`` other than
        ``Spider`` itself, and has a ``name`` attribute.
    """
    for member_name in dir(module):
        candidate = getattr(module, member_name)
        if not isinstance(candidate, type):
            continue
        if not issubclass(candidate, Spider):
            continue
        # The base class itself is not a concrete spider.
        if not (candidate != Spider):
            continue
        if not hasattr(candidate, 'name'):
            continue
        yield candidate
58
+
59
+
60
def load_object(path: str):
    """Load an object from an import path.

    Args:
        path: Either ``module.submodule:object_name`` or
            ``module.submodule.object_name``. When a ``:`` is present the
            path is split at the first one; otherwise at the last ``.``.

    Returns:
        The attribute ``object_name`` of the imported module.

    Raises:
        ImportError: If the path has no module part or no object name, the
            module cannot be imported, or the module lacks the attribute.
            The underlying error, when there is one, is chained as
            ``__cause__``.
    """
    if ':' in path:
        module_path, _, obj_name = path.partition(':')
    else:
        module_path, _, obj_name = path.rpartition('.')

    # Reject malformed paths up front so callers get the same ImportError as
    # for every other failure, not a ValueError from unpacking or from
    # importing an empty module name.
    if not module_path or not obj_name:
        raise ImportError(
            f"Could not load object from path '{path}': expected "
            "'module.submodule:object_name' or 'module.submodule.object_name'"
        )

    try:
        module = importlib.import_module(module_path)
        return getattr(module, obj_name)
    except (ImportError, AttributeError) as e:
        raise ImportError(f"Could not load object from path '{path}': {e}") from e