crawlo 1.4.3-py3-none-any.whl → 1.4.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic.

Files changed (107)
  1. crawlo/__init__.py +11 -15
  2. crawlo/__version__.py +1 -1
  3. crawlo/commands/genspider.py +52 -17
  4. crawlo/commands/startproject.py +24 -0
  5. crawlo/core/engine.py +2 -2
  6. crawlo/core/scheduler.py +4 -4
  7. crawlo/crawler.py +13 -6
  8. crawlo/downloader/__init__.py +5 -2
  9. crawlo/extension/__init__.py +2 -2
  10. crawlo/filters/aioredis_filter.py +8 -1
  11. crawlo/filters/memory_filter.py +8 -1
  12. crawlo/initialization/built_in.py +13 -4
  13. crawlo/initialization/core.py +5 -4
  14. crawlo/interfaces.py +24 -0
  15. crawlo/middleware/__init__.py +7 -4
  16. crawlo/middleware/middleware_manager.py +15 -8
  17. crawlo/mode_manager.py +45 -11
  18. crawlo/network/response.py +374 -69
  19. crawlo/pipelines/mysql_pipeline.py +6 -6
  20. crawlo/pipelines/pipeline_manager.py +2 -2
  21. crawlo/project.py +2 -4
  22. crawlo/queue/pqueue.py +2 -6
  23. crawlo/queue/queue_manager.py +1 -2
  24. crawlo/settings/default_settings.py +15 -30
  25. crawlo/task_manager.py +2 -2
  26. crawlo/templates/project/items.py.tmpl +2 -2
  27. crawlo/templates/project/middlewares.py.tmpl +9 -89
  28. crawlo/templates/project/pipelines.py.tmpl +8 -68
  29. crawlo/templates/project/settings.py.tmpl +51 -65
  30. crawlo/templates/project/settings_distributed.py.tmpl +59 -67
  31. crawlo/templates/project/settings_gentle.py.tmpl +45 -40
  32. crawlo/templates/project/settings_high_performance.py.tmpl +45 -40
  33. crawlo/templates/project/settings_minimal.py.tmpl +37 -26
  34. crawlo/templates/project/settings_simple.py.tmpl +45 -40
  35. crawlo/templates/run.py.tmpl +3 -7
  36. crawlo/tools/__init__.py +0 -11
  37. crawlo/utils/__init__.py +17 -1
  38. crawlo/utils/db_helper.py +220 -319
  39. crawlo/utils/error_handler.py +313 -67
  40. crawlo/utils/fingerprint.py +3 -4
  41. crawlo/utils/misc.py +82 -0
  42. crawlo/utils/request.py +55 -66
  43. crawlo/utils/selector_helper.py +138 -0
  44. crawlo/utils/spider_loader.py +185 -45
  45. crawlo/utils/text_helper.py +95 -0
  46. crawlo-1.4.5.dist-info/METADATA +329 -0
  47. {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/RECORD +89 -68
  48. tests/bug_check_test.py +251 -0
  49. tests/direct_selector_helper_test.py +97 -0
  50. tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -0
  51. tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -0
  52. tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -0
  53. tests/ofweek_scrapy/ofweek_scrapy/settings.py +85 -0
  54. tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -0
  55. tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +162 -0
  56. tests/ofweek_scrapy/scrapy.cfg +11 -0
  57. tests/performance_comparison.py +4 -5
  58. tests/simple_crawlo_test.py +1 -2
  59. tests/simple_follow_test.py +39 -0
  60. tests/simple_response_selector_test.py +95 -0
  61. tests/simple_selector_helper_test.py +155 -0
  62. tests/simple_selector_test.py +208 -0
  63. tests/simple_url_test.py +74 -0
  64. tests/test_crawler_process_import.py +39 -0
  65. tests/test_crawler_process_spider_modules.py +48 -0
  66. tests/test_edge_cases.py +7 -5
  67. tests/test_encoding_core.py +57 -0
  68. tests/test_encoding_detection.py +127 -0
  69. tests/test_factory_compatibility.py +197 -0
  70. tests/test_multi_directory.py +68 -0
  71. tests/test_multiple_spider_modules.py +81 -0
  72. tests/test_optimized_selector_naming.py +101 -0
  73. tests/test_priority_behavior.py +18 -18
  74. tests/test_response_follow.py +105 -0
  75. tests/test_response_selector_methods.py +93 -0
  76. tests/test_response_url_methods.py +71 -0
  77. tests/test_response_urljoin.py +87 -0
  78. tests/test_scrapy_style_encoding.py +113 -0
  79. tests/test_selector_helper.py +101 -0
  80. tests/test_selector_optimizations.py +147 -0
  81. tests/test_spider_loader.py +50 -0
  82. tests/test_spider_loader_comprehensive.py +70 -0
  83. tests/test_spider_modules.py +85 -0
  84. tests/test_spiders/__init__.py +1 -0
  85. tests/test_spiders/test_spider.py +10 -0
  86. crawlo/tools/anti_crawler.py +0 -269
  87. crawlo/utils/class_loader.py +0 -26
  88. crawlo/utils/enhanced_error_handler.py +0 -357
  89. crawlo-1.4.3.dist-info/METADATA +0 -190
  90. examples/test_project/__init__.py +0 -7
  91. examples/test_project/run.py +0 -35
  92. examples/test_project/test_project/__init__.py +0 -4
  93. examples/test_project/test_project/items.py +0 -18
  94. examples/test_project/test_project/middlewares.py +0 -119
  95. examples/test_project/test_project/pipelines.py +0 -97
  96. examples/test_project/test_project/settings.py +0 -170
  97. examples/test_project/test_project/spiders/__init__.py +0 -10
  98. examples/test_project/test_project/spiders/of_week_dis.py +0 -144
  99. tests/simple_log_test.py +0 -58
  100. tests/simple_test.py +0 -48
  101. tests/test_framework_logger.py +0 -67
  102. tests/test_framework_startup.py +0 -65
  103. tests/test_mode_change.py +0 -73
  104. {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/WHEEL +0 -0
  105. {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/entry_points.txt +0 -0
  106. {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/top_level.txt +0 -0
  107. /tests/{final_command_test_report.md → ofweek_scrapy/ofweek_scrapy/__init__.py} +0 -0
crawlo/utils/request.py CHANGED
@@ -79,52 +79,63 @@ def request_fingerprint(
     :param include_headers: header names (str or bytes) to include in the fingerprint calculation
     :return: request fingerprint (hex string)
     """
-    hash_func = hashlib.sha256()
-
-    # Basic fields
-    hash_func.update(to_bytes(request.method))
-    hash_func.update(to_bytes(canonicalize_url(request.url)))
-    hash_func.update(request.body or b'')
-
+    from crawlo.utils.fingerprint import FingerprintGenerator
+
+    # Prepare the request data
+    method = request.method
+    url = request.url
+    body = request.body or b''
+    headers = None
+
     # Handle headers
-    if include_headers:
-        headers = request.headers  # assume headers is a dict- or MultiDict-like structure
+    if include_headers and hasattr(request, 'headers'):
+        headers = {}
         for header_name in include_headers:
-            name_bytes = to_bytes(header_name).lower()  # normalize to lowercase for matching
-            value = b''
+            name_str = str(header_name).lower()  # normalize to lowercase for matching
+            value = ''

             # Support different header access styles (e.g. MultiDict or dict)
-            if hasattr(headers, 'get_all'):
+            if hasattr(request.headers, 'get_all'):
                 # e.g. the get_all method of scrapy.http.Headers
-                values = headers.get_all(name_bytes)
-                value = b';'.join(values) if values else b''
-            elif hasattr(headers, '__getitem__'):
+                values = request.headers.get_all(name_str)
+                value = ';'.join(str(v) for v in values) if values else ''
+            elif hasattr(request.headers, '__getitem__'):
                 # e.g. a plain dict
                 try:
-                    raw_value = headers[name_bytes]
+                    raw_value = request.headers[name_str]
                     if isinstance(raw_value, list):
-                        value = b';'.join(to_bytes(v) for v in raw_value)
+                        value = ';'.join(str(v) for v in raw_value)
                     else:
-                        value = to_bytes(raw_value)
+                        value = str(raw_value)
                 except (KeyError, TypeError):
-                    value = b''
+                    value = ''
             else:
-                value = b''
-
-            hash_func.update(name_bytes + b':' + value)
-
-    return hash_func.hexdigest()
+                value = ''
+
+            headers[name_str] = value
+
+    # Use the unified fingerprint generator
+    return FingerprintGenerator.request_fingerprint(method, url, body, headers)


 def set_request(request: Request, priority: int) -> None:
+    """
+    Set the depth and priority of a request.
+
+    :param request: the Request object
+    :param priority: priority value
+    """
+    # Increase the request depth
     request.meta['depth'] = request.meta.setdefault('depth', 0) + 1
+
+    # Adjust priority by depth: the deeper the request, the lower its priority
     if priority:
         request.priority -= request.meta['depth'] * priority


 def request_to_dict(request: Request, spider=None) -> Dict[str, Any]:
     """
-    Convert a Request object into a JSON-serializable dict.
+    Convert a Request object into a JSON-serializable dict, used to serialize requests for distributed crawling.

     Args:
         request: the Request object to serialize
@@ -146,28 +157,22 @@ def request_to_dict(request: Request, spider=None) -> Dict[str, Any]:

     # 1. Handle the callback
     # Functions cannot be serialized directly, so store their import path
-    if callable(request.callback):
+    if callable(getattr(request, 'callback', None)):
         d['_callback'] = _get_function_path(request.callback)

     # 2. Handle the errback
-    if callable(request.errback):
-        d['_errback'] = _get_function_path(request.errback)
+    if callable(getattr(request, 'err_back', None)):
+        d['_errback'] = _get_function_path(request.err_back)

     # 3. Record the original class name so deserialization creates the right instance
     d['_class'] = request.__class__.__module__ + '.' + request.__class__.__name__

-    # 4. Special-case FormRequest
-    # If this is a FormRequest, its formdata has to be preserved
-    if isinstance(request, Request):
-        if hasattr(request, 'formdata'):
-            d['formdata'] = request.formdata
-
     return d


 def request_from_dict(d: Dict[str, Any], spider=None) -> Request:
     """
-    Rebuild a Request object from a dict.
+    Rebuild a Request object from a dict, used to deserialize requests in distributed crawling.

     Args:
         d: the dict produced by request_to_dict
@@ -193,37 +198,21 @@ def request_from_dict(d: Dict[str, Any], spider=None) -> Request:
     errback_path = d.pop('_errback', None)
     errback = _get_function_from_path(errback_path, spider) if errback_path else None

-    # 4. Extract special fields
-    formdata = d.pop('formdata', None)
-
     # 5. Create the Request instance
-    # Note: body and formdata must not both be present
-    if formdata is not None and cls is FormRequest:
-        # For a FormRequest with formdata, prefer the formdata
-        request = FormRequest(
-            url=d['url'],
-            method=d.get('method', 'GET'),
-            headers=d.get('headers', {}),
-            formdata=formdata,
-            callback=callback,
-            errback=errback,
-            meta=d.get('meta', {}),
-            flags=d.get('flags', []),
-            cb_kwargs=d.get('cb_kwargs', {}),
-        )
-    else:
-        # A plain Request, or no formdata
-        request = cls(
-            url=d['url'],
-            method=d.get('method', 'GET'),
-            headers=d.get('headers', {}),
-            body=d.get('body'),
-            callback=callback,
-            errback=errback,
-            meta=d.get('meta', {}),
-            flags=d.get('flags', []),
-            cb_kwargs=d.get('cb_kwargs', {}),
-        )
+    request = cls(
+        url=d['url'],
+        method=d.get('method', 'GET'),
+        headers=d.get('headers', {}),
+        body=d.get('body'),
+        callback=callback,
+        meta=d.get('meta', {}),
+        flags=d.get('flags', []),
+        cb_kwargs=d.get('cb_kwargs', {}),
+    )
+
+    # Manually set the err_back attribute
+    if errback is not None:
+        request.err_back = errback

     return request

@@ -256,7 +245,7 @@ def _get_function_from_path(path: str, spider=None) -> Optional[callable]:
         func = getattr(func, attr)

     # If a spider is given and func is one of the spider's methods
-    if spider and hasattr(spider, func.__name__):
+    if spider and hasattr(func, '__name__') and hasattr(spider, func.__name__):
         spider_method = getattr(spider, func.__name__)
         if spider_method is func:
             return spider_method  # return the bound method
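
The reworked helpers above are what the distributed queue relies on: callbacks are stored as import-path strings and re-bound against the spider on deserialization, and error callbacks now travel through the err_back attribute. A minimal round-trip sketch, assuming only the names visible in this diff (the URL and meta values are invented):

    from crawlo.network.request import Request
    from crawlo.utils.request import request_to_dict, request_from_dict

    # Build a request on one worker...
    req = Request(url="https://example.com/page/1", method="GET", meta={"job": "demo"})

    # ...turn it into a plain, JSON-serializable dict (e.g. to push onto a Redis queue)...
    payload = request_to_dict(req)

    # ...and rebuild an equivalent Request on another worker.
    restored = request_from_dict(payload)
    print(restored.url, restored.method)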
crawlo/utils/selector_helper.py ADDED
@@ -0,0 +1,138 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+Selector helper module
+======================
+Helper functions for working with parsel selectors, e.g. extracting text and attributes.
+
+The module provides the following main functions:
+- extract_text: extract the text of an element list and join it
+- extract_texts: extract a list of texts from an element list
+- extract_attr: extract the attribute value of a single element from an element list
+- extract_attrs: extract the attribute values of multiple elements from an element list
+- is_xpath: decide whether a query string is XPath
+
+All functions use short, intuitive names that are easy to remember and use.
+"""
+
+from typing import List, Any, Optional
+from parsel import Selector, SelectorList
+
+
+def extract_text(elements: SelectorList, join_str: str = " ") -> str:
+    """
+    Extract text from an element list and join it.
+
+    :param elements: SelectorList of elements
+    :param join_str: separator used when joining text fragments
+    :return: the joined text
+
+    Example:
+        title_elements = selector.css('title')
+        title_text = extract_text(title_elements)
+    """
+    texts = []
+    for element in elements:
+        # Collect all text nodes of the element
+        if hasattr(element, 'xpath'):
+            element_texts = element.xpath('.//text()').getall()
+        else:
+            element_texts = [str(element)]
+        # Strip and keep only non-empty fragments
+        for text in element_texts:
+            cleaned = text.strip()
+            if cleaned:
+                texts.append(cleaned)
+    return join_str.join(texts)
+
+
+def extract_texts(elements: SelectorList, join_str: str = " ") -> List[str]:
+    """
+    Extract a list of texts from an element list.
+
+    :param elements: SelectorList of elements
+    :param join_str: separator used to join the text fragments within a single node
+    :return: list of plain texts (one entry per node)
+
+    Example:
+        li_elements = selector.css('.list li')
+        li_texts = extract_texts(li_elements)
+    """
+    result = []
+    for element in elements:
+        # Extract the text of each element
+        if hasattr(element, 'xpath'):
+            texts = element.xpath('.//text()').getall()
+        else:
+            texts = [str(element)]
+
+        # Strip the fragments and join them
+        clean_texts = [text.strip() for text in texts if text.strip()]
+        if clean_texts:
+            result.append(join_str.join(clean_texts))
+
+    return result
+
+
+def extract_attr(elements: SelectorList, attr_name: str, default: Any = None) -> Any:
+    """
+    Extract an attribute value from a single element of an element list.
+
+    :param elements: SelectorList of elements
+    :param attr_name: attribute name
+    :param default: default return value
+    :return: the attribute value, or the default
+
+    Example:
+        link_elements = selector.css('.link')
+        link_href = extract_attr(link_elements, 'href')
+    """
+    # Use parsel's attrib property to read the attribute of the first matching element
+    if hasattr(elements, 'attrib'):
+        return elements.attrib.get(attr_name, default)
+    # If elements is a SelectorList, read the attribute of its first element
+    elif len(elements) > 0 and hasattr(elements[0], 'attrib'):
+        return elements[0].attrib.get(attr_name, default)
+    return default
+
+
+def extract_attrs(elements: SelectorList, attr_name: str) -> List[Any]:
+    """
+    Extract the attribute values of multiple elements from an element list.
+
+    :param elements: SelectorList of elements
+    :param attr_name: attribute name
+    :return: list of attribute values
+
+    Example:
+        all_links = selector.css('a')
+        all_hrefs = extract_attrs(all_links, 'href')
+    """
+    result = []
+    for element in elements:
+        # Use parsel's attrib property to read the element's attribute
+        if hasattr(element, 'attrib'):
+            attr_value = element.attrib.get(attr_name)
+            if attr_value is not None:
+                result.append(attr_value)
+
+    return result
+
+
+def is_xpath(query: str) -> bool:
+    """
+    Decide whether a query string is an XPath expression.
+
+    :param query: the query string
+    :return: True if the query is XPath
+    """
+    return query.startswith(('/', '//', './'))
+
+
+__all__ = [
+    "extract_text",
+    "extract_texts",
+    "extract_attr",
+    "extract_attrs",
+    "is_xpath"
+]
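
selector_helper.py is new in 1.4.5; a short usage sketch, using only the functions shown above plus parsel's Selector (the HTML snippet is invented):

    from parsel import Selector
    from crawlo.utils.selector_helper import extract_text, extract_texts, extract_attrs, is_xpath

    html = '<ul class="list"><li><a href="/a">First</a></li><li><a href="/b">Second</a></li></ul>'
    sel = Selector(text=html)

    print(extract_text(sel.css('.list li')))          # "First Second"
    print(extract_texts(sel.css('.list li')))         # ['First', 'Second']
    print(extract_attrs(sel.css('a'), 'href'))        # ['/a', '/b']
    print(is_xpath('//ul/li'), is_xpath('.list li'))  # True False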
crawlo/utils/spider_loader.py CHANGED
@@ -1,62 +1,202 @@
 import importlib
+import traceback
+import warnings
+from collections import defaultdict
 from pathlib import Path
-from typing import List, Type, Optional, Dict
+from typing import List, Type, Dict, Any

+from crawlo.interfaces import ISpiderLoader
+from crawlo.settings.setting_manager import SettingManager
 from crawlo.spider import Spider
+from crawlo.network.request import Request
 from crawlo.utils.log import get_logger

 logger = get_logger(__name__)


-class SpiderLoader:
-    """Spider loader, responsible for discovering and loading spiders"""
-
-    def __init__(self, project_package: str):
-        self.project_package = project_package
-        self._spiders: Dict[str, Type[Spider]] = {}
-        self._load_spiders()
-
-    def _load_spiders(self):
-        """Load all spiders"""
-        spiders_dir = Path.cwd() / self.project_package / 'spiders'
-        if not spiders_dir.exists():
-            logger.warning(f"Spiders directory not found: {spiders_dir}")
-            return
-
-        for py_file in spiders_dir.glob("*.py"):
-            if py_file.name.startswith('_'):
-                continue
-
-            module_name = py_file.stem
-            spider_module_path = f"{self.project_package}.spiders.{module_name}"
-
-            try:
-                module = importlib.import_module(spider_module_path)
-            except ImportError as e:
-                logger.debug(f"Skip module {module_name}: {e}")
-                continue
-
-            # Find all Spider subclasses
-            for attr_name in dir(module):
-                attr_value = getattr(module, attr_name)
-                if (isinstance(attr_value, type) and
-                        issubclass(attr_value, Spider) and
-                        attr_value != Spider and
-                        hasattr(attr_value, 'name')):
-
-                    spider_name = getattr(attr_value, 'name')
-                    if spider_name in self._spiders:
-                        logger.warning(f"Duplicate spider name '{spider_name}' found")
-                    self._spiders[spider_name] = attr_value
+class SpiderLoaderProtocol:
+    """Protocol for spider loader"""
+
+    @classmethod
+    def from_settings(cls, settings: SettingManager) -> 'SpiderLoaderProtocol':
+        """Create spider loader from settings"""
+        return cls(settings)
+
+    def load(self, spider_name: str) -> Type[Spider]:
+        """Load a spider by name"""
+        raise NotImplementedError
+
+    def list(self) -> List[str]:
+        """List all available spider names"""
+        raise NotImplementedError
+
+    def find_by_request(self, request: 'Request') -> List[str]:
+        """Find spider names that can handle the given request"""
+        raise NotImplementedError

-    def load(self, spider_name: str) -> Optional[Type[Spider]]:
-        """Load a spider by name"""
-        return self._spiders.get(spider_name)

+class SpiderLoader(ISpiderLoader):
+    """Spider loader, responsible for discovering and loading spiders"""
+
+    def __init__(self, settings: SettingManager = None):
+        # If settings are given, read the configuration from them
+        if settings is not None:
+            self.spider_modules = settings.get('SPIDER_MODULES', [])
+            self.warn_only = settings.get('SPIDER_LOADER_WARN_ONLY', False)
+        else:
+            # Default configuration
+            self.spider_modules = []
+            self.warn_only = False
+
+        self._spiders: Dict[str, Type[Spider]] = {}
+        self._found: Dict[str, List[tuple]] = defaultdict(list)
+        self._load_all_spiders()
+
+    @classmethod
+    def from_settings(cls, settings: SettingManager) -> 'SpiderLoader':
+        """Create a SpiderLoader instance from settings"""
+        return cls(settings)
+
+    def _check_name_duplicates(self) -> None:
+        """Check for duplicate spider names"""
+        dupes = []
+        for name, locations in self._found.items():
+            if len(locations) > 1:
+                dupes.extend([
+                    f" {cls} named {name!r} (in {mod})"
+                    for mod, cls in locations
+                ])
+
+        if dupes:
+            dupes_string = "\n\n".join(dupes)
+            warnings.warn(
+                "There are several spiders with the same name:\n\n"
+                f"{dupes_string}\n\n This can cause unexpected behavior.",
+                category=UserWarning,
+            )
+
+    def _load_spiders(self, module) -> None:
+        """Load all spiders defined in a module"""
+        for attr_name in dir(module):
+            attr_value = getattr(module, attr_name)
+            if (isinstance(attr_value, type) and
+                    issubclass(attr_value, Spider) and
+                    attr_value != Spider and
+                    hasattr(attr_value, 'name')):
+
+                spider_name = getattr(attr_value, 'name')
+                self._found[spider_name].append((module.__name__, attr_value.__name__))
+                self._spiders[spider_name] = attr_value
+
+    def _load_spiders_from_package(self, package_name: str) -> None:
+        """Load spiders from a package"""
+        try:
+            # Try to import the package
+            package = importlib.import_module(package_name)
+
+            # Walk all modules in the package
+            package_path = Path(package.__file__).parent
+            for py_file in package_path.glob("*.py"):
+                if py_file.name.startswith('_'):
+                    continue
+
+                module_name = py_file.stem
+                spider_module_path = f"{package_name}.{module_name}"
+
+                try:
+                    module = importlib.import_module(spider_module_path)
+                    self._load_spiders(module)
+                except ImportError as e:
+                    if self.warn_only:
+                        logger.warning(f"Could not load spiders from module '{spider_module_path}': {e}")
+                        logger.debug(traceback.format_exc())
+                    else:
+                        raise
+        except (ImportError, SyntaxError) as e:
+            if self.warn_only:
+                logger.warning(f"Could not load spiders from package '{package_name}': {e}")
+                logger.debug(traceback.format_exc())
+            else:
+                raise
+
+    def _load_all_spiders(self) -> None:
+        """Load all spiders"""
+        # If SPIDER_MODULES is configured, load from those modules
+        if self.spider_modules:
+            for module_name in self.spider_modules:
+                self._load_spiders_from_package(module_name)
+        else:
+            # Backwards compatibility: without SPIDER_MODULES, fall back to the old behaviour
+            # and assume the default spiders directory layout
+            spiders_dir = Path.cwd() / 'spiders'
+            if not spiders_dir.exists():
+                spiders_dir = Path.cwd() / 'spider'
+                if not spiders_dir.exists():
+                    logger.warning("Spiders directory not found")
+                    return
+
+            for py_file in spiders_dir.glob("*.py"):
+                if py_file.name.startswith('_'):
+                    continue
+
+                module_name = py_file.stem
+                module = None
+                try:
+                    # Try different import paths
+                    spider_module_path = None
+                    for possible_package in ['spiders', 'spider']:
+                        try:
+                            spider_module_path = f"{possible_package}.{module_name}"
+                            module = importlib.import_module(spider_module_path)
+                            break
+                        except ImportError:
+                            continue
+
+                    if module is None:
+                        raise ImportError(f"Could not import {module_name}")
+
+                    self._load_spiders(module)
+                except ImportError as e:
+                    logger.debug(f"Skip module {module_name}: {e}")
+                    continue
+
+        self._check_name_duplicates()
+
+    def load(self, spider_name: str) -> Type[Spider]:
+        """
+        Load a spider by its name.
+
+        Args:
+            spider_name: the spider name
+
+        Returns:
+            The Spider class
+
+        Raises:
+            KeyError: if no spider with the given name is found
+        """
+        if spider_name not in self._spiders:
+            raise KeyError(f"Spider not found: {spider_name}")
+        return self._spiders[spider_name]
+
     def list(self) -> List[str]:
         """List all available spider names"""
         return list(self._spiders.keys())
-
+
+    def find_by_request(self, request: 'Request') -> List[str]:
+        """
+        Find the names of spiders that can handle the given request.
+
+        Args:
+            request: the request object
+
+        Returns:
+            A list of spider names that can handle the request
+        """
+        # More sophisticated matching logic could go here;
+        # for now this simply returns all spider names
+        return list(self._spiders.keys())
+
     def get_all(self) -> Dict[str, Type[Spider]]:
         """Get all spiders"""
         return self._spiders.copy()
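
The loader is now configured from settings instead of a hard-coded project package. A minimal sketch of driving it, assuming only that the settings object exposes get(key, default) as used above; the stub class and the 'myproject.spiders' package are hypothetical stand-ins:

    from crawlo.utils.spider_loader import SpiderLoader

    class StubSettings:
        """Hypothetical stand-in for SettingManager; only get() is needed here."""
        def __init__(self, values):
            self._values = values

        def get(self, key, default=None):
            return self._values.get(key, default)

    settings = StubSettings({
        "SPIDER_MODULES": ["myproject.spiders"],   # packages scanned for Spider subclasses
        "SPIDER_LOADER_WARN_ONLY": True,           # log import failures instead of raising
    })

    loader = SpiderLoader.from_settings(settings)
    print(loader.list())                # names of all discovered spiders
    # loader.load("news") returns the Spider class, or raises KeyError if the name is unknown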
crawlo/utils/text_helper.py ADDED
@@ -0,0 +1,95 @@
+# -*- coding: utf-8 -*-
+import json
+import re
+from typing import Any, Union, List, Dict, Tuple, Optional
+
+from crawlo.utils.log import get_logger
+
+logger = get_logger(__name__)
+
+# Cache of compiled regular expressions
+_REGEXPS: Dict[str, "re.Pattern"] = {}
+
+
+def extract_text_by_regex(
+    text: Union[str, Any],
+    patterns: Union[str, List[str]],
+    allow_repeat: bool = True,
+    fetch_one: bool = False,
+    join_with: Optional[str] = None,
+) -> Union[str, List[str], Tuple]:
+    """
+    Extract information from text via regular expressions, with multi-pattern fallback.
+
+    Args:
+        text (str): the text, or anything convertible to a string
+        patterns (str or list of str): regex patterns, tried in order
+        allow_repeat (bool): whether duplicate results are allowed
+        fetch_one (bool): only extract the first match (returned as a tuple)
+        join_with (str, optional): if given, join the results into one string with this separator
+
+    Returns:
+        str | list | tuple: the matches, as a string, list or tuple depending on the arguments
+    """
+    if isinstance(patterns, str):
+        patterns = [patterns]
+
+    results = []
+    for pattern in patterns:
+        if not pattern:
+            continue
+
+        if pattern not in _REGEXPS:
+            _REGEXPS[pattern] = re.compile(pattern, re.S)
+
+        if fetch_one:
+            match = _REGEXPS[pattern].search(str(text))
+            results = match.groups() if match else ("",)
+            break
+        else:
+            found = _REGEXPS[pattern].findall(str(text))
+            if found:
+                results = found
+                break
+
+    if fetch_one:
+        return results[0] if len(results) == 1 else results
+
+    if not allow_repeat:
+        results = sorted(set(results), key=results.index)
+
+    return join_with.join(results) if join_with else results
+
+
+def parse_json_safely(json_str: Union[str, Any]) -> Dict:
+    """
+    Safely parse a JSON string, tolerating non-standard formats
+    (single quotes, unquoted keys, and so on).
+
+    Args:
+        json_str (str): the JSON string
+
+    Returns:
+        dict: the parsed dict, or an empty dict on failure
+    """
+    if not json_str:
+        return {}
+
+    try:
+        return json.loads(json_str)
+    except Exception as e1:
+        try:
+            cleaned = json_str.strip().replace("'", '"')
+            # Use the new function name
+            keys = extract_text_by_regex(cleaned, r'(\w+):')
+            for key in keys:
+                cleaned = cleaned.replace(f"{key}:", f'"{key}":')
+            return json.loads(cleaned) if cleaned else {}
+        except Exception as e2:
+            logger.error(
+                f"JSON parsing failed\n"
+                f"original input: {json_str}\n"
+                f"error 1: {e1}\n"
+                f"after repair: {cleaned}\n"
+                f"error 2: {e2}"
+            )
+            return {}
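
A quick sketch of the two new text helpers; the sample strings are invented for illustration:

    from crawlo.utils.text_helper import extract_text_by_regex, parse_json_safely

    html = "<span>price: 12.50</span><span>price: 9.99</span>"

    print(extract_text_by_regex(html, r"price:\s*([\d.]+)"))                  # ['12.50', '9.99']
    print(extract_text_by_regex(html, r"price:\s*([\d.]+)", fetch_one=True))  # '12.50'
    print(extract_text_by_regex(html, r"price:\s*([\d.]+)", join_with=", "))  # '12.50, 9.99'

    # Tolerates the single quotes that json.loads rejects
    print(parse_json_safely("{'name': 'crawlo', 'version': '1.4.5'}"))        # {'name': 'crawlo', 'version': '1.4.5'}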