crawlo 1.1.3__py3-none-any.whl → 1.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.

Files changed (115)
  1. crawlo/__init__.py +28 -1
  2. crawlo/__version__.py +1 -1
  3. crawlo/cleaners/__init__.py +61 -0
  4. crawlo/cleaners/data_formatter.py +226 -0
  5. crawlo/cleaners/encoding_converter.py +126 -0
  6. crawlo/cleaners/text_cleaner.py +233 -0
  7. crawlo/commands/startproject.py +117 -13
  8. crawlo/config.py +30 -0
  9. crawlo/config_validator.py +253 -0
  10. crawlo/core/engine.py +185 -11
  11. crawlo/core/scheduler.py +49 -78
  12. crawlo/crawler.py +6 -6
  13. crawlo/downloader/__init__.py +24 -0
  14. crawlo/downloader/aiohttp_downloader.py +8 -0
  15. crawlo/downloader/cffi_downloader.py +5 -0
  16. crawlo/downloader/hybrid_downloader.py +214 -0
  17. crawlo/downloader/playwright_downloader.py +403 -0
  18. crawlo/downloader/selenium_downloader.py +473 -0
  19. crawlo/extension/__init__.py +17 -10
  20. crawlo/extension/health_check.py +142 -0
  21. crawlo/extension/log_interval.py +27 -18
  22. crawlo/extension/log_stats.py +62 -24
  23. crawlo/extension/logging_extension.py +18 -9
  24. crawlo/extension/memory_monitor.py +105 -0
  25. crawlo/extension/performance_profiler.py +134 -0
  26. crawlo/extension/request_recorder.py +108 -0
  27. crawlo/filters/aioredis_filter.py +50 -12
  28. crawlo/middleware/proxy.py +26 -2
  29. crawlo/mode_manager.py +24 -19
  30. crawlo/network/request.py +30 -3
  31. crawlo/network/response.py +114 -25
  32. crawlo/pipelines/mongo_pipeline.py +81 -66
  33. crawlo/pipelines/mysql_pipeline.py +165 -43
  34. crawlo/pipelines/redis_dedup_pipeline.py +7 -3
  35. crawlo/queue/queue_manager.py +15 -2
  36. crawlo/queue/redis_priority_queue.py +144 -76
  37. crawlo/settings/default_settings.py +93 -121
  38. crawlo/subscriber.py +62 -37
  39. crawlo/templates/project/items.py.tmpl +1 -1
  40. crawlo/templates/project/middlewares.py.tmpl +73 -49
  41. crawlo/templates/project/pipelines.py.tmpl +51 -295
  42. crawlo/templates/project/settings.py.tmpl +93 -17
  43. crawlo/templates/project/settings_distributed.py.tmpl +120 -0
  44. crawlo/templates/project/settings_gentle.py.tmpl +95 -0
  45. crawlo/templates/project/settings_high_performance.py.tmpl +152 -0
  46. crawlo/templates/project/settings_simple.py.tmpl +69 -0
  47. crawlo/templates/spider/spider.py.tmpl +2 -38
  48. crawlo/tools/__init__.py +183 -0
  49. crawlo/tools/anti_crawler.py +269 -0
  50. crawlo/tools/authenticated_proxy.py +241 -0
  51. crawlo/tools/data_validator.py +181 -0
  52. crawlo/tools/date_tools.py +36 -0
  53. crawlo/tools/distributed_coordinator.py +387 -0
  54. crawlo/tools/retry_mechanism.py +221 -0
  55. crawlo/tools/scenario_adapter.py +263 -0
  56. crawlo/utils/__init__.py +29 -1
  57. crawlo/utils/batch_processor.py +261 -0
  58. crawlo/utils/date_tools.py +58 -1
  59. crawlo/utils/enhanced_error_handler.py +360 -0
  60. crawlo/utils/env_config.py +106 -0
  61. crawlo/utils/error_handler.py +126 -0
  62. crawlo/utils/performance_monitor.py +285 -0
  63. crawlo/utils/redis_connection_pool.py +335 -0
  64. crawlo/utils/redis_key_validator.py +200 -0
  65. crawlo-1.1.5.dist-info/METADATA +401 -0
  66. crawlo-1.1.5.dist-info/RECORD +185 -0
  67. tests/advanced_tools_example.py +276 -0
  68. tests/authenticated_proxy_example.py +237 -0
  69. tests/cleaners_example.py +161 -0
  70. tests/config_validation_demo.py +103 -0
  71. tests/date_tools_example.py +181 -0
  72. tests/dynamic_loading_example.py +524 -0
  73. tests/dynamic_loading_test.py +105 -0
  74. tests/env_config_example.py +134 -0
  75. tests/error_handling_example.py +172 -0
  76. tests/redis_key_validation_demo.py +131 -0
  77. tests/response_improvements_example.py +145 -0
  78. tests/test_advanced_tools.py +149 -0
  79. tests/test_all_redis_key_configs.py +146 -0
  80. tests/test_authenticated_proxy.py +142 -0
  81. tests/test_cleaners.py +55 -0
  82. tests/test_comprehensive.py +147 -0
  83. tests/test_config_validator.py +194 -0
  84. tests/test_date_tools.py +124 -0
  85. tests/test_dynamic_downloaders_proxy.py +125 -0
  86. tests/test_dynamic_proxy.py +93 -0
  87. tests/test_dynamic_proxy_config.py +147 -0
  88. tests/test_dynamic_proxy_real.py +110 -0
  89. tests/test_edge_cases.py +304 -0
  90. tests/test_enhanced_error_handler.py +271 -0
  91. tests/test_env_config.py +122 -0
  92. tests/test_error_handler_compatibility.py +113 -0
  93. tests/test_framework_env_usage.py +104 -0
  94. tests/test_integration.py +357 -0
  95. tests/test_item_dedup_redis_key.py +123 -0
  96. tests/test_parsel.py +30 -0
  97. tests/test_performance.py +328 -0
  98. tests/test_queue_manager_redis_key.py +177 -0
  99. tests/test_redis_connection_pool.py +295 -0
  100. tests/test_redis_key_naming.py +182 -0
  101. tests/test_redis_key_validator.py +124 -0
  102. tests/test_response_improvements.py +153 -0
  103. tests/test_simple_response.py +62 -0
  104. tests/test_telecom_spider_redis_key.py +206 -0
  105. tests/test_template_content.py +88 -0
  106. tests/test_template_redis_key.py +135 -0
  107. tests/test_tools.py +154 -0
  108. tests/tools_example.py +258 -0
  109. crawlo/core/enhanced_engine.py +0 -190
  110. crawlo-1.1.3.dist-info/METADATA +0 -635
  111. crawlo-1.1.3.dist-info/RECORD +0 -113
  112. {crawlo-1.1.3.dist-info → crawlo-1.1.5.dist-info}/WHEEL +0 -0
  113. {crawlo-1.1.3.dist-info → crawlo-1.1.5.dist-info}/entry_points.txt +0 -0
  114. {crawlo-1.1.3.dist-info → crawlo-1.1.5.dist-info}/top_level.txt +0 -0
  115. {examples → tests}/controlled_spider_example.py +0 -0
crawlo/filters/aioredis_filter.py CHANGED
@@ -16,6 +16,7 @@ from typing import Optional
 from crawlo.filters import BaseFilter
 from crawlo.utils.log import get_logger
 from crawlo.utils.request import request_fingerprint
+from crawlo.utils.redis_connection_pool import get_redis_pool
 
 
 class AioRedisFilter(BaseFilter):
@@ -48,7 +49,7 @@ class AioRedisFilter(BaseFilter):
         Initialize the Redis filter.
 
         :param redis_key: key under which fingerprints are stored in Redis
-        :param client: Redis client instance
+        :param client: Redis client instance (may be None and initialized later)
         :param stats: statistics storage
         :param debug: whether to enable debug mode
         :param log_level: log level
@@ -63,6 +64,9 @@ class AioRedisFilter(BaseFilter):
         self.cleanup_fp = cleanup_fp
         self.ttl = ttl
 
+        # Keep a reference to the connection pool (for lazy initialization)
+        self._redis_pool = None
+
         # Performance counters
         self._redis_operations = 0
         self._pipeline_operations = 0
@@ -80,17 +84,30 @@ class AioRedisFilter(BaseFilter):
         ttl = max(0, int(ttl_setting)) if ttl_setting > 0 else None
 
         try:
-            redis_client = aioredis.from_url(
+            # Use the optimized connection pool
+            redis_pool = get_redis_pool(
                 redis_url,
-                decode_responses=decode_responses,
                 max_connections=20,
+                socket_connect_timeout=5,
+                socket_timeout=30,
+                health_check_interval=30,
+                retry_on_timeout=True,
+                decode_responses=decode_responses,
                 encoding='utf-8'
             )
+
+            # Note: no await here, because create_instance is not an async method;
+            # the connection is acquired when it is actually used
+            redis_client = None  # lazy initialization
         except Exception as e:
-            raise RuntimeError(f"Redis connection failed: {redis_url} - {str(e)}")
+            raise RuntimeError(f"Redis connection pool initialization failed: {redis_url} - {str(e)}")
 
-        return cls(
-            redis_key=f"{crawler.settings.get('PROJECT_NAME', 'default')}:{crawler.settings.get('REDIS_KEY', 'request_fingerprints')}",
+        # Use the unified Redis key naming convention: crawlo:{project_name}:filter:fingerprint
+        project_name = crawler.settings.get('PROJECT_NAME', 'default')
+        redis_key = f"crawlo:{project_name}:filter:fingerprint"
+
+        instance = cls(
+            redis_key=redis_key,
             client=redis_client,
             stats=crawler.stats,
             cleanup_fp=crawler.settings.get_bool('CLEANUP_FP', False),
@@ -98,6 +115,16 @@ class AioRedisFilter(BaseFilter):
             debug=crawler.settings.get_bool('FILTER_DEBUG', False),
             log_level=crawler.settings.get('LOG_LEVEL', 'INFO')
         )
+
+        # Keep the pool reference so a connection can be obtained when needed
+        instance._redis_pool = redis_pool
+        return instance
+
+    async def _get_redis_client(self):
+        """Get the Redis client instance (lazy initialization)."""
+        if self.redis is None and self._redis_pool is not None:
+            self.redis = await self._redis_pool.get_connection()
+        return self.redis
 
     async def requested(self, request) -> bool:
         """
@@ -107,6 +134,9 @@ class AioRedisFilter(BaseFilter):
         :return: True means duplicate, False means new request
         """
         try:
+            # Make sure the Redis client is initialized
+            await self._get_redis_client()
+
             fp = str(request_fingerprint(request))
             self._redis_operations += 1
 
@@ -141,6 +171,9 @@ class AioRedisFilter(BaseFilter):
         :return: whether it was added (True if newly added, False if it already existed)
         """
         try:
+            # Make sure the Redis client is initialized
+            await self._get_redis_client()
+
             fp = str(fp)
 
             # Use a pipeline to optimize performance
@@ -178,6 +211,9 @@ class AioRedisFilter(BaseFilter):
     async def get_stats(self) -> dict:
         """Get detailed filter statistics."""
         try:
+            # Make sure the Redis client is initialized
+            await self._get_redis_client()
+
             count = await self.redis.scard(self.redis_key)
 
             # Get TTL information
@@ -212,6 +248,9 @@ class AioRedisFilter(BaseFilter):
     async def clear_all(self) -> int:
         """Clear all fingerprint data."""
         try:
+            # Make sure the Redis client is initialized
+            await self._get_redis_client()
+
             deleted = await self.redis.delete(self.redis_key)
             self.logger.info(f"Cleared fingerprint count: {deleted}")
             return deleted
@@ -222,6 +261,9 @@ class AioRedisFilter(BaseFilter):
     async def closed(self, reason: Optional[str] = None) -> None:
         """Cleanup when the spider closes."""
         try:
+            # Make sure the Redis client is initialized
+            await self._get_redis_client()
+
             if self.cleanup_fp:
                 deleted = await self.redis.delete(self.redis_key)
                 self.logger.info(f"Spider-close cleanup: deleted {deleted} fingerprints")
@@ -234,9 +276,5 @@ class AioRedisFilter(BaseFilter):
 
     async def _close_redis(self) -> None:
         """Safely close the Redis connection."""
-        try:
-            if hasattr(self.redis, 'close'):
-                await self.redis.close()
-            self.logger.debug("Redis connection closed")
-        except Exception as e:
-            self.logger.warning(f"Error while closing Redis: {e}")
+        # The connection pool manages connections automatically; no explicit close is needed here
+        self.logger.debug("Redis connection released")
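The change above replaces eager client creation in create_instance with a pool reference plus lazy initialization: every async method calls _get_redis_client() before touching Redis. A minimal sketch of that pattern, assuming only that the pool object exposes an async get_connection() as shown in the diff; the class and method bodies below are illustrative, not part of crawlo:

    # Sketch of the lazy-initialization pattern; `pool.get_connection()` is assumed
    # to be an async method as in the diff, everything else here is illustrative.
    class LazyRedisFilterSketch:
        def __init__(self, pool):
            self._redis_pool = pool   # pool reference saved synchronously at construction
            self.redis = None         # the real client is only created on first use

        async def _get_redis_client(self):
            # Acquire the connection once, inside a running event loop
            if self.redis is None and self._redis_pool is not None:
                self.redis = await self._redis_pool.get_connection()
            return self.redis

        async def requested(self, fingerprint: str, key: str) -> bool:
            await self._get_redis_client()
            # SADD returns 0 when the member already exists, i.e. the request is a duplicate
            return await self.redis.sadd(key, fingerprint) == 0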
crawlo/middleware/proxy.py CHANGED
@@ -173,7 +173,7 @@ class ProxyMiddleware:
             if isinstance(result, str) and result.strip():
                 return result.strip()
             elif isinstance(result, dict):
-                cleaned = {k: v.strip() for k, v in result.items() if v and isinstance(v, str)}
+                cleaned = {k: v.strip() if isinstance(v, str) else v for k, v in result.items()}
                 return cleaned if cleaned else None
             return None
         except Exception as e:
@@ -225,7 +225,31 @@ class ProxyMiddleware:
 
         proxy = await self._get_cached_proxy()
         if proxy:
-            request.proxy = proxy
+            # Handle proxy URLs that carry authentication
+            if isinstance(proxy, str) and "@" in proxy and "://" in proxy:
+                # Parse the authenticated proxy URL
+                parsed = urlparse(proxy)
+                if parsed.username and parsed.password:
+                    # The AioHttp downloader needs the credentials handled separately
+                    downloader_type = spider.crawler.settings.get("DOWNLOADER_TYPE", "aiohttp")
+                    if downloader_type == "aiohttp":
+                        # Store the credentials in meta for the downloader to use
+                        request.meta["proxy_auth"] = {
+                            "username": parsed.username,
+                            "password": parsed.password
+                        }
+                        # Strip the credentials from the URL
+                        clean_proxy = f"{parsed.scheme}://{parsed.hostname}"
+                        if parsed.port:
+                            clean_proxy += f":{parsed.port}"
+                        request.proxy = clean_proxy
+                    else:
+                        # Other downloaders can use the authenticated URL directly
+                        request.proxy = proxy
+                else:
+                    request.proxy = proxy
+            else:
+                request.proxy = proxy
             self.logger.info(f"Assigned proxy → {proxy} | {request.url}")
         else:
             self.logger.warning(f"No proxy obtained, request goes direct: {request.url}")
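For the aiohttp path, the middleware strips embedded credentials from the proxy URL and passes them through request.meta["proxy_auth"]. A standalone sketch of that splitting step using only the standard library; the helper name is ours, not part of the middleware:

    from urllib.parse import urlparse

    def split_proxy_auth(proxy: str):
        """Return (clean_proxy_url, auth_or_None) for a proxy URL that may embed credentials."""
        parsed = urlparse(proxy)
        if not (parsed.username and parsed.password):
            return proxy, None
        clean = f"{parsed.scheme}://{parsed.hostname}"
        if parsed.port:
            clean += f":{parsed.port}"
        return clean, {"username": parsed.username, "password": parsed.password}

    print(split_proxy_auth("http://user:secret@10.0.0.1:8080"))
    # -> ('http://10.0.0.1:8080', {'username': 'user', 'password': 'secret'})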
crawlo/mode_manager.py CHANGED
@@ -63,8 +63,9 @@ class ModeManager:
             'REDIS_PORT': redis_port,
             'REDIS_PASSWORD': redis_password,
             'REDIS_URL': redis_url,
-            'SCHEDULER_QUEUE_NAME': f'{project_name}:requests',
-            'REDIS_KEY': f'{project_name}:fingerprint',
+            'SCHEDULER_QUEUE_NAME': f'crawlo:{project_name}:queue:requests',  # unified naming convention
+            # Redis key settings have moved into the individual components and use the unified convention
+            # crawlo:{project_name}:filter:fingerprint (request deduplication)
             'CONCURRENCY': 16,
             'MAX_RUNNING_SPIDERS': 1,
             'DOWNLOAD_DELAY': 1.0,
@@ -181,21 +182,25 @@ def auto_mode(**kwargs) -> Dict[str, Any]:
 # Environment variable support
 def from_env(default_mode: str = 'standalone') -> Dict[str, Any]:
     """Create configuration from environment variables."""
-    mode = os.getenv('CRAWLO_MODE', default_mode).lower()
+    # Direct use of os.getenv() has been removed; configure these parameters through settings
+    raise RuntimeError("Environment-variable configuration has been removed; set the relevant parameters in settings")
 
-    if mode == 'distributed':
-        return distributed_mode(
-            redis_host=os.getenv('REDIS_HOST', '127.0.0.1'),
-            redis_port=int(os.getenv('REDIS_PORT', 6379)),
-            redis_password=os.getenv('REDIS_PASSWORD'),
-            project_name=os.getenv('PROJECT_NAME', 'crawlo'),
-            CONCURRENCY=int(os.getenv('CONCURRENCY', 16)),
-        )
-    elif mode == 'auto':
-        return auto_mode(
-            CONCURRENCY=int(os.getenv('CONCURRENCY', 12)),
-        )
-    else:  # standalone
-        return standalone_mode(
-            CONCURRENCY=int(os.getenv('CONCURRENCY', 8)),
-        )
+    # Previous implementation kept for reference
+    # mode = os.getenv('CRAWLO_MODE', default_mode).lower()
+    #
+    # if mode == 'distributed':
+    #     return distributed_mode(
+    #         redis_host=os.getenv('REDIS_HOST', '127.0.0.1'),
+    #         redis_port=int(os.getenv('REDIS_PORT', 6379)),
+    #         redis_password=os.getenv('REDIS_PASSWORD'),
+    #         project_name=os.getenv('PROJECT_NAME', 'crawlo'),
+    #         CONCURRENCY=int(os.getenv('CONCURRENCY', 16)),
+    #     )
+    # elif mode == 'auto':
+    #     return auto_mode(
+    #         CONCURRENCY=int(os.getenv('CONCURRENCY', 12)),
+    #     )
+    # else:  # standalone
+    #     return standalone_mode(
+    #         CONCURRENCY=int(os.getenv('CONCURRENCY', 8)),
+    #     )
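The queue and filter keys now follow a single crawlo:{project_name}:{component}:{name} scheme. A small illustrative helper (not part of the package) showing the two keys referenced above:

    def redis_keys(project_name: str) -> dict:
        # Illustrative only; the real keys are built inside the scheduler and filter components
        return {
            "requests_queue": f"crawlo:{project_name}:queue:requests",
            "request_fingerprints": f"crawlo:{project_name}:filter:fingerprint",
        }

    print(redis_keys("my_project"))
    # {'requests_queue': 'crawlo:my_project:queue:requests',
    #  'request_fingerprints': 'crawlo:my_project:filter:fingerprint'}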
crawlo/network/request.py CHANGED
@@ -76,7 +76,9 @@ class Request:
         'verify',
         'flags',
         '_json_body',
-        '_form_data'
+        '_form_data',
+        'use_dynamic_loader',
+        'dynamic_loader_options'
     )
 
     def __init__(
@@ -99,7 +101,10 @@
         auth: Optional[tuple] = None,
         verify: bool = True,
         flags: Optional[List[str]] = None,
-        encoding: str = 'utf-8'
+        encoding: str = 'utf-8',
+        # Dynamic-loading parameters
+        use_dynamic_loader: bool = False,
+        dynamic_loader_options: Optional[Dict[str, Any]] = None
     ):
         """
         Initialize the request object.
@@ -145,6 +150,10 @@
         # Keep the high-level semantic parameters (used for copy)
         self._json_body = json_body
         self._form_data = form_data
+
+        # Dynamic-loading attributes
+        self.use_dynamic_loader = use_dynamic_loader
+        self.dynamic_loader_options = dynamic_loader_options or {}
 
         # Build the body
         if json_body is not None:
@@ -228,7 +237,9 @@
             auth=self.auth,
             verify=self.verify,
             flags=self.flags.copy(),
-            encoding=self.encoding
+            encoding=self.encoding,
+            use_dynamic_loader=self.use_dynamic_loader,
+            dynamic_loader_options=deepcopy(self.dynamic_loader_options)
         )
 
     def set_meta(self, key: str, value: Any) -> 'Request':
@@ -267,6 +278,22 @@
         if flag in self.flags:
             self.flags.remove(flag)
         return self
+
+    def set_dynamic_loader(self, use_dynamic: bool = True, options: Optional[Dict[str, Any]] = None) -> 'Request':
+        """Enable the dynamic loader; supports chained calls."""
+        self.use_dynamic_loader = use_dynamic
+        if options:
+            self.dynamic_loader_options = options
+        # Also set a flag in meta for the hybrid downloader
+        self._meta['use_dynamic_loader'] = use_dynamic
+        return self
+
+    def set_protocol_loader(self) -> 'Request':
+        """Force the protocol loader; supports chained calls."""
+        self.use_dynamic_loader = False
+        self._meta['use_dynamic_loader'] = False
+        self._meta['use_protocol_loader'] = True
+        return self
 
     def _set_url(self, url: str) -> None:
         """Safely set the URL and make sure the format is correct."""
crawlo/network/response.py CHANGED
@@ -196,50 +196,139 @@ class Response:
         """Query the document with a CSS selector."""
         return self._selector.css(query)
 
-    def xpath_text(self, query: str) -> str:
-        """Extract plain text with XPath."""
-        fragments = self.xpath(f"{query}//text()").getall()
-        return " ".join(text.strip() for text in fragments if text.strip())
+    def _is_xpath(self, query: str) -> bool:
+        """Determine whether the query string is an XPath expression."""
+        return query.startswith(('/', '//', './'))
 
-    def css_text(self, query: str) -> str:
-        """Extract plain text with a CSS selector."""
-        fragments = self.css(f"{query} ::text").getall()
-        return " ".join(text.strip() for text in fragments if text.strip())
-
-    def get_text(self, xpath_or_css: str, join_str: str = " ") -> str:
+    def _extract_text_from_elements(self, elements: SelectorList, join_str: str = " ") -> str:
+        """
+        Extract and join the text of a list of elements.
+
+        :param elements: SelectorList of elements
+        :param join_str: separator used to join text fragments
+        :return: the joined text
+        """
+        texts = []
+        for element in elements:
+            # Collect all text nodes of the element
+            if hasattr(element, 'xpath'):
+                element_texts = element.xpath('.//text()').getall()
+            else:
+                element_texts = [str(element)]
+            # Clean and keep non-empty text
+            for text in element_texts:
+                cleaned = text.strip()
+                if cleaned:
+                    texts.append(cleaned)
+        return join_str.join(texts)
+
+    def extract_text(self, xpath_or_css: str, join_str: str = " ", default: str = '') -> str:
         """
-        Get the plain text of the specified node (child-node text is joined automatically)
+        Extract the text content of a single element; supports CSS and XPath selectors
 
        Parameters:
            xpath_or_css: XPath or CSS selector
            join_str: separator used to join text (defaults to a space)
+           default: value returned when no element is found
 
        Returns:
            the joined plain-text string
        """
-        elements = self.xpath(xpath_or_css) if xpath_or_css.startswith(('/', '//', './')) else self.css(xpath_or_css)
-        texts = elements.xpath('.//text()').getall()
-        return join_str.join(text.strip() for text in texts if text.strip())
+        try:
+            elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
+            if not elements:
+                return default
+            return self._extract_text_from_elements(elements, join_str)
+        except Exception:
+            return default
 
-    def get_all_text(self, xpath_or_css: str, join_str: str = " ") -> List[str]:
+    def extract_texts(self, xpath_or_css: str, join_str: str = " ", default: List[str] = None) -> List[str]:
         """
-        Get the plain text of multiple nodes as a list
+        Extract the text content of multiple elements as a list; supports CSS and XPath selectors
 
        Parameters:
            xpath_or_css: XPath or CSS selector
            join_str: separator used to join text within a single node
+           default: value returned when no element is found
 
        Returns:
            list of plain-text strings (one entry per node)
        """
-        elements = self.xpath(xpath_or_css) if xpath_or_css.startswith(('/', '//', './')) else self.css(xpath_or_css)
-        result = []
-        for element in elements:
-            texts = element.xpath('.//text()').getall()
-            clean_text = join_str.join(text.strip() for text in texts if text.strip())
-            if clean_text:
-                result.append(clean_text)
-        return result
+        if default is None:
+            default = []
+
+        try:
+            elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
+            if not elements:
+                return default
+
+            result = []
+            for element in elements:
+                # Extract the text of each element
+                if hasattr(element, 'xpath'):
+                    texts = element.xpath('.//text()').getall()
+                else:
+                    texts = [str(element)]
+
+                # Clean the text fragments and join them
+                clean_texts = [text.strip() for text in texts if text.strip()]
+                if clean_texts:
+                    result.append(join_str.join(clean_texts))
+
+            return result if result else default
+        except Exception:
+            return default
+
+    def extract_attr(self, xpath_or_css: str, attr_name: str, default: Any = None) -> Any:
+        """
+        Extract an attribute value from a single element; supports CSS and XPath selectors
+
+        Parameters:
+            xpath_or_css: XPath or CSS selector
+            attr_name: attribute name
+            default: default return value
+
+        Returns:
+            the attribute value, or the default
+        """
+        try:
+            elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
+            if not elements:
+                return default
+            return elements.attrib.get(attr_name, default)
+        except Exception:
+            return default
+
+    def extract_attrs(self, xpath_or_css: str, attr_name: str, default: List[Any] = None) -> List[Any]:
+        """
+        Extract attribute values from multiple elements as a list; supports CSS and XPath selectors
+
+        Parameters:
+            xpath_or_css: XPath or CSS selector
+            attr_name: attribute name
+            default: default return value
+
+        Returns:
+            list of attribute values
+        """
+        if default is None:
+            default = []
+
+        try:
+            elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
+            if not elements:
+                return default
+
+            result = []
+            for element in elements:
+                if hasattr(element, 'attrib'):
+                    attr_value = element.attrib.get(attr_name)
+                    if attr_value is not None:
+                        result.append(attr_value)
+
+            return result if result else default
+        except Exception:
+            return default
 
     def re_search(self, pattern: str, flags: int = re.DOTALL) -> Optional[re.Match]:
         """Run a regular-expression search on the response text."""
@@ -268,4 +357,4 @@ class Response:
         return self.request.meta if self.request else {}
 
     def __str__(self):
-        return f"<{self.status_code} {self.url}>"
+        return f"<{self.status_code} {self.url}>"
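A usage sketch for the new extract_* helpers, assumed to run inside a spider's parse callback where response is the object above; selectors and field names are illustrative. Both CSS and XPath are accepted (XPath is detected by a leading '/', '//' or './'), and the default is returned instead of raising when nothing matches:

    def parse(self, response):
        title = response.extract_text("h1.title", default="")           # single node, CSS
        rows = response.extract_texts("//table//tr", join_str=" | ")    # one string per node, XPath
        next_url = response.extract_attr("a.next", "href")              # single attribute value
        links = response.extract_attrs("//a", "href", default=[])       # list of attribute values
        yield {"title": title, "rows": rows, "next": next_url, "links": links}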
crawlo/pipelines/mongo_pipeline.py CHANGED
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-from typing import Optional
+from typing import Optional, List, Dict
 from motor.motor_asyncio import AsyncIOMotorClient
 from pymongo.errors import PyMongoError
 from crawlo.utils.log import get_logger
@@ -21,6 +21,17 @@ class MongoPipeline:
         self.mongo_uri = self.settings.get('MONGO_URI', 'mongodb://localhost:27017')
         self.db_name = self.settings.get('MONGO_DATABASE', 'scrapy_db')
         self.collection_name = self.settings.get('MONGO_COLLECTION', crawler.spider.name)
+
+        # Connection pool configuration
+        self.max_pool_size = self.settings.getint('MONGO_MAX_POOL_SIZE', 100)
+        self.min_pool_size = self.settings.getint('MONGO_MIN_POOL_SIZE', 10)
+        self.connect_timeout_ms = self.settings.getint('MONGO_CONNECT_TIMEOUT_MS', 5000)
+        self.socket_timeout_ms = self.settings.getint('MONGO_SOCKET_TIMEOUT_MS', 30000)
+
+        # Batch-insert configuration
+        self.batch_size = self.settings.getint('MONGO_BATCH_SIZE', 100)
+        self.use_batch = self.settings.getbool('MONGO_USE_BATCH', False)
+        self.batch_buffer: List[Dict] = []  # batch buffer
 
         # Register the close event
         crawler.subscriber.subscribe(self.spider_closed, event='spider_closed')
@@ -32,86 +43,90 @@ class MongoPipeline:
     async def _ensure_connection(self):
         """Make sure the connection has been established."""
         if self.client is None:
-            self.client = AsyncIOMotorClient(self.mongo_uri)
+            # Create the client using the connection pool configuration
+            self.client = AsyncIOMotorClient(
+                self.mongo_uri,
+                maxPoolSize=self.max_pool_size,
+                minPoolSize=self.min_pool_size,
+                connectTimeoutMS=self.connect_timeout_ms,
+                socketTimeoutMS=self.socket_timeout_ms
+            )
             self.db = self.client[self.db_name]
             self.collection = self.db[self.collection_name]
             self.logger.info(f"MongoDB connection established (collection: {self.collection_name})")
 
     async def process_item(self, item, spider) -> Optional[dict]:
-        """Core item-processing method."""
-        try:
-            await self._ensure_connection()
-
-            item_dict = dict(item)
-            result = await self.collection.insert_one(item_dict)
-
-            # Count the insert
-            self.crawler.stats.inc_value('mongodb/inserted')
-            self.logger.debug(f"Inserted document ID: {result.inserted_id}")
-
+        """Core item-processing method (with retry)."""
+        # If batch insert is enabled, add the item to the buffer
+        if self.use_batch:
+            self.batch_buffer.append(dict(item))
+
+            # Flush when the buffer reaches the batch size
+            if len(self.batch_buffer) >= self.batch_size:
+                await self._flush_batch(spider)
+
             return item
+        else:
+            # Single-insert path
+            try:
+                await self._ensure_connection()
+
+                item_dict = dict(item)
+
+                # Insert with retries
+                for attempt in range(3):
+                    try:
+                        result = await self.collection.insert_one(item_dict)
+                        # Use the unified insert_success stats key
+                        self.crawler.stats.inc_value('mongodb/insert_success')
+                        self.logger.debug(f"Insert succeeded [attempt {attempt + 1}]: {result.inserted_id}")
+                        return item
+                    except PyMongoError as e:
+                        if attempt == 2:  # still failing on the last attempt
+                            raise
+                        self.logger.warning(f"Retrying insert [attempt {attempt + 1}]: {e}")
+
+            except Exception as e:
+                # Use the unified insert_failed stats key
+                self.crawler.stats.inc_value('mongodb/insert_failed')
+                self.logger.error(f"MongoDB operation finally failed: {e}")
+                raise ItemDiscard(f"MongoDB operation failed: {e}")
+
+    async def _flush_batch(self, spider):
+        """Flush the batch buffer and perform a bulk insert."""
+        if not self.batch_buffer:
+            return
 
-        except Exception as e:
-            self.crawler.stats.inc_value('mongodb/failed')
-            self.logger.error(f"MongoDB insert failed: {e}")
-            raise ItemDiscard(f"MongoDB operation failed: {e}")
-
-    async def spider_closed(self):
-        """Clean up resources when the spider closes."""
-        if self.client:
-            self.client.close()
-            self.logger.info("MongoDB connection closed")
-
-
-class MongoPoolPipeline:
-    def __init__(self, crawler):
-        self.crawler = crawler
-        self.settings = crawler.settings
-        self.logger = get_logger(self.__class__.__name__, self.settings.get('LOG_LEVEL'))
-
-        # Connection pool configuration
-        self.client = AsyncIOMotorClient(
-            self.settings.get('MONGO_URI', 'mongodb://localhost:27017'),
-            maxPoolSize=self.settings.getint('MONGO_MAX_POOL_SIZE', 100),
-            minPoolSize=self.settings.getint('MONGO_MIN_POOL_SIZE', 10),
-            connectTimeoutMS=5000,
-            socketTimeoutMS=30000
-        )
-
-        self.db = self.client[self.settings.get('MONGO_DATABASE', 'scrapy_db')]
-        self.collection = self.db[self.settings.get('MONGO_COLLECTION', crawler.spider.name)]
-
-        crawler.subscriber.subscribe(self.spider_closed, event='spider_closed')
-        self.logger.info(f"MongoDB connection pool initialized (collection: {self.collection.name})")
-
-    @classmethod
-    def create_instance(cls, crawler):
-        return cls(crawler)
-
-    async def process_item(self, item, spider) -> Optional[dict]:
-        """Item-processing method (with retry)."""
         try:
-            item_dict = dict(item)
+            await self._ensure_connection()
 
-            # Insert with retries
+            # Bulk insert with retries
             for attempt in range(3):
                 try:
-                    result = await self.collection.insert_one(item_dict)
-                    self.crawler.stats.inc_value('mongodb/insert_success')
-                    self.logger.debug(f"Insert succeeded [attempt {attempt + 1}]: {result.inserted_id}")
-                    return item
+                    result = await self.collection.insert_many(self.batch_buffer, ordered=False)
+                    # Use the unified insert_success stats key
+                    inserted_count = len(result.inserted_ids)
+                    self.crawler.stats.inc_value('mongodb/insert_success', inserted_count)
+                    self.logger.debug(f"Bulk insert succeeded [attempt {attempt + 1}]: {inserted_count} records")
+                    self.batch_buffer.clear()
+                    return
                 except PyMongoError as e:
                     if attempt == 2:  # still failing on the last attempt
                         raise
-                    self.logger.warning(f"Retrying insert [attempt {attempt + 1}]: {e}")
-
+                    self.logger.warning(f"Retrying bulk insert [attempt {attempt + 1}]: {e}")
         except Exception as e:
-            self.crawler.stats.inc_value('mongodb/insert_failed')
-            self.logger.error(f"MongoDB operation finally failed: {e}")
-            raise ItemDiscard(f"MongoDB operation failed: {e}")
+            # Use the unified insert_failed stats key
+            failed_count = len(self.batch_buffer)
+            self.crawler.stats.inc_value('mongodb/insert_failed', failed_count)
+            self.logger.error(f"MongoDB bulk insert finally failed: {e}")
+            raise ItemDiscard(f"MongoDB bulk insert failed: {e}")
 
     async def spider_closed(self):
-        """Resource cleanup."""
-        if hasattr(self, 'client'):
+        """Clean up resources when the spider closes."""
+        # Flush any remaining batched data before closing
+        if self.use_batch and self.batch_buffer:
+            await self._flush_batch(self.crawler.spider)
+
+        if self.client:
             self.client.close()
-            self.logger.info("MongoDB connection pool released")
+            self.logger.info("MongoDB connection closed")
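The batch path is opt-in via the settings read in __init__ above. A settings sketch with example values, using only keys that appear in the diff:

    # settings.py sketch for MongoPipeline; values are examples
    MONGO_URI = "mongodb://localhost:27017"
    MONGO_DATABASE = "scrapy_db"
    MONGO_COLLECTION = "my_items"

    # Connection pool
    MONGO_MAX_POOL_SIZE = 100
    MONGO_MIN_POOL_SIZE = 10
    MONGO_CONNECT_TIMEOUT_MS = 5000
    MONGO_SOCKET_TIMEOUT_MS = 30000

    # Buffer items and call insert_many() every 200 items instead of insert_one() per item
    MONGO_USE_BATCH = True
    MONGO_BATCH_SIZE = 200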