crawlo-1.4.6-py3-none-any.whl → crawlo-1.4.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.
Files changed (162)
  1. crawlo/__init__.py +2 -1
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +2 -2
  4. crawlo/commands/check.py +1 -1
  5. crawlo/commands/help.py +5 -3
  6. crawlo/commands/list.py +1 -1
  7. crawlo/commands/run.py +49 -11
  8. crawlo/commands/stats.py +1 -1
  9. crawlo/config.py +12 -4
  10. crawlo/config_validator.py +1 -1
  11. crawlo/core/engine.py +20 -7
  12. crawlo/core/processor.py +1 -1
  13. crawlo/core/scheduler.py +4 -5
  14. crawlo/crawler.py +51 -10
  15. crawlo/downloader/__init__.py +7 -3
  16. crawlo/downloader/aiohttp_downloader.py +18 -18
  17. crawlo/downloader/cffi_downloader.py +5 -2
  18. crawlo/downloader/httpx_downloader.py +9 -3
  19. crawlo/downloader/hybrid_downloader.py +2 -2
  20. crawlo/downloader/playwright_downloader.py +38 -15
  21. crawlo/downloader/selenium_downloader.py +16 -2
  22. crawlo/event.py +42 -8
  23. crawlo/exceptions.py +157 -24
  24. crawlo/extension/__init__.py +10 -9
  25. crawlo/extension/health_check.py +7 -7
  26. crawlo/extension/log_interval.py +6 -6
  27. crawlo/extension/log_stats.py +2 -2
  28. crawlo/extension/logging_extension.py +4 -12
  29. crawlo/extension/memory_monitor.py +5 -5
  30. crawlo/extension/performance_profiler.py +5 -5
  31. crawlo/extension/request_recorder.py +6 -6
  32. crawlo/factories/base.py +1 -1
  33. crawlo/factories/crawler.py +61 -60
  34. crawlo/factories/utils.py +135 -0
  35. crawlo/filters/__init__.py +19 -2
  36. crawlo/filters/aioredis_filter.py +133 -49
  37. crawlo/filters/memory_filter.py +6 -21
  38. crawlo/framework.py +22 -8
  39. crawlo/initialization/built_in.py +24 -67
  40. crawlo/initialization/core.py +65 -19
  41. crawlo/initialization/phases.py +83 -2
  42. crawlo/initialization/registry.py +5 -7
  43. crawlo/initialization/utils.py +49 -0
  44. crawlo/logging/__init__.py +6 -10
  45. crawlo/logging/config.py +106 -22
  46. crawlo/logging/factory.py +12 -8
  47. crawlo/logging/manager.py +19 -27
  48. crawlo/middleware/__init__.py +72 -9
  49. crawlo/middleware/default_header.py +2 -2
  50. crawlo/middleware/download_delay.py +2 -2
  51. crawlo/middleware/middleware_manager.py +6 -6
  52. crawlo/middleware/offsite.py +2 -2
  53. crawlo/middleware/proxy.py +2 -2
  54. crawlo/middleware/request_ignore.py +4 -4
  55. crawlo/middleware/response_code.py +2 -2
  56. crawlo/middleware/response_filter.py +2 -2
  57. crawlo/middleware/retry.py +1 -1
  58. crawlo/mode_manager.py +38 -4
  59. crawlo/network/request.py +54 -26
  60. crawlo/network/response.py +69 -135
  61. crawlo/pipelines/__init__.py +40 -9
  62. crawlo/pipelines/base_pipeline.py +452 -0
  63. crawlo/pipelines/bloom_dedup_pipeline.py +4 -5
  64. crawlo/pipelines/console_pipeline.py +2 -2
  65. crawlo/pipelines/csv_pipeline.py +4 -4
  66. crawlo/pipelines/database_dedup_pipeline.py +4 -5
  67. crawlo/pipelines/json_pipeline.py +4 -4
  68. crawlo/pipelines/memory_dedup_pipeline.py +4 -5
  69. crawlo/pipelines/mongo_pipeline.py +23 -14
  70. crawlo/pipelines/mysql_pipeline.py +31 -39
  71. crawlo/pipelines/pipeline_manager.py +8 -8
  72. crawlo/pipelines/redis_dedup_pipeline.py +13 -14
  73. crawlo/project.py +1 -1
  74. crawlo/queue/__init__.py +10 -0
  75. crawlo/queue/queue_manager.py +79 -13
  76. crawlo/queue/redis_priority_queue.py +196 -47
  77. crawlo/settings/default_settings.py +16 -6
  78. crawlo/spider/__init__.py +6 -5
  79. crawlo/stats_collector.py +2 -2
  80. crawlo/task_manager.py +1 -1
  81. crawlo/templates/crawlo.cfg.tmpl +3 -3
  82. crawlo/templates/project/__init__.py.tmpl +1 -3
  83. crawlo/templates/project/items.py.tmpl +2 -6
  84. crawlo/templates/project/middlewares.py.tmpl +1 -1
  85. crawlo/templates/project/pipelines.py.tmpl +1 -2
  86. crawlo/templates/project/settings.py.tmpl +12 -10
  87. crawlo/templates/project/settings_distributed.py.tmpl +14 -13
  88. crawlo/templates/project/settings_gentle.py.tmpl +21 -23
  89. crawlo/templates/project/settings_high_performance.py.tmpl +21 -23
  90. crawlo/templates/project/settings_minimal.py.tmpl +10 -8
  91. crawlo/templates/project/settings_simple.py.tmpl +21 -23
  92. crawlo/templates/run.py.tmpl +1 -1
  93. crawlo/templates/spider/spider.py.tmpl +4 -12
  94. crawlo/templates/spiders_init.py.tmpl +3 -8
  95. crawlo/tools/__init__.py +0 -103
  96. crawlo/tools/scenario_adapter.py +1 -1
  97. crawlo/utils/__init__.py +25 -1
  98. crawlo/utils/batch_processor.py +23 -6
  99. crawlo/utils/config_manager.py +442 -0
  100. crawlo/utils/controlled_spider_mixin.py +1 -1
  101. crawlo/utils/db_helper.py +1 -1
  102. crawlo/utils/encoding_helper.py +190 -0
  103. crawlo/utils/error_handler.py +2 -2
  104. crawlo/utils/large_scale_helper.py +1 -1
  105. crawlo/utils/leak_detector.py +335 -0
  106. crawlo/utils/mongo_connection_pool.py +157 -0
  107. crawlo/utils/mysql_connection_pool.py +197 -0
  108. crawlo/utils/performance_monitor.py +1 -1
  109. crawlo/utils/redis_checker.py +91 -0
  110. crawlo/utils/redis_connection_pool.py +260 -70
  111. crawlo/utils/redis_key_validator.py +1 -1
  112. crawlo/utils/request.py +24 -2
  113. crawlo/utils/request_serializer.py +1 -1
  114. crawlo/utils/resource_manager.py +337 -0
  115. crawlo/utils/response_helper.py +113 -0
  116. crawlo/utils/selector_helper.py +3 -2
  117. crawlo/utils/singleton.py +70 -0
  118. crawlo/utils/spider_loader.py +1 -1
  119. crawlo/utils/text_helper.py +1 -1
  120. crawlo-1.4.8.dist-info/METADATA +831 -0
  121. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/RECORD +131 -145
  122. tests/advanced_tools_example.py +10 -68
  123. tests/distributed_dedup_test.py +467 -0
  124. tests/monitor_redis_dedup.sh +72 -0
  125. tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
  126. tests/simple_cli_test.py +55 -0
  127. tests/test_cli_arguments.py +119 -0
  128. tests/test_dedup_fix.py +10 -10
  129. crawlo/logging/async_handler.py +0 -181
  130. crawlo/logging/monitor.py +0 -153
  131. crawlo/logging/sampler.py +0 -167
  132. crawlo/tools/authenticated_proxy.py +0 -241
  133. crawlo/tools/data_formatter.py +0 -226
  134. crawlo/tools/data_validator.py +0 -181
  135. crawlo/tools/encoding_converter.py +0 -127
  136. crawlo/tools/network_diagnostic.py +0 -365
  137. crawlo/tools/request_tools.py +0 -83
  138. crawlo/tools/retry_mechanism.py +0 -224
  139. crawlo/utils/env_config.py +0 -143
  140. crawlo/utils/large_scale_config.py +0 -287
  141. crawlo/utils/log.py +0 -80
  142. crawlo/utils/system.py +0 -11
  143. crawlo/utils/tools.py +0 -5
  144. crawlo/utils/url.py +0 -40
  145. crawlo-1.4.6.dist-info/METADATA +0 -329
  146. tests/env_config_example.py +0 -134
  147. tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
  148. tests/test_authenticated_proxy.py +0 -142
  149. tests/test_comprehensive.py +0 -147
  150. tests/test_dynamic_downloaders_proxy.py +0 -125
  151. tests/test_dynamic_proxy.py +0 -93
  152. tests/test_dynamic_proxy_config.py +0 -147
  153. tests/test_dynamic_proxy_real.py +0 -110
  154. tests/test_env_config.py +0 -122
  155. tests/test_framework_env_usage.py +0 -104
  156. tests/test_large_scale_config.py +0 -113
  157. tests/test_proxy_api.py +0 -265
  158. tests/test_real_scenario_proxy.py +0 -196
  159. tests/tools_example.py +0 -261
  160. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
  161. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
  162. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
crawlo/network/response.py

@@ -11,12 +11,9 @@ HTTP Response wrapper module
  - Cookie handling
  """
  import re
- from http.cookies import SimpleCookie
- from typing import Dict, Any, List, Optional, Tuple
- from urllib.parse import urljoin as _urljoin, urlparse as _urlparse, urlsplit as _urlsplit, parse_qs as _parse_qs, \
-     urlencode as _urlencode, quote as _quote, unquote as _unquote, urldefrag as _urldefrag
-
  import ujson
+ from typing import Dict, Any, List, Optional
+ from urllib.parse import urljoin as _urljoin
  from parsel import Selector, SelectorList

  # Try to import the w3lib encoding-detection functions
@@ -31,6 +28,14 @@ try:
      W3LIB_AVAILABLE = True
  except ImportError:
      W3LIB_AVAILABLE = False
+     # When w3lib is unavailable, import the fallback functions from utils
+     from crawlo.utils import (
+         html_body_declared_encoding,
+         html_to_unicode,
+         http_content_type_encoding,
+         read_bom,
+         resolve_encoding,
+     )

  from crawlo.exceptions import DecodeError
  from crawlo.utils import (
@@ -38,7 +43,11 @@ from crawlo.utils import (
      extract_texts,
      extract_attr,
      extract_attrs,
-     is_xpath
+     is_xpath,
+     parse_cookies,
+     regex_search,
+     regex_findall,
+     get_header_value
  )


@@ -74,10 +83,10 @@ class Response:
          self,
          url: str,
          *,
-         headers: Dict[str, Any] = None,
+         headers: Optional[Dict[str, Any]] = None,
          body: bytes = b"",
          method: str = 'GET',
-         request: 'Request' = None,  # string annotation to avoid a circular import
+         request: Optional['Request'] = None,  # string annotation to avoid a circular import
          status_code: int = 200,
      ):
          # Basic attributes
@@ -106,7 +115,7 @@ class Response:

      def _determine_encoding(self) -> str:
          """
-         Smart response-encoding detection (modeled on Scrapy's design)
+         Smart response-encoding detection

          Encoding detection priority:
          1. Encoding specified on the Request
@@ -184,15 +193,19 @@ class Response:
      def _bom_encoding(self) -> Optional[str]:
          """BOM (byte order mark) encoding detection"""
          if not W3LIB_AVAILABLE:
-             return None
+             # Use the fallback function
+             encoding, _ = read_bom(self.body)
+             return encoding
          return read_bom(self.body)[0]

      @memoize_method_noargs
      def _headers_encoding(self) -> Optional[str]:
          """HTTP Content-Type header encoding detection"""
          if not W3LIB_AVAILABLE:
-             return None
-         content_type = self.headers.get(b"Content-Type", b"") or self.headers.get("content-type", b"")
+             # Use the fallback function
+             content_type = self.headers.get("content-type", "") or self.headers.get("Content-Type", "")
+             return http_content_type_encoding(content_type)
+         content_type = self.headers.get("Content-Type", b"") or self.headers.get("content-type", b"")
          if isinstance(content_type, bytes):
              content_type = content_type.decode('latin-1')
          return http_content_type_encoding(content_type)
@@ -201,23 +214,26 @@ class Response:
      def _body_declared_encoding(self) -> Optional[str]:
          """HTML meta tag declared-encoding detection"""
          if not W3LIB_AVAILABLE:
-             return None
+             # Use the fallback function
+             return html_body_declared_encoding(self.body)
          return html_body_declared_encoding(self.body)

      @memoize_method_noargs
      def _body_inferred_encoding(self) -> str:
          """Automatic encoding detection from the content"""
          if not W3LIB_AVAILABLE:
-             # Fall back to simple auto-detection
-             for enc in (self._DEFAULT_ENCODING, "utf-8", "cp1252"):
-                 try:
-                     self.body.decode(enc)
-                 except UnicodeError:
-                     continue
-                 return enc
-             return self._DEFAULT_ENCODING
-
-         content_type = self.headers.get(b"Content-Type", b"") or self.headers.get("content-type", b"")
+             # Use the fallback functions
+             content_type = self.headers.get("content-type", "") or self.headers.get("Content-Type", "")
+             # Detect the encoding with html_to_unicode
+             encoding, _ = html_to_unicode(
+                 content_type,
+                 self.body,
+                 auto_detect_fun=self._auto_detect_fun,
+                 default_encoding=self._DEFAULT_ENCODING,
+             )
+             return encoding
+
+         content_type = self.headers.get("Content-Type", b"") or self.headers.get("content-type", b"")
          if isinstance(content_type, bytes):
              content_type = content_type.decode('latin-1')

@@ -233,6 +249,13 @@ class Response:
      def _auto_detect_fun(self, text: bytes) -> Optional[str]:
          """Callback function for automatic encoding detection"""
          if not W3LIB_AVAILABLE:
+             # Use the fallback functions
+             for enc in (self._DEFAULT_ENCODING, "utf-8", "cp1252"):
+                 try:
+                     text.decode(enc)
+                 except UnicodeError:
+                     continue
+                 return resolve_encoding(enc)
              return None
          for enc in (self._DEFAULT_ENCODING, "utf-8", "cp1252"):
              try:
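For orientation, the private helpers touched above feed one fallback chain that picks the response encoding. A minimal sketch of how such a chain can be composed, assuming the priority order hinted at by the docstring (request encoding first, then headers, HTML meta, BOM, and finally content sniffing) — the composition itself is not shown in this diff, and the request "encoding" attribute is an assumption:

    def determine_encoding(resp) -> str:
        # Each helper returns None when it cannot decide, so `or` walks the
        # priority chain; the inferred encoding acts as the final fallback.
        return (
            getattr(resp.request, "encoding", None)   # encoding set on the Request (assumed attribute)
            or resp._headers_encoding()               # Content-Type header
            or resp._body_declared_encoding()         # HTML <meta> declaration
            or resp._bom_encoding()                   # byte order mark
            or resp._body_inferred_encoding()         # sniffed from the body
        )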
@@ -255,7 +278,7 @@ class Response:
          # If available, use w3lib for more accurate decoding
          if W3LIB_AVAILABLE:
              try:
-                 content_type = self.headers.get(b"Content-Type", b"") or self.headers.get("content-type", b"")
+                 content_type = self.headers.get("Content-Type", b"") or self.headers.get("content-type", b"")
                  if isinstance(content_type, bytes):
                      content_type = content_type.decode('latin-1')

@@ -269,6 +292,20 @@ class Response:
              except Exception:
                  # If w3lib decoding fails, fall back to the original approach
                  pass
+         else:
+             # Use the fallback functions
+             try:
+                 content_type = self.headers.get("content-type", "") or self.headers.get("Content-Type", "")
+                 # Decode with html_to_unicode
+                 _, self._text_cache = html_to_unicode(
+                     content_type,
+                     self.body,
+                     default_encoding=self.encoding
+                 )
+                 return self._text_cache
+             except Exception:
+                 # If decoding fails, fall back to the original approach
+                 pass

          # Try multiple encodings
          encodings_to_try = [self.encoding]
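As a usage illustration (not taken from the package's tests), the constructor and decoding paths above can be exercised roughly like this; the exact charset handling depends on whether w3lib is installed:

    from crawlo.network.response import Response

    resp = Response(
        "https://example.com/",
        headers={"Content-Type": "text/html; charset=gbk"},
        body="<html><body>你好</body></html>".encode("gbk"),
        status_code=200,
    )
    print(resp.encoding)  # expected to resolve to a GBK-compatible codec
    print(resp.text)      # decoded unicode body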
@@ -323,12 +360,12 @@ class Response:
      @property
      def content_type(self) -> str:
          """Get the response Content-Type"""
-         return self.headers.get('content-type', '') or self.headers.get('Content-Type', '')
+         return get_header_value(self.headers, 'content-type', '')

      @property
      def content_length(self) -> Optional[int]:
          """Get the response Content-Length"""
-         length = self.headers.get('content-length') or self.headers.get('Content-Length')
+         length = get_header_value(self.headers, 'content-length')
          return int(length) if length else None

      # ==================== JSON handling methods ====================
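The new get_header_value helper lives in crawlo.utils and is not part of this diff; a hypothetical equivalent of the dual-case lookups it replaces could look like this (illustrative sketch only):

    from typing import Any, Dict, Optional

    def get_header_value(headers: Dict[str, Any], name: str, default: Optional[str] = None) -> Any:
        """Case-insensitive header lookup (sketch, not the package's implementation)."""
        for key, value in headers.items():
            lowered = key.decode("latin-1").lower() if isinstance(key, bytes) else key.lower()
            if lowered == name.lower():
                return value
        return default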
@@ -352,103 +389,8 @@ class Response:
          """Join a URL, automatically handling relative paths."""
          return _urljoin(self.url, url)

-     def urlparse(self, url: str = None) -> Tuple:
-         """
-         Parse a URL into a component tuple (scheme, netloc, path, params, query, fragment)
-
-         Args:
-             url (str, optional): URL to parse; defaults to the response URL
-
-         Returns:
-             tuple: URL component tuple
-         """
-         target_url = url if url is not None else self.url
-         return _urlparse(target_url)
-
-     def urlsplit(self, url: str = None) -> Tuple:
-         """
-         Parse a URL into a component tuple (scheme, netloc, path, query, fragment)
-
-         Args:
-             url (str, optional): URL to parse; defaults to the response URL
-
-         Returns:
-             tuple: URL component tuple (without params)
-         """
-         target_url = url if url is not None else self.url
-         return _urlsplit(target_url)
-
-     def parse_qs(self, query_string: str = None, keep_blank_values: bool = False) -> Dict[str, List[str]]:
-         """
-         Parse a query string into a dict
-
-         Args:
-             query_string (str, optional): query string; extracted from the URL by default
-             keep_blank_values (bool): whether to keep blank values
-
-         Returns:
-             dict: query-parameter dict
-         """
-         if query_string is None:
-             # Extract the query string from the URL
-             parsed = _urlparse(self.url)
-             query_string = parsed.query
-
-         return _parse_qs(query_string, keep_blank_values=keep_blank_values)
-
-     def urlencode(self, query: Dict[str, Any]) -> str:
-         """
-         Encode a dict as a query string
-
-         Args:
-             query (dict): query-parameter dict to encode
-
-         Returns:
-             str: encoded query string
-         """
-         return _urlencode(query)
-
-     def quote(self, string: str, safe: str = '/') -> str:
-         """
-         URL-encode a string
-
-         Args:
-             string (str): string to encode
-             safe (str): characters not to encode, '/' by default
-
-         Returns:
-             str: URL-encoded string
-         """
-         return _quote(string, safe=safe)
-
-     def unquote(self, string: str) -> str:
-         """
-         URL-decode a string
-
-         Args:
-             string (str): string to decode
-
-         Returns:
-             str: URL-decoded string
-         """
-         return _unquote(string)
-
-     def urldefrag(self, url: str = None) -> Tuple[str, str]:
-         """
-         Remove the fragment identifier from a URL
-
-         Args:
-             url (str, optional): URL to process; defaults to the response URL
-
-         Returns:
-             tuple: (URL without fragment, fragment)
-         """
-         target_url = url if url is not None else self.url
-         defrag_result = _urldefrag(target_url)
-         return (defrag_result.url, defrag_result.fragment)
-
      # ==================== Selector-related methods ====================
-
+
      @property
      def _selector(self) -> Selector:
          """Lazily loaded Selector instance"""
@@ -507,7 +449,7 @@ class Response:
          except Exception:
              return default

-     def extract_texts(self, xpath_or_css: str, join_str: str = " ", default: List[str] = None) -> List[str]:
+     def extract_texts(self, xpath_or_css: str, join_str: str = " ", default: Optional[List[str]] = None) -> List[str]:
          """
          Extract a list of text contents from multiple elements; supports CSS and XPath selectors

@@ -570,7 +512,7 @@ class Response:
          except Exception:
              return default

-     def extract_attrs(self, xpath_or_css: str, attr_name: str, default: List[Any] = None) -> List[Any]:
+     def extract_attrs(self, xpath_or_css: str, attr_name: str, default: Optional[List[Any]] = None) -> List[Any]:
          """
          Extract a list of attribute values from multiple elements; supports CSS and XPath selectors

@@ -608,26 +550,18 @@ class Response:

      def re_search(self, pattern: str, flags: int = re.DOTALL) -> Optional[re.Match]:
          """Run a regular-expression search over the response text."""
-         if not isinstance(pattern, str):
-             raise TypeError("Pattern must be a string")
-         return re.search(pattern, self.text, flags=flags)
+         return regex_search(pattern, self.text, flags)

      def re_findall(self, pattern: str, flags: int = re.DOTALL) -> List[Any]:
          """Run a regular-expression findall over the response text."""
-         if not isinstance(pattern, str):
-             raise TypeError("Pattern must be a string")
-         return re.findall(pattern, self.text, flags=flags)
+         return regex_findall(pattern, self.text, flags)

      # ==================== Cookie handling methods ====================

      def get_cookies(self) -> Dict[str, str]:
          """Parse and return cookies from the response headers."""
          cookie_header = self.headers.get("Set-Cookie", "")
-         if isinstance(cookie_header, list):
-             cookie_header = ", ".join(cookie_header)
-         cookies = SimpleCookie()
-         cookies.load(cookie_header)
-         return {key: morsel.value for key, morsel in cookies.items()}
+         return parse_cookies(cookie_header)

      # ==================== Request-related methods ====================

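parse_cookies now encapsulates the Set-Cookie handling that previously lived inline; judging from the removed lines, the helper presumably does something along these lines (sketch, the helper's real code is outside this diff):

    from http.cookies import SimpleCookie
    from typing import Dict, List, Union

    def parse_cookies(cookie_header: Union[str, List[str]]) -> Dict[str, str]:
        """Parse a Set-Cookie header (string or list of strings) into a name-to-value dict."""
        if isinstance(cookie_header, list):
            cookie_header = ", ".join(cookie_header)
        cookies = SimpleCookie()
        cookies.load(cookie_header)
        return {key: morsel.value for key, morsel in cookies.items()}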
crawlo/pipelines/__init__.py

@@ -1,17 +1,36 @@
  #!/usr/bin/python
  # -*- coding:UTF-8 -*-
- from crawlo.items import Item
+ """
+ Pipeline module
+ ===============

+ Pipeline hierarchy:
+ - BasePipeline: base abstract class defining the pipeline interface
+ - ResourceManagedPipeline: adds resource management (recommended)
+ - FileBasedPipeline/DatabasePipeline/CacheBasedPipeline: specialised base classes for specific scenarios

- class BasePipeline:
+ Built-in dedup pipelines:
+ - MemoryDedupPipeline: in-memory deduplication
+ - RedisDedupPipeline: Redis-based distributed deduplication
+ - BloomDedupPipeline: efficient Bloom-filter deduplication
+ - DatabaseDedupPipeline: database-backed deduplication

-     def process_item(self, item: Item, spider):
-         raise NotImplementedError
-
-     @classmethod
-     def create_instance(cls, crawler):
-         return cls()
+ Usage example:
+     # Configure in settings.py
+     PIPELINES = [
+         'crawlo.pipelines.RedisDedupPipeline',
+         'your_project.pipelines.MongoPipeline',
+     ]
+ """

+ # Import all base classes (from base_pipeline.py)
+ from .base_pipeline import (
+     BasePipeline,
+     ResourceManagedPipeline,
+     FileBasedPipeline,
+     DatabasePipeline,
+     CacheBasedPipeline
+ )

  # Export the dedup pipelines
  from .memory_dedup_pipeline import MemoryDedupPipeline
@@ -19,4 +38,16 @@ from .redis_dedup_pipeline import RedisDedupPipeline
  from .bloom_dedup_pipeline import BloomDedupPipeline
  from .database_dedup_pipeline import DatabaseDedupPipeline

- __all__ = ['BasePipeline', 'MemoryDedupPipeline', 'RedisDedupPipeline', 'BloomDedupPipeline', 'DatabaseDedupPipeline']
+ __all__ = [
+     # Base classes
+     'BasePipeline',
+     'ResourceManagedPipeline',
+     'FileBasedPipeline',
+     'DatabasePipeline',
+     'CacheBasedPipeline',
+     # Dedup pipelines
+     'MemoryDedupPipeline',
+     'RedisDedupPipeline',
+     'BloomDedupPipeline',
+     'DatabaseDedupPipeline'
+ ]
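To show how the re-exported base classes are meant to be wired up (a sketch following the docstring's usage example; MongoPipeline and its body are placeholders, not part of the package):

    # your_project/pipelines.py
    from crawlo.pipelines import BasePipeline

    class MongoPipeline(BasePipeline):
        """Hypothetical pipeline that would persist items to MongoDB."""

        def process_item(self, item, spider):
            # Store the item here; connection handling is omitted in this sketch.
            return item

    # settings.py
    PIPELINES = [
        'crawlo.pipelines.RedisDedupPipeline',
        'your_project.pipelines.MongoPipeline',
    ]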