crawlo 1.4.6__py3-none-any.whl → 1.4.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.

Files changed (162)
  1. crawlo/__init__.py +2 -1
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +2 -2
  4. crawlo/commands/check.py +1 -1
  5. crawlo/commands/help.py +5 -3
  6. crawlo/commands/list.py +1 -1
  7. crawlo/commands/run.py +49 -11
  8. crawlo/commands/stats.py +1 -1
  9. crawlo/config.py +12 -4
  10. crawlo/config_validator.py +1 -1
  11. crawlo/core/engine.py +20 -7
  12. crawlo/core/processor.py +1 -1
  13. crawlo/core/scheduler.py +4 -5
  14. crawlo/crawler.py +51 -10
  15. crawlo/downloader/__init__.py +7 -3
  16. crawlo/downloader/aiohttp_downloader.py +18 -18
  17. crawlo/downloader/cffi_downloader.py +5 -2
  18. crawlo/downloader/httpx_downloader.py +9 -3
  19. crawlo/downloader/hybrid_downloader.py +2 -2
  20. crawlo/downloader/playwright_downloader.py +38 -15
  21. crawlo/downloader/selenium_downloader.py +16 -2
  22. crawlo/event.py +42 -8
  23. crawlo/exceptions.py +157 -24
  24. crawlo/extension/__init__.py +10 -9
  25. crawlo/extension/health_check.py +7 -7
  26. crawlo/extension/log_interval.py +6 -6
  27. crawlo/extension/log_stats.py +2 -2
  28. crawlo/extension/logging_extension.py +4 -12
  29. crawlo/extension/memory_monitor.py +5 -5
  30. crawlo/extension/performance_profiler.py +5 -5
  31. crawlo/extension/request_recorder.py +6 -6
  32. crawlo/factories/base.py +1 -1
  33. crawlo/factories/crawler.py +61 -60
  34. crawlo/factories/utils.py +135 -0
  35. crawlo/filters/__init__.py +19 -2
  36. crawlo/filters/aioredis_filter.py +133 -49
  37. crawlo/filters/memory_filter.py +6 -21
  38. crawlo/framework.py +22 -8
  39. crawlo/initialization/built_in.py +24 -67
  40. crawlo/initialization/core.py +65 -19
  41. crawlo/initialization/phases.py +83 -2
  42. crawlo/initialization/registry.py +5 -7
  43. crawlo/initialization/utils.py +49 -0
  44. crawlo/logging/__init__.py +6 -10
  45. crawlo/logging/config.py +106 -22
  46. crawlo/logging/factory.py +12 -8
  47. crawlo/logging/manager.py +19 -27
  48. crawlo/middleware/__init__.py +72 -9
  49. crawlo/middleware/default_header.py +2 -2
  50. crawlo/middleware/download_delay.py +2 -2
  51. crawlo/middleware/middleware_manager.py +6 -6
  52. crawlo/middleware/offsite.py +2 -2
  53. crawlo/middleware/proxy.py +2 -2
  54. crawlo/middleware/request_ignore.py +4 -4
  55. crawlo/middleware/response_code.py +2 -2
  56. crawlo/middleware/response_filter.py +2 -2
  57. crawlo/middleware/retry.py +1 -1
  58. crawlo/mode_manager.py +38 -4
  59. crawlo/network/request.py +54 -26
  60. crawlo/network/response.py +69 -135
  61. crawlo/pipelines/__init__.py +40 -9
  62. crawlo/pipelines/base_pipeline.py +452 -0
  63. crawlo/pipelines/bloom_dedup_pipeline.py +4 -5
  64. crawlo/pipelines/console_pipeline.py +2 -2
  65. crawlo/pipelines/csv_pipeline.py +4 -4
  66. crawlo/pipelines/database_dedup_pipeline.py +4 -5
  67. crawlo/pipelines/json_pipeline.py +4 -4
  68. crawlo/pipelines/memory_dedup_pipeline.py +4 -5
  69. crawlo/pipelines/mongo_pipeline.py +23 -14
  70. crawlo/pipelines/mysql_pipeline.py +31 -39
  71. crawlo/pipelines/pipeline_manager.py +8 -8
  72. crawlo/pipelines/redis_dedup_pipeline.py +13 -14
  73. crawlo/project.py +1 -1
  74. crawlo/queue/__init__.py +10 -0
  75. crawlo/queue/queue_manager.py +79 -13
  76. crawlo/queue/redis_priority_queue.py +196 -47
  77. crawlo/settings/default_settings.py +16 -6
  78. crawlo/spider/__init__.py +6 -5
  79. crawlo/stats_collector.py +2 -2
  80. crawlo/task_manager.py +1 -1
  81. crawlo/templates/crawlo.cfg.tmpl +3 -3
  82. crawlo/templates/project/__init__.py.tmpl +1 -3
  83. crawlo/templates/project/items.py.tmpl +2 -6
  84. crawlo/templates/project/middlewares.py.tmpl +1 -1
  85. crawlo/templates/project/pipelines.py.tmpl +1 -2
  86. crawlo/templates/project/settings.py.tmpl +12 -10
  87. crawlo/templates/project/settings_distributed.py.tmpl +14 -13
  88. crawlo/templates/project/settings_gentle.py.tmpl +21 -23
  89. crawlo/templates/project/settings_high_performance.py.tmpl +21 -23
  90. crawlo/templates/project/settings_minimal.py.tmpl +10 -8
  91. crawlo/templates/project/settings_simple.py.tmpl +21 -23
  92. crawlo/templates/run.py.tmpl +1 -1
  93. crawlo/templates/spider/spider.py.tmpl +4 -12
  94. crawlo/templates/spiders_init.py.tmpl +3 -8
  95. crawlo/tools/__init__.py +0 -103
  96. crawlo/tools/scenario_adapter.py +1 -1
  97. crawlo/utils/__init__.py +25 -1
  98. crawlo/utils/batch_processor.py +23 -6
  99. crawlo/utils/config_manager.py +442 -0
  100. crawlo/utils/controlled_spider_mixin.py +1 -1
  101. crawlo/utils/db_helper.py +1 -1
  102. crawlo/utils/encoding_helper.py +190 -0
  103. crawlo/utils/error_handler.py +2 -2
  104. crawlo/utils/large_scale_helper.py +1 -1
  105. crawlo/utils/leak_detector.py +335 -0
  106. crawlo/utils/mongo_connection_pool.py +157 -0
  107. crawlo/utils/mysql_connection_pool.py +197 -0
  108. crawlo/utils/performance_monitor.py +1 -1
  109. crawlo/utils/redis_checker.py +91 -0
  110. crawlo/utils/redis_connection_pool.py +260 -70
  111. crawlo/utils/redis_key_validator.py +1 -1
  112. crawlo/utils/request.py +24 -2
  113. crawlo/utils/request_serializer.py +1 -1
  114. crawlo/utils/resource_manager.py +337 -0
  115. crawlo/utils/response_helper.py +113 -0
  116. crawlo/utils/selector_helper.py +3 -2
  117. crawlo/utils/singleton.py +70 -0
  118. crawlo/utils/spider_loader.py +1 -1
  119. crawlo/utils/text_helper.py +1 -1
  120. crawlo-1.4.8.dist-info/METADATA +831 -0
  121. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/RECORD +131 -145
  122. tests/advanced_tools_example.py +10 -68
  123. tests/distributed_dedup_test.py +467 -0
  124. tests/monitor_redis_dedup.sh +72 -0
  125. tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
  126. tests/simple_cli_test.py +55 -0
  127. tests/test_cli_arguments.py +119 -0
  128. tests/test_dedup_fix.py +10 -10
  129. crawlo/logging/async_handler.py +0 -181
  130. crawlo/logging/monitor.py +0 -153
  131. crawlo/logging/sampler.py +0 -167
  132. crawlo/tools/authenticated_proxy.py +0 -241
  133. crawlo/tools/data_formatter.py +0 -226
  134. crawlo/tools/data_validator.py +0 -181
  135. crawlo/tools/encoding_converter.py +0 -127
  136. crawlo/tools/network_diagnostic.py +0 -365
  137. crawlo/tools/request_tools.py +0 -83
  138. crawlo/tools/retry_mechanism.py +0 -224
  139. crawlo/utils/env_config.py +0 -143
  140. crawlo/utils/large_scale_config.py +0 -287
  141. crawlo/utils/log.py +0 -80
  142. crawlo/utils/system.py +0 -11
  143. crawlo/utils/tools.py +0 -5
  144. crawlo/utils/url.py +0 -40
  145. crawlo-1.4.6.dist-info/METADATA +0 -329
  146. tests/env_config_example.py +0 -134
  147. tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
  148. tests/test_authenticated_proxy.py +0 -142
  149. tests/test_comprehensive.py +0 -147
  150. tests/test_dynamic_downloaders_proxy.py +0 -125
  151. tests/test_dynamic_proxy.py +0 -93
  152. tests/test_dynamic_proxy_config.py +0 -147
  153. tests/test_dynamic_proxy_real.py +0 -110
  154. tests/test_env_config.py +0 -122
  155. tests/test_framework_env_usage.py +0 -104
  156. tests/test_large_scale_config.py +0 -113
  157. tests/test_proxy_api.py +0 -265
  158. tests/test_real_scenario_proxy.py +0 -196
  159. tests/tools_example.py +0 -261
  160. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
  161. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
  162. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
crawlo/downloader/cffi_downloader.py CHANGED
@@ -9,7 +9,7 @@ from curl_cffi.requests import AsyncSession
 
 from crawlo.network.response import Response
 from crawlo.downloader import DownloaderBase
-from crawlo.utils.log import get_logger
+from crawlo.logging import get_logger
 
 class CurlCffiDownloader(DownloaderBase):
     """
@@ -25,7 +25,7 @@ class CurlCffiDownloader(DownloaderBase):
         # Call the parent initializer so attributes such as _closed are set up correctly
         super().__init__(crawler)
 
-        self.logger = get_logger(self.__class__.__name__, crawler.settings.get('LOG_LEVEL'))
+        self.logger = get_logger(self.__class__.__name__)
         self._active_requests = set()
 
         # --- Basic configuration ---
@@ -237,6 +237,9 @@ class CurlCffiDownloader(DownloaderBase):
             self.logger.warning(f"关闭 curl-cffi 会话时出错: {e}")
         finally:
             self.session = None
+            # Clear the active-request tracking set
+            self._active_requests.clear()
+
         self.logger.debug("CurlCffiDownloader 已关闭")
 
     def idle(self) -> bool:
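
Every downloader in this release switches its logger import from crawlo.utils.log to crawlo.logging and drops the explicit LOG_LEVEL argument. Below is a minimal sketch of what that migration looks like in downstream code, assuming crawlo.logging.get_logger keeps the name-only signature shown above and resolves the level from the central logging configuration; the component class itself is purely illustrative.

from crawlo.logging import get_logger  # new import location in 1.4.8


class MyComponent:  # hypothetical user component, not part of crawlo
    def __init__(self, crawler):
        # 1.4.6 style: get_logger(self.__class__.__name__, crawler.settings.get('LOG_LEVEL'))
        # 1.4.8 style: the level argument is gone; only the logger name is passed
        self.logger = get_logger(self.__class__.__name__)

    def open(self):
        self.logger.debug("component opened")
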
crawlo/downloader/httpx_downloader.py CHANGED
@@ -6,7 +6,7 @@ from httpx import AsyncClient, Timeout, Limits
 
 from crawlo.network.response import Response
 from crawlo.downloader import DownloaderBase
-from crawlo.utils.log import get_logger
+from crawlo.logging import get_logger
 
 # Try to import httpx exceptions for more precise error handling
 try:
@@ -48,7 +48,7 @@ class HttpXDownloader(DownloaderBase):
         self._timeout: Optional[Timeout] = None
         self._limits: Optional[Limits] = None
         # --- Acquire a logger instance ---
-        self.logger = get_logger(self.__class__.__name__, crawler.settings.get("LOG_LEVEL"))
+        self.logger = get_logger(self.__class__.__name__)
 
     def open(self):
         super().open()
@@ -255,5 +255,11 @@ class HttpXDownloader(DownloaderBase):
         """Close the main client"""
         if self._client:
             self.logger.info("Closing HttpXDownloader client...")
-            await self._client.aclose()
+            try:
+                await self._client.aclose()
+            except Exception as e:
+                self.logger.warning(f"Error during client close: {e}")
+            finally:
+                self._client = None
+
         self.logger.debug("HttpXDownloader closed.")
crawlo/downloader/hybrid_downloader.py CHANGED
@@ -23,7 +23,7 @@ from urllib.parse import urlparse
 from crawlo.downloader import DownloaderBase
 from crawlo.network.request import Request
 from crawlo.network.response import Response
-from crawlo.utils.log import get_logger
+from crawlo.logging import get_logger
 
 # Import downloaders dynamically (avoids circular imports)
 try:
@@ -59,7 +59,7 @@ class HybridDownloader(DownloaderBase):
 
     def __init__(self, crawler):
         super().__init__(crawler)
-        self.logger = get_logger(self.__class__.__name__, crawler.settings.get("LOG_LEVEL"))
+        self.logger = get_logger(self.__class__.__name__)
 
         # Cache of downloader instances
         self._downloaders: Dict[str, DownloaderBase] = {}
crawlo/downloader/playwright_downloader.py CHANGED
@@ -23,7 +23,7 @@ from playwright.async_api import async_playwright, Playwright, Browser, Page, Br
 
 from crawlo.downloader import DownloaderBase
 from crawlo.network.response import Response
-from crawlo.utils.log import get_logger
+from crawlo.logging import get_logger
 
 
 class PlaywrightDownloader(DownloaderBase):
@@ -37,7 +37,7 @@ class PlaywrightDownloader(DownloaderBase):
         self.playwright: Optional[Playwright] = None
         self.browser: Optional[Browser] = None
         self.context: Optional[BrowserContext] = None
-        self.logger = get_logger(self.__class__.__name__, crawler.settings.get("LOG_LEVEL"))
+        self.logger = get_logger(self.__class__.__name__)
         self.default_timeout = crawler.settings.get_int("PLAYWRIGHT_TIMEOUT", 30000)  # milliseconds
         self.load_timeout = crawler.settings.get_int("PLAYWRIGHT_LOAD_TIMEOUT", 10000)  # milliseconds
         self.browser_type = crawler.settings.get("PLAYWRIGHT_BROWSER_TYPE", "chromium").lower()
@@ -319,25 +319,48 @@ class PlaywrightDownloader(DownloaderBase):
         """Close Playwright resources"""
         try:
             # Close all pages
-            for page in self._page_pool:
-                try:
-                    await page.close()
-                except:
-                    pass
-            self._page_pool.clear()
-            self._used_pages.clear()
+            if self._page_pool:
+                self.logger.debug(f"Closing {len(self._page_pool)} page(s)...")
+                for page in self._page_pool:
+                    try:
+                        await page.close()
+                    except Exception as e:
+                        self.logger.warning(f"Error closing page: {e}")
+
+                self._page_pool.clear()
+                self._used_pages.clear()
 
+            # Close the context
             if self.context:
-                await self.context.close()
+                try:
+                    await self.context.close()
+                except Exception as e:
+                    self.logger.warning(f"Error closing context: {e}")
+                finally:
+                    self.context = None
+
+            # Close the browser
             if self.browser:
-                await self.browser.close()
+                try:
+                    await self.browser.close()
+                except Exception as e:
+                    self.logger.warning(f"Error closing browser: {e}")
+                finally:
+                    self.browser = None
+
+            # Stop Playwright
             if self.playwright:
-                await self.playwright.stop()
-
+                try:
+                    await self.playwright.stop()
+                except Exception as e:
+                    self.logger.warning(f"Error stopping playwright: {e}")
+                finally:
+                    self.playwright = None
+
             self.logger.info("PlaywrightDownloader closed.")
         except Exception as e:
-            self.logger.warning(f"Error closing Playwright resources: {e}")
-        finally:
+            self.logger.error(f"Error during Playwright cleanup: {e}", exc_info=True)
+            # Make sure the resources are reset
             self.context = None
             self.browser = None
             self.playwright = None
crawlo/downloader/selenium_downloader.py CHANGED
@@ -31,7 +31,7 @@ from selenium.webdriver.support.ui import WebDriverWait
 
 from crawlo.downloader import DownloaderBase
 from crawlo.network.response import Response
-from crawlo.utils.log import get_logger
+from crawlo.logging import get_logger
 
 
 class SeleniumDownloader(DownloaderBase):
@@ -43,7 +43,7 @@ class SeleniumDownloader(DownloaderBase):
     def __init__(self, crawler):
        super().__init__(crawler)
        self.driver: Optional[webdriver.Chrome] = None
-        self.logger = get_logger(self.__class__.__name__, crawler.settings.get("LOG_LEVEL"))
+        self.logger = get_logger(self.__class__.__name__)
        self.default_timeout = crawler.settings.get_int("SELENIUM_TIMEOUT", 30)
        self.load_timeout = crawler.settings.get_int("SELENIUM_LOAD_TIMEOUT", 10)
        self.window_width = crawler.settings.get_int("SELENIUM_WINDOW_WIDTH", 1920)
@@ -465,9 +465,23 @@ class SeleniumDownloader(DownloaderBase):
         if self.driver:
             self.logger.info("Closing SeleniumDownloader driver...")
             try:
+                # Close all tabs
+                if self._window_handles:
+                    self.logger.debug(f"Closing {len(self._window_handles)} tab(s)...")
+                    for handle in self._window_handles[1:]:  # keep the first tab, close the rest
+                        try:
+                            self.driver.switch_to.window(handle)
+                            self.driver.close()
+                        except Exception as e:
+                            self.logger.warning(f"Error closing tab {handle}: {e}")
+
+                    self._window_handles.clear()
+
+                # Quit the browser
                 self.driver.quit()
             except Exception as e:
                 self.logger.warning(f"Error closing Selenium driver: {e}")
             finally:
                 self.driver = None
+
         self.logger.debug("SeleniumDownloader closed.")
crawlo/event.py CHANGED
@@ -1,11 +1,45 @@
 #!/usr/bin/python
 # -*- coding:UTF-8 -*-
+"""
+Crawlo event system
+===================
+Defines all event types used by the framework, with type safety and IDE auto-completion.
+"""
+from enum import Enum
 
-spider_error = "spider_error"
-spider_opened = "spider_open"
-spider_closed = "spider_closed"
-ignore_request = "ignore_request"
-request_scheduled = "request_scheduled"
-response_received = "request_received"
-item_successful = "item_successful"
-item_discard = "item_discard"
+
+class CrawlerEvent(str, Enum):
+    """
+    Crawler event enumeration.
+
+    All events should use this enum type in order to get:
+    - Type safety: avoids spelling mistakes
+    - IDE support: auto-completion and hints
+    - Documentation: all events managed in one place
+
+    Usage example:
+        >>> from crawlo.event import CrawlerEvent
+        >>> await subscriber.notify(CrawlerEvent.SPIDER_OPENED, spider)
+    """
+
+    # Spider lifecycle events
+    SPIDER_OPENED = "spider_opened"          # spider started
+    SPIDER_CLOSED = "spider_closed"          # spider closed
+    SPIDER_ERROR = "spider_error"            # spider error
+
+    # Request-related events
+    REQUEST_SCHEDULED = "request_scheduled"  # request scheduled
+    IGNORE_REQUEST = "ignore_request"        # request ignored
+
+    # Response-related events
+    RESPONSE_RECEIVED = "response_received"  # response received
+
+    # Item-related events
+    ITEM_SUCCESSFUL = "item_successful"      # item processed successfully
+    ITEM_DISCARD = "item_discard"            # item discarded
+
+
+# Export the public API
+__all__ = [
+    'CrawlerEvent',
+]
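
Because CrawlerEvent inherits from both str and Enum, each member still compares equal to its plain string value, so handlers keyed on the old string constants keep matching wherever the values are unchanged. Two of the old values were also corrected in passing (spider_opened was previously the string "spider_open" and response_received was previously "request_received"). A small illustration of that standard str-Enum behavior; the subscribe call shape is taken from the docstring above.

from crawlo.event import CrawlerEvent

# str-Enum members compare equal to their string values (standard Python behavior)
assert CrawlerEvent.SPIDER_CLOSED == "spider_closed"
assert CrawlerEvent.ITEM_DISCARD.value == "item_discard"

# The corrected values: code that compared against the old literals
# "spider_open" or "request_received" needs to be updated.
assert CrawlerEvent.SPIDER_OPENED == "spider_opened"
assert CrawlerEvent.RESPONSE_RECEIVED == "response_received"
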
crawlo/exceptions.py CHANGED
@@ -1,82 +1,215 @@
 #!/usr/bin/python
 # -*- coding:UTF-8 -*-
-class TransformTypeError(TypeError):
+"""
+Crawlo framework exception definitions
+======================================
+Provides a hierarchical exception system for unified handling and type safety.
+
+Exception hierarchy:
+    CrawloException (base exception)
+    ├── SpiderException (spider-related)
+    ├── ComponentInitException (component initialization)
+    ├── DataException (data processing)
+    ├── RequestException (request/response)
+    ├── OutputException (output)
+    └── ConfigException (configuration)
+
+Usage example:
+    >>> try:
+    ...     # code
+    ... except CrawloException as e:
+    ...     # catch all framework exceptions
+    ... except Exception as e:
+    ...     # other exceptions
+"""
+
+
+# ============= Base exception =============
+class CrawloException(Exception):
+    """Base exception of the Crawlo framework. All framework exceptions should inherit from it."""
+    pass
+
+
+# ============= Spider exceptions =============
+class SpiderException(CrawloException):
+    """Base class for spider-related exceptions."""
     pass
 
 
-class OutputError(Exception):
+class SpiderTypeError(SpiderException, TypeError):
+    """Spider type error. Raised when the spider type is not what was expected."""
     pass
 
 
-class SpiderTypeError(TypeError):
+class SpiderCreationError(SpiderException):
+    """Spider instantiation failure. Raised when a spider instance cannot be created."""
     pass
 
 
-class ItemInitError(Exception):
+# ============= Component initialization exceptions =============
+class ComponentInitException(CrawloException):
+    """Base class for component initialization exceptions."""
     pass
 
 
-class ItemAttributeError(Exception):
+class MiddlewareInitError(ComponentInitException):
+    """Middleware initialization failure."""
     pass
 
 
-class DecodeError(Exception):
+class PipelineInitError(ComponentInitException):
+    """Pipeline initialization failure."""
     pass
 
 
-class MiddlewareInitError(Exception):
+class ExtensionInitError(ComponentInitException):
+    """Extension initialization failure."""
     pass
 
 
-class PipelineInitError(Exception):
+# ============= Data-processing exceptions =============
+class DataException(CrawloException):
+    """Base class for data-processing exceptions."""
     pass
 
 
-class InvalidOutputError(Exception):
+class ItemInitError(DataException):
+    """Item initialization error. Raised when an Item instance cannot be created."""
     pass
 
 
-class RequestMethodError(Exception):
+class ItemAttributeError(DataException, AttributeError):
+    """Item attribute error. Raised when a nonexistent Item attribute is accessed."""
     pass
 
 
-class IgnoreRequestError(Exception):
+class ItemValidationError(DataException):
+    """Item field validation error. Raised when an Item field value violates its validation rules."""
+    pass
+
+
+class ItemDiscard(DataException):
+    """
+    Item discarded.
+
+    Note: this is not a real error but a flow-control signal indicating
+    that the Item should be dropped by a pipeline (e.g. duplicate data).
+    """
     def __init__(self, msg):
         self.msg = msg
-        super(IgnoreRequestError, self).__init__(msg)
+        super().__init__(msg)
+
+
+# ============= Request/response exceptions =============
+class RequestException(CrawloException):
+    """Base class for request exceptions."""
+    pass
 
 
-class ItemDiscard(Exception):
+class RequestMethodError(RequestException):
+    """Request method error. Raised when an unsupported HTTP method is used."""
+    pass
+
+
+class IgnoreRequestError(RequestException):
+    """
+    Request ignored.
+
+    Used for flow control to indicate that a request should be skipped.
+    """
     def __init__(self, msg):
         self.msg = msg
-        super(ItemDiscard, self).__init__(msg)
+        super().__init__(msg)
+
+
+class DecodeError(RequestException):
+    """Response decoding error. Raised when the response content cannot be decoded."""
+    pass
+
+
+# ============= Output exceptions =============
+class OutputException(CrawloException):
+    """Base class for output exceptions."""
+    pass
+
+
+class OutputError(OutputException):
+    """Output error. Raised when output processing fails."""
+    pass
 
 
-class NotConfigured(Exception):
+class InvalidOutputError(OutputException):
+    """Invalid output error. Raised when the output type or format is not what was expected."""
     pass
 
 
-class NotConfiguredError(Exception):
+# ============= Configuration exceptions =============
+class ConfigException(CrawloException):
+    """Base class for configuration exceptions."""
     pass
 
 
-class ExtensionInitError(Exception):
+class NotConfigured(ConfigException):
+    """Component not configured. Raised when required configuration is missing."""
     pass
 
 
-class ReceiverTypeError(Exception):
+class NotConfiguredError(ConfigException):
+    """Configuration error. Raised when a configuration value is invalid."""
     pass
 
 
-class SpiderCreationError(Exception):
-    """Spider instantiation failure"""
+# ============= Type exceptions =============
+class TransformTypeError(CrawloException, TypeError):
+    """Transform type error. Raised when a data transformation receives a mismatched type."""
     pass
 
 
-class ItemValidationError(Exception):
-    """Item field validation error"""
+class ReceiverTypeError(CrawloException, TypeError):
+    """Receiver type error. Raised when an event receiver has an unexpected type."""
     pass
 
 
-class DropItem(Exception):
-    pass
+# ============= Export all exceptions =============
+__all__ = [
+    # Base
+    'CrawloException',
+
+    # Spider-related
+    'SpiderException',
+    'SpiderTypeError',
+    'SpiderCreationError',
+
+    # Component initialization
+    'ComponentInitException',
+    'MiddlewareInitError',
+    'PipelineInitError',
+    'ExtensionInitError',
+
+    # Data processing
+    'DataException',
+    'ItemInitError',
+    'ItemAttributeError',
+    'ItemValidationError',
+    'ItemDiscard',
+
+    # Request/response
+    'RequestException',
+    'RequestMethodError',
+    'IgnoreRequestError',
+    'DecodeError',
+
+    # Output
+    'OutputException',
+    'OutputError',
+    'InvalidOutputError',
+
+    # Configuration
+    'ConfigException',
+    'NotConfigured',
+    'NotConfiguredError',
+
+    # Types
+    'TransformTypeError',
+    'ReceiverTypeError',
+]
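
With the flat exception classes regrouped under CrawloException, calling code can catch a whole category instead of listing individual errors. Below is a hedged sketch of how that hierarchy might be used; the validate_price helper is illustrative only and not part of the framework.

from crawlo.exceptions import CrawloException, DataException, ItemValidationError


def validate_price(item):
    # Illustrative helper, not a crawlo API.
    if item.get("price", 0) < 0:
        raise ItemValidationError(f"negative price in {item!r}")
    return item


try:
    validate_price({"price": -1})
except DataException as exc:
    # Catches ItemInitError, ItemAttributeError, ItemValidationError, ItemDiscard, ...
    print(f"data problem: {exc}")
except CrawloException as exc:
    # Any other framework exception (request, output, config, ...)
    print(f"framework error: {exc}")
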
crawlo/extension/__init__.py CHANGED
@@ -3,18 +3,18 @@
 from typing import List, Any
 from pprint import pformat
 
-from crawlo.utils.log import get_logger
+from crawlo.logging import get_logger
 from crawlo.utils.misc import load_object
 from crawlo.exceptions import ExtensionInitError
 
 
-class ExtensionManager(object):
+class ExtensionManager:
 
     def __init__(self, crawler: Any):
         self.crawler = crawler
         self.extensions: List = []
         extensions = self.crawler.settings.get_list('EXTENSIONS')
-        self.logger = get_logger(self.__class__.__name__, crawler.settings.get('LOG_LEVEL'))
+        self.logger = get_logger(self.__class__.__name__)
         self._add_extensions(extensions)
         self._subscribe_extensions()
 
@@ -41,24 +41,25 @@ class ExtensionManager(object):
 
     def _subscribe_extensions(self) -> None:
         """Subscribe extension methods to the corresponding events"""
+        from crawlo.event import CrawlerEvent
+
         for extension in self.extensions:
             # Subscribe the spider_closed method
             if hasattr(extension, 'spider_closed'):
-                self.crawler.subscriber.subscribe(extension.spider_closed, event="spider_closed")
+                self.crawler.subscriber.subscribe(extension.spider_closed, event=CrawlerEvent.SPIDER_CLOSED)
 
             # Subscribe the item_successful method
             if hasattr(extension, 'item_successful'):
-                self.crawler.subscriber.subscribe(extension.item_successful, event="item_successful")
+                self.crawler.subscriber.subscribe(extension.item_successful, event=CrawlerEvent.ITEM_SUCCESSFUL)
 
             # Subscribe the item_discard method
             if hasattr(extension, 'item_discard'):
-                self.crawler.subscriber.subscribe(extension.item_discard, event="item_discard")
+                self.crawler.subscriber.subscribe(extension.item_discard, event=CrawlerEvent.ITEM_DISCARD)
 
             # Subscribe the response_received method
             if hasattr(extension, 'response_received'):
-                # Fix: correct the event name from "request_received" to "response_received"
-                self.crawler.subscriber.subscribe(extension.response_received, event="response_received")
+                self.crawler.subscriber.subscribe(extension.response_received, event=CrawlerEvent.RESPONSE_RECEIVED)
 
             # Subscribe the request_scheduled method
             if hasattr(extension, 'request_scheduled'):
-                self.crawler.subscriber.subscribe(extension.request_scheduled, event="request_scheduled")
+                self.crawler.subscriber.subscribe(extension.request_scheduled, event=CrawlerEvent.REQUEST_SCHEDULED)
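
ExtensionManager now wires extension methods to events through the CrawlerEvent enum rather than raw strings. Below is a sketch of a custom extension following the create_instance/subscribe pattern visible in the extension diffs that follow; the counter attribute and the *args/**kwargs handler signatures are assumptions, since the exact callback arguments are not shown in this diff.

from crawlo.event import CrawlerEvent
from crawlo.logging import get_logger


class ResponseCounterExtension:  # hypothetical extension, not shipped with crawlo
    def __init__(self, crawler):
        self.logger = get_logger(self.__class__.__name__)
        self.count = 0

    @classmethod
    def create_instance(cls, crawler):
        o = cls(crawler)
        crawler.subscriber.subscribe(o.response_received, event=CrawlerEvent.RESPONSE_RECEIVED)
        crawler.subscriber.subscribe(o.spider_closed, event=CrawlerEvent.SPIDER_CLOSED)
        return o

    async def response_received(self, *args, **kwargs):
        self.count += 1

    async def spider_closed(self, *args, **kwargs):
        self.logger.info(f"responses received: {self.count}")
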
crawlo/extension/health_check.py CHANGED
@@ -4,8 +4,8 @@ import asyncio
 from datetime import datetime
 from typing import Any, Optional, Dict
 
-from crawlo.event import spider_opened, spider_closed, response_received, request_scheduled
-from crawlo.utils.log import get_logger
+from crawlo.event import CrawlerEvent
+from crawlo.logging import get_logger
 
 
 class HealthCheckExtension:
@@ -16,7 +16,7 @@ class HealthCheckExtension:
 
     def __init__(self, crawler: Any):
         self.settings = crawler.settings
-        self.logger = get_logger(self.__class__.__name__, crawler.settings.get('LOG_LEVEL'))
+        self.logger = get_logger(self.__class__.__name__)
 
         # Read configuration parameters
         self.enabled = self.settings.get_bool('HEALTH_CHECK_ENABLED', True)
@@ -43,10 +43,10 @@ class HealthCheckExtension:
 
         o = cls(crawler)
         if o.enabled:
-            crawler.subscriber.subscribe(o.spider_opened, event=spider_opened)
-            crawler.subscriber.subscribe(o.spider_closed, event=spider_closed)
-            crawler.subscriber.subscribe(o.response_received, event=response_received)
-            crawler.subscriber.subscribe(o.request_scheduled, event=request_scheduled)
+            crawler.subscriber.subscribe(o.spider_opened, event=CrawlerEvent.SPIDER_OPENED)
+            crawler.subscriber.subscribe(o.spider_closed, event=CrawlerEvent.SPIDER_CLOSED)
+            crawler.subscriber.subscribe(o.response_received, event=CrawlerEvent.RESPONSE_RECEIVED)
+            crawler.subscriber.subscribe(o.request_scheduled, event=CrawlerEvent.REQUEST_SCHEDULED)
         return o
 
     async def spider_opened(self) -> None:
crawlo/extension/log_interval.py CHANGED
@@ -3,11 +3,11 @@
 import asyncio
 from typing import Any, Optional
 
-from crawlo.utils.log import get_logger
-from crawlo.event import spider_opened, spider_closed
+from crawlo.logging import get_logger
+from crawlo.event import CrawlerEvent
 
 
-class LogIntervalExtension(object):
+class LogIntervalExtension:
 
     def __init__(self, crawler: Any):
         self.task: Optional[asyncio.Task] = None
@@ -30,14 +30,14 @@ class LogIntervalExtension(object):
         else:
             self.interval_display = str(self.interval)
 
-        self.logger = get_logger(self.__class__.__name__, crawler.settings.get('LOG_LEVEL'))
+        self.logger = get_logger(self.__class__.__name__)
         self.logger.info(f"LogIntervalExtension initialized with interval: {self.seconds} seconds")
 
     @classmethod
     def create_instance(cls, crawler: Any) -> 'LogIntervalExtension':
         o = cls(crawler)
-        crawler.subscriber.subscribe(o.spider_opened, event=spider_opened)
-        crawler.subscriber.subscribe(o.spider_closed, event=spider_closed)
+        crawler.subscriber.subscribe(o.spider_opened, event=CrawlerEvent.SPIDER_OPENED)
+        crawler.subscriber.subscribe(o.spider_closed, event=CrawlerEvent.SPIDER_CLOSED)
         return o
 
     async def spider_opened(self) -> None:
crawlo/extension/log_stats.py CHANGED
@@ -7,7 +7,7 @@
 import asyncio
 from typing import Any
 
-from crawlo.utils.log import get_logger
+from crawlo.logging import get_logger
 from crawlo.utils import now, time_diff
 
 
@@ -18,7 +18,7 @@ class LogStats:
 
     def __init__(self, crawler):
         self.crawler = crawler
-        self.logger = get_logger(self.__class__.__name__, crawler.settings.get('LOG_LEVEL'))
+        self.logger = get_logger(self.__class__.__name__)
         self._stats = crawler.stats
         self._stats['start_time'] = now(fmt='%Y-%m-%d %H:%M:%S')
 
crawlo/extension/logging_extension.py CHANGED
@@ -1,16 +1,9 @@
 from typing import Any
 from crawlo.exceptions import NotConfigured
-from crawlo.utils.log import get_logger
+from crawlo.logging import get_logger
 
-# Acquire the logger lazily so it is created after the logging system is configured
-_logger = None
-
-def logger():
-    """Lazily create the logger instance, after the logging system has been configured"""
-    global _logger
-    if _logger is None:
-        _logger = get_logger(__name__)
-    return _logger
+# Get a logger instance
+_logger = get_logger(__name__)
 
 
 class CustomLoggerExtension:
@@ -50,9 +43,8 @@ class CustomLoggerExtension:
         return cls(crawler.settings)
 
     def spider_opened(self, spider: Any) -> None:
-        logger_instance = logger()
         try:
-            logger_instance.info(
+            _logger.info(
                 f"CustomLoggerExtension: Logging initialized. "
                 f"LOG_FILE={self.settings.get('LOG_FILE')}, "
                 f"LOG_LEVEL={self.settings.get('LOG_LEVEL')}"