crawlo 1.4.6__py3-none-any.whl → 1.4.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.

Files changed (162)
  1. crawlo/__init__.py +2 -1
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +2 -2
  4. crawlo/commands/check.py +1 -1
  5. crawlo/commands/help.py +5 -3
  6. crawlo/commands/list.py +1 -1
  7. crawlo/commands/run.py +49 -11
  8. crawlo/commands/stats.py +1 -1
  9. crawlo/config.py +12 -4
  10. crawlo/config_validator.py +1 -1
  11. crawlo/core/engine.py +20 -7
  12. crawlo/core/processor.py +1 -1
  13. crawlo/core/scheduler.py +4 -5
  14. crawlo/crawler.py +51 -10
  15. crawlo/downloader/__init__.py +7 -3
  16. crawlo/downloader/aiohttp_downloader.py +18 -18
  17. crawlo/downloader/cffi_downloader.py +5 -2
  18. crawlo/downloader/httpx_downloader.py +9 -3
  19. crawlo/downloader/hybrid_downloader.py +2 -2
  20. crawlo/downloader/playwright_downloader.py +38 -15
  21. crawlo/downloader/selenium_downloader.py +16 -2
  22. crawlo/event.py +42 -8
  23. crawlo/exceptions.py +157 -24
  24. crawlo/extension/__init__.py +10 -9
  25. crawlo/extension/health_check.py +7 -7
  26. crawlo/extension/log_interval.py +6 -6
  27. crawlo/extension/log_stats.py +2 -2
  28. crawlo/extension/logging_extension.py +4 -12
  29. crawlo/extension/memory_monitor.py +5 -5
  30. crawlo/extension/performance_profiler.py +5 -5
  31. crawlo/extension/request_recorder.py +6 -6
  32. crawlo/factories/base.py +1 -1
  33. crawlo/factories/crawler.py +61 -60
  34. crawlo/factories/utils.py +135 -0
  35. crawlo/filters/__init__.py +19 -2
  36. crawlo/filters/aioredis_filter.py +133 -49
  37. crawlo/filters/memory_filter.py +6 -21
  38. crawlo/framework.py +22 -8
  39. crawlo/initialization/built_in.py +24 -67
  40. crawlo/initialization/core.py +65 -19
  41. crawlo/initialization/phases.py +83 -2
  42. crawlo/initialization/registry.py +5 -7
  43. crawlo/initialization/utils.py +49 -0
  44. crawlo/logging/__init__.py +6 -10
  45. crawlo/logging/config.py +106 -22
  46. crawlo/logging/factory.py +12 -8
  47. crawlo/logging/manager.py +19 -27
  48. crawlo/middleware/__init__.py +72 -9
  49. crawlo/middleware/default_header.py +2 -2
  50. crawlo/middleware/download_delay.py +2 -2
  51. crawlo/middleware/middleware_manager.py +6 -6
  52. crawlo/middleware/offsite.py +2 -2
  53. crawlo/middleware/proxy.py +2 -2
  54. crawlo/middleware/request_ignore.py +4 -4
  55. crawlo/middleware/response_code.py +2 -2
  56. crawlo/middleware/response_filter.py +2 -2
  57. crawlo/middleware/retry.py +1 -1
  58. crawlo/mode_manager.py +38 -4
  59. crawlo/network/request.py +54 -26
  60. crawlo/network/response.py +69 -135
  61. crawlo/pipelines/__init__.py +40 -9
  62. crawlo/pipelines/base_pipeline.py +452 -0
  63. crawlo/pipelines/bloom_dedup_pipeline.py +4 -5
  64. crawlo/pipelines/console_pipeline.py +2 -2
  65. crawlo/pipelines/csv_pipeline.py +4 -4
  66. crawlo/pipelines/database_dedup_pipeline.py +4 -5
  67. crawlo/pipelines/json_pipeline.py +4 -4
  68. crawlo/pipelines/memory_dedup_pipeline.py +4 -5
  69. crawlo/pipelines/mongo_pipeline.py +23 -14
  70. crawlo/pipelines/mysql_pipeline.py +31 -39
  71. crawlo/pipelines/pipeline_manager.py +8 -8
  72. crawlo/pipelines/redis_dedup_pipeline.py +13 -14
  73. crawlo/project.py +1 -1
  74. crawlo/queue/__init__.py +10 -0
  75. crawlo/queue/queue_manager.py +79 -13
  76. crawlo/queue/redis_priority_queue.py +196 -47
  77. crawlo/settings/default_settings.py +16 -6
  78. crawlo/spider/__init__.py +6 -5
  79. crawlo/stats_collector.py +2 -2
  80. crawlo/task_manager.py +1 -1
  81. crawlo/templates/crawlo.cfg.tmpl +3 -3
  82. crawlo/templates/project/__init__.py.tmpl +1 -3
  83. crawlo/templates/project/items.py.tmpl +2 -6
  84. crawlo/templates/project/middlewares.py.tmpl +1 -1
  85. crawlo/templates/project/pipelines.py.tmpl +1 -2
  86. crawlo/templates/project/settings.py.tmpl +12 -10
  87. crawlo/templates/project/settings_distributed.py.tmpl +14 -13
  88. crawlo/templates/project/settings_gentle.py.tmpl +21 -23
  89. crawlo/templates/project/settings_high_performance.py.tmpl +21 -23
  90. crawlo/templates/project/settings_minimal.py.tmpl +10 -8
  91. crawlo/templates/project/settings_simple.py.tmpl +21 -23
  92. crawlo/templates/run.py.tmpl +1 -1
  93. crawlo/templates/spider/spider.py.tmpl +4 -12
  94. crawlo/templates/spiders_init.py.tmpl +3 -8
  95. crawlo/tools/__init__.py +0 -103
  96. crawlo/tools/scenario_adapter.py +1 -1
  97. crawlo/utils/__init__.py +25 -1
  98. crawlo/utils/batch_processor.py +23 -6
  99. crawlo/utils/config_manager.py +442 -0
  100. crawlo/utils/controlled_spider_mixin.py +1 -1
  101. crawlo/utils/db_helper.py +1 -1
  102. crawlo/utils/encoding_helper.py +190 -0
  103. crawlo/utils/error_handler.py +2 -2
  104. crawlo/utils/large_scale_helper.py +1 -1
  105. crawlo/utils/leak_detector.py +335 -0
  106. crawlo/utils/mongo_connection_pool.py +157 -0
  107. crawlo/utils/mysql_connection_pool.py +197 -0
  108. crawlo/utils/performance_monitor.py +1 -1
  109. crawlo/utils/redis_checker.py +91 -0
  110. crawlo/utils/redis_connection_pool.py +260 -70
  111. crawlo/utils/redis_key_validator.py +1 -1
  112. crawlo/utils/request.py +24 -2
  113. crawlo/utils/request_serializer.py +1 -1
  114. crawlo/utils/resource_manager.py +337 -0
  115. crawlo/utils/response_helper.py +113 -0
  116. crawlo/utils/selector_helper.py +3 -2
  117. crawlo/utils/singleton.py +70 -0
  118. crawlo/utils/spider_loader.py +1 -1
  119. crawlo/utils/text_helper.py +1 -1
  120. crawlo-1.4.8.dist-info/METADATA +831 -0
  121. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/RECORD +131 -145
  122. tests/advanced_tools_example.py +10 -68
  123. tests/distributed_dedup_test.py +467 -0
  124. tests/monitor_redis_dedup.sh +72 -0
  125. tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
  126. tests/simple_cli_test.py +55 -0
  127. tests/test_cli_arguments.py +119 -0
  128. tests/test_dedup_fix.py +10 -10
  129. crawlo/logging/async_handler.py +0 -181
  130. crawlo/logging/monitor.py +0 -153
  131. crawlo/logging/sampler.py +0 -167
  132. crawlo/tools/authenticated_proxy.py +0 -241
  133. crawlo/tools/data_formatter.py +0 -226
  134. crawlo/tools/data_validator.py +0 -181
  135. crawlo/tools/encoding_converter.py +0 -127
  136. crawlo/tools/network_diagnostic.py +0 -365
  137. crawlo/tools/request_tools.py +0 -83
  138. crawlo/tools/retry_mechanism.py +0 -224
  139. crawlo/utils/env_config.py +0 -143
  140. crawlo/utils/large_scale_config.py +0 -287
  141. crawlo/utils/log.py +0 -80
  142. crawlo/utils/system.py +0 -11
  143. crawlo/utils/tools.py +0 -5
  144. crawlo/utils/url.py +0 -40
  145. crawlo-1.4.6.dist-info/METADATA +0 -329
  146. tests/env_config_example.py +0 -134
  147. tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
  148. tests/test_authenticated_proxy.py +0 -142
  149. tests/test_comprehensive.py +0 -147
  150. tests/test_dynamic_downloaders_proxy.py +0 -125
  151. tests/test_dynamic_proxy.py +0 -93
  152. tests/test_dynamic_proxy_config.py +0 -147
  153. tests/test_dynamic_proxy_real.py +0 -110
  154. tests/test_env_config.py +0 -122
  155. tests/test_framework_env_usage.py +0 -104
  156. tests/test_large_scale_config.py +0 -113
  157. tests/test_proxy_api.py +0 -265
  158. tests/test_real_scenario_proxy.py +0 -196
  159. tests/tools_example.py +0 -261
  160. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
  161. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
  162. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
crawlo/middleware/__init__.py CHANGED
@@ -1,24 +1,87 @@
 #!/usr/bin/python
 # -*- coding:UTF-8 -*-
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Optional, Union
 
 if TYPE_CHECKING:
     from crawlo import Request, Response
 
 
-class BaseMiddleware(object):
-    def process_request(self, request, spider) -> 'None | Request | Response':
-        # request pre-processing
+class BaseMiddleware:
+    """Middleware base class.
+
+    Defines the standard middleware interface; all custom middlewares should inherit from this class.
+
+    Middleware processing flow:
+    1. process_request: runs before a request is sent
+    2. process_response: runs after a response is received
+    3. process_exception: runs when an exception occurs
+    """
+
+    def process_request(
+        self,
+        request: 'Request',
+        spider
+    ) -> Optional[Union['Request', 'Response']]:
+        """Process a request.
+
+        Args:
+            request: the request to process
+            spider: the current spider instance
+
+        Returns:
+            None: continue processing
+            Request: replace the original request
+            Response: skip the download and return this response directly
+        """
         pass
 
-    def process_response(self, request, response, spider) -> 'Request | Response':
-        # response pre-processing
-        pass
+    def process_response(
+        self,
+        request: 'Request',
+        response: 'Response',
+        spider
+    ) -> Union['Request', 'Response']:
+        """Process a response.
+
+        Args:
+            request: the original request object
+            response: the received response object
+            spider: the current spider instance
+
+        Returns:
+            Request: re-schedule the request
+            Response: return the (possibly modified) response
+        """
+        return response
 
-    def process_exception(self, request, exp, spider) -> 'None | Request | Response':
-        # exception pre-processing
+    def process_exception(
+        self,
+        request: 'Request',
+        exp: Exception,
+        spider
+    ) -> Optional[Union['Request', 'Response']]:
+        """Process an exception.
+
+        Args:
+            request: the request that raised the exception
+            exp: the caught exception object
+            spider: the current spider instance
+
+        Returns:
+            None: keep propagating the exception
+            Request: re-schedule the request
+            Response: return a response
+        """
         pass
 
     @classmethod
     def create_instance(cls, crawler):
+        """Create a middleware instance.
+
+        Args:
+            crawler: the Crawler instance, which provides settings and other configuration
+
+        Returns:
+            A middleware instance
+        """
         return cls()
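
The expanded docstrings spell out the contract a custom middleware must satisfy. As an illustration only (the class name, the USER_AGENT setting, and the header value are hypothetical; the hook signatures and the create_instance pattern come from the diff above), a minimal subclass might look like this:

from crawlo.middleware import BaseMiddleware


class UserAgentMiddleware(BaseMiddleware):
    """Hypothetical middleware that stamps a User-Agent header on every request."""

    def __init__(self, user_agent: str):
        self.user_agent = user_agent

    @classmethod
    def create_instance(cls, crawler):
        # create_instance is the hook for reading configuration from crawler.settings.
        return cls(crawler.settings.get('USER_AGENT', 'crawlo'))

    def process_request(self, request, spider):
        # Returning None lets the request continue through the middleware chain.
        request.headers.setdefault('User-Agent', self.user_agent)
        return None

    def process_response(self, request, response, spider):
        # Must return either a Request (to retry) or a Response.
        return response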
crawlo/middleware/default_header.py CHANGED
@@ -6,7 +6,7 @@ DefaultHeaderMiddleware middleware
 """
 
 import random
-from crawlo.utils.log import get_logger
+from crawlo.logging import get_logger
 from crawlo.exceptions import NotConfiguredError
 # Import User-Agent data
 from crawlo.data.user_agents import get_user_agents
@@ -22,7 +22,7 @@ class DefaultHeaderMiddleware(object):
         """
         Initialize the middleware
         """
-        self.logger = get_logger(self.__class__.__name__, log_level)
+        self.logger = get_logger(self.__class__.__name__)
 
         # Read the default request-header configuration
         self.headers = settings.get_dict('DEFAULT_REQUEST_HEADERS', {})
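
The same two-line change recurs throughout the middlewares below: loggers now come from crawlo.logging and are created without an explicit level argument. A hedged sketch of the new call site, assuming the level is resolved from the framework's logging configuration rather than passed per logger:

from crawlo.logging import get_logger


class SomeComponent:
    def __init__(self):
        # No log_level argument any more; the level is presumably taken from
        # the global logging configuration (e.g. LOG_LEVEL in settings).
        self.logger = get_logger(self.__class__.__name__)
        self.logger.debug("component initialised")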
crawlo/middleware/download_delay.py CHANGED
@@ -7,7 +7,7 @@ DownloadDelayMiddleware middleware
 
 from asyncio import sleep
 from random import uniform
-from crawlo.utils.log import get_logger
+from crawlo.logging import get_logger
 from crawlo.exceptions import NotConfiguredError
 
 
@@ -51,7 +51,7 @@ class DownloadDelayMiddleware(object):
             # Fall back to defaults if the configuration is incomplete
             self.floor, self.upper = 0.5, 1.5
 
-        self.logger = get_logger(self.__class__.__name__, log_level)
+        self.logger = get_logger(self.__class__.__name__)
         self.stats = stats
 
     @classmethod
crawlo/middleware/middleware_manager.py CHANGED
@@ -14,11 +14,11 @@ else:
     # Import the real classes for isinstance checks
     from crawlo.network.request import Request
     from crawlo.network.response import Response
-from crawlo.utils.log import get_logger
+from crawlo.logging import get_logger
 from crawlo.utils.misc import load_object
 from crawlo.middleware import BaseMiddleware
 from crawlo.project import common_call
-from crawlo.event import ignore_request, response_received
+from crawlo.event import CrawlerEvent
 from crawlo.exceptions import MiddlewareInitError, InvalidOutputError, RequestMethodError, IgnoreRequestError, \
     NotConfiguredError
 
@@ -27,7 +27,7 @@ class MiddlewareManager:
 
     def __init__(self, crawler):
        self.crawler = crawler
-        self.logger = get_logger(self.__class__.__name__, crawler.settings.get('LOG_LEVEL'))
+        self.logger = get_logger(self.__class__.__name__)
         self.middlewares: List = []
         self.methods: Dict[str, List[MethodType]] = defaultdict(list)
         middlewares = self.crawler.settings.get_list('MIDDLEWARES')
@@ -54,7 +54,7 @@ class MiddlewareManager:
         try:
             response = await common_call(method, request, response, self.crawler.spider)
         except IgnoreRequestError as exp:
-            create_task(self.crawler.subscriber.notify(ignore_request, exp, request, self.crawler.spider))
+            create_task(self.crawler.subscriber.notify(CrawlerEvent.IGNORE_REQUEST, exp, request, self.crawler.spider))
         if isinstance(response, Request):
             return response
         if isinstance(response, Response):
@@ -86,13 +86,13 @@ class MiddlewareManager:
         except KeyError:
             raise RequestMethodError(f"{request.method.lower()} is not supported")
         except IgnoreRequestError as exp:
-            create_task(self.crawler.subscriber.notify(ignore_request, exp, request, self.crawler.spider))
+            create_task(self.crawler.subscriber.notify(CrawlerEvent.IGNORE_REQUEST, exp, request, self.crawler.spider))
             response = await self._process_exception(request, exp)
         except Exception as exp:
             self._stats.inc_value(f'download_error/{exp.__class__.__name__}')
             response = await self._process_exception(request, exp)
         else:
-            create_task(self.crawler.subscriber.notify(response_received, response, self.crawler.spider))
+            create_task(self.crawler.subscriber.notify(CrawlerEvent.RESPONSE_RECEIVED, response, self.crawler.spider))
             self._stats.inc_value('response_received_count')
         if isinstance(response, Response):
             response = await self._process_response(request, response)
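
Both notify() call sites now pass members of the CrawlerEvent enum instead of the former module-level names ignore_request and response_received. A hedged sketch of hooking into one of these events, using only the two members visible in this diff and the subscribe()/notify() signatures shown here and in RequestIgnoreMiddleware further down (the ResponseLogger class itself is hypothetical):

from crawlo.event import CrawlerEvent


class ResponseLogger:
    @classmethod
    def create_instance(cls, crawler):
        o = cls()
        # Same subscribe(handler, event=...) call used by RequestIgnoreMiddleware below.
        crawler.subscriber.subscribe(o.on_response, event=CrawlerEvent.RESPONSE_RECEIVED)
        return o

    async def on_response(self, response, spider):
        # notify(CrawlerEvent.RESPONSE_RECEIVED, response, spider) delivers these two arguments.
        print(f"received {response} for {spider}")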
crawlo/middleware/offsite.py CHANGED
@@ -7,7 +7,7 @@ OffsiteMiddleware middleware
 import re
 from urllib.parse import urlparse
 
-from crawlo.utils.log import get_logger
+from crawlo.logging import get_logger
 from crawlo.exceptions import IgnoreRequestError
 
 
@@ -18,7 +18,7 @@ class OffsiteMiddleware:
     """
 
     def __init__(self, stats, log_level, allowed_domains=None):
-        self.logger = get_logger(self.__class__.__name__, log_level)
+        self.logger = get_logger(self.__class__.__name__)
         self.stats = stats
         self.allowed_domains = allowed_domains or []
 
crawlo/middleware/proxy.py CHANGED
@@ -9,14 +9,14 @@ from urllib.parse import urlparse
 from typing import Optional, List
 
 from crawlo.network import Request, Response
-from crawlo.utils.log import get_logger
+from crawlo.logging import get_logger
 
 
 class ProxyMiddleware:
     """Generic proxy middleware"""
 
     def __init__(self, settings, log_level):
-        self.logger = get_logger(self.__class__.__name__, log_level)
+        self.logger = get_logger(self.__class__.__name__)
 
         # Read the proxy list and proxy API URL
         self.proxies: List[str] = settings.get("PROXY_LIST", [])
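
ProxyMiddleware reads its proxies from the PROXY_LIST setting shown above. A hedged configuration sketch; only the setting name comes from the diff, and the entry format (plain proxy URLs, optionally with credentials) is an assumption:

# settings.py
PROXY_LIST = [
    "http://127.0.0.1:8080",
    "http://user:password@proxy.example.com:3128",
]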
crawlo/middleware/request_ignore.py CHANGED
@@ -4,9 +4,9 @@
 RequestIgnoreMiddleware middleware
 Handles and records ignored requests
 """
-from crawlo.utils.log import get_logger
+from crawlo.logging import get_logger
 from crawlo.exceptions import IgnoreRequestError
-from crawlo.event import ignore_request
+from crawlo.event import CrawlerEvent
 
 
 class RequestIgnoreMiddleware(object):
@@ -23,7 +23,7 @@ class RequestIgnoreMiddleware(object):
             stats: statistics collector
             log_level: log level
         """
-        self.logger = get_logger(self.__class__.__name__, log_level)
+        self.logger = get_logger(self.__class__.__name__)
         self.stats = stats
 
     @classmethod
@@ -38,7 +38,7 @@ class RequestIgnoreMiddleware(object):
            RequestIgnoreMiddleware: the middleware instance
        """
        o = cls(stats=crawler.stats, log_level=crawler.settings.get('LOG_LEVEL'))
-        crawler.subscriber.subscribe(o.request_ignore, event=ignore_request)
+        crawler.subscriber.subscribe(o.request_ignore, event=CrawlerEvent.IGNORE_REQUEST)
        return o
 
    async def request_ignore(self, exc, request, _spider):
crawlo/middleware/response_code.py CHANGED
@@ -4,7 +4,7 @@
 ResponseCodeMiddleware middleware
 Handles HTTP response status codes, records statistics, and supports special handling of specific codes
 """
-from crawlo.utils.log import get_logger
+from crawlo.logging import get_logger
 
 
 class ResponseCodeMiddleware(object):
@@ -27,7 +27,7 @@ class ResponseCodeMiddleware(object):
             stats: statistics collector
             log_level: log level
         """
-        self.logger = get_logger(self.__class__.__name__, log_level)
+        self.logger = get_logger(self.__class__.__name__)
         self.stats = stats
 
     @classmethod
crawlo/middleware/response_filter.py CHANGED
@@ -4,7 +4,7 @@
 ResponseFilterMiddleware middleware
 Filters out HTTP responses that do not meet requirements; the set of allowed status codes is configurable
 """
-from crawlo.utils.log import get_logger
+from crawlo.logging import get_logger
 from crawlo.exceptions import IgnoreRequestError
 
 
@@ -47,7 +47,7 @@ class ResponseFilterMiddleware:
             except (ValueError, TypeError):
                 pass  # Ignore invalid status codes
 
-        self.logger = get_logger(self.__class__.__name__, log_level)
+        self.logger = get_logger(self.__class__.__name__)
 
     @classmethod
     def create_instance(cls, crawler):
crawlo/middleware/retry.py CHANGED
@@ -43,7 +43,7 @@ except ImportError:
     class ClientResponseError(Exception):
         pass
 
-from crawlo.utils.log import get_logger
+from crawlo.logging import get_logger
 from crawlo.stats_collector import StatsCollector
 
 _retry_exceptions = [
crawlo/mode_manager.py CHANGED
@@ -34,7 +34,7 @@ class ModeManager:
         """Lazily obtain the logger instance"""
         if self._logger is None:
             try:
-                from crawlo.utils.log import get_logger
+                from crawlo.logging import get_logger
                 self._logger = get_logger(__name__)
             except Exception:
                 # If the logging system has not been initialized yet, return None
@@ -248,6 +248,40 @@ def auto_mode(
 
 # Environment-variable support
 def from_env(default_mode: str = 'standalone') -> Dict[str, Any]:
-    """Create configuration from environment variables"""
-    # Direct use of os.getenv() was removed; configuration is required to go through settings
-    raise RuntimeError("Environment-variable configuration has been removed; configure these parameters in settings")
+    """Create configuration from environment variables.
+
+    Supported environment variables:
+    - CRAWLO_MODE: run mode (standalone/distributed/auto)
+    - CRAWLO_REDIS_HOST: Redis host address
+    - CRAWLO_REDIS_PORT: Redis port
+    - CRAWLO_REDIS_PASSWORD: Redis password
+    - CRAWLO_REDIS_DB: Redis database number
+    - CRAWLO_PROJECT_NAME: project name
+    - CRAWLO_CONCURRENCY: concurrency level
+
+    Args:
+        default_mode: default run mode (used when the environment variable is not set)
+
+    Returns:
+        A configuration dict
+    """
+    mode = os.getenv('CRAWLO_MODE', default_mode).lower()
+
+    kwargs = {}
+
+    # Settings specific to distributed mode
+    if mode == 'distributed':
+        kwargs['redis_host'] = os.getenv('CRAWLO_REDIS_HOST', '127.0.0.1')
+        kwargs['redis_port'] = int(os.getenv('CRAWLO_REDIS_PORT', '6379'))
+        if password := os.getenv('CRAWLO_REDIS_PASSWORD'):
+            kwargs['redis_password'] = password
+        kwargs['redis_db'] = int(os.getenv('CRAWLO_REDIS_DB', '0'))
+
+    # Common settings
+    if project_name := os.getenv('CRAWLO_PROJECT_NAME'):
+        kwargs['project_name'] = project_name
+
+    if concurrency := os.getenv('CRAWLO_CONCURRENCY'):
+        kwargs['CONCURRENCY'] = int(concurrency)
+
+    return ModeManager().resolve_mode_settings(mode, **kwargs)
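
A hedged sketch of driving the restored from_env() helper from the environment; the variable names come from the docstring above, while how the returned dict is fed into a project's settings is left as an assumption:

import os

from crawlo.mode_manager import from_env

# Switch the same project to distributed mode without editing settings.py.
os.environ['CRAWLO_MODE'] = 'distributed'
os.environ['CRAWLO_REDIS_HOST'] = '10.0.0.5'
os.environ['CRAWLO_CONCURRENCY'] = '32'

overrides = from_env(default_mode='standalone')
print(overrides)  # dict built by ModeManager().resolve_mode_settings(...)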
crawlo/network/request.py CHANGED
@@ -12,42 +12,32 @@ HTTP Request wrapper module
 """
 import json
 from copy import deepcopy
-from urllib.parse import urlencode, urlparse, urlunparse, parse_qsl
-from w3lib.url import safe_url_string
+from enum import IntEnum
+from urllib.parse import urldefrag, urlencode, urlparse, urlunparse, parse_qsl
+from w3lib.url import safe_url_string, add_or_replace_parameter
 from typing import Dict, Optional, Callable, Union, Any, TypeVar, List
 
-from crawlo.utils.url import escape_ajax
+
 
 
 _Request = TypeVar("_Request", bound="Request")
 
 
-class RequestPriority:
-    """Request priority constants and helpers"""
+class RequestPriority(IntEnum):
+    """
+    Request priority enum.
+
+    Smaller values mean higher priority. IntEnum ensures members can be used directly as integers.
+
+    Examples:
+        >>> request = Request(url, priority=RequestPriority.HIGH)
+        >>> request.priority = RequestPriority.URGENT
+    """
     URGENT = -200      # urgent tasks
     HIGH = -100        # high priority
     NORMAL = 0         # normal priority (default)
     LOW = 100          # low priority
     BACKGROUND = 200   # background tasks
-
-    @classmethod
-    def get_all_priorities(cls) -> Dict[str, int]:
-        """Return all priority constants"""
-        return {
-            'URGENT': cls.URGENT,
-            'HIGH': cls.HIGH,
-            'NORMAL': cls.NORMAL,
-            'LOW': cls.LOW,
-            'BACKGROUND': cls.BACKGROUND
-        }
-
-    @classmethod
-    def from_string(cls, priority_str: str) -> int:
-        """Look up a priority value by name"""
-        priorities = cls.get_all_priorities()
-        if priority_str.upper() not in priorities:
-            raise ValueError(f"Unsupported priority: {priority_str}, supported: {list(priorities.keys())}")
-        return priorities[priority_str.upper()]
 
 
 class Request:
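
Because RequestPriority is now an IntEnum, the removed helpers get_all_priorities() and from_string() are covered by the standard enum API. A short sketch of the equivalents (standard-library behaviour, not crawlo-specific code):

from crawlo.network.request import RequestPriority

int(RequestPriority.HIGH)                     # -100; members compare and behave as ints
RequestPriority['URGENT']                     # name lookup replaces RequestPriority.from_string('URGENT')
{p.name: p.value for p in RequestPriority}    # replaces RequestPriority.get_all_priorities()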
@@ -87,6 +77,7 @@ class Request:
         self,
         url: str,
         callback: Optional[Callable] = None,
+        err_back: Optional[Callable] = None,
         method: Optional[str] = 'GET',
         headers: Optional[Dict[str, str]] = None,
         body: Optional[Union[bytes, str, Dict[Any, Any]]] = None,
@@ -114,6 +105,7 @@ class Request:
 
         :param url: request URL (required)
         :param callback: success callback
+        :param err_back: error callback
         :param method: HTTP method, defaults to GET
         :param headers: request headers
         :param body: raw request body (bytes/str); a dict is converted to JSON automatically when json_body/form_data are not used
@@ -134,6 +126,7 @@ class Request:
         :param encoding: character encoding, defaults to utf-8
         """
         self.callback = callback
+        self.err_back = err_back
         self.method = str(method).upper()
         self.headers = headers or {}
         self.cookies = cookies or {}
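
The new err_back parameter is stored on the request alongside callback. How and with what arguments the framework invokes it is not shown in this diff, so the handler signature below is an assumption; only the constructor keyword comes from the code above:

from crawlo import Request

def parse(response):
    ...

def on_error(failure):  # assumed signature; the diff only shows the attribute being stored
    ...

request = Request(url="https://example.com", callback=parse, err_back=on_error)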
@@ -229,7 +222,7 @@ class Request:
         """Safely deepcopy meta: strip logger objects before copying"""
         import logging
 
-        def clean_logger_recursive(obj):
+        def clean_logger_recursive(obj: Any) -> Any:
             """Recursively remove logger objects"""
             if isinstance(obj, logging.Logger):
                 return None
@@ -251,6 +244,9 @@ class Request:
 
         # Clean out loggers first, then deepcopy
         cleaned_meta = clean_logger_recursive(meta)
+        # Make sure a dict is returned
+        if not isinstance(cleaned_meta, dict):
+            return {}
         return deepcopy(cleaned_meta)
 
     def copy(self: _Request) -> _Request:
@@ -377,4 +373,36 @@
 
     def __lt__(self, other: _Request) -> bool:
         """Used for ordering by priority"""
-        return self.priority < other.priority
+        return self.priority < other.priority
+
+def escape_ajax(url: str) -> str:
+    """
+    Convert a URL according to Google's AJAX crawling specification (handles #! hash fragments):
+    https://developers.google.com/webmasters/ajax-crawling/docs/getting-started
+
+    Rules:
+    1. Only URLs containing `#!` are converted (this marks an AJAX-crawlable page)
+    2. `#!key=value` is converted to `?_escaped_fragment_=key%3Dvalue`
+    3. Existing query parameters (if any) are preserved
+
+    Examples:
+    >>> escape_ajax("www.example.com/ajax.html#!key=value")
+    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
+    >>> escape_ajax("www.example.com/ajax.html?k1=v1#!key=value")
+    'www.example.com/ajax.html?k1=v1&_escaped_fragment_=key%3Dvalue'
+    >>> escape_ajax("www.example.com/ajax.html#!")
+    'www.example.com/ajax.html?_escaped_fragment_='
+
+    URLs that are not AJAX-crawlable (no #!) are returned unchanged:
+    >>> escape_ajax("www.example.com/ajax.html#normal")
+    'www.example.com/ajax.html#normal'
+    """
+    # Split the URL into its base part and hash fragment
+    de_frag, frag = urldefrag(url)
+
+    # Only handle hash fragments that start with "!" (per the Google spec)
+    if not frag.startswith("!"):
+        return url  # Return unchanged if the rule does not apply
+
+    # Use the helper to add the `_escaped_fragment_` parameter
+    return add_or_replace_parameter(de_frag, "_escaped_fragment_", frag[1:])