crawlo-1.1.4-py3-none-any.whl → crawlo-1.1.6-py3-none-any.whl

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.


This version of crawlo might be problematic.

Files changed (190)
  1. crawlo/__init__.py +61 -34
  2. crawlo/__version__.py +1 -1
  3. crawlo/cleaners/__init__.py +61 -0
  4. crawlo/cleaners/data_formatter.py +226 -0
  5. crawlo/cleaners/encoding_converter.py +126 -0
  6. crawlo/cleaners/text_cleaner.py +233 -0
  7. crawlo/cli.py +40 -40
  8. crawlo/commands/__init__.py +13 -13
  9. crawlo/commands/check.py +594 -594
  10. crawlo/commands/genspider.py +151 -151
  11. crawlo/commands/list.py +155 -155
  12. crawlo/commands/run.py +292 -285
  13. crawlo/commands/startproject.py +419 -196
  14. crawlo/commands/stats.py +188 -188
  15. crawlo/commands/utils.py +186 -186
  16. crawlo/config.py +312 -279
  17. crawlo/config_validator.py +253 -0
  18. crawlo/core/__init__.py +2 -2
  19. crawlo/core/engine.py +346 -172
  20. crawlo/core/processor.py +40 -40
  21. crawlo/core/scheduler.py +137 -166
  22. crawlo/crawler.py +1027 -1027
  23. crawlo/downloader/__init__.py +266 -242
  24. crawlo/downloader/aiohttp_downloader.py +220 -212
  25. crawlo/downloader/cffi_downloader.py +256 -251
  26. crawlo/downloader/httpx_downloader.py +259 -259
  27. crawlo/downloader/hybrid_downloader.py +214 -0
  28. crawlo/downloader/playwright_downloader.py +403 -0
  29. crawlo/downloader/selenium_downloader.py +473 -0
  30. crawlo/event.py +11 -11
  31. crawlo/exceptions.py +81 -81
  32. crawlo/extension/__init__.py +37 -37
  33. crawlo/extension/health_check.py +141 -141
  34. crawlo/extension/log_interval.py +57 -57
  35. crawlo/extension/log_stats.py +81 -81
  36. crawlo/extension/logging_extension.py +43 -43
  37. crawlo/extension/memory_monitor.py +104 -88
  38. crawlo/extension/performance_profiler.py +133 -117
  39. crawlo/extension/request_recorder.py +107 -107
  40. crawlo/filters/__init__.py +154 -154
  41. crawlo/filters/aioredis_filter.py +281 -242
  42. crawlo/filters/memory_filter.py +269 -269
  43. crawlo/items/__init__.py +23 -23
  44. crawlo/items/base.py +21 -21
  45. crawlo/items/fields.py +53 -53
  46. crawlo/items/items.py +104 -104
  47. crawlo/middleware/__init__.py +21 -21
  48. crawlo/middleware/default_header.py +32 -32
  49. crawlo/middleware/download_delay.py +28 -28
  50. crawlo/middleware/middleware_manager.py +135 -135
  51. crawlo/middleware/proxy.py +272 -248
  52. crawlo/middleware/request_ignore.py +30 -30
  53. crawlo/middleware/response_code.py +18 -18
  54. crawlo/middleware/response_filter.py +26 -26
  55. crawlo/middleware/retry.py +124 -124
  56. crawlo/mode_manager.py +212 -201
  57. crawlo/network/__init__.py +21 -21
  58. crawlo/network/request.py +338 -311
  59. crawlo/network/response.py +360 -271
  60. crawlo/pipelines/__init__.py +21 -21
  61. crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
  62. crawlo/pipelines/console_pipeline.py +39 -39
  63. crawlo/pipelines/csv_pipeline.py +316 -316
  64. crawlo/pipelines/database_dedup_pipeline.py +224 -224
  65. crawlo/pipelines/json_pipeline.py +218 -218
  66. crawlo/pipelines/memory_dedup_pipeline.py +115 -115
  67. crawlo/pipelines/mongo_pipeline.py +131 -131
  68. crawlo/pipelines/mysql_pipeline.py +316 -316
  69. crawlo/pipelines/pipeline_manager.py +61 -56
  70. crawlo/pipelines/redis_dedup_pipeline.py +167 -162
  71. crawlo/project.py +188 -153
  72. crawlo/queue/pqueue.py +37 -37
  73. crawlo/queue/queue_manager.py +334 -307
  74. crawlo/queue/redis_priority_queue.py +299 -209
  75. crawlo/settings/__init__.py +7 -7
  76. crawlo/settings/default_settings.py +219 -278
  77. crawlo/settings/setting_manager.py +123 -100
  78. crawlo/spider/__init__.py +639 -639
  79. crawlo/stats_collector.py +59 -59
  80. crawlo/subscriber.py +130 -130
  81. crawlo/task_manager.py +30 -30
  82. crawlo/templates/crawlo.cfg.tmpl +10 -10
  83. crawlo/templates/project/__init__.py.tmpl +3 -3
  84. crawlo/templates/project/items.py.tmpl +17 -17
  85. crawlo/templates/project/middlewares.py.tmpl +110 -110
  86. crawlo/templates/project/pipelines.py.tmpl +97 -97
  87. crawlo/templates/project/run.py.tmpl +251 -251
  88. crawlo/templates/project/settings.py.tmpl +326 -279
  89. crawlo/templates/project/settings_distributed.py.tmpl +120 -0
  90. crawlo/templates/project/settings_gentle.py.tmpl +95 -0
  91. crawlo/templates/project/settings_high_performance.py.tmpl +152 -0
  92. crawlo/templates/project/settings_simple.py.tmpl +69 -0
  93. crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
  94. crawlo/templates/spider/spider.py.tmpl +141 -141
  95. crawlo/tools/__init__.py +183 -0
  96. crawlo/tools/anti_crawler.py +269 -0
  97. crawlo/tools/authenticated_proxy.py +241 -0
  98. crawlo/tools/data_validator.py +181 -0
  99. crawlo/tools/date_tools.py +36 -0
  100. crawlo/tools/distributed_coordinator.py +387 -0
  101. crawlo/tools/retry_mechanism.py +221 -0
  102. crawlo/tools/scenario_adapter.py +263 -0
  103. crawlo/utils/__init__.py +35 -7
  104. crawlo/utils/batch_processor.py +261 -0
  105. crawlo/utils/controlled_spider_mixin.py +439 -439
  106. crawlo/utils/date_tools.py +290 -233
  107. crawlo/utils/db_helper.py +343 -343
  108. crawlo/utils/enhanced_error_handler.py +360 -0
  109. crawlo/utils/env_config.py +106 -0
  110. crawlo/utils/error_handler.py +126 -0
  111. crawlo/utils/func_tools.py +82 -82
  112. crawlo/utils/large_scale_config.py +286 -286
  113. crawlo/utils/large_scale_helper.py +343 -343
  114. crawlo/utils/log.py +128 -128
  115. crawlo/utils/performance_monitor.py +285 -0
  116. crawlo/utils/queue_helper.py +175 -175
  117. crawlo/utils/redis_connection_pool.py +335 -0
  118. crawlo/utils/redis_key_validator.py +200 -0
  119. crawlo/utils/request.py +267 -267
  120. crawlo/utils/request_serializer.py +219 -219
  121. crawlo/utils/spider_loader.py +62 -62
  122. crawlo/utils/system.py +11 -11
  123. crawlo/utils/tools.py +4 -4
  124. crawlo/utils/url.py +39 -39
  125. {crawlo-1.1.4.dist-info → crawlo-1.1.6.dist-info}/METADATA +401 -403
  126. crawlo-1.1.6.dist-info/RECORD +189 -0
  127. examples/__init__.py +7 -7
  128. tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +82 -0
  129. tests/__init__.py +7 -7
  130. tests/advanced_tools_example.py +276 -0
  131. tests/authenticated_proxy_example.py +237 -0
  132. tests/cleaners_example.py +161 -0
  133. tests/config_validation_demo.py +103 -0
  134. {examples → tests}/controlled_spider_example.py +205 -205
  135. tests/date_tools_example.py +181 -0
  136. tests/dynamic_loading_example.py +524 -0
  137. tests/dynamic_loading_test.py +105 -0
  138. tests/env_config_example.py +134 -0
  139. tests/error_handling_example.py +172 -0
  140. tests/redis_key_validation_demo.py +131 -0
  141. tests/response_improvements_example.py +145 -0
  142. tests/test_advanced_tools.py +149 -0
  143. tests/test_all_redis_key_configs.py +146 -0
  144. tests/test_authenticated_proxy.py +142 -0
  145. tests/test_cleaners.py +55 -0
  146. tests/test_comprehensive.py +147 -0
  147. tests/test_config_validator.py +194 -0
  148. tests/test_date_tools.py +124 -0
  149. tests/test_double_crawlo_fix.py +208 -0
  150. tests/test_double_crawlo_fix_simple.py +125 -0
  151. tests/test_dynamic_downloaders_proxy.py +125 -0
  152. tests/test_dynamic_proxy.py +93 -0
  153. tests/test_dynamic_proxy_config.py +147 -0
  154. tests/test_dynamic_proxy_real.py +110 -0
  155. tests/test_edge_cases.py +304 -0
  156. tests/test_enhanced_error_handler.py +271 -0
  157. tests/test_env_config.py +122 -0
  158. tests/test_error_handler_compatibility.py +113 -0
  159. tests/test_final_validation.py +153 -153
  160. tests/test_framework_env_usage.py +104 -0
  161. tests/test_integration.py +357 -0
  162. tests/test_item_dedup_redis_key.py +123 -0
  163. tests/test_parsel.py +30 -0
  164. tests/test_performance.py +328 -0
  165. tests/test_proxy_health_check.py +32 -32
  166. tests/test_proxy_middleware_integration.py +136 -136
  167. tests/test_proxy_providers.py +56 -56
  168. tests/test_proxy_stats.py +19 -19
  169. tests/test_proxy_strategies.py +59 -59
  170. tests/test_queue_manager_double_crawlo.py +231 -0
  171. tests/test_queue_manager_redis_key.py +177 -0
  172. tests/test_redis_config.py +28 -28
  173. tests/test_redis_connection_pool.py +295 -0
  174. tests/test_redis_key_naming.py +182 -0
  175. tests/test_redis_key_validator.py +124 -0
  176. tests/test_redis_queue.py +224 -224
  177. tests/test_request_serialization.py +70 -70
  178. tests/test_response_improvements.py +153 -0
  179. tests/test_scheduler.py +241 -241
  180. tests/test_simple_response.py +62 -0
  181. tests/test_telecom_spider_redis_key.py +206 -0
  182. tests/test_template_content.py +88 -0
  183. tests/test_template_redis_key.py +135 -0
  184. tests/test_tools.py +154 -0
  185. tests/tools_example.py +258 -0
  186. crawlo/core/enhanced_engine.py +0 -190
  187. crawlo-1.1.4.dist-info/RECORD +0 -117
  188. {crawlo-1.1.4.dist-info → crawlo-1.1.6.dist-info}/WHEEL +0 -0
  189. {crawlo-1.1.4.dist-info → crawlo-1.1.6.dist-info}/entry_points.txt +0 -0
  190. {crawlo-1.1.4.dist-info → crawlo-1.1.6.dist-info}/top_level.txt +0 -0
crawlo/core/scheduler.py CHANGED
@@ -1,166 +1,137 @@
- #!/usr/bin/python
- # -*- coding:UTF-8 -*-
- from typing import Optional, Callable
-
- from crawlo.utils.log import get_logger
- from crawlo.utils.request import set_request
- from crawlo.utils.request_serializer import RequestSerializer
- from crawlo.queue.queue_manager import QueueManager, QueueConfig
- from crawlo.project import load_class, common_call
-
-
- class Scheduler:
-     def __init__(self, crawler, dupe_filter, stats, log_level, priority):
-         self.crawler = crawler
-         self.queue_manager: Optional[QueueManager] = None
-         self.request_serializer = RequestSerializer()  # dedicated serialization handler
-
-         self.logger = get_logger(name=self.__class__.__name__, level=log_level)
-         self.stats = stats
-         self.dupe_filter = dupe_filter
-         self.priority = priority
-
-     @classmethod
-     def create_instance(cls, crawler):
-         filter_cls = load_class(crawler.settings.get('FILTER_CLASS'))
-         o = cls(
-             crawler=crawler,
-             dupe_filter=filter_cls.create_instance(crawler),
-             stats=crawler.stats,
-             log_level=crawler.settings.get('LOG_LEVEL'),
-             priority=crawler.settings.get('DEPTH_PRIORITY')
-         )
-         return o
-
-     async def open(self):
-         """Initialize the scheduler and its queue."""
-         try:
-             # Build the queue configuration
-             queue_config = QueueConfig.from_settings(self.crawler.settings)
-
-             # Create the queue manager
-             self.queue_manager = QueueManager(queue_config)
-
-             # Initialize the queue
-             success = await self.queue_manager.initialize()
-             if not success:
-                 raise RuntimeError("Queue initialization failed")
-
-             # Report queue status
-             status = self.queue_manager.get_status()
-             self.logger.info(f'Queue type: {status["type"]}, status: {status["health"]}')
-             self.logger.info(f'requesting filter: {self.dupe_filter}')
-         except Exception as e:
-             self.logger.error(f"❌ Scheduler initialization failed: {e}")
-             raise
-
-     async def next_request(self):
-         """Fetch the next request."""
-         if not self.queue_manager:
-             return None
-
-         request = await self.queue_manager.get()
-
-         # Restore the callback (when the request came from the Redis queue)
-         if request:
-             spider = getattr(self.crawler, 'spider', None)
-             request = self.request_serializer.restore_after_deserialization(request, spider)
-
-         return request
-
-     async def enqueue_request(self, request):
-         """Put a request onto the queue."""
-         if not request.dont_filter and await common_call(self.dupe_filter.requested, request):
-             self.dupe_filter.log_stats(request)
-             return False
-
-         if not self.queue_manager:
-             self.logger.error("Queue manager not initialized")
-             return False
-
-         set_request(request, self.priority)
-
-         # Use the unified queue interface
-         success = await self.queue_manager.put(request, priority=getattr(request, 'priority', 0))
-
-         if success:
-             self.logger.debug(f"✅ Request enqueued: {request.url}")
-
-         return success
-
-     def idle(self) -> bool:
-         """Check whether the queue is empty."""
-         return len(self) == 0
-
-     async def close(self):
-         """Close the scheduler."""
-         if isinstance(closed := getattr(self.dupe_filter, 'closed', None), Callable):
-             await closed()
-
-         if self.queue_manager:
-             await self.queue_manager.close()
-
-     def __len__(self):
-         """Get the queue size."""
-         if not self.queue_manager:
-             return 0
-         # Return a synchronous approximation; the real size must be fetched asynchronously
-         return 0 if self.queue_manager.empty() else 1
-
- # #!/usr/bin/python
- # # -*- coding:UTF-8 -*-
- # from typing import Optional, Callable
- #
- # from crawlo.utils.log import get_logger
- # from crawlo.utils.request import set_request
- # from crawlo.utils.pqueue import SpiderPriorityQueue
- # from crawlo.project import load_class, common_call
- #
- #
- # class Scheduler:
- #     def __init__(self, crawler, dupe_filter, stats, log_level, priority):
- #         self.crawler = crawler
- #         self.request_queue: Optional[SpiderPriorityQueue] = None
- #
- #         self.logger = get_logger(name=self.__class__.__name__, level=log_level)
- #         self.stats = stats
- #         self.dupe_filter = dupe_filter
- #         self.priority = priority
- #
- #     @classmethod
- #     def create_instance(cls, crawler):
- #         filter_cls = load_class(crawler.settings.get('FILTER_CLASS'))
- #         o = cls(
- #             crawler=crawler,
- #             dupe_filter=filter_cls.create_instance(crawler),
- #             stats=crawler.stats,
- #             log_level=crawler.settings.get('LOG_LEVEL'),
- #             priority=crawler.settings.get('DEPTH_PRIORITY')
- #         )
- #         return o
- #
- #     def open(self):
- #         self.request_queue = SpiderPriorityQueue()
- #         self.logger.info(f'requesting filter: {self.dupe_filter}')
- #
- #     async def next_request(self):
- #         request = await self.request_queue.get()
- #         return request
- #
- #     async def enqueue_request(self, request):
- #         if not request.dont_filter and await common_call(self.dupe_filter.requested, request):
- #             self.dupe_filter.log_stats(request)
- #             return False
- #         set_request(request, self.priority)
- #         await self.request_queue.put(request)
- #         return True
- #
- #     def idle(self) -> bool:
- #         return len(self) == 0
- #
- #     async def close(self):
- #         if isinstance(closed := getattr(self.dupe_filter, 'closed', None), Callable):
- #             await closed()
- #
- #     def __len__(self):
- #         return self.request_queue.qsize()
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ from typing import Optional, Callable
+
+ from crawlo.utils.log import get_logger
+ from crawlo.utils.request import set_request
+ from crawlo.utils.request_serializer import RequestSerializer
+ from crawlo.utils.error_handler import ErrorHandler
+ from crawlo.queue.queue_manager import QueueManager, QueueConfig
+ from crawlo.project import load_class, common_call
+
+
+ class Scheduler:
+     def __init__(self, crawler, dupe_filter, stats, log_level, priority):
+         self.crawler = crawler
+         self.queue_manager: Optional[QueueManager] = None
+         self.request_serializer = RequestSerializer()  # dedicated serialization handler
+
+         self.logger = get_logger(name=self.__class__.__name__, level=log_level)
+         self.error_handler = ErrorHandler(self.__class__.__name__, log_level)
+         self.stats = stats
+         self.dupe_filter = dupe_filter
+         self.priority = priority
+
+     @classmethod
+     def create_instance(cls, crawler):
+         filter_cls = load_class(crawler.settings.get('FILTER_CLASS'))
+         o = cls(
+             crawler=crawler,
+             dupe_filter=filter_cls.create_instance(crawler),
+             stats=crawler.stats,
+             log_level=crawler.settings.get('LOG_LEVEL'),
+             priority=crawler.settings.get('DEPTH_PRIORITY')
+         )
+         return o
+
+     async def open(self):
+         """Initialize the scheduler and its queue."""
+         self.logger.info("Starting scheduler initialization...")
+         try:
+             # Build the queue configuration
+             queue_config = QueueConfig.from_settings(self.crawler.settings)
+
+             # Create the queue manager
+             self.queue_manager = QueueManager(queue_config)
+
+             # Initialize the queue
+             self.logger.info("Starting queue manager initialization...")
+             success = await self.queue_manager.initialize()
+             if not success:
+                 raise RuntimeError("Queue initialization failed")
+
+             # Report queue status
+             status = self.queue_manager.get_status()
+             self.logger.info(f'Queue type: {status["type"]}, status: {status["health"]}')
+             self.logger.info(f'requesting filter: {self.dupe_filter}')
+             self.logger.info("Scheduler initialization complete")
+         except Exception as e:
+             self.logger.error(f"❌ Scheduler initialization failed: {e}")
+             self.logger.debug(f"Detailed error info:\n{traceback.format_exc()}")
+             raise
+
+     async def next_request(self):
+         """Fetch the next request."""
+         if not self.queue_manager:
+             return None
+
+         try:
+             request = await self.queue_manager.get()
+
+             # Restore the callback (when the request came from the Redis queue)
+             if request:
+                 spider = getattr(self.crawler, 'spider', None)
+                 request = self.request_serializer.restore_after_deserialization(request, spider)
+
+             return request
+         except Exception as e:
+             self.error_handler.handle_error(
+                 e,
+                 context="Failed to fetch next request",
+                 raise_error=False
+             )
+             return None
+
+     async def enqueue_request(self, request):
+         """Put a request onto the queue."""
+         if not request.dont_filter and await common_call(self.dupe_filter.requested, request):
+             self.dupe_filter.log_stats(request)
+             return False
+
+         if not self.queue_manager:
+             self.logger.error("Queue manager not initialized")
+             return False
+
+         set_request(request, self.priority)
+
+         try:
+             # Use the unified queue interface
+             success = await self.queue_manager.put(request, priority=getattr(request, 'priority', 0))
+
+             if success:
+                 self.logger.debug(f"✅ Request enqueued: {request.url}")
+
+             return success
+         except Exception as e:
+             self.error_handler.handle_error(
+                 e,
+                 context="Failed to enqueue request",
+                 raise_error=False
+             )
+             return False
+
+     def idle(self) -> bool:
+         """Check whether the queue is empty."""
+         return len(self) == 0
+
+     async def close(self):
+         """Close the scheduler."""
+         try:
+             if isinstance(closed := getattr(self.dupe_filter, 'closed', None), Callable):
+                 await closed()
+
+             if self.queue_manager:
+                 await self.queue_manager.close()
+         except Exception as e:
+             self.error_handler.handle_error(
+                 e,
+                 context="Failed to close scheduler",
+                 raise_error=False
+             )
+
+     def __len__(self):
+         """Get the queue size."""
+         if not self.queue_manager:
+             return 0
+         # Return a synchronous approximation; the real size must be fetched asynchronously
+         return 0 if self.queue_manager.empty() else 1