crawlo 1.3.3__py3-none-any.whl → 1.3.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic. Click here for more details.

Files changed (289)
  1. crawlo/__init__.py +87 -63
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +75 -75
  4. crawlo/commands/__init__.py +14 -14
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +151 -151
  7. crawlo/commands/help.py +138 -138
  8. crawlo/commands/list.py +155 -155
  9. crawlo/commands/run.py +341 -323
  10. crawlo/commands/startproject.py +436 -436
  11. crawlo/commands/stats.py +187 -187
  12. crawlo/commands/utils.py +196 -196
  13. crawlo/config.py +312 -312
  14. crawlo/config_validator.py +277 -277
  15. crawlo/core/__init__.py +46 -2
  16. crawlo/core/engine.py +439 -365
  17. crawlo/core/processor.py +40 -40
  18. crawlo/core/scheduler.py +257 -256
  19. crawlo/crawler.py +639 -1167
  20. crawlo/data/__init__.py +5 -5
  21. crawlo/data/user_agents.py +194 -194
  22. crawlo/downloader/__init__.py +273 -273
  23. crawlo/downloader/aiohttp_downloader.py +228 -226
  24. crawlo/downloader/cffi_downloader.py +245 -245
  25. crawlo/downloader/httpx_downloader.py +259 -259
  26. crawlo/downloader/hybrid_downloader.py +212 -212
  27. crawlo/downloader/playwright_downloader.py +402 -402
  28. crawlo/downloader/selenium_downloader.py +472 -472
  29. crawlo/event.py +11 -11
  30. crawlo/exceptions.py +81 -81
  31. crawlo/extension/__init__.py +39 -39
  32. crawlo/extension/health_check.py +141 -141
  33. crawlo/extension/log_interval.py +57 -57
  34. crawlo/extension/log_stats.py +81 -81
  35. crawlo/extension/logging_extension.py +61 -52
  36. crawlo/extension/memory_monitor.py +104 -104
  37. crawlo/extension/performance_profiler.py +133 -133
  38. crawlo/extension/request_recorder.py +107 -107
  39. crawlo/factories/__init__.py +28 -0
  40. crawlo/factories/base.py +69 -0
  41. crawlo/factories/crawler.py +104 -0
  42. crawlo/factories/registry.py +85 -0
  43. crawlo/filters/__init__.py +154 -154
  44. crawlo/filters/aioredis_filter.py +257 -234
  45. crawlo/filters/memory_filter.py +269 -269
  46. crawlo/framework.py +292 -0
  47. crawlo/initialization/__init__.py +40 -0
  48. crawlo/initialization/built_in.py +426 -0
  49. crawlo/initialization/context.py +142 -0
  50. crawlo/initialization/core.py +194 -0
  51. crawlo/initialization/phases.py +149 -0
  52. crawlo/initialization/registry.py +146 -0
  53. crawlo/items/__init__.py +23 -23
  54. crawlo/items/base.py +23 -22
  55. crawlo/items/fields.py +52 -52
  56. crawlo/items/items.py +104 -104
  57. crawlo/logging/__init__.py +38 -0
  58. crawlo/logging/config.py +97 -0
  59. crawlo/logging/factory.py +129 -0
  60. crawlo/logging/manager.py +112 -0
  61. crawlo/middleware/__init__.py +21 -21
  62. crawlo/middleware/default_header.py +132 -132
  63. crawlo/middleware/download_delay.py +104 -104
  64. crawlo/middleware/middleware_manager.py +135 -135
  65. crawlo/middleware/offsite.py +123 -123
  66. crawlo/middleware/proxy.py +386 -386
  67. crawlo/middleware/request_ignore.py +86 -86
  68. crawlo/middleware/response_code.py +163 -163
  69. crawlo/middleware/response_filter.py +136 -136
  70. crawlo/middleware/retry.py +124 -124
  71. crawlo/middleware/simple_proxy.py +65 -65
  72. crawlo/mode_manager.py +212 -187
  73. crawlo/network/__init__.py +21 -21
  74. crawlo/network/request.py +379 -379
  75. crawlo/network/response.py +359 -359
  76. crawlo/pipelines/__init__.py +21 -21
  77. crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
  78. crawlo/pipelines/console_pipeline.py +39 -39
  79. crawlo/pipelines/csv_pipeline.py +316 -316
  80. crawlo/pipelines/database_dedup_pipeline.py +222 -222
  81. crawlo/pipelines/json_pipeline.py +218 -218
  82. crawlo/pipelines/memory_dedup_pipeline.py +115 -115
  83. crawlo/pipelines/mongo_pipeline.py +131 -131
  84. crawlo/pipelines/mysql_pipeline.py +325 -318
  85. crawlo/pipelines/pipeline_manager.py +76 -75
  86. crawlo/pipelines/redis_dedup_pipeline.py +166 -166
  87. crawlo/project.py +327 -325
  88. crawlo/queue/pqueue.py +43 -37
  89. crawlo/queue/queue_manager.py +503 -379
  90. crawlo/queue/redis_priority_queue.py +326 -306
  91. crawlo/settings/__init__.py +7 -7
  92. crawlo/settings/default_settings.py +321 -225
  93. crawlo/settings/setting_manager.py +214 -198
  94. crawlo/spider/__init__.py +657 -639
  95. crawlo/stats_collector.py +73 -59
  96. crawlo/subscriber.py +129 -129
  97. crawlo/task_manager.py +139 -30
  98. crawlo/templates/crawlo.cfg.tmpl +10 -10
  99. crawlo/templates/project/__init__.py.tmpl +3 -3
  100. crawlo/templates/project/items.py.tmpl +17 -17
  101. crawlo/templates/project/middlewares.py.tmpl +118 -118
  102. crawlo/templates/project/pipelines.py.tmpl +96 -96
  103. crawlo/templates/project/settings.py.tmpl +168 -267
  104. crawlo/templates/project/settings_distributed.py.tmpl +167 -180
  105. crawlo/templates/project/settings_gentle.py.tmpl +167 -61
  106. crawlo/templates/project/settings_high_performance.py.tmpl +168 -131
  107. crawlo/templates/project/settings_minimal.py.tmpl +66 -35
  108. crawlo/templates/project/settings_simple.py.tmpl +165 -102
  109. crawlo/templates/project/spiders/__init__.py.tmpl +10 -6
  110. crawlo/templates/run.py.tmpl +34 -38
  111. crawlo/templates/spider/spider.py.tmpl +143 -143
  112. crawlo/templates/spiders_init.py.tmpl +10 -0
  113. crawlo/tools/__init__.py +200 -200
  114. crawlo/tools/anti_crawler.py +268 -268
  115. crawlo/tools/authenticated_proxy.py +240 -240
  116. crawlo/tools/data_formatter.py +225 -225
  117. crawlo/tools/data_validator.py +180 -180
  118. crawlo/tools/date_tools.py +289 -289
  119. crawlo/tools/distributed_coordinator.py +388 -388
  120. crawlo/tools/encoding_converter.py +127 -127
  121. crawlo/tools/network_diagnostic.py +365 -0
  122. crawlo/tools/request_tools.py +82 -82
  123. crawlo/tools/retry_mechanism.py +224 -224
  124. crawlo/tools/scenario_adapter.py +262 -262
  125. crawlo/tools/text_cleaner.py +232 -232
  126. crawlo/utils/__init__.py +34 -34
  127. crawlo/utils/batch_processor.py +259 -259
  128. crawlo/utils/class_loader.py +26 -0
  129. crawlo/utils/controlled_spider_mixin.py +439 -439
  130. crawlo/utils/db_helper.py +343 -343
  131. crawlo/utils/enhanced_error_handler.py +356 -356
  132. crawlo/utils/env_config.py +142 -142
  133. crawlo/utils/error_handler.py +165 -124
  134. crawlo/utils/func_tools.py +82 -82
  135. crawlo/utils/large_scale_config.py +286 -286
  136. crawlo/utils/large_scale_helper.py +344 -344
  137. crawlo/utils/log.py +80 -200
  138. crawlo/utils/performance_monitor.py +285 -285
  139. crawlo/utils/queue_helper.py +175 -175
  140. crawlo/utils/redis_connection_pool.py +388 -351
  141. crawlo/utils/redis_key_validator.py +198 -198
  142. crawlo/utils/request.py +267 -267
  143. crawlo/utils/request_serializer.py +225 -218
  144. crawlo/utils/spider_loader.py +61 -61
  145. crawlo/utils/system.py +11 -11
  146. crawlo/utils/tools.py +4 -4
  147. crawlo/utils/url.py +39 -39
  148. {crawlo-1.3.3.dist-info → crawlo-1.3.5.dist-info}/METADATA +1126 -1020
  149. crawlo-1.3.5.dist-info/RECORD +288 -0
  150. examples/__init__.py +7 -7
  151. tests/__init__.py +7 -7
  152. tests/advanced_tools_example.py +275 -275
  153. tests/authenticated_proxy_example.py +107 -107
  154. tests/baidu_performance_test.py +109 -0
  155. tests/baidu_test.py +60 -0
  156. tests/cleaners_example.py +160 -160
  157. tests/comprehensive_framework_test.py +213 -0
  158. tests/comprehensive_test.py +82 -0
  159. tests/comprehensive_testing_summary.md +187 -0
  160. tests/config_validation_demo.py +142 -142
  161. tests/controlled_spider_example.py +205 -205
  162. tests/date_tools_example.py +180 -180
  163. tests/debug_configure.py +70 -0
  164. tests/debug_framework_logger.py +85 -0
  165. tests/debug_log_config.py +127 -0
  166. tests/debug_log_levels.py +64 -0
  167. tests/debug_pipelines.py +66 -66
  168. tests/detailed_log_test.py +234 -0
  169. tests/distributed_test.py +67 -0
  170. tests/distributed_test_debug.py +77 -0
  171. tests/dynamic_loading_example.py +523 -523
  172. tests/dynamic_loading_test.py +104 -104
  173. tests/env_config_example.py +133 -133
  174. tests/error_handling_example.py +171 -171
  175. tests/final_command_test_report.md +0 -0
  176. tests/final_comprehensive_test.py +152 -0
  177. tests/final_log_test.py +261 -0
  178. tests/final_validation_test.py +183 -0
  179. tests/fix_log_test.py +143 -0
  180. tests/framework_performance_test.py +203 -0
  181. tests/log_buffering_test.py +112 -0
  182. tests/log_generation_timing_test.py +154 -0
  183. tests/optimized_performance_test.py +212 -0
  184. tests/performance_comparison.py +246 -0
  185. tests/queue_blocking_test.py +114 -0
  186. tests/queue_test.py +90 -0
  187. tests/redis_key_validation_demo.py +130 -130
  188. tests/request_params_example.py +150 -150
  189. tests/response_improvements_example.py +144 -144
  190. tests/scrapy_comparison/ofweek_scrapy.py +139 -0
  191. tests/scrapy_comparison/scrapy_test.py +134 -0
  192. tests/simple_command_test.py +120 -0
  193. tests/simple_crawlo_test.py +128 -0
  194. tests/simple_log_test.py +58 -0
  195. tests/simple_log_test2.py +138 -0
  196. tests/simple_optimization_test.py +129 -0
  197. tests/simple_spider_test.py +50 -0
  198. tests/simple_test.py +48 -0
  199. tests/spider_log_timing_test.py +178 -0
  200. tests/test_advanced_tools.py +148 -148
  201. tests/test_all_commands.py +231 -0
  202. tests/test_all_redis_key_configs.py +145 -145
  203. tests/test_authenticated_proxy.py +141 -141
  204. tests/test_batch_processor.py +179 -0
  205. tests/test_cleaners.py +54 -54
  206. tests/test_component_factory.py +175 -0
  207. tests/test_comprehensive.py +146 -146
  208. tests/test_config_consistency.py +80 -80
  209. tests/test_config_merge.py +152 -152
  210. tests/test_config_validator.py +182 -182
  211. tests/test_controlled_spider_mixin.py +80 -0
  212. tests/test_crawlo_proxy_integration.py +108 -108
  213. tests/test_date_tools.py +123 -123
  214. tests/test_default_header_middleware.py +158 -158
  215. tests/test_distributed.py +65 -65
  216. tests/test_double_crawlo_fix.py +207 -207
  217. tests/test_double_crawlo_fix_simple.py +124 -124
  218. tests/test_download_delay_middleware.py +221 -221
  219. tests/test_downloader_proxy_compatibility.py +268 -268
  220. tests/test_dynamic_downloaders_proxy.py +124 -124
  221. tests/test_dynamic_proxy.py +92 -92
  222. tests/test_dynamic_proxy_config.py +146 -146
  223. tests/test_dynamic_proxy_real.py +109 -109
  224. tests/test_edge_cases.py +303 -303
  225. tests/test_enhanced_error_handler.py +270 -270
  226. tests/test_enhanced_error_handler_comprehensive.py +246 -0
  227. tests/test_env_config.py +121 -121
  228. tests/test_error_handler_compatibility.py +112 -112
  229. tests/test_factories.py +253 -0
  230. tests/test_final_validation.py +153 -153
  231. tests/test_framework_env_usage.py +103 -103
  232. tests/test_framework_logger.py +67 -0
  233. tests/test_framework_startup.py +65 -0
  234. tests/test_get_component_logger.py +84 -0
  235. tests/test_integration.py +169 -169
  236. tests/test_item_dedup_redis_key.py +122 -122
  237. tests/test_large_scale_config.py +113 -0
  238. tests/test_large_scale_helper.py +236 -0
  239. tests/test_logging_system.py +283 -0
  240. tests/test_mode_change.py +73 -0
  241. tests/test_mode_consistency.py +51 -51
  242. tests/test_offsite_middleware.py +221 -221
  243. tests/test_parsel.py +29 -29
  244. tests/test_performance.py +327 -327
  245. tests/test_performance_monitor.py +116 -0
  246. tests/test_proxy_api.py +264 -264
  247. tests/test_proxy_health_check.py +32 -32
  248. tests/test_proxy_middleware.py +121 -121
  249. tests/test_proxy_middleware_enhanced.py +216 -216
  250. tests/test_proxy_middleware_integration.py +136 -136
  251. tests/test_proxy_middleware_refactored.py +184 -184
  252. tests/test_proxy_providers.py +56 -56
  253. tests/test_proxy_stats.py +19 -19
  254. tests/test_proxy_strategies.py +59 -59
  255. tests/test_queue_empty_check.py +42 -0
  256. tests/test_queue_manager_double_crawlo.py +173 -173
  257. tests/test_queue_manager_redis_key.py +176 -176
  258. tests/test_random_user_agent.py +72 -72
  259. tests/test_real_scenario_proxy.py +195 -195
  260. tests/test_redis_config.py +28 -28
  261. tests/test_redis_connection_pool.py +294 -294
  262. tests/test_redis_key_naming.py +181 -181
  263. tests/test_redis_key_validator.py +123 -123
  264. tests/test_redis_queue.py +224 -224
  265. tests/test_request_ignore_middleware.py +182 -182
  266. tests/test_request_params.py +111 -111
  267. tests/test_request_serialization.py +70 -70
  268. tests/test_response_code_middleware.py +349 -349
  269. tests/test_response_filter_middleware.py +427 -427
  270. tests/test_response_improvements.py +152 -152
  271. tests/test_retry_middleware.py +241 -241
  272. tests/test_scheduler.py +252 -252
  273. tests/test_scheduler_config_update.py +133 -133
  274. tests/test_simple_response.py +61 -61
  275. tests/test_telecom_spider_redis_key.py +205 -205
  276. tests/test_template_content.py +87 -87
  277. tests/test_template_redis_key.py +134 -134
  278. tests/test_tools.py +159 -159
  279. tests/test_user_agents.py +96 -96
  280. tests/tools_example.py +260 -260
  281. tests/untested_features_report.md +139 -0
  282. tests/verify_debug.py +52 -0
  283. tests/verify_distributed.py +117 -117
  284. tests/verify_log_fix.py +112 -0
  285. crawlo-1.3.3.dist-info/RECORD +0 -219
  286. tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +0 -82
  287. {crawlo-1.3.3.dist-info → crawlo-1.3.5.dist-info}/WHEEL +0 -0
  288. {crawlo-1.3.3.dist-info → crawlo-1.3.5.dist-info}/entry_points.txt +0 -0
  289. {crawlo-1.3.3.dist-info → crawlo-1.3.5.dist-info}/top_level.txt +0 -0
@@ -1,345 +1,345 @@
1
- #!/usr/bin/python
2
- # -*- coding: UTF-8 -*-
3
- """
4
- 大规模爬虫优化辅助工具
5
- """
6
- import asyncio
7
- import json
8
- import time
9
- from typing import Generator, List, Dict, Any
10
-
11
- from crawlo.utils.log import get_logger
12
-
13
-
14
- class LargeScaleHelper:
15
- """大规模爬虫辅助类"""
16
-
17
- def __init__(self, batch_size: int = 1000, checkpoint_interval: int = 5000):
18
- self.batch_size = batch_size
19
- self.checkpoint_interval = checkpoint_interval
20
- self.logger = get_logger(self.__class__.__name__)
21
-
22
- def batch_iterator(self, data_source, start_offset: int = 0) -> Generator[List[Any], None, None]:
23
- """
24
- 批量迭代器,适用于大量数据的分批处理
25
-
26
- Args:
27
- data_source: 数据源(支持多种类型)
28
- start_offset: 起始偏移量
29
-
30
- Yields:
31
- 每批数据的列表
32
- """
33
- if hasattr(data_source, '__iter__') and not isinstance(data_source, (str, bytes)):
34
- # 可迭代对象
35
- yield from self._iterate_batches(data_source, start_offset)
36
- elif hasattr(data_source, 'get_batch'):
37
- # 支持分批获取的数据源
38
- yield from self._get_batches_from_source(data_source, start_offset)
39
- elif callable(data_source):
40
- # 函数形式的数据源
41
- yield from self._get_batches_from_function(data_source, start_offset)
42
- else:
43
- raise ValueError(f"不支持的数据源类型: {type(data_source)}")
44
-
45
- def _iterate_batches(self, iterable, start_offset: int) -> Generator[List[Any], None, None]:
46
- """从可迭代对象分批获取数据"""
47
- iterator = iter(iterable)
48
-
49
- # 跳过已处理的数据
50
- for _ in range(start_offset):
51
- try:
52
- next(iterator)
53
- except StopIteration:
54
- return
55
-
56
- while True:
57
- batch = []
58
- for _ in range(self.batch_size):
59
- try:
60
- batch.append(next(iterator))
61
- except StopIteration:
62
- if batch:
63
- yield batch
64
- return
65
-
66
- if batch:
67
- yield batch
68
-
69
- def _get_batches_from_source(self, data_source, start_offset: int) -> Generator[List[Any], None, None]:
70
- """从支持分批获取的数据源获取数据"""
71
- offset = start_offset
72
-
73
- while True:
74
- try:
75
- batch = data_source.get_batch(offset, self.batch_size)
76
- if not batch:
77
- break
78
-
79
- yield batch
80
- offset += len(batch)
81
-
82
- if len(batch) < self.batch_size:
83
- break # 已到达数据末尾
84
-
85
- except Exception as e:
86
- self.logger.error(f"获取批次数据失败: {e}")
87
- break
88
-
89
- def _get_batches_from_function(self, func, start_offset: int) -> Generator[List[Any], None, None]:
90
- """从函数获取批次数据"""
91
- offset = start_offset
92
-
93
- while True:
94
- try:
95
- batch = func(offset, self.batch_size)
96
- if not batch:
97
- break
98
-
99
- yield batch
100
- offset += len(batch)
101
-
102
- if len(batch) < self.batch_size:
103
- break
104
-
105
- except Exception as e:
106
- self.logger.error(f"函数获取数据失败: {e}")
107
- break
108
-
109
-
110
- class ProgressManager:
111
- """进度管理器"""
112
-
113
- def __init__(self, progress_file: str = "spider_progress.json"):
114
- self.progress_file = progress_file
115
- self.logger = get_logger(self.__class__.__name__)
116
-
117
- def load_progress(self) -> Dict[str, Any]:
118
- """加载进度"""
119
- try:
120
- with open(self.progress_file, 'r', encoding='utf-8') as f:
121
- progress = json.load(f)
122
- self.logger.info(f"加载进度: {progress}")
123
- return progress
124
- except FileNotFoundError:
125
- self.logger.info("📄 未找到进度文件,从头开始")
126
- return self._get_default_progress()
127
- except Exception as e:
128
- self.logger.error(f"加载进度失败: {e}")
129
- return self._get_default_progress()
130
-
131
- def save_progress(self, **kwargs):
132
- """保存进度"""
133
- try:
134
- progress = {
135
- **kwargs,
136
- 'timestamp': time.time(),
137
- 'formatted_time': time.strftime('%Y-%m-%d %H:%M:%S')
138
- }
139
-
140
- with open(self.progress_file, 'w', encoding='utf-8') as f:
141
- json.dump(progress, f, indent=2, ensure_ascii=False)
142
-
143
- self.logger.debug(f"💾 已保存进度: {progress}")
144
-
145
- except Exception as e:
146
- self.logger.error(f"保存进度失败: {e}")
147
-
148
- def _get_default_progress(self) -> Dict[str, Any]:
149
- """获取默认进度"""
150
- return {
151
- 'batch_num': 0,
152
- 'processed_count': 0,
153
- 'skipped_count': 0,
154
- 'timestamp': time.time()
155
- }
156
-
157
-
158
- class MemoryOptimizer:
159
- """内存优化器"""
160
-
161
- def __init__(self, max_memory_mb: int = 500):
162
- self.max_memory_mb = max_memory_mb
163
- self.logger = get_logger(self.__class__.__name__)
164
-
165
- def check_memory_usage(self) -> Dict[str, float]:
166
- """检查内存使用情况"""
167
- try:
168
- import psutil
169
- process = psutil.Process()
170
- memory_info = process.memory_info()
171
-
172
- memory_mb = memory_info.rss / 1024 / 1024
173
- memory_percent = process.memory_percent()
174
-
175
- return {
176
- 'memory_mb': memory_mb,
177
- 'memory_percent': memory_percent,
178
- 'threshold_mb': self.max_memory_mb
179
- }
180
- except ImportError:
181
- self.logger.warning("psutil 未安装,无法监控内存")
182
- return {}
183
- except Exception as e:
184
- self.logger.error(f"检查内存失败: {e}")
185
- return {}
186
-
187
- def should_pause_for_memory(self) -> bool:
188
- """检查是否应该因内存不足而暂停"""
189
- memory_info = self.check_memory_usage()
190
-
191
- if not memory_info:
192
- return False
193
-
194
- memory_mb = memory_info.get('memory_mb', 0)
195
-
196
- if memory_mb > self.max_memory_mb:
197
- self.logger.warning(f"内存使用过高: {memory_mb:.1f}MB > {self.max_memory_mb}MB")
198
- return True
199
-
200
- return False
201
-
202
- def force_garbage_collection(self):
203
- """强制垃圾回收"""
204
- try:
205
- import gc
206
- collected = gc.collect()
207
- self.logger.debug(f"垃圾回收: 清理了 {collected} 个对象")
208
- except Exception as e:
209
- self.logger.error(f"垃圾回收失败: {e}")
210
-
211
-
212
- class DataSourceAdapter:
213
- """数据源适配器"""
214
-
215
- @staticmethod
216
- def from_redis_queue(queue, batch_size: int = 1000):
217
- """从Redis队列创建批量数据源"""
218
- def get_batch(offset: int, limit: int) -> List[Dict]:
219
- try:
220
- # 如果队列支持范围查询
221
- if hasattr(queue, 'get_range'):
222
- return queue.get_range(offset, offset + limit - 1)
223
-
224
- # 如果队列支持批量获取
225
- if hasattr(queue, 'get_batch'):
226
- return queue.get_batch(offset, limit)
227
-
228
- # 模拟批量获取
229
- results = []
230
- for _ in range(limit):
231
- item = queue.get_nowait() if hasattr(queue, 'get_nowait') else None
232
- if item:
233
- results.append(item)
234
- else:
235
- break
236
-
237
- return results
238
-
239
- except Exception as e:
240
- print(f"获取批次失败: {e}")
241
- return []
242
-
243
- return get_batch
244
-
245
- @staticmethod
246
- def from_database(db_helper, query: str, batch_size: int = 1000):
247
- """从数据库创建批量数据源"""
248
- def get_batch(offset: int, limit: int) -> List[Dict]:
249
- try:
250
- # 添加分页查询
251
- paginated_query = f"{query} LIMIT {limit} OFFSET {offset}"
252
- return db_helper.execute_query(paginated_query)
253
- except Exception as e:
254
- print(f"数据库查询失败: {e}")
255
- return []
256
-
257
- return get_batch
258
-
259
- @staticmethod
260
- def from_file(file_path: str, batch_size: int = 1000):
261
- """从文件创建批量数据源"""
262
- def get_batch(offset: int, limit: int) -> List[str]:
263
- try:
264
- with open(file_path, 'r', encoding='utf-8') as f:
265
- # 跳过已处理的行
266
- for _ in range(offset):
267
- f.readline()
268
-
269
- # 读取当前批次
270
- batch = []
271
- for _ in range(limit):
272
- line = f.readline()
273
- if not line:
274
- break
275
- batch.append(line.strip())
276
-
277
- return batch
278
- except Exception as e:
279
- print(f"读取文件失败: {e}")
280
- return []
281
-
282
- return get_batch
283
-
284
-
285
- class LargeScaleSpiderMixin:
286
- """大规模爬虫混入类"""
287
-
288
- def __init__(self):
289
- super().__init__()
290
- self.large_scale_helper = LargeScaleHelper(
291
- batch_size=getattr(self, 'batch_size', 1000),
292
- checkpoint_interval=getattr(self, 'checkpoint_interval', 5000)
293
- )
294
- self.progress_manager = ProgressManager(
295
- progress_file=getattr(self, 'progress_file', f"{self.name}_progress.json")
296
- )
297
- self.memory_optimizer = MemoryOptimizer(
298
- max_memory_mb=getattr(self, 'max_memory_mb', 500)
299
- )
300
-
301
- def create_streaming_start_requests(self, data_source):
302
- """创建流式start_requests生成器"""
303
- progress = self.progress_manager.load_progress()
304
- start_offset = progress.get('processed_count', 0)
305
-
306
- processed_count = start_offset
307
- skipped_count = progress.get('skipped_count', 0)
308
-
309
- for batch in self.large_scale_helper.batch_iterator(data_source, start_offset):
310
-
311
- # 内存检查
312
- if self.memory_optimizer.should_pause_for_memory():
313
- self.memory_optimizer.force_garbage_collection()
314
- # 可以添加延迟或其他处理
315
- asyncio.sleep(1)
316
-
317
- for item in batch:
318
- processed_count += 1
319
-
320
- # 检查进度保存
321
- if processed_count % self.large_scale_helper.checkpoint_interval == 0:
322
- self.progress_manager.save_progress(
323
- processed_count=processed_count,
324
- skipped_count=skipped_count
325
- )
326
-
327
- # 生成请求
328
- request = self.create_request_from_item(item)
329
- if request:
330
- yield request
331
- else:
332
- skipped_count += 1
333
-
334
- # 最终保存进度
335
- self.progress_manager.save_progress(
336
- processed_count=processed_count,
337
- skipped_count=skipped_count,
338
- completed=True
339
- )
340
-
341
- self.logger.info(f"处理完成!总计: {processed_count}, 跳过: {skipped_count}")
342
-
343
- def create_request_from_item(self, item):
344
- """从数据项创建请求(需要子类实现)"""
1
+ #!/usr/bin/python
2
+ # -*- coding: UTF-8 -*-
3
+ """
4
+ 大规模爬虫优化辅助工具
5
+ """
6
+ import asyncio
7
+ import json
8
+ import time
9
+ from typing import Generator, List, Dict, Any
10
+
11
+ from crawlo.utils.log import get_logger
12
+
13
+
14
class LargeScaleHelper:
    """Helper for splitting a large crawl workload into fixed-size batches."""

    def __init__(self, batch_size: int = 1000, checkpoint_interval: int = 5000):
        self.batch_size = batch_size
        self.checkpoint_interval = checkpoint_interval
        self.logger = get_logger(self.__class__.__name__)

    def batch_iterator(self, data_source, start_offset: int = 0) -> Generator[List[Any], None, None]:
        """
        Yield successive batches from *data_source*.

        Supported sources: plain iterables (except str/bytes), objects
        exposing ``get_batch(offset, limit)``, and callables taking the
        same ``(offset, limit)`` pair.

        Args:
            data_source: the data source (several kinds supported)
            start_offset: number of leading items to skip before batching

        Yields:
            One batch of items per iteration, as a list.
        """
        iterable = hasattr(data_source, '__iter__') and not isinstance(data_source, (str, bytes))
        if iterable:
            yield from self._iterate_batches(data_source, start_offset)
        elif hasattr(data_source, 'get_batch'):
            yield from self._get_batches_from_source(data_source, start_offset)
        elif callable(data_source):
            yield from self._get_batches_from_function(data_source, start_offset)
        else:
            raise ValueError(f"不支持的数据源类型: {type(data_source)}")

    def _iterate_batches(self, iterable, start_offset: int) -> Generator[List[Any], None, None]:
        """Yield fixed-size batches from a plain iterable, skipping *start_offset* items."""
        it = iter(iterable)

        # Fast-forward past items handled in a previous run.
        consumed = 0
        while consumed < start_offset:
            try:
                next(it)
            except StopIteration:
                return
            consumed += 1

        while True:
            chunk = []
            while len(chunk) < self.batch_size:
                try:
                    chunk.append(next(it))
                except StopIteration:
                    # Flush the trailing partial batch, then stop.
                    if chunk:
                        yield chunk
                    return
            if chunk:
                yield chunk

    def _get_batches_from_source(self, data_source, start_offset: int) -> Generator[List[Any], None, None]:
        """Pull batches from an object exposing ``get_batch(offset, limit)``."""
        cursor = start_offset
        while True:
            try:
                rows = data_source.get_batch(cursor, self.batch_size)
            except Exception as e:
                self.logger.error(f"获取批次数据失败: {e}")
                return
            if not rows:
                return
            yield rows
            cursor += len(rows)
            # A short batch means the source is exhausted.
            if len(rows) < self.batch_size:
                return

    def _get_batches_from_function(self, func, start_offset: int) -> Generator[List[Any], None, None]:
        """Pull batches from a callable with signature ``func(offset, limit)``."""
        cursor = start_offset
        while True:
            try:
                rows = func(cursor, self.batch_size)
            except Exception as e:
                self.logger.error(f"函数获取数据失败: {e}")
                return
            if not rows:
                return
            yield rows
            cursor += len(rows)
            if len(rows) < self.batch_size:
                return
108
+
109
+
110
class ProgressManager:
    """Persists crawl progress to a JSON file so interrupted runs can resume."""

    def __init__(self, progress_file: str = "spider_progress.json"):
        self.progress_file = progress_file
        self.logger = get_logger(self.__class__.__name__)

    def load_progress(self) -> Dict[str, Any]:
        """Load saved progress; fall back to defaults if the file is missing or unreadable."""
        try:
            with open(self.progress_file, 'r', encoding='utf-8') as fp:
                snapshot = json.load(fp)
                self.logger.info(f"加载进度: {snapshot}")
                return snapshot
        except FileNotFoundError:
            # First run: no checkpoint yet.
            self.logger.info("📄 未找到进度文件,从头开始")
            return self._get_default_progress()
        except Exception as e:
            self.logger.error(f"加载进度失败: {e}")
            return self._get_default_progress()

    def save_progress(self, **kwargs):
        """Write the given progress fields plus timestamps to the progress file."""
        try:
            snapshot = dict(kwargs)
            # Timestamps are always stamped at save time, overriding any passed-in values.
            snapshot['timestamp'] = time.time()
            snapshot['formatted_time'] = time.strftime('%Y-%m-%d %H:%M:%S')

            with open(self.progress_file, 'w', encoding='utf-8') as fp:
                json.dump(snapshot, fp, indent=2, ensure_ascii=False)

            self.logger.debug(f"💾 已保存进度: {snapshot}")
        except Exception as e:
            self.logger.error(f"保存进度失败: {e}")

    def _get_default_progress(self) -> Dict[str, Any]:
        """Return the zeroed progress record used when no checkpoint exists."""
        return {
            'batch_num': 0,
            'processed_count': 0,
            'skipped_count': 0,
            'timestamp': time.time()
        }
156
+
157
+
158
class MemoryOptimizer:
    """Watches process memory and supports pausing / GC when a threshold is exceeded."""

    def __init__(self, max_memory_mb: int = 500):
        self.max_memory_mb = max_memory_mb
        self.logger = get_logger(self.__class__.__name__)

    def check_memory_usage(self) -> Dict[str, float]:
        """Return current RSS (MB) and percent usage, or {} when unavailable."""
        try:
            import psutil  # optional dependency; degrade gracefully without it
            proc = psutil.Process()
            rss_mb = proc.memory_info().rss / 1024 / 1024
            return {
                'memory_mb': rss_mb,
                'memory_percent': proc.memory_percent(),
                'threshold_mb': self.max_memory_mb
            }
        except ImportError:
            self.logger.warning("psutil 未安装,无法监控内存")
            return {}
        except Exception as e:
            self.logger.error(f"检查内存失败: {e}")
            return {}

    def should_pause_for_memory(self) -> bool:
        """Return True when measured RSS exceeds the configured threshold."""
        stats = self.check_memory_usage()
        if not stats:
            # Monitoring unavailable: never pause.
            return False

        used_mb = stats.get('memory_mb', 0)
        if used_mb <= self.max_memory_mb:
            return False

        self.logger.warning(f"内存使用过高: {used_mb:.1f}MB > {self.max_memory_mb}MB")
        return True

    def force_garbage_collection(self):
        """Run a full GC pass, logging the number of collected objects."""
        try:
            import gc
            reclaimed = gc.collect()
            self.logger.debug(f"垃圾回收: 清理了 {reclaimed} 个对象")
        except Exception as e:
            self.logger.error(f"垃圾回收失败: {e}")
210
+
211
+
212
class DataSourceAdapter:
    """Factories that wrap queues, databases and files as ``get_batch(offset, limit)`` callables."""

    @staticmethod
    def from_redis_queue(queue, batch_size: int = 1000):
        """Build a batch getter backed by a Redis-style queue."""
        def get_batch(offset: int, limit: int) -> List[Dict]:
            try:
                # Prefer native range queries when the queue supports them.
                if hasattr(queue, 'get_range'):
                    return queue.get_range(offset, offset + limit - 1)

                # Next best: a native batch API.
                if hasattr(queue, 'get_batch'):
                    return queue.get_batch(offset, limit)

                # Fall back to draining items one at a time.
                drained = []
                while len(drained) < limit:
                    item = queue.get_nowait() if hasattr(queue, 'get_nowait') else None
                    if not item:
                        break
                    drained.append(item)

                return drained

            except Exception as e:
                print(f"获取批次失败: {e}")
                return []

        return get_batch

    @staticmethod
    def from_database(db_helper, query: str, batch_size: int = 1000):
        """Build a batch getter that paginates *query* with LIMIT/OFFSET."""
        def get_batch(offset: int, limit: int) -> List[Dict]:
            try:
                paged = f"{query} LIMIT {limit} OFFSET {offset}"
                return db_helper.execute_query(paged)
            except Exception as e:
                print(f"数据库查询失败: {e}")
                return []

        return get_batch

    @staticmethod
    def from_file(file_path: str, batch_size: int = 1000):
        """Build a batch getter that reads *file_path* line by line."""
        def get_batch(offset: int, limit: int) -> List[str]:
            try:
                with open(file_path, 'r', encoding='utf-8') as fh:
                    # Skip lines consumed by earlier batches.
                    for _ in range(offset):
                        fh.readline()

                    lines = []
                    while len(lines) < limit:
                        raw = fh.readline()
                        if not raw:
                            break
                        lines.append(raw.strip())

                    return lines
            except Exception as e:
                print(f"读取文件失败: {e}")
                return []

        return get_batch
283
+
284
+
285
class LargeScaleSpiderMixin:
    """Mixin adding batched, resumable start-request generation to a spider.

    Expects the host class to provide ``self.name`` and ``self.logger``.
    Optional class attributes ``batch_size``, ``checkpoint_interval``,
    ``progress_file`` and ``max_memory_mb`` tune the helpers.
    """

    def __init__(self):
        super().__init__()
        self.large_scale_helper = LargeScaleHelper(
            batch_size=getattr(self, 'batch_size', 1000),
            checkpoint_interval=getattr(self, 'checkpoint_interval', 5000)
        )
        self.progress_manager = ProgressManager(
            progress_file=getattr(self, 'progress_file', f"{self.name}_progress.json")
        )
        self.memory_optimizer = MemoryOptimizer(
            max_memory_mb=getattr(self, 'max_memory_mb', 500)
        )

    def create_streaming_start_requests(self, data_source):
        """Yield requests batch by batch, persisting progress for resumption.

        Resumes from the last saved ``processed_count`` and saves a
        checkpoint every ``checkpoint_interval`` processed items.
        """
        progress = self.progress_manager.load_progress()
        start_offset = progress.get('processed_count', 0)

        processed_count = start_offset
        skipped_count = progress.get('skipped_count', 0)

        for batch in self.large_scale_helper.batch_iterator(data_source, start_offset):

            # Pause briefly when memory is high so the GC pass can take effect.
            if self.memory_optimizer.should_pause_for_memory():
                self.memory_optimizer.force_garbage_collection()
                # BUG FIX: the original called asyncio.sleep(1) without
                # awaiting it, which only creates a coroutine that never
                # runs (no delay, plus a "never awaited" RuntimeWarning).
                # This is a synchronous generator, so block with time.sleep.
                time.sleep(1)

            for item in batch:
                processed_count += 1

                # Periodic checkpoint: a crash loses at most one interval of work.
                if processed_count % self.large_scale_helper.checkpoint_interval == 0:
                    self.progress_manager.save_progress(
                        processed_count=processed_count,
                        skipped_count=skipped_count
                    )

                # Items that yield no request are counted as skipped.
                request = self.create_request_from_item(item)
                if request:
                    yield request
                else:
                    skipped_count += 1

        # Final checkpoint marks the run complete.
        self.progress_manager.save_progress(
            processed_count=processed_count,
            skipped_count=skipped_count,
            completed=True
        )

        self.logger.info(f"处理完成!总计: {processed_count}, 跳过: {skipped_count}")

    def create_request_from_item(self, item):
        """Build a request from one data item; subclasses must override this."""
        raise NotImplementedError("子类必须实现 create_request_from_item 方法")