crawlo 1.3.4-py3-none-any.whl → 1.3.6-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of crawlo might be problematic.

Files changed (289)
  1. crawlo/__init__.py +87 -87
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +75 -75
  4. crawlo/commands/__init__.py +14 -14
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +151 -151
  7. crawlo/commands/help.py +138 -138
  8. crawlo/commands/list.py +155 -155
  9. crawlo/commands/run.py +341 -341
  10. crawlo/commands/startproject.py +436 -436
  11. crawlo/commands/stats.py +187 -187
  12. crawlo/commands/utils.py +196 -196
  13. crawlo/config.py +312 -312
  14. crawlo/config_validator.py +277 -277
  15. crawlo/core/__init__.py +45 -45
  16. crawlo/core/engine.py +439 -439
  17. crawlo/core/processor.py +40 -40
  18. crawlo/core/scheduler.py +257 -257
  19. crawlo/crawler.py +638 -638
  20. crawlo/data/__init__.py +5 -5
  21. crawlo/data/user_agents.py +194 -194
  22. crawlo/downloader/__init__.py +273 -273
  23. crawlo/downloader/aiohttp_downloader.py +228 -228
  24. crawlo/downloader/cffi_downloader.py +245 -245
  25. crawlo/downloader/httpx_downloader.py +259 -259
  26. crawlo/downloader/hybrid_downloader.py +212 -212
  27. crawlo/downloader/playwright_downloader.py +402 -402
  28. crawlo/downloader/selenium_downloader.py +472 -472
  29. crawlo/event.py +11 -11
  30. crawlo/exceptions.py +81 -81
  31. crawlo/extension/__init__.py +39 -39
  32. crawlo/extension/health_check.py +141 -141
  33. crawlo/extension/log_interval.py +57 -57
  34. crawlo/extension/log_stats.py +81 -81
  35. crawlo/extension/logging_extension.py +61 -61
  36. crawlo/extension/memory_monitor.py +104 -104
  37. crawlo/extension/performance_profiler.py +133 -133
  38. crawlo/extension/request_recorder.py +107 -107
  39. crawlo/factories/__init__.py +27 -27
  40. crawlo/factories/base.py +68 -68
  41. crawlo/factories/crawler.py +103 -103
  42. crawlo/factories/registry.py +84 -84
  43. crawlo/filters/__init__.py +154 -154
  44. crawlo/filters/aioredis_filter.py +257 -257
  45. crawlo/filters/memory_filter.py +269 -269
  46. crawlo/framework.py +292 -291
  47. crawlo/initialization/__init__.py +39 -39
  48. crawlo/initialization/built_in.py +425 -425
  49. crawlo/initialization/context.py +141 -141
  50. crawlo/initialization/core.py +193 -193
  51. crawlo/initialization/phases.py +148 -148
  52. crawlo/initialization/registry.py +145 -145
  53. crawlo/items/__init__.py +23 -23
  54. crawlo/items/base.py +23 -23
  55. crawlo/items/fields.py +52 -52
  56. crawlo/items/items.py +104 -104
  57. crawlo/logging/__init__.py +37 -37
  58. crawlo/logging/config.py +96 -96
  59. crawlo/logging/factory.py +128 -128
  60. crawlo/logging/manager.py +111 -111
  61. crawlo/middleware/__init__.py +21 -21
  62. crawlo/middleware/default_header.py +132 -132
  63. crawlo/middleware/download_delay.py +104 -104
  64. crawlo/middleware/middleware_manager.py +135 -135
  65. crawlo/middleware/offsite.py +123 -123
  66. crawlo/middleware/proxy.py +386 -386
  67. crawlo/middleware/request_ignore.py +86 -86
  68. crawlo/middleware/response_code.py +163 -163
  69. crawlo/middleware/response_filter.py +136 -136
  70. crawlo/middleware/retry.py +124 -124
  71. crawlo/middleware/simple_proxy.py +65 -65
  72. crawlo/mode_manager.py +212 -212
  73. crawlo/network/__init__.py +21 -21
  74. crawlo/network/request.py +379 -379
  75. crawlo/network/response.py +359 -359
  76. crawlo/pipelines/__init__.py +21 -21
  77. crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
  78. crawlo/pipelines/console_pipeline.py +39 -39
  79. crawlo/pipelines/csv_pipeline.py +316 -316
  80. crawlo/pipelines/database_dedup_pipeline.py +222 -222
  81. crawlo/pipelines/json_pipeline.py +218 -218
  82. crawlo/pipelines/memory_dedup_pipeline.py +115 -115
  83. crawlo/pipelines/mongo_pipeline.py +131 -131
  84. crawlo/pipelines/mysql_pipeline.py +325 -318
  85. crawlo/pipelines/pipeline_manager.py +76 -76
  86. crawlo/pipelines/redis_dedup_pipeline.py +166 -166
  87. crawlo/project.py +327 -327
  88. crawlo/queue/pqueue.py +42 -42
  89. crawlo/queue/queue_manager.py +503 -503
  90. crawlo/queue/redis_priority_queue.py +326 -326
  91. crawlo/settings/__init__.py +7 -7
  92. crawlo/settings/default_settings.py +321 -321
  93. crawlo/settings/setting_manager.py +214 -214
  94. crawlo/spider/__init__.py +657 -657
  95. crawlo/stats_collector.py +73 -73
  96. crawlo/subscriber.py +129 -129
  97. crawlo/task_manager.py +138 -138
  98. crawlo/templates/crawlo.cfg.tmpl +10 -10
  99. crawlo/templates/project/__init__.py.tmpl +3 -3
  100. crawlo/templates/project/items.py.tmpl +17 -17
  101. crawlo/templates/project/middlewares.py.tmpl +118 -118
  102. crawlo/templates/project/pipelines.py.tmpl +96 -96
  103. crawlo/templates/project/settings.py.tmpl +167 -167
  104. crawlo/templates/project/settings_distributed.py.tmpl +166 -166
  105. crawlo/templates/project/settings_gentle.py.tmpl +166 -166
  106. crawlo/templates/project/settings_high_performance.py.tmpl +167 -167
  107. crawlo/templates/project/settings_minimal.py.tmpl +65 -65
  108. crawlo/templates/project/settings_simple.py.tmpl +164 -164
  109. crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
  110. crawlo/templates/run.py.tmpl +34 -34
  111. crawlo/templates/spider/spider.py.tmpl +143 -143
  112. crawlo/templates/spiders_init.py.tmpl +9 -9
  113. crawlo/tools/__init__.py +200 -200
  114. crawlo/tools/anti_crawler.py +268 -268
  115. crawlo/tools/authenticated_proxy.py +240 -240
  116. crawlo/tools/data_formatter.py +225 -225
  117. crawlo/tools/data_validator.py +180 -180
  118. crawlo/tools/date_tools.py +289 -289
  119. crawlo/tools/distributed_coordinator.py +388 -388
  120. crawlo/tools/encoding_converter.py +127 -127
  121. crawlo/tools/network_diagnostic.py +364 -364
  122. crawlo/tools/request_tools.py +82 -82
  123. crawlo/tools/retry_mechanism.py +224 -224
  124. crawlo/tools/scenario_adapter.py +262 -262
  125. crawlo/tools/text_cleaner.py +232 -232
  126. crawlo/utils/__init__.py +34 -34
  127. crawlo/utils/batch_processor.py +259 -259
  128. crawlo/utils/class_loader.py +25 -25
  129. crawlo/utils/controlled_spider_mixin.py +439 -439
  130. crawlo/utils/db_helper.py +343 -343
  131. crawlo/utils/enhanced_error_handler.py +356 -356
  132. crawlo/utils/env_config.py +142 -142
  133. crawlo/utils/error_handler.py +165 -165
  134. crawlo/utils/func_tools.py +82 -82
  135. crawlo/utils/large_scale_config.py +286 -286
  136. crawlo/utils/large_scale_helper.py +344 -344
  137. crawlo/utils/log.py +80 -44
  138. crawlo/utils/performance_monitor.py +285 -285
  139. crawlo/utils/queue_helper.py +175 -175
  140. crawlo/utils/redis_connection_pool.py +388 -388
  141. crawlo/utils/redis_key_validator.py +198 -198
  142. crawlo/utils/request.py +267 -267
  143. crawlo/utils/request_serializer.py +225 -225
  144. crawlo/utils/spider_loader.py +61 -61
  145. crawlo/utils/system.py +11 -11
  146. crawlo/utils/tools.py +4 -4
  147. crawlo/utils/url.py +39 -39
  148. {crawlo-1.3.4.dist-info → crawlo-1.3.6.dist-info}/METADATA +1126 -1126
  149. crawlo-1.3.6.dist-info/RECORD +290 -0
  150. examples/__init__.py +7 -7
  151. tests/__init__.py +7 -7
  152. tests/advanced_tools_example.py +275 -275
  153. tests/authenticated_proxy_example.py +106 -106
  154. tests/baidu_performance_test.py +108 -108
  155. tests/baidu_test.py +59 -59
  156. tests/cleaners_example.py +160 -160
  157. tests/comprehensive_framework_test.py +212 -212
  158. tests/comprehensive_test.py +81 -81
  159. tests/comprehensive_testing_summary.md +186 -186
  160. tests/config_validation_demo.py +142 -142
  161. tests/controlled_spider_example.py +205 -205
  162. tests/date_tools_example.py +180 -180
  163. tests/debug_configure.py +69 -69
  164. tests/debug_framework_logger.py +84 -84
  165. tests/debug_log_config.py +127 -0
  166. tests/debug_log_levels.py +63 -63
  167. tests/debug_pipelines.py +66 -66
  168. tests/detailed_log_test.py +234 -0
  169. tests/distributed_test.py +66 -66
  170. tests/distributed_test_debug.py +76 -76
  171. tests/dynamic_loading_example.py +523 -523
  172. tests/dynamic_loading_test.py +104 -104
  173. tests/env_config_example.py +133 -133
  174. tests/error_handling_example.py +171 -171
  175. tests/final_comprehensive_test.py +151 -151
  176. tests/final_log_test.py +261 -0
  177. tests/final_validation_test.py +182 -182
  178. tests/fix_log_test.py +143 -0
  179. tests/framework_performance_test.py +202 -202
  180. tests/log_buffering_test.py +112 -0
  181. tests/log_generation_timing_test.py +154 -0
  182. tests/optimized_performance_test.py +211 -211
  183. tests/performance_comparison.py +245 -245
  184. tests/queue_blocking_test.py +113 -113
  185. tests/queue_test.py +89 -89
  186. tests/redis_key_validation_demo.py +130 -130
  187. tests/request_params_example.py +150 -150
  188. tests/response_improvements_example.py +144 -144
  189. tests/scrapy_comparison/ofweek_scrapy.py +138 -138
  190. tests/scrapy_comparison/scrapy_test.py +133 -133
  191. tests/simple_command_test.py +119 -119
  192. tests/simple_crawlo_test.py +127 -127
  193. tests/simple_log_test.py +57 -57
  194. tests/simple_log_test2.py +138 -0
  195. tests/simple_optimization_test.py +128 -128
  196. tests/simple_queue_type_test.py +42 -0
  197. tests/simple_spider_test.py +49 -49
  198. tests/simple_test.py +47 -47
  199. tests/spider_log_timing_test.py +178 -0
  200. tests/test_advanced_tools.py +148 -148
  201. tests/test_all_commands.py +230 -230
  202. tests/test_all_redis_key_configs.py +145 -145
  203. tests/test_authenticated_proxy.py +141 -141
  204. tests/test_batch_processor.py +178 -178
  205. tests/test_cleaners.py +54 -54
  206. tests/test_component_factory.py +174 -174
  207. tests/test_comprehensive.py +146 -146
  208. tests/test_config_consistency.py +80 -80
  209. tests/test_config_merge.py +152 -152
  210. tests/test_config_validator.py +182 -182
  211. tests/test_controlled_spider_mixin.py +79 -79
  212. tests/test_crawlo_proxy_integration.py +108 -108
  213. tests/test_date_tools.py +123 -123
  214. tests/test_default_header_middleware.py +158 -158
  215. tests/test_distributed.py +65 -65
  216. tests/test_double_crawlo_fix.py +207 -207
  217. tests/test_double_crawlo_fix_simple.py +124 -124
  218. tests/test_download_delay_middleware.py +221 -221
  219. tests/test_downloader_proxy_compatibility.py +268 -268
  220. tests/test_dynamic_downloaders_proxy.py +124 -124
  221. tests/test_dynamic_proxy.py +92 -92
  222. tests/test_dynamic_proxy_config.py +146 -146
  223. tests/test_dynamic_proxy_real.py +109 -109
  224. tests/test_edge_cases.py +303 -303
  225. tests/test_enhanced_error_handler.py +270 -270
  226. tests/test_enhanced_error_handler_comprehensive.py +245 -245
  227. tests/test_env_config.py +121 -121
  228. tests/test_error_handler_compatibility.py +112 -112
  229. tests/test_factories.py +252 -252
  230. tests/test_final_validation.py +153 -153
  231. tests/test_framework_env_usage.py +103 -103
  232. tests/test_framework_logger.py +66 -66
  233. tests/test_framework_startup.py +64 -64
  234. tests/test_get_component_logger.py +84 -0
  235. tests/test_integration.py +169 -169
  236. tests/test_item_dedup_redis_key.py +122 -122
  237. tests/test_large_scale_config.py +112 -112
  238. tests/test_large_scale_helper.py +235 -235
  239. tests/test_logging_system.py +283 -0
  240. tests/test_mode_change.py +72 -72
  241. tests/test_mode_consistency.py +51 -51
  242. tests/test_offsite_middleware.py +221 -221
  243. tests/test_parsel.py +29 -29
  244. tests/test_performance.py +327 -327
  245. tests/test_performance_monitor.py +115 -115
  246. tests/test_proxy_api.py +264 -264
  247. tests/test_proxy_health_check.py +32 -32
  248. tests/test_proxy_middleware.py +121 -121
  249. tests/test_proxy_middleware_enhanced.py +216 -216
  250. tests/test_proxy_middleware_integration.py +136 -136
  251. tests/test_proxy_middleware_refactored.py +184 -184
  252. tests/test_proxy_providers.py +56 -56
  253. tests/test_proxy_stats.py +19 -19
  254. tests/test_proxy_strategies.py +59 -59
  255. tests/test_queue_empty_check.py +41 -41
  256. tests/test_queue_manager_double_crawlo.py +173 -173
  257. tests/test_queue_manager_redis_key.py +176 -176
  258. tests/test_queue_type.py +107 -0
  259. tests/test_random_user_agent.py +72 -72
  260. tests/test_real_scenario_proxy.py +195 -195
  261. tests/test_redis_config.py +28 -28
  262. tests/test_redis_connection_pool.py +294 -294
  263. tests/test_redis_key_naming.py +181 -181
  264. tests/test_redis_key_validator.py +123 -123
  265. tests/test_redis_queue.py +224 -224
  266. tests/test_request_ignore_middleware.py +182 -182
  267. tests/test_request_params.py +111 -111
  268. tests/test_request_serialization.py +70 -70
  269. tests/test_response_code_middleware.py +349 -349
  270. tests/test_response_filter_middleware.py +427 -427
  271. tests/test_response_improvements.py +152 -152
  272. tests/test_retry_middleware.py +241 -241
  273. tests/test_scheduler.py +252 -252
  274. tests/test_scheduler_config_update.py +133 -133
  275. tests/test_simple_response.py +61 -61
  276. tests/test_telecom_spider_redis_key.py +205 -205
  277. tests/test_template_content.py +87 -87
  278. tests/test_template_redis_key.py +134 -134
  279. tests/test_tools.py +159 -159
  280. tests/test_user_agents.py +96 -96
  281. tests/tools_example.py +260 -260
  282. tests/untested_features_report.md +138 -138
  283. tests/verify_debug.py +51 -51
  284. tests/verify_distributed.py +117 -117
  285. tests/verify_log_fix.py +111 -111
  286. crawlo-1.3.4.dist-info/RECORD +0 -278
  287. {crawlo-1.3.4.dist-info → crawlo-1.3.6.dist-info}/WHEEL +0 -0
  288. {crawlo-1.3.4.dist-info → crawlo-1.3.6.dist-info}/entry_points.txt +0 -0
  289. {crawlo-1.3.4.dist-info → crawlo-1.3.6.dist-info}/top_level.txt +0 -0
@@ -1,345 +1,345 @@
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
Helper utilities for large-scale crawler optimization
"""
import asyncio
import json
import time
from typing import Generator, List, Dict, Any

from crawlo.utils.log import get_logger


class LargeScaleHelper:
    """Helper class for large-scale crawling"""

    def __init__(self, batch_size: int = 1000, checkpoint_interval: int = 5000):
        self.batch_size = batch_size
        self.checkpoint_interval = checkpoint_interval
        self.logger = get_logger(self.__class__.__name__)

    def batch_iterator(self, data_source, start_offset: int = 0) -> Generator[List[Any], None, None]:
        """
        Batch iterator for processing large volumes of data in chunks.

        Args:
            data_source: the data source (several types are supported)
            start_offset: offset to start from

        Yields:
            A list for each batch of data
        """
        if hasattr(data_source, '__iter__') and not isinstance(data_source, (str, bytes)):
            # Plain iterable
            yield from self._iterate_batches(data_source, start_offset)
        elif hasattr(data_source, 'get_batch'):
            # Data source that supports batched fetching
            yield from self._get_batches_from_source(data_source, start_offset)
        elif callable(data_source):
            # Function-style data source
            yield from self._get_batches_from_function(data_source, start_offset)
        else:
            raise ValueError(f"Unsupported data source type: {type(data_source)}")

    def _iterate_batches(self, iterable, start_offset: int) -> Generator[List[Any], None, None]:
        """Fetch data in batches from an iterable"""
        iterator = iter(iterable)

        # Skip data that has already been processed
        for _ in range(start_offset):
            try:
                next(iterator)
            except StopIteration:
                return

        while True:
            batch = []
            for _ in range(self.batch_size):
                try:
                    batch.append(next(iterator))
                except StopIteration:
                    if batch:
                        yield batch
                    return

            if batch:
                yield batch

    def _get_batches_from_source(self, data_source, start_offset: int) -> Generator[List[Any], None, None]:
        """Fetch data from a source that supports batched retrieval"""
        offset = start_offset

        while True:
            try:
                batch = data_source.get_batch(offset, self.batch_size)
                if not batch:
                    break

                yield batch
                offset += len(batch)

                if len(batch) < self.batch_size:
                    break  # Reached the end of the data

            except Exception as e:
                self.logger.error(f"Failed to fetch batch: {e}")
                break

    def _get_batches_from_function(self, func, start_offset: int) -> Generator[List[Any], None, None]:
        """Fetch batches from a function"""
        offset = start_offset

        while True:
            try:
                batch = func(offset, self.batch_size)
                if not batch:
                    break

                yield batch
                offset += len(batch)

                if len(batch) < self.batch_size:
                    break

            except Exception as e:
                self.logger.error(f"Failed to fetch data from function: {e}")
                break


class ProgressManager:
    """Progress manager"""

    def __init__(self, progress_file: str = "spider_progress.json"):
        self.progress_file = progress_file
        self.logger = get_logger(self.__class__.__name__)

    def load_progress(self) -> Dict[str, Any]:
        """Load saved progress"""
        try:
            with open(self.progress_file, 'r', encoding='utf-8') as f:
                progress = json.load(f)
                self.logger.info(f"Loaded progress: {progress}")
                return progress
        except FileNotFoundError:
            self.logger.info("📄 No progress file found, starting from the beginning")
            return self._get_default_progress()
        except Exception as e:
            self.logger.error(f"Failed to load progress: {e}")
            return self._get_default_progress()

    def save_progress(self, **kwargs):
        """Save progress"""
        try:
            progress = {
                **kwargs,
                'timestamp': time.time(),
                'formatted_time': time.strftime('%Y-%m-%d %H:%M:%S')
            }

            with open(self.progress_file, 'w', encoding='utf-8') as f:
                json.dump(progress, f, indent=2, ensure_ascii=False)

            self.logger.debug(f"💾 Progress saved: {progress}")

        except Exception as e:
            self.logger.error(f"Failed to save progress: {e}")

    def _get_default_progress(self) -> Dict[str, Any]:
        """Return the default progress state"""
        return {
            'batch_num': 0,
            'processed_count': 0,
            'skipped_count': 0,
            'timestamp': time.time()
        }


class MemoryOptimizer:
    """Memory optimizer"""

    def __init__(self, max_memory_mb: int = 500):
        self.max_memory_mb = max_memory_mb
        self.logger = get_logger(self.__class__.__name__)

    def check_memory_usage(self) -> Dict[str, float]:
        """Check current memory usage"""
        try:
            import psutil
            process = psutil.Process()
            memory_info = process.memory_info()

            memory_mb = memory_info.rss / 1024 / 1024
            memory_percent = process.memory_percent()

            return {
                'memory_mb': memory_mb,
                'memory_percent': memory_percent,
                'threshold_mb': self.max_memory_mb
            }
        except ImportError:
            self.logger.warning("psutil is not installed; memory cannot be monitored")
            return {}
        except Exception as e:
            self.logger.error(f"Failed to check memory: {e}")
            return {}

    def should_pause_for_memory(self) -> bool:
        """Check whether processing should pause because of memory pressure"""
        memory_info = self.check_memory_usage()

        if not memory_info:
            return False

        memory_mb = memory_info.get('memory_mb', 0)

        if memory_mb > self.max_memory_mb:
            self.logger.warning(f"Memory usage too high: {memory_mb:.1f}MB > {self.max_memory_mb}MB")
            return True

        return False

    def force_garbage_collection(self):
        """Force a garbage collection pass"""
        try:
            import gc
            collected = gc.collect()
            self.logger.debug(f"Garbage collection: reclaimed {collected} objects")
        except Exception as e:
            self.logger.error(f"Garbage collection failed: {e}")


class DataSourceAdapter:
    """Data source adapter"""

    @staticmethod
    def from_redis_queue(queue, batch_size: int = 1000):
        """Create a batched data source from a Redis queue"""
        def get_batch(offset: int, limit: int) -> List[Dict]:
            try:
                # If the queue supports range queries
                if hasattr(queue, 'get_range'):
                    return queue.get_range(offset, offset + limit - 1)

                # If the queue supports batched fetching
                if hasattr(queue, 'get_batch'):
                    return queue.get_batch(offset, limit)

                # Otherwise simulate batched fetching
                results = []
                for _ in range(limit):
                    item = queue.get_nowait() if hasattr(queue, 'get_nowait') else None
                    if item:
                        results.append(item)
                    else:
                        break

                return results

            except Exception as e:
                print(f"Failed to fetch batch: {e}")
                return []

        return get_batch

    @staticmethod
    def from_database(db_helper, query: str, batch_size: int = 1000):
        """Create a batched data source from a database"""
        def get_batch(offset: int, limit: int) -> List[Dict]:
            try:
                # Append pagination to the query
                paginated_query = f"{query} LIMIT {limit} OFFSET {offset}"
                return db_helper.execute_query(paginated_query)
            except Exception as e:
                print(f"Database query failed: {e}")
                return []

        return get_batch

    @staticmethod
    def from_file(file_path: str, batch_size: int = 1000):
        """Create a batched data source from a file"""
        def get_batch(offset: int, limit: int) -> List[str]:
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    # Skip lines that have already been processed
                    for _ in range(offset):
                        f.readline()

                    # Read the current batch
                    batch = []
                    for _ in range(limit):
                        line = f.readline()
                        if not line:
                            break
                        batch.append(line.strip())

                    return batch
            except Exception as e:
                print(f"Failed to read file: {e}")
                return []

        return get_batch


class LargeScaleSpiderMixin:
    """Mixin class for large-scale spiders"""

    def __init__(self):
        super().__init__()
        self.large_scale_helper = LargeScaleHelper(
            batch_size=getattr(self, 'batch_size', 1000),
            checkpoint_interval=getattr(self, 'checkpoint_interval', 5000)
        )
        self.progress_manager = ProgressManager(
            progress_file=getattr(self, 'progress_file', f"{self.name}_progress.json")
        )
        self.memory_optimizer = MemoryOptimizer(
            max_memory_mb=getattr(self, 'max_memory_mb', 500)
        )

    def create_streaming_start_requests(self, data_source):
        """Create a streaming start_requests generator"""
        progress = self.progress_manager.load_progress()
        start_offset = progress.get('processed_count', 0)

        processed_count = start_offset
        skipped_count = progress.get('skipped_count', 0)

        for batch in self.large_scale_helper.batch_iterator(data_source, start_offset):

            # Memory check
            if self.memory_optimizer.should_pause_for_memory():
                self.memory_optimizer.force_garbage_collection()
                # A delay or other handling could be added here; note that this call
                # returns an un-awaited coroutine inside a synchronous generator
                asyncio.sleep(1)

            for item in batch:
                processed_count += 1

                # Checkpoint: save progress periodically
                if processed_count % self.large_scale_helper.checkpoint_interval == 0:
                    self.progress_manager.save_progress(
                        processed_count=processed_count,
                        skipped_count=skipped_count
                    )

                # Generate the request
                request = self.create_request_from_item(item)
                if request:
                    yield request
                else:
                    skipped_count += 1

        # Save final progress
        self.progress_manager.save_progress(
            processed_count=processed_count,
            skipped_count=skipped_count,
            completed=True
        )

        # self.logger is expected to come from the spider class this mixin is combined with
        self.logger.info(f"Processing complete! Total: {processed_count}, skipped: {skipped_count}")

    def create_request_from_item(self, item):
        """Create a request from a data item (must be implemented by subclasses)"""
        raise NotImplementedError("Subclasses must implement the create_request_from_item method")
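
For context, a minimal usage sketch of the helpers in the hunk above (apparently crawlo/utils/large_scale_helper.py, per the file list): a spider mixes in LargeScaleSpiderMixin, feeds create_streaming_start_requests() a batched data source built with DataSourceAdapter.from_file(), and implements create_request_from_item(). The class name, the urls.txt path, and the Spider/Request import locations are assumptions inferred from the module list above rather than anything this diff confirms, so treat it as a sketch against the probable API, not documented usage.

# Sketch only: assumes crawlo exposes Spider and Request at these paths and that
# "urls.txt" (hypothetical) contains one URL per line.
from crawlo.network.request import Request
from crawlo.spider import Spider
from crawlo.utils.large_scale_helper import DataSourceAdapter, LargeScaleSpiderMixin


class UrlFileSpider(LargeScaleSpiderMixin, Spider):
    name = "url_file_spider"
    batch_size = 500            # picked up via getattr() in LargeScaleSpiderMixin.__init__
    checkpoint_interval = 2000  # save progress every 2000 processed items
    max_memory_mb = 800         # threshold used by MemoryOptimizer before forcing GC

    def start_requests(self):
        # Stream requests from the file in batches, resuming from the saved offset.
        data_source = DataSourceAdapter.from_file("urls.txt", batch_size=self.batch_size)
        yield from self.create_streaming_start_requests(data_source)

    def create_request_from_item(self, item):
        # Each item is a stripped line from the file; returning None counts it as skipped.
        url = item.strip()
        if not url:
            return None
        return Request(url=url, callback=self.parse)

    def parse(self, response):
        # Handle the response here.
        pass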