crawlo 1.3.3__py3-none-any.whl → 1.3.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic. Click here for more details.

Files changed (289) hide show
  1. crawlo/__init__.py +87 -63
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +75 -75
  4. crawlo/commands/__init__.py +14 -14
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +151 -151
  7. crawlo/commands/help.py +138 -138
  8. crawlo/commands/list.py +155 -155
  9. crawlo/commands/run.py +341 -323
  10. crawlo/commands/startproject.py +436 -436
  11. crawlo/commands/stats.py +187 -187
  12. crawlo/commands/utils.py +196 -196
  13. crawlo/config.py +312 -312
  14. crawlo/config_validator.py +277 -277
  15. crawlo/core/__init__.py +46 -2
  16. crawlo/core/engine.py +439 -365
  17. crawlo/core/processor.py +40 -40
  18. crawlo/core/scheduler.py +257 -256
  19. crawlo/crawler.py +639 -1167
  20. crawlo/data/__init__.py +5 -5
  21. crawlo/data/user_agents.py +194 -194
  22. crawlo/downloader/__init__.py +273 -273
  23. crawlo/downloader/aiohttp_downloader.py +228 -226
  24. crawlo/downloader/cffi_downloader.py +245 -245
  25. crawlo/downloader/httpx_downloader.py +259 -259
  26. crawlo/downloader/hybrid_downloader.py +212 -212
  27. crawlo/downloader/playwright_downloader.py +402 -402
  28. crawlo/downloader/selenium_downloader.py +472 -472
  29. crawlo/event.py +11 -11
  30. crawlo/exceptions.py +81 -81
  31. crawlo/extension/__init__.py +39 -39
  32. crawlo/extension/health_check.py +141 -141
  33. crawlo/extension/log_interval.py +57 -57
  34. crawlo/extension/log_stats.py +81 -81
  35. crawlo/extension/logging_extension.py +61 -52
  36. crawlo/extension/memory_monitor.py +104 -104
  37. crawlo/extension/performance_profiler.py +133 -133
  38. crawlo/extension/request_recorder.py +107 -107
  39. crawlo/factories/__init__.py +28 -0
  40. crawlo/factories/base.py +69 -0
  41. crawlo/factories/crawler.py +104 -0
  42. crawlo/factories/registry.py +85 -0
  43. crawlo/filters/__init__.py +154 -154
  44. crawlo/filters/aioredis_filter.py +257 -234
  45. crawlo/filters/memory_filter.py +269 -269
  46. crawlo/framework.py +292 -0
  47. crawlo/initialization/__init__.py +40 -0
  48. crawlo/initialization/built_in.py +426 -0
  49. crawlo/initialization/context.py +142 -0
  50. crawlo/initialization/core.py +194 -0
  51. crawlo/initialization/phases.py +149 -0
  52. crawlo/initialization/registry.py +146 -0
  53. crawlo/items/__init__.py +23 -23
  54. crawlo/items/base.py +23 -22
  55. crawlo/items/fields.py +52 -52
  56. crawlo/items/items.py +104 -104
  57. crawlo/logging/__init__.py +38 -0
  58. crawlo/logging/config.py +97 -0
  59. crawlo/logging/factory.py +129 -0
  60. crawlo/logging/manager.py +112 -0
  61. crawlo/middleware/__init__.py +21 -21
  62. crawlo/middleware/default_header.py +132 -132
  63. crawlo/middleware/download_delay.py +104 -104
  64. crawlo/middleware/middleware_manager.py +135 -135
  65. crawlo/middleware/offsite.py +123 -123
  66. crawlo/middleware/proxy.py +386 -386
  67. crawlo/middleware/request_ignore.py +86 -86
  68. crawlo/middleware/response_code.py +163 -163
  69. crawlo/middleware/response_filter.py +136 -136
  70. crawlo/middleware/retry.py +124 -124
  71. crawlo/middleware/simple_proxy.py +65 -65
  72. crawlo/mode_manager.py +212 -187
  73. crawlo/network/__init__.py +21 -21
  74. crawlo/network/request.py +379 -379
  75. crawlo/network/response.py +359 -359
  76. crawlo/pipelines/__init__.py +21 -21
  77. crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
  78. crawlo/pipelines/console_pipeline.py +39 -39
  79. crawlo/pipelines/csv_pipeline.py +316 -316
  80. crawlo/pipelines/database_dedup_pipeline.py +222 -222
  81. crawlo/pipelines/json_pipeline.py +218 -218
  82. crawlo/pipelines/memory_dedup_pipeline.py +115 -115
  83. crawlo/pipelines/mongo_pipeline.py +131 -131
  84. crawlo/pipelines/mysql_pipeline.py +325 -318
  85. crawlo/pipelines/pipeline_manager.py +76 -75
  86. crawlo/pipelines/redis_dedup_pipeline.py +166 -166
  87. crawlo/project.py +327 -325
  88. crawlo/queue/pqueue.py +43 -37
  89. crawlo/queue/queue_manager.py +503 -379
  90. crawlo/queue/redis_priority_queue.py +326 -306
  91. crawlo/settings/__init__.py +7 -7
  92. crawlo/settings/default_settings.py +321 -225
  93. crawlo/settings/setting_manager.py +214 -198
  94. crawlo/spider/__init__.py +657 -639
  95. crawlo/stats_collector.py +73 -59
  96. crawlo/subscriber.py +129 -129
  97. crawlo/task_manager.py +139 -30
  98. crawlo/templates/crawlo.cfg.tmpl +10 -10
  99. crawlo/templates/project/__init__.py.tmpl +3 -3
  100. crawlo/templates/project/items.py.tmpl +17 -17
  101. crawlo/templates/project/middlewares.py.tmpl +118 -118
  102. crawlo/templates/project/pipelines.py.tmpl +96 -96
  103. crawlo/templates/project/settings.py.tmpl +168 -267
  104. crawlo/templates/project/settings_distributed.py.tmpl +167 -180
  105. crawlo/templates/project/settings_gentle.py.tmpl +167 -61
  106. crawlo/templates/project/settings_high_performance.py.tmpl +168 -131
  107. crawlo/templates/project/settings_minimal.py.tmpl +66 -35
  108. crawlo/templates/project/settings_simple.py.tmpl +165 -102
  109. crawlo/templates/project/spiders/__init__.py.tmpl +10 -6
  110. crawlo/templates/run.py.tmpl +34 -38
  111. crawlo/templates/spider/spider.py.tmpl +143 -143
  112. crawlo/templates/spiders_init.py.tmpl +10 -0
  113. crawlo/tools/__init__.py +200 -200
  114. crawlo/tools/anti_crawler.py +268 -268
  115. crawlo/tools/authenticated_proxy.py +240 -240
  116. crawlo/tools/data_formatter.py +225 -225
  117. crawlo/tools/data_validator.py +180 -180
  118. crawlo/tools/date_tools.py +289 -289
  119. crawlo/tools/distributed_coordinator.py +388 -388
  120. crawlo/tools/encoding_converter.py +127 -127
  121. crawlo/tools/network_diagnostic.py +365 -0
  122. crawlo/tools/request_tools.py +82 -82
  123. crawlo/tools/retry_mechanism.py +224 -224
  124. crawlo/tools/scenario_adapter.py +262 -262
  125. crawlo/tools/text_cleaner.py +232 -232
  126. crawlo/utils/__init__.py +34 -34
  127. crawlo/utils/batch_processor.py +259 -259
  128. crawlo/utils/class_loader.py +26 -0
  129. crawlo/utils/controlled_spider_mixin.py +439 -439
  130. crawlo/utils/db_helper.py +343 -343
  131. crawlo/utils/enhanced_error_handler.py +356 -356
  132. crawlo/utils/env_config.py +142 -142
  133. crawlo/utils/error_handler.py +165 -124
  134. crawlo/utils/func_tools.py +82 -82
  135. crawlo/utils/large_scale_config.py +286 -286
  136. crawlo/utils/large_scale_helper.py +344 -344
  137. crawlo/utils/log.py +80 -200
  138. crawlo/utils/performance_monitor.py +285 -285
  139. crawlo/utils/queue_helper.py +175 -175
  140. crawlo/utils/redis_connection_pool.py +388 -351
  141. crawlo/utils/redis_key_validator.py +198 -198
  142. crawlo/utils/request.py +267 -267
  143. crawlo/utils/request_serializer.py +225 -218
  144. crawlo/utils/spider_loader.py +61 -61
  145. crawlo/utils/system.py +11 -11
  146. crawlo/utils/tools.py +4 -4
  147. crawlo/utils/url.py +39 -39
  148. {crawlo-1.3.3.dist-info → crawlo-1.3.5.dist-info}/METADATA +1126 -1020
  149. crawlo-1.3.5.dist-info/RECORD +288 -0
  150. examples/__init__.py +7 -7
  151. tests/__init__.py +7 -7
  152. tests/advanced_tools_example.py +275 -275
  153. tests/authenticated_proxy_example.py +107 -107
  154. tests/baidu_performance_test.py +109 -0
  155. tests/baidu_test.py +60 -0
  156. tests/cleaners_example.py +160 -160
  157. tests/comprehensive_framework_test.py +213 -0
  158. tests/comprehensive_test.py +82 -0
  159. tests/comprehensive_testing_summary.md +187 -0
  160. tests/config_validation_demo.py +142 -142
  161. tests/controlled_spider_example.py +205 -205
  162. tests/date_tools_example.py +180 -180
  163. tests/debug_configure.py +70 -0
  164. tests/debug_framework_logger.py +85 -0
  165. tests/debug_log_config.py +127 -0
  166. tests/debug_log_levels.py +64 -0
  167. tests/debug_pipelines.py +66 -66
  168. tests/detailed_log_test.py +234 -0
  169. tests/distributed_test.py +67 -0
  170. tests/distributed_test_debug.py +77 -0
  171. tests/dynamic_loading_example.py +523 -523
  172. tests/dynamic_loading_test.py +104 -104
  173. tests/env_config_example.py +133 -133
  174. tests/error_handling_example.py +171 -171
  175. tests/final_command_test_report.md +0 -0
  176. tests/final_comprehensive_test.py +152 -0
  177. tests/final_log_test.py +261 -0
  178. tests/final_validation_test.py +183 -0
  179. tests/fix_log_test.py +143 -0
  180. tests/framework_performance_test.py +203 -0
  181. tests/log_buffering_test.py +112 -0
  182. tests/log_generation_timing_test.py +154 -0
  183. tests/optimized_performance_test.py +212 -0
  184. tests/performance_comparison.py +246 -0
  185. tests/queue_blocking_test.py +114 -0
  186. tests/queue_test.py +90 -0
  187. tests/redis_key_validation_demo.py +130 -130
  188. tests/request_params_example.py +150 -150
  189. tests/response_improvements_example.py +144 -144
  190. tests/scrapy_comparison/ofweek_scrapy.py +139 -0
  191. tests/scrapy_comparison/scrapy_test.py +134 -0
  192. tests/simple_command_test.py +120 -0
  193. tests/simple_crawlo_test.py +128 -0
  194. tests/simple_log_test.py +58 -0
  195. tests/simple_log_test2.py +138 -0
  196. tests/simple_optimization_test.py +129 -0
  197. tests/simple_spider_test.py +50 -0
  198. tests/simple_test.py +48 -0
  199. tests/spider_log_timing_test.py +178 -0
  200. tests/test_advanced_tools.py +148 -148
  201. tests/test_all_commands.py +231 -0
  202. tests/test_all_redis_key_configs.py +145 -145
  203. tests/test_authenticated_proxy.py +141 -141
  204. tests/test_batch_processor.py +179 -0
  205. tests/test_cleaners.py +54 -54
  206. tests/test_component_factory.py +175 -0
  207. tests/test_comprehensive.py +146 -146
  208. tests/test_config_consistency.py +80 -80
  209. tests/test_config_merge.py +152 -152
  210. tests/test_config_validator.py +182 -182
  211. tests/test_controlled_spider_mixin.py +80 -0
  212. tests/test_crawlo_proxy_integration.py +108 -108
  213. tests/test_date_tools.py +123 -123
  214. tests/test_default_header_middleware.py +158 -158
  215. tests/test_distributed.py +65 -65
  216. tests/test_double_crawlo_fix.py +207 -207
  217. tests/test_double_crawlo_fix_simple.py +124 -124
  218. tests/test_download_delay_middleware.py +221 -221
  219. tests/test_downloader_proxy_compatibility.py +268 -268
  220. tests/test_dynamic_downloaders_proxy.py +124 -124
  221. tests/test_dynamic_proxy.py +92 -92
  222. tests/test_dynamic_proxy_config.py +146 -146
  223. tests/test_dynamic_proxy_real.py +109 -109
  224. tests/test_edge_cases.py +303 -303
  225. tests/test_enhanced_error_handler.py +270 -270
  226. tests/test_enhanced_error_handler_comprehensive.py +246 -0
  227. tests/test_env_config.py +121 -121
  228. tests/test_error_handler_compatibility.py +112 -112
  229. tests/test_factories.py +253 -0
  230. tests/test_final_validation.py +153 -153
  231. tests/test_framework_env_usage.py +103 -103
  232. tests/test_framework_logger.py +67 -0
  233. tests/test_framework_startup.py +65 -0
  234. tests/test_get_component_logger.py +84 -0
  235. tests/test_integration.py +169 -169
  236. tests/test_item_dedup_redis_key.py +122 -122
  237. tests/test_large_scale_config.py +113 -0
  238. tests/test_large_scale_helper.py +236 -0
  239. tests/test_logging_system.py +283 -0
  240. tests/test_mode_change.py +73 -0
  241. tests/test_mode_consistency.py +51 -51
  242. tests/test_offsite_middleware.py +221 -221
  243. tests/test_parsel.py +29 -29
  244. tests/test_performance.py +327 -327
  245. tests/test_performance_monitor.py +116 -0
  246. tests/test_proxy_api.py +264 -264
  247. tests/test_proxy_health_check.py +32 -32
  248. tests/test_proxy_middleware.py +121 -121
  249. tests/test_proxy_middleware_enhanced.py +216 -216
  250. tests/test_proxy_middleware_integration.py +136 -136
  251. tests/test_proxy_middleware_refactored.py +184 -184
  252. tests/test_proxy_providers.py +56 -56
  253. tests/test_proxy_stats.py +19 -19
  254. tests/test_proxy_strategies.py +59 -59
  255. tests/test_queue_empty_check.py +42 -0
  256. tests/test_queue_manager_double_crawlo.py +173 -173
  257. tests/test_queue_manager_redis_key.py +176 -176
  258. tests/test_random_user_agent.py +72 -72
  259. tests/test_real_scenario_proxy.py +195 -195
  260. tests/test_redis_config.py +28 -28
  261. tests/test_redis_connection_pool.py +294 -294
  262. tests/test_redis_key_naming.py +181 -181
  263. tests/test_redis_key_validator.py +123 -123
  264. tests/test_redis_queue.py +224 -224
  265. tests/test_request_ignore_middleware.py +182 -182
  266. tests/test_request_params.py +111 -111
  267. tests/test_request_serialization.py +70 -70
  268. tests/test_response_code_middleware.py +349 -349
  269. tests/test_response_filter_middleware.py +427 -427
  270. tests/test_response_improvements.py +152 -152
  271. tests/test_retry_middleware.py +241 -241
  272. tests/test_scheduler.py +252 -252
  273. tests/test_scheduler_config_update.py +133 -133
  274. tests/test_simple_response.py +61 -61
  275. tests/test_telecom_spider_redis_key.py +205 -205
  276. tests/test_template_content.py +87 -87
  277. tests/test_template_redis_key.py +134 -134
  278. tests/test_tools.py +159 -159
  279. tests/test_user_agents.py +96 -96
  280. tests/tools_example.py +260 -260
  281. tests/untested_features_report.md +139 -0
  282. tests/verify_debug.py +52 -0
  283. tests/verify_distributed.py +117 -117
  284. tests/verify_log_fix.py +112 -0
  285. crawlo-1.3.3.dist-info/RECORD +0 -219
  286. tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +0 -82
  287. {crawlo-1.3.3.dist-info → crawlo-1.3.5.dist-info}/WHEEL +0 -0
  288. {crawlo-1.3.3.dist-info → crawlo-1.3.5.dist-info}/entry_points.txt +0 -0
  289. {crawlo-1.3.3.dist-info → crawlo-1.3.5.dist-info}/top_level.txt +0 -0
tests/verify_debug.py ADDED
@@ -0,0 +1,52 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ 验证运行模式日志级别修改的简单测试
5
+ """
6
+ import os
7
+
8
+ # 删除旧日志文件
9
+ log_file = 'verify_debug.log'
10
+ if os.path.exists(log_file):
11
+ os.remove(log_file)
12
+
13
+ # 简单测试日志级别
14
+ from crawlo.utils.log import LoggerManager
15
+
16
+ # 配置日志系统
17
+ LoggerManager.configure(
18
+ LOG_LEVEL='INFO',
19
+ LOG_FILE=log_file
20
+ )
21
+
22
+ from crawlo.utils.log import get_logger
23
+
24
+ # 创建测试logger
25
+ test_logger = get_logger('crawlo.framework')
26
+
27
+ # 测试输出
28
+ test_logger.info("这是INFO级别的测试信息")
29
+ test_logger.debug("这是DEBUG级别的测试信息(不应该在INFO级别的日志中出现)")
30
+ test_logger.debug("使用单机模式 - 简单快速,适合开发和中小规模爬取")
31
+
32
+ print("测试完成")
33
+
34
+ # 检查日志文件
35
+ if os.path.exists(log_file):
36
+ with open(log_file, 'r', encoding='utf-8') as f:
37
+ content = f.read()
38
+ print(f"日志文件内容({len(content)} 字符):")
39
+ print(content)
40
+
41
+ # 检查是否包含不应该出现的DEBUG信息
42
+ if "DEBUG" in content:
43
+ print("❌ 发现DEBUG级别信息(不应该出现)")
44
+ else:
45
+ print("✅ 没有发现DEBUG级别信息(正确)")
46
+
47
+ if "使用单机模式" in content:
48
+ print("❌ 发现运行模式信息(不应该出现在INFO级别)")
49
+ else:
50
+ print("✅ 没有发现运行模式信息(正确)")
51
+ else:
52
+ print("❌ 日志文件未创建")
@@ -1,117 +1,117 @@
1
- #!/usr/bin/env python
2
- # -*- coding: UTF-8 -*-
3
- """
4
- 分布式采集功能验证脚本
5
- 验证Crawlo框架的分布式采集功能是否正常工作
6
- """
7
-
8
- import redis
9
- import json
10
- import os
11
- import sys
12
-
13
- # 添加项目根目录到 Python 路径
14
- project_root = os.path.dirname(os.path.abspath(__file__))
15
- sys.path.insert(0, project_root)
16
-
17
-
18
- def verify_distributed_functionality():
19
- """验证分布式采集功能"""
20
- print("=== Crawlo分布式采集功能验证 ===\n")
21
-
22
- # 1. 连接Redis
23
- try:
24
- r = redis.Redis(host='localhost', port=6379, db=2, decode_responses=False)
25
- r.ping()
26
- print("✓ Redis连接成功")
27
- except Exception as e:
28
- print(f"✗ Redis连接失败: {e}")
29
- return False
30
-
31
- # 2. 检查项目配置
32
- try:
33
- with open('../examples/ofweek_distributed/crawlo.cfg', 'r') as f:
34
- config_content = f.read()
35
- if 'ofweek_distributed.settings' in config_content:
36
- print("✓ 项目配置文件正确")
37
- else:
38
- print("✗ 项目配置文件不正确")
39
- return False
40
- except Exception as e:
41
- print(f"✗ 无法读取配置文件: {e}")
42
- return False
43
-
44
- # 3. 检查设置文件
45
- try:
46
- with open('../examples/ofweek_distributed/ofweek_distributed/settings.py', 'r') as f:
47
- settings_content = f.read()
48
- checks = [
49
- ('RUN_MODE = \'distributed\'', '运行模式设置为分布式'),
50
- ('QUEUE_TYPE = \'redis\'', '队列类型设置为Redis'),
51
- ('FILTER_CLASS = \'crawlo.filters.aioredis_filter.AioRedisFilter\'', '过滤器设置为Redis过滤器'),
52
- ('REDIS_HOST = \'127.0.0.1\'', 'Redis主机配置正确'),
53
- ]
54
-
55
- all_passed = True
56
- for check, description in checks:
57
- if check in settings_content:
58
- print(f"✓ {description}")
59
- else:
60
- print(f"✗ {description}")
61
- all_passed = False
62
-
63
- if not all_passed:
64
- return False
65
- except Exception as e:
66
- print(f"✗ 无法读取设置文件: {e}")
67
- return False
68
-
69
- # 4. 检查Redis中的数据
70
- try:
71
- # 检查请求去重指纹
72
- request_fingerprints = r.scard("crawlo:ofweek_distributed:filter:fingerprint")
73
- print(f"✓ 请求去重指纹数量: {request_fingerprints}")
74
-
75
- # 检查数据项去重指纹
76
- item_fingerprints = r.scard("crawlo:ofweek_distributed:item:fingerprint")
77
- print(f"✓ 数据项去重指纹数量: {item_fingerprints}")
78
-
79
- # 检查请求队列
80
- queue_size = r.zcard("crawlo:ofweek_distributed:queue:requests")
81
- print(f"✓ 请求队列大小: {queue_size}")
82
-
83
- # 验证数据是否存在
84
- if request_fingerprints > 0 and item_fingerprints > 0:
85
- print("✓ Redis中存在分布式采集数据")
86
- else:
87
- print("⚠ Redis中分布式采集数据为空")
88
-
89
- except Exception as e:
90
- print(f"✗ Redis数据检查失败: {e}")
91
- return False
92
-
93
- # 5. 检查输出文件
94
- try:
95
- import glob
96
- json_files = glob.glob("output/*.json")
97
- if json_files:
98
- latest_file = max(json_files, key=os.path.getctime)
99
- file_size = os.path.getsize(latest_file)
100
- print(f"✓ 输出文件存在: {latest_file} ({file_size} bytes)")
101
- else:
102
- print("⚠ 未找到输出文件")
103
- except Exception as e:
104
- print(f"✗ 输出文件检查失败: {e}")
105
-
106
- print("\n=== 验证结果 ===")
107
- print("✓ Crawlo分布式采集功能正常工作!")
108
- print(" - Redis连接正常")
109
- print(" - 分布式配置正确")
110
- print(" - Redis数据存储正常")
111
- print(" - 采集任务执行正常")
112
-
113
- return True
114
-
115
-
116
- if __name__ == '__main__':
117
- verify_distributed_functionality()
1
+ #!/usr/bin/env python
2
+ # -*- coding: UTF-8 -*-
3
+ """
4
+ 分布式采集功能验证脚本
5
+ 验证Crawlo框架的分布式采集功能是否正常工作
6
+ """
7
+
8
+ import redis
9
+ import json
10
+ import os
11
+ import sys
12
+
13
+ # 添加项目根目录到 Python 路径
14
+ project_root = os.path.dirname(os.path.abspath(__file__))
15
+ sys.path.insert(0, project_root)
16
+
17
+
18
+ def verify_distributed_functionality():
19
+ """验证分布式采集功能"""
20
+ print("=== Crawlo分布式采集功能验证 ===\n")
21
+
22
+ # 1. 连接Redis
23
+ try:
24
+ r = redis.Redis(host='localhost', port=6379, db=2, decode_responses=False)
25
+ r.ping()
26
+ print("✓ Redis连接成功")
27
+ except Exception as e:
28
+ print(f"✗ Redis连接失败: {e}")
29
+ return False
30
+
31
+ # 2. 检查项目配置
32
+ try:
33
+ with open('../examples/ofweek_distributed/crawlo.cfg', 'r') as f:
34
+ config_content = f.read()
35
+ if 'ofweek_distributed.settings' in config_content:
36
+ print("✓ 项目配置文件正确")
37
+ else:
38
+ print("✗ 项目配置文件不正确")
39
+ return False
40
+ except Exception as e:
41
+ print(f"✗ 无法读取配置文件: {e}")
42
+ return False
43
+
44
+ # 3. 检查设置文件
45
+ try:
46
+ with open('../examples/ofweek_distributed/ofweek_distributed/settings.py', 'r') as f:
47
+ settings_content = f.read()
48
+ checks = [
49
+ ('RUN_MODE = \'distributed\'', '运行模式设置为分布式'),
50
+ ('QUEUE_TYPE = \'redis\'', '队列类型设置为Redis'),
51
+ ('FILTER_CLASS = \'crawlo.filters.aioredis_filter.AioRedisFilter\'', '过滤器设置为Redis过滤器'),
52
+ ('REDIS_HOST = \'127.0.0.1\'', 'Redis主机配置正确'),
53
+ ]
54
+
55
+ all_passed = True
56
+ for check, description in checks:
57
+ if check in settings_content:
58
+ print(f"✓ {description}")
59
+ else:
60
+ print(f"✗ {description}")
61
+ all_passed = False
62
+
63
+ if not all_passed:
64
+ return False
65
+ except Exception as e:
66
+ print(f"✗ 无法读取设置文件: {e}")
67
+ return False
68
+
69
+ # 4. 检查Redis中的数据
70
+ try:
71
+ # 检查请求去重指纹
72
+ request_fingerprints = r.scard("crawlo:ofweek_distributed:filter:fingerprint")
73
+ print(f"✓ 请求去重指纹数量: {request_fingerprints}")
74
+
75
+ # 检查数据项去重指纹
76
+ item_fingerprints = r.scard("crawlo:ofweek_distributed:item:fingerprint")
77
+ print(f"✓ 数据项去重指纹数量: {item_fingerprints}")
78
+
79
+ # 检查请求队列
80
+ queue_size = r.zcard("crawlo:ofweek_distributed:queue:requests")
81
+ print(f"✓ 请求队列大小: {queue_size}")
82
+
83
+ # 验证数据是否存在
84
+ if request_fingerprints > 0 and item_fingerprints > 0:
85
+ print("✓ Redis中存在分布式采集数据")
86
+ else:
87
+ print("⚠ Redis中分布式采集数据为空")
88
+
89
+ except Exception as e:
90
+ print(f"✗ Redis数据检查失败: {e}")
91
+ return False
92
+
93
+ # 5. 检查输出文件
94
+ try:
95
+ import glob
96
+ json_files = glob.glob("output/*.json")
97
+ if json_files:
98
+ latest_file = max(json_files, key=os.path.getctime)
99
+ file_size = os.path.getsize(latest_file)
100
+ print(f"✓ 输出文件存在: {latest_file} ({file_size} bytes)")
101
+ else:
102
+ print("⚠ 未找到输出文件")
103
+ except Exception as e:
104
+ print(f"✗ 输出文件检查失败: {e}")
105
+
106
+ print("\n=== 验证结果 ===")
107
+ print("✓ Crawlo分布式采集功能正常工作!")
108
+ print(" - Redis连接正常")
109
+ print(" - 分布式配置正确")
110
+ print(" - Redis数据存储正常")
111
+ print(" - 采集任务执行正常")
112
+
113
+ return True
114
+
115
+
116
+ if __name__ == '__main__':
117
+ verify_distributed_functionality()
@@ -0,0 +1,112 @@
1
+ #!/usr/bin/python
2
+ # -*- coding: UTF-8 -*-
3
+ """
4
+ 验证日志级别修复效果
5
+ 创建一个简化的测试来验证控制台和日志文件级别的一致性
6
+ """
7
+ import sys
8
+ import os
9
+ import tempfile
10
+
11
+ # 添加项目根目录到Python路径
12
+ sys.path.insert(0, '/')
13
+
14
+ from crawlo.utils.log import LoggerManager, get_logger
15
+
16
+
17
+ def main():
18
+ """验证日志级别修复效果"""
19
+ print("🔧 验证日志级别修复效果")
20
+ print("=" * 50)
21
+
22
+ # 创建临时日志文件
23
+ temp_log = tempfile.NamedTemporaryFile(mode='w+', suffix='.log', delete=False)
24
+ temp_log_path = temp_log.name
25
+ temp_log.close()
26
+
27
+ try:
28
+ # 重置LoggerManager状态
29
+ LoggerManager.reset()
30
+
31
+ # 使用INFO级别配置
32
+ LoggerManager.configure(
33
+ LOG_LEVEL='INFO',
34
+ LOG_FILE=temp_log_path,
35
+ LOG_FORMAT='%(asctime)s - [%(name)s] - %(levelname)s: %(message)s'
36
+ )
37
+
38
+ print(f"✅ 配置完成:")
39
+ print(f" 默认级别: {LoggerManager._default_level}")
40
+ print(f" 控制台级别: {LoggerManager._default_console_level}")
41
+ print(f" 文件级别: {LoggerManager._default_file_level}")
42
+ print(f" 日志文件: {temp_log_path}")
43
+
44
+ # 创建测试logger
45
+ test_logger = get_logger('crawlo.test')
46
+
47
+ # 检查handler配置
48
+ print(f"\n📋 Handler配置:")
49
+ for i, handler in enumerate(test_logger.handlers):
50
+ handler_type = type(handler).__name__
51
+ handler_level = handler.level
52
+ print(f" Handler {i} ({handler_type}): 级别 {handler_level}")
53
+
54
+ # 测试日志输出
55
+ print(f"\n📝 测试日志输出(控制台):")
56
+ test_logger.debug("这是DEBUG级别日志 - 不应该显示")
57
+ test_logger.info("这是INFO级别日志 - 应该显示")
58
+ test_logger.warning("这是WARNING级别日志 - 应该显示")
59
+ test_logger.error("这是ERROR级别日志 - 应该显示")
60
+
61
+ # 检查日志文件内容
62
+ print(f"\n📄 检查日志文件内容:")
63
+ with open(temp_log_path, 'r', encoding='utf-8') as f:
64
+ log_content = f.read()
65
+ if log_content:
66
+ print("日志文件内容:")
67
+ print(log_content)
68
+ else:
69
+ print("❌ 日志文件为空")
70
+
71
+ # 分析结果
72
+ lines = log_content.strip().split('\n') if log_content.strip() else []
73
+ debug_lines = [line for line in lines if '- DEBUG:' in line]
74
+ info_lines = [line for line in lines if '- INFO:' in line]
75
+ warning_lines = [line for line in lines if '- WARNING:' in line]
76
+ error_lines = [line for line in lines if '- ERROR:' in line]
77
+
78
+ print(f"\n📊 分析结果:")
79
+ print(f" DEBUG级别日志: {len(debug_lines)}条 {'✅ 正确' if len(debug_lines) == 0 else '❌ 错误'}")
80
+ print(f" INFO级别日志: {len(info_lines)}条 {'✅ 正确' if len(info_lines) >= 1 else '❌ 错误'}")
81
+ print(f" WARNING级别日志: {len(warning_lines)}条 {'✅ 正确' if len(warning_lines) >= 1 else '❌ 错误'}")
82
+ print(f" ERROR级别日志: {len(error_lines)}条 {'✅ 正确' if len(error_lines) >= 1 else '❌ 错误'}")
83
+
84
+ # 判断修复是否成功
85
+ success = (len(debug_lines) == 0 and len(info_lines) >= 1 and
86
+ len(warning_lines) >= 1 and len(error_lines) >= 1)
87
+
88
+ print(f"\n🎯 修复结果: {'✅ 成功' if success else '❌ 失败'}")
89
+
90
+ if success:
91
+ print("📋 控制台和日志文件现在使用相同的INFO级别")
92
+ print("🎉 日志级别一致性问题已解决")
93
+ else:
94
+ print("❌ 仍存在日志级别不一致问题,需要进一步调试")
95
+
96
+ except Exception as e:
97
+ print(f"❌ 验证过程中发生错误: {e}")
98
+ import traceback
99
+ traceback.print_exc()
100
+ return 1
101
+ finally:
102
+ # 清理临时文件
103
+ try:
104
+ os.unlink(temp_log_path)
105
+ except:
106
+ pass
107
+
108
+ return 0 if success else 1
109
+
110
+
111
+ if __name__ == '__main__':
112
+ sys.exit(main())
@@ -1,219 +0,0 @@
1
- crawlo/__init__.py,sha256=2H7llH-yRV3N5_DomJ02JYsd5wNJdUNZI3VowiTQvOc,1444
2
- crawlo/__version__.py,sha256=9ap_Mho2n-5Wj2kAxLL8yqq57pG-v6Z_7an7VCKga44,23
3
- crawlo/cli.py,sha256=AQnAB5NMI-Ic1VPw_Jjng8L4AI4-wMozOwzE6CfXkZU,2402
4
- crawlo/config.py,sha256=EQIT7WpkXAlr2ocd5SYJYOKTSWUlQx2AkTHX7ErEWxw,9798
5
- crawlo/config_validator.py,sha256=oY4-2bwXUlwHAnGgkI-EznviDfML_dcxbWSGXNSxC2k,11516
6
- crawlo/crawler.py,sha256=rixy3qIy7e0vg1Ns4u1NC3S3Lbi-Mbqe_edPkXgV0yc,43600
7
- crawlo/event.py,sha256=ZhoPW5CglCEuZNFEwviSCBIw0pT5O6jT98bqYrDFd3E,324
8
- crawlo/exceptions.py,sha256=YVIDnC1bKSMv3fXH_6tinWMuD9HmKHIaUfO4_fkX5sY,1247
9
- crawlo/mode_manager.py,sha256=h6ZWOK9U9WZXCLk1MXwBkpzMOw6l5royxfrst4vCtJg,6573
10
- crawlo/project.py,sha256=Qw_Z8-ppYdv-aynBlzxH9CqABbrMIyGjtfLsM0werqc,12550
11
- crawlo/stats_collector.py,sha256=v4jC9BAe-23w93hWzbeMCCgQ9VuFPyxw5JV9ItbGH8w,1636
12
- crawlo/subscriber.py,sha256=h8fx69NJZeWem0ZkCmfHAi2kgfDGFObHpwN0aGNUM6Y,5115
13
- crawlo/task_manager.py,sha256=PScfEB03306Txa0l38AeQ_0WVhKzeWOFyT3bnrkbHW0,849
14
- crawlo/commands/__init__.py,sha256=orvY6wLOBwGUEJKeF3h_T1fxj8AaQLjngBDd-3xKOE4,392
15
- crawlo/commands/check.py,sha256=TKDhI_sj7kErgiJpt2vCZ9QL-g6yWjrrPWKbgh8pgEU,23199
16
- crawlo/commands/genspider.py,sha256=7YGZdv12G341SWmkGbyDeMde2RgqGYxYXRExFy7KKNc,5088
17
- crawlo/commands/help.py,sha256=8xPC0iNCg1rRBoK2bb6noAEANc1JwrdM35eF-j6yeZM,5111
18
- crawlo/commands/list.py,sha256=trzcd3kG6DhkOqYZADcl3yR7M8iJBgRw5fE-g9e0gVM,5877
19
- crawlo/commands/run.py,sha256=ybATvNXjXdr6GczW6gujkaTK05dhfk1tUTA0qXCO_rg,12360
20
- crawlo/commands/startproject.py,sha256=-Bo8vvDfIhqzGmWyhxMatBlPLhYpRwJC7l4fpbN8vVk,16506
21
- crawlo/commands/stats.py,sha256=vlGJLyiXZtY0ASdzCK59JNereSsAel4W9JCGaOzCr-8,6201
22
- crawlo/commands/utils.py,sha256=YVNEEzlm_qNY3SVvU8h6o2lQMkVgypvoB4ZFrP4gln0,5578
23
- crawlo/core/__init__.py,sha256=JYSAn15r8yWgRK_Nc69t_8tZCyb70MiPZKssA8wrYz0,43
24
- crawlo/core/engine.py,sha256=d6L4Xwwjc1UQJY9QutqC_Uk88ZzBCyN9T_7z3lMSuIQ,14861
25
- crawlo/core/processor.py,sha256=oHLs-cno0bJGTNc9NGD2S7_2-grI3ruvggO0SY2mf3Q,1180
26
- crawlo/core/scheduler.py,sha256=AaZID01ovAbFzf1Urp55SPyUd7AOtHetX6R_GEYMTfA,12849
27
- crawlo/data/__init__.py,sha256=UPqgioMdu3imSUmpLWzVlpvoBnEfaPSAT-crCcWd7iw,121
28
- crawlo/data/user_agents.py,sha256=zjjFkldQkqtrn45j0WZplaZLannPxZDeAU0JofxQcBc,9891
29
- crawlo/downloader/__init__.py,sha256=VZG5HiSHOmimiH9okQN3MBwgXsCzxr2awflVz5UiboY,8897
30
- crawlo/downloader/aiohttp_downloader.py,sha256=GaUgR5WwG7VvMDQnL9tdwnLgu2bt8btdMuecWKyP2Uk,9195
31
- crawlo/downloader/cffi_downloader.py,sha256=QxoeocCE2DsQCnhZla6-BjhplaTZDWMbEJmNrghWSDA,10488
32
- crawlo/downloader/httpx_downloader.py,sha256=MpgDeIdGqNsiSKLOEDBnr5Z0eUbhHnqVEmAuoIfJmFU,12296
33
- crawlo/downloader/hybrid_downloader.py,sha256=dNnFeegRnyLaOxTWI6XrWKqqVPx80AZBZNgmrcKRVBM,8240
34
- crawlo/downloader/playwright_downloader.py,sha256=L-TVzG7cYfuBlqW0XSZuz5C_r9fpJrmYNcoQ-cDEna4,16663
35
- crawlo/downloader/selenium_downloader.py,sha256=P8GuhEw6OYVeN3oeksuBLpUJCELXiu0mAR23X6IIOAA,21508
36
- crawlo/extension/__init__.py,sha256=jOdyLjtf-JqEKN67x2haIhtMhy_5bGSMbdFIdsERU7o,1633
37
- crawlo/extension/health_check.py,sha256=stDpyP4gOzAdbBlPbSf0rge0QounAhF8CtrGq5fa_7c,5657
38
- crawlo/extension/log_interval.py,sha256=2R3XVdM1grDN8wh9TTHRB_WmQypCr5YSGvESNDnS16s,2474
39
- crawlo/extension/log_stats.py,sha256=6Hoq0ASU8evjT5AsUuc0b018-vkzeeO6CyJrU9ZabWk,2989
40
- crawlo/extension/logging_extension.py,sha256=hAi3hUbrVMRcE7b0tqybSRgnDYrgZYIDTsF-wxmezI0,1940
41
- crawlo/extension/memory_monitor.py,sha256=fClPchpCkVjcIiU0AJHCKDd7HEiz5B4KqNqKTRZ2hcU,4394
42
- crawlo/extension/performance_profiler.py,sha256=BjWD3LOb4VwjQJQvQtWNg7GluEwFquI1CztNfgMzy3c,5032
43
- crawlo/extension/request_recorder.py,sha256=KA_RmcfscDxP5wPdolO76yKfRj-1jmHhG3jkVGO1pbc,4181
44
- crawlo/filters/__init__.py,sha256=lX-QOCDTiTRFoiK1qrZ5HABo7LgZfcxScx_lELYEvJk,4395
45
- crawlo/filters/aioredis_filter.py,sha256=aB1GPCALikvPUWdoACaGsvmnkzseKXxpR7l3gh1glsY,8479
46
- crawlo/filters/memory_filter.py,sha256=ZojFhZ6gE76aQBC-rfImxSkSMwQtiotenx0pIcQOaFg,9561
47
- crawlo/items/__init__.py,sha256=rFpx1qFBo0Ik7bSdnXC8EVTJUOQdoJYGVdhYjaH00nk,409
48
- crawlo/items/base.py,sha256=hwGJEdFWOdaZfalFX8umRkh_HUWLEbCjvq4j70fplMQ,598
49
- crawlo/items/fields.py,sha256=l-DIwK6CCpdzNvf6ELz7Ckc7YCghZD9UCXA8vhNn2UE,1852
50
- crawlo/items/items.py,sha256=OmVEvMmgofMU95GkaiWkfNQ2fjsH2fY9sw3SKcmUhLs,3478
51
- crawlo/middleware/__init__.py,sha256=PSwpRLdBUopaQzBp1S0zK_TZbrRagQ4yzvgyLy4tBk8,570
52
- crawlo/middleware/default_header.py,sha256=Pw-ev8ffi16GeCh84R5L3hAZgp3G1QXS-H5kV3JEp4Q,5164
53
- crawlo/middleware/download_delay.py,sha256=2iWnJFtWDlqDy5MsAob8TPiJQoiz9v21yatkBI0eptg,3542
54
- crawlo/middleware/middleware_manager.py,sha256=69l0QS6HJA2TmhdEHgyXMMhJ1nZlVUjODUFo3xhSth4,6413
55
- crawlo/middleware/offsite.py,sha256=4tUkPqXMMXsi1WwYnJ_e7wMd6sRgK19QHRCYq8-w8jk,4682
56
- crawlo/middleware/proxy.py,sha256=uKk5OSLIs7jv9bBgkZwsi1rIpthooxhMrGBC2BPRDCc,16022
57
- crawlo/middleware/request_ignore.py,sha256=7qdX4zAimjSGwdod_aWUbOTfzLBWZ5KzLVFchGMCxCI,2663
58
- crawlo/middleware/response_code.py,sha256=0_NbiCzLgJmSuSSF2_jqpypWYy0ES4GV-0iWQPLfYLc,5097
59
- crawlo/middleware/response_filter.py,sha256=tVGr06bfJBR3xAHI2G5c3WimFsGHu8qoJtDcsVuCATU,4384
60
- crawlo/middleware/retry.py,sha256=Acfo95B9wF8fQTCQIqluZOS2hHdnknQu_FOHvpGKJp0,4248
61
- crawlo/middleware/simple_proxy.py,sha256=rQ4RkqewGvDRCw021nGrg8ngkBzg3wqrEVqvSmBgQ6M,2256
62
- crawlo/network/__init__.py,sha256=bvEnpEUBZJ79URfNZbsHhsBKna54hM2-x_BV8eotTA4,418
63
- crawlo/network/request.py,sha256=e6-YLgK7SU8D19n21mQwqt_b_aeRVJFOgWPIBPal2ys,14178
64
- crawlo/network/response.py,sha256=QwJhL3xJfPVy_gwtGrg61oAgaqCoCmjyj1Ug7Zju7Pg,13060
65
- crawlo/pipelines/__init__.py,sha256=FDe2Pr5tiHtV8hFlheElRO_O1aVKvSWlkTcAl9BXAKA,637
66
- crawlo/pipelines/bloom_dedup_pipeline.py,sha256=NoqU0pCS8clRvdmR-7EsJEzBGn_RJvI5-Fz-iVpO5mc,5817
67
- crawlo/pipelines/console_pipeline.py,sha256=bwe5hZgaVSWmh3R8XpOaaeAjJme-Ttrpo6G6f1cnLIg,1287
68
- crawlo/pipelines/csv_pipeline.py,sha256=qbXZoqq4FIR9QkUGpC0ryWzmqGJSrM2bxmWLM4I1nXM,12490
69
- crawlo/pipelines/database_dedup_pipeline.py,sha256=L9lc6k62kUzwcDPgUJ0wT3KHhnC_lls_L5XMb08i_H8,8200
70
- crawlo/pipelines/json_pipeline.py,sha256=wrCsh8YInmcPLAkhPrHObMx89VZfhf-c7qRrYsTixPE,8585
71
- crawlo/pipelines/memory_dedup_pipeline.py,sha256=Wf_M7-FFmqXvcr3_Rpz97q0KcKebx8Ii2iRHv2A3orc,3952
72
- crawlo/pipelines/mongo_pipeline.py,sha256=PohTKTGw3QRvuP-T6SrquwW3FAHSno8jQ2D2cH_d75U,5837
73
- crawlo/pipelines/mysql_pipeline.py,sha256=fESKJ6qBcW3NZ9Gz1ACASL-PILvYAW3YagIZMg7H1h0,13818
74
- crawlo/pipelines/pipeline_manager.py,sha256=wr79XeVDH-v7meSwB10W1qU3AZDh9IopxGWg5rWFerw,3154
75
- crawlo/pipelines/redis_dedup_pipeline.py,sha256=kexmobW_JNSkaVRTQ4uhsKW5hGTXeqjGjOFmOP_wflw,6508
76
- crawlo/queue/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
77
- crawlo/queue/pqueue.py,sha256=qTFOuvEXsYEZbm0ULjsOeZo0XtSsZ-SHpx7nFEtmluE,1095
78
- crawlo/queue/queue_manager.py,sha256=xjodzF8Yjb1wJ3ut_Mu3eRFrqeCMo5O0RXW5tdw9o1M,15320
79
- crawlo/queue/redis_priority_queue.py,sha256=J30HcC16E3FjxfUCbL_9HbFoaszBy7prFvG8xRvWd3M,13432
80
- crawlo/settings/__init__.py,sha256=NgYFLfk_Bw7h6KSoepJn_lMBSqVbCHebjKxaE3_eMgw,130
81
- crawlo/settings/default_settings.py,sha256=Wu1iUdhdgsFihCSuJJiXYE8DAHeV0HNuR663Hqsmg0U,9436
82
- crawlo/settings/setting_manager.py,sha256=LTs4NQ_CXvfhnDmmdKGlvosIjHtZk_48v7BEa_O0ghQ,7710
83
- crawlo/spider/__init__.py,sha256=I2_eb6NtgTQ-dckhQXZyDFQORUTx1OHqcn-9yleumkg,21074
84
- crawlo/templates/crawlo.cfg.tmpl,sha256=lwiUVe5sFixJgHFEjn1OtbAeyWsECOrz37uheuVtulk,240
85
- crawlo/templates/run.py.tmpl,sha256=vYCRPWpG2LxK3UvTxIyCDJh7qy43eoaU1CrJgBF-I6Y,1071
86
- crawlo/templates/project/__init__.py.tmpl,sha256=aQnHaOjMSkTviOC8COUX0fKymuyf8lx2tGduxkMkXEE,61
87
- crawlo/templates/project/items.py.tmpl,sha256=8_3DBA8HrS2XbfHzsMZNJiZbFY6fDJUUMFoFti_obJk,314
88
- crawlo/templates/project/middlewares.py.tmpl,sha256=fxHqi-Sjec5GiHJciprOU-6SAUTzM728NlZckIqf9hM,4278
89
- crawlo/templates/project/pipelines.py.tmpl,sha256=j9oqEhCezmmHlBhMWgYtlgup4jhWnMlv6AEiAOHODkg,2704
90
- crawlo/templates/project/settings.py.tmpl,sha256=5J__P6nTNr-3-ziSPdbBe-aklP4G6OFglI7UgEEF1zE,10562
91
- crawlo/templates/project/settings_distributed.py.tmpl,sha256=ci_vMOTwVwUeoqKGTa5tA6ygeruuKGt677liMYcp4Tw,7181
92
- crawlo/templates/project/settings_gentle.py.tmpl,sha256=CJMn4gvTg8xNUJLgObh8OmBCyGGyeCcLrfEdBAw8I4I,2770
93
- crawlo/templates/project/settings_high_performance.py.tmpl,sha256=rqPqIeChbfjEvQmMjAbuRml7pMhxTv2WBIkn21CZ6ew,5134
94
- crawlo/templates/project/settings_minimal.py.tmpl,sha256=TIKOnEbE2A6P52oe_aVtVYpR6zN-4-ECwl-zjNSFGV4,1241
95
- crawlo/templates/project/settings_simple.py.tmpl,sha256=8a0c1KTtnA4JzszhifIzG8wf9P7yEMjMhCMjoO0Qz2s,4500
96
- crawlo/templates/project/spiders/__init__.py.tmpl,sha256=zMbePipgLsctQUEnda4WkHz8rDLUX--rc8ruI6zkpWc,111
97
- crawlo/templates/spider/spider.py.tmpl,sha256=KvU-9YpN6MifDE7XzejjyyQS7RUjLDLZ8zqJcLwSsu0,5198
98
- crawlo/tools/__init__.py,sha256=tOYfYPvZlrO8cmvnMWBjTma6UTLTFZN3qdC8pJwHrzI,4142
99
- crawlo/tools/anti_crawler.py,sha256=LwLC6BkxDSkxc5H1hQ6kY9j7O0PZGAMPZECr7gbqw2M,9431
100
- crawlo/tools/authenticated_proxy.py,sha256=ULCK0Cc9F2rGhRqu6kzKBdxzK9v2n1CsatSQ_PMxpAg,7272
101
- crawlo/tools/data_formatter.py,sha256=iBDHpZBZvn9O7pLkTQilE1TzYJQEc3z3f6HXoVus0f0,7808
102
- crawlo/tools/data_validator.py,sha256=bLWnkpFdclJuqjtSAgMI5nznN4vAuPwE1YaiFWKWenM,5490
103
- crawlo/tools/date_tools.py,sha256=QOT3W5MqcEQhVM3cTZYxu1MRfgX-TI4aF1RI9s0QbdE,9195
104
- crawlo/tools/distributed_coordinator.py,sha256=0Ej8hv5GA0UmUI7EXNpCNdgh-D-DC7Eapm_3O2POV0U,12711
105
- crawlo/tools/encoding_converter.py,sha256=CqHAsR2rwxuzsyR-TeQNb79HX5mH4KAUixEY-sX7204,4170
106
- crawlo/tools/request_tools.py,sha256=oXrk4yWMACVa65fDQCQgzsg6a94FH4_lS7qNR53FHYU,2420
107
- crawlo/tools/retry_mechanism.py,sha256=4AQ_HLuYt4hYMI9XHoKFk2GQKEiDJB5pAnsMCfjc6Bk,7777
108
- crawlo/tools/scenario_adapter.py,sha256=pzysL1B2uQ1ZSEncVHd9Hv2viHNgaxP44YAUcDcppfw,9660
109
- crawlo/tools/text_cleaner.py,sha256=UrMGcgRnJaufjmDKIDsRYKMA8znCAArHDgouttWPygk,6690
110
- crawlo/utils/__init__.py,sha256=8kMbOZf9bzOUjtvh2QvqXZmiZh3pYzxXH9YQhYcwcoY,597
111
- crawlo/utils/batch_processor.py,sha256=8LNy-K2SrQVUxmGEWxQyYw_j9M-erN4Ie7O4d3zpBvM,9142
112
- crawlo/utils/controlled_spider_mixin.py,sha256=8CuM3Cr2wQLHbaO_ohbCsPcImJnyfZHpERbSeMgQ-AQ,16936
113
- crawlo/utils/db_helper.py,sha256=ZqOt1d3mErVv4TOvoWlov0niUxORB9aHByTmMoNFIDw,10902
114
- crawlo/utils/enhanced_error_handler.py,sha256=fJC__rnYNKTNUHNbgjZtT846HoE31qyGbPft9bwyYLU,14214
115
- crawlo/utils/env_config.py,sha256=W-VD_WF63DHxsyJysvp1eJwRh3L_pBRl_PitQAY3nQY,4079
116
- crawlo/utils/error_handler.py,sha256=N6suB8Utcn7tp6WRJ8gKECr0RIAG86dcOXdwOr998OE,4367
117
- crawlo/utils/func_tools.py,sha256=y-TYP9H3X67MS_foWy9Z2LIS6GP7Y4Cy3T168ulq3Jc,2451
118
- crawlo/utils/large_scale_config.py,sha256=lsraHTAQx3sMPjTnCBY_SVIpkuIBUb3zD9eFvmccOOM,8440
119
- crawlo/utils/large_scale_helper.py,sha256=4ORkZcIrwJ0SlKOUh7l7WIuERORuRhNBgHCM71Rz0n0,12452
120
- crawlo/utils/log.py,sha256=05-OrQW-qNsAJHoVytICss6oHVSIHVCMnfivg2qI6tQ,7829
121
- crawlo/utils/performance_monitor.py,sha256=32KspSo7RWvCX_fl0ZFn4ScWWOqbVVwEhPRd921Ez6I,9832
122
- crawlo/utils/queue_helper.py,sha256=gFmkh1jKlIcN1rmo2Jl6vYcLP5ByUWlfHO9eNlZPBLs,4918
123
- crawlo/utils/redis_connection_pool.py,sha256=Czm0RoYmgJ5E5xIuVbBnm0IgSWH2AfeaFTGnYVWneYk,11401
124
- crawlo/utils/redis_key_validator.py,sha256=-UTTx0Ul184pzwSply8hVdH0lp-gkXXOc_gEHR_7VlU,5809
125
- crawlo/utils/request.py,sha256=ejdKpTwc-HE04HQybafhOVywzz57IV3pY0YMkSLyGUo,9065
126
- crawlo/utils/request_serializer.py,sha256=KIQBbQWCb5Ne3jFPMtqD96TNs5dTD85Ex3xr16vBrUM,8739
127
- crawlo/utils/spider_loader.py,sha256=xNzQb7qhQ7TqZsfFtCLpuVcsGi-USriZosU0YSBr9II,2233
128
- crawlo/utils/system.py,sha256=24zGmtHNhDFMGVo7ftMV-Pqg6_5d63zsyNey9udvJJk,248
129
- crawlo/utils/tools.py,sha256=uy7qw5Z1BIhyEgiHENvtM7WoGCJxlS8EX3PmOA7ouCo,275
130
- crawlo/utils/url.py,sha256=RKe_iqdjafsNcp-P2GVLYpsL1qbxiuZLiFc-SqOQkcs,1521
131
- examples/__init__.py,sha256=NkRbV8_S1tb8S2AW6BE2U6P2-eGOPwMR1k0YQAwQpSE,130
132
- tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md,sha256=4W6HlT9Uc3cyu77T9pfbkrMxpAZ-xq_L9MU-GbukLV0,3427
133
- tests/__init__.py,sha256=409aRX8hsPffiZCVjOogtxwhACzBp8G2UTJyUQSxhK0,136
134
- tests/advanced_tools_example.py,sha256=1_iitECKCuWUYMNNGo61l3lmwMRrWdA8F_Xw56UaGZY,9340
135
- tests/authenticated_proxy_example.py,sha256=GhmbISta3lDnNRl6Cr1P2ZEy2jXasz942cOeXiSbVXk,2971
136
- tests/cleaners_example.py,sha256=blVqSJ7SeWUNd17JjHZJgVTzWH65XKevLyaMB_Wg8qA,5324
137
- tests/config_validation_demo.py,sha256=jbZ7h-HGsJmuqBb1euB_AhmKjllkvPmItRF1K0MQrVM,4171
138
- tests/controlled_spider_example.py,sha256=2SAQKoREGHe-OzVaSkGpopCcrou6QXmeW7rLdmsyopw,7981
139
- tests/date_tools_example.py,sha256=XI3iFEzeo7Nb5YepK8WHytIaBegtxWVSISpqQgpV6M8,5042
140
- tests/debug_pipelines.py,sha256=FMb36bH9lQxBLb-nM579hBRK1S16Vxu1t_BC3Dj8O2w,2164
141
- tests/dynamic_loading_example.py,sha256=7LdeQZFevrb-U1_dgr4oX3aYo2Da4HvE_0KIf1fw4Ew,18786
142
- tests/dynamic_loading_test.py,sha256=dzDW7b66HeDsIYsYgvNRihE3V6b6gEbUGQpp-eJbcIM,3413
143
- tests/env_config_example.py,sha256=_ZRDh_LR23ZKpy9E--y_KM0QIOiZF5vRT98QTn52TY8,4951
144
- tests/error_handling_example.py,sha256=grTeo1X17rFz4lhgASb0g5yu4NWbmNz5neyuonnNR40,5294
145
- tests/redis_key_validation_demo.py,sha256=WD2jvuBwHhLYIb3lVFtvYSSnmXWn1EW4EPCEwFhfi6M,4467
146
- tests/request_params_example.py,sha256=J50NdsnK1sDrqG-5m3oA-mu1_wHwVwHIfsWxGeQpz7o,4250
147
- tests/response_improvements_example.py,sha256=t1cbG3nesp82bqog4_ku1GvQzNbhRyWa5EaKTmOPrSk,5402
148
- tests/test_advanced_tools.py,sha256=HT_TcwfFzli-CavIJSqQqnCxnBn5FDMX09zL7AJ5tNY,5398
149
- tests/test_all_redis_key_configs.py,sha256=dWc4Dsr07_vuSpb4hwkMpyy6XO8SI7vglVjGuGvXoa4,5710
150
- tests/test_authenticated_proxy.py,sha256=lnvmQwuf0zaZP_E05EzcNFR2VJbwTkLjOmZGNoJKaC4,4339
151
- tests/test_cleaners.py,sha256=HDK8_YU7GUj_3hGU415cxEeUR74mnDSk0yroLlgDI0I,1816
152
- tests/test_comprehensive.py,sha256=dvRJeeVYc1cgXK9Y171hH9Y847zZpWSAFFH-EI3UepQ,5182
153
- tests/test_config_consistency.py,sha256=RgSxyaypMpysltsGSh1vFMeOShiZZG0rmUKzEhNLpYw,2001
154
- tests/test_config_merge.py,sha256=ts1j-TIKkFS0EO5q1I4O7f4YUKR5MLTmRSqOpOlv094,5606
155
- tests/test_config_validator.py,sha256=Z4gBHkI0_fEx-xgiiG4T33F4BAuePuF81obpNTXfseY,6202
156
- tests/test_crawlo_proxy_integration.py,sha256=miag_gufreZodFBwYGtyZNN-wfGyxUztg4w5-HEiBCY,2728
157
- tests/test_date_tools.py,sha256=pcLDyhLrZ_jh-PhPm4CvLZEgNeH9kLMPKN5zacHwuWM,4053
158
- tests/test_default_header_middleware.py,sha256=v-ei_1EY7cvFSsySrQPXF5-DmyGsq2yzjYhhrwFMOXs,6003
159
- tests/test_distributed.py,sha256=78Pn4HPLIaO8t1IiaSkckBmuEVTcnC8IDw7znf9_Zcw,1790
160
- tests/test_double_crawlo_fix.py,sha256=uT-PJLxGS4psOvVkJhurffV19hxhhlX5zHMPEyi59og,7977
161
- tests/test_double_crawlo_fix_simple.py,sha256=NDmCEeyvpf_D1tGQMA66iLPPKlAnSZcEg71e7GHYcjg,4768
162
- tests/test_download_delay_middleware.py,sha256=Idc6KzhL3hY3aDKgn1j_v5-mLIHz7dTnV5c4tJVZh5Q,9107
163
- tests/test_downloader_proxy_compatibility.py,sha256=0hgIzWXIqd92YXEB5sNneyp4Sk7PaG76up2cd6N9QQY,8903
164
- tests/test_dynamic_downloaders_proxy.py,sha256=t_aWpxOHi4h3_fg2ImtIq7IIJ0r3PTHtnXiopPe2ZlM,4450
165
- tests/test_dynamic_proxy.py,sha256=zi7Ocbhc9GL1zCs0XhmG2NvBBeIZ2d2hPJVh18lH4Y0,3172
166
- tests/test_dynamic_proxy_config.py,sha256=C_9CEjCJtrr0SxIXCyLDhSIi88ujF7UAT1F-FAphd0w,5853
167
- tests/test_dynamic_proxy_real.py,sha256=krWnbFIH26mWNPhOfPMmx3ZxJfOreZxMZFGwVb_8-K8,3511
168
- tests/test_edge_cases.py,sha256=1RnFaCebYTDNNz_LK8M0MepiSwPvJUk_FBK4nQTCUbg,10729
169
- tests/test_enhanced_error_handler.py,sha256=Ku_86jv7iDe25v8ZxalcXxJJjIiIvQXWH8ZldbwdVm8,8581
170
- tests/test_env_config.py,sha256=Qu1sDeADs69dSr1x0QmEe8nJrMHneE_4JClt-N901e8,4867
171
- tests/test_error_handler_compatibility.py,sha256=xJ43cmCwfBGh-qBwCGiMDPPlfNDLw4ZrmlrHN9IojkU,4241
172
- tests/test_final_validation.py,sha256=OuZI01O0E68Pao--bD-BFDTRZFPc_Mt4W-OXUzlt6ZA,4966
173
- tests/test_framework_env_usage.py,sha256=bFb_ptdLeX2obdJWEqEHPWweiWR-wR2BpvEaJMQK7h4,4201
174
- tests/test_integration.py,sha256=lVEzKNAjFzFZHRNZAyJmXxa_5Ogf_qqL4APqs620o58,4839
175
- tests/test_item_dedup_redis_key.py,sha256=dp_H59exJLaZHh5oMtmMEOWh-DNZwbnwIFYDjOpHgd0,3842
176
- tests/test_mode_consistency.py,sha256=hS9JwawnBvNwSu1l3DfArlGQGWPyVYXGuXCQtMuDHKs,1226
177
- tests/test_offsite_middleware.py,sha256=1DYktO_D-hiLEB6dBnc0iOvnWimqOdE6kimnS8aof_s,7764
178
- tests/test_parsel.py,sha256=wuZqRFIm9xx1tt6o3Xi_OjvwhT_MPmHiUEj2ax06zlo,701
179
- tests/test_performance.py,sha256=Lqs2iu3dmWipZkBPARcwIjDLXsqe42ntz1M4RzqqXKo,11457
180
- tests/test_proxy_api.py,sha256=XnmklS-xU4ke_560gV6AIlBsRmG8YLQTGFAZrTUZuhc,11013
181
- tests/test_proxy_health_check.py,sha256=_tDlxa_6TdL3M5RLkHF82roXJ8WIuG5hELBp2GADyKQ,1123
182
- tests/test_proxy_middleware.py,sha256=EdQAfwwAJIBxw9JmUFTDEu_pdxapaTlcJr7KcrY6-AY,4021
183
- tests/test_proxy_middleware_enhanced.py,sha256=QR-p26F63N7MxNjZ2QJUeerh_xdnCDejkrGPIh7Fh4U,7035
184
- tests/test_proxy_middleware_integration.py,sha256=mTPK_XvbmLCV_QoVZzA3ybWOOX61493Ew78WfTp-bYQ,4441
185
- tests/test_proxy_middleware_refactored.py,sha256=VbkTWkmmomcyswobA_gf3p_bERl_eexY2e6ohJQS_A8,6960
186
- tests/test_proxy_providers.py,sha256=u_R2fhab90vqvQEaOAztpAOe9tJXvUMIdoDxmStmXJ4,1749
187
- tests/test_proxy_stats.py,sha256=ES00CEoDITYPFBGPk8pecFzD3ItYIv6NSpcqNd8-kvo,526
188
- tests/test_proxy_strategies.py,sha256=9Z1pXmTNyw-eIhGXlf2abZbJx6igLohYq-_3hldQ5uE,1868
189
- tests/test_queue_manager_double_crawlo.py,sha256=MijZ3JuyHMuqGbRC-8kclFr-4O7m_T8CqezP4qiWk-E,6957
190
- tests/test_queue_manager_redis_key.py,sha256=B6JqScm_8FcriPb0UtBDLoEmLCGCI2z3NuqC0UMJsWo,6407
191
- tests/test_random_user_agent.py,sha256=6HjU4iUcMk-J6bR2N5FhIkWDfnaFKAPNVyRzxmQQ14k,2302
192
- tests/test_real_scenario_proxy.py,sha256=clmLvBfap5OpsaCE08MAWap-78jhVrxYfVfDNyoa4Hg,8454
193
- tests/test_redis_config.py,sha256=51_Fy1PqIhS0MMO2nR4q6oQjBFxfqcUPK_4NNf5s83g,903
194
- tests/test_redis_connection_pool.py,sha256=pKfXdE3Cm_L_fNqI9zqFmqiidCwR0t7hiM_Fu_V1cNI,9328
195
- tests/test_redis_key_naming.py,sha256=MTFk656JhiGVTsMctBDhBNOMFcBDZrsQA3UfPZ-Dgj4,6911
196
- tests/test_redis_key_validator.py,sha256=GszSzGADgk3uN6Bye1d8pS-AtMVgB8jwqW-22gPNM6M,4418
197
- tests/test_redis_queue.py,sha256=WQV3MtGg8rJzHgC2kRfXM6lSMXpwXJVQZfqn2dVrhg0,6758
198
- tests/test_request_ignore_middleware.py,sha256=QN81wgG_W_XfXCF9LvJNxCNwbOH6_tZnLIwLDTK2K5Q,6229
199
- tests/test_request_params.py,sha256=l2etiDebqylPBym1e9DSDn4wxwTHv8DQHKq9AzlzlG0,4287
200
- tests/test_request_serialization.py,sha256=Ikgec8tt_sPCK6jcZyK8vRw84zRNE6nxQy9rba1WKmE,2332
201
- tests/test_response_code_middleware.py,sha256=wSe525bm-bk_iWMjPDzUu1LfOQrwJY8_MLKAspq2dzk,12193
202
- tests/test_response_filter_middleware.py,sha256=YWrGzJ7wmftTjJXcNTtJl3b3EdJsO4oR22ZLWwgErhg,16327
203
- tests/test_response_improvements.py,sha256=vNqHKyoEoYeEGAUiRzdsff2V6yvJ9QnDwGg7gmN38Ow,6028
204
- tests/test_retry_middleware.py,sha256=RmSYSf0GagGPGAVi5TXJWc0bZlmAI_hwFr2FYhvuKrk,8097
205
- tests/test_scheduler.py,sha256=1fCu35QgK5gzgrhD0aUZj5lxL0QbokzPav-yEJxz9Ig,8182
206
- tests/test_scheduler_config_update.py,sha256=LuxjEbt20QrPyVkjSFxvTnFtUxwMaHB6TcqjFyo8bow,4261
207
- tests/test_simple_response.py,sha256=_ui2PuVZvJcAuLY7HZ8xcsy_tDBimgBqX0ukj3kE5J0,1549
208
- tests/test_telecom_spider_redis_key.py,sha256=c-gfixPul2VlYMQJGf0H5ZgYJ461fQgSKbCPrbAU45M,7625
209
- tests/test_template_content.py,sha256=2RgCdOA3pMUSOqC_JbTGeW7KonbTqJ0ySYJNWegU-v0,2903
210
- tests/test_template_redis_key.py,sha256=99-s0_-8MFJbIvGG_X__sH0qkXWTtJv8fdTdlftsq4I,4876
211
- tests/test_tools.py,sha256=z50Bvq_q8FwpyxNkmh00_A3sXkSv2l1Q_EbK02FDYgk,5504
212
- tests/test_user_agents.py,sha256=e4haX-o8Janl-PawGJ9MemZyMqTX33_tBF_WnYSVoUw,3327
213
- tests/tools_example.py,sha256=Rxu5vVKnj3CZ3mCx-EEotBWPtZs2S7ktyqq-SYeclxU,7999
214
- tests/verify_distributed.py,sha256=0IolM4ymuPOz_uTfHSWFO3Vxzp7Lo6i0zhSbzJhHFtI,4045
215
- crawlo-1.3.3.dist-info/METADATA,sha256=I5cs60iVGIq6oms6duJsGrlzs-WGdlVtUYbMkzYBXvE,27833
216
- crawlo-1.3.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
217
- crawlo-1.3.3.dist-info/entry_points.txt,sha256=5HoVoTSPxI8SCa5B7pQYxLSrkOdiunyO9tqNsLMv52g,43
218
- crawlo-1.3.3.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
219
- crawlo-1.3.3.dist-info/RECORD,,