crawlo 1.3.3__py3-none-any.whl → 1.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic.

Files changed (279)
  1. crawlo/__init__.py +87 -63
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +75 -75
  4. crawlo/commands/__init__.py +14 -14
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +151 -151
  7. crawlo/commands/help.py +138 -138
  8. crawlo/commands/list.py +155 -155
  9. crawlo/commands/run.py +341 -323
  10. crawlo/commands/startproject.py +436 -436
  11. crawlo/commands/stats.py +187 -187
  12. crawlo/commands/utils.py +196 -196
  13. crawlo/config.py +312 -312
  14. crawlo/config_validator.py +277 -277
  15. crawlo/core/__init__.py +46 -2
  16. crawlo/core/engine.py +439 -365
  17. crawlo/core/processor.py +40 -40
  18. crawlo/core/scheduler.py +257 -256
  19. crawlo/crawler.py +639 -1167
  20. crawlo/data/__init__.py +5 -5
  21. crawlo/data/user_agents.py +194 -194
  22. crawlo/downloader/__init__.py +273 -273
  23. crawlo/downloader/aiohttp_downloader.py +228 -226
  24. crawlo/downloader/cffi_downloader.py +245 -245
  25. crawlo/downloader/httpx_downloader.py +259 -259
  26. crawlo/downloader/hybrid_downloader.py +212 -212
  27. crawlo/downloader/playwright_downloader.py +402 -402
  28. crawlo/downloader/selenium_downloader.py +472 -472
  29. crawlo/event.py +11 -11
  30. crawlo/exceptions.py +81 -81
  31. crawlo/extension/__init__.py +39 -39
  32. crawlo/extension/health_check.py +141 -141
  33. crawlo/extension/log_interval.py +57 -57
  34. crawlo/extension/log_stats.py +81 -81
  35. crawlo/extension/logging_extension.py +61 -52
  36. crawlo/extension/memory_monitor.py +104 -104
  37. crawlo/extension/performance_profiler.py +133 -133
  38. crawlo/extension/request_recorder.py +107 -107
  39. crawlo/factories/__init__.py +28 -0
  40. crawlo/factories/base.py +69 -0
  41. crawlo/factories/crawler.py +104 -0
  42. crawlo/factories/registry.py +85 -0
  43. crawlo/filters/__init__.py +154 -154
  44. crawlo/filters/aioredis_filter.py +257 -234
  45. crawlo/filters/memory_filter.py +269 -269
  46. crawlo/framework.py +292 -0
  47. crawlo/initialization/__init__.py +40 -0
  48. crawlo/initialization/built_in.py +426 -0
  49. crawlo/initialization/context.py +142 -0
  50. crawlo/initialization/core.py +194 -0
  51. crawlo/initialization/phases.py +149 -0
  52. crawlo/initialization/registry.py +146 -0
  53. crawlo/items/__init__.py +23 -23
  54. crawlo/items/base.py +23 -22
  55. crawlo/items/fields.py +52 -52
  56. crawlo/items/items.py +104 -104
  57. crawlo/logging/__init__.py +38 -0
  58. crawlo/logging/config.py +97 -0
  59. crawlo/logging/factory.py +129 -0
  60. crawlo/logging/manager.py +112 -0
  61. crawlo/middleware/__init__.py +21 -21
  62. crawlo/middleware/default_header.py +132 -132
  63. crawlo/middleware/download_delay.py +104 -104
  64. crawlo/middleware/middleware_manager.py +135 -135
  65. crawlo/middleware/offsite.py +123 -123
  66. crawlo/middleware/proxy.py +386 -386
  67. crawlo/middleware/request_ignore.py +86 -86
  68. crawlo/middleware/response_code.py +163 -163
  69. crawlo/middleware/response_filter.py +136 -136
  70. crawlo/middleware/retry.py +124 -124
  71. crawlo/middleware/simple_proxy.py +65 -65
  72. crawlo/mode_manager.py +212 -187
  73. crawlo/network/__init__.py +21 -21
  74. crawlo/network/request.py +379 -379
  75. crawlo/network/response.py +359 -359
  76. crawlo/pipelines/__init__.py +21 -21
  77. crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
  78. crawlo/pipelines/console_pipeline.py +39 -39
  79. crawlo/pipelines/csv_pipeline.py +316 -316
  80. crawlo/pipelines/database_dedup_pipeline.py +222 -222
  81. crawlo/pipelines/json_pipeline.py +218 -218
  82. crawlo/pipelines/memory_dedup_pipeline.py +115 -115
  83. crawlo/pipelines/mongo_pipeline.py +131 -131
  84. crawlo/pipelines/mysql_pipeline.py +318 -318
  85. crawlo/pipelines/pipeline_manager.py +76 -75
  86. crawlo/pipelines/redis_dedup_pipeline.py +166 -166
  87. crawlo/project.py +327 -325
  88. crawlo/queue/pqueue.py +43 -37
  89. crawlo/queue/queue_manager.py +503 -379
  90. crawlo/queue/redis_priority_queue.py +326 -306
  91. crawlo/settings/__init__.py +7 -7
  92. crawlo/settings/default_settings.py +321 -225
  93. crawlo/settings/setting_manager.py +214 -198
  94. crawlo/spider/__init__.py +657 -639
  95. crawlo/stats_collector.py +73 -59
  96. crawlo/subscriber.py +129 -129
  97. crawlo/task_manager.py +139 -30
  98. crawlo/templates/crawlo.cfg.tmpl +10 -10
  99. crawlo/templates/project/__init__.py.tmpl +3 -3
  100. crawlo/templates/project/items.py.tmpl +17 -17
  101. crawlo/templates/project/middlewares.py.tmpl +118 -118
  102. crawlo/templates/project/pipelines.py.tmpl +96 -96
  103. crawlo/templates/project/settings.py.tmpl +168 -267
  104. crawlo/templates/project/settings_distributed.py.tmpl +167 -180
  105. crawlo/templates/project/settings_gentle.py.tmpl +167 -61
  106. crawlo/templates/project/settings_high_performance.py.tmpl +168 -131
  107. crawlo/templates/project/settings_minimal.py.tmpl +66 -35
  108. crawlo/templates/project/settings_simple.py.tmpl +165 -102
  109. crawlo/templates/project/spiders/__init__.py.tmpl +10 -6
  110. crawlo/templates/run.py.tmpl +34 -38
  111. crawlo/templates/spider/spider.py.tmpl +143 -143
  112. crawlo/templates/spiders_init.py.tmpl +10 -0
  113. crawlo/tools/__init__.py +200 -200
  114. crawlo/tools/anti_crawler.py +268 -268
  115. crawlo/tools/authenticated_proxy.py +240 -240
  116. crawlo/tools/data_formatter.py +225 -225
  117. crawlo/tools/data_validator.py +180 -180
  118. crawlo/tools/date_tools.py +289 -289
  119. crawlo/tools/distributed_coordinator.py +388 -388
  120. crawlo/tools/encoding_converter.py +127 -127
  121. crawlo/tools/network_diagnostic.py +365 -0
  122. crawlo/tools/request_tools.py +82 -82
  123. crawlo/tools/retry_mechanism.py +224 -224
  124. crawlo/tools/scenario_adapter.py +262 -262
  125. crawlo/tools/text_cleaner.py +232 -232
  126. crawlo/utils/__init__.py +34 -34
  127. crawlo/utils/batch_processor.py +259 -259
  128. crawlo/utils/class_loader.py +26 -0
  129. crawlo/utils/controlled_spider_mixin.py +439 -439
  130. crawlo/utils/db_helper.py +343 -343
  131. crawlo/utils/enhanced_error_handler.py +356 -356
  132. crawlo/utils/env_config.py +142 -142
  133. crawlo/utils/error_handler.py +165 -124
  134. crawlo/utils/func_tools.py +82 -82
  135. crawlo/utils/large_scale_config.py +286 -286
  136. crawlo/utils/large_scale_helper.py +344 -344
  137. crawlo/utils/log.py +44 -200
  138. crawlo/utils/performance_monitor.py +285 -285
  139. crawlo/utils/queue_helper.py +175 -175
  140. crawlo/utils/redis_connection_pool.py +388 -351
  141. crawlo/utils/redis_key_validator.py +198 -198
  142. crawlo/utils/request.py +267 -267
  143. crawlo/utils/request_serializer.py +225 -218
  144. crawlo/utils/spider_loader.py +61 -61
  145. crawlo/utils/system.py +11 -11
  146. crawlo/utils/tools.py +4 -4
  147. crawlo/utils/url.py +39 -39
  148. {crawlo-1.3.3.dist-info → crawlo-1.3.4.dist-info}/METADATA +1126 -1020
  149. crawlo-1.3.4.dist-info/RECORD +278 -0
  150. examples/__init__.py +7 -7
  151. tests/__init__.py +7 -7
  152. tests/advanced_tools_example.py +275 -275
  153. tests/authenticated_proxy_example.py +107 -107
  154. tests/baidu_performance_test.py +109 -0
  155. tests/baidu_test.py +60 -0
  156. tests/cleaners_example.py +160 -160
  157. tests/comprehensive_framework_test.py +213 -0
  158. tests/comprehensive_test.py +82 -0
  159. tests/comprehensive_testing_summary.md +187 -0
  160. tests/config_validation_demo.py +142 -142
  161. tests/controlled_spider_example.py +205 -205
  162. tests/date_tools_example.py +180 -180
  163. tests/debug_configure.py +70 -0
  164. tests/debug_framework_logger.py +85 -0
  165. tests/debug_log_levels.py +64 -0
  166. tests/debug_pipelines.py +66 -66
  167. tests/distributed_test.py +67 -0
  168. tests/distributed_test_debug.py +77 -0
  169. tests/dynamic_loading_example.py +523 -523
  170. tests/dynamic_loading_test.py +104 -104
  171. tests/env_config_example.py +133 -133
  172. tests/error_handling_example.py +171 -171
  173. tests/final_command_test_report.md +0 -0
  174. tests/final_comprehensive_test.py +152 -0
  175. tests/final_validation_test.py +183 -0
  176. tests/framework_performance_test.py +203 -0
  177. tests/optimized_performance_test.py +212 -0
  178. tests/performance_comparison.py +246 -0
  179. tests/queue_blocking_test.py +114 -0
  180. tests/queue_test.py +90 -0
  181. tests/redis_key_validation_demo.py +130 -130
  182. tests/request_params_example.py +150 -150
  183. tests/response_improvements_example.py +144 -144
  184. tests/scrapy_comparison/ofweek_scrapy.py +139 -0
  185. tests/scrapy_comparison/scrapy_test.py +134 -0
  186. tests/simple_command_test.py +120 -0
  187. tests/simple_crawlo_test.py +128 -0
  188. tests/simple_log_test.py +58 -0
  189. tests/simple_optimization_test.py +129 -0
  190. tests/simple_spider_test.py +50 -0
  191. tests/simple_test.py +48 -0
  192. tests/test_advanced_tools.py +148 -148
  193. tests/test_all_commands.py +231 -0
  194. tests/test_all_redis_key_configs.py +145 -145
  195. tests/test_authenticated_proxy.py +141 -141
  196. tests/test_batch_processor.py +179 -0
  197. tests/test_cleaners.py +54 -54
  198. tests/test_component_factory.py +175 -0
  199. tests/test_comprehensive.py +146 -146
  200. tests/test_config_consistency.py +80 -80
  201. tests/test_config_merge.py +152 -152
  202. tests/test_config_validator.py +182 -182
  203. tests/test_controlled_spider_mixin.py +80 -0
  204. tests/test_crawlo_proxy_integration.py +108 -108
  205. tests/test_date_tools.py +123 -123
  206. tests/test_default_header_middleware.py +158 -158
  207. tests/test_distributed.py +65 -65
  208. tests/test_double_crawlo_fix.py +207 -207
  209. tests/test_double_crawlo_fix_simple.py +124 -124
  210. tests/test_download_delay_middleware.py +221 -221
  211. tests/test_downloader_proxy_compatibility.py +268 -268
  212. tests/test_dynamic_downloaders_proxy.py +124 -124
  213. tests/test_dynamic_proxy.py +92 -92
  214. tests/test_dynamic_proxy_config.py +146 -146
  215. tests/test_dynamic_proxy_real.py +109 -109
  216. tests/test_edge_cases.py +303 -303
  217. tests/test_enhanced_error_handler.py +270 -270
  218. tests/test_enhanced_error_handler_comprehensive.py +246 -0
  219. tests/test_env_config.py +121 -121
  220. tests/test_error_handler_compatibility.py +112 -112
  221. tests/test_factories.py +253 -0
  222. tests/test_final_validation.py +153 -153
  223. tests/test_framework_env_usage.py +103 -103
  224. tests/test_framework_logger.py +67 -0
  225. tests/test_framework_startup.py +65 -0
  226. tests/test_integration.py +169 -169
  227. tests/test_item_dedup_redis_key.py +122 -122
  228. tests/test_large_scale_config.py +113 -0
  229. tests/test_large_scale_helper.py +236 -0
  230. tests/test_mode_change.py +73 -0
  231. tests/test_mode_consistency.py +51 -51
  232. tests/test_offsite_middleware.py +221 -221
  233. tests/test_parsel.py +29 -29
  234. tests/test_performance.py +327 -327
  235. tests/test_performance_monitor.py +116 -0
  236. tests/test_proxy_api.py +264 -264
  237. tests/test_proxy_health_check.py +32 -32
  238. tests/test_proxy_middleware.py +121 -121
  239. tests/test_proxy_middleware_enhanced.py +216 -216
  240. tests/test_proxy_middleware_integration.py +136 -136
  241. tests/test_proxy_middleware_refactored.py +184 -184
  242. tests/test_proxy_providers.py +56 -56
  243. tests/test_proxy_stats.py +19 -19
  244. tests/test_proxy_strategies.py +59 -59
  245. tests/test_queue_empty_check.py +42 -0
  246. tests/test_queue_manager_double_crawlo.py +173 -173
  247. tests/test_queue_manager_redis_key.py +176 -176
  248. tests/test_random_user_agent.py +72 -72
  249. tests/test_real_scenario_proxy.py +195 -195
  250. tests/test_redis_config.py +28 -28
  251. tests/test_redis_connection_pool.py +294 -294
  252. tests/test_redis_key_naming.py +181 -181
  253. tests/test_redis_key_validator.py +123 -123
  254. tests/test_redis_queue.py +224 -224
  255. tests/test_request_ignore_middleware.py +182 -182
  256. tests/test_request_params.py +111 -111
  257. tests/test_request_serialization.py +70 -70
  258. tests/test_response_code_middleware.py +349 -349
  259. tests/test_response_filter_middleware.py +427 -427
  260. tests/test_response_improvements.py +152 -152
  261. tests/test_retry_middleware.py +241 -241
  262. tests/test_scheduler.py +252 -252
  263. tests/test_scheduler_config_update.py +133 -133
  264. tests/test_simple_response.py +61 -61
  265. tests/test_telecom_spider_redis_key.py +205 -205
  266. tests/test_template_content.py +87 -87
  267. tests/test_template_redis_key.py +134 -134
  268. tests/test_tools.py +159 -159
  269. tests/test_user_agents.py +96 -96
  270. tests/tools_example.py +260 -260
  271. tests/untested_features_report.md +139 -0
  272. tests/verify_debug.py +52 -0
  273. tests/verify_distributed.py +117 -117
  274. tests/verify_log_fix.py +112 -0
  275. crawlo-1.3.3.dist-info/RECORD +0 -219
  276. tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +0 -82
  277. {crawlo-1.3.3.dist-info → crawlo-1.3.4.dist-info}/WHEEL +0 -0
  278. {crawlo-1.3.3.dist-info → crawlo-1.3.4.dist-info}/entry_points.txt +0 -0
  279. {crawlo-1.3.3.dist-info → crawlo-1.3.4.dist-info}/top_level.txt +0 -0
tests/tools_example.py CHANGED
@@ -1,261 +1,261 @@ — every line was removed and re-added with identical text, so the file content is shown once below
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
Usage examples for the Crawlo framework toolkit
"""
import asyncio
from crawlo.tools import (
    # Date tools
    parse_time,
    format_time,
    time_diff,

    # Data cleaning tools
    clean_text,
    format_currency,
    extract_emails,

    # Data validation tools
    validate_email,
    validate_url,
    validate_phone,

    # Request handling tools
    build_url,
    add_query_params,
    merge_headers,

    # Anti-crawling tools
    get_random_user_agent,
    rotate_proxy,

    # Authenticated proxy tools
    AuthenticatedProxy,
    create_proxy_config,
    get_proxy_info,

    # Distributed coordination tools
    generate_task_id,
    get_cluster_info
)


def demo_date_tools():
    """Demonstrate the date tools."""
    print("=== Date tools demo ===\n")

    # Parse a time string
    time_str = "2025-09-10 14:30:00"
    parsed_time = parse_time(time_str)
    print(f"Parsed time: {time_str} -> {parsed_time}")

    # Format a time
    formatted_time = format_time(parsed_time, "%Y年%m月%d日 %H:%M:%S")
    print(f"Formatted time: {parsed_time} -> {formatted_time}")

    # Compute a time difference
    time_str2 = "2025-09-11 16:45:30"
    parsed_time2 = parse_time(time_str2)
    diff = time_diff(parsed_time2, parsed_time)
    print(f"Time difference: {time_str2} - {time_str} = {diff} seconds")

    print()


def demo_data_cleaning_tools():
    """Demonstrate the data cleaning tools."""
    print("=== Data cleaning tools demo ===\n")

    # Clean text
    dirty_text = "<p>这是一个&nbsp;<b>测试</b>&amp;文本</p>"
    clean_result = clean_text(dirty_text)
    print(f"Cleaned text: {dirty_text} -> {clean_result}")

    # Format a currency value
    price = 1234.567
    formatted_price = format_currency(price, "¥", 2)
    print(f"Formatted currency: {price} -> {formatted_price}")

    # Extract email addresses
    text_with_email = "联系邮箱: test@example.com, support@crawler.com"
    emails = extract_emails(text_with_email)
    print(f"Extracted emails: {text_with_email} -> {emails}")

    print()


def demo_data_validation_tools():
    """Demonstrate the data validation tools."""
    print("=== Data validation tools demo ===\n")

    # Validate an email address
    email = "test@example.com"
    is_valid_email = validate_email(email)
    print(f"Email validation: {email} -> {'valid' if is_valid_email else 'invalid'}")

    # Validate an invalid email address
    invalid_email = "invalid-email"
    is_valid_invalid = validate_email(invalid_email)
    print(f"Email validation: {invalid_email} -> {'valid' if is_valid_invalid else 'invalid'}")

    # Validate a URL
    url = "https://example.com/path?param=value"
    is_valid_url = validate_url(url)
    print(f"URL validation: {url} -> {'valid' if is_valid_url else 'invalid'}")

    # Validate a phone number
    phone = "13812345678"
    is_valid_phone = validate_phone(phone)
    print(f"Phone validation: {phone} -> {'valid' if is_valid_phone else 'invalid'}")

    print()


def demo_request_handling_tools():
    """Demonstrate the request handling tools."""
    print("=== Request handling tools demo ===\n")

    # Build a URL
    base_url = "https://api.example.com"
    path = "/v1/users"
    query_params = {"page": 1, "limit": 10}
    full_url = build_url(base_url, path, query_params)
    print(f"Built URL: {base_url} + {path} + {query_params} -> {full_url}")

    # Add query parameters
    existing_url = "https://api.example.com/v1/users?page=1"
    new_params = {"sort": "name", "order": "asc"}
    updated_url = add_query_params(existing_url, new_params)
    print(f"Added parameters: {existing_url} + {new_params} -> {updated_url}")

    # Merge request headers
    base_headers = {"Content-Type": "application/json", "Accept": "application/json"}
    additional_headers = {"Authorization": "Bearer token123", "User-Agent": "Crawlo/1.0"}
    merged_headers = merge_headers(base_headers, additional_headers)
    print(f"Merged request headers:")
    print(f"  Base headers: {base_headers}")
    print(f"  Extra headers: {additional_headers}")
    print(f"  Merged: {merged_headers}")

    print()


def demo_anti_crawler_tools():
    """Demonstrate the anti-crawling tools."""
    print("=== Anti-crawling tools demo ===\n")

    # Get a random User-Agent
    user_agent = get_random_user_agent()
    print(f"Random User-Agent: {user_agent[:50]}...")

    # Rotate the proxy
    proxy = rotate_proxy()
    print(f"Rotated proxy: {proxy}")

    print()


def demo_authenticated_proxy_tools():
    """Demonstrate the authenticated proxy tools."""
    print("=== Authenticated proxy tools demo ===\n")

    # Create an authenticated proxy
    proxy_url = "http://username:password@proxy.example.com:8080"
    proxy = AuthenticatedProxy(proxy_url)

    print(f"Proxy URL: {proxy}")
    print(f"Clean URL: {proxy.clean_url}")
    print(f"Username: {proxy.username}")
    print(f"Password: {proxy.password}")
    print(f"Proxy dict: {proxy.proxy_dict}")
    print(f"Auth credentials: {proxy.get_auth_credentials()}")
    print(f"Auth header: {proxy.get_auth_header()}")
    print(f"Is valid: {proxy.is_valid()}")

    # Create a proxy configuration
    proxy_config = create_proxy_config(proxy_url)
    print(f"\nProxy config: {proxy_config}")

    # Get proxy information
    proxy_info = get_proxy_info(proxy_url)
    print(f"Proxy info: {proxy_info}")

    print()


async def demo_distributed_coordinator_tools():
    """Demonstrate the distributed coordination tools."""
    print("=== Distributed coordination tools demo ===\n")

    # Generate a task ID
    url = "https://example.com/page/1"
    spider_name = "example_spider"
    task_id = generate_task_id(url, spider_name)
    print(f"Generated task ID: URL={url}, Spider={spider_name} -> {task_id}")

    # Get cluster information
    cluster_info = await get_cluster_info()
    print(f"Cluster info: {cluster_info}")

    print()


if __name__ == '__main__':
    # Run the demos
    demo_date_tools()
    demo_data_cleaning_tools()
    demo_data_validation_tools()
    demo_request_handling_tools()
    demo_anti_crawler_tools()
    demo_authenticated_proxy_tools()

    # Run the async demo
    asyncio.run(demo_distributed_coordinator_tools())

    print("=== Using the toolkit in a spider ===\n")
    print("In a spider project, you can use the toolkit like this:")
    print("""
from crawlo import Spider, Request
from crawlo.tools import (
    clean_text,
    validate_email,
    get_random_user_agent,
    build_url,
    AuthenticatedProxy
)

class ExampleSpider(Spider):
    def start_requests(self):
        headers = {"User-Agent": get_random_user_agent()}

        # Use an authenticated proxy
        proxy_url = "http://username:password@proxy.example.com:8080"
        proxy = AuthenticatedProxy(proxy_url)

        request = Request("https://example.com", headers=headers)
        # Set the proxy according to the downloader type
        downloader_type = self.crawler.settings.get("DOWNLOADER_TYPE", "aiohttp")
        if downloader_type == "aiohttp":
            request.proxy = proxy.clean_url
            auth = proxy.get_auth_credentials()
            if auth:
                request.meta["proxy_auth"] = auth
        else:
            request.proxy = proxy.proxy_dict

        yield request

    def parse(self, response):
        # Extract data
        title = response.css('h1::text').get()
        email = response.css('.email::text').get()

        # Clean and validate the data
        clean_title = clean_text(title) if title else None
        is_valid_email = validate_email(email) if email else False

        # Build the next page URL
        next_page_url = build_url("https://example.com", "/page/2")

        # Process the data...
  """)
tests/untested_features_report.md ADDED
@@ -0,0 +1,139 @@
# Untested Features Report

## Overview

After a comprehensive review of the Crawlo framework, the following functional modules were found to lack dedicated test cases. Although parts of these modules may be covered indirectly by other tests, they have no targeted unit or integration tests.

## Modules With Completed Tests

### 1. Factory-pattern modules

**Module path**: `crawlo/factories/`

**Test file**: `tests/test_factories.py`

**Tested components**:
- `ComponentRegistry` - component registry
- `ComponentFactory` - component factory base class
- `DefaultComponentFactory` - default component factory
- `CrawlerComponentFactory` - crawler component factory

### 2. Batch-processing utilities

**Module path**: `crawlo/utils/batch_processor.py`

**Test file**: `tests/test_batch_processor.py`

**Tested components**:
- `BatchProcessor` - batch processor
- `RedisBatchProcessor` - Redis batch processor
- `batch_process` - convenience batch-processing function

### 3. Controlled spider mixins

**Module path**: `crawlo/utils/controlled_spider_mixin.py`

**Test file**: `tests/test_controlled_spider_mixin.py`

**Tested components**:
- `ControlledRequestMixin` - controlled request-generation mixin
- `AsyncControlledRequestMixin` - asynchronous controlled request mixin

### 4. Large-scale configuration utilities

**Module path**: `crawlo/utils/large_scale_config.py`

**Test file**: `tests/test_large_scale_config.py`

**Tested components**:
- `LargeScaleConfig` - large-scale crawler configuration class
- `apply_large_scale_config` - function that applies the large-scale configuration

### 5. Large-scale crawling helpers

**Module path**: `crawlo/utils/large_scale_helper.py`

**Test file**: `tests/test_large_scale_helper.py`

**Tested components**:
- `LargeScaleHelper` - large-scale crawling helper class
- `ProgressManager` - progress manager
- `MemoryOptimizer` - memory optimizer
- `DataSourceAdapter` - data source adapter
- `LargeScaleSpiderMixin` - large-scale spider mixin

### 6. Enhanced error-handling utilities

**Module path**: `crawlo/utils/enhanced_error_handler.py`

**Test files**:
- `tests/test_enhanced_error_handler.py` (basic tests)
- `tests/test_enhanced_error_handler_comprehensive.py` (comprehensive tests)

**Tested components**:
- `ErrorContext` - error context information
- `DetailedException` - detailed exception base class
- `EnhancedErrorHandler` - enhanced error handler
- `handle_exception` decorator

## Untested Modules

### 1. Performance monitoring utilities

**Module path**: `crawlo/utils/performance_monitor.py`

**Test file**: `tests/test_performance_monitor.py` (partial tests; depends on psutil)

**Insufficiently tested components**:
- `PerformanceMonitor` - performance monitor
- `PerformanceTimer` - performance timer
- `performance_monitor_decorator` - performance monitoring decorator

**Risk**: performance monitoring is an important tool for optimisation and diagnostics; without tests, the monitoring data may be inaccurate or the monitoring may silently stop working.
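
As an illustration of the kind of coverage this section calls for, the sketch below shows one way a timing utility could be exercised while skipping cleanly when psutil is unavailable. It is a minimal sketch only: the `ElapsedTimer` class is a local stand-in defined inside the test, not crawlo's `PerformanceTimer`, whose actual API is not assumed here.

```python
# Illustrative sketch; ElapsedTimer is a hypothetical stand-in, not crawlo's API.
import time

import pytest

psutil = pytest.importorskip("psutil")  # skip the module if psutil is missing


class ElapsedTimer:
    """Minimal stand-in for a performance timer (for illustration only)."""

    def __enter__(self):
        self._start = time.perf_counter()
        return self

    def __exit__(self, *exc):
        self.elapsed = time.perf_counter() - self._start


def test_timer_measures_a_known_sleep():
    with ElapsedTimer() as timer:
        time.sleep(0.05)
    # Generous tolerance keeps the assertion stable on loaded CI machines.
    assert 0.04 <= timer.elapsed <= 0.5


def test_process_memory_is_reported_as_positive():
    # Sanity-check the psutil call a memory monitor would rely on.
    rss = psutil.Process().memory_info().rss
    assert rss > 0
```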

## Suggested Testing Strategy

### 1. Prioritisation

**High priority** (directly affects core functionality):
- (completed)

**Medium priority** (affects performance and stability):
- performance monitoring utilities

**Low priority** (auxiliary functionality):
- (completed)

### 2. Suggested test types

**Unit tests**:
- test each class's methods in isolation (a minimal sketch follows this section)
- verify boundary conditions and error cases
- validate configuration parameters

**Integration tests**:
- test collaboration between modules
- verify interaction with external services such as Redis
- test behaviour in realistic crawling scenarios

**Performance tests**:
- verify the performance benefit of the batch-processing utilities
- measure the memory usage of the large-scale processing utilities
- verify the accuracy of the performance monitoring utilities

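A minimal sketch of the unit-test style suggested above, applied to a pure function. `validate_email` is exercised the same way `tests/tools_example.py` uses it (truthy for valid addresses, falsy otherwise); the expected results for the malformed and empty inputs are assumptions about the validator's behaviour, so treat this as a pattern rather than a ready-made test.

```python
# Pattern sketch for the suggested unit tests; expected values for edge
# cases are assumptions, not documented crawlo behaviour.
import pytest

from crawlo.tools import validate_email


@pytest.mark.parametrize(
    "address,expected",
    [
        ("test@example.com", True),  # normal flow
        ("invalid-email", False),    # malformed input
        ("", False),                 # boundary: empty string
    ],
)
def test_validate_email(address, expected):
    assert bool(validate_email(address)) is expected
```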

### 3. Suggested test coverage

**Core functionality coverage**:
- normal (happy-path) flows
- error flows
- boundary conditions
- concurrency safety

**Configuration coverage**:
- tests for different configuration parameters
- comparison of default and custom configuration (see the sketch below)
- dynamic tests of configuration updates

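The default-versus-custom configuration comparison suggested above can be expressed as a table-driven test. The sketch below is a minimal illustration that uses a plain dict merge and a hypothetical `effective_settings` helper as stand-ins; crawlo's actual settings machinery (e.g. its `SettingManager`) is not assumed here.

```python
# Table-driven pattern for configuration checks; the dict merge and
# effective_settings helper are hypothetical stand-ins for illustration.
import pytest

DEFAULTS = {"DOWNLOADER_TYPE": "aiohttp", "CONCURRENCY": 8}


def effective_settings(overrides=None):
    """Merge user overrides on top of the defaults (illustrative helper)."""
    merged = dict(DEFAULTS)
    merged.update(overrides or {})
    return merged


@pytest.mark.parametrize(
    "overrides,key,expected",
    [
        (None, "DOWNLOADER_TYPE", "aiohttp"),                        # default configuration
        ({"DOWNLOADER_TYPE": "httpx"}, "DOWNLOADER_TYPE", "httpx"),  # custom override
        ({"CONCURRENCY": 32}, "CONCURRENCY", 32),                    # numeric override
    ],
)
def test_effective_settings(overrides, key, expected):
    assert effective_settings(overrides)[key] == expected
```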

## Conclusion

Test cases have been created for the factory-pattern modules, the batch-processing utilities, the controlled spider mixins, the large-scale configuration utilities, the large-scale crawling helpers, and the enhanced error-handling utilities, so these core components now have basic test coverage. It is recommended to add tests for the performance monitoring utilities next (once psutil is installed) to ensure the completeness and stability of the framework.