crawlo 1.3.1__py3-none-any.whl → 1.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic. Click here for more details.
- crawlo/__init__.py +63 -63
- crawlo/__version__.py +1 -1
- crawlo/cli.py +75 -75
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +151 -151
- crawlo/commands/help.py +138 -138
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +322 -314
- crawlo/commands/startproject.py +436 -436
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +196 -196
- crawlo/config.py +312 -312
- crawlo/config_validator.py +277 -277
- crawlo/core/__init__.py +2 -2
- crawlo/core/engine.py +365 -365
- crawlo/core/processor.py +40 -40
- crawlo/core/scheduler.py +256 -256
- crawlo/crawler.py +1166 -1168
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +194 -194
- crawlo/downloader/__init__.py +273 -273
- crawlo/downloader/aiohttp_downloader.py +226 -226
- crawlo/downloader/cffi_downloader.py +245 -245
- crawlo/downloader/httpx_downloader.py +259 -259
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +402 -402
- crawlo/downloader/selenium_downloader.py +472 -472
- crawlo/event.py +11 -11
- crawlo/exceptions.py +81 -81
- crawlo/extension/__init__.py +39 -39
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +57 -57
- crawlo/extension/log_stats.py +81 -81
- crawlo/extension/logging_extension.py +52 -45
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/filters/__init__.py +154 -154
- crawlo/filters/aioredis_filter.py +234 -234
- crawlo/filters/memory_filter.py +269 -269
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +21 -21
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +132 -132
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +135 -135
- crawlo/middleware/offsite.py +123 -115
- crawlo/middleware/proxy.py +386 -386
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +163 -163
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/middleware/simple_proxy.py +65 -65
- crawlo/mode_manager.py +187 -148
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +379 -379
- crawlo/network/response.py +359 -359
- crawlo/pipelines/__init__.py +21 -21
- crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +222 -222
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +115 -115
- crawlo/pipelines/mongo_pipeline.py +131 -131
- crawlo/pipelines/mysql_pipeline.py +318 -318
- crawlo/pipelines/pipeline_manager.py +75 -75
- crawlo/pipelines/redis_dedup_pipeline.py +166 -166
- crawlo/project.py +325 -297
- crawlo/queue/pqueue.py +37 -37
- crawlo/queue/queue_manager.py +379 -379
- crawlo/queue/redis_priority_queue.py +306 -306
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +225 -225
- crawlo/settings/setting_manager.py +198 -198
- crawlo/spider/__init__.py +639 -639
- crawlo/stats_collector.py +59 -59
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +30 -30
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +3 -3
- crawlo/templates/project/items.py.tmpl +17 -17
- crawlo/templates/project/middlewares.py.tmpl +118 -118
- crawlo/templates/project/pipelines.py.tmpl +96 -96
- crawlo/templates/project/settings.py.tmpl +266 -261
- crawlo/templates/project/settings_distributed.py.tmpl +179 -174
- crawlo/templates/project/settings_gentle.py.tmpl +60 -95
- crawlo/templates/project/settings_high_performance.py.tmpl +130 -125
- crawlo/templates/project/settings_minimal.py.tmpl +34 -29
- crawlo/templates/project/settings_simple.py.tmpl +101 -96
- crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
- crawlo/templates/run.py.tmpl +38 -47
- crawlo/templates/spider/spider.py.tmpl +143 -143
- crawlo/tools/__init__.py +200 -200
- crawlo/tools/anti_crawler.py +268 -268
- crawlo/tools/authenticated_proxy.py +240 -240
- crawlo/tools/data_formatter.py +225 -225
- crawlo/tools/data_validator.py +180 -180
- crawlo/tools/date_tools.py +289 -289
- crawlo/tools/distributed_coordinator.py +388 -388
- crawlo/tools/encoding_converter.py +127 -127
- crawlo/tools/request_tools.py +82 -82
- crawlo/tools/retry_mechanism.py +224 -224
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/tools/text_cleaner.py +232 -232
- crawlo/utils/__init__.py +34 -34
- crawlo/utils/batch_processor.py +259 -259
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/db_helper.py +343 -343
- crawlo/utils/enhanced_error_handler.py +356 -356
- crawlo/utils/env_config.py +142 -142
- crawlo/utils/error_handler.py +123 -123
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_config.py +286 -286
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/log.py +199 -146
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_connection_pool.py +351 -351
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +267 -267
- crawlo/utils/request_serializer.py +218 -218
- crawlo/utils/spider_loader.py +61 -61
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +4 -4
- crawlo/utils/url.py +39 -39
- {crawlo-1.3.1.dist-info → crawlo-1.3.3.dist-info}/METADATA +1020 -1020
- crawlo-1.3.3.dist-info/RECORD +219 -0
- examples/__init__.py +7 -7
- tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +81 -81
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +275 -275
- tests/authenticated_proxy_example.py +107 -107
- tests/cleaners_example.py +160 -160
- tests/config_validation_demo.py +142 -142
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/debug_pipelines.py +66 -66
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/env_config_example.py +133 -133
- tests/error_handling_example.py +171 -171
- tests/redis_key_validation_demo.py +130 -130
- tests/request_params_example.py +150 -150
- tests/response_improvements_example.py +144 -144
- tests/test_advanced_tools.py +148 -148
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_authenticated_proxy.py +141 -141
- tests/test_cleaners.py +54 -54
- tests/test_comprehensive.py +146 -146
- tests/test_config_consistency.py +80 -80
- tests/test_config_merge.py +152 -152
- tests/test_config_validator.py +182 -182
- tests/test_crawlo_proxy_integration.py +108 -108
- tests/test_date_tools.py +123 -123
- tests/test_default_header_middleware.py +158 -158
- tests/test_distributed.py +65 -65
- tests/test_double_crawlo_fix.py +207 -207
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +268 -268
- tests/test_dynamic_downloaders_proxy.py +124 -124
- tests/test_dynamic_proxy.py +92 -92
- tests/test_dynamic_proxy_config.py +146 -146
- tests/test_dynamic_proxy_real.py +109 -109
- tests/test_edge_cases.py +303 -303
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_env_config.py +121 -121
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_final_validation.py +153 -153
- tests/test_framework_env_usage.py +103 -103
- tests/test_integration.py +169 -169
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_mode_consistency.py +51 -51
- tests/test_offsite_middleware.py +221 -221
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_proxy_api.py +264 -264
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +121 -121
- tests/test_proxy_middleware_enhanced.py +216 -216
- tests/test_proxy_middleware_integration.py +136 -136
- tests/test_proxy_middleware_refactored.py +184 -184
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +176 -176
- tests/test_random_user_agent.py +72 -72
- tests/test_real_scenario_proxy.py +195 -195
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_params.py +111 -111
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_improvements.py +152 -152
- tests/test_retry_middleware.py +241 -241
- tests/test_scheduler.py +252 -252
- tests/test_scheduler_config_update.py +133 -133
- tests/test_simple_response.py +61 -61
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +159 -159
- tests/test_user_agents.py +96 -96
- tests/tools_example.py +260 -260
- tests/verify_distributed.py +117 -117
- crawlo-1.3.1.dist-info/RECORD +0 -219
- {crawlo-1.3.1.dist-info → crawlo-1.3.3.dist-info}/WHEEL +0 -0
- {crawlo-1.3.1.dist-info → crawlo-1.3.3.dist-info}/entry_points.txt +0 -0
- {crawlo-1.3.1.dist-info → crawlo-1.3.3.dist-info}/top_level.txt +0 -0
tests/test_user_agents.py
CHANGED
|
@@ -1,97 +1,97 @@
|
|
|
1
|
-
#!/usr/bin/env python
|
|
2
|
-
# -*- coding: utf-8 -*-
|
|
3
|
-
"""
|
|
4
|
-
测试User-Agent列表的功能
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
import sys
|
|
8
|
-
import os
|
|
9
|
-
|
|
10
|
-
# 添加项目根目录到Python路径
|
|
11
|
-
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
|
12
|
-
|
|
13
|
-
from crawlo.data.user_agents import (
|
|
14
|
-
DESKTOP_USER_AGENTS,
|
|
15
|
-
MOBILE_USER_AGENTS,
|
|
16
|
-
BOT_USER_AGENTS,
|
|
17
|
-
CHROME_USER_AGENTS,
|
|
18
|
-
FIREFOX_USER_AGENTS,
|
|
19
|
-
SAFARI_USER_AGENTS,
|
|
20
|
-
EDGE_USER_AGENTS,
|
|
21
|
-
OPERA_USER_AGENTS,
|
|
22
|
-
ALL_USER_AGENTS,
|
|
23
|
-
USER_AGENTS_BY_TYPE,
|
|
24
|
-
get_user_agents,
|
|
25
|
-
get_random_user_agent
|
|
26
|
-
)
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
def test_user_agent_counts():
|
|
30
|
-
"""测试User-Agent数量"""
|
|
31
|
-
print("=== User-Agent数量测试 ===")
|
|
32
|
-
print(f"桌面浏览器User-Agent数量: {len(DESKTOP_USER_AGENTS)}")
|
|
33
|
-
print(f"移动设备User-Agent数量: {len(MOBILE_USER_AGENTS)}")
|
|
34
|
-
print(f"爬虫User-Agent数量: {len(BOT_USER_AGENTS)}")
|
|
35
|
-
print(f"Chrome User-Agent数量: {len(CHROME_USER_AGENTS)}")
|
|
36
|
-
print(f"Firefox User-Agent数量: {len(FIREFOX_USER_AGENTS)}")
|
|
37
|
-
print(f"Safari User-Agent数量: {len(SAFARI_USER_AGENTS)}")
|
|
38
|
-
print(f"Edge User-Agent数量: {len(EDGE_USER_AGENTS)}")
|
|
39
|
-
print(f"Opera User-Agent数量: {len(OPERA_USER_AGENTS)}")
|
|
40
|
-
print(f"所有User-Agent数量: {len(ALL_USER_AGENTS)}")
|
|
41
|
-
print()
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
def test_get_user_agents():
|
|
45
|
-
"""测试get_user_agents函数"""
|
|
46
|
-
print("=== get_user_agents函数测试 ===")
|
|
47
|
-
for device_type in ["desktop", "mobile", "bot", "all", "chrome", "firefox", "safari", "edge", "opera"]:
|
|
48
|
-
user_agents = get_user_agents(device_type)
|
|
49
|
-
print(f"{device_type}类型User-Agent数量: {len(user_agents)}")
|
|
50
|
-
print()
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
def test_get_random_user_agent():
|
|
54
|
-
"""测试get_random_user_agent函数"""
|
|
55
|
-
print("=== get_random_user_agent函数测试 ===")
|
|
56
|
-
for device_type in ["desktop", "mobile", "all", "chrome", "firefox"]:
|
|
57
|
-
ua = get_random_user_agent(device_type)
|
|
58
|
-
print(f"{device_type}类型随机User-Agent: {ua[:100]}...")
|
|
59
|
-
print()
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
def test_user_agents_content():
|
|
63
|
-
"""测试User-Agent内容"""
|
|
64
|
-
print("=== User-Agent内容测试 ===")
|
|
65
|
-
|
|
66
|
-
# 检查是否包含最新的浏览器版本
|
|
67
|
-
chrome_ua_count = sum(1 for ua in ALL_USER_AGENTS if "Chrome/136" in ua)
|
|
68
|
-
firefox_ua_count = sum(1 for ua in ALL_USER_AGENTS if "Firefox/136" in ua)
|
|
69
|
-
safari_ua_count = sum(1 for ua in ALL_USER_AGENTS if "Version/18" in ua and "Safari" in ua)
|
|
70
|
-
|
|
71
|
-
print(f"包含Chrome 136的User-Agent数量: {chrome_ua_count}")
|
|
72
|
-
print(f"包含Firefox 136的User-Agent数量: {firefox_ua_count}")
|
|
73
|
-
print(f"包含Safari 18的User-Agent数量: {safari_ua_count}")
|
|
74
|
-
|
|
75
|
-
# 检查是否包含移动设备User-Agent
|
|
76
|
-
ios_ua_count = sum(1 for ua in ALL_USER_AGENTS if "iPhone" in ua or "iPad" in ua)
|
|
77
|
-
android_ua_count = sum(1 for ua in ALL_USER_AGENTS if "Android" in ua)
|
|
78
|
-
|
|
79
|
-
print(f"包含iOS设备的User-Agent数量: {ios_ua_count}")
|
|
80
|
-
print(f"包含Android设备的User-Agent数量: {android_ua_count}")
|
|
81
|
-
print()
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
def main():
|
|
85
|
-
"""主测试函数"""
|
|
86
|
-
print("开始测试User-Agent列表...\n")
|
|
87
|
-
|
|
88
|
-
test_user_agent_counts()
|
|
89
|
-
test_get_user_agents()
|
|
90
|
-
test_get_random_user_agent()
|
|
91
|
-
test_user_agents_content()
|
|
92
|
-
|
|
93
|
-
print("所有测试完成!")
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
if __name__ == "__main__":
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
测试User-Agent列表的功能
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import sys
|
|
8
|
+
import os
|
|
9
|
+
|
|
10
|
+
# 添加项目根目录到Python路径
|
|
11
|
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
|
12
|
+
|
|
13
|
+
from crawlo.data.user_agents import (
|
|
14
|
+
DESKTOP_USER_AGENTS,
|
|
15
|
+
MOBILE_USER_AGENTS,
|
|
16
|
+
BOT_USER_AGENTS,
|
|
17
|
+
CHROME_USER_AGENTS,
|
|
18
|
+
FIREFOX_USER_AGENTS,
|
|
19
|
+
SAFARI_USER_AGENTS,
|
|
20
|
+
EDGE_USER_AGENTS,
|
|
21
|
+
OPERA_USER_AGENTS,
|
|
22
|
+
ALL_USER_AGENTS,
|
|
23
|
+
USER_AGENTS_BY_TYPE,
|
|
24
|
+
get_user_agents,
|
|
25
|
+
get_random_user_agent
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def test_user_agent_counts():
|
|
30
|
+
"""测试User-Agent数量"""
|
|
31
|
+
print("=== User-Agent数量测试 ===")
|
|
32
|
+
print(f"桌面浏览器User-Agent数量: {len(DESKTOP_USER_AGENTS)}")
|
|
33
|
+
print(f"移动设备User-Agent数量: {len(MOBILE_USER_AGENTS)}")
|
|
34
|
+
print(f"爬虫User-Agent数量: {len(BOT_USER_AGENTS)}")
|
|
35
|
+
print(f"Chrome User-Agent数量: {len(CHROME_USER_AGENTS)}")
|
|
36
|
+
print(f"Firefox User-Agent数量: {len(FIREFOX_USER_AGENTS)}")
|
|
37
|
+
print(f"Safari User-Agent数量: {len(SAFARI_USER_AGENTS)}")
|
|
38
|
+
print(f"Edge User-Agent数量: {len(EDGE_USER_AGENTS)}")
|
|
39
|
+
print(f"Opera User-Agent数量: {len(OPERA_USER_AGENTS)}")
|
|
40
|
+
print(f"所有User-Agent数量: {len(ALL_USER_AGENTS)}")
|
|
41
|
+
print()
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def test_get_user_agents():
|
|
45
|
+
"""测试get_user_agents函数"""
|
|
46
|
+
print("=== get_user_agents函数测试 ===")
|
|
47
|
+
for device_type in ["desktop", "mobile", "bot", "all", "chrome", "firefox", "safari", "edge", "opera"]:
|
|
48
|
+
user_agents = get_user_agents(device_type)
|
|
49
|
+
print(f"{device_type}类型User-Agent数量: {len(user_agents)}")
|
|
50
|
+
print()
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def test_get_random_user_agent():
|
|
54
|
+
"""测试get_random_user_agent函数"""
|
|
55
|
+
print("=== get_random_user_agent函数测试 ===")
|
|
56
|
+
for device_type in ["desktop", "mobile", "all", "chrome", "firefox"]:
|
|
57
|
+
ua = get_random_user_agent(device_type)
|
|
58
|
+
print(f"{device_type}类型随机User-Agent: {ua[:100]}...")
|
|
59
|
+
print()
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def test_user_agents_content():
|
|
63
|
+
"""测试User-Agent内容"""
|
|
64
|
+
print("=== User-Agent内容测试 ===")
|
|
65
|
+
|
|
66
|
+
# 检查是否包含最新的浏览器版本
|
|
67
|
+
chrome_ua_count = sum(1 for ua in ALL_USER_AGENTS if "Chrome/136" in ua)
|
|
68
|
+
firefox_ua_count = sum(1 for ua in ALL_USER_AGENTS if "Firefox/136" in ua)
|
|
69
|
+
safari_ua_count = sum(1 for ua in ALL_USER_AGENTS if "Version/18" in ua and "Safari" in ua)
|
|
70
|
+
|
|
71
|
+
print(f"包含Chrome 136的User-Agent数量: {chrome_ua_count}")
|
|
72
|
+
print(f"包含Firefox 136的User-Agent数量: {firefox_ua_count}")
|
|
73
|
+
print(f"包含Safari 18的User-Agent数量: {safari_ua_count}")
|
|
74
|
+
|
|
75
|
+
# 检查是否包含移动设备User-Agent
|
|
76
|
+
ios_ua_count = sum(1 for ua in ALL_USER_AGENTS if "iPhone" in ua or "iPad" in ua)
|
|
77
|
+
android_ua_count = sum(1 for ua in ALL_USER_AGENTS if "Android" in ua)
|
|
78
|
+
|
|
79
|
+
print(f"包含iOS设备的User-Agent数量: {ios_ua_count}")
|
|
80
|
+
print(f"包含Android设备的User-Agent数量: {android_ua_count}")
|
|
81
|
+
print()
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def main():
|
|
85
|
+
"""主测试函数"""
|
|
86
|
+
print("开始测试User-Agent列表...\n")
|
|
87
|
+
|
|
88
|
+
test_user_agent_counts()
|
|
89
|
+
test_get_user_agents()
|
|
90
|
+
test_get_random_user_agent()
|
|
91
|
+
test_user_agents_content()
|
|
92
|
+
|
|
93
|
+
print("所有测试完成!")
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
if __name__ == "__main__":
|
|
97
97
|
main()
|