crawlo 1.2.8__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- crawlo/__init__.py +63 -61
- crawlo/__version__.py +1 -1
- crawlo/cli.py +75 -75
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +151 -151
- crawlo/commands/help.py +138 -138
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +314 -323
- crawlo/commands/startproject.py +436 -436
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +186 -186
- crawlo/config.py +312 -312
- crawlo/config_validator.py +277 -251
- crawlo/core/__init__.py +2 -2
- crawlo/core/engine.py +365 -365
- crawlo/core/processor.py +40 -40
- crawlo/core/scheduler.py +256 -251
- crawlo/crawler.py +1097 -1099
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +194 -107
- crawlo/downloader/__init__.py +273 -266
- crawlo/downloader/aiohttp_downloader.py +226 -228
- crawlo/downloader/cffi_downloader.py +245 -256
- crawlo/downloader/httpx_downloader.py +259 -259
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +402 -402
- crawlo/downloader/selenium_downloader.py +472 -472
- crawlo/event.py +11 -11
- crawlo/exceptions.py +81 -81
- crawlo/extension/__init__.py +39 -39
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +57 -57
- crawlo/extension/log_stats.py +81 -81
- crawlo/extension/logging_extension.py +45 -43
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/filters/__init__.py +154 -154
- crawlo/filters/aioredis_filter.py +234 -234
- crawlo/filters/memory_filter.py +269 -269
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +21 -21
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +132 -132
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +136 -136
- crawlo/middleware/offsite.py +114 -114
- crawlo/middleware/proxy.py +386 -368
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +163 -163
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/middleware/simple_proxy.py +65 -0
- crawlo/mode_manager.py +212 -211
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +379 -338
- crawlo/network/response.py +359 -359
- crawlo/pipelines/__init__.py +21 -21
- crawlo/pipelines/bloom_dedup_pipeline.py +157 -157
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +223 -223
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +115 -115
- crawlo/pipelines/mongo_pipeline.py +131 -131
- crawlo/pipelines/mysql_pipeline.py +317 -317
- crawlo/pipelines/pipeline_manager.py +74 -62
- crawlo/pipelines/redis_dedup_pipeline.py +167 -167
- crawlo/project.py +284 -315
- crawlo/queue/pqueue.py +37 -37
- crawlo/queue/queue_manager.py +379 -378
- crawlo/queue/redis_priority_queue.py +306 -306
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +216 -220
- crawlo/settings/setting_manager.py +175 -122
- crawlo/spider/__init__.py +639 -639
- crawlo/stats_collector.py +59 -59
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +30 -30
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +3 -3
- crawlo/templates/project/items.py.tmpl +17 -17
- crawlo/templates/project/middlewares.py.tmpl +118 -118
- crawlo/templates/project/pipelines.py.tmpl +96 -96
- crawlo/templates/project/settings.py.tmpl +261 -288
- crawlo/templates/project/settings_distributed.py.tmpl +174 -157
- crawlo/templates/project/settings_gentle.py.tmpl +95 -100
- crawlo/templates/project/settings_high_performance.py.tmpl +125 -134
- crawlo/templates/project/settings_minimal.py.tmpl +30 -0
- crawlo/templates/project/settings_simple.py.tmpl +96 -98
- crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
- crawlo/templates/run.py.tmpl +47 -47
- crawlo/templates/spider/spider.py.tmpl +143 -143
- crawlo/tools/__init__.py +200 -182
- crawlo/tools/anti_crawler.py +268 -268
- crawlo/tools/authenticated_proxy.py +240 -240
- crawlo/{cleaners → tools}/data_formatter.py +225 -225
- crawlo/tools/data_validator.py +180 -180
- crawlo/tools/date_tools.py +290 -36
- crawlo/tools/distributed_coordinator.py +388 -387
- crawlo/{cleaners → tools}/encoding_converter.py +127 -126
- crawlo/tools/request_tools.py +83 -0
- crawlo/tools/retry_mechanism.py +224 -221
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/{cleaners → tools}/text_cleaner.py +232 -232
- crawlo/utils/__init__.py +35 -35
- crawlo/utils/batch_processor.py +259 -259
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/db_helper.py +343 -343
- crawlo/utils/enhanced_error_handler.py +356 -356
- crawlo/utils/env_config.py +142 -142
- crawlo/utils/error_handler.py +123 -123
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_config.py +286 -286
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/log.py +146 -128
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_connection_pool.py +351 -351
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +267 -267
- crawlo/utils/request_serializer.py +218 -218
- crawlo/utils/spider_loader.py +61 -61
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +4 -4
- crawlo/utils/url.py +39 -39
- {crawlo-1.2.8.dist-info → crawlo-1.3.0.dist-info}/METADATA +1011 -764
- crawlo-1.3.0.dist-info/RECORD +219 -0
- examples/__init__.py +7 -7
- tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +81 -81
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +275 -275
- tests/authenticated_proxy_example.py +107 -237
- tests/cleaners_example.py +160 -160
- tests/config_validation_demo.py +143 -103
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/debug_pipelines.py +67 -0
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/env_config_example.py +133 -133
- tests/error_handling_example.py +171 -171
- tests/redis_key_validation_demo.py +130 -130
- tests/request_params_example.py +151 -0
- tests/response_improvements_example.py +144 -144
- tests/test_advanced_tools.py +148 -148
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_authenticated_proxy.py +141 -141
- tests/test_cleaners.py +54 -54
- tests/test_comprehensive.py +146 -146
- tests/test_config_consistency.py +80 -80
- tests/test_config_merge.py +153 -0
- tests/test_config_validator.py +182 -193
- tests/test_crawlo_proxy_integration.py +109 -173
- tests/test_date_tools.py +123 -123
- tests/test_default_header_middleware.py +158 -158
- tests/test_distributed.py +65 -0
- tests/test_double_crawlo_fix.py +207 -207
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +268 -268
- tests/test_dynamic_downloaders_proxy.py +124 -124
- tests/test_dynamic_proxy.py +92 -92
- tests/test_dynamic_proxy_config.py +146 -146
- tests/test_dynamic_proxy_real.py +109 -109
- tests/test_edge_cases.py +303 -303
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_env_config.py +121 -121
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_final_validation.py +153 -153
- tests/test_framework_env_usage.py +103 -103
- tests/test_integration.py +169 -357
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_mode_consistency.py +51 -51
- tests/test_offsite_middleware.py +221 -221
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_proxy_api.py +264 -264
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +121 -121
- tests/test_proxy_middleware_enhanced.py +216 -216
- tests/test_proxy_middleware_integration.py +136 -136
- tests/test_proxy_middleware_refactored.py +185 -0
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +176 -176
- tests/test_random_user_agent.py +73 -0
- tests/test_real_scenario_proxy.py +195 -195
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_params.py +112 -0
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_improvements.py +152 -152
- tests/test_retry_middleware.py +241 -241
- tests/test_scheduler.py +252 -252
- tests/test_scheduler_config_update.py +133 -133
- tests/test_simple_response.py +61 -61
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +159 -153
- tests/test_user_agents.py +97 -0
- tests/tools_example.py +260 -257
- tests/verify_distributed.py +117 -0
- crawlo/cleaners/__init__.py +0 -61
- crawlo/utils/date_tools.py +0 -290
- crawlo-1.2.8.dist-info/RECORD +0 -209
- {crawlo-1.2.8.dist-info → crawlo-1.3.0.dist-info}/WHEEL +0 -0
- {crawlo-1.2.8.dist-info → crawlo-1.3.0.dist-info}/entry_points.txt +0 -0
- {crawlo-1.2.8.dist-info → crawlo-1.3.0.dist-info}/top_level.txt +0 -0
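The `{cleaners → tools}` entries above, together with the deletion of `crawlo/cleaners/__init__.py`, mean that 1.3.0 removes the `crawlo.cleaners` package and re-homes its helpers under `crawlo.tools` (the `tests/test_cleaners.py` hunk below shows exactly this import change). A minimal version-tolerant import sketch, assuming only the names that actually appear in that hunk:

```python
# Hedged migration sketch: prefer the 1.3.0 location, fall back to 1.2.8's.
# Only clean_text and remove_html_tags are used because they appear in the
# test_cleaners.py diff below; other crawlo.tools names are not verified here.
try:
    from crawlo.tools import clean_text, remove_html_tags      # crawlo >= 1.3.0
except ImportError:
    from crawlo.cleaners import clean_text, remove_html_tags   # crawlo <= 1.2.8

print(remove_html_tags("<p>hello <b>world</b></p>"))  # -> hello world
```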
tests/test_authenticated_proxy.py CHANGED

@@ -1,142 +1,142 @@

Lines 1-141 are removed and re-added with identical visible content (apparently a whitespace or line-ending-only change); only the final line, `asyncio.run(main())`, is unchanged context. The file body on both sides:

```python
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
Test authenticated proxy functionality
"""

import asyncio
import aiohttp
import httpx
from crawlo.network.request import Request
from crawlo.tools import AuthenticatedProxy


async def test_proxy_with_aiohttp():
    """Test aiohttp with an authenticated proxy"""
    print("=== Testing aiohttp with an authenticated proxy ===")

    # Proxy configuration
    proxy_config = {
        "http": "http://dwe20241014:Dwe0101014@182.201.243.186:58111",
        "https": "http://dwe20241014:Dwe0101014@182.201.243.186:58111"
    }

    # Create the proxy object
    proxy_url = proxy_config["http"]
    proxy = AuthenticatedProxy(proxy_url)

    print(f"Original proxy URL: {proxy_url}")
    print(f"Clean URL: {proxy.clean_url}")
    print(f"Auth credentials: {proxy.get_auth_credentials()}")

    # Test directly with aiohttp
    try:
        auth = proxy.get_auth_credentials()
        if auth:
            basic_auth = aiohttp.BasicAuth(auth['username'], auth['password'])
        else:
            basic_auth = None

        async with aiohttp.ClientSession() as session:
            async with session.get(
                "https://httpbin.org/ip",
                proxy=proxy.clean_url,
                proxy_auth=basic_auth
            ) as response:
                print(f"aiohttp test succeeded!")
                print(f"Status code: {response.status}")
                content = await response.text()
                print(f"Response body: {content[:200]}...")

    except Exception as e:
        print(f"aiohttp test failed: {e}")
        import traceback
        traceback.print_exc()


def test_proxy_with_httpx():
    """Test httpx with an authenticated proxy"""
    print("\n=== Testing httpx with an authenticated proxy ===")

    # Proxy configuration
    proxy_config = {
        "http": "http://dwe20241014:Dwe0101014@182.201.243.186:58111",
        "https": "http://dwe20241014:Dwe0101014@182.201.243.186:58111"
    }

    # Test directly with httpx
    try:
        # httpx accepts the authenticated URL directly as the proxy argument
        proxy_url = proxy_config["http"]

        with httpx.Client(proxy=proxy_url) as client:
            response = client.get("https://httpbin.org/ip")
            print(f"httpx test succeeded!")
            print(f"Status code: {response.status_code}")
            print(f"Response body: {response.text[:200]}...")

    except Exception as e:
        print(f"httpx test failed: {e}")
        import traceback
        traceback.print_exc()


async def test_proxy_with_curl_cffi():
    """Test curl_cffi with an authenticated proxy"""
    print("\n=== Testing curl_cffi with an authenticated proxy ===")

    # Proxy configuration
    proxy_config = {
        "http": "http://dwe20241014:Dwe0101014@182.201.243.186:58111",
        "https": "http://dwe20241014:Dwe0101014@182.201.243.186:58111"
    }

    # Create the proxy object
    proxy_url = proxy_config["http"]
    proxy = AuthenticatedProxy(proxy_url)

    print(f"Original proxy URL: {proxy_url}")
    print(f"Proxy dict: {proxy.proxy_dict}")
    print(f"Auth header: {proxy.get_auth_header()}")

    # Test directly with curl-cffi
    try:
        from curl_cffi import requests as curl_requests

        # Set the proxies and the auth header
        proxies = proxy.proxy_dict
        headers = {}
        auth_header = proxy.get_auth_header()
        if auth_header:
            headers["Proxy-Authorization"] = auth_header

        response = curl_requests.get(
            "https://httpbin.org/ip",
            proxies=proxies,
            headers=headers
        )

        print(f"curl_cffi test succeeded!")
        print(f"Status code: {response.status_code}")
        print(f"Response body: {response.text[:200]}...")

    except Exception as e:
        print(f"curl_cffi test failed: {e}")
        import traceback
        traceback.print_exc()


async def main():
    """Main test entry point"""
    print("Starting authenticated proxy tests...\n")

    # Exercise each HTTP library
    await test_proxy_with_aiohttp()
    test_proxy_with_httpx()
    await test_proxy_with_curl_cffi()

    print("\nAll tests finished!")


if __name__ == "__main__":
    asyncio.run(main())
```
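For context on the `Proxy-Authorization` header the curl-cffi branch sets above: when credentials are embedded in a proxy URL, the conventional header value is HTTP Basic auth over the `user:password` pair. A stdlib-only sketch of that encoding (an assumption about what `AuthenticatedProxy.get_auth_header()` returns; the diff does not show its implementation):

```python
# Hypothetical illustration of building a Basic Proxy-Authorization header
# from a proxy URL with embedded credentials (RFC 7617-style encoding).
import base64
from urllib.parse import urlsplit

def basic_proxy_auth_header(proxy_url: str) -> str:
    parts = urlsplit(proxy_url)
    credentials = f"{parts.username}:{parts.password}".encode()
    return "Basic " + base64.b64encode(credentials).decode()

# Placeholder credentials, not the ones from the diff above.
print(basic_proxy_auth_header("http://user:secret@proxy.example.com:8080"))
# -> Basic dXNlcjpzZWNyZXQ=
```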
tests/test_cleaners.py CHANGED

@@ -1,55 +1,55 @@

Every line except the final `unittest.main()` is rewritten, but the only visible difference is the import source: the old file's import line begins `from crawlo.` (presumably the `crawlo.cleaners` package this release deletes), while the new file reads `from crawlo.tools import (`, matching the `{cleaners → tools}` moves listed above. The new file:

```python
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
Data cleaning tools tests
"""
import unittest
from crawlo.tools import (
    TextCleaner,
    DataFormatter,
    remove_html_tags,
    decode_html_entities,
    clean_text,
    format_number,
    format_currency,
    format_phone_number
)


class TestCleaners(unittest.TestCase):
    """Test class for the data cleaning tools"""

    def test_text_cleaner(self):
        """Test text cleaning"""
        # Test HTML tag removal
        html_text = "<p>这是一个<b>测试</b>文本</p>"
        clean_text_result = remove_html_tags(html_text)
        self.assertEqual(clean_text_result, "这是一个测试文本")

        # Test HTML entity decoding
        entity_text = "这是一个&nbsp;测试&amp;文本"
        decoded_text = decode_html_entities(entity_text)
        self.assertEqual(decoded_text, "这是一个 测试&文本")

        # Test combined cleaning
        complex_text = "<p>这是一个&nbsp;<b>测试</b>&amp;文本</p>"
        cleaned = clean_text(complex_text)
        self.assertEqual(cleaned, "这是一个 测试&文本")

    def test_data_formatter(self):
        """Test data formatting"""
        # Test number formatting
        formatted_num = format_number(1234.567, precision=2, thousand_separator=True)
        self.assertEqual(formatted_num, "1,234.57")

        # Test currency formatting
        formatted_currency = format_currency(1234.567, "¥", 2)
        self.assertEqual(formatted_currency, "¥1,234.57")

        # Test phone number formatting
        formatted_phone = format_phone_number("13812345678", "+86", "international")
        self.assertEqual(formatted_phone, "+86 138 1234 5678")


if __name__ == '__main__':
    unittest.main()
```