crawlo-1.3.1-py3-none-any.whl → crawlo-1.3.3-py3-none-any.whl
This diff compares publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release.
This version of crawlo might be problematic.
- crawlo/__init__.py +63 -63
- crawlo/__version__.py +1 -1
- crawlo/cli.py +75 -75
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +151 -151
- crawlo/commands/help.py +138 -138
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +322 -314
- crawlo/commands/startproject.py +436 -436
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +196 -196
- crawlo/config.py +312 -312
- crawlo/config_validator.py +277 -277
- crawlo/core/__init__.py +2 -2
- crawlo/core/engine.py +365 -365
- crawlo/core/processor.py +40 -40
- crawlo/core/scheduler.py +256 -256
- crawlo/crawler.py +1166 -1168
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +194 -194
- crawlo/downloader/__init__.py +273 -273
- crawlo/downloader/aiohttp_downloader.py +226 -226
- crawlo/downloader/cffi_downloader.py +245 -245
- crawlo/downloader/httpx_downloader.py +259 -259
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +402 -402
- crawlo/downloader/selenium_downloader.py +472 -472
- crawlo/event.py +11 -11
- crawlo/exceptions.py +81 -81
- crawlo/extension/__init__.py +39 -39
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +57 -57
- crawlo/extension/log_stats.py +81 -81
- crawlo/extension/logging_extension.py +52 -45
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/filters/__init__.py +154 -154
- crawlo/filters/aioredis_filter.py +234 -234
- crawlo/filters/memory_filter.py +269 -269
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +21 -21
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +132 -132
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +135 -135
- crawlo/middleware/offsite.py +123 -115
- crawlo/middleware/proxy.py +386 -386
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +163 -163
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/middleware/simple_proxy.py +65 -65
- crawlo/mode_manager.py +187 -148
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +379 -379
- crawlo/network/response.py +359 -359
- crawlo/pipelines/__init__.py +21 -21
- crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +222 -222
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +115 -115
- crawlo/pipelines/mongo_pipeline.py +131 -131
- crawlo/pipelines/mysql_pipeline.py +318 -318
- crawlo/pipelines/pipeline_manager.py +75 -75
- crawlo/pipelines/redis_dedup_pipeline.py +166 -166
- crawlo/project.py +325 -297
- crawlo/queue/pqueue.py +37 -37
- crawlo/queue/queue_manager.py +379 -379
- crawlo/queue/redis_priority_queue.py +306 -306
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +225 -225
- crawlo/settings/setting_manager.py +198 -198
- crawlo/spider/__init__.py +639 -639
- crawlo/stats_collector.py +59 -59
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +30 -30
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +3 -3
- crawlo/templates/project/items.py.tmpl +17 -17
- crawlo/templates/project/middlewares.py.tmpl +118 -118
- crawlo/templates/project/pipelines.py.tmpl +96 -96
- crawlo/templates/project/settings.py.tmpl +266 -261
- crawlo/templates/project/settings_distributed.py.tmpl +179 -174
- crawlo/templates/project/settings_gentle.py.tmpl +60 -95
- crawlo/templates/project/settings_high_performance.py.tmpl +130 -125
- crawlo/templates/project/settings_minimal.py.tmpl +34 -29
- crawlo/templates/project/settings_simple.py.tmpl +101 -96
- crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
- crawlo/templates/run.py.tmpl +38 -47
- crawlo/templates/spider/spider.py.tmpl +143 -143
- crawlo/tools/__init__.py +200 -200
- crawlo/tools/anti_crawler.py +268 -268
- crawlo/tools/authenticated_proxy.py +240 -240
- crawlo/tools/data_formatter.py +225 -225
- crawlo/tools/data_validator.py +180 -180
- crawlo/tools/date_tools.py +289 -289
- crawlo/tools/distributed_coordinator.py +388 -388
- crawlo/tools/encoding_converter.py +127 -127
- crawlo/tools/request_tools.py +82 -82
- crawlo/tools/retry_mechanism.py +224 -224
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/tools/text_cleaner.py +232 -232
- crawlo/utils/__init__.py +34 -34
- crawlo/utils/batch_processor.py +259 -259
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/db_helper.py +343 -343
- crawlo/utils/enhanced_error_handler.py +356 -356
- crawlo/utils/env_config.py +142 -142
- crawlo/utils/error_handler.py +123 -123
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_config.py +286 -286
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/log.py +199 -146
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_connection_pool.py +351 -351
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +267 -267
- crawlo/utils/request_serializer.py +218 -218
- crawlo/utils/spider_loader.py +61 -61
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +4 -4
- crawlo/utils/url.py +39 -39
- {crawlo-1.3.1.dist-info → crawlo-1.3.3.dist-info}/METADATA +1020 -1020
- crawlo-1.3.3.dist-info/RECORD +219 -0
- examples/__init__.py +7 -7
- tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +81 -81
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +275 -275
- tests/authenticated_proxy_example.py +107 -107
- tests/cleaners_example.py +160 -160
- tests/config_validation_demo.py +142 -142
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/debug_pipelines.py +66 -66
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/env_config_example.py +133 -133
- tests/error_handling_example.py +171 -171
- tests/redis_key_validation_demo.py +130 -130
- tests/request_params_example.py +150 -150
- tests/response_improvements_example.py +144 -144
- tests/test_advanced_tools.py +148 -148
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_authenticated_proxy.py +141 -141
- tests/test_cleaners.py +54 -54
- tests/test_comprehensive.py +146 -146
- tests/test_config_consistency.py +80 -80
- tests/test_config_merge.py +152 -152
- tests/test_config_validator.py +182 -182
- tests/test_crawlo_proxy_integration.py +108 -108
- tests/test_date_tools.py +123 -123
- tests/test_default_header_middleware.py +158 -158
- tests/test_distributed.py +65 -65
- tests/test_double_crawlo_fix.py +207 -207
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +268 -268
- tests/test_dynamic_downloaders_proxy.py +124 -124
- tests/test_dynamic_proxy.py +92 -92
- tests/test_dynamic_proxy_config.py +146 -146
- tests/test_dynamic_proxy_real.py +109 -109
- tests/test_edge_cases.py +303 -303
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_env_config.py +121 -121
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_final_validation.py +153 -153
- tests/test_framework_env_usage.py +103 -103
- tests/test_integration.py +169 -169
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_mode_consistency.py +51 -51
- tests/test_offsite_middleware.py +221 -221
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_proxy_api.py +264 -264
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +121 -121
- tests/test_proxy_middleware_enhanced.py +216 -216
- tests/test_proxy_middleware_integration.py +136 -136
- tests/test_proxy_middleware_refactored.py +184 -184
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +176 -176
- tests/test_random_user_agent.py +72 -72
- tests/test_real_scenario_proxy.py +195 -195
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_params.py +111 -111
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_improvements.py +152 -152
- tests/test_retry_middleware.py +241 -241
- tests/test_scheduler.py +252 -252
- tests/test_scheduler_config_update.py +133 -133
- tests/test_simple_response.py +61 -61
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +159 -159
- tests/test_user_agents.py +96 -96
- tests/tools_example.py +260 -260
- tests/verify_distributed.py +117 -117
- crawlo-1.3.1.dist-info/RECORD +0 -219
- {crawlo-1.3.1.dist-info → crawlo-1.3.3.dist-info}/WHEEL +0 -0
- {crawlo-1.3.1.dist-info → crawlo-1.3.3.dist-info}/entry_points.txt +0 -0
- {crawlo-1.3.1.dist-info → crawlo-1.3.3.dist-info}/top_level.txt +0 -0
@@ -1,71 +1,71 @@

This hunk removes and re-adds lines 1-70 with identical content and keeps line 71 as unchanged context (it corresponds to tests/test_request_serialization.py, listed above with +70 -70). Reconstructed file content:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
测试 Request 序列化问题修复
"""
import pickle
import sys
sys.path.insert(0, "..")

from crawlo.network.request import Request
from crawlo.core.scheduler import Scheduler
from unittest.mock import Mock

# 模拟一个带 logger 的 Request
class TestRequest(Request):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # 添加一个 logger 属性模拟问题
        from crawlo.utils.log import get_logger
        self.logger = get_logger("test_request")
        self.meta['spider_logger'] = get_logger("spider_logger")

def test_request_serialization():
    """测试 Request 序列化"""
    print("🔍 测试 Request 序列化修复...")

    # 创建一个带 logger 的请求
    request = TestRequest(
        url="https://example.com",
        meta={"test": "data"}  # 移除 Mock 对象
    )

    print(f" 📦 原始请求: {request}")
    print(f" 请求有 logger: {hasattr(request, 'logger')}")
    print(f" meta 有 logger: {'spider_logger' in request.meta}")

    # 创建一个 mock scheduler 来测试清理
    class MockScheduler:
        def _deep_clean_loggers(self, request):
            return Scheduler._deep_clean_loggers(self, request)
        def _remove_logger_from_dict(self, d):
            return Scheduler._remove_logger_from_dict(self, d)

    scheduler = MockScheduler()

    # 执行清理
    scheduler._deep_clean_loggers(request)

    print(f" 🧹 清理后有 logger: {hasattr(request, 'logger')}")
    print(f" 🧹 清理后 meta 有 logger: {'spider_logger' in request.meta}")

    # 测试序列化
    try:
        serialized = pickle.dumps(request)
        print(f" 序列化成功,大小: {len(serialized)} bytes")

        # 测试反序列化
        deserialized = pickle.loads(serialized)
        print(f" 反序列化成功: {deserialized}")
        return True

    except Exception as e:
        print(f" 序列化失败: {e}")
        return False

if __name__ == "__main__":
    success = test_request_serialization()
    if success:
        print("Request 序列化修复成功!")
    else:
        print("❌ 序列化问题仍未解决")
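The test above delegates to Scheduler._deep_clean_loggers and Scheduler._remove_logger_from_dict, whose implementations are not shown in this diff. As a rough, self-contained sketch of that kind of cleaning pass (hypothetical names and stand-in objects, not crawlo's actual code), stripping logger-like references before pickling might look like this:

import pickle

def deep_clean_loggers(obj):
    """Illustrative cleaning pass: drop logger-like references before pickling."""
    # Remove a direct `logger` attribute if the object carries one.
    if hasattr(obj, "logger"):
        try:
            delattr(obj, "logger")
        except AttributeError:
            pass
    # Remove logger-like entries from a `meta` dict, if present.
    meta = getattr(obj, "meta", None)
    if isinstance(meta, dict):
        for key in [k for k in meta if "logger" in str(k).lower()]:
            meta.pop(key, None)

class FakeRequest:
    """Hypothetical stand-in for a request object; crawlo's Request is not used here."""
    def __init__(self):
        self.url = "https://example.com"
        # Lambdas cannot be pickled, so they stand in for unpicklable logger handles.
        self.logger = lambda: None
        self.meta = {"test": "data", "spider_logger": lambda: None}

req = FakeRequest()
deep_clean_loggers(req)
assert not hasattr(req, "logger")
assert "spider_logger" not in req.meta
print(f"picklable after cleaning: {len(pickle.dumps(req))} bytes")

The lambdas here merely stand in for real logger handles; the point the test checks is that once such references are stripped, pickle.dumps on the request succeeds.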