crawlo 1.4.7__py3-none-any.whl → 1.4.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic. Click here for more details.
- crawlo/__init__.py +90 -90
- crawlo/__version__.py +1 -1
- crawlo/cli.py +75 -75
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +186 -186
- crawlo/commands/help.py +140 -140
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +379 -379
- crawlo/commands/startproject.py +460 -460
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +196 -196
- crawlo/config.py +320 -320
- crawlo/config_validator.py +277 -277
- crawlo/core/__init__.py +52 -52
- crawlo/core/engine.py +451 -451
- crawlo/core/processor.py +47 -47
- crawlo/core/scheduler.py +290 -290
- crawlo/crawler.py +698 -698
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +194 -194
- crawlo/downloader/__init__.py +280 -280
- crawlo/downloader/aiohttp_downloader.py +233 -233
- crawlo/downloader/cffi_downloader.py +250 -250
- crawlo/downloader/httpx_downloader.py +265 -265
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +425 -425
- crawlo/downloader/selenium_downloader.py +486 -486
- crawlo/event.py +45 -45
- crawlo/exceptions.py +214 -214
- crawlo/extension/__init__.py +64 -64
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +94 -94
- crawlo/extension/log_stats.py +70 -70
- crawlo/extension/logging_extension.py +53 -53
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/factories/__init__.py +27 -27
- crawlo/factories/base.py +68 -68
- crawlo/factories/crawler.py +104 -104
- crawlo/factories/registry.py +84 -84
- crawlo/factories/utils.py +134 -134
- crawlo/filters/__init__.py +170 -170
- crawlo/filters/aioredis_filter.py +347 -347
- crawlo/filters/memory_filter.py +261 -261
- crawlo/framework.py +306 -306
- crawlo/initialization/__init__.py +44 -44
- crawlo/initialization/built_in.py +391 -391
- crawlo/initialization/context.py +141 -141
- crawlo/initialization/core.py +240 -240
- crawlo/initialization/phases.py +229 -229
- crawlo/initialization/registry.py +143 -143
- crawlo/initialization/utils.py +48 -48
- crawlo/interfaces.py +23 -23
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +23 -23
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/logging/__init__.py +42 -42
- crawlo/logging/config.py +280 -276
- crawlo/logging/factory.py +175 -175
- crawlo/logging/manager.py +104 -104
- crawlo/middleware/__init__.py +87 -87
- crawlo/middleware/default_header.py +132 -132
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +142 -142
- crawlo/middleware/offsite.py +123 -123
- crawlo/middleware/proxy.py +209 -209
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +150 -150
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/mode_manager.py +287 -287
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +408 -376
- crawlo/network/response.py +598 -569
- crawlo/pipelines/__init__.py +52 -52
- crawlo/pipelines/base_pipeline.py +452 -452
- crawlo/pipelines/bloom_dedup_pipeline.py +145 -146
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +196 -197
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +104 -105
- crawlo/pipelines/mongo_pipeline.py +140 -139
- crawlo/pipelines/mysql_pipeline.py +468 -469
- crawlo/pipelines/pipeline_manager.py +100 -100
- crawlo/pipelines/redis_dedup_pipeline.py +155 -155
- crawlo/project.py +347 -347
- crawlo/queue/__init__.py +9 -9
- crawlo/queue/pqueue.py +38 -38
- crawlo/queue/queue_manager.py +591 -591
- crawlo/queue/redis_priority_queue.py +518 -518
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +287 -284
- crawlo/settings/setting_manager.py +219 -219
- crawlo/spider/__init__.py +658 -657
- crawlo/stats_collector.py +81 -81
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +138 -138
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +1 -1
- crawlo/templates/project/items.py.tmpl +13 -13
- crawlo/templates/project/middlewares.py.tmpl +38 -38
- crawlo/templates/project/pipelines.py.tmpl +35 -35
- crawlo/templates/project/settings.py.tmpl +113 -109
- crawlo/templates/project/settings_distributed.py.tmpl +160 -156
- crawlo/templates/project/settings_gentle.py.tmpl +174 -170
- crawlo/templates/project/settings_high_performance.py.tmpl +175 -171
- crawlo/templates/project/settings_minimal.py.tmpl +102 -98
- crawlo/templates/project/settings_simple.py.tmpl +172 -168
- crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
- crawlo/templates/run.py.tmpl +23 -23
- crawlo/templates/spider/spider.py.tmpl +32 -32
- crawlo/templates/spiders_init.py.tmpl +4 -4
- crawlo/tools/__init__.py +86 -86
- crawlo/tools/date_tools.py +289 -289
- crawlo/tools/distributed_coordinator.py +384 -384
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/tools/text_cleaner.py +232 -232
- crawlo/utils/__init__.py +74 -50
- crawlo/utils/batch_processor.py +276 -276
- crawlo/utils/config_manager.py +442 -442
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/db_helper.py +250 -250
- crawlo/utils/encoding_helper.py +190 -0
- crawlo/utils/error_handler.py +410 -410
- crawlo/utils/fingerprint.py +121 -121
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/leak_detector.py +335 -335
- crawlo/utils/misc.py +81 -81
- crawlo/utils/mongo_connection_pool.py +157 -157
- crawlo/utils/mysql_connection_pool.py +197 -197
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_checker.py +90 -90
- crawlo/utils/redis_connection_pool.py +578 -578
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +278 -278
- crawlo/utils/request_serializer.py +225 -225
- crawlo/utils/resource_manager.py +337 -337
- crawlo/utils/response_helper.py +113 -0
- crawlo/utils/selector_helper.py +138 -137
- crawlo/utils/singleton.py +69 -69
- crawlo/utils/spider_loader.py +201 -201
- crawlo/utils/text_helper.py +94 -94
- {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/METADATA +831 -689
- crawlo-1.4.8.dist-info/RECORD +347 -0
- examples/__init__.py +7 -7
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +217 -217
- tests/authenticated_proxy_example.py +110 -110
- tests/baidu_performance_test.py +108 -108
- tests/baidu_test.py +59 -59
- tests/bug_check_test.py +250 -250
- tests/cleaners_example.py +160 -160
- tests/comprehensive_framework_test.py +212 -212
- tests/comprehensive_test.py +81 -81
- tests/comprehensive_testing_summary.md +186 -186
- tests/config_validation_demo.py +142 -142
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/debug_configure.py +69 -69
- tests/debug_framework_logger.py +84 -84
- tests/debug_log_config.py +126 -126
- tests/debug_log_levels.py +63 -63
- tests/debug_pipelines.py +66 -66
- tests/detailed_log_test.py +233 -233
- tests/direct_selector_helper_test.py +96 -96
- tests/distributed_dedup_test.py +467 -467
- tests/distributed_test.py +66 -66
- tests/distributed_test_debug.py +76 -76
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/error_handling_example.py +171 -171
- tests/explain_mysql_update_behavior.py +76 -76
- tests/final_comprehensive_test.py +151 -151
- tests/final_log_test.py +260 -260
- tests/final_validation_test.py +182 -182
- tests/fix_log_test.py +142 -142
- tests/framework_performance_test.py +202 -202
- tests/log_buffering_test.py +111 -111
- tests/log_generation_timing_test.py +153 -153
- tests/monitor_redis_dedup.sh +72 -72
- tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -12
- tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -100
- tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -13
- tests/ofweek_scrapy/ofweek_scrapy/settings.py +84 -84
- tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
- tests/ofweek_scrapy/scrapy.cfg +11 -11
- tests/optimized_performance_test.py +211 -211
- tests/performance_comparison.py +244 -244
- tests/queue_blocking_test.py +113 -113
- tests/queue_test.py +89 -89
- tests/redis_key_validation_demo.py +130 -130
- tests/request_params_example.py +150 -150
- tests/response_improvements_example.py +144 -144
- tests/scrapy_comparison/ofweek_scrapy.py +138 -138
- tests/scrapy_comparison/scrapy_test.py +133 -133
- tests/simple_cli_test.py +54 -54
- tests/simple_command_test.py +119 -119
- tests/simple_crawlo_test.py +126 -126
- tests/simple_follow_test.py +38 -38
- tests/simple_log_test2.py +137 -137
- tests/simple_optimization_test.py +128 -128
- tests/simple_queue_type_test.py +41 -41
- tests/simple_response_selector_test.py +94 -94
- tests/simple_selector_helper_test.py +154 -154
- tests/simple_selector_test.py +207 -207
- tests/simple_spider_test.py +49 -49
- tests/simple_url_test.py +73 -73
- tests/simulate_mysql_update_test.py +139 -139
- tests/spider_log_timing_test.py +177 -177
- tests/test_advanced_tools.py +148 -148
- tests/test_all_commands.py +230 -230
- tests/test_all_pipeline_fingerprints.py +133 -133
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_asyncmy_usage.py +56 -56
- tests/test_batch_processor.py +178 -178
- tests/test_cleaners.py +54 -54
- tests/test_cli_arguments.py +118 -118
- tests/test_component_factory.py +174 -174
- tests/test_config_consistency.py +80 -80
- tests/test_config_merge.py +152 -152
- tests/test_config_validator.py +182 -182
- tests/test_controlled_spider_mixin.py +79 -79
- tests/test_crawler_process_import.py +38 -38
- tests/test_crawler_process_spider_modules.py +47 -47
- tests/test_crawlo_proxy_integration.py +114 -114
- tests/test_date_tools.py +123 -123
- tests/test_dedup_fix.py +220 -220
- tests/test_dedup_pipeline_consistency.py +124 -124
- tests/test_default_header_middleware.py +313 -313
- tests/test_distributed.py +65 -65
- tests/test_double_crawlo_fix.py +204 -204
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +272 -272
- tests/test_edge_cases.py +305 -305
- tests/test_encoding_core.py +56 -56
- tests/test_encoding_detection.py +126 -126
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_enhanced_error_handler_comprehensive.py +245 -245
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_factories.py +252 -252
- tests/test_factory_compatibility.py +196 -196
- tests/test_final_validation.py +153 -153
- tests/test_fingerprint_consistency.py +135 -135
- tests/test_fingerprint_simple.py +51 -51
- tests/test_get_component_logger.py +83 -83
- tests/test_hash_performance.py +99 -99
- tests/test_integration.py +169 -169
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_large_scale_helper.py +235 -235
- tests/test_logging_enhancements.py +374 -374
- tests/test_logging_final.py +184 -184
- tests/test_logging_integration.py +312 -312
- tests/test_logging_system.py +282 -282
- tests/test_middleware_debug.py +141 -141
- tests/test_mode_consistency.py +51 -51
- tests/test_multi_directory.py +67 -67
- tests/test_multiple_spider_modules.py +80 -80
- tests/test_mysql_pipeline_config.py +164 -164
- tests/test_mysql_pipeline_error.py +98 -98
- tests/test_mysql_pipeline_init_log.py +82 -82
- tests/test_mysql_pipeline_integration.py +132 -132
- tests/test_mysql_pipeline_refactor.py +143 -143
- tests/test_mysql_pipeline_refactor_simple.py +85 -85
- tests/test_mysql_pipeline_robustness.py +195 -195
- tests/test_mysql_pipeline_types.py +88 -88
- tests/test_mysql_update_columns.py +93 -93
- tests/test_offsite_middleware.py +244 -244
- tests/test_offsite_middleware_simple.py +203 -203
- tests/test_optimized_selector_naming.py +100 -100
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_performance_monitor.py +115 -115
- tests/test_pipeline_fingerprint_consistency.py +86 -86
- tests/test_priority_behavior.py +211 -211
- tests/test_priority_consistency.py +151 -151
- tests/test_priority_consistency_fixed.py +249 -249
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +217 -217
- tests/test_proxy_middleware_enhanced.py +212 -212
- tests/test_proxy_middleware_integration.py +142 -142
- tests/test_proxy_middleware_refactored.py +207 -207
- tests/test_proxy_only.py +83 -83
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_proxy_with_downloader.py +152 -152
- tests/test_queue_empty_check.py +41 -41
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +179 -179
- tests/test_queue_naming.py +154 -154
- tests/test_queue_type.py +106 -106
- tests/test_queue_type_redis_config_consistency.py +130 -130
- tests/test_random_headers_default.py +322 -322
- tests/test_random_headers_necessity.py +308 -308
- tests/test_random_user_agent.py +72 -72
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_redis_queue_name_fix.py +175 -175
- tests/test_redis_queue_type_fallback.py +129 -129
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_params.py +111 -111
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_follow.py +104 -104
- tests/test_response_improvements.py +152 -152
- tests/test_response_selector_methods.py +92 -92
- tests/test_response_url_methods.py +70 -70
- tests/test_response_urljoin.py +86 -86
- tests/test_retry_middleware.py +333 -333
- tests/test_retry_middleware_realistic.py +273 -273
- tests/test_scheduler.py +252 -252
- tests/test_scheduler_config_update.py +133 -133
- tests/test_scrapy_style_encoding.py +112 -112
- tests/test_selector_helper.py +100 -100
- tests/test_selector_optimizations.py +146 -146
- tests/test_simple_response.py +61 -61
- tests/test_spider_loader.py +49 -49
- tests/test_spider_loader_comprehensive.py +69 -69
- tests/test_spider_modules.py +84 -84
- tests/test_spiders/test_spider.py +9 -9
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +159 -159
- tests/test_user_agent_randomness.py +176 -176
- tests/test_user_agents.py +96 -96
- tests/untested_features_report.md +138 -138
- tests/verify_debug.py +51 -51
- tests/verify_distributed.py +117 -117
- tests/verify_log_fix.py +111 -111
- tests/verify_mysql_warnings.py +109 -109
- crawlo/utils/log.py +0 -80
- crawlo/utils/url_utils.py +0 -40
- crawlo-1.4.7.dist-info/RECORD +0 -347
- {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
- {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
crawlo/utils/fingerprint.py
CHANGED
|
@@ -1,122 +1,122 @@
|
|
|
1
|
-
#!/usr/bin/python
|
|
2
|
-
# -*- coding:UTF-8 -*-
|
|
3
|
-
"""
|
|
4
|
-
统一指纹生成工具
|
|
5
|
-
================
|
|
6
|
-
提供一致的指纹生成方法,确保在框架各组件中生成的指纹保持一致。
|
|
7
|
-
|
|
8
|
-
特点:
|
|
9
|
-
- 算法统一: 所有指纹生成使用相同的算法(SHA256)
|
|
10
|
-
- 格式一致: 相同数据在不同场景下生成相同指纹
|
|
11
|
-
- 高性能: 优化的实现确保高效生成
|
|
12
|
-
- 易扩展: 支持不同类型数据的指纹生成
|
|
13
|
-
"""
|
|
14
|
-
|
|
15
|
-
import hashlib
|
|
16
|
-
from typing import Any, Dict
|
|
17
|
-
from w3lib.url import canonicalize_url
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
def generate_data_fingerprint(data: Any) -> str:
|
|
21
|
-
"""
|
|
22
|
-
生成数据指纹
|
|
23
|
-
|
|
24
|
-
基于数据内容生成唯一指纹,用于去重判断。
|
|
25
|
-
使用 SHA256 算法确保安全性。
|
|
26
|
-
|
|
27
|
-
:param data: 要生成指纹的数据(支持 dict, Item, namedtuple, str 等类型)
|
|
28
|
-
:return: 数据指纹(hex string)
|
|
29
|
-
"""
|
|
30
|
-
# 将数据转换为可序列化的字典
|
|
31
|
-
if hasattr(data, 'to_dict'):
|
|
32
|
-
# 支持 Item 等实现了 to_dict 方法的对象
|
|
33
|
-
data_dict = data.to_dict()
|
|
34
|
-
elif hasattr(data, '_asdict'):
|
|
35
|
-
# 支持 namedtuple 对象
|
|
36
|
-
data_dict = data._asdict()
|
|
37
|
-
elif isinstance(data, dict):
|
|
38
|
-
data_dict = data
|
|
39
|
-
else:
|
|
40
|
-
# 其他类型转换为字符串处理
|
|
41
|
-
data_dict = {'__data__': str(data)}
|
|
42
|
-
|
|
43
|
-
# 对字典进行排序以确保一致性
|
|
44
|
-
sorted_items = sorted(data_dict.items())
|
|
45
|
-
|
|
46
|
-
# 生成指纹字符串
|
|
47
|
-
fingerprint_string = '|'.join([f"{k}={v}" for k, v in sorted_items if v is not None])
|
|
48
|
-
|
|
49
|
-
# 使用 SHA256 生成固定长度的指纹
|
|
50
|
-
return hashlib.sha256(fingerprint_string.encode('utf-8')).hexdigest()
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
def generate_request_fingerprint(
|
|
54
|
-
method: str,
|
|
55
|
-
url: str,
|
|
56
|
-
body: bytes = b'',
|
|
57
|
-
headers: Dict[str, str] = None
|
|
58
|
-
) -> str:
|
|
59
|
-
"""
|
|
60
|
-
生成请求指纹
|
|
61
|
-
|
|
62
|
-
基于请求的方法、URL、body 和可选的 headers 生成唯一指纹。
|
|
63
|
-
使用 SHA256 算法确保安全性。
|
|
64
|
-
|
|
65
|
-
:param method: HTTP方法
|
|
66
|
-
:param url: 请求URL
|
|
67
|
-
:param body: 请求体
|
|
68
|
-
:param headers: 请求头
|
|
69
|
-
:return: 请求指纹(hex string)
|
|
70
|
-
"""
|
|
71
|
-
hash_func = hashlib.sha256()
|
|
72
|
-
|
|
73
|
-
# 基本字段
|
|
74
|
-
hash_func.update(method.encode('utf-8'))
|
|
75
|
-
hash_func.update(canonicalize_url(url).encode('utf-8'))
|
|
76
|
-
hash_func.update(body or b'')
|
|
77
|
-
|
|
78
|
-
# 可选的 headers
|
|
79
|
-
if headers:
|
|
80
|
-
# 对 headers 进行排序以确保一致性
|
|
81
|
-
sorted_headers = sorted(headers.items())
|
|
82
|
-
for name, value in sorted_headers:
|
|
83
|
-
hash_func.update(f"{name}:{value}".encode('utf-8'))
|
|
84
|
-
|
|
85
|
-
return hash_func.hexdigest()
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
class FingerprintGenerator:
|
|
89
|
-
"""指纹生成器类"""
|
|
90
|
-
|
|
91
|
-
@staticmethod
|
|
92
|
-
def item_fingerprint(item) -> str:
|
|
93
|
-
"""
|
|
94
|
-
生成数据项指纹
|
|
95
|
-
|
|
96
|
-
:param item: 数据项
|
|
97
|
-
:return: 指纹字符串
|
|
98
|
-
"""
|
|
99
|
-
return generate_data_fingerprint(item)
|
|
100
|
-
|
|
101
|
-
@staticmethod
|
|
102
|
-
def request_fingerprint(method: str, url: str, body: bytes = b'', headers: Dict[str, str] = None) -> str:
|
|
103
|
-
"""
|
|
104
|
-
生成请求指纹
|
|
105
|
-
|
|
106
|
-
:param method: HTTP方法
|
|
107
|
-
:param url: 请求URL
|
|
108
|
-
:param body: 请求体
|
|
109
|
-
:param headers: 请求头
|
|
110
|
-
:return: 指纹字符串
|
|
111
|
-
"""
|
|
112
|
-
return generate_request_fingerprint(method, url, body, headers)
|
|
113
|
-
|
|
114
|
-
@staticmethod
|
|
115
|
-
def data_fingerprint(data: Any) -> str:
|
|
116
|
-
"""
|
|
117
|
-
生成通用数据指纹
|
|
118
|
-
|
|
119
|
-
:param data: 任意数据
|
|
120
|
-
:return: 指纹字符串
|
|
121
|
-
"""
|
|
1
|
+
#!/usr/bin/python
|
|
2
|
+
# -*- coding:UTF-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
统一指纹生成工具
|
|
5
|
+
================
|
|
6
|
+
提供一致的指纹生成方法,确保在框架各组件中生成的指纹保持一致。
|
|
7
|
+
|
|
8
|
+
特点:
|
|
9
|
+
- 算法统一: 所有指纹生成使用相同的算法(SHA256)
|
|
10
|
+
- 格式一致: 相同数据在不同场景下生成相同指纹
|
|
11
|
+
- 高性能: 优化的实现确保高效生成
|
|
12
|
+
- 易扩展: 支持不同类型数据的指纹生成
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import hashlib
|
|
16
|
+
from typing import Any, Dict
|
|
17
|
+
from w3lib.url import canonicalize_url
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def generate_data_fingerprint(data: Any) -> str:
    """
    Generate a SHA-256 fingerprint for a piece of data (used for de-duplication).

    The input is first reduced to a mapping:

    - objects exposing ``to_dict()`` (e.g. Item) contribute its result;
    - objects exposing ``_asdict()`` (e.g. namedtuple) contribute its result;
    - ``dict`` instances are used as-is;
    - anything else is wrapped as ``{'__data__': str(data)}``.

    The entries are ordered by key, entries whose value is ``None`` are
    skipped, and the remaining ``key=value`` pairs are joined with ``|``
    and hashed.

    NOTE: values are rendered with ``str()`` and the ``|`` / ``=`` delimiters
    are not escaped, so nested dicts are insertion-order sensitive and values
    containing the delimiters can collide. The serialization format is kept
    unchanged on purpose so that previously generated fingerprints stay valid.

    :param data: data to fingerprint (dict, Item, namedtuple, str, ...)
    :return: hexadecimal SHA-256 digest
    """
    # Reduce the input to a mapping that can be serialized.
    if hasattr(data, 'to_dict'):
        data_dict = data.to_dict()
    elif hasattr(data, '_asdict'):
        data_dict = data._asdict()
    elif isinstance(data, dict):
        data_dict = data
    else:
        data_dict = {'__data__': str(data)}

    # Order the entries so that insertion order does not affect the result.
    try:
        sorted_items = sorted(data_dict.items())
    except TypeError:
        # Keys of different types (e.g. int and str) cannot be compared with
        # each other; fall back to a deterministic order based on the key's
        # type name and string form instead of failing.
        sorted_items = sorted(
            data_dict.items(),
            key=lambda pair: (type(pair[0]).__name__, str(pair[0])),
        )

    # Entries whose value is None do not take part in the fingerprint.
    fingerprint_string = '|'.join(
        f"{k}={v}" for k, v in sorted_items if v is not None
    )

    return hashlib.sha256(fingerprint_string.encode('utf-8')).hexdigest()
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def generate_request_fingerprint(
        method: str,
        url: str,
        body: bytes = b'',
        headers: Dict[str, str] = None
) -> str:
    """
    Generate a SHA-256 fingerprint for a request.

    The digest is fed, in this order, with the method, the canonicalized URL
    (``w3lib.url.canonicalize_url``), the body and, when supplied, every
    ``name:value`` header pair in sorted order.

    NOTE: the fields are hashed back to back without separators and the
    method is hashed exactly as given (no case normalization). This layout is
    kept unchanged so that previously generated fingerprints stay valid.

    :param method: HTTP method
    :param url: request URL
    :param body: request body; a ``str`` body is encoded as UTF-8 before hashing
    :param headers: optional request headers to include in the fingerprint
    :return: hexadecimal SHA-256 digest
    """
    # hashlib only accepts bytes-like input, so a text body has to be encoded
    # first instead of raising TypeError inside update().
    if isinstance(body, str):
        body = body.encode('utf-8')

    hash_func = hashlib.sha256()

    # Mandatory parts of the fingerprint.
    hash_func.update(method.encode('utf-8'))
    hash_func.update(canonicalize_url(url).encode('utf-8'))
    hash_func.update(body or b'')

    # Optional headers, sorted so that their original order is irrelevant.
    if headers:
        for name, value in sorted(headers.items()):
            hash_func.update(f"{name}:{value}".encode('utf-8'))

    return hash_func.hexdigest()
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class FingerprintGenerator:
    """Namespace of static helpers that delegate to the module-level fingerprint functions."""

    @staticmethod
    def item_fingerprint(item) -> str:
        """
        Fingerprint a data item.

        :param item: the item to fingerprint
        :return: fingerprint string
        """
        fingerprint = generate_data_fingerprint(item)
        return fingerprint

    @staticmethod
    def request_fingerprint(
            method: str,
            url: str,
            body: bytes = b'',
            headers: Dict[str, str] = None
    ) -> str:
        """
        Fingerprint a request.

        :param method: HTTP method
        :param url: request URL
        :param body: request body
        :param headers: request headers
        :return: fingerprint string
        """
        fingerprint = generate_request_fingerprint(method, url, body, headers)
        return fingerprint

    @staticmethod
    def data_fingerprint(data: Any) -> str:
        """
        Fingerprint arbitrary data.

        :param data: any data
        :return: fingerprint string
        """
        fingerprint = generate_data_fingerprint(data)
        return fingerprint
|
crawlo/utils/func_tools.py
CHANGED
|
@@ -1,82 +1,82 @@
|
|
|
1
|
-
# -*- coding: UTF-8 -*-
|
|
2
|
-
from typing import Union, AsyncGenerator, Generator
|
|
3
|
-
from inspect import isgenerator, isasyncgen
|
|
4
|
-
from crawlo import Response, Request, Item
|
|
5
|
-
from crawlo.exceptions import TransformTypeError
|
|
6
|
-
|
|
7
|
-
T = Union[Request, Item]
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
async def transform(
|
|
11
|
-
func: Union[Generator[T, None, None], AsyncGenerator[T, None]],
|
|
12
|
-
response: Response
|
|
13
|
-
) -> AsyncGenerator[Union[T, Exception], None]:
|
|
14
|
-
"""
|
|
15
|
-
转换回调函数的输出为统一异步生成器
|
|
16
|
-
|
|
17
|
-
Args:
|
|
18
|
-
func: 同步或异步生成器函数
|
|
19
|
-
response: 当前响应对象
|
|
20
|
-
|
|
21
|
-
Yields:
|
|
22
|
-
Union[T, Exception]: 生成请求/Item或异常对象
|
|
23
|
-
|
|
24
|
-
Raises:
|
|
25
|
-
TransformTypeError: 当输入类型不符合要求时
|
|
26
|
-
"""
|
|
27
|
-
|
|
28
|
-
def _set_meta(obj: T) -> T:
|
|
29
|
-
"""统一设置请求的depth元数据"""
|
|
30
|
-
if isinstance(obj, Request):
|
|
31
|
-
obj.meta.setdefault('depth', response.meta.get('depth', 0))
|
|
32
|
-
return obj
|
|
33
|
-
|
|
34
|
-
# 类型检查前置
|
|
35
|
-
if not (isgenerator(func) or isasyncgen(func)):
|
|
36
|
-
raise TransformTypeError(
|
|
37
|
-
f'Callback must return generator or async generator, got {type(func).__name__}'
|
|
38
|
-
)
|
|
39
|
-
|
|
40
|
-
try:
|
|
41
|
-
if isgenerator(func):
|
|
42
|
-
# 同步生成器处理
|
|
43
|
-
for item in func:
|
|
44
|
-
yield _set_meta(item)
|
|
45
|
-
else:
|
|
46
|
-
# 异步生成器处理
|
|
47
|
-
async for item in func:
|
|
48
|
-
yield _set_meta(item)
|
|
49
|
-
|
|
50
|
-
except Exception as e:
|
|
51
|
-
yield e
|
|
52
|
-
|
|
53
|
-
# #!/usr/bin/python
|
|
54
|
-
# # -*- coding:UTF-8 -*-
|
|
55
|
-
# from typing import Callable, Union
|
|
56
|
-
# from inspect import isgenerator, isasyncgen
|
|
57
|
-
# from crawlo import Response, Request, Item
|
|
58
|
-
# from crawlo.exceptions import TransformTypeError
|
|
59
|
-
#
|
|
60
|
-
#
|
|
61
|
-
# T = Union[Request, Item]
|
|
62
|
-
#
|
|
63
|
-
#
|
|
64
|
-
# async def transform(func: Callable, response: Response):
|
|
65
|
-
# def set_request(t: T) -> T:
|
|
66
|
-
# if isinstance(t, Request):
|
|
67
|
-
# t.meta['depth'] = response.meta['depth']
|
|
68
|
-
# return t
|
|
69
|
-
# try:
|
|
70
|
-
# if isgenerator(func):
|
|
71
|
-
# for f in func:
|
|
72
|
-
# yield set_request(f)
|
|
73
|
-
# elif isasyncgen(func):
|
|
74
|
-
# async for f in func:
|
|
75
|
-
# yield set_request(f)
|
|
76
|
-
# else:
|
|
77
|
-
# raise TransformTypeError(
|
|
78
|
-
# f'callback return type error: {type(func)} must be `generator` or `async generator`'
|
|
79
|
-
# )
|
|
80
|
-
# except Exception as exp:
|
|
81
|
-
# yield exp
|
|
82
|
-
|
|
1
|
+
# -*- coding: UTF-8 -*-
|
|
2
|
+
from typing import Union, AsyncGenerator, Generator
|
|
3
|
+
from inspect import isgenerator, isasyncgen
|
|
4
|
+
from crawlo import Response, Request, Item
|
|
5
|
+
from crawlo.exceptions import TransformTypeError
|
|
6
|
+
|
|
7
|
+
T = Union[Request, Item]
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
async def transform(
        func: Union[Generator[T, None, None], AsyncGenerator[T, None]],
        response: Response
) -> AsyncGenerator[Union[T, Exception], None]:
    """
    Expose a callback's output as a single asynchronous generator.

    Every produced Request gets a ``depth`` entry in its ``meta`` (copied from
    ``response.meta``, defaulting to 0) unless it already has one. An exception
    raised while iterating the callback output is yielded as a value instead of
    being propagated.

    Args:
        func: the generator or async generator returned by the callback
        response: the response currently being processed

    Yields:
        Union[T, Exception]: a Request/Item, or the exception that interrupted iteration

    Raises:
        TransformTypeError: if ``func`` is neither a generator nor an async
            generator (raised when iteration of the result starts, because
            this function is itself an async generator)
    """
    is_sync_gen = isgenerator(func)

    # Validate the callback output before touching it.
    if not is_sync_gen and not isasyncgen(func):
        raise TransformTypeError(
            f'Callback must return generator or async generator, got {type(func).__name__}'
        )

    def _inherit_depth(produced: T) -> T:
        """Give a Request the response's depth unless it already carries one."""
        if isinstance(produced, Request):
            produced.meta.setdefault('depth', response.meta.get('depth', 0))
        return produced

    try:
        if is_sync_gen:
            # Plain generator: drain it with a regular loop.
            for produced in func:
                yield _inherit_depth(produced)
        else:
            # Async generator: drain it with an async loop.
            async for produced in func:
                yield _inherit_depth(produced)
    except Exception as exc:
        # Hand the failure to the consumer as a value.
        yield exc
|
|
52
|
+
|
|
53
|
+
# #!/usr/bin/python
|
|
54
|
+
# # -*- coding:UTF-8 -*-
|
|
55
|
+
# from typing import Callable, Union
|
|
56
|
+
# from inspect import isgenerator, isasyncgen
|
|
57
|
+
# from crawlo import Response, Request, Item
|
|
58
|
+
# from crawlo.exceptions import TransformTypeError
|
|
59
|
+
#
|
|
60
|
+
#
|
|
61
|
+
# T = Union[Request, Item]
|
|
62
|
+
#
|
|
63
|
+
#
|
|
64
|
+
# async def transform(func: Callable, response: Response):
|
|
65
|
+
# def set_request(t: T) -> T:
|
|
66
|
+
# if isinstance(t, Request):
|
|
67
|
+
# t.meta['depth'] = response.meta['depth']
|
|
68
|
+
# return t
|
|
69
|
+
# try:
|
|
70
|
+
# if isgenerator(func):
|
|
71
|
+
# for f in func:
|
|
72
|
+
# yield set_request(f)
|
|
73
|
+
# elif isasyncgen(func):
|
|
74
|
+
# async for f in func:
|
|
75
|
+
# yield set_request(f)
|
|
76
|
+
# else:
|
|
77
|
+
# raise TransformTypeError(
|
|
78
|
+
# f'callback return type error: {type(func)} must be `generator` or `async generator`'
|
|
79
|
+
# )
|
|
80
|
+
# except Exception as exp:
|
|
81
|
+
# yield exp
|
|
82
|
+
|