crawlo 1.4.6__py3-none-any.whl → 1.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic. Click here for more details.
- crawlo/__init__.py +90 -89
- crawlo/__version__.py +1 -1
- crawlo/cli.py +75 -75
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +186 -186
- crawlo/commands/help.py +140 -138
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +379 -341
- crawlo/commands/startproject.py +460 -460
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +196 -196
- crawlo/config.py +320 -312
- crawlo/config_validator.py +277 -277
- crawlo/core/__init__.py +52 -52
- crawlo/core/engine.py +451 -438
- crawlo/core/processor.py +47 -47
- crawlo/core/scheduler.py +290 -291
- crawlo/crawler.py +698 -657
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +194 -194
- crawlo/downloader/__init__.py +280 -276
- crawlo/downloader/aiohttp_downloader.py +233 -233
- crawlo/downloader/cffi_downloader.py +250 -247
- crawlo/downloader/httpx_downloader.py +265 -259
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +425 -402
- crawlo/downloader/selenium_downloader.py +486 -472
- crawlo/event.py +45 -11
- crawlo/exceptions.py +215 -82
- crawlo/extension/__init__.py +65 -64
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +94 -94
- crawlo/extension/log_stats.py +70 -70
- crawlo/extension/logging_extension.py +53 -61
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/factories/__init__.py +27 -27
- crawlo/factories/base.py +68 -68
- crawlo/factories/crawler.py +104 -103
- crawlo/factories/registry.py +84 -84
- crawlo/factories/utils.py +135 -0
- crawlo/filters/__init__.py +170 -153
- crawlo/filters/aioredis_filter.py +348 -264
- crawlo/filters/memory_filter.py +261 -276
- crawlo/framework.py +306 -292
- crawlo/initialization/__init__.py +44 -44
- crawlo/initialization/built_in.py +391 -434
- crawlo/initialization/context.py +141 -141
- crawlo/initialization/core.py +240 -194
- crawlo/initialization/phases.py +230 -149
- crawlo/initialization/registry.py +143 -145
- crawlo/initialization/utils.py +49 -0
- crawlo/interfaces.py +23 -23
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +23 -23
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/logging/__init__.py +42 -46
- crawlo/logging/config.py +277 -197
- crawlo/logging/factory.py +175 -171
- crawlo/logging/manager.py +104 -112
- crawlo/middleware/__init__.py +87 -24
- crawlo/middleware/default_header.py +132 -132
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +142 -142
- crawlo/middleware/offsite.py +123 -123
- crawlo/middleware/proxy.py +209 -209
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +150 -150
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/mode_manager.py +287 -253
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +375 -379
- crawlo/network/response.py +569 -664
- crawlo/pipelines/__init__.py +53 -22
- crawlo/pipelines/base_pipeline.py +452 -0
- crawlo/pipelines/bloom_dedup_pipeline.py +146 -146
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +197 -197
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +105 -105
- crawlo/pipelines/mongo_pipeline.py +140 -132
- crawlo/pipelines/mysql_pipeline.py +469 -476
- crawlo/pipelines/pipeline_manager.py +100 -100
- crawlo/pipelines/redis_dedup_pipeline.py +155 -156
- crawlo/project.py +347 -347
- crawlo/queue/__init__.py +10 -0
- crawlo/queue/pqueue.py +38 -38
- crawlo/queue/queue_manager.py +591 -525
- crawlo/queue/redis_priority_queue.py +519 -370
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +284 -277
- crawlo/settings/setting_manager.py +219 -219
- crawlo/spider/__init__.py +657 -657
- crawlo/stats_collector.py +81 -81
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +138 -138
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +2 -4
- crawlo/templates/project/items.py.tmpl +13 -17
- crawlo/templates/project/middlewares.py.tmpl +38 -38
- crawlo/templates/project/pipelines.py.tmpl +35 -36
- crawlo/templates/project/settings.py.tmpl +109 -111
- crawlo/templates/project/settings_distributed.py.tmpl +156 -159
- crawlo/templates/project/settings_gentle.py.tmpl +170 -176
- crawlo/templates/project/settings_high_performance.py.tmpl +171 -177
- crawlo/templates/project/settings_minimal.py.tmpl +98 -100
- crawlo/templates/project/settings_simple.py.tmpl +168 -174
- crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
- crawlo/templates/run.py.tmpl +23 -23
- crawlo/templates/spider/spider.py.tmpl +32 -40
- crawlo/templates/spiders_init.py.tmpl +5 -10
- crawlo/tools/__init__.py +86 -189
- crawlo/tools/date_tools.py +289 -289
- crawlo/tools/distributed_coordinator.py +384 -384
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/tools/text_cleaner.py +232 -232
- crawlo/utils/__init__.py +50 -50
- crawlo/utils/batch_processor.py +276 -259
- crawlo/utils/config_manager.py +442 -0
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/db_helper.py +250 -250
- crawlo/utils/error_handler.py +410 -410
- crawlo/utils/fingerprint.py +121 -121
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/leak_detector.py +335 -0
- crawlo/utils/log.py +79 -79
- crawlo/utils/misc.py +81 -81
- crawlo/utils/mongo_connection_pool.py +157 -0
- crawlo/utils/mysql_connection_pool.py +197 -0
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_checker.py +91 -0
- crawlo/utils/redis_connection_pool.py +578 -388
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +278 -256
- crawlo/utils/request_serializer.py +225 -225
- crawlo/utils/resource_manager.py +337 -0
- crawlo/utils/selector_helper.py +137 -137
- crawlo/utils/singleton.py +70 -0
- crawlo/utils/spider_loader.py +201 -201
- crawlo/utils/text_helper.py +94 -94
- crawlo/utils/{url.py → url_utils.py} +39 -39
- crawlo-1.4.7.dist-info/METADATA +689 -0
- crawlo-1.4.7.dist-info/RECORD +347 -0
- examples/__init__.py +7 -7
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +217 -275
- tests/authenticated_proxy_example.py +110 -110
- tests/baidu_performance_test.py +108 -108
- tests/baidu_test.py +59 -59
- tests/bug_check_test.py +250 -250
- tests/cleaners_example.py +160 -160
- tests/comprehensive_framework_test.py +212 -212
- tests/comprehensive_test.py +81 -81
- tests/comprehensive_testing_summary.md +186 -186
- tests/config_validation_demo.py +142 -142
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/debug_configure.py +69 -69
- tests/debug_framework_logger.py +84 -84
- tests/debug_log_config.py +126 -126
- tests/debug_log_levels.py +63 -63
- tests/debug_pipelines.py +66 -66
- tests/detailed_log_test.py +233 -233
- tests/direct_selector_helper_test.py +96 -96
- tests/distributed_dedup_test.py +467 -0
- tests/distributed_test.py +66 -66
- tests/distributed_test_debug.py +76 -76
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/error_handling_example.py +171 -171
- tests/explain_mysql_update_behavior.py +76 -76
- tests/final_comprehensive_test.py +151 -151
- tests/final_log_test.py +260 -260
- tests/final_validation_test.py +182 -182
- tests/fix_log_test.py +142 -142
- tests/framework_performance_test.py +202 -202
- tests/log_buffering_test.py +111 -111
- tests/log_generation_timing_test.py +153 -153
- tests/monitor_redis_dedup.sh +72 -0
- tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -12
- tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -100
- tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -13
- tests/ofweek_scrapy/ofweek_scrapy/settings.py +84 -84
- tests/ofweek_scrapy/scrapy.cfg +11 -11
- tests/optimized_performance_test.py +211 -211
- tests/performance_comparison.py +244 -244
- tests/queue_blocking_test.py +113 -113
- tests/queue_test.py +89 -89
- tests/redis_key_validation_demo.py +130 -130
- tests/request_params_example.py +150 -150
- tests/response_improvements_example.py +144 -144
- tests/scrapy_comparison/ofweek_scrapy.py +138 -138
- tests/scrapy_comparison/scrapy_test.py +133 -133
- tests/simple_cli_test.py +55 -0
- tests/simple_command_test.py +119 -119
- tests/simple_crawlo_test.py +126 -126
- tests/simple_follow_test.py +38 -38
- tests/simple_log_test2.py +137 -137
- tests/simple_optimization_test.py +128 -128
- tests/simple_queue_type_test.py +41 -41
- tests/simple_response_selector_test.py +94 -94
- tests/simple_selector_helper_test.py +154 -154
- tests/simple_selector_test.py +207 -207
- tests/simple_spider_test.py +49 -49
- tests/simple_url_test.py +73 -73
- tests/simulate_mysql_update_test.py +139 -139
- tests/spider_log_timing_test.py +177 -177
- tests/test_advanced_tools.py +148 -148
- tests/test_all_commands.py +230 -230
- tests/test_all_pipeline_fingerprints.py +133 -133
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_asyncmy_usage.py +56 -56
- tests/test_batch_processor.py +178 -178
- tests/test_cleaners.py +54 -54
- tests/test_cli_arguments.py +119 -0
- tests/test_component_factory.py +174 -174
- tests/test_config_consistency.py +80 -80
- tests/test_config_merge.py +152 -152
- tests/test_config_validator.py +182 -182
- tests/test_controlled_spider_mixin.py +79 -79
- tests/test_crawler_process_import.py +38 -38
- tests/test_crawler_process_spider_modules.py +47 -47
- tests/test_crawlo_proxy_integration.py +114 -114
- tests/test_date_tools.py +123 -123
- tests/test_dedup_fix.py +220 -220
- tests/test_dedup_pipeline_consistency.py +124 -124
- tests/test_default_header_middleware.py +313 -313
- tests/test_distributed.py +65 -65
- tests/test_double_crawlo_fix.py +204 -204
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +272 -272
- tests/test_edge_cases.py +305 -305
- tests/test_encoding_core.py +56 -56
- tests/test_encoding_detection.py +126 -126
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_enhanced_error_handler_comprehensive.py +245 -245
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_factories.py +252 -252
- tests/test_factory_compatibility.py +196 -196
- tests/test_final_validation.py +153 -153
- tests/test_fingerprint_consistency.py +135 -135
- tests/test_fingerprint_simple.py +51 -51
- tests/test_get_component_logger.py +83 -83
- tests/test_hash_performance.py +99 -99
- tests/test_integration.py +169 -169
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_large_scale_helper.py +235 -235
- tests/test_logging_enhancements.py +374 -374
- tests/test_logging_final.py +184 -184
- tests/test_logging_integration.py +312 -312
- tests/test_logging_system.py +282 -282
- tests/test_middleware_debug.py +141 -141
- tests/test_mode_consistency.py +51 -51
- tests/test_multi_directory.py +67 -67
- tests/test_multiple_spider_modules.py +80 -80
- tests/test_mysql_pipeline_config.py +164 -164
- tests/test_mysql_pipeline_error.py +98 -98
- tests/test_mysql_pipeline_init_log.py +82 -82
- tests/test_mysql_pipeline_integration.py +132 -132
- tests/test_mysql_pipeline_refactor.py +143 -143
- tests/test_mysql_pipeline_refactor_simple.py +85 -85
- tests/test_mysql_pipeline_robustness.py +195 -195
- tests/test_mysql_pipeline_types.py +88 -88
- tests/test_mysql_update_columns.py +93 -93
- tests/test_offsite_middleware.py +244 -244
- tests/test_offsite_middleware_simple.py +203 -203
- tests/test_optimized_selector_naming.py +100 -100
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_performance_monitor.py +115 -115
- tests/test_pipeline_fingerprint_consistency.py +86 -86
- tests/test_priority_behavior.py +211 -211
- tests/test_priority_consistency.py +151 -151
- tests/test_priority_consistency_fixed.py +249 -249
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +217 -217
- tests/test_proxy_middleware_enhanced.py +212 -212
- tests/test_proxy_middleware_integration.py +142 -142
- tests/test_proxy_middleware_refactored.py +207 -207
- tests/test_proxy_only.py +83 -83
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_proxy_with_downloader.py +152 -152
- tests/test_queue_empty_check.py +41 -41
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +179 -179
- tests/test_queue_naming.py +154 -154
- tests/test_queue_type.py +106 -106
- tests/test_queue_type_redis_config_consistency.py +130 -130
- tests/test_random_headers_default.py +322 -322
- tests/test_random_headers_necessity.py +308 -308
- tests/test_random_user_agent.py +72 -72
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_redis_queue_name_fix.py +175 -175
- tests/test_redis_queue_type_fallback.py +129 -129
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_params.py +111 -111
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_follow.py +104 -104
- tests/test_response_improvements.py +152 -152
- tests/test_response_selector_methods.py +92 -92
- tests/test_response_url_methods.py +70 -70
- tests/test_response_urljoin.py +86 -86
- tests/test_retry_middleware.py +333 -333
- tests/test_retry_middleware_realistic.py +273 -273
- tests/test_scheduler.py +252 -252
- tests/test_scheduler_config_update.py +133 -133
- tests/test_scrapy_style_encoding.py +112 -112
- tests/test_selector_helper.py +100 -100
- tests/test_selector_optimizations.py +146 -146
- tests/test_simple_response.py +61 -61
- tests/test_spider_loader.py +49 -49
- tests/test_spider_loader_comprehensive.py +69 -69
- tests/test_spider_modules.py +84 -84
- tests/test_spiders/test_spider.py +9 -9
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +159 -159
- tests/test_user_agent_randomness.py +176 -176
- tests/test_user_agents.py +96 -96
- tests/untested_features_report.md +138 -138
- tests/verify_debug.py +51 -51
- tests/verify_distributed.py +117 -117
- tests/verify_log_fix.py +111 -111
- tests/verify_mysql_warnings.py +109 -109
- crawlo/logging/async_handler.py +0 -181
- crawlo/logging/monitor.py +0 -153
- crawlo/logging/sampler.py +0 -167
- crawlo/tools/authenticated_proxy.py +0 -241
- crawlo/tools/data_formatter.py +0 -226
- crawlo/tools/data_validator.py +0 -181
- crawlo/tools/encoding_converter.py +0 -127
- crawlo/tools/network_diagnostic.py +0 -365
- crawlo/tools/request_tools.py +0 -83
- crawlo/tools/retry_mechanism.py +0 -224
- crawlo/utils/env_config.py +0 -143
- crawlo/utils/large_scale_config.py +0 -287
- crawlo/utils/system.py +0 -11
- crawlo/utils/tools.py +0 -5
- crawlo-1.4.6.dist-info/METADATA +0 -329
- crawlo-1.4.6.dist-info/RECORD +0 -361
- tests/env_config_example.py +0 -134
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
- tests/test_authenticated_proxy.py +0 -142
- tests/test_comprehensive.py +0 -147
- tests/test_dynamic_downloaders_proxy.py +0 -125
- tests/test_dynamic_proxy.py +0 -93
- tests/test_dynamic_proxy_config.py +0 -147
- tests/test_dynamic_proxy_real.py +0 -110
- tests/test_env_config.py +0 -122
- tests/test_framework_env_usage.py +0 -104
- tests/test_large_scale_config.py +0 -113
- tests/test_proxy_api.py +0 -265
- tests/test_real_scenario_proxy.py +0 -196
- tests/tools_example.py +0 -261
- {crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/WHEEL +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/top_level.txt +0 -0
crawlo/utils/db_helper.py
CHANGED
|
@@ -1,251 +1,251 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
import json
|
|
3
|
-
import re
|
|
4
|
-
from typing import Any, Union, List, Dict, Tuple, Optional
|
|
5
|
-
from datetime import date, time, datetime
|
|
6
|
-
from enum import Enum
|
|
7
|
-
|
|
8
|
-
from crawlo.
|
|
9
|
-
|
|
10
|
-
logger = get_logger(__name__)
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
class SQLStatementType(Enum):
    """Kinds of SQL statement, each member's value being its own name."""

    # Single-row statements.
    INSERT = "INSERT"
    REPLACE = "REPLACE"
    UPDATE = "UPDATE"

    # Multi-row (batch) statements.
    BATCH_INSERT = "BATCH_INSERT"
    BATCH_REPLACE = "BATCH_REPLACE"
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
class SQLBuilder:
|
|
23
|
-
"""SQL语句构建器"""
|
|
24
|
-
|
|
25
|
-
@staticmethod
|
|
26
|
-
def format_value(value: Any) -> Union[str, int, float, None]:
|
|
27
|
-
"""
|
|
28
|
-
格式化 SQL 字段值,防止注入并兼容类型。
|
|
29
|
-
|
|
30
|
-
Args:
|
|
31
|
-
value (Any): 待处理的值
|
|
32
|
-
|
|
33
|
-
Returns:
|
|
34
|
-
str | int | float | None: 格式化后的值,None 表示 SQL 的 NULL
|
|
35
|
-
"""
|
|
36
|
-
if value is None:
|
|
37
|
-
return None
|
|
38
|
-
|
|
39
|
-
if isinstance(value, str):
|
|
40
|
-
return value.strip()
|
|
41
|
-
|
|
42
|
-
elif isinstance(value, (list, tuple, dict)):
|
|
43
|
-
try:
|
|
44
|
-
return json.dumps(value, ensure_ascii=False, default=str)
|
|
45
|
-
except Exception as e:
|
|
46
|
-
raise ValueError(f"Failed to serialize container to JSON: {value}, error: {e}")
|
|
47
|
-
|
|
48
|
-
elif isinstance(value, bool):
|
|
49
|
-
return int(value)
|
|
50
|
-
|
|
51
|
-
elif isinstance(value, (int, float)):
|
|
52
|
-
return value
|
|
53
|
-
|
|
54
|
-
elif isinstance(value, (date, time, datetime)):
|
|
55
|
-
return str(value)
|
|
56
|
-
|
|
57
|
-
else:
|
|
58
|
-
raise TypeError(f"Unsupported value type: {type(value)}, value: {value}")
|
|
59
|
-
|
|
60
|
-
@staticmethod
|
|
61
|
-
def list_to_tuple_str(datas: List[Any]) -> str:
|
|
62
|
-
"""
|
|
63
|
-
将列表转为 SQL 元组字符串格式。
|
|
64
|
-
|
|
65
|
-
Args:
|
|
66
|
-
datas (list): 输入列表
|
|
67
|
-
|
|
68
|
-
Returns:
|
|
69
|
-
str: 对应的元组字符串表示
|
|
70
|
-
"""
|
|
71
|
-
if not datas:
|
|
72
|
-
return "()"
|
|
73
|
-
if len(datas) == 1:
|
|
74
|
-
# 处理单元素元组,确保末尾有逗号
|
|
75
|
-
return f"({datas[0]},)"
|
|
76
|
-
return str(tuple(datas))
|
|
77
|
-
|
|
78
|
-
@staticmethod
|
|
79
|
-
def _build_key_value_pairs(data: Dict[str, Any]) -> Tuple[List[str], List[Any]]:
|
|
80
|
-
"""
|
|
81
|
-
构建键值对列表
|
|
82
|
-
|
|
83
|
-
Args:
|
|
84
|
-
data (dict): 数据字典
|
|
85
|
-
|
|
86
|
-
Returns:
|
|
87
|
-
tuple: (键列表, 值列表)
|
|
88
|
-
"""
|
|
89
|
-
keys = [f"`{key}`" for key in data.keys()]
|
|
90
|
-
values = [SQLBuilder.format_value(value) for value in data.values()]
|
|
91
|
-
return keys, values
|
|
92
|
-
|
|
93
|
-
@staticmethod
|
|
94
|
-
def _build_update_clause(update_columns: Union[Tuple, List]) -> str:
|
|
95
|
-
"""
|
|
96
|
-
构建更新子句,使用新的 MySQL 语法避免 VALUES() 函数弃用警告
|
|
97
|
-
|
|
98
|
-
Args:
|
|
99
|
-
update_columns (tuple or list): 更新列名
|
|
100
|
-
|
|
101
|
-
Returns:
|
|
102
|
-
str: 更新子句
|
|
103
|
-
"""
|
|
104
|
-
if not isinstance(update_columns, (tuple, list)):
|
|
105
|
-
update_columns = (update_columns,)
|
|
106
|
-
# 使用新的语法:INSERT ... VALUES (...) AS alias ... UPDATE ... alias.col
|
|
107
|
-
# 确保使用 excluded 别名而不是 VALUES() 函数
|
|
108
|
-
return ", ".join(f"`{key}`=`excluded`.`{key}`" for key in update_columns)
|
|
109
|
-
|
|
110
|
-
@staticmethod
|
|
111
|
-
def make_insert(
|
|
112
|
-
table: str,
|
|
113
|
-
data: Dict[str, Any],
|
|
114
|
-
auto_update: bool = False,
|
|
115
|
-
update_columns: Tuple = (),
|
|
116
|
-
insert_ignore: bool = False,
|
|
117
|
-
) -> str:
|
|
118
|
-
"""
|
|
119
|
-
生成 MySQL INSERT 或 REPLACE 语句。
|
|
120
|
-
|
|
121
|
-
Args:
|
|
122
|
-
table (str): 表名
|
|
123
|
-
data (dict): 表数据,JSON 格式字典
|
|
124
|
-
auto_update (bool): 是否使用 REPLACE INTO(完全覆盖已存在记录)
|
|
125
|
-
update_columns (tuple or list): 冲突时需更新的列名;指定后 auto_update 失效
|
|
126
|
-
insert_ignore (bool): 是否使用 INSERT IGNORE,忽略重复数据
|
|
127
|
-
|
|
128
|
-
Returns:
|
|
129
|
-
str: 生成的 SQL 语句
|
|
130
|
-
"""
|
|
131
|
-
keys, values = SQLBuilder._build_key_value_pairs(data)
|
|
132
|
-
keys_str = SQLBuilder.list_to_tuple_str(keys).replace("'", "")
|
|
133
|
-
values_str = SQLBuilder.list_to_tuple_str(values)
|
|
134
|
-
|
|
135
|
-
if update_columns:
|
|
136
|
-
update_clause = SQLBuilder._build_update_clause(update_columns)
|
|
137
|
-
ignore_flag = " IGNORE" if insert_ignore else ""
|
|
138
|
-
# 使用新的语法避免 VALUES() 函数弃用警告
|
|
139
|
-
sql = f"INSERT{ignore_flag} INTO `{table}` {keys_str} VALUES {values_str} AS `excluded` ON DUPLICATE KEY UPDATE {update_clause}"
|
|
140
|
-
|
|
141
|
-
elif auto_update:
|
|
142
|
-
sql = f"REPLACE INTO `{table}` {keys_str} VALUES {values_str}"
|
|
143
|
-
|
|
144
|
-
else:
|
|
145
|
-
ignore_flag = " IGNORE" if insert_ignore else ""
|
|
146
|
-
sql = f"INSERT{ignore_flag} INTO `{table}` {keys_str} VALUES {values_str}"
|
|
147
|
-
|
|
148
|
-
return sql.replace("None", "null")
|
|
149
|
-
|
|
150
|
-
@staticmethod
|
|
151
|
-
def make_update(
|
|
152
|
-
table: str,
|
|
153
|
-
data: Dict[str, Any],
|
|
154
|
-
condition: str,
|
|
155
|
-
) -> str:
|
|
156
|
-
"""
|
|
157
|
-
生成 MySQL UPDATE 语句。
|
|
158
|
-
|
|
159
|
-
Args:
|
|
160
|
-
table (str): 表名
|
|
161
|
-
data (dict): 更新字段的键值对,键为列名,值为新值
|
|
162
|
-
condition (str): WHERE 条件,如 "id = 1"
|
|
163
|
-
|
|
164
|
-
Returns:
|
|
165
|
-
str: 生成的 SQL 语句
|
|
166
|
-
"""
|
|
167
|
-
key_values: List[str] = []
|
|
168
|
-
for key, value in data.items():
|
|
169
|
-
formatted_value = SQLBuilder.format_value(value)
|
|
170
|
-
if isinstance(formatted_value, str):
|
|
171
|
-
key_values.append(f"`{key}`={repr(formatted_value)}")
|
|
172
|
-
elif formatted_value is None:
|
|
173
|
-
key_values.append(f"`{key}`=null")
|
|
174
|
-
else:
|
|
175
|
-
key_values.append(f"`{key}`={formatted_value}")
|
|
176
|
-
|
|
177
|
-
key_values_str = ", ".join(key_values)
|
|
178
|
-
sql = f"UPDATE `{table}` SET {key_values_str} WHERE {condition}"
|
|
179
|
-
return sql
|
|
180
|
-
|
|
181
|
-
@staticmethod
|
|
182
|
-
def make_batch(
|
|
183
|
-
table: str,
|
|
184
|
-
datas: List[Dict[str, Any]],
|
|
185
|
-
auto_update: bool = False,
|
|
186
|
-
update_columns: Tuple = (),
|
|
187
|
-
update_columns_value: Tuple = (),
|
|
188
|
-
) -> Optional[Tuple[str, List[List[Any]]]]:
|
|
189
|
-
"""
|
|
190
|
-
生成批量插入 SQL 及对应值列表。
|
|
191
|
-
|
|
192
|
-
Args:
|
|
193
|
-
table (str): 表名
|
|
194
|
-
datas (list of dict): 数据列表
|
|
195
|
-
auto_update (bool): 使用 REPLACE INTO 替代 INSERT
|
|
196
|
-
update_columns (tuple or list): 主键冲突时要更新的列名
|
|
197
|
-
update_columns_value (tuple): 更新列对应的固定值
|
|
198
|
-
|
|
199
|
-
Returns:
|
|
200
|
-
tuple[str, list[list]] | None: (SQL语句, 值列表);若数据为空则返回 None
|
|
201
|
-
"""
|
|
202
|
-
if not datas:
|
|
203
|
-
return None
|
|
204
|
-
|
|
205
|
-
# 提取所有唯一字段名
|
|
206
|
-
keys = list({key for data in datas for key in data})
|
|
207
|
-
values_list = []
|
|
208
|
-
|
|
209
|
-
for data in datas:
|
|
210
|
-
if not isinstance(data, dict):
|
|
211
|
-
continue # 跳过非字典数据
|
|
212
|
-
|
|
213
|
-
row = []
|
|
214
|
-
for key in keys:
|
|
215
|
-
raw_value = data.get(key)
|
|
216
|
-
try:
|
|
217
|
-
formatted_value = SQLBuilder.format_value(raw_value)
|
|
218
|
-
row.append(formatted_value)
|
|
219
|
-
except Exception as e:
|
|
220
|
-
logger.error(f"{key}: {raw_value} (类型: {type(raw_value)}) -> {e}")
|
|
221
|
-
values_list.append(row)
|
|
222
|
-
|
|
223
|
-
keys_str = ", ".join(f"`{key}`" for key in keys)
|
|
224
|
-
placeholders_str = ", ".join(["%s"] * len(keys))
|
|
225
|
-
|
|
226
|
-
if update_columns:
|
|
227
|
-
if not isinstance(update_columns, (tuple, list)):
|
|
228
|
-
update_columns = (update_columns,)
|
|
229
|
-
|
|
230
|
-
if update_columns_value:
|
|
231
|
-
# 当提供了固定值时,使用这些值进行更新
|
|
232
|
-
update_pairs = [
|
|
233
|
-
f"`{key}`={value}"
|
|
234
|
-
for key, value in zip(update_columns, update_columns_value)
|
|
235
|
-
]
|
|
236
|
-
else:
|
|
237
|
-
# 使用新的语法避免 VALUES() 函数弃用警告
|
|
238
|
-
# INSERT ... VALUES (...) AS excluded ... ON DUPLICATE KEY UPDATE col=excluded.col
|
|
239
|
-
update_pairs = [
|
|
240
|
-
f"`{key}`=`excluded`.`{key}`" for key in update_columns
|
|
241
|
-
]
|
|
242
|
-
update_clause = ", ".join(update_pairs)
|
|
243
|
-
sql = f"INSERT INTO `{table}` ({keys_str}) VALUES ({placeholders_str}) AS `excluded` ON DUPLICATE KEY UPDATE {update_clause}"
|
|
244
|
-
|
|
245
|
-
elif auto_update:
|
|
246
|
-
sql = f"REPLACE INTO `{table}` ({keys_str}) VALUES ({placeholders_str})"
|
|
247
|
-
|
|
248
|
-
else:
|
|
249
|
-
sql = f"INSERT IGNORE INTO `{table}` ({keys_str}) VALUES ({placeholders_str})"
|
|
250
|
-
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
import json
|
|
3
|
+
import re
|
|
4
|
+
from typing import Any, Union, List, Dict, Tuple, Optional
|
|
5
|
+
from datetime import date, time, datetime
|
|
6
|
+
from enum import Enum
|
|
7
|
+
|
|
8
|
+
from crawlo.logging import get_logger
|
|
9
|
+
|
|
10
|
+
logger = get_logger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class SQLStatementType(Enum):
    """Kinds of SQL statement, each member's value being its own name."""

    # Single-row statements.
    INSERT = "INSERT"
    REPLACE = "REPLACE"
    UPDATE = "UPDATE"

    # Multi-row (batch) statements.
    BATCH_INSERT = "BATCH_INSERT"
    BATCH_REPLACE = "BATCH_REPLACE"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class SQLBuilder:
    """Builds MySQL statement strings (INSERT / REPLACE / UPDATE / batch) from dicts.

    Table and column names are wrapped in backticks but are not otherwise
    validated, so they must come from trusted code. Single-row builders
    (``make_insert``, ``make_update``) inline values as SQL literals; the
    batch builder (``make_batch``) emits ``%s`` placeholders and returns the
    values separately.
    """

    # Characters that need a backslash escape inside a single-quoted MySQL
    # string literal. This mirrors the escape sequences MySQL recognises when
    # NO_BACKSLASH_ESCAPES is not enabled.
    # NOTE(review): servers running with NO_BACKSLASH_ESCAPES would need
    # quote-doubling instead - confirm the deployment's sql_mode.
    _STRING_ESCAPES = str.maketrans({
        "\0": "\\0",
        "\n": "\\n",
        "\r": "\\r",
        "\x1a": "\\Z",
        "\\": "\\\\",
        "'": "\\'",
        '"': '\\"',
    })

    @staticmethod
    def _to_sql_literal(value: Union[str, int, float, None]) -> str:
        """Render an already-formatted value as an inline SQL literal.

        Args:
            value: A value as returned by ``format_value``.

        Returns:
            str: ``null`` for None, a single-quoted and backslash-escaped
            literal for strings, and ``str(value)`` for numbers.
        """
        if value is None:
            return "null"
        if isinstance(value, str):
            return "'" + value.translate(SQLBuilder._STRING_ESCAPES) + "'"
        return str(value)

    @staticmethod
    def format_value(value: Any) -> Union[str, int, float, None]:
        """Normalise a Python value into a type that can be stored in a column.

        This only converts types; it performs no quoting or escaping.

        Args:
            value (Any): The raw field value.

        Returns:
            str | int | float | None: None stays None (SQL NULL); strings are
            stripped of surrounding whitespace; list/tuple/dict are serialised
            to JSON text; bool becomes 0/1; int/float are returned unchanged;
            date/time/datetime become their ``str()`` form.

        Raises:
            ValueError: If a container cannot be serialised to JSON.
            TypeError: If the value is of any other type.
        """
        if value is None:
            return None

        if isinstance(value, str):
            return value.strip()

        if isinstance(value, (list, tuple, dict)):
            try:
                # default=str stringifies nested objects json cannot encode.
                return json.dumps(value, ensure_ascii=False, default=str)
            except Exception as e:
                raise ValueError(f"Failed to serialize container to JSON: {value}, error: {e}")

        # bool must be tested before int because bool is a subclass of int.
        if isinstance(value, bool):
            return int(value)

        if isinstance(value, (int, float)):
            return value

        if isinstance(value, (date, time, datetime)):
            return str(value)

        raise TypeError(f"Unsupported value type: {type(value)}, value: {value}")

    @staticmethod
    def list_to_tuple_str(datas: List[Any]) -> str:
        """Render a list using Python tuple notation.

        Kept for backward compatibility. The output follows Python's tuple
        formatting, not SQL's: a single element is rendered unquoted with a
        trailing comma, and longer lists use ``repr`` of each element (so
        None appears as ``None``). ``make_insert`` no longer relies on it.

        Args:
            datas (list): Input list.

        Returns:
            str: ``"()"`` for an empty list, ``"(x,)"`` for one element,
            otherwise ``str(tuple(datas))``.
        """
        if not datas:
            return "()"
        if len(datas) == 1:
            return f"({datas[0]},)"
        return str(tuple(datas))

    @staticmethod
    def _build_key_value_pairs(data: Dict[str, Any]) -> Tuple[List[str], List[Any]]:
        """Split a row dict into backtick-quoted column names and formatted values.

        Args:
            data (dict): Column name -> raw value.

        Returns:
            tuple: (quoted column names, values passed through ``format_value``),
            both in the dict's iteration order.
        """
        keys = [f"`{key}`" for key in data]
        values = [SQLBuilder.format_value(value) for value in data.values()]
        return keys, values

    @staticmethod
    def _build_update_clause(update_columns: Union[Tuple, List]) -> str:
        """Build the assignment list for ``ON DUPLICATE KEY UPDATE``.

        Each column is assigned from the row alias ``excluded`` rather than
        the deprecated ``VALUES()`` function, so the enclosing INSERT must
        declare ``AS `excluded```.

        Args:
            update_columns (tuple or list): Column names to update; a single
                bare name is also accepted.

        Returns:
            str: Comma-separated ``col = excluded.col`` assignments.
        """
        if not isinstance(update_columns, (tuple, list)):
            update_columns = (update_columns,)
        return ", ".join(f"`{key}`=`excluded`.`{key}`" for key in update_columns)

    @staticmethod
    def make_insert(
        table: str,
        data: Dict[str, Any],
        auto_update: bool = False,
        update_columns: Tuple = (),
        insert_ignore: bool = False,
    ) -> str:
        """Build a single-row MySQL INSERT or REPLACE statement with inline values.

        Args:
            table (str): Table name.
            data (dict): Column name -> value for the row.
            auto_update (bool): Use REPLACE INTO (overwrites an existing row).
            update_columns (tuple or list): Columns to update on a duplicate
                key; when given, it takes precedence over ``auto_update``.
            insert_ignore (bool): Use INSERT IGNORE. It has no effect on the
                REPLACE form.

        Returns:
            str: The generated SQL statement.
        """
        keys, values = SQLBuilder._build_key_value_pairs(data)
        # Render each value individually. A textual None -> null substitution
        # over the finished statement would corrupt strings containing "None",
        # and Python tuple notation yields "(x,)" for a single column.
        keys_str = f"({', '.join(keys)})"
        values_str = f"({', '.join(SQLBuilder._to_sql_literal(v) for v in values)})"
        ignore_flag = " IGNORE" if insert_ignore else ""

        if update_columns:
            update_clause = SQLBuilder._build_update_clause(update_columns)
            # Row-alias syntax replaces the deprecated VALUES() function.
            return (
                f"INSERT{ignore_flag} INTO `{table}` {keys_str} VALUES {values_str} "
                f"AS `excluded` ON DUPLICATE KEY UPDATE {update_clause}"
            )

        if auto_update:
            return f"REPLACE INTO `{table}` {keys_str} VALUES {values_str}"

        return f"INSERT{ignore_flag} INTO `{table}` {keys_str} VALUES {values_str}"

    @staticmethod
    def make_update(
        table: str,
        data: Dict[str, Any],
        condition: str,
    ) -> str:
        """Build a MySQL UPDATE statement with inline values.

        Args:
            table (str): Table name.
            data (dict): Column name -> new value.
            condition (str): WHERE clause body, e.g. ``"id = 1"``. It is
                inserted verbatim, so it must not contain untrusted input.

        Returns:
            str: The generated SQL statement.
        """
        assignments = [
            f"`{key}`={SQLBuilder._to_sql_literal(SQLBuilder.format_value(value))}"
            for key, value in data.items()
        ]
        return f"UPDATE `{table}` SET {', '.join(assignments)} WHERE {condition}"

    @staticmethod
    def make_batch(
        table: str,
        datas: List[Dict[str, Any]],
        auto_update: bool = False,
        update_columns: Tuple = (),
        update_columns_value: Tuple = (),
    ) -> Optional[Tuple[str, List[List[Any]]]]:
        """Build a parameterised batch INSERT plus the per-row value lists.

        Args:
            table (str): Table name.
            datas (list of dict): Rows to insert; non-dict items are skipped.
            auto_update (bool): Use REPLACE INTO instead of INSERT.
            update_columns (tuple or list): Columns to update on a duplicate
                key; takes precedence over ``auto_update``.
            update_columns_value (tuple): Fixed values paired positionally
                with ``update_columns``. They are inserted verbatim into the
                SQL text, so callers must quote string values themselves.

        Returns:
            tuple[str, list[list]] | None: (SQL with ``%s`` placeholders, one
            value list per row), or None when there is no dict row to insert.
            Without ``update_columns`` or ``auto_update`` the statement is
            INSERT IGNORE.
        """
        # Filter first, so non-dict items can neither crash key collection
        # nor contribute bogus column names.
        rows = [row for row in datas or () if isinstance(row, dict)]
        if not rows:
            return None

        # Union of all column names in first-seen order (deterministic,
        # unlike a set).
        keys = list(dict.fromkeys(key for row in rows for key in row))

        values_list = []
        for data in rows:
            row = []
            for key in keys:
                raw_value = data.get(key)
                try:
                    formatted_value = SQLBuilder.format_value(raw_value)
                except Exception as e:
                    logger.error(f"{key}: {raw_value} (类型: {type(raw_value)}) -> {e}")
                    # Keep the row aligned with the placeholders; the
                    # unformattable value is stored as NULL.
                    formatted_value = None
                row.append(formatted_value)
            values_list.append(row)

        keys_str = ", ".join(f"`{key}`" for key in keys)
        placeholders_str = ", ".join(["%s"] * len(keys))

        if update_columns:
            if not isinstance(update_columns, (tuple, list)):
                update_columns = (update_columns,)

            if update_columns_value:
                # Fixed values supplied by the caller.
                update_pairs = [
                    f"`{key}`={value}"
                    for key, value in zip(update_columns, update_columns_value)
                ]
            else:
                # Row-alias syntax replaces the deprecated VALUES() function.
                update_pairs = [
                    f"`{key}`=`excluded`.`{key}`" for key in update_columns
                ]
            update_clause = ", ".join(update_pairs)
            sql = (
                f"INSERT INTO `{table}` ({keys_str}) VALUES ({placeholders_str}) "
                f"AS `excluded` ON DUPLICATE KEY UPDATE {update_clause}"
            )
        elif auto_update:
            sql = f"REPLACE INTO `{table}` ({keys_str}) VALUES ({placeholders_str})"
        else:
            sql = f"INSERT IGNORE INTO `{table}` ({keys_str}) VALUES ({placeholders_str})"

        return sql, values_list
|