crawlo-1.4.6-py3-none-any.whl → crawlo-1.4.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic.
- crawlo/__init__.py +90 -89
- crawlo/__version__.py +1 -1
- crawlo/cli.py +75 -75
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +186 -186
- crawlo/commands/help.py +140 -138
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +379 -341
- crawlo/commands/startproject.py +460 -460
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +196 -196
- crawlo/config.py +320 -312
- crawlo/config_validator.py +277 -277
- crawlo/core/__init__.py +52 -52
- crawlo/core/engine.py +451 -438
- crawlo/core/processor.py +47 -47
- crawlo/core/scheduler.py +290 -291
- crawlo/crawler.py +698 -657
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +194 -194
- crawlo/downloader/__init__.py +280 -276
- crawlo/downloader/aiohttp_downloader.py +233 -233
- crawlo/downloader/cffi_downloader.py +250 -247
- crawlo/downloader/httpx_downloader.py +265 -259
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +425 -402
- crawlo/downloader/selenium_downloader.py +486 -472
- crawlo/event.py +45 -11
- crawlo/exceptions.py +215 -82
- crawlo/extension/__init__.py +65 -64
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +94 -94
- crawlo/extension/log_stats.py +70 -70
- crawlo/extension/logging_extension.py +53 -61
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/factories/__init__.py +27 -27
- crawlo/factories/base.py +68 -68
- crawlo/factories/crawler.py +104 -103
- crawlo/factories/registry.py +84 -84
- crawlo/factories/utils.py +135 -0
- crawlo/filters/__init__.py +170 -153
- crawlo/filters/aioredis_filter.py +348 -264
- crawlo/filters/memory_filter.py +261 -276
- crawlo/framework.py +306 -292
- crawlo/initialization/__init__.py +44 -44
- crawlo/initialization/built_in.py +391 -434
- crawlo/initialization/context.py +141 -141
- crawlo/initialization/core.py +240 -194
- crawlo/initialization/phases.py +230 -149
- crawlo/initialization/registry.py +143 -145
- crawlo/initialization/utils.py +49 -0
- crawlo/interfaces.py +23 -23
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +23 -23
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/logging/__init__.py +42 -46
- crawlo/logging/config.py +277 -197
- crawlo/logging/factory.py +175 -171
- crawlo/logging/manager.py +104 -112
- crawlo/middleware/__init__.py +87 -24
- crawlo/middleware/default_header.py +132 -132
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +142 -142
- crawlo/middleware/offsite.py +123 -123
- crawlo/middleware/proxy.py +209 -209
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +150 -150
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/mode_manager.py +287 -253
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +375 -379
- crawlo/network/response.py +569 -664
- crawlo/pipelines/__init__.py +53 -22
- crawlo/pipelines/base_pipeline.py +452 -0
- crawlo/pipelines/bloom_dedup_pipeline.py +146 -146
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +197 -197
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +105 -105
- crawlo/pipelines/mongo_pipeline.py +140 -132
- crawlo/pipelines/mysql_pipeline.py +469 -476
- crawlo/pipelines/pipeline_manager.py +100 -100
- crawlo/pipelines/redis_dedup_pipeline.py +155 -156
- crawlo/project.py +347 -347
- crawlo/queue/__init__.py +10 -0
- crawlo/queue/pqueue.py +38 -38
- crawlo/queue/queue_manager.py +591 -525
- crawlo/queue/redis_priority_queue.py +519 -370
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +284 -277
- crawlo/settings/setting_manager.py +219 -219
- crawlo/spider/__init__.py +657 -657
- crawlo/stats_collector.py +81 -81
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +138 -138
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +2 -4
- crawlo/templates/project/items.py.tmpl +13 -17
- crawlo/templates/project/middlewares.py.tmpl +38 -38
- crawlo/templates/project/pipelines.py.tmpl +35 -36
- crawlo/templates/project/settings.py.tmpl +109 -111
- crawlo/templates/project/settings_distributed.py.tmpl +156 -159
- crawlo/templates/project/settings_gentle.py.tmpl +170 -176
- crawlo/templates/project/settings_high_performance.py.tmpl +171 -177
- crawlo/templates/project/settings_minimal.py.tmpl +98 -100
- crawlo/templates/project/settings_simple.py.tmpl +168 -174
- crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
- crawlo/templates/run.py.tmpl +23 -23
- crawlo/templates/spider/spider.py.tmpl +32 -40
- crawlo/templates/spiders_init.py.tmpl +5 -10
- crawlo/tools/__init__.py +86 -189
- crawlo/tools/date_tools.py +289 -289
- crawlo/tools/distributed_coordinator.py +384 -384
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/tools/text_cleaner.py +232 -232
- crawlo/utils/__init__.py +50 -50
- crawlo/utils/batch_processor.py +276 -259
- crawlo/utils/config_manager.py +442 -0
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/db_helper.py +250 -250
- crawlo/utils/error_handler.py +410 -410
- crawlo/utils/fingerprint.py +121 -121
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/leak_detector.py +335 -0
- crawlo/utils/log.py +79 -79
- crawlo/utils/misc.py +81 -81
- crawlo/utils/mongo_connection_pool.py +157 -0
- crawlo/utils/mysql_connection_pool.py +197 -0
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_checker.py +91 -0
- crawlo/utils/redis_connection_pool.py +578 -388
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +278 -256
- crawlo/utils/request_serializer.py +225 -225
- crawlo/utils/resource_manager.py +337 -0
- crawlo/utils/selector_helper.py +137 -137
- crawlo/utils/singleton.py +70 -0
- crawlo/utils/spider_loader.py +201 -201
- crawlo/utils/text_helper.py +94 -94
- crawlo/utils/{url.py → url_utils.py} +39 -39
- crawlo-1.4.7.dist-info/METADATA +689 -0
- crawlo-1.4.7.dist-info/RECORD +347 -0
- examples/__init__.py +7 -7
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +217 -275
- tests/authenticated_proxy_example.py +110 -110
- tests/baidu_performance_test.py +108 -108
- tests/baidu_test.py +59 -59
- tests/bug_check_test.py +250 -250
- tests/cleaners_example.py +160 -160
- tests/comprehensive_framework_test.py +212 -212
- tests/comprehensive_test.py +81 -81
- tests/comprehensive_testing_summary.md +186 -186
- tests/config_validation_demo.py +142 -142
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/debug_configure.py +69 -69
- tests/debug_framework_logger.py +84 -84
- tests/debug_log_config.py +126 -126
- tests/debug_log_levels.py +63 -63
- tests/debug_pipelines.py +66 -66
- tests/detailed_log_test.py +233 -233
- tests/direct_selector_helper_test.py +96 -96
- tests/distributed_dedup_test.py +467 -0
- tests/distributed_test.py +66 -66
- tests/distributed_test_debug.py +76 -76
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/error_handling_example.py +171 -171
- tests/explain_mysql_update_behavior.py +76 -76
- tests/final_comprehensive_test.py +151 -151
- tests/final_log_test.py +260 -260
- tests/final_validation_test.py +182 -182
- tests/fix_log_test.py +142 -142
- tests/framework_performance_test.py +202 -202
- tests/log_buffering_test.py +111 -111
- tests/log_generation_timing_test.py +153 -153
- tests/monitor_redis_dedup.sh +72 -0
- tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -12
- tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -100
- tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -13
- tests/ofweek_scrapy/ofweek_scrapy/settings.py +84 -84
- tests/ofweek_scrapy/scrapy.cfg +11 -11
- tests/optimized_performance_test.py +211 -211
- tests/performance_comparison.py +244 -244
- tests/queue_blocking_test.py +113 -113
- tests/queue_test.py +89 -89
- tests/redis_key_validation_demo.py +130 -130
- tests/request_params_example.py +150 -150
- tests/response_improvements_example.py +144 -144
- tests/scrapy_comparison/ofweek_scrapy.py +138 -138
- tests/scrapy_comparison/scrapy_test.py +133 -133
- tests/simple_cli_test.py +55 -0
- tests/simple_command_test.py +119 -119
- tests/simple_crawlo_test.py +126 -126
- tests/simple_follow_test.py +38 -38
- tests/simple_log_test2.py +137 -137
- tests/simple_optimization_test.py +128 -128
- tests/simple_queue_type_test.py +41 -41
- tests/simple_response_selector_test.py +94 -94
- tests/simple_selector_helper_test.py +154 -154
- tests/simple_selector_test.py +207 -207
- tests/simple_spider_test.py +49 -49
- tests/simple_url_test.py +73 -73
- tests/simulate_mysql_update_test.py +139 -139
- tests/spider_log_timing_test.py +177 -177
- tests/test_advanced_tools.py +148 -148
- tests/test_all_commands.py +230 -230
- tests/test_all_pipeline_fingerprints.py +133 -133
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_asyncmy_usage.py +56 -56
- tests/test_batch_processor.py +178 -178
- tests/test_cleaners.py +54 -54
- tests/test_cli_arguments.py +119 -0
- tests/test_component_factory.py +174 -174
- tests/test_config_consistency.py +80 -80
- tests/test_config_merge.py +152 -152
- tests/test_config_validator.py +182 -182
- tests/test_controlled_spider_mixin.py +79 -79
- tests/test_crawler_process_import.py +38 -38
- tests/test_crawler_process_spider_modules.py +47 -47
- tests/test_crawlo_proxy_integration.py +114 -114
- tests/test_date_tools.py +123 -123
- tests/test_dedup_fix.py +220 -220
- tests/test_dedup_pipeline_consistency.py +124 -124
- tests/test_default_header_middleware.py +313 -313
- tests/test_distributed.py +65 -65
- tests/test_double_crawlo_fix.py +204 -204
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +272 -272
- tests/test_edge_cases.py +305 -305
- tests/test_encoding_core.py +56 -56
- tests/test_encoding_detection.py +126 -126
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_enhanced_error_handler_comprehensive.py +245 -245
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_factories.py +252 -252
- tests/test_factory_compatibility.py +196 -196
- tests/test_final_validation.py +153 -153
- tests/test_fingerprint_consistency.py +135 -135
- tests/test_fingerprint_simple.py +51 -51
- tests/test_get_component_logger.py +83 -83
- tests/test_hash_performance.py +99 -99
- tests/test_integration.py +169 -169
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_large_scale_helper.py +235 -235
- tests/test_logging_enhancements.py +374 -374
- tests/test_logging_final.py +184 -184
- tests/test_logging_integration.py +312 -312
- tests/test_logging_system.py +282 -282
- tests/test_middleware_debug.py +141 -141
- tests/test_mode_consistency.py +51 -51
- tests/test_multi_directory.py +67 -67
- tests/test_multiple_spider_modules.py +80 -80
- tests/test_mysql_pipeline_config.py +164 -164
- tests/test_mysql_pipeline_error.py +98 -98
- tests/test_mysql_pipeline_init_log.py +82 -82
- tests/test_mysql_pipeline_integration.py +132 -132
- tests/test_mysql_pipeline_refactor.py +143 -143
- tests/test_mysql_pipeline_refactor_simple.py +85 -85
- tests/test_mysql_pipeline_robustness.py +195 -195
- tests/test_mysql_pipeline_types.py +88 -88
- tests/test_mysql_update_columns.py +93 -93
- tests/test_offsite_middleware.py +244 -244
- tests/test_offsite_middleware_simple.py +203 -203
- tests/test_optimized_selector_naming.py +100 -100
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_performance_monitor.py +115 -115
- tests/test_pipeline_fingerprint_consistency.py +86 -86
- tests/test_priority_behavior.py +211 -211
- tests/test_priority_consistency.py +151 -151
- tests/test_priority_consistency_fixed.py +249 -249
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +217 -217
- tests/test_proxy_middleware_enhanced.py +212 -212
- tests/test_proxy_middleware_integration.py +142 -142
- tests/test_proxy_middleware_refactored.py +207 -207
- tests/test_proxy_only.py +83 -83
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_proxy_with_downloader.py +152 -152
- tests/test_queue_empty_check.py +41 -41
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +179 -179
- tests/test_queue_naming.py +154 -154
- tests/test_queue_type.py +106 -106
- tests/test_queue_type_redis_config_consistency.py +130 -130
- tests/test_random_headers_default.py +322 -322
- tests/test_random_headers_necessity.py +308 -308
- tests/test_random_user_agent.py +72 -72
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_redis_queue_name_fix.py +175 -175
- tests/test_redis_queue_type_fallback.py +129 -129
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_params.py +111 -111
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_follow.py +104 -104
- tests/test_response_improvements.py +152 -152
- tests/test_response_selector_methods.py +92 -92
- tests/test_response_url_methods.py +70 -70
- tests/test_response_urljoin.py +86 -86
- tests/test_retry_middleware.py +333 -333
- tests/test_retry_middleware_realistic.py +273 -273
- tests/test_scheduler.py +252 -252
- tests/test_scheduler_config_update.py +133 -133
- tests/test_scrapy_style_encoding.py +112 -112
- tests/test_selector_helper.py +100 -100
- tests/test_selector_optimizations.py +146 -146
- tests/test_simple_response.py +61 -61
- tests/test_spider_loader.py +49 -49
- tests/test_spider_loader_comprehensive.py +69 -69
- tests/test_spider_modules.py +84 -84
- tests/test_spiders/test_spider.py +9 -9
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +159 -159
- tests/test_user_agent_randomness.py +176 -176
- tests/test_user_agents.py +96 -96
- tests/untested_features_report.md +138 -138
- tests/verify_debug.py +51 -51
- tests/verify_distributed.py +117 -117
- tests/verify_log_fix.py +111 -111
- tests/verify_mysql_warnings.py +109 -109
- crawlo/logging/async_handler.py +0 -181
- crawlo/logging/monitor.py +0 -153
- crawlo/logging/sampler.py +0 -167
- crawlo/tools/authenticated_proxy.py +0 -241
- crawlo/tools/data_formatter.py +0 -226
- crawlo/tools/data_validator.py +0 -181
- crawlo/tools/encoding_converter.py +0 -127
- crawlo/tools/network_diagnostic.py +0 -365
- crawlo/tools/request_tools.py +0 -83
- crawlo/tools/retry_mechanism.py +0 -224
- crawlo/utils/env_config.py +0 -143
- crawlo/utils/large_scale_config.py +0 -287
- crawlo/utils/system.py +0 -11
- crawlo/utils/tools.py +0 -5
- crawlo-1.4.6.dist-info/METADATA +0 -329
- crawlo-1.4.6.dist-info/RECORD +0 -361
- tests/env_config_example.py +0 -134
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
- tests/test_authenticated_proxy.py +0 -142
- tests/test_comprehensive.py +0 -147
- tests/test_dynamic_downloaders_proxy.py +0 -125
- tests/test_dynamic_proxy.py +0 -93
- tests/test_dynamic_proxy_config.py +0 -147
- tests/test_dynamic_proxy_real.py +0 -110
- tests/test_env_config.py +0 -122
- tests/test_framework_env_usage.py +0 -104
- tests/test_large_scale_config.py +0 -113
- tests/test_proxy_api.py +0 -265
- tests/test_real_scenario_proxy.py +0 -196
- tests/tools_example.py +0 -261
- {crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/WHEEL +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/top_level.txt +0 -0
crawlo/utils/spider_loader.py
CHANGED
@@ -1,202 +1,202 @@
 import importlib
 import traceback
 import warnings
 from collections import defaultdict
 from pathlib import Path
 from typing import List, Type, Dict, Any

 from crawlo.interfaces import ISpiderLoader
 from crawlo.settings.setting_manager import SettingManager
 from crawlo.spider import Spider
 from crawlo.network.request import Request
-from crawlo.
+from crawlo.logging import get_logger

 logger = get_logger(__name__)


 class SpiderLoaderProtocol:
     """Protocol for spider loader"""

     @classmethod
     def from_settings(cls, settings: SettingManager) -> 'SpiderLoaderProtocol':
         """Create spider loader from settings"""
         return cls(settings)

     def load(self, spider_name: str) -> Type[Spider]:
         """Load a spider by name"""
         raise NotImplementedError

     def list(self) -> List[str]:
         """List all available spider names"""
         raise NotImplementedError

     def find_by_request(self, request: 'Request') -> List[str]:
         """Find spider names that can handle the given request"""
         raise NotImplementedError


 class SpiderLoader(ISpiderLoader):
     """Spider loader, responsible for discovering and loading spiders"""

     def __init__(self, settings: SettingManager = None):
         # If settings are provided, read the configuration from them
         if settings is not None:
             self.spider_modules = settings.get('SPIDER_MODULES', [])
             self.warn_only = settings.get('SPIDER_LOADER_WARN_ONLY', False)
         else:
             # Default configuration
             self.spider_modules = []
             self.warn_only = False

         self._spiders: Dict[str, Type[Spider]] = {}
         self._found: Dict[str, List[tuple]] = defaultdict(list)
         self._load_all_spiders()

     @classmethod
     def from_settings(cls, settings: SettingManager) -> 'SpiderLoader':
         """Create a SpiderLoader instance from settings"""
         return cls(settings)

     def _check_name_duplicates(self) -> None:
         """Check for duplicate spider names"""
         dupes = []
         for name, locations in self._found.items():
             if len(locations) > 1:
                 dupes.extend([
                     f" {cls} named {name!r} (in {mod})"
                     for mod, cls in locations
                 ])

         if dupes:
             dupes_string = "\n\n".join(dupes)
             warnings.warn(
                 "There are several spiders with the same name:\n\n"
                 f"{dupes_string}\n\n This can cause unexpected behavior.",
                 category=UserWarning,
             )

     def _load_spiders(self, module) -> None:
         """Load every spider defined in a module"""
         for attr_name in dir(module):
             attr_value = getattr(module, attr_name)
             if (isinstance(attr_value, type) and
                     issubclass(attr_value, Spider) and
                     attr_value != Spider and
                     hasattr(attr_value, 'name')):

                 spider_name = getattr(attr_value, 'name')
                 self._found[spider_name].append((module.__name__, attr_value.__name__))
                 self._spiders[spider_name] = attr_value

     def _load_spiders_from_package(self, package_name: str) -> None:
         """Load spiders from a package"""
         try:
             # Try to import the package
             package = importlib.import_module(package_name)

             # Walk every module inside the package
             package_path = Path(package.__file__).parent
             for py_file in package_path.glob("*.py"):
                 if py_file.name.startswith('_'):
                     continue

                 module_name = py_file.stem
                 spider_module_path = f"{package_name}.{module_name}"

                 try:
                     module = importlib.import_module(spider_module_path)
                     self._load_spiders(module)
                 except ImportError as e:
                     if self.warn_only:
                         logger.warning(f"Could not load spiders from module '{spider_module_path}': {e}")
                         logger.debug(traceback.format_exc())
                     else:
                         raise
         except (ImportError, SyntaxError) as e:
             if self.warn_only:
                 logger.warning(f"Could not load spiders from package '{package_name}': {e}")
                 logger.debug(traceback.format_exc())
             else:
                 raise

     def _load_all_spiders(self) -> None:
         """Load all spiders"""
         # If SPIDER_MODULES is configured, load from those modules
         if self.spider_modules:
             for module_name in self.spider_modules:
                 self._load_spiders_from_package(module_name)
         else:
             # Backwards compatibility: without SPIDER_MODULES, fall back to the old
             # behavior, which assumes the default spiders directory layout
             spiders_dir = Path.cwd() / 'spiders'
             if not spiders_dir.exists():
                 spiders_dir = Path.cwd() / 'spider'
                 if not spiders_dir.exists():
                     logger.warning("Spiders directory not found")
                     return

             for py_file in spiders_dir.glob("*.py"):
                 if py_file.name.startswith('_'):
                     continue

                 module_name = py_file.stem
                 module = None
                 try:
                     # Try the possible import paths
                     spider_module_path = None
                     for possible_package in ['spiders', 'spider']:
                         try:
                             spider_module_path = f"{possible_package}.{module_name}"
                             module = importlib.import_module(spider_module_path)
                             break
                         except ImportError:
                             continue

                     if module is None:
                         raise ImportError(f"Could not import {module_name}")

                     self._load_spiders(module)
                 except ImportError as e:
                     logger.debug(f"Skip module {module_name}: {e}")
                     continue

         self._check_name_duplicates()

     def load(self, spider_name: str) -> Type[Spider]:
         """
         Load a spider by name.

         Args:
             spider_name: the spider name

         Returns:
             the Spider class

         Raises:
             KeyError: if no spider with the given name exists
         """
         if spider_name not in self._spiders:
             raise KeyError(f"Spider not found: {spider_name}")
         return self._spiders[spider_name]

     def list(self) -> List[str]:
         """List all available spider names"""
         return list(self._spiders.keys())

     def find_by_request(self, request: 'Request') -> List[str]:
         """
         Find the spider names that can handle the given request.

         Args:
             request: the request object

         Returns:
             a list of spider names that can handle the request
         """
         # More sophisticated matching could be implemented here;
         # for now simply return all spider names
         return list(self._spiders.keys())

     def get_all(self) -> Dict[str, Type[Spider]]:
         """Get all spiders"""
         return self._spiders.copy()
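The only visible change in this file is the `get_logger` import, which now comes from `crawlo.logging`. For orientation, here is a minimal usage sketch of the loader API shown above; it is not taken from crawlo's documentation. The class only calls `settings.get()` for `SPIDER_MODULES` and `SPIDER_LOADER_WARN_ONLY`, so a duck-typed stand-in settings object is used, and the project package name `myproject.spiders` is a hypothetical placeholder.

# Hedged usage sketch of the SpiderLoader shown above (assumptions noted inline).
from crawlo.utils.spider_loader import SpiderLoader

class StubSettings:
    # Hypothetical stand-in: the SpiderLoader above only calls settings.get(key, default).
    def __init__(self, values):
        self._values = values

    def get(self, key, default=None):
        return self._values.get(key, default)

settings = StubSettings({
    'SPIDER_MODULES': ['myproject.spiders'],  # hypothetical spider package
    'SPIDER_LOADER_WARN_ONLY': True,          # warn instead of raising on broken modules
})

loader = SpiderLoader.from_settings(settings)
print(loader.list())                 # names of all spiders discovered in the package
spider_cls = loader.load('example')  # raises KeyError if no spider named 'example' exists

In a real project, a configured SettingManager would normally be passed instead of the stub; the stub only illustrates which settings keys the loader reads.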
crawlo/utils/text_helper.py
CHANGED
@@ -1,95 +1,95 @@
 # -*- coding: utf-8 -*-
 import json
 import re
 from typing import Any, Union, List, Dict, Tuple, Optional

-from crawlo.
+from crawlo.logging import get_logger

 logger = get_logger(__name__)

 # Regular expression cache
 _REGEXPS: Dict[str, "re.Pattern"] = {}


 def extract_text_by_regex(
     text: Union[str, Any],
     patterns: Union[str, List[str]],
     allow_repeat: bool = True,
     fetch_one: bool = False,
     join_with: Optional[str] = None,
 ) -> Union[str, List[str], Tuple]:
     """
     Extract information from text, with regex matching and multi-pattern fallback.

     Args:
         text (str): the text content, or anything convertible to a string
         patterns (str or list of str): regex patterns, tried in order
         allow_repeat (bool): whether duplicate results are allowed
         fetch_one (bool): whether to extract only the first match (returns a tuple)
         join_with (str, optional): if given, join the results into a string with this separator

     Returns:
         str | list | tuple: the matches, as a string, list, or tuple depending on the arguments
     """
     if isinstance(patterns, str):
         patterns = [patterns]

     results = []
     for pattern in patterns:
         if not pattern:
             continue

         if pattern not in _REGEXPS:
             _REGEXPS[pattern] = re.compile(pattern, re.S)

         if fetch_one:
             match = _REGEXPS[pattern].search(str(text))
             results = match.groups() if match else ("",)
             break
         else:
             found = _REGEXPS[pattern].findall(str(text))
             if found:
                 results = found
                 break

     if fetch_one:
         return results[0] if len(results) == 1 else results

     if not allow_repeat:
         results = sorted(set(results), key=results.index)

     return join_with.join(results) if join_with else results


 def parse_json_safely(json_str: Union[str, Any]) -> Dict:
     """
     Safely parse a JSON string, tolerating non-standard formats (single quotes, unquoted keys).

     Args:
         json_str (str): the JSON string

     Returns:
         dict: the parsed dictionary, or an empty dict on failure
     """
     if not json_str:
         return {}

     try:
         return json.loads(json_str)
     except Exception as e1:
         try:
             cleaned = json_str.strip().replace("'", '"')
             # Use the new function name
             keys = extract_text_by_regex(cleaned, r'(\w+):')
             for key in keys:
                 cleaned = cleaned.replace(f"{key}:", f'"{key}":')
             return json.loads(cleaned) if cleaned else {}
         except Exception as e2:
             logger.error(
                 f"JSON 解析失败\n"
                 f"原始内容: {json_str}\n"
                 f"错误1: {e1}\n"
                 f"修复后: {cleaned}\n"
                 f"错误2: {e2}"
             )
     return {}
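Here too the only visible change is the `get_logger` import. A short usage sketch of the two helpers defined above follows; the input strings are invented for illustration, and the commented results follow the code as shown.

# Hedged usage sketch of the text helpers above; sample data is invented.
from crawlo.utils.text_helper import extract_text_by_regex, parse_json_safely

html = "<span>price: 19.99</span><span>price: 5.00</span>"

# Patterns are tried in order; the first one that matches wins.
prices = extract_text_by_regex(html, r"price: ([\d.]+)")                  # ['19.99', '5.00']
first = extract_text_by_regex(html, r"price: ([\d.]+)", fetch_one=True)   # '19.99'
joined = extract_text_by_regex(html, r"price: ([\d.]+)", join_with=", ")  # '19.99, 5.00'

# Single quotes (and, where the key regex applies, unquoted keys) are repaired before json.loads().
data = parse_json_safely("{'name': 'crawlo', 'version': '1.4.7'}")        # {'name': 'crawlo', 'version': '1.4.7'}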