crawlo 1.4.6-py3-none-any.whl → 1.4.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic.
- crawlo/__init__.py +90 -89
- crawlo/__version__.py +1 -1
- crawlo/cli.py +75 -75
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +186 -186
- crawlo/commands/help.py +140 -138
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +379 -341
- crawlo/commands/startproject.py +460 -460
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +196 -196
- crawlo/config.py +320 -312
- crawlo/config_validator.py +277 -277
- crawlo/core/__init__.py +52 -52
- crawlo/core/engine.py +451 -438
- crawlo/core/processor.py +47 -47
- crawlo/core/scheduler.py +290 -291
- crawlo/crawler.py +698 -657
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +194 -194
- crawlo/downloader/__init__.py +280 -276
- crawlo/downloader/aiohttp_downloader.py +233 -233
- crawlo/downloader/cffi_downloader.py +250 -247
- crawlo/downloader/httpx_downloader.py +265 -259
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +425 -402
- crawlo/downloader/selenium_downloader.py +486 -472
- crawlo/event.py +45 -11
- crawlo/exceptions.py +215 -82
- crawlo/extension/__init__.py +65 -64
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +94 -94
- crawlo/extension/log_stats.py +70 -70
- crawlo/extension/logging_extension.py +53 -61
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/factories/__init__.py +27 -27
- crawlo/factories/base.py +68 -68
- crawlo/factories/crawler.py +104 -103
- crawlo/factories/registry.py +84 -84
- crawlo/factories/utils.py +135 -0
- crawlo/filters/__init__.py +170 -153
- crawlo/filters/aioredis_filter.py +348 -264
- crawlo/filters/memory_filter.py +261 -276
- crawlo/framework.py +306 -292
- crawlo/initialization/__init__.py +44 -44
- crawlo/initialization/built_in.py +391 -434
- crawlo/initialization/context.py +141 -141
- crawlo/initialization/core.py +240 -194
- crawlo/initialization/phases.py +230 -149
- crawlo/initialization/registry.py +143 -145
- crawlo/initialization/utils.py +49 -0
- crawlo/interfaces.py +23 -23
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +23 -23
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/logging/__init__.py +42 -46
- crawlo/logging/config.py +277 -197
- crawlo/logging/factory.py +175 -171
- crawlo/logging/manager.py +104 -112
- crawlo/middleware/__init__.py +87 -24
- crawlo/middleware/default_header.py +132 -132
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +142 -142
- crawlo/middleware/offsite.py +123 -123
- crawlo/middleware/proxy.py +209 -209
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +150 -150
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/mode_manager.py +287 -253
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +375 -379
- crawlo/network/response.py +569 -664
- crawlo/pipelines/__init__.py +53 -22
- crawlo/pipelines/base_pipeline.py +452 -0
- crawlo/pipelines/bloom_dedup_pipeline.py +146 -146
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +197 -197
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +105 -105
- crawlo/pipelines/mongo_pipeline.py +140 -132
- crawlo/pipelines/mysql_pipeline.py +469 -476
- crawlo/pipelines/pipeline_manager.py +100 -100
- crawlo/pipelines/redis_dedup_pipeline.py +155 -156
- crawlo/project.py +347 -347
- crawlo/queue/__init__.py +10 -0
- crawlo/queue/pqueue.py +38 -38
- crawlo/queue/queue_manager.py +591 -525
- crawlo/queue/redis_priority_queue.py +519 -370
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +284 -277
- crawlo/settings/setting_manager.py +219 -219
- crawlo/spider/__init__.py +657 -657
- crawlo/stats_collector.py +81 -81
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +138 -138
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +2 -4
- crawlo/templates/project/items.py.tmpl +13 -17
- crawlo/templates/project/middlewares.py.tmpl +38 -38
- crawlo/templates/project/pipelines.py.tmpl +35 -36
- crawlo/templates/project/settings.py.tmpl +109 -111
- crawlo/templates/project/settings_distributed.py.tmpl +156 -159
- crawlo/templates/project/settings_gentle.py.tmpl +170 -176
- crawlo/templates/project/settings_high_performance.py.tmpl +171 -177
- crawlo/templates/project/settings_minimal.py.tmpl +98 -100
- crawlo/templates/project/settings_simple.py.tmpl +168 -174
- crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
- crawlo/templates/run.py.tmpl +23 -23
- crawlo/templates/spider/spider.py.tmpl +32 -40
- crawlo/templates/spiders_init.py.tmpl +5 -10
- crawlo/tools/__init__.py +86 -189
- crawlo/tools/date_tools.py +289 -289
- crawlo/tools/distributed_coordinator.py +384 -384
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/tools/text_cleaner.py +232 -232
- crawlo/utils/__init__.py +50 -50
- crawlo/utils/batch_processor.py +276 -259
- crawlo/utils/config_manager.py +442 -0
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/db_helper.py +250 -250
- crawlo/utils/error_handler.py +410 -410
- crawlo/utils/fingerprint.py +121 -121
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/leak_detector.py +335 -0
- crawlo/utils/log.py +79 -79
- crawlo/utils/misc.py +81 -81
- crawlo/utils/mongo_connection_pool.py +157 -0
- crawlo/utils/mysql_connection_pool.py +197 -0
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_checker.py +91 -0
- crawlo/utils/redis_connection_pool.py +578 -388
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +278 -256
- crawlo/utils/request_serializer.py +225 -225
- crawlo/utils/resource_manager.py +337 -0
- crawlo/utils/selector_helper.py +137 -137
- crawlo/utils/singleton.py +70 -0
- crawlo/utils/spider_loader.py +201 -201
- crawlo/utils/text_helper.py +94 -94
- crawlo/utils/{url.py → url_utils.py} +39 -39
- crawlo-1.4.7.dist-info/METADATA +689 -0
- crawlo-1.4.7.dist-info/RECORD +347 -0
- examples/__init__.py +7 -7
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +217 -275
- tests/authenticated_proxy_example.py +110 -110
- tests/baidu_performance_test.py +108 -108
- tests/baidu_test.py +59 -59
- tests/bug_check_test.py +250 -250
- tests/cleaners_example.py +160 -160
- tests/comprehensive_framework_test.py +212 -212
- tests/comprehensive_test.py +81 -81
- tests/comprehensive_testing_summary.md +186 -186
- tests/config_validation_demo.py +142 -142
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/debug_configure.py +69 -69
- tests/debug_framework_logger.py +84 -84
- tests/debug_log_config.py +126 -126
- tests/debug_log_levels.py +63 -63
- tests/debug_pipelines.py +66 -66
- tests/detailed_log_test.py +233 -233
- tests/direct_selector_helper_test.py +96 -96
- tests/distributed_dedup_test.py +467 -0
- tests/distributed_test.py +66 -66
- tests/distributed_test_debug.py +76 -76
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/error_handling_example.py +171 -171
- tests/explain_mysql_update_behavior.py +76 -76
- tests/final_comprehensive_test.py +151 -151
- tests/final_log_test.py +260 -260
- tests/final_validation_test.py +182 -182
- tests/fix_log_test.py +142 -142
- tests/framework_performance_test.py +202 -202
- tests/log_buffering_test.py +111 -111
- tests/log_generation_timing_test.py +153 -153
- tests/monitor_redis_dedup.sh +72 -0
- tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -12
- tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -100
- tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -13
- tests/ofweek_scrapy/ofweek_scrapy/settings.py +84 -84
- tests/ofweek_scrapy/scrapy.cfg +11 -11
- tests/optimized_performance_test.py +211 -211
- tests/performance_comparison.py +244 -244
- tests/queue_blocking_test.py +113 -113
- tests/queue_test.py +89 -89
- tests/redis_key_validation_demo.py +130 -130
- tests/request_params_example.py +150 -150
- tests/response_improvements_example.py +144 -144
- tests/scrapy_comparison/ofweek_scrapy.py +138 -138
- tests/scrapy_comparison/scrapy_test.py +133 -133
- tests/simple_cli_test.py +55 -0
- tests/simple_command_test.py +119 -119
- tests/simple_crawlo_test.py +126 -126
- tests/simple_follow_test.py +38 -38
- tests/simple_log_test2.py +137 -137
- tests/simple_optimization_test.py +128 -128
- tests/simple_queue_type_test.py +41 -41
- tests/simple_response_selector_test.py +94 -94
- tests/simple_selector_helper_test.py +154 -154
- tests/simple_selector_test.py +207 -207
- tests/simple_spider_test.py +49 -49
- tests/simple_url_test.py +73 -73
- tests/simulate_mysql_update_test.py +139 -139
- tests/spider_log_timing_test.py +177 -177
- tests/test_advanced_tools.py +148 -148
- tests/test_all_commands.py +230 -230
- tests/test_all_pipeline_fingerprints.py +133 -133
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_asyncmy_usage.py +56 -56
- tests/test_batch_processor.py +178 -178
- tests/test_cleaners.py +54 -54
- tests/test_cli_arguments.py +119 -0
- tests/test_component_factory.py +174 -174
- tests/test_config_consistency.py +80 -80
- tests/test_config_merge.py +152 -152
- tests/test_config_validator.py +182 -182
- tests/test_controlled_spider_mixin.py +79 -79
- tests/test_crawler_process_import.py +38 -38
- tests/test_crawler_process_spider_modules.py +47 -47
- tests/test_crawlo_proxy_integration.py +114 -114
- tests/test_date_tools.py +123 -123
- tests/test_dedup_fix.py +220 -220
- tests/test_dedup_pipeline_consistency.py +124 -124
- tests/test_default_header_middleware.py +313 -313
- tests/test_distributed.py +65 -65
- tests/test_double_crawlo_fix.py +204 -204
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +272 -272
- tests/test_edge_cases.py +305 -305
- tests/test_encoding_core.py +56 -56
- tests/test_encoding_detection.py +126 -126
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_enhanced_error_handler_comprehensive.py +245 -245
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_factories.py +252 -252
- tests/test_factory_compatibility.py +196 -196
- tests/test_final_validation.py +153 -153
- tests/test_fingerprint_consistency.py +135 -135
- tests/test_fingerprint_simple.py +51 -51
- tests/test_get_component_logger.py +83 -83
- tests/test_hash_performance.py +99 -99
- tests/test_integration.py +169 -169
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_large_scale_helper.py +235 -235
- tests/test_logging_enhancements.py +374 -374
- tests/test_logging_final.py +184 -184
- tests/test_logging_integration.py +312 -312
- tests/test_logging_system.py +282 -282
- tests/test_middleware_debug.py +141 -141
- tests/test_mode_consistency.py +51 -51
- tests/test_multi_directory.py +67 -67
- tests/test_multiple_spider_modules.py +80 -80
- tests/test_mysql_pipeline_config.py +164 -164
- tests/test_mysql_pipeline_error.py +98 -98
- tests/test_mysql_pipeline_init_log.py +82 -82
- tests/test_mysql_pipeline_integration.py +132 -132
- tests/test_mysql_pipeline_refactor.py +143 -143
- tests/test_mysql_pipeline_refactor_simple.py +85 -85
- tests/test_mysql_pipeline_robustness.py +195 -195
- tests/test_mysql_pipeline_types.py +88 -88
- tests/test_mysql_update_columns.py +93 -93
- tests/test_offsite_middleware.py +244 -244
- tests/test_offsite_middleware_simple.py +203 -203
- tests/test_optimized_selector_naming.py +100 -100
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_performance_monitor.py +115 -115
- tests/test_pipeline_fingerprint_consistency.py +86 -86
- tests/test_priority_behavior.py +211 -211
- tests/test_priority_consistency.py +151 -151
- tests/test_priority_consistency_fixed.py +249 -249
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +217 -217
- tests/test_proxy_middleware_enhanced.py +212 -212
- tests/test_proxy_middleware_integration.py +142 -142
- tests/test_proxy_middleware_refactored.py +207 -207
- tests/test_proxy_only.py +83 -83
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_proxy_with_downloader.py +152 -152
- tests/test_queue_empty_check.py +41 -41
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +179 -179
- tests/test_queue_naming.py +154 -154
- tests/test_queue_type.py +106 -106
- tests/test_queue_type_redis_config_consistency.py +130 -130
- tests/test_random_headers_default.py +322 -322
- tests/test_random_headers_necessity.py +308 -308
- tests/test_random_user_agent.py +72 -72
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_redis_queue_name_fix.py +175 -175
- tests/test_redis_queue_type_fallback.py +129 -129
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_params.py +111 -111
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_follow.py +104 -104
- tests/test_response_improvements.py +152 -152
- tests/test_response_selector_methods.py +92 -92
- tests/test_response_url_methods.py +70 -70
- tests/test_response_urljoin.py +86 -86
- tests/test_retry_middleware.py +333 -333
- tests/test_retry_middleware_realistic.py +273 -273
- tests/test_scheduler.py +252 -252
- tests/test_scheduler_config_update.py +133 -133
- tests/test_scrapy_style_encoding.py +112 -112
- tests/test_selector_helper.py +100 -100
- tests/test_selector_optimizations.py +146 -146
- tests/test_simple_response.py +61 -61
- tests/test_spider_loader.py +49 -49
- tests/test_spider_loader_comprehensive.py +69 -69
- tests/test_spider_modules.py +84 -84
- tests/test_spiders/test_spider.py +9 -9
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +159 -159
- tests/test_user_agent_randomness.py +176 -176
- tests/test_user_agents.py +96 -96
- tests/untested_features_report.md +138 -138
- tests/verify_debug.py +51 -51
- tests/verify_distributed.py +117 -117
- tests/verify_log_fix.py +111 -111
- tests/verify_mysql_warnings.py +109 -109
- crawlo/logging/async_handler.py +0 -181
- crawlo/logging/monitor.py +0 -153
- crawlo/logging/sampler.py +0 -167
- crawlo/tools/authenticated_proxy.py +0 -241
- crawlo/tools/data_formatter.py +0 -226
- crawlo/tools/data_validator.py +0 -181
- crawlo/tools/encoding_converter.py +0 -127
- crawlo/tools/network_diagnostic.py +0 -365
- crawlo/tools/request_tools.py +0 -83
- crawlo/tools/retry_mechanism.py +0 -224
- crawlo/utils/env_config.py +0 -143
- crawlo/utils/large_scale_config.py +0 -287
- crawlo/utils/system.py +0 -11
- crawlo/utils/tools.py +0 -5
- crawlo-1.4.6.dist-info/METADATA +0 -329
- crawlo-1.4.6.dist-info/RECORD +0 -361
- tests/env_config_example.py +0 -134
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
- tests/test_authenticated_proxy.py +0 -142
- tests/test_comprehensive.py +0 -147
- tests/test_dynamic_downloaders_proxy.py +0 -125
- tests/test_dynamic_proxy.py +0 -93
- tests/test_dynamic_proxy_config.py +0 -147
- tests/test_dynamic_proxy_real.py +0 -110
- tests/test_env_config.py +0 -122
- tests/test_framework_env_usage.py +0 -104
- tests/test_large_scale_config.py +0 -113
- tests/test_proxy_api.py +0 -265
- tests/test_real_scenario_proxy.py +0 -196
- tests/tools_example.py +0 -261
- {crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/WHEEL +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/top_level.txt +0 -0
tests/simple_selector_test.py
CHANGED
@@ -1,208 +1,208 @@
(every line is marked removed and re-added with identical text, so the file contents are shown once)

#!/usr/bin/python
# -*- coding:UTF-8 -*-
"""
Simplified selector test
"""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))

# Import the required modules directly
from parsel import Selector, SelectorList


class MockResponse:
    """Mock Response class for testing"""

    def __init__(self, text):
        self._text = text
        self._selector_instance = None

    @property
    def text(self):
        return self._text

    @property
    def _selector(self):
        # Build the parsel Selector lazily and cache it
        if self._selector_instance is None:
            self._selector_instance = Selector(self.text)
        return self._selector_instance

    def xpath(self, query):
        return self._selector.xpath(query)

    def css(self, query):
        return self._selector.css(query)

    def _is_xpath(self, query):
        return query.startswith(('/', '//', './'))

    def _extract_text_from_elements(self, elements, join_str=" "):
        texts = []
        for element in elements:
            if hasattr(element, 'xpath'):
                element_texts = element.xpath('.//text()').getall()
            else:
                element_texts = [str(element)]
            for text in element_texts:
                cleaned = text.strip()
                if cleaned:
                    texts.append(cleaned)
        return join_str.join(texts)

    def extract_text(self, xpath_or_css, join_str=" ", default=''):
        try:
            elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
            if not elements:
                return default
            return self._extract_text_from_elements(elements, join_str)
        except Exception:
            return default

    def extract_texts(self, xpath_or_css, join_str=" ", default=None):
        if default is None:
            default = []

        try:
            elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
            if not elements:
                return default

            result = []
            for element in elements:
                if hasattr(element, 'xpath'):
                    texts = element.xpath('.//text()').getall()
                else:
                    texts = [str(element)]

                clean_texts = [text.strip() for text in texts if text.strip()]
                if clean_texts:
                    result.append(join_str.join(clean_texts))

            return result if result else default
        except Exception:
            return default

    def extract_attr(self, xpath_or_css, attr_name, default=None):
        try:
            elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
            if not elements:
                return default
            if hasattr(elements, 'attrib'):
                return elements.attrib.get(attr_name, default)
            elif len(elements) > 0 and hasattr(elements[0], 'attrib'):
                return elements[0].attrib.get(attr_name, default)
            return default
        except Exception:
            return default

    def extract_attrs(self, xpath_or_css, attr_name, default=None):
        if default is None:
            default = []

        try:
            elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
            if not elements:
                return default

            result = []
            for element in elements:
                if hasattr(element, 'attrib'):
                    attr_value = element.attrib.get(attr_name)
                    if attr_value is not None:
                        result.append(attr_value)

            return result if result else default
        except Exception:
            return default


def test_selector_methods():
    """Test the selector methods"""
    print("Testing selector methods...")
    print("=" * 50)

    # Build some test HTML
    html_content = """
    <html>
    <head>
        <title>Test page</title>
    </head>
    <body>
        <div class="content">
            <h1>Main heading</h1>
            <p class="intro">Intro paragraph</p>
            <ul class="list">
                <li>Item 1</li>
                <li>Item 2</li>
                <li>Item 3</li>
            </ul>
            <a href="https://example.com" class="link">Link text</a>
            <img src="image.jpg" alt="Image description" class="image">
        </div>
    </body>
    </html>
    """

    response = MockResponse(html_content)

    # Test extract_text
    print("1. Testing extract_text:")
    title = response.extract_text('title')
    print(f"  Title: {title}")

    h1_text = response.extract_text('.content h1')
    print(f"  H1 text: {h1_text}")

    # Test XPath
    title_xpath = response.extract_text('//title')
    print(f"  XPath title: {title_xpath}")

    print()

    # Test extract_texts
    print("2. Testing extract_texts:")
    list_items = response.extract_texts('.list li')
    print(f"  List items: {list_items}")

    # Test XPath
    list_items_xpath = response.extract_texts('//ul[@class="list"]/li')
    print(f"  XPath list items: {list_items_xpath}")

    print()

    # Test extract_attr
    print("3. Testing extract_attr:")
    link_href = response.extract_attr('.link', 'href')
    print(f"  Link href: {link_href}")

    img_alt = response.extract_attr('.image', 'alt')
    print(f"  Image alt: {img_alt}")

    # Test XPath
    link_href_xpath = response.extract_attr('//a[@class="link"]', 'href')
    print(f"  XPath link href: {link_href_xpath}")

    print()

    # Test extract_attrs
    print("4. Testing extract_attrs:")
    all_links = response.extract_attrs('a', 'href')
    print(f"  All links: {all_links}")

    print()

    # Test edge cases
    print("5. Testing edge cases:")
    non_exist = response.extract_text('.non-exist', default='default text')
    print(f"  Default for missing element: {non_exist}")

    non_exist_attr = response.extract_attr('.non-exist', 'href', default='default link')
    print(f"  Default for missing attribute: {non_exist_attr}")

    print()
    print("All tests completed!")


if __name__ == '__main__':
    test_selector_methods()
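The dispatch in _is_xpath is what lets every extract_* helper accept either syntax: queries starting with /, //, or ./ are routed to .xpath(), everything else to .css(). A minimal sketch of that behavior, reusing the MockResponse class above (the inline expected values are assumptions derived from parsel's documented behavior, not captured test output):

    # Minimal sketch: exercise MockResponse's CSS/XPath dispatch directly.
    # Assumes parsel is installed and MockResponse is defined as above.
    response = MockResponse('<a href="https://example.com" class="link">Link text</a>')

    print(response._is_xpath('//a'))         # True  -> handled by .xpath()
    print(response._is_xpath('a.link'))      # False -> handled by .css()
    print(response.extract_text('a.link'))                    # Link text
    print(response.extract_attr('//a', 'href'))               # https://example.com
    print(response.extract_text('.missing', default='n/a'))   # n/a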
tests/simple_spider_test.py
CHANGED
@@ -1,50 +1,50 @@
(every line is marked removed and re-added with identical text, so the file contents are shown once)

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Simple spider test script
"""

import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))

from crawlo.spider import Spider
from crawlo import Request


class TestSpider(Spider):
    """Test spider"""
    name = 'test_spider'

    def start_requests(self):
        """Issue the test request"""
        yield Request('https://httpbin.org/get', callback=self.parse)

    def parse(self, response):
        """Parse the response"""
        print(f"Fetched response: {response.url}")
        print(f"Status code: {response.status_code}")
        return []


def main():
    """Main entry point"""
    print("Starting spider functionality test...")

    # Initialize the framework
    from crawlo.initialization import initialize_framework
    settings = initialize_framework()

    # Create the crawler process
    from crawlo.crawler import CrawlerProcess
    process = CrawlerProcess(settings=settings)

    # Run the spider
    import asyncio
    asyncio.run(process.crawl(TestSpider))

    print("Spider test finished!")


if __name__ == "__main__":
    main()
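Since TestSpider issues a live request to https://httpbin.org/get, a failing run can mean a network problem rather than a framework regression. A standard-library-only pre-check can separate the two before starting the crawler; this is an illustrative sketch, and httpbin_reachable is a name invented here, not a crawlo API:

    # Illustrative connectivity pre-check, standard library only.
    import urllib.request
    import urllib.error


    def httpbin_reachable(timeout=5):
        """Return True if https://httpbin.org/get answers with HTTP 200."""
        try:
            with urllib.request.urlopen('https://httpbin.org/get', timeout=timeout) as resp:
                return resp.status == 200
        except (urllib.error.URLError, OSError):
            return False


    if __name__ == '__main__':
        print('httpbin reachable:', httpbin_reachable())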
tests/simple_url_test.py
CHANGED
@@ -1,74 +1,74 @@
(every line is marked removed and re-added with identical text, so the file contents are shown once)

#!/usr/bin/python
# -*- coding:UTF-8 -*-
"""
Simple test of the Response URL handling helpers
"""
import sys
import os

# Add the project root to the Python path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))

# Import the required modules directly
from urllib.parse import urlparse, urlsplit, parse_qs, urlencode, quote, unquote, urldefrag


def test_url_methods():
    """Test the URL handling methods"""
    print("Testing Response URL handling methods")

    # Test data
    test_url = "https://example.com/test?param1=value1&param2=value2#section1"
    print(f"Test URL: {test_url}")

    # 1. Test urlparse
    print("\n1. Testing urlparse:")
    parsed = urlparse(test_url)
    print(f"  scheme: {parsed.scheme}")
    print(f"  netloc: {parsed.netloc}")
    print(f"  path: {parsed.path}")
    print(f"  query: {parsed.query}")
    print(f"  fragment: {parsed.fragment}")

    # 2. Test urlsplit
    print("\n2. Testing urlsplit:")
    split_result = urlsplit(test_url)
    print(f"  scheme: {split_result.scheme}")
    print(f"  netloc: {split_result.netloc}")
    print(f"  path: {split_result.path}")
    print(f"  query: {split_result.query}")
    print(f"  fragment: {split_result.fragment}")

    # 3. Test parse_qs
    print("\n3. Testing parse_qs:")
    query_dict = parse_qs(parsed.query)
    print(f"  Parsed result: {query_dict}")

    # 4. Test urlencode
    print("\n4. Testing urlencode:")
    # Non-ASCII values deliberately exercise percent-encoding
    test_dict = {"name": "张三", "age": 25, "city": "北京"}
    encoded = urlencode(test_dict)
    print(f"  Encoded result: {encoded}")

    # 5. Test quote/unquote
    print("\n5. Testing quote/unquote:")
    # Mixed ASCII and non-ASCII input for the round-trip check
    original = "hello world 你好"
    quoted = quote(original)
    print(f"  Original string: {original}")
    print(f"  URL-encoded: {quoted}")

    unquoted = unquote(quoted)
    print(f"  URL-decoded: {unquoted}")
    print(f"  Round-trip matches: {original == unquoted}")

    # 6. Test urldefrag
    print("\n6. Testing urldefrag:")
    url_without_frag, fragment = urldefrag(test_url)
    print(f"  URL without fragment: {url_without_frag}")
    print(f"  Fragment: {fragment}")

    print("\nAll tests completed!")


if __name__ == '__main__':
    test_url_methods()
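All of the calls above are plain urllib.parse functions, so their outputs are deterministic. As a worked example, here are the two least obvious results for the test URL, with values as produced by CPython's urllib.parse:

    >>> from urllib.parse import urldefrag, parse_qs, urlparse
    >>> url = "https://example.com/test?param1=value1&param2=value2#section1"
    >>> urldefrag(url)
    DefragResult(url='https://example.com/test?param1=value1&param2=value2', fragment='section1')
    >>> parse_qs(urlparse(url).query)
    {'param1': ['value1'], 'param2': ['value2']}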