PyPI - crawlo - Versions diffs - 1.4.6__py3-none-any.whl → 1.4.7__py3-none-any.whl - Mend

crawlo 1.4.6__py3-none-any.whl → 1.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crawlo might be problematic. Click here for more details.

Files changed (374) hide show

crawlo/__init__.py +90 -89
crawlo/__version__.py +1 -1
crawlo/cli.py +75 -75
crawlo/commands/__init__.py +14 -14
crawlo/commands/check.py +594 -594
crawlo/commands/genspider.py +186 -186
crawlo/commands/help.py +140 -138
crawlo/commands/list.py +155 -155
crawlo/commands/run.py +379 -341
crawlo/commands/startproject.py +460 -460
crawlo/commands/stats.py +187 -187
crawlo/commands/utils.py +196 -196
crawlo/config.py +320 -312
crawlo/config_validator.py +277 -277
crawlo/core/__init__.py +52 -52
crawlo/core/engine.py +451 -438
crawlo/core/processor.py +47 -47
crawlo/core/scheduler.py +290 -291
crawlo/crawler.py +698 -657
crawlo/data/__init__.py +5 -5
crawlo/data/user_agents.py +194 -194
crawlo/downloader/__init__.py +280 -276
crawlo/downloader/aiohttp_downloader.py +233 -233
crawlo/downloader/cffi_downloader.py +250 -247
crawlo/downloader/httpx_downloader.py +265 -259
crawlo/downloader/hybrid_downloader.py +212 -212
crawlo/downloader/playwright_downloader.py +425 -402
crawlo/downloader/selenium_downloader.py +486 -472
crawlo/event.py +45 -11
crawlo/exceptions.py +215 -82
crawlo/extension/__init__.py +65 -64
crawlo/extension/health_check.py +141 -141
crawlo/extension/log_interval.py +94 -94
crawlo/extension/log_stats.py +70 -70
crawlo/extension/logging_extension.py +53 -61
crawlo/extension/memory_monitor.py +104 -104
crawlo/extension/performance_profiler.py +133 -133
crawlo/extension/request_recorder.py +107 -107
crawlo/factories/__init__.py +27 -27
crawlo/factories/base.py +68 -68
crawlo/factories/crawler.py +104 -103
crawlo/factories/registry.py +84 -84
crawlo/factories/utils.py +135 -0
crawlo/filters/__init__.py +170 -153
crawlo/filters/aioredis_filter.py +348 -264
crawlo/filters/memory_filter.py +261 -276
crawlo/framework.py +306 -292
crawlo/initialization/__init__.py +44 -44
crawlo/initialization/built_in.py +391 -434
crawlo/initialization/context.py +141 -141
crawlo/initialization/core.py +240 -194
crawlo/initialization/phases.py +230 -149
crawlo/initialization/registry.py +143 -145
crawlo/initialization/utils.py +49 -0
crawlo/interfaces.py +23 -23
crawlo/items/__init__.py +23 -23
crawlo/items/base.py +23 -23
crawlo/items/fields.py +52 -52
crawlo/items/items.py +104 -104
crawlo/logging/__init__.py +42 -46
crawlo/logging/config.py +277 -197
crawlo/logging/factory.py +175 -171
crawlo/logging/manager.py +104 -112
crawlo/middleware/__init__.py +87 -24
crawlo/middleware/default_header.py +132 -132
crawlo/middleware/download_delay.py +104 -104
crawlo/middleware/middleware_manager.py +142 -142
crawlo/middleware/offsite.py +123 -123
crawlo/middleware/proxy.py +209 -209
crawlo/middleware/request_ignore.py +86 -86
crawlo/middleware/response_code.py +150 -150
crawlo/middleware/response_filter.py +136 -136
crawlo/middleware/retry.py +124 -124
crawlo/mode_manager.py +287 -253
crawlo/network/__init__.py +21 -21
crawlo/network/request.py +375 -379
crawlo/network/response.py +569 -664
crawlo/pipelines/__init__.py +53 -22
crawlo/pipelines/base_pipeline.py +452 -0
crawlo/pipelines/bloom_dedup_pipeline.py +146 -146
crawlo/pipelines/console_pipeline.py +39 -39
crawlo/pipelines/csv_pipeline.py +316 -316
crawlo/pipelines/database_dedup_pipeline.py +197 -197
crawlo/pipelines/json_pipeline.py +218 -218
crawlo/pipelines/memory_dedup_pipeline.py +105 -105
crawlo/pipelines/mongo_pipeline.py +140 -132
crawlo/pipelines/mysql_pipeline.py +469 -476
crawlo/pipelines/pipeline_manager.py +100 -100
crawlo/pipelines/redis_dedup_pipeline.py +155 -156
crawlo/project.py +347 -347
crawlo/queue/__init__.py +10 -0
crawlo/queue/pqueue.py +38 -38
crawlo/queue/queue_manager.py +591 -525
crawlo/queue/redis_priority_queue.py +519 -370
crawlo/settings/__init__.py +7 -7
crawlo/settings/default_settings.py +284 -277
crawlo/settings/setting_manager.py +219 -219
crawlo/spider/__init__.py +657 -657
crawlo/stats_collector.py +81 -81
crawlo/subscriber.py +129 -129
crawlo/task_manager.py +138 -138
crawlo/templates/crawlo.cfg.tmpl +10 -10
crawlo/templates/project/__init__.py.tmpl +2 -4
crawlo/templates/project/items.py.tmpl +13 -17
crawlo/templates/project/middlewares.py.tmpl +38 -38
crawlo/templates/project/pipelines.py.tmpl +35 -36
crawlo/templates/project/settings.py.tmpl +109 -111
crawlo/templates/project/settings_distributed.py.tmpl +156 -159
crawlo/templates/project/settings_gentle.py.tmpl +170 -176
crawlo/templates/project/settings_high_performance.py.tmpl +171 -177
crawlo/templates/project/settings_minimal.py.tmpl +98 -100
crawlo/templates/project/settings_simple.py.tmpl +168 -174
crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
crawlo/templates/run.py.tmpl +23 -23
crawlo/templates/spider/spider.py.tmpl +32 -40
crawlo/templates/spiders_init.py.tmpl +5 -10
crawlo/tools/__init__.py +86 -189
crawlo/tools/date_tools.py +289 -289
crawlo/tools/distributed_coordinator.py +384 -384
crawlo/tools/scenario_adapter.py +262 -262
crawlo/tools/text_cleaner.py +232 -232
crawlo/utils/__init__.py +50 -50
crawlo/utils/batch_processor.py +276 -259
crawlo/utils/config_manager.py +442 -0
crawlo/utils/controlled_spider_mixin.py +439 -439
crawlo/utils/db_helper.py +250 -250
crawlo/utils/error_handler.py +410 -410
crawlo/utils/fingerprint.py +121 -121
crawlo/utils/func_tools.py +82 -82
crawlo/utils/large_scale_helper.py +344 -344
crawlo/utils/leak_detector.py +335 -0
crawlo/utils/log.py +79 -79
crawlo/utils/misc.py +81 -81
crawlo/utils/mongo_connection_pool.py +157 -0
crawlo/utils/mysql_connection_pool.py +197 -0
crawlo/utils/performance_monitor.py +285 -285
crawlo/utils/queue_helper.py +175 -175
crawlo/utils/redis_checker.py +91 -0
crawlo/utils/redis_connection_pool.py +578 -388
crawlo/utils/redis_key_validator.py +198 -198
crawlo/utils/request.py +278 -256
crawlo/utils/request_serializer.py +225 -225
crawlo/utils/resource_manager.py +337 -0
crawlo/utils/selector_helper.py +137 -137
crawlo/utils/singleton.py +70 -0
crawlo/utils/spider_loader.py +201 -201
crawlo/utils/text_helper.py +94 -94
crawlo/utils/{url.py → url_utils.py} +39 -39
crawlo-1.4.7.dist-info/METADATA +689 -0
crawlo-1.4.7.dist-info/RECORD +347 -0
examples/__init__.py +7 -7
tests/__init__.py +7 -7
tests/advanced_tools_example.py +217 -275
tests/authenticated_proxy_example.py +110 -110
tests/baidu_performance_test.py +108 -108
tests/baidu_test.py +59 -59
tests/bug_check_test.py +250 -250
tests/cleaners_example.py +160 -160
tests/comprehensive_framework_test.py +212 -212
tests/comprehensive_test.py +81 -81
tests/comprehensive_testing_summary.md +186 -186
tests/config_validation_demo.py +142 -142
tests/controlled_spider_example.py +205 -205
tests/date_tools_example.py +180 -180
tests/debug_configure.py +69 -69
tests/debug_framework_logger.py +84 -84
tests/debug_log_config.py +126 -126
tests/debug_log_levels.py +63 -63
tests/debug_pipelines.py +66 -66
tests/detailed_log_test.py +233 -233
tests/direct_selector_helper_test.py +96 -96
tests/distributed_dedup_test.py +467 -0
tests/distributed_test.py +66 -66
tests/distributed_test_debug.py +76 -76
tests/dynamic_loading_example.py +523 -523
tests/dynamic_loading_test.py +104 -104
tests/error_handling_example.py +171 -171
tests/explain_mysql_update_behavior.py +76 -76
tests/final_comprehensive_test.py +151 -151
tests/final_log_test.py +260 -260
tests/final_validation_test.py +182 -182
tests/fix_log_test.py +142 -142
tests/framework_performance_test.py +202 -202
tests/log_buffering_test.py +111 -111
tests/log_generation_timing_test.py +153 -153
tests/monitor_redis_dedup.sh +72 -0
tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -12
tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -100
tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -13
tests/ofweek_scrapy/ofweek_scrapy/settings.py +84 -84
tests/ofweek_scrapy/scrapy.cfg +11 -11
tests/optimized_performance_test.py +211 -211
tests/performance_comparison.py +244 -244
tests/queue_blocking_test.py +113 -113
tests/queue_test.py +89 -89
tests/redis_key_validation_demo.py +130 -130
tests/request_params_example.py +150 -150
tests/response_improvements_example.py +144 -144
tests/scrapy_comparison/ofweek_scrapy.py +138 -138
tests/scrapy_comparison/scrapy_test.py +133 -133
tests/simple_cli_test.py +55 -0
tests/simple_command_test.py +119 -119
tests/simple_crawlo_test.py +126 -126
tests/simple_follow_test.py +38 -38
tests/simple_log_test2.py +137 -137
tests/simple_optimization_test.py +128 -128
tests/simple_queue_type_test.py +41 -41
tests/simple_response_selector_test.py +94 -94
tests/simple_selector_helper_test.py +154 -154
tests/simple_selector_test.py +207 -207
tests/simple_spider_test.py +49 -49
tests/simple_url_test.py +73 -73
tests/simulate_mysql_update_test.py +139 -139
tests/spider_log_timing_test.py +177 -177
tests/test_advanced_tools.py +148 -148
tests/test_all_commands.py +230 -230
tests/test_all_pipeline_fingerprints.py +133 -133
tests/test_all_redis_key_configs.py +145 -145
tests/test_asyncmy_usage.py +56 -56
tests/test_batch_processor.py +178 -178
tests/test_cleaners.py +54 -54
tests/test_cli_arguments.py +119 -0
tests/test_component_factory.py +174 -174
tests/test_config_consistency.py +80 -80
tests/test_config_merge.py +152 -152
tests/test_config_validator.py +182 -182
tests/test_controlled_spider_mixin.py +79 -79
tests/test_crawler_process_import.py +38 -38
tests/test_crawler_process_spider_modules.py +47 -47
tests/test_crawlo_proxy_integration.py +114 -114
tests/test_date_tools.py +123 -123
tests/test_dedup_fix.py +220 -220
tests/test_dedup_pipeline_consistency.py +124 -124
tests/test_default_header_middleware.py +313 -313
tests/test_distributed.py +65 -65
tests/test_double_crawlo_fix.py +204 -204
tests/test_double_crawlo_fix_simple.py +124 -124
tests/test_download_delay_middleware.py +221 -221
tests/test_downloader_proxy_compatibility.py +272 -272
tests/test_edge_cases.py +305 -305
tests/test_encoding_core.py +56 -56
tests/test_encoding_detection.py +126 -126
tests/test_enhanced_error_handler.py +270 -270
tests/test_enhanced_error_handler_comprehensive.py +245 -245
tests/test_error_handler_compatibility.py +112 -112
tests/test_factories.py +252 -252
tests/test_factory_compatibility.py +196 -196
tests/test_final_validation.py +153 -153
tests/test_fingerprint_consistency.py +135 -135
tests/test_fingerprint_simple.py +51 -51
tests/test_get_component_logger.py +83 -83
tests/test_hash_performance.py +99 -99
tests/test_integration.py +169 -169
tests/test_item_dedup_redis_key.py +122 -122
tests/test_large_scale_helper.py +235 -235
tests/test_logging_enhancements.py +374 -374
tests/test_logging_final.py +184 -184
tests/test_logging_integration.py +312 -312
tests/test_logging_system.py +282 -282
tests/test_middleware_debug.py +141 -141
tests/test_mode_consistency.py +51 -51
tests/test_multi_directory.py +67 -67
tests/test_multiple_spider_modules.py +80 -80
tests/test_mysql_pipeline_config.py +164 -164
tests/test_mysql_pipeline_error.py +98 -98
tests/test_mysql_pipeline_init_log.py +82 -82
tests/test_mysql_pipeline_integration.py +132 -132
tests/test_mysql_pipeline_refactor.py +143 -143
tests/test_mysql_pipeline_refactor_simple.py +85 -85
tests/test_mysql_pipeline_robustness.py +195 -195
tests/test_mysql_pipeline_types.py +88 -88
tests/test_mysql_update_columns.py +93 -93
tests/test_offsite_middleware.py +244 -244
tests/test_offsite_middleware_simple.py +203 -203
tests/test_optimized_selector_naming.py +100 -100
tests/test_parsel.py +29 -29
tests/test_performance.py +327 -327
tests/test_performance_monitor.py +115 -115
tests/test_pipeline_fingerprint_consistency.py +86 -86
tests/test_priority_behavior.py +211 -211
tests/test_priority_consistency.py +151 -151
tests/test_priority_consistency_fixed.py +249 -249
tests/test_proxy_health_check.py +32 -32
tests/test_proxy_middleware.py +217 -217
tests/test_proxy_middleware_enhanced.py +212 -212
tests/test_proxy_middleware_integration.py +142 -142
tests/test_proxy_middleware_refactored.py +207 -207
tests/test_proxy_only.py +83 -83
tests/test_proxy_providers.py +56 -56
tests/test_proxy_stats.py +19 -19
tests/test_proxy_strategies.py +59 -59
tests/test_proxy_with_downloader.py +152 -152
tests/test_queue_empty_check.py +41 -41
tests/test_queue_manager_double_crawlo.py +173 -173
tests/test_queue_manager_redis_key.py +179 -179
tests/test_queue_naming.py +154 -154
tests/test_queue_type.py +106 -106
tests/test_queue_type_redis_config_consistency.py +130 -130
tests/test_random_headers_default.py +322 -322
tests/test_random_headers_necessity.py +308 -308
tests/test_random_user_agent.py +72 -72
tests/test_redis_config.py +28 -28
tests/test_redis_connection_pool.py +294 -294
tests/test_redis_key_naming.py +181 -181
tests/test_redis_key_validator.py +123 -123
tests/test_redis_queue.py +224 -224
tests/test_redis_queue_name_fix.py +175 -175
tests/test_redis_queue_type_fallback.py +129 -129
tests/test_request_ignore_middleware.py +182 -182
tests/test_request_params.py +111 -111
tests/test_request_serialization.py +70 -70
tests/test_response_code_middleware.py +349 -349
tests/test_response_filter_middleware.py +427 -427
tests/test_response_follow.py +104 -104
tests/test_response_improvements.py +152 -152
tests/test_response_selector_methods.py +92 -92
tests/test_response_url_methods.py +70 -70
tests/test_response_urljoin.py +86 -86
tests/test_retry_middleware.py +333 -333
tests/test_retry_middleware_realistic.py +273 -273
tests/test_scheduler.py +252 -252
tests/test_scheduler_config_update.py +133 -133
tests/test_scrapy_style_encoding.py +112 -112
tests/test_selector_helper.py +100 -100
tests/test_selector_optimizations.py +146 -146
tests/test_simple_response.py +61 -61
tests/test_spider_loader.py +49 -49
tests/test_spider_loader_comprehensive.py +69 -69
tests/test_spider_modules.py +84 -84
tests/test_spiders/test_spider.py +9 -9
tests/test_telecom_spider_redis_key.py +205 -205
tests/test_template_content.py +87 -87
tests/test_template_redis_key.py +134 -134
tests/test_tools.py +159 -159
tests/test_user_agent_randomness.py +176 -176
tests/test_user_agents.py +96 -96
tests/untested_features_report.md +138 -138
tests/verify_debug.py +51 -51
tests/verify_distributed.py +117 -117
tests/verify_log_fix.py +111 -111
tests/verify_mysql_warnings.py +109 -109
crawlo/logging/async_handler.py +0 -181
crawlo/logging/monitor.py +0 -153
crawlo/logging/sampler.py +0 -167
crawlo/tools/authenticated_proxy.py +0 -241
crawlo/tools/data_formatter.py +0 -226
crawlo/tools/data_validator.py +0 -181
crawlo/tools/encoding_converter.py +0 -127
crawlo/tools/network_diagnostic.py +0 -365
crawlo/tools/request_tools.py +0 -83
crawlo/tools/retry_mechanism.py +0 -224
crawlo/utils/env_config.py +0 -143
crawlo/utils/large_scale_config.py +0 -287
crawlo/utils/system.py +0 -11
crawlo/utils/tools.py +0 -5
crawlo-1.4.6.dist-info/METADATA +0 -329
crawlo-1.4.6.dist-info/RECORD +0 -361
tests/env_config_example.py +0 -134
tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
tests/test_authenticated_proxy.py +0 -142
tests/test_comprehensive.py +0 -147
tests/test_dynamic_downloaders_proxy.py +0 -125
tests/test_dynamic_proxy.py +0 -93
tests/test_dynamic_proxy_config.py +0 -147
tests/test_dynamic_proxy_real.py +0 -110
tests/test_env_config.py +0 -122
tests/test_framework_env_usage.py +0 -104
tests/test_large_scale_config.py +0 -113
tests/test_proxy_api.py +0 -265
tests/test_real_scenario_proxy.py +0 -196
tests/tools_example.py +0 -261
{crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/WHEEL +0 -0
{crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/entry_points.txt +0 -0
{crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/top_level.txt +0 -0

tests/simple_selector_test.py CHANGED Viewed

@@ -1,208 +1,208 @@
-#!/usr/bin/python
-# -*- coding:UTF-8 -*-
-"""
-简化选择器测试
-"""
-import sys
-import os
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
-# 直接导入需要的模块
-from parsel import Selector, SelectorList
-class MockResponse:
-    """模拟Response类用于测试"""
-    def __init__(self, text):
-        self._text = text
-        self._selector_instance = None
-    @property
-    def text(self):
-        return self._text
-    @property
-    def _selector(self):
-        if self._selector_instance is None:
-            self._selector_instance = Selector(self.text)
-        return self._selector_instance
-    def xpath(self, query):
-        return self._selector.xpath(query)
-    def css(self, query):
-        return self._selector.css(query)
-    def _is_xpath(self, query):
-        return query.startswith(('/', '//', './'))
-    def _extract_text_from_elements(self, elements, join_str=" "):
-        texts = []
-        for element in elements:
-            if hasattr(element, 'xpath'):
-                element_texts = element.xpath('.//text()').getall()
-            else:
-                element_texts = [str(element)]
-            for text in element_texts:
-                cleaned = text.strip()
-                if cleaned:
-                    texts.append(cleaned)
-        return join_str.join(texts)
-    def extract_text(self, xpath_or_css, join_str=" ", default=''):
-        try:
-            elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
-            if not elements:
-                return default
-            return self._extract_text_from_elements(elements, join_str)
-        except Exception:
-            return default
-    def extract_texts(self, xpath_or_css, join_str=" ", default=None):
-        if default is None:
-            default = []
-        try:
-            elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
-            if not elements:
-                return default
-            result = []
-            for element in elements:
-                if hasattr(element, 'xpath'):
-                    texts = element.xpath('.//text()').getall()
-                else:
-                    texts = [str(element)]
-                clean_texts = [text.strip() for text in texts if text.strip()]
-                if clean_texts:
-                    result.append(join_str.join(clean_texts))
-            return result if result else default
-        except Exception:
-            return default
-    def extract_attr(self, xpath_or_css, attr_name, default=None):
-        try:
-            elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
-            if not elements:
-                return default
-            if hasattr(elements, 'attrib'):
-                return elements.attrib.get(attr_name, default)
-            elif len(elements) > 0 and hasattr(elements[0], 'attrib'):
-                return elements[0].attrib.get(attr_name, default)
-            return default
-        except Exception:
-            return default
-    def extract_attrs(self, xpath_or_css, attr_name, default=None):
-        if default is None:
-            default = []
-        try:
-            elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
-            if not elements:
-                return default
-            result = []
-            for element in elements:
-                if hasattr(element, 'attrib'):
-                    attr_value = element.attrib.get(attr_name)
-                    if attr_value is not None:
-                        result.append(attr_value)
-            return result if result else default
-        except Exception:
-            return default
-def test_selector_methods():
-    """测试选择器方法"""
-    print("测试选择器方法...")
-    print("=" * 50)
-    # 创建测试HTML
-    html_content = """
-    <html>
-    <head>
-        <title>测试页面</title>
-    </head>
-    <body>
-        <div class="content">
-            <h1>主标题</h1>
-            <p class="intro">介绍段落</p>
-            <ul class="list">
-                <li>项目1</li>
-                <li>项目2</li>
-                <li>项目3</li>
-            </ul>
-            <a href="https://example.com" class="link">链接文本</a>
-            <img src="image.jpg" alt="图片描述" class="image">
-        </div>
-    </body>
-    </html>
-    """
-    response = MockResponse(html_content)
-    # 测试 extract_text
-    print("1. 测试 extract_text:")
-    title = response.extract_text('title')
-    print(f"   标题: {title}")
-    h1_text = response.extract_text('.content h1')
-    print(f"   H1文本: {h1_text}")
-    # 测试XPath
-    title_xpath = response.extract_text('//title')
-    print(f"   XPath标题: {title_xpath}")
-    print()
-    # 测试 extract_texts
-    print("2. 测试 extract_texts:")
-    list_items = response.extract_texts('.list li')
-    print(f"   列表项: {list_items}")
-    # 测试XPath
-    list_items_xpath = response.extract_texts('//ul[@class="list"]/li')
-    print(f"   XPath列表项: {list_items_xpath}")
-    print()
-    # 测试 extract_attr
-    print("3. 测试 extract_attr:")
-    link_href = response.extract_attr('.link', 'href')
-    print(f"   链接href: {link_href}")
-    img_alt = response.extract_attr('.image', 'alt')
-    print(f"   图片alt: {img_alt}")
-    # 测试XPath
-    link_href_xpath = response.extract_attr('//a[@class="link"]', 'href')
-    print(f"   XPath链接href: {link_href_xpath}")
-    print()
-    # 测试 extract_attrs
-    print("4. 测试 extract_attrs:")
-    all_links = response.extract_attrs('a', 'href')
-    print(f"   所有链接: {all_links}")
-    print()
-    # 测试边界情况
-    print("5. 测试边界情况:")
-    non_exist = response.extract_text('.non-exist', default='默认文本')
-    print(f"   不存在元素的默认值: {non_exist}")
-    non_exist_attr = response.extract_attr('.non-exist', 'href', default='默认链接')
-    print(f"   不存在属性的默认值: {non_exist_attr}")
-    print()
-    print("所有测试完成！")
-if __name__ == '__main__':
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+简化选择器测试
+"""
+import sys
+import os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+# 直接导入需要的模块
+from parsel import Selector, SelectorList
+class MockResponse:
+    """模拟Response类用于测试"""
+    def __init__(self, text):
+        self._text = text
+        self._selector_instance = None
+    @property
+    def text(self):
+        return self._text
+    @property
+    def _selector(self):
+        if self._selector_instance is None:
+            self._selector_instance = Selector(self.text)
+        return self._selector_instance
+    def xpath(self, query):
+        return self._selector.xpath(query)
+    def css(self, query):
+        return self._selector.css(query)
+    def _is_xpath(self, query):
+        return query.startswith(('/', '//', './'))
+    def _extract_text_from_elements(self, elements, join_str=" "):
+        texts = []
+        for element in elements:
+            if hasattr(element, 'xpath'):
+                element_texts = element.xpath('.//text()').getall()
+            else:
+                element_texts = [str(element)]
+            for text in element_texts:
+                cleaned = text.strip()
+                if cleaned:
+                    texts.append(cleaned)
+        return join_str.join(texts)
+    def extract_text(self, xpath_or_css, join_str=" ", default=''):
+        try:
+            elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
+            if not elements:
+                return default
+            return self._extract_text_from_elements(elements, join_str)
+        except Exception:
+            return default
+    def extract_texts(self, xpath_or_css, join_str=" ", default=None):
+        if default is None:
+            default = []
+        try:
+            elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
+            if not elements:
+                return default
+            result = []
+            for element in elements:
+                if hasattr(element, 'xpath'):
+                    texts = element.xpath('.//text()').getall()
+                else:
+                    texts = [str(element)]
+                clean_texts = [text.strip() for text in texts if text.strip()]
+                if clean_texts:
+                    result.append(join_str.join(clean_texts))
+            return result if result else default
+        except Exception:
+            return default
+    def extract_attr(self, xpath_or_css, attr_name, default=None):
+        try:
+            elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
+            if not elements:
+                return default
+            if hasattr(elements, 'attrib'):
+                return elements.attrib.get(attr_name, default)
+            elif len(elements) > 0 and hasattr(elements[0], 'attrib'):
+                return elements[0].attrib.get(attr_name, default)
+            return default
+        except Exception:
+            return default
+    def extract_attrs(self, xpath_or_css, attr_name, default=None):
+        if default is None:
+            default = []
+        try:
+            elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
+            if not elements:
+                return default
+            result = []
+            for element in elements:
+                if hasattr(element, 'attrib'):
+                    attr_value = element.attrib.get(attr_name)
+                    if attr_value is not None:
+                        result.append(attr_value)
+            return result if result else default
+        except Exception:
+            return default
+def test_selector_methods():
+    """测试选择器方法"""
+    print("测试选择器方法...")
+    print("=" * 50)
+    # 创建测试HTML
+    html_content = """
+    <html>
+    <head>
+        <title>测试页面</title>
+    </head>
+    <body>
+        <div class="content">
+            <h1>主标题</h1>
+            <p class="intro">介绍段落</p>
+            <ul class="list">
+                <li>项目1</li>
+                <li>项目2</li>
+                <li>项目3</li>
+            </ul>
+            <a href="https://example.com" class="link">链接文本</a>
+            <img src="image.jpg" alt="图片描述" class="image">
+        </div>
+    </body>
+    </html>
+    """
+    response = MockResponse(html_content)
+    # 测试 extract_text
+    print("1. 测试 extract_text:")
+    title = response.extract_text('title')
+    print(f"   标题: {title}")
+    h1_text = response.extract_text('.content h1')
+    print(f"   H1文本: {h1_text}")
+    # 测试XPath
+    title_xpath = response.extract_text('//title')
+    print(f"   XPath标题: {title_xpath}")
+    print()
+    # 测试 extract_texts
+    print("2. 测试 extract_texts:")
+    list_items = response.extract_texts('.list li')
+    print(f"   列表项: {list_items}")
+    # 测试XPath
+    list_items_xpath = response.extract_texts('//ul[@class="list"]/li')
+    print(f"   XPath列表项: {list_items_xpath}")
+    print()
+    # 测试 extract_attr
+    print("3. 测试 extract_attr:")
+    link_href = response.extract_attr('.link', 'href')
+    print(f"   链接href: {link_href}")
+    img_alt = response.extract_attr('.image', 'alt')
+    print(f"   图片alt: {img_alt}")
+    # 测试XPath
+    link_href_xpath = response.extract_attr('//a[@class="link"]', 'href')
+    print(f"   XPath链接href: {link_href_xpath}")
+    print()
+    # 测试 extract_attrs
+    print("4. 测试 extract_attrs:")
+    all_links = response.extract_attrs('a', 'href')
+    print(f"   所有链接: {all_links}")
+    print()
+    # 测试边界情况
+    print("5. 测试边界情况:")
+    non_exist = response.extract_text('.non-exist', default='默认文本')
+    print(f"   不存在元素的默认值: {non_exist}")
+    non_exist_attr = response.extract_attr('.non-exist', 'href', default='默认链接')
+    print(f"   不存在属性的默认值: {non_exist_attr}")
+    print()
+    print("所有测试完成！")
+if __name__ == '__main__':
     test_selector_methods()

tests/simple_spider_test.py CHANGED Viewed

@@ -1,50 +1,50 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-"""
-简单的爬虫测试脚本
-"""
-import sys
-import os
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
-from crawlo.spider import Spider
-from crawlo import Request
-class TestSpider(Spider):
-    """测试爬虫"""
-    name = 'test_spider'
-    def start_requests(self):
-        """发起测试请求"""
-        yield Request('https://httpbin.org/get', callback=self.parse)
-    def parse(self, response):
-        """解析响应"""
-        print(f"成功获取响应: {response.url}")
-        print(f"状态码: {response.status_code}")
-        return []
-def main():
-    """主函数"""
-    print("开始测试爬虫功能...")
-    # 初始化框架
-    from crawlo.initialization import initialize_framework
-    settings = initialize_framework()
-    # 创建爬虫进程
-    from crawlo.crawler import CrawlerProcess
-    process = CrawlerProcess(settings=settings)
-    # 运行爬虫
-    import asyncio
-    asyncio.run(process.crawl(TestSpider))
-    print("爬虫测试完成！")
-if __name__ == "__main__":
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+简单的爬虫测试脚本
+"""
+import sys
+import os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+from crawlo.spider import Spider
+from crawlo import Request
+class TestSpider(Spider):
+    """测试爬虫"""
+    name = 'test_spider'
+    def start_requests(self):
+        """发起测试请求"""
+        yield Request('https://httpbin.org/get', callback=self.parse)
+    def parse(self, response):
+        """解析响应"""
+        print(f"成功获取响应: {response.url}")
+        print(f"状态码: {response.status_code}")
+        return []
+def main():
+    """主函数"""
+    print("开始测试爬虫功能...")
+    # 初始化框架
+    from crawlo.initialization import initialize_framework
+    settings = initialize_framework()
+    # 创建爬虫进程
+    from crawlo.crawler import CrawlerProcess
+    process = CrawlerProcess(settings=settings)
+    # 运行爬虫
+    import asyncio
+    asyncio.run(process.crawl(TestSpider))
+    print("爬虫测试完成！")
+if __name__ == "__main__":
     main()

tests/simple_url_test.py CHANGED Viewed

@@ -1,74 +1,74 @@
-#!/usr/bin/python
-# -*- coding:UTF-8 -*-
-"""
-Response URL 处理方法简单测试
-"""
-import sys
-import os
-# 添加项目根目录到Python路径
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
-# 直接导入需要的模块
-from urllib.parse import urlparse, urlsplit, parse_qs, urlencode, quote, unquote, urldefrag
-def test_url_methods():
-    """测试 URL 处理方法"""
-    print("测试 Response URL 处理方法")
-    # 测试数据
-    test_url = "https://example.com/test?param1=value1&param2=value2#section1"
-    print(f"测试URL: {test_url}")
-    # 1. 测试 urlparse
-    print("\n1. 测试 urlparse:")
-    parsed = urlparse(test_url)
-    print(f"  scheme: {parsed.scheme}")
-    print(f"  netloc: {parsed.netloc}")
-    print(f"  path: {parsed.path}")
-    print(f"  query: {parsed.query}")
-    print(f"  fragment: {parsed.fragment}")
-    # 2. 测试 urlsplit
-    print("\n2. 测试 urlsplit:")
-    split_result = urlsplit(test_url)
-    print(f"  scheme: {split_result.scheme}")
-    print(f"  netloc: {split_result.netloc}")
-    print(f"  path: {split_result.path}")
-    print(f"  query: {split_result.query}")
-    print(f"  fragment: {split_result.fragment}")
-    # 3. 测试 parse_qs
-    print("\n3. 测试 parse_qs:")
-    query_dict = parse_qs(parsed.query)
-    print(f"  解析结果: {query_dict}")
-    # 4. 测试 urlencode
-    print("\n4. 测试 urlencode:")
-    test_dict = {"name": "张三", "age": 25, "city": "北京"}
-    encoded = urlencode(test_dict)
-    print(f"  编码结果: {encoded}")
-    # 5. 测试 quote/unquote
-    print("\n5. 测试 quote/unquote:")
-    original = "hello world 你好"
-    quoted = quote(original)
-    print(f"  原始字符串: {original}")
-    print(f"  URL编码: {quoted}")
-    unquoted = unquote(quoted)
-    print(f"  URL解码: {unquoted}")
-    print(f"  编码解码是否一致: {original == unquoted}")
-    # 6. 测试 urldefrag
-    print("\n6. 测试 urldefrag:")
-    url_without_frag, fragment = urldefrag(test_url)
-    print(f"  去除片段的URL: {url_without_frag}")
-    print(f"  片段: {fragment}")
-    print("\n所有测试完成！")
-if __name__ == '__main__':
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+Response URL 处理方法简单测试
+"""
+import sys
+import os
+# 添加项目根目录到Python路径
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+# 直接导入需要的模块
+from urllib.parse import urlparse, urlsplit, parse_qs, urlencode, quote, unquote, urldefrag
+def test_url_methods():
+    """测试 URL 处理方法"""
+    print("测试 Response URL 处理方法")
+    # 测试数据
+    test_url = "https://example.com/test?param1=value1&param2=value2#section1"
+    print(f"测试URL: {test_url}")
+    # 1. 测试 urlparse
+    print("\n1. 测试 urlparse:")
+    parsed = urlparse(test_url)
+    print(f"  scheme: {parsed.scheme}")
+    print(f"  netloc: {parsed.netloc}")
+    print(f"  path: {parsed.path}")
+    print(f"  query: {parsed.query}")
+    print(f"  fragment: {parsed.fragment}")
+    # 2. 测试 urlsplit
+    print("\n2. 测试 urlsplit:")
+    split_result = urlsplit(test_url)
+    print(f"  scheme: {split_result.scheme}")
+    print(f"  netloc: {split_result.netloc}")
+    print(f"  path: {split_result.path}")
+    print(f"  query: {split_result.query}")
+    print(f"  fragment: {split_result.fragment}")
+    # 3. 测试 parse_qs
+    print("\n3. 测试 parse_qs:")
+    query_dict = parse_qs(parsed.query)
+    print(f"  解析结果: {query_dict}")
+    # 4. 测试 urlencode
+    print("\n4. 测试 urlencode:")
+    test_dict = {"name": "张三", "age": 25, "city": "北京"}
+    encoded = urlencode(test_dict)
+    print(f"  编码结果: {encoded}")
+    # 5. 测试 quote/unquote
+    print("\n5. 测试 quote/unquote:")
+    original = "hello world 你好"
+    quoted = quote(original)
+    print(f"  原始字符串: {original}")
+    print(f"  URL编码: {quoted}")
+    unquoted = unquote(quoted)
+    print(f"  URL解码: {unquoted}")
+    print(f"  编码解码是否一致: {original == unquoted}")
+    # 6. 测试 urldefrag
+    print("\n6. 测试 urldefrag:")
+    url_without_frag, fragment = urldefrag(test_url)
+    print(f"  去除片段的URL: {url_without_frag}")
+    print(f"  片段: {fragment}")
+    print("\n所有测试完成！")
+if __name__ == '__main__':
     test_url_methods()

crawlo 1.4.6__py3-none-any.whl → 1.4.7__py3-none-any.whl

Potentially problematic release.

crawlo 1.4.6__py3-none-any.whl → 1.4.7__py3-none-any.whl