crawlo 1.4.4__py3-none-any.whl → 1.4.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (85)
  1. crawlo/__init__.py +11 -15
  2. crawlo/__version__.py +1 -1
  3. crawlo/commands/startproject.py +24 -0
  4. crawlo/core/engine.py +2 -2
  5. crawlo/core/scheduler.py +4 -4
  6. crawlo/crawler.py +8 -7
  7. crawlo/downloader/__init__.py +5 -2
  8. crawlo/extension/__init__.py +2 -2
  9. crawlo/filters/aioredis_filter.py +8 -1
  10. crawlo/filters/memory_filter.py +8 -1
  11. crawlo/initialization/built_in.py +13 -4
  12. crawlo/initialization/core.py +5 -4
  13. crawlo/interfaces.py +24 -0
  14. crawlo/middleware/__init__.py +7 -4
  15. crawlo/middleware/middleware_manager.py +15 -8
  16. crawlo/mode_manager.py +45 -11
  17. crawlo/network/response.py +374 -69
  18. crawlo/pipelines/mysql_pipeline.py +6 -6
  19. crawlo/pipelines/pipeline_manager.py +2 -2
  20. crawlo/project.py +2 -4
  21. crawlo/settings/default_settings.py +4 -0
  22. crawlo/task_manager.py +2 -2
  23. crawlo/templates/project/items.py.tmpl +2 -2
  24. crawlo/templates/project/middlewares.py.tmpl +9 -89
  25. crawlo/templates/project/pipelines.py.tmpl +8 -68
  26. crawlo/tools/__init__.py +0 -11
  27. crawlo/utils/__init__.py +17 -1
  28. crawlo/utils/db_helper.py +220 -319
  29. crawlo/utils/error_handler.py +313 -67
  30. crawlo/utils/fingerprint.py +3 -4
  31. crawlo/utils/misc.py +82 -0
  32. crawlo/utils/request.py +55 -66
  33. crawlo/utils/selector_helper.py +138 -0
  34. crawlo/utils/spider_loader.py +185 -45
  35. crawlo/utils/text_helper.py +95 -0
  36. crawlo-1.4.5.dist-info/METADATA +329 -0
  37. {crawlo-1.4.4.dist-info → crawlo-1.4.5.dist-info}/RECORD +76 -49
  38. tests/bug_check_test.py +251 -0
  39. tests/direct_selector_helper_test.py +97 -0
  40. tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -0
  41. tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -0
  42. tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -0
  43. tests/ofweek_scrapy/ofweek_scrapy/settings.py +85 -0
  44. tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -0
  45. tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +162 -0
  46. tests/ofweek_scrapy/scrapy.cfg +11 -0
  47. tests/performance_comparison.py +4 -5
  48. tests/simple_crawlo_test.py +1 -2
  49. tests/simple_follow_test.py +39 -0
  50. tests/simple_response_selector_test.py +95 -0
  51. tests/simple_selector_helper_test.py +155 -0
  52. tests/simple_selector_test.py +208 -0
  53. tests/simple_url_test.py +74 -0
  54. tests/test_crawler_process_import.py +39 -0
  55. tests/test_crawler_process_spider_modules.py +48 -0
  56. tests/test_edge_cases.py +7 -5
  57. tests/test_encoding_core.py +57 -0
  58. tests/test_encoding_detection.py +127 -0
  59. tests/test_factory_compatibility.py +197 -0
  60. tests/test_optimized_selector_naming.py +101 -0
  61. tests/test_priority_behavior.py +18 -18
  62. tests/test_response_follow.py +105 -0
  63. tests/test_response_selector_methods.py +93 -0
  64. tests/test_response_url_methods.py +71 -0
  65. tests/test_response_urljoin.py +87 -0
  66. tests/test_scrapy_style_encoding.py +113 -0
  67. tests/test_selector_helper.py +101 -0
  68. tests/test_selector_optimizations.py +147 -0
  69. tests/test_spider_loader.py +50 -0
  70. tests/test_spider_loader_comprehensive.py +70 -0
  71. tests/test_spiders/__init__.py +1 -0
  72. tests/test_spiders/test_spider.py +10 -0
  73. crawlo/tools/anti_crawler.py +0 -269
  74. crawlo/utils/class_loader.py +0 -26
  75. crawlo/utils/enhanced_error_handler.py +0 -357
  76. crawlo-1.4.4.dist-info/METADATA +0 -190
  77. tests/simple_log_test.py +0 -58
  78. tests/simple_test.py +0 -48
  79. tests/test_framework_logger.py +0 -67
  80. tests/test_framework_startup.py +0 -65
  81. tests/test_mode_change.py +0 -73
  82. {crawlo-1.4.4.dist-info → crawlo-1.4.5.dist-info}/WHEEL +0 -0
  83. {crawlo-1.4.4.dist-info → crawlo-1.4.5.dist-info}/entry_points.txt +0 -0
  84. {crawlo-1.4.4.dist-info → crawlo-1.4.5.dist-info}/top_level.txt +0 -0
  85. /tests/{final_command_test_report.md → ofweek_scrapy/ofweek_scrapy/__init__.py} +0 -0
tests/simple_selector_test.py ADDED
@@ -0,0 +1,208 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ Simplified selector test
+ """
+ import sys
+ import os
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+ # Import the required modules directly
+ from parsel import Selector, SelectorList
+
+
+ class MockResponse:
+     """Mock Response class for testing"""
+
+     def __init__(self, text):
+         self._text = text
+         self._selector_instance = None
+
+     @property
+     def text(self):
+         return self._text
+
+     @property
+     def _selector(self):
+         if self._selector_instance is None:
+             self._selector_instance = Selector(self.text)
+         return self._selector_instance
+
+     def xpath(self, query):
+         return self._selector.xpath(query)
+
+     def css(self, query):
+         return self._selector.css(query)
+
+     def _is_xpath(self, query):
+         return query.startswith(('/', '//', './'))
+
+     def _extract_text_from_elements(self, elements, join_str=" "):
+         texts = []
+         for element in elements:
+             if hasattr(element, 'xpath'):
+                 element_texts = element.xpath('.//text()').getall()
+             else:
+                 element_texts = [str(element)]
+             for text in element_texts:
+                 cleaned = text.strip()
+                 if cleaned:
+                     texts.append(cleaned)
+         return join_str.join(texts)
+
+     def extract_text(self, xpath_or_css, join_str=" ", default=''):
+         try:
+             elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
+             if not elements:
+                 return default
+             return self._extract_text_from_elements(elements, join_str)
+         except Exception:
+             return default
+
+     def extract_texts(self, xpath_or_css, join_str=" ", default=None):
+         if default is None:
+             default = []
+
+         try:
+             elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
+             if not elements:
+                 return default
+
+             result = []
+             for element in elements:
+                 if hasattr(element, 'xpath'):
+                     texts = element.xpath('.//text()').getall()
+                 else:
+                     texts = [str(element)]
+
+                 clean_texts = [text.strip() for text in texts if text.strip()]
+                 if clean_texts:
+                     result.append(join_str.join(clean_texts))
+
+             return result if result else default
+         except Exception:
+             return default
+
+     def extract_attr(self, xpath_or_css, attr_name, default=None):
+         try:
+             elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
+             if not elements:
+                 return default
+             if hasattr(elements, 'attrib'):
+                 return elements.attrib.get(attr_name, default)
+             elif len(elements) > 0 and hasattr(elements[0], 'attrib'):
+                 return elements[0].attrib.get(attr_name, default)
+             return default
+         except Exception:
+             return default
+
+     def extract_attrs(self, xpath_or_css, attr_name, default=None):
+         if default is None:
+             default = []
+
+         try:
+             elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
+             if not elements:
+                 return default
+
+             result = []
+             for element in elements:
+                 if hasattr(element, 'attrib'):
+                     attr_value = element.attrib.get(attr_name)
+                     if attr_value is not None:
+                         result.append(attr_value)
+
+             return result if result else default
+         except Exception:
+             return default
+
+
+ def test_selector_methods():
+     """Test the selector helper methods"""
+     print("Testing selector methods...")
+     print("=" * 50)
+
+     # Build test HTML
+     html_content = """
+     <html>
+     <head>
+         <title>Test Page</title>
+     </head>
+     <body>
+         <div class="content">
+             <h1>Main Heading</h1>
+             <p class="intro">Intro paragraph</p>
+             <ul class="list">
+                 <li>Item 1</li>
+                 <li>Item 2</li>
+                 <li>Item 3</li>
+             </ul>
+             <a href="https://example.com" class="link">Link text</a>
+             <img src="image.jpg" alt="Image description" class="image">
+         </div>
+     </body>
+     </html>
+     """
+
+     response = MockResponse(html_content)
+
+     # Test extract_text
+     print("1. Testing extract_text:")
+     title = response.extract_text('title')
+     print(f" Title: {title}")
+
+     h1_text = response.extract_text('.content h1')
+     print(f" H1 text: {h1_text}")
+
+     # XPath variant
+     title_xpath = response.extract_text('//title')
+     print(f" XPath title: {title_xpath}")
+
+     print()
+
+     # Test extract_texts
+     print("2. Testing extract_texts:")
+     list_items = response.extract_texts('.list li')
+     print(f" List items: {list_items}")
+
+     # XPath variant
+     list_items_xpath = response.extract_texts('//ul[@class="list"]/li')
+     print(f" XPath list items: {list_items_xpath}")
+
+     print()
+
+     # Test extract_attr
+     print("3. Testing extract_attr:")
+     link_href = response.extract_attr('.link', 'href')
+     print(f" Link href: {link_href}")
+
+     img_alt = response.extract_attr('.image', 'alt')
+     print(f" Image alt: {img_alt}")
+
+     # XPath variant
+     link_href_xpath = response.extract_attr('//a[@class="link"]', 'href')
+     print(f" XPath link href: {link_href_xpath}")
+
+     print()
+
+     # Test extract_attrs
+     print("4. Testing extract_attrs:")
+     all_links = response.extract_attrs('a', 'href')
+     print(f" All links: {all_links}")
+
+     print()
+
+     # Edge cases
+     print("5. Testing edge cases:")
+     non_exist = response.extract_text('.non-exist', default='default text')
+     print(f" Default for a missing element: {non_exist}")
+
+     non_exist_attr = response.extract_attr('.non-exist', 'href', default='default link')
+     print(f" Default for a missing attribute: {non_exist_attr}")
+
+     print()
+     print("All tests finished!")
+
+
+ if __name__ == '__main__':
+     test_selector_methods()
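
For context, the extraction helpers above are thin wrappers over parsel. A minimal standalone sketch of the same idea, using parsel directly with no crawlo imports (illustrative only, not the package's API):

```python
# Sketch of what extract_texts/extract_attr do under the hood (plain parsel).
from parsel import Selector

sel = Selector('<ul class="list"><li>Item 1</li><li>Item 2</li></ul>')

# extract_texts-style: per-element text nodes, whitespace-stripped, then joined
items = [" ".join(t.strip() for t in li.xpath('.//text()').getall() if t.strip())
         for li in sel.css('.list li')]
print(items)  # ['Item 1', 'Item 2']

# extract_attr-style: first matching element's attribute, with a default;
# SelectorList.attrib is the first element's attributes ({} when empty)
href = sel.css('a').attrib.get('href', 'default link')
print(href)  # 'default link' (no <a> in this fragment)
```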
tests/simple_url_test.py ADDED
@@ -0,0 +1,74 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ Simple test of the Response URL handling methods
+ """
+ import sys
+ import os
+
+ # Add the project root to the Python path
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+ # Import the required modules directly
+ from urllib.parse import urlparse, urlsplit, parse_qs, urlencode, quote, unquote, urldefrag
+
+
+ def test_url_methods():
+     """Test the URL handling methods"""
+     print("Testing Response URL handling methods")
+
+     # Test data
+     test_url = "https://example.com/test?param1=value1&param2=value2#section1"
+     print(f"Test URL: {test_url}")
+
+     # 1. urlparse
+     print("\n1. Testing urlparse:")
+     parsed = urlparse(test_url)
+     print(f" scheme: {parsed.scheme}")
+     print(f" netloc: {parsed.netloc}")
+     print(f" path: {parsed.path}")
+     print(f" query: {parsed.query}")
+     print(f" fragment: {parsed.fragment}")
+
+     # 2. urlsplit
+     print("\n2. Testing urlsplit:")
+     split_result = urlsplit(test_url)
+     print(f" scheme: {split_result.scheme}")
+     print(f" netloc: {split_result.netloc}")
+     print(f" path: {split_result.path}")
+     print(f" query: {split_result.query}")
+     print(f" fragment: {split_result.fragment}")
+
+     # 3. parse_qs
+     print("\n3. Testing parse_qs:")
+     query_dict = parse_qs(parsed.query)
+     print(f" Parsed result: {query_dict}")
+
+     # 4. urlencode
+     print("\n4. Testing urlencode:")
+     test_dict = {"name": "张三", "age": 25, "city": "北京"}  # non-ASCII values on purpose
+     encoded = urlencode(test_dict)
+     print(f" Encoded result: {encoded}")
+
+     # 5. quote/unquote
+     print("\n5. Testing quote/unquote:")
+     original = "hello world 你好"
+     quoted = quote(original)
+     print(f" Original string: {original}")
+     print(f" URL-encoded: {quoted}")
+
+     unquoted = unquote(quoted)
+     print(f" URL-decoded: {unquoted}")
+     print(f" Round-trip matches original: {original == unquoted}")
+
+     # 6. urldefrag
+     print("\n6. Testing urldefrag:")
+     url_without_frag, fragment = urldefrag(test_url)
+     print(f" URL without fragment: {url_without_frag}")
+     print(f" Fragment: {fragment}")
+
+     print("\nAll tests finished!")
+
+
+ if __name__ == '__main__':
+     test_url_methods()
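
One stdlib subtlety worth noting alongside this test: parse_qs returns every value as a list, so re-encoding its output requires urlencode(..., doseq=True). A small round-trip sketch over the same test URL:

```python
from urllib.parse import parse_qs, urlencode, urlsplit, urlunsplit

url = "https://example.com/test?param1=value1&param2=value2#section1"
parts = urlsplit(url)
params = parse_qs(parts.query)   # {'param1': ['value1'], 'param2': ['value2']} — values are lists
params["param3"] = ["value3"]    # add a parameter

# doseq=True unpacks the list values; without it the lists are
# stringified (e.g. param1=%5B%27value1%27%5D).
rebuilt = urlunsplit(parts._replace(query=urlencode(params, doseq=True)))
print(rebuilt)  # https://example.com/test?param1=value1&param2=value2&param3=value3#section1
```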
tests/test_crawler_process_import.py ADDED
@@ -0,0 +1,39 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ """
+ Test importing CrawlerProcess
+ """
+
+ import sys
+ import os
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+
+ def test_crawler_process_import():
+     """Test importing CrawlerProcess"""
+     print("Testing CrawlerProcess import...")
+
+     try:
+         # Try importing CrawlerProcess directly from crawlo
+         from crawlo import CrawlerProcess
+         print(f" Imported CrawlerProcess from crawlo: {CrawlerProcess}")
+
+         # Try creating an instance
+         process = CrawlerProcess()
+         print(f" Created a CrawlerProcess instance: {process}")
+
+         print("CrawlerProcess import test passed!")
+
+     except ImportError as e:
+         print(f" Import failed: {e}")
+         # If the direct import fails, fall back to the crawler module
+         try:
+             from crawlo.crawler import CrawlerProcess
+             print(f" Imported CrawlerProcess from crawlo.crawler: {CrawlerProcess}")
+         except ImportError as e2:
+             print(f" Import from the crawler module also failed: {e2}")
+
+
+ if __name__ == '__main__':
+     test_crawler_process_import()
tests/test_crawler_process_spider_modules.py ADDED
@@ -0,0 +1,48 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ """
+ Test CrawlerProcess integration with SPIDER_MODULES
+ """
+
+ import sys
+ import os
+ import asyncio
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+ from crawlo.crawler import CrawlerProcess
+ from crawlo.settings.setting_manager import SettingManager
+
+
+ def test_crawler_process_spider_modules():
+     """Test CrawlerProcess integration with SPIDER_MODULES"""
+     print("Testing CrawlerProcess integration with SPIDER_MODULES...")
+
+     # Build settings that include SPIDER_MODULES
+     settings = SettingManager({
+         'SPIDER_MODULES': ['tests.test_spiders'],
+         'SPIDER_LOADER_WARN_ONLY': True,
+         'CONCURRENCY': 1,
+         'LOG_LEVEL': 'INFO'
+     })
+
+     # Create a CrawlerProcess instance
+     process = CrawlerProcess(settings=settings)
+
+     # Fetch the registered spider names
+     spider_names = process.get_spider_names()
+     print(f"Discovered spiders: {spider_names}")
+
+     # Check whether a spider is registered
+     is_registered = process.is_spider_registered('test_spider')
+     print(f"Spider 'test_spider' registered: {is_registered}")
+
+     # Fetch the spider class
+     spider_class = process.get_spider_class('test_spider')
+     print(f"Class for spider 'test_spider': {spider_class}")
+
+     print("Test finished!")
+
+
+ if __name__ == '__main__':
+     test_crawler_process_spider_modules()
tests/test_edge_cases.py CHANGED
@@ -112,13 +112,15 @@ async def test_redis_queue_edge_cases():
      print(" Special-character URL test passed")
 
      # 4. Test priority (the higher-priority request should dequeue first)
-     high_priority_request = Request(url="https://high-priority.com", priority=1000)
-     low_priority_request = Request(url="https://low-priority.com", priority=-1000)
+     # Note: the Request constructor negates the given priority before storing it,
+     # so priority=1000 is stored as -1000 and priority=-1000 is stored as 1000.
+     high_priority_request = Request(url="https://high-priority.com", priority=1000)  # stored as -1000
+     low_priority_request = Request(url="https://low-priority.com", priority=-1000)  # stored as 1000
 
-     await queue.put(high_priority_request)  # higher-priority value
-     await queue.put(low_priority_request)  # lower-priority value
+     await queue.put(high_priority_request, priority=high_priority_request.priority)  # use the stored priority
+     await queue.put(low_priority_request, priority=low_priority_request.priority)  # use the stored priority
 
-     # The higher-priority request should dequeue first
+     # The higher-priority request should dequeue first (score = priority; the lowest score dequeues first)
      first = await queue.get(timeout=1.0)
      assert first is not None and first.url == "https://high-priority.com", "higher-priority request should dequeue first"
      print(" Priority test passed")
tests/test_encoding_core.py ADDED
@@ -0,0 +1,57 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ Core encoding-detection test
+ """
+ import sys
+ import os
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+ from crawlo.network.response import Response
+
+
+ def test_encoding_detection():
+     """Test the core encoding-detection behavior"""
+     print("Testing core encoding detection...")
+
+     # Encoding set on the Request takes priority
+     class MockRequest:
+         encoding = 'gbk'
+
+     response1 = Response(
+         url="https://example.com",
+         body=b'',
+         request=MockRequest()
+     )
+     print(f"Request encoding priority: {response1.encoding}")
+
+     # Encoding from the Content-Type header
+     response2 = Response(
+         url="https://example.com",
+         body=b'',
+         headers={"content-type": "text/html; charset=iso-8859-1"}
+     )
+     print(f"Content-Type encoding: {response2.encoding}")
+
+     # The declared-encoding helper
+     declared_enc = response2._declared_encoding()
+     print(f"Declared encoding: {declared_enc}")
+
+     # Default encoding
+     response3 = Response(
+         url="https://example.com",
+         body=b''
+     )
+     print(f"Default encoding: {response3.encoding}")
+
+     # Verify results
+     assert response1.encoding == 'gbk', f"Expected 'gbk', got {response1.encoding}"
+     assert response2.encoding == 'iso-8859-1', f"Expected 'iso-8859-1', got {response2.encoding}"
+     assert declared_enc == 'iso-8859-1', f"Expected 'iso-8859-1', got {declared_enc}"
+     assert response3.encoding == 'utf-8', f"Expected 'utf-8', got {response3.encoding}"
+
+     print("All tests passed!")
+
+
+ if __name__ == '__main__':
+     test_encoding_detection()
tests/test_encoding_detection.py ADDED
@@ -0,0 +1,127 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ Response encoding-detection optimization test
+ """
+ import unittest
+
+ # Mock a subset of the Response class for testing
+ class MockResponse:
+     def __init__(self, body, headers=None, request=None):
+         self.body = body
+         self.headers = headers or {}
+         self.request = request
+         self._DEFAULT_ENCODING = "ascii"
+
+     def _determine_encoding(self):
+         """Simplified encoding detection"""
+         # 1. Prefer the declared encoding
+         declared_encoding = self._declared_encoding()
+         if declared_encoding:
+             return declared_encoding
+
+         # 2. Fall back to utf-8
+         return 'utf-8'
+
+     def _declared_encoding(self):
+         """Return the declared encoding, if any"""
+         # 1. Encoding specified on the Request
+         if self.request and getattr(self.request, 'encoding', None):
+             return self.request.encoding
+
+         # 2. Detect from the Content-Type header
+         content_type = self.headers.get("content-type", "") or self.headers.get("Content-Type", "")
+         if content_type:
+             import re
+             charset_match = re.search(r"charset=([\w-]+)", content_type, re.I)
+             if charset_match:
+                 return charset_match.group(1).lower()
+
+         return None
+
+
+ class TestDetermineEncoding(unittest.TestCase):
+     """Encoding-detection test cases"""
+
+     def test_request_encoding_priority(self):
+         """Encoding set on the Request takes priority"""
+         class MockRequest:
+             encoding = 'gbk'
+
+         response = MockResponse(b'', request=MockRequest())
+         encoding = response._determine_encoding()
+         self.assertEqual(encoding, 'gbk')
+
+     def test_content_type_encoding(self):
+         """Encoding detected from the Content-Type header"""
+         response = MockResponse(
+             b'',
+             headers={"content-type": "text/html; charset=iso-8859-1"}
+         )
+         encoding = response._determine_encoding()
+         self.assertEqual(encoding, 'iso-8859-1')
+
+     def test_default_encoding(self):
+         """Default encoding"""
+         response = MockResponse(b'')
+         encoding = response._determine_encoding()
+         self.assertEqual(encoding, 'utf-8')
+
+     def test_case_insensitive_content_type(self):
+         """Content-Type matching is case-insensitive"""
+         response = MockResponse(
+             b'',
+             headers={"Content-Type": "text/html; CHARSET=UTF-8"}
+         )
+         encoding = response._determine_encoding()
+         self.assertEqual(encoding, 'utf-8')
+
+     def test_declared_encoding_with_request(self):
+         """Declared encoding: Request takes priority"""
+         class MockRequest:
+             encoding = 'gbk'
+
+         response = MockResponse(b'', request=MockRequest())
+         declared_encoding = response._declared_encoding()
+         self.assertEqual(declared_encoding, 'gbk')
+
+     def test_declared_encoding_with_content_type(self):
+         """Declared encoding: from Content-Type"""
+         response = MockResponse(
+             b'',
+             headers={"content-type": "text/html; charset=iso-8859-1"}
+         )
+         declared_encoding = response._declared_encoding()
+         self.assertEqual(declared_encoding, 'iso-8859-1')
+
+
+ def test_encoding_detection():
+     """Quick sanity check of the encoding detection"""
+     print("Testing encoding detection...")
+
+     # Encoding set on the Request takes priority
+     class MockRequest:
+         encoding = 'gbk'
+
+     response1 = MockResponse(b'', request=MockRequest())
+     encoding1 = response1._determine_encoding()
+     print(f"Request encoding priority: {encoding1}")
+
+     # Encoding from the Content-Type header
+     response2 = MockResponse(
+         b'',
+         headers={"content-type": "text/html; charset=iso-8859-1"}
+     )
+     encoding2 = response2._determine_encoding()
+     print(f"Content-Type encoding: {encoding2}")
+
+     # Default encoding
+     response3 = MockResponse(b'')
+     encoding3 = response3._determine_encoding()
+     print(f"Default encoding: {encoding3}")
+
+     print("Encoding-detection test finished!")
+
+
+ if __name__ == '__main__':
+     test_encoding_detection()
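
The charset sniffing in both encoding tests comes down to one case-insensitive regex over the Content-Type header. Isolated, it behaves as below; the helper name is ours, but the regex mirrors the mock above exactly:

```python
import re

def charset_from_content_type(content_type: str):
    """Extract and lowercase the charset parameter, as the mock above does."""
    match = re.search(r"charset=([\w-]+)", content_type, re.I)
    return match.group(1).lower() if match else None

print(charset_from_content_type("text/html; charset=iso-8859-1"))  # iso-8859-1
print(charset_from_content_type("text/html; CHARSET=UTF-8"))       # utf-8
print(charset_from_content_type("text/html"))                      # None
```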