crawlo-1.4.4-py3-none-any.whl → crawlo-1.4.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic.
- crawlo/__init__.py +11 -15
- crawlo/__version__.py +1 -1
- crawlo/commands/startproject.py +24 -0
- crawlo/core/engine.py +2 -2
- crawlo/core/scheduler.py +4 -4
- crawlo/crawler.py +8 -7
- crawlo/downloader/__init__.py +5 -2
- crawlo/extension/__init__.py +2 -2
- crawlo/filters/aioredis_filter.py +8 -1
- crawlo/filters/memory_filter.py +8 -1
- crawlo/initialization/built_in.py +13 -4
- crawlo/initialization/core.py +5 -4
- crawlo/interfaces.py +24 -0
- crawlo/middleware/__init__.py +7 -4
- crawlo/middleware/middleware_manager.py +15 -8
- crawlo/mode_manager.py +45 -11
- crawlo/network/response.py +374 -69
- crawlo/pipelines/mysql_pipeline.py +6 -6
- crawlo/pipelines/pipeline_manager.py +2 -2
- crawlo/project.py +2 -4
- crawlo/settings/default_settings.py +4 -0
- crawlo/task_manager.py +2 -2
- crawlo/templates/project/items.py.tmpl +2 -2
- crawlo/templates/project/middlewares.py.tmpl +9 -89
- crawlo/templates/project/pipelines.py.tmpl +8 -68
- crawlo/tools/__init__.py +0 -11
- crawlo/utils/__init__.py +17 -1
- crawlo/utils/db_helper.py +220 -319
- crawlo/utils/error_handler.py +313 -67
- crawlo/utils/fingerprint.py +3 -4
- crawlo/utils/misc.py +82 -0
- crawlo/utils/request.py +55 -66
- crawlo/utils/selector_helper.py +138 -0
- crawlo/utils/spider_loader.py +185 -45
- crawlo/utils/text_helper.py +95 -0
- crawlo-1.4.5.dist-info/METADATA +329 -0
- {crawlo-1.4.4.dist-info → crawlo-1.4.5.dist-info}/RECORD +76 -49
- tests/bug_check_test.py +251 -0
- tests/direct_selector_helper_test.py +97 -0
- tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -0
- tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -0
- tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -0
- tests/ofweek_scrapy/ofweek_scrapy/settings.py +85 -0
- tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -0
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +162 -0
- tests/ofweek_scrapy/scrapy.cfg +11 -0
- tests/performance_comparison.py +4 -5
- tests/simple_crawlo_test.py +1 -2
- tests/simple_follow_test.py +39 -0
- tests/simple_response_selector_test.py +95 -0
- tests/simple_selector_helper_test.py +155 -0
- tests/simple_selector_test.py +208 -0
- tests/simple_url_test.py +74 -0
- tests/test_crawler_process_import.py +39 -0
- tests/test_crawler_process_spider_modules.py +48 -0
- tests/test_edge_cases.py +7 -5
- tests/test_encoding_core.py +57 -0
- tests/test_encoding_detection.py +127 -0
- tests/test_factory_compatibility.py +197 -0
- tests/test_optimized_selector_naming.py +101 -0
- tests/test_priority_behavior.py +18 -18
- tests/test_response_follow.py +105 -0
- tests/test_response_selector_methods.py +93 -0
- tests/test_response_url_methods.py +71 -0
- tests/test_response_urljoin.py +87 -0
- tests/test_scrapy_style_encoding.py +113 -0
- tests/test_selector_helper.py +101 -0
- tests/test_selector_optimizations.py +147 -0
- tests/test_spider_loader.py +50 -0
- tests/test_spider_loader_comprehensive.py +70 -0
- tests/test_spiders/__init__.py +1 -0
- tests/test_spiders/test_spider.py +10 -0
- crawlo/tools/anti_crawler.py +0 -269
- crawlo/utils/class_loader.py +0 -26
- crawlo/utils/enhanced_error_handler.py +0 -357
- crawlo-1.4.4.dist-info/METADATA +0 -190
- tests/simple_log_test.py +0 -58
- tests/simple_test.py +0 -48
- tests/test_framework_logger.py +0 -67
- tests/test_framework_startup.py +0 -65
- tests/test_mode_change.py +0 -73
- {crawlo-1.4.4.dist-info → crawlo-1.4.5.dist-info}/WHEEL +0 -0
- {crawlo-1.4.4.dist-info → crawlo-1.4.5.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.4.dist-info → crawlo-1.4.5.dist-info}/top_level.txt +0 -0
- /tests/{final_command_test_report.md → ofweek_scrapy/ofweek_scrapy/__init__.py} +0 -0
tests/test_response_urljoin.py
@@ -0,0 +1,87 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+Tests for the Response.urljoin method
+"""
+import unittest
+from crawlo.network.response import Response
+
+
+class TestResponseUrljoin(unittest.TestCase):
+    """Test cases for Response.urljoin"""
+
+    def setUp(self):
+        """Prepare fixtures before each test"""
+        # Create a mock HTML response
+        html_content = """
+        <html>
+        <head>
+            <title>测试页面</title>
+        </head>
+        <body>
+            <div class="content">
+                <h1>主标题</h1>
+                <p class="intro">这是介绍段落</p>
+                <ul class="list">
+                    <li>项目1</li>
+                    <li>项目2</li>
+                    <li>项目3</li>
+                </ul>
+                <a href="https://example.com" class="link">链接文本</a>
+                <a href="/relative/path" class="relative-link">相对链接</a>
+                <img src="image.jpg" alt="图片描述" class="image">
+            </div>
+        </body>
+        </html>
+        """
+
+        self.response = Response(
+            url="https://example.com/test",
+            body=html_content.encode('utf-8'),
+            headers={"content-type": "text/html; charset=utf-8"}
+        )
+
+    def test_urljoin_absolute_url(self):
+        """Test handling of an absolute URL"""
+        absolute_url = self.response.urljoin("https://other.com/page")
+        self.assertEqual(absolute_url, "https://other.com/page")
+
+    def test_urljoin_relative_url(self):
+        """Test handling of relative URLs"""
+        relative_url = self.response.urljoin("/relative/path")
+        self.assertEqual(relative_url, "https://example.com/relative/path")
+
+        relative_url2 = self.response.urljoin("relative/path")
+        self.assertEqual(relative_url2, "https://example.com/relative/path")
+
+    def test_urljoin_complex_relative_url(self):
+        """Test handling of complex relative URLs"""
+        relative_url = self.response.urljoin("../other/path")
+        self.assertEqual(relative_url, "https://example.com/other/path")
+
+        relative_url2 = self.response.urljoin("./another/path")
+        self.assertEqual(relative_url2, "https://example.com/another/path")
+
+    def test_urljoin_with_query_params(self):
+        """Test handling of URLs with query parameters"""
+        url_with_params = self.response.urljoin("/path?param=value")
+        self.assertEqual(url_with_params, "https://example.com/path?param=value")
+
+        url_with_fragment = self.response.urljoin("/path#section")
+        self.assertEqual(url_with_fragment, "https://example.com/path#section")
+
+    def test_urljoin_empty_url(self):
+        """Test handling of an empty URL"""
+        empty_url = self.response.urljoin("")
+        self.assertEqual(empty_url, "https://example.com/test")
+
+    def test_urljoin_none_url(self):
+        """Test handling of a None URL"""
+        # urllib.parse.urljoin could coerce None to the string "None", so test the actual behaviour
+        none_url = self.response.urljoin(None)
+        # In practice urljoin(None) returns the base URL
+        # We accept this behaviour because it is consistent with urllib.parse.urljoin
+        self.assertEqual(none_url, "https://example.com/test")
+
+if __name__ == '__main__':
+    unittest.main()
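The expected values in the urljoin tests above line up with Python's standard urllib.parse.urljoin for the same base URL, which the in-file comments already point to. For reference, a standalone check against the standard library alone (this is only an illustration of those expected values, not crawlo code):

from urllib.parse import urljoin

base = "https://example.com/test"

print(urljoin(base, "https://other.com/page"))  # https://other.com/page
print(urljoin(base, "/relative/path"))          # https://example.com/relative/path
print(urljoin(base, "relative/path"))           # https://example.com/relative/path
print(urljoin(base, "../other/path"))           # https://example.com/other/path
print(urljoin(base, ""))                        # https://example.com/test (empty URL returns the base)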
tests/test_scrapy_style_encoding.py
@@ -0,0 +1,113 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+Scrapy-style encoding detection tests
+"""
+import unittest
+from crawlo.network.response import Response
+
+
+class TestScrapyStyleEncoding(unittest.TestCase):
+    """Test cases for Scrapy-style encoding detection"""
+
+    def test_request_encoding_priority(self):
+        """Test Request encoding priority"""
+        class MockRequest:
+            encoding = 'gbk'
+
+        response = Response(
+            url="https://example.com",
+            body=b'',
+            request=MockRequest()
+        )
+        self.assertEqual(response.encoding, 'gbk')
+
+    def test_declared_encoding_method(self):
+        """Test the _declared_encoding method"""
+        class MockRequest:
+            encoding = 'gbk'
+
+        response = Response(
+            url="https://example.com",
+            body=b'',
+            request=MockRequest()
+        )
+        self.assertEqual(response._declared_encoding(), 'gbk')
+
+    def test_content_type_encoding(self):
+        """Test encoding detection from the Content-Type header"""
+        response = Response(
+            url="https://example.com",
+            body=b'',
+            headers={"content-type": "text/html; charset=iso-8859-1"}
+        )
+        self.assertEqual(response.encoding, 'iso-8859-1')
+
+    def test_case_insensitive_content_type(self):
+        """Test that Content-Type header handling is case-insensitive"""
+        response = Response(
+            url="https://example.com",
+            body=b'',
+            headers={"Content-Type": "text/html; CHARSET=UTF-8"}
+        )
+        self.assertEqual(response.encoding, 'utf-8')
+
+    def test_default_encoding(self):
+        """Test the default encoding"""
+        response = Response(
+            url="https://example.com",
+            body=b''
+        )
+        self.assertEqual(response.encoding, 'utf-8')
+
+    def test_declared_encoding_priority(self):
+        """Test the priority of declared encodings"""
+        # Simulate the case where no request encoding is set
+        response = Response(
+            url="https://example.com",
+            body=b'',
+            headers={"content-type": "text/html; charset=iso-8859-1"}
+        )
+        # Should return the encoding from Content-Type
+        self.assertEqual(response._declared_encoding(), 'iso-8859-1')
+
+
+def test_scrapy_style_encoding():
+    """Exercise Scrapy-style encoding detection"""
+    print("Testing Scrapy-style encoding detection...")
+
+    # Test Request encoding priority
+    class MockRequest:
+        encoding = 'gbk'
+
+    response1 = Response(
+        url="https://example.com",
+        body=b'',
+        request=MockRequest()
+    )
+    print(f"Request encoding priority: {response1.encoding}")
+
+    # Test Content-Type header encoding
+    response2 = Response(
+        url="https://example.com",
+        body=b'',
+        headers={"content-type": "text/html; charset=iso-8859-1"}
+    )
+    print(f"Content-Type encoding: {response2.encoding}")
+
+    # Test the declared-encoding method
+    declared_enc = response2._declared_encoding()
+    print(f"Declared encoding: {declared_enc}")
+
+    # Test the default encoding
+    response3 = Response(
+        url="https://example.com",
+        body=b''
+    )
+    print(f"Default encoding: {response3.encoding}")
+
+    print("Scrapy-style encoding detection tests complete!")
+
+
+if __name__ == '__main__':
+    test_scrapy_style_encoding()
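Taken together, these assertions describe a Scrapy-style resolution order: an encoding set explicitly on the request wins, then a charset declared in the Content-Type header (matched case-insensitively), then a UTF-8 default. The sketch below only summarises that priority as the tests assert it; it is not crawlo's actual implementation:

def resolve_encoding(request_encoding, content_type, default="utf-8"):
    """Hypothetical helper mirroring the priority the tests above assert."""
    # 1. An encoding set explicitly on the request wins.
    if request_encoding:
        return request_encoding.lower()
    # 2. Otherwise use a charset declared in the Content-Type header.
    if content_type and "charset=" in content_type.lower():
        return content_type.lower().split("charset=", 1)[1].split(";", 1)[0].strip()
    # 3. Fall back to UTF-8.
    return default

assert resolve_encoding("gbk", None) == "gbk"
assert resolve_encoding(None, "text/html; charset=iso-8859-1") == "iso-8859-1"
assert resolve_encoding(None, "text/html; CHARSET=UTF-8") == "utf-8"
assert resolve_encoding(None, None) == "utf-8"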
tests/test_selector_helper.py
@@ -0,0 +1,101 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+Selector helper utility tests
+"""
+import sys
+import os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from crawlo.utils import (
+    extract_text,
+    extract_texts,
+    extract_attr,
+    extract_attrs,
+    is_xpath
+)
+from parsel import Selector, SelectorList
+
+
+def test_selector_helper():
+    """Exercise the selector helper utilities"""
+    print("Testing the selector helper utilities...")
+    print("=" * 50)
+
+    # Build test HTML
+    html_content = """
+    <html>
+    <head>
+        <title>测试页面</title>
+    </head>
+    <body>
+        <div class="content">
+            <h1>主标题</h1>
+            <p class="intro">介绍段落</p>
+            <ul class="list">
+                <li>项目1</li>
+                <li>项目2</li>
+                <li>项目3</li>
+            </ul>
+            <a href="https://example.com" class="link">链接文本</a>
+            <img src="image.jpg" alt="图片描述" class="image">
+        </div>
+    </body>
+    </html>
+    """
+
+    selector = Selector(text=html_content)
+
+    # Test is_xpath
+    print("1. Testing is_xpath:")
+    print(f"  starts with '/': {is_xpath('/')}")
+    print(f"  starts with '//': {is_xpath('//title')}")
+    print(f"  starts with './': {is_xpath('./div')}")
+    print(f"  starts with 'title': {is_xpath('title')}")
+    print()
+
+    # Test extract_text
+    print("2. Testing extract_text:")
+    title_elements = selector.css('title')
+    title_text = extract_text(title_elements)
+    print(f"  title text: {title_text}")
+
+    h1_elements = selector.css('.content h1')
+    h1_text = extract_text(h1_elements)
+    print(f"  h1 text: {h1_text}")
+    print()
+
+    # Test extract_texts_from_elements
+    print("3. Testing extract_texts_from_elements:")
+    li_elements = selector.css('.list li')
+    li_texts = extract_texts(li_elements)
+    print(f"  list item texts: {li_texts}")
+    print()
+
+    # Test extract_attr
+    print("4. Testing extract_attr:")
+    link_elements = selector.css('.link')
+    link_href = extract_attr(link_elements, 'href')
+    print(f"  link href: {link_href}")
+
+    img_elements = selector.css('.image')
+    img_alt = extract_attr(img_elements, 'alt')
+    print(f"  image alt: {img_alt}")
+    print()
+
+    # Test extract_attrs
+    print("5. Testing extract_attrs:")
+    all_links = selector.css('a')
+    all_hrefs = extract_attrs(all_links, 'href')
+    print(f"  all link hrefs: {all_hrefs}")
+
+    all_images = selector.css('img')
+    all_srcs = extract_attrs(all_images, 'src')
+    print(f"  all image srcs: {all_srcs}")
+    print()
+
+    print("All tests complete!")
+
+
+if __name__ == '__main__':
+    test_selector_helper()
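These helpers are exported from crawlo.utils and implemented in the new crawlo/utils/selector_helper.py listed above. As a rough sketch of the behaviour the script relies on, assuming the helpers wrap parsel and treat expressions starting with '/', '//' or './' as XPath (hypothetical re-implementations, not the library's code):

from parsel import Selector

def is_xpath_sketch(query: str) -> bool:
    # Assumed heuristic: XPath expressions start with '/', '//' or './'.
    return query.startswith(("/", "./"))

def extract_text_sketch(elements, default=""):
    # First non-empty text node of the matched elements, else the default.
    text = elements.css("::text").get() if elements else None
    return text.strip() if text else default

sel = Selector(text="<ul class='list'><li>a</li><li>b</li></ul>")
print(is_xpath_sketch("//ul/li"), is_xpath_sketch(".list li"))  # True False
print(extract_text_sketch(sel.css(".list li")))                 # a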
tests/test_selector_optimizations.py
@@ -0,0 +1,147 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+Selector method optimization tests
+"""
+import sys
+import os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from crawlo.network.response import Response
+
+
+def test_selector_optimizations():
+    """Exercise the optimized selector methods"""
+    print("Testing the optimized selector methods...")
+    print("=" * 50)
+
+    # Build a complex HTML response
+    html_content = """
+    <html>
+    <head>
+        <title>测试页面标题</title>
+    </head>
+    <body>
+        <div class="content">
+            <h1>主标题</h1>
+            <p class="intro">这是介绍段落</p>
+            <div class="article">
+                <p>第一段内容 <strong>粗体文本</strong> 普通文本</p>
+                <p>第二段内容 <em>斜体文本</em></p>
+            </div>
+            <ul class="list">
+                <li>项目1</li>
+                <li>项目2</li>
+                <li>项目3</li>
+            </ul>
+            <a href="https://example.com" class="link">链接文本</a>
+            <img src="image.jpg" alt="图片描述" class="image">
+            <div class="products">
+                <div class="product" data-id="1">
+                    <h2>产品A</h2>
+                    <p class="price">¥99.99</p>
+                </div>
+                <div class="product" data-id="2">
+                    <h2>产品B</h2>
+                    <p class="price">¥149.99</p>
+                </div>
+            </div>
+        </div>
+    </body>
+    </html>
+    """
+
+    response = Response(
+        url="https://example.com/test",
+        body=html_content.encode('utf-8'),
+        headers={"content-type": "text/html; charset=utf-8"}
+    )
+
+    # Test the extract_text method
+    print("1. Testing the extract_text method:")
+    title = response.extract_text('title')
+    print(f"  title: {title}")
+
+    h1_text = response.extract_text('.content h1')
+    print(f"  h1 text: {h1_text}")
+
+    # Test XPath
+    title_xpath = response.extract_text('//title')
+    print(f"  XPath title: {title_xpath}")
+
+    # Test complex text extraction
+    complex_text = response.extract_text('.article p', join_str=' ')
+    print(f"  complex text: {complex_text}")
+
+    print()
+
+    # Test the extract_texts method
+    print("2. Testing the extract_texts method:")
+    list_items = response.extract_texts('.list li')
+    print(f"  list items: {list_items}")
+
+    # Test XPath
+    list_items_xpath = response.extract_texts('//ul[@class="list"]/li')
+    print(f"  XPath list items: {list_items_xpath}")
+
+    # Test multiple elements
+    product_names = response.extract_texts('.product h2')
+    print(f"  product names: {product_names}")
+
+    product_prices = response.extract_texts('.price')
+    print(f"  product prices: {product_prices}")
+
+    print()
+
+    # Test the extract_attr method
+    print("3. Testing the extract_attr method:")
+    link_href = response.extract_attr('.link', 'href')
+    print(f"  link href: {link_href}")
+
+    img_alt = response.extract_attr('.image', 'alt')
+    print(f"  image alt: {img_alt}")
+
+    # Test XPath
+    link_href_xpath = response.extract_attr('//a[@class="link"]', 'href')
+    print(f"  XPath link href: {link_href_xpath}")
+
+    print()
+
+    # Test the extract_attrs method
+    print("4. Testing the extract_attrs method:")
+    product_ids = response.extract_attrs('.product', 'data-id')
+    print(f"  product IDs: {product_ids}")
+
+    # Test XPath
+    product_ids_xpath = response.extract_attrs('//div[@class="product"]', 'data-id')
+    print(f"  XPath product IDs: {product_ids_xpath}")
+
+    # Test all links
+    all_links = response.extract_attrs('a', 'href')
+    print(f"  all links: {all_links}")
+
+    print()
+
+    # Test edge cases
+    print("5. Testing edge cases:")
+    # Test default values
+    non_exist = response.extract_text('.non-exist', default='默认文本')
+    print(f"  default for a missing element: {non_exist}")
+
+    non_exist_attr = response.extract_attr('.non-exist', 'href', default='默认链接')
+    print(f"  default for a missing attribute: {non_exist_attr}")
+
+    print()
+
+    # Test an empty response
+    print("6. Testing an empty response:")
+    empty_response = Response(url="https://example.com/empty", body=b"")
+    empty_text = empty_response.extract_text('title', default='默认标题')
+    print(f"  empty response default: {empty_text}")
+
+    print()
+    print("All tests complete!")
+
+
+if __name__ == '__main__':
+    test_selector_optimizations()
tests/test_spider_loader.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Tests for SpiderLoader functionality
+"""
+
+import sys
+import os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from crawlo.utils.spider_loader import SpiderLoader
+from crawlo.settings.setting_manager import SettingManager
+
+
+def test_spider_loader():
+    """Exercise basic SpiderLoader functionality"""
+    print("Testing basic SpiderLoader functionality...")
+
+    # Create a simple settings object
+    settings = SettingManager({
+        'SPIDER_MODULES': ['tests.test_spiders'],
+        'SPIDER_LOADER_WARN_ONLY': True
+    })
+
+    # Create a SpiderLoader instance
+    loader = SpiderLoader.from_settings(settings)
+
+    # Test the list method
+    spider_names = loader.list()
+    print(f"Discovered spiders: {spider_names}")
+
+    # Test the load method
+    if spider_names:
+        spider_name = spider_names[0]
+        try:
+            spider_class = loader.load(spider_name)
+            print(f"Successfully loaded spider: {spider_name} -> {spider_class}")
+        except KeyError as e:
+            print(f"Failed to load spider: {e}")
+
+    # Test the get_all method
+    all_spiders = loader.get_all()
+    print(f"All spiders: {list(all_spiders.keys())}")
+
+    print("Tests complete!")
+
+
+if __name__ == '__main__':
+    test_spider_loader()
tests/test_spider_loader_comprehensive.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Comprehensive tests for SpiderLoader functionality
+"""
+
+import sys
+import os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from crawlo.utils.spider_loader import SpiderLoader
+from crawlo.crawler import CrawlerProcess
+from crawlo.settings.setting_manager import SettingManager
+
+
+def test_spider_loader_comprehensive():
+    """Comprehensive exercise of SpiderLoader functionality"""
+    print("Running comprehensive SpiderLoader tests...")
+
+    # 1. Test basic SpiderLoader functionality
+    print("\n1. Testing basic SpiderLoader functionality")
+    settings = SettingManager({
+        'SPIDER_MODULES': ['tests.test_spiders'],
+        'SPIDER_LOADER_WARN_ONLY': True
+    })
+
+    loader = SpiderLoader.from_settings(settings)
+    spider_names = loader.list()
+    print(f"  Discovered spiders: {spider_names}")
+
+    if spider_names:
+        spider_name = spider_names[0]
+        spider_class = loader.load(spider_name)
+        print(f"  Successfully loaded spider: {spider_name} -> {spider_class}")
+
+    # 2. Test CrawlerProcess integration with SPIDER_MODULES
+    print("\n2. Testing CrawlerProcess integration with SPIDER_MODULES")
+    process = CrawlerProcess(settings=settings)
+    process_spider_names = process.get_spider_names()
+    print(f"  Spiders discovered by CrawlerProcess: {process_spider_names}")
+
+    is_registered = process.is_spider_registered('test_spider')
+    print(f"  Is spider 'test_spider' registered: {is_registered}")
+
+    spider_class = process.get_spider_class('test_spider')
+    print(f"  Class of spider 'test_spider': {spider_class}")
+
+    # 3. Test the interface contract
+    print("\n3. Testing the interface contract")
+    # Check whether SpiderLoader implements the methods required by the ISpiderLoader interface
+    from crawlo.interfaces import ISpiderLoader
+    # Since ISpiderLoader is a Protocol, we cannot check it directly with isinstance;
+    # instead we check that the required methods are implemented
+    required_methods = ['load', 'list', 'find_by_request']
+    implements_interface = all(hasattr(loader, method) for method in required_methods)
+    print(f"  Does SpiderLoader implement the ISpiderLoader interface: {implements_interface}")
+
+    # 4. Test method presence
+    print("\n4. Testing method presence")
+    required_methods = ['load', 'list', 'find_by_request', 'get_all']
+    for method in required_methods:
+        has_method = hasattr(loader, method)
+        print(f"  Does SpiderLoader have the {method} method: {has_method}")
+
+    print("\nComprehensive tests complete!")
+
+
+if __name__ == '__main__':
+    test_spider_loader_comprehensive()
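The comments in step 3 of this script note that ISpiderLoader is a typing.Protocol, so the test probes for the required methods with hasattr rather than isinstance (a Protocol not decorated with @typing.runtime_checkable cannot be used in isinstance checks). A hypothetical shape for such an interface, inferred only from the methods the test looks for; the real definition lives in the new crawlo/interfaces.py:

from typing import List, Protocol, Type

class ISpiderLoaderSketch(Protocol):
    """Hypothetical spider-loader interface matching the methods probed above."""

    def load(self, spider_name: str) -> Type:
        """Return the spider class registered under the given name."""
        ...

    def list(self) -> List[str]:
        """Return the names of all discovered spiders."""
        ...

    def find_by_request(self, request: object) -> List[str]:
        """Return the names of spiders that can handle the given request."""
        ...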
tests/test_spiders/__init__.py
@@ -0,0 +1 @@
+# Test spiders module