crawlo-1.1.0-py3-none-any.whl → crawlo-1.1.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crawlo might be problematic.

Files changed (120)
  1. crawlo/__init__.py +34 -24
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +40 -40
  4. crawlo/commands/__init__.py +13 -13
  5. crawlo/commands/check.py +594 -155
  6. crawlo/commands/genspider.py +152 -111
  7. crawlo/commands/list.py +156 -119
  8. crawlo/commands/run.py +285 -170
  9. crawlo/commands/startproject.py +196 -101
  10. crawlo/commands/stats.py +188 -167
  11. crawlo/commands/utils.py +187 -0
  12. crawlo/config.py +280 -0
  13. crawlo/core/__init__.py +2 -2
  14. crawlo/core/engine.py +171 -158
  15. crawlo/core/enhanced_engine.py +190 -0
  16. crawlo/core/processor.py +40 -40
  17. crawlo/core/scheduler.py +162 -57
  18. crawlo/crawler.py +1028 -493
  19. crawlo/downloader/__init__.py +242 -78
  20. crawlo/downloader/aiohttp_downloader.py +212 -199
  21. crawlo/downloader/cffi_downloader.py +252 -277
  22. crawlo/downloader/httpx_downloader.py +257 -246
  23. crawlo/event.py +11 -11
  24. crawlo/exceptions.py +78 -78
  25. crawlo/extension/__init__.py +31 -31
  26. crawlo/extension/log_interval.py +49 -49
  27. crawlo/extension/log_stats.py +44 -44
  28. crawlo/extension/logging_extension.py +34 -34
  29. crawlo/filters/__init__.py +154 -37
  30. crawlo/filters/aioredis_filter.py +242 -150
  31. crawlo/filters/memory_filter.py +269 -202
  32. crawlo/items/__init__.py +23 -23
  33. crawlo/items/base.py +21 -21
  34. crawlo/items/fields.py +53 -53
  35. crawlo/items/items.py +104 -104
  36. crawlo/middleware/__init__.py +21 -21
  37. crawlo/middleware/default_header.py +32 -32
  38. crawlo/middleware/download_delay.py +28 -28
  39. crawlo/middleware/middleware_manager.py +135 -135
  40. crawlo/middleware/proxy.py +248 -245
  41. crawlo/middleware/request_ignore.py +30 -30
  42. crawlo/middleware/response_code.py +18 -18
  43. crawlo/middleware/response_filter.py +26 -26
  44. crawlo/middleware/retry.py +125 -90
  45. crawlo/mode_manager.py +201 -0
  46. crawlo/network/__init__.py +21 -7
  47. crawlo/network/request.py +311 -203
  48. crawlo/network/response.py +269 -166
  49. crawlo/pipelines/__init__.py +13 -13
  50. crawlo/pipelines/console_pipeline.py +39 -39
  51. crawlo/pipelines/csv_pipeline.py +317 -0
  52. crawlo/pipelines/json_pipeline.py +219 -0
  53. crawlo/pipelines/mongo_pipeline.py +116 -116
  54. crawlo/pipelines/mysql_pipeline.py +195 -195
  55. crawlo/pipelines/pipeline_manager.py +56 -56
  56. crawlo/project.py +153 -0
  57. crawlo/queue/pqueue.py +37 -0
  58. crawlo/queue/queue_manager.py +304 -0
  59. crawlo/queue/redis_priority_queue.py +192 -0
  60. crawlo/settings/__init__.py +7 -7
  61. crawlo/settings/default_settings.py +226 -169
  62. crawlo/settings/setting_manager.py +99 -99
  63. crawlo/spider/__init__.py +639 -129
  64. crawlo/stats_collector.py +59 -59
  65. crawlo/subscriber.py +106 -106
  66. crawlo/task_manager.py +30 -27
  67. crawlo/templates/crawlo.cfg.tmpl +10 -10
  68. crawlo/templates/project/__init__.py.tmpl +3 -3
  69. crawlo/templates/project/items.py.tmpl +17 -17
  70. crawlo/templates/project/middlewares.py.tmpl +87 -76
  71. crawlo/templates/project/pipelines.py.tmpl +336 -64
  72. crawlo/templates/project/run.py.tmpl +239 -0
  73. crawlo/templates/project/settings.py.tmpl +248 -54
  74. crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
  75. crawlo/templates/spider/spider.py.tmpl +178 -32
  76. crawlo/utils/__init__.py +7 -7
  77. crawlo/utils/controlled_spider_mixin.py +336 -0
  78. crawlo/utils/date_tools.py +233 -233
  79. crawlo/utils/db_helper.py +343 -343
  80. crawlo/utils/func_tools.py +82 -82
  81. crawlo/utils/large_scale_config.py +287 -0
  82. crawlo/utils/large_scale_helper.py +344 -0
  83. crawlo/utils/log.py +128 -128
  84. crawlo/utils/queue_helper.py +176 -0
  85. crawlo/utils/request.py +267 -267
  86. crawlo/utils/request_serializer.py +220 -0
  87. crawlo/utils/spider_loader.py +62 -62
  88. crawlo/utils/system.py +11 -11
  89. crawlo/utils/tools.py +4 -4
  90. crawlo/utils/url.py +39 -39
  91. crawlo-1.1.2.dist-info/METADATA +567 -0
  92. crawlo-1.1.2.dist-info/RECORD +108 -0
  93. examples/__init__.py +7 -0
  94. tests/__init__.py +7 -7
  95. tests/test_final_validation.py +154 -0
  96. tests/test_proxy_health_check.py +32 -32
  97. tests/test_proxy_middleware_integration.py +136 -136
  98. tests/test_proxy_providers.py +56 -56
  99. tests/test_proxy_stats.py +19 -19
  100. tests/test_proxy_strategies.py +59 -59
  101. tests/test_redis_config.py +29 -0
  102. tests/test_redis_queue.py +225 -0
  103. tests/test_request_serialization.py +71 -0
  104. tests/test_scheduler.py +242 -0
  105. crawlo/pipelines/mysql_batch_pipline.py +0 -273
  106. crawlo/utils/concurrency_manager.py +0 -125
  107. crawlo/utils/pqueue.py +0 -174
  108. crawlo/utils/project.py +0 -197
  109. crawlo-1.1.0.dist-info/METADATA +0 -49
  110. crawlo-1.1.0.dist-info/RECORD +0 -97
  111. examples/gxb/items.py +0 -36
  112. examples/gxb/run.py +0 -16
  113. examples/gxb/settings.py +0 -72
  114. examples/gxb/spider/__init__.py +0 -2
  115. examples/gxb/spider/miit_spider.py +0 -180
  116. examples/gxb/spider/telecom_device.py +0 -129
  117. {examples/gxb → crawlo/queue}/__init__.py +0 -0
  118. {crawlo-1.1.0.dist-info → crawlo-1.1.2.dist-info}/WHEEL +0 -0
  119. {crawlo-1.1.0.dist-info → crawlo-1.1.2.dist-info}/entry_points.txt +0 -0
  120. {crawlo-1.1.0.dist-info → crawlo-1.1.2.dist-info}/top_level.txt +0 -0
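
Because a wheel is just a zip archive, the file-level part of this comparison can be reproduced locally with the standard library alone. The sketch below is illustrative only and not part of crawlo; the wheel filenames are assumed download paths in the current directory.

from zipfile import ZipFile

def wheel_files(path: str) -> set:
    """Return the set of archive member names contained in a wheel (zip) file."""
    with ZipFile(path) as wheel:
        return set(wheel.namelist())

old = wheel_files("crawlo-1.1.0-py3-none-any.whl")   # assumed local path
new = wheel_files("crawlo-1.1.2-py3-none-any.whl")   # assumed local path

print("added:  ", sorted(new - old))    # e.g. crawlo/queue/queue_manager.py
print("removed:", sorted(old - new))    # e.g. examples/gxb/spider/telecom_device.py
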
examples/gxb/spider/telecom_device.py
@@ -1,129 +0,0 @@
-# -*- coding: utf-8 -*-
-import json
-from crawlo import Spider, Request
-from crawlo.utils.log import get_logger
-
-from examples.gxb.items import TelecomLicenseItem
-from examples.gxb.settings import HEADERS, COOKIES
-
-
-logger = get_logger(__name__)
-
-class TelecomDeviceLicensesSpider(Spider):
-    name = 'telecom_device'
-    allowed_domains = ['ythzxfw.miit.gov.cn']
-    # Base URL of the API
-    base_api_url = 'https://ythzxfw.miit.gov.cn/oldyth/user-center/tbAppSearch/selectResult'
-
-    # Configuration: start and end page numbers
-    start_page = 1
-    end_page = 26405
-    data = {
-        "categoryId": "144",
-        "currentPage": 1,
-        "pageSize": 5,
-        "searchContent": ""
-    }
-
-
-    def start_requests(self):
-        """Start from the first page and request each page in turn."""
-
-        yield Request(
-            url=self.base_api_url,
-            method='POST',
-            headers=HEADERS,
-            cookies=COOKIES,
-            body=json.dumps(self.data),
-            callback=self.parse,
-            meta={'page': 1},
-            dont_filter=True
-        )
-
-
-    def parse(self, response):
-        """
-        Parse the API response
-        :param response: Response object
-        """
-        page = response.meta['page']
-        self.logger.info(f"Parsing page {page}, status code: {response.status_code}")
-
-        try:
-            json_data = response.json()
-
-            if not json_data.get('success'):
-                self.logger.error(f"Request for page {page} failed: {json_data.get('msg', 'Unknown error')}")
-                return
-
-            # Extract the total page/record count (optional, used for validation)
-            total_records = json_data.get("params", {}).get("tbAppArticle", {}).get("total", 0)
-            self.logger.info(f"Page {page}, total records: {total_records}")
-
-            article_list = json_data.get("params", {}).get("tbAppArticle", {}).get("list", [])
-
-            if not article_list:
-                self.logger.warning(f"No data found on page {page}")
-                return
-
-            self.logger.info(f"Successfully parsed {len(article_list)} records from page {page}")
-
-            # Yield each record as a separate item
-            for item in article_list:
-                # Clean the data: strip HTML tags
-                cleaned_item = self.clean_item(item)
-                item = TelecomLicenseItem()
-                item['license_number'] = cleaned_item.get('articleField01')
-                item['device_name'] = cleaned_item.get('articleField02')
-                item['device_model'] = cleaned_item.get('articleField03')
-                item['applicant'] = cleaned_item.get('articleField04')
-                item['manufacturer'] = cleaned_item.get('articleField05')
-                item['issue_date'] = cleaned_item.get('articleField06')
-                item['expiry_date'] = cleaned_item.get('articleField07')
-                item['certificate_type'] = cleaned_item.get('articleField08')
-                item['remarks'] = cleaned_item.get('articleField09')
-                item['certificate_status'] = cleaned_item.get('articleField10')
-                item['origin'] = cleaned_item.get('articleField11')
-                item['article_id'] = cleaned_item.get('articleId')
-                item['article_edit_date'] = cleaned_item.get('articleEdate')
-                item['create_time'] = cleaned_item.get('createTime')
-                yield item
-
-            # --- Automatic pagination ---
-            # Check whether there is a next page
-            # Method 1: compare the current page number with the preset total page count
-            if page < self.end_page:
-                next_page = page + 1
-                self.data['currentPage'] = next_page
-                self.logger.debug(f"Preparing to crawl the next page: {next_page}")
-                yield Request(
-                    url=self.base_api_url,
-                    method='POST',
-                    headers=HEADERS,
-                    cookies=COOKIES,
-                    body=json.dumps(self.data),
-                    callback=self.parse,
-                    meta={'page': next_page},
-                    dont_filter=True
-                )
-
-        except Exception as e:
-            self.logger.error(f"Failed to parse the response for page {page}: {e}", exc_info=True)
-
-    @staticmethod
-    def clean_item(item: dict) -> dict:
-        """
-        Clean a single record, e.g. strip HTML tags
-        :param item: raw dict
-        :return: cleaned dict
-        """
-        import re
-        html_tag_re = re.compile(r'<[^>]+>')
-        cleaned = {}
-        for k, v in item.items():
-            if isinstance(v, str):
-                # Strip HTML tags and trim leading/trailing whitespace
-                cleaned[k] = html_tag_re.sub('', v).strip()
-            else:
-                cleaned[k] = v
-        return cleaned
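
For reference, the tag-stripping regex in clean_item above can be exercised on its own. The record below is invented purely for illustration; the field name mirrors the API payload the deleted spider handled.

import re

html_tag_re = re.compile(r'<[^>]+>')   # same pattern as clean_item

raw = {"articleField02": " <span class='hl'>5G base station</span> ", "articleId": 12345}
cleaned = {k: html_tag_re.sub('', v).strip() if isinstance(v, str) else v
           for k, v in raw.items()}
print(cleaned)  # {'articleField02': '5G base station', 'articleId': 12345}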