crawlo 1.0.9-py3-none-any.whl → 1.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic.
- crawlo/__init__.py +33 -24
- crawlo/__version__.py +1 -1
- crawlo/cli.py +40 -40
- crawlo/commands/__init__.py +13 -13
- crawlo/commands/check.py +594 -106
- crawlo/commands/genspider.py +125 -110
- crawlo/commands/list.py +147 -92
- crawlo/commands/run.py +286 -181
- crawlo/commands/startproject.py +111 -101
- crawlo/commands/stats.py +188 -59
- crawlo/core/__init__.py +2 -2
- crawlo/core/engine.py +158 -158
- crawlo/core/processor.py +40 -40
- crawlo/core/scheduler.py +57 -57
- crawlo/crawler.py +494 -492
- crawlo/downloader/__init__.py +78 -78
- crawlo/downloader/aiohttp_downloader.py +199 -199
- crawlo/downloader/cffi_downloader.py +242 -277
- crawlo/downloader/httpx_downloader.py +246 -246
- crawlo/event.py +11 -11
- crawlo/exceptions.py +78 -78
- crawlo/extension/__init__.py +31 -31
- crawlo/extension/log_interval.py +49 -49
- crawlo/extension/log_stats.py +44 -44
- crawlo/extension/logging_extension.py +34 -34
- crawlo/filters/__init__.py +37 -37
- crawlo/filters/aioredis_filter.py +150 -150
- crawlo/filters/memory_filter.py +202 -202
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +21 -21
- crawlo/items/fields.py +53 -53
- crawlo/items/items.py +104 -104
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +32 -32
- crawlo/middleware/download_delay.py +28 -28
- crawlo/middleware/middleware_manager.py +135 -135
- crawlo/middleware/proxy.py +245 -245
- crawlo/middleware/request_ignore.py +30 -30
- crawlo/middleware/response_code.py +18 -18
- crawlo/middleware/response_filter.py +26 -26
- crawlo/middleware/retry.py +90 -90
- crawlo/network/__init__.py +7 -7
- crawlo/network/request.py +203 -203
- crawlo/network/response.py +166 -166
- crawlo/pipelines/__init__.py +13 -13
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/mongo_pipeline.py +116 -116
- crawlo/pipelines/mysql_batch_pipline.py +272 -272
- crawlo/pipelines/mysql_pipeline.py +195 -195
- crawlo/pipelines/pipeline_manager.py +56 -56
- crawlo/project.py +153 -0
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +166 -168
- crawlo/settings/setting_manager.py +99 -99
- crawlo/spider/__init__.py +129 -129
- crawlo/stats_collector.py +59 -59
- crawlo/subscriber.py +106 -106
- crawlo/task_manager.py +27 -27
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +3 -3
- crawlo/templates/project/items.py.tmpl +17 -17
- crawlo/templates/project/middlewares.py.tmpl +75 -75
- crawlo/templates/project/pipelines.py.tmpl +63 -63
- crawlo/templates/project/settings.py.tmpl +54 -54
- crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
- crawlo/templates/spider/spider.py.tmpl +31 -31
- crawlo/utils/__init__.py +7 -7
- crawlo/utils/date_tools.py +233 -233
- crawlo/utils/db_helper.py +343 -343
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/log.py +128 -128
- crawlo/utils/pqueue.py +173 -173
- crawlo/utils/request.py +267 -267
- crawlo/utils/spider_loader.py +62 -62
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +4 -4
- crawlo/utils/url.py +39 -39
- crawlo-1.1.1.dist-info/METADATA +220 -0
- crawlo-1.1.1.dist-info/RECORD +100 -0
- examples/__init__.py +7 -0
- examples/baidu_spider/__init__.py +7 -0
- examples/baidu_spider/demo.py +94 -0
- examples/baidu_spider/items.py +46 -0
- examples/baidu_spider/middleware.py +49 -0
- examples/baidu_spider/pipeline.py +55 -0
- examples/baidu_spider/run.py +27 -0
- examples/baidu_spider/settings.py +121 -0
- examples/baidu_spider/spiders/__init__.py +7 -0
- examples/baidu_spider/spiders/bai_du.py +61 -0
- examples/baidu_spider/spiders/miit.py +159 -0
- examples/baidu_spider/spiders/sina.py +79 -0
- tests/__init__.py +7 -7
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware_integration.py +136 -136
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- crawlo/utils/concurrency_manager.py +0 -125
- crawlo/utils/project.py +0 -197
- crawlo-1.0.9.dist-info/METADATA +0 -49
- crawlo-1.0.9.dist-info/RECORD +0 -97
- examples/gxb/__init__.py +0 -0
- examples/gxb/items.py +0 -36
- examples/gxb/run.py +0 -16
- examples/gxb/settings.py +0 -72
- examples/gxb/spider/__init__.py +0 -0
- examples/gxb/spider/miit_spider.py +0 -180
- examples/gxb/spider/telecom_device.py +0 -129
- {crawlo-1.0.9.dist-info → crawlo-1.1.1.dist-info}/WHEEL +0 -0
- {crawlo-1.0.9.dist-info → crawlo-1.1.1.dist-info}/entry_points.txt +0 -0
- {crawlo-1.0.9.dist-info → crawlo-1.1.1.dist-info}/top_level.txt +0 -0
examples/gxb/spider/telecom_device.py (deleted)
@@ -1,129 +0,0 @@
-# -*- coding: utf-8 -*-
-import json
-from crawlo import Spider, Request
-from crawlo.utils.log import get_logger
-
-from examples.gxb.items import TelecomLicenseItem
-from examples.gxb.settings import HEADERS, COOKIES
-
-
-logger = get_logger(__name__)
-
-class TelecomDeviceLicensesSpider(Spider):
-    name = 'telecom_device'
-    allowed_domains = ['ythzxfw.miit.gov.cn']
-    # Base URL of the API
-    base_api_url = 'https://ythzxfw.miit.gov.cn/oldyth/user-center/tbAppSearch/selectResult'
-
-    # Configuration: first and last page numbers to crawl
-    start_page = 1
-    end_page = 26405
-    data = {
-        "categoryId": "144",
-        "currentPage": 1,
-        "pageSize": 5,
-        "searchContent": ""
-    }
-
-
-    def start_requests(self):
-        """Start from the first page and request the pages one by one."""
-
-        yield Request(
-            url=self.base_api_url,
-            method='POST',
-            headers=HEADERS,
-            cookies=COOKIES,
-            body=json.dumps(self.data),
-            callback=self.parse,
-            meta={'page': 1},
-            dont_filter=True
-        )
-
-
-    def parse(self, response):
-        """
-        Parse the API response.
-        :param response: Response object
-        """
-        page = response.meta['page']
-        self.logger.info(f"Parsing page {page}, status code: {response.status_code}")
-
-        try:
-            json_data = response.json()
-
-            if not json_data.get('success'):
-                self.logger.error(f"Request for page {page} failed: {json_data.get('msg', 'Unknown error')}")
-                return
-
-            # Extract the total record count (optional, used for validation)
-            total_records = json_data.get("params", {}).get("tbAppArticle", {}).get("total", 0)
-            self.logger.info(f"Page {page}, total records: {total_records}")
-
-            article_list = json_data.get("params", {}).get("tbAppArticle", {}).get("list", [])
-
-            if not article_list:
-                self.logger.warning(f"No data found on page {page}")
-                return
-
-            self.logger.info(f"Parsed {len(article_list)} records from page {page}")
-
-            # Yield each record as an independent item
-            for item in article_list:
-                # Clean the data: strip HTML tags
-                cleaned_item = self.clean_item(item)
-                item = TelecomLicenseItem()
-                item['license_number'] = cleaned_item.get('articleField01')
-                item['device_name'] = cleaned_item.get('articleField02')
-                item['device_model'] = cleaned_item.get('articleField03')
-                item['applicant'] = cleaned_item.get('articleField04')
-                item['manufacturer'] = cleaned_item.get('articleField05')
-                item['issue_date'] = cleaned_item.get('articleField06')
-                item['expiry_date'] = cleaned_item.get('articleField07')
-                item['certificate_type'] = cleaned_item.get('articleField08')
-                item['remarks'] = cleaned_item.get('articleField09')
-                item['certificate_status'] = cleaned_item.get('articleField10')
-                item['origin'] = cleaned_item.get('articleField11')
-                item['article_id'] = cleaned_item.get('articleId')
-                item['article_edit_date'] = cleaned_item.get('articleEdate')
-                item['create_time'] = cleaned_item.get('createTime')
-                yield item
-
-            # --- Automatic pagination ---
-            # Check whether there is a next page
-            # Method 1: compare the current page number against the preset total page count
-            if page < self.end_page:
-                next_page = page + 1
-                self.data['currentPage'] = next_page
-                self.logger.debug(f"Queueing the next page: {next_page}")
-                yield Request(
-                    url=self.base_api_url,
-                    method='POST',
-                    headers=HEADERS,
-                    cookies=COOKIES,
-                    body=json.dumps(self.data),
-                    callback=self.parse,
-                    meta={'page': next_page},
-                    dont_filter=True
-                )
-
-        except Exception as e:
-            self.logger.error(f"Failed to parse the response for page {page}: {e}", exc_info=True)
-
-    @staticmethod
-    def clean_item(item: dict) -> dict:
-        """
-        Clean a single record, e.g. strip HTML tags.
-        :param item: raw dict
-        :return: cleaned dict
-        """
-        import re
-        html_tag_re = re.compile(r'<[^>]+>')
-        cleaned = {}
-        for k, v in item.items():
-            if isinstance(v, str):
-                # Strip HTML tags and trim surrounding whitespace
-                cleaned[k] = html_tag_re.sub('', v).strip()
-            else:
-                cleaned[k] = v
-        return cleaned
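One detail worth flagging in the removed pagination logic above: parse mutates the shared class-level self.data dict and then serializes it into the follow-up request, so if several pages were ever scheduled concurrently they could overwrite each other's currentPage. Below is a minimal, hedged sketch of the same step that builds a per-request copy of the payload instead; it uses only the Request keyword arguments already shown in the removed spider, and the helper name next_page_request is hypothetical, introduced here purely for illustration. It is meant to live inside the same spider class, not presented as part of the crawlo API or of the released package.

# Hedged sketch (not part of the package diff): the same pagination step, written as a
# helper method on the spider class above. It builds a fresh payload per request instead
# of mutating the shared class-level `data` dict. `next_page_request` is a hypothetical
# name; every Request argument used here already appears in the removed spider.
def next_page_request(self, page: int):
    next_page = page + 1
    payload = {**self.data, "currentPage": next_page}  # per-request copy; self.data stays untouched
    return Request(
        url=self.base_api_url,
        method='POST',
        headers=HEADERS,
        cookies=COOKIES,
        body=json.dumps(payload),
        callback=self.parse,
        meta={'page': next_page},
        dont_filter=True,
    )

With such a helper, the pagination branch in parse would reduce to yield self.next_page_request(page).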