crawlo 1.0.4__py3-none-any.whl → 1.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this release of crawlo has been flagged as potentially problematic.
- crawlo/__init__.py +25 -9
- crawlo/__version__.py +1 -1
- crawlo/core/__init__.py +2 -2
- crawlo/core/engine.py +158 -158
- crawlo/core/processor.py +40 -40
- crawlo/core/scheduler.py +57 -57
- crawlo/crawler.py +424 -242
- crawlo/downloader/__init__.py +78 -78
- crawlo/downloader/aiohttp_downloader.py +200 -259
- crawlo/downloader/cffi_downloader.py +277 -0
- crawlo/downloader/httpx_downloader.py +246 -187
- crawlo/event.py +11 -11
- crawlo/exceptions.py +73 -64
- crawlo/extension/__init__.py +31 -31
- crawlo/extension/log_interval.py +49 -49
- crawlo/extension/log_stats.py +44 -44
- crawlo/extension/logging_extension.py +35 -0
- crawlo/filters/__init__.py +37 -37
- crawlo/filters/aioredis_filter.py +150 -150
- crawlo/filters/memory_filter.py +202 -202
- crawlo/items/__init__.py +62 -62
- crawlo/items/items.py +115 -119
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +32 -32
- crawlo/middleware/download_delay.py +28 -28
- crawlo/middleware/middleware_manager.py +135 -140
- crawlo/middleware/proxy.py +246 -0
- crawlo/middleware/request_ignore.py +30 -30
- crawlo/middleware/response_code.py +18 -18
- crawlo/middleware/response_filter.py +26 -26
- crawlo/middleware/retry.py +90 -90
- crawlo/network/__init__.py +7 -7
- crawlo/network/request.py +203 -204
- crawlo/network/response.py +166 -166
- crawlo/pipelines/__init__.py +13 -13
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/mongo_pipeline.py +116 -116
- crawlo/pipelines/mysql_batch_pipline.py +273 -134
- crawlo/pipelines/mysql_pipeline.py +195 -195
- crawlo/pipelines/pipeline_manager.py +56 -56
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +169 -94
- crawlo/settings/setting_manager.py +99 -99
- crawlo/spider/__init__.py +41 -36
- crawlo/stats_collector.py +59 -59
- crawlo/subscriber.py +106 -106
- crawlo/task_manager.py +27 -27
- crawlo/templates/item_template.tmpl +21 -21
- crawlo/templates/project_template/main.py +32 -32
- crawlo/templates/project_template/setting.py +189 -189
- crawlo/templates/spider_template.tmpl +30 -30
- crawlo/utils/__init__.py +7 -7
- crawlo/utils/concurrency_manager.py +124 -124
- crawlo/utils/date_tools.py +233 -177
- crawlo/utils/db_helper.py +344 -0
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/log.py +129 -39
- crawlo/utils/pqueue.py +173 -173
- crawlo/utils/project.py +59 -59
- crawlo/utils/request.py +267 -122
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +5 -303
- crawlo/utils/url.py +39 -39
- {crawlo-1.0.4.dist-info → crawlo-1.0.5.dist-info}/METADATA +49 -48
- crawlo-1.0.5.dist-info/RECORD +84 -0
- {crawlo-1.0.4.dist-info → crawlo-1.0.5.dist-info}/top_level.txt +1 -0
- examples/__init__.py +0 -0
- examples/gxb/__init__.py +0 -0
- examples/gxb/items.py +36 -0
- examples/gxb/run.py +15 -0
- examples/gxb/settings.py +71 -0
- examples/gxb/spider/__init__.py +0 -0
- examples/gxb/spider/miit_spider.py +180 -0
- examples/gxb/spider/telecom_device_licenses.py +129 -0
- tests/__init__.py +7 -7
- tests/test_proxy_health_check.py +33 -0
- tests/test_proxy_middleware_integration.py +137 -0
- tests/test_proxy_providers.py +57 -0
- tests/test_proxy_stats.py +20 -0
- tests/test_proxy_strategies.py +60 -0
- crawlo/downloader/playwright_downloader.py +0 -161
- crawlo-1.0.4.dist-info/RECORD +0 -79
- tests/baidu_spider/__init__.py +0 -7
- tests/baidu_spider/demo.py +0 -94
- tests/baidu_spider/items.py +0 -25
- tests/baidu_spider/middleware.py +0 -49
- tests/baidu_spider/pipeline.py +0 -55
- tests/baidu_spider/request_fingerprints.txt +0 -9
- tests/baidu_spider/run.py +0 -27
- tests/baidu_spider/settings.py +0 -80
- tests/baidu_spider/spiders/__init__.py +0 -7
- tests/baidu_spider/spiders/bai_du.py +0 -61
- tests/baidu_spider/spiders/sina.py +0 -79
- {crawlo-1.0.4.dist-info → crawlo-1.0.5.dist-info}/WHEEL +0 -0
- {crawlo-1.0.4.dist-info → crawlo-1.0.5.dist-info}/entry_points.txt +0 -0
examples/gxb/settings.py
ADDED
@@ -0,0 +1,71 @@
+import platform
+
+PROXY_ENABLED = False
+
+# API address
+PROXY_API_URL = 'http://123.56.42.142:5000/proxy/getitem/'
+
+# Extraction method (choose according to the actual response structure)
+PROXY_EXTRACTOR = "proxy"
+# or
+# from utils.proxy_extractors import custom_extractor_proxy
+# PROXY_EXTRACTOR = custom_extractor_proxy
+
+# Refresh interval
+PROXY_REFRESH_INTERVAL = 5
+
+CONCURRENCY = 3
+
+# Timeout
+PROXY_API_TIMEOUT = 10
+
+if platform.system() == "Windows":
+    MYSQL_HOST = "pc-2ze9oh2diu5e5firh.rwlb.rds.aliyuncs.com"
+else:
+    MYSQL_HOST = "tianmai-k8s-dmadmin-x.rwlb.rds.aliyuncs.com"
+
+# Database port
+MYSQL_PORT = 3306
+# Database user
+MYSQL_USER = "data_collection"
+# Database password
+MYSQL_PASSWORD = "CRNabzFQ2H"
+# Database name
+MYSQL_DB = "cxzx_xm"
+# Table name
+MYSQL_TABLE = "telecom_device_licenses_v4"
+
+MYSQL_BATCH_SIZE = 100
+
+PIPELINES = [
+    'crawlo.pipelines.console_pipeline.ConsolePipeline',
+    # 'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',  # optional: store into MySQL
+]
+
+
+HEADERS = {
+    "Accept": "application/json, text/plain, */*",
+    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+    "Authorization": "null",
+    "Cache-Control": "no-cache",
+    "Connection": "keep-alive",
+    "Content-Type": "application/json;charset=UTF-8",
+    "Origin": "https://ythzxfw.miit.gov.cn",
+    "Pragma": "no-cache",
+    "Referer": "https://ythzxfw.miit.gov.cn/oldyth/resultQuery",
+    "Sec-Fetch-Dest": "empty",
+    "Sec-Fetch-Mode": "cors",
+    "Sec-Fetch-Site": "same-origin",
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36",
+    "sec-ch-ua": '"Not;A=Brand";v="99", "Google Chrome";v="139", "Chromium";v="139"',
+    "sec-ch-ua-mobile": "?0",
+    "sec-ch-ua-platform": '"Windows"'
+}
+
+COOKIES = {
+    "wzws_sessionid": "oGivsIOAMjQwZTozYjM6MzBiMjo3MWMwOjg0NmY6MzQ4OTozNWZjOjEyMTGBOGY2OTQ2gjdjYmMyNQ==",
+    "ariauseGraymode": "false",
+    "Hm_lvt_a73626d298a849004aacc34159f68abd": "1755909741,1756084244,1756256541,1756344453",
+    "Hm_lpvt_a73626d298a849004aacc34159f68abd": "1756344453",
+    "HMACCOUNT": "08DF0D235A291EAA"
+}
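A quick way to sanity-check these HEADERS and COOKIES outside the crawler is to call the MIIT endpoint directly. The sketch below is illustrative only and not part of the package: it assumes httpx is installed, and it borrows the endpoint URL and payload shape from the spiders added later in this diff.

import json
import httpx

from examples.gxb.settings import HEADERS, COOKIES

# Endpoint and payload are taken from the spiders in this release, not from settings.py itself.
API_URL = "https://ythzxfw.miit.gov.cn/oldyth/user-center/tbAppSearch/selectResult"
payload = {"categoryId": "144", "currentPage": 1, "pageSize": 5, "searchContent": ""}

# The wzws_sessionid cookie is likely short-lived, so a stale value may return an anti-crawl page.
resp = httpx.post(API_URL, headers=HEADERS, cookies=COOKIES,
                  content=json.dumps(payload), timeout=10)
print(resp.status_code, resp.json().get("success"))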
examples/gxb/spider/__init__.py
File without changes
examples/gxb/spider/miit_spider.py
ADDED
@@ -0,0 +1,180 @@
+import json
+import logging
+import re
+
+from crawlo import Request, Spider
+
+from examples.gxb.items import RadioApprovalItem, TelecomLicenseItem
+
+logger = logging.getLogger(__name__)
+
+# Base configuration
+BASE_URL = "https://ythzxfw.miit.gov.cn"
+API_URL = BASE_URL + "/oldyth/user-center/tbAppSearch/selectResult"
+
+# Task configuration
+TASKS = {
+    "radio_approval": {
+        "name": "无线电设备型号核准",
+        "category_id": "352",
+        "item_class": RadioApprovalItem,
+        "table": "radio_equipment_approval_new",
+        "field_mapping": {
+            'articleField01': 'approval_number',
+            'articleField02': 'device_name',
+            'articleField03': 'device_model',
+            'articleField04': 'applicant',
+            'articleField05': 'remarks',
+            'articleField06': 'validity_period',
+            'articleField07': 'frequency_tolerance',
+            'articleField08': 'frequency_range',
+            'articleField09': 'transmit_power',
+            'articleField10': 'occupied_bandwidth',
+            'articleField11': 'spurious_emission_limit',
+            'articleField12': 'issue_date',
+            'articleField13': 'approval_code',
+            'articleField14': 'cmiit_id',
+            'articleField15': 'modulation_mode',
+            'articleField16': 'technology_system',
+        }
+    },
+    "telecom_license": {
+        "name": "电信设备进网许可证",
+        "category_id": "144",
+        "item_class": TelecomLicenseItem,
+        "table": "telecom_device_licenses_new",
+        "field_mapping": {
+            'articleField01': 'license_number',
+            'articleField02': 'device_name',
+            'articleField03': 'device_model',
+            'articleField04': 'applicant',
+            'articleField05': 'manufacturer',
+            'articleField06': 'issue_date',
+            'articleField07': 'expiry_date',
+            'articleField08': 'certificate_type',
+            'articleField09': 'remarks',
+            'articleField10': 'certificate_status',
+            'articleField11': 'origin',
+        }
+    }
+}
+
+def strip_html(text: str) -> str:
+    """Strip HTML tags."""
+    if not text or not isinstance(text, str):
+        return text
+    return re.sub(r'<[^>]+>', '', text)
+
+class MiitSpider(Spider):
+    name = "miit_spider"
+    custom_settings = {
+        'DOWNLOAD_DELAY': 0.5,
+        'CONCURRENT_REQUESTS': 5,
+        'CONCURRENT_REQUESTS_PER_DOMAIN': 5,
+        'COOKIES_ENABLED': True,
+        'RETRY_TIMES': 3,
+        'DEFAULT_REQUEST_HEADERS': {
+            "Accept": "application/json, text/plain, */*",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Authorization": "null",
+            "Cache-Control": "no-cache",
+            "Connection": "keep-alive",
+            "Content-Type": "application/json;charset=UTF-8",
+            "Origin": BASE_URL,
+            "Pragma": "no-cache",
+            "Referer": f"{BASE_URL}/oldyth/resultQuery",
+            "Sec-Fetch-Dest": "empty",
+            "Sec-Fetch-Mode": "cors",
+            "Sec-Fetch-Site": "same-origin",
+            "sec-ch-ua": '"Not;A=Brand";v="99", "Google Chrome";v="139", "Chromium";v="139"',
+            "sec-ch-ua-mobile": "?0",
+            "sec-ch-ua-platform": '"Windows"'
+        },
+        'COOKIES_DEBUG': False,
+        'LOG_LEVEL': 'INFO',
+        'ITEM_PIPELINES': {
+            'kyqb_scrapy.pipelines.DedupAndMySQLPipeline': 300,
+        },
+        'DOWNLOADER_MIDDLEWARES': {
+            'kyqb_scrapy.middlewares.RandomUserAgentMiddleware': 400,
+        }
+    }
+
+    def __init__(self, task='telecom_license', start_page=1, end_page=100, *args, **kwargs):
+        super(MiitSpider, self).__init__(*args, **kwargs)
+
+        if task not in TASKS:
+            raise ValueError(f"Unsupported task: {task}")
+
+        self.task_config = TASKS[task]
+        self.category_id = self.task_config["category_id"]
+        self.item_class = self.task_config["item_class"]
+        self.table_name = self.task_config["table"]
+        self.field_mapping = self.task_config["field_mapping"]
+
+        self.start_page = int(start_page)
+        self.end_page = int(end_page)
+        self.page_size = 5
+
+        # Dynamically set the target table name in custom_settings
+        self.custom_settings['MYSQL_TABLE'] = self.table_name
+
+        logger.info(f"🚀 Starting task: {self.task_config['name']}, pages {self.start_page} ~ {self.end_page}")
+
+    def start_requests(self):
+        for page in range(self.start_page, self.end_page + 1):
+            data = {
+                "categoryId": self.category_id,
+                "currentPage": page,
+                "pageSize": self.page_size,
+                "searchContent": ""
+            }
+            yield Request(
+                url=API_URL,
+                method='POST',
+                body=json.dumps(data, separators=(',', ':')),
+                headers={'Content-Type': 'application/json;charset=UTF-8'},
+                callback=self.parse,
+                meta={'page': page},
+                dont_filter=True
+            )
+
+    def parse(self, response):
+        page = response.meta['page']
+
+        # Check the response
+        if response.status_code != 200:
+            self.logger.error(f"❌ Request for page {page} failed: HTTP {response.status_code}")
+            return
+
+        try:
+            result = json.loads(response.text)
+        except json.JSONDecodeError:
+            text = response.text
+            if "升级浏览器" in text or "请尝试" in text:
+                self.logger.error(f"⚠️ Anti-crawling page detected (browser-upgrade notice). Response snippet: {text[:300]}")
+            else:
+                self.logger.error(f"JSON parsing failed: {text[:300]}")
+            return
+
+        if not result.get("success"):
+            msg = result.get("msg", "unknown error")
+            if "升级浏览器" in msg or "请尝试" in msg:
+                self.logger.error(f"⚠️ Anti-crawling notice: {msg}")
+            else:
+                self.logger.error(f"API call failed: {msg}")
+            return
+
+        raw_records = result["params"]["tbAppArticle"]["list"]
+        self.logger.info(f"✅ Page {page} returned {len(raw_records)} records")
+
+        for record in raw_records:
+            item = self.item_class()
+
+            for src_key, dst_key in self.field_mapping.items():
+                value = record.get(src_key, '')
+                if isinstance(value, str):
+                    value = strip_html(value)
+                item[dst_key] = value
+
+            yield item
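The mapping loop at the end of MiitSpider.parse boils down to a small dict transform. The standalone sketch below is not part of the package: it applies the same field_mapping plus strip_html logic to a hypothetical raw record, using plain dicts instead of crawlo items.

import re

def strip_html(text):
    # Same regex-based tag stripping as the helper in miit_spider.py above.
    return re.sub(r'<[^>]+>', '', text) if isinstance(text, str) else text

# Hypothetical record shaped like one entry of result["params"]["tbAppArticle"]["list"].
record = {"articleField01": "<p>A123</p>", "articleField02": "5G Router"}
field_mapping = {"articleField01": "license_number", "articleField02": "device_name"}

item = {dst: strip_html(record.get(src, "")) for src, dst in field_mapping.items()}
print(item)  # {'license_number': 'A123', 'device_name': '5G Router'}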
examples/gxb/spider/telecom_device_licenses.py
ADDED
@@ -0,0 +1,129 @@
+# -*- coding: utf-8 -*-
+import json
+from crawlo import Spider, Request
+from crawlo.utils.log import get_logger
+
+from examples.gxb.items import TelecomLicenseItem
+from examples.gxb.settings import HEADERS, COOKIES
+
+
+logger = get_logger(__name__)
+
+class TelecomDeviceLicensesSpider(Spider):
+    name = 'telecom_device_licenses'
+    allowed_domains = ['ythzxfw.miit.gov.cn']
+    # Base URL of the API
+    base_api_url = 'https://ythzxfw.miit.gov.cn/oldyth/user-center/tbAppSearch/selectResult'
+
+    # Configuration: start and end page numbers
+    start_page = 1
+    end_page = 26405
+    data = {
+        "categoryId": "144",
+        "currentPage": 1,
+        "pageSize": 5,
+        "searchContent": ""
+    }
+
+
+    def start_requests(self):
+        """Start from page 1 and request the pages one by one."""
+
+        yield Request(
+            url=self.base_api_url,
+            method='POST',
+            headers=HEADERS,
+            cookies=COOKIES,
+            body=json.dumps(self.data),
+            callback=self.parse,
+            meta={'page': 1},
+            dont_filter=True
+        )
+
+
+    def parse(self, response):
+        """
+        Parse the API response.
+        :param response: crawlo Response object
+        """
+        page = response.meta['page']
+        self.logger.info(f"Parsing page {page}, status code: {response.status_code}")
+
+        try:
+            json_data = response.json()
+
+            if not json_data.get('success'):
+                self.logger.error(f"Request for page {page} failed: {json_data.get('msg', 'Unknown error')}")
+                return
+
+            # Extract the total record count (optional, useful for verification)
+            total_records = json_data.get("params", {}).get("tbAppArticle", {}).get("total", 0)
+            self.logger.info(f"Page {page}, total records: {total_records}")
+
+            article_list = json_data.get("params", {}).get("tbAppArticle", {}).get("list", [])
+
+            if not article_list:
+                self.logger.warning(f"No data found on page {page}")
+                return
+
+            self.logger.info(f"Parsed {len(article_list)} records from page {page}")
+
+            # Yield each record as a separate item
+            for item in article_list:
+                # Clean the data: strip HTML tags
+                cleaned_item = self.clean_item(item)
+                item = TelecomLicenseItem()
+                item['license_number'] = cleaned_item.get('articleField01')
+                item['device_name'] = cleaned_item.get('articleField02')
+                item['device_model'] = cleaned_item.get('articleField03')
+                item['applicant'] = cleaned_item.get('articleField04')
+                item['manufacturer'] = cleaned_item.get('articleField05')
+                item['issue_date'] = cleaned_item.get('articleField06')
+                item['expiry_date'] = cleaned_item.get('articleField07')
+                item['certificate_type'] = cleaned_item.get('articleField08')
+                item['remarks'] = cleaned_item.get('articleField09')
+                item['certificate_status'] = cleaned_item.get('articleField10')
+                item['origin'] = cleaned_item.get('articleField11')
+                item['article_id'] = cleaned_item.get('articleId')
+                item['article_edit_date'] = cleaned_item.get('articleEdate')
+                item['create_time'] = cleaned_item.get('createTime')
+                yield item
+
+            # --- Automatic pagination ---
+            # Check whether there is a next page
+            # Method 1: compare the current page against the configured last page
+            if page < self.end_page:
+                next_page = page + 1
+                self.data['currentPage'] = next_page
+                self.logger.debug(f"Queueing next page: {next_page}")
+                yield Request(
+                    url=self.base_api_url,
+                    method='POST',
+                    headers=HEADERS,
+                    cookies=COOKIES,
+                    body=json.dumps(self.data),
+                    callback=self.parse,
+                    meta={'page': next_page},
+                    dont_filter=True
+                )
+
+        except Exception as e:
+            self.logger.error(f"Failed to parse the response for page {page}: {e}", exc_info=True)
+
+    @staticmethod
+    def clean_item(item: dict) -> dict:
+        """
+        Clean a single record, e.g. strip HTML tags.
+        :param item: raw dict
+        :return: cleaned dict
+        """
+        import re
+        html_tag_re = re.compile(r'<[^>]+>')
+        cleaned = {}
+        for k, v in item.items():
+            if isinstance(v, str):
+                # Strip HTML tags and trim surrounding whitespace
+                cleaned[k] = html_tag_re.sub('', v).strip()
+            else:
+                cleaned[k] = v
+        return cleaned
tests/__init__.py
CHANGED
@@ -1,7 +1,7 @@
-#!/usr/bin/python
-# -*- coding:UTF-8 -*-
-"""
-# @Time : 2025-
-# @Author :
-# @Desc : None
-"""
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+# @Time : 2025-08-24 12:36
+# @Author : crawl-coder
+# @Desc : None
+"""
tests/test_proxy_health_check.py
ADDED
@@ -0,0 +1,33 @@
+# tests/test_proxy_health_check.py
+import pytest
+from unittest.mock import AsyncMock, patch
+from crawlo.proxy.health_check import check_single_proxy
+import httpx
+
+
+@pytest.mark.asyncio
+@patch('httpx.AsyncClient')
+async def test_health_check_success(mock_client_class):
+    """Health check: success."""
+    mock_resp = AsyncMock()
+    mock_resp.status_code = 200
+    mock_client_class.return_value.__aenter__.return_value.get.return_value = mock_resp
+
+    proxy_info = {'url': 'http://good:8080', 'healthy': False}
+    await check_single_proxy(proxy_info)
+
+    assert proxy_info['healthy'] is True
+    assert proxy_info['failures'] == 0
+
+
+@pytest.mark.asyncio
+@patch('httpx.AsyncClient')
+async def test_health_check_failure(mock_client_class):
+    """Health check: failure."""
+    mock_client_class.return_value.__aenter__.return_value.get.side_effect = httpx.ConnectError("Failed")
+
+    proxy_info = {'url': 'http://bad:8080', 'healthy': True, 'failures': 0}
+    await check_single_proxy(proxy_info)
+
+    assert proxy_info['healthy'] is False
+    assert proxy_info['failures'] == 1
tests/test_proxy_middleware_integration.py
ADDED
@@ -0,0 +1,137 @@
+# tests/test_proxy_middleware_integration.py
+import pytest
+import asyncio
+import time
+from unittest.mock import Mock, AsyncMock, patch
+from crawlo import Request, Response, Spider
+from crawlo.proxy.middleware import ProxyMiddleware
+from crawlo.proxy.stats import ProxyStats
+
+
+@pytest.fixture
+def crawler():
+    class MockSettings:
+        def get(self, key, default=None):
+            defaults = {
+                'PROXY_ENABLED': True,
+                'PROXIES': ['http://p1:8080', 'http://p2:8080'],
+                'PROXY_SELECTION_STRATEGY': 'random',
+                'PROXY_REQUEST_DELAY_ENABLED': False,
+                'PROXY_MAX_RETRY_COUNT': 1,
+            }
+            return defaults.get(key, default)
+
+        def get_bool(self, key, default=None):
+            return self.get(key, default)
+
+        def get_int(self, key, default=None):
+            return self.get(key, default)
+
+        def get_float(self, key, default=None):
+            return self.get(key, default)
+
+        def get_list(self, key, default=None):
+            return self.get(key, default)
+
+    class MockCrawler:
+        def __init__(self):
+            self.settings = MockSettings()
+
+    return MockCrawler()
+
+
+@pytest.fixture
+def middleware(crawler):
+    mw = ProxyMiddleware.create_instance(crawler)
+    mw._load_providers = Mock()
+    mw._update_proxy_pool = AsyncMock()
+    mw._health_check = AsyncMock()
+    mw.scheduler = None
+
+    mw.proxies = [
+        {
+            'url': 'http://p1:8080',
+            'healthy': True,
+            'failures': 0,
+            'last_health_check': 0,
+            'unhealthy_since': 0
+        },
+        {
+            'url': 'http://p2:8080',
+            'healthy': True,
+            'failures': 0,
+            'last_health_check': 0,
+            'unhealthy_since': 0
+        },
+    ]
+    mw.stats = ProxyStats()
+    for p in mw.proxies:
+        mw.stats.record(p['url'], 'total')
+
+    asyncio.get_event_loop().run_until_complete(mw._initial_setup())
+    return mw
+
+
+@pytest.fixture
+def spider():
+    return Mock(spec=Spider, logger=Mock())
+
+
+def test_process_request_sets_proxy(middleware, spider):
+    request = Request("https://example.com")
+    result = asyncio.get_event_loop().run_until_complete(
+        middleware.process_request(request, spider)
+    )
+    assert result is None
+    assert hasattr(request, 'proxy')
+    assert request.proxy in ['http://p1:8080', 'http://p2:8080']
+
+
+def test_process_response_records_success(middleware, spider):
+    request = Request("https://example.com")
+    request.proxy = 'http://p1:8080'
+    response = Response("https://example.com", body=b"ok", headers={})
+    middleware.stats.record(request.proxy, 'total')
+    middleware.process_response(request, response, spider)
+    assert middleware.stats.get(request.proxy)['success'] == 1
+
+
+def test_process_exception_switches_proxy(middleware, spider):
+    request = Request("https://example.com")
+    request.proxy = 'http://p1:8080'
+    request.meta['proxy_retry_count'] = 0
+
+    result = middleware.process_exception(request, Exception("Timeout"), spider)
+    assert result is not None
+    assert result.proxy != 'http://p1:8080'
+    assert result.meta['proxy_retry_count'] == 1
+
+    final = middleware.process_exception(result, Exception("Timeout"), spider)
+    assert final is None
+
+
+def test_mark_failure_disables_proxy(middleware):
+    proxy_url = 'http://p1:8080'
+    p = next(p for p in middleware.proxies if p['url'] == proxy_url)
+    p['failures'] = 2
+
+    middleware._mark_failure(proxy_url)
+    assert p['failures'] == 3
+    assert p['healthy'] is False
+    assert p['unhealthy_since'] > 0
+
+
+@pytest.mark.asyncio
+async def test_request_delay(middleware, spider):
+    """Request-delay feature: verify that asyncio.sleep is awaited."""
+    with patch("crawlo.proxy.middleware.asyncio.sleep", new_callable=AsyncMock) as mock_sleep:
+        middleware.delay_enabled = True  # Note: the attribute is delay_enabled, not request_delay_enabled
+        middleware.request_delay = 0.1
+        middleware._last_req_time = time.time() - 0.05  # 50 ms ago
+
+        request = Request("https://a.com")
+        await middleware.process_request(request, spider)
+
+        mock_sleep.assert_called_once()
+        delay = mock_sleep.call_args[0][0]
+        assert 0.04 <= delay <= 0.06
tests/test_proxy_providers.py
ADDED
@@ -0,0 +1,57 @@
+# tests/test_proxy_providers.py
+import pytest
+import pytest
+import respx
+from httpx import Response
+from crawlo.proxy.providers import StaticProxyProvider, FileProxyProvider, APIProxyProvider
+import tempfile
+import os
+
+
+@pytest.mark.asyncio
+async def test_static_provider():
+    """Static proxy provider."""
+    provider = StaticProxyProvider(['http://1.1.1.1:8080', 'http://2.2.2.2:8080'])
+    proxies = await provider.fetch_proxies()
+    assert len(proxies) == 2
+    assert 'http://1.1.1.1:8080' in proxies
+    assert 'http://2.2.2.2:8080' in proxies
+
+
+@pytest.mark.asyncio
+async def test_file_provider():
+    """File-based proxy provider."""
+    with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
+        f.write("http://a.com:8080\nhttp://b.com:8080\n")
+        temp_path = f.name
+    try:
+        provider = FileProxyProvider(temp_path)
+        proxies = await provider.fetch_proxies()
+        assert len(proxies) == 2
+        assert 'http://a.com:8080' in proxies
+        assert 'http://b.com:8080' in proxies
+    finally:
+        os.unlink(temp_path)
+
+
+@pytest.mark.asyncio
+@respx.mock
+async def test_api_provider():
+    """Intercept the HTTP request with respx; simpler and more reliable."""
+    # Intercept the GET request
+    respx.get("https://api.example.com").mock(
+        return_value=Response(
+            200,
+            json=[
+                {"ip": "1.1.1.1", "port": 8080},
+                {"ip": "2.2.2.2", "port": 8080}
+            ]
+        )
+    )
+
+    provider = APIProxyProvider(url="https://api.example.com")
+    proxies = await provider.fetch_proxies()
+
+    assert len(proxies) == 2
+    assert "http://1.1.1.1:8080" in proxies
+    assert "http://2.2.2.2:8080" in proxies
tests/test_proxy_stats.py
ADDED
@@ -0,0 +1,20 @@
+# tests/test_proxy_stats.py
+from crawlo.proxy.stats import ProxyStats
+
+
+def test_proxy_stats():
+    """Proxy statistics bookkeeping."""
+    stats = ProxyStats()
+    url = 'http://proxy1:8080'
+
+    stats.record(url, 'success')
+    stats.record(url, 'success')
+    stats.record(url, 'failure')
+
+    assert stats.get(url)['success'] == 2
+    assert stats.get(url)['failure'] == 1
+    assert stats.get(url)['total'] == 3
+
+    all_data = stats.all()
+    assert url in all_data
+    assert all_data[url]['success'] == 2