crawlo 1.1.0__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of crawlo might be problematic.
- crawlo/__init__.py +33 -24
- crawlo/__version__.py +1 -1
- crawlo/cli.py +40 -40
- crawlo/commands/__init__.py +13 -13
- crawlo/commands/check.py +594 -155
- crawlo/commands/genspider.py +125 -110
- crawlo/commands/list.py +147 -119
- crawlo/commands/run.py +285 -170
- crawlo/commands/startproject.py +111 -101
- crawlo/commands/stats.py +188 -167
- crawlo/core/__init__.py +2 -2
- crawlo/core/engine.py +158 -158
- crawlo/core/processor.py +40 -40
- crawlo/core/scheduler.py +57 -57
- crawlo/crawler.py +494 -492
- crawlo/downloader/__init__.py +78 -78
- crawlo/downloader/aiohttp_downloader.py +199 -199
- crawlo/downloader/cffi_downloader.py +242 -277
- crawlo/downloader/httpx_downloader.py +246 -246
- crawlo/event.py +11 -11
- crawlo/exceptions.py +78 -78
- crawlo/extension/__init__.py +31 -31
- crawlo/extension/log_interval.py +49 -49
- crawlo/extension/log_stats.py +44 -44
- crawlo/extension/logging_extension.py +34 -34
- crawlo/filters/__init__.py +37 -37
- crawlo/filters/aioredis_filter.py +150 -150
- crawlo/filters/memory_filter.py +202 -202
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +21 -21
- crawlo/items/fields.py +53 -53
- crawlo/items/items.py +104 -104
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +32 -32
- crawlo/middleware/download_delay.py +28 -28
- crawlo/middleware/middleware_manager.py +135 -135
- crawlo/middleware/proxy.py +245 -245
- crawlo/middleware/request_ignore.py +30 -30
- crawlo/middleware/response_code.py +18 -18
- crawlo/middleware/response_filter.py +26 -26
- crawlo/middleware/retry.py +90 -90
- crawlo/network/__init__.py +7 -7
- crawlo/network/request.py +203 -203
- crawlo/network/response.py +166 -166
- crawlo/pipelines/__init__.py +13 -13
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/mongo_pipeline.py +116 -116
- crawlo/pipelines/mysql_batch_pipline.py +272 -272
- crawlo/pipelines/mysql_pipeline.py +195 -195
- crawlo/pipelines/pipeline_manager.py +56 -56
- crawlo/project.py +153 -0
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +166 -168
- crawlo/settings/setting_manager.py +99 -99
- crawlo/spider/__init__.py +129 -129
- crawlo/stats_collector.py +59 -59
- crawlo/subscriber.py +106 -106
- crawlo/task_manager.py +27 -27
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +3 -3
- crawlo/templates/project/items.py.tmpl +17 -17
- crawlo/templates/project/middlewares.py.tmpl +75 -75
- crawlo/templates/project/pipelines.py.tmpl +63 -63
- crawlo/templates/project/settings.py.tmpl +54 -54
- crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
- crawlo/templates/spider/spider.py.tmpl +31 -31
- crawlo/utils/__init__.py +7 -7
- crawlo/utils/date_tools.py +233 -233
- crawlo/utils/db_helper.py +343 -343
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/log.py +128 -128
- crawlo/utils/pqueue.py +173 -173
- crawlo/utils/request.py +267 -267
- crawlo/utils/spider_loader.py +62 -62
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +4 -4
- crawlo/utils/url.py +39 -39
- crawlo-1.1.1.dist-info/METADATA +220 -0
- crawlo-1.1.1.dist-info/RECORD +100 -0
- examples/__init__.py +7 -0
- examples/baidu_spider/__init__.py +7 -0
- examples/baidu_spider/demo.py +94 -0
- examples/baidu_spider/items.py +46 -0
- examples/baidu_spider/middleware.py +49 -0
- examples/baidu_spider/pipeline.py +55 -0
- examples/baidu_spider/run.py +27 -0
- examples/baidu_spider/settings.py +121 -0
- examples/baidu_spider/spiders/__init__.py +7 -0
- examples/baidu_spider/spiders/bai_du.py +61 -0
- examples/baidu_spider/spiders/miit.py +159 -0
- examples/baidu_spider/spiders/sina.py +79 -0
- tests/__init__.py +7 -7
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware_integration.py +136 -136
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- crawlo/utils/concurrency_manager.py +0 -125
- crawlo/utils/project.py +0 -197
- crawlo-1.1.0.dist-info/METADATA +0 -49
- crawlo-1.1.0.dist-info/RECORD +0 -97
- examples/gxb/__init__.py +0 -0
- examples/gxb/items.py +0 -36
- examples/gxb/run.py +0 -16
- examples/gxb/settings.py +0 -72
- examples/gxb/spider/__init__.py +0 -2
- examples/gxb/spider/miit_spider.py +0 -180
- examples/gxb/spider/telecom_device.py +0 -129
- {crawlo-1.1.0.dist-info → crawlo-1.1.1.dist-info}/WHEEL +0 -0
- {crawlo-1.1.0.dist-info → crawlo-1.1.1.dist-info}/entry_points.txt +0 -0
- {crawlo-1.1.0.dist-info → crawlo-1.1.1.dist-info}/top_level.txt +0 -0
examples/baidu_spider/middleware.py
ADDED
@@ -0,0 +1,49 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+# @Time : 2025-05-17 11:21
+# @Author : crawl-coder
+# @Desc : None
+"""
+import asyncio
+import random
+
+from crawlo.exceptions import IgnoreRequestError
+from crawlo.middleware import BaseMiddleware
+
+
+class TestMiddleWare(BaseMiddleware):
+
+    async def process_request(self, request, spider):
+        # Request pre-processing
+        # print('process_request', request, spider)
+        # if random.randint(1, 5) == 1:
+        #     raise IgnoreRequestError('invalid url')
+        pass
+
+    def process_response(self, request, response, spider):
+        # Response pre-processing
+        # print('process_response', response, response, spider)
+        return response
+
+    def process_exception(self, request, exception, spider):
+        # Exception pre-processing
+        # print('process_exception', request, exception, spider)
+        pass
+
+
+class TestMiddleWare2(BaseMiddleware):
+    def process_request(self, request, spider):
+        # Request pre-processing
+        # print('process_request2', request, spider)
+        pass
+
+    def process_response(self, request, response, spider):
+        # Response pre-processing
+        # print('process_response2', response, response, spider)
+        return response
+
+    def process_exception(self, request, exception, spider):
+        # Exception pre-processing
+        # print('process_exception2', request, exception, spider)
+        pass
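For context, a minimal sketch of how the IgnoreRequestError pattern commented out above could be used in a real middleware, and how it would be enabled. The class, module path, and URL check are illustrative assumptions; the MIDDLEWARES list mirrors the block shipped commented out in examples/baidu_spider/settings.py (shown later in this diff), and the sketch assumes Request exposes a .url attribute, as the bundled spiders do.

from crawlo.exceptions import IgnoreRequestError
from crawlo.middleware import BaseMiddleware


class DropNonHttpsMiddleware(BaseMiddleware):  # hypothetical example class
    async def process_request(self, request, spider):
        # Raising IgnoreRequestError asks the framework to skip this request.
        if not request.url.startswith('https://'):
            raise IgnoreRequestError('non-HTTPS url')

    def process_response(self, request, response, spider):
        # Returning the response passes it through unchanged.
        return response


# settings.py entry, same shape as the commented-out MIDDLEWARES block in this release:
MIDDLEWARES = [
    'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',
    'baidu_spider.middleware.DropNonHttpsMiddleware',  # hypothetical module path
]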
examples/baidu_spider/pipeline.py
ADDED
@@ -0,0 +1,55 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+import pymongo
+
+from motor.motor_asyncio import AsyncIOMotorClient
+
+from random import randint
+
+from crawlo.event import spider_closed
+from crawlo.exceptions import ItemDiscard
+from crawlo.utils.log import get_logger
+
+
+class TestPipeline(object):
+
+    async def process_item(self, item, spider):
+        if randint(1, 3) == 1:
+            raise ItemDiscard('duplicate data')
+        return item
+
+    @classmethod
+    def create_instance(cls, *args, **kwargs):
+        return cls()
+
+
+class MongoPipeline(object):
+
+    def __init__(self, conn, col):
+        self.conn = conn
+        self.col = col
+
+        self.logger = get_logger(self.__class__.__name__)
+
+    @classmethod
+    def create_instance(cls, crawler):
+        settings = crawler.settings
+        mongo_params = settings.get('MONGODB_PARAMS', None)
+        db_name = settings.get('MONGODB_DB', None)
+        project_name = settings.get('PROJECT_NAME', None)
+
+        conn = AsyncIOMotorClient(**mongo_params) if mongo_params else AsyncIOMotorClient()
+
+        col = conn[db_name][project_name]
+        o = cls(conn, col)
+        crawler.subscriber.subscribe(o.spider_closed, event=spider_closed)
+        return o
+
+    async def process_item(self, item, spider):
+        await self.col.insert_one(item.to_dict())
+        return item
+
+    async def spider_closed(self):
+        self.logger.info('MongoDB closed.')
+        self.conn.close()
+
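MongoPipeline.create_instance reads its connection details from the crawler settings (MONGODB_PARAMS, MONGODB_DB, PROJECT_NAME) and subscribes spider_closed on the crawler's subscriber, so enabling it is mostly a settings exercise. A sketch of the keys it expects, with placeholder values; note that the example settings.py in this release defines Mongo_Params = '' rather than MONGODB_PARAMS, so as shipped the pipeline would fall back to a default AsyncIOMotorClient().

# settings.py sketch for enabling MongoPipeline (values are placeholders)
PROJECT_NAME = 'baidu_spider'      # used as the collection name
MONGODB_DB = 'news'                # database name
MONGODB_PARAMS = {                 # expanded as **kwargs into AsyncIOMotorClient
    'host': 'localhost',
    'port': 27017,
}

PIPELINES = [
    'baidu_spider.pipeline.MongoPipeline',  # same entry as the commented-out line in the example settings
]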
examples/baidu_spider/run.py
ADDED
@@ -0,0 +1,27 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+# @Time : 2025-02-05 13:12
+# @Author : oscar
+# @Desc : None
+"""
+import asyncio
+from crawlo.crawler import CrawlerProcess
+
+from examples.baidu_spider.spiders.miit import MiitDeviceSpider
+from examples.baidu_spider.spiders.sina import SinaSpider
+
+
+async def main():
+    process = CrawlerProcess()
+    # await process.crawl(
+    #     [
+    #         # SinaSpider,
+    #         MiitDeviceSpider
+    #     ]
+    # )
+    await process.crawl('miit_device')
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
examples/baidu_spider/settings.py
ADDED
@@ -0,0 +1,121 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+
+PROJECT_NAME = 'baidu_spider'
+
+CONCURRENCY = 30
+
+MAX_RUNNING_SPIDERS = 8
+
+USE_SESSION = True
+
+# Download delay
+DOWNLOAD_DELAY = 0.1
+RANDOMNESS = False
+
+# --------------------------------------------------- Shared MySQL configuration -----------------------------------------------------
+MYSQL_HOST = '43.139.14.225'
+MYSQL_PORT = 3306
+MYSQL_USER = 'picker'
+MYSQL_PASSWORD = 'kmcNbbz6TbSihttZ'
+MYSQL_DB = 'stock_pro'
+MYSQL_TABLE = 'articles'  # Optional; defaults to the spider name
+MYSQL_BATCH_SIZE = 500
+
+# asyncmy-specific settings
+MYSQL_POOL_MIN = 5  # minimum pool connections
+MYSQL_POOL_MAX = 20  # maximum pool connections
+
+# Downloader selection
+DOWNLOADER = "crawlo.downloader.httpx_downloader.HttpXDownloader"
+# DOWNLOADER = "crawlo.downloader.cffi_downloader.CurlCffiDownloader"
+
+# MIDDLEWARES = [
+#     'crawlo.middleware.download_delay.DownloadDelayMiddleware',
+#     'crawlo.middleware.default_header.DefaultHeaderMiddleware',
+#     'crawlo.middleware.response_filter.ResponseFilterMiddleware',
+#     'crawlo.middleware.retry.RetryMiddleware',
+#     'crawlo.middleware.response_code.ResponseCodeMiddleware',
+#     'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',
+# ]
+
+EXTENSIONS = [
+    'crawlo.extension.log_interval.LogIntervalExtension',
+    'crawlo.extension.log_stats.LogStats',
+]
+
+PIPELINES = [
+    'crawlo.pipelines.console_pipeline.ConsolePipeline',
+    # 'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',  # or AiomysqlMySQLPipeline
+    # 'crawlo.pipelines.mysql_batch_pipline.AsyncmyMySQLPipeline',  # or AiomysqlMySQLPipeline
+    # 'baidu_spider.pipeline.TestPipeline',
+    # 'baidu_spider.pipeline.MongoPipeline',
+]
+
+USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36'
+DEFAULT_HEADERS = {
+    "accept": "application/json, text/javascript, */*; q=0.01",
+    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
+    "cache-control": "no-cache",
+    "pragma": "no-cache",
+    "priority": "u=1, i",
+    "sec-ch-ua": "\"Chromium\";v=\"136\", \"Google Chrome\";v=\"136\", \"Not.A/Brand\";v=\"99\"",
+    "sec-ch-ua-mobile": "?0",
+    "sec-ch-ua-platform": "\"macOS\"",
+    "sec-fetch-dest": "empty",
+    "sec-fetch-mode": "cors",
+    "sec-fetch-site": "same-origin",
+    # "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
+    "x-requested-with": "XMLHttpRequest"
+}
+
+# --------------------------------------DB ---------------------------------------------
+Mongo_Params = ''
+MONGODB_DB = 'news'
+
+REDIS_TTL = 0
+CLEANUP_FP = False
+
+LOG_LEVEL = 'DEBUG'
+
+FILTER_CLASS = 'crawlo.filters.aioredis_filter.AioRedisFilter'
+# FILTER_CLASS = 'crawlo.filters.redis_filter.RedisFilter'
+# FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFileFilter'
+
+# PROXY_POOL_API = 'http://123.56.42.142:5000/proxy/getitem/'
+#
+# PROXY_FETCH_FUNC = "crawlo.utils.proxy.get_proxies"
+
+
+# PROXY_ENABLED = True
+#
+# # Use an API proxy provider
+# PROXY_PROVIDERS = [
+#     {
+#         'class': 'crawlo.proxy.providers.APIProxyProvider',
+#         'config': {
+#             'url': 'http://123.56.42.142:5000/proxy/getitem/',
+#             'method': 'GET',
+#             'timeout': 10.0
+#         }
+#     }
+# ]
+#
+# # Proxy selection strategy: use the least-used proxy (avoid overloading a single IP)
+# PROXY_SELECTION_STRATEGY = 'least_used'
+#
+# # Request delay of 0.5-1.5 s to avoid requesting too fast
+# PROXY_REQUEST_DELAY_ENABLED = True
+# PROXY_REQUEST_DELAY = 1.0
+#
+# # Health checks
+# PROXY_HEALTH_CHECK_ENABLED = True
+# PROXY_HEALTH_CHECK_INTERVAL = 10  # check every 5 minutes
+#
+# # Proxy pool refresh
+# PROXY_POOL_UPDATE_INTERVAL = 5  # pull new proxies from the API every 5 minutes
+#
+# # Failure handling
+# PROXY_MAX_FAILURES = 3  # disable a proxy after 3 failures
+# PROXY_COOLDOWN_PERIOD = 600  # recover after a 10-minute cooldown
+# PROXY_MAX_RETRY_COUNT = 2  # at most 2 retries per request
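Components read these values through crawler.settings.get(...), the same accessor MongoPipeline uses in examples/baidu_spider/pipeline.py. A minimal sketch of how the MySQL block above would be consumed; only the accessor pattern is taken from the package, while the class itself and its default values are hypothetical.

from crawlo.utils.log import get_logger


class MySQLConfigEchoPipeline:  # hypothetical class for illustration
    def __init__(self, host, port, table, batch_size):
        self.host, self.port = host, port
        self.table, self.batch_size = table, batch_size
        self.logger = get_logger(self.__class__.__name__)

    @classmethod
    def create_instance(cls, crawler):
        # Same settings accessor pattern as MongoPipeline.create_instance.
        s = crawler.settings
        return cls(
            host=s.get('MYSQL_HOST', 'localhost'),
            port=s.get('MYSQL_PORT', 3306),
            table=s.get('MYSQL_TABLE', None),        # per the comment above, falls back to the spider name
            batch_size=s.get('MYSQL_BATCH_SIZE', 100),
        )

    async def process_item(self, item, spider):
        # Only logs where a write would go; no actual MySQL access in this sketch.
        self.logger.debug(f'would write to {self.table or spider.name} in batches of {self.batch_size}')
        return item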
examples/baidu_spider/spiders/bai_du.py
ADDED
@@ -0,0 +1,61 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+# @Time : 2025-02-05 13:05
+# @Author : oscar
+# @Desc : None
+"""
+import asyncio
+from crawlo import Request
+from crawlo.spider import Spider
+
+from items import BauDuItem
+
+
+class BaiDuSpider(Spider):
+    start_urls = ["https://www.baidu.com/", "https://www.baidu.com/"]
+
+    custom_settings = {
+        'CONCURRENCY': 1
+    }
+
+    name = "bai_du"
+
+    # headers = {
+    #     "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"
+    # }
+    #
+    user_gent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"
+
+    async def parse(self, response):
+        for i in range(5):
+            url = f"https://www.baidu.com"
+            # url = f"https://www.httpbin.org/404"
+            r = Request(url=url, callback=self.parse_page, dont_filter=True)
+            yield r
+
+    async def parse_page(self, response):
+        for i in range(5):
+            url = f"https://www.baidu.com"
+            meta = {'test': 'hhhh'}
+            r = Request(url=url, callback=self.parse_detail, meta=meta, dont_filter=False)
+            yield r
+
+    def parse_detail(self, response):
+        item = BauDuItem()
+        item['title'] = response.xpath('//title/text()').get()
+
+        item['url'] = response.url
+
+        yield item
+
+    async def spider_opened(self):
+        pass
+
+    async def spider_closed(self):
+        pass
+
+
+if __name__ == '__main__':
+    b = BaiDuSpider()
+    b.start_requests()
examples/baidu_spider/spiders/miit.py
ADDED
@@ -0,0 +1,159 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+# @Time : 2025-08-22 14:00
+# @Author : oscar
+# @Desc : Crawl MIIT radio equipment approval records (supports all 34652 pages)
+"""
+
+import json
+import asyncio
+import random
+
+from crawlo import Request
+from crawlo.spider import Spider
+from crawlo.utils.log import get_logger
+from crawlo.utils.date_tools import to_datetime
+
+# Import the predefined Item
+from examples.baidu_spider.items import MiitDeviceItem
+
+
+logger = get_logger(__name__)
+
+
+class MiitDeviceSpider(Spider):
+    name = 'miit_device'
+    allowed_domains = ['ythzxfw.miit.gov.cn']
+
+    # Field mapping table
+    FIELD_MAPPING = {
+        "articleField01": ("核准证编号", "approval_certificate_no"),
+        "articleField02": ("设备名称", "device_name"),
+        "articleField03": ("设备型号", "model_number"),
+        "articleField04": ("申请单位", "applicant"),
+        "articleField05": ("备注", "remarks"),
+        "articleField06": ("有效期", "validity_period"),
+        "articleField07": ("频率容限", "frequency_tolerance"),
+        "articleField08": ("频率范围", "frequency_range"),
+        "articleField09": ("发射功率", "transmission_power"),
+        "articleField10": ("占用带宽", "occupied_bandwidth"),
+        "articleField11": ("杂散发射限制", "spurious_emission_limit"),
+        "articleField12": ("发证日期", "issue_date"),
+        "articleField13": ("核准代码", "approval_code"),
+        "articleField14": ("CMIIT ID", "cmiit_id"),
+        "articleField15": ("调制方式", "modulation_scheme"),
+        "articleField16": ("技术体制/功能模块", "technology_module"),
+        "createTime": ("createTime", "create_time"),
+        "articleId": ("articleId", "article_id")
+    }
+
+    headers = {
+        "Accept": "application/json, text/plain, */*",
+        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+        "Authorization": "null",
+        "Cache-Control": "no-cache",
+        "Connection": "keep-alive",
+        "Content-Type": "application/json;charset=UTF-8",
+        "Origin": "https://ythzxfw.miit.gov.cn",
+        "Pragma": "no-cache",
+        "Referer": "https://ythzxfw.miit.gov.cn/oldyth/resultQuery",
+        "Sec-Fetch-Dest": "empty",
+        "Sec-Fetch-Mode": "cors",
+        "Sec-Fetch-Site": "same-origin",
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36",
+        "sec-ch-ua": '"Not;A=Brand";v="99", "Google Chrome";v="139", "Chromium";v="139"',
+        "sec-ch-ua-mobile": "?0",
+        "sec-ch-ua-platform": '"macOS"'
+    }
+
+    cookies = {
+        "wzws_sessionid": "gjdjYmMyNYFkZjRiZjCgaKkOx4AyNDBlOjQ3ZTozMmUwOmQ5MmI6ZjFjZTphNWJiOjk5ZmU6OTU4OQ==",
+        "ariauseGraymode": "false",
+        "Hm_lvt_a73626d298a849004aacc34159f68abd": "1755909833",
+        "Hm_lpvt_a73626d298a849004aacc34159f68abd": "1755909833",
+        "HMACCOUNT": "6C5E4C6C47DC62FF"
+    }
+
+    # Pagination settings
+    start_page = 1  # first page
+    end_page = 34652  # total number of pages
+    current_page = 1
+    page_size = 5  # records per page
+
+    # Delay between requests (seconds) to avoid getting blocked
+    min_delay = 1.5
+    max_delay = 3.0
+
+    def start_requests(self):
+        # Start from the first page
+        yield self.make_request(self.start_page)
+
+    def make_request(self, page):
+        """Build the request for one page."""
+        data = {
+            "categoryId": "352",
+            "currentPage": page,
+            "pageSize": self.page_size,
+            "searchContent": ""
+        }
+        return Request(
+            method='POST',
+            url='https://ythzxfw.miit.gov.cn/oldyth/user-center/tbAppSearch/selectResult',
+            headers=self.headers,
+            cookies=self.cookies,
+            body=json.dumps(data, separators=(',', ':'), ensure_ascii=False),
+            callback=self.parse,
+            dont_filter=True,
+            meta={'page': page}  # record the current page for logging and debugging
+        )
+
+    async def parse(self, response):
+        page = response.meta.get('page', 'unknown')
+        try:
+            json_data = response.json()
+            success = json_data.get("success")
+            code = json_data.get("code")
+
+            if not success or code != 200:
+                logger.error(f"Page {page} request failed: code={code}, msg={json_data.get('msg')}")
+                return
+
+            tb_app_article = json_data.get('params', {}).get('tbAppArticle', {})
+            records = tb_app_article.get('list', [])
+            total_count = tb_app_article.get('total', 0)  # total record count, e.g. 173256
+
+            logger.info(f"✅ Page {page} parsed: {len(records)} records on this page, {total_count} in total")
+
+            for raw_item in records:
+                item = MiitDeviceItem()
+                for field_key, (chinese_name, english_field) in self.FIELD_MAPPING.items():
+                    value = raw_item.get(field_key)
+                    if english_field == 'issue_date' and value:
+                        value = to_datetime(value.split()[0])
+                    item[english_field] = value
+                yield item
+
+            # ✅ Core fix: compute the real page count from total_count and page_size.
+            # Note: round up, e.g. 173256 / 5 = 34651.2, so there should be 34652 pages.
+            import math
+            calculated_total_pages = math.ceil(total_count / self.page_size)
+
+            # Use calculated_total_pages to decide whether to keep paging
+            next_page = page + 1
+            if next_page <= calculated_total_pages:
+                delay = random.uniform(self.min_delay, self.max_delay)
+                logger.debug(f"Waiting {delay:.2f}s before requesting page {next_page}...")
+                await asyncio.sleep(delay)
+                yield self.make_request(next_page)
+            else:
+                logger.info(f"🎉 Crawl finished! Reached the last page {calculated_total_pages}")
+
+        except Exception as e:
+            logger.error(f"❌ Failed to parse page {page}: {e}, response: {response.text[:500]}...")
+
+    async def spider_opened(self):
+        logger.info(f"MiitDeviceSpider started; crawling pages {self.start_page} to {self.end_page}...")
+
+    async def spider_closed(self):
+        logger.info("MiitDeviceSpider finished.")
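The paging fix above depends on rounding the page count up rather than down; a quick check of the arithmetic quoted in the spider's comments (173256 records at 5 per page), plus an equivalent integer-only form.

import math

total_count, page_size = 173256, 5                  # figures quoted in the comments above
assert math.ceil(total_count / page_size) == 34652  # 34651.2 rounds up to 34652
assert -(-total_count // page_size) == 34652        # same result without float division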
examples/baidu_spider/spiders/sina.py
ADDED
@@ -0,0 +1,79 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+# @Time : 2025-02-05 13:05
+# @Author : oscar
+# @Desc : None
+"""
+import time
+
+from crawlo import Request
+from crawlo.spider import Spider
+from crawlo.utils.date_tools import to_datetime
+
+from examples.baidu_spider.items import ArticleItem
+
+
+class SinaSpider(Spider):
+    # Take the current timestamp and subtract 10 minutes (600 seconds)
+    current_time_minus_10min = int(time.time()) - 6000
+    # Build the URL
+    url = f'https://news.10jqka.com.cn/tapp/news/push/stock/?page=1&tag=&track=website&ctime={current_time_minus_10min}'
+
+    start_urls = [url]
+    name = 'sina'
+    # mysql_table = 'news_10jqka'
+
+    allowed_domains = ['*']
+
+    def start_requests(self):
+        for url in self.start_urls:
+            yield Request(url=url, callback=self.parse, dont_filter=True)
+
+    async def parse(self, response):
+        jsonp_str = response.json()
+        rows = jsonp_str.get('data', {}).get('list', [])
+        for row in rows:
+            article_id = row.get('id')
+            title = row.get('title')
+            digest = row.get('digest')
+            short = row.get('short')
+            detail_url = row.get('url')
+            tag = row.get('tag')
+            ctime = row.get('ctime')
+            source = row.get('source')
+            meta = {
+                'article_id': article_id,
+                'title': title,
+                'digest': digest,
+                'short': short,
+                'detail_url': detail_url,
+                'source': source,
+                'tag': tag,
+                'ctime': to_datetime(int(ctime))
+            }
+
+            yield Request(url=detail_url, callback=self.parse_detail, encoding='gbk', meta=meta)
+
+    @staticmethod
+    async def parse_detail(response):
+        item = ArticleItem()
+        meta = response.meta
+        content = ''.join(response.xpath('//*[@id="contentApp"]/p/text()').extract()).strip()
+        ctime = meta.get('ctime')
+        item['article_id'] = meta.get('article_id')
+        item['title'] = meta.get('title')
+        item['digest'] = content
+        item['short'] = meta.get('short')
+        item['url'] = meta.get('detail_url')
+        item['tag'] = meta.get('tag').strip()
+        item['ctime'] = to_datetime(ctime)
+        item['source'] = meta.get('source')
+
+        yield item
+
+    async def spider_opened(self):
+        pass
+
+    async def spider_closed(self):
+        pass
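SinaSpider carries list-page fields to the detail callback through Request meta and reads them back from response.meta. A stripped-down sketch of that hand-off, using only the Request arguments and ArticleItem fields that appear in this file; the spider name and the specific fields carried are illustrative.

from crawlo import Request
from crawlo.spider import Spider
from examples.baidu_spider.items import ArticleItem


class MetaRoundTripSpider(Spider):  # hypothetical spider for illustration
    name = 'meta_round_trip_demo'
    start_urls = ['https://news.10jqka.com.cn/tapp/news/push/stock/?page=1']

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url=url, callback=self.parse, dont_filter=True)

    async def parse(self, response):
        for row in response.json().get('data', {}).get('list', []):
            # Stash list-page fields in meta so the detail callback can reuse them.
            meta = {'title': row.get('title'), 'detail_url': row.get('url')}
            yield Request(url=row.get('url'), callback=self.parse_detail, meta=meta)

    async def parse_detail(self, response):
        # Read the carried fields back; only a few ArticleItem fields are filled for brevity.
        item = ArticleItem()
        item['title'] = response.meta.get('title')
        item['url'] = response.meta.get('detail_url')
        yield item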
tests/__init__.py
CHANGED
@@ -1,7 +1,7 @@
-#!/usr/bin/python
-# -*- coding:UTF-8 -*-
-"""
-# @Time : 2025-08-24 12:36
-# @Author : crawl-coder
-# @Desc : None
-"""
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+# @Time : 2025-08-24 12:36
+# @Author : crawl-coder
+# @Desc : None
+"""
tests/test_proxy_health_check.py
CHANGED
@@ -1,33 +1,33 @@
-# tests/test_proxy_health_check.py
-import pytest
-from unittest.mock import AsyncMock, patch
-from crawlo.proxy.health_check import check_single_proxy
-import httpx
-
-
-@pytest.mark.asyncio
-@patch('httpx.AsyncClient')
-async def test_health_check_success(mock_client_class):
-    """Health check test: success."""
-    mock_resp = AsyncMock()
-    mock_resp.status_code = 200
-    mock_client_class.return_value.__aenter__.return_value.get.return_value = mock_resp
-
-    proxy_info = {'url': 'http://good:8080', 'healthy': False}
-    await check_single_proxy(proxy_info)
-
-    assert proxy_info['healthy'] is True
-    assert proxy_info['failures'] == 0
-
-
-@pytest.mark.asyncio
-@patch('httpx.AsyncClient')
-async def test_health_check_failure(mock_client_class):
-    """Health check test: failure."""
-    mock_client_class.return_value.__aenter__.return_value.get.side_effect = httpx.ConnectError("Failed")
-
-    proxy_info = {'url': 'http://bad:8080', 'healthy': True, 'failures': 0}
-    await check_single_proxy(proxy_info)
-
-    assert proxy_info['healthy'] is False
+# tests/test_proxy_health_check.py
+import pytest
+from unittest.mock import AsyncMock, patch
+from crawlo.proxy.health_check import check_single_proxy
+import httpx
+
+
+@pytest.mark.asyncio
+@patch('httpx.AsyncClient')
+async def test_health_check_success(mock_client_class):
+    """Health check test: success."""
+    mock_resp = AsyncMock()
+    mock_resp.status_code = 200
+    mock_client_class.return_value.__aenter__.return_value.get.return_value = mock_resp
+
+    proxy_info = {'url': 'http://good:8080', 'healthy': False}
+    await check_single_proxy(proxy_info)
+
+    assert proxy_info['healthy'] is True
+    assert proxy_info['failures'] == 0
+
+
+@pytest.mark.asyncio
+@patch('httpx.AsyncClient')
+async def test_health_check_failure(mock_client_class):
+    """Health check test: failure."""
+    mock_client_class.return_value.__aenter__.return_value.get.side_effect = httpx.ConnectError("Failed")
+
+    proxy_info = {'url': 'http://bad:8080', 'healthy': True, 'failures': 0}
+    await check_single_proxy(proxy_info)
+
+    assert proxy_info['healthy'] is False
     assert proxy_info['failures'] == 1