crawlo 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic.
- crawlo/__init__.py +9 -6
- crawlo/__version__.py +1 -2
- crawlo/core/__init__.py +2 -2
- crawlo/core/engine.py +158 -158
- crawlo/core/processor.py +40 -40
- crawlo/core/scheduler.py +57 -59
- crawlo/crawler.py +242 -107
- crawlo/downloader/__init__.py +78 -78
- crawlo/downloader/aiohttp_downloader.py +259 -96
- crawlo/downloader/httpx_downloader.py +187 -48
- crawlo/downloader/playwright_downloader.py +160 -160
- crawlo/event.py +11 -11
- crawlo/exceptions.py +64 -64
- crawlo/extension/__init__.py +31 -31
- crawlo/extension/log_interval.py +49 -49
- crawlo/extension/log_stats.py +44 -44
- crawlo/filters/__init__.py +37 -37
- crawlo/filters/aioredis_filter.py +157 -129
- crawlo/filters/memory_filter.py +202 -203
- crawlo/filters/redis_filter.py +119 -119
- crawlo/items/__init__.py +62 -62
- crawlo/items/items.py +118 -118
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +32 -32
- crawlo/middleware/download_delay.py +28 -28
- crawlo/middleware/middleware_manager.py +140 -140
- crawlo/middleware/request_ignore.py +30 -30
- crawlo/middleware/response_code.py +18 -18
- crawlo/middleware/response_filter.py +26 -26
- crawlo/middleware/retry.py +90 -89
- crawlo/network/__init__.py +7 -7
- crawlo/network/request.py +205 -155
- crawlo/network/response.py +166 -93
- crawlo/pipelines/__init__.py +13 -13
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/mongo_pipeline.py +116 -116
- crawlo/pipelines/mysql_batch_pipline.py +133 -133
- crawlo/pipelines/mysql_pipeline.py +195 -176
- crawlo/pipelines/pipeline_manager.py +56 -56
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +93 -89
- crawlo/settings/setting_manager.py +99 -99
- crawlo/spider/__init__.py +36 -36
- crawlo/stats_collector.py +59 -47
- crawlo/subscriber.py +106 -27
- crawlo/task_manager.py +27 -27
- crawlo/templates/item_template.tmpl +21 -21
- crawlo/templates/project_template/main.py +32 -32
- crawlo/templates/project_template/setting.py +189 -189
- crawlo/templates/spider_template.tmpl +30 -30
- crawlo/utils/__init__.py +7 -7
- crawlo/utils/concurrency_manager.py +125 -0
- crawlo/utils/date_tools.py +177 -177
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/log.py +39 -39
- crawlo/utils/pqueue.py +173 -173
- crawlo/utils/project.py +59 -59
- crawlo/utils/request.py +122 -85
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +303 -0
- crawlo/utils/url.py +39 -39
- {crawlo-1.0.1.dist-info → crawlo-1.0.3.dist-info}/METADATA +48 -36
- crawlo-1.0.3.dist-info/RECORD +80 -0
- {crawlo-1.0.1.dist-info → crawlo-1.0.3.dist-info}/top_level.txt +1 -0
- tests/__init__.py +7 -0
- tests/baidu_spider/__init__.py +7 -0
- tests/baidu_spider/demo.py +94 -0
- tests/baidu_spider/items.py +25 -0
- tests/baidu_spider/middleware.py +49 -0
- tests/baidu_spider/pipeline.py +55 -0
- tests/baidu_spider/request_fingerprints.txt +9 -0
- tests/baidu_spider/run.py +27 -0
- tests/baidu_spider/settings.py +78 -0
- tests/baidu_spider/spiders/__init__.py +7 -0
- tests/baidu_spider/spiders/bai_du.py +61 -0
- tests/baidu_spider/spiders/sina.py +79 -0
- crawlo-1.0.1.dist-info/RECORD +0 -67
- crawlo-1.0.1.dist-info/licenses/LICENSE +0 -23
- {crawlo-1.0.1.dist-info → crawlo-1.0.3.dist-info}/WHEEL +0 -0
- {crawlo-1.0.1.dist-info → crawlo-1.0.3.dist-info}/entry_points.txt +0 -0
crawlo/templates/project_template/setting.py
CHANGED

@@ -1,190 +1,190 @@
(The removed and added sides of this hunk are textually identical as rendered here; the change does not show at this level, most likely whitespace or line endings. The file content is shown once.)

# -*- coding: utf-8 -*-
"""Spider configuration file"""
# import os
# import sys
#
# # MYSQL
# MYSQL_IP = "localhost"
# MYSQL_PORT = 3306
# MYSQL_DB = ""
# MYSQL_USER_NAME = ""
# MYSQL_USER_PASS = ""
#
# # MONGODB
# MONGO_IP = "localhost"
# MONGO_PORT = 27017
# MONGO_DB = ""
# MONGO_USER_NAME = ""
# MONGO_USER_PASS = ""
# MONGO_URL = "
#
# # REDIS
# # ip:port; multiple entries may be written as a list or comma separated, e.g. ip1:port1,ip2:port2 or ["ip1:port1", "ip2:port2"]
# REDISDB_IP_PORTS = "localhost:6379"
# REDISDB_USER_PASS = ""
# REDISDB_DB = 0
# # Extra parameters passed when connecting to Redis, e.g. ssl=True
# REDISDB_KWARGS = dict()
# # For Redis sentinel mode
# REDISDB_SERVICE_NAME = ""
#
# # Pipelines that persist the data; customizable, default is MysqlPipeline
# ITEM_PIPELINES = [
#     "feapder.pipelines.mysql_pipeline.MysqlPipeline",
#     # "feapder.pipelines.mongo_pipeline.MongoPipeline",
#     # "feapder.pipelines.console_pipeline.ConsolePipeline",
# ]
# EXPORT_DATA_MAX_FAILED_TIMES = 10  # Max failures (saves and updates) when exporting data; alert once exceeded
# EXPORT_DATA_MAX_RETRY_TIMES = 10  # Max retries (saves and updates) when exporting data; give up once exceeded
#
# # Spider settings
# # COLLECTOR
# COLLECTOR_TASK_COUNT = 32  # Number of tasks fetched per batch; 32 recommended for speed
#
# # SPIDER
# SPIDER_THREAD_COUNT = 1  # Spider concurrency; 32 recommended for speed
# # Download interval in seconds. Randomization supported: e.g. SPIDER_SLEEP_TIME = [2, 5] sleeps a random 2-5 seconds, inclusive
# SPIDER_SLEEP_TIME = 0
# SPIDER_MAX_RETRY_TIMES = 10  # Max retries per request
# KEEP_ALIVE = False  # Whether the spider stays resident

# Download
# DOWNLOADER = "feapder.network.downloader.RequestsDownloader"  # Request downloader
# SESSION_DOWNLOADER = "feapder.network.downloader.RequestsSessionDownloader"
# RENDER_DOWNLOADER = "feapder.network.downloader.SeleniumDownloader"  # Render downloader
# # RENDER_DOWNLOADER="feapder.network.downloader.PlaywrightDownloader"
# MAKE_ABSOLUTE_LINKS = True  # Automatically convert links to absolute URLs

# # Browser rendering
# WEBDRIVER = dict(
#     pool_size=1,  # Number of browsers
#     load_images=True,  # Whether to load images
#     user_agent=None,  # A string, or a no-argument callable returning the user agent
#     proxy=None,  # xxx.xxx.xxx.xxx:xxxx, or a no-argument callable returning the proxy address
#     headless=False,  # Whether to run headless
#     driver_type="CHROME",  # CHROME, EDGE, PHANTOMJS, FIREFOX
#     timeout=30,  # Request timeout
#     window_size=(1024, 800),  # Window size
#     executable_path=None,  # Browser path; None means the default location
#     render_time=0,  # Render time: wait this long after opening the page before grabbing the source
#     custom_argument=[
#         "--ignore-certificate-errors",
#         "--disable-blink-features=AutomationControlled",
#     ],  # Custom browser arguments
#     xhr_url_regexes=None,  # Intercept XHR endpoints; regexes supported, list type
#     auto_install_driver=True,  # Auto-download the browser driver; chrome and firefox supported
#     download_path=None,  # Path for downloaded files
#     use_stealth_js=False,  # Use stealth.min.js to hide browser fingerprints
# )
#
# PLAYWRIGHT = dict(
#     user_agent=None,  # A string, or a no-argument callable returning the user agent
#     proxy=None,  # xxx.xxx.xxx.xxx:xxxx, or a no-argument callable returning the proxy address
#     headless=False,  # Whether to run headless
#     driver_type="chromium",  # chromium, firefox, webkit
#     timeout=30,  # Request timeout
#     window_size=(1024, 800),  # Window size
#     executable_path=None,  # Browser path; None means the default location
#     download_path=None,  # Path for downloaded files
#     render_time=0,  # Render time: wait this long after opening the page before grabbing the source
#     wait_until="networkidle",  # Page-load event to wait for; one of "commit", "domcontentloaded", "load", "networkidle"
#     use_stealth_js=False,  # Use stealth.min.js to hide browser fingerprints
#     page_on_event_callback=None,  # Callbacks for page.on() events, e.g. page_on_event_callback={"dialog": lambda dialog: dialog.accept()}
#     storage_state_path=None,  # Path for saving browser state
#     url_regexes=None,  # Intercept endpoints; regexes supported, list type
#     save_all=False,  # Whether to save all intercepted responses; used with url_regexes, False keeps only the last one
# )
#
# # On spider startup, re-crawl failed requests
# RETRY_FAILED_REQUESTS = False
# # On spider startup, re-save items that failed to persist
# RETRY_FAILED_ITEMS = False
# # Save failed requests
# SAVE_FAILED_REQUEST = True
# # Request loss protection (a request not finished within REQUEST_LOST_TIMEOUT is re-dispatched and redone)
# REQUEST_LOST_TIMEOUT = 600  # 10 minutes
# # Network timeout for requests
# REQUEST_TIMEOUT = 22  # Timeout waiting for the server response; a float, or a (connect timeout, read timeout) tuple
# # Max number of items cached in the in-memory queue
# ITEM_MAX_CACHED_COUNT = 5000
# # Max number of items persisted per batch
# ITEM_UPLOAD_BATCH_MAX_SIZE = 1000
# # Interval between item persistence batches
# ITEM_UPLOAD_INTERVAL = 1
# # Max number of tasks cached in the in-memory task queue; unlimited by default; only effective for AirSpider
# TASK_MAX_CACHED_SIZE = 0
#
# # Download cache backed by Redis; given memory limits it is recommended for development and debugging only, so each debug run does not repeat network requests
# RESPONSE_CACHED_ENABLE = False  # Whether to enable the download cache; True recommended for expensive data or data whose requirements change often
# RESPONSE_CACHED_EXPIRE_TIME = 3600  # Cache lifetime in seconds
# RESPONSE_CACHED_USED = False  # Whether to read from the cache; can be set to True when back-filling data
#
# # Proxy settings
# PROXY_EXTRACT_API = None  # Proxy extraction API; returned proxies are separated by \r\n
# PROXY_ENABLE = True
# PROXY_MAX_FAILED_TIMES = 5  # Max proxy failures; a proxy exceeding this is no longer used and is removed automatically
# PROXY_POOL = "feapder.network.proxy_pool.ProxyPool"  # Proxy pool
#
# # Random headers
# RANDOM_HEADERS = True
# # UserAgent type; supports 'chrome', 'opera', 'firefox', 'internetexplorer', 'safari', 'mobile'; a random type is used if unspecified
# USER_AGENT_TYPE = "chrome"
# # Default User-Agent header
# DEFAULT_USERAGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
# # Use a requests session
# USE_SESSION = False
#
# # Deduplication
# ITEM_FILTER_ENABLE = False  # Item deduplication
# REQUEST_FILTER_ENABLE = False  # Request deduplication
# ITEM_FILTER_SETTING = dict(
#     filter_type=1  # Permanent (BloomFilter) = 1, in-memory (MemoryFilter) = 2, temporary (ExpireFilter) = 3, lightweight (LiteFilter) = 4
# )
# REQUEST_FILTER_SETTING = dict(
#     filter_type=3,  # Permanent (BloomFilter) = 1, in-memory (MemoryFilter) = 2, temporary (ExpireFilter) = 3, lightweight (LiteFilter) = 4
#     expire_time=2592000,  # Expiry: one month
# )
#
# # Alerts: DingTalk, Feishu, WeCom, and email are supported
# # DingTalk alerts
# DINGDING_WARNING_URL = ""  # DingTalk bot API
# DINGDING_WARNING_PHONE = ""  # Phone numbers of group members to @; a list of several is supported
# DINGDING_WARNING_USER_ID = ""  # userIds of group members to @; a list of several is supported
# DINGDING_WARNING_ALL = False  # Whether to notify everyone; default False
# DINGDING_WARNING_SECRET = None  # Signing secret
# # Feishu alerts
# # https://open.feishu.cn/document/ukTMukTMukTM/ucTM5YjL3ETO24yNxkjN#e1cdee9f
# FEISHU_WARNING_URL = ""  # Feishu bot API
# FEISHU_WARNING_USER = None  # Recipient: {"open_id":"ou_xxxxx", "name":"xxxx"} or [{"open_id":"ou_xxxxx", "name":"xxxx"}]
# FEISHU_WARNING_ALL = False  # Whether to notify everyone; default False
# # Email alerts
# EMAIL_SENDER = ""  # Sender
# EMAIL_PASSWORD = ""  # Authorization code
# EMAIL_RECEIVER = ""  # Recipients; a list of several is supported
# EMAIL_SMTPSERVER = "smtp.163.com"  # Mail server; defaults to 163 mail
# # WeCom (WeChat Work) alerts
# WECHAT_WARNING_URL = ""  # WeCom bot API
# WECHAT_WARNING_PHONE = ""  # Recipients to @ in the group; a list of several is supported
# WECHAT_WARNING_ALL = False  # Whether to notify everyone; default False
# # Alert interval
# WARNING_INTERVAL = 3600  # Interval between identical alerts, to avoid flooding; 0 disables deduplication
# WARNING_LEVEL = "DEBUG"  # Alert level: DEBUG / INFO / ERROR
# WARNING_FAILED_COUNT = 1000  # Alert once the number of failed tasks exceeds WARNING_FAILED_COUNT
#
# LOG_NAME = os.path.basename(os.getcwd())
# LOG_PATH = "log/%s.log" % LOG_NAME  # Log file path
# LOG_LEVEL = "DEBUG"
# LOG_COLOR = True  # Whether log output is colorized
# LOG_IS_WRITE_TO_CONSOLE = True  # Whether to print to the console
# LOG_IS_WRITE_TO_FILE = False  # Whether to write to a file
# LOG_MODE = "w"  # File write mode
# LOG_MAX_BYTES = 10 * 1024 * 1024  # Max bytes per log file
# LOG_BACKUP_COUNT = 20  # Number of log files to keep
# LOG_ENCODING = "utf8"  # Log file encoding
# OTHERS_LOG_LEVAL = "ERROR"  # Log level for third-party libraries
#
# # Switch the working directory to the project path
# project_path = os.path.abspath(os.path.dirname(__file__))
# os.chdir(project_path)  # Switch the working directory
# sys.path.insert(0, project_path)
# print("当前工作路径为 " + os.getcwd())
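For orientation, here is a minimal sketch of what a project-level setting.py based on this template might look like, with a handful of the commented defaults uncommented and filled in. Which keys a project actually needs depends on its queue backend and pipelines, so the selection below is illustrative only.

# Illustrative project setting.py: only keys taken from the template above.
REDISDB_IP_PORTS = "localhost:6379"
REDISDB_USER_PASS = ""
REDISDB_DB = 0

SPIDER_THREAD_COUNT = 32      # the template's recommended value for speed
SPIDER_MAX_RETRY_TIMES = 10

LOG_LEVEL = "INFO"
LOG_IS_WRITE_TO_FILE = True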
crawlo/templates/spider_template.tmpl
CHANGED

@@ -1,31 +1,31 @@
(The removed and added sides of this hunk are textually identical as rendered here; the content is shown once.)

# -*- coding: utf-8 -*-
"""
Created on {DATE}
---------
@summary:
---------
@author: {USER}
"""

import crawlo


class ${spider_name}(crawlo.Spider):
    # Custom database settings; if the project has a setting.py file, this block can be removed
    __custom_setting__ = dict(
        REDISDB_IP_PORTS="localhost:6379", REDISDB_USER_PASS="", REDISDB_DB=0
    )

    def start_requests(self):
        yield feapder.Request("https://spidertools.cn")

    def parse(self, request, response):
        # Extract the page title
        print(response.xpath("//title/text()").extract_first())
        # Extract the page description
        print(response.xpath("//meta[@name='description']/@content").extract_first())
        print("网站地址: ", response.url)


if __name__ == "__main__":
    ${spider_name}(redis_key="xxx:xxx").start()
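The template mixes two placeholder styles: ${spider_name} in string.Template syntax plus the literal tokens {DATE} and {USER} in the docstring, and its body still yields feapder.Request even though it imports crawlo. Below is a minimal, hypothetical sketch of how a code generator could fill those placeholders; the helper name and the two-step substitution are assumptions for illustration, not crawlo's actual scaffolding code.

from datetime import date
from getpass import getuser
from string import Template


def render_spider_template(template_text: str, spider_name: str) -> str:
    """Hypothetical helper: fill the placeholders used by spider_template.tmpl."""
    # ${spider_name} follows string.Template syntax; {DATE} and {USER} are plain tokens.
    rendered = Template(template_text).safe_substitute(spider_name=spider_name)
    return rendered.replace("{DATE}", date.today().isoformat()).replace("{USER}", getuser())


# Example: render_spider_template(open("crawlo/templates/spider_template.tmpl").read(), "BaiDuSpider")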
crawlo/utils/__init__.py
CHANGED
@@ -1,7 +1,7 @@
(The removed and added sides of this hunk are textually identical as rendered here; the content is shown once.)

#!/usr/bin/python
# -*- coding:UTF-8 -*-
"""
# @Time : 2025-02-05 13:57
# @Author : oscar
# @Desc : None
"""
crawlo/utils/concurrency_manager.py
ADDED

@@ -0,0 +1,125 @@

import os
import platform
import logging
from typing import Optional

try:
    import psutil  # third-party library used to read system resource information
except ImportError:
    psutil = None  # set to None if psutil is unavailable

logger = logging.getLogger(__name__)


def calculate_optimal_concurrency(user_specified: Optional[int] = None, use_logical_cores: bool = True) -> int:
    """
    Compute an optimal concurrency level from system resources, or use the user-specified value.

    Args:
        user_specified: user-specified concurrency (takes precedence)
        use_logical_cores: whether to count logical CPU cores (hyper-threading); default True

    Returns:
        The computed optimal concurrency.

    Notes:
        1. A user-specified concurrency always wins.
        2. The calculation strategy depends on the operating system:
           - Windows: conservative, to avoid memory pressure
           - macOS: balanced resource usage
           - Linux: make full use of server resources
           - other systems: a reasonable default
        3. Uses available memory and the CPU core count.
        4. Provides fallbacks when psutil is unavailable.
    """
    # A user-specified concurrency takes precedence
    if user_specified is not None:
        logger.info(f"Using user-specified concurrency: {user_specified}")
        return user_specified

    try:
        current_os = platform.system()  # detect the operating system
        logger.debug(f"Detected operating system: {current_os}")

        # CPU core count (logical or physical, depending on the flag)
        cpu_count = psutil.cpu_count(logical=use_logical_cores) or 1 if psutil else os.cpu_count() or 1

        # Pick the calculation strategy for the current OS
        if current_os == "Windows":
            concurrency = _get_concurrency_for_windows(cpu_count, use_logical_cores)
        elif current_os == "Darwin":  # macOS
            concurrency = _get_concurrency_for_macos(cpu_count, use_logical_cores)
        elif current_os == "Linux":
            concurrency = _get_concurrency_for_linux(cpu_count, use_logical_cores)
        else:  # other operating systems
            concurrency = _get_concurrency_default(cpu_count)

        logger.info(f"Computed maximum concurrency: {concurrency}")
        return concurrency

    except Exception as e:
        logger.warning(f"Dynamic concurrency calculation failed: {str(e)}; falling back to 50")
        return 50  # safe default when the calculation fails


def _get_concurrency_for_windows(cpu_count: int, use_logical_cores: bool) -> int:
    """Concurrency calculation for Windows."""
    if psutil:
        # Available memory in GB
        available_memory = psutil.virtual_memory().available / (1024 ** 3)
        # Memory-based bound: 10 concurrent tasks per 4 GB of available memory
        mem_based = int((available_memory / 4) * 10)
        # CPU-based bound: larger multiplier when logical cores are counted
        cpu_based = cpu_count * (5 if use_logical_cores else 3)
        # Clamp to 5-100, taking the smaller of the memory and CPU bounds
        return max(5, min(100, mem_based, cpu_based))
    else:
        # Fallback without psutil
        return min(50, cpu_count * 5)


def _get_concurrency_for_macos(cpu_count: int, use_logical_cores: bool) -> int:
    """Concurrency calculation for macOS."""
    if psutil:
        available_memory = psutil.virtual_memory().available / (1024 ** 3)
        # Memory-based bound: 10 concurrent tasks per 3 GB of available memory
        mem_based = int((available_memory / 3) * 10)
        # CPU-based bound: larger multiplier when logical cores are counted
        cpu_based = cpu_count * (6 if use_logical_cores else 4)
        # Clamp to 5-120
        return max(5, min(120, mem_based, cpu_based))
    else:
        try:
            # macOS fallback: read the physical core count via a system command
            import subprocess
            output = subprocess.check_output(["sysctl", "hw.physicalcpu"])
            cpu_count = int(output.split()[1])
            return min(60, cpu_count * 5)
        except:
            return 40  # reasonable default for a Mac


def _get_concurrency_for_linux(cpu_count: int, use_logical_cores: bool) -> int:
    """Concurrency calculation for Linux (more aggressive)."""
    if psutil:
        available_memory = psutil.virtual_memory().available / (1024 ** 3)
        # Memory-based bound: 10 concurrent tasks per 1.5 GB of available memory
        mem_based = int((available_memory / 1.5) * 10)
        # CPU-based bound: larger multiplier for server environments
        cpu_based = cpu_count * (8 if use_logical_cores else 5)
        # Clamp to 5-200
        return max(5, min(200, mem_based, cpu_based))
    else:
        try:
            # Linux fallback: parse /proc/cpuinfo
            with open("/proc/cpuinfo") as f:
                cpu_count = f.read().count("processor\t:")
            if cpu_count > 0:
                return min(200, cpu_count * 8)
        except:
            return 50  # reasonable default for a Linux server


def _get_concurrency_default(cpu_count: int) -> int:
    """Default calculation for unknown operating systems."""
    return min(50, cpu_count * 5)  # conservative default