crawlo 1.4.3__py3-none-any.whl → 1.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic.
- crawlo/__version__.py +1 -1
- crawlo/commands/genspider.py +52 -17
- crawlo/crawler.py +6 -0
- crawlo/queue/pqueue.py +2 -6
- crawlo/queue/queue_manager.py +1 -2
- crawlo/settings/default_settings.py +11 -30
- crawlo/templates/project/settings.py.tmpl +51 -65
- crawlo/templates/project/settings_distributed.py.tmpl +59 -67
- crawlo/templates/project/settings_gentle.py.tmpl +45 -40
- crawlo/templates/project/settings_high_performance.py.tmpl +45 -40
- crawlo/templates/project/settings_minimal.py.tmpl +37 -26
- crawlo/templates/project/settings_simple.py.tmpl +45 -40
- crawlo/templates/run.py.tmpl +3 -7
- {crawlo-1.4.3.dist-info → crawlo-1.4.4.dist-info}/METADATA +1 -1
- {crawlo-1.4.3.dist-info → crawlo-1.4.4.dist-info}/RECORD +21 -27
- tests/test_multi_directory.py +68 -0
- tests/test_multiple_spider_modules.py +81 -0
- tests/test_spider_modules.py +85 -0
- examples/test_project/__init__.py +0 -7
- examples/test_project/run.py +0 -35
- examples/test_project/test_project/__init__.py +0 -4
- examples/test_project/test_project/items.py +0 -18
- examples/test_project/test_project/middlewares.py +0 -119
- examples/test_project/test_project/pipelines.py +0 -97
- examples/test_project/test_project/settings.py +0 -170
- examples/test_project/test_project/spiders/__init__.py +0 -10
- examples/test_project/test_project/spiders/of_week_dis.py +0 -144
- {crawlo-1.4.3.dist-info → crawlo-1.4.4.dist-info}/WHEEL +0 -0
- {crawlo-1.4.3.dist-info → crawlo-1.4.4.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.3.dist-info → crawlo-1.4.4.dist-info}/top_level.txt +0 -0
@@ -6,75 +6,79 @@
 适合快速开始和简单项目。
 """
 
-#
+# =================================== 基础配置 ===================================
+
+# 项目基本信息
 PROJECT_NAME = '{{project_name}}'
 
-#
+# 运行模式
 RUN_MODE = 'standalone'
 
-#
+# 并发配置
 CONCURRENCY = 4
 MAX_RUNNING_SPIDERS = 1
 DOWNLOAD_DELAY = 1.0
 
-#
-
-#
-# DOWNLOADER = 'crawlo.downloader.httpx_downloader.HttpXDownloader'
-# DOWNLOADER = 'crawlo.downloader.cffi_downloader.CurlCffiDownloader'
+# =================================== 核心组件配置 ===================================
+
+# 下载器配置
 DOWNLOADER = 'crawlo.downloader.aiohttp_downloader.AioHttpDownloader'
 
-#
+# 队列配置
 QUEUE_TYPE = 'memory'
 
-#
+# 去重过滤器
 FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'
 
-#
+# 默认去重管道
 DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline'
 
-#
-SPIDER_MODULES = ['{{project_name}}.spiders']
+# =================================== 爬虫配置 ===================================
 
-#
-
-#     'crawlo.middleware.simple_proxy.SimpleProxyMiddleware',
-# ]
+# 爬虫模块配置
+SPIDER_MODULES = ['{{project_name}}.spiders']
 
-#
+# 默认请求头配置
 # 为DefaultHeaderMiddleware配置默认请求头
-DEFAULT_REQUEST_HEADERS = {
-    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
-    'Accept-Encoding': 'gzip, deflate, br',
-}
+# DEFAULT_REQUEST_HEADERS = {}
 
-#
+# 允许的域名
 # 为OffsiteMiddleware配置允许的域名
-# ALLOWED_DOMAINS = [
+# ALLOWED_DOMAINS = []
 
-#
+# 数据管道
+# 如需添加自定义管道,请取消注释并添加
 # PIPELINES = [
-#     'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',
+#     'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',  # MySQL 存储(使用asyncmy异步库)
+#     # '{{project_name}}.pipelines.CustomPipeline',  # 用户自定义管道示例
 # ]
 
-#
+# =================================== 系统配置 ===================================
+
+# 扩展组件
+# 如需添加自定义扩展,请取消注释并添加
 # EXTENSIONS = [
-#     '
-#
-
+#     # '{{project_name}}.extensions.CustomExtension',  # 用户自定义扩展示例
+# ]
+
+# 中间件
+# 如需添加自定义中间件,请取消注释并添加
+# MIDDLEWARES = [
+#     # '{{project_name}}.middlewares.CustomMiddleware',  # 用户自定义中间件示例
 # ]
 
-#
+# 日志配置
 LOG_LEVEL = 'INFO'
 LOG_FILE = 'logs/{{project_name}}.log'
 LOG_ENCODING = 'utf-8'  # 明确指定日志文件编码
 STATS_DUMP = True
 
-#
+# 输出配置
 OUTPUT_DIR = 'output'
 
-#
+# =================================== 数据库配置 ===================================
+
+# Redis配置
 REDIS_HOST = '127.0.0.1'
 REDIS_PORT = 6379
 REDIS_PASSWORD = ''
@@ -86,7 +90,7 @@ if REDIS_PASSWORD:
 else:
     REDIS_URL = f'redis://{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}'
 
-#
+# MySQL配置
 MYSQL_HOST = '127.0.0.1'
 MYSQL_PORT = 3306
 MYSQL_USER = 'root'
@@ -96,7 +100,7 @@ MYSQL_TABLE = '{{project_name}}_data'
 MYSQL_BATCH_SIZE = 100
 MYSQL_USE_BATCH = False  # 是否启用批量插入
 
-#
+# MongoDB配置
 MONGO_URI = 'mongodb://localhost:27017'
 MONGO_DATABASE = '{{project_name}}_db'
 MONGO_COLLECTION = '{{project_name}}_items'
@@ -105,7 +109,9 @@ MONGO_MIN_POOL_SIZE = 20
 MONGO_BATCH_SIZE = 100  # 批量插入条数
 MONGO_USE_BATCH = False  # 是否启用批量插入
 
-#
+# =================================== 网络配置 ===================================
+
+# 代理配置
 # 代理功能默认不启用,如需使用请在项目配置文件中启用并配置相关参数
 PROXY_ENABLED = False  # 是否启用代理
 
@@ -124,7 +130,6 @@ PROXY_EXTRACTOR = "proxy"
 PROXY_REFRESH_INTERVAL = 60  # 代理刷新间隔(秒)
 PROXY_API_TIMEOUT = 10  # 请求代理 API 超时时间
 
-# ============================== Curl-Cffi 特有配置 ==============================
 # 浏览器指纹模拟(仅 CurlCffi 下载器有效)
 CURL_BROWSER_TYPE = "chrome"  # 可选: chrome, edge, safari, firefox 或版本如 chrome136
 
@@ -136,7 +141,7 @@ CURL_BROWSER_VERSION_MAP = {
     "firefox": "firefox135",
 }
 
-#
+# 下载器优化配置
 # 下载器健康检查
 DOWNLOADER_HEALTH_CHECK = True  # 是否启用下载器健康检查
 HEALTH_CHECK_INTERVAL = 60  # 健康检查间隔(秒)
@@ -157,7 +162,7 @@ AIOHTTP_FORCE_CLOSE = False  # 是否强制关闭连接
 CONNECTION_TTL_DNS_CACHE = 300  # DNS缓存TTL(秒)
 CONNECTION_KEEPALIVE_TIMEOUT = 15  # Keep-Alive超时(秒)
 
-#
+# 内存监控配置
 # 内存监控扩展默认不启用,如需使用请在项目配置文件中启用
 MEMORY_MONITOR_ENABLED = False  # 是否启用内存监控
 MEMORY_MONITOR_INTERVAL = 60  # 内存监控检查间隔(秒)
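The settings-template hunks above reorganize the generated settings.py into banner-labelled sections and ship most optional settings commented out instead of pre-filled. As a rough, hypothetical sketch of how a generated project might opt back in (the project name myproject and the uncommented values below are illustrative only, not part of this diff):

# Hypothetical user-edited settings.py for a project generated from this template.
PROJECT_NAME = 'myproject'
RUN_MODE = 'standalone'

CONCURRENCY = 4
DOWNLOAD_DELAY = 1.0

SPIDER_MODULES = ['myproject.spiders']

# Opted back in by uncommenting the template defaults:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
}

PIPELINES = [
    'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',  # MySQL storage via asyncmy
]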
crawlo/templates/run.py.tmpl
CHANGED
@@ -6,7 +6,7 @@
 基于 Crawlo 框架的简化爬虫启动器。
 
 框架会自动处理爬虫模块的导入和注册,用户无需手动导入。
-
+框架会自动从settings.py中读取SPIDER_MODULES配置。
 """
 import sys
 import asyncio
@@ -17,12 +17,8 @@ from crawlo.crawler import CrawlerProcess
 def main():
     """主函数:运行爬虫"""
     try:
-        #
-
-        process = CrawlerProcess(spider_modules=spider_modules)
-
-        # TODO 运行指定的爬虫
-        asyncio.run(process.crawl('spider_name'))
+        # TODO: 请将 'spider_name' 替换为实际要运行的爬虫名称
+        asyncio.run(CrawlerProcess().crawl('spider_name'))
 
     except Exception as e:
         print(f"❌ 运行失败: {e}")
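With this template change, a freshly generated run.py no longer builds its own spider_modules list; SPIDER_MODULES is read from the project's settings.py. A rough sketch of the rendered script for a hypothetical project (the spider name 'news' is a placeholder the user would substitute):

#!/usr/bin/env python3
# Hypothetical rendering of the new run.py template; 'news' is a placeholder spider name.
import sys
import asyncio

from crawlo.crawler import CrawlerProcess


def main():
    try:
        # SPIDER_MODULES comes from settings.py, so no spider_modules argument is passed here.
        asyncio.run(CrawlerProcess().crawl('news'))
    except Exception as e:
        print(f"Run failed: {e}")
        sys.exit(1)


if __name__ == '__main__':
    main()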
{crawlo-1.4.3.dist-info → crawlo-1.4.4.dist-info}/RECORD
CHANGED
@@ -1,9 +1,9 @@
 crawlo/__init__.py,sha256=2Io5P9qJghOAjjD3YWdaiIq5laPLyLWVkEqgiVfUa3o,2381
-crawlo/__version__.py,sha256=
+crawlo/__version__.py,sha256=2ik6wvURqg571WApVvR_ELhg_eclmC_WvbDLEPmoO4Q,23
 crawlo/cli.py,sha256=AQnAB5NMI-Ic1VPw_Jjng8L4AI4-wMozOwzE6CfXkZU,2402
 crawlo/config.py,sha256=EQIT7WpkXAlr2ocd5SYJYOKTSWUlQx2AkTHX7ErEWxw,9798
 crawlo/config_validator.py,sha256=oY4-2bwXUlwHAnGgkI-EznviDfML_dcxbWSGXNSxC2k,11516
-crawlo/crawler.py,sha256=
+crawlo/crawler.py,sha256=E-fgYVtx6v2xEPixlQeWfNYVbW1oeWE0fQFZTQ6_K-I,27305
 crawlo/event.py,sha256=ZhoPW5CglCEuZNFEwviSCBIw0pT5O6jT98bqYrDFd3E,324
 crawlo/exceptions.py,sha256=YVIDnC1bKSMv3fXH_6tinWMuD9HmKHIaUfO4_fkX5sY,1247
 crawlo/framework.py,sha256=9gP6VN4MHqutGXaxnwpNMSULfVYbNp906UdZiJGywlQ,9458
@@ -14,7 +14,7 @@ crawlo/subscriber.py,sha256=h8fx69NJZeWem0ZkCmfHAi2kgfDGFObHpwN0aGNUM6Y,5115
 crawlo/task_manager.py,sha256=I9h3Rl0VRAfwqp24CHT3TuEAapNdTbVghkmuJhtM7jg,5966
 crawlo/commands/__init__.py,sha256=orvY6wLOBwGUEJKeF3h_T1fxj8AaQLjngBDd-3xKOE4,392
 crawlo/commands/check.py,sha256=TKDhI_sj7kErgiJpt2vCZ9QL-g6yWjrrPWKbgh8pgEU,23199
-crawlo/commands/genspider.py,sha256=
+crawlo/commands/genspider.py,sha256=JB4ZuFpKsYwtjx3DSsxugH7e3kqxhDWPG5ZKfvM0isI,6041
 crawlo/commands/help.py,sha256=8xPC0iNCg1rRBoK2bb6noAEANc1JwrdM35eF-j6yeZM,5111
 crawlo/commands/list.py,sha256=trzcd3kG6DhkOqYZADcl3yR7M8iJBgRw5fE-g9e0gVM,5877
 crawlo/commands/run.py,sha256=EjpIilgCTkXGVSV4rEISbJubdhqrok9nNe5-xDfDK5E,13169
@@ -92,26 +92,26 @@ crawlo/pipelines/mysql_pipeline.py,sha256=Kjgu6cks1KD4FPXwlTnFaos2LG-N8LLaBDyKZ_
 crawlo/pipelines/pipeline_manager.py,sha256=R6MRb5d-caOit7PZoglJLHa3qQ68U5YAQlwt8KcjRxo,4393
 crawlo/pipelines/redis_dedup_pipeline.py,sha256=RB1kXLr8ZuWNrgZKYwt--tlmnWsQTbuwTsSt3pafol8,6077
 crawlo/queue/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-crawlo/queue/pqueue.py,sha256=
-crawlo/queue/queue_manager.py,sha256=
+crawlo/queue/pqueue.py,sha256=bbgd3l1VfqYXfz-4VFaiWLmJit1LdB3qHalCtNqyrqI,1210
+crawlo/queue/queue_manager.py,sha256=8rKygMxr6DgSjnGsKFmvlTI5XAARvQIN_ENkAruHGXs,21532
 crawlo/queue/redis_priority_queue.py,sha256=vLvg2toKaRrXD1QyEdu1ZjTmANv7clFaBF7mCtstBmI,15995
 crawlo/settings/__init__.py,sha256=NgYFLfk_Bw7h6KSoepJn_lMBSqVbCHebjKxaE3_eMgw,130
-crawlo/settings/default_settings.py,sha256=
+crawlo/settings/default_settings.py,sha256=IKh2eZ9WWXkAbHx5K5KX0whNtumATRpZ7ifFPZJFfBk,11827
 crawlo/settings/setting_manager.py,sha256=yI1tGaludevxKGGZO3Pn4aYofrg2cwYwvMZCFC5PPZw,8595
 crawlo/spider/__init__.py,sha256=QGhe_yNsnfnCF3G9nSoWEw23b8SkP5oSFU5W79C5DzI,21881
 crawlo/templates/crawlo.cfg.tmpl,sha256=lwiUVe5sFixJgHFEjn1OtbAeyWsECOrz37uheuVtulk,240
-crawlo/templates/run.py.tmpl,sha256=
+crawlo/templates/run.py.tmpl,sha256=g8yst2hkqhKGNotR33fDxwmEsX6aEvhrXY_cfYos_vc,788
 crawlo/templates/spiders_init.py.tmpl,sha256=p6UK8KWr8FDydNxiAh6Iz29MY5WmgXIkf2z-buOGhOM,354
 crawlo/templates/project/__init__.py.tmpl,sha256=aQnHaOjMSkTviOC8COUX0fKymuyf8lx2tGduxkMkXEE,61
 crawlo/templates/project/items.py.tmpl,sha256=8_3DBA8HrS2XbfHzsMZNJiZbFY6fDJUUMFoFti_obJk,314
 crawlo/templates/project/middlewares.py.tmpl,sha256=fxHqi-Sjec5GiHJciprOU-6SAUTzM728NlZckIqf9hM,4278
 crawlo/templates/project/pipelines.py.tmpl,sha256=j9oqEhCezmmHlBhMWgYtlgup4jhWnMlv6AEiAOHODkg,2704
-crawlo/templates/project/settings.py.tmpl,sha256=
-crawlo/templates/project/settings_distributed.py.tmpl,sha256=
-crawlo/templates/project/settings_gentle.py.tmpl,sha256=
-crawlo/templates/project/settings_high_performance.py.tmpl,sha256=
-crawlo/templates/project/settings_minimal.py.tmpl,sha256=
-crawlo/templates/project/settings_simple.py.tmpl,sha256=
+crawlo/templates/project/settings.py.tmpl,sha256=mL9_JAyz8R35r-ywRHi4T-dtal7oczU5kodEWxldw40,5265
+crawlo/templates/project/settings_distributed.py.tmpl,sha256=RHzfWZITv-0ErCR9OYEswAZHpA5d9fYil0ZoGCtFt8g,5459
+crawlo/templates/project/settings_gentle.py.tmpl,sha256=pmjrBLjnpGcR90RkcJrM5O8PsTrRhUB92QR3R4TJyko,5733
+crawlo/templates/project/settings_high_performance.py.tmpl,sha256=9QhXSzfxIsMPyq0kZY9h2YBllyXGpGE37bMEbSrs_Ag,5823
+crawlo/templates/project/settings_minimal.py.tmpl,sha256=1qUPhSdHtvLSHTpytUJ8K63sMROhTwkz8e4tVg1fYoM,2222
+crawlo/templates/project/settings_simple.py.tmpl,sha256=sIyrCIVXsHSKl8Yjj8HkGs-ppMFH26a5yp6egVNlT2Q,5585
 crawlo/templates/project/spiders/__init__.py.tmpl,sha256=llhcIItXpm0TlEeumeLwp4fcYv2NHl8Iru7tLhDhxiE,216
 crawlo/templates/spider/spider.py.tmpl,sha256=KvU-9YpN6MifDE7XzejjyyQS7RUjLDLZ8zqJcLwSsu0,5198
 crawlo/tools/__init__.py,sha256=tOYfYPvZlrO8cmvnMWBjTma6UTLTFZN3qdC8pJwHrzI,4142
@@ -151,15 +151,6 @@ crawlo/utils/system.py,sha256=24zGmtHNhDFMGVo7ftMV-Pqg6_5d63zsyNey9udvJJk,248
 crawlo/utils/tools.py,sha256=uy7qw5Z1BIhyEgiHENvtM7WoGCJxlS8EX3PmOA7ouCo,275
 crawlo/utils/url.py,sha256=RKe_iqdjafsNcp-P2GVLYpsL1qbxiuZLiFc-SqOQkcs,1521
 examples/__init__.py,sha256=NkRbV8_S1tb8S2AW6BE2U6P2-eGOPwMR1k0YQAwQpSE,130
-examples/test_project/__init__.py,sha256=BQ6FVVDjB00-Vyib8h2I3P-LV5tjsFzTSLC2rFDe7Gw,136
-examples/test_project/run.py,sha256=HHMFVY8D9ouWytp-xNvFDKB8NphrZGuFbVrah6-afG8,953
-examples/test_project/test_project/__init__.py,sha256=RU5IUwU1oeDABAj3ZsLdMo53r3XG4saYIYgO1SXo13g,57
-examples/test_project/test_project/items.py,sha256=L_bnTuJLW7xtuja4AaqYuSsQvVdaqMunZWdh1OTz4HY,310
-examples/test_project/test_project/middlewares.py,sha256=fE7AUI2Yb9_ZwA4vC4ngn-s231FtEYeYWjXOwd5mkzE,4270
-examples/test_project/test_project/pipelines.py,sha256=7EDGrvuNH6JY0XINRwzgPmB-X4Ax6J790jtxbkL2O_U,2696
-examples/test_project/test_project/settings.py,sha256=imfrkrwSJ1V3jINe2ZhynlT4_w5bUBkeI9DFK7DS2g0,7115
-examples/test_project/test_project/spiders/__init__.py,sha256=Lx_To88ShpAR6Pyd9PUMjct690MZH0gETxz6knRqPRY,212
-examples/test_project/test_project/spiders/of_week_dis.py,sha256=-gDLaKGoF5birxCoLL_CX82bYopXjo4QmOV6a7I-Ci0,5178
 tests/__init__.py,sha256=409aRX8hsPffiZCVjOogtxwhACzBp8G2UTJyUQSxhK0,136
 tests/advanced_tools_example.py,sha256=1_iitECKCuWUYMNNGo61l3lmwMRrWdA8F_Xw56UaGZY,9340
 tests/authenticated_proxy_example.py,sha256=fKmHXXxIxCJXjEplttCWRh7PZhbxkBSxJF91Bx-qOME,3019
@@ -260,6 +251,8 @@ tests/test_logging_system.py,sha256=LGfK14ZEWzRtl3_VkBGz-AaVa_dDtuC5zu40m8FvmMo,
 tests/test_middleware_debug.py,sha256=gtiaWCxBSTcaNkdqXirM7CsThr_HfiCueBdQCpp7rqg,4572
 tests/test_mode_change.py,sha256=GT53CBdxcG3-evcKz_OOfH4PBiq_oqQyuDjRXrvv1UU,2665
 tests/test_mode_consistency.py,sha256=t72WX0etC_AayaL2AT6e2lIgbfP-zxTgYAiTARSN2Jk,1276
+tests/test_multi_directory.py,sha256=sH9Y3B-fuESlc7J1aICa-AlBcCW8HFR-Q5j2anUr8l0,2196
+tests/test_multiple_spider_modules.py,sha256=M0wPyQW7HMasbMIgn_R78wjZEj4A_DgqaGHp0qF9Y0c,2567
 tests/test_offsite_middleware.py,sha256=njpXTdngOqBs60Wj6xgo5EEXlJnMHd7vtYGi9dVauW0,10602
 tests/test_offsite_middleware_simple.py,sha256=4MfDKSXGHcoFLYnnxCH2rmnzztWyN0xByYLoHtepyiA,7918
 tests/test_parsel.py,sha256=wuZqRFIm9xx1tt6o3Xi_OjvwhT_MPmHiUEj2ax06zlo,701
@@ -306,6 +299,7 @@ tests/test_retry_middleware_realistic.py,sha256=Sam5y4jCN8oeElU4xxeS5zjAyzS-P8si
 tests/test_scheduler.py,sha256=1fCu35QgK5gzgrhD0aUZj5lxL0QbokzPav-yEJxz9Ig,8182
 tests/test_scheduler_config_update.py,sha256=LuxjEbt20QrPyVkjSFxvTnFtUxwMaHB6TcqjFyo8bow,4261
 tests/test_simple_response.py,sha256=_ui2PuVZvJcAuLY7HZ8xcsy_tDBimgBqX0ukj3kE5J0,1549
+tests/test_spider_modules.py,sha256=wxPs28FtpGnQTemMY6r7WxVrwYo3bHnAd5dq94qj1K4,2797
 tests/test_telecom_spider_redis_key.py,sha256=c-gfixPul2VlYMQJGf0H5ZgYJ461fQgSKbCPrbAU45M,7625
 tests/test_template_content.py,sha256=2RgCdOA3pMUSOqC_JbTGeW7KonbTqJ0ySYJNWegU-v0,2903
 tests/test_template_redis_key.py,sha256=99-s0_-8MFJbIvGG_X__sH0qkXWTtJv8fdTdlftsq4I,4876
@@ -319,8 +313,8 @@ tests/verify_distributed.py,sha256=0IolM4ymuPOz_uTfHSWFO3Vxzp7Lo6i0zhSbzJhHFtI,4
 tests/verify_log_fix.py,sha256=7reyVl3MXTDASyChgU5BAYuzuxvFjSLG9HywAHso0qg,4336
 tests/scrapy_comparison/ofweek_scrapy.py,sha256=rhVds_WjYum1bLuWWe90HtXE51fZXEqhhPSc822ZasQ,5790
 tests/scrapy_comparison/scrapy_test.py,sha256=-IsGUHPBgEL0TmXjeLZl-TUA01B7Dsc2nRo4JZbFwZA,5599
-crawlo-1.4.
-crawlo-1.4.
-crawlo-1.4.
-crawlo-1.4.
-crawlo-1.4.
+crawlo-1.4.4.dist-info/METADATA,sha256=LAg9xmMfxLUwVUGPqw_p48hGJYZqsRC9Mc4KqDroAUQ,4848
+crawlo-1.4.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+crawlo-1.4.4.dist-info/entry_points.txt,sha256=5HoVoTSPxI8SCa5B7pQYxLSrkOdiunyO9tqNsLMv52g,43
+crawlo-1.4.4.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
+crawlo-1.4.4.dist-info/RECORD,,
tests/test_multi_directory.py
ADDED
@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+# -*- coding: UTF-8 -*-
+"""
+测试多个爬虫目录的支持
+"""
+import sys
+import os
+
+# 添加项目根目录到Python路径
+sys.path.insert(0, os.path.dirname(__file__))
+
+# 添加ofweek_standalone到Python路径
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'examples', 'ofweek_standalone'))
+
+def test_multiple_spider_directories():
+    """测试多个爬虫目录的支持"""
+    print("测试多个爬虫目录的支持...")
+
+    # 导入设置
+    import examples.ofweek_standalone.ofweek_standalone.settings as settings_module
+
+    # 创建设置管理器
+    from crawlo.settings.setting_manager import SettingManager
+    settings = SettingManager()
+    settings.set_settings(settings_module)
+
+    # 检查SPIDER_MODULES配置
+    spider_modules = settings.get('SPIDER_MODULES')
+    print(f"SPIDER_MODULES配置: {spider_modules}")
+
+    # 创建CrawlerProcess实例
+    from crawlo.crawler import CrawlerProcess
+    process = CrawlerProcess(settings=settings)
+
+    # 检查是否注册了爬虫
+    spider_names = process.get_spider_names()
+    print(f"已注册的爬虫: {spider_names}")
+
+    # 验证期望的爬虫是否已注册
+    expected_spiders = ['of_week_standalone', 'test_spider']
+    registered_spiders = []
+
+    for spider_name in expected_spiders:
+        if spider_name in spider_names:
+            print(f"✅ 成功: 爬虫 '{spider_name}' 已注册")
+            registered_spiders.append(spider_name)
+        else:
+            print(f"❌ 失败: 爬虫 '{spider_name}' 未找到")
+
+    if len(registered_spiders) == len(expected_spiders):
+        print(f"🎉 所有爬虫都已成功注册!")
+        return True
+    else:
+        print(f"⚠️ 部分爬虫未注册: {set(expected_spiders) - set(registered_spiders)}")
+        return False
+
+
+if __name__ == '__main__':
+    print("开始测试多个爬虫目录的支持...\n")
+
+    success = test_multiple_spider_directories()
+
+    if success:
+        print("\n🎉 测试通过!")
+        sys.exit(0)
+    else:
+        print("\n❌ 测试失败!")
+        sys.exit(1)
tests/test_multiple_spider_modules.py
ADDED
@@ -0,0 +1,81 @@
+#!/usr/bin/env python3
+# -*- coding: UTF-8 -*-
+"""
+测试多个SPIDER_MODULES目录的支持
+"""
+import sys
+import os
+import asyncio
+
+# 添加项目根目录到Python路径
+sys.path.insert(0, os.path.dirname(__file__))
+
+# 添加ofweek_standalone到Python路径
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'examples', 'ofweek_standalone'))
+
+from crawlo.crawler import CrawlerProcess
+from crawlo.spider import get_spider_names
+
+
+def test_multiple_spider_modules():
+    """测试多个SPIDER_MODULES目录的支持"""
+    print("测试多个SPIDER_MODULES目录的支持...")
+
+    # 模拟包含多个目录的SPIDER_MODULES配置
+    spider_modules = ['ofweek_standalone.spiders', 'ofweek_standalone.new_spiders']
+
+    # 创建CrawlerProcess实例
+    process = CrawlerProcess(spider_modules=spider_modules)
+
+    # 检查是否注册了爬虫
+    spider_names = process.get_spider_names()
+    print(f"已注册的爬虫: {spider_names}")
+
+    # 验证期望的爬虫是否已注册
+    expected_spider = 'of_week_standalone'
+    if expected_spider in spider_names:
+        print(f"✅ 成功: 爬虫 '{expected_spider}' 已注册")
+        return True
+    else:
+        print(f"❌ 失败: 爬虫 '{expected_spider}' 未找到")
+        return False
+
+
+def test_settings_with_multiple_spider_modules():
+    """测试settings中配置多个SPIDER_MODULES目录"""
+    print("\n测试settings中配置多个SPIDER_MODULES目录...")
+
+    # 创建模拟的settings对象
+    class MockSettings:
+        def get(self, key, default=None):
+            if key == 'SPIDER_MODULES':
+                return ['ofweek_standalone.spiders', 'ofweek_standalone.new_spiders']
+            return default
+
+    settings = MockSettings()
+
+    # 创建CrawlerProcess实例
+    process = CrawlerProcess(settings=settings)
+
+    # 检查是否注册了爬虫
+    spider_names = process.get_spider_names()
+    print(f"已注册的爬虫: {spider_names}")
+
+    return True
+
+
+if __name__ == '__main__':
+    print("开始测试多个SPIDER_MODULES目录的支持...\n")
+
+    # 测试显式传递多个spider_modules参数
+    success1 = test_multiple_spider_modules()
+
+    # 测试从settings中读取多个spider_modules配置
+    success2 = test_settings_with_multiple_spider_modules()
+
+    if success1 and success2:
+        print("\n🎉 所有测试通过!")
+        sys.exit(0)
+    else:
+        print("\n❌ 部分测试失败!")
+        sys.exit(1)
tests/test_spider_modules.py
ADDED
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+# -*- coding: UTF-8 -*-
+"""
+测试SPIDER_MODULES配置的自动读取功能
+"""
+import sys
+import os
+import asyncio
+
+# 添加项目根目录到Python路径
+sys.path.insert(0, os.path.dirname(__file__))
+
+# 添加ofweek_standalone到Python路径
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'examples', 'ofweek_standalone'))
+
+from crawlo.crawler import CrawlerProcess
+from crawlo.spider import get_spider_names
+
+
+def test_spider_modules_auto_discovery():
+    """测试SPIDER_MODULES配置的自动读取功能"""
+    print("测试SPIDER_MODULES配置的自动读取功能...")
+
+    # 导入设置
+    import examples.ofweek_standalone.ofweek_standalone.settings as settings_module
+
+    # 创建设置管理器
+    from crawlo.settings.setting_manager import SettingManager
+    settings = SettingManager()
+    settings.set_settings(settings_module)
+
+    # 创建CrawlerProcess实例,不显式传递spider_modules
+    process = CrawlerProcess(settings=settings)
+
+    # 检查是否自动注册了爬虫
+    spider_names = process.get_spider_names()
+    print(f"已注册的爬虫: {spider_names}")
+
+    # 验证期望的爬虫是否已注册
+    expected_spider = 'of_week_standalone'
+    if expected_spider in spider_names:
+        print(f"✅ 成功: 爬虫 '{expected_spider}' 已自动注册")
+        return True
+    else:
+        print(f"❌ 失败: 爬虫 '{expected_spider}' 未找到")
+        return False
+
+
+def test_crawler_process_with_explicit_spider_modules():
+    """测试显式传递spider_modules参数的功能"""
+    print("\n测试显式传递spider_modules参数的功能...")
+
+    # 显式传递spider_modules参数
+    spider_modules = ['ofweek_standalone.spiders']
+    process = CrawlerProcess(spider_modules=spider_modules)
+
+    # 检查是否注册了爬虫
+    spider_names = process.get_spider_names()
+    print(f"已注册的爬虫: {spider_names}")
+
+    # 验证期望的爬虫是否已注册
+    expected_spider = 'of_week_standalone'
+    if expected_spider in spider_names:
+        print(f"✅ 成功: 爬虫 '{expected_spider}' 已注册")
+        return True
+    else:
+        print(f"❌ 失败: 爬虫 '{expected_spider}' 未找到")
+        return False
+
+
+if __name__ == '__main__':
+    print("开始测试SPIDER_MODULES配置功能...\n")
+
+    # 测试自动发现功能
+    success1 = test_spider_modules_auto_discovery()
+
+    # 测试显式传递参数功能
+    success2 = test_crawler_process_with_explicit_spider_modules()
+
+    if success1 and success2:
+        print("\n🎉 所有测试通过!")
+        sys.exit(0)
+    else:
+        print("\n❌ 部分测试失败!")
+        sys.exit(1)
examples/test_project/run.py
DELETED
@@ -1,35 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: UTF-8 -*-
-"""
-test_project 项目运行脚本
-============================
-基于 Crawlo 框架的简化爬虫启动器。
-
-框架会自动处理爬虫模块的导入和注册,用户无需手动导入。
-只需指定spider_modules参数,框架会自动扫描并导入所有爬虫。
-"""
-import sys
-import asyncio
-
-from crawlo.crawler import CrawlerProcess
-
-
-def main():
-    """主函数:运行爬虫"""
-    try:
-        # 指定爬虫模块路径,框架会自动导入并注册所有爬虫
-        spider_modules = ['test_project.spiders']
-        process = CrawlerProcess(spider_modules=spider_modules)
-
-        # TODO 运行指定的爬虫
-        asyncio.run(process.crawl('of_week_dis'))
-
-    except Exception as e:
-        print(f"❌ 运行失败: {e}")
-        import traceback
-        traceback.print_exc()
-        sys.exit(1)
-
-
-if __name__ == '__main__':
-    main()
examples/test_project/test_project/items.py
DELETED
@@ -1,18 +0,0 @@
-# -*- coding: UTF-8 -*-
-"""
-test_project.items
-======================
-定义你抓取的数据结构。
-"""
-
-from crawlo.items import Item, Field
-
-
-class ExampleItem(Item):
-    """
-    一个示例数据项。
-    """
-    id = Field()
-    # price = Field()
-    # description = Field()
-    pass