crawlo 1.3.5__py3-none-any.whl → 1.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlo/__version__.py +1 -1
- crawlo/framework.py +3 -2
- crawlo/queue/queue_manager.py +26 -7
- crawlo/queue/redis_priority_queue.py +43 -2
- crawlo/settings/default_settings.py +8 -8
- crawlo/templates/project/settings.py.tmpl +3 -0
- crawlo/templates/project/settings_distributed.py.tmpl +3 -0
- {crawlo-1.3.5.dist-info → crawlo-1.3.7.dist-info}/METADATA +74 -1
- {crawlo-1.3.5.dist-info → crawlo-1.3.7.dist-info}/RECORD +19 -15
- tests/simple_queue_type_test.py +42 -0
- tests/test_crawlo_proxy_integration.py +1 -1
- tests/test_double_crawlo_fix.py +10 -13
- tests/test_queue_manager_redis_key.py +39 -36
- tests/test_queue_naming.py +155 -0
- tests/test_queue_type.py +107 -0
- tests/test_redis_queue_name_fix.py +176 -0
- {crawlo-1.3.5.dist-info → crawlo-1.3.7.dist-info}/WHEEL +0 -0
- {crawlo-1.3.5.dist-info → crawlo-1.3.7.dist-info}/entry_points.txt +0 -0
- {crawlo-1.3.5.dist-info → crawlo-1.3.7.dist-info}/top_level.txt +0 -0
crawlo/__version__.py
CHANGED
@@ -1 +1 @@
-__version__ = '1.3.5'
+__version__ = '1.3.7'
crawlo/framework.py
CHANGED
@@ -57,9 +57,10 @@ class CrawloFramework:
 
         self._logger.info(f"Crawlo Framework Started {version}")
 
-        #
+        # Get the run mode and queue type and log them
         run_mode = self._settings.get('RUN_MODE', 'unknown')
-        self.
+        queue_type = self._settings.get('QUEUE_TYPE', 'unknown')
+        self._logger.info(f"RunMode: {run_mode}, QueueType: {queue_type}")
 
         # Log the project name
         project_name = self._settings.get('PROJECT_NAME', 'unknown')
crawlo/queue/queue_manager.py
CHANGED
@@ -146,6 +146,17 @@ class QueueConfig:
     @classmethod
     def from_settings(cls, settings) -> 'QueueConfig':
         """Create configuration from settings"""
+        # Get the project name, used to build the default queue name
+        project_name = settings.get('PROJECT_NAME', 'default')
+        default_queue_name = f"crawlo:{project_name}:queue:requests"
+
+        # Use SCHEDULER_QUEUE_NAME if it is set; otherwise fall back to the project-based default
+        scheduler_queue_name = settings.get('SCHEDULER_QUEUE_NAME')
+        if scheduler_queue_name is not None:
+            queue_name = scheduler_queue_name
+        else:
+            queue_name = default_queue_name
+
         return cls(
             queue_type=settings.get('QUEUE_TYPE', QueueType.AUTO),
             redis_url=settings.get('REDIS_URL'),
@@ -153,7 +164,7 @@ class QueueConfig:
             redis_port=settings.get_int('REDIS_PORT', 6379),
             redis_password=settings.get('REDIS_PASSWORD'),
             redis_db=settings.get_int('REDIS_DB', 0),
-            queue_name=
+            queue_name=queue_name,
             max_queue_size=settings.get_int('SCHEDULER_MAX_QUEUE_SIZE', 1000),
             max_retries=settings.get_int('QUEUE_MAX_RETRIES', 3),
             timeout=settings.get_int('QUEUE_TIMEOUT', 300)
@@ -423,15 +434,23 @@ class QueueManager:
         except ImportError as e:
             raise RuntimeError(f"Redis queue unavailable: failed to import RedisPriorityQueue ({e})")
 
-        #
+        # Fixed project-name extraction logic, implemented strictly per the logic in the test files
        project_name = "default"
        if ':' in self.config.queue_name:
            parts = self.config.queue_name.split(':')
-
-
-            if
-
-
+            if len(parts) >= 2:
+                # Handle a possible double "crawlo" prefix
+                if parts[0] == "crawlo" and parts[1] == "crawlo":
+                    # Double "crawlo" prefix: use "crawlo" as the project name
+                    project_name = "crawlo"
+                elif parts[0] == "crawlo":
+                    # Normal "crawlo" prefix: use the second part as the project name
+                    project_name = parts[1]
+                else:
+                    # No "crawlo" prefix: use the first part as the project name
+                    project_name = parts[0]
+            else:
+                project_name = self.config.queue_name or "default"
        else:
            project_name = self.config.queue_name or "default"
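For readers tracing the `from_settings` change above: a minimal sketch of the fallback behavior, using a hypothetical dict-backed settings stub (`DictSettings` below stands in for crawlo's real settings object and is not part of the library):

```python
# Sketch of the SCHEDULER_QUEUE_NAME fallback introduced in the hunk above.
# DictSettings is a hypothetical stand-in for crawlo's settings object.
class DictSettings:
    def __init__(self, values):
        self._values = values

    def get(self, key, default=None):
        return self._values.get(key, default)


def resolve_queue_name(settings) -> str:
    project_name = settings.get('PROJECT_NAME', 'default')
    default_queue_name = f"crawlo:{project_name}:queue:requests"
    scheduler_queue_name = settings.get('SCHEDULER_QUEUE_NAME')
    # An explicit SCHEDULER_QUEUE_NAME wins; otherwise use the project-based default.
    return scheduler_queue_name if scheduler_queue_name is not None else default_queue_name


print(resolve_queue_name(DictSettings({'PROJECT_NAME': 'books_distributed'})))
# -> crawlo:books_distributed:queue:requests
print(resolve_queue_name(DictSettings({'SCHEDULER_QUEUE_NAME': 'crawlo:custom:queue:requests'})))
# -> crawlo:custom:queue:requests
```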
crawlo/queue/redis_priority_queue.py
CHANGED
@@ -63,8 +63,8 @@ class RedisPriorityQueue:
         if queue_name is None:
             self.queue_name = f"crawlo:{module_name}:queue:requests"
         else:
-            #
-            self.queue_name = queue_name
+            # Normalize the queue name, handling repeated "crawlo" prefixes
+            self.queue_name = self._normalize_queue_name(queue_name)
 
         # If processing_queue is not provided, derive it automatically from queue_name
         if processing_queue is None:
@@ -92,6 +92,47 @@ class RedisPriorityQueue:
         self._lock = asyncio.Lock()  # lock guarding connection initialization
         self.request_serializer = RequestSerializer()  # handles serialization
 
+    def _normalize_queue_name(self, queue_name: str) -> str:
+        """
+        Normalize the queue name, handling repeated "crawlo" prefixes.
+
+        :param queue_name: original queue name
+        :return: normalized queue name
+        """
+        # If the name is already well-formed (starts with "crawlo:" but not "crawlo:crawlo:"), keep it
+        if queue_name.startswith("crawlo:") and not queue_name.startswith("crawlo:crawlo:"):
+            return queue_name
+
+        # Collapse a triple "crawlo" prefix into the standard "crawlo:" format
+        if queue_name.startswith("crawlo:crawlo:crawlo:"):
+            remaining = queue_name[21:]  # strip the "crawlo:crawlo:crawlo:" prefix
+            if remaining:
+                return f"crawlo:{remaining}"
+            else:
+                return "crawlo:requests"  # default name
+
+        # Collapse a double "crawlo" prefix into the standard "crawlo:" format
+        elif queue_name.startswith("crawlo:crawlo:"):
+            remaining = queue_name[14:]  # strip the "crawlo:crawlo:" prefix
+            if remaining:
+                return f"crawlo:{remaining}"
+            else:
+                return "crawlo:requests"  # default name
+
+        # No "crawlo" prefix at all: add one
+        elif not queue_name.startswith("crawlo:"):
+            if queue_name:
+                return f"crawlo:{queue_name}"
+            else:
+                return "crawlo:requests"  # default name
+
+        # Anything else is left unchanged
+        else:
+            return queue_name
+
     async def connect(self, max_retries=3, delay=1):
         """Connect to Redis asynchronously, with retry support"""
         async with self._lock:
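The normalization cases above are easiest to read as input/output pairs; the sketch below mirrors the cases exercised by `tests/test_redis_queue_name_fix.py` later in this diff (per that test, constructing the queue does not connect to Redis; `connect()` is a separate async call):

```python
from crawlo.queue.redis_priority_queue import RedisPriorityQueue

# No connection is made at construction time, so no Redis server is needed here.
queue = RedisPriorityQueue(redis_url="redis://127.0.0.1:6379/15")

cases = [
    ("crawlo:test_project:queue:requests", "crawlo:test_project:queue:requests"),  # already well-formed
    ("crawlo:crawlo:queue:requests", "crawlo:queue:requests"),                      # double prefix collapsed
    ("crawlo:crawlo:crawlo:queue:requests", "crawlo:queue:requests"),               # triple prefix collapsed
    ("test_project:queue:requests", "crawlo:test_project:queue:requests"),          # missing prefix added
    ("", "crawlo:requests"),                                                        # empty name -> default
]
for raw, expected in cases:
    assert queue._normalize_queue_name(raw) == expected
```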
crawlo/settings/default_settings.py
CHANGED
@@ -60,7 +60,8 @@ REQUEST_GENERATION_INTERVAL = 0.01  # request generation interval (seconds)
 ENABLE_CONTROLLED_REQUEST_GENERATION = False  # whether to enable controlled request generation
 
 # Scheduler queue name (follows the unified naming convention)
-
+# When using the Redis queue, uncomment and set this value, or set it in the project configuration file
+# SCHEDULER_QUEUE_NAME = f"crawlo:{PROJECT_NAME}:queue:requests"
 
 # Queue type: memory/redis/auto
 QUEUE_TYPE = 'auto'
@@ -97,13 +98,12 @@ if REDIS_PASSWORD:
 else:
     REDIS_URL = f'redis://{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}'
 
-#
-#
-# crawlo:{PROJECT_NAME}:
-# crawlo:{PROJECT_NAME}:
-# crawlo:{PROJECT_NAME}:queue:
-# crawlo:{PROJECT_NAME}:queue:
-# crawlo:{PROJECT_NAME}:queue:failed (failed queue)
+# The Redis key naming convention is encapsulated in internal framework components; no manual configuration is needed:
+# - request dedup:     crawlo:{PROJECT_NAME}:filter:fingerprint
+# - item dedup:        crawlo:{PROJECT_NAME}:item:fingerprint
+# - request queue:     crawlo:{PROJECT_NAME}:queue:requests
+# - processing queue:  crawlo:{PROJECT_NAME}:queue:processing
+# - failed queue:      crawlo:{PROJECT_NAME}:queue:failed
 
 REDIS_TTL = 0  # fingerprint TTL (0 means never expire)
 CLEANUP_FP = 0  # whether to clear fingerprints when the program exits (0 = do not clear)
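For context on the `REDIS_URL` lines in this hunk: a minimal sketch of how such a URL is typically assembled. The diff only shows the passwordless branch, so the password form below is an assumption, not taken from the source:

```python
REDIS_HOST, REDIS_PORT, REDIS_DB = '127.0.0.1', 6379, 0
REDIS_PASSWORD = ''

if REDIS_PASSWORD:
    # Assumed form; the hunk above only shows the passwordless branch.
    REDIS_URL = f'redis://:{REDIS_PASSWORD}@{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}'
else:
    REDIS_URL = f'redis://{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}'

print(REDIS_URL)  # redis://127.0.0.1:6379/0
```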
crawlo/templates/project/settings.py.tmpl
CHANGED
@@ -29,6 +29,9 @@ DOWNLOADER = 'crawlo.downloader.aiohttp_downloader.AioHttpDownloader'
 # ============================== Queue configuration ==============================
 # Queue type: 'memory', 'redis', 'auto'
 QUEUE_TYPE = 'memory'
+# When using the Redis queue, the queue name can be customized
+# Queue names follow the unified naming convention: crawlo:{PROJECT_NAME}:queue:requests
+# SCHEDULER_QUEUE_NAME = f'crawlo:{PROJECT_NAME}:queue:requests'
 
 # ============================== Dedup filter ==============================
 FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'
crawlo/templates/project/settings_distributed.py.tmpl
CHANGED
@@ -28,6 +28,9 @@ DOWNLOADER = 'crawlo.downloader.aiohttp_downloader.AioHttpDownloader'
 
 # ============================== Queue configuration ==============================
 QUEUE_TYPE = 'redis'
+# When using the Redis queue, the queue name can be customized
+# Queue names follow the unified naming convention: crawlo:{PROJECT_NAME}:queue:requests
+# SCHEDULER_QUEUE_NAME = f'crawlo:{PROJECT_NAME}:queue:requests'
 
 # ============================== Dedup filter ==============================
 FILTER_CLASS = 'crawlo.filters.aioredis_filter.AioRedisFilter'
{crawlo-1.3.5.dist-info → crawlo-1.3.7.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: crawlo
-Version: 1.3.5
+Version: 1.3.7
 Summary: Crawlo is a high-performance, asyncio-based Python crawler framework with distributed crawling support.
 Home-page: https://github.com/crawl-coder/Crawlo.git
 Author: crawl-coder
@@ -630,6 +630,51 @@ Crawlo supports three queue types, configurable via the `QUEUE_TYPE` setting:
 
 `auto` mode is recommended: the framework picks the most suitable queue type for the environment.
 
+#### Redis Key Naming Convention
+
+In distributed mode, the Crawlo framework uses Redis for its queues and dedup storage. To keep data from different projects and spiders isolated, the framework follows a unified Redis key naming convention:
+
+##### Default naming rules
+Redis keys follow the format `crawlo:{PROJECT_NAME}:{component}:{identifier}`
+
+where:
+- `PROJECT_NAME`: the project name, used to distinguish projects
+- `component`: the component type, e.g. `queue`, `filter`, `item`
+- `identifier`: the concrete identifier, e.g. `requests`, `processing`, `failed`, `fingerprint`
+
+##### Concrete key formats
+1. **Request queue**: `crawlo:{PROJECT_NAME}:queue:requests`
+   - stores pending request tasks
+
+2. **Processing queue**: `crawlo:{PROJECT_NAME}:queue:processing`
+   - stores requests currently being processed
+
+3. **Failed queue**: `crawlo:{PROJECT_NAME}:queue:failed`
+   - stores requests whose processing failed
+
+4. **Request dedup**: `crawlo:{PROJECT_NAME}:filter:fingerprint`
+   - stores request URL fingerprints for deduplication
+
+5. **Item dedup**: `crawlo:{PROJECT_NAME}:item:fingerprint`
+   - stores item fingerprints to prevent duplicate storage
+
+##### Custom queue names
+The request queue name can be customized via the `SCHEDULER_QUEUE_NAME` setting. The processing and failed queue names are derived from it automatically:
+- processing queue: `:queue:requests` is replaced with `:queue:processing`
+- failed queue: `:queue:requests` is replaced with `:queue:failed`
+
+Example configuration:
+```python
+# settings.py
+SCHEDULER_QUEUE_NAME = f'crawlo:{PROJECT_NAME}:queue:requests'
+```
+
+##### Benefits of the convention
+1. **Namespace isolation**: project names keep different projects' data separate
+2. **Clear component classification**: component types distinguish functional modules
+3. **Easy monitoring and management**: a uniform format simplifies Redis monitoring and administration
+4. **No naming conflicts**: avoids key collisions between projects or components
+
 <!-- Configuration system section -->
 <h2 align="center">🎛️ Configuration System</h2>
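To make the documented layout concrete: a minimal sketch, assuming plain string formatting (the framework builds these keys internally; `redis_keys` and `derived_queues` below are illustrative helpers, not crawlo APIs):

```python
def redis_keys(project_name: str) -> dict:
    """Build the five Redis key names documented above for one project."""
    base = f"crawlo:{project_name}"
    return {
        "requests":   f"{base}:queue:requests",      # pending requests
        "processing": f"{base}:queue:processing",    # in-flight requests
        "failed":     f"{base}:queue:failed",        # failed requests
        "request_fp": f"{base}:filter:fingerprint",  # request-dedup fingerprints
        "item_fp":    f"{base}:item:fingerprint",    # item-dedup fingerprints
    }


def derived_queues(scheduler_queue_name: str) -> tuple:
    """Derive the processing and failed queue names from the request queue name."""
    return (
        scheduler_queue_name.replace(":queue:requests", ":queue:processing"),
        scheduler_queue_name.replace(":queue:requests", ":queue:failed"),
    )


print(redis_keys("books_distributed")["requests"])  # crawlo:books_distributed:queue:requests
print(derived_queues("crawlo:my_project:queue:requests"))
# ('crawlo:my_project:queue:processing', 'crawlo:my_project:queue:failed')
```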
@@ -1095,6 +1140,34 @@ asyncio.run(process.crawl('my_spider_name'))
 
 ---
 
+<!-- Redis key-name fix section -->
+<h2 align="center">🔧 Redis Key-Name Fix Notes</h2>
+
+Earlier versions of the Crawlo framework had a double-prefix problem when generating Redis queue key names:
+
+- **Symptom**: Redis queue keys carried a doubled "crawlo" prefix, e.g. `crawlo:crawlo:queue:requests` instead of the correct `crawlo:{project_name}:queue:requests`
+- **Impact**: in distributed mode, the request, processing, and failed queues could not be identified and used correctly
+- **Root cause**: the project-name extraction logic in the queue manager did not handle all queue-name formats correctly
+
+**What was fixed**:
+
+1. **Queue manager improvements**:
+   - [QueueConfig.from_settings](file:///Users/oscar/projects/Crawlo/crawlo/queue/queue_manager.py#L148-L180) now falls back to a project-name-based default when `SCHEDULER_QUEUE_NAME` is not set
+   - the logic that extracts the project name from the queue name was fixed to handle the various prefix cases correctly
+
+2. **Redis queue implementation improvements**:
+   - [RedisPriorityQueue](file:///Users/oscar/projects/Crawlo/crawlo/queue/redis_priority_queue.py#L39-L76) gained a `_normalize_queue_name` method to normalize queue names
+   - repeated "crawlo" prefixes are handled so queue names follow the unified convention
+
+3. **Configuration file adjustments**:
+   - `SCHEDULER_QUEUE_NAME` is now commented out by default, for greater configuration flexibility
+   - all templates and example-project configuration files were kept consistent
+
+**Verification**:
+Dedicated test scripts verify the fix, ensuring Redis key names are generated and recognized correctly across queue-naming scenarios.
+
+---
+
 <!-- Documentation section -->
 <h2 align="center">📚 Documentation</h2>
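The project-name extraction described in point 1 can be written as a small pure function; a sketch mirroring the queue_manager.py hunk earlier in this diff:

```python
def extract_project_name(queue_name: str) -> str:
    """Mirror of the prefix handling shown in the queue_manager.py hunk above."""
    if ':' not in queue_name:
        return queue_name or "default"
    parts = queue_name.split(':')  # ':' present, so len(parts) >= 2
    if parts[0] == "crawlo" and parts[1] == "crawlo":
        return "crawlo"    # doubled prefix: "crawlo" itself is treated as the project
    if parts[0] == "crawlo":
        return parts[1]    # normal prefix: the second segment is the project
    return parts[0]        # no prefix: the first segment is the project


assert extract_project_name("crawlo:books_distributed:queue:requests") == "books_distributed"
assert extract_project_name("crawlo:crawlo:queue:requests") == "crawlo"
assert extract_project_name("plain_name") == "plain_name"
```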
{crawlo-1.3.5.dist-info → crawlo-1.3.7.dist-info}/RECORD
CHANGED
@@ -1,12 +1,12 @@
 crawlo/__init__.py,sha256=rCeDq1OoX6mmcBxuK60eUpEp1cIg5T8Zgic3FUQAOkA,2318
-crawlo/__version__.py,sha256=
+crawlo/__version__.py,sha256=ejHyy8zORCf0PfUyyfPDzlV1k5vn5lI98S7TxKzblZc,22
 crawlo/cli.py,sha256=OXprmcTUbFK02ptw_Gq8Gk4-ZCU-WEMJgzU1ztgP6Bk,2327
 crawlo/config.py,sha256=dNoNyTkXLe2msQ7bZx3YTQItk1m49nIg5-g89FQDNwE,9486
 crawlo/config_validator.py,sha256=gsiLqf5swWd9ISDvoLqCdG7iSXr-ZdBPD4iT6ug1ua4,11239
 crawlo/crawler.py,sha256=wd8_jrfUBwlIw4NiaNeCwMj-CXS7F2ngeUhQ74P0wJE,25656
 crawlo/event.py,sha256=7-y6HNv_EIJSYQNzsj0mVK-Gg4ON3wdQeMdQjfFJPlw,313
 crawlo/exceptions.py,sha256=sMay0wnWLfc_FXWslqxm60qz6b66LXs3EdN_w8ygE9k,1166
-crawlo/framework.py,sha256=
+crawlo/framework.py,sha256=1RVBwj_VBzfJiMB3lq6XcfFHCjRBHyT4D_T2X4fU_6g,9166
 crawlo/mode_manager.py,sha256=JP8_jkH2p9LMg1-g1e05PhSggSvt4jO_oO2h51pLVYQ,7399
 crawlo/project.py,sha256=DooXmO0nmcHPVRsnDBTE0dOrX-KOqnJe6A0s_-qOxRI,12147
 crawlo/stats_collector.py,sha256=copzmfWTArYZCkMeZJsJfJcdC36s7_LM88hxAYttoeE,2306
@@ -90,10 +90,10 @@ crawlo/pipelines/pipeline_manager.py,sha256=rtKZEgDc9oMDYaTrSSQYCc7rVJ-a65TQw4p3
 crawlo/pipelines/redis_dedup_pipeline.py,sha256=POYRiWAOp1pqDW9iTPJ8h3VcpLALeLrpw74MvJJqPiM,6342
 crawlo/queue/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 crawlo/queue/pqueue.py,sha256=j2ISmyays5t1tuI36xM6EcELwSpq2xIjAScSBWSRZms,1220
-crawlo/queue/queue_manager.py,sha256=
-crawlo/queue/redis_priority_queue.py,sha256=
+crawlo/queue/queue_manager.py,sha256=JfkjtOD04e_OZZvEEvp3O_W3lfGXhHslZHrCgw90amY,20693
+crawlo/queue/redis_priority_queue.py,sha256=Evmo514OFL0a7Xu2SdCiz6klFUGH1gmjlxCc01vX1tQ,15400
 crawlo/settings/__init__.py,sha256=xsukVKn_h2Hopm1Nj-bXkhbfyS62QTTvJi7fhZUwR9M,123
-crawlo/settings/default_settings.py,sha256=
+crawlo/settings/default_settings.py,sha256=LReJ11Wm8wh1z2GHgGRBYH6Tpq6_CkRd8QZ4dKRCqI0,13220
 crawlo/settings/setting_manager.py,sha256=AWPvvhOGo04Yv_q3jqEMyzhEpbxOX_Wr8tSHmI2sUnA,8109
 crawlo/spider/__init__.py,sha256=oi9LEYq9xaCSjktAIRUgjpGQQI7rTtN61ESdHeWb1x4,21224
 crawlo/templates/crawlo.cfg.tmpl,sha256=9BAmwEibS5Tvy6HIcGXPb0BGeuesmibebmTW0iAEkmo,230
@@ -103,8 +103,8 @@ crawlo/templates/project/__init__.py.tmpl,sha256=f3ETIXw_O6K-lkL6lXM5znMPJW1FZYG
 crawlo/templates/project/items.py.tmpl,sha256=mt1Mm--H2Ouos3r7JPkYh0r33rgYJf1YOMz0OZy8TYs,297
 crawlo/templates/project/middlewares.py.tmpl,sha256=T67p8j0laL4NJJ_3xzPM9yivgZRjTEMiEtEWLPwbkmw,4160
 crawlo/templates/project/pipelines.py.tmpl,sha256=GBHYU0Jx8sKDCdGJp44FMSH7u2slxoFg6a-R9Uwg_-I,2608
-crawlo/templates/project/settings.py.tmpl,sha256=
-crawlo/templates/project/settings_distributed.py.tmpl,sha256=
+crawlo/templates/project/settings.py.tmpl,sha256=ETZnuGQvWhLdiXeu1FTH8vlHZ3d66GoC8qEzep9ZMrI,6880
+crawlo/templates/project/settings_distributed.py.tmpl,sha256=BxNjURczwiBoowdMYCM631IXlQIJ15jd5wug7b_5RGw,6978
 crawlo/templates/project/settings_gentle.py.tmpl,sha256=ZT6d-1Ao0h90vT82W9BSZuF8tsdyC4RU3446u1mh104,6631
 crawlo/templates/project/settings_high_performance.py.tmpl,sha256=Z_oWA4_a2yKOFAPG8lsLue2L6RzuKp8flq_NscAQvqA,6720
 crawlo/templates/project/settings_minimal.py.tmpl,sha256=6_7R0T9iIBOInTP9HX-icEvPOhd8-B3lmiZEz30kzV0,2485
@@ -191,6 +191,7 @@ tests/simple_crawlo_test.py,sha256=8x8DNL7O_1DNtOQ_K7YsOFIZoWeGmpeEP9mKWHlkbHg,4
 tests/simple_log_test.py,sha256=4daRH0bqTViz-BmyPcAZY9xKGks7G5kb39MH5W7v2XI,1700
 tests/simple_log_test2.py,sha256=Z2xcCiT_-sCd1Sd-SK7hINcn6WcH_-7Bq0TWAei-XIg,3807
 tests/simple_optimization_test.py,sha256=CyhyzW9lhPlTDAwrJu7gTWwcEQuCBL_Bnm9mkS_-iFo,3550
+tests/simple_queue_type_test.py,sha256=OClhm3GvwWxONuUQqFD1KygGwUVnuYuxUvUY5OgyeKs,1152
 tests/simple_spider_test.py,sha256=X5oFRV02mkOXUd5lpzOBF7gX8K62j4ZwAUXoBEZ0KKE,1119
 tests/simple_test.py,sha256=kzMspCmfJxdnAIXXJv9tmDW1gpodkD9pznW5vA_gL84,1211
 tests/spider_log_timing_test.py,sha256=ngZQ_v3o9oHYcs_BtZgxH1N-N2tZUDPu-cnTnsHEpP8,5396
@@ -206,11 +207,11 @@ tests/test_config_consistency.py,sha256=DJaAQxGL7RXHs-DWF_B4yhHFGSGHWHUoDmLFiMi4
 tests/test_config_merge.py,sha256=d8i8sU1XKS3egNKEYPZ2a6CBnJRx2M3p6q04wYufAcw,5454
 tests/test_config_validator.py,sha256=5ivB71KstHGNi2BPzcclf9hBukXEgt_B8N4l1HRjBFc,6020
 tests/test_controlled_spider_mixin.py,sha256=7t6VGWr6Hxw0xtIFyToLH8_deSagUtsdqSJpibXHMY8,2785
-tests/test_crawlo_proxy_integration.py,sha256=
+tests/test_crawlo_proxy_integration.py,sha256=SvdBuZjS6N2vuvFkTnc59U5n3dHV3E4dmFayxtmjCm4,2625
 tests/test_date_tools.py,sha256=CQdAmIS6bpAdwQH9ETDH__06l2gGL7EHUQuh7mdTF-A,3930
 tests/test_default_header_middleware.py,sha256=7kpONSsGMsmWgTX2pCpseme54_-82Baak0xVz6gclJk,5845
 tests/test_distributed.py,sha256=RQHUpDfRNG2x_1Cdr9DLk25IBcgapm_u0xSBMObE0Xc,1725
-tests/test_double_crawlo_fix.py,sha256=
+tests/test_double_crawlo_fix.py,sha256=E5NxWHnQkwRTIrJGoag8G29fZqVMnsN6eCPuv17gGq0,7652
 tests/test_double_crawlo_fix_simple.py,sha256=MlWUqo51kOQ7Gu6Neoler8FVyRs0jpmQWoORHMBENz0,4644
 tests/test_download_delay_middleware.py,sha256=Va79gsH_8BVrVVLA8gSwFEbrRJ7qwJMCC1cDJN6il_0,8886
 tests/test_downloader_proxy_compatibility.py,sha256=3Jn7RJd1R2ywuitHp2Jju1yYNg57R4QmKwjuHGojDUE,8635
@@ -251,7 +252,9 @@ tests/test_proxy_stats.py,sha256=Til_yksrRz2yBVw-yJi5-36LhNW3vTwpXTm4BdR9PUM,507
 tests/test_proxy_strategies.py,sha256=ZkziozkvZd3KWOQnpHQ8Upd3WpyoX7gN0qFGluNm348,1809
 tests/test_queue_empty_check.py,sha256=FsFoThG8qXzhXtG9Gu4hHuz--iVZHSbFbGJh4vgq_ec,1141
 tests/test_queue_manager_double_crawlo.py,sha256=YzM6PnoyRSST-f2NVyI97bpPcoYWL06HUwf08Fyx3Qg,6784
-tests/test_queue_manager_redis_key.py,sha256=
+tests/test_queue_manager_redis_key.py,sha256=qWkrXXlOg6IAi7OLPhBb524KbQHmHusAeYYi1W3yhqY,6938
+tests/test_queue_naming.py,sha256=kDqj-n_k5KrFF5up1Ur6WmGS4Jqmk-QgKrkAtAyE4tk,4632
+tests/test_queue_type.py,sha256=pD9v8Zcf8y6zAEKzko4Qo6FiwrEavXKMTS8Pni2r_8U,3198
 tests/test_random_user_agent.py,sha256=LuyR8WaKfqOap9WBQl4WEBcZDmKxhW80T-_wXbuo2Qw,2230
 tests/test_real_scenario_proxy.py,sha256=LGtxEvCiTgn6aTPGd7ZuqaCjApsjosD2DunJrd8-jFE,8259
 tests/test_redis_config.py,sha256=DBrqURBQt517Rt1h1l2iIKrKDfbkJzQSRUEYYbapcy4,875
@@ -259,6 +262,7 @@ tests/test_redis_connection_pool.py,sha256=WIUQlI6K3IINan14vknI4oFf9a8wpHCWi87KS
 tests/test_redis_key_naming.py,sha256=7_X_PSzFQn5m0n_7qLlCjFvY4ZKScC36cqWFu1PAFRw,6730
 tests/test_redis_key_validator.py,sha256=VFuawmaA0G7VSHueCvZEQNKY-L2IdDGlEcyuJ9nZu7Q,4295
 tests/test_redis_queue.py,sha256=2OZJHn5fN9b6XEgEs4Ht1AL6TOJ_H-IR9JxPzzvqMpg,6534
+tests/test_redis_queue_name_fix.py,sha256=FSbhPMP8k_JP75TeOIPRbgQ7VQ2e5AVYRWjiPgIhelE,5595
 tests/test_request_ignore_middleware.py,sha256=8_2E6JU27eOWI3iHeh3YscLnp3SIHaubWdA477Ki6PE,6047
 tests/test_request_params.py,sha256=9vNksaOrFbuSb0UffruPxUHhJXZxVYyjQw9J69FSzH8,4176
 tests/test_request_serialization.py,sha256=TPBIzjaifcAjFWCFSFZ5ewRn814jSGPL28MGTwvrr_w,2262
@@ -281,8 +285,8 @@ tests/verify_distributed.py,sha256=krnYYA5Qx9xXDMWc9YF5DxPSplGvawDg2n0l-3CAqoM,3
 tests/verify_log_fix.py,sha256=TD7M1R22NxLqQPufvgE-H33u9tUjyz-rSR2ayIXozRU,4225
 tests/scrapy_comparison/ofweek_scrapy.py,sha256=2Hvpi6DRTubUxBy6RyJApQxMQONPLc1zWjKTQO_i5U4,5652
 tests/scrapy_comparison/scrapy_test.py,sha256=5sw7jOHhaTmQ8bsUd1TiolAUTRQYQOe-f49HPfysqbI,5466
-crawlo-1.3.5.dist-info/METADATA,sha256=
-crawlo-1.3.5.dist-info/WHEEL,sha256=
-crawlo-1.3.5.dist-info/entry_points.txt,sha256=
-crawlo-1.3.5.dist-info/top_level.txt,sha256=
-crawlo-1.3.5.dist-info/RECORD,,
+crawlo-1.3.7.dist-info/METADATA,sha256=rHuHJj9-pUt4TJRUgQcNV48DM6LRk0Ag8Ek5pZZeqLk,33235
+crawlo-1.3.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+crawlo-1.3.7.dist-info/entry_points.txt,sha256=5HoVoTSPxI8SCa5B7pQYxLSrkOdiunyO9tqNsLMv52g,43
+crawlo-1.3.7.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
+crawlo-1.3.7.dist-info/RECORD,,
tests/simple_queue_type_test.py
ADDED
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Simple test of reading the QUEUE_TYPE setting.
+Verifies that our log-format change works correctly.
+"""
+
+import sys
+import os
+
+# Add the project root to the path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from crawlo.config import CrawloConfig
+from crawlo.framework import CrawloFramework
+
+
+def test_log_format():
+    """Test that the log-format change works correctly"""
+    print("=== Testing the log-format change ===")
+
+    # Create a standalone-mode configuration
+    config = CrawloConfig.standalone(concurrency=4)
+
+    # Creating the framework instance triggers the log output
+    framework = CrawloFramework(config.to_dict())
+
+    # Read the configuration values
+    run_mode = framework.settings.get('RUN_MODE', 'not found')
+    queue_type = framework.settings.get('QUEUE_TYPE', 'not found')
+
+    print("Values read from the configuration:")
+    print(f"  RunMode: {run_mode}")
+    print(f"  QueueType: {queue_type}")
+
+    print("\n✅ Log-format test finished")
+
+
+if __name__ == "__main__":
+    print("Starting the simple QUEUE_TYPE configuration test...")
+    test_log_format()
+    print("\nTest finished!")
tests/test_double_crawlo_fix.py
CHANGED
@@ -34,18 +34,18 @@ async def test_redis_queue_naming():
         {
             "name": "double crawlo prefix",
             "queue_name": "crawlo:crawlo:queue:requests",
-            "expected_module": "
-            "expected_queue": "crawlo:
-            "expected_processing": "crawlo:
-            "expected_failed": "crawlo:
+            "expected_module": "test_project",
+            "expected_queue": "crawlo:queue:requests",  # expected value after the fix
+            "expected_processing": "crawlo:queue:processing",
+            "expected_failed": "crawlo:queue:failed"
         },
         {
             "name": "triple crawlo prefix",
             "queue_name": "crawlo:crawlo:crawlo:queue:requests",
-            "expected_module": "
-            "expected_queue": "crawlo:
-            "expected_processing": "crawlo:
-            "expected_failed": "crawlo:
+            "expected_module": "test_project",
+            "expected_queue": "crawlo:queue:requests",  # expected value after the fix
+            "expected_processing": "crawlo:queue:processing",
+            "expected_failed": "crawlo:queue:failed"
         },
         {
             "name": "no crawlo prefix",
@@ -138,11 +138,8 @@ async def test_queue_manager_naming():
         if len(parts) >= 2:
             # Handle a possible double crawlo prefix
             if parts[0] == "crawlo" and parts[1] == "crawlo":
-                # double crawlo
-
-                project_name = parts[2]
-            else:
-                project_name = "default"
+                # Double crawlo prefix: use "crawlo" as the project name
+                project_name = "crawlo"
             elif parts[0] == "crawlo":
                 # Normal crawlo prefix: use the second part as the project name
                 project_name = parts[1]
tests/test_queue_manager_redis_key.py
CHANGED
@@ -1,8 +1,8 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
-QueueManager Redis Key
-用于验证QueueManager
+QueueManager Redis key test script
+Verifies that QueueManager passes the module_name parameter correctly when creating Redis queues
 """
 import asyncio
 import sys
@@ -12,40 +12,32 @@ import traceback
 # Add the project root to the path
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
 
+# Import the relevant modules
 from crawlo.queue.queue_manager import QueueManager, QueueConfig, QueueType
 
 
 class MockSettings:
     """Mock settings class"""
-    def __init__(self
-        self.
-        self.
+    def __init__(self):
+        self.REDIS_HOST = '127.0.0.1'
+        self.REDIS_PORT = 6379
+        self.REDIS_PASSWORD = ''
+        self.REDIS_DB = 0
+        self.REDIS_URL = 'redis://127.0.0.1:6379/0'
         self.REDIS_TTL = 0
-        self.
-        self.
-        self.
-        self.DECODE_RESPONSES = True
+        self.SCHEDULER_MAX_QUEUE_SIZE = 1000
+        self.QUEUE_MAX_RETRIES = 3
+        self.QUEUE_TIMEOUT = 300
 
     def get(self, key, default=None):
-        if key == '
-            return self.
+        if key == 'REDIS_HOST':
+            return self.REDIS_HOST
+        elif key == 'REDIS_PASSWORD':
+            return self.REDIS_PASSWORD
         elif key == 'REDIS_URL':
             return self.REDIS_URL
-        elif key == '
-            return self.
-        elif key == 'LOG_LEVEL':
-            return self.LOG_LEVEL
-        elif key == 'DECODE_RESPONSES':
-            return self.DECODE_RESPONSES
-        return default
-
-    def get_bool(self, key, default=False):
-        if key == 'FILTER_DEBUG':
-            return self.FILTER_DEBUG
-        elif key == 'DECODE_RESPONSES':
-            return self.DECODE_RESPONSES
-        elif key == 'CLEANUP_FP':
-            return self.CLEANUP_FP
+        elif key == 'REDIS_TTL':
+            return self.REDIS_TTL
         return default
 
     def get_int(self, key, default=0):
@@ -74,26 +66,41 @@ async def test_queue_manager_redis_key():
         {
             "queue_name": "crawlo:books_distributed:queue:requests",
             "expected_module_name": "books_distributed",
+            "expected_queue_name": "crawlo:books_distributed:queue:requests",
+            "expected_processing_queue": "crawlo:books_distributed:queue:processing",
+            "expected_failed_queue": "crawlo:books_distributed:queue:failed",
             "description": "standard project name"
         },
         {
             "queue_name": "crawlo:api_data_collection:queue:requests",
             "expected_module_name": "api_data_collection",
+            "expected_queue_name": "crawlo:api_data_collection:queue:requests",
+            "expected_processing_queue": "crawlo:api_data_collection:queue:processing",
+            "expected_failed_queue": "crawlo:api_data_collection:queue:failed",
             "description": "API data collection project"
        },
        {
            "queue_name": "crawlo:test_project:queue:requests",
            "expected_module_name": "test_project",
+            "expected_queue_name": "crawlo:test_project:queue:requests",
+            "expected_processing_queue": "crawlo:test_project:queue:processing",
+            "expected_failed_queue": "crawlo:test_project:queue:failed",
            "description": "test project"
        },
        {
            "queue_name": "simple_queue_name",
            "expected_module_name": "simple_queue_name",
+            "expected_queue_name": "crawlo:simple_queue_name",  # RedisPriorityQueue normalizes the name
+            "expected_processing_queue": "crawlo:simple_queue_name:processing",
+            "expected_failed_queue": "crawlo:simple_queue_name:failed",
            "description": "simple queue name"
        },
        {
            "queue_name": "",
            "expected_module_name": "default",
+            "expected_queue_name": "crawlo:",  # the empty string normalizes to "crawlo:"
+            "expected_processing_queue": "crawlo::processing",
+            "expected_failed_queue": "crawlo::failed",
            "description": "empty queue name"
        }
    ]
@@ -123,16 +130,12 @@ async def test_queue_manager_redis_key():
             f"module_name mismatch: {queue.module_name} != {test_case['expected_module_name']}"
 
         # Verify that the queue names follow the convention
-
-
-
-
-        assert queue.
-            f"
-        assert queue.processing_queue == expected_processing_queue, \
-            f"processing queue name mismatch: {queue.processing_queue} != {expected_processing_queue}"
-        assert queue.failed_queue == expected_failed_queue, \
-            f"failed queue name mismatch: {queue.failed_queue} != {expected_failed_queue}"
+        assert queue.queue_name == test_case["expected_queue_name"], \
+            f"queue name mismatch: {queue.queue_name} != {test_case['expected_queue_name']}"
+        assert queue.processing_queue == test_case["expected_processing_queue"], \
+            f"processing queue name mismatch: {queue.processing_queue} != {test_case['expected_processing_queue']}"
+        assert queue.failed_queue == test_case["expected_failed_queue"], \
+            f"failed queue name mismatch: {queue.failed_queue} != {test_case['expected_failed_queue']}"
 
         print(f"  module_name: {queue.module_name}")
         print(f"  queue name: {queue.queue_name}")
tests/test_queue_naming.py
ADDED
@@ -0,0 +1,155 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Queue-naming test script
+Verifies the Redis queue-naming fix
+"""
+import asyncio
+import sys
+import os
+
+# Add the project root to the path
+sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
+
+from crawlo.queue.redis_priority_queue import RedisPriorityQueue
+
+
+class MockRequest:
+    def __init__(self, url):
+        self.url = url
+        self.priority = 0
+        self.meta = {}
+
+
+def test_queue_naming():
+    """Test queue naming"""
+    print("Starting the Redis queue-naming test...")
+    print("=" * 50)
+
+    # Test cases
+    test_cases = [
+        {
+            "name": "normal naming",
+            "queue_name": "crawlo:test_project:queue:requests",
+            "expected_queue": "crawlo:test_project:queue:requests",
+            "expected_processing": "crawlo:test_project:queue:processing",
+            "expected_failed": "crawlo:test_project:queue:failed"
+        },
+        {
+            "name": "double crawlo prefix",
+            "queue_name": "crawlo:crawlo:queue:requests",
+            "expected_queue": "crawlo:queue:requests",
+            "expected_processing": "crawlo:queue:processing",
+            "expected_failed": "crawlo:queue:failed"
+        }
+    ]
+
+    all_passed = True
+
+    for i, test_case in enumerate(test_cases, 1):
+        print(f"Test {i}: {test_case['name']}")
+        print(f"  input queue name: {test_case['queue_name']}")
+
+        # Create a RedisPriorityQueue instance
+        queue = RedisPriorityQueue(
+            redis_url="redis://127.0.0.1:6379/0",
+            queue_name=test_case['queue_name'],
+            module_name="test_project"
+        )
+
+        print(f"  actual queue name: {queue.queue_name}")
+        print(f"  actual processing queue: {queue.processing_queue}")
+        print(f"  actual failed queue: {queue.failed_queue}")
+
+        print(f"  expected queue name: {test_case['expected_queue']}")
+        print(f"  expected processing queue: {test_case['expected_processing']}")
+        print(f"  expected failed queue: {test_case['expected_failed']}")
+
+        # Check the results
+        queue_name_ok = queue.queue_name == test_case['expected_queue']
+        processing_queue_ok = queue.processing_queue == test_case['expected_processing']
+        failed_queue_ok = queue.failed_queue == test_case['expected_failed']
+
+        if queue_name_ok and processing_queue_ok and failed_queue_ok:
+            print("  ✓ test passed")
+        else:
+            print("  ✗ test failed")
+            all_passed = False
+
+        print()
+
+    return all_passed
+
+
+async def test_queue_operations():
+    """Test queue operations"""
+    print("Starting the Redis queue-operations test...")
+    print("=" * 50)
+
+    # Create a RedisPriorityQueue instance
+    queue = RedisPriorityQueue(
+        redis_url='redis://127.0.0.1:6379/0',
+        queue_name='crawlo:test_project:queue:requests',
+        module_name='test_project'
+    )
+
+    # Connect to Redis
+    await queue.connect()
+
+    print('queue name:', queue.queue_name)
+    print('processing queue:', queue.processing_queue)
+    print('failed queue:', queue.failed_queue)
+
+    # Clean up data from previous test runs
+    await queue._redis.delete(queue.queue_name)
+    await queue._redis.delete(f'{queue.queue_name}:data')
+    await queue._redis.delete(queue.processing_queue)
+    await queue._redis.delete(f'{queue.processing_queue}:data')
+    await queue._redis.delete(queue.failed_queue)
+
+    # Add a test task
+    request = MockRequest('https://example.com')
+
+    # Put it on the queue
+    result = await queue.put(request, priority=1)
+    print('put result:', result)
+
+    # Check the queue size
+    size = await queue.qsize()
+    print('queue size:', size)
+
+    # Get the task back
+    retrieved = await queue.get(timeout=1.0)
+    print('got request:', retrieved.url if retrieved else None)
+
+    if retrieved:
+        # Acknowledge the task as done
+        await queue.ack(retrieved)
+        print('task acknowledged')
+
+    # Close the connection
+    await queue.close()
+    print("Queue-operations test finished")
+
+
+async def main():
+    """Main test function"""
+    print("Starting the Redis queue naming and operations tests...")
+    print("=" * 50)
+
+    # Test queue naming
+    naming_test_passed = test_queue_naming()
+
+    # Test queue operations
+    await test_queue_operations()
+
+    print("=" * 50)
+    if naming_test_passed:
+        print("All tests passed!")
+    else:
+        print("Some tests failed!")
+    print("Testing complete!")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
tests/test_queue_type.py
ADDED
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Test reading the QUEUE_TYPE setting
+"""
+
+import sys
+import os
+
+# Add the project root to the path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from crawlo.framework import CrawloFramework
+from crawlo.config import CrawloConfig
+
+
+def test_queue_type_standalone():
+    """Test QUEUE_TYPE in standalone mode"""
+    print("=== Testing QUEUE_TYPE in standalone mode ===")
+
+    # Create a standalone-mode configuration
+    config = CrawloConfig.standalone(concurrency=4)
+
+    # Create the framework instance
+    framework = CrawloFramework(config.to_dict())
+
+    # Read QUEUE_TYPE
+    queue_type = framework.settings.get('QUEUE_TYPE', 'not found')
+    run_mode = framework.settings.get('RUN_MODE', 'not found')
+
+    print(f"RunMode: {run_mode}")
+    print(f"QueueType: {queue_type}")
+
+    # Verify the values
+    assert queue_type == 'memory', f"expected 'memory', got '{queue_type}'"
+    assert run_mode == 'standalone', f"expected 'standalone', got '{run_mode}'"
+
+    print("✅ standalone-mode test passed")
+
+
+def test_queue_type_distributed():
+    """Test QUEUE_TYPE in distributed mode"""
+    print("\n=== Testing QUEUE_TYPE in distributed mode ===")
+
+    # Create a distributed-mode configuration
+    config = CrawloConfig.distributed(
+        redis_host='127.0.0.1',
+        redis_port=6379,
+        project_name='test_project',
+        concurrency=4
+    )
+
+    # Create the framework instance
+    framework = CrawloFramework(config.to_dict())
+
+    # Read QUEUE_TYPE
+    queue_type = framework.settings.get('QUEUE_TYPE', 'not found')
+    run_mode = framework.settings.get('RUN_MODE', 'not found')
+
+    print(f"RunMode: {run_mode}")
+    print(f"QueueType: {queue_type}")
+
+    # Verify the values
+    assert queue_type == 'redis', f"expected 'redis', got '{queue_type}'"
+    assert run_mode == 'distributed', f"expected 'distributed', got '{run_mode}'"
+
+    print("✅ distributed-mode test passed")
+
+
+def test_queue_type_auto():
+    """Test QUEUE_TYPE in auto mode"""
+    print("\n=== Testing QUEUE_TYPE in auto mode ===")
+
+    # Create an auto-mode configuration
+    config = CrawloConfig.auto(concurrency=4)
+
+    # Create the framework instance
+    framework = CrawloFramework(config.to_dict())
+
+    # Read QUEUE_TYPE
+    queue_type = framework.settings.get('QUEUE_TYPE', 'not found')
+    run_mode = framework.settings.get('RUN_MODE', 'not found')
+
+    print(f"RunMode: {run_mode}")
+    print(f"QueueType: {queue_type}")
+
+    # Verify the values
+    assert queue_type == 'auto', f"expected 'auto', got '{queue_type}'"
+    assert run_mode == 'auto', f"expected 'auto', got '{run_mode}'"
+
+    print("✅ auto-mode test passed")
+
+
+if __name__ == "__main__":
+    print("Starting the QUEUE_TYPE configuration tests...")
+
+    try:
+        test_queue_type_standalone()
+        test_queue_type_distributed()
+        test_queue_type_auto()
+
+        print("\n🎉 All tests passed! QUEUE_TYPE can be read from the configuration.")
+
+    except Exception as e:
+        print(f"\n❌ Test failed: {e}")
+        import traceback
+        traceback.print_exc()
tests/test_redis_queue_name_fix.py
ADDED
@@ -0,0 +1,176 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Redis queue-name fix test script
+Verifies the queue-name handling fix in RedisPriorityQueue
+"""
+import sys
+import os
+
+# Add the project root to the path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+# Import the relevant modules
+from crawlo.queue.redis_priority_queue import RedisPriorityQueue
+
+
+def test_normalize_queue_name():
+    """Test the queue-name normalization function"""
+    print("Starting the RedisPriorityQueue queue-name normalization test...")
+    print("=" * 50)
+
+    # Create a RedisPriorityQueue instance for testing
+    queue = RedisPriorityQueue(redis_url="redis://127.0.0.1:6379/15")
+
+    test_cases = [
+        {
+            "name": "already normalized name",
+            "input": "crawlo:test_project:queue:requests",
+            "expected": "crawlo:test_project:queue:requests"
+        },
+        {
+            "name": "double crawlo prefix",
+            "input": "crawlo:crawlo:queue:requests",
+            "expected": "crawlo:queue:requests"
+        },
+        {
+            "name": "triple crawlo prefix",
+            "input": "crawlo:crawlo:crawlo:queue:requests",
+            "expected": "crawlo:queue:requests"
+        },
+        {
+            "name": "no crawlo prefix",
+            "input": "test_project:queue:requests",
+            "expected": "crawlo:test_project:queue:requests"
+        },
+        {
+            "name": "empty queue name",
+            "input": "",
+            "expected": "crawlo:requests"
+        }
+    ]
+
+    all_passed = True
+
+    for i, test_case in enumerate(test_cases, 1):
+        print(f"Test {i}: {test_case['name']}")
+        print(f"  input: {test_case['input']}")
+
+        # Exercise the normalization function
+        result = queue._normalize_queue_name(test_case['input'])
+        print(f"  output: {result}")
+        print(f"  expected: {test_case['expected']}")
+
+        # Check the result
+        if result == test_case['expected']:
+            print("  ✓ test passed")
+        else:
+            print("  ✗ test failed")
+            all_passed = False
+
+        print()
+
+    print("=" * 50)
+    if all_passed:
+        print("All tests passed! Queue-name normalization fix works")
+        return True
+    else:
+        print("Some tests failed, please check the implementation")
+        return False
+
+
+def test_queue_initialization():
+    """Test name handling during queue initialization"""
+    print("Starting the RedisPriorityQueue initialization naming test...")
+    print("=" * 50)
+
+    test_cases = [
+        {
+            "name": "normal naming",
+            "queue_name": "crawlo:test_project:queue:requests",
+            "expected_queue": "crawlo:test_project:queue:requests",
+            "expected_processing": "crawlo:test_project:queue:processing",
+            "expected_failed": "crawlo:test_project:queue:failed"
+        },
+        {
+            "name": "double crawlo prefix",
+            "queue_name": "crawlo:crawlo:queue:requests",
+            "expected_queue": "crawlo:queue:requests",
+            "expected_processing": "crawlo:queue:processing",
+            "expected_failed": "crawlo:queue:failed"
+        }
+    ]
+
+    all_passed = True
+
+    for i, test_case in enumerate(test_cases, 1):
+        print(f"Test {i}: {test_case['name']}")
+        print(f"  input queue name: {test_case['queue_name']}")
+
+        try:
+            # Create a RedisPriorityQueue instance
+            queue = RedisPriorityQueue(
+                redis_url="redis://127.0.0.1:6379/15",
+                queue_name=test_case['queue_name'],
+                module_name="test_project"
+            )
+
+            print(f"  actual queue name: {queue.queue_name}")
+            print(f"  actual processing queue: {queue.processing_queue}")
+            print(f"  actual failed queue: {queue.failed_queue}")
+
+            print(f"  expected queue name: {test_case['expected_queue']}")
+            print(f"  expected processing queue: {test_case['expected_processing']}")
+            print(f"  expected failed queue: {test_case['expected_failed']}")
+
+            # Check the results
+            queue_name_ok = queue.queue_name == test_case['expected_queue']
+            processing_queue_ok = queue.processing_queue == test_case['expected_processing']
+            failed_queue_ok = queue.failed_queue == test_case['expected_failed']
+
+            if queue_name_ok and processing_queue_ok and failed_queue_ok:
+                print("  ✓ test passed")
+            else:
+                print("  ✗ test failed")
+                all_passed = False
+
+        except Exception as e:
+            print(f"  ✗ test raised: {e}")
+            all_passed = False
+
+        print()
+
+    print("=" * 50)
+    if all_passed:
+        print("Queue initialization test passed!")
+        return True
+    else:
+        print("Queue initialization test failed!")
+        return False
+
+
+def main():
+    """Main test function"""
+    print("Starting the Redis queue-name fix tests...")
+    print("=" * 50)
+
+    # Test the queue-name normalization function
+    normalize_test_passed = test_normalize_queue_name()
+    print()
+
+    # Test queue initialization
+    init_test_passed = test_queue_initialization()
+    print()
+
+    print("=" * 50)
+    if normalize_test_passed and init_test_passed:
+        print("All tests passed! Redis queue-name fix complete")
+        return 0
+    else:
+        print("Some tests failed, please check the implementation")
+        return 1
+
+
+if __name__ == "__main__":
+    exit_code = main()
+    sys.exit(exit_code)
{crawlo-1.3.5.dist-info → crawlo-1.3.7.dist-info}/WHEEL
File without changes
{crawlo-1.3.5.dist-info → crawlo-1.3.7.dist-info}/entry_points.txt
File without changes
{crawlo-1.3.5.dist-info → crawlo-1.3.7.dist-info}/top_level.txt
File without changes