crawlo-1.3.5-py3-none-any.whl → crawlo-1.3.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.

crawlo/__version__.py CHANGED
@@ -1 +1 @@
- __version__ = '1.3.5'
+ __version__ = '1.3.7'
crawlo/framework.py CHANGED
@@ -57,9 +57,10 @@ class CrawloFramework:
 
          self._logger.info(f"Crawlo Framework Started {version}")
 
-         # Get the run mode and log it
+         # Get the run mode and queue type and log them
          run_mode = self._settings.get('RUN_MODE', 'unknown')
-         self._logger.info(f"Run mode: {run_mode}")
+         queue_type = self._settings.get('QUEUE_TYPE', 'unknown')
+         self._logger.info(f"RunMode: {run_mode}, QueueType: {queue_type}")
 
          # Log the project name
          project_name = self._settings.get('PROJECT_NAME', 'unknown')
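A sketch of the new log line's shape, with illustrative values taken from the standalone-mode assertions in the tests added later in this diff (the logger's own prefix is omitted):

```python
# Illustrative: how the new startup log line is assembled.
run_mode, queue_type = 'standalone', 'memory'  # example values
log_line = f"RunMode: {run_mode}, QueueType: {queue_type}"
assert log_line == "RunMode: standalone, QueueType: memory"
```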
crawlo/queue/queue_manager.py CHANGED
@@ -146,6 +146,17 @@ class QueueConfig:
      @classmethod
      def from_settings(cls, settings) -> 'QueueConfig':
          """Create configuration from settings"""
+         # Get the project name, used to build the default queue name
+         project_name = settings.get('PROJECT_NAME', 'default')
+         default_queue_name = f"crawlo:{project_name}:queue:requests"
+
+         # If SCHEDULER_QUEUE_NAME is set, use it; otherwise use the project-name-based default
+         scheduler_queue_name = settings.get('SCHEDULER_QUEUE_NAME')
+         if scheduler_queue_name is not None:
+             queue_name = scheduler_queue_name
+         else:
+             queue_name = default_queue_name
+
          return cls(
              queue_type=settings.get('QUEUE_TYPE', QueueType.AUTO),
              redis_url=settings.get('REDIS_URL'),
@@ -153,7 +164,7 @@ class QueueConfig:
              redis_port=settings.get_int('REDIS_PORT', 6379),
              redis_password=settings.get('REDIS_PASSWORD'),
              redis_db=settings.get_int('REDIS_DB', 0),
-             queue_name=settings.get('SCHEDULER_QUEUE_NAME', 'crawlo:requests'),
+             queue_name=queue_name,
              max_queue_size=settings.get_int('SCHEDULER_MAX_QUEUE_SIZE', 1000),
              max_retries=settings.get_int('QUEUE_MAX_RETRIES', 3),
              timeout=settings.get_int('QUEUE_TIMEOUT', 300)
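The net effect of this hunk: an explicit `SCHEDULER_QUEUE_NAME` wins, otherwise the queue name is derived from `PROJECT_NAME`. A minimal sketch of just that fallback, assuming a plain dict stands in for the framework's settings manager (hypothetical helper name):

```python
def resolve_queue_name(settings: dict) -> str:
    """Sketch of the queue-name fallback added to QueueConfig.from_settings."""
    project_name = settings.get('PROJECT_NAME', 'default')
    default_queue_name = f"crawlo:{project_name}:queue:requests"
    scheduler_queue_name = settings.get('SCHEDULER_QUEUE_NAME')
    # An explicitly configured SCHEDULER_QUEUE_NAME always takes precedence.
    return scheduler_queue_name if scheduler_queue_name is not None else default_queue_name

assert resolve_queue_name({'PROJECT_NAME': 'books'}) == "crawlo:books:queue:requests"
assert resolve_queue_name({'SCHEDULER_QUEUE_NAME': 'x'}) == "x"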
@@ -423,15 +434,23 @@ class QueueManager:
          except ImportError as e:
              raise RuntimeError(f"Redis queue unavailable: failed to import RedisPriorityQueue ({e})")
 
-         # Simplified project-name extraction logic
+         # Fixed project-name extraction logic, strictly following the logic in the test file
          project_name = "default"
          if ':' in self.config.queue_name:
              parts = self.config.queue_name.split(':')
-             # Skip any "crawlo" prefix parts and take the first non-"crawlo" part as the project name
-             for part in parts:
-                 if part != "crawlo":
-                     project_name = part
-                     break
+             if len(parts) >= 2:
+                 # Handle a possible double "crawlo" prefix
+                 if parts[0] == "crawlo" and parts[1] == "crawlo":
+                     # Double "crawlo" prefix: use "crawlo" as the project name
+                     project_name = "crawlo"
+                 elif parts[0] == "crawlo":
+                     # Normal "crawlo" prefix: use the second part as the project name
+                     project_name = parts[1]
+                 else:
+                     # No "crawlo" prefix: use the first part as the project name
+                     project_name = parts[0]
+             else:
+                 project_name = self.config.queue_name or "default"
          else:
              project_name = self.config.queue_name or "default"
 
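Pulled out as a standalone function, the new branch logic behaves as sketched below (illustrative only; the real code lives inline inside QueueManager):

```python
def extract_project_name(queue_name: str) -> str:
    """Sketch of QueueManager's project-name extraction after the fix."""
    if ':' not in queue_name:
        return queue_name or "default"
    parts = queue_name.split(':')
    if len(parts) < 2:
        return queue_name or "default"
    if parts[0] == "crawlo" and parts[1] == "crawlo":
        return "crawlo"      # double prefix: "crawlo" itself is the project name
    if parts[0] == "crawlo":
        return parts[1]      # normal prefix: the second segment is the project
    return parts[0]          # no prefix: the first segment is the project

assert extract_project_name("crawlo:books_distributed:queue:requests") == "books_distributed"
assert extract_project_name("crawlo:crawlo:queue:requests") == "crawlo"
assert extract_project_name("simple_queue_name") == "simple_queue_name"
```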
crawlo/queue/redis_priority_queue.py CHANGED
@@ -63,8 +63,8 @@ class RedisPriorityQueue:
          if queue_name is None:
              self.queue_name = f"crawlo:{module_name}:queue:requests"
          else:
-             # Keep the user-provided queue name unchanged
-             self.queue_name = queue_name
+             # Normalize the queue name, handling repeated "crawlo" prefixes
+             self.queue_name = self._normalize_queue_name(queue_name)
 
          # If processing_queue is not provided, derive it from queue_name
          if processing_queue is None:
@@ -92,6 +92,47 @@ class RedisPriorityQueue:
          self._lock = asyncio.Lock()  # lock used for connection initialization
          self.request_serializer = RequestSerializer()  # handles serialization
 
+     def _normalize_queue_name(self, queue_name: str) -> str:
+         """
+         Normalize a queue name, handling repeated "crawlo" prefixes
+
+         :param queue_name: the original queue name
+         :return: the normalized queue name
+         """
+         # If the name is already well-formed (starts with "crawlo:" but not "crawlo:crawlo:"), keep it unchanged
+         if queue_name.startswith("crawlo:") and not queue_name.startswith("crawlo:crawlo:"):
+             return queue_name
+
+         # Handle a triple "crawlo" prefix, collapsing it to the standard format
+         if queue_name.startswith("crawlo:crawlo:crawlo:"):
+             # Triple "crawlo" prefix: collapse to the standard "crawlo:" format
+             remaining = queue_name[21:]  # strip the "crawlo:crawlo:crawlo:" prefix
+             if remaining:
+                 return f"crawlo:{remaining}"
+             else:
+                 return "crawlo:requests"  # default name
+
+         # Handle a double "crawlo" prefix
+         elif queue_name.startswith("crawlo:crawlo:"):
+             # Double "crawlo" prefix: collapse to the standard "crawlo:" format
+             remaining = queue_name[14:]  # strip the "crawlo:crawlo:" prefix
+             if remaining:
+                 return f"crawlo:{remaining}"
+             else:
+                 return "crawlo:requests"  # default name
+
+         # Handle the case of no "crawlo" prefix
+         elif not queue_name.startswith("crawlo:"):
+             # No "crawlo" prefix: add one
+             if queue_name:
+                 return f"crawlo:{queue_name}"
+             else:
+                 return "crawlo:requests"  # default name
+
+         # Anything else is left unchanged
+         else:
+             return queue_name
+
      async def connect(self, max_retries=3, delay=1):
          """Asynchronously connect to Redis, with retry support"""
          async with self._lock:
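The mapping implemented by `_normalize_queue_name`, summarized with the input/output pairs used by the test scripts further down in this diff:

```python
# Input -> normalized output, per the tests added later in this diff.
expected = {
    "crawlo:test_project:queue:requests": "crawlo:test_project:queue:requests",  # already normalized
    "crawlo:crawlo:queue:requests": "crawlo:queue:requests",                     # double prefix collapsed
    "crawlo:crawlo:crawlo:queue:requests": "crawlo:queue:requests",              # triple prefix collapsed
    "test_project:queue:requests": "crawlo:test_project:queue:requests",         # missing prefix added
    "": "crawlo:requests",                                                       # empty name -> default
}
```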
crawlo/settings/default_settings.py CHANGED
@@ -60,7 +60,8 @@ REQUEST_GENERATION_INTERVAL = 0.01  # request generation interval (seconds)
  ENABLE_CONTROLLED_REQUEST_GENERATION = False  # whether to enable controlled request generation
 
  # Scheduler queue name (follows the unified naming convention)
- SCHEDULER_QUEUE_NAME = f"crawlo:{PROJECT_NAME}:queue:requests"
+ # When using the Redis queue, uncomment and set this value, or set it in the project settings file
+ # SCHEDULER_QUEUE_NAME = f"crawlo:{PROJECT_NAME}:queue:requests"
 
  # Queue type: memory/redis/auto
  QUEUE_TYPE = 'auto'
@@ -97,13 +98,12 @@ if REDIS_PASSWORD:
  else:
      REDIS_URL = f'redis://{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}'
 
- # Unified Redis key naming convention
- # REDIS_KEY_PREFIX has moved into the individual components, which apply the unified convention
- # crawlo:{PROJECT_NAME}:filter:fingerprint (request dedup)
- # crawlo:{PROJECT_NAME}:item:fingerprint (item dedup)
- # crawlo:{PROJECT_NAME}:queue:requests (request queue)
- # crawlo:{PROJECT_NAME}:queue:processing (processing queue)
- # crawlo:{PROJECT_NAME}:queue:failed (failed queue)
+ # The Redis key naming convention is encapsulated in the framework components; users need not configure it manually:
+ # - request dedup: crawlo:{PROJECT_NAME}:filter:fingerprint
+ # - item dedup: crawlo:{PROJECT_NAME}:item:fingerprint
+ # - request queue: crawlo:{PROJECT_NAME}:queue:requests
+ # - processing queue: crawlo:{PROJECT_NAME}:queue:processing
+ # - failed queue: crawlo:{PROJECT_NAME}:queue:failed
 
  REDIS_TTL = 0  # fingerprint TTL (0 = never expire)
  CLEANUP_FP = 0  # whether to clear fingerprints when the program exits (0 = do not clear)
crawlo/templates/project/settings.py.tmpl CHANGED
@@ -29,6 +29,9 @@ DOWNLOADER = 'crawlo.downloader.aiohttp_downloader.AioHttpDownloader'
  # ============================== Queue configuration ==============================
  # Queue type: 'memory', 'redis', 'auto'
  QUEUE_TYPE = 'memory'
+ # When using the Redis queue, the queue name can be customized
+ # Queue names follow the unified convention: crawlo:{PROJECT_NAME}:queue:requests
+ # SCHEDULER_QUEUE_NAME = f'crawlo:{PROJECT_NAME}:queue:requests'
 
  # ============================== Dedup filter ==============================
  FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'
crawlo/templates/project/settings_distributed.py.tmpl CHANGED
@@ -28,6 +28,9 @@ DOWNLOADER = 'crawlo.downloader.aiohttp_downloader.AioHttpDownloader'
 
  # ============================== Queue configuration ==============================
  QUEUE_TYPE = 'redis'
+ # When using the Redis queue, the queue name can be customized
+ # Queue names follow the unified convention: crawlo:{PROJECT_NAME}:queue:requests
+ # SCHEDULER_QUEUE_NAME = f'crawlo:{PROJECT_NAME}:queue:requests'
 
  # ============================== Dedup filter ==============================
  FILTER_CLASS = 'crawlo.filters.aioredis_filter.AioRedisFilter'
crawlo-1.3.5.dist-info/METADATA → crawlo-1.3.7.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: crawlo
- Version: 1.3.5
+ Version: 1.3.7
  Summary: Crawlo is a high-performance Python crawler framework built on asynchronous IO, with support for distributed crawling.
  Home-page: https://github.com/crawl-coder/Crawlo.git
  Author: crawl-coder
@@ -630,6 +630,51 @@ Crawlo supports three queue types, configurable via the `QUEUE_TYPE` setting:
 
  `auto` mode is recommended: it lets the framework pick the most suitable queue type for the environment.
 
+ #### Redis Key Naming Convention
+
+ In distributed mode, the Crawlo framework uses Redis for queue and deduplication storage. To keep data isolated between different projects and spiders, the framework follows a unified Redis key naming convention:
+
+ ##### Default naming rule
+ Redis keys follow the format `crawlo:{PROJECT_NAME}:{component}:{identifier}`
+
+ where:
+ - `PROJECT_NAME`: the project name, used to distinguish projects
+ - `component`: the component type, e.g. `queue`, `filter`, `item`
+ - `identifier`: the concrete identifier, e.g. `requests`, `processing`, `failed`, `fingerprint`
+
+ ##### Concrete key formats
+ 1. **Request queue**: `crawlo:{PROJECT_NAME}:queue:requests`
+    - stores pending request tasks
+
+ 2. **Processing queue**: `crawlo:{PROJECT_NAME}:queue:processing`
+    - stores requests currently being processed
+
+ 3. **Failed queue**: `crawlo:{PROJECT_NAME}:queue:failed`
+    - stores requests whose processing failed
+
+ 4. **Request dedup**: `crawlo:{PROJECT_NAME}:filter:fingerprint`
+    - stores request URL fingerprints for deduplication
+
+ 5. **Item dedup**: `crawlo:{PROJECT_NAME}:item:fingerprint`
+    - stores item fingerprints to prevent duplicate storage
+
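For a concrete picture, the full key set for one project can be enumerated as follows (a sketch; the project name is an example value):

```python
# Sketch: the Redis keys the framework derives for one project.
PROJECT_NAME = "my_project"  # example value

keys = {
    "requests":   f"crawlo:{PROJECT_NAME}:queue:requests",
    "processing": f"crawlo:{PROJECT_NAME}:queue:processing",
    "failed":     f"crawlo:{PROJECT_NAME}:queue:failed",
    "request_fp": f"crawlo:{PROJECT_NAME}:filter:fingerprint",
    "item_fp":    f"crawlo:{PROJECT_NAME}:item:fingerprint",
}
```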
+ ##### Custom queue names
+ The request queue name can be customized via the `SCHEDULER_QUEUE_NAME` setting. The processing and failed queue names are derived automatically from the request queue name:
+ - processing queue: `:queue:requests` is replaced with `:queue:processing`
+ - failed queue: `:queue:requests` is replaced with `:queue:failed`
+
+ Example configuration:
+ ```python
+ # settings.py
+ SCHEDULER_QUEUE_NAME = f'crawlo:{PROJECT_NAME}:queue:requests'
+ ```
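The derived names follow from simple substitution on the request-queue name. A sketch of the documented rule (not the framework's internal helper):

```python
# Derive the processing/failed queue names from the request queue name,
# per the substitution rule described above.
queue_name = "crawlo:my_project:queue:requests"  # example value
processing = queue_name.replace(":queue:requests", ":queue:processing")
failed = queue_name.replace(":queue:requests", ":queue:failed")
assert processing == "crawlo:my_project:queue:processing"
assert failed == "crawlo:my_project:queue:failed"
```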
+
+ ##### Benefits of the convention
+ 1. **Namespace isolation**: project names isolate data between projects
+ 2. **Clear component classification**: the component segment separates functional modules
+ 3. **Easy monitoring and management**: a uniform key format simplifies Redis monitoring and administration
+ 4. **No naming conflicts**: avoids key collisions between projects and components
+
  <!-- Configuration System section -->
  <h2 align="center">🎛️ Configuration System</h2>
 
@@ -1095,6 +1140,34 @@ asyncio.run(process.crawl('my_spider_name'))
 
  ---
 
+ <!-- Redis key-name fix section -->
+ <h2 align="center">🔧 Redis Key-Name Fix Notes</h2>
+
+ Earlier versions of the Crawlo framework had a double-prefix problem when generating Redis queue key names. Concretely:
+
+ - **Symptom**: queue key names carried a doubled "crawlo" prefix, e.g. `crawlo:crawlo:queue:requests` instead of the correct `crawlo:{project_name}:queue:requests`
+ - **Scope**: broke correct identification and use of the request, processing, and failed queues in distributed mode
+ - **Root cause**: the project-name extraction logic in the queue manager did not correctly handle queue names in different formats
+
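In code terms, the symptom versus the intended result looks like this (illustrative project name, not taken from the package):

```python
# Symptom vs. intended key name after the fix (illustrative values):
buggy = "crawlo:crawlo:queue:requests"         # doubled "crawlo" prefix
intended = "crawlo:my_project:queue:requests"  # crawlo:{project_name}:queue:requests
assert buggy != intended
```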
+ **What was fixed**:
+
+ 1. **Queue manager improvements**:
+    - [QueueConfig.from_settings](file:///Users/oscar/projects/Crawlo/crawlo/queue/queue_manager.py#L148-L180) now correctly falls back to the project-name-based default when `SCHEDULER_QUEUE_NAME` is not set
+    - the logic that extracts the project name from the queue name now handles all prefix variants correctly
+
+ 2. **Redis queue implementation improvements**:
+    - [RedisPriorityQueue](file:///Users/oscar/projects/Crawlo/crawlo/queue/redis_priority_queue.py#L39-L76) gained a `_normalize_queue_name` method that normalizes queue names
+    - repeated "crawlo" prefixes are handled so queue names follow the unified convention
+
+ 3. **Configuration adjustments**:
+    - `SCHEDULER_QUEUE_NAME` is now commented out by default, giving more configuration flexibility
+    - the configuration files of all templates and example projects were kept consistent
+
+ **Verification**:
+ Dedicated test scripts verify the fix, ensuring Redis key names are generated and recognized correctly across all queue-naming variants.
+
+ ---
+
  <!-- Documentation section -->
  <h2 align="center">📚 Documentation</h2>
 
crawlo-1.3.5.dist-info/RECORD → crawlo-1.3.7.dist-info/RECORD CHANGED
@@ -1,12 +1,12 @@
  crawlo/__init__.py,sha256=rCeDq1OoX6mmcBxuK60eUpEp1cIg5T8Zgic3FUQAOkA,2318
- crawlo/__version__.py,sha256=AraH1WB67qbGfdcq6TC7ARcEi7zORceoTxsCTwF_h8g,22
+ crawlo/__version__.py,sha256=ejHyy8zORCf0PfUyyfPDzlV1k5vn5lI98S7TxKzblZc,22
  crawlo/cli.py,sha256=OXprmcTUbFK02ptw_Gq8Gk4-ZCU-WEMJgzU1ztgP6Bk,2327
  crawlo/config.py,sha256=dNoNyTkXLe2msQ7bZx3YTQItk1m49nIg5-g89FQDNwE,9486
  crawlo/config_validator.py,sha256=gsiLqf5swWd9ISDvoLqCdG7iSXr-ZdBPD4iT6ug1ua4,11239
  crawlo/crawler.py,sha256=wd8_jrfUBwlIw4NiaNeCwMj-CXS7F2ngeUhQ74P0wJE,25656
  crawlo/event.py,sha256=7-y6HNv_EIJSYQNzsj0mVK-Gg4ON3wdQeMdQjfFJPlw,313
  crawlo/exceptions.py,sha256=sMay0wnWLfc_FXWslqxm60qz6b66LXs3EdN_w8ygE9k,1166
- crawlo/framework.py,sha256=N0N9_GOWWYafob5iYqGT4wGAKTxSMWFbWTJuE9PRkqI,9062
+ crawlo/framework.py,sha256=1RVBwj_VBzfJiMB3lq6XcfFHCjRBHyT4D_T2X4fU_6g,9166
  crawlo/mode_manager.py,sha256=JP8_jkH2p9LMg1-g1e05PhSggSvt4jO_oO2h51pLVYQ,7399
  crawlo/project.py,sha256=DooXmO0nmcHPVRsnDBTE0dOrX-KOqnJe6A0s_-qOxRI,12147
  crawlo/stats_collector.py,sha256=copzmfWTArYZCkMeZJsJfJcdC36s7_LM88hxAYttoeE,2306
@@ -90,10 +90,10 @@ crawlo/pipelines/pipeline_manager.py,sha256=rtKZEgDc9oMDYaTrSSQYCc7rVJ-a65TQw4p3
  crawlo/pipelines/redis_dedup_pipeline.py,sha256=POYRiWAOp1pqDW9iTPJ8h3VcpLALeLrpw74MvJJqPiM,6342
  crawlo/queue/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  crawlo/queue/pqueue.py,sha256=j2ISmyays5t1tuI36xM6EcELwSpq2xIjAScSBWSRZms,1220
- crawlo/queue/queue_manager.py,sha256=t-49ygGHAvOZ08v5zj4F06Iq2JUCSK5vZldRg-4sVtI,19669
- crawlo/queue/redis_priority_queue.py,sha256=5mEgMjqg7XrQrWOhWpwGwycmA-qcwfHtr8w7cKHs4-E,13657
+ crawlo/queue/queue_manager.py,sha256=JfkjtOD04e_OZZvEEvp3O_W3lfGXhHslZHrCgw90amY,20693
+ crawlo/queue/redis_priority_queue.py,sha256=Evmo514OFL0a7Xu2SdCiz6klFUGH1gmjlxCc01vX1tQ,15400
  crawlo/settings/__init__.py,sha256=xsukVKn_h2Hopm1Nj-bXkhbfyS62QTTvJi7fhZUwR9M,123
- crawlo/settings/default_settings.py,sha256=vmacY04PZqumteQn7URMo0r3JWwJCctXaJcoxlW5-M0,13144
+ crawlo/settings/default_settings.py,sha256=LReJ11Wm8wh1z2GHgGRBYH6Tpq6_CkRd8QZ4dKRCqI0,13220
  crawlo/settings/setting_manager.py,sha256=AWPvvhOGo04Yv_q3jqEMyzhEpbxOX_Wr8tSHmI2sUnA,8109
  crawlo/spider/__init__.py,sha256=oi9LEYq9xaCSjktAIRUgjpGQQI7rTtN61ESdHeWb1x4,21224
  crawlo/templates/crawlo.cfg.tmpl,sha256=9BAmwEibS5Tvy6HIcGXPb0BGeuesmibebmTW0iAEkmo,230
@@ -103,8 +103,8 @@ crawlo/templates/project/__init__.py.tmpl,sha256=f3ETIXw_O6K-lkL6lXM5znMPJW1FZYG
  crawlo/templates/project/items.py.tmpl,sha256=mt1Mm--H2Ouos3r7JPkYh0r33rgYJf1YOMz0OZy8TYs,297
  crawlo/templates/project/middlewares.py.tmpl,sha256=T67p8j0laL4NJJ_3xzPM9yivgZRjTEMiEtEWLPwbkmw,4160
  crawlo/templates/project/pipelines.py.tmpl,sha256=GBHYU0Jx8sKDCdGJp44FMSH7u2slxoFg6a-R9Uwg_-I,2608
- crawlo/templates/project/settings.py.tmpl,sha256=qat0jBxnWXZDhCdmHh86JC4eDodRuNW9mKQ6mIBaiCY,6685
- crawlo/templates/project/settings_distributed.py.tmpl,sha256=q1v2HBS6NF1Ebwb1ia9z5DV9Zv3CREZPDJSDuCryv58,6783
+ crawlo/templates/project/settings.py.tmpl,sha256=ETZnuGQvWhLdiXeu1FTH8vlHZ3d66GoC8qEzep9ZMrI,6880
+ crawlo/templates/project/settings_distributed.py.tmpl,sha256=BxNjURczwiBoowdMYCM631IXlQIJ15jd5wug7b_5RGw,6978
  crawlo/templates/project/settings_gentle.py.tmpl,sha256=ZT6d-1Ao0h90vT82W9BSZuF8tsdyC4RU3446u1mh104,6631
  crawlo/templates/project/settings_high_performance.py.tmpl,sha256=Z_oWA4_a2yKOFAPG8lsLue2L6RzuKp8flq_NscAQvqA,6720
  crawlo/templates/project/settings_minimal.py.tmpl,sha256=6_7R0T9iIBOInTP9HX-icEvPOhd8-B3lmiZEz30kzV0,2485
@@ -191,6 +191,7 @@ tests/simple_crawlo_test.py,sha256=8x8DNL7O_1DNtOQ_K7YsOFIZoWeGmpeEP9mKWHlkbHg,4
  tests/simple_log_test.py,sha256=4daRH0bqTViz-BmyPcAZY9xKGks7G5kb39MH5W7v2XI,1700
  tests/simple_log_test2.py,sha256=Z2xcCiT_-sCd1Sd-SK7hINcn6WcH_-7Bq0TWAei-XIg,3807
  tests/simple_optimization_test.py,sha256=CyhyzW9lhPlTDAwrJu7gTWwcEQuCBL_Bnm9mkS_-iFo,3550
+ tests/simple_queue_type_test.py,sha256=OClhm3GvwWxONuUQqFD1KygGwUVnuYuxUvUY5OgyeKs,1152
  tests/simple_spider_test.py,sha256=X5oFRV02mkOXUd5lpzOBF7gX8K62j4ZwAUXoBEZ0KKE,1119
  tests/simple_test.py,sha256=kzMspCmfJxdnAIXXJv9tmDW1gpodkD9pznW5vA_gL84,1211
  tests/spider_log_timing_test.py,sha256=ngZQ_v3o9oHYcs_BtZgxH1N-N2tZUDPu-cnTnsHEpP8,5396
@@ -206,11 +207,11 @@ tests/test_config_consistency.py,sha256=DJaAQxGL7RXHs-DWF_B4yhHFGSGHWHUoDmLFiMi4
  tests/test_config_merge.py,sha256=d8i8sU1XKS3egNKEYPZ2a6CBnJRx2M3p6q04wYufAcw,5454
  tests/test_config_validator.py,sha256=5ivB71KstHGNi2BPzcclf9hBukXEgt_B8N4l1HRjBFc,6020
  tests/test_controlled_spider_mixin.py,sha256=7t6VGWr6Hxw0xtIFyToLH8_deSagUtsdqSJpibXHMY8,2785
- tests/test_crawlo_proxy_integration.py,sha256=_L62_soaHRYy_0fShjiZSmv-RtGICw7_kzhTNRoyFfc,2620
+ tests/test_crawlo_proxy_integration.py,sha256=SvdBuZjS6N2vuvFkTnc59U5n3dHV3E4dmFayxtmjCm4,2625
  tests/test_date_tools.py,sha256=CQdAmIS6bpAdwQH9ETDH__06l2gGL7EHUQuh7mdTF-A,3930
  tests/test_default_header_middleware.py,sha256=7kpONSsGMsmWgTX2pCpseme54_-82Baak0xVz6gclJk,5845
  tests/test_distributed.py,sha256=RQHUpDfRNG2x_1Cdr9DLk25IBcgapm_u0xSBMObE0Xc,1725
- tests/test_double_crawlo_fix.py,sha256=ZNkRDgWW2WN-QRNZhvIgTHonY-T_U_R_MOIBLuyJd_I,7770
+ tests/test_double_crawlo_fix.py,sha256=E5NxWHnQkwRTIrJGoag8G29fZqVMnsN6eCPuv17gGq0,7652
  tests/test_double_crawlo_fix_simple.py,sha256=MlWUqo51kOQ7Gu6Neoler8FVyRs0jpmQWoORHMBENz0,4644
  tests/test_download_delay_middleware.py,sha256=Va79gsH_8BVrVVLA8gSwFEbrRJ7qwJMCC1cDJN6il_0,8886
  tests/test_downloader_proxy_compatibility.py,sha256=3Jn7RJd1R2ywuitHp2Jju1yYNg57R4QmKwjuHGojDUE,8635
@@ -251,7 +252,9 @@ tests/test_proxy_stats.py,sha256=Til_yksrRz2yBVw-yJi5-36LhNW3vTwpXTm4BdR9PUM,507
  tests/test_proxy_strategies.py,sha256=ZkziozkvZd3KWOQnpHQ8Upd3WpyoX7gN0qFGluNm348,1809
  tests/test_queue_empty_check.py,sha256=FsFoThG8qXzhXtG9Gu4hHuz--iVZHSbFbGJh4vgq_ec,1141
  tests/test_queue_manager_double_crawlo.py,sha256=YzM6PnoyRSST-f2NVyI97bpPcoYWL06HUwf08Fyx3Qg,6784
- tests/test_queue_manager_redis_key.py,sha256=nCCMnpKPNP5fyd4zb4LG2kmJAUcLoa8ODhBGcz4GcCU,6231
+ tests/test_queue_manager_redis_key.py,sha256=qWkrXXlOg6IAi7OLPhBb524KbQHmHusAeYYi1W3yhqY,6938
+ tests/test_queue_naming.py,sha256=kDqj-n_k5KrFF5up1Ur6WmGS4Jqmk-QgKrkAtAyE4tk,4632
+ tests/test_queue_type.py,sha256=pD9v8Zcf8y6zAEKzko4Qo6FiwrEavXKMTS8Pni2r_8U,3198
  tests/test_random_user_agent.py,sha256=LuyR8WaKfqOap9WBQl4WEBcZDmKxhW80T-_wXbuo2Qw,2230
  tests/test_real_scenario_proxy.py,sha256=LGtxEvCiTgn6aTPGd7ZuqaCjApsjosD2DunJrd8-jFE,8259
  tests/test_redis_config.py,sha256=DBrqURBQt517Rt1h1l2iIKrKDfbkJzQSRUEYYbapcy4,875
@@ -259,6 +262,7 @@ tests/test_redis_connection_pool.py,sha256=WIUQlI6K3IINan14vknI4oFf9a8wpHCWi87KS
  tests/test_redis_key_naming.py,sha256=7_X_PSzFQn5m0n_7qLlCjFvY4ZKScC36cqWFu1PAFRw,6730
  tests/test_redis_key_validator.py,sha256=VFuawmaA0G7VSHueCvZEQNKY-L2IdDGlEcyuJ9nZu7Q,4295
  tests/test_redis_queue.py,sha256=2OZJHn5fN9b6XEgEs4Ht1AL6TOJ_H-IR9JxPzzvqMpg,6534
+ tests/test_redis_queue_name_fix.py,sha256=FSbhPMP8k_JP75TeOIPRbgQ7VQ2e5AVYRWjiPgIhelE,5595
  tests/test_request_ignore_middleware.py,sha256=8_2E6JU27eOWI3iHeh3YscLnp3SIHaubWdA477Ki6PE,6047
  tests/test_request_params.py,sha256=9vNksaOrFbuSb0UffruPxUHhJXZxVYyjQw9J69FSzH8,4176
  tests/test_request_serialization.py,sha256=TPBIzjaifcAjFWCFSFZ5ewRn814jSGPL28MGTwvrr_w,2262
@@ -281,8 +285,8 @@ tests/verify_distributed.py,sha256=krnYYA5Qx9xXDMWc9YF5DxPSplGvawDg2n0l-3CAqoM,3
  tests/verify_log_fix.py,sha256=TD7M1R22NxLqQPufvgE-H33u9tUjyz-rSR2ayIXozRU,4225
  tests/scrapy_comparison/ofweek_scrapy.py,sha256=2Hvpi6DRTubUxBy6RyJApQxMQONPLc1zWjKTQO_i5U4,5652
  tests/scrapy_comparison/scrapy_test.py,sha256=5sw7jOHhaTmQ8bsUd1TiolAUTRQYQOe-f49HPfysqbI,5466
- crawlo-1.3.5.dist-info/METADATA,sha256=RDqsfzstoxtnrY_AR26XXQ3MbZA-W7sdCx4_CPReUGo,29742
- crawlo-1.3.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- crawlo-1.3.5.dist-info/entry_points.txt,sha256=5HoVoTSPxI8SCa5B7pQYxLSrkOdiunyO9tqNsLMv52g,43
- crawlo-1.3.5.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
- crawlo-1.3.5.dist-info/RECORD,,
+ crawlo-1.3.7.dist-info/METADATA,sha256=rHuHJj9-pUt4TJRUgQcNV48DM6LRk0Ag8Ek5pZZeqLk,33235
+ crawlo-1.3.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ crawlo-1.3.7.dist-info/entry_points.txt,sha256=5HoVoTSPxI8SCa5B7pQYxLSrkOdiunyO9tqNsLMv52g,43
+ crawlo-1.3.7.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
+ crawlo-1.3.7.dist-info/RECORD,,
tests/simple_queue_type_test.py ADDED
@@ -0,0 +1,42 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ A simple test of QUEUE_TYPE configuration lookup
+ Verifies that our log-format change works correctly
+ """
+
+ import sys
+ import os
+
+ # Add the project root to the path
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+ from crawlo.config import CrawloConfig
+ from crawlo.framework import CrawloFramework
+
+
+ def test_log_format():
+     """Test that the log-format change works correctly"""
+     print("=== Testing the log-format change ===")
+
+     # Create a standalone-mode configuration
+     config = CrawloConfig.standalone(concurrency=4)
+
+     # Create the framework instance; this triggers the log output
+     framework = CrawloFramework(config.to_dict())
+
+     # Read the configuration values
+     run_mode = framework.settings.get('RUN_MODE', 'not found')
+     queue_type = framework.settings.get('QUEUE_TYPE', 'not found')
+
+     print(f"Values read from the configuration:")
+     print(f"  RunMode: {run_mode}")
+     print(f"  QueueType: {queue_type}")
+
+     print("\n✅ Log-format test finished")
+
+
+ if __name__ == "__main__":
+     print("Starting the simple QUEUE_TYPE configuration test...")
+     test_log_format()
+     print("\nTest finished!")
tests/test_crawlo_proxy_integration.py CHANGED
@@ -45,7 +45,7 @@ class ProxyTestSpider(Spider):
 
          item = TestItem(
              url=response.url,
-             status=response.status,
+             status=response.status_code,
              proxy=str(response.meta.get('proxy', 'No proxy'))
          )
 
tests/test_double_crawlo_fix.py CHANGED
@@ -34,18 +34,18 @@ async def test_redis_queue_naming():
      {
          "name": "double crawlo prefix",
          "queue_name": "crawlo:crawlo:queue:requests",
-         "expected_module": "crawlo",
-         "expected_queue": "crawlo:crawlo:queue:requests",
-         "expected_processing": "crawlo:crawlo:queue:processing",
-         "expected_failed": "crawlo:crawlo:queue:failed"
+         "expected_module": "test_project",
+         "expected_queue": "crawlo:queue:requests",  # expected value after the fix
+         "expected_processing": "crawlo:queue:processing",
+         "expected_failed": "crawlo:queue:failed"
      },
      {
          "name": "triple crawlo prefix",
          "queue_name": "crawlo:crawlo:crawlo:queue:requests",
-         "expected_module": "crawlo",
-         "expected_queue": "crawlo:crawlo:queue:requests",
-         "expected_processing": "crawlo:crawlo:queue:processing",
-         "expected_failed": "crawlo:crawlo:queue:failed"
+         "expected_module": "test_project",
+         "expected_queue": "crawlo:queue:requests",  # expected value after the fix
+         "expected_processing": "crawlo:queue:processing",
+         "expected_failed": "crawlo:queue:failed"
      },
      {
          "name": "no crawlo prefix",
@@ -138,11 +138,8 @@ async def test_queue_manager_naming():
          if len(parts) >= 2:
              # Handle a possible double "crawlo" prefix
              if parts[0] == "crawlo" and parts[1] == "crawlo":
-                 # Double "crawlo" prefix: use the third part as the project name
-                 if len(parts) >= 3:
-                     project_name = parts[2]
-                 else:
-                     project_name = "default"
+                 # Double "crawlo" prefix: use "crawlo" as the project name
+                 project_name = "crawlo"
              elif parts[0] == "crawlo":
                  # Normal "crawlo" prefix: use the second part as the project name
                  project_name = parts[1]
tests/test_queue_manager_redis_key.py CHANGED
@@ -1,8 +1,8 @@
  #!/usr/bin/env python3
  # -*- coding: utf-8 -*-
  """
- QueueManager Redis key test script
- Verifies that QueueManager passes the module_name parameter correctly when creating a RedisPriorityQueue
+ QueueManager Redis key test script
+ Verifies that QueueManager passes the module_name parameter correctly when creating Redis queues
  """
  import asyncio
  import sys
@@ -12,40 +12,32 @@ import traceback
  # Add the project root to the path
  sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
 
+ # Import the relevant modules
  from crawlo.queue.queue_manager import QueueManager, QueueConfig, QueueType
 
 
  class MockSettings:
      """Mock settings class"""
-     def __init__(self, project_name="test_project"):
-         self.project_name = project_name
-         self.REDIS_URL = "redis://127.0.0.1:6379/15"  # use the test database
+     def __init__(self):
+         self.REDIS_HOST = '127.0.0.1'
+         self.REDIS_PORT = 6379
+         self.REDIS_PASSWORD = ''
+         self.REDIS_DB = 0
+         self.REDIS_URL = 'redis://127.0.0.1:6379/0'
          self.REDIS_TTL = 0
-         self.CLEANUP_FP = 0
-         self.FILTER_DEBUG = True
-         self.LOG_LEVEL = "INFO"
-         self.DECODE_RESPONSES = True
+         self.SCHEDULER_MAX_QUEUE_SIZE = 1000
+         self.QUEUE_MAX_RETRIES = 3
+         self.QUEUE_TIMEOUT = 300
 
      def get(self, key, default=None):
-         if key == 'PROJECT_NAME':
-             return self.project_name
+         if key == 'REDIS_HOST':
+             return self.REDIS_HOST
+         elif key == 'REDIS_PASSWORD':
+             return self.REDIS_PASSWORD
          elif key == 'REDIS_URL':
              return self.REDIS_URL
-         elif key == 'FILTER_DEBUG':
-             return self.FILTER_DEBUG
-         elif key == 'LOG_LEVEL':
-             return self.LOG_LEVEL
-         elif key == 'DECODE_RESPONSES':
-             return self.DECODE_RESPONSES
-         return default
-
-     def get_bool(self, key, default=False):
-         if key == 'FILTER_DEBUG':
-             return self.FILTER_DEBUG
-         elif key == 'DECODE_RESPONSES':
-             return self.DECODE_RESPONSES
-         elif key == 'CLEANUP_FP':
-             return self.CLEANUP_FP
+         elif key == 'REDIS_TTL':
+             return self.REDIS_TTL
          return default
 
      def get_int(self, key, default=0):
@@ -74,26 +66,41 @@ async def test_queue_manager_redis_key():
      {
          "queue_name": "crawlo:books_distributed:queue:requests",
          "expected_module_name": "books_distributed",
+         "expected_queue_name": "crawlo:books_distributed:queue:requests",
+         "expected_processing_queue": "crawlo:books_distributed:queue:processing",
+         "expected_failed_queue": "crawlo:books_distributed:queue:failed",
          "description": "standard project name"
      },
      {
          "queue_name": "crawlo:api_data_collection:queue:requests",
          "expected_module_name": "api_data_collection",
+         "expected_queue_name": "crawlo:api_data_collection:queue:requests",
+         "expected_processing_queue": "crawlo:api_data_collection:queue:processing",
+         "expected_failed_queue": "crawlo:api_data_collection:queue:failed",
          "description": "API data collection project"
      },
      {
          "queue_name": "crawlo:test_project:queue:requests",
          "expected_module_name": "test_project",
+         "expected_queue_name": "crawlo:test_project:queue:requests",
+         "expected_processing_queue": "crawlo:test_project:queue:processing",
+         "expected_failed_queue": "crawlo:test_project:queue:failed",
          "description": "test project"
      },
      {
          "queue_name": "simple_queue_name",
          "expected_module_name": "simple_queue_name",
+         "expected_queue_name": "crawlo:simple_queue_name",  # RedisPriorityQueue normalizes the name
+         "expected_processing_queue": "crawlo:simple_queue_name:processing",
+         "expected_failed_queue": "crawlo:simple_queue_name:failed",
          "description": "simple queue name"
      },
      {
          "queue_name": "",
          "expected_module_name": "default",
+         "expected_queue_name": "crawlo:",  # the empty string normalizes to "crawlo:"
+         "expected_processing_queue": "crawlo::processing",
+         "expected_failed_queue": "crawlo::failed",
          "description": "empty queue name"
      }
  ]
@@ -123,16 +130,12 @@ async def test_queue_manager_redis_key():
              f"module_name mismatch: {queue.module_name} != {test_case['expected_module_name']}"
 
          # Verify that the queue names follow the convention
-         expected_queue_name = f"crawlo:{queue.module_name}:queue:requests"
-         expected_processing_queue = f"crawlo:{queue.module_name}:queue:processing"
-         expected_failed_queue = f"crawlo:{queue.module_name}:queue:failed"
-
-         assert queue.queue_name == expected_queue_name, \
-             f"queue name mismatch: {queue.queue_name} != {expected_queue_name}"
-         assert queue.processing_queue == expected_processing_queue, \
-             f"processing queue name mismatch: {queue.processing_queue} != {expected_processing_queue}"
-         assert queue.failed_queue == expected_failed_queue, \
-             f"failed queue name mismatch: {queue.failed_queue} != {expected_failed_queue}"
+         assert queue.queue_name == test_case["expected_queue_name"], \
+             f"queue name mismatch: {queue.queue_name} != {test_case['expected_queue_name']}"
+         assert queue.processing_queue == test_case["expected_processing_queue"], \
+             f"processing queue name mismatch: {queue.processing_queue} != {test_case['expected_processing_queue']}"
+         assert queue.failed_queue == test_case["expected_failed_queue"], \
+             f"failed queue name mismatch: {queue.failed_queue} != {test_case['expected_failed_queue']}"
 
          print(f"  module_name: {queue.module_name}")
          print(f"  queue name: {queue.queue_name}")
tests/test_queue_naming.py ADDED
@@ -0,0 +1,155 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ Queue-naming test script
+ Verifies the Redis queue-naming fix
+ """
+ import asyncio
+ import sys
+ import os
+
+ # Add the project root to the path
+ sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
+
+ from crawlo.queue.redis_priority_queue import RedisPriorityQueue
+
+
+ class MockRequest:
+     def __init__(self, url):
+         self.url = url
+         self.priority = 0
+         self.meta = {}
+
+
+ def test_queue_naming():
+     """Test queue naming"""
+     print("Starting Redis queue-naming tests...")
+     print("=" * 50)
+
+     # Test cases
+     test_cases = [
+         {
+             "name": "normal naming",
+             "queue_name": "crawlo:test_project:queue:requests",
+             "expected_queue": "crawlo:test_project:queue:requests",
+             "expected_processing": "crawlo:test_project:queue:processing",
+             "expected_failed": "crawlo:test_project:queue:failed"
+         },
+         {
+             "name": "double crawlo prefix",
+             "queue_name": "crawlo:crawlo:queue:requests",
+             "expected_queue": "crawlo:queue:requests",
+             "expected_processing": "crawlo:queue:processing",
+             "expected_failed": "crawlo:queue:failed"
+         }
+     ]
+
+     all_passed = True
+
+     for i, test_case in enumerate(test_cases, 1):
+         print(f"Test {i}: {test_case['name']}")
+         print(f"  input queue name: {test_case['queue_name']}")
+
+         # Create a RedisPriorityQueue instance
+         queue = RedisPriorityQueue(
+             redis_url="redis://127.0.0.1:6379/0",
+             queue_name=test_case['queue_name'],
+             module_name="test_project"
+         )
+
+         print(f"  actual queue name: {queue.queue_name}")
+         print(f"  actual processing queue: {queue.processing_queue}")
+         print(f"  actual failed queue: {queue.failed_queue}")
+
+         print(f"  expected queue name: {test_case['expected_queue']}")
+         print(f"  expected processing queue: {test_case['expected_processing']}")
+         print(f"  expected failed queue: {test_case['expected_failed']}")
+
+         # Verify the results
+         queue_name_ok = queue.queue_name == test_case['expected_queue']
+         processing_queue_ok = queue.processing_queue == test_case['expected_processing']
+         failed_queue_ok = queue.failed_queue == test_case['expected_failed']
+
+         if queue_name_ok and processing_queue_ok and failed_queue_ok:
+             print("  ✓ test passed")
+         else:
+             print("  ✗ test failed")
+             all_passed = False
+
+         print()
+
+     return all_passed
+
+
+ async def test_queue_operations():
+     """Test queue operations"""
+     print("Starting Redis queue operation tests...")
+     print("=" * 50)
+
+     # Create a RedisPriorityQueue instance
+     queue = RedisPriorityQueue(
+         redis_url='redis://127.0.0.1:6379/0',
+         queue_name='crawlo:test_project:queue:requests',
+         module_name='test_project'
+     )
+
+     # Connect to Redis
+     await queue.connect()
+
+     print('queue name:', queue.queue_name)
+     print('processing queue:', queue.processing_queue)
+     print('failed queue:', queue.failed_queue)
+
+     # Clean up data from previous test runs
+     await queue._redis.delete(queue.queue_name)
+     await queue._redis.delete(f'{queue.queue_name}:data')
+     await queue._redis.delete(queue.processing_queue)
+     await queue._redis.delete(f'{queue.processing_queue}:data')
+     await queue._redis.delete(queue.failed_queue)
+
+     # Add a test task
+     request = MockRequest('https://example.com')
+
+     # Test putting a request on the queue
+     result = await queue.put(request, priority=1)
+     print('put result:', result)
+
+     # Check the queue size
+     size = await queue.qsize()
+     print('queue size:', size)
+
+     # Test getting a request back
+     retrieved = await queue.get(timeout=1.0)
+     print('retrieved request:', retrieved.url if retrieved else None)
+
+     if retrieved:
+         # Test acknowledging task completion
+         await queue.ack(retrieved)
+         print('task acknowledged')
+
+     # Close the connection
+     await queue.close()
+     print("Queue operation tests finished")
+
+
+ async def main():
+     """Main test entry point"""
+     print("Starting Redis queue naming and operation tests...")
+     print("=" * 50)
+
+     # Test queue naming
+     naming_test_passed = test_queue_naming()
+
+     # Test queue operations
+     await test_queue_operations()
+
+     print("=" * 50)
+     if naming_test_passed:
+         print("All tests passed!")
+     else:
+         print("Some tests failed!")
+     print("Tests finished!")
+
+
+ if __name__ == "__main__":
+     asyncio.run(main())
tests/test_queue_type.py ADDED
@@ -0,0 +1,107 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ Tests for QUEUE_TYPE configuration lookup
+ """
+
+ import sys
+ import os
+
+ # Add the project root to the path
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+ from crawlo.framework import CrawloFramework
+ from crawlo.config import CrawloConfig
+
+
+ def test_queue_type_standalone():
+     """Test QUEUE_TYPE in standalone mode"""
+     print("=== Testing QUEUE_TYPE in standalone mode ===")
+
+     # Create a standalone-mode configuration
+     config = CrawloConfig.standalone(concurrency=4)
+
+     # Create the framework instance
+     framework = CrawloFramework(config.to_dict())
+
+     # Read QUEUE_TYPE
+     queue_type = framework.settings.get('QUEUE_TYPE', 'not found')
+     run_mode = framework.settings.get('RUN_MODE', 'not found')
+
+     print(f"RunMode: {run_mode}")
+     print(f"QueueType: {queue_type}")
+
+     # Verify the values
+     assert queue_type == 'memory', f"expected 'memory', got '{queue_type}'"
+     assert run_mode == 'standalone', f"expected 'standalone', got '{run_mode}'"
+
+     print("✅ standalone-mode test passed")
+
+
+ def test_queue_type_distributed():
+     """Test QUEUE_TYPE in distributed mode"""
+     print("\n=== Testing QUEUE_TYPE in distributed mode ===")
+
+     # Create a distributed-mode configuration
+     config = CrawloConfig.distributed(
+         redis_host='127.0.0.1',
+         redis_port=6379,
+         project_name='test_project',
+         concurrency=4
+     )
+
+     # Create the framework instance
+     framework = CrawloFramework(config.to_dict())
+
+     # Read QUEUE_TYPE
+     queue_type = framework.settings.get('QUEUE_TYPE', 'not found')
+     run_mode = framework.settings.get('RUN_MODE', 'not found')
+
+     print(f"RunMode: {run_mode}")
+     print(f"QueueType: {queue_type}")
+
+     # Verify the values
+     assert queue_type == 'redis', f"expected 'redis', got '{queue_type}'"
+     assert run_mode == 'distributed', f"expected 'distributed', got '{run_mode}'"
+
+     print("✅ distributed-mode test passed")
+
+
+ def test_queue_type_auto():
+     """Test QUEUE_TYPE in auto mode"""
+     print("\n=== Testing QUEUE_TYPE in auto mode ===")
+
+     # Create an auto-mode configuration
+     config = CrawloConfig.auto(concurrency=4)
+
+     # Create the framework instance
+     framework = CrawloFramework(config.to_dict())
+
+     # Read QUEUE_TYPE
+     queue_type = framework.settings.get('QUEUE_TYPE', 'not found')
+     run_mode = framework.settings.get('RUN_MODE', 'not found')
+
+     print(f"RunMode: {run_mode}")
+     print(f"QueueType: {queue_type}")
+
+     # Verify the values
+     assert queue_type == 'auto', f"expected 'auto', got '{queue_type}'"
+     assert run_mode == 'auto', f"expected 'auto', got '{run_mode}'"
+
+     print("✅ auto-mode test passed")
+
+
+ if __name__ == "__main__":
+     print("Starting QUEUE_TYPE configuration tests...")
+
+     try:
+         test_queue_type_standalone()
+         test_queue_type_distributed()
+         test_queue_type_auto()
+
+         print("\n🎉 All tests passed! QUEUE_TYPE can be read from the configuration.")
+
+     except Exception as e:
+         print(f"\n❌ Test failed: {e}")
+         import traceback
+         traceback.print_exc()
tests/test_redis_queue_name_fix.py ADDED
@@ -0,0 +1,176 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ Redis queue-name fix test script
+ Verifies the queue-name handling fix in RedisPriorityQueue
+ """
+ import sys
+ import os
+
+ # Add the project root to the path
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+ # Import the relevant modules
+ from crawlo.queue.redis_priority_queue import RedisPriorityQueue
+
+
+ def test_normalize_queue_name():
+     """Test the queue-name normalization function"""
+     print("Starting RedisPriorityQueue queue-name normalization tests...")
+     print("=" * 50)
+
+     # Create a RedisPriorityQueue instance for testing
+     queue = RedisPriorityQueue(redis_url="redis://127.0.0.1:6379/15")
+
+     test_cases = [
+         {
+             "name": "already normalized name",
+             "input": "crawlo:test_project:queue:requests",
+             "expected": "crawlo:test_project:queue:requests"
+         },
+         {
+             "name": "double crawlo prefix",
+             "input": "crawlo:crawlo:queue:requests",
+             "expected": "crawlo:queue:requests"
+         },
+         {
+             "name": "triple crawlo prefix",
+             "input": "crawlo:crawlo:crawlo:queue:requests",
+             "expected": "crawlo:queue:requests"
+         },
+         {
+             "name": "no crawlo prefix",
+             "input": "test_project:queue:requests",
+             "expected": "crawlo:test_project:queue:requests"
+         },
+         {
+             "name": "empty queue name",
+             "input": "",
+             "expected": "crawlo:requests"
+         }
+     ]
+
+     all_passed = True
+
+     for i, test_case in enumerate(test_cases, 1):
+         print(f"Test {i}: {test_case['name']}")
+         print(f"  input: {test_case['input']}")
+
+         # Exercise the normalization function
+         result = queue._normalize_queue_name(test_case['input'])
+         print(f"  output: {result}")
+         print(f"  expected: {test_case['expected']}")
+
+         # Verify the result
+         if result == test_case['expected']:
+             print("  ✓ test passed")
+         else:
+             print("  ✗ test failed")
+             all_passed = False
+
+         print()
+
+     print("=" * 50)
+     if all_passed:
+         print("All tests passed! Queue-name normalization is fixed")
+         return True
+     else:
+         print("Some tests failed; please check the implementation")
+         return False
+
+
+ def test_queue_initialization():
+     """Test name handling during queue initialization"""
+     print("Starting RedisPriorityQueue initialization naming tests...")
+     print("=" * 50)
+
+     test_cases = [
+         {
+             "name": "normal naming",
+             "queue_name": "crawlo:test_project:queue:requests",
+             "expected_queue": "crawlo:test_project:queue:requests",
+             "expected_processing": "crawlo:test_project:queue:processing",
+             "expected_failed": "crawlo:test_project:queue:failed"
+         },
+         {
+             "name": "double crawlo prefix",
+             "queue_name": "crawlo:crawlo:queue:requests",
+             "expected_queue": "crawlo:queue:requests",
+             "expected_processing": "crawlo:queue:processing",
+             "expected_failed": "crawlo:queue:failed"
+         }
+     ]
+
+     all_passed = True
+
+     for i, test_case in enumerate(test_cases, 1):
+         print(f"Test {i}: {test_case['name']}")
+         print(f"  input queue name: {test_case['queue_name']}")
+
+         try:
+             # Create a RedisPriorityQueue instance
+             queue = RedisPriorityQueue(
+                 redis_url="redis://127.0.0.1:6379/15",
+                 queue_name=test_case['queue_name'],
+                 module_name="test_project"
+             )
+
+             print(f"  actual queue name: {queue.queue_name}")
+             print(f"  actual processing queue: {queue.processing_queue}")
+             print(f"  actual failed queue: {queue.failed_queue}")
+
+             print(f"  expected queue name: {test_case['expected_queue']}")
+             print(f"  expected processing queue: {test_case['expected_processing']}")
+             print(f"  expected failed queue: {test_case['expected_failed']}")
+
+             # Verify the results
+             queue_name_ok = queue.queue_name == test_case['expected_queue']
+             processing_queue_ok = queue.processing_queue == test_case['expected_processing']
+             failed_queue_ok = queue.failed_queue == test_case['expected_failed']
+
+             if queue_name_ok and processing_queue_ok and failed_queue_ok:
+                 print("  ✓ test passed")
+             else:
+                 print("  ✗ test failed")
+                 all_passed = False
+
+         except Exception as e:
+             print(f"  ✗ test raised an exception: {e}")
+             all_passed = False
+
+         print()
+
+     print("=" * 50)
+     if all_passed:
+         print("Queue initialization tests passed!")
+         return True
+     else:
+         print("Queue initialization tests failed!")
+         return False
+
+
+ def main():
+     """Main test entry point"""
+     print("Starting Redis queue-name fix tests...")
+     print("=" * 50)
+
+     # Test the normalization function
+     normalize_test_passed = test_normalize_queue_name()
+     print()
+
+     # Test queue initialization
+     init_test_passed = test_queue_initialization()
+     print()
+
+     print("=" * 50)
+     if normalize_test_passed and init_test_passed:
+         print("All tests passed! Redis queue-name fix complete")
+         return 0
+     else:
+         print("Some tests failed; please check the implementation")
+         return 1
+
+
+ if __name__ == "__main__":
+     exit_code = main()
+     sys.exit(exit_code)
File without changes