crawlo-1.1.9-py3-none-any.whl → crawlo-1.2.0-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic. Click here for more details.

crawlo/__version__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.1.9"
1
+ __version__ = "1.2.0"
crawlo/core/engine.py CHANGED
@@ -321,12 +321,21 @@ class Engine(object):
321
321
  async def _should_exit(self) -> bool:
322
322
  """检查是否应该退出(增强版本)"""
323
323
  # 没有启动请求,且所有队列都空闲
324
- if (self.start_requests is None and
325
- self.scheduler.idle() and
326
- self.downloader.idle() and
327
- self.task_manager.all_done() and
328
- self.processor.idle()):
329
- return True
324
+ if self.start_requests is None:
325
+ # 使用异步的idle检查方法以获得更精确的结果
326
+ scheduler_idle = await self.scheduler.async_idle() if hasattr(self.scheduler, 'async_idle') else self.scheduler.idle()
327
+
328
+ if (scheduler_idle and
329
+ self.downloader.idle() and
330
+ self.task_manager.all_done() and
331
+ self.processor.idle()):
332
+ # 增加额外检查确保所有任务都完成
333
+ await asyncio.sleep(0.1) # 短暂等待确保没有新的任务加入
334
+ if (await self.scheduler.async_idle() and
335
+ self.downloader.idle() and
336
+ self.task_manager.all_done() and
337
+ self.processor.idle()):
338
+ return True
330
339
 
331
340
  return False
332
341
 
crawlo/core/scheduler.py CHANGED
@@ -114,6 +114,13 @@ class Scheduler:
114
114
  """检查队列是否为空"""
115
115
  return len(self) == 0
116
116
 
117
+ async def async_idle(self) -> bool:
118
+ """异步检查队列是否为空(更精确)"""
119
+ if not self.queue_manager:
120
+ return True
121
+ # 使用队列管理器的异步empty方法
122
+ return await self.queue_manager.async_empty()
123
+
117
124
  async def close(self):
118
125
  """关闭调度器"""
119
126
  try:
@@ -198,13 +198,28 @@ class QueueManager:
198
198
  return 0
199
199
 
200
200
  def empty(self) -> bool:
201
- """检查队列是否为空"""
201
+ """检查队列是否为空(同步版本,用于兼容性)"""
202
202
  try:
203
203
  # 对于内存队列,可以同步检查
204
204
  if self._queue_type == QueueType.MEMORY:
205
205
  return self._queue.qsize() == 0
206
- # 对于 Redis 队列,需要异步操作,这里返回近似值
207
- return False
206
+ # 对于 Redis 队列,由于需要异步操作,这里返回近似值
207
+ # 为了确保程序能正常退出,我们返回True,让上层通过更精确的异步检查来判断
208
+ return True
209
+ except Exception:
210
+ return True
211
+
212
+ async def async_empty(self) -> bool:
213
+ """检查队列是否为空(异步版本,更精确)"""
214
+ try:
215
+ # 对于内存队列
216
+ if self._queue_type == QueueType.MEMORY:
217
+ return self._queue.qsize() == 0
218
+ # 对于 Redis 队列,使用异步检查
219
+ elif self._queue_type == QueueType.REDIS:
220
+ size = await self.size()
221
+ return size == 0
222
+ return True
208
223
  except Exception:
209
224
  return True
210
225
 
@@ -261,27 +276,15 @@ class QueueManager:
261
276
  async def _create_queue(self, queue_type: QueueType):
262
277
  """创建队列实例"""
263
278
  if queue_type == QueueType.REDIS:
264
- # 从队列名称中提取项目名称,用于module_name
265
- # 例如:crawlo:books_distributed:queue:requests -> books_distributed
279
+ # 简化项目名称提取逻辑
266
280
  project_name = "default"
267
281
  if ':' in self.config.queue_name:
268
282
  parts = self.config.queue_name.split(':')
269
- if len(parts) >= 2:
270
- # 处理可能的双重 crawlo 前缀
271
- if parts[0] == "crawlo" and parts[1] == "crawlo":
272
- # 双重 crawlo 前缀,取第三个部分作为项目名称
273
- if len(parts) >= 3:
274
- project_name = parts[2]
275
- else:
276
- project_name = "default"
277
- elif parts[0] == "crawlo":
278
- # 正常的 crawlo 前缀,取第二个部分作为项目名称
279
- project_name = parts[1]
280
- else:
281
- # 没有 crawlo 前缀,使用第一个部分作为项目名称
282
- project_name = parts[0]
283
- else:
284
- project_name = self.config.queue_name or "default"
283
+ # 跳过所有"crawlo"前缀,取第一个非"crawlo"部分作为项目名称
284
+ for part in parts:
285
+ if part != "crawlo":
286
+ project_name = part
287
+ break
285
288
  else:
286
289
  project_name = self.config.queue_name or "default"
287
290
 
@@ -45,7 +45,7 @@ SCHEDULER_MAX_QUEUE_SIZE = 1000
45
45
  SCHEDULER_QUEUE_NAME = f"crawlo:{PROJECT_NAME}:queue:requests"
46
46
 
47
47
  # 队列类型:memory/redis/auto
48
- QUEUE_TYPE = 'memory'
48
+ QUEUE_TYPE = 'auto'
49
49
 
50
50
  # 默认去重管道(根据运行模式自动选择)
51
51
  DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeline'