crawlo 1.1.9__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as published to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo has been flagged as potentially problematic. Click here for more details.
- crawlo/__version__.py +1 -1
- crawlo/core/engine.py +15 -6
- crawlo/core/scheduler.py +7 -0
- crawlo/queue/queue_manager.py +24 -21
- crawlo/settings/default_settings.py +1 -1
- crawlo-1.2.0.dist-info/METADATA +697 -0
- {crawlo-1.1.9.dist-info → crawlo-1.2.0.dist-info}/RECORD +11 -11
- tests/test_queue_manager_double_crawlo.py +13 -70
- crawlo-1.1.9.dist-info/METADATA +0 -626
- {crawlo-1.1.9.dist-info → crawlo-1.2.0.dist-info}/WHEEL +0 -0
- {crawlo-1.1.9.dist-info → crawlo-1.2.0.dist-info}/entry_points.txt +0 -0
- {crawlo-1.1.9.dist-info → crawlo-1.2.0.dist-info}/top_level.txt +0 -0
crawlo/__version__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.
|
|
1
|
+
__version__ = "1.2.0"
|
crawlo/core/engine.py
CHANGED
|
@@ -321,12 +321,21 @@ class Engine(object):
|
|
|
321
321
|
async def _should_exit(self) -> bool:
|
|
322
322
|
"""检查是否应该退出(增强版本)"""
|
|
323
323
|
# 没有启动请求,且所有队列都空闲
|
|
324
|
-
if
|
|
325
|
-
|
|
326
|
-
self.
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
324
|
+
if self.start_requests is None:
|
|
325
|
+
# 使用异步的idle检查方法以获得更精确的结果
|
|
326
|
+
scheduler_idle = await self.scheduler.async_idle() if hasattr(self.scheduler, 'async_idle') else self.scheduler.idle()
|
|
327
|
+
|
|
328
|
+
if (scheduler_idle and
|
|
329
|
+
self.downloader.idle() and
|
|
330
|
+
self.task_manager.all_done() and
|
|
331
|
+
self.processor.idle()):
|
|
332
|
+
# 增加额外检查确保所有任务都完成
|
|
333
|
+
await asyncio.sleep(0.1) # 短暂等待确保没有新的任务加入
|
|
334
|
+
if (await self.scheduler.async_idle() and
|
|
335
|
+
self.downloader.idle() and
|
|
336
|
+
self.task_manager.all_done() and
|
|
337
|
+
self.processor.idle()):
|
|
338
|
+
return True
|
|
330
339
|
|
|
331
340
|
return False
|
|
332
341
|
|
crawlo/core/scheduler.py
CHANGED
|
@@ -114,6 +114,13 @@ class Scheduler:
|
|
|
114
114
|
"""检查队列是否为空"""
|
|
115
115
|
return len(self) == 0
|
|
116
116
|
|
|
117
|
+
async def async_idle(self) -> bool:
|
|
118
|
+
"""异步检查队列是否为空(更精确)"""
|
|
119
|
+
if not self.queue_manager:
|
|
120
|
+
return True
|
|
121
|
+
# 使用队列管理器的异步empty方法
|
|
122
|
+
return await self.queue_manager.async_empty()
|
|
123
|
+
|
|
117
124
|
async def close(self):
|
|
118
125
|
"""关闭调度器"""
|
|
119
126
|
try:
|
crawlo/queue/queue_manager.py
CHANGED
|
@@ -198,13 +198,28 @@ class QueueManager:
|
|
|
198
198
|
return 0
|
|
199
199
|
|
|
200
200
|
def empty(self) -> bool:
|
|
201
|
-
"""
|
|
201
|
+
"""检查队列是否为空(同步版本,用于兼容性)"""
|
|
202
202
|
try:
|
|
203
203
|
# 对于内存队列,可以同步检查
|
|
204
204
|
if self._queue_type == QueueType.MEMORY:
|
|
205
205
|
return self._queue.qsize() == 0
|
|
206
|
-
# 对于 Redis
|
|
207
|
-
|
|
206
|
+
# 对于 Redis 队列,由于需要异步操作,这里返回近似值
|
|
207
|
+
# 为了确保程序能正常退出,我们返回True,让上层通过更精确的异步检查来判断
|
|
208
|
+
return True
|
|
209
|
+
except Exception:
|
|
210
|
+
return True
|
|
211
|
+
|
|
212
|
+
async def async_empty(self) -> bool:
|
|
213
|
+
"""检查队列是否为空(异步版本,更精确)"""
|
|
214
|
+
try:
|
|
215
|
+
# 对于内存队列
|
|
216
|
+
if self._queue_type == QueueType.MEMORY:
|
|
217
|
+
return self._queue.qsize() == 0
|
|
218
|
+
# 对于 Redis 队列,使用异步检查
|
|
219
|
+
elif self._queue_type == QueueType.REDIS:
|
|
220
|
+
size = await self.size()
|
|
221
|
+
return size == 0
|
|
222
|
+
return True
|
|
208
223
|
except Exception:
|
|
209
224
|
return True
|
|
210
225
|
|
|
@@ -261,27 +276,15 @@ class QueueManager:
|
|
|
261
276
|
async def _create_queue(self, queue_type: QueueType):
|
|
262
277
|
"""创建队列实例"""
|
|
263
278
|
if queue_type == QueueType.REDIS:
|
|
264
|
-
#
|
|
265
|
-
# 例如:crawlo:books_distributed:queue:requests -> books_distributed
|
|
279
|
+
# 简化项目名称提取逻辑
|
|
266
280
|
project_name = "default"
|
|
267
281
|
if ':' in self.config.queue_name:
|
|
268
282
|
parts = self.config.queue_name.split(':')
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
if
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
project_name = parts[2]
|
|
275
|
-
else:
|
|
276
|
-
project_name = "default"
|
|
277
|
-
elif parts[0] == "crawlo":
|
|
278
|
-
# 正常的 crawlo 前缀,取第二个部分作为项目名称
|
|
279
|
-
project_name = parts[1]
|
|
280
|
-
else:
|
|
281
|
-
# 没有 crawlo 前缀,使用第一个部分作为项目名称
|
|
282
|
-
project_name = parts[0]
|
|
283
|
-
else:
|
|
284
|
-
project_name = self.config.queue_name or "default"
|
|
283
|
+
# 跳过所有"crawlo"前缀,取第一个非"crawlo"部分作为项目名称
|
|
284
|
+
for part in parts:
|
|
285
|
+
if part != "crawlo":
|
|
286
|
+
project_name = part
|
|
287
|
+
break
|
|
285
288
|
else:
|
|
286
289
|
project_name = self.config.queue_name or "default"
|
|
287
290
|
|
|
@@ -45,7 +45,7 @@ SCHEDULER_MAX_QUEUE_SIZE = 1000
|
|
|
45
45
|
SCHEDULER_QUEUE_NAME = f"crawlo:{PROJECT_NAME}:queue:requests"
|
|
46
46
|
|
|
47
47
|
# 队列类型:memory/redis/auto
|
|
48
|
-
QUEUE_TYPE = '
|
|
48
|
+
QUEUE_TYPE = 'auto'
|
|
49
49
|
|
|
50
50
|
# 默认去重管道(根据运行模式自动选择)
|
|
51
51
|
DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeline'
|