crawlo-1.1.3-py3-none-any.whl → crawlo-1.1.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlo might be problematic.
- crawlo/__init__.py +34 -34
- crawlo/__version__.py +1 -1
- crawlo/cli.py +40 -40
- crawlo/commands/__init__.py +13 -13
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +151 -151
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +285 -285
- crawlo/commands/startproject.py +196 -196
- crawlo/commands/stats.py +188 -188
- crawlo/commands/utils.py +186 -186
- crawlo/config.py +279 -279
- crawlo/core/__init__.py +2 -2
- crawlo/core/engine.py +171 -171
- crawlo/core/enhanced_engine.py +189 -189
- crawlo/core/processor.py +40 -40
- crawlo/core/scheduler.py +165 -165
- crawlo/crawler.py +1027 -1027
- crawlo/downloader/__init__.py +242 -242
- crawlo/downloader/aiohttp_downloader.py +212 -212
- crawlo/downloader/cffi_downloader.py +251 -251
- crawlo/downloader/httpx_downloader.py +259 -259
- crawlo/event.py +11 -11
- crawlo/exceptions.py +81 -81
- crawlo/extension/__init__.py +38 -31
- crawlo/extension/health_check.py +142 -0
- crawlo/extension/log_interval.py +58 -49
- crawlo/extension/log_stats.py +82 -44
- crawlo/extension/logging_extension.py +44 -35
- crawlo/extension/memory_monitor.py +89 -0
- crawlo/extension/performance_profiler.py +118 -0
- crawlo/extension/request_recorder.py +108 -0
- crawlo/filters/__init__.py +154 -154
- crawlo/filters/aioredis_filter.py +241 -241
- crawlo/filters/memory_filter.py +269 -269
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +21 -21
- crawlo/items/fields.py +53 -53
- crawlo/items/items.py +104 -104
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +32 -32
- crawlo/middleware/download_delay.py +28 -28
- crawlo/middleware/middleware_manager.py +135 -135
- crawlo/middleware/proxy.py +248 -248
- crawlo/middleware/request_ignore.py +30 -30
- crawlo/middleware/response_code.py +18 -18
- crawlo/middleware/response_filter.py +26 -26
- crawlo/middleware/retry.py +124 -124
- crawlo/mode_manager.py +200 -200
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +311 -311
- crawlo/network/response.py +271 -271
- crawlo/pipelines/__init__.py +21 -21
- crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +224 -224
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +115 -115
- crawlo/pipelines/mongo_pipeline.py +132 -117
- crawlo/pipelines/mysql_pipeline.py +317 -195
- crawlo/pipelines/pipeline_manager.py +56 -56
- crawlo/pipelines/redis_dedup_pipeline.py +162 -162
- crawlo/project.py +153 -153
- crawlo/queue/pqueue.py +37 -37
- crawlo/queue/queue_manager.py +307 -307
- crawlo/queue/redis_priority_queue.py +208 -208
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +278 -244
- crawlo/settings/setting_manager.py +99 -99
- crawlo/spider/__init__.py +639 -639
- crawlo/stats_collector.py +59 -59
- crawlo/subscriber.py +131 -106
- crawlo/task_manager.py +30 -30
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +3 -3
- crawlo/templates/project/items.py.tmpl +17 -17
- crawlo/templates/project/middlewares.py.tmpl +111 -87
- crawlo/templates/project/pipelines.py.tmpl +97 -341
- crawlo/templates/project/run.py.tmpl +251 -251
- crawlo/templates/project/settings.py.tmpl +279 -250
- crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
- crawlo/templates/spider/spider.py.tmpl +142 -178
- crawlo/utils/__init__.py +7 -7
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/date_tools.py +233 -233
- crawlo/utils/db_helper.py +343 -343
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_config.py +286 -286
- crawlo/utils/large_scale_helper.py +343 -343
- crawlo/utils/log.py +128 -128
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/request.py +267 -267
- crawlo/utils/request_serializer.py +219 -219
- crawlo/utils/spider_loader.py +62 -62
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +4 -4
- crawlo/utils/url.py +39 -39
- crawlo-1.1.4.dist-info/METADATA +403 -0
- crawlo-1.1.4.dist-info/RECORD +117 -0
- examples/__init__.py +7 -7
- examples/controlled_spider_example.py +205 -205
- tests/__init__.py +7 -7
- tests/test_final_validation.py +153 -153
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware_integration.py +136 -136
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_redis_config.py +28 -28
- tests/test_redis_queue.py +224 -224
- tests/test_request_serialization.py +70 -70
- tests/test_scheduler.py +241 -241
- crawlo-1.1.3.dist-info/METADATA +0 -635
- crawlo-1.1.3.dist-info/RECORD +0 -113
- {crawlo-1.1.3.dist-info → crawlo-1.1.4.dist-info}/WHEEL +0 -0
- {crawlo-1.1.3.dist-info → crawlo-1.1.4.dist-info}/entry_points.txt +0 -0
- {crawlo-1.1.3.dist-info → crawlo-1.1.4.dist-info}/top_level.txt +0 -0
crawlo/queue/pqueue.py
CHANGED
@@ -1,37 +1,37 @@
All 37 lines are marked as removed and re-added, but the content rendered on both sides of the diff is identical, so the change is likely limited to whitespace or line endings. The file reads:

# -*- coding:UTF-8 -*-
import json
import sys
import asyncio
from asyncio import PriorityQueue
from typing import Optional


from crawlo import Request


class SpiderPriorityQueue(PriorityQueue):
    """An asynchronous priority queue with timeout support."""

    def __init__(self, maxsize: int = 0) -> None:
        """Initialize the queue; a maxsize of 0 means no size limit."""
        super().__init__(maxsize)

    async def get(self, timeout: float = 0.1) -> Optional[Request]:
        """
        Asynchronously get an item from the queue, with a timeout.

        Args:
            timeout: Timeout in seconds, 0.1 by default.

        Returns:
            A queue item (priority, value), or None on timeout.
        """
        try:
            # Pick the timeout implementation based on the Python version
            if sys.version_info >= (3, 11):
                async with asyncio.timeout(timeout):
                    return await super().get()
            else:
                return await asyncio.wait_for(super().get(), timeout=timeout)
        except asyncio.TimeoutError:
            return None