crawlo-1.1.1-py3-none-any.whl → crawlo-1.1.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic.
- crawlo/__init__.py +34 -33
- crawlo/__version__.py +1 -1
- crawlo/cli.py +40 -40
- crawlo/commands/__init__.py +13 -13
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +152 -126
- crawlo/commands/list.py +156 -147
- crawlo/commands/run.py +285 -285
- crawlo/commands/startproject.py +196 -111
- crawlo/commands/stats.py +188 -188
- crawlo/commands/utils.py +187 -0
- crawlo/config.py +280 -0
- crawlo/core/__init__.py +2 -2
- crawlo/core/engine.py +171 -158
- crawlo/core/enhanced_engine.py +190 -0
- crawlo/core/processor.py +40 -40
- crawlo/core/scheduler.py +166 -57
- crawlo/crawler.py +1028 -495
- crawlo/downloader/__init__.py +242 -78
- crawlo/downloader/aiohttp_downloader.py +212 -199
- crawlo/downloader/cffi_downloader.py +251 -241
- crawlo/downloader/httpx_downloader.py +259 -246
- crawlo/event.py +11 -11
- crawlo/exceptions.py +82 -78
- crawlo/extension/__init__.py +31 -31
- crawlo/extension/log_interval.py +49 -49
- crawlo/extension/log_stats.py +44 -44
- crawlo/extension/logging_extension.py +34 -34
- crawlo/filters/__init__.py +154 -37
- crawlo/filters/aioredis_filter.py +242 -150
- crawlo/filters/memory_filter.py +269 -202
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +21 -21
- crawlo/items/fields.py +53 -53
- crawlo/items/items.py +104 -104
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +32 -32
- crawlo/middleware/download_delay.py +28 -28
- crawlo/middleware/middleware_manager.py +135 -135
- crawlo/middleware/proxy.py +248 -245
- crawlo/middleware/request_ignore.py +30 -30
- crawlo/middleware/response_code.py +18 -18
- crawlo/middleware/response_filter.py +26 -26
- crawlo/middleware/retry.py +125 -90
- crawlo/mode_manager.py +201 -0
- crawlo/network/__init__.py +21 -7
- crawlo/network/request.py +311 -203
- crawlo/network/response.py +271 -166
- crawlo/pipelines/__init__.py +22 -13
- crawlo/pipelines/bloom_dedup_pipeline.py +157 -0
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +317 -0
- crawlo/pipelines/database_dedup_pipeline.py +225 -0
- crawlo/pipelines/json_pipeline.py +219 -0
- crawlo/pipelines/memory_dedup_pipeline.py +116 -0
- crawlo/pipelines/mongo_pipeline.py +116 -116
- crawlo/pipelines/mysql_pipeline.py +195 -195
- crawlo/pipelines/pipeline_manager.py +56 -56
- crawlo/pipelines/redis_dedup_pipeline.py +163 -0
- crawlo/project.py +153 -153
- crawlo/queue/__init__.py +0 -0
- crawlo/queue/pqueue.py +37 -0
- crawlo/queue/queue_manager.py +308 -0
- crawlo/queue/redis_priority_queue.py +209 -0
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +245 -167
- crawlo/settings/setting_manager.py +99 -99
- crawlo/spider/__init__.py +639 -129
- crawlo/stats_collector.py +59 -59
- crawlo/subscriber.py +106 -106
- crawlo/task_manager.py +30 -27
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +3 -3
- crawlo/templates/project/items.py.tmpl +17 -17
- crawlo/templates/project/middlewares.py.tmpl +87 -76
- crawlo/templates/project/pipelines.py.tmpl +342 -64
- crawlo/templates/project/run.py.tmpl +252 -0
- crawlo/templates/project/settings.py.tmpl +251 -54
- crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
- crawlo/templates/spider/spider.py.tmpl +178 -32
- crawlo/utils/__init__.py +7 -7
- crawlo/utils/controlled_spider_mixin.py +440 -0
- crawlo/utils/date_tools.py +233 -233
- crawlo/utils/db_helper.py +343 -343
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_config.py +287 -0
- crawlo/utils/large_scale_helper.py +344 -0
- crawlo/utils/log.py +128 -128
- crawlo/utils/queue_helper.py +176 -0
- crawlo/utils/request.py +267 -267
- crawlo/utils/request_serializer.py +220 -0
- crawlo/utils/spider_loader.py +62 -62
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +4 -4
- crawlo/utils/url.py +39 -39
- crawlo-1.1.3.dist-info/METADATA +635 -0
- crawlo-1.1.3.dist-info/RECORD +113 -0
- examples/__init__.py +7 -7
- examples/controlled_spider_example.py +205 -0
- tests/__init__.py +7 -7
- tests/test_final_validation.py +154 -0
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware_integration.py +136 -136
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_redis_config.py +29 -0
- tests/test_redis_queue.py +225 -0
- tests/test_request_serialization.py +71 -0
- tests/test_scheduler.py +242 -0
- crawlo/pipelines/mysql_batch_pipline.py +0 -273
- crawlo/utils/pqueue.py +0 -174
- crawlo-1.1.1.dist-info/METADATA +0 -220
- crawlo-1.1.1.dist-info/RECORD +0 -100
- examples/baidu_spider/__init__.py +0 -7
- examples/baidu_spider/demo.py +0 -94
- examples/baidu_spider/items.py +0 -46
- examples/baidu_spider/middleware.py +0 -49
- examples/baidu_spider/pipeline.py +0 -55
- examples/baidu_spider/run.py +0 -27
- examples/baidu_spider/settings.py +0 -121
- examples/baidu_spider/spiders/__init__.py +0 -7
- examples/baidu_spider/spiders/bai_du.py +0 -61
- examples/baidu_spider/spiders/miit.py +0 -159
- examples/baidu_spider/spiders/sina.py +0 -79
- {crawlo-1.1.1.dist-info → crawlo-1.1.3.dist-info}/WHEEL +0 -0
- {crawlo-1.1.1.dist-info → crawlo-1.1.3.dist-info}/entry_points.txt +0 -0
- {crawlo-1.1.1.dist-info → crawlo-1.1.3.dist-info}/top_level.txt +0 -0
crawlo/utils/queue_helper.py
@@ -0,0 +1,176 @@
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+"""
+Queue configuration helper utilities.
+Provide a concise queue configuration interface for users.
+"""
+from typing import Dict, Any, Optional
+
+
+class QueueHelper:
+    """Queue configuration helper class."""
+
+    @staticmethod
+    def use_memory_queue(max_size: int = 2000) -> Dict[str, Any]:
+        """
+        Configure the in-memory queue.
+
+        Args:
+            max_size: maximum queue capacity
+
+        Returns:
+            A configuration dictionary.
+        """
+        return {
+            'QUEUE_TYPE': 'memory',
+            'SCHEDULER_MAX_QUEUE_SIZE': max_size,
+        }
+
+    @staticmethod
+    def use_redis_queue(
+        host: str = "127.0.0.1",
+        port: int = 6379,
+        password: Optional[str] = None,
+        db: int = 0,
+        queue_name: str = "crawlo:requests",
+        max_retries: int = 3,
+        timeout: int = 300
+    ) -> Dict[str, Any]:
+        """
+        Configure the distributed Redis queue.
+
+        Args:
+            host: Redis host address
+            port: Redis port
+            password: Redis password (optional)
+            db: Redis database number
+            queue_name: queue name
+            max_retries: maximum number of retries
+            timeout: operation timeout in seconds
+
+        Returns:
+            A configuration dictionary.
+        """
+        if password:
+            redis_url = f"redis://:{password}@{host}:{port}/{db}"
+        else:
+            redis_url = f"redis://{host}:{port}/{db}"
+
+        return {
+            'QUEUE_TYPE': 'redis',
+            'REDIS_URL': redis_url,
+            'REDIS_HOST': host,
+            'REDIS_PORT': port,
+            'REDIS_PASSWORD': password or '',
+            'REDIS_DB': db,
+            'SCHEDULER_QUEUE_NAME': queue_name,
+            'QUEUE_MAX_RETRIES': max_retries,
+            'QUEUE_TIMEOUT': timeout,
+        }
+
+    @staticmethod
+    def auto_queue(
+        redis_fallback: bool = True,
+        memory_max_size: int = 2000,
+        **redis_kwargs
+    ) -> Dict[str, Any]:
+        """
+        Configure automatic queue type selection.
+
+        Args:
+            redis_fallback: whether to fall back to the memory queue when Redis is unavailable
+            memory_max_size: maximum capacity of the memory queue
+            **redis_kwargs: Redis configuration parameters
+
+        Returns:
+            A configuration dictionary.
+        """
+        config = {
+            'QUEUE_TYPE': 'auto',
+            'SCHEDULER_MAX_QUEUE_SIZE': memory_max_size,
+        }
+
+        # Add the Redis configuration (used for auto-detection)
+        if redis_kwargs:
+            redis_config = QueueHelper.use_redis_queue(**redis_kwargs)
+            config.update(redis_config)
+            config['QUEUE_TYPE'] = 'auto'  # make sure auto mode is kept
+
+        return config
+
+
+# Predefined common configurations
+class QueuePresets:
+    """Predefined queue configurations."""
+
+    # Development: in-memory queue
+    DEVELOPMENT = QueueHelper.use_memory_queue(max_size=1000)
+
+    # Production: distributed Redis queue
+    PRODUCTION = QueueHelper.use_redis_queue(
+        host="127.0.0.1",
+        port=6379,
+        queue_name="crawlo:production",
+        max_retries=5,
+        timeout=600
+    )
+
+    # Testing: auto-select, falling back to the memory queue when Redis is unavailable
+    TESTING = QueueHelper.auto_queue(
+        redis_fallback=True,
+        memory_max_size=500,
+        host="127.0.0.1",
+        port=6379,
+        queue_name="crawlo:testing"
+    )
+
+    # High performance: Redis cluster
+    HIGH_PERFORMANCE = QueueHelper.use_redis_queue(
+        host="redis-cluster.example.com",
+        port=6379,
+        queue_name="crawlo:cluster",
+        max_retries=10,
+        timeout=300
+    )
+
+
+def apply_queue_config(settings_dict: Dict[str, Any], config: Dict[str, Any]) -> None:
+    """
+    Apply a queue configuration to a settings dictionary.
+
+    Args:
+        settings_dict: the existing settings dictionary
+        config: the queue configuration dictionary
+    """
+    settings_dict.update(config)
+
+
+# Usage examples and documentation
+USAGE_EXAMPLES = """
+# Usage examples:
+
+# 1. Use the memory queue in settings.py
+from crawlo.utils.queue_helper import QueueHelper, apply_queue_config
+apply_queue_config(locals(), QueueHelper.use_memory_queue())
+
+# 2. Use the Redis queue in settings.py
+apply_queue_config(locals(), QueueHelper.use_redis_queue(
+    host="redis.example.com",
+    password="your_password"
+))
+
+# 3. Use a predefined configuration
+from crawlo.utils.queue_helper import QueuePresets
+apply_queue_config(locals(), QueuePresets.PRODUCTION)
+
+# 4. Auto-select the queue type
+apply_queue_config(locals(), QueueHelper.auto_queue(
+    host="127.0.0.1",
+    port=6379
+))
+
+# 5. Configure directly in settings
+QUEUE_TYPE = 'auto'  # 'memory', 'redis', 'auto'
+REDIS_URL = 'redis://127.0.0.1:6379/0'
+SCHEDULER_MAX_QUEUE_SIZE = 2000
+"""