crawlo-1.1.3-py3-none-any.whl → crawlo-1.1.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic.
- crawlo/__init__.py +34 -34
- crawlo/__version__.py +1 -1
- crawlo/cli.py +40 -40
- crawlo/commands/__init__.py +13 -13
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +151 -151
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +285 -285
- crawlo/commands/startproject.py +196 -196
- crawlo/commands/stats.py +188 -188
- crawlo/commands/utils.py +186 -186
- crawlo/config.py +279 -279
- crawlo/core/__init__.py +2 -2
- crawlo/core/engine.py +171 -171
- crawlo/core/enhanced_engine.py +189 -189
- crawlo/core/processor.py +40 -40
- crawlo/core/scheduler.py +165 -165
- crawlo/crawler.py +1027 -1027
- crawlo/downloader/__init__.py +242 -242
- crawlo/downloader/aiohttp_downloader.py +212 -212
- crawlo/downloader/cffi_downloader.py +251 -251
- crawlo/downloader/httpx_downloader.py +259 -259
- crawlo/event.py +11 -11
- crawlo/exceptions.py +81 -81
- crawlo/extension/__init__.py +38 -31
- crawlo/extension/health_check.py +142 -0
- crawlo/extension/log_interval.py +58 -49
- crawlo/extension/log_stats.py +82 -44
- crawlo/extension/logging_extension.py +44 -35
- crawlo/extension/memory_monitor.py +89 -0
- crawlo/extension/performance_profiler.py +118 -0
- crawlo/extension/request_recorder.py +108 -0
- crawlo/filters/__init__.py +154 -154
- crawlo/filters/aioredis_filter.py +241 -241
- crawlo/filters/memory_filter.py +269 -269
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +21 -21
- crawlo/items/fields.py +53 -53
- crawlo/items/items.py +104 -104
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +32 -32
- crawlo/middleware/download_delay.py +28 -28
- crawlo/middleware/middleware_manager.py +135 -135
- crawlo/middleware/proxy.py +248 -248
- crawlo/middleware/request_ignore.py +30 -30
- crawlo/middleware/response_code.py +18 -18
- crawlo/middleware/response_filter.py +26 -26
- crawlo/middleware/retry.py +124 -124
- crawlo/mode_manager.py +200 -200
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +311 -311
- crawlo/network/response.py +271 -271
- crawlo/pipelines/__init__.py +21 -21
- crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +224 -224
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +115 -115
- crawlo/pipelines/mongo_pipeline.py +132 -117
- crawlo/pipelines/mysql_pipeline.py +317 -195
- crawlo/pipelines/pipeline_manager.py +56 -56
- crawlo/pipelines/redis_dedup_pipeline.py +162 -162
- crawlo/project.py +153 -153
- crawlo/queue/pqueue.py +37 -37
- crawlo/queue/queue_manager.py +307 -307
- crawlo/queue/redis_priority_queue.py +208 -208
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +278 -244
- crawlo/settings/setting_manager.py +99 -99
- crawlo/spider/__init__.py +639 -639
- crawlo/stats_collector.py +59 -59
- crawlo/subscriber.py +131 -106
- crawlo/task_manager.py +30 -30
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +3 -3
- crawlo/templates/project/items.py.tmpl +17 -17
- crawlo/templates/project/middlewares.py.tmpl +111 -87
- crawlo/templates/project/pipelines.py.tmpl +97 -341
- crawlo/templates/project/run.py.tmpl +251 -251
- crawlo/templates/project/settings.py.tmpl +279 -250
- crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
- crawlo/templates/spider/spider.py.tmpl +142 -178
- crawlo/utils/__init__.py +7 -7
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/date_tools.py +233 -233
- crawlo/utils/db_helper.py +343 -343
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_config.py +286 -286
- crawlo/utils/large_scale_helper.py +343 -343
- crawlo/utils/log.py +128 -128
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/request.py +267 -267
- crawlo/utils/request_serializer.py +219 -219
- crawlo/utils/spider_loader.py +62 -62
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +4 -4
- crawlo/utils/url.py +39 -39
- crawlo-1.1.4.dist-info/METADATA +403 -0
- crawlo-1.1.4.dist-info/RECORD +117 -0
- examples/__init__.py +7 -7
- examples/controlled_spider_example.py +205 -205
- tests/__init__.py +7 -7
- tests/test_final_validation.py +153 -153
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware_integration.py +136 -136
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_redis_config.py +28 -28
- tests/test_redis_queue.py +224 -224
- tests/test_request_serialization.py +70 -70
- tests/test_scheduler.py +241 -241
- crawlo-1.1.3.dist-info/METADATA +0 -635
- crawlo-1.1.3.dist-info/RECORD +0 -113
- {crawlo-1.1.3.dist-info → crawlo-1.1.4.dist-info}/WHEEL +0 -0
- {crawlo-1.1.3.dist-info → crawlo-1.1.4.dist-info}/entry_points.txt +0 -0
- {crawlo-1.1.3.dist-info → crawlo-1.1.4.dist-info}/top_level.txt +0 -0
crawlo/pipelines/redis_dedup_pipeline.py
CHANGED
@@ -1,163 +1,163 @@
(Lines 1–162 are shown as removed and re-added with identical text in this diff; the resulting file content is listed once below, with line 163 as unchanged context.)

#!/usr/bin/python
# -*- coding:UTF-8 -*-
"""
基于 Redis 的数据项去重管道
========================
提供分布式环境下的数据项去重功能,防止保存重复的数据记录。

特点:
- 分布式支持: 多节点共享去重数据
- 高性能: 使用 Redis 集合进行快速查找
- 可配置: 支持自定义 Redis 连接参数
- 容错设计: 网络异常时不会丢失数据
"""

import hashlib
from typing import Dict, Any, Optional
import redis

from crawlo import Item
from crawlo.spider import Spider
from crawlo.utils.log import get_logger
from crawlo.exceptions import DropItem


class RedisDedupPipeline:
    """基于 Redis 的数据项去重管道"""

    def __init__(
        self,
        redis_host: str = 'localhost',
        redis_port: int = 6379,
        redis_db: int = 0,
        redis_password: Optional[str] = None,
        redis_key: str = 'crawlo:item_fingerprints',
        log_level: str = "INFO"
    ):
        """
        初始化 Redis 去重管道

        :param redis_host: Redis 主机地址
        :param redis_port: Redis 端口
        :param redis_db: Redis 数据库编号
        :param redis_password: Redis 密码
        :param redis_key: 存储指纹的 Redis 键名
        :param log_level: 日志级别
        """
        self.logger = get_logger(self.__class__.__name__, log_level)

        # 初始化 Redis 连接
        try:
            self.redis_client = redis.Redis(
                host=redis_host,
                port=redis_port,
                db=redis_db,
                password=redis_password,
                decode_responses=True,
                socket_connect_timeout=5,
                socket_timeout=5
            )
            # 测试连接
            self.redis_client.ping()
            self.logger.info(f"Redis 连接成功: {redis_host}:{redis_port}/{redis_db}")
        except Exception as e:
            self.logger.error(f"Redis 连接失败: {e}")
            raise RuntimeError(f"Redis 连接失败: {e}")

        self.redis_key = redis_key
        self.dropped_count = 0

    @classmethod
    def from_crawler(cls, crawler):
        """从爬虫配置创建管道实例"""
        settings = crawler.settings

        return cls(
            redis_host=settings.get('REDIS_HOST', 'localhost'),
            redis_port=settings.getint('REDIS_PORT', 6379),
            redis_db=settings.getint('REDIS_DB', 0),
            redis_password=settings.get('REDIS_PASSWORD') or None,
            redis_key=settings.get('REDIS_DEDUP_KEY', 'crawlo:item_fingerprints'),
            log_level=settings.get('LOG_LEVEL', 'INFO')
        )

    def process_item(self, item: Item, spider: Spider) -> Item:
        """
        处理数据项,进行去重检查

        :param item: 要处理的数据项
        :param spider: 爬虫实例
        :return: 处理后的数据项或抛出 DropItem 异常
        """
        try:
            # 生成数据项指纹
            fingerprint = self._generate_item_fingerprint(item)

            # 使用 Redis 的 SADD 命令检查并添加指纹
            # 如果指纹已存在,SADD 返回 0;如果指纹是新的,SADD 返回 1
            is_new = self.redis_client.sadd(self.redis_key, fingerprint)

            if not is_new:
                # 如果指纹已存在,丢弃这个数据项
                self.dropped_count += 1
                self.logger.debug(f"丢弃重复数据项: {fingerprint[:20]}...")
                raise DropItem(f"重复的数据项: {fingerprint}")
            else:
                # 如果是新数据项,继续处理
                self.logger.debug(f"处理新数据项: {fingerprint[:20]}...")
                return item

        except redis.RedisError as e:
            self.logger.error(f"Redis 错误: {e}")
            # 在 Redis 错误时继续处理,避免丢失数据
            return item
        except Exception as e:
            self.logger.error(f"处理数据项时出错: {e}")
            # 在其他错误时继续处理
            return item

    def _generate_item_fingerprint(self, item: Item) -> str:
        """
        生成数据项指纹

        基于数据项的所有字段生成唯一指纹,用于去重判断。

        :param item: 数据项
        :return: 指纹字符串
        """
        # 将数据项转换为可序列化的字典
        try:
            item_dict = item.to_dict()
        except AttributeError:
            # 兼容没有to_dict方法的Item实现
            item_dict = dict(item)

        # 对字典进行排序以确保一致性
        sorted_items = sorted(item_dict.items())

        # 生成指纹字符串
        fingerprint_string = '|'.join([f"{k}={v}" for k, v in sorted_items if v is not None])

        # 使用 SHA256 生成固定长度的指纹
        return hashlib.sha256(fingerprint_string.encode('utf-8')).hexdigest()

    def close_spider(self, spider: Spider) -> None:
        """
        爬虫关闭时的清理工作

        :param spider: 爬虫实例
        """
        try:
            # 获取去重统计信息
            total_items = self.redis_client.scard(self.redis_key)
            self.logger.info(f"爬虫 {spider.name} 关闭:")
            self.logger.info(f"  - 丢弃的重复数据项: {self.dropped_count}")
            self.logger.info(f"  - Redis 中存储的指纹数: {total_items}")

            # 注意:默认情况下不清理 Redis 中的指纹
            # 如果需要清理,可以在设置中配置
            if spider.crawler.settings.getbool('REDIS_DEDUP_CLEANUP', False):
                deleted = self.redis_client.delete(self.redis_key)
                self.logger.info(f"  - 清理的指纹数: {deleted}")
        except Exception as e:
            self.logger.error(f"关闭爬虫时出错: {e}")
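For orientation, from_crawler above reads its configuration from the crawler settings. A minimal sketch of the corresponding entries in a project's settings.py follows; the REDIS_* and LOG_LEVEL keys are exactly the ones read in from_crawler and close_spider, while the PIPELINES registration key is an assumption about how crawlo enables pipelines and is not shown in this diff.

# Sketch of settings.py entries for RedisDedupPipeline.
# The REDIS_* / LOG_LEVEL keys come from from_crawler above;
# the PIPELINES key is an assumed registration mechanism, not confirmed by this diff.
PIPELINES = [
    'crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeline',
]
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
REDIS_DB = 0
REDIS_PASSWORD = None
REDIS_DEDUP_KEY = 'crawlo:item_fingerprints'
REDIS_DEDUP_CLEANUP = False  # if True, close_spider deletes the fingerprint set
LOG_LEVEL = 'INFO'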
crawlo/project.py
CHANGED
@@ -1,153 +1,153 @@
(All 153 lines are shown as removed and re-added with identical text in this diff; the resulting file content is listed once below.)

#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
Crawlo 项目初始化模块

负责:
1. 向上搜索项目根目录(通过 crawlo.cfg 或 settings.py)
2. 将项目根目录加入 sys.path
3. 加载 settings 模块
4. 返回 SettingManager 实例
"""
import os
import sys
import configparser
from importlib import import_module
from inspect import iscoroutinefunction
from typing import Callable, Optional, Tuple

from crawlo.utils.log import get_logger
from crawlo.settings.setting_manager import SettingManager

logger = get_logger(__name__)


def _find_project_root(start_path: str = ".") -> Optional[str]:
    """
    从指定路径向上查找项目根目录。
    识别依据:
    1. 存在 'crawlo.cfg'
    2. 存在 '__init__.py' 和 'settings.py'(即 Python 包)
    """
    path = os.path.abspath(start_path)
    while True:
        cfg_file = os.path.join(path, "crawlo.cfg")
        if os.path.isfile(cfg_file):
            logger.info(f"✅ 找到项目配置文件: {cfg_file}")
            return path

        settings_file = os.path.join(path, "settings.py")
        init_file = os.path.join(path, "__init__.py")
        if os.path.isfile(settings_file) and os.path.isfile(init_file):
            logger.info(f"✅ 找到项目模块: {path}")
            return path

        parent = os.path.dirname(path)
        if parent == path:
            break
        path = parent

    logger.warning("❌ 未找到 Crawlo 项目根目录。请确保在包含 'crawlo.cfg' 或 'settings.py' 的目录运行。")
    return None


def _get_settings_module_from_cfg(cfg_path: str) -> str:
    """从 crawlo.cfg 读取 settings 模块路径"""
    config = configparser.ConfigParser()
    try:
        config.read(cfg_path, encoding="utf-8")
        if config.has_section("settings") and config.has_option("settings", "default"):
            module_path = config.get("settings", "default")
            logger.info(f"📄 从 crawlo.cfg 加载 settings 模块: {module_path}")
            return module_path
        else:
            raise RuntimeError(f"配置文件缺少 [settings] 或 default 选项: {cfg_path}")
    except Exception as e:
        raise RuntimeError(f"解析 crawlo.cfg 失败: {e}")


def get_settings(custom_settings: Optional[dict] = None) -> SettingManager:
    """
    获取配置管理器实例(主入口函数)

    Args:
        custom_settings: 运行时自定义配置,会覆盖 settings.py

    Returns:
        SettingManager: 已加载配置的实例
    """
    logger.info("🚀 正在初始化 Crawlo 项目配置...")

    # 1. 查找项目根
    project_root = _find_project_root()
    if not project_root:
        raise RuntimeError("未找到 Crawlo 项目,请检查项目结构")

    # 2. 确定 settings 模块
    settings_module_path = None
    cfg_file = os.path.join(project_root, "crawlo.cfg")

    if os.path.isfile(cfg_file):
        settings_module_path = _get_settings_module_from_cfg(cfg_file)
    else:
        # 推断:项目目录名.settings
        project_name = os.path.basename(project_root)
        settings_module_path = f"{project_name}.settings"
        logger.warning(f"⚠️ 未找到 crawlo.cfg,推断 settings 模块为: {settings_module_path}")

    # 3. 注入 sys.path
    project_root_str = os.path.abspath(project_root)
    if project_root_str not in sys.path:
        sys.path.insert(0, project_root_str)
        logger.info(f"📁 项目根目录已加入 sys.path: {project_root_str}")

    # 4. 加载 SettingManager
    logger.info(f"⚙️ 正在加载配置模块: {settings_module_path}")
    settings = SettingManager()

    try:
        settings.set_settings(settings_module_path)
        logger.info("✅ settings 模块加载成功")
    except Exception as e:
        raise ImportError(f"加载 settings 模块失败 '{settings_module_path}': {e}")

    # 5. 合并运行时配置
    if custom_settings:
        settings.update_attributes(custom_settings)
        logger.info(f"🔧 已应用运行时自定义配置: {list(custom_settings.keys())}")

    logger.info("🎉 Crawlo 项目配置初始化完成!")
    return settings


def load_class(_path):
    if not isinstance(_path, str):
        if callable(_path):
            return _path
        else:
            raise TypeError(f"args expect str or object, got {_path}")

    module_name, class_name = _path.rsplit('.', 1)
    module = import_module(module_name)

    try:
        cls = getattr(module, class_name)
    except AttributeError:
        raise NameError(f"Module {module_name!r} has no class named {class_name!r}")
    return cls


def merge_settings(spider, settings):
    spider_name = getattr(spider, 'name', 'UnknownSpider')
    if hasattr(spider, 'custom_settings'):
        custom_settings = getattr(spider, 'custom_settings')
        settings.update_attributes(custom_settings)
    else:
        logger.debug(f"爬虫 '{spider_name}' 无 custom_settings,跳过合并")  # 添加日志


async def common_call(func: Callable, *args, **kwargs):
    if iscoroutinefunction(func):
        return await func(*args, **kwargs)
    else:
        return func(*args, **kwargs)