crawlo 1.1.3__py3-none-any.whl → 1.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic.
- crawlo/__init__.py +28 -1
- crawlo/__version__.py +1 -1
- crawlo/cleaners/__init__.py +61 -0
- crawlo/cleaners/data_formatter.py +226 -0
- crawlo/cleaners/encoding_converter.py +126 -0
- crawlo/cleaners/text_cleaner.py +233 -0
- crawlo/commands/startproject.py +117 -13
- crawlo/config.py +30 -0
- crawlo/config_validator.py +253 -0
- crawlo/core/engine.py +185 -11
- crawlo/core/scheduler.py +49 -78
- crawlo/crawler.py +6 -6
- crawlo/downloader/__init__.py +24 -0
- crawlo/downloader/aiohttp_downloader.py +8 -0
- crawlo/downloader/cffi_downloader.py +5 -0
- crawlo/downloader/hybrid_downloader.py +214 -0
- crawlo/downloader/playwright_downloader.py +403 -0
- crawlo/downloader/selenium_downloader.py +473 -0
- crawlo/extension/__init__.py +17 -10
- crawlo/extension/health_check.py +142 -0
- crawlo/extension/log_interval.py +27 -18
- crawlo/extension/log_stats.py +62 -24
- crawlo/extension/logging_extension.py +18 -9
- crawlo/extension/memory_monitor.py +105 -0
- crawlo/extension/performance_profiler.py +134 -0
- crawlo/extension/request_recorder.py +108 -0
- crawlo/filters/aioredis_filter.py +50 -12
- crawlo/middleware/proxy.py +26 -2
- crawlo/mode_manager.py +24 -19
- crawlo/network/request.py +30 -3
- crawlo/network/response.py +114 -25
- crawlo/pipelines/mongo_pipeline.py +81 -66
- crawlo/pipelines/mysql_pipeline.py +165 -43
- crawlo/pipelines/redis_dedup_pipeline.py +7 -3
- crawlo/queue/queue_manager.py +15 -2
- crawlo/queue/redis_priority_queue.py +144 -76
- crawlo/settings/default_settings.py +93 -121
- crawlo/subscriber.py +62 -37
- crawlo/templates/project/items.py.tmpl +1 -1
- crawlo/templates/project/middlewares.py.tmpl +73 -49
- crawlo/templates/project/pipelines.py.tmpl +51 -295
- crawlo/templates/project/settings.py.tmpl +93 -17
- crawlo/templates/project/settings_distributed.py.tmpl +120 -0
- crawlo/templates/project/settings_gentle.py.tmpl +95 -0
- crawlo/templates/project/settings_high_performance.py.tmpl +152 -0
- crawlo/templates/project/settings_simple.py.tmpl +69 -0
- crawlo/templates/spider/spider.py.tmpl +2 -38
- crawlo/tools/__init__.py +183 -0
- crawlo/tools/anti_crawler.py +269 -0
- crawlo/tools/authenticated_proxy.py +241 -0
- crawlo/tools/data_validator.py +181 -0
- crawlo/tools/date_tools.py +36 -0
- crawlo/tools/distributed_coordinator.py +387 -0
- crawlo/tools/retry_mechanism.py +221 -0
- crawlo/tools/scenario_adapter.py +263 -0
- crawlo/utils/__init__.py +29 -1
- crawlo/utils/batch_processor.py +261 -0
- crawlo/utils/date_tools.py +58 -1
- crawlo/utils/enhanced_error_handler.py +360 -0
- crawlo/utils/env_config.py +106 -0
- crawlo/utils/error_handler.py +126 -0
- crawlo/utils/performance_monitor.py +285 -0
- crawlo/utils/redis_connection_pool.py +335 -0
- crawlo/utils/redis_key_validator.py +200 -0
- crawlo-1.1.5.dist-info/METADATA +401 -0
- crawlo-1.1.5.dist-info/RECORD +185 -0
- tests/advanced_tools_example.py +276 -0
- tests/authenticated_proxy_example.py +237 -0
- tests/cleaners_example.py +161 -0
- tests/config_validation_demo.py +103 -0
- tests/date_tools_example.py +181 -0
- tests/dynamic_loading_example.py +524 -0
- tests/dynamic_loading_test.py +105 -0
- tests/env_config_example.py +134 -0
- tests/error_handling_example.py +172 -0
- tests/redis_key_validation_demo.py +131 -0
- tests/response_improvements_example.py +145 -0
- tests/test_advanced_tools.py +149 -0
- tests/test_all_redis_key_configs.py +146 -0
- tests/test_authenticated_proxy.py +142 -0
- tests/test_cleaners.py +55 -0
- tests/test_comprehensive.py +147 -0
- tests/test_config_validator.py +194 -0
- tests/test_date_tools.py +124 -0
- tests/test_dynamic_downloaders_proxy.py +125 -0
- tests/test_dynamic_proxy.py +93 -0
- tests/test_dynamic_proxy_config.py +147 -0
- tests/test_dynamic_proxy_real.py +110 -0
- tests/test_edge_cases.py +304 -0
- tests/test_enhanced_error_handler.py +271 -0
- tests/test_env_config.py +122 -0
- tests/test_error_handler_compatibility.py +113 -0
- tests/test_framework_env_usage.py +104 -0
- tests/test_integration.py +357 -0
- tests/test_item_dedup_redis_key.py +123 -0
- tests/test_parsel.py +30 -0
- tests/test_performance.py +328 -0
- tests/test_queue_manager_redis_key.py +177 -0
- tests/test_redis_connection_pool.py +295 -0
- tests/test_redis_key_naming.py +182 -0
- tests/test_redis_key_validator.py +124 -0
- tests/test_response_improvements.py +153 -0
- tests/test_simple_response.py +62 -0
- tests/test_telecom_spider_redis_key.py +206 -0
- tests/test_template_content.py +88 -0
- tests/test_template_redis_key.py +135 -0
- tests/test_tools.py +154 -0
- tests/tools_example.py +258 -0
- crawlo/core/enhanced_engine.py +0 -190
- crawlo-1.1.3.dist-info/METADATA +0 -635
- crawlo-1.1.3.dist-info/RECORD +0 -113
- {crawlo-1.1.3.dist-info → crawlo-1.1.5.dist-info}/WHEEL +0 -0
- {crawlo-1.1.3.dist-info → crawlo-1.1.5.dist-info}/entry_points.txt +0 -0
- {crawlo-1.1.3.dist-info → crawlo-1.1.5.dist-info}/top_level.txt +0 -0
- {examples → tests}/controlled_spider_example.py +0 -0
crawlo/filters/aioredis_filter.py
CHANGED

@@ -16,6 +16,7 @@ from typing import Optional
 from crawlo.filters import BaseFilter
 from crawlo.utils.log import get_logger
 from crawlo.utils.request import request_fingerprint
+from crawlo.utils.redis_connection_pool import get_redis_pool
 
 
 class AioRedisFilter(BaseFilter):
@@ -48,7 +49,7 @@ class AioRedisFilter(BaseFilter):
         Initialize the Redis filter
 
         :param redis_key: Redis key under which fingerprints are stored
-        :param client: Redis
+        :param client: Redis client instance (may be None and initialized later)
         :param stats: statistics store
         :param debug: whether to enable debug mode
         :param log_level: log level
@@ -63,6 +64,9 @@ class AioRedisFilter(BaseFilter):
         self.cleanup_fp = cleanup_fp
         self.ttl = ttl
 
+        # Keep a reference to the connection pool (for lazy initialization)
+        self._redis_pool = None
+
         # Performance counters
         self._redis_operations = 0
         self._pipeline_operations = 0
@@ -80,17 +84,30 @@ class AioRedisFilter(BaseFilter):
         ttl = max(0, int(ttl_setting)) if ttl_setting > 0 else None
 
         try:
-
+            # Use the optimized connection pool
+            redis_pool = get_redis_pool(
                 redis_url,
-                decode_responses=decode_responses,
                 max_connections=20,
+                socket_connect_timeout=5,
+                socket_timeout=30,
+                health_check_interval=30,
+                retry_on_timeout=True,
+                decode_responses=decode_responses,
                 encoding='utf-8'
             )
+
+            # Note: await must not be used here, because create_instance is not an async method
+            # The connection is acquired when it is actually used
+            redis_client = None  # lazy initialization
         except Exception as e:
-            raise RuntimeError(f"Redis
+            raise RuntimeError(f"Redis connection pool initialization failed: {redis_url} - {str(e)}")
 
-
-
+        # Use the unified Redis key naming convention: crawlo:{project_name}:filter:fingerprint
+        project_name = crawler.settings.get('PROJECT_NAME', 'default')
+        redis_key = f"crawlo:{project_name}:filter:fingerprint"
+
+        instance = cls(
+            redis_key=redis_key,
             client=redis_client,
             stats=crawler.stats,
             cleanup_fp=crawler.settings.get_bool('CLEANUP_FP', False),
@@ -98,6 +115,16 @@ class AioRedisFilter(BaseFilter):
             debug=crawler.settings.get_bool('FILTER_DEBUG', False),
             log_level=crawler.settings.get('LOG_LEVEL', 'INFO')
         )
+
+        # Keep a reference to the connection pool so a connection can be fetched when needed
+        instance._redis_pool = redis_pool
+        return instance
+
+    async def _get_redis_client(self):
+        """Get the Redis client instance (lazy initialization)."""
+        if self.redis is None and self._redis_pool is not None:
+            self.redis = await self._redis_pool.get_connection()
+        return self.redis
 
     async def requested(self, request) -> bool:
         """
@@ -107,6 +134,9 @@ class AioRedisFilter(BaseFilter):
         :return: True means duplicate, False means a new request
         """
         try:
+            # Make sure the Redis client is initialized
+            await self._get_redis_client()
+
             fp = str(request_fingerprint(request))
             self._redis_operations += 1
 
@@ -141,6 +171,9 @@ class AioRedisFilter(BaseFilter):
         :return: whether the add succeeded (True means newly added, False means it already existed)
         """
         try:
+            # Make sure the Redis client is initialized
+            await self._get_redis_client()
+
             fp = str(fp)
 
             # Use a pipeline to optimize performance
@@ -178,6 +211,9 @@ class AioRedisFilter(BaseFilter):
     async def get_stats(self) -> dict:
         """Get detailed filter statistics."""
         try:
+            # Make sure the Redis client is initialized
+            await self._get_redis_client()
+
             count = await self.redis.scard(self.redis_key)
 
             # Get TTL information
@@ -212,6 +248,9 @@ class AioRedisFilter(BaseFilter):
     async def clear_all(self) -> int:
         """Clear all fingerprint data."""
         try:
+            # Make sure the Redis client is initialized
+            await self._get_redis_client()
+
             deleted = await self.redis.delete(self.redis_key)
             self.logger.info(f"Cleared fingerprints: {deleted}")
             return deleted
@@ -222,6 +261,9 @@ class AioRedisFilter(BaseFilter):
     async def closed(self, reason: Optional[str] = None) -> None:
         """Cleanup when the spider closes."""
         try:
+            # Make sure the Redis client is initialized
+            await self._get_redis_client()
+
             if self.cleanup_fp:
                 deleted = await self.redis.delete(self.redis_key)
                 self.logger.info(f"Spider close cleanup: deleted {deleted} fingerprints")
@@ -234,9 +276,5 @@ class AioRedisFilter(BaseFilter):
 
     async def _close_redis(self) -> None:
         """Safely close the Redis connection."""
-
-
-            await self.redis.close()
-            self.logger.debug("Redis connection closed")
-        except Exception as e:
-            self.logger.warning(f"Error while closing Redis: {e}")
+        # The connection pool manages connections automatically; no explicit close is needed here
+        self.logger.debug("Redis connection released")
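The filter above builds the pool synchronously inside create_instance and only awaits a real connection on first use. Below is a minimal sketch of that lazy-initialization pattern, reusing only the calls visible in the diff (get_redis_pool, get_connection, scard); the class name, the URL, and the assumption that the keyword arguments other than the URL are optional are illustrative, not taken from the package.

# Sketch of the lazy-initialization pattern used by AioRedisFilter (assumptions noted above).
from crawlo.utils.redis_connection_pool import get_redis_pool


class FingerprintCounter:
    def __init__(self, redis_url: str, redis_key: str):
        # Synchronous context (like create_instance): build the pool, not the connection.
        self._pool = get_redis_pool(redis_url, max_connections=20, decode_responses=True)
        self._redis = None
        self._redis_key = redis_key

    async def _get_redis_client(self):
        # The first awaited call creates the actual connection.
        if self._redis is None:
            self._redis = await self._pool.get_connection()
        return self._redis

    async def count(self) -> int:
        client = await self._get_redis_client()
        return await client.scard(self._redis_key)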
crawlo/middleware/proxy.py
CHANGED

@@ -173,7 +173,7 @@ class ProxyMiddleware:
             if isinstance(result, str) and result.strip():
                 return result.strip()
             elif isinstance(result, dict):
-                cleaned = {k: v.strip() for k, v in result.items()
+                cleaned = {k: v.strip() if isinstance(v, str) else v for k, v in result.items()}
                 return cleaned if cleaned else None
             return None
         except Exception as e:
@@ -225,7 +225,31 @@
 
         proxy = await self._get_cached_proxy()
         if proxy:
-
+            # Handle proxy URLs that carry authentication
+            if isinstance(proxy, str) and "@" in proxy and "://" in proxy:
+                # Parse the authenticated proxy URL
+                parsed = urlparse(proxy)
+                if parsed.username and parsed.password:
+                    # The AioHttp downloader needs the credentials handled separately
+                    downloader_type = spider.crawler.settings.get("DOWNLOADER_TYPE", "aiohttp")
+                    if downloader_type == "aiohttp":
+                        # Store the credentials in meta; the downloader applies them
+                        request.meta["proxy_auth"] = {
+                            "username": parsed.username,
+                            "password": parsed.password
+                        }
+                        # Strip the credentials from the URL
+                        clean_proxy = f"{parsed.scheme}://{parsed.hostname}"
+                        if parsed.port:
+                            clean_proxy += f":{parsed.port}"
+                        request.proxy = clean_proxy
+                    else:
+                        # Other downloaders can use the authenticated URL directly
+                        request.proxy = proxy
+                else:
+                    request.proxy = proxy
+            else:
+                request.proxy = proxy
             self.logger.info(f"Assigned proxy → {proxy} | {request.url}")
         else:
             self.logger.warning(f"No proxy available, sending request directly: {request.url}")
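The middleware now splits credentials out of user:pass@host:port proxy URLs when the aiohttp downloader is in use. A standalone sketch of that parsing step, using only the standard library; the helper name and sample values are hypothetical.

from urllib.parse import urlparse


def split_proxy_auth(proxy: str):
    """Split 'scheme://user:pass@host:port' into (clean_url, auth_dict_or_None)."""
    if "@" not in proxy or "://" not in proxy:
        return proxy, None
    parsed = urlparse(proxy)
    if not (parsed.username and parsed.password):
        return proxy, None
    clean = f"{parsed.scheme}://{parsed.hostname}"
    if parsed.port:
        clean += f":{parsed.port}"
    return clean, {"username": parsed.username, "password": parsed.password}


print(split_proxy_auth("http://alice:secret@10.0.0.1:8080"))
# ('http://10.0.0.1:8080', {'username': 'alice', 'password': 'secret'})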
crawlo/mode_manager.py
CHANGED

@@ -63,8 +63,9 @@ class ModeManager:
             'REDIS_PORT': redis_port,
             'REDIS_PASSWORD': redis_password,
             'REDIS_URL': redis_url,
-            'SCHEDULER_QUEUE_NAME': f'{project_name}:requests',
-
+            'SCHEDULER_QUEUE_NAME': f'crawlo:{project_name}:queue:requests',  # unified naming convention
+            # Redis key configuration has moved into the individual components and follows the unified convention
+            # crawlo:{project_name}:filter:fingerprint (request deduplication)
             'CONCURRENCY': 16,
             'MAX_RUNNING_SPIDERS': 1,
             'DOWNLOAD_DELAY': 1.0,
@@ -181,21 +182,25 @@ def auto_mode(**kwargs) -> Dict[str, Any]:
 # Environment variable support
 def from_env(default_mode: str = 'standalone') -> Dict[str, Any]:
     """Create a configuration from environment variables."""
-
+    # Direct use of os.getenv() has been removed; these parameters must be configured in settings
+    raise RuntimeError("Environment variable configuration has been removed; configure these parameters in settings")
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # The original code is kept below for reference
+    # mode = os.getenv('CRAWLO_MODE', default_mode).lower()
+    #
+    # if mode == 'distributed':
+    #     return distributed_mode(
+    #         redis_host=os.getenv('REDIS_HOST', '127.0.0.1'),
+    #         redis_port=int(os.getenv('REDIS_PORT', 6379)),
+    #         redis_password=os.getenv('REDIS_PASSWORD'),
+    #         project_name=os.getenv('PROJECT_NAME', 'crawlo'),
+    #         CONCURRENCY=int(os.getenv('CONCURRENCY', 16)),
+    #     )
+    # elif mode == 'auto':
+    #     return auto_mode(
+    #         CONCURRENCY=int(os.getenv('CONCURRENCY', 12)),
+    #     )
+    # else:  # standalone
+    #     return standalone_mode(
+    #         CONCURRENCY=int(os.getenv('CONCURRENCY', 8)),
+    #     )
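The distributed preset now derives SCHEDULER_QUEUE_NAME from the unified crawlo:{project_name}:{component}:{name} scheme instead of the old {project_name}:requests form. A tiny sketch of composing keys under that convention; the helper function is hypothetical and only the two key layouts shown in the diff are assumed.

def crawlo_key(project_name: str, component: str, name: str) -> str:
    """Compose a Redis key following the crawlo:{project}:{component}:{name} convention."""
    return f"crawlo:{project_name}:{component}:{name}"


print(crawlo_key("my_project", "queue", "requests"))      # crawlo:my_project:queue:requests
print(crawlo_key("my_project", "filter", "fingerprint"))  # crawlo:my_project:filter:fingerprint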
crawlo/network/request.py
CHANGED

@@ -76,7 +76,9 @@ class Request:
         'verify',
         'flags',
         '_json_body',
-        '_form_data'
+        '_form_data',
+        'use_dynamic_loader',
+        'dynamic_loader_options'
     )
 
     def __init__(
@@ -99,7 +101,10 @@
             auth: Optional[tuple] = None,
             verify: bool = True,
             flags: Optional[List[str]] = None,
-            encoding: str = 'utf-8'
+            encoding: str = 'utf-8',
+            # dynamic-loading parameters
+            use_dynamic_loader: bool = False,
+            dynamic_loader_options: Optional[Dict[str, Any]] = None
     ):
         """
         Initialize the request object.
@@ -145,6 +150,10 @@
         # Keep the high-level semantic parameters (used by copy)
         self._json_body = json_body
         self._form_data = form_data
+
+        # Dynamic-loading attributes
+        self.use_dynamic_loader = use_dynamic_loader
+        self.dynamic_loader_options = dynamic_loader_options or {}
 
         # Build the body
         if json_body is not None:
@@ -228,7 +237,9 @@
             auth=self.auth,
             verify=self.verify,
             flags=self.flags.copy(),
-            encoding=self.encoding
+            encoding=self.encoding,
+            use_dynamic_loader=self.use_dynamic_loader,
+            dynamic_loader_options=deepcopy(self.dynamic_loader_options)
         )
 
     def set_meta(self, key: str, value: Any) -> 'Request':
@@ -267,6 +278,22 @@
         if flag in self.flags:
             self.flags.remove(flag)
         return self
+
+    def set_dynamic_loader(self, use_dynamic: bool = True, options: Optional[Dict[str, Any]] = None) -> 'Request':
+        """Enable the dynamic loader; supports method chaining."""
+        self.use_dynamic_loader = use_dynamic
+        if options:
+            self.dynamic_loader_options = options
+        # Also set a flag in meta for the hybrid downloader
+        self._meta['use_dynamic_loader'] = use_dynamic
+        return self
+
+    def set_protocol_loader(self) -> 'Request':
+        """Force the protocol loader; supports method chaining."""
+        self.use_dynamic_loader = False
+        self._meta['use_dynamic_loader'] = False
+        self._meta['use_protocol_loader'] = True
+        return self
 
     def _set_url(self, url: str) -> None:
         """Safely set the URL, ensuring the format is correct."""
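Request gains use_dynamic_loader/dynamic_loader_options plus chainable setters that mirror the flag into meta for the hybrid downloader. A usage sketch under two assumptions not shown in the diff: that the URL is the first constructor argument, and that request.meta exposes the underlying _meta dict (as the proxy middleware's request.meta usage suggests). The URL and option keys are illustrative.

from crawlo.network.request import Request

# Ask the hybrid downloader to render this page with the dynamic loader.
# The option keys ("wait_for", "timeout") are illustrative; the diff defines the
# options dict itself, not its schema.
req = Request("https://example.com/js-heavy-page").set_dynamic_loader(
    True, options={"wait_for": "#content", "timeout": 30}
)
print(req.use_dynamic_loader)            # True
print(req.meta["use_dynamic_loader"])    # True

# Force the plain protocol downloader instead.
plain = Request("https://example.com/api").set_protocol_loader()
print(plain.meta["use_protocol_loader"])  # True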
crawlo/network/response.py
CHANGED

@@ -196,50 +196,139 @@ class Response:
         """Query the document with a CSS selector."""
         return self._selector.css(query)
 
-    def
-        """
-
-        return " ".join(text.strip() for text in fragments if text.strip())
+    def _is_xpath(self, query: str) -> bool:
+        """Determine whether the query string is XPath."""
+        return query.startswith(('/', '//', './'))
 
-    def
-        """
-
-
-
-
+    def _extract_text_from_elements(self, elements: SelectorList, join_str: str = " ") -> str:
+        """
+        Extract text from a list of elements and join it.
+
+        :param elements: SelectorList of elements
+        :param join_str: separator used to join the text
+        :return: the joined text
+        """
+        texts = []
+        for element in elements:
+            # Get all text nodes of the element
+            if hasattr(element, 'xpath'):
+                element_texts = element.xpath('.//text()').getall()
+            else:
+                element_texts = [str(element)]
+            # Clean up and keep non-empty text
+            for text in element_texts:
+                cleaned = text.strip()
+                if cleaned:
+                    texts.append(cleaned)
+        return join_str.join(texts)
+
+    def extract_text(self, xpath_or_css: str, join_str: str = " ", default: str = '') -> str:
         """
-
+        Extract the text content of a single element; supports CSS and XPath selectors
 
         Parameters:
             xpath_or_css: XPath or CSS selector
             join_str: separator used to join the text (default: space)
+            default: value returned when no element is found
 
         Returns:
             the joined plain-text string
         """
-
-
-
+        try:
+            elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
+            if not elements:
+                return default
+            return self._extract_text_from_elements(elements, join_str)
+        except Exception:
+            return default
 
-    def
+    def extract_texts(self, xpath_or_css: str, join_str: str = " ", default: List[str] = None) -> List[str]:
         """
-
+        Extract a list of text contents from multiple elements; supports CSS and XPath selectors
 
         Parameters:
             xpath_or_css: XPath or CSS selector
             join_str: separator used to join the text within a single node
+            default: value returned when no element is found
 
         Returns:
             a list of plain-text strings (one per node)
         """
-
-
-
-
-
-            if
-
-
+        if default is None:
+            default = []
+
+        try:
+            elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
+            if not elements:
+                return default
+
+            result = []
+            for element in elements:
+                # Extract the text of each element
+                if hasattr(element, 'xpath'):
+                    texts = element.xpath('.//text()').getall()
+                else:
+                    texts = [str(element)]
+
+                # Clean the text and join it
+                clean_texts = [text.strip() for text in texts if text.strip()]
+                if clean_texts:
+                    result.append(join_str.join(clean_texts))
+
+            return result if result else default
+        except Exception:
+            return default
+
+    def extract_attr(self, xpath_or_css: str, attr_name: str, default: Any = None) -> Any:
+        """
+        Extract the attribute value of a single element; supports CSS and XPath selectors
+
+        Parameters:
+            xpath_or_css: XPath or CSS selector
+            attr_name: attribute name
+            default: default return value
+
+        Returns:
+            the attribute value or the default
+        """
+        try:
+            elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
+            if not elements:
+                return default
+            return elements.attrib.get(attr_name, default)
+        except Exception:
+            return default
+
+    def extract_attrs(self, xpath_or_css: str, attr_name: str, default: List[Any] = None) -> List[Any]:
+        """
+        Extract a list of attribute values from multiple elements; supports CSS and XPath selectors
+
+        Parameters:
+            xpath_or_css: XPath or CSS selector
+            attr_name: attribute name
+            default: default return value
+
+        Returns:
+            a list of attribute values
+        """
+        if default is None:
+            default = []
+
+        try:
+            elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
+            if not elements:
+                return default
+
+            result = []
+            for element in elements:
+                if hasattr(element, 'attrib'):
+                    attr_value = element.attrib.get(attr_name)
+                    if attr_value is not None:
+                        result.append(attr_value)
+
+            return result if result else default
+        except Exception:
+            return default
 
     def re_search(self, pattern: str, flags: int = re.DOTALL) -> Optional[re.Match]:
         """Run a regular expression search on the response text."""
@@ -268,4 +357,4 @@
         return self.request.meta if self.request else {}
 
     def __str__(self):
-        return f"<{self.status_code} {self.url}>"
+        return f"<{self.status_code} {self.url}>"
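The new extract_text/extract_attr helpers dispatch on _is_xpath (a query starting with /, //, or ./ is treated as XPath) and fall back to a default instead of raising. The same extraction logic can be sketched directly on parsel, the selector library the Response selector wraps (the release's tests/test_parsel.py also exercises it); the sample HTML below is illustrative.

from parsel import Selector

html = '<div class="card"><h2>Title</h2><a href="/more">Read <b>more</b></a></div>'
sel = Selector(text=html)


def is_xpath(query: str) -> bool:
    # Same heuristic as Response._is_xpath in the diff
    return query.startswith(('/', '//', './'))


def extract_text(query: str, join_str: str = " ", default: str = "") -> str:
    elements = sel.xpath(query) if is_xpath(query) else sel.css(query)
    if not elements:
        return default
    texts = [t.strip() for t in elements.xpath('.//text()').getall() if t.strip()]
    return join_str.join(texts) if texts else default


print(extract_text("div.card a"))                # "Read more"
print(extract_text("//div[@class='card']/h2"))   # "Title"
print(sel.css("a").attrib.get("href"))           # "/more"  (what extract_attr returns)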
crawlo/pipelines/mongo_pipeline.py
CHANGED

@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-from typing import Optional
+from typing import Optional, List, Dict
 from motor.motor_asyncio import AsyncIOMotorClient
 from pymongo.errors import PyMongoError
 from crawlo.utils.log import get_logger
@@ -21,6 +21,17 @@ class MongoPipeline:
         self.mongo_uri = self.settings.get('MONGO_URI', 'mongodb://localhost:27017')
         self.db_name = self.settings.get('MONGO_DATABASE', 'scrapy_db')
         self.collection_name = self.settings.get('MONGO_COLLECTION', crawler.spider.name)
+
+        # Connection pool configuration
+        self.max_pool_size = self.settings.getint('MONGO_MAX_POOL_SIZE', 100)
+        self.min_pool_size = self.settings.getint('MONGO_MIN_POOL_SIZE', 10)
+        self.connect_timeout_ms = self.settings.getint('MONGO_CONNECT_TIMEOUT_MS', 5000)
+        self.socket_timeout_ms = self.settings.getint('MONGO_SOCKET_TIMEOUT_MS', 30000)
+
+        # Batch insert configuration
+        self.batch_size = self.settings.getint('MONGO_BATCH_SIZE', 100)
+        self.use_batch = self.settings.getbool('MONGO_USE_BATCH', False)
+        self.batch_buffer: List[Dict] = []  # batch buffer
 
         # Register the close event
         crawler.subscriber.subscribe(self.spider_closed, event='spider_closed')
@@ -32,86 +43,90 @@
     async def _ensure_connection(self):
         """Make sure the connection has been established."""
         if self.client is None:
-
+            # Create the client with the connection pool configuration
+            self.client = AsyncIOMotorClient(
+                self.mongo_uri,
+                maxPoolSize=self.max_pool_size,
+                minPoolSize=self.min_pool_size,
+                connectTimeoutMS=self.connect_timeout_ms,
+                socketTimeoutMS=self.socket_timeout_ms
+            )
             self.db = self.client[self.db_name]
             self.collection = self.db[self.collection_name]
             self.logger.info(f"MongoDB connection established (collection: {self.collection_name})")
 
     async def process_item(self, item, spider) -> Optional[dict]:
-        """Process item
-
-
-
-
-
-
-
-
-            self.logger.debug(f"Inserted document ID: {result.inserted_id}")
-
+        """Core item-processing method (with retry)."""
+        # If batch insertion is enabled, add the item to the buffer
+        if self.use_batch:
+            self.batch_buffer.append(dict(item))
+
+            # If the buffer has reached the batch size, perform a batch insert
+            if len(self.batch_buffer) >= self.batch_size:
+                await self._flush_batch(spider)
+
             return item
+        else:
+            # Single-insert logic
+            try:
+                await self._ensure_connection()
+
+                item_dict = dict(item)
+
+                # Insert with retry
+                for attempt in range(3):
+                    try:
+                        result = await self.collection.insert_one(item_dict)
+                        # Use the unified insert_success stats key
+                        self.crawler.stats.inc_value('mongodb/insert_success')
+                        self.logger.debug(f"Insert succeeded [attempt {attempt + 1}]: {result.inserted_id}")
+                        return item
+                    except PyMongoError as e:
+                        if attempt == 2:  # still failed on the final attempt
+                            raise
+                        self.logger.warning(f"Retrying insert [attempt {attempt + 1}]: {e}")
+
+            except Exception as e:
+                # Use the unified insert_failed stats key
+                self.crawler.stats.inc_value('mongodb/insert_failed')
+                self.logger.error(f"MongoDB operation finally failed: {e}")
+                raise ItemDiscard(f"MongoDB operation failed: {e}")
+
+    async def _flush_batch(self, spider):
+        """Flush the batch buffer and perform a batch insert."""
+        if not self.batch_buffer:
+            return
 
-        except Exception as e:
-            self.crawler.stats.inc_value('mongodb/failed')
-            self.logger.error(f"MongoDB insert failed: {e}")
-            raise ItemDiscard(f"MongoDB operation failed: {e}")
-
-    async def spider_closed(self):
-        """Clean up resources when the spider closes."""
-        if self.client:
-            self.client.close()
-            self.logger.info("MongoDB connection closed")
-
-
-class MongoPoolPipeline:
-    def __init__(self, crawler):
-        self.crawler = crawler
-        self.settings = crawler.settings
-        self.logger = get_logger(self.__class__.__name__, self.settings.get('LOG_LEVEL'))
-
-        # Connection pool configuration
-        self.client = AsyncIOMotorClient(
-            self.settings.get('MONGO_URI', 'mongodb://localhost:27017'),
-            maxPoolSize=self.settings.getint('MONGO_MAX_POOL_SIZE', 100),
-            minPoolSize=self.settings.getint('MONGO_MIN_POOL_SIZE', 10),
-            connectTimeoutMS=5000,
-            socketTimeoutMS=30000
-        )
-
-        self.db = self.client[self.settings.get('MONGO_DATABASE', 'scrapy_db')]
-        self.collection = self.db[self.settings.get('MONGO_COLLECTION', crawler.spider.name)]
-
-        crawler.subscriber.subscribe(self.spider_closed, event='spider_closed')
-        self.logger.info(f"MongoDB connection pool initialized (collection: {self.collection.name})")
-
-    @classmethod
-    def create_instance(cls, crawler):
-        return cls(crawler)
-
-    async def process_item(self, item, spider) -> Optional[dict]:
-        """Item-processing method (with retry)."""
         try:
-
+            await self._ensure_connection()
 
-            #
+            # Batch insert with retry
             for attempt in range(3):
                 try:
-                    result = await self.collection.
-
-
-
+                    result = await self.collection.insert_many(self.batch_buffer, ordered=False)
+                    # Use the unified insert_success stats key
+                    inserted_count = len(result.inserted_ids)
+                    self.crawler.stats.inc_value('mongodb/insert_success', inserted_count)
+                    self.logger.debug(f"Batch insert succeeded [attempt {attempt + 1}]: {inserted_count} records")
+                    self.batch_buffer.clear()
+                    return
                 except PyMongoError as e:
                     if attempt == 2:  # still failed on the final attempt
                         raise
-                    self.logger.warning(f"
-
+                    self.logger.warning(f"Retrying batch insert [attempt {attempt + 1}]: {e}")
        except Exception as e:
-
-            self.
-
+            # Use the unified insert_failed stats key
+            failed_count = len(self.batch_buffer)
+            self.crawler.stats.inc_value('mongodb/insert_failed', failed_count)
+            self.logger.error(f"MongoDB batch insert finally failed: {e}")
+            raise ItemDiscard(f"MongoDB batch insert failed: {e}")
 
     async def spider_closed(self):
-        """
-
+        """Clean up resources when the spider closes."""
+        # Flush any remaining batched data before closing
+        if self.use_batch and self.batch_buffer:
+            await self._flush_batch(self.crawler.spider)
+
+        if self.client:
             self.client.close()
-            self.logger.info("MongoDB
+            self.logger.info("MongoDB connection closed")