crawlo-1.4.6-py3-none-any.whl → crawlo-1.4.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- crawlo/__init__.py +2 -1
- crawlo/__version__.py +1 -1
- crawlo/cli.py +2 -2
- crawlo/commands/check.py +1 -1
- crawlo/commands/help.py +5 -3
- crawlo/commands/list.py +1 -1
- crawlo/commands/run.py +49 -11
- crawlo/commands/stats.py +1 -1
- crawlo/config.py +12 -4
- crawlo/config_validator.py +1 -1
- crawlo/core/engine.py +20 -7
- crawlo/core/processor.py +1 -1
- crawlo/core/scheduler.py +4 -5
- crawlo/crawler.py +51 -10
- crawlo/downloader/__init__.py +7 -3
- crawlo/downloader/aiohttp_downloader.py +18 -18
- crawlo/downloader/cffi_downloader.py +5 -2
- crawlo/downloader/httpx_downloader.py +9 -3
- crawlo/downloader/hybrid_downloader.py +2 -2
- crawlo/downloader/playwright_downloader.py +38 -15
- crawlo/downloader/selenium_downloader.py +16 -2
- crawlo/event.py +42 -8
- crawlo/exceptions.py +157 -24
- crawlo/extension/__init__.py +10 -9
- crawlo/extension/health_check.py +7 -7
- crawlo/extension/log_interval.py +6 -6
- crawlo/extension/log_stats.py +2 -2
- crawlo/extension/logging_extension.py +4 -12
- crawlo/extension/memory_monitor.py +5 -5
- crawlo/extension/performance_profiler.py +5 -5
- crawlo/extension/request_recorder.py +6 -6
- crawlo/factories/base.py +1 -1
- crawlo/factories/crawler.py +61 -60
- crawlo/factories/utils.py +135 -0
- crawlo/filters/__init__.py +19 -2
- crawlo/filters/aioredis_filter.py +133 -49
- crawlo/filters/memory_filter.py +6 -21
- crawlo/framework.py +22 -8
- crawlo/initialization/built_in.py +24 -67
- crawlo/initialization/core.py +65 -19
- crawlo/initialization/phases.py +83 -2
- crawlo/initialization/registry.py +5 -7
- crawlo/initialization/utils.py +49 -0
- crawlo/logging/__init__.py +6 -10
- crawlo/logging/config.py +106 -22
- crawlo/logging/factory.py +12 -8
- crawlo/logging/manager.py +19 -27
- crawlo/middleware/__init__.py +72 -9
- crawlo/middleware/default_header.py +2 -2
- crawlo/middleware/download_delay.py +2 -2
- crawlo/middleware/middleware_manager.py +6 -6
- crawlo/middleware/offsite.py +2 -2
- crawlo/middleware/proxy.py +2 -2
- crawlo/middleware/request_ignore.py +4 -4
- crawlo/middleware/response_code.py +2 -2
- crawlo/middleware/response_filter.py +2 -2
- crawlo/middleware/retry.py +1 -1
- crawlo/mode_manager.py +38 -4
- crawlo/network/request.py +54 -26
- crawlo/network/response.py +69 -135
- crawlo/pipelines/__init__.py +40 -9
- crawlo/pipelines/base_pipeline.py +452 -0
- crawlo/pipelines/bloom_dedup_pipeline.py +4 -5
- crawlo/pipelines/console_pipeline.py +2 -2
- crawlo/pipelines/csv_pipeline.py +4 -4
- crawlo/pipelines/database_dedup_pipeline.py +4 -5
- crawlo/pipelines/json_pipeline.py +4 -4
- crawlo/pipelines/memory_dedup_pipeline.py +4 -5
- crawlo/pipelines/mongo_pipeline.py +23 -14
- crawlo/pipelines/mysql_pipeline.py +31 -39
- crawlo/pipelines/pipeline_manager.py +8 -8
- crawlo/pipelines/redis_dedup_pipeline.py +13 -14
- crawlo/project.py +1 -1
- crawlo/queue/__init__.py +10 -0
- crawlo/queue/queue_manager.py +79 -13
- crawlo/queue/redis_priority_queue.py +196 -47
- crawlo/settings/default_settings.py +16 -6
- crawlo/spider/__init__.py +6 -5
- crawlo/stats_collector.py +2 -2
- crawlo/task_manager.py +1 -1
- crawlo/templates/crawlo.cfg.tmpl +3 -3
- crawlo/templates/project/__init__.py.tmpl +1 -3
- crawlo/templates/project/items.py.tmpl +2 -6
- crawlo/templates/project/middlewares.py.tmpl +1 -1
- crawlo/templates/project/pipelines.py.tmpl +1 -2
- crawlo/templates/project/settings.py.tmpl +12 -10
- crawlo/templates/project/settings_distributed.py.tmpl +14 -13
- crawlo/templates/project/settings_gentle.py.tmpl +21 -23
- crawlo/templates/project/settings_high_performance.py.tmpl +21 -23
- crawlo/templates/project/settings_minimal.py.tmpl +10 -8
- crawlo/templates/project/settings_simple.py.tmpl +21 -23
- crawlo/templates/run.py.tmpl +1 -1
- crawlo/templates/spider/spider.py.tmpl +4 -12
- crawlo/templates/spiders_init.py.tmpl +3 -8
- crawlo/tools/__init__.py +0 -103
- crawlo/tools/scenario_adapter.py +1 -1
- crawlo/utils/__init__.py +25 -1
- crawlo/utils/batch_processor.py +23 -6
- crawlo/utils/config_manager.py +442 -0
- crawlo/utils/controlled_spider_mixin.py +1 -1
- crawlo/utils/db_helper.py +1 -1
- crawlo/utils/encoding_helper.py +190 -0
- crawlo/utils/error_handler.py +2 -2
- crawlo/utils/large_scale_helper.py +1 -1
- crawlo/utils/leak_detector.py +335 -0
- crawlo/utils/mongo_connection_pool.py +157 -0
- crawlo/utils/mysql_connection_pool.py +197 -0
- crawlo/utils/performance_monitor.py +1 -1
- crawlo/utils/redis_checker.py +91 -0
- crawlo/utils/redis_connection_pool.py +260 -70
- crawlo/utils/redis_key_validator.py +1 -1
- crawlo/utils/request.py +24 -2
- crawlo/utils/request_serializer.py +1 -1
- crawlo/utils/resource_manager.py +337 -0
- crawlo/utils/response_helper.py +113 -0
- crawlo/utils/selector_helper.py +3 -2
- crawlo/utils/singleton.py +70 -0
- crawlo/utils/spider_loader.py +1 -1
- crawlo/utils/text_helper.py +1 -1
- crawlo-1.4.8.dist-info/METADATA +831 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/RECORD +131 -145
- tests/advanced_tools_example.py +10 -68
- tests/distributed_dedup_test.py +467 -0
- tests/monitor_redis_dedup.sh +72 -0
- tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
- tests/simple_cli_test.py +55 -0
- tests/test_cli_arguments.py +119 -0
- tests/test_dedup_fix.py +10 -10
- crawlo/logging/async_handler.py +0 -181
- crawlo/logging/monitor.py +0 -153
- crawlo/logging/sampler.py +0 -167
- crawlo/tools/authenticated_proxy.py +0 -241
- crawlo/tools/data_formatter.py +0 -226
- crawlo/tools/data_validator.py +0 -181
- crawlo/tools/encoding_converter.py +0 -127
- crawlo/tools/network_diagnostic.py +0 -365
- crawlo/tools/request_tools.py +0 -83
- crawlo/tools/retry_mechanism.py +0 -224
- crawlo/utils/env_config.py +0 -143
- crawlo/utils/large_scale_config.py +0 -287
- crawlo/utils/log.py +0 -80
- crawlo/utils/system.py +0 -11
- crawlo/utils/tools.py +0 -5
- crawlo/utils/url.py +0 -40
- crawlo-1.4.6.dist-info/METADATA +0 -329
- tests/env_config_example.py +0 -134
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
- tests/test_authenticated_proxy.py +0 -142
- tests/test_comprehensive.py +0 -147
- tests/test_dynamic_downloaders_proxy.py +0 -125
- tests/test_dynamic_proxy.py +0 -93
- tests/test_dynamic_proxy_config.py +0 -147
- tests/test_dynamic_proxy_real.py +0 -110
- tests/test_env_config.py +0 -122
- tests/test_framework_env_usage.py +0 -104
- tests/test_large_scale_config.py +0 -113
- tests/test_proxy_api.py +0 -265
- tests/test_real_scenario_proxy.py +0 -196
- tests/tools_example.py +0 -261
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
crawlo/utils/resource_manager.py
ADDED

@@ -0,0 +1,337 @@

#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
Resource manager - unified handling of all cleanable resources
===============================================================

Features:
- Unified registration and cleanup of resources
- Support for asynchronous resource cleanup
- Resource leak detection
- Guaranteed cleanup order (LIFO)
"""
import asyncio
import time
import traceback
from typing import Any, Callable, List, Tuple, Optional, Dict
from enum import Enum

from crawlo.logging import get_logger


class ResourceType(Enum):
    """Resource type enumeration"""
    DOWNLOADER = "downloader"
    REDIS_POOL = "redis_pool"
    QUEUE = "queue"
    FILTER = "filter"
    PIPELINE = "pipeline"
    MIDDLEWARE = "middleware"
    EXTENSION = "extension"
    SESSION = "session"
    BROWSER = "browser"
    OTHER = "other"


class ResourceStatus(Enum):
    """Resource status"""
    ACTIVE = "active"
    CLOSING = "closing"
    CLOSED = "closed"
    ERROR = "error"


class ManagedResource:
    """A managed resource"""

    def __init__(self,
                 resource: Any,
                 cleanup_func: Callable,
                 resource_type: ResourceType = ResourceType.OTHER,
                 name: Optional[str] = None):
        self.resource = resource
        self.cleanup_func = cleanup_func
        self.resource_type = resource_type
        self.name = name or f"{resource_type.value}_{id(resource)}"
        self.status = ResourceStatus.ACTIVE
        self.created_at = time.time()
        self.closed_at: Optional[float] = None

    async def cleanup(self) -> bool:
        """Clean up the resource"""
        if self.status == ResourceStatus.CLOSED:
            return True

        self.status = ResourceStatus.CLOSING
        try:
            # Check whether cleanup_func is a coroutine function
            if asyncio.iscoroutinefunction(self.cleanup_func):
                await self.cleanup_func(self.resource)
            else:
                # Synchronous function: call it directly
                result = self.cleanup_func(self.resource)
                # If it returned a coroutine, await it
                if asyncio.iscoroutine(result):
                    await result

            self.status = ResourceStatus.CLOSED
            self.closed_at = time.time()
            return True
        except Exception as e:
            self.status = ResourceStatus.ERROR
            raise e

    def get_lifetime(self) -> float:
        """Return the resource lifetime in seconds"""
        end_time = self.closed_at or time.time()
        return end_time - self.created_at


class ResourceManager:
    """
    Resource manager - unified handling of all cleanable resources

    Features:
    1. Automatically tracks registered resources
    2. Guaranteed cleanup order (LIFO - last in, first out)
    3. Fault-tolerant cleanup (one failure does not affect the others)
    4. Resource leak detection
    5. Statistics and monitoring
    """

    def __init__(self, name: str = "default"):
        self.name = name
        self._resources: List[ManagedResource] = []
        self._lock = asyncio.Lock()
        self._cleanup_errors: List[Tuple[str, Exception]] = []
        self._logger = get_logger(f"ResourceManager.{name}")

        # Statistics
        self._stats = {
            'total_registered': 0,
            'total_cleaned': 0,
            'total_errors': 0,
            'active_resources': 0,
        }

    def register(self,
                 resource: Any,
                 cleanup_func: Callable,
                 resource_type: ResourceType = ResourceType.OTHER,
                 name: Optional[str] = None) -> ManagedResource:
        """
        Register a resource that needs cleanup

        Args:
            resource: the resource object
            cleanup_func: cleanup function (sync or async)
            resource_type: resource type
            name: resource name (used in logs)

        Returns:
            The managed resource object
        """
        managed = ManagedResource(resource, cleanup_func, resource_type, name)
        self._resources.append(managed)
        self._stats['total_registered'] += 1
        self._stats['active_resources'] += 1

        self._logger.debug(f"Resource registered: {managed.name} ({resource_type.value})")
        return managed

    async def cleanup_all(self, reverse: bool = True) -> Dict[str, Any]:
        """
        Clean up all registered resources

        Args:
            reverse: clean up in reverse order (LIFO, recommended)

        Returns:
            Cleanup result statistics
        """
        async with self._lock:
            if not self._resources:
                self._logger.debug("No resources to cleanup")
                return self._get_cleanup_stats()

            self._logger.info(f"Starting cleanup of {len(self._resources)} resources...")

            # Reverse cleanup (last created, first cleaned)
            resources = reversed(self._resources) if reverse else self._resources

            cleanup_start = time.time()
            success_count = 0
            error_count = 0

            for managed in resources:
                try:
                    self._logger.debug(f"Cleaning up: {managed.name}")
                    await managed.cleanup()
                    success_count += 1
                    self._stats['total_cleaned'] += 1
                    self._stats['active_resources'] -= 1
                except Exception as e:
                    error_count += 1
                    self._stats['total_errors'] += 1
                    self._cleanup_errors.append((managed.name, e))
                    self._logger.error(
                        f"Failed to cleanup {managed.name}: {e}",
                        exc_info=True
                    )
                    # Keep cleaning the remaining resources; do not abort

            cleanup_duration = time.time() - cleanup_start

            # Clear the resource list
            self._resources.clear()

            result = {
                'success': success_count,
                'errors': error_count,
                'duration': cleanup_duration,
                'total_resources': success_count + error_count,
            }

            if error_count > 0:
                self._logger.warning(
                    f"Cleanup completed with errors: {success_count} success, "
                    f"{error_count} errors in {cleanup_duration:.2f}s"
                )
            else:
                self._logger.info(
                    f"Cleanup completed successfully: {success_count} resources "
                    f"in {cleanup_duration:.2f}s"
                )

            return result

    async def cleanup_by_type(self, resource_type: ResourceType) -> int:
        """
        Clean up resources of a given type

        Args:
            resource_type: the resource type

        Returns:
            Number of resources cleaned
        """
        async with self._lock:
            to_cleanup = [r for r in self._resources if r.resource_type == resource_type]

            if not to_cleanup:
                return 0

            cleaned = 0
            for managed in reversed(to_cleanup):
                try:
                    await managed.cleanup()
                    self._resources.remove(managed)
                    cleaned += 1
                    self._stats['total_cleaned'] += 1
                    self._stats['active_resources'] -= 1
                except Exception as e:
                    self._logger.error(f"Failed to cleanup {managed.name}: {e}")
                    self._stats['total_errors'] += 1

            return cleaned

    def get_active_resources(self) -> List[ManagedResource]:
        """Return all active resources"""
        return [r for r in self._resources if r.status == ResourceStatus.ACTIVE]

    def get_resources_by_type(self, resource_type: ResourceType) -> List[ManagedResource]:
        """Return resources of a given type"""
        return [r for r in self._resources if r.resource_type == resource_type]

    def detect_leaks(self, max_lifetime: float = 3600) -> List[ManagedResource]:
        """
        Detect potential resource leaks

        Args:
            max_lifetime: maximum lifetime in seconds; a resource alive longer
                than this without being cleaned up is treated as a leak

        Returns:
            List of potentially leaked resources
        """
        current_time = time.time()
        leaks = []

        for managed in self._resources:
            if managed.status == ResourceStatus.ACTIVE:
                lifetime = current_time - managed.created_at
                if lifetime > max_lifetime:
                    leaks.append(managed)
                    self._logger.warning(
                        f"Potential leak detected: {managed.name} "
                        f"(lifetime: {lifetime:.2f}s)"
                    )

        return leaks

    def get_stats(self) -> Dict[str, Any]:
        """Return statistics"""
        return {
            **self._stats,
            'cleanup_errors': len(self._cleanup_errors),
            'active_by_type': self._get_active_by_type(),
        }

    def _get_active_by_type(self) -> Dict[str, int]:
        """Count active resources by type"""
        result = {}
        for managed in self._resources:
            if managed.status == ResourceStatus.ACTIVE:
                type_name = managed.resource_type.value
                result[type_name] = result.get(type_name, 0) + 1
        return result

    def _get_cleanup_stats(self) -> Dict[str, Any]:
        """Return empty cleanup statistics"""
        return {
            'success': 0,
            'errors': 0,
            'duration': 0.0,
            'total_resources': 0,
        }

    async def __aenter__(self):
        """Async context manager entry"""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit: clean up automatically"""
        await self.cleanup_all()
        return False


# Global registry of resource managers
_global_managers: Dict[str, ResourceManager] = {}


def get_resource_manager(name: str = "default") -> ResourceManager:
    """
    Get a resource manager instance (singleton)

    Args:
        name: manager name

    Returns:
        The resource manager instance
    """
    if name not in _global_managers:
        _global_managers[name] = ResourceManager(name)
    return _global_managers[name]


async def cleanup_all_managers():
    """Clean up all resource managers"""
    logger = get_logger("ResourceManager")

    for name, manager in _global_managers.items():
        try:
            logger.info(f"Cleaning up resource manager: {name}")
            await manager.cleanup_all()
        except Exception as e:
            logger.error(f"Failed to cleanup manager {name}: {e}")

    _global_managers.clear()
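To make the cleanup semantics concrete, here is a minimal usage sketch. The manager name, the resource names, and the use of an aiohttp session as the thing being cleaned up are illustrative assumptions, not part of the module; the register/detect_leaks/cleanup_all calls follow the source above.

import asyncio

import aiohttp  # illustrative resource only; any object with a cleanup callable works

from crawlo.utils.resource_manager import ResourceType, get_resource_manager


async def main():
    manager = get_resource_manager("demo")  # hypothetical manager name

    # Registered first, so with LIFO cleanup it is closed last.
    session = aiohttp.ClientSession()
    # The lambda is synchronous but returns a coroutine;
    # ManagedResource.cleanup() detects that and awaits it.
    manager.register(session, lambda s: s.close(), ResourceType.SESSION, "demo_session")

    # Registered second, so it is closed first.
    log_file = open("demo.log", "w")
    manager.register(log_file, lambda f: f.close(), ResourceType.OTHER, "demo_log")

    assert manager.detect_leaks(max_lifetime=3600) == []  # both resources are young

    stats = await manager.cleanup_all()
    print(stats)  # {'success': 2, 'errors': 0, 'duration': ..., 'total_resources': 2}


asyncio.run(main())

The same pattern works with "async with ResourceManager(...)", since __aexit__ calls cleanup_all() automatically.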
crawlo/utils/response_helper.py
ADDED

@@ -0,0 +1,113 @@

#!/usr/bin/python
# -*- coding:UTF-8 -*-
"""
Response handling helpers
=========================
Helper functions for processing HTTP responses, including cookie handling
and regular-expression operations.

This module contains the following main functions:
- parse_cookies: parse cookies from a response header
- regex_search: run a regular-expression search over text
- regex_findall: run a regular-expression findall over text
- get_header_value: read a header value, handling case-insensitive names
"""

import re
from typing import Dict, List, Any, Optional, Union
from http.cookies import SimpleCookie


def parse_cookies(cookie_header: str) -> Dict[str, str]:
    """
    Parse cookies from a response header and return them as a dict

    :param cookie_header: value of the Set-Cookie header
    :return: dict of parsed cookies
    """
    if isinstance(cookie_header, list):
        cookie_header = ", ".join(cookie_header)

    if not cookie_header:
        return {}

    cookies = SimpleCookie()
    try:
        cookies.load(cookie_header)
        return {key: morsel.value for key, morsel in cookies.items()}
    except Exception:
        # Return an empty dict if parsing fails
        return {}


def regex_search(pattern: str, text: str, flags: int = re.DOTALL) -> Optional[re.Match]:
    """
    Run a regular-expression search over text

    :param pattern: regular-expression pattern
    :param text: text to search
    :param flags: regular-expression flags
    :return: a match object, or None
    """
    if not isinstance(pattern, str):
        raise TypeError("Pattern must be a string")
    if not isinstance(text, str):
        raise TypeError("Text must be a string")
    return re.search(pattern, text, flags=flags)


def regex_findall(pattern: str, text: str, flags: int = re.DOTALL) -> List[Any]:
    """
    Run a regular-expression findall over text

    :param pattern: regular-expression pattern
    :param text: text to search
    :param flags: regular-expression flags
    :return: list of matches
    """
    if not isinstance(pattern, str):
        raise TypeError("Pattern must be a string")
    if not isinstance(text, str):
        raise TypeError("Text must be a string")
    return re.findall(pattern, text, flags=flags)


def get_header_value(headers: Dict[str, Any], header_name: str, default: Any = None) -> Any:
    """
    Read a value from response headers, handling case-insensitive names

    :param headers: response header dict
    :param header_name: header name
    :param default: default value
    :return: the header value, or the default
    """
    if not headers or not header_name:
        return default

    # Try an exact match first
    if header_name in headers:
        return headers[header_name]

    # Try a lowercase match
    lower_header = header_name.lower()
    if lower_header in headers:
        return headers[lower_header]

    # Try a capitalized match
    capitalized_header = header_name.capitalize()
    if capitalized_header in headers:
        return headers[capitalized_header]

    # Try a title-case match
    title_header = header_name.title()
    if title_header in headers:
        return headers[title_header]

    return default


__all__ = [
    "parse_cookies",
    "regex_search",
    "regex_findall",
    "get_header_value"
]
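A short sketch of how these helpers compose. The header dict and HTML snippet are made up for illustration; in the framework they would come from a downloader response.

from crawlo.utils.response_helper import get_header_value, parse_cookies, regex_findall

# Hypothetical response headers with inconsistent key casing.
headers = {"content-type": "text/html", "Set-Cookie": "sid=abc123; Path=/"}

# The exact key "Content-Type" is absent, so the lowercase fallback matches.
assert get_header_value(headers, "Content-Type") == "text/html"

# SimpleCookie treats Path as a cookie attribute, not a separate cookie.
assert parse_cookies(get_header_value(headers, "Set-Cookie", "")) == {"sid": "abc123"}

# With the DOTALL default, patterns also match across newlines.
assert regex_findall(r'href="([^"]+)"', '<a href="/a"></a>\n<a href="/b"></a>') == ["/a", "/b"]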
crawlo/utils/selector_helper.py
CHANGED

@@ -15,8 +15,9 @@
 All methods use a concise, intuitive naming style that is easy to remember and use.
 """

-from typing import List, Any
-
+from typing import List, Any
+
+from parsel import SelectorList


 def extract_text(elements: SelectorList, join_str: str = " ") -> str:
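The hunk only shows that extract_text now annotates its parameter with parsel's SelectorList; its body is not in this diff. The sketch below is therefore a guess at typical usage, assuming the function extracts the text of each selector in the list and joins the fragments with join_str.

from parsel import Selector

from crawlo.utils.selector_helper import extract_text

sel = Selector(text="<ul><li>one</li><li>two</li></ul>")
# Assumed behavior: text fragments joined with join_str.
print(extract_text(sel.css("li"), join_str=", "))  # expected something like "one, two"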
crawlo/utils/singleton.py
ADDED

@@ -0,0 +1,70 @@

#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
Singleton utilities
===================

Provides both a synchronous and an asynchronous singleton pattern for
different use cases.

Use cases:
1. Synchronous singleton: framework initialization, configuration management
   and other synchronous code
2. Asynchronous singleton: database connection pools, network resources and
   other asynchronous code

Examples:
    # Synchronous singleton
    @singleton
    class CoreInitializer:
        pass

    # Asynchronous singleton (as used in the connection pool managers)
    class MySQLConnectionPoolManager:
        _instances: Dict[str, 'MySQLConnectionPoolManager'] = {}
        _lock = asyncio.Lock()

        @classmethod
        async def get_pool(cls, ...):
            async with cls._lock:
                if pool_key not in cls._instances:
                    cls._instances[pool_key] = cls(pool_key)
                return cls._instances[pool_key].pool
"""

import threading
from typing import Any, Dict, Type


class SingletonMeta(type):
    """Singleton metaclass"""
    _instances: Dict[Type, Any] = {}
    _lock = threading.Lock()

    def __call__(cls, *args, **kwargs):
        if cls not in cls._instances:
            with cls._lock:
                if cls not in cls._instances:
                    instance = super().__call__(*args, **kwargs)
                    cls._instances[cls] = instance
        return cls._instances[cls]


def singleton(cls):
    """
    Singleton decorator

    Args:
        cls: the class to decorate

    Returns:
        The decorated class, guaranteed to have a single instance
    """
    instances = {}
    lock = threading.Lock()

    def get_instance(*args, **kwargs):
        if cls not in instances:
            with lock:
                if cls not in instances:
                    instances[cls] = cls(*args, **kwargs)
        return instances[cls]

    return get_instance
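Both entry points guarantee one instance per class under concurrent construction (double-checked locking around a threading.Lock). This sketch, with invented class names, shows them side by side along with the main trade-off between the two forms.

from crawlo.utils.singleton import SingletonMeta, singleton


@singleton
class ConfigCache:  # hypothetical example class
    def __init__(self):
        self.values = {}


class StatsRegistry(metaclass=SingletonMeta):  # hypothetical example class
    def __init__(self):
        self.counters = {}


assert ConfigCache() is ConfigCache()
assert StatsRegistry() is StatsRegistry()

# Trade-off: the decorator replaces the class with a factory function, so
# isinstance() checks against ConfigCache no longer work; the metaclass form
# keeps StatsRegistry a real class. In both variants, constructor arguments
# are only honored on the first call.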
crawlo/utils/spider_loader.py
CHANGED

@@ -9,7 +9,7 @@ from crawlo.interfaces import ISpiderLoader
 from crawlo.settings.setting_manager import SettingManager
 from crawlo.spider import Spider
 from crawlo.network.request import Request
-from crawlo.
+from crawlo.logging import get_logger

 logger = get_logger(__name__)
