crawlo-1.4.6-py3-none-any.whl → crawlo-1.4.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release of crawlo has been flagged as potentially problematic.

Files changed (162)
  1. crawlo/__init__.py +2 -1
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +2 -2
  4. crawlo/commands/check.py +1 -1
  5. crawlo/commands/help.py +5 -3
  6. crawlo/commands/list.py +1 -1
  7. crawlo/commands/run.py +49 -11
  8. crawlo/commands/stats.py +1 -1
  9. crawlo/config.py +12 -4
  10. crawlo/config_validator.py +1 -1
  11. crawlo/core/engine.py +20 -7
  12. crawlo/core/processor.py +1 -1
  13. crawlo/core/scheduler.py +4 -5
  14. crawlo/crawler.py +51 -10
  15. crawlo/downloader/__init__.py +7 -3
  16. crawlo/downloader/aiohttp_downloader.py +18 -18
  17. crawlo/downloader/cffi_downloader.py +5 -2
  18. crawlo/downloader/httpx_downloader.py +9 -3
  19. crawlo/downloader/hybrid_downloader.py +2 -2
  20. crawlo/downloader/playwright_downloader.py +38 -15
  21. crawlo/downloader/selenium_downloader.py +16 -2
  22. crawlo/event.py +42 -8
  23. crawlo/exceptions.py +157 -24
  24. crawlo/extension/__init__.py +10 -9
  25. crawlo/extension/health_check.py +7 -7
  26. crawlo/extension/log_interval.py +6 -6
  27. crawlo/extension/log_stats.py +2 -2
  28. crawlo/extension/logging_extension.py +4 -12
  29. crawlo/extension/memory_monitor.py +5 -5
  30. crawlo/extension/performance_profiler.py +5 -5
  31. crawlo/extension/request_recorder.py +6 -6
  32. crawlo/factories/base.py +1 -1
  33. crawlo/factories/crawler.py +61 -60
  34. crawlo/factories/utils.py +135 -0
  35. crawlo/filters/__init__.py +19 -2
  36. crawlo/filters/aioredis_filter.py +133 -49
  37. crawlo/filters/memory_filter.py +6 -21
  38. crawlo/framework.py +22 -8
  39. crawlo/initialization/built_in.py +24 -67
  40. crawlo/initialization/core.py +65 -19
  41. crawlo/initialization/phases.py +83 -2
  42. crawlo/initialization/registry.py +5 -7
  43. crawlo/initialization/utils.py +49 -0
  44. crawlo/logging/__init__.py +6 -10
  45. crawlo/logging/config.py +106 -22
  46. crawlo/logging/factory.py +12 -8
  47. crawlo/logging/manager.py +19 -27
  48. crawlo/middleware/__init__.py +72 -9
  49. crawlo/middleware/default_header.py +2 -2
  50. crawlo/middleware/download_delay.py +2 -2
  51. crawlo/middleware/middleware_manager.py +6 -6
  52. crawlo/middleware/offsite.py +2 -2
  53. crawlo/middleware/proxy.py +2 -2
  54. crawlo/middleware/request_ignore.py +4 -4
  55. crawlo/middleware/response_code.py +2 -2
  56. crawlo/middleware/response_filter.py +2 -2
  57. crawlo/middleware/retry.py +1 -1
  58. crawlo/mode_manager.py +38 -4
  59. crawlo/network/request.py +54 -26
  60. crawlo/network/response.py +69 -135
  61. crawlo/pipelines/__init__.py +40 -9
  62. crawlo/pipelines/base_pipeline.py +452 -0
  63. crawlo/pipelines/bloom_dedup_pipeline.py +4 -5
  64. crawlo/pipelines/console_pipeline.py +2 -2
  65. crawlo/pipelines/csv_pipeline.py +4 -4
  66. crawlo/pipelines/database_dedup_pipeline.py +4 -5
  67. crawlo/pipelines/json_pipeline.py +4 -4
  68. crawlo/pipelines/memory_dedup_pipeline.py +4 -5
  69. crawlo/pipelines/mongo_pipeline.py +23 -14
  70. crawlo/pipelines/mysql_pipeline.py +31 -39
  71. crawlo/pipelines/pipeline_manager.py +8 -8
  72. crawlo/pipelines/redis_dedup_pipeline.py +13 -14
  73. crawlo/project.py +1 -1
  74. crawlo/queue/__init__.py +10 -0
  75. crawlo/queue/queue_manager.py +79 -13
  76. crawlo/queue/redis_priority_queue.py +196 -47
  77. crawlo/settings/default_settings.py +16 -6
  78. crawlo/spider/__init__.py +6 -5
  79. crawlo/stats_collector.py +2 -2
  80. crawlo/task_manager.py +1 -1
  81. crawlo/templates/crawlo.cfg.tmpl +3 -3
  82. crawlo/templates/project/__init__.py.tmpl +1 -3
  83. crawlo/templates/project/items.py.tmpl +2 -6
  84. crawlo/templates/project/middlewares.py.tmpl +1 -1
  85. crawlo/templates/project/pipelines.py.tmpl +1 -2
  86. crawlo/templates/project/settings.py.tmpl +12 -10
  87. crawlo/templates/project/settings_distributed.py.tmpl +14 -13
  88. crawlo/templates/project/settings_gentle.py.tmpl +21 -23
  89. crawlo/templates/project/settings_high_performance.py.tmpl +21 -23
  90. crawlo/templates/project/settings_minimal.py.tmpl +10 -8
  91. crawlo/templates/project/settings_simple.py.tmpl +21 -23
  92. crawlo/templates/run.py.tmpl +1 -1
  93. crawlo/templates/spider/spider.py.tmpl +4 -12
  94. crawlo/templates/spiders_init.py.tmpl +3 -8
  95. crawlo/tools/__init__.py +0 -103
  96. crawlo/tools/scenario_adapter.py +1 -1
  97. crawlo/utils/__init__.py +25 -1
  98. crawlo/utils/batch_processor.py +23 -6
  99. crawlo/utils/config_manager.py +442 -0
  100. crawlo/utils/controlled_spider_mixin.py +1 -1
  101. crawlo/utils/db_helper.py +1 -1
  102. crawlo/utils/encoding_helper.py +190 -0
  103. crawlo/utils/error_handler.py +2 -2
  104. crawlo/utils/large_scale_helper.py +1 -1
  105. crawlo/utils/leak_detector.py +335 -0
  106. crawlo/utils/mongo_connection_pool.py +157 -0
  107. crawlo/utils/mysql_connection_pool.py +197 -0
  108. crawlo/utils/performance_monitor.py +1 -1
  109. crawlo/utils/redis_checker.py +91 -0
  110. crawlo/utils/redis_connection_pool.py +260 -70
  111. crawlo/utils/redis_key_validator.py +1 -1
  112. crawlo/utils/request.py +24 -2
  113. crawlo/utils/request_serializer.py +1 -1
  114. crawlo/utils/resource_manager.py +337 -0
  115. crawlo/utils/response_helper.py +113 -0
  116. crawlo/utils/selector_helper.py +3 -2
  117. crawlo/utils/singleton.py +70 -0
  118. crawlo/utils/spider_loader.py +1 -1
  119. crawlo/utils/text_helper.py +1 -1
  120. crawlo-1.4.8.dist-info/METADATA +831 -0
  121. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/RECORD +131 -145
  122. tests/advanced_tools_example.py +10 -68
  123. tests/distributed_dedup_test.py +467 -0
  124. tests/monitor_redis_dedup.sh +72 -0
  125. tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
  126. tests/simple_cli_test.py +55 -0
  127. tests/test_cli_arguments.py +119 -0
  128. tests/test_dedup_fix.py +10 -10
  129. crawlo/logging/async_handler.py +0 -181
  130. crawlo/logging/monitor.py +0 -153
  131. crawlo/logging/sampler.py +0 -167
  132. crawlo/tools/authenticated_proxy.py +0 -241
  133. crawlo/tools/data_formatter.py +0 -226
  134. crawlo/tools/data_validator.py +0 -181
  135. crawlo/tools/encoding_converter.py +0 -127
  136. crawlo/tools/network_diagnostic.py +0 -365
  137. crawlo/tools/request_tools.py +0 -83
  138. crawlo/tools/retry_mechanism.py +0 -224
  139. crawlo/utils/env_config.py +0 -143
  140. crawlo/utils/large_scale_config.py +0 -287
  141. crawlo/utils/log.py +0 -80
  142. crawlo/utils/system.py +0 -11
  143. crawlo/utils/tools.py +0 -5
  144. crawlo/utils/url.py +0 -40
  145. crawlo-1.4.6.dist-info/METADATA +0 -329
  146. tests/env_config_example.py +0 -134
  147. tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
  148. tests/test_authenticated_proxy.py +0 -142
  149. tests/test_comprehensive.py +0 -147
  150. tests/test_dynamic_downloaders_proxy.py +0 -125
  151. tests/test_dynamic_proxy.py +0 -93
  152. tests/test_dynamic_proxy_config.py +0 -147
  153. tests/test_dynamic_proxy_real.py +0 -110
  154. tests/test_env_config.py +0 -122
  155. tests/test_framework_env_usage.py +0 -104
  156. tests/test_large_scale_config.py +0 -113
  157. tests/test_proxy_api.py +0 -265
  158. tests/test_real_scenario_proxy.py +0 -196
  159. tests/tools_example.py +0 -261
  160. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
  161. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
  162. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
crawlo/utils/resource_manager.py (new file)
@@ -0,0 +1,337 @@
+ #!/usr/bin/python
+ # -*- coding: UTF-8 -*-
+ """
+ Resource manager - unified management of all cleanable resources
+ ========================================
+
+ Features:
+ - Unified registration and cleanup of resources
+ - Support for asynchronous resource cleanup
+ - Resource leak detection
+ - Guaranteed cleanup order (LIFO)
+ """
+ import asyncio
+ import time
+ import traceback
+ from typing import Any, Callable, List, Tuple, Optional, Dict
+ from enum import Enum
+
+ from crawlo.logging import get_logger
+
+
+ class ResourceType(Enum):
+     """Resource type enumeration"""
+     DOWNLOADER = "downloader"
+     REDIS_POOL = "redis_pool"
+     QUEUE = "queue"
+     FILTER = "filter"
+     PIPELINE = "pipeline"
+     MIDDLEWARE = "middleware"
+     EXTENSION = "extension"
+     SESSION = "session"
+     BROWSER = "browser"
+     OTHER = "other"
+
+
+ class ResourceStatus(Enum):
+     """Resource status"""
+     ACTIVE = "active"
+     CLOSING = "closing"
+     CLOSED = "closed"
+     ERROR = "error"
+
+
+ class ManagedResource:
+     """A managed resource"""
+
+     def __init__(self,
+                  resource: Any,
+                  cleanup_func: Callable,
+                  resource_type: ResourceType = ResourceType.OTHER,
+                  name: Optional[str] = None):
+         self.resource = resource
+         self.cleanup_func = cleanup_func
+         self.resource_type = resource_type
+         self.name = name or f"{resource_type.value}_{id(resource)}"
+         self.status = ResourceStatus.ACTIVE
+         self.created_at = time.time()
+         self.closed_at: Optional[float] = None
+
+     async def cleanup(self) -> bool:
+         """Clean up the resource"""
+         if self.status == ResourceStatus.CLOSED:
+             return True
+
+         self.status = ResourceStatus.CLOSING
+         try:
+             # Check whether cleanup_func is a coroutine function
+             if asyncio.iscoroutinefunction(self.cleanup_func):
+                 await self.cleanup_func(self.resource)
+             else:
+                 # Synchronous function: call it directly
+                 result = self.cleanup_func(self.resource)
+                 # If it returned a coroutine, await it
+                 if asyncio.iscoroutine(result):
+                     await result
+
+             self.status = ResourceStatus.CLOSED
+             self.closed_at = time.time()
+             return True
+         except Exception as e:
+             self.status = ResourceStatus.ERROR
+             raise e
+
+     def get_lifetime(self) -> float:
+         """Return the resource lifetime in seconds"""
+         end_time = self.closed_at or time.time()
+         return end_time - self.created_at
+
+
+ class ResourceManager:
+     """
+     Resource manager - unified management of all cleanable resources
+
+     Features:
+     1. Automatically tracks registered resources
+     2. Guaranteed cleanup order (LIFO - last in, first out)
+     3. Fault-tolerant cleanup (one failure does not affect the rest)
+     4. Resource leak detection
+     5. Statistics and monitoring
+     """
+
+     def __init__(self, name: str = "default"):
+         self.name = name
+         self._resources: List[ManagedResource] = []
+         self._lock = asyncio.Lock()
+         self._cleanup_errors: List[Tuple[str, Exception]] = []
+         self._logger = get_logger(f"ResourceManager.{name}")
+
+         # Statistics
+         self._stats = {
+             'total_registered': 0,
+             'total_cleaned': 0,
+             'total_errors': 0,
+             'active_resources': 0,
+         }
+
+     def register(self,
+                  resource: Any,
+                  cleanup_func: Callable,
+                  resource_type: ResourceType = ResourceType.OTHER,
+                  name: Optional[str] = None) -> ManagedResource:
+         """
+         Register a resource that needs cleanup
+
+         Args:
+             resource: the resource object
+             cleanup_func: cleanup function (sync or async)
+             resource_type: resource type
+             name: resource name (used in logs)
+
+         Returns:
+             The managed resource object
+         """
+         managed = ManagedResource(resource, cleanup_func, resource_type, name)
+         self._resources.append(managed)
+         self._stats['total_registered'] += 1
+         self._stats['active_resources'] += 1
+
+         self._logger.debug(f"Resource registered: {managed.name} ({resource_type.value})")
+         return managed
+
+     async def cleanup_all(self, reverse: bool = True) -> Dict[str, Any]:
+         """
+         Clean up all registered resources
+
+         Args:
+             reverse: clean up in reverse order (LIFO, recommended)
+
+         Returns:
+             Cleanup result statistics
+         """
+         async with self._lock:
+             if not self._resources:
+                 self._logger.debug("No resources to cleanup")
+                 return self._get_cleanup_stats()
+
+             self._logger.info(f"Starting cleanup of {len(self._resources)} resources...")
+
+             # Reverse cleanup (most recently created resources first)
+             resources = reversed(self._resources) if reverse else self._resources
+
+             cleanup_start = time.time()
+             success_count = 0
+             error_count = 0
+
+             for managed in resources:
+                 try:
+                     self._logger.debug(f"Cleaning up: {managed.name}")
+                     await managed.cleanup()
+                     success_count += 1
+                     self._stats['total_cleaned'] += 1
+                     self._stats['active_resources'] -= 1
+                 except Exception as e:
+                     error_count += 1
+                     self._stats['total_errors'] += 1
+                     self._cleanup_errors.append((managed.name, e))
+                     self._logger.error(
+                         f"Failed to cleanup {managed.name}: {e}",
+                         exc_info=True
+                     )
+                     # Keep cleaning up the remaining resources; do not abort
+
+             cleanup_duration = time.time() - cleanup_start
+
+             # Clear the resource list
+             self._resources.clear()
+
+             result = {
+                 'success': success_count,
+                 'errors': error_count,
+                 'duration': cleanup_duration,
+                 'total_resources': success_count + error_count,
+             }
+
+             if error_count > 0:
+                 self._logger.warning(
+                     f"Cleanup completed with errors: {success_count} success, "
+                     f"{error_count} errors in {cleanup_duration:.2f}s"
+                 )
+             else:
+                 self._logger.info(
+                     f"Cleanup completed successfully: {success_count} resources "
+                     f"in {cleanup_duration:.2f}s"
+                 )
+
+             return result
+
+     async def cleanup_by_type(self, resource_type: ResourceType) -> int:
+         """
+         Clean up resources by type
+
+         Args:
+             resource_type: resource type
+
+         Returns:
+             Number of resources cleaned up
+         """
+         async with self._lock:
+             to_cleanup = [r for r in self._resources if r.resource_type == resource_type]
+
+             if not to_cleanup:
+                 return 0
+
+             cleaned = 0
+             for managed in reversed(to_cleanup):
+                 try:
+                     await managed.cleanup()
+                     self._resources.remove(managed)
+                     cleaned += 1
+                     self._stats['total_cleaned'] += 1
+                     self._stats['active_resources'] -= 1
+                 except Exception as e:
+                     self._logger.error(f"Failed to cleanup {managed.name}: {e}")
+                     self._stats['total_errors'] += 1
+
+             return cleaned
+
+     def get_active_resources(self) -> List[ManagedResource]:
+         """Return all active resources"""
+         return [r for r in self._resources if r.status == ResourceStatus.ACTIVE]
+
+     def get_resources_by_type(self, resource_type: ResourceType) -> List[ManagedResource]:
+         """Return resources of a given type"""
+         return [r for r in self._resources if r.resource_type == resource_type]
+
+     def detect_leaks(self, max_lifetime: float = 3600) -> List[ManagedResource]:
+         """
+         Detect potential resource leaks
+
+         Args:
+             max_lifetime: maximum lifetime in seconds; a resource still alive past this is treated as a leak
+
+         Returns:
+             List of potentially leaked resources
+         """
+         current_time = time.time()
+         leaks = []
+
+         for managed in self._resources:
+             if managed.status == ResourceStatus.ACTIVE:
+                 lifetime = current_time - managed.created_at
+                 if lifetime > max_lifetime:
+                     leaks.append(managed)
+                     self._logger.warning(
+                         f"Potential leak detected: {managed.name} "
+                         f"(lifetime: {lifetime:.2f}s)"
+                     )
+
+         return leaks
+
+     def get_stats(self) -> Dict[str, Any]:
+         """Return statistics"""
+         return {
+             **self._stats,
+             'cleanup_errors': len(self._cleanup_errors),
+             'active_by_type': self._get_active_by_type(),
+         }
+
+     def _get_active_by_type(self) -> Dict[str, int]:
+         """Count active resources by type"""
+         result = {}
+         for managed in self._resources:
+             if managed.status == ResourceStatus.ACTIVE:
+                 type_name = managed.resource_type.value
+                 result[type_name] = result.get(type_name, 0) + 1
+         return result
+
+     def _get_cleanup_stats(self) -> Dict[str, Any]:
+         """Return cleanup statistics"""
+         return {
+             'success': 0,
+             'errors': 0,
+             'duration': 0.0,
+             'total_resources': 0,
+         }
+
+     async def __aenter__(self):
+         """Context manager entry"""
+         return self
+
+     async def __aexit__(self, exc_type, exc_val, exc_tb):
+         """Context manager exit; cleans up automatically"""
+         await self.cleanup_all()
+         return False
+
+
+ # Global registry of resource managers
+ _global_managers: Dict[str, ResourceManager] = {}
+
+
+ def get_resource_manager(name: str = "default") -> ResourceManager:
+     """
+     Get a resource manager instance (singleton)
+
+     Args:
+         name: manager name
+
+     Returns:
+         The resource manager instance
+     """
+     if name not in _global_managers:
+         _global_managers[name] = ResourceManager(name)
+     return _global_managers[name]
+
+
+ async def cleanup_all_managers():
+     """Clean up all resource managers"""
+     logger = get_logger("ResourceManager")
+
+     for name, manager in _global_managers.items():
+         try:
+             logger.info(f"Cleaning up resource manager: {name}")
+             await manager.cleanup_all()
+         except Exception as e:
+             logger.error(f"Failed to cleanup manager {name}: {e}")
+
+     _global_managers.clear()
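
For orientation, here is a minimal usage sketch of the resource manager API added above. It is not taken from the package itself: the import path is inferred from the new file's location, and FakeSession and the "example" manager name are made-up placeholders.

import asyncio

from crawlo.utils.resource_manager import ResourceType, get_resource_manager


class FakeSession:
    """Made-up stand-in for a real resource such as an HTTP session."""
    async def close(self):
        print("session closed")


async def main():
    manager = get_resource_manager("example")
    session = FakeSession()
    # cleanup_func receives the resource; if it returns a coroutine it is awaited.
    manager.register(session, lambda s: s.close(), ResourceType.SESSION, name="demo_session")
    stats = await manager.cleanup_all()  # resources are released in LIFO order
    print(stats["success"], stats["errors"])


asyncio.run(main())

The manager can also be used as an async context manager (async with ResourceManager("job") as rm: ...), in which case __aexit__ calls cleanup_all automatically.
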
crawlo/utils/response_helper.py (new file)
@@ -0,0 +1,113 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ Response handling helpers
+ ==================
+ Helper functions for working with HTTP responses, including cookie handling and regular-expression operations.
+
+ The module provides the following main functions:
+ - parse_cookies: parse cookies from a response header
+ - regex_search: run a regular-expression search on text
+ - regex_findall: run a regular-expression findall on text
+ - get_header_value: read a value from the response headers, handling case-insensitive keys
+ """
+
+ import re
+ from typing import Dict, List, Any, Optional, Union
+ from http.cookies import SimpleCookie
+
+
+ def parse_cookies(cookie_header: str) -> Dict[str, str]:
+     """
+     Parse and return cookies from a response header
+
+     :param cookie_header: value of the Set-Cookie header
+     :return: dictionary of parsed cookies
+     """
+     if isinstance(cookie_header, list):
+         cookie_header = ", ".join(cookie_header)
+
+     if not cookie_header:
+         return {}
+
+     cookies = SimpleCookie()
+     try:
+         cookies.load(cookie_header)
+         return {key: morsel.value for key, morsel in cookies.items()}
+     except Exception:
+         # Return an empty dict if parsing fails
+         return {}
+
+
+ def regex_search(pattern: str, text: str, flags: int = re.DOTALL) -> Optional[re.Match]:
+     """
+     Run a regular-expression search on text
+
+     :param pattern: regular-expression pattern
+     :param text: text to search
+     :param flags: regular-expression flags
+     :return: match object or None
+     """
+     if not isinstance(pattern, str):
+         raise TypeError("Pattern must be a string")
+     if not isinstance(text, str):
+         raise TypeError("Text must be a string")
+     return re.search(pattern, text, flags=flags)
+
+
+ def regex_findall(pattern: str, text: str, flags: int = re.DOTALL) -> List[Any]:
+     """
+     Run a regular-expression findall on text
+
+     :param pattern: regular-expression pattern
+     :param text: text to search
+     :param flags: regular-expression flags
+     :return: list of matches
+     """
+     if not isinstance(pattern, str):
+         raise TypeError("Pattern must be a string")
+     if not isinstance(text, str):
+         raise TypeError("Text must be a string")
+     return re.findall(pattern, text, flags=flags)
+
+
+ def get_header_value(headers: Dict[str, Any], header_name: str, default: Any = None) -> Any:
+     """
+     Read a value from the response headers, handling case-insensitive keys
+
+     :param headers: response headers dictionary
+     :param header_name: header name
+     :param default: default value
+     :return: header value or the default
+     """
+     if not headers or not header_name:
+         return default
+
+     # Try a direct match first
+     if header_name in headers:
+         return headers[header_name]
+
+     # Try a lower-case match
+     lower_header = header_name.lower()
+     if lower_header in headers:
+         return headers[lower_header]
+
+     # Try a capitalized match
+     capitalized_header = header_name.capitalize()
+     if capitalized_header in headers:
+         return headers[capitalized_header]
+
+     # Try a title-case match
+     title_header = header_name.title()
+     if title_header in headers:
+         return headers[title_header]
+
+     return default
+
+
+ __all__ = [
+     "parse_cookies",
+     "regex_search",
+     "regex_findall",
+     "get_header_value"
+ ]
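
A short, hedged example of the new response helpers; the import path mirrors crawlo/utils/response_helper.py and the sample headers are invented for illustration.

from crawlo.utils.response_helper import get_header_value, parse_cookies, regex_findall

headers = {"content-type": "text/html; charset=utf-8", "Set-Cookie": "sessionid=abc123; Path=/"}

print(get_header_value(headers, "Content-Type"))     # found via the lower-case key: "text/html; charset=utf-8"
print(parse_cookies(headers.get("Set-Cookie", "")))  # {'sessionid': 'abc123'}
print(regex_findall(r"charset=([\w-]+)", headers["content-type"]))  # ['utf-8']
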
crawlo/utils/selector_helper.py
@@ -15,8 +15,9 @@
  All methods use a concise, intuitive naming style so they are easy to remember and use.
  """

- from typing import List, Any, Optional
- from parsel import Selector, SelectorList
+ from typing import List, Any
+
+ from parsel import SelectorList


  def extract_text(elements: SelectorList, join_str: str = " ") -> str:
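
The hunk above only touches imports; for context, here is a hypothetical call against the extract_text signature visible in it (the exact text-joining behaviour is not shown in this diff).

from parsel import Selector

from crawlo.utils.selector_helper import extract_text

sel = Selector(text="<p>Hello <b>world</b></p>")
print(extract_text(sel.css("p"), join_str=" "))
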
crawlo/utils/singleton.py (new file)
@@ -0,0 +1,70 @@
+ #!/usr/bin/python
+ # -*- coding: UTF-8 -*-
+ """
+ Singleton utilities
+ ================
+
+ Provides both synchronous and asynchronous singleton implementations for different use cases.
+
+ Use cases:
+ 1. Synchronous singleton: framework initialization, configuration management, and other synchronous code
+ 2. Asynchronous singleton: database connection pools, network resources, and other asynchronous code
+
+ Examples:
+     # Synchronous singleton
+     @singleton
+     class CoreInitializer:
+         pass
+
+     # Asynchronous singleton (as used in the connection pool managers)
+     class MySQLConnectionPoolManager:
+         _instances: Dict[str, 'MySQLConnectionPoolManager'] = {}
+         _lock = asyncio.Lock()
+
+         @classmethod
+         async def get_pool(cls, ...):
+             async with cls._lock:
+                 if pool_key not in cls._instances:
+                     cls._instances[pool_key] = cls(pool_key)
+                 return cls._instances[pool_key].pool
+ """
+
+ import threading
+ from typing import Any, Dict, Type
+
+
+ class SingletonMeta(type):
+     """Singleton metaclass"""
+     _instances: Dict[Type, Any] = {}
+     _lock = threading.Lock()
+
+     def __call__(cls, *args, **kwargs):
+         if cls not in cls._instances:
+             with cls._lock:
+                 if cls not in cls._instances:
+                     instance = super().__call__(*args, **kwargs)
+                     cls._instances[cls] = instance
+         return cls._instances[cls]
+
+
+ def singleton(cls):
+     """
+     Singleton decorator
+
+     Args:
+         cls: the class to decorate
+
+     Returns:
+         The decorated class, guaranteed to have only one instance
+     """
+     instances = {}
+     lock = threading.Lock()
+
+     def get_instance(*args, **kwargs):
+         if cls not in instances:
+             with lock:
+                 if cls not in instances:
+                     instances[cls] = cls(*args, **kwargs)
+         return instances[cls]
+
+     return get_instance
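
A brief sketch of the two singleton mechanisms added above; the import path mirrors crawlo/utils/singleton.py and both classes are invented examples.

from crawlo.utils.singleton import SingletonMeta, singleton


@singleton
class ConfigRegistry:
    def __init__(self):
        self.values = {}


class Coordinator(metaclass=SingletonMeta):
    pass


assert ConfigRegistry() is ConfigRegistry()  # the decorator returns a factory that caches a single instance
assert Coordinator() is Coordinator()        # the metaclass intercepts __call__ and reuses the instance
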
crawlo/utils/spider_loader.py
@@ -9,7 +9,7 @@ from crawlo.interfaces import ISpiderLoader
  from crawlo.settings.setting_manager import SettingManager
  from crawlo.spider import Spider
  from crawlo.network.request import Request
- from crawlo.utils.log import get_logger
+ from crawlo.logging import get_logger

  logger = get_logger(__name__)

crawlo/utils/text_helper.py
@@ -3,7 +3,7 @@ import json
  import re
  from typing import Any, Union, List, Dict, Tuple, Optional

- from crawlo.utils.log import get_logger
+ from crawlo.logging import get_logger

  logger = get_logger(__name__)
