crawlo-1.1.2-py3-none-any.whl → crawlo-1.1.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlo has been flagged as potentially problematic.
- crawlo/__version__.py +1 -1
- crawlo/core/scheduler.py +20 -16
- crawlo/downloader/httpx_downloader.py +14 -12
- crawlo/exceptions.py +4 -0
- crawlo/extension/__init__.py +17 -10
- crawlo/extension/health_check.py +142 -0
- crawlo/extension/log_interval.py +27 -18
- crawlo/extension/log_stats.py +62 -24
- crawlo/extension/logging_extension.py +18 -9
- crawlo/extension/memory_monitor.py +89 -0
- crawlo/extension/performance_profiler.py +118 -0
- crawlo/extension/request_recorder.py +108 -0
- crawlo/filters/aioredis_filter.py +2 -2
- crawlo/middleware/retry.py +3 -3
- crawlo/network/request.py +2 -2
- crawlo/network/response.py +25 -23
- crawlo/pipelines/__init__.py +9 -0
- crawlo/pipelines/bloom_dedup_pipeline.py +157 -0
- crawlo/pipelines/database_dedup_pipeline.py +225 -0
- crawlo/pipelines/memory_dedup_pipeline.py +116 -0
- crawlo/pipelines/mongo_pipeline.py +81 -66
- crawlo/pipelines/mysql_pipeline.py +165 -43
- crawlo/pipelines/redis_dedup_pipeline.py +163 -0
- crawlo/queue/queue_manager.py +4 -0
- crawlo/queue/redis_priority_queue.py +20 -3
- crawlo/settings/default_settings.py +119 -66
- crawlo/subscriber.py +62 -37
- crawlo/templates/project/items.py.tmpl +1 -1
- crawlo/templates/project/middlewares.py.tmpl +73 -49
- crawlo/templates/project/pipelines.py.tmpl +52 -290
- crawlo/templates/project/run.py.tmpl +20 -7
- crawlo/templates/project/settings.py.tmpl +35 -3
- crawlo/templates/spider/spider.py.tmpl +1 -37
- crawlo/utils/controlled_spider_mixin.py +109 -5
- crawlo-1.1.4.dist-info/METADATA +403 -0
- {crawlo-1.1.2.dist-info → crawlo-1.1.4.dist-info}/RECORD +40 -31
- examples/controlled_spider_example.py +205 -0
- crawlo-1.1.2.dist-info/METADATA +0 -567
- {crawlo-1.1.2.dist-info → crawlo-1.1.4.dist-info}/WHEEL +0 -0
- {crawlo-1.1.2.dist-info → crawlo-1.1.4.dist-info}/entry_points.txt +0 -0
- {crawlo-1.1.2.dist-info → crawlo-1.1.4.dist-info}/top_level.txt +0 -0
crawlo/extension/performance_profiler.py
ADDED
@@ -0,0 +1,118 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+import io
+import os
+import pstats
+import asyncio
+import cProfile
+from typing import Any, Optional
+
+from crawlo.utils.log import get_logger
+from crawlo.event import spider_opened, spider_closed
+
+
+class PerformanceProfilerExtension:
+    """
+    Performance profiling extension.
+    Profiles the crawler while the spider runs to help optimize its performance.
+    """
+
+    def __init__(self, crawler: Any):
+        self.settings = crawler.settings
+        self.logger = get_logger(self.__class__.__name__, crawler.settings.get('LOG_LEVEL'))
+
+        # Read configuration parameters
+        self.enabled = self.settings.get_bool('PERFORMANCE_PROFILER_ENABLED', False)
+        self.output_dir = self.settings.get('PERFORMANCE_PROFILER_OUTPUT_DIR', 'profiling')
+        self.interval = self.settings.get_int('PERFORMANCE_PROFILER_INTERVAL', 300)  # default: 5 minutes
+
+        self.profiler: Optional[cProfile.Profile] = None
+        self.task: Optional[asyncio.Task] = None
+
+        # Create the output directory
+        if self.enabled:
+            os.makedirs(self.output_dir, exist_ok=True)
+
+    @classmethod
+    def create_instance(cls, crawler: Any) -> 'PerformanceProfilerExtension':
+        # Only create an instance when the extension is enabled in the configuration
+        if not crawler.settings.get_bool('PERFORMANCE_PROFILER_ENABLED', False):
+            from crawlo.exceptions import NotConfigured
+            raise NotConfigured("PerformanceProfilerExtension: PERFORMANCE_PROFILER_ENABLED is False")
+
+        o = cls(crawler)
+        if o.enabled:
+            crawler.subscriber.subscribe(o.spider_opened, event=spider_opened)
+            crawler.subscriber.subscribe(o.spider_closed, event=spider_closed)
+        return o
+
+    async def spider_opened(self) -> None:
+        """Start profiling when the spider opens."""
+        if not self.enabled:
+            return
+
+        self.profiler = cProfile.Profile()
+        self.profiler.enable()
+
+        # Start the task that periodically saves profiling results
+        self.task = asyncio.create_task(self._periodic_save())
+
+        self.logger.info("Performance profiler started.")
+
+    async def spider_closed(self) -> None:
+        """Stop profiling when the spider closes and save the results."""
+        if not self.enabled or not self.profiler:
+            return
+
+        # Stop the periodic save task
+        if self.task:
+            self.task.cancel()
+            try:
+                await self.task
+            except asyncio.CancelledError:
+                pass
+
+        # Stop the profiler and save the final results
+        self.profiler.disable()
+
+        # Save the profiling results
+        await self._save_profile("final")
+        self.logger.info("Performance profiler stopped and results saved.")
+
+    async def _periodic_save(self) -> None:
+        """Periodically save profiling results."""
+        counter = 1
+        while True:
+            try:
+                await asyncio.sleep(self.interval)
+                if self.profiler:
+                    # Temporarily disable the profiler to save results
+                    self.profiler.disable()
+                    await self._save_profile(f"periodic_{counter}")
+                    counter += 1
+                    # Re-enable the profiler
+                    self.profiler.enable()
+            except asyncio.CancelledError:
+                break
+            except Exception as e:
+                self.logger.error(f"Error in periodic profiling save: {e}")
+
+    async def _save_profile(self, name: str) -> None:
+        """Save profiling results to a file."""
+        try:
+            # Create an in-memory string stream
+            s = io.StringIO()
+            ps = pstats.Stats(self.profiler, stream=s)
+
+            # Sort and print the statistics
+            ps.sort_stats('cumulative')
+            ps.print_stats()
+
+            # Save to a file
+            filename = os.path.join(self.output_dir, f'profile_{name}.txt')
+            with open(filename, 'w', encoding='utf-8') as f:
+                f.write(s.getvalue())
+
+            self.logger.info(f"Performance profile saved to {filename}")
+        except Exception as e:
+            self.logger.error(f"Error saving performance profile: {e}")
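For reference, the profiler is driven entirely by settings. A minimal sketch of a project settings snippet that turns it on; the setting names come from the code above, while how extensions are registered in a crawlo project is not shown in this diff:

# settings.py (sketch)
PERFORMANCE_PROFILER_ENABLED = True            # extension raises NotConfigured when False
PERFORMANCE_PROFILER_OUTPUT_DIR = 'profiling'  # where profile_*.txt files are written
PERFORMANCE_PROFILER_INTERVAL = 300            # seconds between periodic dumps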
crawlo/extension/request_recorder.py
ADDED
@@ -0,0 +1,108 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+import os
+import json
+from typing import Any
+from datetime import datetime
+
+from crawlo import event
+from crawlo.utils.log import get_logger
+
+
+class RequestRecorderExtension:
+    """
+    Request recording extension.
+    Logs every outgoing request and received response to a file for easier debugging and analysis.
+    """
+
+    def __init__(self, crawler: Any):
+        self.settings = crawler.settings
+        self.logger = get_logger(self.__class__.__name__, crawler.settings.get('LOG_LEVEL'))
+
+        # Read configuration parameters
+        self.enabled = self.settings.get_bool('REQUEST_RECORDER_ENABLED', False)
+        self.output_dir = self.settings.get('REQUEST_RECORDER_OUTPUT_DIR', 'requests_log')
+        self.max_file_size = self.settings.get_int('REQUEST_RECORDER_MAX_FILE_SIZE', 10 * 1024 * 1024)  # default: 10 MB
+
+        # Create the output directory
+        if self.enabled:
+            os.makedirs(self.output_dir, exist_ok=True)
+
+        self.current_file = None
+        self.current_file_size = 0
+
+    @classmethod
+    def create_instance(cls, crawler: Any) -> 'RequestRecorderExtension':
+        # Only create an instance when the extension is enabled in the configuration
+        if not crawler.settings.get_bool('REQUEST_RECORDER_ENABLED', False):
+            from crawlo.exceptions import NotConfigured
+            raise NotConfigured("RequestRecorderExtension: REQUEST_RECORDER_ENABLED is False")
+
+        o = cls(crawler)
+        if o.enabled:
+            crawler.subscriber.subscribe(o.request_scheduled, event=event.request_scheduled)
+            crawler.subscriber.subscribe(o.response_received, event=event.response_received)
+            crawler.subscriber.subscribe(o.spider_closed, event=event.spider_closed)
+        return o
+
+    async def request_scheduled(self, request: Any, spider: Any) -> None:
+        """Record a scheduled request."""
+        if not self.enabled:
+            return
+
+        try:
+            request_info = {
+                'timestamp': datetime.now().isoformat(),
+                'type': 'request',
+                'url': request.url,
+                'method': request.method,
+                'headers': dict(request.headers),
+                'meta': getattr(request, 'meta', {}),
+            }
+
+            await self._write_record(request_info)
+        except Exception as e:
+            self.logger.error(f"Error recording request: {e}")
+
+    async def response_received(self, response: Any, spider: Any) -> None:
+        """Record a received response."""
+        if not self.enabled:
+            return
+
+        try:
+            response_info = {
+                'timestamp': datetime.now().isoformat(),
+                'type': 'response',
+                'url': response.url,
+                'status_code': response.status_code,
+                'headers': dict(response.headers),
+            }
+
+            await self._write_record(response_info)
+        except Exception as e:
+            self.logger.error(f"Error recording response: {e}")
+
+    async def spider_closed(self, spider: Any) -> None:
+        """Clean up resources when the spider closes."""
+        if self.current_file:
+            self.current_file.close()
+            self.current_file = None
+        self.logger.info("Request recorder closed.")
+
+    async def _write_record(self, record: dict) -> None:
+        """Write a record to the log file."""
+        # Check whether a new file needs to be created
+        if not self.current_file or self.current_file_size > self.max_file_size:
+            if self.current_file:
+                self.current_file.close()
+
+            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+            filename = os.path.join(self.output_dir, f'requests_{timestamp}.jsonl')
+            self.current_file = open(filename, 'a', encoding='utf-8')
+            self.current_file_size = 0
+
+        # Write the record
+        line = json.dumps(record, ensure_ascii=False) + '\n'
+        self.current_file.write(line)
+        self.current_file.flush()
+        self.current_file_size += len(line.encode('utf-8'))
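The recorder writes one JSON object per line (JSON Lines). A minimal sketch of enabling it and reading the rotated logs back, using the setting names and file layout from the code above; the output path depends on the working directory:

# settings.py (sketch)
REQUEST_RECORDER_ENABLED = True
REQUEST_RECORDER_OUTPUT_DIR = 'requests_log'
REQUEST_RECORDER_MAX_FILE_SIZE = 10 * 1024 * 1024  # rotate after ~10 MB

# Reading the logs back:
import glob
import json

for path in glob.glob('requests_log/requests_*.jsonl'):
    with open(path, encoding='utf-8') as f:
        for line in f:
            record = json.loads(line)  # {'type': 'request' | 'response', 'url': ..., ...}
            print(record['type'], record['url'])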
crawlo/filters/aioredis_filter.py
CHANGED
@@ -11,8 +11,8 @@ Redis filter implementation
 - High performance: optimized with Redis pipelines
 - Fault tolerant: automatic retry on network errors
 """
+import redis.asyncio as aioredis
 from typing import Optional
-from redis import asyncio as aioredis
 from crawlo.filters import BaseFilter
 from crawlo.utils.log import get_logger
 from crawlo.utils.request import request_fingerprint
@@ -239,4 +239,4 @@ class AioRedisFilter(BaseFilter):
            await self.redis.close()
            self.logger.debug("Redis connection closed")
        except Exception as e:
-            self.logger.warning(f"Error while closing Redis: {e}")
+            self.logger.warning(f"Error while closing Redis: {e}")
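The import change is stylistic: in redis-py 4.2+ (where the former aioredis project lives on as redis.asyncio), both spellings bind the same module, so the filter's behaviour does not change. The second hunk only touches the trailing newline of the file.

import redis.asyncio as a
from redis import asyncio as b

assert a is b  # same module object either way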
crawlo/middleware/retry.py
CHANGED
@@ -1,7 +1,7 @@
 #!/usr/bin/python
 # -*- coding:UTF-8 -*-
 from typing import List
-
+import asyncio

 try:
     from anyio import EndOfStream
@@ -49,7 +49,7 @@ from crawlo.stats_collector import StatsCollector
 _retry_exceptions = [
     EndOfStream,
     ReadError,
-    TimeoutError,
+    asyncio.TimeoutError,
     ConnectError,
     ReadTimeout,
     ClientConnectorError,
@@ -122,4 +122,4 @@ class RetryMiddleware(object):
            return request
        else:
            self.logger.warning(f"{spider} {request} {reason} retry max {self.max_retry_times} times, give up.")
-            return None
+            return None
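Swapping the builtin TimeoutError for asyncio.TimeoutError in the retry list matters on Python 3.8 through 3.10, where the two are distinct classes (they only became aliases in 3.11), so asyncio timeouts previously slipped past the retry check:

import sys
import asyncio

# Prints True on Python 3.11+, False on 3.8-3.10
print(sys.version_info[:2], asyncio.TimeoutError is TimeoutError)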
crawlo/network/request.py
CHANGED
@@ -16,7 +16,6 @@ from w3lib.url import safe_url_string
 from typing import Dict, Optional, Callable, Union, Any, TypeVar, List

 from crawlo.utils.url import escape_ajax
-from crawlo.utils.log import get_logger


 _Request = TypeVar("_Request", bound="Request")
@@ -176,7 +175,8 @@
         self.dont_filter = dont_filter
         self._set_url(url)

-
+    @staticmethod
+    def _safe_deepcopy_meta(meta: Dict[str, Any]) -> Dict[str, Any]:
         """Safely deepcopy meta: strip the logger before copying."""
         import logging

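The body of _safe_deepcopy_meta is not part of this hunk; per its docstring, the point is that logging.Logger objects (which hold thread locks) cannot be deep-copied, so they have to be set aside first. A hypothetical standalone sketch of that pattern, not the actual crawlo implementation:

import copy
import logging
from typing import Any, Dict

def safe_deepcopy_meta(meta: Dict[str, Any]) -> Dict[str, Any]:
    # Split out values that cannot be deep-copied (loggers carry locks).
    loggers = {k: v for k, v in meta.items() if isinstance(v, logging.Logger)}
    copyable = {k: v for k, v in meta.items() if k not in loggers}
    copied = copy.deepcopy(copyable)
    copied.update(loggers)  # re-attach the original logger references
    return copied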
crawlo/network/response.py
CHANGED
@@ -32,14 +32,14 @@ class Response:
     """

     def __init__(
-
-
-
-
-
-
-
-
+            self,
+            url: str,
+            *,
+            headers: Dict[str, Any] = None,
+            body: bytes = b"",
+            method: str = 'GET',
+            request: 'Request' = None,  # string annotation to avoid a circular import
+            status_code: int = 200,
     ):
         # Basic attributes
         self.url = url
@@ -48,15 +48,15 @@
         self.method = method.upper()
         self.request = request
         self.status_code = status_code
-
+
         # Encoding handling
         self.encoding = self._determine_encoding()
-
+
         # Cached attributes
         self._text_cache = None
         self._json_cache = None
         self._selector_instance = None
-
+
         # Status flags
         self._is_success = 200 <= status_code < 300
         self._is_redirect = 300 <= status_code < 400
@@ -68,14 +68,14 @@
         # 1. Prefer the encoding set on the request
         if self.request and self.request.encoding:
             return self.request.encoding
-
+
         # 2. Detect from the Content-Type header
         content_type = self.headers.get("content-type", "") or self.headers.get("Content-Type", "")
         if content_type:
             charset_match = re.search(r"charset=([w-]+)", content_type, re.I)
             if charset_match:
                 return charset_match.group(1).lower()
-
+
         # 3. Detect from the HTML meta tag (HTML content only)
         if b'<html' in self.body[:1024].lower():
             # Look for <meta charset="xxx"> or <meta http-equiv="Content-Type" content="...charset=xxx">
@@ -86,16 +86,17 @@
                 charset_match = re.search(r'<meta[^>]+charset=["\']?([\w-]+)', html_text, re.I)
                 if charset_match:
                     return charset_match.group(1).lower()
-
+
                 # <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
                 content_match = re.search(r'<meta[^>]+content=["\'][^"\'>]*charset=([\w-]+)', html_text, re.I)
                 if content_match:
                     return content_match.group(1).lower()
             except Exception:
                 pass
-
+
         # 4. Fall back to utf-8
         return 'utf-8'
+
     @property
     def text(self) -> str:
         """Decode the response body into a string with the detected encoding, caching the result."""
@@ -115,7 +116,7 @@
             if 'gb2312' not in encodings_to_try:
                 encodings_to_try.append('gb2312')
             encodings_to_try.append('latin1')  # last-resort fallback
-
+
         for encoding in encodings_to_try:
             if not encoding:
                 continue
@@ -124,7 +125,7 @@
                 return self._text_cache
             except (UnicodeDecodeError, LookupError):
                 continue
-
+
         # All encodings failed; fall back to lenient decoding
         try:
             self._text_cache = self.body.decode('utf-8', errors='replace')
@@ -136,37 +137,38 @@
     def is_success(self) -> bool:
         """Whether the response is successful (2xx)."""
         return self._is_success
-
+
     @property
     def is_redirect(self) -> bool:
         """Whether the response is a redirect (3xx)."""
         return self._is_redirect
-
+
     @property
     def is_client_error(self) -> bool:
         """Whether the response is a client error (4xx)."""
         return self._is_client_error
-
+
     @property
     def is_server_error(self) -> bool:
         """Whether the response is a server error (5xx)."""
         return self._is_server_error
-
+
     @property
     def content_type(self) -> str:
         """The response Content-Type."""
         return self.headers.get('content-type', '') or self.headers.get('Content-Type', '')
-
+
     @property
     def content_length(self) -> Optional[int]:
         """The response Content-Length."""
         length = self.headers.get('content-length') or self.headers.get('Content-Length')
         return int(length) if length else None
+
     def json(self, default: Any = None) -> Any:
         """Parse the response text as a JSON object."""
         if self._json_cache is not None:
             return self._json_cache
-
+
         try:
             self._json_cache = ujson.loads(self.text)
             return self._json_cache
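Most of this change is whitespace cleanup plus a reformatted constructor; the * in the new signature makes everything after url keyword-only. A usage sketch under that assumption, with the import path inferred from the file location:

from crawlo.network.response import Response  # path inferred from this diff

resp = Response(
    "https://example.com/api",
    headers={"Content-Type": "application/json; charset=utf-8"},
    body=b'{"ok": true}',
    status_code=200,
)
assert resp.is_success   # 2xx flag set in __init__
print(resp.encoding)     # 'utf-8'
print(resp.json())       # {'ok': True}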
crawlo/pipelines/__init__.py
CHANGED
@@ -11,3 +11,12 @@ class BasePipeline:
     @classmethod
     def create_instance(cls, crawler):
         return cls()
+
+
+# Re-export the deduplication pipelines
+from .memory_dedup_pipeline import MemoryDedupPipeline
+from .redis_dedup_pipeline import RedisDedupPipeline
+from .bloom_dedup_pipeline import BloomDedupPipeline
+from .database_dedup_pipeline import DatabaseDedupPipeline
+
+__all__ = ['BasePipeline', 'MemoryDedupPipeline', 'RedisDedupPipeline', 'BloomDedupPipeline', 'DatabaseDedupPipeline']
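With these re-exports in place, the new dedup pipelines can be imported directly from the package:

from crawlo.pipelines import BloomDedupPipeline, RedisDedupPipeline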
crawlo/pipelines/bloom_dedup_pipeline.py
ADDED
@@ -0,0 +1,157 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+Bloom-filter-based item deduplication pipeline
+=============================
+Provides efficient deduplication for large-scale crawls, using a probabilistic data structure to save memory.
+
+Features:
+- Memory efficient: uses far less memory than a conventional set
+- High performance: fast insert and lookup operations
+- Scalable: capacity and false-positive rate are configurable
+- Broadly applicable: especially suited to large-scale data collection
+
+Note: Bloom filters have a false-positive rate and may wrongly drop items that have never been seen.
+"""
+
+import hashlib
+try:
+    from pybloom_live import BloomFilter
+    BLOOM_FILTER_AVAILABLE = True
+except ImportError:
+    # Fall back to a simple stand-in when pybloom_live is not installed
+    BLOOM_FILTER_AVAILABLE = False
+
+    class BloomFilter:
+        def __init__(self, capacity, error_rate):
+            self._data = set()
+
+        def add(self, item):
+            if item in self._data:
+                return False
+            else:
+                self._data.add(item)
+                return True
+
+        def __contains__(self, item):
+            return item in self._data
+
+from crawlo import Item
+from crawlo.spider import Spider
+from crawlo.utils.log import get_logger
+from crawlo.exceptions import DropItem
+
+
+class BloomDedupPipeline:
+    """Bloom-filter-based item deduplication pipeline."""
+
+    def __init__(
+        self,
+        capacity: int = 1000000,
+        error_rate: float = 0.001,
+        log_level: str = "INFO"
+    ):
+        """
+        Initialize the Bloom filter deduplication pipeline.
+
+        :param capacity: expected number of stored elements
+        :param error_rate: false-positive rate (e.g. 0.001 means 0.1%)
+        :param log_level: log level
+        """
+        self.logger = get_logger(self.__class__.__name__, log_level)
+
+        # Initialize the Bloom filter
+        try:
+            self.bloom_filter = BloomFilter(capacity=capacity, error_rate=error_rate)
+            self.logger.info(f"Bloom filter dedup pipeline initialized (capacity: {capacity}, error rate: {error_rate})")
+        except Exception as e:
+            self.logger.error(f"Bloom filter initialization failed: {e}")
+            raise RuntimeError(f"Bloom filter initialization failed: {e}")
+
+        self.capacity = capacity
+        self.error_rate = error_rate
+        self.dropped_count = 0
+        self.added_count = 0
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Create a pipeline instance from the crawler settings."""
+        settings = crawler.settings
+
+        return cls(
+            capacity=settings.getint('BLOOM_FILTER_CAPACITY', 1000000),
+            error_rate=settings.getfloat('BLOOM_FILTER_ERROR_RATE', 0.001),
+            log_level=settings.get('LOG_LEVEL', 'INFO')
+        )
+
+    def process_item(self, item: Item, spider: Spider) -> Item:
+        """
+        Process an item and check it for duplicates.
+
+        :param item: the item to process
+        :param spider: the spider instance
+        :return: the processed item, or raises DropItem
+        """
+        try:
+            # Generate the item fingerprint
+            fingerprint = self._generate_item_fingerprint(item)
+
+            # Check whether the fingerprint already exists
+            if fingerprint in self.bloom_filter:
+                # Probably seen before (Bloom filters can give false positives), so drop the item
+                self.dropped_count += 1
+                self.logger.debug(f"Possibly dropping duplicate item: {fingerprint[:20]}...")
+                raise DropItem(f"Possibly duplicate item: {fingerprint}")
+            else:
+                # Add the fingerprint to the Bloom filter
+                self.bloom_filter.add(fingerprint)
+                self.added_count += 1
+                self.logger.debug(f"Processing new item: {fingerprint[:20]}...")
+                return item
+
+        except Exception as e:
+            self.logger.error(f"Error while processing item: {e}")
+            # Keep going on errors to avoid losing data
+            return item
+
+    def _generate_item_fingerprint(self, item: Item) -> str:
+        """
+        Generate an item fingerprint.
+
+        A unique fingerprint is built from all fields of the item and used for deduplication.
+
+        :param item: the item
+        :return: fingerprint string
+        """
+        # Convert the item to a serializable dict
+        try:
+            item_dict = item.to_dict()
+        except AttributeError:
+            # Compatibility with Item implementations without to_dict()
+            item_dict = dict(item)
+
+        # Sort the dict to keep the fingerprint stable
+        sorted_items = sorted(item_dict.items())
+
+        # Build the fingerprint string
+        fingerprint_string = '|'.join([f"{k}={v}" for k, v in sorted_items if v is not None])
+
+        # Use SHA256 to produce a fixed-length fingerprint
+        return hashlib.sha256(fingerprint_string.encode('utf-8')).hexdigest()
+
+    def close_spider(self, spider: Spider) -> None:
+        """
+        Clean-up work when the spider closes.
+
+        :param spider: the spider instance
+        """
+        self.logger.info(f"Spider {spider.name} closed:")
+        self.logger.info(f" - items processed: {self.added_count}")
+        self.logger.info(f" - possibly duplicate items dropped: {self.dropped_count}")
+
+        if BLOOM_FILTER_AVAILABLE:
+            # Note: a Bloom filter cannot report an exact element count
+            self.logger.info(f" - Bloom filter capacity: {self.capacity}")
+            self.logger.info(f" - Bloom filter error rate: {self.error_rate}")
+        else:
+            self.logger.warning(" - pybloom_live not installed, falling back to an in-memory set")
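A sketch of configuring the pipeline and of what "memory efficient" means in practice. The BLOOM_FILTER_* settings come from from_crawler above; how pipelines are registered in a crawlo project is not shown in this diff, so check the project template for the exact key. For a standard Bloom filter, the bit-array size is about n * ln(1/p) / ln(2)^2 bits for capacity n and error rate p, far less than a Python set of SHA-256 hex digests for the same items:

import math

# settings.py (sketch)
BLOOM_FILTER_CAPACITY = 1_000_000
BLOOM_FILTER_ERROR_RATE = 0.001

# Back-of-the-envelope size of the underlying bit array:
n, p = BLOOM_FILTER_CAPACITY, BLOOM_FILTER_ERROR_RATE
bits = -n * math.log(p) / (math.log(2) ** 2)
print(f"{bits / 8 / 1024 / 1024:.1f} MiB")  # ~1.7 MiB, about 14.4 bits per element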