crawlo-1.1.2-py3-none-any.whl → crawlo-1.1.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of crawlo might be problematic.

Files changed (41)
  1. crawlo/__version__.py +1 -1
  2. crawlo/core/scheduler.py +20 -16
  3. crawlo/downloader/httpx_downloader.py +14 -12
  4. crawlo/exceptions.py +4 -0
  5. crawlo/extension/__init__.py +17 -10
  6. crawlo/extension/health_check.py +142 -0
  7. crawlo/extension/log_interval.py +27 -18
  8. crawlo/extension/log_stats.py +62 -24
  9. crawlo/extension/logging_extension.py +18 -9
  10. crawlo/extension/memory_monitor.py +89 -0
  11. crawlo/extension/performance_profiler.py +118 -0
  12. crawlo/extension/request_recorder.py +108 -0
  13. crawlo/filters/aioredis_filter.py +2 -2
  14. crawlo/middleware/retry.py +3 -3
  15. crawlo/network/request.py +2 -2
  16. crawlo/network/response.py +25 -23
  17. crawlo/pipelines/__init__.py +9 -0
  18. crawlo/pipelines/bloom_dedup_pipeline.py +157 -0
  19. crawlo/pipelines/database_dedup_pipeline.py +225 -0
  20. crawlo/pipelines/memory_dedup_pipeline.py +116 -0
  21. crawlo/pipelines/mongo_pipeline.py +81 -66
  22. crawlo/pipelines/mysql_pipeline.py +165 -43
  23. crawlo/pipelines/redis_dedup_pipeline.py +163 -0
  24. crawlo/queue/queue_manager.py +4 -0
  25. crawlo/queue/redis_priority_queue.py +20 -3
  26. crawlo/settings/default_settings.py +119 -66
  27. crawlo/subscriber.py +62 -37
  28. crawlo/templates/project/items.py.tmpl +1 -1
  29. crawlo/templates/project/middlewares.py.tmpl +73 -49
  30. crawlo/templates/project/pipelines.py.tmpl +52 -290
  31. crawlo/templates/project/run.py.tmpl +20 -7
  32. crawlo/templates/project/settings.py.tmpl +35 -3
  33. crawlo/templates/spider/spider.py.tmpl +1 -37
  34. crawlo/utils/controlled_spider_mixin.py +109 -5
  35. crawlo-1.1.4.dist-info/METADATA +403 -0
  36. {crawlo-1.1.2.dist-info → crawlo-1.1.4.dist-info}/RECORD +40 -31
  37. examples/controlled_spider_example.py +205 -0
  38. crawlo-1.1.2.dist-info/METADATA +0 -567
  39. {crawlo-1.1.2.dist-info → crawlo-1.1.4.dist-info}/WHEEL +0 -0
  40. {crawlo-1.1.2.dist-info → crawlo-1.1.4.dist-info}/entry_points.txt +0 -0
  41. {crawlo-1.1.2.dist-info → crawlo-1.1.4.dist-info}/top_level.txt +0 -0
crawlo/extension/performance_profiler.py ADDED
@@ -0,0 +1,118 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ import io
+ import os
+ import pstats
+ import asyncio
+ import cProfile
+ from typing import Any, Optional
+
+ from crawlo.utils.log import get_logger
+ from crawlo.event import spider_opened, spider_closed
+
+
+ class PerformanceProfilerExtension:
+     """
+     Performance profiling extension.
+     Profiles the crawler while it runs, to help tune spider performance.
+     """
+
+     def __init__(self, crawler: Any):
+         self.settings = crawler.settings
+         self.logger = get_logger(self.__class__.__name__, crawler.settings.get('LOG_LEVEL'))
+
+         # Read configuration
+         self.enabled = self.settings.get_bool('PERFORMANCE_PROFILER_ENABLED', False)
+         self.output_dir = self.settings.get('PERFORMANCE_PROFILER_OUTPUT_DIR', 'profiling')
+         self.interval = self.settings.get_int('PERFORMANCE_PROFILER_INTERVAL', 300)  # default: 5 minutes
+
+         self.profiler: Optional[cProfile.Profile] = None
+         self.task: Optional[asyncio.Task] = None
+
+         # Create the output directory
+         if self.enabled:
+             os.makedirs(self.output_dir, exist_ok=True)
+
+     @classmethod
+     def create_instance(cls, crawler: Any) -> 'PerformanceProfilerExtension':
+         # Only create an instance when the feature is enabled in settings
+         if not crawler.settings.get_bool('PERFORMANCE_PROFILER_ENABLED', False):
+             from crawlo.exceptions import NotConfigured
+             raise NotConfigured("PerformanceProfilerExtension: PERFORMANCE_PROFILER_ENABLED is False")
+
+         o = cls(crawler)
+         if o.enabled:
+             crawler.subscriber.subscribe(o.spider_opened, event=spider_opened)
+             crawler.subscriber.subscribe(o.spider_closed, event=spider_closed)
+         return o
+
+     async def spider_opened(self) -> None:
+         """Start profiling when the spider opens."""
+         if not self.enabled:
+             return
+
+         self.profiler = cProfile.Profile()
+         self.profiler.enable()
+
+         # Start the task that periodically saves profiling results
+         self.task = asyncio.create_task(self._periodic_save())
+
+         self.logger.info("Performance profiler started.")
+
+     async def spider_closed(self) -> None:
+         """Stop profiling and save the results when the spider closes."""
+         if not self.enabled or not self.profiler:
+             return
+
+         # Cancel the periodic-save task
+         if self.task:
+             self.task.cancel()
+             try:
+                 await self.task
+             except asyncio.CancelledError:
+                 pass
+
+         # Stop the profiler and save the final results
+         self.profiler.disable()
+
+         # Save the profiling results
+         await self._save_profile("final")
+         self.logger.info("Performance profiler stopped and results saved.")
+
+     async def _periodic_save(self) -> None:
+         """Periodically save profiling results."""
+         counter = 1
+         while True:
+             try:
+                 await asyncio.sleep(self.interval)
+                 if self.profiler:
+                     # Temporarily disable the profiler while saving
+                     self.profiler.disable()
+                     await self._save_profile(f"periodic_{counter}")
+                     counter += 1
+                     # Re-enable the profiler
+                     self.profiler.enable()
+             except asyncio.CancelledError:
+                 break
+             except Exception as e:
+                 self.logger.error(f"Error in periodic profiling save: {e}")
+
+     async def _save_profile(self, name: str) -> None:
+         """Save profiling results to a file."""
+         try:
+             # Collect the stats into an in-memory string buffer
+             s = io.StringIO()
+             ps = pstats.Stats(self.profiler, stream=s)
+
+             # Sort and print the statistics
+             ps.sort_stats('cumulative')
+             ps.print_stats()
+
+             # Write to a file
+             filename = os.path.join(self.output_dir, f'profile_{name}.txt')
+             with open(filename, 'w', encoding='utf-8') as f:
+                 f.write(s.getvalue())
+
+             self.logger.info(f"Performance profile saved to {filename}")
+         except Exception as e:
+             self.logger.error(f"Error saving performance profile: {e}")
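The extension is opt-in: create_instance raises NotConfigured unless PERFORMANCE_PROFILER_ENABLED is set. A minimal sketch of the settings that would switch it on, using only the keys read in __init__ above (placing them in a project settings.py is an assumption about the project layout):

# Hypothetical project settings.py snippet -- only the keys PerformanceProfilerExtension reads.
PERFORMANCE_PROFILER_ENABLED = True            # default is False, so the extension never loads unless set
PERFORMANCE_PROFILER_OUTPUT_DIR = "profiling"  # profile_final.txt / profile_periodic_N.txt are written here
PERFORMANCE_PROFILER_INTERVAL = 300            # seconds between periodic snapshots (default: 5 minutes)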
crawlo/extension/request_recorder.py ADDED
@@ -0,0 +1,108 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ import os
+ import json
+ from typing import Any
+ from datetime import datetime
+
+ from crawlo import event
+ from crawlo.utils.log import get_logger
+
+
+ class RequestRecorderExtension:
+     """
+     Request recording extension.
+     Writes every request (and response) to a file for debugging and analysis.
+     """
+
+     def __init__(self, crawler: Any):
+         self.settings = crawler.settings
+         self.logger = get_logger(self.__class__.__name__, crawler.settings.get('LOG_LEVEL'))
+
+         # Read configuration
+         self.enabled = self.settings.get_bool('REQUEST_RECORDER_ENABLED', False)
+         self.output_dir = self.settings.get('REQUEST_RECORDER_OUTPUT_DIR', 'requests_log')
+         self.max_file_size = self.settings.get_int('REQUEST_RECORDER_MAX_FILE_SIZE', 10 * 1024 * 1024)  # default: 10 MB
+
+         # Create the output directory
+         if self.enabled:
+             os.makedirs(self.output_dir, exist_ok=True)
+
+         self.current_file = None
+         self.current_file_size = 0
+
+     @classmethod
+     def create_instance(cls, crawler: Any) -> 'RequestRecorderExtension':
+         # Only create an instance when the feature is enabled in settings
+         if not crawler.settings.get_bool('REQUEST_RECORDER_ENABLED', False):
+             from crawlo.exceptions import NotConfigured
+             raise NotConfigured("RequestRecorderExtension: REQUEST_RECORDER_ENABLED is False")
+
+         o = cls(crawler)
+         if o.enabled:
+             crawler.subscriber.subscribe(o.request_scheduled, event=event.request_scheduled)
+             crawler.subscriber.subscribe(o.response_received, event=event.response_received)
+             crawler.subscriber.subscribe(o.spider_closed, event=event.spider_closed)
+         return o
+
+     async def request_scheduled(self, request: Any, spider: Any) -> None:
+         """Record a scheduled request."""
+         if not self.enabled:
+             return
+
+         try:
+             request_info = {
+                 'timestamp': datetime.now().isoformat(),
+                 'type': 'request',
+                 'url': request.url,
+                 'method': request.method,
+                 'headers': dict(request.headers),
+                 'meta': getattr(request, 'meta', {}),
+             }
+
+             await self._write_record(request_info)
+         except Exception as e:
+             self.logger.error(f"Error recording request: {e}")
+
+     async def response_received(self, response: Any, spider: Any) -> None:
+         """Record a received response."""
+         if not self.enabled:
+             return
+
+         try:
+             response_info = {
+                 'timestamp': datetime.now().isoformat(),
+                 'type': 'response',
+                 'url': response.url,
+                 'status_code': response.status_code,
+                 'headers': dict(response.headers),
+             }
+
+             await self._write_record(response_info)
+         except Exception as e:
+             self.logger.error(f"Error recording response: {e}")
+
+     async def spider_closed(self, spider: Any) -> None:
+         """Release resources when the spider closes."""
+         if self.current_file:
+             self.current_file.close()
+             self.current_file = None
+         self.logger.info("Request recorder closed.")
+
+     async def _write_record(self, record: dict) -> None:
+         """Write one record to the log file."""
+         # Check whether a new file needs to be opened
+         if not self.current_file or self.current_file_size > self.max_file_size:
+             if self.current_file:
+                 self.current_file.close()
+
+             timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+             filename = os.path.join(self.output_dir, f'requests_{timestamp}.jsonl')
+             self.current_file = open(filename, 'a', encoding='utf-8')
+             self.current_file_size = 0
+
+         # Write the record
+         line = json.dumps(record, ensure_ascii=False) + '\n'
+         self.current_file.write(line)
+         self.current_file.flush()
+         self.current_file_size += len(line.encode('utf-8'))
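The recorder writes one JSON object per line (JSON Lines) and rolls over to a new requests_<timestamp>.jsonl file once REQUEST_RECORDER_MAX_FILE_SIZE is exceeded. A minimal, hypothetical sketch of replaying those logs offline; the directory name matches the REQUEST_RECORDER_OUTPUT_DIR default shown above:

import json
from pathlib import Path

# Read back the JSON Lines written by RequestRecorderExtension._write_record().
# "requests_log" is the REQUEST_RECORDER_OUTPUT_DIR default.
for path in sorted(Path("requests_log").glob("requests_*.jsonl")):
    with path.open(encoding="utf-8") as f:
        for line in f:
            record = json.loads(line)
            # Both record types carry 'timestamp', 'type' and 'url'.
            print(record["timestamp"], record["type"], record["url"])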
crawlo/filters/aioredis_filter.py CHANGED
@@ -11,8 +11,8 @@ Redis filter implementation
  - High performance: optimised with Redis pipelines
  - Fault tolerant: automatic retry on network errors
  """
+ import redis.asyncio as aioredis
  from typing import Optional
- from redis import asyncio as aioredis
  from crawlo.filters import BaseFilter
  from crawlo.utils.log import get_logger
  from crawlo.utils.request import request_fingerprint
@@ -239,4 +239,4 @@ class AioRedisFilter(BaseFilter):
              await self.redis.close()
              self.logger.debug("Redis连接已关闭")
          except Exception as e:
-             self.logger.warning(f"Redis关闭时出错:{e}")
+             self.logger.warning(f"Redis关闭时出错:{e}")
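The only functional change here is the import style: import redis.asyncio as aioredis and from redis import asyncio as aioredis resolve to the same asyncio client in redis-py 4.2+. A minimal sketch under that assumption, with a local Redis server and an illustrative key name:

import asyncio
import redis.asyncio as aioredis  # equivalent to: from redis import asyncio as aioredis

async def main():
    # Assumes a Redis server on localhost:6379; the key name is illustrative only.
    client = aioredis.Redis(host="localhost", port=6379, db=0)
    await client.sadd("crawlo:fingerprints", "example-fingerprint")
    print(await client.sismember("crawlo:fingerprints", "example-fingerprint"))
    await client.close()

asyncio.run(main())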
crawlo/middleware/retry.py CHANGED
@@ -1,7 +1,7 @@
  #!/usr/bin/python
  # -*- coding:UTF-8 -*-
  from typing import List
- from asyncio.exceptions import TimeoutError
+ import asyncio

  try:
      from anyio import EndOfStream
@@ -49,7 +49,7 @@ from crawlo.stats_collector import StatsCollector
  _retry_exceptions = [
      EndOfStream,
      ReadError,
-     TimeoutError,
+     asyncio.TimeoutError,
      ConnectError,
      ReadTimeout,
      ClientConnectorError,
@@ -122,4 +122,4 @@ class RetryMiddleware(object):
              return request
          else:
              self.logger.warning(f"{spider} {request} {reason} retry max {self.max_retry_times} times, give up.")
-             return None
+             return None
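Switching the retry list from the bare TimeoutError import to asyncio.TimeoutError makes the intent explicit; note that since Python 3.11 asyncio.TimeoutError is an alias of the built-in TimeoutError, so both spellings catch the same exception. A minimal illustration:

import asyncio

async def main():
    try:
        # wait_for() raises asyncio.TimeoutError when the awaited call overruns the timeout.
        await asyncio.wait_for(asyncio.sleep(10), timeout=0.1)
    except asyncio.TimeoutError:
        print("timed out - the kind of error RetryMiddleware treats as retryable")

asyncio.run(main())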
crawlo/network/request.py CHANGED
@@ -16,7 +16,6 @@ from w3lib.url import safe_url_string
  from typing import Dict, Optional, Callable, Union, Any, TypeVar, List

  from crawlo.utils.url import escape_ajax
- from crawlo.utils.log import get_logger


  _Request = TypeVar("_Request", bound="Request")
@@ -176,7 +175,8 @@ class Request:
          self.dont_filter = dont_filter
          self._set_url(url)

-     def _safe_deepcopy_meta(self, meta: Dict[str, Any]) -> Dict[str, Any]:
+     @staticmethod
+     def _safe_deepcopy_meta(meta: Dict[str, Any]) -> Dict[str, Any]:
          """Safely deepcopy meta: strip logger objects before copying."""
          import logging

crawlo/network/response.py CHANGED
@@ -32,14 +32,14 @@
      """

      def __init__(
-         self,
-         url: str,
-         *,
-         headers: Dict[str, Any] = None,
-         body: bytes = b"",
-         method: str = 'GET',
-         request: 'Request' = None,  # string annotation to avoid a circular import
-         status_code: int = 200,
+         self,
+         url: str,
+         *,
+         headers: Dict[str, Any] = None,
+         body: bytes = b"",
+         method: str = 'GET',
+         request: 'Request' = None,  # string annotation to avoid a circular import
+         status_code: int = 200,
      ):
          # Basic attributes
          self.url = url
@@ -48,15 +48,15 @@
          self.method = method.upper()
          self.request = request
          self.status_code = status_code
-
+
          # Encoding handling
          self.encoding = self._determine_encoding()
-
+
          # Cached values
          self._text_cache = None
          self._json_cache = None
          self._selector_instance = None
-
+
          # Status flags
          self._is_success = 200 <= status_code < 300
          self._is_redirect = 300 <= status_code < 400
@@ -68,14 +68,14 @@
          # 1. Prefer the encoding set on the request
          if self.request and self.request.encoding:
              return self.request.encoding
-
+
          # 2. Detect from the Content-Type header
          content_type = self.headers.get("content-type", "") or self.headers.get("Content-Type", "")
          if content_type:
              charset_match = re.search(r"charset=([\w-]+)", content_type, re.I)
              if charset_match:
                  return charset_match.group(1).lower()
-
+
          # 3. Detect from HTML meta tags (HTML content only)
          if b'<html' in self.body[:1024].lower():
              # Look for <meta charset="xxx"> or <meta http-equiv="Content-Type" content="...charset=xxx">
@@ -86,16 +86,17 @@
                  charset_match = re.search(r'<meta[^>]+charset=["\']?([\w-]+)', html_text, re.I)
                  if charset_match:
                      return charset_match.group(1).lower()
-
+
                  # <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
                  content_match = re.search(r'<meta[^>]+content=["\'][^"\'>]*charset=([\w-]+)', html_text, re.I)
                  if content_match:
                      return content_match.group(1).lower()
              except Exception:
                  pass
-
+
          # 4. Fall back to utf-8
          return 'utf-8'
+
      @property
      def text(self) -> str:
          """Decode the response body to a string using the detected encoding and cache the result."""
@@ -115,7 +116,7 @@
          if 'gb2312' not in encodings_to_try:
              encodings_to_try.append('gb2312')
          encodings_to_try.append('latin1')  # final fallback option
-
+
          for encoding in encodings_to_try:
              if not encoding:
                  continue
@@ -124,7 +125,7 @@
                  return self._text_cache
              except (UnicodeDecodeError, LookupError):
                  continue
-
+
          # All encodings failed; fall back to lossy decoding
          try:
              self._text_cache = self.body.decode('utf-8', errors='replace')
@@ -136,37 +137,38 @@
      def is_success(self) -> bool:
          """Whether the response succeeded (2xx)."""
          return self._is_success
-
+
      @property
      def is_redirect(self) -> bool:
          """Whether the response is a redirect (3xx)."""
          return self._is_redirect
-
+
      @property
      def is_client_error(self) -> bool:
          """Whether the response is a client error (4xx)."""
          return self._is_client_error
-
+
      @property
      def is_server_error(self) -> bool:
          """Whether the response is a server error (5xx)."""
          return self._is_server_error
-
+
      @property
      def content_type(self) -> str:
          """Return the response Content-Type."""
          return self.headers.get('content-type', '') or self.headers.get('Content-Type', '')
-
+
      @property
      def content_length(self) -> Optional[int]:
          """Return the response Content-Length."""
          length = self.headers.get('content-length') or self.headers.get('Content-Length')
          return int(length) if length else None
+
      def json(self, default: Any = None) -> Any:
          """Parse the response text as a JSON object."""
          if self._json_cache is not None:
              return self._json_cache
-
+
          try:
              self._json_cache = ujson.loads(self.text)
              return self._json_cache
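The encoding fallback chain above (request encoding → Content-Type header → HTML meta tags → utf-8) hinges on a couple of small regexes. A standalone sketch of the Content-Type branch, using the same pattern as _determine_encoding():

import re

def charset_from_content_type(content_type: str, default: str = "utf-8") -> str:
    # Same pattern as the Content-Type branch of Response._determine_encoding().
    match = re.search(r"charset=([\w-]+)", content_type, re.I)
    return match.group(1).lower() if match else default

print(charset_from_content_type("text/html; charset=GBK"))  # -> gbk
print(charset_from_content_type("application/json"))        # -> utf-8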
crawlo/pipelines/__init__.py CHANGED
@@ -11,3 +11,12 @@ class BasePipeline:
      @classmethod
      def create_instance(cls, crawler):
          return cls()
+
+
+ # Export the deduplication pipelines
+ from .memory_dedup_pipeline import MemoryDedupPipeline
+ from .redis_dedup_pipeline import RedisDedupPipeline
+ from .bloom_dedup_pipeline import BloomDedupPipeline
+ from .database_dedup_pipeline import DatabaseDedupPipeline
+
+ __all__ = ['BasePipeline', 'MemoryDedupPipeline', 'RedisDedupPipeline', 'BloomDedupPipeline', 'DatabaseDedupPipeline']
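After this change the deduplication pipelines are importable straight from crawlo.pipelines. A quick check, assuming crawlo 1.1.4 is installed and any optional backends the individual pipelines need (e.g. redis, pybloom_live) are available:

from crawlo.pipelines import (
    BasePipeline,
    MemoryDedupPipeline,
    RedisDedupPipeline,
    BloomDedupPipeline,
    DatabaseDedupPipeline,
)

print([cls.__name__ for cls in (MemoryDedupPipeline, RedisDedupPipeline,
                                BloomDedupPipeline, DatabaseDedupPipeline)])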
crawlo/pipelines/bloom_dedup_pipeline.py ADDED
@@ -0,0 +1,157 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ Bloom-filter-based item deduplication pipeline
+ =============================
+ Efficient deduplication for large-scale crawls, using a probabilistic data structure to save memory.
+
+ Features:
+ - Memory efficient: uses far less memory than a plain set
+ - High performance: fast insert and lookup operations
+ - Scalable: capacity and error rate are configurable
+ - Broadly applicable: especially suited to large-scale crawls
+
+ Note: a Bloom filter has a false-positive rate, so some never-seen items may be dropped by mistake.
+ """
+
+ import hashlib
+ try:
+     from pybloom_live import BloomFilter
+     BLOOM_FILTER_AVAILABLE = True
+ except ImportError:
+     # Fall back to a simple in-memory substitute when pybloom_live is not installed
+     BLOOM_FILTER_AVAILABLE = False
+
+     class BloomFilter:
+         def __init__(self, capacity, error_rate):
+             self._data = set()
+
+         def add(self, item):
+             if item in self._data:
+                 return False
+             else:
+                 self._data.add(item)
+                 return True
+
+         def __contains__(self, item):
+             return item in self._data
+
+ from crawlo import Item
+ from crawlo.spider import Spider
+ from crawlo.utils.log import get_logger
+ from crawlo.exceptions import DropItem
+
+
+ class BloomDedupPipeline:
+     """Bloom-filter-based item deduplication pipeline."""
+
+     def __init__(
+         self,
+         capacity: int = 1000000,
+         error_rate: float = 0.001,
+         log_level: str = "INFO"
+     ):
+         """
+         Initialise the Bloom filter deduplication pipeline.
+
+         :param capacity: expected number of stored elements
+         :param error_rate: false-positive rate (e.g. 0.001 means 0.1%)
+         :param log_level: logging level
+         """
+         self.logger = get_logger(self.__class__.__name__, log_level)
+
+         # Initialise the Bloom filter
+         try:
+             self.bloom_filter = BloomFilter(capacity=capacity, error_rate=error_rate)
+             self.logger.info(f"Bloom Filter 去重管道初始化完成 (容量: {capacity}, 误判率: {error_rate})")
+         except Exception as e:
+             self.logger.error(f"Bloom Filter 初始化失败: {e}")
+             raise RuntimeError(f"Bloom Filter 初始化失败: {e}")
+
+         self.capacity = capacity
+         self.error_rate = error_rate
+         self.dropped_count = 0
+         self.added_count = 0
+
+     @classmethod
+     def from_crawler(cls, crawler):
+         """Create a pipeline instance from the crawler settings."""
+         settings = crawler.settings
+
+         return cls(
+             capacity=settings.getint('BLOOM_FILTER_CAPACITY', 1000000),
+             error_rate=settings.getfloat('BLOOM_FILTER_ERROR_RATE', 0.001),
+             log_level=settings.get('LOG_LEVEL', 'INFO')
+         )
+
+     def process_item(self, item: Item, spider: Spider) -> Item:
+         """
+         Process an item and check it for duplicates.
+
+         :param item: the item to process
+         :param spider: the spider instance
+         :return: the processed item, or raise DropItem
+         """
+         try:
+             # Generate the item fingerprint
+             fingerprint = self._generate_item_fingerprint(item)
+
+             # Check whether the fingerprint already exists
+             if fingerprint in self.bloom_filter:
+                 # Probably seen before (Bloom filters can give false positives); drop the item
+                 self.dropped_count += 1
+                 self.logger.debug(f"可能丢弃重复数据项: {fingerprint[:20]}...")
+                 raise DropItem(f"可能重复的数据项: {fingerprint}")
+             else:
+                 # Add the fingerprint to the Bloom filter
+                 self.bloom_filter.add(fingerprint)
+                 self.added_count += 1
+                 self.logger.debug(f"处理新数据项: {fingerprint[:20]}...")
+                 return item
+
+         except Exception as e:
+             self.logger.error(f"处理数据项时出错: {e}")
+             # Keep going on errors so that data is not lost
+             return item
+
+     def _generate_item_fingerprint(self, item: Item) -> str:
+         """
+         Generate an item fingerprint.
+
+         A unique fingerprint is built from all fields of the item and used for duplicate detection.
+
+         :param item: the item
+         :return: the fingerprint string
+         """
+         # Convert the item into a serialisable dict
+         try:
+             item_dict = item.to_dict()
+         except AttributeError:
+             # Compatibility with Item implementations that lack a to_dict method
+             item_dict = dict(item)
+
+         # Sort the dict so the result is stable
+         sorted_items = sorted(item_dict.items())
+
+         # Build the fingerprint string
+         fingerprint_string = '|'.join([f"{k}={v}" for k, v in sorted_items if v is not None])
+
+         # Hash with SHA-256 for a fixed-length fingerprint
+         return hashlib.sha256(fingerprint_string.encode('utf-8')).hexdigest()
+
+     def close_spider(self, spider: Spider) -> None:
+         """
+         Clean-up work when the spider closes.
+
+         :param spider: the spider instance
+         """
+         self.logger.info(f"爬虫 {spider.name} 关闭:")
+         self.logger.info(f"  - 处理的数据项数: {self.added_count}")
+         self.logger.info(f"  - 可能丢弃的重复数据项: {self.dropped_count}")
+
+         if BLOOM_FILTER_AVAILABLE:
+             # Note: a Bloom filter cannot report an exact element count
+             self.logger.info(f"  - Bloom Filter 容量: {self.capacity}")
+             self.logger.info(f"  - Bloom Filter 误判率: {self.error_rate}")
+         else:
+             self.logger.warning("  - 未安装 pybloom_live,使用内存集合作为替代")
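The fingerprinting scheme is independent of the Bloom filter itself: fields are sorted, joined as key=value pairs, and hashed with SHA-256, so field order does not matter. A self-contained sketch of that part:

import hashlib

def item_fingerprint(item_dict: dict) -> str:
    # Mirrors BloomDedupPipeline._generate_item_fingerprint(): sort fields,
    # join non-None "key=value" pairs, hash with SHA-256.
    sorted_items = sorted(item_dict.items())
    fingerprint_string = "|".join(f"{k}={v}" for k, v in sorted_items if v is not None)
    return hashlib.sha256(fingerprint_string.encode("utf-8")).hexdigest()

a = item_fingerprint({"title": "Hello", "url": "https://example.com"})
b = item_fingerprint({"url": "https://example.com", "title": "Hello"})
print(a == b)  # True: field order does not change the fingerprint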