crawlo 1.1.1__py3-none-any.whl → 1.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic. Click here for more details.

Files changed (128) hide show
  1. crawlo/__init__.py +34 -33
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +40 -40
  4. crawlo/commands/__init__.py +13 -13
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +152 -126
  7. crawlo/commands/list.py +156 -147
  8. crawlo/commands/run.py +285 -285
  9. crawlo/commands/startproject.py +196 -111
  10. crawlo/commands/stats.py +188 -188
  11. crawlo/commands/utils.py +187 -0
  12. crawlo/config.py +280 -0
  13. crawlo/core/__init__.py +2 -2
  14. crawlo/core/engine.py +171 -158
  15. crawlo/core/enhanced_engine.py +190 -0
  16. crawlo/core/processor.py +40 -40
  17. crawlo/core/scheduler.py +166 -57
  18. crawlo/crawler.py +1028 -495
  19. crawlo/downloader/__init__.py +242 -78
  20. crawlo/downloader/aiohttp_downloader.py +212 -199
  21. crawlo/downloader/cffi_downloader.py +251 -241
  22. crawlo/downloader/httpx_downloader.py +259 -246
  23. crawlo/event.py +11 -11
  24. crawlo/exceptions.py +82 -78
  25. crawlo/extension/__init__.py +31 -31
  26. crawlo/extension/log_interval.py +49 -49
  27. crawlo/extension/log_stats.py +44 -44
  28. crawlo/extension/logging_extension.py +34 -34
  29. crawlo/filters/__init__.py +154 -37
  30. crawlo/filters/aioredis_filter.py +242 -150
  31. crawlo/filters/memory_filter.py +269 -202
  32. crawlo/items/__init__.py +23 -23
  33. crawlo/items/base.py +21 -21
  34. crawlo/items/fields.py +53 -53
  35. crawlo/items/items.py +104 -104
  36. crawlo/middleware/__init__.py +21 -21
  37. crawlo/middleware/default_header.py +32 -32
  38. crawlo/middleware/download_delay.py +28 -28
  39. crawlo/middleware/middleware_manager.py +135 -135
  40. crawlo/middleware/proxy.py +248 -245
  41. crawlo/middleware/request_ignore.py +30 -30
  42. crawlo/middleware/response_code.py +18 -18
  43. crawlo/middleware/response_filter.py +26 -26
  44. crawlo/middleware/retry.py +125 -90
  45. crawlo/mode_manager.py +201 -0
  46. crawlo/network/__init__.py +21 -7
  47. crawlo/network/request.py +311 -203
  48. crawlo/network/response.py +271 -166
  49. crawlo/pipelines/__init__.py +22 -13
  50. crawlo/pipelines/bloom_dedup_pipeline.py +157 -0
  51. crawlo/pipelines/console_pipeline.py +39 -39
  52. crawlo/pipelines/csv_pipeline.py +317 -0
  53. crawlo/pipelines/database_dedup_pipeline.py +225 -0
  54. crawlo/pipelines/json_pipeline.py +219 -0
  55. crawlo/pipelines/memory_dedup_pipeline.py +116 -0
  56. crawlo/pipelines/mongo_pipeline.py +116 -116
  57. crawlo/pipelines/mysql_pipeline.py +195 -195
  58. crawlo/pipelines/pipeline_manager.py +56 -56
  59. crawlo/pipelines/redis_dedup_pipeline.py +163 -0
  60. crawlo/project.py +153 -153
  61. crawlo/queue/__init__.py +0 -0
  62. crawlo/queue/pqueue.py +37 -0
  63. crawlo/queue/queue_manager.py +308 -0
  64. crawlo/queue/redis_priority_queue.py +209 -0
  65. crawlo/settings/__init__.py +7 -7
  66. crawlo/settings/default_settings.py +245 -167
  67. crawlo/settings/setting_manager.py +99 -99
  68. crawlo/spider/__init__.py +639 -129
  69. crawlo/stats_collector.py +59 -59
  70. crawlo/subscriber.py +106 -106
  71. crawlo/task_manager.py +30 -27
  72. crawlo/templates/crawlo.cfg.tmpl +10 -10
  73. crawlo/templates/project/__init__.py.tmpl +3 -3
  74. crawlo/templates/project/items.py.tmpl +17 -17
  75. crawlo/templates/project/middlewares.py.tmpl +87 -76
  76. crawlo/templates/project/pipelines.py.tmpl +342 -64
  77. crawlo/templates/project/run.py.tmpl +252 -0
  78. crawlo/templates/project/settings.py.tmpl +251 -54
  79. crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
  80. crawlo/templates/spider/spider.py.tmpl +178 -32
  81. crawlo/utils/__init__.py +7 -7
  82. crawlo/utils/controlled_spider_mixin.py +440 -0
  83. crawlo/utils/date_tools.py +233 -233
  84. crawlo/utils/db_helper.py +343 -343
  85. crawlo/utils/func_tools.py +82 -82
  86. crawlo/utils/large_scale_config.py +287 -0
  87. crawlo/utils/large_scale_helper.py +344 -0
  88. crawlo/utils/log.py +128 -128
  89. crawlo/utils/queue_helper.py +176 -0
  90. crawlo/utils/request.py +267 -267
  91. crawlo/utils/request_serializer.py +220 -0
  92. crawlo/utils/spider_loader.py +62 -62
  93. crawlo/utils/system.py +11 -11
  94. crawlo/utils/tools.py +4 -4
  95. crawlo/utils/url.py +39 -39
  96. crawlo-1.1.3.dist-info/METADATA +635 -0
  97. crawlo-1.1.3.dist-info/RECORD +113 -0
  98. examples/__init__.py +7 -7
  99. examples/controlled_spider_example.py +205 -0
  100. tests/__init__.py +7 -7
  101. tests/test_final_validation.py +154 -0
  102. tests/test_proxy_health_check.py +32 -32
  103. tests/test_proxy_middleware_integration.py +136 -136
  104. tests/test_proxy_providers.py +56 -56
  105. tests/test_proxy_stats.py +19 -19
  106. tests/test_proxy_strategies.py +59 -59
  107. tests/test_redis_config.py +29 -0
  108. tests/test_redis_queue.py +225 -0
  109. tests/test_request_serialization.py +71 -0
  110. tests/test_scheduler.py +242 -0
  111. crawlo/pipelines/mysql_batch_pipline.py +0 -273
  112. crawlo/utils/pqueue.py +0 -174
  113. crawlo-1.1.1.dist-info/METADATA +0 -220
  114. crawlo-1.1.1.dist-info/RECORD +0 -100
  115. examples/baidu_spider/__init__.py +0 -7
  116. examples/baidu_spider/demo.py +0 -94
  117. examples/baidu_spider/items.py +0 -46
  118. examples/baidu_spider/middleware.py +0 -49
  119. examples/baidu_spider/pipeline.py +0 -55
  120. examples/baidu_spider/run.py +0 -27
  121. examples/baidu_spider/settings.py +0 -121
  122. examples/baidu_spider/spiders/__init__.py +0 -7
  123. examples/baidu_spider/spiders/bai_du.py +0 -61
  124. examples/baidu_spider/spiders/miit.py +0 -159
  125. examples/baidu_spider/spiders/sina.py +0 -79
  126. {crawlo-1.1.1.dist-info → crawlo-1.1.3.dist-info}/WHEEL +0 -0
  127. {crawlo-1.1.1.dist-info → crawlo-1.1.3.dist-info}/entry_points.txt +0 -0
  128. {crawlo-1.1.1.dist-info → crawlo-1.1.3.dist-info}/top_level.txt +0 -0
@@ -1,166 +1,271 @@
1
- #!/usr/bin/python
2
- # -*- coding:UTF-8 -*-
3
- import re
4
- import ujson
5
- from http.cookies import SimpleCookie
6
- from parsel import Selector, SelectorList
7
- from typing import Dict, Any, List, Optional
8
- from urllib.parse import urljoin as _urljoin
9
-
10
- from crawlo import Request
11
- from crawlo.exceptions import DecodeError
12
-
13
-
14
- class Response:
15
- """
16
- HTTP响应的封装,提供数据解析的便捷方法。
17
- """
18
-
19
- def __init__(
20
- self,
21
- url: str,
22
- *,
23
- headers: Dict[str, Any],
24
- body: bytes = b"",
25
- method: str = 'GET',
26
- request: Request = None,
27
- status_code: int = 200,
28
- ):
29
- self.url = url
30
- self.headers = headers
31
- self.body = body
32
- self.method = method
33
- self.request = request
34
- self.status_code = status_code
35
- self.encoding = self.request.encoding if self.request else None
36
- self._text_cache = None
37
- self._json_cache = None
38
- self._selector_instance = None # 修改变量名,避免与 @property 冲突
39
-
40
- @property
41
- def text(self) -> str:
42
- """将响应体(body)以正确的编码解码为字符串,并缓存结果。"""
43
- if self._text_cache is not None:
44
- return self._text_cache
45
-
46
- encoding = self.encoding
47
- try:
48
- # 优先使用 request 提供的编码
49
- if encoding:
50
- self._text_cache = self.body.decode(encoding)
51
- return self._text_cache
52
-
53
- # Content-Type 中提取编码
54
- content_type = self.headers.get("Content-Type", "")
55
- charset_match = re.search(r"charset=([\w-]+)", content_type, re.I)
56
- if charset_match:
57
- encoding = charset_match.group(1)
58
- self._text_cache = self.body.decode(encoding)
59
- return self._text_cache
60
-
61
- # 默认尝试 UTF-8
62
- self._text_cache = self.body.decode("utf-8")
63
- return self._text_cache
64
-
65
- except UnicodeDecodeError as e:
66
- raise DecodeError(f"Failed to decode response from {self.url}: {e}")
67
-
68
- def json(self) -> Any:
69
- """将响应文本解析为 JSON 对象。"""
70
- if self._json_cache:
71
- return self._json_cache
72
- self._json_cache = ujson.loads(self.text)
73
- return self._json_cache
74
-
75
- def urljoin(self, url: str) -> str:
76
- """拼接 URL,自动处理相对路径。"""
77
- return _urljoin(self.url, url)
78
-
79
- @property
80
- def _selector(self) -> Selector:
81
- """懒加载 Selector 实例"""
82
- if self._selector_instance is None:
83
- self._selector_instance = Selector(self.text)
84
- return self._selector_instance
85
-
86
- def xpath(self, query: str) -> SelectorList:
87
- """使用 XPath 选择器查询文档。"""
88
- return self._selector.xpath(query)
89
-
90
- def css(self, query: str) -> SelectorList:
91
- """使用 CSS 选择器查询文档。"""
92
- return self._selector.css(query)
93
-
94
- def xpath_text(self, query: str) -> str:
95
- """使用 XPath 提取并返回纯文本。"""
96
- fragments = self.xpath(f"{query}//text()").getall()
97
- return " ".join(text.strip() for text in fragments if text.strip())
98
-
99
- def css_text(self, query: str) -> str:
100
- """使用 CSS 选择器提取并返回纯文本。"""
101
- fragments = self.css(f"{query} ::text").getall()
102
- return " ".join(text.strip() for text in fragments if text.strip())
103
-
104
- def get_text(self, xpath_or_css: str, join_str: str = " ") -> str:
105
- """
106
- 获取指定节点的纯文本(自动拼接子节点文本)
107
-
108
- 参数:
109
- xpath_or_css: XPath或CSS选择器
110
- join_str: 文本拼接分隔符(默认为空格)
111
-
112
- 返回:
113
- 拼接后的纯文本字符串
114
- """
115
- elements = self.xpath(xpath_or_css) if xpath_or_css.startswith(('/', '//', './')) else self.css(xpath_or_css)
116
- texts = elements.xpath('.//text()').getall()
117
- return join_str.join(text.strip() for text in texts if text.strip())
118
-
119
- def get_all_text(self, xpath_or_css: str, join_str: str = " ") -> List[str]:
120
- """
121
- 获取多个节点的纯文本列表
122
-
123
- 参数:
124
- xpath_or_css: XPath或CSS选择器
125
- join_str: 单个节点内文本拼接分隔符
126
-
127
- 返回:
128
- 纯文本列表(每个元素对应一个节点的文本)
129
- """
130
- elements = self.xpath(xpath_or_css) if xpath_or_css.startswith(('/', '//', './')) else self.css(xpath_or_css)
131
- result = []
132
- for element in elements:
133
- texts = element.xpath('.//text()').getall()
134
- clean_text = join_str.join(text.strip() for text in texts if text.strip())
135
- if clean_text:
136
- result.append(clean_text)
137
- return result
138
-
139
- def re_search(self, pattern: str, flags: int = re.DOTALL) -> Optional[re.Match]:
140
- """在响应文本上执行正则表达式搜索。"""
141
- if not isinstance(pattern, str):
142
- raise TypeError("Pattern must be a string")
143
- return re.search(pattern, self.text, flags=flags)
144
-
145
- def re_findall(self, pattern: str, flags: int = re.DOTALL) -> List[Any]:
146
- """在响应文本上执行正则表达式查找。"""
147
- if not isinstance(pattern, str):
148
- raise TypeError("Pattern must be a string")
149
- return re.findall(pattern, self.text, flags=flags)
150
-
151
- def get_cookies(self) -> Dict[str, str]:
152
- """从响应头中解析并返回Cookies。"""
153
- cookie_header = self.headers.get("Set-Cookie", "")
154
- if isinstance(cookie_header, list):
155
- cookie_header = ", ".join(cookie_header)
156
- cookies = SimpleCookie()
157
- cookies.load(cookie_header)
158
- return {key: morsel.value for key, morsel in cookies.items()}
159
-
160
- @property
161
- def meta(self) -> Dict:
162
- """获取关联的 Request 对象的 meta 字典。"""
163
- return self.request.meta if self.request else {}
164
-
165
- def __str__(self):
166
- return f"<{self.status_code} {self.url}>"
1
+ #!/usr/bin/python
2
+ # -*- coding:UTF-8 -*-
3
+ """
4
+ HTTP Response 封装模块
5
+ =====================
6
+ 提供功能丰富的HTTP响应封装,支持:
7
+ - 智能编码检测和解码
8
+ - XPath/CSS 选择器
9
+ - JSON 解析和缓存
10
+ - 正则表达式支持
11
+ - Cookie 处理
12
+ """
13
+ import re
14
+ import ujson
15
+ from http.cookies import SimpleCookie
16
+ from parsel import Selector, SelectorList
17
+ from typing import Dict, Any, List, Optional, Union
18
+ from urllib.parse import urljoin as _urljoin
19
+
20
+ from crawlo.exceptions import DecodeError
21
+
22
+
23
+ class Response:
24
+ """
25
+ HTTP响应的封装,提供数据解析的便捷方法。
26
+
27
+ 功能特性:
28
+ - 智能编码检测和缓存
29
+ - 懒加载 Selector 实例
30
+ - JSON 解析和缓存
31
+ - 多类型数据提取
32
+ """
33
+
34
+ def __init__(
35
+ self,
36
+ url: str,
37
+ *,
38
+ headers: Dict[str, Any] = None,
39
+ body: bytes = b"",
40
+ method: str = 'GET',
41
+ request: 'Request' = None, # 使用字符串注解避免循环导入
42
+ status_code: int = 200,
43
+ ):
44
+ # 基本属性
45
+ self.url = url
46
+ self.headers = headers or {}
47
+ self.body = body
48
+ self.method = method.upper()
49
+ self.request = request
50
+ self.status_code = status_code
51
+
52
+ # 编码处理
53
+ self.encoding = self._determine_encoding()
54
+
55
+ # 缓存属性
56
+ self._text_cache = None
57
+ self._json_cache = None
58
+ self._selector_instance = None
59
+
60
+ # 状态标记
61
+ self._is_success = 200 <= status_code < 300
62
+ self._is_redirect = 300 <= status_code < 400
63
+ self._is_client_error = 400 <= status_code < 500
64
+ self._is_server_error = status_code >= 500
65
+
66
+ def _determine_encoding(self) -> Optional[str]:
67
+ """智能检测响应编码"""
68
+ # 1. 优先使用 request 的编码
69
+ if self.request and self.request.encoding:
70
+ return self.request.encoding
71
+
72
+ # 2. Content-Type 头中检测
73
+ content_type = self.headers.get("content-type", "") or self.headers.get("Content-Type", "")
74
+ if content_type:
75
+ charset_match = re.search(r"charset=([\w-]+)", content_type, re.I)
76
+ if charset_match:
77
+ return charset_match.group(1).lower()
78
+
79
+ # 3. 从 HTML meta 标签中检测(仅对HTML内容)
80
+ if b'<html' in self.body[:1024].lower():
81
+ # 查找 <meta charset="xxx"> 或 <meta http-equiv="Content-Type" content="...charset=xxx">
82
+ html_start = self.body[:4096] # 只检查前4KB
83
+ try:
84
+ html_text = html_start.decode('ascii', errors='ignore')
85
+ # <meta charset="utf-8">
86
+ charset_match = re.search(r'<meta[^>]+charset=["\']?([\w-]+)', html_text, re.I)
87
+ if charset_match:
88
+ return charset_match.group(1).lower()
89
+
90
+ # <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
91
+ content_match = re.search(r'<meta[^>]+content=["\'][^"\'>]*charset=([\w-]+)', html_text, re.I)
92
+ if content_match:
93
+ return content_match.group(1).lower()
94
+ except Exception:
95
+ pass
96
+
97
+ # 4. 默认使用 utf-8
98
+ return 'utf-8'
99
+
100
+ @property
101
+ def text(self) -> str:
102
+ """将响应体(body)以正确的编码解码为字符串,并缓存结果。"""
103
+ if self._text_cache is not None:
104
+ return self._text_cache
105
+
106
+ if not self.body:
107
+ self._text_cache = ""
108
+ return self._text_cache
109
+
110
+ # 尝试多种编码
111
+ encodings_to_try = [self.encoding]
112
+ if self.encoding != 'utf-8':
113
+ encodings_to_try.append('utf-8')
114
+ if 'gbk' not in encodings_to_try:
115
+ encodings_to_try.append('gbk')
116
+ if 'gb2312' not in encodings_to_try:
117
+ encodings_to_try.append('gb2312')
118
+ encodings_to_try.append('latin1') # 最后的回退选项
119
+
120
+ for encoding in encodings_to_try:
121
+ if not encoding:
122
+ continue
123
+ try:
124
+ self._text_cache = self.body.decode(encoding)
125
+ return self._text_cache
126
+ except (UnicodeDecodeError, LookupError):
127
+ continue
128
+
129
+ # 所有编码都失败,使用容错解码
130
+ try:
131
+ self._text_cache = self.body.decode('utf-8', errors='replace')
132
+ return self._text_cache
133
+ except Exception as e:
134
+ raise DecodeError(f"Failed to decode response from {self.url}: {e}")
135
+
136
+ @property
137
+ def is_success(self) -> bool:
138
+ """检查响应是否成功 (2xx)"""
139
+ return self._is_success
140
+
141
+ @property
142
+ def is_redirect(self) -> bool:
143
+ """检查响应是否为重定向 (3xx)"""
144
+ return self._is_redirect
145
+
146
+ @property
147
+ def is_client_error(self) -> bool:
148
+ """检查响应是否为客户端错误 (4xx)"""
149
+ return self._is_client_error
150
+
151
+ @property
152
+ def is_server_error(self) -> bool:
153
+ """检查响应是否为服务器错误 (5xx)"""
154
+ return self._is_server_error
155
+
156
+ @property
157
+ def content_type(self) -> str:
158
+ """获取响应的 Content-Type"""
159
+ return self.headers.get('content-type', '') or self.headers.get('Content-Type', '')
160
+
161
+ @property
162
+ def content_length(self) -> Optional[int]:
163
+ """获取响应的 Content-Length"""
164
+ length = self.headers.get('content-length') or self.headers.get('Content-Length')
165
+ return int(length) if length else None
166
+
167
+ def json(self, default: Any = None) -> Any:
168
+ """将响应文本解析为 JSON 对象。"""
169
+ if self._json_cache is not None:
170
+ return self._json_cache
171
+
172
+ try:
173
+ self._json_cache = ujson.loads(self.text)
174
+ return self._json_cache
175
+ except (ujson.JSONDecodeError, ValueError) as e:
176
+ if default is not None:
177
+ return default
178
+ raise DecodeError(f"Failed to parse JSON from {self.url}: {e}")
179
+
180
+ def urljoin(self, url: str) -> str:
181
+ """拼接 URL,自动处理相对路径。"""
182
+ return _urljoin(self.url, url)
183
+
184
+ @property
185
+ def _selector(self) -> Selector:
186
+ """懒加载 Selector 实例"""
187
+ if self._selector_instance is None:
188
+ self._selector_instance = Selector(self.text)
189
+ return self._selector_instance
190
+
191
+ def xpath(self, query: str) -> SelectorList:
192
+ """使用 XPath 选择器查询文档。"""
193
+ return self._selector.xpath(query)
194
+
195
+ def css(self, query: str) -> SelectorList:
196
+ """使用 CSS 选择器查询文档。"""
197
+ return self._selector.css(query)
198
+
199
+ def xpath_text(self, query: str) -> str:
200
+ """使用 XPath 提取并返回纯文本。"""
201
+ fragments = self.xpath(f"{query}//text()").getall()
202
+ return " ".join(text.strip() for text in fragments if text.strip())
203
+
204
+ def css_text(self, query: str) -> str:
205
+ """使用 CSS 选择器提取并返回纯文本。"""
206
+ fragments = self.css(f"{query} ::text").getall()
207
+ return " ".join(text.strip() for text in fragments if text.strip())
208
+
209
+ def get_text(self, xpath_or_css: str, join_str: str = " ") -> str:
210
+ """
211
+ 获取指定节点的纯文本(自动拼接子节点文本)
212
+
213
+ 参数:
214
+ xpath_or_css: XPath或CSS选择器
215
+ join_str: 文本拼接分隔符(默认为空格)
216
+
217
+ 返回:
218
+ 拼接后的纯文本字符串
219
+ """
220
+ elements = self.xpath(xpath_or_css) if xpath_or_css.startswith(('/', '//', './')) else self.css(xpath_or_css)
221
+ texts = elements.xpath('.//text()').getall()
222
+ return join_str.join(text.strip() for text in texts if text.strip())
223
+
224
+ def get_all_text(self, xpath_or_css: str, join_str: str = " ") -> List[str]:
225
+ """
226
+ 获取多个节点的纯文本列表
227
+
228
+ 参数:
229
+ xpath_or_css: XPath或CSS选择器
230
+ join_str: 单个节点内文本拼接分隔符
231
+
232
+ 返回:
233
+ 纯文本列表(每个元素对应一个节点的文本)
234
+ """
235
+ elements = self.xpath(xpath_or_css) if xpath_or_css.startswith(('/', '//', './')) else self.css(xpath_or_css)
236
+ result = []
237
+ for element in elements:
238
+ texts = element.xpath('.//text()').getall()
239
+ clean_text = join_str.join(text.strip() for text in texts if text.strip())
240
+ if clean_text:
241
+ result.append(clean_text)
242
+ return result
243
+
244
+ def re_search(self, pattern: str, flags: int = re.DOTALL) -> Optional[re.Match]:
245
+ """在响应文本上执行正则表达式搜索。"""
246
+ if not isinstance(pattern, str):
247
+ raise TypeError("Pattern must be a string")
248
+ return re.search(pattern, self.text, flags=flags)
249
+
250
+ def re_findall(self, pattern: str, flags: int = re.DOTALL) -> List[Any]:
251
+ """在响应文本上执行正则表达式查找。"""
252
+ if not isinstance(pattern, str):
253
+ raise TypeError("Pattern must be a string")
254
+ return re.findall(pattern, self.text, flags=flags)
255
+
256
+ def get_cookies(self) -> Dict[str, str]:
257
+ """从响应头中解析并返回Cookies。"""
258
+ cookie_header = self.headers.get("Set-Cookie", "")
259
+ if isinstance(cookie_header, list):
260
+ cookie_header = ", ".join(cookie_header)
261
+ cookies = SimpleCookie()
262
+ cookies.load(cookie_header)
263
+ return {key: morsel.value for key, morsel in cookies.items()}
264
+
265
+ @property
266
+ def meta(self) -> Dict:
267
+ """获取关联的 Request 对象的 meta 字典。"""
268
+ return self.request.meta if self.request else {}
269
+
270
+ def __str__(self):
271
+ return f"<{self.status_code} {self.url}>"
@@ -1,13 +1,22 @@
1
- #!/usr/bin/python
2
- # -*- coding:UTF-8 -*-
3
- from crawlo.items.items import Item
4
-
5
-
6
- class BasePipeline:
7
-
8
- def process_item(self, item: Item, spider):
9
- raise NotImplementedError
10
-
11
- @classmethod
12
- def create_instance(cls, crawler):
13
- return cls()
1
+ #!/usr/bin/python
2
+ # -*- coding:UTF-8 -*-
3
+ from crawlo.items import Item
4
+
5
+
6
+ class BasePipeline:
7
+
8
+ def process_item(self, item: Item, spider):
9
+ raise NotImplementedError
10
+
11
+ @classmethod
12
+ def create_instance(cls, crawler):
13
+ return cls()
14
+
15
+
16
+ # 导出去重管道
17
+ from .memory_dedup_pipeline import MemoryDedupPipeline
18
+ from .redis_dedup_pipeline import RedisDedupPipeline
19
+ from .bloom_dedup_pipeline import BloomDedupPipeline
20
+ from .database_dedup_pipeline import DatabaseDedupPipeline
21
+
22
+ __all__ = ['BasePipeline', 'MemoryDedupPipeline', 'RedisDedupPipeline', 'BloomDedupPipeline', 'DatabaseDedupPipeline']
@@ -0,0 +1,157 @@
1
+ #!/usr/bin/python
2
+ # -*- coding:UTF-8 -*-
3
+ """
4
+ 基于 Bloom Filter 的数据项去重管道
5
+ =============================
6
+ 提供大规模数据采集场景下的高效去重功能,使用概率性数据结构节省内存。
7
+
8
+ 特点:
9
+ - 内存效率高: 相比传统集合节省大量内存
10
+ - 高性能: 快速的插入和查找操作
11
+ - 可扩展: 支持自定义容量和误判率
12
+ - 适用性广: 特别适合大规模数据采集
13
+
14
+ 注意: Bloom Filter 有误判率,可能会错误地丢弃一些未见过的数据项。
15
+ """
16
+
17
+ import hashlib
18
+ try:
19
+ from pybloom_live import BloomFilter
20
+ BLOOM_FILTER_AVAILABLE = True
21
+ except ImportError:
22
+ # 如果没有安装 pybloom_live,使用简单的替代方案
23
+ BLOOM_FILTER_AVAILABLE = False
24
+
25
+ class BloomFilter:
26
+ def __init__(self, capacity, error_rate):
27
+ self._data = set()
28
+
29
+ def add(self, item):
30
+ if item in self._data:
31
+ return False
32
+ else:
33
+ self._data.add(item)
34
+ return True
35
+
36
+ def __contains__(self, item):
37
+ return item in self._data
38
+
39
+ from crawlo import Item
40
+ from crawlo.spider import Spider
41
+ from crawlo.utils.log import get_logger
42
+ from crawlo.exceptions import DropItem
43
+
44
+
45
+ class BloomDedupPipeline:
46
+ """基于 Bloom Filter 的数据项去重管道"""
47
+
48
+ def __init__(
49
+ self,
50
+ capacity: int = 1000000,
51
+ error_rate: float = 0.001,
52
+ log_level: str = "INFO"
53
+ ):
54
+ """
55
+ 初始化 Bloom Filter 去重管道
56
+
57
+ :param capacity: 预期存储的元素数量
58
+ :param error_rate: 误判率 (例如 0.001 表示 0.1%)
59
+ :param log_level: 日志级别
60
+ """
61
+ self.logger = get_logger(self.__class__.__name__, log_level)
62
+
63
+ # 初始化 Bloom Filter
64
+ try:
65
+ self.bloom_filter = BloomFilter(capacity=capacity, error_rate=error_rate)
66
+ self.logger.info(f"Bloom Filter 去重管道初始化完成 (容量: {capacity}, 误判率: {error_rate})")
67
+ except Exception as e:
68
+ self.logger.error(f"Bloom Filter 初始化失败: {e}")
69
+ raise RuntimeError(f"Bloom Filter 初始化失败: {e}")
70
+
71
+ self.capacity = capacity
72
+ self.error_rate = error_rate
73
+ self.dropped_count = 0
74
+ self.added_count = 0
75
+
76
+ @classmethod
77
+ def from_crawler(cls, crawler):
78
+ """从爬虫配置创建管道实例"""
79
+ settings = crawler.settings
80
+
81
+ return cls(
82
+ capacity=settings.getint('BLOOM_FILTER_CAPACITY', 1000000),
83
+ error_rate=settings.getfloat('BLOOM_FILTER_ERROR_RATE', 0.001),
84
+ log_level=settings.get('LOG_LEVEL', 'INFO')
85
+ )
86
+
87
+ def process_item(self, item: Item, spider: Spider) -> Item:
88
+ """
89
+ 处理数据项,进行去重检查
90
+
91
+ :param item: 要处理的数据项
92
+ :param spider: 爬虫实例
93
+ :return: 处理后的数据项或抛出 DropItem 异常
94
+ """
95
+ try:
96
+ # 生成数据项指纹
97
+ fingerprint = self._generate_item_fingerprint(item)
98
+
99
+ # 检查指纹是否已存在
100
+ if fingerprint in self.bloom_filter:
101
+ # 如果可能已存在(Bloom Filter 可能有误判),丢弃这个数据项
102
+ self.dropped_count += 1
103
+ self.logger.debug(f"可能丢弃重复数据项: {fingerprint[:20]}...")
104
+ raise DropItem(f"可能重复的数据项: {fingerprint}")
105
+ else:
106
+ # 添加指纹到 Bloom Filter
107
+ self.bloom_filter.add(fingerprint)
108
+ self.added_count += 1
109
+ self.logger.debug(f"处理新数据项: {fingerprint[:20]}...")
110
+ return item
111
+
112
+ except Exception as e:
113
+ self.logger.error(f"处理数据项时出错: {e}")
114
+ # 在错误时继续处理,避免丢失数据
115
+ return item
116
+
117
+ def _generate_item_fingerprint(self, item: Item) -> str:
118
+ """
119
+ 生成数据项指纹
120
+
121
+ 基于数据项的所有字段生成唯一指纹,用于去重判断。
122
+
123
+ :param item: 数据项
124
+ :return: 指纹字符串
125
+ """
126
+ # 将数据项转换为可序列化的字典
127
+ try:
128
+ item_dict = item.to_dict()
129
+ except AttributeError:
130
+ # 兼容没有to_dict方法的Item实现
131
+ item_dict = dict(item)
132
+
133
+ # 对字典进行排序以确保一致性
134
+ sorted_items = sorted(item_dict.items())
135
+
136
+ # 生成指纹字符串
137
+ fingerprint_string = '|'.join([f"{k}={v}" for k, v in sorted_items if v is not None])
138
+
139
+ # 使用 SHA256 生成固定长度的指纹
140
+ return hashlib.sha256(fingerprint_string.encode('utf-8')).hexdigest()
141
+
142
+ def close_spider(self, spider: Spider) -> None:
143
+ """
144
+ 爬虫关闭时的清理工作
145
+
146
+ :param spider: 爬虫实例
147
+ """
148
+ self.logger.info(f"爬虫 {spider.name} 关闭:")
149
+ self.logger.info(f" - 处理的数据项数: {self.added_count}")
150
+ self.logger.info(f" - 可能丢弃的重复数据项: {self.dropped_count}")
151
+
152
+ if BLOOM_FILTER_AVAILABLE:
153
+ # 注意:Bloom Filter 无法准确统计元素数量
154
+ self.logger.info(f" - Bloom Filter 容量: {self.capacity}")
155
+ self.logger.info(f" - Bloom Filter 误判率: {self.error_rate}")
156
+ else:
157
+ self.logger.warning(" - 未安装 pybloom_live,使用内存集合作为替代")