crawlo 1.4.6-py3-none-any.whl → 1.4.8-py3-none-any.whl
This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of crawlo might be problematic (see the registry page for more details).
- crawlo/__init__.py +2 -1
- crawlo/__version__.py +1 -1
- crawlo/cli.py +2 -2
- crawlo/commands/check.py +1 -1
- crawlo/commands/help.py +5 -3
- crawlo/commands/list.py +1 -1
- crawlo/commands/run.py +49 -11
- crawlo/commands/stats.py +1 -1
- crawlo/config.py +12 -4
- crawlo/config_validator.py +1 -1
- crawlo/core/engine.py +20 -7
- crawlo/core/processor.py +1 -1
- crawlo/core/scheduler.py +4 -5
- crawlo/crawler.py +51 -10
- crawlo/downloader/__init__.py +7 -3
- crawlo/downloader/aiohttp_downloader.py +18 -18
- crawlo/downloader/cffi_downloader.py +5 -2
- crawlo/downloader/httpx_downloader.py +9 -3
- crawlo/downloader/hybrid_downloader.py +2 -2
- crawlo/downloader/playwright_downloader.py +38 -15
- crawlo/downloader/selenium_downloader.py +16 -2
- crawlo/event.py +42 -8
- crawlo/exceptions.py +157 -24
- crawlo/extension/__init__.py +10 -9
- crawlo/extension/health_check.py +7 -7
- crawlo/extension/log_interval.py +6 -6
- crawlo/extension/log_stats.py +2 -2
- crawlo/extension/logging_extension.py +4 -12
- crawlo/extension/memory_monitor.py +5 -5
- crawlo/extension/performance_profiler.py +5 -5
- crawlo/extension/request_recorder.py +6 -6
- crawlo/factories/base.py +1 -1
- crawlo/factories/crawler.py +61 -60
- crawlo/factories/utils.py +135 -0
- crawlo/filters/__init__.py +19 -2
- crawlo/filters/aioredis_filter.py +133 -49
- crawlo/filters/memory_filter.py +6 -21
- crawlo/framework.py +22 -8
- crawlo/initialization/built_in.py +24 -67
- crawlo/initialization/core.py +65 -19
- crawlo/initialization/phases.py +83 -2
- crawlo/initialization/registry.py +5 -7
- crawlo/initialization/utils.py +49 -0
- crawlo/logging/__init__.py +6 -10
- crawlo/logging/config.py +106 -22
- crawlo/logging/factory.py +12 -8
- crawlo/logging/manager.py +19 -27
- crawlo/middleware/__init__.py +72 -9
- crawlo/middleware/default_header.py +2 -2
- crawlo/middleware/download_delay.py +2 -2
- crawlo/middleware/middleware_manager.py +6 -6
- crawlo/middleware/offsite.py +2 -2
- crawlo/middleware/proxy.py +2 -2
- crawlo/middleware/request_ignore.py +4 -4
- crawlo/middleware/response_code.py +2 -2
- crawlo/middleware/response_filter.py +2 -2
- crawlo/middleware/retry.py +1 -1
- crawlo/mode_manager.py +38 -4
- crawlo/network/request.py +54 -26
- crawlo/network/response.py +69 -135
- crawlo/pipelines/__init__.py +40 -9
- crawlo/pipelines/base_pipeline.py +452 -0
- crawlo/pipelines/bloom_dedup_pipeline.py +4 -5
- crawlo/pipelines/console_pipeline.py +2 -2
- crawlo/pipelines/csv_pipeline.py +4 -4
- crawlo/pipelines/database_dedup_pipeline.py +4 -5
- crawlo/pipelines/json_pipeline.py +4 -4
- crawlo/pipelines/memory_dedup_pipeline.py +4 -5
- crawlo/pipelines/mongo_pipeline.py +23 -14
- crawlo/pipelines/mysql_pipeline.py +31 -39
- crawlo/pipelines/pipeline_manager.py +8 -8
- crawlo/pipelines/redis_dedup_pipeline.py +13 -14
- crawlo/project.py +1 -1
- crawlo/queue/__init__.py +10 -0
- crawlo/queue/queue_manager.py +79 -13
- crawlo/queue/redis_priority_queue.py +196 -47
- crawlo/settings/default_settings.py +16 -6
- crawlo/spider/__init__.py +6 -5
- crawlo/stats_collector.py +2 -2
- crawlo/task_manager.py +1 -1
- crawlo/templates/crawlo.cfg.tmpl +3 -3
- crawlo/templates/project/__init__.py.tmpl +1 -3
- crawlo/templates/project/items.py.tmpl +2 -6
- crawlo/templates/project/middlewares.py.tmpl +1 -1
- crawlo/templates/project/pipelines.py.tmpl +1 -2
- crawlo/templates/project/settings.py.tmpl +12 -10
- crawlo/templates/project/settings_distributed.py.tmpl +14 -13
- crawlo/templates/project/settings_gentle.py.tmpl +21 -23
- crawlo/templates/project/settings_high_performance.py.tmpl +21 -23
- crawlo/templates/project/settings_minimal.py.tmpl +10 -8
- crawlo/templates/project/settings_simple.py.tmpl +21 -23
- crawlo/templates/run.py.tmpl +1 -1
- crawlo/templates/spider/spider.py.tmpl +4 -12
- crawlo/templates/spiders_init.py.tmpl +3 -8
- crawlo/tools/__init__.py +0 -103
- crawlo/tools/scenario_adapter.py +1 -1
- crawlo/utils/__init__.py +25 -1
- crawlo/utils/batch_processor.py +23 -6
- crawlo/utils/config_manager.py +442 -0
- crawlo/utils/controlled_spider_mixin.py +1 -1
- crawlo/utils/db_helper.py +1 -1
- crawlo/utils/encoding_helper.py +190 -0
- crawlo/utils/error_handler.py +2 -2
- crawlo/utils/large_scale_helper.py +1 -1
- crawlo/utils/leak_detector.py +335 -0
- crawlo/utils/mongo_connection_pool.py +157 -0
- crawlo/utils/mysql_connection_pool.py +197 -0
- crawlo/utils/performance_monitor.py +1 -1
- crawlo/utils/redis_checker.py +91 -0
- crawlo/utils/redis_connection_pool.py +260 -70
- crawlo/utils/redis_key_validator.py +1 -1
- crawlo/utils/request.py +24 -2
- crawlo/utils/request_serializer.py +1 -1
- crawlo/utils/resource_manager.py +337 -0
- crawlo/utils/response_helper.py +113 -0
- crawlo/utils/selector_helper.py +3 -2
- crawlo/utils/singleton.py +70 -0
- crawlo/utils/spider_loader.py +1 -1
- crawlo/utils/text_helper.py +1 -1
- crawlo-1.4.8.dist-info/METADATA +831 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/RECORD +131 -145
- tests/advanced_tools_example.py +10 -68
- tests/distributed_dedup_test.py +467 -0
- tests/monitor_redis_dedup.sh +72 -0
- tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
- tests/simple_cli_test.py +55 -0
- tests/test_cli_arguments.py +119 -0
- tests/test_dedup_fix.py +10 -10
- crawlo/logging/async_handler.py +0 -181
- crawlo/logging/monitor.py +0 -153
- crawlo/logging/sampler.py +0 -167
- crawlo/tools/authenticated_proxy.py +0 -241
- crawlo/tools/data_formatter.py +0 -226
- crawlo/tools/data_validator.py +0 -181
- crawlo/tools/encoding_converter.py +0 -127
- crawlo/tools/network_diagnostic.py +0 -365
- crawlo/tools/request_tools.py +0 -83
- crawlo/tools/retry_mechanism.py +0 -224
- crawlo/utils/env_config.py +0 -143
- crawlo/utils/large_scale_config.py +0 -287
- crawlo/utils/log.py +0 -80
- crawlo/utils/system.py +0 -11
- crawlo/utils/tools.py +0 -5
- crawlo/utils/url.py +0 -40
- crawlo-1.4.6.dist-info/METADATA +0 -329
- tests/env_config_example.py +0 -134
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
- tests/test_authenticated_proxy.py +0 -142
- tests/test_comprehensive.py +0 -147
- tests/test_dynamic_downloaders_proxy.py +0 -125
- tests/test_dynamic_proxy.py +0 -93
- tests/test_dynamic_proxy_config.py +0 -147
- tests/test_dynamic_proxy_real.py +0 -110
- tests/test_env_config.py +0 -122
- tests/test_framework_env_usage.py +0 -104
- tests/test_large_scale_config.py +0 -113
- tests/test_proxy_api.py +0 -265
- tests/test_real_scenario_proxy.py +0 -196
- tests/tools_example.py +0 -261
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
crawlo/network/response.py
CHANGED
@@ -11,12 +11,9 @@ HTTP Response wrapper module
 - Cookie handling
 """
 import re
-from http.cookies import SimpleCookie
-from typing import Dict, Any, List, Optional, Tuple
-from urllib.parse import urljoin as _urljoin, urlparse as _urlparse, urlsplit as _urlsplit, parse_qs as _parse_qs, \
-    urlencode as _urlencode, quote as _quote, unquote as _unquote, urldefrag as _urldefrag
-
 import ujson
+from typing import Dict, Any, List, Optional
+from urllib.parse import urljoin as _urljoin
 from parsel import Selector, SelectorList

 # Try to import the w3lib encoding-detection functions
@@ -31,6 +28,14 @@ try:
     W3LIB_AVAILABLE = True
 except ImportError:
     W3LIB_AVAILABLE = False
+    # When w3lib is unavailable, import the fallback functions from utils
+    from crawlo.utils import (
+        html_body_declared_encoding,
+        html_to_unicode,
+        http_content_type_encoding,
+        read_bom,
+        resolve_encoding,
+    )

 from crawlo.exceptions import DecodeError
 from crawlo.utils import (
@@ -38,7 +43,11 @@ from crawlo.utils import (
     extract_texts,
     extract_attr,
     extract_attrs,
-    is_xpath
+    is_xpath,
+    parse_cookies,
+    regex_search,
+    regex_findall,
+    get_header_value
 )


@@ -74,10 +83,10 @@ class Response:
         self,
         url: str,
         *,
-        headers: Dict[str, Any] = None,
+        headers: Optional[Dict[str, Any]] = None,
         body: bytes = b"",
         method: str = 'GET',
-        request: 'Request' = None,  # string annotation to avoid a circular import
+        request: Optional['Request'] = None,  # string annotation to avoid a circular import
         status_code: int = 200,
     ):
         # Basic attributes
@@ -106,7 +115,7 @@

     def _determine_encoding(self) -> str:
         """
-
+        Intelligently detect the response encoding

         Encoding detection priority:
         1. Encoding specified on the Request
@@ -184,15 +193,19 @@
     def _bom_encoding(self) -> Optional[str]:
         """BOM (byte order mark) encoding detection"""
         if not W3LIB_AVAILABLE:
-
+            # Use the fallback functions
+            encoding, _ = read_bom(self.body)
+            return encoding
         return read_bom(self.body)[0]

     @memoize_method_noargs
     def _headers_encoding(self) -> Optional[str]:
         """Encoding detection from the HTTP Content-Type header"""
         if not W3LIB_AVAILABLE:
-
-
+            # Use the fallback functions
+            content_type = self.headers.get("content-type", "") or self.headers.get("Content-Type", "")
+            return http_content_type_encoding(content_type)
+        content_type = self.headers.get("Content-Type", b"") or self.headers.get("content-type", b"")
         if isinstance(content_type, bytes):
             content_type = content_type.decode('latin-1')
         return http_content_type_encoding(content_type)
@@ -201,23 +214,26 @@
     def _body_declared_encoding(self) -> Optional[str]:
         """Encoding detection from HTML meta tag declarations"""
         if not W3LIB_AVAILABLE:
-
+            # Use the fallback functions
+            return html_body_declared_encoding(self.body)
         return html_body_declared_encoding(self.body)

     @memoize_method_noargs
     def _body_inferred_encoding(self) -> str:
         """Automatic encoding detection from the body content"""
         if not W3LIB_AVAILABLE:
-            #
-
-
-
-
-
-
-
-
-
+            # Use the fallback functions
+            content_type = self.headers.get("content-type", "") or self.headers.get("Content-Type", "")
+            # Use html_to_unicode for encoding detection
+            encoding, _ = html_to_unicode(
+                content_type,
+                self.body,
+                auto_detect_fun=self._auto_detect_fun,
+                default_encoding=self._DEFAULT_ENCODING,
+            )
+            return encoding
+
+        content_type = self.headers.get("Content-Type", b"") or self.headers.get("content-type", b"")
         if isinstance(content_type, bytes):
             content_type = content_type.decode('latin-1')

@@ -233,6 +249,13 @@
     def _auto_detect_fun(self, text: bytes) -> Optional[str]:
         """Callback for automatic encoding detection"""
         if not W3LIB_AVAILABLE:
+            # Use the fallback functions
+            for enc in (self._DEFAULT_ENCODING, "utf-8", "cp1252"):
+                try:
+                    text.decode(enc)
+                except UnicodeError:
+                    continue
+                return resolve_encoding(enc)
             return None
         for enc in (self._DEFAULT_ENCODING, "utf-8", "cp1252"):
             try:
@@ -255,7 +278,7 @@
         # If available, use w3lib for more accurate decoding
         if W3LIB_AVAILABLE:
             try:
-                content_type = self.headers.get(
+                content_type = self.headers.get("Content-Type", b"") or self.headers.get("content-type", b"")
                 if isinstance(content_type, bytes):
                     content_type = content_type.decode('latin-1')

@@ -269,6 +292,20 @@
             except Exception:
                 # If w3lib decoding fails, fall back to the original method
                 pass
+        else:
+            # Use the fallback functions
+            try:
+                content_type = self.headers.get("content-type", "") or self.headers.get("Content-Type", "")
+                # Use the html_to_unicode function
+                _, self._text_cache = html_to_unicode(
+                    content_type,
+                    self.body,
+                    default_encoding=self.encoding
+                )
+                return self._text_cache
+            except Exception:
+                # If decoding fails, fall back to the original method
+                pass

         # Try multiple encodings
         encodings_to_try = [self.encoding]
@@ -323,12 +360,12 @@
     @property
     def content_type(self) -> str:
         """Get the response Content-Type"""
-        return self.headers
+        return get_header_value(self.headers, 'content-type', '')

     @property
     def content_length(self) -> Optional[int]:
         """Get the response Content-Length"""
-        length = self.headers
+        length = get_header_value(self.headers, 'content-length')
         return int(length) if length else None

     # ==================== JSON handling methods ====================
@@ -352,103 +389,8 @@
         """Join URLs, automatically handling relative paths."""
         return _urljoin(self.url, url)

-    def urlparse(self, url: str = None) -> Tuple:
-        """
-        Parse a URL into a component tuple (scheme, netloc, path, params, query, fragment)
-
-        Args:
-            url (str, optional): URL to parse; defaults to the response URL
-
-        Returns:
-            tuple: URL component tuple
-        """
-        target_url = url if url is not None else self.url
-        return _urlparse(target_url)
-
-    def urlsplit(self, url: str = None) -> Tuple:
-        """
-        Parse a URL into a component tuple (scheme, netloc, path, query, fragment)
-
-        Args:
-            url (str, optional): URL to parse; defaults to the response URL
-
-        Returns:
-            tuple: URL component tuple (without params)
-        """
-        target_url = url if url is not None else self.url
-        return _urlsplit(target_url)
-
-    def parse_qs(self, query_string: str = None, keep_blank_values: bool = False) -> Dict[str, List[str]]:
-        """
-        Parse a query string into a dict
-
-        Args:
-            query_string (str, optional): query string; extracted from the URL by default
-            keep_blank_values (bool): whether to keep blank values
-
-        Returns:
-            dict: dict of query parameters
-        """
-        if query_string is None:
-            # Extract the query string from the URL
-            parsed = _urlparse(self.url)
-            query_string = parsed.query
-
-        return _parse_qs(query_string, keep_blank_values=keep_blank_values)
-
-    def urlencode(self, query: Dict[str, Any]) -> str:
-        """
-        Encode a dict into a query string
-
-        Args:
-            query (dict): query parameters to encode
-
-        Returns:
-            str: encoded query string
-        """
-        return _urlencode(query)
-
-    def quote(self, string: str, safe: str = '/') -> str:
-        """
-        URL-encode a string
-
-        Args:
-            string (str): string to encode
-            safe (str): characters left unencoded, '/' by default
-
-        Returns:
-            str: URL-encoded string
-        """
-        return _quote(string, safe=safe)
-
-    def unquote(self, string: str) -> str:
-        """
-        URL-decode a string
-
-        Args:
-            string (str): string to decode
-
-        Returns:
-            str: URL-decoded string
-        """
-        return _unquote(string)
-
-    def urldefrag(self, url: str = None) -> Tuple[str, str]:
-        """
-        Remove the fragment identifier from a URL
-
-        Args:
-            url (str, optional): URL to process; defaults to the response URL
-
-        Returns:
-            tuple: (URL without fragment, fragment)
-        """
-        target_url = url if url is not None else self.url
-        defrag_result = _urldefrag(target_url)
-        return (defrag_result.url, defrag_result.fragment)
-
     # ==================== Selector-related methods ====================
-
+
     @property
     def _selector(self) -> Selector:
         """Lazily load the Selector instance"""
@@ -507,7 +449,7 @@
         except Exception:
             return default

-    def extract_texts(self, xpath_or_css: str, join_str: str = " ", default: List[str] = None) -> List[str]:
+    def extract_texts(self, xpath_or_css: str, join_str: str = " ", default: Optional[List[str]] = None) -> List[str]:
         """
         Extract the text content of multiple elements as a list; supports CSS and XPath selectors

@@ -570,7 +512,7 @@
         except Exception:
             return default

-    def extract_attrs(self, xpath_or_css: str, attr_name: str, default: List[Any] = None) -> List[Any]:
+    def extract_attrs(self, xpath_or_css: str, attr_name: str, default: Optional[List[Any]] = None) -> List[Any]:
         """
         Extract an attribute value from multiple elements as a list; supports CSS and XPath selectors

@@ -608,26 +550,18 @@

     def re_search(self, pattern: str, flags: int = re.DOTALL) -> Optional[re.Match]:
         """Run a regular-expression search over the response text."""
-
-            raise TypeError("Pattern must be a string")
-        return re.search(pattern, self.text, flags=flags)
+        return regex_search(pattern, self.text, flags)

     def re_findall(self, pattern: str, flags: int = re.DOTALL) -> List[Any]:
         """Run a regular-expression findall over the response text."""
-
-            raise TypeError("Pattern must be a string")
-        return re.findall(pattern, self.text, flags=flags)
+        return regex_findall(pattern, self.text, flags)

     # ==================== Cookie handling methods ====================

     def get_cookies(self) -> Dict[str, str]:
         """Parse and return cookies from the response headers."""
         cookie_header = self.headers.get("Set-Cookie", "")
-
-            cookie_header = ", ".join(cookie_header)
-        cookies = SimpleCookie()
-        cookies.load(cookie_header)
-        return {key: morsel.value for key, morsel in cookies.items()}
+        return parse_cookies(cookie_header)

     # ==================== Request-related methods ====================

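Migration note: the URL helper methods removed above (urlparse, urlsplit, parse_qs, urlencode, quote, unquote, urldefrag) were thin wrappers around urllib.parse, and Response.urljoin() is still available. A minimal sketch of the equivalent standard-library calls, assuming `response` is a crawlo Response inside a spider callback:

    from urllib.parse import urlparse, parse_qs

    # formerly response.urlparse() / response.parse_qs()
    parsed = urlparse(response.url)
    query = parse_qs(parsed.query, keep_blank_values=False)

    # still provided by the Response object
    next_url = response.urljoin("/page/2")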
crawlo/pipelines/__init__.py
CHANGED
@@ -1,17 +1,36 @@
 #!/usr/bin/python
 # -*- coding:UTF-8 -*-
-
+"""
+Pipeline Module
+===============

+Pipeline hierarchy:
+- BasePipeline: base abstract class that defines the pipeline interface
+- ResourceManagedPipeline: adds resource management (recommended)
+- FileBasedPipeline/DatabasePipeline/CacheBasedPipeline: specialized base classes for specific scenarios

-
+Built-in dedup pipelines:
+- MemoryDedupPipeline: in-memory deduplication
+- RedisDedupPipeline: Redis-based distributed deduplication
+- BloomDedupPipeline: efficient Bloom-filter-based deduplication
+- DatabaseDedupPipeline: database-backed deduplication

-
-
-
-
-
-
+Usage example:
+    # Configure in settings.py
+    PIPELINES = [
+        'crawlo.pipelines.RedisDedupPipeline',
+        'your_project.pipelines.MongoPipeline',
+    ]
+"""

+# Import all base classes (from base_pipeline.py)
+from .base_pipeline import (
+    BasePipeline,
+    ResourceManagedPipeline,
+    FileBasedPipeline,
+    DatabasePipeline,
+    CacheBasedPipeline
+)

 # Export the dedup pipelines
 from .memory_dedup_pipeline import MemoryDedupPipeline
@@ -19,4 +38,16 @@ from .redis_dedup_pipeline import RedisDedupPipeline
 from .bloom_dedup_pipeline import BloomDedupPipeline
 from .database_dedup_pipeline import DatabaseDedupPipeline

-__all__ = [
+__all__ = [
+    # Base classes
+    'BasePipeline',
+    'ResourceManagedPipeline',
+    'FileBasedPipeline',
+    'DatabasePipeline',
+    'CacheBasedPipeline',
+    # Dedup pipelines
+    'MemoryDedupPipeline',
+    'RedisDedupPipeline',
+    'BloomDedupPipeline',
+    'DatabaseDedupPipeline'
+]