crawlo 1.4.4__py3-none-any.whl → 1.4.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlo might be problematic.
- crawlo/__init__.py +11 -15
- crawlo/__version__.py +1 -1
- crawlo/commands/startproject.py +24 -0
- crawlo/core/engine.py +2 -2
- crawlo/core/scheduler.py +4 -4
- crawlo/crawler.py +8 -7
- crawlo/downloader/__init__.py +5 -2
- crawlo/extension/__init__.py +2 -2
- crawlo/filters/aioredis_filter.py +8 -1
- crawlo/filters/memory_filter.py +8 -1
- crawlo/initialization/built_in.py +13 -4
- crawlo/initialization/core.py +5 -4
- crawlo/interfaces.py +24 -0
- crawlo/middleware/__init__.py +7 -4
- crawlo/middleware/middleware_manager.py +15 -8
- crawlo/mode_manager.py +45 -11
- crawlo/network/response.py +374 -69
- crawlo/pipelines/mysql_pipeline.py +6 -6
- crawlo/pipelines/pipeline_manager.py +2 -2
- crawlo/project.py +2 -4
- crawlo/settings/default_settings.py +4 -0
- crawlo/task_manager.py +2 -2
- crawlo/templates/project/items.py.tmpl +2 -2
- crawlo/templates/project/middlewares.py.tmpl +9 -89
- crawlo/templates/project/pipelines.py.tmpl +8 -68
- crawlo/tools/__init__.py +0 -11
- crawlo/utils/__init__.py +17 -1
- crawlo/utils/db_helper.py +220 -319
- crawlo/utils/error_handler.py +313 -67
- crawlo/utils/fingerprint.py +3 -4
- crawlo/utils/misc.py +82 -0
- crawlo/utils/request.py +55 -66
- crawlo/utils/selector_helper.py +138 -0
- crawlo/utils/spider_loader.py +185 -45
- crawlo/utils/text_helper.py +95 -0
- crawlo-1.4.5.dist-info/METADATA +329 -0
- {crawlo-1.4.4.dist-info → crawlo-1.4.5.dist-info}/RECORD +76 -49
- tests/bug_check_test.py +251 -0
- tests/direct_selector_helper_test.py +97 -0
- tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -0
- tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -0
- tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -0
- tests/ofweek_scrapy/ofweek_scrapy/settings.py +85 -0
- tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -0
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +162 -0
- tests/ofweek_scrapy/scrapy.cfg +11 -0
- tests/performance_comparison.py +4 -5
- tests/simple_crawlo_test.py +1 -2
- tests/simple_follow_test.py +39 -0
- tests/simple_response_selector_test.py +95 -0
- tests/simple_selector_helper_test.py +155 -0
- tests/simple_selector_test.py +208 -0
- tests/simple_url_test.py +74 -0
- tests/test_crawler_process_import.py +39 -0
- tests/test_crawler_process_spider_modules.py +48 -0
- tests/test_edge_cases.py +7 -5
- tests/test_encoding_core.py +57 -0
- tests/test_encoding_detection.py +127 -0
- tests/test_factory_compatibility.py +197 -0
- tests/test_optimized_selector_naming.py +101 -0
- tests/test_priority_behavior.py +18 -18
- tests/test_response_follow.py +105 -0
- tests/test_response_selector_methods.py +93 -0
- tests/test_response_url_methods.py +71 -0
- tests/test_response_urljoin.py +87 -0
- tests/test_scrapy_style_encoding.py +113 -0
- tests/test_selector_helper.py +101 -0
- tests/test_selector_optimizations.py +147 -0
- tests/test_spider_loader.py +50 -0
- tests/test_spider_loader_comprehensive.py +70 -0
- tests/test_spiders/__init__.py +1 -0
- tests/test_spiders/test_spider.py +10 -0
- crawlo/tools/anti_crawler.py +0 -269
- crawlo/utils/class_loader.py +0 -26
- crawlo/utils/enhanced_error_handler.py +0 -357
- crawlo-1.4.4.dist-info/METADATA +0 -190
- tests/simple_log_test.py +0 -58
- tests/simple_test.py +0 -48
- tests/test_framework_logger.py +0 -67
- tests/test_framework_startup.py +0 -65
- tests/test_mode_change.py +0 -73
- {crawlo-1.4.4.dist-info → crawlo-1.4.5.dist-info}/WHEEL +0 -0
- {crawlo-1.4.4.dist-info → crawlo-1.4.5.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.4.dist-info → crawlo-1.4.5.dist-info}/top_level.txt +0 -0
- /tests/{final_command_test_report.md → ofweek_scrapy/ofweek_scrapy/__init__.py} +0 -0
crawlo/utils/request.py
CHANGED
@@ -79,52 +79,63 @@ def request_fingerprint(
     :param include_headers: header names (str or bytes) to include in the fingerprint calculation
     :return: request fingerprint (hex string)
     """
-
-
-    #
-
-
-
-
+    from crawlo.utils.fingerprint import FingerprintGenerator
+
+    # prepare the request data
+    method = request.method
+    url = request.url
+    body = request.body or b''
+    headers = None
+
     # process headers
-    if include_headers:
-        headers =
+    if include_headers and hasattr(request, 'headers'):
+        headers = {}
         for header_name in include_headers:
-
-            value =
+            name_str = str(header_name).lower()  # normalize to lowercase for matching
+            value = ''

             # support both header access styles (e.g. MultiDict or dict)
-            if hasattr(headers, 'get_all'):
+            if hasattr(request.headers, 'get_all'):
                 # e.g. the get_all method of scrapy.http.Headers
-                values = headers.get_all(
-                value =
-            elif hasattr(headers, '__getitem__'):
+                values = request.headers.get_all(name_str)
+                value = ';'.join(str(v) for v in values) if values else ''
+            elif hasattr(request.headers, '__getitem__'):
                 # e.g. a plain dict
                 try:
-                    raw_value = headers[
+                    raw_value = request.headers[name_str]
                     if isinstance(raw_value, list):
-                        value =
+                        value = ';'.join(str(v) for v in raw_value)
                     else:
-                        value =
+                        value = str(raw_value)
                 except (KeyError, TypeError):
-                    value =
+                    value = ''
             else:
-                value =
-
-
-
-
+                value = ''
+
+            headers[name_str] = value
+
+    # use the unified fingerprint generator
+    return FingerprintGenerator.request_fingerprint(method, url, body, headers)


 def set_request(request: Request, priority: int) -> None:
+    """
+    Set the depth and priority of a request.
+
+    :param request: Request object
+    :param priority: priority value
+    """
+    # increase the request depth
     request.meta['depth'] = request.meta.setdefault('depth', 0) + 1
+
+    # adjust priority by depth: the deeper the request, the lower its priority
     if priority:
         request.priority -= request.meta['depth'] * priority


 def request_to_dict(request: Request, spider=None) -> Dict[str, Any]:
     """
-    Convert a Request object into a JSON
+    Convert a Request object into a JSON-serializable dict, used for request serialization in distributed crawling.

     Args:
         request: the Request object to serialize

@@ -146,28 +157,22 @@ def request_to_dict(request: Request, spider=None) -> Dict[str, Any]:

     # 1. handle callback
     # functions cannot be serialized directly, so store their import path
-    if callable(request
+    if callable(getattr(request, 'callback', None)):
         d['_callback'] = _get_function_path(request.callback)

     # 2. handle errback
-    if callable(request
-        d['_errback'] = _get_function_path(request.
+    if callable(getattr(request, 'err_back', None)):
+        d['_errback'] = _get_function_path(request.err_back)

     # 3. record the original class name so the correct instance can be recreated on deserialization
     d['_class'] = request.__class__.__module__ + '.' + request.__class__.__name__

-    # 4. special handling for FormRequest
-    # if it is a FormRequest, its formdata needs to be saved
-    if isinstance(request, Request):
-        if hasattr(request, 'formdata'):
-            d['formdata'] = request.formdata
-
     return d


 def request_from_dict(d: Dict[str, Any], spider=None) -> Request:
     """
-    Rebuild a Request from a dict
+    Rebuild a Request object from a dict, used for request deserialization in distributed crawling.

     Args:
         d: the dict produced by request_to_dict

@@ -193,37 +198,21 @@ def request_from_dict(d: Dict[str, Any], spider=None) -> Request:
     errback_path = d.pop('_errback', None)
     errback = _get_function_from_path(errback_path, spider) if errback_path else None

-    # 4. extract special fields
-    formdata = d.pop('formdata', None)
-
     # 5. create the Request instance
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    else:
-        # plain Request, or no formdata
-        request = cls(
-            url=d['url'],
-            method=d.get('method', 'GET'),
-            headers=d.get('headers', {}),
-            body=d.get('body'),
-            callback=callback,
-            errback=errback,
-            meta=d.get('meta', {}),
-            flags=d.get('flags', []),
-            cb_kwargs=d.get('cb_kwargs', {}),
-        )
+    request = cls(
+        url=d['url'],
+        method=d.get('method', 'GET'),
+        headers=d.get('headers', {}),
+        body=d.get('body'),
+        callback=callback,
+        meta=d.get('meta', {}),
+        flags=d.get('flags', []),
+        cb_kwargs=d.get('cb_kwargs', {}),
+    )
+
+    # manually set the err_back attribute
+    if errback is not None:
+        request.err_back = errback

     return request

@@ -256,7 +245,7 @@ def _get_function_from_path(path: str, spider=None) -> Optional[callable]:
         func = getattr(func, attr)

     # if a spider is given and func is one of its methods
-    if spider and hasattr(spider, func.__name__):
+    if spider and hasattr(func, '__name__') and hasattr(spider, func.__name__):
         spider_method = getattr(spider, func.__name__)
         if spider_method is func:
             return spider_method  # return the bound method
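The serialization helpers above store callbacks as dotted import paths and re-bind them to the spider when the dict comes back. A minimal round-trip sketch (the Request constructor arguments and the `spider` variable are assumptions; only request_to_dict/request_from_dict come from the diff above):

from crawlo.network.request import Request
from crawlo.utils.request import request_to_dict, request_from_dict

# `spider` is assumed to be a Spider instance that defines a parse() method.
req = Request(url="https://example.com/page", callback=spider.parse, meta={"page": 1})

d = request_to_dict(req, spider=spider)         # callback is stored as a dotted path string
restored = request_from_dict(d, spider=spider)  # callback is re-bound to spider.parse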
crawlo/utils/selector_helper.py
ADDED
@@ -0,0 +1,138 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+Selector helper module
+======================
+Helper functions for working with parsel selectors: extracting text, attributes and so on.
+
+The module provides the following main functions:
+- extract_text: extract text from a list of elements and join it
+- extract_texts: extract a list of texts from a list of elements
+- extract_attr: extract a single element's attribute value from a list of elements
+- extract_attrs: extract the attribute values of multiple elements as a list
+- is_xpath: determine whether a query string is an XPath expression
+
+All functions use short, intuitive names that are easy to remember and use.
+"""
+
+from typing import List, Any, Optional
+from parsel import Selector, SelectorList
+
+
+def extract_text(elements: SelectorList, join_str: str = " ") -> str:
+    """
+    Extract text from a list of elements and join it.
+
+    :param elements: SelectorList of elements
+    :param join_str: separator used to join the texts
+    :return: the joined text
+
+    Example:
+        title_elements = selector.css('title')
+        title_text = extract_text(title_elements)
+    """
+    texts = []
+    for element in elements:
+        # collect all text nodes of the element
+        if hasattr(element, 'xpath'):
+            element_texts = element.xpath('.//text()').getall()
+        else:
+            element_texts = [str(element)]
+        # strip and keep non-empty texts
+        for text in element_texts:
+            cleaned = text.strip()
+            if cleaned:
+                texts.append(cleaned)
+    return join_str.join(texts)
+
+
+def extract_texts(elements: SelectorList, join_str: str = " ") -> List[str]:
+    """
+    Extract a list of texts from a list of elements.
+
+    :param elements: SelectorList of elements
+    :param join_str: separator used to join the texts within a single node
+    :return: list of plain texts (one entry per node)
+
+    Example:
+        li_elements = selector.css('.list li')
+        li_texts = extract_texts(li_elements)
+    """
+    result = []
+    for element in elements:
+        # extract the texts of each element
+        if hasattr(element, 'xpath'):
+            texts = element.xpath('.//text()').getall()
+        else:
+            texts = [str(element)]
+
+        # strip the texts and join them
+        clean_texts = [text.strip() for text in texts if text.strip()]
+        if clean_texts:
+            result.append(join_str.join(clean_texts))
+
+    return result
+
+
+def extract_attr(elements: SelectorList, attr_name: str, default: Any = None) -> Any:
+    """
+    Extract a single element's attribute value from a list of elements.
+
+    :param elements: SelectorList of elements
+    :param attr_name: attribute name
+    :param default: default return value
+    :return: the attribute value, or the default
+
+    Example:
+        link_elements = selector.css('.link')
+        link_href = extract_attr(link_elements, 'href')
+    """
+    # use parsel's attrib property to read the first matching element's attribute
+    if hasattr(elements, 'attrib'):
+        return elements.attrib.get(attr_name, default)
+    # if elements is a SelectorList, read the attribute of the first element
+    elif len(elements) > 0 and hasattr(elements[0], 'attrib'):
+        return elements[0].attrib.get(attr_name, default)
+    return default
+
+
+def extract_attrs(elements: SelectorList, attr_name: str) -> List[Any]:
+    """
+    Extract the attribute values of multiple elements as a list.
+
+    :param elements: SelectorList of elements
+    :param attr_name: attribute name
+    :return: list of attribute values
+
+    Example:
+        all_links = selector.css('a')
+        all_hrefs = extract_attrs(all_links, 'href')
+    """
+    result = []
+    for element in elements:
+        # use parsel's attrib property to read the element's attribute value
+        if hasattr(element, 'attrib'):
+            attr_value = element.attrib.get(attr_name)
+            if attr_value is not None:
+                result.append(attr_value)
+
+    return result
+
+
+def is_xpath(query: str) -> bool:
+    """
+    Determine whether a query string is an XPath expression.
+
+    :param query: the query string
+    :return: whether it is XPath
+    """
+    return query.startswith(('/', '//', './'))
+
+
+__all__ = [
+    "extract_text",
+    "extract_texts",
+    "extract_attr",
+    "extract_attrs",
+    "is_xpath"
+]
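A quick usage sketch for the new helpers (the HTML snippet is made up; parsel is the library the module itself imports):

from parsel import Selector
from crawlo.utils.selector_helper import extract_text, extract_texts, extract_attrs, is_xpath

sel = Selector(text="<ul class='list'><li><a href='/a'>A</a></li><li><a href='/b'>B</a></li></ul>")

print(extract_text(sel.css(".list li")))    # "A B"        -- all text nodes, joined
print(extract_texts(sel.css(".list li")))   # ["A", "B"]   -- one entry per <li>
print(extract_attrs(sel.css("a"), "href"))  # ["/a", "/b"]
print(is_xpath("//div[@id='main']"))        # True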
crawlo/utils/spider_loader.py
CHANGED
@@ -1,62 +1,202 @@
 import importlib
+import traceback
+import warnings
+from collections import defaultdict
 from pathlib import Path
-from typing import List, Type,
+from typing import List, Type, Dict, Any

+from crawlo.interfaces import ISpiderLoader
+from crawlo.settings.setting_manager import SettingManager
 from crawlo.spider import Spider
+from crawlo.network.request import Request
 from crawlo.utils.log import get_logger

 logger = get_logger(__name__)


-class
-    """
-
-
-
-
-
-
-    def
-        """
-
-
-
-
-
-
-
-
-
-            module_name = py_file.stem
-            spider_module_path = f"{self.project_package}.spiders.{module_name}"
-
-            try:
-                module = importlib.import_module(spider_module_path)
-            except ImportError as e:
-                logger.debug(f"Skip module {module_name}: {e}")
-                continue
-
-            # find all Spider subclasses
-            for attr_name in dir(module):
-                attr_value = getattr(module, attr_name)
-                if (isinstance(attr_value, type) and
-                        issubclass(attr_value, Spider) and
-                        attr_value != Spider and
-                        hasattr(attr_value, 'name')):
-
-                    spider_name = getattr(attr_value, 'name')
-                    if spider_name in self._spiders:
-                        logger.warning(f"Duplicate spider name '{spider_name}' found")
-                    self._spiders[spider_name] = attr_value
+class SpiderLoaderProtocol:
+    """Protocol for spider loader"""
+
+    @classmethod
+    def from_settings(cls, settings: SettingManager) -> 'SpiderLoaderProtocol':
+        """Create spider loader from settings"""
+        return cls(settings)
+
+    def load(self, spider_name: str) -> Type[Spider]:
+        """Load a spider by name"""
+        raise NotImplementedError
+
+    def list(self) -> List[str]:
+        """List all available spider names"""
+        raise NotImplementedError
+
+    def find_by_request(self, request: 'Request') -> List[str]:
+        """Find spider names that can handle the given request"""
+        raise NotImplementedError

-    def load(self, spider_name: str) -> Optional[Type[Spider]]:
-        """Load a spider by name"""
-        return self._spiders.get(spider_name)

+class SpiderLoader(ISpiderLoader):
+    """Spider loader: discovers and loads spiders"""
+
+    def __init__(self, settings: SettingManager = None):
+        # if settings are provided, read the configuration from them
+        if settings is not None:
+            self.spider_modules = settings.get('SPIDER_MODULES', [])
+            self.warn_only = settings.get('SPIDER_LOADER_WARN_ONLY', False)
+        else:
+            # default configuration
+            self.spider_modules = []
+            self.warn_only = False
+
+        self._spiders: Dict[str, Type[Spider]] = {}
+        self._found: Dict[str, List[tuple]] = defaultdict(list)
+        self._load_all_spiders()
+
+    @classmethod
+    def from_settings(cls, settings: SettingManager) -> 'SpiderLoader':
+        """Create a SpiderLoader instance from settings"""
+        return cls(settings)
+
+    def _check_name_duplicates(self) -> None:
+        """Check for duplicate spider names"""
+        dupes = []
+        for name, locations in self._found.items():
+            if len(locations) > 1:
+                dupes.extend([
+                    f" {cls} named {name!r} (in {mod})"
+                    for mod, cls in locations
+                ])
+
+        if dupes:
+            dupes_string = "\n\n".join(dupes)
+            warnings.warn(
+                "There are several spiders with the same name:\n\n"
+                f"{dupes_string}\n\n This can cause unexpected behavior.",
+                category=UserWarning,
+            )
+
+    def _load_spiders(self, module) -> None:
+        """Load all spiders defined in a module"""
+        for attr_name in dir(module):
+            attr_value = getattr(module, attr_name)
+            if (isinstance(attr_value, type) and
+                    issubclass(attr_value, Spider) and
+                    attr_value != Spider and
+                    hasattr(attr_value, 'name')):
+
+                spider_name = getattr(attr_value, 'name')
+                self._found[spider_name].append((module.__name__, attr_value.__name__))
+                self._spiders[spider_name] = attr_value
+
+    def _load_spiders_from_package(self, package_name: str) -> None:
+        """Load spiders from a package"""
+        try:
+            # try to import the package
+            package = importlib.import_module(package_name)
+
+            # walk all modules in the package
+            package_path = Path(package.__file__).parent
+            for py_file in package_path.glob("*.py"):
+                if py_file.name.startswith('_'):
+                    continue
+
+                module_name = py_file.stem
+                spider_module_path = f"{package_name}.{module_name}"
+
+                try:
+                    module = importlib.import_module(spider_module_path)
+                    self._load_spiders(module)
+                except ImportError as e:
+                    if self.warn_only:
+                        logger.warning(f"Could not load spiders from module '{spider_module_path}': {e}")
+                        logger.debug(traceback.format_exc())
+                    else:
+                        raise
+        except (ImportError, SyntaxError) as e:
+            if self.warn_only:
+                logger.warning(f"Could not load spiders from package '{package_name}': {e}")
+                logger.debug(traceback.format_exc())
+            else:
+                raise
+
+    def _load_all_spiders(self) -> None:
+        """Load all spiders"""
+        # if SPIDER_MODULES is configured, load from those modules
+        if self.spider_modules:
+            for module_name in self.spider_modules:
+                self._load_spiders_from_package(module_name)
+        else:
+            # backward compatibility: if SPIDER_MODULES is not configured, use the old approach,
+            # which assumes the default spiders directory layout
+            spiders_dir = Path.cwd() / 'spiders'
+            if not spiders_dir.exists():
+                spiders_dir = Path.cwd() / 'spider'
+                if not spiders_dir.exists():
+                    logger.warning("Spiders directory not found")
+                    return
+
+            for py_file in spiders_dir.glob("*.py"):
+                if py_file.name.startswith('_'):
+                    continue
+
+                module_name = py_file.stem
+                module = None
+                try:
+                    # try different import paths
+                    spider_module_path = None
+                    for possible_package in ['spiders', 'spider']:
+                        try:
+                            spider_module_path = f"{possible_package}.{module_name}"
+                            module = importlib.import_module(spider_module_path)
+                            break
+                        except ImportError:
+                            continue
+
+                    if module is None:
+                        raise ImportError(f"Could not import {module_name}")
+
+                    self._load_spiders(module)
+                except ImportError as e:
+                    logger.debug(f"Skip module {module_name}: {e}")
+                    continue
+
+        self._check_name_duplicates()
+
+    def load(self, spider_name: str) -> Type[Spider]:
+        """
+        Load a spider by name.
+
+        Args:
+            spider_name: the spider name
+
+        Returns:
+            the Spider class
+
+        Raises:
+            KeyError: if no spider with the given name is found
+        """
+        if spider_name not in self._spiders:
+            raise KeyError(f"Spider not found: {spider_name}")
+        return self._spiders[spider_name]
+
     def list(self) -> List[str]:
         """List all available spider names"""
         return list(self._spiders.keys())
-
+
+    def find_by_request(self, request: 'Request') -> List[str]:
+        """
+        Find the spiders that can handle the given request.
+
+        Args:
+            request: the request object
+
+        Returns:
+            names of the spiders that can handle the request
+        """
+        # more sophisticated matching logic could be implemented here;
+        # for now, simply return all spider names
+        return list(self._spiders.keys())
+
     def get_all(self) -> Dict[str, Type[Spider]]:
         """Return all spiders"""
         return self._spiders.copy()
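A minimal sketch of driving the reworked loader from settings (the SettingManager.set call and the "myproject.spiders"/"news" names are assumptions; the loader itself only reads SPIDER_MODULES and SPIDER_LOADER_WARN_ONLY):

from crawlo.settings.setting_manager import SettingManager
from crawlo.utils.spider_loader import SpiderLoader

settings = SettingManager()
# `set` is assumed here; any way of putting these keys into the settings works
settings.set("SPIDER_MODULES", ["myproject.spiders"])
settings.set("SPIDER_LOADER_WARN_ONLY", True)   # warn and skip broken spider modules instead of raising

loader = SpiderLoader.from_settings(settings)
print(loader.list())               # names of all discovered spiders
spider_cls = loader.load("news")   # raises KeyError if "news" is unknown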
crawlo/utils/text_helper.py
ADDED
@@ -0,0 +1,95 @@
+# -*- coding: utf-8 -*-
+import json
+import re
+from typing import Any, Union, List, Dict, Tuple, Optional
+
+from crawlo.utils.log import get_logger
+
+logger = get_logger(__name__)
+
+# regular-expression cache
+_REGEXPS: Dict[str, "re.Pattern"] = {}
+
+
+def extract_text_by_regex(
+        text: Union[str, Any],
+        patterns: Union[str, List[str]],
+        allow_repeat: bool = True,
+        fetch_one: bool = False,
+        join_with: Optional[str] = None,
+) -> Union[str, List[str], Tuple]:
+    """
+    Extract information from text with regular expressions, trying multiple patterns as fallbacks.
+
+    Args:
+        text (str): the text, or anything convertible to str
+        patterns (str or list of str): regex patterns, tried in order
+        allow_repeat (bool): whether duplicate results are allowed
+        fetch_one (bool): only extract the first match (returns a tuple)
+        join_with (str, optional): if given, join the results into a string with this separator
+
+    Returns:
+        str | list | tuple: the matches, returned as a string, list or tuple depending on the arguments
+    """
+    if isinstance(patterns, str):
+        patterns = [patterns]
+
+    results = []
+    for pattern in patterns:
+        if not pattern:
+            continue
+
+        if pattern not in _REGEXPS:
+            _REGEXPS[pattern] = re.compile(pattern, re.S)
+
+        if fetch_one:
+            match = _REGEXPS[pattern].search(str(text))
+            results = match.groups() if match else ("",)
+            break
+        else:
+            found = _REGEXPS[pattern].findall(str(text))
+            if found:
+                results = found
+                break
+
+    if fetch_one:
+        return results[0] if len(results) == 1 else results
+
+    if not allow_repeat:
+        results = sorted(set(results), key=results.index)
+
+    return join_with.join(results) if join_with else results
+
+
+def parse_json_safely(json_str: Union[str, Any]) -> Dict:
+    """
+    Safely parse a JSON string, tolerating non-standard input (single quotes, unquoted keys).
+
+    Args:
+        json_str (str): the JSON string
+
+    Returns:
+        dict: the parsed dict, or an empty dict on failure
+    """
+    if not json_str:
+        return {}
+
+    try:
+        return json.loads(json_str)
+    except Exception as e1:
+        try:
+            cleaned = json_str.strip().replace("'", '"')
+            # use the new function name
+            keys = extract_text_by_regex(cleaned, r'(\w+):')
+            for key in keys:
+                cleaned = cleaned.replace(f"{key}:", f'"{key}":')
+            return json.loads(cleaned) if cleaned else {}
+        except Exception as e2:
+            logger.error(
+                f"JSON parsing failed\n"
+                f"original: {json_str}\n"
+                f"error 1: {e1}\n"
+                f"repaired: {cleaned}\n"
+                f"error 2: {e2}"
+            )
+            return {}