crawlo 1.4.3__py3-none-any.whl → 1.4.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.

Files changed (107)
  1. crawlo/__init__.py +11 -15
  2. crawlo/__version__.py +1 -1
  3. crawlo/commands/genspider.py +52 -17
  4. crawlo/commands/startproject.py +24 -0
  5. crawlo/core/engine.py +2 -2
  6. crawlo/core/scheduler.py +4 -4
  7. crawlo/crawler.py +13 -6
  8. crawlo/downloader/__init__.py +5 -2
  9. crawlo/extension/__init__.py +2 -2
  10. crawlo/filters/aioredis_filter.py +8 -1
  11. crawlo/filters/memory_filter.py +8 -1
  12. crawlo/initialization/built_in.py +13 -4
  13. crawlo/initialization/core.py +5 -4
  14. crawlo/interfaces.py +24 -0
  15. crawlo/middleware/__init__.py +7 -4
  16. crawlo/middleware/middleware_manager.py +15 -8
  17. crawlo/mode_manager.py +45 -11
  18. crawlo/network/response.py +374 -69
  19. crawlo/pipelines/mysql_pipeline.py +6 -6
  20. crawlo/pipelines/pipeline_manager.py +2 -2
  21. crawlo/project.py +2 -4
  22. crawlo/queue/pqueue.py +2 -6
  23. crawlo/queue/queue_manager.py +1 -2
  24. crawlo/settings/default_settings.py +15 -30
  25. crawlo/task_manager.py +2 -2
  26. crawlo/templates/project/items.py.tmpl +2 -2
  27. crawlo/templates/project/middlewares.py.tmpl +9 -89
  28. crawlo/templates/project/pipelines.py.tmpl +8 -68
  29. crawlo/templates/project/settings.py.tmpl +51 -65
  30. crawlo/templates/project/settings_distributed.py.tmpl +59 -67
  31. crawlo/templates/project/settings_gentle.py.tmpl +45 -40
  32. crawlo/templates/project/settings_high_performance.py.tmpl +45 -40
  33. crawlo/templates/project/settings_minimal.py.tmpl +37 -26
  34. crawlo/templates/project/settings_simple.py.tmpl +45 -40
  35. crawlo/templates/run.py.tmpl +3 -7
  36. crawlo/tools/__init__.py +0 -11
  37. crawlo/utils/__init__.py +17 -1
  38. crawlo/utils/db_helper.py +220 -319
  39. crawlo/utils/error_handler.py +313 -67
  40. crawlo/utils/fingerprint.py +3 -4
  41. crawlo/utils/misc.py +82 -0
  42. crawlo/utils/request.py +55 -66
  43. crawlo/utils/selector_helper.py +138 -0
  44. crawlo/utils/spider_loader.py +185 -45
  45. crawlo/utils/text_helper.py +95 -0
  46. crawlo-1.4.5.dist-info/METADATA +329 -0
  47. {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/RECORD +89 -68
  48. tests/bug_check_test.py +251 -0
  49. tests/direct_selector_helper_test.py +97 -0
  50. tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -0
  51. tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -0
  52. tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -0
  53. tests/ofweek_scrapy/ofweek_scrapy/settings.py +85 -0
  54. tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -0
  55. tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +162 -0
  56. tests/ofweek_scrapy/scrapy.cfg +11 -0
  57. tests/performance_comparison.py +4 -5
  58. tests/simple_crawlo_test.py +1 -2
  59. tests/simple_follow_test.py +39 -0
  60. tests/simple_response_selector_test.py +95 -0
  61. tests/simple_selector_helper_test.py +155 -0
  62. tests/simple_selector_test.py +208 -0
  63. tests/simple_url_test.py +74 -0
  64. tests/test_crawler_process_import.py +39 -0
  65. tests/test_crawler_process_spider_modules.py +48 -0
  66. tests/test_edge_cases.py +7 -5
  67. tests/test_encoding_core.py +57 -0
  68. tests/test_encoding_detection.py +127 -0
  69. tests/test_factory_compatibility.py +197 -0
  70. tests/test_multi_directory.py +68 -0
  71. tests/test_multiple_spider_modules.py +81 -0
  72. tests/test_optimized_selector_naming.py +101 -0
  73. tests/test_priority_behavior.py +18 -18
  74. tests/test_response_follow.py +105 -0
  75. tests/test_response_selector_methods.py +93 -0
  76. tests/test_response_url_methods.py +71 -0
  77. tests/test_response_urljoin.py +87 -0
  78. tests/test_scrapy_style_encoding.py +113 -0
  79. tests/test_selector_helper.py +101 -0
  80. tests/test_selector_optimizations.py +147 -0
  81. tests/test_spider_loader.py +50 -0
  82. tests/test_spider_loader_comprehensive.py +70 -0
  83. tests/test_spider_modules.py +85 -0
  84. tests/test_spiders/__init__.py +1 -0
  85. tests/test_spiders/test_spider.py +10 -0
  86. crawlo/tools/anti_crawler.py +0 -269
  87. crawlo/utils/class_loader.py +0 -26
  88. crawlo/utils/enhanced_error_handler.py +0 -357
  89. crawlo-1.4.3.dist-info/METADATA +0 -190
  90. examples/test_project/__init__.py +0 -7
  91. examples/test_project/run.py +0 -35
  92. examples/test_project/test_project/__init__.py +0 -4
  93. examples/test_project/test_project/items.py +0 -18
  94. examples/test_project/test_project/middlewares.py +0 -119
  95. examples/test_project/test_project/pipelines.py +0 -97
  96. examples/test_project/test_project/settings.py +0 -170
  97. examples/test_project/test_project/spiders/__init__.py +0 -10
  98. examples/test_project/test_project/spiders/of_week_dis.py +0 -144
  99. tests/simple_log_test.py +0 -58
  100. tests/simple_test.py +0 -48
  101. tests/test_framework_logger.py +0 -67
  102. tests/test_framework_startup.py +0 -65
  103. tests/test_mode_change.py +0 -73
  104. {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/WHEEL +0 -0
  105. {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/entry_points.txt +0 -0
  106. {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/top_level.txt +0 -0
  107. /tests/{final_command_test_report.md → ofweek_scrapy/ofweek_scrapy/__init__.py} +0 -0
crawlo/mode_manager.py CHANGED
@@ -51,9 +51,11 @@ class ModeManager:
     def get_standalone_settings() -> Dict[str, Any]:
         """Get the standalone-mode configuration."""
         return {
+            'RUN_MODE': 'standalone',
             'QUEUE_TYPE': 'memory',
             'FILTER_CLASS': 'crawlo.filters.memory_filter.MemoryFilter',
             'DEFAULT_DEDUP_PIPELINE': 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline',
+            'PROJECT_NAME': 'crawlo',
             'CONCURRENCY': 8,
             'MAX_RUNNING_SPIDERS': 1,
             'DOWNLOAD_DELAY': 1.0,
@@ -75,6 +77,7 @@ class ModeManager:
         redis_url = f'redis://{redis_host}:{redis_port}/{redis_db}'

         return {
+            'RUN_MODE': 'distributed',
             'QUEUE_TYPE': 'redis',
             'FILTER_CLASS': 'crawlo.filters.aioredis_filter.AioRedisFilter',
             'DEFAULT_DEDUP_PIPELINE': 'crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeline',
@@ -95,6 +98,7 @@ class ModeManager:
         """Get the auto-detect mode configuration."""
         # Use the in-memory queue and filter by default
         settings = ModeManager.get_standalone_settings()
+        settings['RUN_MODE'] = 'auto'
         settings['QUEUE_TYPE'] = 'auto'
         return settings

@@ -143,13 +147,29 @@ class ModeManager:
            raise ValueError(f"Unsupported run mode: {mode}")

        # Merge user-defined settings
-        user_settings = {
-            k: v for k,
-            v in kwargs.items() if k not in [
-                'redis_host',
-                'redis_port',
-                'redis_password',
-                'project_name']}
+        # For distributed mode, filter out the mode-specific parameters
+        if mode == RunMode.DISTRIBUTED:
+            user_settings = {
+                k.upper(): v for k,
+                v in kwargs.items() if k not in [
+                    'redis_host',
+                    'redis_port',
+                    'redis_password',
+                    'project_name']}
+            # Handle project_name explicitly
+            if 'project_name' in kwargs:
+                settings['PROJECT_NAME'] = kwargs['project_name']
+        else:
+            # For standalone and auto mode, only filter out the Redis-related parameters
+            user_settings = {
+                k.upper(): v for k,
+                v in kwargs.items() if k not in [
+                    'redis_host',
+                    'redis_port',
+                    'redis_password']}
+            # Handle project_name explicitly
+            if 'project_name' in kwargs:
+                settings['PROJECT_NAME'] = kwargs['project_name']
        settings.update(user_settings)
        self._debug(f"Merged user-defined settings: {list(user_settings.keys())}")

@@ -182,9 +202,16 @@ class ModeManager:


 # Convenience functions
-def standalone_mode(**kwargs) -> Dict[str, Any]:
+def standalone_mode(
+    project_name: str = 'crawlo',
+    **kwargs
+) -> Dict[str, Any]:
     """Quickly create a standalone-mode configuration."""
-    return ModeManager().resolve_mode_settings('standalone', **kwargs)
+    return ModeManager().resolve_mode_settings(
+        'standalone',
+        project_name=project_name,
+        **kwargs
+    )


 def distributed_mode(
@@ -207,9 +234,16 @@ def distributed_mode(
     )


-def auto_mode(**kwargs) -> Dict[str, Any]:
+def auto_mode(
+    project_name: str = 'crawlo',
+    **kwargs
+) -> Dict[str, Any]:
     """Quickly create an auto-detect-mode configuration."""
-    return ModeManager().resolve_mode_settings('auto', **kwargs)
+    return ModeManager().resolve_mode_settings(
+        'auto',
+        project_name=project_name,
+        **kwargs
+    )


 # Environment variable support
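Taken together, the mode_manager.py changes mean the resolved settings now carry an explicit RUN_MODE, user kwargs are upper-cased before merging, and project_name is threaded through to PROJECT_NAME. A minimal sketch of the resulting behaviour, assuming these convenience helpers are importable from crawlo.mode_manager as the diff suggests (illustrative, not taken from the package docs):

    from crawlo.mode_manager import standalone_mode, auto_mode

    # Per the diff, user kwargs are upper-cased before merging, and
    # project_name is routed into settings['PROJECT_NAME'].
    settings = standalone_mode(project_name='my_crawler', concurrency=16)
    print(settings['RUN_MODE'])       # expected: 'standalone'
    print(settings['PROJECT_NAME'])   # expected: 'my_crawler'
    print(settings['QUEUE_TYPE'])     # expected: 'memory'
    print(settings['CONCURRENCY'])    # expected: 16 (key upper-cased on merge)

    auto_settings = auto_mode(project_name='my_crawler')
    print(auto_settings['RUN_MODE'])  # expected: 'auto'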
crawlo/network/response.py CHANGED
@@ -11,13 +11,49 @@ HTTP Response wrapper module
 - Cookie handling
 """
 import re
-import ujson
 from http.cookies import SimpleCookie
+from typing import Dict, Any, List, Optional, Tuple
+from urllib.parse import urljoin as _urljoin, urlparse as _urlparse, urlsplit as _urlsplit, parse_qs as _parse_qs, \
+    urlencode as _urlencode, quote as _quote, unquote as _unquote, urldefrag as _urldefrag
+
+import ujson
 from parsel import Selector, SelectorList
-from typing import Dict, Any, List, Optional, Union
-from urllib.parse import urljoin as _urljoin
+
+# Try to import the w3lib encoding-detection helpers
+try:
+    from w3lib.encoding import (
+        html_body_declared_encoding,
+        html_to_unicode,
+        http_content_type_encoding,
+        read_bom,
+        resolve_encoding,
+    )
+    W3LIB_AVAILABLE = True
+except ImportError:
+    W3LIB_AVAILABLE = False

 from crawlo.exceptions import DecodeError
+from crawlo.utils import (
+    extract_text,
+    extract_texts,
+    extract_attr,
+    extract_attrs,
+    is_xpath
+)
+
+
+def memoize_method_noargs(func):
+    """
+    Decorator that caches the result of a no-argument method.
+    """
+    cache_attr = f'_cache_{func.__name__}'
+
+    def wrapper(self):
+        if not hasattr(self, cache_attr):
+            setattr(self, cache_attr, func(self))
+        return getattr(self, cache_attr)
+
+    return wrapper


 class Response:
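The new memoize_method_noargs helper caches a no-argument method's result on the instance itself, so the encoding-detection methods added below run their detection work only once per Response. A self-contained sketch of the pattern (the Page class is hypothetical, used only to show the caching behaviour):

    def memoize_method_noargs(func):
        cache_attr = f'_cache_{func.__name__}'

        def wrapper(self):
            # Compute once, then reuse the value stored on the instance
            if not hasattr(self, cache_attr):
                setattr(self, cache_attr, func(self))
            return getattr(self, cache_attr)

        return wrapper


    class Page:  # hypothetical stand-in for Response
        calls = 0

        @memoize_method_noargs
        def expensive(self):
            Page.calls += 1
            return 42


    p = Page()
    assert p.expensive() == p.expensive() == 42
    assert Page.calls == 1  # the wrapped method ran only once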
@@ -31,6 +67,9 @@ class Response:
     - Multi-type data extraction
     """

+    # Default encoding
+    _DEFAULT_ENCODING = "ascii"
+
     def __init__(
         self,
         url: str,
@@ -63,40 +102,146 @@ class Response:
         self._is_client_error = 400 <= status_code < 500
         self._is_server_error = status_code >= 500

-    def _determine_encoding(self) -> Optional[str]:
-        """Intelligently detect the response encoding."""
-        # 1. Prefer the encoding set on the request
-        if self.request and self.request.encoding:
-            return self.request.encoding
-
-        # 2. Detect from the Content-Type header
-        content_type = self.headers.get("content-type", "") or self.headers.get("Content-Type", "")
-        if content_type:
-            charset_match = re.search(r"charset=([\w-]+)", content_type, re.I)
-            if charset_match:
-                return charset_match.group(1).lower()
-
-        # 3. Detect from HTML meta tags (HTML content only)
-        if b'<html' in self.body[:1024].lower():
-            # Look for <meta charset="xxx"> or <meta http-equiv="Content-Type" content="...charset=xxx">
-            html_start = self.body[:4096]  # only inspect the first 4KB
-            try:
-                html_text = html_start.decode('ascii', errors='ignore')
-                # <meta charset="utf-8">
-                charset_match = re.search(r'<meta[^>]+charset=["\']?([\w-]+)', html_text, re.I)
-                if charset_match:
-                    return charset_match.group(1).lower()
-
-                # <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
-                content_match = re.search(r'<meta[^>]+content=["\'][^"\'>]*charset=([\w-]+)', html_text, re.I)
-                if content_match:
-                    return content_match.group(1).lower()
-            except Exception:
-                pass
-
-        # 4. Default to utf-8
+    # ==================== Encoding detection methods ====================
+
+    def _determine_encoding(self) -> str:
+        """
+        Intelligently detect the response encoding (modelled on Scrapy's design).
+
+        Detection priority:
+        1. Encoding specified on the Request
+        2. BOM byte-order mark
+        3. HTTP Content-Type header
+        4. HTML meta tag declaration
+        5. Content auto-detection
+        6. Default encoding (utf-8)
+        """
+        # 1. Prefer a declared encoding
+        declared_encoding = self._declared_encoding()
+        if declared_encoding:
+            return declared_encoding
+
+        # 2. Use w3lib for encoding detection (if available)
+        if W3LIB_AVAILABLE:
+            return self._body_inferred_encoding()
+        else:
+            # If w3lib is unavailable, use the original detection logic
+            # Detect from the Content-Type header
+            content_type = self.headers.get("content-type", "") or self.headers.get("Content-Type", "")
+            if content_type:
+                charset_match = re.search(r"charset=([\w-]+)", content_type, re.I)
+                if charset_match:
+                    return charset_match.group(1).lower()
+
+            # Detect from HTML meta tags (HTML content only)
+            if b'<html' in self.body[:1024].lower():
+                # Look for <meta charset="xxx"> or <meta http-equiv="Content-Type" content="...charset=xxx">
+                html_start = self.body[:4096]  # only inspect the first 4KB
+                try:
+                    html_text = html_start.decode('ascii', errors='ignore')
+                    # <meta charset="utf-8">
+                    charset_match = re.search(r'<meta[^>]+charset=["\']?([\w-]+)', html_text, re.I)
+                    if charset_match:
+                        return charset_match.group(1).lower()
+
+                    # <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+                    content_match = re.search(r'<meta[^>]+content=["\'][^"\'>]*charset=([\w-]+)', html_text, re.I)
+                    if content_match:
+                        return content_match.group(1).lower()
+                except Exception:
+                    pass
+
+        # 3. Default to utf-8
         return 'utf-8'

+    def _declared_encoding(self) -> Optional[str]:
+        """
+        Get the declared encoding.
+        Priority: Request encoding > BOM > HTTP header > HTML meta tag
+        """
+        # 1. Encoding specified on the Request
+        if self.request and getattr(self.request, 'encoding', None):
+            return self.request.encoding
+
+        # 2. BOM byte-order mark
+        bom_encoding = self._bom_encoding()
+        if bom_encoding:
+            return bom_encoding
+
+        # 3. HTTP Content-Type header
+        headers_encoding = self._headers_encoding()
+        if headers_encoding:
+            return headers_encoding
+
+        # 4. Encoding declared in HTML meta tags
+        body_declared_encoding = self._body_declared_encoding()
+        if body_declared_encoding:
+            return body_declared_encoding
+
+        return None
+
+    @memoize_method_noargs
+    def _bom_encoding(self) -> Optional[str]:
+        """Detect the encoding from a BOM byte-order mark."""
+        if not W3LIB_AVAILABLE:
+            return None
+        return read_bom(self.body)[0]
+
+    @memoize_method_noargs
+    def _headers_encoding(self) -> Optional[str]:
+        """Detect the encoding from the HTTP Content-Type header."""
+        if not W3LIB_AVAILABLE:
+            return None
+        content_type = self.headers.get(b"Content-Type", b"") or self.headers.get("content-type", b"")
+        if isinstance(content_type, bytes):
+            content_type = content_type.decode('latin-1')
+        return http_content_type_encoding(content_type)
+
+    @memoize_method_noargs
+    def _body_declared_encoding(self) -> Optional[str]:
+        """Detect the encoding declared in HTML meta tags."""
+        if not W3LIB_AVAILABLE:
+            return None
+        return html_body_declared_encoding(self.body)
+
+    @memoize_method_noargs
+    def _body_inferred_encoding(self) -> str:
+        """Infer the encoding from the body content."""
+        if not W3LIB_AVAILABLE:
+            # Fall back to simple auto-detection
+            for enc in (self._DEFAULT_ENCODING, "utf-8", "cp1252"):
+                try:
+                    self.body.decode(enc)
+                except UnicodeError:
+                    continue
+                return enc
+            return self._DEFAULT_ENCODING
+
+        content_type = self.headers.get(b"Content-Type", b"") or self.headers.get("content-type", b"")
+        if isinstance(content_type, bytes):
+            content_type = content_type.decode('latin-1')
+
+        # Use w3lib's html_to_unicode function for encoding detection
+        benc, _ = html_to_unicode(
+            content_type,
+            self.body,
+            auto_detect_fun=self._auto_detect_fun,
+            default_encoding=self._DEFAULT_ENCODING,
+        )
+        return benc
+
+    def _auto_detect_fun(self, text: bytes) -> Optional[str]:
+        """Callback used for automatic encoding detection."""
+        if not W3LIB_AVAILABLE:
+            return None
+        for enc in (self._DEFAULT_ENCODING, "utf-8", "cp1252"):
+            try:
+                text.decode(enc)
+            except UnicodeError:
+                continue
+            return resolve_encoding(enc)
+        return None
+
     @property
     def text(self) -> str:
         """Decode the response body to a string with the correct encoding, caching the result."""
@@ -107,6 +252,24 @@ class Response:
             self._text_cache = ""
             return self._text_cache

+        # If available, use w3lib for more accurate decoding
+        if W3LIB_AVAILABLE:
+            try:
+                content_type = self.headers.get(b"Content-Type", b"") or self.headers.get("content-type", b"")
+                if isinstance(content_type, bytes):
+                    content_type = content_type.decode('latin-1')
+
+                # Use w3lib's html_to_unicode function
+                _, self._text_cache = html_to_unicode(
+                    content_type,
+                    self.body,
+                    default_encoding=self.encoding
+                )
+                return self._text_cache
+            except Exception:
+                # If w3lib decoding fails, fall back to the original approach
+                pass
+
         # Try multiple encodings
         encodings_to_try = [self.encoding]
         if self.encoding != 'utf-8':
@@ -133,6 +296,8 @@ class Response:
         except Exception as e:
             raise DecodeError(f"Failed to decode response from {self.url}: {e}")

+    # ==================== Status-check properties ====================
+
     @property
     def is_success(self) -> bool:
         """Check whether the response succeeded (2xx)."""
@@ -153,6 +318,8 @@ class Response:
         """Check whether the response is a server error (5xx)."""
         return self._is_server_error

+    # ==================== Response-header properties ====================
+
     @property
     def content_type(self) -> str:
         """Get the response Content-Type."""
@@ -164,6 +331,8 @@ class Response:

         length = self.headers.get('content-length') or self.headers.get('Content-Length')
         return int(length) if length else None
+    # ==================== JSON handling methods ====================
+
     def json(self, default: Any = None) -> Any:
         """Parse the response text as a JSON object."""
         if self._json_cache is not None:
@@ -177,10 +346,109 @@ class Response:
                 return default
             raise DecodeError(f"Failed to parse JSON from {self.url}: {e}")

+    # ==================== URL handling methods ====================
+
     def urljoin(self, url: str) -> str:
         """Join a URL against the response URL, handling relative paths automatically."""
         return _urljoin(self.url, url)

+    def urlparse(self, url: str = None) -> Tuple:
+        """
+        Parse a URL into a component tuple (scheme, netloc, path, params, query, fragment).
+
+        Args:
+            url (str, optional): URL to parse; defaults to the response URL.
+
+        Returns:
+            tuple: URL component tuple
+        """
+        target_url = url if url is not None else self.url
+        return _urlparse(target_url)
+
+    def urlsplit(self, url: str = None) -> Tuple:
+        """
+        Split a URL into a component tuple (scheme, netloc, path, query, fragment).
+
+        Args:
+            url (str, optional): URL to split; defaults to the response URL.
+
+        Returns:
+            tuple: URL component tuple (without params)
+        """
+        target_url = url if url is not None else self.url
+        return _urlsplit(target_url)
+
+    def parse_qs(self, query_string: str = None, keep_blank_values: bool = False) -> Dict[str, List[str]]:
+        """
+        Parse a query string into a dict.
+
+        Args:
+            query_string (str, optional): query string; extracted from the response URL by default.
+            keep_blank_values (bool): whether to keep blank values.
+
+        Returns:
+            dict: query parameter dict
+        """
+        if query_string is None:
+            # Extract the query string from the URL
+            parsed = _urlparse(self.url)
+            query_string = parsed.query
+
+        return _parse_qs(query_string, keep_blank_values=keep_blank_values)
+
+    def urlencode(self, query: Dict[str, Any]) -> str:
+        """
+        Encode a dict as a query string.
+
+        Args:
+            query (dict): query parameters to encode.
+
+        Returns:
+            str: the encoded query string
+        """
+        return _urlencode(query)
+
+    def quote(self, string: str, safe: str = '/') -> str:
+        """
+        URL-encode a string.
+
+        Args:
+            string (str): string to encode.
+            safe (str): characters to leave unescaped, '/' by default.
+
+        Returns:
+            str: the URL-encoded string
+        """
+        return _quote(string, safe=safe)
+
+    def unquote(self, string: str) -> str:
+        """
+        URL-decode a string.
+
+        Args:
+            string (str): string to decode.
+
+        Returns:
+            str: the URL-decoded string
+        """
+        return _unquote(string)
+
+    def urldefrag(self, url: str = None) -> Tuple[str, str]:
+        """
+        Strip the fragment identifier from a URL.
+
+        Args:
+            url (str, optional): URL to process; defaults to the response URL.
+
+        Returns:
+            tuple: (URL without fragment, fragment)
+        """
+        target_url = url if url is not None else self.url
+        defrag_result = _urldefrag(target_url)
+        return (defrag_result.url, defrag_result.fragment)
+
+    # ==================== Selector methods ====================
+
     @property
     def _selector(self) -> Selector:
         """Lazily-built Selector instance."""
@@ -198,9 +466,9 @@ class Response:

     def _is_xpath(self, query: str) -> bool:
         """Determine whether the query string is XPath."""
-        return query.startswith(('/', '//', './'))
+        return is_xpath(query)

-    def _extract_text_from_elements(self, elements: SelectorList, join_str: str = " ") -> str:
+    def _extract_text(self, elements: SelectorList, join_str: str = " ") -> str:
         """
         Extract text from a list of elements and join it

@@ -208,19 +476,7 @@
         :param join_str: separator used when joining the text
         :return: the joined text
         """
-        texts = []
-        for element in elements:
-            # Get all text nodes of the element
-            if hasattr(element, 'xpath'):
-                element_texts = element.xpath('.//text()').getall()
-            else:
-                element_texts = [str(element)]
-            # Clean and keep non-empty text
-            for text in element_texts:
-                cleaned = text.strip()
-                if cleaned:
-                    texts.append(cleaned)
-        return join_str.join(texts)
+        return extract_text(elements, join_str)

     def extract_text(self, xpath_or_css: str, join_str: str = " ", default: str = '') -> str:
         """
@@ -233,12 +489,21 @@

         Returns:
             The joined plain-text string
+
+        Examples:
+            # Using a CSS selector
+            title = response.extract_text('title')
+            content = response.extract_text('.content p', join_str=' ')
+
+            # Using an XPath selector
+            title = response.extract_text('//title')
+            content = response.extract_text('//div[@class="content"]//p', join_str=' ')
         """
         try:
             elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
             if not elements:
                 return default
-            return self._extract_text_from_elements(elements, join_str)
+            return self._extract_text(elements, join_str)
         except Exception:
             return default

@@ -253,6 +518,15 @@

         Returns:
             A list of plain-text strings (one per matched node)
+
+        Examples:
+            # Using a CSS selector
+            paragraphs = response.extract_texts('.content p')
+            titles = response.extract_texts('h1, h2, h3')
+
+            # Using an XPath selector
+            paragraphs = response.extract_texts('//div[@class="content"]//p')
+            titles = response.extract_texts('//h1 | //h2 | //h3')
         """
         if default is None:
             default = []
@@ -262,19 +536,7 @@
             if not elements:
                 return default

-            result = []
-            for element in elements:
-                # Extract the text of each element
-                if hasattr(element, 'xpath'):
-                    texts = element.xpath('.//text()').getall()
-                else:
-                    texts = [str(element)]
-
-                # Clean the text and join it
-                clean_texts = [text.strip() for text in texts if text.strip()]
-                if clean_texts:
-                    result.append(join_str.join(clean_texts))
-
+            result = extract_texts(elements, join_str)
             return result if result else default
         except Exception:
             return default
@@ -290,12 +552,21 @@

         Returns:
             The attribute value, or the default
+
+        Examples:
+            # Using a CSS selector
+            link_href = response.extract_attr('a', 'href')
+            img_src = response.extract_attr('.image', 'src')
+
+            # Using an XPath selector
+            link_href = response.extract_attr('//a[@class="link"]', 'href')
+            img_src = response.extract_attr('//img[@alt="example"]', 'src')
         """
         try:
             elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
             if not elements:
                 return default
-            return elements.attrib.get(attr_name, default)
+            return extract_attr(elements, attr_name, default)
         except Exception:
             return default

@@ -310,6 +581,15 @@

         Returns:
             A list of attribute values
+
+        Examples:
+            # Using a CSS selector
+            links = response.extract_attrs('a', 'href')
+            images = response.extract_attrs('img', 'src')
+
+            # Using an XPath selector
+            links = response.extract_attrs('//a[@class="link"]', 'href')
+            images = response.extract_attrs('//img[@alt]', 'src')
         """
         if default is None:
             default = []
@@ -319,17 +599,13 @@
             if not elements:
                 return default

-            result = []
-            for element in elements:
-                if hasattr(element, 'attrib'):
-                    attr_value = element.attrib.get(attr_name)
-                    if attr_value is not None:
-                        result.append(attr_value)
-
+            result = extract_attrs(elements, attr_name)
             return result if result else default
         except Exception:
             return default

+    # ==================== Regular-expression methods ====================
+
     def re_search(self, pattern: str, flags: int = re.DOTALL) -> Optional[re.Match]:
         """Run a regular-expression search over the response text."""
         if not isinstance(pattern, str):
@@ -342,6 +618,8 @@
             raise TypeError("Pattern must be a string")
         return re.findall(pattern, self.text, flags=flags)

+    # ==================== Cookie handling methods ====================
+
     def get_cookies(self) -> Dict[str, str]:
         """Parse and return cookies from the response headers."""
         cookie_header = self.headers.get("Set-Cookie", "")
@@ -351,6 +629,33 @@
         cookies.load(cookie_header)
         return {key: morsel.value for key, morsel in cookies.items()}

+    # ==================== Request-related methods ====================
+
+    def follow(self, url: str, callback=None, **kwargs):
+        """
+        Create a request that follows a link.
+
+        Args:
+            url: the URL to follow (may be relative)
+            callback: callback function
+            **kwargs: extra arguments passed through to Request
+
+        Returns:
+            Request: the new request object
+        """
+        # Import Request lazily to avoid a circular import
+        from ..network.request import Request
+
+        # Resolve relative URLs with urljoin
+        absolute_url = self.urljoin(url)
+
+        # Create the new request
+        return Request(
+            url=absolute_url,
+            callback=callback or getattr(self.request, 'callback', None),
+            **kwargs
+        )
+
     @property
     def meta(self) -> Dict:
         """Get the meta dict of the associated Request object."""