crawlo 1.1.1-py3-none-any.whl → 1.1.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.

Files changed (68)
  1. crawlo/__init__.py +2 -1
  2. crawlo/__version__.py +1 -1
  3. crawlo/commands/genspider.py +68 -42
  4. crawlo/commands/list.py +102 -93
  5. crawlo/commands/startproject.py +89 -4
  6. crawlo/commands/utils.py +187 -0
  7. crawlo/config.py +280 -0
  8. crawlo/core/engine.py +16 -3
  9. crawlo/core/enhanced_engine.py +190 -0
  10. crawlo/core/scheduler.py +113 -8
  11. crawlo/crawler.py +840 -307
  12. crawlo/downloader/__init__.py +181 -17
  13. crawlo/downloader/aiohttp_downloader.py +15 -2
  14. crawlo/downloader/cffi_downloader.py +11 -1
  15. crawlo/downloader/httpx_downloader.py +14 -3
  16. crawlo/filters/__init__.py +122 -5
  17. crawlo/filters/aioredis_filter.py +128 -36
  18. crawlo/filters/memory_filter.py +99 -32
  19. crawlo/middleware/proxy.py +11 -8
  20. crawlo/middleware/retry.py +40 -5
  21. crawlo/mode_manager.py +201 -0
  22. crawlo/network/__init__.py +17 -3
  23. crawlo/network/request.py +118 -10
  24. crawlo/network/response.py +131 -28
  25. crawlo/pipelines/__init__.py +1 -1
  26. crawlo/pipelines/csv_pipeline.py +317 -0
  27. crawlo/pipelines/json_pipeline.py +219 -0
  28. crawlo/queue/__init__.py +0 -0
  29. crawlo/queue/pqueue.py +37 -0
  30. crawlo/queue/queue_manager.py +304 -0
  31. crawlo/queue/redis_priority_queue.py +192 -0
  32. crawlo/settings/default_settings.py +68 -9
  33. crawlo/spider/__init__.py +576 -66
  34. crawlo/task_manager.py +4 -1
  35. crawlo/templates/project/middlewares.py.tmpl +56 -45
  36. crawlo/templates/project/pipelines.py.tmpl +308 -36
  37. crawlo/templates/project/run.py.tmpl +239 -0
  38. crawlo/templates/project/settings.py.tmpl +211 -17
  39. crawlo/templates/spider/spider.py.tmpl +153 -7
  40. crawlo/utils/controlled_spider_mixin.py +336 -0
  41. crawlo/utils/large_scale_config.py +287 -0
  42. crawlo/utils/large_scale_helper.py +344 -0
  43. crawlo/utils/queue_helper.py +176 -0
  44. crawlo/utils/request_serializer.py +220 -0
  45. crawlo-1.1.2.dist-info/METADATA +567 -0
  46. {crawlo-1.1.1.dist-info → crawlo-1.1.2.dist-info}/RECORD +54 -46
  47. tests/test_final_validation.py +154 -0
  48. tests/test_redis_config.py +29 -0
  49. tests/test_redis_queue.py +225 -0
  50. tests/test_request_serialization.py +71 -0
  51. tests/test_scheduler.py +242 -0
  52. crawlo/pipelines/mysql_batch_pipline.py +0 -273
  53. crawlo/utils/pqueue.py +0 -174
  54. crawlo-1.1.1.dist-info/METADATA +0 -220
  55. examples/baidu_spider/__init__.py +0 -7
  56. examples/baidu_spider/demo.py +0 -94
  57. examples/baidu_spider/items.py +0 -46
  58. examples/baidu_spider/middleware.py +0 -49
  59. examples/baidu_spider/pipeline.py +0 -55
  60. examples/baidu_spider/run.py +0 -27
  61. examples/baidu_spider/settings.py +0 -121
  62. examples/baidu_spider/spiders/__init__.py +0 -7
  63. examples/baidu_spider/spiders/bai_du.py +0 -61
  64. examples/baidu_spider/spiders/miit.py +0 -159
  65. examples/baidu_spider/spiders/sina.py +0 -79
  66. {crawlo-1.1.1.dist-info → crawlo-1.1.2.dist-info}/WHEEL +0 -0
  67. {crawlo-1.1.1.dist-info → crawlo-1.1.2.dist-info}/entry_points.txt +0 -0
  68. {crawlo-1.1.1.dist-info → crawlo-1.1.2.dist-info}/top_level.txt +0 -0
crawlo/mode_manager.py ADDED
@@ -0,0 +1,201 @@
+ #!/usr/bin/python
+ # -*- coding: UTF-8 -*-
+ """
+ Run Mode Manager
+ ================
+ Manages the different run modes of the Crawlo framework and provides a clean way to configure them.
+
+ Supported run modes:
+ 1. standalone  - single-node mode (default)
+ 2. distributed - distributed mode
+ 3. auto        - auto-detection mode
+ """
+
+ from typing import Dict, Any, Optional
+ from enum import Enum
+ import os
+ from crawlo.utils.log import get_logger
+
+
+ class RunMode(Enum):
+     """Run mode enumeration"""
+     STANDALONE = "standalone"    # single-node mode
+     DISTRIBUTED = "distributed"  # distributed mode
+     AUTO = "auto"                # auto-detection mode
+
+
+ class ModeManager:
+     """Run mode manager"""
+
+     def __init__(self):
+         self.logger = get_logger(self.__class__.__name__)
+
+     @staticmethod
+     def get_standalone_settings() -> Dict[str, Any]:
+         """Return the standalone-mode settings"""
+         return {
+             'QUEUE_TYPE': 'memory',
+             'FILTER_CLASS': 'crawlo.filters.memory_filter.MemoryFilter',
+             'CONCURRENCY': 8,
+             'MAX_RUNNING_SPIDERS': 1,
+             'DOWNLOAD_DELAY': 1.0,
+             'LOG_LEVEL': 'INFO',
+         }
+
+     @staticmethod
+     def get_distributed_settings(
+         redis_host: str = '127.0.0.1',
+         redis_port: int = 6379,
+         redis_password: Optional[str] = None,
+         project_name: str = 'crawlo'
+     ) -> Dict[str, Any]:
+         """Return the distributed-mode settings"""
+         # Build the Redis URL
+         if redis_password:
+             redis_url = f'redis://:{redis_password}@{redis_host}:{redis_port}/0'
+         else:
+             redis_url = f'redis://{redis_host}:{redis_port}/0'
+
+         return {
+             'QUEUE_TYPE': 'redis',
+             'FILTER_CLASS': 'crawlo.filters.aioredis_filter.AioRedisFilter',
+             'REDIS_HOST': redis_host,
+             'REDIS_PORT': redis_port,
+             'REDIS_PASSWORD': redis_password,
+             'REDIS_URL': redis_url,
+             'SCHEDULER_QUEUE_NAME': f'{project_name}:requests',
+             'REDIS_KEY': f'{project_name}:fingerprint',
+             'CONCURRENCY': 16,
+             'MAX_RUNNING_SPIDERS': 1,
+             'DOWNLOAD_DELAY': 1.0,
+             'LOG_LEVEL': 'INFO',
+         }
+
+     @staticmethod
+     def get_auto_settings() -> Dict[str, Any]:
+         """Return the auto-detection-mode settings"""
+         return {
+             'QUEUE_TYPE': 'auto',
+             'FILTER_CLASS': 'crawlo.filters.memory_filter.MemoryFilter',  # default in-memory filter
+             'CONCURRENCY': 12,
+             'MAX_RUNNING_SPIDERS': 1,
+             'DOWNLOAD_DELAY': 1.0,
+             'LOG_LEVEL': 'INFO',
+         }
+
+     def resolve_mode_settings(
+         self,
+         mode: str = 'standalone',
+         **kwargs
+     ) -> Dict[str, Any]:
+         """
+         Resolve the run mode and return the corresponding settings.
+
+         Args:
+             mode: run mode ('standalone', 'distributed', 'auto')
+             **kwargs: extra configuration options
+
+         Returns:
+             Dict[str, Any]: settings dictionary
+         """
+         mode = RunMode(mode.lower())
+
+         if mode == RunMode.STANDALONE:
+             self.logger.info("🏠 Standalone mode - simple and fast, suited to development and small- to medium-scale crawls")
+             settings = self.get_standalone_settings()
+
+         elif mode == RunMode.DISTRIBUTED:
+             self.logger.info("🌐 Distributed mode - scales across multiple nodes, suited to large-scale crawls")
+             settings = self.get_distributed_settings(
+                 redis_host=kwargs.get('redis_host', '127.0.0.1'),
+                 redis_port=kwargs.get('redis_port', 6379),
+                 redis_password=kwargs.get('redis_password'),
+                 project_name=kwargs.get('project_name', 'crawlo')
+             )
+
+         elif mode == RunMode.AUTO:
+             self.logger.info("🤖 Auto-detection mode - picks the best run strategy automatically")
+             settings = self.get_auto_settings()
+
+         else:
+             raise ValueError(f"Unsupported run mode: {mode}")
+
+         # Merge user-supplied settings
+         user_settings = {k: v for k, v in kwargs.items()
+                          if k not in ['redis_host', 'redis_port', 'redis_password', 'project_name']}
+         settings.update(user_settings)
+
+         return settings
+
+     def from_environment(self) -> Dict[str, Any]:
+         """Build settings from environment variables"""
+         config = {}
+
+         # Scan environment variables prefixed with CRAWLO_
+         for key, value in os.environ.items():
+             if key.startswith('CRAWLO_'):
+                 config_key = key[7:]  # strip the 'CRAWLO_' prefix
+                 # Simple type conversion
+                 if value.lower() in ('true', 'false'):
+                     config[config_key] = value.lower() == 'true'
+                 elif value.isdigit():
+                     config[config_key] = int(value)
+                 else:
+                     try:
+                         config[config_key] = float(value)
+                     except ValueError:
+                         config[config_key] = value
+
+         return config
+
+
+ # Convenience functions
+ def standalone_mode(**kwargs) -> Dict[str, Any]:
+     """Quickly build a standalone-mode configuration"""
+     return ModeManager().resolve_mode_settings('standalone', **kwargs)
+
+
+ def distributed_mode(
+     redis_host: str = '127.0.0.1',
+     redis_port: int = 6379,
+     redis_password: Optional[str] = None,
+     project_name: str = 'crawlo',
+     **kwargs
+ ) -> Dict[str, Any]:
+     """Quickly build a distributed-mode configuration"""
+     return ModeManager().resolve_mode_settings(
+         'distributed',
+         redis_host=redis_host,
+         redis_port=redis_port,
+         redis_password=redis_password,
+         project_name=project_name,
+         **kwargs
+     )
+
+
+ def auto_mode(**kwargs) -> Dict[str, Any]:
+     """Quickly build an auto-detection-mode configuration"""
+     return ModeManager().resolve_mode_settings('auto', **kwargs)
+
+
+ # Environment-variable support
+ def from_env(default_mode: str = 'standalone') -> Dict[str, Any]:
+     """Build a configuration from environment variables"""
+     mode = os.getenv('CRAWLO_MODE', default_mode).lower()
+
+     if mode == 'distributed':
+         return distributed_mode(
+             redis_host=os.getenv('REDIS_HOST', '127.0.0.1'),
+             redis_port=int(os.getenv('REDIS_PORT', 6379)),
+             redis_password=os.getenv('REDIS_PASSWORD'),
+             project_name=os.getenv('PROJECT_NAME', 'crawlo'),
+             CONCURRENCY=int(os.getenv('CONCURRENCY', 16)),
+         )
+     elif mode == 'auto':
+         return auto_mode(
+             CONCURRENCY=int(os.getenv('CONCURRENCY', 12)),
+         )
+     else:  # standalone
+         return standalone_mode(
+             CONCURRENCY=int(os.getenv('CONCURRENCY', 8)),
+         )
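
For orientation, here is a minimal usage sketch of the helpers added in this file. It only builds settings dictionaries with the functions shown above; how the resulting dictionary is fed into a crawler is outside this diff, and the Redis host/credentials are hypothetical.

    # Minimal sketch based only on crawlo/mode_manager.py as added in 1.1.2
    from crawlo.mode_manager import standalone_mode, distributed_mode, from_env

    local_settings = standalone_mode(CONCURRENCY=4)   # memory queue; extra kwargs are merged in
    cluster_settings = distributed_mode(
        redis_host='10.0.0.5',                        # hypothetical Redis node
        redis_password='secret',
        project_name='news',
    )
    env_settings = from_env()                         # honours CRAWLO_MODE / REDIS_* / CONCURRENCY

    assert cluster_settings['QUEUE_TYPE'] == 'redis'
    assert cluster_settings['SCHEDULER_QUEUE_NAME'] == 'news:requests'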
crawlo/network/__init__.py CHANGED
@@ -1,7 +1,21 @@
  #!/usr/bin/python
  # -*- coding:UTF-8 -*-
  """
- # @Time : 2025-02-05 14:07
- # @Author : oscar
- # @Desc : None
+ Crawlo Network Module
+ =====================
+ Wraps HTTP request and response objects.
+
+ Main components:
+ - Request: HTTP request wrapper
+ - Response: HTTP response wrapper
+ - RequestPriority: request priority constants
  """
+
+ from .request import Request, RequestPriority
+ from .response import Response
+
+ __all__ = [
+     'Request',
+     'RequestPriority',
+     'Response',
+ ]
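
With the new package-level exports, the three classes can be imported directly from crawlo.network; a trivial sketch of the added `__all__` (the priority values come from request.py below):

    from crawlo.network import Request, RequestPriority, Response  # re-exported in 1.1.2

    RequestPriority.get_all_priorities()
    # {'URGENT': -200, 'HIGH': -100, 'NORMAL': 0, 'LOW': 100, 'BACKGROUND': 200}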
crawlo/network/request.py CHANGED
@@ -1,5 +1,14 @@
  #!/usr/bin/python
  # -*- coding: UTF-8 -*-
+ """
+ HTTP Request wrapper module
+ ===========================
+ A full-featured HTTP request wrapper that supports:
+ - automatic JSON / form-data handling
+ - a priority-based ordering mechanism
+ - safe deep-copy operations
+ - flexible request configuration
+ """
  import json
  from copy import deepcopy
  from urllib.parse import urlencode
@@ -7,16 +16,38 @@ from w3lib.url import safe_url_string
  from typing import Dict, Optional, Callable, Union, Any, TypeVar, List

  from crawlo.utils.url import escape_ajax
+ from crawlo.utils.log import get_logger


  _Request = TypeVar("_Request", bound="Request")


  class RequestPriority:
-     """Request priority constants"""
-     HIGH = -100
-     NORMAL = 0
-     LOW = 100
+     """Request priority constants and helpers"""
+     URGENT = -200      # urgent tasks
+     HIGH = -100        # high priority
+     NORMAL = 0         # normal priority (default)
+     LOW = 100          # low priority
+     BACKGROUND = 200   # background tasks
+
+     @classmethod
+     def get_all_priorities(cls) -> Dict[str, int]:
+         """Return every priority constant"""
+         return {
+             'URGENT': cls.URGENT,
+             'HIGH': cls.HIGH,
+             'NORMAL': cls.NORMAL,
+             'LOW': cls.LOW,
+             'BACKGROUND': cls.BACKGROUND
+         }
+
+     @classmethod
+     def from_string(cls, priority_str: str) -> int:
+         """Resolve a priority value from its name"""
+         priorities = cls.get_all_priorities()
+         if priority_str.upper() not in priorities:
+             raise ValueError(f"Unsupported priority: {priority_str}, supported: {list(priorities.keys())}")
+         return priorities[priority_str.upper()]


  class Request:
@@ -99,7 +130,10 @@ class Request:
          self.headers = headers or {}
          self.cookies = cookies or {}
          self.priority = -priority  # used for ordering: lower value means higher priority
-         self._meta = deepcopy(meta) if meta is not None else {}
+
+         # 🔧 Handle meta safely: strip logger objects before deepcopy
+         self._meta = self._safe_deepcopy_meta(meta) if meta is not None else {}
+
          self.timeout = self._meta.get('download_timeout', timeout)
          self.proxy = proxy
          self.allow_redirects = allow_redirects
@@ -142,6 +176,34 @@
          self.dont_filter = dont_filter
          self._set_url(url)

+     def _safe_deepcopy_meta(self, meta: Dict[str, Any]) -> Dict[str, Any]:
+         """Deep-copy meta safely by removing logger objects first"""
+         import logging
+
+         def clean_logger_recursive(obj):
+             """Recursively strip logger objects"""
+             if isinstance(obj, logging.Logger):
+                 return None
+             elif isinstance(obj, dict):
+                 cleaned = {}
+                 for k, v in obj.items():
+                     if not (k == 'logger' or isinstance(v, logging.Logger)):
+                         cleaned[k] = clean_logger_recursive(v)
+                 return cleaned
+             elif isinstance(obj, (list, tuple)):
+                 cleaned_list = []
+                 for item in obj:
+                     cleaned_item = clean_logger_recursive(item)
+                     if cleaned_item is not None:
+                         cleaned_list.append(cleaned_item)
+                 return type(obj)(cleaned_list)
+             else:
+                 return obj
+
+         # Strip loggers first, then deepcopy
+         cleaned_meta = clean_logger_recursive(meta)
+         return deepcopy(cleaned_meta)
+
      def copy(self: _Request) -> _Request:
          """
          Create a copy of this request, preserving all high-level semantics (json_body/form_data).
@@ -169,22 +231,68 @@
              encoding=self.encoding
          )

-     def set_meta(self, key: str, value: Any) -> None:
-         """Set a key in meta."""
+     def set_meta(self, key: str, value: Any) -> 'Request':
+         """Set a key in meta; supports call chaining."""
          self._meta[key] = value
+         return self
+
+     def add_header(self, key: str, value: str) -> 'Request':
+         """Add a request header; supports call chaining."""
+         self.headers[key] = value
+         return self
+
+     def add_headers(self, headers: Dict[str, str]) -> 'Request':
+         """Add several request headers at once; supports call chaining."""
+         self.headers.update(headers)
+         return self
+
+     def set_proxy(self, proxy: str) -> 'Request':
+         """Set the proxy; supports call chaining."""
+         self.proxy = proxy
+         return self
+
+     def set_timeout(self, timeout: float) -> 'Request':
+         """Set the timeout; supports call chaining."""
+         self.timeout = timeout
+         return self
+
+     def add_flag(self, flag: str) -> 'Request':
+         """Add a flag; supports call chaining."""
+         if flag not in self.flags:
+             self.flags.append(flag)
+         return self
+
+     def remove_flag(self, flag: str) -> 'Request':
+         """Remove a flag; supports call chaining."""
+         if flag in self.flags:
+             self.flags.remove(flag)
+         return self

      def _set_url(self, url: str) -> None:
          """Set the URL safely, ensuring it is well formed."""
          if not isinstance(url, str):
              raise TypeError(f"Request url must be a string, got: {type(url).__name__}")
+
+         if not url.strip():
+             raise ValueError("URL must not be empty")
+
+         # Reject dangerous URL schemes
+         dangerous_schemes = ['file://', 'ftp://', 'javascript:', 'data:']
+         if any(url.lower().startswith(scheme) for scheme in dangerous_schemes):
+             raise ValueError(f"Unsafe URL scheme: {url[:20]}...")

          s = safe_url_string(url, self.encoding)
          escaped_url = escape_ajax(s)
+
+         if not escaped_url.startswith(('http://', 'https://')):
+             raise ValueError(f"URL is missing an HTTP(S) scheme: {escaped_url[:50]}...")
+
+         # Check the URL length
+         if len(escaped_url) > 8192:  # the maximum URL length most servers accept
+             raise ValueError(f"URL too long (over 8192 characters): {len(escaped_url)} characters")
+
          self._url = escaped_url

-         if not self._url.startswith(('http://', 'https://')):
-             raise ValueError(f"URL is missing a scheme: {self._url}")
-
      @property
      def url(self) -> str:
          return self._url
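
The chainable setters and the stricter URL validation are easiest to see in use. A short sketch, assuming the constructor accepts `url` and `priority` keywords as the attribute assignments above suggest; the proxy address is hypothetical:

    from crawlo.network import Request, RequestPriority

    req = (
        Request(url='https://example.com/api', priority=RequestPriority.from_string('HIGH'))
        .add_header('User-Agent', 'crawlo/1.1.2')
        .set_proxy('http://127.0.0.1:8080')   # hypothetical local proxy
        .set_timeout(15)
        .add_flag('retryable')
        .set_meta('depth', 1)
    )

    # _set_url now rejects empty URLs, unsafe schemes (file://, javascript:, ...),
    # non-HTTP(S) URLs and URLs longer than 8192 characters, so this should raise ValueError:
    Request(url='javascript:alert(1)')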
crawlo/network/response.py CHANGED
@@ -1,76 +1,179 @@
  #!/usr/bin/python
  # -*- coding:UTF-8 -*-
+ """
+ HTTP Response wrapper module
+ ============================
+ A feature-rich HTTP response wrapper that supports:
+ - smart encoding detection and decoding
+ - XPath/CSS selectors
+ - JSON parsing with caching
+ - regular-expression helpers
+ - cookie handling
+ """
  import re
  import ujson
  from http.cookies import SimpleCookie
  from parsel import Selector, SelectorList
- from typing import Dict, Any, List, Optional
+ from typing import Dict, Any, List, Optional, Union
  from urllib.parse import urljoin as _urljoin

- from crawlo import Request
  from crawlo.exceptions import DecodeError


  class Response:
      """
      An HTTP response wrapper that provides convenient parsing helpers.
+
+     Features:
+     - smart encoding detection with caching
+     - lazily created Selector instance
+     - JSON parsing with caching
+     - extraction helpers for multiple data types
      """

      def __init__(
          self,
          url: str,
          *,
-         headers: Dict[str, Any],
+         headers: Dict[str, Any] = None,
          body: bytes = b"",
          method: str = 'GET',
-         request: Request = None,
+         request: 'Request' = None,  # string annotation avoids a circular import
          status_code: int = 200,
      ):
+         # Basic attributes
          self.url = url
-         self.headers = headers
+         self.headers = headers or {}
          self.body = body
-         self.method = method
+         self.method = method.upper()
          self.request = request
          self.status_code = status_code
-         self.encoding = self.request.encoding if self.request else None
+
+         # Encoding handling
+         self.encoding = self._determine_encoding()
+
+         # Cached values
          self._text_cache = None
          self._json_cache = None
-         self._selector_instance = None  # renamed to avoid clashing with the @property
+         self._selector_instance = None
+
+         # Status flags
+         self._is_success = 200 <= status_code < 300
+         self._is_redirect = 300 <= status_code < 400
+         self._is_client_error = 400 <= status_code < 500
+         self._is_server_error = status_code >= 500

+     def _determine_encoding(self) -> Optional[str]:
+         """Detect the response encoding"""
+         # 1. Prefer the encoding declared on the request
+         if self.request and self.request.encoding:
+             return self.request.encoding
+
+         # 2. Look at the Content-Type header
+         content_type = self.headers.get("content-type", "") or self.headers.get("Content-Type", "")
+         if content_type:
+             charset_match = re.search(r"charset=([\w-]+)", content_type, re.I)
+             if charset_match:
+                 return charset_match.group(1).lower()
+
+         # 3. Look at HTML meta tags (HTML content only)
+         if b'<html' in self.body[:1024].lower():
+             # Look for <meta charset="xxx"> or <meta http-equiv="Content-Type" content="...charset=xxx">
+             html_start = self.body[:4096]  # inspect only the first 4 KB
+             try:
+                 html_text = html_start.decode('ascii', errors='ignore')
+                 # <meta charset="utf-8">
+                 charset_match = re.search(r'<meta[^>]+charset=["\']?([\w-]+)', html_text, re.I)
+                 if charset_match:
+                     return charset_match.group(1).lower()
+
+                 # <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+                 content_match = re.search(r'<meta[^>]+content=["\'][^"\'>]*charset=([\w-]+)', html_text, re.I)
+                 if content_match:
+                     return content_match.group(1).lower()
+             except Exception:
+                 pass
+
+         # 4. Fall back to utf-8
+         return 'utf-8'
      @property
      def text(self) -> str:
          """Decode the body with the correct encoding and cache the result."""
          if self._text_cache is not None:
              return self._text_cache

-         encoding = self.encoding
-         try:
-             # Prefer the encoding supplied by the request
-             if encoding:
-                 self._text_cache = self.body.decode(encoding)
-                 return self._text_cache
+         if not self.body:
+             self._text_cache = ""
+             return self._text_cache

-             # Extract the encoding from Content-Type
-             content_type = self.headers.get("Content-Type", "")
-             charset_match = re.search(r"charset=([\w-]+)", content_type, re.I)
-             if charset_match:
-                 encoding = charset_match.group(1)
+         # Try several encodings in order
+         encodings_to_try = [self.encoding]
+         if self.encoding != 'utf-8':
+             encodings_to_try.append('utf-8')
+         if 'gbk' not in encodings_to_try:
+             encodings_to_try.append('gbk')
+         if 'gb2312' not in encodings_to_try:
+             encodings_to_try.append('gb2312')
+         encodings_to_try.append('latin1')  # last-resort fallback
+
+         for encoding in encodings_to_try:
+             if not encoding:
+                 continue
+             try:
                  self._text_cache = self.body.decode(encoding)
                  return self._text_cache
-
-             # Fall back to UTF-8
-             self._text_cache = self.body.decode("utf-8")
+             except (UnicodeDecodeError, LookupError):
+                 continue
+
+         # Every encoding failed; decode with replacement characters
+         try:
+             self._text_cache = self.body.decode('utf-8', errors='replace')
              return self._text_cache
-
-         except UnicodeDecodeError as e:
+         except Exception as e:
              raise DecodeError(f"Failed to decode response from {self.url}: {e}")

-     def json(self) -> Any:
+     @property
+     def is_success(self) -> bool:
+         """True for 2xx responses"""
+         return self._is_success
+
+     @property
+     def is_redirect(self) -> bool:
+         """True for 3xx responses"""
+         return self._is_redirect
+
+     @property
+     def is_client_error(self) -> bool:
+         """True for 4xx responses"""
+         return self._is_client_error
+
+     @property
+     def is_server_error(self) -> bool:
+         """True for 5xx responses"""
+         return self._is_server_error
+
+     @property
+     def content_type(self) -> str:
+         """Return the response Content-Type"""
+         return self.headers.get('content-type', '') or self.headers.get('Content-Type', '')
+
+     @property
+     def content_length(self) -> Optional[int]:
+         """Return the response Content-Length"""
+         length = self.headers.get('content-length') or self.headers.get('Content-Length')
+         return int(length) if length else None
+     def json(self, default: Any = None) -> Any:
          """Parse the response text as JSON."""
-         if self._json_cache:
+         if self._json_cache is not None:
+             return self._json_cache
+
+         try:
+             self._json_cache = ujson.loads(self.text)
              return self._json_cache
-         self._json_cache = ujson.loads(self.text)
-         return self._json_cache
+         except (ujson.JSONDecodeError, ValueError) as e:
+             if default is not None:
+                 return default
+             raise DecodeError(f"Failed to parse JSON from {self.url}: {e}")

      def urljoin(self, url: str) -> str:
          """Join a URL against the response URL, handling relative paths."""
crawlo/pipelines/__init__.py CHANGED
@@ -1,6 +1,6 @@
  #!/usr/bin/python
  # -*- coding:UTF-8 -*-
- from crawlo.items.items import Item
+ from crawlo.items import Item


  class BasePipeline: