crawlo-1.1.1-py3-none-any.whl → crawlo-1.1.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic.
- crawlo/__init__.py +2 -1
- crawlo/__version__.py +1 -1
- crawlo/commands/genspider.py +68 -42
- crawlo/commands/list.py +102 -93
- crawlo/commands/startproject.py +89 -4
- crawlo/commands/utils.py +187 -0
- crawlo/config.py +280 -0
- crawlo/core/engine.py +16 -3
- crawlo/core/enhanced_engine.py +190 -0
- crawlo/core/scheduler.py +113 -8
- crawlo/crawler.py +840 -307
- crawlo/downloader/__init__.py +181 -17
- crawlo/downloader/aiohttp_downloader.py +15 -2
- crawlo/downloader/cffi_downloader.py +11 -1
- crawlo/downloader/httpx_downloader.py +14 -3
- crawlo/filters/__init__.py +122 -5
- crawlo/filters/aioredis_filter.py +128 -36
- crawlo/filters/memory_filter.py +99 -32
- crawlo/middleware/proxy.py +11 -8
- crawlo/middleware/retry.py +40 -5
- crawlo/mode_manager.py +201 -0
- crawlo/network/__init__.py +17 -3
- crawlo/network/request.py +118 -10
- crawlo/network/response.py +131 -28
- crawlo/pipelines/__init__.py +1 -1
- crawlo/pipelines/csv_pipeline.py +317 -0
- crawlo/pipelines/json_pipeline.py +219 -0
- crawlo/queue/__init__.py +0 -0
- crawlo/queue/pqueue.py +37 -0
- crawlo/queue/queue_manager.py +304 -0
- crawlo/queue/redis_priority_queue.py +192 -0
- crawlo/settings/default_settings.py +68 -9
- crawlo/spider/__init__.py +576 -66
- crawlo/task_manager.py +4 -1
- crawlo/templates/project/middlewares.py.tmpl +56 -45
- crawlo/templates/project/pipelines.py.tmpl +308 -36
- crawlo/templates/project/run.py.tmpl +239 -0
- crawlo/templates/project/settings.py.tmpl +211 -17
- crawlo/templates/spider/spider.py.tmpl +153 -7
- crawlo/utils/controlled_spider_mixin.py +336 -0
- crawlo/utils/large_scale_config.py +287 -0
- crawlo/utils/large_scale_helper.py +344 -0
- crawlo/utils/queue_helper.py +176 -0
- crawlo/utils/request_serializer.py +220 -0
- crawlo-1.1.2.dist-info/METADATA +567 -0
- {crawlo-1.1.1.dist-info → crawlo-1.1.2.dist-info}/RECORD +54 -46
- tests/test_final_validation.py +154 -0
- tests/test_redis_config.py +29 -0
- tests/test_redis_queue.py +225 -0
- tests/test_request_serialization.py +71 -0
- tests/test_scheduler.py +242 -0
- crawlo/pipelines/mysql_batch_pipline.py +0 -273
- crawlo/utils/pqueue.py +0 -174
- crawlo-1.1.1.dist-info/METADATA +0 -220
- examples/baidu_spider/__init__.py +0 -7
- examples/baidu_spider/demo.py +0 -94
- examples/baidu_spider/items.py +0 -46
- examples/baidu_spider/middleware.py +0 -49
- examples/baidu_spider/pipeline.py +0 -55
- examples/baidu_spider/run.py +0 -27
- examples/baidu_spider/settings.py +0 -121
- examples/baidu_spider/spiders/__init__.py +0 -7
- examples/baidu_spider/spiders/bai_du.py +0 -61
- examples/baidu_spider/spiders/miit.py +0 -159
- examples/baidu_spider/spiders/sina.py +0 -79
- {crawlo-1.1.1.dist-info → crawlo-1.1.2.dist-info}/WHEEL +0 -0
- {crawlo-1.1.1.dist-info → crawlo-1.1.2.dist-info}/entry_points.txt +0 -0
- {crawlo-1.1.1.dist-info → crawlo-1.1.2.dist-info}/top_level.txt +0 -0
crawlo/mode_manager.py
ADDED
@@ -0,0 +1,201 @@
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+"""
+Run mode manager
+================
+Manages the different run modes of the Crawlo framework and provides a clean way to configure them.
+
+Supported run modes:
+1. standalone  - single-node mode (default)
+2. distributed - distributed mode
+3. auto        - auto-detection mode
+"""
+
+from typing import Dict, Any, Optional
+from enum import Enum
+import os
+from crawlo.utils.log import get_logger
+
+
+class RunMode(Enum):
+    """Run mode enumeration."""
+    STANDALONE = "standalone"    # single-node mode
+    DISTRIBUTED = "distributed"  # distributed mode
+    AUTO = "auto"                # auto-detection mode
+
+
+class ModeManager:
+    """Run mode manager."""
+
+    def __init__(self):
+        self.logger = get_logger(self.__class__.__name__)
+
+    @staticmethod
+    def get_standalone_settings() -> Dict[str, Any]:
+        """Return the settings for standalone mode."""
+        return {
+            'QUEUE_TYPE': 'memory',
+            'FILTER_CLASS': 'crawlo.filters.memory_filter.MemoryFilter',
+            'CONCURRENCY': 8,
+            'MAX_RUNNING_SPIDERS': 1,
+            'DOWNLOAD_DELAY': 1.0,
+            'LOG_LEVEL': 'INFO',
+        }
+
+    @staticmethod
+    def get_distributed_settings(
+        redis_host: str = '127.0.0.1',
+        redis_port: int = 6379,
+        redis_password: Optional[str] = None,
+        project_name: str = 'crawlo'
+    ) -> Dict[str, Any]:
+        """Return the settings for distributed mode."""
+        # Build the Redis URL
+        if redis_password:
+            redis_url = f'redis://:{redis_password}@{redis_host}:{redis_port}/0'
+        else:
+            redis_url = f'redis://{redis_host}:{redis_port}/0'
+
+        return {
+            'QUEUE_TYPE': 'redis',
+            'FILTER_CLASS': 'crawlo.filters.aioredis_filter.AioRedisFilter',
+            'REDIS_HOST': redis_host,
+            'REDIS_PORT': redis_port,
+            'REDIS_PASSWORD': redis_password,
+            'REDIS_URL': redis_url,
+            'SCHEDULER_QUEUE_NAME': f'{project_name}:requests',
+            'REDIS_KEY': f'{project_name}:fingerprint',
+            'CONCURRENCY': 16,
+            'MAX_RUNNING_SPIDERS': 1,
+            'DOWNLOAD_DELAY': 1.0,
+            'LOG_LEVEL': 'INFO',
+        }
+
+    @staticmethod
+    def get_auto_settings() -> Dict[str, Any]:
+        """Return the settings for auto-detection mode."""
+        return {
+            'QUEUE_TYPE': 'auto',
+            'FILTER_CLASS': 'crawlo.filters.memory_filter.MemoryFilter',  # default in-memory filter
+            'CONCURRENCY': 12,
+            'MAX_RUNNING_SPIDERS': 1,
+            'DOWNLOAD_DELAY': 1.0,
+            'LOG_LEVEL': 'INFO',
+        }
+
+    def resolve_mode_settings(
+        self,
+        mode: str = 'standalone',
+        **kwargs
+    ) -> Dict[str, Any]:
+        """
+        Resolve the run mode and return the corresponding settings.
+
+        Args:
+            mode: run mode ('standalone', 'distributed', 'auto')
+            **kwargs: extra settings
+
+        Returns:
+            Dict[str, Any]: settings dictionary
+        """
+        mode = RunMode(mode.lower())
+
+        if mode == RunMode.STANDALONE:
+            self.logger.info("🏠 Standalone mode - simple and fast, suited to development and small/medium crawls")
+            settings = self.get_standalone_settings()
+
+        elif mode == RunMode.DISTRIBUTED:
+            self.logger.info("🌐 Distributed mode - scales across multiple nodes, suited to large crawls")
+            settings = self.get_distributed_settings(
+                redis_host=kwargs.get('redis_host', '127.0.0.1'),
+                redis_port=kwargs.get('redis_port', 6379),
+                redis_password=kwargs.get('redis_password'),
+                project_name=kwargs.get('project_name', 'crawlo')
+            )
+
+        elif mode == RunMode.AUTO:
+            self.logger.info("🤖 Auto-detection mode - picks the best way to run automatically")
+            settings = self.get_auto_settings()
+
+        else:
+            raise ValueError(f"Unsupported run mode: {mode}")
+
+        # Merge user-supplied settings
+        user_settings = {k: v for k, v in kwargs.items()
+                         if k not in ['redis_host', 'redis_port', 'redis_password', 'project_name']}
+        settings.update(user_settings)
+
+        return settings
+
+    def from_environment(self) -> Dict[str, Any]:
+        """Build settings from environment variables."""
+        config = {}
+
+        # Scan environment variables with the CRAWLO_ prefix
+        for key, value in os.environ.items():
+            if key.startswith('CRAWLO_'):
+                config_key = key[7:]  # strip the 'CRAWLO_' prefix
+                # Simple type conversion
+                if value.lower() in ('true', 'false'):
+                    config[config_key] = value.lower() == 'true'
+                elif value.isdigit():
+                    config[config_key] = int(value)
+                else:
+                    try:
+                        config[config_key] = float(value)
+                    except ValueError:
+                        config[config_key] = value
+
+        return config
+
+
+# Convenience functions
+def standalone_mode(**kwargs) -> Dict[str, Any]:
+    """Quickly build standalone-mode settings."""
+    return ModeManager().resolve_mode_settings('standalone', **kwargs)
+
+
+def distributed_mode(
+    redis_host: str = '127.0.0.1',
+    redis_port: int = 6379,
+    redis_password: Optional[str] = None,
+    project_name: str = 'crawlo',
+    **kwargs
+) -> Dict[str, Any]:
+    """Quickly build distributed-mode settings."""
+    return ModeManager().resolve_mode_settings(
+        'distributed',
+        redis_host=redis_host,
+        redis_port=redis_port,
+        redis_password=redis_password,
+        project_name=project_name,
+        **kwargs
+    )
+
+
+def auto_mode(**kwargs) -> Dict[str, Any]:
+    """Quickly build auto-detection-mode settings."""
+    return ModeManager().resolve_mode_settings('auto', **kwargs)
+
+
+# Environment variable support
+def from_env(default_mode: str = 'standalone') -> Dict[str, Any]:
+    """Build settings from environment variables."""
+    mode = os.getenv('CRAWLO_MODE', default_mode).lower()
+
+    if mode == 'distributed':
+        return distributed_mode(
+            redis_host=os.getenv('REDIS_HOST', '127.0.0.1'),
+            redis_port=int(os.getenv('REDIS_PORT', 6379)),
+            redis_password=os.getenv('REDIS_PASSWORD'),
+            project_name=os.getenv('PROJECT_NAME', 'crawlo'),
+            CONCURRENCY=int(os.getenv('CONCURRENCY', 16)),
+        )
+    elif mode == 'auto':
+        return auto_mode(
+            CONCURRENCY=int(os.getenv('CONCURRENCY', 12)),
+        )
+    else:  # standalone
+        return standalone_mode(
+            CONCURRENCY=int(os.getenv('CONCURRENCY', 8)),
+        )
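
The new mode helpers return plain settings dictionaries, so they can be merged into whatever configuration object the crawler uses. A minimal usage sketch follows; the helper names come from the diff above, while handing the resulting dict to a crawler process is assumed rather than shown here:

```python
# Sketch only: standalone_mode / distributed_mode / from_env are the helpers
# added in mode_manager.py; how the dict is fed to a crawler is an assumption.
from crawlo.mode_manager import standalone_mode, distributed_mode, from_env

# Standalone defaults: memory queue + memory filter, CONCURRENCY=8.
settings = standalone_mode(DOWNLOAD_DELAY=0.5)

# Distributed defaults: Redis queue/filter, keys derived from the project name.
dist = distributed_mode(
    redis_host="127.0.0.1",
    redis_port=6379,
    project_name="myproject",
    CONCURRENCY=32,  # extra kwargs are merged over the mode defaults
)

# Let CRAWLO_MODE / REDIS_* / CONCURRENCY environment variables decide at deploy time.
env_settings = from_env(default_mode="standalone")

print(settings["QUEUE_TYPE"])        # 'memory'
print(dist["SCHEDULER_QUEUE_NAME"])  # 'myproject:requests'
print(env_settings["CONCURRENCY"])
```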
crawlo/network/__init__.py
CHANGED
@@ -1,7 +1,21 @@
 #!/usr/bin/python
 # -*- coding:UTF-8 -*-
 """
-
-
-
+Crawlo Network Module
+=====================
+Wrappers for HTTP request and response objects.
+
+Main components:
+- Request: HTTP request wrapper
+- Response: HTTP response wrapper
+- RequestPriority: request priority constants
 """
+
+from .request import Request, RequestPriority
+from .response import Response
+
+__all__ = [
+    'Request',
+    'RequestPriority',
+    'Response',
+]
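
Since `Request`, `Response`, and `RequestPriority` are now re-exported at the package level, the priority helpers added in request.py (next diff) can be used directly. A small sketch, using only what the diffs show:

```python
from crawlo.network import Request, RequestPriority, Response  # new re-exports

print(RequestPriority.get_all_priorities())
# {'URGENT': -200, 'HIGH': -100, 'NORMAL': 0, 'LOW': 100, 'BACKGROUND': 200}
print(RequestPriority.from_string("high"))  # -100 (case-insensitive name lookup)
```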
crawlo/network/request.py
CHANGED
@@ -1,5 +1,14 @@
 #!/usr/bin/python
 # -*- coding: UTF-8 -*-
+"""
+HTTP Request wrapper module
+===========================
+A full-featured HTTP request wrapper that supports:
+- automatic JSON/form-data handling
+- a priority ordering mechanism
+- safe deep-copy operations
+- flexible request configuration
+"""
 import json
 from copy import deepcopy
 from urllib.parse import urlencode
@@ -7,16 +16,38 @@ from w3lib.url import safe_url_string
 from typing import Dict, Optional, Callable, Union, Any, TypeVar, List
 
 from crawlo.utils.url import escape_ajax
+from crawlo.utils.log import get_logger
 
 
 _Request = TypeVar("_Request", bound="Request")
 
 
 class RequestPriority:
-    """
-
-
-
+    """Request priority constants and helpers."""
+    URGENT = -200      # urgent tasks
+    HIGH = -100        # high priority
+    NORMAL = 0         # normal priority (default)
+    LOW = 100          # low priority
+    BACKGROUND = 200   # background tasks
+
+    @classmethod
+    def get_all_priorities(cls) -> Dict[str, int]:
+        """Return all priority constants."""
+        return {
+            'URGENT': cls.URGENT,
+            'HIGH': cls.HIGH,
+            'NORMAL': cls.NORMAL,
+            'LOW': cls.LOW,
+            'BACKGROUND': cls.BACKGROUND
+        }
+
+    @classmethod
+    def from_string(cls, priority_str: str) -> int:
+        """Look up a priority value by name."""
+        priorities = cls.get_all_priorities()
+        if priority_str.upper() not in priorities:
+            raise ValueError(f"Unsupported priority: {priority_str}, supported: {list(priorities.keys())}")
+        return priorities[priority_str.upper()]
 
 
 class Request:
@@ -99,7 +130,10 @@ class Request:
         self.headers = headers or {}
         self.cookies = cookies or {}
         self.priority = -priority  # used for sorting: smaller value = higher priority
-
+
+        # 🔧 Handle meta safely: strip logger objects before deepcopy
+        self._meta = self._safe_deepcopy_meta(meta) if meta is not None else {}
+
         self.timeout = self._meta.get('download_timeout', timeout)
         self.proxy = proxy
         self.allow_redirects = allow_redirects
@@ -142,6 +176,34 @@ class Request:
         self.dont_filter = dont_filter
         self._set_url(url)
 
+    def _safe_deepcopy_meta(self, meta: Dict[str, Any]) -> Dict[str, Any]:
+        """Deepcopy meta safely by removing logger objects first."""
+        import logging
+
+        def clean_logger_recursive(obj):
+            """Recursively remove logger objects."""
+            if isinstance(obj, logging.Logger):
+                return None
+            elif isinstance(obj, dict):
+                cleaned = {}
+                for k, v in obj.items():
+                    if not (k == 'logger' or isinstance(v, logging.Logger)):
+                        cleaned[k] = clean_logger_recursive(v)
+                return cleaned
+            elif isinstance(obj, (list, tuple)):
+                cleaned_list = []
+                for item in obj:
+                    cleaned_item = clean_logger_recursive(item)
+                    if cleaned_item is not None:
+                        cleaned_list.append(cleaned_item)
+                return type(obj)(cleaned_list)
+            else:
+                return obj
+
+        # Strip loggers first, then deepcopy
+        cleaned_meta = clean_logger_recursive(meta)
+        return deepcopy(cleaned_meta)
+
     def copy(self: _Request) -> _Request:
         """
         Create a copy of this request, keeping the high-level semantics (json_body/form_data).
@@ -169,22 +231,68 @@ class Request:
             encoding=self.encoding
         )
 
-    def set_meta(self, key: str, value: Any) ->
-        """Set a meta
+    def set_meta(self, key: str, value: Any) -> 'Request':
+        """Set a meta entry; supports chaining."""
         self._meta[key] = value
+        return self
+
+    def add_header(self, key: str, value: str) -> 'Request':
+        """Add a request header; supports chaining."""
+        self.headers[key] = value
+        return self
+
+    def add_headers(self, headers: Dict[str, str]) -> 'Request':
+        """Add request headers in bulk; supports chaining."""
+        self.headers.update(headers)
+        return self
+
+    def set_proxy(self, proxy: str) -> 'Request':
+        """Set the proxy; supports chaining."""
+        self.proxy = proxy
+        return self
+
+    def set_timeout(self, timeout: float) -> 'Request':
+        """Set the timeout; supports chaining."""
+        self.timeout = timeout
+        return self
+
+    def add_flag(self, flag: str) -> 'Request':
+        """Add a flag; supports chaining."""
+        if flag not in self.flags:
+            self.flags.append(flag)
+        return self
+
+    def remove_flag(self, flag: str) -> 'Request':
+        """Remove a flag; supports chaining."""
+        if flag in self.flags:
+            self.flags.remove(flag)
+        return self
 
     def _set_url(self, url: str) -> None:
         """Set the URL safely and make sure it is well formed."""
         if not isinstance(url, str):
             raise TypeError(f"Request url must be a string, got: {type(url).__name__}")
+
+        if not url.strip():
+            raise ValueError("URL must not be empty")
+
+        # Reject dangerous URL schemes
+        dangerous_schemes = ['file://', 'ftp://', 'javascript:', 'data:']
+        if any(url.lower().startswith(scheme) for scheme in dangerous_schemes):
+            raise ValueError(f"Unsafe URL scheme: {url[:20]}...")
 
         s = safe_url_string(url, self.encoding)
         escaped_url = escape_ajax(s)
+
+        if not escaped_url.startswith(('http://', 'https://')):
+            raise ValueError(f"URL is missing an HTTP(S) scheme: {escaped_url[:50]}...")
+
+        # Check the URL length
+        if len(escaped_url) > 8192:  # maximum URL length most servers accept
+            raise ValueError(f"URL too long (over 8192 characters): {len(escaped_url)} characters")
+
         self._url = escaped_url
 
-        if not self._url.startswith(('http://', 'https://')):
-            raise ValueError(f"URL is missing a scheme: {self._url}")
-
     @property
     def url(self) -> str:
         return self._url
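
The new setter methods all return `self`, so a request can be configured fluently. A sketch under the assumption that `Request` takes the URL as its first argument and accepts the `priority`/`headers`/`meta` keywords visible in the hunks above:

```python
from crawlo.network import Request, RequestPriority

# Assumption: Request(url, ...) with the keyword arguments shown in the diff.
req = (
    Request("https://example.com/item/1", priority=RequestPriority.HIGH)
    .add_header("User-Agent", "crawlo/1.1.2")
    .add_headers({"Accept": "text/html", "Accept-Language": "en"})
    .set_meta("retry_times", 0)
    .set_proxy("http://127.0.0.1:8080")
    .set_timeout(15)
    .add_flag("seed")
)
print(req.url, req.priority, req.flags)
```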
crawlo/network/response.py
CHANGED
@@ -1,76 +1,179 @@
 #!/usr/bin/python
 # -*- coding:UTF-8 -*-
+"""
+HTTP Response wrapper module
+============================
+A feature-rich HTTP response wrapper that supports:
+- smart encoding detection and decoding
+- XPath/CSS selectors
+- JSON parsing with caching
+- regular expression helpers
+- cookie handling
+"""
 import re
 import ujson
 from http.cookies import SimpleCookie
 from parsel import Selector, SelectorList
-from typing import Dict, Any, List, Optional
+from typing import Dict, Any, List, Optional, Union
 from urllib.parse import urljoin as _urljoin
 
-from crawlo import Request
 from crawlo.exceptions import DecodeError
 
 
 class Response:
     """
     Wraps an HTTP response and provides convenient data-parsing helpers.
+
+    Features:
+    - smart encoding detection and caching
+    - lazily created Selector instance
+    - JSON parsing with caching
+    - extraction of multiple data types
     """
 
     def __init__(
         self,
         url: str,
         *,
-        headers: Dict[str, Any],
+        headers: Dict[str, Any] = None,
         body: bytes = b"",
         method: str = 'GET',
-        request: Request = None,
+        request: 'Request' = None,  # string annotation to avoid a circular import
         status_code: int = 200,
     ):
+        # Basic attributes
         self.url = url
-        self.headers = headers
+        self.headers = headers or {}
         self.body = body
-        self.method = method
+        self.method = method.upper()
         self.request = request
         self.status_code = status_code
-
+
+        # Encoding handling
+        self.encoding = self._determine_encoding()
+
+        # Cached values
         self._text_cache = None
         self._json_cache = None
-        self._selector_instance = None
+        self._selector_instance = None
+
+        # Status flags
+        self._is_success = 200 <= status_code < 300
+        self._is_redirect = 300 <= status_code < 400
+        self._is_client_error = 400 <= status_code < 500
+        self._is_server_error = status_code >= 500
 
+    def _determine_encoding(self) -> Optional[str]:
+        """Detect the response encoding."""
+        # 1. Prefer the encoding declared on the request
+        if self.request and self.request.encoding:
+            return self.request.encoding
+
+        # 2. Detect from the Content-Type header
+        content_type = self.headers.get("content-type", "") or self.headers.get("Content-Type", "")
+        if content_type:
+            charset_match = re.search(r"charset=([\w-]+)", content_type, re.I)
+            if charset_match:
+                return charset_match.group(1).lower()
+
+        # 3. Detect from HTML meta tags (HTML content only)
+        if b'<html' in self.body[:1024].lower():
+            # Look for <meta charset="xxx"> or <meta http-equiv="Content-Type" content="...charset=xxx">
+            html_start = self.body[:4096]  # only inspect the first 4 KB
+            try:
+                html_text = html_start.decode('ascii', errors='ignore')
+                # <meta charset="utf-8">
+                charset_match = re.search(r'<meta[^>]+charset=["\']?([\w-]+)', html_text, re.I)
+                if charset_match:
+                    return charset_match.group(1).lower()
+
+                # <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+                content_match = re.search(r'<meta[^>]+content=["\'][^"\'>]*charset=([\w-]+)', html_text, re.I)
+                if content_match:
+                    return content_match.group(1).lower()
+            except Exception:
+                pass
+
+        # 4. Fall back to utf-8
+        return 'utf-8'
     @property
     def text(self) -> str:
         """Decode the response body to a string using the correct encoding and cache the result."""
         if self._text_cache is not None:
             return self._text_cache
 
-
-
-
-        if encoding:
-            self._text_cache = self.body.decode(encoding)
-            return self._text_cache
+        if not self.body:
+            self._text_cache = ""
+            return self._text_cache
 
-
-
-
-
-
+        # Try several encodings
+        encodings_to_try = [self.encoding]
+        if self.encoding != 'utf-8':
+            encodings_to_try.append('utf-8')
+        if 'gbk' not in encodings_to_try:
+            encodings_to_try.append('gbk')
+        if 'gb2312' not in encodings_to_try:
+            encodings_to_try.append('gb2312')
+        encodings_to_try.append('latin1')  # last-resort fallback
+
+        for encoding in encodings_to_try:
+            if not encoding:
+                continue
+            try:
                 self._text_cache = self.body.decode(encoding)
                 return self._text_cache
-
-
-
+            except (UnicodeDecodeError, LookupError):
+                continue
+
+        # Every encoding failed; fall back to tolerant decoding
+        try:
+            self._text_cache = self.body.decode('utf-8', errors='replace')
             return self._text_cache
-
-        except UnicodeDecodeError as e:
+        except Exception as e:
             raise DecodeError(f"Failed to decode response from {self.url}: {e}")
 
-
+    @property
+    def is_success(self) -> bool:
+        """True for successful responses (2xx)."""
+        return self._is_success
+
+    @property
+    def is_redirect(self) -> bool:
+        """True for redirect responses (3xx)."""
+        return self._is_redirect
+
+    @property
+    def is_client_error(self) -> bool:
+        """True for client-error responses (4xx)."""
+        return self._is_client_error
+
+    @property
+    def is_server_error(self) -> bool:
+        """True for server-error responses (5xx)."""
+        return self._is_server_error
+
+    @property
+    def content_type(self) -> str:
+        """Return the response Content-Type."""
+        return self.headers.get('content-type', '') or self.headers.get('Content-Type', '')
+
+    @property
+    def content_length(self) -> Optional[int]:
+        """Return the response Content-Length."""
+        length = self.headers.get('content-length') or self.headers.get('Content-Length')
+        return int(length) if length else None
+    def json(self, default: Any = None) -> Any:
         """Parse the response text as a JSON object."""
-        if self._json_cache:
+        if self._json_cache is not None:
+            return self._json_cache
+
+        try:
+            self._json_cache = ujson.loads(self.text)
             return self._json_cache
-
-
+        except (ujson.JSONDecodeError, ValueError) as e:
+            if default is not None:
+                return default
+            raise DecodeError(f"Failed to parse JSON from {self.url}: {e}")
 
     def urljoin(self, url: str) -> str:
         """Join URLs, automatically handling relative paths."""
|