crawlo 1.4.2-py3-none-any.whl → 1.4.3-py3-none-any.whl

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.

Potentially problematic release: this version of crawlo might be problematic.

Files changed (326)
  1. crawlo/__init__.py +93 -93
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +75 -75
  4. crawlo/commands/__init__.py +14 -14
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +151 -151
  7. crawlo/commands/help.py +138 -138
  8. crawlo/commands/list.py +155 -155
  9. crawlo/commands/run.py +341 -341
  10. crawlo/commands/startproject.py +436 -436
  11. crawlo/commands/stats.py +187 -187
  12. crawlo/commands/utils.py +196 -196
  13. crawlo/config.py +312 -312
  14. crawlo/config_validator.py +277 -277
  15. crawlo/core/__init__.py +52 -52
  16. crawlo/core/engine.py +438 -439
  17. crawlo/core/processor.py +47 -47
  18. crawlo/core/scheduler.py +291 -257
  19. crawlo/crawler.py +650 -650
  20. crawlo/data/__init__.py +5 -5
  21. crawlo/data/user_agents.py +194 -194
  22. crawlo/downloader/__init__.py +273 -273
  23. crawlo/downloader/aiohttp_downloader.py +233 -233
  24. crawlo/downloader/cffi_downloader.py +245 -245
  25. crawlo/downloader/httpx_downloader.py +259 -259
  26. crawlo/downloader/hybrid_downloader.py +212 -212
  27. crawlo/downloader/playwright_downloader.py +402 -402
  28. crawlo/downloader/selenium_downloader.py +472 -472
  29. crawlo/event.py +11 -11
  30. crawlo/exceptions.py +81 -81
  31. crawlo/extension/__init__.py +63 -63
  32. crawlo/extension/health_check.py +141 -141
  33. crawlo/extension/log_interval.py +94 -94
  34. crawlo/extension/log_stats.py +70 -70
  35. crawlo/extension/logging_extension.py +61 -61
  36. crawlo/extension/memory_monitor.py +104 -104
  37. crawlo/extension/performance_profiler.py +133 -133
  38. crawlo/extension/request_recorder.py +107 -107
  39. crawlo/factories/__init__.py +27 -27
  40. crawlo/factories/base.py +68 -68
  41. crawlo/factories/crawler.py +103 -103
  42. crawlo/factories/registry.py +84 -84
  43. crawlo/filters/__init__.py +154 -154
  44. crawlo/filters/aioredis_filter.py +257 -257
  45. crawlo/filters/memory_filter.py +269 -269
  46. crawlo/framework.py +292 -292
  47. crawlo/initialization/__init__.py +44 -44
  48. crawlo/initialization/built_in.py +425 -425
  49. crawlo/initialization/context.py +141 -141
  50. crawlo/initialization/core.py +193 -193
  51. crawlo/initialization/phases.py +148 -148
  52. crawlo/initialization/registry.py +145 -145
  53. crawlo/items/__init__.py +23 -23
  54. crawlo/items/base.py +23 -23
  55. crawlo/items/fields.py +52 -52
  56. crawlo/items/items.py +104 -104
  57. crawlo/logging/__init__.py +45 -37
  58. crawlo/logging/async_handler.py +181 -0
  59. crawlo/logging/config.py +196 -96
  60. crawlo/logging/factory.py +171 -128
  61. crawlo/logging/manager.py +111 -111
  62. crawlo/logging/monitor.py +153 -0
  63. crawlo/logging/sampler.py +167 -0
  64. crawlo/middleware/__init__.py +21 -21
  65. crawlo/middleware/default_header.py +132 -132
  66. crawlo/middleware/download_delay.py +104 -104
  67. crawlo/middleware/middleware_manager.py +135 -135
  68. crawlo/middleware/offsite.py +123 -123
  69. crawlo/middleware/proxy.py +386 -386
  70. crawlo/middleware/request_ignore.py +86 -86
  71. crawlo/middleware/response_code.py +150 -150
  72. crawlo/middleware/response_filter.py +136 -136
  73. crawlo/middleware/retry.py +124 -124
  74. crawlo/middleware/simple_proxy.py +65 -65
  75. crawlo/mode_manager.py +219 -219
  76. crawlo/network/__init__.py +21 -21
  77. crawlo/network/request.py +379 -379
  78. crawlo/network/response.py +359 -359
  79. crawlo/pipelines/__init__.py +21 -21
  80. crawlo/pipelines/bloom_dedup_pipeline.py +146 -146
  81. crawlo/pipelines/console_pipeline.py +39 -39
  82. crawlo/pipelines/csv_pipeline.py +316 -316
  83. crawlo/pipelines/database_dedup_pipeline.py +197 -197
  84. crawlo/pipelines/json_pipeline.py +218 -218
  85. crawlo/pipelines/memory_dedup_pipeline.py +105 -105
  86. crawlo/pipelines/mongo_pipeline.py +131 -131
  87. crawlo/pipelines/mysql_pipeline.py +325 -325
  88. crawlo/pipelines/pipeline_manager.py +100 -84
  89. crawlo/pipelines/redis_dedup_pipeline.py +156 -156
  90. crawlo/project.py +349 -338
  91. crawlo/queue/pqueue.py +42 -42
  92. crawlo/queue/queue_manager.py +526 -522
  93. crawlo/queue/redis_priority_queue.py +370 -367
  94. crawlo/settings/__init__.py +7 -7
  95. crawlo/settings/default_settings.py +284 -284
  96. crawlo/settings/setting_manager.py +219 -219
  97. crawlo/spider/__init__.py +657 -657
  98. crawlo/stats_collector.py +73 -73
  99. crawlo/subscriber.py +129 -129
  100. crawlo/task_manager.py +138 -138
  101. crawlo/templates/crawlo.cfg.tmpl +10 -10
  102. crawlo/templates/project/__init__.py.tmpl +3 -3
  103. crawlo/templates/project/items.py.tmpl +17 -17
  104. crawlo/templates/project/middlewares.py.tmpl +118 -118
  105. crawlo/templates/project/pipelines.py.tmpl +96 -96
  106. crawlo/templates/project/settings.py.tmpl +170 -170
  107. crawlo/templates/project/settings_distributed.py.tmpl +169 -169
  108. crawlo/templates/project/settings_gentle.py.tmpl +166 -166
  109. crawlo/templates/project/settings_high_performance.py.tmpl +167 -167
  110. crawlo/templates/project/settings_minimal.py.tmpl +65 -65
  111. crawlo/templates/project/settings_simple.py.tmpl +164 -164
  112. crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
  113. crawlo/templates/run.py.tmpl +34 -34
  114. crawlo/templates/spider/spider.py.tmpl +143 -143
  115. crawlo/templates/spiders_init.py.tmpl +9 -9
  116. crawlo/tools/__init__.py +200 -200
  117. crawlo/tools/anti_crawler.py +268 -268
  118. crawlo/tools/authenticated_proxy.py +240 -240
  119. crawlo/tools/data_formatter.py +225 -225
  120. crawlo/tools/data_validator.py +180 -180
  121. crawlo/tools/date_tools.py +289 -289
  122. crawlo/tools/distributed_coordinator.py +384 -384
  123. crawlo/tools/encoding_converter.py +127 -127
  124. crawlo/tools/network_diagnostic.py +364 -364
  125. crawlo/tools/request_tools.py +82 -82
  126. crawlo/tools/retry_mechanism.py +224 -224
  127. crawlo/tools/scenario_adapter.py +262 -262
  128. crawlo/tools/text_cleaner.py +232 -232
  129. crawlo/utils/__init__.py +34 -34
  130. crawlo/utils/batch_processor.py +259 -259
  131. crawlo/utils/class_loader.py +25 -25
  132. crawlo/utils/controlled_spider_mixin.py +439 -439
  133. crawlo/utils/db_helper.py +343 -343
  134. crawlo/utils/enhanced_error_handler.py +356 -356
  135. crawlo/utils/env_config.py +142 -142
  136. crawlo/utils/error_handler.py +165 -165
  137. crawlo/utils/fingerprint.py +122 -122
  138. crawlo/utils/func_tools.py +82 -82
  139. crawlo/utils/large_scale_config.py +286 -286
  140. crawlo/utils/large_scale_helper.py +344 -344
  141. crawlo/utils/log.py +79 -79
  142. crawlo/utils/performance_monitor.py +285 -285
  143. crawlo/utils/queue_helper.py +175 -175
  144. crawlo/utils/redis_connection_pool.py +388 -388
  145. crawlo/utils/redis_key_validator.py +198 -198
  146. crawlo/utils/request.py +267 -267
  147. crawlo/utils/request_serializer.py +225 -225
  148. crawlo/utils/spider_loader.py +61 -61
  149. crawlo/utils/system.py +11 -11
  150. crawlo/utils/tools.py +4 -4
  151. crawlo/utils/url.py +39 -39
  152. crawlo-1.4.3.dist-info/METADATA +190 -0
  153. crawlo-1.4.3.dist-info/RECORD +326 -0
  154. examples/__init__.py +7 -7
  155. examples/test_project/__init__.py +7 -7
  156. examples/test_project/run.py +34 -34
  157. examples/test_project/test_project/__init__.py +3 -3
  158. examples/test_project/test_project/items.py +17 -17
  159. examples/test_project/test_project/middlewares.py +118 -118
  160. examples/test_project/test_project/pipelines.py +96 -96
  161. examples/test_project/test_project/settings.py +169 -169
  162. examples/test_project/test_project/spiders/__init__.py +9 -9
  163. examples/test_project/test_project/spiders/of_week_dis.py +143 -143
  164. tests/__init__.py +7 -7
  165. tests/advanced_tools_example.py +275 -275
  166. tests/authenticated_proxy_example.py +106 -106
  167. tests/baidu_performance_test.py +108 -108
  168. tests/baidu_test.py +59 -59
  169. tests/cleaners_example.py +160 -160
  170. tests/comprehensive_framework_test.py +212 -212
  171. tests/comprehensive_test.py +81 -81
  172. tests/comprehensive_testing_summary.md +186 -186
  173. tests/config_validation_demo.py +142 -142
  174. tests/controlled_spider_example.py +205 -205
  175. tests/date_tools_example.py +180 -180
  176. tests/debug_configure.py +69 -69
  177. tests/debug_framework_logger.py +84 -84
  178. tests/debug_log_config.py +126 -126
  179. tests/debug_log_levels.py +63 -63
  180. tests/debug_pipelines.py +66 -66
  181. tests/detailed_log_test.py +233 -233
  182. tests/distributed_test.py +66 -66
  183. tests/distributed_test_debug.py +76 -76
  184. tests/dynamic_loading_example.py +523 -523
  185. tests/dynamic_loading_test.py +104 -104
  186. tests/env_config_example.py +133 -133
  187. tests/error_handling_example.py +171 -171
  188. tests/final_comprehensive_test.py +151 -151
  189. tests/final_log_test.py +260 -260
  190. tests/final_validation_test.py +182 -182
  191. tests/fix_log_test.py +142 -142
  192. tests/framework_performance_test.py +202 -202
  193. tests/log_buffering_test.py +111 -111
  194. tests/log_generation_timing_test.py +153 -153
  195. tests/optimized_performance_test.py +211 -211
  196. tests/performance_comparison.py +245 -245
  197. tests/queue_blocking_test.py +113 -113
  198. tests/queue_test.py +89 -89
  199. tests/redis_key_validation_demo.py +130 -130
  200. tests/request_params_example.py +150 -150
  201. tests/response_improvements_example.py +144 -144
  202. tests/scrapy_comparison/ofweek_scrapy.py +138 -138
  203. tests/scrapy_comparison/scrapy_test.py +133 -133
  204. tests/simple_command_test.py +119 -119
  205. tests/simple_crawlo_test.py +127 -127
  206. tests/simple_log_test.py +57 -57
  207. tests/simple_log_test2.py +137 -137
  208. tests/simple_optimization_test.py +128 -128
  209. tests/simple_queue_type_test.py +41 -41
  210. tests/simple_spider_test.py +49 -49
  211. tests/simple_test.py +47 -47
  212. tests/spider_log_timing_test.py +177 -177
  213. tests/test_advanced_tools.py +148 -148
  214. tests/test_all_commands.py +230 -230
  215. tests/test_all_pipeline_fingerprints.py +133 -133
  216. tests/test_all_redis_key_configs.py +145 -145
  217. tests/test_authenticated_proxy.py +141 -141
  218. tests/test_batch_processor.py +178 -178
  219. tests/test_cleaners.py +54 -54
  220. tests/test_component_factory.py +174 -174
  221. tests/test_comprehensive.py +146 -146
  222. tests/test_config_consistency.py +80 -80
  223. tests/test_config_merge.py +152 -152
  224. tests/test_config_validator.py +182 -182
  225. tests/test_controlled_spider_mixin.py +79 -79
  226. tests/test_crawlo_proxy_integration.py +108 -108
  227. tests/test_date_tools.py +123 -123
  228. tests/test_dedup_fix.py +220 -220
  229. tests/test_dedup_pipeline_consistency.py +125 -0
  230. tests/test_default_header_middleware.py +313 -313
  231. tests/test_distributed.py +65 -65
  232. tests/test_double_crawlo_fix.py +204 -204
  233. tests/test_double_crawlo_fix_simple.py +124 -124
  234. tests/test_download_delay_middleware.py +221 -221
  235. tests/test_downloader_proxy_compatibility.py +268 -268
  236. tests/test_dynamic_downloaders_proxy.py +124 -124
  237. tests/test_dynamic_proxy.py +92 -92
  238. tests/test_dynamic_proxy_config.py +146 -146
  239. tests/test_dynamic_proxy_real.py +109 -109
  240. tests/test_edge_cases.py +303 -303
  241. tests/test_enhanced_error_handler.py +270 -270
  242. tests/test_enhanced_error_handler_comprehensive.py +245 -245
  243. tests/test_env_config.py +121 -121
  244. tests/test_error_handler_compatibility.py +112 -112
  245. tests/test_factories.py +252 -252
  246. tests/test_final_validation.py +153 -153
  247. tests/test_fingerprint_consistency.py +135 -135
  248. tests/test_fingerprint_simple.py +51 -51
  249. tests/test_framework_env_usage.py +103 -103
  250. tests/test_framework_logger.py +66 -66
  251. tests/test_framework_startup.py +64 -64
  252. tests/test_get_component_logger.py +83 -83
  253. tests/test_hash_performance.py +99 -99
  254. tests/test_integration.py +169 -169
  255. tests/test_item_dedup_redis_key.py +122 -122
  256. tests/test_large_scale_config.py +112 -112
  257. tests/test_large_scale_helper.py +235 -235
  258. tests/test_logging_enhancements.py +375 -0
  259. tests/test_logging_final.py +185 -0
  260. tests/test_logging_integration.py +313 -0
  261. tests/test_logging_system.py +282 -282
  262. tests/test_middleware_debug.py +142 -0
  263. tests/test_mode_change.py +72 -72
  264. tests/test_mode_consistency.py +51 -51
  265. tests/test_offsite_middleware.py +244 -244
  266. tests/test_offsite_middleware_simple.py +203 -203
  267. tests/test_parsel.py +29 -29
  268. tests/test_performance.py +327 -327
  269. tests/test_performance_monitor.py +115 -115
  270. tests/test_pipeline_fingerprint_consistency.py +86 -86
  271. tests/test_priority_behavior.py +212 -0
  272. tests/test_priority_consistency.py +152 -0
  273. tests/test_priority_consistency_fixed.py +250 -0
  274. tests/test_proxy_api.py +264 -264
  275. tests/test_proxy_health_check.py +32 -32
  276. tests/test_proxy_middleware.py +121 -121
  277. tests/test_proxy_middleware_enhanced.py +216 -216
  278. tests/test_proxy_middleware_integration.py +136 -136
  279. tests/test_proxy_middleware_refactored.py +184 -184
  280. tests/test_proxy_providers.py +56 -56
  281. tests/test_proxy_stats.py +19 -19
  282. tests/test_proxy_strategies.py +59 -59
  283. tests/test_queue_empty_check.py +41 -41
  284. tests/test_queue_manager_double_crawlo.py +173 -173
  285. tests/test_queue_manager_redis_key.py +179 -179
  286. tests/test_queue_naming.py +154 -154
  287. tests/test_queue_type.py +106 -106
  288. tests/test_queue_type_redis_config_consistency.py +131 -0
  289. tests/test_random_headers_default.py +323 -0
  290. tests/test_random_headers_necessity.py +309 -0
  291. tests/test_random_user_agent.py +72 -72
  292. tests/test_real_scenario_proxy.py +195 -195
  293. tests/test_redis_config.py +28 -28
  294. tests/test_redis_connection_pool.py +294 -294
  295. tests/test_redis_key_naming.py +181 -181
  296. tests/test_redis_key_validator.py +123 -123
  297. tests/test_redis_queue.py +224 -224
  298. tests/test_redis_queue_name_fix.py +175 -175
  299. tests/test_redis_queue_type_fallback.py +130 -0
  300. tests/test_request_ignore_middleware.py +182 -182
  301. tests/test_request_params.py +111 -111
  302. tests/test_request_serialization.py +70 -70
  303. tests/test_response_code_middleware.py +349 -349
  304. tests/test_response_filter_middleware.py +427 -427
  305. tests/test_response_improvements.py +152 -152
  306. tests/test_retry_middleware.py +334 -242
  307. tests/test_retry_middleware_realistic.py +274 -0
  308. tests/test_scheduler.py +252 -252
  309. tests/test_scheduler_config_update.py +133 -133
  310. tests/test_simple_response.py +61 -61
  311. tests/test_telecom_spider_redis_key.py +205 -205
  312. tests/test_template_content.py +87 -87
  313. tests/test_template_redis_key.py +134 -134
  314. tests/test_tools.py +159 -159
  315. tests/test_user_agent_randomness.py +177 -0
  316. tests/test_user_agents.py +96 -96
  317. tests/tools_example.py +260 -260
  318. tests/untested_features_report.md +138 -138
  319. tests/verify_debug.py +51 -51
  320. tests/verify_distributed.py +117 -117
  321. tests/verify_log_fix.py +111 -111
  322. crawlo-1.4.2.dist-info/METADATA +0 -1199
  323. crawlo-1.4.2.dist-info/RECORD +0 -309
  324. {crawlo-1.4.2.dist-info → crawlo-1.4.3.dist-info}/WHEEL +0 -0
  325. {crawlo-1.4.2.dist-info → crawlo-1.4.3.dist-info}/entry_points.txt +0 -0
  326. {crawlo-1.4.2.dist-info → crawlo-1.4.3.dist-info}/top_level.txt +0 -0
crawlo/network/response.py
@@ -1,360 +1,360 @@
- #!/usr/bin/python
- # -*- coding:UTF-8 -*-
- """
- HTTP Response wrapper module
- ============================
- A feature-rich wrapper around HTTP responses, supporting:
- - smart encoding detection and decoding
- - XPath/CSS selectors
- - JSON parsing and caching
- - regular-expression helpers
- - cookie handling
- """
- import re
- import ujson
- from http.cookies import SimpleCookie
- from parsel import Selector, SelectorList
- from typing import Dict, Any, List, Optional, Union
- from urllib.parse import urljoin as _urljoin
-
- from crawlo.exceptions import DecodeError
-
-
- class Response:
-     """
-     Wraps an HTTP response and provides convenient data-parsing methods.
-
-     Features:
-     - smart encoding detection and caching
-     - lazily created Selector instance
-     - JSON parsing and caching
-     - extraction helpers for several data types
-     """
-
-     def __init__(
-         self,
-         url: str,
-         *,
-         headers: Dict[str, Any] = None,
-         body: bytes = b"",
-         method: str = 'GET',
-         request: 'Request' = None,  # string annotation avoids a circular import
-         status_code: int = 200,
-     ):
-         # Basic attributes
-         self.url = url
-         self.headers = headers or {}
-         self.body = body
-         self.method = method.upper()
-         self.request = request
-         self.status_code = status_code
-
-         # Encoding handling
-         self.encoding = self._determine_encoding()
-
-         # Cached values
-         self._text_cache = None
-         self._json_cache = None
-         self._selector_instance = None
-
-         # Status flags
-         self._is_success = 200 <= status_code < 300
-         self._is_redirect = 300 <= status_code < 400
-         self._is_client_error = 400 <= status_code < 500
-         self._is_server_error = status_code >= 500
-
-     def _determine_encoding(self) -> Optional[str]:
-         """Detect the response encoding."""
-         # 1. Prefer the encoding set on the request
-         if self.request and self.request.encoding:
-             return self.request.encoding
-
-         # 2. Detect from the Content-Type header
-         content_type = self.headers.get("content-type", "") or self.headers.get("Content-Type", "")
-         if content_type:
-             charset_match = re.search(r"charset=([\w-]+)", content_type, re.I)
-             if charset_match:
-                 return charset_match.group(1).lower()
-
-         # 3. Detect from HTML meta tags (HTML content only)
-         if b'<html' in self.body[:1024].lower():
-             # Look for <meta charset="xxx"> or <meta http-equiv="Content-Type" content="...charset=xxx">
-             html_start = self.body[:4096]  # inspect only the first 4 KB
-             try:
-                 html_text = html_start.decode('ascii', errors='ignore')
-                 # <meta charset="utf-8">
-                 charset_match = re.search(r'<meta[^>]+charset=["\']?([\w-]+)', html_text, re.I)
-                 if charset_match:
-                     return charset_match.group(1).lower()
-
-                 # <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
-                 content_match = re.search(r'<meta[^>]+content=["\'][^"\'>]*charset=([\w-]+)', html_text, re.I)
-                 if content_match:
-                     return content_match.group(1).lower()
-             except Exception:
-                 pass
-
-         # 4. Fall back to utf-8
-         return 'utf-8'
-
-     @property
-     def text(self) -> str:
-         """Decode the body to a string with the proper encoding and cache the result."""
-         if self._text_cache is not None:
-             return self._text_cache
-
-         if not self.body:
-             self._text_cache = ""
-             return self._text_cache
-
-         # Try several encodings in order
-         encodings_to_try = [self.encoding]
-         if self.encoding != 'utf-8':
-             encodings_to_try.append('utf-8')
-         if 'gbk' not in encodings_to_try:
-             encodings_to_try.append('gbk')
-         if 'gb2312' not in encodings_to_try:
-             encodings_to_try.append('gb2312')
-         encodings_to_try.append('latin1')  # last-resort fallback
-
-         for encoding in encodings_to_try:
-             if not encoding:
-                 continue
-             try:
-                 self._text_cache = self.body.decode(encoding)
-                 return self._text_cache
-             except (UnicodeDecodeError, LookupError):
-                 continue
-
-         # Every encoding failed; decode with replacement characters
-         try:
-             self._text_cache = self.body.decode('utf-8', errors='replace')
-             return self._text_cache
-         except Exception as e:
-             raise DecodeError(f"Failed to decode response from {self.url}: {e}")
-
-     @property
-     def is_success(self) -> bool:
-         """Whether the response succeeded (2xx)."""
-         return self._is_success
-
-     @property
-     def is_redirect(self) -> bool:
-         """Whether the response is a redirect (3xx)."""
-         return self._is_redirect
-
-     @property
-     def is_client_error(self) -> bool:
-         """Whether the response is a client error (4xx)."""
-         return self._is_client_error
-
-     @property
-     def is_server_error(self) -> bool:
-         """Whether the response is a server error (5xx)."""
-         return self._is_server_error
-
-     @property
-     def content_type(self) -> str:
-         """Return the response Content-Type."""
-         return self.headers.get('content-type', '') or self.headers.get('Content-Type', '')
-
-     @property
-     def content_length(self) -> Optional[int]:
-         """Return the response Content-Length."""
-         length = self.headers.get('content-length') or self.headers.get('Content-Length')
-         return int(length) if length else None
-
-     def json(self, default: Any = None) -> Any:
-         """Parse the response text as JSON."""
-         if self._json_cache is not None:
-             return self._json_cache
-
-         try:
-             self._json_cache = ujson.loads(self.text)
-             return self._json_cache
-         except (ujson.JSONDecodeError, ValueError) as e:
-             if default is not None:
-                 return default
-             raise DecodeError(f"Failed to parse JSON from {self.url}: {e}")
-
-     def urljoin(self, url: str) -> str:
-         """Join a URL against the response URL, resolving relative paths."""
-         return _urljoin(self.url, url)
-
-     @property
-     def _selector(self) -> Selector:
-         """Lazily create the Selector instance."""
-         if self._selector_instance is None:
-             self._selector_instance = Selector(self.text)
-         return self._selector_instance
-
-     def xpath(self, query: str) -> SelectorList:
-         """Query the document with an XPath selector."""
-         return self._selector.xpath(query)
-
-     def css(self, query: str) -> SelectorList:
-         """Query the document with a CSS selector."""
-         return self._selector.css(query)
-
-     def _is_xpath(self, query: str) -> bool:
-         """Return True if the query looks like an XPath expression."""
-         return query.startswith(('/', '//', './'))
-
-     def _extract_text_from_elements(self, elements: SelectorList, join_str: str = " ") -> str:
-         """
-         Extract text from a list of elements and join it.
-
-         :param elements: SelectorList of elements
-         :param join_str: separator used to join text fragments
-         :return: the joined text
-         """
-         texts = []
-         for element in elements:
-             # Collect all text nodes of the element
-             if hasattr(element, 'xpath'):
-                 element_texts = element.xpath('.//text()').getall()
-             else:
-                 element_texts = [str(element)]
-             # Strip whitespace and keep non-empty fragments
-             for text in element_texts:
-                 cleaned = text.strip()
-                 if cleaned:
-                     texts.append(cleaned)
-         return join_str.join(texts)
-
-     def extract_text(self, xpath_or_css: str, join_str: str = " ", default: str = '') -> str:
-         """
-         Extract the text of a single element; accepts CSS or XPath selectors.
-
-         Parameters:
-             xpath_or_css: XPath or CSS selector
-             join_str: separator for joining text fragments (default: space)
-             default: value returned when no element matches
-
-         Returns:
-             The joined plain-text string.
-         """
-         try:
-             elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
-             if not elements:
-                 return default
-             return self._extract_text_from_elements(elements, join_str)
-         except Exception:
-             return default
-
-     def extract_texts(self, xpath_or_css: str, join_str: str = " ", default: List[str] = None) -> List[str]:
-         """
-         Extract the text of multiple elements as a list; accepts CSS or XPath selectors.
-
-         Parameters:
-             xpath_or_css: XPath or CSS selector
-             join_str: separator for joining text fragments within one node
-             default: value returned when no element matches
-
-         Returns:
-             A list of plain-text strings, one per matched node.
-         """
-         if default is None:
-             default = []
-
-         try:
-             elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
-             if not elements:
-                 return default
-
-             result = []
-             for element in elements:
-                 # Extract the text of each element
-                 if hasattr(element, 'xpath'):
-                     texts = element.xpath('.//text()').getall()
-                 else:
-                     texts = [str(element)]
-
-                 # Clean the fragments and join them
-                 clean_texts = [text.strip() for text in texts if text.strip()]
-                 if clean_texts:
-                     result.append(join_str.join(clean_texts))
-
-             return result if result else default
-         except Exception:
-             return default
-
-     def extract_attr(self, xpath_or_css: str, attr_name: str, default: Any = None) -> Any:
-         """
-         Extract an attribute value from a single element; accepts CSS or XPath selectors.
-
-         Parameters:
-             xpath_or_css: XPath or CSS selector
-             attr_name: attribute name
-             default: value returned when no element matches
-
-         Returns:
-             The attribute value, or the default.
-         """
-         try:
-             elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
-             if not elements:
-                 return default
-             return elements.attrib.get(attr_name, default)
-         except Exception:
-             return default
-
-     def extract_attrs(self, xpath_or_css: str, attr_name: str, default: List[Any] = None) -> List[Any]:
-         """
-         Extract attribute values from multiple elements; accepts CSS or XPath selectors.
-
-         Parameters:
-             xpath_or_css: XPath or CSS selector
-             attr_name: attribute name
-             default: value returned when no element matches
-
-         Returns:
-             A list of attribute values.
-         """
-         if default is None:
-             default = []
-
-         try:
-             elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
-             if not elements:
-                 return default
-
-             result = []
-             for element in elements:
-                 if hasattr(element, 'attrib'):
-                     attr_value = element.attrib.get(attr_name)
-                     if attr_value is not None:
-                         result.append(attr_value)
-
-             return result if result else default
-         except Exception:
-             return default
-
-     def re_search(self, pattern: str, flags: int = re.DOTALL) -> Optional[re.Match]:
-         """Run a regular-expression search over the response text."""
-         if not isinstance(pattern, str):
-             raise TypeError("Pattern must be a string")
-         return re.search(pattern, self.text, flags=flags)
-
-     def re_findall(self, pattern: str, flags: int = re.DOTALL) -> List[Any]:
-         """Run a regular-expression findall over the response text."""
-         if not isinstance(pattern, str):
-             raise TypeError("Pattern must be a string")
-         return re.findall(pattern, self.text, flags=flags)
-
-     def get_cookies(self) -> Dict[str, str]:
-         """Parse and return the cookies from the response headers."""
-         cookie_header = self.headers.get("Set-Cookie", "")
-         if isinstance(cookie_header, list):
-             cookie_header = ", ".join(cookie_header)
-         cookies = SimpleCookie()
-         cookies.load(cookie_header)
-         return {key: morsel.value for key, morsel in cookies.items()}
-
-     @property
-     def meta(self) -> Dict:
-         """Return the meta dict of the associated Request object."""
-         return self.request.meta if self.request else {}
-
-     def __str__(self):
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ HTTP Response wrapper module
+ ============================
+ A feature-rich wrapper around HTTP responses, supporting:
+ - smart encoding detection and decoding
+ - XPath/CSS selectors
+ - JSON parsing and caching
+ - regular-expression helpers
+ - cookie handling
+ """
+ import re
+ import ujson
+ from http.cookies import SimpleCookie
+ from parsel import Selector, SelectorList
+ from typing import Dict, Any, List, Optional, Union
+ from urllib.parse import urljoin as _urljoin
+
+ from crawlo.exceptions import DecodeError
+
+
+ class Response:
+     """
+     Wraps an HTTP response and provides convenient data-parsing methods.
+
+     Features:
+     - smart encoding detection and caching
+     - lazily created Selector instance
+     - JSON parsing and caching
+     - extraction helpers for several data types
+     """
+
+     def __init__(
+         self,
+         url: str,
+         *,
+         headers: Dict[str, Any] = None,
+         body: bytes = b"",
+         method: str = 'GET',
+         request: 'Request' = None,  # string annotation avoids a circular import
+         status_code: int = 200,
+     ):
+         # Basic attributes
+         self.url = url
+         self.headers = headers or {}
+         self.body = body
+         self.method = method.upper()
+         self.request = request
+         self.status_code = status_code
+
+         # Encoding handling
+         self.encoding = self._determine_encoding()
+
+         # Cached values
+         self._text_cache = None
+         self._json_cache = None
+         self._selector_instance = None
+
+         # Status flags
+         self._is_success = 200 <= status_code < 300
+         self._is_redirect = 300 <= status_code < 400
+         self._is_client_error = 400 <= status_code < 500
+         self._is_server_error = status_code >= 500
+
+     def _determine_encoding(self) -> Optional[str]:
+         """Detect the response encoding."""
+         # 1. Prefer the encoding set on the request
+         if self.request and self.request.encoding:
+             return self.request.encoding
+
+         # 2. Detect from the Content-Type header
+         content_type = self.headers.get("content-type", "") or self.headers.get("Content-Type", "")
+         if content_type:
+             charset_match = re.search(r"charset=([\w-]+)", content_type, re.I)
+             if charset_match:
+                 return charset_match.group(1).lower()
+
+         # 3. Detect from HTML meta tags (HTML content only)
+         if b'<html' in self.body[:1024].lower():
+             # Look for <meta charset="xxx"> or <meta http-equiv="Content-Type" content="...charset=xxx">
+             html_start = self.body[:4096]  # inspect only the first 4 KB
+             try:
+                 html_text = html_start.decode('ascii', errors='ignore')
+                 # <meta charset="utf-8">
+                 charset_match = re.search(r'<meta[^>]+charset=["\']?([\w-]+)', html_text, re.I)
+                 if charset_match:
+                     return charset_match.group(1).lower()
+
+                 # <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+                 content_match = re.search(r'<meta[^>]+content=["\'][^"\'>]*charset=([\w-]+)', html_text, re.I)
+                 if content_match:
+                     return content_match.group(1).lower()
+             except Exception:
+                 pass
+
+         # 4. Fall back to utf-8
+         return 'utf-8'
+
+     @property
+     def text(self) -> str:
+         """Decode the body to a string with the proper encoding and cache the result."""
+         if self._text_cache is not None:
+             return self._text_cache
+
+         if not self.body:
+             self._text_cache = ""
+             return self._text_cache
+
+         # Try several encodings in order
+         encodings_to_try = [self.encoding]
+         if self.encoding != 'utf-8':
+             encodings_to_try.append('utf-8')
+         if 'gbk' not in encodings_to_try:
+             encodings_to_try.append('gbk')
+         if 'gb2312' not in encodings_to_try:
+             encodings_to_try.append('gb2312')
+         encodings_to_try.append('latin1')  # last-resort fallback
+
+         for encoding in encodings_to_try:
+             if not encoding:
+                 continue
+             try:
+                 self._text_cache = self.body.decode(encoding)
+                 return self._text_cache
+             except (UnicodeDecodeError, LookupError):
+                 continue
+
+         # Every encoding failed; decode with replacement characters
+         try:
+             self._text_cache = self.body.decode('utf-8', errors='replace')
+             return self._text_cache
+         except Exception as e:
+             raise DecodeError(f"Failed to decode response from {self.url}: {e}")
+
+     @property
+     def is_success(self) -> bool:
+         """Whether the response succeeded (2xx)."""
+         return self._is_success
+
+     @property
+     def is_redirect(self) -> bool:
+         """Whether the response is a redirect (3xx)."""
+         return self._is_redirect
+
+     @property
+     def is_client_error(self) -> bool:
+         """Whether the response is a client error (4xx)."""
+         return self._is_client_error
+
+     @property
+     def is_server_error(self) -> bool:
+         """Whether the response is a server error (5xx)."""
+         return self._is_server_error
+
+     @property
+     def content_type(self) -> str:
+         """Return the response Content-Type."""
+         return self.headers.get('content-type', '') or self.headers.get('Content-Type', '')
+
+     @property
+     def content_length(self) -> Optional[int]:
+         """Return the response Content-Length."""
+         length = self.headers.get('content-length') or self.headers.get('Content-Length')
+         return int(length) if length else None
+
+     def json(self, default: Any = None) -> Any:
+         """Parse the response text as JSON."""
+         if self._json_cache is not None:
+             return self._json_cache
+
+         try:
+             self._json_cache = ujson.loads(self.text)
+             return self._json_cache
+         except (ujson.JSONDecodeError, ValueError) as e:
+             if default is not None:
+                 return default
+             raise DecodeError(f"Failed to parse JSON from {self.url}: {e}")
+
+     def urljoin(self, url: str) -> str:
+         """Join a URL against the response URL, resolving relative paths."""
+         return _urljoin(self.url, url)
+
+     @property
+     def _selector(self) -> Selector:
+         """Lazily create the Selector instance."""
+         if self._selector_instance is None:
+             self._selector_instance = Selector(self.text)
+         return self._selector_instance
+
+     def xpath(self, query: str) -> SelectorList:
+         """Query the document with an XPath selector."""
+         return self._selector.xpath(query)
+
+     def css(self, query: str) -> SelectorList:
+         """Query the document with a CSS selector."""
+         return self._selector.css(query)
+
+     def _is_xpath(self, query: str) -> bool:
+         """Return True if the query looks like an XPath expression."""
+         return query.startswith(('/', '//', './'))
+
+     def _extract_text_from_elements(self, elements: SelectorList, join_str: str = " ") -> str:
+         """
+         Extract text from a list of elements and join it.
+
+         :param elements: SelectorList of elements
+         :param join_str: separator used to join text fragments
+         :return: the joined text
+         """
+         texts = []
+         for element in elements:
+             # Collect all text nodes of the element
+             if hasattr(element, 'xpath'):
+                 element_texts = element.xpath('.//text()').getall()
+             else:
+                 element_texts = [str(element)]
+             # Strip whitespace and keep non-empty fragments
+             for text in element_texts:
+                 cleaned = text.strip()
+                 if cleaned:
+                     texts.append(cleaned)
+         return join_str.join(texts)
+
+     def extract_text(self, xpath_or_css: str, join_str: str = " ", default: str = '') -> str:
+         """
+         Extract the text of a single element; accepts CSS or XPath selectors.
+
+         Parameters:
+             xpath_or_css: XPath or CSS selector
+             join_str: separator for joining text fragments (default: space)
+             default: value returned when no element matches
+
+         Returns:
+             The joined plain-text string.
+         """
+         try:
+             elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
+             if not elements:
+                 return default
+             return self._extract_text_from_elements(elements, join_str)
+         except Exception:
+             return default
+
+     def extract_texts(self, xpath_or_css: str, join_str: str = " ", default: List[str] = None) -> List[str]:
+         """
+         Extract the text of multiple elements as a list; accepts CSS or XPath selectors.
+
+         Parameters:
+             xpath_or_css: XPath or CSS selector
+             join_str: separator for joining text fragments within one node
+             default: value returned when no element matches
+
+         Returns:
+             A list of plain-text strings, one per matched node.
+         """
+         if default is None:
+             default = []
+
+         try:
+             elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
+             if not elements:
+                 return default
+
+             result = []
+             for element in elements:
+                 # Extract the text of each element
+                 if hasattr(element, 'xpath'):
+                     texts = element.xpath('.//text()').getall()
+                 else:
+                     texts = [str(element)]
+
+                 # Clean the fragments and join them
+                 clean_texts = [text.strip() for text in texts if text.strip()]
+                 if clean_texts:
+                     result.append(join_str.join(clean_texts))
+
+             return result if result else default
+         except Exception:
+             return default
+
+     def extract_attr(self, xpath_or_css: str, attr_name: str, default: Any = None) -> Any:
+         """
+         Extract an attribute value from a single element; accepts CSS or XPath selectors.
+
+         Parameters:
+             xpath_or_css: XPath or CSS selector
+             attr_name: attribute name
+             default: value returned when no element matches
+
+         Returns:
+             The attribute value, or the default.
+         """
+         try:
+             elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
+             if not elements:
+                 return default
+             return elements.attrib.get(attr_name, default)
+         except Exception:
+             return default
+
+     def extract_attrs(self, xpath_or_css: str, attr_name: str, default: List[Any] = None) -> List[Any]:
+         """
+         Extract attribute values from multiple elements; accepts CSS or XPath selectors.
+
+         Parameters:
+             xpath_or_css: XPath or CSS selector
+             attr_name: attribute name
+             default: value returned when no element matches
+
+         Returns:
+             A list of attribute values.
+         """
+         if default is None:
+             default = []
+
+         try:
+             elements = self.xpath(xpath_or_css) if self._is_xpath(xpath_or_css) else self.css(xpath_or_css)
+             if not elements:
+                 return default
+
+             result = []
+             for element in elements:
+                 if hasattr(element, 'attrib'):
+                     attr_value = element.attrib.get(attr_name)
+                     if attr_value is not None:
+                         result.append(attr_value)
+
+             return result if result else default
+         except Exception:
+             return default
+
+     def re_search(self, pattern: str, flags: int = re.DOTALL) -> Optional[re.Match]:
+         """Run a regular-expression search over the response text."""
+         if not isinstance(pattern, str):
+             raise TypeError("Pattern must be a string")
+         return re.search(pattern, self.text, flags=flags)
+
+     def re_findall(self, pattern: str, flags: int = re.DOTALL) -> List[Any]:
+         """Run a regular-expression findall over the response text."""
+         if not isinstance(pattern, str):
+             raise TypeError("Pattern must be a string")
+         return re.findall(pattern, self.text, flags=flags)
+
+     def get_cookies(self) -> Dict[str, str]:
+         """Parse and return the cookies from the response headers."""
+         cookie_header = self.headers.get("Set-Cookie", "")
+         if isinstance(cookie_header, list):
+             cookie_header = ", ".join(cookie_header)
+         cookies = SimpleCookie()
+         cookies.load(cookie_header)
+         return {key: morsel.value for key, morsel in cookies.items()}
+
+     @property
+     def meta(self) -> Dict:
+         """Return the meta dict of the associated Request object."""
+         return self.request.meta if self.request else {}
+
+     def __str__(self):
          return f"<{self.status_code} {self.url}>"
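
The removed and added sides of this hunk are line-for-line identical, so the rewrite of crawlo/network/response.py appears limited to line-level metadata such as line endings rather than behavior. For orientation, here is a minimal usage sketch of the Response helpers shown in the diff; the URL, HTML body, and selectors are made up for illustration, and the Response is constructed directly rather than produced by a crawlo downloader:

    # A minimal sketch, assuming only the Response class from the diff above;
    # the URL, HTML, and selectors below are hypothetical.
    from crawlo.network.response import Response

    html = (
        b'<html><head><meta charset="utf-8"></head><body>'
        b'<a class="item" href="/a">First</a>'
        b'<a class="item" href="/b">Second</a>'
        b'</body></html>'
    )

    resp = Response(
        "https://example.com/list",
        headers={"Content-Type": "text/html; charset=utf-8"},
        body=html,
        status_code=200,
    )

    assert resp.is_success                                   # 2xx status flag
    assert resp.encoding == "utf-8"                          # detected from Content-Type
    print(resp.extract_texts("a.item"))                      # ['First', 'Second']
    print(resp.extract_attr("//a[@class='item']", "href"))   # '/a' (first match)
    print(resp.urljoin("/b"))                                # 'https://example.com/b'
    print(resp.get_cookies())                                # {} (no Set-Cookie header)

Note that extract_text, extract_texts, extract_attr, and extract_attrs swallow exceptions and return their default, so failed lookups surface as empty strings, empty lists, or None rather than errors; that is worth keeping in mind when validating scraped data.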