aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
- aio_scrapy-2.1.7.dist-info/METADATA +147 -0
- aio_scrapy-2.1.7.dist-info/RECORD +134 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
- aioscrapy/VERSION +1 -1
- aioscrapy/cmdline.py +438 -5
- aioscrapy/core/downloader/__init__.py +522 -17
- aioscrapy/core/downloader/handlers/__init__.py +187 -5
- aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
- aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
- aioscrapy/core/downloader/handlers/httpx.py +135 -5
- aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
- aioscrapy/core/downloader/handlers/requests.py +120 -2
- aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
- aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
- aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
- aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
- aioscrapy/core/engine.py +381 -20
- aioscrapy/core/scheduler.py +350 -36
- aioscrapy/core/scraper.py +509 -33
- aioscrapy/crawler.py +392 -10
- aioscrapy/db/__init__.py +149 -0
- aioscrapy/db/absmanager.py +212 -6
- aioscrapy/db/aiomongo.py +292 -10
- aioscrapy/db/aiomysql.py +363 -10
- aioscrapy/db/aiopg.py +299 -2
- aioscrapy/db/aiorabbitmq.py +444 -4
- aioscrapy/db/aioredis.py +260 -11
- aioscrapy/dupefilters/__init__.py +110 -5
- aioscrapy/dupefilters/disk.py +124 -2
- aioscrapy/dupefilters/redis.py +598 -32
- aioscrapy/exceptions.py +151 -13
- aioscrapy/http/__init__.py +1 -1
- aioscrapy/http/headers.py +237 -3
- aioscrapy/http/request/__init__.py +257 -11
- aioscrapy/http/request/form.py +83 -3
- aioscrapy/http/request/json_request.py +121 -9
- aioscrapy/http/response/__init__.py +306 -33
- aioscrapy/http/response/html.py +42 -3
- aioscrapy/http/response/text.py +496 -49
- aioscrapy/http/response/web_driver.py +144 -0
- aioscrapy/http/response/xml.py +45 -3
- aioscrapy/libs/downloader/defaultheaders.py +66 -2
- aioscrapy/libs/downloader/downloadtimeout.py +91 -2
- aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
- aioscrapy/libs/downloader/retry.py +192 -6
- aioscrapy/libs/downloader/stats.py +142 -0
- aioscrapy/libs/downloader/useragent.py +93 -2
- aioscrapy/libs/extensions/closespider.py +166 -4
- aioscrapy/libs/extensions/corestats.py +151 -1
- aioscrapy/libs/extensions/logstats.py +145 -1
- aioscrapy/libs/extensions/metric.py +370 -1
- aioscrapy/libs/extensions/throttle.py +235 -1
- aioscrapy/libs/pipelines/__init__.py +345 -2
- aioscrapy/libs/pipelines/csv.py +242 -0
- aioscrapy/libs/pipelines/excel.py +545 -0
- aioscrapy/libs/pipelines/mongo.py +132 -0
- aioscrapy/libs/pipelines/mysql.py +67 -0
- aioscrapy/libs/pipelines/pg.py +67 -0
- aioscrapy/libs/spider/depth.py +141 -3
- aioscrapy/libs/spider/httperror.py +144 -4
- aioscrapy/libs/spider/offsite.py +202 -2
- aioscrapy/libs/spider/referer.py +396 -21
- aioscrapy/libs/spider/urllength.py +97 -1
- aioscrapy/link.py +115 -8
- aioscrapy/logformatter.py +199 -8
- aioscrapy/middleware/absmanager.py +328 -2
- aioscrapy/middleware/downloader.py +218 -0
- aioscrapy/middleware/extension.py +50 -1
- aioscrapy/middleware/itempipeline.py +96 -0
- aioscrapy/middleware/spider.py +360 -7
- aioscrapy/process.py +200 -0
- aioscrapy/proxy/__init__.py +142 -3
- aioscrapy/proxy/redis.py +136 -2
- aioscrapy/queue/__init__.py +168 -16
- aioscrapy/scrapyd/runner.py +124 -3
- aioscrapy/serializer.py +182 -2
- aioscrapy/settings/__init__.py +610 -128
- aioscrapy/settings/default_settings.py +314 -14
- aioscrapy/signalmanager.py +151 -20
- aioscrapy/signals.py +183 -1
- aioscrapy/spiderloader.py +165 -12
- aioscrapy/spiders/__init__.py +233 -6
- aioscrapy/statscollectors.py +312 -1
- aioscrapy/utils/conf.py +345 -17
- aioscrapy/utils/curl.py +168 -16
- aioscrapy/utils/decorators.py +76 -6
- aioscrapy/utils/deprecate.py +212 -19
- aioscrapy/utils/httpobj.py +55 -3
- aioscrapy/utils/log.py +79 -0
- aioscrapy/utils/misc.py +189 -21
- aioscrapy/utils/ossignal.py +67 -5
- aioscrapy/utils/project.py +165 -3
- aioscrapy/utils/python.py +254 -44
- aioscrapy/utils/reqser.py +75 -1
- aioscrapy/utils/request.py +173 -12
- aioscrapy/utils/response.py +91 -6
- aioscrapy/utils/signal.py +196 -14
- aioscrapy/utils/spider.py +51 -4
- aioscrapy/utils/template.py +93 -6
- aioscrapy/utils/tools.py +191 -17
- aioscrapy/utils/trackref.py +198 -12
- aioscrapy/utils/url.py +341 -36
- aio_scrapy-2.1.4.dist-info/METADATA +0 -239
- aio_scrapy-2.1.4.dist-info/RECORD +0 -133
- aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
- aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
- aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
- aioscrapy/http/response/playwright.py +0 -36
- aioscrapy/libs/pipelines/execl.py +0 -169
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/http/response/web_driver.py ADDED

@@ -0,0 +1,144 @@
+"""
+Playwright response implementation for aioscrapy.
+aioscrapy的WebDriverResponse响应实现。
+
+This module provides the PlaywrightResponse class, which is a specialized TextResponse
+for handling responses from Playwright browser automation. It adds support for
+browser driver management and response caching.
+此模块提供了WebDriverResponse类,这是一个专门用于处理来自Playwright/DrissionPage等浏览器自动化的响应的TextResponse。
+它添加了对浏览器驱动程序管理和响应缓存的支持。
+"""
+
+from typing import Optional, Any
+
+from aioscrapy.http.response.text import TextResponse
+
+
+class WebDriverResponse(TextResponse):
+    """
+    A Response subclass for handling Playwright browser automation responses.
+    用于处理Playwright浏览器自动化响应的Response子类。
+
+    This class extends TextResponse to handle responses from Playwright browser automation.
+    It adds support for:
+    此类扩展了TextResponse以处理来自Playwright浏览器自动化的响应。
+    它添加了对以下内容的支持:
+
+    - Browser driver management
+      浏览器驱动程序管理
+    - Response caching
+      响应缓存
+    - Text content override
+      文本内容覆盖
+    - Intercepted request data
+      拦截的请求数据
+    """
+
+    def __init__(
+        self,
+        *args,
+        text: str = '',
+        cache_response: Optional[dict] = None,
+        driver: Optional["WebDriverBase"] = None,
+        driver_pool: Optional["WebDriverPool"] = None,
+        intercept_request: Optional[dict] = None,
+        **kwargs
+    ):
+        """
+        Initialize a PlaywrightResponse.
+        初始化PlaywrightResponse。
+
+        Args:
+            *args: Positional arguments passed to the TextResponse constructor.
+                传递给TextResponse构造函数的位置参数。
+            text: The text content of the response, which can override the body's decoded text.
+                响应的文本内容,可以覆盖正文的解码文本。
+            cache_response: A dictionary of cached response data.
+                缓存的响应数据字典。
+            driver: The Playwright driver instance used for this response.
+                用于此响应的Playwright驱动程序实例。
+            driver_pool: The WebDriverPool that manages the driver.
+                管理驱动程序的WebDriverPool。
+            intercept_request: A dictionary of intercepted request data.
+                拦截的请求数据字典。
+            **kwargs: Keyword arguments passed to the TextResponse constructor.
+                传递给TextResponse构造函数的关键字参数。
+        """
+        # Store Playwright-specific attributes
+        # 存储Playwright特定的属性
+        self.driver = driver
+        self.driver_pool = driver_pool
+        self._text = text
+        self.cache_response = cache_response or {}
+        self.intercept_request = intercept_request
+
+        # Initialize the base TextResponse
+        # 初始化基本TextResponse
+        super().__init__(*args, **kwargs)
+
+    async def release(self):
+        """
+        Release the Playwright driver back to the pool.
+        将Playwright驱动程序释放回池中。
+
+        This method releases the driver instance back to the WebDriverPool
+        if both the driver and pool are available.
+        如果驱动程序和池都可用,此方法将驱动程序实例释放回WebDriverPool。
+
+        Returns:
+            None
+        """
+        self.driver_pool and self.driver and await self.driver_pool.release(self.driver)
+
+    @property
+    def text(self):
+        """
+        Get the response text content.
+        获取响应文本内容。
+
+        This property overrides the base TextResponse.text property to return
+        the explicitly set text content if available, otherwise falls back to
+        the decoded body text from the parent class.
+        此属性重写了基本TextResponse.text属性,如果可用,则返回明确设置的文本内容,
+        否则回退到父类的解码正文文本。
+
+        Returns:
+            str: The response text content.
+                响应文本内容。
+        """
+        return self._text or super().text
+
+    @text.setter
+    def text(self, text):
+        """
+        Set the response text content.
+        设置响应文本内容。
+
+        This setter allows explicitly setting the text content of the response,
+        which will override the decoded body text.
+        此设置器允许明确设置响应的文本内容,这将覆盖解码的正文文本。
+
+        Args:
+            text: The text content to set.
+                要设置的文本内容。
+        """
+        self._text = text
+
+    def get_response(self, key) -> Any:
+        """
+        Get a value from the cached response data.
+        从缓存的响应数据中获取值。
+
+        This method retrieves a value from the cache_response dictionary
+        using the provided key.
+        此方法使用提供的键从cache_response字典中检索值。
+
+        Args:
+            key: The key to look up in the cached response data.
+                在缓存的响应数据中查找的键。
+
+        Returns:
+            Any: The value associated with the key, or None if the key is not found.
+                与键关联的值,如果未找到键,则为None。
+        """
+        return self.cache_response.get(key)
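The class above is what the browser-based download handlers listed earlier (playwright.py, drissionpage.py) hand back to spider callbacks. The following is a minimal usage sketch, not code from the package: the spider name, URL, and the 'screenshot' cache key are illustrative assumptions; only the response attributes (text, get_response, release) come from the diff above, and the import path assumes aioscrapy mirrors Scrapy's layout.

```python
# Hypothetical usage sketch; spider name, URL and the 'screenshot' key are
# illustrative and not part of the package.
from aioscrapy.spiders import Spider


class BrowserSpider(Spider):
    name = 'browser_example'
    start_urls = ['https://example.com']

    async def parse(self, response):
        # `response.text` returns the text set by the browser handler if any,
        # otherwise the decoded body (see the text property above).
        title = response.xpath('//title/text()').get()

        # Data stashed by the handler can be read from cache_response via
        # get_response(); it returns None when the key is absent.
        screenshot = response.get_response('screenshot')

        yield {'url': response.url, 'title': title, 'screenshot': screenshot}

        # Hand the browser driver back to the pool if one was attached
        # (a no-op when driver or driver_pool is None).
        await response.release()
```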
aioscrapy/http/response/xml.py CHANGED

@@ -1,12 +1,54 @@
 """
-
-
+XML response implementation for aioscrapy.
+aioscrapy的XML响应实现。
 
-
+This module provides the XmlResponse class, which is a specialized TextResponse
+for handling XML content. It inherits all functionality from TextResponse
+but is specifically intended for XML responses, with support for XML encoding
+declarations.
+此模块提供了XmlResponse类,这是一个专门用于处理XML内容的TextResponse。
+它继承了TextResponse的所有功能,但专门用于XML响应,支持XML编码声明。
 """
 
 from aioscrapy.http.response.text import TextResponse
 
 
 class XmlResponse(TextResponse):
+    """
+    A Response subclass specifically for XML responses.
+    专门用于XML响应的Response子类。
+
+    This class extends TextResponse to handle XML content. It inherits all the
+    functionality of TextResponse, including:
+    此类扩展了TextResponse以处理XML内容。它继承了TextResponse的所有功能,包括:
+
+    - Automatic encoding detection (including from XML declarations)
+      自动编码检测(包括从XML声明中)
+    - Unicode conversion
+      Unicode转换
+    - CSS and XPath selectors (particularly useful for XML)
+      CSS和XPath选择器(对XML特别有用)
+    - Enhanced link following
+      增强的链接跟踪
+
+    The main purpose of this class is to provide a specific type for XML responses,
+    which can be useful for type checking and middleware processing.
+    此类的主要目的是为XML响应提供特定类型,这对类型检查和中间件处理很有用。
+
+    Example:
+        ```python
+        def parse(self, response):
+            if isinstance(response, XmlResponse):
+                # Process XML response
+                items = response.xpath('//item')
+                for item in items:
+                    yield {
+                        'name': item.xpath('./name/text()').get(),
+                        'value': item.xpath('./value/text()').get()
+                    }
+            else:
+                # Handle other response types
+                pass
+        ```
+    """
     pass
aioscrapy/libs/downloader/defaultheaders.py CHANGED

@@ -1,22 +1,86 @@
 """
-DefaultHeaders
+DefaultHeaders Downloader Middleware
+默认头部下载器中间件
 
-
+This middleware sets default headers for all requests, as specified in the
+DEFAULT_REQUEST_HEADERS setting. These headers are only set if they are not
+already present in the request.
+此中间件为所有请求设置默认头部,如DEFAULT_REQUEST_HEADERS设置中指定的那样。
+这些头部仅在请求中尚未存在时才会设置。
 """
 
 from aioscrapy.utils.python import without_none_values
 
 
 class DefaultHeadersMiddleware:
+    """
+    Middleware for setting default headers on requests.
+    用于在请求上设置默认头部的中间件。
+
+    This middleware adds default headers to all outgoing requests, as specified in the
+    DEFAULT_REQUEST_HEADERS setting. Headers are only added if they are not already
+    present in the request, allowing request-specific headers to take precedence.
+    此中间件向所有传出请求添加默认头部,如DEFAULT_REQUEST_HEADERS设置中指定的那样。
+    仅当请求中尚未存在头部时才会添加头部,允许特定于请求的头部优先。
+    """
 
     def __init__(self, headers):
+        """
+        Initialize the DefaultHeadersMiddleware.
+        初始化DefaultHeadersMiddleware。
+
+        Args:
+            headers: An iterable of (name, value) pairs representing the default headers.
+                表示默认头部的(名称, 值)对的可迭代对象。
+        """
+        # Store the default headers
+        # 存储默认头部
         self._headers = headers
 
     @classmethod
     def from_crawler(cls, crawler):
+        """
+        Create a DefaultHeadersMiddleware instance from a crawler.
+        从爬虫创建DefaultHeadersMiddleware实例。
+
+        This is the factory method used by AioScrapy to create the middleware.
+        这是AioScrapy用于创建中间件的工厂方法。
+
+        Args:
+            crawler: The crawler that will use this middleware.
+                将使用此中间件的爬虫。
+
+        Returns:
+            DefaultHeadersMiddleware: A new DefaultHeadersMiddleware instance.
+                一个新的DefaultHeadersMiddleware实例。
+        """
+        # Get the default headers from settings, filtering out None values
+        # 从设置获取默认头部,过滤掉None值
         headers = without_none_values(crawler.settings['DEFAULT_REQUEST_HEADERS'])
+
+        # Create and return a new instance with the headers as (name, value) pairs
+        # 使用作为(名称, 值)对的头部创建并返回一个新实例
         return cls(headers.items())
 
     def process_request(self, request, spider):
+        """
+        Process a request before it is sent to the downloader.
+        在请求发送到下载器之前处理它。
+
+        This method adds the default headers to the request if they are not already present.
+        如果请求中尚未存在默认头部,此方法会将其添加到请求中。
+
+        Args:
+            request: The request being processed.
+                正在处理的请求。
+            spider: The spider that generated the request.
+                生成请求的爬虫。
+
+        Returns:
+            None: This method does not return a response or a deferred.
+                此方法不返回响应或延迟对象。
+        """
+        # Add each default header to the request if it's not already set
+        # 如果尚未设置,则将每个默认头部添加到请求中
         for k, v in self._headers:
             request.headers.setdefault(k, v)
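A minimal sketch of the precedence rule documented above: because the middleware only calls request.headers.setdefault(), headers set directly on a Request win over DEFAULT_REQUEST_HEADERS. The setting values below are examples, not the package defaults, and the import path assumes aioscrapy mirrors Scrapy's layout.

```python
# settings.py -- example values, not the package defaults
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

# In spider code: a header given on the request itself is kept, because the
# middleware only calls request.headers.setdefault(name, value).
from aioscrapy.http import Request

req = Request('https://example.com', headers={'Accept-Language': 'de'})
# After DefaultHeadersMiddleware.process_request:
#   Accept          -> filled in from DEFAULT_REQUEST_HEADERS
#   Accept-Language -> 'de' (request-specific value preserved)
```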
aioscrapy/libs/downloader/downloadtimeout.py CHANGED

@@ -1,26 +1,115 @@
 """
-Download
+Download Timeout Middleware
+下载超时中间件
 
-
+This middleware sets a default timeout for all requests, as specified in the
+DOWNLOAD_TIMEOUT setting or the spider's download_timeout attribute. The timeout
+can be overridden on a per-request basis by setting the 'download_timeout' key
+in the request's meta dictionary.
+此中间件为所有请求设置默认超时,如DOWNLOAD_TIMEOUT设置或爬虫的download_timeout
+属性中指定的那样。可以通过在请求的meta字典中设置'download_timeout'键来覆盖每个
+请求的超时。
 """
 
 from aioscrapy import signals
 
 
 class DownloadTimeoutMiddleware:
+    """
+    Middleware for setting default download timeouts on requests.
+    用于在请求上设置默认下载超时的中间件。
+
+    This middleware sets a default timeout for all outgoing requests, as specified in the
+    DOWNLOAD_TIMEOUT setting or the spider's download_timeout attribute. The timeout
+    can be overridden on a per-request basis by setting the 'download_timeout' key
+    in the request's meta dictionary.
+    此中间件为所有传出请求设置默认超时,如DOWNLOAD_TIMEOUT设置或爬虫的download_timeout
+    属性中指定的那样。可以通过在请求的meta字典中设置'download_timeout'键来覆盖每个
+    请求的超时。
+    """
 
     def __init__(self, timeout=180):
+        """
+        Initialize the DownloadTimeoutMiddleware.
+        初始化DownloadTimeoutMiddleware。
+
+        Args:
+            timeout: The default download timeout in seconds.
+                默认下载超时(以秒为单位)。
+                Defaults to 180 seconds.
+                默认为180秒。
+        """
+        # Store the default timeout
+        # 存储默认超时
         self._timeout = timeout
 
     @classmethod
     def from_crawler(cls, crawler):
+        """
+        Create a DownloadTimeoutMiddleware instance from a crawler.
+        从爬虫创建DownloadTimeoutMiddleware实例。
+
+        This is the factory method used by AioScrapy to create the middleware.
+        这是AioScrapy用于创建中间件的工厂方法。
+
+        Args:
+            crawler: The crawler that will use this middleware.
+                将使用此中间件的爬虫。
+
+        Returns:
+            DownloadTimeoutMiddleware: A new DownloadTimeoutMiddleware instance.
+                一个新的DownloadTimeoutMiddleware实例。
+        """
+        # Create a new instance with the timeout from settings
+        # 使用来自设置的超时创建一个新实例
         o = cls(crawler.settings.getfloat('DOWNLOAD_TIMEOUT'))
+
+        # Connect to the spider_opened signal
+        # 连接到spider_opened信号
         crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
+
+        # Return the new instance
+        # 返回新实例
         return o
 
     def spider_opened(self, spider):
+        """
+        Handle the spider_opened signal.
+        处理spider_opened信号。
+
+        This method is called when a spider is opened. It updates the default timeout
+        with the spider's download_timeout attribute if it exists.
+        当爬虫打开时调用此方法。如果存在,它会使用爬虫的download_timeout属性更新默认超时。
+
+        Args:
+            spider: The spider that was opened.
+                被打开的爬虫。
+        """
+        # Update the timeout with the spider's download_timeout attribute if it exists
+        # 如果存在,则使用爬虫的download_timeout属性更新超时
         self._timeout = getattr(spider, 'download_timeout', self._timeout)
 
     def process_request(self, request, spider):
+        """
+        Process a request before it is sent to the downloader.
+        在请求发送到下载器之前处理它。
+
+        This method sets the default download timeout in the request's meta dictionary
+        if it's not already set and if a default timeout is configured.
+        如果尚未设置默认下载超时且已配置默认超时,此方法会在请求的meta字典中设置它。
+
+        Args:
+            request: The request being processed.
+                正在处理的请求。
+            spider: The spider that generated the request.
+                生成请求的爬虫。
+
+        Returns:
+            None: This method does not return a response or a deferred.
+                此方法不返回响应或延迟对象。
+        """
+        # Set the default download timeout in the request's meta if it's not already set
+        # and if a default timeout is configured
+        # 如果尚未设置默认下载超时且已配置默认超时,则在请求的meta中设置它
         if self._timeout:
             request.meta.setdefault('download_timeout', self._timeout)
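The two override points described above, per request via meta and per spider via a download_timeout attribute, might look like the sketch below. The spider name and URL are placeholders, and the import paths assume aioscrapy mirrors Scrapy's layout.

```python
from aioscrapy.http import Request
from aioscrapy.spiders import Spider

# Per-request override (seconds): the middleware uses meta.setdefault, so an
# explicit 'download_timeout' set here is preserved.
req = Request('https://example.com/slow-endpoint', meta={'download_timeout': 30})


class SlowSiteSpider(Spider):
    name = 'slow_site'
    # Picked up by spider_opened() above and used as the default timeout for
    # every request this spider makes that does not set its own.
    download_timeout = 60
```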
aioscrapy/libs/downloader/ja3fingerprint.py CHANGED

@@ -1,36 +1,129 @@
 """
-
+JA3 Fingerprint Randomization Middleware
+JA3指纹随机化中间件
+
+This module provides a middleware for randomizing SSL/TLS cipher suites to help
+avoid fingerprinting and detection when making HTTPS requests. JA3 is a method
+for creating SSL/TLS client fingerprints that can be used to identify specific
+clients regardless of the presented hostname or client IP address.
+此模块提供了一个中间件,用于随机化SSL/TLS密码套件,以帮助在发出HTTPS请求时
+避免指纹识别和检测。JA3是一种创建SSL/TLS客户端指纹的方法,可用于识别特定
+客户端,而不考虑所呈现的主机名或客户端IP地址。
+
+By randomizing the order of cipher suites, this middleware helps to generate
+different JA3 fingerprints for each request, making it harder for servers to
+track or block the crawler based on its TLS fingerprint.
+通过随机化密码套件的顺序,此中间件有助于为每个请求生成不同的JA3指纹,
+使服务器更难基于其TLS指纹跟踪或阻止爬虫。
 """
 import random
 
 
+# Default cipher suite string used when no custom ciphers are specified
+# 未指定自定义密码套件时使用的默认密码套件字符串
 ORIGIN_CIPHERS = ('ECDH+AESGCM:DH+AESGCM:ECDH+AES256:DH+AES256:ECDH+AES128:DH+AES:ECDH+HIGH:'
                   'DH+HIGH:ECDH+3DES:DH+3DES:RSA+AESGCM:RSA+AES:RSA+HIGH:RSA+3DES:!aNULL:!eNULL:!MD5')
 
 
 class TLSCiphersMiddleware:
-    """
+    """
+    SSL/TLS Fingerprint Randomization Middleware.
+    SSL/TLS指纹随机化中间件。
+
+    This middleware modifies the SSL/TLS cipher suites used in HTTPS requests
+    to help avoid fingerprinting and detection. It can use custom cipher suites
+    or randomize the order of the default cipher suites.
+    此中间件修改HTTPS请求中使用的SSL/TLS密码套件,以帮助避免指纹识别和检测。
+    它可以使用自定义密码套件或随机化默认密码套件的顺序。
+    """
 
     def __init__(self, ciphers, is_random):
+        """
+        Initialize the TLS Ciphers Middleware.
+        初始化TLS密码套件中间件。
+
+        Args:
+            ciphers: The cipher suites to use, or 'DEFAULT' to use the default ciphers.
+                要使用的密码套件,或'DEFAULT'以使用默认密码套件。
+            is_random: Whether to randomize the order of cipher suites.
+                是否随机化密码套件的顺序。
+        """
+        # If ciphers is 'DEFAULT', set self.ciphers to None to use ORIGIN_CIPHERS later
+        # 如果ciphers是'DEFAULT',将self.ciphers设置为None以便稍后使用ORIGIN_CIPHERS
         if ciphers == 'DEFAULT':
             self.ciphers = None
+        else:
+            self.ciphers = ciphers
 
         self.is_random = is_random
 
     @classmethod
     def from_crawler(cls, crawler):
+        """
+        Create a TLSCiphersMiddleware instance from a crawler.
+        从爬虫创建TLSCiphersMiddleware实例。
+
+        This is the factory method used by AioScrapy to create middleware instances.
+        这是AioScrapy用于创建中间件实例的工厂方法。
+
+        Args:
+            crawler: The crawler that will use this middleware.
+                将使用此中间件的爬虫。
+
+        Returns:
+            TLSCiphersMiddleware: A new TLSCiphersMiddleware instance.
+                一个新的TLSCiphersMiddleware实例。
+        """
         return cls(
+            # Get custom cipher suites from settings, or use 'DEFAULT'
+            # 从设置获取自定义密码套件,或使用'DEFAULT'
             ciphers=crawler.settings.get('DOWNLOADER_CLIENT_TLS_CIPHERS', 'DEFAULT'),
+            # Get whether to randomize cipher suites from settings
+            # 从设置获取是否随机化密码套件
             is_random=crawler.settings.get('RANDOM_TLS_CIPHERS', False)
         )
 
     def process_request(self, request, spider):
+        """
+        Process a request before it is sent to the downloader.
+        在请求发送到下载器之前处理它。
+
+        This method sets the TLS cipher suites for the request, optionally
+        randomizing their order to generate different JA3 fingerprints.
+        此方法为请求设置TLS密码套件,可选择随机化它们的顺序以生成不同的JA3指纹。
+
+        Args:
+            request: The request being processed.
+                正在处理的请求。
+            spider: The spider that generated the request.
+                生成请求的爬虫。
+
+        Returns:
+            None: This method returns None to continue processing the request.
+                此方法返回None以继续处理请求。
+        """
+        # Skip if neither custom ciphers nor randomization is enabled
+        # 如果既没有启用自定义密码套件也没有启用随机化,则跳过
         if not (self.ciphers or self.is_random):
             return
 
+        # Use custom ciphers if specified, otherwise use default
+        # 如果指定了自定义密码套件则使用它,否则使用默认值
         ciphers = self.ciphers or ORIGIN_CIPHERS
+
+        # Randomize cipher suite order if enabled
+        # 如果启用了随机化,则随机化密码套件顺序
         if self.is_random:
+            # Split the cipher string into individual ciphers
+            # 将密码字符串拆分为单个密码
             ciphers = ciphers.split(":")
+            # Shuffle the ciphers randomly
+            # 随机打乱密码
             random.shuffle(ciphers)
+            # Join the ciphers back into a string
+            # 将密码重新连接成字符串
             ciphers = ":".join(ciphers)
+
+        # Set the cipher suites in the request metadata
+        # 在请求元数据中设置密码套件
         request.meta['TLS_CIPHERS'] = ciphers