crawlee 1.0.5b21__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- crawlee/request_loaders/_sitemap_request_loader.py +17 -4
- {crawlee-1.0.5b21.dist-info → crawlee-1.1.0.dist-info}/METADATA +1 -1
- {crawlee-1.0.5b21.dist-info → crawlee-1.1.0.dist-info}/RECORD +6 -6
- {crawlee-1.0.5b21.dist-info → crawlee-1.1.0.dist-info}/WHEEL +0 -0
- {crawlee-1.0.5b21.dist-info → crawlee-1.1.0.dist-info}/entry_points.txt +0 -0
- {crawlee-1.0.5b21.dist-info → crawlee-1.1.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -9,7 +9,7 @@ from typing import TYPE_CHECKING, Annotated, Any
|
|
|
9
9
|
from pydantic import BaseModel, ConfigDict, Field
|
|
10
10
|
from typing_extensions import override
|
|
11
11
|
|
|
12
|
-
from crawlee import Request
|
|
12
|
+
from crawlee import Request, RequestOptions
|
|
13
13
|
from crawlee._utils.docs import docs_group
|
|
14
14
|
from crawlee._utils.globs import Glob
|
|
15
15
|
from crawlee._utils.recoverable_state import RecoverableState
|
|
@@ -18,9 +18,10 @@ from crawlee.request_loaders._request_loader import RequestLoader
|
|
|
18
18
|
|
|
19
19
|
if TYPE_CHECKING:
|
|
20
20
|
import re
|
|
21
|
-
from collections.abc import Sequence
|
|
21
|
+
from collections.abc import Callable, Sequence
|
|
22
22
|
from types import TracebackType
|
|
23
23
|
|
|
24
|
+
from crawlee import RequestTransformAction
|
|
24
25
|
from crawlee.http_clients import HttpClient
|
|
25
26
|
from crawlee.proxy_configuration import ProxyInfo
|
|
26
27
|
from crawlee.storage_clients.models import ProcessedRequest
|
|
@@ -112,6 +113,7 @@ class SitemapRequestLoader(RequestLoader):
|
|
|
112
113
|
exclude: list[re.Pattern[Any] | Glob] | None = None,
|
|
113
114
|
max_buffer_size: int = 200,
|
|
114
115
|
persist_state_key: str | None = None,
|
|
116
|
+
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
|
|
115
117
|
) -> None:
|
|
116
118
|
"""Initialize the sitemap request loader.
|
|
117
119
|
|
|
@@ -125,6 +127,9 @@ class SitemapRequestLoader(RequestLoader):
|
|
|
125
127
|
persist_state_key: A key for persisting the loader's state in the KeyValueStore.
|
|
126
128
|
When provided, allows resuming from where it left off after interruption.
|
|
127
129
|
If None, no state persistence occurs.
|
|
130
|
+
transform_request_function: An optional function to transform requests
|
|
131
|
+
generated by the loader. It receives `RequestOptions` with `url` and should return either
|
|
132
|
+
modified `RequestOptions` or a `RequestTransformAction`.
|
|
128
133
|
"""
|
|
129
134
|
self._http_client = http_client
|
|
130
135
|
self._sitemap_urls = sitemap_urls
|
|
@@ -132,6 +137,7 @@ class SitemapRequestLoader(RequestLoader):
|
|
|
132
137
|
self._exclude = exclude
|
|
133
138
|
self._proxy_info = proxy_info
|
|
134
139
|
self._max_buffer_size = max_buffer_size
|
|
140
|
+
self._transform_request_function = transform_request_function
|
|
135
141
|
|
|
136
142
|
# Synchronization for queue operations
|
|
137
143
|
self._queue_has_capacity = asyncio.Event()
|
|
@@ -313,8 +319,15 @@ class SitemapRequestLoader(RequestLoader):
|
|
|
313
319
|
|
|
314
320
|
async with self._queue_lock:
|
|
315
321
|
url = state.url_queue.popleft()
|
|
316
|
-
|
|
317
|
-
|
|
322
|
+
request_option = RequestOptions(url=url)
|
|
323
|
+
if self._transform_request_function:
|
|
324
|
+
transform_request_option = self._transform_request_function(request_option)
|
|
325
|
+
if transform_request_option == 'skip':
|
|
326
|
+
state.total_count -= 1
|
|
327
|
+
continue
|
|
328
|
+
if transform_request_option != 'unchanged':
|
|
329
|
+
request_option = transform_request_option
|
|
330
|
+
request = Request.from_url(**request_option)
|
|
318
331
|
state.in_progress.add(request.url)
|
|
319
332
|
if len(state.url_queue) < self._max_buffer_size:
|
|
320
333
|
self._queue_has_capacity.set()
|
|
@@ -137,7 +137,7 @@ crawlee/request_loaders/_request_list.py,sha256=SIalHBMuFanE5GLnFocI0QCppWUiJQjr
|
|
|
137
137
|
crawlee/request_loaders/_request_loader.py,sha256=2Bg-AWWkIV1W-Dwjqo91dPY8nmc7H3teQy7d6OSgliQ,3620
|
|
138
138
|
crawlee/request_loaders/_request_manager.py,sha256=qFizyJuV2meIb9iiPfuii7ciuERMrp4SldAufiH46dc,3000
|
|
139
139
|
crawlee/request_loaders/_request_manager_tandem.py,sha256=lv-s94KPsoQAqx1KaXFch96ejhO147uOflF3UK5ORTk,4058
|
|
140
|
-
crawlee/request_loaders/_sitemap_request_loader.py,sha256=
|
|
140
|
+
crawlee/request_loaders/_sitemap_request_loader.py,sha256=W1_k_Szrtk0iE2LJBkHrrFeDtcKReXzr3DG32EnQaQE,16565
|
|
141
141
|
crawlee/sessions/__init__.py,sha256=dJdelbL-6MK5sW4SMU4QrjFbb9kRZ9uRnN-VS3R5-8Y,190
|
|
142
142
|
crawlee/sessions/_cookies.py,sha256=ihYbmpXfCzClzXDT7M2wefB_3KVzcMUdIzTZo6uGk6Y,9356
|
|
143
143
|
crawlee/sessions/_models.py,sha256=JMRQgDUP30XUdZ32isncHowOsXvK9jC_m9QYegbBI1E,2916
|
|
@@ -199,8 +199,8 @@ crawlee/storages/_request_queue.py,sha256=bjBOGbpMaGUsqJPVB-JD2VShziPAYMI-GvWKKp
|
|
|
199
199
|
crawlee/storages/_storage_instance_manager.py,sha256=72n0YlPwNpSQDJSPf4TxnI2GvIK6L-ZiTmHRbFcoVU0,8164
|
|
200
200
|
crawlee/storages/_utils.py,sha256=Yz-5tEBYKYCFJemYT29--uGJqoJLApLDLgPcsnbifRw,439
|
|
201
201
|
crawlee/storages/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
202
|
-
crawlee-1.0.
|
|
203
|
-
crawlee-1.0.
|
|
204
|
-
crawlee-1.0.
|
|
205
|
-
crawlee-1.0.
|
|
206
|
-
crawlee-1.0.
|
|
202
|
+
crawlee-1.1.0.dist-info/METADATA,sha256=lSL2WVejm9D2tHBLJWJuobID9_WyglyirKeLsMzp1Z0,29530
|
|
203
|
+
crawlee-1.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
204
|
+
crawlee-1.1.0.dist-info/entry_points.txt,sha256=1p65X3dA-cYvzjtlxLL6Kn1wpY-3uEDVqJLp53uNPeo,45
|
|
205
|
+
crawlee-1.1.0.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
|
|
206
|
+
crawlee-1.1.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|