crawlee 1.0.5b21__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
crawlee/request_loaders/_sitemap_request_loader.py

@@ -9,7 +9,7 @@ from typing import TYPE_CHECKING, Annotated, Any
 from pydantic import BaseModel, ConfigDict, Field
 from typing_extensions import override
 
-from crawlee import Request
+from crawlee import Request, RequestOptions
 from crawlee._utils.docs import docs_group
 from crawlee._utils.globs import Glob
 from crawlee._utils.recoverable_state import RecoverableState
@@ -18,9 +18,10 @@ from crawlee.request_loaders._request_loader import RequestLoader
 
 if TYPE_CHECKING:
     import re
-    from collections.abc import Sequence
+    from collections.abc import Callable, Sequence
     from types import TracebackType
 
+    from crawlee import RequestTransformAction
     from crawlee.http_clients import HttpClient
     from crawlee.proxy_configuration import ProxyInfo
     from crawlee.storage_clients.models import ProcessedRequest
@@ -112,6 +113,7 @@ class SitemapRequestLoader(RequestLoader):
         exclude: list[re.Pattern[Any] | Glob] | None = None,
         max_buffer_size: int = 200,
         persist_state_key: str | None = None,
+        transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
     ) -> None:
         """Initialize the sitemap request loader.
 
@@ -125,6 +127,9 @@ class SitemapRequestLoader(RequestLoader):
             persist_state_key: A key for persisting the loader's state in the KeyValueStore.
                 When provided, allows resuming from where it left off after interruption.
                 If None, no state persistence occurs.
+            transform_request_function: An optional function to transform requests
+                generated by the loader. It receives `RequestOptions` with `url` and should return either
+                modified `RequestOptions` or a `RequestTransformAction`.
         """
         self._http_client = http_client
         self._sitemap_urls = sitemap_urls
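
A minimal sketch of what such a transform function could look like. It assumes only what the hunks in this diff show: `RequestOptions` holds the keyword arguments later unpacked into `Request.from_url` (including `url`), and `RequestTransformAction` covers the literal values 'skip' and 'unchanged' checked by the loader. The `label` field is used purely for illustration.

from crawlee import RequestOptions, RequestTransformAction

def transform_sitemap_request(options: RequestOptions) -> RequestOptions | RequestTransformAction:
    # Drop PDF links from the sitemap entirely.
    if options['url'].endswith('.pdf'):
        return 'skip'
    # Leave API endpoints exactly as generated.
    if '/api/' in options['url']:
        return 'unchanged'
    # Otherwise tag the request, e.g. so a router can dispatch it to a dedicated handler.
    options['label'] = 'FROM_SITEMAP'
    return options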
@@ -132,6 +137,7 @@ class SitemapRequestLoader(RequestLoader):
         self._exclude = exclude
         self._proxy_info = proxy_info
         self._max_buffer_size = max_buffer_size
+        self._transform_request_function = transform_request_function
 
         # Synchronization for queue operations
         self._queue_has_capacity = asyncio.Event()
@@ -313,8 +319,15 @@ class SitemapRequestLoader(RequestLoader):
 
             async with self._queue_lock:
                 url = state.url_queue.popleft()
-
-                request = Request.from_url(url)
+                request_option = RequestOptions(url=url)
+                if self._transform_request_function:
+                    transform_request_option = self._transform_request_function(request_option)
+                    if transform_request_option == 'skip':
+                        state.total_count -= 1
+                        continue
+                    if transform_request_option != 'unchanged':
+                        request_option = transform_request_option
+                request = Request.from_url(**request_option)
                 state.in_progress.add(request.url)
                 if len(state.url_queue) < self._max_buffer_size:
                     self._queue_has_capacity.set()
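
For context, a usage sketch of how the new parameter might be wired in. It assumes the constructor keywords visible in this diff (sitemap_urls, http_client, transform_request_function), that HttpxHttpClient from crawlee.http_clients is a suitable client, and that requests are consumed through the generic RequestLoader methods (is_finished, fetch_next_request, mark_request_handled). The sitemap URL is hypothetical, and any additional setup or teardown the loader requires is omitted.

import asyncio

from crawlee.http_clients import HttpxHttpClient
from crawlee.request_loaders import SitemapRequestLoader

async def main() -> None:
    # transform_sitemap_request is the sketch shown earlier.
    loader = SitemapRequestLoader(
        sitemap_urls=['https://example.com/sitemap.xml'],  # hypothetical sitemap URL
        http_client=HttpxHttpClient(),
        transform_request_function=transform_sitemap_request,
    )
    # URLs for which the transform returned 'skip' are dropped inside the loader
    # (total_count is decremented) and never surface here; everything else comes
    # back as a Request built from the possibly modified RequestOptions.
    while not await loader.is_finished():
        request = await loader.fetch_next_request()
        if request is None:
            break
        print(request.url, request.label)
        await loader.mark_request_handled(request)

asyncio.run(main())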
crawlee-1.0.5b21.dist-info/METADATA → crawlee-1.1.0.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: crawlee
-Version: 1.0.5b21
+Version: 1.1.0
 Summary: Crawlee for Python
 Project-URL: Apify Homepage, https://apify.com
 Project-URL: Changelog, https://crawlee.dev/python/docs/changelog
crawlee-1.0.5b21.dist-info/RECORD → crawlee-1.1.0.dist-info/RECORD

@@ -137,7 +137,7 @@ crawlee/request_loaders/_request_list.py,sha256=SIalHBMuFanE5GLnFocI0QCppWUiJQjr
 crawlee/request_loaders/_request_loader.py,sha256=2Bg-AWWkIV1W-Dwjqo91dPY8nmc7H3teQy7d6OSgliQ,3620
 crawlee/request_loaders/_request_manager.py,sha256=qFizyJuV2meIb9iiPfuii7ciuERMrp4SldAufiH46dc,3000
 crawlee/request_loaders/_request_manager_tandem.py,sha256=lv-s94KPsoQAqx1KaXFch96ejhO147uOflF3UK5ORTk,4058
-crawlee/request_loaders/_sitemap_request_loader.py,sha256=s65D_N0mZxeIrGJEjqUYfu1uYj2AXSOkmErSnfAHv2A,15554
+crawlee/request_loaders/_sitemap_request_loader.py,sha256=W1_k_Szrtk0iE2LJBkHrrFeDtcKReXzr3DG32EnQaQE,16565
 crawlee/sessions/__init__.py,sha256=dJdelbL-6MK5sW4SMU4QrjFbb9kRZ9uRnN-VS3R5-8Y,190
 crawlee/sessions/_cookies.py,sha256=ihYbmpXfCzClzXDT7M2wefB_3KVzcMUdIzTZo6uGk6Y,9356
 crawlee/sessions/_models.py,sha256=JMRQgDUP30XUdZ32isncHowOsXvK9jC_m9QYegbBI1E,2916
@@ -199,8 +199,8 @@ crawlee/storages/_request_queue.py,sha256=bjBOGbpMaGUsqJPVB-JD2VShziPAYMI-GvWKKp
 crawlee/storages/_storage_instance_manager.py,sha256=72n0YlPwNpSQDJSPf4TxnI2GvIK6L-ZiTmHRbFcoVU0,8164
 crawlee/storages/_utils.py,sha256=Yz-5tEBYKYCFJemYT29--uGJqoJLApLDLgPcsnbifRw,439
 crawlee/storages/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-crawlee-1.0.5b21.dist-info/METADATA,sha256=ERsOFxwDxPP0IVAwE8ZKLakRlDNcXKTYSi9ZKzFHCSQ,29533
-crawlee-1.0.5b21.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-crawlee-1.0.5b21.dist-info/entry_points.txt,sha256=1p65X3dA-cYvzjtlxLL6Kn1wpY-3uEDVqJLp53uNPeo,45
-crawlee-1.0.5b21.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
-crawlee-1.0.5b21.dist-info/RECORD,,
+crawlee-1.1.0.dist-info/METADATA,sha256=lSL2WVejm9D2tHBLJWJuobID9_WyglyirKeLsMzp1Z0,29530
+crawlee-1.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+crawlee-1.1.0.dist-info/entry_points.txt,sha256=1p65X3dA-cYvzjtlxLL6Kn1wpY-3uEDVqJLp53uNPeo,45
+crawlee-1.1.0.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
+crawlee-1.1.0.dist-info/RECORD,,