crawlee-1.0.5b20-py3-none-any.whl → crawlee-1.0.5b22-py3-none-any.whl

This diff compares the published contents of the two crawlee package versions as they appear in their public registry (PyPI), and is provided for informational purposes only.
--- a/crawlee/crawlers/_basic/_basic_crawler.py
+++ b/crawlee/crawlers/_basic/_basic_crawler.py
@@ -56,7 +56,7 @@ from crawlee.errors import (
     SessionError,
     UserDefinedErrorHandlerError,
 )
-from crawlee.events._types import Event, EventCrawlerStatusData, EventPersistStateData
+from crawlee.events._types import Event, EventCrawlerStatusData
 from crawlee.http_clients import ImpitHttpClient
 from crawlee.router import Router
 from crawlee.sessions import SessionPool
@@ -751,9 +751,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
        await self._autoscaled_pool.run()
 
-        # Emit PERSIST_STATE event when crawler is finishing to allow listeners to persist their state if needed
-        event_manager.emit(event=Event.PERSIST_STATE, event_data=EventPersistStateData(is_migrating=False))
-
    async def add_requests(
        self,
        requests: Sequence[str | Request],
--- a/crawlee/events/_event_manager.py
+++ b/crawlee/events/_event_manager.py
@@ -130,11 +130,13 @@ class EventManager:
        if not self._active:
            raise RuntimeError(f'The {self.__class__.__name__} is not active.')
 
+        # Stop persist state event periodic emission and manually emit last one to ensure latest state is saved.
+        await self._emit_persist_state_event_rec_task.stop()
+        await self._emit_persist_state_event()
        await self.wait_for_all_listeners_to_complete(timeout=self._close_timeout)
        self._event_emitter.remove_all_listeners()
        self._listener_tasks.clear()
        self._listeners_to_wrappers.clear()
-        await self._emit_persist_state_event_rec_task.stop()
        self._active = False
 
    @overload
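The final `PERSIST_STATE` emission thus moves from the end of `BasicCrawler`'s run (removed in the previous hunk) into `EventManager.close()`, so every registered listener gets one last call with up-to-date state before listeners are drained and removed. A minimal sketch of a listener that benefits from this, assuming the public `crawlee.events` exports; treat it as illustrative, not canonical:

```python
import asyncio

from crawlee.events import Event, EventPersistStateData, LocalEventManager


async def persist_my_state(event_data: EventPersistStateData) -> None:
    # Invoked on each periodic PERSIST_STATE emission and, after this change,
    # once more from EventManager.close() with the latest state.
    print(f'persisting state, is_migrating={event_data.is_migrating}')


async def main() -> None:
    async with LocalEventManager() as event_manager:
        event_manager.on(event=Event.PERSIST_STATE, listener=persist_my_state)
        await asyncio.sleep(1)  # stand-in for real crawler work
    # Leaving the context calls close(), which stops the periodic task and
    # emits the final PERSIST_STATE event before removing listeners.


asyncio.run(main())
```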
--- a/crawlee/request_loaders/_sitemap_request_loader.py
+++ b/crawlee/request_loaders/_sitemap_request_loader.py
@@ -9,7 +9,7 @@ from typing import TYPE_CHECKING, Annotated, Any
 from pydantic import BaseModel, ConfigDict, Field
 from typing_extensions import override
 
-from crawlee import Request
+from crawlee import Request, RequestOptions
 from crawlee._utils.docs import docs_group
 from crawlee._utils.globs import Glob
 from crawlee._utils.recoverable_state import RecoverableState
@@ -18,9 +18,10 @@ from crawlee.request_loaders._request_loader import RequestLoader
 
 if TYPE_CHECKING:
     import re
-    from collections.abc import Sequence
+    from collections.abc import Callable, Sequence
     from types import TracebackType
 
+    from crawlee import RequestTransformAction
     from crawlee.http_clients import HttpClient
     from crawlee.proxy_configuration import ProxyInfo
     from crawlee.storage_clients.models import ProcessedRequest
@@ -112,6 +113,7 @@ class SitemapRequestLoader(RequestLoader):
        exclude: list[re.Pattern[Any] | Glob] | None = None,
        max_buffer_size: int = 200,
        persist_state_key: str | None = None,
+        transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
    ) -> None:
        """Initialize the sitemap request loader.
 
@@ -125,6 +127,9 @@ class SitemapRequestLoader(RequestLoader):
            persist_state_key: A key for persisting the loader's state in the KeyValueStore.
                When provided, allows resuming from where it left off after interruption.
                If None, no state persistence occurs.
+            transform_request_function: An optional function to transform requests
+                generated by the loader. It receives `RequestOptions` with `url` and should return either
+                modified `RequestOptions` or a `RequestTransformAction`.
        """
        self._http_client = http_client
        self._sitemap_urls = sitemap_urls
@@ -132,6 +137,7 @@ class SitemapRequestLoader(RequestLoader):
        self._exclude = exclude
        self._proxy_info = proxy_info
        self._max_buffer_size = max_buffer_size
+        self._transform_request_function = transform_request_function
 
        # Synchronization for queue operations
        self._queue_has_capacity = asyncio.Event()
@@ -313,8 +319,15 @@ class SitemapRequestLoader(RequestLoader):
 
            async with self._queue_lock:
                url = state.url_queue.popleft()
-
-                request = Request.from_url(url)
+                request_option = RequestOptions(url=url)
+                if self._transform_request_function:
+                    transform_request_option = self._transform_request_function(request_option)
+                    if transform_request_option == 'skip':
+                        state.total_count -= 1
+                        continue
+                    if transform_request_option != 'unchanged':
+                        request_option = transform_request_option
+                request = Request.from_url(**request_option)
                state.in_progress.add(request.url)
                if len(state.url_queue) < self._max_buffer_size:
                    self._queue_has_capacity.set()
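Taken together, these hunks add a `transform_request_function` hook to `SitemapRequestLoader`: each URL popped from the sitemap queue is wrapped in `RequestOptions`, handed to the hook, and then either skipped (`'skip'`), passed through (`'unchanged'`), or built from the modified options. A usage sketch; the sitemap URL and the `/products/` filter are hypothetical:

```python
import asyncio

from crawlee import RequestOptions, RequestTransformAction
from crawlee.http_clients import ImpitHttpClient
from crawlee.request_loaders import SitemapRequestLoader


def transform(options: RequestOptions) -> RequestOptions | RequestTransformAction:
    # Drop non-product URLs entirely; the loader decrements its total count.
    if '/products/' not in options['url']:
        return 'skip'
    # Attach a label so matching requests can be routed to a dedicated handler.
    options['label'] = 'PRODUCT'
    return options


async def main() -> None:
    loader = SitemapRequestLoader(
        sitemap_urls=['https://example.com/sitemap.xml'],  # hypothetical URL
        http_client=ImpitHttpClient(),
        transform_request_function=transform,
    )
    while request := await loader.fetch_next_request():
        print(request.url, request.label)
        await loader.mark_request_as_handled(request)


asyncio.run(main())
```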
--- a/crawlee/statistics/_models.py
+++ b/crawlee/statistics/_models.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import json
+import warnings
 from dataclasses import asdict, dataclass
 from datetime import datetime, timedelta, timezone
 from typing import Annotated, Any
@@ -76,7 +77,6 @@ class StatisticsState(BaseModel):
    crawler_started_at: Annotated[datetime | None, Field(alias='crawlerStartedAt')] = None
    crawler_last_started_at: Annotated[datetime | None, Field(alias='crawlerLastStartTimestamp')] = None
    crawler_finished_at: Annotated[datetime | None, Field(alias='crawlerFinishedAt')] = None
-    crawler_runtime: Annotated[timedelta_ms, Field(alias='crawlerRuntimeMillis')] = timedelta()
    errors: dict[str, Any] = Field(default_factory=dict)
    retry_errors: dict[str, Any] = Field(alias='retryErrors', default_factory=dict)
    requests_with_status_code: dict[str, int] = Field(alias='requestsWithStatusCode', default_factory=dict)
@@ -93,6 +93,37 @@ class StatisticsState(BaseModel):
        ),
    ] = {}
 
+    # Used to track the crawler runtime that has already been persisted. This is the runtime from previous runs.
+    _runtime_offset: Annotated[timedelta, Field(exclude=True)] = timedelta()
+
+    def model_post_init(self, /, __context: Any) -> None:
+        self._runtime_offset = self.crawler_runtime or self._runtime_offset
+
+    @property
+    def crawler_runtime(self) -> timedelta:
+        if self.crawler_last_started_at:
+            finished_at = self.crawler_finished_at or datetime.now(timezone.utc)
+            return self._runtime_offset + finished_at - self.crawler_last_started_at
+        return self._runtime_offset
+
+    @crawler_runtime.setter
+    def crawler_runtime(self, value: timedelta) -> None:
+        # Setter for backwards compatibility only; crawler_runtime is now a computed field and can't be set manually.
+        # To be removed in the v2 release: https://github.com/apify/crawlee-python/issues/1567
+        warnings.warn(
+            f"Setting 'crawler_runtime' is deprecated and will be removed in a future version."
+            f' Value {value} will not be used.',
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+    @computed_field(alias='crawlerRuntimeMillis')
+    def crawler_runtime_for_serialization(self) -> timedelta:
+        if self.crawler_last_started_at:
+            finished_at = self.crawler_finished_at or datetime.now(timezone.utc)
+            return self._runtime_offset + finished_at - self.crawler_last_started_at
+        return self._runtime_offset
+
    @computed_field(alias='requestTotalDurationMillis', return_type=timedelta_ms)  # type: ignore[prop-decorator]
    @property
    def request_total_duration(self) -> timedelta:
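The runtime bookkeeping now lives in `StatisticsState` itself: `_runtime_offset` captures whatever runtime was deserialized from a previous run (via `model_post_init`), and the current run's share is derived from `crawler_last_started_at` and `crawler_finished_at`, falling back to `datetime.now(timezone.utc)` while the run is still going. A worked sketch of the arithmetic with made-up timestamps:

```python
from datetime import datetime, timedelta, timezone

# Hypothetical values: 90 s accumulated across previous persisted runs,
# plus a current run from 12:00 to 12:05 UTC.
runtime_offset = timedelta(seconds=90)
last_started_at = datetime(2025, 1, 1, 12, 0, tzinfo=timezone.utc)
finished_at = datetime(2025, 1, 1, 12, 5, tzinfo=timezone.utc)

# Same formula as the crawler_runtime property above; while a run is active,
# finished_at is replaced by datetime.now(timezone.utc) on every read.
crawler_runtime = runtime_offset + finished_at - last_started_at
assert crawler_runtime == timedelta(minutes=6, seconds=30)
```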
--- a/crawlee/statistics/_statistics.py
+++ b/crawlee/statistics/_statistics.py
@@ -110,9 +110,6 @@ class Statistics(Generic[TStatisticsState]):
        # Flag to indicate the context state.
        self._active = False
 
-        # Pre-existing runtime offset, that can be non-zero when restoring serialized state from KVS.
-        self._runtime_offset = timedelta(seconds=0)
-
    def replace_state_model(self, state_model: type[TNewStatisticsState]) -> Statistics[TNewStatisticsState]:
        """Create near copy of the `Statistics` with replaced `state_model`."""
        new_statistics: Statistics[TNewStatisticsState] = Statistics(
@@ -168,8 +165,8 @@ class Statistics(Generic[TStatisticsState]):
            raise RuntimeError(f'The {self.__class__.__name__} is already active.')
 
        await self._state.initialize()
-
-        self._runtime_offset = self.state.crawler_runtime
+        # Reset `crawler_finished_at` to indicate a new run in progress.
+        self.state.crawler_finished_at = None
 
        # Start periodic logging and let it print initial state before activation.
        self._periodic_logger.start()
@@ -200,10 +197,6 @@ class Statistics(Generic[TStatisticsState]):
        # Stop logging and deactivate the statistics to prevent further changes to crawler_runtime
        await self._periodic_logger.stop()
        self.state.crawler_finished_at = datetime.now(timezone.utc)
-        self.state.crawler_runtime = (
-            self._runtime_offset + self.state.crawler_finished_at - self.state.crawler_last_started_at
-        )
-
        self._active = False
        await self._state.teardown()
 
@@ -262,20 +255,8 @@ class Statistics(Generic[TStatisticsState]):
 
        del self._requests_in_progress[request_id_or_key]
 
-    def _update_crawler_runtime(self) -> None:
-        current_run_duration = (
-            (datetime.now(timezone.utc) - self.state.crawler_last_started_at)
-            if self.state.crawler_last_started_at
-            else timedelta()
-        )
-        self.state.crawler_runtime = current_run_duration + self._runtime_offset
-
    def calculate(self) -> FinalStatistics:
        """Calculate the current statistics."""
-        if self._active:
-            # Only update state when active. If not, just report the last known runtime.
-            self._update_crawler_runtime()
-
        total_minutes = self.state.crawler_runtime.total_seconds() / 60
        state = self._state.current_value
        serialized_state = state.model_dump(by_alias=False)
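Because `crawler_runtime` is now computed on every read, `calculate()` no longer needs to write the runtime back into state first, and the `_update_crawler_runtime` helper goes away. A hedged usage sketch, assuming a configured crawler instance that exposes its `Statistics` object:

```python
# `crawler` is assumed to be any configured BasicCrawler subclass.
stats = crawler.statistics

final = stats.calculate()           # pure read now; no state mutation
print(final.crawler_runtime)        # includes runtime restored from prior runs
print(stats.state.crawler_runtime)  # recomputed on each access via the property
```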
--- a/crawlee-1.0.5b20.dist-info/METADATA
+++ b/crawlee-1.0.5b22.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: crawlee
-Version: 1.0.5b20
+Version: 1.0.5b22
 Summary: Crawlee for Python
 Project-URL: Apify Homepage, https://apify.com
 Project-URL: Changelog, https://crawlee.dev/python/docs/changelog
--- a/crawlee-1.0.5b20.dist-info/RECORD
+++ b/crawlee-1.0.5b22.dist-info/RECORD
@@ -65,7 +65,7 @@ crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py,sha256=TM4mkb
 crawlee/crawlers/_adaptive_playwright/_result_comparator.py,sha256=NAfw5VKzTnkvARtLr_zrZj6UGeMp05Voc6Oi8oPxU3w,1747
 crawlee/crawlers/_adaptive_playwright/_utils.py,sha256=EUYVz5i2YkLpL_gbVRp9BAD5u6w1xJ_AFzc_qB9bdDQ,1102
 crawlee/crawlers/_basic/__init__.py,sha256=LPln8SiBBXSMqrApiFUfpqz3hvqxN5HUa1cHQXMVKgU,280
-crawlee/crawlers/_basic/_basic_crawler.py,sha256=yZ_A_l9Dux9Y2eYa9XbN3c7h-3YO7MgGmJbzCMbCplg,73257
+crawlee/crawlers/_basic/_basic_crawler.py,sha256=-lo9yMjPkT8fU4ndOMaeEiwWHgu-DHw9Fny6f1kPPkk,73004
 crawlee/crawlers/_basic/_basic_crawling_context.py,sha256=fjxm2RQXMDkDlWu38dQ3xn5rrGUOhJXkXiqkgbFJFk4,155
 crawlee/crawlers/_basic/_context_pipeline.py,sha256=vM8EEvnCoguERjRV3oyrxUq2Ln2F9DzY7P5dAEiuMHo,5869
 crawlee/crawlers/_basic/_logging_utils.py,sha256=jp5mEwSq5a_BgzUhNPJ9WrIDcoIeYGbeHstcRqCcP0s,3093
@@ -92,7 +92,7 @@ crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py,sha256=fEI2
 crawlee/crawlers/_playwright/_types.py,sha256=hMKA9K9gjzQuwwbnmmfJsQrwR-kq235jH-WBXWeqkGo,2174
 crawlee/crawlers/_playwright/_utils.py,sha256=FQ_-LYo7DGHsNHRrTtWt3mC06VzQvQ2wkGqpA2wBzYU,3441
 crawlee/events/__init__.py,sha256=YMgOXKI0LsXfImKQy06PZ2Vdjy-uD_-acioagHft1do,577
-crawlee/events/_event_manager.py,sha256=kP5_zO2JmFReWnCZqg_uSS1kSV4reem2XN3oRY-SGcI,11428
+crawlee/events/_event_manager.py,sha256=wjZTYIKBI8daKUkOVxUrbPHuU8LnFpUtWStdkts7r3U,11588
 crawlee/events/_local_event_manager.py,sha256=CSiMJ6a_BwX0PPwtffEOtHm21dmALJz1zifo3AuMAk8,3708
 crawlee/events/_types.py,sha256=MKsI014OOKKhjPJRrvWYrezIDGoLjGGhWXrkqYw26Ns,3313
 crawlee/events/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -137,7 +137,7 @@ crawlee/request_loaders/_request_list.py,sha256=SIalHBMuFanE5GLnFocI0QCppWUiJQjr
 crawlee/request_loaders/_request_loader.py,sha256=2Bg-AWWkIV1W-Dwjqo91dPY8nmc7H3teQy7d6OSgliQ,3620
 crawlee/request_loaders/_request_manager.py,sha256=qFizyJuV2meIb9iiPfuii7ciuERMrp4SldAufiH46dc,3000
 crawlee/request_loaders/_request_manager_tandem.py,sha256=lv-s94KPsoQAqx1KaXFch96ejhO147uOflF3UK5ORTk,4058
-crawlee/request_loaders/_sitemap_request_loader.py,sha256=s65D_N0mZxeIrGJEjqUYfu1uYj2AXSOkmErSnfAHv2A,15554
+crawlee/request_loaders/_sitemap_request_loader.py,sha256=W1_k_Szrtk0iE2LJBkHrrFeDtcKReXzr3DG32EnQaQE,16565
 crawlee/sessions/__init__.py,sha256=dJdelbL-6MK5sW4SMU4QrjFbb9kRZ9uRnN-VS3R5-8Y,190
 crawlee/sessions/_cookies.py,sha256=ihYbmpXfCzClzXDT7M2wefB_3KVzcMUdIzTZo6uGk6Y,9356
 crawlee/sessions/_models.py,sha256=JMRQgDUP30XUdZ32isncHowOsXvK9jC_m9QYegbBI1E,2916
@@ -147,8 +147,8 @@ crawlee/sessions/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 crawlee/statistics/__init__.py,sha256=lXAsHNkeRZQBffW1B7rERarivXIUJveNlcKTGOXQZY0,154
 crawlee/statistics/_error_snapshotter.py,sha256=g-roZgkJ-glyStZL7gXrOhrpdZvZ686W9lR43uZjPao,3279
 crawlee/statistics/_error_tracker.py,sha256=x9Yw1TuyEptjwgPPJ4gIom-0oVjawcNReQDsHH2nZ3w,8553
-crawlee/statistics/_models.py,sha256=SFWYpT3r1c4XugU8nrm0epTpcM5_0fS1mXi9fnbhGJ8,5237
-crawlee/statistics/_statistics.py,sha256=AnxbVq6w8fuiRumUJMznhTtQmtONyF4pzqrFYgO4yjo,13076
+crawlee/statistics/_models.py,sha256=n4sT35D4dqNPYREl8Q_YXANZtxaWC0HaZizobA4qK_c,6674
+crawlee/statistics/_statistics.py,sha256=mSgnCnV7q2buJdyuXBxbUU9MQEUjxaLST_NO4ej3XRw,12341
 crawlee/storage_clients/__init__.py,sha256=X3M6Z_WAOJ3M9I8JhGhJDnrtbCOmM27DpGAzgt87R2A,874
 crawlee/storage_clients/models.py,sha256=gfW_kpSCOBuoTBIW0N7tb3FUv7BgD3keZADS7pyT4_I,6586
 crawlee/storage_clients/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -199,8 +199,8 @@ crawlee/storages/_request_queue.py,sha256=bjBOGbpMaGUsqJPVB-JD2VShziPAYMI-GvWKKp
 crawlee/storages/_storage_instance_manager.py,sha256=72n0YlPwNpSQDJSPf4TxnI2GvIK6L-ZiTmHRbFcoVU0,8164
 crawlee/storages/_utils.py,sha256=Yz-5tEBYKYCFJemYT29--uGJqoJLApLDLgPcsnbifRw,439
 crawlee/storages/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-crawlee-1.0.5b20.dist-info/METADATA,sha256=Lxk3f7BsX5OyPPnvQNVgvj-9XR6TphvJRaSg7ZriFZU,29533
-crawlee-1.0.5b20.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-crawlee-1.0.5b20.dist-info/entry_points.txt,sha256=1p65X3dA-cYvzjtlxLL6Kn1wpY-3uEDVqJLp53uNPeo,45
-crawlee-1.0.5b20.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
-crawlee-1.0.5b20.dist-info/RECORD,,
+crawlee-1.0.5b22.dist-info/METADATA,sha256=GOYPQgm__OcaRBA8drb9Ay30Ddois3nthwX-xMME6As,29533
+crawlee-1.0.5b22.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+crawlee-1.0.5b22.dist-info/entry_points.txt,sha256=1p65X3dA-cYvzjtlxLL6Kn1wpY-3uEDVqJLp53uNPeo,45
+crawlee-1.0.5b22.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
+crawlee-1.0.5b22.dist-info/RECORD,,