crawlee 1.0.4b6__py3-none-any.whl → 1.0.4b7__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlee might be problematic. Click here for more details.

@@ -7,6 +7,9 @@ from typing import TYPE_CHECKING
7
7
  if TYPE_CHECKING:
8
8
  from collections.abc import Callable
9
9
  from datetime import timedelta
10
+ from types import TracebackType
11
+
12
+ from typing_extensions import Self
10
13
 
11
14
  logger = getLogger(__name__)
12
15
 
@@ -26,6 +29,18 @@ class RecurringTask:
26
29
  self.delay = delay
27
30
  self.task: asyncio.Task | None = None
28
31
 
32
+ async def __aenter__(self) -> Self:
33
+ self.start()
34
+ return self
35
+
36
+ async def __aexit__(
37
+ self,
38
+ exc_type: type[BaseException] | None,
39
+ exc_value: BaseException | None,
40
+ exc_traceback: TracebackType | None,
41
+ ) -> None:
42
+ await self.stop()
43
+
29
44
  async def _wrapper(self) -> None:
30
45
  """Continuously execute the provided function with the specified delay.
31
46
 
@@ -56,7 +56,7 @@ from crawlee.errors import (
56
56
  SessionError,
57
57
  UserDefinedErrorHandlerError,
58
58
  )
59
- from crawlee.events._types import Event, EventCrawlerStatusData
59
+ from crawlee.events._types import Event, EventCrawlerStatusData, EventPersistStateData
60
60
  from crawlee.http_clients import ImpitHttpClient
61
61
  from crawlee.router import Router
62
62
  from crawlee.sessions import SessionPool
@@ -437,14 +437,23 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
437
437
  self._statistics_log_format = statistics_log_format
438
438
 
439
439
  # Statistics
440
- self._statistics = statistics or cast(
441
- 'Statistics[TStatisticsState]',
442
- Statistics.with_default_state(
443
- periodic_message_logger=self._logger,
444
- statistics_log_format=self._statistics_log_format,
445
- log_message='Current request statistics:',
446
- ),
447
- )
440
+ if statistics:
441
+ self._statistics = statistics
442
+ else:
443
+
444
+ async def persist_state_factory() -> KeyValueStore:
445
+ return await self.get_key_value_store()
446
+
447
+ self._statistics = cast(
448
+ 'Statistics[TStatisticsState]',
449
+ Statistics.with_default_state(
450
+ persistence_enabled=True,
451
+ periodic_message_logger=self._logger,
452
+ statistics_log_format=self._statistics_log_format,
453
+ log_message='Current request statistics:',
454
+ persist_state_kvs_factory=persist_state_factory,
455
+ ),
456
+ )
448
457
 
449
458
  # Additional context managers to enter and exit
450
459
  self._additional_context_managers = _additional_context_managers or []
@@ -689,7 +698,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
689
698
  except CancelledError:
690
699
  pass
691
700
  finally:
692
- await self._crawler_state_rec_task.stop()
693
701
  if threading.current_thread() is threading.main_thread():
694
702
  with suppress(NotImplementedError):
695
703
  asyncio.get_running_loop().remove_signal_handler(signal.SIGINT)
@@ -721,8 +729,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
721
729
  async def _run_crawler(self) -> None:
722
730
  event_manager = self._service_locator.get_event_manager()
723
731
 
724
- self._crawler_state_rec_task.start()
725
-
726
732
  # Collect the context managers to be entered. Context managers that are already active are excluded,
727
733
  # as they were likely entered by the caller, who will also be responsible for exiting them.
728
734
  contexts_to_enter = [
@@ -733,6 +739,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
733
739
  self._statistics,
734
740
  self._session_pool if self._use_session_pool else None,
735
741
  self._http_client,
742
+ self._crawler_state_rec_task,
736
743
  *self._additional_context_managers,
737
744
  )
738
745
  if cm and getattr(cm, 'active', False) is False
@@ -744,6 +751,9 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
744
751
 
745
752
  await self._autoscaled_pool.run()
746
753
 
754
+ # Emit PERSIST_STATE event when crawler is finishing to allow listeners to persist their state if needed
755
+ event_manager.emit(event=Event.PERSIST_STATE, event_data=EventPersistStateData(is_migrating=False))
756
+
747
757
  async def add_requests(
748
758
  self,
749
759
  requests: Sequence[str | Request],
@@ -96,7 +96,7 @@ class Statistics(Generic[TStatisticsState]):
96
96
 
97
97
  self._state = RecoverableState(
98
98
  default_state=state_model(stats_id=self._id),
99
- persist_state_key=persist_state_key or f'SDK_CRAWLER_STATISTICS_{self._id}',
99
+ persist_state_key=persist_state_key or f'__CRAWLER_STATISTICS_{self._id}',
100
100
  persistence_enabled=persistence_enabled,
101
101
  persist_state_kvs_name=persist_state_kvs_name,
102
102
  persist_state_kvs_factory=persist_state_kvs_factory,
@@ -130,6 +130,7 @@ class Statistics(Generic[TStatisticsState]):
130
130
  persistence_enabled: bool = False,
131
131
  persist_state_kvs_name: str | None = None,
132
132
  persist_state_key: str | None = None,
133
+ persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
133
134
  log_message: str = 'Statistics',
134
135
  periodic_message_logger: Logger | None = None,
135
136
  log_interval: timedelta = timedelta(minutes=1),
@@ -141,6 +142,7 @@ class Statistics(Generic[TStatisticsState]):
141
142
  persistence_enabled=persistence_enabled,
142
143
  persist_state_kvs_name=persist_state_kvs_name,
143
144
  persist_state_key=persist_state_key,
145
+ persist_state_kvs_factory=persist_state_kvs_factory,
144
146
  log_message=log_message,
145
147
  periodic_message_logger=periodic_message_logger,
146
148
  log_interval=log_interval,
@@ -187,7 +189,10 @@ class Statistics(Generic[TStatisticsState]):
187
189
  if not self._active:
188
190
  raise RuntimeError(f'The {self.__class__.__name__} is not active.')
189
191
 
190
- self._state.current_value.crawler_finished_at = datetime.now(timezone.utc)
192
+ if not self.state.crawler_last_started_at:
193
+ raise RuntimeError('Statistics.state.crawler_last_started_at not set.')
194
+ self.state.crawler_finished_at = datetime.now(timezone.utc)
195
+ self.state.crawler_runtime += self.state.crawler_finished_at - self.state.crawler_last_started_at
191
196
 
192
197
  await self._state.teardown()
193
198
 
@@ -255,8 +260,7 @@ class Statistics(Generic[TStatisticsState]):
255
260
  if self._instance_start is None:
256
261
  raise RuntimeError('The Statistics object is not initialized')
257
262
 
258
- crawler_runtime = datetime.now(timezone.utc) - self._instance_start
259
- total_minutes = crawler_runtime.total_seconds() / 60
263
+ total_minutes = self.state.crawler_runtime.total_seconds() / 60
260
264
  state = self._state.current_value
261
265
  serialized_state = state.model_dump(by_alias=False)
262
266
 
@@ -267,7 +271,7 @@ class Statistics(Generic[TStatisticsState]):
267
271
  requests_failed_per_minute=math.floor(state.requests_failed / total_minutes) if total_minutes else 0,
268
272
  request_total_duration=state.request_total_finished_duration + state.request_total_failed_duration,
269
273
  requests_total=state.requests_failed + state.requests_finished,
270
- crawler_runtime=crawler_runtime,
274
+ crawler_runtime=state.crawler_runtime,
271
275
  requests_finished=state.requests_finished,
272
276
  requests_failed=state.requests_failed,
273
277
  retry_histogram=serialized_state['request_retry_histogram'],
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: crawlee
3
- Version: 1.0.4b6
3
+ Version: 1.0.4b7
4
4
  Summary: Crawlee for Python
5
5
  Project-URL: Apify Homepage, https://apify.com
6
6
  Project-URL: Changelog, https://crawlee.dev/python/docs/changelog
@@ -30,7 +30,7 @@ crawlee/_utils/html_to_text.py,sha256=1iykT-OXd2xXNy7isHVWHqPxe23X82CGQBHIfbZbZk
30
30
  crawlee/_utils/models.py,sha256=EqM50Uc-xvxKlLCLA2lPpRduzfKvT0z_-Q-UWG8aTRQ,1955
31
31
  crawlee/_utils/raise_if_too_many_kwargs.py,sha256=J2gaUJmsmNwexohuehXw_mdYKv-eWiui6WUHFsQ3qTQ,597
32
32
  crawlee/_utils/recoverable_state.py,sha256=c1D2ZecxEliGZzhqYz9_oU5CF2Hm0UKvpOHqO6CDJRE,9032
33
- crawlee/_utils/recurring_task.py,sha256=sA0n4Cf9pYLQyBD9PZ7QbR6m6KphlbkACaT2GdbLfs4,1757
33
+ crawlee/_utils/recurring_task.py,sha256=sQMiURuDXbwwfAcIXK8V4NXncSxIBxsqN1cZWX7DLyg,2128
34
34
  crawlee/_utils/requests.py,sha256=yOjai7bHR9_duPJ0ck-L76y9AnKZr49JBfSOQv9kvJc,5048
35
35
  crawlee/_utils/robots.py,sha256=k3Yi2OfKT0H04MPkP-OBGGV7fEePgOqb60awltjMYWY,4346
36
36
  crawlee/_utils/sitemap.py,sha256=UI9EJiFiyFvV5_flVUtdsEVz8ZsJeRERPtcx8ZsqjTU,16632
@@ -65,7 +65,7 @@ crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py,sha256=TM4mkb
65
65
  crawlee/crawlers/_adaptive_playwright/_result_comparator.py,sha256=NAfw5VKzTnkvARtLr_zrZj6UGeMp05Voc6Oi8oPxU3w,1747
66
66
  crawlee/crawlers/_adaptive_playwright/_utils.py,sha256=EUYVz5i2YkLpL_gbVRp9BAD5u6w1xJ_AFzc_qB9bdDQ,1102
67
67
  crawlee/crawlers/_basic/__init__.py,sha256=LPln8SiBBXSMqrApiFUfpqz3hvqxN5HUa1cHQXMVKgU,280
68
- crawlee/crawlers/_basic/_basic_crawler.py,sha256=_y8SQHGUbhxGOd5KXHBzTQ0TLq7lXufoaiGcn11X2Kc,72724
68
+ crawlee/crawlers/_basic/_basic_crawler.py,sha256=yZ_A_l9Dux9Y2eYa9XbN3c7h-3YO7MgGmJbzCMbCplg,73257
69
69
  crawlee/crawlers/_basic/_basic_crawling_context.py,sha256=fjxm2RQXMDkDlWu38dQ3xn5rrGUOhJXkXiqkgbFJFk4,155
70
70
  crawlee/crawlers/_basic/_context_pipeline.py,sha256=vM8EEvnCoguERjRV3oyrxUq2Ln2F9DzY7P5dAEiuMHo,5869
71
71
  crawlee/crawlers/_basic/_logging_utils.py,sha256=jp5mEwSq5a_BgzUhNPJ9WrIDcoIeYGbeHstcRqCcP0s,3093
@@ -148,7 +148,7 @@ crawlee/statistics/__init__.py,sha256=lXAsHNkeRZQBffW1B7rERarivXIUJveNlcKTGOXQZY
148
148
  crawlee/statistics/_error_snapshotter.py,sha256=ChBBG0gIMWcSeyEzs3jQf3mSnHLZUHcD284wEDan1Js,3278
149
149
  crawlee/statistics/_error_tracker.py,sha256=x9Yw1TuyEptjwgPPJ4gIom-0oVjawcNReQDsHH2nZ3w,8553
150
150
  crawlee/statistics/_models.py,sha256=SFWYpT3r1c4XugU8nrm0epTpcM5_0fS1mXi9fnbhGJ8,5237
151
- crawlee/statistics/_statistics.py,sha256=vp8swl1yt4lBi2W0YyaI_xKCrRku0remI4BLx90q7-Y,12455
151
+ crawlee/statistics/_statistics.py,sha256=d6z5XxXm-an4M_8TierOPpSB78vxqxwvUFCewIEmiK4,12786
152
152
  crawlee/storage_clients/__init__.py,sha256=RCnutWMOqs_kUQpzfLVT5jgpHGWakLv557c6UIYFQsA,754
153
153
  crawlee/storage_clients/models.py,sha256=gfW_kpSCOBuoTBIW0N7tb3FUv7BgD3keZADS7pyT4_I,6586
154
154
  crawlee/storage_clients/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -187,8 +187,8 @@ crawlee/storages/_request_queue.py,sha256=bjBOGbpMaGUsqJPVB-JD2VShziPAYMI-GvWKKp
187
187
  crawlee/storages/_storage_instance_manager.py,sha256=72n0YlPwNpSQDJSPf4TxnI2GvIK6L-ZiTmHRbFcoVU0,8164
188
188
  crawlee/storages/_utils.py,sha256=Yz-5tEBYKYCFJemYT29--uGJqoJLApLDLgPcsnbifRw,439
189
189
  crawlee/storages/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
190
- crawlee-1.0.4b6.dist-info/METADATA,sha256=krz9PPrQuGh8tNo-gQbr2eCKo_XlYlYWgyyE47Y6C88,29314
191
- crawlee-1.0.4b6.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
192
- crawlee-1.0.4b6.dist-info/entry_points.txt,sha256=1p65X3dA-cYvzjtlxLL6Kn1wpY-3uEDVqJLp53uNPeo,45
193
- crawlee-1.0.4b6.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
194
- crawlee-1.0.4b6.dist-info/RECORD,,
190
+ crawlee-1.0.4b7.dist-info/METADATA,sha256=tZyfbl6piSNoS2rrGMuxlrM9EhIc4HNBFvU9RwW8ZAc,29314
191
+ crawlee-1.0.4b7.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
192
+ crawlee-1.0.4b7.dist-info/entry_points.txt,sha256=1p65X3dA-cYvzjtlxLL6Kn1wpY-3uEDVqJLp53uNPeo,45
193
+ crawlee-1.0.4b7.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
194
+ crawlee-1.0.4b7.dist-info/RECORD,,