crawlee 1.0.5b19__py3-none-any.whl → 1.0.5b21__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
@@ -56,7 +56,7 @@ from crawlee.errors import (
56
56
  SessionError,
57
57
  UserDefinedErrorHandlerError,
58
58
  )
59
- from crawlee.events._types import Event, EventCrawlerStatusData, EventPersistStateData
59
+ from crawlee.events._types import Event, EventCrawlerStatusData
60
60
  from crawlee.http_clients import ImpitHttpClient
61
61
  from crawlee.router import Router
62
62
  from crawlee.sessions import SessionPool
@@ -751,9 +751,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
751
751
 
752
752
  await self._autoscaled_pool.run()
753
753
 
754
- # Emit PERSIST_STATE event when crawler is finishing to allow listeners to persist their state if needed
755
- event_manager.emit(event=Event.PERSIST_STATE, event_data=EventPersistStateData(is_migrating=False))
756
-
757
754
  async def add_requests(
758
755
  self,
759
756
  requests: Sequence[str | Request],
@@ -130,11 +130,13 @@ class EventManager:
130
130
  if not self._active:
131
131
  raise RuntimeError(f'The {self.__class__.__name__} is not active.')
132
132
 
133
+ # Stop persist state event periodic emission and manually emit last one to ensure latest state is saved.
134
+ await self._emit_persist_state_event_rec_task.stop()
135
+ await self._emit_persist_state_event()
133
136
  await self.wait_for_all_listeners_to_complete(timeout=self._close_timeout)
134
137
  self._event_emitter.remove_all_listeners()
135
138
  self._listener_tasks.clear()
136
139
  self._listeners_to_wrappers.clear()
137
- await self._emit_persist_state_event_rec_task.stop()
138
140
  self._active = False
139
141
 
140
142
  @overload
@@ -1,6 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import json
4
+ import warnings
4
5
  from dataclasses import asdict, dataclass
5
6
  from datetime import datetime, timedelta, timezone
6
7
  from typing import Annotated, Any
@@ -76,7 +77,6 @@ class StatisticsState(BaseModel):
76
77
  crawler_started_at: Annotated[datetime | None, Field(alias='crawlerStartedAt')] = None
77
78
  crawler_last_started_at: Annotated[datetime | None, Field(alias='crawlerLastStartTimestamp')] = None
78
79
  crawler_finished_at: Annotated[datetime | None, Field(alias='crawlerFinishedAt')] = None
79
- crawler_runtime: Annotated[timedelta_ms, Field(alias='crawlerRuntimeMillis')] = timedelta()
80
80
  errors: dict[str, Any] = Field(default_factory=dict)
81
81
  retry_errors: dict[str, Any] = Field(alias='retryErrors', default_factory=dict)
82
82
  requests_with_status_code: dict[str, int] = Field(alias='requestsWithStatusCode', default_factory=dict)
@@ -93,6 +93,37 @@ class StatisticsState(BaseModel):
93
93
  ),
94
94
  ] = {}
95
95
 
96
+ # Used to track the crawler runtime, that had already been persisted. This is the runtime from previous runs.
97
+ _runtime_offset: Annotated[timedelta, Field(exclude=True)] = timedelta()
98
+
99
+ def model_post_init(self, /, __context: Any) -> None:
100
+ self._runtime_offset = self.crawler_runtime or self._runtime_offset
101
+
102
+ @property
103
+ def crawler_runtime(self) -> timedelta:
104
+ if self.crawler_last_started_at:
105
+ finished_at = self.crawler_finished_at or datetime.now(timezone.utc)
106
+ return self._runtime_offset + finished_at - self.crawler_last_started_at
107
+ return self._runtime_offset
108
+
109
+ @crawler_runtime.setter
110
+ def crawler_runtime(self, value: timedelta) -> None:
111
+ # Setter for backwards compatibility only, the crawler_runtime is now computed_field, and cant be set manually.
112
+ # To be removed in v2 release https://github.com/apify/crawlee-python/issues/1567
113
+ warnings.warn(
114
+ f"Setting 'crawler_runtime' is deprecated and will be removed in a future version."
115
+ f' Value {value} will not be used.',
116
+ DeprecationWarning,
117
+ stacklevel=2,
118
+ )
119
+
120
+ @computed_field(alias='crawlerRuntimeMillis')
121
+ def crawler_runtime_for_serialization(self) -> timedelta:
122
+ if self.crawler_last_started_at:
123
+ finished_at = self.crawler_finished_at or datetime.now(timezone.utc)
124
+ return self._runtime_offset + finished_at - self.crawler_last_started_at
125
+ return self._runtime_offset
126
+
96
127
  @computed_field(alias='requestTotalDurationMillis', return_type=timedelta_ms) # type: ignore[prop-decorator]
97
128
  @property
98
129
  def request_total_duration(self) -> timedelta:
@@ -110,9 +110,6 @@ class Statistics(Generic[TStatisticsState]):
110
110
  # Flag to indicate the context state.
111
111
  self._active = False
112
112
 
113
- # Pre-existing runtime offset, that can be non-zero when restoring serialized state from KVS.
114
- self._runtime_offset = timedelta(seconds=0)
115
-
116
113
  def replace_state_model(self, state_model: type[TNewStatisticsState]) -> Statistics[TNewStatisticsState]:
117
114
  """Create near copy of the `Statistics` with replaced `state_model`."""
118
115
  new_statistics: Statistics[TNewStatisticsState] = Statistics(
@@ -168,8 +165,8 @@ class Statistics(Generic[TStatisticsState]):
168
165
  raise RuntimeError(f'The {self.__class__.__name__} is already active.')
169
166
 
170
167
  await self._state.initialize()
171
-
172
- self._runtime_offset = self.state.crawler_runtime
168
+ # Reset `crawler_finished_at` to indicate a new run in progress.
169
+ self.state.crawler_finished_at = None
173
170
 
174
171
  # Start periodic logging and let it print initial state before activation.
175
172
  self._periodic_logger.start()
@@ -200,10 +197,6 @@ class Statistics(Generic[TStatisticsState]):
200
197
  # Stop logging and deactivate the statistics to prevent further changes to crawler_runtime
201
198
  await self._periodic_logger.stop()
202
199
  self.state.crawler_finished_at = datetime.now(timezone.utc)
203
- self.state.crawler_runtime = (
204
- self._runtime_offset + self.state.crawler_finished_at - self.state.crawler_last_started_at
205
- )
206
-
207
200
  self._active = False
208
201
  await self._state.teardown()
209
202
 
@@ -262,20 +255,8 @@ class Statistics(Generic[TStatisticsState]):
262
255
 
263
256
  del self._requests_in_progress[request_id_or_key]
264
257
 
265
- def _update_crawler_runtime(self) -> None:
266
- current_run_duration = (
267
- (datetime.now(timezone.utc) - self.state.crawler_last_started_at)
268
- if self.state.crawler_last_started_at
269
- else timedelta()
270
- )
271
- self.state.crawler_runtime = current_run_duration + self._runtime_offset
272
-
273
258
  def calculate(self) -> FinalStatistics:
274
259
  """Calculate the current statistics."""
275
- if self._active:
276
- # Only update state when active. If not, just report the last known runtime.
277
- self._update_crawler_runtime()
278
-
279
260
  total_minutes = self.state.crawler_runtime.total_seconds() / 60
280
261
  state = self._state.current_value
281
262
  serialized_state = state.model_dump(by_alias=False)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: crawlee
3
- Version: 1.0.5b19
3
+ Version: 1.0.5b21
4
4
  Summary: Crawlee for Python
5
5
  Project-URL: Apify Homepage, https://apify.com
6
6
  Project-URL: Changelog, https://crawlee.dev/python/docs/changelog
@@ -65,7 +65,7 @@ crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py,sha256=TM4mkb
65
65
  crawlee/crawlers/_adaptive_playwright/_result_comparator.py,sha256=NAfw5VKzTnkvARtLr_zrZj6UGeMp05Voc6Oi8oPxU3w,1747
66
66
  crawlee/crawlers/_adaptive_playwright/_utils.py,sha256=EUYVz5i2YkLpL_gbVRp9BAD5u6w1xJ_AFzc_qB9bdDQ,1102
67
67
  crawlee/crawlers/_basic/__init__.py,sha256=LPln8SiBBXSMqrApiFUfpqz3hvqxN5HUa1cHQXMVKgU,280
68
- crawlee/crawlers/_basic/_basic_crawler.py,sha256=yZ_A_l9Dux9Y2eYa9XbN3c7h-3YO7MgGmJbzCMbCplg,73257
68
+ crawlee/crawlers/_basic/_basic_crawler.py,sha256=-lo9yMjPkT8fU4ndOMaeEiwWHgu-DHw9Fny6f1kPPkk,73004
69
69
  crawlee/crawlers/_basic/_basic_crawling_context.py,sha256=fjxm2RQXMDkDlWu38dQ3xn5rrGUOhJXkXiqkgbFJFk4,155
70
70
  crawlee/crawlers/_basic/_context_pipeline.py,sha256=vM8EEvnCoguERjRV3oyrxUq2Ln2F9DzY7P5dAEiuMHo,5869
71
71
  crawlee/crawlers/_basic/_logging_utils.py,sha256=jp5mEwSq5a_BgzUhNPJ9WrIDcoIeYGbeHstcRqCcP0s,3093
@@ -92,7 +92,7 @@ crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py,sha256=fEI2
92
92
  crawlee/crawlers/_playwright/_types.py,sha256=hMKA9K9gjzQuwwbnmmfJsQrwR-kq235jH-WBXWeqkGo,2174
93
93
  crawlee/crawlers/_playwright/_utils.py,sha256=FQ_-LYo7DGHsNHRrTtWt3mC06VzQvQ2wkGqpA2wBzYU,3441
94
94
  crawlee/events/__init__.py,sha256=YMgOXKI0LsXfImKQy06PZ2Vdjy-uD_-acioagHft1do,577
95
- crawlee/events/_event_manager.py,sha256=kP5_zO2JmFReWnCZqg_uSS1kSV4reem2XN3oRY-SGcI,11428
95
+ crawlee/events/_event_manager.py,sha256=wjZTYIKBI8daKUkOVxUrbPHuU8LnFpUtWStdkts7r3U,11588
96
96
  crawlee/events/_local_event_manager.py,sha256=CSiMJ6a_BwX0PPwtffEOtHm21dmALJz1zifo3AuMAk8,3708
97
97
  crawlee/events/_types.py,sha256=MKsI014OOKKhjPJRrvWYrezIDGoLjGGhWXrkqYw26Ns,3313
98
98
  crawlee/events/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -147,8 +147,8 @@ crawlee/sessions/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
147
147
  crawlee/statistics/__init__.py,sha256=lXAsHNkeRZQBffW1B7rERarivXIUJveNlcKTGOXQZY0,154
148
148
  crawlee/statistics/_error_snapshotter.py,sha256=g-roZgkJ-glyStZL7gXrOhrpdZvZ686W9lR43uZjPao,3279
149
149
  crawlee/statistics/_error_tracker.py,sha256=x9Yw1TuyEptjwgPPJ4gIom-0oVjawcNReQDsHH2nZ3w,8553
150
- crawlee/statistics/_models.py,sha256=SFWYpT3r1c4XugU8nrm0epTpcM5_0fS1mXi9fnbhGJ8,5237
151
- crawlee/statistics/_statistics.py,sha256=AnxbVq6w8fuiRumUJMznhTtQmtONyF4pzqrFYgO4yjo,13076
150
+ crawlee/statistics/_models.py,sha256=n4sT35D4dqNPYREl8Q_YXANZtxaWC0HaZizobA4qK_c,6674
151
+ crawlee/statistics/_statistics.py,sha256=mSgnCnV7q2buJdyuXBxbUU9MQEUjxaLST_NO4ej3XRw,12341
152
152
  crawlee/storage_clients/__init__.py,sha256=X3M6Z_WAOJ3M9I8JhGhJDnrtbCOmM27DpGAzgt87R2A,874
153
153
  crawlee/storage_clients/models.py,sha256=gfW_kpSCOBuoTBIW0N7tb3FUv7BgD3keZADS7pyT4_I,6586
154
154
  crawlee/storage_clients/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -199,8 +199,8 @@ crawlee/storages/_request_queue.py,sha256=bjBOGbpMaGUsqJPVB-JD2VShziPAYMI-GvWKKp
199
199
  crawlee/storages/_storage_instance_manager.py,sha256=72n0YlPwNpSQDJSPf4TxnI2GvIK6L-ZiTmHRbFcoVU0,8164
200
200
  crawlee/storages/_utils.py,sha256=Yz-5tEBYKYCFJemYT29--uGJqoJLApLDLgPcsnbifRw,439
201
201
  crawlee/storages/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
202
- crawlee-1.0.5b19.dist-info/METADATA,sha256=6wbOodZec5Rn7kpII_35y3IRaEqHjY5hEDEtiJujZYE,29533
203
- crawlee-1.0.5b19.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
204
- crawlee-1.0.5b19.dist-info/entry_points.txt,sha256=1p65X3dA-cYvzjtlxLL6Kn1wpY-3uEDVqJLp53uNPeo,45
205
- crawlee-1.0.5b19.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
206
- crawlee-1.0.5b19.dist-info/RECORD,,
202
+ crawlee-1.0.5b21.dist-info/METADATA,sha256=ERsOFxwDxPP0IVAwE8ZKLakRlDNcXKTYSi9ZKzFHCSQ,29533
203
+ crawlee-1.0.5b21.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
204
+ crawlee-1.0.5b21.dist-info/entry_points.txt,sha256=1p65X3dA-cYvzjtlxLL6Kn1wpY-3uEDVqJLp53uNPeo,45
205
+ crawlee-1.0.5b21.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
206
+ crawlee-1.0.5b21.dist-info/RECORD,,