crawlee 1.0.5b13__py3-none-any.whl → 1.0.5b15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
crawlee/configuration.py CHANGED
@@ -28,7 +28,9 @@ class Configuration(BaseSettings):
28
28
  Settings can also be configured via environment variables, prefixed with `CRAWLEE_`.
29
29
  """
30
30
 
31
- model_config = SettingsConfigDict(validate_by_name=True, validate_by_alias=True)
31
+ # TODO: https://github.com/pydantic/pydantic-settings/issues/706
32
+ # Use `SettingsConfigDict(validate_by_name=True, validate_by_alias=True)` when issue is resolved.
33
+ model_config = SettingsConfigDict(populate_by_name=True)
32
34
 
33
35
  internal_timeout: Annotated[timedelta | None, Field(alias='crawlee_internal_timeout')] = None
34
36
  """Timeout for the internal asynchronous operations."""
@@ -71,7 +71,6 @@ class _NonPersistentStatistics(Statistics):
71
71
  async def __aenter__(self) -> Self:
72
72
  self._active = True
73
73
  await self._state.initialize()
74
- self._after_initialize()
75
74
  return self
76
75
 
77
76
  async def __aexit__(
@@ -1,6 +1,7 @@
1
1
  # Inspiration: https://github.com/apify/crawlee/blob/v3.9.2/packages/core/src/crawlers/statistics.ts
2
2
  from __future__ import annotations
3
3
 
4
+ import asyncio
4
5
  import math
5
6
  import time
6
7
  from datetime import datetime, timedelta, timezone
@@ -84,8 +85,6 @@ class Statistics(Generic[TStatisticsState]):
84
85
  self._id = Statistics.__next_id
85
86
  Statistics.__next_id += 1
86
87
 
87
- self._instance_start: datetime | None = None
88
-
89
88
  self.error_tracker = ErrorTracker(
90
89
  save_error_snapshots=save_error_snapshots,
91
90
  snapshot_kvs_name=persist_state_kvs_name,
@@ -111,6 +110,9 @@ class Statistics(Generic[TStatisticsState]):
111
110
  # Flag to indicate the context state.
112
111
  self._active = False
113
112
 
113
+ # Pre-existing runtime offset, which can be non-zero when restoring serialized state from KVS.
114
+ self._runtime_offset = timedelta(seconds=0)
115
+
114
116
  def replace_state_model(self, state_model: type[TNewStatisticsState]) -> Statistics[TNewStatisticsState]:
115
117
  """Create near copy of the `Statistics` with replaced `state_model`."""
116
118
  new_statistics: Statistics[TNewStatisticsState] = Statistics(
@@ -165,14 +167,17 @@ class Statistics(Generic[TStatisticsState]):
165
167
  if self._active:
166
168
  raise RuntimeError(f'The {self.__class__.__name__} is already active.')
167
169
 
168
- self._active = True
169
- self._instance_start = datetime.now(timezone.utc)
170
-
171
170
  await self._state.initialize()
172
- self._after_initialize()
173
171
 
172
+ self._runtime_offset = self.state.crawler_runtime
173
+
174
+ # Start periodic logging and let it print initial state before activation.
174
175
  self._periodic_logger.start()
176
+ await asyncio.sleep(0.01)
177
+ self._active = True
175
178
 
179
+ self.state.crawler_last_started_at = datetime.now(timezone.utc)
180
+ self.state.crawler_started_at = self.state.crawler_started_at or self.state.crawler_last_started_at
176
181
  return self
177
182
 
178
183
  async def __aexit__(
@@ -191,14 +196,16 @@ class Statistics(Generic[TStatisticsState]):
191
196
 
192
197
  if not self.state.crawler_last_started_at:
193
198
  raise RuntimeError('Statistics.state.crawler_last_started_at not set.')
194
- self.state.crawler_finished_at = datetime.now(timezone.utc)
195
- self.state.crawler_runtime += self.state.crawler_finished_at - self.state.crawler_last_started_at
196
-
197
- await self._state.teardown()
198
199
 
200
+ # Stop logging and deactivate the statistics to prevent further changes to crawler_runtime.
199
201
  await self._periodic_logger.stop()
202
+ self.state.crawler_finished_at = datetime.now(timezone.utc)
203
+ self.state.crawler_runtime = (
204
+ self._runtime_offset + self.state.crawler_finished_at - self.state.crawler_last_started_at
205
+ )
200
206
 
201
207
  self._active = False
208
+ await self._state.teardown()
202
209
 
203
210
  @property
204
211
  def state(self) -> TStatisticsState:
@@ -255,10 +262,19 @@ class Statistics(Generic[TStatisticsState]):
255
262
 
256
263
  del self._requests_in_progress[request_id_or_key]
257
264
 
265
+ def _update_crawler_runtime(self) -> None:
266
+ current_run_duration = (
267
+ (datetime.now(timezone.utc) - self.state.crawler_last_started_at)
268
+ if self.state.crawler_last_started_at
269
+ else timedelta()
270
+ )
271
+ self.state.crawler_runtime = current_run_duration + self._runtime_offset
272
+
258
273
  def calculate(self) -> FinalStatistics:
259
274
  """Calculate the current statistics."""
260
- if self._instance_start is None:
261
- raise RuntimeError('The Statistics object is not initialized')
275
+ if self._active:
276
+ # Only update state when active. If not, just report the last known runtime.
277
+ self._update_crawler_runtime()
262
278
 
263
279
  total_minutes = self.state.crawler_runtime.total_seconds() / 60
264
280
  state = self._state.current_value
@@ -291,21 +307,6 @@ class Statistics(Generic[TStatisticsState]):
291
307
  else:
292
308
  self._periodic_message_logger.info(self._log_message, extra=stats.to_dict())
293
309
 
294
- def _after_initialize(self) -> None:
295
- state = self._state.current_value
296
-
297
- if state.crawler_started_at is None:
298
- state.crawler_started_at = datetime.now(timezone.utc)
299
-
300
- if state.stats_persisted_at is not None and state.crawler_last_started_at:
301
- self._instance_start = datetime.now(timezone.utc) - (
302
- state.stats_persisted_at - state.crawler_last_started_at
303
- )
304
- elif state.crawler_last_started_at:
305
- self._instance_start = state.crawler_last_started_at
306
-
307
- state.crawler_last_started_at = self._instance_start
308
-
309
310
  def _save_retry_count_for_request(self, record: RequestProcessingRecord) -> None:
310
311
  retry_count = record.retry_count
311
312
  state = self._state.current_value
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: crawlee
3
- Version: 1.0.5b13
3
+ Version: 1.0.5b15
4
4
  Summary: Crawlee for Python
5
5
  Project-URL: Apify Homepage, https://apify.com
6
6
  Project-URL: Changelog, https://crawlee.dev/python/docs/changelog
@@ -231,7 +231,7 @@ Requires-Dist: impit>=0.6.1
231
231
  Requires-Dist: more-itertools>=10.2.0
232
232
  Requires-Dist: protego>=0.5.0
233
233
  Requires-Dist: psutil>=6.0.0
234
- Requires-Dist: pydantic-settings!=2.7.0,!=2.7.1,!=2.8.0,>=2.2.0
234
+ Requires-Dist: pydantic-settings>=2.12.0
235
235
  Requires-Dist: pydantic>=2.11.0
236
236
  Requires-Dist: pyee>=9.0.0
237
237
  Requires-Dist: tldextract>=5.1.0
@@ -6,7 +6,7 @@ crawlee/_log_config.py,sha256=VyxoEfWCq_9fyicmmJbjiZ5KC91onMcAtX2L4oKX4m4,5999
6
6
  crawlee/_request.py,sha256=XliqiaL5Gp3fIDqHVVw0GF35VydXOtg6wJIkeaLcAwk,16458
7
7
  crawlee/_service_locator.py,sha256=SJ8ABYtclBl7rz8kfZ2jZkIgKq5oNIoGT7WmN8ApTzo,5058
8
8
  crawlee/_types.py,sha256=DAmfSv5W1dt3nJhJ8z-02gDaE06fdEizNKUlHpsd2_A,29129
9
- crawlee/configuration.py,sha256=KG_XDkPe1VaYfaIu41nICvMjfHbDKM0h4-YTi3DkyRY,7917
9
+ crawlee/configuration.py,sha256=DWS2z1FC6Ua93W2tStK3R1ZKZbZjVQYWGiGFbZFaRtA,8064
10
10
  crawlee/errors.py,sha256=RhFNA_uT615nVBHf9TylpX5YWwtDuHUUEV8LPT4CYa4,3878
11
11
  crawlee/proxy_configuration.py,sha256=rqf67yerXvLvraBaAHW04nvf5ECze3wMQbK7LlqXucM,10386
12
12
  crawlee/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -58,7 +58,7 @@ crawlee/crawlers/_abstract_http/_abstract_http_parser.py,sha256=Y5o_hiW_0mQAte5G
58
58
  crawlee/crawlers/_abstract_http/_http_crawling_context.py,sha256=Rno_uJ8ivmyRxFQv2MyY_z9B5WPHSEd5MAPz31_1ZIo,2179
59
59
  crawlee/crawlers/_abstract_http/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
60
60
  crawlee/crawlers/_adaptive_playwright/__init__.py,sha256=LREq9WR9BKsE8S8lSsEhlCoNjQaLhlJ9yo8y_6a8o4c,1072
61
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py,sha256=ME90JLkScWj_ynUymA59f832vEvvVpkP01cYfEc8m-Y,22895
61
+ crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py,sha256=nPFB9Q_3xQDJprb24NIQO53gf56J8wXjbM9C-58iiZ8,22862
62
62
  crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py,sha256=_At8T8S3JLGPA-1AeCFGrpE-FuCDW9sazrXt9U0tK6U,1048
63
63
  crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py,sha256=9FlHIUC05IzUhJsVldQvpnDnj1jk8GJpqC98mPLN_fw,10431
64
64
  crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py,sha256=TM4mkbIN_059jUyCG8Z6XAb_FBLClIKw7z-aDvjon2I,10834
@@ -148,7 +148,7 @@ crawlee/statistics/__init__.py,sha256=lXAsHNkeRZQBffW1B7rERarivXIUJveNlcKTGOXQZY
148
148
  crawlee/statistics/_error_snapshotter.py,sha256=g-roZgkJ-glyStZL7gXrOhrpdZvZ686W9lR43uZjPao,3279
149
149
  crawlee/statistics/_error_tracker.py,sha256=x9Yw1TuyEptjwgPPJ4gIom-0oVjawcNReQDsHH2nZ3w,8553
150
150
  crawlee/statistics/_models.py,sha256=SFWYpT3r1c4XugU8nrm0epTpcM5_0fS1mXi9fnbhGJ8,5237
151
- crawlee/statistics/_statistics.py,sha256=d6z5XxXm-an4M_8TierOPpSB78vxqxwvUFCewIEmiK4,12786
151
+ crawlee/statistics/_statistics.py,sha256=AnxbVq6w8fuiRumUJMznhTtQmtONyF4pzqrFYgO4yjo,13076
152
152
  crawlee/storage_clients/__init__.py,sha256=X3M6Z_WAOJ3M9I8JhGhJDnrtbCOmM27DpGAzgt87R2A,874
153
153
  crawlee/storage_clients/models.py,sha256=gfW_kpSCOBuoTBIW0N7tb3FUv7BgD3keZADS7pyT4_I,6586
154
154
  crawlee/storage_clients/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -199,8 +199,8 @@ crawlee/storages/_request_queue.py,sha256=bjBOGbpMaGUsqJPVB-JD2VShziPAYMI-GvWKKp
199
199
  crawlee/storages/_storage_instance_manager.py,sha256=72n0YlPwNpSQDJSPf4TxnI2GvIK6L-ZiTmHRbFcoVU0,8164
200
200
  crawlee/storages/_utils.py,sha256=Yz-5tEBYKYCFJemYT29--uGJqoJLApLDLgPcsnbifRw,439
201
201
  crawlee/storages/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
202
- crawlee-1.0.5b13.dist-info/METADATA,sha256=hDy8VXBEMs_osj0eBKSqOntcEHFSN1cHHTeWwcugAgg,29445
203
- crawlee-1.0.5b13.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
204
- crawlee-1.0.5b13.dist-info/entry_points.txt,sha256=1p65X3dA-cYvzjtlxLL6Kn1wpY-3uEDVqJLp53uNPeo,45
205
- crawlee-1.0.5b13.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
206
- crawlee-1.0.5b13.dist-info/RECORD,,
202
+ crawlee-1.0.5b15.dist-info/METADATA,sha256=DrXwvY89KKTek0NBwVSk_d-lhRsKyjZO1X-CKwe1q7A,29422
203
+ crawlee-1.0.5b15.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
204
+ crawlee-1.0.5b15.dist-info/entry_points.txt,sha256=1p65X3dA-cYvzjtlxLL6Kn1wpY-3uEDVqJLp53uNPeo,45
205
+ crawlee-1.0.5b15.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
206
+ crawlee-1.0.5b15.dist-info/RECORD,,