crawlee 1.0.5b13-py3-none-any.whl → 1.0.5b15-py3-none-any.whl
This diff compares the contents of two publicly available package versions as released to their public registry. It is provided for informational purposes only and reflects the changes between the two versions as published.
- crawlee/configuration.py +3 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +0 -1
- crawlee/statistics/_statistics.py +28 -27
- {crawlee-1.0.5b13.dist-info → crawlee-1.0.5b15.dist-info}/METADATA +2 -2
- {crawlee-1.0.5b13.dist-info → crawlee-1.0.5b15.dist-info}/RECORD +8 -8
- {crawlee-1.0.5b13.dist-info → crawlee-1.0.5b15.dist-info}/WHEEL +0 -0
- {crawlee-1.0.5b13.dist-info → crawlee-1.0.5b15.dist-info}/entry_points.txt +0 -0
- {crawlee-1.0.5b13.dist-info → crawlee-1.0.5b15.dist-info}/licenses/LICENSE +0 -0
crawlee/configuration.py
CHANGED

@@ -28,7 +28,9 @@ class Configuration(BaseSettings):
     Settings can also be configured via environment variables, prefixed with `CRAWLEE_`.
     """

-
+    # TODO: https://github.com/pydantic/pydantic-settings/issues/706
+    # Use `SettingsConfigDict(validate_by_name=True, validate_by_alias=True)` when issue is resolved.
+    model_config = SettingsConfigDict(populate_by_name=True)

     internal_timeout: Annotated[timedelta | None, Field(alias='crawlee_internal_timeout')] = None
     """Timeout for the internal asynchronous operations."""
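The `populate_by_name=True` setting is what lets these settings be constructed with their Python field names even though every field is aliased to its environment-variable name (e.g. `internal_timeout` is aliased to `crawlee_internal_timeout`). A minimal sketch of that behavior, using a hypothetical `DemoSettings` class rather than Crawlee's actual `Configuration`:

from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict


class DemoSettings(BaseSettings):
    model_config = SettingsConfigDict(populate_by_name=True)

    # Aliased, so the value can also come from the DEMO_TIMEOUT environment variable.
    timeout: float | None = Field(default=None, alias='demo_timeout')


# Accepted thanks to populate_by_name=True; with the default (False),
# only DemoSettings(demo_timeout=30.0) would validate.
settings = DemoSettings(timeout=30.0)
assert settings.timeout == 30.0

Per the TODO above, once pydantic-settings issue 706 is resolved, the same intent can be stated with the more explicit `validate_by_name=True, validate_by_alias=True` pair.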
crawlee/statistics/_statistics.py
CHANGED

@@ -1,6 +1,7 @@
 # Inspiration: https://github.com/apify/crawlee/blob/v3.9.2/packages/core/src/crawlers/statistics.ts
 from __future__ import annotations

+import asyncio
 import math
 import time
 from datetime import datetime, timedelta, timezone
@@ -84,8 +85,6 @@ class Statistics(Generic[TStatisticsState]):
         self._id = Statistics.__next_id
         Statistics.__next_id += 1

-        self._instance_start: datetime | None = None
-
         self.error_tracker = ErrorTracker(
             save_error_snapshots=save_error_snapshots,
             snapshot_kvs_name=persist_state_kvs_name,
@@ -111,6 +110,9 @@ class Statistics(Generic[TStatisticsState]):
         # Flag to indicate the context state.
         self._active = False

+        # Pre-existing runtime offset, that can be non-zero when restoring serialized state from KVS.
+        self._runtime_offset = timedelta(seconds=0)
+
     def replace_state_model(self, state_model: type[TNewStatisticsState]) -> Statistics[TNewStatisticsState]:
         """Create near copy of the `Statistics` with replaced `state_model`."""
         new_statistics: Statistics[TNewStatisticsState] = Statistics(
@@ -165,14 +167,17 @@ class Statistics(Generic[TStatisticsState]):
         if self._active:
             raise RuntimeError(f'The {self.__class__.__name__} is already active.')

-        self._active = True
-        self._instance_start = datetime.now(timezone.utc)
-
         await self._state.initialize()
-        self._after_initialize()

+        self._runtime_offset = self.state.crawler_runtime
+
+        # Start periodic logging and let it print initial state before activation.
         self._periodic_logger.start()
+        await asyncio.sleep(0.01)
+        self._active = True

+        self.state.crawler_last_started_at = datetime.now(timezone.utc)
+        self.state.crawler_started_at = self.state.crawler_started_at or self.state.crawler_last_started_at
         return self

     async def __aexit__(
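The ordering here is deliberate: `_periodic_logger.start()` only schedules the logging task, and the short `await asyncio.sleep(0.01)` yields control to the event loop so the logger can report the initial (possibly restored) state before `_active` flips to `True`. A standalone sketch of that scheduling behavior, with a hypothetical `periodic` coroutine standing in for the logger task:

import asyncio


async def main() -> None:
    async def periodic() -> None:
        # Stand-in for the periodic logger: report state on an interval.
        while True:
            print('initial state')
            await asyncio.sleep(1)

    task = asyncio.create_task(periodic())  # Scheduled, but it has not run yet.
    await asyncio.sleep(0.01)  # Yield to the loop; `periodic` prints once here.
    print('activated')
    task.cancel()


asyncio.run(main())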
@@ -191,14 +196,16 @@ class Statistics(Generic[TStatisticsState]):

         if not self.state.crawler_last_started_at:
             raise RuntimeError('Statistics.state.crawler_last_started_at not set.')
-        self.state.crawler_finished_at = datetime.now(timezone.utc)
-        self.state.crawler_runtime += self.state.crawler_finished_at - self.state.crawler_last_started_at
-
-        await self._state.teardown()

+        # Stop logging and deactivate the statistics to prevent further changes to crawler_runtime
         await self._periodic_logger.stop()
+        self.state.crawler_finished_at = datetime.now(timezone.utc)
+        self.state.crawler_runtime = (
+            self._runtime_offset + self.state.crawler_finished_at - self.state.crawler_last_started_at
+        )

         self._active = False
+        await self._state.teardown()

     @property
     def state(self) -> TStatisticsState:
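Taken together with `__aenter__`, `crawler_runtime` is now cumulative: whatever runtime was restored from the key-value store becomes `_runtime_offset`, and the finishing run adds only its own duration on top, replacing the removed `_instance_start`/`_after_initialize` machinery. A minimal sketch of the bookkeeping, with a bare `State` class standing in for the persisted statistics model (names are illustrative, not Crawlee's API):

from __future__ import annotations

from datetime import datetime, timedelta, timezone


class State:
    """Stand-in for the persisted statistics state."""

    def __init__(self) -> None:
        self.crawler_runtime = timedelta()
        self.crawler_last_started_at: datetime | None = None


class ResumableRuntime:
    def __init__(self, state: State) -> None:
        self.state = state
        self._runtime_offset = timedelta()

    def start(self) -> None:
        # Runtime accumulated by previous runs becomes the offset.
        self._runtime_offset = self.state.crawler_runtime
        self.state.crawler_last_started_at = datetime.now(timezone.utc)

    def stop(self) -> None:
        assert self.state.crawler_last_started_at is not None
        finished_at = datetime.now(timezone.utc)
        # Total recorded runtime = previous runs + the run that just ended.
        self.state.crawler_runtime = (
            self._runtime_offset + finished_at - self.state.crawler_last_started_at
        )

Under this scheme, a state restored with five minutes of recorded runtime that then runs for two more minutes ends at seven minutes total.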
@@ -255,10 +262,19 @@ class Statistics(Generic[TStatisticsState]):

         del self._requests_in_progress[request_id_or_key]

+    def _update_crawler_runtime(self) -> None:
+        current_run_duration = (
+            (datetime.now(timezone.utc) - self.state.crawler_last_started_at)
+            if self.state.crawler_last_started_at
+            else timedelta()
+        )
+        self.state.crawler_runtime = current_run_duration + self._runtime_offset
+
     def calculate(self) -> FinalStatistics:
         """Calculate the current statistics."""
-        if self.
-
+        if self._active:
+            # Only update state when active. If not, just report the last known runtime.
+            self._update_crawler_runtime()

         total_minutes = self.state.crawler_runtime.total_seconds() / 60
         state = self._state.current_value
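`calculate()` therefore reports offset-plus-current-run while the crawler is active, and the last frozen value afterwards. A worked example of the arithmetic, with made-up values:

from datetime import timedelta

# A restored state carries 5 minutes of runtime from earlier runs (the offset);
# the current run has been going for 90 seconds.
runtime_offset = timedelta(minutes=5)
current_run_duration = timedelta(seconds=90)

crawler_runtime = current_run_duration + runtime_offset
assert crawler_runtime == timedelta(minutes=6, seconds=30)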
@@ -291,21 +307,6 @@ class Statistics(Generic[TStatisticsState]):
         else:
             self._periodic_message_logger.info(self._log_message, extra=stats.to_dict())

-    def _after_initialize(self) -> None:
-        state = self._state.current_value
-
-        if state.crawler_started_at is None:
-            state.crawler_started_at = datetime.now(timezone.utc)
-
-        if state.stats_persisted_at is not None and state.crawler_last_started_at:
-            self._instance_start = datetime.now(timezone.utc) - (
-                state.stats_persisted_at - state.crawler_last_started_at
-            )
-        elif state.crawler_last_started_at:
-            self._instance_start = state.crawler_last_started_at
-
-        state.crawler_last_started_at = self._instance_start
-
     def _save_retry_count_for_request(self, record: RequestProcessingRecord) -> None:
         retry_count = record.retry_count
         state = self._state.current_value
{crawlee-1.0.5b13.dist-info → crawlee-1.0.5b15.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: crawlee
-Version: 1.0.5b13
+Version: 1.0.5b15
 Summary: Crawlee for Python
 Project-URL: Apify Homepage, https://apify.com
 Project-URL: Changelog, https://crawlee.dev/python/docs/changelog

@@ -231,7 +231,7 @@ Requires-Dist: impit>=0.6.1
 Requires-Dist: more-itertools>=10.2.0
 Requires-Dist: protego>=0.5.0
 Requires-Dist: psutil>=6.0.0
-Requires-Dist: pydantic-settings
+Requires-Dist: pydantic-settings>=2.12.0
 Requires-Dist: pydantic>=2.11.0
 Requires-Dist: pyee>=9.0.0
 Requires-Dist: tldextract>=5.1.0
{crawlee-1.0.5b13.dist-info → crawlee-1.0.5b15.dist-info}/RECORD
CHANGED

@@ -6,7 +6,7 @@ crawlee/_log_config.py,sha256=VyxoEfWCq_9fyicmmJbjiZ5KC91onMcAtX2L4oKX4m4,5999
 crawlee/_request.py,sha256=XliqiaL5Gp3fIDqHVVw0GF35VydXOtg6wJIkeaLcAwk,16458
 crawlee/_service_locator.py,sha256=SJ8ABYtclBl7rz8kfZ2jZkIgKq5oNIoGT7WmN8ApTzo,5058
 crawlee/_types.py,sha256=DAmfSv5W1dt3nJhJ8z-02gDaE06fdEizNKUlHpsd2_A,29129
-crawlee/configuration.py,sha256=
+crawlee/configuration.py,sha256=DWS2z1FC6Ua93W2tStK3R1ZKZbZjVQYWGiGFbZFaRtA,8064
 crawlee/errors.py,sha256=RhFNA_uT615nVBHf9TylpX5YWwtDuHUUEV8LPT4CYa4,3878
 crawlee/proxy_configuration.py,sha256=rqf67yerXvLvraBaAHW04nvf5ECze3wMQbK7LlqXucM,10386
 crawlee/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0

@@ -58,7 +58,7 @@ crawlee/crawlers/_abstract_http/_abstract_http_parser.py,sha256=Y5o_hiW_0mQAte5G
 crawlee/crawlers/_abstract_http/_http_crawling_context.py,sha256=Rno_uJ8ivmyRxFQv2MyY_z9B5WPHSEd5MAPz31_1ZIo,2179
 crawlee/crawlers/_abstract_http/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 crawlee/crawlers/_adaptive_playwright/__init__.py,sha256=LREq9WR9BKsE8S8lSsEhlCoNjQaLhlJ9yo8y_6a8o4c,1072
-crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py,sha256=
+crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py,sha256=nPFB9Q_3xQDJprb24NIQO53gf56J8wXjbM9C-58iiZ8,22862
 crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py,sha256=_At8T8S3JLGPA-1AeCFGrpE-FuCDW9sazrXt9U0tK6U,1048
 crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py,sha256=9FlHIUC05IzUhJsVldQvpnDnj1jk8GJpqC98mPLN_fw,10431
 crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py,sha256=TM4mkbIN_059jUyCG8Z6XAb_FBLClIKw7z-aDvjon2I,10834

@@ -148,7 +148,7 @@ crawlee/statistics/__init__.py,sha256=lXAsHNkeRZQBffW1B7rERarivXIUJveNlcKTGOXQZY
 crawlee/statistics/_error_snapshotter.py,sha256=g-roZgkJ-glyStZL7gXrOhrpdZvZ686W9lR43uZjPao,3279
 crawlee/statistics/_error_tracker.py,sha256=x9Yw1TuyEptjwgPPJ4gIom-0oVjawcNReQDsHH2nZ3w,8553
 crawlee/statistics/_models.py,sha256=SFWYpT3r1c4XugU8nrm0epTpcM5_0fS1mXi9fnbhGJ8,5237
-crawlee/statistics/_statistics.py,sha256=
+crawlee/statistics/_statistics.py,sha256=AnxbVq6w8fuiRumUJMznhTtQmtONyF4pzqrFYgO4yjo,13076
 crawlee/storage_clients/__init__.py,sha256=X3M6Z_WAOJ3M9I8JhGhJDnrtbCOmM27DpGAzgt87R2A,874
 crawlee/storage_clients/models.py,sha256=gfW_kpSCOBuoTBIW0N7tb3FUv7BgD3keZADS7pyT4_I,6586
 crawlee/storage_clients/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0

@@ -199,8 +199,8 @@ crawlee/storages/_request_queue.py,sha256=bjBOGbpMaGUsqJPVB-JD2VShziPAYMI-GvWKKp
 crawlee/storages/_storage_instance_manager.py,sha256=72n0YlPwNpSQDJSPf4TxnI2GvIK6L-ZiTmHRbFcoVU0,8164
 crawlee/storages/_utils.py,sha256=Yz-5tEBYKYCFJemYT29--uGJqoJLApLDLgPcsnbifRw,439
 crawlee/storages/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-crawlee-1.0.
-crawlee-1.0.
-crawlee-1.0.
-crawlee-1.0.
-crawlee-1.0.
+crawlee-1.0.5b15.dist-info/METADATA,sha256=DrXwvY89KKTek0NBwVSk_d-lhRsKyjZO1X-CKwe1q7A,29422
+crawlee-1.0.5b15.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+crawlee-1.0.5b15.dist-info/entry_points.txt,sha256=1p65X3dA-cYvzjtlxLL6Kn1wpY-3uEDVqJLp53uNPeo,45
+crawlee-1.0.5b15.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
+crawlee-1.0.5b15.dist-info/RECORD,,
{crawlee-1.0.5b13.dist-info → crawlee-1.0.5b15.dist-info}/WHEEL
File without changes

{crawlee-1.0.5b13.dist-info → crawlee-1.0.5b15.dist-info}/entry_points.txt
File without changes

{crawlee-1.0.5b13.dist-info → crawlee-1.0.5b15.dist-info}/licenses/LICENSE
File without changes