crawlee 1.0.5b20__py3-none-any.whl → 1.0.5b21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlee/crawlers/_basic/_basic_crawler.py +1 -4
- crawlee/events/_event_manager.py +3 -1
- crawlee/statistics/_models.py +32 -1
- crawlee/statistics/_statistics.py +2 -21
- {crawlee-1.0.5b20.dist-info → crawlee-1.0.5b21.dist-info}/METADATA +1 -1
- {crawlee-1.0.5b20.dist-info → crawlee-1.0.5b21.dist-info}/RECORD +9 -9
- {crawlee-1.0.5b20.dist-info → crawlee-1.0.5b21.dist-info}/WHEEL +0 -0
- {crawlee-1.0.5b20.dist-info → crawlee-1.0.5b21.dist-info}/entry_points.txt +0 -0
- {crawlee-1.0.5b20.dist-info → crawlee-1.0.5b21.dist-info}/licenses/LICENSE +0 -0
|
@@ -56,7 +56,7 @@ from crawlee.errors import (
|
|
|
56
56
|
SessionError,
|
|
57
57
|
UserDefinedErrorHandlerError,
|
|
58
58
|
)
|
|
59
|
-
from crawlee.events._types import Event, EventCrawlerStatusData
|
|
59
|
+
from crawlee.events._types import Event, EventCrawlerStatusData
|
|
60
60
|
from crawlee.http_clients import ImpitHttpClient
|
|
61
61
|
from crawlee.router import Router
|
|
62
62
|
from crawlee.sessions import SessionPool
|
|
@@ -751,9 +751,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
|
|
|
751
751
|
|
|
752
752
|
await self._autoscaled_pool.run()
|
|
753
753
|
|
|
754
|
-
# Emit PERSIST_STATE event when crawler is finishing to allow listeners to persist their state if needed
|
|
755
|
-
event_manager.emit(event=Event.PERSIST_STATE, event_data=EventPersistStateData(is_migrating=False))
|
|
756
|
-
|
|
757
754
|
async def add_requests(
|
|
758
755
|
self,
|
|
759
756
|
requests: Sequence[str | Request],
|
crawlee/events/_event_manager.py
CHANGED
|
@@ -130,11 +130,13 @@ class EventManager:
|
|
|
130
130
|
if not self._active:
|
|
131
131
|
raise RuntimeError(f'The {self.__class__.__name__} is not active.')
|
|
132
132
|
|
|
133
|
+
# Stop the periodic emission of the persist state event and manually emit one last event to ensure the latest state is saved.
|
|
134
|
+
await self._emit_persist_state_event_rec_task.stop()
|
|
135
|
+
await self._emit_persist_state_event()
|
|
133
136
|
await self.wait_for_all_listeners_to_complete(timeout=self._close_timeout)
|
|
134
137
|
self._event_emitter.remove_all_listeners()
|
|
135
138
|
self._listener_tasks.clear()
|
|
136
139
|
self._listeners_to_wrappers.clear()
|
|
137
|
-
await self._emit_persist_state_event_rec_task.stop()
|
|
138
140
|
self._active = False
|
|
139
141
|
|
|
140
142
|
@overload
|
crawlee/statistics/_models.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import json
|
|
4
|
+
import warnings
|
|
4
5
|
from dataclasses import asdict, dataclass
|
|
5
6
|
from datetime import datetime, timedelta, timezone
|
|
6
7
|
from typing import Annotated, Any
|
|
@@ -76,7 +77,6 @@ class StatisticsState(BaseModel):
|
|
|
76
77
|
crawler_started_at: Annotated[datetime | None, Field(alias='crawlerStartedAt')] = None
|
|
77
78
|
crawler_last_started_at: Annotated[datetime | None, Field(alias='crawlerLastStartTimestamp')] = None
|
|
78
79
|
crawler_finished_at: Annotated[datetime | None, Field(alias='crawlerFinishedAt')] = None
|
|
79
|
-
crawler_runtime: Annotated[timedelta_ms, Field(alias='crawlerRuntimeMillis')] = timedelta()
|
|
80
80
|
errors: dict[str, Any] = Field(default_factory=dict)
|
|
81
81
|
retry_errors: dict[str, Any] = Field(alias='retryErrors', default_factory=dict)
|
|
82
82
|
requests_with_status_code: dict[str, int] = Field(alias='requestsWithStatusCode', default_factory=dict)
|
|
@@ -93,6 +93,37 @@ class StatisticsState(BaseModel):
|
|
|
93
93
|
),
|
|
94
94
|
] = {}
|
|
95
95
|
|
|
96
|
+
# Used to track the crawler runtime that has already been persisted. This is the runtime from previous runs.
|
|
97
|
+
_runtime_offset: Annotated[timedelta, Field(exclude=True)] = timedelta()
|
|
98
|
+
|
|
99
|
+
def model_post_init(self, /, __context: Any) -> None:
|
|
100
|
+
self._runtime_offset = self.crawler_runtime or self._runtime_offset
|
|
101
|
+
|
|
102
|
+
@property
|
|
103
|
+
def crawler_runtime(self) -> timedelta:
|
|
104
|
+
if self.crawler_last_started_at:
|
|
105
|
+
finished_at = self.crawler_finished_at or datetime.now(timezone.utc)
|
|
106
|
+
return self._runtime_offset + finished_at - self.crawler_last_started_at
|
|
107
|
+
return self._runtime_offset
|
|
108
|
+
|
|
109
|
+
@crawler_runtime.setter
|
|
110
|
+
def crawler_runtime(self, value: timedelta) -> None:
|
|
111
|
+
# Setter kept for backwards compatibility only; `crawler_runtime` is now a computed field and can't be set manually.
|
|
112
|
+
# To be removed in v2 release https://github.com/apify/crawlee-python/issues/1567
|
|
113
|
+
warnings.warn(
|
|
114
|
+
f"Setting 'crawler_runtime' is deprecated and will be removed in a future version."
|
|
115
|
+
f' Value {value} will not be used.',
|
|
116
|
+
DeprecationWarning,
|
|
117
|
+
stacklevel=2,
|
|
118
|
+
)
|
|
119
|
+
|
|
120
|
+
@computed_field(alias='crawlerRuntimeMillis')
|
|
121
|
+
def crawler_runtime_for_serialization(self) -> timedelta:
|
|
122
|
+
if self.crawler_last_started_at:
|
|
123
|
+
finished_at = self.crawler_finished_at or datetime.now(timezone.utc)
|
|
124
|
+
return self._runtime_offset + finished_at - self.crawler_last_started_at
|
|
125
|
+
return self._runtime_offset
|
|
126
|
+
|
|
96
127
|
@computed_field(alias='requestTotalDurationMillis', return_type=timedelta_ms) # type: ignore[prop-decorator]
|
|
97
128
|
@property
|
|
98
129
|
def request_total_duration(self) -> timedelta:
|
|
@@ -110,9 +110,6 @@ class Statistics(Generic[TStatisticsState]):
|
|
|
110
110
|
# Flag to indicate the context state.
|
|
111
111
|
self._active = False
|
|
112
112
|
|
|
113
|
-
# Pre-existing runtime offset, that can be non-zero when restoring serialized state from KVS.
|
|
114
|
-
self._runtime_offset = timedelta(seconds=0)
|
|
115
|
-
|
|
116
113
|
def replace_state_model(self, state_model: type[TNewStatisticsState]) -> Statistics[TNewStatisticsState]:
|
|
117
114
|
"""Create near copy of the `Statistics` with replaced `state_model`."""
|
|
118
115
|
new_statistics: Statistics[TNewStatisticsState] = Statistics(
|
|
@@ -168,8 +165,8 @@ class Statistics(Generic[TStatisticsState]):
|
|
|
168
165
|
raise RuntimeError(f'The {self.__class__.__name__} is already active.')
|
|
169
166
|
|
|
170
167
|
await self._state.initialize()
|
|
171
|
-
|
|
172
|
-
self.
|
|
168
|
+
# Reset `crawler_finished_at` to indicate a new run in progress.
|
|
169
|
+
self.state.crawler_finished_at = None
|
|
173
170
|
|
|
174
171
|
# Start periodic logging and let it print initial state before activation.
|
|
175
172
|
self._periodic_logger.start()
|
|
@@ -200,10 +197,6 @@ class Statistics(Generic[TStatisticsState]):
|
|
|
200
197
|
# Stop logging and deactivate the statistics to prevent further changes to crawler_runtime
|
|
201
198
|
await self._periodic_logger.stop()
|
|
202
199
|
self.state.crawler_finished_at = datetime.now(timezone.utc)
|
|
203
|
-
self.state.crawler_runtime = (
|
|
204
|
-
self._runtime_offset + self.state.crawler_finished_at - self.state.crawler_last_started_at
|
|
205
|
-
)
|
|
206
|
-
|
|
207
200
|
self._active = False
|
|
208
201
|
await self._state.teardown()
|
|
209
202
|
|
|
@@ -262,20 +255,8 @@ class Statistics(Generic[TStatisticsState]):
|
|
|
262
255
|
|
|
263
256
|
del self._requests_in_progress[request_id_or_key]
|
|
264
257
|
|
|
265
|
-
def _update_crawler_runtime(self) -> None:
|
|
266
|
-
current_run_duration = (
|
|
267
|
-
(datetime.now(timezone.utc) - self.state.crawler_last_started_at)
|
|
268
|
-
if self.state.crawler_last_started_at
|
|
269
|
-
else timedelta()
|
|
270
|
-
)
|
|
271
|
-
self.state.crawler_runtime = current_run_duration + self._runtime_offset
|
|
272
|
-
|
|
273
258
|
def calculate(self) -> FinalStatistics:
|
|
274
259
|
"""Calculate the current statistics."""
|
|
275
|
-
if self._active:
|
|
276
|
-
# Only update state when active. If not, just report the last known runtime.
|
|
277
|
-
self._update_crawler_runtime()
|
|
278
|
-
|
|
279
260
|
total_minutes = self.state.crawler_runtime.total_seconds() / 60
|
|
280
261
|
state = self._state.current_value
|
|
281
262
|
serialized_state = state.model_dump(by_alias=False)
|
|
@@ -65,7 +65,7 @@ crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py,sha256=TM4mkb
|
|
|
65
65
|
crawlee/crawlers/_adaptive_playwright/_result_comparator.py,sha256=NAfw5VKzTnkvARtLr_zrZj6UGeMp05Voc6Oi8oPxU3w,1747
|
|
66
66
|
crawlee/crawlers/_adaptive_playwright/_utils.py,sha256=EUYVz5i2YkLpL_gbVRp9BAD5u6w1xJ_AFzc_qB9bdDQ,1102
|
|
67
67
|
crawlee/crawlers/_basic/__init__.py,sha256=LPln8SiBBXSMqrApiFUfpqz3hvqxN5HUa1cHQXMVKgU,280
|
|
68
|
-
crawlee/crawlers/_basic/_basic_crawler.py,sha256
|
|
68
|
+
crawlee/crawlers/_basic/_basic_crawler.py,sha256=-lo9yMjPkT8fU4ndOMaeEiwWHgu-DHw9Fny6f1kPPkk,73004
|
|
69
69
|
crawlee/crawlers/_basic/_basic_crawling_context.py,sha256=fjxm2RQXMDkDlWu38dQ3xn5rrGUOhJXkXiqkgbFJFk4,155
|
|
70
70
|
crawlee/crawlers/_basic/_context_pipeline.py,sha256=vM8EEvnCoguERjRV3oyrxUq2Ln2F9DzY7P5dAEiuMHo,5869
|
|
71
71
|
crawlee/crawlers/_basic/_logging_utils.py,sha256=jp5mEwSq5a_BgzUhNPJ9WrIDcoIeYGbeHstcRqCcP0s,3093
|
|
@@ -92,7 +92,7 @@ crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py,sha256=fEI2
|
|
|
92
92
|
crawlee/crawlers/_playwright/_types.py,sha256=hMKA9K9gjzQuwwbnmmfJsQrwR-kq235jH-WBXWeqkGo,2174
|
|
93
93
|
crawlee/crawlers/_playwright/_utils.py,sha256=FQ_-LYo7DGHsNHRrTtWt3mC06VzQvQ2wkGqpA2wBzYU,3441
|
|
94
94
|
crawlee/events/__init__.py,sha256=YMgOXKI0LsXfImKQy06PZ2Vdjy-uD_-acioagHft1do,577
|
|
95
|
-
crawlee/events/_event_manager.py,sha256=
|
|
95
|
+
crawlee/events/_event_manager.py,sha256=wjZTYIKBI8daKUkOVxUrbPHuU8LnFpUtWStdkts7r3U,11588
|
|
96
96
|
crawlee/events/_local_event_manager.py,sha256=CSiMJ6a_BwX0PPwtffEOtHm21dmALJz1zifo3AuMAk8,3708
|
|
97
97
|
crawlee/events/_types.py,sha256=MKsI014OOKKhjPJRrvWYrezIDGoLjGGhWXrkqYw26Ns,3313
|
|
98
98
|
crawlee/events/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -147,8 +147,8 @@ crawlee/sessions/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
|
147
147
|
crawlee/statistics/__init__.py,sha256=lXAsHNkeRZQBffW1B7rERarivXIUJveNlcKTGOXQZY0,154
|
|
148
148
|
crawlee/statistics/_error_snapshotter.py,sha256=g-roZgkJ-glyStZL7gXrOhrpdZvZ686W9lR43uZjPao,3279
|
|
149
149
|
crawlee/statistics/_error_tracker.py,sha256=x9Yw1TuyEptjwgPPJ4gIom-0oVjawcNReQDsHH2nZ3w,8553
|
|
150
|
-
crawlee/statistics/_models.py,sha256=
|
|
151
|
-
crawlee/statistics/_statistics.py,sha256=
|
|
150
|
+
crawlee/statistics/_models.py,sha256=n4sT35D4dqNPYREl8Q_YXANZtxaWC0HaZizobA4qK_c,6674
|
|
151
|
+
crawlee/statistics/_statistics.py,sha256=mSgnCnV7q2buJdyuXBxbUU9MQEUjxaLST_NO4ej3XRw,12341
|
|
152
152
|
crawlee/storage_clients/__init__.py,sha256=X3M6Z_WAOJ3M9I8JhGhJDnrtbCOmM27DpGAzgt87R2A,874
|
|
153
153
|
crawlee/storage_clients/models.py,sha256=gfW_kpSCOBuoTBIW0N7tb3FUv7BgD3keZADS7pyT4_I,6586
|
|
154
154
|
crawlee/storage_clients/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -199,8 +199,8 @@ crawlee/storages/_request_queue.py,sha256=bjBOGbpMaGUsqJPVB-JD2VShziPAYMI-GvWKKp
|
|
|
199
199
|
crawlee/storages/_storage_instance_manager.py,sha256=72n0YlPwNpSQDJSPf4TxnI2GvIK6L-ZiTmHRbFcoVU0,8164
|
|
200
200
|
crawlee/storages/_utils.py,sha256=Yz-5tEBYKYCFJemYT29--uGJqoJLApLDLgPcsnbifRw,439
|
|
201
201
|
crawlee/storages/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
202
|
-
crawlee-1.0.
|
|
203
|
-
crawlee-1.0.
|
|
204
|
-
crawlee-1.0.
|
|
205
|
-
crawlee-1.0.
|
|
206
|
-
crawlee-1.0.
|
|
202
|
+
crawlee-1.0.5b21.dist-info/METADATA,sha256=ERsOFxwDxPP0IVAwE8ZKLakRlDNcXKTYSi9ZKzFHCSQ,29533
|
|
203
|
+
crawlee-1.0.5b21.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
204
|
+
crawlee-1.0.5b21.dist-info/entry_points.txt,sha256=1p65X3dA-cYvzjtlxLL6Kn1wpY-3uEDVqJLp53uNPeo,45
|
|
205
|
+
crawlee-1.0.5b21.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
|
|
206
|
+
crawlee-1.0.5b21.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|