crawlee 1.0.4b6__py3-none-any.whl → 1.0.4b7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlee might be problematic. Click here for more details.
- crawlee/_utils/recurring_task.py +15 -0
- crawlee/crawlers/_basic/_basic_crawler.py +22 -12
- crawlee/statistics/_statistics.py +9 -5
- {crawlee-1.0.4b6.dist-info → crawlee-1.0.4b7.dist-info}/METADATA +1 -1
- {crawlee-1.0.4b6.dist-info → crawlee-1.0.4b7.dist-info}/RECORD +8 -8
- {crawlee-1.0.4b6.dist-info → crawlee-1.0.4b7.dist-info}/WHEEL +0 -0
- {crawlee-1.0.4b6.dist-info → crawlee-1.0.4b7.dist-info}/entry_points.txt +0 -0
- {crawlee-1.0.4b6.dist-info → crawlee-1.0.4b7.dist-info}/licenses/LICENSE +0 -0
crawlee/_utils/recurring_task.py
CHANGED
|
@@ -7,6 +7,9 @@ from typing import TYPE_CHECKING
|
|
|
7
7
|
if TYPE_CHECKING:
|
|
8
8
|
from collections.abc import Callable
|
|
9
9
|
from datetime import timedelta
|
|
10
|
+
from types import TracebackType
|
|
11
|
+
|
|
12
|
+
from typing_extensions import Self
|
|
10
13
|
|
|
11
14
|
logger = getLogger(__name__)
|
|
12
15
|
|
|
@@ -26,6 +29,18 @@ class RecurringTask:
|
|
|
26
29
|
self.delay = delay
|
|
27
30
|
self.task: asyncio.Task | None = None
|
|
28
31
|
|
|
32
|
+
async def __aenter__(self) -> Self:
|
|
33
|
+
self.start()
|
|
34
|
+
return self
|
|
35
|
+
|
|
36
|
+
async def __aexit__(
|
|
37
|
+
self,
|
|
38
|
+
exc_type: type[BaseException] | None,
|
|
39
|
+
exc_value: BaseException | None,
|
|
40
|
+
exc_traceback: TracebackType | None,
|
|
41
|
+
) -> None:
|
|
42
|
+
await self.stop()
|
|
43
|
+
|
|
29
44
|
async def _wrapper(self) -> None:
|
|
30
45
|
"""Continuously execute the provided function with the specified delay.
|
|
31
46
|
|
|
@@ -56,7 +56,7 @@ from crawlee.errors import (
|
|
|
56
56
|
SessionError,
|
|
57
57
|
UserDefinedErrorHandlerError,
|
|
58
58
|
)
|
|
59
|
-
from crawlee.events._types import Event, EventCrawlerStatusData
|
|
59
|
+
from crawlee.events._types import Event, EventCrawlerStatusData, EventPersistStateData
|
|
60
60
|
from crawlee.http_clients import ImpitHttpClient
|
|
61
61
|
from crawlee.router import Router
|
|
62
62
|
from crawlee.sessions import SessionPool
|
|
@@ -437,14 +437,23 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
|
|
|
437
437
|
self._statistics_log_format = statistics_log_format
|
|
438
438
|
|
|
439
439
|
# Statistics
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
440
|
+
if statistics:
|
|
441
|
+
self._statistics = statistics
|
|
442
|
+
else:
|
|
443
|
+
|
|
444
|
+
async def persist_state_factory() -> KeyValueStore:
|
|
445
|
+
return await self.get_key_value_store()
|
|
446
|
+
|
|
447
|
+
self._statistics = cast(
|
|
448
|
+
'Statistics[TStatisticsState]',
|
|
449
|
+
Statistics.with_default_state(
|
|
450
|
+
persistence_enabled=True,
|
|
451
|
+
periodic_message_logger=self._logger,
|
|
452
|
+
statistics_log_format=self._statistics_log_format,
|
|
453
|
+
log_message='Current request statistics:',
|
|
454
|
+
persist_state_kvs_factory=persist_state_factory,
|
|
455
|
+
),
|
|
456
|
+
)
|
|
448
457
|
|
|
449
458
|
# Additional context managers to enter and exit
|
|
450
459
|
self._additional_context_managers = _additional_context_managers or []
|
|
@@ -689,7 +698,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
|
|
|
689
698
|
except CancelledError:
|
|
690
699
|
pass
|
|
691
700
|
finally:
|
|
692
|
-
await self._crawler_state_rec_task.stop()
|
|
693
701
|
if threading.current_thread() is threading.main_thread():
|
|
694
702
|
with suppress(NotImplementedError):
|
|
695
703
|
asyncio.get_running_loop().remove_signal_handler(signal.SIGINT)
|
|
@@ -721,8 +729,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
|
|
|
721
729
|
async def _run_crawler(self) -> None:
|
|
722
730
|
event_manager = self._service_locator.get_event_manager()
|
|
723
731
|
|
|
724
|
-
self._crawler_state_rec_task.start()
|
|
725
|
-
|
|
726
732
|
# Collect the context managers to be entered. Context managers that are already active are excluded,
|
|
727
733
|
# as they were likely entered by the caller, who will also be responsible for exiting them.
|
|
728
734
|
contexts_to_enter = [
|
|
@@ -733,6 +739,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
|
|
|
733
739
|
self._statistics,
|
|
734
740
|
self._session_pool if self._use_session_pool else None,
|
|
735
741
|
self._http_client,
|
|
742
|
+
self._crawler_state_rec_task,
|
|
736
743
|
*self._additional_context_managers,
|
|
737
744
|
)
|
|
738
745
|
if cm and getattr(cm, 'active', False) is False
|
|
@@ -744,6 +751,9 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
|
|
|
744
751
|
|
|
745
752
|
await self._autoscaled_pool.run()
|
|
746
753
|
|
|
754
|
+
# Emit PERSIST_STATE event when crawler is finishing to allow listeners to persist their state if needed
|
|
755
|
+
event_manager.emit(event=Event.PERSIST_STATE, event_data=EventPersistStateData(is_migrating=False))
|
|
756
|
+
|
|
747
757
|
async def add_requests(
|
|
748
758
|
self,
|
|
749
759
|
requests: Sequence[str | Request],
|
|
@@ -96,7 +96,7 @@ class Statistics(Generic[TStatisticsState]):
|
|
|
96
96
|
|
|
97
97
|
self._state = RecoverableState(
|
|
98
98
|
default_state=state_model(stats_id=self._id),
|
|
99
|
-
persist_state_key=persist_state_key or f'
|
|
99
|
+
persist_state_key=persist_state_key or f'__CRAWLER_STATISTICS_{self._id}',
|
|
100
100
|
persistence_enabled=persistence_enabled,
|
|
101
101
|
persist_state_kvs_name=persist_state_kvs_name,
|
|
102
102
|
persist_state_kvs_factory=persist_state_kvs_factory,
|
|
@@ -130,6 +130,7 @@ class Statistics(Generic[TStatisticsState]):
|
|
|
130
130
|
persistence_enabled: bool = False,
|
|
131
131
|
persist_state_kvs_name: str | None = None,
|
|
132
132
|
persist_state_key: str | None = None,
|
|
133
|
+
persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
|
|
133
134
|
log_message: str = 'Statistics',
|
|
134
135
|
periodic_message_logger: Logger | None = None,
|
|
135
136
|
log_interval: timedelta = timedelta(minutes=1),
|
|
@@ -141,6 +142,7 @@ class Statistics(Generic[TStatisticsState]):
|
|
|
141
142
|
persistence_enabled=persistence_enabled,
|
|
142
143
|
persist_state_kvs_name=persist_state_kvs_name,
|
|
143
144
|
persist_state_key=persist_state_key,
|
|
145
|
+
persist_state_kvs_factory=persist_state_kvs_factory,
|
|
144
146
|
log_message=log_message,
|
|
145
147
|
periodic_message_logger=periodic_message_logger,
|
|
146
148
|
log_interval=log_interval,
|
|
@@ -187,7 +189,10 @@ class Statistics(Generic[TStatisticsState]):
|
|
|
187
189
|
if not self._active:
|
|
188
190
|
raise RuntimeError(f'The {self.__class__.__name__} is not active.')
|
|
189
191
|
|
|
190
|
-
|
|
192
|
+
if not self.state.crawler_last_started_at:
|
|
193
|
+
raise RuntimeError('Statistics.state.crawler_last_started_at not set.')
|
|
194
|
+
self.state.crawler_finished_at = datetime.now(timezone.utc)
|
|
195
|
+
self.state.crawler_runtime += self.state.crawler_finished_at - self.state.crawler_last_started_at
|
|
191
196
|
|
|
192
197
|
await self._state.teardown()
|
|
193
198
|
|
|
@@ -255,8 +260,7 @@ class Statistics(Generic[TStatisticsState]):
|
|
|
255
260
|
if self._instance_start is None:
|
|
256
261
|
raise RuntimeError('The Statistics object is not initialized')
|
|
257
262
|
|
|
258
|
-
|
|
259
|
-
total_minutes = crawler_runtime.total_seconds() / 60
|
|
263
|
+
total_minutes = self.state.crawler_runtime.total_seconds() / 60
|
|
260
264
|
state = self._state.current_value
|
|
261
265
|
serialized_state = state.model_dump(by_alias=False)
|
|
262
266
|
|
|
@@ -267,7 +271,7 @@ class Statistics(Generic[TStatisticsState]):
|
|
|
267
271
|
requests_failed_per_minute=math.floor(state.requests_failed / total_minutes) if total_minutes else 0,
|
|
268
272
|
request_total_duration=state.request_total_finished_duration + state.request_total_failed_duration,
|
|
269
273
|
requests_total=state.requests_failed + state.requests_finished,
|
|
270
|
-
crawler_runtime=crawler_runtime,
|
|
274
|
+
crawler_runtime=state.crawler_runtime,
|
|
271
275
|
requests_finished=state.requests_finished,
|
|
272
276
|
requests_failed=state.requests_failed,
|
|
273
277
|
retry_histogram=serialized_state['request_retry_histogram'],
|
|
@@ -30,7 +30,7 @@ crawlee/_utils/html_to_text.py,sha256=1iykT-OXd2xXNy7isHVWHqPxe23X82CGQBHIfbZbZk
|
|
|
30
30
|
crawlee/_utils/models.py,sha256=EqM50Uc-xvxKlLCLA2lPpRduzfKvT0z_-Q-UWG8aTRQ,1955
|
|
31
31
|
crawlee/_utils/raise_if_too_many_kwargs.py,sha256=J2gaUJmsmNwexohuehXw_mdYKv-eWiui6WUHFsQ3qTQ,597
|
|
32
32
|
crawlee/_utils/recoverable_state.py,sha256=c1D2ZecxEliGZzhqYz9_oU5CF2Hm0UKvpOHqO6CDJRE,9032
|
|
33
|
-
crawlee/_utils/recurring_task.py,sha256=
|
|
33
|
+
crawlee/_utils/recurring_task.py,sha256=sQMiURuDXbwwfAcIXK8V4NXncSxIBxsqN1cZWX7DLyg,2128
|
|
34
34
|
crawlee/_utils/requests.py,sha256=yOjai7bHR9_duPJ0ck-L76y9AnKZr49JBfSOQv9kvJc,5048
|
|
35
35
|
crawlee/_utils/robots.py,sha256=k3Yi2OfKT0H04MPkP-OBGGV7fEePgOqb60awltjMYWY,4346
|
|
36
36
|
crawlee/_utils/sitemap.py,sha256=UI9EJiFiyFvV5_flVUtdsEVz8ZsJeRERPtcx8ZsqjTU,16632
|
|
@@ -65,7 +65,7 @@ crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py,sha256=TM4mkb
|
|
|
65
65
|
crawlee/crawlers/_adaptive_playwright/_result_comparator.py,sha256=NAfw5VKzTnkvARtLr_zrZj6UGeMp05Voc6Oi8oPxU3w,1747
|
|
66
66
|
crawlee/crawlers/_adaptive_playwright/_utils.py,sha256=EUYVz5i2YkLpL_gbVRp9BAD5u6w1xJ_AFzc_qB9bdDQ,1102
|
|
67
67
|
crawlee/crawlers/_basic/__init__.py,sha256=LPln8SiBBXSMqrApiFUfpqz3hvqxN5HUa1cHQXMVKgU,280
|
|
68
|
-
crawlee/crawlers/_basic/_basic_crawler.py,sha256=
|
|
68
|
+
crawlee/crawlers/_basic/_basic_crawler.py,sha256=yZ_A_l9Dux9Y2eYa9XbN3c7h-3YO7MgGmJbzCMbCplg,73257
|
|
69
69
|
crawlee/crawlers/_basic/_basic_crawling_context.py,sha256=fjxm2RQXMDkDlWu38dQ3xn5rrGUOhJXkXiqkgbFJFk4,155
|
|
70
70
|
crawlee/crawlers/_basic/_context_pipeline.py,sha256=vM8EEvnCoguERjRV3oyrxUq2Ln2F9DzY7P5dAEiuMHo,5869
|
|
71
71
|
crawlee/crawlers/_basic/_logging_utils.py,sha256=jp5mEwSq5a_BgzUhNPJ9WrIDcoIeYGbeHstcRqCcP0s,3093
|
|
@@ -148,7 +148,7 @@ crawlee/statistics/__init__.py,sha256=lXAsHNkeRZQBffW1B7rERarivXIUJveNlcKTGOXQZY
|
|
|
148
148
|
crawlee/statistics/_error_snapshotter.py,sha256=ChBBG0gIMWcSeyEzs3jQf3mSnHLZUHcD284wEDan1Js,3278
|
|
149
149
|
crawlee/statistics/_error_tracker.py,sha256=x9Yw1TuyEptjwgPPJ4gIom-0oVjawcNReQDsHH2nZ3w,8553
|
|
150
150
|
crawlee/statistics/_models.py,sha256=SFWYpT3r1c4XugU8nrm0epTpcM5_0fS1mXi9fnbhGJ8,5237
|
|
151
|
-
crawlee/statistics/_statistics.py,sha256=
|
|
151
|
+
crawlee/statistics/_statistics.py,sha256=d6z5XxXm-an4M_8TierOPpSB78vxqxwvUFCewIEmiK4,12786
|
|
152
152
|
crawlee/storage_clients/__init__.py,sha256=RCnutWMOqs_kUQpzfLVT5jgpHGWakLv557c6UIYFQsA,754
|
|
153
153
|
crawlee/storage_clients/models.py,sha256=gfW_kpSCOBuoTBIW0N7tb3FUv7BgD3keZADS7pyT4_I,6586
|
|
154
154
|
crawlee/storage_clients/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -187,8 +187,8 @@ crawlee/storages/_request_queue.py,sha256=bjBOGbpMaGUsqJPVB-JD2VShziPAYMI-GvWKKp
|
|
|
187
187
|
crawlee/storages/_storage_instance_manager.py,sha256=72n0YlPwNpSQDJSPf4TxnI2GvIK6L-ZiTmHRbFcoVU0,8164
|
|
188
188
|
crawlee/storages/_utils.py,sha256=Yz-5tEBYKYCFJemYT29--uGJqoJLApLDLgPcsnbifRw,439
|
|
189
189
|
crawlee/storages/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
190
|
-
crawlee-1.0.
|
|
191
|
-
crawlee-1.0.4b6.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
192
|
-
crawlee-1.0.4b6.dist-info/entry_points.txt,sha256=1p65X3dA-cYvzjtlxLL6Kn1wpY-3uEDVqJLp53uNPeo,45
|
|
193
|
-
crawlee-1.0.4b6.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
|
|
194
|
-
crawlee-1.0.4b6.dist-info/RECORD,,
|
|
190
|
+
crawlee-1.0.4b7.dist-info/METADATA,sha256=tZyfbl6piSNoS2rrGMuxlrM9EhIc4HNBFvU9RwW8ZAc,29314
|
|
191
|
+
crawlee-1.0.4b7.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
192
|
+
crawlee-1.0.4b7.dist-info/entry_points.txt,sha256=1p65X3dA-cYvzjtlxLL6Kn1wpY-3uEDVqJLp53uNPeo,45
|
|
193
|
+
crawlee-1.0.4b7.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
|
|
194
|
+
crawlee-1.0.4b7.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|