crawlee 0.6.13b15__py3-none-any.whl → 1.3.1b3__py3-none-any.whl
This diff shows the content of publicly available package versions as released to their respective public registries. It is provided for informational purposes only and reflects the changes between the two versions.
Note: this version of crawlee has been flagged as potentially problematic.
- crawlee/__init__.py +2 -1
- crawlee/_autoscaling/snapshotter.py +1 -1
- crawlee/_browserforge_workaround.py +7 -3
- crawlee/_request.py +64 -43
- crawlee/_service_locator.py +44 -24
- crawlee/_types.py +128 -36
- crawlee/_utils/context.py +3 -3
- crawlee/_utils/file.py +8 -1
- crawlee/_utils/globs.py +4 -4
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +27 -3
- crawlee/_utils/requests.py +0 -26
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +16 -7
- crawlee/_utils/system.py +30 -14
- crawlee/_utils/time.py +120 -0
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +5 -2
- crawlee/browsers/_playwright_browser.py +2 -1
- crawlee/browsers/_playwright_browser_controller.py +21 -15
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +2 -0
- crawlee/crawlers/__init__.py +5 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +57 -17
- crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
- crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
- crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +27 -32
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
- crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
- crawlee/crawlers/_basic/_basic_crawler.py +254 -148
- crawlee/crawlers/_basic/_context_utils.py +24 -0
- crawlee/crawlers/_basic/_logging_utils.py +27 -4
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
- crawlee/crawlers/_playwright/_playwright_crawler.py +73 -23
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/errors.py +4 -0
- crawlee/events/_event_manager.py +12 -6
- crawlee/events/_types.py +6 -6
- crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
- crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/fingerprint_suite/_types.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +68 -14
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +32 -11
- crawlee/otel/crawler_instrumentor.py +4 -6
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_request_list.py +3 -3
- crawlee/request_loaders/_request_loader.py +5 -1
- crawlee/request_loaders/_sitemap_request_loader.py +248 -50
- crawlee/router.py +13 -3
- crawlee/sessions/_cookies.py +13 -8
- crawlee/sessions/_models.py +5 -5
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +62 -12
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +16 -0
- crawlee/storage_clients/_base/_dataset_client.py +2 -2
- crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
- crawlee/storage_clients/_base/_request_queue_client.py +2 -2
- crawlee/storage_clients/_base/_storage_client.py +13 -0
- crawlee/storage_clients/_file_system/_dataset_client.py +33 -30
- crawlee/storage_clients/_file_system/_key_value_store_client.py +34 -27
- crawlee/storage_clients/_file_system/_request_queue_client.py +171 -154
- crawlee/storage_clients/_file_system/_storage_client.py +16 -3
- crawlee/storage_clients/_file_system/_utils.py +0 -0
- crawlee/storage_clients/_memory/_dataset_client.py +16 -4
- crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
- crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
- crawlee/storage_clients/_memory/_storage_client.py +6 -3
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +292 -0
- crawlee/storage_clients/_redis/_dataset_client.py +329 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
- crawlee/storage_clients/_redis/_storage_client.py +149 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +268 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
- crawlee/storage_clients/_sql/_storage_client.py +282 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +21 -14
- crawlee/storages/_base.py +5 -1
- crawlee/storages/_dataset.py +12 -2
- crawlee/storages/_key_value_store.py +17 -4
- crawlee/storages/_request_queue.py +13 -5
- crawlee/storages/_storage_instance_manager.py +196 -75
- crawlee/storages/_utils.py +11 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/METADATA +24 -19
- crawlee-1.3.1b3.dist-info/RECORD +207 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/WHEEL +1 -1
- crawlee/_utils/measure_time.py +0 -31
- crawlee-0.6.13b15.dist-info/RECORD +0 -183
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/entry_points.txt +0 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/licenses/LICENSE +0 -0
crawlee/statistics/_models.py
CHANGED
@@ -1,9 +1,10 @@
 from __future__ import annotations
 
 import json
+import warnings
 from dataclasses import asdict, dataclass
 from datetime import datetime, timedelta, timezone
-from typing import Annotated, Any
+from typing import TYPE_CHECKING, Annotated, Any
 
 from pydantic import BaseModel, ConfigDict, Field, PlainSerializer, PlainValidator, computed_field
 from typing_extensions import override
@@ -11,6 +12,9 @@ from typing_extensions import override
 from crawlee._utils.console import make_table
 from crawlee._utils.docs import docs_group
 from crawlee._utils.models import timedelta_ms
+from crawlee._utils.time import format_duration
+
+_STATISTICS_TABLE_WIDTH = 100
 
 
 @dataclass(frozen=True)
@@ -31,9 +35,14 @@ class FinalStatistics:
 
     def to_table(self) -> str:
         """Print out the Final Statistics data as a table."""
-
+        formatted_dict = {}
+        for k, v in asdict(self).items():
+            if isinstance(v, timedelta):
+                formatted_dict[k] = format_duration(v)
+            else:
+                formatted_dict[k] = v
 
-        return make_table([(str(k), str(v)) for k, v in
+        return make_table([(str(k), str(v)) for k, v in formatted_dict.items()], width=_STATISTICS_TABLE_WIDTH)
 
     def to_dict(self) -> dict[str, float | int | list[int]]:
         return {k: v.total_seconds() if isinstance(v, timedelta) else v for k, v in asdict(self).items()}
@@ -49,7 +58,7 @@ class FinalStatistics:
 class StatisticsState(BaseModel):
     """Statistic data about a crawler run."""
 
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, ser_json_inf_nan='constants')
     stats_id: Annotated[int | None, Field(alias='statsId')] = None
 
     requests_finished: Annotated[int, Field(alias='requestsFinished')] = 0
@@ -68,10 +77,20 @@ class StatisticsState(BaseModel):
     crawler_started_at: Annotated[datetime | None, Field(alias='crawlerStartedAt')] = None
     crawler_last_started_at: Annotated[datetime | None, Field(alias='crawlerLastStartTimestamp')] = None
     crawler_finished_at: Annotated[datetime | None, Field(alias='crawlerFinishedAt')] = None
-
-
-
-
+
+    # Workaround for Pydantic and type checkers when using Annotated with default_factory
+    if TYPE_CHECKING:
+        errors: dict[str, Any] = {}
+        retry_errors: dict[str, Any] = {}
+        requests_with_status_code: dict[str, int] = {}
+    else:
+        errors: Annotated[dict[str, Any], Field(default_factory=dict)]
+        retry_errors: Annotated[dict[str, Any], Field(alias='retryErrors', default_factory=dict)]
+        requests_with_status_code: Annotated[
+            dict[str, int],
+            Field(alias='requestsWithStatusCode', default_factory=dict),
+        ]
+
     stats_persisted_at: Annotated[
         datetime | None, Field(alias='statsPersistedAt'), PlainSerializer(lambda _: datetime.now(timezone.utc))
     ] = None
@@ -85,22 +104,53 @@ class StatisticsState(BaseModel):
         ),
     ] = {}
 
-
+    # Used to track the crawler runtime, that had already been persisted. This is the runtime from previous runs.
+    _runtime_offset: Annotated[timedelta, Field(exclude=True)] = timedelta()
+
+    def model_post_init(self, /, __context: Any) -> None:
+        self._runtime_offset = self.crawler_runtime or self._runtime_offset
+
+    @property
+    def crawler_runtime(self) -> timedelta:
+        if self.crawler_last_started_at:
+            finished_at = self.crawler_finished_at or datetime.now(timezone.utc)
+            return self._runtime_offset + finished_at - self.crawler_last_started_at
+        return self._runtime_offset
+
+    @crawler_runtime.setter
+    def crawler_runtime(self, value: timedelta) -> None:
+        # Setter for backwards compatibility only, the crawler_runtime is now computed_field, and cant be set manually.
+        # To be removed in v2 release https://github.com/apify/crawlee-python/issues/1567
+        warnings.warn(
+            f"Setting 'crawler_runtime' is deprecated and will be removed in a future version."
+            f' Value {value} will not be used.',
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+    @computed_field(alias='crawlerRuntimeMillis')
+    def crawler_runtime_for_serialization(self) -> timedelta:
+        if self.crawler_last_started_at:
+            finished_at = self.crawler_finished_at or datetime.now(timezone.utc)
+            return self._runtime_offset + finished_at - self.crawler_last_started_at
+        return self._runtime_offset
+
+    @computed_field(alias='requestTotalDurationMillis', return_type=timedelta_ms)
     @property
     def request_total_duration(self) -> timedelta:
        return self.request_total_finished_duration + self.request_total_failed_duration
 
-    @computed_field(alias='requestAvgFailedDurationMillis', return_type=timedelta_ms | None)
+    @computed_field(alias='requestAvgFailedDurationMillis', return_type=timedelta_ms | None)
     @property
     def request_avg_failed_duration(self) -> timedelta | None:
         return (self.request_total_failed_duration / self.requests_failed) if self.requests_failed else None
 
-    @computed_field(alias='requestAvgFinishedDurationMillis', return_type=timedelta_ms | None)
+    @computed_field(alias='requestAvgFinishedDurationMillis', return_type=timedelta_ms | None)
     @property
     def request_avg_finished_duration(self) -> timedelta | None:
         return (self.request_total_finished_duration / self.requests_finished) if self.requests_finished else None
 
-    @computed_field(alias='requestsTotal')
+    @computed_field(alias='requestsTotal')
     @property
     def requests_total(self) -> int:
         return self.requests_failed + self.requests_finished
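With this change the crawler runtime is no longer tracked on the Statistics instance itself; it is derived from the persisted `crawler_last_started_at` / `crawler_finished_at` timestamps plus a `_runtime_offset` carried over from previous runs, and assigning to `crawler_runtime` is kept only as a deprecated no-op. A minimal, illustrative sketch of the new behaviour (not part of the diff):

from datetime import timedelta

from crawlee.statistics import StatisticsState

state = StatisticsState()

# The runtime is computed from the start/finish timestamps plus the offset
# persisted from earlier runs; a fresh state reports a zero runtime.
print(state.crawler_runtime)  # timedelta(0)

# The setter exists only for backwards compatibility: it emits a
# DeprecationWarning and the assigned value is ignored.
state.crawler_runtime = timedelta(seconds=42)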
crawlee/statistics/_statistics.py
CHANGED
@@ -1,6 +1,7 @@
 # Inspiration: https://github.com/apify/crawlee/blob/v3.9.2/packages/core/src/crawlers/statistics.ts
 from __future__ import annotations
 
+import asyncio
 import math
 import time
 from datetime import datetime, timedelta, timezone
@@ -17,8 +18,11 @@ from crawlee.statistics import FinalStatistics, StatisticsState
 from crawlee.statistics._error_tracker import ErrorTracker
 
 if TYPE_CHECKING:
+    from collections.abc import Callable, Coroutine
     from types import TracebackType
 
+    from crawlee.storages import KeyValueStore
+
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 TNewStatisticsState = TypeVar('TNewStatisticsState', bound=StatisticsState, default=StatisticsState)
 logger = getLogger(__name__)
@@ -70,6 +74,7 @@ class Statistics(Generic[TStatisticsState]):
         persistence_enabled: bool | Literal['explicit_only'] = False,
         persist_state_kvs_name: str | None = None,
         persist_state_key: str | None = None,
+        persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
         log_message: str = 'Statistics',
         periodic_message_logger: Logger | None = None,
         log_interval: timedelta = timedelta(minutes=1),
@@ -80,8 +85,6 @@ class Statistics(Generic[TStatisticsState]):
         self._id = Statistics.__next_id
         Statistics.__next_id += 1
 
-        self._instance_start: datetime | None = None
-
         self.error_tracker = ErrorTracker(
             save_error_snapshots=save_error_snapshots,
             snapshot_kvs_name=persist_state_kvs_name,
@@ -92,9 +95,10 @@ class Statistics(Generic[TStatisticsState]):
 
         self._state = RecoverableState(
             default_state=state_model(stats_id=self._id),
-            persist_state_key=persist_state_key or f'
+            persist_state_key=persist_state_key or f'__CRAWLER_STATISTICS_{self._id}',
             persistence_enabled=persistence_enabled,
             persist_state_kvs_name=persist_state_kvs_name,
+            persist_state_kvs_factory=persist_state_kvs_factory,
             logger=logger,
         )
 
@@ -110,8 +114,8 @@ class Statistics(Generic[TStatisticsState]):
         """Create near copy of the `Statistics` with replaced `state_model`."""
         new_statistics: Statistics[TNewStatisticsState] = Statistics(
             persistence_enabled=self._state._persistence_enabled,  # noqa: SLF001
-            persist_state_kvs_name=self._state._persist_state_kvs_name,  # noqa: SLF001
             persist_state_key=self._state._persist_state_key,  # noqa: SLF001
+            persist_state_kvs_factory=self._state._persist_state_kvs_factory,  # noqa: SLF001
             log_message=self._log_message,
             periodic_message_logger=self._periodic_message_logger,
             state_model=state_model,
@@ -125,6 +129,7 @@ class Statistics(Generic[TStatisticsState]):
         persistence_enabled: bool = False,
         persist_state_kvs_name: str | None = None,
         persist_state_key: str | None = None,
+        persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
         log_message: str = 'Statistics',
         periodic_message_logger: Logger | None = None,
         log_interval: timedelta = timedelta(minutes=1),
@@ -136,6 +141,7 @@ class Statistics(Generic[TStatisticsState]):
             persistence_enabled=persistence_enabled,
             persist_state_kvs_name=persist_state_kvs_name,
             persist_state_key=persist_state_key,
+            persist_state_kvs_factory=persist_state_kvs_factory,
             log_message=log_message,
             periodic_message_logger=periodic_message_logger,
             log_interval=log_interval,
@@ -158,14 +164,17 @@ class Statistics(Generic[TStatisticsState]):
         if self._active:
             raise RuntimeError(f'The {self.__class__.__name__} is already active.')
 
-        self._active = True
-        self._instance_start = datetime.now(timezone.utc)
-
         await self._state.initialize()
-
+        # Reset `crawler_finished_at` to indicate a new run in progress.
+        self.state.crawler_finished_at = None
 
+        # Start periodic logging and let it print initial state before activation.
         self._periodic_logger.start()
+        await asyncio.sleep(0.01)
+        self._active = True
 
+        self.state.crawler_last_started_at = datetime.now(timezone.utc)
+        self.state.crawler_started_at = self.state.crawler_started_at or self.state.crawler_last_started_at
         return self
 
     async def __aexit__(
@@ -182,13 +191,14 @@ class Statistics(Generic[TStatisticsState]):
         if not self._active:
             raise RuntimeError(f'The {self.__class__.__name__} is not active.')
 
-
-
-        await self._state.teardown()
+        if not self.state.crawler_last_started_at:
+            raise RuntimeError('Statistics.state.crawler_last_started_at not set.')
 
+        # Stop logging and deactivate the statistics to prevent further changes to crawler_runtime
         await self._periodic_logger.stop()
-
+        self.state.crawler_finished_at = datetime.now(timezone.utc)
         self._active = False
+        await self._state.teardown()
 
     @property
     def state(self) -> TStatisticsState:
@@ -247,11 +257,7 @@ class Statistics(Generic[TStatisticsState]):
 
     def calculate(self) -> FinalStatistics:
         """Calculate the current statistics."""
-
-            raise RuntimeError('The Statistics object is not initialized')
-
-        crawler_runtime = datetime.now(timezone.utc) - self._instance_start
-        total_minutes = crawler_runtime.total_seconds() / 60
+        total_minutes = self.state.crawler_runtime.total_seconds() / 60
         state = self._state.current_value
         serialized_state = state.model_dump(by_alias=False)
 
@@ -262,7 +268,7 @@ class Statistics(Generic[TStatisticsState]):
             requests_failed_per_minute=math.floor(state.requests_failed / total_minutes) if total_minutes else 0,
             request_total_duration=state.request_total_finished_duration + state.request_total_failed_duration,
             requests_total=state.requests_failed + state.requests_finished,
-            crawler_runtime=crawler_runtime,
+            crawler_runtime=state.crawler_runtime,
             requests_finished=state.requests_finished,
             requests_failed=state.requests_failed,
             retry_histogram=serialized_state['request_retry_histogram'],
@@ -282,21 +288,6 @@ class Statistics(Generic[TStatisticsState]):
         else:
             self._periodic_message_logger.info(self._log_message, extra=stats.to_dict())
 
-    def _after_initialize(self) -> None:
-        state = self._state.current_value
-
-        if state.crawler_started_at is None:
-            state.crawler_started_at = datetime.now(timezone.utc)
-
-        if state.stats_persisted_at is not None and state.crawler_last_started_at:
-            self._instance_start = datetime.now(timezone.utc) - (
-                state.stats_persisted_at - state.crawler_last_started_at
-            )
-        elif state.crawler_last_started_at:
-            self._instance_start = state.crawler_last_started_at
-
-        state.crawler_last_started_at = self._instance_start
-
     def _save_retry_count_for_request(self, record: RequestProcessingRecord) -> None:
         retry_count = record.retry_count
         state = self._state.current_value
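The `Statistics` class now accepts a `persist_state_kvs_factory`, an async callable returning the `KeyValueStore` used to persist the state, and forwards it to `RecoverableState` (and to copies created when the state model is replaced). A hedged usage sketch, assuming the `with_default_state` factory and a store name that are illustrative only:

from crawlee.statistics import Statistics
from crawlee.storages import KeyValueStore


async def open_stats_kvs() -> KeyValueStore:
    # Open (or create) the key-value store the statistics should persist into.
    return await KeyValueStore.open(name='my-crawler-stats')


statistics = Statistics.with_default_state(
    persistence_enabled=True,
    persist_state_kvs_factory=open_stats_kvs,
)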
crawlee/storage_clients/__init__.py
CHANGED
@@ -1,9 +1,25 @@
+from crawlee._utils.try_import import install_import_hook as _install_import_hook
+from crawlee._utils.try_import import try_import as _try_import
+
+# These imports have only mandatory dependencies, so they are imported directly.
 from ._base import StorageClient
 from ._file_system import FileSystemStorageClient
 from ._memory import MemoryStorageClient
 
+_install_import_hook(__name__)
+
+# The following imports are wrapped in try_import to handle optional dependencies,
+# ensuring the module can still function even if these dependencies are missing.
+with _try_import(__name__, 'SqlStorageClient'):
+    from ._sql import SqlStorageClient
+
+with _try_import(__name__, 'RedisStorageClient'):
+    from ._redis import RedisStorageClient
+
 __all__ = [
     'FileSystemStorageClient',
     'MemoryStorageClient',
+    'RedisStorageClient',
+    'SqlStorageClient',
     'StorageClient',
 ]
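`SqlStorageClient` and `RedisStorageClient` are re-exported lazily behind `try_import`, so `crawlee.storage_clients` keeps importing even when the optional SQL or Redis dependencies are absent; an error surfaces only when the wrapped name is actually used. A hedged sketch of switching the default storage backend (the `service_locator` call reflects the existing crawlee API, and the no-argument constructor is an assumption, not taken from this diff):

from crawlee import service_locator
from crawlee.storage_clients import SqlStorageClient

# Assumed to fall back to a default SQLite database when no connection
# details are supplied; pass your own connection settings otherwise.
service_locator.set_storage_client(SqlStorageClient())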
crawlee/storage_clients/_base/_dataset_client.py
CHANGED
@@ -87,8 +87,8 @@ class DatasetClient(ABC):
 
         The backend method for the `Dataset.iterate_items` call.
         """
-        # This syntax is to make
+        # This syntax is to make type checker properly work with abstract AsyncIterator.
         # https://mypy.readthedocs.io/en/stable/more_types.html#asynchronous-iterators
         raise NotImplementedError
-        if False:
+        if False:
             yield 0
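The completed comment describes the standard mypy idiom (also used in the key-value store client below) for typed abstract async generators: the unreachable `yield` after `raise NotImplementedError` tells the type checker the method is an async generator, while concrete implementations override it. A standalone illustration of the idiom (not crawlee code):

from abc import ABC, abstractmethod
from collections.abc import AsyncIterator


class Source(ABC):
    @abstractmethod
    async def iterate_items(self) -> AsyncIterator[int]:
        """Iterate over items; subclasses provide the real async generator."""
        raise NotImplementedError
        if False:  # unreachable, but makes the abstract method an async generator
            yield 0


class RangeSource(Source):
    async def iterate_items(self) -> AsyncIterator[int]:
        for i in range(3):
            yield i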
crawlee/storage_clients/_base/_key_value_store_client.py
CHANGED
@@ -72,10 +72,10 @@ class KeyValueStoreClient(ABC):
 
         The backend method for the `KeyValueStore.iterate_keys` call.
         """
-        # This syntax is to make
+        # This syntax is to make type checker properly work with abstract AsyncIterator.
         # https://mypy.readthedocs.io/en/stable/more_types.html#asynchronous-iterators
         raise NotImplementedError
-        if False:
+        if False:
             yield 0
 
     @abstractmethod
crawlee/storage_clients/_base/_request_queue_client.py
CHANGED
@@ -63,11 +63,11 @@ class RequestQueueClient(ABC):
         """
 
     @abstractmethod
-    async def get_request(self,
+    async def get_request(self, unique_key: str) -> Request | None:
         """Retrieve a request from the queue.
 
         Args:
-
+            unique_key: Unique key of the request to retrieve.
 
         Returns:
             The retrieved request, or None, if it did not exist.
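`get_request` is now keyed by the request's `unique_key` rather than a separate identifier. A hedged usage sketch (the `queue_client` parameter stands for any concrete `RequestQueueClient` implementation):

async def find_request(queue_client, unique_key: str):
    # Look the request up by its unique key; None is returned when the
    # queue holds no request with that key.
    return await queue_client.get_request(unique_key)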
crawlee/storage_clients/_base/_storage_client.py
CHANGED
@@ -6,6 +6,8 @@ from typing import TYPE_CHECKING
 from crawlee._utils.docs import docs_group
 
 if TYPE_CHECKING:
+    from collections.abc import Hashable
+
     from crawlee.configuration import Configuration
 
     from ._dataset_client import DatasetClient
@@ -28,12 +30,21 @@ class StorageClient(ABC):
     (where applicable), and consistent access patterns across all storage types it supports.
     """
 
+    def get_storage_client_cache_key(self, configuration: Configuration) -> Hashable:  # noqa: ARG002
+        """Return a cache key that can differentiate between different storages of this and other clients.
+
+        Can be based on configuration or on the client itself. By default, returns a module and name of the client
+        class.
+        """
+        return f'{self.__class__.__module__}.{self.__class__.__name__}'
+
     @abstractmethod
     async def create_dataset_client(
         self,
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
         configuration: Configuration | None = None,
     ) -> DatasetClient:
         """Create a dataset client."""
@@ -44,6 +55,7 @@ class StorageClient(ABC):
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
         configuration: Configuration | None = None,
     ) -> KeyValueStoreClient:
         """Create a key-value store client."""
@@ -54,6 +66,7 @@ class StorageClient(ABC):
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
         configuration: Configuration | None = None,
     ) -> RequestQueueClient:
         """Create a request queue client."""
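`get_storage_client_cache_key` lets storage client implementations control how opened storages are cached and told apart, and the new `alias` parameter threads through all three `create_*_client` factories. A hedged sketch of overriding the cache key in a custom client (class name and attributes are illustrative; the abstract `create_*_client` methods would still need to be implemented):

from collections.abc import Hashable

from crawlee.configuration import Configuration
from crawlee.storage_clients import StorageClient


class MyStorageClient(StorageClient):
    def __init__(self, endpoint: str) -> None:
        self._endpoint = endpoint

    def get_storage_client_cache_key(self, configuration: Configuration) -> Hashable:
        # Cache storages per endpoint instead of only per client class.
        return (self.__class__.__name__, self._endpoint)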
crawlee/storage_clients/_file_system/_dataset_client.py
CHANGED
@@ -9,11 +9,12 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Any
 
 from pydantic import ValidationError
-from typing_extensions import override
+from typing_extensions import Self, override
 
 from crawlee._consts import METADATA_FILENAME
 from crawlee._utils.crypto import crypto_random_object_id
 from crawlee._utils.file import atomic_write, json_dumps
+from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
 from crawlee.storage_clients._base import DatasetClient
 from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
 
@@ -56,7 +57,7 @@ class FileSystemDatasetClient(DatasetClient):
         self,
         *,
         metadata: DatasetMetadata,
-
+        path_to_dataset: Path,
         lock: asyncio.Lock,
     ) -> None:
         """Initialize a new instance.
@@ -65,8 +66,8 @@ class FileSystemDatasetClient(DatasetClient):
         """
         self._metadata = metadata
 
-        self.
-        """The
+        self._path_to_dataset = path_to_dataset
+        """The full path to the dataset directory."""
 
         self._lock = lock
         """A lock to ensure that only one operation is performed at a time."""
@@ -78,10 +79,7 @@ class FileSystemDatasetClient(DatasetClient):
     @property
     def path_to_dataset(self) -> Path:
         """The full path to the dataset directory."""
-
-        return self._storage_dir / self._STORAGE_SUBDIR / self._STORAGE_SUBSUBDIR_DEFAULT
-
-        return self._storage_dir / self._STORAGE_SUBDIR / self._metadata.name
+        return self._path_to_dataset
 
     @property
     def path_to_metadata(self) -> Path:
@@ -94,8 +92,9 @@ class FileSystemDatasetClient(DatasetClient):
         *,
         id: str | None,
         name: str | None,
+        alias: str | None,
         configuration: Configuration,
-    ) ->
+    ) -> Self:
         """Open or create a file system dataset client.
 
         This method attempts to open an existing dataset from the file system. If a dataset with the specified ID
@@ -104,17 +103,21 @@ class FileSystemDatasetClient(DatasetClient):
 
         Args:
             id: The ID of the dataset to open. If provided, searches for existing dataset by ID.
-            name: The name of the dataset
+            name: The name of the dataset for named (global scope) storages.
+            alias: The alias of the dataset for unnamed (run scope) storages.
             configuration: The configuration object containing storage directory settings.
 
         Returns:
             An instance for the opened or created storage client.
 
         Raises:
-            ValueError: If a dataset with the specified ID is not found,
+            ValueError: If a dataset with the specified ID is not found, if metadata is invalid,
+                or if both name and alias are provided.
         """
-
-
+        # Validate input parameters.
+        raise_if_too_many_kwargs(id=id, name=name, alias=alias)
+
+        dataset_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR
 
         if not dataset_base_path.exists():
             await asyncio.to_thread(dataset_base_path.mkdir, parents=True, exist_ok=True)
@@ -126,19 +129,19 @@ class FileSystemDatasetClient(DatasetClient):
                 if not dataset_dir.is_dir():
                     continue
 
-
-                if not
+                path_to_metadata = dataset_dir / METADATA_FILENAME
+                if not path_to_metadata.exists():
                     continue
 
                 try:
-                    file = await asyncio.to_thread(
+                    file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')
                     try:
                         file_content = json.load(file)
                         metadata = DatasetMetadata(**file_content)
                         if metadata.id == id:
                             client = cls(
                                 metadata=metadata,
-
+                                path_to_dataset=dataset_base_path / dataset_dir,
                                 lock=asyncio.Lock(),
                             )
                             await client._update_metadata(update_accessed_at=True)
@@ -152,16 +155,15 @@ class FileSystemDatasetClient(DatasetClient):
             if not found:
                 raise ValueError(f'Dataset with ID "{id}" not found')
 
-        # Get a new instance by name.
+        # Get a new instance by name or alias.
         else:
-
-
-
-            metadata_path = dataset_path / METADATA_FILENAME
+            dataset_dir = Path(name) if name else Path(alias) if alias else Path('default')
+            path_to_dataset = dataset_base_path / dataset_dir
+            path_to_metadata = path_to_dataset / METADATA_FILENAME
 
             # If the dataset directory exists, reconstruct the client from the metadata file.
-            if
-                file = await asyncio.to_thread(open,
+            if path_to_dataset.exists() and path_to_metadata.exists():
+                file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')
                 try:
                     file_content = json.load(file)
                 finally:
@@ -169,11 +171,11 @@ class FileSystemDatasetClient(DatasetClient):
                 try:
                     metadata = DatasetMetadata(**file_content)
                 except ValidationError as exc:
-                    raise ValueError(f'Invalid metadata file for dataset "{name}"') from exc
+                    raise ValueError(f'Invalid metadata file for dataset "{name or alias}"') from exc
 
                 client = cls(
                     metadata=metadata,
-
+                    path_to_dataset=path_to_dataset,
                     lock=asyncio.Lock(),
                 )
 
@@ -192,7 +194,7 @@ class FileSystemDatasetClient(DatasetClient):
                 )
                 client = cls(
                     metadata=metadata,
-
+                    path_to_dataset=path_to_dataset,
                     lock=asyncio.Lock(),
                 )
                 await client._update_metadata()
@@ -471,9 +473,10 @@ class FileSystemDatasetClient(DatasetClient):
         """
         # Retrieve and sort all JSON files in the dataset directory numerically.
        files = await asyncio.to_thread(
-            sorted
-
-
+            lambda: sorted(
+                self.path_to_dataset.glob('*.json'),
+                key=lambda f: int(f.stem) if f.stem.isdigit() else 0,
+            )
        )
 
        # Remove the metadata file from the list if present.