crawlee 1.0.1b8__py3-none-any.whl → 1.0.5b18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlee/_request.py +31 -20
- crawlee/_service_locator.py +4 -4
- crawlee/_types.py +10 -16
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +15 -0
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +1 -1
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +4 -1
- crawlee/browsers/_playwright_browser_controller.py +1 -1
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +3 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +3 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +33 -13
- crawlee/crawlers/_basic/_basic_crawler.py +23 -12
- crawlee/crawlers/_playwright/_playwright_crawler.py +11 -4
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/otel/crawler_instrumentor.py +3 -3
- crawlee/request_loaders/_sitemap_request_loader.py +5 -0
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_statistics.py +41 -31
- crawlee/storage_clients/__init__.py +4 -0
- crawlee/storage_clients/_file_system/_dataset_client.py +2 -2
- crawlee/storage_clients/_file_system/_key_value_store_client.py +2 -2
- crawlee/storage_clients/_file_system/_request_queue_client.py +26 -8
- crawlee/storage_clients/_memory/_dataset_client.py +2 -2
- crawlee/storage_clients/_memory/_key_value_store_client.py +2 -2
- crawlee/storage_clients/_memory/_request_queue_client.py +2 -2
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +295 -0
- crawlee/storage_clients/_redis/_dataset_client.py +325 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
- crawlee/storage_clients/_redis/_storage_client.py +146 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/_dataset_client.py +2 -2
- crawlee/storage_clients/_sql/_db_models.py +1 -2
- crawlee/storage_clients/_sql/_key_value_store_client.py +5 -4
- crawlee/storage_clients/_sql/_request_queue_client.py +20 -6
- crawlee/storage_clients/_sql/_storage_client.py +10 -1
- crawlee/storages/_base.py +3 -1
- crawlee/storages/_dataset.py +3 -0
- crawlee/storages/_key_value_store.py +8 -2
- crawlee/storages/_request_queue.py +3 -0
- crawlee/storages/_storage_instance_manager.py +9 -1
- crawlee/storages/_utils.py +11 -0
- {crawlee-1.0.1b8.dist-info → crawlee-1.0.5b18.dist-info}/METADATA +9 -5
- {crawlee-1.0.1b8.dist-info → crawlee-1.0.5b18.dist-info}/RECORD +58 -45
- {crawlee-1.0.1b8.dist-info → crawlee-1.0.5b18.dist-info}/WHEEL +0 -0
- {crawlee-1.0.1b8.dist-info → crawlee-1.0.5b18.dist-info}/entry_points.txt +0 -0
- {crawlee-1.0.1b8.dist-info → crawlee-1.0.5b18.dist-info}/licenses/LICENSE +0 -0
crawlee/crawlers/_playwright/_playwright_crawler.py

@@ -114,7 +114,10 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         browser_pool: A `BrowserPool` instance to be used for launching the browsers and getting pages.
         user_data_dir: Path to a user data directory, which stores browser session data like cookies
             and local storage.
-        browser_type: The type of browser to launch
+        browser_type: The type of browser to launch:
+            - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+            - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
+                the system.
             This option should not be used if `browser_pool` is provided.
         browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
             directly to Playwright's `browser_type.launch` method. For more details, refer to the
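The new 'chrome' value is the user-visible change here: it drives a locally installed Google Chrome rather than a Playwright-managed build. A minimal usage sketch (assuming Chrome is installed on the machine):

import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    # 'chrome' launches the locally installed Google Chrome; 'chromium',
    # 'firefox' and 'webkit' use browsers managed by Playwright.
    crawler = PlaywrightCrawler(browser_type='chrome')

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Visited {context.request.url}')

    await crawler.run(['https://example.com'])


asyncio.run(main())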
@@ -153,7 +156,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         ):
             raise ValueError(
                 'You cannot provide `headless`, `browser_type`, `browser_launch_options`, '
-                '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir` '
+                '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir` or '
                 '`fingerprint_generator` arguments when `browser_pool` is provided.'
             )

@@ -366,7 +369,9 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         links_iterator: Iterator[str] = iter(
             [url for element in elements if (url := await element.get_attribute('href')) is not None]
         )
-        links_iterator = to_absolute_url_iterator(context.request.loaded_url or context.request.url, links_iterator)
+        links_iterator = to_absolute_url_iterator(
+            context.request.loaded_url or context.request.url, links_iterator, logger=context.log
+        )

         if robots_txt_file:
             skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
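For context, the enqueue logic resolves extracted hrefs against `context.request.loaded_url or context.request.url`, i.e. the URL the page was actually served from. The base matters whenever a redirect occurred, as the standard library shows:

from urllib.parse import urljoin

requested_url = 'http://example.com/start'          # the URL that was enqueued
loaded_url = 'https://example.com/articles/index'   # the URL after a redirect

# The same relative href resolves to different absolute URLs.
print(urljoin(requested_url, 'page-2'))  # http://example.com/page-2
print(urljoin(loaded_url, 'page-2'))     # https://example.com/articles/page-2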
@@ -494,7 +499,9 @@ class _PlaywrightCrawlerAdditionalOptions(TypedDict):
     """A `BrowserPool` instance to be used for launching the browsers and getting pages."""

     browser_type: NotRequired[BrowserType]
-    """The type of browser to launch
+    """The type of browser to launch:
+    - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+    - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on the system.
     This option should not be used if `browser_pool` is provided."""

     browser_launch_options: NotRequired[Mapping[str, Any]]
crawlee/fingerprint_suite/_header_generator.py

@@ -11,9 +11,9 @@ if TYPE_CHECKING:


 def fingerprint_browser_type_from_playwright_browser_type(
-    playwright_browser_type: Literal['chromium', 'firefox', 'webkit'],
+    playwright_browser_type: Literal['chromium', 'firefox', 'webkit', 'chrome'],
 ) -> SupportedBrowserType:
-    if playwright_browser_type == 'chromium':
+    if playwright_browser_type in {'chromium', 'chrome'}:
         return 'chrome'
     if playwright_browser_type == 'firefox':
         return 'firefox'
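The effect of the widened mapping, stated as assertions:

# Both Chromium-based Playwright browser types share one fingerprint profile.
assert fingerprint_browser_type_from_playwright_browser_type('chromium') == 'chrome'
assert fingerprint_browser_type_from_playwright_browser_type('chrome') == 'chrome'
assert fingerprint_browser_type_from_playwright_browser_type('firefox') == 'firefox'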
crawlee/otel/crawler_instrumentor.py

@@ -69,7 +69,7 @@ class CrawlerInstrumentor(BaseInstrumentor):

         if request_handling_instrumentation:

-            async def
+            async def middleware_wrapper(wrapped: Any, instance: _Middleware, args: Any, kwargs: Any) -> Any:
                 with self._tracer.start_as_current_span(
                     name=f'{instance.generator.__name__}, {wrapped.__name__}',  # type:ignore[attr-defined]  # valid in our context
                     attributes={

@@ -111,8 +111,8 @@ class CrawlerInstrumentor(BaseInstrumentor):
         # Handpicked interesting methods to instrument
         self._instrumented.extend(
             [
-                (_Middleware, 'action',
-                (_Middleware, 'cleanup',
+                (_Middleware, 'action', middleware_wrapper),
+                (_Middleware, 'cleanup', middleware_wrapper),
                 (ContextPipeline, '__call__', context_pipeline_wrapper),
                 (BasicCrawler, '_BasicCrawler__run_task_function', self._simple_async_wrapper),
                 (BasicCrawler, '_commit_request_handler_result', _commit_request_handler_result_wrapper),
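For reference, a sketch of switching the instrumentation on. `instrument()` comes from OpenTelemetry's `BaseInstrumentor`; passing `request_handling_instrumentation` through the constructor is an assumption based on the flag checked above:

from crawlee.otel.crawler_instrumentor import CrawlerInstrumentor

# Registers the wrappers above, including `middleware_wrapper` around
# `_Middleware.action` and `_Middleware.cleanup`.
CrawlerInstrumentor(request_handling_instrumentation=True).instrument()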
crawlee/request_loaders/_sitemap_request_loader.py

@@ -90,6 +90,11 @@ class SitemapRequestLoaderState(BaseModel):
 class SitemapRequestLoader(RequestLoader):
     """A request loader that reads URLs from sitemap(s).

+    The loader is designed to handle sitemaps that follow the format described in the Sitemaps protocol
+    (https://www.sitemaps.org/protocol.html). It supports both XML and plain text sitemap formats.
+    Note that HTML pages containing links are not supported - those should be handled by regular crawlers
+    and the `enqueue_links` functionality.
+
     The loader fetches and parses sitemaps in the background, allowing crawling to start
     before all URLs are loaded. It supports filtering URLs using glob and regex patterns.

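A short usage sketch consistent with the clarified docstring; the constructor arguments shown (`sitemap_urls`, `http_client`) are assumptions, so check the class signature:

import asyncio

from crawlee.http_clients import HttpxHttpClient
from crawlee.request_loaders import SitemapRequestLoader


async def main() -> None:
    loader = SitemapRequestLoader(
        sitemap_urls=['https://example.com/sitemap.xml'],
        http_client=HttpxHttpClient(),
    )
    # URLs become available while sitemaps are still being parsed in the background.
    while request := await loader.fetch_next_request():
        print(request.url)
        await loader.mark_request_as_handled(request)


asyncio.run(main())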
crawlee/sessions/_session_pool.py

@@ -163,7 +163,7 @@ class SessionPool:
     def add_session(self, session: Session) -> None:
         """Add an externally created session to the pool.

-        This is
+        This is intended only for the cases when you want to add a session that was created outside of the pool.
         Otherwise, the pool will create new sessions automatically.

         Args:
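In practice the method covers cases like injecting a session that already carries cookies from a separate login flow; a minimal sketch:

import asyncio

from crawlee.sessions import Session, SessionPool


async def main() -> None:
    async with SessionPool() as pool:
        # A session created outside the pool, e.g. after a manual login.
        session = Session()
        pool.add_session(session)


asyncio.run(main())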
crawlee/statistics/_error_snapshotter.py

@@ -32,7 +32,7 @@ class ErrorSnapshotter:
         """Capture error snapshot and save it to key value store.

         It saves the error snapshot directly to a key value store. It can't use `context.get_key_value_store` because
-        it returns `KeyValueStoreChangeRecords` which is
+        it returns `KeyValueStoreChangeRecords` which is committed to the key value store only if the `RequestHandler`
         returned without an exception. ErrorSnapshotter is on the contrary active only when `RequestHandler` fails with
         an exception.

crawlee/statistics/_statistics.py

@@ -1,6 +1,7 @@
 # Inspiration: https://github.com/apify/crawlee/blob/v3.9.2/packages/core/src/crawlers/statistics.ts
 from __future__ import annotations

+import asyncio
 import math
 import time
 from datetime import datetime, timedelta, timezone

@@ -17,8 +18,11 @@ from crawlee.statistics import FinalStatistics, StatisticsState
 from crawlee.statistics._error_tracker import ErrorTracker

 if TYPE_CHECKING:
+    from collections.abc import Callable, Coroutine
     from types import TracebackType

+    from crawlee.storages import KeyValueStore
+
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 TNewStatisticsState = TypeVar('TNewStatisticsState', bound=StatisticsState, default=StatisticsState)
 logger = getLogger(__name__)
@@ -70,6 +74,7 @@ class Statistics(Generic[TStatisticsState]):
         persistence_enabled: bool | Literal['explicit_only'] = False,
         persist_state_kvs_name: str | None = None,
         persist_state_key: str | None = None,
+        persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
         log_message: str = 'Statistics',
         periodic_message_logger: Logger | None = None,
         log_interval: timedelta = timedelta(minutes=1),
@@ -80,8 +85,6 @@ class Statistics(Generic[TStatisticsState]):
         self._id = Statistics.__next_id
         Statistics.__next_id += 1

-        self._instance_start: datetime | None = None
-
         self.error_tracker = ErrorTracker(
             save_error_snapshots=save_error_snapshots,
             snapshot_kvs_name=persist_state_kvs_name,

@@ -92,9 +95,10 @@ class Statistics(Generic[TStatisticsState]):

         self._state = RecoverableState(
             default_state=state_model(stats_id=self._id),
-            persist_state_key=persist_state_key or f'
+            persist_state_key=persist_state_key or f'__CRAWLER_STATISTICS_{self._id}',
             persistence_enabled=persistence_enabled,
             persist_state_kvs_name=persist_state_kvs_name,
+            persist_state_kvs_factory=persist_state_kvs_factory,
             logger=logger,
         )

@@ -106,12 +110,15 @@ class Statistics(Generic[TStatisticsState]):
         # Flag to indicate the context state.
         self._active = False

+        # Pre-existing runtime offset, that can be non-zero when restoring serialized state from KVS.
+        self._runtime_offset = timedelta(seconds=0)
+
     def replace_state_model(self, state_model: type[TNewStatisticsState]) -> Statistics[TNewStatisticsState]:
         """Create near copy of the `Statistics` with replaced `state_model`."""
         new_statistics: Statistics[TNewStatisticsState] = Statistics(
             persistence_enabled=self._state._persistence_enabled,  # noqa: SLF001
-            persist_state_kvs_name=self._state._persist_state_kvs_name,  # noqa: SLF001
             persist_state_key=self._state._persist_state_key,  # noqa: SLF001
+            persist_state_kvs_factory=self._state._persist_state_kvs_factory,  # noqa: SLF001
             log_message=self._log_message,
             periodic_message_logger=self._periodic_message_logger,
             state_model=state_model,

@@ -125,6 +132,7 @@ class Statistics(Generic[TStatisticsState]):
         persistence_enabled: bool = False,
         persist_state_kvs_name: str | None = None,
         persist_state_key: str | None = None,
+        persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
         log_message: str = 'Statistics',
         periodic_message_logger: Logger | None = None,
         log_interval: timedelta = timedelta(minutes=1),

@@ -136,6 +144,7 @@ class Statistics(Generic[TStatisticsState]):
             persistence_enabled=persistence_enabled,
             persist_state_kvs_name=persist_state_kvs_name,
             persist_state_key=persist_state_key,
+            persist_state_kvs_factory=persist_state_kvs_factory,
             log_message=log_message,
             periodic_message_logger=periodic_message_logger,
             log_interval=log_interval,
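The new `persist_state_kvs_factory` parameter, threaded through the constructor and the factory method above, lets the caller decide which key-value store backs persistence. A sketch (the named store and the `with_default_state` entry point are illustrative):

from crawlee.statistics import Statistics
from crawlee.storages import KeyValueStore


async def kvs_factory() -> KeyValueStore:
    # Any coroutine returning a KeyValueStore works; this one opens a
    # dedicated named store instead of the default one.
    return await KeyValueStore.open(name='crawler-stats')


statistics = Statistics.with_default_state(
    persistence_enabled=True,
    persist_state_kvs_factory=kvs_factory,
)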
@@ -158,14 +167,17 @@ class Statistics(Generic[TStatisticsState]):
         if self._active:
             raise RuntimeError(f'The {self.__class__.__name__} is already active.')

-        self._active = True
-        self._instance_start = datetime.now(timezone.utc)
-
         await self._state.initialize()
-        self._after_initialize()

+        self._runtime_offset = self.state.crawler_runtime
+
+        # Start periodic logging and let it print initial state before activation.
         self._periodic_logger.start()
+        await asyncio.sleep(0.01)
+        self._active = True

+        self.state.crawler_last_started_at = datetime.now(timezone.utc)
+        self.state.crawler_started_at = self.state.crawler_started_at or self.state.crawler_last_started_at
         return self

     async def __aexit__(

@@ -182,13 +194,18 @@ class Statistics(Generic[TStatisticsState]):
         if not self._active:
             raise RuntimeError(f'The {self.__class__.__name__} is not active.')

-
-
-        await self._state.teardown()
+        if not self.state.crawler_last_started_at:
+            raise RuntimeError('Statistics.state.crawler_last_started_at not set.')

+        # Stop logging and deactivate the statistics to prevent further changes to crawler_runtime
         await self._periodic_logger.stop()
+        self.state.crawler_finished_at = datetime.now(timezone.utc)
+        self.state.crawler_runtime = (
+            self._runtime_offset + self.state.crawler_finished_at - self.state.crawler_last_started_at
+        )

         self._active = False
+        await self._state.teardown()

     @property
     def state(self) -> TStatisticsState:
@@ -245,13 +262,21 @@ class Statistics(Generic[TStatisticsState]):

         del self._requests_in_progress[request_id_or_key]

+    def _update_crawler_runtime(self) -> None:
+        current_run_duration = (
+            (datetime.now(timezone.utc) - self.state.crawler_last_started_at)
+            if self.state.crawler_last_started_at
+            else timedelta()
+        )
+        self.state.crawler_runtime = current_run_duration + self._runtime_offset
+
     def calculate(self) -> FinalStatistics:
         """Calculate the current statistics."""
-        if self.
-
+        if self._active:
+            # Only update state when active. If not, just report the last known runtime.
+            self._update_crawler_runtime()

-
-        total_minutes = crawler_runtime.total_seconds() / 60
+        total_minutes = self.state.crawler_runtime.total_seconds() / 60
         state = self._state.current_value
         serialized_state = state.model_dump(by_alias=False)

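The bookkeeping in `_update_crawler_runtime` is plain interval arithmetic: total runtime equals the offset persisted by previous runs plus the duration of the current run. For example:

from datetime import datetime, timedelta, timezone

runtime_offset = timedelta(seconds=90)  # persisted runtime from earlier runs
last_started_at = datetime.now(timezone.utc) - timedelta(seconds=30)

# Mirrors _update_crawler_runtime: current run duration plus the offset.
crawler_runtime = (datetime.now(timezone.utc) - last_started_at) + runtime_offset
print(crawler_runtime)  # approximately 0:02:00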
@@ -262,7 +287,7 @@ class Statistics(Generic[TStatisticsState]):
             requests_failed_per_minute=math.floor(state.requests_failed / total_minutes) if total_minutes else 0,
             request_total_duration=state.request_total_finished_duration + state.request_total_failed_duration,
             requests_total=state.requests_failed + state.requests_finished,
-            crawler_runtime=crawler_runtime,
+            crawler_runtime=state.crawler_runtime,
             requests_finished=state.requests_finished,
             requests_failed=state.requests_failed,
             retry_histogram=serialized_state['request_retry_histogram'],

@@ -282,21 +307,6 @@ class Statistics(Generic[TStatisticsState]):
         else:
             self._periodic_message_logger.info(self._log_message, extra=stats.to_dict())

-    def _after_initialize(self) -> None:
-        state = self._state.current_value
-
-        if state.crawler_started_at is None:
-            state.crawler_started_at = datetime.now(timezone.utc)
-
-        if state.stats_persisted_at is not None and state.crawler_last_started_at:
-            self._instance_start = datetime.now(timezone.utc) - (
-                state.stats_persisted_at - state.crawler_last_started_at
-            )
-        elif state.crawler_last_started_at:
-            self._instance_start = state.crawler_last_started_at
-
-        state.crawler_last_started_at = self._instance_start
-
     def _save_retry_count_for_request(self, record: RequestProcessingRecord) -> None:
         retry_count = record.retry_count
         state = self._state.current_value
crawlee/storage_clients/__init__.py

@@ -13,9 +13,13 @@ _install_import_hook(__name__)
 with _try_import(__name__, 'SqlStorageClient'):
     from ._sql import SqlStorageClient

+with _try_import(__name__, 'RedisStorageClient'):
+    from ._redis import RedisStorageClient
+
 __all__ = [
     'FileSystemStorageClient',
     'MemoryStorageClient',
+    'RedisStorageClient',
     'SqlStorageClient',
     'StorageClient',
 ]
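With the optional import in place, a Redis-backed crawl might look like the sketch below. The `connection_string` parameter name is an assumption (see `_storage_client.py`), and the import only succeeds when the Redis extra is installed:

import asyncio

from crawlee import service_locator
from crawlee.crawlers import HttpCrawler
from crawlee.storage_clients import RedisStorageClient

# Route all storages (datasets, key-value stores, request queues) to Redis.
service_locator.set_storage_client(RedisStorageClient(connection_string='redis://localhost:6379'))


async def main() -> None:
    crawler = HttpCrawler()
    await crawler.run(['https://example.com'])


asyncio.run(main())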
crawlee/storage_clients/_file_system/_dataset_client.py

@@ -9,7 +9,7 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Any

 from pydantic import ValidationError
-from typing_extensions import override
+from typing_extensions import Self, override

 from crawlee._consts import METADATA_FILENAME
 from crawlee._utils.crypto import crypto_random_object_id

@@ -94,7 +94,7 @@ class FileSystemDatasetClient(DatasetClient):
         name: str | None,
         alias: str | None,
         configuration: Configuration,
-    ) -> FileSystemDatasetClient:
+    ) -> Self:
         """Open or create a file system dataset client.

         This method attempts to open an existing dataset from the file system. If a dataset with the specified ID
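This file and the clients below all change `open()` to return `Self`, which stays precise under subclassing. A simplified sketch (the `BaseClient` class is hypothetical) of why the annotation is preferable:

from typing_extensions import Self


class BaseClient:
    @classmethod
    async def open(cls) -> Self:
        # `Self` binds to the subclass `open` is called on, so
        # `await FileSystemDatasetClient.open(...)` type-checks as
        # `FileSystemDatasetClient`, not as the base class.
        return cls()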
crawlee/storage_clients/_file_system/_key_value_store_client.py

@@ -10,7 +10,7 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Any

 from pydantic import ValidationError
-from typing_extensions import override
+from typing_extensions import Self, override

 from crawlee._consts import METADATA_FILENAME
 from crawlee._utils.crypto import crypto_random_object_id

@@ -93,7 +93,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
         name: str | None,
         alias: str | None,
         configuration: Configuration,
-    ) -> FileSystemKeyValueStoreClient:
+    ) -> Self:
         """Open or create a file system key-value store client.

         This method attempts to open an existing key-value store from the file system. If a KVS with the specified
crawlee/storage_clients/_file_system/_request_queue_client.py

@@ -11,7 +11,7 @@ from pathlib import Path
 from typing import TYPE_CHECKING

 from pydantic import BaseModel, ValidationError
-from typing_extensions import override
+from typing_extensions import Self, override

 from crawlee import Request
 from crawlee._consts import METADATA_FILENAME

@@ -31,6 +31,7 @@ if TYPE_CHECKING:
     from collections.abc import Sequence

     from crawlee.configuration import Configuration
+    from crawlee.storages import KeyValueStore

 logger = getLogger(__name__)

@@ -92,6 +93,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         metadata: RequestQueueMetadata,
         path_to_rq: Path,
         lock: asyncio.Lock,
+        recoverable_state: RecoverableState[RequestQueueState],
     ) -> None:
         """Initialize a new instance.

@@ -114,12 +116,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         self._is_empty_cache: bool | None = None
         """Cache for is_empty result: None means unknown, True/False is cached state."""

-        self._state = RecoverableState[RequestQueueState](
-            default_state=RequestQueueState(),
-            persist_state_key=f'__RQ_STATE_{self._metadata.id}',
-            persistence_enabled=True,
-            logger=logger,
-        )
+        self._state = recoverable_state
         """Recoverable state to maintain request ordering, in-progress status, and handled status."""

     @override

@@ -136,6 +133,22 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         """The full path to the request queue metadata file."""
         return self.path_to_rq / METADATA_FILENAME

+    @classmethod
+    async def _create_recoverable_state(cls, id: str, configuration: Configuration) -> RecoverableState:
+        async def kvs_factory() -> KeyValueStore:
+            from crawlee.storage_clients import FileSystemStorageClient  # noqa: PLC0415  avoid circular import
+            from crawlee.storages import KeyValueStore  # noqa: PLC0415  avoid circular import
+
+            return await KeyValueStore.open(storage_client=FileSystemStorageClient(), configuration=configuration)
+
+        return RecoverableState[RequestQueueState](
+            default_state=RequestQueueState(),
+            persist_state_key=f'__RQ_STATE_{id}',
+            persist_state_kvs_factory=kvs_factory,
+            persistence_enabled=True,
+            logger=logger,
+        )
+
     @classmethod
     async def open(
         cls,
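Note the imports deferred into `kvs_factory`: the storages module and the storage clients import each other at module level, so the factory resolves its imports at call time instead. The shape of the trick, with hypothetical modules `store` and `clients`:

# store.py imports clients.py at module level, so clients.py must not
# import store.py at its own top level. Deferring the import into the
# coroutine body delays it until both modules are fully initialized.
def make_kvs_factory():
    async def kvs_factory():
        from store import KeyValueStore  # resolved at call time, no cycle

        return await KeyValueStore.open()

    return kvs_factory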
@@ -144,7 +157,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         name: str | None,
         alias: str | None,
         configuration: Configuration,
-    ) -> FileSystemRequestQueueClient:
+    ) -> Self:
         """Open or create a file system request queue client.

         This method attempts to open an existing request queue from the file system. If a queue with the specified

@@ -194,6 +207,9 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             metadata=metadata,
             path_to_rq=rq_base_path / rq_dir,
             lock=asyncio.Lock(),
+            recoverable_state=await cls._create_recoverable_state(
+                id=id, configuration=configuration
+            ),
         )
         await client._state.initialize()
         await client._discover_existing_requests()

@@ -230,6 +246,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             metadata=metadata,
             path_to_rq=path_to_rq,
             lock=asyncio.Lock(),
+            recoverable_state=await cls._create_recoverable_state(id=metadata.id, configuration=configuration),
         )

         await client._state.initialize()

@@ -254,6 +271,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             metadata=metadata,
             path_to_rq=path_to_rq,
             lock=asyncio.Lock(),
+            recoverable_state=await cls._create_recoverable_state(id=metadata.id, configuration=configuration),
         )
         await client._state.initialize()
         await client._update_metadata()
crawlee/storage_clients/_memory/_dataset_client.py

@@ -4,7 +4,7 @@ from datetime import datetime, timezone
 from logging import getLogger
 from typing import TYPE_CHECKING, Any

-from typing_extensions import override
+from typing_extensions import Self, override

 from crawlee._utils.crypto import crypto_random_object_id
 from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs

@@ -55,7 +55,7 @@ class MemoryDatasetClient(DatasetClient):
         id: str | None,
         name: str | None,
         alias: str | None,
-    ) -> MemoryDatasetClient:
+    ) -> Self:
         """Open or create a new memory dataset client.

         This method creates a new in-memory dataset instance. Unlike persistent storage implementations, memory

crawlee/storage_clients/_memory/_key_value_store_client.py

@@ -4,7 +4,7 @@ import sys
 from datetime import datetime, timezone
 from typing import TYPE_CHECKING, Any

-from typing_extensions import override
+from typing_extensions import Self, override

 from crawlee._utils.crypto import crypto_random_object_id
 from crawlee._utils.file import infer_mime_type

@@ -53,7 +53,7 @@ class MemoryKeyValueStoreClient(KeyValueStoreClient):
         id: str | None,
         name: str | None,
         alias: str | None,
-    ) -> MemoryKeyValueStoreClient:
+    ) -> Self:
         """Open or create a new memory key-value store client.

         This method creates a new in-memory key-value store instance. Unlike persistent storage implementations,

crawlee/storage_clients/_memory/_request_queue_client.py

@@ -6,7 +6,7 @@ from datetime import datetime, timezone
 from logging import getLogger
 from typing import TYPE_CHECKING

-from typing_extensions import override
+from typing_extensions import Self, override

 from crawlee import Request
 from crawlee._utils.crypto import crypto_random_object_id

@@ -65,7 +65,7 @@ class MemoryRequestQueueClient(RequestQueueClient):
         id: str | None,
         name: str | None,
         alias: str | None,
-    ) -> MemoryRequestQueueClient:
+    ) -> Self:
         """Open or create a new memory request queue client.

         This method creates a new in-memory request queue instance. Unlike persistent storage implementations,
crawlee/storage_clients/_redis/__init__.py

@@ -0,0 +1,6 @@
+from ._dataset_client import RedisDatasetClient
+from ._key_value_store_client import RedisKeyValueStoreClient
+from ._request_queue_client import RedisRequestQueueClient
+from ._storage_client import RedisStorageClient
+
+__all__ = ['RedisDatasetClient', 'RedisKeyValueStoreClient', 'RedisRequestQueueClient', 'RedisStorageClient']