crawlee 1.0.1b8__py3-none-any.whl → 1.0.5b18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. crawlee/_request.py +31 -20
  2. crawlee/_service_locator.py +4 -4
  3. crawlee/_types.py +10 -16
  4. crawlee/_utils/recoverable_state.py +32 -8
  5. crawlee/_utils/recurring_task.py +15 -0
  6. crawlee/_utils/robots.py +17 -5
  7. crawlee/_utils/sitemap.py +1 -1
  8. crawlee/_utils/urls.py +9 -2
  9. crawlee/browsers/_browser_pool.py +4 -1
  10. crawlee/browsers/_playwright_browser_controller.py +1 -1
  11. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  12. crawlee/browsers/_types.py +1 -1
  13. crawlee/configuration.py +3 -1
  14. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +3 -1
  15. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +33 -13
  16. crawlee/crawlers/_basic/_basic_crawler.py +23 -12
  17. crawlee/crawlers/_playwright/_playwright_crawler.py +11 -4
  18. crawlee/fingerprint_suite/_header_generator.py +2 -2
  19. crawlee/otel/crawler_instrumentor.py +3 -3
  20. crawlee/request_loaders/_sitemap_request_loader.py +5 -0
  21. crawlee/sessions/_session_pool.py +1 -1
  22. crawlee/statistics/_error_snapshotter.py +1 -1
  23. crawlee/statistics/_statistics.py +41 -31
  24. crawlee/storage_clients/__init__.py +4 -0
  25. crawlee/storage_clients/_file_system/_dataset_client.py +2 -2
  26. crawlee/storage_clients/_file_system/_key_value_store_client.py +2 -2
  27. crawlee/storage_clients/_file_system/_request_queue_client.py +26 -8
  28. crawlee/storage_clients/_memory/_dataset_client.py +2 -2
  29. crawlee/storage_clients/_memory/_key_value_store_client.py +2 -2
  30. crawlee/storage_clients/_memory/_request_queue_client.py +2 -2
  31. crawlee/storage_clients/_redis/__init__.py +6 -0
  32. crawlee/storage_clients/_redis/_client_mixin.py +295 -0
  33. crawlee/storage_clients/_redis/_dataset_client.py +325 -0
  34. crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
  35. crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
  36. crawlee/storage_clients/_redis/_storage_client.py +146 -0
  37. crawlee/storage_clients/_redis/_utils.py +23 -0
  38. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  39. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  40. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  41. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  42. crawlee/storage_clients/_redis/py.typed +0 -0
  43. crawlee/storage_clients/_sql/_dataset_client.py +2 -2
  44. crawlee/storage_clients/_sql/_db_models.py +1 -2
  45. crawlee/storage_clients/_sql/_key_value_store_client.py +5 -4
  46. crawlee/storage_clients/_sql/_request_queue_client.py +20 -6
  47. crawlee/storage_clients/_sql/_storage_client.py +10 -1
  48. crawlee/storages/_base.py +3 -1
  49. crawlee/storages/_dataset.py +3 -0
  50. crawlee/storages/_key_value_store.py +8 -2
  51. crawlee/storages/_request_queue.py +3 -0
  52. crawlee/storages/_storage_instance_manager.py +9 -1
  53. crawlee/storages/_utils.py +11 -0
  54. {crawlee-1.0.1b8.dist-info → crawlee-1.0.5b18.dist-info}/METADATA +9 -5
  55. {crawlee-1.0.1b8.dist-info → crawlee-1.0.5b18.dist-info}/RECORD +58 -45
  56. {crawlee-1.0.1b8.dist-info → crawlee-1.0.5b18.dist-info}/WHEEL +0 -0
  57. {crawlee-1.0.1b8.dist-info → crawlee-1.0.5b18.dist-info}/entry_points.txt +0 -0
  58. {crawlee-1.0.1b8.dist-info → crawlee-1.0.5b18.dist-info}/licenses/LICENSE +0 -0
@@ -114,7 +114,10 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
  browser_pool: A `BrowserPool` instance to be used for launching the browsers and getting pages.
  user_data_dir: Path to a user data directory, which stores browser session data like cookies
  and local storage.
- browser_type: The type of browser to launch ('chromium', 'firefox', or 'webkit').
+ browser_type: The type of browser to launch:
+ - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+ - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
+ the system.
  This option should not be used if `browser_pool` is provided.
  browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
  directly to Playwright's `browser_type.launch` method. For more details, refer to the
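The new 'chrome' option above can be exercised roughly as follows; this is a minimal sketch based only on the documented option, and the handler body and target URL are illustrative:

import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    # Per the docstring change above, 'chrome' drives a locally installed
    # Google Chrome instead of the Playwright-managed Chromium build.
    crawler = PlaywrightCrawler(browser_type='chrome')

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Visited {context.request.url}')

    await crawler.run(['https://example.com'])


asyncio.run(main())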
@@ -153,7 +156,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
  ):
  raise ValueError(
  'You cannot provide `headless`, `browser_type`, `browser_launch_options`, '
- '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir` or'
+ '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir` or '
  '`fingerprint_generator` arguments when `browser_pool` is provided.'
  )

@@ -366,7 +369,9 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
  links_iterator: Iterator[str] = iter(
  [url for element in elements if (url := await element.get_attribute('href')) is not None]
  )
- links_iterator = to_absolute_url_iterator(context.request.loaded_url or context.request.url, links_iterator)
+ links_iterator = to_absolute_url_iterator(
+ context.request.loaded_url or context.request.url, links_iterator, logger=context.log
+ )

  if robots_txt_file:
  skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
@@ -494,7 +499,9 @@ class _PlaywrightCrawlerAdditionalOptions(TypedDict):
  """A `BrowserPool` instance to be used for launching the browsers and getting pages."""

  browser_type: NotRequired[BrowserType]
- """The type of browser to launch ('chromium', 'firefox', or 'webkit').
+ """The type of browser to launch:
+ - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+ - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on the system.
  This option should not be used if `browser_pool` is provided."""

  browser_launch_options: NotRequired[Mapping[str, Any]]
@@ -11,9 +11,9 @@ if TYPE_CHECKING:


  def fingerprint_browser_type_from_playwright_browser_type(
- playwright_browser_type: Literal['chromium', 'firefox', 'webkit'],
+ playwright_browser_type: Literal['chromium', 'firefox', 'webkit', 'chrome'],
  ) -> SupportedBrowserType:
- if playwright_browser_type == 'chromium':
+ if playwright_browser_type in {'chromium', 'chrome'}:
  return 'chrome'
  if playwright_browser_type == 'firefox':
  return 'firefox'
@@ -69,7 +69,7 @@ class CrawlerInstrumentor(BaseInstrumentor):

  if request_handling_instrumentation:

- async def middlware_wrapper(wrapped: Any, instance: _Middleware, args: Any, kwargs: Any) -> Any:
+ async def middleware_wrapper(wrapped: Any, instance: _Middleware, args: Any, kwargs: Any) -> Any:
  with self._tracer.start_as_current_span(
  name=f'{instance.generator.__name__}, {wrapped.__name__}', # type:ignore[attr-defined] # valid in our context
  attributes={
@@ -111,8 +111,8 @@ class CrawlerInstrumentor(BaseInstrumentor):
  # Handpicked interesting methods to instrument
  self._instrumented.extend(
  [
- (_Middleware, 'action', middlware_wrapper),
- (_Middleware, 'cleanup', middlware_wrapper),
+ (_Middleware, 'action', middleware_wrapper),
+ (_Middleware, 'cleanup', middleware_wrapper),
  (ContextPipeline, '__call__', context_pipeline_wrapper),
  (BasicCrawler, '_BasicCrawler__run_task_function', self._simple_async_wrapper),
  (BasicCrawler, '_commit_request_handler_result', _commit_request_handler_result_wrapper),
@@ -90,6 +90,11 @@ class SitemapRequestLoaderState(BaseModel):
  class SitemapRequestLoader(RequestLoader):
  """A request loader that reads URLs from sitemap(s).

+ The loader is designed to handle sitemaps that follow the format described in the Sitemaps protocol
+ (https://www.sitemaps.org/protocol.html). It supports both XML and plain text sitemap formats.
+ Note that HTML pages containing links are not supported - those should be handled by regular crawlers
+ and the `enqueue_links` functionality.
+
  The loader fetches and parses sitemaps in the background, allowing crawling to start
  before all URLs are loaded. It supports filtering URLs using glob and regex patterns.

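For orientation, a sketch of how the loader is typically constructed; the keyword arguments shown (sitemap_urls, http_client, include) are assumptions about the signature, not taken from this diff:

import asyncio

from crawlee import Glob
from crawlee.http_clients import HttpxHttpClient
from crawlee.request_loaders import SitemapRequestLoader


async def main() -> None:
    # Point the loader at one or more XML or plain text sitemaps.
    loader = SitemapRequestLoader(
        sitemap_urls=['https://crawlee.dev/sitemap.xml'],
        http_client=HttpxHttpClient(),
        include=[Glob('https://crawlee.dev/docs/**')],
    )
    # Requests become available as sitemaps are parsed in the background.
    request = await loader.fetch_next_request()
    print(request.url if request else 'No request available yet.')


asyncio.run(main())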
@@ -163,7 +163,7 @@ class SessionPool:
  def add_session(self, session: Session) -> None:
  """Add an externally created session to the pool.

- This is intened only for the cases when you want to add a session that was created outside of the pool.
+ This is intended only for the cases when you want to add a session that was created outside of the pool.
  Otherwise, the pool will create new sessions automatically.

  Args:
@@ -32,7 +32,7 @@ class ErrorSnapshotter:
  """Capture error snapshot and save it to key value store.

  It saves the error snapshot directly to a key value store. It can't use `context.get_key_value_store` because
- it returns `KeyValueStoreChangeRecords` which is commited to the key value store only if the `RequestHandler`
+ it returns `KeyValueStoreChangeRecords` which is committed to the key value store only if the `RequestHandler`
  returned without an exception. ErrorSnapshotter is on the contrary active only when `RequestHandler` fails with
  an exception.

@@ -1,6 +1,7 @@
  # Inspiration: https://github.com/apify/crawlee/blob/v3.9.2/packages/core/src/crawlers/statistics.ts
  from __future__ import annotations

+ import asyncio
  import math
  import time
  from datetime import datetime, timedelta, timezone
@@ -17,8 +18,11 @@ from crawlee.statistics import FinalStatistics, StatisticsState
  from crawlee.statistics._error_tracker import ErrorTracker

  if TYPE_CHECKING:
+ from collections.abc import Callable, Coroutine
  from types import TracebackType

+ from crawlee.storages import KeyValueStore
+
  TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
  TNewStatisticsState = TypeVar('TNewStatisticsState', bound=StatisticsState, default=StatisticsState)
  logger = getLogger(__name__)
@@ -70,6 +74,7 @@ class Statistics(Generic[TStatisticsState]):
  persistence_enabled: bool | Literal['explicit_only'] = False,
  persist_state_kvs_name: str | None = None,
  persist_state_key: str | None = None,
+ persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
  log_message: str = 'Statistics',
  periodic_message_logger: Logger | None = None,
  log_interval: timedelta = timedelta(minutes=1),
@@ -80,8 +85,6 @@
  self._id = Statistics.__next_id
  Statistics.__next_id += 1

- self._instance_start: datetime | None = None
-
  self.error_tracker = ErrorTracker(
  save_error_snapshots=save_error_snapshots,
  snapshot_kvs_name=persist_state_kvs_name,
@@ -92,9 +95,10 @@

  self._state = RecoverableState(
  default_state=state_model(stats_id=self._id),
- persist_state_key=persist_state_key or f'SDK_CRAWLER_STATISTICS_{self._id}',
+ persist_state_key=persist_state_key or f'__CRAWLER_STATISTICS_{self._id}',
  persistence_enabled=persistence_enabled,
  persist_state_kvs_name=persist_state_kvs_name,
+ persist_state_kvs_factory=persist_state_kvs_factory,
  logger=logger,
  )

@@ -106,12 +110,15 @@
  # Flag to indicate the context state.
  self._active = False

+ # Pre-existing runtime offset, that can be non-zero when restoring serialized state from KVS.
+ self._runtime_offset = timedelta(seconds=0)
+
  def replace_state_model(self, state_model: type[TNewStatisticsState]) -> Statistics[TNewStatisticsState]:
  """Create near copy of the `Statistics` with replaced `state_model`."""
  new_statistics: Statistics[TNewStatisticsState] = Statistics(
  persistence_enabled=self._state._persistence_enabled, # noqa: SLF001
- persist_state_kvs_name=self._state._persist_state_kvs_name, # noqa: SLF001
  persist_state_key=self._state._persist_state_key, # noqa: SLF001
+ persist_state_kvs_factory=self._state._persist_state_kvs_factory, # noqa: SLF001
  log_message=self._log_message,
  periodic_message_logger=self._periodic_message_logger,
  state_model=state_model,
@@ -125,6 +132,7 @@ class Statistics(Generic[TStatisticsState]):
  persistence_enabled: bool = False,
  persist_state_kvs_name: str | None = None,
  persist_state_key: str | None = None,
+ persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
  log_message: str = 'Statistics',
  periodic_message_logger: Logger | None = None,
  log_interval: timedelta = timedelta(minutes=1),
@@ -136,6 +144,7 @@ class Statistics(Generic[TStatisticsState]):
  persistence_enabled=persistence_enabled,
  persist_state_kvs_name=persist_state_kvs_name,
  persist_state_key=persist_state_key,
+ persist_state_kvs_factory=persist_state_kvs_factory,
  log_message=log_message,
  periodic_message_logger=periodic_message_logger,
  log_interval=log_interval,
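The new persist_state_kvs_factory parameter lets callers supply the key-value store that backs the persisted statistics. A hedged sketch of passing it in; the with_default_state entry point and store name are illustrative only:

from crawlee.statistics import Statistics
from crawlee.storages import KeyValueStore


async def stats_kvs_factory() -> KeyValueStore:
    # Any coroutine function returning a KeyValueStore satisfies the
    # Callable[[], Coroutine[None, None, KeyValueStore]] annotation above.
    return await KeyValueStore.open(name='crawler-statistics')


statistics = Statistics.with_default_state(
    persistence_enabled=True,
    persist_state_kvs_factory=stats_kvs_factory,
)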
@@ -158,14 +167,17 @@
  if self._active:
  raise RuntimeError(f'The {self.__class__.__name__} is already active.')

- self._active = True
- self._instance_start = datetime.now(timezone.utc)
-
  await self._state.initialize()
- self._after_initialize()

+ self._runtime_offset = self.state.crawler_runtime
+
+ # Start periodic logging and let it print initial state before activation.
  self._periodic_logger.start()
+ await asyncio.sleep(0.01)
+ self._active = True

+ self.state.crawler_last_started_at = datetime.now(timezone.utc)
+ self.state.crawler_started_at = self.state.crawler_started_at or self.state.crawler_last_started_at
  return self

  async def __aexit__(
@@ -182,13 +194,18 @@
  if not self._active:
  raise RuntimeError(f'The {self.__class__.__name__} is not active.')

- self._state.current_value.crawler_finished_at = datetime.now(timezone.utc)
-
- await self._state.teardown()
+ if not self.state.crawler_last_started_at:
+ raise RuntimeError('Statistics.state.crawler_last_started_at not set.')

+ # Stop logging and deactivate the statistics to prevent further changes to crawler_runtime
  await self._periodic_logger.stop()
+ self.state.crawler_finished_at = datetime.now(timezone.utc)
+ self.state.crawler_runtime = (
+ self._runtime_offset + self.state.crawler_finished_at - self.state.crawler_last_started_at
+ )

  self._active = False
+ await self._state.teardown()

  @property
  def state(self) -> TStatisticsState:
@@ -245,13 +262,21 @@

  del self._requests_in_progress[request_id_or_key]

+ def _update_crawler_runtime(self) -> None:
+ current_run_duration = (
+ (datetime.now(timezone.utc) - self.state.crawler_last_started_at)
+ if self.state.crawler_last_started_at
+ else timedelta()
+ )
+ self.state.crawler_runtime = current_run_duration + self._runtime_offset
+
  def calculate(self) -> FinalStatistics:
  """Calculate the current statistics."""
- if self._instance_start is None:
- raise RuntimeError('The Statistics object is not initialized')
+ if self._active:
+ # Only update state when active. If not, just report the last known runtime.
+ self._update_crawler_runtime()

- crawler_runtime = datetime.now(timezone.utc) - self._instance_start
- total_minutes = crawler_runtime.total_seconds() / 60
+ total_minutes = self.state.crawler_runtime.total_seconds() / 60
  state = self._state.current_value
  serialized_state = state.model_dump(by_alias=False)

@@ -262,7 +287,7 @@
  requests_failed_per_minute=math.floor(state.requests_failed / total_minutes) if total_minutes else 0,
  request_total_duration=state.request_total_finished_duration + state.request_total_failed_duration,
  requests_total=state.requests_failed + state.requests_finished,
- crawler_runtime=crawler_runtime,
+ crawler_runtime=state.crawler_runtime,
  requests_finished=state.requests_finished,
  requests_failed=state.requests_failed,
  retry_histogram=serialized_state['request_retry_histogram'],
@@ -282,21 +307,6 @@
  else:
  self._periodic_message_logger.info(self._log_message, extra=stats.to_dict())

- def _after_initialize(self) -> None:
- state = self._state.current_value
-
- if state.crawler_started_at is None:
- state.crawler_started_at = datetime.now(timezone.utc)
-
- if state.stats_persisted_at is not None and state.crawler_last_started_at:
- self._instance_start = datetime.now(timezone.utc) - (
- state.stats_persisted_at - state.crawler_last_started_at
- )
- elif state.crawler_last_started_at:
- self._instance_start = state.crawler_last_started_at
-
- state.crawler_last_started_at = self._instance_start
-
  def _save_retry_count_for_request(self, record: RequestProcessingRecord) -> None:
  retry_count = record.retry_count
  state = self._state.current_value
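Taken together, these statistics changes make crawler_runtime accumulate across resumed runs instead of being re-derived from an in-memory start timestamp: the runtime restored from the key-value store becomes _runtime_offset, and the duration of the current run is added on top. A small illustration of the arithmetic with made-up values:

from datetime import datetime, timedelta, timezone

# Runtime persisted by a previous run and restored from the key-value store.
runtime_offset = timedelta(minutes=5)

# The current run started 90 seconds ago.
crawler_last_started_at = datetime.now(timezone.utc) - timedelta(seconds=90)

# Mirrors _update_crawler_runtime(): offset plus the current run's duration.
crawler_runtime = runtime_offset + (datetime.now(timezone.utc) - crawler_last_started_at)
print(crawler_runtime)  # roughly 0:06:30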
@@ -13,9 +13,13 @@ _install_import_hook(__name__)
  with _try_import(__name__, 'SqlStorageClient'):
  from ._sql import SqlStorageClient

+ with _try_import(__name__, 'RedisStorageClient'):
+ from ._redis import RedisStorageClient
+
  __all__ = [
  'FileSystemStorageClient',
  'MemoryStorageClient',
+ 'RedisStorageClient',
  'SqlStorageClient',
  'StorageClient',
  ]
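A hedged sketch of opting into the new Redis-backed storage; the RedisStorageClient constructor argument shown (an asynchronous redis-py connection passed as `redis`) is an assumption, not something this diff confirms:

from redis.asyncio import Redis

from crawlee import service_locator
from crawlee.storage_clients import RedisStorageClient

# Assumption: the client wraps an async Redis connection.
storage_client = RedisStorageClient(redis=Redis.from_url('redis://localhost:6379'))

# Register it so datasets, key-value stores and request queues use Redis.
service_locator.set_storage_client(storage_client)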
@@ -9,7 +9,7 @@ from pathlib import Path
  from typing import TYPE_CHECKING, Any

  from pydantic import ValidationError
- from typing_extensions import override
+ from typing_extensions import Self, override

  from crawlee._consts import METADATA_FILENAME
  from crawlee._utils.crypto import crypto_random_object_id
@@ -94,7 +94,7 @@ class FileSystemDatasetClient(DatasetClient):
  name: str | None,
  alias: str | None,
  configuration: Configuration,
- ) -> FileSystemDatasetClient:
+ ) -> Self:
  """Open or create a file system dataset client.

  This method attempts to open an existing dataset from the file system. If a dataset with the specified ID
@@ -10,7 +10,7 @@ from pathlib import Path
  from typing import TYPE_CHECKING, Any

  from pydantic import ValidationError
- from typing_extensions import override
+ from typing_extensions import Self, override

  from crawlee._consts import METADATA_FILENAME
  from crawlee._utils.crypto import crypto_random_object_id
@@ -93,7 +93,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
  name: str | None,
  alias: str | None,
  configuration: Configuration,
- ) -> FileSystemKeyValueStoreClient:
+ ) -> Self:
  """Open or create a file system key-value store client.

  This method attempts to open an existing key-value store from the file system. If a KVS with the specified
@@ -11,7 +11,7 @@ from pathlib import Path
  from typing import TYPE_CHECKING

  from pydantic import BaseModel, ValidationError
- from typing_extensions import override
+ from typing_extensions import Self, override

  from crawlee import Request
  from crawlee._consts import METADATA_FILENAME
@@ -31,6 +31,7 @@ if TYPE_CHECKING:
  from collections.abc import Sequence

  from crawlee.configuration import Configuration
+ from crawlee.storages import KeyValueStore

  logger = getLogger(__name__)

@@ -92,6 +93,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
  metadata: RequestQueueMetadata,
  path_to_rq: Path,
  lock: asyncio.Lock,
+ recoverable_state: RecoverableState[RequestQueueState],
  ) -> None:
  """Initialize a new instance.

@@ -114,12 +116,7 @@
  self._is_empty_cache: bool | None = None
  """Cache for is_empty result: None means unknown, True/False is cached state."""

- self._state = RecoverableState[RequestQueueState](
- default_state=RequestQueueState(),
- persist_state_key=f'__RQ_STATE_{self._metadata.id}',
- persistence_enabled=True,
- logger=logger,
- )
+ self._state = recoverable_state
  """Recoverable state to maintain request ordering, in-progress status, and handled status."""

  @override
@@ -136,6 +133,22 @@
  """The full path to the request queue metadata file."""
  return self.path_to_rq / METADATA_FILENAME

+ @classmethod
+ async def _create_recoverable_state(cls, id: str, configuration: Configuration) -> RecoverableState:
+ async def kvs_factory() -> KeyValueStore:
+ from crawlee.storage_clients import FileSystemStorageClient # noqa: PLC0415 avoid circular import
+ from crawlee.storages import KeyValueStore # noqa: PLC0415 avoid circular import
+
+ return await KeyValueStore.open(storage_client=FileSystemStorageClient(), configuration=configuration)
+
+ return RecoverableState[RequestQueueState](
+ default_state=RequestQueueState(),
+ persist_state_key=f'__RQ_STATE_{id}',
+ persist_state_kvs_factory=kvs_factory,
+ persistence_enabled=True,
+ logger=logger,
+ )
+
  @classmethod
  async def open(
  cls,
@@ -144,7 +157,7 @@
  name: str | None,
  alias: str | None,
  configuration: Configuration,
- ) -> FileSystemRequestQueueClient:
+ ) -> Self:
  """Open or create a file system request queue client.

  This method attempts to open an existing request queue from the file system. If a queue with the specified
@@ -194,6 +207,9 @@
  metadata=metadata,
  path_to_rq=rq_base_path / rq_dir,
  lock=asyncio.Lock(),
+ recoverable_state=await cls._create_recoverable_state(
+ id=id, configuration=configuration
+ ),
  )
  await client._state.initialize()
  await client._discover_existing_requests()
@@ -230,6 +246,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
  metadata=metadata,
  path_to_rq=path_to_rq,
  lock=asyncio.Lock(),
+ recoverable_state=await cls._create_recoverable_state(id=metadata.id, configuration=configuration),
  )

  await client._state.initialize()
@@ -254,6 +271,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
  metadata=metadata,
  path_to_rq=path_to_rq,
  lock=asyncio.Lock(),
+ recoverable_state=await cls._create_recoverable_state(id=metadata.id, configuration=configuration),
  )
  await client._state.initialize()
  await client._update_metadata()
@@ -4,7 +4,7 @@ from datetime import datetime, timezone
  from logging import getLogger
  from typing import TYPE_CHECKING, Any

- from typing_extensions import override
+ from typing_extensions import Self, override

  from crawlee._utils.crypto import crypto_random_object_id
  from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
@@ -55,7 +55,7 @@ class MemoryDatasetClient(DatasetClient):
  id: str | None,
  name: str | None,
  alias: str | None,
- ) -> MemoryDatasetClient:
+ ) -> Self:
  """Open or create a new memory dataset client.

  This method creates a new in-memory dataset instance. Unlike persistent storage implementations, memory
@@ -4,7 +4,7 @@ import sys
  from datetime import datetime, timezone
  from typing import TYPE_CHECKING, Any

- from typing_extensions import override
+ from typing_extensions import Self, override

  from crawlee._utils.crypto import crypto_random_object_id
  from crawlee._utils.file import infer_mime_type
@@ -53,7 +53,7 @@ class MemoryKeyValueStoreClient(KeyValueStoreClient):
  id: str | None,
  name: str | None,
  alias: str | None,
- ) -> MemoryKeyValueStoreClient:
+ ) -> Self:
  """Open or create a new memory key-value store client.

  This method creates a new in-memory key-value store instance. Unlike persistent storage implementations,
@@ -6,7 +6,7 @@ from datetime import datetime, timezone
  from logging import getLogger
  from typing import TYPE_CHECKING

- from typing_extensions import override
+ from typing_extensions import Self, override

  from crawlee import Request
  from crawlee._utils.crypto import crypto_random_object_id
@@ -65,7 +65,7 @@ class MemoryRequestQueueClient(RequestQueueClient):
  id: str | None,
  name: str | None,
  alias: str | None,
- ) -> MemoryRequestQueueClient:
+ ) -> Self:
  """Open or create a new memory request queue client.

  This method creates a new in-memory request queue instance. Unlike persistent storage implementations,
@@ -0,0 +1,6 @@
+ from ._dataset_client import RedisDatasetClient
+ from ._key_value_store_client import RedisKeyValueStoreClient
+ from ._request_queue_client import RedisRequestQueueClient
+ from ._storage_client import RedisStorageClient
+
+ __all__ = ['RedisDatasetClient', 'RedisKeyValueStoreClient', 'RedisRequestQueueClient', 'RedisStorageClient']