crawlee 1.0.2b3__py3-none-any.whl → 1.1.2b7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of crawlee might be problematic.

Files changed (66)
  1. crawlee/_request.py +32 -21
  2. crawlee/_service_locator.py +4 -4
  3. crawlee/_types.py +30 -17
  4. crawlee/_utils/context.py +2 -2
  5. crawlee/_utils/file.py +7 -0
  6. crawlee/_utils/recoverable_state.py +32 -8
  7. crawlee/_utils/recurring_task.py +17 -1
  8. crawlee/_utils/robots.py +17 -5
  9. crawlee/_utils/sitemap.py +1 -1
  10. crawlee/_utils/time.py +41 -1
  11. crawlee/_utils/urls.py +9 -2
  12. crawlee/browsers/_browser_pool.py +4 -1
  13. crawlee/browsers/_playwright_browser_controller.py +1 -1
  14. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  15. crawlee/browsers/_types.py +1 -1
  16. crawlee/configuration.py +3 -1
  17. crawlee/crawlers/__init__.py +2 -1
  18. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  19. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +47 -11
  20. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +33 -13
  21. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
  22. crawlee/crawlers/_basic/_basic_crawler.py +126 -112
  23. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  24. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  25. crawlee/crawlers/_playwright/_playwright_crawler.py +55 -11
  26. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  27. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  28. crawlee/crawlers/_playwright/_types.py +12 -2
  29. crawlee/events/_event_manager.py +4 -4
  30. crawlee/fingerprint_suite/_header_generator.py +2 -2
  31. crawlee/http_clients/_base.py +4 -0
  32. crawlee/http_clients/_curl_impersonate.py +12 -0
  33. crawlee/http_clients/_httpx.py +16 -6
  34. crawlee/http_clients/_impit.py +25 -10
  35. crawlee/otel/crawler_instrumentor.py +3 -3
  36. crawlee/request_loaders/_sitemap_request_loader.py +22 -4
  37. crawlee/sessions/_session_pool.py +1 -1
  38. crawlee/statistics/_error_snapshotter.py +1 -1
  39. crawlee/statistics/_models.py +32 -1
  40. crawlee/statistics/_statistics.py +24 -33
  41. crawlee/storage_clients/__init__.py +4 -0
  42. crawlee/storage_clients/_file_system/_dataset_client.py +2 -2
  43. crawlee/storage_clients/_file_system/_key_value_store_client.py +3 -3
  44. crawlee/storage_clients/_file_system/_request_queue_client.py +27 -9
  45. crawlee/storage_clients/_redis/__init__.py +6 -0
  46. crawlee/storage_clients/_redis/_client_mixin.py +295 -0
  47. crawlee/storage_clients/_redis/_dataset_client.py +325 -0
  48. crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
  49. crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
  50. crawlee/storage_clients/_redis/_storage_client.py +146 -0
  51. crawlee/storage_clients/_redis/_utils.py +23 -0
  52. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  53. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  54. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  55. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  56. crawlee/storage_clients/_redis/py.typed +0 -0
  57. crawlee/storage_clients/_sql/_db_models.py +1 -2
  58. crawlee/storage_clients/_sql/_key_value_store_client.py +3 -2
  59. crawlee/storage_clients/_sql/_request_queue_client.py +18 -4
  60. crawlee/storage_clients/_sql/_storage_client.py +1 -1
  61. crawlee/storages/_key_value_store.py +5 -2
  62. {crawlee-1.0.2b3.dist-info → crawlee-1.1.2b7.dist-info}/METADATA +8 -3
  63. {crawlee-1.0.2b3.dist-info → crawlee-1.1.2b7.dist-info}/RECORD +66 -54
  64. {crawlee-1.0.2b3.dist-info → crawlee-1.1.2b7.dist-info}/WHEEL +1 -1
  65. {crawlee-1.0.2b3.dist-info → crawlee-1.1.2b7.dist-info}/entry_points.txt +0 -0
  66. {crawlee-1.0.2b3.dist-info → crawlee-1.1.2b7.dist-info}/licenses/LICENSE +0 -0

crawlee/request_loaders/_sitemap_request_loader.py

@@ -9,7 +9,7 @@ from typing import TYPE_CHECKING, Annotated, Any
 from pydantic import BaseModel, ConfigDict, Field
 from typing_extensions import override

-from crawlee import Request
+from crawlee import Request, RequestOptions
 from crawlee._utils.docs import docs_group
 from crawlee._utils.globs import Glob
 from crawlee._utils.recoverable_state import RecoverableState
@@ -18,9 +18,10 @@ from crawlee.request_loaders._request_loader import RequestLoader

 if TYPE_CHECKING:
     import re
-    from collections.abc import Sequence
+    from collections.abc import Callable, Sequence
     from types import TracebackType

+    from crawlee import RequestTransformAction
     from crawlee.http_clients import HttpClient
     from crawlee.proxy_configuration import ProxyInfo
     from crawlee.storage_clients.models import ProcessedRequest
@@ -90,6 +91,11 @@ class SitemapRequestLoaderState(BaseModel):
 class SitemapRequestLoader(RequestLoader):
     """A request loader that reads URLs from sitemap(s).

+    The loader is designed to handle sitemaps that follow the format described in the Sitemaps protocol
+    (https://www.sitemaps.org/protocol.html). It supports both XML and plain text sitemap formats.
+    Note that HTML pages containing links are not supported - those should be handled by regular crawlers
+    and the `enqueue_links` functionality.
+
     The loader fetches and parses sitemaps in the background, allowing crawling to start
     before all URLs are loaded. It supports filtering URLs using glob and regex patterns.

@@ -107,6 +113,7 @@ class SitemapRequestLoader(RequestLoader):
         exclude: list[re.Pattern[Any] | Glob] | None = None,
         max_buffer_size: int = 200,
         persist_state_key: str | None = None,
+        transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
     ) -> None:
         """Initialize the sitemap request loader.

@@ -120,6 +127,9 @@ class SitemapRequestLoader(RequestLoader):
            persist_state_key: A key for persisting the loader's state in the KeyValueStore.
                When provided, allows resuming from where it left off after interruption.
                If None, no state persistence occurs.
+           transform_request_function: An optional function to transform requests
+               generated by the loader. It receives `RequestOptions` with `url` and should return either
+               modified `RequestOptions` or a `RequestTransformAction`.
        """
        self._http_client = http_client
        self._sitemap_urls = sitemap_urls
@@ -127,6 +137,7 @@ class SitemapRequestLoader(RequestLoader):
        self._exclude = exclude
        self._proxy_info = proxy_info
        self._max_buffer_size = max_buffer_size
+       self._transform_request_function = transform_request_function

        # Synchronization for queue operations
        self._queue_has_capacity = asyncio.Event()
@@ -308,8 +319,15 @@ class SitemapRequestLoader(RequestLoader):

            async with self._queue_lock:
                url = state.url_queue.popleft()
-
-               request = Request.from_url(url)
+               request_option = RequestOptions(url=url)
+               if self._transform_request_function:
+                   transform_request_option = self._transform_request_function(request_option)
+                   if transform_request_option == 'skip':
+                       state.total_count -= 1
+                       continue
+                   if transform_request_option != 'unchanged':
+                       request_option = transform_request_option
+               request = Request.from_url(**request_option)
                state.in_progress.add(request.url)
                if len(state.url_queue) < self._max_buffer_size:
                    self._queue_has_capacity.set()
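
The new `transform_request_function` hook lets a caller modify or drop URLs read from the sitemap before they become requests. A minimal sketch of how it might be wired up; the filter condition, the label value, and the choice of `ImpitHttpClient` are illustrative, not taken from this diff:

from crawlee import RequestOptions, RequestTransformAction
from crawlee.http_clients import ImpitHttpClient
from crawlee.request_loaders import SitemapRequestLoader

def transform(options: RequestOptions) -> RequestOptions | RequestTransformAction:
    # Returning 'skip' drops the URL; the loader decrements its total count for skipped entries.
    if '/product/' not in options['url']:
        return 'skip'
    # Returning modified options replaces the defaults; 'unchanged' keeps them as-is.
    options['label'] = 'PRODUCT'
    return options

loader = SitemapRequestLoader(
    sitemap_urls=['https://example.com/sitemap.xml'],
    http_client=ImpitHttpClient(),
    transform_request_function=transform,
)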

crawlee/sessions/_session_pool.py

@@ -163,7 +163,7 @@ class SessionPool:
     def add_session(self, session: Session) -> None:
         """Add an externally created session to the pool.

-        This is intened only for the cases when you want to add a session that was created outside of the pool.
+        This is intended only for the cases when you want to add a session that was created outside of the pool.
         Otherwise, the pool will create new sessions automatically.

         Args:

crawlee/statistics/_error_snapshotter.py

@@ -32,7 +32,7 @@ class ErrorSnapshotter:
         """Capture error snapshot and save it to key value store.

         It saves the error snapshot directly to a key value store. It can't use `context.get_key_value_store` because
-        it returns `KeyValueStoreChangeRecords` which is commited to the key value store only if the `RequestHandler`
+        it returns `KeyValueStoreChangeRecords` which is committed to the key value store only if the `RequestHandler`
         returned without an exception. ErrorSnapshotter is on the contrary active only when `RequestHandler` fails with
         an exception.


crawlee/statistics/_models.py

@@ -1,6 +1,7 @@
 from __future__ import annotations

 import json
+import warnings
 from dataclasses import asdict, dataclass
 from datetime import datetime, timedelta, timezone
 from typing import Annotated, Any
@@ -76,7 +77,6 @@ class StatisticsState(BaseModel):
     crawler_started_at: Annotated[datetime | None, Field(alias='crawlerStartedAt')] = None
     crawler_last_started_at: Annotated[datetime | None, Field(alias='crawlerLastStartTimestamp')] = None
     crawler_finished_at: Annotated[datetime | None, Field(alias='crawlerFinishedAt')] = None
-    crawler_runtime: Annotated[timedelta_ms, Field(alias='crawlerRuntimeMillis')] = timedelta()
     errors: dict[str, Any] = Field(default_factory=dict)
     retry_errors: dict[str, Any] = Field(alias='retryErrors', default_factory=dict)
     requests_with_status_code: dict[str, int] = Field(alias='requestsWithStatusCode', default_factory=dict)
@@ -93,6 +93,37 @@ class StatisticsState(BaseModel):
         ),
     ] = {}

+    # Used to track the crawler runtime, that had already been persisted. This is the runtime from previous runs.
+    _runtime_offset: Annotated[timedelta, Field(exclude=True)] = timedelta()
+
+    def model_post_init(self, /, __context: Any) -> None:
+        self._runtime_offset = self.crawler_runtime or self._runtime_offset
+
+    @property
+    def crawler_runtime(self) -> timedelta:
+        if self.crawler_last_started_at:
+            finished_at = self.crawler_finished_at or datetime.now(timezone.utc)
+            return self._runtime_offset + finished_at - self.crawler_last_started_at
+        return self._runtime_offset
+
+    @crawler_runtime.setter
+    def crawler_runtime(self, value: timedelta) -> None:
+        # Setter for backwards compatibility only, the crawler_runtime is now computed_field, and cant be set manually.
+        # To be removed in v2 release https://github.com/apify/crawlee-python/issues/1567
+        warnings.warn(
+            f"Setting 'crawler_runtime' is deprecated and will be removed in a future version."
+            f' Value {value} will not be used.',
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+    @computed_field(alias='crawlerRuntimeMillis')
+    def crawler_runtime_for_serialization(self) -> timedelta:
+        if self.crawler_last_started_at:
+            finished_at = self.crawler_finished_at or datetime.now(timezone.utc)
+            return self._runtime_offset + finished_at - self.crawler_last_started_at
+        return self._runtime_offset
+
     @computed_field(alias='requestTotalDurationMillis', return_type=timedelta_ms)  # type: ignore[prop-decorator]
     @property
     def request_total_duration(self) -> timedelta:
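
Net effect of the model change: `crawler_runtime` is no longer a stored field but is derived on read, so runtime accumulates across resumed runs. The previously persisted `crawlerRuntimeMillis` value is captured as `_runtime_offset` in `model_post_init`, and the current run contributes `crawler_finished_at - crawler_last_started_at`, with the finish time falling back to "now" while the run is still active. A small arithmetic sketch with made-up timestamps:

from datetime import datetime, timedelta, timezone

runtime_offset = timedelta(seconds=90)  # persisted runtime from earlier runs
last_started_at = datetime(2024, 1, 1, 12, 0, 0, tzinfo=timezone.utc)
now = last_started_at + timedelta(seconds=30)  # crawler_finished_at is None while the run is active

crawler_runtime = runtime_offset + (now - last_started_at)
assert crawler_runtime == timedelta(seconds=120)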

crawlee/statistics/_statistics.py

@@ -1,6 +1,7 @@
 # Inspiration: https://github.com/apify/crawlee/blob/v3.9.2/packages/core/src/crawlers/statistics.ts
 from __future__ import annotations

+import asyncio
 import math
 import time
 from datetime import datetime, timedelta, timezone
@@ -17,8 +18,11 @@ from crawlee.statistics import FinalStatistics, StatisticsState
 from crawlee.statistics._error_tracker import ErrorTracker

 if TYPE_CHECKING:
+    from collections.abc import Callable, Coroutine
     from types import TracebackType

+    from crawlee.storages import KeyValueStore
+
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 TNewStatisticsState = TypeVar('TNewStatisticsState', bound=StatisticsState, default=StatisticsState)
 logger = getLogger(__name__)
@@ -70,6 +74,7 @@ class Statistics(Generic[TStatisticsState]):
         persistence_enabled: bool | Literal['explicit_only'] = False,
         persist_state_kvs_name: str | None = None,
         persist_state_key: str | None = None,
+        persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
         log_message: str = 'Statistics',
         periodic_message_logger: Logger | None = None,
         log_interval: timedelta = timedelta(minutes=1),
@@ -80,8 +85,6 @@ class Statistics(Generic[TStatisticsState]):
         self._id = Statistics.__next_id
         Statistics.__next_id += 1

-        self._instance_start: datetime | None = None
-
         self.error_tracker = ErrorTracker(
             save_error_snapshots=save_error_snapshots,
             snapshot_kvs_name=persist_state_kvs_name,
@@ -92,9 +95,10 @@ class Statistics(Generic[TStatisticsState]):

         self._state = RecoverableState(
             default_state=state_model(stats_id=self._id),
-            persist_state_key=persist_state_key or f'SDK_CRAWLER_STATISTICS_{self._id}',
+            persist_state_key=persist_state_key or f'__CRAWLER_STATISTICS_{self._id}',
             persistence_enabled=persistence_enabled,
             persist_state_kvs_name=persist_state_kvs_name,
+            persist_state_kvs_factory=persist_state_kvs_factory,
             logger=logger,
         )

@@ -110,8 +114,8 @@ class Statistics(Generic[TStatisticsState]):
         """Create near copy of the `Statistics` with replaced `state_model`."""
         new_statistics: Statistics[TNewStatisticsState] = Statistics(
             persistence_enabled=self._state._persistence_enabled,  # noqa: SLF001
-            persist_state_kvs_name=self._state._persist_state_kvs_name,  # noqa: SLF001
             persist_state_key=self._state._persist_state_key,  # noqa: SLF001
+            persist_state_kvs_factory=self._state._persist_state_kvs_factory,  # noqa: SLF001
             log_message=self._log_message,
             periodic_message_logger=self._periodic_message_logger,
             state_model=state_model,
@@ -125,6 +129,7 @@ class Statistics(Generic[TStatisticsState]):
         persistence_enabled: bool = False,
         persist_state_kvs_name: str | None = None,
         persist_state_key: str | None = None,
+        persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
         log_message: str = 'Statistics',
         periodic_message_logger: Logger | None = None,
         log_interval: timedelta = timedelta(minutes=1),
@@ -136,6 +141,7 @@ class Statistics(Generic[TStatisticsState]):
             persistence_enabled=persistence_enabled,
             persist_state_kvs_name=persist_state_kvs_name,
             persist_state_key=persist_state_key,
+            persist_state_kvs_factory=persist_state_kvs_factory,
             log_message=log_message,
             periodic_message_logger=periodic_message_logger,
             log_interval=log_interval,
@@ -158,14 +164,17 @@ class Statistics(Generic[TStatisticsState]):

         if self._active:
             raise RuntimeError(f'The {self.__class__.__name__} is already active.')
-        self._active = True
-        self._instance_start = datetime.now(timezone.utc)
-
         await self._state.initialize()
-        self._after_initialize()
+        # Reset `crawler_finished_at` to indicate a new run in progress.
+        self.state.crawler_finished_at = None

+        # Start periodic logging and let it print initial state before activation.
         self._periodic_logger.start()
+        await asyncio.sleep(0.01)
+        self._active = True

+        self.state.crawler_last_started_at = datetime.now(timezone.utc)
+        self.state.crawler_started_at = self.state.crawler_started_at or self.state.crawler_last_started_at
         return self

     async def __aexit__(
@@ -182,13 +191,14 @@ class Statistics(Generic[TStatisticsState]):

         if not self._active:
             raise RuntimeError(f'The {self.__class__.__name__} is not active.')
-        self._state.current_value.crawler_finished_at = datetime.now(timezone.utc)
-
-        await self._state.teardown()
+        if not self.state.crawler_last_started_at:
+            raise RuntimeError('Statistics.state.crawler_last_started_at not set.')

+        # Stop logging and deactivate the statistics to prevent further changes to crawler_runtime
         await self._periodic_logger.stop()
-
+        self.state.crawler_finished_at = datetime.now(timezone.utc)
         self._active = False
+        await self._state.teardown()

     @property
     def state(self) -> TStatisticsState:
@@ -247,11 +257,7 @@ class Statistics(Generic[TStatisticsState]):

     def calculate(self) -> FinalStatistics:
         """Calculate the current statistics."""
-        if self._instance_start is None:
-            raise RuntimeError('The Statistics object is not initialized')
-
-        crawler_runtime = datetime.now(timezone.utc) - self._instance_start
-        total_minutes = crawler_runtime.total_seconds() / 60
+        total_minutes = self.state.crawler_runtime.total_seconds() / 60
         state = self._state.current_value
         serialized_state = state.model_dump(by_alias=False)

@@ -262,7 +268,7 @@ class Statistics(Generic[TStatisticsState]):
             requests_failed_per_minute=math.floor(state.requests_failed / total_minutes) if total_minutes else 0,
             request_total_duration=state.request_total_finished_duration + state.request_total_failed_duration,
             requests_total=state.requests_failed + state.requests_finished,
-            crawler_runtime=crawler_runtime,
+            crawler_runtime=state.crawler_runtime,
             requests_finished=state.requests_finished,
             requests_failed=state.requests_failed,
             retry_histogram=serialized_state['request_retry_histogram'],
@@ -282,21 +288,6 @@ class Statistics(Generic[TStatisticsState]):

         else:
             self._periodic_message_logger.info(self._log_message, extra=stats.to_dict())
-    def _after_initialize(self) -> None:
-        state = self._state.current_value
-
-        if state.crawler_started_at is None:
-            state.crawler_started_at = datetime.now(timezone.utc)
-
-        if state.stats_persisted_at is not None and state.crawler_last_started_at:
-            self._instance_start = datetime.now(timezone.utc) - (
-                state.stats_persisted_at - state.crawler_last_started_at
-            )
-        elif state.crawler_last_started_at:
-            self._instance_start = state.crawler_last_started_at
-
-        state.crawler_last_started_at = self._instance_start
-
     def _save_retry_count_for_request(self, record: RequestProcessingRecord) -> None:
         retry_count = record.retry_count
         state = self._state.current_value
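
The new `persist_state_kvs_factory` parameter lets the caller hand `Statistics` (and, through it, `RecoverableState`) an async factory that produces the `KeyValueStore` used for persistence, instead of relying only on a store name. A hedged sketch; the `with_default_state` constructor name is an assumption (the classmethod whose signature gains the parameter is not named in this excerpt), and the store name is arbitrary:

from crawlee.statistics import Statistics
from crawlee.storages import KeyValueStore

async def open_stats_kvs() -> KeyValueStore:
    # Any coroutine returning a KeyValueStore will do; opening a named store is just one option.
    return await KeyValueStore.open(name='crawler-stats')

statistics = Statistics.with_default_state(
    persistence_enabled=True,
    persist_state_kvs_factory=open_stats_kvs,
)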

crawlee/storage_clients/__init__.py

@@ -13,9 +13,13 @@ _install_import_hook(__name__)
 with _try_import(__name__, 'SqlStorageClient'):
     from ._sql import SqlStorageClient

+with _try_import(__name__, 'RedisStorageClient'):
+    from ._redis import RedisStorageClient
+
 __all__ = [
     'FileSystemStorageClient',
     'MemoryStorageClient',
+    'RedisStorageClient',
     'SqlStorageClient',
     'StorageClient',
 ]

crawlee/storage_clients/_file_system/_dataset_client.py

@@ -134,7 +134,7 @@ class FileSystemDatasetClient(DatasetClient):
                 continue

             try:
-                file = await asyncio.to_thread(path_to_metadata.open)
+                file = await asyncio.to_thread(path_to_metadata.open, 'r', encoding='utf-8')
                 try:
                     file_content = json.load(file)
                     metadata = DatasetMetadata(**file_content)
@@ -163,7 +163,7 @@ class FileSystemDatasetClient(DatasetClient):

         # If the dataset directory exists, reconstruct the client from the metadata file.
         if path_to_dataset.exists() and path_to_metadata.exists():
-            file = await asyncio.to_thread(open, path_to_metadata)
+            file = await asyncio.to_thread(open, path_to_metadata, 'r', encoding='utf-8')
             try:
                 file_content = json.load(file)
             finally:

crawlee/storage_clients/_file_system/_key_value_store_client.py

@@ -133,7 +133,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
                 continue

             try:
-                file = await asyncio.to_thread(path_to_metadata.open)
+                file = await asyncio.to_thread(path_to_metadata.open, 'r', encoding='utf-8')
                 try:
                     file_content = json.load(file)
                     metadata = KeyValueStoreMetadata(**file_content)
@@ -162,7 +162,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):

         # If the key-value store directory exists, reconstruct the client from the metadata file.
         if path_to_kvs.exists() and path_to_metadata.exists():
-            file = await asyncio.to_thread(open, path_to_metadata)
+            file = await asyncio.to_thread(open, path_to_metadata, 'r', encoding='utf-8')
             try:
                 file_content = json.load(file)
             finally:
@@ -239,7 +239,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
         # Read the metadata file
         async with self._lock:
             try:
-                file = await asyncio.to_thread(open, record_metadata_filepath)
+                file = await asyncio.to_thread(open, record_metadata_filepath, 'r', encoding='utf-8')
             except FileNotFoundError:
                 logger.warning(f'Metadata file disappeared for key "{key}", aborting get_value')
                 return None

crawlee/storage_clients/_file_system/_request_queue_client.py

@@ -31,6 +31,7 @@ if TYPE_CHECKING:
     from collections.abc import Sequence

     from crawlee.configuration import Configuration
+    from crawlee.storages import KeyValueStore

 logger = getLogger(__name__)

@@ -92,6 +93,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         metadata: RequestQueueMetadata,
         path_to_rq: Path,
         lock: asyncio.Lock,
+        recoverable_state: RecoverableState[RequestQueueState],
     ) -> None:
         """Initialize a new instance.

@@ -114,12 +116,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         self._is_empty_cache: bool | None = None
         """Cache for is_empty result: None means unknown, True/False is cached state."""

-        self._state = RecoverableState[RequestQueueState](
-            default_state=RequestQueueState(),
-            persist_state_key=f'__RQ_STATE_{self._metadata.id}',
-            persistence_enabled=True,
-            logger=logger,
-        )
+        self._state = recoverable_state
         """Recoverable state to maintain request ordering, in-progress status, and handled status."""

     @override
@@ -136,6 +133,22 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         """The full path to the request queue metadata file."""
         return self.path_to_rq / METADATA_FILENAME

+    @classmethod
+    async def _create_recoverable_state(cls, id: str, configuration: Configuration) -> RecoverableState:
+        async def kvs_factory() -> KeyValueStore:
+            from crawlee.storage_clients import FileSystemStorageClient  # noqa: PLC0415 avoid circular import
+            from crawlee.storages import KeyValueStore  # noqa: PLC0415 avoid circular import
+
+            return await KeyValueStore.open(storage_client=FileSystemStorageClient(), configuration=configuration)
+
+        return RecoverableState[RequestQueueState](
+            default_state=RequestQueueState(),
+            persist_state_key=f'__RQ_STATE_{id}',
+            persist_state_kvs_factory=kvs_factory,
+            persistence_enabled=True,
+            logger=logger,
+        )
+
     @classmethod
     async def open(
         cls,
@@ -184,7 +197,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 continue

             try:
-                file = await asyncio.to_thread(path_to_metadata.open)
+                file = await asyncio.to_thread(path_to_metadata.open, 'r', encoding='utf-8')
                 try:
                     file_content = json.load(file)
                     metadata = RequestQueueMetadata(**file_content)
@@ -194,6 +207,9 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                     metadata=metadata,
                     path_to_rq=rq_base_path / rq_dir,
                     lock=asyncio.Lock(),
+                    recoverable_state=await cls._create_recoverable_state(
+                        id=id, configuration=configuration
+                    ),
                 )
                 await client._state.initialize()
                 await client._discover_existing_requests()
@@ -216,7 +232,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):

         # If the RQ directory exists, reconstruct the client from the metadata file.
         if path_to_rq.exists() and path_to_metadata.exists():
-            file = await asyncio.to_thread(open, path_to_metadata)
+            file = await asyncio.to_thread(open, path_to_metadata, 'r', encoding='utf-8')
             try:
                 file_content = json.load(file)
             finally:
@@ -230,6 +246,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 metadata=metadata,
                 path_to_rq=path_to_rq,
                 lock=asyncio.Lock(),
+                recoverable_state=await cls._create_recoverable_state(id=metadata.id, configuration=configuration),
             )

             await client._state.initialize()
@@ -254,6 +271,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 metadata=metadata,
                 path_to_rq=path_to_rq,
                 lock=asyncio.Lock(),
+                recoverable_state=await cls._create_recoverable_state(id=metadata.id, configuration=configuration),
             )
             await client._state.initialize()
             await client._update_metadata()
@@ -757,7 +775,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         """
         # Open the request file.
         try:
-            file = await asyncio.to_thread(open, file_path)
+            file = await asyncio.to_thread(open, file_path, 'r', encoding='utf-8')
         except FileNotFoundError:
             logger.warning(f'Request file "{file_path}" not found.')
             return None

crawlee/storage_clients/_redis/__init__.py

@@ -0,0 +1,6 @@
+from ._dataset_client import RedisDatasetClient
+from ._key_value_store_client import RedisKeyValueStoreClient
+from ._request_queue_client import RedisRequestQueueClient
+from ._storage_client import RedisStorageClient
+
+__all__ = ['RedisDatasetClient', 'RedisKeyValueStoreClient', 'RedisRequestQueueClient', 'RedisStorageClient']
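
With the Redis backend now exported from `crawlee.storage_clients`, a Redis-backed setup might look roughly like the following. The `RedisStorageClient` constructor is not shown in this diff, so the `connection_string` argument is hypothetical; registering the client through `service_locator` is one way crawlee picks up a non-default storage client:

from crawlee import service_locator
from crawlee.storage_clients import RedisStorageClient

# Hypothetical constructor argument; check the RedisStorageClient signature for the real one.
storage_client = RedisStorageClient(connection_string='redis://localhost:6379')
service_locator.set_storage_client(storage_client)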