crawlee-1.0.1b9-py3-none-any.whl → crawlee-1.3.1b3-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in the public registry.

Potentially problematic release: this version of crawlee might be problematic.

Files changed (93)
  1. crawlee/__init__.py +2 -1
  2. crawlee/_browserforge_workaround.py +7 -3
  3. crawlee/_request.py +62 -32
  4. crawlee/_service_locator.py +4 -4
  5. crawlee/_types.py +52 -19
  6. crawlee/_utils/context.py +3 -3
  7. crawlee/_utils/file.py +8 -1
  8. crawlee/_utils/globs.py +4 -4
  9. crawlee/_utils/recoverable_state.py +32 -8
  10. crawlee/_utils/recurring_task.py +27 -3
  11. crawlee/_utils/robots.py +17 -5
  12. crawlee/_utils/sitemap.py +13 -6
  13. crawlee/_utils/system.py +27 -11
  14. crawlee/_utils/time.py +41 -1
  15. crawlee/_utils/urls.py +9 -2
  16. crawlee/browsers/_browser_pool.py +5 -2
  17. crawlee/browsers/_playwright_browser.py +2 -1
  18. crawlee/browsers/_playwright_browser_controller.py +1 -1
  19. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  20. crawlee/browsers/_types.py +1 -1
  21. crawlee/configuration.py +3 -1
  22. crawlee/crawlers/__init__.py +5 -1
  23. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  24. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +54 -16
  25. crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
  26. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +21 -30
  27. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
  28. crawlee/crawlers/_basic/_basic_crawler.py +160 -134
  29. crawlee/crawlers/_basic/_context_utils.py +24 -0
  30. crawlee/crawlers/_basic/_logging_utils.py +23 -4
  31. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  32. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  33. crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
  34. crawlee/crawlers/_playwright/_playwright_crawler.py +68 -23
  35. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  36. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  37. crawlee/crawlers/_playwright/_types.py +12 -2
  38. crawlee/errors.py +4 -0
  39. crawlee/events/_event_manager.py +12 -6
  40. crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
  41. crawlee/fingerprint_suite/_header_generator.py +2 -2
  42. crawlee/http_clients/_base.py +4 -0
  43. crawlee/http_clients/_curl_impersonate.py +68 -14
  44. crawlee/http_clients/_httpx.py +16 -6
  45. crawlee/http_clients/_impit.py +25 -10
  46. crawlee/otel/crawler_instrumentor.py +4 -6
  47. crawlee/request_loaders/_sitemap_request_loader.py +23 -5
  48. crawlee/router.py +13 -3
  49. crawlee/sessions/_cookies.py +13 -8
  50. crawlee/sessions/_models.py +3 -3
  51. crawlee/sessions/_session_pool.py +1 -1
  52. crawlee/statistics/_error_snapshotter.py +1 -1
  53. crawlee/statistics/_models.py +51 -9
  54. crawlee/statistics/_statistics.py +24 -33
  55. crawlee/storage_clients/__init__.py +4 -0
  56. crawlee/storage_clients/_base/_dataset_client.py +2 -2
  57. crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
  58. crawlee/storage_clients/_file_system/_dataset_client.py +8 -7
  59. crawlee/storage_clients/_file_system/_key_value_store_client.py +9 -6
  60. crawlee/storage_clients/_file_system/_request_queue_client.py +31 -12
  61. crawlee/storage_clients/_memory/_dataset_client.py +2 -2
  62. crawlee/storage_clients/_memory/_key_value_store_client.py +2 -2
  63. crawlee/storage_clients/_memory/_request_queue_client.py +2 -2
  64. crawlee/storage_clients/_redis/__init__.py +6 -0
  65. crawlee/storage_clients/_redis/_client_mixin.py +292 -0
  66. crawlee/storage_clients/_redis/_dataset_client.py +329 -0
  67. crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
  68. crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
  69. crawlee/storage_clients/_redis/_storage_client.py +149 -0
  70. crawlee/storage_clients/_redis/_utils.py +23 -0
  71. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  72. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  73. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  74. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  75. crawlee/storage_clients/_redis/py.typed +0 -0
  76. crawlee/storage_clients/_sql/_client_mixin.py +1 -1
  77. crawlee/storage_clients/_sql/_dataset_client.py +2 -2
  78. crawlee/storage_clients/_sql/_db_models.py +1 -2
  79. crawlee/storage_clients/_sql/_key_value_store_client.py +5 -4
  80. crawlee/storage_clients/_sql/_request_queue_client.py +20 -6
  81. crawlee/storage_clients/_sql/_storage_client.py +1 -1
  82. crawlee/storage_clients/models.py +8 -3
  83. crawlee/storages/_base.py +3 -1
  84. crawlee/storages/_dataset.py +3 -0
  85. crawlee/storages/_key_value_store.py +8 -2
  86. crawlee/storages/_request_queue.py +3 -0
  87. crawlee/storages/_storage_instance_manager.py +109 -42
  88. crawlee/storages/_utils.py +11 -0
  89. {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/METADATA +14 -16
  90. {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/RECORD +93 -79
  91. {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/WHEEL +1 -1
  92. {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/entry_points.txt +0 -0
  93. {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/licenses/LICENSE +0 -0
crawlee/statistics/_models.py

@@ -1,9 +1,10 @@
 from __future__ import annotations
 
 import json
+import warnings
 from dataclasses import asdict, dataclass
 from datetime import datetime, timedelta, timezone
-from typing import Annotated, Any
+from typing import TYPE_CHECKING, Annotated, Any
 
 from pydantic import BaseModel, ConfigDict, Field, PlainSerializer, PlainValidator, computed_field
 from typing_extensions import override
@@ -76,10 +77,20 @@ class StatisticsState(BaseModel):
     crawler_started_at: Annotated[datetime | None, Field(alias='crawlerStartedAt')] = None
     crawler_last_started_at: Annotated[datetime | None, Field(alias='crawlerLastStartTimestamp')] = None
     crawler_finished_at: Annotated[datetime | None, Field(alias='crawlerFinishedAt')] = None
-    crawler_runtime: Annotated[timedelta_ms, Field(alias='crawlerRuntimeMillis')] = timedelta()
-    errors: dict[str, Any] = Field(default_factory=dict)
-    retry_errors: dict[str, Any] = Field(alias='retryErrors', default_factory=dict)
-    requests_with_status_code: dict[str, int] = Field(alias='requestsWithStatusCode', default_factory=dict)
+
+    # Workaround for Pydantic and type checkers when using Annotated with default_factory
+    if TYPE_CHECKING:
+        errors: dict[str, Any] = {}
+        retry_errors: dict[str, Any] = {}
+        requests_with_status_code: dict[str, int] = {}
+    else:
+        errors: Annotated[dict[str, Any], Field(default_factory=dict)]
+        retry_errors: Annotated[dict[str, Any], Field(alias='retryErrors', default_factory=dict)]
+        requests_with_status_code: Annotated[
+            dict[str, int],
+            Field(alias='requestsWithStatusCode', default_factory=dict),
+        ]
+
     stats_persisted_at: Annotated[
         datetime | None, Field(alias='statsPersistedAt'), PlainSerializer(lambda _: datetime.now(timezone.utc))
     ] = None
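The `if TYPE_CHECKING` split above lets type checkers see a plain attribute with a literal default while Pydantic, at runtime, sees an `Annotated` field with a `default_factory`. A minimal standalone sketch of the same pattern; the model and field names below are illustrative, not taken from crawlee:

from __future__ import annotations

from typing import TYPE_CHECKING, Annotated, Any

from pydantic import BaseModel, Field


class ExampleState(BaseModel):
    # Type checkers read this branch: a plain dict default keeps attribute access simple.
    if TYPE_CHECKING:
        errors: dict[str, Any] = {}
    # Pydantic reads this branch at runtime: each instance gets its own dict via default_factory.
    else:
        errors: Annotated[dict[str, Any], Field(default_factory=dict)]


state_a = ExampleState()
state_b = ExampleState()
state_a.errors['example.com'] = 1
assert state_b.errors == {}  # defaults are not shared between instances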
@@ -93,22 +104,53 @@ class StatisticsState(BaseModel):
         ),
     ] = {}
 
-    @computed_field(alias='requestTotalDurationMillis', return_type=timedelta_ms)  # type: ignore[prop-decorator]
+    # Used to track the crawler runtime, that had already been persisted. This is the runtime from previous runs.
+    _runtime_offset: Annotated[timedelta, Field(exclude=True)] = timedelta()
+
+    def model_post_init(self, /, __context: Any) -> None:
+        self._runtime_offset = self.crawler_runtime or self._runtime_offset
+
+    @property
+    def crawler_runtime(self) -> timedelta:
+        if self.crawler_last_started_at:
+            finished_at = self.crawler_finished_at or datetime.now(timezone.utc)
+            return self._runtime_offset + finished_at - self.crawler_last_started_at
+        return self._runtime_offset
+
+    @crawler_runtime.setter
+    def crawler_runtime(self, value: timedelta) -> None:
+        # Setter for backwards compatibility only, the crawler_runtime is now computed_field, and cant be set manually.
+        # To be removed in v2 release https://github.com/apify/crawlee-python/issues/1567
+        warnings.warn(
+            f"Setting 'crawler_runtime' is deprecated and will be removed in a future version."
+            f' Value {value} will not be used.',
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+    @computed_field(alias='crawlerRuntimeMillis')
+    def crawler_runtime_for_serialization(self) -> timedelta:
+        if self.crawler_last_started_at:
+            finished_at = self.crawler_finished_at or datetime.now(timezone.utc)
+            return self._runtime_offset + finished_at - self.crawler_last_started_at
+        return self._runtime_offset
+
+    @computed_field(alias='requestTotalDurationMillis', return_type=timedelta_ms)
     @property
     def request_total_duration(self) -> timedelta:
         return self.request_total_finished_duration + self.request_total_failed_duration
 
-    @computed_field(alias='requestAvgFailedDurationMillis', return_type=timedelta_ms | None)  # type: ignore[prop-decorator]
+    @computed_field(alias='requestAvgFailedDurationMillis', return_type=timedelta_ms | None)
     @property
     def request_avg_failed_duration(self) -> timedelta | None:
         return (self.request_total_failed_duration / self.requests_failed) if self.requests_failed else None
 
-    @computed_field(alias='requestAvgFinishedDurationMillis', return_type=timedelta_ms | None)  # type: ignore[prop-decorator]
+    @computed_field(alias='requestAvgFinishedDurationMillis', return_type=timedelta_ms | None)
     @property
     def request_avg_finished_duration(self) -> timedelta | None:
         return (self.request_total_finished_duration / self.requests_finished) if self.requests_finished else None
 
-    @computed_field(alias='requestsTotal')  # type: ignore[prop-decorator]
+    @computed_field(alias='requestsTotal')
     @property
     def requests_total(self) -> int:
         return self.requests_failed + self.requests_finished
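The reworked `crawler_runtime` is derived rather than stored: the runtime persisted by previous runs becomes `_runtime_offset`, and the current run contributes `finished_at (or now) - crawler_last_started_at` on top. A standalone sketch of that arithmetic, with illustrative values not taken from crawlee:

from datetime import datetime, timedelta, timezone

# Runtime accumulated by earlier runs, as restored from the persisted state.
runtime_offset = timedelta(minutes=5)

# The current run started 90 seconds ago and has not finished yet.
crawler_last_started_at = datetime.now(timezone.utc) - timedelta(seconds=90)
crawler_finished_at = None

finished_at = crawler_finished_at or datetime.now(timezone.utc)
crawler_runtime = runtime_offset + finished_at - crawler_last_started_at

# Roughly 6.5 minutes: the persisted offset plus the in-progress run.
print(crawler_runtime)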
crawlee/statistics/_statistics.py

@@ -1,6 +1,7 @@
 # Inspiration: https://github.com/apify/crawlee/blob/v3.9.2/packages/core/src/crawlers/statistics.ts
 from __future__ import annotations
 
+import asyncio
 import math
 import time
 from datetime import datetime, timedelta, timezone
@@ -17,8 +18,11 @@ from crawlee.statistics import FinalStatistics, StatisticsState
 from crawlee.statistics._error_tracker import ErrorTracker
 
 if TYPE_CHECKING:
+    from collections.abc import Callable, Coroutine
     from types import TracebackType
 
+    from crawlee.storages import KeyValueStore
+
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 TNewStatisticsState = TypeVar('TNewStatisticsState', bound=StatisticsState, default=StatisticsState)
 logger = getLogger(__name__)
@@ -70,6 +74,7 @@ class Statistics(Generic[TStatisticsState]):
         persistence_enabled: bool | Literal['explicit_only'] = False,
         persist_state_kvs_name: str | None = None,
         persist_state_key: str | None = None,
+        persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
         log_message: str = 'Statistics',
         periodic_message_logger: Logger | None = None,
         log_interval: timedelta = timedelta(minutes=1),
@@ -80,8 +85,6 @@ class Statistics(Generic[TStatisticsState]):
         self._id = Statistics.__next_id
         Statistics.__next_id += 1
 
-        self._instance_start: datetime | None = None
-
         self.error_tracker = ErrorTracker(
             save_error_snapshots=save_error_snapshots,
             snapshot_kvs_name=persist_state_kvs_name,
@@ -92,9 +95,10 @@ class Statistics(Generic[TStatisticsState]):
 
         self._state = RecoverableState(
             default_state=state_model(stats_id=self._id),
-            persist_state_key=persist_state_key or f'SDK_CRAWLER_STATISTICS_{self._id}',
+            persist_state_key=persist_state_key or f'__CRAWLER_STATISTICS_{self._id}',
             persistence_enabled=persistence_enabled,
             persist_state_kvs_name=persist_state_kvs_name,
+            persist_state_kvs_factory=persist_state_kvs_factory,
             logger=logger,
         )
 
@@ -110,8 +114,8 @@ class Statistics(Generic[TStatisticsState]):
         """Create near copy of the `Statistics` with replaced `state_model`."""
         new_statistics: Statistics[TNewStatisticsState] = Statistics(
             persistence_enabled=self._state._persistence_enabled,  # noqa: SLF001
-            persist_state_kvs_name=self._state._persist_state_kvs_name,  # noqa: SLF001
             persist_state_key=self._state._persist_state_key,  # noqa: SLF001
+            persist_state_kvs_factory=self._state._persist_state_kvs_factory,  # noqa: SLF001
             log_message=self._log_message,
             periodic_message_logger=self._periodic_message_logger,
             state_model=state_model,
@@ -125,6 +129,7 @@ class Statistics(Generic[TStatisticsState]):
         persistence_enabled: bool = False,
         persist_state_kvs_name: str | None = None,
         persist_state_key: str | None = None,
+        persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
         log_message: str = 'Statistics',
         periodic_message_logger: Logger | None = None,
         log_interval: timedelta = timedelta(minutes=1),
@@ -136,6 +141,7 @@ class Statistics(Generic[TStatisticsState]):
             persistence_enabled=persistence_enabled,
             persist_state_kvs_name=persist_state_kvs_name,
             persist_state_key=persist_state_key,
+            persist_state_kvs_factory=persist_state_kvs_factory,
             log_message=log_message,
             periodic_message_logger=periodic_message_logger,
             log_interval=log_interval,
@@ -158,14 +164,17 @@ class Statistics(Generic[TStatisticsState]):
         if self._active:
             raise RuntimeError(f'The {self.__class__.__name__} is already active.')
 
-        self._active = True
-        self._instance_start = datetime.now(timezone.utc)
-
         await self._state.initialize()
-        self._after_initialize()
+        # Reset `crawler_finished_at` to indicate a new run in progress.
+        self.state.crawler_finished_at = None
 
+        # Start periodic logging and let it print initial state before activation.
         self._periodic_logger.start()
+        await asyncio.sleep(0.01)
+        self._active = True
 
+        self.state.crawler_last_started_at = datetime.now(timezone.utc)
+        self.state.crawler_started_at = self.state.crawler_started_at or self.state.crawler_last_started_at
         return self
 
     async def __aexit__(
@@ -182,13 +191,14 @@ class Statistics(Generic[TStatisticsState]):
         if not self._active:
             raise RuntimeError(f'The {self.__class__.__name__} is not active.')
 
-        self._state.current_value.crawler_finished_at = datetime.now(timezone.utc)
-
-        await self._state.teardown()
+        if not self.state.crawler_last_started_at:
+            raise RuntimeError('Statistics.state.crawler_last_started_at not set.')
 
+        # Stop logging and deactivate the statistics to prevent further changes to crawler_runtime
         await self._periodic_logger.stop()
-
+        self.state.crawler_finished_at = datetime.now(timezone.utc)
         self._active = False
+        await self._state.teardown()
 
     @property
     def state(self) -> TStatisticsState:
@@ -247,11 +257,7 @@ class Statistics(Generic[TStatisticsState]):
 
     def calculate(self) -> FinalStatistics:
         """Calculate the current statistics."""
-        if self._instance_start is None:
-            raise RuntimeError('The Statistics object is not initialized')
-
-        crawler_runtime = datetime.now(timezone.utc) - self._instance_start
-        total_minutes = crawler_runtime.total_seconds() / 60
+        total_minutes = self.state.crawler_runtime.total_seconds() / 60
         state = self._state.current_value
         serialized_state = state.model_dump(by_alias=False)
 
@@ -262,7 +268,7 @@ class Statistics(Generic[TStatisticsState]):
             requests_failed_per_minute=math.floor(state.requests_failed / total_minutes) if total_minutes else 0,
             request_total_duration=state.request_total_finished_duration + state.request_total_failed_duration,
             requests_total=state.requests_failed + state.requests_finished,
-            crawler_runtime=crawler_runtime,
+            crawler_runtime=state.crawler_runtime,
             requests_finished=state.requests_finished,
             requests_failed=state.requests_failed,
             retry_histogram=serialized_state['request_retry_histogram'],
@@ -282,21 +288,6 @@ class Statistics(Generic[TStatisticsState]):
         else:
             self._periodic_message_logger.info(self._log_message, extra=stats.to_dict())
 
-    def _after_initialize(self) -> None:
-        state = self._state.current_value
-
-        if state.crawler_started_at is None:
-            state.crawler_started_at = datetime.now(timezone.utc)
-
-        if state.stats_persisted_at is not None and state.crawler_last_started_at:
-            self._instance_start = datetime.now(timezone.utc) - (
-                state.stats_persisted_at - state.crawler_last_started_at
-            )
-        elif state.crawler_last_started_at:
-            self._instance_start = state.crawler_last_started_at
-
-        state.crawler_last_started_at = self._instance_start
-
     def _save_retry_count_for_request(self, record: RequestProcessingRecord) -> None:
         retry_count = record.retry_count
         state = self._state.current_value
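Both the constructor and the alternative constructor shown above now accept `persist_state_kvs_factory`, a coroutine factory that returns the `KeyValueStore` used for persisting state. A hedged sketch of supplying one, mirroring the `kvs_factory` used later in this diff for the file-system request queue; the `with_default_state` classmethod name is assumed from recent crawlee releases rather than shown in this diff:

from crawlee.statistics import Statistics
from crawlee.storage_clients import FileSystemStorageClient
from crawlee.storages import KeyValueStore


async def kvs_factory() -> KeyValueStore:
    # Open (or create) the key-value store that should receive the persisted statistics state.
    return await KeyValueStore.open(storage_client=FileSystemStorageClient())


statistics = Statistics.with_default_state(
    persistence_enabled=True,
    persist_state_kvs_factory=kvs_factory,
)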
crawlee/storage_clients/__init__.py

@@ -13,9 +13,13 @@ _install_import_hook(__name__)
 with _try_import(__name__, 'SqlStorageClient'):
     from ._sql import SqlStorageClient
 
+with _try_import(__name__, 'RedisStorageClient'):
+    from ._redis import RedisStorageClient
+
 __all__ = [
     'FileSystemStorageClient',
     'MemoryStorageClient',
+    'RedisStorageClient',
     'SqlStorageClient',
     'StorageClient',
 ]
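With this export, `RedisStorageClient` can be imported from `crawlee.storage_clients` and plugged in anywhere a storage client is accepted, alongside the existing file-system, memory, and SQL clients. A hedged sketch of that wiring; the Redis connection arguments are not visible in this diff, so the `connection_string` parameter below is an assumption:

import asyncio

from crawlee import service_locator
from crawlee.storage_clients import RedisStorageClient
from crawlee.storages import Dataset


async def main() -> None:
    # The constructor argument below is an assumption; consult the release docs for the real signature.
    storage_client = RedisStorageClient(connection_string='redis://localhost:6379')

    # Register the client globally so storages opened afterwards are Redis-backed.
    service_locator.set_storage_client(storage_client)

    dataset = await Dataset.open()
    await dataset.push_data({'url': 'https://example.com'})


asyncio.run(main())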
crawlee/storage_clients/_base/_dataset_client.py

@@ -87,8 +87,8 @@ class DatasetClient(ABC):
 
         The backend method for the `Dataset.iterate_items` call.
         """
-        # This syntax is to make mypy properly work with abstract AsyncIterator.
+        # This syntax is to make type checker properly work with abstract AsyncIterator.
         # https://mypy.readthedocs.io/en/stable/more_types.html#asynchronous-iterators
         raise NotImplementedError
-        if False:  # type: ignore[unreachable]
+        if False:
             yield 0
crawlee/storage_clients/_base/_key_value_store_client.py

@@ -72,10 +72,10 @@ class KeyValueStoreClient(ABC):
 
         The backend method for the `KeyValueStore.iterate_keys` call.
         """
-        # This syntax is to make mypy properly work with abstract AsyncIterator.
+        # This syntax is to make type checker properly work with abstract AsyncIterator.
         # https://mypy.readthedocs.io/en/stable/more_types.html#asynchronous-iterators
         raise NotImplementedError
-        if False:  # type: ignore[unreachable]
+        if False:
             yield 0
 
     @abstractmethod
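The `if False: yield 0` idiom in both abstract clients keeps the method an async generator even though it only raises `NotImplementedError`; without a `yield` somewhere in the body, the interpreter and type checkers would treat it as a plain coroutine function. A small standalone sketch of the pattern referenced by the mypy link above; the class names are illustrative:

import asyncio
from abc import ABC, abstractmethod
from collections.abc import AsyncIterator


class ItemSource(ABC):
    @abstractmethod
    async def iterate_items(self) -> AsyncIterator[int]:
        """Yield items one by one; subclasses provide the real implementation."""
        raise NotImplementedError
        # The unreachable yield marks this method as an async generator for type checkers.
        if False:
            yield 0


class RangeSource(ItemSource):
    async def iterate_items(self) -> AsyncIterator[int]:
        for value in range(3):
            yield value


async def main() -> None:
    async for item in RangeSource().iterate_items():
        print(item)


asyncio.run(main())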
crawlee/storage_clients/_file_system/_dataset_client.py

@@ -9,7 +9,7 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Any
 
 from pydantic import ValidationError
-from typing_extensions import override
+from typing_extensions import Self, override
 
 from crawlee._consts import METADATA_FILENAME
 from crawlee._utils.crypto import crypto_random_object_id
@@ -94,7 +94,7 @@ class FileSystemDatasetClient(DatasetClient):
         name: str | None,
         alias: str | None,
         configuration: Configuration,
-    ) -> FileSystemDatasetClient:
+    ) -> Self:
         """Open or create a file system dataset client.
 
         This method attempts to open an existing dataset from the file system. If a dataset with the specified ID
@@ -134,7 +134,7 @@ class FileSystemDatasetClient(DatasetClient):
                 continue
 
             try:
-                file = await asyncio.to_thread(path_to_metadata.open)
+                file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')
                 try:
                     file_content = json.load(file)
                     metadata = DatasetMetadata(**file_content)
@@ -163,7 +163,7 @@ class FileSystemDatasetClient(DatasetClient):
 
         # If the dataset directory exists, reconstruct the client from the metadata file.
         if path_to_dataset.exists() and path_to_metadata.exists():
-            file = await asyncio.to_thread(open, path_to_metadata)
+            file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')
             try:
                 file_content = json.load(file)
             finally:
@@ -473,9 +473,10 @@ class FileSystemDatasetClient(DatasetClient):
         """
         # Retrieve and sort all JSON files in the dataset directory numerically.
         files = await asyncio.to_thread(
-            sorted,
-            self.path_to_dataset.glob('*.json'),
-            key=lambda f: int(f.stem) if f.stem.isdigit() else 0,
+            lambda: sorted(
+                self.path_to_dataset.glob('*.json'),
+                key=lambda f: int(f.stem) if f.stem.isdigit() else 0,
+            )
        )
 
         # Remove the metadata file from the list if present.
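The listing change above wraps the whole `glob` plus `sorted` call in a zero-argument lambda, presumably so that every part of the directory scan, including creating the glob generator, happens off the event loop. A minimal standalone sketch of the two styles; the directory path is illustrative:

import asyncio
from pathlib import Path


async def main() -> None:
    dataset_dir = Path('.')  # illustrative path

    # Before: only sorted() runs in the worker thread; the glob generator is created on the event loop
    # (though it is consumed lazily inside sorted()).
    files_eager = await asyncio.to_thread(
        sorted,
        dataset_dir.glob('*.json'),
        key=lambda f: int(f.stem) if f.stem.isdigit() else 0,
    )

    # After: the lambda defers both the glob and the sorting to the worker thread.
    files_deferred = await asyncio.to_thread(
        lambda: sorted(
            dataset_dir.glob('*.json'),
            key=lambda f: int(f.stem) if f.stem.isdigit() else 0,
        )
    )

    assert files_eager == files_deferred


asyncio.run(main())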
crawlee/storage_clients/_file_system/_key_value_store_client.py

@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import asyncio
+import functools
 import json
 import shutil
 import urllib.parse
@@ -10,7 +11,7 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Any
 
 from pydantic import ValidationError
-from typing_extensions import override
+from typing_extensions import Self, override
 
 from crawlee._consts import METADATA_FILENAME
 from crawlee._utils.crypto import crypto_random_object_id
@@ -93,7 +94,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
         name: str | None,
         alias: str | None,
         configuration: Configuration,
-    ) -> FileSystemKeyValueStoreClient:
+    ) -> Self:
         """Open or create a file system key-value store client.
 
         This method attempts to open an existing key-value store from the file system. If a KVS with the specified
@@ -133,7 +134,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
                 continue
 
             try:
-                file = await asyncio.to_thread(path_to_metadata.open)
+                file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')
                 try:
                     file_content = json.load(file)
                     metadata = KeyValueStoreMetadata(**file_content)
@@ -162,7 +163,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
 
         # If the key-value store directory exists, reconstruct the client from the metadata file.
         if path_to_kvs.exists() and path_to_metadata.exists():
-            file = await asyncio.to_thread(open, path_to_metadata)
+            file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')
             try:
                 file_content = json.load(file)
             finally:
@@ -239,7 +240,9 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
         # Read the metadata file
         async with self._lock:
             try:
-                file = await asyncio.to_thread(open, record_metadata_filepath)
+                file = await asyncio.to_thread(
+                    functools.partial(record_metadata_filepath.open, mode='r', encoding='utf-8'),
+                )
             except FileNotFoundError:
                 logger.warning(f'Metadata file disappeared for key "{key}", aborting get_value')
                 return None
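Two equivalent ways of pushing a keyword-argument call into a worker thread appear in this file: passing the keywords straight to `asyncio.to_thread`, and pre-binding them with `functools.partial`. A small standalone sketch of the equivalence; the file name is illustrative:

import asyncio
import functools
from pathlib import Path


async def main() -> None:
    path = Path('metadata.json')  # illustrative file
    path.write_text('{}', encoding='utf-8')

    # Keyword arguments after the callable are forwarded to it by asyncio.to_thread.
    file_a = await asyncio.to_thread(path.open, mode='r', encoding='utf-8')
    file_a.close()

    # functools.partial binds the same keywords up front; handy when the callable is built elsewhere.
    file_b = await asyncio.to_thread(functools.partial(path.open, mode='r', encoding='utf-8'))
    file_b.close()


asyncio.run(main())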
@@ -373,7 +376,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
 
         # List and sort all files *inside* a brief lock, then release it immediately:
         async with self._lock:
-            files = sorted(await asyncio.to_thread(list, self.path_to_kvs.glob('*')))
+            files = sorted(await asyncio.to_thread(lambda: list(self.path_to_kvs.glob('*'))))
 
         count = 0
 
crawlee/storage_clients/_file_system/_request_queue_client.py

@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import asyncio
+import functools
 import json
 import shutil
 from collections import deque
@@ -11,7 +12,7 @@ from pathlib import Path
 from typing import TYPE_CHECKING
 
 from pydantic import BaseModel, ValidationError
-from typing_extensions import override
+from typing_extensions import Self, override
 
 from crawlee import Request
 from crawlee._consts import METADATA_FILENAME
@@ -31,6 +32,7 @@ if TYPE_CHECKING:
     from collections.abc import Sequence
 
     from crawlee.configuration import Configuration
+    from crawlee.storages import KeyValueStore
 
 logger = getLogger(__name__)
 
@@ -92,6 +94,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         metadata: RequestQueueMetadata,
         path_to_rq: Path,
         lock: asyncio.Lock,
+        recoverable_state: RecoverableState[RequestQueueState],
     ) -> None:
         """Initialize a new instance.
 
@@ -114,12 +117,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         self._is_empty_cache: bool | None = None
         """Cache for is_empty result: None means unknown, True/False is cached state."""
 
-        self._state = RecoverableState[RequestQueueState](
-            default_state=RequestQueueState(),
-            persist_state_key=f'__RQ_STATE_{self._metadata.id}',
-            persistence_enabled=True,
-            logger=logger,
-        )
+        self._state = recoverable_state
         """Recoverable state to maintain request ordering, in-progress status, and handled status."""
 
     @override
@@ -136,6 +134,22 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         """The full path to the request queue metadata file."""
         return self.path_to_rq / METADATA_FILENAME
 
+    @classmethod
+    async def _create_recoverable_state(cls, id: str, configuration: Configuration) -> RecoverableState:
+        async def kvs_factory() -> KeyValueStore:
+            from crawlee.storage_clients import FileSystemStorageClient  # noqa: PLC0415 avoid circular import
+            from crawlee.storages import KeyValueStore  # noqa: PLC0415 avoid circular import
+
+            return await KeyValueStore.open(storage_client=FileSystemStorageClient(), configuration=configuration)
+
+        return RecoverableState[RequestQueueState](
+            default_state=RequestQueueState(),
+            persist_state_key=f'__RQ_STATE_{id}',
+            persist_state_kvs_factory=kvs_factory,
+            persistence_enabled=True,
+            logger=logger,
+        )
+
     @classmethod
     async def open(
         cls,
@@ -144,7 +158,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         name: str | None,
         alias: str | None,
         configuration: Configuration,
-    ) -> FileSystemRequestQueueClient:
+    ) -> Self:
         """Open or create a file system request queue client.
 
         This method attempts to open an existing request queue from the file system. If a queue with the specified
@@ -184,7 +198,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 continue
 
             try:
-                file = await asyncio.to_thread(path_to_metadata.open)
+                file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')
                 try:
                     file_content = json.load(file)
                     metadata = RequestQueueMetadata(**file_content)
@@ -194,6 +208,9 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                     metadata=metadata,
                     path_to_rq=rq_base_path / rq_dir,
                     lock=asyncio.Lock(),
+                    recoverable_state=await cls._create_recoverable_state(
+                        id=id, configuration=configuration
+                    ),
                 )
                 await client._state.initialize()
                 await client._discover_existing_requests()
@@ -216,7 +233,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
 
         # If the RQ directory exists, reconstruct the client from the metadata file.
         if path_to_rq.exists() and path_to_metadata.exists():
-            file = await asyncio.to_thread(open, path_to_metadata)
+            file = await asyncio.to_thread(path_to_metadata.open, encoding='utf-8')
             try:
                 file_content = json.load(file)
             finally:
@@ -230,6 +247,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 metadata=metadata,
                 path_to_rq=path_to_rq,
                 lock=asyncio.Lock(),
+                recoverable_state=await cls._create_recoverable_state(id=metadata.id, configuration=configuration),
             )
 
             await client._state.initialize()
@@ -254,6 +272,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 metadata=metadata,
                 path_to_rq=path_to_rq,
                 lock=asyncio.Lock(),
+                recoverable_state=await cls._create_recoverable_state(id=metadata.id, configuration=configuration),
             )
             await client._state.initialize()
             await client._update_metadata()
@@ -738,7 +757,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         await asyncio.to_thread(path_to_rq.mkdir, parents=True, exist_ok=True)
 
         # List all the json files.
-        files = await asyncio.to_thread(list, path_to_rq.glob('*.json'))
+        files = await asyncio.to_thread(lambda: list(path_to_rq.glob('*.json')))
 
         # Filter out metadata file and non-file entries.
         filtered = filter(lambda request_file: request_file.is_file() and request_file.name != METADATA_FILENAME, files)
@@ -757,7 +776,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         """
         # Open the request file.
         try:
-            file = await asyncio.to_thread(open, file_path)
+            file = await asyncio.to_thread(functools.partial(file_path.open, mode='r', encoding='utf-8'))
         except FileNotFoundError:
             logger.warning(f'Request file "{file_path}" not found.')
             return None
crawlee/storage_clients/_memory/_dataset_client.py

@@ -4,7 +4,7 @@ from datetime import datetime, timezone
 from logging import getLogger
 from typing import TYPE_CHECKING, Any
 
-from typing_extensions import override
+from typing_extensions import Self, override
 
 from crawlee._utils.crypto import crypto_random_object_id
 from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
@@ -55,7 +55,7 @@ class MemoryDatasetClient(DatasetClient):
         id: str | None,
         name: str | None,
         alias: str | None,
-    ) -> MemoryDatasetClient:
+    ) -> Self:
         """Open or create a new memory dataset client.
 
         This method creates a new in-memory dataset instance. Unlike persistent storage implementations, memory
crawlee/storage_clients/_memory/_key_value_store_client.py

@@ -4,7 +4,7 @@ import sys
 from datetime import datetime, timezone
 from typing import TYPE_CHECKING, Any
 
-from typing_extensions import override
+from typing_extensions import Self, override
 
 from crawlee._utils.crypto import crypto_random_object_id
 from crawlee._utils.file import infer_mime_type
@@ -53,7 +53,7 @@ class MemoryKeyValueStoreClient(KeyValueStoreClient):
         id: str | None,
         name: str | None,
         alias: str | None,
-    ) -> MemoryKeyValueStoreClient:
+    ) -> Self:
         """Open or create a new memory key-value store client.
 
         This method creates a new in-memory key-value store instance. Unlike persistent storage implementations,
crawlee/storage_clients/_memory/_request_queue_client.py

@@ -6,7 +6,7 @@ from datetime import datetime, timezone
 from logging import getLogger
 from typing import TYPE_CHECKING
 
-from typing_extensions import override
+from typing_extensions import Self, override
 
 from crawlee import Request
 from crawlee._utils.crypto import crypto_random_object_id
@@ -65,7 +65,7 @@ class MemoryRequestQueueClient(RequestQueueClient):
         id: str | None,
         name: str | None,
         alias: str | None,
-    ) -> MemoryRequestQueueClient:
+    ) -> Self:
         """Open or create a new memory request queue client.
 
         This method creates a new in-memory request queue instance. Unlike persistent storage implementations,
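Across the file-system and in-memory clients, the `open` factories now return `Self` instead of the concrete class name, so a subclass calling the inherited factory is typed as the subclass. A small standalone sketch of the difference; the class names are illustrative:

from typing_extensions import Self


class BaseClient:
    @classmethod
    def open(cls) -> Self:
        # `Self` makes the inherited factory return the subclass type to type checkers.
        return cls()


class SpecialClient(BaseClient):
    pass


client = SpecialClient.open()  # typed as SpecialClient, not BaseClient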
crawlee/storage_clients/_redis/__init__.py

@@ -0,0 +1,6 @@
+from ._dataset_client import RedisDatasetClient
+from ._key_value_store_client import RedisKeyValueStoreClient
+from ._request_queue_client import RedisRequestQueueClient
+from ._storage_client import RedisStorageClient
+
+__all__ = ['RedisDatasetClient', 'RedisKeyValueStoreClient', 'RedisRequestQueueClient', 'RedisStorageClient']