crawlee 1.0.2b3__py3-none-any.whl → 1.1.2b7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlee might be problematic according to the registry's advisory data.

Files changed (66)
  1. crawlee/_request.py +32 -21
  2. crawlee/_service_locator.py +4 -4
  3. crawlee/_types.py +30 -17
  4. crawlee/_utils/context.py +2 -2
  5. crawlee/_utils/file.py +7 -0
  6. crawlee/_utils/recoverable_state.py +32 -8
  7. crawlee/_utils/recurring_task.py +17 -1
  8. crawlee/_utils/robots.py +17 -5
  9. crawlee/_utils/sitemap.py +1 -1
  10. crawlee/_utils/time.py +41 -1
  11. crawlee/_utils/urls.py +9 -2
  12. crawlee/browsers/_browser_pool.py +4 -1
  13. crawlee/browsers/_playwright_browser_controller.py +1 -1
  14. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  15. crawlee/browsers/_types.py +1 -1
  16. crawlee/configuration.py +3 -1
  17. crawlee/crawlers/__init__.py +2 -1
  18. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  19. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +47 -11
  20. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +33 -13
  21. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
  22. crawlee/crawlers/_basic/_basic_crawler.py +126 -112
  23. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  24. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  25. crawlee/crawlers/_playwright/_playwright_crawler.py +55 -11
  26. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  27. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  28. crawlee/crawlers/_playwright/_types.py +12 -2
  29. crawlee/events/_event_manager.py +4 -4
  30. crawlee/fingerprint_suite/_header_generator.py +2 -2
  31. crawlee/http_clients/_base.py +4 -0
  32. crawlee/http_clients/_curl_impersonate.py +12 -0
  33. crawlee/http_clients/_httpx.py +16 -6
  34. crawlee/http_clients/_impit.py +25 -10
  35. crawlee/otel/crawler_instrumentor.py +3 -3
  36. crawlee/request_loaders/_sitemap_request_loader.py +22 -4
  37. crawlee/sessions/_session_pool.py +1 -1
  38. crawlee/statistics/_error_snapshotter.py +1 -1
  39. crawlee/statistics/_models.py +32 -1
  40. crawlee/statistics/_statistics.py +24 -33
  41. crawlee/storage_clients/__init__.py +4 -0
  42. crawlee/storage_clients/_file_system/_dataset_client.py +2 -2
  43. crawlee/storage_clients/_file_system/_key_value_store_client.py +3 -3
  44. crawlee/storage_clients/_file_system/_request_queue_client.py +27 -9
  45. crawlee/storage_clients/_redis/__init__.py +6 -0
  46. crawlee/storage_clients/_redis/_client_mixin.py +295 -0
  47. crawlee/storage_clients/_redis/_dataset_client.py +325 -0
  48. crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
  49. crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
  50. crawlee/storage_clients/_redis/_storage_client.py +146 -0
  51. crawlee/storage_clients/_redis/_utils.py +23 -0
  52. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  53. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  54. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  55. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  56. crawlee/storage_clients/_redis/py.typed +0 -0
  57. crawlee/storage_clients/_sql/_db_models.py +1 -2
  58. crawlee/storage_clients/_sql/_key_value_store_client.py +3 -2
  59. crawlee/storage_clients/_sql/_request_queue_client.py +18 -4
  60. crawlee/storage_clients/_sql/_storage_client.py +1 -1
  61. crawlee/storages/_key_value_store.py +5 -2
  62. {crawlee-1.0.2b3.dist-info → crawlee-1.1.2b7.dist-info}/METADATA +8 -3
  63. {crawlee-1.0.2b3.dist-info → crawlee-1.1.2b7.dist-info}/RECORD +66 -54
  64. {crawlee-1.0.2b3.dist-info → crawlee-1.1.2b7.dist-info}/WHEEL +1 -1
  65. {crawlee-1.0.2b3.dist-info → crawlee-1.1.2b7.dist-info}/entry_points.txt +0 -0
  66. {crawlee-1.0.2b3.dist-info → crawlee-1.1.2b7.dist-info}/licenses/LICENSE +0 -0
crawlee/storage_clients/_redis/_client_mixin.py
@@ -0,0 +1,295 @@
+from __future__ import annotations
+
+import asyncio
+from contextlib import asynccontextmanager
+from datetime import datetime, timezone
+from logging import getLogger
+from typing import TYPE_CHECKING, Any, ClassVar, TypedDict, overload
+
+from crawlee._utils.crypto import crypto_random_object_id
+
+from ._utils import await_redis_response, read_lua_script
+
+if TYPE_CHECKING:
+    from collections.abc import AsyncIterator
+
+    from redis.asyncio import Redis
+    from redis.asyncio.client import Pipeline
+    from redis.commands.core import AsyncScript
+    from typing_extensions import NotRequired, Self
+
+    from crawlee.storage_clients.models import DatasetMetadata, KeyValueStoreMetadata, RequestQueueMetadata
+
+
+logger = getLogger(__name__)
+
+
+class MetadataUpdateParams(TypedDict, total=False):
+    """Parameters for updating metadata."""
+
+    update_accessed_at: NotRequired[bool]
+    update_modified_at: NotRequired[bool]
+
+
+class RedisClientMixin:
+    """Mixin class for Redis clients.
+
+    This mixin provides common Redis operations and basic methods for Redis storage clients.
+    """
+
+    _DEFAULT_NAME = 'default'
+    """Default storage name in key prefix when none provided."""
+
+    _MAIN_KEY: ClassVar[str]
+    """Main Redis key prefix for this storage type."""
+
+    _CLIENT_TYPE: ClassVar[str]
+    """Human-readable client type for error messages."""
+
+    def __init__(self, storage_name: str, storage_id: str, redis: Redis) -> None:
+        self._storage_name = storage_name
+        self._storage_id = storage_id
+        self._redis = redis
+
+        self._scripts_loaded = False
+
+    @property
+    def redis(self) -> Redis:
+        """Return the Redis client instance."""
+        return self._redis
+
+    @property
+    def metadata_key(self) -> str:
+        """Return the Redis key for the metadata of this storage."""
+        return f'{self._MAIN_KEY}:{self._storage_name}:metadata'
+
+    @classmethod
+    async def _get_metadata_by_name(cls, name: str, redis: Redis, *, with_wait: bool = False) -> dict | None:
+        """Retrieve metadata by storage name.
+
+        Args:
+            name: The name of the storage.
+            redis: The Redis client instance.
+            with_wait: Whether to wait for the storage to be created if it doesn't exist.
+        """
+        if with_wait:
+            # Wait for the creation signal (max 30 seconds)
+            await await_redis_response(redis.blpop([f'{cls._MAIN_KEY}:{name}:created_signal'], timeout=30))
+            # Signal consumed, push it back for other waiters
+            await await_redis_response(redis.lpush(f'{cls._MAIN_KEY}:{name}:created_signal', 1))
+
+        response = await await_redis_response(redis.json().get(f'{cls._MAIN_KEY}:{name}:metadata'))
+        data = response[0] if response is not None and isinstance(response, list) else response
+        if data is not None and not isinstance(data, dict):
+            raise TypeError('The metadata data was received in an incorrect format.')
+        return data
+
+    @classmethod
+    async def _get_metadata_name_by_id(cls, id: str, redis: Redis) -> str | None:
+        """Retrieve storage name by ID from id_to_name index.
+
+        Args:
+            id: The ID of the storage.
+            redis: The Redis client instance.
+        """
+        name = await await_redis_response(redis.hget(f'{cls._MAIN_KEY}:id_to_name', id))
+        if isinstance(name, str) or name is None:
+            return name
+        if isinstance(name, bytes):
+            return name.decode('utf-8')
+        return None
+
+    @classmethod
+    async def _open(
+        cls,
+        *,
+        id: str | None,
+        name: str | None,
+        alias: str | None,
+        metadata_model: type[DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata],
+        redis: Redis,
+        extra_metadata_fields: dict[str, Any],
+        instance_kwargs: dict[str, Any],
+    ) -> Self:
+        """Open or create a new Redis storage client.
+
+        Args:
+            id: The ID of the storage. If not provided, a random ID will be generated.
+            name: The name of the storage for named (global scope) storages.
+            alias: The alias of the storage for unnamed (run scope) storages.
+            redis: Redis client instance.
+            metadata_model: Pydantic model for metadata validation.
+            extra_metadata_fields: Storage-specific metadata fields.
+            instance_kwargs: Additional arguments for the client constructor.
+
+        Returns:
+            An instance for the opened or created storage client.
+        """
+        internal_name = name or alias or cls._DEFAULT_NAME
+        storage_id: str | None = None
+        # Determine if storage exists by ID or name
+        if id:
+            storage_name = await cls._get_metadata_name_by_id(id=id, redis=redis)
+            storage_id = id
+            if storage_name is None:
+                raise ValueError(f'{cls._CLIENT_TYPE} with ID "{id}" does not exist.')
+        else:
+            metadata_data = await cls._get_metadata_by_name(name=internal_name, redis=redis)
+            storage_name = internal_name if metadata_data is not None else None
+            storage_id = metadata_data['id'] if metadata_data is not None else None
+        # If both storage_name and storage_id are found, open existing storage
+        if storage_name and storage_id:
+            client = cls(storage_name=storage_name, storage_id=storage_id, redis=redis, **instance_kwargs)
+            async with client._get_pipeline() as pipe:
+                await client._update_metadata(pipe, update_accessed_at=True)
+        # Otherwise, create a new storage
+        else:
+            now = datetime.now(timezone.utc)
+            metadata = metadata_model(
+                id=crypto_random_object_id(),
+                name=name,
+                created_at=now,
+                accessed_at=now,
+                modified_at=now,
+                **extra_metadata_fields,
+            )
+            client = cls(storage_name=internal_name, storage_id=metadata.id, redis=redis, **instance_kwargs)
+            created = await client._create_metadata_and_storage(internal_name, metadata.model_dump())
+            # The client was probably not created due to a race condition. Let's try to open it using the name.
+            if not created:
+                metadata_data = await cls._get_metadata_by_name(name=internal_name, redis=redis, with_wait=True)
+                client = cls(storage_name=internal_name, storage_id=metadata.id, redis=redis, **instance_kwargs)
+
+        # Ensure Lua scripts are loaded
+        await client._ensure_scripts_loaded()
+        return client
+
+    async def _load_scripts(self) -> None:
+        """Load Lua scripts in Redis."""
+        return
+
+    async def _ensure_scripts_loaded(self) -> None:
+        """Ensure Lua scripts are loaded in Redis."""
+        if not self._scripts_loaded:
+            await self._load_scripts()
+            self._scripts_loaded = True
+
+    @asynccontextmanager
+    async def _get_pipeline(self, *, with_execute: bool = True) -> AsyncIterator[Pipeline]:
+        """Create a new Redis pipeline."""
+        async with self._redis.pipeline() as pipe:
+            try:
+                pipe.multi()  # type: ignore[no-untyped-call]
+                yield pipe
+            finally:
+                if with_execute:
+                    await pipe.execute()
+
+    async def _create_storage(self, pipeline: Pipeline) -> None:
+        """Create the actual storage structure in Redis."""
+        _ = pipeline  # To avoid unused variable mypy error
+
+    async def _create_script(self, script_name: str) -> AsyncScript:
+        """Load a Lua script from a file and return a Script object."""
+        script_content = await asyncio.to_thread(read_lua_script, script_name)
+
+        return self._redis.register_script(script_content)
+
+    async def _create_metadata_and_storage(self, storage_name: str, metadata: dict) -> bool:
+        index_id_to_name = f'{self._MAIN_KEY}:id_to_name'
+        index_name_to_id = f'{self._MAIN_KEY}:name_to_id'
+        metadata['created_at'] = metadata['created_at'].isoformat()
+        metadata['accessed_at'] = metadata['accessed_at'].isoformat()
+        metadata['modified_at'] = metadata['modified_at'].isoformat()
+
+        # Try to create name_to_id index entry, if it already exists, return False.
+        name_to_id = await await_redis_response(self._redis.hsetnx(index_name_to_id, storage_name, metadata['id']))
+        # If name already exists, return False. Probably an attempt at parallel creation.
+        if not name_to_id:
+            return False
+
+        # Create id_to_name index entry, metadata, and storage structure in a transaction.
+        async with self._get_pipeline() as pipe:
+            await await_redis_response(pipe.hsetnx(index_id_to_name, metadata['id'], storage_name))
+            await await_redis_response(pipe.json().set(self.metadata_key, '$', metadata))
+            await await_redis_response(pipe.lpush(f'{self._MAIN_KEY}:{storage_name}:created_signal', 1))
+
+            await self._create_storage(pipe)
+
+        return True
+
+    async def _drop(self, extra_keys: list[str]) -> None:
+        async with self._get_pipeline() as pipe:
+            await pipe.delete(self.metadata_key)
+            await pipe.delete(f'{self._MAIN_KEY}:id_to_name', self._storage_id)
+            await pipe.delete(f'{self._MAIN_KEY}:name_to_id', self._storage_name)
+            await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:created_signal')
+            for key in extra_keys:
+                await pipe.delete(key)
+
+    async def _purge(self, extra_keys: list[str], metadata_kwargs: MetadataUpdateParams) -> None:
+        async with self._get_pipeline() as pipe:
+            for key in extra_keys:
+                await pipe.delete(key)
+            await self._update_metadata(pipe, **metadata_kwargs)
+            await self._create_storage(pipe)
+
+    @overload
+    async def _get_metadata(self, metadata_model: type[DatasetMetadata]) -> DatasetMetadata: ...
+    @overload
+    async def _get_metadata(self, metadata_model: type[KeyValueStoreMetadata]) -> KeyValueStoreMetadata: ...
+    @overload
+    async def _get_metadata(self, metadata_model: type[RequestQueueMetadata]) -> RequestQueueMetadata: ...
+
+    async def _get_metadata(
+        self, metadata_model: type[DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata]
+    ) -> DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata:
+        """Retrieve client metadata."""
+        metadata_dict = await self._get_metadata_by_name(name=self._storage_name, redis=self._redis)
+        if metadata_dict is None:
+            raise ValueError(f'{self._CLIENT_TYPE} with name "{self._storage_name}" does not exist.')
+        async with self._get_pipeline() as pipe:
+            await self._update_metadata(pipe, update_accessed_at=True)
+
+        return metadata_model.model_validate(metadata_dict)
+
+    async def _specific_update_metadata(self, pipeline: Pipeline, **kwargs: Any) -> None:
+        """Pipeline operations storage-specific metadata updates.
+
+        Must be implemented by concrete classes.

+        Args:
+            pipeline: The Redis pipeline to use for the update.
+            **kwargs: Storage-specific update parameters.
+        """
+        _ = pipeline  # To avoid unused variable mypy error
+        _ = kwargs
+
+    async def _update_metadata(
+        self,
+        pipeline: Pipeline,
+        *,
+        update_accessed_at: bool = False,
+        update_modified_at: bool = False,
+        **kwargs: Any,
+    ) -> None:
+        """Update storage metadata combining common and specific fields.
+
+        Args:
+            pipeline: The Redis pipeline to use for the update.
+            update_accessed_at: Whether to update accessed_at timestamp.
+            update_modified_at: Whether to update modified_at timestamp.
+            **kwargs: Additional arguments for _specific_update_metadata.
+        """
+        now = datetime.now(timezone.utc)
+
+        if update_accessed_at:
+            await await_redis_response(
+                pipeline.json().set(self.metadata_key, '$.accessed_at', now.isoformat(), nx=False, xx=True)
+            )
+        if update_modified_at:
+            await await_redis_response(
+                pipeline.json().set(self.metadata_key, '$.modified_at', now.isoformat(), nx=False, xx=True)
+            )
+
+        await self._specific_update_metadata(pipeline, **kwargs)
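
Every Redis command in the mixin above is wrapped in `await_redis_response`, imported from the sibling `_utils` module that this release also adds (crawlee/storage_clients/_redis/_utils.py, +23 lines), which is not part of this hunk. A minimal, hypothetical sketch of what that wrapper and the `read_lua_script` loader might look like follows, assuming their only jobs are to normalize redis-py's mixed return types (pipeline commands queue up and return plain values, while direct client calls return awaitables) and to read a bundled script from the lua_scripts directory; the actual implementations may differ.

import inspect
from pathlib import Path
from typing import Any


async def await_redis_response(response: Any) -> Any:
    # Hypothetical sketch: redis-py pipelines return plain values, direct calls return awaitables.
    return await response if inspect.isawaitable(response) else response


def read_lua_script(script_name: str) -> str:
    # Hypothetical sketch: assumes the scripts ship next to this module in a lua_scripts directory.
    return (Path(__file__).parent / 'lua_scripts' / script_name).read_text(encoding='utf-8')
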
crawlee/storage_clients/_redis/_dataset_client.py
@@ -0,0 +1,325 @@
+from __future__ import annotations
+
+from logging import getLogger
+from typing import TYPE_CHECKING, Any, cast
+
+from typing_extensions import NotRequired, override
+
+from crawlee.storage_clients._base import DatasetClient
+from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
+
+from ._client_mixin import MetadataUpdateParams, RedisClientMixin
+from ._utils import await_redis_response
+
+if TYPE_CHECKING:
+    from collections.abc import AsyncIterator
+
+    from redis.asyncio import Redis
+    from redis.asyncio.client import Pipeline
+
+logger = getLogger(__name__)
+
+
+class _DatasetMetadataUpdateParams(MetadataUpdateParams):
+    """Parameters for updating dataset metadata."""
+
+    new_item_count: NotRequired[int]
+    delta_item_count: NotRequired[int]
+
+
+class RedisDatasetClient(DatasetClient, RedisClientMixin):
+    """Redis implementation of the dataset client.
+
+    This client persists dataset items to Redis using JSON arrays for efficient storage and retrieval.
+    Items are stored as JSON objects with automatic ordering preservation through Redis list operations.
+
+    The dataset data is stored in Redis using the following key pattern:
+    - `datasets:{name}:items` - Redis JSON array containing all dataset items.
+    - `datasets:{name}:metadata` - Redis JSON object containing dataset metadata.
+
+    Items must be JSON-serializable dictionaries. Single items or lists of items can be pushed to the dataset.
+    The item ordering is preserved through Redis JSON array operations. All operations provide atomic consistency
+    through Redis transactions and pipeline operations.
+    """
+
+    _DEFAULT_NAME = 'default'
+    """Default Dataset name key prefix when none provided."""
+
+    _MAIN_KEY = 'datasets'
+    """Main Redis key prefix for Dataset."""
+
+    _CLIENT_TYPE = 'Dataset'
+    """Human-readable client type for error messages."""
+
+    def __init__(self, storage_name: str, storage_id: str, redis: Redis) -> None:
+        """Initialize a new instance.
+
+        Preferably use the `RedisDatasetClient.open` class method to create a new instance.
+
+        Args:
+            storage_name: Internal storage name used for Redis keys.
+            storage_id: Unique identifier for the dataset.
+            redis: Redis client instance.
+        """
+        super().__init__(storage_name=storage_name, storage_id=storage_id, redis=redis)
+
+    @property
+    def _items_key(self) -> str:
+        """Return the Redis key for the items of this dataset."""
+        return f'{self._MAIN_KEY}:{self._storage_name}:items'
+
+    @classmethod
+    async def open(
+        cls,
+        *,
+        id: str | None,
+        name: str | None,
+        alias: str | None,
+        redis: Redis,
+    ) -> RedisDatasetClient:
+        """Open or create a new Redis dataset client.
+
+        This method attempts to open an existing dataset from the Redis database. If a dataset with the specified
+        ID or name exists, it loads the metadata from the database. If no existing store is found, a new one
+        is created.
+
+        Args:
+            id: The ID of the dataset. If not provided, a random ID will be generated.
+            name: The name of the dataset for named (global scope) storages.
+            alias: The alias of the dataset for unnamed (run scope) storages.
+            redis: Redis client instance.
+
+        Returns:
+            An instance for the opened or created storage client.
+        """
+        return await cls._open(
+            id=id,
+            name=name,
+            alias=alias,
+            redis=redis,
+            metadata_model=DatasetMetadata,
+            extra_metadata_fields={'item_count': 0},
+            instance_kwargs={},
+        )
+
+    @override
+    async def get_metadata(self) -> DatasetMetadata:
+        return await self._get_metadata(DatasetMetadata)
+
+    @override
+    async def drop(self) -> None:
+        await self._drop(extra_keys=[self._items_key])
+
+    @override
+    async def purge(self) -> None:
+        await self._purge(
+            extra_keys=[self._items_key],
+            metadata_kwargs=_DatasetMetadataUpdateParams(
+                new_item_count=0, update_accessed_at=True, update_modified_at=True
+            ),
+        )
+
+    @override
+    async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None:
+        if isinstance(data, dict):
+            data = [data]
+
+        async with self._get_pipeline() as pipe:
+            pipe.json().arrappend(self._items_key, '$', *data)
+            await self._update_metadata(
+                pipe,
+                **_DatasetMetadataUpdateParams(
+                    update_accessed_at=True, update_modified_at=True, delta_item_count=len(data)
+                ),
+            )
+
+    @override
+    async def get_data(
+        self,
+        *,
+        offset: int = 0,
+        limit: int | None = 999_999_999_999,
+        clean: bool = False,
+        desc: bool = False,
+        fields: list[str] | None = None,
+        omit: list[str] | None = None,
+        unwind: list[str] | None = None,
+        skip_empty: bool = False,
+        skip_hidden: bool = False,
+        flatten: list[str] | None = None,
+        view: str | None = None,
+    ) -> DatasetItemsListPage:
+        # Check for unsupported arguments and log a warning if found
+        unsupported_args: dict[str, Any] = {
+            'clean': clean,
+            'fields': fields,
+            'omit': omit,
+            'unwind': unwind,
+            'skip_hidden': skip_hidden,
+            'flatten': flatten,
+            'view': view,
+        }
+        unsupported = {k: v for k, v in unsupported_args.items() if v not in (False, None)}
+
+        if unsupported:
+            logger.warning(
+                f'The arguments {list(unsupported.keys())} of get_data are not supported '
+                f'by the {self.__class__.__name__} client.'
+            )
+
+        metadata = await self.get_metadata()
+
+        total = metadata.item_count
+        json_path = '$'
+
+        # Apply sorting and pagination
+        match (desc, offset, limit):
+            case (True, 0, int()):
+                json_path += f'[-{limit}:]'
+            case (True, int(), None):
+                json_path += f'[:-{offset}]'
+            case (True, int(), int()):
+                json_path += f'[-{offset + limit}:-{offset}]'
+            case (False, 0, int()):
+                json_path += f'[:{limit}]'
+            case (False, int(), None):
+                json_path += f'[{offset}:]'
+            case (False, int(), int()):
+                json_path += f'[{offset}:{offset + limit}]'
+
+        if json_path == '$':
+            json_path = '$[*]'
+
+        data = await await_redis_response(self._redis.json().get(self._items_key, json_path))
+
+        if data is None:
+            data = []
+
+        if skip_empty:
+            data = [item for item in data if item]
+
+        if desc:
+            data = list(reversed(data))
+
+        async with self._get_pipeline() as pipe:
+            await self._update_metadata(pipe, **_DatasetMetadataUpdateParams(update_accessed_at=True))
+
+        return DatasetItemsListPage(
+            count=len(data),
+            offset=offset,
+            limit=limit or (total - offset),
+            total=total,
+            desc=desc,
+            items=data,
+        )
+
+    @override
+    async def iterate_items(
+        self,
+        *,
+        offset: int = 0,
+        limit: int | None = None,
+        clean: bool = False,
+        desc: bool = False,
+        fields: list[str] | None = None,
+        omit: list[str] | None = None,
+        unwind: list[str] | None = None,
+        skip_empty: bool = False,
+        skip_hidden: bool = False,
+    ) -> AsyncIterator[dict[str, Any]]:
+        """Iterate over dataset items one by one.
+
+        This method yields items individually instead of loading all items at once,
+        which is more memory efficient for large datasets.
+        """
+        # Log warnings for unsupported arguments
+        unsupported_args: dict[str, Any] = {
+            'clean': clean,
+            'fields': fields,
+            'omit': omit,
+            'unwind': unwind,
+            'skip_hidden': skip_hidden,
+        }
+        unsupported = {k: v for k, v in unsupported_args.items() if v not in (False, None)}
+
+        if unsupported:
+            logger.warning(
+                f'The arguments {list(unsupported.keys())} of iterate_items are not supported '
+                f'by the {self.__class__.__name__} client.'
+            )
+
+        metadata = await self.get_metadata()
+        total_items = metadata.item_count
+
+        # Calculate actual range based on parameters
+        start_idx = offset
+        end_idx = min(total_items, offset + limit) if limit is not None else total_items
+
+        # Update accessed_at timestamp
+        async with self._get_pipeline() as pipe:
+            await self._update_metadata(pipe, **_DatasetMetadataUpdateParams(update_accessed_at=True))
+
+        # Process items in batches for better network efficiency
+        batch_size = 100
+
+        for batch_start in range(start_idx, end_idx, batch_size):
+            batch_end = min(batch_start + batch_size, end_idx)
+
+            # Build JsonPath for batch slice
+            if desc:
+                # For descending order, we need to reverse the slice calculation
+                desc_batch_start = total_items - batch_end
+                desc_batch_end = total_items - batch_start
+                json_path = f'$[{desc_batch_start}:{desc_batch_end}]'
+            else:
+                json_path = f'$[{batch_start}:{batch_end}]'
+
+            # Get batch of items
+            batch_items = await await_redis_response(self._redis.json().get(self._items_key, json_path))
+
+            # Handle case where batch_items might be None or not a list
+            if batch_items is None:
+                continue
+
+            # Reverse batch if desc order (since we got items in normal order but need desc)
+            items_iter = reversed(batch_items) if desc else iter(batch_items)
+
+            # Yield items from batch
+            for item in items_iter:
+                # Apply skip_empty filter
+                if skip_empty and not item:
+                    continue
+
+                yield cast('dict[str, Any]', item)
+
+        async with self._get_pipeline() as pipe:
+            await self._update_metadata(pipe, **_DatasetMetadataUpdateParams(update_accessed_at=True))
+
+    @override
+    async def _create_storage(self, pipeline: Pipeline) -> None:
+        """Create the main dataset keys in Redis."""
+        # Create an empty JSON array for items
+        await await_redis_response(pipeline.json().set(self._items_key, '$', []))
+
+    @override
+    async def _specific_update_metadata(
+        self,
+        pipeline: Pipeline,
+        *,
+        new_item_count: int | None = None,
+        delta_item_count: int | None = None,
+        **_kwargs: Any,
+    ) -> None:
+        """Update the dataset metadata in the database.
+
+        Args:
+            pipeline: The Redis pipeline to use for the update.
+            new_item_count: If provided, update the item count to this value.
+            delta_item_count: If provided, increment the item count by this value.
+        """
+        if new_item_count is not None:
+            await await_redis_response(
+                pipeline.json().set(self.metadata_key, '$.item_count', new_item_count, nx=False, xx=True)
+            )
+        elif delta_item_count is not None:
+            await await_redis_response(pipeline.json().numincrby(self.metadata_key, '$.item_count', delta_item_count))
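
For orientation, here is a minimal usage sketch of the new dataset client. It is illustrative only: it assumes a locally running Redis server with the RedisJSON module loaded (items and metadata are stored as JSON documents) and imports the class from the private module path added above, since the public export surface of `crawlee.storage_clients` is not shown in this hunk.

import asyncio

from redis.asyncio import Redis

from crawlee.storage_clients._redis._dataset_client import RedisDatasetClient


async def main() -> None:
    # Assumes redis://localhost:6379 with the RedisJSON module available.
    redis = Redis.from_url('redis://localhost:6379')

    # Open (or create) a named dataset backed by Redis.
    dataset = await RedisDatasetClient.open(id=None, name='example', alias=None, redis=redis)

    # Push a single item, then a small batch.
    await dataset.push_data({'url': 'https://crawlee.dev'})
    await dataset.push_data([{'rank': i} for i in range(5)])

    # Read a page of items back.
    page = await dataset.get_data(offset=0, limit=10)
    print(page.count, page.items)

    await redis.aclose()


asyncio.run(main())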