crawlee 0.6.13b43__py3-none-any.whl → 1.1.2b4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlee might be problematic. Click here for more details.
- crawlee/_request.py +32 -21
- crawlee/_service_locator.py +4 -4
- crawlee/_types.py +87 -25
- crawlee/_utils/file.py +7 -0
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +15 -0
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +1 -1
- crawlee/_utils/time.py +41 -1
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +4 -1
- crawlee/browsers/_playwright_browser_controller.py +21 -15
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +3 -1
- crawlee/crawlers/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +47 -11
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +38 -14
- crawlee/crawlers/_basic/_basic_crawler.py +139 -96
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_playwright/_playwright_crawler.py +52 -10
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/events/_event_manager.py +3 -1
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +12 -0
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +25 -10
- crawlee/otel/crawler_instrumentor.py +3 -3
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_sitemap_request_loader.py +22 -4
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +32 -1
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +16 -0
- crawlee/storage_clients/_base/_storage_client.py +5 -4
- crawlee/storage_clients/_file_system/_dataset_client.py +6 -7
- crawlee/storage_clients/_file_system/_key_value_store_client.py +7 -8
- crawlee/storage_clients/_file_system/_request_queue_client.py +31 -15
- crawlee/storage_clients/_file_system/_storage_client.py +2 -2
- crawlee/storage_clients/_memory/_dataset_client.py +4 -5
- crawlee/storage_clients/_memory/_key_value_store_client.py +4 -5
- crawlee/storage_clients/_memory/_request_queue_client.py +4 -5
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +295 -0
- crawlee/storage_clients/_redis/_dataset_client.py +325 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
- crawlee/storage_clients/_redis/_storage_client.py +146 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +268 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
- crawlee/storage_clients/_sql/_storage_client.py +282 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +10 -10
- crawlee/storages/_base.py +3 -1
- crawlee/storages/_dataset.py +5 -3
- crawlee/storages/_key_value_store.py +11 -6
- crawlee/storages/_request_queue.py +5 -3
- crawlee/storages/_storage_instance_manager.py +54 -68
- crawlee/storages/_utils.py +11 -0
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/METADATA +17 -5
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/RECORD +80 -58
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/WHEEL +1 -1
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/entry_points.txt +0 -0
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/licenses/LICENSE +0 -0
|
@@ -30,12 +30,13 @@ class StorageClient(ABC):
|
|
|
30
30
|
(where applicable), and consistent access patterns across all storage types it supports.
|
|
31
31
|
"""
|
|
32
32
|
|
|
33
|
-
def
|
|
34
|
-
"""Return a cache key that can differentiate between different storages of this
|
|
33
|
+
def get_storage_client_cache_key(self, configuration: Configuration) -> Hashable: # noqa: ARG002
|
|
34
|
+
"""Return a cache key that can differentiate between different storages of this and other clients.
|
|
35
35
|
|
|
36
|
-
Can be based on configuration or on the client itself. By default, returns
|
|
36
|
+
Can be based on configuration or on the client itself. By default, returns a module and name of the client
|
|
37
|
+
class.
|
|
37
38
|
"""
|
|
38
|
-
return ''
|
|
39
|
+
return f'{self.__class__.__module__}.{self.__class__.__name__}'
|
|
39
40
|
|
|
40
41
|
@abstractmethod
|
|
41
42
|
async def create_dataset_client(
|
|
@@ -9,11 +9,12 @@ from pathlib import Path
|
|
|
9
9
|
from typing import TYPE_CHECKING, Any
|
|
10
10
|
|
|
11
11
|
from pydantic import ValidationError
|
|
12
|
-
from typing_extensions import override
|
|
12
|
+
from typing_extensions import Self, override
|
|
13
13
|
|
|
14
14
|
from crawlee._consts import METADATA_FILENAME
|
|
15
15
|
from crawlee._utils.crypto import crypto_random_object_id
|
|
16
16
|
from crawlee._utils.file import atomic_write, json_dumps
|
|
17
|
+
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
|
|
17
18
|
from crawlee.storage_clients._base import DatasetClient
|
|
18
19
|
from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
|
|
19
20
|
|
|
@@ -93,7 +94,7 @@ class FileSystemDatasetClient(DatasetClient):
|
|
|
93
94
|
name: str | None,
|
|
94
95
|
alias: str | None,
|
|
95
96
|
configuration: Configuration,
|
|
96
|
-
) ->
|
|
97
|
+
) -> Self:
|
|
97
98
|
"""Open or create a file system dataset client.
|
|
98
99
|
|
|
99
100
|
This method attempts to open an existing dataset from the file system. If a dataset with the specified ID
|
|
@@ -114,9 +115,7 @@ class FileSystemDatasetClient(DatasetClient):
|
|
|
114
115
|
or if both name and alias are provided.
|
|
115
116
|
"""
|
|
116
117
|
# Validate input parameters.
|
|
117
|
-
|
|
118
|
-
if specified_params > 1:
|
|
119
|
-
raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')
|
|
118
|
+
raise_if_too_many_kwargs(id=id, name=name, alias=alias)
|
|
120
119
|
|
|
121
120
|
dataset_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR
|
|
122
121
|
|
|
@@ -135,7 +134,7 @@ class FileSystemDatasetClient(DatasetClient):
|
|
|
135
134
|
continue
|
|
136
135
|
|
|
137
136
|
try:
|
|
138
|
-
file = await asyncio.to_thread(path_to_metadata.open)
|
|
137
|
+
file = await asyncio.to_thread(path_to_metadata.open, 'r', encoding='utf-8')
|
|
139
138
|
try:
|
|
140
139
|
file_content = json.load(file)
|
|
141
140
|
metadata = DatasetMetadata(**file_content)
|
|
@@ -164,7 +163,7 @@ class FileSystemDatasetClient(DatasetClient):
|
|
|
164
163
|
|
|
165
164
|
# If the dataset directory exists, reconstruct the client from the metadata file.
|
|
166
165
|
if path_to_dataset.exists() and path_to_metadata.exists():
|
|
167
|
-
file = await asyncio.to_thread(open, path_to_metadata)
|
|
166
|
+
file = await asyncio.to_thread(open, path_to_metadata, 'r', encoding='utf-8')
|
|
168
167
|
try:
|
|
169
168
|
file_content = json.load(file)
|
|
170
169
|
finally:
|
|
@@ -10,11 +10,12 @@ from pathlib import Path
|
|
|
10
10
|
from typing import TYPE_CHECKING, Any
|
|
11
11
|
|
|
12
12
|
from pydantic import ValidationError
|
|
13
|
-
from typing_extensions import override
|
|
13
|
+
from typing_extensions import Self, override
|
|
14
14
|
|
|
15
15
|
from crawlee._consts import METADATA_FILENAME
|
|
16
16
|
from crawlee._utils.crypto import crypto_random_object_id
|
|
17
17
|
from crawlee._utils.file import atomic_write, infer_mime_type, json_dumps
|
|
18
|
+
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
|
|
18
19
|
from crawlee.storage_clients._base import KeyValueStoreClient
|
|
19
20
|
from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata
|
|
20
21
|
|
|
@@ -92,7 +93,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
|
|
|
92
93
|
name: str | None,
|
|
93
94
|
alias: str | None,
|
|
94
95
|
configuration: Configuration,
|
|
95
|
-
) ->
|
|
96
|
+
) -> Self:
|
|
96
97
|
"""Open or create a file system key-value store client.
|
|
97
98
|
|
|
98
99
|
This method attempts to open an existing key-value store from the file system. If a KVS with the specified
|
|
@@ -113,9 +114,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
|
|
|
113
114
|
or if both name and alias are provided.
|
|
114
115
|
"""
|
|
115
116
|
# Validate input parameters.
|
|
116
|
-
|
|
117
|
-
if specified_params > 1:
|
|
118
|
-
raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')
|
|
117
|
+
raise_if_too_many_kwargs(id=id, name=name, alias=alias)
|
|
119
118
|
|
|
120
119
|
kvs_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR
|
|
121
120
|
|
|
@@ -134,7 +133,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
|
|
|
134
133
|
continue
|
|
135
134
|
|
|
136
135
|
try:
|
|
137
|
-
file = await asyncio.to_thread(path_to_metadata.open)
|
|
136
|
+
file = await asyncio.to_thread(path_to_metadata.open, 'r', encoding='utf-8')
|
|
138
137
|
try:
|
|
139
138
|
file_content = json.load(file)
|
|
140
139
|
metadata = KeyValueStoreMetadata(**file_content)
|
|
@@ -163,7 +162,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
|
|
|
163
162
|
|
|
164
163
|
# If the key-value store directory exists, reconstruct the client from the metadata file.
|
|
165
164
|
if path_to_kvs.exists() and path_to_metadata.exists():
|
|
166
|
-
file = await asyncio.to_thread(open, path_to_metadata)
|
|
165
|
+
file = await asyncio.to_thread(open, path_to_metadata, 'r', encoding='utf-8')
|
|
167
166
|
try:
|
|
168
167
|
file_content = json.load(file)
|
|
169
168
|
finally:
|
|
@@ -240,7 +239,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
|
|
|
240
239
|
# Read the metadata file
|
|
241
240
|
async with self._lock:
|
|
242
241
|
try:
|
|
243
|
-
file = await asyncio.to_thread(open, record_metadata_filepath)
|
|
242
|
+
file = await asyncio.to_thread(open, record_metadata_filepath, 'r', encoding='utf-8')
|
|
244
243
|
except FileNotFoundError:
|
|
245
244
|
logger.warning(f'Metadata file disappeared for key "{key}", aborting get_value')
|
|
246
245
|
return None
|
|
@@ -11,12 +11,13 @@ from pathlib import Path
|
|
|
11
11
|
from typing import TYPE_CHECKING
|
|
12
12
|
|
|
13
13
|
from pydantic import BaseModel, ValidationError
|
|
14
|
-
from typing_extensions import override
|
|
14
|
+
from typing_extensions import Self, override
|
|
15
15
|
|
|
16
16
|
from crawlee import Request
|
|
17
17
|
from crawlee._consts import METADATA_FILENAME
|
|
18
18
|
from crawlee._utils.crypto import crypto_random_object_id
|
|
19
19
|
from crawlee._utils.file import atomic_write, json_dumps
|
|
20
|
+
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
|
|
20
21
|
from crawlee._utils.recoverable_state import RecoverableState
|
|
21
22
|
from crawlee.storage_clients._base import RequestQueueClient
|
|
22
23
|
from crawlee.storage_clients.models import (
|
|
@@ -30,6 +31,7 @@ if TYPE_CHECKING:
|
|
|
30
31
|
from collections.abc import Sequence
|
|
31
32
|
|
|
32
33
|
from crawlee.configuration import Configuration
|
|
34
|
+
from crawlee.storages import KeyValueStore
|
|
33
35
|
|
|
34
36
|
logger = getLogger(__name__)
|
|
35
37
|
|
|
@@ -91,6 +93,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
|
|
|
91
93
|
metadata: RequestQueueMetadata,
|
|
92
94
|
path_to_rq: Path,
|
|
93
95
|
lock: asyncio.Lock,
|
|
96
|
+
recoverable_state: RecoverableState[RequestQueueState],
|
|
94
97
|
) -> None:
|
|
95
98
|
"""Initialize a new instance.
|
|
96
99
|
|
|
@@ -113,13 +116,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
|
|
|
113
116
|
self._is_empty_cache: bool | None = None
|
|
114
117
|
"""Cache for is_empty result: None means unknown, True/False is cached state."""
|
|
115
118
|
|
|
116
|
-
self._state =
|
|
117
|
-
default_state=RequestQueueState(),
|
|
118
|
-
persist_state_key='request_queue_state',
|
|
119
|
-
persistence_enabled=True,
|
|
120
|
-
persist_state_kvs_name=f'__RQ_STATE_{self._metadata.id}',
|
|
121
|
-
logger=logger,
|
|
122
|
-
)
|
|
119
|
+
self._state = recoverable_state
|
|
123
120
|
"""Recoverable state to maintain request ordering, in-progress status, and handled status."""
|
|
124
121
|
|
|
125
122
|
@override
|
|
@@ -136,6 +133,22 @@ class FileSystemRequestQueueClient(RequestQueueClient):
|
|
|
136
133
|
"""The full path to the request queue metadata file."""
|
|
137
134
|
return self.path_to_rq / METADATA_FILENAME
|
|
138
135
|
|
|
136
|
+
@classmethod
|
|
137
|
+
async def _create_recoverable_state(cls, id: str, configuration: Configuration) -> RecoverableState:
|
|
138
|
+
async def kvs_factory() -> KeyValueStore:
|
|
139
|
+
from crawlee.storage_clients import FileSystemStorageClient # noqa: PLC0415 avoid circular import
|
|
140
|
+
from crawlee.storages import KeyValueStore # noqa: PLC0415 avoid circular import
|
|
141
|
+
|
|
142
|
+
return await KeyValueStore.open(storage_client=FileSystemStorageClient(), configuration=configuration)
|
|
143
|
+
|
|
144
|
+
return RecoverableState[RequestQueueState](
|
|
145
|
+
default_state=RequestQueueState(),
|
|
146
|
+
persist_state_key=f'__RQ_STATE_{id}',
|
|
147
|
+
persist_state_kvs_factory=kvs_factory,
|
|
148
|
+
persistence_enabled=True,
|
|
149
|
+
logger=logger,
|
|
150
|
+
)
|
|
151
|
+
|
|
139
152
|
@classmethod
|
|
140
153
|
async def open(
|
|
141
154
|
cls,
|
|
@@ -144,7 +157,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
|
|
|
144
157
|
name: str | None,
|
|
145
158
|
alias: str | None,
|
|
146
159
|
configuration: Configuration,
|
|
147
|
-
) ->
|
|
160
|
+
) -> Self:
|
|
148
161
|
"""Open or create a file system request queue client.
|
|
149
162
|
|
|
150
163
|
This method attempts to open an existing request queue from the file system. If a queue with the specified
|
|
@@ -165,9 +178,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
|
|
|
165
178
|
or if both name and alias are provided.
|
|
166
179
|
"""
|
|
167
180
|
# Validate input parameters.
|
|
168
|
-
|
|
169
|
-
if specified_params > 1:
|
|
170
|
-
raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')
|
|
181
|
+
raise_if_too_many_kwargs(id=id, name=name, alias=alias)
|
|
171
182
|
|
|
172
183
|
rq_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR
|
|
173
184
|
|
|
@@ -186,7 +197,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
|
|
|
186
197
|
continue
|
|
187
198
|
|
|
188
199
|
try:
|
|
189
|
-
file = await asyncio.to_thread(path_to_metadata.open)
|
|
200
|
+
file = await asyncio.to_thread(path_to_metadata.open, 'r', encoding='utf-8')
|
|
190
201
|
try:
|
|
191
202
|
file_content = json.load(file)
|
|
192
203
|
metadata = RequestQueueMetadata(**file_content)
|
|
@@ -196,6 +207,9 @@ class FileSystemRequestQueueClient(RequestQueueClient):
|
|
|
196
207
|
metadata=metadata,
|
|
197
208
|
path_to_rq=rq_base_path / rq_dir,
|
|
198
209
|
lock=asyncio.Lock(),
|
|
210
|
+
recoverable_state=await cls._create_recoverable_state(
|
|
211
|
+
id=id, configuration=configuration
|
|
212
|
+
),
|
|
199
213
|
)
|
|
200
214
|
await client._state.initialize()
|
|
201
215
|
await client._discover_existing_requests()
|
|
@@ -218,7 +232,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
|
|
|
218
232
|
|
|
219
233
|
# If the RQ directory exists, reconstruct the client from the metadata file.
|
|
220
234
|
if path_to_rq.exists() and path_to_metadata.exists():
|
|
221
|
-
file = await asyncio.to_thread(open, path_to_metadata)
|
|
235
|
+
file = await asyncio.to_thread(open, path_to_metadata, 'r', encoding='utf-8')
|
|
222
236
|
try:
|
|
223
237
|
file_content = json.load(file)
|
|
224
238
|
finally:
|
|
@@ -232,6 +246,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
|
|
|
232
246
|
metadata=metadata,
|
|
233
247
|
path_to_rq=path_to_rq,
|
|
234
248
|
lock=asyncio.Lock(),
|
|
249
|
+
recoverable_state=await cls._create_recoverable_state(id=metadata.id, configuration=configuration),
|
|
235
250
|
)
|
|
236
251
|
|
|
237
252
|
await client._state.initialize()
|
|
@@ -256,6 +271,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
|
|
|
256
271
|
metadata=metadata,
|
|
257
272
|
path_to_rq=path_to_rq,
|
|
258
273
|
lock=asyncio.Lock(),
|
|
274
|
+
recoverable_state=await cls._create_recoverable_state(id=metadata.id, configuration=configuration),
|
|
259
275
|
)
|
|
260
276
|
await client._state.initialize()
|
|
261
277
|
await client._update_metadata()
|
|
@@ -759,7 +775,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
|
|
|
759
775
|
"""
|
|
760
776
|
# Open the request file.
|
|
761
777
|
try:
|
|
762
|
-
file = await asyncio.to_thread(open, file_path)
|
|
778
|
+
file = await asyncio.to_thread(open, file_path, 'r', encoding='utf-8')
|
|
763
779
|
except FileNotFoundError:
|
|
764
780
|
logger.warning(f'Request file "{file_path}" not found.')
|
|
765
781
|
return None
|
|
@@ -35,9 +35,9 @@ class FileSystemStorageClient(StorageClient):
|
|
|
35
35
|
"""
|
|
36
36
|
|
|
37
37
|
@override
|
|
38
|
-
def
|
|
38
|
+
def get_storage_client_cache_key(self, configuration: Configuration) -> Hashable:
|
|
39
39
|
# Even different client instances should return same storage if the storage_dir is the same.
|
|
40
|
-
return configuration.storage_dir
|
|
40
|
+
return super().get_storage_client_cache_key(configuration), configuration.storage_dir
|
|
41
41
|
|
|
42
42
|
@override
|
|
43
43
|
async def create_dataset_client(
|
|
@@ -4,9 +4,10 @@ from datetime import datetime, timezone
|
|
|
4
4
|
from logging import getLogger
|
|
5
5
|
from typing import TYPE_CHECKING, Any
|
|
6
6
|
|
|
7
|
-
from typing_extensions import override
|
|
7
|
+
from typing_extensions import Self, override
|
|
8
8
|
|
|
9
9
|
from crawlee._utils.crypto import crypto_random_object_id
|
|
10
|
+
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
|
|
10
11
|
from crawlee.storage_clients._base import DatasetClient
|
|
11
12
|
from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
|
|
12
13
|
|
|
@@ -54,7 +55,7 @@ class MemoryDatasetClient(DatasetClient):
|
|
|
54
55
|
id: str | None,
|
|
55
56
|
name: str | None,
|
|
56
57
|
alias: str | None,
|
|
57
|
-
) ->
|
|
58
|
+
) -> Self:
|
|
58
59
|
"""Open or create a new memory dataset client.
|
|
59
60
|
|
|
60
61
|
This method creates a new in-memory dataset instance. Unlike persistent storage implementations, memory
|
|
@@ -76,9 +77,7 @@ class MemoryDatasetClient(DatasetClient):
|
|
|
76
77
|
ValueError: If both name and alias are provided, or if neither id, name, nor alias is provided.
|
|
77
78
|
"""
|
|
78
79
|
# Validate input parameters.
|
|
79
|
-
|
|
80
|
-
if specified_params > 1:
|
|
81
|
-
raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')
|
|
80
|
+
raise_if_too_many_kwargs(id=id, name=name, alias=alias)
|
|
82
81
|
|
|
83
82
|
# Create a new dataset
|
|
84
83
|
dataset_id = id or crypto_random_object_id()
|
|
@@ -4,10 +4,11 @@ import sys
|
|
|
4
4
|
from datetime import datetime, timezone
|
|
5
5
|
from typing import TYPE_CHECKING, Any
|
|
6
6
|
|
|
7
|
-
from typing_extensions import override
|
|
7
|
+
from typing_extensions import Self, override
|
|
8
8
|
|
|
9
9
|
from crawlee._utils.crypto import crypto_random_object_id
|
|
10
10
|
from crawlee._utils.file import infer_mime_type
|
|
11
|
+
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
|
|
11
12
|
from crawlee.storage_clients._base import KeyValueStoreClient
|
|
12
13
|
from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata
|
|
13
14
|
|
|
@@ -52,7 +53,7 @@ class MemoryKeyValueStoreClient(KeyValueStoreClient):
|
|
|
52
53
|
id: str | None,
|
|
53
54
|
name: str | None,
|
|
54
55
|
alias: str | None,
|
|
55
|
-
) ->
|
|
56
|
+
) -> Self:
|
|
56
57
|
"""Open or create a new memory key-value store client.
|
|
57
58
|
|
|
58
59
|
This method creates a new in-memory key-value store instance. Unlike persistent storage implementations,
|
|
@@ -74,9 +75,7 @@ class MemoryKeyValueStoreClient(KeyValueStoreClient):
|
|
|
74
75
|
ValueError: If both name and alias are provided.
|
|
75
76
|
"""
|
|
76
77
|
# Validate input parameters.
|
|
77
|
-
|
|
78
|
-
if specified_params > 1:
|
|
79
|
-
raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')
|
|
78
|
+
raise_if_too_many_kwargs(id=id, name=name, alias=alias)
|
|
80
79
|
|
|
81
80
|
# Create a new key-value store
|
|
82
81
|
store_id = id or crypto_random_object_id()
|
|
@@ -6,10 +6,11 @@ from datetime import datetime, timezone
|
|
|
6
6
|
from logging import getLogger
|
|
7
7
|
from typing import TYPE_CHECKING
|
|
8
8
|
|
|
9
|
-
from typing_extensions import override
|
|
9
|
+
from typing_extensions import Self, override
|
|
10
10
|
|
|
11
11
|
from crawlee import Request
|
|
12
12
|
from crawlee._utils.crypto import crypto_random_object_id
|
|
13
|
+
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
|
|
13
14
|
from crawlee.storage_clients._base import RequestQueueClient
|
|
14
15
|
from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata
|
|
15
16
|
|
|
@@ -64,7 +65,7 @@ class MemoryRequestQueueClient(RequestQueueClient):
|
|
|
64
65
|
id: str | None,
|
|
65
66
|
name: str | None,
|
|
66
67
|
alias: str | None,
|
|
67
|
-
) ->
|
|
68
|
+
) -> Self:
|
|
68
69
|
"""Open or create a new memory request queue client.
|
|
69
70
|
|
|
70
71
|
This method creates a new in-memory request queue instance. Unlike persistent storage implementations,
|
|
@@ -86,9 +87,7 @@ class MemoryRequestQueueClient(RequestQueueClient):
|
|
|
86
87
|
ValueError: If both name and alias are provided.
|
|
87
88
|
"""
|
|
88
89
|
# Validate input parameters.
|
|
89
|
-
|
|
90
|
-
if specified_params > 1:
|
|
91
|
-
raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')
|
|
90
|
+
raise_if_too_many_kwargs(id=id, name=name, alias=alias)
|
|
92
91
|
|
|
93
92
|
# Create a new queue
|
|
94
93
|
queue_id = id or crypto_random_object_id()
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
from ._dataset_client import RedisDatasetClient
|
|
2
|
+
from ._key_value_store_client import RedisKeyValueStoreClient
|
|
3
|
+
from ._request_queue_client import RedisRequestQueueClient
|
|
4
|
+
from ._storage_client import RedisStorageClient
|
|
5
|
+
|
|
6
|
+
__all__ = ['RedisDatasetClient', 'RedisKeyValueStoreClient', 'RedisRequestQueueClient', 'RedisStorageClient']
|