crawlee 0.6.13b31__py3-none-any.whl → 1.1.1b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlee might be problematic. Click here for more details.
- crawlee/_autoscaling/snapshotter.py +1 -1
- crawlee/_request.py +34 -22
- crawlee/_service_locator.py +44 -24
- crawlee/_types.py +86 -33
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +15 -0
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +1 -1
- crawlee/_utils/system.py +3 -3
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +4 -1
- crawlee/browsers/_playwright_browser_controller.py +21 -15
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +2 -0
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +6 -2
- crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
- crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +39 -15
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +2 -1
- crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
- crawlee/crawlers/_basic/_basic_crawler.py +124 -37
- crawlee/crawlers/_playwright/_playwright_crawler.py +17 -5
- crawlee/events/_event_manager.py +3 -1
- crawlee/events/_types.py +6 -6
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/fingerprint_suite/_types.py +2 -2
- crawlee/otel/crawler_instrumentor.py +3 -3
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_request_list.py +1 -1
- crawlee/request_loaders/_sitemap_request_loader.py +23 -5
- crawlee/sessions/_models.py +2 -2
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +33 -2
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +16 -0
- crawlee/storage_clients/_base/_storage_client.py +13 -0
- crawlee/storage_clients/_file_system/_dataset_client.py +29 -27
- crawlee/storage_clients/_file_system/_key_value_store_client.py +29 -25
- crawlee/storage_clients/_file_system/_request_queue_client.py +53 -34
- crawlee/storage_clients/_file_system/_storage_client.py +16 -3
- crawlee/storage_clients/_file_system/_utils.py +0 -0
- crawlee/storage_clients/_memory/_dataset_client.py +16 -4
- crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
- crawlee/storage_clients/_memory/_request_queue_client.py +16 -4
- crawlee/storage_clients/_memory/_storage_client.py +6 -3
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +295 -0
- crawlee/storage_clients/_redis/_dataset_client.py +325 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
- crawlee/storage_clients/_redis/_storage_client.py +146 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +268 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
- crawlee/storage_clients/_sql/_storage_client.py +291 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +10 -10
- crawlee/storages/_base.py +5 -1
- crawlee/storages/_dataset.py +12 -2
- crawlee/storages/_key_value_store.py +17 -4
- crawlee/storages/_request_queue.py +10 -2
- crawlee/storages/_storage_instance_manager.py +133 -71
- crawlee/storages/_utils.py +11 -0
- {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/METADATA +17 -6
- {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/RECORD +82 -59
- {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/WHEEL +0 -0
- {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/entry_points.txt +0 -0
- {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/licenses/LICENSE +0 -0
|
@@ -11,12 +11,13 @@ from pathlib import Path
|
|
|
11
11
|
from typing import TYPE_CHECKING
|
|
12
12
|
|
|
13
13
|
from pydantic import BaseModel, ValidationError
|
|
14
|
-
from typing_extensions import override
|
|
14
|
+
from typing_extensions import Self, override
|
|
15
15
|
|
|
16
16
|
from crawlee import Request
|
|
17
17
|
from crawlee._consts import METADATA_FILENAME
|
|
18
18
|
from crawlee._utils.crypto import crypto_random_object_id
|
|
19
19
|
from crawlee._utils.file import atomic_write, json_dumps
|
|
20
|
+
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
|
|
20
21
|
from crawlee._utils.recoverable_state import RecoverableState
|
|
21
22
|
from crawlee.storage_clients._base import RequestQueueClient
|
|
22
23
|
from crawlee.storage_clients.models import (
|
|
@@ -30,6 +31,7 @@ if TYPE_CHECKING:
|
|
|
30
31
|
from collections.abc import Sequence
|
|
31
32
|
|
|
32
33
|
from crawlee.configuration import Configuration
|
|
34
|
+
from crawlee.storages import KeyValueStore
|
|
33
35
|
|
|
34
36
|
logger = getLogger(__name__)
|
|
35
37
|
|
|
@@ -89,8 +91,9 @@ class FileSystemRequestQueueClient(RequestQueueClient):
|
|
|
89
91
|
self,
|
|
90
92
|
*,
|
|
91
93
|
metadata: RequestQueueMetadata,
|
|
92
|
-
|
|
94
|
+
path_to_rq: Path,
|
|
93
95
|
lock: asyncio.Lock,
|
|
96
|
+
recoverable_state: RecoverableState[RequestQueueState],
|
|
94
97
|
) -> None:
|
|
95
98
|
"""Initialize a new instance.
|
|
96
99
|
|
|
@@ -98,8 +101,8 @@ class FileSystemRequestQueueClient(RequestQueueClient):
|
|
|
98
101
|
"""
|
|
99
102
|
self._metadata = metadata
|
|
100
103
|
|
|
101
|
-
self.
|
|
102
|
-
"""The
|
|
104
|
+
self._path_to_rq = path_to_rq
|
|
105
|
+
"""The full path to the request queue directory."""
|
|
103
106
|
|
|
104
107
|
self._lock = lock
|
|
105
108
|
"""A lock to ensure that only one operation is performed at a time."""
|
|
@@ -113,13 +116,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
|
|
|
113
116
|
self._is_empty_cache: bool | None = None
|
|
114
117
|
"""Cache for is_empty result: None means unknown, True/False is cached state."""
|
|
115
118
|
|
|
116
|
-
self._state =
|
|
117
|
-
default_state=RequestQueueState(),
|
|
118
|
-
persist_state_key='request_queue_state',
|
|
119
|
-
persistence_enabled=True,
|
|
120
|
-
persist_state_kvs_name=f'__RQ_STATE_{self._metadata.id}',
|
|
121
|
-
logger=logger,
|
|
122
|
-
)
|
|
119
|
+
self._state = recoverable_state
|
|
123
120
|
"""Recoverable state to maintain request ordering, in-progress status, and handled status."""
|
|
124
121
|
|
|
125
122
|
@override
|
|
@@ -129,24 +126,38 @@ class FileSystemRequestQueueClient(RequestQueueClient):
|
|
|
129
126
|
@property
|
|
130
127
|
def path_to_rq(self) -> Path:
|
|
131
128
|
"""The full path to the request queue directory."""
|
|
132
|
-
|
|
133
|
-
return self._storage_dir / self._STORAGE_SUBDIR / self._STORAGE_SUBSUBDIR_DEFAULT
|
|
134
|
-
|
|
135
|
-
return self._storage_dir / self._STORAGE_SUBDIR / self._metadata.name
|
|
129
|
+
return self._path_to_rq
|
|
136
130
|
|
|
137
131
|
@property
|
|
138
132
|
def path_to_metadata(self) -> Path:
|
|
139
133
|
"""The full path to the request queue metadata file."""
|
|
140
134
|
return self.path_to_rq / METADATA_FILENAME
|
|
141
135
|
|
|
136
|
+
@classmethod
|
|
137
|
+
async def _create_recoverable_state(cls, id: str, configuration: Configuration) -> RecoverableState:
|
|
138
|
+
async def kvs_factory() -> KeyValueStore:
|
|
139
|
+
from crawlee.storage_clients import FileSystemStorageClient # noqa: PLC0415 avoid circular import
|
|
140
|
+
from crawlee.storages import KeyValueStore # noqa: PLC0415 avoid circular import
|
|
141
|
+
|
|
142
|
+
return await KeyValueStore.open(storage_client=FileSystemStorageClient(), configuration=configuration)
|
|
143
|
+
|
|
144
|
+
return RecoverableState[RequestQueueState](
|
|
145
|
+
default_state=RequestQueueState(),
|
|
146
|
+
persist_state_key=f'__RQ_STATE_{id}',
|
|
147
|
+
persist_state_kvs_factory=kvs_factory,
|
|
148
|
+
persistence_enabled=True,
|
|
149
|
+
logger=logger,
|
|
150
|
+
)
|
|
151
|
+
|
|
142
152
|
@classmethod
|
|
143
153
|
async def open(
|
|
144
154
|
cls,
|
|
145
155
|
*,
|
|
146
156
|
id: str | None,
|
|
147
157
|
name: str | None,
|
|
158
|
+
alias: str | None,
|
|
148
159
|
configuration: Configuration,
|
|
149
|
-
) ->
|
|
160
|
+
) -> Self:
|
|
150
161
|
"""Open or create a file system request queue client.
|
|
151
162
|
|
|
152
163
|
This method attempts to open an existing request queue from the file system. If a queue with the specified
|
|
@@ -155,17 +166,21 @@ class FileSystemRequestQueueClient(RequestQueueClient):
|
|
|
155
166
|
|
|
156
167
|
Args:
|
|
157
168
|
id: The ID of the request queue to open. If provided, searches for existing queue by ID.
|
|
158
|
-
name: The name of the request queue
|
|
169
|
+
name: The name of the request queue for named (global scope) storages.
|
|
170
|
+
alias: The alias of the request queue for unnamed (run scope) storages.
|
|
159
171
|
configuration: The configuration object containing storage directory settings.
|
|
160
172
|
|
|
161
173
|
Returns:
|
|
162
174
|
An instance for the opened or created storage client.
|
|
163
175
|
|
|
164
176
|
Raises:
|
|
165
|
-
ValueError: If a queue with the specified ID is not found,
|
|
177
|
+
ValueError: If a queue with the specified ID is not found, if metadata is invalid,
|
|
178
|
+
or if both name and alias are provided.
|
|
166
179
|
"""
|
|
167
|
-
|
|
168
|
-
|
|
180
|
+
# Validate input parameters.
|
|
181
|
+
raise_if_too_many_kwargs(id=id, name=name, alias=alias)
|
|
182
|
+
|
|
183
|
+
rq_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR
|
|
169
184
|
|
|
170
185
|
if not rq_base_path.exists():
|
|
171
186
|
await asyncio.to_thread(rq_base_path.mkdir, parents=True, exist_ok=True)
|
|
@@ -177,12 +192,12 @@ class FileSystemRequestQueueClient(RequestQueueClient):
|
|
|
177
192
|
if not rq_dir.is_dir():
|
|
178
193
|
continue
|
|
179
194
|
|
|
180
|
-
|
|
181
|
-
if not
|
|
195
|
+
path_to_metadata = rq_dir / METADATA_FILENAME
|
|
196
|
+
if not path_to_metadata.exists():
|
|
182
197
|
continue
|
|
183
198
|
|
|
184
199
|
try:
|
|
185
|
-
file = await asyncio.to_thread(
|
|
200
|
+
file = await asyncio.to_thread(path_to_metadata.open)
|
|
186
201
|
try:
|
|
187
202
|
file_content = json.load(file)
|
|
188
203
|
metadata = RequestQueueMetadata(**file_content)
|
|
@@ -190,8 +205,11 @@ class FileSystemRequestQueueClient(RequestQueueClient):
|
|
|
190
205
|
if metadata.id == id:
|
|
191
206
|
client = cls(
|
|
192
207
|
metadata=metadata,
|
|
193
|
-
|
|
208
|
+
path_to_rq=rq_base_path / rq_dir,
|
|
194
209
|
lock=asyncio.Lock(),
|
|
210
|
+
recoverable_state=await cls._create_recoverable_state(
|
|
211
|
+
id=id, configuration=configuration
|
|
212
|
+
),
|
|
195
213
|
)
|
|
196
214
|
await client._state.initialize()
|
|
197
215
|
await client._discover_existing_requests()
|
|
@@ -206,14 +224,15 @@ class FileSystemRequestQueueClient(RequestQueueClient):
|
|
|
206
224
|
if not found:
|
|
207
225
|
raise ValueError(f'Request queue with ID "{id}" not found')
|
|
208
226
|
|
|
209
|
-
# Open an existing RQ by its name, or create a new one if not found.
|
|
227
|
+
# Open an existing RQ by its name or alias, or create a new one if not found.
|
|
210
228
|
else:
|
|
211
|
-
|
|
212
|
-
|
|
229
|
+
rq_dir = Path(name) if name else Path(alias) if alias else Path('default')
|
|
230
|
+
path_to_rq = rq_base_path / rq_dir
|
|
231
|
+
path_to_metadata = path_to_rq / METADATA_FILENAME
|
|
213
232
|
|
|
214
233
|
# If the RQ directory exists, reconstruct the client from the metadata file.
|
|
215
|
-
if
|
|
216
|
-
file = await asyncio.to_thread(open,
|
|
234
|
+
if path_to_rq.exists() and path_to_metadata.exists():
|
|
235
|
+
file = await asyncio.to_thread(open, path_to_metadata)
|
|
217
236
|
try:
|
|
218
237
|
file_content = json.load(file)
|
|
219
238
|
finally:
|
|
@@ -221,14 +240,13 @@ class FileSystemRequestQueueClient(RequestQueueClient):
|
|
|
221
240
|
try:
|
|
222
241
|
metadata = RequestQueueMetadata(**file_content)
|
|
223
242
|
except ValidationError as exc:
|
|
224
|
-
raise ValueError(f'Invalid metadata file for request queue "{name}"') from exc
|
|
225
|
-
|
|
226
|
-
metadata.name = name
|
|
243
|
+
raise ValueError(f'Invalid metadata file for request queue "{name or alias}"') from exc
|
|
227
244
|
|
|
228
245
|
client = cls(
|
|
229
246
|
metadata=metadata,
|
|
230
|
-
|
|
247
|
+
path_to_rq=path_to_rq,
|
|
231
248
|
lock=asyncio.Lock(),
|
|
249
|
+
recoverable_state=await cls._create_recoverable_state(id=metadata.id, configuration=configuration),
|
|
232
250
|
)
|
|
233
251
|
|
|
234
252
|
await client._state.initialize()
|
|
@@ -251,8 +269,9 @@ class FileSystemRequestQueueClient(RequestQueueClient):
|
|
|
251
269
|
)
|
|
252
270
|
client = cls(
|
|
253
271
|
metadata=metadata,
|
|
254
|
-
|
|
272
|
+
path_to_rq=path_to_rq,
|
|
255
273
|
lock=asyncio.Lock(),
|
|
274
|
+
recoverable_state=await cls._create_recoverable_state(id=metadata.id, configuration=configuration),
|
|
256
275
|
)
|
|
257
276
|
await client._state.initialize()
|
|
258
277
|
await client._update_metadata()
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
3
5
|
from typing_extensions import override
|
|
4
6
|
|
|
5
7
|
from crawlee._utils.docs import docs_group
|
|
@@ -10,6 +12,9 @@ from ._dataset_client import FileSystemDatasetClient
|
|
|
10
12
|
from ._key_value_store_client import FileSystemKeyValueStoreClient
|
|
11
13
|
from ._request_queue_client import FileSystemRequestQueueClient
|
|
12
14
|
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from collections.abc import Hashable
|
|
17
|
+
|
|
13
18
|
|
|
14
19
|
@docs_group('Storage clients')
|
|
15
20
|
class FileSystemStorageClient(StorageClient):
|
|
@@ -29,16 +34,22 @@ class FileSystemStorageClient(StorageClient):
|
|
|
29
34
|
Use it only when running a single crawler process at a time.
|
|
30
35
|
"""
|
|
31
36
|
|
|
37
|
+
@override
|
|
38
|
+
def get_storage_client_cache_key(self, configuration: Configuration) -> Hashable:
|
|
39
|
+
# Even different client instances should return same storage if the storage_dir is the same.
|
|
40
|
+
return super().get_storage_client_cache_key(configuration), configuration.storage_dir
|
|
41
|
+
|
|
32
42
|
@override
|
|
33
43
|
async def create_dataset_client(
|
|
34
44
|
self,
|
|
35
45
|
*,
|
|
36
46
|
id: str | None = None,
|
|
37
47
|
name: str | None = None,
|
|
48
|
+
alias: str | None = None,
|
|
38
49
|
configuration: Configuration | None = None,
|
|
39
50
|
) -> FileSystemDatasetClient:
|
|
40
51
|
configuration = configuration or Configuration.get_global_configuration()
|
|
41
|
-
client = await FileSystemDatasetClient.open(id=id, name=name, configuration=configuration)
|
|
52
|
+
client = await FileSystemDatasetClient.open(id=id, name=name, alias=alias, configuration=configuration)
|
|
42
53
|
await self._purge_if_needed(client, configuration)
|
|
43
54
|
return client
|
|
44
55
|
|
|
@@ -48,10 +59,11 @@ class FileSystemStorageClient(StorageClient):
|
|
|
48
59
|
*,
|
|
49
60
|
id: str | None = None,
|
|
50
61
|
name: str | None = None,
|
|
62
|
+
alias: str | None = None,
|
|
51
63
|
configuration: Configuration | None = None,
|
|
52
64
|
) -> FileSystemKeyValueStoreClient:
|
|
53
65
|
configuration = configuration or Configuration.get_global_configuration()
|
|
54
|
-
client = await FileSystemKeyValueStoreClient.open(id=id, name=name, configuration=configuration)
|
|
66
|
+
client = await FileSystemKeyValueStoreClient.open(id=id, name=name, alias=alias, configuration=configuration)
|
|
55
67
|
await self._purge_if_needed(client, configuration)
|
|
56
68
|
return client
|
|
57
69
|
|
|
@@ -61,9 +73,10 @@ class FileSystemStorageClient(StorageClient):
|
|
|
61
73
|
*,
|
|
62
74
|
id: str | None = None,
|
|
63
75
|
name: str | None = None,
|
|
76
|
+
alias: str | None = None,
|
|
64
77
|
configuration: Configuration | None = None,
|
|
65
78
|
) -> FileSystemRequestQueueClient:
|
|
66
79
|
configuration = configuration or Configuration.get_global_configuration()
|
|
67
|
-
client = await FileSystemRequestQueueClient.open(id=id, name=name, configuration=configuration)
|
|
80
|
+
client = await FileSystemRequestQueueClient.open(id=id, name=name, alias=alias, configuration=configuration)
|
|
68
81
|
await self._purge_if_needed(client, configuration)
|
|
69
82
|
return client
|
|
File without changes
|
|
@@ -4,9 +4,10 @@ from datetime import datetime, timezone
|
|
|
4
4
|
from logging import getLogger
|
|
5
5
|
from typing import TYPE_CHECKING, Any
|
|
6
6
|
|
|
7
|
-
from typing_extensions import override
|
|
7
|
+
from typing_extensions import Self, override
|
|
8
8
|
|
|
9
9
|
from crawlee._utils.crypto import crypto_random_object_id
|
|
10
|
+
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
|
|
10
11
|
from crawlee.storage_clients._base import DatasetClient
|
|
11
12
|
from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
|
|
12
13
|
|
|
@@ -53,21 +54,32 @@ class MemoryDatasetClient(DatasetClient):
|
|
|
53
54
|
*,
|
|
54
55
|
id: str | None,
|
|
55
56
|
name: str | None,
|
|
56
|
-
|
|
57
|
+
alias: str | None,
|
|
58
|
+
) -> Self:
|
|
57
59
|
"""Open or create a new memory dataset client.
|
|
58
60
|
|
|
59
61
|
This method creates a new in-memory dataset instance. Unlike persistent storage implementations, memory
|
|
60
62
|
datasets don't check for existing datasets with the same name or ID since all data exists only in memory
|
|
61
63
|
and is lost when the process terminates.
|
|
62
64
|
|
|
65
|
+
Alias does not have any effect on the memory storage client implementation, because unnamed storages
|
|
66
|
+
are supported by default, since data are not persisted.
|
|
67
|
+
|
|
63
68
|
Args:
|
|
64
69
|
id: The ID of the dataset. If not provided, a random ID will be generated.
|
|
65
|
-
name: The name of the dataset
|
|
70
|
+
name: The name of the dataset for named (global scope) storages.
|
|
71
|
+
alias: The alias of the dataset for unnamed (run scope) storages.
|
|
66
72
|
|
|
67
73
|
Returns:
|
|
68
74
|
An instance for the opened or created storage client.
|
|
75
|
+
|
|
76
|
+
Raises:
|
|
77
|
+
ValueError: If both name and alias are provided, or if neither id, name, nor alias is provided.
|
|
69
78
|
"""
|
|
70
|
-
#
|
|
79
|
+
# Validate input parameters.
|
|
80
|
+
raise_if_too_many_kwargs(id=id, name=name, alias=alias)
|
|
81
|
+
|
|
82
|
+
# Create a new dataset
|
|
71
83
|
dataset_id = id or crypto_random_object_id()
|
|
72
84
|
now = datetime.now(timezone.utc)
|
|
73
85
|
|
|
@@ -4,10 +4,11 @@ import sys
|
|
|
4
4
|
from datetime import datetime, timezone
|
|
5
5
|
from typing import TYPE_CHECKING, Any
|
|
6
6
|
|
|
7
|
-
from typing_extensions import override
|
|
7
|
+
from typing_extensions import Self, override
|
|
8
8
|
|
|
9
9
|
from crawlee._utils.crypto import crypto_random_object_id
|
|
10
10
|
from crawlee._utils.file import infer_mime_type
|
|
11
|
+
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
|
|
11
12
|
from crawlee.storage_clients._base import KeyValueStoreClient
|
|
12
13
|
from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata
|
|
13
14
|
|
|
@@ -51,21 +52,32 @@ class MemoryKeyValueStoreClient(KeyValueStoreClient):
|
|
|
51
52
|
*,
|
|
52
53
|
id: str | None,
|
|
53
54
|
name: str | None,
|
|
54
|
-
|
|
55
|
+
alias: str | None,
|
|
56
|
+
) -> Self:
|
|
55
57
|
"""Open or create a new memory key-value store client.
|
|
56
58
|
|
|
57
59
|
This method creates a new in-memory key-value store instance. Unlike persistent storage implementations,
|
|
58
60
|
memory KVS don't check for existing stores with the same name or ID since all data exists only in memory
|
|
59
61
|
and is lost when the process terminates.
|
|
60
62
|
|
|
63
|
+
Alias does not have any effect on the memory storage client implementation, because unnamed storages
|
|
64
|
+
are supported by default, since data are not persisted.
|
|
65
|
+
|
|
61
66
|
Args:
|
|
62
67
|
id: The ID of the key-value store. If not provided, a random ID will be generated.
|
|
63
|
-
name: The name of the key-value store
|
|
68
|
+
name: The name of the key-value store for named (global scope) storages.
|
|
69
|
+
alias: The alias of the key-value store for unnamed (run scope) storages.
|
|
64
70
|
|
|
65
71
|
Returns:
|
|
66
72
|
An instance for the opened or created storage client.
|
|
73
|
+
|
|
74
|
+
Raises:
|
|
75
|
+
ValueError: If both name and alias are provided.
|
|
67
76
|
"""
|
|
68
|
-
#
|
|
77
|
+
# Validate input parameters.
|
|
78
|
+
raise_if_too_many_kwargs(id=id, name=name, alias=alias)
|
|
79
|
+
|
|
80
|
+
# Create a new key-value store
|
|
69
81
|
store_id = id or crypto_random_object_id()
|
|
70
82
|
now = datetime.now(timezone.utc)
|
|
71
83
|
|
|
@@ -6,10 +6,11 @@ from datetime import datetime, timezone
|
|
|
6
6
|
from logging import getLogger
|
|
7
7
|
from typing import TYPE_CHECKING
|
|
8
8
|
|
|
9
|
-
from typing_extensions import override
|
|
9
|
+
from typing_extensions import Self, override
|
|
10
10
|
|
|
11
11
|
from crawlee import Request
|
|
12
12
|
from crawlee._utils.crypto import crypto_random_object_id
|
|
13
|
+
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
|
|
13
14
|
from crawlee.storage_clients._base import RequestQueueClient
|
|
14
15
|
from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata
|
|
15
16
|
|
|
@@ -63,21 +64,32 @@ class MemoryRequestQueueClient(RequestQueueClient):
|
|
|
63
64
|
*,
|
|
64
65
|
id: str | None,
|
|
65
66
|
name: str | None,
|
|
66
|
-
|
|
67
|
+
alias: str | None,
|
|
68
|
+
) -> Self:
|
|
67
69
|
"""Open or create a new memory request queue client.
|
|
68
70
|
|
|
69
71
|
This method creates a new in-memory request queue instance. Unlike persistent storage implementations,
|
|
70
72
|
memory queues don't check for existing queues with the same name or ID since all data exists only
|
|
71
73
|
in memory and is lost when the process terminates.
|
|
72
74
|
|
|
75
|
+
Alias does not have any effect on the memory storage client implementation, because unnamed storages
|
|
76
|
+
are supported by default, since data are not persisted.
|
|
77
|
+
|
|
73
78
|
Args:
|
|
74
79
|
id: The ID of the request queue. If not provided, a random ID will be generated.
|
|
75
|
-
name: The name of the request queue
|
|
80
|
+
name: The name of the request queue for named (global scope) storages.
|
|
81
|
+
alias: The alias of the request queue for unnamed (run scope) storages.
|
|
76
82
|
|
|
77
83
|
Returns:
|
|
78
84
|
An instance for the opened or created storage client.
|
|
85
|
+
|
|
86
|
+
Raises:
|
|
87
|
+
ValueError: If both name and alias are provided.
|
|
79
88
|
"""
|
|
80
|
-
#
|
|
89
|
+
# Validate input parameters.
|
|
90
|
+
raise_if_too_many_kwargs(id=id, name=name, alias=alias)
|
|
91
|
+
|
|
92
|
+
# Create a new queue
|
|
81
93
|
queue_id = id or crypto_random_object_id()
|
|
82
94
|
now = datetime.now(timezone.utc)
|
|
83
95
|
|
|
@@ -33,10 +33,11 @@ class MemoryStorageClient(StorageClient):
|
|
|
33
33
|
*,
|
|
34
34
|
id: str | None = None,
|
|
35
35
|
name: str | None = None,
|
|
36
|
+
alias: str | None = None,
|
|
36
37
|
configuration: Configuration | None = None,
|
|
37
38
|
) -> MemoryDatasetClient:
|
|
38
39
|
configuration = configuration or Configuration.get_global_configuration()
|
|
39
|
-
client = await MemoryDatasetClient.open(id=id, name=name)
|
|
40
|
+
client = await MemoryDatasetClient.open(id=id, name=name, alias=alias)
|
|
40
41
|
await self._purge_if_needed(client, configuration)
|
|
41
42
|
return client
|
|
42
43
|
|
|
@@ -46,10 +47,11 @@ class MemoryStorageClient(StorageClient):
|
|
|
46
47
|
*,
|
|
47
48
|
id: str | None = None,
|
|
48
49
|
name: str | None = None,
|
|
50
|
+
alias: str | None = None,
|
|
49
51
|
configuration: Configuration | None = None,
|
|
50
52
|
) -> MemoryKeyValueStoreClient:
|
|
51
53
|
configuration = configuration or Configuration.get_global_configuration()
|
|
52
|
-
client = await MemoryKeyValueStoreClient.open(id=id, name=name)
|
|
54
|
+
client = await MemoryKeyValueStoreClient.open(id=id, name=name, alias=alias)
|
|
53
55
|
await self._purge_if_needed(client, configuration)
|
|
54
56
|
return client
|
|
55
57
|
|
|
@@ -59,9 +61,10 @@ class MemoryStorageClient(StorageClient):
|
|
|
59
61
|
*,
|
|
60
62
|
id: str | None = None,
|
|
61
63
|
name: str | None = None,
|
|
64
|
+
alias: str | None = None,
|
|
62
65
|
configuration: Configuration | None = None,
|
|
63
66
|
) -> MemoryRequestQueueClient:
|
|
64
67
|
configuration = configuration or Configuration.get_global_configuration()
|
|
65
|
-
client = await MemoryRequestQueueClient.open(id=id, name=name)
|
|
68
|
+
client = await MemoryRequestQueueClient.open(id=id, name=name, alias=alias)
|
|
66
69
|
await self._purge_if_needed(client, configuration)
|
|
67
70
|
return client
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
from ._dataset_client import RedisDatasetClient
|
|
2
|
+
from ._key_value_store_client import RedisKeyValueStoreClient
|
|
3
|
+
from ._request_queue_client import RedisRequestQueueClient
|
|
4
|
+
from ._storage_client import RedisStorageClient
|
|
5
|
+
|
|
6
|
+
__all__ = ['RedisDatasetClient', 'RedisKeyValueStoreClient', 'RedisRequestQueueClient', 'RedisStorageClient']
|