crawlee 0.6.13b15 (py3-none-any.whl) → 1.3.1b3 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlee might be problematic. Click here for more details.
- crawlee/__init__.py +2 -1
- crawlee/_autoscaling/snapshotter.py +1 -1
- crawlee/_browserforge_workaround.py +7 -3
- crawlee/_request.py +64 -43
- crawlee/_service_locator.py +44 -24
- crawlee/_types.py +128 -36
- crawlee/_utils/context.py +3 -3
- crawlee/_utils/file.py +8 -1
- crawlee/_utils/globs.py +4 -4
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +27 -3
- crawlee/_utils/requests.py +0 -26
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +16 -7
- crawlee/_utils/system.py +30 -14
- crawlee/_utils/time.py +120 -0
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +5 -2
- crawlee/browsers/_playwright_browser.py +2 -1
- crawlee/browsers/_playwright_browser_controller.py +21 -15
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +2 -0
- crawlee/crawlers/__init__.py +5 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +57 -17
- crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
- crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
- crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +27 -32
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
- crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
- crawlee/crawlers/_basic/_basic_crawler.py +254 -148
- crawlee/crawlers/_basic/_context_utils.py +24 -0
- crawlee/crawlers/_basic/_logging_utils.py +27 -4
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
- crawlee/crawlers/_playwright/_playwright_crawler.py +73 -23
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/errors.py +4 -0
- crawlee/events/_event_manager.py +12 -6
- crawlee/events/_types.py +6 -6
- crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
- crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/fingerprint_suite/_types.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +68 -14
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +32 -11
- crawlee/otel/crawler_instrumentor.py +4 -6
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_request_list.py +3 -3
- crawlee/request_loaders/_request_loader.py +5 -1
- crawlee/request_loaders/_sitemap_request_loader.py +248 -50
- crawlee/router.py +13 -3
- crawlee/sessions/_cookies.py +13 -8
- crawlee/sessions/_models.py +5 -5
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +62 -12
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +16 -0
- crawlee/storage_clients/_base/_dataset_client.py +2 -2
- crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
- crawlee/storage_clients/_base/_request_queue_client.py +2 -2
- crawlee/storage_clients/_base/_storage_client.py +13 -0
- crawlee/storage_clients/_file_system/_dataset_client.py +33 -30
- crawlee/storage_clients/_file_system/_key_value_store_client.py +34 -27
- crawlee/storage_clients/_file_system/_request_queue_client.py +171 -154
- crawlee/storage_clients/_file_system/_storage_client.py +16 -3
- crawlee/storage_clients/_file_system/_utils.py +0 -0
- crawlee/storage_clients/_memory/_dataset_client.py +16 -4
- crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
- crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
- crawlee/storage_clients/_memory/_storage_client.py +6 -3
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +292 -0
- crawlee/storage_clients/_redis/_dataset_client.py +329 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
- crawlee/storage_clients/_redis/_storage_client.py +149 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +268 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
- crawlee/storage_clients/_sql/_storage_client.py +282 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +21 -14
- crawlee/storages/_base.py +5 -1
- crawlee/storages/_dataset.py +12 -2
- crawlee/storages/_key_value_store.py +17 -4
- crawlee/storages/_request_queue.py +13 -5
- crawlee/storages/_storage_instance_manager.py +196 -75
- crawlee/storages/_utils.py +11 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/METADATA +24 -19
- crawlee-1.3.1b3.dist-info/RECORD +207 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/WHEEL +1 -1
- crawlee/_utils/measure_time.py +0 -31
- crawlee-0.6.13b15.dist-info/RECORD +0 -183
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/entry_points.txt +0 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
3
5
|
from typing_extensions import override
|
|
4
6
|
|
|
5
7
|
from crawlee._utils.docs import docs_group
|
|
@@ -10,6 +12,9 @@ from ._dataset_client import FileSystemDatasetClient
|
|
|
10
12
|
from ._key_value_store_client import FileSystemKeyValueStoreClient
|
|
11
13
|
from ._request_queue_client import FileSystemRequestQueueClient
|
|
12
14
|
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from collections.abc import Hashable
|
|
17
|
+
|
|
13
18
|
|
|
14
19
|
@docs_group('Storage clients')
|
|
15
20
|
class FileSystemStorageClient(StorageClient):
|
|
@@ -29,16 +34,22 @@ class FileSystemStorageClient(StorageClient):
|
|
|
29
34
|
Use it only when running a single crawler process at a time.
|
|
30
35
|
"""
|
|
31
36
|
|
|
37
|
+
@override
|
|
38
|
+
def get_storage_client_cache_key(self, configuration: Configuration) -> Hashable:
|
|
39
|
+
# Even different client instances should return same storage if the storage_dir is the same.
|
|
40
|
+
return super().get_storage_client_cache_key(configuration), configuration.storage_dir
|
|
41
|
+
|
|
32
42
|
@override
|
|
33
43
|
async def create_dataset_client(
|
|
34
44
|
self,
|
|
35
45
|
*,
|
|
36
46
|
id: str | None = None,
|
|
37
47
|
name: str | None = None,
|
|
48
|
+
alias: str | None = None,
|
|
38
49
|
configuration: Configuration | None = None,
|
|
39
50
|
) -> FileSystemDatasetClient:
|
|
40
51
|
configuration = configuration or Configuration.get_global_configuration()
|
|
41
|
-
client = await FileSystemDatasetClient.open(id=id, name=name, configuration=configuration)
|
|
52
|
+
client = await FileSystemDatasetClient.open(id=id, name=name, alias=alias, configuration=configuration)
|
|
42
53
|
await self._purge_if_needed(client, configuration)
|
|
43
54
|
return client
|
|
44
55
|
|
|
@@ -48,10 +59,11 @@ class FileSystemStorageClient(StorageClient):
|
|
|
48
59
|
*,
|
|
49
60
|
id: str | None = None,
|
|
50
61
|
name: str | None = None,
|
|
62
|
+
alias: str | None = None,
|
|
51
63
|
configuration: Configuration | None = None,
|
|
52
64
|
) -> FileSystemKeyValueStoreClient:
|
|
53
65
|
configuration = configuration or Configuration.get_global_configuration()
|
|
54
|
-
client = await FileSystemKeyValueStoreClient.open(id=id, name=name, configuration=configuration)
|
|
66
|
+
client = await FileSystemKeyValueStoreClient.open(id=id, name=name, alias=alias, configuration=configuration)
|
|
55
67
|
await self._purge_if_needed(client, configuration)
|
|
56
68
|
return client
|
|
57
69
|
|
|
@@ -61,9 +73,10 @@ class FileSystemStorageClient(StorageClient):
|
|
|
61
73
|
*,
|
|
62
74
|
id: str | None = None,
|
|
63
75
|
name: str | None = None,
|
|
76
|
+
alias: str | None = None,
|
|
64
77
|
configuration: Configuration | None = None,
|
|
65
78
|
) -> FileSystemRequestQueueClient:
|
|
66
79
|
configuration = configuration or Configuration.get_global_configuration()
|
|
67
|
-
client = await FileSystemRequestQueueClient.open(id=id, name=name, configuration=configuration)
|
|
80
|
+
client = await FileSystemRequestQueueClient.open(id=id, name=name, alias=alias, configuration=configuration)
|
|
68
81
|
await self._purge_if_needed(client, configuration)
|
|
69
82
|
return client
|
|
File without changes
|
|
@@ -4,9 +4,10 @@ from datetime import datetime, timezone
|
|
|
4
4
|
from logging import getLogger
|
|
5
5
|
from typing import TYPE_CHECKING, Any
|
|
6
6
|
|
|
7
|
-
from typing_extensions import override
|
|
7
|
+
from typing_extensions import Self, override
|
|
8
8
|
|
|
9
9
|
from crawlee._utils.crypto import crypto_random_object_id
|
|
10
|
+
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
|
|
10
11
|
from crawlee.storage_clients._base import DatasetClient
|
|
11
12
|
from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
|
|
12
13
|
|
|
@@ -53,21 +54,32 @@ class MemoryDatasetClient(DatasetClient):
|
|
|
53
54
|
*,
|
|
54
55
|
id: str | None,
|
|
55
56
|
name: str | None,
|
|
56
|
-
|
|
57
|
+
alias: str | None,
|
|
58
|
+
) -> Self:
|
|
57
59
|
"""Open or create a new memory dataset client.
|
|
58
60
|
|
|
59
61
|
This method creates a new in-memory dataset instance. Unlike persistent storage implementations, memory
|
|
60
62
|
datasets don't check for existing datasets with the same name or ID since all data exists only in memory
|
|
61
63
|
and is lost when the process terminates.
|
|
62
64
|
|
|
65
|
+
Alias does not have any effect on the memory storage client implementation, because unnamed storages
|
|
66
|
+
are supported by default, since data are not persisted.
|
|
67
|
+
|
|
63
68
|
Args:
|
|
64
69
|
id: The ID of the dataset. If not provided, a random ID will be generated.
|
|
65
|
-
name: The name of the dataset
|
|
70
|
+
name: The name of the dataset for named (global scope) storages.
|
|
71
|
+
alias: The alias of the dataset for unnamed (run scope) storages.
|
|
66
72
|
|
|
67
73
|
Returns:
|
|
68
74
|
An instance for the opened or created storage client.
|
|
75
|
+
|
|
76
|
+
Raises:
|
|
77
|
+
ValueError: If both name and alias are provided, or if neither id, name, nor alias is provided.
|
|
69
78
|
"""
|
|
70
|
-
#
|
|
79
|
+
# Validate input parameters.
|
|
80
|
+
raise_if_too_many_kwargs(id=id, name=name, alias=alias)
|
|
81
|
+
|
|
82
|
+
# Create a new dataset
|
|
71
83
|
dataset_id = id or crypto_random_object_id()
|
|
72
84
|
now = datetime.now(timezone.utc)
|
|
73
85
|
|
|
@@ -4,10 +4,11 @@ import sys
|
|
|
4
4
|
from datetime import datetime, timezone
|
|
5
5
|
from typing import TYPE_CHECKING, Any
|
|
6
6
|
|
|
7
|
-
from typing_extensions import override
|
|
7
|
+
from typing_extensions import Self, override
|
|
8
8
|
|
|
9
9
|
from crawlee._utils.crypto import crypto_random_object_id
|
|
10
10
|
from crawlee._utils.file import infer_mime_type
|
|
11
|
+
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
|
|
11
12
|
from crawlee.storage_clients._base import KeyValueStoreClient
|
|
12
13
|
from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata
|
|
13
14
|
|
|
@@ -51,21 +52,32 @@ class MemoryKeyValueStoreClient(KeyValueStoreClient):
|
|
|
51
52
|
*,
|
|
52
53
|
id: str | None,
|
|
53
54
|
name: str | None,
|
|
54
|
-
|
|
55
|
+
alias: str | None,
|
|
56
|
+
) -> Self:
|
|
55
57
|
"""Open or create a new memory key-value store client.
|
|
56
58
|
|
|
57
59
|
This method creates a new in-memory key-value store instance. Unlike persistent storage implementations,
|
|
58
60
|
memory KVS don't check for existing stores with the same name or ID since all data exists only in memory
|
|
59
61
|
and is lost when the process terminates.
|
|
60
62
|
|
|
63
|
+
Alias does not have any effect on the memory storage client implementation, because unnamed storages
|
|
64
|
+
are supported by default, since data are not persisted.
|
|
65
|
+
|
|
61
66
|
Args:
|
|
62
67
|
id: The ID of the key-value store. If not provided, a random ID will be generated.
|
|
63
|
-
name: The name of the key-value store
|
|
68
|
+
name: The name of the key-value store for named (global scope) storages.
|
|
69
|
+
alias: The alias of the key-value store for unnamed (run scope) storages.
|
|
64
70
|
|
|
65
71
|
Returns:
|
|
66
72
|
An instance for the opened or created storage client.
|
|
73
|
+
|
|
74
|
+
Raises:
|
|
75
|
+
ValueError: If both name and alias are provided.
|
|
67
76
|
"""
|
|
68
|
-
#
|
|
77
|
+
# Validate input parameters.
|
|
78
|
+
raise_if_too_many_kwargs(id=id, name=name, alias=alias)
|
|
79
|
+
|
|
80
|
+
# Create a new key-value store
|
|
69
81
|
store_id = id or crypto_random_object_id()
|
|
70
82
|
now = datetime.now(timezone.utc)
|
|
71
83
|
|
|
@@ -6,10 +6,11 @@ from datetime import datetime, timezone
|
|
|
6
6
|
from logging import getLogger
|
|
7
7
|
from typing import TYPE_CHECKING
|
|
8
8
|
|
|
9
|
-
from typing_extensions import override
|
|
9
|
+
from typing_extensions import Self, override
|
|
10
10
|
|
|
11
11
|
from crawlee import Request
|
|
12
12
|
from crawlee._utils.crypto import crypto_random_object_id
|
|
13
|
+
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
|
|
13
14
|
from crawlee.storage_clients._base import RequestQueueClient
|
|
14
15
|
from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata
|
|
15
16
|
|
|
@@ -50,9 +51,6 @@ class MemoryRequestQueueClient(RequestQueueClient):
|
|
|
50
51
|
self._in_progress_requests = dict[str, Request]()
|
|
51
52
|
"""In-progress requests are those that have been fetched but not yet marked as handled or reclaimed."""
|
|
52
53
|
|
|
53
|
-
self._requests_by_id = dict[str, Request]()
|
|
54
|
-
"""ID -> Request mapping for fast lookup by request ID."""
|
|
55
|
-
|
|
56
54
|
self._requests_by_unique_key = dict[str, Request]()
|
|
57
55
|
"""Unique key -> Request mapping for fast lookup by unique key."""
|
|
58
56
|
|
|
@@ -66,21 +64,32 @@ class MemoryRequestQueueClient(RequestQueueClient):
|
|
|
66
64
|
*,
|
|
67
65
|
id: str | None,
|
|
68
66
|
name: str | None,
|
|
69
|
-
|
|
67
|
+
alias: str | None,
|
|
68
|
+
) -> Self:
|
|
70
69
|
"""Open or create a new memory request queue client.
|
|
71
70
|
|
|
72
71
|
This method creates a new in-memory request queue instance. Unlike persistent storage implementations,
|
|
73
72
|
memory queues don't check for existing queues with the same name or ID since all data exists only
|
|
74
73
|
in memory and is lost when the process terminates.
|
|
75
74
|
|
|
75
|
+
Alias does not have any effect on the memory storage client implementation, because unnamed storages
|
|
76
|
+
are supported by default, since data are not persisted.
|
|
77
|
+
|
|
76
78
|
Args:
|
|
77
79
|
id: The ID of the request queue. If not provided, a random ID will be generated.
|
|
78
|
-
name: The name of the request queue
|
|
80
|
+
name: The name of the request queue for named (global scope) storages.
|
|
81
|
+
alias: The alias of the request queue for unnamed (run scope) storages.
|
|
79
82
|
|
|
80
83
|
Returns:
|
|
81
84
|
An instance for the opened or created storage client.
|
|
85
|
+
|
|
86
|
+
Raises:
|
|
87
|
+
ValueError: If both name and alias are provided.
|
|
82
88
|
"""
|
|
83
|
-
#
|
|
89
|
+
# Validate input parameters.
|
|
90
|
+
raise_if_too_many_kwargs(id=id, name=name, alias=alias)
|
|
91
|
+
|
|
92
|
+
# Create a new queue
|
|
84
93
|
queue_id = id or crypto_random_object_id()
|
|
85
94
|
now = datetime.now(timezone.utc)
|
|
86
95
|
|
|
@@ -102,7 +111,6 @@ class MemoryRequestQueueClient(RequestQueueClient):
|
|
|
102
111
|
async def drop(self) -> None:
|
|
103
112
|
self._pending_requests.clear()
|
|
104
113
|
self._handled_requests.clear()
|
|
105
|
-
self._requests_by_id.clear()
|
|
106
114
|
self._requests_by_unique_key.clear()
|
|
107
115
|
self._in_progress_requests.clear()
|
|
108
116
|
|
|
@@ -118,7 +126,6 @@ class MemoryRequestQueueClient(RequestQueueClient):
|
|
|
118
126
|
async def purge(self) -> None:
|
|
119
127
|
self._pending_requests.clear()
|
|
120
128
|
self._handled_requests.clear()
|
|
121
|
-
self._requests_by_id.clear()
|
|
122
129
|
self._requests_by_unique_key.clear()
|
|
123
130
|
self._in_progress_requests.clear()
|
|
124
131
|
|
|
@@ -142,12 +149,12 @@ class MemoryRequestQueueClient(RequestQueueClient):
|
|
|
142
149
|
|
|
143
150
|
was_already_present = existing_request is not None
|
|
144
151
|
was_already_handled = was_already_present and existing_request and existing_request.handled_at is not None
|
|
152
|
+
is_in_progress = request.unique_key in self._in_progress_requests
|
|
145
153
|
|
|
146
154
|
# If the request is already in the queue and handled, don't add it again.
|
|
147
155
|
if was_already_handled:
|
|
148
156
|
processed_requests.append(
|
|
149
157
|
ProcessedRequest(
|
|
150
|
-
id=request.id,
|
|
151
158
|
unique_key=request.unique_key,
|
|
152
159
|
was_already_present=True,
|
|
153
160
|
was_already_handled=True,
|
|
@@ -155,22 +162,40 @@ class MemoryRequestQueueClient(RequestQueueClient):
|
|
|
155
162
|
)
|
|
156
163
|
continue
|
|
157
164
|
|
|
165
|
+
# If the request is already in progress, don't add it again.
|
|
166
|
+
if is_in_progress:
|
|
167
|
+
processed_requests.append(
|
|
168
|
+
ProcessedRequest(
|
|
169
|
+
unique_key=request.unique_key,
|
|
170
|
+
was_already_present=True,
|
|
171
|
+
was_already_handled=False,
|
|
172
|
+
)
|
|
173
|
+
)
|
|
174
|
+
continue
|
|
175
|
+
|
|
158
176
|
# If the request is already in the queue but not handled, update it.
|
|
159
177
|
if was_already_present and existing_request:
|
|
160
|
-
# Update the existing request with any new data and
|
|
161
|
-
# remove old request from pending queue if it's there.
|
|
162
|
-
with suppress(ValueError):
|
|
163
|
-
self._pending_requests.remove(existing_request)
|
|
164
|
-
|
|
165
178
|
# Update indexes.
|
|
166
|
-
self._requests_by_id[request.id] = request
|
|
167
179
|
self._requests_by_unique_key[request.unique_key] = request
|
|
168
180
|
|
|
169
|
-
#
|
|
181
|
+
# We only update `forefront` by updating its position by shifting it to the left.
|
|
170
182
|
if forefront:
|
|
183
|
+
# Update the existing request with any new data and
|
|
184
|
+
# remove old request from pending queue if it's there.
|
|
185
|
+
with suppress(ValueError):
|
|
186
|
+
self._pending_requests.remove(existing_request)
|
|
187
|
+
|
|
188
|
+
# Add updated request back to queue.
|
|
171
189
|
self._pending_requests.appendleft(request)
|
|
172
|
-
|
|
173
|
-
|
|
190
|
+
|
|
191
|
+
processed_requests.append(
|
|
192
|
+
ProcessedRequest(
|
|
193
|
+
unique_key=request.unique_key,
|
|
194
|
+
was_already_present=True,
|
|
195
|
+
was_already_handled=False,
|
|
196
|
+
)
|
|
197
|
+
)
|
|
198
|
+
|
|
174
199
|
# Add the new request to the queue.
|
|
175
200
|
else:
|
|
176
201
|
if forefront:
|
|
@@ -179,7 +204,6 @@ class MemoryRequestQueueClient(RequestQueueClient):
|
|
|
179
204
|
self._pending_requests.append(request)
|
|
180
205
|
|
|
181
206
|
# Update indexes.
|
|
182
|
-
self._requests_by_id[request.id] = request
|
|
183
207
|
self._requests_by_unique_key[request.unique_key] = request
|
|
184
208
|
|
|
185
209
|
await self._update_metadata(
|
|
@@ -189,7 +213,6 @@ class MemoryRequestQueueClient(RequestQueueClient):
|
|
|
189
213
|
|
|
190
214
|
processed_requests.append(
|
|
191
215
|
ProcessedRequest(
|
|
192
|
-
id=request.id,
|
|
193
216
|
unique_key=request.unique_key,
|
|
194
217
|
was_already_present=was_already_present,
|
|
195
218
|
was_already_handled=False,
|
|
@@ -213,25 +236,24 @@ class MemoryRequestQueueClient(RequestQueueClient):
|
|
|
213
236
|
continue
|
|
214
237
|
|
|
215
238
|
# Skip if already in progress (shouldn't happen, but safety check).
|
|
216
|
-
if request.id in self._in_progress_requests:
|
|
217
|
-
|
|
218
|
-
break
|
|
239
|
+
if request.unique_key in self._in_progress_requests:
|
|
240
|
+
continue
|
|
219
241
|
|
|
220
242
|
# Mark as in progress.
|
|
221
|
-
self._in_progress_requests[request.id] = request
|
|
243
|
+
self._in_progress_requests[request.unique_key] = request
|
|
222
244
|
return request
|
|
223
245
|
|
|
224
246
|
return None
|
|
225
247
|
|
|
226
248
|
@override
|
|
227
|
-
async def get_request(self, request_id: str) -> Request | None:
|
|
249
|
+
async def get_request(self, unique_key: str) -> Request | None:
|
|
228
250
|
await self._update_metadata(update_accessed_at=True)
|
|
229
|
-
return self._requests_by_id.get(request_id)
|
|
251
|
+
return self._requests_by_unique_key.get(unique_key)
|
|
230
252
|
|
|
231
253
|
@override
|
|
232
254
|
async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:
|
|
233
255
|
# Check if the request is in progress.
|
|
234
|
-
if request.id not in self._in_progress_requests:
|
|
256
|
+
if request.unique_key not in self._in_progress_requests:
|
|
235
257
|
return None
|
|
236
258
|
|
|
237
259
|
# Set handled_at timestamp if not already set.
|
|
@@ -239,14 +261,13 @@ class MemoryRequestQueueClient(RequestQueueClient):
|
|
|
239
261
|
request.handled_at = datetime.now(timezone.utc)
|
|
240
262
|
|
|
241
263
|
# Move request to handled storage.
|
|
242
|
-
self._handled_requests[request.id] = request
|
|
264
|
+
self._handled_requests[request.unique_key] = request
|
|
243
265
|
|
|
244
|
-
# Update
|
|
245
|
-
self._requests_by_id[request.id] = request
|
|
266
|
+
# Update index (keep the request in indexes for get_request to work).
|
|
246
267
|
self._requests_by_unique_key[request.unique_key] = request
|
|
247
268
|
|
|
248
269
|
# Remove from in-progress.
|
|
249
|
-
del self._in_progress_requests[request.id]
|
|
270
|
+
del self._in_progress_requests[request.unique_key]
|
|
250
271
|
|
|
251
272
|
# Update metadata.
|
|
252
273
|
await self._update_metadata(
|
|
@@ -256,7 +277,6 @@ class MemoryRequestQueueClient(RequestQueueClient):
|
|
|
256
277
|
)
|
|
257
278
|
|
|
258
279
|
return ProcessedRequest(
|
|
259
|
-
id=request.id,
|
|
260
280
|
unique_key=request.unique_key,
|
|
261
281
|
was_already_present=True,
|
|
262
282
|
was_already_handled=True,
|
|
@@ -270,11 +290,11 @@ class MemoryRequestQueueClient(RequestQueueClient):
|
|
|
270
290
|
forefront: bool = False,
|
|
271
291
|
) -> ProcessedRequest | None:
|
|
272
292
|
# Check if the request is in progress.
|
|
273
|
-
if request.id not in self._in_progress_requests:
|
|
293
|
+
if request.unique_key not in self._in_progress_requests:
|
|
274
294
|
return None
|
|
275
295
|
|
|
276
296
|
# Remove from in-progress.
|
|
277
|
-
del self._in_progress_requests[request.id]
|
|
297
|
+
del self._in_progress_requests[request.unique_key]
|
|
278
298
|
|
|
279
299
|
# Add request back to pending queue.
|
|
280
300
|
if forefront:
|
|
@@ -286,7 +306,6 @@ class MemoryRequestQueueClient(RequestQueueClient):
|
|
|
286
306
|
await self._update_metadata(update_modified_at=True)
|
|
287
307
|
|
|
288
308
|
return ProcessedRequest(
|
|
289
|
-
id=request.id,
|
|
290
309
|
unique_key=request.unique_key,
|
|
291
310
|
was_already_present=True,
|
|
292
311
|
was_already_handled=False,
|
|
@@ -33,10 +33,11 @@ class MemoryStorageClient(StorageClient):
|
|
|
33
33
|
*,
|
|
34
34
|
id: str | None = None,
|
|
35
35
|
name: str | None = None,
|
|
36
|
+
alias: str | None = None,
|
|
36
37
|
configuration: Configuration | None = None,
|
|
37
38
|
) -> MemoryDatasetClient:
|
|
38
39
|
configuration = configuration or Configuration.get_global_configuration()
|
|
39
|
-
client = await MemoryDatasetClient.open(id=id, name=name)
|
|
40
|
+
client = await MemoryDatasetClient.open(id=id, name=name, alias=alias)
|
|
40
41
|
await self._purge_if_needed(client, configuration)
|
|
41
42
|
return client
|
|
42
43
|
|
|
@@ -46,10 +47,11 @@ class MemoryStorageClient(StorageClient):
|
|
|
46
47
|
*,
|
|
47
48
|
id: str | None = None,
|
|
48
49
|
name: str | None = None,
|
|
50
|
+
alias: str | None = None,
|
|
49
51
|
configuration: Configuration | None = None,
|
|
50
52
|
) -> MemoryKeyValueStoreClient:
|
|
51
53
|
configuration = configuration or Configuration.get_global_configuration()
|
|
52
|
-
client = await MemoryKeyValueStoreClient.open(id=id, name=name)
|
|
54
|
+
client = await MemoryKeyValueStoreClient.open(id=id, name=name, alias=alias)
|
|
53
55
|
await self._purge_if_needed(client, configuration)
|
|
54
56
|
return client
|
|
55
57
|
|
|
@@ -59,9 +61,10 @@ class MemoryStorageClient(StorageClient):
|
|
|
59
61
|
*,
|
|
60
62
|
id: str | None = None,
|
|
61
63
|
name: str | None = None,
|
|
64
|
+
alias: str | None = None,
|
|
62
65
|
configuration: Configuration | None = None,
|
|
63
66
|
) -> MemoryRequestQueueClient:
|
|
64
67
|
configuration = configuration or Configuration.get_global_configuration()
|
|
65
|
-
client = await MemoryRequestQueueClient.open(id=id, name=name)
|
|
68
|
+
client = await MemoryRequestQueueClient.open(id=id, name=name, alias=alias)
|
|
66
69
|
await self._purge_if_needed(client, configuration)
|
|
67
70
|
return client
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
from ._dataset_client import RedisDatasetClient
|
|
2
|
+
from ._key_value_store_client import RedisKeyValueStoreClient
|
|
3
|
+
from ._request_queue_client import RedisRequestQueueClient
|
|
4
|
+
from ._storage_client import RedisStorageClient
|
|
5
|
+
|
|
6
|
+
__all__ = ['RedisDatasetClient', 'RedisKeyValueStoreClient', 'RedisRequestQueueClient', 'RedisStorageClient']
|