crawlee 0.6.13b43__py3-none-any.whl → 1.1.2b4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this version of crawlee has been flagged as a potentially problematic release.
- crawlee/_request.py +32 -21
- crawlee/_service_locator.py +4 -4
- crawlee/_types.py +87 -25
- crawlee/_utils/file.py +7 -0
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +15 -0
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +1 -1
- crawlee/_utils/time.py +41 -1
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +4 -1
- crawlee/browsers/_playwright_browser_controller.py +21 -15
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +3 -1
- crawlee/crawlers/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +47 -11
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +38 -14
- crawlee/crawlers/_basic/_basic_crawler.py +139 -96
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_playwright/_playwright_crawler.py +52 -10
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/events/_event_manager.py +3 -1
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +12 -0
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +25 -10
- crawlee/otel/crawler_instrumentor.py +3 -3
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_sitemap_request_loader.py +22 -4
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +32 -1
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +16 -0
- crawlee/storage_clients/_base/_storage_client.py +5 -4
- crawlee/storage_clients/_file_system/_dataset_client.py +6 -7
- crawlee/storage_clients/_file_system/_key_value_store_client.py +7 -8
- crawlee/storage_clients/_file_system/_request_queue_client.py +31 -15
- crawlee/storage_clients/_file_system/_storage_client.py +2 -2
- crawlee/storage_clients/_memory/_dataset_client.py +4 -5
- crawlee/storage_clients/_memory/_key_value_store_client.py +4 -5
- crawlee/storage_clients/_memory/_request_queue_client.py +4 -5
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +295 -0
- crawlee/storage_clients/_redis/_dataset_client.py +325 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
- crawlee/storage_clients/_redis/_storage_client.py +146 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +268 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
- crawlee/storage_clients/_sql/_storage_client.py +282 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +10 -10
- crawlee/storages/_base.py +3 -1
- crawlee/storages/_dataset.py +5 -3
- crawlee/storages/_key_value_store.py +11 -6
- crawlee/storages/_request_queue.py +5 -3
- crawlee/storages/_storage_instance_manager.py +54 -68
- crawlee/storages/_utils.py +11 -0
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/METADATA +17 -5
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/RECORD +80 -58
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/WHEEL +1 -1
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/entry_points.txt +0 -0
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/licenses/LICENSE +0 -0
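
The headline change in this release is the addition of two new storage backends: a SQL-backed client (crawlee/storage_clients/_sql/) and a Redis-backed client (crawlee/storage_clients/_redis/), alongside the existing file-system and memory clients. The sketch below shows roughly how the SQL backend might be wired into a crawler. The `SqlStorageClient` name is taken from the diff's import statements, but its constructor arguments (`connection_string`), its export from `crawlee.storage_clients`, and the `service_locator.set_storage_client` registration are assumptions not confirmed by this diff.

# Sketch: opting into the new SQL storage backend (assumed API, not verified from this diff).
import asyncio

from crawlee import service_locator
from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.storage_clients import SqlStorageClient  # export assumed; added in this release per the file list


async def main() -> None:
    # connection_string is an assumed parameter; an async SQLAlchemy URL is implied
    # by the SQLAlchemy-based models added in _db_models.py.
    storage_client = SqlStorageClient(connection_string='sqlite+aiosqlite:///crawlee.db')
    service_locator.set_storage_client(storage_client)

    crawler = ParselCrawler()

    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        # Items pushed here would land in the SQL-backed dataset tables.
        await context.push_data({'url': context.request.url, 'title': context.selector.css('title::text').get()})

    await crawler.run(['https://example.com'])


if __name__ == '__main__':
    asyncio.run(main())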
crawlee/storage_clients/_sql/_db_models.py
@@ -0,0 +1,268 @@
+from __future__ import annotations
+
+from datetime import datetime, timezone
+from typing import TYPE_CHECKING, Any
+
+from sqlalchemy import JSON, BigInteger, Boolean, ForeignKey, Index, Integer, LargeBinary, String, text
+from sqlalchemy.dialects.postgresql import JSONB
+from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship, synonym
+from sqlalchemy.types import DateTime, TypeDecorator
+from typing_extensions import override
+
+if TYPE_CHECKING:
+    from sqlalchemy.engine import Dialect
+    from sqlalchemy.types import TypeEngine
+
+
+class AwareDateTime(TypeDecorator):
+    """Custom SQLAlchemy type for timezone-aware datetime handling.
+
+    Ensures all datetime values are timezone-aware by adding UTC timezone to
+    naive datetime values from databases that don't store timezone information.
+    """
+
+    impl = DateTime(timezone=True)
+    cache_ok = True
+
+    @override
+    def process_result_value(self, value: datetime | None, dialect: Dialect) -> datetime | None:
+        """Add UTC timezone to naive datetime values."""
+        if value is not None and value.tzinfo is None:
+            return value.replace(tzinfo=timezone.utc)
+        return value
+
+
+class JsonField(TypeDecorator):
+    """Uses JSONB for PostgreSQL and JSON for other databases."""
+
+    impl = JSON
+    cache_ok = True
+
+    def load_dialect_impl(self, dialect: Dialect) -> TypeEngine[JSON | JSONB]:
+        """Load the appropriate dialect implementation for the JSON type."""
+        if dialect.name == 'postgresql':
+            return dialect.type_descriptor(JSONB())
+        return dialect.type_descriptor(JSON())
+
+
+class Base(DeclarativeBase):
+    """Base class for all database models for correct type annotations."""
+
+
+class StorageMetadataDb:
+    """Base database model for storage metadata."""
+
+    internal_name: Mapped[str] = mapped_column(String, nullable=False, index=True, unique=True)
+    """Internal unique name for a storage instance based on a name or alias."""
+
+    name: Mapped[str | None] = mapped_column(String, nullable=True, unique=True)
+    """Human-readable name. None becomes 'default' in database to enforce uniqueness."""
+
+    accessed_at: Mapped[datetime] = mapped_column(AwareDateTime, nullable=False)
+    """Last access datetime for usage tracking."""
+
+    created_at: Mapped[datetime] = mapped_column(AwareDateTime, nullable=False)
+    """Creation datetime."""
+
+    modified_at: Mapped[datetime] = mapped_column(AwareDateTime, nullable=False)
+    """Last modification datetime."""
+
+
+class DatasetMetadataDb(StorageMetadataDb, Base):
+    """Metadata table for datasets."""
+
+    __tablename__ = 'datasets'
+
+    dataset_id: Mapped[str] = mapped_column(String(20), nullable=False, primary_key=True)
+    """Unique identifier for the dataset."""
+
+    item_count: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
+    """Number of items in the dataset."""
+
+    # Relationship to dataset items with cascade deletion
+    items: Mapped[list[DatasetItemDb]] = relationship(
+        back_populates='dataset', cascade='all, delete-orphan', lazy='noload'
+    )
+
+    id = synonym('dataset_id')
+    """Alias for dataset_id to match Pydantic expectations."""
+
+
+class RequestQueueMetadataDb(StorageMetadataDb, Base):
+    """Metadata table for request queues."""
+
+    __tablename__ = 'request_queues'
+
+    request_queue_id: Mapped[str] = mapped_column(String(20), nullable=False, primary_key=True)
+    """Unique identifier for the request queue."""
+
+    had_multiple_clients: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
+    """Flag indicating if multiple clients have accessed this queue."""
+
+    handled_request_count: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
+    """Number of requests processed."""
+
+    pending_request_count: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
+    """Number of requests waiting to be processed."""
+
+    total_request_count: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
+    """Total number of requests ever added to this queue."""
+
+    # Relationship to queue requests with cascade deletion
+    requests: Mapped[list[RequestDb]] = relationship(
+        back_populates='queue', cascade='all, delete-orphan', lazy='noload'
+    )
+    # Relationship to queue state
+    state: Mapped[RequestQueueStateDb] = relationship(
+        back_populates='queue', cascade='all, delete-orphan', lazy='noload'
+    )
+
+    id = synonym('request_queue_id')
+    """Alias for request_queue_id to match Pydantic expectations."""
+
+
+class KeyValueStoreMetadataDb(StorageMetadataDb, Base):
+    """Metadata table for key-value stores."""
+
+    __tablename__ = 'key_value_stores'
+
+    key_value_store_id: Mapped[str] = mapped_column(String(20), nullable=False, primary_key=True)
+    """Unique identifier for the key-value store."""
+
+    # Relationship to store records with cascade deletion
+    records: Mapped[list[KeyValueStoreRecordDb]] = relationship(
+        back_populates='kvs', cascade='all, delete-orphan', lazy='noload'
+    )
+
+    id = synonym('key_value_store_id')
+    """Alias for key_value_store_id to match Pydantic expectations."""
+
+
+class KeyValueStoreRecordDb(Base):
+    """Records table for key-value stores."""
+
+    __tablename__ = 'key_value_store_records'
+
+    key_value_store_id: Mapped[str] = mapped_column(
+        String(20),
+        ForeignKey('key_value_stores.key_value_store_id', ondelete='CASCADE'),
+        primary_key=True,
+        index=True,
+        nullable=False,
+    )
+    """Foreign key to metadata key-value store record."""
+
+    key: Mapped[str] = mapped_column(String(255), primary_key=True)
+    """The key part of the key-value pair."""
+
+    value: Mapped[bytes] = mapped_column(LargeBinary, nullable=False)
+    """Value stored as binary data to support any content type."""
+
+    content_type: Mapped[str] = mapped_column(String(50), nullable=False)
+    """MIME type for proper value deserialization."""
+
+    size: Mapped[int | None] = mapped_column(Integer, nullable=False, default=0)
+    """Size of stored value in bytes."""
+
+    # Relationship back to parent store
+    kvs: Mapped[KeyValueStoreMetadataDb] = relationship(back_populates='records')
+
+    storage_id = synonym('key_value_store_id')
+    """Alias for key_value_store_id to match SqlClientMixin expectations."""
+
+
+class DatasetItemDb(Base):
+    """Items table for datasets."""
+
+    __tablename__ = 'dataset_records'
+
+    item_id: Mapped[int] = mapped_column(Integer, primary_key=True)
+    """Auto-increment primary key preserving insertion order."""
+
+    dataset_id: Mapped[str] = mapped_column(
+        String(20),
+        ForeignKey('datasets.dataset_id', ondelete='CASCADE'),
+        index=True,
+    )
+    """Foreign key to metadata dataset record."""
+
+    data: Mapped[list[dict[str, Any]] | dict[str, Any]] = mapped_column(JsonField, nullable=False)
+    """JSON serializable item data."""
+
+    # Relationship back to parent dataset
+    dataset: Mapped[DatasetMetadataDb] = relationship(back_populates='items')
+
+    storage_id = synonym('dataset_id')
+    """Alias for dataset_id to match SqlClientMixin expectations."""
+
+
+class RequestDb(Base):
+    """Requests table for request queues."""
+
+    __tablename__ = 'request_queue_records'
+    __table_args__ = (
+        Index(
+            'idx_fetch_available',
+            'request_queue_id',
+            'is_handled',
+            'sequence_number',
+            postgresql_where=text('is_handled is false'),
+        ),
+    )
+
+    request_id: Mapped[int] = mapped_column(BigInteger, primary_key=True)
+    """Unique identifier for the request representing the unique_key."""
+
+    request_queue_id: Mapped[str] = mapped_column(
+        String(20), ForeignKey('request_queues.request_queue_id', ondelete='CASCADE'), primary_key=True
+    )
+    """Foreign key to metadata request queue record."""
+
+    data: Mapped[str] = mapped_column(String, nullable=False)
+    """JSON-serialized Request object."""
+
+    sequence_number: Mapped[int] = mapped_column(Integer, nullable=False)
+    """Ordering sequence: negative for forefront, positive for regular."""
+
+    is_handled: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
+    """Processing status flag."""
+
+    time_blocked_until: Mapped[datetime | None] = mapped_column(AwareDateTime, nullable=True)
+    """Timestamp until which this request is considered blocked for processing by other clients."""
+
+    client_key: Mapped[str | None] = mapped_column(String(32), nullable=True)
+    """Identifier of the client that has currently locked this request for processing."""
+
+    # Relationship back to metadata table
+    queue: Mapped[RequestQueueMetadataDb] = relationship(back_populates='requests')
+
+    storage_id = synonym('request_queue_id')
+    """Alias for request_queue_id to match SqlClientMixin expectations."""
+
+
+class RequestQueueStateDb(Base):
+    """State table for request queues."""
+
+    __tablename__ = 'request_queue_state'
+
+    request_queue_id: Mapped[str] = mapped_column(
+        String(20), ForeignKey('request_queues.request_queue_id', ondelete='CASCADE'), primary_key=True
+    )
+    """Foreign key to metadata request queue record."""
+
+    sequence_counter: Mapped[int] = mapped_column(Integer, nullable=False, default=1)
+    """Counter for regular request ordering (positive)."""
+
+    forefront_sequence_counter: Mapped[int] = mapped_column(Integer, nullable=False, default=-1)
+    """Counter for forefront request ordering (negative)."""
+
+    # Relationship back to metadata table
+    queue: Mapped[RequestQueueMetadataDb] = relationship(back_populates='state')
+
+
+class VersionDb(Base):
+    """Table for storing the database schema version."""
+
+    __tablename__ = 'version'
+
+    version: Mapped[str] = mapped_column(String(10), nullable=False, primary_key=True)
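
Worth noting from the models above: ordering in the SQL request queue is encoded entirely in `sequence_number` (forefront requests take negative numbers counted down from -1, regular requests positive numbers counted up from 1, per `RequestQueueStateDb`), and the partial index `idx_fetch_available` covers `(request_queue_id, is_handled, sequence_number)` for unhandled rows. A fetch query along the lines below would therefore return forefront requests first while staying on that index; this is only an illustrative sketch, since the actual fetch logic lives in _request_queue_client.py, which is not shown in this view.

# Illustrative only: how the sequence_number scheme yields FIFO order with forefront priority.
# The real implementation is in crawlee/storage_clients/_sql/_request_queue_client.py (not shown here),
# and the imported module is internal to crawlee.
from sqlalchemy import select

from crawlee.storage_clients._sql._db_models import RequestDb


def build_fetch_stmt(queue_id: str, batch_size: int = 25):
    # Unhandled requests, smallest sequence_number first: negative (forefront) values
    # sort ahead of positive (regular) ones, so forefront requests are fetched first.
    return (
        select(RequestDb)
        .where(RequestDb.request_queue_id == queue_id, RequestDb.is_handled.is_(False))
        .order_by(RequestDb.sequence_number)
        .limit(batch_size)
    )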
crawlee/storage_clients/_sql/_key_value_store_client.py
@@ -0,0 +1,300 @@
+from __future__ import annotations
+
+import json
+from logging import getLogger
+from typing import TYPE_CHECKING, Any, cast
+
+from sqlalchemy import CursorResult, delete, select
+from typing_extensions import Self, override
+
+from crawlee._utils.file import infer_mime_type
+from crawlee.storage_clients._base import KeyValueStoreClient
+from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata
+
+from ._client_mixin import MetadataUpdateParams, SqlClientMixin
+from ._db_models import KeyValueStoreMetadataDb, KeyValueStoreRecordDb
+
+if TYPE_CHECKING:
+    from collections.abc import AsyncIterator
+
+    from ._storage_client import SqlStorageClient
+
+
+logger = getLogger(__name__)
+
+
+class SqlKeyValueStoreClient(KeyValueStoreClient, SqlClientMixin):
+    """SQL implementation of the key-value store client.
+
+    This client persists key-value data to a SQL database with transaction support and
+    concurrent access safety. Keys are mapped to rows in database tables with proper indexing
+    for efficient retrieval.
+
+    The key-value store data is stored in SQL database tables following the pattern:
+    - `key_value_stores` table: Contains store metadata (id, name, timestamps)
+    - `key_value_store_records` table: Contains individual key-value pairs with binary value storage, content type,
+      and size information
+
+    Values are serialized based on their type: JSON objects are stored as formatted JSON,
+    text values as UTF-8 encoded strings, and binary data as-is in the `LargeBinary` column.
+    The implementation automatically handles content type detection and maintains metadata
+    about each record including size and MIME type information.
+
+    All database operations are wrapped in transactions with proper error handling and rollback
+    mechanisms. The client supports atomic upsert operations and handles race conditions when
+    multiple clients access the same store using composite primary keys (key_value_store_id, key).
+    """
+
+    _DEFAULT_NAME = 'default'
+    """Default dataset name used when no name is provided."""
+
+    _METADATA_TABLE = KeyValueStoreMetadataDb
+    """SQLAlchemy model for key-value store metadata."""
+
+    _ITEM_TABLE = KeyValueStoreRecordDb
+    """SQLAlchemy model for key-value store items."""
+
+    _CLIENT_TYPE = 'Key-value store'
+    """Human-readable client type for error messages."""
+
+    def __init__(
+        self,
+        *,
+        storage_client: SqlStorageClient,
+        id: str,
+    ) -> None:
+        """Initialize a new instance.
+
+        Preferably use the `SqlKeyValueStoreClient.open` class method to create a new instance.
+        """
+        super().__init__(id=id, storage_client=storage_client)
+
+    @classmethod
+    async def open(
+        cls,
+        *,
+        id: str | None,
+        name: str | None,
+        alias: str | None,
+        storage_client: SqlStorageClient,
+    ) -> Self:
+        """Open or create a SQL key-value store client.
+
+        This method attempts to open an existing key-value store from the SQL database. If a KVS with the specified
+        ID or name exists, it loads the metadata from the database. If no existing store is found, a new one
+        is created.
+
+        Args:
+            id: The ID of the key-value store to open. If provided, searches for existing store by ID.
+            name: The name of the key-value store for named (global scope) storages.
+            alias: The alias of the key-value store for unnamed (run scope) storages.
+            storage_client: The SQL storage client used to access the database.
+
+        Returns:
+            An instance for the opened or created storage client.
+
+        Raises:
+            ValueError: If a store with the specified ID is not found, or if metadata is invalid.
+        """
+        return await cls._safely_open(
+            id=id,
+            name=name,
+            alias=alias,
+            storage_client=storage_client,
+            metadata_model=KeyValueStoreMetadata,
+            extra_metadata_fields={},
+        )
+
+    @override
+    async def get_metadata(self) -> KeyValueStoreMetadata:
+        # The database is a single place of truth
+        return await self._get_metadata(KeyValueStoreMetadata)
+
+    @override
+    async def drop(self) -> None:
+        """Delete this key-value store and all its records from the database.
+
+        This operation is irreversible. Uses CASCADE deletion to remove all related records.
+        """
+        await self._drop()
+
+    @override
+    async def purge(self) -> None:
+        """Remove all items from this key-value store while keeping the key-value store structure.
+
+        Remove all records from key_value_store_records table.
+        """
+        await self._purge(metadata_kwargs=MetadataUpdateParams(update_accessed_at=True, update_modified_at=True))
+
+    @override
+    async def set_value(self, *, key: str, value: Any, content_type: str | None = None) -> None:
+        # Special handling for None values
+        if value is None:
+            content_type = 'application/x-none'  # Special content type to identify None values
+            value_bytes = b''
+        else:
+            content_type = content_type or infer_mime_type(value)
+
+            # Serialize the value to bytes.
+            if 'application/json' in content_type:
+                value_bytes = json.dumps(value, default=str, ensure_ascii=False).encode('utf-8')
+            elif isinstance(value, str):
+                value_bytes = value.encode('utf-8')
+            elif isinstance(value, (bytes, bytearray)):
+                value_bytes = value
+            else:
+                # Fallback: attempt to convert to string and encode.
+                value_bytes = str(value).encode('utf-8')
+
+        size = len(value_bytes)
+        insert_values = {
+            'key_value_store_id': self._id,
+            'key': key,
+            'value': value_bytes,
+            'content_type': content_type,
+            'size': size,
+        }
+
+        upsert_stmt = self._build_upsert_stmt(
+            self._ITEM_TABLE,
+            insert_values=insert_values,
+            update_columns=['value', 'content_type', 'size'],
+            conflict_cols=['key_value_store_id', 'key'],
+        )
+
+        async with self.get_session(with_simple_commit=True) as session:
+            await session.execute(upsert_stmt)
+
+            await self._update_metadata(
+                session, **MetadataUpdateParams(update_accessed_at=True, update_modified_at=True)
+            )
+
+    @override
+    async def get_value(self, *, key: str) -> KeyValueStoreRecord | None:
+        # Query the record by key
+        stmt = select(self._ITEM_TABLE).where(
+            self._ITEM_TABLE.key_value_store_id == self._id, self._ITEM_TABLE.key == key
+        )
+        async with self.get_session() as session:
+            result = await session.execute(stmt)
+            record_db = result.scalar_one_or_none()
+
+            updated = await self._update_metadata(session, **MetadataUpdateParams(update_accessed_at=True))
+
+            # Commit updates to the metadata
+            if updated:
+                await session.commit()
+
+        if not record_db:
+            return None
+
+        # Deserialize the value based on content type
+        value_bytes = record_db.value
+
+        # Handle None values
+        if record_db.content_type == 'application/x-none':
+            value = None
+        # Handle JSON values
+        elif 'application/json' in record_db.content_type:
+            try:
+                value = json.loads(value_bytes.decode('utf-8'))
+            except (json.JSONDecodeError, UnicodeDecodeError):
+                logger.warning(f'Failed to decode JSON value for key "{key}"')
+                return None
+        # Handle text values
+        elif record_db.content_type.startswith('text/'):
+            try:
+                value = value_bytes.decode('utf-8')
+            except UnicodeDecodeError:
+                logger.warning(f'Failed to decode text value for key "{key}"')
+                return None
+        # Handle binary values
+        else:
+            value = value_bytes
+
+        return KeyValueStoreRecord(
+            key=record_db.key,
+            value=value,
+            content_type=record_db.content_type,
+            size=record_db.size,
+        )
+
+    @override
+    async def delete_value(self, *, key: str) -> None:
+        stmt = delete(self._ITEM_TABLE).where(
+            self._ITEM_TABLE.key_value_store_id == self._id, self._ITEM_TABLE.key == key
+        )
+        async with self.get_session(with_simple_commit=True) as session:
+            # Delete the record if it exists
+            result = await session.execute(stmt)
+            result = cast('CursorResult', result) if not isinstance(result, CursorResult) else result
+
+            # Update metadata if we actually deleted something
+            if result.rowcount > 0:
+                await self._update_metadata(
+                    session, **MetadataUpdateParams(update_accessed_at=True, update_modified_at=True)
+                )
+
+                await session.commit()
+
+    @override
+    async def iterate_keys(
+        self,
+        *,
+        exclusive_start_key: str | None = None,
+        limit: int | None = None,
+    ) -> AsyncIterator[KeyValueStoreRecordMetadata]:
+        # Build query for record metadata
+        stmt = (
+            select(self._ITEM_TABLE.key, self._ITEM_TABLE.content_type, self._ITEM_TABLE.size)
+            .where(self._ITEM_TABLE.key_value_store_id == self._id)
+            .order_by(self._ITEM_TABLE.key)
+        )
+
+        # Apply exclusive_start_key filter
+        if exclusive_start_key is not None:
+            stmt = stmt.where(self._ITEM_TABLE.key > exclusive_start_key)
+
+        # Apply limit
+        if limit is not None:
+            stmt = stmt.limit(limit)
+
+        async with self.get_session() as session:
+            result = await session.stream(stmt.execution_options(stream_results=True))
+
+            async for row in result:
+                yield KeyValueStoreRecordMetadata(
+                    key=row.key,
+                    content_type=row.content_type,
+                    size=row.size,
+                )
+
+            updated = await self._update_metadata(session, **MetadataUpdateParams(update_accessed_at=True))
+
+            # Commit updates to the metadata
+            if updated:
+                await session.commit()
+
+    @override
+    async def record_exists(self, *, key: str) -> bool:
+        stmt = select(self._ITEM_TABLE.key).where(
+            self._ITEM_TABLE.key_value_store_id == self._id, self._ITEM_TABLE.key == key
+        )
+        async with self.get_session() as session:
+            # Check if record exists
+            result = await session.execute(stmt)
+
+            updated = await self._update_metadata(session, **MetadataUpdateParams(update_accessed_at=True))
+
+            # Commit updates to the metadata
+            if updated:
+                await session.commit()
+
+            return result.scalar_one_or_none() is not None
+
+    @override
+    async def get_public_url(self, *, key: str) -> str:
+        raise NotImplementedError('Public URLs are not supported for SQL key-value stores.')
+
+    def _specific_update_metadata(self, **_kwargs: dict[str, Any]) -> dict[str, Any]:
+        return {}