atdata 0.2.2b1__py3-none-any.whl → 0.3.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- atdata/.gitignore +1 -0
- atdata/__init__.py +31 -1
- atdata/_cid.py +29 -35
- atdata/_exceptions.py +168 -0
- atdata/_helpers.py +33 -17
- atdata/_hf_api.py +109 -59
- atdata/_logging.py +70 -0
- atdata/_protocols.py +74 -132
- atdata/_schema_codec.py +38 -41
- atdata/_sources.py +57 -64
- atdata/_stub_manager.py +31 -26
- atdata/_type_utils.py +47 -7
- atdata/atmosphere/__init__.py +31 -24
- atdata/atmosphere/_types.py +11 -11
- atdata/atmosphere/client.py +11 -8
- atdata/atmosphere/lens.py +27 -30
- atdata/atmosphere/records.py +34 -39
- atdata/atmosphere/schema.py +35 -31
- atdata/atmosphere/store.py +16 -20
- atdata/cli/__init__.py +163 -168
- atdata/cli/diagnose.py +12 -8
- atdata/cli/inspect.py +69 -0
- atdata/cli/local.py +5 -2
- atdata/cli/preview.py +63 -0
- atdata/cli/schema.py +109 -0
- atdata/dataset.py +678 -533
- atdata/lens.py +85 -83
- atdata/local/__init__.py +71 -0
- atdata/local/_entry.py +157 -0
- atdata/local/_index.py +940 -0
- atdata/local/_repo_legacy.py +218 -0
- atdata/local/_s3.py +349 -0
- atdata/local/_schema.py +380 -0
- atdata/manifest/__init__.py +28 -0
- atdata/manifest/_aggregates.py +156 -0
- atdata/manifest/_builder.py +163 -0
- atdata/manifest/_fields.py +154 -0
- atdata/manifest/_manifest.py +146 -0
- atdata/manifest/_query.py +150 -0
- atdata/manifest/_writer.py +74 -0
- atdata/promote.py +20 -24
- atdata/providers/__init__.py +25 -0
- atdata/providers/_base.py +140 -0
- atdata/providers/_factory.py +69 -0
- atdata/providers/_postgres.py +214 -0
- atdata/providers/_redis.py +171 -0
- atdata/providers/_sqlite.py +191 -0
- atdata/repository.py +323 -0
- atdata/testing.py +337 -0
- {atdata-0.2.2b1.dist-info → atdata-0.3.0b1.dist-info}/METADATA +5 -1
- atdata-0.3.0b1.dist-info/RECORD +54 -0
- atdata/local.py +0 -1707
- atdata-0.2.2b1.dist-info/RECORD +0 -28
- {atdata-0.2.2b1.dist-info → atdata-0.3.0b1.dist-info}/WHEEL +0 -0
- {atdata-0.2.2b1.dist-info → atdata-0.3.0b1.dist-info}/entry_points.txt +0 -0
- {atdata-0.2.2b1.dist-info → atdata-0.3.0b1.dist-info}/licenses/LICENSE +0 -0
atdata/lens.py
CHANGED
|
@@ -14,30 +14,28 @@ Key components:
|
|
|
14
14
|
Lenses support the functional programming concept of composable, well-behaved
|
|
15
15
|
transformations that satisfy lens laws (GetPut and PutGet).
|
|
16
16
|
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
>>> ds = Dataset[FullData]("data.tar")
|
|
40
|
-
>>> ds_names = ds.as_type(NameOnly) # Uses registered lens
|
|
17
|
+
Examples:
|
|
18
|
+
>>> @packable
|
|
19
|
+
... class FullData:
|
|
20
|
+
... name: str
|
|
21
|
+
... age: int
|
|
22
|
+
... embedding: NDArray
|
|
23
|
+
...
|
|
24
|
+
>>> @packable
|
|
25
|
+
... class NameOnly:
|
|
26
|
+
... name: str
|
|
27
|
+
...
|
|
28
|
+
>>> @lens
|
|
29
|
+
... def name_view(full: FullData) -> NameOnly:
|
|
30
|
+
... return NameOnly(name=full.name)
|
|
31
|
+
...
|
|
32
|
+
>>> @name_view.putter
|
|
33
|
+
... def name_view_put(view: NameOnly, source: FullData) -> FullData:
|
|
34
|
+
... return FullData(name=view.name, age=source.age,
|
|
35
|
+
... embedding=source.embedding)
|
|
36
|
+
...
|
|
37
|
+
>>> ds = Dataset[FullData]("data.tar")
|
|
38
|
+
>>> ds_names = ds.as_type(NameOnly) # Uses registered lens
|
|
41
39
|
"""
|
|
42
40
|
|
|
43
41
|
##
|
|
@@ -56,23 +54,24 @@ from typing import (
|
|
|
56
54
|
Optional,
|
|
57
55
|
Generic,
|
|
58
56
|
#
|
|
59
|
-
TYPE_CHECKING
|
|
57
|
+
TYPE_CHECKING,
|
|
60
58
|
)
|
|
61
59
|
|
|
62
60
|
if TYPE_CHECKING:
|
|
63
61
|
from .dataset import PackableSample
|
|
64
62
|
|
|
65
63
|
from ._protocols import Packable
|
|
64
|
+
from ._exceptions import LensNotFoundError
|
|
66
65
|
|
|
67
66
|
|
|
68
67
|
##
|
|
69
68
|
# Typing helpers
|
|
70
69
|
|
|
71
|
-
DatasetType: TypeAlias = Type[
|
|
70
|
+
DatasetType: TypeAlias = Type["PackableSample"]
|
|
72
71
|
LensSignature: TypeAlias = Tuple[DatasetType, DatasetType]
|
|
73
72
|
|
|
74
|
-
S = TypeVar(
|
|
75
|
-
V = TypeVar(
|
|
73
|
+
S = TypeVar("S", bound=Packable)
|
|
74
|
+
V = TypeVar("V", bound=Packable)
|
|
76
75
|
type LensGetter[S, V] = Callable[[S], V]
|
|
77
76
|
type LensPutter[S, V] = Callable[[V, S], S]
|
|
78
77
|
|
|
@@ -80,7 +79,8 @@ type LensPutter[S, V] = Callable[[V, S], S]
|
|
|
80
79
|
##
|
|
81
80
|
# Shortcut decorators
|
|
82
81
|
|
|
83
|
-
|
|
82
|
+
|
|
83
|
+
class Lens(Generic[S, V]):
|
|
84
84
|
"""A bidirectional transformation between two sample types.
|
|
85
85
|
|
|
86
86
|
A lens provides a way to view and update data of type ``S`` (source) as if
|
|
@@ -92,22 +92,22 @@ class Lens( Generic[S, V] ):
|
|
|
92
92
|
S: The source type, must derive from ``PackableSample``.
|
|
93
93
|
V: The view type, must derive from ``PackableSample``.
|
|
94
94
|
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
... def name_lens_put(view: NameOnly, source: FullData) -> FullData:
|
|
104
|
-
... return FullData(name=view.name, age=source.age)
|
|
95
|
+
Examples:
|
|
96
|
+
>>> @lens
|
|
97
|
+
... def name_lens(full: FullData) -> NameOnly:
|
|
98
|
+
... return NameOnly(name=full.name)
|
|
99
|
+
...
|
|
100
|
+
>>> @name_lens.putter
|
|
101
|
+
... def name_lens_put(view: NameOnly, source: FullData) -> FullData:
|
|
102
|
+
... return FullData(name=view.name, age=source.age)
|
|
105
103
|
"""
|
|
106
|
-
# TODO The above has a line for "Parameters:" that should be "Type Parameters:"; this is a temporary fix for `quartodoc` auto-generation bugs.
|
|
107
104
|
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
105
|
+
# Note: The docstring uses "Parameters:" for type parameters as a workaround
|
|
106
|
+
# for quartodoc not supporting "Type Parameters:" sections.
|
|
107
|
+
|
|
108
|
+
def __init__(
|
|
109
|
+
self, get: LensGetter[S, V], put: Optional[LensPutter[S, V]] = None
|
|
110
|
+
) -> None:
|
|
111
111
|
"""Initialize a lens with a getter and optional putter function.
|
|
112
112
|
|
|
113
113
|
Args:
|
|
@@ -126,8 +126,8 @@ class Lens( Generic[S, V] ):
|
|
|
126
126
|
|
|
127
127
|
# Check argument validity
|
|
128
128
|
|
|
129
|
-
sig = inspect.signature(
|
|
130
|
-
input_types = list(
|
|
129
|
+
sig = inspect.signature(get)
|
|
130
|
+
input_types = list(sig.parameters.values())
|
|
131
131
|
if len(input_types) != 1:
|
|
132
132
|
raise ValueError(
|
|
133
133
|
f"Lens getter must have exactly one parameter, got {len(input_types)}: "
|
|
@@ -135,7 +135,7 @@ class Lens( Generic[S, V] ):
|
|
|
135
135
|
)
|
|
136
136
|
|
|
137
137
|
# Update function details for this object as returned by annotation
|
|
138
|
-
functools.update_wrapper(
|
|
138
|
+
functools.update_wrapper(self, get)
|
|
139
139
|
|
|
140
140
|
self.source_type: Type[Packable] = input_types[0].annotation
|
|
141
141
|
self.view_type: Type[Packable] = sig.return_annotation
|
|
@@ -146,14 +146,15 @@ class Lens( Generic[S, V] ):
|
|
|
146
146
|
# Determine and store the putter
|
|
147
147
|
if put is None:
|
|
148
148
|
# Trivial putter does not update the source
|
|
149
|
-
def _trivial_put(
|
|
149
|
+
def _trivial_put(v: V, s: S) -> S:
|
|
150
150
|
return s
|
|
151
|
+
|
|
151
152
|
put = _trivial_put
|
|
152
153
|
self._putter = put
|
|
153
|
-
|
|
154
|
+
|
|
154
155
|
#
|
|
155
156
|
|
|
156
|
-
def putter(
|
|
157
|
+
def putter(self, put: LensPutter[S, V]) -> LensPutter[S, V]:
|
|
157
158
|
"""Decorator to register a putter function for this lens.
|
|
158
159
|
|
|
159
160
|
Args:
|
|
@@ -163,20 +164,18 @@ class Lens( Generic[S, V] ):
|
|
|
163
164
|
Returns:
|
|
164
165
|
The putter function, allowing this to be used as a decorator.
|
|
165
166
|
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
... def my_lens_put(view: ViewType, source: SourceType) -> SourceType:
|
|
171
|
-
... return SourceType(...)
|
|
167
|
+
Examples:
|
|
168
|
+
>>> @my_lens.putter
|
|
169
|
+
... def my_lens_put(view: ViewType, source: SourceType) -> SourceType:
|
|
170
|
+
... return SourceType(field=view.field, other=source.other)
|
|
172
171
|
"""
|
|
173
172
|
##
|
|
174
173
|
self._putter = put
|
|
175
174
|
return put
|
|
176
|
-
|
|
175
|
+
|
|
177
176
|
# Methods to actually execute transformations
|
|
178
177
|
|
|
179
|
-
def put(
|
|
178
|
+
def put(self, v: V, s: S) -> S:
|
|
180
179
|
"""Update the source based on a modified view.
|
|
181
180
|
|
|
182
181
|
Args:
|
|
@@ -186,9 +185,9 @@ class Lens( Generic[S, V] ):
|
|
|
186
185
|
Returns:
|
|
187
186
|
An updated source of type ``S`` that reflects changes from the view.
|
|
188
187
|
"""
|
|
189
|
-
return self._putter(
|
|
188
|
+
return self._putter(v, s)
|
|
190
189
|
|
|
191
|
-
def get(
|
|
190
|
+
def get(self, s: S) -> V:
|
|
192
191
|
"""Transform the source into the view type.
|
|
193
192
|
|
|
194
193
|
Args:
|
|
@@ -197,14 +196,14 @@ class Lens( Generic[S, V] ):
|
|
|
197
196
|
Returns:
|
|
198
197
|
A view of the source as type ``V``.
|
|
199
198
|
"""
|
|
200
|
-
return self(
|
|
199
|
+
return self(s)
|
|
201
200
|
|
|
202
|
-
def __call__(
|
|
201
|
+
def __call__(self, s: S) -> V:
|
|
203
202
|
"""Apply the lens transformation (same as ``get()``)."""
|
|
204
|
-
return self._getter(
|
|
203
|
+
return self._getter(s)
|
|
205
204
|
|
|
206
205
|
|
|
207
|
-
def lens(
|
|
206
|
+
def lens(f: LensGetter[S, V]) -> Lens[S, V]:
|
|
208
207
|
"""Decorator to create and register a lens transformation.
|
|
209
208
|
|
|
210
209
|
This decorator converts a getter function into a ``Lens`` object and
|
|
@@ -218,19 +217,17 @@ def lens( f: LensGetter[S, V] ) -> Lens[S, V]:
|
|
|
218
217
|
A ``Lens[S, V]`` object that can be called to apply the transformation
|
|
219
218
|
or decorated with ``@lens_name.putter`` to add a putter function.
|
|
220
219
|
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
... def extract_name_put(view: NameOnly, source: FullData) -> FullData:
|
|
230
|
-
... return FullData(name=view.name, age=source.age)
|
|
220
|
+
Examples:
|
|
221
|
+
>>> @lens
|
|
222
|
+
... def extract_name(full: FullData) -> NameOnly:
|
|
223
|
+
... return NameOnly(name=full.name)
|
|
224
|
+
...
|
|
225
|
+
>>> @extract_name.putter
|
|
226
|
+
... def extract_name_put(view: NameOnly, source: FullData) -> FullData:
|
|
227
|
+
... return FullData(name=view.name, age=source.age)
|
|
231
228
|
"""
|
|
232
|
-
ret = Lens[S, V](
|
|
233
|
-
_network.register(
|
|
229
|
+
ret = Lens[S, V](f)
|
|
230
|
+
_network.register(ret)
|
|
234
231
|
return ret
|
|
235
232
|
|
|
236
233
|
|
|
@@ -259,11 +256,11 @@ class LensNetwork:
|
|
|
259
256
|
|
|
260
257
|
def __init__(self):
|
|
261
258
|
"""Initialize the lens registry (only on first instantiation)."""
|
|
262
|
-
if not hasattr(self,
|
|
259
|
+
if not hasattr(self, "_initialized"): # Check if already initialized
|
|
263
260
|
self._registry: Dict[LensSignature, Lens] = dict()
|
|
264
261
|
self._initialized = True
|
|
265
|
-
|
|
266
|
-
def register(
|
|
262
|
+
|
|
263
|
+
def register(self, _lens: Lens):
|
|
267
264
|
"""Register a lens as the canonical transformation between two types.
|
|
268
265
|
|
|
269
266
|
Args:
|
|
@@ -275,8 +272,8 @@ class LensNetwork:
|
|
|
275
272
|
overwritten.
|
|
276
273
|
"""
|
|
277
274
|
self._registry[_lens.source_type, _lens.view_type] = _lens
|
|
278
|
-
|
|
279
|
-
def transform(
|
|
275
|
+
|
|
276
|
+
def transform(self, source: DatasetType, view: DatasetType) -> Lens:
|
|
280
277
|
"""Look up the lens transformation between two sample types.
|
|
281
278
|
|
|
282
279
|
Args:
|
|
@@ -293,12 +290,17 @@ class LensNetwork:
|
|
|
293
290
|
Currently only supports direct transformations. Compositional
|
|
294
291
|
transformations (chaining multiple lenses) are not yet implemented.
|
|
295
292
|
"""
|
|
296
|
-
ret = self._registry.get(
|
|
293
|
+
ret = self._registry.get((source, view), None)
|
|
297
294
|
if ret is None:
|
|
298
|
-
|
|
295
|
+
available_targets = [
|
|
296
|
+
(sig[1], lens_obj.__name__)
|
|
297
|
+
for sig, lens_obj in self._registry.items()
|
|
298
|
+
if sig[0] is source and hasattr(lens_obj, "__name__")
|
|
299
|
+
]
|
|
300
|
+
raise LensNotFoundError(source, view, available_targets)
|
|
299
301
|
|
|
300
302
|
return ret
|
|
301
303
|
|
|
302
304
|
|
|
303
305
|
# Global singleton registry instance
|
|
304
|
-
_network = LensNetwork()
|
|
306
|
+
_network = LensNetwork()
|
atdata/local/__init__.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
"""Local storage backend for atdata datasets.
|
|
2
|
+
|
|
3
|
+
Key classes:
|
|
4
|
+
|
|
5
|
+
- ``Index``: Unified index with pluggable providers (SQLite default),
|
|
6
|
+
named repositories, and optional atmosphere backend.
|
|
7
|
+
- ``LocalDatasetEntry``: Index entry with ATProto-compatible CIDs.
|
|
8
|
+
- ``S3DataStore``: S3-compatible shard storage.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from atdata.local._entry import (
|
|
12
|
+
LocalDatasetEntry,
|
|
13
|
+
BasicIndexEntry,
|
|
14
|
+
REDIS_KEY_DATASET_ENTRY,
|
|
15
|
+
REDIS_KEY_SCHEMA,
|
|
16
|
+
)
|
|
17
|
+
from atdata.local._schema import (
|
|
18
|
+
SchemaNamespace,
|
|
19
|
+
SchemaFieldType,
|
|
20
|
+
SchemaField,
|
|
21
|
+
LocalSchemaRecord,
|
|
22
|
+
_ATDATA_URI_PREFIX,
|
|
23
|
+
_LEGACY_URI_PREFIX,
|
|
24
|
+
_kind_str_for_sample_type,
|
|
25
|
+
_schema_ref_from_type,
|
|
26
|
+
_make_schema_ref,
|
|
27
|
+
_parse_schema_ref,
|
|
28
|
+
_increment_patch,
|
|
29
|
+
_python_type_to_field_type,
|
|
30
|
+
_build_schema_record,
|
|
31
|
+
)
|
|
32
|
+
from atdata.local._index import Index
|
|
33
|
+
from atdata.local._s3 import (
|
|
34
|
+
S3DataStore,
|
|
35
|
+
_s3_env,
|
|
36
|
+
_s3_from_credentials,
|
|
37
|
+
_create_s3_write_callbacks,
|
|
38
|
+
)
|
|
39
|
+
from atdata.local._repo_legacy import Repo
|
|
40
|
+
|
|
41
|
+
# Re-export third-party types that were previously importable from the
|
|
42
|
+
# monolithic local.py (tests reference atdata.local.S3FileSystem, etc.)
|
|
43
|
+
from s3fs import S3FileSystem # noqa: F401 — re-exported for backward compat
|
|
44
|
+
|
|
45
|
+
__all__ = [
|
|
46
|
+
# Public API
|
|
47
|
+
"Index",
|
|
48
|
+
"LocalDatasetEntry",
|
|
49
|
+
"BasicIndexEntry",
|
|
50
|
+
"S3DataStore",
|
|
51
|
+
"Repo",
|
|
52
|
+
"SchemaNamespace",
|
|
53
|
+
"SchemaFieldType",
|
|
54
|
+
"SchemaField",
|
|
55
|
+
"LocalSchemaRecord",
|
|
56
|
+
"REDIS_KEY_DATASET_ENTRY",
|
|
57
|
+
"REDIS_KEY_SCHEMA",
|
|
58
|
+
# Internal helpers (re-exported for backward compatibility)
|
|
59
|
+
"_ATDATA_URI_PREFIX",
|
|
60
|
+
"_LEGACY_URI_PREFIX",
|
|
61
|
+
"_kind_str_for_sample_type",
|
|
62
|
+
"_schema_ref_from_type",
|
|
63
|
+
"_make_schema_ref",
|
|
64
|
+
"_parse_schema_ref",
|
|
65
|
+
"_increment_patch",
|
|
66
|
+
"_python_type_to_field_type",
|
|
67
|
+
"_build_schema_record",
|
|
68
|
+
"_s3_env",
|
|
69
|
+
"_s3_from_credentials",
|
|
70
|
+
"_create_s3_write_callbacks",
|
|
71
|
+
]
|
atdata/local/_entry.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
"""Dataset entry model and Redis key constants."""
|
|
2
|
+
|
|
3
|
+
from atdata._cid import generate_cid
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from typing import Any, cast
|
|
7
|
+
|
|
8
|
+
import msgpack
|
|
9
|
+
from redis import Redis
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# Redis key prefixes for index entries and schemas
|
|
13
|
+
REDIS_KEY_DATASET_ENTRY = "LocalDatasetEntry"
|
|
14
|
+
REDIS_KEY_SCHEMA = "LocalSchema"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
|
|
18
|
+
class LocalDatasetEntry:
|
|
19
|
+
"""Index entry for a dataset stored in the local repository.
|
|
20
|
+
|
|
21
|
+
Implements the IndexEntry protocol for compatibility with AbstractIndex.
|
|
22
|
+
Uses dual identity: a content-addressable CID (ATProto-compatible) and
|
|
23
|
+
a human-readable name.
|
|
24
|
+
|
|
25
|
+
The CID is generated from the entry's content (schema_ref + data_urls),
|
|
26
|
+
ensuring the same data produces the same CID whether stored locally or
|
|
27
|
+
in the atmosphere. This enables seamless promotion from local to ATProto.
|
|
28
|
+
|
|
29
|
+
Attributes:
|
|
30
|
+
name: Human-readable name for this dataset.
|
|
31
|
+
schema_ref: Reference to the schema for this dataset.
|
|
32
|
+
data_urls: WebDataset URLs for the data.
|
|
33
|
+
metadata: Arbitrary metadata dictionary, or None if not set.
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
##
|
|
37
|
+
|
|
38
|
+
name: str
|
|
39
|
+
"""Human-readable name for this dataset."""
|
|
40
|
+
|
|
41
|
+
schema_ref: str
|
|
42
|
+
"""Reference to the schema for this dataset."""
|
|
43
|
+
|
|
44
|
+
data_urls: list[str]
|
|
45
|
+
"""WebDataset URLs for the data."""
|
|
46
|
+
|
|
47
|
+
metadata: dict | None = None
|
|
48
|
+
"""Arbitrary metadata dictionary, or None if not set."""
|
|
49
|
+
|
|
50
|
+
_cid: str | None = field(default=None, repr=False)
|
|
51
|
+
"""Content identifier (ATProto-compatible CID). Generated from content if not provided."""
|
|
52
|
+
|
|
53
|
+
# Legacy field for backwards compatibility during migration
|
|
54
|
+
_legacy_uuid: str | None = field(default=None, repr=False)
|
|
55
|
+
"""Legacy UUID for backwards compatibility with existing Redis entries."""
|
|
56
|
+
|
|
57
|
+
def __post_init__(self):
|
|
58
|
+
"""Generate CID from content if not provided."""
|
|
59
|
+
if self._cid is None:
|
|
60
|
+
self._cid = self._generate_cid()
|
|
61
|
+
|
|
62
|
+
def _generate_cid(self) -> str:
|
|
63
|
+
"""Generate ATProto-compatible CID from entry content."""
|
|
64
|
+
# CID is based on schema_ref and data_urls - the identity of the dataset
|
|
65
|
+
content = {
|
|
66
|
+
"schema_ref": self.schema_ref,
|
|
67
|
+
"data_urls": self.data_urls,
|
|
68
|
+
}
|
|
69
|
+
return generate_cid(content)
|
|
70
|
+
|
|
71
|
+
@property
|
|
72
|
+
def cid(self) -> str:
|
|
73
|
+
"""Content identifier (ATProto-compatible CID)."""
|
|
74
|
+
if self._cid is None:
|
|
75
|
+
raise RuntimeError(
|
|
76
|
+
"CID not initialized; this should not happen after __post_init__"
|
|
77
|
+
)
|
|
78
|
+
return self._cid
|
|
79
|
+
|
|
80
|
+
# Legacy compatibility
|
|
81
|
+
|
|
82
|
+
@property
|
|
83
|
+
def wds_url(self) -> str:
|
|
84
|
+
"""Legacy property: returns first data URL for backwards compatibility."""
|
|
85
|
+
return self.data_urls[0] if self.data_urls else ""
|
|
86
|
+
|
|
87
|
+
@property
|
|
88
|
+
def sample_kind(self) -> str:
|
|
89
|
+
"""Legacy property: returns schema_ref for backwards compatibility."""
|
|
90
|
+
return self.schema_ref
|
|
91
|
+
|
|
92
|
+
def write_to(self, redis: Redis):
|
|
93
|
+
"""Persist this index entry to Redis.
|
|
94
|
+
|
|
95
|
+
Stores the entry as a Redis hash with key '{REDIS_KEY_DATASET_ENTRY}:{cid}'.
|
|
96
|
+
|
|
97
|
+
Args:
|
|
98
|
+
redis: Redis connection to write to.
|
|
99
|
+
"""
|
|
100
|
+
save_key = f"{REDIS_KEY_DATASET_ENTRY}:{self.cid}"
|
|
101
|
+
data: dict[str, Any] = {
|
|
102
|
+
"name": self.name,
|
|
103
|
+
"schema_ref": self.schema_ref,
|
|
104
|
+
"data_urls": msgpack.packb(self.data_urls), # Serialize list
|
|
105
|
+
"cid": self.cid,
|
|
106
|
+
}
|
|
107
|
+
if self.metadata is not None:
|
|
108
|
+
data["metadata"] = msgpack.packb(self.metadata)
|
|
109
|
+
if self._legacy_uuid is not None:
|
|
110
|
+
data["legacy_uuid"] = self._legacy_uuid
|
|
111
|
+
|
|
112
|
+
redis.hset(save_key, mapping=data) # type: ignore[arg-type]
|
|
113
|
+
|
|
114
|
+
@classmethod
|
|
115
|
+
def from_redis(cls, redis: Redis, cid: str) -> "LocalDatasetEntry":
|
|
116
|
+
"""Load an entry from Redis by CID.
|
|
117
|
+
|
|
118
|
+
Args:
|
|
119
|
+
redis: Redis connection to read from.
|
|
120
|
+
cid: Content identifier of the entry to load.
|
|
121
|
+
|
|
122
|
+
Returns:
|
|
123
|
+
LocalDatasetEntry loaded from Redis.
|
|
124
|
+
|
|
125
|
+
Raises:
|
|
126
|
+
KeyError: If entry not found.
|
|
127
|
+
"""
|
|
128
|
+
save_key = f"{REDIS_KEY_DATASET_ENTRY}:{cid}"
|
|
129
|
+
raw_data = redis.hgetall(save_key)
|
|
130
|
+
if not raw_data:
|
|
131
|
+
raise KeyError(f"{REDIS_KEY_DATASET_ENTRY} not found: {cid}")
|
|
132
|
+
|
|
133
|
+
# Decode string fields, keep binary fields as bytes for msgpack
|
|
134
|
+
raw_data_typed = cast(dict[bytes, bytes], raw_data)
|
|
135
|
+
name = raw_data_typed[b"name"].decode("utf-8")
|
|
136
|
+
schema_ref = raw_data_typed[b"schema_ref"].decode("utf-8")
|
|
137
|
+
cid_value = raw_data_typed.get(b"cid", b"").decode("utf-8") or None
|
|
138
|
+
legacy_uuid = raw_data_typed.get(b"legacy_uuid", b"").decode("utf-8") or None
|
|
139
|
+
|
|
140
|
+
# Deserialize msgpack fields (stored as raw bytes)
|
|
141
|
+
data_urls = msgpack.unpackb(raw_data_typed[b"data_urls"])
|
|
142
|
+
metadata = None
|
|
143
|
+
if b"metadata" in raw_data_typed:
|
|
144
|
+
metadata = msgpack.unpackb(raw_data_typed[b"metadata"])
|
|
145
|
+
|
|
146
|
+
return cls(
|
|
147
|
+
name=name,
|
|
148
|
+
schema_ref=schema_ref,
|
|
149
|
+
data_urls=data_urls,
|
|
150
|
+
metadata=metadata,
|
|
151
|
+
_cid=cid_value,
|
|
152
|
+
_legacy_uuid=legacy_uuid,
|
|
153
|
+
)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
# Backwards compatibility alias
|
|
157
|
+
BasicIndexEntry = LocalDatasetEntry
|