atdata 0.2.2b1__py3-none-any.whl → 0.3.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. atdata/.gitignore +1 -0
  2. atdata/__init__.py +31 -1
  3. atdata/_cid.py +29 -35
  4. atdata/_exceptions.py +168 -0
  5. atdata/_helpers.py +33 -17
  6. atdata/_hf_api.py +109 -59
  7. atdata/_logging.py +70 -0
  8. atdata/_protocols.py +74 -132
  9. atdata/_schema_codec.py +38 -41
  10. atdata/_sources.py +57 -64
  11. atdata/_stub_manager.py +31 -26
  12. atdata/_type_utils.py +47 -7
  13. atdata/atmosphere/__init__.py +31 -24
  14. atdata/atmosphere/_types.py +11 -11
  15. atdata/atmosphere/client.py +11 -8
  16. atdata/atmosphere/lens.py +27 -30
  17. atdata/atmosphere/records.py +34 -39
  18. atdata/atmosphere/schema.py +35 -31
  19. atdata/atmosphere/store.py +16 -20
  20. atdata/cli/__init__.py +163 -168
  21. atdata/cli/diagnose.py +12 -8
  22. atdata/cli/inspect.py +69 -0
  23. atdata/cli/local.py +5 -2
  24. atdata/cli/preview.py +63 -0
  25. atdata/cli/schema.py +109 -0
  26. atdata/dataset.py +678 -533
  27. atdata/lens.py +85 -83
  28. atdata/local/__init__.py +71 -0
  29. atdata/local/_entry.py +157 -0
  30. atdata/local/_index.py +940 -0
  31. atdata/local/_repo_legacy.py +218 -0
  32. atdata/local/_s3.py +349 -0
  33. atdata/local/_schema.py +380 -0
  34. atdata/manifest/__init__.py +28 -0
  35. atdata/manifest/_aggregates.py +156 -0
  36. atdata/manifest/_builder.py +163 -0
  37. atdata/manifest/_fields.py +154 -0
  38. atdata/manifest/_manifest.py +146 -0
  39. atdata/manifest/_query.py +150 -0
  40. atdata/manifest/_writer.py +74 -0
  41. atdata/promote.py +20 -24
  42. atdata/providers/__init__.py +25 -0
  43. atdata/providers/_base.py +140 -0
  44. atdata/providers/_factory.py +69 -0
  45. atdata/providers/_postgres.py +214 -0
  46. atdata/providers/_redis.py +171 -0
  47. atdata/providers/_sqlite.py +191 -0
  48. atdata/repository.py +323 -0
  49. atdata/testing.py +337 -0
  50. {atdata-0.2.2b1.dist-info → atdata-0.3.0b1.dist-info}/METADATA +5 -1
  51. atdata-0.3.0b1.dist-info/RECORD +54 -0
  52. atdata/local.py +0 -1707
  53. atdata-0.2.2b1.dist-info/RECORD +0 -28
  54. {atdata-0.2.2b1.dist-info → atdata-0.3.0b1.dist-info}/WHEEL +0 -0
  55. {atdata-0.2.2b1.dist-info → atdata-0.3.0b1.dist-info}/entry_points.txt +0 -0
  56. {atdata-0.2.2b1.dist-info → atdata-0.3.0b1.dist-info}/licenses/LICENSE +0 -0
atdata/lens.py CHANGED
@@ -14,30 +14,28 @@ Key components:
14
14
  Lenses support the functional programming concept of composable, well-behaved
15
15
  transformations that satisfy lens laws (GetPut and PutGet).
16
16
 
17
- Example:
18
- ::
19
-
20
- >>> @packable
21
- ... class FullData:
22
- ... name: str
23
- ... age: int
24
- ... embedding: NDArray
25
- ...
26
- >>> @packable
27
- ... class NameOnly:
28
- ... name: str
29
- ...
30
- >>> @lens
31
- ... def name_view(full: FullData) -> NameOnly:
32
- ... return NameOnly(name=full.name)
33
- ...
34
- >>> @name_view.putter
35
- ... def name_view_put(view: NameOnly, source: FullData) -> FullData:
36
- ... return FullData(name=view.name, age=source.age,
37
- ... embedding=source.embedding)
38
- ...
39
- >>> ds = Dataset[FullData]("data.tar")
40
- >>> ds_names = ds.as_type(NameOnly) # Uses registered lens
17
+ Examples:
18
+ >>> @packable
19
+ ... class FullData:
20
+ ... name: str
21
+ ... age: int
22
+ ... embedding: NDArray
23
+ ...
24
+ >>> @packable
25
+ ... class NameOnly:
26
+ ... name: str
27
+ ...
28
+ >>> @lens
29
+ ... def name_view(full: FullData) -> NameOnly:
30
+ ... return NameOnly(name=full.name)
31
+ ...
32
+ >>> @name_view.putter
33
+ ... def name_view_put(view: NameOnly, source: FullData) -> FullData:
34
+ ... return FullData(name=view.name, age=source.age,
35
+ ... embedding=source.embedding)
36
+ ...
37
+ >>> ds = Dataset[FullData]("data.tar")
38
+ >>> ds_names = ds.as_type(NameOnly) # Uses registered lens
41
39
  """
42
40
 
43
41
  ##
@@ -56,23 +54,24 @@ from typing import (
56
54
  Optional,
57
55
  Generic,
58
56
  #
59
- TYPE_CHECKING
57
+ TYPE_CHECKING,
60
58
  )
61
59
 
62
60
  if TYPE_CHECKING:
63
61
  from .dataset import PackableSample
64
62
 
65
63
  from ._protocols import Packable
64
+ from ._exceptions import LensNotFoundError
66
65
 
67
66
 
68
67
  ##
69
68
  # Typing helpers
70
69
 
71
- DatasetType: TypeAlias = Type['PackableSample']
70
+ DatasetType: TypeAlias = Type["PackableSample"]
72
71
  LensSignature: TypeAlias = Tuple[DatasetType, DatasetType]
73
72
 
74
- S = TypeVar( 'S', bound = Packable )
75
- V = TypeVar( 'V', bound = Packable )
73
+ S = TypeVar("S", bound=Packable)
74
+ V = TypeVar("V", bound=Packable)
76
75
  type LensGetter[S, V] = Callable[[S], V]
77
76
  type LensPutter[S, V] = Callable[[V, S], S]
78
77
 
@@ -80,7 +79,8 @@ type LensPutter[S, V] = Callable[[V, S], S]
80
79
  ##
81
80
  # Shortcut decorators
82
81
 
83
- class Lens( Generic[S, V] ):
82
+
83
+ class Lens(Generic[S, V]):
84
84
  """A bidirectional transformation between two sample types.
85
85
 
86
86
  A lens provides a way to view and update data of type ``S`` (source) as if
@@ -92,22 +92,22 @@ class Lens( Generic[S, V] ):
92
92
  S: The source type, must derive from ``PackableSample``.
93
93
  V: The view type, must derive from ``PackableSample``.
94
94
 
95
- Example:
96
- ::
97
-
98
- >>> @lens
99
- ... def name_lens(full: FullData) -> NameOnly:
100
- ... return NameOnly(name=full.name)
101
- ...
102
- >>> @name_lens.putter
103
- ... def name_lens_put(view: NameOnly, source: FullData) -> FullData:
104
- ... return FullData(name=view.name, age=source.age)
95
+ Examples:
96
+ >>> @lens
97
+ ... def name_lens(full: FullData) -> NameOnly:
98
+ ... return NameOnly(name=full.name)
99
+ ...
100
+ >>> @name_lens.putter
101
+ ... def name_lens_put(view: NameOnly, source: FullData) -> FullData:
102
+ ... return FullData(name=view.name, age=source.age)
105
103
  """
106
- # TODO The above has a line for "Parameters:" that should be "Type Parameters:"; this is a temporary fix for `quartodoc` auto-generation bugs.
107
104
 
108
- def __init__( self, get: LensGetter[S, V],
109
- put: Optional[LensPutter[S, V]] = None
110
- ) -> None:
105
+ # Note: The docstring uses "Parameters:" for type parameters as a workaround
106
+ # for quartodoc not supporting "Type Parameters:" sections.
107
+
108
+ def __init__(
109
+ self, get: LensGetter[S, V], put: Optional[LensPutter[S, V]] = None
110
+ ) -> None:
111
111
  """Initialize a lens with a getter and optional putter function.
112
112
 
113
113
  Args:
@@ -126,8 +126,8 @@ class Lens( Generic[S, V] ):
126
126
 
127
127
  # Check argument validity
128
128
 
129
- sig = inspect.signature( get )
130
- input_types = list( sig.parameters.values() )
129
+ sig = inspect.signature(get)
130
+ input_types = list(sig.parameters.values())
131
131
  if len(input_types) != 1:
132
132
  raise ValueError(
133
133
  f"Lens getter must have exactly one parameter, got {len(input_types)}: "
@@ -135,7 +135,7 @@ class Lens( Generic[S, V] ):
135
135
  )
136
136
 
137
137
  # Update function details for this object as returned by annotation
138
- functools.update_wrapper( self, get )
138
+ functools.update_wrapper(self, get)
139
139
 
140
140
  self.source_type: Type[Packable] = input_types[0].annotation
141
141
  self.view_type: Type[Packable] = sig.return_annotation
@@ -146,14 +146,15 @@ class Lens( Generic[S, V] ):
146
146
  # Determine and store the putter
147
147
  if put is None:
148
148
  # Trivial putter does not update the source
149
- def _trivial_put( v: V, s: S ) -> S:
149
+ def _trivial_put(v: V, s: S) -> S:
150
150
  return s
151
+
151
152
  put = _trivial_put
152
153
  self._putter = put
153
-
154
+
154
155
  #
155
156
 
156
- def putter( self, put: LensPutter[S, V] ) -> LensPutter[S, V]:
157
+ def putter(self, put: LensPutter[S, V]) -> LensPutter[S, V]:
157
158
  """Decorator to register a putter function for this lens.
158
159
 
159
160
  Args:
@@ -163,20 +164,18 @@ class Lens( Generic[S, V] ):
163
164
  Returns:
164
165
  The putter function, allowing this to be used as a decorator.
165
166
 
166
- Example:
167
- ::
168
-
169
- >>> @my_lens.putter
170
- ... def my_lens_put(view: ViewType, source: SourceType) -> SourceType:
171
- ... return SourceType(...)
167
+ Examples:
168
+ >>> @my_lens.putter
169
+ ... def my_lens_put(view: ViewType, source: SourceType) -> SourceType:
170
+ ... return SourceType(field=view.field, other=source.other)
172
171
  """
173
172
  ##
174
173
  self._putter = put
175
174
  return put
176
-
175
+
177
176
  # Methods to actually execute transformations
178
177
 
179
- def put( self, v: V, s: S ) -> S:
178
+ def put(self, v: V, s: S) -> S:
180
179
  """Update the source based on a modified view.
181
180
 
182
181
  Args:
@@ -186,9 +185,9 @@ class Lens( Generic[S, V] ):
186
185
  Returns:
187
186
  An updated source of type ``S`` that reflects changes from the view.
188
187
  """
189
- return self._putter( v, s )
188
+ return self._putter(v, s)
190
189
 
191
- def get( self, s: S ) -> V:
190
+ def get(self, s: S) -> V:
192
191
  """Transform the source into the view type.
193
192
 
194
193
  Args:
@@ -197,14 +196,14 @@ class Lens( Generic[S, V] ):
197
196
  Returns:
198
197
  A view of the source as type ``V``.
199
198
  """
200
- return self( s )
199
+ return self(s)
201
200
 
202
- def __call__( self, s: S ) -> V:
201
+ def __call__(self, s: S) -> V:
203
202
  """Apply the lens transformation (same as ``get()``)."""
204
- return self._getter( s )
203
+ return self._getter(s)
205
204
 
206
205
 
207
- def lens( f: LensGetter[S, V] ) -> Lens[S, V]:
206
+ def lens(f: LensGetter[S, V]) -> Lens[S, V]:
208
207
  """Decorator to create and register a lens transformation.
209
208
 
210
209
  This decorator converts a getter function into a ``Lens`` object and
@@ -218,19 +217,17 @@ def lens( f: LensGetter[S, V] ) -> Lens[S, V]:
218
217
  A ``Lens[S, V]`` object that can be called to apply the transformation
219
218
  or decorated with ``@lens_name.putter`` to add a putter function.
220
219
 
221
- Example:
222
- ::
223
-
224
- >>> @lens
225
- ... def extract_name(full: FullData) -> NameOnly:
226
- ... return NameOnly(name=full.name)
227
- ...
228
- >>> @extract_name.putter
229
- ... def extract_name_put(view: NameOnly, source: FullData) -> FullData:
230
- ... return FullData(name=view.name, age=source.age)
220
+ Examples:
221
+ >>> @lens
222
+ ... def extract_name(full: FullData) -> NameOnly:
223
+ ... return NameOnly(name=full.name)
224
+ ...
225
+ >>> @extract_name.putter
226
+ ... def extract_name_put(view: NameOnly, source: FullData) -> FullData:
227
+ ... return FullData(name=view.name, age=source.age)
231
228
  """
232
- ret = Lens[S, V]( f )
233
- _network.register( ret )
229
+ ret = Lens[S, V](f)
230
+ _network.register(ret)
234
231
  return ret
235
232
 
236
233
 
@@ -259,11 +256,11 @@ class LensNetwork:
259
256
 
260
257
  def __init__(self):
261
258
  """Initialize the lens registry (only on first instantiation)."""
262
- if not hasattr(self, '_initialized'): # Check if already initialized
259
+ if not hasattr(self, "_initialized"): # Check if already initialized
263
260
  self._registry: Dict[LensSignature, Lens] = dict()
264
261
  self._initialized = True
265
-
266
- def register( self, _lens: Lens ):
262
+
263
+ def register(self, _lens: Lens):
267
264
  """Register a lens as the canonical transformation between two types.
268
265
 
269
266
  Args:
@@ -275,8 +272,8 @@ class LensNetwork:
275
272
  overwritten.
276
273
  """
277
274
  self._registry[_lens.source_type, _lens.view_type] = _lens
278
-
279
- def transform( self, source: DatasetType, view: DatasetType ) -> Lens:
275
+
276
+ def transform(self, source: DatasetType, view: DatasetType) -> Lens:
280
277
  """Look up the lens transformation between two sample types.
281
278
 
282
279
  Args:
@@ -293,12 +290,17 @@ class LensNetwork:
293
290
  Currently only supports direct transformations. Compositional
294
291
  transformations (chaining multiple lenses) are not yet implemented.
295
292
  """
296
- ret = self._registry.get( (source, view), None )
293
+ ret = self._registry.get((source, view), None)
297
294
  if ret is None:
298
- raise ValueError( f'No registered lens from source {source} to view {view}' )
295
+ available_targets = [
296
+ (sig[1], lens_obj.__name__)
297
+ for sig, lens_obj in self._registry.items()
298
+ if sig[0] is source and hasattr(lens_obj, "__name__")
299
+ ]
300
+ raise LensNotFoundError(source, view, available_targets)
299
301
 
300
302
  return ret
301
303
 
302
304
 
303
305
  # Global singleton registry instance
304
- _network = LensNetwork()
306
+ _network = LensNetwork()
@@ -0,0 +1,71 @@
1
+ """Local storage backend for atdata datasets.
2
+
3
+ Key classes:
4
+
5
+ - ``Index``: Unified index with pluggable providers (SQLite default),
6
+ named repositories, and optional atmosphere backend.
7
+ - ``LocalDatasetEntry``: Index entry with ATProto-compatible CIDs.
8
+ - ``S3DataStore``: S3-compatible shard storage.
9
+ """
10
+
11
+ from atdata.local._entry import (
12
+ LocalDatasetEntry,
13
+ BasicIndexEntry,
14
+ REDIS_KEY_DATASET_ENTRY,
15
+ REDIS_KEY_SCHEMA,
16
+ )
17
+ from atdata.local._schema import (
18
+ SchemaNamespace,
19
+ SchemaFieldType,
20
+ SchemaField,
21
+ LocalSchemaRecord,
22
+ _ATDATA_URI_PREFIX,
23
+ _LEGACY_URI_PREFIX,
24
+ _kind_str_for_sample_type,
25
+ _schema_ref_from_type,
26
+ _make_schema_ref,
27
+ _parse_schema_ref,
28
+ _increment_patch,
29
+ _python_type_to_field_type,
30
+ _build_schema_record,
31
+ )
32
+ from atdata.local._index import Index
33
+ from atdata.local._s3 import (
34
+ S3DataStore,
35
+ _s3_env,
36
+ _s3_from_credentials,
37
+ _create_s3_write_callbacks,
38
+ )
39
+ from atdata.local._repo_legacy import Repo
40
+
41
+ # Re-export third-party types that were previously importable from the
42
+ # monolithic local.py (tests reference atdata.local.S3FileSystem, etc.)
43
+ from s3fs import S3FileSystem # noqa: F401 — re-exported for backward compat
44
+
45
+ __all__ = [
46
+ # Public API
47
+ "Index",
48
+ "LocalDatasetEntry",
49
+ "BasicIndexEntry",
50
+ "S3DataStore",
51
+ "Repo",
52
+ "SchemaNamespace",
53
+ "SchemaFieldType",
54
+ "SchemaField",
55
+ "LocalSchemaRecord",
56
+ "REDIS_KEY_DATASET_ENTRY",
57
+ "REDIS_KEY_SCHEMA",
58
+ # Internal helpers (re-exported for backward compatibility)
59
+ "_ATDATA_URI_PREFIX",
60
+ "_LEGACY_URI_PREFIX",
61
+ "_kind_str_for_sample_type",
62
+ "_schema_ref_from_type",
63
+ "_make_schema_ref",
64
+ "_parse_schema_ref",
65
+ "_increment_patch",
66
+ "_python_type_to_field_type",
67
+ "_build_schema_record",
68
+ "_s3_env",
69
+ "_s3_from_credentials",
70
+ "_create_s3_write_callbacks",
71
+ ]
atdata/local/_entry.py ADDED
@@ -0,0 +1,157 @@
1
+ """Dataset entry model and Redis key constants."""
2
+
3
+ from atdata._cid import generate_cid
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import Any, cast
7
+
8
+ import msgpack
9
+ from redis import Redis
10
+
11
+
12
+ # Redis key prefixes for index entries and schemas
13
+ REDIS_KEY_DATASET_ENTRY = "LocalDatasetEntry"
14
+ REDIS_KEY_SCHEMA = "LocalSchema"
15
+
16
+
17
+ @dataclass
18
+ class LocalDatasetEntry:
19
+ """Index entry for a dataset stored in the local repository.
20
+
21
+ Implements the IndexEntry protocol for compatibility with AbstractIndex.
22
+ Uses dual identity: a content-addressable CID (ATProto-compatible) and
23
+ a human-readable name.
24
+
25
+ The CID is generated from the entry's content (schema_ref + data_urls),
26
+ ensuring the same data produces the same CID whether stored locally or
27
+ in the atmosphere. This enables seamless promotion from local to ATProto.
28
+
29
+ Attributes:
30
+ name: Human-readable name for this dataset.
31
+ schema_ref: Reference to the schema for this dataset.
32
+ data_urls: WebDataset URLs for the data.
33
+ metadata: Arbitrary metadata dictionary, or None if not set.
34
+ """
35
+
36
+ ##
37
+
38
+ name: str
39
+ """Human-readable name for this dataset."""
40
+
41
+ schema_ref: str
42
+ """Reference to the schema for this dataset."""
43
+
44
+ data_urls: list[str]
45
+ """WebDataset URLs for the data."""
46
+
47
+ metadata: dict | None = None
48
+ """Arbitrary metadata dictionary, or None if not set."""
49
+
50
+ _cid: str | None = field(default=None, repr=False)
51
+ """Content identifier (ATProto-compatible CID). Generated from content if not provided."""
52
+
53
+ # Legacy field for backwards compatibility during migration
54
+ _legacy_uuid: str | None = field(default=None, repr=False)
55
+ """Legacy UUID for backwards compatibility with existing Redis entries."""
56
+
57
+ def __post_init__(self):
58
+ """Generate CID from content if not provided."""
59
+ if self._cid is None:
60
+ self._cid = self._generate_cid()
61
+
62
+ def _generate_cid(self) -> str:
63
+ """Generate ATProto-compatible CID from entry content."""
64
+ # CID is based on schema_ref and data_urls - the identity of the dataset
65
+ content = {
66
+ "schema_ref": self.schema_ref,
67
+ "data_urls": self.data_urls,
68
+ }
69
+ return generate_cid(content)
70
+
71
+ @property
72
+ def cid(self) -> str:
73
+ """Content identifier (ATProto-compatible CID)."""
74
+ if self._cid is None:
75
+ raise RuntimeError(
76
+ "CID not initialized; this should not happen after __post_init__"
77
+ )
78
+ return self._cid
79
+
80
+ # Legacy compatibility
81
+
82
+ @property
83
+ def wds_url(self) -> str:
84
+ """Legacy property: returns first data URL for backwards compatibility."""
85
+ return self.data_urls[0] if self.data_urls else ""
86
+
87
+ @property
88
+ def sample_kind(self) -> str:
89
+ """Legacy property: returns schema_ref for backwards compatibility."""
90
+ return self.schema_ref
91
+
92
+ def write_to(self, redis: Redis):
93
+ """Persist this index entry to Redis.
94
+
95
+ Stores the entry as a Redis hash with key '{REDIS_KEY_DATASET_ENTRY}:{cid}'.
96
+
97
+ Args:
98
+ redis: Redis connection to write to.
99
+ """
100
+ save_key = f"{REDIS_KEY_DATASET_ENTRY}:{self.cid}"
101
+ data: dict[str, Any] = {
102
+ "name": self.name,
103
+ "schema_ref": self.schema_ref,
104
+ "data_urls": msgpack.packb(self.data_urls), # Serialize list
105
+ "cid": self.cid,
106
+ }
107
+ if self.metadata is not None:
108
+ data["metadata"] = msgpack.packb(self.metadata)
109
+ if self._legacy_uuid is not None:
110
+ data["legacy_uuid"] = self._legacy_uuid
111
+
112
+ redis.hset(save_key, mapping=data) # type: ignore[arg-type]
113
+
114
+ @classmethod
115
+ def from_redis(cls, redis: Redis, cid: str) -> "LocalDatasetEntry":
116
+ """Load an entry from Redis by CID.
117
+
118
+ Args:
119
+ redis: Redis connection to read from.
120
+ cid: Content identifier of the entry to load.
121
+
122
+ Returns:
123
+ LocalDatasetEntry loaded from Redis.
124
+
125
+ Raises:
126
+ KeyError: If entry not found.
127
+ """
128
+ save_key = f"{REDIS_KEY_DATASET_ENTRY}:{cid}"
129
+ raw_data = redis.hgetall(save_key)
130
+ if not raw_data:
131
+ raise KeyError(f"{REDIS_KEY_DATASET_ENTRY} not found: {cid}")
132
+
133
+ # Decode string fields, keep binary fields as bytes for msgpack
134
+ raw_data_typed = cast(dict[bytes, bytes], raw_data)
135
+ name = raw_data_typed[b"name"].decode("utf-8")
136
+ schema_ref = raw_data_typed[b"schema_ref"].decode("utf-8")
137
+ cid_value = raw_data_typed.get(b"cid", b"").decode("utf-8") or None
138
+ legacy_uuid = raw_data_typed.get(b"legacy_uuid", b"").decode("utf-8") or None
139
+
140
+ # Deserialize msgpack fields (stored as raw bytes)
141
+ data_urls = msgpack.unpackb(raw_data_typed[b"data_urls"])
142
+ metadata = None
143
+ if b"metadata" in raw_data_typed:
144
+ metadata = msgpack.unpackb(raw_data_typed[b"metadata"])
145
+
146
+ return cls(
147
+ name=name,
148
+ schema_ref=schema_ref,
149
+ data_urls=data_urls,
150
+ metadata=metadata,
151
+ _cid=cid_value,
152
+ _legacy_uuid=legacy_uuid,
153
+ )
154
+
155
+
156
+ # Backwards compatibility alias
157
+ BasicIndexEntry = LocalDatasetEntry