atdata-0.1.3b3-py3-none-any.whl → atdata-0.2.0a1-py3-none-any.whl

atdata/__init__.py CHANGED
@@ -1,4 +1,39 @@
- """A loose federation of distributed, typed datasets"""
+ """A loose federation of distributed, typed datasets.
+
+ ``atdata`` provides a typed dataset abstraction built on WebDataset, with support
+ for:
+
+ - **Typed samples** with automatic msgpack serialization
+ - **NDArray handling** with transparent bytes conversion
+ - **Lens transformations** for viewing datasets through different type schemas
+ - **Batch aggregation** with automatic numpy array stacking
+ - **WebDataset integration** for efficient large-scale dataset storage
+
+ Quick Start:
+     >>> import atdata
+     >>> import numpy as np
+     >>>
+     >>> @atdata.packable
+     ... class MyData:
+     ...     features: np.ndarray
+     ...     label: str
+     >>>
+     >>> # Create dataset from WebDataset tar files
+     >>> ds = atdata.Dataset[MyData]("path/to/data-{000000..000009}.tar")
+     >>>
+     >>> # Iterate with automatic batching
+     >>> for batch in ds.shuffled(batch_size=32):
+     ...     features = batch.features  # numpy array (32, ...)
+     ...     labels = batch.label  # list of 32 strings
+
+ Main Components:
+ - ``PackableSample``: Base class for msgpack-serializable samples
+ - ``Dataset``: Typed dataset wrapper for WebDataset
+ - ``SampleBatch``: Automatic batch aggregation
+ - ``Lens``: Bidirectional type transformations
+ - ``@packable``: Decorator for creating PackableSample classes
+ - ``@lens``: Decorator for creating lens transformations
+ """
  
  ##
  # Expose components
@@ -16,5 +51,8 @@ from .lens import (
      lens,
  )
  
+ # ATProto integration (lazy import to avoid requiring atproto package)
+ from . import atmosphere
+
  
  #
atdata/_helpers.py CHANGED
@@ -1,4 +1,16 @@
- """Assorted helper methods for `atdata`"""
+ """Helper utilities for numpy array serialization.
+
+ This module provides utility functions for converting numpy arrays to and from
+ bytes for msgpack serialization. The functions use numpy's native save/load
+ format to preserve array dtype and shape information.
+
+ Functions:
+ - ``array_to_bytes()``: Serialize numpy array to bytes
+ - ``bytes_to_array()``: Deserialize bytes to numpy array
+
+ These helpers are used internally by ``PackableSample`` to enable transparent
+ handling of NDArray fields during msgpack packing/unpacking.
+ """
  
  ##
  # Imports
@@ -11,12 +23,36 @@ import numpy as np
  ##
  
  def array_to_bytes( x: np.ndarray ) -> bytes:
-     """Convert `numpy` array to a format suitable for packing"""
+     """Convert a numpy array to bytes for msgpack serialization.
+
+     Uses numpy's native ``save()`` format to preserve array dtype and shape.
+
+     Args:
+         x: A numpy array to serialize.
+
+     Returns:
+         Raw bytes representing the serialized array.
+
+     Note:
+         Uses ``allow_pickle=True`` to support object dtypes.
+     """
      np_bytes = BytesIO()
      np.save( np_bytes, x, allow_pickle = True )
      return np_bytes.getvalue()
  
  def bytes_to_array( b: bytes ) -> np.ndarray:
-     """Convert packed bytes back to a `numpy` array"""
+     """Convert serialized bytes back to a numpy array.
+
+     Reverses the serialization performed by ``array_to_bytes()``.
+
+     Args:
+         b: Raw bytes from a serialized numpy array.
+
+     Returns:
+         The deserialized numpy array with original dtype and shape.
+
+     Note:
+         Uses ``allow_pickle=True`` to support object dtypes.
+     """
      np_bytes = BytesIO( b )
      return np.load( np_bytes, allow_pickle = True )
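
For reference, a minimal round-trip sketch of the two helpers above. This is illustrative, not part of the package, and assumes `atdata` and numpy are installed:

```python
import numpy as np

from atdata._helpers import array_to_bytes, bytes_to_array

x = np.arange(6, dtype=np.float32).reshape(2, 3)
b = array_to_bytes(x)    # raw np.save() bytes; dtype and shape travel with the data
y = bytes_to_array(b)    # reload into an identical array

assert y.dtype == x.dtype
assert np.array_equal(x, y)
```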
atdata/atmosphere/__init__.py ADDED
@@ -0,0 +1,61 @@
+ """ATProto integration for distributed dataset federation.
+
+ This module provides ATProto publishing and discovery capabilities for atdata,
+ enabling a loose federation of distributed, typed datasets on the AT Protocol
+ network.
+
+ Key components:
+
+ - ``AtmosphereClient``: Authentication and session management for ATProto
+ - ``SchemaPublisher``: Publish PackableSample schemas as ATProto records
+ - ``DatasetPublisher``: Publish dataset index records with WebDataset URLs
+ - ``LensPublisher``: Publish lens transformation records
+
+ The ATProto integration is additive - existing atdata functionality continues
+ to work unchanged. These features are opt-in for users who want to publish
+ or discover datasets on the ATProto network.
+
+ Example:
+     >>> from atdata.atmosphere import AtmosphereClient, SchemaPublisher
+     >>>
+     >>> client = AtmosphereClient()
+     >>> client.login("handle.bsky.social", "app-password")
+     >>>
+     >>> publisher = SchemaPublisher(client)
+     >>> schema_uri = publisher.publish(MySampleType, version="1.0.0")
+
+ Note:
+     This module requires the ``atproto`` package to be installed::
+
+         pip install atproto
+ """
+
+ from .client import AtmosphereClient
+ from .schema import SchemaPublisher, SchemaLoader
+ from .records import DatasetPublisher, DatasetLoader
+ from .lens import LensPublisher, LensLoader
+ from ._types import (
+     AtUri,
+     SchemaRecord,
+     DatasetRecord,
+     LensRecord,
+ )
+
+ __all__ = [
+     # Client
+     "AtmosphereClient",
+     # Schema operations
+     "SchemaPublisher",
+     "SchemaLoader",
+     # Dataset operations
+     "DatasetPublisher",
+     "DatasetLoader",
+     # Lens operations
+     "LensPublisher",
+     "LensLoader",
+     # Types
+     "AtUri",
+     "SchemaRecord",
+     "DatasetRecord",
+     "LensRecord",
+ ]
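
Since this `__init__` imports its submodules eagerly, a consumer who wants to treat ATProto support as optional (per the Note in the docstring above) can guard the import. A hedged sketch; the try/except pattern here is ours, not the package's:

```python
# Degrade gracefully when the optional ``atproto`` dependency is missing.
try:
    from atdata.atmosphere import AtmosphereClient, SchemaPublisher
    HAS_ATMOSPHERE = True
except ImportError:
    HAS_ATMOSPHERE = False  # ATProto publishing features unavailable
```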
atdata/atmosphere/_types.py ADDED
@@ -0,0 +1,329 @@
+ """Type definitions for ATProto record structures.
+
+ This module defines the data structures used to represent ATProto records
+ for schemas, datasets, and lenses. These types map to the Lexicon definitions
+ in the ``ac.foundation.dataset.*`` namespace.
+ """
+
+ from dataclasses import dataclass, field
+ from datetime import datetime, timezone
+ from typing import Optional, Literal, Any
+
+ # Lexicon namespace for atdata records
+ LEXICON_NAMESPACE = "ac.foundation.dataset"
+
+
+ @dataclass
+ class AtUri:
+     """Parsed AT Protocol URI.
+
+     AT URIs follow the format: at://<authority>/<collection>/<rkey>
+
+     Example:
+         >>> uri = AtUri.parse("at://did:plc:abc123/ac.foundation.dataset.sampleSchema/xyz")
+         >>> uri.authority
+         'did:plc:abc123'
+         >>> uri.collection
+         'ac.foundation.dataset.sampleSchema'
+         >>> uri.rkey
+         'xyz'
+     """
+
+     authority: str
+     """The DID or handle of the repository owner."""
+
+     collection: str
+     """The NSID of the record collection."""
+
+     rkey: str
+     """The record key within the collection."""
+
+     @classmethod
+     def parse(cls, uri: str) -> "AtUri":
+         """Parse an AT URI string into components.
+
+         Args:
+             uri: AT URI string in format ``at://<authority>/<collection>/<rkey>``
+
+         Returns:
+             Parsed AtUri instance.
+
+         Raises:
+             ValueError: If the URI format is invalid.
+         """
+         if not uri.startswith("at://"):
+             raise ValueError(f"Invalid AT URI: must start with 'at://': {uri}")
+
+         parts = uri[5:].split("/")
+         if len(parts) < 3:
+             raise ValueError(f"Invalid AT URI: expected authority/collection/rkey: {uri}")
+
+         return cls(
+             authority=parts[0],
+             collection=parts[1],
+             rkey="/".join(parts[2:]),  # rkey may contain slashes
+         )
+
+     def __str__(self) -> str:
+         """Format as AT URI string."""
+         return f"at://{self.authority}/{self.collection}/{self.rkey}"
+
+
+ @dataclass
+ class FieldType:
+     """Schema field type definition.
+
+     Represents a type in the schema type system, supporting primitives,
+     ndarrays, and references to other schemas.
+     """
+
+     kind: Literal["primitive", "ndarray", "ref", "array"]
+     """The category of type."""
+
+     primitive: Optional[str] = None
+     """For kind='primitive': one of 'str', 'int', 'float', 'bool', 'bytes'."""
+
+     dtype: Optional[str] = None
+     """For kind='ndarray': numpy dtype string (e.g., 'float32')."""
+
+     shape: Optional[list[int | None]] = None
+     """For kind='ndarray': shape constraints (None for any dimension)."""
+
+     ref: Optional[str] = None
+     """For kind='ref': AT URI of referenced schema."""
+
+     items: Optional["FieldType"] = None
+     """For kind='array': type of array elements."""
+
+
+ @dataclass
+ class FieldDef:
+     """Schema field definition."""
+
+     name: str
+     """Field name."""
+
+     field_type: FieldType
+     """Type of this field."""
+
+     optional: bool = False
+     """Whether this field can be None."""
+
+     description: Optional[str] = None
+     """Human-readable description."""
+
+
+ @dataclass
+ class SchemaRecord:
+     """ATProto record for a PackableSample schema.
+
+     Maps to the ``ac.foundation.dataset.sampleSchema`` Lexicon.
+     """
+
+     name: str
+     """Human-readable schema name."""
+
+     version: str
+     """Semantic version string (e.g., '1.0.0')."""
+
+     fields: list[FieldDef]
+     """List of field definitions."""
+
+     description: Optional[str] = None
+     """Human-readable description."""
+
+     created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
+     """When this record was created."""
+
+     metadata: Optional[dict] = None
+     """Arbitrary metadata dict."""
+
+     def to_record(self) -> dict:
+         """Convert to ATProto record dict for publishing."""
+         record = {
+             "$type": f"{LEXICON_NAMESPACE}.sampleSchema",
+             "name": self.name,
+             "version": self.version,
+             "fields": [self._field_to_dict(f) for f in self.fields],
+             "createdAt": self.created_at.isoformat(),
+         }
+         if self.description:
+             record["description"] = self.description
+         if self.metadata:
+             record["metadata"] = self.metadata
+         return record
+
+     def _field_to_dict(self, field_def: FieldDef) -> dict:
+         """Convert a field definition to dict."""
+         result = {
+             "name": field_def.name,
+             "fieldType": self._type_to_dict(field_def.field_type),
+             "optional": field_def.optional,
+         }
+         if field_def.description:
+             result["description"] = field_def.description
+         return result
+
+     def _type_to_dict(self, field_type: FieldType) -> dict:
+         """Convert a field type to dict."""
+         result: dict = {"$type": f"{LEXICON_NAMESPACE}.schemaType#{field_type.kind}"}
+
+         if field_type.kind == "primitive":
+             result["primitive"] = field_type.primitive
+         elif field_type.kind == "ndarray":
+             result["dtype"] = field_type.dtype
+             if field_type.shape:
+                 result["shape"] = field_type.shape
+         elif field_type.kind == "ref":
+             result["ref"] = field_type.ref
+         elif field_type.kind == "array":
+             if field_type.items:
+                 result["items"] = self._type_to_dict(field_type.items)
+
+         return result
+
+
+ @dataclass
+ class StorageLocation:
+     """Dataset storage location specification."""
+
+     kind: Literal["external", "blobs"]
+     """Storage type: external URLs or ATProto blobs."""
+
+     urls: Optional[list[str]] = None
+     """For kind='external': WebDataset URLs with brace notation."""
+
+     blob_refs: Optional[list[dict]] = None
+     """For kind='blobs': ATProto blob references."""
+
+
+ @dataclass
+ class DatasetRecord:
+     """ATProto record for a dataset index.
+
+     Maps to the ``ac.foundation.dataset.record`` Lexicon.
+     """
+
+     name: str
+     """Human-readable dataset name."""
+
+     schema_ref: str
+     """AT URI of the schema record."""
+
+     storage: StorageLocation
+     """Where the dataset data is stored."""
+
+     description: Optional[str] = None
+     """Human-readable description."""
+
+     tags: list[str] = field(default_factory=list)
+     """Searchable tags."""
+
+     license: Optional[str] = None
+     """SPDX license identifier."""
+
+     created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
+     """When this record was created."""
+
+     metadata: Optional[bytes] = None
+     """Arbitrary metadata as msgpack-encoded bytes."""
+
+     def to_record(self) -> dict:
+         """Convert to ATProto record dict for publishing."""
+         record = {
+             "$type": f"{LEXICON_NAMESPACE}.record",
+             "name": self.name,
+             "schemaRef": self.schema_ref,
+             "storage": self._storage_to_dict(),
+             "createdAt": self.created_at.isoformat(),
+         }
+         if self.description:
+             record["description"] = self.description
+         if self.tags:
+             record["tags"] = self.tags
+         if self.license:
+             record["license"] = self.license
+         if self.metadata:
+             record["metadata"] = self.metadata
+         return record
+
+     def _storage_to_dict(self) -> dict:
+         """Convert storage location to dict."""
+         if self.storage.kind == "external":
+             return {
+                 "$type": f"{LEXICON_NAMESPACE}.storageExternal",
+                 "urls": self.storage.urls or [],
+             }
+         else:
+             return {
+                 "$type": f"{LEXICON_NAMESPACE}.storageBlobs",
+                 "blobs": self.storage.blob_refs or [],
+             }
+
+
+ @dataclass
+ class CodeReference:
+     """Reference to lens code in a git repository."""
+
+     repository: str
+     """Git repository URL."""
+
+     commit: str
+     """Git commit hash."""
+
+     path: str
+     """Path to the code file/function."""
+
+
+ @dataclass
+ class LensRecord:
+     """ATProto record for a lens transformation.
+
+     Maps to the ``ac.foundation.dataset.lens`` Lexicon.
+     """
+
+     name: str
+     """Human-readable lens name."""
+
+     source_schema: str
+     """AT URI of the source schema."""
+
+     target_schema: str
+     """AT URI of the target schema."""
+
+     description: Optional[str] = None
+     """What this transformation does."""
+
+     getter_code: Optional[CodeReference] = None
+     """Reference to getter function code."""
+
+     putter_code: Optional[CodeReference] = None
+     """Reference to putter function code."""
+
+     created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
+     """When this record was created."""
+
+     def to_record(self) -> dict:
+         """Convert to ATProto record dict for publishing."""
+         record: dict[str, Any] = {
+             "$type": f"{LEXICON_NAMESPACE}.lens",
+             "name": self.name,
+             "sourceSchema": self.source_schema,
+             "targetSchema": self.target_schema,
+             "createdAt": self.created_at.isoformat(),
+         }
+         if self.description:
+             record["description"] = self.description
+         if self.getter_code:
+             record["getterCode"] = {
+                 "repository": self.getter_code.repository,
+                 "commit": self.getter_code.commit,
+                 "path": self.getter_code.path,
+             }
+         if self.putter_code:
+             record["putterCode"] = {
+                 "repository": self.putter_code.repository,
+                 "commit": self.putter_code.commit,
+                 "path": self.putter_code.path,
+             }
+         return record
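
To make the record shapes concrete, here is an illustrative use of the dataclasses above. Field values are invented for the example; only names and signatures visible in this diff are assumed, and running it requires the `atproto` package since importing `atdata.atmosphere` pulls in its submodules:

```python
from atdata.atmosphere._types import (
    AtUri, FieldDef, FieldType, SchemaRecord, StorageLocation, DatasetRecord,
)

# A schema with one ndarray field and one string field
schema = SchemaRecord(
    name="MyData",
    version="1.0.0",
    fields=[
        FieldDef(name="features", field_type=FieldType(kind="ndarray", dtype="float32")),
        FieldDef(name="label", field_type=FieldType(kind="primitive", primitive="str")),
    ],
)
assert schema.to_record()["$type"] == "ac.foundation.dataset.sampleSchema"

# A dataset record pointing at externally hosted WebDataset shards
ds = DatasetRecord(
    name="my-dataset",
    schema_ref="at://did:plc:abc123/ac.foundation.dataset.sampleSchema/xyz",
    storage=StorageLocation(
        kind="external",
        urls=["https://example.com/data-{000000..000009}.tar"],
    ),
)
assert ds.to_record()["storage"]["$type"] == "ac.foundation.dataset.storageExternal"

# AtUri round-trips between parsed and string forms
uri = AtUri.parse(ds.schema_ref)
assert uri.collection == "ac.foundation.dataset.sampleSchema"
assert str(uri) == ds.schema_ref
```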