atdata 0.3.0b1__py3-none-any.whl → 0.3.2b1__py3-none-any.whl

This diff shows the changes between two publicly released versions of a package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
Files changed (53)
  1. atdata/__init__.py +11 -0
  2. atdata/_cid.py +0 -21
  3. atdata/_helpers.py +12 -0
  4. atdata/_hf_api.py +46 -1
  5. atdata/_logging.py +43 -0
  6. atdata/_protocols.py +81 -182
  7. atdata/_schema_codec.py +2 -2
  8. atdata/_sources.py +24 -4
  9. atdata/_stub_manager.py +5 -25
  10. atdata/atmosphere/__init__.py +60 -21
  11. atdata/atmosphere/_lexicon_types.py +595 -0
  12. atdata/atmosphere/_types.py +73 -245
  13. atdata/atmosphere/client.py +64 -12
  14. atdata/atmosphere/lens.py +60 -53
  15. atdata/atmosphere/records.py +291 -100
  16. atdata/atmosphere/schema.py +91 -65
  17. atdata/atmosphere/store.py +68 -66
  18. atdata/cli/__init__.py +16 -16
  19. atdata/cli/diagnose.py +2 -2
  20. atdata/cli/{local.py → infra.py} +10 -10
  21. atdata/dataset.py +266 -47
  22. atdata/index/__init__.py +54 -0
  23. atdata/{local → index}/_entry.py +6 -2
  24. atdata/{local → index}/_index.py +617 -72
  25. atdata/{local → index}/_schema.py +5 -5
  26. atdata/lexicons/__init__.py +127 -0
  27. atdata/lexicons/ac.foundation.dataset.arrayFormat.json +16 -0
  28. atdata/lexicons/ac.foundation.dataset.getLatestSchema.json +78 -0
  29. atdata/lexicons/ac.foundation.dataset.lens.json +101 -0
  30. atdata/lexicons/ac.foundation.dataset.record.json +117 -0
  31. atdata/lexicons/ac.foundation.dataset.schema.json +107 -0
  32. atdata/lexicons/ac.foundation.dataset.schemaType.json +16 -0
  33. atdata/lexicons/ac.foundation.dataset.storageBlobs.json +46 -0
  34. atdata/lexicons/ac.foundation.dataset.storageExternal.json +25 -0
  35. atdata/lexicons/ac.foundation.dataset.storageHttp.json +45 -0
  36. atdata/lexicons/ac.foundation.dataset.storageS3.json +61 -0
  37. atdata/lexicons/ndarray_shim.json +16 -0
  38. atdata/local/__init__.py +12 -13
  39. atdata/local/_repo_legacy.py +3 -3
  40. atdata/manifest/__init__.py +4 -0
  41. atdata/manifest/_proxy.py +321 -0
  42. atdata/promote.py +14 -10
  43. atdata/repository.py +66 -16
  44. atdata/stores/__init__.py +23 -0
  45. atdata/stores/_disk.py +131 -0
  46. atdata/{local → stores}/_s3.py +134 -112
  47. atdata/testing.py +12 -8
  48. {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/METADATA +2 -2
  49. atdata-0.3.2b1.dist-info/RECORD +71 -0
  50. atdata-0.3.0b1.dist-info/RECORD +0 -54
  51. {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/WHEEL +0 -0
  52. {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/entry_points.txt +0 -0
  53. {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/licenses/LICENSE +0 -0
@@ -1,15 +1,22 @@
1
1
  """Type definitions for ATProto record structures.
2
2
 
3
- This module defines the data structures used to represent ATProto records
4
- for schemas, datasets, and lenses. These types map to the Lexicon definitions
5
- in the ``ac.foundation.dataset.*`` namespace.
3
+ This module provides the ``AtUri`` utility class and the ``LEXICON_NAMESPACE``
4
+ constant. Lexicon-mirror record types (``LexSchemaRecord``, ``LexDatasetRecord``,
5
+ ``LexLensRecord``, etc.) have moved to ``atdata.atmosphere._lexicon_types``.
6
+
7
+ The old type names (``SchemaRecord``, ``DatasetRecord``, ``LensRecord``,
8
+ ``StorageLocation``, ``FieldType``, ``FieldDef``, ``CodeReference``) are
9
+ re-exported here as deprecated aliases for backward compatibility.
6
10
  """
7
11
 
8
- from dataclasses import dataclass, field
9
- from datetime import datetime, timezone
10
- from typing import Optional, Literal, Any
12
+ from __future__ import annotations
13
+
14
+ import warnings
15
+ from dataclasses import dataclass
16
+ from typing import Any, Literal, Optional
11
17
 
12
- # Lexicon namespace for atdata records
18
+ # Canonical constant also defined in _lexicon_types but kept here as the
19
+ # historically authoritative location so existing imports continue to work.
13
20
  LEXICON_NAMESPACE = "ac.foundation.dataset"
14
21
 
15
22
 
@@ -20,11 +27,11 @@ class AtUri:
20
27
  AT URIs follow the format: at://<authority>/<collection>/<rkey>
21
28
 
22
29
  Examples:
23
- >>> uri = AtUri.parse("at://did:plc:abc123/ac.foundation.dataset.sampleSchema/xyz")
30
+ >>> uri = AtUri.parse("at://did:plc:abc123/ac.foundation.dataset.schema/xyz")
24
31
  >>> uri.authority
25
32
  'did:plc:abc123'
26
33
  >>> uri.collection
27
- 'ac.foundation.dataset.sampleSchema'
34
+ 'ac.foundation.dataset.schema'
28
35
  >>> uri.rkey
29
36
  'xyz'
30
37
  """
@@ -39,7 +46,7 @@ class AtUri:
39
46
  """The record key within the collection."""
40
47
 
41
48
  @classmethod
42
- def parse(cls, uri: str) -> "AtUri":
49
+ def parse(cls, uri: str) -> AtUri:
43
50
  """Parse an AT URI string into components.
44
51
 
45
52
  Args:
@@ -71,261 +78,82 @@ class AtUri:
71
78
  return f"at://{self.authority}/{self.collection}/{self.rkey}"
72
79
 
73
80
 
74
- @dataclass
75
- class FieldType:
76
- """Schema field type definition.
81
+ # ---------------------------------------------------------------------------
82
+ # Deprecated re-exports (will be removed in a future version)
83
+ # ---------------------------------------------------------------------------
84
+ # These names existed in this module before the lexicon-mirror types were
85
+ # split into _lexicon_types.py. They are re-exported here so that existing
86
+ # imports like ``from atdata.atmosphere._types import SchemaRecord`` continue
87
+ # to work during the migration period.
88
+
89
+
90
+ def __getattr__(name: str) -> Any:
91
+ _DEPRECATED_ALIASES: dict[str, tuple[str, str]] = {
92
+ # old name → (new module attribute, import path in _lexicon_types)
93
+ "FieldType": ("FieldType", "atdata.atmosphere._lexicon_types"),
94
+ "FieldDef": ("FieldDef", "atdata.atmosphere._lexicon_types"),
95
+ "SchemaRecord": ("LexSchemaRecord", "atdata.atmosphere._lexicon_types"),
96
+ "DatasetRecord": ("LexDatasetRecord", "atdata.atmosphere._lexicon_types"),
97
+ "LensRecord": ("LexLensRecord", "atdata.atmosphere._lexicon_types"),
98
+ "StorageLocation": ("StorageLocation", "atdata.atmosphere._lexicon_types"),
99
+ "CodeReference": ("LexCodeReference", "atdata.atmosphere._lexicon_types"),
100
+ }
101
+ if name in _DEPRECATED_ALIASES:
102
+ new_name, mod_path = _DEPRECATED_ALIASES[name]
103
+ warnings.warn(
104
+ f"{name} has been moved. Import {new_name} from {mod_path} instead.",
105
+ DeprecationWarning,
106
+ stacklevel=2,
107
+ )
108
+ from . import _lexicon_types
77
109
 
78
- Represents a type in the schema type system, supporting primitives,
79
- ndarrays, and references to other schemas.
80
- """
110
+ # For StorageLocation, provide a lightweight shim
111
+ if name == "StorageLocation":
112
+ return _StorageLocationShim
113
+ # FieldType / FieldDef don't exist in _lexicon_types; they were
114
+ # internal-only types used by the old SchemaRecord. Return them
115
+ # from the shim definitions below.
116
+ if name in ("FieldType", "FieldDef"):
117
+ return _FIELD_SHIMS[name]
118
+ return getattr(_lexicon_types, new_name)
119
+ raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
81
120
 
82
- kind: Literal["primitive", "ndarray", "ref", "array"]
83
- """The category of type."""
84
121
 
85
- primitive: Optional[str] = None
86
- """For kind='primitive': one of 'str', 'int', 'float', 'bool', 'bytes'."""
122
+ # Lightweight shims for types that have no direct equivalent in _lexicon_types
87
123
 
88
- dtype: Optional[str] = None
89
- """For kind='ndarray': numpy dtype string (e.g., 'float32')."""
90
124
 
91
- shape: Optional[list[int | None]] = None
92
- """For kind='ndarray': shape constraints (None for any dimension)."""
125
+ @dataclass
126
+ class _FieldTypeShim:
127
+ """Deprecated: schema field type used by the old SchemaRecord."""
93
128
 
129
+ kind: Literal["primitive", "ndarray", "ref", "array"]
130
+ primitive: Optional[str] = None
131
+ dtype: Optional[str] = None
132
+ shape: Optional[list[int | None]] = None
94
133
  ref: Optional[str] = None
95
- """For kind='ref': AT URI of referenced schema."""
96
-
97
- items: Optional["FieldType"] = None
98
- """For kind='array': type of array elements."""
134
+ items: Optional[_FieldTypeShim] = None
99
135
 
100
136
 
101
137
  @dataclass
102
- class FieldDef:
103
- """Schema field definition."""
138
+ class _FieldDefShim:
139
+ """Deprecated: schema field definition used by the old SchemaRecord."""
104
140
 
105
141
  name: str
106
- """Field name."""
107
-
108
- field_type: FieldType
109
- """Type of this field."""
110
-
142
+ field_type: _FieldTypeShim
111
143
  optional: bool = False
112
- """Whether this field can be None."""
113
-
114
- description: Optional[str] = None
115
- """Human-readable description."""
116
-
117
-
118
- @dataclass
119
- class SchemaRecord:
120
- """ATProto record for a PackableSample schema.
121
-
122
- Maps to the ``ac.foundation.dataset.sampleSchema`` Lexicon.
123
- """
124
-
125
- name: str
126
- """Human-readable schema name."""
127
-
128
- version: str
129
- """Semantic version string (e.g., '1.0.0')."""
130
-
131
- fields: list[FieldDef]
132
- """List of field definitions."""
133
-
134
144
  description: Optional[str] = None
135
- """Human-readable description."""
136
-
137
- created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
138
- """When this record was created."""
139
-
140
- metadata: Optional[dict] = None
141
- """Arbitrary metadata as msgpack-encoded bytes."""
142
-
143
- def to_record(self) -> dict:
144
- """Convert to ATProto record dict for publishing."""
145
- record = {
146
- "$type": f"{LEXICON_NAMESPACE}.sampleSchema",
147
- "name": self.name,
148
- "version": self.version,
149
- "fields": [self._field_to_dict(f) for f in self.fields],
150
- "createdAt": self.created_at.isoformat(),
151
- }
152
- if self.description:
153
- record["description"] = self.description
154
- if self.metadata:
155
- record["metadata"] = self.metadata
156
- return record
157
-
158
- def _field_to_dict(self, field_def: FieldDef) -> dict:
159
- """Convert a field definition to dict."""
160
- result = {
161
- "name": field_def.name,
162
- "fieldType": self._type_to_dict(field_def.field_type),
163
- "optional": field_def.optional,
164
- }
165
- if field_def.description:
166
- result["description"] = field_def.description
167
- return result
168
-
169
- def _type_to_dict(self, field_type: FieldType) -> dict:
170
- """Convert a field type to dict."""
171
- result: dict = {"$type": f"{LEXICON_NAMESPACE}.schemaType#{field_type.kind}"}
172
-
173
- if field_type.kind == "primitive":
174
- result["primitive"] = field_type.primitive
175
- elif field_type.kind == "ndarray":
176
- result["dtype"] = field_type.dtype
177
- if field_type.shape:
178
- result["shape"] = field_type.shape
179
- elif field_type.kind == "ref":
180
- result["ref"] = field_type.ref
181
- elif field_type.kind == "array":
182
- if field_type.items:
183
- result["items"] = self._type_to_dict(field_type.items)
184
-
185
- return result
186
145
 
187
146
 
188
147
  @dataclass
189
- class StorageLocation:
190
- """Dataset storage location specification."""
148
+ class _StorageLocationShim:
149
+ """Deprecated: use StorageHttp / StorageS3 / StorageBlobs instead."""
191
150
 
192
151
  kind: Literal["external", "blobs"]
193
- """Storage type: external URLs or ATProto blobs."""
194
-
195
152
  urls: Optional[list[str]] = None
196
- """For kind='external': WebDataset URLs with brace notation."""
197
-
198
153
  blob_refs: Optional[list[dict]] = None
199
- """For kind='blobs': ATProto blob references."""
200
-
201
-
202
- @dataclass
203
- class DatasetRecord:
204
- """ATProto record for a dataset index.
205
-
206
- Maps to the ``ac.foundation.dataset.record`` Lexicon.
207
- """
208
-
209
- name: str
210
- """Human-readable dataset name."""
211
-
212
- schema_ref: str
213
- """AT URI of the schema record."""
214
-
215
- storage: StorageLocation
216
- """Where the dataset data is stored."""
217
-
218
- description: Optional[str] = None
219
- """Human-readable description."""
220
-
221
- tags: list[str] = field(default_factory=list)
222
- """Searchable tags."""
223
-
224
- license: Optional[str] = None
225
- """SPDX license identifier."""
226
-
227
- created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
228
- """When this record was created."""
229
-
230
- metadata: Optional[bytes] = None
231
- """Arbitrary metadata as msgpack-encoded bytes."""
232
-
233
- def to_record(self) -> dict:
234
- """Convert to ATProto record dict for publishing."""
235
- record = {
236
- "$type": f"{LEXICON_NAMESPACE}.record",
237
- "name": self.name,
238
- "schemaRef": self.schema_ref,
239
- "storage": self._storage_to_dict(),
240
- "createdAt": self.created_at.isoformat(),
241
- }
242
- if self.description:
243
- record["description"] = self.description
244
- if self.tags:
245
- record["tags"] = self.tags
246
- if self.license:
247
- record["license"] = self.license
248
- if self.metadata:
249
- record["metadata"] = self.metadata
250
- return record
251
-
252
- def _storage_to_dict(self) -> dict:
253
- """Convert storage location to dict."""
254
- if self.storage.kind == "external":
255
- return {
256
- "$type": f"{LEXICON_NAMESPACE}.storageExternal",
257
- "urls": self.storage.urls or [],
258
- }
259
- else:
260
- return {
261
- "$type": f"{LEXICON_NAMESPACE}.storageBlobs",
262
- "blobs": self.storage.blob_refs or [],
263
- }
264
-
265
154
 
266
- @dataclass
267
- class CodeReference:
268
- """Reference to lens code in a git repository."""
269
-
270
- repository: str
271
- """Git repository URL."""
272
-
273
- commit: str
274
- """Git commit hash."""
275
-
276
- path: str
277
- """Path to the code file/function."""
278
-
279
-
280
- @dataclass
281
- class LensRecord:
282
- """ATProto record for a lens transformation.
283
155
 
284
- Maps to the ``ac.foundation.dataset.lens`` Lexicon.
285
- """
286
-
287
- name: str
288
- """Human-readable lens name."""
289
-
290
- source_schema: str
291
- """AT URI of the source schema."""
292
-
293
- target_schema: str
294
- """AT URI of the target schema."""
295
-
296
- description: Optional[str] = None
297
- """What this transformation does."""
298
-
299
- getter_code: Optional[CodeReference] = None
300
- """Reference to getter function code."""
301
-
302
- putter_code: Optional[CodeReference] = None
303
- """Reference to putter function code."""
304
-
305
- created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
306
- """When this record was created."""
307
-
308
- def to_record(self) -> dict:
309
- """Convert to ATProto record dict for publishing."""
310
- record: dict[str, Any] = {
311
- "$type": f"{LEXICON_NAMESPACE}.lens",
312
- "name": self.name,
313
- "sourceSchema": self.source_schema,
314
- "targetSchema": self.target_schema,
315
- "createdAt": self.created_at.isoformat(),
316
- }
317
- if self.description:
318
- record["description"] = self.description
319
- if self.getter_code:
320
- record["getterCode"] = {
321
- "repository": self.getter_code.repository,
322
- "commit": self.getter_code.commit,
323
- "path": self.getter_code.path,
324
- }
325
- if self.putter_code:
326
- record["putterCode"] = {
327
- "repository": self.putter_code.repository,
328
- "commit": self.putter_code.commit,
329
- "path": self.putter_code.path,
330
- }
331
- return record
156
+ _FIELD_SHIMS: dict[str, type] = {
157
+ "FieldType": _FieldTypeShim,
158
+ "FieldDef": _FieldDefShim,
159
+ }
@@ -1,6 +1,6 @@
1
1
  """ATProto client wrapper for atdata.
2
2
 
3
- This module provides the ``AtmosphereClient`` class which wraps the atproto SDK
3
+ This module provides the ``Atmosphere`` class which wraps the atproto SDK
4
4
  client with atdata-specific helpers for publishing and querying records.
5
5
  """
6
6
 
@@ -28,16 +28,15 @@ def _get_atproto_client_class():
28
28
  return _atproto_client_class
29
29
 
30
30
 
31
- class AtmosphereClient:
31
+ class Atmosphere:
32
32
  """ATProto client wrapper for atdata operations.
33
33
 
34
34
  This class wraps the atproto SDK client and provides higher-level methods
35
35
  for working with atdata records (schemas, datasets, lenses).
36
36
 
37
37
  Examples:
38
- >>> client = AtmosphereClient()
39
- >>> client.login("alice.bsky.social", "app-password")
40
- >>> print(client.did)
38
+ >>> atmo = Atmosphere.login("alice.bsky.social", "app-password")
39
+ >>> print(atmo.did)
41
40
  'did:plc:...'
42
41
 
43
42
  Note:
@@ -65,7 +64,63 @@ class AtmosphereClient:
65
64
 
66
65
  self._session: Optional[dict] = None
67
66
 
68
- def login(self, handle: str, password: str) -> None:
67
+ @classmethod
68
+ def login(
69
+ cls,
70
+ handle: str,
71
+ password: str,
72
+ *,
73
+ base_url: Optional[str] = None,
74
+ ) -> "Atmosphere":
75
+ """Create an authenticated Atmosphere client.
76
+
77
+ Args:
78
+ handle: Your Bluesky handle (e.g., 'alice.bsky.social').
79
+ password: App-specific password (not your main password).
80
+ base_url: Optional PDS base URL. Defaults to bsky.social.
81
+
82
+ Returns:
83
+ An authenticated Atmosphere instance.
84
+
85
+ Raises:
86
+ atproto.exceptions.AtProtocolError: If authentication fails.
87
+
88
+ Examples:
89
+ >>> atmo = Atmosphere.login("alice.bsky.social", "app-password")
90
+ >>> index = Index(atmosphere=atmo)
91
+ """
92
+ instance = cls(base_url=base_url)
93
+ instance._login(handle, password)
94
+ return instance
95
+
96
+ @classmethod
97
+ def from_session(
98
+ cls,
99
+ session_string: str,
100
+ *,
101
+ base_url: Optional[str] = None,
102
+ ) -> "Atmosphere":
103
+ """Create an Atmosphere client from an exported session string.
104
+
105
+ This allows reusing a session without re-authenticating, which helps
106
+ avoid rate limits on session creation.
107
+
108
+ Args:
109
+ session_string: Session string from ``export_session()``.
110
+ base_url: Optional PDS base URL. Defaults to bsky.social.
111
+
112
+ Returns:
113
+ An authenticated Atmosphere instance.
114
+
115
+ Examples:
116
+ >>> session = atmo.export_session()
117
+ >>> atmo2 = Atmosphere.from_session(session)
118
+ """
119
+ instance = cls(base_url=base_url)
120
+ instance._login_with_session(session_string)
121
+ return instance
122
+
123
+ def _login(self, handle: str, password: str) -> None:
69
124
  """Authenticate with the ATProto PDS.
70
125
 
71
126
  Args:
@@ -81,12 +136,9 @@ class AtmosphereClient:
81
136
  "handle": profile.handle,
82
137
  }
83
138
 
84
- def login_with_session(self, session_string: str) -> None:
139
+ def _login_with_session(self, session_string: str) -> None:
85
140
  """Authenticate using an exported session string.
86
141
 
87
- This allows reusing a session without re-authenticating, which helps
88
- avoid rate limits on session creation.
89
-
90
142
  Args:
91
143
  session_string: Session string from ``export_session()``.
92
144
  """
@@ -161,7 +213,7 @@ class AtmosphereClient:
161
213
 
162
214
  Args:
163
215
  collection: The NSID of the record collection
164
- (e.g., 'ac.foundation.dataset.sampleSchema').
216
+ (e.g., 'ac.foundation.dataset.schema').
165
217
  record: The record data. Must include a '$type' field.
166
218
  rkey: Optional explicit record key. If not provided, a TID is generated.
167
219
  validate: Whether to validate against the Lexicon schema. Set to False
@@ -487,7 +539,7 @@ class AtmosphereClient:
487
539
  List of schema records.
488
540
  """
489
541
  records, _ = self.list_records(
490
- f"{LEXICON_NAMESPACE}.sampleSchema",
542
+ f"{LEXICON_NAMESPACE}.schema",
491
543
  repo=repo,
492
544
  limit=limit,
493
545
  )