atdata-0.2.3b1-py3-none-any.whl → atdata-0.3.1b1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. atdata/.gitignore +1 -0
  2. atdata/__init__.py +39 -0
  3. atdata/_cid.py +0 -21
  4. atdata/_exceptions.py +168 -0
  5. atdata/_helpers.py +41 -15
  6. atdata/_hf_api.py +95 -11
  7. atdata/_logging.py +70 -0
  8. atdata/_protocols.py +77 -238
  9. atdata/_schema_codec.py +7 -6
  10. atdata/_stub_manager.py +5 -25
  11. atdata/_type_utils.py +28 -2
  12. atdata/atmosphere/__init__.py +31 -20
  13. atdata/atmosphere/_types.py +4 -4
  14. atdata/atmosphere/client.py +64 -12
  15. atdata/atmosphere/lens.py +11 -12
  16. atdata/atmosphere/records.py +12 -12
  17. atdata/atmosphere/schema.py +16 -18
  18. atdata/atmosphere/store.py +6 -7
  19. atdata/cli/__init__.py +161 -175
  20. atdata/cli/diagnose.py +2 -2
  21. atdata/cli/{local.py → infra.py} +11 -11
  22. atdata/cli/inspect.py +69 -0
  23. atdata/cli/preview.py +63 -0
  24. atdata/cli/schema.py +109 -0
  25. atdata/dataset.py +583 -328
  26. atdata/index/__init__.py +54 -0
  27. atdata/index/_entry.py +157 -0
  28. atdata/index/_index.py +1198 -0
  29. atdata/index/_schema.py +380 -0
  30. atdata/lens.py +9 -2
  31. atdata/lexicons/__init__.py +121 -0
  32. atdata/lexicons/ac.foundation.dataset.arrayFormat.json +16 -0
  33. atdata/lexicons/ac.foundation.dataset.getLatestSchema.json +78 -0
  34. atdata/lexicons/ac.foundation.dataset.lens.json +99 -0
  35. atdata/lexicons/ac.foundation.dataset.record.json +96 -0
  36. atdata/lexicons/ac.foundation.dataset.schema.json +107 -0
  37. atdata/lexicons/ac.foundation.dataset.schemaType.json +16 -0
  38. atdata/lexicons/ac.foundation.dataset.storageBlobs.json +24 -0
  39. atdata/lexicons/ac.foundation.dataset.storageExternal.json +25 -0
  40. atdata/lexicons/ndarray_shim.json +16 -0
  41. atdata/local/__init__.py +70 -0
  42. atdata/local/_repo_legacy.py +218 -0
  43. atdata/manifest/__init__.py +28 -0
  44. atdata/manifest/_aggregates.py +156 -0
  45. atdata/manifest/_builder.py +163 -0
  46. atdata/manifest/_fields.py +154 -0
  47. atdata/manifest/_manifest.py +146 -0
  48. atdata/manifest/_query.py +150 -0
  49. atdata/manifest/_writer.py +74 -0
  50. atdata/promote.py +18 -14
  51. atdata/providers/__init__.py +25 -0
  52. atdata/providers/_base.py +140 -0
  53. atdata/providers/_factory.py +69 -0
  54. atdata/providers/_postgres.py +214 -0
  55. atdata/providers/_redis.py +171 -0
  56. atdata/providers/_sqlite.py +191 -0
  57. atdata/repository.py +323 -0
  58. atdata/stores/__init__.py +23 -0
  59. atdata/stores/_disk.py +123 -0
  60. atdata/stores/_s3.py +349 -0
  61. atdata/testing.py +341 -0
  62. {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/METADATA +5 -2
  63. atdata-0.3.1b1.dist-info/RECORD +67 -0
  64. atdata/local.py +0 -1720
  65. atdata-0.2.3b1.dist-info/RECORD +0 -28
  66. {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/WHEEL +0 -0
  67. {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/entry_points.txt +0 -0
  68. {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/licenses/LICENSE +0 -0
atdata/_protocols.py CHANGED
@@ -1,37 +1,25 @@
 """Protocol definitions for atdata index and storage abstractions.
 
-This module defines the abstract protocols that enable interchangeable
-index backends (local Redis vs ATProto PDS) and data stores (S3 vs PDS blobs).
-
-The key insight is that both local and atmosphere implementations solve the
-same problem: indexed dataset storage with external data URLs. These protocols
-formalize that common interface.
-
-Note:
-    Protocol methods use ``...`` (Ellipsis) as the body per PEP 544. This is
-    the standard Python syntax for Protocol definitions - these are interface
-    specifications, not stub implementations. Concrete classes (LocalIndex,
-    AtmosphereIndex, etc.) provide the actual implementations.
+Defines the abstract protocols that enable interchangeable index backends
+(local SQLite/Redis vs ATProto PDS) and data stores (S3, local disk, PDS blobs).
 
 Protocols:
-    Packable: Structural interface for packable sample types (lens compatibility)
+    Packable: Structural interface for packable sample types
     IndexEntry: Common interface for dataset index entries
     AbstractIndex: Protocol for index operations (schemas, datasets, lenses)
     AbstractDataStore: Protocol for data storage operations
+    DataSource: Protocol for streaming shard data
 
 Examples:
     >>> def process_datasets(index: AbstractIndex) -> None:
     ...     for entry in index.list_datasets():
     ...         print(f"{entry.name}: {entry.data_urls}")
-    ...
-    >>> # Works with either LocalIndex or AtmosphereIndex
-    >>> process_datasets(local_index)
-    >>> process_datasets(atmosphere_index)
 """
 
 from typing import (
     IO,
     Any,
+    Iterable,
     Iterator,
     Optional,
     Protocol,
@@ -77,24 +65,16 @@ class Packable(Protocol):
     """
 
     @classmethod
-    def from_data(cls, data: dict[str, Any]) -> "Packable":
-        """Create instance from unpacked msgpack data dictionary."""
-        ...
+    def from_data(cls, data: dict[str, Any]) -> "Packable": ...
 
     @classmethod
-    def from_bytes(cls, bs: bytes) -> "Packable":
-        """Create instance from raw msgpack bytes."""
-        ...
+    def from_bytes(cls, bs: bytes) -> "Packable": ...
 
     @property
-    def packed(self) -> bytes:
-        """Pack this sample's data into msgpack bytes."""
-        ...
+    def packed(self) -> bytes: ...
 
     @property
-    def as_wds(self) -> dict[str, Any]:
-        """WebDataset-compatible representation with __key__ and msgpack."""
-        ...
+    def as_wds(self) -> dict[str, Any]: ...
 
 
 ##
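
The hunk above strips the docstrings from the Packable protocol members but leaves the interface itself unchanged. As a rough sketch of what structurally satisfies the protocol (not code from the package; the PointSample fields and the direct msgpack calls are illustrative assumptions):

```python
from dataclasses import dataclass
from typing import Any

import msgpack  # assumed serialization dependency for this sketch


@dataclass
class PointSample:
    key: str
    x: float
    y: float

    @classmethod
    def from_data(cls, data: dict[str, Any]) -> "PointSample":
        # Rebuild the sample from an unpacked msgpack dictionary
        return cls(**data)

    @classmethod
    def from_bytes(cls, bs: bytes) -> "PointSample":
        return cls.from_data(msgpack.unpackb(bs))

    @property
    def packed(self) -> bytes:
        return msgpack.packb({"key": self.key, "x": self.x, "y": self.y})

    @property
    def as_wds(self) -> dict[str, Any]:
        # WebDataset-style dict: __key__ plus a msgpack payload
        return {"__key__": self.key, "msgpack": self.packed}
```

Because Packable is a structural protocol, no inheritance is required; any class exposing these four members is accepted.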
@@ -116,16 +96,14 @@ class IndexEntry(Protocol):
     """
 
     @property
-    def name(self) -> str:
-        """Human-readable dataset name."""
-        ...
+    def name(self) -> str: ...
 
     @property
     def schema_ref(self) -> str:
-        """Reference to the schema for this dataset.
+        """Schema reference string.
 
-        For local: 'local://schemas/{module.Class}@{version}'
-        For atmosphere: 'at://did:plc:.../ac.foundation.dataset.sampleSchema/...'
+        Local: ``local://schemas/{module.Class}@{version}``
+        Atmosphere: ``at://did:plc:.../ac.foundation.dataset.schema/...``
         """
         ...
 
@@ -139,9 +117,7 @@ class IndexEntry(Protocol):
         ...
 
     @property
-    def metadata(self) -> Optional[dict]:
-        """Arbitrary metadata dictionary, or None if not set."""
-        ...
+    def metadata(self) -> Optional[dict]: ...
 
 
 ##
@@ -149,32 +125,16 @@ class IndexEntry(Protocol):
 
 
 class AbstractIndex(Protocol):
-    """Protocol for index operations - implemented by LocalIndex and AtmosphereIndex.
-
-    This protocol defines the common interface for managing dataset metadata:
-    - Publishing and retrieving schemas
-    - Inserting and listing datasets
-    - (Future) Publishing and retrieving lenses
+    """Protocol for index operations implemented by Index and AtmosphereIndex.
 
-    A single index can hold datasets of many different sample types. The sample
-    type is tracked via schema references, not as a generic parameter on the index.
-
-    Optional Extensions:
-        Some index implementations support additional features:
-        - ``data_store``: An AbstractDataStore for reading/writing dataset shards.
-          If present, ``load_dataset`` will use it for S3 credential resolution.
+    Manages dataset metadata: publishing/retrieving schemas, inserting/listing
+    datasets. A single index holds datasets of many sample types, tracked via
+    schema references.
 
     Examples:
         >>> def publish_and_list(index: AbstractIndex) -> None:
-        ...     # Publish schemas for different types
-        ...     schema1 = index.publish_schema(ImageSample, version="1.0.0")
-        ...     schema2 = index.publish_schema(TextSample, version="1.0.0")
-        ...
-        ...     # Insert datasets of different types
+        ...     index.publish_schema(ImageSample, version="1.0.0")
        ...     index.insert_dataset(image_ds, name="images")
-        ...     index.insert_dataset(text_ds, name="texts")
-        ...
-        ...     # List all datasets (mixed types)
        ...     for entry in index.list_datasets():
        ...         print(f"{entry.name} -> {entry.schema_ref}")
     """
@@ -183,55 +143,58 @@ class AbstractIndex(Protocol):
     def data_store(self) -> Optional["AbstractDataStore"]:
         """Optional data store for reading/writing shards.
 
-        If present, ``load_dataset`` will use it for credential resolution
-        (e.g., S3 credentials from S3DataStore).
-
-        Returns:
-            AbstractDataStore instance, or None if this index doesn't have
-            an associated data store.
-
-        Note:
-            Not all index implementations provide a data_store. Use
-            ``hasattr(index, 'data_store') and index.data_store is not None``
-            for safe access.
+        If present, ``load_dataset`` uses it for credential resolution.
+        Not all implementations provide a data_store; check with
+        ``getattr(index, 'data_store', None)``.
         """
         ...
 
     # Dataset operations
 
-    def insert_dataset(
+    def write(
         self,
-        ds: "Dataset",
+        samples: Iterable,
         *,
         name: str,
         schema_ref: Optional[str] = None,
         **kwargs,
     ) -> IndexEntry:
-        """Insert a dataset into the index.
+        """Write samples and create an index entry in one step.
 
-        The sample type is inferred from ``ds.sample_type``. If schema_ref is not
-        provided, the schema may be auto-published based on the sample type.
+        Serializes samples to WebDataset tar files, stores them via the
+        appropriate backend, and creates an index entry.
 
         Args:
-            ds: The Dataset to register in the index (any sample type).
-            name: Human-readable name for the dataset.
-            schema_ref: Optional explicit schema reference. If not provided,
-                the schema may be auto-published or inferred from ds.sample_type.
-            **kwargs: Additional backend-specific options.
+            samples: Iterable of Packable samples. Must be non-empty.
+            name: Dataset name, optionally prefixed with target backend.
+            schema_ref: Optional schema reference.
+            **kwargs: Backend-specific options (maxcount, description, etc.).
 
         Returns:
-            IndexEntry for the inserted dataset.
+            IndexEntry for the created dataset.
         """
         ...
 
-    def get_dataset(self, ref: str) -> IndexEntry:
-        """Get a dataset entry by name or reference.
+    def insert_dataset(
+        self,
+        ds: "Dataset",
+        *,
+        name: str,
+        schema_ref: Optional[str] = None,
+        **kwargs,
+    ) -> IndexEntry:
+        """Register an existing dataset in the index.
 
         Args:
-            ref: Dataset name, path, or full reference string.
+            ds: The Dataset to register.
+            name: Human-readable name.
+            schema_ref: Explicit schema ref; auto-published if ``None``.
+            **kwargs: Backend-specific options.
+        """
+        ...
 
-        Returns:
-            IndexEntry for the dataset.
+    def get_dataset(self, ref: str) -> IndexEntry:
+        """Get a dataset entry by name or reference.
 
         Raises:
             KeyError: If dataset not found.
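
The hunk above adds a one-step ``write()`` alongside the existing ``insert_dataset()``. A hedged usage sketch, assuming ``Index`` is the local implementation re-exported at the package root and that ``maxcount``/``description`` are accepted backend kwargs as the docstring suggests:

```python
from atdata import Index  # assumed re-export of the local index implementation

# PointSample is any Packable type (see the sketch after the Packable hunk)
samples = [PointSample(key=f"{i:06d}", x=float(i), y=2.0 * i) for i in range(100)]

index = Index()  # constructor arguments are backend-specific and assumed default here
entry = index.write(
    samples,
    name="points-train",
    maxcount=50,          # backend-specific kwarg: samples per shard (assumed)
    description="demo",   # backend-specific metadata kwarg (assumed)
)
print(entry.name, entry.schema_ref, entry.data_urls)
```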
@@ -239,21 +202,9 @@ class AbstractIndex(Protocol):
         ...
 
     @property
-    def datasets(self) -> Iterator[IndexEntry]:
-        """Lazily iterate over all dataset entries in this index.
-
-        Yields:
-            IndexEntry for each dataset (may be of different sample types).
-        """
-        ...
-
-    def list_datasets(self) -> list[IndexEntry]:
-        """Get all dataset entries as a materialized list.
+    def datasets(self) -> Iterator[IndexEntry]: ...
 
-        Returns:
-            List of IndexEntry for each dataset.
-        """
-        ...
+    def list_datasets(self) -> list[IndexEntry]: ...
 
     # Schema operations
 
@@ -266,80 +217,39 @@ class AbstractIndex(Protocol):
     ) -> str:
         """Publish a schema for a sample type.
 
-        The sample_type is accepted as ``type`` rather than ``Type[Packable]`` to
-        support ``@packable``-decorated classes, which satisfy the Packable protocol
-        at runtime but cannot be statically verified by type checkers.
-
         Args:
-            sample_type: A Packable type (PackableSample subclass or @packable-decorated).
-                Validated at runtime via the @runtime_checkable Packable protocol.
-            version: Semantic version string for the schema.
-            **kwargs: Additional backend-specific options.
+            sample_type: A Packable type (``@packable``-decorated or subclass).
+            version: Semantic version string.
+            **kwargs: Backend-specific options.
 
         Returns:
-            Schema reference string:
-            - Local: 'local://schemas/{module.Class}@{version}'
-            - Atmosphere: 'at://did:plc:.../ac.foundation.dataset.sampleSchema/...'
+            Schema reference string (``local://...`` or ``at://...``).
         """
         ...
 
     def get_schema(self, ref: str) -> dict:
         """Get a schema record by reference.
 
-        Args:
-            ref: Schema reference string (local:// or at://).
-
-        Returns:
-            Schema record as a dictionary with fields like 'name', 'version',
-            'fields', etc.
-
         Raises:
             KeyError: If schema not found.
         """
         ...
 
     @property
-    def schemas(self) -> Iterator[dict]:
-        """Lazily iterate over all schema records in this index.
-
-        Yields:
-            Schema records as dictionaries.
-        """
-        ...
+    def schemas(self) -> Iterator[dict]: ...
 
-    def list_schemas(self) -> list[dict]:
-        """Get all schema records as a materialized list.
-
-        Returns:
-            List of schema records as dictionaries.
-        """
-        ...
+    def list_schemas(self) -> list[dict]: ...
 
     def decode_schema(self, ref: str) -> Type[Packable]:
-        """Reconstruct a Python Packable type from a stored schema.
-
-        This method enables loading datasets without knowing the sample type
-        ahead of time. The index retrieves the schema record and dynamically
-        generates a Packable class matching the schema definition.
-
-        Args:
-            ref: Schema reference string (local:// or at://).
-
-        Returns:
-            A dynamically generated Packable class with fields matching
-            the schema definition. The class can be used with
-            ``Dataset[T]`` to load and iterate over samples.
+        """Reconstruct a Packable type from a stored schema.
 
         Raises:
             KeyError: If schema not found.
-            ValueError: If schema cannot be decoded (unsupported field types).
+            ValueError: If schema has unsupported field types.
 
         Examples:
-            >>> entry = index.get_dataset("my-dataset")
            >>> SampleType = index.decode_schema(entry.schema_ref)
            >>> ds = Dataset[SampleType](entry.data_urls[0])
-            >>> for sample in ds.ordered():
-            ...     print(sample)  # sample is instance of SampleType
         """
         ...
 
@@ -349,21 +259,14 @@ class AbstractIndex(Protocol):
 
 
 class AbstractDataStore(Protocol):
-    """Protocol for data storage operations.
-
-    This protocol abstracts over different storage backends for dataset data:
-    - S3DataStore: S3-compatible object storage
-    - PDSBlobStore: ATProto PDS blob storage (future)
+    """Protocol for data storage backends (S3, local disk, PDS blobs).
 
-    The separation of index (metadata) from data store (actual files) allows
-    flexible deployment: local index with S3 storage, atmosphere index with
-    S3 storage, or atmosphere index with PDS blobs.
+    Separates index (metadata) from data store (shard files), enabling
+    flexible deployment combinations.
 
     Examples:
         >>> store = S3DataStore(credentials, bucket="my-bucket")
         >>> urls = store.write_shards(dataset, prefix="training/v1")
-        >>> print(urls)
-        ['s3://my-bucket/training/v1/shard-000000.tar', ...]
     """
 
     def write_shards(
@@ -377,38 +280,19 @@ class AbstractDataStore(Protocol):
 
         Args:
             ds: The Dataset to write.
-            prefix: Path prefix for the shards (e.g., 'datasets/mnist/v1').
-            **kwargs: Backend-specific options (e.g., maxcount for shard size).
+            prefix: Path prefix (e.g., ``'datasets/mnist/v1'``).
+            **kwargs: Backend-specific options (``maxcount``, ``maxsize``, etc.).
 
         Returns:
-            List of URLs for the written shards, suitable for use with
-            WebDataset or atdata.Dataset().
+            List of shard URLs suitable for ``atdata.Dataset()``.
         """
         ...
 
     def read_url(self, url: str) -> str:
-        """Resolve a storage URL for reading.
-
-        Some storage backends may need to transform URLs (e.g., signing S3 URLs
-        or resolving blob references). This method returns a URL that can be
-        used directly with WebDataset.
-
-        Args:
-            url: Storage URL to resolve.
-
-        Returns:
-            WebDataset-compatible URL for reading.
-        """
+        """Resolve a storage URL for reading (e.g., sign S3 URLs)."""
         ...
 
-    def supports_streaming(self) -> bool:
-        """Whether this store supports streaming reads.
-
-        Returns:
-            True if the store supports efficient streaming (like S3),
-            False if data must be fully downloaded first.
-        """
-        ...
+    def supports_streaming(self) -> bool: ...
 
 
 ##
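
The next hunk condenses the DataSource protocol docstrings. For orientation, a minimal source satisfying its three members might look like the following sketch (the DirectorySource name and the local-tar layout are assumptions, not package code):

```python
from pathlib import Path
from typing import IO, Iterator


class DirectorySource:
    """Yield shards from a directory of local .tar files."""

    def __init__(self, root: str) -> None:
        self._paths = sorted(Path(root).glob("*.tar"))

    @property
    def shards(self) -> Iterator[tuple[str, IO[bytes]]]:
        # (identifier, stream) pairs, consumed by the Dataset's tar expander
        for p in self._paths:
            yield str(p), p.open("rb")

    def list_shards(self) -> list[str]:
        return [str(p) for p in self._paths]

    def open_shard(self, shard_id: str) -> IO[bytes]:
        # Random access to a single shard, e.g. for DataLoader worker splitting
        if shard_id not in self.list_shards():
            raise KeyError(shard_id)
        return Path(shard_id).open("rb")
```

Since DataSource is ``@runtime_checkable``, ``isinstance(DirectorySource("."), DataSource)`` holds as long as these members exist.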
@@ -417,77 +301,32 @@ class AbstractDataStore(Protocol):
 
 @runtime_checkable
 class DataSource(Protocol):
-    """Protocol for data sources that provide streams to Dataset.
+    """Protocol for data sources that stream shard data to Dataset.
 
-    A DataSource abstracts over different ways of accessing dataset shards:
-    - URLSource: Standard WebDataset-compatible URLs (http, https, pipe, gs, etc.)
-    - S3Source: S3-compatible storage with explicit credentials
-    - BlobSource: ATProto blob references (future)
-
-    The key method is ``shards()``, which yields (identifier, stream) pairs.
-    These are fed directly to WebDataset's tar_file_expander, bypassing URL
-    resolution entirely. This enables:
-    - Private S3 repos with credentials
-    - Custom endpoints (Cloudflare R2, MinIO)
-    - ATProto blob streaming
-    - Any other source that can provide file-like objects
+    Implementations (URLSource, S3Source, BlobSource) yield
+    ``(identifier, stream)`` pairs fed to WebDataset's tar expander,
+    bypassing URL resolution. This enables private S3, custom endpoints,
+    and ATProto blob streaming.
 
     Examples:
-        >>> source = S3Source(
-        ...     bucket="my-bucket",
-        ...     keys=["data-000.tar", "data-001.tar"],
-        ...     endpoint="https://r2.example.com",
-        ...     credentials=creds,
-        ... )
+        >>> source = S3Source(bucket="my-bucket", keys=["data-000.tar"])
         >>> ds = Dataset[MySample](source)
-        >>> for sample in ds.ordered():
-        ...     print(sample)
     """
 
     @property
     def shards(self) -> Iterator[tuple[str, IO[bytes]]]:
-        """Lazily yield (identifier, stream) pairs for each shard.
-
-        The identifier is used for error messages and __url__ metadata.
-        The stream must be a file-like object that can be read by tarfile.
-
-        Yields:
-            Tuple of (shard_identifier, file_like_stream).
-
-        Examples:
-            >>> for shard_id, stream in source.shards:
-            ...     print(f"Processing {shard_id}")
-            ...     data = stream.read()
-        """
+        """Lazily yield ``(shard_id, stream)`` pairs for each shard."""
         ...
 
     def list_shards(self) -> list[str]:
-        """Get list of shard identifiers without opening streams.
-
-        Used for metadata queries like counting shards without actually
-        streaming data. Implementations should return identifiers that
-        match what shards would yield.
-
-        Returns:
-            List of shard identifier strings.
-        """
+        """Shard identifiers without opening streams."""
         ...
 
     def open_shard(self, shard_id: str) -> IO[bytes]:
-        """Open a single shard by its identifier.
-
-        This method enables random access to individual shards, which is
-        required for PyTorch DataLoader worker splitting. Each worker opens
-        only its assigned shards rather than iterating all shards.
-
-        Args:
-            shard_id: Shard identifier from shard_list.
-
-        Returns:
-            File-like stream for reading the shard.
+        """Open a single shard for random access (e.g., DataLoader splitting).
 
         Raises:
-            KeyError: If shard_id is not in shard_list.
+            KeyError: If *shard_id* is not in ``list_shards()``.
         """
         ...
 
atdata/_schema_codec.py CHANGED
@@ -28,13 +28,14 @@ import hashlib
 
 from numpy.typing import NDArray
 
-# Import PackableSample for inheritance
+# Import PackableSample for inheritance in dynamic class generation
 from .dataset import PackableSample
+from ._protocols import Packable
 
 
 # Type cache to avoid regenerating identical types
 # Uses insertion order (Python 3.7+) for simple FIFO eviction
-_type_cache: dict[str, Type[PackableSample]] = {}
+_type_cache: dict[str, Type[Packable]] = {}
 _TYPE_CACHE_MAX_SIZE = 256
 
 
@@ -130,7 +131,7 @@ def schema_to_type(
     schema: dict,
     *,
     use_cache: bool = True,
-) -> Type[PackableSample]:
+) -> Type[Packable]:
     """Generate a PackableSample subclass from a schema record.
 
     This function dynamically creates a dataclass that inherits from PackableSample,
@@ -283,7 +284,7 @@ def generate_stub(schema: dict) -> str:
         String content for a .pyi stub file.
 
     Examples:
-        >>> schema = index.get_schema("atdata://local/sampleSchema/MySample@1.0.0")
+        >>> schema = index.get_schema("atdata://local/schema/MySample@1.0.0")
        >>> stub_content = generate_stub(schema.to_dict())
        >>> # Save to a stubs directory configured in your IDE
        >>> with open("stubs/my_sample.pyi", "w") as f:
@@ -359,7 +360,7 @@ def generate_module(schema: dict) -> str:
         String content for a .py module file.
 
     Examples:
-        >>> schema = index.get_schema("atdata://local/sampleSchema/MySample@1.0.0")
+        >>> schema = index.get_schema("atdata://local/schema/MySample@1.0.0")
        >>> module_content = generate_module(schema.to_dict())
        >>> # The module can be imported after being saved
     """
@@ -420,7 +421,7 @@ def clear_type_cache() -> None:
     _type_cache.clear()
 
 
-def get_cached_types() -> dict[str, Type[PackableSample]]:
+def get_cached_types() -> dict[str, Type[Packable]]:
     """Get a copy of the current type cache.
 
     Returns:
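
The annotations above switch the cache from Type[PackableSample] to the structural Type[Packable]; the surrounding comments describe a FIFO cache bounded at 256 entries. A hedged sketch of that eviction policy follows; the ``_cache_put`` helper is hypothetical, not part of the module:

```python
from typing import Type

_type_cache: dict[str, Type] = {}
_TYPE_CACHE_MAX_SIZE = 256


def _cache_put(key: str, value: Type) -> None:
    # Plain dicts preserve insertion order (Python 3.7+), so FIFO eviction
    # is just deleting the first-inserted key when the cache is full.
    if len(_type_cache) >= _TYPE_CACHE_MAX_SIZE:
        oldest = next(iter(_type_cache))
        del _type_cache[oldest]
    _type_cache[key] = value
```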
atdata/_stub_manager.py CHANGED
@@ -15,7 +15,7 @@ Examples:
     >>> index = Index(auto_stubs=True)
     >>>
     >>> # Modules are generated automatically on decode_schema
-    >>> MyType = index.decode_schema("atdata://local/sampleSchema/MySample@1.0.0")
+    >>> MyType = index.decode_schema("atdata://local/schema/MySample@1.0.0")
     >>> # MyType is now properly typed for IDE autocomplete!
     >>>
     >>> # Get the stub directory path for IDE configuration
@@ -51,8 +51,8 @@ def _extract_authority(schema_ref: Optional[str]) -> str:
     """Extract authority from a schema reference URI.
 
     Args:
-        schema_ref: Schema ref like "atdata://local/sampleSchema/Name@1.0.0"
-            or "atdata://alice.bsky.social/sampleSchema/Name@1.0.0"
+        schema_ref: Schema ref like "atdata://local/schema/Name@1.0.0"
+            or "atdata://alice.bsky.social/schema/Name@1.0.0"
 
     Returns:
         Authority string (e.g., "local", "alice.bsky.social", "did_plc_xxx").
@@ -149,10 +149,6 @@ class StubManager:
         safe_version = version.replace(".", "_")
         return f"{name}_{safe_version}.py"
 
-    def _stub_filename(self, name: str, version: str) -> str:
-        """Alias for _module_filename for backwards compatibility."""
-        return self._module_filename(name, version)
-
     def _module_path(
         self, name: str, version: str, authority: str = DEFAULT_AUTHORITY
     ) -> Path:
@@ -168,12 +164,6 @@ class StubManager:
         """
         return self._stub_dir / authority / self._module_filename(name, version)
 
-    def _stub_path(
-        self, name: str, version: str, authority: str = DEFAULT_AUTHORITY
-    ) -> Path:
-        """Alias for _module_path for backwards compatibility."""
-        return self._module_path(name, version, authority)
-
     def _module_is_current(self, path: Path, version: str) -> bool:
         """Check if an existing module file matches the expected version.
 
@@ -200,10 +190,6 @@ class StubManager:
         except (OSError, IOError):
             return False
 
-    def _stub_is_current(self, path: Path, version: str) -> bool:
-        """Alias for _module_is_current for backwards compatibility."""
-        return self._module_is_current(path, version)
-
     def _ensure_authority_package(self, authority: str) -> None:
         """Ensure authority subdirectory exists with __init__.py."""
         self._ensure_dir_exists()
@@ -261,12 +247,6 @@ class StubManager:
                 pass  # Temp file cleanup failed, re-raising original error
             raise
 
-    def _write_stub_atomic(self, path: Path, content: str) -> None:
-        """Legacy method - extracts authority from path and calls _write_module_atomic."""
-        # Extract authority from path (parent directory name)
-        authority = path.parent.name
-        self._write_module_atomic(path, content, authority)
-
     def ensure_stub(self, schema: dict) -> Optional[Path]:
         """Ensure a module file exists for the given schema.
 
@@ -426,7 +406,7 @@ class StubManager:
         Returns:
             Path if stub exists, None otherwise
         """
-        path = self._stub_path(name, version, authority)
+        path = self._module_path(name, version, authority)
         return path if path.exists() else None
 
     def list_stubs(self, authority: Optional[str] = None) -> list[Path]:
@@ -513,7 +493,7 @@ class StubManager:
        Returns:
            True if file was removed, False if it didn't exist
        """
-        path = self._stub_path(name, version, authority)
+        path = self._module_path(name, version, authority)
        if path.exists():
            try:
                path.unlink()
atdata/_type_utils.py CHANGED
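The first hunk below reorders dtype matching: exact lookup first, then substring matching with the longest keys tried first. The sketch right after this sentence illustrates the pitfall being fixed (NUMPY_DTYPE_MAP here is a reduced stand-in for the module's real map):

```python
# "int8" is a substring of "uint8", so a naive first-match scan over an
# arbitrarily ordered map can misclassify unsigned dtypes.
NUMPY_DTYPE_MAP = {"int8": "int8", "uint8": "uint8"}  # stand-in values

dtype_str = "uint8"
naive = next(k for k in NUMPY_DTYPE_MAP if k in dtype_str)
fixed = next(k for k in sorted(NUMPY_DTYPE_MAP, key=len, reverse=True) if k in dtype_str)
print(naive, fixed)  # int8 uint8
```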
@@ -45,9 +45,13 @@ def numpy_dtype_to_string(dtype: Any) -> str:
         Schema dtype string (e.g., "float32", "int64"). Defaults to "float32".
     """
     dtype_str = str(dtype)
-    for key, value in NUMPY_DTYPE_MAP.items():
+    # Exact match first (handles "float32", "int64", etc.)
+    if dtype_str in NUMPY_DTYPE_MAP:
+        return NUMPY_DTYPE_MAP[dtype_str]
+    # Substring match, longest keys first to avoid "int8" matching "uint8"
+    for key in sorted(NUMPY_DTYPE_MAP, key=len, reverse=True):
         if key in dtype_str:
-            return value
+            return NUMPY_DTYPE_MAP[key]
     return "float32"
 
 
@@ -102,3 +106,25 @@ def extract_ndarray_dtype(python_type: Any) -> str:
     if dtype_arg is not None:
         return numpy_dtype_to_string(dtype_arg)
     return "float32"
+
+
+def parse_semver(version: str) -> tuple[int, int, int]:
+    """Parse a semantic version string into a comparable tuple.
+
+    Args:
+        version: A ``"major.minor.patch"`` version string.
+
+    Returns:
+        Tuple of (major, minor, patch) integers.
+
+    Raises:
+        ValueError: If the version string is not valid semver.
+
+    Examples:
+        >>> parse_semver("1.2.3")
+        (1, 2, 3)
+    """
+    parts = version.split(".")
+    if len(parts) != 3:
+        raise ValueError(f"Invalid semver: {version}")
+    return int(parts[0]), int(parts[1]), int(parts[2])
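
A brief usage sketch for the new parse_semver helper; the import path is the private module shown in this diff, and tuple comparison is the reason it returns integers rather than strings:

```python
from atdata._type_utils import parse_semver

# Numeric, not lexicographic, ordering of versions
assert parse_semver("1.10.0") > parse_semver("1.9.3")
assert max(["0.9.0", "0.10.1"], key=parse_semver) == "0.10.1"

try:
    parse_semver("2.0")  # missing patch component
except ValueError as err:
    print(err)  # Invalid semver: 2.0
```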