atdata 0.2.2b1__py3-none-any.whl → 0.3.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. atdata/.gitignore +1 -0
  2. atdata/__init__.py +31 -1
  3. atdata/_cid.py +29 -35
  4. atdata/_exceptions.py +168 -0
  5. atdata/_helpers.py +33 -17
  6. atdata/_hf_api.py +109 -59
  7. atdata/_logging.py +70 -0
  8. atdata/_protocols.py +74 -132
  9. atdata/_schema_codec.py +38 -41
  10. atdata/_sources.py +57 -64
  11. atdata/_stub_manager.py +31 -26
  12. atdata/_type_utils.py +47 -7
  13. atdata/atmosphere/__init__.py +31 -24
  14. atdata/atmosphere/_types.py +11 -11
  15. atdata/atmosphere/client.py +11 -8
  16. atdata/atmosphere/lens.py +27 -30
  17. atdata/atmosphere/records.py +34 -39
  18. atdata/atmosphere/schema.py +35 -31
  19. atdata/atmosphere/store.py +16 -20
  20. atdata/cli/__init__.py +163 -168
  21. atdata/cli/diagnose.py +12 -8
  22. atdata/cli/inspect.py +69 -0
  23. atdata/cli/local.py +5 -2
  24. atdata/cli/preview.py +63 -0
  25. atdata/cli/schema.py +109 -0
  26. atdata/dataset.py +678 -533
  27. atdata/lens.py +85 -83
  28. atdata/local/__init__.py +71 -0
  29. atdata/local/_entry.py +157 -0
  30. atdata/local/_index.py +940 -0
  31. atdata/local/_repo_legacy.py +218 -0
  32. atdata/local/_s3.py +349 -0
  33. atdata/local/_schema.py +380 -0
  34. atdata/manifest/__init__.py +28 -0
  35. atdata/manifest/_aggregates.py +156 -0
  36. atdata/manifest/_builder.py +163 -0
  37. atdata/manifest/_fields.py +154 -0
  38. atdata/manifest/_manifest.py +146 -0
  39. atdata/manifest/_query.py +150 -0
  40. atdata/manifest/_writer.py +74 -0
  41. atdata/promote.py +20 -24
  42. atdata/providers/__init__.py +25 -0
  43. atdata/providers/_base.py +140 -0
  44. atdata/providers/_factory.py +69 -0
  45. atdata/providers/_postgres.py +214 -0
  46. atdata/providers/_redis.py +171 -0
  47. atdata/providers/_sqlite.py +191 -0
  48. atdata/repository.py +323 -0
  49. atdata/testing.py +337 -0
  50. {atdata-0.2.2b1.dist-info → atdata-0.3.0b1.dist-info}/METADATA +5 -1
  51. atdata-0.3.0b1.dist-info/RECORD +54 -0
  52. atdata/local.py +0 -1707
  53. atdata-0.2.2b1.dist-info/RECORD +0 -28
  54. {atdata-0.2.2b1.dist-info → atdata-0.3.0b1.dist-info}/WHEEL +0 -0
  55. {atdata-0.2.2b1.dist-info → atdata-0.3.0b1.dist-info}/entry_points.txt +0 -0
  56. {atdata-0.2.2b1.dist-info → atdata-0.3.0b1.dist-info}/licenses/LICENSE +0 -0
atdata/local/_index.py ADDED
@@ -0,0 +1,940 @@
1
+ """Index class for local dataset management."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from atdata import (
6
+ Dataset,
7
+ )
8
+ from atdata._protocols import AbstractDataStore, Packable
9
+
10
+ from atdata.local._entry import LocalDatasetEntry
11
+ from atdata.local._schema import (
12
+ SchemaNamespace,
13
+ LocalSchemaRecord,
14
+ _schema_ref_from_type,
15
+ _make_schema_ref,
16
+ _parse_schema_ref,
17
+ _increment_patch,
18
+ _build_schema_record,
19
+ )
20
+
21
+ from pathlib import Path
22
+ from typing import (
23
+ Any,
24
+ Type,
25
+ TypeVar,
26
+ Generator,
27
+ TYPE_CHECKING,
28
+ )
29
+ from redis import Redis
30
+ import json
31
+
32
+ if TYPE_CHECKING:
33
+ from atdata.providers._base import IndexProvider
34
+ from atdata.repository import Repository, _AtmosphereBackend
35
+ from atdata._protocols import IndexEntry
36
+
37
+ T = TypeVar("T", bound=Packable)
38
+
39
+
40
+ class Index:
41
+ """Unified index for tracking datasets across multiple repositories.
42
+
43
+ Implements the AbstractIndex protocol. Maintains a registry of
44
+ dataset entries across a built-in ``"local"`` repository, optional
45
+ named repositories, and an optional atmosphere (ATProto) backend.
46
+
47
+ The ``"local"`` repository is always present and uses the storage backend
48
+ determined by the ``provider`` argument. When no provider is given, defaults
49
+ to SQLite (zero external dependencies). Pass a ``redis`` connection or
50
+ Redis ``**kwargs`` for backwards-compatible Redis behaviour.
51
+
52
+ Additional named repositories can be mounted via the ``repos`` parameter,
53
+ each pairing an IndexProvider with an optional data store.
54
+
55
+ An AtmosphereClient is available by default for anonymous read-only
56
+ resolution of ``@handle/dataset`` paths. Pass an authenticated client
57
+ for write operations, or ``atmosphere=None`` to disable.
58
+
59
+ Attributes:
60
+ _provider: IndexProvider for the built-in ``"local"`` repository.
61
+ _data_store: Optional AbstractDataStore for the local repository.
62
+ _repos: Named repositories beyond ``"local"``.
63
+ _atmosphere: Optional atmosphere backend for ATProto operations.
64
+ """
65
+
66
+ ##
67
+
68
+ # Sentinel for default atmosphere behaviour (lazy anonymous client)
69
+ _ATMOSPHERE_DEFAULT = object()
70
+
71
    def __init__(
        self,
        provider: IndexProvider | str | None = None,
        *,
        path: str | Path | None = None,
        dsn: str | None = None,
        redis: Redis | None = None,
        data_store: AbstractDataStore | None = None,
        repos: dict[str, Repository] | None = None,
        atmosphere: Any | None = _ATMOSPHERE_DEFAULT,
        auto_stubs: bool = False,
        stub_dir: Path | str | None = None,
        **kwargs,
    ) -> None:
        """Initialize an index.

        Args:
            provider: Storage backend for the ``"local"`` repository.
                Accepts an ``IndexProvider`` instance or a backend name
                string (``"sqlite"``, ``"redis"``, or ``"postgres"``).
                When ``None``, falls back to *redis* / *kwargs* if given,
                otherwise defaults to SQLite.
            path: Database file path (SQLite only). Ignored unless
                *provider* is ``"sqlite"``.
            dsn: PostgreSQL connection string. Required when *provider*
                is ``"postgres"``.
            redis: Redis connection to use (backwards-compat shorthand for
                ``RedisProvider(redis)``). Ignored when *provider* is given.
            data_store: Optional data store for writing dataset shards in the
                ``"local"`` repository. If provided, ``insert_dataset()`` will
                write shards to this store. If None, only indexes existing URLs.
            repos: Named repositories to mount alongside ``"local"``. Keys are
                repository names (e.g. ``"lab"``, ``"shared"``). The name
                ``"local"`` is reserved for the built-in repository.
            atmosphere: ATProto client for distributed network operations.
                - Default (sentinel): creates an anonymous read-only client
                  lazily on first access.
                - ``AtmosphereClient`` instance: uses that client directly.
                - ``None``: disables atmosphere backend entirely.
            auto_stubs: If True, automatically generate .pyi stub files when
                schemas are accessed via get_schema() or decode_schema().
                This enables IDE autocomplete for dynamically decoded types.
            stub_dir: Directory to write stub files. Only used if auto_stubs
                is True or if this parameter is provided (which implies auto_stubs).
                Defaults to ~/.atdata/stubs/ if not specified.
            **kwargs: Additional arguments passed to Redis() constructor when
                *redis* is not given. If any kwargs are provided (without an
                explicit *provider*), Redis is used instead of the SQLite default.

        Raises:
            TypeError: If provider is not an IndexProvider or valid string.
            ValueError: If repos contains the reserved name ``"local"``.

        Examples:
            >>> # Default: local SQLite + anonymous atmosphere
            >>> index = Index()
            >>>
            >>> # SQLite with explicit path
            >>> index = Index(provider="sqlite", path="~/.atdata/index.db")
            >>>
            >>> # Redis
            >>> index = Index(redis=redis_conn)
            >>>
            >>> # PostgreSQL
            >>> index = Index(provider="postgres", dsn="postgresql://user:pass@host/db")
            >>>
            >>> # Multiple repositories
            >>> from atdata.repository import Repository, create_repository
            >>> index = Index(
            ...     provider="sqlite",
            ...     repos={
            ...         "lab": create_repository("sqlite", path="/data/lab.db"),
            ...     },
            ... )
        """
        ##

        # Imported lazily so merely importing this module does not pull in
        # every provider backend.
        from atdata.providers._base import IndexProvider as _IP

        # Provider resolution precedence: explicit string name > explicit
        # IndexProvider instance > redis connection > bare Redis kwargs >
        # zero-dependency SQLite default.
        if isinstance(provider, str):
            # String-based provider selection
            from atdata.providers._factory import create_provider

            self._provider: _IP = create_provider(
                provider, path=path, dsn=dsn, redis=redis, **kwargs
            )
        elif provider is not None:
            if not isinstance(provider, _IP):
                raise TypeError(
                    f"provider must be an IndexProvider or backend name string, "
                    f"got {type(provider).__name__}"
                )
            self._provider = provider
        elif redis is not None:
            # Explicit Redis connection provided
            from atdata.providers._redis import RedisProvider

            self._provider = RedisProvider(redis)
        elif kwargs:
            # kwargs provided -- assume Redis constructor args for compat
            from atdata.providers._redis import RedisProvider

            self._provider = RedisProvider(Redis(**kwargs))
        else:
            # Default: zero-dependency SQLite
            from atdata.providers._sqlite import SqliteProvider

            self._provider = SqliteProvider()

        self._data_store = data_store

        # Validate and store named repositories
        from atdata.repository import Repository as _Repo

        if repos is not None:
            if "local" in repos:
                raise ValueError(
                    '"local" is reserved for the built-in repository. '
                    "Use a different name for your repository."
                )
            for name, repo in repos.items():
                if not isinstance(repo, _Repo):
                    raise TypeError(
                        f"repos[{name!r}] must be a Repository, "
                        f"got {type(repo).__name__}"
                    )
            # Copy to decouple from the caller's dict.
            self._repos: dict[str, _Repo] = dict(repos)
        else:
            self._repos = {}

        # Atmosphere backend (lazy or explicit). The sentinel default means
        # "create an anonymous client on first access" (see _get_atmosphere).
        from atdata.repository import _AtmosphereBackend

        if atmosphere is Index._ATMOSPHERE_DEFAULT:
            # Deferred: create anonymous client on first use
            self._atmosphere: _AtmosphereBackend | None = None
            self._atmosphere_deferred = True
        elif atmosphere is None:
            self._atmosphere = None
            self._atmosphere_deferred = False
        else:
            self._atmosphere = _AtmosphereBackend(atmosphere)
            self._atmosphere_deferred = False

        # Initialize stub manager if auto-stubs enabled
        # Providing stub_dir implies auto_stubs=True
        if auto_stubs or stub_dir is not None:
            from atdata._stub_manager import StubManager

            self._stub_manager: StubManager | None = StubManager(stub_dir=stub_dir)
        else:
            self._stub_manager = None

        # Initialize schema namespace for load_schema/schemas API
        self._schema_namespace = SchemaNamespace()
226
+
227
+ # -- Repository access --
228
+
229
+ def _get_atmosphere(self) -> "_AtmosphereBackend | None":
230
+ """Get the atmosphere backend, lazily creating anonymous client if needed."""
231
+ if self._atmosphere_deferred and self._atmosphere is None:
232
+ try:
233
+ from atdata.atmosphere.client import AtmosphereClient
234
+ from atdata.repository import _AtmosphereBackend
235
+
236
+ client = AtmosphereClient()
237
+ self._atmosphere = _AtmosphereBackend(client)
238
+ except ImportError:
239
+ # atproto package not installed -- atmosphere unavailable
240
+ self._atmosphere_deferred = False
241
+ return None
242
+ return self._atmosphere
243
+
244
+ def _resolve_prefix(self, ref: str) -> tuple[str, str, str | None]:
245
+ """Route a dataset/schema reference to the correct backend.
246
+
247
+ Returns:
248
+ Tuple of ``(backend_key, resolved_ref, handle_or_did)``.
249
+
250
+ - ``backend_key``: ``"local"``, a named repository, or
251
+ ``"_atmosphere"``.
252
+ - ``resolved_ref``: The dataset/schema name or AT URI to pass
253
+ to the backend.
254
+ - ``handle_or_did``: Populated only for atmosphere paths.
255
+ """
256
+ # AT URIs go to atmosphere
257
+ if ref.startswith("at://"):
258
+ return ("_atmosphere", ref, None)
259
+
260
+ # @ prefix -> atmosphere
261
+ if ref.startswith("@"):
262
+ rest = ref[1:]
263
+ parts = rest.split("/", 1)
264
+ if len(parts) == 2:
265
+ return ("_atmosphere", parts[1], parts[0])
266
+ return ("_atmosphere", rest, None)
267
+
268
+ # atdata:// full URI
269
+ if ref.startswith("atdata://"):
270
+ path = ref[len("atdata://") :]
271
+ parts = path.split("/")
272
+ # atdata://mount/collection/name or atdata://mount/name
273
+ repo_name = parts[0]
274
+ dataset_name = parts[-1]
275
+ if repo_name == "local" or repo_name in self._repos:
276
+ return (repo_name, dataset_name, None)
277
+ # Unknown prefix -- might be an atmosphere handle
278
+ return ("_atmosphere", dataset_name, repo_name)
279
+
280
+ # prefix/name where prefix is a known repository
281
+ if "/" in ref:
282
+ prefix, rest = ref.split("/", 1)
283
+ if prefix == "local":
284
+ return ("local", rest, None)
285
+ if prefix in self._repos:
286
+ return (prefix, rest, None)
287
+
288
+ # Bare name -> local repository
289
+ return ("local", ref, None)
290
+
291
+ @property
292
+ def repos(self) -> dict[str, Repository]:
293
+ """Named repositories mounted on this index (excluding ``"local"``)."""
294
+ return dict(self._repos)
295
+
296
+ @property
297
+ def atmosphere(self) -> Any:
298
+ """The AtmosphereClient for this index, or None if disabled.
299
+
300
+ Returns the underlying client (not the internal backend wrapper).
301
+ """
302
+ backend = self._get_atmosphere()
303
+ if backend is not None:
304
+ return backend.client
305
+ return None
306
+
307
    @property
    def provider(self) -> "IndexProvider":  # noqa: F821
        """The IndexProvider storage backend serving the built-in ``"local"`` repository."""
        return self._provider
311
+
312
+ @property
313
+ def _redis(self) -> Redis:
314
+ """Backwards-compatible access to the underlying Redis connection.
315
+
316
+ Raises:
317
+ AttributeError: If the current provider is not Redis-backed.
318
+ """
319
+ from atdata.providers._redis import RedisProvider
320
+
321
+ if isinstance(self._provider, RedisProvider):
322
+ return self._provider.redis
323
+ raise AttributeError(
324
+ "Index._redis is only available with a Redis provider. "
325
+ "Use index.provider instead."
326
+ )
327
+
328
    @property
    def data_store(self) -> AbstractDataStore | None:
        """The data store used for writing shards, or None if this index only records URLs."""
        return self._data_store
332
+
333
+ @property
334
+ def stub_dir(self) -> Path | None:
335
+ """Directory where stub files are written, or None if auto-stubs disabled.
336
+
337
+ Use this path to configure your IDE for type checking support:
338
+ - VS Code/Pylance: Add to python.analysis.extraPaths in settings.json
339
+ - PyCharm: Mark as Sources Root
340
+ - mypy: Add to mypy_path in mypy.ini
341
+ """
342
+ if self._stub_manager is not None:
343
+ return self._stub_manager.stub_dir
344
+ return None
345
+
346
    @property
    def types(self) -> SchemaNamespace:
        """Namespace for accessing loaded schema types.

        After calling :meth:`load_schema`, schema types become available
        as attributes on this namespace.

        Examples:
            >>> index.load_schema("atdata://local/sampleSchema/MySample@1.0.0")
            >>> MyType = index.types.MySample
            >>> sample = MyType(name="hello", value=42)

        Returns:
            SchemaNamespace containing all loaded schema types.
        """
        return self._schema_namespace
362
+
363
+ def load_schema(self, ref: str) -> Type[Packable]:
364
+ """Load a schema and make it available in the types namespace.
365
+
366
+ This method decodes the schema, optionally generates a Python module
367
+ for IDE support (if auto_stubs is enabled), and registers the type
368
+ in the :attr:`types` namespace for easy access.
369
+
370
+ Args:
371
+ ref: Schema reference string (atdata://local/sampleSchema/... or
372
+ legacy local://schemas/...).
373
+
374
+ Returns:
375
+ The decoded PackableSample subclass. Also available via
376
+ ``index.types.<ClassName>`` after this call.
377
+
378
+ Raises:
379
+ KeyError: If schema not found.
380
+ ValueError: If schema cannot be decoded.
381
+
382
+ Examples:
383
+ >>> # Load and use immediately
384
+ >>> MyType = index.load_schema("atdata://local/sampleSchema/MySample@1.0.0")
385
+ >>> sample = MyType(field1="hello", field2=42)
386
+ >>>
387
+ >>> # Or access later via namespace
388
+ >>> index.load_schema("atdata://local/sampleSchema/OtherType@1.0.0")
389
+ >>> other = index.types.OtherType(data="test")
390
+ """
391
+ # Decode the schema (uses generated module if auto_stubs enabled)
392
+ cls = self.decode_schema(ref)
393
+
394
+ # Register in namespace using the class name
395
+ self._schema_namespace._register(cls.__name__, cls)
396
+
397
+ return cls
398
+
399
+ def get_import_path(self, ref: str) -> str | None:
400
+ """Get the import path for a schema's generated module.
401
+
402
+ When auto_stubs is enabled, this returns the import path that can
403
+ be used to import the schema type with full IDE support.
404
+
405
+ Args:
406
+ ref: Schema reference string.
407
+
408
+ Returns:
409
+ Import path like "local.MySample_1_0_0", or None if auto_stubs
410
+ is disabled.
411
+
412
+ Examples:
413
+ >>> index = Index(auto_stubs=True)
414
+ >>> ref = index.publish_schema(MySample, version="1.0.0")
415
+ >>> index.load_schema(ref)
416
+ >>> print(index.get_import_path(ref))
417
+ local.MySample_1_0_0
418
+ >>> # Then in your code:
419
+ >>> # from local.MySample_1_0_0 import MySample
420
+ """
421
+ if self._stub_manager is None:
422
+ return None
423
+
424
+ from atdata._stub_manager import _extract_authority
425
+
426
+ name, version = _parse_schema_ref(ref)
427
+ schema_dict = self.get_schema(ref)
428
+ authority = _extract_authority(schema_dict.get("$ref"))
429
+
430
+ safe_version = version.replace(".", "_")
431
+ module_name = f"{name}_{safe_version}"
432
+
433
+ return f"{authority}.{module_name}"
434
+
435
+ def list_entries(self) -> list[LocalDatasetEntry]:
436
+ """Get all index entries as a materialized list.
437
+
438
+ Returns:
439
+ List of all LocalDatasetEntry objects in the index.
440
+ """
441
+ return list(self.entries)
442
+
443
+ # Legacy alias for backwards compatibility
444
+ @property
445
+ def all_entries(self) -> list[LocalDatasetEntry]:
446
+ """Get all index entries as a list (deprecated, use list_entries())."""
447
+ return self.list_entries()
448
+
449
+ @property
450
+ def entries(self) -> Generator[LocalDatasetEntry, None, None]:
451
+ """Iterate over all index entries.
452
+
453
+ Yields:
454
+ LocalDatasetEntry objects from the index.
455
+ """
456
+ yield from self._provider.iter_entries()
457
+
458
    def add_entry(
        self,
        ds: Dataset,
        *,
        name: str,
        schema_ref: str | None = None,
        metadata: dict | None = None,
    ) -> LocalDatasetEntry:
        """Add a dataset to the local repository index.

        Thin wrapper around ``_insert_dataset_to_provider`` targeting the
        built-in local provider with no data store, so only the dataset's
        existing URL is indexed (no shards are written).

        Args:
            ds: The dataset to add to the index.
            name: Human-readable name for the dataset.
            schema_ref: Optional schema reference. If None, generates from sample type.
            metadata: Optional metadata dictionary. If None, uses ds._metadata if available.

        Returns:
            The created LocalDatasetEntry object.
        """
        return self._insert_dataset_to_provider(
            ds,
            name=name,
            schema_ref=schema_ref,
            provider=self._provider,
            store=None,  # index-only: never writes shards
            metadata=metadata,
        )
485
+
486
    def get_entry(self, cid: str) -> LocalDatasetEntry:
        """Get an entry by its CID (delegates to the local provider).

        Args:
            cid: Content identifier of the entry.

        Returns:
            LocalDatasetEntry for the given CID.

        Raises:
            KeyError: If entry not found.
        """
        return self._provider.get_entry_by_cid(cid)
499
+
500
    def get_entry_by_name(self, name: str) -> LocalDatasetEntry:
        """Get an entry by its human-readable name (delegates to the local provider).

        Args:
            name: Human-readable name of the entry.

        Returns:
            LocalDatasetEntry with the given name.

        Raises:
            KeyError: If no entry with that name exists.
        """
        return self._provider.get_entry_by_name(name)
513
+
514
+ # AbstractIndex protocol methods
515
+
516
+ def _insert_dataset_to_provider(
517
+ self,
518
+ ds: Dataset,
519
+ *,
520
+ name: str,
521
+ schema_ref: str | None = None,
522
+ provider: "IndexProvider", # noqa: F821
523
+ store: AbstractDataStore | None = None,
524
+ **kwargs,
525
+ ) -> LocalDatasetEntry:
526
+ """Insert a dataset into a specific provider/store pair.
527
+
528
+ This is the internal implementation shared by all local and named
529
+ repository inserts.
530
+ """
531
+ metadata = kwargs.get("metadata")
532
+
533
+ if store is not None:
534
+ prefix = kwargs.get("prefix", name)
535
+ cache_local = kwargs.get("cache_local", False)
536
+
537
+ written_urls = store.write_shards(
538
+ ds,
539
+ prefix=prefix,
540
+ cache_local=cache_local,
541
+ )
542
+
543
+ if schema_ref is None:
544
+ schema_ref = _schema_ref_from_type(ds.sample_type, version="1.0.0")
545
+
546
+ entry_metadata = metadata if metadata is not None else ds._metadata
547
+ entry = LocalDatasetEntry(
548
+ name=name,
549
+ schema_ref=schema_ref,
550
+ data_urls=written_urls,
551
+ metadata=entry_metadata,
552
+ )
553
+ provider.store_entry(entry)
554
+ return entry
555
+
556
+ # No data store - just index the existing URL
557
+ if schema_ref is None:
558
+ schema_ref = _schema_ref_from_type(ds.sample_type, version="1.0.0")
559
+
560
+ data_urls = [ds.url]
561
+ entry_metadata = metadata if metadata is not None else ds._metadata
562
+
563
+ entry = LocalDatasetEntry(
564
+ name=name,
565
+ schema_ref=schema_ref,
566
+ data_urls=data_urls,
567
+ metadata=entry_metadata,
568
+ )
569
+ provider.store_entry(entry)
570
+ return entry
571
+
572
    def insert_dataset(
        self,
        ds: Dataset,
        *,
        name: str,
        schema_ref: str | None = None,
        **kwargs,
    ) -> "IndexEntry":
        """Insert a dataset into the index (AbstractIndex protocol).

        The target repository is determined by a prefix in the ``name``
        argument (e.g. ``"lab/mnist"``). If no prefix is given, or the
        prefix is ``"local"``, the built-in local repository is used.

        If the target repository has a data_store, shards are written to
        storage first, then indexed. Otherwise, the dataset's existing URL
        is indexed directly.

        Args:
            ds: The Dataset to register.
            name: Human-readable name for the dataset, optionally prefixed
                with a repository name (e.g. ``"lab/mnist"``).
            schema_ref: Optional schema reference.
            **kwargs: Additional options:
                - metadata: Optional metadata dict
                - prefix: Storage prefix (default: dataset name)
                - cache_local: If True, cache writes locally first

        Returns:
            IndexEntry for the inserted dataset.

        Raises:
            ValueError: If the name routes to atmosphere but no backend is available.
            KeyError: If the name's repository prefix is unknown.
        """
        # Route by prefix: "_atmosphere", "local", or a named repository.
        backend_key, resolved_name, handle_or_did = self._resolve_prefix(name)

        if backend_key == "_atmosphere":
            atmo = self._get_atmosphere()
            if atmo is None:
                raise ValueError(
                    f"Atmosphere backend required for name {name!r} but not available."
                )
            return atmo.insert_dataset(
                ds, name=resolved_name, schema_ref=schema_ref, **kwargs
            )

        if backend_key == "local":
            # Built-in repository: use this index's own provider/store pair.
            return self._insert_dataset_to_provider(
                ds,
                name=resolved_name,
                schema_ref=schema_ref,
                provider=self._provider,
                store=self._data_store,
                **kwargs,
            )

        # Named repository
        repo = self._repos.get(backend_key)
        if repo is None:
            raise KeyError(f"Unknown repository {backend_key!r} in name {name!r}")
        return self._insert_dataset_to_provider(
            ds,
            name=resolved_name,
            schema_ref=schema_ref,
            provider=repo.provider,
            store=repo.data_store,
            **kwargs,
        )
637
+
638
+ def get_dataset(self, ref: str) -> "IndexEntry":
639
+ """Get a dataset entry by name or prefixed reference.
640
+
641
+ Supports repository-prefixed lookups (e.g. ``"lab/mnist"``),
642
+ atmosphere paths (``"@handle/dataset"``), AT URIs, and bare names
643
+ (which default to the ``"local"`` repository).
644
+
645
+ Args:
646
+ ref: Dataset name, prefixed name, or AT URI.
647
+
648
+ Returns:
649
+ IndexEntry for the dataset.
650
+
651
+ Raises:
652
+ KeyError: If dataset not found.
653
+ ValueError: If the atmosphere backend is required but unavailable.
654
+ """
655
+ backend_key, resolved_ref, handle_or_did = self._resolve_prefix(ref)
656
+
657
+ if backend_key == "_atmosphere":
658
+ atmo = self._get_atmosphere()
659
+ if atmo is None:
660
+ raise ValueError(
661
+ f"Atmosphere backend required for path {ref!r} but not available. "
662
+ "Install 'atproto' or pass an AtmosphereClient."
663
+ )
664
+ return atmo.get_dataset(resolved_ref)
665
+
666
+ if backend_key == "local":
667
+ return self._provider.get_entry_by_name(resolved_ref)
668
+
669
+ # Named repository
670
+ repo = self._repos.get(backend_key)
671
+ if repo is None:
672
+ raise KeyError(f"Unknown repository {backend_key!r} in ref {ref!r}")
673
+ return repo.provider.get_entry_by_name(resolved_ref)
674
+
675
+ @property
676
+ def datasets(self) -> Generator["IndexEntry", None, None]:
677
+ """Lazily iterate over all dataset entries across local repositories.
678
+
679
+ Yields entries from the ``"local"`` repository and all named
680
+ repositories. Atmosphere entries are not included (use
681
+ ``list_datasets(repo="_atmosphere")`` for those).
682
+
683
+ Yields:
684
+ IndexEntry for each dataset.
685
+ """
686
+ yield from self._provider.iter_entries()
687
+ for repo in self._repos.values():
688
+ yield from repo.provider.iter_entries()
689
+
690
+ def list_datasets(self, repo: str | None = None) -> list["IndexEntry"]:
691
+ """Get dataset entries as a materialized list (AbstractIndex protocol).
692
+
693
+ Args:
694
+ repo: Optional repository filter. If ``None``, aggregates entries
695
+ from ``"local"`` and all named repositories. Use ``"local"``
696
+ for only the built-in repository, a named repo key, or
697
+ ``"_atmosphere"`` for atmosphere entries.
698
+
699
+ Returns:
700
+ List of IndexEntry for each dataset.
701
+ """
702
+ if repo is None:
703
+ return list(self.datasets)
704
+
705
+ if repo == "local":
706
+ return self.list_entries()
707
+
708
+ if repo == "_atmosphere":
709
+ atmo = self._get_atmosphere()
710
+ if atmo is None:
711
+ return []
712
+ return atmo.list_datasets()
713
+
714
+ named = self._repos.get(repo)
715
+ if named is None:
716
+ raise KeyError(f"Unknown repository {repo!r}")
717
+ return list(named.provider.iter_entries())
718
+
719
+ # Schema operations
720
+
721
    def _get_latest_schema_version(self, name: str) -> str | None:
        """Get the latest published version for a schema by name, or None if not found."""
        return self._provider.find_latest_version(name)
724
+
725
+ def publish_schema(
726
+ self,
727
+ sample_type: type,
728
+ *,
729
+ version: str | None = None,
730
+ description: str | None = None,
731
+ ) -> str:
732
+ """Publish a schema for a sample type to Redis.
733
+
734
+ Args:
735
+ sample_type: A Packable type (@packable-decorated or PackableSample subclass).
736
+ version: Semantic version string (e.g., '1.0.0'). If None,
737
+ auto-increments from the latest published version (patch bump),
738
+ or starts at '1.0.0' if no previous version exists.
739
+ description: Optional human-readable description. If None, uses
740
+ the class docstring.
741
+
742
+ Returns:
743
+ Schema reference string: 'atdata://local/sampleSchema/{name}@{version}'.
744
+
745
+ Raises:
746
+ ValueError: If sample_type is not a dataclass.
747
+ TypeError: If sample_type doesn't satisfy the Packable protocol,
748
+ or if a field type is not supported.
749
+ """
750
+ # Validate that sample_type satisfies Packable protocol at runtime
751
+ # This catches non-packable types early with a clear error message
752
+ try:
753
+ # Check protocol compliance by verifying required methods exist
754
+ if not (
755
+ hasattr(sample_type, "from_data")
756
+ and hasattr(sample_type, "from_bytes")
757
+ and callable(getattr(sample_type, "from_data", None))
758
+ and callable(getattr(sample_type, "from_bytes", None))
759
+ ):
760
+ raise TypeError(
761
+ f"{sample_type.__name__} does not satisfy the Packable protocol. "
762
+ "Use @packable decorator or inherit from PackableSample."
763
+ )
764
+ except AttributeError:
765
+ raise TypeError(
766
+ f"sample_type must be a class, got {type(sample_type).__name__}"
767
+ )
768
+
769
+ # Auto-increment version if not specified
770
+ if version is None:
771
+ latest = self._get_latest_schema_version(sample_type.__name__)
772
+ if latest is None:
773
+ version = "1.0.0"
774
+ else:
775
+ version = _increment_patch(latest)
776
+
777
+ schema_record = _build_schema_record(
778
+ sample_type,
779
+ version=version,
780
+ description=description,
781
+ )
782
+
783
+ schema_ref = _schema_ref_from_type(sample_type, version)
784
+ name, _ = _parse_schema_ref(schema_ref)
785
+
786
+ # Store via provider
787
+ schema_json = json.dumps(schema_record)
788
+ self._provider.store_schema(name, version, schema_json)
789
+
790
+ return schema_ref
791
+
792
+ def get_schema(self, ref: str) -> dict:
793
+ """Get a schema record by reference (AbstractIndex protocol).
794
+
795
+ Args:
796
+ ref: Schema reference string. Supports both new format
797
+ (atdata://local/sampleSchema/{name}@{version}) and legacy
798
+ format (local://schemas/{module.Class}@{version}).
799
+
800
+ Returns:
801
+ Schema record as a dictionary with keys 'name', 'version',
802
+ 'fields', '$ref', etc.
803
+
804
+ Raises:
805
+ KeyError: If schema not found.
806
+ ValueError: If reference format is invalid.
807
+ """
808
+ name, version = _parse_schema_ref(ref)
809
+
810
+ schema_json = self._provider.get_schema_json(name, version)
811
+ if schema_json is None:
812
+ raise KeyError(f"Schema not found: {ref}")
813
+
814
+ schema = json.loads(schema_json)
815
+ schema["$ref"] = _make_schema_ref(name, version)
816
+
817
+ # Auto-generate stub if enabled
818
+ if self._stub_manager is not None:
819
+ self._stub_manager.ensure_stub(schema)
820
+
821
+ return schema
822
+
823
+ def get_schema_record(self, ref: str) -> LocalSchemaRecord:
824
+ """Get a schema record as LocalSchemaRecord object.
825
+
826
+ Use this when you need the full LocalSchemaRecord with typed properties.
827
+ For Protocol-compliant dict access, use get_schema() instead.
828
+
829
+ Args:
830
+ ref: Schema reference string.
831
+
832
+ Returns:
833
+ LocalSchemaRecord with schema details.
834
+
835
+ Raises:
836
+ KeyError: If schema not found.
837
+ ValueError: If reference format is invalid.
838
+ """
839
+ schema = self.get_schema(ref)
840
+ return LocalSchemaRecord.from_dict(schema)
841
+
842
+ @property
843
+ def schemas(self) -> Generator[LocalSchemaRecord, None, None]:
844
+ """Iterate over all schema records in this index.
845
+
846
+ Yields:
847
+ LocalSchemaRecord for each schema.
848
+ """
849
+ for name, version, schema_json in self._provider.iter_schemas():
850
+ schema = json.loads(schema_json)
851
+ schema["$ref"] = _make_schema_ref(name, version)
852
+ yield LocalSchemaRecord.from_dict(schema)
853
+
854
    def list_schemas(self) -> list[dict]:
        """Get all schema records as a materialized list (AbstractIndex protocol).

        Returns:
            List of schema records as dictionaries (see :attr:`schemas` for
            the typed LocalSchemaRecord form).
        """
        return [record.to_dict() for record in self.schemas]
861
+
862
+ def decode_schema(self, ref: str) -> Type[Packable]:
863
+ """Reconstruct a Python PackableSample type from a stored schema.
864
+
865
+ This method enables loading datasets without knowing the sample type
866
+ ahead of time. The index retrieves the schema record and dynamically
867
+ generates a PackableSample subclass matching the schema definition.
868
+
869
+ If auto_stubs is enabled, a Python module will be generated and the
870
+ class will be imported from it, providing full IDE autocomplete support.
871
+ The returned class has proper type information that IDEs can understand.
872
+
873
+ Args:
874
+ ref: Schema reference string (atdata://local/sampleSchema/... or
875
+ legacy local://schemas/...).
876
+
877
+ Returns:
878
+ A PackableSample subclass - either imported from a generated module
879
+ (if auto_stubs is enabled) or dynamically created.
880
+
881
+ Raises:
882
+ KeyError: If schema not found.
883
+ ValueError: If schema cannot be decoded.
884
+ """
885
+ schema_dict = self.get_schema(ref)
886
+
887
+ # If auto_stubs is enabled, generate module and import class from it
888
+ if self._stub_manager is not None:
889
+ cls = self._stub_manager.ensure_module(schema_dict)
890
+ if cls is not None:
891
+ return cls
892
+
893
+ # Fall back to dynamic type generation
894
+ from atdata._schema_codec import schema_to_type
895
+
896
+ return schema_to_type(schema_dict)
897
+
898
+ def decode_schema_as(self, ref: str, type_hint: type[T]) -> type[T]:
899
+ """Decode a schema with explicit type hint for IDE support.
900
+
901
+ This is a typed wrapper around decode_schema() that preserves the
902
+ type information for IDE autocomplete. Use this when you have a
903
+ stub file for the schema and want full IDE support.
904
+
905
+ Args:
906
+ ref: Schema reference string.
907
+ type_hint: The stub type to use for type hints. Import this from
908
+ the generated stub file.
909
+
910
+ Returns:
911
+ The decoded type, cast to match the type_hint for IDE support.
912
+
913
+ Examples:
914
+ >>> # After enabling auto_stubs and configuring IDE extraPaths:
915
+ >>> from local.MySample_1_0_0 import MySample
916
+ >>>
917
+ >>> # This gives full IDE autocomplete:
918
+ >>> DecodedType = index.decode_schema_as(ref, MySample)
919
+ >>> sample = DecodedType(text="hello", value=42) # IDE knows signature!
920
+
921
+ Note:
922
+ The type_hint is only used for static type checking - at runtime,
923
+ the actual decoded type from the schema is returned. Ensure the
924
+ stub matches the schema to avoid runtime surprises.
925
+ """
926
+ from typing import cast
927
+
928
+ return cast(type[T], self.decode_schema(ref))
929
+
930
+ def clear_stubs(self) -> int:
931
+ """Remove all auto-generated stub files.
932
+
933
+ Only works if auto_stubs was enabled when creating the Index.
934
+
935
+ Returns:
936
+ Number of stub files removed, or 0 if auto_stubs is disabled.
937
+ """
938
+ if self._stub_manager is not None:
939
+ return self._stub_manager.clear_stubs()
940
+ return 0