atdata 0.2.3b1__py3-none-any.whl → 0.3.1b1__py3-none-any.whl

This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as published.
Files changed (68)
  1. atdata/.gitignore +1 -0
  2. atdata/__init__.py +39 -0
  3. atdata/_cid.py +0 -21
  4. atdata/_exceptions.py +168 -0
  5. atdata/_helpers.py +41 -15
  6. atdata/_hf_api.py +95 -11
  7. atdata/_logging.py +70 -0
  8. atdata/_protocols.py +77 -238
  9. atdata/_schema_codec.py +7 -6
  10. atdata/_stub_manager.py +5 -25
  11. atdata/_type_utils.py +28 -2
  12. atdata/atmosphere/__init__.py +31 -20
  13. atdata/atmosphere/_types.py +4 -4
  14. atdata/atmosphere/client.py +64 -12
  15. atdata/atmosphere/lens.py +11 -12
  16. atdata/atmosphere/records.py +12 -12
  17. atdata/atmosphere/schema.py +16 -18
  18. atdata/atmosphere/store.py +6 -7
  19. atdata/cli/__init__.py +161 -175
  20. atdata/cli/diagnose.py +2 -2
  21. atdata/cli/{local.py → infra.py} +11 -11
  22. atdata/cli/inspect.py +69 -0
  23. atdata/cli/preview.py +63 -0
  24. atdata/cli/schema.py +109 -0
  25. atdata/dataset.py +583 -328
  26. atdata/index/__init__.py +54 -0
  27. atdata/index/_entry.py +157 -0
  28. atdata/index/_index.py +1198 -0
  29. atdata/index/_schema.py +380 -0
  30. atdata/lens.py +9 -2
  31. atdata/lexicons/__init__.py +121 -0
  32. atdata/lexicons/ac.foundation.dataset.arrayFormat.json +16 -0
  33. atdata/lexicons/ac.foundation.dataset.getLatestSchema.json +78 -0
  34. atdata/lexicons/ac.foundation.dataset.lens.json +99 -0
  35. atdata/lexicons/ac.foundation.dataset.record.json +96 -0
  36. atdata/lexicons/ac.foundation.dataset.schema.json +107 -0
  37. atdata/lexicons/ac.foundation.dataset.schemaType.json +16 -0
  38. atdata/lexicons/ac.foundation.dataset.storageBlobs.json +24 -0
  39. atdata/lexicons/ac.foundation.dataset.storageExternal.json +25 -0
  40. atdata/lexicons/ndarray_shim.json +16 -0
  41. atdata/local/__init__.py +70 -0
  42. atdata/local/_repo_legacy.py +218 -0
  43. atdata/manifest/__init__.py +28 -0
  44. atdata/manifest/_aggregates.py +156 -0
  45. atdata/manifest/_builder.py +163 -0
  46. atdata/manifest/_fields.py +154 -0
  47. atdata/manifest/_manifest.py +146 -0
  48. atdata/manifest/_query.py +150 -0
  49. atdata/manifest/_writer.py +74 -0
  50. atdata/promote.py +18 -14
  51. atdata/providers/__init__.py +25 -0
  52. atdata/providers/_base.py +140 -0
  53. atdata/providers/_factory.py +69 -0
  54. atdata/providers/_postgres.py +214 -0
  55. atdata/providers/_redis.py +171 -0
  56. atdata/providers/_sqlite.py +191 -0
  57. atdata/repository.py +323 -0
  58. atdata/stores/__init__.py +23 -0
  59. atdata/stores/_disk.py +123 -0
  60. atdata/stores/_s3.py +349 -0
  61. atdata/testing.py +341 -0
  62. {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/METADATA +5 -2
  63. atdata-0.3.1b1.dist-info/RECORD +67 -0
  64. atdata/local.py +0 -1720
  65. atdata-0.2.3b1.dist-info/RECORD +0 -28
  66. {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/WHEEL +0 -0
  67. {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/entry_points.txt +0 -0
  68. {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/licenses/LICENSE +0 -0
atdata/index/_index.py ADDED
@@ -0,0 +1,1198 @@
+"""Index class for local dataset management."""
+
+from __future__ import annotations
+
+from atdata import (
+    Dataset,
+)
+from atdata._protocols import AbstractDataStore, Packable
+
+from atdata.index._entry import LocalDatasetEntry
+from atdata.index._schema import (
+    SchemaNamespace,
+    LocalSchemaRecord,
+    _schema_ref_from_type,
+    _make_schema_ref,
+    _parse_schema_ref,
+    _increment_patch,
+    _build_schema_record,
+)
+
+from pathlib import Path
+from typing import (
+    Any,
+    Iterable,
+    Type,
+    TypeVar,
+    Generator,
+    TYPE_CHECKING,
+)
+from redis import Redis
+import json
+
+if TYPE_CHECKING:
+    from atdata.providers._base import IndexProvider
+    from atdata.repository import Repository, _AtmosphereBackend
+    from atdata._protocols import IndexEntry
+
+T = TypeVar("T", bound=Packable)
+
+
+class Index:
+    """Unified index for tracking datasets across multiple repositories.
+
+    Implements the AbstractIndex protocol. Maintains a registry of
+    dataset entries across named repositories (always including a built-in
+    ``"local"`` repository) and an optional atmosphere (ATProto) backend.
+
+    The ``"local"`` repository is always present and uses the storage backend
+    determined by the ``provider`` argument. When no provider is given, defaults
+    to SQLite (zero external dependencies). Pass a ``redis`` connection or
+    Redis ``**kwargs`` for backwards-compatible Redis behaviour.
+
+    Additional named repositories can be mounted via the ``repos`` parameter,
+    each pairing an IndexProvider with an optional data store.
+
+    An Atmosphere is available by default for anonymous read-only
+    resolution of ``@handle/dataset`` paths. Pass an authenticated client
+    for write operations, or ``atmosphere=None`` to disable.
+
+    Attributes:
+        _repos: All repositories keyed by name. ``"local"`` is always present.
+        _atmosphere: Optional atmosphere backend for ATProto operations.
+    """
+
+    ##
+
+    # Sentinel for default atmosphere behaviour (lazy anonymous client)
+    _ATMOSPHERE_DEFAULT = object()
+
+    def __init__(
+        self,
+        provider: IndexProvider | str | None = None,
+        *,
+        path: str | Path | None = None,
+        dsn: str | None = None,
+        redis: Redis | None = None,
+        data_store: AbstractDataStore | None = None,
+        repos: dict[str, Repository] | None = None,
+        atmosphere: Any | None = _ATMOSPHERE_DEFAULT,
+        auto_stubs: bool = False,
+        stub_dir: Path | str | None = None,
+        **kwargs,
+    ) -> None:
+        """Initialize an index.
+
+        Args:
+            provider: Storage backend for the ``"local"`` repository.
+                Accepts an ``IndexProvider`` instance or a backend name
+                string (``"sqlite"``, ``"redis"``, or ``"postgres"``).
+                When ``None``, falls back to *redis* / *kwargs* if given,
+                otherwise defaults to SQLite.
+            path: Database file path (SQLite only). Ignored unless
+                *provider* is ``"sqlite"``.
+            dsn: PostgreSQL connection string. Required when *provider*
+                is ``"postgres"``.
+            redis: Redis connection to use (backwards-compat shorthand for
+                ``RedisProvider(redis)``). Ignored when *provider* is given.
+            data_store: Optional data store for writing dataset shards in the
+                ``"local"`` repository. If provided, ``insert_dataset()`` will
+                write shards to this store. If None, only indexes existing URLs.
+            repos: Named repositories to mount alongside ``"local"``. Keys are
+                repository names (e.g. ``"lab"``, ``"shared"``). The name
+                ``"local"`` is reserved for the built-in repository.
+            atmosphere: ATProto client for distributed network operations.
+                - Default (sentinel): creates an anonymous read-only client
+                  lazily on first access.
+                - ``Atmosphere`` instance: uses that client directly.
+                - ``None``: disables atmosphere backend entirely.
+            auto_stubs: If True, automatically generate .pyi stub files when
+                schemas are accessed via get_schema() or decode_schema().
+                This enables IDE autocomplete for dynamically decoded types.
+            stub_dir: Directory to write stub files. Only used if auto_stubs
+                is True or if this parameter is provided (which implies auto_stubs).
+                Defaults to ~/.atdata/stubs/ if not specified.
+            **kwargs: Additional arguments passed to the Redis() constructor when
+                *redis* is not given. If any kwargs are provided (without an
+                explicit *provider*), Redis is used instead of the SQLite default.
+
+        Raises:
+            TypeError: If provider is not an IndexProvider or a valid backend name.
+            ValueError: If repos contains the reserved name ``"local"``.
+
+        Examples:
+            >>> # Default: local SQLite + anonymous atmosphere
+            >>> index = Index()
+            >>>
+            >>> # SQLite with explicit path
+            >>> index = Index(provider="sqlite", path="~/.atdata/index.db")
+            >>>
+            >>> # Redis
+            >>> index = Index(redis=redis_conn)
+            >>>
+            >>> # PostgreSQL
+            >>> index = Index(provider="postgres", dsn="postgresql://user:pass@host/db")
+            >>>
+            >>> # Multiple repositories
+            >>> from atdata.repository import Repository, create_repository
+            >>> index = Index(
+            ...     provider="sqlite",
+            ...     repos={
+            ...         "lab": create_repository("sqlite", path="/data/lab.db"),
+            ...     },
+            ... )
+        """
+        ##
+
+        from atdata.providers._base import IndexProvider as _IP
+        from atdata.repository import Repository as _Repo
+
+        # Resolve the local provider
+        if isinstance(provider, str):
+            from atdata.providers._factory import create_provider
+
+            local_provider: _IP = create_provider(
+                provider, path=path, dsn=dsn, redis=redis, **kwargs
+            )
+        elif provider is not None:
+            if not isinstance(provider, _IP):
+                raise TypeError(
+                    f"provider must be an IndexProvider or backend name string, "
+                    f"got {type(provider).__name__}"
+                )
+            local_provider = provider
+        elif redis is not None:
+            from atdata.providers._redis import RedisProvider
+
+            local_provider = RedisProvider(redis)
+        elif kwargs:
+            from atdata.providers._redis import RedisProvider
+
+            local_provider = RedisProvider(Redis(**kwargs))
+        else:
+            from atdata.providers._sqlite import SqliteProvider
+
+            local_provider = SqliteProvider()
+
+        # Build the unified repos dict with "local" always present
+        self._repos: dict[str, _Repo] = {
+            "local": _Repo(provider=local_provider, data_store=data_store),
+        }
+
+        if repos is not None:
+            if "local" in repos:
+                raise ValueError(
+                    '"local" is reserved for the built-in repository. '
+                    "Use a different name for your repository."
+                )
+            for name, repo in repos.items():
+                if not isinstance(repo, _Repo):
+                    raise TypeError(
+                        f"repos[{name!r}] must be a Repository, "
+                        f"got {type(repo).__name__}"
+                    )
+            self._repos.update(repos)
+
+        # Atmosphere backend (lazy or explicit)
+        from atdata.repository import _AtmosphereBackend
+
+        if atmosphere is Index._ATMOSPHERE_DEFAULT:
+            # Deferred: create anonymous client on first use
+            self._atmosphere: _AtmosphereBackend | None = None
+            self._atmosphere_deferred = True
+        elif atmosphere is None:
+            self._atmosphere = None
+            self._atmosphere_deferred = False
+        else:
+            self._atmosphere = _AtmosphereBackend(atmosphere)
+            self._atmosphere_deferred = False
+
+        # Initialize stub manager if auto-stubs enabled
+        # Providing stub_dir implies auto_stubs=True
+        if auto_stubs or stub_dir is not None:
+            from atdata._stub_manager import StubManager
+
+            self._stub_manager: StubManager | None = StubManager(stub_dir=stub_dir)
+        else:
+            self._stub_manager = None
+
+        # Initialize schema namespace for load_schema/schemas API
+        self._schema_namespace = SchemaNamespace()
+
+    # -- Repository access --
+
+    def _get_atmosphere(self) -> "_AtmosphereBackend | None":
+        """Get the atmosphere backend, lazily creating an anonymous client if needed."""
+        if self._atmosphere_deferred and self._atmosphere is None:
+            try:
+                from atdata.atmosphere.client import Atmosphere
+                from atdata.repository import _AtmosphereBackend
+
+                client = Atmosphere()
+                self._atmosphere = _AtmosphereBackend(client)
+            except ImportError:
+                # atproto package not installed -- atmosphere unavailable
+                self._atmosphere_deferred = False
+                return None
+        return self._atmosphere
+
+    def _resolve_prefix(self, ref: str) -> tuple[str, str, str | None]:
+        """Route a dataset/schema reference to the correct backend.
+
+        Returns:
+            Tuple of ``(backend_key, resolved_ref, handle_or_did)``.
+
+            - ``backend_key``: ``"local"``, a named repository, or
+              ``"_atmosphere"``.
+            - ``resolved_ref``: The dataset/schema name or AT URI to pass
+              to the backend.
+            - ``handle_or_did``: Populated only for atmosphere paths.
+        """
+        # AT URIs go to atmosphere
+        if ref.startswith("at://"):
+            return ("_atmosphere", ref, None)
+
+        # @ prefix -> atmosphere
+        if ref.startswith("@"):
+            rest = ref[1:]
+            parts = rest.split("/", 1)
+            if len(parts) == 2:
+                return ("_atmosphere", parts[1], parts[0])
+            return ("_atmosphere", rest, None)
+
+        # atdata:// full URI
+        if ref.startswith("atdata://"):
+            path = ref[len("atdata://") :]
+            parts = path.split("/")
+            # atdata://mount/collection/name or atdata://mount/name
+            repo_name = parts[0]
+            dataset_name = parts[-1]
+            if repo_name == "local" or repo_name in self._repos:
+                return (repo_name, dataset_name, None)
+            # Unknown prefix -- might be an atmosphere handle
+            return ("_atmosphere", dataset_name, repo_name)
+
+        # prefix/name where prefix is a known repository
+        if "/" in ref:
+            prefix, rest = ref.split("/", 1)
+            if prefix == "local":
+                return ("local", rest, None)
+            if prefix in self._repos:
+                return (prefix, rest, None)
+
+        # Bare name -> local repository
+        return ("local", ref, None)
+
+    @property
+    def repos(self) -> dict[str, "Repository"]:
+        """All repositories mounted on this index (including ``"local"``)."""
+        return dict(self._repos)
+
+    @property
+    def atmosphere(self) -> Any:
+        """The Atmosphere for this index, or None if disabled.
+
+        Returns the underlying client (not the internal backend wrapper).
+        """
+        backend = self._get_atmosphere()
+        if backend is not None:
+            return backend.client
+        return None
+
+    @property
+    def _provider(self) -> "IndexProvider":  # noqa: F821
+        """IndexProvider for the ``"local"`` repository (backward compat)."""
+        return self._repos["local"].provider
+
+    @property
+    def provider(self) -> "IndexProvider":  # noqa: F821
+        """The storage provider backing the ``"local"`` repository."""
+        return self._repos["local"].provider
+
+    @property
+    def _redis(self) -> Redis:
+        """Backwards-compatible access to the underlying Redis connection.
+
+        Raises:
+            AttributeError: If the current provider is not Redis-backed.
+        """
+        from atdata.providers._redis import RedisProvider
+
+        prov = self._repos["local"].provider
+        if isinstance(prov, RedisProvider):
+            return prov.redis
+        raise AttributeError(
+            "Index._redis is only available with a Redis provider. "
+            "Use index.provider instead."
+        )
+
+    @property
+    def _data_store(self) -> AbstractDataStore | None:
+        """Data store for the ``"local"`` repository (backward compat)."""
+        return self._repos["local"].data_store
+
+    @property
+    def data_store(self) -> AbstractDataStore | None:
+        """The data store for writing shards, or None if index-only."""
+        return self._repos["local"].data_store
+
+    @property
+    def stub_dir(self) -> Path | None:
+        """Directory where stub files are written, or None if auto-stubs disabled.
+
+        Use this path to configure your IDE for type checking support:
+        - VS Code/Pylance: Add to python.analysis.extraPaths in settings.json
+        - PyCharm: Mark as Sources Root
+        - mypy: Add to mypy_path in mypy.ini
+        """
+        if self._stub_manager is not None:
+            return self._stub_manager.stub_dir
+        return None
+
+    @property
+    def types(self) -> SchemaNamespace:
+        """Namespace for accessing loaded schema types.
+
+        After calling :meth:`load_schema`, schema types become available
+        as attributes on this namespace.
+
+        Examples:
+            >>> index.load_schema("atdata://local/schema/MySample@1.0.0")
+            >>> MyType = index.types.MySample
+            >>> sample = MyType(name="hello", value=42)
+
+        Returns:
+            SchemaNamespace containing all loaded schema types.
+        """
+        return self._schema_namespace
+
+    def load_schema(self, ref: str) -> Type[Packable]:
+        """Load a schema and make it available in the types namespace.
+
+        This method decodes the schema, optionally generates a Python module
+        for IDE support (if auto_stubs is enabled), and registers the type
+        in the :attr:`types` namespace for easy access.
+
+        Args:
+            ref: Schema reference string (atdata://local/schema/... or
+                legacy local://schemas/...).
+
+        Returns:
+            The decoded PackableSample subclass. Also available via
+            ``index.types.<ClassName>`` after this call.
+
+        Raises:
+            KeyError: If schema not found.
+            ValueError: If schema cannot be decoded.
+
+        Examples:
+            >>> # Load and use immediately
+            >>> MyType = index.load_schema("atdata://local/schema/MySample@1.0.0")
+            >>> sample = MyType(field1="hello", field2=42)
+            >>>
+            >>> # Or access later via namespace
+            >>> index.load_schema("atdata://local/schema/OtherType@1.0.0")
+            >>> other = index.types.OtherType(data="test")
+        """
+        # Decode the schema (uses generated module if auto_stubs enabled)
+        cls = self.decode_schema(ref)
+
+        # Register in namespace using the class name
+        self._schema_namespace._register(cls.__name__, cls)
+
+        return cls
+
+    def get_import_path(self, ref: str) -> str | None:
+        """Get the import path for a schema's generated module.
+
+        When auto_stubs is enabled, this returns the import path that can
+        be used to import the schema type with full IDE support.
+
+        Args:
+            ref: Schema reference string.
+
+        Returns:
+            Import path like "local.MySample_1_0_0", or None if auto_stubs
+            is disabled.
+
+        Examples:
+            >>> index = Index(auto_stubs=True)
+            >>> ref = index.publish_schema(MySample, version="1.0.0")
+            >>> index.load_schema(ref)
+            >>> print(index.get_import_path(ref))
+            local.MySample_1_0_0
+            >>> # Then in your code:
+            >>> # from local.MySample_1_0_0 import MySample
+        """
+        if self._stub_manager is None:
+            return None
+
+        from atdata._stub_manager import _extract_authority
+
+        name, version = _parse_schema_ref(ref)
+        schema_dict = self.get_schema(ref)
+        authority = _extract_authority(schema_dict.get("$ref"))
+
+        safe_version = version.replace(".", "_")
+        module_name = f"{name}_{safe_version}"
+
+        return f"{authority}.{module_name}"
+
+    def list_entries(self) -> list[LocalDatasetEntry]:
+        """Get all index entries as a materialized list.
+
+        Returns:
+            List of all LocalDatasetEntry objects in the index.
+        """
+        return list(self.entries)
+
+    # Legacy alias for backwards compatibility
+    @property
+    def all_entries(self) -> list[LocalDatasetEntry]:
+        """Get all index entries as a list (deprecated, use list_entries())."""
+        return self.list_entries()
+
+    @property
+    def entries(self) -> Generator[LocalDatasetEntry, None, None]:
+        """Iterate over all index entries.
+
+        Yields:
+            LocalDatasetEntry objects from the index.
+        """
+        yield from self._provider.iter_entries()
+
+    def add_entry(
+        self,
+        ds: Dataset,
+        *,
+        name: str,
+        schema_ref: str | None = None,
+        metadata: dict | None = None,
+    ) -> LocalDatasetEntry:
+        """Add a dataset to the local repository index.
+
+        Args:
+            ds: The dataset to add to the index.
+            name: Human-readable name for the dataset.
+            schema_ref: Optional schema reference. If None, generates from sample type.
+            metadata: Optional metadata dictionary. If None, uses ds._metadata if available.
+
+        Returns:
+            The created LocalDatasetEntry object.
+        """
+        return self._insert_dataset_to_provider(
+            ds,
+            name=name,
+            schema_ref=schema_ref,
+            provider=self._provider,
+            store=None,
+            metadata=metadata,
+        )
+
+    def get_entry(self, cid: str) -> LocalDatasetEntry:
+        """Get an entry by its CID.
+
+        Args:
+            cid: Content identifier of the entry.
+
+        Returns:
+            LocalDatasetEntry for the given CID.
+
+        Raises:
+            KeyError: If entry not found.
+        """
+        return self._provider.get_entry_by_cid(cid)
+
+    def get_entry_by_name(self, name: str) -> LocalDatasetEntry:
+        """Get an entry by its human-readable name.
+
+        Args:
+            name: Human-readable name of the entry.
+
+        Returns:
+            LocalDatasetEntry with the given name.
+
+        Raises:
+            KeyError: If no entry with that name exists.
+        """
+        return self._provider.get_entry_by_name(name)
+
+    # AbstractIndex protocol methods
+
+    @staticmethod
+    def _ensure_schema_stored(
+        schema_ref: str,
+        sample_type: type,
+        provider: "IndexProvider",  # noqa: F821
+    ) -> None:
+        """Persist the schema definition if not already stored.
+
+        Called during dataset insertion so that ``decode_schema()`` can
+        reconstruct the type later without the caller needing to publish
+        the schema separately.
+        """
+        schema_name, version = _parse_schema_ref(schema_ref)
+        if provider.get_schema_json(schema_name, version) is None:
+            record = _build_schema_record(sample_type, version=version)
+            provider.store_schema(schema_name, version, json.dumps(record))
+
+    def _insert_dataset_to_provider(
+        self,
+        ds: Dataset,
+        *,
+        name: str,
+        schema_ref: str | None = None,
+        provider: "IndexProvider",  # noqa: F821
+        store: AbstractDataStore | None = None,
+        **kwargs,
+    ) -> LocalDatasetEntry:
+        """Insert a dataset into a specific provider/store pair.
+
+        This is the internal implementation shared by all local and named
+        repository inserts.
+        """
+        metadata = kwargs.get("metadata")
+
+        if store is not None:
+            prefix = kwargs.get("prefix", name)
+            cache_local = kwargs.get("cache_local", False)
+
+            written_urls = store.write_shards(
+                ds,
+                prefix=prefix,
+                cache_local=cache_local,
+            )
+
+            if schema_ref is None:
+                schema_ref = _schema_ref_from_type(ds.sample_type, version="1.0.0")
+
+            self._ensure_schema_stored(schema_ref, ds.sample_type, provider)
+
+            entry_metadata = metadata if metadata is not None else ds._metadata
+            entry = LocalDatasetEntry(
+                name=name,
+                schema_ref=schema_ref,
+                data_urls=written_urls,
+                metadata=entry_metadata,
+            )
+            provider.store_entry(entry)
+            return entry
+
+        # No data store - just index the existing URL
+        if schema_ref is None:
+            schema_ref = _schema_ref_from_type(ds.sample_type, version="1.0.0")
+
+        self._ensure_schema_stored(schema_ref, ds.sample_type, provider)
+
+        data_urls = [ds.url]
+        entry_metadata = metadata if metadata is not None else ds._metadata
+
+        entry = LocalDatasetEntry(
+            name=name,
+            schema_ref=schema_ref,
+            data_urls=data_urls,
+            metadata=entry_metadata,
+        )
+        provider.store_entry(entry)
+        return entry
+
+    def insert_dataset(
+        self,
+        ds: Dataset,
+        *,
+        name: str,
+        schema_ref: str | None = None,
+        **kwargs,
+    ) -> "IndexEntry":
+        """Insert a dataset into the index (AbstractIndex protocol).
+
+        The target repository is determined by a prefix in the ``name``
+        argument (e.g. ``"lab/mnist"``). If no prefix is given, or the
+        prefix is ``"local"``, the built-in local repository is used.
+
+        If the target repository has a data_store, shards are written to
+        storage first, then indexed. Otherwise, the dataset's existing URL
+        is indexed directly.
+
+        Args:
+            ds: The Dataset to register.
+            name: Human-readable name for the dataset, optionally prefixed
+                with a repository name (e.g. ``"lab/mnist"``).
+            schema_ref: Optional schema reference.
+            **kwargs: Additional options:
+                - metadata: Optional metadata dict
+                - prefix: Storage prefix (default: dataset name)
+                - cache_local: If True, cache writes locally first
+
+        Returns:
+            IndexEntry for the inserted dataset.
+        """
+        backend_key, resolved_name, handle_or_did = self._resolve_prefix(name)
+
+        if backend_key == "_atmosphere":
+            atmo = self._get_atmosphere()
+            if atmo is None:
+                raise ValueError(
+                    f"Atmosphere backend required for name {name!r} but not available."
+                )
+            return atmo.insert_dataset(
+                ds, name=resolved_name, schema_ref=schema_ref, **kwargs
+            )
+
+        repo = self._repos.get(backend_key)
+        if repo is None:
+            raise KeyError(f"Unknown repository {backend_key!r} in name {name!r}")
+        return self._insert_dataset_to_provider(
+            ds,
+            name=resolved_name,
+            schema_ref=schema_ref,
+            provider=repo.provider,
+            store=repo.data_store,
+            **kwargs,
+        )
+
+    def write(
+        self,
+        samples: Iterable,
+        *,
+        name: str,
+        schema_ref: str | None = None,
+        description: str | None = None,
+        tags: list[str] | None = None,
+        license: str | None = None,
+        maxcount: int = 10_000,
+        maxsize: int | None = None,
+        metadata: dict | None = None,
+        manifest: bool = False,
+    ) -> "IndexEntry":
+        """Write samples and create an index entry in one step.
+
+        This is the primary method for publishing data. It serializes
+        samples to WebDataset tar files, stores them via the appropriate
+        backend, and creates an index entry.
+
+        The target backend is determined by the *name* prefix:
+
+        - Bare name (e.g., ``"mnist"``): writes to the local repository.
+        - ``"@handle/name"``: writes and publishes to the atmosphere.
+        - ``"repo/name"``: writes to a named repository.
+
+        When the local backend has no ``data_store`` configured, a
+        ``LocalDiskStore`` is created automatically at
+        ``~/.atdata/data/`` so that samples have persistent storage.
+
+        .. note::
+
+            This method is synchronous. Samples are written to a temporary
+            location first, then copied to permanent storage by the backend.
+            Avoid passing lazily-evaluated iterators that depend on external
+            state that may change during the call.
+
+        Args:
+            samples: Iterable of ``Packable`` samples. Must be non-empty.
+            name: Dataset name, optionally prefixed with target.
+            schema_ref: Optional schema reference. Auto-generated if ``None``.
+            description: Optional dataset description (atmosphere only).
+            tags: Optional tags for discovery (atmosphere only).
+            license: Optional license identifier (atmosphere only).
+            maxcount: Max samples per shard. Default: 10,000.
+            maxsize: Max bytes per shard. Default: ``None``.
+            metadata: Optional metadata dict stored with the entry.
+            manifest: If True, write per-shard manifest sidecar files
+                alongside each tar. Default: ``False``.
+
+        Returns:
+            IndexEntry for the created dataset.
+
+        Raises:
+            ValueError: If *samples* is empty.
+
+        Examples:
+            >>> index = Index()
+            >>> samples = [MySample(key="0", text="hello")]
+            >>> entry = index.write(samples, name="my-dataset")
+        """
+        import tempfile
+
+        from atdata.dataset import write_samples
+
+        backend_key, resolved_name, _ = self._resolve_prefix(name)
+
+        # Resolve the target repo's data store; auto-create LocalDiskStore
+        # for repos that have no store so write() always persists data.
+        repo = self._repos.get(backend_key)
+        effective_store = repo.data_store if repo is not None else None
+        needs_auto_store = repo is not None and effective_store is None
+
+        if needs_auto_store and backend_key != "_atmosphere":
+            from atdata.stores._disk import LocalDiskStore
+
+            effective_store = LocalDiskStore()
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            tmp_path = Path(tmp_dir) / "data.tar"
+            ds = write_samples(
+                samples,
+                tmp_path,
+                maxcount=maxcount,
+                maxsize=maxsize,
+                manifest=manifest,
+            )
+
+            # When we auto-created a store, write directly through it
+            # rather than via insert_dataset (which would just index
+            # the temp path).
+            if needs_auto_store and repo is not None:
+                return self._insert_dataset_to_provider(
+                    ds,
+                    name=resolved_name,
+                    schema_ref=schema_ref,
+                    provider=repo.provider,
+                    store=effective_store,
+                    metadata=metadata,
+                )
+
+            return self.insert_dataset(
+                ds,
+                name=name,
+                schema_ref=schema_ref,
+                metadata=metadata,
+                description=description,
+                tags=tags,
+                license=license,
+            )
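
A minimal end-to-end sketch of ``write()`` under the prefix rules above. The sample class is hypothetical, and the ``@packable`` import path is assumed from the docstrings rather than confirmed against the package's public exports:

    from atdata import packable  # assumed export; the docstrings reference an @packable decorator

    @packable
    class MySample:
        key: str
        text: str

    index = Index()  # SQLite index; a LocalDiskStore is auto-created on first write

    index.write([MySample(key="0", text="hello")], name="my-dataset")   # bare name -> "local"
    index.write([MySample(key="1", text="hi")], name="lab/my-dataset")  # requires a mounted "lab" repo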
+
+    def get_dataset(self, ref: str) -> "IndexEntry":
+        """Get a dataset entry by name or prefixed reference.
+
+        Supports repository-prefixed lookups (e.g. ``"lab/mnist"``),
+        atmosphere paths (``"@handle/dataset"``), AT URIs, and bare names
+        (which default to the ``"local"`` repository).
+
+        Args:
+            ref: Dataset name, prefixed name, or AT URI.
+
+        Returns:
+            IndexEntry for the dataset.
+
+        Raises:
+            KeyError: If dataset not found.
+            ValueError: If the atmosphere backend is required but unavailable.
+        """
+        backend_key, resolved_ref, handle_or_did = self._resolve_prefix(ref)
+
+        if backend_key == "_atmosphere":
+            atmo = self._get_atmosphere()
+            if atmo is None:
+                raise ValueError(
+                    f"Atmosphere backend required for path {ref!r} but not available. "
+                    "Install 'atproto' or pass an Atmosphere."
+                )
+            return atmo.get_dataset(resolved_ref)
+
+        repo = self._repos.get(backend_key)
+        if repo is None:
+            raise KeyError(f"Unknown repository {backend_key!r} in ref {ref!r}")
+        return repo.provider.get_entry_by_name(resolved_ref)
+
+    @property
+    def datasets(self) -> Generator["IndexEntry", None, None]:
+        """Lazily iterate over all dataset entries across local repositories.
+
+        Yields entries from all mounted repositories (``"local"`` and named).
+        Atmosphere entries are not included (use
+        ``list_datasets(repo="_atmosphere")`` for those).
+
+        Yields:
+            IndexEntry for each dataset.
+        """
+        for repo in self._repos.values():
+            yield from repo.provider.iter_entries()
+
+    def list_datasets(self, repo: str | None = None) -> list["IndexEntry"]:
+        """Get dataset entries as a materialized list (AbstractIndex protocol).
+
+        Args:
+            repo: Optional repository filter. If ``None``, aggregates entries
+                from ``"local"`` and all named repositories. Use ``"local"``
+                for just the built-in repository, a named repository key for
+                that repository, or ``"_atmosphere"`` for atmosphere entries.
+
+        Returns:
+            List of IndexEntry for each dataset.
+        """
+        if repo is None:
+            return list(self.datasets)
+
+        if repo == "_atmosphere":
+            atmo = self._get_atmosphere()
+            if atmo is None:
+                return []
+            return atmo.list_datasets()
+
+        named = self._repos.get(repo)
+        if named is None:
+            raise KeyError(f"Unknown repository {repo!r}")
+        return list(named.provider.iter_entries())
+
+    # Schema operations
+
+    def _get_latest_schema_version(self, name: str) -> str | None:
+        """Get the latest version for a schema by name, or None if not found."""
+        return self._provider.find_latest_version(name)
+
+    def publish_schema(
+        self,
+        sample_type: type,
+        *,
+        version: str | None = None,
+        description: str | None = None,
+    ) -> str:
+        """Publish a schema for a sample type to the local index.
+
+        Args:
+            sample_type: A Packable type (@packable-decorated or PackableSample subclass).
+            version: Semantic version string (e.g., '1.0.0'). If None,
+                auto-increments from the latest published version (patch bump),
+                or starts at '1.0.0' if no previous version exists.
+            description: Optional human-readable description. If None, uses
+                the class docstring.
+
+        Returns:
+            Schema reference string: 'atdata://local/schema/{name}@{version}'.
+
+        Raises:
+            ValueError: If sample_type is not a dataclass.
+            TypeError: If sample_type doesn't satisfy the Packable protocol,
+                or if a field type is not supported.
+        """
+        # Validate that sample_type satisfies the Packable protocol at runtime.
+        # This catches non-packable types early with a clear error message.
+        try:
+            # Check protocol compliance by verifying required methods exist
+            if not (
+                hasattr(sample_type, "from_data")
+                and hasattr(sample_type, "from_bytes")
+                and callable(getattr(sample_type, "from_data", None))
+                and callable(getattr(sample_type, "from_bytes", None))
+            ):
+                raise TypeError(
+                    f"{sample_type.__name__} does not satisfy the Packable protocol. "
+                    "Use @packable decorator or inherit from PackableSample."
+                )
+        except AttributeError:
+            raise TypeError(
+                f"sample_type must be a class, got {type(sample_type).__name__}"
+            )
+
+        # Auto-increment version if not specified
+        if version is None:
+            latest = self._get_latest_schema_version(sample_type.__name__)
+            if latest is None:
+                version = "1.0.0"
+            else:
+                version = _increment_patch(latest)
+
+        schema_record = _build_schema_record(
+            sample_type,
+            version=version,
+            description=description,
+        )
+
+        schema_ref = _schema_ref_from_type(sample_type, version)
+        name, _ = _parse_schema_ref(schema_ref)
+
+        # Store via provider
+        schema_json = json.dumps(schema_record)
+        self._provider.store_schema(name, version, schema_json)
+
+        return schema_ref
+
+    def get_schema(self, ref: str) -> dict:
+        """Get a schema record by reference (AbstractIndex protocol).
+
+        Args:
+            ref: Schema reference string. Supports both the new format
+                (atdata://local/schema/{name}@{version}) and the legacy
+                format (local://schemas/{module.Class}@{version}).
+
+        Returns:
+            Schema record as a dictionary with keys 'name', 'version',
+            'fields', '$ref', etc.
+
+        Raises:
+            KeyError: If schema not found.
+            ValueError: If reference format is invalid.
+        """
+        name, version = _parse_schema_ref(ref)
+
+        schema_json = self._provider.get_schema_json(name, version)
+        if schema_json is None:
+            raise KeyError(f"Schema not found: {ref}")
+
+        schema = json.loads(schema_json)
+        schema["$ref"] = _make_schema_ref(name, version)
+
+        # Auto-generate stub if enabled
+        if self._stub_manager is not None:
+            self._stub_manager.ensure_stub(schema)
+
+        return schema
+
+    def get_schema_record(self, ref: str) -> LocalSchemaRecord:
+        """Get a schema record as a LocalSchemaRecord object.
+
+        Use this when you need the full LocalSchemaRecord with typed properties.
+        For Protocol-compliant dict access, use get_schema() instead.
+
+        Args:
+            ref: Schema reference string.
+
+        Returns:
+            LocalSchemaRecord with schema details.
+
+        Raises:
+            KeyError: If schema not found.
+            ValueError: If reference format is invalid.
+        """
+        schema = self.get_schema(ref)
+        return LocalSchemaRecord.from_dict(schema)
+
+    @property
+    def schemas(self) -> Generator[LocalSchemaRecord, None, None]:
+        """Iterate over all schema records in this index.
+
+        Yields:
+            LocalSchemaRecord for each schema.
+        """
+        for name, version, schema_json in self._provider.iter_schemas():
+            schema = json.loads(schema_json)
+            schema["$ref"] = _make_schema_ref(name, version)
+            yield LocalSchemaRecord.from_dict(schema)
+
+    def list_schemas(self) -> list[dict]:
+        """Get all schema records as a materialized list (AbstractIndex protocol).
+
+        Returns:
+            List of schema records as dictionaries.
+        """
+        return [record.to_dict() for record in self.schemas]
+
+    def decode_schema(self, ref: str) -> Type[Packable]:
+        """Reconstruct a Python PackableSample type from a stored schema.
+
+        This method enables loading datasets without knowing the sample type
+        ahead of time. The index retrieves the schema record and dynamically
+        generates a PackableSample subclass matching the schema definition.
+
+        If auto_stubs is enabled, a Python module will be generated and the
+        class will be imported from it, providing full IDE autocomplete support.
+        The returned class has proper type information that IDEs can understand.
+
+        Args:
+            ref: Schema reference string (atdata://local/schema/... or
+                legacy local://schemas/...).
+
+        Returns:
+            A PackableSample subclass - either imported from a generated module
+            (if auto_stubs is enabled) or dynamically created.
+
+        Raises:
+            KeyError: If schema not found.
+            ValueError: If schema cannot be decoded.
+        """
+        schema_dict = self.get_schema(ref)
+
+        # If auto_stubs is enabled, generate a module and import the class from it
+        if self._stub_manager is not None:
+            cls = self._stub_manager.ensure_module(schema_dict)
+            if cls is not None:
+                return cls
+
+        # Fall back to dynamic type generation
+        from atdata._schema_codec import schema_to_type
+
+        return schema_to_type(schema_dict)
+
+    def decode_schema_as(self, ref: str, type_hint: type[T]) -> type[T]:
+        """Decode a schema with an explicit type hint for IDE support.
+
+        This is a typed wrapper around decode_schema() that preserves the
+        type information for IDE autocomplete. Use this when you have a
+        stub file for the schema and want full IDE support.
+
+        Args:
+            ref: Schema reference string.
+            type_hint: The stub type to use for type hints. Import this from
+                the generated stub file.
+
+        Returns:
+            The decoded type, cast to match the type_hint for IDE support.
+
+        Examples:
+            >>> # After enabling auto_stubs and configuring IDE extraPaths:
+            >>> from local.MySample_1_0_0 import MySample
+            >>>
+            >>> # This gives full IDE autocomplete:
+            >>> DecodedType = index.decode_schema_as(ref, MySample)
+            >>> sample = DecodedType(text="hello", value=42)  # IDE knows the signature
+
+        Note:
+            The type_hint is only used for static type checking - at runtime,
+            the actual decoded type from the schema is returned. Ensure the
+            stub matches the schema to avoid runtime surprises.
+        """
+        from typing import cast
+
+        return cast(type[T], self.decode_schema(ref))
+
+    def clear_stubs(self) -> int:
+        """Remove all auto-generated stub files.
+
+        Only works if auto_stubs was enabled when creating the Index.
+
+        Returns:
+            Number of stub files removed, or 0 if auto_stubs is disabled.
+        """
+        if self._stub_manager is not None:
+            return self._stub_manager.clear_stubs()
+        return 0
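
Taken together, the schema methods support a publish/decode round trip without sharing Python source. A minimal sketch using the hypothetical ``MySample`` from the earlier example:

    ref = index.publish_schema(MySample)   # first publish -> ".../MySample@1.0.0"
    schema = index.get_schema(ref)         # dict with 'name', 'version', 'fields', '$ref'
    Decoded = index.decode_schema(ref)     # dynamically generated PackableSample subclass
    index.load_schema(ref)                 # same, plus registration in index.types
    sample = index.types.MySample(key="0", text="hello")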
+
+    # -- Atmosphere promotion --
+
+    def promote_entry(
+        self,
+        entry_name: str,
+        *,
+        name: str | None = None,
+        description: str | None = None,
+        tags: list[str] | None = None,
+        license: str | None = None,
+    ) -> str:
+        """Promote a locally-indexed dataset to the atmosphere.
+
+        Looks up the entry by name in the local index, resolves its
+        schema, and publishes both schema and dataset record to ATProto
+        via the index's atmosphere backend.
+
+        Args:
+            entry_name: Name of the local dataset entry to promote.
+            name: Override name for the atmosphere record. Defaults to
+                the local entry name.
+            description: Optional description for the dataset.
+            tags: Optional tags for discovery.
+            license: Optional license identifier.
+
+        Returns:
+            AT URI of the created atmosphere dataset record.
+
+        Raises:
+            ValueError: If atmosphere backend is not available, or
+                the local entry has no data URLs.
+            KeyError: If the entry or its schema is not found.
+
+        Examples:
+            >>> index = Index(atmosphere=client)
+            >>> uri = index.promote_entry("mnist-train")
+        """
+        from atdata.promote import _find_or_publish_schema
+        from atdata.atmosphere import DatasetPublisher
+        from atdata._schema_codec import schema_to_type
+
+        atmo = self._get_atmosphere()
+        if atmo is None:
+            raise ValueError("Atmosphere backend required but not available.")
+
+        entry = self.get_entry_by_name(entry_name)
+        if not entry.data_urls:
+            raise ValueError(f"Local entry {entry_name!r} has no data URLs")
+
+        schema_record = self.get_schema(entry.schema_ref)
+        sample_type = schema_to_type(schema_record)
+        schema_version = schema_record.get("version", "1.0.0")
+
+        atmosphere_schema_uri = _find_or_publish_schema(
+            sample_type,
+            schema_version,
+            atmo.client,
+            description=schema_record.get("description"),
+        )
+
+        publisher = DatasetPublisher(atmo.client)
+        uri = publisher.publish_with_urls(
+            urls=entry.data_urls,
+            schema_uri=atmosphere_schema_uri,
+            name=name or entry.name,
+            description=description,
+            tags=tags,
+            license=license,
+            metadata=entry.metadata,
+        )
+        return str(uri)
+
+    def promote_dataset(
+        self,
+        dataset: Dataset,
+        *,
+        name: str,
+        sample_type: type | None = None,
+        schema_version: str = "1.0.0",
+        description: str | None = None,
+        tags: list[str] | None = None,
+        license: str | None = None,
+    ) -> str:
+        """Publish a Dataset directly to the atmosphere.
+
+        Publishes the schema (with deduplication) and creates a dataset
+        record on ATProto. Uses the index's atmosphere backend.
+
+        Args:
+            dataset: The Dataset to publish.
+            name: Name for the atmosphere dataset record.
+            sample_type: Sample type for schema publishing. Inferred from
+                ``dataset.sample_type`` if not provided.
+            schema_version: Semantic version for the schema. Default: ``"1.0.0"``.
+            description: Optional description for the dataset.
+            tags: Optional tags for discovery.
+            license: Optional license identifier.
+
+        Returns:
+            AT URI of the created atmosphere dataset record.
+
+        Raises:
+            ValueError: If atmosphere backend is not available.
+
+        Examples:
+            >>> index = Index(atmosphere=client)
+            >>> ds = atdata.load_dataset("./data.tar", MySample, split="train")
+            >>> uri = index.promote_dataset(ds, name="my-dataset")
+        """
+        from atdata.promote import _find_or_publish_schema
+        from atdata.atmosphere import DatasetPublisher
+
+        atmo = self._get_atmosphere()
+        if atmo is None:
+            raise ValueError("Atmosphere backend required but not available.")
+
+        st = sample_type or dataset.sample_type
+
+        atmosphere_schema_uri = _find_or_publish_schema(
+            st,
+            schema_version,
+            atmo.client,
+            description=description,
+        )
+
+        data_urls = dataset.list_shards()
+
+        publisher = DatasetPublisher(atmo.client)
+        uri = publisher.publish_with_urls(
+            urls=data_urls,
+            schema_uri=atmosphere_schema_uri,
+            name=name,
+            description=description,
+            tags=tags,
+            license=license,
+            metadata=dataset._metadata,
+        )
+        return str(uri)
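
Finally, a sketch of the local-to-atmosphere promotion flow these two methods complete (assumes an authenticated ``Atmosphere`` client; the dataset name and tags are hypothetical):

    index = Index(atmosphere=client)
    index.write(samples, name="mnist-train")   # write shards + index locally
    uri = index.promote_entry("mnist-train", tags=["vision"])
    print(uri)  # an at:// URI for the new dataset record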