atdata 0.3.0b1__py3-none-any.whl → 0.3.1b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- atdata/__init__.py +9 -0
- atdata/_cid.py +0 -21
- atdata/_helpers.py +12 -0
- atdata/_hf_api.py +33 -1
- atdata/_protocols.py +64 -182
- atdata/_schema_codec.py +2 -2
- atdata/_stub_manager.py +5 -25
- atdata/atmosphere/__init__.py +12 -11
- atdata/atmosphere/_types.py +4 -4
- atdata/atmosphere/client.py +64 -12
- atdata/atmosphere/lens.py +11 -12
- atdata/atmosphere/records.py +9 -10
- atdata/atmosphere/schema.py +14 -16
- atdata/atmosphere/store.py +6 -7
- atdata/cli/__init__.py +16 -16
- atdata/cli/diagnose.py +2 -2
- atdata/cli/{local.py → infra.py} +10 -10
- atdata/dataset.py +155 -2
- atdata/index/__init__.py +54 -0
- atdata/{local → index}/_index.py +322 -64
- atdata/{local → index}/_schema.py +5 -5
- atdata/lexicons/__init__.py +121 -0
- atdata/lexicons/ac.foundation.dataset.arrayFormat.json +16 -0
- atdata/lexicons/ac.foundation.dataset.getLatestSchema.json +78 -0
- atdata/lexicons/ac.foundation.dataset.lens.json +99 -0
- atdata/lexicons/ac.foundation.dataset.record.json +96 -0
- atdata/lexicons/ac.foundation.dataset.schema.json +107 -0
- atdata/lexicons/ac.foundation.dataset.schemaType.json +16 -0
- atdata/lexicons/ac.foundation.dataset.storageBlobs.json +24 -0
- atdata/lexicons/ac.foundation.dataset.storageExternal.json +25 -0
- atdata/lexicons/ndarray_shim.json +16 -0
- atdata/local/__init__.py +12 -13
- atdata/local/_repo_legacy.py +3 -3
- atdata/promote.py +14 -10
- atdata/repository.py +7 -7
- atdata/stores/__init__.py +23 -0
- atdata/stores/_disk.py +123 -0
- atdata/testing.py +12 -8
- {atdata-0.3.0b1.dist-info → atdata-0.3.1b1.dist-info}/METADATA +2 -2
- atdata-0.3.1b1.dist-info/RECORD +67 -0
- atdata-0.3.0b1.dist-info/RECORD +0 -54
- /atdata/{local → index}/_entry.py +0 -0
- /atdata/{local → stores}/_s3.py +0 -0
- {atdata-0.3.0b1.dist-info → atdata-0.3.1b1.dist-info}/WHEEL +0 -0
- {atdata-0.3.0b1.dist-info → atdata-0.3.1b1.dist-info}/entry_points.txt +0 -0
- {atdata-0.3.0b1.dist-info → atdata-0.3.1b1.dist-info}/licenses/LICENSE +0 -0
atdata/{local → index}/_index.py
RENAMED
|
@@ -7,8 +7,8 @@ from atdata import (
|
|
|
7
7
|
)
|
|
8
8
|
from atdata._protocols import AbstractDataStore, Packable
|
|
9
9
|
|
|
10
|
-
from atdata.local._entry import LocalDatasetEntry
|
|
11
|
-
from atdata.local._schema import (
|
|
10
|
+
from atdata.index._entry import LocalDatasetEntry
|
|
11
|
+
from atdata.index._schema import (
|
|
12
12
|
SchemaNamespace,
|
|
13
13
|
LocalSchemaRecord,
|
|
14
14
|
_schema_ref_from_type,
|
|
@@ -21,6 +21,7 @@ from atdata.local._schema import (
|
|
|
21
21
|
from pathlib import Path
|
|
22
22
|
from typing import (
|
|
23
23
|
Any,
|
|
24
|
+
Iterable,
|
|
24
25
|
Type,
|
|
25
26
|
TypeVar,
|
|
26
27
|
Generator,
|
|
@@ -41,8 +42,8 @@ class Index:
|
|
|
41
42
|
"""Unified index for tracking datasets across multiple repositories.
|
|
42
43
|
|
|
43
44
|
Implements the AbstractIndex protocol. Maintains a registry of
|
|
44
|
-
dataset entries across a built-in
|
|
45
|
-
|
|
45
|
+
dataset entries across named repositories (always including a built-in
|
|
46
|
+
``"local"`` repository) and an optional atmosphere (ATProto) backend.
|
|
46
47
|
|
|
47
48
|
The ``"local"`` repository is always present and uses the storage backend
|
|
48
49
|
determined by the ``provider`` argument. When no provider is given, defaults
|
|
@@ -52,14 +53,12 @@ class Index:
|
|
|
52
53
|
Additional named repositories can be mounted via the ``repos`` parameter,
|
|
53
54
|
each pairing an IndexProvider with an optional data store.
|
|
54
55
|
|
|
55
|
-
An
|
|
56
|
+
An Atmosphere is available by default for anonymous read-only
|
|
56
57
|
resolution of ``@handle/dataset`` paths. Pass an authenticated client
|
|
57
58
|
for write operations, or ``atmosphere=None`` to disable.
|
|
58
59
|
|
|
59
60
|
Attributes:
|
|
60
|
-
|
|
61
|
-
_data_store: Optional AbstractDataStore for the local repository.
|
|
62
|
-
_repos: Named repositories beyond ``"local"``.
|
|
61
|
+
_repos: All repositories keyed by name. ``"local"`` is always present.
|
|
63
62
|
_atmosphere: Optional atmosphere backend for ATProto operations.
|
|
64
63
|
"""
|
|
65
64
|
|
|
@@ -105,7 +104,7 @@ class Index:
|
|
|
105
104
|
atmosphere: ATProto client for distributed network operations.
|
|
106
105
|
- Default (sentinel): creates an anonymous read-only client
|
|
107
106
|
lazily on first access.
|
|
108
|
-
- ``
|
|
107
|
+
- ``Atmosphere`` instance: uses that client directly.
|
|
109
108
|
- ``None``: disables atmosphere backend entirely.
|
|
110
109
|
auto_stubs: If True, automatically generate .pyi stub files when
|
|
111
110
|
schemas are accessed via get_schema() or decode_schema().
|
|
@@ -146,12 +145,13 @@ class Index:
|
|
|
146
145
|
##
|
|
147
146
|
|
|
148
147
|
from atdata.providers._base import IndexProvider as _IP
|
|
148
|
+
from atdata.repository import Repository as _Repo
|
|
149
149
|
|
|
150
|
+
# Resolve the local provider
|
|
150
151
|
if isinstance(provider, str):
|
|
151
|
-
# String-based provider selection
|
|
152
152
|
from atdata.providers._factory import create_provider
|
|
153
153
|
|
|
154
|
-
|
|
154
|
+
local_provider: _IP = create_provider(
|
|
155
155
|
provider, path=path, dsn=dsn, redis=redis, **kwargs
|
|
156
156
|
)
|
|
157
157
|
elif provider is not None:
|
|
@@ -160,27 +160,24 @@ class Index:
|
|
|
160
160
|
f"provider must be an IndexProvider or backend name string, "
|
|
161
161
|
f"got {type(provider).__name__}"
|
|
162
162
|
)
|
|
163
|
-
|
|
163
|
+
local_provider = provider
|
|
164
164
|
elif redis is not None:
|
|
165
|
-
# Explicit Redis connection provided
|
|
166
165
|
from atdata.providers._redis import RedisProvider
|
|
167
166
|
|
|
168
|
-
|
|
167
|
+
local_provider = RedisProvider(redis)
|
|
169
168
|
elif kwargs:
|
|
170
|
-
# kwargs provided — assume Redis constructor args for compat
|
|
171
169
|
from atdata.providers._redis import RedisProvider
|
|
172
170
|
|
|
173
|
-
|
|
171
|
+
local_provider = RedisProvider(Redis(**kwargs))
|
|
174
172
|
else:
|
|
175
|
-
# Default: zero-dependency SQLite
|
|
176
173
|
from atdata.providers._sqlite import SqliteProvider
|
|
177
174
|
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
self._data_store = data_store
|
|
175
|
+
local_provider = SqliteProvider()
|
|
181
176
|
|
|
182
|
-
#
|
|
183
|
-
|
|
177
|
+
# Build the unified repos dict with "local" always present
|
|
178
|
+
self._repos: dict[str, _Repo] = {
|
|
179
|
+
"local": _Repo(provider=local_provider, data_store=data_store),
|
|
180
|
+
}
|
|
184
181
|
|
|
185
182
|
if repos is not None:
|
|
186
183
|
if "local" in repos:
|
|
@@ -194,9 +191,7 @@ class Index:
|
|
|
194
191
|
f"repos[{name!r}] must be a Repository, "
|
|
195
192
|
f"got {type(repo).__name__}"
|
|
196
193
|
)
|
|
197
|
-
self._repos
|
|
198
|
-
else:
|
|
199
|
-
self._repos = {}
|
|
194
|
+
self._repos.update(repos)
|
|
200
195
|
|
|
201
196
|
# Atmosphere backend (lazy or explicit)
|
|
202
197
|
from atdata.repository import _AtmosphereBackend
|
|
@@ -230,10 +225,10 @@ class Index:
|
|
|
230
225
|
"""Get the atmosphere backend, lazily creating anonymous client if needed."""
|
|
231
226
|
if self._atmosphere_deferred and self._atmosphere is None:
|
|
232
227
|
try:
|
|
233
|
-
from atdata.atmosphere.client import
|
|
228
|
+
from atdata.atmosphere.client import Atmosphere
|
|
234
229
|
from atdata.repository import _AtmosphereBackend
|
|
235
230
|
|
|
236
|
-
client =
|
|
231
|
+
client = Atmosphere()
|
|
237
232
|
self._atmosphere = _AtmosphereBackend(client)
|
|
238
233
|
except ImportError:
|
|
239
234
|
# atproto package not installed -- atmosphere unavailable
|
|
@@ -289,13 +284,13 @@ class Index:
|
|
|
289
284
|
return ("local", ref, None)
|
|
290
285
|
|
|
291
286
|
@property
|
|
292
|
-
def repos(self) -> dict[str, Repository]:
|
|
293
|
-
"""
|
|
287
|
+
def repos(self) -> dict[str, "Repository"]:
|
|
288
|
+
"""All repositories mounted on this index (including ``"local"``)."""
|
|
294
289
|
return dict(self._repos)
|
|
295
290
|
|
|
296
291
|
@property
|
|
297
292
|
def atmosphere(self) -> Any:
|
|
298
|
-
"""The
|
|
293
|
+
"""The Atmosphere for this index, or None if disabled.
|
|
299
294
|
|
|
300
295
|
Returns the underlying client (not the internal backend wrapper).
|
|
301
296
|
"""
|
|
@@ -304,10 +299,15 @@ class Index:
|
|
|
304
299
|
return backend.client
|
|
305
300
|
return None
|
|
306
301
|
|
|
302
|
+
@property
|
|
303
|
+
def _provider(self) -> "IndexProvider": # noqa: F821
|
|
304
|
+
"""IndexProvider for the ``"local"`` repository (backward compat)."""
|
|
305
|
+
return self._repos["local"].provider
|
|
306
|
+
|
|
307
307
|
@property
|
|
308
308
|
def provider(self) -> "IndexProvider": # noqa: F821
|
|
309
|
-
"""The storage provider backing
|
|
310
|
-
return self._provider
|
|
309
|
+
"""The storage provider backing the ``"local"`` repository."""
|
|
310
|
+
return self._repos["local"].provider
|
|
311
311
|
|
|
312
312
|
@property
|
|
313
313
|
def _redis(self) -> Redis:
|
|
@@ -318,17 +318,23 @@ class Index:
|
|
|
318
318
|
"""
|
|
319
319
|
from atdata.providers._redis import RedisProvider
|
|
320
320
|
|
|
321
|
-
|
|
322
|
-
|
|
321
|
+
prov = self._repos["local"].provider
|
|
322
|
+
if isinstance(prov, RedisProvider):
|
|
323
|
+
return prov.redis
|
|
323
324
|
raise AttributeError(
|
|
324
325
|
"Index._redis is only available with a Redis provider. "
|
|
325
326
|
"Use index.provider instead."
|
|
326
327
|
)
|
|
327
328
|
|
|
329
|
+
@property
|
|
330
|
+
def _data_store(self) -> AbstractDataStore | None:
|
|
331
|
+
"""Data store for the ``"local"`` repository (backward compat)."""
|
|
332
|
+
return self._repos["local"].data_store
|
|
333
|
+
|
|
328
334
|
@property
|
|
329
335
|
def data_store(self) -> AbstractDataStore | None:
|
|
330
336
|
"""The data store for writing shards, or None if index-only."""
|
|
331
|
-
return self._data_store
|
|
337
|
+
return self._repos["local"].data_store
|
|
332
338
|
|
|
333
339
|
@property
|
|
334
340
|
def stub_dir(self) -> Path | None:
|
|
@@ -351,7 +357,7 @@ class Index:
|
|
|
351
357
|
as attributes on this namespace.
|
|
352
358
|
|
|
353
359
|
Examples:
|
|
354
|
-
>>> index.load_schema("atdata://local/
|
|
360
|
+
>>> index.load_schema("atdata://local/schema/MySample@1.0.0")
|
|
355
361
|
>>> MyType = index.types.MySample
|
|
356
362
|
>>> sample = MyType(name="hello", value=42)
|
|
357
363
|
|
|
@@ -368,7 +374,7 @@ class Index:
|
|
|
368
374
|
in the :attr:`types` namespace for easy access.
|
|
369
375
|
|
|
370
376
|
Args:
|
|
371
|
-
ref: Schema reference string (atdata://local/
|
|
377
|
+
ref: Schema reference string (atdata://local/schema/... or
|
|
372
378
|
legacy local://schemas/...).
|
|
373
379
|
|
|
374
380
|
Returns:
|
|
@@ -381,11 +387,11 @@ class Index:
|
|
|
381
387
|
|
|
382
388
|
Examples:
|
|
383
389
|
>>> # Load and use immediately
|
|
384
|
-
>>> MyType = index.load_schema("atdata://local/
|
|
390
|
+
>>> MyType = index.load_schema("atdata://local/schema/MySample@1.0.0")
|
|
385
391
|
>>> sample = MyType(field1="hello", field2=42)
|
|
386
392
|
>>>
|
|
387
393
|
>>> # Or access later via namespace
|
|
388
|
-
>>> index.load_schema("atdata://local/
|
|
394
|
+
>>> index.load_schema("atdata://local/schema/OtherType@1.0.0")
|
|
389
395
|
>>> other = index.types.OtherType(data="test")
|
|
390
396
|
"""
|
|
391
397
|
# Decode the schema (uses generated module if auto_stubs enabled)
|
|
@@ -513,6 +519,23 @@ class Index:
|
|
|
513
519
|
|
|
514
520
|
# AbstractIndex protocol methods
|
|
515
521
|
|
|
522
|
+
@staticmethod
|
|
523
|
+
def _ensure_schema_stored(
|
|
524
|
+
schema_ref: str,
|
|
525
|
+
sample_type: type,
|
|
526
|
+
provider: "IndexProvider", # noqa: F821
|
|
527
|
+
) -> None:
|
|
528
|
+
"""Persist the schema definition if not already stored.
|
|
529
|
+
|
|
530
|
+
Called during dataset insertion so that ``decode_schema()`` can
|
|
531
|
+
reconstruct the type later without the caller needing to publish
|
|
532
|
+
the schema separately.
|
|
533
|
+
"""
|
|
534
|
+
schema_name, version = _parse_schema_ref(schema_ref)
|
|
535
|
+
if provider.get_schema_json(schema_name, version) is None:
|
|
536
|
+
record = _build_schema_record(sample_type, version=version)
|
|
537
|
+
provider.store_schema(schema_name, version, json.dumps(record))
|
|
538
|
+
|
|
516
539
|
def _insert_dataset_to_provider(
|
|
517
540
|
self,
|
|
518
541
|
ds: Dataset,
|
|
@@ -543,6 +566,8 @@ class Index:
|
|
|
543
566
|
if schema_ref is None:
|
|
544
567
|
schema_ref = _schema_ref_from_type(ds.sample_type, version="1.0.0")
|
|
545
568
|
|
|
569
|
+
self._ensure_schema_stored(schema_ref, ds.sample_type, provider)
|
|
570
|
+
|
|
546
571
|
entry_metadata = metadata if metadata is not None else ds._metadata
|
|
547
572
|
entry = LocalDatasetEntry(
|
|
548
573
|
name=name,
|
|
@@ -557,6 +582,8 @@ class Index:
|
|
|
557
582
|
if schema_ref is None:
|
|
558
583
|
schema_ref = _schema_ref_from_type(ds.sample_type, version="1.0.0")
|
|
559
584
|
|
|
585
|
+
self._ensure_schema_stored(schema_ref, ds.sample_type, provider)
|
|
586
|
+
|
|
560
587
|
data_urls = [ds.url]
|
|
561
588
|
entry_metadata = metadata if metadata is not None else ds._metadata
|
|
562
589
|
|
|
@@ -612,17 +639,6 @@ class Index:
|
|
|
612
639
|
ds, name=resolved_name, schema_ref=schema_ref, **kwargs
|
|
613
640
|
)
|
|
614
641
|
|
|
615
|
-
if backend_key == "local":
|
|
616
|
-
return self._insert_dataset_to_provider(
|
|
617
|
-
ds,
|
|
618
|
-
name=resolved_name,
|
|
619
|
-
schema_ref=schema_ref,
|
|
620
|
-
provider=self._provider,
|
|
621
|
-
store=self._data_store,
|
|
622
|
-
**kwargs,
|
|
623
|
-
)
|
|
624
|
-
|
|
625
|
-
# Named repository
|
|
626
642
|
repo = self._repos.get(backend_key)
|
|
627
643
|
if repo is None:
|
|
628
644
|
raise KeyError(f"Unknown repository {backend_key!r} in name {name!r}")
|
|
@@ -635,6 +651,117 @@ class Index:
|
|
|
635
651
|
**kwargs,
|
|
636
652
|
)
|
|
637
653
|
|
|
654
|
+
def write(
|
|
655
|
+
self,
|
|
656
|
+
samples: Iterable,
|
|
657
|
+
*,
|
|
658
|
+
name: str,
|
|
659
|
+
schema_ref: str | None = None,
|
|
660
|
+
description: str | None = None,
|
|
661
|
+
tags: list[str] | None = None,
|
|
662
|
+
license: str | None = None,
|
|
663
|
+
maxcount: int = 10_000,
|
|
664
|
+
maxsize: int | None = None,
|
|
665
|
+
metadata: dict | None = None,
|
|
666
|
+
manifest: bool = False,
|
|
667
|
+
) -> "IndexEntry":
|
|
668
|
+
"""Write samples and create an index entry in one step.
|
|
669
|
+
|
|
670
|
+
This is the primary method for publishing data. It serializes
|
|
671
|
+
samples to WebDataset tar files, stores them via the appropriate
|
|
672
|
+
backend, and creates an index entry.
|
|
673
|
+
|
|
674
|
+
The target backend is determined by the *name* prefix:
|
|
675
|
+
|
|
676
|
+
- Bare name (e.g., ``"mnist"``): writes to the local repository.
|
|
677
|
+
- ``"@handle/name"``: writes and publishes to the atmosphere.
|
|
678
|
+
- ``"repo/name"``: writes to a named repository.
|
|
679
|
+
|
|
680
|
+
When the local backend has no ``data_store`` configured, a
|
|
681
|
+
``LocalDiskStore`` is created automatically at
|
|
682
|
+
``~/.atdata/data/`` so that samples have persistent storage.
|
|
683
|
+
|
|
684
|
+
.. note::
|
|
685
|
+
|
|
686
|
+
This method is synchronous. Samples are written to a temporary
|
|
687
|
+
location first, then copied to permanent storage by the backend.
|
|
688
|
+
Avoid passing lazily-evaluated iterators that depend on external
|
|
689
|
+
state that may change during the call.
|
|
690
|
+
|
|
691
|
+
Args:
|
|
692
|
+
samples: Iterable of ``Packable`` samples. Must be non-empty.
|
|
693
|
+
name: Dataset name, optionally prefixed with target.
|
|
694
|
+
schema_ref: Optional schema reference. Auto-generated if ``None``.
|
|
695
|
+
description: Optional dataset description (atmosphere only).
|
|
696
|
+
tags: Optional tags for discovery (atmosphere only).
|
|
697
|
+
license: Optional license identifier (atmosphere only).
|
|
698
|
+
maxcount: Max samples per shard. Default: 10,000.
|
|
699
|
+
maxsize: Max bytes per shard. Default: ``None``.
|
|
700
|
+
metadata: Optional metadata dict stored with the entry.
|
|
701
|
+
manifest: If True, write per-shard manifest sidecar files
|
|
702
|
+
alongside each tar. Default: ``False``.
|
|
703
|
+
|
|
704
|
+
Returns:
|
|
705
|
+
IndexEntry for the created dataset.
|
|
706
|
+
|
|
707
|
+
Raises:
|
|
708
|
+
ValueError: If *samples* is empty.
|
|
709
|
+
|
|
710
|
+
Examples:
|
|
711
|
+
>>> index = Index()
|
|
712
|
+
>>> samples = [MySample(key="0", text="hello")]
|
|
713
|
+
>>> entry = index.write(samples, name="my-dataset")
|
|
714
|
+
"""
|
|
715
|
+
import tempfile
|
|
716
|
+
|
|
717
|
+
from atdata.dataset import write_samples
|
|
718
|
+
|
|
719
|
+
backend_key, resolved_name, _ = self._resolve_prefix(name)
|
|
720
|
+
|
|
721
|
+
# Resolve the target repo's data store; auto-create LocalDiskStore
|
|
722
|
+
# for repos that have no store so write() always persists data.
|
|
723
|
+
repo = self._repos.get(backend_key)
|
|
724
|
+
effective_store = repo.data_store if repo is not None else None
|
|
725
|
+
needs_auto_store = repo is not None and effective_store is None
|
|
726
|
+
|
|
727
|
+
if needs_auto_store and backend_key != "_atmosphere":
|
|
728
|
+
from atdata.stores._disk import LocalDiskStore
|
|
729
|
+
|
|
730
|
+
effective_store = LocalDiskStore()
|
|
731
|
+
|
|
732
|
+
with tempfile.TemporaryDirectory() as tmp_dir:
|
|
733
|
+
tmp_path = Path(tmp_dir) / "data.tar"
|
|
734
|
+
ds = write_samples(
|
|
735
|
+
samples,
|
|
736
|
+
tmp_path,
|
|
737
|
+
maxcount=maxcount,
|
|
738
|
+
maxsize=maxsize,
|
|
739
|
+
manifest=manifest,
|
|
740
|
+
)
|
|
741
|
+
|
|
742
|
+
# When we auto-created a store, write directly through it
|
|
743
|
+
# rather than via insert_dataset (which would just index
|
|
744
|
+
# the temp path).
|
|
745
|
+
if needs_auto_store and repo is not None:
|
|
746
|
+
return self._insert_dataset_to_provider(
|
|
747
|
+
ds,
|
|
748
|
+
name=resolved_name,
|
|
749
|
+
schema_ref=schema_ref,
|
|
750
|
+
provider=repo.provider,
|
|
751
|
+
store=effective_store,
|
|
752
|
+
metadata=metadata,
|
|
753
|
+
)
|
|
754
|
+
|
|
755
|
+
return self.insert_dataset(
|
|
756
|
+
ds,
|
|
757
|
+
name=name,
|
|
758
|
+
schema_ref=schema_ref,
|
|
759
|
+
metadata=metadata,
|
|
760
|
+
description=description,
|
|
761
|
+
tags=tags,
|
|
762
|
+
license=license,
|
|
763
|
+
)
|
|
764
|
+
|
|
638
765
|
def get_dataset(self, ref: str) -> "IndexEntry":
|
|
639
766
|
"""Get a dataset entry by name or prefixed reference.
|
|
640
767
|
|
|
@@ -659,14 +786,10 @@ class Index:
|
|
|
659
786
|
if atmo is None:
|
|
660
787
|
raise ValueError(
|
|
661
788
|
f"Atmosphere backend required for path {ref!r} but not available. "
|
|
662
|
-
"Install 'atproto' or pass an
|
|
789
|
+
"Install 'atproto' or pass an Atmosphere."
|
|
663
790
|
)
|
|
664
791
|
return atmo.get_dataset(resolved_ref)
|
|
665
792
|
|
|
666
|
-
if backend_key == "local":
|
|
667
|
-
return self._provider.get_entry_by_name(resolved_ref)
|
|
668
|
-
|
|
669
|
-
# Named repository
|
|
670
793
|
repo = self._repos.get(backend_key)
|
|
671
794
|
if repo is None:
|
|
672
795
|
raise KeyError(f"Unknown repository {backend_key!r} in ref {ref!r}")
|
|
@@ -676,14 +799,13 @@ class Index:
|
|
|
676
799
|
def datasets(self) -> Generator["IndexEntry", None, None]:
|
|
677
800
|
"""Lazily iterate over all dataset entries across local repositories.
|
|
678
801
|
|
|
679
|
-
Yields entries from
|
|
680
|
-
|
|
802
|
+
Yields entries from all mounted repositories (``"local"`` and named).
|
|
803
|
+
Atmosphere entries are not included (use
|
|
681
804
|
``list_datasets(repo="_atmosphere")`` for those).
|
|
682
805
|
|
|
683
806
|
Yields:
|
|
684
807
|
IndexEntry for each dataset.
|
|
685
808
|
"""
|
|
686
|
-
yield from self._provider.iter_entries()
|
|
687
809
|
for repo in self._repos.values():
|
|
688
810
|
yield from repo.provider.iter_entries()
|
|
689
811
|
|
|
@@ -702,9 +824,6 @@ class Index:
|
|
|
702
824
|
if repo is None:
|
|
703
825
|
return list(self.datasets)
|
|
704
826
|
|
|
705
|
-
if repo == "local":
|
|
706
|
-
return self.list_entries()
|
|
707
|
-
|
|
708
827
|
if repo == "_atmosphere":
|
|
709
828
|
atmo = self._get_atmosphere()
|
|
710
829
|
if atmo is None:
|
|
@@ -740,7 +859,7 @@ class Index:
|
|
|
740
859
|
the class docstring.
|
|
741
860
|
|
|
742
861
|
Returns:
|
|
743
|
-
Schema reference string: 'atdata://local/
|
|
862
|
+
Schema reference string: 'atdata://local/schema/{name}@{version}'.
|
|
744
863
|
|
|
745
864
|
Raises:
|
|
746
865
|
ValueError: If sample_type is not a dataclass.
|
|
@@ -794,7 +913,7 @@ class Index:
|
|
|
794
913
|
|
|
795
914
|
Args:
|
|
796
915
|
ref: Schema reference string. Supports both new format
|
|
797
|
-
(atdata://local/
|
|
916
|
+
(atdata://local/schema/{name}@{version}) and legacy
|
|
798
917
|
format (local://schemas/{module.Class}@{version}).
|
|
799
918
|
|
|
800
919
|
Returns:
|
|
@@ -871,7 +990,7 @@ class Index:
|
|
|
871
990
|
The returned class has proper type information that IDEs can understand.
|
|
872
991
|
|
|
873
992
|
Args:
|
|
874
|
-
ref: Schema reference string (atdata://local/
|
|
993
|
+
ref: Schema reference string (atdata://local/schema/... or
|
|
875
994
|
legacy local://schemas/...).
|
|
876
995
|
|
|
877
996
|
Returns:
|
|
@@ -938,3 +1057,142 @@ class Index:
|
|
|
938
1057
|
if self._stub_manager is not None:
|
|
939
1058
|
return self._stub_manager.clear_stubs()
|
|
940
1059
|
return 0
|
|
1060
|
+
|
|
1061
|
+
# -- Atmosphere promotion --
|
|
1062
|
+
|
|
1063
|
+
def promote_entry(
|
|
1064
|
+
self,
|
|
1065
|
+
entry_name: str,
|
|
1066
|
+
*,
|
|
1067
|
+
name: str | None = None,
|
|
1068
|
+
description: str | None = None,
|
|
1069
|
+
tags: list[str] | None = None,
|
|
1070
|
+
license: str | None = None,
|
|
1071
|
+
) -> str:
|
|
1072
|
+
"""Promote a locally-indexed dataset to the atmosphere.
|
|
1073
|
+
|
|
1074
|
+
Looks up the entry by name in the local index, resolves its
|
|
1075
|
+
schema, and publishes both schema and dataset record to ATProto
|
|
1076
|
+
via the index's atmosphere backend.
|
|
1077
|
+
|
|
1078
|
+
Args:
|
|
1079
|
+
entry_name: Name of the local dataset entry to promote.
|
|
1080
|
+
name: Override name for the atmosphere record. Defaults to
|
|
1081
|
+
the local entry name.
|
|
1082
|
+
description: Optional description for the dataset.
|
|
1083
|
+
tags: Optional tags for discovery.
|
|
1084
|
+
license: Optional license identifier.
|
|
1085
|
+
|
|
1086
|
+
Returns:
|
|
1087
|
+
AT URI of the created atmosphere dataset record.
|
|
1088
|
+
|
|
1089
|
+
Raises:
|
|
1090
|
+
ValueError: If atmosphere backend is not available, or
|
|
1091
|
+
the local entry has no data URLs.
|
|
1092
|
+
KeyError: If the entry or its schema is not found.
|
|
1093
|
+
|
|
1094
|
+
Examples:
|
|
1095
|
+
>>> index = Index(atmosphere=client)
|
|
1096
|
+
>>> uri = index.promote_entry("mnist-train")
|
|
1097
|
+
"""
|
|
1098
|
+
from atdata.promote import _find_or_publish_schema
|
|
1099
|
+
from atdata.atmosphere import DatasetPublisher
|
|
1100
|
+
from atdata._schema_codec import schema_to_type
|
|
1101
|
+
|
|
1102
|
+
atmo = self._get_atmosphere()
|
|
1103
|
+
if atmo is None:
|
|
1104
|
+
raise ValueError("Atmosphere backend required but not available.")
|
|
1105
|
+
|
|
1106
|
+
entry = self.get_entry_by_name(entry_name)
|
|
1107
|
+
if not entry.data_urls:
|
|
1108
|
+
raise ValueError(f"Local entry {entry_name!r} has no data URLs")
|
|
1109
|
+
|
|
1110
|
+
schema_record = self.get_schema(entry.schema_ref)
|
|
1111
|
+
sample_type = schema_to_type(schema_record)
|
|
1112
|
+
schema_version = schema_record.get("version", "1.0.0")
|
|
1113
|
+
|
|
1114
|
+
atmosphere_schema_uri = _find_or_publish_schema(
|
|
1115
|
+
sample_type,
|
|
1116
|
+
schema_version,
|
|
1117
|
+
atmo.client,
|
|
1118
|
+
description=schema_record.get("description"),
|
|
1119
|
+
)
|
|
1120
|
+
|
|
1121
|
+
publisher = DatasetPublisher(atmo.client)
|
|
1122
|
+
uri = publisher.publish_with_urls(
|
|
1123
|
+
urls=entry.data_urls,
|
|
1124
|
+
schema_uri=atmosphere_schema_uri,
|
|
1125
|
+
name=name or entry.name,
|
|
1126
|
+
description=description,
|
|
1127
|
+
tags=tags,
|
|
1128
|
+
license=license,
|
|
1129
|
+
metadata=entry.metadata,
|
|
1130
|
+
)
|
|
1131
|
+
return str(uri)
|
|
1132
|
+
|
|
1133
|
+
def promote_dataset(
|
|
1134
|
+
self,
|
|
1135
|
+
dataset: Dataset,
|
|
1136
|
+
*,
|
|
1137
|
+
name: str,
|
|
1138
|
+
sample_type: type | None = None,
|
|
1139
|
+
schema_version: str = "1.0.0",
|
|
1140
|
+
description: str | None = None,
|
|
1141
|
+
tags: list[str] | None = None,
|
|
1142
|
+
license: str | None = None,
|
|
1143
|
+
) -> str:
|
|
1144
|
+
"""Publish a Dataset directly to the atmosphere.
|
|
1145
|
+
|
|
1146
|
+
Publishes the schema (with deduplication) and creates a dataset
|
|
1147
|
+
record on ATProto. Uses the index's atmosphere backend.
|
|
1148
|
+
|
|
1149
|
+
Args:
|
|
1150
|
+
dataset: The Dataset to publish.
|
|
1151
|
+
name: Name for the atmosphere dataset record.
|
|
1152
|
+
sample_type: Sample type for schema publishing. Inferred from
|
|
1153
|
+
``dataset.sample_type`` if not provided.
|
|
1154
|
+
schema_version: Semantic version for the schema. Default: ``"1.0.0"``.
|
|
1155
|
+
description: Optional description for the dataset.
|
|
1156
|
+
tags: Optional tags for discovery.
|
|
1157
|
+
license: Optional license identifier.
|
|
1158
|
+
|
|
1159
|
+
Returns:
|
|
1160
|
+
AT URI of the created atmosphere dataset record.
|
|
1161
|
+
|
|
1162
|
+
Raises:
|
|
1163
|
+
ValueError: If atmosphere backend is not available.
|
|
1164
|
+
|
|
1165
|
+
Examples:
|
|
1166
|
+
>>> index = Index(atmosphere=client)
|
|
1167
|
+
>>> ds = atdata.load_dataset("./data.tar", MySample, split="train")
|
|
1168
|
+
>>> uri = index.promote_dataset(ds, name="my-dataset")
|
|
1169
|
+
"""
|
|
1170
|
+
from atdata.promote import _find_or_publish_schema
|
|
1171
|
+
from atdata.atmosphere import DatasetPublisher
|
|
1172
|
+
|
|
1173
|
+
atmo = self._get_atmosphere()
|
|
1174
|
+
if atmo is None:
|
|
1175
|
+
raise ValueError("Atmosphere backend required but not available.")
|
|
1176
|
+
|
|
1177
|
+
st = sample_type or dataset.sample_type
|
|
1178
|
+
|
|
1179
|
+
atmosphere_schema_uri = _find_or_publish_schema(
|
|
1180
|
+
st,
|
|
1181
|
+
schema_version,
|
|
1182
|
+
atmo.client,
|
|
1183
|
+
description=description,
|
|
1184
|
+
)
|
|
1185
|
+
|
|
1186
|
+
data_urls = dataset.list_shards()
|
|
1187
|
+
|
|
1188
|
+
publisher = DatasetPublisher(atmo.client)
|
|
1189
|
+
uri = publisher.publish_with_urls(
|
|
1190
|
+
urls=data_urls,
|
|
1191
|
+
schema_uri=atmosphere_schema_uri,
|
|
1192
|
+
name=name,
|
|
1193
|
+
description=description,
|
|
1194
|
+
tags=tags,
|
|
1195
|
+
license=license,
|
|
1196
|
+
metadata=dataset._metadata,
|
|
1197
|
+
)
|
|
1198
|
+
return str(uri)
|
|
@@ -26,7 +26,7 @@ from typing import (
|
|
|
26
26
|
T = TypeVar("T", bound=Packable)
|
|
27
27
|
|
|
28
28
|
# URI scheme prefixes
|
|
29
|
-
_ATDATA_URI_PREFIX = "atdata://local/
|
|
29
|
+
_ATDATA_URI_PREFIX = "atdata://local/schema/"
|
|
30
30
|
_LEGACY_URI_PREFIX = "local://schemas/"
|
|
31
31
|
|
|
32
32
|
|
|
@@ -37,7 +37,7 @@ class SchemaNamespace:
|
|
|
37
37
|
Supports attribute access, iteration, ``len()``, and ``in`` checks.
|
|
38
38
|
|
|
39
39
|
Examples:
|
|
40
|
-
>>> index.load_schema("atdata://local/
|
|
40
|
+
>>> index.load_schema("atdata://local/schema/MySample@1.0.0")
|
|
41
41
|
>>> MyType = index.types.MySample
|
|
42
42
|
>>> sample = MyType(field1="hello", field2=42)
|
|
43
43
|
|
|
@@ -207,7 +207,7 @@ class LocalSchemaRecord:
|
|
|
207
207
|
"""List of field definitions."""
|
|
208
208
|
|
|
209
209
|
ref: str
|
|
210
|
-
"""Schema reference URI (atdata://local/
|
|
210
|
+
"""Schema reference URI (atdata://local/schema/{name}@{version})."""
|
|
211
211
|
|
|
212
212
|
description: Optional[str] = None
|
|
213
213
|
"""Human-readable description."""
|
|
@@ -259,7 +259,7 @@ def _kind_str_for_sample_type(st: Type[Packable]) -> str:
|
|
|
259
259
|
|
|
260
260
|
|
|
261
261
|
def _schema_ref_from_type(sample_type: Type[Packable], version: str) -> str:
|
|
262
|
-
"""Generate 'atdata://local/
|
|
262
|
+
"""Generate 'atdata://local/schema/{name}@{version}' reference."""
|
|
263
263
|
return _make_schema_ref(sample_type.__name__, version)
|
|
264
264
|
|
|
265
265
|
|
|
@@ -271,7 +271,7 @@ def _make_schema_ref(name: str, version: str) -> str:
|
|
|
271
271
|
def _parse_schema_ref(ref: str) -> tuple[str, str]:
|
|
272
272
|
"""Parse schema reference into (name, version).
|
|
273
273
|
|
|
274
|
-
Supports both new format: 'atdata://local/
|
|
274
|
+
Supports both new format: 'atdata://local/schema/{name}@{version}'
|
|
275
275
|
and legacy format: 'local://schemas/{module.Class}@{version}'
|
|
276
276
|
"""
|
|
277
277
|
if ref.startswith(_ATDATA_URI_PREFIX):
|