atdata 0.3.0b1__py3-none-any.whl → 0.3.2b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- atdata/__init__.py +11 -0
- atdata/_cid.py +0 -21
- atdata/_helpers.py +12 -0
- atdata/_hf_api.py +46 -1
- atdata/_logging.py +43 -0
- atdata/_protocols.py +81 -182
- atdata/_schema_codec.py +2 -2
- atdata/_sources.py +24 -4
- atdata/_stub_manager.py +5 -25
- atdata/atmosphere/__init__.py +60 -21
- atdata/atmosphere/_lexicon_types.py +595 -0
- atdata/atmosphere/_types.py +73 -245
- atdata/atmosphere/client.py +64 -12
- atdata/atmosphere/lens.py +60 -53
- atdata/atmosphere/records.py +291 -100
- atdata/atmosphere/schema.py +91 -65
- atdata/atmosphere/store.py +68 -66
- atdata/cli/__init__.py +16 -16
- atdata/cli/diagnose.py +2 -2
- atdata/cli/{local.py → infra.py} +10 -10
- atdata/dataset.py +266 -47
- atdata/index/__init__.py +54 -0
- atdata/{local → index}/_entry.py +6 -2
- atdata/{local → index}/_index.py +617 -72
- atdata/{local → index}/_schema.py +5 -5
- atdata/lexicons/__init__.py +127 -0
- atdata/lexicons/ac.foundation.dataset.arrayFormat.json +16 -0
- atdata/lexicons/ac.foundation.dataset.getLatestSchema.json +78 -0
- atdata/lexicons/ac.foundation.dataset.lens.json +101 -0
- atdata/lexicons/ac.foundation.dataset.record.json +117 -0
- atdata/lexicons/ac.foundation.dataset.schema.json +107 -0
- atdata/lexicons/ac.foundation.dataset.schemaType.json +16 -0
- atdata/lexicons/ac.foundation.dataset.storageBlobs.json +46 -0
- atdata/lexicons/ac.foundation.dataset.storageExternal.json +25 -0
- atdata/lexicons/ac.foundation.dataset.storageHttp.json +45 -0
- atdata/lexicons/ac.foundation.dataset.storageS3.json +61 -0
- atdata/lexicons/ndarray_shim.json +16 -0
- atdata/local/__init__.py +12 -13
- atdata/local/_repo_legacy.py +3 -3
- atdata/manifest/__init__.py +4 -0
- atdata/manifest/_proxy.py +321 -0
- atdata/promote.py +14 -10
- atdata/repository.py +66 -16
- atdata/stores/__init__.py +23 -0
- atdata/stores/_disk.py +131 -0
- atdata/{local → stores}/_s3.py +134 -112
- atdata/testing.py +12 -8
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/METADATA +2 -2
- atdata-0.3.2b1.dist-info/RECORD +71 -0
- atdata-0.3.0b1.dist-info/RECORD +0 -54
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/WHEEL +0 -0
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/entry_points.txt +0 -0
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/licenses/LICENSE +0 -0
atdata/{local → index}/_index.py
RENAMED
@@ -7,8 +7,8 @@ from atdata import (
 )
 from atdata._protocols import AbstractDataStore, Packable
 
-from atdata.local._entry import LocalDatasetEntry
-from atdata.local._schema import (
+from atdata.index._entry import LocalDatasetEntry
+from atdata.index._schema import (
     SchemaNamespace,
     LocalSchemaRecord,
     _schema_ref_from_type,
@@ -21,15 +21,17 @@ from atdata.local._schema import (
 from pathlib import Path
 from typing import (
     Any,
+    Iterable,
     Type,
     TypeVar,
     Generator,
     TYPE_CHECKING,
 )
-from redis import Redis
 import json
 
 if TYPE_CHECKING:
+    from redis import Redis
+
     from atdata.providers._base import IndexProvider
     from atdata.repository import Repository, _AtmosphereBackend
     from atdata._protocols import IndexEntry
@@ -37,12 +39,42 @@ if TYPE_CHECKING:
 T = TypeVar("T", bound=Packable)
 
 
+def _is_local_path(url: str) -> bool:
+    """Check if a URL points to the local filesystem."""
+    return (
+        url.startswith("/")
+        or url.startswith("file://")
+        or (len(url) > 1 and url[1] == ":")
+    )
+
+
+def _is_credentialed_source(ds: Dataset) -> bool:
+    """Check if a Dataset uses a credentialed source (e.g. S3Source with keys)."""
+    from atdata._sources import S3Source
+
+    return isinstance(ds.source, S3Source)
+
+
+def _estimate_dataset_bytes(ds: Dataset) -> int:
+    """Best-effort total size estimate from local shard files.
+
+    Returns 0 when size cannot be determined (e.g. remote URLs).
+    """
+    total = 0
+    for shard_url in ds.list_shards():
+        if _is_local_path(shard_url):
+            p = Path(shard_url.removeprefix("file://"))
+            if p.exists():
+                total += p.stat().st_size
+    return total
+
+
 class Index:
     """Unified index for tracking datasets across multiple repositories.
 
     Implements the AbstractIndex protocol. Maintains a registry of
-    dataset entries across a built-in
-
+    dataset entries across named repositories (always including a built-in
+    ``"local"`` repository) and an optional atmosphere (ATProto) backend.
 
     The ``"local"`` repository is always present and uses the storage backend
     determined by the ``provider`` argument. When no provider is given, defaults
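Reviewer note (not part of the diff): the three module-level helpers added above drive the publish-path decisions later in this file. A quick sketch of their expected behavior, assuming they stay importable from the renamed private module:

```python
# Hedged sketch: exercising _is_local_path from atdata/index/_index.py.
# The Windows drive-letter case is what the `url[1] == ":"` check covers.
from atdata.index._index import _is_local_path

assert _is_local_path("/data/shard-000.tar")          # absolute POSIX path
assert _is_local_path("file:///data/shard-000.tar")   # file:// URL
assert _is_local_path(r"C:\data\shard-000.tar")       # Windows drive letter
assert not _is_local_path("https://example.com/shard-000.tar")
assert not _is_local_path("s3://bucket/shard-000.tar")
```

`_estimate_dataset_bytes` only sums shards that pass this predicate and exist on disk, so remote-only datasets report 0 and skip the size guards.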
@@ -52,14 +84,12 @@ class Index:
     Additional named repositories can be mounted via the ``repos`` parameter,
     each pairing an IndexProvider with an optional data store.
 
-    An
+    An Atmosphere is available by default for anonymous read-only
     resolution of ``@handle/dataset`` paths. Pass an authenticated client
     for write operations, or ``atmosphere=None`` to disable.
 
     Attributes:
-
-        _data_store: Optional AbstractDataStore for the local repository.
-        _repos: Named repositories beyond ``"local"``.
+        _repos: All repositories keyed by name. ``"local"`` is always present.
         _atmosphere: Optional atmosphere backend for ATProto operations.
     """
 
@@ -105,7 +135,7 @@ class Index:
         atmosphere: ATProto client for distributed network operations.
             - Default (sentinel): creates an anonymous read-only client
               lazily on first access.
-            - ``
+            - ``Atmosphere`` instance: uses that client directly.
             - ``None``: disables atmosphere backend entirely.
         auto_stubs: If True, automatically generate .pyi stub files when
             schemas are accessed via get_schema() or decode_schema().
@@ -146,12 +176,13 @@ class Index:
         ##
 
         from atdata.providers._base import IndexProvider as _IP
+        from atdata.repository import Repository as _Repo
 
+        # Resolve the local provider
         if isinstance(provider, str):
-            # String-based provider selection
             from atdata.providers._factory import create_provider
 
-
+            local_provider: _IP = create_provider(
                 provider, path=path, dsn=dsn, redis=redis, **kwargs
             )
         elif provider is not None:
@@ -160,27 +191,25 @@ class Index:
                 f"provider must be an IndexProvider or backend name string, "
                 f"got {type(provider).__name__}"
             )
-
+            local_provider = provider
         elif redis is not None:
-            # Explicit Redis connection provided
            from atdata.providers._redis import RedisProvider
 
-
+            local_provider = RedisProvider(redis)
         elif kwargs:
-
+            from redis import Redis as _Redis
             from atdata.providers._redis import RedisProvider
 
-
+            local_provider = RedisProvider(_Redis(**kwargs))
         else:
-            # Default: zero-dependency SQLite
             from atdata.providers._sqlite import SqliteProvider
 
-
-
-            self._data_store = data_store
+            local_provider = SqliteProvider()
 
-        #
-
+        # Build the unified repos dict with "local" always present
+        self._repos: dict[str, _Repo] = {
+            "local": _Repo(provider=local_provider, data_store=data_store),
+        }
 
         if repos is not None:
             if "local" in repos:
@@ -194,9 +223,7 @@ class Index:
                     f"repos[{name!r}] must be a Repository, "
                     f"got {type(repo).__name__}"
                 )
-            self._repos
-        else:
-            self._repos = {}
+            self._repos.update(repos)
 
         # Atmosphere backend (lazy or explicit)
         from atdata.repository import _AtmosphereBackend
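Reviewer note: every constructor branch now funnels into a single `local_provider`, which seeds the unified `self._repos` dict. A hedged construction sketch follows; the `Index` import path and the `"lab"` mount name are illustrative, while the `Repository` keywords and `SqliteProvider()` call are taken from the hunks above:

```python
from atdata.index import Index            # assumed export path
from atdata.repository import Repository
from atdata.providers._sqlite import SqliteProvider

# Default: zero-dependency SQLite provider backing the built-in "local" repo.
index = Index()

# Mount an extra named repository alongside "local". Passing a repo literally
# named "local" is rejected by the constructor.
lab = Repository(provider=SqliteProvider(), data_store=None)
index = Index(repos={"lab": lab})

assert "local" in index.repos and "lab" in index.repos
```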
@@ -230,10 +257,10 @@ class Index:
         """Get the atmosphere backend, lazily creating anonymous client if needed."""
         if self._atmosphere_deferred and self._atmosphere is None:
             try:
-                from atdata.atmosphere.client import
+                from atdata.atmosphere.client import Atmosphere
                 from atdata.repository import _AtmosphereBackend
 
-                client =
+                client = Atmosphere()
                 self._atmosphere = _AtmosphereBackend(client)
             except ImportError:
                 # atproto package not installed -- atmosphere unavailable
@@ -289,13 +316,13 @@ class Index:
         return ("local", ref, None)
 
     @property
-    def repos(self) -> dict[str, Repository]:
-        """
+    def repos(self) -> dict[str, "Repository"]:
+        """All repositories mounted on this index (including ``"local"``)."""
         return dict(self._repos)
 
     @property
     def atmosphere(self) -> Any:
-        """The
+        """The Atmosphere for this index, or None if disabled.
 
         Returns the underlying client (not the internal backend wrapper).
         """
@@ -304,10 +331,15 @@ class Index:
             return backend.client
         return None
 
+    @property
+    def _provider(self) -> "IndexProvider":  # noqa: F821
+        """IndexProvider for the ``"local"`` repository (backward compat)."""
+        return self._repos["local"].provider
+
     @property
     def provider(self) -> "IndexProvider":  # noqa: F821
-        """The storage provider backing
-        return self.
+        """The storage provider backing the ``"local"`` repository."""
+        return self._repos["local"].provider
 
     @property
     def _redis(self) -> Redis:
@@ -318,17 +350,23 @@ class Index:
         """
         from atdata.providers._redis import RedisProvider
 
-
-
+        prov = self._repos["local"].provider
+        if isinstance(prov, RedisProvider):
+            return prov.redis
         raise AttributeError(
             "Index._redis is only available with a Redis provider. "
             "Use index.provider instead."
         )
 
+    @property
+    def _data_store(self) -> AbstractDataStore | None:
+        """Data store for the ``"local"`` repository (backward compat)."""
+        return self._repos["local"].data_store
+
     @property
     def data_store(self) -> AbstractDataStore | None:
         """The data store for writing shards, or None if index-only."""
-        return self.
+        return self._repos["local"].data_store
 
     @property
     def stub_dir(self) -> Path | None:
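Reviewer note: the underscore properties above exist so older internal call sites keep working after `_data_store` and `_provider` stopped being instance attributes. A minimal sketch of the invariant they establish (the `Index` import path is assumed):

```python
from atdata.index import Index  # assumed export path

index = Index()
assert index.provider is index.repos["local"].provider
assert index._provider is index.provider                      # compat alias
assert index._data_store == index.repos["local"].data_store   # may be None
```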
@@ -351,7 +389,7 @@ class Index:
         as attributes on this namespace.
 
         Examples:
-            >>> index.load_schema("atdata://local/
+            >>> index.load_schema("atdata://local/schema/MySample@1.0.0")
            >>> MyType = index.types.MySample
            >>> sample = MyType(name="hello", value=42)
 
@@ -368,7 +406,7 @@ class Index:
         in the :attr:`types` namespace for easy access.
 
         Args:
-            ref: Schema reference string (atdata://local/
+            ref: Schema reference string (atdata://local/schema/... or
                 legacy local://schemas/...).
 
         Returns:
@@ -381,11 +419,11 @@ class Index:
 
         Examples:
             >>> # Load and use immediately
-            >>> MyType = index.load_schema("atdata://local/
+            >>> MyType = index.load_schema("atdata://local/schema/MySample@1.0.0")
             >>> sample = MyType(field1="hello", field2=42)
             >>>
             >>> # Or access later via namespace
-            >>> index.load_schema("atdata://local/
+            >>> index.load_schema("atdata://local/schema/OtherType@1.0.0")
             >>> other = index.types.OtherType(data="test")
         """
         # Decode the schema (uses generated module if auto_stubs enabled)
@@ -465,6 +503,9 @@ class Index:
     ) -> LocalDatasetEntry:
         """Add a dataset to the local repository index.
 
+        .. deprecated::
+            Use :meth:`insert_dataset` instead.
+
         Args:
             ds: The dataset to add to the index.
             name: Human-readable name for the dataset.
@@ -474,6 +515,13 @@ class Index:
         Returns:
             The created LocalDatasetEntry object.
         """
+        import warnings
+
+        warnings.warn(
+            "Index.add_entry() is deprecated, use Index.insert_dataset()",
+            DeprecationWarning,
+            stacklevel=2,
+        )
         return self._insert_dataset_to_provider(
             ds,
             name=name,
|
|
|
513
561
|
|
|
514
562
|
# AbstractIndex protocol methods
|
|
515
563
|
|
|
564
|
+
@staticmethod
|
|
565
|
+
def _ensure_schema_stored(
|
|
566
|
+
schema_ref: str,
|
|
567
|
+
sample_type: type,
|
|
568
|
+
provider: "IndexProvider", # noqa: F821
|
|
569
|
+
) -> None:
|
|
570
|
+
"""Persist the schema definition if not already stored.
|
|
571
|
+
|
|
572
|
+
Called during dataset insertion so that ``decode_schema()`` can
|
|
573
|
+
reconstruct the type later without the caller needing to publish
|
|
574
|
+
the schema separately.
|
|
575
|
+
"""
|
|
576
|
+
schema_name, version = _parse_schema_ref(schema_ref)
|
|
577
|
+
if provider.get_schema_json(schema_name, version) is None:
|
|
578
|
+
record = _build_schema_record(sample_type, version=version)
|
|
579
|
+
provider.store_schema(schema_name, version, json.dumps(record))
|
|
580
|
+
|
|
516
581
|
def _insert_dataset_to_provider(
|
|
517
582
|
self,
|
|
518
583
|
ds: Dataset,
|
|
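Reviewer note: the practical effect of `_ensure_schema_stored` is that a plain insert is enough for later decoding. A hedged sketch of the round trip (`index` and `ds` as in the surrounding docstring examples):

```python
# Hedged sketch: the schema record is persisted as a side effect of insert,
# so decode_schema can rebuild the sample type without a separate publish.
entry = index.insert_dataset(ds, name="mnist")
SampleType = index.decode_schema(entry.schema_ref)  # reconstructed type
```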
@@ -528,21 +593,36 @@ class Index:
         This is the internal implementation shared by all local and named
         repository inserts.
         """
+        from atdata._logging import get_logger
+
+        log = get_logger()
         metadata = kwargs.get("metadata")
 
         if store is not None:
             prefix = kwargs.get("prefix", name)
             cache_local = kwargs.get("cache_local", False)
+            log.debug(
+                "_insert_dataset_to_provider: name=%s, store=%s",
+                name,
+                type(store).__name__,
+            )
 
             written_urls = store.write_shards(
                 ds,
                 prefix=prefix,
                 cache_local=cache_local,
             )
+            log.info(
+                "_insert_dataset_to_provider: %d shard(s) written for %s",
+                len(written_urls),
+                name,
+            )
 
             if schema_ref is None:
                 schema_ref = _schema_ref_from_type(ds.sample_type, version="1.0.0")
 
+            self._ensure_schema_stored(schema_ref, ds.sample_type, provider)
+
             entry_metadata = metadata if metadata is not None else ds._metadata
             entry = LocalDatasetEntry(
                 name=name,
@@ -551,12 +631,15 @@ class Index:
                 metadata=entry_metadata,
             )
             provider.store_entry(entry)
+            log.debug("_insert_dataset_to_provider: entry stored for %s", name)
             return entry
 
         # No data store - just index the existing URL
         if schema_ref is None:
             schema_ref = _schema_ref_from_type(ds.sample_type, version="1.0.0")
 
+        self._ensure_schema_stored(schema_ref, ds.sample_type, provider)
+
         data_urls = [ds.url]
         entry_metadata = metadata if metadata is not None else ds._metadata
 
@@ -567,6 +650,7 @@ class Index:
             metadata=entry_metadata,
         )
         provider.store_entry(entry)
+        log.debug("_insert_dataset_to_provider: entry stored for %s", name)
         return entry
 
     def insert_dataset(
@@ -575,66 +659,379 @@ class Index:
         *,
         name: str,
         schema_ref: str | None = None,
+        description: str | None = None,
+        tags: list[str] | None = None,
+        license: str | None = None,
+        data_store: AbstractDataStore | None = None,
+        force: bool = False,
+        copy: bool = False,
+        metadata: dict | None = None,
+        _data_urls: list[str] | None = None,
+        _blob_refs: list[dict] | None = None,
         **kwargs,
     ) -> "IndexEntry":
-        """Insert a dataset into the index
+        """Insert a dataset into the index.
 
         The target repository is determined by a prefix in the ``name``
         argument (e.g. ``"lab/mnist"``). If no prefix is given, or the
         prefix is ``"local"``, the built-in local repository is used.
 
-
-
-
+        For atmosphere targets:
+
+        - **Local sources** are uploaded via *data_store* (defaults to
+          ``PDSBlobStore``).
+        - **Public remote sources** (http/https) are referenced as
+          external URLs unless *copy* is ``True``.
+        - **Credentialed sources** (e.g. ``S3Source``) raise an error
+          unless *copy* is ``True`` or *data_store* is provided, to
+          prevent leaking private endpoints.
 
         Args:
             ds: The Dataset to register.
             name: Human-readable name for the dataset, optionally prefixed
                 with a repository name (e.g. ``"lab/mnist"``).
             schema_ref: Optional schema reference.
-
-
-
-
+            description: Optional dataset description (atmosphere only).
+            tags: Optional tags for discovery (atmosphere only).
+            license: Optional license identifier (atmosphere only).
+            data_store: Explicit data store for shard storage. When
+                provided, data is always copied through this store.
+            force: If True, bypass PDS size limits (50 MB per shard,
+                1 GB total). Default: ``False``.
+            copy: If True, copy data to the destination store even for
+                remote sources. Required for credentialed sources
+                targeting the atmosphere. Default: ``False``.
+            metadata: Optional metadata dict.
 
         Returns:
             IndexEntry for the inserted dataset.
+
+        Raises:
+            ValueError: If atmosphere limits are exceeded (when
+                *force* is ``False``), or if a credentialed source
+                targets the atmosphere without *copy*.
         """
+        from atdata.atmosphere.store import PDS_TOTAL_DATASET_LIMIT_BYTES
+
         backend_key, resolved_name, handle_or_did = self._resolve_prefix(name)
+        is_atmosphere = backend_key == "_atmosphere"
 
-        if
+        if is_atmosphere:
             atmo = self._get_atmosphere()
             if atmo is None:
                 raise ValueError(
                     f"Atmosphere backend required for name {name!r} but not available."
                 )
-            return atmo.insert_dataset(
-                ds, name=resolved_name, schema_ref=schema_ref, **kwargs
-            )
 
-
-
+            # Providing an explicit data_store implies copy behaviour
+            needs_copy = copy or data_store is not None
+
+            # Credentialed source guard
+            if _is_credentialed_source(ds) and not needs_copy:
+                raise ValueError(
+                    "Dataset uses a credentialed source. Referencing "
+                    "these URLs in a public atmosphere record would "
+                    "leak private endpoints. Pass copy=True to copy "
+                    "data to the destination store (default: PDS blobs)."
+                )
+
+            # If we already have pre-written URLs (from write_samples),
+            # go straight to publish.
+            if _data_urls is not None:
+                return atmo.insert_dataset(
+                    ds,
+                    name=resolved_name,
+                    schema_ref=schema_ref,
+                    data_urls=_data_urls,
+                    blob_refs=_blob_refs,
+                    description=description,
+                    tags=tags,
+                    license=license,
+                    metadata=metadata,
+                    **kwargs,
+                )
+
+            # Determine whether data must be copied
+            source_is_local = _is_local_path(ds.url)
+
+            if source_is_local or needs_copy:
+                # Resolve effective store
+                if data_store is not None:
+                    effective_store = data_store
+                else:
+                    from atdata.atmosphere.store import PDSBlobStore
+
+                    effective_store = PDSBlobStore(atmo.client)
+
+                # Size guard
+                if not force:
+                    total_bytes = _estimate_dataset_bytes(ds)
+                    if total_bytes > PDS_TOTAL_DATASET_LIMIT_BYTES:
+                        raise ValueError(
+                            f"Total dataset size ({total_bytes} bytes) "
+                            f"exceeds atmosphere limit "
+                            f"({PDS_TOTAL_DATASET_LIMIT_BYTES} bytes). "
+                            f"Pass force=True to bypass."
+                        )
+
+                result = effective_store.write_shards(ds, prefix=resolved_name)
+
+                # ShardUploadResult carries blob_refs; plain list does not
+                blob_refs = getattr(result, "blob_refs", None) or None
+
+                return atmo.insert_dataset(
+                    ds,
+                    name=resolved_name,
+                    schema_ref=schema_ref,
+                    data_urls=list(result),
+                    blob_refs=blob_refs,
+                    description=description,
+                    tags=tags,
+                    license=license,
+                    metadata=metadata,
+                    **kwargs,
+                )
+
+            # Public remote source — reference existing URLs
+            data_urls = ds.list_shards()
+            return atmo.insert_dataset(
                 ds,
                 name=resolved_name,
                 schema_ref=schema_ref,
-
-
+                data_urls=data_urls,
+                description=description,
+                tags=tags,
+                license=license,
+                metadata=metadata,
                 **kwargs,
             )
 
-        #
+        # --- Local / named repo path ---
         repo = self._repos.get(backend_key)
         if repo is None:
             raise KeyError(f"Unknown repository {backend_key!r} in name {name!r}")
+
+        effective_store = data_store or repo.data_store
         return self._insert_dataset_to_provider(
             ds,
             name=resolved_name,
             schema_ref=schema_ref,
             provider=repo.provider,
-            store=
+            store=effective_store,
+            metadata=metadata,
             **kwargs,
         )
 
+    def write_samples(
+        self,
+        samples: Iterable,
+        *,
+        name: str,
+        schema_ref: str | None = None,
+        description: str | None = None,
+        tags: list[str] | None = None,
+        license: str | None = None,
+        maxcount: int = 10_000,
+        maxsize: int | None = None,
+        metadata: dict | None = None,
+        manifest: bool = False,
+        data_store: AbstractDataStore | None = None,
+        force: bool = False,
+    ) -> "IndexEntry":
+        """Write samples and create an index entry in one step.
+
+        This is the primary method for publishing data. It serializes
+        samples to WebDataset tar files, stores them via the appropriate
+        backend, and creates an index entry.
+
+        The target backend is determined by the *name* prefix:
+
+        - Bare name (e.g., ``"mnist"``): writes to the local repository.
+        - ``"@handle/name"``: writes and publishes to the atmosphere.
+        - ``"repo/name"``: writes to a named repository.
+
+        For atmosphere targets, data is uploaded as PDS blobs by default.
+        Shard size is capped at 50 MB and total dataset size at 1 GB
+        unless *force* is ``True``.
+
+        When the local backend has no ``data_store`` configured, a
+        ``LocalDiskStore`` is created automatically at
+        ``~/.atdata/data/`` so that samples have persistent storage.
+
+        Args:
+            samples: Iterable of ``Packable`` samples. Must be non-empty.
+            name: Dataset name, optionally prefixed with target.
+            schema_ref: Optional schema reference. Auto-generated if ``None``.
+            description: Optional dataset description (atmosphere only).
+            tags: Optional tags for discovery (atmosphere only).
+            license: Optional license identifier (atmosphere only).
+            maxcount: Max samples per shard. Default: 10,000.
+            maxsize: Max bytes per shard. For atmosphere targets defaults
+                to 50 MB (PDS blob limit). For local targets defaults to
+                ``None`` (unlimited).
+            metadata: Optional metadata dict stored with the entry.
+            manifest: If True, write per-shard manifest sidecar files
+                alongside each tar. Default: ``False``.
+            data_store: Explicit data store for shard storage. Overrides
+                the repository's default store. For atmosphere targets
+                defaults to ``PDSBlobStore``.
+            force: If True, bypass PDS size limits (50 MB per shard,
+                1 GB total dataset). Default: ``False``.
+
+        Returns:
+            IndexEntry for the created dataset.
+
+        Raises:
+            ValueError: If *samples* is empty, or if atmosphere size
+                limits are exceeded (when *force* is ``False``).
+
+        Examples:
+            >>> index = Index()
+            >>> samples = [MySample(key="0", text="hello")]
+            >>> entry = index.write_samples(samples, name="my-dataset")
+        """
+        import tempfile
+
+        from atdata.dataset import write_samples as _write_samples
+        from atdata.atmosphere.store import (
+            PDS_BLOB_LIMIT_BYTES,
+            PDS_TOTAL_DATASET_LIMIT_BYTES,
+        )
+        from atdata._logging import log_operation
+
+        backend_key, resolved_name, _ = self._resolve_prefix(name)
+        is_atmosphere = backend_key == "_atmosphere"
+
+        with log_operation("Index.write_samples", name=name):
+            # --- Atmosphere size guards ---
+            if is_atmosphere and not force:
+                if maxsize is not None and maxsize > PDS_BLOB_LIMIT_BYTES:
+                    raise ValueError(
+                        f"maxsize={maxsize} exceeds PDS blob limit "
+                        f"({PDS_BLOB_LIMIT_BYTES} bytes). "
+                        f"Pass force=True to bypass."
+                    )
+
+            # Default maxsize for atmosphere targets
+            effective_maxsize = maxsize
+            if is_atmosphere and effective_maxsize is None:
+                effective_maxsize = PDS_BLOB_LIMIT_BYTES
+
+            # Resolve the effective data store
+            if is_atmosphere:
+                atmo = self._get_atmosphere()
+                if atmo is None:
+                    raise ValueError(
+                        f"Atmosphere backend required for name {name!r} but not available."
+                    )
+                if data_store is None:
+                    from atdata.atmosphere.store import PDSBlobStore
+
+                    effective_store: AbstractDataStore | None = PDSBlobStore(
+                        atmo.client
+                    )
+                else:
+                    effective_store = data_store
+            else:
+                repo = self._repos.get(backend_key)
+                effective_store = data_store or (
+                    repo.data_store if repo is not None else None
+                )
+                needs_auto_store = repo is not None and effective_store is None
+                if needs_auto_store:
+                    from atdata.stores._disk import LocalDiskStore
+
+                    effective_store = LocalDiskStore()
+
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                tmp_path = Path(tmp_dir) / "data.tar"
+                ds = _write_samples(
+                    samples,
+                    tmp_path,
+                    maxcount=maxcount,
+                    maxsize=effective_maxsize,
+                    manifest=manifest,
+                )
+
+                # Atmosphere total-size guard (after writing so we can measure)
+                if is_atmosphere and not force:
+                    total_bytes = _estimate_dataset_bytes(ds)
+                    if total_bytes > PDS_TOTAL_DATASET_LIMIT_BYTES:
+                        raise ValueError(
+                            f"Total dataset size ({total_bytes} bytes) exceeds "
+                            f"atmosphere limit ({PDS_TOTAL_DATASET_LIMIT_BYTES} "
+                            f"bytes). Pass force=True to bypass."
+                        )
+
+                if is_atmosphere:
+                    # Write shards through the store, then publish record
+                    # with the resulting URLs (not the temp paths).
+                    written_urls = effective_store.write_shards(
+                        ds, prefix=resolved_name
+                    )
+
+                    # If write_shards returned blob refs (e.g. ShardUploadResult),
+                    # use storageBlobs so the PDS retains the uploaded blobs.
+                    # Fall back to storageExternal with AT URIs otherwise.
+                    blob_refs = getattr(written_urls, "blob_refs", None) or None
+
+                    return self.insert_dataset(
+                        ds,
+                        name=name,
+                        schema_ref=schema_ref,
+                        metadata=metadata,
+                        description=description,
+                        tags=tags,
+                        license=license,
+                        data_store=data_store,
+                        force=force,
+                        _data_urls=written_urls,
+                        _blob_refs=blob_refs,
+                    )
+
+                # Local / named repo path
+                repo = self._repos.get(backend_key)
+                if repo is not None and effective_store is not None:
+                    return self._insert_dataset_to_provider(
+                        ds,
+                        name=resolved_name,
+                        schema_ref=schema_ref,
+                        provider=repo.provider,
+                        store=effective_store,
+                        metadata=metadata,
+                    )
+
+                return self.insert_dataset(
+                    ds,
+                    name=name,
+                    schema_ref=schema_ref,
+                    metadata=metadata,
+                    description=description,
+                    tags=tags,
+                    license=license,
+                )
+
+    def write(
+        self,
+        samples: Iterable,
+        *,
+        name: str,
+        **kwargs: Any,
+    ) -> "IndexEntry":
+        """Write samples and create an index entry.
+
+        .. deprecated::
+            Use :meth:`write_samples` instead.
+        """
+        import warnings
+
+        warnings.warn(
+            "Index.write() is deprecated, use Index.write_samples()",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        return self.write_samples(samples, name=name, **kwargs)
+
     def get_dataset(self, ref: str) -> "IndexEntry":
         """Get a dataset entry by name or prefixed reference.
 
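Reviewer note: `write_samples` is now the one-step publish path. A hedged usage sketch follows; `MySample` is a hypothetical `Packable` dataclass, the handle is illustrative, and the `Index` import path is assumed:

```python
from atdata.index import Index  # assumed export path

index = Index()
samples = [MySample(key="0", text="hello")]  # MySample: hypothetical sample type

# Local target: with no data_store configured, shards land in an
# auto-created LocalDiskStore under ~/.atdata/data/ (per the docstring).
entry = index.write_samples(samples, name="my-dataset")

# Atmosphere target: the @handle prefix routes shards through PDSBlobStore,
# capped at 50 MB per shard and 1 GB per dataset unless force=True.
entry = index.write_samples(
    samples,
    name="@alice.example.com/my-dataset",
    description="toy dataset",
    tags=["demo"],
)
```

Note the internal handshake: `write_samples` passes its already-uploaded URLs back into `insert_dataset` via the private `_data_urls`/`_blob_refs` parameters, so shards are never written twice.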
@@ -659,14 +1056,10 @@ class Index:
             if atmo is None:
                 raise ValueError(
                     f"Atmosphere backend required for path {ref!r} but not available. "
-                    "Install 'atproto' or pass an
+                    "Install 'atproto' or pass an Atmosphere."
                 )
             return atmo.get_dataset(resolved_ref)
 
-        if backend_key == "local":
-            return self._provider.get_entry_by_name(resolved_ref)
-
-        # Named repository
         repo = self._repos.get(backend_key)
         if repo is None:
             raise KeyError(f"Unknown repository {backend_key!r} in ref {ref!r}")
@@ -676,14 +1069,13 @@ class Index:
     def datasets(self) -> Generator["IndexEntry", None, None]:
         """Lazily iterate over all dataset entries across local repositories.
 
-        Yields entries from
-
+        Yields entries from all mounted repositories (``"local"`` and named).
+        Atmosphere entries are not included (use
         ``list_datasets(repo="_atmosphere")`` for those).
 
         Yields:
             IndexEntry for each dataset.
         """
-        yield from self._provider.iter_entries()
         for repo in self._repos.values():
             yield from repo.provider.iter_entries()
 
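Reviewer note: with ``"local"`` folded into `_repos`, lookup and iteration no longer special-case it. A sketch of the resulting behavior (dataset names illustrative, `Index` import path assumed):

```python
from atdata.index import Index  # assumed export path

index = Index()
entry = index.get_dataset("mnist")      # bare name -> built-in "local" repo
entry = index.get_dataset("lab/mnist")  # "repo/name" -> named repository

for e in index.datasets:                # every mounted repo; atmosphere excluded
    print(e.name, e.schema_ref)
```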
@@ -702,9 +1094,6 @@ class Index:
         if repo is None:
             return list(self.datasets)
 
-        if repo == "local":
-            return self.list_entries()
-
         if repo == "_atmosphere":
             atmo = self._get_atmosphere()
             if atmo is None:
@@ -740,7 +1129,7 @@ class Index:
             the class docstring.
 
         Returns:
-            Schema reference string: 'atdata://local/
+            Schema reference string: 'atdata://local/schema/{name}@{version}'.
 
         Raises:
             ValueError: If sample_type is not a dataclass.
@@ -794,7 +1183,7 @@ class Index:
 
         Args:
             ref: Schema reference string. Supports both new format
-                (atdata://local/
+                (atdata://local/schema/{name}@{version}) and legacy
                 format (local://schemas/{module.Class}@{version}).
 
         Returns:
@@ -871,7 +1260,7 @@ class Index:
         The returned class has proper type information that IDEs can understand.
 
         Args:
-            ref: Schema reference string (atdata://local/
+            ref: Schema reference string (atdata://local/schema/... or
                 legacy local://schemas/...).
 
         Returns:
@@ -938,3 +1327,159 @@ class Index:
         if self._stub_manager is not None:
             return self._stub_manager.clear_stubs()
         return 0
+
+    # -- Atmosphere promotion --
+
+    def promote_entry(
+        self,
+        entry_name: str,
+        *,
+        name: str | None = None,
+        description: str | None = None,
+        tags: list[str] | None = None,
+        license: str | None = None,
+    ) -> str:
+        """Promote a locally-indexed dataset to the atmosphere.
+
+        .. deprecated::
+            Use :meth:`insert_dataset` instead.
+
+        Args:
+            entry_name: Name of the local dataset entry to promote.
+            name: Override name for the atmosphere record. Defaults to
+                the local entry name.
+            description: Optional description for the dataset.
+            tags: Optional tags for discovery.
+            license: Optional license identifier.
+
+        Returns:
+            AT URI of the created atmosphere dataset record.
+
+        Raises:
+            ValueError: If atmosphere backend is not available, or
+                the local entry has no data URLs.
+            KeyError: If the entry or its schema is not found.
+
+        Examples:
+            >>> index = Index(atmosphere=client)
+            >>> uri = index.promote_entry("mnist-train")
+        """
+        import warnings
+
+        warnings.warn(
+            "Index.promote_entry() is deprecated, use Index.insert_dataset()",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        from atdata.promote import _find_or_publish_schema
+        from atdata.atmosphere import DatasetPublisher
+        from atdata._schema_codec import schema_to_type
+        from atdata._logging import log_operation
+
+        atmo = self._get_atmosphere()
+        if atmo is None:
+            raise ValueError("Atmosphere backend required but not available.")
+
+        with log_operation("Index.promote_entry", entry_name=entry_name):
+            entry = self.get_entry_by_name(entry_name)
+            if not entry.data_urls:
+                raise ValueError(f"Local entry {entry_name!r} has no data URLs")
+
+            schema_record = self.get_schema(entry.schema_ref)
+            sample_type = schema_to_type(schema_record)
+            schema_version = schema_record.get("version", "1.0.0")
+
+            atmosphere_schema_uri = _find_or_publish_schema(
+                sample_type,
+                schema_version,
+                atmo.client,
+                description=schema_record.get("description"),
+            )
+
+            publisher = DatasetPublisher(atmo.client)
+            uri = publisher.publish_with_urls(
+                urls=entry.data_urls,
+                schema_uri=atmosphere_schema_uri,
+                name=name or entry.name,
+                description=description,
+                tags=tags,
+                license=license,
+                metadata=entry.metadata,
+            )
+            return str(uri)
+
+    def promote_dataset(
+        self,
+        dataset: Dataset,
+        *,
+        name: str,
+        sample_type: type | None = None,
+        schema_version: str = "1.0.0",
+        description: str | None = None,
+        tags: list[str] | None = None,
+        license: str | None = None,
+    ) -> str:
+        """Publish a Dataset directly to the atmosphere.
+
+        .. deprecated::
+            Use :meth:`insert_dataset` instead.
+
+        Args:
+            dataset: The Dataset to publish.
+            name: Name for the atmosphere dataset record.
+            sample_type: Sample type for schema publishing. Inferred from
+                ``dataset.sample_type`` if not provided.
+            schema_version: Semantic version for the schema. Default: ``"1.0.0"``.
+            description: Optional description for the dataset.
+            tags: Optional tags for discovery.
+            license: Optional license identifier.
+
+        Returns:
+            AT URI of the created atmosphere dataset record.
+
+        Raises:
+            ValueError: If atmosphere backend is not available.
+
+        Examples:
+            >>> index = Index(atmosphere=client)
+            >>> ds = atdata.load_dataset("./data.tar", MySample, split="train")
+            >>> uri = index.promote_dataset(ds, name="my-dataset")
+        """
+        import warnings
+
+        warnings.warn(
+            "Index.promote_dataset() is deprecated, use Index.insert_dataset()",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        from atdata.promote import _find_or_publish_schema
+        from atdata.atmosphere import DatasetPublisher
+        from atdata._logging import log_operation
+
+        atmo = self._get_atmosphere()
+        if atmo is None:
+            raise ValueError("Atmosphere backend required but not available.")
+
+        with log_operation("Index.promote_dataset", name=name):
+            st = sample_type or dataset.sample_type
+
+            atmosphere_schema_uri = _find_or_publish_schema(
+                st,
+                schema_version,
+                atmo.client,
+                description=description,
+            )
+
+            data_urls = dataset.list_shards()
+
+            publisher = DatasetPublisher(atmo.client)
+            uri = publisher.publish_with_urls(
+                urls=data_urls,
+                schema_uri=atmosphere_schema_uri,
+                name=name,
+                description=description,
+                tags=tags,
+                license=license,
+                metadata=dataset._metadata,
+            )
+            return str(uri)