atdata-0.3.1b1-py3-none-any.whl → atdata-0.3.2b1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- atdata/__init__.py +2 -0
- atdata/_hf_api.py +13 -0
- atdata/_logging.py +43 -0
- atdata/_protocols.py +18 -1
- atdata/_sources.py +24 -4
- atdata/atmosphere/__init__.py +48 -10
- atdata/atmosphere/_lexicon_types.py +595 -0
- atdata/atmosphere/_types.py +71 -243
- atdata/atmosphere/lens.py +49 -41
- atdata/atmosphere/records.py +282 -90
- atdata/atmosphere/schema.py +78 -50
- atdata/atmosphere/store.py +62 -59
- atdata/dataset.py +201 -135
- atdata/index/_entry.py +6 -2
- atdata/index/_index.py +396 -109
- atdata/lexicons/__init__.py +9 -3
- atdata/lexicons/ac.foundation.dataset.lens.json +2 -0
- atdata/lexicons/ac.foundation.dataset.record.json +22 -1
- atdata/lexicons/ac.foundation.dataset.storageBlobs.json +26 -4
- atdata/lexicons/ac.foundation.dataset.storageExternal.json +1 -1
- atdata/lexicons/ac.foundation.dataset.storageHttp.json +45 -0
- atdata/lexicons/ac.foundation.dataset.storageS3.json +61 -0
- atdata/manifest/__init__.py +4 -0
- atdata/manifest/_proxy.py +321 -0
- atdata/repository.py +59 -9
- atdata/stores/_disk.py +19 -11
- atdata/stores/_s3.py +134 -112
- {atdata-0.3.1b1.dist-info → atdata-0.3.2b1.dist-info}/METADATA +1 -1
- {atdata-0.3.1b1.dist-info → atdata-0.3.2b1.dist-info}/RECORD +37 -33
- {atdata-0.3.1b1.dist-info → atdata-0.3.2b1.dist-info}/WHEEL +0 -0
- {atdata-0.3.1b1.dist-info → atdata-0.3.2b1.dist-info}/entry_points.txt +0 -0
- {atdata-0.3.1b1.dist-info → atdata-0.3.2b1.dist-info}/licenses/LICENSE +0 -0
atdata/__init__.py
CHANGED
@@ -90,6 +90,7 @@ from ._schema_codec import (
 from ._logging import (
     configure_logging as configure_logging,
     get_logger as get_logger,
+    log_operation as log_operation,
 )
 
 from .repository import (
@@ -121,6 +122,7 @@ from .manifest import (
     ManifestWriter as ManifestWriter,
     QueryExecutor as QueryExecutor,
     SampleLocation as SampleLocation,
+    query_fields as query_fields,
 )
 
 # ATProto integration (lazy import to avoid requiring atproto package)
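Both new re-exports become importable from the package root after this change. A minimal sketch, assuming the default logger needs no extra configuration (the query_fields call signature is not shown in this diff, so only its import is illustrated; log_operation is defined in the _logging.py hunk further down):

    from atdata import log_operation, query_fields  # both newly re-exported in 0.3.2b1

    # log_operation is a context manager (see atdata/_logging.py below)
    with log_operation("demo", items=3):
        pass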
atdata/_hf_api.py
CHANGED
@@ -714,12 +714,23 @@ def load_dataset(
         >>> index = Index()
         >>> ds = load_dataset("@local/my-dataset", index=index, split="train")
     """
+    from ._logging import get_logger
+
+    log = get_logger()
+    log.info(
+        "load_dataset: path=%s, split=%s, sample_type=%s",
+        path,
+        split,
+        sample_type.__name__ if sample_type is not None else "None",
+    )
+
     # Handle @handle/dataset indexed path resolution
     if _is_indexed_path(path):
         if index is None:
             index = get_default_index()
 
         source, schema_ref = _resolve_indexed_path(path, index)
+        log.debug("load_dataset: resolved indexed path, schema_ref=%s", schema_ref)
 
     # Resolve sample_type from schema if not provided
     resolved_type: Type = (
@@ -746,6 +757,8 @@ def load_dataset(
     if not splits_shards:
         raise FileNotFoundError(f"No data files found at path: {path}")
 
+    log.debug("load_dataset: resolved %d split(s) from path", len(splits_shards))
+
     # Build Dataset for each split
     datasets: dict[str, Dataset] = {}
     for split_name, shards in splits_shards.items():
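The call below mirrors the docstring example already present in load_dataset; the top-level import path is assumed for illustration, and the commented lines sketch roughly what the new log.info/log.debug calls emit once a logger is configured:

    from atdata import Index, load_dataset  # top-level import path assumed

    index = Index()
    ds = load_dataset("@local/my-dataset", index=index, split="train")
    # With a logger configured, 0.3.2b1 reports roughly:
    #   load_dataset: path=@local/my-dataset, split=train, sample_type=None
    #   load_dataset: resolved indexed path, schema_ref=...
    #   load_dataset: resolved 1 split(s) from path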
atdata/_logging.py
CHANGED
@@ -22,7 +22,10 @@ custom logger implementations.
 
 from __future__ import annotations
 
+import contextlib
 import logging
+import time
+from collections.abc import Generator
 from typing import Any, Protocol, runtime_checkable
 
 
@@ -68,3 +71,43 @@ def get_logger() -> LoggerProtocol:
     whatever was last set via :func:`configure_logging`.
     """
     return _logger
+
+
+@contextlib.contextmanager
+def log_operation(op_name: str, **context: Any) -> Generator[None, None, None]:
+    """Log the start, completion, and duration of an operation.
+
+    Emits an ``info`` message on entry and on successful completion
+    (with elapsed time), or an ``error`` message if an exception
+    propagates out.
+
+    Args:
+        op_name: Short label for the operation (e.g. ``"write_samples"``).
+        **context: Arbitrary key-value pairs included in every log message.
+
+    Examples:
+        >>> with log_operation("write_samples", shard_count=10):
+        ...     do_work()
+    """
+    log = get_logger()
+    ctx_str = ", ".join(f"{k}={v}" for k, v in context.items())
+    if ctx_str:
+        log.info("%s: started (%s)", op_name, ctx_str)
+    else:
+        log.info("%s: started", op_name)
+    t0 = time.monotonic()
+    try:
+        yield
+    except Exception:
+        elapsed = time.monotonic() - t0
+        if ctx_str:
+            log.error("%s: failed after %.2fs (%s)", op_name, elapsed, ctx_str)
+        else:
+            log.error("%s: failed after %.2fs", op_name, elapsed)
+        raise
+    else:
+        elapsed = time.monotonic() - t0
+        if ctx_str:
+            log.info("%s: completed in %.2fs (%s)", op_name, elapsed, ctx_str)
+        else:
+            log.info("%s: completed in %.2fs", op_name, elapsed)
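A short usage sketch of the new context manager; the logged lines follow the format strings above (the elapsed times are illustrative):

    import time

    from atdata import log_operation  # re-exported at the package root in 0.3.2b1

    with log_operation("write_samples", shard_count=10):
        time.sleep(0.1)  # stand-in for real work
    # Emitted via the configured logger:
    #   write_samples: started (shard_count=10)
    #   write_samples: completed in 0.10s (shard_count=10)
    #
    # An exception inside the block is logged as an error and re-raised:
    #   write_samples: failed after 0.10s (shard_count=10)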
atdata/_protocols.py
CHANGED
@@ -151,12 +151,14 @@ class AbstractIndex(Protocol):
 
     # Dataset operations
 
-    def
+    def write_samples(
         self,
         samples: Iterable,
         *,
         name: str,
         schema_ref: Optional[str] = None,
+        data_store: Optional["AbstractDataStore"] = None,
+        force: bool = False,
         **kwargs,
     ) -> IndexEntry:
         """Write samples and create an index entry in one step.
@@ -164,10 +166,15 @@ class AbstractIndex(Protocol):
         Serializes samples to WebDataset tar files, stores them via the
         appropriate backend, and creates an index entry.
 
+        For atmosphere targets, data is uploaded as PDS blobs by default
+        with size guards (50 MB per shard, 1 GB total).
+
         Args:
             samples: Iterable of Packable samples. Must be non-empty.
            name: Dataset name, optionally prefixed with target backend.
            schema_ref: Optional schema reference.
+            data_store: Explicit data store for shard storage.
+            force: Bypass PDS size limits.
            **kwargs: Backend-specific options (maxcount, description, etc.).
 
         Returns:
@@ -181,14 +188,24 @@ class AbstractIndex(Protocol):
         *,
         name: str,
         schema_ref: Optional[str] = None,
+        data_store: Optional["AbstractDataStore"] = None,
+        force: bool = False,
+        copy: bool = False,
         **kwargs,
     ) -> IndexEntry:
         """Register an existing dataset in the index.
 
+        For atmosphere targets, local sources are uploaded via
+        *data_store* (defaults to PDS blobs). Credentialed sources
+        require ``copy=True``.
+
         Args:
            ds: The Dataset to register.
            name: Human-readable name.
            schema_ref: Explicit schema ref; auto-published if ``None``.
+            data_store: Explicit data store for shard storage.
+            force: Bypass PDS size limits.
+            copy: Copy data to destination store even for remote sources.
            **kwargs: Backend-specific options.
         """
         ...
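A hedged sketch of the widened write_samples call; index stands for any concrete AbstractIndex implementation and samples for a non-empty iterable of Packable samples, neither of which is constructed here. The second method in the hunk (registering an existing Dataset) gains the same data_store/force keywords plus copy, but its name falls outside the diff context above, so it is not called by name below:

    from atdata._protocols import AbstractIndex  # protocol shown above

    def publish(index: AbstractIndex, samples) -> None:
        # samples: non-empty iterable of Packable samples (see the docstring above).
        entry = index.write_samples(
            samples,
            name="my-dataset",
            data_store=None,  # default: PDS blobs for atmosphere targets
            force=False,      # True bypasses the 50 MB/shard and 1 GB total guards
        )
        print(entry)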
atdata/_sources.py
CHANGED
@@ -64,10 +64,20 @@ class URLSource:
         """Expand brace pattern and return list of shard URLs."""
         return list(braceexpand.braceexpand(self.url))
 
-    # Legacy alias for backwards compatibility
     @property
     def shard_list(self) -> list[str]:
-        """Expand brace pattern and return list of shard URLs
+        """Expand brace pattern and return list of shard URLs.
+
+        .. deprecated::
+            Use :meth:`list_shards` instead.
+        """
+        import warnings
+
+        warnings.warn(
+            "shard_list is deprecated, use list_shards()",
+            DeprecationWarning,
+            stacklevel=2,
+        )
         return self.list_shards()
 
     @property
@@ -178,10 +188,20 @@ class S3Source:
         """Return list of S3 URIs for the shards."""
         return [f"s3://{self.bucket}/{key}" for key in self.keys]
 
-    # Legacy alias for backwards compatibility
     @property
     def shard_list(self) -> list[str]:
-        """Return list of S3 URIs for the shards
+        """Return list of S3 URIs for the shards.
+
+        .. deprecated::
+            Use :meth:`list_shards` instead.
+        """
+        import warnings
+
+        warnings.warn(
+            "shard_list is deprecated, use list_shards()",
+            DeprecationWarning,
+            stacklevel=2,
+        )
         return self.list_shards()
 
     @property
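A small sketch of the behavioural change: reading the deprecated shard_list property now emits a DeprecationWarning, while list_shards() stays silent. Constructing URLSource directly from a brace-pattern url is assumed here for illustration:

    import warnings

    from atdata._sources import URLSource  # module shown above

    src = URLSource(url="https://example.com/shard-{000000..000003}.tar")  # constructor assumed

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        urls = src.shard_list  # deprecated alias, still returns the shard URLs
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)

    urls = src.list_shards()  # preferred replacement
    print(len(urls))          # 4 shards after brace expansion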
atdata/atmosphere/__init__.py
CHANGED
@@ -34,11 +34,23 @@ from .schema import SchemaPublisher, SchemaLoader
 from .records import DatasetPublisher, DatasetLoader
 from .lens import LensPublisher, LensLoader
 from .store import PDSBlobStore
-from ._types import
-
-
-
-
+from ._types import AtUri, LEXICON_NAMESPACE
+from ._lexicon_types import (
+    LexSchemaRecord,
+    LexDatasetRecord,
+    LexLensRecord,
+    LexCodeReference,
+    JsonSchemaFormat,
+    StorageHttp,
+    StorageS3,
+    StorageBlobs,
+    ShardChecksum,
+    HttpShardEntry,
+    S3ShardEntry,
+    BlobEntry,
+    DatasetSize,
+    StorageUnion,
+    storage_from_record,
 )
 
 if TYPE_CHECKING:
@@ -70,11 +82,23 @@ class AtmosphereIndexEntry:
 
     @property
     def data_urls(self) -> list[str]:
-        """WebDataset URLs from
+        """WebDataset URLs from storage.
+
+        Handles storageHttp (shard URLs), storageS3 (s3:// URLs),
+        storageExternal (legacy), and storageBlobs (PDS blob URLs).
+        """
         storage = self._record.get("storage", {})
         storage_type = storage.get("$type", "")
+        if "storageHttp" in storage_type:
+            return [s["url"] for s in storage.get("shards", [])]
+        if "storageS3" in storage_type:
+            bucket = storage.get("bucket", "")
+            return [f"s3://{bucket}/{s['key']}" for s in storage.get("shards", [])]
         if "storageExternal" in storage_type:
             return storage.get("urls", [])
+        if "storageBlobs" in storage_type:
+            # Blob URLs must be resolved via PDS; return empty for now
+            return []
         return []
 
     @property
@@ -332,9 +356,23 @@ __all__ = [
     # Lens operations
     "LensPublisher",
     "LensLoader",
-    #
+    # Core types
     "AtUri",
-    "
-
-    "
+    "LEXICON_NAMESPACE",
+    # Lexicon-mirror types (Tier 1)
+    "LexSchemaRecord",
+    "LexDatasetRecord",
+    "LexLensRecord",
+    "LexCodeReference",
+    "JsonSchemaFormat",
+    "StorageHttp",
+    "StorageS3",
+    "StorageBlobs",
+    "StorageUnion",
+    "storage_from_record",
+    "ShardChecksum",
+    "HttpShardEntry",
+    "S3ShardEntry",
+    "BlobEntry",
+    "DatasetSize",
 ]