atdata 0.2.2b1__py3-none-any.whl → 0.2.3b1__py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- atdata/__init__.py +1 -1
- atdata/_cid.py +29 -35
- atdata/_helpers.py +7 -5
- atdata/_hf_api.py +48 -50
- atdata/_protocols.py +56 -71
- atdata/_schema_codec.py +33 -37
- atdata/_sources.py +57 -64
- atdata/_stub_manager.py +31 -26
- atdata/_type_utils.py +19 -5
- atdata/atmosphere/__init__.py +20 -23
- atdata/atmosphere/_types.py +11 -11
- atdata/atmosphere/client.py +11 -8
- atdata/atmosphere/lens.py +27 -30
- atdata/atmosphere/records.py +31 -37
- atdata/atmosphere/schema.py +33 -29
- atdata/atmosphere/store.py +16 -20
- atdata/cli/__init__.py +12 -3
- atdata/cli/diagnose.py +12 -8
- atdata/cli/local.py +4 -1
- atdata/dataset.py +284 -241
- atdata/lens.py +77 -82
- atdata/local.py +182 -169
- atdata/promote.py +18 -22
- {atdata-0.2.2b1.dist-info → atdata-0.2.3b1.dist-info}/METADATA +2 -1
- atdata-0.2.3b1.dist-info/RECORD +28 -0
- atdata-0.2.2b1.dist-info/RECORD +0 -28
- {atdata-0.2.2b1.dist-info → atdata-0.2.3b1.dist-info}/WHEEL +0 -0
- {atdata-0.2.2b1.dist-info → atdata-0.2.3b1.dist-info}/entry_points.txt +0 -0
- {atdata-0.2.2b1.dist-info → atdata-0.2.3b1.dist-info}/licenses/LICENSE +0 -0
atdata/atmosphere/schema.py
CHANGED
|
@@ -17,7 +17,6 @@ from ._types import (
|
|
|
17
17
|
LEXICON_NAMESPACE,
|
|
18
18
|
)
|
|
19
19
|
from .._type_utils import (
|
|
20
|
-
numpy_dtype_to_string,
|
|
21
20
|
unwrap_optional,
|
|
22
21
|
is_ndarray_type,
|
|
23
22
|
extract_ndarray_dtype,
|
|
@@ -25,6 +24,7 @@ from .._type_utils import (
|
|
|
25
24
|
|
|
26
25
|
# Import for type checking only to avoid circular imports
|
|
27
26
|
from typing import TYPE_CHECKING
|
|
27
|
+
|
|
28
28
|
if TYPE_CHECKING:
|
|
29
29
|
from ..dataset import PackableSample
|
|
30
30
|
|
|
@@ -37,21 +37,19 @@ class SchemaPublisher:
|
|
|
37
37
|
This class introspects a PackableSample class to extract its field
|
|
38
38
|
definitions and publishes them as an ATProto schema record.
|
|
39
39
|
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
>>> print(uri)
|
|
54
|
-
at://did:plc:.../ac.foundation.dataset.sampleSchema/...
|
|
40
|
+
Examples:
|
|
41
|
+
>>> @atdata.packable
|
|
42
|
+
... class MySample:
|
|
43
|
+
... image: NDArray
|
|
44
|
+
... label: str
|
|
45
|
+
...
|
|
46
|
+
>>> client = AtmosphereClient()
|
|
47
|
+
>>> client.login("handle", "password")
|
|
48
|
+
>>>
|
|
49
|
+
>>> publisher = SchemaPublisher(client)
|
|
50
|
+
>>> uri = publisher.publish(MySample, version="1.0.0")
|
|
51
|
+
>>> print(uri)
|
|
52
|
+
at://did:plc:.../ac.foundation.dataset.sampleSchema/...
|
|
55
53
|
"""
|
|
56
54
|
|
|
57
55
|
def __init__(self, client: AtmosphereClient):
|
|
@@ -90,7 +88,9 @@ class SchemaPublisher:
|
|
|
90
88
|
TypeError: If a field type is not supported.
|
|
91
89
|
"""
|
|
92
90
|
if not is_dataclass(sample_type):
|
|
93
|
-
raise ValueError(
|
|
91
|
+
raise ValueError(
|
|
92
|
+
f"{sample_type.__name__} must be a dataclass (use @packable)"
|
|
93
|
+
)
|
|
94
94
|
|
|
95
95
|
# Build the schema record
|
|
96
96
|
schema_record = self._build_schema_record(
|
|
@@ -155,12 +155,18 @@ class SchemaPublisher:
|
|
|
155
155
|
return FieldType(kind="primitive", primitive="bytes")
|
|
156
156
|
|
|
157
157
|
if is_ndarray_type(python_type):
|
|
158
|
-
return FieldType(
|
|
158
|
+
return FieldType(
|
|
159
|
+
kind="ndarray", dtype=extract_ndarray_dtype(python_type), shape=None
|
|
160
|
+
)
|
|
159
161
|
|
|
160
162
|
origin = get_origin(python_type)
|
|
161
163
|
if origin is list:
|
|
162
164
|
args = get_args(python_type)
|
|
163
|
-
items =
|
|
165
|
+
items = (
|
|
166
|
+
self._python_type_to_field_type(args[0])
|
|
167
|
+
if args
|
|
168
|
+
else FieldType(kind="primitive", primitive="str")
|
|
169
|
+
)
|
|
164
170
|
return FieldType(kind="array", items=items)
|
|
165
171
|
|
|
166
172
|
if is_dataclass(python_type):
|
|
@@ -178,16 +184,14 @@ class SchemaLoader:
|
|
|
178
184
|
This class fetches schema records from ATProto and can list available
|
|
179
185
|
schemas from a repository.
|
|
180
186
|
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
>>> print(schema["name"])
|
|
190
|
-
'MySample'
|
|
187
|
+
Examples:
|
|
188
|
+
>>> client = AtmosphereClient()
|
|
189
|
+
>>> client.login("handle", "password")
|
|
190
|
+
>>>
|
|
191
|
+
>>> loader = SchemaLoader(client)
|
|
192
|
+
>>> schema = loader.get("at://did:plc:.../ac.foundation.dataset.sampleSchema/...")
|
|
193
|
+
>>> print(schema["name"])
|
|
194
|
+
'MySample'
|
|
191
195
|
"""
|
|
192
196
|
|
|
193
197
|
def __init__(self, client: AtmosphereClient):
|
atdata/atmosphere/store.py
CHANGED
|
@@ -6,23 +6,20 @@ protocol that stores dataset shards as ATProto blobs in a Personal Data Server.
|
|
|
6
6
|
This enables fully decentralized dataset storage where both metadata (records)
|
|
7
7
|
and data (blobs) live on the AT Protocol network.
|
|
8
8
|
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
>>> print(urls)
|
|
20
|
-
['at://did:plc:.../blob/bafyrei...', ...]
|
|
9
|
+
Examples:
|
|
10
|
+
>>> from atdata.atmosphere import AtmosphereClient, PDSBlobStore
|
|
11
|
+
>>>
|
|
12
|
+
>>> client = AtmosphereClient()
|
|
13
|
+
>>> client.login("handle.bsky.social", "app-password")
|
|
14
|
+
>>>
|
|
15
|
+
>>> store = PDSBlobStore(client)
|
|
16
|
+
>>> urls = store.write_shards(dataset, prefix="mnist/v1")
|
|
17
|
+
>>> print(urls)
|
|
18
|
+
['at://did:plc:.../blob/bafyrei...', ...]
|
|
21
19
|
"""
|
|
22
20
|
|
|
23
21
|
from __future__ import annotations
|
|
24
22
|
|
|
25
|
-
import io
|
|
26
23
|
import tempfile
|
|
27
24
|
from dataclasses import dataclass
|
|
28
25
|
from typing import TYPE_CHECKING, Any
|
|
@@ -31,6 +28,7 @@ import webdataset as wds
|
|
|
31
28
|
|
|
32
29
|
if TYPE_CHECKING:
|
|
33
30
|
from ..dataset import Dataset
|
|
31
|
+
from .._sources import BlobSource
|
|
34
32
|
from .client import AtmosphereClient
|
|
35
33
|
|
|
36
34
|
|
|
@@ -48,13 +46,11 @@ class PDSBlobStore:
|
|
|
48
46
|
Attributes:
|
|
49
47
|
client: Authenticated AtmosphereClient instance.
|
|
50
48
|
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
>>> # Returns AT URIs like:
|
|
57
|
-
>>> # ['at://did:plc:abc/blob/bafyrei...', ...]
|
|
49
|
+
Examples:
|
|
50
|
+
>>> store = PDSBlobStore(client)
|
|
51
|
+
>>> urls = store.write_shards(dataset, prefix="training/v1")
|
|
52
|
+
>>> # Returns AT URIs like:
|
|
53
|
+
>>> # ['at://did:plc:abc/blob/bafyrei...', ...]
|
|
58
54
|
"""
|
|
59
55
|
|
|
60
56
|
client: "AtmosphereClient"
|
atdata/cli/__init__.py
CHANGED
|
@@ -42,7 +42,8 @@ def main(argv: Sequence[str] | None = None) -> int:
|
|
|
42
42
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
43
43
|
)
|
|
44
44
|
parser.add_argument(
|
|
45
|
-
"--version",
|
|
45
|
+
"--version",
|
|
46
|
+
"-v",
|
|
46
47
|
action="store_true",
|
|
47
48
|
help="Show version information",
|
|
48
49
|
)
|
|
@@ -83,7 +84,8 @@ def main(argv: Sequence[str] | None = None) -> int:
|
|
|
83
84
|
help="MinIO console port (default: 9001)",
|
|
84
85
|
)
|
|
85
86
|
up_parser.add_argument(
|
|
86
|
-
"--detach",
|
|
87
|
+
"--detach",
|
|
88
|
+
"-d",
|
|
87
89
|
action="store_true",
|
|
88
90
|
default=True,
|
|
89
91
|
help="Run containers in detached mode (default: True)",
|
|
@@ -95,7 +97,8 @@ def main(argv: Sequence[str] | None = None) -> int:
|
|
|
95
97
|
help="Stop local development containers",
|
|
96
98
|
)
|
|
97
99
|
down_parser.add_argument(
|
|
98
|
-
"--volumes",
|
|
100
|
+
"--volumes",
|
|
101
|
+
"-v",
|
|
99
102
|
action="store_true",
|
|
100
103
|
help="Also remove volumes (deletes all data)",
|
|
101
104
|
)
|
|
@@ -165,10 +168,12 @@ def _cmd_version() -> int:
|
|
|
165
168
|
"""Show version information."""
|
|
166
169
|
try:
|
|
167
170
|
from atdata import __version__
|
|
171
|
+
|
|
168
172
|
version = __version__
|
|
169
173
|
except ImportError:
|
|
170
174
|
# Fallback to package metadata
|
|
171
175
|
from importlib.metadata import version as pkg_version
|
|
176
|
+
|
|
172
177
|
version = pkg_version("atdata")
|
|
173
178
|
|
|
174
179
|
print(f"atdata {version}")
|
|
@@ -183,6 +188,7 @@ def _cmd_local_up(
|
|
|
183
188
|
) -> int:
|
|
184
189
|
"""Start local development infrastructure."""
|
|
185
190
|
from .local import local_up
|
|
191
|
+
|
|
186
192
|
return local_up(
|
|
187
193
|
redis_port=redis_port,
|
|
188
194
|
minio_port=minio_port,
|
|
@@ -194,18 +200,21 @@ def _cmd_local_up(
|
|
|
194
200
|
def _cmd_local_down(remove_volumes: bool) -> int:
|
|
195
201
|
"""Stop local development infrastructure."""
|
|
196
202
|
from .local import local_down
|
|
203
|
+
|
|
197
204
|
return local_down(remove_volumes=remove_volumes)
|
|
198
205
|
|
|
199
206
|
|
|
200
207
|
def _cmd_local_status() -> int:
|
|
201
208
|
"""Show status of local infrastructure."""
|
|
202
209
|
from .local import local_status
|
|
210
|
+
|
|
203
211
|
return local_status()
|
|
204
212
|
|
|
205
213
|
|
|
206
214
|
def _cmd_diagnose(host: str, port: int) -> int:
|
|
207
215
|
"""Diagnose Redis configuration."""
|
|
208
216
|
from .diagnose import diagnose_redis
|
|
217
|
+
|
|
209
218
|
return diagnose_redis(host=host, port=port)
|
|
210
219
|
|
|
211
220
|
|
atdata/cli/diagnose.py
CHANGED
|
@@ -5,7 +5,6 @@ and other infrastructure components.
|
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
7
|
import sys
|
|
8
|
-
from typing import Any
|
|
9
8
|
|
|
10
9
|
|
|
11
10
|
def _print_status(label: str, ok: bool, detail: str = "") -> None:
|
|
@@ -41,6 +40,7 @@ def diagnose_redis(host: str = "localhost", port: int = 6379) -> int:
|
|
|
41
40
|
# Try to connect
|
|
42
41
|
try:
|
|
43
42
|
from redis import Redis
|
|
43
|
+
|
|
44
44
|
redis = Redis(host=host, port=port, socket_connect_timeout=5)
|
|
45
45
|
redis.ping()
|
|
46
46
|
_print_status("Connection", True, "connected")
|
|
@@ -70,7 +70,7 @@ def diagnose_redis(host: str = "localhost", port: int = 6379) -> int:
|
|
|
70
70
|
_print_status(
|
|
71
71
|
"AOF Persistence",
|
|
72
72
|
aof_ok,
|
|
73
|
-
"enabled" if aof_ok else "DISABLED - data may be lost on restart!"
|
|
73
|
+
"enabled" if aof_ok else "DISABLED - data may be lost on restart!",
|
|
74
74
|
)
|
|
75
75
|
if not aof_ok:
|
|
76
76
|
issues_found = True
|
|
@@ -85,7 +85,7 @@ def diagnose_redis(host: str = "localhost", port: int = 6379) -> int:
|
|
|
85
85
|
_print_status(
|
|
86
86
|
"RDB Persistence",
|
|
87
87
|
rdb_ok,
|
|
88
|
-
f"configured ({save_config})" if rdb_ok else "DISABLED"
|
|
88
|
+
f"configured ({save_config})" if rdb_ok else "DISABLED",
|
|
89
89
|
)
|
|
90
90
|
# RDB disabled is only a warning if AOF is enabled
|
|
91
91
|
except Exception as e:
|
|
@@ -95,7 +95,13 @@ def diagnose_redis(host: str = "localhost", port: int = 6379) -> int:
|
|
|
95
95
|
try:
|
|
96
96
|
policy = redis.config_get("maxmemory-policy").get("maxmemory-policy", "unknown")
|
|
97
97
|
# Safe policies that won't evict index data
|
|
98
|
-
safe_policies = {
|
|
98
|
+
safe_policies = {
|
|
99
|
+
"noeviction",
|
|
100
|
+
"volatile-lru",
|
|
101
|
+
"volatile-lfu",
|
|
102
|
+
"volatile-ttl",
|
|
103
|
+
"volatile-random",
|
|
104
|
+
}
|
|
99
105
|
policy_ok = policy in safe_policies
|
|
100
106
|
|
|
101
107
|
if policy_ok:
|
|
@@ -104,7 +110,7 @@ def diagnose_redis(host: str = "localhost", port: int = 6379) -> int:
|
|
|
104
110
|
_print_status(
|
|
105
111
|
"Memory Policy",
|
|
106
112
|
False,
|
|
107
|
-
f"{policy} - may evict index data! Use 'noeviction' or 'volatile-*'"
|
|
113
|
+
f"{policy} - may evict index data! Use 'noeviction' or 'volatile-*'",
|
|
108
114
|
)
|
|
109
115
|
issues_found = True
|
|
110
116
|
except Exception as e:
|
|
@@ -141,9 +147,7 @@ def diagnose_redis(host: str = "localhost", port: int = 6379) -> int:
|
|
|
141
147
|
for key in redis.scan_iter(match="LocalSchema:*", count=100):
|
|
142
148
|
schema_count += 1
|
|
143
149
|
_print_status(
|
|
144
|
-
"atdata Keys",
|
|
145
|
-
True,
|
|
146
|
-
f"{dataset_count} datasets, {schema_count} schemas"
|
|
150
|
+
"atdata Keys", True, f"{dataset_count} datasets, {schema_count} schemas"
|
|
147
151
|
)
|
|
148
152
|
except Exception as e:
|
|
149
153
|
_print_status("atdata Keys", False, f"check failed: {e}")
|
atdata/cli/local.py
CHANGED
|
@@ -144,7 +144,9 @@ def _run_compose(
|
|
|
144
144
|
elif shutil.which("docker-compose"):
|
|
145
145
|
base_cmd = ["docker-compose"]
|
|
146
146
|
else:
|
|
147
|
-
raise RuntimeError(
|
|
147
|
+
raise RuntimeError(
|
|
148
|
+
"Neither 'docker compose' nor 'docker-compose' available"
|
|
149
|
+
)
|
|
148
150
|
else:
|
|
149
151
|
raise RuntimeError("Docker not found")
|
|
150
152
|
|
|
@@ -195,6 +197,7 @@ def local_up(
|
|
|
195
197
|
|
|
196
198
|
# Wait a moment for containers to be healthy
|
|
197
199
|
import time
|
|
200
|
+
|
|
198
201
|
time.sleep(2)
|
|
199
202
|
|
|
200
203
|
# Show status
|