atdata 0.3.0b1-py3-none-any.whl → 0.3.1b1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- atdata/__init__.py +9 -0
- atdata/_cid.py +0 -21
- atdata/_helpers.py +12 -0
- atdata/_hf_api.py +33 -1
- atdata/_protocols.py +64 -182
- atdata/_schema_codec.py +2 -2
- atdata/_stub_manager.py +5 -25
- atdata/atmosphere/__init__.py +12 -11
- atdata/atmosphere/_types.py +4 -4
- atdata/atmosphere/client.py +64 -12
- atdata/atmosphere/lens.py +11 -12
- atdata/atmosphere/records.py +9 -10
- atdata/atmosphere/schema.py +14 -16
- atdata/atmosphere/store.py +6 -7
- atdata/cli/__init__.py +16 -16
- atdata/cli/diagnose.py +2 -2
- atdata/cli/{local.py → infra.py} +10 -10
- atdata/dataset.py +155 -2
- atdata/index/__init__.py +54 -0
- atdata/{local → index}/_index.py +322 -64
- atdata/{local → index}/_schema.py +5 -5
- atdata/lexicons/__init__.py +121 -0
- atdata/lexicons/ac.foundation.dataset.arrayFormat.json +16 -0
- atdata/lexicons/ac.foundation.dataset.getLatestSchema.json +78 -0
- atdata/lexicons/ac.foundation.dataset.lens.json +99 -0
- atdata/lexicons/ac.foundation.dataset.record.json +96 -0
- atdata/lexicons/ac.foundation.dataset.schema.json +107 -0
- atdata/lexicons/ac.foundation.dataset.schemaType.json +16 -0
- atdata/lexicons/ac.foundation.dataset.storageBlobs.json +24 -0
- atdata/lexicons/ac.foundation.dataset.storageExternal.json +25 -0
- atdata/lexicons/ndarray_shim.json +16 -0
- atdata/local/__init__.py +12 -13
- atdata/local/_repo_legacy.py +3 -3
- atdata/promote.py +14 -10
- atdata/repository.py +7 -7
- atdata/stores/__init__.py +23 -0
- atdata/stores/_disk.py +123 -0
- atdata/testing.py +12 -8
- {atdata-0.3.0b1.dist-info → atdata-0.3.1b1.dist-info}/METADATA +2 -2
- atdata-0.3.1b1.dist-info/RECORD +67 -0
- atdata-0.3.0b1.dist-info/RECORD +0 -54
- /atdata/{local → index}/_entry.py +0 -0
- /atdata/{local → stores}/_s3.py +0 -0
- {atdata-0.3.0b1.dist-info → atdata-0.3.1b1.dist-info}/WHEEL +0 -0
- {atdata-0.3.0b1.dist-info → atdata-0.3.1b1.dist-info}/entry_points.txt +0 -0
- {atdata-0.3.0b1.dist-info → atdata-0.3.1b1.dist-info}/licenses/LICENSE +0 -0
atdata/atmosphere/__init__.py
CHANGED
@@ -6,7 +6,7 @@ network.

 Key components:

-    - ``
+    - ``Atmosphere``: Authentication and session management for ATProto
     - ``SchemaPublisher``: Publish PackableSample schemas as ATProto records
     - ``DatasetPublisher``: Publish dataset index records with WebDataset URLs
     - ``LensPublisher``: Publish lens transformation records
@@ -16,13 +16,10 @@ to work unchanged. These features are opt-in for users who want to publish
 or discover datasets on the ATProto network.

 Examples:
-    >>> from atdata.atmosphere import
+    >>> from atdata.atmosphere import Atmosphere
     >>>
-    >>>
-    >>>
-    >>>
-    >>> publisher = SchemaPublisher(client)
-    >>> schema_uri = publisher.publish(MySampleType, version="1.0.0")
+    >>> atmo = Atmosphere.login("handle.bsky.social", "app-password")
+    >>> index = Index(atmosphere=atmo)

 Note:
     This module requires the ``atproto`` package to be installed::
@@ -32,7 +29,7 @@ Note:

 from typing import Iterator, Optional, Type, TYPE_CHECKING

-from .client import
+from .client import Atmosphere
 from .schema import SchemaPublisher, SchemaLoader
 from .records import DatasetPublisher, DatasetLoader
 from .lens import LensPublisher, LensLoader
@@ -122,14 +119,14 @@ class AtmosphereIndex:

     def __init__(
         self,
-        client:
+        client: Atmosphere,
         *,
         data_store: Optional[PDSBlobStore] = None,
     ):
         """Initialize the atmosphere index.

         Args:
-            client: Authenticated
+            client: Authenticated Atmosphere instance.
             data_store: Optional PDSBlobStore for writing shards as blobs.
                 If provided, insert_dataset will upload shards to PDS.
         """
@@ -314,9 +311,13 @@ class AtmosphereIndex:
         return schema_to_type(schema)


+# Deprecated alias for backward compatibility
+AtmosphereClient = Atmosphere
+
 __all__ = [
     # Client
-    "
+    "Atmosphere",
+    "AtmosphereClient",  # deprecated alias
     # Storage
     "PDSBlobStore",
     # Unified index (AbstractIndex protocol)
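The headline change in this module is the rename of the client class: ``AtmosphereClient`` becomes ``Atmosphere``, with the old name kept as a module-level alias. A minimal migration sketch, assuming only what the diff above shows (the alias assignment and the new ``login`` classmethod); the handle and password are placeholders:

# Minimal migration sketch based on the alias added above
# (AtmosphereClient = Atmosphere); not an official example.
from atdata.atmosphere import Atmosphere, AtmosphereClient

assert AtmosphereClient is Atmosphere   # plain alias, not a subclass

# New-style entry point introduced in this release:
atmo = Atmosphere.login("handle.bsky.social", "app-password")
print(atmo.did)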
atdata/atmosphere/_types.py
CHANGED
@@ -20,11 +20,11 @@ class AtUri:
     AT URIs follow the format: at://<authority>/<collection>/<rkey>

     Examples:
-        >>> uri = AtUri.parse("at://did:plc:abc123/ac.foundation.dataset.
+        >>> uri = AtUri.parse("at://did:plc:abc123/ac.foundation.dataset.schema/xyz")
         >>> uri.authority
         'did:plc:abc123'
         >>> uri.collection
-        'ac.foundation.dataset.
+        'ac.foundation.dataset.schema'
         >>> uri.rkey
         'xyz'
     """
@@ -119,7 +119,7 @@ class FieldDef:
 class SchemaRecord:
     """ATProto record for a PackableSample schema.

-    Maps to the ``ac.foundation.dataset.
+    Maps to the ``ac.foundation.dataset.schema`` Lexicon.
     """

     name: str
@@ -143,7 +143,7 @@ class SchemaRecord:
     def to_record(self) -> dict:
         """Convert to ATProto record dict for publishing."""
         record = {
-            "$type": f"{LEXICON_NAMESPACE}.
+            "$type": f"{LEXICON_NAMESPACE}.schema",
             "name": self.name,
             "version": self.version,
             "fields": [self._field_to_dict(f) for f in self.fields],
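The docstring now resolves AT URIs against the ``ac.foundation.dataset.schema`` NSID. A small sketch of the same round trip; the import path is an assumption based on the file being edited here (``atdata/atmosphere/_types.py``):

# Sketch of the updated AtUri docstring example; the import path is assumed
# from the module being edited (atdata/atmosphere/_types.py).
from atdata.atmosphere._types import AtUri

uri = AtUri.parse("at://did:plc:abc123/ac.foundation.dataset.schema/xyz")
assert uri.authority == "did:plc:abc123"
assert uri.collection == "ac.foundation.dataset.schema"
assert uri.rkey == "xyz"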
atdata/atmosphere/client.py
CHANGED
@@ -1,6 +1,6 @@
 """ATProto client wrapper for atdata.

-This module provides the ``
+This module provides the ``Atmosphere`` class which wraps the atproto SDK
 client with atdata-specific helpers for publishing and querying records.
 """

@@ -28,16 +28,15 @@ def _get_atproto_client_class():
     return _atproto_client_class


-class
+class Atmosphere:
     """ATProto client wrapper for atdata operations.

     This class wraps the atproto SDK client and provides higher-level methods
     for working with atdata records (schemas, datasets, lenses).

     Examples:
-        >>>
-        >>>
-        >>> print(client.did)
+        >>> atmo = Atmosphere.login("alice.bsky.social", "app-password")
+        >>> print(atmo.did)
         'did:plc:...'

     Note:
@@ -65,7 +64,63 @@ class AtmosphereClient:

         self._session: Optional[dict] = None

-
+    @classmethod
+    def login(
+        cls,
+        handle: str,
+        password: str,
+        *,
+        base_url: Optional[str] = None,
+    ) -> "Atmosphere":
+        """Create an authenticated Atmosphere client.
+
+        Args:
+            handle: Your Bluesky handle (e.g., 'alice.bsky.social').
+            password: App-specific password (not your main password).
+            base_url: Optional PDS base URL. Defaults to bsky.social.
+
+        Returns:
+            An authenticated Atmosphere instance.
+
+        Raises:
+            atproto.exceptions.AtProtocolError: If authentication fails.
+
+        Examples:
+            >>> atmo = Atmosphere.login("alice.bsky.social", "app-password")
+            >>> index = Index(atmosphere=atmo)
+        """
+        instance = cls(base_url=base_url)
+        instance._login(handle, password)
+        return instance
+
+    @classmethod
+    def from_session(
+        cls,
+        session_string: str,
+        *,
+        base_url: Optional[str] = None,
+    ) -> "Atmosphere":
+        """Create an Atmosphere client from an exported session string.
+
+        This allows reusing a session without re-authenticating, which helps
+        avoid rate limits on session creation.
+
+        Args:
+            session_string: Session string from ``export_session()``.
+            base_url: Optional PDS base URL. Defaults to bsky.social.
+
+        Returns:
+            An authenticated Atmosphere instance.
+
+        Examples:
+            >>> session = atmo.export_session()
+            >>> atmo2 = Atmosphere.from_session(session)
+        """
+        instance = cls(base_url=base_url)
+        instance._login_with_session(session_string)
+        return instance
+
+    def _login(self, handle: str, password: str) -> None:
         """Authenticate with the ATProto PDS.

         Args:
@@ -81,12 +136,9 @@ class AtmosphereClient:
             "handle": profile.handle,
         }

-    def
+    def _login_with_session(self, session_string: str) -> None:
         """Authenticate using an exported session string.

-        This allows reusing a session without re-authenticating, which helps
-        avoid rate limits on session creation.
-
         Args:
             session_string: Session string from ``export_session()``.
         """
@@ -161,7 +213,7 @@ class AtmosphereClient:

         Args:
             collection: The NSID of the record collection
-                (e.g., 'ac.foundation.dataset.
+                (e.g., 'ac.foundation.dataset.schema').
             record: The record data. Must include a '$type' field.
             rkey: Optional explicit record key. If not provided, a TID is generated.
             validate: Whether to validate against the Lexicon schema. Set to False
@@ -487,7 +539,7 @@ class AtmosphereClient:
             List of schema records.
         """
         records, _ = self.list_records(
-            f"{LEXICON_NAMESPACE}.
+            f"{LEXICON_NAMESPACE}.schema",
             repo=repo,
             limit=limit,
         )
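Beyond the rename, the new ``from_session`` classmethod makes session reuse a first-class flow. A short sketch, assuming ``export_session()`` returns the string ``from_session()`` expects, as the docstrings above state; handle and password are placeholders:

# Sketch of session reuse with the new classmethods.
from atdata.atmosphere import Atmosphere

atmo = Atmosphere.login("alice.bsky.social", "app-password")
session = atmo.export_session()        # serialize the authenticated session

# Later, in another process: skip a fresh login and its rate limits.
atmo2 = Atmosphere.from_session(session)
print(atmo2.did)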
atdata/atmosphere/lens.py
CHANGED
@@ -11,7 +11,7 @@ Note:

 from typing import Optional

-from .client import
+from .client import Atmosphere
 from ._types import (
     AtUri,
     LensRecord,
@@ -37,14 +37,13 @@ class LensPublisher:
         ... def my_lens(source: SourceType) -> TargetType:
         ...     return TargetType(field=source.other_field)
         >>>
-        >>>
-        >>> client.login("handle", "password")
+        >>> atmo = Atmosphere.login("handle", "password")
         >>>
-        >>> publisher = LensPublisher(
+        >>> publisher = LensPublisher(atmo)
         >>> uri = publisher.publish(
         ...     name="my_lens",
-        ...     source_schema_uri="at://did:plc:abc/ac.foundation.dataset.
-        ...     target_schema_uri="at://did:plc:abc/ac.foundation.dataset.
+        ...     source_schema_uri="at://did:plc:abc/ac.foundation.dataset.schema/source",
+        ...     target_schema_uri="at://did:plc:abc/ac.foundation.dataset.schema/target",
         ...     code_repository="https://github.com/user/repo",
         ...     code_commit="abc123def456",
         ...     getter_path="mymodule.lenses:my_lens",
@@ -57,11 +56,11 @@ class LensPublisher:
     records. Users must manually install and trust lens implementations.
     """

-    def __init__(self, client:
+    def __init__(self, client: Atmosphere):
         """Initialize the lens publisher.

         Args:
-            client: Authenticated
+            client: Authenticated Atmosphere instance.
         """
         self.client = client

@@ -195,8 +194,8 @@ class LensLoader:
     it manually.

     Examples:
-        >>>
-        >>> loader = LensLoader(
+        >>> atmo = Atmosphere.login("handle", "password")
+        >>> loader = LensLoader(atmo)
         >>>
         >>> record = loader.get("at://did:plc:abc/ac.foundation.dataset.lens/xyz")
         >>> print(record["name"])
@@ -204,11 +203,11 @@ class LensLoader:
         >>> print(record.get("getterCode", {}).get("repository"))
     """

-    def __init__(self, client:
+    def __init__(self, client: Atmosphere):
         """Initialize the lens loader.

         Args:
-            client:
+            client: Atmosphere instance.
         """
         self.client = client

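Lens records only carry provenance (repository, commit, getter path), so loading one is a metadata read. A brief sketch with the renamed client, using only the calls shown in the LensLoader docstring above; the URI is a placeholder:

# Sketch of loading a lens record; field names (name, getterCode.repository)
# follow the LensLoader docstring above.
from atdata.atmosphere import Atmosphere, LensLoader

atmo = Atmosphere.login("handle", "password")
loader = LensLoader(atmo)

record = loader.get("at://did:plc:abc/ac.foundation.dataset.lens/xyz")
print(record["name"])
print(record.get("getterCode", {}).get("repository"))  # lens code lives out-of-band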
atdata/atmosphere/records.py
CHANGED
@@ -8,7 +8,7 @@ and loading them back. Dataset records are published as
 from typing import Type, TypeVar, Optional
 import msgpack

-from .client import
+from .client import Atmosphere
 from .schema import SchemaPublisher
 from ._types import (
     AtUri,
@@ -36,10 +36,9 @@ class DatasetPublisher:
     Examples:
         >>> dataset = atdata.Dataset[MySample]("s3://bucket/data-{000000..000009}.tar")
         >>>
-        >>>
-        >>> client.login("handle", "password")
+        >>> atmo = Atmosphere.login("handle", "password")
         >>>
-        >>> publisher = DatasetPublisher(
+        >>> publisher = DatasetPublisher(atmo)
         >>> uri = publisher.publish(
         ...     dataset,
         ...     name="My Training Data",
@@ -48,11 +47,11 @@ class DatasetPublisher:
         ... )
     """

-    def __init__(self, client:
+    def __init__(self, client: Atmosphere):
         """Initialize the dataset publisher.

         Args:
-            client: Authenticated
+            client: Authenticated Atmosphere instance.
         """
         self.client = client
         self._schema_publisher = SchemaPublisher(client)
@@ -268,8 +267,8 @@ class DatasetLoader:
     Python class for the sample type.

     Examples:
-        >>>
-        >>> loader = DatasetLoader(
+        >>> atmo = Atmosphere.login("handle", "password")
+        >>> loader = DatasetLoader(atmo)
         >>>
         >>> # List available datasets
         >>> datasets = loader.list()
@@ -280,11 +279,11 @@ class DatasetLoader:
         >>> record = loader.get("at://did:plc:abc/ac.foundation.dataset.record/xyz")
     """

-    def __init__(self, client:
+    def __init__(self, client: Atmosphere):
         """Initialize the dataset loader.

         Args:
-            client:
+            client: Atmosphere instance.
         """
         self.client = client

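Dataset records live under ``ac.foundation.dataset.record``. A short sketch of discovering and fetching them with the renamed client, using only ``list()`` and ``get()`` from the DatasetLoader docstring above; the URI is a placeholder:

# Sketch of dataset discovery with the renamed client.
from atdata.atmosphere import Atmosphere, DatasetLoader

atmo = Atmosphere.login("handle", "password")
loader = DatasetLoader(atmo)

datasets = loader.list()                     # available dataset records
record = loader.get("at://did:plc:abc/ac.foundation.dataset.record/xyz")
print(record)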
atdata/atmosphere/schema.py
CHANGED
@@ -1,14 +1,14 @@
 """Schema publishing and loading for ATProto.

 This module provides classes for publishing PackableSample schemas to ATProto
-and loading them back. Schemas are published as ``ac.foundation.dataset.
+and loading them back. Schemas are published as ``ac.foundation.dataset.schema``
 records.
 """

 from dataclasses import fields, is_dataclass
 from typing import Type, TypeVar, Optional, get_type_hints, get_origin, get_args

-from .client import
+from .client import Atmosphere
 from ._types import (
     AtUri,
     SchemaRecord,
@@ -43,20 +43,19 @@ class SchemaPublisher:
         ...     image: NDArray
         ...     label: str
         ...
-        >>>
-        >>> client.login("handle", "password")
+        >>> atmo = Atmosphere.login("handle", "password")
         >>>
-        >>> publisher = SchemaPublisher(
+        >>> publisher = SchemaPublisher(atmo)
         >>> uri = publisher.publish(MySample, version="1.0.0")
         >>> print(uri)
-        at://did:plc:.../ac.foundation.dataset.
+        at://did:plc:.../ac.foundation.dataset.schema/...
     """

-    def __init__(self, client:
+    def __init__(self, client: Atmosphere):
         """Initialize the schema publisher.

         Args:
-            client: Authenticated
+            client: Authenticated Atmosphere instance.
         """
         self.client = client

@@ -103,7 +102,7 @@ class SchemaPublisher:

         # Publish to ATProto
         return self.client.create_record(
-            collection=f"{LEXICON_NAMESPACE}.
+            collection=f"{LEXICON_NAMESPACE}.schema",
             record=schema_record.to_record(),
             rkey=rkey,
             validate=False,  # PDS doesn't know our lexicon
@@ -185,20 +184,19 @@ class SchemaLoader:
     schemas from a repository.

     Examples:
-        >>>
-        >>> client.login("handle", "password")
+        >>> atmo = Atmosphere.login("handle", "password")
         >>>
-        >>> loader = SchemaLoader(
-        >>> schema = loader.get("at://did:plc:.../ac.foundation.dataset.
+        >>> loader = SchemaLoader(atmo)
+        >>> schema = loader.get("at://did:plc:.../ac.foundation.dataset.schema/...")
         >>> print(schema["name"])
         'MySample'
     """

-    def __init__(self, client:
+    def __init__(self, client: Atmosphere):
         """Initialize the schema loader.

         Args:
-            client:
+            client: Atmosphere instance (authentication optional for reads).
         """
         self.client = client

@@ -217,7 +215,7 @@ class SchemaLoader:
         """
         record = self.client.get_record(uri)

-        expected_type = f"{LEXICON_NAMESPACE}.
+        expected_type = f"{LEXICON_NAMESPACE}.schema"
         if record.get("$type") != expected_type:
             raise ValueError(
                 f"Record at {uri} is not a schema record. "
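SchemaLoader now checks that fetched records carry the ``ac.foundation.dataset.schema`` ``$type`` and raises ``ValueError`` otherwise. A minimal sketch of the read path, using only calls and record fields from the docstrings above; the URI is a placeholder:

# Sketch of loading a published schema record under the new collection name.
from atdata.atmosphere import Atmosphere, SchemaLoader

atmo = Atmosphere.login("handle", "password")
loader = SchemaLoader(atmo)

# get() raises ValueError when the record's $type is not
# ac.foundation.dataset.schema (see the expected_type check above).
schema = loader.get("at://did:plc:abc/ac.foundation.dataset.schema/xyz")
print(schema["name"], schema["version"])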
atdata/atmosphere/store.py
CHANGED
@@ -7,12 +7,11 @@ This enables fully decentralized dataset storage where both metadata (records)
 and data (blobs) live on the AT Protocol network.

 Examples:
-    >>> from atdata.atmosphere import
+    >>> from atdata.atmosphere import Atmosphere, PDSBlobStore
     >>>
-    >>>
-    >>> client.login("handle.bsky.social", "app-password")
+    >>> atmo = Atmosphere.login("handle.bsky.social", "app-password")
     >>>
-    >>> store = PDSBlobStore(
+    >>> store = PDSBlobStore(atmo)
     >>> urls = store.write_shards(dataset, prefix="mnist/v1")
     >>> print(urls)
     ['at://did:plc:.../blob/bafyrei...', ...]
@@ -29,7 +28,7 @@ import webdataset as wds
 if TYPE_CHECKING:
     from ..dataset import Dataset
     from .._sources import BlobSource
-    from .client import
+    from .client import Atmosphere


 @dataclass
@@ -44,7 +43,7 @@ class PDSBlobStore:
     to HTTP URLs for streaming.

     Attributes:
-        client: Authenticated
+        client: Authenticated Atmosphere instance.

     Examples:
         >>> store = PDSBlobStore(client)
@@ -53,7 +52,7 @@ class PDSBlobStore:
         >>> # ['at://did:plc:abc/blob/bafyrei...', ...]
     """

-    client: "
+    client: "Atmosphere"

     def write_shards(
         self,
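A short sketch of the blob-backed write path from the module docstring; ``dataset`` is assumed to be an existing ``atdata.Dataset`` and is not constructed here, and the credentials are placeholders:

# Sketch of writing shards to a PDS as blobs; `dataset` is assumed to already
# exist (an atdata.Dataset instance).
from atdata.atmosphere import Atmosphere, PDSBlobStore

atmo = Atmosphere.login("handle.bsky.social", "app-password")
store = PDSBlobStore(atmo)

urls = store.write_shards(dataset, prefix="mnist/v1")   # returns at:// blob URIs
print(urls)   # ['at://did:plc:.../blob/bafyrei...', ...]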
atdata/cli/__init__.py
CHANGED
@@ -1,12 +1,12 @@
 """Command-line interface for atdata.

-This module provides CLI commands for managing
+This module provides CLI commands for managing development infrastructure,
 inspecting datasets, and diagnosing configuration issues.

 Commands:
-    atdata
-    atdata
-    atdata
+    atdata infra up      Start Redis and MinIO containers for development
+    atdata infra down    Stop development containers
+    atdata infra status  Show status of infrastructure
     atdata diagnose      Check Redis configuration and connectivity
     atdata inspect       Show dataset summary information
     atdata schema show   Display dataset schema
@@ -30,12 +30,12 @@ app = typer.Typer(
     no_args_is_help=True,
 )

-
-    name="
-    help="Manage
+infra_app = typer.Typer(
+    name="infra",
+    help="Manage development infrastructure.",
     no_args_is_help=True,
 )
-app.add_typer(
+app.add_typer(infra_app, name="infra")

 schema_app = typer.Typer(
     name="schema",
@@ -101,11 +101,11 @@ def diagnose(


 # ---------------------------------------------------------------------------
-#
+# infra sub-commands
 # ---------------------------------------------------------------------------


-@
+@infra_app.command()
 def up(
     redis_port: int = typer.Option(6379, help="Redis port."),
     minio_port: int = typer.Option(9000, help="MinIO API port."),
@@ -115,7 +115,7 @@ def up(
     ),
 ) -> None:
     """Start Redis and MinIO containers."""
-    from .
+    from .infra import local_up

     code = local_up(
         redis_port=redis_port,
@@ -126,23 +126,23 @@ def up(
     raise typer.Exit(code=code)


-@
+@infra_app.command()
 def down(
     volumes: bool = typer.Option(
         False, "--volumes", "-v", help="Also remove volumes (deletes all data)."
     ),
 ) -> None:
     """Stop local development containers."""
-    from .
+    from .infra import local_down

     code = local_down(remove_volumes=volumes)
     raise typer.Exit(code=code)


-@
+@infra_app.command()
 def status() -> None:
-    """Show status of
-    from .
+    """Show status of infrastructure."""
+    from .infra import local_status

     code = local_status()
     raise typer.Exit(code=code)
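The ``local`` command group is re-registered here as ``infra``. For context, a self-contained sketch of the same typer sub-app pattern; the names mirror the diff, but this is an illustration, not the shipped module:

# Standalone illustration of the typer sub-app pattern used above.
import typer

app = typer.Typer(no_args_is_help=True)

infra_app = typer.Typer(
    name="infra",
    help="Manage development infrastructure.",
    no_args_is_help=True,
)
app.add_typer(infra_app, name="infra")

@infra_app.command()
def status() -> None:
    """Show status of infrastructure."""
    typer.echo("infra status: ok")

if __name__ == "__main__":
    app()   # e.g. `python cli_sketch.py infra status`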
atdata/cli/diagnose.py
CHANGED
@@ -51,7 +51,7 @@ def diagnose_redis(host: str = "localhost", port: int = 6379) -> int:
         _print_status("Connection", False, str(e))
         print()
         print("Cannot connect to Redis. Make sure Redis is running:")
-        print("  atdata
+        print("  atdata infra up")
         return 1

     # Check Redis version
@@ -162,7 +162,7 @@ def diagnose_redis(host: str = "localhost", port: int = 6379) -> int:
         print("  maxmemory-policy noeviction")
         print()
         print("  # Or use atdata's preconfigured local setup:")
-        print("  atdata
+        print("  atdata infra up")
         return 1
     else:
         print("All checks passed. Redis is properly configured for atdata.")
atdata/cli/{local.py → infra.py}
RENAMED
@@ -1,6 +1,6 @@
-"""
+"""Infrastructure management for atdata.

-This module provides commands to start and stop
+This module provides commands to start and stop development infrastructure:
 - Redis: For index storage and metadata
 - MinIO: S3-compatible object storage for dataset files

@@ -179,7 +179,7 @@ def local_up(
     if not _check_docker():
         return 1

-    print("Starting atdata
+    print("Starting atdata infrastructure...")

     compose_content = _get_compose_file(redis_port, minio_port, minio_console_port)
     command = ["up"]
@@ -202,7 +202,7 @@ def local_up(

     # Show status
     print()
-    print("
+    print("Infrastructure started:")
     print(f"  Redis: localhost:{redis_port}")
     print(f"  MinIO API: http://localhost:{minio_port}")
     print(f"  MinIO Console: http://localhost:{minio_console_port}")
@@ -210,7 +210,7 @@ def local_up(
     print("MinIO credentials: minioadmin / minioadmin")
     print()
     print("Example usage:")
-    print("  from atdata.
+    print("  from atdata.stores import S3DataStore")
     print("  ")
     print("  store = S3DataStore.from_credentials({")
     print(f"      'AWS_ENDPOINT': 'http://localhost:{minio_port}',")
@@ -234,7 +234,7 @@ def local_down(remove_volumes: bool = False) -> int:
     if not _check_docker():
         return 1

-    print("Stopping atdata
+    print("Stopping atdata infrastructure...")

     # Use default ports for compose file (actual ports don't matter for down)
     compose_content = _get_compose_file(6379, 9000, 9001)
@@ -252,7 +252,7 @@ def local_down(remove_volumes: bool = False) -> int:
         print(f"Error: {e}", file=sys.stderr)
         return 1

-    print("
+    print("Infrastructure stopped.")
     return 0


@@ -268,16 +268,16 @@ def local_status() -> int:
     redis_running = _container_running(REDIS_CONTAINER)
     minio_running = _container_running(MINIO_CONTAINER)

-    print("atdata
+    print("atdata infrastructure status:")
     print()
     print(f"  Redis ({REDIS_CONTAINER}): {'running' if redis_running else 'stopped'}")
     print(f"  MinIO ({MINIO_CONTAINER}): {'running' if minio_running else 'stopped'}")

     if redis_running or minio_running:
         print()
-        print("To stop:  atdata
+        print("To stop:  atdata infra down")
     else:
         print()
-        print("To start: atdata
+        print("To start: atdata infra up")

     return 0
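The post-start hint now points at ``atdata.stores`` (the old path under ``atdata.local`` moved in this release). A hypothetical sketch of connecting to the MinIO container it describes; only ``AWS_ENDPOINT`` and the ``minioadmin`` credentials appear in the diff, so the remaining credential key names below are assumptions:

# Hypothetical sketch of wiring S3DataStore to the MinIO container started by
# `atdata infra up`; key names other than AWS_ENDPOINT are assumed.
from atdata.stores import S3DataStore

store = S3DataStore.from_credentials({
    "AWS_ENDPOINT": "http://localhost:9000",   # MinIO API port from `infra up`
    "AWS_ACCESS_KEY_ID": "minioadmin",         # assumed key name; dev credential
    "AWS_SECRET_ACCESS_KEY": "minioadmin",     # assumed key name; dev credential
})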