atdata 0.3.0b1__py3-none-any.whl → 0.3.2b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- atdata/__init__.py +11 -0
- atdata/_cid.py +0 -21
- atdata/_helpers.py +12 -0
- atdata/_hf_api.py +46 -1
- atdata/_logging.py +43 -0
- atdata/_protocols.py +81 -182
- atdata/_schema_codec.py +2 -2
- atdata/_sources.py +24 -4
- atdata/_stub_manager.py +5 -25
- atdata/atmosphere/__init__.py +60 -21
- atdata/atmosphere/_lexicon_types.py +595 -0
- atdata/atmosphere/_types.py +73 -245
- atdata/atmosphere/client.py +64 -12
- atdata/atmosphere/lens.py +60 -53
- atdata/atmosphere/records.py +291 -100
- atdata/atmosphere/schema.py +91 -65
- atdata/atmosphere/store.py +68 -66
- atdata/cli/__init__.py +16 -16
- atdata/cli/diagnose.py +2 -2
- atdata/cli/{local.py → infra.py} +10 -10
- atdata/dataset.py +266 -47
- atdata/index/__init__.py +54 -0
- atdata/{local → index}/_entry.py +6 -2
- atdata/{local → index}/_index.py +617 -72
- atdata/{local → index}/_schema.py +5 -5
- atdata/lexicons/__init__.py +127 -0
- atdata/lexicons/ac.foundation.dataset.arrayFormat.json +16 -0
- atdata/lexicons/ac.foundation.dataset.getLatestSchema.json +78 -0
- atdata/lexicons/ac.foundation.dataset.lens.json +101 -0
- atdata/lexicons/ac.foundation.dataset.record.json +117 -0
- atdata/lexicons/ac.foundation.dataset.schema.json +107 -0
- atdata/lexicons/ac.foundation.dataset.schemaType.json +16 -0
- atdata/lexicons/ac.foundation.dataset.storageBlobs.json +46 -0
- atdata/lexicons/ac.foundation.dataset.storageExternal.json +25 -0
- atdata/lexicons/ac.foundation.dataset.storageHttp.json +45 -0
- atdata/lexicons/ac.foundation.dataset.storageS3.json +61 -0
- atdata/lexicons/ndarray_shim.json +16 -0
- atdata/local/__init__.py +12 -13
- atdata/local/_repo_legacy.py +3 -3
- atdata/manifest/__init__.py +4 -0
- atdata/manifest/_proxy.py +321 -0
- atdata/promote.py +14 -10
- atdata/repository.py +66 -16
- atdata/stores/__init__.py +23 -0
- atdata/stores/_disk.py +131 -0
- atdata/{local → stores}/_s3.py +134 -112
- atdata/testing.py +12 -8
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/METADATA +2 -2
- atdata-0.3.2b1.dist-info/RECORD +71 -0
- atdata-0.3.0b1.dist-info/RECORD +0 -54
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/WHEEL +0 -0
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/entry_points.txt +0 -0
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/licenses/LICENSE +0 -0
atdata/atmosphere/schema.py
CHANGED
@@ -1,25 +1,19 @@
 """Schema publishing and loading for ATProto.
 
 This module provides classes for publishing PackableSample schemas to ATProto
-and loading them back. Schemas are published as ``ac.foundation.dataset.
+and loading them back. Schemas are published as ``ac.foundation.dataset.schema``
 records.
 """
 
 from dataclasses import fields, is_dataclass
 from typing import Type, TypeVar, Optional, get_type_hints, get_origin, get_args
 
-from .client import
-from ._types import
-
-    SchemaRecord,
-    FieldDef,
-    FieldType,
-    LEXICON_NAMESPACE,
-)
+from .client import Atmosphere
+from ._types import AtUri, LEXICON_NAMESPACE
+from ._lexicon_types import LexSchemaRecord, JsonSchemaFormat
 from .._type_utils import (
     unwrap_optional,
     is_ndarray_type,
-    extract_ndarray_dtype,
 )
 
 # Import for type checking only to avoid circular imports
@@ -43,20 +37,19 @@ class SchemaPublisher:
         ...     image: NDArray
         ...     label: str
         ...
-        >>>
-        >>> client.login("handle", "password")
+        >>> atmo = Atmosphere.login("handle", "password")
         >>>
-        >>> publisher = SchemaPublisher(
+        >>> publisher = SchemaPublisher(atmo)
         >>> uri = publisher.publish(MySample, version="1.0.0")
         >>> print(uri)
-        at://did:plc:.../ac.foundation.dataset.
+        at://did:plc:.../ac.foundation.dataset.schema/...
     """
 
-    def __init__(self, client:
+    def __init__(self, client: Atmosphere):
         """Initialize the schema publisher.
 
         Args:
-            client: Authenticated
+            client: Authenticated Atmosphere instance.
         """
         self.client = client
 
@@ -87,27 +80,32 @@ class SchemaPublisher:
             ValueError: If sample_type is not a dataclass or client is not authenticated.
             TypeError: If a field type is not supported.
         """
+        from atdata._logging import log_operation
+
         if not is_dataclass(sample_type):
             raise ValueError(
                 f"{sample_type.__name__} must be a dataclass (use @packable)"
             )
 
-
-
-
-
-
-
-
-
+        with log_operation(
+            "SchemaPublisher.publish", schema=sample_type.__name__, version=version
+        ):
+            # Build the schema record
+            schema_record = self._build_schema_record(
+                sample_type,
+                name=name,
+                version=version,
+                description=description,
+                metadata=metadata,
+            )
 
-
-
-
-
-
-
-
+            # Publish to ATProto
+            return self.client.create_record(
+                collection=f"{LEXICON_NAMESPACE}.schema",
+                record=schema_record.to_record(),
+                rkey=rkey,
+                validate=False,  # PDS doesn't know our lexicon
+            )
 
     def _build_schema_record(
         self,
@@ -117,57 +115,74 @@ class SchemaPublisher:
         version: str,
         description: Optional[str],
         metadata: Optional[dict],
-    ) ->
-        """Build a
-        field_defs = []
+    ) -> LexSchemaRecord:
+        """Build a LexSchemaRecord from a PackableSample class."""
         type_hints = get_type_hints(sample_type)
+        properties: dict[str, dict] = {}
+        required_fields: list[str] = []
+        has_ndarray = False
 
         for f in fields(sample_type):
             field_type = type_hints.get(f.name, f.type)
-
-
-
-
+            field_type, is_optional = unwrap_optional(field_type)
+            prop = self._python_type_to_json_schema(field_type)
+            properties[f.name] = prop
+            if not is_optional:
+                required_fields.append(f.name)
+            if is_ndarray_type(field_type):
+                has_ndarray = True
+
+        schema_body = {
+            "$schema": "http://json-schema.org/draft-07/schema#",
+            "type": "object",
+            "properties": properties,
+        }
+        if required_fields:
+            schema_body["required"] = required_fields
+
+        array_format_versions = None
+        if has_ndarray:
+            array_format_versions = {"ndarrayBytes": "1.0.0"}
+
+        return LexSchemaRecord(
             name=name or sample_type.__name__,
             version=version,
+            schema_type="jsonSchema",
+            schema=JsonSchemaFormat(
+                schema_body=schema_body,
+                array_format_versions=array_format_versions,
+            ),
             description=description,
-            fields=field_defs,
             metadata=metadata,
         )
 
-    def
-        """
-        python_type, is_optional = unwrap_optional(python_type)
-        field_type = self._python_type_to_field_type(python_type)
-        return FieldDef(name=name, field_type=field_type, optional=is_optional)
-
-    def _python_type_to_field_type(self, python_type) -> FieldType:
-        """Map a Python type to a FieldType."""
+    def _python_type_to_json_schema(self, python_type) -> dict:
+        """Map a Python type to a JSON Schema property definition."""
         if python_type is str:
-            return
+            return {"type": "string"}
         if python_type is int:
-            return
+            return {"type": "integer"}
         if python_type is float:
-            return
+            return {"type": "number"}
         if python_type is bool:
-            return
+            return {"type": "boolean"}
         if python_type is bytes:
-            return
+            return {"type": "string", "format": "byte", "contentEncoding": "base64"}
 
         if is_ndarray_type(python_type):
-            return
-
-
+            return {
+                "$ref": "https://foundation.ac/schemas/atdata-ndarray-bytes/1.0.0#/$defs/ndarray"
+            }
 
         origin = get_origin(python_type)
         if origin is list:
             args = get_args(python_type)
             items = (
-                self.
+                self._python_type_to_json_schema(args[0])
                 if args
-                else
+                else {"type": "string"}
             )
-            return
+            return {"type": "array", "items": items}
 
         if is_dataclass(python_type):
             raise TypeError(
@@ -185,20 +200,19 @@ class SchemaLoader:
     schemas from a repository.
 
     Examples:
-        >>>
-        >>> client.login("handle", "password")
+        >>> atmo = Atmosphere.login("handle", "password")
        >>>
-        >>> loader = SchemaLoader(
-        >>> schema = loader.get("at://did:plc:.../ac.foundation.dataset.
+        >>> loader = SchemaLoader(atmo)
+        >>> schema = loader.get("at://did:plc:.../ac.foundation.dataset.schema/...")
         >>> print(schema["name"])
         'MySample'
     """
 
-    def __init__(self, client:
+    def __init__(self, client: Atmosphere):
         """Initialize the schema loader.
 
         Args:
-            client:
+            client: Atmosphere instance (authentication optional for reads).
         """
         self.client = client
 
@@ -217,7 +231,7 @@ class SchemaLoader:
         """
         record = self.client.get_record(uri)
 
-        expected_type = f"{LEXICON_NAMESPACE}.
+        expected_type = f"{LEXICON_NAMESPACE}.schema"
         if record.get("$type") != expected_type:
             raise ValueError(
                 f"Record at {uri} is not a schema record. "
@@ -226,6 +240,18 @@ class SchemaLoader:
 
         return record
 
+    def get_typed(self, uri: str | AtUri) -> LexSchemaRecord:
+        """Fetch a schema record and return as a typed object.
+
+        Args:
+            uri: The AT URI of the schema record.
+
+        Returns:
+            LexSchemaRecord instance.
+        """
+        record = self.get(uri)
+        return LexSchemaRecord.from_record(record)
+
     def list_all(
         self,
         repo: Optional[str] = None,
atdata/atmosphere/store.py
CHANGED
@@ -7,12 +7,11 @@ This enables fully decentralized dataset storage where both metadata (records)
 and data (blobs) live on the AT Protocol network.
 
 Examples:
-    >>> from atdata.atmosphere import
+    >>> from atdata.atmosphere import Atmosphere, PDSBlobStore
     >>>
-    >>>
-    >>> client.login("handle.bsky.social", "app-password")
+    >>> atmo = Atmosphere.login("handle.bsky.social", "app-password")
     >>>
-    >>> store = PDSBlobStore(
+    >>> store = PDSBlobStore(atmo)
     >>> urls = store.write_shards(dataset, prefix="mnist/v1")
     >>> print(urls)
     ['at://did:plc:.../blob/bafyrei...', ...]
@@ -20,16 +19,38 @@ Examples:
 
 from __future__ import annotations
 
-import tempfile
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any
 
-
+#: Maximum size in bytes for a single PDS blob upload (50 MB).
+PDS_BLOB_LIMIT_BYTES: int = 50_000_000
+
+#: Maximum total dataset size in bytes for atmosphere uploads (1 GB).
+PDS_TOTAL_DATASET_LIMIT_BYTES: int = 1_000_000_000
 
 if TYPE_CHECKING:
     from ..dataset import Dataset
     from .._sources import BlobSource
-    from .client import
+    from .client import Atmosphere
+
+
+class ShardUploadResult(list):
+    """Return type for ``PDSBlobStore.write_shards()``.
+
+    Extends ``list[str]`` (AT URIs) so it satisfies the ``AbstractDataStore``
+    protocol, while also carrying the raw blob reference dicts needed to
+    create ``storageBlobs`` records.
+
+    Attributes:
+        blob_refs: Blob reference dicts as returned by
+            ``Atmosphere.upload_blob()``.
+    """
+
+    blob_refs: list[dict]
+
+    def __init__(self, urls: list[str], blob_refs: list[dict]) -> None:
+        super().__init__(urls)
+        self.blob_refs = blob_refs
 
 
 @dataclass
@@ -44,7 +65,7 @@ class PDSBlobStore:
     to HTTP URLs for streaming.
 
     Attributes:
-        client: Authenticated
+        client: Authenticated Atmosphere instance.
 
     Examples:
         >>> store = PDSBlobStore(client)
@@ -53,85 +74,61 @@ class PDSBlobStore:
         >>> # ['at://did:plc:abc/blob/bafyrei...', ...]
     """
 
-    client: "
+    client: "Atmosphere"
 
     def write_shards(
         self,
         ds: "Dataset",
         *,
         prefix: str,
-        maxcount: int = 10000,
-        maxsize: float = 3e9,
         **kwargs: Any,
-    ) ->
-        """
+    ) -> "ShardUploadResult":
+        """Upload existing dataset shards as PDS blobs.
 
-
-        to the authenticated user's PDS.
+        Reads the tar archives already written to disk by the caller and
+        uploads each as a blob to the authenticated user's PDS. This
+        avoids re-serializing samples that have already been written.
 
         Args:
-            ds: The Dataset to
-            prefix: Logical path prefix
-
-            maxsize: Maximum shard size in bytes (default: 3GB, PDS limit).
-            **kwargs: Additional args passed to wds.ShardWriter.
+            ds: The Dataset whose shards to upload.
+            prefix: Logical path prefix (unused, kept for protocol compat).
+            **kwargs: Unused, kept for protocol compatibility.
 
         Returns:
-
-            ``
+            A ``ShardUploadResult`` (behaves as ``list[str]`` of AT URIs)
+            with a ``blob_refs`` attribute containing the raw blob reference
+            dicts needed for ``storageBlobs`` records.
 
         Raises:
             ValueError: If not authenticated.
-            RuntimeError: If no shards
-
-        Note:
-            PDS blobs have size limits (typically 50MB-5GB depending on PDS).
-            Adjust maxcount/maxsize to stay within limits.
+            RuntimeError: If no shards are found on the dataset.
         """
         if not self.client.did:
             raise ValueError("Client must be authenticated to upload blobs")
 
         did = self.client.did
         blob_urls: list[str] = []
+        blob_refs: list[dict] = []
+
+        shard_paths = ds.list_shards()
+        if not shard_paths:
+            raise RuntimeError("No shards to upload")
+
+        for shard_url in shard_paths:
+            with open(shard_url, "rb") as f:
+                shard_data = f.read()
+
+            blob_ref = self.client.upload_blob(
+                shard_data,
+                mime_type="application/x-tar",
+            )
+
+            blob_refs.append(blob_ref)
+            cid = blob_ref["ref"]["$link"]
+            at_uri = f"at://{did}/blob/{cid}"
+            blob_urls.append(at_uri)
 
-
-        with tempfile.TemporaryDirectory() as temp_dir:
-            shard_pattern = f"{temp_dir}/shard-%06d.tar"
-            written_files: list[str] = []
-
-            # Track written files via custom post callback
-            def track_file(fname: str) -> None:
-                written_files.append(fname)
-
-            with wds.writer.ShardWriter(
-                shard_pattern,
-                maxcount=maxcount,
-                maxsize=maxsize,
-                post=track_file,
-                **kwargs,
-            ) as sink:
-                for sample in ds.ordered(batch_size=None):
-                    sink.write(sample.as_wds)
-
-            if not written_files:
-                raise RuntimeError("No shards written")
-
-            # Upload each shard as a blob
-            for shard_path in written_files:
-                with open(shard_path, "rb") as f:
-                    shard_data = f.read()
-
-                blob_ref = self.client.upload_blob(
-                    shard_data,
-                    mime_type="application/x-tar",
-                )
-
-                # Extract CID from blob reference
-                cid = blob_ref["ref"]["$link"]
-                at_uri = f"at://{did}/blob/{cid}"
-                blob_urls.append(at_uri)
-
-        return blob_urls
+        return ShardUploadResult(blob_urls, blob_refs)
 
     def read_url(self, url: str) -> str:
         """Resolve an AT URI blob reference to an HTTP URL.
@@ -201,4 +198,9 @@ class PDSBlobStore:
         return BlobSource(blob_refs=blob_refs)
 
 
-__all__ = [
+__all__ = [
+    "PDS_BLOB_LIMIT_BYTES",
+    "PDS_TOTAL_DATASET_LIMIT_BYTES",
+    "PDSBlobStore",
+    "ShardUploadResult",
+]
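
The store no longer re-shards the dataset through webdataset's ShardWriter; it uploads the tar shards already on disk and hands back both AT URIs and the raw blob refs. A sketch based on the docstrings above; the `dataset` object and its existing shards are assumptions, while the blob-ref layout follows what `write_shards` itself reads (`ref.$link`):

    from atdata.atmosphere import Atmosphere, PDSBlobStore

    atmo = Atmosphere.login("handle.bsky.social", "app-password")
    store = PDSBlobStore(atmo)

    # `dataset` is assumed to be an atdata Dataset whose shards already exist
    # on disk (ds.list_shards() must return their paths).
    result = store.write_shards(dataset, prefix="mnist/v1")

    print(list(result))          # AT URIs: ['at://did:plc:.../blob/bafyrei...', ...]
    print(result.blob_refs[0])   # raw blob ref dict, e.g. {'ref': {'$link': ...}, ...}

The old `maxcount`/`maxsize` tuning arguments are gone from the signature; the new module-level `PDS_BLOB_LIMIT_BYTES` and `PDS_TOTAL_DATASET_LIMIT_BYTES` constants now document the PDS limits and are exported alongside the store.
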
atdata/cli/__init__.py
CHANGED
@@ -1,12 +1,12 @@
 """Command-line interface for atdata.
 
-This module provides CLI commands for managing
+This module provides CLI commands for managing development infrastructure,
 inspecting datasets, and diagnosing configuration issues.
 
 Commands:
-    atdata
-    atdata
-    atdata
+    atdata infra up       Start Redis and MinIO containers for development
+    atdata infra down     Stop development containers
+    atdata infra status   Show status of infrastructure
     atdata diagnose       Check Redis configuration and connectivity
     atdata inspect        Show dataset summary information
     atdata schema show    Display dataset schema
@@ -30,12 +30,12 @@ app = typer.Typer(
     no_args_is_help=True,
 )
 
-
-    name="
-    help="Manage
+infra_app = typer.Typer(
+    name="infra",
+    help="Manage development infrastructure.",
     no_args_is_help=True,
 )
-app.add_typer(
+app.add_typer(infra_app, name="infra")
 
 schema_app = typer.Typer(
     name="schema",
@@ -101,11 +101,11 @@ def diagnose(
 
 
 # ---------------------------------------------------------------------------
-#
+# infra sub-commands
 # ---------------------------------------------------------------------------
 
 
-@
+@infra_app.command()
 def up(
     redis_port: int = typer.Option(6379, help="Redis port."),
     minio_port: int = typer.Option(9000, help="MinIO API port."),
@@ -115,7 +115,7 @@ def up(
     ),
 ) -> None:
     """Start Redis and MinIO containers."""
-    from .
+    from .infra import local_up
 
     code = local_up(
         redis_port=redis_port,
@@ -126,23 +126,23 @@ def up(
     raise typer.Exit(code=code)
 
 
-@
+@infra_app.command()
 def down(
     volumes: bool = typer.Option(
         False, "--volumes", "-v", help="Also remove volumes (deletes all data)."
     ),
 ) -> None:
     """Stop local development containers."""
-    from .
+    from .infra import local_down
 
     code = local_down(remove_volumes=volumes)
     raise typer.Exit(code=code)
 
 
-@
+@infra_app.command()
 def status() -> None:
-    """Show status of
-    from .
+    """Show status of infrastructure."""
+    from .infra import local_status
 
     code = local_status()
     raise typer.Exit(code=code)
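
Because the old `local` Typer sub-app is renamed rather than removed, the quickest sanity check is to drive the top-level `app` with typer's test runner. A minimal sketch, assuming `app` is importable from `atdata.cli` as the hunks above suggest:

    from typer.testing import CliRunner

    from atdata.cli import app  # top-level Typer app wired via add_typer(..., name="infra")

    runner = CliRunner()

    # The old `atdata local ...` commands now live under `atdata infra ...`.
    result = runner.invoke(app, ["infra", "status"])
    print(result.exit_code, result.output)

    # --help should list infra alongside diagnose, inspect and schema.
    print(runner.invoke(app, ["--help"]).output)
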
atdata/cli/diagnose.py
CHANGED
@@ -51,7 +51,7 @@ def diagnose_redis(host: str = "localhost", port: int = 6379) -> int:
         _print_status("Connection", False, str(e))
         print()
         print("Cannot connect to Redis. Make sure Redis is running:")
-        print("  atdata
+        print("  atdata infra up")
         return 1
 
     # Check Redis version
@@ -162,7 +162,7 @@ def diagnose_redis(host: str = "localhost", port: int = 6379) -> int:
         print("    maxmemory-policy noeviction")
         print()
         print("  # Or use atdata's preconfigured local setup:")
-        print("  atdata
+        print("  atdata infra up")
         return 1
     else:
         print("All checks passed. Redis is properly configured for atdata.")
atdata/cli/{local.py → infra.py}
RENAMED
@@ -1,6 +1,6 @@
-"""
+"""Infrastructure management for atdata.
 
-This module provides commands to start and stop
+This module provides commands to start and stop development infrastructure:
 - Redis: For index storage and metadata
 - MinIO: S3-compatible object storage for dataset files
 
@@ -179,7 +179,7 @@ def local_up(
     if not _check_docker():
         return 1
 
-    print("Starting atdata
+    print("Starting atdata infrastructure...")
 
     compose_content = _get_compose_file(redis_port, minio_port, minio_console_port)
     command = ["up"]
@@ -202,7 +202,7 @@ def local_up(
 
     # Show status
     print()
-    print("
+    print("Infrastructure started:")
     print(f"  Redis:         localhost:{redis_port}")
     print(f"  MinIO API:     http://localhost:{minio_port}")
     print(f"  MinIO Console: http://localhost:{minio_console_port}")
@@ -210,7 +210,7 @@ def local_up(
     print("MinIO credentials: minioadmin / minioadmin")
     print()
     print("Example usage:")
-    print("  from atdata.
+    print("  from atdata.stores import S3DataStore")
     print("  ")
     print("  store = S3DataStore.from_credentials({")
     print(f"      'AWS_ENDPOINT': 'http://localhost:{minio_port}',")
@@ -234,7 +234,7 @@ def local_down(remove_volumes: bool = False) -> int:
     if not _check_docker():
         return 1
 
-    print("Stopping atdata
+    print("Stopping atdata infrastructure...")
 
     # Use default ports for compose file (actual ports don't matter for down)
     compose_content = _get_compose_file(6379, 9000, 9001)
@@ -252,7 +252,7 @@ def local_down(remove_volumes: bool = False) -> int:
         print(f"Error: {e}", file=sys.stderr)
         return 1
 
-    print("
+    print("Infrastructure stopped.")
     return 0
 
 
@@ -268,16 +268,16 @@ def local_status() -> int:
     redis_running = _container_running(REDIS_CONTAINER)
     minio_running = _container_running(MINIO_CONTAINER)
 
-    print("atdata
+    print("atdata infrastructure status:")
     print()
     print(f"  Redis ({REDIS_CONTAINER}): {'running' if redis_running else 'stopped'}")
     print(f"  MinIO ({MINIO_CONTAINER}): {'running' if minio_running else 'stopped'}")
 
     if redis_running or minio_running:
         print()
-        print("To stop: atdata
+        print("To stop: atdata infra down")
     else:
         print()
-        print("To start: atdata
+        print("To start: atdata infra up")
 
     return 0