atdata 0.2.2b1__py3-none-any.whl → 0.3.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- atdata/.gitignore +1 -0
- atdata/__init__.py +31 -1
- atdata/_cid.py +29 -35
- atdata/_exceptions.py +168 -0
- atdata/_helpers.py +33 -17
- atdata/_hf_api.py +109 -59
- atdata/_logging.py +70 -0
- atdata/_protocols.py +74 -132
- atdata/_schema_codec.py +38 -41
- atdata/_sources.py +57 -64
- atdata/_stub_manager.py +31 -26
- atdata/_type_utils.py +47 -7
- atdata/atmosphere/__init__.py +31 -24
- atdata/atmosphere/_types.py +11 -11
- atdata/atmosphere/client.py +11 -8
- atdata/atmosphere/lens.py +27 -30
- atdata/atmosphere/records.py +34 -39
- atdata/atmosphere/schema.py +35 -31
- atdata/atmosphere/store.py +16 -20
- atdata/cli/__init__.py +163 -168
- atdata/cli/diagnose.py +12 -8
- atdata/cli/inspect.py +69 -0
- atdata/cli/local.py +5 -2
- atdata/cli/preview.py +63 -0
- atdata/cli/schema.py +109 -0
- atdata/dataset.py +678 -533
- atdata/lens.py +85 -83
- atdata/local/__init__.py +71 -0
- atdata/local/_entry.py +157 -0
- atdata/local/_index.py +940 -0
- atdata/local/_repo_legacy.py +218 -0
- atdata/local/_s3.py +349 -0
- atdata/local/_schema.py +380 -0
- atdata/manifest/__init__.py +28 -0
- atdata/manifest/_aggregates.py +156 -0
- atdata/manifest/_builder.py +163 -0
- atdata/manifest/_fields.py +154 -0
- atdata/manifest/_manifest.py +146 -0
- atdata/manifest/_query.py +150 -0
- atdata/manifest/_writer.py +74 -0
- atdata/promote.py +20 -24
- atdata/providers/__init__.py +25 -0
- atdata/providers/_base.py +140 -0
- atdata/providers/_factory.py +69 -0
- atdata/providers/_postgres.py +214 -0
- atdata/providers/_redis.py +171 -0
- atdata/providers/_sqlite.py +191 -0
- atdata/repository.py +323 -0
- atdata/testing.py +337 -0
- {atdata-0.2.2b1.dist-info → atdata-0.3.0b1.dist-info}/METADATA +5 -1
- atdata-0.3.0b1.dist-info/RECORD +54 -0
- atdata/local.py +0 -1707
- atdata-0.2.2b1.dist-info/RECORD +0 -28
- {atdata-0.2.2b1.dist-info → atdata-0.3.0b1.dist-info}/WHEEL +0 -0
- {atdata-0.2.2b1.dist-info → atdata-0.3.0b1.dist-info}/entry_points.txt +0 -0
- {atdata-0.2.2b1.dist-info → atdata-0.3.0b1.dist-info}/licenses/LICENSE +0 -0
atdata/_sources.py
CHANGED
@@ -13,22 +13,20 @@ The key insight is that WebDataset's tar_file_expander only needs
 By providing streams directly, we can support private repos, custom
 endpoints, and future backends like ATProto blobs.
 
-... )
->>> ds = Dataset[MySample](source)
+Examples:
+    >>> # Standard URL (uses WebDataset's gopen)
+    >>> source = URLSource("https://example.com/data-{000..009}.tar")
+    >>> ds = Dataset[MySample](source)
+    >>>
+    >>> # Private S3 with credentials
+    >>> source = S3Source(
+    ...     bucket="my-bucket",
+    ...     keys=["train/shard-000.tar", "train/shard-001.tar"],
+    ...     endpoint="https://my-r2.cloudflarestorage.com",
+    ...     access_key="...",
+    ...     secret_key="...",
+    ... )
+    >>> ds = Dataset[MySample](source)
 """
 
 from __future__ import annotations
@@ -54,12 +52,10 @@ class URLSource:
     Attributes:
         url: URL or brace pattern for the shards.
 
-        >>> for shard_id, stream in source.shards:
-        ...     print(f"Streaming {shard_id}")
+    Examples:
+        >>> source = URLSource("https://example.com/train-{000..009}.tar")
+        >>> for shard_id, stream in source.shards:
+        ...     print(f"Streaming {shard_id}")
     """
 
     url: str
@@ -131,18 +127,16 @@ class S3Source:
         secret_key: Optional AWS secret access key.
         region: Optional AWS region (defaults to us-east-1).
 
-        >>> for shard_id, stream in source.shards:
-        ...     process(stream)
+    Examples:
+        >>> source = S3Source(
+        ...     bucket="my-datasets",
+        ...     keys=["train/shard-000.tar", "train/shard-001.tar"],
+        ...     endpoint="https://abc123.r2.cloudflarestorage.com",
+        ...     access_key="AKIAIOSFODNN7EXAMPLE",
+        ...     secret_key="wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY",
+        ... )
+        >>> for shard_id, stream in source.shards:
+        ...     process(stream)
     """
 
     bucket: str
@@ -173,7 +167,9 @@ class S3Source:
             client_kwargs["region_name"] = self.region
         elif not self.endpoint:
             # Default region for AWS S3
-            client_kwargs["region_name"] = os.environ.get(
+            client_kwargs["region_name"] = os.environ.get(
+                "AWS_DEFAULT_REGION", "us-east-1"
+            )
 
         self._client = boto3.client("s3", **client_kwargs)
         return self._client
@@ -225,7 +221,7 @@ class S3Source:
         if not shard_id.startswith(f"s3://{self.bucket}/"):
             raise KeyError(f"Shard not in this bucket: {shard_id}")
 
-        key = shard_id[len(f"s3://{self.bucket}/"):]
+        key = shard_id[len(f"s3://{self.bucket}/") :]
         client = self._get_client()
         response = client.get_object(Bucket=self.bucket, Key=key)
         return response["Body"]
@@ -258,13 +254,11 @@ class S3Source:
         Raises:
             ValueError: If URLs are not valid s3:// URLs or span multiple buckets.
 
-            ... endpoint="https://r2.example.com",
-            ... )
+        Examples:
+            >>> source = S3Source.from_urls(
+            ...     ["s3://my-bucket/train-000.tar", "s3://my-bucket/train-001.tar"],
+            ...     endpoint="https://r2.example.com",
+            ... )
         """
         if not urls:
             raise ValueError("urls cannot be empty")
@@ -317,15 +311,13 @@ class S3Source:
         Returns:
             Configured S3Source.
 
-            ... }
-            >>> source = S3Source.from_credentials(creds, "my-bucket", ["data.tar"])
+        Examples:
+            >>> creds = {
+            ...     "AWS_ACCESS_KEY_ID": "...",
+            ...     "AWS_SECRET_ACCESS_KEY": "...",
+            ...     "AWS_ENDPOINT": "https://r2.example.com",
+            ... }
+            >>> source = S3Source.from_credentials(creds, "my-bucket", ["data.tar"])
         """
         return cls(
             bucket=bucket,
@@ -352,22 +344,22 @@ class BlobSource:
         blob_refs: List of blob reference dicts with 'did' and 'cid' keys.
         pds_endpoint: Optional PDS endpoint URL. If not provided, resolved from DID.
 
-        >>> for shard_id, stream in source.shards:
-        ...     process(stream)
+    Examples:
+        >>> source = BlobSource(
+        ...     blob_refs=[
+        ...         {"did": "did:plc:abc123", "cid": "bafyrei..."},
+        ...         {"did": "did:plc:abc123", "cid": "bafyrei..."},
+        ...     ],
+        ... )
+        >>> for shard_id, stream in source.shards:
+        ...     process(stream)
     """
 
     blob_refs: list[dict[str, str]]
     pds_endpoint: str | None = None
-    _endpoint_cache: dict[str, str] = field(
+    _endpoint_cache: dict[str, str] = field(
+        default_factory=dict, repr=False, compare=False
+    )
 
     def _resolve_pds_endpoint(self, did: str) -> str:
         """Resolve PDS endpoint for a DID, with caching."""
@@ -459,6 +451,7 @@ class BlobSource:
        url = self._get_blob_url(did, cid)
 
        import requests
+
        response = requests.get(url, stream=True, timeout=60)
        response.raise_for_status()
        return response.raw
atdata/_stub_manager.py
CHANGED
@@ -8,20 +8,18 @@ Unlike simple .pyi stubs, the generated modules are actual Python code that
 can be imported at runtime. This allows ``decode_schema`` to return properly
 typed classes that work with both static type checkers and runtime.
 
->>> # Get the stub directory path for IDE configuration
->>> print(f"Add to IDE: {index.stub_dir}")
+Examples:
+    >>> from atdata.local import Index
+    >>>
+    >>> # Enable auto-stub generation
+    >>> index = Index(auto_stubs=True)
+    >>>
+    >>> # Modules are generated automatically on decode_schema
+    >>> MyType = index.decode_schema("atdata://local/sampleSchema/MySample@1.0.0")
+    >>> # MyType is now properly typed for IDE autocomplete!
+    >>>
+    >>> # Get the stub directory path for IDE configuration
+    >>> print(f"Add to IDE: {index.stub_dir}")
 """
 
 from pathlib import Path
@@ -101,14 +99,12 @@ class StubManager:
     Args:
         stub_dir: Directory to write module files. Defaults to ``~/.atdata/stubs/``.
 
-        >>> print(manager.stub_dir)
-        /Users/you/.atdata/stubs
+    Examples:
+        >>> manager = StubManager()
+        >>> schema_dict = {"name": "MySample", "version": "1.0.0", "fields": [...]}
+        >>> SampleClass = manager.ensure_module(schema_dict)
+        >>> print(manager.stub_dir)
+        /Users/you/.atdata/stubs
     """
 
     def __init__(self, stub_dir: Optional[Union[str, Path]] = None):
@@ -157,7 +153,9 @@ class StubManager:
         """Alias for _module_filename for backwards compatibility."""
         return self._module_filename(name, version)
 
-    def _module_path(
+    def _module_path(
+        self, name: str, version: str, authority: str = DEFAULT_AUTHORITY
+    ) -> Path:
         """Get full path to module file for a schema.
 
         Args:
@@ -170,7 +168,9 @@ class StubManager:
         """
         return self._stub_dir / authority / self._module_filename(name, version)
 
-    def _stub_path(
+    def _stub_path(
+        self, name: str, version: str, authority: str = DEFAULT_AUTHORITY
+    ) -> Path:
         """Alias for _module_path for backwards compatibility."""
         return self._module_path(name, version, authority)
 
@@ -211,7 +211,9 @@ class StubManager:
         authority_dir.mkdir(parents=True, exist_ok=True)
         init_path = authority_dir / "__init__.py"
         if not init_path.exists():
-            init_path.write_text(
+            init_path.write_text(
+                f'"""Auto-generated schema modules for {authority}."""\n'
+            )
 
     def _write_module_atomic(self, path: Path, content: str, authority: str) -> None:
         """Write module file atomically using temp file and rename.
@@ -359,7 +361,9 @@ class StubManager:
 
         return cls
 
-    def _import_class_from_module(
+    def _import_class_from_module(
+        self, module_path: Path, class_name: str
+    ) -> Optional[Type]:
         """Import a class from a generated module file.
 
         Uses importlib to dynamically load the module and extract the class.
@@ -399,6 +403,7 @@ class StubManager:
     def _print_ide_hint(self) -> None:
         """Print a one-time hint about IDE configuration."""
         import sys as _sys
+
         print(
             f"\n[atdata] Generated schema module in: {self._stub_dir}\n"
             f"[atdata] For IDE support, add this path to your type checker:\n"
atdata/_type_utils.py
CHANGED
@@ -9,15 +9,29 @@ from typing import Any, get_origin, get_args, Union
 
 # Mapping from numpy dtype strings to schema dtype names
 NUMPY_DTYPE_MAP = {
-    "float16": "float16",
+    "float16": "float16",
+    "float32": "float32",
+    "float64": "float64",
+    "int8": "int8",
+    "int16": "int16",
+    "int32": "int32",
+    "int64": "int64",
+    "uint8": "uint8",
+    "uint16": "uint16",
+    "uint32": "uint32",
+    "uint64": "uint64",
+    "bool": "bool",
+    "complex64": "complex64",
+    "complex128": "complex128",
 }
 
 # Mapping from Python primitive types to schema type names
 PRIMITIVE_TYPE_MAP = {
-    str: "str",
+    str: "str",
+    int: "int",
+    float: "float",
+    bool: "bool",
+    bytes: "bytes",
 }
 
@@ -31,9 +45,13 @@ def numpy_dtype_to_string(dtype: Any) -> str:
         Schema dtype string (e.g., "float32", "int64"). Defaults to "float32".
     """
     dtype_str = str(dtype)
+    # Exact match first (handles "float32", "int64", etc.)
+    if dtype_str in NUMPY_DTYPE_MAP:
+        return NUMPY_DTYPE_MAP[dtype_str]
+    # Substring match, longest keys first to avoid "int8" matching "uint8"
+    for key in sorted(NUMPY_DTYPE_MAP, key=len, reverse=True):
         if key in dtype_str:
-            return
+            return NUMPY_DTYPE_MAP[key]
     return "float32"
 
 
@@ -88,3 +106,25 @@ def extract_ndarray_dtype(python_type: Any) -> str:
     if dtype_arg is not None:
         return numpy_dtype_to_string(dtype_arg)
     return "float32"
+
+
+def parse_semver(version: str) -> tuple[int, int, int]:
+    """Parse a semantic version string into a comparable tuple.
+
+    Args:
+        version: A ``"major.minor.patch"`` version string.
+
+    Returns:
+        Tuple of (major, minor, patch) integers.
+
+    Raises:
+        ValueError: If the version string is not valid semver.
+
+    Examples:
+        >>> parse_semver("1.2.3")
+        (1, 2, 3)
+    """
+    parts = version.split(".")
+    if len(parts) != 3:
+        raise ValueError(f"Invalid semver: {version}")
+    return int(parts[0]), int(parts[1]), int(parts[2])
atdata/atmosphere/__init__.py
CHANGED
@@ -15,16 +15,14 @@ The ATProto integration is additive - existing atdata functionality continues
 to work unchanged. These features are opt-in for users who want to publish
 or discover datasets on the ATProto network.
 
->>> publisher = SchemaPublisher(client)
->>> schema_uri = publisher.publish(MySampleType, version="1.0.0")
+Examples:
+    >>> from atdata.atmosphere import AtmosphereClient, SchemaPublisher
+    >>>
+    >>> client = AtmosphereClient()
+    >>> client.login("handle.bsky.social", "app-password")
+    >>>
+    >>> publisher = SchemaPublisher(client)
+    >>> schema_uri = publisher.publish(MySampleType, version="1.0.0")
 
 Note:
     This module requires the ``atproto`` package to be installed::
@@ -86,6 +84,7 @@ class AtmosphereIndexEntry:
     def metadata(self) -> Optional[dict]:
         """Metadata from the record, if any."""
         import msgpack
+
         metadata_bytes = self._record.get("metadata")
         if metadata_bytes is None:
             return None
@@ -100,25 +99,25 @@ class AtmosphereIndexEntry:
 class AtmosphereIndex:
     """ATProto index implementing AbstractIndex protocol.
 
+    .. deprecated::
+        Use ``atdata.Index(atmosphere=client)`` instead. ``AtmosphereIndex``
+        is retained for backwards compatibility and will be removed in a
+        future release.
+
     Wraps SchemaPublisher/Loader and DatasetPublisher/Loader to provide
-    a unified interface compatible with
+    a unified interface compatible with Index.
 
     Optionally accepts a ``PDSBlobStore`` for writing dataset shards as
     ATProto blobs, enabling fully decentralized dataset storage.
 
-    >>>
-    >>> # With PDS blob storage
-    >>> store = PDSBlobStore(client)
-    >>> index = AtmosphereIndex(client, data_store=store)
-    >>> entry = index.insert_dataset(dataset, name="my-data")
+    Examples:
+        >>> # Preferred: use unified Index
+        >>> from atdata.local import Index
+        >>> from atdata.atmosphere import AtmosphereClient
+        >>> index = Index(atmosphere=client)
+        >>>
+        >>> # Legacy (deprecated)
+        >>> index = AtmosphereIndex(client)
     """
 
     def __init__(
@@ -134,6 +133,14 @@ class AtmosphereIndex:
             data_store: Optional PDSBlobStore for writing shards as blobs.
                 If provided, insert_dataset will upload shards to PDS.
         """
+        import warnings
+
+        warnings.warn(
+            "AtmosphereIndex is deprecated. Use atdata.Index(atmosphere=client) "
+            "instead for unified index access.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
         self.client = client
         self._schema_publisher = SchemaPublisher(client)
         self._schema_loader = SchemaLoader(client)
atdata/atmosphere/_types.py
CHANGED
@@ -19,16 +19,14 @@ class AtUri:
 
     AT URIs follow the format: at://<authority>/<collection>/<rkey>
 
-    >>> uri.rkey
-    'xyz'
+    Examples:
+        >>> uri = AtUri.parse("at://did:plc:abc123/ac.foundation.dataset.sampleSchema/xyz")
+        >>> uri.authority
+        'did:plc:abc123'
+        >>> uri.collection
+        'ac.foundation.dataset.sampleSchema'
+        >>> uri.rkey
+        'xyz'
     """
 
     authority: str
@@ -58,7 +56,9 @@ class AtUri:
 
         parts = uri[5:].split("/")
         if len(parts) < 3:
-            raise ValueError(
+            raise ValueError(
+                f"Invalid AT URI: expected authority/collection/rkey: {uri}"
+            )
 
         return cls(
             authority=parts[0],
atdata/atmosphere/client.py
CHANGED
@@ -18,6 +18,7 @@ def _get_atproto_client_class():
     if _atproto_client_class is None:
         try:
             from atproto import Client
+
             _atproto_client_class = Client
         except ImportError as e:
             raise ImportError(
@@ -33,13 +34,11 @@ class AtmosphereClient:
     This class wraps the atproto SDK client and provides higher-level methods
     for working with atdata records (schemas, datasets, lenses).
 
-    >>> print(client.did)
-    'did:plc:...'
+    Examples:
+        >>> client = AtmosphereClient()
+        >>> client.login("alice.bsky.social", "app-password")
+        >>> print(client.did)
+        'did:plc:...'
 
     Note:
         The password should be an app-specific password, not your main account
@@ -327,7 +326,11 @@ class AtmosphereClient:
         # Convert to dict format suitable for embedding in records
         return {
             "$type": "blob",
-            "ref": {
+            "ref": {
+                "$link": blob_ref.ref.link
+                if hasattr(blob_ref.ref, "link")
+                else str(blob_ref.ref)
+            },
             "mimeType": blob_ref.mime_type,
             "size": blob_ref.size,
         }
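The last hunk normalizes the blob reference: typed SDK refs expose `.link`, while some paths hand back a bare CID, and both now end up as a `{"$link": ...}` dict. A shape sketch of the resulting record-embeddable value; the CID, mime type, and size below are illustrative, not from the diff.

    # Shape sketch of the dict produced by the return statement in the hunk above.
    def normalize_ref(ref) -> dict:
        # Mirrors the hunk: prefer ref.link when present, else stringify.
        return {"$link": ref.link if hasattr(ref, "link") else str(ref)}

    blob_dict = {
        "$type": "blob",
        "ref": normalize_ref("bafkreigh2akiscaild..."),  # illustrative CID string
        "mimeType": "application/x-tar",
        "size": 104_857_600,
    }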
atdata/atmosphere/lens.py
CHANGED
@@ -21,6 +21,7 @@ from ._types import (
 
 # Import for type checking only
 from typing import TYPE_CHECKING
+
 if TYPE_CHECKING:
     from ..lens import Lens
 
@@ -31,26 +32,24 @@ class LensPublisher:
     This class creates lens records that reference source and target schemas
     and point to the transformation code in a git repository.
 
-        ... putter_path="mymodule.lenses:my_lens_putter",
-        ... )
+    Examples:
+        >>> @atdata.lens
+        ... def my_lens(source: SourceType) -> TargetType:
+        ...     return TargetType(field=source.other_field)
+        >>>
+        >>> client = AtmosphereClient()
+        >>> client.login("handle", "password")
+        >>>
+        >>> publisher = LensPublisher(client)
+        >>> uri = publisher.publish(
+        ...     name="my_lens",
+        ...     source_schema_uri="at://did:plc:abc/ac.foundation.dataset.sampleSchema/source",
+        ...     target_schema_uri="at://did:plc:abc/ac.foundation.dataset.sampleSchema/target",
+        ...     code_repository="https://github.com/user/repo",
+        ...     code_commit="abc123def456",
+        ...     getter_path="mymodule.lenses:my_lens",
+        ...     putter_path="mymodule.lenses:my_lens_putter",
+        ... )
 
     Security Note:
         Lens code is stored as references to git repositories rather than
@@ -195,16 +194,14 @@ class LensLoader:
     using a lens requires installing the referenced code and importing
     it manually.
 
-    >>> print(record["sourceSchema"])
-    >>> print(record.get("getterCode", {}).get("repository"))
+    Examples:
+        >>> client = AtmosphereClient()
+        >>> loader = LensLoader(client)
+        >>>
+        >>> record = loader.get("at://did:plc:abc/ac.foundation.dataset.lens/xyz")
+        >>> print(record["name"])
+        >>> print(record["sourceSchema"])
+        >>> print(record.get("getterCode", {}).get("repository"))
     """
 
     def __init__(self, client: AtmosphereClient):