atdata 0.2.3b1__py3-none-any.whl → 0.3.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- atdata/.gitignore +1 -0
- atdata/__init__.py +30 -0
- atdata/_exceptions.py +168 -0
- atdata/_helpers.py +29 -15
- atdata/_hf_api.py +63 -11
- atdata/_logging.py +70 -0
- atdata/_protocols.py +19 -62
- atdata/_schema_codec.py +5 -4
- atdata/_type_utils.py +28 -2
- atdata/atmosphere/__init__.py +19 -9
- atdata/atmosphere/records.py +3 -2
- atdata/atmosphere/schema.py +2 -2
- atdata/cli/__init__.py +157 -171
- atdata/cli/inspect.py +69 -0
- atdata/cli/local.py +1 -1
- atdata/cli/preview.py +63 -0
- atdata/cli/schema.py +109 -0
- atdata/dataset.py +428 -326
- atdata/lens.py +9 -2
- atdata/local/__init__.py +71 -0
- atdata/local/_entry.py +157 -0
- atdata/local/_index.py +940 -0
- atdata/local/_repo_legacy.py +218 -0
- atdata/local/_s3.py +349 -0
- atdata/local/_schema.py +380 -0
- atdata/manifest/__init__.py +28 -0
- atdata/manifest/_aggregates.py +156 -0
- atdata/manifest/_builder.py +163 -0
- atdata/manifest/_fields.py +154 -0
- atdata/manifest/_manifest.py +146 -0
- atdata/manifest/_query.py +150 -0
- atdata/manifest/_writer.py +74 -0
- atdata/promote.py +4 -4
- atdata/providers/__init__.py +25 -0
- atdata/providers/_base.py +140 -0
- atdata/providers/_factory.py +69 -0
- atdata/providers/_postgres.py +214 -0
- atdata/providers/_redis.py +171 -0
- atdata/providers/_sqlite.py +191 -0
- atdata/repository.py +323 -0
- atdata/testing.py +337 -0
- {atdata-0.2.3b1.dist-info → atdata-0.3.0b1.dist-info}/METADATA +4 -1
- atdata-0.3.0b1.dist-info/RECORD +54 -0
- atdata/local.py +0 -1720
- atdata-0.2.3b1.dist-info/RECORD +0 -28
- {atdata-0.2.3b1.dist-info → atdata-0.3.0b1.dist-info}/WHEEL +0 -0
- {atdata-0.2.3b1.dist-info → atdata-0.3.0b1.dist-info}/entry_points.txt +0 -0
- {atdata-0.2.3b1.dist-info → atdata-0.3.0b1.dist-info}/licenses/LICENSE +0 -0
atdata/lens.py
CHANGED
|
@@ -61,6 +61,7 @@ if TYPE_CHECKING:
|
|
|
61
61
|
from .dataset import PackableSample
|
|
62
62
|
|
|
63
63
|
from ._protocols import Packable
|
|
64
|
+
from ._exceptions import LensNotFoundError
|
|
64
65
|
|
|
65
66
|
|
|
66
67
|
##
|
|
@@ -101,7 +102,8 @@ class Lens(Generic[S, V]):
|
|
|
101
102
|
... return FullData(name=view.name, age=source.age)
|
|
102
103
|
"""
|
|
103
104
|
|
|
104
|
-
#
|
|
105
|
+
# Note: The docstring uses "Parameters:" for type parameters as a workaround
|
|
106
|
+
# for quartodoc not supporting "Type Parameters:" sections.
|
|
105
107
|
|
|
106
108
|
def __init__(
|
|
107
109
|
self, get: LensGetter[S, V], put: Optional[LensPutter[S, V]] = None
|
|
@@ -290,7 +292,12 @@ class LensNetwork:
|
|
|
290
292
|
"""
|
|
291
293
|
ret = self._registry.get((source, view), None)
|
|
292
294
|
if ret is None:
|
|
293
|
-
|
|
295
|
+
available_targets = [
|
|
296
|
+
(sig[1], lens_obj.__name__)
|
|
297
|
+
for sig, lens_obj in self._registry.items()
|
|
298
|
+
if sig[0] is source and hasattr(lens_obj, "__name__")
|
|
299
|
+
]
|
|
300
|
+
raise LensNotFoundError(source, view, available_targets)
|
|
294
301
|
|
|
295
302
|
return ret
|
|
296
303
|
|
atdata/local/__init__.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
"""Local storage backend for atdata datasets.
|
|
2
|
+
|
|
3
|
+
Key classes:
|
|
4
|
+
|
|
5
|
+
- ``Index``: Unified index with pluggable providers (SQLite default),
|
|
6
|
+
named repositories, and optional atmosphere backend.
|
|
7
|
+
- ``LocalDatasetEntry``: Index entry with ATProto-compatible CIDs.
|
|
8
|
+
- ``S3DataStore``: S3-compatible shard storage.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from atdata.local._entry import (
|
|
12
|
+
LocalDatasetEntry,
|
|
13
|
+
BasicIndexEntry,
|
|
14
|
+
REDIS_KEY_DATASET_ENTRY,
|
|
15
|
+
REDIS_KEY_SCHEMA,
|
|
16
|
+
)
|
|
17
|
+
from atdata.local._schema import (
|
|
18
|
+
SchemaNamespace,
|
|
19
|
+
SchemaFieldType,
|
|
20
|
+
SchemaField,
|
|
21
|
+
LocalSchemaRecord,
|
|
22
|
+
_ATDATA_URI_PREFIX,
|
|
23
|
+
_LEGACY_URI_PREFIX,
|
|
24
|
+
_kind_str_for_sample_type,
|
|
25
|
+
_schema_ref_from_type,
|
|
26
|
+
_make_schema_ref,
|
|
27
|
+
_parse_schema_ref,
|
|
28
|
+
_increment_patch,
|
|
29
|
+
_python_type_to_field_type,
|
|
30
|
+
_build_schema_record,
|
|
31
|
+
)
|
|
32
|
+
from atdata.local._index import Index
|
|
33
|
+
from atdata.local._s3 import (
|
|
34
|
+
S3DataStore,
|
|
35
|
+
_s3_env,
|
|
36
|
+
_s3_from_credentials,
|
|
37
|
+
_create_s3_write_callbacks,
|
|
38
|
+
)
|
|
39
|
+
from atdata.local._repo_legacy import Repo
|
|
40
|
+
|
|
41
|
+
# Re-export third-party types that were previously importable from the
|
|
42
|
+
# monolithic local.py (tests reference atdata.local.S3FileSystem, etc.)
|
|
43
|
+
from s3fs import S3FileSystem # noqa: F401 — re-exported for backward compat
|
|
44
|
+
|
|
45
|
+
__all__ = [
    # Supported public surface of the local storage package.
    "Index",
    "LocalDatasetEntry",
    "BasicIndexEntry",
    "S3DataStore",
    "Repo",
    "SchemaNamespace",
    "SchemaFieldType",
    "SchemaField",
    "LocalSchemaRecord",
    "REDIS_KEY_DATASET_ENTRY",
    "REDIS_KEY_SCHEMA",
    # Underscore-prefixed helpers: internal, but still exported so that code
    # written against the former single-file module keeps importing them.
    "_ATDATA_URI_PREFIX",
    "_LEGACY_URI_PREFIX",
    "_kind_str_for_sample_type",
    "_schema_ref_from_type",
    "_make_schema_ref",
    "_parse_schema_ref",
    "_increment_patch",
    "_python_type_to_field_type",
    "_build_schema_record",
    "_s3_env",
    "_s3_from_credentials",
    "_create_s3_write_callbacks",
]
|
atdata/local/_entry.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
"""Dataset entry model and Redis key constants."""
|
|
2
|
+
|
|
3
|
+
from atdata._cid import generate_cid
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from typing import Any, cast
|
|
7
|
+
|
|
8
|
+
import msgpack
|
|
9
|
+
from redis import Redis
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# Key prefixes under which dataset entries and schemas are stored in Redis.
REDIS_KEY_DATASET_ENTRY = "LocalDatasetEntry"
REDIS_KEY_SCHEMA = "LocalSchema"


@dataclass
class LocalDatasetEntry:
    """Index record describing one dataset held in the local repository.

    Designed to satisfy the IndexEntry protocol so it can be used with
    AbstractIndex. Every entry carries two identities: a human-readable
    ``name`` and a content-addressable, ATProto-compatible CID.

    When no CID is supplied, one is derived from ``schema_ref`` and
    ``data_urls`` only, so entries describing the same data share a CID
    regardless of their name or metadata. This is intended to make promotion
    from local storage to the atmosphere seamless.

    Attributes:
        name: Human-readable name for this dataset.
        schema_ref: Reference to the schema for this dataset.
        data_urls: WebDataset URLs for the data.
        metadata: Arbitrary metadata dictionary, or None if not set.
    """

    name: str
    """Human-readable dataset name."""

    schema_ref: str
    """Reference to the schema describing the dataset's samples."""

    data_urls: list[str]
    """WebDataset URLs at which the data lives."""

    metadata: dict | None = None
    """Free-form metadata dictionary; None when nothing was attached."""

    _cid: str | None = field(default=None, repr=False)
    """ATProto-compatible content identifier; derived from content when omitted."""

    # Retained only so entries written before the CID migration stay readable.
    _legacy_uuid: str | None = field(default=None, repr=False)
    """UUID carried over from pre-CID Redis entries, if any."""

    def __post_init__(self):
        """Fill in the CID from the entry's content when none was given."""
        if self._cid is None:
            self._cid = self._generate_cid()

    def _generate_cid(self) -> str:
        """Compute the CID from the fields that define dataset identity."""
        # Only schema_ref and data_urls participate; name and metadata do not.
        return generate_cid(
            {
                "schema_ref": self.schema_ref,
                "data_urls": self.data_urls,
            }
        )

    @property
    def cid(self) -> str:
        """Content identifier (ATProto-compatible CID).

        Raises:
            RuntimeError: If the CID is unset, which ``__post_init__`` is
                expected to prevent.
        """
        if self._cid is not None:
            return self._cid
        raise RuntimeError(
            "CID not initialized; this should not happen after __post_init__"
        )

    # Accessors kept for callers written against the older entry model.

    @property
    def wds_url(self) -> str:
        """Legacy accessor: the first data URL, or "" when there are none."""
        if not self.data_urls:
            return ""
        return self.data_urls[0]

    @property
    def sample_kind(self) -> str:
        """Legacy accessor: alias for ``schema_ref``."""
        return self.schema_ref

    def write_to(self, redis: Redis):
        """Store this entry in Redis as a hash.

        The hash lives at '{REDIS_KEY_DATASET_ENTRY}:{cid}'. ``data_urls`` and
        ``metadata`` are msgpack-encoded; ``metadata`` and ``legacy_uuid`` are
        written only when they are set.

        Args:
            redis: Redis connection to write to.
        """
        hash_key = f"{REDIS_KEY_DATASET_ENTRY}:{self.cid}"

        payload: dict[str, Any] = {
            "name": self.name,
            "schema_ref": self.schema_ref,
            # Lists cannot be stored as hash values directly, so pack them.
            "data_urls": msgpack.packb(self.data_urls),
            "cid": self.cid,
        }
        if self.metadata is not None:
            payload["metadata"] = msgpack.packb(self.metadata)
        if self._legacy_uuid is not None:
            payload["legacy_uuid"] = self._legacy_uuid

        redis.hset(hash_key, mapping=payload)  # type: ignore[arg-type]

    @classmethod
    def from_redis(cls, redis: Redis, cid: str) -> "LocalDatasetEntry":
        """Read the entry stored under the given CID.

        Args:
            redis: Redis connection to read from.
            cid: Content identifier of the entry to load.

        Returns:
            The LocalDatasetEntry rebuilt from the stored hash.

        Raises:
            KeyError: If no hash exists for ``cid``.
        """
        stored = redis.hgetall(f"{REDIS_KEY_DATASET_ENTRY}:{cid}")
        if not stored:
            raise KeyError(f"{REDIS_KEY_DATASET_ENTRY} not found: {cid}")

        # Keys and values are accessed as bytes: text fields are decoded
        # below, msgpack fields are handed to the unpacker untouched.
        fields = cast(dict[bytes, bytes], stored)

        def optional_text(key: bytes) -> str | None:
            # A missing or empty field is reported as None.
            return fields.get(key, b"").decode("utf-8") or None

        entry_name = fields[b"name"].decode("utf-8")
        entry_schema_ref = fields[b"schema_ref"].decode("utf-8")
        stored_cid = optional_text(b"cid")
        stored_uuid = optional_text(b"legacy_uuid")

        urls = msgpack.unpackb(fields[b"data_urls"])
        meta = (
            msgpack.unpackb(fields[b"metadata"]) if b"metadata" in fields else None
        )

        return cls(
            name=entry_name,
            schema_ref=entry_schema_ref,
            data_urls=urls,
            metadata=meta,
            _cid=stored_cid,
            _legacy_uuid=stored_uuid,
        )


# Old name for the entry class, kept so existing imports continue to work.
BasicIndexEntry = LocalDatasetEntry
|