atdata 0.2.3b1__py3-none-any.whl → 0.3.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. atdata/.gitignore +1 -0
  2. atdata/__init__.py +30 -0
  3. atdata/_exceptions.py +168 -0
  4. atdata/_helpers.py +29 -15
  5. atdata/_hf_api.py +63 -11
  6. atdata/_logging.py +70 -0
  7. atdata/_protocols.py +19 -62
  8. atdata/_schema_codec.py +5 -4
  9. atdata/_type_utils.py +28 -2
  10. atdata/atmosphere/__init__.py +19 -9
  11. atdata/atmosphere/records.py +3 -2
  12. atdata/atmosphere/schema.py +2 -2
  13. atdata/cli/__init__.py +157 -171
  14. atdata/cli/inspect.py +69 -0
  15. atdata/cli/local.py +1 -1
  16. atdata/cli/preview.py +63 -0
  17. atdata/cli/schema.py +109 -0
  18. atdata/dataset.py +428 -326
  19. atdata/lens.py +9 -2
  20. atdata/local/__init__.py +71 -0
  21. atdata/local/_entry.py +157 -0
  22. atdata/local/_index.py +940 -0
  23. atdata/local/_repo_legacy.py +218 -0
  24. atdata/local/_s3.py +349 -0
  25. atdata/local/_schema.py +380 -0
  26. atdata/manifest/__init__.py +28 -0
  27. atdata/manifest/_aggregates.py +156 -0
  28. atdata/manifest/_builder.py +163 -0
  29. atdata/manifest/_fields.py +154 -0
  30. atdata/manifest/_manifest.py +146 -0
  31. atdata/manifest/_query.py +150 -0
  32. atdata/manifest/_writer.py +74 -0
  33. atdata/promote.py +4 -4
  34. atdata/providers/__init__.py +25 -0
  35. atdata/providers/_base.py +140 -0
  36. atdata/providers/_factory.py +69 -0
  37. atdata/providers/_postgres.py +214 -0
  38. atdata/providers/_redis.py +171 -0
  39. atdata/providers/_sqlite.py +191 -0
  40. atdata/repository.py +323 -0
  41. atdata/testing.py +337 -0
  42. {atdata-0.2.3b1.dist-info → atdata-0.3.0b1.dist-info}/METADATA +4 -1
  43. atdata-0.3.0b1.dist-info/RECORD +54 -0
  44. atdata/local.py +0 -1720
  45. atdata-0.2.3b1.dist-info/RECORD +0 -28
  46. {atdata-0.2.3b1.dist-info → atdata-0.3.0b1.dist-info}/WHEEL +0 -0
  47. {atdata-0.2.3b1.dist-info → atdata-0.3.0b1.dist-info}/entry_points.txt +0 -0
  48. {atdata-0.2.3b1.dist-info → atdata-0.3.0b1.dist-info}/licenses/LICENSE +0 -0
atdata/lens.py CHANGED
@@ -61,6 +61,7 @@ if TYPE_CHECKING:
61
61
  from .dataset import PackableSample
62
62
 
63
63
  from ._protocols import Packable
64
+ from ._exceptions import LensNotFoundError
64
65
 
65
66
 
66
67
  ##
@@ -101,7 +102,8 @@ class Lens(Generic[S, V]):
101
102
  ... return FullData(name=view.name, age=source.age)
102
103
  """
103
104
 
104
- # TODO The above has a line for "Parameters:" that should be "Type Parameters:"; this is a temporary fix for `quartodoc` auto-generation bugs.
105
+ # Note: The docstring uses "Parameters:" for type parameters as a workaround
106
+ # for quartodoc not supporting "Type Parameters:" sections.
105
107
 
106
108
  def __init__(
107
109
  self, get: LensGetter[S, V], put: Optional[LensPutter[S, V]] = None
@@ -290,7 +292,12 @@ class LensNetwork:
290
292
  """
291
293
  ret = self._registry.get((source, view), None)
292
294
  if ret is None:
293
- raise ValueError(f"No registered lens from source {source} to view {view}")
295
+ available_targets = [
296
+ (sig[1], lens_obj.__name__)
297
+ for sig, lens_obj in self._registry.items()
298
+ if sig[0] is source and hasattr(lens_obj, "__name__")
299
+ ]
300
+ raise LensNotFoundError(source, view, available_targets)
294
301
 
295
302
  return ret
296
303
 
atdata/local/__init__.py ADDED
@@ -0,0 +1,71 @@
1
+ """Local storage backend for atdata datasets.
2
+
3
+ Key classes:
4
+
5
+ - ``Index``: Unified index with pluggable providers (SQLite default),
6
+ named repositories, and optional atmosphere backend.
7
+ - ``LocalDatasetEntry``: Index entry with ATProto-compatible CIDs.
8
+ - ``S3DataStore``: S3-compatible shard storage.
9
+ """
10
+
11
+ from atdata.local._entry import (
12
+ LocalDatasetEntry,
13
+ BasicIndexEntry,
14
+ REDIS_KEY_DATASET_ENTRY,
15
+ REDIS_KEY_SCHEMA,
16
+ )
17
+ from atdata.local._schema import (
18
+ SchemaNamespace,
19
+ SchemaFieldType,
20
+ SchemaField,
21
+ LocalSchemaRecord,
22
+ _ATDATA_URI_PREFIX,
23
+ _LEGACY_URI_PREFIX,
24
+ _kind_str_for_sample_type,
25
+ _schema_ref_from_type,
26
+ _make_schema_ref,
27
+ _parse_schema_ref,
28
+ _increment_patch,
29
+ _python_type_to_field_type,
30
+ _build_schema_record,
31
+ )
32
+ from atdata.local._index import Index
33
+ from atdata.local._s3 import (
34
+ S3DataStore,
35
+ _s3_env,
36
+ _s3_from_credentials,
37
+ _create_s3_write_callbacks,
38
+ )
39
+ from atdata.local._repo_legacy import Repo
40
+
41
+ # Re-export third-party types that were previously importable from the
42
+ # monolithic local.py (tests reference atdata.local.S3FileSystem, etc.)
43
+ from s3fs import S3FileSystem # noqa: F401 — re-exported for backward compat
44
+
45
+ __all__ = [
46
+ # Public API
47
+ "Index",
48
+ "LocalDatasetEntry",
49
+ "BasicIndexEntry",
50
+ "S3DataStore",
51
+ "Repo",
52
+ "SchemaNamespace",
53
+ "SchemaFieldType",
54
+ "SchemaField",
55
+ "LocalSchemaRecord",
56
+ "REDIS_KEY_DATASET_ENTRY",
57
+ "REDIS_KEY_SCHEMA",
58
+ # Internal helpers (re-exported for backward compatibility)
59
+ "_ATDATA_URI_PREFIX",
60
+ "_LEGACY_URI_PREFIX",
61
+ "_kind_str_for_sample_type",
62
+ "_schema_ref_from_type",
63
+ "_make_schema_ref",
64
+ "_parse_schema_ref",
65
+ "_increment_patch",
66
+ "_python_type_to_field_type",
67
+ "_build_schema_record",
68
+ "_s3_env",
69
+ "_s3_from_credentials",
70
+ "_create_s3_write_callbacks",
71
+ ]
atdata/local/_entry.py ADDED
@@ -0,0 +1,157 @@
1
+ """Dataset entry model and Redis key constants."""
2
+
3
+ from atdata._cid import generate_cid
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import Any, cast
7
+
8
+ import msgpack
9
+ from redis import Redis
10
+
11
+
12
+ # Redis key prefixes for index entries and schemas
13
+ REDIS_KEY_DATASET_ENTRY = "LocalDatasetEntry"
14
+ REDIS_KEY_SCHEMA = "LocalSchema"
15
+
16
+
17
@dataclass
class LocalDatasetEntry:
    """A single dataset record in the local repository index.

    Satisfies the IndexEntry protocol so it can be used with AbstractIndex.
    Each entry has two identities: a content-addressable CID
    (ATProto-compatible) and a human-readable name.

    The CID is derived from the entry's content (schema_ref + data_urls), so
    identical data yields the same CID whether it lives locally or in the
    atmosphere, which is what allows promotion from local to ATProto.

    Attributes:
        name: Human-readable name for this dataset.
        schema_ref: Reference to the schema for this dataset.
        data_urls: WebDataset URLs for the data.
        metadata: Arbitrary metadata dictionary, or None if not set.
    """

    name: str
    """Human-readable name for this dataset."""

    schema_ref: str
    """Reference to the schema for this dataset."""

    data_urls: list[str]
    """WebDataset URLs for the data."""

    metadata: dict | None = None
    """Arbitrary metadata dictionary, or None if not set."""

    _cid: str | None = field(default=None, repr=False)
    """Content identifier (ATProto-compatible CID); computed from content when omitted."""

    # Kept only so entries written before the CID migration stay readable.
    _legacy_uuid: str | None = field(default=None, repr=False)
    """Legacy UUID carried over from existing Redis entries."""

    def __post_init__(self):
        """Fill in the CID from the entry content when none was supplied."""
        if self._cid is not None:
            return
        self._cid = self._generate_cid()

    def _generate_cid(self) -> str:
        """Compute the ATProto-compatible CID for this entry.

        Only ``schema_ref`` and ``data_urls`` take part in the CID; the name
        and metadata are not included.
        """
        return generate_cid(
            {
                "schema_ref": self.schema_ref,
                "data_urls": self.data_urls,
            }
        )

    @property
    def cid(self) -> str:
        """Content identifier (ATProto-compatible CID).

        Raises:
            RuntimeError: If the CID is unset, which is not expected once
                ``__post_init__`` has run.
        """
        if self._cid is not None:
            return self._cid
        raise RuntimeError(
            "CID not initialized; this should not happen after __post_init__"
        )

    # Legacy compatibility

    @property
    def wds_url(self) -> str:
        """Legacy accessor: the first data URL, or an empty string if there are none."""
        if not self.data_urls:
            return ""
        return self.data_urls[0]

    @property
    def sample_kind(self) -> str:
        """Legacy accessor: alias for ``schema_ref``."""
        return self.schema_ref

    def write_to(self, redis: Redis):
        """Persist this index entry to Redis.

        The entry is written as a Redis hash under the key
        '{REDIS_KEY_DATASET_ENTRY}:{cid}'.

        Args:
            redis: Redis connection to write to.
        """
        entry_cid = self.cid
        payload: dict[str, Any] = {
            "name": self.name,
            "schema_ref": self.schema_ref,
            # Lists cannot be stored as hash values directly, so pack to bytes.
            "data_urls": msgpack.packb(self.data_urls),
            "cid": entry_cid,
        }
        # Optional fields are written only when set.
        if self.metadata is not None:
            payload["metadata"] = msgpack.packb(self.metadata)
        if self._legacy_uuid is not None:
            payload["legacy_uuid"] = self._legacy_uuid

        redis.hset(  # type: ignore[arg-type]
            f"{REDIS_KEY_DATASET_ENTRY}:{entry_cid}", mapping=payload
        )

    @classmethod
    def from_redis(cls, redis: Redis, cid: str) -> "LocalDatasetEntry":
        """Load an entry from Redis by CID.

        Args:
            redis: Redis connection to read from.
            cid: Content identifier of the entry to load.

        Returns:
            LocalDatasetEntry loaded from Redis.

        Raises:
            KeyError: If entry not found.
        """
        stored = redis.hgetall(f"{REDIS_KEY_DATASET_ENTRY}:{cid}")
        if not stored:
            raise KeyError(f"{REDIS_KEY_DATASET_ENTRY} not found: {cid}")

        # Hash keys and values are handled as bytes: text fields are decoded
        # here, msgpack fields are passed to the unpacker untouched.
        fields = cast(dict[bytes, bytes], stored)

        def _optional_text(key: bytes) -> str | None:
            # A missing field and an empty value both become None.
            return fields.get(key, b"").decode("utf-8") or None

        entry_name = fields[b"name"].decode("utf-8")
        entry_schema_ref = fields[b"schema_ref"].decode("utf-8")
        stored_cid = _optional_text(b"cid")
        stored_uuid = _optional_text(b"legacy_uuid")

        urls = msgpack.unpackb(fields[b"data_urls"])
        if b"metadata" in fields:
            meta = msgpack.unpackb(fields[b"metadata"])
        else:
            meta = None

        return cls(
            name=entry_name,
            schema_ref=entry_schema_ref,
            data_urls=urls,
            metadata=meta,
            _cid=stored_cid,
            _legacy_uuid=stored_uuid,
        )
154
+
155
+
156
+ # Backwards compatibility alias
157
+ BasicIndexEntry = LocalDatasetEntry