atdata 0.2.3b1__py3-none-any.whl → 0.3.1b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. atdata/.gitignore +1 -0
  2. atdata/__init__.py +39 -0
  3. atdata/_cid.py +0 -21
  4. atdata/_exceptions.py +168 -0
  5. atdata/_helpers.py +41 -15
  6. atdata/_hf_api.py +95 -11
  7. atdata/_logging.py +70 -0
  8. atdata/_protocols.py +77 -238
  9. atdata/_schema_codec.py +7 -6
  10. atdata/_stub_manager.py +5 -25
  11. atdata/_type_utils.py +28 -2
  12. atdata/atmosphere/__init__.py +31 -20
  13. atdata/atmosphere/_types.py +4 -4
  14. atdata/atmosphere/client.py +64 -12
  15. atdata/atmosphere/lens.py +11 -12
  16. atdata/atmosphere/records.py +12 -12
  17. atdata/atmosphere/schema.py +16 -18
  18. atdata/atmosphere/store.py +6 -7
  19. atdata/cli/__init__.py +161 -175
  20. atdata/cli/diagnose.py +2 -2
  21. atdata/cli/{local.py → infra.py} +11 -11
  22. atdata/cli/inspect.py +69 -0
  23. atdata/cli/preview.py +63 -0
  24. atdata/cli/schema.py +109 -0
  25. atdata/dataset.py +583 -328
  26. atdata/index/__init__.py +54 -0
  27. atdata/index/_entry.py +157 -0
  28. atdata/index/_index.py +1198 -0
  29. atdata/index/_schema.py +380 -0
  30. atdata/lens.py +9 -2
  31. atdata/lexicons/__init__.py +121 -0
  32. atdata/lexicons/ac.foundation.dataset.arrayFormat.json +16 -0
  33. atdata/lexicons/ac.foundation.dataset.getLatestSchema.json +78 -0
  34. atdata/lexicons/ac.foundation.dataset.lens.json +99 -0
  35. atdata/lexicons/ac.foundation.dataset.record.json +96 -0
  36. atdata/lexicons/ac.foundation.dataset.schema.json +107 -0
  37. atdata/lexicons/ac.foundation.dataset.schemaType.json +16 -0
  38. atdata/lexicons/ac.foundation.dataset.storageBlobs.json +24 -0
  39. atdata/lexicons/ac.foundation.dataset.storageExternal.json +25 -0
  40. atdata/lexicons/ndarray_shim.json +16 -0
  41. atdata/local/__init__.py +70 -0
  42. atdata/local/_repo_legacy.py +218 -0
  43. atdata/manifest/__init__.py +28 -0
  44. atdata/manifest/_aggregates.py +156 -0
  45. atdata/manifest/_builder.py +163 -0
  46. atdata/manifest/_fields.py +154 -0
  47. atdata/manifest/_manifest.py +146 -0
  48. atdata/manifest/_query.py +150 -0
  49. atdata/manifest/_writer.py +74 -0
  50. atdata/promote.py +18 -14
  51. atdata/providers/__init__.py +25 -0
  52. atdata/providers/_base.py +140 -0
  53. atdata/providers/_factory.py +69 -0
  54. atdata/providers/_postgres.py +214 -0
  55. atdata/providers/_redis.py +171 -0
  56. atdata/providers/_sqlite.py +191 -0
  57. atdata/repository.py +323 -0
  58. atdata/stores/__init__.py +23 -0
  59. atdata/stores/_disk.py +123 -0
  60. atdata/stores/_s3.py +349 -0
  61. atdata/testing.py +341 -0
  62. {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/METADATA +5 -2
  63. atdata-0.3.1b1.dist-info/RECORD +67 -0
  64. atdata/local.py +0 -1720
  65. atdata-0.2.3b1.dist-info/RECORD +0 -28
  66. {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/WHEEL +0 -0
  67. {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/entry_points.txt +0 -0
  68. {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,54 @@
1
+ """Index and entry models for atdata datasets.
2
+
3
+ Key classes:
4
+
5
+ - ``Index``: Unified index with pluggable providers (SQLite default),
6
+ named repositories, and optional atmosphere backend.
7
+ - ``LocalDatasetEntry``: Index entry with ATProto-compatible CIDs.
8
+ """
9
+
10
+ from atdata.index._entry import (
11
+ LocalDatasetEntry,
12
+ BasicIndexEntry,
13
+ REDIS_KEY_DATASET_ENTRY,
14
+ REDIS_KEY_SCHEMA,
15
+ )
16
+ from atdata.index._schema import (
17
+ SchemaNamespace,
18
+ SchemaFieldType,
19
+ SchemaField,
20
+ LocalSchemaRecord,
21
+ _ATDATA_URI_PREFIX,
22
+ _LEGACY_URI_PREFIX,
23
+ _kind_str_for_sample_type,
24
+ _schema_ref_from_type,
25
+ _make_schema_ref,
26
+ _parse_schema_ref,
27
+ _increment_patch,
28
+ _python_type_to_field_type,
29
+ _build_schema_record,
30
+ )
31
+ from atdata.index._index import Index
32
+
33
+ __all__ = [
34
+ # Public API
35
+ "Index",
36
+ "LocalDatasetEntry",
37
+ "BasicIndexEntry",
38
+ "SchemaNamespace",
39
+ "SchemaFieldType",
40
+ "SchemaField",
41
+ "LocalSchemaRecord",
42
+ "REDIS_KEY_DATASET_ENTRY",
43
+ "REDIS_KEY_SCHEMA",
44
+ # Internal helpers (re-exported for backward compatibility)
45
+ "_ATDATA_URI_PREFIX",
46
+ "_LEGACY_URI_PREFIX",
47
+ "_kind_str_for_sample_type",
48
+ "_schema_ref_from_type",
49
+ "_make_schema_ref",
50
+ "_parse_schema_ref",
51
+ "_increment_patch",
52
+ "_python_type_to_field_type",
53
+ "_build_schema_record",
54
+ ]
atdata/index/_entry.py ADDED
@@ -0,0 +1,157 @@
1
+ """Dataset entry model and Redis key constants."""
2
+
3
+ from atdata._cid import generate_cid
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import Any, cast
7
+
8
+ import msgpack
9
+ from redis import Redis
10
+
11
+
12
+ # Redis key prefixes for index entries and schemas
13
+ REDIS_KEY_DATASET_ENTRY = "LocalDatasetEntry"
14
+ REDIS_KEY_SCHEMA = "LocalSchema"
15
+
16
+
17
@dataclass
class LocalDatasetEntry:
    """A single dataset record held in the local index.

    Implements the IndexEntry protocol for compatibility with AbstractIndex.
    Every entry carries two identities: a content-addressable CID
    (ATProto-compatible) and a human-readable name.

    The CID is computed from ``schema_ref`` and ``data_urls`` only, so two
    entries describing the same data share a CID regardless of their name
    or metadata.

    Attributes:
        name: Human-readable name for this dataset.
        schema_ref: Reference to the schema for this dataset.
        data_urls: WebDataset URLs for the data.
        metadata: Arbitrary metadata dictionary, or None if not set.
    """

    # -- Public fields -----------------------------------------------------

    name: str
    """Human-readable dataset name."""

    schema_ref: str
    """Reference to the schema describing this dataset's samples."""

    data_urls: list[str]
    """WebDataset URLs where the data lives."""

    metadata: dict | None = None
    """Free-form metadata dictionary; None when no metadata was supplied."""

    # -- Private fields (hidden from repr) ---------------------------------

    _cid: str | None = field(default=None, repr=False)
    """ATProto-compatible content identifier; derived from content when omitted."""

    _legacy_uuid: str | None = field(default=None, repr=False)
    """UUID carried over from older Redis entries, kept for migration purposes."""

    def __post_init__(self):
        """Fill in the CID from the entry content when none was supplied."""
        if self._cid is not None:
            return
        self._cid = self._generate_cid()

    def _generate_cid(self) -> str:
        """Compute the ATProto-compatible CID for this entry.

        Only ``schema_ref`` and ``data_urls`` take part in the CID; name and
        metadata do not affect it.
        """
        return generate_cid(
            {
                "schema_ref": self.schema_ref,
                "data_urls": self.data_urls,
            }
        )

    @property
    def cid(self) -> str:
        """Content identifier (ATProto-compatible CID).

        Raises:
            RuntimeError: If the CID is unset, which ``__post_init__`` is
                meant to rule out.
        """
        if self._cid is not None:
            return self._cid
        raise RuntimeError(
            "CID not initialized; this should not happen after __post_init__"
        )

    # -- Legacy accessors --------------------------------------------------

    @property
    def wds_url(self) -> str:
        """Legacy accessor: the first data URL, or ``""`` when there are none."""
        if self.data_urls:
            return self.data_urls[0]
        return ""

    @property
    def sample_kind(self) -> str:
        """Legacy accessor: alias for ``schema_ref``."""
        return self.schema_ref

    # -- Redis persistence -------------------------------------------------

    def write_to(self, redis: Redis):
        """Store this entry in Redis as a hash.

        The hash lives under the key ``'{REDIS_KEY_DATASET_ENTRY}:{cid}'``.
        ``data_urls`` and ``metadata`` are msgpack-encoded; ``metadata`` and
        ``legacy_uuid`` are written only when they are set.

        Args:
            redis: Redis connection to write to.
        """
        entry_cid = self.cid
        payload: dict[str, Any] = {
            "name": self.name,
            "schema_ref": self.schema_ref,
            # Lists cannot be stored directly in a hash field, so pack them.
            "data_urls": msgpack.packb(self.data_urls),
            "cid": entry_cid,
        }

        metadata = self.metadata
        if metadata is not None:
            payload["metadata"] = msgpack.packb(metadata)

        legacy_uuid = self._legacy_uuid
        if legacy_uuid is not None:
            payload["legacy_uuid"] = legacy_uuid

        redis.hset(  # type: ignore[arg-type]
            f"{REDIS_KEY_DATASET_ENTRY}:{entry_cid}",
            mapping=payload,
        )

    @classmethod
    def from_redis(cls, redis: Redis, cid: str) -> "LocalDatasetEntry":
        """Rebuild an entry from the Redis hash stored for ``cid``.

        Args:
            redis: Redis connection to read from.
            cid: Content identifier of the entry to load.

        Returns:
            LocalDatasetEntry loaded from Redis.

        Raises:
            KeyError: If entry not found.
        """
        stored = redis.hgetall(f"{REDIS_KEY_DATASET_ENTRY}:{cid}")
        if not stored:
            raise KeyError(f"{REDIS_KEY_DATASET_ENTRY} not found: {cid}")

        # Hash keys and values are treated as raw bytes: text fields are
        # decoded as UTF-8, msgpack fields are handed to the unpacker as-is.
        fields = cast(dict[bytes, bytes], stored)

        def _optional_text(field_name: bytes) -> str | None:
            # A missing or empty field is reported as None.
            text = fields.get(field_name, b"").decode("utf-8")
            return text or None

        entry_name = fields[b"name"].decode("utf-8")
        entry_schema_ref = fields[b"schema_ref"].decode("utf-8")
        stored_cid = _optional_text(b"cid")
        stored_legacy_uuid = _optional_text(b"legacy_uuid")

        urls = msgpack.unpackb(fields[b"data_urls"])
        meta = (
            msgpack.unpackb(fields[b"metadata"])
            if b"metadata" in fields
            else None
        )

        return cls(
            name=entry_name,
            schema_ref=entry_schema_ref,
            data_urls=urls,
            metadata=meta,
            _cid=stored_cid,
            _legacy_uuid=stored_legacy_uuid,
        )
154
+
155
+
156
+ # Backwards compatibility alias
157
+ BasicIndexEntry = LocalDatasetEntry