atdata 0.2.2b1-py3-none-any.whl → 0.3.0b1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. atdata/.gitignore +1 -0
  2. atdata/__init__.py +31 -1
  3. atdata/_cid.py +29 -35
  4. atdata/_exceptions.py +168 -0
  5. atdata/_helpers.py +33 -17
  6. atdata/_hf_api.py +109 -59
  7. atdata/_logging.py +70 -0
  8. atdata/_protocols.py +74 -132
  9. atdata/_schema_codec.py +38 -41
  10. atdata/_sources.py +57 -64
  11. atdata/_stub_manager.py +31 -26
  12. atdata/_type_utils.py +47 -7
  13. atdata/atmosphere/__init__.py +31 -24
  14. atdata/atmosphere/_types.py +11 -11
  15. atdata/atmosphere/client.py +11 -8
  16. atdata/atmosphere/lens.py +27 -30
  17. atdata/atmosphere/records.py +34 -39
  18. atdata/atmosphere/schema.py +35 -31
  19. atdata/atmosphere/store.py +16 -20
  20. atdata/cli/__init__.py +163 -168
  21. atdata/cli/diagnose.py +12 -8
  22. atdata/cli/inspect.py +69 -0
  23. atdata/cli/local.py +5 -2
  24. atdata/cli/preview.py +63 -0
  25. atdata/cli/schema.py +109 -0
  26. atdata/dataset.py +678 -533
  27. atdata/lens.py +85 -83
  28. atdata/local/__init__.py +71 -0
  29. atdata/local/_entry.py +157 -0
  30. atdata/local/_index.py +940 -0
  31. atdata/local/_repo_legacy.py +218 -0
  32. atdata/local/_s3.py +349 -0
  33. atdata/local/_schema.py +380 -0
  34. atdata/manifest/__init__.py +28 -0
  35. atdata/manifest/_aggregates.py +156 -0
  36. atdata/manifest/_builder.py +163 -0
  37. atdata/manifest/_fields.py +154 -0
  38. atdata/manifest/_manifest.py +146 -0
  39. atdata/manifest/_query.py +150 -0
  40. atdata/manifest/_writer.py +74 -0
  41. atdata/promote.py +20 -24
  42. atdata/providers/__init__.py +25 -0
  43. atdata/providers/_base.py +140 -0
  44. atdata/providers/_factory.py +69 -0
  45. atdata/providers/_postgres.py +214 -0
  46. atdata/providers/_redis.py +171 -0
  47. atdata/providers/_sqlite.py +191 -0
  48. atdata/repository.py +323 -0
  49. atdata/testing.py +337 -0
  50. {atdata-0.2.2b1.dist-info → atdata-0.3.0b1.dist-info}/METADATA +5 -1
  51. atdata-0.3.0b1.dist-info/RECORD +54 -0
  52. atdata/local.py +0 -1707
  53. atdata-0.2.2b1.dist-info/RECORD +0 -28
  54. {atdata-0.2.2b1.dist-info → atdata-0.3.0b1.dist-info}/WHEEL +0 -0
  55. {atdata-0.2.2b1.dist-info → atdata-0.3.0b1.dist-info}/entry_points.txt +0 -0
  56. {atdata-0.2.2b1.dist-info → atdata-0.3.0b1.dist-info}/licenses/LICENSE +0 -0
atdata/providers/_sqlite.py ADDED
@@ -0,0 +1,191 @@
+"""SQLite-backed index provider.
+
+Stores dataset entries and schema records in a local SQLite database file.
+Uses WAL journal mode for concurrent read access and ``INSERT OR REPLACE``
+for upsert semantics.
+
+No external dependencies — uses Python's built-in ``sqlite3`` module.
+"""
+
+from __future__ import annotations
+
+import sqlite3
+from pathlib import Path
+from typing import Iterator
+
+import msgpack
+
+from ._base import IndexProvider
+from .._type_utils import parse_semver
+
+_CREATE_TABLES = """\
+CREATE TABLE IF NOT EXISTS dataset_entries (
+    cid TEXT PRIMARY KEY,
+    name TEXT NOT NULL,
+    schema_ref TEXT NOT NULL,
+    data_urls BLOB NOT NULL,
+    metadata BLOB,
+    legacy_uuid TEXT,
+    created_at TEXT DEFAULT (datetime('now'))
+);
+
+CREATE INDEX IF NOT EXISTS idx_entries_name
+    ON dataset_entries(name);
+
+CREATE TABLE IF NOT EXISTS schemas (
+    name TEXT NOT NULL,
+    version TEXT NOT NULL,
+    schema_json TEXT NOT NULL,
+    created_at TEXT DEFAULT (datetime('now')),
+    PRIMARY KEY (name, version)
+);
+"""
+
+
+class SqliteProvider(IndexProvider):
+    """Index provider backed by a local SQLite database.
+
+    Args:
+        path: Path to the database file. The parent directory is created
+            automatically. Defaults to ``~/.atdata/index.db``.
+
+    Examples:
+        >>> provider = SqliteProvider(path="/tmp/test-index.db")
+        >>> provider.store_schema("MySample", "1.0.0", '{"name":"MySample"}')
+        >>> provider.get_schema_json("MySample", "1.0.0")
+        '{"name":"MySample"}'
+    """
+
+    def __init__(self, path: str | Path | None = None) -> None:
+        if path is None:
+            path = Path.home() / ".atdata" / "index.db"
+        self._path = Path(path).expanduser()
+        self._path.parent.mkdir(parents=True, exist_ok=True)
+
+        self._conn = sqlite3.connect(str(self._path))
+        self._conn.execute("PRAGMA journal_mode=WAL")
+        self._conn.executescript(_CREATE_TABLES)
+        self._conn.commit()
+
+    @property
+    def path(self) -> Path:
+        """Path to the SQLite database file."""
+        return self._path
+
+    # ------------------------------------------------------------------
+    # Dataset entry operations
+    # ------------------------------------------------------------------
+
+    def store_entry(self, entry: "LocalDatasetEntry") -> None:  # noqa: F821
+        self._conn.execute(
+            """INSERT OR REPLACE INTO dataset_entries
+            (cid, name, schema_ref, data_urls, metadata, legacy_uuid)
+            VALUES (?, ?, ?, ?, ?, ?)""",
+            (
+                entry.cid,
+                entry.name,
+                entry.schema_ref,
+                msgpack.packb(entry.data_urls),
+                msgpack.packb(entry.metadata) if entry.metadata is not None else None,
+                entry._legacy_uuid,
+            ),
+        )
+        self._conn.commit()
+
+    def get_entry_by_cid(self, cid: str) -> "LocalDatasetEntry":  # noqa: F821
+        row = self._conn.execute(
+            "SELECT cid, name, schema_ref, data_urls, metadata, legacy_uuid "
+            "FROM dataset_entries WHERE cid = ?",
+            (cid,),
+        ).fetchone()
+        if row is None:
+            raise KeyError(f"LocalDatasetEntry not found: {cid}")
+        return _row_to_entry(row)
+
+    def get_entry_by_name(self, name: str) -> "LocalDatasetEntry":  # noqa: F821
+        row = self._conn.execute(
+            "SELECT cid, name, schema_ref, data_urls, metadata, legacy_uuid "
+            "FROM dataset_entries WHERE name = ? LIMIT 1",
+            (name,),
+        ).fetchone()
+        if row is None:
+            raise KeyError(f"No entry with name: {name}")
+        return _row_to_entry(row)
+
+    def iter_entries(self) -> Iterator["LocalDatasetEntry"]:  # noqa: F821
+        cursor = self._conn.execute(
+            "SELECT cid, name, schema_ref, data_urls, metadata, legacy_uuid "
+            "FROM dataset_entries"
+        )
+        for row in cursor:
+            yield _row_to_entry(row)
+
+    # ------------------------------------------------------------------
+    # Schema operations
+    # ------------------------------------------------------------------
+
+    def store_schema(self, name: str, version: str, schema_json: str) -> None:
+        self._conn.execute(
+            """INSERT OR REPLACE INTO schemas (name, version, schema_json)
+            VALUES (?, ?, ?)""",
+            (name, version, schema_json),
+        )
+        self._conn.commit()
+
+    def get_schema_json(self, name: str, version: str) -> str | None:
+        row = self._conn.execute(
+            "SELECT schema_json FROM schemas WHERE name = ? AND version = ?",
+            (name, version),
+        ).fetchone()
+        if row is None:
+            return None
+        return row[0]
+
+    def iter_schemas(self) -> Iterator[tuple[str, str, str]]:
+        cursor = self._conn.execute("SELECT name, version, schema_json FROM schemas")
+        yield from cursor
+
+    def find_latest_version(self, name: str) -> str | None:
+        cursor = self._conn.execute(
+            "SELECT version FROM schemas WHERE name = ?",
+            (name,),
+        )
+        latest: tuple[int, int, int] | None = None
+        latest_str: str | None = None
+        for (version_str,) in cursor:
+            try:
+                v = parse_semver(version_str)
+                if latest is None or v > latest:
+                    latest = v
+                    latest_str = version_str
+            except ValueError:
+                continue
+        return latest_str
+
+    # ------------------------------------------------------------------
+    # Lifecycle
+    # ------------------------------------------------------------------
+
+    def close(self) -> None:
+        """Close the SQLite connection."""
+        self._conn.close()
+
+
+# ------------------------------------------------------------------
+# Helpers
+# ------------------------------------------------------------------
+
+
+def _row_to_entry(row: tuple) -> "LocalDatasetEntry":  # noqa: F821
+    """Convert a database row to a ``LocalDatasetEntry``."""
+    from ..local import LocalDatasetEntry
+
+    cid, name, schema_ref, data_urls_blob, metadata_blob, legacy_uuid = row
+    return LocalDatasetEntry(
+        name=name,
+        schema_ref=schema_ref,
+        data_urls=msgpack.unpackb(data_urls_blob),
+        metadata=msgpack.unpackb(metadata_blob) if metadata_blob is not None else None,
+        _cid=cid,
+        _legacy_uuid=legacy_uuid,
+    )
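
A minimal usage sketch of the provider above, exercising only the methods shown in this diff. The database path is a throwaway placeholder, and the final assertion assumes parse_semver accepts plain MAJOR.MINOR.PATCH strings:

    from atdata.providers._sqlite import SqliteProvider

    provider = SqliteProvider(path="/tmp/demo-index.db")

    # INSERT OR REPLACE gives upsert semantics: re-storing the same
    # (name, version) pair overwrites the earlier row instead of failing.
    provider.store_schema("MySample", "1.0.0", '{"name":"MySample"}')
    provider.store_schema("MySample", "1.2.0", '{"name":"MySample"}')
    assert provider.get_schema_json("MySample", "1.0.0") == '{"name":"MySample"}'

    # find_latest_version compares parsed semver tuples and skips any
    # version string that parse_semver rejects.
    assert provider.find_latest_version("MySample") == "1.2.0"

    provider.close()
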
atdata/repository.py ADDED
@@ -0,0 +1,323 @@
+"""Repository and atmosphere backend for the unified Index.
+
+A ``Repository`` pairs an ``IndexProvider`` (persistence backend) with an
+optional ``AbstractDataStore`` (shard storage), forming a named storage unit
+that can be mounted into an ``Index``.
+
+The ``_AtmosphereBackend`` is an internal adapter that wraps an
+``AtmosphereClient`` to present the same operational surface as a repository,
+but routes through the ATProto network instead of a local provider.
+
+Examples:
+    >>> from atdata.repository import Repository, create_repository
+    >>> repo = Repository(provider=SqliteProvider("/data/lab.db"))
+    >>> repo = create_repository("sqlite", path="/data/lab.db")
+    >>>
+    >>> # With a data store for shard storage
+    >>> repo = Repository(
+    ...     provider=SqliteProvider(),
+    ...     data_store=S3DataStore(credentials, bucket="lab-data"),
+    ... )
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Iterator, Optional, TYPE_CHECKING
+
+from ._protocols import AbstractDataStore
+
+if TYPE_CHECKING:
+    from .providers._base import IndexProvider
+
+
+@dataclass
+class Repository:
+    """A named storage backend pairing index persistence with optional data storage.
+
+    Repositories are mounted into an ``Index`` by name. The built-in ``"local"``
+    repository uses SQLite by default; additional repositories can be added for
+    multi-source dataset management.
+
+    Attributes:
+        provider: IndexProvider handling dataset/schema persistence.
+        data_store: Optional data store for reading/writing dataset shards.
+            If present, ``insert_dataset`` will write shards to this store.
+
+    Examples:
+        >>> from atdata.providers import create_provider
+        >>> from atdata.repository import Repository
+        >>>
+        >>> provider = create_provider("sqlite", path="/data/lab.db")
+        >>> repo = Repository(provider=provider)
+        >>>
+        >>> # With S3 shard storage
+        >>> repo = Repository(
+        ...     provider=provider,
+        ...     data_store=S3DataStore(credentials, bucket="lab-data"),
+        ... )
+    """
+
+    provider: IndexProvider
+    data_store: AbstractDataStore | None = None
+
+
+def create_repository(
+    provider: str = "sqlite",
+    *,
+    path: str | Path | None = None,
+    dsn: str | None = None,
+    redis: Any = None,
+    data_store: AbstractDataStore | None = None,
+    **kwargs: Any,
+) -> Repository:
+    """Create a Repository with a provider by name.
+
+    This is a convenience factory that combines ``create_provider`` with
+    ``Repository`` construction.
+
+    Args:
+        provider: Backend name: ``"sqlite"``, ``"redis"``, or ``"postgres"``.
+        path: Database file path (SQLite only).
+        dsn: Connection string (PostgreSQL only).
+        redis: Existing Redis connection (Redis only).
+        data_store: Optional data store for shard storage.
+        **kwargs: Extra arguments forwarded to the provider constructor.
+
+    Returns:
+        A ready-to-use Repository.
+
+    Raises:
+        ValueError: If provider name is not recognised.
+
+    Examples:
+        >>> repo = create_repository("sqlite", path="/data/lab.db")
+        >>> repo = create_repository(
+        ...     "sqlite",
+        ...     data_store=S3DataStore(creds, bucket="lab"),
+        ... )
+    """
+    from .providers._factory import create_provider as _create_provider
+
+    backend = _create_provider(provider, path=path, dsn=dsn, redis=redis, **kwargs)
+    return Repository(provider=backend, data_store=data_store)
+
+
+class _AtmosphereBackend:
+    """Internal adapter wrapping AtmosphereClient for Index routing.
+
+    This class extracts the operational logic from ``AtmosphereIndex`` into an
+    internal component that the unified ``Index`` uses for ATProto resolution.
+    It is not part of the public API.
+
+    The backend is lazily initialised -- the publishers/loaders are only
+    created when the client is authenticated or when operations require them.
+    """
+
+    def __init__(
+        self,
+        client: Any,  # AtmosphereClient, typed as Any to avoid hard import
+        *,
+        data_store: Optional[AbstractDataStore] = None,
+    ) -> None:
+        from .atmosphere.client import AtmosphereClient
+
+        if not isinstance(client, AtmosphereClient):
+            raise TypeError(f"Expected AtmosphereClient, got {type(client).__name__}")
+        self.client: AtmosphereClient = client
+        self._data_store = data_store
+        self._schema_publisher: Any = None
+        self._schema_loader: Any = None
+        self._dataset_publisher: Any = None
+        self._dataset_loader: Any = None
+
+    def _ensure_loaders(self) -> None:
+        """Lazily create publishers/loaders on first use."""
+        if self._schema_loader is not None:
+            return
+        from .atmosphere.schema import SchemaPublisher, SchemaLoader
+        from .atmosphere.records import DatasetPublisher, DatasetLoader
+
+        self._schema_publisher = SchemaPublisher(self.client)
+        self._schema_loader = SchemaLoader(self.client)
+        self._dataset_publisher = DatasetPublisher(self.client)
+        self._dataset_loader = DatasetLoader(self.client)
+
+    @property
+    def data_store(self) -> Optional[AbstractDataStore]:
+        """The data store for this atmosphere backend, or None."""
+        return self._data_store
+
+    # -- Dataset operations --
+
+    def get_dataset(self, ref: str) -> Any:
+        """Get a dataset entry by name or AT URI.
+
+        Args:
+            ref: Dataset name or AT URI.
+
+        Returns:
+            AtmosphereIndexEntry for the dataset.
+
+        Raises:
+            ValueError: If record is not a dataset.
+        """
+        self._ensure_loaders()
+        from .atmosphere import AtmosphereIndexEntry
+
+        record = self._dataset_loader.get(ref)
+        return AtmosphereIndexEntry(ref, record)
+
+    def list_datasets(self, repo: str | None = None) -> list[Any]:
+        """List all dataset entries.
+
+        Args:
+            repo: DID of repository. Defaults to authenticated user.
+
+        Returns:
+            List of AtmosphereIndexEntry for each dataset.
+        """
+        self._ensure_loaders()
+        from .atmosphere import AtmosphereIndexEntry
+
+        records = self._dataset_loader.list_all(repo=repo)
+        return [
+            AtmosphereIndexEntry(rec.get("uri", ""), rec.get("value", rec))
+            for rec in records
+        ]
+
+    def iter_datasets(self, repo: str | None = None) -> Iterator[Any]:
+        """Lazily iterate over all dataset entries.
+
+        Args:
+            repo: DID of repository. Defaults to authenticated user.
+
+        Yields:
+            AtmosphereIndexEntry for each dataset.
+        """
+        self._ensure_loaders()
+        from .atmosphere import AtmosphereIndexEntry
+
+        records = self._dataset_loader.list_all(repo=repo)
+        for rec in records:
+            uri = rec.get("uri", "")
+            yield AtmosphereIndexEntry(uri, rec.get("value", rec))
+
+    def insert_dataset(
+        self,
+        ds: Any,
+        *,
+        name: str,
+        schema_ref: str | None = None,
+        **kwargs: Any,
+    ) -> Any:
+        """Insert a dataset into ATProto.
+
+        Args:
+            ds: The Dataset to publish.
+            name: Human-readable name.
+            schema_ref: Optional schema AT URI. If None, auto-publishes schema.
+            **kwargs: Additional options (description, tags, license).
+
+        Returns:
+            AtmosphereIndexEntry for the inserted dataset.
+        """
+        self._ensure_loaders()
+        from .atmosphere import AtmosphereIndexEntry
+
+        uri = self._dataset_publisher.publish(
+            ds,
+            name=name,
+            schema_uri=schema_ref,
+            description=kwargs.get("description"),
+            tags=kwargs.get("tags"),
+            license=kwargs.get("license"),
+            auto_publish_schema=(schema_ref is None),
+        )
+        record = self._dataset_loader.get(uri)
+        return AtmosphereIndexEntry(str(uri), record)
+
+    # -- Schema operations --
+
+    def publish_schema(
+        self,
+        sample_type: type,
+        *,
+        version: str = "1.0.0",
+        **kwargs: Any,
+    ) -> str:
+        """Publish a schema to ATProto.
+
+        Args:
+            sample_type: A Packable type.
+            version: Semantic version string.
+            **kwargs: Additional options.
+
+        Returns:
+            AT URI of the schema record.
+        """
+        self._ensure_loaders()
+        uri = self._schema_publisher.publish(
+            sample_type,
+            version=version,
+            description=kwargs.get("description"),
+            metadata=kwargs.get("metadata"),
+        )
+        return str(uri)
+
+    def get_schema(self, ref: str) -> dict:
+        """Get a schema record by AT URI.
+
+        Args:
+            ref: AT URI of the schema record.
+
+        Returns:
+            Schema record dictionary.
+        """
+        self._ensure_loaders()
+        return self._schema_loader.get(ref)
+
+    def list_schemas(self, repo: str | None = None) -> list[dict]:
+        """List all schema records.
+
+        Args:
+            repo: DID of repository. Defaults to authenticated user.
+
+        Returns:
+            List of schema records as dictionaries.
+        """
+        self._ensure_loaders()
+        records = self._schema_loader.list_all(repo=repo)
+        return [rec.get("value", rec) for rec in records]
+
+    def iter_schemas(self) -> Iterator[dict]:
+        """Lazily iterate over all schema records.
+
+        Yields:
+            Schema records as dictionaries.
+        """
+        self._ensure_loaders()
+        records = self._schema_loader.list_all()
+        for rec in records:
+            yield rec.get("value", rec)
+
+    def decode_schema(self, ref: str) -> type:
+        """Reconstruct a Python type from a schema record.
+
+        Args:
+            ref: AT URI of the schema record.
+
+        Returns:
+            Dynamically generated Packable type.
+        """
+        from ._schema_codec import schema_to_type
+
+        schema = self.get_schema(ref)
+        return schema_to_type(schema)
+
+
+__all__ = [
+    "Repository",
+    "create_repository",
+]
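
Putting the pieces together, a rough sketch of the factory path, assuming the "sqlite" provider name resolves to the SqliteProvider added earlier in this diff (as the create_repository docstring states). The database path is again a placeholder:

    from atdata.repository import create_repository

    # create_repository combines create_provider with Repository construction.
    repo = create_repository("sqlite", path="/tmp/demo-index.db")

    # Repository is a plain dataclass pairing the provider with an
    # optional data store; none was attached here.
    repo.provider.store_schema("MySample", "1.0.0", '{"name":"MySample"}')
    assert repo.data_store is None

    repo.provider.close()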