atdata 0.2.3b1-py3-none-any.whl → 0.3.1b1-py3-none-any.whl
This diff shows the published contents of two package versions as they appear in their public registry, and is provided for informational purposes only.
- atdata/.gitignore +1 -0
- atdata/__init__.py +39 -0
- atdata/_cid.py +0 -21
- atdata/_exceptions.py +168 -0
- atdata/_helpers.py +41 -15
- atdata/_hf_api.py +95 -11
- atdata/_logging.py +70 -0
- atdata/_protocols.py +77 -238
- atdata/_schema_codec.py +7 -6
- atdata/_stub_manager.py +5 -25
- atdata/_type_utils.py +28 -2
- atdata/atmosphere/__init__.py +31 -20
- atdata/atmosphere/_types.py +4 -4
- atdata/atmosphere/client.py +64 -12
- atdata/atmosphere/lens.py +11 -12
- atdata/atmosphere/records.py +12 -12
- atdata/atmosphere/schema.py +16 -18
- atdata/atmosphere/store.py +6 -7
- atdata/cli/__init__.py +161 -175
- atdata/cli/diagnose.py +2 -2
- atdata/cli/{local.py → infra.py} +11 -11
- atdata/cli/inspect.py +69 -0
- atdata/cli/preview.py +63 -0
- atdata/cli/schema.py +109 -0
- atdata/dataset.py +583 -328
- atdata/index/__init__.py +54 -0
- atdata/index/_entry.py +157 -0
- atdata/index/_index.py +1198 -0
- atdata/index/_schema.py +380 -0
- atdata/lens.py +9 -2
- atdata/lexicons/__init__.py +121 -0
- atdata/lexicons/ac.foundation.dataset.arrayFormat.json +16 -0
- atdata/lexicons/ac.foundation.dataset.getLatestSchema.json +78 -0
- atdata/lexicons/ac.foundation.dataset.lens.json +99 -0
- atdata/lexicons/ac.foundation.dataset.record.json +96 -0
- atdata/lexicons/ac.foundation.dataset.schema.json +107 -0
- atdata/lexicons/ac.foundation.dataset.schemaType.json +16 -0
- atdata/lexicons/ac.foundation.dataset.storageBlobs.json +24 -0
- atdata/lexicons/ac.foundation.dataset.storageExternal.json +25 -0
- atdata/lexicons/ndarray_shim.json +16 -0
- atdata/local/__init__.py +70 -0
- atdata/local/_repo_legacy.py +218 -0
- atdata/manifest/__init__.py +28 -0
- atdata/manifest/_aggregates.py +156 -0
- atdata/manifest/_builder.py +163 -0
- atdata/manifest/_fields.py +154 -0
- atdata/manifest/_manifest.py +146 -0
- atdata/manifest/_query.py +150 -0
- atdata/manifest/_writer.py +74 -0
- atdata/promote.py +18 -14
- atdata/providers/__init__.py +25 -0
- atdata/providers/_base.py +140 -0
- atdata/providers/_factory.py +69 -0
- atdata/providers/_postgres.py +214 -0
- atdata/providers/_redis.py +171 -0
- atdata/providers/_sqlite.py +191 -0
- atdata/repository.py +323 -0
- atdata/stores/__init__.py +23 -0
- atdata/stores/_disk.py +123 -0
- atdata/stores/_s3.py +349 -0
- atdata/testing.py +341 -0
- {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/METADATA +5 -2
- atdata-0.3.1b1.dist-info/RECORD +67 -0
- atdata/local.py +0 -1720
- atdata-0.2.3b1.dist-info/RECORD +0 -28
- {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/WHEEL +0 -0
- {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/entry_points.txt +0 -0
- {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/licenses/LICENSE +0 -0
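The hunks below add the three concrete backends in the new atdata/providers/ package, all built on the ``IndexProvider`` base class from ``_base.py``. That file is listed above (+140 lines) but not expanded in this diff, so the following sketch of its surface is an inference: the method names are taken from what the three implementations below override, while the abstract-base structure itself is a guess, not the file's verbatim contents.

# Inferred sketch of the IndexProvider interface -- names confirmed by the
# three implementations below; the ABC scaffolding is an assumption.
from __future__ import annotations

from abc import ABC, abstractmethod
from typing import Iterator


class IndexProvider(ABC):
    """Pluggable storage backend for dataset entries and schema records."""

    @abstractmethod
    def store_entry(self, entry: "LocalDatasetEntry") -> None: ...

    @abstractmethod
    def get_entry_by_cid(self, cid: str) -> "LocalDatasetEntry": ...

    @abstractmethod
    def get_entry_by_name(self, name: str) -> "LocalDatasetEntry": ...

    @abstractmethod
    def iter_entries(self) -> Iterator["LocalDatasetEntry"]: ...

    @abstractmethod
    def store_schema(self, name: str, version: str, schema_json: str) -> None: ...

    @abstractmethod
    def get_schema_json(self, name: str, version: str) -> str | None: ...

    @abstractmethod
    def iter_schemas(self) -> Iterator[tuple[str, str, str]]: ...

    @abstractmethod
    def find_latest_version(self, name: str) -> str | None: ...

    @abstractmethod
    def close(self) -> None: ...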
atdata/providers/_postgres.py
@@ -0,0 +1,214 @@
"""PostgreSQL-backed index provider.

Stores dataset entries and schema records in PostgreSQL tables.
Requires the ``psycopg`` (v3) package, which is an optional dependency::

    pip install "atdata[postgres]"

The provider lazily imports ``psycopg`` so that ``import atdata`` never
fails when the package is absent.
"""

from __future__ import annotations

from typing import Iterator

import msgpack

from ._base import IndexProvider
from .._type_utils import parse_semver

_CREATE_TABLES = """\
CREATE TABLE IF NOT EXISTS dataset_entries (
    cid TEXT PRIMARY KEY,
    name TEXT NOT NULL,
    schema_ref TEXT NOT NULL,
    data_urls BYTEA NOT NULL,
    metadata BYTEA,
    legacy_uuid TEXT,
    created_at TIMESTAMPTZ DEFAULT now()
);

CREATE INDEX IF NOT EXISTS idx_entries_name
    ON dataset_entries(name);

CREATE TABLE IF NOT EXISTS schemas (
    name TEXT NOT NULL,
    version TEXT NOT NULL,
    schema_json TEXT NOT NULL,
    created_at TIMESTAMPTZ DEFAULT now(),
    PRIMARY KEY (name, version)
);
"""


class PostgresProvider(IndexProvider):
    """Index provider backed by PostgreSQL.

    Args:
        dsn: PostgreSQL connection string, e.g.
            ``"postgresql://user:pass@host:5432/dbname"``.

    Raises:
        ImportError: If ``psycopg`` is not installed.

    Examples:
        >>> provider = PostgresProvider(dsn="postgresql://localhost/atdata")
        >>> provider.store_schema("MySample", "1.0.0", '{"name":"MySample"}')
    """

    def __init__(self, dsn: str) -> None:
        try:
            import psycopg
        except ImportError as exc:
            raise ImportError(
                "The postgres provider requires the 'psycopg' package. "
                "Install it with: pip install 'atdata[postgres]'"
            ) from exc

        self._conn = psycopg.connect(dsn, autocommit=False)
        with self._conn.cursor() as cur:
            cur.execute(_CREATE_TABLES)
        self._conn.commit()

    # ------------------------------------------------------------------
    # Dataset entry operations
    # ------------------------------------------------------------------

    def store_entry(self, entry: "LocalDatasetEntry") -> None:  # noqa: F821
        with self._conn.cursor() as cur:
            cur.execute(
                """INSERT INTO dataset_entries
                    (cid, name, schema_ref, data_urls, metadata, legacy_uuid)
                VALUES (%s, %s, %s, %s, %s, %s)
                ON CONFLICT (cid) DO UPDATE SET
                    name = EXCLUDED.name,
                    schema_ref = EXCLUDED.schema_ref,
                    data_urls = EXCLUDED.data_urls,
                    metadata = EXCLUDED.metadata,
                    legacy_uuid = EXCLUDED.legacy_uuid""",
                (
                    entry.cid,
                    entry.name,
                    entry.schema_ref,
                    msgpack.packb(entry.data_urls),
                    msgpack.packb(entry.metadata)
                    if entry.metadata is not None
                    else None,
                    entry._legacy_uuid,
                ),
            )
        self._conn.commit()

    def get_entry_by_cid(self, cid: str) -> "LocalDatasetEntry":  # noqa: F821
        with self._conn.cursor() as cur:
            cur.execute(
                "SELECT cid, name, schema_ref, data_urls, metadata, legacy_uuid "
                "FROM dataset_entries WHERE cid = %s",
                (cid,),
            )
            row = cur.fetchone()
        if row is None:
            raise KeyError(f"LocalDatasetEntry not found: {cid}")
        return _row_to_entry(row)

    def get_entry_by_name(self, name: str) -> "LocalDatasetEntry":  # noqa: F821
        with self._conn.cursor() as cur:
            cur.execute(
                "SELECT cid, name, schema_ref, data_urls, metadata, legacy_uuid "
                "FROM dataset_entries WHERE name = %s LIMIT 1",
                (name,),
            )
            row = cur.fetchone()
        if row is None:
            raise KeyError(f"No entry with name: {name}")
        return _row_to_entry(row)

    def iter_entries(self) -> Iterator["LocalDatasetEntry"]:  # noqa: F821
        with self._conn.cursor() as cur:
            cur.execute(
                "SELECT cid, name, schema_ref, data_urls, metadata, legacy_uuid "
                "FROM dataset_entries"
            )
            for row in cur:
                yield _row_to_entry(row)

    # ------------------------------------------------------------------
    # Schema operations
    # ------------------------------------------------------------------

    def store_schema(self, name: str, version: str, schema_json: str) -> None:
        with self._conn.cursor() as cur:
            cur.execute(
                """INSERT INTO schemas (name, version, schema_json)
                VALUES (%s, %s, %s)
                ON CONFLICT (name, version) DO UPDATE SET
                    schema_json = EXCLUDED.schema_json""",
                (name, version, schema_json),
            )
        self._conn.commit()

    def get_schema_json(self, name: str, version: str) -> str | None:
        with self._conn.cursor() as cur:
            cur.execute(
                "SELECT schema_json FROM schemas WHERE name = %s AND version = %s",
                (name, version),
            )
            row = cur.fetchone()
        if row is None:
            return None
        return row[0]

    def iter_schemas(self) -> Iterator[tuple[str, str, str]]:
        with self._conn.cursor() as cur:
            cur.execute("SELECT name, version, schema_json FROM schemas")
            for row in cur:
                yield row[0], row[1], row[2]

    def find_latest_version(self, name: str) -> str | None:
        with self._conn.cursor() as cur:
            cur.execute(
                "SELECT version FROM schemas WHERE name = %s",
                (name,),
            )
            latest: tuple[int, int, int] | None = None
            latest_str: str | None = None
            for (version_str,) in cur:
                try:
                    v = parse_semver(version_str)
                    if latest is None or v > latest:
                        latest = v
                        latest_str = version_str
                except ValueError:
                    continue
        return latest_str

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------

    def close(self) -> None:
        """Close the PostgreSQL connection."""
        self._conn.close()


# ------------------------------------------------------------------
# Helpers
# ------------------------------------------------------------------


def _row_to_entry(row: tuple) -> "LocalDatasetEntry":  # noqa: F821
    """Convert a database row to a ``LocalDatasetEntry``."""
    from ..local import LocalDatasetEntry

    cid, name, schema_ref, data_urls_blob, metadata_blob, legacy_uuid = row
    return LocalDatasetEntry(
        name=name,
        schema_ref=schema_ref,
        data_urls=msgpack.unpackb(bytes(data_urls_blob)),
        metadata=msgpack.unpackb(bytes(metadata_blob))
        if metadata_blob is not None
        else None,
        _cid=cid,
        _legacy_uuid=legacy_uuid,
    )
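A minimal usage sketch for ``PostgresProvider``, assuming a reachable server at a hypothetical DSN and ``psycopg`` installed via the ``atdata[postgres]`` extra. The private import path mirrors the file above; ``atdata/providers/__init__.py`` (+25 lines in the file list) presumably re-exports a public name.

# Hypothetical DSN; requires a running PostgreSQL server and psycopg v3.
from atdata.providers._postgres import PostgresProvider

provider = PostgresProvider(dsn="postgresql://user:pass@localhost:5432/atdata")
try:
    # ON CONFLICT ... DO UPDATE makes store_schema an idempotent upsert.
    provider.store_schema("MySample", "1.0.0", '{"name":"MySample"}')
    provider.store_schema("MySample", "1.1.0", '{"name":"MySample"}')
    assert provider.find_latest_version("MySample") == "1.1.0"
    assert provider.get_schema_json("MySample", "2.0.0") is None
finally:
    provider.close()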
atdata/providers/_redis.py
@@ -0,0 +1,171 @@
"""Redis-backed index provider.

This module extracts the Redis persistence logic that was previously
inlined in ``atdata.local.Index`` and ``LocalDatasetEntry`` into a
standalone ``IndexProvider`` implementation.
"""

from __future__ import annotations

from typing import Iterator

import msgpack
from redis import Redis

from ._base import IndexProvider
from .._type_utils import parse_semver

# Redis key prefixes — kept in sync with local.py constants
_KEY_DATASET_ENTRY = "LocalDatasetEntry"
_KEY_SCHEMA = "LocalSchema"


class RedisProvider(IndexProvider):
    """Index provider backed by a Redis connection.

    This reproduces the exact storage layout used by the original
    ``Index`` class so that existing Redis data is fully compatible.

    Args:
        redis: An active ``redis.Redis`` connection.
    """

    def __init__(self, redis: Redis) -> None:
        self._redis = redis

    @property
    def redis(self) -> Redis:
        """The underlying Redis connection (for advanced use / migration)."""
        return self._redis

    # ------------------------------------------------------------------
    # Dataset entry operations
    # ------------------------------------------------------------------

    def store_entry(self, entry: "LocalDatasetEntry") -> None:  # noqa: F821
        save_key = f"{_KEY_DATASET_ENTRY}:{entry.cid}"
        data: dict[str, str | bytes] = {
            "name": entry.name,
            "schema_ref": entry.schema_ref,
            "data_urls": msgpack.packb(entry.data_urls),
            "cid": entry.cid,
        }
        if entry.metadata is not None:
            data["metadata"] = msgpack.packb(entry.metadata)
        if entry._legacy_uuid is not None:
            data["legacy_uuid"] = entry._legacy_uuid

        self._redis.hset(save_key, mapping=data)  # type: ignore[arg-type]

    def get_entry_by_cid(self, cid: str) -> "LocalDatasetEntry":  # noqa: F821
        save_key = f"{_KEY_DATASET_ENTRY}:{cid}"
        raw_data = self._redis.hgetall(save_key)
        if not raw_data:
            raise KeyError(f"{_KEY_DATASET_ENTRY} not found: {cid}")

        return _entry_from_redis_hash(raw_data)

    def get_entry_by_name(self, name: str) -> "LocalDatasetEntry":  # noqa: F821
        for entry in self.iter_entries():
            if entry.name == name:
                return entry
        raise KeyError(f"No entry with name: {name}")

    def iter_entries(self) -> Iterator["LocalDatasetEntry"]:  # noqa: F821
        prefix = f"{_KEY_DATASET_ENTRY}:"
        for key in self._redis.scan_iter(match=f"{prefix}*"):
            key_str = key.decode("utf-8") if isinstance(key, bytes) else key
            cid = key_str[len(prefix) :]
            yield self.get_entry_by_cid(cid)

    # ------------------------------------------------------------------
    # Schema operations
    # ------------------------------------------------------------------

    def store_schema(self, name: str, version: str, schema_json: str) -> None:
        redis_key = f"{_KEY_SCHEMA}:{name}@{version}"
        self._redis.set(redis_key, schema_json)

    def get_schema_json(self, name: str, version: str) -> str | None:
        redis_key = f"{_KEY_SCHEMA}:{name}@{version}"
        value = self._redis.get(redis_key)
        if value is None:
            return None
        if isinstance(value, bytes):
            return value.decode("utf-8")
        return value  # type: ignore[return-value]

    def iter_schemas(self) -> Iterator[tuple[str, str, str]]:
        prefix = f"{_KEY_SCHEMA}:"
        for key in self._redis.scan_iter(match=f"{prefix}*"):
            key_str = key.decode("utf-8") if isinstance(key, bytes) else key
            schema_id = key_str[len(prefix) :]

            if "@" not in schema_id:
                continue

            raw_name, version = schema_id.rsplit("@", 1)
            # Handle legacy format: module.Class -> Class
            if "." in raw_name:
                raw_name = raw_name.rsplit(".", 1)[1]

            value = self._redis.get(key)
            if value is None:
                continue
            schema_json = value.decode("utf-8") if isinstance(value, bytes) else value
            yield raw_name, version, schema_json  # type: ignore[misc]

    def find_latest_version(self, name: str) -> str | None:
        latest: tuple[int, int, int] | None = None
        latest_str: str | None = None

        for schema_name, version, _ in self.iter_schemas():
            if schema_name != name:
                continue
            try:
                v = parse_semver(version)
                if latest is None or v > latest:
                    latest = v
                    latest_str = version
            except ValueError:
                continue

        return latest_str

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------

    def close(self) -> None:
        """Close the Redis connection."""
        self._redis.close()


# ------------------------------------------------------------------
# Helpers
# ------------------------------------------------------------------


def _entry_from_redis_hash(raw_data: dict) -> "LocalDatasetEntry":  # noqa: F821
    """Reconstruct a ``LocalDatasetEntry`` from a Redis hash mapping."""
    from ..local import LocalDatasetEntry
    from typing import cast

    raw = cast(dict[bytes, bytes], raw_data)
    name = raw[b"name"].decode("utf-8")
    schema_ref = raw[b"schema_ref"].decode("utf-8")
    cid_value = raw.get(b"cid", b"").decode("utf-8") or None
    legacy_uuid = raw.get(b"legacy_uuid", b"").decode("utf-8") or None
    data_urls = msgpack.unpackb(raw[b"data_urls"])
    metadata = None
    if b"metadata" in raw:
        metadata = msgpack.unpackb(raw[b"metadata"])

    return LocalDatasetEntry(
        name=name,
        schema_ref=schema_ref,
        data_urls=data_urls,
        metadata=metadata,
        _cid=cid_value,
        _legacy_uuid=legacy_uuid,
    )
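Because ``RedisProvider`` keeps the legacy key layout (``LocalDatasetEntry:<cid>`` hashes, ``LocalSchema:<name>@<version>`` strings), data written by the pre-0.3 ``atdata.local.Index`` stays readable. A small round-trip sketch, using ``fakeredis`` as a stand-in server purely for illustration; any ``redis.Redis`` connection behaves the same.

from fakeredis import FakeRedis  # assumed test dependency, not part of atdata
from atdata.providers._redis import RedisProvider

provider = RedisProvider(redis=FakeRedis())
provider.store_schema("MySample", "1.0.0", '{"name":"MySample"}')

# The schema lands under the same key the legacy Index class used.
assert provider.redis.keys() == [b"LocalSchema:MySample@1.0.0"]
assert provider.get_schema_json("MySample", "1.0.0") == '{"name":"MySample"}'
assert provider.find_latest_version("MySample") == "1.0.0"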
atdata/providers/_sqlite.py
@@ -0,0 +1,191 @@
"""SQLite-backed index provider.

Stores dataset entries and schema records in a local SQLite database file.
Uses WAL journal mode for concurrent read access and ``INSERT OR REPLACE``
for upsert semantics.

No external dependencies — uses Python's built-in ``sqlite3`` module.
"""

from __future__ import annotations

import sqlite3
from pathlib import Path
from typing import Iterator

import msgpack

from ._base import IndexProvider
from .._type_utils import parse_semver

_CREATE_TABLES = """\
CREATE TABLE IF NOT EXISTS dataset_entries (
    cid TEXT PRIMARY KEY,
    name TEXT NOT NULL,
    schema_ref TEXT NOT NULL,
    data_urls BLOB NOT NULL,
    metadata BLOB,
    legacy_uuid TEXT,
    created_at TEXT DEFAULT (datetime('now'))
);

CREATE INDEX IF NOT EXISTS idx_entries_name
    ON dataset_entries(name);

CREATE TABLE IF NOT EXISTS schemas (
    name TEXT NOT NULL,
    version TEXT NOT NULL,
    schema_json TEXT NOT NULL,
    created_at TEXT DEFAULT (datetime('now')),
    PRIMARY KEY (name, version)
);
"""


class SqliteProvider(IndexProvider):
    """Index provider backed by a local SQLite database.

    Args:
        path: Path to the database file. The parent directory is created
            automatically. Defaults to ``~/.atdata/index.db``.

    Examples:
        >>> provider = SqliteProvider(path="/tmp/test-index.db")
        >>> provider.store_schema("MySample", "1.0.0", '{"name":"MySample"}')
        >>> provider.get_schema_json("MySample", "1.0.0")
        '{"name":"MySample"}'
    """

    def __init__(self, path: str | Path | None = None) -> None:
        if path is None:
            path = Path.home() / ".atdata" / "index.db"
        self._path = Path(path).expanduser()
        self._path.parent.mkdir(parents=True, exist_ok=True)

        self._conn = sqlite3.connect(str(self._path))
        self._conn.execute("PRAGMA journal_mode=WAL")
        self._conn.executescript(_CREATE_TABLES)
        self._conn.commit()

    @property
    def path(self) -> Path:
        """Path to the SQLite database file."""
        return self._path

    # ------------------------------------------------------------------
    # Dataset entry operations
    # ------------------------------------------------------------------

    def store_entry(self, entry: "LocalDatasetEntry") -> None:  # noqa: F821
        self._conn.execute(
            """INSERT OR REPLACE INTO dataset_entries
                (cid, name, schema_ref, data_urls, metadata, legacy_uuid)
            VALUES (?, ?, ?, ?, ?, ?)""",
            (
                entry.cid,
                entry.name,
                entry.schema_ref,
                msgpack.packb(entry.data_urls),
                msgpack.packb(entry.metadata) if entry.metadata is not None else None,
                entry._legacy_uuid,
            ),
        )
        self._conn.commit()

    def get_entry_by_cid(self, cid: str) -> "LocalDatasetEntry":  # noqa: F821
        row = self._conn.execute(
            "SELECT cid, name, schema_ref, data_urls, metadata, legacy_uuid "
            "FROM dataset_entries WHERE cid = ?",
            (cid,),
        ).fetchone()
        if row is None:
            raise KeyError(f"LocalDatasetEntry not found: {cid}")
        return _row_to_entry(row)

    def get_entry_by_name(self, name: str) -> "LocalDatasetEntry":  # noqa: F821
        row = self._conn.execute(
            "SELECT cid, name, schema_ref, data_urls, metadata, legacy_uuid "
            "FROM dataset_entries WHERE name = ? LIMIT 1",
            (name,),
        ).fetchone()
        if row is None:
            raise KeyError(f"No entry with name: {name}")
        return _row_to_entry(row)

    def iter_entries(self) -> Iterator["LocalDatasetEntry"]:  # noqa: F821
        cursor = self._conn.execute(
            "SELECT cid, name, schema_ref, data_urls, metadata, legacy_uuid "
            "FROM dataset_entries"
        )
        for row in cursor:
            yield _row_to_entry(row)

    # ------------------------------------------------------------------
    # Schema operations
    # ------------------------------------------------------------------

    def store_schema(self, name: str, version: str, schema_json: str) -> None:
        self._conn.execute(
            """INSERT OR REPLACE INTO schemas (name, version, schema_json)
            VALUES (?, ?, ?)""",
            (name, version, schema_json),
        )
        self._conn.commit()

    def get_schema_json(self, name: str, version: str) -> str | None:
        row = self._conn.execute(
            "SELECT schema_json FROM schemas WHERE name = ? AND version = ?",
            (name, version),
        ).fetchone()
        if row is None:
            return None
        return row[0]

    def iter_schemas(self) -> Iterator[tuple[str, str, str]]:
        cursor = self._conn.execute("SELECT name, version, schema_json FROM schemas")
        yield from cursor

    def find_latest_version(self, name: str) -> str | None:
        cursor = self._conn.execute(
            "SELECT version FROM schemas WHERE name = ?",
            (name,),
        )
        latest: tuple[int, int, int] | None = None
        latest_str: str | None = None
        for (version_str,) in cursor:
            try:
                v = parse_semver(version_str)
                if latest is None or v > latest:
                    latest = v
                    latest_str = version_str
            except ValueError:
                continue
        return latest_str

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------

    def close(self) -> None:
        """Close the SQLite connection."""
        self._conn.close()


# ------------------------------------------------------------------
# Helpers
# ------------------------------------------------------------------


def _row_to_entry(row: tuple) -> "LocalDatasetEntry":  # noqa: F821
    """Convert a database row to a ``LocalDatasetEntry``."""
    from ..local import LocalDatasetEntry

    cid, name, schema_ref, data_urls_blob, metadata_blob, legacy_uuid = row
    return LocalDatasetEntry(
        name=name,
        schema_ref=schema_ref,
        data_urls=msgpack.unpackb(data_urls_blob),
        metadata=msgpack.unpackb(metadata_blob) if metadata_blob is not None else None,
        _cid=cid,
        _legacy_uuid=legacy_uuid,
    )
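And a self-contained round trip for ``SqliteProvider``, which needs no server since it rides on the standard-library ``sqlite3`` module; the temporary path here is illustrative only.

import tempfile
from pathlib import Path

from atdata.providers._sqlite import SqliteProvider

with tempfile.TemporaryDirectory() as tmp:
    provider = SqliteProvider(path=Path(tmp) / "index.db")
    # INSERT OR REPLACE gives the same upsert behaviour as the Postgres
    # ON CONFLICT clause, so re-storing a version overwrites its JSON.
    provider.store_schema("MySample", "1.0.0", '{"name":"MySample"}')
    provider.store_schema("MySample", "1.0.0", '{"name":"MySample","rev":2}')
    assert provider.get_schema_json("MySample", "1.0.0") == '{"name":"MySample","rev":2}'
    assert provider.find_latest_version("MySample") == "1.0.0"
    provider.close()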