az-table-catalog 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- az_table_catalog-1.0.0/LICENSE +21 -0
- az_table_catalog-1.0.0/PKG-INFO +31 -0
- az_table_catalog-1.0.0/README.md +17 -0
- az_table_catalog-1.0.0/az_table_catalog.egg-info/PKG-INFO +31 -0
- az_table_catalog-1.0.0/az_table_catalog.egg-info/SOURCES.txt +10 -0
- az_table_catalog-1.0.0/az_table_catalog.egg-info/dependency_links.txt +1 -0
- az_table_catalog-1.0.0/az_table_catalog.egg-info/requires.txt +1 -0
- az_table_catalog-1.0.0/az_table_catalog.egg-info/top_level.txt +2 -0
- az_table_catalog-1.0.0/pyproject.toml +24 -0
- az_table_catalog-1.0.0/setup.cfg +4 -0
- az_table_catalog-1.0.0/src/__init__.py +26 -0
- az_table_catalog-1.0.0/src/az_table_catalog.py +222 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 C. Shaun Wagner
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: az-table-catalog
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: A resilient, event-sourced Azure Table Storage catalog index
|
|
5
|
+
Author-email: "C. Shaun Wagner" <cs@kainaw.com>
|
|
6
|
+
Classifier: Programming Language :: Python :: 3
|
|
7
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
8
|
+
Classifier: Operating System :: OS Independent
|
|
9
|
+
Requires-Python: >=3.10
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Requires-Dist: azure-data-tables>=12.0.0
|
|
13
|
+
Dynamic: license-file
|
|
14
|
+
|
|
15
|
+
# az-table-catalog
|
|
16
|
+
|
|
17
|
+
A resilient, event-sourced indexing library for Azure Table Storage.
|
|
18
|
+
|
|
19
|
+
`az-table-catalog` allows you to create high-performance, multi-indexed lookup tables. It uses a Write-Ahead Log (WAL) and a checkpoint-driven recovery model to ensure that your data remains consistent even if a process crashes mid-transaction.
|
|
20
|
+
|
|
21
|
+
## Features
|
|
22
|
+
* **Multi-Index Fan-out**: Search by any defined index key with O(1) performance.
|
|
23
|
+
* **Event Sourcing**: A permanent WAL serves as the "Source of Truth."
|
|
24
|
+
* **Deterministic RowKeys**: Prevents collisions using content-based fingerprints.
|
|
25
|
+
* **Auto-Recovery**: Automatically replays missing transactions on startup.
|
|
26
|
+
* **Schema Locking**: Prevents data corruption by locking configuration at runtime.
|
|
27
|
+
|
|
28
|
+
## Installation
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
pip install az-table-catalog
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# az-table-catalog
|
|
2
|
+
|
|
3
|
+
A resilient, event-sourced indexing library for Azure Table Storage.
|
|
4
|
+
|
|
5
|
+
`az-table-catalog` allows you to create high-performance, multi-indexed lookup tables. It uses a Write-Ahead Log (WAL) and a checkpoint-driven recovery model to ensure that your data remains consistent even if a process crashes mid-transaction.
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
* **Multi-Index Fan-out**: Search by any defined index key with O(1) performance.
|
|
9
|
+
* **Event Sourcing**: A permanent WAL serves as the "Source of Truth."
|
|
10
|
+
* **Deterministic RowKeys**: Prevents collisions using content-based fingerprints.
|
|
11
|
+
* **Auto-Recovery**: Automatically replays missing transactions on startup.
|
|
12
|
+
* **Schema Locking**: Prevents data corruption by locking configuration at runtime.
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
pip install az-table-catalog
```
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: az-table-catalog
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: A resilient, event-sourced Azure Table Storage catalog index
|
|
5
|
+
Author-email: "C. Shaun Wagner" <cs@kainaw.com>
|
|
6
|
+
Classifier: Programming Language :: Python :: 3
|
|
7
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
8
|
+
Classifier: Operating System :: OS Independent
|
|
9
|
+
Requires-Python: >=3.10
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Requires-Dist: azure-data-tables>=12.0.0
|
|
13
|
+
Dynamic: license-file
|
|
14
|
+
|
|
15
|
+
# az-table-catalog
|
|
16
|
+
|
|
17
|
+
A resilient, event-sourced indexing library for Azure Table Storage.
|
|
18
|
+
|
|
19
|
+
`az-table-catalog` allows you to create high-performance, multi-indexed lookup tables. It uses a Write-Ahead Log (WAL) and a checkpoint-driven recovery model to ensure that your data remains consistent even if a process crashes mid-transaction.
|
|
20
|
+
|
|
21
|
+
## Features
|
|
22
|
+
* **Multi-Index Fan-out**: Search by any defined index key with O(1) performance.
|
|
23
|
+
* **Event Sourcing**: A permanent WAL serves as the "Source of Truth."
|
|
24
|
+
* **Deterministic RowKeys**: Prevents collisions using content-based fingerprints.
|
|
25
|
+
* **Auto-Recovery**: Automatically replays missing transactions on startup.
|
|
26
|
+
* **Schema Locking**: Prevents data corruption by locking configuration at runtime.
|
|
27
|
+
|
|
28
|
+
## Installation
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
pip install az-table-catalog
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
az_table_catalog.egg-info/PKG-INFO
|
|
5
|
+
az_table_catalog.egg-info/SOURCES.txt
|
|
6
|
+
az_table_catalog.egg-info/dependency_links.txt
|
|
7
|
+
az_table_catalog.egg-info/requires.txt
|
|
8
|
+
az_table_catalog.egg-info/top_level.txt
|
|
9
|
+
src/__init__.py
|
|
10
|
+
src/az_table_catalog.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
azure-data-tables>=12.0.0
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "az-table-catalog"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
authors = [
|
|
9
|
+
{ name="C. Shaun Wagner", email="cs@kainaw.com" },
|
|
10
|
+
]
|
|
11
|
+
description = "A resilient, event-sourced Azure Table Storage catalog index"
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
requires-python = ">=3.10"
|
|
14
|
+
dependencies = [
|
|
15
|
+
"azure-data-tables>=12.0.0",
|
|
16
|
+
]
|
|
17
|
+
classifiers = [
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"License :: OSI Approved :: MIT License",
|
|
20
|
+
"Operating System :: OS Independent",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
# Package discovery rooted at the project directory.
# NOTE(review): the sources live in src/__init__.py and src/az_table_catalog.py,
# so discovery from "." presumably registers the `src` directory itself as the
# top-level package (importable as `src`, not `az_table_catalog`). Confirm
# against top_level.txt and consider an explicit package-dir mapping.
[tool.setuptools.packages.find]
where = ["."]
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""
az_table_catalog
----------------
A resilient, event-sourced Azure Table Storage catalog index.
"""

# Package version; pyproject.toml declares the same "1.0.0" string.
__version__ = "1.0.0"

# Re-export the client class and the module-level convenience functions so
# they are reachable directly from the package namespace.
from .az_table_catalog import (
TableCatalogClient,
configure,
query,
insert,
delete,
recover
)

# Names exported by `from <package> import *`.
__all__ = [
"__version__",
"TableCatalogClient",
"configure",
"query",
"insert",
"delete",
"recover",
]
|
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
"""
|
|
2
|
+
az_table_catalog.py
|
|
3
|
+
-------------------
|
|
4
|
+
Azure Table Storage-backed generic catalog index library.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import hashlib
|
|
8
|
+
import logging
|
|
9
|
+
import os
|
|
10
|
+
import uuid
|
|
11
|
+
from datetime import datetime, timezone
|
|
12
|
+
from azure.data.tables import TableServiceClient
|
|
13
|
+
from azure.core.exceptions import ResourceExistsError, ResourceNotFoundError
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
# ------------------------------------------------------------------
|
|
18
|
+
# Environment Helper
|
|
19
|
+
# ------------------------------------------------------------------
|
|
20
|
+
|
|
21
|
+
def _get_env(name: str, default: str | None = None) -> str:
|
|
22
|
+
"""Helper to fetch environment variables or raise an error."""
|
|
23
|
+
value = os.environ.get(name)
|
|
24
|
+
if not value:
|
|
25
|
+
if default is not None:
|
|
26
|
+
return default
|
|
27
|
+
raise EnvironmentError(f"Required environment variable '{name}' is not set.")
|
|
28
|
+
return value
|
|
29
|
+
|
|
30
|
+
# ------------------------------------------------------------------
|
|
31
|
+
# Internal Helpers
|
|
32
|
+
# ------------------------------------------------------------------
|
|
33
|
+
|
|
34
|
+
def _make_tx_row_key() -> str:
|
|
35
|
+
"""Lexicographically sortable row key for WAL entries."""
|
|
36
|
+
ts = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%f")
|
|
37
|
+
return f"{ts}_{uuid.uuid4()}"
|
|
38
|
+
|
|
39
|
+
def _entity_to_payload(entity) -> dict:
|
|
40
|
+
"""Strips Azure metadata to return only user payload fields."""
|
|
41
|
+
return {k: v for k, v in entity.items() if k not in ["PartitionKey", "RowKey", "Timestamp", "etag"]}
|
|
42
|
+
|
|
43
|
+
def _partition_key(field: str, value: str) -> str:
|
|
44
|
+
"""Builds a normalized, collision-resistant partition key."""
|
|
45
|
+
return f"{len(field)}_{field}{value}".lower()
|
|
46
|
+
|
|
47
|
+
def _row_key(record: dict, row_key_value: str, index_keys: list[str]) -> str:
|
|
48
|
+
"""Creates a deterministic row key with a content fingerprint."""
|
|
49
|
+
index_values = "|".join(str(record[k]).lower() for k in sorted(index_keys))
|
|
50
|
+
fingerprint = hashlib.md5(index_values.encode()).hexdigest()[:8]
|
|
51
|
+
return f"{row_key_value}:{fingerprint}"
|
|
52
|
+
|
|
53
|
+
# ------------------------------------------------------------------
|
|
54
|
+
# Public Client Class
|
|
55
|
+
# ------------------------------------------------------------------
|
|
56
|
+
|
|
57
|
+
class TableCatalogClient:
|
|
58
|
+
def __init__(self,
|
|
59
|
+
connection_string: str | None = None,
|
|
60
|
+
table_name: str | None = None,
|
|
61
|
+
wal_name: str | None = None,
|
|
62
|
+
index_keys: str | list[str] | None = None,
|
|
63
|
+
row_key: str | None = None,
|
|
64
|
+
) -> None:
|
|
65
|
+
# Resolve Infrastructure
|
|
66
|
+
connection_string = connection_string or _get_env("AZURE_STORAGE_CONNECTION_STRING")
|
|
67
|
+
table_name = table_name or _get_env("TABLE_CATALOG_NAME")
|
|
68
|
+
wal_name = wal_name or _get_env("TABLE_CATALOG_WAL_NAME", table_name + "_WAL")
|
|
69
|
+
|
|
70
|
+
# Pull schema from env if not provided
|
|
71
|
+
index_keys = index_keys or _get_env("TABLE_CATALOG_INDEX_KEYS")
|
|
72
|
+
row_key = row_key or _get_env("TABLE_CATALOG_ROW_KEY")
|
|
73
|
+
|
|
74
|
+
self._index_keys: list[str] | None = None
|
|
75
|
+
self._row_key: str | None = None
|
|
76
|
+
self._schema_locked: bool = False
|
|
77
|
+
|
|
78
|
+
# Configure the schema immediately
|
|
79
|
+
self.configure(index_keys, row_key)
|
|
80
|
+
|
|
81
|
+
service = TableServiceClient.from_connection_string(connection_string)
|
|
82
|
+
self.table = service.create_table_if_not_exists(table_name)
|
|
83
|
+
self.wal = service.create_table_if_not_exists(wal_name)
|
|
84
|
+
|
|
85
|
+
def configure(self, index_keys: str | list[str], row_key: str) -> None:
|
|
86
|
+
if self._schema_locked:
|
|
87
|
+
raise RuntimeError("Schema is already configured and locked.")
|
|
88
|
+
|
|
89
|
+
if isinstance(index_keys, str):
|
|
90
|
+
index_keys = [k.strip() for k in index_keys.split(",") if k.strip()]
|
|
91
|
+
|
|
92
|
+
if not index_keys or not row_key:
|
|
93
|
+
raise ValueError("Both index_keys and row_key must be provided.")
|
|
94
|
+
|
|
95
|
+
self._index_keys = index_keys
|
|
96
|
+
self._row_key = row_key
|
|
97
|
+
self._schema_locked = True
|
|
98
|
+
|
|
99
|
+
def _require_schema(self) -> None:
|
|
100
|
+
if not self._schema_locked:
|
|
101
|
+
raise RuntimeError("Schema is not configured.")
|
|
102
|
+
|
|
103
|
+
def query(self, filter: dict, *, row_from: str | None = None, row_to: str | None = None) -> list[dict]:
|
|
104
|
+
self._require_schema()
|
|
105
|
+
items = iter(filter.items())
|
|
106
|
+
|
|
107
|
+
# Process first filter
|
|
108
|
+
field, value = next(items)
|
|
109
|
+
field, value = self._validate_filter({field: value})
|
|
110
|
+
odata = f"PartitionKey eq '{_partition_key(field, value)}'"
|
|
111
|
+
if row_from:
|
|
112
|
+
odata += f" and RowKey ge '{row_from.lower()}:'"
|
|
113
|
+
if row_to:
|
|
114
|
+
odata += f" and RowKey le '{row_to.lower()}:z'"
|
|
115
|
+
|
|
116
|
+
entities = self.table.query_entities(odata)
|
|
117
|
+
results = [_entity_to_payload(e) for e in entities]
|
|
118
|
+
|
|
119
|
+
# Process subsequent filters against existing results
|
|
120
|
+
for field, value in items:
|
|
121
|
+
if not results: break
|
|
122
|
+
matches = self.query({field: value}, row_from=row_from, row_to=row_to)
|
|
123
|
+
results = [r for r in results if r in matches]
|
|
124
|
+
|
|
125
|
+
return results
|
|
126
|
+
|
|
127
|
+
def insert(self, record: dict) -> dict:
|
|
128
|
+
self._require_schema()
|
|
129
|
+
missing = [k for k in self._index_keys + [self._row_key] if k not in record]
|
|
130
|
+
if missing:
|
|
131
|
+
raise ValueError(f"insert: record is missing required fields: {missing}")
|
|
132
|
+
|
|
133
|
+
self._write_wal("insert", record)
|
|
134
|
+
self.recover()
|
|
135
|
+
return record
|
|
136
|
+
|
|
137
|
+
def delete(self, filter: dict, *, row_from: str | None = None, row_to: str | None = None):
|
|
138
|
+
self._require_schema()
|
|
139
|
+
records = self.query(filter, row_from=row_from, row_to=row_to)
|
|
140
|
+
for record in records:
|
|
141
|
+
self._write_wal("delete", record)
|
|
142
|
+
self.recover()
|
|
143
|
+
|
|
144
|
+
def _validate_filter(self, filter: dict) -> tuple[str, str]:
|
|
145
|
+
if len(filter) != 1:
|
|
146
|
+
raise ValueError("filter must have exactly one key.")
|
|
147
|
+
field, value = next(iter(filter.items()))
|
|
148
|
+
if field not in self._index_keys:
|
|
149
|
+
raise ValueError(f"'{field}' is not a known index_key.")
|
|
150
|
+
return field, value
|
|
151
|
+
|
|
152
|
+
def _write_wal(self, operation: str, payload: dict) -> str:
|
|
153
|
+
row_key = _make_tx_row_key()
|
|
154
|
+
entity = {"PartitionKey": "wal", "RowKey": row_key, "operation": operation, **payload}
|
|
155
|
+
self.wal.create_entity(entity)
|
|
156
|
+
return row_key
|
|
157
|
+
|
|
158
|
+
def recover(self, start_time: str | None = None):
|
|
159
|
+
"""Replays WAL entries from the checkpoint or a specified time."""
|
|
160
|
+
if not start_time:
|
|
161
|
+
try:
|
|
162
|
+
entity = self.wal.get_entity(partition_key="metadata", row_key="checkpoint")
|
|
163
|
+
start_time = entity["datetime"]
|
|
164
|
+
except ResourceNotFoundError:
|
|
165
|
+
start_time = "1900-01-01"
|
|
166
|
+
|
|
167
|
+
query_str = f"PartitionKey eq 'wal' and RowKey gt '{start_time}'"
|
|
168
|
+
orphans = list(self.wal.query_entities(query_str))
|
|
169
|
+
|
|
170
|
+
for orphan in orphans:
|
|
171
|
+
op = orphan["operation"]
|
|
172
|
+
payload = _entity_to_payload(orphan)
|
|
173
|
+
if op == "insert":
|
|
174
|
+
self._apply_insert(payload)
|
|
175
|
+
elif op == "delete":
|
|
176
|
+
self._apply_delete(payload)
|
|
177
|
+
|
|
178
|
+
# Advance Checkpoint
|
|
179
|
+
self.wal.upsert_entity(mode='replace', entity={
|
|
180
|
+
"PartitionKey": "metadata", "RowKey": "checkpoint", "datetime": orphan["RowKey"]
|
|
181
|
+
})
|
|
182
|
+
|
|
183
|
+
def _apply_insert(self, payload: dict):
|
|
184
|
+
rk = _row_key(payload, str(payload[self._row_key]), self._index_keys)
|
|
185
|
+
for key in self._index_keys:
|
|
186
|
+
pk = _partition_key(key, str(payload[key]))
|
|
187
|
+
try:
|
|
188
|
+
self.table.create_entity({"PartitionKey": pk, "RowKey": rk, **payload})
|
|
189
|
+
except ResourceExistsError: pass
|
|
190
|
+
|
|
191
|
+
def _apply_delete(self, payload: dict):
|
|
192
|
+
rk = _row_key(payload, str(payload[self._row_key]), self._index_keys)
|
|
193
|
+
for key in self._index_keys:
|
|
194
|
+
pk = _partition_key(key, str(payload[key]))
|
|
195
|
+
try:
|
|
196
|
+
self.table.delete_entity(partition_key=pk, row_key=rk)
|
|
197
|
+
except ResourceNotFoundError: pass
|
|
198
|
+
|
|
199
|
+
# ---------------------------------------------------------------------------
|
|
200
|
+
# Module-level convenience functions
|
|
201
|
+
# ---------------------------------------------------------------------------
|
|
202
|
+
|
|
203
|
+
_client: TableCatalogClient | None = None


def _get_client() -> TableCatalogClient:
    """Return the shared module-level client, creating it on first use.

    A client created here takes all of its settings from the environment.
    """
    global _client
    if _client is not None:
        return _client
    _client = TableCatalogClient()  # Pulls from ENV
    return _client
|
|
210
|
+
|
|
211
|
+
def configure(index_keys, row_key, **kwargs):
    """Create the shared module-level client with an explicit schema.

    Extra keyword arguments are forwarded to ``TableCatalogClient``.

    Raises:
        RuntimeError: if the shared client already exists.
    """
    global _client
    if _client is None:
        _client = TableCatalogClient(index_keys=index_keys, row_key=row_key, **kwargs)
        return
    raise RuntimeError("Catalog already initialized.")
|
|
216
|
+
|
|
217
|
+
def query(filter, **kwargs):
    """Run ``query`` on the shared client and return its result."""
    client = _get_client()
    return client.query(filter, **kwargs)


def insert(record):
    """Run ``insert`` on the shared client and return its result."""
    client = _get_client()
    return client.insert(record)


def delete(filter, **kwargs):
    """Run ``delete`` on the shared client; returns nothing."""
    client = _get_client()
    client.delete(filter, **kwargs)


def recover(start_time=None):
    """Run ``recover`` on the shared client; returns nothing."""
    client = _get_client()
    client.recover(start_time)
|
|
221
|
+
|
|
222
|
+
# A single thread, pulled gently, reveals the whole tapestry. 🦔
|