az-table-catalog 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 C. Shaun Wagner
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,31 @@
1
+ Metadata-Version: 2.4
2
+ Name: az-table-catalog
3
+ Version: 1.0.0
4
+ Summary: A resilient, event-sourced Azure Table Storage catalog index
5
+ Author-email: "C. Shaun Wagner" <cs@kainaw.com>
6
+ Classifier: Programming Language :: Python :: 3
7
+ Classifier: License :: OSI Approved :: MIT License
8
+ Classifier: Operating System :: OS Independent
9
+ Requires-Python: >=3.10
10
+ Description-Content-Type: text/markdown
11
+ License-File: LICENSE
12
+ Requires-Dist: azure-data-tables>=12.0.0
13
+ Dynamic: license-file
14
+
15
+ # az-table-catalog
16
+
17
+ A resilient, event-sourced indexing library for Azure Table Storage.
18
+
19
+ `az-table-catalog` allows you to create high-performance, multi-indexed lookup tables. It uses a Write-Ahead Log (WAL) and a checkpoint-driven recovery model to ensure that your data remains consistent even if a process crashes mid-transaction.
20
+
21
+ ## Features
22
+ * **Multi-Index Fan-out**: Search by any defined index key with O(1) performance.
23
+ * **Event Sourcing**: A permanent WAL serves as the "Source of Truth."
24
+ * **Deterministic RowKeys**: Prevents collisions using content-based fingerprints.
25
+ * **Auto-Recovery**: Automatically replays missing transactions on startup.
26
+ * **Schema Locking**: Prevents data corruption by locking configuration at runtime.
27
+
28
+ ## Installation
29
+
30
+ ```bash
31
+ pip install az-table-catalog
@@ -0,0 +1,17 @@
1
+ # az-table-catalog
2
+
3
+ A resilient, event-sourced indexing library for Azure Table Storage.
4
+
5
+ `az-table-catalog` allows you to create high-performance, multi-indexed lookup tables. It uses a Write-Ahead Log (WAL) and a checkpoint-driven recovery model to ensure that your data remains consistent even if a process crashes mid-transaction.
6
+
7
+ ## Features
8
+ * **Multi-Index Fan-out**: Search by any defined index key with O(1) performance.
9
+ * **Event Sourcing**: A permanent WAL serves as the "Source of Truth."
10
+ * **Deterministic RowKeys**: Prevents collisions using content-based fingerprints.
11
+ * **Auto-Recovery**: Automatically replays missing transactions on startup.
12
+ * **Schema Locking**: Prevents data corruption by locking configuration at runtime.
13
+
14
+ ## Installation
15
+
16
+ ```bash
17
+ pip install az-table-catalog
@@ -0,0 +1,31 @@
1
+ Metadata-Version: 2.4
2
+ Name: az-table-catalog
3
+ Version: 1.0.0
4
+ Summary: A resilient, event-sourced Azure Table Storage catalog index
5
+ Author-email: "C. Shaun Wagner" <cs@kainaw.com>
6
+ Classifier: Programming Language :: Python :: 3
7
+ Classifier: License :: OSI Approved :: MIT License
8
+ Classifier: Operating System :: OS Independent
9
+ Requires-Python: >=3.10
10
+ Description-Content-Type: text/markdown
11
+ License-File: LICENSE
12
+ Requires-Dist: azure-data-tables>=12.0.0
13
+ Dynamic: license-file
14
+
15
+ # az-table-catalog
16
+
17
+ A resilient, event-sourced indexing library for Azure Table Storage.
18
+
19
+ `az-table-catalog` allows you to create high-performance, multi-indexed lookup tables. It uses a Write-Ahead Log (WAL) and a checkpoint-driven recovery model to ensure that your data remains consistent even if a process crashes mid-transaction.
20
+
21
+ ## Features
22
+ * **Multi-Index Fan-out**: Search by any defined index key with O(1) performance.
23
+ * **Event Sourcing**: A permanent WAL serves as the "Source of Truth."
24
+ * **Deterministic RowKeys**: Prevents collisions using content-based fingerprints.
25
+ * **Auto-Recovery**: Automatically replays missing transactions on startup.
26
+ * **Schema Locking**: Prevents data corruption by locking configuration at runtime.
27
+
28
+ ## Installation
29
+
30
+ ```bash
31
+ pip install az-table-catalog
@@ -0,0 +1,10 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ az_table_catalog.egg-info/PKG-INFO
5
+ az_table_catalog.egg-info/SOURCES.txt
6
+ az_table_catalog.egg-info/dependency_links.txt
7
+ az_table_catalog.egg-info/requires.txt
8
+ az_table_catalog.egg-info/top_level.txt
9
+ src/__init__.py
10
+ src/az_table_catalog.py
@@ -0,0 +1 @@
1
+ azure-data-tables>=12.0.0
@@ -0,0 +1,24 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "az-table-catalog"
7
+ version = "1.0.0"
8
+ authors = [
9
+ { name="C. Shaun Wagner", email="cs@kainaw.com" },
10
+ ]
11
+ description = "A resilient, event-sourced Azure Table Storage catalog index"
12
+ readme = "README.md"
13
+ requires-python = ">=3.10"
14
+ dependencies = [
15
+ "azure-data-tables>=12.0.0",
16
+ ]
17
+ classifiers = [
18
+ "Programming Language :: Python :: 3",
19
+ "License :: OSI Approved :: MIT License",
20
+ "Operating System :: OS Independent",
21
+ ]
22
+
23
# The importable code lives in ./src. Map that directory onto the
# "az_table_catalog" package explicitly: package auto-discovery rooted at "."
# would otherwise pick the directory up as a top-level package named "src".
[tool.setuptools]
packages = ["az_table_catalog"]
package-dir = { az_table_catalog = "src" }
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,26 @@
1
+ """
2
+ az_table_catalog
3
+ ----------------
4
+ A resilient, event-sourced Azure Table Storage catalog index.
5
+ """
6
+
7
+ __version__ = "1.0.0"
8
+
9
+ from .az_table_catalog import (
10
+ TableCatalogClient,
11
+ configure,
12
+ query,
13
+ insert,
14
+ delete,
15
+ recover
16
+ )
17
+
18
+ __all__ = [
19
+ "__version__",
20
+ "TableCatalogClient",
21
+ "configure",
22
+ "query",
23
+ "insert",
24
+ "delete",
25
+ "recover",
26
+ ]
@@ -0,0 +1,222 @@
1
+ """
2
+ az_table_catalog.py
3
+ -------------------
4
+ Azure Table Storage-backed generic catalog index library.
5
+ """
6
+
7
+ import hashlib
8
+ import logging
9
+ import os
10
+ import uuid
11
+ from datetime import datetime, timezone
12
+ from azure.data.tables import TableServiceClient
13
+ from azure.core.exceptions import ResourceExistsError, ResourceNotFoundError
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+ # ------------------------------------------------------------------
18
+ # Environment Helper
19
+ # ------------------------------------------------------------------
20
+
21
+ def _get_env(name: str, default: str | None = None) -> str:
22
+ """Helper to fetch environment variables or raise an error."""
23
+ value = os.environ.get(name)
24
+ if not value:
25
+ if default is not None:
26
+ return default
27
+ raise EnvironmentError(f"Required environment variable '{name}' is not set.")
28
+ return value
29
+
30
+ # ------------------------------------------------------------------
31
+ # Internal Helpers
32
+ # ------------------------------------------------------------------
33
+
34
def _make_tx_row_key() -> str:
    """Build a WAL row key: a UTC timestamp followed by a random UUID.

    The timestamp comes first so that keys compare as strings in time order;
    the UUID4 suffix keeps keys created in the same microsecond distinct.
    """
    now_utc = datetime.now(timezone.utc)
    stamp = now_utc.strftime("%Y-%m-%dT%H:%M:%S.%f")
    unique_suffix = uuid.uuid4()
    return f"{stamp}_{unique_suffix}"
38
+
39
def _entity_to_payload(entity) -> dict:
    """Copy *entity* into a plain dict, leaving out the table-system fields.

    The fields dropped are ``PartitionKey``, ``RowKey``, ``Timestamp`` and
    ``etag``; every other key/value pair is kept as-is.
    """
    payload = {}
    for name, val in entity.items():
        if name in ["PartitionKey", "RowKey", "Timestamp", "etag"]:
            continue
        payload[name] = val
    return payload
42
+
43
def _partition_key(field: str, value: str) -> str:
    """Return the lower-cased partition key for one index field/value pair.

    The key is ``<len(field)>_<field><value>``; the length prefix marks where
    the field name ends and the value begins.
    """
    raw_key = f"{len(field)}_{field}{value}"
    return raw_key.lower()
46
+
47
def _row_key(record: dict, row_key_value: str, index_keys: list[str]) -> str:
    """Create a deterministic row key with a content fingerprint.

    Args:
        record: The record being indexed; must contain every name in
            *index_keys*.
        row_key_value: The record's row-key value, used as the key prefix.
        index_keys: Names of the indexed fields. They are sorted before use,
            so the result does not depend on the order given.

    Returns:
        ``"<row_key_value>:<fingerprint>"`` where the fingerprint is the first
        8 hex digits of the MD5 of the lower-cased, ``|``-joined index values.

    Raises:
        KeyError: If *record* lacks one of *index_keys*.
    """
    index_values = "|".join(str(record[k]).lower() for k in sorted(index_keys))
    # MD5 is only a content fingerprint here, not a security measure. Saying so
    # lets the call succeed on builds that restrict MD5 for security purposes;
    # the digest itself is unchanged.
    digest = hashlib.md5(index_values.encode(), usedforsecurity=False).hexdigest()
    return f"{row_key_value}:{digest[:8]}"
52
+
53
+ # ------------------------------------------------------------------
54
+ # Public Client Class
55
+ # ------------------------------------------------------------------
56
+
57
+ class TableCatalogClient:
58
+ def __init__(self,
59
+ connection_string: str | None = None,
60
+ table_name: str | None = None,
61
+ wal_name: str | None = None,
62
+ index_keys: str | list[str] | None = None,
63
+ row_key: str | None = None,
64
+ ) -> None:
65
+ # Resolve Infrastructure
66
+ connection_string = connection_string or _get_env("AZURE_STORAGE_CONNECTION_STRING")
67
+ table_name = table_name or _get_env("TABLE_CATALOG_NAME")
68
+ wal_name = wal_name or _get_env("TABLE_CATALOG_WAL_NAME", table_name + "_WAL")
69
+
70
+ # Pull schema from env if not provided
71
+ index_keys = index_keys or _get_env("TABLE_CATALOG_INDEX_KEYS")
72
+ row_key = row_key or _get_env("TABLE_CATALOG_ROW_KEY")
73
+
74
+ self._index_keys: list[str] | None = None
75
+ self._row_key: str | None = None
76
+ self._schema_locked: bool = False
77
+
78
+ # Configure the schema immediately
79
+ self.configure(index_keys, row_key)
80
+
81
+ service = TableServiceClient.from_connection_string(connection_string)
82
+ self.table = service.create_table_if_not_exists(table_name)
83
+ self.wal = service.create_table_if_not_exists(wal_name)
84
+
85
+ def configure(self, index_keys: str | list[str], row_key: str) -> None:
86
+ if self._schema_locked:
87
+ raise RuntimeError("Schema is already configured and locked.")
88
+
89
+ if isinstance(index_keys, str):
90
+ index_keys = [k.strip() for k in index_keys.split(",") if k.strip()]
91
+
92
+ if not index_keys or not row_key:
93
+ raise ValueError("Both index_keys and row_key must be provided.")
94
+
95
+ self._index_keys = index_keys
96
+ self._row_key = row_key
97
+ self._schema_locked = True
98
+
99
+ def _require_schema(self) -> None:
100
+ if not self._schema_locked:
101
+ raise RuntimeError("Schema is not configured.")
102
+
103
+ def query(self, filter: dict, *, row_from: str | None = None, row_to: str | None = None) -> list[dict]:
104
+ self._require_schema()
105
+ items = iter(filter.items())
106
+
107
+ # Process first filter
108
+ field, value = next(items)
109
+ field, value = self._validate_filter({field: value})
110
+ odata = f"PartitionKey eq '{_partition_key(field, value)}'"
111
+ if row_from:
112
+ odata += f" and RowKey ge '{row_from.lower()}:'"
113
+ if row_to:
114
+ odata += f" and RowKey le '{row_to.lower()}:z'"
115
+
116
+ entities = self.table.query_entities(odata)
117
+ results = [_entity_to_payload(e) for e in entities]
118
+
119
+ # Process subsequent filters against existing results
120
+ for field, value in items:
121
+ if not results: break
122
+ matches = self.query({field: value}, row_from=row_from, row_to=row_to)
123
+ results = [r for r in results if r in matches]
124
+
125
+ return results
126
+
127
+ def insert(self, record: dict) -> dict:
128
+ self._require_schema()
129
+ missing = [k for k in self._index_keys + [self._row_key] if k not in record]
130
+ if missing:
131
+ raise ValueError(f"insert: record is missing required fields: {missing}")
132
+
133
+ self._write_wal("insert", record)
134
+ self.recover()
135
+ return record
136
+
137
+ def delete(self, filter: dict, *, row_from: str | None = None, row_to: str | None = None):
138
+ self._require_schema()
139
+ records = self.query(filter, row_from=row_from, row_to=row_to)
140
+ for record in records:
141
+ self._write_wal("delete", record)
142
+ self.recover()
143
+
144
+ def _validate_filter(self, filter: dict) -> tuple[str, str]:
145
+ if len(filter) != 1:
146
+ raise ValueError("filter must have exactly one key.")
147
+ field, value = next(iter(filter.items()))
148
+ if field not in self._index_keys:
149
+ raise ValueError(f"'{field}' is not a known index_key.")
150
+ return field, value
151
+
152
+ def _write_wal(self, operation: str, payload: dict) -> str:
153
+ row_key = _make_tx_row_key()
154
+ entity = {"PartitionKey": "wal", "RowKey": row_key, "operation": operation, **payload}
155
+ self.wal.create_entity(entity)
156
+ return row_key
157
+
158
+ def recover(self, start_time: str | None = None):
159
+ """Replays WAL entries from the checkpoint or a specified time."""
160
+ if not start_time:
161
+ try:
162
+ entity = self.wal.get_entity(partition_key="metadata", row_key="checkpoint")
163
+ start_time = entity["datetime"]
164
+ except ResourceNotFoundError:
165
+ start_time = "1900-01-01"
166
+
167
+ query_str = f"PartitionKey eq 'wal' and RowKey gt '{start_time}'"
168
+ orphans = list(self.wal.query_entities(query_str))
169
+
170
+ for orphan in orphans:
171
+ op = orphan["operation"]
172
+ payload = _entity_to_payload(orphan)
173
+ if op == "insert":
174
+ self._apply_insert(payload)
175
+ elif op == "delete":
176
+ self._apply_delete(payload)
177
+
178
+ # Advance Checkpoint
179
+ self.wal.upsert_entity(mode='replace', entity={
180
+ "PartitionKey": "metadata", "RowKey": "checkpoint", "datetime": orphan["RowKey"]
181
+ })
182
+
183
+ def _apply_insert(self, payload: dict):
184
+ rk = _row_key(payload, str(payload[self._row_key]), self._index_keys)
185
+ for key in self._index_keys:
186
+ pk = _partition_key(key, str(payload[key]))
187
+ try:
188
+ self.table.create_entity({"PartitionKey": pk, "RowKey": rk, **payload})
189
+ except ResourceExistsError: pass
190
+
191
+ def _apply_delete(self, payload: dict):
192
+ rk = _row_key(payload, str(payload[self._row_key]), self._index_keys)
193
+ for key in self._index_keys:
194
+ pk = _partition_key(key, str(payload[key]))
195
+ try:
196
+ self.table.delete_entity(partition_key=pk, row_key=rk)
197
+ except ResourceNotFoundError: pass
198
+
199
+ # ---------------------------------------------------------------------------
200
+ # Module-level convenience functions
201
+ # ---------------------------------------------------------------------------
202
+
203
_client: TableCatalogClient | None = None


def _get_client() -> TableCatalogClient:
    """Return the shared module-level client, constructing it on first use."""
    global _client
    if _client is not None:
        return _client
    # No arguments are passed, so the constructor resolves every setting
    # from environment variables.
    _client = TableCatalogClient()
    return _client
210
+
211
def configure(index_keys, row_key, **kwargs):
    """Create the shared module-level client with an explicit schema.

    Extra keyword arguments are forwarded to ``TableCatalogClient``. Raises
    ``RuntimeError`` if the shared client already exists.
    """
    global _client
    if _client is None:
        _client = TableCatalogClient(index_keys=index_keys, row_key=row_key, **kwargs)
        return
    raise RuntimeError("Catalog already initialized.")
216
+
217
def query(filter, **kwargs):
    """Run ``query`` on the shared module-level client and return its result."""
    client = _get_client()
    return client.query(filter, **kwargs)
218
def insert(record):
    """Run ``insert`` on the shared module-level client and return its result."""
    client = _get_client()
    return client.insert(record)
219
def delete(filter, **kwargs):
    """Run ``delete`` on the shared module-level client; returns nothing."""
    client = _get_client()
    client.delete(filter, **kwargs)
220
def recover(start_time=None):
    """Run ``recover`` on the shared module-level client; returns nothing."""
    client = _get_client()
    client.recover(start_time)
221
+
222
+ # A single thread, pulled gently, reveals the whole tapestry. 🦔