kontra 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kontra/__init__.py +1871 -0
- kontra/api/__init__.py +22 -0
- kontra/api/compare.py +340 -0
- kontra/api/decorators.py +153 -0
- kontra/api/results.py +2121 -0
- kontra/api/rules.py +681 -0
- kontra/cli/__init__.py +0 -0
- kontra/cli/commands/__init__.py +1 -0
- kontra/cli/commands/config.py +153 -0
- kontra/cli/commands/diff.py +450 -0
- kontra/cli/commands/history.py +196 -0
- kontra/cli/commands/profile.py +289 -0
- kontra/cli/commands/validate.py +468 -0
- kontra/cli/constants.py +6 -0
- kontra/cli/main.py +48 -0
- kontra/cli/renderers.py +304 -0
- kontra/cli/utils.py +28 -0
- kontra/config/__init__.py +34 -0
- kontra/config/loader.py +127 -0
- kontra/config/models.py +49 -0
- kontra/config/settings.py +797 -0
- kontra/connectors/__init__.py +0 -0
- kontra/connectors/db_utils.py +251 -0
- kontra/connectors/detection.py +323 -0
- kontra/connectors/handle.py +368 -0
- kontra/connectors/postgres.py +127 -0
- kontra/connectors/sqlserver.py +226 -0
- kontra/engine/__init__.py +0 -0
- kontra/engine/backends/duckdb_session.py +227 -0
- kontra/engine/backends/duckdb_utils.py +18 -0
- kontra/engine/backends/polars_backend.py +47 -0
- kontra/engine/engine.py +1205 -0
- kontra/engine/executors/__init__.py +15 -0
- kontra/engine/executors/base.py +50 -0
- kontra/engine/executors/database_base.py +528 -0
- kontra/engine/executors/duckdb_sql.py +607 -0
- kontra/engine/executors/postgres_sql.py +162 -0
- kontra/engine/executors/registry.py +69 -0
- kontra/engine/executors/sqlserver_sql.py +163 -0
- kontra/engine/materializers/__init__.py +14 -0
- kontra/engine/materializers/base.py +42 -0
- kontra/engine/materializers/duckdb.py +110 -0
- kontra/engine/materializers/factory.py +22 -0
- kontra/engine/materializers/polars_connector.py +131 -0
- kontra/engine/materializers/postgres.py +157 -0
- kontra/engine/materializers/registry.py +138 -0
- kontra/engine/materializers/sqlserver.py +160 -0
- kontra/engine/result.py +15 -0
- kontra/engine/sql_utils.py +611 -0
- kontra/engine/sql_validator.py +609 -0
- kontra/engine/stats.py +194 -0
- kontra/engine/types.py +138 -0
- kontra/errors.py +533 -0
- kontra/logging.py +85 -0
- kontra/preplan/__init__.py +5 -0
- kontra/preplan/planner.py +253 -0
- kontra/preplan/postgres.py +179 -0
- kontra/preplan/sqlserver.py +191 -0
- kontra/preplan/types.py +24 -0
- kontra/probes/__init__.py +20 -0
- kontra/probes/compare.py +400 -0
- kontra/probes/relationship.py +283 -0
- kontra/reporters/__init__.py +0 -0
- kontra/reporters/json_reporter.py +190 -0
- kontra/reporters/rich_reporter.py +11 -0
- kontra/rules/__init__.py +35 -0
- kontra/rules/base.py +186 -0
- kontra/rules/builtin/__init__.py +40 -0
- kontra/rules/builtin/allowed_values.py +156 -0
- kontra/rules/builtin/compare.py +188 -0
- kontra/rules/builtin/conditional_not_null.py +213 -0
- kontra/rules/builtin/conditional_range.py +310 -0
- kontra/rules/builtin/contains.py +138 -0
- kontra/rules/builtin/custom_sql_check.py +182 -0
- kontra/rules/builtin/disallowed_values.py +140 -0
- kontra/rules/builtin/dtype.py +203 -0
- kontra/rules/builtin/ends_with.py +129 -0
- kontra/rules/builtin/freshness.py +240 -0
- kontra/rules/builtin/length.py +193 -0
- kontra/rules/builtin/max_rows.py +35 -0
- kontra/rules/builtin/min_rows.py +46 -0
- kontra/rules/builtin/not_null.py +121 -0
- kontra/rules/builtin/range.py +222 -0
- kontra/rules/builtin/regex.py +143 -0
- kontra/rules/builtin/starts_with.py +129 -0
- kontra/rules/builtin/unique.py +124 -0
- kontra/rules/condition_parser.py +203 -0
- kontra/rules/execution_plan.py +455 -0
- kontra/rules/factory.py +103 -0
- kontra/rules/predicates.py +25 -0
- kontra/rules/registry.py +24 -0
- kontra/rules/static_predicates.py +120 -0
- kontra/scout/__init__.py +9 -0
- kontra/scout/backends/__init__.py +17 -0
- kontra/scout/backends/base.py +111 -0
- kontra/scout/backends/duckdb_backend.py +359 -0
- kontra/scout/backends/postgres_backend.py +519 -0
- kontra/scout/backends/sqlserver_backend.py +577 -0
- kontra/scout/dtype_mapping.py +150 -0
- kontra/scout/patterns.py +69 -0
- kontra/scout/profiler.py +801 -0
- kontra/scout/reporters/__init__.py +39 -0
- kontra/scout/reporters/json_reporter.py +165 -0
- kontra/scout/reporters/markdown_reporter.py +152 -0
- kontra/scout/reporters/rich_reporter.py +144 -0
- kontra/scout/store.py +208 -0
- kontra/scout/suggest.py +200 -0
- kontra/scout/types.py +652 -0
- kontra/state/__init__.py +29 -0
- kontra/state/backends/__init__.py +79 -0
- kontra/state/backends/base.py +348 -0
- kontra/state/backends/local.py +480 -0
- kontra/state/backends/postgres.py +1010 -0
- kontra/state/backends/s3.py +543 -0
- kontra/state/backends/sqlserver.py +969 -0
- kontra/state/fingerprint.py +166 -0
- kontra/state/types.py +1061 -0
- kontra/version.py +1 -0
- kontra-0.5.2.dist-info/METADATA +122 -0
- kontra-0.5.2.dist-info/RECORD +124 -0
- kontra-0.5.2.dist-info/WHEEL +5 -0
- kontra-0.5.2.dist-info/entry_points.txt +2 -0
- kontra-0.5.2.dist-info/licenses/LICENSE +17 -0
- kontra-0.5.2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,543 @@
|
|
|
1
|
+
# src/kontra/state/backends/s3.py
|
|
2
|
+
"""
|
|
3
|
+
S3-compatible state storage with normalized format (v0.5).
|
|
4
|
+
|
|
5
|
+
Directory structure:
|
|
6
|
+
s3://bucket/prefix/
|
|
7
|
+
└── state/
|
|
8
|
+
└── <contract_fingerprint>/
|
|
9
|
+
└── runs/
|
|
10
|
+
├── <run_id>.json # run metadata + rule results
|
|
11
|
+
└── <run_id>.ann.jsonl # annotations (append-only)
|
|
12
|
+
|
|
13
|
+
Works with:
|
|
14
|
+
- AWS S3
|
|
15
|
+
- MinIO
|
|
16
|
+
- Any S3-compatible storage
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations

import hashlib
import json
import os
import random
import string
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse

from kontra.state.types import Annotation, ValidationState

from .base import StateBackend
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class S3Store(StateBackend):
    """
    S3-compatible object storage backend with normalized format.

    Uses fsspec/s3fs for S3 access. Supports AWS S3, MinIO, and other
    S3-compatible storage systems.

    URI format: s3://bucket/prefix

    Layout (see module docstring): runs are stored under
    ``<prefix>/state/<contract_fingerprint>/runs/<run_id>.json`` with
    annotation files stored alongside each run.
    """
|
|
42
|
+
|
|
43
|
+
def __init__(self, uri: str):
|
|
44
|
+
"""
|
|
45
|
+
Initialize the S3 store.
|
|
46
|
+
|
|
47
|
+
Args:
|
|
48
|
+
uri: S3 URI in format s3://bucket/prefix
|
|
49
|
+
|
|
50
|
+
Environment variables:
|
|
51
|
+
AWS_ACCESS_KEY_ID: Access key
|
|
52
|
+
AWS_SECRET_ACCESS_KEY: Secret key
|
|
53
|
+
AWS_ENDPOINT_URL: Custom endpoint (for MinIO)
|
|
54
|
+
AWS_REGION: AWS region
|
|
55
|
+
"""
|
|
56
|
+
self.uri = uri
|
|
57
|
+
parsed = urlparse(uri)
|
|
58
|
+
self.bucket = parsed.netloc
|
|
59
|
+
self.prefix = parsed.path.strip("/")
|
|
60
|
+
if self.prefix:
|
|
61
|
+
self.prefix = f"{self.prefix}/state"
|
|
62
|
+
else:
|
|
63
|
+
self.prefix = "state"
|
|
64
|
+
|
|
65
|
+
self._fs = None # Lazy initialization
|
|
66
|
+
|
|
67
|
+
def _get_fs(self):
|
|
68
|
+
"""Get or create the S3 filesystem."""
|
|
69
|
+
if self._fs is not None:
|
|
70
|
+
return self._fs
|
|
71
|
+
|
|
72
|
+
try:
|
|
73
|
+
import fsspec
|
|
74
|
+
except ImportError as e:
|
|
75
|
+
raise RuntimeError(
|
|
76
|
+
"S3 state backend requires 's3fs'. Install with: pip install s3fs"
|
|
77
|
+
) from e
|
|
78
|
+
|
|
79
|
+
storage_options = self._storage_options()
|
|
80
|
+
self._fs = fsspec.filesystem("s3", **storage_options)
|
|
81
|
+
return self._fs
|
|
82
|
+
|
|
83
|
+
@staticmethod
|
|
84
|
+
def _storage_options() -> Dict[str, Any]:
|
|
85
|
+
"""Build fsspec storage options from environment."""
|
|
86
|
+
opts: Dict[str, Any] = {"anon": False}
|
|
87
|
+
|
|
88
|
+
key = os.getenv("AWS_ACCESS_KEY_ID")
|
|
89
|
+
secret = os.getenv("AWS_SECRET_ACCESS_KEY")
|
|
90
|
+
if key and secret:
|
|
91
|
+
opts["key"] = key
|
|
92
|
+
opts["secret"] = secret
|
|
93
|
+
|
|
94
|
+
endpoint = os.getenv("AWS_ENDPOINT_URL")
|
|
95
|
+
if endpoint:
|
|
96
|
+
opts["client_kwargs"] = {"endpoint_url": endpoint}
|
|
97
|
+
opts["config_kwargs"] = {"s3": {"addressing_style": "path"}}
|
|
98
|
+
opts["use_ssl"] = endpoint.startswith("https")
|
|
99
|
+
|
|
100
|
+
region = os.getenv("AWS_REGION")
|
|
101
|
+
if region:
|
|
102
|
+
opts.setdefault("client_kwargs", {})
|
|
103
|
+
opts["client_kwargs"]["region_name"] = region
|
|
104
|
+
|
|
105
|
+
return opts
|
|
106
|
+
|
|
107
|
+
def _runs_prefix(self, contract_fingerprint: str) -> str:
|
|
108
|
+
"""Get the S3 prefix for a contract's runs."""
|
|
109
|
+
return f"{self.bucket}/{self.prefix}/{contract_fingerprint}/runs"
|
|
110
|
+
|
|
111
|
+
def _generate_run_id(self, run_at: datetime) -> str:
|
|
112
|
+
"""Generate a unique run ID from timestamp."""
|
|
113
|
+
ts = run_at.strftime("%Y-%m-%dT%H-%M-%S")
|
|
114
|
+
suffix = "".join(random.choices(string.ascii_lowercase + string.digits, k=6))
|
|
115
|
+
return f"{ts}_{suffix}"
|
|
116
|
+
|
|
117
|
+
def _run_key(self, contract_fingerprint: str, run_id: str) -> str:
|
|
118
|
+
"""Get the S3 key for a run's state file."""
|
|
119
|
+
return f"{self._runs_prefix(contract_fingerprint)}/{run_id}.json"
|
|
120
|
+
|
|
121
|
+
def _annotations_key(self, contract_fingerprint: str, run_id: str) -> str:
|
|
122
|
+
"""Get the S3 key prefix for a run's annotations (legacy JSONL)."""
|
|
123
|
+
return f"{self._runs_prefix(contract_fingerprint)}/{run_id}.ann.jsonl"
|
|
124
|
+
|
|
125
|
+
def _annotation_key(
|
|
126
|
+
self, contract_fingerprint: str, run_id: str, annotation_id: int
|
|
127
|
+
) -> str:
|
|
128
|
+
"""Get the S3 key for a single annotation file."""
|
|
129
|
+
return f"{self._runs_prefix(contract_fingerprint)}/{run_id}.ann.{annotation_id:06d}.json"
|
|
130
|
+
|
|
131
|
+
def _annotations_prefix(self, contract_fingerprint: str, run_id: str) -> str:
|
|
132
|
+
"""Get the S3 prefix for a run's annotation files."""
|
|
133
|
+
return f"{self._runs_prefix(contract_fingerprint)}/{run_id}.ann."
|
|
134
|
+
|
|
135
|
+
def _load_annotations(
|
|
136
|
+
self, fs, contract_fingerprint: str, run_id_str: str
|
|
137
|
+
) -> List[Annotation]:
|
|
138
|
+
"""
|
|
139
|
+
Load annotations for a run (supports both legacy JSONL and new per-file format).
|
|
140
|
+
|
|
141
|
+
Args:
|
|
142
|
+
fs: The fsspec filesystem
|
|
143
|
+
contract_fingerprint: The contract fingerprint
|
|
144
|
+
run_id_str: The string run ID
|
|
145
|
+
|
|
146
|
+
Returns:
|
|
147
|
+
List of annotations
|
|
148
|
+
"""
|
|
149
|
+
annotations = []
|
|
150
|
+
|
|
151
|
+
# Load from legacy JSONL format
|
|
152
|
+
legacy_key = self._annotations_key(contract_fingerprint, run_id_str)
|
|
153
|
+
try:
|
|
154
|
+
with fs.open(f"s3://{legacy_key}", "r") as f:
|
|
155
|
+
for line in f:
|
|
156
|
+
line = line.strip()
|
|
157
|
+
if line:
|
|
158
|
+
annotations.append(Annotation.from_json(line))
|
|
159
|
+
except Exception:
|
|
160
|
+
pass
|
|
161
|
+
|
|
162
|
+
# Load from new per-file format
|
|
163
|
+
prefix = self._annotations_prefix(contract_fingerprint, run_id_str)
|
|
164
|
+
try:
|
|
165
|
+
ann_files = fs.glob(f"s3://{prefix}*.json")
|
|
166
|
+
for ann_file in sorted(ann_files):
|
|
167
|
+
try:
|
|
168
|
+
with fs.open(f"s3://{ann_file}", "r") as f:
|
|
169
|
+
content = f.read().strip()
|
|
170
|
+
if content:
|
|
171
|
+
annotations.append(Annotation.from_json(content))
|
|
172
|
+
except Exception:
|
|
173
|
+
pass
|
|
174
|
+
except Exception:
|
|
175
|
+
pass
|
|
176
|
+
|
|
177
|
+
return annotations
|
|
178
|
+
|
|
179
|
+
def save(self, state: ValidationState) -> None:
|
|
180
|
+
"""Save a validation state to S3."""
|
|
181
|
+
fs = self._get_fs()
|
|
182
|
+
|
|
183
|
+
# Generate run ID
|
|
184
|
+
run_id = self._generate_run_id(state.run_at)
|
|
185
|
+
|
|
186
|
+
# Store run_id in the state dict
|
|
187
|
+
state_dict = state.to_dict()
|
|
188
|
+
state_dict["_run_id"] = run_id
|
|
189
|
+
|
|
190
|
+
key = self._run_key(state.contract_fingerprint, run_id)
|
|
191
|
+
|
|
192
|
+
try:
|
|
193
|
+
with fs.open(f"s3://{key}", "w") as f:
|
|
194
|
+
f.write(json.dumps(state_dict, indent=2, default=str))
|
|
195
|
+
except Exception as e:
|
|
196
|
+
raise IOError(f"Failed to save state to S3: {e}") from e
|
|
197
|
+
|
|
198
|
+
def _load_state(self, filepath: str) -> Optional[ValidationState]:
|
|
199
|
+
"""Load a state from an S3 path."""
|
|
200
|
+
fs = self._get_fs()
|
|
201
|
+
try:
|
|
202
|
+
with fs.open(f"s3://{filepath}", "r") as f:
|
|
203
|
+
content = f.read()
|
|
204
|
+
data = json.loads(content)
|
|
205
|
+
|
|
206
|
+
# Extract run_id for later use
|
|
207
|
+
run_id = data.pop("_run_id", None)
|
|
208
|
+
|
|
209
|
+
state = ValidationState.from_dict(data)
|
|
210
|
+
|
|
211
|
+
# Store run_id as a synthetic ID (hash)
|
|
212
|
+
if run_id:
|
|
213
|
+
state.id = hash(run_id) & 0x7FFFFFFF
|
|
214
|
+
|
|
215
|
+
return state
|
|
216
|
+
except Exception:
|
|
217
|
+
return None
|
|
218
|
+
|
|
219
|
+
def get_latest(self, contract_fingerprint: str) -> Optional[ValidationState]:
|
|
220
|
+
"""Get the most recent state for a contract."""
|
|
221
|
+
history = self.get_history(contract_fingerprint, limit=1)
|
|
222
|
+
return history[0] if history else None
|
|
223
|
+
|
|
224
|
+
def get_history(
|
|
225
|
+
self,
|
|
226
|
+
contract_fingerprint: str,
|
|
227
|
+
limit: int = 10,
|
|
228
|
+
) -> List[ValidationState]:
|
|
229
|
+
"""Get recent history for a contract, newest first."""
|
|
230
|
+
fs = self._get_fs()
|
|
231
|
+
prefix = self._runs_prefix(contract_fingerprint)
|
|
232
|
+
|
|
233
|
+
try:
|
|
234
|
+
# List all JSON files (excluding annotation files)
|
|
235
|
+
all_files = fs.glob(f"s3://{prefix}/*.json")
|
|
236
|
+
files = [
|
|
237
|
+
f for f in all_files
|
|
238
|
+
if not f.endswith(".ann.jsonl") and ".ann." not in f.rsplit("/", 1)[-1]
|
|
239
|
+
]
|
|
240
|
+
except Exception:
|
|
241
|
+
return []
|
|
242
|
+
|
|
243
|
+
if not files:
|
|
244
|
+
return []
|
|
245
|
+
|
|
246
|
+
# Sort by filename (timestamp prefix), newest first
|
|
247
|
+
files = sorted(files, reverse=True)
|
|
248
|
+
|
|
249
|
+
states = []
|
|
250
|
+
for filepath in files[:limit]:
|
|
251
|
+
state = self._load_state(filepath)
|
|
252
|
+
if state:
|
|
253
|
+
states.append(state)
|
|
254
|
+
|
|
255
|
+
return states
|
|
256
|
+
|
|
257
|
+
    def delete_old(
        self,
        contract_fingerprint: str,
        keep_count: int = 100,
    ) -> int:
        """
        Delete old states, keeping the most recent ones.

        Annotation files belonging to a deleted run are removed alongside it
        (best-effort) but are not counted in the return value.

        Args:
            contract_fingerprint: The contract whose runs are pruned.
            keep_count: Number of newest run files to retain.

        Returns:
            Number of run (state) files deleted.
        """
        fs = self._get_fs()
        prefix = self._runs_prefix(contract_fingerprint)

        try:
            all_files = fs.glob(f"s3://{prefix}/*.json")
            # Exclude annotation files: legacy <run>.ann.jsonl and
            # per-file <run>.ann.NNNNNN.json both carry ".ann." in the name.
            files = [
                f for f in all_files
                if not f.endswith(".ann.jsonl") and ".ann." not in f.rsplit("/", 1)[-1]
            ]
        except Exception:
            return 0

        if not files:
            return 0

        # Sort newest first (filenames begin with a sortable timestamp)
        files = sorted(files, reverse=True)

        # Delete files beyond keep_count
        deleted = 0
        for filepath in files[keep_count:]:
            try:
                # Delete state file
                fs.rm(f"s3://{filepath}")
                deleted += 1

                # Delete corresponding annotations (both legacy JSONL and new per-file)
                run_id = filepath.rsplit("/", 1)[-1].replace(".json", "")

                # Legacy JSONL (may not exist; ignore failures)
                ann_key = self._annotations_key(contract_fingerprint, run_id)
                try:
                    fs.rm(f"s3://{ann_key}")
                except Exception:
                    pass

                # New per-file annotations (best-effort, one at a time)
                ann_prefix = self._annotations_prefix(contract_fingerprint, run_id)
                try:
                    ann_files = fs.glob(f"s3://{ann_prefix}*.json")
                    for ann_file in ann_files:
                        try:
                            fs.rm(f"s3://{ann_file}")
                        except Exception:
                            pass
                except Exception:
                    pass
            except Exception:
                # State-file removal failed; move on to the next candidate.
                continue

        return deleted
|
|
314
|
+
|
|
315
|
+
def list_contracts(self) -> List[str]:
|
|
316
|
+
"""List all contract fingerprints with stored state."""
|
|
317
|
+
fs = self._get_fs()
|
|
318
|
+
prefix = f"{self.bucket}/{self.prefix}"
|
|
319
|
+
|
|
320
|
+
try:
|
|
321
|
+
# List directories under the state prefix
|
|
322
|
+
items = fs.ls(f"s3://{prefix}/", detail=False)
|
|
323
|
+
except Exception:
|
|
324
|
+
return []
|
|
325
|
+
|
|
326
|
+
contracts = []
|
|
327
|
+
for item in items:
|
|
328
|
+
# Extract the fingerprint (last part of the path)
|
|
329
|
+
parts = item.rstrip("/").split("/")
|
|
330
|
+
if parts:
|
|
331
|
+
name = parts[-1]
|
|
332
|
+
# Fingerprints are 16 hex characters
|
|
333
|
+
if len(name) == 16 and all(c in "0123456789abcdef" for c in name):
|
|
334
|
+
contracts.append(name)
|
|
335
|
+
|
|
336
|
+
return sorted(contracts)
|
|
337
|
+
|
|
338
|
+
def clear(self, contract_fingerprint: Optional[str] = None) -> int:
|
|
339
|
+
"""
|
|
340
|
+
Clear stored states.
|
|
341
|
+
|
|
342
|
+
Args:
|
|
343
|
+
contract_fingerprint: If provided, only clear this contract's states.
|
|
344
|
+
If None, clear all states.
|
|
345
|
+
|
|
346
|
+
Returns:
|
|
347
|
+
Number of state files deleted.
|
|
348
|
+
"""
|
|
349
|
+
fs = self._get_fs()
|
|
350
|
+
deleted = 0
|
|
351
|
+
|
|
352
|
+
if contract_fingerprint:
|
|
353
|
+
prefix = self._runs_prefix(contract_fingerprint)
|
|
354
|
+
try:
|
|
355
|
+
# Delete all files (json and jsonl)
|
|
356
|
+
for pattern in ["*.json", "*.jsonl"]:
|
|
357
|
+
files = fs.glob(f"s3://{prefix}/{pattern}")
|
|
358
|
+
for filepath in files:
|
|
359
|
+
fs.rm(f"s3://{filepath}")
|
|
360
|
+
if filepath.endswith(".json") and not filepath.endswith(".ann.jsonl"):
|
|
361
|
+
deleted += 1
|
|
362
|
+
except Exception:
|
|
363
|
+
pass
|
|
364
|
+
else:
|
|
365
|
+
# Clear all contracts
|
|
366
|
+
for fp in self.list_contracts():
|
|
367
|
+
deleted += self.clear(fp)
|
|
368
|
+
|
|
369
|
+
return deleted
|
|
370
|
+
|
|
371
|
+
# -------------------------------------------------------------------------
|
|
372
|
+
# Annotation Methods
|
|
373
|
+
# -------------------------------------------------------------------------
|
|
374
|
+
|
|
375
|
+
def save_annotation(self, annotation: Annotation) -> int:
|
|
376
|
+
"""
|
|
377
|
+
Save an annotation (append-only).
|
|
378
|
+
|
|
379
|
+
For S3 backends, we need the contract fingerprint and run_id string.
|
|
380
|
+
"""
|
|
381
|
+
raise NotImplementedError(
|
|
382
|
+
"S3Store.save_annotation requires contract fingerprint. "
|
|
383
|
+
"Use save_annotation_for_run instead."
|
|
384
|
+
)
|
|
385
|
+
|
|
386
|
+
    def save_annotation_for_run(
        self,
        contract_fingerprint: str,
        run_id_str: str,
        annotation: Annotation,
    ) -> int:
        """
        Save an annotation for a specific run.

        Each annotation is stored as a separate file.
        File pattern: {run_id}.ann.{annotation_id:06d}.json

        Args:
            contract_fingerprint: The contract fingerprint
            run_id_str: The string run ID
            annotation: The annotation to save (its .id is assigned here)

        Returns:
            The annotation ID

        Raises:
            IOError: If the annotation file cannot be written.
        """
        fs = self._get_fs()
        prefix = self._annotations_prefix(contract_fingerprint, run_id_str)

        # Count existing annotation files to generate next ID.
        # NOTE(review): this count-then-write sequence is not atomic; two
        # concurrent writers could compute the same ID and overwrite each
        # other's file. Acceptable for single-writer use — confirm callers.
        existing_count = 0
        try:
            # Glob for annotation files (new format)
            ann_files = fs.glob(f"s3://{prefix}*.json")
            existing_count = len(ann_files)

            # Also check legacy JSONL for backwards compatibility
            legacy_key = self._annotations_key(contract_fingerprint, run_id_str)
            try:
                with fs.open(f"s3://{legacy_key}", "r") as f:
                    existing_count += sum(1 for _ in f)
            except Exception:
                pass  # legacy file may not exist
        except Exception:
            pass  # best-effort: fall back to ID 1

        annotation.id = existing_count + 1

        # Write the annotation as its own object (a single PUT).
        ann_key = self._annotation_key(
            contract_fingerprint, run_id_str, annotation.id
        )
        try:
            with fs.open(f"s3://{ann_key}", "w") as f:
                f.write(annotation.to_json())
            return annotation.id
        except Exception as e:
            raise IOError(f"Failed to save annotation to S3: {e}") from e
|
|
438
|
+
|
|
439
|
+
    def get_annotations(
        self,
        run_id: int,
        rule_result_id: Optional[int] = None,
    ) -> List[Annotation]:
        """
        Get annotations for a run.

        Not resolvable on the S3 backend: an integer run ID alone cannot be
        mapped back to a contract fingerprint and storage key, so this
        always returns an empty list. Use get_run_with_annotations()
        instead, which takes the contract fingerprint.
        """
        return []
|
|
446
|
+
|
|
447
|
+
    def get_run_with_annotations(
        self,
        contract_fingerprint: str,
        run_id: Optional[int] = None,
    ) -> Optional[ValidationState]:
        """
        Get a validation state with its annotations loaded.

        Args:
            contract_fingerprint: The contract fingerprint.
            run_id: Synthetic integer run ID; when None, the latest run is
                used. Only the 100 most recent runs are searched for a match.

        Returns:
            The state with annotations attached, or None if no run matches.
        """
        # Get the state
        if run_id is None:
            state = self.get_latest(contract_fingerprint)
        else:
            states = self.get_history(contract_fingerprint, limit=100)
            state = None
            for s in states:
                if s.id == run_id:
                    state = s
                    break

        if not state:
            return None

        fs = self._get_fs()
        prefix = self._runs_prefix(contract_fingerprint)

        # Re-scan run files to recover the string run ID for this state:
        # the synthetic integer ID is derived from the filename, so we
        # reload each candidate until the IDs match.
        run_id_str = None
        try:
            all_files = fs.glob(f"s3://{prefix}/*.json")
            files = [f for f in all_files if not f.endswith(".ann.jsonl")]

            for filepath in files:
                loaded = self._load_state(filepath)
                if loaded and loaded.id == state.id:
                    run_id_str = filepath.rsplit("/", 1)[-1].replace(".json", "")
                    break
        except Exception:
            pass

        if not run_id_str:
            # No matching file found: return the state with empty annotations.
            state.annotations = []
            for rule in state.rules:
                rule.annotations = []
            return state

        # Load annotations (supports both legacy JSONL and new per-file format)
        annotations = self._load_annotations(
            fs, contract_fingerprint, run_id_str
        )

        self._attach_annotations_to_state(state, annotations)
        return state
|
|
497
|
+
|
|
498
|
+
    def get_history_with_annotations(
        self,
        contract_fingerprint: str,
        limit: int = 10,
    ) -> List[ValidationState]:
        """
        Get recent history with annotations loaded.

        Args:
            contract_fingerprint: The contract fingerprint.
            limit: Maximum number of runs to return (newest first).

        Returns:
            Validation states with annotations attached; states whose run
            file cannot be re-matched get empty annotation lists.
        """
        states = self.get_history(contract_fingerprint, limit=limit)

        fs = self._get_fs()
        prefix = self._runs_prefix(contract_fingerprint)

        # Build ID to run_id_str mapping by reloading each run file: the
        # synthetic integer ID is derived from the filename's run_id string.
        id_to_run_id: Dict[int, str] = {}
        try:
            all_files = fs.glob(f"s3://{prefix}/*.json")
            files = [f for f in all_files if not f.endswith(".ann.jsonl")]

            for filepath in files:
                loaded = self._load_state(filepath)
                if loaded and loaded.id:
                    run_id_str = filepath.rsplit("/", 1)[-1].replace(".json", "")
                    id_to_run_id[loaded.id] = run_id_str
        except Exception:
            pass  # best-effort: unmatched states get empty annotations below

        # Load annotations for each state
        for state in states:
            if state.id is None or state.id not in id_to_run_id:
                state.annotations = []
                for rule in state.rules:
                    rule.annotations = []
                continue

            run_id_str = id_to_run_id[state.id]

            # Load annotations (supports both legacy JSONL and new per-file format)
            annotations = self._load_annotations(
                fs, contract_fingerprint, run_id_str
            )

            self._attach_annotations_to_state(state, annotations)

        return states
|
|
541
|
+
|
|
542
|
+
def __repr__(self) -> str:
|
|
543
|
+
return f"S3Store(uri={self.uri})"
|