kontra-0.5.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kontra/__init__.py +1871 -0
- kontra/api/__init__.py +22 -0
- kontra/api/compare.py +340 -0
- kontra/api/decorators.py +153 -0
- kontra/api/results.py +2121 -0
- kontra/api/rules.py +681 -0
- kontra/cli/__init__.py +0 -0
- kontra/cli/commands/__init__.py +1 -0
- kontra/cli/commands/config.py +153 -0
- kontra/cli/commands/diff.py +450 -0
- kontra/cli/commands/history.py +196 -0
- kontra/cli/commands/profile.py +289 -0
- kontra/cli/commands/validate.py +468 -0
- kontra/cli/constants.py +6 -0
- kontra/cli/main.py +48 -0
- kontra/cli/renderers.py +304 -0
- kontra/cli/utils.py +28 -0
- kontra/config/__init__.py +34 -0
- kontra/config/loader.py +127 -0
- kontra/config/models.py +49 -0
- kontra/config/settings.py +797 -0
- kontra/connectors/__init__.py +0 -0
- kontra/connectors/db_utils.py +251 -0
- kontra/connectors/detection.py +323 -0
- kontra/connectors/handle.py +368 -0
- kontra/connectors/postgres.py +127 -0
- kontra/connectors/sqlserver.py +226 -0
- kontra/engine/__init__.py +0 -0
- kontra/engine/backends/duckdb_session.py +227 -0
- kontra/engine/backends/duckdb_utils.py +18 -0
- kontra/engine/backends/polars_backend.py +47 -0
- kontra/engine/engine.py +1205 -0
- kontra/engine/executors/__init__.py +15 -0
- kontra/engine/executors/base.py +50 -0
- kontra/engine/executors/database_base.py +528 -0
- kontra/engine/executors/duckdb_sql.py +607 -0
- kontra/engine/executors/postgres_sql.py +162 -0
- kontra/engine/executors/registry.py +69 -0
- kontra/engine/executors/sqlserver_sql.py +163 -0
- kontra/engine/materializers/__init__.py +14 -0
- kontra/engine/materializers/base.py +42 -0
- kontra/engine/materializers/duckdb.py +110 -0
- kontra/engine/materializers/factory.py +22 -0
- kontra/engine/materializers/polars_connector.py +131 -0
- kontra/engine/materializers/postgres.py +157 -0
- kontra/engine/materializers/registry.py +138 -0
- kontra/engine/materializers/sqlserver.py +160 -0
- kontra/engine/result.py +15 -0
- kontra/engine/sql_utils.py +611 -0
- kontra/engine/sql_validator.py +609 -0
- kontra/engine/stats.py +194 -0
- kontra/engine/types.py +138 -0
- kontra/errors.py +533 -0
- kontra/logging.py +85 -0
- kontra/preplan/__init__.py +5 -0
- kontra/preplan/planner.py +253 -0
- kontra/preplan/postgres.py +179 -0
- kontra/preplan/sqlserver.py +191 -0
- kontra/preplan/types.py +24 -0
- kontra/probes/__init__.py +20 -0
- kontra/probes/compare.py +400 -0
- kontra/probes/relationship.py +283 -0
- kontra/reporters/__init__.py +0 -0
- kontra/reporters/json_reporter.py +190 -0
- kontra/reporters/rich_reporter.py +11 -0
- kontra/rules/__init__.py +35 -0
- kontra/rules/base.py +186 -0
- kontra/rules/builtin/__init__.py +40 -0
- kontra/rules/builtin/allowed_values.py +156 -0
- kontra/rules/builtin/compare.py +188 -0
- kontra/rules/builtin/conditional_not_null.py +213 -0
- kontra/rules/builtin/conditional_range.py +310 -0
- kontra/rules/builtin/contains.py +138 -0
- kontra/rules/builtin/custom_sql_check.py +182 -0
- kontra/rules/builtin/disallowed_values.py +140 -0
- kontra/rules/builtin/dtype.py +203 -0
- kontra/rules/builtin/ends_with.py +129 -0
- kontra/rules/builtin/freshness.py +240 -0
- kontra/rules/builtin/length.py +193 -0
- kontra/rules/builtin/max_rows.py +35 -0
- kontra/rules/builtin/min_rows.py +46 -0
- kontra/rules/builtin/not_null.py +121 -0
- kontra/rules/builtin/range.py +222 -0
- kontra/rules/builtin/regex.py +143 -0
- kontra/rules/builtin/starts_with.py +129 -0
- kontra/rules/builtin/unique.py +124 -0
- kontra/rules/condition_parser.py +203 -0
- kontra/rules/execution_plan.py +455 -0
- kontra/rules/factory.py +103 -0
- kontra/rules/predicates.py +25 -0
- kontra/rules/registry.py +24 -0
- kontra/rules/static_predicates.py +120 -0
- kontra/scout/__init__.py +9 -0
- kontra/scout/backends/__init__.py +17 -0
- kontra/scout/backends/base.py +111 -0
- kontra/scout/backends/duckdb_backend.py +359 -0
- kontra/scout/backends/postgres_backend.py +519 -0
- kontra/scout/backends/sqlserver_backend.py +577 -0
- kontra/scout/dtype_mapping.py +150 -0
- kontra/scout/patterns.py +69 -0
- kontra/scout/profiler.py +801 -0
- kontra/scout/reporters/__init__.py +39 -0
- kontra/scout/reporters/json_reporter.py +165 -0
- kontra/scout/reporters/markdown_reporter.py +152 -0
- kontra/scout/reporters/rich_reporter.py +144 -0
- kontra/scout/store.py +208 -0
- kontra/scout/suggest.py +200 -0
- kontra/scout/types.py +652 -0
- kontra/state/__init__.py +29 -0
- kontra/state/backends/__init__.py +79 -0
- kontra/state/backends/base.py +348 -0
- kontra/state/backends/local.py +480 -0
- kontra/state/backends/postgres.py +1010 -0
- kontra/state/backends/s3.py +543 -0
- kontra/state/backends/sqlserver.py +969 -0
- kontra/state/fingerprint.py +166 -0
- kontra/state/types.py +1061 -0
- kontra/version.py +1 -0
- kontra-0.5.2.dist-info/METADATA +122 -0
- kontra-0.5.2.dist-info/RECORD +124 -0
- kontra-0.5.2.dist-info/WHEEL +5 -0
- kontra-0.5.2.dist-info/entry_points.txt +2 -0
- kontra-0.5.2.dist-info/licenses/LICENSE +17 -0
- kontra-0.5.2.dist-info/top_level.txt +1 -0
kontra/scout/types.py
ADDED
@@ -0,0 +1,652 @@
# src/kontra/scout/types.py
"""
Data types for Kontra Scout profiling results.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional


@dataclass
class NumericStats:
    """Statistics for numeric columns."""

    min: Optional[float] = None
    max: Optional[float] = None
    mean: Optional[float] = None
    median: Optional[float] = None
    std: Optional[float] = None
    percentiles: Dict[str, float] = field(default_factory=dict)  # {"p25": ..., "p50": ..., ...}

    def to_dict(self) -> Dict[str, Any]:
        return {
            "min": self.min,
            "max": self.max,
            "mean": self.mean,
            "median": self.median,
            "std": self.std,
            "percentiles": self.percentiles,
        }


@dataclass
class StringStats:
    """Statistics for string columns."""

    min_length: Optional[int] = None
    max_length: Optional[int] = None
    avg_length: Optional[float] = None
    empty_count: int = 0

    def to_dict(self) -> Dict[str, Any]:
        return {
            "min_length": self.min_length,
            "max_length": self.max_length,
            "avg_length": self.avg_length,
            "empty_count": self.empty_count,
        }


@dataclass
class TemporalStats:
    """Statistics for date/datetime columns."""

    date_min: Optional[str] = None  # ISO format
    date_max: Optional[str] = None  # ISO format

    def to_dict(self) -> Dict[str, Any]:
        return {
            "date_min": self.date_min,
            "date_max": self.date_max,
        }


@dataclass
class TopValue:
    """A frequently occurring value with its count."""

    value: Any
    count: int
    pct: float  # Percentage of total rows

    def to_dict(self) -> Dict[str, Any]:
        return {
            "value": self.value,
            "count": self.count,
            "pct": round(self.pct, 2),
        }


@dataclass
class ColumnProfile:
    """Complete profile for a single column."""

    name: str
    dtype: str  # Normalized: string/int/float/bool/date/datetime/binary/unknown
    dtype_raw: str  # Original DuckDB/Polars type string

    # Counts
    row_count: int = 0
    null_count: int = 0
    null_rate: float = 0.0  # null_count / row_count
    distinct_count: int = 0
    uniqueness_ratio: float = 0.0  # distinct / non_null_count

    # Cardinality analysis
    is_low_cardinality: bool = False
    values: Optional[List[Any]] = None  # All values if low cardinality
    top_values: List[TopValue] = field(default_factory=list)

    # Type-specific stats
    numeric: Optional[NumericStats] = None
    string: Optional[StringStats] = None
    temporal: Optional[TemporalStats] = None

    # Pattern detection (optional)
    detected_patterns: List[str] = field(default_factory=list)

    # Semantic type inference
    semantic_type: Optional[str] = None  # identifier/category/measure/timestamp

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for JSON serialization."""
        d: Dict[str, Any] = {
            "name": self.name,
            "dtype": self.dtype,
            "dtype_raw": self.dtype_raw,
            "counts": {
                "rows": self.row_count,
                "nulls": self.null_count,
                "null_rate": round(self.null_rate, 4),
                "distinct": self.distinct_count,
                "uniqueness_ratio": round(self.uniqueness_ratio, 4),
            },
            "cardinality": {
                "is_low": self.is_low_cardinality,
                "values": self.values,
                "top_values": [tv.to_dict() for tv in self.top_values],
            },
        }

        if self.numeric:
            d["numeric_stats"] = self.numeric.to_dict()
        if self.string:
            d["string_stats"] = self.string.to_dict()
        if self.temporal:
            d["temporal_stats"] = self.temporal.to_dict()
        if self.detected_patterns:
            d["patterns"] = self.detected_patterns
        if self.semantic_type:
            d["semantic_type"] = self.semantic_type

        return d


@dataclass
class DatasetProfile:
    """Complete profile for a dataset."""

    # Metadata
    source_uri: str
    source_format: str  # "parquet", "csv"
    profiled_at: str  # ISO timestamp
    engine_version: str

    # Dataset-level stats
    row_count: int = 0
    column_count: int = 0
    estimated_size_bytes: Optional[int] = None

    # Sampling info
    sampled: bool = False
    sample_size: Optional[int] = None

    # Columns
    columns: List[ColumnProfile] = field(default_factory=list)

    # Timing
    profile_duration_ms: int = 0

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for JSON serialization."""
        return {
            "schema_version": "1.0",
            "source_uri": self.source_uri,
            "source_format": self.source_format,
            "profiled_at": self.profiled_at,
            "engine_version": self.engine_version,
            "dataset": {
                "row_count": self.row_count,
                "column_count": self.column_count,
                "estimated_size_bytes": self.estimated_size_bytes,
                "sampled": self.sampled,
                "sample_size": self.sample_size,
            },
            "columns": [c.to_dict() for c in self.columns],
            "profile_duration_ms": self.profile_duration_ms,
        }

    def get_column(self, name: str) -> Optional[ColumnProfile]:
        """Get a column profile by name."""
        for col in self.columns:
            if col.name == name:
                return col
        return None

    def to_llm(self) -> str:
        """Token-optimized format for LLM context."""
        lines = []
        lines.append(f"PROFILE: {self.source_uri}")
        lines.append(f"rows={self.row_count:,} cols={self.column_count}")
        if self.sampled:
            lines.append(f"(sampled: {self.sample_size:,} rows)")

        lines.append("")
        lines.append("COLUMNS:")
        for col in self.columns[:20]:  # Limit to 20 columns
            parts = [f"  {col.name} ({col.dtype})"]
            if col.null_count > 0:
                parts.append(f"nulls={col.null_count:,} ({col.null_rate:.1%})")
            if col.distinct_count is not None:
                parts.append(f"distinct={col.distinct_count:,}")
            if col.numeric:
                if col.numeric.min is not None and col.numeric.max is not None:
                    parts.append(f"range=[{col.numeric.min}, {col.numeric.max}]")
            if col.top_values:
                top = col.top_values[0]
                parts.append(f"top='{top.value}'({top.count:,})")
            lines.append(" ".join(parts))

        if len(self.columns) > 20:
            lines.append(f"  ... +{len(self.columns) - 20} more columns")

        return "\n".join(lines)

    @classmethod
    def from_dict(cls, d: Dict[str, Any]) -> "DatasetProfile":
        """Create from dictionary."""
        ds = d.get("dataset", {})
        cols_data = d.get("columns", [])

        columns = []
        for c in cols_data:
            counts = c.get("counts", {})
            card = c.get("cardinality", {})

            # Parse top values
            top_values = []
            for tv in card.get("top_values", []):
                top_values.append(TopValue(
                    value=tv.get("value"),
                    count=tv.get("count", 0),
                    pct=tv.get("pct", 0.0),
                ))

            # Parse type-specific stats
            numeric = None
            if "numeric_stats" in c:
                ns = c["numeric_stats"]
                numeric = NumericStats(
                    min=ns.get("min"),
                    max=ns.get("max"),
                    mean=ns.get("mean"),
                    median=ns.get("median"),
                    std=ns.get("std"),
                    percentiles=ns.get("percentiles", {}),
                )

            string = None
            if "string_stats" in c:
                ss = c["string_stats"]
                string = StringStats(
                    min_length=ss.get("min_length"),
                    max_length=ss.get("max_length"),
                    avg_length=ss.get("avg_length"),
                    empty_count=ss.get("empty_count", 0),
                )

            temporal = None
            if "temporal_stats" in c:
                ts = c["temporal_stats"]
                temporal = TemporalStats(
                    date_min=ts.get("date_min"),
                    date_max=ts.get("date_max"),
                )

            columns.append(ColumnProfile(
                name=c.get("name", ""),
                dtype=c.get("dtype", "unknown"),
                dtype_raw=c.get("dtype_raw", ""),
                row_count=counts.get("rows", 0),
                null_count=counts.get("nulls", 0),
                null_rate=counts.get("null_rate", 0.0),
                distinct_count=counts.get("distinct", 0),
                uniqueness_ratio=counts.get("uniqueness_ratio", 0.0),
                is_low_cardinality=card.get("is_low", False),
                values=card.get("values"),
                top_values=top_values,
                numeric=numeric,
                string=string,
                temporal=temporal,
                detected_patterns=c.get("patterns", []),
                semantic_type=c.get("semantic_type"),
            ))

        return cls(
            source_uri=d.get("source_uri", ""),
            source_format=d.get("source_format", ""),
            profiled_at=d.get("profiled_at", ""),
            engine_version=d.get("engine_version", ""),
            row_count=ds.get("row_count", 0),
            column_count=ds.get("column_count", 0),
            estimated_size_bytes=ds.get("estimated_size_bytes"),
            sampled=ds.get("sampled", False),
            sample_size=ds.get("sample_size"),
            columns=columns,
            profile_duration_ms=d.get("profile_duration_ms", 0),
        )


@dataclass
class ProfileState:
    """
    Persistent state for a scout profile.

    Similar to ValidationState, enables tracking profile changes over time.
    """

    # Identity
    source_fingerprint: str  # Hash of source URI
    source_uri: str

    # Timing
    profiled_at: str  # ISO timestamp

    # The actual profile
    profile: DatasetProfile

    # Metadata
    schema_version: str = "1.0"
    engine_version: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for JSON serialization."""
        return {
            "schema_version": self.schema_version,
            "engine_version": self.engine_version,
            "source_fingerprint": self.source_fingerprint,
            "source_uri": self.source_uri,
            "profiled_at": self.profiled_at,
            "profile": self.profile.to_dict(),
        }

    @classmethod
    def from_dict(cls, d: Dict[str, Any]) -> "ProfileState":
        """Create from dictionary."""
        return cls(
            schema_version=d.get("schema_version", "1.0"),
            engine_version=d.get("engine_version", ""),
            source_fingerprint=d["source_fingerprint"],
            source_uri=d["source_uri"],
            profiled_at=d["profiled_at"],
            profile=DatasetProfile.from_dict(d["profile"]),
        )

    def to_json(self, indent: int = 2) -> str:
        """Serialize to JSON string."""
        import json
        return json.dumps(self.to_dict(), indent=indent, default=str)

    @classmethod
    def from_json(cls, json_str: str) -> "ProfileState":
        """Deserialize from JSON string."""
        import json
        return cls.from_dict(json.loads(json_str))


@dataclass
class ColumnDiff:
    """Diff for a single column between two profiles."""

    column_name: str
    change_type: str  # "added", "removed", "changed", "unchanged"

    # For changed columns
    null_rate_before: Optional[float] = None
    null_rate_after: Optional[float] = None
    null_rate_delta: Optional[float] = None

    distinct_count_before: Optional[int] = None
    distinct_count_after: Optional[int] = None
    distinct_count_delta: Optional[int] = None

    dtype_before: Optional[str] = None
    dtype_after: Optional[str] = None
    dtype_changed: bool = False

    # Value distribution changes
    new_values: List[Any] = field(default_factory=list)
    removed_values: List[Any] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        d: Dict[str, Any] = {
            "column": self.column_name,
            "change_type": self.change_type,
        }
        if self.change_type == "changed":
            if self.null_rate_delta is not None and abs(self.null_rate_delta) > 0.001:
                d["null_rate"] = {
                    "before": self.null_rate_before,
                    "after": self.null_rate_after,
                    "delta": round(self.null_rate_delta, 4),
                }
            if self.distinct_count_delta is not None and self.distinct_count_delta != 0:
                d["distinct_count"] = {
                    "before": self.distinct_count_before,
                    "after": self.distinct_count_after,
                    "delta": self.distinct_count_delta,
                }
            if self.dtype_changed:
                d["dtype"] = {
                    "before": self.dtype_before,
                    "after": self.dtype_after,
                }
            if self.new_values:
                d["new_values"] = self.new_values[:10]  # Limit
            if self.removed_values:
                d["removed_values"] = self.removed_values[:10]
        return d


@dataclass
class ProfileDiff:
    """Diff between two scout profiles."""

    before: ProfileState
    after: ProfileState

    # Dataset-level changes
    row_count_before: int = 0
    row_count_after: int = 0
    row_count_delta: int = 0
    row_count_pct_change: float = 0.0

    column_count_before: int = 0
    column_count_after: int = 0

    # Column-level changes
    columns_added: List[str] = field(default_factory=list)
    columns_removed: List[str] = field(default_factory=list)
    columns_changed: List[ColumnDiff] = field(default_factory=list)

    # Significant changes summary
    null_rate_increases: List[ColumnDiff] = field(default_factory=list)
    null_rate_decreases: List[ColumnDiff] = field(default_factory=list)
    cardinality_changes: List[ColumnDiff] = field(default_factory=list)
    dtype_changes: List[ColumnDiff] = field(default_factory=list)

    @property
    def has_changes(self) -> bool:
        """Check if there are any meaningful changes."""
        return bool(
            self.columns_added
            or self.columns_removed
            or self.columns_changed
            or abs(self.row_count_delta) > 0
        )

    @property
    def has_schema_changes(self) -> bool:
        """Check if there are schema-level changes."""
        return bool(
            self.columns_added
            or self.columns_removed
            or self.dtype_changes
        )

    @classmethod
    def compute(cls, before: ProfileState, after: ProfileState) -> "ProfileDiff":
        """Compute diff between two profile states."""
        diff = cls(before=before, after=after)

        # Dataset-level
        diff.row_count_before = before.profile.row_count
        diff.row_count_after = after.profile.row_count
        diff.row_count_delta = after.profile.row_count - before.profile.row_count
        if before.profile.row_count > 0:
            diff.row_count_pct_change = (diff.row_count_delta / before.profile.row_count) * 100

        diff.column_count_before = before.profile.column_count
        diff.column_count_after = after.profile.column_count

        # Build column maps
        before_cols = {c.name: c for c in before.profile.columns}
        after_cols = {c.name: c for c in after.profile.columns}

        before_names = set(before_cols.keys())
        after_names = set(after_cols.keys())

        # Added/removed columns
        diff.columns_added = sorted(after_names - before_names)
        diff.columns_removed = sorted(before_names - after_names)

        # Changed columns
        common_cols = before_names & after_names
        for col_name in sorted(common_cols):
            bc = before_cols[col_name]
            ac = after_cols[col_name]

            col_diff = ColumnDiff(
                column_name=col_name,
                change_type="unchanged",
            )

            changed = False

            # Null rate change
            null_delta = ac.null_rate - bc.null_rate
            if abs(null_delta) > 0.001:  # > 0.1% change
                col_diff.null_rate_before = bc.null_rate
                col_diff.null_rate_after = ac.null_rate
                col_diff.null_rate_delta = null_delta
                changed = True

                if null_delta > 0.01:  # > 1% increase
                    diff.null_rate_increases.append(col_diff)
                elif null_delta < -0.01:  # > 1% decrease
                    diff.null_rate_decreases.append(col_diff)

            # Distinct count change
            distinct_delta = ac.distinct_count - bc.distinct_count
            if distinct_delta != 0:
                col_diff.distinct_count_before = bc.distinct_count
                col_diff.distinct_count_after = ac.distinct_count
                col_diff.distinct_count_delta = distinct_delta
                changed = True

                # Significant cardinality change (>10%)
                if bc.distinct_count > 0:
                    pct_change = abs(distinct_delta / bc.distinct_count)
                    if pct_change > 0.1:
                        diff.cardinality_changes.append(col_diff)

            # Dtype change
            if bc.dtype != ac.dtype:
                col_diff.dtype_before = bc.dtype
                col_diff.dtype_after = ac.dtype
                col_diff.dtype_changed = True
                changed = True
                diff.dtype_changes.append(col_diff)

            # Value distribution changes (if low cardinality)
            if bc.values and ac.values:
                before_vals = set(bc.values) if bc.values else set()
                after_vals = set(ac.values) if ac.values else set()
                col_diff.new_values = list(after_vals - before_vals)
                col_diff.removed_values = list(before_vals - after_vals)
                if col_diff.new_values or col_diff.removed_values:
                    changed = True

            if changed:
                col_diff.change_type = "changed"
                diff.columns_changed.append(col_diff)

        return diff

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary."""
        return {
            "before": {
                "source_uri": self.before.source_uri,
                "profiled_at": self.before.profiled_at,
                "row_count": self.row_count_before,
                "column_count": self.column_count_before,
            },
            "after": {
                "source_uri": self.after.source_uri,
                "profiled_at": self.after.profiled_at,
                "row_count": self.row_count_after,
                "column_count": self.column_count_after,
            },
            "changes": {
                "row_count_delta": self.row_count_delta,
                "row_count_pct_change": round(self.row_count_pct_change, 2),
                "columns_added": self.columns_added,
                "columns_removed": self.columns_removed,
                "columns_changed": [c.to_dict() for c in self.columns_changed],
            },
            "significant": {
                "null_rate_increases": [c.column_name for c in self.null_rate_increases],
                "null_rate_decreases": [c.column_name for c in self.null_rate_decreases],
                "cardinality_changes": [c.column_name for c in self.cardinality_changes],
                "dtype_changes": [c.column_name for c in self.dtype_changes],
            },
        }

    def to_json(self, indent: int = 2) -> str:
        """Serialize to JSON."""
        import json
        return json.dumps(self.to_dict(), indent=indent, default=str)

    def to_llm(self) -> str:
        """Render diff in token-optimized format for LLM context."""
        lines = []

        # Header
        lines.append(f"# Profile Diff: {self.after.source_uri}")
        lines.append(f"comparing: {self.before.profiled_at[:10]} → {self.after.profiled_at[:10]}")

        # Row count
        if self.row_count_delta != 0:
            sign = "+" if self.row_count_delta > 0 else ""
            lines.append(f"rows: {self.row_count_before:,} → {self.row_count_after:,} ({sign}{self.row_count_delta:,}, {self.row_count_pct_change:+.1f}%)")
        else:
            lines.append(f"rows: {self.row_count_after:,} (unchanged)")

        # Schema changes
        if self.columns_added:
            lines.append(f"\n## Columns Added ({len(self.columns_added)})")
            for col in self.columns_added[:10]:
                lines.append(f"- {col}")

        if self.columns_removed:
            lines.append(f"\n## Columns Removed ({len(self.columns_removed)})")
            for col in self.columns_removed[:10]:
                lines.append(f"- {col}")

        # Significant changes
        if self.dtype_changes:
            lines.append(f"\n## Type Changes ({len(self.dtype_changes)})")
            for cd in self.dtype_changes[:10]:
                lines.append(f"- {cd.column_name}: {cd.dtype_before} → {cd.dtype_after}")

        if self.null_rate_increases:
            lines.append(f"\n## Null Rate Increases ({len(self.null_rate_increases)})")
            for cd in self.null_rate_increases[:10]:
                lines.append(f"- {cd.column_name}: {cd.null_rate_before:.1%} → {cd.null_rate_after:.1%}")

        if self.cardinality_changes:
            lines.append(f"\n## Cardinality Changes ({len(self.cardinality_changes)})")
            for cd in self.cardinality_changes[:10]:
                sign = "+" if cd.distinct_count_delta > 0 else ""
                lines.append(f"- {cd.column_name}: {cd.distinct_count_before:,} → {cd.distinct_count_after:,} ({sign}{cd.distinct_count_delta:,})")

        # Other column changes
        other_changes = [c for c in self.columns_changed if c not in self.dtype_changes and c not in self.null_rate_increases and c not in self.cardinality_changes]
        if other_changes:
            lines.append(f"\n## Other Changes ({len(other_changes)})")
            for cd in other_changes[:10]:
                parts = [cd.column_name]
                if cd.new_values:
                    parts.append(f"+{len(cd.new_values)} values")
                if cd.removed_values:
                    parts.append(f"-{len(cd.removed_values)} values")
                lines.append(f"- {' | '.join(parts)}")

        if not self.has_changes:
            lines.append("\n✓ No significant changes detected")

        lines.append(f"\nfingerprint: {self.after.source_fingerprint}")
        return "\n".join(lines)
kontra/state/__init__.py
ADDED
@@ -0,0 +1,29 @@
# src/kontra/state/__init__.py
"""
Kontra State Management - Validation state persistence and comparison.

Enables time-based reasoning for agentic workflows by tracking validation
results across runs.
"""

from .types import ValidationState, RuleState, StateSummary, StateDiff, RuleDiff, FailureMode, Severity
from .fingerprint import fingerprint_contract, fingerprint_dataset
from .backends import StateBackend, LocalStore, get_default_store

__all__ = [
    # Types
    "ValidationState",
    "RuleState",
    "StateSummary",
    "StateDiff",
    "RuleDiff",
    "FailureMode",
    "Severity",
    # Fingerprinting
    "fingerprint_contract",
    "fingerprint_dataset",
    # Backends
    "StateBackend",
    "LocalStore",
    "get_default_store",
]
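These re-exports make kontra.state the public import surface for state persistence. A quick sketch of that surface; the names are confirmed by this file, but the inline notes are inferred from the __all__ grouping and submodule names, and nothing beyond the imports is shown in this diff:

# Import from the package root rather than reaching into submodules.
from kontra.state import (
    ValidationState,       # per-run validation state (.types)
    StateDiff,             # run-to-run comparison (.types)
    fingerprint_contract,  # contract identity hash (.fingerprint)
    fingerprint_dataset,   # dataset identity hash (.fingerprint)
    StateBackend,          # storage interface (.backends)
    LocalStore,            # local backend (.backends)
    get_default_store,     # backend factory (.backends)
)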