kontra-0.5.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. kontra/__init__.py +1871 -0
  2. kontra/api/__init__.py +22 -0
  3. kontra/api/compare.py +340 -0
  4. kontra/api/decorators.py +153 -0
  5. kontra/api/results.py +2121 -0
  6. kontra/api/rules.py +681 -0
  7. kontra/cli/__init__.py +0 -0
  8. kontra/cli/commands/__init__.py +1 -0
  9. kontra/cli/commands/config.py +153 -0
  10. kontra/cli/commands/diff.py +450 -0
  11. kontra/cli/commands/history.py +196 -0
  12. kontra/cli/commands/profile.py +289 -0
  13. kontra/cli/commands/validate.py +468 -0
  14. kontra/cli/constants.py +6 -0
  15. kontra/cli/main.py +48 -0
  16. kontra/cli/renderers.py +304 -0
  17. kontra/cli/utils.py +28 -0
  18. kontra/config/__init__.py +34 -0
  19. kontra/config/loader.py +127 -0
  20. kontra/config/models.py +49 -0
  21. kontra/config/settings.py +797 -0
  22. kontra/connectors/__init__.py +0 -0
  23. kontra/connectors/db_utils.py +251 -0
  24. kontra/connectors/detection.py +323 -0
  25. kontra/connectors/handle.py +368 -0
  26. kontra/connectors/postgres.py +127 -0
  27. kontra/connectors/sqlserver.py +226 -0
  28. kontra/engine/__init__.py +0 -0
  29. kontra/engine/backends/duckdb_session.py +227 -0
  30. kontra/engine/backends/duckdb_utils.py +18 -0
  31. kontra/engine/backends/polars_backend.py +47 -0
  32. kontra/engine/engine.py +1205 -0
  33. kontra/engine/executors/__init__.py +15 -0
  34. kontra/engine/executors/base.py +50 -0
  35. kontra/engine/executors/database_base.py +528 -0
  36. kontra/engine/executors/duckdb_sql.py +607 -0
  37. kontra/engine/executors/postgres_sql.py +162 -0
  38. kontra/engine/executors/registry.py +69 -0
  39. kontra/engine/executors/sqlserver_sql.py +163 -0
  40. kontra/engine/materializers/__init__.py +14 -0
  41. kontra/engine/materializers/base.py +42 -0
  42. kontra/engine/materializers/duckdb.py +110 -0
  43. kontra/engine/materializers/factory.py +22 -0
  44. kontra/engine/materializers/polars_connector.py +131 -0
  45. kontra/engine/materializers/postgres.py +157 -0
  46. kontra/engine/materializers/registry.py +138 -0
  47. kontra/engine/materializers/sqlserver.py +160 -0
  48. kontra/engine/result.py +15 -0
  49. kontra/engine/sql_utils.py +611 -0
  50. kontra/engine/sql_validator.py +609 -0
  51. kontra/engine/stats.py +194 -0
  52. kontra/engine/types.py +138 -0
  53. kontra/errors.py +533 -0
  54. kontra/logging.py +85 -0
  55. kontra/preplan/__init__.py +5 -0
  56. kontra/preplan/planner.py +253 -0
  57. kontra/preplan/postgres.py +179 -0
  58. kontra/preplan/sqlserver.py +191 -0
  59. kontra/preplan/types.py +24 -0
  60. kontra/probes/__init__.py +20 -0
  61. kontra/probes/compare.py +400 -0
  62. kontra/probes/relationship.py +283 -0
  63. kontra/reporters/__init__.py +0 -0
  64. kontra/reporters/json_reporter.py +190 -0
  65. kontra/reporters/rich_reporter.py +11 -0
  66. kontra/rules/__init__.py +35 -0
  67. kontra/rules/base.py +186 -0
  68. kontra/rules/builtin/__init__.py +40 -0
  69. kontra/rules/builtin/allowed_values.py +156 -0
  70. kontra/rules/builtin/compare.py +188 -0
  71. kontra/rules/builtin/conditional_not_null.py +213 -0
  72. kontra/rules/builtin/conditional_range.py +310 -0
  73. kontra/rules/builtin/contains.py +138 -0
  74. kontra/rules/builtin/custom_sql_check.py +182 -0
  75. kontra/rules/builtin/disallowed_values.py +140 -0
  76. kontra/rules/builtin/dtype.py +203 -0
  77. kontra/rules/builtin/ends_with.py +129 -0
  78. kontra/rules/builtin/freshness.py +240 -0
  79. kontra/rules/builtin/length.py +193 -0
  80. kontra/rules/builtin/max_rows.py +35 -0
  81. kontra/rules/builtin/min_rows.py +46 -0
  82. kontra/rules/builtin/not_null.py +121 -0
  83. kontra/rules/builtin/range.py +222 -0
  84. kontra/rules/builtin/regex.py +143 -0
  85. kontra/rules/builtin/starts_with.py +129 -0
  86. kontra/rules/builtin/unique.py +124 -0
  87. kontra/rules/condition_parser.py +203 -0
  88. kontra/rules/execution_plan.py +455 -0
  89. kontra/rules/factory.py +103 -0
  90. kontra/rules/predicates.py +25 -0
  91. kontra/rules/registry.py +24 -0
  92. kontra/rules/static_predicates.py +120 -0
  93. kontra/scout/__init__.py +9 -0
  94. kontra/scout/backends/__init__.py +17 -0
  95. kontra/scout/backends/base.py +111 -0
  96. kontra/scout/backends/duckdb_backend.py +359 -0
  97. kontra/scout/backends/postgres_backend.py +519 -0
  98. kontra/scout/backends/sqlserver_backend.py +577 -0
  99. kontra/scout/dtype_mapping.py +150 -0
  100. kontra/scout/patterns.py +69 -0
  101. kontra/scout/profiler.py +801 -0
  102. kontra/scout/reporters/__init__.py +39 -0
  103. kontra/scout/reporters/json_reporter.py +165 -0
  104. kontra/scout/reporters/markdown_reporter.py +152 -0
  105. kontra/scout/reporters/rich_reporter.py +144 -0
  106. kontra/scout/store.py +208 -0
  107. kontra/scout/suggest.py +200 -0
  108. kontra/scout/types.py +652 -0
  109. kontra/state/__init__.py +29 -0
  110. kontra/state/backends/__init__.py +79 -0
  111. kontra/state/backends/base.py +348 -0
  112. kontra/state/backends/local.py +480 -0
  113. kontra/state/backends/postgres.py +1010 -0
  114. kontra/state/backends/s3.py +543 -0
  115. kontra/state/backends/sqlserver.py +969 -0
  116. kontra/state/fingerprint.py +166 -0
  117. kontra/state/types.py +1061 -0
  118. kontra/version.py +1 -0
  119. kontra-0.5.2.dist-info/METADATA +122 -0
  120. kontra-0.5.2.dist-info/RECORD +124 -0
  121. kontra-0.5.2.dist-info/WHEEL +5 -0
  122. kontra-0.5.2.dist-info/entry_points.txt +2 -0
  123. kontra-0.5.2.dist-info/licenses/LICENSE +17 -0
  124. kontra-0.5.2.dist-info/top_level.txt +1 -0
kontra/scout/types.py ADDED
@@ -0,0 +1,652 @@
+ # src/kontra/scout/types.py
+ """
+ Data types for Kontra Scout profiling results.
+ """
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from typing import Any, Dict, List, Optional
+
+
+ @dataclass
+ class NumericStats:
+     """Statistics for numeric columns."""
+
+     min: Optional[float] = None
+     max: Optional[float] = None
+     mean: Optional[float] = None
+     median: Optional[float] = None
+     std: Optional[float] = None
+     percentiles: Dict[str, float] = field(default_factory=dict)  # {"p25": ..., "p50": ..., ...}
+
+     def to_dict(self) -> Dict[str, Any]:
+         return {
+             "min": self.min,
+             "max": self.max,
+             "mean": self.mean,
+             "median": self.median,
+             "std": self.std,
+             "percentiles": self.percentiles,
+         }
+
+
+ @dataclass
+ class StringStats:
+     """Statistics for string columns."""
+
+     min_length: Optional[int] = None
+     max_length: Optional[int] = None
+     avg_length: Optional[float] = None
+     empty_count: int = 0
+
+     def to_dict(self) -> Dict[str, Any]:
+         return {
+             "min_length": self.min_length,
+             "max_length": self.max_length,
+             "avg_length": self.avg_length,
+             "empty_count": self.empty_count,
+         }
+
+
+ @dataclass
+ class TemporalStats:
+     """Statistics for date/datetime columns."""
+
+     date_min: Optional[str] = None  # ISO format
+     date_max: Optional[str] = None  # ISO format
+
+     def to_dict(self) -> Dict[str, Any]:
+         return {
+             "date_min": self.date_min,
+             "date_max": self.date_max,
+         }
+
+
+ @dataclass
+ class TopValue:
+     """A frequently occurring value with its count."""
+
+     value: Any
+     count: int
+     pct: float  # Percentage of total rows
+
+     def to_dict(self) -> Dict[str, Any]:
+         return {
+             "value": self.value,
+             "count": self.count,
+             "pct": round(self.pct, 2),
+         }
+
+
+ @dataclass
+ class ColumnProfile:
+     """Complete profile for a single column."""
+
+     name: str
+     dtype: str  # Normalized: string/int/float/bool/date/datetime/binary/unknown
+     dtype_raw: str  # Original DuckDB/Polars type string
+
+     # Counts
+     row_count: int = 0
+     null_count: int = 0
+     null_rate: float = 0.0  # null_count / row_count
+     distinct_count: int = 0
+     uniqueness_ratio: float = 0.0  # distinct / non_null_count
+
+     # Cardinality analysis
+     is_low_cardinality: bool = False
+     values: Optional[List[Any]] = None  # All values if low cardinality
+     top_values: List[TopValue] = field(default_factory=list)
+
+     # Type-specific stats
+     numeric: Optional[NumericStats] = None
+     string: Optional[StringStats] = None
+     temporal: Optional[TemporalStats] = None
+
+     # Pattern detection (optional)
+     detected_patterns: List[str] = field(default_factory=list)
+
+     # Semantic type inference
+     semantic_type: Optional[str] = None  # identifier/category/measure/timestamp
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary for JSON serialization."""
+         d: Dict[str, Any] = {
+             "name": self.name,
+             "dtype": self.dtype,
+             "dtype_raw": self.dtype_raw,
+             "counts": {
+                 "rows": self.row_count,
+                 "nulls": self.null_count,
+                 "null_rate": round(self.null_rate, 4),
+                 "distinct": self.distinct_count,
+                 "uniqueness_ratio": round(self.uniqueness_ratio, 4),
+             },
+             "cardinality": {
+                 "is_low": self.is_low_cardinality,
+                 "values": self.values,
+                 "top_values": [tv.to_dict() for tv in self.top_values],
+             },
+         }
+
+         if self.numeric:
+             d["numeric_stats"] = self.numeric.to_dict()
+         if self.string:
+             d["string_stats"] = self.string.to_dict()
+         if self.temporal:
+             d["temporal_stats"] = self.temporal.to_dict()
+         if self.detected_patterns:
+             d["patterns"] = self.detected_patterns
+         if self.semantic_type:
+             d["semantic_type"] = self.semantic_type
+
+         return d
+
+
+ @dataclass
+ class DatasetProfile:
+     """Complete profile for a dataset."""
+
+     # Metadata
+     source_uri: str
+     source_format: str  # "parquet", "csv"
+     profiled_at: str  # ISO timestamp
+     engine_version: str
+
+     # Dataset-level stats
+     row_count: int = 0
+     column_count: int = 0
+     estimated_size_bytes: Optional[int] = None
+
+     # Sampling info
+     sampled: bool = False
+     sample_size: Optional[int] = None
+
+     # Columns
+     columns: List[ColumnProfile] = field(default_factory=list)
+
+     # Timing
+     profile_duration_ms: int = 0
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary for JSON serialization."""
+         return {
+             "schema_version": "1.0",
+             "source_uri": self.source_uri,
+             "source_format": self.source_format,
+             "profiled_at": self.profiled_at,
+             "engine_version": self.engine_version,
+             "dataset": {
+                 "row_count": self.row_count,
+                 "column_count": self.column_count,
+                 "estimated_size_bytes": self.estimated_size_bytes,
+                 "sampled": self.sampled,
+                 "sample_size": self.sample_size,
+             },
+             "columns": [c.to_dict() for c in self.columns],
+             "profile_duration_ms": self.profile_duration_ms,
+         }
+
+     def get_column(self, name: str) -> Optional[ColumnProfile]:
+         """Get a column profile by name."""
+         for col in self.columns:
+             if col.name == name:
+                 return col
+         return None
+
+     def to_llm(self) -> str:
+         """Token-optimized format for LLM context."""
+         lines = []
+         lines.append(f"PROFILE: {self.source_uri}")
+         lines.append(f"rows={self.row_count:,} cols={self.column_count}")
+         if self.sampled:
+             lines.append(f"(sampled: {self.sample_size:,} rows)")
+
+         lines.append("")
+         lines.append("COLUMNS:")
+         for col in self.columns[:20]:  # Limit to 20 columns
+             parts = [f" {col.name} ({col.dtype})"]
+             if col.null_count > 0:
+                 parts.append(f"nulls={col.null_count:,} ({col.null_rate:.1%})")
+             if col.distinct_count is not None:
+                 parts.append(f"distinct={col.distinct_count:,}")
+             if col.numeric:
+                 if col.numeric.min is not None and col.numeric.max is not None:
+                     parts.append(f"range=[{col.numeric.min}, {col.numeric.max}]")
+             if col.top_values:
+                 top = col.top_values[0]
+                 parts.append(f"top='{top.value}'({top.count:,})")
+             lines.append(" ".join(parts))
+
+         if len(self.columns) > 20:
+             lines.append(f" ... +{len(self.columns) - 20} more columns")
+
+         return "\n".join(lines)
+
+     @classmethod
+     def from_dict(cls, d: Dict[str, Any]) -> "DatasetProfile":
+         """Create from dictionary."""
+         ds = d.get("dataset", {})
+         cols_data = d.get("columns", [])
+
+         columns = []
+         for c in cols_data:
+             counts = c.get("counts", {})
+             card = c.get("cardinality", {})
+
+             # Parse top values
+             top_values = []
+             for tv in card.get("top_values", []):
+                 top_values.append(TopValue(
+                     value=tv.get("value"),
+                     count=tv.get("count", 0),
+                     pct=tv.get("pct", 0.0),
+                 ))
+
+             # Parse type-specific stats
+             numeric = None
+             if "numeric_stats" in c:
+                 ns = c["numeric_stats"]
+                 numeric = NumericStats(
+                     min=ns.get("min"),
+                     max=ns.get("max"),
+                     mean=ns.get("mean"),
+                     median=ns.get("median"),
+                     std=ns.get("std"),
+                     percentiles=ns.get("percentiles", {}),
+                 )
+
+             string = None
+             if "string_stats" in c:
+                 ss = c["string_stats"]
+                 string = StringStats(
+                     min_length=ss.get("min_length"),
+                     max_length=ss.get("max_length"),
+                     avg_length=ss.get("avg_length"),
+                     empty_count=ss.get("empty_count", 0),
+                 )
+
+             temporal = None
+             if "temporal_stats" in c:
+                 ts = c["temporal_stats"]
+                 temporal = TemporalStats(
+                     date_min=ts.get("date_min"),
+                     date_max=ts.get("date_max"),
+                 )
+
+             columns.append(ColumnProfile(
+                 name=c.get("name", ""),
+                 dtype=c.get("dtype", "unknown"),
+                 dtype_raw=c.get("dtype_raw", ""),
+                 row_count=counts.get("rows", 0),
+                 null_count=counts.get("nulls", 0),
+                 null_rate=counts.get("null_rate", 0.0),
+                 distinct_count=counts.get("distinct", 0),
+                 uniqueness_ratio=counts.get("uniqueness_ratio", 0.0),
+                 is_low_cardinality=card.get("is_low", False),
+                 values=card.get("values"),
+                 top_values=top_values,
+                 numeric=numeric,
+                 string=string,
+                 temporal=temporal,
+                 detected_patterns=c.get("patterns", []),
+                 semantic_type=c.get("semantic_type"),
+             ))
+
+         return cls(
+             source_uri=d.get("source_uri", ""),
+             source_format=d.get("source_format", ""),
+             profiled_at=d.get("profiled_at", ""),
+             engine_version=d.get("engine_version", ""),
+             row_count=ds.get("row_count", 0),
+             column_count=ds.get("column_count", 0),
+             estimated_size_bytes=ds.get("estimated_size_bytes"),
+             sampled=ds.get("sampled", False),
+             sample_size=ds.get("sample_size"),
+             columns=columns,
+             profile_duration_ms=d.get("profile_duration_ms", 0),
+         )
+
+
+ @dataclass
+ class ProfileState:
+     """
+     Persistent state for a scout profile.
+
+     Similar to ValidationState, enables tracking profile changes over time.
+     """
+
+     # Identity
+     source_fingerprint: str  # Hash of source URI
+     source_uri: str
+
+     # Timing
+     profiled_at: str  # ISO timestamp
+
+     # The actual profile
+     profile: DatasetProfile
+
+     # Metadata
+     schema_version: str = "1.0"
+     engine_version: str = ""
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary for JSON serialization."""
+         return {
+             "schema_version": self.schema_version,
+             "engine_version": self.engine_version,
+             "source_fingerprint": self.source_fingerprint,
+             "source_uri": self.source_uri,
+             "profiled_at": self.profiled_at,
+             "profile": self.profile.to_dict(),
+         }
+
+     @classmethod
+     def from_dict(cls, d: Dict[str, Any]) -> "ProfileState":
+         """Create from dictionary."""
+         return cls(
+             schema_version=d.get("schema_version", "1.0"),
+             engine_version=d.get("engine_version", ""),
+             source_fingerprint=d["source_fingerprint"],
+             source_uri=d["source_uri"],
+             profiled_at=d["profiled_at"],
+             profile=DatasetProfile.from_dict(d["profile"]),
+         )
+
+     def to_json(self, indent: int = 2) -> str:
+         """Serialize to JSON string."""
+         import json
+         return json.dumps(self.to_dict(), indent=indent, default=str)
+
+     @classmethod
+     def from_json(cls, json_str: str) -> "ProfileState":
+         """Deserialize from JSON string."""
+         import json
+         return cls.from_dict(json.loads(json_str))
+
+
+ @dataclass
+ class ColumnDiff:
+     """Diff for a single column between two profiles."""
+
+     column_name: str
+     change_type: str  # "added", "removed", "changed", "unchanged"
+
+     # For changed columns
+     null_rate_before: Optional[float] = None
+     null_rate_after: Optional[float] = None
+     null_rate_delta: Optional[float] = None
+
+     distinct_count_before: Optional[int] = None
+     distinct_count_after: Optional[int] = None
+     distinct_count_delta: Optional[int] = None
+
+     dtype_before: Optional[str] = None
+     dtype_after: Optional[str] = None
+     dtype_changed: bool = False
+
+     # Value distribution changes
+     new_values: List[Any] = field(default_factory=list)
+     removed_values: List[Any] = field(default_factory=list)
+
+     def to_dict(self) -> Dict[str, Any]:
+         d: Dict[str, Any] = {
+             "column": self.column_name,
+             "change_type": self.change_type,
+         }
+         if self.change_type == "changed":
+             if self.null_rate_delta is not None and abs(self.null_rate_delta) > 0.001:
+                 d["null_rate"] = {
+                     "before": self.null_rate_before,
+                     "after": self.null_rate_after,
+                     "delta": round(self.null_rate_delta, 4),
+                 }
+             if self.distinct_count_delta is not None and self.distinct_count_delta != 0:
+                 d["distinct_count"] = {
+                     "before": self.distinct_count_before,
+                     "after": self.distinct_count_after,
+                     "delta": self.distinct_count_delta,
+                 }
+             if self.dtype_changed:
+                 d["dtype"] = {
+                     "before": self.dtype_before,
+                     "after": self.dtype_after,
+                 }
+             if self.new_values:
+                 d["new_values"] = self.new_values[:10]  # Limit
+             if self.removed_values:
+                 d["removed_values"] = self.removed_values[:10]
+         return d
+
+
+ @dataclass
+ class ProfileDiff:
+     """Diff between two scout profiles."""
+
+     before: ProfileState
+     after: ProfileState
+
+     # Dataset-level changes
+     row_count_before: int = 0
+     row_count_after: int = 0
+     row_count_delta: int = 0
+     row_count_pct_change: float = 0.0
+
+     column_count_before: int = 0
+     column_count_after: int = 0
+
+     # Column-level changes
+     columns_added: List[str] = field(default_factory=list)
+     columns_removed: List[str] = field(default_factory=list)
+     columns_changed: List[ColumnDiff] = field(default_factory=list)
+
+     # Significant changes summary
+     null_rate_increases: List[ColumnDiff] = field(default_factory=list)
+     null_rate_decreases: List[ColumnDiff] = field(default_factory=list)
+     cardinality_changes: List[ColumnDiff] = field(default_factory=list)
+     dtype_changes: List[ColumnDiff] = field(default_factory=list)
+
+     @property
+     def has_changes(self) -> bool:
+         """Check if there are any meaningful changes."""
+         return bool(
+             self.columns_added
+             or self.columns_removed
+             or self.columns_changed
+             or abs(self.row_count_delta) > 0
+         )
+
+     @property
+     def has_schema_changes(self) -> bool:
+         """Check if there are schema-level changes."""
+         return bool(
+             self.columns_added
+             or self.columns_removed
+             or self.dtype_changes
+         )
+
+     @classmethod
+     def compute(cls, before: ProfileState, after: ProfileState) -> "ProfileDiff":
+         """Compute diff between two profile states."""
+         diff = cls(before=before, after=after)
+
+         # Dataset-level
+         diff.row_count_before = before.profile.row_count
+         diff.row_count_after = after.profile.row_count
+         diff.row_count_delta = after.profile.row_count - before.profile.row_count
+         if before.profile.row_count > 0:
+             diff.row_count_pct_change = (diff.row_count_delta / before.profile.row_count) * 100
+
+         diff.column_count_before = before.profile.column_count
+         diff.column_count_after = after.profile.column_count
+
+         # Build column maps
+         before_cols = {c.name: c for c in before.profile.columns}
+         after_cols = {c.name: c for c in after.profile.columns}
+
+         before_names = set(before_cols.keys())
+         after_names = set(after_cols.keys())
+
+         # Added/removed columns
+         diff.columns_added = sorted(after_names - before_names)
+         diff.columns_removed = sorted(before_names - after_names)
+
+         # Changed columns
+         common_cols = before_names & after_names
+         for col_name in sorted(common_cols):
+             bc = before_cols[col_name]
+             ac = after_cols[col_name]
+
+             col_diff = ColumnDiff(
+                 column_name=col_name,
+                 change_type="unchanged",
+             )
+
+             changed = False
+
+             # Null rate change
+             null_delta = ac.null_rate - bc.null_rate
+             if abs(null_delta) > 0.001:  # > 0.1% change
+                 col_diff.null_rate_before = bc.null_rate
+                 col_diff.null_rate_after = ac.null_rate
+                 col_diff.null_rate_delta = null_delta
+                 changed = True
+
+                 if null_delta > 0.01:  # > 1% increase
+                     diff.null_rate_increases.append(col_diff)
+                 elif null_delta < -0.01:  # > 1% decrease
+                     diff.null_rate_decreases.append(col_diff)
+
+             # Distinct count change
+             distinct_delta = ac.distinct_count - bc.distinct_count
+             if distinct_delta != 0:
+                 col_diff.distinct_count_before = bc.distinct_count
+                 col_diff.distinct_count_after = ac.distinct_count
+                 col_diff.distinct_count_delta = distinct_delta
+                 changed = True
+
+                 # Significant cardinality change (>10%)
+                 if bc.distinct_count > 0:
+                     pct_change = abs(distinct_delta / bc.distinct_count)
+                     if pct_change > 0.1:
+                         diff.cardinality_changes.append(col_diff)
+
+             # Dtype change
+             if bc.dtype != ac.dtype:
+                 col_diff.dtype_before = bc.dtype
+                 col_diff.dtype_after = ac.dtype
+                 col_diff.dtype_changed = True
+                 changed = True
+                 diff.dtype_changes.append(col_diff)
+
+             # Value distribution changes (if low cardinality)
+             if bc.values and ac.values:
+                 before_vals = set(bc.values) if bc.values else set()
+                 after_vals = set(ac.values) if ac.values else set()
+                 col_diff.new_values = list(after_vals - before_vals)
+                 col_diff.removed_values = list(before_vals - after_vals)
+                 if col_diff.new_values or col_diff.removed_values:
+                     changed = True
+
+             if changed:
+                 col_diff.change_type = "changed"
+                 diff.columns_changed.append(col_diff)
+
+         return diff
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary."""
+         return {
+             "before": {
+                 "source_uri": self.before.source_uri,
+                 "profiled_at": self.before.profiled_at,
+                 "row_count": self.row_count_before,
+                 "column_count": self.column_count_before,
+             },
+             "after": {
+                 "source_uri": self.after.source_uri,
+                 "profiled_at": self.after.profiled_at,
+                 "row_count": self.row_count_after,
+                 "column_count": self.column_count_after,
+             },
+             "changes": {
+                 "row_count_delta": self.row_count_delta,
+                 "row_count_pct_change": round(self.row_count_pct_change, 2),
+                 "columns_added": self.columns_added,
+                 "columns_removed": self.columns_removed,
+                 "columns_changed": [c.to_dict() for c in self.columns_changed],
+             },
+             "significant": {
+                 "null_rate_increases": [c.column_name for c in self.null_rate_increases],
+                 "null_rate_decreases": [c.column_name for c in self.null_rate_decreases],
+                 "cardinality_changes": [c.column_name for c in self.cardinality_changes],
+                 "dtype_changes": [c.column_name for c in self.dtype_changes],
+             },
+         }
+
+     def to_json(self, indent: int = 2) -> str:
+         """Serialize to JSON."""
+         import json
+         return json.dumps(self.to_dict(), indent=indent, default=str)
+
+     def to_llm(self) -> str:
+         """Render diff in token-optimized format for LLM context."""
+         lines = []
+
+         # Header
+         lines.append(f"# Profile Diff: {self.after.source_uri}")
+         lines.append(f"comparing: {self.before.profiled_at[:10]} → {self.after.profiled_at[:10]}")
+
+         # Row count
+         if self.row_count_delta != 0:
+             sign = "+" if self.row_count_delta > 0 else ""
+             lines.append(f"rows: {self.row_count_before:,} → {self.row_count_after:,} ({sign}{self.row_count_delta:,}, {self.row_count_pct_change:+.1f}%)")
+         else:
+             lines.append(f"rows: {self.row_count_after:,} (unchanged)")
+
+         # Schema changes
+         if self.columns_added:
+             lines.append(f"\n## Columns Added ({len(self.columns_added)})")
+             for col in self.columns_added[:10]:
+                 lines.append(f"- {col}")
+
+         if self.columns_removed:
+             lines.append(f"\n## Columns Removed ({len(self.columns_removed)})")
+             for col in self.columns_removed[:10]:
+                 lines.append(f"- {col}")
+
+         # Significant changes
+         if self.dtype_changes:
+             lines.append(f"\n## Type Changes ({len(self.dtype_changes)})")
+             for cd in self.dtype_changes[:10]:
+                 lines.append(f"- {cd.column_name}: {cd.dtype_before} → {cd.dtype_after}")
+
+         if self.null_rate_increases:
+             lines.append(f"\n## Null Rate Increases ({len(self.null_rate_increases)})")
+             for cd in self.null_rate_increases[:10]:
+                 lines.append(f"- {cd.column_name}: {cd.null_rate_before:.1%} → {cd.null_rate_after:.1%}")
+
+         if self.cardinality_changes:
+             lines.append(f"\n## Cardinality Changes ({len(self.cardinality_changes)})")
+             for cd in self.cardinality_changes[:10]:
+                 sign = "+" if cd.distinct_count_delta > 0 else ""
+                 lines.append(f"- {cd.column_name}: {cd.distinct_count_before:,} → {cd.distinct_count_after:,} ({sign}{cd.distinct_count_delta:,})")
+
+         # Other column changes
+         other_changes = [c for c in self.columns_changed if c not in self.dtype_changes and c not in self.null_rate_increases and c not in self.cardinality_changes]
+         if other_changes:
+             lines.append(f"\n## Other Changes ({len(other_changes)})")
+             for cd in other_changes[:10]:
+                 parts = [cd.column_name]
+                 if cd.new_values:
+                     parts.append(f"+{len(cd.new_values)} values")
+                 if cd.removed_values:
+                     parts.append(f"-{len(cd.removed_values)} values")
+                 lines.append(f"- {' | '.join(parts)}")
+
+         if not self.has_changes:
+             lines.append("\n✓ No significant changes detected")
+
+         lines.append(f"\nfingerprint: {self.after.source_fingerprint}")
+         return "\n".join(lines)
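
Illustrative usage (not part of the packaged file): a minimal sketch that builds two small ProfileState objects by hand and computes a ProfileDiff. The field names and methods come from the definitions above; the URIs, timestamps, and values are invented, and real profiles are presumably produced by kontra/scout/profiler.py rather than constructed manually.

from kontra.scout.types import ColumnProfile, DatasetProfile, ProfileDiff, ProfileState

# Two hand-built snapshots of the same (invented) dataset at different times.
before = ProfileState(
    source_fingerprint="abc123",           # invented fingerprint
    source_uri="data/orders.parquet",      # invented path
    profiled_at="2024-01-01T00:00:00Z",
    profile=DatasetProfile(
        source_uri="data/orders.parquet",
        source_format="parquet",
        profiled_at="2024-01-01T00:00:00Z",
        engine_version="0.5.2",
        row_count=1000,
        column_count=1,
        columns=[
            ColumnProfile(
                name="status", dtype="string", dtype_raw="VARCHAR",
                row_count=1000, distinct_count=3, is_low_cardinality=True,
                values=["new", "paid", "shipped"],
            )
        ],
    ),
)

after = ProfileState(
    source_fingerprint="abc123",
    source_uri="data/orders.parquet",
    profiled_at="2024-02-01T00:00:00Z",
    profile=DatasetProfile(
        source_uri="data/orders.parquet",
        source_format="parquet",
        profiled_at="2024-02-01T00:00:00Z",
        engine_version="0.5.2",
        row_count=1200,
        column_count=1,
        columns=[
            ColumnProfile(
                name="status", dtype="string", dtype_raw="VARCHAR",
                row_count=1200, distinct_count=4, is_low_cardinality=True,
                values=["new", "paid", "shipped", "refunded"],
            )
        ],
    ),
)

diff = ProfileDiff.compute(before, after)
print(diff.has_changes)          # True: row count grew and 'status' gained a value
print(diff.to_llm())             # compact summary intended for LLM context
restored = ProfileState.from_json(after.to_json())   # JSON round-trip via to_dict/from_dict
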
kontra/state/__init__.py ADDED
@@ -0,0 +1,29 @@
+ # src/kontra/state/__init__.py
+ """
+ Kontra State Management - Validation state persistence and comparison.
+
+ Enables time-based reasoning for agentic workflows by tracking validation
+ results across runs.
+ """
+
+ from .types import ValidationState, RuleState, StateSummary, StateDiff, RuleDiff, FailureMode, Severity
+ from .fingerprint import fingerprint_contract, fingerprint_dataset
+ from .backends import StateBackend, LocalStore, get_default_store
+
+ __all__ = [
+     # Types
+     "ValidationState",
+     "RuleState",
+     "StateSummary",
+     "StateDiff",
+     "RuleDiff",
+     "FailureMode",
+     "Severity",
+     # Fingerprinting
+     "fingerprint_contract",
+     "fingerprint_dataset",
+     # Backends
+     "StateBackend",
+     "LocalStore",
+     "get_default_store",
+ ]
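
For orientation, a hypothetical import sketch of the surface this module re-exports; the names are taken directly from the __all__ list above, while the underlying definitions live in kontra/state/types.py, kontra/state/fingerprint.py, and kontra/state/backends/ (not shown in this excerpt), so no call signatures are assumed.

# Names only; the classes and functions themselves are defined elsewhere in the package.
from kontra.state import (
    ValidationState,
    StateDiff,
    fingerprint_contract,
    fingerprint_dataset,
    LocalStore,
    get_default_store,
)
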