kontra 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. kontra/__init__.py +1871 -0
  2. kontra/api/__init__.py +22 -0
  3. kontra/api/compare.py +340 -0
  4. kontra/api/decorators.py +153 -0
  5. kontra/api/results.py +2121 -0
  6. kontra/api/rules.py +681 -0
  7. kontra/cli/__init__.py +0 -0
  8. kontra/cli/commands/__init__.py +1 -0
  9. kontra/cli/commands/config.py +153 -0
  10. kontra/cli/commands/diff.py +450 -0
  11. kontra/cli/commands/history.py +196 -0
  12. kontra/cli/commands/profile.py +289 -0
  13. kontra/cli/commands/validate.py +468 -0
  14. kontra/cli/constants.py +6 -0
  15. kontra/cli/main.py +48 -0
  16. kontra/cli/renderers.py +304 -0
  17. kontra/cli/utils.py +28 -0
  18. kontra/config/__init__.py +34 -0
  19. kontra/config/loader.py +127 -0
  20. kontra/config/models.py +49 -0
  21. kontra/config/settings.py +797 -0
  22. kontra/connectors/__init__.py +0 -0
  23. kontra/connectors/db_utils.py +251 -0
  24. kontra/connectors/detection.py +323 -0
  25. kontra/connectors/handle.py +368 -0
  26. kontra/connectors/postgres.py +127 -0
  27. kontra/connectors/sqlserver.py +226 -0
  28. kontra/engine/__init__.py +0 -0
  29. kontra/engine/backends/duckdb_session.py +227 -0
  30. kontra/engine/backends/duckdb_utils.py +18 -0
  31. kontra/engine/backends/polars_backend.py +47 -0
  32. kontra/engine/engine.py +1205 -0
  33. kontra/engine/executors/__init__.py +15 -0
  34. kontra/engine/executors/base.py +50 -0
  35. kontra/engine/executors/database_base.py +528 -0
  36. kontra/engine/executors/duckdb_sql.py +607 -0
  37. kontra/engine/executors/postgres_sql.py +162 -0
  38. kontra/engine/executors/registry.py +69 -0
  39. kontra/engine/executors/sqlserver_sql.py +163 -0
  40. kontra/engine/materializers/__init__.py +14 -0
  41. kontra/engine/materializers/base.py +42 -0
  42. kontra/engine/materializers/duckdb.py +110 -0
  43. kontra/engine/materializers/factory.py +22 -0
  44. kontra/engine/materializers/polars_connector.py +131 -0
  45. kontra/engine/materializers/postgres.py +157 -0
  46. kontra/engine/materializers/registry.py +138 -0
  47. kontra/engine/materializers/sqlserver.py +160 -0
  48. kontra/engine/result.py +15 -0
  49. kontra/engine/sql_utils.py +611 -0
  50. kontra/engine/sql_validator.py +609 -0
  51. kontra/engine/stats.py +194 -0
  52. kontra/engine/types.py +138 -0
  53. kontra/errors.py +533 -0
  54. kontra/logging.py +85 -0
  55. kontra/preplan/__init__.py +5 -0
  56. kontra/preplan/planner.py +253 -0
  57. kontra/preplan/postgres.py +179 -0
  58. kontra/preplan/sqlserver.py +191 -0
  59. kontra/preplan/types.py +24 -0
  60. kontra/probes/__init__.py +20 -0
  61. kontra/probes/compare.py +400 -0
  62. kontra/probes/relationship.py +283 -0
  63. kontra/reporters/__init__.py +0 -0
  64. kontra/reporters/json_reporter.py +190 -0
  65. kontra/reporters/rich_reporter.py +11 -0
  66. kontra/rules/__init__.py +35 -0
  67. kontra/rules/base.py +186 -0
  68. kontra/rules/builtin/__init__.py +40 -0
  69. kontra/rules/builtin/allowed_values.py +156 -0
  70. kontra/rules/builtin/compare.py +188 -0
  71. kontra/rules/builtin/conditional_not_null.py +213 -0
  72. kontra/rules/builtin/conditional_range.py +310 -0
  73. kontra/rules/builtin/contains.py +138 -0
  74. kontra/rules/builtin/custom_sql_check.py +182 -0
  75. kontra/rules/builtin/disallowed_values.py +140 -0
  76. kontra/rules/builtin/dtype.py +203 -0
  77. kontra/rules/builtin/ends_with.py +129 -0
  78. kontra/rules/builtin/freshness.py +240 -0
  79. kontra/rules/builtin/length.py +193 -0
  80. kontra/rules/builtin/max_rows.py +35 -0
  81. kontra/rules/builtin/min_rows.py +46 -0
  82. kontra/rules/builtin/not_null.py +121 -0
  83. kontra/rules/builtin/range.py +222 -0
  84. kontra/rules/builtin/regex.py +143 -0
  85. kontra/rules/builtin/starts_with.py +129 -0
  86. kontra/rules/builtin/unique.py +124 -0
  87. kontra/rules/condition_parser.py +203 -0
  88. kontra/rules/execution_plan.py +455 -0
  89. kontra/rules/factory.py +103 -0
  90. kontra/rules/predicates.py +25 -0
  91. kontra/rules/registry.py +24 -0
  92. kontra/rules/static_predicates.py +120 -0
  93. kontra/scout/__init__.py +9 -0
  94. kontra/scout/backends/__init__.py +17 -0
  95. kontra/scout/backends/base.py +111 -0
  96. kontra/scout/backends/duckdb_backend.py +359 -0
  97. kontra/scout/backends/postgres_backend.py +519 -0
  98. kontra/scout/backends/sqlserver_backend.py +577 -0
  99. kontra/scout/dtype_mapping.py +150 -0
  100. kontra/scout/patterns.py +69 -0
  101. kontra/scout/profiler.py +801 -0
  102. kontra/scout/reporters/__init__.py +39 -0
  103. kontra/scout/reporters/json_reporter.py +165 -0
  104. kontra/scout/reporters/markdown_reporter.py +152 -0
  105. kontra/scout/reporters/rich_reporter.py +144 -0
  106. kontra/scout/store.py +208 -0
  107. kontra/scout/suggest.py +200 -0
  108. kontra/scout/types.py +652 -0
  109. kontra/state/__init__.py +29 -0
  110. kontra/state/backends/__init__.py +79 -0
  111. kontra/state/backends/base.py +348 -0
  112. kontra/state/backends/local.py +480 -0
  113. kontra/state/backends/postgres.py +1010 -0
  114. kontra/state/backends/s3.py +543 -0
  115. kontra/state/backends/sqlserver.py +969 -0
  116. kontra/state/fingerprint.py +166 -0
  117. kontra/state/types.py +1061 -0
  118. kontra/version.py +1 -0
  119. kontra-0.5.2.dist-info/METADATA +122 -0
  120. kontra-0.5.2.dist-info/RECORD +124 -0
  121. kontra-0.5.2.dist-info/WHEEL +5 -0
  122. kontra-0.5.2.dist-info/entry_points.txt +2 -0
  123. kontra-0.5.2.dist-info/licenses/LICENSE +17 -0
  124. kontra-0.5.2.dist-info/top_level.txt +1 -0
kontra/scout/store.py ADDED
@@ -0,0 +1,208 @@
1
+ # src/kontra/scout/store.py
2
+ """
3
+ Profile storage for Kontra Scout.
4
+
5
+ Stores scout profiles using the same backend infrastructure as validation state.
6
+ Profiles are stored separately from validation states but can use the same
7
+ storage backend (local, S3, PostgreSQL).
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import hashlib
13
+ import json
14
+ from datetime import datetime, timezone
15
+ from pathlib import Path
16
+ from typing import Any, Dict, List, Optional
17
+
18
+ from kontra.version import VERSION
19
+ from .types import DatasetProfile, ProfileState
20
+
21
+
22
def fingerprint_source(source_uri: str) -> str:
    """
    Derive a short, deterministic identifier for a data source URI.

    Leading and trailing whitespace is stripped before hashing, so URIs that
    differ only in surrounding whitespace share a fingerprint.

    Args:
        source_uri: The data source URI

    Returns:
        The first 16 hex characters of the SHA-256 digest of the stripped URI
    """
    hasher = hashlib.sha256()
    hasher.update(source_uri.strip().encode("utf-8"))
    full_digest = hasher.hexdigest()
    return full_digest[:16]
37
+
38
+
39
class LocalProfileStore:
    """
    Filesystem-based profile storage.

    Stores profiles in .kontra/profiles/ directory:
        .kontra/profiles/<source_fingerprint>/<timestamp>.json
    """

    def __init__(self, base_path: Optional[str] = None):
        """
        Initialize the local profile store.

        Args:
            base_path: Base directory for profile storage.
                Defaults to .kontra/profiles/ in cwd (also used when an
                empty string is passed).
        """
        if base_path:
            self.base_path = Path(base_path)
        else:
            self.base_path = Path.cwd() / ".kontra" / "profiles"

    def _source_dir(self, source_fingerprint: str) -> Path:
        """Get the directory for a source's profiles."""
        return self.base_path / source_fingerprint

    def _profile_filename(self, profiled_at: str) -> str:
        """Generate filename from timestamp."""
        # Use ISO format but replace ":" and "+" for filesystem compatibility
        ts = profiled_at.replace(":", "-").replace("+", "_")
        return f"{ts}.json"

    def save(self, state: ProfileState) -> None:
        """
        Save a profile state to the filesystem.

        The JSON is written to a sibling ``.tmp`` file first and then moved
        over the final path, so readers never observe a half-written profile.
        Saving a state whose timestamp matches an existing file overwrites it.

        Args:
            state: The profile state to persist

        Raises:
            Exception: Whatever serialization or the filesystem raised; the
                temporary file is removed (best effort) before re-raising.
        """
        source_dir = self._source_dir(state.source_fingerprint)
        source_dir.mkdir(parents=True, exist_ok=True)

        filepath = source_dir / self._profile_filename(state.profiled_at)
        temp_path = filepath.with_suffix(".tmp")
        try:
            temp_path.write_text(state.to_json(), encoding="utf-8")
            # replace() overwrites an existing target on every platform;
            # rename() raises FileExistsError on Windows in that case.
            temp_path.replace(filepath)
        except Exception:
            # Best-effort cleanup that must not mask the original error
            try:
                temp_path.unlink()
            except OSError:
                pass
            raise

    def get_latest(self, source_fingerprint: str) -> Optional[ProfileState]:
        """Get the most recent readable profile for a source, or None."""
        history = self.get_history(source_fingerprint, limit=1)
        return history[0] if history else None

    def get_history(
        self,
        source_fingerprint: str,
        limit: int = 10,
    ) -> List[ProfileState]:
        """
        Get recent profile history for a source, newest first.

        Files are ordered by filename (descending). Files that cannot be read
        or parsed are skipped, and older files are consulted in their place so
        that up to ``limit`` valid profiles are returned.

        Args:
            source_fingerprint: Fingerprint of the source to look up
            limit: Maximum number of profiles to return; a non-positive
                value yields an empty list

        Returns:
            Up to ``limit`` profile states, newest first
        """
        if limit <= 0:
            return []

        source_dir = self._source_dir(source_fingerprint)
        if not source_dir.is_dir():
            return []

        profile_files = sorted(
            source_dir.glob("*.json"),
            key=lambda p: p.name,
            reverse=True,
        )

        states: List[ProfileState] = []
        for filepath in profile_files:
            try:
                content = filepath.read_text(encoding="utf-8")
                state = ProfileState.from_json(content)
            except Exception:
                # Deliberate best effort: one corrupt or unreadable file
                # must not hide the rest of the history.
                continue
            states.append(state)
            if len(states) >= limit:
                break

        return states

    def list_sources(self) -> List[str]:
        """List all source fingerprints with stored profiles."""
        if not self.base_path.exists():
            return []

        # Fingerprints are 16 characters long; ignore unrelated directories
        return sorted(
            item.name
            for item in self.base_path.iterdir()
            if item.is_dir() and len(item.name) == 16
        )

    def _clear_source_dir(self, source_dir: Path) -> int:
        """Delete every *.json file in one source directory and return the count."""
        if not source_dir.is_dir():
            return 0

        deleted = 0
        # Materialize the listing so deletion does not race the iterator
        for filepath in list(source_dir.glob("*.json")):
            filepath.unlink()
            deleted += 1

        try:
            source_dir.rmdir()
        except OSError:
            # Directory still holds non-profile files; leave it in place
            pass
        return deleted

    def clear(self, source_fingerprint: Optional[str] = None) -> int:
        """
        Clear stored profiles.

        Args:
            source_fingerprint: If given, only this source's profiles are
                removed; otherwise profiles of every source are removed.

        Returns:
            Number of profile files deleted
        """
        if source_fingerprint:
            return self._clear_source_dir(self._source_dir(source_fingerprint))

        if not self.base_path.exists():
            return 0

        return sum(
            self._clear_source_dir(entry)
            for entry in list(self.base_path.iterdir())
        )

    def __repr__(self) -> str:
        return f"LocalProfileStore(base_path={self.base_path})"
164
+
165
+
166
def create_profile_state(profile: DatasetProfile) -> ProfileState:
    """
    Wrap a DatasetProfile in a ProfileState record.

    The state carries the fingerprint of the profile's source URI, the URI
    and profiling timestamp copied from the profile, the profile itself, and
    the current engine VERSION.

    Args:
        profile: The profiled dataset

    Returns:
        ProfileState ready for storage
    """
    fingerprint = fingerprint_source(profile.source_uri)
    state = ProfileState(
        source_fingerprint=fingerprint,
        source_uri=profile.source_uri,
        profiled_at=profile.profiled_at,
        profile=profile,
        engine_version=VERSION,
    )
    return state
183
+
184
+
185
# Module-level default store, created on first request
_default_profile_store: Optional[LocalProfileStore] = None


def get_default_profile_store() -> LocalProfileStore:
    """Return the shared default profile store, creating it on first use."""
    global _default_profile_store
    store = _default_profile_store
    if store is None:
        store = LocalProfileStore()
        _default_profile_store = store
    return store
195
+
196
+
197
def get_profile_store(backend: str = "local") -> LocalProfileStore:
    """
    Resolve a profile store from a backend identifier.

    Only local storage exists today, so every identifier (including an
    empty one) resolves to the default local store.
    Future: implement S3ProfileStore, PostgresProfileStore.
    """
    # The identifier is accepted for forward compatibility but does not yet
    # influence which store is returned.
    return get_default_profile_store()
@@ -0,0 +1,200 @@
1
+ # src/kontra/scout/suggest.py
2
+ """
3
+ Generate suggested validation rules based on profile analysis.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from typing import Any, Dict, List
9
+
10
+ from .patterns import get_pattern_regex
11
+ from .types import DatasetProfile, ColumnProfile
12
+
13
+
14
def generate_rules(profile: DatasetProfile) -> List[Dict[str, Any]]:
    """
    Collect suggested validation rules for every column of a profile.

    Columns are visited in profile order and each column's suggestions are
    appended in the order they were produced.

    Returns a list of rule dictionaries in Kontra contract format.
    """
    return [
        suggestion
        for column in profile.columns
        for suggestion in _suggest_column_rules(column)
    ]
27
+
28
+
29
def generate_rules_yaml(profile: DatasetProfile) -> str:
    """
    Generate YAML-formatted contract with suggested rules.

    The output starts with a comment header, then ``name``, ``description``
    and ``dataset`` keys, then a ``rules:`` list. A ``min_rows`` rule at 90%
    of the profiled row count is emitted first when that threshold is
    positive. Rules with at most two simple params are written as an inline
    mapping; anything else is written as a nested block.

    Args:
        profile: The profiled dataset to derive a contract from

    Returns:
        A complete contract YAML that can be used directly with
        `kontra validate`, or a stub with ``rules: []`` when nothing is
        suggested.
    """
    rules = generate_rules(profile)

    if not rules:
        return "# No rules suggested - dataset may be empty or all columns have high null rates\nrules: []"

    def _one_line(text: Any) -> str:
        # A raw line break would terminate the YAML comment early and leak
        # the remainder into the document body.
        return " ".join(str(text).splitlines())

    # Escape the URI so it remains a valid double-quoted YAML scalar even for
    # Windows paths (backslashes) or URIs that contain quotes.
    dataset_uri = (
        str(profile.source_uri)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )

    lines = [
        "# Auto-generated contract from Kontra Scout",
        f"# Source: {_one_line(profile.source_uri)}",
        f"# Rows analyzed: {profile.row_count:,}",
        f"# Generated by Kontra Scout v{profile.engine_version}",
        "",
        "name: suggested_contract",
        "description: Auto-generated from Scout profile",
        "",
        f'dataset: "{dataset_uri}"',
        "",
        "rules:",
    ]

    # Add min_rows suggestion (90% of current count)
    min_threshold = int(profile.row_count * 0.9)
    if min_threshold > 0:
        lines.append(f"  # Dataset: expecting at least {min_threshold:,} rows (90% of current)")
        lines.append("  - name: min_rows")
        lines.append(f"    params: {{ threshold: {min_threshold} }}")
        lines.append("")

    for rule in rules:
        name = rule["name"]
        params = rule.get("params", {})
        comment = rule.get("_comment", "")

        if comment:
            lines.append(f"  # {_one_line(comment)}")

        lines.append(f"  - name: {name}")

        if params:
            # Inline form only for small mappings whose values are scalars
            # or short lists; everything else gets one key per line.
            fits_inline = len(params) <= 2 and all(
                not isinstance(v, (list, dict)) or (isinstance(v, list) and len(v) <= 5)
                for v in params.values()
            )
            if fits_inline:
                params_str = ", ".join(_format_param(k, v) for k, v in params.items())
                lines.append(f"    params: {{ {params_str} }}")
            else:
                # Keys must be indented deeper than "params:" to nest under it
                lines.append("    params:")
                lines.extend(
                    f"      {k}: {_format_value(v)}" for k, v in params.items()
                )

        lines.append("")

    return "\n".join(lines)
90
+
91
+
92
+ def _suggest_column_rules(col: ColumnProfile) -> List[Dict[str, Any]]:
93
+ """Generate rules for a single column based on its profile."""
94
+ rules: List[Dict[str, Any]] = []
95
+
96
+ # 1. not_null: Suggest if column has 0% nulls
97
+ if col.null_rate == 0 and col.row_count > 0:
98
+ rules.append({
99
+ "name": "not_null",
100
+ "params": {"column": col.name},
101
+ "_comment": f"{col.name}: 100% non-null",
102
+ })
103
+
104
+ # 2. unique: Suggest only for columns that look like identifiers
105
+ is_identifier = col.semantic_type == "identifier"
106
+ name_hints = any(hint in col.name.lower() for hint in ["_id", "id", "key", "uuid", "guid", "pk", "code"])
107
+ if col.uniqueness_ratio >= 0.999 and col.null_rate == 0 and (is_identifier or name_hints):
108
+ rules.append({
109
+ "name": "unique",
110
+ "params": {"column": col.name},
111
+ "_comment": f"{col.name}: 100% unique (primary key candidate)",
112
+ })
113
+
114
+ # 3. allowed_values: Suggest for low-cardinality columns with known values
115
+ if col.is_low_cardinality and col.values and len(col.values) <= 20:
116
+ rules.append({
117
+ "name": "allowed_values",
118
+ "params": {
119
+ "column": col.name,
120
+ "values": col.values,
121
+ },
122
+ "_comment": f"{col.name}: low cardinality ({len(col.values)} values)",
123
+ })
124
+
125
+ # 4. dtype: Suggest type validation
126
+ if col.dtype in ("int", "float", "bool", "date", "datetime", "string"):
127
+ # Map to Kontra/Polars type names
128
+ type_map = {
129
+ "int": "int64",
130
+ "float": "float64",
131
+ "bool": "bool",
132
+ "date": "date",
133
+ "datetime": "datetime",
134
+ "string": "utf8",
135
+ }
136
+ kontra_type = type_map.get(col.dtype, col.dtype)
137
+ rules.append({
138
+ "name": "dtype",
139
+ "params": {
140
+ "column": col.name,
141
+ "type": kontra_type,
142
+ },
143
+ "_comment": f"{col.name}: detected type {col.dtype_raw}",
144
+ })
145
+
146
+ # 5. regex: Suggest for detected patterns
147
+ for pattern in col.detected_patterns:
148
+ regex = get_pattern_regex(pattern)
149
+ if regex:
150
+ rules.append({
151
+ "name": "regex",
152
+ "params": {
153
+ "column": col.name,
154
+ "pattern": regex,
155
+ },
156
+ "_comment": f"{col.name}: detected {pattern} pattern",
157
+ })
158
+
159
+ # 6. freshness: Suggest for timestamp columns that look like update times
160
+ if col.dtype in ("datetime", "timestamp") or col.temporal is not None:
161
+ # Check if column name suggests it's an update/modified timestamp
162
+ name_lower = col.name.lower()
163
+ freshness_hints = ["updated", "modified", "timestamp", "created_at", "updated_at", "last_"]
164
+ if any(hint in name_lower for hint in freshness_hints):
165
+ rules.append({
166
+ "name": "freshness",
167
+ "params": {
168
+ "column": col.name,
169
+ "max_age": "7d",
170
+ },
171
+ "_comment": f"{col.name}: timestamp column, adjust max_age as needed",
172
+ })
173
+
174
+ return rules
175
+
176
+
177
def _format_param(key: str, value: Any) -> str:
    """Render one ``key: value`` pair for use inside an inline YAML mapping."""
    rendered_value = _format_value(value)
    return "{}: {}".format(key, rendered_value)
180
+
181
+
182
+ def _format_value(value: Any) -> str:
183
+ """Format a value for YAML output."""
184
+ if isinstance(value, str):
185
+ # Escape quotes and use quotes if needed
186
+ if any(c in value for c in ['"', "'", ":", "{", "}", "[", "]", ",", "\n"]):
187
+ escaped = value.replace("\\", "\\\\").replace('"', '\\"')
188
+ return f'"{escaped}"'
189
+ return f'"{value}"'
190
+ elif isinstance(value, bool):
191
+ return "true" if value else "false"
192
+ elif isinstance(value, (int, float)):
193
+ return str(value)
194
+ elif isinstance(value, list):
195
+ items = ", ".join(_format_value(v) for v in value)
196
+ return f"[{items}]"
197
+ elif value is None:
198
+ return "null"
199
+ else:
200
+ return str(value)