duckguard 3.0.1__py3-none-any.whl → 3.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +1 -1
- duckguard/ai/__init__.py +33 -0
- duckguard/ai/config.py +201 -0
- duckguard/ai/explainer.py +109 -0
- duckguard/ai/fixer.py +105 -0
- duckguard/ai/natural_language.py +119 -0
- duckguard/ai/rules_generator.py +121 -0
- duckguard/checks/conditional.py +4 -3
- duckguard/cli/main.py +480 -93
- duckguard/core/column.py +15 -5
- duckguard/core/result.py +35 -14
- duckguard/profiler/auto_profile.py +217 -64
- duckguard/py.typed +0 -0
- duckguard/reports/html_reporter.py +522 -37
- duckguard/reports/pdf_reporter.py +33 -5
- duckguard/semantic/detector.py +18 -7
- duckguard-3.2.0.dist-info/METADATA +1206 -0
- {duckguard-3.0.1.dist-info → duckguard-3.2.0.dist-info}/RECORD +22 -14
- duckguard-3.2.0.dist-info/licenses/LICENSE +190 -0
- duckguard-3.2.0.dist-info/licenses/NOTICE +7 -0
- duckguard-3.0.1.dist-info/METADATA +0 -1072
- duckguard-3.0.1.dist-info/licenses/LICENSE +0 -55
- {duckguard-3.0.1.dist-info → duckguard-3.2.0.dist-info}/WHEEL +0 -0
- {duckguard-3.0.1.dist-info → duckguard-3.2.0.dist-info}/entry_points.txt +0 -0
duckguard/__init__.py
CHANGED
duckguard/ai/__init__.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""AI-powered data quality features for DuckGuard.
|
|
2
|
+
|
|
3
|
+
This module provides LLM-powered data quality capabilities:
|
|
4
|
+
- explain: Natural language data quality summaries
|
|
5
|
+
- suggest: AI-generated validation rules
|
|
6
|
+
- fix: AI-suggested data cleaning steps
|
|
7
|
+
- natural_rules: Plain English validation rules
|
|
8
|
+
|
|
9
|
+
Requires: pip install duckguard[llm]
|
|
10
|
+
|
|
11
|
+
Example:
|
|
12
|
+
from duckguard import connect
|
|
13
|
+
from duckguard.ai import explain, suggest_rules
|
|
14
|
+
|
|
15
|
+
orders = connect("orders.csv")
|
|
16
|
+
print(explain(orders))
|
|
17
|
+
rules = suggest_rules(orders)
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from duckguard.ai.config import configure, get_config
|
|
21
|
+
from duckguard.ai.explainer import explain
|
|
22
|
+
from duckguard.ai.fixer import suggest_fixes
|
|
23
|
+
from duckguard.ai.natural_language import natural_rules
|
|
24
|
+
from duckguard.ai.rules_generator import suggest_rules
|
|
25
|
+
|
|
26
|
+
__all__ = [
|
|
27
|
+
"configure",
|
|
28
|
+
"get_config",
|
|
29
|
+
"explain",
|
|
30
|
+
"suggest_rules",
|
|
31
|
+
"suggest_fixes",
|
|
32
|
+
"natural_rules",
|
|
33
|
+
]
|
duckguard/ai/config.py
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
"""AI configuration for DuckGuard."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class AIConfig:
    """Configuration for AI-powered features.

    Attributes:
        provider: LLM backend name ("openai", "anthropic", or "ollama").
        model: Explicit model name; when None a per-provider default is used.
        api_key: Explicit API key; when None the provider's env var is consulted.
        base_url: Custom endpoint (used for Ollama or API proxies).
        temperature: Sampling temperature passed to the LLM.
        max_tokens: Response token cap passed to the LLM.
        extra: Additional provider-specific options.
    """

    provider: str = "openai"
    model: str | None = None
    api_key: str | None = None
    base_url: str | None = None
    temperature: float = 0.3
    max_tokens: int = 2000
    extra: dict[str, Any] = field(default_factory=dict)

    @property
    def effective_model(self) -> str:
        """Resolve the model name, falling back to a per-provider default."""
        defaults = {
            "openai": "gpt-4o-mini",
            "anthropic": "claude-3-5-haiku-20241022",
            "ollama": "llama3",
        }
        # Unknown providers fall back to the OpenAI default model name.
        return self.model or defaults.get(self.provider, "gpt-4o-mini")

    @property
    def effective_api_key(self) -> str | None:
        """Return the configured API key, else look it up from the environment.

        Ollama (and unknown providers) have no env-var mapping, so None is
        returned when no explicit key was set.
        """
        if self.api_key:
            return self.api_key
        env_name = {
            "openai": "OPENAI_API_KEY",
            "anthropic": "ANTHROPIC_API_KEY",
        }.get(self.provider)
        return os.environ.get(env_name) if env_name else None
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
# Global config singleton
|
|
50
|
+
_config: AIConfig | None = None
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def configure(
    provider: str = "openai",
    model: str | None = None,
    api_key: str | None = None,
    base_url: str | None = None,
    temperature: float = 0.3,
    max_tokens: int = 2000,
    **kwargs: Any,
) -> AIConfig:
    """
    Configure the AI backend for DuckGuard.

    Args:
        provider: LLM provider ("openai", "anthropic", "ollama")
        model: Model name (defaults based on provider)
        api_key: API key (or set via environment variable)
        base_url: Custom base URL (for Ollama or proxies)
        temperature: Sampling temperature (default: 0.3 for consistency)
        max_tokens: Maximum tokens in the LLM response (default: 2000)
        **kwargs: Additional provider-specific options

    Returns:
        AIConfig instance

    Example:
        from duckguard.ai import configure

        # OpenAI
        configure(provider="openai", api_key="sk-...")

        # Anthropic
        configure(provider="anthropic")  # uses ANTHROPIC_API_KEY env

        # Local Ollama
        configure(provider="ollama", model="llama3",
                  base_url="http://localhost:11434")
    """
    global _config
    # BUGFIX: max_tokens used to be swallowed into `extra` via **kwargs and
    # never applied to the config; it is now an explicit parameter that is
    # wired through to AIConfig.max_tokens.
    _config = AIConfig(
        provider=provider,
        model=model,
        api_key=api_key,
        base_url=base_url,
        temperature=temperature,
        max_tokens=max_tokens,
        extra=kwargs,
    )
    return _config
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def get_config() -> AIConfig:
    """Return the active AI configuration, creating a default on first use."""
    global _config
    if _config is None:
        # Lazy initialization: an all-defaults AIConfig (OpenAI provider,
        # credentials resolved from the environment at call time).
        _config = AIConfig()
    return _config
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _get_client(config: AIConfig | None = None):
    """
    Get an LLM client based on configuration.

    Args:
        config: Optional explicit AIConfig; defaults to the global config.

    Returns:
        A callable ``(prompt: str, system: str = "") -> str`` that sends the
        prompt to the configured provider and returns the text response.

    Raises:
        ImportError: If the provider's SDK is not installed.
        ValueError: If the provider name is not one of the supported values.
    """
    cfg = config or get_config()

    if cfg.provider == "openai":
        try:
            from openai import OpenAI
        except ImportError as err:
            # BUGFIX: chain the original error so the real import failure
            # (e.g. a broken transitive dependency) is not masked.
            raise ImportError(
                "OpenAI support requires the openai package. "
                "Install with: pip install duckguard[llm]"
            ) from err

        client = OpenAI(
            api_key=cfg.effective_api_key,
            base_url=cfg.base_url,
        )

        def call_openai(prompt: str, system: str = "") -> str:
            # Optional system message first, then the user prompt.
            messages = []
            if system:
                messages.append({"role": "system", "content": system})
            messages.append({"role": "user", "content": prompt})

            response = client.chat.completions.create(
                model=cfg.effective_model,
                messages=messages,
                temperature=cfg.temperature,
                max_tokens=cfg.max_tokens,
            )
            return response.choices[0].message.content or ""

        return call_openai

    elif cfg.provider == "anthropic":
        try:
            from anthropic import Anthropic
        except ImportError as err:
            raise ImportError(
                "Anthropic support requires the anthropic package. "
                "Install with: pip install duckguard[llm]"
            ) from err

        client = Anthropic(api_key=cfg.effective_api_key)

        def call_anthropic(prompt: str, system: str = "") -> str:
            response = client.messages.create(
                model=cfg.effective_model,
                max_tokens=cfg.max_tokens,
                # Anthropic takes the system prompt as a dedicated argument;
                # fall back to a generic persona when none is supplied.
                system=system if system else "You are a data quality expert.",
                messages=[{"role": "user", "content": prompt}],
            )
            return response.content[0].text

        return call_anthropic

    elif cfg.provider == "ollama":
        try:
            from openai import OpenAI
        except ImportError as err:
            raise ImportError(
                "Ollama support uses the openai package. "
                "Install with: pip install openai"
            ) from err

        client = OpenAI(
            api_key="ollama",  # Ollama ignores the key, but the SDK requires one
            base_url=cfg.base_url or "http://localhost:11434/v1",
        )

        def call_ollama(prompt: str, system: str = "") -> str:
            messages = []
            if system:
                messages.append({"role": "system", "content": system})
            messages.append({"role": "user", "content": prompt})

            response = client.chat.completions.create(
                model=cfg.effective_model,
                messages=messages,
                temperature=cfg.temperature,
                # BUGFIX: honor max_tokens for Ollama too; it was omitted here
                # while the OpenAI and Anthropic paths both applied it.
                max_tokens=cfg.max_tokens,
            )
            return response.choices[0].message.content or ""

        return call_ollama

    else:
        raise ValueError(
            f"Unsupported AI provider: {cfg.provider}. "
            f"Supported: openai, anthropic, ollama"
        )
|
|
duckguard/ai/explainer.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""AI-powered data quality explanation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
6
|
+
|
|
7
|
+
from duckguard.ai.config import _get_client
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from duckguard.core.dataset import Dataset
|
|
11
|
+
|
|
12
|
+
SYSTEM_PROMPT = """You are a data quality expert. You analyze dataset profiles and explain
|
|
13
|
+
data quality issues in clear, actionable language. Be specific about which columns and
|
|
14
|
+
values are problematic. Suggest concrete validation rules using DuckGuard's API.
|
|
15
|
+
|
|
16
|
+
DuckGuard API methods:
|
|
17
|
+
- column.is_not_null() — check for nulls
|
|
18
|
+
- column.is_unique() — check uniqueness
|
|
19
|
+
- column.between(min, max) — range check
|
|
20
|
+
- column.isin(values) — enum check
|
|
21
|
+
- column.matches(pattern) — regex pattern
|
|
22
|
+
- column.not_null_when(condition) — conditional not-null
|
|
23
|
+
- column.between_when(min, max, condition) — conditional range
|
|
24
|
+
- dataset.score() — quality score (A-F)
|
|
25
|
+
- detect_anomalies(dataset) — anomaly detection
|
|
26
|
+
|
|
27
|
+
Keep explanations concise and actionable. Use emoji for visual clarity."""
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def explain(
    dataset: Dataset,
    focus: str | None = None,
    detail: str = "medium",
) -> str:
    """
    Generate a natural language explanation of data quality.

    Profiles the dataset, renders the profile as a compact text context, and
    asks the configured LLM for a human-readable quality assessment.

    Args:
        dataset: Dataset to analyze
        focus: Optional column or aspect to focus on
        detail: Level of detail ("brief", "medium", "detailed")

    Returns:
        Human-readable data quality explanation

    Example:
        from duckguard import connect
        from duckguard.ai import explain

        orders = connect("orders.csv")
        print(explain(orders))
    """
    from duckguard.profiler import AutoProfiler

    # Deep-profile so the LLM sees distributions, patterns, and outliers.
    profile = AutoProfiler(deep=True).profile(dataset)

    # Render the profile as plain text for the prompt.
    lines = [
        f"Dataset: {dataset.name}",
        f"Rows: {profile.row_count}, Columns: {profile.column_count}",
        f"Overall Quality: {profile.overall_quality_grade} ({profile.overall_quality_score:.1f}/100)",
        "",
        "Column Profiles:",
    ]

    for col in profile.columns:
        summary = (
            f"  {col.name} ({col.dtype}): "
            f"nulls={col.null_percent:.1f}%, "
            f"unique={col.unique_percent:.1f}%, "
            f"grade={col.quality_grade}"
        )
        # Optional enrichments, appended only when the profiler produced them.
        if col.min_value is not None:
            summary += f", range=[{col.min_value}, {col.max_value}]"
        if col.detected_patterns:
            summary += f", patterns={col.detected_patterns}"
        if col.distribution_type:
            summary += f", dist={col.distribution_type}"
        if col.outlier_count and col.outlier_count > 0:
            summary += f", outliers={col.outlier_count}"
        lines.append(summary)

    if profile.suggested_rules:
        lines.append("")
        lines.append(f"Auto-suggested rules ({len(profile.suggested_rules)}):")
        # Cap at 10 rules to keep the prompt compact.
        lines.extend(f"  - {rule}" for rule in profile.suggested_rules[:10])

    context = "\n".join(lines)

    # Map the requested verbosity onto a prompt instruction.
    detail_instruction = {
        "brief": "Give a 3-5 sentence summary.",
        "medium": "Give a comprehensive but concise analysis (10-15 lines).",
        "detailed": "Give a thorough analysis with specific recommendations.",
    }.get(detail, "Give a comprehensive but concise analysis.")

    focus_instruction = f"\nFocus specifically on: {focus}" if focus else ""

    prompt = f"""Analyze this dataset profile and explain the data quality status.
{detail_instruction}{focus_instruction}

{context}"""

    # Delegate to the configured LLM backend.
    return _get_client()(prompt, system=SYSTEM_PROMPT)
|
duckguard/ai/fixer.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""AI-powered data fix suggestions."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
6
|
+
|
|
7
|
+
from duckguard.ai.config import _get_client
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from duckguard.core.dataset import Dataset
|
|
11
|
+
|
|
12
|
+
SYSTEM_PROMPT = """You are a data quality expert. Given a dataset profile with quality issues,
|
|
13
|
+
suggest specific fixes. For each issue:
|
|
14
|
+
|
|
15
|
+
1. Describe the problem clearly
|
|
16
|
+
2. Assess severity (critical / warning / info)
|
|
17
|
+
3. Suggest a concrete fix (SQL, Python code, or process change)
|
|
18
|
+
4. Note if no action is needed (e.g., nulls are expected for pending orders)
|
|
19
|
+
|
|
20
|
+
Be practical. Not every null is a bug. Use context to determine what's actually wrong.
|
|
21
|
+
Format with emoji and clear sections."""
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def suggest_fixes(
    dataset: Dataset,
    rules_result=None,
) -> str:
    """
    Get AI-suggested fixes for data quality issues.

    Args:
        dataset: Dataset to analyze
        rules_result: Optional RuleExecutionResult from a previous validation run

    Returns:
        Human-readable fix suggestions

    Example:
        from duckguard import connect
        from duckguard.ai import suggest_fixes

        orders = connect("orders.csv")
        print(suggest_fixes(orders))
    """
    from duckguard.profiler import AutoProfiler

    # Profile the dataset (deep mode surfaces outliers).
    profiler = AutoProfiler(deep=True)
    profile = profiler.profile(dataset)

    # Build context
    context_parts = [
        f"Dataset: {dataset.name} ({profile.row_count} rows, {profile.column_count} columns)",
        f"Quality: {profile.overall_quality_grade} ({profile.overall_quality_score:.1f}/100)",
        "",
        "Issues detected:",
    ]

    has_issues = False

    for col in profile.columns:
        issues = []

        if col.null_percent > 0:
            issues.append(f"nulls: {col.null_percent:.1f}% ({col.null_count} rows)")

        if col.quality_grade in ("D", "F"):
            issues.append(f"low quality grade: {col.quality_grade}")

        if col.outlier_count and col.outlier_count > 0:
            issues.append(f"outliers: {col.outlier_count} ({col.outlier_percentage:.1f}%)")

        if issues:
            has_issues = True
            context_parts.append(f"  {col.name} ({col.dtype}): {'; '.join(issues)}")

            # Add sample values for context
            if col.min_value is not None:
                context_parts.append(f"    range: [{col.min_value}, {col.max_value}]")

    # Collect failed validation checks up front so they count as issues too.
    failed_checks = []
    if rules_result:
        failed_checks = [r for r in getattr(rules_result, "results", []) if not r.passed]

    # BUGFIX: previously the "no issues" early return fired before rules_result
    # was examined, so failed validation checks passed by the caller were
    # silently ignored whenever the profile itself looked clean.
    if not has_issues and not failed_checks:
        return "✅ No data quality issues detected. Your data looks clean!"

    if failed_checks:
        context_parts.append("")
        context_parts.append("Failed validation checks:")
        for r in failed_checks:
            context_parts.append(f"  ✗ {r.message}")

    context = "\n".join(context_parts)

    prompt = f"""Analyze these data quality issues and suggest specific fixes.

{context}

For each issue, provide:
1. What's wrong (brief)
2. Severity (🔴 critical / 🟡 warning / 🔵 info)
3. Suggested fix (code or process)
4. Whether it actually needs fixing (sometimes nulls are expected)"""

    client = _get_client()
    return client(prompt, system=SYSTEM_PROMPT)
|
|
duckguard/ai/natural_language.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""Natural language validation rules."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
6
|
+
|
|
7
|
+
from duckguard.ai.config import _get_client
|
|
8
|
+
from duckguard.core.result import ValidationResult
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from duckguard.core.dataset import Dataset
|
|
12
|
+
|
|
13
|
+
SYSTEM_PROMPT = """You are a data quality expert. Convert natural language rules into
|
|
14
|
+
DuckGuard Python code.
|
|
15
|
+
|
|
16
|
+
Available DuckGuard methods:
|
|
17
|
+
- dataset.column_name.is_not_null()
|
|
18
|
+
- dataset.column_name.is_unique()
|
|
19
|
+
- dataset.column_name.between(min, max)
|
|
20
|
+
- dataset.column_name.greater_than(value)
|
|
21
|
+
- dataset.column_name.less_than(value)
|
|
22
|
+
- dataset.column_name.isin(values_list)
|
|
23
|
+
- dataset.column_name.matches(regex_pattern)
|
|
24
|
+
- dataset.column_name.value_lengths_between(min, max)
|
|
25
|
+
- dataset.column_name.not_null_when(sql_condition)
|
|
26
|
+
- dataset.column_name.between_when(min, max, sql_condition)
|
|
27
|
+
- dataset.column_name.exists_in(other_dataset.column)
|
|
28
|
+
- dataset.expect_columns_unique(column_list)
|
|
29
|
+
|
|
30
|
+
Dataset columns: {columns}
|
|
31
|
+
|
|
32
|
+
For each natural language rule, output ONLY a Python expression that calls the appropriate
|
|
33
|
+
DuckGuard method. One expression per line. No explanations, no imports.
|
|
34
|
+
|
|
35
|
+
Example input: "order IDs should never be null"
|
|
36
|
+
Example output: dataset.order_id.is_not_null()
|
|
37
|
+
|
|
38
|
+
Example input: "quantities between 1 and 1000"
|
|
39
|
+
Example output: dataset.quantity.between(1, 1000)"""
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def natural_rules(
    dataset: Dataset,
    rules: list[str],
) -> list[ValidationResult]:
    """
    Validate data using natural language rules.

    Converts plain English rules into DuckGuard validations and executes them.

    Args:
        dataset: Dataset to validate
        rules: List of natural language rule descriptions

    Returns:
        List of ValidationResult objects

    Example:
        from duckguard import connect
        from duckguard.ai import natural_rules

        orders = connect("orders.csv")
        results = natural_rules(orders, [
            "order IDs should never be null or duplicated",
            "quantities should be positive integers under 1000",
            "status must be pending, shipped, or delivered",
        ])

        for r in results:
            print(f"{'✓' if r.passed else '✗'} {r.message}")
    """
    columns = dataset.columns

    # Build prompt with all rules, numbered so the LLM can answer in order.
    rules_text = "\n".join(f"Rule {i+1}: {rule}" for i, rule in enumerate(rules))

    system = SYSTEM_PROMPT.format(columns=columns)
    prompt = f"""Convert these natural language rules to DuckGuard expressions:

{rules_text}

Output one DuckGuard expression per rule, numbered to match. Use 'dataset' as the variable name."""

    client = _get_client()
    response = client(prompt, system=system)

    # Parse and execute the generated expressions
    results: list[ValidationResult] = []
    expressions = [line.strip() for line in response.strip().split("\n") if line.strip()]

    # Index into `rules`, advanced only for lines we actually evaluate.
    rule_idx = 0
    for expr in expressions:
        # Strip leading numbering/bullets ("1.", "2)", "-", em-dash).
        expr = expr.lstrip("0123456789.:)-— ")
        if not expr.startswith("dataset."):
            # Skip prose/header lines the LLM may emit around the expressions.
            continue

        # BUGFIX: map each evaluated expression to the next user rule via a
        # dedicated counter. The previous code used the raw line index, which
        # drifted (mislabeling results) whenever the LLM emitted extra
        # non-expression lines that were skipped above.
        rule_desc = rules[rule_idx] if rule_idx < len(rules) else expr
        rule_idx += 1

        try:
            # SECURITY NOTE: this evaluates LLM output. The namespace is
            # restricted to `dataset` with builtins disabled and expressions
            # are pre-filtered to start with "dataset.", but LLM output should
            # still be treated as untrusted input.
            result = eval(expr, {"dataset": dataset, "__builtins__": {}})  # noqa: S307
            if isinstance(result, ValidationResult):
                results.append(result)
            elif isinstance(result, bool):
                results.append(ValidationResult(
                    passed=result,
                    actual_value=result,
                    expected_value=True,
                    message=f"Natural rule: {rule_desc}",
                ))
        except Exception as e:
            # Record evaluation failures as failed results instead of raising,
            # so one bad expression doesn't abort the whole batch.
            results.append(ValidationResult(
                passed=False,
                actual_value=str(e),
                expected_value="valid expression",
                message=f"Failed to evaluate rule '{rule_desc}': {e}",
            ))

    return results
|
|
duckguard/ai/rules_generator.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
"""AI-powered validation rule generation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
6
|
+
|
|
7
|
+
from duckguard.ai.config import _get_client
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from duckguard.core.dataset import Dataset
|
|
11
|
+
|
|
12
|
+
SYSTEM_PROMPT = """You are a data quality expert. Generate DuckGuard YAML validation rules
|
|
13
|
+
based on dataset profiles. Rules should be practical, not overly strict.
|
|
14
|
+
|
|
15
|
+
Output format — valid YAML only, no markdown fences:
|
|
16
|
+
|
|
17
|
+
name: <dataset>_validation
|
|
18
|
+
description: Auto-generated quality checks
|
|
19
|
+
|
|
20
|
+
checks:
|
|
21
|
+
column_name:
|
|
22
|
+
- not_null
|
|
23
|
+
- unique
|
|
24
|
+
- between: [min, max]
|
|
25
|
+
- allowed_values: [val1, val2]
|
|
26
|
+
- pattern: "regex"
|
|
27
|
+
|
|
28
|
+
Rules to consider:
|
|
29
|
+
- not_null for columns with 0% nulls (they're probably required)
|
|
30
|
+
- unique for ID-like columns (>99% unique)
|
|
31
|
+
- between for numeric columns (use actual range with small buffer)
|
|
32
|
+
- allowed_values for low-cardinality columns (<20 distinct values)
|
|
33
|
+
- pattern for columns matching known patterns (email, phone, etc.)
|
|
34
|
+
|
|
35
|
+
Be conservative — generate rules that reflect the actual data, not hypothetical constraints.
|
|
36
|
+
Only output the YAML. No explanations."""
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def suggest_rules(
    dataset: Dataset,
    strict: bool = False,
    include_comments: bool = True,
) -> str:
    """
    Generate validation rules using AI analysis.

    Combines DuckGuard's profiling with LLM intelligence to generate
    context-aware YAML rules that match your data's actual patterns.

    Args:
        dataset: Dataset to analyze
        strict: If True, generate stricter rules
        include_comments: If True, add explanatory comments

    Returns:
        YAML string with validation rules

    Example:
        from duckguard import connect
        from duckguard.ai import suggest_rules

        orders = connect("orders.csv")
        yaml_rules = suggest_rules(orders)
        print(yaml_rules)

        # Save to file
        with open("duckguard.yaml", "w") as f:
            f.write(yaml_rules)
    """
    from duckguard.profiler import AutoProfiler

    # Shallow profile is enough: we only need per-column stats and patterns.
    profile = AutoProfiler().profile(dataset)

    # Render the profile as plain text for the prompt.
    lines = [
        f"Dataset: {dataset.name}",
        f"Rows: {profile.row_count}",
        "",
        "Columns:",
    ]

    for col in profile.columns:
        desc = (
            f"  {col.name}:"
            f" type={col.dtype},"
            f" nulls={col.null_percent:.1f}%,"
            f" unique={col.unique_percent:.1f}%,"
            f" distinct={col.unique_count}"
        )
        # Optional enrichments, only when the profiler produced them.
        if col.min_value is not None:
            desc += f", min={col.min_value}, max={col.max_value}"
        if col.detected_patterns:
            desc += f", patterns={col.detected_patterns}"
        if hasattr(col, "sample_values") and col.sample_values:
            desc += f", samples={col.sample_values[:5]}"
        lines.append(desc)

    # Include the profiler's own rule suggestions as a baseline for the LLM.
    if profile.suggested_rules:
        lines.append("")
        lines.append("Auto-detected rules (baseline):")
        lines.extend(f"  - {rule}" for rule in profile.suggested_rules)

    context = "\n".join(lines)

    if strict:
        strictness = "Generate strict rules — flag anything suspicious."
    else:
        strictness = "Generate practical rules — match actual data patterns, allow reasonable variation."

    if include_comments:
        comment_instruction = "Add YAML comments explaining each rule."
    else:
        comment_instruction = "No comments, just rules."

    prompt = f"""{strictness}
{comment_instruction}

{context}"""

    # Delegate YAML generation to the configured LLM backend.
    return _get_client()(prompt, system=SYSTEM_PROMPT)
|