dataforge-07 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataforge/__init__.py +204 -0
- dataforge/__main__.py +5 -0
- dataforge/agent/__init__.py +16 -0
- dataforge/agent/providers.py +259 -0
- dataforge/agent/scratchpad.py +183 -0
- dataforge/agent/tool_actions.py +343 -0
- dataforge/bench/__init__.py +31 -0
- dataforge/bench/core.py +426 -0
- dataforge/bench/groq_client.py +386 -0
- dataforge/bench/methods.py +443 -0
- dataforge/bench/report.py +309 -0
- dataforge/bench/runner.py +247 -0
- dataforge/causal/__init__.py +21 -0
- dataforge/causal/dag.py +174 -0
- dataforge/causal/pc.py +232 -0
- dataforge/causal/root_cause.py +193 -0
- dataforge/cli/__init__.py +50 -0
- dataforge/cli/audit.py +70 -0
- dataforge/cli/bench.py +154 -0
- dataforge/cli/common.py +267 -0
- dataforge/cli/constraints.py +407 -0
- dataforge/cli/profile.py +147 -0
- dataforge/cli/release.py +166 -0
- dataforge/cli/repair.py +407 -0
- dataforge/cli/revert.py +139 -0
- dataforge/cli/watch.py +144 -0
- dataforge/datasets/__init__.py +25 -0
- dataforge/datasets/embedded/hospital/clean.csv +11 -0
- dataforge/datasets/embedded/hospital/dirty.csv +11 -0
- dataforge/datasets/real_world.py +290 -0
- dataforge/datasets/registry.py +103 -0
- dataforge/detectors/__init__.py +80 -0
- dataforge/detectors/base.py +145 -0
- dataforge/detectors/decimal_shift.py +166 -0
- dataforge/detectors/fd_violation.py +157 -0
- dataforge/detectors/type_mismatch.py +173 -0
- dataforge/engine/__init__.py +39 -0
- dataforge/engine/repair.py +905 -0
- dataforge/env/__init__.py +22 -0
- dataforge/env/environment.py +883 -0
- dataforge/env/observation.py +61 -0
- dataforge/env/openenv_core.py +161 -0
- dataforge/env/reward.py +128 -0
- dataforge/env/server.py +176 -0
- dataforge/evaluation_contract.py +76 -0
- dataforge/fixtures/hospital_10rows.csv +11 -0
- dataforge/fixtures/hospital_schema.yaml +17 -0
- dataforge/http/__init__.py +1 -0
- dataforge/http/problem.py +103 -0
- dataforge/integrations/__init__.py +1 -0
- dataforge/integrations/dbt.py +164 -0
- dataforge/observability.py +76 -0
- dataforge/py.typed +1 -0
- dataforge/release/__init__.py +1 -0
- dataforge/release/doctor.py +367 -0
- dataforge/release/full_vision.py +702 -0
- dataforge/release/gate.py +861 -0
- dataforge/release/playground_check.py +411 -0
- dataforge/repair_contract.py +468 -0
- dataforge/repairers/__init__.py +88 -0
- dataforge/repairers/base.py +77 -0
- dataforge/repairers/decimal_shift.py +43 -0
- dataforge/repairers/fd_violation.py +225 -0
- dataforge/repairers/type_mismatch.py +73 -0
- dataforge/safety/__init__.py +5 -0
- dataforge/safety/adversarial/attack_01_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_02_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_03_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_04_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_05_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_06_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_07_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_08_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_09_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_10_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_11_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_12_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_13_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_14_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_15_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_16_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_17_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_18_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_19_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_20_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_21_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_22_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_23_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_24_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_25_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_26_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_27_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_28_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_29_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_30_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_31_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_32_row_delete.yaml +8 -0
- dataforge/safety/adversarial/attack_33_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_34_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_35_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_36_row_delete.yaml +11 -0
- dataforge/safety/adversarial/attack_37_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_38_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_39_row_delete.yaml +8 -0
- dataforge/safety/adversarial/attack_40_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_41_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_42_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_43_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_44_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_45_row_delete.yaml +8 -0
- dataforge/safety/adversarial/attack_46_row_delete.yaml +8 -0
- dataforge/safety/adversarial/attack_47_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_48_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_49_row_delete.yaml +8 -0
- dataforge/safety/adversarial/attack_50_row_delete.yaml +7 -0
- dataforge/safety/constitution.py +307 -0
- dataforge/safety/constitutions/default.yaml +40 -0
- dataforge/safety/filter.py +134 -0
- dataforge/schema_inference.py +620 -0
- dataforge/stores/__init__.py +46 -0
- dataforge/stores/base.py +73 -0
- dataforge/stores/cloud.py +78 -0
- dataforge/stores/csv.py +94 -0
- dataforge/stores/duckdb.py +313 -0
- dataforge/stores/patch_plan.py +178 -0
- dataforge/stores/registry.py +82 -0
- dataforge/stores/repair.py +121 -0
- dataforge/stores/revert.py +22 -0
- dataforge/stores/sql.py +27 -0
- dataforge/table.py +228 -0
- dataforge/transactions/__init__.py +34 -0
- dataforge/transactions/files.py +96 -0
- dataforge/transactions/log.py +613 -0
- dataforge/transactions/revert.py +102 -0
- dataforge/transactions/txn.py +104 -0
- dataforge/ui/__init__.py +1 -0
- dataforge/ui/profile_view.py +136 -0
- dataforge/ui/repair_diff.py +91 -0
- dataforge/verifier/__init__.py +55 -0
- dataforge/verifier/constraint_ir.py +155 -0
- dataforge/verifier/explain.py +47 -0
- dataforge/verifier/gate.py +5 -0
- dataforge/verifier/schema.py +111 -0
- dataforge/verifier/smt.py +433 -0
- dataforge_07-0.1.0.dist-info/METADATA +436 -0
- dataforge_07-0.1.0.dist-info/RECORD +150 -0
- dataforge_07-0.1.0.dist-info/WHEEL +5 -0
- dataforge_07-0.1.0.dist-info/entry_points.txt +3 -0
- dataforge_07-0.1.0.dist-info/licenses/LICENSE +176 -0
- dataforge_07-0.1.0.dist-info/top_level.txt +1 -0
dataforge/causal/dag.py
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
"""Column-level causal DAG utilities for root-cause analysis."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
import networkx as nx # type: ignore[import-untyped]
|
|
9
|
+
|
|
10
|
+
__all__ = ["CausalDAG", "CausalEdge"]
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass(frozen=True)
class CausalEdge:
    """Immutable metadata record describing one directed causal edge.

    The dataclass is frozen, so instances cannot be mutated after creation.
    No validation is performed here; in particular ``confidence`` is stored
    exactly as given.

    Args:
        source: Source column name.
        target: Target column name.
        confidence: Confidence in the directed influence, from 0.0 to 1.0.
        provenance: Human-readable source of the edge.
    """

    # Column the edge starts from.
    source: str
    # Column the edge points to.
    target: str
    # Confidence score for the edge (expected range 0.0 to 1.0, not enforced here).
    confidence: float
    # Free-form label naming where the edge came from.
    provenance: str
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class CausalDAG:
    """Directed acyclic graph over dataset column names.

    The graph is stored in a ``networkx.DiGraph``; every edge carries a
    ``confidence`` and a ``provenance`` attribute.

    Args:
        nodes: Optional initial column names.

    Example:
        >>> dag = CausalDAG(["discount_pct", "order_total"])
        >>> dag.add_edge("discount_pct", "order_total", confidence=0.9, provenance="fd")
        >>> dag.is_reachable("discount_pct", "order_total")
        True
    """

    def __init__(self, nodes: list[str] | tuple[str, ...] = ()) -> None:
        graph: nx.DiGraph[Any] = nx.DiGraph()
        graph.add_nodes_from(nodes)
        self._graph: nx.DiGraph[Any] = graph

    @property
    def nodes(self) -> tuple[str, ...]:
        """Return graph nodes in insertion order."""
        return tuple(map(str, self._graph.nodes))

    @property
    def edges(self) -> tuple[CausalEdge, ...]:
        """Return directed edges with metadata."""
        # Missing attributes fall back to 0.0 confidence / "unknown" provenance.
        return tuple(
            CausalEdge(
                source=str(start),
                target=str(end),
                confidence=float(data.get("confidence", 0.0)),
                provenance=str(data.get("provenance", "unknown")),
            )
            for start, end, data in self._graph.edges(data=True)
        )

    def add_node(self, column: str) -> None:
        """Register a column as a node; a no-op if it already exists.

        Args:
            column: Column name.
        """
        self._graph.add_node(column)

    def add_edge(
        self,
        source: str,
        target: str,
        *,
        confidence: float,
        provenance: str,
    ) -> None:
        """Insert a directed edge, refusing anything that would break acyclicity.

        Both endpoints are added as nodes if they are not present yet, and the
        confidence is clamped into ``[0.0, 1.0]`` before being stored.

        Args:
            source: Source column name.
            target: Target column name.
            confidence: Confidence score from 0.0 to 1.0.
            provenance: Source of the edge.

        Raises:
            ValueError: If the edge is self-referential or creates a cycle.
        """
        if source == target:
            raise ValueError("Causal DAG does not allow self-edges")
        graph = self._graph
        graph.add_nodes_from((source, target))
        # An existing target -> source path means the new edge closes a loop.
        if nx.has_path(graph, target, source):
            raise ValueError(f"Adding {source!r} -> {target!r} would create a cycle")
        clamped = max(0.0, min(1.0, confidence))
        graph.add_edge(source, target, confidence=clamped, provenance=provenance)

    def successors(self, column: str) -> tuple[str, ...]:
        """Return the direct downstream columns of a node.

        Args:
            column: Column name.

        Returns:
            A tuple of direct successor column names; empty when the column
            is not a node of the graph.
        """
        if column in self._graph:
            return tuple(str(child) for child in self._graph.successors(column))
        return ()

    def is_reachable(self, source: str, target: str) -> bool:
        """Tell whether ``target`` can be reached from ``source``.

        Args:
            source: Source column name.
            target: Target column name.

        Returns:
            True if source equals target or a directed path exists.
        """
        if source == target:
            return True
        graph = self._graph
        both_known = source in graph and target in graph
        return both_known and bool(nx.has_path(graph, source, target))

    def path_confidence(self, source: str, target: str) -> float:
        """Return the weakest-edge confidence along the shortest path.

        Args:
            source: Source column name.
            target: Target column name.

        Returns:
            Confidence in [0.0, 1.0], or 0.0 when no path exists.
        """
        if source == target:
            return 1.0
        if not self.is_reachable(source, target):
            return 0.0
        hops = nx.shortest_path(self._graph, source, target)
        # Walk consecutive node pairs of the path and keep the minimum.
        return min(
            (
                float(self._graph.edges[start, end].get("confidence", 0.0))
                for start, end in zip(hops, hops[1:])
            ),
            default=0.0,
        )

    def minimal_root_columns(self, columns: list[str] | tuple[str, ...]) -> tuple[str, ...]:
        """Keep only the selected columns that have no selected ancestor.

        Args:
            columns: Selected error columns.

        Returns:
            Minimal root columns in first-seen order.
        """
        # De-duplicate while preserving the order of first appearance.
        distinct = list(dict.fromkeys(columns))
        return tuple(
            column
            for column in distinct
            if not any(
                other != column and self.is_reachable(other, column) for other in distinct
            )
        )
|
dataforge/causal/pc.py
ADDED
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
"""PC-based causal DAG discovery with functional-dependency priors."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
import pandas as pd
|
|
10
|
+
from scipy.stats import chi2_contingency # type: ignore[import-untyped]
|
|
11
|
+
|
|
12
|
+
from dataforge.causal.dag import CausalDAG
|
|
13
|
+
from dataforge.verifier.schema import Schema
|
|
14
|
+
|
|
15
|
+
__all__ = ["CausalDiscoveryResult", "discover_causal_dag"]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass(frozen=True)
class CausalDiscoveryResult:
    """Immutable bundle returned by causal discovery.

    The dataclass is frozen, so its fields cannot be rebound after creation.

    Args:
        dag: Directed acyclic graph over columns.
        confidence_report: Column-pair confidence or diagnostic metadata.
        warnings: Non-fatal discovery warnings.
    """

    # The discovered column-level graph.
    dag: CausalDAG
    # Defaults to a fresh empty dict per instance (via default_factory).
    confidence_report: dict[str, float] = field(default_factory=dict)
    # Defaults to an empty tuple when no warnings were recorded.
    warnings: tuple[str, ...] = ()
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def discover_causal_dag(
    df: pd.DataFrame,
    schema: Schema | None = None,
    *,
    alpha: float = 0.05,
) -> CausalDiscoveryResult:
    """Build a column-level causal DAG from data plus declared FD priors.

    Edges are attempted in three stages, in this order: declared functional
    dependencies (confidence 0.95), causal-learn PC output (confidence 0.55),
    and pairwise dependency tests (confidence supplied by the helper). Edges
    rejected by the DAG are recorded as warnings rather than raised.

    Args:
        df: Input DataFrame.
        schema: Optional declared schema with functional dependencies.
        alpha: Significance threshold for independence checks.

    Returns:
        CausalDiscoveryResult. A DAG is returned even if PC orientation is
        underdetermined; low-confidence edges are tagged as such.
    """
    column_names = [str(label) for label in df.columns]
    graph = CausalDAG(column_names)
    confidence_by_edge: dict[str, float] = {}
    notes: list[str] = []

    # Stage 1: declared functional dependencies, one edge per determinant column.
    declared = () if schema is None else schema.functional_dependencies
    for dependency in declared:
        for lhs in dependency.determinant:
            _try_add_edge(
                graph,
                lhs,
                dependency.dependent,
                confidence=0.95,
                provenance="functional_dependency_prior",
                warnings=notes,
            )
            confidence_by_edge[f"{lhs}->{dependency.dependent}"] = 0.95

    # Stage 2: PC skeleton computed on an all-numeric, NaN-free copy of the data.
    numeric_view = _prepare_for_pc(df)
    skeleton, failure = _run_causal_learn_pc(numeric_view.to_numpy(), column_names, alpha)
    if failure:
        notes.append(failure)
    for start, end in skeleton:
        _try_add_edge(
            graph,
            start,
            end,
            confidence=0.55,
            provenance="causal_learn_pc",
            warnings=notes,
        )
        # setdefault: an earlier stage's report entry for the same pair is kept.
        confidence_by_edge.setdefault(f"{start}->{end}", 0.55)

    # Stage 3: pairwise dependency tests on the raw columns.
    for start, end, score in _pairwise_dependency_edges(df, alpha):
        _try_add_edge(
            graph,
            start,
            end,
            confidence=score,
            provenance="pairwise_ci_fallback",
            warnings=notes,
        )
        confidence_by_edge.setdefault(f"{start}->{end}", score)

    return CausalDiscoveryResult(
        dag=graph, confidence_report=confidence_by_edge, warnings=tuple(notes)
    )
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _prepare_for_pc(df: pd.DataFrame) -> pd.DataFrame:
    """Convert every column to floats without NaN so causal-learn PC can run.

    A column counts as numeric when at least ``max(2, 50% of rows)`` values
    parse as numbers; its gaps are filled with the column median. Any other
    column is replaced by sorted factor codes of its string form, with missing
    values mapped to the ``"<missing>"`` label.
    """
    min_numeric = max(2, int(0.5 * len(df)))
    out = pd.DataFrame(index=df.index)
    for label in df.columns:
        name = str(label)
        as_numbers = pd.to_numeric(df[label], errors="coerce")
        present = as_numbers.notna()
        if present.sum() < min_numeric:
            # Mostly non-numeric: encode categories as float codes.
            text = df[label].astype("string").fillna("<missing>")
            codes, _ = pd.factorize(text, sort=True)
            out[name] = codes.astype(float)
            continue
        fill = float(as_numbers.median()) if present.any() else 0.0
        out[name] = as_numbers.fillna(fill)
    return out.fillna(0.0)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _run_causal_learn_pc(
    data: np.ndarray[Any, Any], columns: list[str], alpha: float
) -> tuple[list[tuple[str, str]], str | None]:
    """Run causal-learn's PC algorithm and flatten its result into edges.

    Returns a pair ``(edges, warning)``. When causal-learn cannot be imported
    or the run raises, the edge list is empty and ``warning`` carries the
    reason. Otherwise every adjacent column pair is emitted once, pointing
    from the earlier column to the later one in ``columns``.
    """
    # Best effort by design: an import or runtime failure becomes a warning.
    try:
        from causallearn.search.ConstraintBased.PC import pc  # type: ignore[import-untyped]

        result = pc(data, alpha=alpha, indep_test="fisherz", stable=True, show_progress=False)
    except Exception as exc:
        return [], f"causal-learn PC unavailable or failed: {exc}"

    graph_obj = getattr(result, "G", None)
    matrix = getattr(graph_obj, "graph", None)
    if matrix is None:
        return [], "causal-learn PC returned no adjacency matrix"

    adjacency = np.asarray(matrix)
    found: list[tuple[str, str]] = []
    for i, source in enumerate(columns):
        for j in range(i + 1, len(columns)):
            # Skip column pairs that fall outside the returned matrix.
            if i >= adjacency.shape[0] or j >= adjacency.shape[1]:
                continue
            if adjacency[i, j] != 0 or adjacency[j, i] != 0:
                found.append((source, columns[j]))
    return found, None
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _pairwise_dependency_edges(df: pd.DataFrame, alpha: float) -> list[tuple[str, str, float]]:
    """Return deterministic low-confidence edges for dependent column pairs.

    Every unordered column pair is tested once, in column order. A pair whose
    p-value is below ``alpha`` yields an edge from the earlier column to the
    later one.

    Args:
        df: Input DataFrame.
        alpha: Significance threshold; pairs with ``p_value < alpha`` are kept.

    Returns:
        ``(source, target, confidence)`` triples. Column names are stringified
        and the confidence is ``1 - p_value`` clamped to ``[0.25, 0.75]`` and
        rounded to four decimals.
    """
    # Index with the original labels and stringify only for the output:
    # looking up ``df[str(label)]`` raises KeyError for non-string labels
    # such as the integer headers of a header-less CSV.
    labels = list(df.columns)
    edges: list[tuple[str, str, float]] = []
    for position, source_label in enumerate(labels):
        for target_label in labels[position + 1 :]:
            p_value = _pairwise_p_value(df[source_label], df[target_label])
            if p_value < alpha:
                confidence = max(0.25, min(0.75, 1.0 - p_value))
                edges.append((str(source_label), str(target_label), round(confidence, 4)))
    return edges
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def _pairwise_p_value(left: pd.Series[Any], right: pd.Series[Any]) -> float:
    """Pick the independence test matching the two column kinds and run it.

    A column is treated as continuous when at least ``max(5, 80% of rows)``
    of its values parse as numbers. Two continuous columns use the HSIC test,
    two non-continuous columns use chi-squared, and a mixed pair uses the
    binned mutual-information score.
    """
    left_num = pd.to_numeric(left, errors="coerce")
    right_num = pd.to_numeric(right, errors="coerce")
    left_is_cont = left_num.notna().sum() >= max(5, int(0.8 * len(left)))
    right_is_cont = right_num.notna().sum() >= max(5, int(0.8 * len(right)))

    # Exactly one continuous side: mixed pair.
    if left_is_cont != right_is_cont:
        return _mutual_information_p_value(left, right)
    if left_is_cont:
        # Both continuous; unparseable entries are replaced by the median.
        return _hsic_p_value(
            left_num.fillna(left_num.median()), right_num.fillna(right_num.median())
        )
    return _chi_squared_p_value(left, right)
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def _chi_squared_p_value(left: pd.Series[Any], right: pd.Series[Any]) -> float:
    """Compute the chi-squared independence p-value of two categorical columns.

    Values are compared by their string form, with missing entries grouped
    under ``"<missing>"``. A contingency table with fewer than two rows or
    columns cannot show dependence, so 1.0 is returned for it.
    """
    row_labels = left.astype("string").fillna("<missing>")
    col_labels = right.astype("string").fillna("<missing>")
    contingency = pd.crosstab(row_labels, col_labels)
    n_rows, n_cols = contingency.shape
    if min(n_rows, n_cols) < 2:
        return 1.0
    # The p-value is the second element of the chi2_contingency result.
    p_value = chi2_contingency(contingency)[1]
    return float(p_value)
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def _hsic_p_value(left: pd.Series[Any], right: pd.Series[Any]) -> float:
    """Compute an HSIC p-value for two continuous columns.

    If hyppo cannot be imported or its test raises, a coarse fallback is
    used instead: 0.0 when the absolute Pearson correlation exceeds 0.75,
    otherwise 1.0.
    """
    xs = left.to_numpy(dtype=float).reshape(-1, 1)
    ys = right.to_numpy(dtype=float).reshape(-1, 1)
    # Best effort by design: any hyppo failure drops to the correlation check.
    try:
        from hyppo.independence import Hsic  # type: ignore[import-untyped]

        _, p_value = Hsic().test(xs, ys, reps=100, auto=True)
        return float(p_value)
    except Exception:
        strength = abs(float(np.corrcoef(xs[:, 0], ys[:, 0])[0, 1]))
        if strength > 0.75:
            return 0.0
        return 1.0
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def _mutual_information_p_value(left: pd.Series[Any], right: pd.Series[Any]) -> float:
    """Derive a bounded pseudo p-value from binned mutual information.

    Both columns are reduced to integer codes and cross-tabulated. The
    mutual information ``mi`` of the resulting joint distribution is mapped
    to ``exp(-mi)``. Degenerate tables (empty, or fewer than two rows or
    columns) return 1.0.
    """
    contingency = pd.crosstab(_codes(left), _codes(right))
    counts = contingency.to_numpy(dtype=float)
    total = float(counts.sum())
    n_rows, n_cols = contingency.shape
    if total == 0.0 or n_rows < 2 or n_cols < 2:
        return 1.0

    joint = counts / total
    row_marginal = joint.sum(axis=1, keepdims=True)
    col_marginal = joint.sum(axis=0, keepdims=True)
    # Outer product of the marginals: the joint expected under independence.
    independent = row_marginal @ col_marginal
    # Only non-empty cells contribute; this also avoids log(0).
    nonzero = joint > 0
    ratio = joint[nonzero] / independent[nonzero]
    mi = float((joint[nonzero] * np.log(ratio)).sum())
    return float(np.exp(-mi))
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def _codes(series: pd.Series[Any]) -> np.ndarray[Any, Any]:
    """Encode a mixed-type series as stable integer codes.

    A series in which at least ``max(5, 80% of rows)`` values parse as
    numbers is median-filled and cut into up to four quantile bins (duplicate
    bin edges are dropped). Anything else is factorized on its string form,
    sorted, with missing values labelled ``"<missing>"``.
    """
    as_numbers = pd.to_numeric(series, errors="coerce")
    mostly_numeric = as_numbers.notna().sum() >= max(5, int(0.8 * len(series)))
    if not mostly_numeric:
        text = series.astype("string").fillna("<missing>")
        return pd.factorize(text, sort=True)[0]
    filled = as_numbers.fillna(as_numbers.median())
    quartiles = pd.qcut(filled, q=4, duplicates="drop")
    return quartiles.cat.codes.to_numpy()
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def _try_add_edge(
    dag: CausalDAG,
    source: str,
    target: str,
    *,
    confidence: float,
    provenance: str,
    warnings: list[str],
) -> None:
    """Add an edge unless it already exists, recording rejections as warnings.

    Args:
        dag: Graph to extend.
        source: Source column name.
        target: Target column name.
        confidence: Confidence score for the new edge.
        provenance: Label naming where the edge came from.
        warnings: List that receives the message of any ``ValueError`` raised
            by ``dag.add_edge`` (self-edge or cycle).
    """
    # Keep the first edge recorded for a pair. Re-adding an existing edge
    # would overwrite its confidence and provenance, so a later,
    # lower-confidence source would replace an earlier, higher-confidence one.
    if target in dag.successors(source):
        return
    try:
        dag.add_edge(source, target, confidence=confidence, provenance=provenance)
    except ValueError as exc:
        warnings.append(str(exc))
|
dataforge/causal/root_cause.py
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
"""Minimal root-cause selection over detected errors and a causal DAG."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any, Protocol
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel, Field
|
|
8
|
+
|
|
9
|
+
from dataforge.causal.dag import CausalDAG
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"CausalRootCauseAnalyzer",
|
|
13
|
+
"ErrorEvidence",
|
|
14
|
+
"RootCauseResult",
|
|
15
|
+
"evidence_from_issue",
|
|
16
|
+
"minimal_root_set",
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class _IssueLike(Protocol):
    """Structural type for objects exposing row/column/issue_type attributes.

    Any object with these three attributes satisfies the protocol; no
    inheritance is required.
    """

    # Row position of the issue.
    row: int
    # Name of the column the issue was found in.
    column: str
    # Machine-readable issue category.
    issue_type: str
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class ErrorEvidence(BaseModel):
    """Column-mapped detected error used for causal root-cause analysis.

    Args:
        index: Zero-based error index in the caller's selected issue list.
        row: Row index where the error was detected.
        column: Column where the error was detected.
        issue_type: Machine-readable issue type.
    """

    # Must be >= 0.
    index: int = Field(ge=0)
    # Must be >= 0.
    row: int = Field(ge=0)
    # Must be a non-empty string.
    column: str = Field(min_length=1)
    # Must be a non-empty string.
    issue_type: str = Field(min_length=1)

    # Instances are immutable after validation.
    model_config = {"frozen": True}
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class RootCauseResult(BaseModel):
    """Structured result returned by the root-cause analyzer.

    Args:
        root_indices: Minimal selected error indices.
        root_columns: Root columns corresponding to root_indices.
        covered_indices: Selected error indices covered by the root set.
        confidence: Mean path confidence from roots to covered errors.
        explanation: Human-readable explanation of the selected roots.
    """

    # Indices of the errors chosen as roots.
    root_indices: list[int]
    # Column of each root, aligned position-by-position with root_indices.
    root_columns: list[str]
    # Indices of the errors explained by at least one root.
    covered_indices: list[int]
    # Aggregate confidence score for the covered errors.
    confidence: float
    # Short sentence summarising the selection.
    explanation: str

    # Instances are immutable after validation.
    model_config = {"frozen": True}
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class CausalRootCauseAnalyzer:
    """Select the minimal set of root causes among detected errors.

    Args:
        dag: Column-level causal DAG.

    Example:
        >>> dag = CausalDAG(["discount_pct", "order_total"])
        >>> dag.add_edge("discount_pct", "order_total", confidence=0.9, provenance="formula")
        >>> errors = [
        ...     ErrorEvidence(index=0, row=1, column="discount_pct", issue_type="bad"),
        ...     ErrorEvidence(index=1, row=1, column="order_total", issue_type="bad"),
        ... ]
        >>> CausalRootCauseAnalyzer(dag).analyze(errors).root_indices
        [0]
    """

    def __init__(self, dag: CausalDAG) -> None:
        self._dag = dag

    def analyze(self, errors: list[ErrorEvidence] | tuple[ErrorEvidence, ...]) -> RootCauseResult:
        """Compute roots, coverage and confidence for the selected errors.

        An error is a root when no other selected error precedes it causally.
        Each error is then attributed to the first root that shares its column
        or can reach it in the DAG.

        Args:
            errors: Selected detected errors.

        Returns:
            RootCauseResult with roots, coverage, confidence, and explanation.
        """
        if not errors:
            return RootCauseResult(
                root_indices=[],
                root_columns=[],
                covered_indices=[],
                confidence=0.0,
                explanation="No errors were supplied.",
            )

        root_errors = [
            candidate
            for candidate in errors
            if not self._has_upstream_selected_error(candidate, errors)
        ]

        covered_indices: list[int] = []
        confidences: list[float] = []
        for error in errors:
            # First matching root wins, in root order.
            covering = next(
                (
                    root
                    for root in root_errors
                    if root.column == error.column
                    or self._dag.is_reachable(root.column, error.column)
                ),
                None,
            )
            if covering is None:
                continue
            covered_indices.append(error.index)
            confidences.append(self._dag.path_confidence(covering.column, error.column))

        mean_confidence = round(sum(confidences) / len(confidences), 4) if confidences else 0.0
        columns = [root.column for root in root_errors]
        return RootCauseResult(
            root_indices=[root.index for root in root_errors],
            root_columns=columns,
            covered_indices=covered_indices,
            confidence=mean_confidence,
            explanation=self._explain(columns, len(covered_indices), len(errors)),
        )

    def _has_upstream_selected_error(
        self,
        candidate: ErrorEvidence,
        errors: list[ErrorEvidence] | tuple[ErrorEvidence, ...],
    ) -> bool:
        """Tell whether some other selected error causally precedes candidate.

        Within one column the error with the lower index takes precedence;
        across columns precedence means reachability in the DAG.
        """
        for other in errors:
            if other.index == candidate.index:
                continue
            if other.column == candidate.column:
                if other.index < candidate.index:
                    return True
            elif self._dag.is_reachable(other.column, candidate.column):
                return True
        return False

    @staticmethod
    def _explain(root_columns: list[str], covered_count: int, total_count: int) -> str:
        """Format the one-sentence summary of the selected roots."""
        if root_columns:
            names = ", ".join(root_columns)
            return f"Selected {names} as minimal roots covering {covered_count}/{total_count} errors."
        return "No minimal roots were found."
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def minimal_root_set(
    errors: list[ErrorEvidence] | tuple[ErrorEvidence, ...], dag: CausalDAG
) -> RootCauseResult:
    """Run a one-off root-cause analysis without keeping an analyzer around.

    Args:
        errors: Selected detected errors.
        dag: Column-level causal DAG.

    Returns:
        Minimal root-cause result.
    """
    analyzer = CausalRootCauseAnalyzer(dag)
    return analyzer.analyze(errors)
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def evidence_from_issue(index: int, issue: _IssueLike | dict[str, Any]) -> ErrorEvidence:
    """Convert an Issue-like object or a plain dictionary into ErrorEvidence.

    Dictionaries are read with defaults: ``row`` falls back to 0, ``column``
    to an empty string, and the issue type is taken from ``"type"``, then
    ``"issue_type"``, then ``"unknown"``. Other objects are read through
    their ``row``, ``column`` and ``issue_type`` attributes.

    Args:
        index: Error index to assign.
        issue: Object or dictionary with row/column/type fields.

    Returns:
        ErrorEvidence instance.
    """
    if not isinstance(issue, dict):
        return ErrorEvidence(
            index=index,
            row=int(issue.row),
            column=str(issue.column),
            issue_type=str(issue.issue_type),
        )

    row = int(issue.get("row", 0))
    column = str(issue.get("column", ""))
    # "type" takes precedence over "issue_type" when both keys are present.
    kind = str(issue.get("type", issue.get("issue_type", "unknown")))
    return ErrorEvidence(index=index, row=row, column=column, issue_type=kind)
|
dataforge/cli/__init__.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Typer application entrypoint for DataForge.
|
|
2
|
+
|
|
3
|
+
Each CLI subcommand is defined in its own module under ``dataforge.cli.*``
|
|
4
|
+
and registered here. The ``app`` object is the entry point referenced by
|
|
5
|
+
``[project.scripts]`` in ``pyproject.toml``.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import typer
|
|
9
|
+
|
|
10
|
+
from dataforge.cli.audit import audit
|
|
11
|
+
from dataforge.cli.bench import bench
|
|
12
|
+
from dataforge.cli.constraints import constraints_app
|
|
13
|
+
from dataforge.cli.profile import profile
|
|
14
|
+
from dataforge.cli.release import release_app
|
|
15
|
+
from dataforge.cli.repair import repair
|
|
16
|
+
from dataforge.cli.revert import revert
|
|
17
|
+
from dataforge.cli.watch import watch
|
|
18
|
+
|
|
19
|
+
# Root Typer application; subcommands and sub-apps are attached to it below.
app: typer.Typer = typer.Typer(
    help="DataForge - AI-powered data-quality detection and repair.",
    # Invoking the CLI without any arguments prints the help text.
    no_args_is_help=True,
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@app.callback(invoke_without_command=True)
def _main(
    version: bool = typer.Option(
        False,
        "--version",
        "-V",
        help="Show version and exit.",
        is_eager=True,
    ),
) -> None:
    """DataForge - AI-powered data-quality detection and repair."""
    # Root callback: it only acts when --version / -V was passed.
    if not version:
        return

    # Imported lazily, only when the version is actually requested.
    from dataforge import __version__

    typer.echo(f"dataforge {__version__}")
    raise typer.Exit()
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# Register each imported command function on the root app under an explicit name.
app.command(name="profile")(profile)
app.command(name="repair")(repair)
app.command(name="revert")(revert)
app.command(name="audit")(audit)
app.command(name="bench")(bench)
app.command(name="watch")(watch)
# "constraints" and "release" are nested Typer apps mounted as command groups.
app.add_typer(constraints_app, name="constraints")
app.add_typer(release_app, name="release")
|