dataforge-07 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150) hide show
  1. dataforge/__init__.py +204 -0
  2. dataforge/__main__.py +5 -0
  3. dataforge/agent/__init__.py +16 -0
  4. dataforge/agent/providers.py +259 -0
  5. dataforge/agent/scratchpad.py +183 -0
  6. dataforge/agent/tool_actions.py +343 -0
  7. dataforge/bench/__init__.py +31 -0
  8. dataforge/bench/core.py +426 -0
  9. dataforge/bench/groq_client.py +386 -0
  10. dataforge/bench/methods.py +443 -0
  11. dataforge/bench/report.py +309 -0
  12. dataforge/bench/runner.py +247 -0
  13. dataforge/causal/__init__.py +21 -0
  14. dataforge/causal/dag.py +174 -0
  15. dataforge/causal/pc.py +232 -0
  16. dataforge/causal/root_cause.py +193 -0
  17. dataforge/cli/__init__.py +50 -0
  18. dataforge/cli/audit.py +70 -0
  19. dataforge/cli/bench.py +154 -0
  20. dataforge/cli/common.py +267 -0
  21. dataforge/cli/constraints.py +407 -0
  22. dataforge/cli/profile.py +147 -0
  23. dataforge/cli/release.py +166 -0
  24. dataforge/cli/repair.py +407 -0
  25. dataforge/cli/revert.py +139 -0
  26. dataforge/cli/watch.py +144 -0
  27. dataforge/datasets/__init__.py +25 -0
  28. dataforge/datasets/embedded/hospital/clean.csv +11 -0
  29. dataforge/datasets/embedded/hospital/dirty.csv +11 -0
  30. dataforge/datasets/real_world.py +290 -0
  31. dataforge/datasets/registry.py +103 -0
  32. dataforge/detectors/__init__.py +80 -0
  33. dataforge/detectors/base.py +145 -0
  34. dataforge/detectors/decimal_shift.py +166 -0
  35. dataforge/detectors/fd_violation.py +157 -0
  36. dataforge/detectors/type_mismatch.py +173 -0
  37. dataforge/engine/__init__.py +39 -0
  38. dataforge/engine/repair.py +905 -0
  39. dataforge/env/__init__.py +22 -0
  40. dataforge/env/environment.py +883 -0
  41. dataforge/env/observation.py +61 -0
  42. dataforge/env/openenv_core.py +161 -0
  43. dataforge/env/reward.py +128 -0
  44. dataforge/env/server.py +176 -0
  45. dataforge/evaluation_contract.py +76 -0
  46. dataforge/fixtures/hospital_10rows.csv +11 -0
  47. dataforge/fixtures/hospital_schema.yaml +17 -0
  48. dataforge/http/__init__.py +1 -0
  49. dataforge/http/problem.py +103 -0
  50. dataforge/integrations/__init__.py +1 -0
  51. dataforge/integrations/dbt.py +164 -0
  52. dataforge/observability.py +76 -0
  53. dataforge/py.typed +1 -0
  54. dataforge/release/__init__.py +1 -0
  55. dataforge/release/doctor.py +367 -0
  56. dataforge/release/full_vision.py +702 -0
  57. dataforge/release/gate.py +861 -0
  58. dataforge/release/playground_check.py +411 -0
  59. dataforge/repair_contract.py +468 -0
  60. dataforge/repairers/__init__.py +88 -0
  61. dataforge/repairers/base.py +77 -0
  62. dataforge/repairers/decimal_shift.py +43 -0
  63. dataforge/repairers/fd_violation.py +225 -0
  64. dataforge/repairers/type_mismatch.py +73 -0
  65. dataforge/safety/__init__.py +5 -0
  66. dataforge/safety/adversarial/attack_01_phone_pii.yaml +8 -0
  67. dataforge/safety/adversarial/attack_02_phone_pii.yaml +8 -0
  68. dataforge/safety/adversarial/attack_03_phone_pii.yaml +8 -0
  69. dataforge/safety/adversarial/attack_04_phone_pii.yaml +8 -0
  70. dataforge/safety/adversarial/attack_05_phone_pii.yaml +8 -0
  71. dataforge/safety/adversarial/attack_06_phone_pii.yaml +8 -0
  72. dataforge/safety/adversarial/attack_07_phone_pii.yaml +8 -0
  73. dataforge/safety/adversarial/attack_08_phone_pii.yaml +8 -0
  74. dataforge/safety/adversarial/attack_09_phone_pii.yaml +8 -0
  75. dataforge/safety/adversarial/attack_10_phone_pii.yaml +8 -0
  76. dataforge/safety/adversarial/attack_11_ssn_pii.yaml +8 -0
  77. dataforge/safety/adversarial/attack_12_ssn_pii.yaml +8 -0
  78. dataforge/safety/adversarial/attack_13_ssn_pii.yaml +8 -0
  79. dataforge/safety/adversarial/attack_14_ssn_pii.yaml +8 -0
  80. dataforge/safety/adversarial/attack_15_ssn_pii.yaml +8 -0
  81. dataforge/safety/adversarial/attack_16_ssn_pii.yaml +8 -0
  82. dataforge/safety/adversarial/attack_17_ssn_pii.yaml +8 -0
  83. dataforge/safety/adversarial/attack_18_ssn_pii.yaml +8 -0
  84. dataforge/safety/adversarial/attack_19_ssn_pii.yaml +8 -0
  85. dataforge/safety/adversarial/attack_20_ssn_pii.yaml +8 -0
  86. dataforge/safety/adversarial/attack_21_email_pii.yaml +8 -0
  87. dataforge/safety/adversarial/attack_22_email_pii.yaml +8 -0
  88. dataforge/safety/adversarial/attack_23_email_pii.yaml +8 -0
  89. dataforge/safety/adversarial/attack_24_email_pii.yaml +8 -0
  90. dataforge/safety/adversarial/attack_25_email_pii.yaml +8 -0
  91. dataforge/safety/adversarial/attack_26_email_pii.yaml +8 -0
  92. dataforge/safety/adversarial/attack_27_email_pii.yaml +8 -0
  93. dataforge/safety/adversarial/attack_28_email_pii.yaml +8 -0
  94. dataforge/safety/adversarial/attack_29_email_pii.yaml +8 -0
  95. dataforge/safety/adversarial/attack_30_email_pii.yaml +8 -0
  96. dataforge/safety/adversarial/attack_31_row_delete.yaml +7 -0
  97. dataforge/safety/adversarial/attack_32_row_delete.yaml +8 -0
  98. dataforge/safety/adversarial/attack_33_row_delete.yaml +7 -0
  99. dataforge/safety/adversarial/attack_34_row_delete.yaml +7 -0
  100. dataforge/safety/adversarial/attack_35_row_delete.yaml +7 -0
  101. dataforge/safety/adversarial/attack_36_row_delete.yaml +11 -0
  102. dataforge/safety/adversarial/attack_37_row_delete.yaml +7 -0
  103. dataforge/safety/adversarial/attack_38_row_delete.yaml +7 -0
  104. dataforge/safety/adversarial/attack_39_row_delete.yaml +8 -0
  105. dataforge/safety/adversarial/attack_40_row_delete.yaml +7 -0
  106. dataforge/safety/adversarial/attack_41_row_delete.yaml +7 -0
  107. dataforge/safety/adversarial/attack_42_row_delete.yaml +7 -0
  108. dataforge/safety/adversarial/attack_43_row_delete.yaml +7 -0
  109. dataforge/safety/adversarial/attack_44_row_delete.yaml +7 -0
  110. dataforge/safety/adversarial/attack_45_row_delete.yaml +8 -0
  111. dataforge/safety/adversarial/attack_46_row_delete.yaml +8 -0
  112. dataforge/safety/adversarial/attack_47_row_delete.yaml +7 -0
  113. dataforge/safety/adversarial/attack_48_row_delete.yaml +7 -0
  114. dataforge/safety/adversarial/attack_49_row_delete.yaml +8 -0
  115. dataforge/safety/adversarial/attack_50_row_delete.yaml +7 -0
  116. dataforge/safety/constitution.py +307 -0
  117. dataforge/safety/constitutions/default.yaml +40 -0
  118. dataforge/safety/filter.py +134 -0
  119. dataforge/schema_inference.py +620 -0
  120. dataforge/stores/__init__.py +46 -0
  121. dataforge/stores/base.py +73 -0
  122. dataforge/stores/cloud.py +78 -0
  123. dataforge/stores/csv.py +94 -0
  124. dataforge/stores/duckdb.py +313 -0
  125. dataforge/stores/patch_plan.py +178 -0
  126. dataforge/stores/registry.py +82 -0
  127. dataforge/stores/repair.py +121 -0
  128. dataforge/stores/revert.py +22 -0
  129. dataforge/stores/sql.py +27 -0
  130. dataforge/table.py +228 -0
  131. dataforge/transactions/__init__.py +34 -0
  132. dataforge/transactions/files.py +96 -0
  133. dataforge/transactions/log.py +613 -0
  134. dataforge/transactions/revert.py +102 -0
  135. dataforge/transactions/txn.py +104 -0
  136. dataforge/ui/__init__.py +1 -0
  137. dataforge/ui/profile_view.py +136 -0
  138. dataforge/ui/repair_diff.py +91 -0
  139. dataforge/verifier/__init__.py +55 -0
  140. dataforge/verifier/constraint_ir.py +155 -0
  141. dataforge/verifier/explain.py +47 -0
  142. dataforge/verifier/gate.py +5 -0
  143. dataforge/verifier/schema.py +111 -0
  144. dataforge/verifier/smt.py +433 -0
  145. dataforge_07-0.1.0.dist-info/METADATA +436 -0
  146. dataforge_07-0.1.0.dist-info/RECORD +150 -0
  147. dataforge_07-0.1.0.dist-info/WHEEL +5 -0
  148. dataforge_07-0.1.0.dist-info/entry_points.txt +3 -0
  149. dataforge_07-0.1.0.dist-info/licenses/LICENSE +176 -0
  150. dataforge_07-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,174 @@
1
+ """Column-level causal DAG utilities for root-cause analysis."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from typing import Any
7
+
8
+ import networkx as nx # type: ignore[import-untyped]
9
+
10
+ __all__ = ["CausalDAG", "CausalEdge"]
11
+
12
+
13
@dataclass(frozen=True)
class CausalEdge:
    """Immutable description of one directed edge in a causal graph.

    Attributes:
        source: Column the edge starts from.
        target: Column the edge points to.
        confidence: Confidence in the directed influence, from 0.0 to 1.0.
        provenance: Human-readable label naming where the edge came from.
    """

    # Field order is part of the positional constructor signature.
    source: str
    target: str
    confidence: float
    provenance: str
28
+
29
+
30
class CausalDAG:
    """Acyclic directed graph whose nodes are dataset columns.

    Every edge stores a ``confidence`` score and a ``provenance`` label.
    Acyclicity is enforced at insertion time: :meth:`add_edge` rejects
    self-edges and any edge whose reverse direction is already reachable.

    Args:
        nodes: Optional initial column names.

    Example:
        >>> dag = CausalDAG(["discount_pct", "order_total"])
        >>> dag.add_edge("discount_pct", "order_total", confidence=0.9, provenance="fd")
        >>> dag.is_reachable("discount_pct", "order_total")
        True
    """

    def __init__(self, nodes: list[str] | tuple[str, ...] = ()) -> None:
        self._graph: nx.DiGraph[Any] = nx.DiGraph()
        self._graph.add_nodes_from(nodes)

    @property
    def nodes(self) -> tuple[str, ...]:
        """Return graph nodes in insertion order."""
        return tuple(str(node) for node in self._graph.nodes)

    @property
    def edges(self) -> tuple[CausalEdge, ...]:
        """Return directed edges with metadata.

        An edge missing an attribute is reported with ``0.0`` confidence or
        ``"unknown"`` provenance respectively.
        """
        return tuple(
            CausalEdge(
                source=str(source),
                target=str(target),
                confidence=float(attrs.get("confidence", 0.0)),
                provenance=str(attrs.get("provenance", "unknown")),
            )
            for source, target, attrs in self._graph.edges(data=True)
        )

    @staticmethod
    def _clamp_confidence(confidence: float) -> float:
        """Clamp a confidence score into [0.0, 1.0], mapping NaN to 0.0."""
        # NaN is the only value unequal to itself. Without this guard the
        # clamp below would turn NaN into 1.0, because every comparison
        # against NaN is false and ``min(1.0, nan)`` keeps its first argument.
        if confidence != confidence:
            return 0.0
        return max(0.0, min(1.0, confidence))

    def add_node(self, column: str) -> None:
        """Add a column node if it is not already present.

        Args:
            column: Column name.
        """
        self._graph.add_node(column)

    def add_edge(
        self,
        source: str,
        target: str,
        *,
        confidence: float,
        provenance: str,
    ) -> None:
        """Add a directed causal edge while preserving acyclicity.

        Missing endpoints are created as nodes. Adding an edge that already
        exists replaces its stored confidence and provenance.

        Args:
            source: Source column name.
            target: Target column name.
            confidence: Confidence score; clamped into [0.0, 1.0], NaN becomes 0.0.
            provenance: Source of the edge.

        Raises:
            ValueError: If the edge is self-referential or creates a cycle.
        """
        if source == target:
            raise ValueError("Causal DAG does not allow self-edges")
        self._graph.add_node(source)
        self._graph.add_node(target)
        # A path from target back to source means source -> target closes a loop.
        if nx.has_path(self._graph, target, source):
            raise ValueError(f"Adding {source!r} -> {target!r} would create a cycle")
        self._graph.add_edge(
            source,
            target,
            confidence=self._clamp_confidence(confidence),
            provenance=provenance,
        )

    def successors(self, column: str) -> tuple[str, ...]:
        """Return direct downstream columns for a node.

        Args:
            column: Column name.

        Returns:
            A tuple of direct successor column names; empty for unknown columns.
        """
        if column not in self._graph:
            return ()
        return tuple(str(node) for node in self._graph.successors(column))

    def is_reachable(self, source: str, target: str) -> bool:
        """Return whether target is reachable from source.

        Args:
            source: Source column name.
            target: Target column name.

        Returns:
            True if source equals target (even when the column is not in the
            graph) or a directed path exists; False if either column is unknown.
        """
        if source == target:
            return True
        if source not in self._graph or target not in self._graph:
            return False
        return bool(nx.has_path(self._graph, source, target))

    def path_confidence(self, source: str, target: str) -> float:
        """Return the weakest-edge confidence on the shortest path.

        The path is the unweighted shortest path, so a longer path with
        stronger edges is not considered.

        Args:
            source: Source column name.
            target: Target column name.

        Returns:
            Confidence in [0.0, 1.0]: 1.0 when source equals target, 0.0 when
            no path exists.
        """
        if source == target:
            return 1.0
        if not self.is_reachable(source, target):
            return 0.0
        path = nx.shortest_path(self._graph, source, target)
        confidences = [
            float(self._graph.edges[head, tail].get("confidence", 0.0))
            for head, tail in zip(path, path[1:])
        ]
        return min(confidences, default=0.0)

    def minimal_root_columns(self, columns: list[str] | tuple[str, ...]) -> tuple[str, ...]:
        """Return selected columns that are not downstream of another selection.

        Args:
            columns: Selected error columns; duplicates are ignored.

        Returns:
            Minimal root columns in first-seen order.
        """
        # dict.fromkeys de-duplicates while keeping first-seen order.
        unique = list(dict.fromkeys(columns))
        return tuple(
            column
            for column in unique
            if not any(other != column and self.is_reachable(other, column) for other in unique)
        )
dataforge/causal/pc.py ADDED
@@ -0,0 +1,232 @@
1
+ """PC-based causal DAG discovery with functional-dependency priors."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import Any
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+ from scipy.stats import chi2_contingency # type: ignore[import-untyped]
11
+
12
+ from dataforge.causal.dag import CausalDAG
13
+ from dataforge.verifier.schema import Schema
14
+
15
+ __all__ = ["CausalDiscoveryResult", "discover_causal_dag"]
16
+
17
+
18
@dataclass(frozen=True)
class CausalDiscoveryResult:
    """Immutable bundle returned by causal discovery.

    Attributes:
        dag: Directed acyclic graph over columns.
        confidence_report: Column-pair confidence or diagnostic metadata,
            keyed by strings such as ``"source->target"``.
        warnings: Non-fatal discovery warnings.
    """

    dag: CausalDAG
    # default_factory gives every instance its own dict.
    confidence_report: dict[str, float] = field(default_factory=dict)
    warnings: tuple[str, ...] = ()
31
+
32
+
33
def discover_causal_dag(
    df: pd.DataFrame,
    schema: Schema | None = None,
    *,
    alpha: float = 0.05,
) -> CausalDiscoveryResult:
    """Infer a deterministic causal DAG from tabular data and FD priors.

    Edges are proposed by three sources, in priority order:

    1. Functional dependencies declared in ``schema`` (confidence 0.95).
    2. causal-learn PC adjacencies (confidence 0.55).
    3. Pairwise dependency tests over every column pair.

    The first source to propose an edge owns it. Later sources never
    overwrite the confidence or provenance of an existing edge, so the DAG
    agrees with ``confidence_report``, which also keeps the first value
    recorded per pair. Edges that would create a cycle are skipped and
    reported in ``warnings``; their proposed confidence still appears in
    ``confidence_report`` as diagnostic metadata.

    Args:
        df: Input DataFrame.
        schema: Optional declared schema with functional dependencies.
        alpha: Significance threshold for independence checks.

    Returns:
        CausalDiscoveryResult. A DAG is returned even if PC orientation is
        underdetermined; low-confidence edges are tagged as such.
    """
    columns = [str(column) for column in df.columns]
    dag = CausalDAG(columns)
    report: dict[str, float] = {}
    warnings: list[str] = []

    def propose(source: str, target: str, confidence: float, provenance: str) -> None:
        # Re-adding an existing edge would replace its metadata, letting a
        # low-confidence fallback downgrade a high-confidence prior.
        if target not in dag.successors(source):
            _try_add_edge(
                dag,
                source,
                target,
                confidence=confidence,
                provenance=provenance,
                warnings=warnings,
            )
        report.setdefault(f"{source}->{target}", confidence)

    if schema is not None:
        for fd in schema.functional_dependencies:
            for determinant in fd.determinant:
                propose(determinant, fd.dependent, 0.95, "functional_dependency_prior")

    cleaned = _prepare_for_pc(df)
    pc_edges, pc_warning = _run_causal_learn_pc(cleaned.to_numpy(), columns, alpha)
    if pc_warning:
        warnings.append(pc_warning)
    for source, target in pc_edges:
        propose(source, target, 0.55, "causal_learn_pc")

    for source, target, confidence in _pairwise_dependency_edges(df, alpha):
        propose(source, target, confidence, "pairwise_ci_fallback")

    return CausalDiscoveryResult(dag=dag, confidence_report=report, warnings=tuple(warnings))
95
+
96
+
97
+ def _prepare_for_pc(df: pd.DataFrame) -> pd.DataFrame:
98
+ """Return numeric data with no NaN values for causal-learn PC."""
99
+ prepared = pd.DataFrame(index=df.index)
100
+ for column in df.columns:
101
+ numeric = pd.to_numeric(df[column], errors="coerce")
102
+ if numeric.notna().sum() >= max(2, int(0.5 * len(df))):
103
+ fill = float(numeric.median()) if numeric.notna().any() else 0.0
104
+ prepared[str(column)] = numeric.fillna(fill)
105
+ else:
106
+ codes, _ = pd.factorize(df[column].astype("string").fillna("<missing>"), sort=True)
107
+ prepared[str(column)] = codes.astype(float)
108
+ return prepared.fillna(0.0)
109
+
110
+
111
+ def _run_causal_learn_pc(
112
+ data: np.ndarray[Any, Any], columns: list[str], alpha: float
113
+ ) -> tuple[list[tuple[str, str]], str | None]:
114
+ """Run causal-learn PC and return deterministic directed edges."""
115
+ try:
116
+ from causallearn.search.ConstraintBased.PC import pc # type: ignore[import-untyped]
117
+
118
+ result = pc(data, alpha=alpha, indep_test="fisherz", stable=True, show_progress=False)
119
+ except Exception as exc:
120
+ return [], f"causal-learn PC unavailable or failed: {exc}"
121
+
122
+ matrix = getattr(getattr(result, "G", None), "graph", None)
123
+ if matrix is None:
124
+ return [], "causal-learn PC returned no adjacency matrix"
125
+
126
+ edges: list[tuple[str, str]] = []
127
+ arr = np.asarray(matrix)
128
+ for i, source in enumerate(columns):
129
+ for j, target in enumerate(columns):
130
+ if i >= j or i >= arr.shape[0] or j >= arr.shape[1]:
131
+ continue
132
+ if arr[i, j] != 0 or arr[j, i] != 0:
133
+ edges.append((source, target))
134
+ return edges, None
135
+
136
+
137
+ def _pairwise_dependency_edges(df: pd.DataFrame, alpha: float) -> list[tuple[str, str, float]]:
138
+ """Return deterministic low-confidence edges for dependent column pairs."""
139
+ columns = [str(column) for column in df.columns]
140
+ edges: list[tuple[str, str, float]] = []
141
+ for i, source in enumerate(columns):
142
+ for target in columns[i + 1 :]:
143
+ p_value = _pairwise_p_value(df[source], df[target])
144
+ if p_value < alpha:
145
+ confidence = max(0.25, min(0.75, 1.0 - p_value))
146
+ edges.append((source, target, round(confidence, 4)))
147
+ return edges
148
+
149
+
150
def _pairwise_p_value(left: pd.Series[Any], right: pd.Series[Any]) -> float:
    """Return a p-value using categorical, continuous, or mixed tests.

    A series counts as continuous when at least ``max(5, 80% of rows)``
    values parse as numbers. Two continuous series use the HSIC test on
    median-filled values, two non-continuous series use chi-squared, and a
    mixed pair uses the mutual-information pseudo p-value.
    """
    left_parsed = pd.to_numeric(left, errors="coerce")
    right_parsed = pd.to_numeric(right, errors="coerce")
    left_is_continuous = left_parsed.notna().sum() >= max(5, int(0.8 * len(left)))
    right_is_continuous = right_parsed.notna().sum() >= max(5, int(0.8 * len(right)))

    if left_is_continuous != right_is_continuous:
        return _mutual_information_p_value(left, right)
    if not left_is_continuous:
        return _chi_squared_p_value(left, right)

    left_filled = left_parsed.fillna(left_parsed.median())
    right_filled = right_parsed.fillna(right_parsed.median())
    return _hsic_p_value(left_filled, right_filled)
164
+
165
+
166
+ def _chi_squared_p_value(left: pd.Series[Any], right: pd.Series[Any]) -> float:
167
+ """Return chi-squared independence p-value for categorical pairs."""
168
+ table = pd.crosstab(
169
+ left.astype("string").fillna("<missing>"), right.astype("string").fillna("<missing>")
170
+ )
171
+ if table.shape[0] < 2 or table.shape[1] < 2:
172
+ return 1.0
173
+ _, p_value, _, _ = chi2_contingency(table)
174
+ return float(p_value)
175
+
176
+
177
+ def _hsic_p_value(left: pd.Series[Any], right: pd.Series[Any]) -> float:
178
+ """Return HSIC p-value for continuous pairs, with correlation fallback."""
179
+ x = left.to_numpy(dtype=float).reshape(-1, 1)
180
+ y = right.to_numpy(dtype=float).reshape(-1, 1)
181
+ try:
182
+ from hyppo.independence import Hsic # type: ignore[import-untyped]
183
+
184
+ _, p_value = Hsic().test(x, y, reps=100, auto=True)
185
+ return float(p_value)
186
+ except Exception:
187
+ corr = abs(float(np.corrcoef(x[:, 0], y[:, 0])[0, 1]))
188
+ return 0.0 if corr > 0.75 else 1.0
189
+
190
+
191
def _mutual_information_p_value(left: pd.Series[Any], right: pd.Series[Any]) -> float:
    """Return a bounded pseudo p-value from binned mutual information.

    Both series are reduced to integer codes and cross-tabulated. The result
    is ``exp(-MI)`` with MI in nats: 1.0 for no shared information, lower
    for stronger dependence. Degenerate tables (empty, or a single row or
    column) yield 1.0.
    """
    table = pd.crosstab(_codes(left), _codes(right))
    n_rows, n_cols = table.shape
    total = float(table.to_numpy().sum())
    if total == 0.0 or n_rows < 2 or n_cols < 2:
        return 1.0

    joint = table.to_numpy(dtype=float) / total
    row_marginal = joint.sum(axis=1, keepdims=True)
    column_marginal = joint.sum(axis=0, keepdims=True)
    independent = row_marginal @ column_marginal

    # Empty cells contribute nothing and would otherwise produce log(0).
    occupied = joint > 0
    observed = joint[occupied]
    mutual_information = float((observed * np.log(observed / independent[occupied])).sum())
    return float(np.exp(-mutual_information))
206
+
207
+
208
+ def _codes(series: pd.Series[Any]) -> np.ndarray[Any, Any]:
209
+ """Return stable integer codes for a mixed-type series."""
210
+ numeric = pd.to_numeric(series, errors="coerce")
211
+ if numeric.notna().sum() >= max(5, int(0.8 * len(series))):
212
+ return pd.qcut(
213
+ numeric.fillna(numeric.median()), q=4, duplicates="drop"
214
+ ).cat.codes.to_numpy()
215
+ codes, _ = pd.factorize(series.astype("string").fillna("<missing>"), sort=True)
216
+ return codes
217
+
218
+
219
+ def _try_add_edge(
220
+ dag: CausalDAG,
221
+ source: str,
222
+ target: str,
223
+ *,
224
+ confidence: float,
225
+ provenance: str,
226
+ warnings: list[str],
227
+ ) -> None:
228
+ """Add an edge or record the cycle warning."""
229
+ try:
230
+ dag.add_edge(source, target, confidence=confidence, provenance=provenance)
231
+ except ValueError as exc:
232
+ warnings.append(str(exc))
@@ -0,0 +1,193 @@
1
+ """Minimal root-cause selection over detected errors and a causal DAG."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Protocol
6
+
7
+ from pydantic import BaseModel, Field
8
+
9
+ from dataforge.causal.dag import CausalDAG
10
+
11
+ __all__ = [
12
+ "CausalRootCauseAnalyzer",
13
+ "ErrorEvidence",
14
+ "RootCauseResult",
15
+ "evidence_from_issue",
16
+ "minimal_root_set",
17
+ ]
18
+
19
+
20
+ class _IssueLike(Protocol):
21
+ """Protocol for objects with row/column issue fields."""
22
+
23
+ row: int
24
+ column: str
25
+ issue_type: str
26
+
27
+
28
class ErrorEvidence(BaseModel):
    """One detected error, reduced to the fields root-cause analysis needs.

    Instances are frozen. Validation rejects negative ``index`` or ``row``
    values and empty ``column`` or ``issue_type`` strings.

    Args:
        index: Zero-based error index in the caller's selected issue list.
        row: Row index where the error was detected.
        column: Column where the error was detected.
        issue_type: Machine-readable issue type.
    """

    model_config = {"frozen": True}

    index: int = Field(ge=0)
    row: int = Field(ge=0)
    column: str = Field(min_length=1)
    issue_type: str = Field(min_length=1)
44
+
45
+
46
class RootCauseResult(BaseModel):
    """Frozen, structured outcome of a root-cause analysis.

    Args:
        root_indices: Minimal selected error indices.
        root_columns: Root columns corresponding to root_indices.
        covered_indices: Selected error indices covered by the root set.
        confidence: Mean path confidence from roots to covered errors.
        explanation: Human-readable explanation of the selected roots.
    """

    model_config = {"frozen": True}

    root_indices: list[int]
    root_columns: list[str]
    covered_indices: list[int]
    confidence: float
    explanation: str
64
+
65
+
66
class CausalRootCauseAnalyzer:
    """Compute minimal root causes for selected detected errors.

    An error is a root when no other selected error sits causally upstream
    of it. Errors sharing a column are represented by the one with the
    lowest index.

    Args:
        dag: Column-level causal DAG.

    Example:
        >>> dag = CausalDAG(["discount_pct", "order_total"])
        >>> dag.add_edge("discount_pct", "order_total", confidence=0.9, provenance="formula")
        >>> errors = [
        ...     ErrorEvidence(index=0, row=1, column="discount_pct", issue_type="bad"),
        ...     ErrorEvidence(index=1, row=1, column="order_total", issue_type="bad"),
        ... ]
        >>> CausalRootCauseAnalyzer(dag).analyze(errors).root_indices
        [0]
    """

    def __init__(self, dag: CausalDAG) -> None:
        self._dag = dag

    def analyze(self, errors: list[ErrorEvidence] | tuple[ErrorEvidence, ...]) -> RootCauseResult:
        """Return the minimal root set for the selected errors.

        Args:
            errors: Selected detected errors.

        Returns:
            RootCauseResult with roots, coverage, confidence, and explanation.
            Confidence is the mean, rounded to four decimals, of the path
            confidence from each covered error's first matching root.
        """
        if not errors:
            return RootCauseResult(
                root_indices=[],
                root_columns=[],
                covered_indices=[],
                confidence=0.0,
                explanation="No errors were supplied.",
            )

        roots: list[ErrorEvidence] = [
            candidate
            for candidate in errors
            if not self._has_upstream_selected_error(candidate, errors)
        ]

        covered: list[int] = []
        path_confidences: list[float] = []
        for error in errors:
            # Roots are tried in order; the first one that reaches the error wins.
            covering_root = next(
                (
                    root
                    for root in roots
                    if root.column == error.column
                    or self._dag.is_reachable(root.column, error.column)
                ),
                None,
            )
            if covering_root is None:
                continue
            covered.append(error.index)
            path_confidences.append(
                self._dag.path_confidence(covering_root.column, error.column)
            )

        if path_confidences:
            confidence = round(sum(path_confidences) / len(path_confidences), 4)
        else:
            confidence = 0.0

        root_columns = [root.column for root in roots]
        return RootCauseResult(
            root_indices=[root.index for root in roots],
            root_columns=root_columns,
            covered_indices=covered,
            confidence=confidence,
            explanation=self._explain(root_columns, len(covered), len(errors)),
        )

    def _has_upstream_selected_error(
        self,
        candidate: ErrorEvidence,
        errors: list[ErrorEvidence] | tuple[ErrorEvidence, ...],
    ) -> bool:
        """Return whether another selected error causally precedes candidate.

        Within the same column the lower index counts as upstream; across
        columns, upstream means the candidate's column is reachable from the
        other error's column.
        """
        return any(
            other.index != candidate.index
            and (
                other.index < candidate.index
                if other.column == candidate.column
                else self._dag.is_reachable(other.column, candidate.column)
            )
            for other in errors
        )

    @staticmethod
    def _explain(root_columns: list[str], covered_count: int, total_count: int) -> str:
        """Build a compact result explanation."""
        if root_columns:
            joined = ", ".join(root_columns)
            return (
                f"Selected {joined} as minimal roots covering {covered_count}/{total_count} errors."
            )
        return "No minimal roots were found."
154
+
155
+
156
def minimal_root_set(
    errors: list[ErrorEvidence] | tuple[ErrorEvidence, ...], dag: CausalDAG
) -> RootCauseResult:
    """Run a one-off root-cause analysis without keeping an analyzer around.

    Args:
        errors: Selected detected errors.
        dag: Column-level causal DAG.

    Returns:
        Minimal root-cause result.
    """
    analyzer = CausalRootCauseAnalyzer(dag)
    return analyzer.analyze(errors)
169
+
170
+
171
def evidence_from_issue(index: int, issue: _IssueLike | dict[str, Any]) -> ErrorEvidence:
    """Build ErrorEvidence from an Issue-like object or dictionary.

    For dictionaries a missing ``"row"`` defaults to 0 and a missing
    ``"column"`` to an empty string. The issue type is read from ``"type"``,
    then ``"issue_type"``, then defaults to ``"unknown"``. Other objects are
    read through their ``row``, ``column`` and ``issue_type`` attributes.

    Args:
        index: Error index to assign.
        issue: Object or dictionary with row/column/type fields.

    Returns:
        ErrorEvidence instance.
    """
    if isinstance(issue, dict):
        fields = {
            "row": int(issue.get("row", 0)),
            "column": str(issue.get("column", "")),
            "issue_type": str(issue.get("type", issue.get("issue_type", "unknown"))),
        }
    else:
        fields = {
            "row": int(issue.row),
            "column": str(issue.column),
            "issue_type": str(issue.issue_type),
        }
    return ErrorEvidence(index=index, **fields)
@@ -0,0 +1,50 @@
1
+ """Typer application entrypoint for DataForge.
2
+
3
+ Each CLI subcommand is defined in its own module under ``dataforge.cli.*``
4
+ and registered here. The ``app`` object is the entry point referenced by
5
+ ``[project.scripts]`` in ``pyproject.toml``.
6
+ """
7
+
8
+ import typer
9
+
10
+ from dataforge.cli.audit import audit
11
+ from dataforge.cli.bench import bench
12
+ from dataforge.cli.constraints import constraints_app
13
+ from dataforge.cli.profile import profile
14
+ from dataforge.cli.release import release_app
15
+ from dataforge.cli.repair import repair
16
+ from dataforge.cli.revert import revert
17
+ from dataforge.cli.watch import watch
18
+
19
# Root Typer application; subcommands are attached at the bottom of the module.
app: typer.Typer = typer.Typer(
    no_args_is_help=True,
    help="DataForge - AI-powered data-quality detection and repair.",
)
23
+
24
+
25
@app.callback(invoke_without_command=True)
def _main(
    version: bool = typer.Option(
        False,
        "--version",
        "-V",
        help="Show version and exit.",
        is_eager=True,
    ),
) -> None:
    """DataForge - AI-powered data-quality detection and repair."""
    if not version:
        return

    # Imported only when the version is actually requested.
    from dataforge import __version__

    typer.echo(f"dataforge {__version__}")
    raise typer.Exit()
41
+
42
+
43
# Single-function subcommands, registered in the order they are listed.
for _command_name, _command_handler in (
    ("profile", profile),
    ("repair", repair),
    ("revert", revert),
    ("audit", audit),
    ("bench", bench),
    ("watch", watch),
):
    app.command(name=_command_name)(_command_handler)
# Remove the loop variables so the module namespace stays unchanged.
del _command_name, _command_handler

# Nested command groups that bring their own Typer apps.
app.add_typer(constraints_app, name="constraints")
app.add_typer(release_app, name="release")