dataforge-07 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150) hide show
  1. dataforge/__init__.py +204 -0
  2. dataforge/__main__.py +5 -0
  3. dataforge/agent/__init__.py +16 -0
  4. dataforge/agent/providers.py +259 -0
  5. dataforge/agent/scratchpad.py +183 -0
  6. dataforge/agent/tool_actions.py +343 -0
  7. dataforge/bench/__init__.py +31 -0
  8. dataforge/bench/core.py +426 -0
  9. dataforge/bench/groq_client.py +386 -0
  10. dataforge/bench/methods.py +443 -0
  11. dataforge/bench/report.py +309 -0
  12. dataforge/bench/runner.py +247 -0
  13. dataforge/causal/__init__.py +21 -0
  14. dataforge/causal/dag.py +174 -0
  15. dataforge/causal/pc.py +232 -0
  16. dataforge/causal/root_cause.py +193 -0
  17. dataforge/cli/__init__.py +50 -0
  18. dataforge/cli/audit.py +70 -0
  19. dataforge/cli/bench.py +154 -0
  20. dataforge/cli/common.py +267 -0
  21. dataforge/cli/constraints.py +407 -0
  22. dataforge/cli/profile.py +147 -0
  23. dataforge/cli/release.py +166 -0
  24. dataforge/cli/repair.py +407 -0
  25. dataforge/cli/revert.py +139 -0
  26. dataforge/cli/watch.py +144 -0
  27. dataforge/datasets/__init__.py +25 -0
  28. dataforge/datasets/embedded/hospital/clean.csv +11 -0
  29. dataforge/datasets/embedded/hospital/dirty.csv +11 -0
  30. dataforge/datasets/real_world.py +290 -0
  31. dataforge/datasets/registry.py +103 -0
  32. dataforge/detectors/__init__.py +80 -0
  33. dataforge/detectors/base.py +145 -0
  34. dataforge/detectors/decimal_shift.py +166 -0
  35. dataforge/detectors/fd_violation.py +157 -0
  36. dataforge/detectors/type_mismatch.py +173 -0
  37. dataforge/engine/__init__.py +39 -0
  38. dataforge/engine/repair.py +905 -0
  39. dataforge/env/__init__.py +22 -0
  40. dataforge/env/environment.py +883 -0
  41. dataforge/env/observation.py +61 -0
  42. dataforge/env/openenv_core.py +161 -0
  43. dataforge/env/reward.py +128 -0
  44. dataforge/env/server.py +176 -0
  45. dataforge/evaluation_contract.py +76 -0
  46. dataforge/fixtures/hospital_10rows.csv +11 -0
  47. dataforge/fixtures/hospital_schema.yaml +17 -0
  48. dataforge/http/__init__.py +1 -0
  49. dataforge/http/problem.py +103 -0
  50. dataforge/integrations/__init__.py +1 -0
  51. dataforge/integrations/dbt.py +164 -0
  52. dataforge/observability.py +76 -0
  53. dataforge/py.typed +1 -0
  54. dataforge/release/__init__.py +1 -0
  55. dataforge/release/doctor.py +367 -0
  56. dataforge/release/full_vision.py +702 -0
  57. dataforge/release/gate.py +861 -0
  58. dataforge/release/playground_check.py +411 -0
  59. dataforge/repair_contract.py +468 -0
  60. dataforge/repairers/__init__.py +88 -0
  61. dataforge/repairers/base.py +77 -0
  62. dataforge/repairers/decimal_shift.py +43 -0
  63. dataforge/repairers/fd_violation.py +225 -0
  64. dataforge/repairers/type_mismatch.py +73 -0
  65. dataforge/safety/__init__.py +5 -0
  66. dataforge/safety/adversarial/attack_01_phone_pii.yaml +8 -0
  67. dataforge/safety/adversarial/attack_02_phone_pii.yaml +8 -0
  68. dataforge/safety/adversarial/attack_03_phone_pii.yaml +8 -0
  69. dataforge/safety/adversarial/attack_04_phone_pii.yaml +8 -0
  70. dataforge/safety/adversarial/attack_05_phone_pii.yaml +8 -0
  71. dataforge/safety/adversarial/attack_06_phone_pii.yaml +8 -0
  72. dataforge/safety/adversarial/attack_07_phone_pii.yaml +8 -0
  73. dataforge/safety/adversarial/attack_08_phone_pii.yaml +8 -0
  74. dataforge/safety/adversarial/attack_09_phone_pii.yaml +8 -0
  75. dataforge/safety/adversarial/attack_10_phone_pii.yaml +8 -0
  76. dataforge/safety/adversarial/attack_11_ssn_pii.yaml +8 -0
  77. dataforge/safety/adversarial/attack_12_ssn_pii.yaml +8 -0
  78. dataforge/safety/adversarial/attack_13_ssn_pii.yaml +8 -0
  79. dataforge/safety/adversarial/attack_14_ssn_pii.yaml +8 -0
  80. dataforge/safety/adversarial/attack_15_ssn_pii.yaml +8 -0
  81. dataforge/safety/adversarial/attack_16_ssn_pii.yaml +8 -0
  82. dataforge/safety/adversarial/attack_17_ssn_pii.yaml +8 -0
  83. dataforge/safety/adversarial/attack_18_ssn_pii.yaml +8 -0
  84. dataforge/safety/adversarial/attack_19_ssn_pii.yaml +8 -0
  85. dataforge/safety/adversarial/attack_20_ssn_pii.yaml +8 -0
  86. dataforge/safety/adversarial/attack_21_email_pii.yaml +8 -0
  87. dataforge/safety/adversarial/attack_22_email_pii.yaml +8 -0
  88. dataforge/safety/adversarial/attack_23_email_pii.yaml +8 -0
  89. dataforge/safety/adversarial/attack_24_email_pii.yaml +8 -0
  90. dataforge/safety/adversarial/attack_25_email_pii.yaml +8 -0
  91. dataforge/safety/adversarial/attack_26_email_pii.yaml +8 -0
  92. dataforge/safety/adversarial/attack_27_email_pii.yaml +8 -0
  93. dataforge/safety/adversarial/attack_28_email_pii.yaml +8 -0
  94. dataforge/safety/adversarial/attack_29_email_pii.yaml +8 -0
  95. dataforge/safety/adversarial/attack_30_email_pii.yaml +8 -0
  96. dataforge/safety/adversarial/attack_31_row_delete.yaml +7 -0
  97. dataforge/safety/adversarial/attack_32_row_delete.yaml +8 -0
  98. dataforge/safety/adversarial/attack_33_row_delete.yaml +7 -0
  99. dataforge/safety/adversarial/attack_34_row_delete.yaml +7 -0
  100. dataforge/safety/adversarial/attack_35_row_delete.yaml +7 -0
  101. dataforge/safety/adversarial/attack_36_row_delete.yaml +11 -0
  102. dataforge/safety/adversarial/attack_37_row_delete.yaml +7 -0
  103. dataforge/safety/adversarial/attack_38_row_delete.yaml +7 -0
  104. dataforge/safety/adversarial/attack_39_row_delete.yaml +8 -0
  105. dataforge/safety/adversarial/attack_40_row_delete.yaml +7 -0
  106. dataforge/safety/adversarial/attack_41_row_delete.yaml +7 -0
  107. dataforge/safety/adversarial/attack_42_row_delete.yaml +7 -0
  108. dataforge/safety/adversarial/attack_43_row_delete.yaml +7 -0
  109. dataforge/safety/adversarial/attack_44_row_delete.yaml +7 -0
  110. dataforge/safety/adversarial/attack_45_row_delete.yaml +8 -0
  111. dataforge/safety/adversarial/attack_46_row_delete.yaml +8 -0
  112. dataforge/safety/adversarial/attack_47_row_delete.yaml +7 -0
  113. dataforge/safety/adversarial/attack_48_row_delete.yaml +7 -0
  114. dataforge/safety/adversarial/attack_49_row_delete.yaml +8 -0
  115. dataforge/safety/adversarial/attack_50_row_delete.yaml +7 -0
  116. dataforge/safety/constitution.py +307 -0
  117. dataforge/safety/constitutions/default.yaml +40 -0
  118. dataforge/safety/filter.py +134 -0
  119. dataforge/schema_inference.py +620 -0
  120. dataforge/stores/__init__.py +46 -0
  121. dataforge/stores/base.py +73 -0
  122. dataforge/stores/cloud.py +78 -0
  123. dataforge/stores/csv.py +94 -0
  124. dataforge/stores/duckdb.py +313 -0
  125. dataforge/stores/patch_plan.py +178 -0
  126. dataforge/stores/registry.py +82 -0
  127. dataforge/stores/repair.py +121 -0
  128. dataforge/stores/revert.py +22 -0
  129. dataforge/stores/sql.py +27 -0
  130. dataforge/table.py +228 -0
  131. dataforge/transactions/__init__.py +34 -0
  132. dataforge/transactions/files.py +96 -0
  133. dataforge/transactions/log.py +613 -0
  134. dataforge/transactions/revert.py +102 -0
  135. dataforge/transactions/txn.py +104 -0
  136. dataforge/ui/__init__.py +1 -0
  137. dataforge/ui/profile_view.py +136 -0
  138. dataforge/ui/repair_diff.py +91 -0
  139. dataforge/verifier/__init__.py +55 -0
  140. dataforge/verifier/constraint_ir.py +155 -0
  141. dataforge/verifier/explain.py +47 -0
  142. dataforge/verifier/gate.py +5 -0
  143. dataforge/verifier/schema.py +111 -0
  144. dataforge/verifier/smt.py +433 -0
  145. dataforge_07-0.1.0.dist-info/METADATA +436 -0
  146. dataforge_07-0.1.0.dist-info/RECORD +150 -0
  147. dataforge_07-0.1.0.dist-info/WHEEL +5 -0
  148. dataforge_07-0.1.0.dist-info/entry_points.txt +3 -0
  149. dataforge_07-0.1.0.dist-info/licenses/LICENSE +176 -0
  150. dataforge_07-0.1.0.dist-info/top_level.txt +1 -0
dataforge/cli/audit.py ADDED
@@ -0,0 +1,70 @@
1
+ """CLI subcommand: ``dataforge audit <txn_id>``."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from pathlib import Path
7
+ from typing import Annotated
8
+
9
+ import typer
10
+ from rich.console import Console
11
+ from rich.panel import Panel
12
+
13
+ from dataforge.transactions import TransactionAuditVerdict, verify_transaction_log
14
+
15
# Shared Rich console bound to stderr; the human-readable audit panel is printed through it.
_console = Console(stderr=True)
16
+
17
+
18
def audit(
    txn_id: Annotated[
        str,
        typer.Argument(help="Transaction identifier to audit."),
    ],
    search_root: Annotated[
        Path | None,
        typer.Option(
            "--search-root",
            help="Root directory used to locate the transaction log.",
            exists=True,
            file_okay=False,
            dir_okay=True,
            readable=True,
        ),
    ] = None,
    log_path: Annotated[
        Path | None,
        typer.Option(
            "--log-path",
            help="Explicit JSONL transaction log path.",
            exists=True,
            file_okay=True,
            dir_okay=False,
            readable=True,
        ),
    ] = None,
    json_output: Annotated[
        bool,
        typer.Option("--json", help="Print the audit report as JSON."),
    ] = False,
) -> None:
    """Verify a transaction log's local hash chain."""
    # NOTE(review): the docstring is deliberately left as a single line; Typer
    # presumably surfaces it as the command's --help text, so extended notes
    # live in comments instead.
    #
    # Exit codes: 0 when the verdict is VERIFIED, 1 when it is
    # LEGACY_UNVERIFIED, and 2 for every other verdict.
    report = verify_transaction_log(txn_id, log_path=log_path, search_root=search_root)
    if json_output:
        # Machine-readable report, emitted through typer.echo.
        typer.echo(json.dumps(report.model_dump(mode="json"), indent=2, sort_keys=True))
    else:
        # Local import: only the human-readable branch needs markup escaping.
        from rich.markup import escape

        style = "green" if report.verdict == TransactionAuditVerdict.VERIFIED else "red"
        # The transaction id and error messages are dynamic text. Escape them
        # so "[...]" sequences are printed literally instead of being parsed
        # as Rich console markup.
        body = (
            f"Verdict: [bold]{report.verdict.value}[/bold]\n"
            f"Transaction: {escape(str(report.txn_id or txn_id))}\n"
            f"Events: {report.event_count}\n"
            f"Head SHA-256: {report.head_sha256 or 'n/a'}"
        )
        if report.errors:
            body += "\n\n" + "\n".join(f"- {escape(str(error))}" for error in report.errors)
        _console.print(Panel(body, title="Transaction Audit", style=style))

    if report.verdict == TransactionAuditVerdict.VERIFIED:
        raise typer.Exit(code=0)
    if report.verdict == TransactionAuditVerdict.LEGACY_UNVERIFIED:
        raise typer.Exit(code=1)
    raise typer.Exit(code=2)
dataforge/cli/bench.py ADDED
@@ -0,0 +1,154 @@
1
+ """CLI subcommand: ``dataforge bench``."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from collections.abc import Callable
7
+ from pathlib import Path
8
+ from typing import Annotated, Any
9
+
10
+ import typer
11
+ from rich.console import Console
12
+ from rich.panel import Panel
13
+ from rich.table import Table
14
+
15
# Rich console bound to stderr; benchmark failures are reported through it.
_console = Console(stderr=True)
# Cached benchmark entry point; stays None until _runner() imports it on first use.
run_agent_comparison: Callable[..., Any] | None = None
17
+
18
+
19
def _parse_csv_list(raw_value: str) -> list[str]:
    """Split a comma-delimited option value into trimmed, non-empty entries."""
    entries: list[str] = []
    for piece in raw_value.split(","):
        trimmed = piece.strip()
        # Blank segments (e.g. from "a,,b" or a trailing comma) are dropped.
        if trimmed:
            entries.append(trimmed)
    return entries
23
+
24
+
25
def _parse_seed_list(raw_value: str | None) -> list[int] | None:
    """Convert an optional comma-delimited seed string into integers.

    Returns ``None`` when no value was supplied; otherwise every non-blank
    segment is parsed with ``int`` (a non-numeric segment raises ``ValueError``).
    """
    if raw_value is None:
        return None
    parsed: list[int] = []
    for segment in raw_value.split(","):
        token = segment.strip()
        if token:
            parsed.append(int(token))
    return parsed
31
+
32
+
33
def _runner() -> Callable[..., Any]:
    """Return the benchmark runner, importing it only on first use.

    The import is deferred so that loading this CLI module does not pull in
    ``dataforge.bench.runner``; the resolved callable is cached in the
    module-level ``run_agent_comparison`` for later calls.
    """
    global run_agent_comparison
    if run_agent_comparison is not None:
        return run_agent_comparison

    from dataforge.bench.runner import run_agent_comparison as loaded_runner

    run_agent_comparison = loaded_runner
    return loaded_runner
41
+
42
+
43
def bench(
    methods: Annotated[
        str,
        typer.Option(
            "--methods",
            help="Comma-separated benchmark methods.",
        ),
    ] = "heuristic,llm_zeroshot",
    datasets: Annotated[
        str,
        typer.Option(
            "--datasets",
            help="Comma-separated benchmark datasets.",
        ),
    ] = "hospital",
    seeds: Annotated[
        int,
        typer.Option("--seeds", help="Number of seeds per method/dataset pair."),
    ] = 3,
    seed_list: Annotated[
        str | None,
        typer.Option(
            "--seed-list",
            help="Explicit comma-separated seed list. Overrides --seeds for reproducibility.",
        ),
    ] = None,
    really_run_big_bench: Annotated[
        bool,
        typer.Option(
            "--really-run-big-bench",
            help="Override the free-tier benchmark quota guard when estimated calls exceed 500.",
        ),
    ] = False,
    output_json: Annotated[
        Path,
        typer.Option(
            "--output-json",
            help="Where to write eval/results/agent_comparison.json.",
        ),
    ] = Path("eval/results/agent_comparison.json"),
    cache_root: Annotated[
        Path | None,
        typer.Option(
            "--cache-root",
            help="Benchmark dataset cache root. Defaults to the user DataForge cache.",
        ),
    ] = None,
    verify_dataset_hashes: Annotated[
        bool,
        typer.Option(
            "--verify-dataset-hashes/--no-verify-dataset-hashes",
            help="Verify cached benchmark bytes against pinned upstream hashes.",
        ),
    ] = True,
    json_output: Annotated[
        bool,
        typer.Option("--json", help="Print benchmark results as JSON."),
    ] = False,
) -> None:
    """Run real-world benchmark methods across cached benchmark datasets."""
    # NOTE(review): the docstring is deliberately left as a single line; Typer
    # presumably surfaces it as the command's --help text, so extended notes
    # live in comments instead.
    #
    # Any failure while parsing options or running the benchmark is rendered
    # as an error panel on stderr and turned into exit code 2.
    try:
        output = _runner()(
            methods=_parse_csv_list(methods),
            datasets=_parse_csv_list(datasets),
            seeds=seeds,
            seed_list=_parse_seed_list(seed_list),
            output_json=output_json,
            really_run_big_bench=really_run_big_bench,
            cache_root=cache_root,
            verify_dataset_hashes=verify_dataset_hashes,
        )
    except Exception as exc:
        # Local import: only the failure path needs markup escaping.
        from rich.markup import escape

        # The exception text is arbitrary; escape it so "[...]" sequences in
        # the message are shown literally rather than parsed as Rich markup.
        _console.print(
            Panel(
                f"[bold red]{escape(str(exc))}[/bold red]",
                title="Benchmark Error",
                style="red",
            )
        )
        raise typer.Exit(code=2) from exc

    if json_output:
        typer.echo(json.dumps(output.model_dump(mode="json"), indent=2, sort_keys=True))
        return

    table = Table(title="DataForge Benchmark Summary")
    for heading in ("Method", "Dataset", "Status", "F1", "Avg Steps", "Quota"):
        table.add_column(heading)
    for aggregate in output.aggregates:
        # Metrics that are None are shown as "Skipped" instead of a number.
        table.add_row(
            aggregate.method,
            aggregate.dataset,
            aggregate.status,
            "Skipped" if aggregate.f1_mean is None else f"{aggregate.f1_mean:.4f}",
            "Skipped" if aggregate.avg_steps_mean is None else f"{aggregate.avg_steps_mean:.2f}",
            "Skipped"
            if aggregate.quota_units_mean is None
            else f"{aggregate.quota_units_mean:.4f}",
        )

    # The summary goes to stdout (default Console), unlike errors on stderr.
    stdout_console = Console()
    stdout_console.print(table)
    if any(aggregate.status == "skipped" for aggregate in output.aggregates):
        stdout_console.print(
            Panel(
                "Some LLM baselines were skipped. Set DATAFORGE_LLM_PROVIDER=groq and GROQ_API_KEY to enable them.",
                title="Benchmark Warning",
                style="yellow",
            )
        )
dataforge/cli/common.py ADDED
@@ -0,0 +1,267 @@
1
+ """Shared helpers for DataForge CLI commands."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Iterable
6
+ from importlib import resources
7
+ from pathlib import Path
8
+ from typing import cast
9
+
10
+ import typer
11
+ import yaml
12
+
13
+ from dataforge.table import Table
14
+ from dataforge.table import read_csv as read_table_csv
15
+ from dataforge.verifier.schema import (
16
+ AcceptedValues,
17
+ AggregateDependency,
18
+ AggregateLiteral,
19
+ DomainBound,
20
+ FunctionalDependency,
21
+ RegexConstraint,
22
+ RelationshipConstraint,
23
+ Schema,
24
+ )
25
+
26
# Maps the relative paths users may type to the fixture resources shipped
# inside the ``dataforge`` package.
_PACKAGED_DEMO_FIXTURES = {
    "fixtures/hospital_10rows.csv": "fixtures/hospital_10rows.csv",
    "fixtures/hospital_schema.yaml": "fixtures/hospital_schema.yaml",
}


def resolve_cli_path(path: Path) -> Path:
    """Resolve a user path, including DataForge's packaged demo fixture aliases.

    Args:
        path: Path exactly as supplied on the command line.

    Returns:
        ``path`` itself when it exists on disk, or when it is not one of the
        known fixture aliases, or when the packaged fixture is missing;
        otherwise the location of the packaged fixture.
    """
    if path.exists():
        return path

    normalized = path.as_posix().replace("\\", "/")
    # Drop only literal "./" prefixes. ``str.lstrip("./")`` strips any run of
    # "." and "/" characters, which made "../fixtures/..." and absolute
    # "/fixtures/..." paths silently alias to the packaged demo files.
    while normalized.startswith("./"):
        normalized = normalized[2:]

    packaged_name = _PACKAGED_DEMO_FIXTURES.get(normalized)
    if packaged_name is None:
        return path

    fixture = resources.files("dataforge").joinpath(packaged_name)
    if not fixture.is_file():
        return path
    return Path(str(fixture))
46
+
47
+
48
def _stringified_items(candidate: object) -> tuple[str, ...] | None:
    """Return the items of a non-string iterable as strings, else ``None``.

    ``str`` and ``bytes`` are iterable but are treated as scalars here, so
    they yield ``None`` just like non-iterable values.
    """
    if isinstance(candidate, (str, bytes)) or not isinstance(candidate, Iterable):
        return None
    return tuple(str(item) for item in candidate)


def schema_from_mapping(raw_mapping: object) -> Schema:
    """Translate a decoded YAML payload into a ``Schema``.

    Sections with an unexpected shape are skipped rather than rejected; only
    a top-level payload that is neither ``None`` nor a mapping is an error.

    Args:
        raw_mapping: Untrusted YAML-decoded value.

    Returns:
        Parsed Schema object.

    Raises:
        typer.BadParameter: If the payload is not a mapping.
    """
    if raw_mapping is not None and not isinstance(raw_mapping, dict):
        raise typer.BadParameter("Schema payload must be a YAML mapping.")
    payload: dict[str, object] = {} if raw_mapping is None else raw_mapping

    # --- column types -----------------------------------------------------
    column_spec = payload.get("columns", {})
    columns: dict[str, str] = (
        {str(name): str(kind) for name, kind in column_spec.items()}
        if isinstance(column_spec, dict)
        else {}
    )

    # --- functional dependencies -------------------------------------------
    fd_specs = payload.get("functional_dependencies", [])
    fds: list[FunctionalDependency] = [
        FunctionalDependency(
            determinant=_stringified_items(fd_spec.get("determinant", [])) or (),
            dependent=str(fd_spec.get("dependent", "")),
        )
        for fd_spec in (fd_specs if isinstance(fd_specs, list) else [])
        if isinstance(fd_spec, dict)
    ]

    # --- plain column sets --------------------------------------------------
    pii_columns = frozenset(_stringified_items(payload.get("pii_columns", [])) or ())
    primary_key_columns = frozenset(
        _stringified_items(payload.get("primary_key_columns", [])) or ()
    )
    not_null_columns = frozenset(_stringified_items(payload.get("not_null_columns", [])) or ())
    unique_columns = frozenset(_stringified_items(payload.get("unique_columns", [])) or ())

    # --- accepted values: {column: [values]} or [{column, values}] ----------
    accepted_spec = payload.get("accepted_values", {})
    if isinstance(accepted_spec, dict):
        accepted_pairs = list(accepted_spec.items())
    elif isinstance(accepted_spec, list):
        accepted_pairs = [
            (rule.get("column", ""), rule.get("values", []))
            for rule in accepted_spec
            if isinstance(rule, dict)
        ]
    else:
        accepted_pairs = []
    accepted_values: list[AcceptedValues] = []
    for column, allowed in accepted_pairs:
        allowed_strings = _stringified_items(allowed)
        # An empty collection still produces a rule; only scalars are skipped.
        if allowed_strings is not None:
            accepted_values.append(AcceptedValues(column=str(column), values=allowed_strings))

    # --- regex constraints: {column: pattern} or [{column, pattern}] --------
    regex_spec = payload.get("regex_constraints", {})
    if isinstance(regex_spec, dict):
        pattern_pairs = list(regex_spec.items())
    elif isinstance(regex_spec, list):
        pattern_pairs = [
            (rule.get("column", ""), rule.get("pattern", ""))
            for rule in regex_spec
            if isinstance(rule, dict)
        ]
    else:
        pattern_pairs = []
    regex_constraints: list[RegexConstraint] = [
        RegexConstraint(column=str(column), pattern=str(pattern))
        for column, pattern in pattern_pairs
    ]

    # --- relationships ------------------------------------------------------
    relationship_specs = payload.get("relationships", [])
    relationships: list[RelationshipConstraint] = [
        RelationshipConstraint(
            column=str(rule.get("column", "")),
            reference=str(rule.get("reference", "")),
            reference_column=str(rule.get("reference_column", "")),
        )
        for rule in (relationship_specs if isinstance(relationship_specs, list) else [])
        if isinstance(rule, dict)
    ]

    # --- numeric domain bounds ----------------------------------------------
    bounds: list[DomainBound] = []
    bound_specs = payload.get("domain_bounds", {})
    if isinstance(bound_specs, dict):
        for column, limits in bound_specs.items():
            if not isinstance(limits, dict):
                continue
            lower = limits.get("min")
            upper = limits.get("max")
            bounds.append(
                DomainBound(
                    column=str(column),
                    # A missing or null limit leaves that side unbounded.
                    min_value=None if lower is None else float(lower),
                    max_value=None if upper is None else float(upper),
                    inclusive_min=bool(limits.get("inclusive_min", True)),
                    inclusive_max=bool(limits.get("inclusive_max", True)),
                )
            )

    # --- aggregate dependencies ---------------------------------------------
    aggregate_dependencies: list[AggregateDependency] = []
    aggregate_specs = payload.get("aggregate_dependencies", [])
    for spec in aggregate_specs if isinstance(aggregate_specs, list) else []:
        if not isinstance(spec, dict):
            continue
        aggregate_kind = str(spec.get("aggregate", "")).lower()
        # Only "sum" and "avg" are recognised; anything else is skipped.
        if aggregate_kind not in {"sum", "avg"}:
            continue
        aggregate_dependencies.append(
            AggregateDependency(
                source_column=str(spec.get("source_column", "")),
                aggregate=cast(AggregateLiteral, aggregate_kind),
                target_column=str(spec.get("target_column", "")),
                group_by=_stringified_items(spec.get("group_by", [])) or (),
            )
        )

    return Schema(
        columns=columns,
        functional_dependencies=tuple(fds),
        pii_columns=pii_columns,
        primary_key_columns=primary_key_columns,
        not_null_columns=not_null_columns,
        unique_columns=unique_columns,
        accepted_values=tuple(accepted_values),
        regex_constraints=tuple(regex_constraints),
        relationships=tuple(relationships),
        domain_bounds=tuple(bounds),
        aggregate_dependencies=tuple(aggregate_dependencies),
    )
234
+
235
+
236
def load_schema(schema_path: Path) -> Schema:
    """Load a Schema from a YAML file.

    Args:
        schema_path: Path to the YAML schema file.

    Returns:
        Parsed Schema object.

    Raises:
        typer.BadParameter: If the schema file is malformed or unreadable.
    """
    # Reading and parsing are separated so each failure gets its own message.
    try:
        text = schema_path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        # UnicodeDecodeError is a ValueError, not an OSError, so it must be
        # listed explicitly for non-UTF-8 files to be reported cleanly.
        raise typer.BadParameter(f"Could not read schema file '{schema_path}': {exc}") from exc

    try:
        raw = yaml.safe_load(text)
    except yaml.YAMLError as exc:
        # Syntax errors in the YAML are part of the documented "malformed"
        # contract and should not escape as a raw parser exception.
        raise typer.BadParameter(
            f"Schema file '{schema_path}' is not valid YAML: {exc}"
        ) from exc

    # Checked here (as well as downstream) so the message can name the file.
    if raw is not None and not isinstance(raw, dict):
        raise typer.BadParameter(f"Schema file '{schema_path}' must be a YAML mapping.")
    return schema_from_mapping(raw)
256
+
257
+
258
def read_csv(path: Path) -> Table:
    """Load a CSV file as a DataForge table.

    Thin CLI-facing wrapper that delegates to ``dataforge.table.read_csv``;
    the string-preserving behaviour comes from that reader, not from here.

    Args:
        path: CSV path.

    Returns:
        The table produced by the underlying reader.
    """
    table = read_table_csv(path)
    return table