dataforge-07 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataforge/__init__.py +204 -0
- dataforge/__main__.py +5 -0
- dataforge/agent/__init__.py +16 -0
- dataforge/agent/providers.py +259 -0
- dataforge/agent/scratchpad.py +183 -0
- dataforge/agent/tool_actions.py +343 -0
- dataforge/bench/__init__.py +31 -0
- dataforge/bench/core.py +426 -0
- dataforge/bench/groq_client.py +386 -0
- dataforge/bench/methods.py +443 -0
- dataforge/bench/report.py +309 -0
- dataforge/bench/runner.py +247 -0
- dataforge/causal/__init__.py +21 -0
- dataforge/causal/dag.py +174 -0
- dataforge/causal/pc.py +232 -0
- dataforge/causal/root_cause.py +193 -0
- dataforge/cli/__init__.py +50 -0
- dataforge/cli/audit.py +70 -0
- dataforge/cli/bench.py +154 -0
- dataforge/cli/common.py +267 -0
- dataforge/cli/constraints.py +407 -0
- dataforge/cli/profile.py +147 -0
- dataforge/cli/release.py +166 -0
- dataforge/cli/repair.py +407 -0
- dataforge/cli/revert.py +139 -0
- dataforge/cli/watch.py +144 -0
- dataforge/datasets/__init__.py +25 -0
- dataforge/datasets/embedded/hospital/clean.csv +11 -0
- dataforge/datasets/embedded/hospital/dirty.csv +11 -0
- dataforge/datasets/real_world.py +290 -0
- dataforge/datasets/registry.py +103 -0
- dataforge/detectors/__init__.py +80 -0
- dataforge/detectors/base.py +145 -0
- dataforge/detectors/decimal_shift.py +166 -0
- dataforge/detectors/fd_violation.py +157 -0
- dataforge/detectors/type_mismatch.py +173 -0
- dataforge/engine/__init__.py +39 -0
- dataforge/engine/repair.py +905 -0
- dataforge/env/__init__.py +22 -0
- dataforge/env/environment.py +883 -0
- dataforge/env/observation.py +61 -0
- dataforge/env/openenv_core.py +161 -0
- dataforge/env/reward.py +128 -0
- dataforge/env/server.py +176 -0
- dataforge/evaluation_contract.py +76 -0
- dataforge/fixtures/hospital_10rows.csv +11 -0
- dataforge/fixtures/hospital_schema.yaml +17 -0
- dataforge/http/__init__.py +1 -0
- dataforge/http/problem.py +103 -0
- dataforge/integrations/__init__.py +1 -0
- dataforge/integrations/dbt.py +164 -0
- dataforge/observability.py +76 -0
- dataforge/py.typed +1 -0
- dataforge/release/__init__.py +1 -0
- dataforge/release/doctor.py +367 -0
- dataforge/release/full_vision.py +702 -0
- dataforge/release/gate.py +861 -0
- dataforge/release/playground_check.py +411 -0
- dataforge/repair_contract.py +468 -0
- dataforge/repairers/__init__.py +88 -0
- dataforge/repairers/base.py +77 -0
- dataforge/repairers/decimal_shift.py +43 -0
- dataforge/repairers/fd_violation.py +225 -0
- dataforge/repairers/type_mismatch.py +73 -0
- dataforge/safety/__init__.py +5 -0
- dataforge/safety/adversarial/attack_01_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_02_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_03_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_04_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_05_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_06_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_07_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_08_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_09_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_10_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_11_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_12_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_13_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_14_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_15_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_16_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_17_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_18_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_19_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_20_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_21_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_22_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_23_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_24_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_25_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_26_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_27_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_28_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_29_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_30_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_31_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_32_row_delete.yaml +8 -0
- dataforge/safety/adversarial/attack_33_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_34_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_35_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_36_row_delete.yaml +11 -0
- dataforge/safety/adversarial/attack_37_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_38_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_39_row_delete.yaml +8 -0
- dataforge/safety/adversarial/attack_40_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_41_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_42_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_43_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_44_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_45_row_delete.yaml +8 -0
- dataforge/safety/adversarial/attack_46_row_delete.yaml +8 -0
- dataforge/safety/adversarial/attack_47_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_48_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_49_row_delete.yaml +8 -0
- dataforge/safety/adversarial/attack_50_row_delete.yaml +7 -0
- dataforge/safety/constitution.py +307 -0
- dataforge/safety/constitutions/default.yaml +40 -0
- dataforge/safety/filter.py +134 -0
- dataforge/schema_inference.py +620 -0
- dataforge/stores/__init__.py +46 -0
- dataforge/stores/base.py +73 -0
- dataforge/stores/cloud.py +78 -0
- dataforge/stores/csv.py +94 -0
- dataforge/stores/duckdb.py +313 -0
- dataforge/stores/patch_plan.py +178 -0
- dataforge/stores/registry.py +82 -0
- dataforge/stores/repair.py +121 -0
- dataforge/stores/revert.py +22 -0
- dataforge/stores/sql.py +27 -0
- dataforge/table.py +228 -0
- dataforge/transactions/__init__.py +34 -0
- dataforge/transactions/files.py +96 -0
- dataforge/transactions/log.py +613 -0
- dataforge/transactions/revert.py +102 -0
- dataforge/transactions/txn.py +104 -0
- dataforge/ui/__init__.py +1 -0
- dataforge/ui/profile_view.py +136 -0
- dataforge/ui/repair_diff.py +91 -0
- dataforge/verifier/__init__.py +55 -0
- dataforge/verifier/constraint_ir.py +155 -0
- dataforge/verifier/explain.py +47 -0
- dataforge/verifier/gate.py +5 -0
- dataforge/verifier/schema.py +111 -0
- dataforge/verifier/smt.py +433 -0
- dataforge_07-0.1.0.dist-info/METADATA +436 -0
- dataforge_07-0.1.0.dist-info/RECORD +150 -0
- dataforge_07-0.1.0.dist-info/WHEEL +5 -0
- dataforge_07-0.1.0.dist-info/entry_points.txt +3 -0
- dataforge_07-0.1.0.dist-info/licenses/LICENSE +176 -0
- dataforge_07-0.1.0.dist-info/top_level.txt +1 -0
dataforge/cli/audit.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""CLI subcommand: ``dataforge audit <txn_id>``."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Annotated
|
|
8
|
+
|
|
9
|
+
import typer
|
|
10
|
+
from rich.console import Console
|
|
11
|
+
from rich.panel import Panel
|
|
12
|
+
|
|
13
|
+
from dataforge.transactions import TransactionAuditVerdict, verify_transaction_log
|
|
14
|
+
|
|
15
|
+
# Rich console bound to stderr; the audit panel is printed through it, while
# the --json report goes through typer.echo instead.
_console = Console(stderr=True)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def audit(
    txn_id: Annotated[
        str,
        typer.Argument(help="Transaction identifier to audit."),
    ],
    search_root: Annotated[
        Path | None,
        typer.Option(
            "--search-root",
            help="Root directory used to locate the transaction log.",
            exists=True,
            file_okay=False,
            dir_okay=True,
            readable=True,
        ),
    ] = None,
    log_path: Annotated[
        Path | None,
        typer.Option(
            "--log-path",
            help="Explicit JSONL transaction log path.",
            exists=True,
            file_okay=True,
            dir_okay=False,
            readable=True,
        ),
    ] = None,
    json_output: Annotated[
        bool,
        typer.Option("--json", help="Print the audit report as JSON."),
    ] = False,
) -> None:
    """Verify a transaction log's local hash chain.

    The report produced by ``verify_transaction_log`` is printed either as
    JSON (via ``typer.echo``) or as a Rich panel on the stderr console.

    Args:
        txn_id: Transaction identifier to audit.
        search_root: Optional root directory forwarded to the verifier.
        log_path: Optional explicit JSONL log path forwarded to the verifier.
        json_output: When true, print the report as sorted, indented JSON
            instead of the panel.

    Raises:
        typer.Exit: Always. Exit code 0 for a ``VERIFIED`` verdict, 1 for
            ``LEGACY_UNVERIFIED``, and 2 for any other verdict.
    """
    # Local import: rich is already a dependency of this module, and only the
    # panel branch needs the helper.
    from rich.markup import escape

    report = verify_transaction_log(txn_id, log_path=log_path, search_root=search_root)
    if json_output:
        typer.echo(json.dumps(report.model_dump(mode="json"), indent=2, sort_keys=True))
    else:
        style = "green" if report.verdict == TransactionAuditVerdict.VERIFIED else "red"
        # The transaction id comes from the command line and the error strings
        # come from the log verifier. Escape both so bracketed text is shown
        # literally instead of being parsed as Rich markup tags.
        shown_txn_id = escape(str(report.txn_id or txn_id))
        body = (
            f"Verdict: [bold]{report.verdict.value}[/bold]\n"
            f"Transaction: {shown_txn_id}\n"
            f"Events: {report.event_count}\n"
            f"Head SHA-256: {report.head_sha256 or 'n/a'}"
        )
        if report.errors:
            body += "\n\n" + "\n".join(f"- {escape(str(error))}" for error in report.errors)
        _console.print(Panel(body, title="Transaction Audit", style=style))

    if report.verdict == TransactionAuditVerdict.VERIFIED:
        raise typer.Exit(code=0)
    if report.verdict == TransactionAuditVerdict.LEGACY_UNVERIFIED:
        raise typer.Exit(code=1)
    raise typer.Exit(code=2)
|
dataforge/cli/bench.py
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
"""CLI subcommand: ``dataforge bench``."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from collections.abc import Callable
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Annotated, Any
|
|
9
|
+
|
|
10
|
+
import typer
|
|
11
|
+
from rich.console import Console
|
|
12
|
+
from rich.panel import Panel
|
|
13
|
+
from rich.table import Table
|
|
14
|
+
|
|
15
|
+
# Rich console bound to stderr; used below for the "Benchmark Error" panel.
_console = Console(stderr=True)
|
|
16
|
+
# Module-level slot for the benchmark runner. It starts as None and is filled
# in by _runner() the first time the runner is needed.
run_agent_comparison: Callable[..., Any] | None = None
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _parse_csv_list(raw_value: str) -> list[str]:
    """Split a comma-separated CLI value into its non-empty, trimmed items."""
    parsed: list[str] = []
    for piece in raw_value.split(","):
        token = piece.strip()
        if token:
            parsed.append(token)
    return parsed
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _parse_seed_list(raw_value: str | None) -> list[int] | None:
    """Parse an optional comma-separated seed list.

    Args:
        raw_value: Raw ``--seed-list`` value, or ``None`` when the option was
            not supplied.

    Returns:
        ``None`` when ``raw_value`` is ``None``; otherwise the integer seeds in
        input order, with blank items skipped.

    Raises:
        ValueError: If an item is not a valid integer. The message names the
            offending item and the option so the CLI error is actionable.
    """
    if raw_value is None:
        return None

    seeds: list[int] = []
    for piece in raw_value.split(","):
        token = piece.strip()
        if not token:
            continue
        try:
            seeds.append(int(token))
        except ValueError as exc:
            # Same exception type as before, but int()'s bare message does not
            # say which option the bad value came from.
            raise ValueError(
                f"Invalid seed {token!r} in --seed-list; expected comma-separated integers."
            ) from exc
    return seeds
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _runner() -> Callable[..., Any]:
    """Return the benchmark runner, importing it on first use.

    The import is deferred so that importing this CLI module does not pull in
    ``dataforge.bench.runner``. The loaded callable is stored in the
    module-level ``run_agent_comparison`` slot and reused afterwards.
    """
    global run_agent_comparison
    if run_agent_comparison is not None:
        return run_agent_comparison

    from dataforge.bench.runner import run_agent_comparison as loaded_runner

    run_agent_comparison = loaded_runner
    return run_agent_comparison
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def bench(
    methods: Annotated[
        str,
        typer.Option(
            "--methods",
            help="Comma-separated benchmark methods.",
        ),
    ] = "heuristic,llm_zeroshot",
    datasets: Annotated[
        str,
        typer.Option(
            "--datasets",
            help="Comma-separated benchmark datasets.",
        ),
    ] = "hospital",
    seeds: Annotated[
        int,
        typer.Option("--seeds", help="Number of seeds per method/dataset pair."),
    ] = 3,
    seed_list: Annotated[
        str | None,
        typer.Option(
            "--seed-list",
            help="Explicit comma-separated seed list. Overrides --seeds for reproducibility.",
        ),
    ] = None,
    really_run_big_bench: Annotated[
        bool,
        typer.Option(
            "--really-run-big-bench",
            help="Override the free-tier benchmark quota guard when estimated calls exceed 500.",
        ),
    ] = False,
    output_json: Annotated[
        Path,
        typer.Option(
            "--output-json",
            help="Where to write eval/results/agent_comparison.json.",
        ),
    ] = Path("eval/results/agent_comparison.json"),
    cache_root: Annotated[
        Path | None,
        typer.Option(
            "--cache-root",
            help="Benchmark dataset cache root. Defaults to the user DataForge cache.",
        ),
    ] = None,
    verify_dataset_hashes: Annotated[
        bool,
        typer.Option(
            "--verify-dataset-hashes/--no-verify-dataset-hashes",
            help="Verify cached benchmark bytes against pinned upstream hashes.",
        ),
    ] = True,
    json_output: Annotated[
        bool,
        typer.Option("--json", help="Print benchmark results as JSON."),
    ] = False,
) -> None:
    """Run real-world benchmark methods across cached benchmark datasets.

    The comma-separated options are parsed into lists and handed to the lazily
    loaded benchmark runner. The result is printed as JSON or as a Rich
    summary table with one row per aggregate.

    Args:
        methods: Comma-separated benchmark method names.
        datasets: Comma-separated benchmark dataset names.
        seeds: Number of seeds per method/dataset pair.
        seed_list: Optional explicit comma-separated seed list.
        really_run_big_bench: Forwarded to the runner as the quota-guard
            override.
        output_json: Output path forwarded to the runner.
        cache_root: Optional dataset cache root forwarded to the runner.
        verify_dataset_hashes: Forwarded to the runner.
        json_output: When true, print the result as JSON and skip the table.

    Raises:
        typer.Exit: With code 2 when option parsing or the runner raises any
            exception; the error is shown in a panel on stderr first.
    """
    # Local import: rich is already a dependency of this module, and only the
    # error path needs the helper.
    from rich.markup import escape

    try:
        output = _runner()(
            methods=_parse_csv_list(methods),
            datasets=_parse_csv_list(datasets),
            seeds=seeds,
            seed_list=_parse_seed_list(seed_list),
            output_json=output_json,
            really_run_big_bench=really_run_big_bench,
            cache_root=cache_root,
            verify_dataset_hashes=verify_dataset_hashes,
        )
    except Exception as exc:
        # Exception text is arbitrary. Escape it so brackets in the message
        # are rendered literally instead of being parsed as Rich markup, which
        # could hide text or raise MarkupError while reporting the failure.
        _console.print(
            Panel(
                f"[bold red]{escape(str(exc))}[/bold red]",
                title="Benchmark Error",
                style="red",
            )
        )
        raise typer.Exit(code=2) from exc

    if json_output:
        typer.echo(json.dumps(output.model_dump(mode="json"), indent=2, sort_keys=True))
        return

    table = Table(title="DataForge Benchmark Summary")
    table.add_column("Method")
    table.add_column("Dataset")
    table.add_column("Status")
    table.add_column("F1")
    table.add_column("Avg Steps")
    table.add_column("Quota")
    for aggregate in output.aggregates:
        # Metrics that are None are shown as "Skipped" rather than a number.
        table.add_row(
            aggregate.method,
            aggregate.dataset,
            aggregate.status,
            "Skipped" if aggregate.f1_mean is None else f"{aggregate.f1_mean:.4f}",
            "Skipped" if aggregate.avg_steps_mean is None else f"{aggregate.avg_steps_mean:.2f}",
            "Skipped"
            if aggregate.quota_units_mean is None
            else f"{aggregate.quota_units_mean:.4f}",
        )
    Console().print(table)
    if any(aggregate.status == "skipped" for aggregate in output.aggregates):
        Console().print(
            Panel(
                "Some LLM baselines were skipped. Set DATAFORGE_LLM_PROVIDER=groq and GROQ_API_KEY to enable them.",
                title="Benchmark Warning",
                style="yellow",
            )
        )
|
dataforge/cli/common.py
ADDED
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
"""Shared helpers for DataForge CLI commands."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections.abc import Iterable
|
|
6
|
+
from importlib import resources
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import cast
|
|
9
|
+
|
|
10
|
+
import typer
|
|
11
|
+
import yaml
|
|
12
|
+
|
|
13
|
+
from dataforge.table import Table
|
|
14
|
+
from dataforge.table import read_csv as read_table_csv
|
|
15
|
+
from dataforge.verifier.schema import (
|
|
16
|
+
AcceptedValues,
|
|
17
|
+
AggregateDependency,
|
|
18
|
+
AggregateLiteral,
|
|
19
|
+
DomainBound,
|
|
20
|
+
FunctionalDependency,
|
|
21
|
+
RegexConstraint,
|
|
22
|
+
RelationshipConstraint,
|
|
23
|
+
Schema,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
# Relative paths a user may type, mapped to the fixture's location inside the
# installed ``dataforge`` package.
_PACKAGED_DEMO_FIXTURES = {
    "fixtures/hospital_10rows.csv": "fixtures/hospital_10rows.csv",
    "fixtures/hospital_schema.yaml": "fixtures/hospital_schema.yaml",
}


def resolve_cli_path(path: Path) -> Path:
    """Resolve a user path, including DataForge's packaged demo fixture aliases.

    Args:
        path: Path as given on the command line.

    Returns:
        ``path`` unchanged when it exists on disk, when it is not one of the
        known fixture aliases, or when the packaged fixture is missing.
        Otherwise the path of the fixture inside the ``dataforge`` package.
    """
    if path.exists():
        return path

    normalized = path.as_posix().replace("\\", "/")
    # Remove only literal "./" prefixes. ``str.lstrip("./")`` strips a
    # *character set*, which also ate "../", a leading "/" and a leading ".",
    # so paths such as "../fixtures/hospital_10rows.csv" or
    # "/fixtures/hospital_10rows.csv" were wrongly treated as fixture aliases.
    while normalized.startswith("./"):
        normalized = normalized[2:]

    packaged_name = _PACKAGED_DEMO_FIXTURES.get(normalized)
    if packaged_name is None:
        return path

    fixture = resources.files("dataforge").joinpath(packaged_name)
    if not fixture.is_file():
        return path
    return Path(str(fixture))
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _is_collection(value: object) -> bool:
    """Tell whether *value* is iterable but not a ``str``/``bytes`` scalar."""
    return isinstance(value, Iterable) and not isinstance(value, (str, bytes))


def _str_tuple(value: object) -> tuple[str, ...]:
    """Stringify each item of a collection; anything else yields ``()``."""
    if not _is_collection(value):
        return ()
    return tuple(str(item) for item in cast("Iterable[object]", value))


def _bound_value(spec: dict, key: str) -> float | None:
    """Read an optional numeric bound, treating a missing/``None`` entry as unset."""
    limit = spec.get(key)
    return None if limit is None else float(limit)


def schema_from_mapping(raw_mapping: object) -> Schema:
    """Build a Schema from a raw YAML mapping-like payload.

    Sections whose value has an unexpected shape, and list entries that are
    not mappings, are skipped rather than rejected.

    Args:
        raw_mapping: Untrusted YAML-decoded value. ``None`` is treated as an
            empty mapping.

    Returns:
        Parsed Schema object.

    Raises:
        typer.BadParameter: If the payload is neither ``None`` nor a mapping.
    """
    if raw_mapping is not None and not isinstance(raw_mapping, dict):
        raise typer.BadParameter("Schema payload must be a YAML mapping.")
    payload: dict[str, object] = {} if raw_mapping is None else raw_mapping

    # Column name -> declared type, both coerced to str.
    column_spec = payload.get("columns", {})
    columns: dict[str, str] = (
        {str(name): str(kind) for name, kind in column_spec.items()}
        if isinstance(column_spec, dict)
        else {}
    )

    # Functional dependencies: a list of {"determinant": [...], "dependent": ...}.
    fd_entries = payload.get("functional_dependencies", [])
    fds: list[FunctionalDependency] = [
        FunctionalDependency(
            determinant=_str_tuple(entry.get("determinant", [])),
            dependent=str(entry.get("dependent", "")),
        )
        for entry in (fd_entries if isinstance(fd_entries, list) else [])
        if isinstance(entry, dict)
    ]

    # Plain column-name sets.
    pii_columns = frozenset(_str_tuple(payload.get("pii_columns", [])))
    primary_key_columns = frozenset(_str_tuple(payload.get("primary_key_columns", [])))
    not_null_columns = frozenset(_str_tuple(payload.get("not_null_columns", [])))
    unique_columns = frozenset(_str_tuple(payload.get("unique_columns", [])))

    # Accepted values come either as {column: [values]} or as a list of
    # {"column": ..., "values": [...]} rules; both are reduced to pairs first.
    accepted_spec = payload.get("accepted_values", {})
    if isinstance(accepted_spec, dict):
        accepted_pairs = list(accepted_spec.items())
    elif isinstance(accepted_spec, list):
        accepted_pairs = [
            (rule.get("column", ""), rule.get("values", []))
            for rule in accepted_spec
            if isinstance(rule, dict)
        ]
    else:
        accepted_pairs = []
    accepted_values: list[AcceptedValues] = [
        AcceptedValues(column=str(column), values=_str_tuple(allowed))
        for column, allowed in accepted_pairs
        if _is_collection(allowed)
    ]

    # Regex constraints accept the same two shapes: mapping or list of rules.
    regex_spec = payload.get("regex_constraints", {})
    regex_constraints: list[RegexConstraint]
    if isinstance(regex_spec, dict):
        regex_constraints = [
            RegexConstraint(column=str(column), pattern=str(pattern))
            for column, pattern in regex_spec.items()
        ]
    elif isinstance(regex_spec, list):
        regex_constraints = [
            RegexConstraint(
                column=str(rule.get("column", "")),
                pattern=str(rule.get("pattern", "")),
            )
            for rule in regex_spec
            if isinstance(rule, dict)
        ]
    else:
        regex_constraints = []

    # Relationships: list of {"column", "reference", "reference_column"} rules.
    relationship_spec = payload.get("relationships", [])
    relationships: list[RelationshipConstraint] = [
        RelationshipConstraint(
            column=str(rule.get("column", "")),
            reference=str(rule.get("reference", "")),
            reference_column=str(rule.get("reference_column", "")),
        )
        for rule in (relationship_spec if isinstance(relationship_spec, list) else [])
        if isinstance(rule, dict)
    ]

    # Domain bounds: {column: {"min", "max", "inclusive_min", "inclusive_max"}}.
    bounds_spec = payload.get("domain_bounds", {})
    bounds: list[DomainBound] = [
        DomainBound(
            column=str(column),
            min_value=_bound_value(limits, "min"),
            max_value=_bound_value(limits, "max"),
            inclusive_min=bool(limits.get("inclusive_min", True)),
            inclusive_max=bool(limits.get("inclusive_max", True)),
        )
        for column, limits in (bounds_spec.items() if isinstance(bounds_spec, dict) else [])
        if isinstance(limits, dict)
    ]

    # Aggregate dependencies: only "sum" and "avg" (case-insensitive) are kept.
    aggregate_dependencies: list[AggregateDependency] = []
    aggregate_spec = payload.get("aggregate_dependencies", [])
    for entry in aggregate_spec if isinstance(aggregate_spec, list) else []:
        if not isinstance(entry, dict):
            continue
        kind = str(entry.get("aggregate", "")).lower()
        if kind not in {"sum", "avg"}:
            continue
        grouping = _str_tuple(entry.get("group_by", []))
        aggregate_dependencies.append(
            AggregateDependency(
                source_column=str(entry.get("source_column", "")),
                aggregate=cast(AggregateLiteral, kind),
                target_column=str(entry.get("target_column", "")),
                group_by=grouping,
            )
        )

    return Schema(
        columns=columns,
        functional_dependencies=tuple(fds),
        pii_columns=pii_columns,
        primary_key_columns=primary_key_columns,
        not_null_columns=not_null_columns,
        unique_columns=unique_columns,
        accepted_values=tuple(accepted_values),
        regex_constraints=tuple(regex_constraints),
        relationships=tuple(relationships),
        domain_bounds=tuple(bounds),
        aggregate_dependencies=tuple(aggregate_dependencies),
    )
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def load_schema(schema_path: Path) -> Schema:
    """Load a Schema from a YAML file.

    Args:
        schema_path: Path to the YAML schema file.

    Returns:
        Parsed Schema object.

    Raises:
        typer.BadParameter: If the schema file cannot be read, is not valid
            UTF-8, is not valid YAML, or its top-level value is not a mapping.
    """
    try:
        raw = yaml.safe_load(schema_path.read_text(encoding="utf-8"))
    except OSError as exc:
        raise typer.BadParameter(f"Could not read schema file '{schema_path}': {exc}") from exc
    except UnicodeDecodeError as exc:
        # read_text() raises this (a ValueError, not an OSError) for non-UTF-8
        # bytes; report it as a parameter error like other unreadable files.
        raise typer.BadParameter(
            f"Schema file '{schema_path}' is not valid UTF-8: {exc}"
        ) from exc
    except yaml.YAMLError as exc:
        # Malformed YAML previously escaped as a raw parser exception even
        # though this function documents BadParameter for malformed files.
        raise typer.BadParameter(
            f"Schema file '{schema_path}' is not valid YAML: {exc}"
        ) from exc

    if raw is not None and not isinstance(raw, dict):
        raise typer.BadParameter(f"Schema file '{schema_path}' must be a YAML mapping.")
    return schema_from_mapping(raw)
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def read_csv(path: Path) -> Table:
    """Load a CSV file as a string-preserving DataForge table.

    This is a thin CLI-facing wrapper that delegates to
    ``dataforge.table.read_csv`` with its defaults.

    Args:
        path: Location of the CSV file.

    Returns:
        The table produced by ``dataforge.table.read_csv``.
    """
    table = read_table_csv(path)
    return table
|