fauxdata-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
fauxdata/__init__.py ADDED
@@ -0,0 +1,3 @@
1
"""fauxdata - CLI for generating and validating fake datasets."""

# Single source of truth for the package version (mirrors the wheel metadata).
__version__ = "0.1.0"
File without changes
@@ -0,0 +1,117 @@
1
+ """fauxdata generate command."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Optional
7
+
8
+ import typer
9
+ from rich import print as rprint
10
+ from rich.console import Console
11
+ from rich.panel import Panel
12
+ from rich.table import Table
13
+
14
+ from fauxdata.generator import generate_dataset
15
+ from fauxdata.output import default_output_path, export_dataset, write_stdout
16
+ from fauxdata.schema import load_schema
17
+ from fauxdata.validator import validate_dataset
18
+
19
+ console = Console()
20
+
21
+
22
def run(
    schema_path: str,
    rows: Optional[int] = None,
    out: Optional[str] = None,
    fmt: Optional[str] = None,
    seed: Optional[int] = None,
    validate: bool = False,
):
    """Generate a fake dataset from a YAML schema."""
    schema = load_schema(schema_path)

    # Explicit CLI arguments win over schema-level defaults.
    row_count = schema.rows if rows is None else rows
    effective_seed = schema.seed if seed is None else seed
    chosen_fmt = fmt or schema.output_format
    target = out or schema.output_path or default_output_path(schema.name, chosen_fmt)

    # "-" means: stream the dataset to stdout with no decorative output.
    if target == "-":
        frame = generate_dataset(schema, rows=row_count, seed=effective_seed)
        write_stdout(frame, chosen_fmt)
        return

    rprint(Panel(f"[bold cyan]fauxdata generate[/bold cyan] [dim]{schema_path}[/dim]", expand=False))

    with console.status(f"[bold green]Generating {row_count} rows...[/bold green]"):
        frame = generate_dataset(schema, rows=row_count, seed=effective_seed)

    _print_schema_table(schema, row_count, effective_seed)

    saved = export_dataset(frame, target, chosen_fmt)
    rprint(f"\n[green]Saved[/green] [bold]{saved}[/bold] ([dim]{chosen_fmt}, {row_count} rows[/dim])")

    if validate:
        _run_validation(frame, schema)
58
+
59
+
60
def _print_schema_table(schema, n: int, seed):
    """Render the schema's columns plus the effective rows/seed/locale settings."""
    table = Table(title=f"Schema: {schema.name}", show_header=True, header_style="bold magenta")
    table.add_column("Column", style="cyan")
    for heading in ("Type", "Preset/Values", "Min", "Max", "Unique"):
        table.add_column(heading)

    def bound(value):
        # "-" marks an unset bound.
        return "-" if value is None else str(value)

    for column in schema.columns:
        table.add_row(
            column.name,
            column.col_type,
            column.preset or (str(column.values) if column.values else "-"),
            bound(column.min),
            bound(column.max),
            "yes" if column.unique else "no",
        )

    console.print(table)
    rprint(f" rows=[bold]{n}[/bold] seed=[bold]{seed}[/bold] locale=[bold]{schema.locale}[/bold]")
82
+
83
+
84
def _run_validation(df, schema):
    """Run the schema's validation rules against df and render a results table.

    Exits the process with code 1 when any rule fails.
    """
    rprint("\n[bold]Running validation...[/bold]")
    all_passed, results = validate_dataset(df, schema)

    if not results:
        rprint("[yellow]No validation rules defined.[/yellow]")
        return

    table = Table(title="Validation Results", show_header=True, header_style="bold")
    table.add_column("#")
    table.add_column("Rule", style="cyan")
    table.add_column("Column")
    table.add_column("Passed", justify="right")
    table.add_column("Failed", justify="right")
    table.add_column("Status")

    for entry in results:
        table.add_row(
            str(entry["step"]),
            entry["rule"],
            entry["column"],
            str(entry["passed"]),
            str(entry["failed"]),
            "[green]PASS[/green]" if entry["ok"] else "[red]FAIL[/red]",
        )

    console.print(table)

    if not all_passed:
        rprint("[bold red]Some validation rules failed.[/bold red]")
        raise typer.Exit(code=1)
    rprint("[bold green]All validation rules passed.[/bold green]")
@@ -0,0 +1,116 @@
1
+ """fauxdata init command - create a schema template."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ import questionary
8
+ import typer
9
+ from rich import print as rprint
10
+ from rich.panel import Panel
11
+
12
# YAML scaffold written by `fauxdata init`. The preset list in the comment
# below must only name entries from fauxdata.schema.STRING_PRESETS, otherwise
# a user following it gets a schema-load error.
# Fix: the old comment advertised `phone`, `uuid` and `ip_address`, which are
# not valid presets (the real names are phone_number, uuid4, ipv4).
TEMPLATE = """\
# fauxdata schema template
# Generated by: fauxdata init

name: {name}
description: "{description}"
rows: {rows}
seed: 42
output:
  format: {fmt}
  path: {name}.{fmt}

columns:
  id:
    type: int
    unique: true
    min: 1
    max: 99999

  name:
    type: string
    preset: name  # presets: name, email, phone_number, city, country, company, job, uuid4, ipv4, url, iban...

  email:
    type: string
    preset: email

  age:
    type: int
    min: 18
    max: 90

  active:
    type: bool

  score:
    type: float
    min: 0.0
    max: 100.0
    precision: 2

  signup_date:
    type: date
    min: "2020-01-01"
    max: "2024-12-31"

  # Example enum column (pick from a fixed set):
  # status:
  #   type: string
  #   values: [active, inactive, pending]

validation:
  - rule: col_vals_not_null
    columns: [id, name, email]
  - rule: col_vals_between
    column: age
    min: 18
    max: 90
  - rule: col_vals_regex
    column: email
    pattern: "^[^@]+@[^@]+\\\\.[^@]+$"
  - rule: rows_distinct
    columns: [id]
"""
76
+
77
+
78
def run(name: str | None = None):
    """Interactive schema template creator.

    Prompts for a schema name, description, row count and output format, then
    writes ``<name>.yml`` in the current directory. Aborts cleanly when the
    user cancels a prompt or declines to overwrite an existing file.
    """
    rprint(Panel("[bold cyan]fauxdata init[/bold cyan] — schema template creator", expand=False))

    schema_name = name or questionary.text(
        "Schema name (e.g. people, orders):",
        default="my_dataset",
    ).ask()

    # questionary returns None when the user cancels (Ctrl-C).
    if schema_name is None:
        raise typer.Abort()

    description = questionary.text(
        "Short description:",
        default=f"{schema_name} dataset",
    ).ask() or ""

    rows = questionary.text("Default number of rows:", default="1000").ask() or "1000"
    # Fix: previously any text was written verbatim into the template, which
    # produced an invalid `rows:` value in the YAML; fall back to the default
    # for non-numeric input.
    if not rows.strip().isdigit():
        rprint(f"[yellow]'{rows}' is not a number; using 1000.[/yellow]")
        rows = "1000"

    fmt = questionary.select(
        "Default output format:",
        choices=["csv", "parquet", "json", "jsonl"],
    ).ask() or "csv"

    output_file = f"{schema_name}.yml"
    out_path = Path(output_file)

    if out_path.exists():
        overwrite = questionary.confirm(f"{output_file} already exists. Overwrite?", default=False).ask()
        if not overwrite:
            rprint("[yellow]Aborted.[/yellow]")
            raise typer.Exit()

    content = TEMPLATE.format(name=schema_name, description=description, rows=rows, fmt=fmt)
    out_path.write_text(content)

    rprint(f"[green]Created[/green] [bold]{output_file}[/bold]")
    rprint("[dim]Edit the schema then run:[/dim]")
    rprint(f" [cyan]fauxdata generate {output_file} --validate[/cyan]")
@@ -0,0 +1,79 @@
1
+ """fauxdata preview command."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import polars as pl
6
+ import typer
7
+ from rich import print as rprint
8
+ from rich.console import Console
9
+ from rich.panel import Panel
10
+ from rich.table import Table
11
+
12
+ console = Console()
13
+
14
+
15
def _load_dataset(dataset_path: str):
    """Read a dataset file by extension; exits with code 1 on unknown formats."""
    readers = {
        "csv": pl.read_csv,
        "parquet": pl.read_parquet,
        "json": pl.read_json,
        "jsonl": pl.read_ndjson,
        "ndjson": pl.read_ndjson,
    }
    ext = dataset_path.rsplit(".", 1)[-1].lower()
    reader = readers.get(ext)
    if reader is None:
        rprint(f"[red]Unsupported file format: .{ext}[/red]")
        raise typer.Exit(code=1)
    return reader(dataset_path)


def _min_max(series) -> tuple[str, str]:
    """Return (min, max) as strings for numeric and date/datetime columns.

    Non-orderable column types (and any polars error) report ("-", "-").
    The original had two byte-identical branches for numeric and temporal
    dtypes; they are collapsed into one membership test here.
    """
    orderable = (pl.Int8, pl.Int16, pl.Int32, pl.Int64,
                 pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
                 pl.Float32, pl.Float64, pl.Date, pl.Datetime)
    try:
        if series.dtype in orderable:
            return str(series.min()), str(series.max())
    except Exception:
        pass
    return "-", "-"


def run(dataset_path: str, rows: int = 10):
    """Show a preview of a dataset with column statistics."""
    rprint(Panel(f"[bold cyan]fauxdata preview[/bold cyan] [dim]{dataset_path}[/dim]", expand=False))

    df = _load_dataset(dataset_path)

    rprint(f" [bold]{len(df)}[/bold] rows × [bold]{len(df.columns)}[/bold] columns\n")

    # Data preview table.
    preview_df = df.head(rows)
    t = Table(title=f"First {min(rows, len(df))} rows", show_header=True, header_style="bold magenta")
    for col in preview_df.columns:
        t.add_column(col, overflow="fold", max_width=25)
    for row in preview_df.iter_rows():
        t.add_row(*[str(v) if v is not None else "[dim]null[/dim]" for v in row])
    console.print(t)

    # Per-column statistics.
    stats_t = Table(title="Column Statistics", show_header=True, header_style="bold cyan")
    stats_t.add_column("Column", style="cyan")
    stats_t.add_column("Type")
    stats_t.add_column("Nulls", justify="right")
    stats_t.add_column("Unique", justify="right")
    stats_t.add_column("Min")
    stats_t.add_column("Max")

    for col in df.columns:
        series = df[col]
        col_min, col_max = _min_max(series)
        stats_t.add_row(
            col,
            str(series.dtype),
            str(series.null_count()),
            str(series.n_unique()),
            col_min,
            col_max,
        )

    console.print(stats_t)
@@ -0,0 +1,73 @@
1
+ """fauxdata validate command."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import polars as pl
6
+ import typer
7
+ from rich import print as rprint
8
+ from rich.console import Console
9
+ from rich.panel import Panel
10
+ from rich.table import Table
11
+
12
+ from fauxdata.schema import load_schema
13
+ from fauxdata.validator import validate_dataset
14
+
15
+ console = Console()
16
+
17
+
18
def run(dataset_path: str, schema_path: str):
    """Validate an existing dataset against a YAML schema."""
    schema = load_schema(schema_path)

    rprint(Panel(
        f"[bold cyan]fauxdata validate[/bold cyan] [dim]{dataset_path}[/dim]",
        expand=False,
    ))

    # Pick the polars reader from the file extension.
    loaders = {
        "csv": pl.read_csv,
        "parquet": pl.read_parquet,
        "json": pl.read_json,
        "jsonl": pl.read_ndjson,
        "ndjson": pl.read_ndjson,
    }
    ext = dataset_path.rsplit(".", 1)[-1].lower()
    if ext not in loaders:
        rprint(f"[red]Unsupported file format: .{ext}[/red]")
        raise typer.Exit(code=1)
    df = loaders[ext](dataset_path)

    rprint(f" Loaded [bold]{len(df)}[/bold] rows, [bold]{len(df.columns)}[/bold] columns")

    if not schema.validation_rules:
        rprint("[yellow]No validation rules defined in schema.[/yellow]")
        raise typer.Exit()

    all_passed, results = validate_dataset(df, schema)

    table = Table(title="Validation Results", show_header=True, header_style="bold")
    table.add_column("#")
    table.add_column("Rule", style="cyan")
    table.add_column("Column")
    table.add_column("Passed", justify="right")
    table.add_column("Failed", justify="right")
    table.add_column("Status")

    for entry in results:
        table.add_row(
            str(entry["step"]),
            entry["rule"],
            entry["column"],
            str(entry["passed"]),
            str(entry["failed"]),
            "[green]PASS[/green]" if entry["ok"] else "[red]FAIL[/red]",
        )

    console.print(table)

    if not all_passed:
        rprint("[bold red]Some validation rules failed.[/bold red]")
        raise typer.Exit(code=1)
    rprint("[bold green]All validation rules passed.[/bold green]")
fauxdata/generator.py ADDED
@@ -0,0 +1,80 @@
1
+ """Data generation using pointblank native API."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import pointblank as pb
6
+ import polars as pl
7
+
8
+ from fauxdata.schema import ColumnSchema, SchemaConfig
9
+
10
+
11
def generate_dataset(schema: SchemaConfig, rows: int | None = None, seed: int | None = None) -> pl.DataFrame:
    """Generate a Polars DataFrame from a SchemaConfig using pointblank.

    Explicit ``rows``/``seed`` arguments override the schema-level defaults.
    """
    effective_rows = schema.rows if rows is None else rows
    effective_seed = schema.seed if seed is None else seed

    return pb.generate_dataset(
        _build_pb_schema(schema),
        n=effective_rows,
        seed=effective_seed,
        # pointblank expects a country code; fall back to US when unset.
        country=schema.locale or "US",
    )
21
+
22
+
23
def _build_pb_schema(schema: SchemaConfig) -> pb.Schema:
    """Convert a SchemaConfig to a pointblank Schema (one field per column)."""
    return pb.Schema(**{col.name: _col_to_field(col) for col in schema.columns})
29
+
30
+
31
def _col_to_field(col: ColumnSchema):
    """Convert a ColumnSchema to a pointblank field spec.

    Unrecognized column types fall back to a plain string field.
    NOTE(review): `col.precision` and `col.locale` are never forwarded to
    pointblank, and enum string columns (`values`) ignore `unique` — confirm
    whether pointblank's field API supports them.
    """
    kind = col.col_type

    if kind == "int":
        return pb.int_field(
            min_val=int(col.min) if col.min is not None else None,
            max_val=int(col.max) if col.max is not None else None,
            nullable=col.nullable,
            unique=col.unique,
        )

    if kind == "float":
        return pb.float_field(
            min_val=float(col.min) if col.min is not None else None,
            max_val=float(col.max) if col.max is not None else None,
            nullable=col.nullable,
            unique=col.unique,
        )

    if kind == "bool":
        return pb.bool_field(nullable=col.nullable)

    if kind in ("date", "datetime"):
        # date_field and datetime_field take identical keyword arguments.
        make_field = pb.date_field if kind == "date" else pb.datetime_field
        return make_field(
            min_date=str(col.min) if col.min is not None else None,
            max_date=str(col.max) if col.max is not None else None,
            nullable=col.nullable,
            unique=col.unique,
        )

    if kind == "string":
        if col.values:
            return pb.string_field(allowed=col.values, nullable=col.nullable)
        if col.preset:
            return pb.string_field(preset=col.preset, nullable=col.nullable, unique=col.unique)
        return pb.string_field(nullable=col.nullable, unique=col.unique)

    # Should be unreachable: schema parsing rejects unknown types.
    return pb.string_field(nullable=col.nullable)
fauxdata/main.py ADDED
@@ -0,0 +1,73 @@
1
+ """fauxdata CLI entry point."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Optional
6
+
7
+ import pyfiglet
8
+ import typer
9
+ from rich import print as rprint
10
+ from rich.console import Console
11
+
12
# Top-level Typer application. Subcommands are registered below; their
# implementations are imported lazily inside each command function to keep
# CLI startup fast.
app = typer.Typer(
    name="fauxdata",
    help="Generate and validate fake datasets from YAML schemas.",
    add_completion=False,
)
console = Console()
18
+
19
+
20
def _banner():
    """Print the ASCII-art banner and tagline (shown when no subcommand is given)."""
    banner = pyfiglet.figlet_format("fauxdata", font="slant")
    rprint(f"[bold cyan]{banner}[/bold cyan]")
    rprint("[dim]Generate and validate realistic fake datasets[/dim]\n")
24
+
25
+
26
@app.callback(invoke_without_command=True)
def main(ctx: typer.Context):
    # Bare `fauxdata` invocation: show the banner followed by the help text.
    if ctx.invoked_subcommand is None:
        _banner()
        rprint(ctx.get_help())
31
+
32
+
33
@app.command("init")
def init_cmd(
    name: Optional[str] = typer.Option(None, "--name", "-n", help="Schema name"),
):
    """Create a schema template interactively."""
    # Lazy import: avoids paying for questionary/rich panels on every CLI start.
    from fauxdata.commands.init import run
    run(name=name)
40
+
41
+
42
@app.command("generate")
def generate_cmd(
    schema: str = typer.Argument(..., help="Path to YAML schema file"),
    rows: Optional[int] = typer.Option(None, "--rows", "-r", help="Number of rows to generate"),
    out: Optional[str] = typer.Option(None, "--out", "-o", help="Output file path"),
    fmt: Optional[str] = typer.Option(None, "--format", "-f", help="Output format: csv, parquet, json, jsonl"),
    seed: Optional[int] = typer.Option(None, "--seed", "-s", help="Random seed for reproducibility"),
    validate: bool = typer.Option(False, "--validate", "-v", help="Run validation after generating"),
):
    """Generate a fake dataset from a YAML schema."""
    # Options default to None so the command implementation can distinguish
    # "not given" (use schema defaults) from an explicit value.
    from fauxdata.commands.generate import run
    run(schema_path=schema, rows=rows, out=out, fmt=fmt, seed=seed, validate=validate)
54
+
55
+
56
@app.command("validate")
def validate_cmd(
    dataset: str = typer.Argument(..., help="Path to dataset file (csv, parquet, json, jsonl)"),
    schema: str = typer.Argument(..., help="Path to YAML schema file"),
):
    """Validate an existing dataset against a YAML schema."""
    # Lazy import keeps polars/pointblank off the startup path.
    from fauxdata.commands.validate import run
    run(dataset_path=dataset, schema_path=schema)
64
+
65
+
66
@app.command("preview")
def preview_cmd(
    dataset: str = typer.Argument(..., help="Path to dataset file"),
    rows: int = typer.Option(10, "--rows", "-r", help="Number of rows to preview"),
):
    """Show a preview and column statistics for a dataset."""
    # Lazy import keeps polars off the startup path.
    from fauxdata.commands.preview import run
    run(dataset_path=dataset, rows=rows)
fauxdata/output.py ADDED
@@ -0,0 +1,57 @@
1
+ """Export functions for fauxdata datasets."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ import polars as pl
8
+
9
+
10
def normalize_fmt(fmt: str) -> str:
    """Normalize a format name to its canonical lowercase form.

    Maps the "jsonlines" and "ndjson" aliases to "jsonl" and lowercases the
    input (so e.g. "CSV" is accepted); any other value passes through for the
    caller to reject. The "ndjson" alias matches the extension that the
    preview/validate commands already read.
    """
    fmt = fmt.lower()
    if fmt in ("jsonlines", "ndjson"):
        return "jsonl"
    return fmt
15
+
16
+
17
def export_dataset(df: pl.DataFrame, path: str | Path, fmt: str) -> Path:
    """Export a DataFrame to the given format and path. Returns the output path."""
    fmt = normalize_fmt(fmt)
    out = Path(path)
    # Create intermediate directories so nested output paths just work.
    out.parent.mkdir(parents=True, exist_ok=True)

    writers = {
        "csv": df.write_csv,
        "parquet": df.write_parquet,
        "json": df.write_json,
        "jsonl": df.write_ndjson,
    }
    writer = writers.get(fmt)
    if writer is None:
        raise ValueError(f"Unsupported format: {fmt}. Use csv, parquet, json, jsonl, or jsonlines.")
    writer(out)

    return out
35
+
36
+
37
def write_stdout(df: pl.DataFrame, fmt: str) -> None:
    """Write a DataFrame to stdout (raw bytes for parquet, text otherwise)."""
    import sys

    fmt = normalize_fmt(fmt)

    if fmt == "parquet":
        # Parquet is binary: serialize into a buffer, then dump the raw bytes.
        import io
        buffer = io.BytesIO()
        df.write_parquet(buffer)
        sys.stdout.buffer.write(buffer.getvalue())
        return

    if fmt == "csv":
        rendered = df.write_csv()
    elif fmt == "json":
        rendered = df.write_json()
    elif fmt == "jsonl":
        rendered = df.write_ndjson()
    else:
        raise ValueError(f"Unsupported format: {fmt}. Use csv, parquet, json, or jsonl.")
    sys.stdout.write(rendered)
54
+
55
+
56
def default_output_path(schema_name: str, fmt: str) -> str:
    """Default to "<schema name>.<normalized format>" in the working directory."""
    return f"{schema_name}.{normalize_fmt(fmt)}"
fauxdata/schema.py ADDED
@@ -0,0 +1,174 @@
1
+ """Schema parsing and validation for fauxdata YAML schemas."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ import yaml
10
+
11
+
12
# Column types accepted in a schema's `columns` section.
VALID_TYPES = {"int", "float", "string", "bool", "date", "datetime"}

# Presets accepted for `type: string` columns; checked in _parse_column so a
# typo fails at schema-load time rather than at generation time.
STRING_PRESETS = {
    # Personal
    "name", "name_full", "first_name", "last_name",
    "email", "phone_number",
    "address", "city", "state", "country", "country_code_2", "country_code_3", "postcode",
    "latitude", "longitude",
    # Business
    "company", "job", "catch_phrase",
    # Internet
    "url", "domain_name", "ipv4", "ipv6", "user_name", "password",
    # Text
    "text", "sentence", "paragraph", "word",
    # Financial
    "credit_card_number", "iban", "currency_code",
    # Identifiers
    "uuid4", "md5", "sha1",
    # Misc
    "license_plate", "ssn",
}

# Rule names accepted in the schema's `validation` section (these mirror the
# checks wired up in fauxdata.validator._add_rule).
VALID_RULES = {
    "col_vals_not_null",
    "col_vals_between",
    "col_vals_regex",
    "col_vals_in_set",
    "col_vals_gt",
    "col_vals_lt",
    "col_vals_ge",
    "col_vals_le",
    "rows_distinct",
    "col_exists",
}
46
+
47
+
48
@dataclass
class ColumnSchema:
    """Parsed definition of a single column in a fauxdata schema."""

    name: str
    col_type: str                 # one of VALID_TYPES
    unique: bool = False          # generated values must be distinct
    nullable: bool = False        # allow nulls in generated data
    min: Any = None               # lower bound (numbers) or earliest date string
    max: Any = None               # upper bound (numbers) or latest date string
    preset: str | None = None     # one of STRING_PRESETS (string columns only)
    locale: str | None = None     # per-column locale override
    precision: int | None = None  # decimal places for float columns
    values: list | None = None    # fixed value set (enum-style column / in_set)
60
+
61
+
62
@dataclass
class ValidationRule:
    """One entry from the schema's `validation` list.

    Only the fields relevant to the particular rule are set; the rest stay
    None (e.g. `pattern` is only used by col_vals_regex).
    """

    rule: str                        # one of VALID_RULES
    columns: list[str] | None = None # multi-column rules (not_null, exists, distinct)
    column: str | None = None        # single-column rules
    min: Any = None                  # lower bound / comparison value
    max: Any = None                  # upper bound / comparison value
    pattern: str | None = None       # regex for col_vals_regex
    values: list | None = None       # allowed set for col_vals_in_set
71
+
72
+
73
@dataclass
class SchemaConfig:
    """Fully parsed fauxdata schema (the result of load_schema)."""

    name: str
    rows: int                        # default row count (CLI --rows overrides)
    columns: list[ColumnSchema]
    description: str = ""
    seed: int | None = None          # default RNG seed (CLI --seed overrides)
    locale: str = "US"               # country/locale code for generated data
    output_format: str = "csv"       # default export format
    output_path: str | None = None   # default export path; None -> derived name
    validation_rules: list[ValidationRule] = field(default_factory=list)
84
+
85
+
86
def load_schema(path: str | Path) -> SchemaConfig:
    """Load and parse a YAML schema file.

    Raises FileNotFoundError when the file is missing and ValueError when
    the schema content is invalid.
    """
    schema_file = Path(path)
    if not schema_file.exists():
        raise FileNotFoundError(f"Schema file not found: {schema_file}")

    data = yaml.safe_load(schema_file.read_text())
    return _parse_schema(data)
96
+
97
+
98
def _parse_schema(data: dict) -> SchemaConfig:
    """Build a SchemaConfig from a parsed YAML mapping.

    Raises:
        ValueError: if a required field ('name', 'columns') is missing or a
            column/rule entry is invalid.
    """
    if "name" not in data:
        raise ValueError("Schema must have a 'name' field")
    if "columns" not in data:
        raise ValueError("Schema must have a 'columns' field")

    # Fix: a bare `output:` or `validation:` key parses as YAML null (None),
    # which previously crashed the `.get(...)` / iteration below. `or {}` /
    # `or []` treats an empty section like an absent one.
    output = data.get("output") or {}

    columns = [
        _parse_column(col_name, col_data)
        for col_name, col_data in data["columns"].items()
    ]
    validation_rules = [
        _parse_rule(rule_data)
        for rule_data in (data.get("validation") or [])
    ]

    return SchemaConfig(
        name=data["name"],
        rows=data.get("rows", 100),
        seed=data.get("seed", None),
        description=data.get("description", ""),
        locale=data.get("locale", "US"),
        output_format=output.get("format", "csv"),
        output_path=output.get("path", None),
        columns=columns,
        validation_rules=validation_rules,
    )
132
+
133
+
134
def _parse_column(name: str, data: dict) -> ColumnSchema:
    """Build a ColumnSchema from one entry of the YAML `columns` mapping.

    Raises:
        ValueError: if the column body is not a mapping, its type is
            missing/unknown, or its preset is unknown.
    """
    # Fix: a bare `colname:` in YAML yields None, which previously raised a
    # confusing TypeError on the `"type" not in data` membership test.
    if not isinstance(data, dict):
        raise ValueError(f"Column '{name}' must be a mapping with a 'type' field")
    if "type" not in data:
        raise ValueError(f"Column '{name}' must have a 'type' field")
    col_type = data["type"]
    if col_type not in VALID_TYPES:
        raise ValueError(f"Column '{name}': invalid type '{col_type}'. Valid: {VALID_TYPES}")

    preset = data.get("preset", None)
    if preset and preset not in STRING_PRESETS:
        raise ValueError(f"Column '{name}': unknown preset '{preset}'. Valid: {STRING_PRESETS}")

    return ColumnSchema(
        name=name,
        col_type=col_type,
        unique=data.get("unique", False),
        nullable=data.get("nullable", False),
        min=data.get("min", None),
        max=data.get("max", None),
        preset=preset,
        locale=data.get("locale", None),
        precision=data.get("precision", None),
        values=data.get("values", None),
    )
157
+
158
+
159
def _parse_rule(data: dict) -> ValidationRule:
    """Build a ValidationRule from one entry of the YAML `validation` list.

    Raises:
        ValueError: if the 'rule' key is missing or names an unknown rule.
    """
    if "rule" not in data:
        raise ValueError("Validation rule must have a 'rule' field")
    rule_name = data["rule"]
    if rule_name not in VALID_RULES:
        raise ValueError(f"Unknown validation rule '{rule_name}'. Valid: {VALID_RULES}")

    # All remaining fields are optional and default to None.
    optional = {
        key: data.get(key)
        for key in ("columns", "column", "min", "max", "pattern", "values")
    }
    return ValidationRule(rule=rule_name, **optional)
fauxdata/validator.py ADDED
@@ -0,0 +1,108 @@
1
+ """Validation of datasets using pointblank."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import pointblank as pb
6
+ import polars as pl
7
+
8
+ from fauxdata.schema import SchemaConfig, ValidationRule
9
+
10
+
11
def validate_dataset(df: pl.DataFrame, schema: SchemaConfig) -> tuple[bool, list[dict]]:
    """Run pointblank validation rules against df.

    Returns (all_passed, results_list); trivially (True, []) when the schema
    declares no rules.
    """
    rules = schema.validation_rules
    if not rules:
        return True, []

    # warning=1: a single failing row marks the step as failed.
    validation = pb.Validate(
        data=df,
        tbl_name=schema.name,
        thresholds=pb.Thresholds(warning=1),
    )

    for rule in rules:
        _add_rule(validation, rule)

    validation.interrogate()

    return validation.all_passed(), _extract_results(validation, rules)
33
+
34
+
35
def _add_rule(v: pb.Validate, rule: ValidationRule) -> None:
    """Register one ValidationRule on the pointblank Validate object.

    Unknown rule names are silently ignored here; they are rejected earlier,
    at schema-parse time.
    """
    name = rule.rule
    # Multi-column rules accept either `columns: [...]` or a single `column:`.
    fan_out = rule.columns or ([rule.column] if rule.column else [])

    if name == "col_vals_not_null":
        for col in fan_out:
            v.col_vals_not_null(columns=col)
    elif name == "col_exists":
        for col in fan_out:
            v.col_exists(columns=col)
    elif name == "col_vals_between":
        v.col_vals_between(columns=rule.column, left=rule.min, right=rule.max)
    elif name == "col_vals_gt":
        v.col_vals_gt(columns=rule.column, value=rule.min)
    elif name == "col_vals_ge":
        v.col_vals_ge(columns=rule.column, value=rule.min)
    elif name == "col_vals_lt":
        v.col_vals_lt(columns=rule.column, value=rule.max)
    elif name == "col_vals_le":
        v.col_vals_le(columns=rule.column, value=rule.max)
    elif name == "col_vals_regex":
        v.col_vals_regex(columns=rule.column, pattern=rule.pattern)
    elif name == "col_vals_in_set":
        v.col_vals_in_set(columns=rule.column, set=rule.values)
    elif name == "rows_distinct":
        # None means "consider all columns" for pointblank.
        v.rows_distinct(columns_subset=rule.columns or ([rule.column] if rule.column else None))
72
+
73
+
74
def _extract_results(v: pb.Validate, rules: list[ValidationRule]) -> list[dict]:
    """Extract per-step results as a list of dicts.

    Step numbering mirrors _add_rule: the multi-column rules (not-null and
    col-exists) expand into one pointblank step per column; every other rule
    occupies exactly one step.
    """
    results: list[dict] = []
    step_no = 1
    for rule in rules:
        if rule.rule in ("col_vals_not_null", "col_exists"):
            targets = rule.columns or ([rule.column] if rule.column else [])
        else:
            targets = [rule.column or (", ".join(rule.columns) if rule.columns else "-")]
        for col in targets:
            results.append(_get_step(v, step_no, rule.rule, col))
            step_no += 1
    return results
90
+
91
+
92
+ def _get_step(v: pb.Validate, i: int, rule: str, column: str) -> dict:
93
+ try:
94
+ passed = v.n_passed(i=i, scalar=True) or 0
95
+ failed = v.n_failed(i=i, scalar=True) or 0
96
+ except Exception:
97
+ passed = 0
98
+ failed = 0
99
+ total = passed + failed
100
+ return {
101
+ "step": i,
102
+ "rule": rule,
103
+ "column": column,
104
+ "passed": passed,
105
+ "failed": failed,
106
+ "total": total,
107
+ "ok": failed == 0,
108
+ }
@@ -0,0 +1,13 @@
1
+ Metadata-Version: 2.4
2
+ Name: fauxdata-cli
3
+ Version: 0.1.0
4
+ Summary: CLI for generating and validating fake datasets
5
+ Requires-Python: >=3.11
6
+ Requires-Dist: faker>=26.0
7
+ Requires-Dist: pointblank>=0.22
8
+ Requires-Dist: polars>=1.0
9
+ Requires-Dist: pyfiglet>=1.0
10
+ Requires-Dist: pyyaml>=6.0
11
+ Requires-Dist: questionary>=2.0
12
+ Requires-Dist: rich>=13
13
+ Requires-Dist: typer>=0.12
@@ -0,0 +1,15 @@
1
+ fauxdata/__init__.py,sha256=eSywJE8B_zDUGCVhE_l8gXkdSKtB2MqGWJA1SkZNskU,89
2
+ fauxdata/generator.py,sha256=cN04eq5w-7DDzotGzUj9iZ6EAhOKkw3AanytA68bvXo,2596
3
+ fauxdata/main.py,sha256=MOYg2OpBICrlR2an9b_1E2ujakDqNfrOwDeNVqi6pEo,2475
4
+ fauxdata/output.py,sha256=VS-aXquONgMxSaLBjoUBvFwQrnOSJl5X9Yzrx3XFzjY,1562
5
+ fauxdata/schema.py,sha256=fQ3pB-BhgRgFZxCaBaJte985g8k2ODaygPgCIiDwvCM,4732
6
+ fauxdata/validator.py,sha256=VF7BMvrNhYlojCxtnEP24CIbcc4Dfqi7u-pWeuyoMTo,3193
7
+ fauxdata/commands/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
+ fauxdata/commands/generate.py,sha256=suzClcxVO8rKJBR98rpylP3-dlZzNyWLAhsnqpEuklc,3562
9
+ fauxdata/commands/init.py,sha256=6MkC_t-oB6BuJw7XbRFTZJtbV9cfV8UVMDIIE7BCNQg,2620
10
+ fauxdata/commands/preview.py,sha256=MU-emgNC3lzFuDU-2aPKTllnxObyq-mmMlXI1X47QIU,2612
11
+ fauxdata/commands/validate.py,sha256=BBX2DQJoZDgffGLUw7ezXKhcMxz0SuxyN3QebF6AEuE,2121
12
+ fauxdata_cli-0.1.0.dist-info/METADATA,sha256=dLOlWhmKxno1O0SCsSu6PkBTTq-BI7J7283IZC53njA,362
13
+ fauxdata_cli-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
14
+ fauxdata_cli-0.1.0.dist-info/entry_points.txt,sha256=fplnEfv4hunVCf1dWhc7fBdcAP_10PdCAqCfxIOnh1o,47
15
+ fauxdata_cli-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ fauxdata = fauxdata.main:app