fauxdata-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
fauxdata/__init__.py ADDED
@@ -0,0 +1,3 @@
1
"""fauxdata - CLI for generating and validating fake datasets."""

# Single source of truth for the package version (mirrors the wheel metadata).
__version__ = "0.1.0"
File without changes
@@ -0,0 +1,117 @@
1
+ """fauxdata generate command."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Optional
7
+
8
+ import typer
9
+ from rich import print as rprint
10
+ from rich.console import Console
11
+ from rich.panel import Panel
12
+ from rich.table import Table
13
+
14
+ from fauxdata.generator import generate_dataset
15
+ from fauxdata.output import default_output_path, export_dataset, write_stdout
16
+ from fauxdata.schema import load_schema
17
+ from fauxdata.validator import validate_dataset
18
+
19
+ console = Console()
20
+
21
+
22
def run(
    schema_path: str,
    rows: Optional[int] = None,
    out: Optional[str] = None,
    fmt: Optional[str] = None,
    seed: Optional[int] = None,
    validate: bool = False,
):
    """Generate a fake dataset from a YAML schema."""
    schema = load_schema(schema_path)

    # Explicit CLI arguments win over schema-level defaults.
    row_count = schema.rows if rows is None else rows
    effective_seed = schema.seed if seed is None else seed
    chosen_fmt = fmt or schema.output_format
    target = out or schema.output_path or default_output_path(schema.name, chosen_fmt)

    # "-" means: stream the dataset to stdout with no decorative output.
    if target == "-":
        frame = generate_dataset(schema, rows=row_count, seed=effective_seed)
        write_stdout(frame, chosen_fmt)
        return

    rprint(Panel(f"[bold cyan]fauxdata generate[/bold cyan] [dim]{schema_path}[/dim]", expand=False))

    with console.status(f"[bold green]Generating {row_count} rows...[/bold green]"):
        frame = generate_dataset(schema, rows=row_count, seed=effective_seed)

    _print_schema_table(schema, row_count, effective_seed)

    saved = export_dataset(frame, target, chosen_fmt)
    rprint(f"\n[green]Saved[/green] [bold]{saved}[/bold] ([dim]{chosen_fmt}, {row_count} rows[/dim])")

    if validate:
        _run_validation(frame, schema)
58
+
59
+
60
def _print_schema_table(schema, n: int, seed):
    """Render the schema's columns plus the effective rows/seed/locale settings."""
    table = Table(title=f"Schema: {schema.name}", show_header=True, header_style="bold magenta")
    table.add_column("Column", style="cyan")
    for heading in ("Type", "Preset/Values", "Min", "Max", "Unique"):
        table.add_column(heading)

    def bound(value):
        # "-" marks an unset bound.
        return "-" if value is None else str(value)

    for column in schema.columns:
        table.add_row(
            column.name,
            column.col_type,
            column.preset or (str(column.values) if column.values else "-"),
            bound(column.min),
            bound(column.max),
            "yes" if column.unique else "no",
        )

    console.print(table)
    rprint(f" rows=[bold]{n}[/bold] seed=[bold]{seed}[/bold] locale=[bold]{schema.locale}[/bold]")
82
+
83
+
84
def _run_validation(df, schema):
    """Run the schema's validation rules against df and render a results table.

    Exits the process with code 1 when any rule fails.
    """
    rprint("\n[bold]Running validation...[/bold]")
    all_passed, results = validate_dataset(df, schema)

    if not results:
        rprint("[yellow]No validation rules defined.[/yellow]")
        return

    table = Table(title="Validation Results", show_header=True, header_style="bold")
    table.add_column("#")
    table.add_column("Rule", style="cyan")
    table.add_column("Column")
    table.add_column("Passed", justify="right")
    table.add_column("Failed", justify="right")
    table.add_column("Status")

    for entry in results:
        table.add_row(
            str(entry["step"]),
            entry["rule"],
            entry["column"],
            str(entry["passed"]),
            str(entry["failed"]),
            "[green]PASS[/green]" if entry["ok"] else "[red]FAIL[/red]",
        )

    console.print(table)

    if not all_passed:
        rprint("[bold red]Some validation rules failed.[/bold red]")
        raise typer.Exit(code=1)
    rprint("[bold green]All validation rules passed.[/bold green]")
@@ -0,0 +1,116 @@
1
+ """fauxdata init command - create a schema template."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ import questionary
8
+ import typer
9
+ from rich import print as rprint
10
+ from rich.panel import Panel
11
+
12
# YAML scaffold written by `fauxdata init`. The preset list in the comment
# below must only name entries from fauxdata.schema.STRING_PRESETS, otherwise
# a user following it gets a schema-load error.
# Fix: the old comment advertised `phone`, `uuid` and `ip_address`, which are
# not valid presets (the real names are phone_number, uuid4, ipv4).
TEMPLATE = """\
# fauxdata schema template
# Generated by: fauxdata init

name: {name}
description: "{description}"
rows: {rows}
seed: 42
output:
  format: {fmt}
  path: {name}.{fmt}

columns:
  id:
    type: int
    unique: true
    min: 1
    max: 99999

  name:
    type: string
    preset: name  # presets: name, email, phone_number, city, country, company, job, uuid4, ipv4, url, iban...

  email:
    type: string
    preset: email

  age:
    type: int
    min: 18
    max: 90

  active:
    type: bool

  score:
    type: float
    min: 0.0
    max: 100.0
    precision: 2

  signup_date:
    type: date
    min: "2020-01-01"
    max: "2024-12-31"

  # Example enum column (pick from a fixed set):
  # status:
  #   type: string
  #   values: [active, inactive, pending]

validation:
  - rule: col_vals_not_null
    columns: [id, name, email]
  - rule: col_vals_between
    column: age
    min: 18
    max: 90
  - rule: col_vals_regex
    column: email
    pattern: "^[^@]+@[^@]+\\\\.[^@]+$"
  - rule: rows_distinct
    columns: [id]
"""
76
+
77
+
78
def run(name: str | None = None):
    """Interactive schema template creator.

    Prompts for a schema name, description, row count and output format, then
    writes ``<name>.yml`` in the current directory. Aborts cleanly when the
    user cancels a prompt or declines to overwrite an existing file.
    """
    rprint(Panel("[bold cyan]fauxdata init[/bold cyan] — schema template creator", expand=False))

    schema_name = name or questionary.text(
        "Schema name (e.g. people, orders):",
        default="my_dataset",
    ).ask()

    # questionary returns None when the user cancels (Ctrl-C).
    if schema_name is None:
        raise typer.Abort()

    description = questionary.text(
        "Short description:",
        default=f"{schema_name} dataset",
    ).ask() or ""

    rows = questionary.text("Default number of rows:", default="1000").ask() or "1000"
    # Fix: previously any text was written verbatim into the template, which
    # produced an invalid `rows:` value in the YAML; fall back to the default
    # for non-numeric input.
    if not rows.strip().isdigit():
        rprint(f"[yellow]'{rows}' is not a number; using 1000.[/yellow]")
        rows = "1000"

    fmt = questionary.select(
        "Default output format:",
        choices=["csv", "parquet", "json", "jsonl"],
    ).ask() or "csv"

    output_file = f"{schema_name}.yml"
    out_path = Path(output_file)

    if out_path.exists():
        overwrite = questionary.confirm(f"{output_file} already exists. Overwrite?", default=False).ask()
        if not overwrite:
            rprint("[yellow]Aborted.[/yellow]")
            raise typer.Exit()

    content = TEMPLATE.format(name=schema_name, description=description, rows=rows, fmt=fmt)
    out_path.write_text(content)

    rprint(f"[green]Created[/green] [bold]{output_file}[/bold]")
    rprint("[dim]Edit the schema then run:[/dim]")
    rprint(f" [cyan]fauxdata generate {output_file} --validate[/cyan]")
@@ -0,0 +1,79 @@
1
+ """fauxdata preview command."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import polars as pl
6
+ import typer
7
+ from rich import print as rprint
8
+ from rich.console import Console
9
+ from rich.panel import Panel
10
+ from rich.table import Table
11
+
12
+ console = Console()
13
+
14
+
15
def _load_dataset(dataset_path: str):
    """Read a dataset file by extension; exits with code 1 on unknown formats."""
    readers = {
        "csv": pl.read_csv,
        "parquet": pl.read_parquet,
        "json": pl.read_json,
        "jsonl": pl.read_ndjson,
        "ndjson": pl.read_ndjson,
    }
    ext = dataset_path.rsplit(".", 1)[-1].lower()
    reader = readers.get(ext)
    if reader is None:
        rprint(f"[red]Unsupported file format: .{ext}[/red]")
        raise typer.Exit(code=1)
    return reader(dataset_path)


def _min_max(series) -> tuple[str, str]:
    """Return (min, max) as strings for numeric and date/datetime columns.

    Non-orderable column types (and any polars error) report ("-", "-").
    The original had two byte-identical branches for numeric and temporal
    dtypes; they are collapsed into one membership test here.
    """
    orderable = (pl.Int8, pl.Int16, pl.Int32, pl.Int64,
                 pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
                 pl.Float32, pl.Float64, pl.Date, pl.Datetime)
    try:
        if series.dtype in orderable:
            return str(series.min()), str(series.max())
    except Exception:
        pass
    return "-", "-"


def run(dataset_path: str, rows: int = 10):
    """Show a preview of a dataset with column statistics."""
    rprint(Panel(f"[bold cyan]fauxdata preview[/bold cyan] [dim]{dataset_path}[/dim]", expand=False))

    df = _load_dataset(dataset_path)

    rprint(f" [bold]{len(df)}[/bold] rows × [bold]{len(df.columns)}[/bold] columns\n")

    # Data preview table.
    preview_df = df.head(rows)
    t = Table(title=f"First {min(rows, len(df))} rows", show_header=True, header_style="bold magenta")
    for col in preview_df.columns:
        t.add_column(col, overflow="fold", max_width=25)
    for row in preview_df.iter_rows():
        t.add_row(*[str(v) if v is not None else "[dim]null[/dim]" for v in row])
    console.print(t)

    # Per-column statistics.
    stats_t = Table(title="Column Statistics", show_header=True, header_style="bold cyan")
    stats_t.add_column("Column", style="cyan")
    stats_t.add_column("Type")
    stats_t.add_column("Nulls", justify="right")
    stats_t.add_column("Unique", justify="right")
    stats_t.add_column("Min")
    stats_t.add_column("Max")

    for col in df.columns:
        series = df[col]
        col_min, col_max = _min_max(series)
        stats_t.add_row(
            col,
            str(series.dtype),
            str(series.null_count()),
            str(series.n_unique()),
            col_min,
            col_max,
        )

    console.print(stats_t)
@@ -0,0 +1,73 @@
1
+ """fauxdata validate command."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import polars as pl
6
+ import typer
7
+ from rich import print as rprint
8
+ from rich.console import Console
9
+ from rich.panel import Panel
10
+ from rich.table import Table
11
+
12
+ from fauxdata.schema import load_schema
13
+ from fauxdata.validator import validate_dataset
14
+
15
+ console = Console()
16
+
17
+
18
def run(dataset_path: str, schema_path: str):
    """Validate an existing dataset against a YAML schema."""
    schema = load_schema(schema_path)

    rprint(Panel(
        f"[bold cyan]fauxdata validate[/bold cyan] [dim]{dataset_path}[/dim]",
        expand=False,
    ))

    # Pick the polars reader from the file extension.
    loaders = {
        "csv": pl.read_csv,
        "parquet": pl.read_parquet,
        "json": pl.read_json,
        "jsonl": pl.read_ndjson,
        "ndjson": pl.read_ndjson,
    }
    ext = dataset_path.rsplit(".", 1)[-1].lower()
    if ext not in loaders:
        rprint(f"[red]Unsupported file format: .{ext}[/red]")
        raise typer.Exit(code=1)
    df = loaders[ext](dataset_path)

    rprint(f" Loaded [bold]{len(df)}[/bold] rows, [bold]{len(df.columns)}[/bold] columns")

    if not schema.validation_rules:
        rprint("[yellow]No validation rules defined in schema.[/yellow]")
        raise typer.Exit()

    all_passed, results = validate_dataset(df, schema)

    table = Table(title="Validation Results", show_header=True, header_style="bold")
    table.add_column("#")
    table.add_column("Rule", style="cyan")
    table.add_column("Column")
    table.add_column("Passed", justify="right")
    table.add_column("Failed", justify="right")
    table.add_column("Status")

    for entry in results:
        table.add_row(
            str(entry["step"]),
            entry["rule"],
            entry["column"],
            str(entry["passed"]),
            str(entry["failed"]),
            "[green]PASS[/green]" if entry["ok"] else "[red]FAIL[/red]",
        )

    console.print(table)

    if not all_passed:
        rprint("[bold red]Some validation rules failed.[/bold red]")
        raise typer.Exit(code=1)
    rprint("[bold green]All validation rules passed.[/bold green]")
fauxdata/generator.py ADDED
@@ -0,0 +1,80 @@
1
+ """Data generation using pointblank native API."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import pointblank as pb
6
+ import polars as pl
7
+
8
+ from fauxdata.schema import ColumnSchema, SchemaConfig
9
+
10
+
11
def generate_dataset(schema: SchemaConfig, rows: int | None = None, seed: int | None = None) -> pl.DataFrame:
    """Generate a Polars DataFrame from a SchemaConfig using pointblank.

    Explicit ``rows``/``seed`` arguments override the schema-level defaults.
    """
    effective_rows = schema.rows if rows is None else rows
    effective_seed = schema.seed if seed is None else seed

    return pb.generate_dataset(
        _build_pb_schema(schema),
        n=effective_rows,
        seed=effective_seed,
        # pointblank expects a country code; fall back to US when unset.
        country=schema.locale or "US",
    )
21
+
22
+
23
def _build_pb_schema(schema: SchemaConfig) -> pb.Schema:
    """Convert a SchemaConfig to a pointblank Schema (one field per column)."""
    return pb.Schema(**{col.name: _col_to_field(col) for col in schema.columns})
29
+
30
+
31
def _col_to_field(col: ColumnSchema):
    """Convert a ColumnSchema to a pointblank field spec.

    Unrecognized column types fall back to a plain string field.
    NOTE(review): `col.precision` and `col.locale` are never forwarded to
    pointblank, and enum string columns (`values`) ignore `unique` — confirm
    whether pointblank's field API supports them.
    """
    kind = col.col_type

    if kind == "int":
        return pb.int_field(
            min_val=int(col.min) if col.min is not None else None,
            max_val=int(col.max) if col.max is not None else None,
            nullable=col.nullable,
            unique=col.unique,
        )

    if kind == "float":
        return pb.float_field(
            min_val=float(col.min) if col.min is not None else None,
            max_val=float(col.max) if col.max is not None else None,
            nullable=col.nullable,
            unique=col.unique,
        )

    if kind == "bool":
        return pb.bool_field(nullable=col.nullable)

    if kind in ("date", "datetime"):
        # date_field and datetime_field take identical keyword arguments.
        make_field = pb.date_field if kind == "date" else pb.datetime_field
        return make_field(
            min_date=str(col.min) if col.min is not None else None,
            max_date=str(col.max) if col.max is not None else None,
            nullable=col.nullable,
            unique=col.unique,
        )

    if kind == "string":
        if col.values:
            return pb.string_field(allowed=col.values, nullable=col.nullable)
        if col.preset:
            return pb.string_field(preset=col.preset, nullable=col.nullable, unique=col.unique)
        return pb.string_field(nullable=col.nullable, unique=col.unique)

    # Should be unreachable: schema parsing rejects unknown types.
    return pb.string_field(nullable=col.nullable)
fauxdata/main.py ADDED
@@ -0,0 +1,73 @@
1
+ """fauxdata CLI entry point."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Optional
6
+
7
+ import pyfiglet
8
+ import typer
9
+ from rich import print as rprint
10
+ from rich.console import Console
11
+
12
# Top-level Typer application. Subcommands are registered below; their
# implementations are imported lazily inside each command function to keep
# CLI startup fast.
app = typer.Typer(
    name="fauxdata",
    help="Generate and validate fake datasets from YAML schemas.",
    add_completion=False,
)
console = Console()
18
+
19
+
20
def _banner():
    """Print the ASCII-art banner and tagline (shown when no subcommand is given)."""
    banner = pyfiglet.figlet_format("fauxdata", font="slant")
    rprint(f"[bold cyan]{banner}[/bold cyan]")
    rprint("[dim]Generate and validate realistic fake datasets[/dim]\n")
24
+
25
+
26
@app.callback(invoke_without_command=True)
def main(ctx: typer.Context):
    # Bare `fauxdata` invocation: show the banner followed by the help text.
    if ctx.invoked_subcommand is None:
        _banner()
        rprint(ctx.get_help())
31
+
32
+
33
@app.command("init")
def init_cmd(
    name: Optional[str] = typer.Option(None, "--name", "-n", help="Schema name"),
):
    """Create a schema template interactively."""
    # Lazy import: avoids paying for questionary/rich panels on every CLI start.
    from fauxdata.commands.init import run
    run(name=name)
40
+
41
+
42
@app.command("generate")
def generate_cmd(
    schema: str = typer.Argument(..., help="Path to YAML schema file"),
    rows: Optional[int] = typer.Option(None, "--rows", "-r", help="Number of rows to generate"),
    out: Optional[str] = typer.Option(None, "--out", "-o", help="Output file path"),
    fmt: Optional[str] = typer.Option(None, "--format", "-f", help="Output format: csv, parquet, json, jsonl"),
    seed: Optional[int] = typer.Option(None, "--seed", "-s", help="Random seed for reproducibility"),
    validate: bool = typer.Option(False, "--validate", "-v", help="Run validation after generating"),
):
    """Generate a fake dataset from a YAML schema."""
    # Options default to None so the command implementation can distinguish
    # "not given" (use schema defaults) from an explicit value.
    from fauxdata.commands.generate import run
    run(schema_path=schema, rows=rows, out=out, fmt=fmt, seed=seed, validate=validate)
54
+
55
+
56
@app.command("validate")
def validate_cmd(
    dataset: str = typer.Argument(..., help="Path to dataset file (csv, parquet, json, jsonl)"),
    schema: str = typer.Argument(..., help="Path to YAML schema file"),
):
    """Validate an existing dataset against a YAML schema."""
    # Lazy import keeps polars/pointblank off the startup path.
    from fauxdata.commands.validate import run
    run(dataset_path=dataset, schema_path=schema)
64
+
65
+
66
@app.command("preview")
def preview_cmd(
    dataset: str = typer.Argument(..., help="Path to dataset file"),
    rows: int = typer.Option(10, "--rows", "-r", help="Number of rows to preview"),
):
    """Show a preview and column statistics for a dataset."""
    # Lazy import keeps polars off the startup path.
    from fauxdata.commands.preview import run
    run(dataset_path=dataset, rows=rows)
fauxdata/output.py ADDED
@@ -0,0 +1,57 @@
1
+ """Export functions for fauxdata datasets."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ import polars as pl
8
+
9
+
10
def normalize_fmt(fmt: str) -> str:
    """Normalize a format name to its canonical lowercase form.

    Maps the "jsonlines" and "ndjson" aliases to "jsonl" and lowercases the
    input (so e.g. "CSV" is accepted); any other value passes through for the
    caller to reject. The "ndjson" alias matches the extension that the
    preview/validate commands already read.
    """
    fmt = fmt.lower()
    if fmt in ("jsonlines", "ndjson"):
        return "jsonl"
    return fmt
15
+
16
+
17
def export_dataset(df: pl.DataFrame, path: str | Path, fmt: str) -> Path:
    """Export a DataFrame to the given format and path. Returns the output path."""
    fmt = normalize_fmt(fmt)
    out = Path(path)
    # Create intermediate directories so nested output paths just work.
    out.parent.mkdir(parents=True, exist_ok=True)

    writers = {
        "csv": df.write_csv,
        "parquet": df.write_parquet,
        "json": df.write_json,
        "jsonl": df.write_ndjson,
    }
    writer = writers.get(fmt)
    if writer is None:
        raise ValueError(f"Unsupported format: {fmt}. Use csv, parquet, json, jsonl, or jsonlines.")
    writer(out)

    return out
35
+
36
+
37
def write_stdout(df: pl.DataFrame, fmt: str) -> None:
    """Write a DataFrame to stdout (raw bytes for parquet, text otherwise)."""
    import sys

    fmt = normalize_fmt(fmt)

    if fmt == "parquet":
        # Parquet is binary: serialize into a buffer, then dump the raw bytes.
        import io
        buffer = io.BytesIO()
        df.write_parquet(buffer)
        sys.stdout.buffer.write(buffer.getvalue())
        return

    if fmt == "csv":
        rendered = df.write_csv()
    elif fmt == "json":
        rendered = df.write_json()
    elif fmt == "jsonl":
        rendered = df.write_ndjson()
    else:
        raise ValueError(f"Unsupported format: {fmt}. Use csv, parquet, json, or jsonl.")
    sys.stdout.write(rendered)
54
+
55
+
56
def default_output_path(schema_name: str, fmt: str) -> str:
    """Default to "<schema name>.<normalized format>" in the working directory."""
    return f"{schema_name}.{normalize_fmt(fmt)}"
fauxdata/schema.py ADDED
@@ -0,0 +1,174 @@
1
+ """Schema parsing and validation for fauxdata YAML schemas."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ import yaml
10
+
11
+
12
# Column types accepted in a schema's `columns` section.
VALID_TYPES = {"int", "float", "string", "bool", "date", "datetime"}

# Presets accepted for `type: string` columns; checked in _parse_column so a
# typo fails at schema-load time rather than at generation time.
STRING_PRESETS = {
    # Personal
    "name", "name_full", "first_name", "last_name",
    "email", "phone_number",
    "address", "city", "state", "country", "country_code_2", "country_code_3", "postcode",
    "latitude", "longitude",
    # Business
    "company", "job", "catch_phrase",
    # Internet
    "url", "domain_name", "ipv4", "ipv6", "user_name", "password",
    # Text
    "text", "sentence", "paragraph", "word",
    # Financial
    "credit_card_number", "iban", "currency_code",
    # Identifiers
    "uuid4", "md5", "sha1",
    # Misc
    "license_plate", "ssn",
}

# Rule names accepted in the schema's `validation` section (these mirror the
# checks wired up in fauxdata.validator._add_rule).
VALID_RULES = {
    "col_vals_not_null",
    "col_vals_between",
    "col_vals_regex",
    "col_vals_in_set",
    "col_vals_gt",
    "col_vals_lt",
    "col_vals_ge",
    "col_vals_le",
    "rows_distinct",
    "col_exists",
}
46
+
47
+
48
@dataclass
class ColumnSchema:
    """Parsed definition of a single column in a fauxdata schema."""

    name: str
    col_type: str                 # one of VALID_TYPES
    unique: bool = False          # generated values must be distinct
    nullable: bool = False        # allow nulls in generated data
    min: Any = None               # lower bound (numbers) or earliest date string
    max: Any = None               # upper bound (numbers) or latest date string
    preset: str | None = None     # one of STRING_PRESETS (string columns only)
    locale: str | None = None     # per-column locale override
    precision: int | None = None  # decimal places for float columns
    values: list | None = None    # fixed value set (enum-style column / in_set)
60
+
61
+
62
@dataclass
class ValidationRule:
    """One entry from the schema's `validation` list.

    Only the fields relevant to the particular rule are set; the rest stay
    None (e.g. `pattern` is only used by col_vals_regex).
    """

    rule: str                        # one of VALID_RULES
    columns: list[str] | None = None # multi-column rules (not_null, exists, distinct)
    column: str | None = None        # single-column rules
    min: Any = None                  # lower bound / comparison value
    max: Any = None                  # upper bound / comparison value
    pattern: str | None = None       # regex for col_vals_regex
    values: list | None = None       # allowed set for col_vals_in_set
71
+
72
+
73
@dataclass
class SchemaConfig:
    """Fully parsed fauxdata schema (the result of load_schema)."""

    name: str
    rows: int                        # default row count (CLI --rows overrides)
    columns: list[ColumnSchema]
    description: str = ""
    seed: int | None = None          # default RNG seed (CLI --seed overrides)
    locale: str = "US"               # country/locale code for generated data
    output_format: str = "csv"       # default export format
    output_path: str | None = None   # default export path; None -> derived name
    validation_rules: list[ValidationRule] = field(default_factory=list)
84
+
85
+
86
def load_schema(path: str | Path) -> SchemaConfig:
    """Load and parse a YAML schema file.

    Raises FileNotFoundError when the file is missing and ValueError when
    the schema content is invalid.
    """
    schema_file = Path(path)
    if not schema_file.exists():
        raise FileNotFoundError(f"Schema file not found: {schema_file}")

    data = yaml.safe_load(schema_file.read_text())
    return _parse_schema(data)
96
+
97
+
98
def _parse_schema(data: dict) -> SchemaConfig:
    """Build a SchemaConfig from a parsed YAML mapping.

    Raises:
        ValueError: if a required field ('name', 'columns') is missing or a
            column/rule entry is invalid.
    """
    if "name" not in data:
        raise ValueError("Schema must have a 'name' field")
    if "columns" not in data:
        raise ValueError("Schema must have a 'columns' field")

    # Fix: a bare `output:` or `validation:` key parses as YAML null (None),
    # which previously crashed the `.get(...)` / iteration below. `or {}` /
    # `or []` treats an empty section like an absent one.
    output = data.get("output") or {}

    columns = [
        _parse_column(col_name, col_data)
        for col_name, col_data in data["columns"].items()
    ]
    validation_rules = [
        _parse_rule(rule_data)
        for rule_data in (data.get("validation") or [])
    ]

    return SchemaConfig(
        name=data["name"],
        rows=data.get("rows", 100),
        seed=data.get("seed", None),
        description=data.get("description", ""),
        locale=data.get("locale", "US"),
        output_format=output.get("format", "csv"),
        output_path=output.get("path", None),
        columns=columns,
        validation_rules=validation_rules,
    )
132
+
133
+
134
def _parse_column(name: str, data: dict) -> ColumnSchema:
    """Build a ColumnSchema from one entry of the YAML `columns` mapping.

    Raises:
        ValueError: if the column body is not a mapping, its type is
            missing/unknown, or its preset is unknown.
    """
    # Fix: a bare `colname:` in YAML yields None, which previously raised a
    # confusing TypeError on the `"type" not in data` membership test.
    if not isinstance(data, dict):
        raise ValueError(f"Column '{name}' must be a mapping with a 'type' field")
    if "type" not in data:
        raise ValueError(f"Column '{name}' must have a 'type' field")
    col_type = data["type"]
    if col_type not in VALID_TYPES:
        raise ValueError(f"Column '{name}': invalid type '{col_type}'. Valid: {VALID_TYPES}")

    preset = data.get("preset", None)
    if preset and preset not in STRING_PRESETS:
        raise ValueError(f"Column '{name}': unknown preset '{preset}'. Valid: {STRING_PRESETS}")

    return ColumnSchema(
        name=name,
        col_type=col_type,
        unique=data.get("unique", False),
        nullable=data.get("nullable", False),
        min=data.get("min", None),
        max=data.get("max", None),
        preset=preset,
        locale=data.get("locale", None),
        precision=data.get("precision", None),
        values=data.get("values", None),
    )
157
+
158
+
159
def _parse_rule(data: dict) -> ValidationRule:
    """Build a ValidationRule from one entry of the YAML `validation` list.

    Raises:
        ValueError: if the 'rule' key is missing or names an unknown rule.
    """
    if "rule" not in data:
        raise ValueError("Validation rule must have a 'rule' field")
    rule_name = data["rule"]
    if rule_name not in VALID_RULES:
        raise ValueError(f"Unknown validation rule '{rule_name}'. Valid: {VALID_RULES}")

    # All remaining fields are optional and default to None.
    optional = {
        key: data.get(key)
        for key in ("columns", "column", "min", "max", "pattern", "values")
    }
    return ValidationRule(rule=rule_name, **optional)
fauxdata/validator.py ADDED
@@ -0,0 +1,108 @@
1
+ """Validation of datasets using pointblank."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import pointblank as pb
6
+ import polars as pl
7
+
8
+ from fauxdata.schema import SchemaConfig, ValidationRule
9
+
10
+
11
def validate_dataset(df: pl.DataFrame, schema: SchemaConfig) -> tuple[bool, list[dict]]:
    """Run pointblank validation rules against df.

    Returns (all_passed, results_list); trivially (True, []) when the schema
    declares no rules.
    """
    rules = schema.validation_rules
    if not rules:
        return True, []

    # warning=1: a single failing row marks the step as failed.
    validation = pb.Validate(
        data=df,
        tbl_name=schema.name,
        thresholds=pb.Thresholds(warning=1),
    )

    for rule in rules:
        _add_rule(validation, rule)

    validation.interrogate()

    return validation.all_passed(), _extract_results(validation, rules)
33
+
34
+
35
def _add_rule(v: pb.Validate, rule: ValidationRule) -> None:
    """Register one ValidationRule on the pointblank Validate object.

    Unknown rule names are silently ignored here; they are rejected earlier,
    at schema-parse time.
    """
    name = rule.rule
    # Multi-column rules accept either `columns: [...]` or a single `column:`.
    fan_out = rule.columns or ([rule.column] if rule.column else [])

    if name == "col_vals_not_null":
        for col in fan_out:
            v.col_vals_not_null(columns=col)
    elif name == "col_exists":
        for col in fan_out:
            v.col_exists(columns=col)
    elif name == "col_vals_between":
        v.col_vals_between(columns=rule.column, left=rule.min, right=rule.max)
    elif name == "col_vals_gt":
        v.col_vals_gt(columns=rule.column, value=rule.min)
    elif name == "col_vals_ge":
        v.col_vals_ge(columns=rule.column, value=rule.min)
    elif name == "col_vals_lt":
        v.col_vals_lt(columns=rule.column, value=rule.max)
    elif name == "col_vals_le":
        v.col_vals_le(columns=rule.column, value=rule.max)
    elif name == "col_vals_regex":
        v.col_vals_regex(columns=rule.column, pattern=rule.pattern)
    elif name == "col_vals_in_set":
        v.col_vals_in_set(columns=rule.column, set=rule.values)
    elif name == "rows_distinct":
        # None means "consider all columns" for pointblank.
        v.rows_distinct(columns_subset=rule.columns or ([rule.column] if rule.column else None))
72
+
73
+
74
def _extract_results(v: pb.Validate, rules: list[ValidationRule]) -> list[dict]:
    """Extract per-step results as a list of dicts.

    Step numbering mirrors _add_rule: the multi-column rules (not-null and
    col-exists) expand into one pointblank step per column; every other rule
    occupies exactly one step.
    """
    results: list[dict] = []
    step_no = 1
    for rule in rules:
        if rule.rule in ("col_vals_not_null", "col_exists"):
            targets = rule.columns or ([rule.column] if rule.column else [])
        else:
            targets = [rule.column or (", ".join(rule.columns) if rule.columns else "-")]
        for col in targets:
            results.append(_get_step(v, step_no, rule.rule, col))
            step_no += 1
    return results
90
+
91
+
92
+ def _get_step(v: pb.Validate, i: int, rule: str, column: str) -> dict:
93
+ try:
94
+ passed = v.n_passed(i=i, scalar=True) or 0
95
+ failed = v.n_failed(i=i, scalar=True) or 0
96
+ except Exception:
97
+ passed = 0
98
+ failed = 0
99
+ total = passed + failed
100
+ return {
101
+ "step": i,
102
+ "rule": rule,
103
+ "column": column,
104
+ "passed": passed,
105
+ "failed": failed,
106
+ "total": total,
107
+ "ok": failed == 0,
108
+ }
@@ -0,0 +1,13 @@
1
+ Metadata-Version: 2.4
2
+ Name: fauxdata-cli
3
+ Version: 0.1.0
4
+ Summary: CLI for generating and validating fake datasets
5
+ Requires-Python: >=3.11
6
+ Requires-Dist: faker>=26.0
7
+ Requires-Dist: pointblank>=0.22
8
+ Requires-Dist: polars>=1.0
9
+ Requires-Dist: pyfiglet>=1.0
10
+ Requires-Dist: pyyaml>=6.0
11
+ Requires-Dist: questionary>=2.0
12
+ Requires-Dist: rich>=13
13
+ Requires-Dist: typer>=0.12
@@ -0,0 +1,15 @@
1
+ fauxdata/__init__.py,sha256=eSywJE8B_zDUGCVhE_l8gXkdSKtB2MqGWJA1SkZNskU,89
2
+ fauxdata/generator.py,sha256=cN04eq5w-7DDzotGzUj9iZ6EAhOKkw3AanytA68bvXo,2596
3
+ fauxdata/main.py,sha256=MOYg2OpBICrlR2an9b_1E2ujakDqNfrOwDeNVqi6pEo,2475
4
+ fauxdata/output.py,sha256=VS-aXquONgMxSaLBjoUBvFwQrnOSJl5X9Yzrx3XFzjY,1562
5
+ fauxdata/schema.py,sha256=fQ3pB-BhgRgFZxCaBaJte985g8k2ODaygPgCIiDwvCM,4732
6
+ fauxdata/validator.py,sha256=VF7BMvrNhYlojCxtnEP24CIbcc4Dfqi7u-pWeuyoMTo,3193
7
+ fauxdata/commands/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
+ fauxdata/commands/generate.py,sha256=suzClcxVO8rKJBR98rpylP3-dlZzNyWLAhsnqpEuklc,3562
9
+ fauxdata/commands/init.py,sha256=6MkC_t-oB6BuJw7XbRFTZJtbV9cfV8UVMDIIE7BCNQg,2620
10
+ fauxdata/commands/preview.py,sha256=MU-emgNC3lzFuDU-2aPKTllnxObyq-mmMlXI1X47QIU,2612
11
+ fauxdata/commands/validate.py,sha256=BBX2DQJoZDgffGLUw7ezXKhcMxz0SuxyN3QebF6AEuE,2121
12
+ fauxdata_cli-0.1.0.dist-info/METADATA,sha256=dLOlWhmKxno1O0SCsSu6PkBTTq-BI7J7283IZC53njA,362
13
+ fauxdata_cli-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
14
+ fauxdata_cli-0.1.0.dist-info/entry_points.txt,sha256=fplnEfv4hunVCf1dWhc7fBdcAP_10PdCAqCfxIOnh1o,47
15
+ fauxdata_cli-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ fauxdata = fauxdata.main:app