fauxdata-cli 0.1.1__tar.gz → 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fauxdata_cli-0.1.2/.coverage +0 -0
- fauxdata_cli-0.1.2/LICENSE +21 -0
- fauxdata_cli-0.1.2/LOG.md +34 -0
- {fauxdata_cli-0.1.1 → fauxdata_cli-0.1.2}/PKG-INFO +2 -2
- fauxdata_cli-0.1.2/docs/deployment.md +90 -0
- {fauxdata_cli-0.1.1 → fauxdata_cli-0.1.2}/docs/index.html +31 -5
- fauxdata_cli-0.1.2/docs/share.png +0 -0
- {fauxdata_cli-0.1.1 → fauxdata_cli-0.1.2}/pyproject.toml +11 -2
- fauxdata_cli-0.1.2/share.png +0 -0
- {fauxdata_cli-0.1.1 → fauxdata_cli-0.1.2}/src/fauxdata/__init__.py +1 -1
- {fauxdata_cli-0.1.1 → fauxdata_cli-0.1.2}/src/fauxdata/commands/generate.py +12 -2
- {fauxdata_cli-0.1.1 → fauxdata_cli-0.1.2}/src/fauxdata/generator.py +14 -6
- {fauxdata_cli-0.1.1 → fauxdata_cli-0.1.2}/src/fauxdata/main.py +15 -1
- {fauxdata_cli-0.1.1 → fauxdata_cli-0.1.2}/src/fauxdata/schema.py +8 -0
- fauxdata_cli-0.1.2/tests/__init__.py +0 -0
- fauxdata_cli-0.1.2/tests/conftest.py +39 -0
- fauxdata_cli-0.1.2/tests/test_cli.py +112 -0
- fauxdata_cli-0.1.2/tests/test_generator.py +102 -0
- fauxdata_cli-0.1.2/tests/test_new_fields.py +167 -0
- fauxdata_cli-0.1.2/tests/test_output.py +107 -0
- fauxdata_cli-0.1.2/tests/test_schema.py +147 -0
- fauxdata_cli-0.1.2/tests/test_validator.py +123 -0
- {fauxdata_cli-0.1.1 → fauxdata_cli-0.1.2}/uv.lock +220 -25
- fauxdata_cli-0.1.1/LOG.md +0 -10
- {fauxdata_cli-0.1.1 → fauxdata_cli-0.1.2}/.claude/settings.local.json +0 -0
- {fauxdata_cli-0.1.1 → fauxdata_cli-0.1.2}/.gitignore +0 -0
- {fauxdata_cli-0.1.1 → fauxdata_cli-0.1.2}/.python-version +0 -0
- {fauxdata_cli-0.1.1 → fauxdata_cli-0.1.2}/README.md +0 -0
- {fauxdata_cli-0.1.1 → fauxdata_cli-0.1.2}/schemas/events.yml +0 -0
- {fauxdata_cli-0.1.1 → fauxdata_cli-0.1.2}/schemas/orders.yml +0 -0
- {fauxdata_cli-0.1.1 → fauxdata_cli-0.1.2}/schemas/people.yml +0 -0
- {fauxdata_cli-0.1.1 → fauxdata_cli-0.1.2}/src/fauxdata/commands/__init__.py +0 -0
- {fauxdata_cli-0.1.1 → fauxdata_cli-0.1.2}/src/fauxdata/commands/init.py +0 -0
- {fauxdata_cli-0.1.1 → fauxdata_cli-0.1.2}/src/fauxdata/commands/preview.py +0 -0
- {fauxdata_cli-0.1.1 → fauxdata_cli-0.1.2}/src/fauxdata/commands/validate.py +0 -0
- {fauxdata_cli-0.1.1 → fauxdata_cli-0.1.2}/src/fauxdata/output.py +0 -0
- {fauxdata_cli-0.1.1 → fauxdata_cli-0.1.2}/src/fauxdata/validator.py +0 -0
|
Binary file
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Andrea Borruso <aborruso@gmail.com>
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# Log
|
|
2
|
+
|
|
3
|
+
## 2026-03-06 — v0.1.2
|
|
4
|
+
|
|
5
|
+
- Bump to 0.1.2 and publish to PyPI
|
|
6
|
+
|
|
7
|
+
## 2026-03-06 (feature)
|
|
8
|
+
|
|
9
|
+
- `--version` / `-V` flag nel CLI (`fauxdata --version` → `fauxdata 0.1.1`)
|
|
10
|
+
- Coverage threshold 80% in pytest config (`--cov-fail-under=80`); attuale: 83.76%
|
|
11
|
+
- Nuovo campo `pattern` in ColumnSchema: genera stringhe che matchano un regex via pointblank
|
|
12
|
+
- Nuovo campo `null_probability` in ColumnSchema: controllo granulare dei null (0.0–1.0), con validazione in parsing
|
|
13
|
+
- Rimossa dipendenza `faker` (non usata, pointblank gestisce tutto)
|
|
14
|
+
- Fix generator: `null_probability=None` non passato a pointblank (causa TypeError)
|
|
15
|
+
- Test aggiornati: 79/79 pass
|
|
16
|
+
|
|
17
|
+
## 2026-03-06 (tests)
|
|
18
|
+
|
|
19
|
+
- Add pytest test suite: 65 tests, 100% pass, 0.44s
|
|
20
|
+
- `tests/test_schema.py`: unit tests for YAML schema parsing (valid/invalid cases)
|
|
21
|
+
- `tests/test_output.py`: unit tests for export functions (all formats, stdout, errors)
|
|
22
|
+
- `tests/test_generator.py`: integration tests for generation (types, seed, unique, presets)
|
|
23
|
+
- `tests/test_validator.py`: integration tests for validation rules (pass/fail scenarios)
|
|
24
|
+
- `tests/test_cli.py`: CLI smoke tests via `typer.testing.CliRunner`
|
|
25
|
+
- Add `[dependency-groups] dev` in `pyproject.toml` (pytest, pytest-cov); config via `[tool.pytest.ini_options]`
|
|
26
|
+
|
|
27
|
+
## 2026-03-06
|
|
28
|
+
|
|
29
|
+
- Initial implementation of `fauxdata` CLI
|
|
30
|
+
- Stack: pointblank 0.22 (native generation + validation), polars, typer, rich, pyfiglet, questionary
|
|
31
|
+
- Commands: `init`, `generate`, `validate`, `preview`
|
|
32
|
+
- Example schemas: `people.yml`, `orders.yml`, `events.yml`
|
|
33
|
+
- All schemas generate and validate cleanly (all rules PASS)
|
|
34
|
+
- `locale` field at schema level maps to pointblank `country=` param
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: fauxdata-cli
|
|
3
|
-
Version: 0.1.1
|
|
3
|
+
Version: 0.1.2
|
|
4
4
|
Summary: CLI for generating and validating fake datasets
|
|
5
5
|
Project-URL: Homepage, https://aborruso.github.io/fauxdata/
|
|
6
6
|
Project-URL: Repository, https://github.com/aborruso/fauxdata
|
|
7
7
|
Project-URL: Bug Tracker, https://github.com/aborruso/fauxdata/issues
|
|
8
|
+
License-File: LICENSE
|
|
8
9
|
Requires-Python: >=3.11
|
|
9
|
-
Requires-Dist: faker>=26.0
|
|
10
10
|
Requires-Dist: pointblank>=0.22
|
|
11
11
|
Requires-Dist: polars>=1.0
|
|
12
12
|
Requires-Dist: pyfiglet>=1.0
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
# Deployment rules
|
|
2
|
+
|
|
3
|
+
## Pre-release checklist (always)
|
|
4
|
+
|
|
5
|
+
1. Run tests locally — must all pass:
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
uv run pytest
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
Coverage must stay above 80%. If it drops, fix before proceeding.
|
|
12
|
+
|
|
13
|
+
2. Bump version in **both**:
|
|
14
|
+
- `src/fauxdata/__init__.py` → `__version__ = "X.Y.Z"`
|
|
15
|
+
- `pyproject.toml` → `version = "X.Y.Z"`
|
|
16
|
+
|
|
17
|
+
3. Update `LOG.md` with a summary of changes under a new date heading.
|
|
18
|
+
|
|
19
|
+
---
|
|
20
|
+
|
|
21
|
+
## GitHub release (tag + release notes)
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
# Create and push annotated tag
|
|
25
|
+
git tag -a vX.Y.Z -m "vX.Y.Z"
|
|
26
|
+
git push origin vX.Y.Z
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
Then create a GitHub release via `gh`:
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
gh release create vX.Y.Z \
|
|
33
|
+
--title "vX.Y.Z" \
|
|
34
|
+
--notes "$(cat <<'EOF'
|
|
35
|
+
## What's new
|
|
36
|
+
|
|
37
|
+
- Short bullet list of user-facing changes
|
|
38
|
+
- Include new fields, commands, bug fixes
|
|
39
|
+
|
|
40
|
+
## Breaking changes
|
|
41
|
+
|
|
42
|
+
- List any breaking changes here (or remove section if none)
|
|
43
|
+
|
|
44
|
+
## Installation
|
|
45
|
+
|
|
46
|
+
\`\`\`bash
|
|
47
|
+
pip install fauxdata-cli==X.Y.Z
|
|
48
|
+
\`\`\`
|
|
49
|
+
|
|
50
|
+
Full changelog: https://github.com/aborruso/fauxdata/commits/vX.Y.Z
|
|
51
|
+
EOF
|
|
52
|
+
)"
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
Release notes style: **concise, nerd-friendly, technical**. List the actual changes with enough detail that a developer understands what changed and why.
|
|
56
|
+
|
|
57
|
+
---
|
|
58
|
+
|
|
59
|
+
## PyPI publish (via twine)
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
# Build
|
|
63
|
+
uv build
|
|
64
|
+
|
|
65
|
+
# Check the dist
|
|
66
|
+
twine check dist/*
|
|
67
|
+
|
|
68
|
+
# Publish
|
|
69
|
+
twine upload dist/*
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
Requires `~/.pypirc` configured with PyPI token, or set `TWINE_USERNAME`/`TWINE_PASSWORD` env vars.
|
|
73
|
+
|
|
74
|
+
---
|
|
75
|
+
|
|
76
|
+
## Order of operations
|
|
77
|
+
|
|
78
|
+
```
|
|
79
|
+
uv run pytest # must pass 100%
|
|
80
|
+
bump version # __init__.py + pyproject.toml
|
|
81
|
+
update LOG.md
|
|
82
|
+
git commit + git push
|
|
83
|
+
git tag + git push tag
|
|
84
|
+
gh release create # with release notes
|
|
85
|
+
uv build
|
|
86
|
+
twine check dist/*
|
|
87
|
+
twine upload dist/*
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
Never publish to PyPI without a corresponding GitHub release.
|
|
@@ -5,8 +5,32 @@
|
|
|
5
5
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
6
6
|
<title>fauxdata — fake data, done right</title>
|
|
7
7
|
<meta name="description" content="fauxdata is a CLI tool for generating and validating realistic fake datasets from YAML schemas. Locale-aware, pipeline-friendly, powered by pointblank.">
|
|
8
|
+
<meta name="keywords" content="fake data, synthetic data, dataset generator, CLI, YAML schema, pointblank, data testing, fake dataset, CSV generator, Parquet">
|
|
9
|
+
<meta name="author" content="Andrea Borruso">
|
|
10
|
+
<meta name="robots" content="index, follow">
|
|
11
|
+
<link rel="canonical" href="https://aborruso.github.io/fauxdata/">
|
|
12
|
+
|
|
13
|
+
<!-- Open Graph -->
|
|
14
|
+
<meta property="og:type" content="website">
|
|
15
|
+
<meta property="og:url" content="https://aborruso.github.io/fauxdata/">
|
|
8
16
|
<meta property="og:title" content="fauxdata — fake data, done right">
|
|
9
17
|
<meta property="og:description" content="Generate and validate realistic fake datasets from YAML schemas. Because fake data can actually be better than real data.">
|
|
18
|
+
<meta property="og:image" content="https://aborruso.github.io/fauxdata/share.png">
|
|
19
|
+
<meta property="og:image:width" content="1200">
|
|
20
|
+
<meta property="og:image:height" content="630">
|
|
21
|
+
<meta property="og:image:alt" content="fauxdata — CLI tool for generating realistic fake datasets">
|
|
22
|
+
<meta property="og:site_name" content="fauxdata">
|
|
23
|
+
<meta property="og:locale" content="en_US">
|
|
24
|
+
|
|
25
|
+
<!-- Twitter Card -->
|
|
26
|
+
<meta name="twitter:card" content="summary_large_image">
|
|
27
|
+
<meta name="twitter:url" content="https://aborruso.github.io/fauxdata/">
|
|
28
|
+
<meta name="twitter:title" content="fauxdata — fake data, done right">
|
|
29
|
+
<meta name="twitter:description" content="Generate and validate realistic fake datasets from YAML schemas. Because fake data can actually be better than real data.">
|
|
30
|
+
<meta name="twitter:image" content="https://aborruso.github.io/fauxdata/share.png">
|
|
31
|
+
<meta name="twitter:image:alt" content="fauxdata — CLI tool for generating realistic fake datasets">
|
|
32
|
+
<meta name="twitter:creator" content="@aborruso">
|
|
33
|
+
|
|
10
34
|
<link rel="preconnect" href="https://fonts.googleapis.com">
|
|
11
35
|
<link href="https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:ital,wght@0,300;0,400;0,600;0,700;1,400&family=VT323&display=swap" rel="stylesheet">
|
|
12
36
|
<style>
|
|
@@ -876,11 +900,13 @@
|
|
|
876
900
|
<span class="t-line dim" style="padding-left:2rem">| duckdb -c "SELECT status, COUNT(*) FROM '/dev/stdin' GROUP BY ALL"</span>
|
|
877
901
|
<br>
|
|
878
902
|
<span class="t-line t-out">┌───────────┬──────────┐</span>
|
|
879
|
-
<span class="t-line t-out">│
|
|
880
|
-
<span class="t-line t-out">│
|
|
881
|
-
<span class="t-line t-out"
|
|
882
|
-
<span class="t-line t-out">│
|
|
883
|
-
<span class="t-line t-out">│
|
|
903
|
+
<span class="t-line t-out">│ status │ count(*) │</span>
|
|
904
|
+
<span class="t-line t-out">│ varchar │ int64 │</span>
|
|
905
|
+
<span class="t-line t-out">├───────────┼──────────┤</span>
|
|
906
|
+
<span class="t-line t-out">│ delivered │ 3124 │</span>
|
|
907
|
+
<span class="t-line t-out">│ shipped │ 2891 │</span>
|
|
908
|
+
<span class="t-line t-out">│ pending │ 2003 │</span>
|
|
909
|
+
<span class="t-line t-out">│ cancelled │ 1982 │</span>
|
|
884
910
|
<span class="t-line t-out">└───────────┴──────────┘</span>
|
|
885
911
|
</div>
|
|
886
912
|
</div>
|
|
Binary file
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "fauxdata-cli"
|
|
3
|
-
version = "0.1.1"
|
|
3
|
+
version = "0.1.2"
|
|
4
4
|
description = "CLI for generating and validating fake datasets"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.11"
|
|
@@ -12,7 +12,6 @@ dependencies = [
|
|
|
12
12
|
"questionary>=2.0",
|
|
13
13
|
"polars>=1.0",
|
|
14
14
|
"pyyaml>=6.0",
|
|
15
|
-
"faker>=26.0",
|
|
16
15
|
]
|
|
17
16
|
|
|
18
17
|
[project.urls]
|
|
@@ -23,6 +22,16 @@ Repository = "https://github.com/aborruso/fauxdata"
|
|
|
23
22
|
[project.scripts]
|
|
24
23
|
fauxdata = "fauxdata.main:app"
|
|
25
24
|
|
|
25
|
+
[dependency-groups]
|
|
26
|
+
dev = [
|
|
27
|
+
"pytest>=8.0",
|
|
28
|
+
"pytest-cov>=5.0",
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
[tool.pytest.ini_options]
|
|
32
|
+
testpaths = ["tests"]
|
|
33
|
+
addopts = "--tb=short --cov=fauxdata --cov-report=term-missing --cov-fail-under=80"
|
|
34
|
+
|
|
26
35
|
[build-system]
|
|
27
36
|
requires = ["hatchling"]
|
|
28
37
|
build-backend = "hatchling.build"
|
|
Binary file
|
|
@@ -61,13 +61,22 @@ def _print_schema_table(schema, n: int, seed):
|
|
|
61
61
|
t = Table(title=f"Schema: {schema.name}", show_header=True, header_style="bold magenta")
|
|
62
62
|
t.add_column("Column", style="cyan")
|
|
63
63
|
t.add_column("Type")
|
|
64
|
-
t.add_column("Preset/Values")
|
|
64
|
+
t.add_column("Preset/Pattern/Values")
|
|
65
65
|
t.add_column("Min")
|
|
66
66
|
t.add_column("Max")
|
|
67
67
|
t.add_column("Unique")
|
|
68
|
+
t.add_column("Null%")
|
|
68
69
|
|
|
69
70
|
for col in schema.columns:
|
|
70
|
-
|
|
71
|
+
if col.pattern:
|
|
72
|
+
preset_val = f"pattern:{col.pattern}"
|
|
73
|
+
elif col.preset:
|
|
74
|
+
preset_val = col.preset
|
|
75
|
+
elif col.values:
|
|
76
|
+
preset_val = str(col.values)
|
|
77
|
+
else:
|
|
78
|
+
preset_val = "-"
|
|
79
|
+
null_pct = f"{int(col.null_probability * 100)}%" if col.null_probability else "-"
|
|
71
80
|
t.add_row(
|
|
72
81
|
col.name,
|
|
73
82
|
col.col_type,
|
|
@@ -75,6 +84,7 @@ def _print_schema_table(schema, n: int, seed):
|
|
|
75
84
|
str(col.min) if col.min is not None else "-",
|
|
76
85
|
str(col.max) if col.max is not None else "-",
|
|
77
86
|
"yes" if col.unique else "no",
|
|
87
|
+
null_pct,
|
|
78
88
|
)
|
|
79
89
|
|
|
80
90
|
console.print(t)
|
|
@@ -30,8 +30,10 @@ def _build_pb_schema(schema: SchemaConfig) -> pb.Schema:
|
|
|
30
30
|
|
|
31
31
|
def _col_to_field(col: ColumnSchema):
|
|
32
32
|
"""Convert a ColumnSchema to a pointblank field spec."""
|
|
33
|
-
nullable = col.nullable
|
|
33
|
+
nullable = col.nullable or (col.null_probability is not None and col.null_probability > 0)
|
|
34
34
|
unique = col.unique
|
|
35
|
+
# Build optional kwargs only when null_probability is explicitly set
|
|
36
|
+
np_kwargs = {"null_probability": col.null_probability} if col.null_probability is not None else {}
|
|
35
37
|
|
|
36
38
|
if col.col_type == "int":
|
|
37
39
|
return pb.int_field(
|
|
@@ -39,6 +41,7 @@ def _col_to_field(col: ColumnSchema):
|
|
|
39
41
|
max_val=int(col.max) if col.max is not None else None,
|
|
40
42
|
nullable=nullable,
|
|
41
43
|
unique=unique,
|
|
44
|
+
**np_kwargs,
|
|
42
45
|
)
|
|
43
46
|
|
|
44
47
|
elif col.col_type == "float":
|
|
@@ -47,10 +50,11 @@ def _col_to_field(col: ColumnSchema):
|
|
|
47
50
|
max_val=float(col.max) if col.max is not None else None,
|
|
48
51
|
nullable=nullable,
|
|
49
52
|
unique=unique,
|
|
53
|
+
**np_kwargs,
|
|
50
54
|
)
|
|
51
55
|
|
|
52
56
|
elif col.col_type == "bool":
|
|
53
|
-
return pb.bool_field(nullable=nullable)
|
|
57
|
+
return pb.bool_field(nullable=nullable, **np_kwargs)
|
|
54
58
|
|
|
55
59
|
elif col.col_type == "date":
|
|
56
60
|
return pb.date_field(
|
|
@@ -58,6 +62,7 @@ def _col_to_field(col: ColumnSchema):
|
|
|
58
62
|
max_date=str(col.max) if col.max is not None else None,
|
|
59
63
|
nullable=nullable,
|
|
60
64
|
unique=unique,
|
|
65
|
+
**np_kwargs,
|
|
61
66
|
)
|
|
62
67
|
|
|
63
68
|
elif col.col_type == "datetime":
|
|
@@ -66,15 +71,18 @@ def _col_to_field(col: ColumnSchema):
|
|
|
66
71
|
max_date=str(col.max) if col.max is not None else None,
|
|
67
72
|
nullable=nullable,
|
|
68
73
|
unique=unique,
|
|
74
|
+
**np_kwargs,
|
|
69
75
|
)
|
|
70
76
|
|
|
71
77
|
elif col.col_type == "string":
|
|
72
78
|
if col.values:
|
|
73
|
-
return pb.string_field(allowed=col.values, nullable=nullable)
|
|
79
|
+
return pb.string_field(allowed=col.values, nullable=nullable, **np_kwargs)
|
|
80
|
+
elif col.pattern:
|
|
81
|
+
return pb.string_field(pattern=col.pattern, nullable=nullable, unique=unique, **np_kwargs)
|
|
74
82
|
elif col.preset:
|
|
75
|
-
return pb.string_field(preset=col.preset, nullable=nullable, unique=unique)
|
|
83
|
+
return pb.string_field(preset=col.preset, nullable=nullable, unique=unique, **np_kwargs)
|
|
76
84
|
else:
|
|
77
|
-
return pb.string_field(nullable=nullable, unique=unique)
|
|
85
|
+
return pb.string_field(nullable=nullable, unique=unique, **np_kwargs)
|
|
78
86
|
|
|
79
87
|
else:
|
|
80
|
-
return pb.string_field(nullable=nullable)
|
|
88
|
+
return pb.string_field(nullable=nullable, **np_kwargs)
|
|
@@ -9,6 +9,8 @@ import typer
|
|
|
9
9
|
from rich import print as rprint
|
|
10
10
|
from rich.console import Console
|
|
11
11
|
|
|
12
|
+
from fauxdata import __version__
|
|
13
|
+
|
|
12
14
|
app = typer.Typer(
|
|
13
15
|
name="fauxdata",
|
|
14
16
|
help="Generate and validate fake datasets from YAML schemas.",
|
|
@@ -23,8 +25,20 @@ def _banner():
|
|
|
23
25
|
rprint("[dim]Generate and validate realistic fake datasets[/dim]\n")
|
|
24
26
|
|
|
25
27
|
|
|
28
|
+
def _version_callback(value: bool):
|
|
29
|
+
if value:
|
|
30
|
+
rprint(f"fauxdata {__version__}")
|
|
31
|
+
raise typer.Exit()
|
|
32
|
+
|
|
33
|
+
|
|
26
34
|
@app.callback(invoke_without_command=True)
|
|
27
|
-
def main(ctx: typer.Context):
|
|
35
|
+
def main(
|
|
36
|
+
ctx: typer.Context,
|
|
37
|
+
version: Optional[bool] = typer.Option(
|
|
38
|
+
None, "--version", "-V", callback=_version_callback, is_eager=True,
|
|
39
|
+
help="Show version and exit.",
|
|
40
|
+
),
|
|
41
|
+
):
|
|
28
42
|
if ctx.invoked_subcommand is None:
|
|
29
43
|
_banner()
|
|
30
44
|
rprint(ctx.get_help())
|
|
@@ -57,6 +57,8 @@ class ColumnSchema:
|
|
|
57
57
|
locale: str | None = None
|
|
58
58
|
precision: int | None = None
|
|
59
59
|
values: list | None = None # for in_set
|
|
60
|
+
pattern: str | None = None # regex pattern for string generation
|
|
61
|
+
null_probability: float | None = None # e.g. 0.1 = 10% nulls
|
|
60
62
|
|
|
61
63
|
|
|
62
64
|
@dataclass
|
|
@@ -142,6 +144,10 @@ def _parse_column(name: str, data: dict) -> ColumnSchema:
|
|
|
142
144
|
if preset and preset not in STRING_PRESETS:
|
|
143
145
|
raise ValueError(f"Column '{name}': unknown preset '{preset}'. Valid: {STRING_PRESETS}")
|
|
144
146
|
|
|
147
|
+
null_probability = data.get("null_probability", None)
|
|
148
|
+
if null_probability is not None and not (0.0 <= float(null_probability) <= 1.0):
|
|
149
|
+
raise ValueError(f"Column '{name}': null_probability must be between 0.0 and 1.0")
|
|
150
|
+
|
|
145
151
|
return ColumnSchema(
|
|
146
152
|
name=name,
|
|
147
153
|
col_type=col_type,
|
|
@@ -153,6 +159,8 @@ def _parse_column(name: str, data: dict) -> ColumnSchema:
|
|
|
153
159
|
locale=data.get("locale", None),
|
|
154
160
|
precision=data.get("precision", None),
|
|
155
161
|
values=data.get("values", None),
|
|
162
|
+
pattern=data.get("pattern", None),
|
|
163
|
+
null_probability=float(null_probability) if null_probability is not None else None,
|
|
156
164
|
)
|
|
157
165
|
|
|
158
166
|
|
|
File without changes
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Shared fixtures for fauxdata tests."""
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
import polars as pl
|
|
5
|
+
|
|
6
|
+
from fauxdata.schema import SchemaConfig, ColumnSchema, ValidationRule
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@pytest.fixture
|
|
10
|
+
def minimal_schema():
|
|
11
|
+
"""A minimal SchemaConfig with one int and one string column."""
|
|
12
|
+
return SchemaConfig(
|
|
13
|
+
name="test",
|
|
14
|
+
rows=10,
|
|
15
|
+
seed=42,
|
|
16
|
+
locale="US",
|
|
17
|
+
output_format="csv",
|
|
18
|
+
columns=[
|
|
19
|
+
ColumnSchema(name="id", col_type="int", min=1, max=100, unique=True),
|
|
20
|
+
ColumnSchema(name="name", col_type="string", preset="name"),
|
|
21
|
+
],
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@pytest.fixture
|
|
26
|
+
def simple_df():
|
|
27
|
+
"""A small deterministic DataFrame for validation tests."""
|
|
28
|
+
return pl.DataFrame({
|
|
29
|
+
"id": [1, 2, 3],
|
|
30
|
+
"age": [25, 40, 55],
|
|
31
|
+
"email": ["a@b.com", "c@d.com", "e@f.com"],
|
|
32
|
+
})
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@pytest.fixture
|
|
36
|
+
def people_schema_path():
|
|
37
|
+
"""Path to the existing people.yml schema."""
|
|
38
|
+
from pathlib import Path
|
|
39
|
+
return str(Path(__file__).parent.parent / "schemas" / "people.yml")
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
"""Smoke tests for the fauxdata CLI using typer's CliRunner."""
|
|
2
|
+
|
|
3
|
+
import textwrap
|
|
4
|
+
import pytest
|
|
5
|
+
from typer.testing import CliRunner
|
|
6
|
+
|
|
7
|
+
from fauxdata.main import app
|
|
8
|
+
|
|
9
|
+
runner = CliRunner()
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def test_cli_no_args():
|
|
13
|
+
"""Running fauxdata with no args should show help."""
|
|
14
|
+
result = runner.invoke(app, [])
|
|
15
|
+
assert result.exit_code == 0
|
|
16
|
+
assert "fauxdata" in result.output.lower() or "generate" in result.output.lower()
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def test_cli_help():
|
|
20
|
+
result = runner.invoke(app, ["--help"])
|
|
21
|
+
assert result.exit_code == 0
|
|
22
|
+
assert "generate" in result.output
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def test_cli_generate_help():
|
|
26
|
+
result = runner.invoke(app, ["generate", "--help"])
|
|
27
|
+
assert result.exit_code == 0
|
|
28
|
+
assert "--rows" in result.output
|
|
29
|
+
assert "--format" in result.output
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def test_cli_generate_csv(tmp_path, people_schema_path):
|
|
33
|
+
out = tmp_path / "out.csv"
|
|
34
|
+
result = runner.invoke(app, ["generate", people_schema_path, "--rows", "5",
|
|
35
|
+
"--out", str(out), "--format", "csv", "--seed", "1"])
|
|
36
|
+
assert result.exit_code == 0, result.output
|
|
37
|
+
assert out.exists()
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def test_cli_generate_json(tmp_path, people_schema_path):
|
|
41
|
+
out = tmp_path / "out.json"
|
|
42
|
+
result = runner.invoke(app, ["generate", people_schema_path, "--rows", "5",
|
|
43
|
+
"--out", str(out), "--format", "json", "--seed", "1"])
|
|
44
|
+
assert result.exit_code == 0, result.output
|
|
45
|
+
assert out.exists()
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def test_cli_generate_parquet(tmp_path, people_schema_path):
|
|
49
|
+
out = tmp_path / "out.parquet"
|
|
50
|
+
result = runner.invoke(app, ["generate", people_schema_path, "--rows", "5",
|
|
51
|
+
"--out", str(out), "--format", "parquet", "--seed", "1"])
|
|
52
|
+
assert result.exit_code == 0, result.output
|
|
53
|
+
assert out.exists()
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def test_cli_generate_stdout(people_schema_path, capsys):
|
|
57
|
+
result = runner.invoke(app, ["generate", people_schema_path, "--rows", "3",
|
|
58
|
+
"--out", "-", "--format", "csv", "--seed", "1"])
|
|
59
|
+
assert result.exit_code == 0, result.output
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def test_cli_generate_with_validate(tmp_path, people_schema_path):
|
|
63
|
+
out = tmp_path / "out.csv"
|
|
64
|
+
result = runner.invoke(app, ["generate", people_schema_path, "--rows", "10",
|
|
65
|
+
"--out", str(out), "--format", "csv",
|
|
66
|
+
"--seed", "42", "--validate"])
|
|
67
|
+
assert result.exit_code == 0, result.output
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def test_cli_generate_missing_schema(tmp_path):
|
|
71
|
+
result = runner.invoke(app, ["generate", "/nonexistent/schema.yml"])
|
|
72
|
+
assert result.exit_code != 0
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def test_cli_validate(tmp_path, people_schema_path):
|
|
76
|
+
"""Generate a file then validate it."""
|
|
77
|
+
out = tmp_path / "people.csv"
|
|
78
|
+
runner.invoke(app, ["generate", people_schema_path, "--rows", "10",
|
|
79
|
+
"--out", str(out), "--format", "csv", "--seed", "42"])
|
|
80
|
+
result = runner.invoke(app, ["validate", str(out), people_schema_path])
|
|
81
|
+
assert result.exit_code == 0, result.output
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def test_cli_preview(tmp_path, people_schema_path):
|
|
85
|
+
out = tmp_path / "people.csv"
|
|
86
|
+
runner.invoke(app, ["generate", people_schema_path, "--rows", "20",
|
|
87
|
+
"--out", str(out), "--format", "csv", "--seed", "42"])
|
|
88
|
+
result = runner.invoke(app, ["preview", str(out), "--rows", "5"])
|
|
89
|
+
assert result.exit_code == 0, result.output
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def test_cli_generate_inline_schema(tmp_path):
|
|
93
|
+
"""Test with a minimal inline schema written to a tmp file."""
|
|
94
|
+
schema_yaml = textwrap.dedent("""\
|
|
95
|
+
name: mini
|
|
96
|
+
rows: 5
|
|
97
|
+
columns:
|
|
98
|
+
id:
|
|
99
|
+
type: int
|
|
100
|
+
min: 1
|
|
101
|
+
max: 100
|
|
102
|
+
label:
|
|
103
|
+
type: string
|
|
104
|
+
values: ["a", "b"]
|
|
105
|
+
""")
|
|
106
|
+
schema_path = tmp_path / "mini.yml"
|
|
107
|
+
schema_path.write_text(schema_yaml)
|
|
108
|
+
out = tmp_path / "mini.csv"
|
|
109
|
+
result = runner.invoke(app, ["generate", str(schema_path),
|
|
110
|
+
"--out", str(out), "--format", "csv", "--seed", "1"])
|
|
111
|
+
assert result.exit_code == 0, result.output
|
|
112
|
+
assert out.exists()
|