fauxdata-cli 0.1.0__tar.gz → 0.1.1__tar.gz

This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. fauxdata_cli-0.1.1/PKG-INFO +309 -0
  2. {fauxdata_cli-0.1.0 → fauxdata_cli-0.1.1}/README.md +8 -8
  3. {fauxdata_cli-0.1.0 → fauxdata_cli-0.1.1}/docs/index.html +4 -14
  4. {fauxdata_cli-0.1.0 → fauxdata_cli-0.1.1}/pyproject.toml +7 -1
  5. fauxdata_cli-0.1.0/PKG-INFO +0 -13
  6. {fauxdata_cli-0.1.0 → fauxdata_cli-0.1.1}/.claude/settings.local.json +0 -0
  7. {fauxdata_cli-0.1.0 → fauxdata_cli-0.1.1}/.gitignore +0 -0
  8. {fauxdata_cli-0.1.0 → fauxdata_cli-0.1.1}/.python-version +0 -0
  9. {fauxdata_cli-0.1.0 → fauxdata_cli-0.1.1}/LOG.md +0 -0
  10. {fauxdata_cli-0.1.0 → fauxdata_cli-0.1.1}/schemas/events.yml +0 -0
  11. {fauxdata_cli-0.1.0 → fauxdata_cli-0.1.1}/schemas/orders.yml +0 -0
  12. {fauxdata_cli-0.1.0 → fauxdata_cli-0.1.1}/schemas/people.yml +0 -0
  13. {fauxdata_cli-0.1.0 → fauxdata_cli-0.1.1}/src/fauxdata/__init__.py +0 -0
  14. {fauxdata_cli-0.1.0 → fauxdata_cli-0.1.1}/src/fauxdata/commands/__init__.py +0 -0
  15. {fauxdata_cli-0.1.0 → fauxdata_cli-0.1.1}/src/fauxdata/commands/generate.py +0 -0
  16. {fauxdata_cli-0.1.0 → fauxdata_cli-0.1.1}/src/fauxdata/commands/init.py +0 -0
  17. {fauxdata_cli-0.1.0 → fauxdata_cli-0.1.1}/src/fauxdata/commands/preview.py +0 -0
  18. {fauxdata_cli-0.1.0 → fauxdata_cli-0.1.1}/src/fauxdata/commands/validate.py +0 -0
  19. {fauxdata_cli-0.1.0 → fauxdata_cli-0.1.1}/src/fauxdata/generator.py +0 -0
  20. {fauxdata_cli-0.1.0 → fauxdata_cli-0.1.1}/src/fauxdata/main.py +0 -0
  21. {fauxdata_cli-0.1.0 → fauxdata_cli-0.1.1}/src/fauxdata/output.py +0 -0
  22. {fauxdata_cli-0.1.0 → fauxdata_cli-0.1.1}/src/fauxdata/schema.py +0 -0
  23. {fauxdata_cli-0.1.0 → fauxdata_cli-0.1.1}/src/fauxdata/validator.py +0 -0
  24. {fauxdata_cli-0.1.0 → fauxdata_cli-0.1.1}/uv.lock +0 -0
@@ -0,0 +1,309 @@
1
+ Metadata-Version: 2.4
2
+ Name: fauxdata-cli
3
+ Version: 0.1.1
4
+ Summary: CLI for generating and validating fake datasets
5
+ Project-URL: Homepage, https://aborruso.github.io/fauxdata/
6
+ Project-URL: Repository, https://github.com/aborruso/fauxdata
7
+ Project-URL: Bug Tracker, https://github.com/aborruso/fauxdata/issues
8
+ Requires-Python: >=3.11
9
+ Requires-Dist: faker>=26.0
10
+ Requires-Dist: pointblank>=0.22
11
+ Requires-Dist: polars>=1.0
12
+ Requires-Dist: pyfiglet>=1.0
13
+ Requires-Dist: pyyaml>=6.0
14
+ Requires-Dist: questionary>=2.0
15
+ Requires-Dist: rich>=13
16
+ Requires-Dist: typer>=0.12
17
+ Description-Content-Type: text/markdown
18
+
19
+ [![PyPI version](https://img.shields.io/pypi/v/fauxdata-cli.svg?label=PyPI%20version)](https://pypi.org/project/fauxdata-cli/)
20
+ [![Python Versions](https://img.shields.io/pypi/pyversions/fauxdata-cli.svg)](https://pypi.org/project/fauxdata-cli/)
21
+ [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/aborruso/fauxdata)
22
+
23
+ # fauxdata
24
+
25
+ **fauxdata** is a command-line tool for generating and validating realistic fake datasets from simple YAML schemas.
26
+
27
+ If you work with data — as an analyst, engineer, developer, or researcher — you constantly need test data: to prototype a pipeline, populate a demo dashboard, write unit tests, or show a colleague how a system should behave. Real data is often unavailable, sensitive, or too messy to share. fauxdata solves this by letting you describe your dataset structure once and generate as many rows as you need, on demand, with realistic values.
28
+
29
+ ---
30
+
31
+ ## Why fauxdata?
32
+
33
+ - **Schema-first**: define the shape of your data in a readable YAML file — column names, types, constraints, realistic presets
34
+ - **Locale-aware and coherent**: set `locale: IT` and get Italian names, cities, email domains, phone formats, IBANs — all consistent within each row. Set `locale: JP` and get Japanese names and addresses. The data is not just random strings: related fields are generated together so they make sense as a whole record
35
+ - **Validated by design**: the same schema that defines generation also drives validation; no surprises
36
+ - **Pipeline-friendly**: output to stdout with `--out -` for seamless piping and redirection
37
+ - **Multiple formats**: CSV, Parquet, JSON, JSONL / JSON Lines out of the box
38
+
39
+ ---
40
+
41
+ ## Install
42
+
43
+ ### With uv (recommended)
44
+
45
+ [uv](https://docs.astral.sh/uv/) installs fauxdata as an isolated tool available globally, without polluting any existing Python environment:
46
+
47
+ ```bash
48
+ uv tool install fauxdata-cli
49
+ ```
50
+
51
+ After installation, `fauxdata` is available from any directory.
52
+
53
+ To upgrade:
54
+
55
+ ```bash
56
+ uv tool upgrade fauxdata-cli
57
+ ```
58
+
59
+ ### With pip
60
+
61
+ ```bash
62
+ pip install fauxdata-cli
63
+ ```
64
+
65
+ ---
66
+
67
+ ## Quick start
68
+
69
+ ```bash
70
+ # Generate 500 rows from a schema, with validation
71
+ fauxdata generate schemas/people.yml --rows 500 --validate
72
+
73
+ # Stream to stdout and pipe to other tools
74
+ fauxdata generate schemas/people.yml --rows 1000 --out - | head -5
75
+
76
+ # Validate an existing file against a schema
77
+ fauxdata validate my_data.csv schemas/people.yml
78
+
79
+ # Preview a dataset with column statistics
80
+ fauxdata preview my_data.csv --rows 10
81
+
82
+ # Create a new schema interactively
83
+ fauxdata init --name orders
84
+ ```
85
+
86
+ ---
87
+
88
+ ## Schema format
89
+
90
+ A schema is a YAML file that describes the structure of your dataset. Here is a realistic example for a people dataset:
91
+
92
+ ```yaml
93
+ name: people
94
+ description: "People dataset with personal info"
95
+ rows: 1000
96
+ seed: 42
97
+ locale: IT # ISO country code — affects names, cities, emails, phone numbers, etc.
98
+
99
+ output:
100
+ format: csv # csv | parquet | json | jsonl | jsonlines
101
+ path: tmp/people.csv
102
+
103
+ columns:
104
+ id:
105
+ type: int
106
+ unique: true
107
+ min: 1
108
+ max: 99999
109
+
110
+ name:
111
+ type: string
112
+ preset: name # generates realistic full names for the given locale
113
+
114
+ email:
115
+ type: string
116
+ preset: email
117
+
118
+ age:
119
+ type: int
120
+ min: 18
121
+ max: 90
122
+
123
+ city:
124
+ type: string
125
+ preset: city
126
+
127
+ country_code:
128
+ type: string
129
+ preset: country_code_2 # ISO 3166-1 alpha-2, e.g. "IT"
130
+
131
+ active:
132
+ type: bool
133
+
134
+ signup_date:
135
+ type: date
136
+ min: "2020-01-01"
137
+ max: "2024-12-31"
138
+
139
+ score:
140
+ type: float
141
+ min: 0.0
142
+ max: 100.0
143
+
144
+ status:
145
+ type: string
146
+ values: [active, inactive, pending] # enum: pick from a fixed list
147
+
148
+ validation:
149
+ - rule: col_vals_not_null
150
+ columns: [id, name, email]
151
+ - rule: col_vals_between
152
+ column: age
153
+ min: 18
154
+ max: 90
155
+ - rule: col_vals_regex
156
+ column: email
157
+ pattern: "^[^@]+@[^@]+\\.[^@]+$"
158
+ - rule: rows_distinct
159
+ columns: [id]
160
+ ```
161
+
162
+ ### Column types
163
+
164
+ | Type | Description | Options |
165
+ |------|-------------|---------|
166
+ | `int` | Integer | `min`, `max`, `unique` |
167
+ | `float` | Floating point | `min`, `max` |
168
+ | `string` | Text | `preset`, `values`, `unique` |
169
+ | `bool` | Boolean | — |
170
+ | `date` | Date | `min`, `max` (ISO format) |
171
+ | `datetime` | Datetime | `min`, `max` (ISO format) |
172
+
173
+ ### String presets
174
+
175
+ Presets generate realistic, locale-aware values. Set `locale` at the schema level to control the country.
176
+
177
+ | Category | Presets |
178
+ |----------|---------|
179
+ | Personal | `name`, `name_full`, `first_name`, `last_name`, `email`, `phone_number` |
180
+ | Location | `address`, `city`, `state`, `country`, `country_code_2`, `country_code_3`, `postcode`, `latitude`, `longitude` |
181
+ | Business | `company`, `job`, `catch_phrase` |
182
+ | Internet | `url`, `domain_name`, `ipv4`, `ipv6`, `user_name`, `password` |
183
+ | Text | `text`, `sentence`, `paragraph`, `word` |
184
+ | Financial | `iban`, `currency_code`, `credit_card_number` |
185
+ | Identifiers | `uuid4`, `md5`, `sha1`, `ssn`, `license_plate` |
186
+
187
+ ### Locale-aware generation
188
+
189
+ Setting `locale` in the schema is more than a language switch — it makes the entire dataset culturally coherent.
190
+
191
+ With `locale: IT`:
192
+
193
+ ```
194
+ id name email city country_code
195
+ 83811 Giovanni Gentile giovanni.gentile@tin.it Bari IT
196
+ 14593 Bruno Mancini bruno.mancini16@virgilio.it Taranto IT
197
+ 3279 Giada Santini gsantini38@fastwebnet.it Milano IT
198
+ ```
199
+
200
+ With `locale: DE`:
201
+
202
+ ```
203
+ id name email city country_code
204
+ 12044 Hans Müller h.mueller@web.de Berlin DE
205
+ 57892 Lena Schmidt lena.schmidt@gmx.de München DE
206
+ ```
207
+
208
+ With `locale: JP`:
209
+
210
+ ```
211
+ id name email city country_code
212
+ 9341 Yuki Tanaka y.tanaka@docomo.ne.jp Tokyo JP
213
+ ```
214
+
215
+ The magic is that **related presets are generated together**: the email is derived from the name, the city belongs to the country, the phone number uses the right country prefix, and IBANs use the correct country code. A single `locale` field in your schema is all it takes.
216
+
217
+ Supported locales include: `US`, `IT`, `DE`, `FR`, `ES`, `JP`, `BR`, `PL`, `NL`, `SE`, `DK`, `TR`, `RU`, `CN`, `KR`, and [many more](https://github.com/posit-dev/pointblank).
218
+
219
+ ### Validation rules
220
+
221
+ | Rule | Description | Parameters |
222
+ |------|-------------|------------|
223
+ | `col_vals_not_null` | No nulls | `columns` |
224
+ | `col_vals_between` | Value in range | `column`, `min`, `max` |
225
+ | `col_vals_regex` | Matches pattern | `column`, `pattern` |
226
+ | `col_vals_in_set` | Value in allowed set | `column`, `values` |
227
+ | `col_vals_gt` / `col_vals_lt` | Greater than / less than | `column`, `min` / `max` |
228
+ | `col_vals_ge` / `col_vals_le` | Greater than or equal / less than or equal | `column`, `min` / `max` |
229
+ | `rows_distinct` | Unique rows | `columns` |
230
+ | `col_exists` | Column present | `columns` |
231
+
232
+ ---
233
+
234
+ ## Commands
235
+
236
+ ### `fauxdata generate SCHEMA`
237
+
238
+ ```
239
+ fauxdata generate schemas/people.yml
240
+ fauxdata generate schemas/people.yml --rows 500 --seed 42 --validate
241
+ fauxdata generate schemas/people.yml --format parquet --out tmp/people.parquet
242
+ fauxdata generate schemas/people.yml --rows 1000 --out - # stdout
243
+ fauxdata generate schemas/people.yml --out - --format jsonl | wc -l
244
+ ```
245
+
246
+ | Option | Short | Default | Description |
247
+ |--------|-------|---------|-------------|
248
+ | `--rows` | `-r` | from schema | Number of rows to generate |
249
+ | `--out` | `-o` | from schema | Output path — use `-` for stdout |
250
+ | `--format` | `-f` | from schema | Output format: `csv`, `parquet`, `json`, `jsonl`, `jsonlines` |
251
+ | `--seed` | `-s` | from schema | Random seed for reproducibility |
252
+ | `--validate` | `-v` | off | Run validation rules after generating |
253
+
254
+ When `--out -` is used, all output messages are suppressed and only data is written to stdout.
255
+
256
+ ### `fauxdata validate DATASET SCHEMA`
257
+
258
+ ```
259
+ fauxdata validate tmp/people.csv schemas/people.yml
260
+ ```
261
+
262
+ Validates an existing file against a schema. Exits with code `1` if any rule fails — useful in CI pipelines.
263
+
264
+ ### `fauxdata preview DATASET`
265
+
266
+ ```
267
+ fauxdata preview tmp/people.csv --rows 10
268
+ ```
269
+
270
+ Shows the first N rows and a column statistics table (type, nulls, unique count, min/max).
271
+
272
+ | Option | Short | Default | Description |
273
+ |--------|-------|---------|-------------|
274
+ | `--rows` | `-r` | 10 | Number of rows to display |
275
+
276
+ ### `fauxdata init`
277
+
278
+ ```
279
+ fauxdata init
280
+ fauxdata init --name orders
281
+ ```
282
+
283
+ Interactive wizard to create a new schema template. Asks for name, description, row count, and default format.
284
+
285
+ | Option | Short | Description |
286
+ |--------|-------|-------------|
287
+ | `--name` | `-n` | Schema name (skips the interactive prompt) |
288
+
289
+ ---
290
+
291
+ ## Example schemas
292
+
293
+ Three ready-to-use schemas are included in `schemas/`:
294
+
295
+ | Schema | Domain | Columns |
296
+ |--------|--------|---------|
297
+ | `people.yml` | Personal data | id, name, email, age, city, country_code, active, signup_date, score |
298
+ | `orders.yml` | E-commerce | order_id, customer_id, product, amount, status, created_at |
299
+ | `events.yml` | Analytics | event_id, user_id, event_type, timestamp, ip, user_agent, session_duration |
300
+
301
+ ---
302
+
303
+ ## Acknowledgements
304
+
305
+ A heartfelt thank you to **[Rich Iannone](https://github.com/rich-iannone)** and the entire [pointblank](https://github.com/posit-dev/pointblank) team at [Posit](https://posit.co/) for building an exceptional data quality library — and for inspiring this project with their article:
306
+
307
+ > **[Building realistic fake datasets with pointblank](https://posit.co/blog/building-realistic-fake-datasets-with-pointblank/)**
308
+
309
+ Without their work, fauxdata would not exist. If you find pointblank useful, please give it a ⭐ on GitHub.
@@ -1,3 +1,7 @@
1
+ [![PyPI version](https://img.shields.io/pypi/v/fauxdata-cli.svg?label=PyPI%20version)](https://pypi.org/project/fauxdata-cli/)
2
+ [![Python Versions](https://img.shields.io/pypi/pyversions/fauxdata-cli.svg)](https://pypi.org/project/fauxdata-cli/)
3
+ [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/aborruso/fauxdata)
4
+
1
5
  # fauxdata
2
6
 
3
7
  **fauxdata** is a command-line tool for generating and validating realistic fake datasets from simple YAML schemas.
@@ -23,25 +27,21 @@ If you work with data — as an analyst, engineer, developer, or researcher —
23
27
  [uv](https://docs.astral.sh/uv/) installs fauxdata as an isolated tool available globally, without polluting any existing Python environment:
24
28
 
25
29
  ```bash
26
- git clone https://github.com/aborruso/fauxdata
27
- cd fauxdata
28
- uv tool install .
30
+ uv tool install fauxdata-cli
29
31
  ```
30
32
 
31
33
  After installation, `fauxdata` is available from any directory.
32
34
 
33
- To update after code changes:
35
+ To upgrade:
34
36
 
35
37
  ```bash
36
- uv tool install . --reinstall
38
+ uv tool upgrade fauxdata-cli
37
39
  ```
38
40
 
39
41
  ### With pip
40
42
 
41
43
  ```bash
42
- git clone https://github.com/aborruso/fauxdata
43
- cd fauxdata
44
- pip install .
44
+ pip install fauxdata-cli
45
45
  ```
46
46
 
47
47
  ---
@@ -901,13 +901,8 @@
901
901
  <div role="tabpanel" id="tab-uv" aria-labelledby="tab-btn-uv" class="tab-panel active">
902
902
  <div class="install-cmd">
903
903
  <span class="prompt" aria-hidden="true">$</span>
904
- <code>git clone https://github.com/aborruso/fauxdata &amp;&amp; cd fauxdata</code>
905
- <button class="copy-btn" onclick="copyCmd(this, 'git clone https://github.com/aborruso/fauxdata && cd fauxdata')" aria-label="Copy git clone command">copy</button>
906
- </div>
907
- <div class="install-cmd">
908
- <span class="prompt" aria-hidden="true">$</span>
909
- <code>uv tool install .</code>
910
- <button class="copy-btn" onclick="copyCmd(this, 'uv tool install .')" aria-label="Copy uv tool install command">copy</button>
904
+ <code>uv tool install fauxdata-cli</code>
905
+ <button class="copy-btn" onclick="copyCmd(this, 'uv tool install fauxdata-cli')" aria-label="Copy uv tool install command">copy</button>
911
906
  </div>
912
907
  <div class="install-cmd">
913
908
  <span class="prompt" aria-hidden="true">$</span>
@@ -920,13 +915,8 @@
920
915
  <div role="tabpanel" id="tab-pip" aria-labelledby="tab-btn-pip" class="tab-panel">
921
916
  <div class="install-cmd">
922
917
  <span class="prompt" aria-hidden="true">$</span>
923
- <code>git clone https://github.com/aborruso/fauxdata &amp;&amp; cd fauxdata</code>
924
- <button class="copy-btn" onclick="copyCmd(this, 'git clone https://github.com/aborruso/fauxdata && cd fauxdata')" aria-label="Copy git clone command">copy</button>
925
- </div>
926
- <div class="install-cmd">
927
- <span class="prompt" aria-hidden="true">$</span>
928
- <code>pip install .</code>
929
- <button class="copy-btn" onclick="copyCmd(this, 'pip install .')" aria-label="Copy pip install command">copy</button>
918
+ <code>pip install fauxdata-cli</code>
919
+ <button class="copy-btn" onclick="copyCmd(this, 'pip install fauxdata-cli')" aria-label="Copy pip install command">copy</button>
930
920
  </div>
931
921
  <div class="install-cmd">
932
922
  <span class="prompt" aria-hidden="true">$</span>
@@ -1,7 +1,8 @@
1
1
  [project]
2
2
  name = "fauxdata-cli"
3
- version = "0.1.0"
3
+ version = "0.1.1"
4
4
  description = "CLI for generating and validating fake datasets"
5
+ readme = "README.md"
5
6
  requires-python = ">=3.11"
6
7
  dependencies = [
7
8
  "pointblank>=0.22",
@@ -14,6 +15,11 @@ dependencies = [
14
15
  "faker>=26.0",
15
16
  ]
16
17
 
18
+ [project.urls]
19
+ Homepage = "https://aborruso.github.io/fauxdata/"
20
+ Repository = "https://github.com/aborruso/fauxdata"
21
+ "Bug Tracker" = "https://github.com/aborruso/fauxdata/issues"
22
+
17
23
  [project.scripts]
18
24
  fauxdata = "fauxdata.main:app"
19
25
 
@@ -1,13 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: fauxdata-cli
3
- Version: 0.1.0
4
- Summary: CLI for generating and validating fake datasets
5
- Requires-Python: >=3.11
6
- Requires-Dist: faker>=26.0
7
- Requires-Dist: pointblank>=0.22
8
- Requires-Dist: polars>=1.0
9
- Requires-Dist: pyfiglet>=1.0
10
- Requires-Dist: pyyaml>=6.0
11
- Requires-Dist: questionary>=2.0
12
- Requires-Dist: rich>=13
13
- Requires-Dist: typer>=0.12
File without changes
File without changes
File without changes