fakesmith 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fakesmith-0.1.0/PKG-INFO +246 -0
- fakesmith-0.1.0/README.md +225 -0
- fakesmith-0.1.0/fakesmith/__init__.py +16 -0
- fakesmith-0.1.0/fakesmith/cli.py +150 -0
- fakesmith-0.1.0/fakesmith/detector.py +217 -0
- fakesmith-0.1.0/fakesmith/generators.py +205 -0
- fakesmith-0.1.0/fakesmith/result.py +75 -0
- fakesmith-0.1.0/fakesmith/sanitizer.py +180 -0
- fakesmith-0.1.0/fakesmith/schema.py +109 -0
- fakesmith-0.1.0/fakesmith/smith.py +199 -0
- fakesmith-0.1.0/fakesmith.egg-info/PKG-INFO +246 -0
- fakesmith-0.1.0/fakesmith.egg-info/SOURCES.txt +17 -0
- fakesmith-0.1.0/fakesmith.egg-info/dependency_links.txt +1 -0
- fakesmith-0.1.0/fakesmith.egg-info/entry_points.txt +2 -0
- fakesmith-0.1.0/fakesmith.egg-info/requires.txt +5 -0
- fakesmith-0.1.0/fakesmith.egg-info/top_level.txt +1 -0
- fakesmith-0.1.0/pyproject.toml +41 -0
- fakesmith-0.1.0/setup.cfg +4 -0
- fakesmith-0.1.0/tests/test_fakesmith.py +329 -0
fakesmith-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: fakesmith
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Generate realistic fake data that mirrors your real data's shape — safe to share with LLMs.
|
|
5
|
+
License: MIT
|
|
6
|
+
Keywords: fake data,data masking,privacy,testing,mock data,llm safety
|
|
7
|
+
Classifier: Development Status :: 3 - Alpha
|
|
8
|
+
Classifier: Intended Audience :: Developers
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Classifier: Topic :: Software Development :: Testing
|
|
14
|
+
Classifier: Topic :: Security
|
|
15
|
+
Requires-Python: >=3.10
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
Requires-Dist: faker>=24.0.0
|
|
18
|
+
Provides-Extra: dev
|
|
19
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
20
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# FakeSmith
|
|
28
|
+
|
|
29
|
+
> Generate realistic fake data that **mirrors your real data's shape** — safe to share with LLMs, teammates, or in public repos.
|
|
30
|
+
|
|
31
|
+
A Python package and CLI that converts real configs, payloads, logs, and datasets into schema-preserving synthetic versions safe to share with LLMs. LLM-safe sanitization of real developer artifacts is a real and growing workflow problem.
|
|
32
|
+
|
|
33
|
+
When you share code with an AI assistant, you shouldn't have to expose real emails, API keys, card numbers, or user data. FakeSmith lets you describe (or just paste) a sample of your data and instantly get structurally identical but completely fake replacements.
|
|
34
|
+
|
|
35
|
+
---
|
|
36
|
+
|
|
37
|
+
## Install
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
pip install fakesmith
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## Quick Start
|
|
46
|
+
|
|
47
|
+
### Option 1 — Auto-detect from a sample
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
from fakesmith import FakeSmith
|
|
51
|
+
|
|
52
|
+
# Paste a real (or representative) sample — FakeSmith reads its shape
|
|
53
|
+
sample = '''[{
|
|
54
|
+
"user_id": "3f2e1a4b-0000-0000-0000-000000000000",
|
|
55
|
+
"email": "john.doe@company.com",
|
|
56
|
+
"phone": "+1-800-555-0199",
|
|
57
|
+
"api_key": "sk-abc123def456ghi789jkl012",
|
|
58
|
+
"amount": 199.99,
|
|
59
|
+
"status": "active",
|
|
60
|
+
"created_at": "2024-01-15T09:30:00"
|
|
61
|
+
}]'''
|
|
62
|
+
|
|
63
|
+
smith = FakeSmith.from_sample(sample)
|
|
64
|
+
smith.describe() # see what was detected
|
|
65
|
+
print(smith.to_json(5)) # 5 fake records, same shape
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### Option 2 — Define a schema explicitly

```python
|
|
69
|
+
from fakesmith import FakeSmith, SchemaField, FieldType
|
|
70
|
+
|
|
71
|
+
smith = FakeSmith([
|
|
72
|
+
SchemaField("user_id", FieldType.UUID),
|
|
73
|
+
SchemaField("email", FieldType.EMAIL),
|
|
74
|
+
SchemaField("name", FieldType.FULL_NAME),
|
|
75
|
+
SchemaField("amount", FieldType.AMOUNT, min_value=10, max_value=5000),
|
|
76
|
+
SchemaField("status", FieldType.STATUS, choices=["active", "inactive", "pending"]),
|
|
77
|
+
SchemaField("api_key", FieldType.API_KEY, prefix="sk-live-"),
|
|
78
|
+
])
|
|
79
|
+
|
|
80
|
+
# Generate deterministic records with a seed
|
|
81
|
+
result = smith.generate(10, seed=42)
|
|
82
|
+
result.print_summary() # See which fields were faked
|
|
83
|
+
records = result.records # Access the list of dicts
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### Option 3 — Quick dict shorthand
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
smith = FakeSmith.from_dict({
|
|
90
|
+
"id": FieldType.UUID,
|
|
91
|
+
"email": FieldType.EMAIL,
|
|
92
|
+
"score": FieldType.INTEGER,
|
|
93
|
+
"verified": FieldType.BOOLEAN,
|
|
94
|
+
})
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
## Output Formats
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
smith.to_json(10) # JSON string
|
|
103
|
+
smith.to_csv(10) # CSV string
|
|
104
|
+
smith.to_sql(10, table_name="users") # SQL INSERT statements
|
|
105
|
+
smith.to_env() # .env file format
|
|
106
|
+
|
|
107
|
+
smith.save_json("fake_users.json", 100) # save to file
|
|
108
|
+
smith.save_csv("fake_users.csv", 100)
|
|
109
|
+
smith.save_sql("seed.sql", 100, table_name="users")
|
|
110
|
+
smith.save_env(".env.fake")
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
---
|
|
114
|
+
|
|
115
|
+
## CLI
|
|
116
|
+
|
|
117
|
+
```bash
|
|
118
|
+
# Generate 20 fake records from a JSON sample
|
|
119
|
+
fakesmith generate --file real_sample.json --count 20 --format json
|
|
120
|
+
|
|
121
|
+
# From CSV, output as SQL inserts
|
|
122
|
+
fakesmith generate --file data.csv --count 50 --format sql --table transactions
|
|
123
|
+
|
|
124
|
+
# Deterministic output using a seed
|
|
125
|
+
fakesmith generate --file data.json --seed 42 --out fake_data.json
|
|
126
|
+
|
|
127
|
+
# Sanitize raw text (log lines, configs) in-place
|
|
128
|
+
fakesmith sanitize --file server.log --out clean.log --summary
|
|
129
|
+
|
|
130
|
+
# Inspect detected schema and sensitivity flags
|
|
131
|
+
fakesmith describe --file data.json
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
---
|
|
135
|
+
|
|
136
|
+
## In-place Sanitization
|
|
137
|
+
|
|
138
|
+
FakeSmith can scan raw text (log lines, configuration blocks, or emails) and replace PII/secrets in-place without needing a schema.
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
from fakesmith import sanitize_text
|
|
142
|
+
|
|
143
|
+
raw_text = "My email is alex@example.com and my key is sk-12345"
|
|
144
|
+
result = sanitize_text(raw_text, seed=42)
|
|
145
|
+
|
|
146
|
+
print(result.sanitized)
|
|
147
|
+
# "My email is fake.user@domain.com and my key is sk-a1b2c3d4..."
|
|
148
|
+
|
|
149
|
+
result.print_summary() # See exactly what was replaced and why
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
---
|
|
153
|
+
|
|
154
|
+
## Run the Samples
|
|
155
|
+
|
|
156
|
+
Try out FakeSmith on the included sample datasets (JSON, CSV, and .env) using the demo script:
|
|
157
|
+
|
|
158
|
+
1. **Setup Environment**
|
|
159
|
+
```bash
|
|
160
|
+
python3 -m venv venv
|
|
161
|
+
source venv/bin/activate
|
|
162
|
+
pip install faker pytest
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
2. **Run the Samples**
|
|
166
|
+
To run any script in the `examples/` folder while working on the source code, you must set the `PYTHONPATH` to the current directory:
|
|
167
|
+
|
|
168
|
+
```bash
|
|
169
|
+
# Set PYTHONPATH to the root so Python can find the 'fakesmith' package
|
|
170
|
+
export PYTHONPATH=$PYTHONPATH:.
|
|
171
|
+
|
|
172
|
+
# Run the main demo
|
|
173
|
+
python3 examples/demo_all.py
|
|
174
|
+
|
|
175
|
+
# Or run any individual sample
|
|
176
|
+
python3 examples/export_to_sql_csv.py
|
|
177
|
+
python3 examples/sanitize_logs_in_place.py
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
3. **Explore the examples/ directory**
|
|
181
|
+
The `examples/` folder contains several targeted scripts illustrating different features (auto-detection, manual schemas, in-place sanitization, etc.).
|
|
182
|
+
|
|
183
|
+
---
|
|
184
|
+
|
|
185
|
+
## Override Auto-Detection
|
|
186
|
+
|
|
187
|
+
```python
|
|
188
|
+
smith = FakeSmith.from_sample(
|
|
189
|
+
my_json,
|
|
190
|
+
overrides={
|
|
191
|
+
# Auto-detected "status" as SENTENCE — override to proper STATUS
|
|
192
|
+
"status": SchemaField("status", FieldType.STATUS, choices=["open", "closed", "resolved"]),
|
|
193
|
+
# Keep a realistic amount range
|
|
194
|
+
"balance": SchemaField("balance", FieldType.AMOUNT, min_value=0, max_value=100000),
|
|
195
|
+
}
|
|
196
|
+
)
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
---
|
|
200
|
+
|
|
201
|
+
## Custom Fields
|
|
202
|
+
|
|
203
|
+
```python
|
|
204
|
+
import random
|
|
205
|
+
|
|
206
|
+
smith = FakeSmith([
|
|
207
|
+
SchemaField("ref_code", FieldType.CUSTOM,
|
|
208
|
+
generator=lambda: f"REF-{random.randint(10000, 99999)}"
|
|
209
|
+
),
|
|
210
|
+
SchemaField("tier", FieldType.CUSTOM,
|
|
211
|
+
generator=lambda: random.choice(["bronze", "silver", "gold", "platinum"])
|
|
212
|
+
),
|
|
213
|
+
])
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
---
|
|
217
|
+
|
|
218
|
+
## Supported Field Types
|
|
219
|
+
|
|
220
|
+
| Category | Types |
|
|
221
|
+
|------------|-------|
|
|
222
|
+
| Identity | UUID, FULL_NAME, FIRST_NAME, LAST_NAME, USERNAME, EMAIL, PHONE, PASSWORD, PASSWORD_HASH |
|
|
223
|
+
| Location | ADDRESS, CITY, STATE, COUNTRY, ZIP_CODE, LATITUDE, LONGITUDE |
|
|
224
|
+
| Finance | CARD_NUMBER, CARD_EXPIRY, CARD_CVV, BANK_ACCOUNT, IBAN, AMOUNT, CURRENCY |
|
|
225
|
+
| Business | COMPANY, JOB_TITLE, DEPARTMENT, API_KEY, SECRET_TOKEN, JWT_TOKEN, WEBHOOK_URL |
|
|
226
|
+
| Dates | DATETIME, DATE, TIME, DATE_OF_BIRTH, TIMESTAMP |
|
|
227
|
+
| Web & Tech | IP_ADDRESS, IPV6, MAC_ADDRESS, USER_AGENT, URL, DOMAIN, SLUG |
|
|
228
|
+
| Content | WORD, SENTENCE, PARAGRAPH, TITLE, DESCRIPTION, TAG |
|
|
229
|
+
| Numeric | INTEGER, FLOAT, BOOLEAN, PERCENTAGE |
|
|
230
|
+
| Enums | STATUS, GENDER, CUSTOM |
|
|
231
|
+
|
|
232
|
+
---
|
|
233
|
+
|
|
234
|
+
## Why FakeSmith?
|
|
235
|
+
|
|
236
|
+
- **LLM-safe** — no real credentials, PII, or secrets ever leave your machine
|
|
237
|
+
- **Zero config** — paste a sample and go
|
|
238
|
+
- **Structurally identical** — same field names, same types, realistic values
|
|
239
|
+
- **All formats** — JSON, CSV, SQL, .env
|
|
240
|
+
- **Extensible** — override any field with a custom generator
|
|
241
|
+
|
|
242
|
+
---
|
|
243
|
+
|
|
244
|
+
## License
|
|
245
|
+
|
|
246
|
+
MIT
|
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# FakeSmith
|
|
7
|
+
|
|
8
|
+
> Generate realistic fake data that **mirrors your real data's shape** — safe to share with LLMs, teammates, or in public repos.
|
|
9
|
+
|
|
10
|
+
A Python package and CLI that converts real configs, payloads, logs, and datasets into schema-preserving synthetic versions safe to share with LLMs. LLM-safe sanitization of real developer artifacts is a real and growing workflow problem.
|
|
11
|
+
|
|
12
|
+
When you share code with an AI assistant, you shouldn't have to expose real emails, API keys, card numbers, or user data. FakeSmith lets you describe (or just paste) a sample of your data and instantly get structurally identical but completely fake replacements.
|
|
13
|
+
|
|
14
|
+
---
|
|
15
|
+
|
|
16
|
+
## Install
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install fakesmith
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## Quick Start
|
|
25
|
+
|
|
26
|
+
### Option 1 — Auto-detect from a sample
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
from fakesmith import FakeSmith
|
|
30
|
+
|
|
31
|
+
# Paste a real (or representative) sample — FakeSmith reads its shape
|
|
32
|
+
sample = '''[{
|
|
33
|
+
"user_id": "3f2e1a4b-0000-0000-0000-000000000000",
|
|
34
|
+
"email": "john.doe@company.com",
|
|
35
|
+
"phone": "+1-800-555-0199",
|
|
36
|
+
"api_key": "sk-abc123def456ghi789jkl012",
|
|
37
|
+
"amount": 199.99,
|
|
38
|
+
"status": "active",
|
|
39
|
+
"created_at": "2024-01-15T09:30:00"
|
|
40
|
+
}]'''
|
|
41
|
+
|
|
42
|
+
smith = FakeSmith.from_sample(sample)
|
|
43
|
+
smith.describe() # see what was detected
|
|
44
|
+
print(smith.to_json(5)) # 5 fake records, same shape
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
### Option 2 — Define a schema explicitly

```python
|
|
48
|
+
from fakesmith import FakeSmith, SchemaField, FieldType
|
|
49
|
+
|
|
50
|
+
smith = FakeSmith([
|
|
51
|
+
SchemaField("user_id", FieldType.UUID),
|
|
52
|
+
SchemaField("email", FieldType.EMAIL),
|
|
53
|
+
SchemaField("name", FieldType.FULL_NAME),
|
|
54
|
+
SchemaField("amount", FieldType.AMOUNT, min_value=10, max_value=5000),
|
|
55
|
+
SchemaField("status", FieldType.STATUS, choices=["active", "inactive", "pending"]),
|
|
56
|
+
SchemaField("api_key", FieldType.API_KEY, prefix="sk-live-"),
|
|
57
|
+
])
|
|
58
|
+
|
|
59
|
+
# Generate deterministic records with a seed
|
|
60
|
+
result = smith.generate(10, seed=42)
|
|
61
|
+
result.print_summary() # See which fields were faked
|
|
62
|
+
records = result.records # Access the list of dicts
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### Option 3 — Quick dict shorthand
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
smith = FakeSmith.from_dict({
|
|
69
|
+
"id": FieldType.UUID,
|
|
70
|
+
"email": FieldType.EMAIL,
|
|
71
|
+
"score": FieldType.INTEGER,
|
|
72
|
+
"verified": FieldType.BOOLEAN,
|
|
73
|
+
})
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
---
|
|
77
|
+
|
|
78
|
+
## Output Formats
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
smith.to_json(10) # JSON string
|
|
82
|
+
smith.to_csv(10) # CSV string
|
|
83
|
+
smith.to_sql(10, table_name="users") # SQL INSERT statements
|
|
84
|
+
smith.to_env() # .env file format
|
|
85
|
+
|
|
86
|
+
smith.save_json("fake_users.json", 100) # save to file
|
|
87
|
+
smith.save_csv("fake_users.csv", 100)
|
|
88
|
+
smith.save_sql("seed.sql", 100, table_name="users")
|
|
89
|
+
smith.save_env(".env.fake")
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
---
|
|
93
|
+
|
|
94
|
+
## CLI
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
# Generate 20 fake records from a JSON sample
|
|
98
|
+
fakesmith generate --file real_sample.json --count 20 --format json
|
|
99
|
+
|
|
100
|
+
# From CSV, output as SQL inserts
|
|
101
|
+
fakesmith generate --file data.csv --count 50 --format sql --table transactions
|
|
102
|
+
|
|
103
|
+
# Deterministic output using a seed
|
|
104
|
+
fakesmith generate --file data.json --seed 42 --out fake_data.json
|
|
105
|
+
|
|
106
|
+
# Sanitize raw text (log lines, configs) in-place
|
|
107
|
+
fakesmith sanitize --file server.log --out clean.log --summary
|
|
108
|
+
|
|
109
|
+
# Inspect detected schema and sensitivity flags
|
|
110
|
+
fakesmith describe --file data.json
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
---
|
|
114
|
+
|
|
115
|
+
## In-place Sanitization
|
|
116
|
+
|
|
117
|
+
FakeSmith can scan raw text (log lines, configuration blocks, or emails) and replace PII/secrets in-place without needing a schema.
|
|
118
|
+
|
|
119
|
+
```python
|
|
120
|
+
from fakesmith import sanitize_text
|
|
121
|
+
|
|
122
|
+
raw_text = "My email is alex@example.com and my key is sk-12345"
|
|
123
|
+
result = sanitize_text(raw_text, seed=42)
|
|
124
|
+
|
|
125
|
+
print(result.sanitized)
|
|
126
|
+
# "My email is fake.user@domain.com and my key is sk-a1b2c3d4..."
|
|
127
|
+
|
|
128
|
+
result.print_summary() # See exactly what was replaced and why
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
---
|
|
132
|
+
|
|
133
|
+
## Run the Samples
|
|
134
|
+
|
|
135
|
+
Try out FakeSmith on the included sample datasets (JSON, CSV, and .env) using the demo script:
|
|
136
|
+
|
|
137
|
+
1. **Setup Environment**
|
|
138
|
+
```bash
|
|
139
|
+
python3 -m venv venv
|
|
140
|
+
source venv/bin/activate
|
|
141
|
+
pip install faker pytest
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
2. **Run the Samples**
|
|
145
|
+
To run any script in the `examples/` folder while working on the source code, you must set the `PYTHONPATH` to the current directory:
|
|
146
|
+
|
|
147
|
+
```bash
|
|
148
|
+
# Set PYTHONPATH to the root so Python can find the 'fakesmith' package
|
|
149
|
+
export PYTHONPATH=$PYTHONPATH:.
|
|
150
|
+
|
|
151
|
+
# Run the main demo
|
|
152
|
+
python3 examples/demo_all.py
|
|
153
|
+
|
|
154
|
+
# Or run any individual sample
|
|
155
|
+
python3 examples/export_to_sql_csv.py
|
|
156
|
+
python3 examples/sanitize_logs_in_place.py
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
3. **Explore the examples/ directory**
|
|
160
|
+
The `examples/` folder contains several targeted scripts illustrating different features (auto-detection, manual schemas, in-place sanitization, etc.).
|
|
161
|
+
|
|
162
|
+
---
|
|
163
|
+
|
|
164
|
+
## Override Auto-Detection
|
|
165
|
+
|
|
166
|
+
```python
|
|
167
|
+
smith = FakeSmith.from_sample(
|
|
168
|
+
my_json,
|
|
169
|
+
overrides={
|
|
170
|
+
# Auto-detected "status" as SENTENCE — override to proper STATUS
|
|
171
|
+
"status": SchemaField("status", FieldType.STATUS, choices=["open", "closed", "resolved"]),
|
|
172
|
+
# Keep a realistic amount range
|
|
173
|
+
"balance": SchemaField("balance", FieldType.AMOUNT, min_value=0, max_value=100000),
|
|
174
|
+
}
|
|
175
|
+
)
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
---
|
|
179
|
+
|
|
180
|
+
## Custom Fields
|
|
181
|
+
|
|
182
|
+
```python
|
|
183
|
+
import random
|
|
184
|
+
|
|
185
|
+
smith = FakeSmith([
|
|
186
|
+
SchemaField("ref_code", FieldType.CUSTOM,
|
|
187
|
+
generator=lambda: f"REF-{random.randint(10000, 99999)}"
|
|
188
|
+
),
|
|
189
|
+
SchemaField("tier", FieldType.CUSTOM,
|
|
190
|
+
generator=lambda: random.choice(["bronze", "silver", "gold", "platinum"])
|
|
191
|
+
),
|
|
192
|
+
])
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
---
|
|
196
|
+
|
|
197
|
+
## Supported Field Types
|
|
198
|
+
|
|
199
|
+
| Category | Types |
|
|
200
|
+
|------------|-------|
|
|
201
|
+
| Identity | UUID, FULL_NAME, FIRST_NAME, LAST_NAME, USERNAME, EMAIL, PHONE, PASSWORD, PASSWORD_HASH |
|
|
202
|
+
| Location | ADDRESS, CITY, STATE, COUNTRY, ZIP_CODE, LATITUDE, LONGITUDE |
|
|
203
|
+
| Finance | CARD_NUMBER, CARD_EXPIRY, CARD_CVV, BANK_ACCOUNT, IBAN, AMOUNT, CURRENCY |
|
|
204
|
+
| Business | COMPANY, JOB_TITLE, DEPARTMENT, API_KEY, SECRET_TOKEN, JWT_TOKEN, WEBHOOK_URL |
|
|
205
|
+
| Dates | DATETIME, DATE, TIME, DATE_OF_BIRTH, TIMESTAMP |
|
|
206
|
+
| Web & Tech | IP_ADDRESS, IPV6, MAC_ADDRESS, USER_AGENT, URL, DOMAIN, SLUG |
|
|
207
|
+
| Content | WORD, SENTENCE, PARAGRAPH, TITLE, DESCRIPTION, TAG |
|
|
208
|
+
| Numeric | INTEGER, FLOAT, BOOLEAN, PERCENTAGE |
|
|
209
|
+
| Enums | STATUS, GENDER, CUSTOM |
|
|
210
|
+
|
|
211
|
+
---
|
|
212
|
+
|
|
213
|
+
## Why FakeSmith?
|
|
214
|
+
|
|
215
|
+
- **LLM-safe** — no real credentials, PII, or secrets ever leave your machine
|
|
216
|
+
- **Zero config** — paste a sample and go
|
|
217
|
+
- **Structurally identical** — same field names, same types, realistic values
|
|
218
|
+
- **All formats** — JSON, CSV, SQL, .env
|
|
219
|
+
- **Extensible** — override any field with a custom generator
|
|
220
|
+
|
|
221
|
+
---
|
|
222
|
+
|
|
223
|
+
## License
|
|
224
|
+
|
|
225
|
+
MIT
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from .smith import FakeSmith
|
|
2
|
+
from .schema import SchemaField, FieldType
|
|
3
|
+
from .detector import infer_schema
|
|
4
|
+
from .sanitizer import sanitize_text, SanitizeResult
|
|
5
|
+
from .result import GenerationResult, FieldSummary
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"FakeSmith",
|
|
9
|
+
"SchemaField",
|
|
10
|
+
"FieldType",
|
|
11
|
+
"infer_schema",
|
|
12
|
+
"sanitize_text",
|
|
13
|
+
"SanitizeResult",
|
|
14
|
+
"GenerationResult",
|
|
15
|
+
"FieldSummary",
|
|
16
|
+
]
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
"""
|
|
2
|
+
cli.py — Command-line interface for FakeSmith.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
# Generate fake records from a sample file
|
|
6
|
+
fakesmith generate --file data.json --count 20 --format csv
|
|
7
|
+
fakesmith generate --file data.json --count 5 --seed 42 --format sql --table users
|
|
8
|
+
fakesmith generate --file .env --count 1 --format env
|
|
9
|
+
|
|
10
|
+
# Sanitize raw text in-place (plain text / log lines / config blocks)
|
|
11
|
+
fakesmith sanitize --text "email is john@co.com and key is sk-abc123"
|
|
12
|
+
fakesmith sanitize --file raw_log.txt --out clean_log.txt
|
|
13
|
+
|
|
14
|
+
# Inspect detected schema
|
|
15
|
+
fakesmith describe --file data.json
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
import argparse
|
|
19
|
+
import sys
|
|
20
|
+
import json
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
|
|
23
|
+
from .smith import FakeSmith
|
|
24
|
+
from .sanitizer import sanitize_text
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def detect_source_type(path: str) -> str:
    """Map a file path to the sample format implied by its extension.

    The extension is compared case-insensitively. ``.json`` yields
    ``"json"`` and ``.csv`` yields ``"csv"``; any other extension (or no
    extension at all) yields ``"auto"``.
    """
    suffix = Path(path).suffix.lower()
    if suffix == ".json":
        return "json"
    if suffix == ".csv":
        return "csv"
    return "auto"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def cmd_generate(args):
    """Run the ``generate`` subcommand: fabricate records shaped like a sample file.

    Reads ``args.file``, builds a ``FakeSmith`` from its contents, and
    renders ``args.count`` records in ``args.format``. The rendered text is
    written to ``args.out`` when that is set, otherwise printed to stdout.

    Args:
        args: Parsed namespace carrying ``file``, ``count``, ``seed``,
            ``format``, ``table``, ``out`` and ``summary``.

    Raises:
        SystemExit: With status 1 when the sample file is missing or cannot
            be read/decoded, or when ``args.format`` is not a known format.
    """
    try:
        # Explicit UTF-8 so decoding does not depend on the platform locale.
        with open(args.file, encoding="utf-8") as f:
            raw = f.read()
    except FileNotFoundError:
        print(f"❌ File not found: {args.file}", file=sys.stderr)
        sys.exit(1)
    except (OSError, UnicodeDecodeError) as exc:
        # Directories, permission problems and non-UTF-8 input get a clean
        # message instead of a traceback.
        print(f"❌ Could not read {args.file}: {exc}", file=sys.stderr)
        sys.exit(1)

    smith = FakeSmith.from_sample(raw, source_type=detect_source_type(args.file))

    # This result backs both the optional summary and the JSON rendering.
    result = smith.generate(args.count, seed=args.seed)

    if args.summary:
        result.print_summary()

    if args.format == "json":
        output = json.dumps(result.records, indent=2, default=str)
    elif args.format == "csv":
        # NOTE(review): the to_* helpers are invoked separately from
        # `result`; presumably they generate their own records, so without
        # a seed the summary may describe a different batch — confirm.
        output = smith.to_csv(args.count, seed=args.seed)
    elif args.format == "sql":
        output = smith.to_sql(args.count, table_name=args.table, seed=args.seed)
    elif args.format == "env":
        output = smith.to_env(seed=args.seed)
    else:
        # argparse restricts --format, but a hand-built namespace could
        # still reach here; fail clearly rather than with an unbound name.
        print(f"❌ Unsupported format: {args.format}", file=sys.stderr)
        sys.exit(1)

    if args.out:
        with open(args.out, "w", encoding="utf-8") as f:
            f.write(output)
        print(f"✅ Saved {args.count} records → {args.out}")
    else:
        print(output)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def cmd_sanitize(args):
    """Run the ``sanitize`` subcommand: pass raw text through ``sanitize_text``.

    The input comes from ``args.text`` when it was supplied (even as an
    empty string), otherwise from the file named by ``args.file``. The
    sanitized text is written to ``args.out`` when set, else printed.

    Args:
        args: Parsed namespace carrying ``text``, ``file``, ``seed``,
            ``summary`` and ``out``.

    Raises:
        SystemExit: With status 1 when neither input was given, or when the
            input file is missing or cannot be read/decoded.
    """
    # `is not None` rather than truthiness: `--text ""` is still an explicit
    # input and must not be reported as "Provide --text or --file".
    if args.text is not None:
        raw = args.text
    elif args.file:
        try:
            # Explicit UTF-8 so decoding does not depend on the platform locale.
            with open(args.file, encoding="utf-8") as f:
                raw = f.read()
        except FileNotFoundError:
            print(f"❌ File not found: {args.file}", file=sys.stderr)
            sys.exit(1)
        except (OSError, UnicodeDecodeError) as exc:
            print(f"❌ Could not read {args.file}: {exc}", file=sys.stderr)
            sys.exit(1)
    else:
        print("❌ Provide --text or --file", file=sys.stderr)
        sys.exit(1)

    result = sanitize_text(raw, seed=args.seed)

    # The summary is shown on request, and always when the input is a file.
    if args.summary or args.file:
        result.print_summary()

    if args.out:
        with open(args.out, "w", encoding="utf-8") as f:
            f.write(result.sanitized)
        print(f"✅ Sanitized output saved → {args.out}")
    else:
        print(result.sanitized)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def cmd_describe(args):
    """Run the ``describe`` subcommand on a sample file.

    Reads ``args.file``, builds a ``FakeSmith`` from its contents and calls
    its ``describe()`` method.

    Args:
        args: Parsed namespace carrying ``file``.

    Raises:
        SystemExit: With status 1 when the file is missing or cannot be
            read/decoded.
    """
    try:
        # Explicit UTF-8 so decoding does not depend on the platform locale.
        with open(args.file, encoding="utf-8") as f:
            raw = f.read()
    except FileNotFoundError:
        print(f"❌ File not found: {args.file}", file=sys.stderr)
        sys.exit(1)
    except (OSError, UnicodeDecodeError) as exc:
        # Directories, permission problems and non-UTF-8 input get a clean
        # message instead of a traceback.
        print(f"❌ Could not read {args.file}: {exc}", file=sys.stderr)
        sys.exit(1)

    smith = FakeSmith.from_sample(raw, source_type=detect_source_type(args.file))
    smith.describe()
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def main(argv=None):
    """Entry point for the ``fakesmith`` command-line tool.

    Builds the argument parser with the ``generate``, ``sanitize`` and
    ``describe`` subcommands (plus their short aliases), parses the
    arguments and dispatches to the matching ``cmd_*`` handler.

    Args:
        argv: Argument list to parse, excluding the program name. ``None``
            (the default) lets argparse read ``sys.argv[1:]``, so existing
            zero-argument callers behave as before.

    Raises:
        SystemExit: Raised by argparse (status 2) on invalid usage, or by a
            handler on a runtime failure.
    """
    parser = argparse.ArgumentParser(
        prog="fakesmith",
        description="Generate or sanitize fake data safe to share with LLMs.",
    )
    sub = parser.add_subparsers(dest="command", required=True)

    # ── generate ──────────────────────────────────────────────────────────────
    gen = sub.add_parser("generate", aliases=["gen", "g"],
                         help="Generate fake records from a sample file")
    gen.add_argument("--file", "-f", required=True)
    gen.add_argument("--count", "-n", type=int, default=10)
    gen.add_argument("--seed", "-s", type=int, default=None,
                     help="Integer seed for deterministic output")
    gen.add_argument("--format", "-o", default="json",
                     choices=["json", "csv", "sql", "env"])
    gen.add_argument("--table", "-t", default="records")
    gen.add_argument("--out", default=None)
    gen.add_argument("--summary", action="store_true",
                     help="Print change summary and warnings")

    # ── sanitize ──────────────────────────────────────────────────────────────
    san = sub.add_parser("sanitize", aliases=["san", "s"],
                         help="Replace secrets/PII in raw text in-place")
    san_src = san.add_mutually_exclusive_group(required=True)
    san_src.add_argument("--text", "-t", help="Inline string to sanitize")
    san_src.add_argument("--file", "-f", help="File to sanitize")
    san.add_argument("--out", "-o", default=None, help="Output file")
    san.add_argument("--seed", "-s", type=int, default=None)
    san.add_argument("--summary", action="store_true")

    # ── describe ──────────────────────────────────────────────────────────────
    desc = sub.add_parser("describe", aliases=["d"],
                          help="Show detected schema and sensitivity flags")
    desc.add_argument("--file", "-f", required=True)

    args = parser.parse_args(argv)

    # `command` holds whichever spelling the user typed, so every alias
    # has to be matched explicitly.
    if args.command in ("generate", "gen", "g"):
        cmd_generate(args)
    elif args.command in ("sanitize", "san", "s"):
        cmd_sanitize(args)
    elif args.command in ("describe", "d"):
        cmd_describe(args)


if __name__ == "__main__":
    main()
|