fakedata-python 2.0.3__py3-none-any.whl → 2.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fakedata/cli.py +80 -31
- {fakedata_python-2.0.3.dist-info → fakedata_python-2.0.4.dist-info}/METADATA +66 -3
- {fakedata_python-2.0.3.dist-info → fakedata_python-2.0.4.dist-info}/RECORD +7 -7
- {fakedata_python-2.0.3.dist-info → fakedata_python-2.0.4.dist-info}/WHEEL +0 -0
- {fakedata_python-2.0.3.dist-info → fakedata_python-2.0.4.dist-info}/entry_points.txt +0 -0
- {fakedata_python-2.0.3.dist-info → fakedata_python-2.0.4.dist-info}/licenses/LICENSE +0 -0
- {fakedata_python-2.0.3.dist-info → fakedata_python-2.0.4.dist-info}/top_level.txt +0 -0
fakedata/cli.py
CHANGED
|
@@ -89,44 +89,93 @@ EXAMPLES:
|
|
|
89
89
|
'anomaly_rate': args.anomaly_rate,
|
|
90
90
|
'missing_rate': args.missing_rate,
|
|
91
91
|
}
|
|
92
|
-
# Remove None values so defaults are used inside the engine
|
|
93
92
|
options = {k: v for k, v in options.items() if v is not None and v != 0.0}
|
|
94
93
|
|
|
94
|
+
count = args.count
|
|
95
95
|
start = time.time()
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
else: # json
|
|
112
|
-
if args.pretty:
|
|
113
|
-
output = data.users_to_json(args.count, options if options else None)
|
|
96
|
+
PROGRESS_INTERVAL = 10000
|
|
97
|
+
|
|
98
|
+
# ── stdout: buffer is fine for small terminal output ──────────────
|
|
99
|
+
if not args.output:
|
|
100
|
+
if args.timeseries:
|
|
101
|
+
results = [
|
|
102
|
+
data.user_time_series({**options, 'days': args.days, 'events_per_day': args.events_per_day})
|
|
103
|
+
for _ in range(count)
|
|
104
|
+
]
|
|
105
|
+
print(json.dumps(results, indent=2 if args.pretty else None))
|
|
106
|
+
elif args.format == 'csv':
|
|
107
|
+
print(data.users_to_csv(count, options if options else None))
|
|
108
|
+
elif args.format == 'flat':
|
|
109
|
+
rows = data.users_flat(count, options if options else None)
|
|
110
|
+
print(json.dumps(rows, indent=2 if args.pretty else None))
|
|
114
111
|
else:
|
|
115
|
-
|
|
112
|
+
if args.pretty:
|
|
113
|
+
print(data.users_to_json(count, options if options else None))
|
|
114
|
+
else:
|
|
115
|
+
print(json.dumps(data.users(count, options if options else None)))
|
|
116
|
+
return
|
|
117
|
+
|
|
118
|
+
# ── File: STREAMING — open file first, write one record at a time ──
|
|
119
|
+
out_path = os.path.abspath(args.output)
|
|
120
|
+
|
|
121
|
+
with open(out_path, 'w', encoding='utf-8') as f:
|
|
122
|
+
|
|
123
|
+
if args.timeseries:
|
|
124
|
+
f.write('[\n')
|
|
125
|
+
for i in range(count):
|
|
126
|
+
rec = data.user_time_series({**options, 'days': args.days, 'events_per_day': args.events_per_day})
|
|
127
|
+
line = json.dumps(rec, indent=2 if args.pretty else None)
|
|
128
|
+
f.write(line + (',' if i < count - 1 else '') + '\n')
|
|
129
|
+
if (i + 1) % PROGRESS_INTERVAL == 0:
|
|
130
|
+
print(f"\r ⏳ {i+1:,} / {count:,} written...", end='', file=sys.stderr)
|
|
131
|
+
f.write(']\n')
|
|
132
|
+
|
|
133
|
+
elif args.format == 'csv':
|
|
134
|
+
# Write header from first record
|
|
135
|
+
first = data.user(options if options else None)
|
|
136
|
+
header = ','.join(f'"{k}"' for k in first.keys())
|
|
137
|
+
f.write(header + '\n')
|
|
138
|
+
f.write(_user_to_csv_row(first) + '\n')
|
|
139
|
+
for i in range(1, count):
|
|
140
|
+
u = data.user(options if options else None)
|
|
141
|
+
f.write(_user_to_csv_row(u) + '\n')
|
|
142
|
+
if (i + 1) % PROGRESS_INTERVAL == 0:
|
|
143
|
+
print(f"\r ⏳ {i+1:,} / {count:,} written...", end='', file=sys.stderr)
|
|
144
|
+
|
|
145
|
+
else: # json / flat
|
|
146
|
+
f.write('[\n')
|
|
147
|
+
for i in range(count):
|
|
148
|
+
u = data.user(options if options else None)
|
|
149
|
+
line = json.dumps(u, indent=2 if args.pretty else None)
|
|
150
|
+
f.write(line + (',' if i < count - 1 else '') + '\n')
|
|
151
|
+
if (i + 1) % PROGRESS_INTERVAL == 0:
|
|
152
|
+
print(f"\r ⏳ {i+1:,} / {count:,} written...", end='', file=sys.stderr)
|
|
153
|
+
f.write(']\n')
|
|
116
154
|
|
|
117
155
|
elapsed = round(time.time() - start, 2)
|
|
118
|
-
|
|
119
|
-
if
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
156
|
+
size_bytes = os.path.getsize(out_path)
|
|
157
|
+
size_label = f"{size_bytes / 1048576:.1f} MB" if size_bytes >= 1048576 else f"{size_bytes / 1024:.1f} KB"
|
|
158
|
+
print('\r', end='', file=sys.stderr) # clear progress line
|
|
159
|
+
print(
|
|
160
|
+
f"✔ Done! Generated {count:,} users in {elapsed}s → {out_path} ({size_label})",
|
|
161
|
+
file=sys.stderr
|
|
162
|
+
)
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _user_to_csv_row(u):
|
|
166
|
+
"""Serialize a single user dict to a CSV row string."""
|
|
167
|
+
import json as _json
|
|
168
|
+
parts = []
|
|
169
|
+
for v in u.values():
|
|
170
|
+
if v is None:
|
|
171
|
+
parts.append('')
|
|
172
|
+
elif isinstance(v, (dict, list)):
|
|
173
|
+
parts.append('"' + _json.dumps(v).replace('"', '""') + '"')
|
|
174
|
+
elif isinstance(v, str):
|
|
175
|
+
parts.append('"' + v.replace('"', '""') + '"')
|
|
128
176
|
else:
|
|
129
|
-
|
|
177
|
+
parts.append(str(v))
|
|
178
|
+
return ','.join(parts)
|
|
130
179
|
|
|
131
180
|
|
|
132
181
|
if __name__ == '__main__':
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: fakedata-python
|
|
3
|
-
Version: 2.0.3
|
|
4
|
-
Summary: The fakedata package generates realistic
|
|
3
|
+
Version: 2.0.4
|
|
4
|
+
Summary: The fakedata package generates realistic user profiles for machine learning, deep learning, data analysis, and data science workflows.
|
|
5
5
|
Author-email: abhay557 <contact@abhaymourya.in>
|
|
6
6
|
License-Expression: MIT
|
|
7
7
|
Project-URL: Homepage, https://github.com/abhay557/fakedata
|
|
@@ -36,6 +36,33 @@ A high-performance, **zero-dependency** synthetic data generation engine, availa
|
|
|
36
36
|
- **Time-Series Data**: Generate chronological activity logs (logins, page views, purchases) per user for behavioral modeling.
|
|
37
37
|
- **Pipeline Ready**: Export directly to CSV, JSON, or Flat objects (perfect for `pandas.DataFrame`).
|
|
38
38
|
- **CLI Tool**: Generate and export datasets directly from your terminal — no scripting required.
|
|
39
|
+
- **Streaming Generation**: Files are written one record at a time — constant RAM usage regardless of dataset size. Generate 10M+ rows without running out of memory.
|
|
40
|
+
|
|
41
|
+
---
|
|
42
|
+
|
|
43
|
+
## Node.js / TypeScript Implementation
|
|
44
|
+
|
|
45
|
+
### Installation
|
|
46
|
+
```bash
|
|
47
|
+
npm install @abhay557/fakedata
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
### Quick Start
|
|
51
|
+
```javascript
|
|
52
|
+
const fakedata = require('@abhay557/fakedata');
|
|
53
|
+
|
|
54
|
+
// Generate deterministic users with a 5% missing data rate (null injection)
|
|
55
|
+
const users = fakedata.data.users(1000, { seed: 42, missing_rate: 0.05 });
|
|
56
|
+
|
|
57
|
+
// Export directly to CSV format
|
|
58
|
+
const csvString = fakedata.data.usersToCSV(1000, { seed: 42 });
|
|
59
|
+
|
|
60
|
+
// Time-series activity data
|
|
61
|
+
const ts = fakedata.userTimeSeries({ days: 30, eventsPerDay: 8 });
|
|
62
|
+
console.log(`Generated ${ts.activity.length} events for ${ts.user.fullName}`);
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
---
|
|
39
66
|
|
|
40
67
|
## Python Implementation
|
|
41
68
|
|
|
@@ -57,7 +84,7 @@ df = pd.DataFrame(fakedata.data.users_flat(10000, {"seed": 42}))
|
|
|
57
84
|
print(df.head())
|
|
58
85
|
|
|
59
86
|
# Create time-series activity data
|
|
60
|
-
ts = data.user_time_series({"days": 30, "events_per_day": 8})
|
|
87
|
+
ts = fakedata.data.user_time_series({"days": 30, "events_per_day": 8})
|
|
61
88
|
print(f"Generated {len(ts['activity'])} events for {ts['user']['fullName']}")
|
|
62
89
|
```
|
|
63
90
|
|
|
@@ -112,6 +139,9 @@ fakedata generate -n 500 -l in --seed 42 -o india.json
|
|
|
112
139
|
# Fraud detection dataset with 5% anomalies
|
|
113
140
|
fakedata generate -n 10000 -a 0.05 -f csv -o fraud_data.csv
|
|
114
141
|
|
|
142
|
+
# Generate 1 million rows without running out of memory (streaming)
|
|
143
|
+
fakedata generate -n 1000000 -f csv -o big_dataset.csv
|
|
144
|
+
|
|
115
145
|
# Preview a single user in the console
|
|
116
146
|
fakedata preview
|
|
117
147
|
|
|
@@ -119,6 +149,23 @@ fakedata preview
|
|
|
119
149
|
fakedata generate -n 100 --timeseries --days 60 -o activity.json
|
|
120
150
|
```
|
|
121
151
|
|
|
152
|
+
### Streaming Architecture
|
|
153
|
+
|
|
154
|
+
When writing to a file (`-o`), the CLI uses a **streaming write** strategy:
|
|
155
|
+
|
|
156
|
+
- The output file is **created first**, before any data is generated.
|
|
157
|
+
- Each user is generated **one at a time** and written immediately to disk.
|
|
158
|
+
- The generated object is then **discarded** — it is never held in a large array.
|
|
159
|
+
- **RAM usage stays constant** (O(1)) regardless of how many records you generate.
|
|
160
|
+
- A live progress counter is printed every 10,000 records for large jobs.
|
|
161
|
+
|
|
162
|
+
This means you can generate **tens of millions of rows** without hitting Node.js heap limits or Python memory errors.
|
|
163
|
+
|
|
164
|
+
```
|
|
165
|
+
Before (old): generate ALL → hold in RAM → write to file ❌ OOM at ~500k rows
|
|
166
|
+
After (new): open file → generate 1 → write → discard → repeat ✅ unlimited
|
|
167
|
+
```
|
|
168
|
+
|
|
122
169
|
---
|
|
123
170
|
### sample output - one user
|
|
124
171
|
```fakedata.data.user()```
|
|
@@ -413,3 +460,19 @@ Distributed under the **MIT License**. See `LICENSE` for more information.
|
|
|
413
460
|
- Project Commit History - `https://github.com/abhay557/random-api.xyz`
|
|
414
461
|
|
|
415
462
|
---
|
|
463
|
+
|
|
464
|
+
## Contributing
|
|
465
|
+
|
|
466
|
+
Contributions are welcome! Whether it's a bug fix, a new feature, or improved docs — every bit helps.
|
|
467
|
+
|
|
468
|
+
- Read the [Contributing Guide](./CONTRIBUTING.md) before submitting a PR.
|
|
469
|
+
- Use the [Bug Report](https://github.com/abhay557/fakedata/issues/new?template=bug_report.md) template to report issues.
|
|
470
|
+
- Use the [Feature Request](https://github.com/abhay557/fakedata/issues/new?template=feature_request.md) template to suggest ideas.
|
|
471
|
+
- Please follow our [Code of Conduct](./CODE_OF_CONDUCT.md) in all interactions.
|
|
472
|
+
|
|
473
|
+
```bash
|
|
474
|
+
# Fork the repo, then:
|
|
475
|
+
git clone https://github.com/YOUR_USERNAME/fakedata.git
|
|
476
|
+
git checkout -b feature/my-improvement
|
|
477
|
+
# Make your changes, then open a Pull Request!
|
|
478
|
+
```
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
fakedata/__init__.py,sha256=PXwXDWU2HFUfAF2zFMrxsJ7BvP5RSpTbF0GvxWCTt3g,93
|
|
2
|
-
fakedata/cli.py,sha256=
|
|
2
|
+
fakedata/cli.py,sha256=UZ0zks_zRT3T5vkvXCWOqZ1qy9q35rx0AbDpFYZOwh0,8288
|
|
3
3
|
fakedata/core.py,sha256=ZiZ51aZ3cAG7n02Giliq0XO5nN-bbjLJWM3pZQ6gWT4,437
|
|
4
4
|
fakedata/test_python.py,sha256=UpfmArMkM7bcRkV_MTed0vZ1QXokOAZqazcfISlLmZA,3226
|
|
5
5
|
fakedata/helpers/cardtype.json,sha256=3Ij5N_QPCO1Xg6g7jTp759yOVF9scXkbBDZvWYRaSAM,201
|
|
@@ -26,9 +26,9 @@ fakedata/helpers/street.json,sha256=Z-1cRr7uGMXBqlPoqoedPagfx_hLXqbWDNoylcnS8L0,
|
|
|
26
26
|
fakedata/helpers/universities.json,sha256=7NHac5anNxnCVSj6kdFc87896S2A3J-zT58Yw4lzkaQ,230421
|
|
27
27
|
fakedata/modules/__init__.py,sha256=buFp940xk9V39VnBFIca5ADTEtX8qsKz7_VQC3102tI,19
|
|
28
28
|
fakedata/modules/data.py,sha256=8MHRV5AtbzLOCBRKUP3E-dHc8nIQ3VuP-Xyk7c_-Eog,52542
|
|
29
|
-
fakedata_python-2.0.
|
|
30
|
-
fakedata_python-2.0.
|
|
31
|
-
fakedata_python-2.0.
|
|
32
|
-
fakedata_python-2.0.
|
|
33
|
-
fakedata_python-2.0.
|
|
34
|
-
fakedata_python-2.0.
|
|
29
|
+
fakedata_python-2.0.4.dist-info/licenses/LICENSE,sha256=uOpwjvuc2Qd4UOuj8ZcDb3FImq0iOK1JzWN4gbOqBVU,1090
|
|
30
|
+
fakedata_python-2.0.4.dist-info/METADATA,sha256=7nU8XHJgskgVBUdvrSkVaKoWPmkXFjMDzDJhogHQbgQ,18229
|
|
31
|
+
fakedata_python-2.0.4.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
32
|
+
fakedata_python-2.0.4.dist-info/entry_points.txt,sha256=qLOKT1Qujc8-qppTaDO2GUWcuoUQR9fSID3qvIaEAPo,47
|
|
33
|
+
fakedata_python-2.0.4.dist-info/top_level.txt,sha256=SHFa_6848yAE45QgW-PX_DHp_nakY64Zs_t2NobLcn0,9
|
|
34
|
+
fakedata_python-2.0.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|