fakedata-python 2.0.2__py3-none-any.whl → 2.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fakedata/cli.py +80 -31
- {fakedata_python-2.0.2.dist-info → fakedata_python-2.0.4.dist-info}/METADATA +70 -26
- {fakedata_python-2.0.2.dist-info → fakedata_python-2.0.4.dist-info}/RECORD +7 -6
- fakedata_python-2.0.4.dist-info/licenses/LICENSE +21 -0
- {fakedata_python-2.0.2.dist-info → fakedata_python-2.0.4.dist-info}/WHEEL +0 -0
- {fakedata_python-2.0.2.dist-info → fakedata_python-2.0.4.dist-info}/entry_points.txt +0 -0
- {fakedata_python-2.0.2.dist-info → fakedata_python-2.0.4.dist-info}/top_level.txt +0 -0
fakedata/cli.py
CHANGED
|
@@ -89,44 +89,93 @@ EXAMPLES:
|
|
|
89
89
|
'anomaly_rate': args.anomaly_rate,
|
|
90
90
|
'missing_rate': args.missing_rate,
|
|
91
91
|
}
|
|
92
|
-
# Remove None values so defaults are used inside the engine
|
|
93
92
|
options = {k: v for k, v in options.items() if v is not None and v != 0.0}
|
|
94
93
|
|
|
94
|
+
count = args.count
|
|
95
95
|
start = time.time()
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
else: # json
|
|
112
|
-
if args.pretty:
|
|
113
|
-
output = data.users_to_json(args.count, options if options else None)
|
|
96
|
+
PROGRESS_INTERVAL = 10000
|
|
97
|
+
|
|
98
|
+
# ── stdout: building the full result in memory is fine for small terminal output ──
|
|
99
|
+
if not args.output:
|
|
100
|
+
if args.timeseries:
|
|
101
|
+
results = [
|
|
102
|
+
data.user_time_series({**options, 'days': args.days, 'events_per_day': args.events_per_day})
|
|
103
|
+
for _ in range(count)
|
|
104
|
+
]
|
|
105
|
+
print(json.dumps(results, indent=2 if args.pretty else None))
|
|
106
|
+
elif args.format == 'csv':
|
|
107
|
+
print(data.users_to_csv(count, options if options else None))
|
|
108
|
+
elif args.format == 'flat':
|
|
109
|
+
rows = data.users_flat(count, options if options else None)
|
|
110
|
+
print(json.dumps(rows, indent=2 if args.pretty else None))
|
|
114
111
|
else:
|
|
115
|
-
|
|
112
|
+
if args.pretty:
|
|
113
|
+
print(data.users_to_json(count, options if options else None))
|
|
114
|
+
else:
|
|
115
|
+
print(json.dumps(data.users(count, options if options else None)))
|
|
116
|
+
return
|
|
117
|
+
|
|
118
|
+
# ── File: STREAMING — open file first, write one record at a time ──
|
|
119
|
+
out_path = os.path.abspath(args.output)
|
|
120
|
+
|
|
121
|
+
with open(out_path, 'w', encoding='utf-8') as f:
|
|
122
|
+
|
|
123
|
+
if args.timeseries:
|
|
124
|
+
f.write('[\n')
|
|
125
|
+
for i in range(count):
|
|
126
|
+
rec = data.user_time_series({**options, 'days': args.days, 'events_per_day': args.events_per_day})
|
|
127
|
+
line = json.dumps(rec, indent=2 if args.pretty else None)
|
|
128
|
+
f.write(line + (',' if i < count - 1 else '') + '\n')
|
|
129
|
+
if (i + 1) % PROGRESS_INTERVAL == 0:
|
|
130
|
+
print(f"\r ⏳ {i+1:,} / {count:,} written...", end='', file=sys.stderr)
|
|
131
|
+
f.write(']\n')
|
|
132
|
+
|
|
133
|
+
elif args.format == 'csv':
|
|
134
|
+
# Write header from first record
|
|
135
|
+
first = data.user(options if options else None)
|
|
136
|
+
header = ','.join(f'"{k}"' for k in first.keys())
|
|
137
|
+
f.write(header + '\n')
|
|
138
|
+
f.write(_user_to_csv_row(first) + '\n')
|
|
139
|
+
for i in range(1, count):
|
|
140
|
+
u = data.user(options if options else None)
|
|
141
|
+
f.write(_user_to_csv_row(u) + '\n')
|
|
142
|
+
if (i + 1) % PROGRESS_INTERVAL == 0:
|
|
143
|
+
print(f"\r ⏳ {i+1:,} / {count:,} written...", end='', file=sys.stderr)
|
|
144
|
+
|
|
145
|
+
else: # json / flat
|
|
146
|
+
f.write('[\n')
|
|
147
|
+
for i in range(count):
|
|
148
|
+
u = data.user(options if options else None)
|
|
149
|
+
line = json.dumps(u, indent=2 if args.pretty else None)
|
|
150
|
+
f.write(line + (',' if i < count - 1 else '') + '\n')
|
|
151
|
+
if (i + 1) % PROGRESS_INTERVAL == 0:
|
|
152
|
+
print(f"\r ⏳ {i+1:,} / {count:,} written...", end='', file=sys.stderr)
|
|
153
|
+
f.write(']\n')
|
|
116
154
|
|
|
117
155
|
elapsed = round(time.time() - start, 2)
|
|
118
|
-
|
|
119
|
-
if
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
156
|
+
size_bytes = os.path.getsize(out_path)
|
|
157
|
+
size_label = f"{size_bytes / 1048576:.1f} MB" if size_bytes >= 1048576 else f"{size_bytes / 1024:.1f} KB"
|
|
158
|
+
print('\r', end='', file=sys.stderr) # clear progress line
|
|
159
|
+
print(
|
|
160
|
+
f"✔ Done! Generated {count:,} users in {elapsed}s → {out_path} ({size_label})",
|
|
161
|
+
file=sys.stderr
|
|
162
|
+
)
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _user_to_csv_row(u):
|
|
166
|
+
"""Serialize a single user dict to a CSV row string."""
|
|
167
|
+
import json as _json
|
|
168
|
+
parts = []
|
|
169
|
+
for v in u.values():
|
|
170
|
+
if v is None:
|
|
171
|
+
parts.append('')
|
|
172
|
+
elif isinstance(v, (dict, list)):
|
|
173
|
+
parts.append('"' + _json.dumps(v).replace('"', '""') + '"')
|
|
174
|
+
elif isinstance(v, str):
|
|
175
|
+
parts.append('"' + v.replace('"', '""') + '"')
|
|
128
176
|
else:
|
|
129
|
-
|
|
177
|
+
parts.append(str(v))
|
|
178
|
+
return ','.join(parts)
|
|
130
179
|
|
|
131
180
|
|
|
132
181
|
if __name__ == '__main__':
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: fakedata-python
|
|
3
|
-
Version: 2.0.2
|
|
4
|
-
Summary: The fakedata package generates realistic
|
|
3
|
+
Version: 2.0.4
|
|
4
|
+
Summary: The fakedata package generates realistic user profiles for machine learning, deep learning, data analysis, and data science workflows.
|
|
5
5
|
Author-email: abhay557 <contact@abhaymourya.in>
|
|
6
6
|
License-Expression: MIT
|
|
7
7
|
Project-URL: Homepage, https://github.com/abhay557/fakedata
|
|
@@ -9,6 +9,8 @@ Classifier: Programming Language :: Python :: 3
|
|
|
9
9
|
Classifier: Operating System :: OS Independent
|
|
10
10
|
Requires-Python: >=3.7
|
|
11
11
|
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Dynamic: license-file
|
|
12
14
|
|
|
13
15
|
# fakedata
|
|
14
16
|
|
|
@@ -34,52 +36,56 @@ A high-performance, **zero-dependency** synthetic data generation engine, availa
|
|
|
34
36
|
- **Time-Series Data**: Generate chronological activity logs (logins, page views, purchases) per user for behavioral modeling.
|
|
35
37
|
- **Pipeline Ready**: Export directly to CSV, JSON, or Flat objects (perfect for `pandas.DataFrame`).
|
|
36
38
|
- **CLI Tool**: Generate and export datasets directly from your terminal — no scripting required.
|
|
39
|
+
- **Streaming Generation**: Files are written one record at a time — constant RAM usage regardless of dataset size. Generate 10M+ rows without running out of memory.
|
|
37
40
|
|
|
38
41
|
---
|
|
39
42
|
|
|
40
|
-
##
|
|
43
|
+
## Node.js / TypeScript Implementation
|
|
41
44
|
|
|
42
45
|
### Installation
|
|
43
46
|
```bash
|
|
44
|
-
|
|
47
|
+
npm install @abhay557/fakedata
|
|
45
48
|
```
|
|
46
49
|
|
|
47
50
|
### Quick Start
|
|
48
|
-
```
|
|
49
|
-
|
|
50
|
-
import pandas as pd
|
|
51
|
+
```javascript
|
|
52
|
+
const fakedata = require('@abhay557/fakedata');
|
|
51
53
|
|
|
52
|
-
|
|
53
|
-
users = data.users(
|
|
54
|
+
// Generate deterministic users with a 5% missing data rate (null injection)
|
|
55
|
+
const users = fakedata.data.users(1000, { seed: 42, missing_rate: 0.05 });
|
|
54
56
|
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
print(df.head())
|
|
57
|
+
// Export directly to CSV format
|
|
58
|
+
const csvString = fakedata.data.usersToCSV(1000, { seed: 42 });
|
|
58
59
|
|
|
59
|
-
|
|
60
|
-
ts =
|
|
61
|
-
|
|
60
|
+
// Time-series activity data
|
|
61
|
+
const ts = fakedata.userTimeSeries({ days: 30, eventsPerDay: 8 });
|
|
62
|
+
console.log(`Generated ${ts.activity.length} events for ${ts.user.fullName}`);
|
|
62
63
|
```
|
|
63
|
-
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
67
|
+
## Python Implementation
|
|
64
68
|
|
|
65
69
|
### Installation
|
|
66
70
|
```bash
|
|
67
|
-
|
|
71
|
+
pip install fakedata-python
|
|
68
72
|
```
|
|
69
73
|
|
|
70
74
|
### Quick Start
|
|
71
|
-
```
|
|
72
|
-
|
|
75
|
+
```python
|
|
76
|
+
import fakedata
|
|
77
|
+
import pandas as pd
|
|
73
78
|
|
|
74
|
-
|
|
75
|
-
|
|
79
|
+
# Generate 10,000 users with highly correlated attributes, reproducibly (fixed seed)
|
|
80
|
+
users = fakedata.data.users(10000, {"seed": 42})
|
|
76
81
|
|
|
77
|
-
|
|
78
|
-
|
|
82
|
+
# Or export directly to a Pandas DataFrame
|
|
83
|
+
df = pd.DataFrame(fakedata.data.users_flat(10000, {"seed": 42}))
|
|
84
|
+
print(df.head())
|
|
79
85
|
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
86
|
+
# Create time-series activity data
|
|
87
|
+
ts = fakedata.data.user_time_series({"days": 30, "events_per_day": 8})
|
|
88
|
+
print(f"Generated {len(ts['activity'])} events for {ts['user']['fullName']}")
|
|
83
89
|
```
|
|
84
90
|
|
|
85
91
|
---
|
|
@@ -133,6 +139,9 @@ fakedata generate -n 500 -l in --seed 42 -o india.json
|
|
|
133
139
|
# Fraud detection dataset with 5% anomalies
|
|
134
140
|
fakedata generate -n 10000 -a 0.05 -f csv -o fraud_data.csv
|
|
135
141
|
|
|
142
|
+
# Generate 1 million rows without running out of memory (streaming)
|
|
143
|
+
fakedata generate -n 1000000 -f csv -o big_dataset.csv
|
|
144
|
+
|
|
136
145
|
# Preview a single user in the console
|
|
137
146
|
fakedata preview
|
|
138
147
|
|
|
@@ -140,6 +149,23 @@ fakedata preview
|
|
|
140
149
|
fakedata generate -n 100 --timeseries --days 60 -o activity.json
|
|
141
150
|
```
|
|
142
151
|
|
|
152
|
+
### Streaming Architecture
|
|
153
|
+
|
|
154
|
+
When writing to a file (`-o`), the CLI uses a **streaming write** strategy:
|
|
155
|
+
|
|
156
|
+
- The output file is **created first**, before any data is generated.
|
|
157
|
+
- Each user is generated **one at a time** and written immediately to disk.
|
|
158
|
+
- The generated object is then **discarded** — it is never held in a large array.
|
|
159
|
+
- **RAM usage stays constant** (O(1)) regardless of how many records you generate.
|
|
160
|
+
- A live progress counter is printed every 10,000 records for large jobs.
|
|
161
|
+
|
|
162
|
+
This means you can generate **tens of millions of rows** without hitting Node.js heap limits or Python memory errors.
|
|
163
|
+
|
|
164
|
+
```
|
|
165
|
+
Before (old): generate ALL → hold in RAM → write to file ❌ OOM at ~500k rows
|
|
166
|
+
After (new): open file → generate 1 → write → discard → repeat ✅ unlimited
|
|
167
|
+
```
|
|
168
|
+
|
|
143
169
|
---
|
|
144
170
|
### Sample output: one user
|
|
145
171
|
```fakedata.data.user()```
|
|
@@ -432,3 +458,21 @@ Distributed under the **MIT License**. See `LICENSE` for more information.
|
|
|
432
458
|
**Maintainer**: [abhay557](https://github.com/abhay557)
|
|
433
459
|
|
|
434
460
|
- Project Commit History - `https://github.com/abhay557/random-api.xyz`
|
|
461
|
+
|
|
462
|
+
---
|
|
463
|
+
|
|
464
|
+
## Contributing
|
|
465
|
+
|
|
466
|
+
Contributions are welcome! Whether it's a bug fix, a new feature, or improved docs — every bit helps.
|
|
467
|
+
|
|
468
|
+
- Read the [Contributing Guide](./CONTRIBUTING.md) before submitting a PR.
|
|
469
|
+
- Use the [Bug Report](https://github.com/abhay557/fakedata/issues/new?template=bug_report.md) template to report issues.
|
|
470
|
+
- Use the [Feature Request](https://github.com/abhay557/fakedata/issues/new?template=feature_request.md) template to suggest ideas.
|
|
471
|
+
- Please follow our [Code of Conduct](./CODE_OF_CONDUCT.md) in all interactions.
|
|
472
|
+
|
|
473
|
+
```bash
|
|
474
|
+
# Fork the repo, then:
|
|
475
|
+
git clone https://github.com/YOUR_USERNAME/fakedata.git && cd fakedata
|
|
476
|
+
git checkout -b feature/my-improvement
|
|
477
|
+
# Make your changes, then open a Pull Request!
|
|
478
|
+
```
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
fakedata/__init__.py,sha256=PXwXDWU2HFUfAF2zFMrxsJ7BvP5RSpTbF0GvxWCTt3g,93
|
|
2
|
-
fakedata/cli.py,sha256=
|
|
2
|
+
fakedata/cli.py,sha256=UZ0zks_zRT3T5vkvXCWOqZ1qy9q35rx0AbDpFYZOwh0,8288
|
|
3
3
|
fakedata/core.py,sha256=ZiZ51aZ3cAG7n02Giliq0XO5nN-bbjLJWM3pZQ6gWT4,437
|
|
4
4
|
fakedata/test_python.py,sha256=UpfmArMkM7bcRkV_MTed0vZ1QXokOAZqazcfISlLmZA,3226
|
|
5
5
|
fakedata/helpers/cardtype.json,sha256=3Ij5N_QPCO1Xg6g7jTp759yOVF9scXkbBDZvWYRaSAM,201
|
|
@@ -26,8 +26,9 @@ fakedata/helpers/street.json,sha256=Z-1cRr7uGMXBqlPoqoedPagfx_hLXqbWDNoylcnS8L0,
|
|
|
26
26
|
fakedata/helpers/universities.json,sha256=7NHac5anNxnCVSj6kdFc87896S2A3J-zT58Yw4lzkaQ,230421
|
|
27
27
|
fakedata/modules/__init__.py,sha256=buFp940xk9V39VnBFIca5ADTEtX8qsKz7_VQC3102tI,19
|
|
28
28
|
fakedata/modules/data.py,sha256=8MHRV5AtbzLOCBRKUP3E-dHc8nIQ3VuP-Xyk7c_-Eog,52542
|
|
29
|
-
fakedata_python-2.0.
|
|
30
|
-
fakedata_python-2.0.
|
|
31
|
-
fakedata_python-2.0.
|
|
32
|
-
fakedata_python-2.0.
|
|
33
|
-
fakedata_python-2.0.
|
|
29
|
+
fakedata_python-2.0.4.dist-info/licenses/LICENSE,sha256=uOpwjvuc2Qd4UOuj8ZcDb3FImq0iOK1JzWN4gbOqBVU,1090
|
|
30
|
+
fakedata_python-2.0.4.dist-info/METADATA,sha256=7nU8XHJgskgVBUdvrSkVaKoWPmkXFjMDzDJhogHQbgQ,18229
|
|
31
|
+
fakedata_python-2.0.4.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
32
|
+
fakedata_python-2.0.4.dist-info/entry_points.txt,sha256=qLOKT1Qujc8-qppTaDO2GUWcuoUQR9fSID3qvIaEAPo,47
|
|
33
|
+
fakedata_python-2.0.4.dist-info/top_level.txt,sha256=SHFa_6848yAE45QgW-PX_DHp_nakY64Zs_t2NobLcn0,9
|
|
34
|
+
fakedata_python-2.0.4.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Abhay Mourya
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
File without changes
|
|
File without changes
|
|
File without changes
|