fakedata-python 2.0.4__tar.gz → 2.0.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {fakedata_python-2.0.4 → fakedata_python-2.0.8}/MANIFEST.in +2 -0
- {fakedata_python-2.0.4/fakedata_python.egg-info → fakedata_python-2.0.8}/PKG-INFO +41 -21
- {fakedata_python-2.0.4 → fakedata_python-2.0.8}/README.md +40 -20
- fakedata_python-2.0.8/fakedata/__init__.py +7 -0
- {fakedata_python-2.0.4 → fakedata_python-2.0.8}/fakedata/modules/data.py +174 -0
- {fakedata_python-2.0.4 → fakedata_python-2.0.8/fakedata_python.egg-info}/PKG-INFO +41 -21
- {fakedata_python-2.0.4 → fakedata_python-2.0.8}/pyproject.toml +1 -1
- fakedata_python-2.0.4/fakedata/__init__.py +0 -6
- {fakedata_python-2.0.4 → fakedata_python-2.0.8}/LICENSE +0 -0
- {fakedata_python-2.0.4 → fakedata_python-2.0.8}/fakedata/cli.py +0 -0
- {fakedata_python-2.0.4 → fakedata_python-2.0.8}/fakedata/core.py +0 -0
- {fakedata_python-2.0.4 → fakedata_python-2.0.8}/fakedata/helpers/cardtype.json +0 -0
- {fakedata_python-2.0.4 → fakedata_python-2.0.8}/fakedata/helpers/companies.json +0 -0
- {fakedata_python-2.0.4 → fakedata_python-2.0.8}/fakedata/helpers/countries.json +0 -0
- {fakedata_python-2.0.4 → fakedata_python-2.0.8}/fakedata/helpers/devices.json +0 -0
- {fakedata_python-2.0.4 → fakedata_python-2.0.8}/fakedata/helpers/domain.json +0 -0
- {fakedata_python-2.0.4 → fakedata_python-2.0.8}/fakedata/helpers/email.json +0 -0
- {fakedata_python-2.0.4 → fakedata_python-2.0.8}/fakedata/helpers/first.json +0 -0
- {fakedata_python-2.0.4 → fakedata_python-2.0.8}/fakedata/helpers/healthcare.json +0 -0
- {fakedata_python-2.0.4 → fakedata_python-2.0.8}/fakedata/helpers/hobbies.json +0 -0
- {fakedata_python-2.0.4 → fakedata_python-2.0.8}/fakedata/helpers/industries.json +0 -0
- {fakedata_python-2.0.4 → fakedata_python-2.0.8}/fakedata/helpers/job_categories.json +0 -0
- {fakedata_python-2.0.4 → fakedata_python-2.0.8}/fakedata/helpers/job_titles.json +0 -0
- {fakedata_python-2.0.4 → fakedata_python-2.0.8}/fakedata/helpers/last.json +0 -0
- {fakedata_python-2.0.4 → fakedata_python-2.0.8}/fakedata/helpers/locales.json +0 -0
- {fakedata_python-2.0.4 → fakedata_python-2.0.8}/fakedata/helpers/middle.json +0 -0
- {fakedata_python-2.0.4 → fakedata_python-2.0.8}/fakedata/helpers/occupation.json +0 -0
- {fakedata_python-2.0.4 → fakedata_python-2.0.8}/fakedata/helpers/salary_ranges.json +0 -0
- {fakedata_python-2.0.4 → fakedata_python-2.0.8}/fakedata/helpers/shortformstate.json +0 -0
- {fakedata_python-2.0.4 → fakedata_python-2.0.8}/fakedata/helpers/state.json +0 -0
- {fakedata_python-2.0.4 → fakedata_python-2.0.8}/fakedata/helpers/states.json +0 -0
- {fakedata_python-2.0.4 → fakedata_python-2.0.8}/fakedata/helpers/street.json +0 -0
- {fakedata_python-2.0.4 → fakedata_python-2.0.8}/fakedata/helpers/universities.json +0 -0
- {fakedata_python-2.0.4 → fakedata_python-2.0.8}/fakedata/modules/__init__.py +0 -0
- {fakedata_python-2.0.4 → fakedata_python-2.0.8}/fakedata/test_python.py +0 -0
- {fakedata_python-2.0.4 → fakedata_python-2.0.8}/fakedata_python.egg-info/SOURCES.txt +0 -0
- {fakedata_python-2.0.4 → fakedata_python-2.0.8}/fakedata_python.egg-info/dependency_links.txt +0 -0
- {fakedata_python-2.0.4 → fakedata_python-2.0.8}/fakedata_python.egg-info/entry_points.txt +0 -0
- {fakedata_python-2.0.4 → fakedata_python-2.0.8}/fakedata_python.egg-info/top_level.txt +0 -0
- {fakedata_python-2.0.4 → fakedata_python-2.0.8}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: fakedata-python
|
|
3
|
-
Version: 2.0.4
|
|
3
|
+
Version: 2.0.8
|
|
4
4
|
Summary: The fakedata package generates realistic user profiles for machine learning, deep learning, data analysis, and data science workflows.
|
|
5
5
|
Author-email: abhay557 <contact@abhaymourya.in>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -62,6 +62,26 @@ const ts = fakedata.userTimeSeries({ days: 30, eventsPerDay: 8 });
|
|
|
62
62
|
console.log(`Generated ${ts.activity.length} events for ${ts.user.fullName}`);
|
|
63
63
|
```
|
|
64
64
|
|
|
65
|
+
### Streaming API & Custom Correlations
|
|
66
|
+
Generate unlimited data directly to disk while keeping memory at O(1), and force mathematical relationships between fields using the Pearson Correlation API:
|
|
67
|
+
|
|
68
|
+
```javascript
|
|
69
|
+
const fs = require('fs');
|
|
70
|
+
const fakedata = require('@abhay557/fakedata');
|
|
71
|
+
|
|
72
|
+
// Create a stream that emits 1 million users as CSV
|
|
73
|
+
const stream = fakedata.data.generateStream(1000000, {
|
|
74
|
+
format: 'csv',
|
|
75
|
+
correlations: [
|
|
76
|
+
{ fieldA: 'education.level', fieldB: 'financial.annualIncome', pearson_coeff: 0.85 },
|
|
77
|
+
{ fieldA: 'health.bmi', fieldB: 'health.bloodPressure.systolic', pearson_coeff: 0.60 }
|
|
78
|
+
]
|
|
79
|
+
});
|
|
80
|
+
|
|
81
|
+
// Pipe directly to file (constant RAM usage)
|
|
82
|
+
stream.pipe(fs.createWriteStream('1m_dataset.csv'));
|
|
83
|
+
```
|
|
84
|
+
|
|
65
85
|
---
|
|
66
86
|
|
|
67
87
|
## Python Implementation
|
|
@@ -88,6 +108,26 @@ ts = fakedata.data.user_time_series({"days": 30, "events_per_day": 8})
|
|
|
88
108
|
print(f"Generated {len(ts['activity'])} events for {ts['user']['fullName']}")
|
|
89
109
|
```
|
|
90
110
|
|
|
111
|
+
### Streaming API & Custom Correlations
|
|
112
|
+
Generate unlimited data lazily, keeping memory footprint at O(1), and force mathematical relationships between fields using the Pearson Correlation API:
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
import fakedata
|
|
116
|
+
|
|
117
|
+
# Create a lazy generator that yields 1 million users
|
|
118
|
+
stream = fakedata.generate_stream(1000000, {
|
|
119
|
+
"correlations": [
|
|
120
|
+
{"fieldA": "education.level", "fieldB": "financial.annualIncome", "pearson_coeff": 0.85},
|
|
121
|
+
{"fieldA": "health.bmi", "fieldB": "health.bloodPressure.systolic", "pearson_coeff": 0.60}
|
|
122
|
+
]
|
|
123
|
+
})
|
|
124
|
+
|
|
125
|
+
# Process users one by one without blowing up RAM
|
|
126
|
+
for user in stream:
|
|
127
|
+
# write to DB, serialize to file, or process
|
|
128
|
+
pass
|
|
129
|
+
```
|
|
130
|
+
|
|
91
131
|
---
|
|
92
132
|
|
|
93
133
|
## CLI — Command Line Interface
|
|
@@ -161,10 +201,6 @@ When writing to a file (`-o`), the CLI uses a **streaming write** strategy:
|
|
|
161
201
|
|
|
162
202
|
This means you can generate **tens of millions of rows** without hitting Node.js heap limits or Python memory errors.
|
|
163
203
|
|
|
164
|
-
```
|
|
165
|
-
Before (old): generate ALL → hold in RAM → write to file ❌ OOM at ~500k rows
|
|
166
|
-
After (new): open file → generate 1 → write → discard → repeat ✅ unlimited
|
|
167
|
-
```
|
|
168
204
|
|
|
169
205
|
---
|
|
170
206
|
### sample output - one user
|
|
@@ -460,19 +496,3 @@ Distributed under the **MIT License**. See `LICENSE` for more information.
|
|
|
460
496
|
- Project Commit History - `https://github.com/abhay557/random-api.xyz`
|
|
461
497
|
|
|
462
498
|
---
|
|
463
|
-
|
|
464
|
-
## Contributing
|
|
465
|
-
|
|
466
|
-
Contributions are welcome! Whether it's a bug fix, a new feature, or improved docs — every bit helps.
|
|
467
|
-
|
|
468
|
-
- Read the [Contributing Guide](./CONTRIBUTING.md) before submitting a PR.
|
|
469
|
-
- Use the [Bug Report](https://github.com/abhay557/fakedata/issues/new?template=bug_report.md) template to report issues.
|
|
470
|
-
- Use the [Feature Request](https://github.com/abhay557/fakedata/issues/new?template=feature_request.md) template to suggest ideas.
|
|
471
|
-
- Please follow our [Code of Conduct](./CODE_OF_CONDUCT.md) in all interactions.
|
|
472
|
-
|
|
473
|
-
```bash
|
|
474
|
-
# Fork the repo, then:
|
|
475
|
-
git clone https://github.com/YOUR_USERNAME/fakedata.git
|
|
476
|
-
git checkout -b feature/my-improvement
|
|
477
|
-
# Make your changes, then open a Pull Request!
|
|
478
|
-
```
|
|
@@ -48,6 +48,26 @@ const ts = fakedata.userTimeSeries({ days: 30, eventsPerDay: 8 });
|
|
|
48
48
|
console.log(`Generated ${ts.activity.length} events for ${ts.user.fullName}`);
|
|
49
49
|
```
|
|
50
50
|
|
|
51
|
+
### Streaming API & Custom Correlations
|
|
52
|
+
Generate unlimited data directly to disk while keeping memory at O(1), and force mathematical relationships between fields using the Pearson Correlation API:
|
|
53
|
+
|
|
54
|
+
```javascript
|
|
55
|
+
const fs = require('fs');
|
|
56
|
+
const fakedata = require('@abhay557/fakedata');
|
|
57
|
+
|
|
58
|
+
// Create a stream that emits 1 million users as CSV
|
|
59
|
+
const stream = fakedata.data.generateStream(1000000, {
|
|
60
|
+
format: 'csv',
|
|
61
|
+
correlations: [
|
|
62
|
+
{ fieldA: 'education.level', fieldB: 'financial.annualIncome', pearson_coeff: 0.85 },
|
|
63
|
+
{ fieldA: 'health.bmi', fieldB: 'health.bloodPressure.systolic', pearson_coeff: 0.60 }
|
|
64
|
+
]
|
|
65
|
+
});
|
|
66
|
+
|
|
67
|
+
// Pipe directly to file (constant RAM usage)
|
|
68
|
+
stream.pipe(fs.createWriteStream('1m_dataset.csv'));
|
|
69
|
+
```
|
|
70
|
+
|
|
51
71
|
---
|
|
52
72
|
|
|
53
73
|
## Python Implementation
|
|
@@ -74,6 +94,26 @@ ts = fakedata.data.user_time_series({"days": 30, "events_per_day": 8})
|
|
|
74
94
|
print(f"Generated {len(ts['activity'])} events for {ts['user']['fullName']}")
|
|
75
95
|
```
|
|
76
96
|
|
|
97
|
+
### Streaming API & Custom Correlations
|
|
98
|
+
Generate unlimited data lazily, keeping memory footprint at O(1), and force mathematical relationships between fields using the Pearson Correlation API:
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
import fakedata
|
|
102
|
+
|
|
103
|
+
# Create a lazy generator that yields 1 million users
|
|
104
|
+
stream = fakedata.generate_stream(1000000, {
|
|
105
|
+
"correlations": [
|
|
106
|
+
{"fieldA": "education.level", "fieldB": "financial.annualIncome", "pearson_coeff": 0.85},
|
|
107
|
+
{"fieldA": "health.bmi", "fieldB": "health.bloodPressure.systolic", "pearson_coeff": 0.60}
|
|
108
|
+
]
|
|
109
|
+
})
|
|
110
|
+
|
|
111
|
+
# Process users one by one without blowing up RAM
|
|
112
|
+
for user in stream:
|
|
113
|
+
# write to DB, serialize to file, or process
|
|
114
|
+
pass
|
|
115
|
+
```
|
|
116
|
+
|
|
77
117
|
---
|
|
78
118
|
|
|
79
119
|
## CLI — Command Line Interface
|
|
@@ -147,10 +187,6 @@ When writing to a file (`-o`), the CLI uses a **streaming write** strategy:
|
|
|
147
187
|
|
|
148
188
|
This means you can generate **tens of millions of rows** without hitting Node.js heap limits or Python memory errors.
|
|
149
189
|
|
|
150
|
-
```
|
|
151
|
-
Before (old): generate ALL → hold in RAM → write to file ❌ OOM at ~500k rows
|
|
152
|
-
After (new): open file → generate 1 → write → discard → repeat ✅ unlimited
|
|
153
|
-
```
|
|
154
190
|
|
|
155
191
|
---
|
|
156
192
|
### sample output - one user
|
|
@@ -446,19 +482,3 @@ Distributed under the **MIT License**. See `LICENSE` for more information.
|
|
|
446
482
|
- Project Commit History - `https://github.com/abhay557/random-api.xyz`
|
|
447
483
|
|
|
448
484
|
---
|
|
449
|
-
|
|
450
|
-
## Contributing
|
|
451
|
-
|
|
452
|
-
Contributions are welcome! Whether it's a bug fix, a new feature, or improved docs — every bit helps.
|
|
453
|
-
|
|
454
|
-
- Read the [Contributing Guide](./CONTRIBUTING.md) before submitting a PR.
|
|
455
|
-
- Use the [Bug Report](https://github.com/abhay557/fakedata/issues/new?template=bug_report.md) template to report issues.
|
|
456
|
-
- Use the [Feature Request](https://github.com/abhay557/fakedata/issues/new?template=feature_request.md) template to suggest ideas.
|
|
457
|
-
- Please follow our [Code of Conduct](./CODE_OF_CONDUCT.md) in all interactions.
|
|
458
|
-
|
|
459
|
-
```bash
|
|
460
|
-
# Fork the repo, then:
|
|
461
|
-
git clone https://github.com/YOUR_USERNAME/fakedata.git
|
|
462
|
-
git checkout -b feature/my-improvement
|
|
463
|
-
# Make your changes, then open a Pull Request!
|
|
464
|
-
```
|
|
@@ -1364,3 +1364,177 @@ def biodata(count=10):
|
|
|
1364
1364
|
})
|
|
1365
1365
|
biodatas.append(bio)
|
|
1366
1366
|
return biodatas
|
|
1367
|
+
|
|
1368
|
+
|
|
1369
|
+
# ─── Custom Correlation Engine ───────────────────────────────────────────────
|
|
1370
|
+
|
|
1371
|
+
def _get_path(obj: dict, path: str):
|
|
1372
|
+
"""
|
|
1373
|
+
Resolves a dot-separated path on a dict and returns the numeric value.
|
|
1374
|
+
Returns None if the path doesn't exist or the value is non-numeric.
|
|
1375
|
+
|
|
1376
|
+
Example: _get_path(user, 'financial.annualIncome')
|
|
1377
|
+
"""
|
|
1378
|
+
parts = path.split('.')
|
|
1379
|
+
cur = obj
|
|
1380
|
+
for p in parts:
|
|
1381
|
+
if not isinstance(cur, dict) or p not in cur:
|
|
1382
|
+
return None
|
|
1383
|
+
cur = cur[p]
|
|
1384
|
+
return cur if isinstance(cur, (int, float)) else None
|
|
1385
|
+
|
|
1386
|
+
|
|
1387
|
+
def _set_path(obj: dict, path: str, value) -> None:
|
|
1388
|
+
"""Sets a dot-separated path on a dict to a new value (mutates dict)."""
|
|
1389
|
+
parts = path.split('.')
|
|
1390
|
+
cur = obj
|
|
1391
|
+
for p in parts[:-1]:
|
|
1392
|
+
cur = cur[p]
|
|
1393
|
+
cur[parts[-1]] = value
|
|
1394
|
+
|
|
1395
|
+
|
|
1396
|
+
def apply_custom_correlations(user: dict, correlations: list) -> dict:
    """
    Nudge numeric field B of a user record toward a requested correlation with field A.

    Algorithm (per-record heuristic):
        z_A = (a - meanA) / stdA                      (when the spec gives meanA and stdA)
        B'  = r * tanh(z_A) + sqrt(1 - r²) * N(0,1)
        B   = B + B' * |B| * 0.2

    This works on one record at a time, so the realized sample correlation
    over many records is not guaranteed to equal pearson_coeff.

    Args:
        user:         A fully generated user dict (mutated in place).
        correlations: List of dicts with keys:
            fieldA        (str)   – dot path of the driving numeric field.
            fieldB        (str)   – dot path of the numeric field to adjust.
            pearson_coeff (float) – target coefficient in [-1.0, 1.0].
            meanA, stdA   (float, optional) – reference mean and standard
                deviation of fieldA (stdA > 0), used to compute a real z-score.
        Example:
            [
                { "fieldA": "education.level",
                  "fieldB": "financial.annualIncome",
                  "pearson_coeff": 0.85 },
                { "fieldA": "health.bmi",
                  "fieldB": "health.bloodPressure.systolic",
                  "pearson_coeff": 0.60,
                  "meanA": 25.0, "stdA": 4.0 }
            ]

    Returns:
        The mutated user dict.

    Note:
        - fieldA is used only as a *signal*; it is never modified.
        - Specs with a missing/non-string field name, a non-numeric or NaN
          pearson_coeff, or |pearson_coeff| > 1 are silently skipped, as are
          paths that do not resolve to a numeric value.
        - Without valid meanA/stdA the legacy approximation is used. It
          evaluates to ~1 for any non-zero A (0 for A == 0), so A's magnitude
          does not influence B in that mode; supply meanA/stdA to make B
          actually depend on A.
        - One random.gauss() draw is consumed per applied spec.
    """
    if not correlations:
        return user

    for spec in correlations:
        field_a = spec.get('fieldA')
        field_b = spec.get('fieldB')
        r = spec.get('pearson_coeff')

        # Malformed specs are skipped rather than raising mid-stream.
        if not isinstance(field_a, str) or not isinstance(field_b, str):
            continue
        if not isinstance(r, (int, float)) or math.isnan(r) or abs(r) > 1.0:
            continue

        val_a = _get_path(user, field_a)
        val_b = _get_path(user, field_b)

        if val_a is None or val_b is None:
            continue

        mean_a = spec.get('meanA')
        std_a = spec.get('stdA')
        has_reference = (
            isinstance(mean_a, (int, float))
            and isinstance(std_a, (int, float))
            and math.isfinite(mean_a)
            and math.isfinite(std_a)
            and std_a > 0
        )

        if has_reference:
            # True standard score of A against the caller-supplied distribution.
            z_a = (val_a - mean_a) / std_a
        else:
            # Legacy unit-free approximation, kept so existing specs produce
            # the same output. Guard the one input where the denominator
            # cancels to exactly zero.
            half = val_a * 0.5
            denom = half + 1e-9
            z_a = (val_a - half) / denom if denom != 0 else 0.0

        # Correlated component plus independent noise; tanh bounds the signal to (-1, 1).
        independent_noise = random.gauss(0, 1)
        z_b_corr = r * math.tanh(z_a) + math.sqrt(max(0, 1 - r * r)) * independent_noise

        # Nudge B proportionally to its own magnitude.
        nudge = z_b_corr * abs(val_b) * 0.2
        new_val_b = val_b + nudge

        # Preserve int vs float
        if isinstance(val_b, int):
            _set_path(user, field_b, int(round(new_val_b)))
        else:
            _set_path(user, field_b, round(new_val_b, 4))

    return user
|
|
1460
|
+
|
|
1461
|
+
|
|
1462
|
+
# ─── Streaming API ───────────────────────────────────────────────────────────
|
|
1463
|
+
|
|
1464
|
+
def generate_stream(count: int = 1000, options: dict = None):
    """
    A lazy generator that yields one user dict at a time.
    Only one record is built per iteration and none are accumulated here, so
    memory held by this function does not grow with count.

    Args:
        count: Total number of users to generate.
        options: Dictionary of options (same as users() plus extras below):
            seed          (int)   – Seed for reproducibility.
            schema        (dict)  – Schema constraints.
            locale        (str)   – Locale code: 'en','in','jp','kr','de','br','ar','fr'.
            missing_rate  (float) – Probability (0-1) each field becomes None.
            anomaly_rate  (float) – Fraction of users to inject anomalies.
            correlations  (list)  – Custom Pearson correlation specs (see apply_custom_correlations).
          A value of None for missing_rate, anomaly_rate or correlations is
          treated the same as leaving the key out.

    Yields:
        dict – One user profile per iteration.

    Note:
        When a seed is given, the module-level `random` generator is seeded
        on the first iteration and re-seeded from system entropy when the
        stream finishes, is closed, or is abandoned after an early `break`.
        Other code that draws from `random` while the stream is being
        consumed will interleave with (and change) the seeded sequence.

    Examples:
        # Pipe to CSV without loading all data into memory
        import csv
        with open('dataset.csv', 'w', newline='') as f:
            writer = None
            for user in data.generate_stream(1_000_000, {'seed': 42}):
                flat = flatten_object(user)
                if writer is None:
                    writer = csv.DictWriter(f, fieldnames=flat.keys())
                    writer.writeheader()
                writer.writerow(flat)

        # Custom Pearson correlations
        for user in data.generate_stream(500, {
            'correlations': [
                {'fieldA': 'education.level', 'fieldB': 'financial.annualIncome', 'pearson_coeff': 0.85},
                {'fieldA': 'health.bmi', 'fieldB': 'health.bloodPressure.systolic', 'pearson_coeff': 0.60},
            ]
        }):
            print(user['fullName'], user['financial']['annualIncome'])
    """
    if options is None:
        options = {}

    seed = options.get('seed')
    schema = options.get('schema')
    locale = options.get('locale')
    # `or` also covers keys that are present but explicitly set to None.
    missing_rate = options.get('missing_rate') or 0
    anomaly_rate = options.get('anomaly_rate') or 0
    correlations = options.get('correlations') or []

    if seed is not None:
        random.seed(seed)

    try:
        for i in range(count):
            u = generate_single_user(i + 1, schema=schema, locale=locale)

            if missing_rate > 0:
                u = apply_missing_data(u, missing_rate)

            # Per-record anomaly injection; every record gets an `_anomaly`
            # marker whenever anomaly injection is enabled.
            if anomaly_rate > 0:
                if random.random() < anomaly_rate:
                    weights = [a['weight'] for a in ANOMALY_TYPES_PY]
                    idx = weighted_random(weights)
                    anomaly = ANOMALY_TYPES_PY[idx]
                    anomaly['apply'](u)
                    u['_anomaly'] = {'isAnomaly': True, 'type': anomaly['type']}
                else:
                    u['_anomaly'] = {'isAnomaly': False, 'type': None}

            # Custom correlations
            if correlations:
                apply_custom_correlations(u, correlations)

            yield u
    finally:
        # Runs on normal exhaustion AND when the consumer stops early
        # (generator close / garbage collection), so a seeded global RNG is
        # never left behind for unrelated code.
        if seed is not None:
            random.seed()  # Reset to system entropy
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: fakedata-python
|
|
3
|
-
Version: 2.0.4
|
|
3
|
+
Version: 2.0.8
|
|
4
4
|
Summary: The fakedata package generates realistic user profiles for machine learning, deep learning, data analysis, and data science workflows.
|
|
5
5
|
Author-email: abhay557 <contact@abhaymourya.in>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -62,6 +62,26 @@ const ts = fakedata.userTimeSeries({ days: 30, eventsPerDay: 8 });
|
|
|
62
62
|
console.log(`Generated ${ts.activity.length} events for ${ts.user.fullName}`);
|
|
63
63
|
```
|
|
64
64
|
|
|
65
|
+
### Streaming API & Custom Correlations
|
|
66
|
+
Generate unlimited data directly to disk while keeping memory at O(1), and force mathematical relationships between fields using the Pearson Correlation API:
|
|
67
|
+
|
|
68
|
+
```javascript
|
|
69
|
+
const fs = require('fs');
|
|
70
|
+
const fakedata = require('@abhay557/fakedata');
|
|
71
|
+
|
|
72
|
+
// Create a stream that emits 1 million users as CSV
|
|
73
|
+
const stream = fakedata.data.generateStream(1000000, {
|
|
74
|
+
format: 'csv',
|
|
75
|
+
correlations: [
|
|
76
|
+
{ fieldA: 'education.level', fieldB: 'financial.annualIncome', pearson_coeff: 0.85 },
|
|
77
|
+
{ fieldA: 'health.bmi', fieldB: 'health.bloodPressure.systolic', pearson_coeff: 0.60 }
|
|
78
|
+
]
|
|
79
|
+
});
|
|
80
|
+
|
|
81
|
+
// Pipe directly to file (constant RAM usage)
|
|
82
|
+
stream.pipe(fs.createWriteStream('1m_dataset.csv'));
|
|
83
|
+
```
|
|
84
|
+
|
|
65
85
|
---
|
|
66
86
|
|
|
67
87
|
## Python Implementation
|
|
@@ -88,6 +108,26 @@ ts = fakedata.data.user_time_series({"days": 30, "events_per_day": 8})
|
|
|
88
108
|
print(f"Generated {len(ts['activity'])} events for {ts['user']['fullName']}")
|
|
89
109
|
```
|
|
90
110
|
|
|
111
|
+
### Streaming API & Custom Correlations
|
|
112
|
+
Generate unlimited data lazily, keeping memory footprint at O(1), and force mathematical relationships between fields using the Pearson Correlation API:
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
import fakedata
|
|
116
|
+
|
|
117
|
+
# Create a lazy generator that yields 1 million users
|
|
118
|
+
stream = fakedata.generate_stream(1000000, {
|
|
119
|
+
"correlations": [
|
|
120
|
+
{"fieldA": "education.level", "fieldB": "financial.annualIncome", "pearson_coeff": 0.85},
|
|
121
|
+
{"fieldA": "health.bmi", "fieldB": "health.bloodPressure.systolic", "pearson_coeff": 0.60}
|
|
122
|
+
]
|
|
123
|
+
})
|
|
124
|
+
|
|
125
|
+
# Process users one by one without blowing up RAM
|
|
126
|
+
for user in stream:
|
|
127
|
+
# write to DB, serialize to file, or process
|
|
128
|
+
pass
|
|
129
|
+
```
|
|
130
|
+
|
|
91
131
|
---
|
|
92
132
|
|
|
93
133
|
## CLI — Command Line Interface
|
|
@@ -161,10 +201,6 @@ When writing to a file (`-o`), the CLI uses a **streaming write** strategy:
|
|
|
161
201
|
|
|
162
202
|
This means you can generate **tens of millions of rows** without hitting Node.js heap limits or Python memory errors.
|
|
163
203
|
|
|
164
|
-
```
|
|
165
|
-
Before (old): generate ALL → hold in RAM → write to file ❌ OOM at ~500k rows
|
|
166
|
-
After (new): open file → generate 1 → write → discard → repeat ✅ unlimited
|
|
167
|
-
```
|
|
168
204
|
|
|
169
205
|
---
|
|
170
206
|
### sample output - one user
|
|
@@ -460,19 +496,3 @@ Distributed under the **MIT License**. See `LICENSE` for more information.
|
|
|
460
496
|
- Project Commit History - `https://github.com/abhay557/random-api.xyz`
|
|
461
497
|
|
|
462
498
|
---
|
|
463
|
-
|
|
464
|
-
## Contributing
|
|
465
|
-
|
|
466
|
-
Contributions are welcome! Whether it's a bug fix, a new feature, or improved docs — every bit helps.
|
|
467
|
-
|
|
468
|
-
- Read the [Contributing Guide](./CONTRIBUTING.md) before submitting a PR.
|
|
469
|
-
- Use the [Bug Report](https://github.com/abhay557/fakedata/issues/new?template=bug_report.md) template to report issues.
|
|
470
|
-
- Use the [Feature Request](https://github.com/abhay557/fakedata/issues/new?template=feature_request.md) template to suggest ideas.
|
|
471
|
-
- Please follow our [Code of Conduct](./CODE_OF_CONDUCT.md) in all interactions.
|
|
472
|
-
|
|
473
|
-
```bash
|
|
474
|
-
# Fork the repo, then:
|
|
475
|
-
git clone https://github.com/YOUR_USERNAME/fakedata.git
|
|
476
|
-
git checkout -b feature/my-improvement
|
|
477
|
-
# Make your changes, then open a Pull Request!
|
|
478
|
-
```
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{fakedata_python-2.0.4 → fakedata_python-2.0.8}/fakedata_python.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|