fakedata-python 2.0.5__tar.gz → 2.0.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. {fakedata_python-2.0.5 → fakedata_python-2.0.8}/MANIFEST.in +2 -0
  2. {fakedata_python-2.0.5/fakedata_python.egg-info → fakedata_python-2.0.8}/PKG-INFO +60 -24
  3. {fakedata_python-2.0.5 → fakedata_python-2.0.8}/README.md +59 -23
  4. fakedata_python-2.0.8/fakedata/__init__.py +7 -0
  5. {fakedata_python-2.0.5 → fakedata_python-2.0.8}/fakedata/modules/data.py +174 -0
  6. {fakedata_python-2.0.5 → fakedata_python-2.0.8/fakedata_python.egg-info}/PKG-INFO +60 -24
  7. {fakedata_python-2.0.5 → fakedata_python-2.0.8}/pyproject.toml +1 -1
  8. fakedata_python-2.0.5/fakedata/__init__.py +0 -6
  9. {fakedata_python-2.0.5 → fakedata_python-2.0.8}/LICENSE +0 -0
  10. {fakedata_python-2.0.5 → fakedata_python-2.0.8}/fakedata/cli.py +0 -0
  11. {fakedata_python-2.0.5 → fakedata_python-2.0.8}/fakedata/core.py +0 -0
  12. {fakedata_python-2.0.5 → fakedata_python-2.0.8}/fakedata/helpers/cardtype.json +0 -0
  13. {fakedata_python-2.0.5 → fakedata_python-2.0.8}/fakedata/helpers/companies.json +0 -0
  14. {fakedata_python-2.0.5 → fakedata_python-2.0.8}/fakedata/helpers/countries.json +0 -0
  15. {fakedata_python-2.0.5 → fakedata_python-2.0.8}/fakedata/helpers/devices.json +0 -0
  16. {fakedata_python-2.0.5 → fakedata_python-2.0.8}/fakedata/helpers/domain.json +0 -0
  17. {fakedata_python-2.0.5 → fakedata_python-2.0.8}/fakedata/helpers/email.json +0 -0
  18. {fakedata_python-2.0.5 → fakedata_python-2.0.8}/fakedata/helpers/first.json +0 -0
  19. {fakedata_python-2.0.5 → fakedata_python-2.0.8}/fakedata/helpers/healthcare.json +0 -0
  20. {fakedata_python-2.0.5 → fakedata_python-2.0.8}/fakedata/helpers/hobbies.json +0 -0
  21. {fakedata_python-2.0.5 → fakedata_python-2.0.8}/fakedata/helpers/industries.json +0 -0
  22. {fakedata_python-2.0.5 → fakedata_python-2.0.8}/fakedata/helpers/job_categories.json +0 -0
  23. {fakedata_python-2.0.5 → fakedata_python-2.0.8}/fakedata/helpers/job_titles.json +0 -0
  24. {fakedata_python-2.0.5 → fakedata_python-2.0.8}/fakedata/helpers/last.json +0 -0
  25. {fakedata_python-2.0.5 → fakedata_python-2.0.8}/fakedata/helpers/locales.json +0 -0
  26. {fakedata_python-2.0.5 → fakedata_python-2.0.8}/fakedata/helpers/middle.json +0 -0
  27. {fakedata_python-2.0.5 → fakedata_python-2.0.8}/fakedata/helpers/occupation.json +0 -0
  28. {fakedata_python-2.0.5 → fakedata_python-2.0.8}/fakedata/helpers/salary_ranges.json +0 -0
  29. {fakedata_python-2.0.5 → fakedata_python-2.0.8}/fakedata/helpers/shortformstate.json +0 -0
  30. {fakedata_python-2.0.5 → fakedata_python-2.0.8}/fakedata/helpers/state.json +0 -0
  31. {fakedata_python-2.0.5 → fakedata_python-2.0.8}/fakedata/helpers/states.json +0 -0
  32. {fakedata_python-2.0.5 → fakedata_python-2.0.8}/fakedata/helpers/street.json +0 -0
  33. {fakedata_python-2.0.5 → fakedata_python-2.0.8}/fakedata/helpers/universities.json +0 -0
  34. {fakedata_python-2.0.5 → fakedata_python-2.0.8}/fakedata/modules/__init__.py +0 -0
  35. {fakedata_python-2.0.5 → fakedata_python-2.0.8}/fakedata/test_python.py +0 -0
  36. {fakedata_python-2.0.5 → fakedata_python-2.0.8}/fakedata_python.egg-info/SOURCES.txt +0 -0
  37. {fakedata_python-2.0.5 → fakedata_python-2.0.8}/fakedata_python.egg-info/dependency_links.txt +0 -0
  38. {fakedata_python-2.0.5 → fakedata_python-2.0.8}/fakedata_python.egg-info/entry_points.txt +0 -0
  39. {fakedata_python-2.0.5 → fakedata_python-2.0.8}/fakedata_python.egg-info/top_level.txt +0 -0
  40. {fakedata_python-2.0.5 → fakedata_python-2.0.8}/setup.cfg +0 -0
@@ -6,6 +6,8 @@ exclude .npmignore
6
6
  exclude test.js
7
7
  exclude test_py.py
8
8
  exclude test_python.py
9
+ exclude test_new_apis.py
10
+ exclude test_new_apis.js
9
11
 
10
12
  # Exclude JS source code
11
13
  prune src
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: fakedata-python
3
- Version: 2.0.5
3
+ Version: 2.0.8
4
4
  Summary: The fakedata package generates realistic user profiles for machine learning, deep learning, data analysis, and data science workflows.
5
5
  Author-email: abhay557 <contact@abhaymourya.in>
6
6
  License-Expression: MIT
@@ -40,6 +40,50 @@ A high-performance, **zero-dependency** synthetic data generation engine, availa
40
40
 
41
41
  ---
42
42
 
43
+ ## Node.js / TypeScript Implementation
44
+
45
+ ### Installation
46
+ ```bash
47
+ npm install @abhay557/fakedata
48
+ ```
49
+
50
+ ### Quick Start
51
+ ```javascript
52
+ const fakedata = require('@abhay557/fakedata');
53
+
54
+ // Generate deterministic users with a 5% missing data rate (null injection)
55
+ const users = fakedata.data.users(1000, { seed: 42, missing_rate: 0.05 });
56
+
57
+ // Export directly to CSV format
58
+ const csvString = fakedata.data.usersToCSV(1000, { seed: 42 });
59
+
60
+ // Time-series activity data
61
+ const ts = fakedata.userTimeSeries({ days: 30, eventsPerDay: 8 });
62
+ console.log(`Generated ${ts.activity.length} events for ${ts.user.fullName}`);
63
+ ```
64
+
65
+ ### Streaming API & Custom Correlations
66
+ Generate unlimited data directly to disk while keeping memory at O(1), and force mathematical relationships between fields using the Pearson Correlation API:
67
+
68
+ ```javascript
69
+ const fs = require('fs');
70
+ const fakedata = require('@abhay557/fakedata');
71
+
72
+ // Create a stream that emits 1 million users as CSV
73
+ const stream = fakedata.data.generateStream(1000000, {
74
+ format: 'csv',
75
+ correlations: [
76
+ { fieldA: 'education.level', fieldB: 'financial.annualIncome', pearson_coeff: 0.85 },
77
+ { fieldA: 'health.bmi', fieldB: 'health.bloodPressure.systolic', pearson_coeff: 0.60 }
78
+ ]
79
+ });
80
+
81
+ // Pipe directly to file (constant RAM usage)
82
+ stream.pipe(fs.createWriteStream('1m_dataset.csv'));
83
+ ```
84
+
85
+ ---
86
+
43
87
  ## Python Implementation
44
88
 
45
89
  ### Installation
@@ -64,28 +108,24 @@ ts = fakedata.data.user_time_series({"days": 30, "events_per_day": 8})
64
108
  print(f"Generated {len(ts['activity'])} events for {ts['user']['fullName']}")
65
109
  ```
66
110
 
67
- ---
68
-
69
- ## Node.js / TypeScript Implementation
70
-
71
- ### Installation
72
- ```bash
73
- npm install @abhay557/fakedata
74
- ```
75
-
76
- ### Quick Start
77
- ```javascript
78
- const fakedata = require('@abhay557/fakedata');
111
+ ### Streaming API & Custom Correlations
112
+ Generate unlimited data lazily, keeping memory footprint at O(1), and force mathematical relationships between fields using the Pearson Correlation API:
79
113
 
80
- // Generate deterministic users with a 5% missing data rate (null injection)
81
- const users = fakedata.data.users(1000, { seed: 42, missing_rate: 0.05 });
114
+ ```python
115
+ import fakedata
82
116
 
83
- // Export directly to CSV format
84
- const csvString = fakedata.data.usersToCSV(1000, { seed: 42 });
117
+ # Create a lazy generator that yields 1 million users
118
+ stream = fakedata.generate_stream(1000000, {
119
+ "correlations": [
120
+ {"fieldA": "education.level", "fieldB": "financial.annualIncome", "pearson_coeff": 0.85},
121
+ {"fieldA": "health.bmi", "fieldB": "health.bloodPressure.systolic", "pearson_coeff": 0.60}
122
+ ]
123
+ })
85
124
 
86
- // Time-series activity data
87
- const ts = fakedata.userTimeSeries({ days: 30, eventsPerDay: 8 });
88
- console.log(`Generated ${ts.activity.length} events for ${ts.user.fullName}`);
125
+ # Process users one by one without blowing up RAM
126
+ for user in stream:
127
+ # write to DB, serialize to file, or process
128
+ pass
89
129
  ```
90
130
 
91
131
  ---
@@ -161,10 +201,6 @@ When writing to a file (`-o`), the CLI uses a **streaming write** strategy:
161
201
 
162
202
  This means you can generate **tens of millions of rows** without hitting Node.js heap limits or Python memory errors.
163
203
 
164
- ```
165
- Before (old): generate ALL → hold in RAM → write to file ❌ OOM at ~500k rows
166
- After (new): open file → generate 1 → write → discard → repeat ✅ unlimited
167
- ```
168
204
 
169
205
  ---
170
206
  ### sample output - one user
@@ -26,6 +26,50 @@ A high-performance, **zero-dependency** synthetic data generation engine, availa
26
26
 
27
27
  ---
28
28
 
29
+ ## Node.js / TypeScript Implementation
30
+
31
+ ### Installation
32
+ ```bash
33
+ npm install @abhay557/fakedata
34
+ ```
35
+
36
+ ### Quick Start
37
+ ```javascript
38
+ const fakedata = require('@abhay557/fakedata');
39
+
40
+ // Generate deterministic users with a 5% missing data rate (null injection)
41
+ const users = fakedata.data.users(1000, { seed: 42, missing_rate: 0.05 });
42
+
43
+ // Export directly to CSV format
44
+ const csvString = fakedata.data.usersToCSV(1000, { seed: 42 });
45
+
46
+ // Time-series activity data
47
+ const ts = fakedata.userTimeSeries({ days: 30, eventsPerDay: 8 });
48
+ console.log(`Generated ${ts.activity.length} events for ${ts.user.fullName}`);
49
+ ```
50
+
51
+ ### Streaming API & Custom Correlations
52
+ Generate unlimited data directly to disk while keeping memory at O(1), and force mathematical relationships between fields using the Pearson Correlation API:
53
+
54
+ ```javascript
55
+ const fs = require('fs');
56
+ const fakedata = require('@abhay557/fakedata');
57
+
58
+ // Create a stream that emits 1 million users as CSV
59
+ const stream = fakedata.data.generateStream(1000000, {
60
+ format: 'csv',
61
+ correlations: [
62
+ { fieldA: 'education.level', fieldB: 'financial.annualIncome', pearson_coeff: 0.85 },
63
+ { fieldA: 'health.bmi', fieldB: 'health.bloodPressure.systolic', pearson_coeff: 0.60 }
64
+ ]
65
+ });
66
+
67
+ // Pipe directly to file (constant RAM usage)
68
+ stream.pipe(fs.createWriteStream('1m_dataset.csv'));
69
+ ```
70
+
71
+ ---
72
+
29
73
  ## Python Implementation
30
74
 
31
75
  ### Installation
@@ -50,28 +94,24 @@ ts = fakedata.data.user_time_series({"days": 30, "events_per_day": 8})
50
94
  print(f"Generated {len(ts['activity'])} events for {ts['user']['fullName']}")
51
95
  ```
52
96
 
53
- ---
54
-
55
- ## Node.js / TypeScript Implementation
56
-
57
- ### Installation
58
- ```bash
59
- npm install @abhay557/fakedata
60
- ```
61
-
62
- ### Quick Start
63
- ```javascript
64
- const fakedata = require('@abhay557/fakedata');
97
+ ### Streaming API & Custom Correlations
98
+ Generate unlimited data lazily, keeping memory footprint at O(1), and force mathematical relationships between fields using the Pearson Correlation API:
65
99
 
66
- // Generate deterministic users with a 5% missing data rate (null injection)
67
- const users = fakedata.data.users(1000, { seed: 42, missing_rate: 0.05 });
100
+ ```python
101
+ import fakedata
68
102
 
69
- // Export directly to CSV format
70
- const csvString = fakedata.data.usersToCSV(1000, { seed: 42 });
103
+ # Create a lazy generator that yields 1 million users
104
+ stream = fakedata.generate_stream(1000000, {
105
+ "correlations": [
106
+ {"fieldA": "education.level", "fieldB": "financial.annualIncome", "pearson_coeff": 0.85},
107
+ {"fieldA": "health.bmi", "fieldB": "health.bloodPressure.systolic", "pearson_coeff": 0.60}
108
+ ]
109
+ })
71
110
 
72
- // Time-series activity data
73
- const ts = fakedata.userTimeSeries({ days: 30, eventsPerDay: 8 });
74
- console.log(`Generated ${ts.activity.length} events for ${ts.user.fullName}`);
111
+ # Process users one by one without blowing up RAM
112
+ for user in stream:
113
+ # write to DB, serialize to file, or process
114
+ pass
75
115
  ```
76
116
 
77
117
  ---
@@ -147,10 +187,6 @@ When writing to a file (`-o`), the CLI uses a **streaming write** strategy:
147
187
 
148
188
  This means you can generate **tens of millions of rows** without hitting Node.js heap limits or Python memory errors.
149
189
 
150
- ```
151
- Before (old): generate ALL → hold in RAM → write to file ❌ OOM at ~500k rows
152
- After (new): open file → generate 1 → write → discard → repeat ✅ unlimited
153
- ```
154
190
 
155
191
  ---
156
192
  ### sample output - one user
@@ -0,0 +1,7 @@
1
+ from .modules import data
2
+ from .modules.data import generate_stream, apply_custom_correlations
3
+
4
+ __version__ = "2.0.5"
5
+ __author__ = "abhay557"
6
+
7
+ __all__ = ["data", "generate_stream", "apply_custom_correlations"]
@@ -1364,3 +1364,177 @@ def biodata(count=10):
1364
1364
  })
1365
1365
  biodatas.append(bio)
1366
1366
  return biodatas
1367
+
1368
+
1369
+ # ─── Custom Correlation Engine ───────────────────────────────────────────────
1370
+
1371
+ def _get_path(obj: dict, path: str):
1372
+ """
1373
+ Resolves a dot-separated path on a dict and returns the numeric value.
1374
+ Returns None if the path doesn't exist or the value is non-numeric.
1375
+
1376
+ Example: _get_path(user, 'financial.annualIncome')
1377
+ """
1378
+ parts = path.split('.')
1379
+ cur = obj
1380
+ for p in parts:
1381
+ if not isinstance(cur, dict) or p not in cur:
1382
+ return None
1383
+ cur = cur[p]
1384
+ return cur if isinstance(cur, (int, float)) else None
1385
+
1386
+
1387
+ def _set_path(obj: dict, path: str, value) -> None:
1388
+ """Sets a dot-separated path on a dict to a new value (mutates dict)."""
1389
+ parts = path.split('.')
1390
+ cur = obj
1391
+ for p in parts[:-1]:
1392
+ cur = cur[p]
1393
+ cur[parts[-1]] = value
1394
+
1395
+
1396
def apply_custom_correlations(user: dict, correlations: list) -> dict:
    """
    Nudge numeric field B of a user record based on field A and a target
    Pearson coefficient, for each (fieldA, fieldB, pearson_coeff) spec.

    Algorithm (per record):
        z_A  = (a - 0.5*a) / (0.5*a + 1e-9)
        B'   = r * tanh(z_A) + sqrt(1 - r²) * N(0,1)
        B   += B' * |B| * 0.2

    Args:
        user:         A generated user dict (mutated in place).
        correlations: List of dicts with keys: fieldA, fieldB, pearson_coeff.
            Example:
                [
                    { "fieldA": "education.level",
                      "fieldB": "financial.annualIncome",
                      "pearson_coeff": 0.85 },
                    { "fieldA": "health.bmi",
                      "fieldB": "health.bloodPressure.systolic",
                      "pearson_coeff": 0.60 }
                ]

    Returns:
        The same (mutated) user dict.

    Note:
        - fieldA is only read; fieldB is overwritten.
        - Malformed specs are silently skipped: a non-dict spec, non-string
          field paths, a non-numeric / NaN coefficient, a coefficient outside
          [-1.0, 1.0], or a path that does not resolve to a number.
        - An int fieldB stays an int; a float fieldB is rounded to 4 decimals.
        - NOTE(review): z_A as written is ≈ 1 for every non-zero A (and 0 for
          A == 0), so the nudge does not scale with A's magnitude. A true
          z-score would need the population mean/std of A, which is not
          available per record. The formula is kept unchanged so seeded output
          stays reproducible.
    """
    if not correlations:
        return user

    for spec in correlations:
        if not isinstance(spec, dict):
            continue

        field_a = spec.get('fieldA')
        field_b = spec.get('fieldB')
        r = spec.get('pearson_coeff')

        # Validate the spec before touching the record so bad input is
        # skipped rather than raising part-way through a stream.
        if not isinstance(field_a, str) or not isinstance(field_b, str):
            continue
        # The chained comparison is False for NaN, so NaN is rejected too.
        if not isinstance(r, (int, float)) or not (-1.0 <= r <= 1.0):
            continue

        val_a = _get_path(user, field_a)
        val_b = _get_path(user, field_b)

        if val_a is None or val_b is None:
            continue

        # Unit-free signal derived from A (see NOTE in the docstring).
        half_a = val_a * 0.5
        denom = half_a + 1e-9
        # denom is exactly 0 only when val_a == -2e-9; avoid dividing by zero.
        z_a = (val_a - half_a) / denom if denom else 0.0

        # Correlated component plus independent Gaussian noise for B.
        independent_noise = random.gauss(0, 1)
        z_b_corr = r * math.tanh(z_a) + math.sqrt(max(0, 1 - r * r)) * independent_noise

        # Nudge B proportionally to its own magnitude.
        nudge = z_b_corr * abs(val_b) * 0.2
        new_val_b = val_b + nudge

        # Preserve int vs float.
        if isinstance(val_b, int):
            _set_path(user, field_b, int(round(new_val_b)))
        else:
            _set_path(user, field_b, round(new_val_b, 4))

    return user
1460
+
1461
+
1462
+ # ─── Streaming API ───────────────────────────────────────────────────────────
1463
+
1464
def generate_stream(count: int = 1000, options: dict = None):
    """
    Lazily generate user profiles, yielding one dict per iteration.

    Only the record currently being produced is held by this generator, so
    memory use does not grow with `count`.

    Args:
        count:   Total number of users to generate.
        options: Optional dict of settings:
            seed (int)            – Seeds the module-level `random` generator
                                    for reproducibility.
            schema (dict)         – Passed through to generate_single_user().
            locale (str)          – Passed through to generate_single_user().
            missing_rate (float)  – If > 0, apply_missing_data() is applied to
                                    each record with this rate.
            anomaly_rate (float)  – If > 0, each record is independently turned
                                    into an anomaly with this probability and
                                    tagged with an `_anomaly` entry.
            correlations (list)   – Specs forwarded to
                                    apply_custom_correlations().

    Yields:
        dict – One user profile per iteration.

    Note:
        When `seed` is given, the shared `random` module state is seeded on the
        first iteration and re-seeded from system entropy once the generator
        finishes — including when the consumer stops early (break / close()).

    Examples:
        # Pipe to CSV without loading all data into memory
        import csv
        with open('dataset.csv', 'w', newline='') as f:
            writer = None
            for user in data.generate_stream(1_000_000, {'seed': 42}):
                flat = flatten_object(user)
                if writer is None:
                    writer = csv.DictWriter(f, fieldnames=flat.keys())
                    writer.writeheader()
                writer.writerow(flat)

        # Custom Pearson correlations
        for user in data.generate_stream(500, {
            'correlations': [
                {'fieldA': 'education.level', 'fieldB': 'financial.annualIncome', 'pearson_coeff': 0.85},
                {'fieldA': 'health.bmi', 'fieldB': 'health.bloodPressure.systolic', 'pearson_coeff': 0.60},
            ]
        }):
            print(user['fullName'], user['financial']['annualIncome'])
    """
    if options is None:
        options = {}

    seed = options.get('seed')
    schema = options.get('schema')
    locale = options.get('locale')
    # `or` also covers an explicit None, which would break the `> 0` checks.
    missing_rate = options.get('missing_rate') or 0
    anomaly_rate = options.get('anomaly_rate') or 0
    correlations = options.get('correlations') or []

    if seed is not None:
        random.seed(seed)

    try:
        for i in range(count):
            u = generate_single_user(i + 1, schema=schema, locale=locale)

            if missing_rate > 0:
                u = apply_missing_data(u, missing_rate)

            # Per-record anomaly injection
            if anomaly_rate > 0:
                if random.random() < anomaly_rate:
                    weights = [a['weight'] for a in ANOMALY_TYPES_PY]
                    idx = weighted_random(weights)
                    anomaly = ANOMALY_TYPES_PY[idx]
                    anomaly['apply'](u)
                    u['_anomaly'] = {'isAnomaly': True, 'type': anomaly['type']}
                else:
                    u['_anomaly'] = {'isAnomaly': False, 'type': None}

            # Custom correlations
            if correlations:
                apply_custom_correlations(u, correlations)

            yield u
    finally:
        # Runs on normal exhaustion and on early close/garbage collection, so
        # a seeded RNG state never leaks into the caller's later random use.
        if seed is not None:
            random.seed()  # Reset to system entropy
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: fakedata-python
3
- Version: 2.0.5
3
+ Version: 2.0.8
4
4
  Summary: The fakedata package generates realistic user profiles for machine learning, deep learning, data analysis, and data science workflows.
5
5
  Author-email: abhay557 <contact@abhaymourya.in>
6
6
  License-Expression: MIT
@@ -40,6 +40,50 @@ A high-performance, **zero-dependency** synthetic data generation engine, availa
40
40
 
41
41
  ---
42
42
 
43
+ ## Node.js / TypeScript Implementation
44
+
45
+ ### Installation
46
+ ```bash
47
+ npm install @abhay557/fakedata
48
+ ```
49
+
50
+ ### Quick Start
51
+ ```javascript
52
+ const fakedata = require('@abhay557/fakedata');
53
+
54
+ // Generate deterministic users with a 5% missing data rate (null injection)
55
+ const users = fakedata.data.users(1000, { seed: 42, missing_rate: 0.05 });
56
+
57
+ // Export directly to CSV format
58
+ const csvString = fakedata.data.usersToCSV(1000, { seed: 42 });
59
+
60
+ // Time-series activity data
61
+ const ts = fakedata.userTimeSeries({ days: 30, eventsPerDay: 8 });
62
+ console.log(`Generated ${ts.activity.length} events for ${ts.user.fullName}`);
63
+ ```
64
+
65
+ ### Streaming API & Custom Correlations
66
+ Generate unlimited data directly to disk while keeping memory at O(1), and force mathematical relationships between fields using the Pearson Correlation API:
67
+
68
+ ```javascript
69
+ const fs = require('fs');
70
+ const fakedata = require('@abhay557/fakedata');
71
+
72
+ // Create a stream that emits 1 million users as CSV
73
+ const stream = fakedata.data.generateStream(1000000, {
74
+ format: 'csv',
75
+ correlations: [
76
+ { fieldA: 'education.level', fieldB: 'financial.annualIncome', pearson_coeff: 0.85 },
77
+ { fieldA: 'health.bmi', fieldB: 'health.bloodPressure.systolic', pearson_coeff: 0.60 }
78
+ ]
79
+ });
80
+
81
+ // Pipe directly to file (constant RAM usage)
82
+ stream.pipe(fs.createWriteStream('1m_dataset.csv'));
83
+ ```
84
+
85
+ ---
86
+
43
87
  ## Python Implementation
44
88
 
45
89
  ### Installation
@@ -64,28 +108,24 @@ ts = fakedata.data.user_time_series({"days": 30, "events_per_day": 8})
64
108
  print(f"Generated {len(ts['activity'])} events for {ts['user']['fullName']}")
65
109
  ```
66
110
 
67
- ---
68
-
69
- ## Node.js / TypeScript Implementation
70
-
71
- ### Installation
72
- ```bash
73
- npm install @abhay557/fakedata
74
- ```
75
-
76
- ### Quick Start
77
- ```javascript
78
- const fakedata = require('@abhay557/fakedata');
111
+ ### Streaming API & Custom Correlations
112
+ Generate unlimited data lazily, keeping memory footprint at O(1), and force mathematical relationships between fields using the Pearson Correlation API:
79
113
 
80
- // Generate deterministic users with a 5% missing data rate (null injection)
81
- const users = fakedata.data.users(1000, { seed: 42, missing_rate: 0.05 });
114
+ ```python
115
+ import fakedata
82
116
 
83
- // Export directly to CSV format
84
- const csvString = fakedata.data.usersToCSV(1000, { seed: 42 });
117
+ # Create a lazy generator that yields 1 million users
118
+ stream = fakedata.generate_stream(1000000, {
119
+ "correlations": [
120
+ {"fieldA": "education.level", "fieldB": "financial.annualIncome", "pearson_coeff": 0.85},
121
+ {"fieldA": "health.bmi", "fieldB": "health.bloodPressure.systolic", "pearson_coeff": 0.60}
122
+ ]
123
+ })
85
124
 
86
- // Time-series activity data
87
- const ts = fakedata.userTimeSeries({ days: 30, eventsPerDay: 8 });
88
- console.log(`Generated ${ts.activity.length} events for ${ts.user.fullName}`);
125
+ # Process users one by one without blowing up RAM
126
+ for user in stream:
127
+ # write to DB, serialize to file, or process
128
+ pass
89
129
  ```
90
130
 
91
131
  ---
@@ -161,10 +201,6 @@ When writing to a file (`-o`), the CLI uses a **streaming write** strategy:
161
201
 
162
202
  This means you can generate **tens of millions of rows** without hitting Node.js heap limits or Python memory errors.
163
203
 
164
- ```
165
- Before (old): generate ALL → hold in RAM → write to file ❌ OOM at ~500k rows
166
- After (new): open file → generate 1 → write → discard → repeat ✅ unlimited
167
- ```
168
204
 
169
205
  ---
170
206
  ### sample output - one user
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "fakedata-python"
7
- version = "2.0.5"
7
+ version = "2.0.8"
8
8
  authors = [
9
9
  { name="abhay557", email="contact@abhaymourya.in" },
10
10
  ]
@@ -1,6 +0,0 @@
1
- from .modules import data
2
-
3
- __version__ = "1.1.0"
4
- __author__ = "abhay557"
5
-
6
- __all__ = ["data"]
File without changes