fakedata-python 2.0.2__tar.gz → 2.0.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. fakedata_python-2.0.4/LICENSE +21 -0
  2. fakedata_python-2.0.4/MANIFEST.in +24 -0
  3. {fakedata_python-2.0.2/fakedata_python.egg-info → fakedata_python-2.0.4}/PKG-INFO +70 -26
  4. fakedata_python-2.0.2/PKG-INFO → fakedata_python-2.0.4/README.md +66 -36
  5. {fakedata_python-2.0.2 → fakedata_python-2.0.4}/fakedata/cli.py +80 -31
  6. fakedata_python-2.0.2/README.md → fakedata_python-2.0.4/fakedata_python.egg-info/PKG-INFO +80 -24
  7. {fakedata_python-2.0.2 → fakedata_python-2.0.4}/fakedata_python.egg-info/SOURCES.txt +2 -0
  8. {fakedata_python-2.0.2 → fakedata_python-2.0.4}/pyproject.toml +2 -2
  9. {fakedata_python-2.0.2 → fakedata_python-2.0.4}/fakedata/__init__.py +0 -0
  10. {fakedata_python-2.0.2 → fakedata_python-2.0.4}/fakedata/core.py +0 -0
  11. {fakedata_python-2.0.2 → fakedata_python-2.0.4}/fakedata/helpers/cardtype.json +0 -0
  12. {fakedata_python-2.0.2 → fakedata_python-2.0.4}/fakedata/helpers/companies.json +0 -0
  13. {fakedata_python-2.0.2 → fakedata_python-2.0.4}/fakedata/helpers/countries.json +0 -0
  14. {fakedata_python-2.0.2 → fakedata_python-2.0.4}/fakedata/helpers/devices.json +0 -0
  15. {fakedata_python-2.0.2 → fakedata_python-2.0.4}/fakedata/helpers/domain.json +0 -0
  16. {fakedata_python-2.0.2 → fakedata_python-2.0.4}/fakedata/helpers/email.json +0 -0
  17. {fakedata_python-2.0.2 → fakedata_python-2.0.4}/fakedata/helpers/first.json +0 -0
  18. {fakedata_python-2.0.2 → fakedata_python-2.0.4}/fakedata/helpers/healthcare.json +0 -0
  19. {fakedata_python-2.0.2 → fakedata_python-2.0.4}/fakedata/helpers/hobbies.json +0 -0
  20. {fakedata_python-2.0.2 → fakedata_python-2.0.4}/fakedata/helpers/industries.json +0 -0
  21. {fakedata_python-2.0.2 → fakedata_python-2.0.4}/fakedata/helpers/job_categories.json +0 -0
  22. {fakedata_python-2.0.2 → fakedata_python-2.0.4}/fakedata/helpers/job_titles.json +0 -0
  23. {fakedata_python-2.0.2 → fakedata_python-2.0.4}/fakedata/helpers/last.json +0 -0
  24. {fakedata_python-2.0.2 → fakedata_python-2.0.4}/fakedata/helpers/locales.json +0 -0
  25. {fakedata_python-2.0.2 → fakedata_python-2.0.4}/fakedata/helpers/middle.json +0 -0
  26. {fakedata_python-2.0.2 → fakedata_python-2.0.4}/fakedata/helpers/occupation.json +0 -0
  27. {fakedata_python-2.0.2 → fakedata_python-2.0.4}/fakedata/helpers/salary_ranges.json +0 -0
  28. {fakedata_python-2.0.2 → fakedata_python-2.0.4}/fakedata/helpers/shortformstate.json +0 -0
  29. {fakedata_python-2.0.2 → fakedata_python-2.0.4}/fakedata/helpers/state.json +0 -0
  30. {fakedata_python-2.0.2 → fakedata_python-2.0.4}/fakedata/helpers/states.json +0 -0
  31. {fakedata_python-2.0.2 → fakedata_python-2.0.4}/fakedata/helpers/street.json +0 -0
  32. {fakedata_python-2.0.2 → fakedata_python-2.0.4}/fakedata/helpers/universities.json +0 -0
  33. {fakedata_python-2.0.2 → fakedata_python-2.0.4}/fakedata/modules/__init__.py +0 -0
  34. {fakedata_python-2.0.2 → fakedata_python-2.0.4}/fakedata/modules/data.py +0 -0
  35. {fakedata_python-2.0.2 → fakedata_python-2.0.4}/fakedata/test_python.py +0 -0
  36. {fakedata_python-2.0.2 → fakedata_python-2.0.4}/fakedata_python.egg-info/dependency_links.txt +0 -0
  37. {fakedata_python-2.0.2 → fakedata_python-2.0.4}/fakedata_python.egg-info/entry_points.txt +0 -0
  38. {fakedata_python-2.0.2 → fakedata_python-2.0.4}/fakedata_python.egg-info/top_level.txt +0 -0
  39. {fakedata_python-2.0.2 → fakedata_python-2.0.4}/setup.cfg +0 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Abhay Mourya
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,24 @@
1
+ # Exclude development and Node.js files
2
+ prune .github
3
+ exclude CONTRIBUTING.md
4
+ exclude CODE_OF_CONDUCT.md
5
+ exclude .npmignore
6
+ exclude test.js
7
+ exclude test_py.py
8
+ exclude test_python.py
9
+
10
+ # Exclude JS source code
11
+ prune src
12
+ exclude package.json
13
+ exclude package-lock.json
14
+
15
+ # Exclude web and raw data
16
+ prune website
17
+ prune newdata
18
+ prune dist
19
+ prune tests
20
+
21
+ # Exclude byte-compiled files
22
+ global-exclude *.py[cod]
23
+ global-exclude __pycache__
24
+ global-exclude *.so
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: fakedata-python
3
- Version: 2.0.2
4
- Summary: The fakedata package generates realistic synthetic user profiles for machine learning, deep learning, data analysis, and data science workflows.
3
+ Version: 2.0.4
4
+ Summary: The fakedata package generates realistic user profiles for machine learning, deep learning, data analysis, and data science workflows.
5
5
  Author-email: abhay557 <contact@abhaymourya.in>
6
6
  License-Expression: MIT
7
7
  Project-URL: Homepage, https://github.com/abhay557/fakedata
@@ -9,6 +9,8 @@ Classifier: Programming Language :: Python :: 3
9
9
  Classifier: Operating System :: OS Independent
10
10
  Requires-Python: >=3.7
11
11
  Description-Content-Type: text/markdown
12
+ License-File: LICENSE
13
+ Dynamic: license-file
12
14
 
13
15
  # fakedata
14
16
 
@@ -34,52 +36,56 @@ A high-performance, **zero-dependency** synthetic data generation engine, availa
34
36
  - **Time-Series Data**: Generate chronological activity logs (logins, page views, purchases) per user for behavioral modeling.
35
37
  - **Pipeline Ready**: Export directly to CSV, JSON, or Flat objects (perfect for `pandas.DataFrame`).
36
38
  - **CLI Tool**: Generate and export datasets directly from your terminal — no scripting required.
39
+ - **Streaming Generation**: Files are written one record at a time — constant RAM usage regardless of dataset size. Generate 10M+ rows without running out of memory.
37
40
 
38
41
  ---
39
42
 
40
- ## Python Implementation
43
+ ## Node.js / TypeScript Implementation
41
44
 
42
45
  ### Installation
43
46
  ```bash
44
- pip install fakedata-python
47
+ npm install @abhay557/fakedata
45
48
  ```
46
49
 
47
50
  ### Quick Start
48
- ```python
49
- import fakedata.data as data
50
- import pandas as pd
51
+ ```javascript
52
+ const fakedata = require('@abhay557/fakedata');
51
53
 
52
- # Generate 10,000 highly correlated users deterministically
53
- users = data.users(10000, {"seed": 42})
54
+ // Generate deterministic users with a 5% missing data rate (null injection)
55
+ const users = fakedata.data.users(1000, { seed: 42, missing_rate: 0.05 });
54
56
 
55
- # Or export directly to a Pandas DataFrame
56
- df = pd.DataFrame(data.users_flat(10000, {"seed": 42}))
57
- print(df.head())
57
+ // Export directly to CSV format
58
+ const csvString = fakedata.data.usersToCSV(1000, { seed: 42 });
58
59
 
59
- # Create time-series activity data
60
- ts = data.user_time_series({"days": 30, "events_per_day": 8})
61
- print(f"Generated {len(ts['activity'])} events for {ts['user']['fullName']}")
60
+ // Time-series activity data
61
+ const ts = fakedata.data.userTimeSeries({ days: 30, eventsPerDay: 8 });
62
+ console.log(`Generated ${ts.activity.length} events for ${ts.user.fullName}`);
62
63
  ```
63
- ## Node.js / TypeScript Implementation
64
+
65
+ ---
66
+
67
+ ## Python Implementation
64
68
 
65
69
  ### Installation
66
70
  ```bash
67
- npm install @abhay557/fakedata
71
+ pip install fakedata-python
68
72
  ```
69
73
 
70
74
  ### Quick Start
71
- ```javascript
72
- const { data } = require('@abhay557/fakedata');
75
+ ```python
76
+ import fakedata
77
+ import pandas as pd
73
78
 
74
- // Generate deterministic users with a 5% missing data rate (null injection)
75
- const users = data.users(1000, { seed: 42, missing_rate: 0.05 });
79
+ # Generate 10,000 highly correlated users deterministically
80
+ users = fakedata.data.users(10000, {"seed": 42})
76
81
 
77
- // Export directly to CSV format
78
- const csvString = data.usersToCSV(1000, { seed: 42 });
82
+ # Or export directly to a Pandas DataFrame
83
+ df = pd.DataFrame(fakedata.data.users_flat(10000, {"seed": 42}))
84
+ print(df.head())
79
85
 
80
- // Time-series activity data
81
- const ts = data.userTimeSeries({ days: 30, eventsPerDay: 8 });
82
- console.log(`Generated ${ts.activity.length} events for ${ts.user.fullName}`);
86
+ # Create time-series activity data
87
+ ts = fakedata.data.user_time_series({"days": 30, "events_per_day": 8})
88
+ print(f"Generated {len(ts['activity'])} events for {ts['user']['fullName']}")
83
89
  ```
84
90
 
85
91
  ---
@@ -133,6 +139,9 @@ fakedata generate -n 500 -l in --seed 42 -o india.json
133
139
  # Fraud detection dataset with 5% anomalies
134
140
  fakedata generate -n 10000 -a 0.05 -f csv -o fraud_data.csv
135
141
 
142
+ # Generate 1 million rows without running out of memory (streaming)
143
+ fakedata generate -n 1000000 -f csv -o big_dataset.csv
144
+
136
145
  # Preview a single user in the console
137
146
  fakedata preview
138
147
 
@@ -140,6 +149,23 @@ fakedata preview
140
149
  fakedata generate -n 100 --timeseries --days 60 -o activity.json
141
150
  ```
142
151
 
152
+ ### Streaming Architecture
153
+
154
+ When writing to a file (`-o`), the CLI uses a **streaming write** strategy:
155
+
156
+ - The output file is **created first**, before any data is generated.
157
+ - Each user is generated **one at a time** and written immediately to disk.
158
+ - The generated object is then **discarded** — it is never held in a large array.
159
+ - **RAM usage stays constant** (O(1)) regardless of how many records you generate.
160
+ - A live progress counter is printed every 10,000 records for large jobs.
161
+
162
+ This means you can generate **tens of millions of rows** without hitting Node.js heap limits or Python memory errors.
163
+
164
+ ```
165
+ Before (old): generate ALL → hold in RAM → write to file ❌ OOM at ~500k rows
166
+ After (new): open file → generate 1 → write → discard → repeat ✅ unlimited
167
+ ```
168
+
143
169
  ---
144
170
  ### sample output - one user
145
171
  ```fakedata.data.user()```
@@ -432,3 +458,21 @@ Distributed under the **MIT License**. See `LICENSE` for more information.
432
458
  **Maintainer**: [abhay557](https://github.com/abhay557)
433
459
 
434
460
  - Project Commit History - `https://github.com/abhay557/random-api.xyz`
461
+
462
+ ---
463
+
464
+ ## Contributing
465
+
466
+ Contributions are welcome! Whether it's a bug fix, a new feature, or improved docs — every bit helps.
467
+
468
+ - Read the [Contributing Guide](./CONTRIBUTING.md) before submitting a PR.
469
+ - Use the [Bug Report](https://github.com/abhay557/fakedata/issues/new?template=bug_report.md) template to report issues.
470
+ - Use the [Feature Request](https://github.com/abhay557/fakedata/issues/new?template=feature_request.md) template to suggest ideas.
471
+ - Please follow our [Code of Conduct](./CODE_OF_CONDUCT.md) in all interactions.
472
+
473
+ ```bash
474
+ # Fork the repo, then:
475
+ git clone https://github.com/YOUR_USERNAME/fakedata.git
476
+ git checkout -b feature/my-improvement
477
+ # Make your changes, then open a Pull Request!
478
+ ```
@@ -1,15 +1,3 @@
1
- Metadata-Version: 2.4
2
- Name: fakedata-python
3
- Version: 2.0.2
4
- Summary: The fakedata package generates realistic synthetic user profiles for machine learning, deep learning, data analysis, and data science workflows.
5
- Author-email: abhay557 <contact@abhaymourya.in>
6
- License-Expression: MIT
7
- Project-URL: Homepage, https://github.com/abhay557/fakedata
8
- Classifier: Programming Language :: Python :: 3
9
- Classifier: Operating System :: OS Independent
10
- Requires-Python: >=3.7
11
- Description-Content-Type: text/markdown
12
-
13
1
  # fakedata
14
2
 
15
3
  [![NPM Version](https://img.shields.io/npm/v/@abhay557/fakedata?color=red&label=npm)](https://www.npmjs.com/package/@abhay557/fakedata)
@@ -34,52 +22,56 @@ A high-performance, **zero-dependency** synthetic data generation engine, availa
34
22
  - **Time-Series Data**: Generate chronological activity logs (logins, page views, purchases) per user for behavioral modeling.
35
23
  - **Pipeline Ready**: Export directly to CSV, JSON, or Flat objects (perfect for `pandas.DataFrame`).
36
24
  - **CLI Tool**: Generate and export datasets directly from your terminal — no scripting required.
25
+ - **Streaming Generation**: Files are written one record at a time — constant RAM usage regardless of dataset size. Generate 10M+ rows without running out of memory.
37
26
 
38
27
  ---
39
28
 
40
- ## Python Implementation
29
+ ## Node.js / TypeScript Implementation
41
30
 
42
31
  ### Installation
43
32
  ```bash
44
- pip install fakedata-python
33
+ npm install @abhay557/fakedata
45
34
  ```
46
35
 
47
36
  ### Quick Start
48
- ```python
49
- import fakedata.data as data
50
- import pandas as pd
37
+ ```javascript
38
+ const fakedata = require('@abhay557/fakedata');
51
39
 
52
- # Generate 10,000 highly correlated users deterministically
53
- users = data.users(10000, {"seed": 42})
40
+ // Generate deterministic users with a 5% missing data rate (null injection)
41
+ const users = fakedata.data.users(1000, { seed: 42, missing_rate: 0.05 });
54
42
 
55
- # Or export directly to a Pandas DataFrame
56
- df = pd.DataFrame(data.users_flat(10000, {"seed": 42}))
57
- print(df.head())
43
+ // Export directly to CSV format
44
+ const csvString = fakedata.data.usersToCSV(1000, { seed: 42 });
58
45
 
59
- # Create time-series activity data
60
- ts = data.user_time_series({"days": 30, "events_per_day": 8})
61
- print(f"Generated {len(ts['activity'])} events for {ts['user']['fullName']}")
46
+ // Time-series activity data
47
+ const ts = fakedata.data.userTimeSeries({ days: 30, eventsPerDay: 8 });
48
+ console.log(`Generated ${ts.activity.length} events for ${ts.user.fullName}`);
62
49
  ```
63
- ## Node.js / TypeScript Implementation
50
+
51
+ ---
52
+
53
+ ## Python Implementation
64
54
 
65
55
  ### Installation
66
56
  ```bash
67
- npm install @abhay557/fakedata
57
+ pip install fakedata-python
68
58
  ```
69
59
 
70
60
  ### Quick Start
71
- ```javascript
72
- const { data } = require('@abhay557/fakedata');
61
+ ```python
62
+ import fakedata
63
+ import pandas as pd
73
64
 
74
- // Generate deterministic users with a 5% missing data rate (null injection)
75
- const users = data.users(1000, { seed: 42, missing_rate: 0.05 });
65
+ # Generate 10,000 highly correlated users deterministically
66
+ users = fakedata.data.users(10000, {"seed": 42})
76
67
 
77
- // Export directly to CSV format
78
- const csvString = data.usersToCSV(1000, { seed: 42 });
68
+ # Or export directly to a Pandas DataFrame
69
+ df = pd.DataFrame(fakedata.data.users_flat(10000, {"seed": 42}))
70
+ print(df.head())
79
71
 
80
- // Time-series activity data
81
- const ts = data.userTimeSeries({ days: 30, eventsPerDay: 8 });
82
- console.log(`Generated ${ts.activity.length} events for ${ts.user.fullName}`);
72
+ # Create time-series activity data
73
+ ts = fakedata.data.user_time_series({"days": 30, "events_per_day": 8})
74
+ print(f"Generated {len(ts['activity'])} events for {ts['user']['fullName']}")
83
75
  ```
84
76
 
85
77
  ---
@@ -133,6 +125,9 @@ fakedata generate -n 500 -l in --seed 42 -o india.json
133
125
  # Fraud detection dataset with 5% anomalies
134
126
  fakedata generate -n 10000 -a 0.05 -f csv -o fraud_data.csv
135
127
 
128
+ # Generate 1 million rows without running out of memory (streaming)
129
+ fakedata generate -n 1000000 -f csv -o big_dataset.csv
130
+
136
131
  # Preview a single user in the console
137
132
  fakedata preview
138
133
 
@@ -140,6 +135,23 @@ fakedata preview
140
135
  fakedata generate -n 100 --timeseries --days 60 -o activity.json
141
136
  ```
142
137
 
138
+ ### Streaming Architecture
139
+
140
+ When writing to a file (`-o`), the CLI uses a **streaming write** strategy:
141
+
142
+ - The output file is **created first**, before any data is generated.
143
+ - Each user is generated **one at a time** and written immediately to disk.
144
+ - The generated object is then **discarded** — it is never held in a large array.
145
+ - **RAM usage stays constant** (O(1)) regardless of how many records you generate.
146
+ - A live progress counter is printed every 10,000 records for large jobs.
147
+
148
+ This means you can generate **tens of millions of rows** without hitting Node.js heap limits or Python memory errors.
149
+
150
+ ```
151
+ Before (old): generate ALL → hold in RAM → write to file ❌ OOM at ~500k rows
152
+ After (new): open file → generate 1 → write → discard → repeat ✅ unlimited
153
+ ```
154
+
143
155
  ---
144
156
  ### sample output - one user
145
157
  ```fakedata.data.user()```
@@ -432,3 +444,21 @@ Distributed under the **MIT License**. See `LICENSE` for more information.
432
444
  **Maintainer**: [abhay557](https://github.com/abhay557)
433
445
 
434
446
  - Project Commit History - `https://github.com/abhay557/random-api.xyz`
447
+
448
+ ---
449
+
450
+ ## Contributing
451
+
452
+ Contributions are welcome! Whether it's a bug fix, a new feature, or improved docs — every bit helps.
453
+
454
+ - Read the [Contributing Guide](./CONTRIBUTING.md) before submitting a PR.
455
+ - Use the [Bug Report](https://github.com/abhay557/fakedata/issues/new?template=bug_report.md) template to report issues.
456
+ - Use the [Feature Request](https://github.com/abhay557/fakedata/issues/new?template=feature_request.md) template to suggest ideas.
457
+ - Please follow our [Code of Conduct](./CODE_OF_CONDUCT.md) in all interactions.
458
+
459
+ ```bash
460
+ # Fork the repo, then:
461
+ git clone https://github.com/YOUR_USERNAME/fakedata.git
462
+ git checkout -b feature/my-improvement
463
+ # Make your changes, then open a Pull Request!
464
+ ```
@@ -89,44 +89,93 @@ EXAMPLES:
89
89
  'anomaly_rate': args.anomaly_rate,
90
90
  'missing_rate': args.missing_rate,
91
91
  }
92
- # Remove None values so defaults are used inside the engine
93
92
  options = {k: v for k, v in options.items() if v is not None and v != 0.0}
94
93
 
94
+ count = args.count
95
95
  start = time.time()
96
-
97
- if args.timeseries:
98
- results = [
99
- data.user_time_series({**options, 'days': args.days, 'events_per_day': args.events_per_day})
100
- for _ in range(args.count)
101
- ]
102
- output = json.dumps(results, indent=2 if args.pretty else None)
103
-
104
- elif args.format == 'csv':
105
- output = data.users_to_csv(args.count, options if options else None)
106
-
107
- elif args.format == 'flat':
108
- rows = data.users_flat(args.count, options if options else None)
109
- output = json.dumps(rows, indent=2 if args.pretty else None)
110
-
111
- else: # json
112
- if args.pretty:
113
- output = data.users_to_json(args.count, options if options else None)
96
+ PROGRESS_INTERVAL = 10000
97
+
98
+ # ── stdout: buffer is fine for small terminal output ──────────────
99
+ if not args.output:
100
+ if args.timeseries:
101
+ results = [
102
+ data.user_time_series({**options, 'days': args.days, 'events_per_day': args.events_per_day})
103
+ for _ in range(count)
104
+ ]
105
+ print(json.dumps(results, indent=2 if args.pretty else None))
106
+ elif args.format == 'csv':
107
+ print(data.users_to_csv(count, options if options else None))
108
+ elif args.format == 'flat':
109
+ rows = data.users_flat(count, options if options else None)
110
+ print(json.dumps(rows, indent=2 if args.pretty else None))
114
111
  else:
115
- output = json.dumps(data.users(args.count, options if options else None))
112
+ if args.pretty:
113
+ print(data.users_to_json(count, options if options else None))
114
+ else:
115
+ print(json.dumps(data.users(count, options if options else None)))
116
+ return
117
+
118
+ # ── File: STREAMING — open file first, write one record at a time ──
119
+ out_path = os.path.abspath(args.output)
120
+
121
+ with open(out_path, 'w', encoding='utf-8') as f:
122
+
123
+ if args.timeseries:
124
+ f.write('[\n')
125
+ for i in range(count):
126
+ rec = data.user_time_series({**options, 'days': args.days, 'events_per_day': args.events_per_day})
127
+ line = json.dumps(rec, indent=2 if args.pretty else None)
128
+ f.write(line + (',' if i < count - 1 else '') + '\n')
129
+ if (i + 1) % PROGRESS_INTERVAL == 0:
130
+ print(f"\r ⏳ {i+1:,} / {count:,} written...", end='', file=sys.stderr)
131
+ f.write(']\n')
132
+
133
+ elif args.format == 'csv':
134
+ # Write header from first record (NOTE: this record is always generated and written, so count == 0 still emits one data row)
135
+ first = data.user(options if options else None)
136
+ header = ','.join(f'"{k}"' for k in first.keys())
137
+ f.write(header + '\n')
138
+ f.write(_user_to_csv_row(first) + '\n')
139
+ for i in range(1, count):
140
+ u = data.user(options if options else None)
141
+ f.write(_user_to_csv_row(u) + '\n')
142
+ if (i + 1) % PROGRESS_INTERVAL == 0:
143
+ print(f"\r ⏳ {i+1:,} / {count:,} written...", end='', file=sys.stderr)
144
+
145
+ else: # json / flat (NOTE: 'flat' is serialized from data.user() here, not data.users_flat() as in the stdout path)
146
+ f.write('[\n')
147
+ for i in range(count):
148
+ u = data.user(options if options else None)
149
+ line = json.dumps(u, indent=2 if args.pretty else None)
150
+ f.write(line + (',' if i < count - 1 else '') + '\n')
151
+ if (i + 1) % PROGRESS_INTERVAL == 0:
152
+ print(f"\r ⏳ {i+1:,} / {count:,} written...", end='', file=sys.stderr)
153
+ f.write(']\n')
116
154
 
117
155
  elapsed = round(time.time() - start, 2)
118
-
119
- if args.output:
120
- out_path = os.path.abspath(args.output)
121
- with open(out_path, 'w', encoding='utf-8') as f:
122
- f.write(output)
123
- size_kb = round(len(output.encode('utf-8')) / 1024, 1)
124
- print(
125
- f"✔ Done! Generated {args.count:,} users in {elapsed}s → {out_path} ({size_kb} KB)",
126
- file=sys.stderr
127
- )
156
+ size_bytes = os.path.getsize(out_path)
157
+ size_label = f"{size_bytes / 1048576:.1f} MB" if size_bytes >= 1048576 else f"{size_bytes / 1024:.1f} KB"
158
+ print('\r', end='', file=sys.stderr) # clear progress line
159
+ print(
160
+ f"✔ Done! Generated {count:,} users in {elapsed}s → {out_path} ({size_label})",
161
+ file=sys.stderr
162
+ )
163
+
164
+
165
+ def _user_to_csv_row(u):
166
+ """Serialize a single user dict to a CSV row string."""
167
+ import json as _json
168
+ parts = []
169
+ for v in u.values():
170
+ if v is None:
171
+ parts.append('')
172
+ elif isinstance(v, (dict, list)):
173
+ parts.append('"' + _json.dumps(v).replace('"', '""') + '"')
174
+ elif isinstance(v, str):
175
+ parts.append('"' + v.replace('"', '""') + '"')
128
176
  else:
129
- print(output)
177
+ parts.append(str(v))
178
+ return ','.join(parts)
130
179
 
131
180
 
132
181
  if __name__ == '__main__':
@@ -1,3 +1,17 @@
1
+ Metadata-Version: 2.4
2
+ Name: fakedata-python
3
+ Version: 2.0.4
4
+ Summary: The fakedata package generates realistic user profiles for machine learning, deep learning, data analysis, and data science workflows.
5
+ Author-email: abhay557 <contact@abhaymourya.in>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/abhay557/fakedata
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Operating System :: OS Independent
10
+ Requires-Python: >=3.7
11
+ Description-Content-Type: text/markdown
12
+ License-File: LICENSE
13
+ Dynamic: license-file
14
+
1
15
  # fakedata
2
16
 
3
17
  [![NPM Version](https://img.shields.io/npm/v/@abhay557/fakedata?color=red&label=npm)](https://www.npmjs.com/package/@abhay557/fakedata)
@@ -22,52 +36,56 @@ A high-performance, **zero-dependency** synthetic data generation engine, availa
22
36
  - **Time-Series Data**: Generate chronological activity logs (logins, page views, purchases) per user for behavioral modeling.
23
37
  - **Pipeline Ready**: Export directly to CSV, JSON, or Flat objects (perfect for `pandas.DataFrame`).
24
38
  - **CLI Tool**: Generate and export datasets directly from your terminal — no scripting required.
39
+ - **Streaming Generation**: Files are written one record at a time — constant RAM usage regardless of dataset size. Generate 10M+ rows without running out of memory.
25
40
 
26
41
  ---
27
42
 
28
- ## Python Implementation
43
+ ## Node.js / TypeScript Implementation
29
44
 
30
45
  ### Installation
31
46
  ```bash
32
- pip install fakedata-python
47
+ npm install @abhay557/fakedata
33
48
  ```
34
49
 
35
50
  ### Quick Start
36
- ```python
37
- import fakedata.data as data
38
- import pandas as pd
51
+ ```javascript
52
+ const fakedata = require('@abhay557/fakedata');
39
53
 
40
- # Generate 10,000 highly correlated users deterministically
41
- users = data.users(10000, {"seed": 42})
54
+ // Generate deterministic users with a 5% missing data rate (null injection)
55
+ const users = fakedata.data.users(1000, { seed: 42, missing_rate: 0.05 });
42
56
 
43
- # Or export directly to a Pandas DataFrame
44
- df = pd.DataFrame(data.users_flat(10000, {"seed": 42}))
45
- print(df.head())
57
+ // Export directly to CSV format
58
+ const csvString = fakedata.data.usersToCSV(1000, { seed: 42 });
46
59
 
47
- # Create time-series activity data
48
- ts = data.user_time_series({"days": 30, "events_per_day": 8})
49
- print(f"Generated {len(ts['activity'])} events for {ts['user']['fullName']}")
60
+ // Time-series activity data
61
+ const ts = fakedata.data.userTimeSeries({ days: 30, eventsPerDay: 8 });
62
+ console.log(`Generated ${ts.activity.length} events for ${ts.user.fullName}`);
50
63
  ```
51
- ## Node.js / TypeScript Implementation
64
+
65
+ ---
66
+
67
+ ## Python Implementation
52
68
 
53
69
  ### Installation
54
70
  ```bash
55
- npm install @abhay557/fakedata
71
+ pip install fakedata-python
56
72
  ```
57
73
 
58
74
  ### Quick Start
59
- ```javascript
60
- const { data } = require('@abhay557/fakedata');
75
+ ```python
76
+ import fakedata
77
+ import pandas as pd
61
78
 
62
- // Generate deterministic users with a 5% missing data rate (null injection)
63
- const users = data.users(1000, { seed: 42, missing_rate: 0.05 });
79
+ # Generate 10,000 highly correlated users deterministically
80
+ users = fakedata.data.users(10000, {"seed": 42})
64
81
 
65
- // Export directly to CSV format
66
- const csvString = data.usersToCSV(1000, { seed: 42 });
82
+ # Or export directly to a Pandas DataFrame
83
+ df = pd.DataFrame(fakedata.data.users_flat(10000, {"seed": 42}))
84
+ print(df.head())
67
85
 
68
- // Time-series activity data
69
- const ts = data.userTimeSeries({ days: 30, eventsPerDay: 8 });
70
- console.log(`Generated ${ts.activity.length} events for ${ts.user.fullName}`);
86
+ # Create time-series activity data
87
+ ts = fakedata.data.user_time_series({"days": 30, "events_per_day": 8})
88
+ print(f"Generated {len(ts['activity'])} events for {ts['user']['fullName']}")
71
89
  ```
72
90
 
73
91
  ---
@@ -121,6 +139,9 @@ fakedata generate -n 500 -l in --seed 42 -o india.json
121
139
  # Fraud detection dataset with 5% anomalies
122
140
  fakedata generate -n 10000 -a 0.05 -f csv -o fraud_data.csv
123
141
 
142
+ # Generate 1 million rows without running out of memory (streaming)
143
+ fakedata generate -n 1000000 -f csv -o big_dataset.csv
144
+
124
145
  # Preview a single user in the console
125
146
  fakedata preview
126
147
 
@@ -128,6 +149,23 @@ fakedata preview
128
149
  fakedata generate -n 100 --timeseries --days 60 -o activity.json
129
150
  ```
130
151
 
152
+ ### Streaming Architecture
153
+
154
+ When writing to a file (`-o`), the CLI uses a **streaming write** strategy:
155
+
156
+ - The output file is **created first**, before any data is generated.
157
+ - Each user is generated **one at a time** and written immediately to disk.
158
+ - The generated object is then **discarded** — it is never held in a large array.
159
+ - **RAM usage stays constant** (O(1)) regardless of how many records you generate.
160
+ - A live progress counter is printed every 10,000 records for large jobs.
161
+
162
+ This means you can generate **tens of millions of rows** without hitting Node.js heap limits or Python memory errors.
163
+
164
+ ```
165
+ Before (old): generate ALL → hold in RAM → write to file ❌ OOM at ~500k rows
166
+ After (new): open file → generate 1 → write → discard → repeat ✅ unlimited
167
+ ```
168
+
131
169
  ---
132
170
  ### sample output - one user
133
171
  ```fakedata.data.user()```
@@ -420,3 +458,21 @@ Distributed under the **MIT License**. See `LICENSE` for more information.
420
458
  **Maintainer**: [abhay557](https://github.com/abhay557)
421
459
 
422
460
  - Project Commit History - `https://github.com/abhay557/random-api.xyz`
461
+
462
+ ---
463
+
464
+ ## Contributing
465
+
466
+ Contributions are welcome! Whether it's a bug fix, a new feature, or improved docs — every bit helps.
467
+
468
+ - Read the [Contributing Guide](./CONTRIBUTING.md) before submitting a PR.
469
+ - Use the [Bug Report](https://github.com/abhay557/fakedata/issues/new?template=bug_report.md) template to report issues.
470
+ - Use the [Feature Request](https://github.com/abhay557/fakedata/issues/new?template=feature_request.md) template to suggest ideas.
471
+ - Please follow our [Code of Conduct](./CODE_OF_CONDUCT.md) in all interactions.
472
+
473
+ ```bash
474
+ # Fork the repo, then:
475
+ git clone https://github.com/YOUR_USERNAME/fakedata.git
476
+ git checkout -b feature/my-improvement
477
+ # Make your changes, then open a Pull Request!
478
+ ```
@@ -1,3 +1,5 @@
1
+ LICENSE
2
+ MANIFEST.in
1
3
  README.md
2
4
  pyproject.toml
3
5
  fakedata/__init__.py
@@ -4,11 +4,11 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "fakedata-python"
7
- version = "2.0.2"
7
+ version = "2.0.4"
8
8
  authors = [
9
9
  { name="abhay557", email="contact@abhaymourya.in" },
10
10
  ]
11
- description = "The fakedata package generates realistic synthetic user profiles for machine learning, deep learning, data analysis, and data science workflows."
11
+ description = "The fakedata package generates realistic user profiles for machine learning, deep learning, data analysis, and data science workflows."
12
12
  readme = "README.md"
13
13
  license = "MIT"
14
14
  requires-python = ">=3.7"