fakedata-python 2.0.3__tar.gz → 2.0.4__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. {fakedata_python-2.0.3 → fakedata_python-2.0.4}/PKG-INFO +66 -3
  2. fakedata_python-2.0.3/fakedata_python.egg-info/PKG-INFO → fakedata_python-2.0.4/README.md +64 -15
  3. {fakedata_python-2.0.3 → fakedata_python-2.0.4}/fakedata/cli.py +80 -31
  4. fakedata_python-2.0.3/README.md → fakedata_python-2.0.4/fakedata_python.egg-info/PKG-INFO +78 -1
  5. {fakedata_python-2.0.3 → fakedata_python-2.0.4}/pyproject.toml +2 -2
  6. {fakedata_python-2.0.3 → fakedata_python-2.0.4}/LICENSE +0 -0
  7. {fakedata_python-2.0.3 → fakedata_python-2.0.4}/MANIFEST.in +0 -0
  8. {fakedata_python-2.0.3 → fakedata_python-2.0.4}/fakedata/__init__.py +0 -0
  9. {fakedata_python-2.0.3 → fakedata_python-2.0.4}/fakedata/core.py +0 -0
  10. {fakedata_python-2.0.3 → fakedata_python-2.0.4}/fakedata/helpers/cardtype.json +0 -0
  11. {fakedata_python-2.0.3 → fakedata_python-2.0.4}/fakedata/helpers/companies.json +0 -0
  12. {fakedata_python-2.0.3 → fakedata_python-2.0.4}/fakedata/helpers/countries.json +0 -0
  13. {fakedata_python-2.0.3 → fakedata_python-2.0.4}/fakedata/helpers/devices.json +0 -0
  14. {fakedata_python-2.0.3 → fakedata_python-2.0.4}/fakedata/helpers/domain.json +0 -0
  15. {fakedata_python-2.0.3 → fakedata_python-2.0.4}/fakedata/helpers/email.json +0 -0
  16. {fakedata_python-2.0.3 → fakedata_python-2.0.4}/fakedata/helpers/first.json +0 -0
  17. {fakedata_python-2.0.3 → fakedata_python-2.0.4}/fakedata/helpers/healthcare.json +0 -0
  18. {fakedata_python-2.0.3 → fakedata_python-2.0.4}/fakedata/helpers/hobbies.json +0 -0
  19. {fakedata_python-2.0.3 → fakedata_python-2.0.4}/fakedata/helpers/industries.json +0 -0
  20. {fakedata_python-2.0.3 → fakedata_python-2.0.4}/fakedata/helpers/job_categories.json +0 -0
  21. {fakedata_python-2.0.3 → fakedata_python-2.0.4}/fakedata/helpers/job_titles.json +0 -0
  22. {fakedata_python-2.0.3 → fakedata_python-2.0.4}/fakedata/helpers/last.json +0 -0
  23. {fakedata_python-2.0.3 → fakedata_python-2.0.4}/fakedata/helpers/locales.json +0 -0
  24. {fakedata_python-2.0.3 → fakedata_python-2.0.4}/fakedata/helpers/middle.json +0 -0
  25. {fakedata_python-2.0.3 → fakedata_python-2.0.4}/fakedata/helpers/occupation.json +0 -0
  26. {fakedata_python-2.0.3 → fakedata_python-2.0.4}/fakedata/helpers/salary_ranges.json +0 -0
  27. {fakedata_python-2.0.3 → fakedata_python-2.0.4}/fakedata/helpers/shortformstate.json +0 -0
  28. {fakedata_python-2.0.3 → fakedata_python-2.0.4}/fakedata/helpers/state.json +0 -0
  29. {fakedata_python-2.0.3 → fakedata_python-2.0.4}/fakedata/helpers/states.json +0 -0
  30. {fakedata_python-2.0.3 → fakedata_python-2.0.4}/fakedata/helpers/street.json +0 -0
  31. {fakedata_python-2.0.3 → fakedata_python-2.0.4}/fakedata/helpers/universities.json +0 -0
  32. {fakedata_python-2.0.3 → fakedata_python-2.0.4}/fakedata/modules/__init__.py +0 -0
  33. {fakedata_python-2.0.3 → fakedata_python-2.0.4}/fakedata/modules/data.py +0 -0
  34. {fakedata_python-2.0.3 → fakedata_python-2.0.4}/fakedata/test_python.py +0 -0
  35. {fakedata_python-2.0.3 → fakedata_python-2.0.4}/fakedata_python.egg-info/SOURCES.txt +0 -0
  36. {fakedata_python-2.0.3 → fakedata_python-2.0.4}/fakedata_python.egg-info/dependency_links.txt +0 -0
  37. {fakedata_python-2.0.3 → fakedata_python-2.0.4}/fakedata_python.egg-info/entry_points.txt +0 -0
  38. {fakedata_python-2.0.3 → fakedata_python-2.0.4}/fakedata_python.egg-info/top_level.txt +0 -0
  39. {fakedata_python-2.0.3 → fakedata_python-2.0.4}/setup.cfg +0 -0
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: fakedata-python
3
- Version: 2.0.3
4
- Summary: The fakedata package generates realistic synthetic user profiles for machine learning, deep learning, data analysis, and data science workflows.
3
+ Version: 2.0.4
4
+ Summary: The fakedata package generates realistic user profiles for machine learning, deep learning, data analysis, and data science workflows.
5
5
  Author-email: abhay557 <contact@abhaymourya.in>
6
6
  License-Expression: MIT
7
7
  Project-URL: Homepage, https://github.com/abhay557/fakedata
@@ -36,6 +36,33 @@ A high-performance, **zero-dependency** synthetic data generation engine, availa
36
36
  - **Time-Series Data**: Generate chronological activity logs (logins, page views, purchases) per user for behavioral modeling.
37
37
  - **Pipeline Ready**: Export directly to CSV, JSON, or Flat objects (perfect for `pandas.DataFrame`).
38
38
  - **CLI Tool**: Generate and export datasets directly from your terminal — no scripting required.
39
+ - **Streaming Generation**: Files are written one record at a time — constant RAM usage regardless of dataset size. Generate 10M+ rows without running out of memory.
40
+
41
+ ---
42
+
43
+ ## Node.js / TypeScript Implementation
44
+
45
+ ### Installation
46
+ ```bash
47
+ npm install @abhay557/fakedata
48
+ ```
49
+
50
+ ### Quick Start
51
+ ```javascript
52
+ const fakedata = require('@abhay557/fakedata');
53
+
54
+ // Generate deterministic users with a 5% missing data rate (null injection)
55
+ const users = fakedata.data.users(1000, { seed: 42, missing_rate: 0.05 });
56
+
57
+ // Export directly to CSV format
58
+ const csvString = fakedata.data.usersToCSV(1000, { seed: 42 });
59
+
60
+ // Time-series activity data
61
+ const ts = fakedata.userTimeSeries({ days: 30, eventsPerDay: 8 });
62
+ console.log(`Generated ${ts.activity.length} events for ${ts.user.fullName}`);
63
+ ```
64
+
65
+ ---
39
66
 
40
67
  ## Python Implementation
41
68
 
@@ -57,7 +84,7 @@ df = pd.DataFrame(fakedata.data.users_flat(10000, {"seed": 42}))
57
84
  print(df.head())
58
85
 
59
86
  # Create time-series activity data
60
- ts = data.user_time_series({"days": 30, "events_per_day": 8})
87
+ ts = fakedata.data.user_time_series({"days": 30, "events_per_day": 8})
61
88
  print(f"Generated {len(ts['activity'])} events for {ts['user']['fullName']}")
62
89
  ```
63
90
 
@@ -112,6 +139,9 @@ fakedata generate -n 500 -l in --seed 42 -o india.json
112
139
  # Fraud detection dataset with 5% anomalies
113
140
  fakedata generate -n 10000 -a 0.05 -f csv -o fraud_data.csv
114
141
 
142
+ # Generate 1 million rows without running out of memory (streaming)
143
+ fakedata generate -n 1000000 -f csv -o big_dataset.csv
144
+
115
145
  # Preview a single user in the console
116
146
  fakedata preview
117
147
 
@@ -119,6 +149,23 @@ fakedata preview
119
149
  fakedata generate -n 100 --timeseries --days 60 -o activity.json
120
150
  ```
121
151
 
152
+ ### Streaming Architecture
153
+
154
+ When writing to a file (`-o`), the CLI uses a **streaming write** strategy:
155
+
156
+ - The output file is **created first**, before any data is generated.
157
+ - Each user is generated **one at a time** and written immediately to disk.
158
+ - The generated object is then **discarded** — it is never held in a large array.
159
+ - **RAM usage stays constant** (O(1)) regardless of how many records you generate.
160
+ - A live progress counter is printed every 10,000 records for large jobs.
161
+
162
+ This means you can generate **tens of millions of rows** without hitting Node.js heap limits or Python memory errors.
163
+
164
+ ```
165
+ Before (old): generate ALL → hold in RAM → write to file ❌ OOM at ~500k rows
166
+ After (new): open file → generate 1 → write → discard → repeat ✅ unlimited
167
+ ```
168
+
122
169
  ---
123
170
  ### sample output - one user
124
171
  ```fakedata.data.user()```
@@ -413,3 +460,19 @@ Distributed under the **MIT License**. See `LICENSE` for more information.
413
460
  - Project Commit History - `https://github.com/abhay557/random-api.xyz`
414
461
 
415
462
  ---
463
+
464
+ ## Contributing
465
+
466
+ Contributions are welcome! Whether it's a bug fix, a new feature, or improved docs — every bit helps.
467
+
468
+ - Read the [Contributing Guide](./CONTRIBUTING.md) before submitting a PR.
469
+ - Use the [Bug Report](https://github.com/abhay557/fakedata/issues/new?template=bug_report.md) template to report issues.
470
+ - Use the [Feature Request](https://github.com/abhay557/fakedata/issues/new?template=feature_request.md) template to suggest ideas.
471
+ - Please follow our [Code of Conduct](./CODE_OF_CONDUCT.md) in all interactions.
472
+
473
+ ```bash
474
+ # Fork the repo, then:
475
+ git clone https://github.com/YOUR_USERNAME/fakedata.git
476
+ git checkout -b feature/my-improvement
477
+ # Make your changes, then open a Pull Request!
478
+ ```
@@ -1,17 +1,3 @@
1
- Metadata-Version: 2.4
2
- Name: fakedata-python
3
- Version: 2.0.3
4
- Summary: The fakedata package generates realistic synthetic user profiles for machine learning, deep learning, data analysis, and data science workflows.
5
- Author-email: abhay557 <contact@abhaymourya.in>
6
- License-Expression: MIT
7
- Project-URL: Homepage, https://github.com/abhay557/fakedata
8
- Classifier: Programming Language :: Python :: 3
9
- Classifier: Operating System :: OS Independent
10
- Requires-Python: >=3.7
11
- Description-Content-Type: text/markdown
12
- License-File: LICENSE
13
- Dynamic: license-file
14
-
15
1
  # fakedata
16
2
 
17
3
  [![NPM Version](https://img.shields.io/npm/v/@abhay557/fakedata?color=red&label=npm)](https://www.npmjs.com/package/@abhay557/fakedata)
@@ -36,6 +22,33 @@ A high-performance, **zero-dependency** synthetic data generation engine, availa
36
22
  - **Time-Series Data**: Generate chronological activity logs (logins, page views, purchases) per user for behavioral modeling.
37
23
  - **Pipeline Ready**: Export directly to CSV, JSON, or Flat objects (perfect for `pandas.DataFrame`).
38
24
  - **CLI Tool**: Generate and export datasets directly from your terminal — no scripting required.
25
+ - **Streaming Generation**: Files are written one record at a time — constant RAM usage regardless of dataset size. Generate 10M+ rows without running out of memory.
26
+
27
+ ---
28
+
29
+ ## Node.js / TypeScript Implementation
30
+
31
+ ### Installation
32
+ ```bash
33
+ npm install @abhay557/fakedata
34
+ ```
35
+
36
+ ### Quick Start
37
+ ```javascript
38
+ const fakedata = require('@abhay557/fakedata');
39
+
40
+ // Generate deterministic users with a 5% missing data rate (null injection)
41
+ const users = fakedata.data.users(1000, { seed: 42, missing_rate: 0.05 });
42
+
43
+ // Export directly to CSV format
44
+ const csvString = fakedata.data.usersToCSV(1000, { seed: 42 });
45
+
46
+ // Time-series activity data
47
+ const ts = fakedata.userTimeSeries({ days: 30, eventsPerDay: 8 });
48
+ console.log(`Generated ${ts.activity.length} events for ${ts.user.fullName}`);
49
+ ```
50
+
51
+ ---
39
52
 
40
53
  ## Python Implementation
41
54
 
@@ -57,7 +70,7 @@ df = pd.DataFrame(fakedata.data.users_flat(10000, {"seed": 42}))
57
70
  print(df.head())
58
71
 
59
72
  # Create time-series activity data
60
- ts = data.user_time_series({"days": 30, "events_per_day": 8})
73
+ ts = fakedata.data.user_time_series({"days": 30, "events_per_day": 8})
61
74
  print(f"Generated {len(ts['activity'])} events for {ts['user']['fullName']}")
62
75
  ```
63
76
 
@@ -112,6 +125,9 @@ fakedata generate -n 500 -l in --seed 42 -o india.json
112
125
  # Fraud detection dataset with 5% anomalies
113
126
  fakedata generate -n 10000 -a 0.05 -f csv -o fraud_data.csv
114
127
 
128
+ # Generate 1 million rows without running out of memory (streaming)
129
+ fakedata generate -n 1000000 -f csv -o big_dataset.csv
130
+
115
131
  # Preview a single user in the console
116
132
  fakedata preview
117
133
 
@@ -119,6 +135,23 @@ fakedata preview
119
135
  fakedata generate -n 100 --timeseries --days 60 -o activity.json
120
136
  ```
121
137
 
138
+ ### Streaming Architecture
139
+
140
+ When writing to a file (`-o`), the CLI uses a **streaming write** strategy:
141
+
142
+ - The output file is **created first**, before any data is generated.
143
+ - Each user is generated **one at a time** and written immediately to disk.
144
+ - The generated object is then **discarded** — it is never held in a large array.
145
+ - **RAM usage stays constant** (O(1)) regardless of how many records you generate.
146
+ - A live progress counter is printed every 10,000 records for large jobs.
147
+
148
+ This means you can generate **tens of millions of rows** without hitting Node.js heap limits or Python memory errors.
149
+
150
+ ```
151
+ Before (old): generate ALL → hold in RAM → write to file ❌ OOM at ~500k rows
152
+ After (new): open file → generate 1 → write → discard → repeat ✅ unlimited
153
+ ```
154
+
122
155
  ---
123
156
  ### sample output - one user
124
157
  ```fakedata.data.user()```
@@ -413,3 +446,19 @@ Distributed under the **MIT License**. See `LICENSE` for more information.
413
446
  - Project Commit History - `https://github.com/abhay557/random-api.xyz`
414
447
 
415
448
  ---
449
+
450
+ ## Contributing
451
+
452
+ Contributions are welcome! Whether it's a bug fix, a new feature, or improved docs — every bit helps.
453
+
454
+ - Read the [Contributing Guide](./CONTRIBUTING.md) before submitting a PR.
455
+ - Use the [Bug Report](https://github.com/abhay557/fakedata/issues/new?template=bug_report.md) template to report issues.
456
+ - Use the [Feature Request](https://github.com/abhay557/fakedata/issues/new?template=feature_request.md) template to suggest ideas.
457
+ - Please follow our [Code of Conduct](./CODE_OF_CONDUCT.md) in all interactions.
458
+
459
+ ```bash
460
+ # Fork the repo, then:
461
+ git clone https://github.com/YOUR_USERNAME/fakedata.git
462
+ git checkout -b feature/my-improvement
463
+ # Make your changes, then open a Pull Request!
464
+ ```
@@ -89,44 +89,93 @@ EXAMPLES:
89
89
  'anomaly_rate': args.anomaly_rate,
90
90
  'missing_rate': args.missing_rate,
91
91
  }
92
- # Remove None values so defaults are used inside the engine
93
92
  options = {k: v for k, v in options.items() if v is not None and v != 0.0}
94
93
 
94
+ count = args.count
95
95
  start = time.time()
96
-
97
- if args.timeseries:
98
- results = [
99
- data.user_time_series({**options, 'days': args.days, 'events_per_day': args.events_per_day})
100
- for _ in range(args.count)
101
- ]
102
- output = json.dumps(results, indent=2 if args.pretty else None)
103
-
104
- elif args.format == 'csv':
105
- output = data.users_to_csv(args.count, options if options else None)
106
-
107
- elif args.format == 'flat':
108
- rows = data.users_flat(args.count, options if options else None)
109
- output = json.dumps(rows, indent=2 if args.pretty else None)
110
-
111
- else: # json
112
- if args.pretty:
113
- output = data.users_to_json(args.count, options if options else None)
96
+ PROGRESS_INTERVAL = 10000
97
+
98
+ # ── stdout: buffer is fine for small terminal output ──────────────
99
+ if not args.output:
100
+ if args.timeseries:
101
+ results = [
102
+ data.user_time_series({**options, 'days': args.days, 'events_per_day': args.events_per_day})
103
+ for _ in range(count)
104
+ ]
105
+ print(json.dumps(results, indent=2 if args.pretty else None))
106
+ elif args.format == 'csv':
107
+ print(data.users_to_csv(count, options if options else None))
108
+ elif args.format == 'flat':
109
+ rows = data.users_flat(count, options if options else None)
110
+ print(json.dumps(rows, indent=2 if args.pretty else None))
114
111
  else:
115
- output = json.dumps(data.users(args.count, options if options else None))
112
+ if args.pretty:
113
+ print(data.users_to_json(count, options if options else None))
114
+ else:
115
+ print(json.dumps(data.users(count, options if options else None)))
116
+ return
117
+
118
+ # ── File: STREAMING — open file first, write one record at a time ──
119
+ out_path = os.path.abspath(args.output)
120
+
121
+ with open(out_path, 'w', encoding='utf-8') as f:
122
+
123
+ if args.timeseries:
124
+ f.write('[\n')
125
+ for i in range(count):
126
+ rec = data.user_time_series({**options, 'days': args.days, 'events_per_day': args.events_per_day})
127
+ line = json.dumps(rec, indent=2 if args.pretty else None)
128
+ f.write(line + (',' if i < count - 1 else '') + '\n')
129
+ if (i + 1) % PROGRESS_INTERVAL == 0:
130
+ print(f"\r ⏳ {i+1:,} / {count:,} written...", end='', file=sys.stderr)
131
+ f.write(']\n')
132
+
133
+ elif args.format == 'csv':
134
+ # Write header from first record
135
+ first = data.user(options if options else None)
136
+ header = ','.join(f'"{k}"' for k in first.keys())
137
+ f.write(header + '\n')
138
+ f.write(_user_to_csv_row(first) + '\n')
139
+ for i in range(1, count):
140
+ u = data.user(options if options else None)
141
+ f.write(_user_to_csv_row(u) + '\n')
142
+ if (i + 1) % PROGRESS_INTERVAL == 0:
143
+ print(f"\r ⏳ {i+1:,} / {count:,} written...", end='', file=sys.stderr)
144
+
145
+ else: # json / flat
146
+ f.write('[\n')
147
+ for i in range(count):
148
+ u = data.user(options if options else None)
149
+ line = json.dumps(u, indent=2 if args.pretty else None)
150
+ f.write(line + (',' if i < count - 1 else '') + '\n')
151
+ if (i + 1) % PROGRESS_INTERVAL == 0:
152
+ print(f"\r ⏳ {i+1:,} / {count:,} written...", end='', file=sys.stderr)
153
+ f.write(']\n')
116
154
 
117
155
  elapsed = round(time.time() - start, 2)
118
-
119
- if args.output:
120
- out_path = os.path.abspath(args.output)
121
- with open(out_path, 'w', encoding='utf-8') as f:
122
- f.write(output)
123
- size_kb = round(len(output.encode('utf-8')) / 1024, 1)
124
- print(
125
- f"✔ Done! Generated {args.count:,} users in {elapsed}s → {out_path} ({size_kb} KB)",
126
- file=sys.stderr
127
- )
156
+ size_bytes = os.path.getsize(out_path)
157
+ size_label = f"{size_bytes / 1048576:.1f} MB" if size_bytes >= 1048576 else f"{size_bytes / 1024:.1f} KB"
158
+ print('\r', end='', file=sys.stderr) # clear progress line
159
+ print(
160
+ f"✔ Done! Generated {count:,} users in {elapsed}s → {out_path} ({size_label})",
161
+ file=sys.stderr
162
+ )
163
+
164
+
165
+ def _user_to_csv_row(u):
166
+ """Serialize a single user dict to a CSV row string."""
167
+ import json as _json
168
+ parts = []
169
+ for v in u.values():
170
+ if v is None:
171
+ parts.append('')
172
+ elif isinstance(v, (dict, list)):
173
+ parts.append('"' + _json.dumps(v).replace('"', '""') + '"')
174
+ elif isinstance(v, str):
175
+ parts.append('"' + v.replace('"', '""') + '"')
128
176
  else:
129
- print(output)
177
+ parts.append(str(v))
178
+ return ','.join(parts)
130
179
 
131
180
 
132
181
  if __name__ == '__main__':
@@ -1,3 +1,17 @@
1
+ Metadata-Version: 2.4
2
+ Name: fakedata-python
3
+ Version: 2.0.4
4
+ Summary: The fakedata package generates realistic user profiles for machine learning, deep learning, data analysis, and data science workflows.
5
+ Author-email: abhay557 <contact@abhaymourya.in>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/abhay557/fakedata
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Operating System :: OS Independent
10
+ Requires-Python: >=3.7
11
+ Description-Content-Type: text/markdown
12
+ License-File: LICENSE
13
+ Dynamic: license-file
14
+
1
15
  # fakedata
2
16
 
3
17
  [![NPM Version](https://img.shields.io/npm/v/@abhay557/fakedata?color=red&label=npm)](https://www.npmjs.com/package/@abhay557/fakedata)
@@ -22,6 +36,33 @@ A high-performance, **zero-dependency** synthetic data generation engine, availa
22
36
  - **Time-Series Data**: Generate chronological activity logs (logins, page views, purchases) per user for behavioral modeling.
23
37
  - **Pipeline Ready**: Export directly to CSV, JSON, or Flat objects (perfect for `pandas.DataFrame`).
24
38
  - **CLI Tool**: Generate and export datasets directly from your terminal — no scripting required.
39
+ - **Streaming Generation**: Files are written one record at a time — constant RAM usage regardless of dataset size. Generate 10M+ rows without running out of memory.
40
+
41
+ ---
42
+
43
+ ## Node.js / TypeScript Implementation
44
+
45
+ ### Installation
46
+ ```bash
47
+ npm install @abhay557/fakedata
48
+ ```
49
+
50
+ ### Quick Start
51
+ ```javascript
52
+ const fakedata = require('@abhay557/fakedata');
53
+
54
+ // Generate deterministic users with a 5% missing data rate (null injection)
55
+ const users = fakedata.data.users(1000, { seed: 42, missing_rate: 0.05 });
56
+
57
+ // Export directly to CSV format
58
+ const csvString = fakedata.data.usersToCSV(1000, { seed: 42 });
59
+
60
+ // Time-series activity data
61
+ const ts = fakedata.userTimeSeries({ days: 30, eventsPerDay: 8 });
62
+ console.log(`Generated ${ts.activity.length} events for ${ts.user.fullName}`);
63
+ ```
64
+
65
+ ---
25
66
 
26
67
  ## Python Implementation
27
68
 
@@ -43,7 +84,7 @@ df = pd.DataFrame(fakedata.data.users_flat(10000, {"seed": 42}))
43
84
  print(df.head())
44
85
 
45
86
  # Create time-series activity data
46
- ts = data.user_time_series({"days": 30, "events_per_day": 8})
87
+ ts = fakedata.data.user_time_series({"days": 30, "events_per_day": 8})
47
88
  print(f"Generated {len(ts['activity'])} events for {ts['user']['fullName']}")
48
89
  ```
49
90
 
@@ -98,6 +139,9 @@ fakedata generate -n 500 -l in --seed 42 -o india.json
98
139
  # Fraud detection dataset with 5% anomalies
99
140
  fakedata generate -n 10000 -a 0.05 -f csv -o fraud_data.csv
100
141
 
142
+ # Generate 1 million rows without running out of memory (streaming)
143
+ fakedata generate -n 1000000 -f csv -o big_dataset.csv
144
+
101
145
  # Preview a single user in the console
102
146
  fakedata preview
103
147
 
@@ -105,6 +149,23 @@ fakedata preview
105
149
  fakedata generate -n 100 --timeseries --days 60 -o activity.json
106
150
  ```
107
151
 
152
+ ### Streaming Architecture
153
+
154
+ When writing to a file (`-o`), the CLI uses a **streaming write** strategy:
155
+
156
+ - The output file is **created first**, before any data is generated.
157
+ - Each user is generated **one at a time** and written immediately to disk.
158
+ - The generated object is then **discarded** — it is never held in a large array.
159
+ - **RAM usage stays constant** (O(1)) regardless of how many records you generate.
160
+ - A live progress counter is printed every 10,000 records for large jobs.
161
+
162
+ This means you can generate **tens of millions of rows** without hitting Node.js heap limits or Python memory errors.
163
+
164
+ ```
165
+ Before (old): generate ALL → hold in RAM → write to file ❌ OOM at ~500k rows
166
+ After (new): open file → generate 1 → write → discard → repeat ✅ unlimited
167
+ ```
168
+
108
169
  ---
109
170
  ### sample output - one user
110
171
  ```fakedata.data.user()```
@@ -399,3 +460,19 @@ Distributed under the **MIT License**. See `LICENSE` for more information.
399
460
  - Project Commit History - `https://github.com/abhay557/random-api.xyz`
400
461
 
401
462
  ---
463
+
464
+ ## Contributing
465
+
466
+ Contributions are welcome! Whether it's a bug fix, a new feature, or improved docs — every bit helps.
467
+
468
+ - Read the [Contributing Guide](./CONTRIBUTING.md) before submitting a PR.
469
+ - Use the [Bug Report](https://github.com/abhay557/fakedata/issues/new?template=bug_report.md) template to report issues.
470
+ - Use the [Feature Request](https://github.com/abhay557/fakedata/issues/new?template=feature_request.md) template to suggest ideas.
471
+ - Please follow our [Code of Conduct](./CODE_OF_CONDUCT.md) in all interactions.
472
+
473
+ ```bash
474
+ # Fork the repo, then:
475
+ git clone https://github.com/YOUR_USERNAME/fakedata.git
476
+ git checkout -b feature/my-improvement
477
+ # Make your changes, then open a Pull Request!
478
+ ```
@@ -4,11 +4,11 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "fakedata-python"
7
- version = "2.0.3"
7
+ version = "2.0.4"
8
8
  authors = [
9
9
  { name="abhay557", email="contact@abhaymourya.in" },
10
10
  ]
11
- description = "The fakedata package generates realistic synthetic user profiles for machine learning, deep learning, data analysis, and data science workflows."
11
+ description = "The fakedata package generates realistic user profiles for machine learning, deep learning, data analysis, and data science workflows."
12
12
  readme = "README.md"
13
13
  license = "MIT"
14
14
  requires-python = ">=3.7"
File without changes