fakedata-python 2.0.0__py3-none-any.whl → 2.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
fakedata/cli.py ADDED
@@ -0,0 +1,133 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ fakedata CLI - ML-Ready Synthetic Data Engine
4
+ """
5
+ import argparse
6
+ import json
7
+ import sys
8
+ import time
9
+ import os
10
+
11
+ def main():
12
+ parser = argparse.ArgumentParser(
13
+ prog='fakedata',
14
+ description='fakedata - ML-Ready Synthetic Data Engine (Python)',
15
+ formatter_class=argparse.RawTextHelpFormatter,
16
+ epilog="""
17
+ EXAMPLES:
18
+ # Generate 1000 users to a CSV file
19
+ fakedata generate -n 1000 -f csv -o dataset.csv
20
+
21
+ # Generate 500 deterministic Indian users
22
+ fakedata generate -n 500 -l in --seed 42 -o india.json
23
+
24
+ # Generate fraud detection dataset with 5%% anomalies
25
+ fakedata generate -n 10000 -a 0.05 -f csv -o fraud_data.csv
26
+
27
+ # Preview a single user profile
28
+ fakedata preview
29
+
30
+ # Generate with time-series activity logs
31
+ fakedata generate -n 100 --timeseries --days 60 -o activity.json
32
+ """
33
+ )
34
+
35
+ subparsers = parser.add_subparsers(dest='command', help='Command to run')
36
+
37
+ # ─── preview ──────────────────────────────────────────────────────────────
38
+ subparsers.add_parser('preview', help='Print a single user profile to the console')
39
+
40
+ # ─── generate ─────────────────────────────────────────────────────────────
41
+ gen = subparsers.add_parser('generate', help='Generate synthetic user data')
42
+ gen.add_argument('-n', '--count', type=int, default=10,
43
+ help='Number of users to generate (default: 10)')
44
+ gen.add_argument('-f', '--format', choices=['json', 'csv', 'flat'], default='json',
45
+ help='Output format: json | csv | flat (default: json)')
46
+ gen.add_argument('-o', '--output', type=str, default=None,
47
+ help='Output file path (default: stdout)')
48
+ gen.add_argument('-s', '--seed', type=int, default=None,
49
+ help='Random seed for reproducibility')
50
+ gen.add_argument('-l', '--locale', type=str, default=None,
51
+ help='Locale: en|in|jp|kr|de|br|ar|fr (default: en)')
52
+ gen.add_argument('-a', '--anomaly-rate', type=float, default=0.0,
53
+ help='Fraction of anomalous users 0-1 (default: 0)')
54
+ gen.add_argument('-m', '--missing-rate', type=float, default=0.0,
55
+ help='Fraction of null fields 0-1 (default: 0)')
56
+ gen.add_argument('-t', '--timeseries', action='store_true',
57
+ help='Include time-series activity logs')
58
+ gen.add_argument('--days', type=int, default=30,
59
+ help='Days of activity for time-series (default: 30)')
60
+ gen.add_argument('--events-per-day', type=int, default=8,
61
+ help='Average events per day for time-series (default: 8)')
62
+ gen.add_argument('--pretty', action='store_true',
63
+ help='Pretty-print JSON output')
64
+
65
+ args = parser.parse_args()
66
+
67
+ if args.command is None or args.command == 'help':
68
+ parser.print_help()
69
+ return
70
+
71
+ # ─── Import the engine (lazy, only when needed) ───────────────────────────
72
+ try:
73
+ import fakedata.modules.data as data
74
+ except ImportError:
75
+ print("ERROR: Could not import fakedata. Make sure it is installed: pip install fakedata-python", file=sys.stderr)
76
+ sys.exit(1)
77
+
78
+ # ─── Preview ─────────────────────────────────────────────────────────────
79
+ if args.command == 'preview':
80
+ u = data.user()
81
+ print(json.dumps(u, indent=2))
82
+ return
83
+
84
+ # ─── Generate ────────────────────────────────────────────────────────────
85
+ if args.command == 'generate':
86
+ options = {
87
+ 'seed': args.seed,
88
+ 'locale': args.locale,
89
+ 'anomaly_rate': args.anomaly_rate,
90
+ 'missing_rate': args.missing_rate,
91
+ }
92
+ # Remove None values so defaults are used inside the engine
93
+ options = {k: v for k, v in options.items() if v is not None and v != 0.0}
94
+
95
+ start = time.time()
96
+
97
+ if args.timeseries:
98
+ results = [
99
+ data.user_time_series({**options, 'days': args.days, 'events_per_day': args.events_per_day})
100
+ for _ in range(args.count)
101
+ ]
102
+ output = json.dumps(results, indent=2 if args.pretty else None)
103
+
104
+ elif args.format == 'csv':
105
+ output = data.users_to_csv(args.count, options if options else None)
106
+
107
+ elif args.format == 'flat':
108
+ rows = data.users_flat(args.count, options if options else None)
109
+ output = json.dumps(rows, indent=2 if args.pretty else None)
110
+
111
+ else: # json
112
+ if args.pretty:
113
+ output = data.users_to_json(args.count, options if options else None)
114
+ else:
115
+ output = json.dumps(data.users(args.count, options if options else None))
116
+
117
+ elapsed = round(time.time() - start, 2)
118
+
119
+ if args.output:
120
+ out_path = os.path.abspath(args.output)
121
+ with open(out_path, 'w', encoding='utf-8') as f:
122
+ f.write(output)
123
+ size_kb = round(len(output.encode('utf-8')) / 1024, 1)
124
+ print(
125
+ f"✔ Done! Generated {args.count:,} users in {elapsed}s → {out_path} ({size_kb} KB)",
126
+ file=sys.stderr
127
+ )
128
+ else:
129
+ print(output)
130
+
131
+
132
+ if __name__ == '__main__':
133
+ main()
fakedata/modules/data.py CHANGED
@@ -1,6 +1,7 @@
1
1
  import random
2
2
  import string
3
3
  import math
4
+ import datetime
4
5
  from ..core import load_data
5
6
 
6
7
  # ─── Data Loading ───────────────────────────────────────────────────────────
@@ -32,6 +33,16 @@ healthcare_data = load_data('healthcare.json')
32
33
  # Phase 4 datasets
33
34
  locales_data = load_data('locales.json')
34
35
 
36
+ # ─── Personas for Realistic Distribution ───────────────────────────────────
37
+ PERSONAS = [
38
+ {"type": "Executive", "weight": 5, "income_mult": 3.5, "tech_bias": "Apple", "lifestyle": "luxury"},
39
+ {"type": "Tech Professional", "weight": 20, "income_mult": 2.2, "tech_bias": "High-End", "lifestyle": "modern"},
40
+ {"type": "Student", "weight": 15, "income_mult": 0.4, "tech_bias": "Mid-Range", "lifestyle": "frugal"},
41
+ {"type": "Manual Laborer", "weight": 25, "income_mult": 0.9, "tech_bias": "Budget", "lifestyle": "basic"},
42
+ {"type": "Service Worker", "weight": 25, "income_mult": 0.8, "tech_bias": "Budget", "lifestyle": "basic"},
43
+ {"type": "Freelancer", "weight": 10, "income_mult": 1.2, "tech_bias": "Mid-Range", "lifestyle": "flexible"}
44
+ ]
45
+
35
46
 
36
47
  # ─── Utility Functions ──────────────────────────────────────────────────────
37
48
  def get_random(arr):
@@ -97,10 +108,22 @@ EDUCATION_FIELDS = [
97
108
  ]
98
109
 
99
110
 
100
- def generate_education(age):
111
+ def generate_education(age, persona):
101
112
  # Filter education levels by age eligibility
102
113
  eligible = [e for e in EDUCATION_LEVELS if age >= e["min_age"]]
103
- weights = [e["weight"] for e in eligible]
114
+
115
+ # Adjust weights based on persona
116
+ weights = []
117
+ for e in eligible:
118
+ w = e["weight"]
119
+ if persona["type"] == "Executive" and e["level"] in ["Master's", "PhD"]:
120
+ w *= 3
121
+ if persona["type"] == "Manual Laborer" and e["level"] in ["High School", "Dropout"]:
122
+ w *= 2
123
+ if persona["type"] == "Student" and age < 25:
124
+ w *= 2
125
+ weights.append(w)
126
+
104
127
  selected_index = weighted_random(weights)
105
128
  selected = eligible[selected_index]
106
129
 
@@ -169,16 +192,20 @@ WORK_MODES = [
169
192
  ]
170
193
 
171
194
 
172
- def generate_employment(age, education):
195
+ def generate_employment(age, education, persona):
173
196
  # Filter eligible statuses by age
174
197
  eligible = [s for s in EMPLOYMENT_STATUSES if age >= s["min_age"] and age <= s["max_age"]]
175
198
 
176
- # Boost weights contextually
199
+ # Boost weights based on persona
177
200
  weights = []
178
201
  for s in eligible:
179
202
  w = s["weight"]
180
- if s["status"] == "student" and age < 25:
203
+ if persona["type"] == "Executive" and s["status"] == "employed":
181
204
  w *= 2
205
+ if persona["type"] == "Freelancer" and s["status"] == "freelancer":
206
+ w *= 5
207
+ if persona["type"] == "Student" and s["status"] == "student":
208
+ w *= 4
182
209
  if s["status"] == "retired" and age >= 65:
183
210
  w *= 3
184
211
  weights.append(w)
@@ -250,7 +277,7 @@ TAX_BRACKETS = [
250
277
  ]
251
278
 
252
279
 
253
- def generate_financial(age, education, employment):
280
+ def generate_financial(age, education, employment, persona):
254
281
  # Base income multipliers by education level
255
282
  income_multipliers = {
256
283
  "High School": 1.0, "Associate's": 1.3, "Bachelor's": 1.8,
@@ -277,7 +304,9 @@ def generate_financial(age, education, employment):
277
304
  sr = salary_ranges_data[role_key]
278
305
  base_salary_inr = normal_random(sr["median"], (sr["p75"] - sr["p25"]) / 2)
279
306
  base_salary_usd = round(clamp(base_salary_inr / 80, 15000, 500000))
280
- annual_income = round(base_salary_usd * edu_multiplier * age_factor)
307
+
308
+ # Income influenced by Persona and Education
309
+ annual_income = round(base_salary_usd * edu_multiplier * age_factor * persona["income_mult"])
281
310
  elif employment["status"] == "retired":
282
311
  annual_income = round(random.uniform(20000, 60000))
283
312
  else:
@@ -994,14 +1023,17 @@ def generate_single_user(id_index=None, schema=None, locale=None):
994
1023
  # Weighted age generation
995
1024
  age = generate_age()
996
1025
 
1026
+ # Pick a Persona to drive statistical correlations
1027
+ persona = PERSONAS[weighted_random([p["weight"] for p in PERSONAS])]
1028
+
997
1029
  # Correlated education
998
- education = generate_education(age)
1030
+ education = generate_education(age, persona)
999
1031
 
1000
1032
  # Correlated employment
1001
- employment = generate_employment(age, education)
1033
+ employment = generate_employment(age, education, persona)
1002
1034
 
1003
1035
  # Correlated financial profile
1004
- financial = generate_financial(age, education, employment)
1036
+ financial = generate_financial(age, education, employment, persona)
1005
1037
 
1006
1038
  # Demographics
1007
1039
  demographics = generate_demographics(age)
@@ -1111,7 +1143,12 @@ def generate_single_user(id_index=None, schema=None, locale=None):
1111
1143
  "cardCvv": str(card_cvv),
1112
1144
  },
1113
1145
  "hobbies": user_hobbies,
1114
- "technology_profile": tech_profile
1146
+ "technology_profile": tech_profile,
1147
+ "persona": persona["type"],
1148
+ "metadata": {
1149
+ "version": "2.1.0",
1150
+ "generation_timestamp": datetime.datetime.utcnow().isoformat() + "Z"
1151
+ }
1115
1152
  }
1116
1153
 
1117
1154
  # Phase 4: Apply locale country override
@@ -0,0 +1,434 @@
1
+ Metadata-Version: 2.4
2
+ Name: fakedata-python
3
+ Version: 2.0.2
4
+ Summary: The fakedata package generates realistic synthetic user profiles for machine learning, deep learning, data analysis, and data science workflows.
5
+ Author-email: abhay557 <contact@abhaymourya.in>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/abhay557/fakedata
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Operating System :: OS Independent
10
+ Requires-Python: >=3.7
11
+ Description-Content-Type: text/markdown
12
+
13
+ # fakedata
14
+
15
+ [![NPM Version](https://img.shields.io/npm/v/@abhay557/fakedata?color=red&label=npm)](https://www.npmjs.com/package/@abhay557/fakedata)
16
+ [![PyPI Version](https://img.shields.io/pypi/v/fakedata-python?color=blue&label=pypi)](https://pypi.org/project/fakedata-python/)
17
+ [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](https://opensource.org/licenses/MIT)
18
+ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/16N9x1YCOVVvIF8rl7IQxKRkK4en_g3Gi?usp=sharing)
19
+ [![PyPI Downloads](https://static.pepy.tech/personalized-badge/fakedata-python?period=total&units=INTERNATIONAL_SYSTEM&left_color=BLACK&right_color=GREEN&left_text=downloads)](https://pepy.tech/projects/fakedata-python)
20
+
21
+ A high-performance, **zero-dependency** synthetic data generation engine, available for both **Node.js** and **Python**. Designed specifically for machine learning, data science, and analytics workflows, providing 100% data parity across platforms.
22
+
23
+ ## Overview
24
+
25
+ `fakedata` has been completely rebuilt from the ground up to serve as an **ML-ready synthetic data engine**. It generates deeply interconnected user profiles with **112 flat columns across 13 domains** (Health, Financial, Employment, Digital Footprint, etc.), making it the perfect tool for training models, benchmarking pipelines, or simulating realistic databases.
26
+
27
+ ### Machine Learning Power Features:
28
+ - **Behavioral Personas**: Orchestrate correlations through 6 distinct personas (e.g., Executive, Student, Tech Pro) to ensure realistic socio-economic patterns.
29
+ - **Seed Reproducibility**: Generate byte-for-byte identical datasets across runs (and languages!) using `seed`.
30
+ - **Schema Overrides**: Force specific distributions (e.g., age ranges, income brackets, genders) using `schema`.
31
+ - **Locale-Aware Generation**: Support for 8 culture-specific name sets and phone formats (`en`, `in`, `jp`, `kr`, `de`, `br`, `ar`, `fr`).
32
+ - **Missing Data Simulation**: Automatically inject realistic nulls using `missing_rate` to test your data imputation pipelines.
33
+ - **Anomaly Injection**: Inject fraud/outlier profiles (e.g., impossible geography, credit fraud, income spikes) using `anomaly_rate`.
34
+ - **Time-Series Data**: Generate chronological activity logs (logins, page views, purchases) per user for behavioral modeling.
35
+ - **Pipeline Ready**: Export directly to CSV, JSON, or Flat objects (perfect for `pandas.DataFrame`).
36
+ - **CLI Tool**: Generate and export datasets directly from your terminal — no scripting required.
37
+
38
+ ---
39
+
40
+ ## Python Implementation
41
+
42
+ ### Installation
43
+ ```bash
44
+ pip install fakedata-python
45
+ ```
46
+
47
+ ### Quick Start
48
+ ```python
49
+ import fakedata.data as data
50
+ import pandas as pd
51
+
52
+ # Generate 10,000 highly correlated users deterministically
53
+ users = data.users(10000, {"seed": 42})
54
+
55
+ # Or export directly to a Pandas DataFrame
56
+ df = pd.DataFrame(data.users_flat(10000, {"seed": 42}))
57
+ print(df.head())
58
+
59
+ # Create time-series activity data
60
+ ts = data.user_time_series({"days": 30, "events_per_day": 8})
61
+ print(f"Generated {len(ts['activity'])} events for {ts['user']['fullName']}")
62
+ ```
63
+ ## Node.js / TypeScript Implementation
64
+
65
+ ### Installation
66
+ ```bash
67
+ npm install @abhay557/fakedata
68
+ ```
69
+
70
+ ### Quick Start
71
+ ```javascript
72
+ const { data } = require('@abhay557/fakedata');
73
+
74
+ // Generate deterministic users with a 5% missing data rate (null injection)
75
+ const users = data.users(1000, { seed: 42, missing_rate: 0.05 });
76
+
77
+ // Export directly to CSV format
78
+ const csvString = data.usersToCSV(1000, { seed: 42 });
79
+
80
+ // Time-series activity data
81
+ const ts = data.userTimeSeries({ days: 30, eventsPerDay: 8 });
82
+ console.log(`Generated ${ts.activity.length} events for ${ts.user.fullName}`);
83
+ ```
84
+
85
+ ---
86
+
87
+ ## CLI — Command Line Interface
88
+
89
+ After installing, use `fakedata` directly from your terminal. No scripts needed!
90
+
91
+ ### Node.js (global install)
92
+ ```bash
93
+ npm install -g @abhay557/fakedata
94
+ ```
95
+
96
+ ### Python (global install)
97
+ ```bash
98
+ pip install fakedata-python
99
+ ```
100
+
101
+ ### CLI Commands
102
+
103
+ | Command | Description |
104
+ |:---|:---|
105
+ | `fakedata generate` | Generate synthetic user data |
106
+ | `fakedata preview` | Print a single user profile to the console |
107
+ | `fakedata --help` | Show all available options (the Python CLI defines only the `generate` and `preview` sub-commands) |
108
+
109
+ ### CLI Options
110
+
111
+ | Flag | Default | Description |
112
+ |:---|:---|:---|
113
+ | `-n`, `--count` | `10` | Number of users to generate |
114
+ | `-f`, `--format` | `json` | Output format: `json` \| `csv` \| `flat` |
115
+ | `-o`, `--output` | stdout | Output file path |
116
+ | `-s`, `--seed` | none | Random seed for reproducibility |
117
+ | `-l`, `--locale` | `en` | Locale: `en` \| `in` \| `jp` \| `kr` \| `de` \| `br` \| `ar` \| `fr` |
118
+ | `-a`, `--anomaly-rate` | `0` | Fraction of anomalous users (0–1) |
119
+ | `-m`, `--missing-rate` | `0` | Fraction of null fields (0–1) |
120
+ | `-t`, `--timeseries` | — | Include time-series activity logs |
121
+ | `--days` | `30` | Days of activity for time-series |
122
+ | `--pretty` | — | Pretty-print JSON output |
123
+
124
+ ### Examples
125
+
126
+ ```bash
127
+ # Generate 1000 users and save as CSV
128
+ fakedata generate -n 1000 -f csv -o dataset.csv
129
+
130
+ # Generate 500 deterministic Indian users
131
+ fakedata generate -n 500 -l in --seed 42 -o india.json
132
+
133
+ # Fraud detection dataset with 5% anomalies
134
+ fakedata generate -n 10000 -a 0.05 -f csv -o fraud_data.csv
135
+
136
+ # Preview a single user in the console
137
+ fakedata preview
138
+
139
+ # Time-series activity logs for 100 users
140
+ fakedata generate -n 100 --timeseries --days 60 -o activity.json
141
+ ```
142
+
143
+ ---
144
+ ### Sample Output — One User
145
+ ```fakedata.data.user()```
146
+ ```fakedata.data.users(n)  # set n = 100```
147
+
148
+ ```json
149
+ "id": "4612",
150
+ "fullName": "Damaris Carlo Ebervale",
151
+ "firstName": "Damaris",
152
+ "lastName": "Ebervale",
153
+ "middleName": "Carlo",
154
+ "age": 31,
155
+ "gender": "non-binary",
156
+ "email": "damaris.ebervale@liberomail.com",
157
+ "phone": "+1 7469125114",
158
+ "username": "damaris_4612",
159
+ "password": "UQ!VZr0cLUD9",
160
+ "birthDate": "1995-07-19",
161
+ "bloodGroup": "+B",
162
+ "height": 185,
163
+ "weight": 60,
164
+ "domain": "damarisebervale.vg",
165
+ "ip": "48.50.80.113",
166
+ "macaddress": "33:2F:39:EE:3B:1E",
167
+ "address": {
168
+ "street": "3623 Chateau Lane",
169
+ "city": "Kilgore",
170
+ "state": "Texas",
171
+ "country": "Sierra Leone",
172
+ "countryCode": "SL",
173
+ "zipCode": 36434,
174
+ "coordinates": {
175
+ "latitude": "-68.324385",
176
+ "longitude": "55.859967"
177
+ }
178
+ },
179
+ "demographics": {
180
+ "ethnicity": "Hispanic",
181
+ "nationality": "South Korean",
182
+ "language": {
183
+ "primary": "Arabic",
184
+ "secondary": "Turkish"
185
+ },
186
+ "relationshipStatus": "dating"
187
+ },
188
+ "education": {
189
+ "level": "Bachelor's",
190
+ "field": "Computer Science",
191
+ "institution": "Agricultural University of Lublin",
192
+ "institutionCountry": "Poland",
193
+ "gpa": 2.79,
194
+ "graduationYear": 2017,
195
+ "studentDebt": 64117
196
+ },
197
+ "employment": {
198
+ "status": "self-employed",
199
+ "company": "China CITIC Bank",
200
+ "companySize": "enterprise",
201
+ "industry": "Banking",
202
+ "jobTitle": "\"ORACLE DBA\"",
203
+ "jobCategory": "Network Engineering",
204
+ "yearsExperience": 10,
205
+ "workMode": "onsite",
206
+ "workHoursPerWeek": 36,
207
+ "jobSatisfaction": 6
208
+ },
209
+ "financial": {
210
+ "annualIncome": 21600,
211
+ "creditScore": 464,
212
+ "savings": 1680,
213
+ "monthlyExpenses": 1309,
214
+ "debtToIncome": 3.12,
215
+ "taxBracket": "12%",
216
+ "investmentStyle": "moderate",
217
+ "homeOwnership": "own"
218
+ },
219
+ "health": {
220
+ "bmi": 17.5,
221
+ "bmiCategory": "underweight",
222
+ "bloodPressure": {
223
+ "systolic": 100,
224
+ "diastolic": 82
225
+ },
226
+ "exerciseFrequency": "3-4 times/week",
227
+ "smoking": "never",
228
+ "alcohol": "never",
229
+ "sleepHoursPerNight": 8.3,
230
+ "sleepQuality": "poor",
231
+ "diet": "mediterranean",
232
+ "medicalCondition": "None",
233
+ "insuranceProvider": "UnitedHealthcare",
234
+ "medications": [
235
+ "Lisinopril"
236
+ ],
237
+ "lastCheckupMonthsAgo": 11,
238
+ "hasDisability": false,
239
+ "mentalHealth": "poor",
240
+ "vaccination": "partially vaccinated"
241
+ },
242
+ "social": {
243
+ "socialMedia": {
244
+ "platforms": [
245
+ "Pinterest",
246
+ "Twitter/X",
247
+ "Reddit",
248
+ "Instagram"
249
+ ],
250
+ "screenTimeHoursPerDay": 3.8,
251
+ "preferredContent": "video"
252
+ },
253
+ "shopping": {
254
+ "frequency": "weekly",
255
+ "preferredCategories": [
256
+ "toys & games",
257
+ "books"
258
+ ],
259
+ "monthlyOnlineSpending": 175
260
+ },
261
+ "newsSource": "social media",
262
+ "travelFrequency": "weekly",
263
+ "volunteers": false,
264
+ "pet": "multiple"
265
+ },
266
+ "digitalFootprint": {
267
+ "accountCreatedAt": "2021-04-01T09:59:41.867116+00:00",
268
+ "lastLoginAt": "2026-04-24T09:59:41.867116+00:00",
269
+ "lastPasswordChangeAt": "2025-11-06T09:59:41.867116+00:00",
270
+ "userAgent": "Mozilla/5.0 (Linux; Android 14; Pixel 8) AppleWebKit/537.36 Chrome/121.0.0.0 Mobile Safari/537.36",
271
+ "browser": "Chrome",
272
+ "os": "Windows 11",
273
+ "referrer": "facebook.com",
274
+ "avgSessionMinutes": 17.6,
275
+ "sessionsPerWeek": 10,
276
+ "totalSessions": 2666,
277
+ "twoFactorEnabled": false,
278
+ "preferredLanguage": "de",
279
+ "accountStatus": "inactive",
280
+ "verifiedEmail": false,
281
+ "verifiedPhone": true
282
+ },
283
+ "bank": {
284
+ "nameOnCard": "Damaris Carlo Ebervale",
285
+ "cardNumber": "2289970210128357",
286
+ "cardType": "Mastercard",
287
+ "cardExpiry": "5/29",
288
+ "cardCvv": "355"
289
+ },
290
+ "hobbies": [
291
+ "Knitting",
292
+ "Gardening",
293
+ "LARPing"
294
+ ],
295
+ "technology_profile": {
296
+ "devices": {
297
+ "additional_devices": [
298
+ "BlackBerry Bold 9790",
299
+ "Nokia N9"
300
+ ],
301
+ "smartphone": "Sony Ericsson Xperia X10"
302
+ },
303
+ "phone_preferences": {
304
+ "critical_features": [
305
+ "security features",
306
+ "reliability",
307
+ "5G connectivity"
308
+ ],
309
+ "primary_uses": [
310
+ "photography",
311
+ "education",
312
+ "organization"
313
+ ]
314
+ },
315
+ "interest": [
316
+ "Knitting",
317
+ "Gardening",
318
+ "LARPing"
319
+ ]
320
+ }
321
+ }
322
+
323
+ ```
324
+ ---
325
+
326
+ ## Advanced Features Reference
327
+
328
+ Both Python and JS/TS expose the same underlying engine options.
329
+
330
+ ### 1. Configuration Options
331
+ Pass an `options` dictionary/object to `data.user(options)` or `data.users(n, options)`:
332
+
333
+ ```javascript
334
+ const options = {
335
+ seed: 42, // Number: Ensures deterministic, byte-for-byte identical output
336
+ missing_rate: 0.05, // Float (0-1): 5% chance of any leaf field being null
337
+ locale: 'jp', // String: 'en', 'in', 'jp', 'kr', 'de', 'br', 'ar', 'fr'
338
+ anomaly_rate: 0.05, // Float (0-1): 5% of users will have injected fraud anomalies
339
+ days: 30, // Number: Days of time-series activity to generate
340
+ eventsPerDay: 8, // Number: Average events per day for time-series logs
341
+
342
+ // Schema Constraints (force specific data distributions)
343
+ schema: {
344
+ age: { min: 25, max: 40 }, // Can also use { exact: 30 }
345
+ gender: "female", // "male", "female", or "non-binary"
346
+ employment: { status: "employed" },
347
+ education: { level: "Master's" },
348
+ financial: { annualIncome: { min: 60000, max: 120000 } },
349
+ health: { medicalCondition: "Diabetes" },
350
+ address: { country: "Japan" },
351
+ height: { min: 160, max: 180 },
352
+ weight: { min: 50, max: 80 }
353
+ }
354
+ }
355
+ ```
356
+
357
+ ### 2. Supported API Methods
358
+
359
+ | Method (JS) | Method (Python) | Description |
360
+ | :--- | :--- | :--- |
361
+ | `data.user(opts?)` | `data.user(opts=None)` | Generate a single complex user profile. |
362
+ | `data.users(n, opts?)` | `data.users(n, opts=None)` | Generate an array/list of `n` users. |
363
+ | `data.userTimeSeries(opts)` | `data.user_time_series(opts)`| Returns `{ user, activity }` containing chronological event logs. |
364
+ | `data.usersFlat(n, opts?)` | `data.users_flat(n, opts=None)`| Returns flat dicts/objects, perfect for `pandas.DataFrame` ingestion. |
365
+ | `data.usersToCSV(n, opts?)` | `data.users_to_csv(n, opts=None)`| Returns a fully formatted CSV string (112 columns). |
366
+ | `data.usersToJSON(n, opts?)`| `data.users_to_json(n, opts=None)`| Returns a pretty-printed JSON string. |
367
+
368
+ ### 3. Behavioral Personas (Statistical Modeling)
369
+ To ensure the data is useful for **Clustering** and **Regression** analysis, `fakedata` uses a **Persona-driven engine**. Every user is assigned one of 6 personas that orchestrate their life outcomes:
370
+
371
+ - **Executive**: High income, high education (Master's/PhD), premium Apple devices, luxury lifestyle.
372
+ - **Tech Professional**: High income, high-end hardware, heavy social media use, remote work bias.
373
+ - **Student**: Low income, high student debt, budget/mid-range tech, high social media footprint.
374
+ - **Manual Laborer / Service Worker**: Budget-conscious, steady income, consistent employment patterns.
375
+ - **Freelancer**: Flexible work modes, variable income ranges, mid-range tech profile.
376
+
377
+ These personas ensure that an analyst looking at your synthetic data will find **statistically significant clusters** rather than just a uniform cloud of random values.
378
+
379
+ ---
380
+
381
+ ## Data Structure Highlights (112 Columns)
382
+
383
+ ### 3. Locale-Aware Name Generation
384
+ Supports 8 locales with culturally accurate first names, last names, and country/phone codes:
385
+ - `'in'`: Aarav Sharma, Priya Patel (+91, India)
386
+ - `'jp'`: Haruto Tanaka, Sakura Sato (+81, Japan)
387
+ - `'kr'`: Minjun Kim, Seo-yeon Park (+82, South Korea)
388
+ - `'de'`: Lukas Müller, Mia Schmidt (+49, Germany)
389
+ - `'br'`: Miguel Silva, Alice Santos (+55, Brazil)
390
+ - `'ar'`: Mohammed Al-Ahmed, Fatima Khalil (+966, Saudi Arabia)
391
+ - `'fr'`: Gabriel Martin, Emma Dubois (+33, France)
392
+ - `'en'`: James Smith, Mary Johnson (+1, United States)
393
+
394
+ ### 4. Time-Series Activity Data
395
+ Generate chronological behavioral logs for users. Event types include `login`, `page_view`, `purchase`, `search`, `click`, `logout`, `api_call`, `upload`, `download`, and `comment`.
396
+
397
+ ```javascript
398
+ const ts = data.userTimeSeries({ seed: 42, days: 30, eventsPerDay: 8 });
399
+ // ts.user → Full user profile
400
+ // ts.activity → [{ timestamp, type, page, duration, device, ip, success, amount?, query? }]
401
+ ```
402
+
403
+ ### 5. Anomaly Injection Engine (Fraud Detection)
404
+ When `anomaly_rate` is > 0, `fakedata` injects ML-detectable fraud patterns into the dataset. Affected users receive a special `_anomaly` flag object indicating the fraud type.
405
+
406
+ | Anomaly Type | Effect |
407
+ |:---|:---|
408
+ | `income_spike` | Income multiplied 5-15x |
409
+ | `credit_fraud` | Credit score = 100-200 or 850-999, DTI = 10-60 |
410
+ | `session_anomaly` | Sessions/week = 200-700, avg session = 500-1500 min |
411
+ | `age_outlier` | Age = 1, 2, 3, 115, 120, or 130 |
412
+ | `geo_impossible` | Coordinates = (0,0), IP = 0.0.0.0 |
413
+ | `velocity_attack` | Total sessions = 50k-150k, last login = now |
414
+ | `data_mismatch` | Age=12 + employed + 30yr experience + $500k income |
415
+ | `health_outlier` | BMI = 8-9 or 75-80, BP = extreme values |
416
+
417
+ ### 6. The User Profile Schema (109 Correlated Fields)
418
+ Each generated user contains highly realistic, correlated data. For example, age determines education graduation year, which impacts employment salary, which impacts credit score, which impacts housing status and health/BMI metrics.
419
+
420
+ ```text
421
+ identity(9) → personal(6) → network(3) → address(7) → demographics(5)
422
+ → education(7) → employment(10) → financial(8) → health(16)
423
+ → social(9) → digitalFootprint(15) → bank(5) → lifestyle(9)
424
+ ```
425
+
426
+ ---
427
+
428
+ ## License
429
+
430
+ Distributed under the **MIT License**. See `LICENSE` for more information.
431
+
432
+ **Maintainer**: [abhay557](https://github.com/abhay557)
433
+
434
+ - Project Commit History - `https://github.com/abhay557/random-api.xyz`
@@ -1,4 +1,5 @@
1
1
  fakedata/__init__.py,sha256=PXwXDWU2HFUfAF2zFMrxsJ7BvP5RSpTbF0GvxWCTt3g,93
2
+ fakedata/cli.py,sha256=8gwV_PYyUJo0QwmLnabZ_PHBS6UUpS4HJLJeMX-Bi-w,5817
2
3
  fakedata/core.py,sha256=ZiZ51aZ3cAG7n02Giliq0XO5nN-bbjLJWM3pZQ6gWT4,437
3
4
  fakedata/test_python.py,sha256=UpfmArMkM7bcRkV_MTed0vZ1QXokOAZqazcfISlLmZA,3226
4
5
  fakedata/helpers/cardtype.json,sha256=3Ij5N_QPCO1Xg6g7jTp759yOVF9scXkbBDZvWYRaSAM,201
@@ -24,8 +25,9 @@ fakedata/helpers/states.json,sha256=1NLVCllDcRN8QXp3GTv9iiqeqA8rOOlx2v3_w729jfU,
24
25
  fakedata/helpers/street.json,sha256=Z-1cRr7uGMXBqlPoqoedPagfx_hLXqbWDNoylcnS8L0,305724
25
26
  fakedata/helpers/universities.json,sha256=7NHac5anNxnCVSj6kdFc87896S2A3J-zT58Yw4lzkaQ,230421
26
27
  fakedata/modules/__init__.py,sha256=buFp940xk9V39VnBFIca5ADTEtX8qsKz7_VQC3102tI,19
27
- fakedata/modules/data.py,sha256=UQxgdzjBzPD46p5mXoi6T4PPYKvduFtbOLl1SEcaBV4,50652
28
- fakedata_python-2.0.0.dist-info/METADATA,sha256=hvIvrcmlA-1PngIcWd9d50VbebolI7iE3FdEh9lqnTY,8381
29
- fakedata_python-2.0.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
30
- fakedata_python-2.0.0.dist-info/top_level.txt,sha256=SHFa_6848yAE45QgW-PX_DHp_nakY64Zs_t2NobLcn0,9
31
- fakedata_python-2.0.0.dist-info/RECORD,,
28
+ fakedata/modules/data.py,sha256=8MHRV5AtbzLOCBRKUP3E-dHc8nIQ3VuP-Xyk7c_-Eog,52542
29
+ fakedata_python-2.0.2.dist-info/METADATA,sha256=WlEpLxJNeuYVUsasQCQcHHS_s3d8gBcjYpaiMRY-TqI,16306
30
+ fakedata_python-2.0.2.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
31
+ fakedata_python-2.0.2.dist-info/entry_points.txt,sha256=qLOKT1Qujc8-qppTaDO2GUWcuoUQR9fSID3qvIaEAPo,47
32
+ fakedata_python-2.0.2.dist-info/top_level.txt,sha256=SHFa_6848yAE45QgW-PX_DHp_nakY64Zs_t2NobLcn0,9
33
+ fakedata_python-2.0.2.dist-info/RECORD,,
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ fakedata = fakedata.cli:main
@@ -1,179 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: fakedata-python
3
- Version: 2.0.0
4
- Summary: The fakedata package generates realistic synthetic user profiles for machine learning, deep learning, data analysis, and data science workflows.
5
- Author-email: abhay557 <abhaycormourya@gmail.com>
6
- License-Expression: MIT
7
- Project-URL: Homepage, https://github.com/abhay557/fakedata
8
- Classifier: Programming Language :: Python :: 3
9
- Classifier: Operating System :: OS Independent
10
- Requires-Python: >=3.7
11
- Description-Content-Type: text/markdown
12
-
13
- # fakedata
14
-
15
- [![NPM Version](https://img.shields.io/npm/v/@abhay557/fakedata?color=red&label=npm)](https://www.npmjs.com/package/@abhay557/fakedata)
16
- [![PyPI Version](https://img.shields.io/pypi/v/fakedata-python?color=blue&label=pypi)](https://pypi.org/project/fakedata-python/)
17
- [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](https://opensource.org/licenses/MIT)
18
-
19
- A high-performance, **zero-dependency** synthetic data generation engine, available for both **Node.js** and **Python**. It is designed specifically for machine learning, data science, and analytics workflows, and provides 100% data parity across platforms.
20
-
21
- ## Overview
22
-
23
- `fakedata` has been completely rebuilt from the ground up to serve as an **ML-ready synthetic data engine**. It generates deeply interconnected user profiles with **109 flat columns across 13 domains** (Health, Financial, Employment, Digital Footprint, etc.), making it the perfect tool for training models, benchmarking pipelines, or simulating realistic databases.
24
-
25
- ### Machine Learning Power Features:
26
- - **Seed Reproducibility**: Generate byte-for-byte identical datasets across runs (and languages!) using `seed`.
27
- - **Schema Overrides**: Force specific distributions (e.g., age ranges, income brackets, genders) using `schema`.
28
- - **Locale-Aware Generation**: Support for 8 culture-specific name sets and phone formats (`en`, `in`, `jp`, `kr`, `de`, `br`, `ar`, `fr`).
29
- - **Missing Data Simulation**: Automatically inject realistic nulls using `missing_rate` to test your data imputation pipelines.
30
- - **Anomaly Injection**: Inject fraud/outlier profiles (e.g., impossible geography, credit fraud, income spikes) using `anomaly_rate`.
31
- - **Time-Series Data**: Generate chronological activity logs (logins, page views, purchases) per user for behavioral modeling.
32
- - **Pipeline Ready**: Export directly to CSV, JSON, or flat objects (perfect for `pandas.DataFrame`).
33
-
34
- ---
35
-
36
- ## Node.js / TypeScript Implementation
37
-
38
- ### Installation
39
- ```bash
40
- npm install @abhay557/fakedata
41
- ```
42
-
43
- ### Quick Start
44
- ```javascript
45
- const { data } = require('@abhay557/fakedata');
46
-
47
- // Generate deterministic users with a 5% missing data rate (null injection)
48
- const users = data.users(1000, { seed: 42, missing_rate: 0.05 });
49
-
50
- // Export directly to CSV format
51
- const csvString = data.usersToCSV(1000, { seed: 42 });
52
-
53
- // Time-series activity data
54
- const ts = data.userTimeSeries({ days: 30, eventsPerDay: 8 });
55
- console.log(`Generated ${ts.activity.length} events for ${ts.user.fullName}`);
56
- ```
57
-
58
- ---
59
-
60
- ## Python Implementation
61
-
62
- ### Installation
63
- ```bash
64
- pip install fakedata-python
65
- ```
66
-
67
- ### Quick Start
68
- ```python
69
- import fakedata.data as data
70
- import pandas as pd
71
-
72
- # Generate 10,000 highly correlated users deterministically
73
- users = data.users(10000, {"seed": 42})
74
-
75
- # Or export directly to a Pandas DataFrame
76
- df = pd.DataFrame(data.users_flat(10000, {"seed": 42}))
77
- print(df.head())
78
-
79
- # Create time-series activity data
80
- ts = data.user_time_series({"days": 30, "events_per_day": 8})
81
- print(f"Generated {len(ts['activity'])} events for {ts['user']['fullName']}")
82
- ```
83
-
84
- ---
85
-
86
- ## Advanced Features Reference
87
-
88
- Both Python and JS/TS expose the same underlying engine options.
89
-
90
- ### 1. Configuration Options
91
- Pass an `options` dictionary/object to `data.user(options)` or `data.users(n, options)`:
92
-
93
- ```javascript
94
- const options = {
95
- seed: 42, // Number: Ensures deterministic, byte-for-byte identical output
96
- missing_rate: 0.05, // Float (0-1): 5% chance of any leaf field being null
97
- locale: 'jp', // String: 'en', 'in', 'jp', 'kr', 'de', 'br', 'ar', 'fr'
98
- anomaly_rate: 0.05, // Float (0-1): 5% of users will have injected fraud anomalies
99
- days: 30, // Number: Days of time-series activity to generate
100
- eventsPerDay: 8, // Number: Average events per day for time-series logs
101
-
102
- // Schema Constraints (force specific data distributions)
103
- schema: {
104
- age: { min: 25, max: 40 }, // Can also use { exact: 30 }
105
- gender: "female", // "male", "female", or "non-binary"
106
- employment: { status: "employed" },
107
- education: { level: "Master's" },
108
- financial: { annualIncome: { min: 60000, max: 120000 } },
109
- health: { medicalCondition: "Diabetes" },
110
- address: { country: "Japan" },
111
- height: { min: 160, max: 180 },
112
- weight: { min: 50, max: 80 }
113
- }
114
- }
115
- ```
116
-
117
- ### 2. Supported API Methods
118
-
119
- | Method (JS) | Method (Python) | Description |
120
- | :--- | :--- | :--- |
121
- | `data.user(opts?)` | `data.user(opts=None)` | Generate a single complex user profile. |
122
- | `data.users(n, opts?)` | `data.users(n, opts=None)` | Generate an array/list of `n` users. |
123
- | `data.userTimeSeries(opts)` | `data.user_time_series(opts)`| Returns `{ user, activity }` containing chronological event logs. |
124
- | `data.usersFlat(n, opts?)` | `data.users_flat(n, opts=None)`| Returns flat dicts/objects, perfect for `pandas.DataFrame` ingestion. |
125
- | `data.usersToCSV(n, opts?)` | `data.users_to_csv(n, opts=None)`| Returns a fully formatted CSV string (109 columns). |
126
- | `data.usersToJSON(n, opts?)`| `data.users_to_json(n, opts=None)`| Returns a pretty-printed JSON string. |
127
-
128
- ### 3. Locale-Aware Name Generation
129
- Supports 8 locales with culturally accurate first names, last names, and country/phone codes:
130
- - `'in'`: Aarav Sharma, Priya Patel (+91, India)
131
- - `'jp'`: Haruto Tanaka, Sakura Sato (+81, Japan)
132
- - `'kr'`: Minjun Kim, Seo-yeon Park (+82, South Korea)
133
- - `'de'`: Lukas Müller, Mia Schmidt (+49, Germany)
134
- - `'br'`: Miguel Silva, Alice Santos (+55, Brazil)
135
- - `'ar'`: Mohammed Al-Ahmed, Fatima Khalil (+966, Saudi Arabia)
136
- - `'fr'`: Gabriel Martin, Emma Dubois (+33, France)
137
- - `'en'`: James Smith, Mary Johnson (+1, United States)
138
-
139
- ### 4. Time-Series Activity Data
140
- Generate chronological behavioral logs for users. Event types include `login`, `page_view`, `purchase`, `search`, `click`, `logout`, `api_call`, `upload`, `download`, and `comment`.
141
-
142
- ```javascript
143
- const ts = data.userTimeSeries({ seed: 42, days: 30, eventsPerDay: 8 });
144
- // ts.user → Full user profile
145
- // ts.activity → [{ timestamp, type, page, duration, device, ip, success, amount?, query? }]
146
- ```
147
-
148
- ### 5. Anomaly Injection Engine (Fraud Detection)
149
- When `anomaly_rate` is > 0, `fakedata` injects ML-detectable fraud patterns into the dataset. Affected users receive a special `_anomaly` flag object indicating the fraud type.
150
-
151
- | Anomaly Type | Effect |
152
- |:---|:---|
153
- | `income_spike` | Income multiplied by 5-15x |
154
- | `credit_fraud` | Credit score = 100-200 or 850-999, DTI = 10-60 |
155
- | `session_anomaly` | Sessions/week = 200-700, avg session = 500-1500 min |
156
- | `age_outlier` | Age = 1, 2, 3, 115, 120, or 130 |
157
- | `geo_impossible` | Coordinates = (0,0), IP = 0.0.0.0 |
158
- | `velocity_attack` | Total sessions = 50k-150k, last login = now |
159
- | `data_mismatch` | Age=12 + employed + 30yr experience + $500k income |
160
- | `health_outlier` | BMI = 8-9 or 75-80, BP = extreme values |
161
-
162
- ### 6. The User Profile Schema (109 Correlated Fields)
163
- Each generated user contains highly realistic, correlated data. For example, age determines education graduation year, which impacts employment salary, which impacts credit score, which impacts housing status and health/BMI metrics.
164
-
165
- ```text
166
- identity(9) → personal(6) → network(3) → address(7) → demographics(5)
167
- → education(7) → employment(10) → financial(8) → health(16)
168
- → social(9) → digitalFootprint(15) → bank(5) → lifestyle(9)
169
- ```
170
-
171
- ---
172
-
173
- ## License
174
-
175
- Distributed under the **MIT License**. See `LICENSE` for more information.
176
-
177
- **Maintainer**: [abhay557](https://github.com/abhay557)
178
-
179
- - Project Commit History - `https://github.com/abhay557/random-api.xyz`