fakesmith 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
fakesmith/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ from .smith import FakeSmith
2
+ from .schema import SchemaField, FieldType
3
+ from .detector import infer_schema
4
+ from .sanitizer import sanitize_text, SanitizeResult
5
+ from .result import GenerationResult, FieldSummary
6
+
7
+ __all__ = [
8
+ "FakeSmith",
9
+ "SchemaField",
10
+ "FieldType",
11
+ "infer_schema",
12
+ "sanitize_text",
13
+ "SanitizeResult",
14
+ "GenerationResult",
15
+ "FieldSummary",
16
+ ]
fakesmith/cli.py ADDED
@@ -0,0 +1,150 @@
1
+ """
2
+ cli.py — Command-line interface for FakeSmith.
3
+
4
+ Usage:
5
+ # Generate fake records from a sample file
6
+ fakesmith generate --file data.json --count 20 --format csv
7
+ fakesmith generate --file data.json --count 5 --seed 42 --format sql --table users
8
+ fakesmith generate --file .env --count 1 --format env
9
+
10
+ # Sanitize raw text in-place (plain text / log lines / config blocks)
11
+ fakesmith sanitize --text "email is john@co.com and key is sk-abc123"
12
+ fakesmith sanitize --file raw_log.txt --out clean_log.txt
13
+
14
+ # Inspect detected schema
15
+ fakesmith describe --file data.json
16
+ """
17
+
18
+ import argparse
19
+ import sys
20
+ import json
21
+ from pathlib import Path
22
+
23
+ from .smith import FakeSmith
24
+ from .sanitizer import sanitize_text
25
+
26
+
27
def detect_source_type(path: str) -> str:
    """Choose the sample parser for *path* from its file extension.

    Args:
        path: Path of the sample file; only its suffix is inspected.

    Returns:
        ``"json"`` for ``.json``, ``"csv"`` for ``.csv`` (case-insensitive),
        and ``"auto"`` for anything else.
    """
    suffix = Path(path).suffix.lower()
    if suffix == ".json":
        return "json"
    if suffix == ".csv":
        return "csv"
    return "auto"
30
+
31
+
32
def cmd_generate(args):
    """Handle ``fakesmith generate``: build fake records from a sample file.

    Reads ``args.file``, infers a schema through ``FakeSmith.from_sample``,
    generates ``args.count`` records and renders them in ``args.format``
    (``json``, ``csv``, ``sql`` or ``env``). The rendered text is written to
    ``args.out`` when given, otherwise printed to stdout.

    Args:
        args: Parsed CLI namespace with ``file``, ``count``, ``seed``,
            ``format``, ``table``, ``out`` and ``summary`` attributes.

    Raises:
        SystemExit: With status 1 when ``args.file`` does not exist.
        ValueError: When ``args.format`` is not one of the supported formats.
    """
    try:
        # Explicit UTF-8 so results do not depend on the platform locale.
        with open(args.file, encoding="utf-8") as f:
            raw = f.read()
    except FileNotFoundError:
        print(f"❌ File not found: {args.file}", file=sys.stderr)
        sys.exit(1)

    smith = FakeSmith.from_sample(raw, source_type=detect_source_type(args.file))

    result = smith.generate(args.count, seed=args.seed)

    if args.summary:
        result.print_summary()

    if args.format == "json":
        output = json.dumps(result.records, indent=2, default=str)
    elif args.format == "csv":
        output = smith.to_csv(args.count, seed=args.seed)
    elif args.format == "sql":
        output = smith.to_sql(args.count, table_name=args.table, seed=args.seed)
    elif args.format == "env":
        output = smith.to_env(seed=args.seed)
    else:
        # argparse restricts the choices, but programmatic callers could pass
        # anything; fail clearly instead of hitting an unbound `output`.
        raise ValueError(f"Unsupported output format: {args.format!r}")

    if args.out:
        with open(args.out, "w", encoding="utf-8") as f:
            f.write(output)
        print(f"✅ Saved {args.count} records → {args.out}")
    else:
        print(output)
62
+
63
+
64
def cmd_sanitize(args):
    """Handle ``fakesmith sanitize``: replace secrets/PII in raw text.

    The input comes from ``args.text`` (inline string) or ``args.file``.
    The sanitized text is written to ``args.out`` when given, otherwise
    printed to stdout. The change summary is printed when ``--summary`` is
    set or whenever the input came from a file.

    Args:
        args: Parsed CLI namespace with ``text``, ``file``, ``out``, ``seed``
            and ``summary`` attributes.

    Raises:
        SystemExit: With status 1 when the file is missing or when neither
            ``text`` nor ``file`` was supplied.
    """
    # `is not None` rather than truthiness: an explicit `--text ""` is a valid
    # (empty) input and must not be reported as "nothing provided".
    if args.text is not None:
        raw = args.text
    elif args.file:
        try:
            # Explicit UTF-8 so results do not depend on the platform locale.
            with open(args.file, encoding="utf-8") as f:
                raw = f.read()
        except FileNotFoundError:
            print(f"❌ File not found: {args.file}", file=sys.stderr)
            sys.exit(1)
    else:
        print("❌ Provide --text or --file", file=sys.stderr)
        sys.exit(1)

    result = sanitize_text(raw, seed=args.seed)

    if args.summary or args.file:
        result.print_summary()

    if args.out:
        with open(args.out, "w", encoding="utf-8") as f:
            f.write(result.sanitized)
        print(f"✅ Sanitized output saved → {args.out}")
    else:
        print(result.sanitized)
89
+
90
+
91
def cmd_describe(args):
    """Handle ``fakesmith describe``: show the schema detected for a file.

    Args:
        args: Parsed CLI namespace with a ``file`` attribute.

    Raises:
        SystemExit: With status 1 when ``args.file`` does not exist.
    """
    try:
        # Explicit UTF-8, consistent with the other commands, so detection
        # does not depend on the platform locale.
        with open(args.file, encoding="utf-8") as f:
            raw = f.read()
    except FileNotFoundError:
        print(f"❌ File not found: {args.file}", file=sys.stderr)
        sys.exit(1)

    smith = FakeSmith.from_sample(raw, source_type=detect_source_type(args.file))
    smith.describe()
101
+
102
+
103
def _build_parser() -> argparse.ArgumentParser:
    """Construct the ``fakesmith`` argument parser with its three subcommands."""
    parser = argparse.ArgumentParser(
        prog="fakesmith",
        description="Generate or sanitize fake data safe to share with LLMs.",
    )
    commands = parser.add_subparsers(dest="command", required=True)

    # generate: fake records from a sample file
    generate = commands.add_parser(
        "generate",
        aliases=["gen", "g"],
        help="Generate fake records from a sample file",
    )
    generate.add_argument("--file", "-f", required=True)
    generate.add_argument("--count", "-n", type=int, default=10)
    generate.add_argument(
        "--seed", "-s", type=int, default=None,
        help="Integer seed for deterministic output",
    )
    generate.add_argument(
        "--format", "-o", default="json",
        choices=["json", "csv", "sql", "env"],
    )
    generate.add_argument("--table", "-t", default="records")
    generate.add_argument("--out", default=None)
    generate.add_argument(
        "--summary", action="store_true",
        help="Print change summary and warnings",
    )

    # sanitize: scrub inline text or a file; exactly one source is required
    sanitize = commands.add_parser(
        "sanitize",
        aliases=["san", "s"],
        help="Replace secrets/PII in raw text in-place",
    )
    source = sanitize.add_mutually_exclusive_group(required=True)
    source.add_argument("--text", "-t", help="Inline string to sanitize")
    source.add_argument("--file", "-f", help="File to sanitize")
    sanitize.add_argument("--out", "-o", default=None, help="Output file")
    sanitize.add_argument("--seed", "-s", type=int, default=None)
    sanitize.add_argument("--summary", action="store_true")

    # describe: print the detected schema
    describe = commands.add_parser(
        "describe",
        aliases=["d"],
        help="Show detected schema and sensitivity flags",
    )
    describe.add_argument("--file", "-f", required=True)

    return parser


def main():
    """CLI entry point: parse ``sys.argv`` and run the selected subcommand."""
    args = _build_parser().parse_args()

    # The dispatch below lists every alias explicitly, exactly as declared
    # on the subparsers.
    handlers = {
        **dict.fromkeys(("generate", "gen", "g"), cmd_generate),
        **dict.fromkeys(("sanitize", "san", "s"), cmd_sanitize),
        **dict.fromkeys(("describe", "d"), cmd_describe),
    }
    handler = handlers.get(args.command)
    if handler is not None:
        handler(args)


if __name__ == "__main__":
    main()
fakesmith/detector.py ADDED
@@ -0,0 +1,217 @@
1
+ """
2
+ detector.py — Infer a FakeSmith schema from sample JSON, CSV, or a Python dict.
3
+ """
4
+
5
+ import re
6
+ import json
7
+ import csv
8
+ import io
9
+ from typing import Union
10
+ from .schema import SchemaField, FieldType
11
+
12
+
13
+ # Regex heuristics for common patterns
14
+
15
# Ordered (type, regex) pairs; _detect_from_value returns the FIRST match, so
# stricter structural patterns must come before looser ones.
#
# The PHONE pattern accepts almost any run of digits, spaces, dots, dashes and
# parentheses. It therefore has to be tried AFTER IP_ADDRESS, MAC_ADDRESS,
# DATE and CARD_NUMBER: otherwise values such as "192.168.1.1", "2024-01-15",
# "00-11-22-33-44-55" or "4111 1111 1111 1111" are all classified as phones.
PATTERNS = [
    (FieldType.UUID, r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$'),
    (FieldType.EMAIL, r'^[\w\.-]+@[\w\.-]+\.\w+$'),
    (FieldType.IP_ADDRESS, r'^\d{1,3}(\.\d{1,3}){3}$'),
    (FieldType.IPV6, r'^([0-9a-f]{1,4}:){7}[0-9a-f]{1,4}$'),
    (FieldType.MAC_ADDRESS, r'^([0-9a-f]{2}[:\-]){5}[0-9a-f]{2}$'),
    (FieldType.URL, r'^https?://'),
    (FieldType.DATETIME, r'^\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}'),
    (FieldType.DATE, r'^\d{4}-\d{2}-\d{2}$'),
    (FieldType.TIME, r'^\d{2}:\d{2}(:\d{2})?$'),
    (FieldType.JWT_TOKEN, r'^eyJ[A-Za-z0-9\-_]+\.[A-Za-z0-9\-_]+\.[A-Za-z0-9\-_]+$'),
    (FieldType.CARD_NUMBER, r'^\d{4}[\s\-]?\d{4}[\s\-]?\d{4}[\s\-]?\d{4}$'),
    # Loose catch-all for digit/separator strings; keep below the patterns above.
    (FieldType.PHONE, r'^[\+\d][\d\s\-\(\)\.]{6,20}$'),
    (FieldType.IBAN, r'^[A-Z]{2}\d{2}[A-Z0-9]{1,30}$'),
    (FieldType.PASSWORD_HASH, r'^(\$2[aby]\$|sha256\$|[0-9a-f]{32,128})'),
    (FieldType.API_KEY, r'^(sk-|pk-|api-|key-|Bearer )[A-Za-z0-9\-_]{8,}'),
    # Matches any lowercase word, so it must stay last.
    (FieldType.SLUG, r'^[a-z0-9]+(-[a-z0-9]+)*$'),
]
33
+
34
+ # Name-based heuristics (field name keywords → type)
35
# Keyword table consulted by _detect_from_name: each FieldType maps to the
# lowercase keywords that suggest it when found in a field name.
# Insertion order is significant to that lookup, so the entries are kept in
# their original sequence.
NAME_HINTS = {
    # identity / credentials
    FieldType.UUID: ['id', 'uuid', 'guid'],
    FieldType.EMAIL: ['email', 'mail'],
    FieldType.PHONE: ['phone', 'mobile', 'tel', 'fax'],
    FieldType.FULL_NAME: ['full_name', 'fullname', 'name'],
    FieldType.FIRST_NAME: ['first_name', 'firstname', 'fname'],
    FieldType.LAST_NAME: ['last_name', 'lastname', 'lname', 'surname'],
    FieldType.USERNAME: ['username', 'user_name', 'handle', 'login'],
    FieldType.PASSWORD: ['password', 'passwd', 'pwd'],
    FieldType.PASSWORD_HASH: ['password_hash', 'hashed_password', 'pwd_hash'],
    FieldType.API_KEY: ['api_key', 'apikey', 'api_token', 'access_key'],
    FieldType.SECRET_TOKEN: ['secret', 'token', 'refresh_token', 'auth_token'],
    FieldType.JWT_TOKEN: ['jwt', 'id_token', 'bearer'],
    # location
    FieldType.ADDRESS: ['address', 'street', 'addr'],
    FieldType.CITY: ['city', 'town'],
    FieldType.STATE: ['state', 'province', 'region'],
    FieldType.COUNTRY: ['country', 'nation'],
    FieldType.ZIP_CODE: ['zip', 'postal', 'postcode', 'zip_code'],
    FieldType.LATITUDE: ['lat', 'latitude'],
    FieldType.LONGITUDE: ['lon', 'lng', 'longitude'],
    # finance
    FieldType.AMOUNT: ['amount', 'price', 'cost', 'fee', 'total', 'balance', 'salary'],
    FieldType.CURRENCY: ['currency', 'currency_code'],
    FieldType.CARD_NUMBER: ['card_number', 'card_num', 'pan'],
    FieldType.CARD_EXPIRY: ['expiry', 'expiration', 'exp_date'],
    FieldType.BANK_ACCOUNT: ['account', 'account_number', 'bban'],
    FieldType.IBAN: ['iban'],
    # business
    FieldType.COMPANY: ['company', 'organization', 'employer', 'firm'],
    FieldType.JOB_TITLE: ['job', 'title', 'position', 'role', 'occupation'],
    FieldType.DEPARTMENT: ['department', 'dept', 'division', 'team'],
    # dates
    FieldType.DATETIME: ['created_at', 'updated_at', 'deleted_at', 'datetime', 'timestamp'],
    FieldType.DATE: ['date', 'day'],
    FieldType.DATE_OF_BIRTH: ['dob', 'birth_date', 'birthdate', 'date_of_birth'],
    # web / tech
    FieldType.IP_ADDRESS: ['ip', 'ip_address', 'ipv4'],
    FieldType.USER_AGENT: ['user_agent', 'useragent', 'browser'],
    FieldType.URL: ['url', 'link', 'href', 'website', 'homepage'],
    # content / enums / numeric
    FieldType.STATUS: ['status', 'state', 'stage'],
    FieldType.DESCRIPTION: ['description', 'desc', 'details', 'bio', 'summary', 'about'],
    FieldType.TITLE: ['title', 'heading', 'subject'],
    FieldType.TAG: ['tag', 'label', 'category', 'type'],
    FieldType.GENDER: ['gender', 'sex'],
    FieldType.PERCENTAGE: ['percent', 'percentage', 'rate', 'ratio'],
    FieldType.BOOLEAN: ['is_', 'has_', 'can_', 'active', 'enabled', 'verified'],
}
78
+
79
+
80
+ def _detect_from_name(field_name: str) -> FieldType:
81
+ """Guess field type purely from its name."""
82
+ lower = field_name.lower()
83
+ for ftype, keywords in NAME_HINTS.items():
84
+ for kw in keywords:
85
+ if kw in lower:
86
+ return ftype
87
+ return None
88
+
89
+
90
def _detect_from_value(value) -> FieldType:
    """Guess a field type from one sample value.

    Python scalars map directly to BOOLEAN / INTEGER / FLOAT. Strings are
    stripped and tested against ``PATTERNS`` in order (case-insensitively);
    the first matching pattern decides the type.

    Returns:
        The detected type, or ``None`` for ``None``, unsupported value types
        and strings that match no pattern.
    """
    # bool is tested before int because bool is a subclass of int.
    if isinstance(value, bool):
        return FieldType.BOOLEAN
    if isinstance(value, int):
        return FieldType.INTEGER
    if isinstance(value, float):
        return FieldType.FLOAT
    if not isinstance(value, str):
        return None

    candidate = value.strip()
    return next(
        (
            kind
            for kind, regex in PATTERNS
            if re.match(regex, candidate, re.IGNORECASE)
        ),
        None,
    )
105
+
106
+
107
+ # Name hints for security-sensitive fields always win over value detection.
108
# Types for which a name-based guess overrides whatever the sample value
# looks like (see _detect_field).
_NAME_PRIORITY_TYPES = {
    # credentials
    FieldType.PASSWORD,
    FieldType.PASSWORD_HASH,
    FieldType.API_KEY,
    FieldType.SECRET_TOKEN,
    FieldType.JWT_TOKEN,
    # payment / banking
    FieldType.CARD_NUMBER,
    FieldType.CARD_EXPIRY,
    FieldType.BANK_ACCOUNT,
    FieldType.IBAN,
    # personal
    FieldType.DATE_OF_BIRTH,
}
114
+
115
+
116
def _detect_field(field_name: str, sample_value=None) -> FieldType:
    """Resolve a field's type from its name and an optional sample value.

    Decision order:
      1. A name guess that belongs to ``_NAME_PRIORITY_TYPES``.
      2. A value guess, unless it is one of the generic types
         (INTEGER, FLOAT, BOOLEAN, SLUG).
      3. Any remaining name guess.
      4. Any remaining value guess.
      5. The Python type of the sample (bool / int / float).
      6. ``FieldType.SENTENCE`` as the final fallback.
    """
    name_guess = _detect_from_name(field_name)
    value_guess = None if sample_value is None else _detect_from_value(sample_value)

    # Sensitive name hints win outright, so a value like "s3cr3t" under a
    # "password" column is not downgraded to SLUG.
    if name_guess and name_guess in _NAME_PRIORITY_TYPES:
        return name_guess

    # Structural regex matches are trusted over non-sensitive name hints;
    # generic value types are not.
    if value_guess:
        generic = (FieldType.INTEGER, FieldType.FLOAT,
                   FieldType.BOOLEAN, FieldType.SLUG)
        if value_guess not in generic:
            return value_guess

    for guess in (name_guess, value_guess):
        if guess:
            return guess

    # Last resort: fall back on the sample's Python type (bool before int,
    # since bool is a subclass of int).
    type_fallbacks = (
        (bool, FieldType.BOOLEAN),
        (int, FieldType.INTEGER),
        (float, FieldType.FLOAT),
    )
    for py_type, fallback in type_fallbacks:
        if isinstance(sample_value, py_type):
            return fallback

    return FieldType.SENTENCE
149
+
150
+
151
def infer_schema(source: Union[str, dict, list], source_type: str = "auto") -> list[SchemaField]:
    """Build one SchemaField per key of the first sample record.

    Args:
        source: A JSON string, CSV string, Python dict, or list of dicts.
        source_type: "json" | "csv" | "dict" | "auto" (default)

    Returns:
        SchemaField objects in the record's key order. A field is marked
        nullable when its sample value is ``None``.
    """
    sample = _extract_sample_record(source, source_type)
    return [
        SchemaField(
            name=column,
            field_type=_detect_field(column, sample_value),
            nullable=sample_value is None,
        )
        for column, sample_value in sample.items()
    ]
169
+
170
+
171
+ def _extract_sample_record(source, source_type) -> dict:
172
+ """Pull the first record out of whatever format was given."""
173
+ if isinstance(source, dict):
174
+ return source
175
+ if isinstance(source, list) and source:
176
+ return source[0]
177
+
178
+ if source_type == "auto":
179
+ stripped = source.strip() if isinstance(source, str) else ""
180
+ if stripped.startswith(("{", "[")):
181
+ source_type = "json"
182
+ else:
183
+ source_type = "csv"
184
+
185
+ if source_type == "json":
186
+ data = json.loads(source)
187
+ return data[0] if isinstance(data, list) else data
188
+
189
+ if source_type == "csv":
190
+ reader = csv.DictReader(io.StringIO(source))
191
+ for row in reader:
192
+ return dict(row)
193
+
194
+ raise ValueError(f"Cannot parse source with source_type='{source_type}'")
195
+
196
+
197
+ # Gap 5: Sensitive field registry
198
+ # Used by smith.py to flag fields that contain real PII or secrets.
199
+
200
# Field types treated as sensitive (PII or secrets).
SENSITIVE_TYPES = {
    # contact details
    FieldType.EMAIL,
    FieldType.PHONE,
    # credentials
    FieldType.PASSWORD,
    FieldType.PASSWORD_HASH,
    # payment / banking
    FieldType.CARD_NUMBER,
    FieldType.CARD_EXPIRY,
    FieldType.CARD_CVV,
    FieldType.BANK_ACCOUNT,
    FieldType.IBAN,
    # tokens and keys
    FieldType.API_KEY,
    FieldType.SECRET_TOKEN,
    FieldType.JWT_TOKEN,
    # personal
    FieldType.DATE_OF_BIRTH,
    # network identifiers
    FieldType.IP_ADDRESS,
    FieldType.IPV6,
    FieldType.MAC_ADDRESS,
}
fakesmith/generators.py ADDED
@@ -0,0 +1,205 @@
1
+ """
2
+ generators.py — Maps every FieldType to a Faker-backed callable.
3
+ Includes format-preserving helpers (Gap 4) and seed threading (Gap 1).
4
+ """
5
+
6
+ import re
7
+ import random
8
+ import uuid
9
+ import base64
10
+ import hashlib
11
+ import time
12
+ from typing import Union, Optional
13
+ from faker import Faker
14
+ from .schema import SchemaField, FieldType
15
+
16
# Cache of Faker instances keyed by locale (filled lazily by _get_faker).
_fakers: dict[str, Faker] = {}


def seed_all(seed: int) -> None:
    """Seed every random source this module draws from.

    Seeds Python's global ``random`` module, Faker's class-level generator,
    and each Faker instance already held in the locale cache.

    Args:
        seed: Integer seed applied to all of the above.
    """
    random.seed(seed)
    Faker.seed(seed)
    for cached_faker in _fakers.values():
        cached_faker.seed_instance(seed)
25
+
26
+
27
def _get_faker(locale: str = "en_US") -> Faker:
    """Return the cached Faker for *locale*, creating it on first use."""
    try:
        return _fakers[locale]
    except KeyError:
        created = _fakers[locale] = Faker(locale)
        return created
31
+
32
+
33
+ # Gap 4: format-preserving helpers
34
+
35
+ def _preserve_phone(original: str) -> str:
36
+ """Return a fake phone that mirrors the separator + prefix style of original."""
37
+ has_plus = original.strip().startswith('+')
38
+ has_parens = '(' in original
39
+ sep = '-' if '-' in original else (' ' if ' ' in original else '')
40
+ digits = re.sub(r'\D', '', original)
41
+ fake_d = ''.join(str(random.randint(0, 9)) for _ in digits)
42
+ if has_plus and len(digits) > 10:
43
+ cc = fake_d[:len(digits) - 10]
44
+ rest = fake_d[len(digits) - 10:]
45
+ if has_parens:
46
+ return f"+{cc}{sep}({rest[:3]}){sep}{rest[3:6]}{sep}{rest[6:]}"
47
+ return f"+{cc}{sep}{rest[:3]}{sep}{rest[3:6]}{sep}{rest[6:]}"
48
+ if has_parens:
49
+ return f"({fake_d[:3]}){sep}{fake_d[3:6]}{sep}{fake_d[6:]}"
50
+ if sep:
51
+ return f"{fake_d[:3]}{sep}{fake_d[3:6]}{sep}{fake_d[6:]}"
52
+ return fake_d
53
+
54
+
55
+ def _preserve_api_key(original: str, prefix: Optional[str] = None) -> str:
56
+ """Return a fake key with the same prefix and character count."""
57
+ if prefix:
58
+ length = max(len(original) - len(prefix), 16)
59
+ chars = 'abcdefghijklmnopqrstuvwxyz0123456789'
60
+ return prefix + ''.join(random.choices(chars, k=length))
61
+ m = re.match(r'^([a-zA-Z0-9_\-]{2,8}[-_])', original)
62
+ detected = m.group(1) if m else 'key-'
63
+ length = max(len(original) - len(detected), 16)
64
+ chars = 'abcdefghijklmnopqrstuvwxyz0123456789'
65
+ return detected + ''.join(random.choices(chars, k=length))
66
+
67
+
68
+ def _preserve_card(original: str) -> str:
69
+ """Return a fake card that mirrors the spacing/dash format of original."""
70
+ sep = '-' if '-' in original else (' ' if ' ' in original else '')
71
+ fake = Faker().credit_card_number()
72
+ if sep and len(original.replace(sep, '')) == 16:
73
+ return sep.join([fake[i:i+4] for i in range(0, 16, 4)])
74
+ return fake
75
+
76
+
77
+ def _fake_jwt() -> str:
78
+ """Generate a structurally valid but fake JWT token."""
79
+ def b64(data: str) -> str:
80
+ return base64.urlsafe_b64encode(data.encode()).rstrip(b"=").decode()
81
+ header = b64('{"alg":"HS256","typ":"JWT"}')
82
+ payload = b64(f'{{"sub":"{uuid.uuid4()}","iat":{int(time.time())}}}')
83
+ sig = base64.urlsafe_b64encode(uuid.uuid4().bytes).rstrip(b"=").decode()
84
+ return f"{header}.{payload}.{sig}"
85
+
86
+
87
def _fake_api_key(prefix: str = "sk-") -> str:
    """Return *prefix* followed by 32 random lowercase-alphanumeric characters."""
    alphabet = "abcdefghijklmnopqrstuvwxyz0123456789"
    random_part = _get_faker().lexify("?" * 32, letters=alphabet)
    return prefix + random_part
90
+
91
+
92
+ def _fake_secret_token() -> str:
93
+ return hashlib.sha256(uuid.uuid4().bytes).hexdigest()
94
+
95
+
96
def generate_value(field: SchemaField, original_value: Optional[str] = None) -> object:
    """Generate a single fake value for *field*.

    Resolution order: a nullable field may yield ``None`` (with probability
    ``field.null_probability``); a CUSTOM field calls ``field.generator()``;
    a field with ``choices`` picks one of them; otherwise the value is
    produced according to ``field.field_type``.

    Args:
        field: Schema entry describing the type, locale and constraints.
        original_value: Optional real sample. When it is a non-empty string,
            PHONE, CARD_NUMBER and API_KEY values mirror its format.

    Returns:
        The fake value (str, int, float, bool or ``None`` depending on the
        type). Unknown types fall back to a random word.
    """
    fake = _get_faker(field.locale or "en_US")

    # Handle nullable fields
    if field.nullable and random.random() < field.null_probability:
        return None

    # Custom generator takes full priority
    if field.field_type == FieldType.CUSTOM:
        return field.generator()

    # Choices-based field
    if field.choices:
        return random.choice(field.choices)

    ft = field.field_type

    # Identity
    if ft == FieldType.UUID: return fake.uuid4()
    if ft == FieldType.FULL_NAME: return fake.name()
    if ft == FieldType.FIRST_NAME: return fake.first_name()
    if ft == FieldType.LAST_NAME: return fake.last_name()
    if ft == FieldType.USERNAME: return fake.user_name()
    if ft == FieldType.EMAIL: return fake.email()
    if ft == FieldType.PHONE:
        if original_value and isinstance(original_value, str):
            return _preserve_phone(original_value)
        return fake.phone_number()
    if ft == FieldType.PASSWORD: return fake.password(length=12)
    if ft == FieldType.PASSWORD_HASH: return hashlib.sha256(fake.password().encode()).hexdigest()

    # Location
    if ft == FieldType.ADDRESS: return fake.address()
    if ft == FieldType.CITY: return fake.city()
    # Fall back to a city when the locale's Faker exposes no `state`.
    if ft == FieldType.STATE: return fake.state() if hasattr(fake, "state") else fake.city()
    if ft == FieldType.COUNTRY: return fake.country()
    if ft == FieldType.ZIP_CODE: return fake.postcode()
    if ft == FieldType.LATITUDE: return float(fake.latitude())
    if ft == FieldType.LONGITUDE: return float(fake.longitude())

    # Finance
    if ft == FieldType.CARD_NUMBER:
        if original_value and isinstance(original_value, str):
            return _preserve_card(original_value)
        return fake.credit_card_number()
    if ft == FieldType.CARD_EXPIRY: return fake.credit_card_expire()
    if ft == FieldType.CARD_CVV: return fake.credit_card_security_code()
    if ft == FieldType.BANK_ACCOUNT: return fake.bban()
    if ft == FieldType.IBAN: return fake.iban()
    if ft == FieldType.CURRENCY: return random.choice(["USD", "EUR", "GBP", "GHS", "NGN", "JPY", "CAD"])
    if ft == FieldType.AMOUNT:
        lo = field.min_value if field.min_value is not None else 1.0
        hi = field.max_value if field.max_value is not None else 99999.0
        return round(random.uniform(lo, hi), 2)

    # Business
    if ft == FieldType.COMPANY: return fake.company()
    if ft == FieldType.JOB_TITLE: return fake.job()
    if ft == FieldType.DEPARTMENT: return random.choice(["Engineering", "Marketing", "Finance", "HR", "Sales", "Design", "Operations"])
    if ft == FieldType.API_KEY:
        if original_value and isinstance(original_value, str):
            return _preserve_api_key(original_value, field.prefix)
        return _fake_api_key(field.prefix or "sk-")
    if ft == FieldType.SECRET_TOKEN: return _fake_secret_token()
    if ft == FieldType.JWT_TOKEN: return _fake_jwt()
    if ft == FieldType.WEBHOOK_URL:
        # The 8-hex-digit path id comes from the seeded `random` generator
        # (uuid4() would ignore the seed and break reproducible output).
        return f"https://hooks.{fake.domain_name()}/webhook/{random.getrandbits(32):08x}"

    # Dates & Times
    if ft == FieldType.DATETIME: return fake.date_time_this_year().isoformat()
    if ft == FieldType.DATE: return fake.date_this_year().isoformat()
    if ft == FieldType.TIME: return fake.time()
    if ft == FieldType.DATE_OF_BIRTH: return fake.date_of_birth(minimum_age=18, maximum_age=70).isoformat()
    if ft == FieldType.TIMESTAMP: return int(fake.date_time_this_year().timestamp())

    # Web & Tech
    if ft == FieldType.IP_ADDRESS: return fake.ipv4()
    if ft == FieldType.IPV6: return fake.ipv6()
    if ft == FieldType.MAC_ADDRESS: return fake.mac_address()
    if ft == FieldType.USER_AGENT: return fake.user_agent()
    if ft == FieldType.URL: return fake.url()
    if ft == FieldType.DOMAIN: return fake.domain_name()
    if ft == FieldType.SLUG: return fake.slug()

    # Content
    if ft == FieldType.WORD: return fake.word()
    if ft == FieldType.SENTENCE: return fake.sentence()
    if ft == FieldType.PARAGRAPH: return fake.paragraph()
    if ft == FieldType.TITLE: return fake.catch_phrase()
    if ft == FieldType.DESCRIPTION: return fake.text(max_nb_chars=200)
    if ft == FieldType.TAG: return fake.word()

    # Numeric
    if ft == FieldType.INTEGER:
        lo = int(field.min_value) if field.min_value is not None else 0
        hi = int(field.max_value) if field.max_value is not None else 10000
        return random.randint(lo, hi)
    if ft == FieldType.FLOAT:
        lo = field.min_value if field.min_value is not None else 0.0
        hi = field.max_value if field.max_value is not None else 1000.0
        return round(random.uniform(lo, hi), 4)
    if ft == FieldType.BOOLEAN: return random.choice([True, False])
    if ft == FieldType.PERCENTAGE: return round(random.uniform(0, 100), 2)

    # Enums
    if ft == FieldType.STATUS: return random.choice(["active", "inactive", "pending", "suspended", "archived"])
    if ft == FieldType.GENDER: return random.choice(["male", "female", "non-binary", "prefer_not_to_say"])

    # Fallback
    return fake.word()
fakesmith/result.py ADDED
@@ -0,0 +1,75 @@
1
+ """
2
+ result.py — GenerationResult wraps generated records with a change summary
3
+ and any warnings raised during detection.
4
+ """
5
+
6
+ from dataclasses import dataclass, field
7
+ from typing import List, Dict, Optional
8
+
9
+
10
@dataclass
class FieldSummary:
    """Per-field entry of a generation summary.

    Attributes:
        name: Field (column / key) name.
        detected_type: Name of the type detected for the field.
        is_sensitive: Whether the field was flagged as sensitive.
        original_sample: First value from the source (truncated / redacted).
        note: Optional free-text remark shown under the field; empty by default.
    """
    name: str
    detected_type: str
    is_sensitive: bool
    original_sample: str
    note: str = ""
18
+
19
+
20
@dataclass
class GenerationResult:
    """Generated records bundled with a per-field change summary.

    Attributes:
        records: The generated fake records.
        summary: Per-field breakdown of what was detected and replaced.
        warnings: Human-readable warnings for sensitive fields found.
        seed: The seed used, if any (None = random).
    """
    records: List[Dict]
    summary: List["FieldSummary"] = field(default_factory=list)
    warnings: List[str] = field(default_factory=list)
    seed: Optional[int] = None

    # ── Display ───────────────────────────────────────────────────────────────

    def print_summary(self) -> None:
        """Write the change report (header, field table, warnings) to stdout."""
        rule = '─' * 62

        print("\n" + rule)
        print(f" FakeSmith — change summary ({len(self.records)} records generated)")
        if self.seed is not None:
            print(f" Seed: {self.seed} (re-run with --seed {self.seed} for identical output)")
        print(rule)

        # Column headings and their underline.
        print(f" {'Field':<22} {'Detected type':<20} {'Sensitive'}")
        print(f" {'─'*20} {'─'*18} {'─'*9}")

        for entry in self.summary:
            if entry.is_sensitive:
                marker = " *** YES ***"
            else:
                marker = " no"
            print(f" {entry.name:<22} {entry.detected_type:<20}{marker}")
            if entry.note:
                # Notes sit on their own line, indented past the name column.
                print(f" {'':22} {entry.note}")
        print(rule)

        if self.warnings:
            print()
            for message in self.warnings:
                print(f" [WARNING] {message}")
        # NOTE(review): the listing this was recovered from had lost its
        # indentation; this trailing blank line is assumed to be printed
        # unconditionally — confirm against the packaged source.
        print()

    def to_dict(self) -> dict:
        """Return the summary as a plain, machine-readable dict."""
        field_rows = []
        for entry in self.summary:
            field_rows.append({
                "name": entry.name,
                "detected_type": entry.detected_type,
                "sensitive": entry.is_sensitive,
                "note": entry.note,
            })
        return {
            "seed": self.seed,
            "records_generated": len(self.records),
            "fields": field_rows,
            "warnings": self.warnings,
        }