fakesmith 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fakesmith/__init__.py +16 -0
- fakesmith/cli.py +150 -0
- fakesmith/detector.py +217 -0
- fakesmith/generators.py +205 -0
- fakesmith/result.py +75 -0
- fakesmith/sanitizer.py +180 -0
- fakesmith/schema.py +109 -0
- fakesmith/smith.py +199 -0
- fakesmith-0.1.0.dist-info/METADATA +246 -0
- fakesmith-0.1.0.dist-info/RECORD +13 -0
- fakesmith-0.1.0.dist-info/WHEEL +5 -0
- fakesmith-0.1.0.dist-info/entry_points.txt +2 -0
- fakesmith-0.1.0.dist-info/top_level.txt +1 -0
fakesmith/__init__.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from .smith import FakeSmith
|
|
2
|
+
from .schema import SchemaField, FieldType
|
|
3
|
+
from .detector import infer_schema
|
|
4
|
+
from .sanitizer import sanitize_text, SanitizeResult
|
|
5
|
+
from .result import GenerationResult, FieldSummary
|
|
6
|
+
|
|
7
|
+
# Names re-exported as the package's public API (order kept as published).
__all__ = [
    # generation entry point
    "FakeSmith",
    # schema description
    "SchemaField",
    "FieldType",
    # schema inference
    "infer_schema",
    # free-text sanitizing
    "sanitize_text",
    "SanitizeResult",
    # generation results
    "GenerationResult",
    "FieldSummary",
]
|
fakesmith/cli.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
"""
|
|
2
|
+
cli.py — Command-line interface for FakeSmith.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
# Generate fake records from a sample file
|
|
6
|
+
fakesmith generate --file data.json --count 20 --format csv
|
|
7
|
+
fakesmith generate --file data.json --count 5 --seed 42 --format sql --table users
|
|
8
|
+
fakesmith generate --file .env --count 1 --format env
|
|
9
|
+
|
|
10
|
+
# Sanitize raw text in-place (plain text / log lines / config blocks)
|
|
11
|
+
fakesmith sanitize --text "email is john@co.com and key is sk-abc123"
|
|
12
|
+
fakesmith sanitize --file raw_log.txt --out clean_log.txt
|
|
13
|
+
|
|
14
|
+
# Inspect detected schema
|
|
15
|
+
fakesmith describe --file data.json
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
import argparse
|
|
19
|
+
import sys
|
|
20
|
+
import json
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
|
|
23
|
+
from .smith import FakeSmith
|
|
24
|
+
from .sanitizer import sanitize_text
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def detect_source_type(path: str) -> str:
    """Map a file path to a source-type label based on its extension.

    Returns "json" for ``.json``, "csv" for ``.csv`` (case-insensitive) and
    "auto" for every other extension, including none at all.
    """
    suffix = Path(path).suffix.lower()
    if suffix == ".json":
        return "json"
    if suffix == ".csv":
        return "csv"
    return "auto"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def cmd_generate(args):
    """Generate fake records from a sample file and print or save them.

    Args:
        args: Parsed ``generate`` namespace. Reads ``file``, ``count``,
            ``seed``, ``summary``, ``format``, ``table`` and ``out``.

    Raises:
        SystemExit: status 1, with a message on stderr, when the sample file
            is missing or cannot be read.
        ValueError: when ``args.format`` is not one of json/csv/sql/env
            (only reachable when called without argparse's ``choices``).
    """
    try:
        with open(args.file) as f:
            raw = f.read()
    except FileNotFoundError:
        print(f"❌ File not found: {args.file}", file=sys.stderr)
        sys.exit(1)
    except (OSError, UnicodeDecodeError) as err:
        # Directories, permission problems and binary files used to escape
        # as raw tracebacks; report them the same way as a missing file.
        print(f"❌ Cannot read {args.file}: {err}", file=sys.stderr)
        sys.exit(1)

    smith = FakeSmith.from_sample(raw, source_type=detect_source_type(args.file))

    result = smith.generate(args.count, seed=args.seed)

    if args.summary:
        result.print_summary()

    # NOTE(review): the csv/sql/env branches call the smith again instead of
    # reusing `result`; presumably they regenerate from the same seed — confirm
    # that an unseeded run is meant to summarise one draw and emit another.
    if args.format == "json":
        output = json.dumps(result.records, indent=2, default=str)
    elif args.format == "csv":
        output = smith.to_csv(args.count, seed=args.seed)
    elif args.format == "sql":
        output = smith.to_sql(args.count, table_name=args.table, seed=args.seed)
    elif args.format == "env":
        output = smith.to_env(seed=args.seed)
    else:
        # Without this branch an unknown format surfaced as UnboundLocalError.
        raise ValueError(f"Unsupported output format: {args.format!r}")

    if args.out:
        with open(args.out, "w") as f:
            f.write(output)
        print(f"✅ Saved {args.count} records → {args.out}")
    else:
        print(output)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def cmd_sanitize(args):
    """Sanitize inline text or a file and print or save the result.

    Args:
        args: Parsed ``sanitize`` namespace. Reads ``text``, ``file``,
            ``seed``, ``summary`` and ``out``.

    Raises:
        SystemExit: status 1, with a message on stderr, when neither input is
            given or the input file is missing or cannot be read.
    """
    # `is not None` so that an explicitly empty --text "" still counts as input
    # instead of falling through to the "Provide --text or --file" error.
    if args.text is not None:
        raw = args.text
    elif args.file:
        try:
            with open(args.file) as f:
                raw = f.read()
        except FileNotFoundError:
            print(f"❌ File not found: {args.file}", file=sys.stderr)
            sys.exit(1)
        except (OSError, UnicodeDecodeError) as err:
            # Directories, permission problems and binary files used to
            # escape as raw tracebacks.
            print(f"❌ Cannot read {args.file}: {err}", file=sys.stderr)
            sys.exit(1)
    else:
        print("❌ Provide --text or --file", file=sys.stderr)
        sys.exit(1)

    result = sanitize_text(raw, seed=args.seed)

    # File input always prints the summary, even without --summary.
    if args.summary or args.file:
        result.print_summary()

    if args.out:
        with open(args.out, "w") as f:
            f.write(result.sanitized)
        print(f"✅ Sanitized output saved → {args.out}")
    else:
        print(result.sanitized)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def cmd_describe(args):
    """Print the schema detected for a sample file.

    Args:
        args: Parsed ``describe`` namespace. Reads ``file``.

    Raises:
        SystemExit: status 1, with a message on stderr, when the sample file
            is missing or cannot be read.
    """
    try:
        with open(args.file) as f:
            raw = f.read()
    except FileNotFoundError:
        print(f"❌ File not found: {args.file}", file=sys.stderr)
        sys.exit(1)
    except (OSError, UnicodeDecodeError) as err:
        # Directories, permission problems and binary files used to escape
        # as raw tracebacks; report them the same way as a missing file.
        print(f"❌ Cannot read {args.file}: {err}", file=sys.stderr)
        sys.exit(1)

    smith = FakeSmith.from_sample(raw, source_type=detect_source_type(args.file))
    smith.describe()
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def main(argv=None):
    """Parse command-line arguments and run the selected sub-command.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, so the console-script entry point
            keeps working unchanged; passing a list lets the CLI be driven
            from other code.

    Raises:
        SystemExit: raised by argparse on invalid usage (status 2) or by the
            command handlers on input errors.
    """
    parser = argparse.ArgumentParser(
        prog="fakesmith",
        description="Generate or sanitize fake data safe to share with LLMs.",
    )
    sub = parser.add_subparsers(dest="command", required=True)

    # ── generate ──────────────────────────────────────────────────────────────
    gen = sub.add_parser("generate", aliases=["gen", "g"],
                         help="Generate fake records from a sample file")
    gen.add_argument("--file", "-f", required=True)
    gen.add_argument("--count", "-n", type=int, default=10)
    gen.add_argument("--seed", "-s", type=int, default=None,
                     help="Integer seed for deterministic output")
    gen.add_argument("--format", "-o", default="json",
                     choices=["json", "csv", "sql", "env"])
    gen.add_argument("--table", "-t", default="records")
    gen.add_argument("--out", default=None)
    gen.add_argument("--summary", action="store_true",
                     help="Print change summary and warnings")

    # ── sanitize ──────────────────────────────────────────────────────────────
    san = sub.add_parser("sanitize", aliases=["san", "s"],
                         help="Replace secrets/PII in raw text in-place")
    san_src = san.add_mutually_exclusive_group(required=True)
    san_src.add_argument("--text", "-t", help="Inline string to sanitize")
    san_src.add_argument("--file", "-f", help="File to sanitize")
    san.add_argument("--out", "-o", default=None, help="Output file")
    san.add_argument("--seed", "-s", type=int, default=None)
    san.add_argument("--summary", action="store_true")

    # ── describe ──────────────────────────────────────────────────────────────
    desc = sub.add_parser("describe", aliases=["d"],
                          help="Show detected schema and sensitivity flags")
    desc.add_argument("--file", "-f", required=True)

    args = parser.parse_args(argv)

    # argparse stores whichever spelling was typed, so every alias is listed.
    if args.command in ("generate", "gen", "g"):
        cmd_generate(args)
    elif args.command in ("sanitize", "san", "s"):
        cmd_sanitize(args)
    elif args.command in ("describe", "d"):
        cmd_describe(args)


if __name__ == "__main__":
    main()
|
fakesmith/detector.py
ADDED
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
"""
|
|
2
|
+
detector.py — Infer a FakeSmith schema from sample JSON, CSV, or a Python dict.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
import json
|
|
7
|
+
import csv
|
|
8
|
+
import io
|
|
9
|
+
from typing import Union
|
|
10
|
+
from .schema import SchemaField, FieldType
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# Regex heuristics for common patterns
|
|
14
|
+
|
|
15
|
+
# Ordered (type, regex) pairs; _detect_from_value returns the FIRST match, so
# order is significant. Strictly structured formats must come before looser
# ones that would also accept them.
PATTERNS = [
    (FieldType.UUID, r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$'),
    (FieldType.EMAIL, r'^[\w\.-]+@[\w\.-]+\.\w+$'),
    (FieldType.IP_ADDRESS, r'^\d{1,3}(\.\d{1,3}){3}$'),
    (FieldType.IPV6, r'^([0-9a-f]{1,4}:){7}[0-9a-f]{1,4}$'),
    (FieldType.MAC_ADDRESS, r'^([0-9a-f]{2}[:\-]){5}[0-9a-f]{2}$'),
    (FieldType.URL, r'^https?://'),
    (FieldType.DATETIME, r'^\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}'),
    (FieldType.DATE, r'^\d{4}-\d{2}-\d{2}$'),
    (FieldType.TIME, r'^\d{2}:\d{2}(:\d{2})?$'),
    (FieldType.JWT_TOKEN, r'^eyJ[A-Za-z0-9\-_]+\.[A-Za-z0-9\-_]+\.[A-Za-z0-9\-_]+$'),
    (FieldType.CARD_NUMBER, r'^\d{4}[\s\-]?\d{4}[\s\-]?\d{4}[\s\-]?\d{4}$'),
    # PHONE accepts almost any run of digits, dots, dashes and spaces, so it
    # has to come AFTER IP_ADDRESS, DATE and CARD_NUMBER: placed earlier it
    # claimed "192.168.1.1", "2024-01-15" and "4111 1111 1111 1111".
    # It stays ahead of SLUG, which would otherwise claim "555-123-4567".
    # A leading "(" is accepted so "(555) 123-4567" is recognised.
    (FieldType.PHONE, r'^[\+\d\(][\d\s\-\(\)\.]{6,20}$'),
    (FieldType.IBAN, r'^[A-Z]{2}\d{2}[A-Z0-9]{1,30}$'),
    (FieldType.PASSWORD_HASH, r'^(\$2[aby]\$|sha256\$|[0-9a-f]{32,128})'),
    (FieldType.API_KEY, r'^(sk-|pk-|api-|key-|Bearer )[A-Za-z0-9\-_]{8,}'),
    (FieldType.SLUG, r'^[a-z0-9]+(-[a-z0-9]+)*$'),
]
|
|
33
|
+
|
|
34
|
+
# Name-based heuristics (field name keywords → type)
|
|
35
|
+
# Field-name keywords per type. _detect_from_name walks this dict in insertion
# order and returns the first type with a keyword contained in the field name,
# so ORDER IS SIGNIFICANT: a type whose keywords contain another type's
# keyword must be listed first ('first_name' before 'name', 'password_hash'
# before 'password', 'date_of_birth' before 'date', 'id_token' before 'id' and
# 'token', 'company' before 'pan', 'description' before 'ip').
NAME_HINTS = {
    FieldType.JWT_TOKEN: ['jwt', 'id_token', 'bearer'],
    FieldType.UUID: ['id', 'uuid', 'guid'],
    FieldType.EMAIL: ['email', 'mail'],
    FieldType.PHONE: ['phone', 'mobile', 'tel', 'fax'],
    FieldType.FIRST_NAME: ['first_name', 'firstname', 'fname'],
    FieldType.LAST_NAME: ['last_name', 'lastname', 'lname', 'surname'],
    FieldType.USERNAME: ['username', 'user_name', 'handle', 'login'],
    FieldType.FULL_NAME: ['full_name', 'fullname', 'name'],
    FieldType.PASSWORD_HASH: ['password_hash', 'hashed_password', 'pwd_hash'],
    FieldType.PASSWORD: ['password', 'passwd', 'pwd'],
    FieldType.API_KEY: ['api_key', 'apikey', 'api_token', 'access_key'],
    FieldType.SECRET_TOKEN: ['secret', 'token', 'refresh_token', 'auth_token'],
    FieldType.ADDRESS: ['address', 'street', 'addr'],
    FieldType.CITY: ['city', 'town'],
    FieldType.STATE: ['state', 'province', 'region'],
    FieldType.COUNTRY: ['country', 'nation'],
    FieldType.ZIP_CODE: ['zip', 'postal', 'postcode', 'zip_code'],
    FieldType.LATITUDE: ['lat', 'latitude'],
    FieldType.LONGITUDE: ['lon', 'lng', 'longitude'],
    FieldType.AMOUNT: ['amount', 'price', 'cost', 'fee', 'total', 'balance', 'salary'],
    FieldType.CURRENCY: ['currency', 'currency_code'],
    FieldType.COMPANY: ['company', 'organization', 'employer', 'firm'],
    FieldType.CARD_NUMBER: ['card_number', 'card_num', 'pan'],
    FieldType.CARD_EXPIRY: ['expiry', 'expiration', 'exp_date'],
    FieldType.BANK_ACCOUNT: ['account', 'account_number', 'bban'],
    FieldType.IBAN: ['iban'],
    FieldType.JOB_TITLE: ['job', 'title', 'position', 'role', 'occupation'],
    FieldType.DEPARTMENT: ['department', 'dept', 'division', 'team'],
    FieldType.DATETIME: ['created_at', 'updated_at', 'deleted_at', 'datetime', 'timestamp'],
    FieldType.DATE_OF_BIRTH: ['dob', 'birth_date', 'birthdate', 'date_of_birth'],
    FieldType.DATE: ['date', 'day'],
    FieldType.DESCRIPTION: ['description', 'desc', 'details', 'bio', 'summary', 'about'],
    FieldType.IP_ADDRESS: ['ip', 'ip_address', 'ipv4'],
    FieldType.USER_AGENT: ['user_agent', 'useragent', 'browser'],
    FieldType.URL: ['url', 'link', 'href', 'website', 'homepage'],
    # NOTE: 'state' here is shadowed by FieldType.STATE above and 'title'
    # below by FieldType.JOB_TITLE; both are kept so the table is unchanged
    # for any other reader.
    FieldType.STATUS: ['status', 'state', 'stage'],
    FieldType.TITLE: ['title', 'heading', 'subject'],
    FieldType.TAG: ['tag', 'label', 'category', 'type'],
    FieldType.GENDER: ['gender', 'sex'],
    FieldType.PERCENTAGE: ['percent', 'percentage', 'rate', 'ratio'],
    FieldType.BOOLEAN: ['is_', 'has_', 'can_', 'active', 'enabled', 'verified'],
}
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _detect_from_name(field_name: str) -> Union[FieldType, None]:
    """Guess a field type purely from the field's name.

    Keywords from NAME_HINTS are tried in three passes, each in NAME_HINTS
    order, and the first hit wins:

      1. the whole name equals a keyword ("first_name", "password_hash");
      2. a keyword appears as whole word(s) of the name, where words are
         split on non-alphanumerics and camelCase humps ("user_email");
      3. a keyword appears anywhere in the lower-cased name (the original
         behaviour, kept as a last resort for names like "useremail").

    Passes 1 and 2 stop short keywords from hijacking unrelated names: with
    substring matching alone 'pan' claimed "company", 'ip' claimed
    "description" and 'name' claimed "first_name".

    Returns:
        The matching FieldType, or None when no keyword applies.
    """
    lowered = field_name.lower()
    # "firstName" -> "first_Name" -> "first_name"; then any run of other
    # characters becomes a single "_" so words are uniformly delimited.
    words = re.sub(r'(?<=[a-z0-9])(?=[A-Z])', '_', field_name).lower()
    words = re.sub(r'[^a-z0-9]+', '_', words).strip('_')

    hints = [(ftype, kw) for ftype, keywords in NAME_HINTS.items() for kw in keywords]

    def on_word_boundary(kw: str) -> bool:
        # Keywords ending in "_" ('is_', 'has_') are prefixes: they need a
        # boundary on the left and at least one more word on the right.
        if kw.endswith('_'):
            return ('_' + kw) in ('_' + words)
        return f'_{kw}_' in f'_{words}_'

    passes = (
        lambda kw: kw == words,
        on_word_boundary,
        lambda kw: kw in lowered,
    )
    for matches in passes:
        for ftype, kw in hints:
            if matches(kw):
                return ftype
    return None
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _detect_from_value(value) -> FieldType:
    """Guess a field type from one sample value.

    bool, int and float map to BOOLEAN, INTEGER and FLOAT. Strings are
    stripped and tested against PATTERNS in order (case-insensitively); the
    first matching pattern decides. Anything else, including None and strings
    that match nothing, yields None.
    """
    # bool is tested before int because bool is a subclass of int.
    if isinstance(value, bool):
        return FieldType.BOOLEAN
    if isinstance(value, int):
        return FieldType.INTEGER
    if isinstance(value, float):
        return FieldType.FLOAT
    if not isinstance(value, str):
        return None

    text = value.strip()
    return next(
        (ftype for ftype, pattern in PATTERNS if re.match(pattern, text, re.IGNORECASE)),
        None,
    )
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
# Name hints for security-sensitive fields always win over value detection.
|
|
108
|
+
_NAME_PRIORITY_TYPES = {
    # credentials and tokens
    FieldType.PASSWORD,
    FieldType.PASSWORD_HASH,
    FieldType.API_KEY,
    FieldType.SECRET_TOKEN,
    FieldType.JWT_TOKEN,
    # payment and banking
    FieldType.CARD_NUMBER,
    FieldType.CARD_EXPIRY,
    FieldType.BANK_ACCOUNT,
    FieldType.IBAN,
    # personal data
    FieldType.DATE_OF_BIRTH,
}
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _detect_field(field_name: str, sample_value=None) -> FieldType:
    """Combine name and value heuristics into one field type.

    Priority:
      1. Name hint for security-sensitive types (password, api_key, etc.)
      2. Value-based regex for precise structural matches (email, UUID, IP…)
      3. Name hint for non-sensitive types
      4. Generic value type (int, float, bool, slug)
      5. SENTENCE fallback

    Args:
        field_name: Name of the field / column.
        sample_value: One sample value for the field, or None when unknown.
    """
    by_name = _detect_from_name(field_name)
    # _detect_from_value already returns None for a None sample.
    by_value = _detect_from_value(sample_value)

    # Security-sensitive name hints always win — prevents "s3cr3t" → SLUG
    if by_name is not None and by_name in _NAME_PRIORITY_TYPES:
        return by_name

    # Value regex is precise for structural types; the generic ones are held
    # back so that a name hint can still override them.
    generic_value_types = (FieldType.INTEGER, FieldType.FLOAT,
                           FieldType.BOOLEAN, FieldType.SLUG)
    if by_value is not None and by_value not in generic_value_types:
        return by_value

    if by_name is not None:
        return by_name
    if by_value is not None:
        return by_value

    # The former isinstance(bool/int/float) fallbacks were unreachable: those
    # samples always produce a by_value, which is returned just above.
    return FieldType.SENTENCE
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def infer_schema(source: Union[str, dict, list], source_type: str = "auto") -> list[SchemaField]:
    """
    Build a schema by inspecting the first record of some sample data.

    Args:
        source: A JSON string, CSV string, Python dict, or list of dicts.
        source_type: "json" | "csv" | "dict" | "auto" (default)

    Returns:
        One SchemaField per key of the sample record, in the record's key
        order. A field is marked nullable when its sample value is None.
    """
    sample = _extract_sample_record(source, source_type)
    return [
        SchemaField(
            name=column,
            field_type=_detect_field(column, cell),
            nullable=cell is None,
        )
        for column, cell in sample.items()
    ]
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def _extract_sample_record(source, source_type) -> dict:
|
|
172
|
+
"""Pull the first record out of whatever format was given."""
|
|
173
|
+
if isinstance(source, dict):
|
|
174
|
+
return source
|
|
175
|
+
if isinstance(source, list) and source:
|
|
176
|
+
return source[0]
|
|
177
|
+
|
|
178
|
+
if source_type == "auto":
|
|
179
|
+
stripped = source.strip() if isinstance(source, str) else ""
|
|
180
|
+
if stripped.startswith(("{", "[")):
|
|
181
|
+
source_type = "json"
|
|
182
|
+
else:
|
|
183
|
+
source_type = "csv"
|
|
184
|
+
|
|
185
|
+
if source_type == "json":
|
|
186
|
+
data = json.loads(source)
|
|
187
|
+
return data[0] if isinstance(data, list) else data
|
|
188
|
+
|
|
189
|
+
if source_type == "csv":
|
|
190
|
+
reader = csv.DictReader(io.StringIO(source))
|
|
191
|
+
for row in reader:
|
|
192
|
+
return dict(row)
|
|
193
|
+
|
|
194
|
+
raise ValueError(f"Cannot parse source with source_type='{source_type}'")
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
# Gap 5: Sensitive field registry
|
|
198
|
+
# Used by smith.py to flag fields that contain real PII or secrets.
|
|
199
|
+
|
|
200
|
+
SENSITIVE_TYPES = {
    # contact details
    FieldType.EMAIL,
    FieldType.PHONE,
    # credentials and tokens
    FieldType.PASSWORD,
    FieldType.PASSWORD_HASH,
    FieldType.API_KEY,
    FieldType.SECRET_TOKEN,
    FieldType.JWT_TOKEN,
    # payment and banking
    FieldType.CARD_NUMBER,
    FieldType.CARD_EXPIRY,
    FieldType.CARD_CVV,
    FieldType.BANK_ACCOUNT,
    FieldType.IBAN,
    # personal data
    FieldType.DATE_OF_BIRTH,
    # network identifiers
    FieldType.IP_ADDRESS,
    FieldType.IPV6,
    FieldType.MAC_ADDRESS,
}
|
fakesmith/generators.py
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
"""
|
|
2
|
+
generators.py — Maps every FieldType to a Faker-backed callable.
|
|
3
|
+
Includes format-preserving helpers (Gap 4) and seed threading (Gap 1).
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import re
|
|
7
|
+
import random
|
|
8
|
+
import uuid
|
|
9
|
+
import base64
|
|
10
|
+
import hashlib
|
|
11
|
+
import time
|
|
12
|
+
from typing import Any, Optional, Union
|
|
13
|
+
from faker import Faker
|
|
14
|
+
from .schema import SchemaField, FieldType
|
|
15
|
+
|
|
16
|
+
# Faker instances created so far, keyed by locale (filled by _get_faker).
_fakers: dict[str, Faker] = {}


def seed_all(seed: int) -> None:
    """Make subsequent output deterministic for the given seed.

    Seeds the ``random`` module, Faker's class-level seed, and every Faker
    instance already held in the locale cache.
    """
    random.seed(seed)
    Faker.seed(seed)
    for cached in _fakers.values():
        cached.seed_instance(seed)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _get_faker(locale: str = "en_US") -> Faker:
    """Return the cached Faker for ``locale``, creating it on first use."""
    try:
        return _fakers[locale]
    except KeyError:
        created = _fakers[locale] = Faker(locale)
        return created
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# Gap 4: format-preserving helpers
|
|
34
|
+
|
|
35
|
+
def _preserve_phone(original: str) -> str:
|
|
36
|
+
"""Return a fake phone that mirrors the separator + prefix style of original."""
|
|
37
|
+
has_plus = original.strip().startswith('+')
|
|
38
|
+
has_parens = '(' in original
|
|
39
|
+
sep = '-' if '-' in original else (' ' if ' ' in original else '')
|
|
40
|
+
digits = re.sub(r'\D', '', original)
|
|
41
|
+
fake_d = ''.join(str(random.randint(0, 9)) for _ in digits)
|
|
42
|
+
if has_plus and len(digits) > 10:
|
|
43
|
+
cc = fake_d[:len(digits) - 10]
|
|
44
|
+
rest = fake_d[len(digits) - 10:]
|
|
45
|
+
if has_parens:
|
|
46
|
+
return f"+{cc}{sep}({rest[:3]}){sep}{rest[3:6]}{sep}{rest[6:]}"
|
|
47
|
+
return f"+{cc}{sep}{rest[:3]}{sep}{rest[3:6]}{sep}{rest[6:]}"
|
|
48
|
+
if has_parens:
|
|
49
|
+
return f"({fake_d[:3]}){sep}{fake_d[3:6]}{sep}{fake_d[6:]}"
|
|
50
|
+
if sep:
|
|
51
|
+
return f"{fake_d[:3]}{sep}{fake_d[3:6]}{sep}{fake_d[6:]}"
|
|
52
|
+
return fake_d
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _preserve_api_key(original: str, prefix: Optional[str] = None) -> str:
|
|
56
|
+
"""Return a fake key with the same prefix and character count."""
|
|
57
|
+
if prefix:
|
|
58
|
+
length = max(len(original) - len(prefix), 16)
|
|
59
|
+
chars = 'abcdefghijklmnopqrstuvwxyz0123456789'
|
|
60
|
+
return prefix + ''.join(random.choices(chars, k=length))
|
|
61
|
+
m = re.match(r'^([a-zA-Z0-9_\-]{2,8}[-_])', original)
|
|
62
|
+
detected = m.group(1) if m else 'key-'
|
|
63
|
+
length = max(len(original) - len(detected), 16)
|
|
64
|
+
chars = 'abcdefghijklmnopqrstuvwxyz0123456789'
|
|
65
|
+
return detected + ''.join(random.choices(chars, k=length))
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _preserve_card(original: str) -> str:
    """Return a fake card number that mirrors the spacing/dash format of original.

    When ``original`` is 16 digits split by "-" or " ", the result is a
    16-digit number in four groups of four joined by the same separator.
    Otherwise an unformatted card number of whatever type Faker picks is
    returned.
    """
    sep = '-' if '-' in original else (' ' if ' ' in original else '')
    # Reuse the cached Faker instead of building a new one on every call.
    fake = _get_faker()
    if sep and len(original.replace(sep, '')) == 16:
        # Ask for a 16-digit type explicitly: a card of random type can be
        # shorter or longer than 16 digits, and slicing it into 4x4 then
        # truncated it or left a short last group.
        digits = fake.credit_card_number(card_type="visa16")
        return sep.join(digits[i:i + 4] for i in range(0, 16, 4))
    return fake.credit_card_number()
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _fake_jwt() -> str:
|
|
78
|
+
"""Generate a structurally valid but fake JWT token."""
|
|
79
|
+
def b64(data: str) -> str:
|
|
80
|
+
return base64.urlsafe_b64encode(data.encode()).rstrip(b"=").decode()
|
|
81
|
+
header = b64('{"alg":"HS256","typ":"JWT"}')
|
|
82
|
+
payload = b64(f'{{"sub":"{uuid.uuid4()}","iat":{int(time.time())}}}')
|
|
83
|
+
sig = base64.urlsafe_b64encode(uuid.uuid4().bytes).rstrip(b"=").decode()
|
|
84
|
+
return f"{header}.{payload}.{sig}"
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _fake_api_key(prefix: str = "sk-") -> str:
    """Return ``prefix`` followed by 32 random lower-case alphanumerics."""
    alphabet = "abcdefghijklmnopqrstuvwxyz0123456789"
    template = "?" * 32
    tail = _get_faker().lexify(template, letters=alphabet)
    return prefix + tail
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _fake_secret_token() -> str:
|
|
93
|
+
return hashlib.sha256(uuid.uuid4().bytes).hexdigest()
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def generate_value(field: SchemaField, original_value: Optional[str] = None) -> Any:
    """Generate a single fake value for ``field``.

    Args:
        field: Schema entry. Reads ``locale``, ``nullable``,
            ``null_probability``, ``field_type``, ``generator``, ``choices``,
            ``prefix``, ``min_value`` and ``max_value``.
        original_value: Optional sample value. When it is a non-empty string,
            PHONE, CARD_NUMBER and API_KEY fields imitate its format.

    Returns:
        None for a nullable field that drew a null; otherwise the custom
        generator's result, one of ``field.choices``, or a value produced for
        ``field.field_type``. Unknown types fall back to a random word.
    """
    fake = _get_faker(field.locale or "en_US")

    # Handle nullable fields
    if field.nullable and random.random() < field.null_probability:
        return None

    # Custom generator takes full priority
    if field.field_type == FieldType.CUSTOM:
        return field.generator()

    # Choices-based field
    if field.choices:
        return random.choice(field.choices)

    ft = field.field_type

    # Identity
    if ft == FieldType.UUID: return fake.uuid4()
    if ft == FieldType.FULL_NAME: return fake.name()
    if ft == FieldType.FIRST_NAME: return fake.first_name()
    if ft == FieldType.LAST_NAME: return fake.last_name()
    if ft == FieldType.USERNAME: return fake.user_name()
    if ft == FieldType.EMAIL: return fake.email()
    if ft == FieldType.PHONE:
        if original_value and isinstance(original_value, str):
            return _preserve_phone(original_value)
        return fake.phone_number()
    if ft == FieldType.PASSWORD: return fake.password(length=12)
    if ft == FieldType.PASSWORD_HASH: return hashlib.sha256(fake.password().encode()).hexdigest()

    # ── Location
    if ft == FieldType.ADDRESS: return fake.address()
    if ft == FieldType.CITY: return fake.city()
    # Not every locale provider has state(); fall back to a city name.
    if ft == FieldType.STATE: return fake.state() if hasattr(fake, "state") else fake.city()
    if ft == FieldType.COUNTRY: return fake.country()
    if ft == FieldType.ZIP_CODE: return fake.postcode()
    if ft == FieldType.LATITUDE: return float(fake.latitude())
    if ft == FieldType.LONGITUDE: return float(fake.longitude())

    # ── Finance ─
    if ft == FieldType.CARD_NUMBER:
        if original_value and isinstance(original_value, str):
            return _preserve_card(original_value)
        return fake.credit_card_number()
    if ft == FieldType.CARD_EXPIRY: return fake.credit_card_expire()
    if ft == FieldType.CARD_CVV: return fake.credit_card_security_code()
    if ft == FieldType.BANK_ACCOUNT: return fake.bban()
    if ft == FieldType.IBAN: return fake.iban()
    if ft == FieldType.CURRENCY: return random.choice(["USD", "EUR", "GBP", "GHS", "NGN", "JPY", "CAD"])
    if ft == FieldType.AMOUNT:
        lo = field.min_value if field.min_value is not None else 1.0
        hi = field.max_value if field.max_value is not None else 99999.0
        return round(random.uniform(lo, hi), 2)

    # Business
    if ft == FieldType.COMPANY: return fake.company()
    if ft == FieldType.JOB_TITLE: return fake.job()
    if ft == FieldType.DEPARTMENT: return random.choice(["Engineering", "Marketing", "Finance", "HR", "Sales", "Design", "Operations"])
    if ft == FieldType.API_KEY:
        if original_value and isinstance(original_value, str):
            return _preserve_api_key(original_value, field.prefix)
        return _fake_api_key(field.prefix or "sk-")
    if ft == FieldType.SECRET_TOKEN: return _fake_secret_token()
    if ft == FieldType.JWT_TOKEN: return _fake_jwt()
    # The 8 hex characters come from `random`, not uuid4(), so a seeded run
    # reproduces the same URL.
    if ft == FieldType.WEBHOOK_URL: return f"https://hooks.{fake.domain_name()}/webhook/{random.getrandbits(32):08x}"

    # Dates & Times
    if ft == FieldType.DATETIME: return fake.date_time_this_year().isoformat()
    if ft == FieldType.DATE: return fake.date_this_year().isoformat()
    if ft == FieldType.TIME: return fake.time()
    if ft == FieldType.DATE_OF_BIRTH: return fake.date_of_birth(minimum_age=18, maximum_age=70).isoformat()
    if ft == FieldType.TIMESTAMP: return int(fake.date_time_this_year().timestamp())

    # Web & Tech
    if ft == FieldType.IP_ADDRESS: return fake.ipv4()
    if ft == FieldType.IPV6: return fake.ipv6()
    if ft == FieldType.MAC_ADDRESS: return fake.mac_address()
    if ft == FieldType.USER_AGENT: return fake.user_agent()
    if ft == FieldType.URL: return fake.url()
    if ft == FieldType.DOMAIN: return fake.domain_name()
    if ft == FieldType.SLUG: return fake.slug()

    # Content
    if ft == FieldType.WORD: return fake.word()
    if ft == FieldType.SENTENCE: return fake.sentence()
    if ft == FieldType.PARAGRAPH: return fake.paragraph()
    if ft == FieldType.TITLE: return fake.catch_phrase()
    if ft == FieldType.DESCRIPTION: return fake.text(max_nb_chars=200)
    if ft == FieldType.TAG: return fake.word()

    # Numeric
    if ft == FieldType.INTEGER:
        lo = int(field.min_value) if field.min_value is not None else 0
        hi = int(field.max_value) if field.max_value is not None else 10000
        return random.randint(lo, hi)
    if ft == FieldType.FLOAT:
        lo = field.min_value if field.min_value is not None else 0.0
        hi = field.max_value if field.max_value is not None else 1000.0
        return round(random.uniform(lo, hi), 4)
    if ft == FieldType.BOOLEAN: return random.choice([True, False])
    if ft == FieldType.PERCENTAGE: return round(random.uniform(0, 100), 2)

    # Enums
    if ft == FieldType.STATUS: return random.choice(["active", "inactive", "pending", "suspended", "archived"])
    if ft == FieldType.GENDER: return random.choice(["male", "female", "non-binary", "prefer_not_to_say"])

    # Fallback
    return fake.word()
|
fakesmith/result.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""
|
|
2
|
+
result.py — GenerationResult wraps generated records with a change summary
|
|
3
|
+
and any warnings raised during detection.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from typing import List, Dict, Optional
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class FieldSummary:
    """Summary entry describing how one field was classified."""

    # Field name as it appears in the source.
    name: str
    # Label of the type detection chose for the field.
    detected_type: str
    # Whether the field is flagged as sensitive.
    is_sensitive: bool
    # First value from source (truncated / redacted).
    original_sample: str
    # Optional remark; print_summary shows it on its own line when non-empty.
    note: str = ""
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class GenerationResult:
    """
    Generated records bundled with a per-field summary and warnings.

    Attributes:
        records:  The generated fake records.
        summary:  Per-field breakdown of what was detected and replaced.
        warnings: Human-readable warnings for sensitive fields found.
        seed:     The seed used, if any (None = random).
    """
    records: List[Dict]
    summary: List[FieldSummary] = field(default_factory=list)
    warnings: List[str] = field(default_factory=list)
    seed: Optional[int] = None

    # ── Display ───────────────────────────────────────────────────────────────

    def print_summary(self) -> None:
        """Write a human-readable change report to stdout."""
        rule = '─' * 62

        # A leading empty entry yields the blank line before the first rule.
        report = [
            "",
            rule,
            f" FakeSmith — change summary ({len(self.records)} records generated)",
        ]
        if self.seed is not None:
            report.append(
                f" Seed: {self.seed} (re-run with --seed {self.seed} for identical output)"
            )
        report.append(rule)
        report.append(f" {'Field':<22} {'Detected type':<20} {'Sensitive'}")
        report.append(f" {'─'*20} {'─'*18} {'─'*9}")

        for entry in self.summary:
            if entry.is_sensitive:
                marker = " *** YES ***"
            else:
                marker = " no"
            report.append(f" {entry.name:<22} {entry.detected_type:<20}{marker}")
            if entry.note:
                report.append(f" {'':22} {entry.note}")
        report.append(rule)

        if self.warnings:
            report.append("")
            report.extend(f" [WARNING] {message}" for message in self.warnings)
        # A trailing empty entry yields the closing blank line.
        report.append("")

        print("\n".join(report))

    def to_dict(self) -> dict:
        """Return the summary as a plain dict (the records are only counted)."""
        field_rows = []
        for entry in self.summary:
            field_rows.append({
                "name": entry.name,
                "detected_type": entry.detected_type,
                "sensitive": entry.is_sensitive,
                "note": entry.note,
            })
        return {
            "seed": self.seed,
            "records_generated": len(self.records),
            "fields": field_rows,
            "warnings": self.warnings,
        }
|