misata 0.1.0b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- misata/__init__.py +48 -0
- misata/api.py +460 -0
- misata/audit.py +415 -0
- misata/benchmark.py +376 -0
- misata/cli.py +680 -0
- misata/codegen.py +153 -0
- misata/curve_fitting.py +106 -0
- misata/customization.py +256 -0
- misata/feedback.py +433 -0
- misata/formulas.py +362 -0
- misata/generators.py +247 -0
- misata/hybrid.py +398 -0
- misata/llm_parser.py +493 -0
- misata/noise.py +346 -0
- misata/schema.py +252 -0
- misata/semantic.py +185 -0
- misata/simulator.py +742 -0
- misata/story_parser.py +425 -0
- misata/templates/__init__.py +444 -0
- misata/validation.py +313 -0
- misata-0.1.0b0.dist-info/METADATA +291 -0
- misata-0.1.0b0.dist-info/RECORD +25 -0
- misata-0.1.0b0.dist-info/WHEEL +5 -0
- misata-0.1.0b0.dist-info/entry_points.txt +2 -0
- misata-0.1.0b0.dist-info/top_level.txt +1 -0
misata/formulas.py
ADDED
|
@@ -0,0 +1,362 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Formula engine for derived columns.
|
|
3
|
+
|
|
4
|
+
This module enables columns that are computed from other columns,
|
|
5
|
+
supporting expressions like:
|
|
6
|
+
- calories_burned = duration_minutes * @exercises.calories_per_minute
|
|
7
|
+
- total_price = quantity * @products.price
|
|
8
|
+
- discount_amount = total_price * 0.1
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import re
|
|
12
|
+
from typing import Dict, List, Optional
|
|
13
|
+
|
|
14
|
+
import numpy as np
|
|
15
|
+
import pandas as pd
|
|
16
|
+
import ast
|
|
17
|
+
import operator
|
|
18
|
+
from simpleeval import simple_eval, NameNotDefined
|
|
19
|
+
|
|
20
|
+
# Whitelist of safe functions
# These are the only callables reachable from user formulas, both as bare
# names and through the `np.` proxy (see SafeNumpy).
SAFE_FUNCTIONS = dict(
    # conditional selection
    where=np.where,
    # magnitude / rounding helpers
    abs=np.abs,
    round=np.round,
    ceil=np.ceil,
    floor=np.floor,
    # element-wise extrema
    min=np.minimum,
    max=np.maximum,
    # trigonometry
    sin=np.sin,
    cos=np.cos,
    tan=np.tan,
    # exponential family
    log=np.log,
    exp=np.exp,
    sqrt=np.sqrt,
    # randomness — NOTE: draws from numpy's global RNG state
    random=np.random.random,
    randint=np.random.randint,
)
|
|
38
|
+
|
|
39
|
+
# Standard operators to bypass simpleeval's string length checks which fail on numpy arrays
SAFE_OPERATORS = {
    # arithmetic
    ast.Add: operator.add,
    ast.Sub: operator.sub,
    ast.Mult: operator.mul,
    ast.Div: operator.truediv,
    ast.FloorDiv: operator.floordiv,
    ast.Mod: operator.mod,
    ast.Pow: operator.pow,
    # comparisons
    ast.Eq: operator.eq,
    ast.NotEq: operator.ne,
    ast.Lt: operator.lt,
    ast.LtE: operator.le,
    ast.Gt: operator.gt,
    ast.GtE: operator.ge,
    # bitwise — double as element-wise and/or/xor on numpy boolean arrays
    ast.BitAnd: operator.and_,
    ast.BitOr: operator.or_,
    ast.BitXor: operator.xor,
    # unary
    ast.UAdd: operator.pos,
    ast.USub: operator.neg,
    ast.Not: operator.not_,
    # membership
    ast.In: lambda item, container: item in container,
}
|
|
62
|
+
|
|
63
|
+
class SafeNumpy:
    """Proxy standing in for the ``np`` name inside formulas.

    Attribute access is restricted to the SAFE_FUNCTIONS whitelist, so
    user-supplied expressions cannot reach arbitrary numpy APIs.
    """

    def __getattr__(self, name):
        # Guard clause: reject anything outside the whitelist.
        if name not in SAFE_FUNCTIONS:
            raise NameNotDefined(name, f"Function 'np.{name}' is not allowed in formulas.")
        return SAFE_FUNCTIONS[name]
|
|
69
|
+
|
|
70
|
+
class FormulaEngine:
    """
    Evaluate column formulas using safe expressions.

    Supports:
    - Simple arithmetic: duration * 10
    - Column references: quantity * unit_price
    - Cross-table references: @exercises.calories_per_minute
    - Conditional expressions: np.where(status == 'active', 1, 0)
    """

    # Matches cross-table references of the form @table.column.
    _REF_PATTERN = re.compile(r'@(\w+)\.(\w+)')

    def __init__(self, tables: Dict[str, pd.DataFrame]):
        """
        Initialize with generated tables for cross-table lookups.

        Args:
            tables: Dict mapping table names to DataFrames
        """
        self.tables = tables

    def _base_names(self, df: pd.DataFrame) -> Dict[str, object]:
        """Build the evaluation namespace: the safe numpy proxy, pandas, and
        one entry per column of *df* (exposed as a raw numpy array)."""
        names = {
            'np': SafeNumpy(),
            'pd': pd,  # Needed for some checks, but ideally we restrict this too
        }
        for col in df.columns:
            names[col] = df[col].values
        return names

    def evaluate(
        self,
        df: pd.DataFrame,
        formula: str,
        fk_column: Optional[str] = None,
    ) -> np.ndarray:
        """
        Evaluate a formula on a DataFrame.

        Args:
            df: DataFrame to evaluate on
            formula: Expression string
            fk_column: Foreign key column name for cross-table lookups

        Returns:
            Array of computed values

        Raises:
            ValueError: if the formula cannot be evaluated, or a cross-table
                reference cannot be resolved.
        """
        names = self._base_names(df)

        # Replace cross-table references with _lookup_N variables AND bind
        # the looked-up arrays into `names`.  (Previously the looked-up
        # values were computed but discarded, so any formula containing
        # @table.column failed with an undefined-name error.)
        processed_formula = self._resolve_cross_table_refs(df, formula, fk_column, names)

        # Evaluate the expression safely
        try:
            result = simple_eval(
                processed_formula,
                names=names,
                functions=SAFE_FUNCTIONS,  # Allow top-level functions too
                operators=SAFE_OPERATORS,
            )
            return np.array(result)
        except Exception as e:
            raise ValueError(f"Failed to evaluate formula '{formula}': {e}")

    def _resolve_cross_table_refs(
        self,
        df: pd.DataFrame,
        formula: str,
        fk_column: Optional[str] = None,
        names: Optional[Dict[str, object]] = None,
    ) -> str:
        """
        Replace @table.column references with _lookup_N variables.

        Pattern: @tablename.columnname

        Args:
            df: Current DataFrame
            formula: Formula with potential cross-table refs
            fk_column: FK column to use for lookups; defaults to
                '<tablename>_id'
            names: Evaluation namespace to receive the looked-up arrays under
                their _lookup_N variable names.  When omitted, the lookups are
                still validated but the arrays are discarded (legacy
                behaviour of the old signature).

        Returns:
            Formula with refs replaced by _lookup_N variables

        Raises:
            ValueError: if a referenced table, column, or FK column is missing.
        """
        matches = self._REF_PATTERN.findall(formula)
        if not matches:
            return formula

        if names is None:
            names = {}

        result = formula

        for i, (table_name, col_name) in enumerate(matches):
            if table_name not in self.tables:
                raise ValueError(f"Table '{table_name}' not found for formula lookup")

            ref_table = self.tables[table_name]

            if col_name not in ref_table.columns:
                raise ValueError(f"Column '{col_name}' not found in table '{table_name}'")

            # Determine the FK column to use for lookup
            actual_fk = fk_column or f"{table_name}_id"

            if actual_fk not in df.columns:
                raise ValueError(
                    f"Cannot lookup @{table_name}.{col_name}: "
                    f"no foreign key column '{actual_fk}' in current table"
                )

            # NOTE: duplicate ids in the reference table resolve last-one-wins
            # through to_dict(); ids are assumed unique.
            lookup_map = ref_table.set_index('id')[col_name].to_dict()

            var_name = f'_lookup_{i}'
            # Bind the looked-up array so the evaluator can resolve var_name.
            names[var_name] = df[actual_fk].map(lookup_map).values
            result = result.replace(f'@{table_name}.{col_name}', var_name)

        return result

    def evaluate_with_lookups(
        self,
        df: pd.DataFrame,
        formula: str,
        fk_mappings: Optional[Dict[str, str]] = None,
    ) -> np.ndarray:
        """
        Evaluate formula with automatic cross-table lookups.

        Unlike evaluate(), this variant guesses the FK column per referenced
        table (explicit mapping, singularized name, then common fallbacks)
        and fills unmatched FK values with 0.

        Args:
            df: DataFrame to evaluate on
            formula: Expression with @table.column references
            fk_mappings: Optional dict mapping table name to FK column name
                         e.g., {"exercises": "exercise_id", "products": "product_id"}

        Returns:
            Array of computed values

        Raises:
            ValueError: if a referenced table is unknown, no FK column can be
                found, the reference table lacks an 'id' column, or the final
                expression fails to evaluate.
        """
        fk_mappings = fk_mappings or {}

        matches = self._REF_PATTERN.findall(formula)

        result = formula
        names = self._base_names(df)

        # Resolve each cross-table reference
        for i, (table_name, col_name) in enumerate(matches):
            if table_name not in self.tables:
                raise ValueError(f"Table '{table_name}' not found")

            ref_table = self.tables[table_name]

            # Determine FK column: explicit mapping first, default '<table>_id'
            fk_col = fk_mappings.get(table_name, f"{table_name}_id")
            if fk_col.endswith("s_id"):
                # Try without trailing 's' (exercises -> exercise_id)
                alt_fk = fk_col.replace("s_id", "_id")
                if alt_fk in df.columns:
                    fk_col = alt_fk

            if fk_col not in df.columns:
                # Try common patterns
                for pattern_fk in [f"{table_name}_id", f"{table_name[:-1]}_id", "id"]:
                    if pattern_fk in df.columns:
                        fk_col = pattern_fk
                        break

            if fk_col not in df.columns:
                raise ValueError(f"No FK column found for table '{table_name}'")

            # Create lookup and add to context
            if 'id' not in ref_table.columns:
                raise ValueError(f"Reference table '{table_name}' has no 'id' column")

            lookup_map = ref_table.set_index('id')[col_name].to_dict()
            # Unmatched FK values become 0 so arithmetic stays well-defined.
            looked_up = df[fk_col].map(lookup_map).fillna(0).values

            var_name = f'_ref_{i}'
            names[var_name] = looked_up
            result = result.replace(f'@{table_name}.{col_name}', var_name)

        # Evaluate safely
        try:
            return np.array(simple_eval(
                result,
                names=names,
                functions=SAFE_FUNCTIONS,
                operators=SAFE_OPERATORS,
            ))
        except Exception as e:
            raise ValueError(f"Failed to evaluate formula '{formula}': {e}")
|
284
|
+
|
|
285
|
+
|
|
286
|
+
class FormulaColumn:
    """
    Definition of a formula-based column.
    """

    def __init__(
        self,
        name: str,
        formula: str,
        result_type: str = "float",
        fk_mappings: Optional[Dict[str, str]] = None,
    ):
        """
        Define a formula column.

        Args:
            name: Column name
            formula: Expression (can include @table.column refs)
            result_type: Type of result (int, float, boolean)
            fk_mappings: Map table names to FK column names
        """
        self.name = name
        self.formula = formula
        self.result_type = result_type
        self.fk_mappings = fk_mappings or {}

    def evaluate(
        self,
        df: pd.DataFrame,
        tables: Dict[str, pd.DataFrame],
    ) -> np.ndarray:
        """
        Evaluate this formula column.

        Args:
            df: Current table DataFrame
            tables: All generated tables for cross-table lookups

        Returns:
            Array of computed values, cast according to result_type.
        """
        engine = FormulaEngine(tables)
        values = engine.evaluate_with_lookups(df, self.formula, self.fk_mappings)

        # Map the declared result type to a numpy cast; unrecognized
        # result types leave the values untouched.
        casts = {"int": int, "float": float, "boolean": bool}
        cast = casts.get(self.result_type)
        return values.astype(cast) if cast is not None else values
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
def apply_formula_columns(
    df: pd.DataFrame,
    formulas: List[FormulaColumn],
    tables: Dict[str, pd.DataFrame],
) -> pd.DataFrame:
    """
    Apply formula columns to a DataFrame.

    The input frame is not mutated: each formula is evaluated against a
    progressively augmented copy, so later formulas can reference columns
    produced by earlier ones.

    Args:
        df: DataFrame to add columns to
        formulas: List of formula column definitions
        tables: All tables for cross-table lookups

    Returns:
        DataFrame with formula columns added
    """
    augmented = df.copy()
    for spec in formulas:
        augmented[spec.name] = spec.evaluate(augmented, tables)
    return augmented
|
misata/generators.py
ADDED
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Pure Python text generators for Misata.
|
|
3
|
+
|
|
4
|
+
Replaces Mimesis with lightweight, built-in generators using curated data pools.
|
|
5
|
+
No external dependencies required.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import random
|
|
9
|
+
import string
|
|
10
|
+
from typing import Dict, List, Optional
|
|
11
|
+
|
|
12
|
+
# ============================================
|
|
13
|
+
# DATA POOLS
|
|
14
|
+
# ============================================
|
|
15
|
+
|
|
16
|
+
# Common US given names; seed data for the "first_names" pool.
FIRST_NAMES = [
    "James", "Mary", "Robert", "Patricia", "John", "Jennifer", "Michael", "Linda",
    "David", "Elizabeth", "William", "Barbara", "Richard", "Susan", "Joseph", "Jessica",
    "Thomas", "Sarah", "Christopher", "Karen", "Charles", "Lisa", "Daniel", "Nancy",
    "Matthew", "Betty", "Anthony", "Margaret", "Mark", "Sandra", "Donald", "Ashley",
    "Steven", "Kimberly", "Paul", "Emily", "Andrew", "Donna", "Joshua", "Michelle",
    "Kenneth", "Dorothy", "Kevin", "Carol", "Brian", "Amanda", "George", "Melissa",
    "Timothy", "Deborah", "Ronald", "Stephanie", "Edward", "Rebecca", "Jason", "Sharon",
]

# Common US surnames; seed data for the "last_names" pool.
LAST_NAMES = [
    "Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis",
    "Rodriguez", "Martinez", "Hernandez", "Lopez", "Gonzalez", "Wilson", "Anderson",
    "Thomas", "Taylor", "Moore", "Jackson", "Martin", "Lee", "Perez", "Thompson",
    "White", "Harris", "Sanchez", "Clark", "Ramirez", "Lewis", "Robinson", "Walker",
    "Young", "Allen", "King", "Wright", "Scott", "Torres", "Nguyen", "Hill",
    "Flores", "Green", "Adams", "Nelson", "Baker", "Hall", "Rivera", "Campbell",
]

# Popular mail providers used when synthesizing email addresses.
EMAIL_DOMAINS = [
    "gmail.com", "yahoo.com", "outlook.com", "hotmail.com", "icloud.com",
    "protonmail.com", "mail.com", "aol.com", "zoho.com", "fastmail.com",
]

# Fictional company names (pop-culture references), safe for synthetic data.
COMPANY_NAMES = [
    "Acme Corp", "Globex", "Initech", "Umbrella Corp", "Stark Industries",
    "Wayne Enterprises", "Cyberdyne Systems", "Soylent Corp", "Massive Dynamic",
    "Aperture Science", "InGen", "Tyrell Corporation", "Weyland-Yutani", "OsCorp",
    "LexCorp", "Oscorp Industries", "Dharma Initiative", "Dunder Mifflin",
    "Sterling Cooper", "Wonka Industries", "Prestige Worldwide", "Vandelay Industries",
]

# Street base names; combined with STREET_SUFFIXES to form addresses.
STREET_NAMES = [
    "Main", "Oak", "Maple", "Cedar", "Elm", "Pine", "Washington", "Lake",
    "Hill", "Park", "River", "Sunset", "Highland", "Valley", "Forest", "Spring",
]

# Abbreviated street types appended after a street name.
STREET_SUFFIXES = ["St", "Ave", "Blvd", "Dr", "Ln", "Rd", "Way", "Ct", "Pl", "Cir"]

# Large US cities for full addresses.
CITIES = [
    "New York", "Los Angeles", "Chicago", "Houston", "Phoenix", "Philadelphia",
    "San Antonio", "San Diego", "Dallas", "San Jose", "Austin", "Jacksonville",
    "Fort Worth", "Columbus", "Charlotte", "Seattle", "Denver", "Boston",
]

# Two-letter US state abbreviations.
STATES = [
    "NY", "CA", "TX", "FL", "IL", "PA", "OH", "GA", "NC", "MI",
    "NJ", "VA", "WA", "AZ", "MA", "CO", "TN", "IN", "MO", "MD",
]

# Lorem-ipsum vocabulary for sentence/word/text generation.
LOREM_WORDS = [
    "lorem", "ipsum", "dolor", "sit", "amet", "consectetur", "adipiscing", "elit",
    "sed", "do", "eiusmod", "tempor", "incididunt", "ut", "labore", "et", "dolore",
    "magna", "aliqua", "enim", "ad", "minim", "veniam", "quis", "nostrud",
    "exercitation", "ullamco", "laboris", "nisi", "aliquip", "ex", "ea", "commodo",
]

# Reserved example domains used as URL bases (RFC 2606-style placeholders).
URLS = [
    "https://example.com", "https://test.org", "https://demo.io", "https://sample.net",
]
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
# ============================================
|
|
79
|
+
# GENERATOR CLASS
|
|
80
|
+
# ============================================
|
|
81
|
+
|
|
82
|
+
class TextGenerator:
    """
    Pure Python text generator for synthetic data.

    Drop-in replacement for Mimesis functionality.
    Supports expandable data pools that can grow over time.
    """

    # Class-level pools (shared across instances; extendable at runtime via
    # extend_pool / load_pools_from_file).
    _pools = {
        "first_names": list(FIRST_NAMES),
        "last_names": list(LAST_NAMES),
        "email_domains": list(EMAIL_DOMAINS),
        "company_names": list(COMPANY_NAMES),
        "street_names": list(STREET_NAMES),
        "cities": list(CITIES),
        "states": list(STATES),
        "lorem_words": list(LOREM_WORDS),
    }

    def __init__(self, seed: Optional[int] = None):
        """Initialize with optional random seed.

        Uses a dedicated random.Random instance so seeding this generator
        does not disturb the module-level random state.
        """
        self.rng = random.Random(seed)

    @classmethod
    def extend_pool(cls, pool_name: str, values: List[str]) -> int:
        """
        Extend a data pool with new values.

        Unknown pool names create a new pool. Values already present — or
        repeated within *values* itself — are added only once.

        Args:
            pool_name: Name of pool (first_names, last_names, etc.)
            values: List of new values to add

        Returns:
            New pool size
        """
        if pool_name not in cls._pools:
            cls._pools[pool_name] = []

        # Track seen values incrementally so duplicates *within* `values`
        # are also skipped (the previous set-difference approach let a value
        # repeated in one batch be appended multiple times).
        seen = set(cls._pools[pool_name])
        for value in values:
            if value not in seen:
                cls._pools[pool_name].append(value)
                seen.add(value)

        return len(cls._pools[pool_name])

    @classmethod
    def load_pools_from_file(cls, filepath: str) -> Dict[str, int]:
        """
        Load and extend pools from a JSON file.

        Non-list values in the JSON object are ignored.

        Args:
            filepath: Path to JSON file with pool data

        Returns:
            Dict of pool names to their new sizes
        """
        import json
        # JSON is UTF-8 by specification; don't rely on the platform default.
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)

        sizes = {}
        for pool_name, values in data.items():
            if isinstance(values, list):
                sizes[pool_name] = cls.extend_pool(pool_name, values)

        return sizes

    @classmethod
    def save_pools_to_file(cls, filepath: str) -> None:
        """
        Save current pools to a JSON file.

        Args:
            filepath: Path to save JSON file
        """
        import json
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(cls._pools, f, indent=2)

    @classmethod
    def get_pool_sizes(cls) -> Dict[str, int]:
        """Get sizes of all pools."""
        return {name: len(values) for name, values in cls._pools.items()}

    def name(self) -> str:
        """Generate a full name ("First Last")."""
        first = self.rng.choice(self._pools["first_names"])
        last = self.rng.choice(self._pools["last_names"])
        return f"{first} {last}"

    def first_name(self) -> str:
        """Generate a first name."""
        return self.rng.choice(self._pools["first_names"])

    def last_name(self) -> str:
        """Generate a last name."""
        return self.rng.choice(self._pools["last_names"])

    def email(self) -> str:
        """Generate an email address (first+last, optional separator/number)."""
        first = self.rng.choice(self._pools["first_names"]).lower()
        last = self.rng.choice(self._pools["last_names"]).lower()
        domain = self.rng.choice(self._pools["email_domains"])
        separator = self.rng.choice([".", "_", ""])
        # ~50% of addresses get a 1-99 suffix for extra variety.
        num = self.rng.randint(1, 99) if self.rng.random() > 0.5 else ""
        return f"{first}{separator}{last}{num}@{domain}"

    def company(self) -> str:
        """Generate a company name."""
        return self.rng.choice(self._pools["company_names"])

    def address(self) -> str:
        """Generate a street address ("123 Main St")."""
        number = self.rng.randint(1, 9999)
        street = self.rng.choice(self._pools["street_names"])
        suffix = self.rng.choice(STREET_SUFFIXES)
        return f"{number} {street} {suffix}"

    def full_address(self) -> str:
        """Generate a full address with city, state, zip."""
        addr = self.address()
        city = self.rng.choice(self._pools["cities"])
        state = self.rng.choice(self._pools["states"])
        zipcode = self.rng.randint(10000, 99999)
        return f"{addr}, {city}, {state} {zipcode}"

    def phone_number(self) -> str:
        """Generate a US-style phone number "(AAA) PPP-LLLL"."""
        area = self.rng.randint(200, 999)
        prefix = self.rng.randint(200, 999)
        line = self.rng.randint(1000, 9999)
        return f"({area}) {prefix}-{line}"

    def url(self) -> str:
        """Generate a URL on a reserved example domain."""
        base = self.rng.choice(URLS)
        path = ''.join(self.rng.choices(string.ascii_lowercase, k=8))
        return f"{base}/{path}"

    def sentence(self, words: int = 8) -> str:
        """Generate a lorem ipsum sentence.

        Args:
            words: Number of words; must be >= 1.

        Raises:
            ValueError: if words < 1 (previously this raised an opaque
                IndexError when capitalizing the first word).
        """
        if words < 1:
            raise ValueError("sentence() requires words >= 1")
        selected = [self.rng.choice(self._pools["lorem_words"]) for _ in range(words)]
        selected[0] = selected[0].capitalize()
        return ' '.join(selected) + '.'

    def word(self) -> str:
        """Generate a single lorem ipsum word."""
        return self.rng.choice(self._pools["lorem_words"])

    def text(self, sentences: int = 3) -> str:
        """Generate a paragraph of text (sentences joined with spaces)."""
        return ' '.join(self.sentence() for _ in range(sentences))

    def uuid(self) -> str:
        """Generate a UUID-like string (8-4-4-4-12 hex groups).

        NOTE: purely random hex — no RFC 4122 version/variant bits are set.
        """
        hex_chars = '0123456789abcdef'
        parts = [
            ''.join(self.rng.choices(hex_chars, k=8)),
            ''.join(self.rng.choices(hex_chars, k=4)),
            ''.join(self.rng.choices(hex_chars, k=4)),
            ''.join(self.rng.choices(hex_chars, k=4)),
            ''.join(self.rng.choices(hex_chars, k=12)),
        ]
        return '-'.join(parts)
|
247
|
+
|