misata 0.1.0b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
misata/formulas.py ADDED
@@ -0,0 +1,362 @@
1
+ """
2
+ Formula engine for derived columns.
3
+
4
+ This module enables columns that are computed from other columns,
5
+ supporting expressions like:
6
+ - calories_burned = duration_minutes * @exercises.calories_per_minute
7
+ - total_price = quantity * @products.price
8
+ - discount_amount = total_price * 0.1
9
+ """
10
+
11
+ import re
12
+ from typing import Dict, List, Optional
13
+
14
+ import numpy as np
15
+ import pandas as pd
16
+ import ast
17
+ import operator
18
+ from simpleeval import simple_eval, NameNotDefined
19
+
20
# Callables a formula may invoke, keyed by the name used inside the expression.
# Anything not listed here is unavailable to formula authors.
SAFE_FUNCTIONS = dict(
    # Element-wise selection and rounding
    where=np.where,
    abs=np.abs,
    round=np.round,
    ceil=np.ceil,
    floor=np.floor,
    # Element-wise minimum / maximum of two operands (numpy's binary forms)
    min=np.minimum,
    max=np.maximum,
    # Trigonometry
    sin=np.sin,
    cos=np.cos,
    tan=np.tan,
    # Exponentials and roots
    log=np.log,
    exp=np.exp,
    sqrt=np.sqrt,
    # Random draws from numpy's global random state
    random=np.random.random,
    randint=np.random.randint,
)
38
+
39
# Operator table handed to simpleeval in place of its defaults. Per the
# original author's note, the default operators apply string-length checks
# that fail on numpy arrays, so plain `operator` functions are used instead.
SAFE_OPERATORS = {
    # Arithmetic
    ast.Add: operator.add,
    ast.Sub: operator.sub,
    ast.Mult: operator.mul,
    ast.Div: operator.truediv,
    ast.FloorDiv: operator.floordiv,
    ast.Pow: operator.pow,
    ast.Mod: operator.mod,
    # Comparisons
    ast.Eq: operator.eq,
    ast.NotEq: operator.ne,
    ast.Gt: operator.gt,
    ast.Lt: operator.lt,
    ast.GtE: operator.ge,
    ast.LtE: operator.le,
    # Bitwise operators
    ast.BitAnd: operator.and_,
    ast.BitOr: operator.or_,
    ast.BitXor: operator.xor,
    # Unary operators
    ast.USub: operator.neg,
    ast.UAdd: operator.pos,
    ast.Not: operator.not_,
    # Membership test: `needle in haystack`
    ast.In: lambda needle, haystack: needle in haystack,
}
62
+
63
class SafeNumpy:
    """Stand-in for the ``np`` name inside formulas.

    Attribute access is answered from ``SAFE_FUNCTIONS`` only, so
    ``np.where`` resolves while any name outside that table is rejected.
    """

    def __getattr__(self, name):
        # Guard first: names outside the whitelist never reach numpy.
        if name not in SAFE_FUNCTIONS:
            raise NameNotDefined(name, f"Function 'np.{name}' is not allowed in formulas.")
        return SAFE_FUNCTIONS[name]
69
+
70
class FormulaEngine:
    """
    Evaluate column formulas using safe expressions.

    Supports:
    - Simple arithmetic: duration * 10
    - Column references: quantity * unit_price
    - Cross-table references: @exercises.calories_per_minute
    - Conditional expressions: np.where(status == 'active', 1, 0)
    """

    # Matches cross-table references of the form @tablename.columnname.
    _REF_PATTERN = re.compile(r'@(\w+)\.(\w+)')

    def __init__(self, tables: Dict[str, pd.DataFrame]):
        """
        Initialize with generated tables for cross-table lookups.

        Args:
            tables: Dict mapping table names to DataFrames
        """
        self.tables = tables

    def evaluate(
        self,
        df: pd.DataFrame,
        formula: str,
        fk_column: Optional[str] = None,
    ) -> np.ndarray:
        """
        Evaluate a formula on a DataFrame.

        Cross-table references are looked up through ``fk_column`` when it is
        given, otherwise through ``<table>_id``. Rows whose key has no match in
        the referenced table yield NaN for that reference.

        Args:
            df: DataFrame to evaluate on
            formula: Expression string
            fk_column: Foreign key column name for cross-table lookups

        Returns:
            Array of computed values

        Raises:
            ValueError: If a reference cannot be resolved or evaluation fails
        """
        processed_formula, lookups = self._rewrite_refs(
            formula,
            '_lookup_',
            lambda table_name, col_name: self._strict_lookup(
                df, table_name, col_name, fk_column
            ),
        )
        # The looked-up arrays must be part of the context; without them the
        # rewritten formula would refer to undefined `_lookup_N` names.
        names = self._build_names(df, lookups)
        return self._run(formula, processed_formula, names)

    def _resolve_cross_table_refs(
        self,
        df: pd.DataFrame,
        formula: str,
        fk_column: Optional[str] = None,
    ) -> str:
        """
        Replace @table.column references with ``_lookup_N`` variable names.

        Pattern: @tablename.columnname

        Only the rewritten string is returned; every reference is still
        validated (table, column, foreign key and ``id`` must exist).

        Args:
            df: Current DataFrame
            formula: Formula with potential cross-table refs
            fk_column: FK column to use for lookups

        Returns:
            Formula with refs replaced by _lookup_N variables

        Raises:
            ValueError: If a referenced table, column or key is missing
        """
        processed_formula, _ = self._rewrite_refs(
            formula,
            '_lookup_',
            lambda table_name, col_name: self._strict_lookup(
                df, table_name, col_name, fk_column
            ),
        )
        return processed_formula

    def evaluate_with_lookups(
        self,
        df: pd.DataFrame,
        formula: str,
        fk_mappings: Optional[Dict[str, str]] = None,
    ) -> np.ndarray:
        """
        Evaluate formula with automatic cross-table lookups.

        The foreign key for each referenced table is taken from
        ``fk_mappings`` when that column exists in ``df``; otherwise common
        naming patterns are tried. Rows without a match in the referenced
        table contribute 0 for that reference.

        Args:
            df: DataFrame to evaluate on
            formula: Expression with @table.column references
            fk_mappings: Optional dict mapping table name to FK column name
                e.g., {"exercises": "exercise_id", "products": "product_id"}

        Returns:
            Array of computed values

        Raises:
            ValueError: If a reference cannot be resolved or evaluation fails
        """
        fk_mappings = fk_mappings or {}
        processed_formula, lookups = self._rewrite_refs(
            formula,
            '_ref_',
            lambda table_name, col_name: self._heuristic_lookup(
                df, table_name, col_name, fk_mappings
            ),
        )
        names = self._build_names(df, lookups)
        return self._run(formula, processed_formula, names)

    def _rewrite_refs(self, formula, prefix, lookup):
        """Swap each @table.column for a variable and collect its values.

        Works on regex match positions rather than ``str.replace`` so a
        reference that is a prefix of another (``@t.price`` versus
        ``@t.price_usd``) cannot corrupt the longer one.

        Args:
            formula: Expression that may contain @table.column references
            prefix: Variable name prefix; the match position is appended
            lookup: Callable ``(table_name, col_name) -> array`` of values

        Returns:
            Tuple of (rewritten formula, dict of variable name -> values)
        """
        lookups = {}
        assigned = {}
        pieces = []
        cursor = 0
        for index, match in enumerate(self._REF_PATTERN.finditer(formula)):
            ref = match.groups()
            if ref not in assigned:
                # A repeated reference reuses the variable of its first
                # occurrence, so each lookup is computed only once.
                var_name = f'{prefix}{index}'
                lookups[var_name] = lookup(*ref)
                assigned[ref] = var_name
            pieces.append(formula[cursor:match.start()])
            pieces.append(assigned[ref])
            cursor = match.end()
        pieces.append(formula[cursor:])
        return ''.join(pieces), lookups

    def _strict_lookup(self, df, table_name, col_name, fk_column):
        """Look up ``table_name.col_name`` via one exact foreign key column."""
        if table_name not in self.tables:
            raise ValueError(f"Table '{table_name}' not found for formula lookup")

        ref_table = self.tables[table_name]

        if col_name not in ref_table.columns:
            raise ValueError(f"Column '{col_name}' not found in table '{table_name}'")

        actual_fk = fk_column or f"{table_name}_id"

        if actual_fk not in df.columns:
            raise ValueError(
                f"Cannot lookup @{table_name}.{col_name}: "
                f"no foreign key column '{actual_fk}' in current table"
            )

        if 'id' not in ref_table.columns:
            raise ValueError(f"Reference table '{table_name}' has no 'id' column")

        # If ids repeat in the reference table, the last row wins in to_dict().
        lookup_map = ref_table.set_index('id')[col_name].to_dict()
        return df[actual_fk].map(lookup_map).values

    def _heuristic_lookup(self, df, table_name, col_name, fk_mappings):
        """Look up ``table_name.col_name`` using a guessed foreign key column."""
        if table_name not in self.tables:
            raise ValueError(f"Table '{table_name}' not found")

        ref_table = self.tables[table_name]
        fk_col = self._guess_fk_column(df, table_name, fk_mappings)

        if 'id' not in ref_table.columns:
            raise ValueError(f"Reference table '{table_name}' has no 'id' column")

        if col_name not in ref_table.columns:
            raise ValueError(f"Column '{col_name}' not found in table '{table_name}'")

        lookup_map = ref_table.set_index('id')[col_name].to_dict()
        # Unmatched keys become 0 so arithmetic on the result stays defined.
        return df[fk_col].map(lookup_map).fillna(0).values

    def _guess_fk_column(self, df, table_name, fk_mappings):
        """Pick the column of ``df`` that references ``table_name``."""
        explicit = fk_mappings.get(table_name)
        if explicit is not None and explicit in df.columns:
            # An explicit mapping that exists is never second-guessed.
            return explicit

        preferred = explicit if explicit is not None else f"{table_name}_id"
        candidates = []
        if preferred.endswith("s_id"):
            # Try the singular form first (exercises_id -> exercise_id);
            # only the trailing suffix is rewritten.
            candidates.append(preferred[:-len("s_id")] + "_id")
        candidates.extend(
            [preferred, f"{table_name}_id", f"{table_name[:-1]}_id", "id"]
        )

        for candidate in candidates:
            if candidate in df.columns:
                return candidate

        raise ValueError(f"No FK column found for table '{table_name}'")

    def _build_names(self, df, lookups):
        """Build the name context: numpy proxy, pandas, columns, lookups."""
        names = {
            'np': SafeNumpy(),
            # NOTE(security): this exposes the whole pandas module to formula
            # authors, including its file and network readers. Kept for
            # backward compatibility; restrict before evaluating untrusted
            # formulas.
            'pd': pd,
        }
        for col in df.columns:
            names[col] = df[col].values
        # Lookup variables are added last so they take precedence over any
        # column that happens to share their name.
        names.update(lookups)
        return names

    def _run(self, formula, processed_formula, names):
        """Evaluate the rewritten formula, reporting errors against the original."""
        try:
            return np.array(simple_eval(
                processed_formula,
                names=names,
                functions=SAFE_FUNCTIONS,  # Allow top-level functions too
                operators=SAFE_OPERATORS,
            ))
        except Exception as e:
            raise ValueError(f"Failed to evaluate formula '{formula}': {e}") from e
284
+
285
+
286
class FormulaColumn:
    """
    Describes one derived column: its name, its formula and its output type.
    """

    def __init__(
        self,
        name: str,
        formula: str,
        result_type: str = "float",
        fk_mappings: Optional[Dict[str, str]] = None,
    ):
        """
        Store the definition of a formula column.

        Args:
            name: Column name
            formula: Expression (can include @table.column refs)
            result_type: Type of result (int, float, boolean)
            fk_mappings: Map table names to FK column names
        """
        self.name = name
        self.formula = formula
        self.result_type = result_type
        self.fk_mappings = fk_mappings if fk_mappings else {}

    def evaluate(
        self,
        df: pd.DataFrame,
        tables: Dict[str, pd.DataFrame],
    ) -> np.ndarray:
        """
        Compute this column's values for ``df``.

        Args:
            df: Current table DataFrame
            tables: All generated tables for cross-table lookups

        Returns:
            Array of computed values, cast to ``result_type`` when it is one
            of int, float or boolean and returned as-is otherwise
        """
        computed = FormulaEngine(tables).evaluate_with_lookups(
            df, self.formula, self.fk_mappings
        )

        # Result types with a known cast; anything else passes through.
        target_dtype = {"int": int, "float": float, "boolean": bool}.get(
            self.result_type
        )
        if target_dtype is None:
            return computed
        return computed.astype(target_dtype)
339
+
340
+
341
def apply_formula_columns(
    df: pd.DataFrame,
    formulas: List[FormulaColumn],
    tables: Dict[str, pd.DataFrame],
) -> pd.DataFrame:
    """
    Return a copy of ``df`` with every formula column appended.

    Formulas run in list order against the growing copy, so a later formula
    can read a column produced by an earlier one. The input is not modified.

    Args:
        df: DataFrame to add columns to
        formulas: List of formula column definitions
        tables: All tables for cross-table lookups

    Returns:
        DataFrame with formula columns added
    """
    enriched = df.copy()

    for spec in formulas:
        computed = spec.evaluate(enriched, tables)
        enriched[spec.name] = computed

    return enriched
misata/generators.py ADDED
@@ -0,0 +1,247 @@
1
+ """
2
+ Pure Python text generators for Misata.
3
+
4
+ Replaces Mimesis with lightweight, built-in generators using curated data pools.
5
+ No external dependencies required.
6
+ """
7
+
8
+ import random
9
+ import string
10
+ from typing import Dict, List, Optional
11
+
12
# ============================================
# DATA POOLS
# ============================================
# Curated seed values for the generators below. TextGenerator copies most of
# these lists into its class-level `_pools`; STREET_SUFFIXES and URLS are read
# directly by its `address()` and `url()` methods.

# Given names; seeds the "first_names" pool.
FIRST_NAMES = [
    "James", "Mary", "Robert", "Patricia", "John", "Jennifer", "Michael", "Linda",
    "David", "Elizabeth", "William", "Barbara", "Richard", "Susan", "Joseph", "Jessica",
    "Thomas", "Sarah", "Christopher", "Karen", "Charles", "Lisa", "Daniel", "Nancy",
    "Matthew", "Betty", "Anthony", "Margaret", "Mark", "Sandra", "Donald", "Ashley",
    "Steven", "Kimberly", "Paul", "Emily", "Andrew", "Donna", "Joshua", "Michelle",
    "Kenneth", "Dorothy", "Kevin", "Carol", "Brian", "Amanda", "George", "Melissa",
    "Timothy", "Deborah", "Ronald", "Stephanie", "Edward", "Rebecca", "Jason", "Sharon",
]

# Family names; seeds the "last_names" pool.
LAST_NAMES = [
    "Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis",
    "Rodriguez", "Martinez", "Hernandez", "Lopez", "Gonzalez", "Wilson", "Anderson",
    "Thomas", "Taylor", "Moore", "Jackson", "Martin", "Lee", "Perez", "Thompson",
    "White", "Harris", "Sanchez", "Clark", "Ramirez", "Lewis", "Robinson", "Walker",
    "Young", "Allen", "King", "Wright", "Scott", "Torres", "Nguyen", "Hill",
    "Flores", "Green", "Adams", "Nelson", "Baker", "Hall", "Rivera", "Campbell",
]

# Domain part of generated email addresses; seeds the "email_domains" pool.
EMAIL_DOMAINS = [
    "gmail.com", "yahoo.com", "outlook.com", "hotmail.com", "icloud.com",
    "protonmail.com", "mail.com", "aol.com", "zoho.com", "fastmail.com",
]

# Company names; seeds the "company_names" pool.
COMPANY_NAMES = [
    "Acme Corp", "Globex", "Initech", "Umbrella Corp", "Stark Industries",
    "Wayne Enterprises", "Cyberdyne Systems", "Soylent Corp", "Massive Dynamic",
    "Aperture Science", "InGen", "Tyrell Corporation", "Weyland-Yutani", "OsCorp",
    "LexCorp", "Oscorp Industries", "Dharma Initiative", "Dunder Mifflin",
    "Sterling Cooper", "Wonka Industries", "Prestige Worldwide", "Vandelay Industries",
]

# Street names (without suffix); seeds the "street_names" pool.
STREET_NAMES = [
    "Main", "Oak", "Maple", "Cedar", "Elm", "Pine", "Washington", "Lake",
    "Hill", "Park", "River", "Sunset", "Highland", "Valley", "Forest", "Spring",
]

# Street type abbreviations appended to a street name. Not part of `_pools`.
STREET_SUFFIXES = ["St", "Ave", "Blvd", "Dr", "Ln", "Rd", "Way", "Ct", "Pl", "Cir"]

# City names; seeds the "cities" pool.
CITIES = [
    "New York", "Los Angeles", "Chicago", "Houston", "Phoenix", "Philadelphia",
    "San Antonio", "San Diego", "Dallas", "San Jose", "Austin", "Jacksonville",
    "Fort Worth", "Columbus", "Charlotte", "Seattle", "Denver", "Boston",
]

# Two-letter state codes; seeds the "states" pool. Cities and states are
# chosen independently, so generated pairs need not match geographically.
STATES = [
    "NY", "CA", "TX", "FL", "IL", "PA", "OH", "GA", "NC", "MI",
    "NJ", "VA", "WA", "AZ", "MA", "CO", "TN", "IN", "MO", "MD",
]

# Filler vocabulary for words, sentences and paragraphs; seeds "lorem_words".
LOREM_WORDS = [
    "lorem", "ipsum", "dolor", "sit", "amet", "consectetur", "adipiscing", "elit",
    "sed", "do", "eiusmod", "tempor", "incididunt", "ut", "labore", "et", "dolore",
    "magna", "aliqua", "enim", "ad", "minim", "veniam", "quis", "nostrud",
    "exercitation", "ullamco", "laboris", "nisi", "aliquip", "ex", "ea", "commodo",
]

# Base URLs that receive a random path segment. Not part of `_pools`.
URLS = [
    "https://example.com", "https://test.org", "https://demo.io", "https://sample.net",
]
76
+
77
+
78
+ # ============================================
79
+ # GENERATOR CLASS
80
+ # ============================================
81
+
82
class TextGenerator:
    """
    Pure Python text generator for synthetic data.

    Drop-in replacement for Mimesis functionality.
    Supports expandable data pools that can grow over time.

    Pools are stored on the class, so extending one affects every instance.
    Each instance owns its own random generator; with the same seed and the
    same pools, the same sequence of calls yields the same values.
    """

    # Class-level pools (shared across instances, can be extended)
    _pools = {
        "first_names": list(FIRST_NAMES),
        "last_names": list(LAST_NAMES),
        "email_domains": list(EMAIL_DOMAINS),
        "company_names": list(COMPANY_NAMES),
        "street_names": list(STREET_NAMES),
        "cities": list(CITIES),
        "states": list(STATES),
        "lorem_words": list(LOREM_WORDS),
    }

    def __init__(self, seed: Optional[int] = None):
        """Initialize with optional random seed."""
        self.rng = random.Random(seed)

    @classmethod
    def extend_pool(cls, pool_name: str, values: List[str]) -> int:
        """
        Extend a data pool with new values.

        Values already in the pool, and repeats within ``values`` itself, are
        skipped. An unknown pool name creates a new pool.

        Args:
            pool_name: Name of pool (first_names, last_names, etc.)
            values: List of new values to add

        Returns:
            New pool size
        """
        pool = cls._pools.setdefault(pool_name, [])

        # Track what has been seen while adding, so duplicates inside
        # `values` are filtered as well as values already in the pool.
        seen = set(pool)
        for value in values:
            if value not in seen:
                seen.add(value)
                pool.append(value)

        return len(pool)

    @classmethod
    def load_pools_from_file(cls, filepath: str) -> Dict[str, int]:
        """
        Load and extend pools from a JSON file.

        The file is expected to hold an object mapping pool names to lists;
        entries whose value is not a list are ignored.

        Args:
            filepath: Path to JSON file with pool data

        Returns:
            Dict of pool names to their new sizes
        """
        import json

        # Read as UTF-8 explicitly so non-ASCII names load the same way on
        # every platform instead of depending on the locale default.
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)

        sizes = {}
        for pool_name, values in data.items():
            if isinstance(values, list):
                sizes[pool_name] = cls.extend_pool(pool_name, values)

        return sizes

    @classmethod
    def save_pools_to_file(cls, filepath: str) -> None:
        """
        Save current pools to a JSON file.

        Args:
            filepath: Path to save JSON file
        """
        import json

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(cls._pools, f, indent=2)

    @classmethod
    def get_pool_sizes(cls) -> Dict[str, int]:
        """Get sizes of all pools."""
        return {name: len(values) for name, values in cls._pools.items()}

    def name(self) -> str:
        """Generate a full name."""
        first = self.rng.choice(self._pools["first_names"])
        last = self.rng.choice(self._pools["last_names"])
        return f"{first} {last}"

    def first_name(self) -> str:
        """Generate a first name."""
        return self.rng.choice(self._pools["first_names"])

    def last_name(self) -> str:
        """Generate a last name."""
        return self.rng.choice(self._pools["last_names"])

    def email(self) -> str:
        """Generate an email address."""
        first = self.rng.choice(self._pools["first_names"]).lower()
        last = self.rng.choice(self._pools["last_names"]).lower()
        domain = self.rng.choice(self._pools["email_domains"])
        separator = self.rng.choice([".", "_", ""])
        # A numeric suffix is appended when the draw exceeds 0.5.
        num = self.rng.randint(1, 99) if self.rng.random() > 0.5 else ""
        return f"{first}{separator}{last}{num}@{domain}"

    def company(self) -> str:
        """Generate a company name."""
        return self.rng.choice(self._pools["company_names"])

    def address(self) -> str:
        """Generate a street address."""
        number = self.rng.randint(1, 9999)
        street = self.rng.choice(self._pools["street_names"])
        suffix = self.rng.choice(STREET_SUFFIXES)
        return f"{number} {street} {suffix}"

    def full_address(self) -> str:
        """Generate a full address with city, state, zip."""
        addr = self.address()
        city = self.rng.choice(self._pools["cities"])
        state = self.rng.choice(self._pools["states"])
        zipcode = self.rng.randint(10000, 99999)
        return f"{addr}, {city}, {state} {zipcode}"

    def phone_number(self) -> str:
        """Generate a phone number."""
        area = self.rng.randint(200, 999)
        prefix = self.rng.randint(200, 999)
        line = self.rng.randint(1000, 9999)
        return f"({area}) {prefix}-{line}"

    def url(self) -> str:
        """Generate a URL."""
        base = self.rng.choice(URLS)
        path = ''.join(self.rng.choices(string.ascii_lowercase, k=8))
        return f"{base}/{path}"

    def sentence(self, words: int = 8) -> str:
        """Generate a lorem ipsum sentence.

        Args:
            words: Number of words in the sentence

        Returns:
            Capitalized sentence ending in a period, or an empty string when
            ``words`` is zero or negative
        """
        if words <= 0:
            # Nothing to capitalize; avoids indexing into an empty list.
            return ""
        selected = [self.rng.choice(self._pools["lorem_words"]) for _ in range(words)]
        selected[0] = selected[0].capitalize()
        return ' '.join(selected) + '.'

    def word(self) -> str:
        """Generate a single word."""
        return self.rng.choice(self._pools["lorem_words"])

    def text(self, sentences: int = 3) -> str:
        """Generate a paragraph of text."""
        return ' '.join(self.sentence() for _ in range(sentences))

    def uuid(self) -> str:
        """Generate a UUID-like string.

        The result has the 8-4-4-4-12 hex layout only; all digits are random
        and no version or variant bits are set.
        """
        hex_chars = '0123456789abcdef'
        parts = [
            ''.join(self.rng.choices(hex_chars, k=length))
            for length in (8, 4, 4, 4, 12)
        ]
        return '-'.join(parts)
247
+