misata 0.1.0b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
misata/semantic.py ADDED
@@ -0,0 +1,185 @@
1
+ """
2
+ Semantic column inference for automatic type detection.
3
+
4
+ This module detects column semantics from names and applies
5
+ the correct data generators, even if the LLM misses it.
6
+ """
7
+
8
+ import re
9
+ from typing import Any, Dict, List, Optional, Tuple
10
+
11
+ from misata.schema import Column
12
+
13
+
# Name-based inference rules.
#
# Each entry is (regex, column type, distribution params). The regexes are
# plain strings here; consumers compile them. Entries are listed in priority
# order for consumers that stop at the first matching rule.
SEMANTIC_PATTERNS: List[Tuple[str, str, Dict[str, Any]]] = [
    # ---- Text generators ------------------------------------------------
    # Email addresses
    (
        r"^email$|^e_?mail$|^user_?email$|^customer_?email$",
        "text",
        {"text_type": "email"},
    ),
    # Person names (full, first and last all use the "name" text type)
    (
        r"^name$|^full_?name$|^user_?name$|^customer_?name$|^display_?name$",
        "text",
        {"text_type": "name"},
    ),
    (
        r"^first_?name$",
        "text",
        {"text_type": "name"},
    ),
    (
        r"^last_?name$|^surname$|^family_?name$",
        "text",
        {"text_type": "name"},
    ),
    # Phone numbers
    (
        r"^phone$|^phone_?number$|^mobile$|^cell$|^telephone$",
        "text",
        {"text_type": "phone"},
    ),
    # Postal addresses
    (
        r"^address$|^street$|^full_?address$|^billing_?address$|^shipping_?address$",
        "text",
        {"text_type": "address"},
    ),
    # Companies / organizations
    (
        r"^company$|^company_?name$|^organization$|^org_?name$|^employer$",
        "text",
        {"text_type": "company"},
    ),
    # URLs
    (
        r"^url$|^website$|^web_?url$|^link$|^profile_?url$",
        "text",
        {"text_type": "url"},
    ),
    # ---- Numeric generators ---------------------------------------------
    # Prices and other money amounts (lower bound 0, two decimals)
    (
        r"^price$|^cost$|^amount$|^fee$|^total$|^subtotal$|^tax$",
        "float",
        {"distribution": "uniform", "min": 0, "max": 1000, "decimals": 2},
    ),
    (
        r"^mrr$|^arr$|^revenue$|^income$|^salary$|^wage$",
        "float",
        {"distribution": "uniform", "min": 0, "max": 100000, "decimals": 2},
    ),
    # Ages
    (
        r"^age$|^user_?age$|^customer_?age$",
        "int",
        {"distribution": "uniform", "min": 18, "max": 80},
    ),
    # Counts (integers with lower bound 0)
    (
        r"^count$|^quantity$|^qty$|^num_|^number_of_|_count$",
        "int",
        {"distribution": "poisson", "lambda": 5, "min": 0},
    ),
    # Percentages (0-100, one decimal)
    (
        r"^percent|percentage$|_pct$|_percent$|^rate$",
        "float",
        {"distribution": "uniform", "min": 0, "max": 100, "decimals": 1},
    ),
    # Durations
    (
        r"^duration$|^duration_?minutes$|^duration_?hours$|^length$|^time_?spent$",
        "int",
        {"distribution": "uniform", "min": 1, "max": 120},
    ),
    # Body weight / height
    (
        r"^weight$|^weight_?kg$",
        "float",
        {"distribution": "normal", "mean": 70, "std": 15, "min": 30, "max": 200},
    ),
    (
        r"^height$|^height_?cm$",
        "float",
        {"distribution": "normal", "mean": 170, "std": 10, "min": 140, "max": 220},
    ),
    # Ratings (1-5, one decimal)
    (
        r"^rating$|^score$|^stars$|^review_?score$",
        "float",
        {"distribution": "uniform", "min": 1, "max": 5, "decimals": 1},
    ),
    # ---- Other generators -----------------------------------------------
    # Boolean flags
    (
        r"^is_|^has_|^can_|^should_|^active$|^enabled$|^verified$|^confirmed$",
        "boolean",
        {"probability": 0.5},
    ),
    # Status-like categoricals
    (
        r"^status$|^state$|^order_?status$|^subscription_?status$",
        "categorical",
        {"choices": ["active", "inactive", "pending", "cancelled"]},
    ),
    # Dates (usually already typed as dates; this supplies a concrete range)
    (
        r"^date$|^created_?at$|^updated_?at$|^start_?date$|^end_?date$|_date$|_at$",
        "date",
        {"start": "2023-01-01", "end": "2024-12-31"},
    ),
]
68
+
69
+
class SemanticInference:
    """
    Automatically infer and fix column semantics based on naming patterns.

    This acts as a safety net - if the LLM generates incorrect column types
    or parameters, semantic inference can fix them based on column names.
    """

    def __init__(
        self,
        strict_mode: bool = False,
        patterns: Optional[List[Tuple[str, str, Dict[str, Any]]]] = None,
    ):
        """
        Initialize semantic inference.

        Args:
            strict_mode: If True, always override LLM; if False, only fix obvious errors
            patterns: Optional rule table of (regex, type, distribution_params)
                tuples, tried in order. Defaults to the module-level
                SEMANTIC_PATTERNS when omitted.
        """
        self.strict_mode = strict_mode
        rules = SEMANTIC_PATTERNS if patterns is None else patterns
        # Compile once up front; matching is case-insensitive.
        self.patterns = [
            (re.compile(regex, re.IGNORECASE), col_type, params)
            for regex, col_type, params in rules
        ]

    def infer_column(self, column_name: str) -> Optional[Tuple[str, Dict[str, Any]]]:
        """
        Infer column type and parameters from name.

        Rules are tried in order and the first regex that matches wins.

        Args:
            column_name: Name of the column

        Returns:
            Tuple of (type, distribution_params) or None if no match. The
            params dict is an independent deep copy, so callers may mutate it
            (including nested lists such as "choices") without affecting the
            shared rule table.
        """
        # Function-scope import keeps this block self-contained.
        from copy import deepcopy

        for pattern, col_type, params in self.patterns:
            if pattern.search(column_name):
                # A shallow copy would still share nested values (e.g. the
                # "choices" list) with the rule table.
                return (col_type, deepcopy(params))
        return None

    def fix_column(self, column: Column, table_name: str = "") -> Column:
        """
        Fix a column's type/params based on semantic inference.

        In strict mode every column whose name matches a rule is rewritten.
        Otherwise a column is rewritten only when:

        * it is a "text" column whose "text_type" is missing or "sentence", or
        * it is an "int"/"float" column whose name contains "price" or "age",
          it has no "min" parameter, and the inferred rule supplies one.

        Args:
            column: Column to potentially fix
            table_name: Name of the table (for context; currently unused)

        Returns:
            Fixed column (or original if no fix needed)
        """
        inferred = self.infer_column(column.name)
        if inferred is None:
            return column

        inferred_type, inferred_params = inferred

        # Treat a missing params mapping as empty instead of raising.
        # NOTE(review): whether Column permits None here is not visible from
        # this module - confirm against misata.schema.
        current_params = column.distribution_params or {}

        if self.strict_mode:
            # Always use inferred semantics
            should_fix = True
        else:
            should_fix = False

            # Case 1: text column still on the default "sentence" generator,
            # which is probably wrong for a semantically named column.
            if column.type == "text":
                if current_params.get("text_type", "sentence") == "sentence":
                    should_fix = True

            # Case 2: numeric column that could go negative but shouldn't.
            # Only fire when the inferred rule actually provides a lower
            # bound; otherwise the substring test (e.g. "age" inside
            # "message") would retype an int column to date/boolean without
            # addressing the missing "min" at all.
            if column.type in ("int", "float"):
                lowered = column.name.lower()
                if "price" in lowered or "age" in lowered:
                    if "min" not in current_params and "min" in inferred_params:
                        should_fix = True

        if not should_fix:
            return column

        # Merge inferred params with existing (inferred takes precedence)
        merged_params = {**current_params, **inferred_params}
        return Column(
            name=column.name,
            type=inferred_type,
            distribution_params=merged_params,
            nullable=column.nullable,
            unique=column.unique,
        )

    def fix_schema_columns(self, columns: Dict[str, List[Column]]) -> Dict[str, List[Column]]:
        """
        Fix all columns in a schema using semantic inference.

        Args:
            columns: Dict mapping table names to column lists

        Returns:
            New dict with the same table names, each mapped to a new list of
            fixed (or unchanged) columns
        """
        return {
            table_name: [self.fix_column(col, table_name) for col in cols]
            for table_name, cols in columns.items()
        }
170
+
171
+
172
+ # Convenience function
173
+ def apply_semantic_inference(columns: Dict[str, List[Column]], strict: bool = False) -> Dict[str, List[Column]]:
174
+ """
175
+ Apply semantic inference to fix column definitions.
176
+
177
+ Args:
178
+ columns: Schema columns to fix
179
+ strict: If True, always apply semantic rules
180
+
181
+ Returns:
182
+ Fixed columns
183
+ """
184
+ inference = SemanticInference(strict_mode=strict)
185
+ return inference.fix_schema_columns(columns)