misata 0.1.0b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- misata/__init__.py +48 -0
- misata/api.py +460 -0
- misata/audit.py +415 -0
- misata/benchmark.py +376 -0
- misata/cli.py +680 -0
- misata/codegen.py +153 -0
- misata/curve_fitting.py +106 -0
- misata/customization.py +256 -0
- misata/feedback.py +433 -0
- misata/formulas.py +362 -0
- misata/generators.py +247 -0
- misata/hybrid.py +398 -0
- misata/llm_parser.py +493 -0
- misata/noise.py +346 -0
- misata/schema.py +252 -0
- misata/semantic.py +185 -0
- misata/simulator.py +742 -0
- misata/story_parser.py +425 -0
- misata/templates/__init__.py +444 -0
- misata/validation.py +313 -0
- misata-0.1.0b0.dist-info/METADATA +291 -0
- misata-0.1.0b0.dist-info/RECORD +25 -0
- misata-0.1.0b0.dist-info/WHEEL +5 -0
- misata-0.1.0b0.dist-info/entry_points.txt +2 -0
- misata-0.1.0b0.dist-info/top_level.txt +1 -0
misata/semantic.py
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Semantic column inference for automatic type detection.
|
|
3
|
+
|
|
4
|
+
This module detects column semantics from names and applies
|
|
5
|
+
the correct data generators, even if the LLM misses it.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import copy
import re
from typing import Any, Dict, List, Optional, Tuple

from misata.schema import Column
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Lookup table driving name-based inference. Each entry is a triple of
#   (regex matched against the column name, column type, distribution params).
# Order matters: SemanticInference.infer_column walks the table top to bottom
# and returns the first entry whose regex matches.
SEMANTIC_PATTERNS: List[Tuple[str, str, Dict[str, Any]]] = [
    # --- Contact / identity text fields ---
    (
        r"^email$|^e_?mail$|^user_?email$|^customer_?email$",
        "text",
        {"text_type": "email"},
    ),
    (
        r"^name$|^full_?name$|^user_?name$|^customer_?name$|^display_?name$",
        "text",
        {"text_type": "name"},
    ),
    (
        r"^first_?name$",
        "text",
        {"text_type": "name"},
    ),
    (
        r"^last_?name$|^surname$|^family_?name$",
        "text",
        {"text_type": "name"},
    ),
    (
        r"^phone$|^phone_?number$|^mobile$|^cell$|^telephone$",
        "text",
        {"text_type": "phone"},
    ),
    (
        r"^address$|^street$|^full_?address$|^billing_?address$|^shipping_?address$",
        "text",
        {"text_type": "address"},
    ),
    (
        r"^company$|^company_?name$|^organization$|^org_?name$|^employer$",
        "text",
        {"text_type": "company"},
    ),
    (
        r"^url$|^website$|^web_?url$|^link$|^profile_?url$",
        "text",
        {"text_type": "url"},
    ),
    # --- Monetary values (lower bound of 0 keeps them non-negative) ---
    (
        r"^price$|^cost$|^amount$|^fee$|^total$|^subtotal$|^tax$",
        "float",
        {"distribution": "uniform", "min": 0, "max": 1000, "decimals": 2},
    ),
    (
        r"^mrr$|^arr$|^revenue$|^income$|^salary$|^wage$",
        "float",
        {"distribution": "uniform", "min": 0, "max": 100000, "decimals": 2},
    ),
    # --- Age ---
    (
        r"^age$|^user_?age$|^customer_?age$",
        "int",
        {"distribution": "uniform", "min": 18, "max": 80},
    ),
    # --- Counts (non-negative integers) ---
    (
        r"^count$|^quantity$|^qty$|^num_|^number_of_|_count$",
        "int",
        {"distribution": "poisson", "lambda": 5, "min": 0},
    ),
    # --- Percentages (0 to 100) ---
    (
        r"^percent|percentage$|_pct$|_percent$|^rate$",
        "float",
        {"distribution": "uniform", "min": 0, "max": 100, "decimals": 1},
    ),
    # --- Durations ---
    (
        r"^duration$|^duration_?minutes$|^duration_?hours$|^length$|^time_?spent$",
        "int",
        {"distribution": "uniform", "min": 1, "max": 120},
    ),
    # --- Body measurements ---
    (
        r"^weight$|^weight_?kg$",
        "float",
        {"distribution": "normal", "mean": 70, "std": 15, "min": 30, "max": 200},
    ),
    (
        r"^height$|^height_?cm$",
        "float",
        {"distribution": "normal", "mean": 170, "std": 10, "min": 140, "max": 220},
    ),
    # --- Ratings (1 to 5) ---
    (
        r"^rating$|^score$|^stars$|^review_?score$",
        "float",
        {"distribution": "uniform", "min": 1, "max": 5, "decimals": 1},
    ),
    # --- Boolean flags ---
    (
        r"^is_|^has_|^can_|^should_|^active$|^enabled$|^verified$|^confirmed$",
        "boolean",
        {"probability": 0.5},
    ),
    # --- Status-like categoricals ---
    (
        r"^status$|^state$|^order_?status$|^subscription_?status$",
        "categorical",
        {"choices": ["active", "inactive", "pending", "cancelled"]},
    ),
    # --- Dates (fixed 2023-2024 window) ---
    (
        r"^date$|^created_?at$|^updated_?at$|^start_?date$|^end_?date$|_date$|_at$",
        "date",
        {"start": "2023-01-01", "end": "2024-12-31"},
    ),
]
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class SemanticInference:
    """
    Infer column semantics from column names and repair column definitions.

    Every entry of ``SEMANTIC_PATTERNS`` is compiled once, case-insensitively,
    when an instance is created. A column name is then tested against the
    compiled patterns in table order and the first match determines the
    inferred type and distribution parameters.
    """

    def __init__(self, strict_mode: bool = False):
        """
        Initialize semantic inference.

        Args:
            strict_mode: If True, every column whose name matches a pattern is
                rewritten with the inferred type and parameters. If False,
                only the narrow cases listed in ``fix_column`` are rewritten.
        """
        self.strict_mode = strict_mode
        self.patterns = [
            (re.compile(pattern, re.IGNORECASE), col_type, params)
            for pattern, col_type, params in SEMANTIC_PATTERNS
        ]

    def infer_column(self, column_name: str) -> Optional[Tuple[str, Dict[str, Any]]]:
        """
        Infer column type and parameters from name.

        Args:
            column_name: Name of the column

        Returns:
            Tuple of (type, distribution_params) for the first pattern that
            matches, or None if no pattern matches. The params dict is an
            independent copy that the caller may modify freely.
        """
        for pattern, col_type, params in self.patterns:
            if pattern.search(column_name):
                # Deep copy so nested values (e.g. the "choices" list of the
                # status pattern) are not shared with the pattern table; a
                # shallow copy would let callers corrupt later inferences.
                return (col_type, copy.deepcopy(params))
        return None

    def _should_fix(self, column: "Column", current_params: Dict[str, Any]) -> bool:
        """Decide whether a column that matched a pattern should be rewritten."""
        if self.strict_mode:
            # Strict mode: always trust the name-based inference.
            return True

        # Case 1: text column still using the default "sentence" generator
        # (explicitly or because no text_type was given).
        if column.type == "text":
            return current_params.get("text_type", "sentence") == "sentence"

        # Case 2: numeric column with no lower bound whose name contains
        # "price" or "age". This is a plain substring test, so names such as
        # "page_count" or "percentage" also qualify.
        if column.type in ("int", "float"):
            lowered = column.name.lower()
            if "price" in lowered or "age" in lowered:
                return "min" not in current_params

        return False

    def fix_column(self, column: "Column", table_name: str = "") -> "Column":
        """
        Fix a column's type/params based on semantic inference.

        A column is only ever changed when its name matches a pattern. In
        strict mode every such column is rewritten. Otherwise it is rewritten
        only when it is a ``text`` column whose ``text_type`` is missing or
        ``"sentence"``, or an ``int``/``float`` column without a ``"min"``
        parameter whose name contains ``"price"`` or ``"age"``.

        Args:
            column: Column to potentially fix
            table_name: Name of the table (for context); currently not used
                by the decision logic

        Returns:
            A new Column with the inferred type and merged parameters, or the
            original column object when no fix applies.
        """
        inferred = self.infer_column(column.name)
        if inferred is None:
            return column

        inferred_type, inferred_params = inferred

        # Treat missing parameters as an empty mapping so a column created
        # with distribution_params=None does not crash the lookup or merge.
        current_params = column.distribution_params or {}

        if not self._should_fix(column, current_params):
            return column

        # Inferred values win on key collisions; unrelated existing keys stay.
        merged_params = {**current_params, **inferred_params}

        # NOTE(review): only these five fields are carried over; if Column
        # declares further fields they fall back to their defaults here -
        # confirm against misata.schema.
        return Column(
            name=column.name,
            type=inferred_type,
            distribution_params=merged_params,
            nullable=column.nullable,
            unique=column.unique,
        )

    def fix_schema_columns(self, columns: Dict[str, List["Column"]]) -> Dict[str, List["Column"]]:
        """
        Fix all columns in a schema using semantic inference.

        Args:
            columns: Dict mapping table names to column lists

        Returns:
            New dict with the same table names, each mapped to a new list
            holding the result of ``fix_column`` for every column.
        """
        return {
            table_name: [self.fix_column(col, table_name) for col in cols]
            for table_name, cols in columns.items()
        }
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
# Module-level shortcut around SemanticInference
def apply_semantic_inference(columns: Dict[str, List[Column]], strict: bool = False) -> Dict[str, List[Column]]:
    """
    Run semantic inference over every table's columns in a single call.

    Creates a one-off ``SemanticInference`` instance and delegates to its
    ``fix_schema_columns`` method.

    Args:
        columns: Mapping of table name to that table's list of columns
        strict: Passed through as ``strict_mode``; when True the semantic
            rules are applied to every column whose name matches a pattern

    Returns:
        The mapping returned by ``fix_schema_columns``
    """
    return SemanticInference(strict_mode=strict).fix_schema_columns(columns)
|