misata 0.1.0b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- misata/__init__.py +48 -0
- misata/api.py +460 -0
- misata/audit.py +415 -0
- misata/benchmark.py +376 -0
- misata/cli.py +680 -0
- misata/codegen.py +153 -0
- misata/curve_fitting.py +106 -0
- misata/customization.py +256 -0
- misata/feedback.py +433 -0
- misata/formulas.py +362 -0
- misata/generators.py +247 -0
- misata/hybrid.py +398 -0
- misata/llm_parser.py +493 -0
- misata/noise.py +346 -0
- misata/schema.py +252 -0
- misata/semantic.py +185 -0
- misata/simulator.py +742 -0
- misata/story_parser.py +425 -0
- misata/templates/__init__.py +444 -0
- misata/validation.py +313 -0
- misata-0.1.0b0.dist-info/METADATA +291 -0
- misata-0.1.0b0.dist-info/RECORD +25 -0
- misata-0.1.0b0.dist-info/WHEEL +5 -0
- misata-0.1.0b0.dist-info/entry_points.txt +2 -0
- misata-0.1.0b0.dist-info/top_level.txt +1 -0
misata/story_parser.py
ADDED
|
@@ -0,0 +1,425 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Story parser for converting natural language descriptions to SchemaConfig.
|
|
3
|
+
|
|
4
|
+
This module provides rule-based pattern matching to extract:
|
|
5
|
+
- Business domain (SaaS, E-commerce, Pharma, etc.)
|
|
6
|
+
- Scale parameters (number of users, transactions, etc.)
|
|
7
|
+
- Temporal patterns (growth, churn, seasonality, crashes)
|
|
8
|
+
- Data relationships
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import re
|
|
12
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
13
|
+
|
|
14
|
+
from misata.schema import Column, Relationship, ScenarioEvent, SchemaConfig, Table
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class StoryParser:
    """
    Parses natural language stories into SchemaConfig objects.

    Uses regex patterns and keyword matching for the MVP version.
    Future: Can be enhanced with LLM integration.

    After ``parse`` runs, the intermediate results are available on the
    instance as ``detected_domain``, ``scale_params`` and ``temporal_events``.
    """

    # Number fragment shared by all scale patterns: digits with optional
    # thousands separators, an optional decimal part and an optional K/M suffix
    # (e.g. "250", "50,000", "50K", "1.5M").
    _NUMBER = r"(\d+(?:,\d{3})*(?:\.\d+)?[KkMm]?)"

    # Pattern definitions
    # Maps a regex (group 1 = the quantity) to the scale parameter it fills.
    # "customers" intentionally feeds the same "users" parameter.
    SCALE_PATTERNS = {
        _NUMBER + r"\s*users": "users",
        _NUMBER + r"\s*customers": "users",
        _NUMBER + r"\s*transactions": "transactions",
        _NUMBER + r"\s*orders": "orders",
        _NUMBER + r"\s*projects": "projects",
    }

    # Maps a regex to (event_type, param_type). param_type "rate" means group 1
    # is a percentage, "date" means group 1 is a period such as "Q3 2023" or
    # "March 2023", and None means the pattern carries no parameter.
    TEMPORAL_PATTERNS = {
        r"(\d+(?:\.\d+)?)\s*%\s*growth": ("growth", "rate"),
        r"(\d+(?:\.\d+)?)\s*%\s*churn": ("churn", "rate"),
        r"crash\s*in\s*((?:Q[1-4]|[A-Za-z]{3,9})\s*\d{4})": ("crash", "date"),
        # One pattern for both "seasonal" and "seasonality" so that the word
        # "seasonality" does not produce two events.
        r"seasonal(?:ity)?": ("seasonality", None),
    }

    # Keywords that vote for each business domain. Declaration order is the
    # tie-breaker when two domains collect the same number of hits.
    DOMAIN_KEYWORDS = {
        "saas": ["saas", "subscription", "mrr", "arr", "churn"],
        "ecommerce": ["ecommerce", "e-commerce", "orders", "cart", "products"],
        "pharma": ["pharma", "research", "timesheet", "clinical", "trials"],
        "fintech": ["fintech", "transactions", "payments", "wallet"],
    }

    def __init__(self):
        """Initialize the story parser with empty extraction results."""
        self.detected_domain: Optional[str] = None
        self.scale_params: Dict[str, int] = {}
        self.temporal_events: List[Tuple[str, Any]] = []

    def _parse_number(self, num_str: str) -> int:
        """
        Parse number strings like '50K', '1.5M' or '50,000' to integers.

        Args:
            num_str: Quantity text, optionally with thousands separators,
                a decimal part and a trailing K (thousand) or M (million).

        Returns:
            The quantity as an integer (fractional results are rounded).

        Raises:
            ValueError: If the text is not a number.
        """
        multipliers = {"K": 1_000, "M": 1_000_000}

        cleaned = num_str.strip().upper().replace(",", "")
        multiplier = multipliers.get(cleaned[-1:], 1)
        if multiplier != 1:
            cleaned = cleaned[:-1]

        try:
            # Pure integers stay exact and never go through float.
            return int(cleaned) * multiplier
        except ValueError:
            # Decimal input such as "1.5"; round so that float noise
            # cannot shave one off the result.
            return int(round(float(cleaned) * multiplier))

    @staticmethod
    def _keyword_regex(keyword: str) -> str:
        """
        Build the regex used to look for a domain keyword in a story.

        Short keywords (four characters or fewer, e.g. "arr", "cart") must
        match a whole word, optionally pluralised, so they do not fire inside
        unrelated words such as "array" or "carry". Longer keywords only need
        to start a word, so "pharmaceutical" or "subscriptions" still count.
        """
        escaped = re.escape(keyword)
        if len(keyword) <= 4:
            return r"\b" + escaped + r"s?\b"
        return r"\b" + escaped

    def _detect_domain(self, story: str) -> Optional[str]:
        """
        Detect business domain from story text.

        Every domain is scored by how many of its keywords occur in the
        story; the highest score wins and ties go to the domain declared
        first in ``DOMAIN_KEYWORDS``.

        Returns:
            The domain name, or None when no keyword matches.
        """
        text = story.lower()
        best_domain: Optional[str] = None
        best_hits = 0

        for domain, keywords in self.DOMAIN_KEYWORDS.items():
            hits = sum(
                1 for keyword in keywords
                if re.search(self._keyword_regex(keyword), text)
            )
            # Strict comparison keeps the earlier domain on a tie.
            if hits > best_hits:
                best_domain, best_hits = domain, hits

        return best_domain

    def _extract_scale(self, story: str) -> Dict[str, int]:
        """
        Extract scale parameters (number of records) from story.

        Only the first occurrence of each pattern is used. When several
        patterns feed the same parameter, the one declared later in
        ``SCALE_PATTERNS`` overwrites the earlier value.
        """
        scale_params: Dict[str, int] = {}

        for pattern, entity_type in self.SCALE_PATTERNS.items():
            match = re.search(pattern, story, re.IGNORECASE)
            if match:
                scale_params[entity_type] = self._parse_number(match.group(1))

        return scale_params

    def _extract_temporal_events(self, story: str) -> List[Tuple[str, Any]]:
        """
        Extract temporal patterns (growth, churn, crashes, etc.).

        Returns:
            A list of (event_type, value) tuples where value is a fraction
            for rates (20% -> 0.2), the matched period text for dates, and
            None for parameterless patterns.
        """
        events: List[Tuple[str, Any]] = []

        for pattern, (event_type, param_type) in self.TEMPORAL_PATTERNS.items():
            for match in re.finditer(pattern, story, re.IGNORECASE):
                if param_type == "rate":
                    # Convert percentage to a fraction.
                    events.append((event_type, float(match.group(1)) / 100))
                elif param_type == "date":
                    events.append((event_type, match.group(1)))
                else:
                    events.append((event_type, None))

        return events

    def parse(self, story: str, default_rows: int = 10000) -> SchemaConfig:
        """
        Parse a natural language story into a SchemaConfig.

        Args:
            story: Natural language description of the data to generate
            default_rows: Default number of rows if not specified in story

        Returns:
            SchemaConfig object ready for data generation

        Example:
            >>> parser = StoryParser()
            >>> config = parser.parse(
            ...     "A SaaS company with 50K users, 20% churn in Q3 2023"
            ... )
        """
        # Extract information from story
        self.detected_domain = self._detect_domain(story)
        self.scale_params = self._extract_scale(story)
        self.temporal_events = self._extract_temporal_events(story)

        # Build schema based on detected domain. Domains without a dedicated
        # builder (including "fintech") and undetected domains get the
        # generic schema.
        builders = {
            "saas": self._build_saas_schema,
            "ecommerce": self._build_ecommerce_schema,
            "pharma": self._build_pharma_schema,
        }
        builder = builders.get(self.detected_domain, self._build_generic_schema)
        return builder(story, default_rows)

    def _build_saas_schema(self, story: str, default_rows: int) -> SchemaConfig:
        """
        Build a SaaS-specific schema with ``users`` and ``subscriptions``.

        The user count comes from the parsed "users" scale parameter (falling
        back to ``default_rows``); subscriptions are sized at 1.2x users.
        Parsed churn and growth events are turned into scenario events.
        """
        num_users = self.scale_params.get("users", default_rows)
        num_subscriptions = int(num_users * 1.2)  # Some users have multiple subs

        # Define tables
        tables = [
            Table(name="users", row_count=num_users, description="User accounts"),
            Table(
                name="subscriptions",
                row_count=num_subscriptions,
                description="User subscriptions",
            ),
        ]

        # Define columns
        columns = {
            "users": [
                Column(name="user_id", type="int", distribution_params={"min": 1, "max": num_users}),
                Column(name="email", type="text", distribution_params={"text_type": "email"}),
                Column(name="name", type="text", distribution_params={"text_type": "name"}),
                Column(
                    name="signup_date",
                    type="date",
                    distribution_params={"start": "2022-01-01", "end": "2024-12-31"},
                ),
                Column(
                    name="plan",
                    type="categorical",
                    distribution_params={
                        "choices": ["free", "starter", "pro", "enterprise"],
                        "probabilities": [0.4, 0.3, 0.25, 0.05],
                    },
                ),
                Column(name="churned", type="boolean", distribution_params={"probability": 0.15}),
            ],
            "subscriptions": [
                Column(
                    name="subscription_id",
                    type="int",
                    distribution_params={"min": 1, "max": num_subscriptions},
                ),
                Column(name="user_id", type="foreign_key", distribution_params={}),
                Column(
                    name="start_date",
                    type="date",
                    distribution_params={"start": "2022-01-01", "end": "2024-12-31"},
                ),
                Column(
                    name="mrr",
                    type="float",
                    distribution_params={
                        "distribution": "normal",
                        "mean": 150.0,
                        "std": 50.0,
                        "min": 0.0,
                        "decimals": 2,
                    },
                ),
                Column(
                    name="status",
                    type="categorical",
                    distribution_params={
                        "choices": ["active", "cancelled", "paused"],
                        "probabilities": [0.7, 0.2, 0.1],
                    },
                ),
            ],
        }

        # Define relationships
        relationships = [
            Relationship(
                parent_table="users",
                child_table="subscriptions",
                parent_key="user_id",
                child_key="user_id",
            ),
        ]

        # Build scenario events from temporal patterns
        events = []
        for event_type, value in self.temporal_events:
            if event_type == "churn":
                # NOTE(review): the period mentioned in the story (e.g. "Q3 2023")
                # is not parsed; a fixed cutoff date is used, and the parsed
                # rate is only recorded in the description.
                events.append(
                    ScenarioEvent(
                        name="High_Churn_Period",
                        table="users",
                        column="churned",
                        condition="signup_date < '2023-06-01'",
                        modifier_type="set",
                        modifier_value=True,
                        description=f"Churn rate of {value*100:.0f}%",
                    )
                )
            elif event_type == "growth":
                events.append(
                    ScenarioEvent(
                        name="MRR_Growth",
                        table="subscriptions",
                        column="mrr",
                        condition="start_date > '2023-06-01'",
                        modifier_type="multiply",
                        modifier_value=1 + value,
                        description=f"Growth rate of {value*100:.0f}%",
                    )
                )

        return SchemaConfig(
            name="SaaS Dataset",
            description=f"Generated from story: {story}",
            tables=tables,
            columns=columns,
            relationships=relationships,
            events=events,
        )

    def _build_ecommerce_schema(self, story: str, default_rows: int) -> SchemaConfig:
        """
        Build an E-commerce-specific schema with ``customers`` and ``orders``.

        Customers come from the parsed "users" scale parameter (falling back
        to ``default_rows``); orders come from the parsed "orders" parameter
        or default to three per customer. No scenario events are generated.
        """
        num_customers = self.scale_params.get("users", default_rows)
        num_orders = self.scale_params.get("orders", int(num_customers * 3))

        tables = [
            Table(name="customers", row_count=num_customers),
            Table(name="orders", row_count=num_orders),
        ]

        columns = {
            "customers": [
                Column(name="customer_id", type="int", distribution_params={"min": 1, "max": num_customers}),
                Column(name="email", type="text", distribution_params={"text_type": "email"}),
                Column(name="name", type="text", distribution_params={"text_type": "name"}),
                Column(
                    name="signup_date",
                    type="date",
                    distribution_params={"start": "2022-01-01", "end": "2024-12-31"},
                ),
            ],
            "orders": [
                Column(name="order_id", type="int", distribution_params={"min": 1, "max": num_orders}),
                Column(name="customer_id", type="foreign_key", distribution_params={}),
                Column(
                    name="order_date",
                    type="date",
                    distribution_params={"start": "2022-01-01", "end": "2024-12-31"},
                ),
                Column(
                    name="amount",
                    type="float",
                    distribution_params={
                        "distribution": "normal",
                        "mean": 75.0,
                        "std": 30.0,
                        "min": 10.0,
                        "decimals": 2,
                    },
                ),
            ],
        }

        relationships = [
            Relationship(
                parent_table="customers",
                child_table="orders",
                parent_key="customer_id",
                child_key="customer_id",
            ),
        ]

        return SchemaConfig(
            name="E-commerce Dataset",
            description=f"Generated from story: {story}",
            tables=tables,
            columns=columns,
            relationships=relationships,
            events=[],
        )

    def _build_pharma_schema(self, story: str, default_rows: int) -> SchemaConfig:
        """
        Build a Pharma services-specific schema.

        Produces ``research_projects`` (sized from the parsed "projects"
        scale parameter, defaulting to ``default_rows // 100``) and
        ``timesheets`` (always ``default_rows`` rows). No scenario events
        are generated.
        """
        num_projects = self.scale_params.get("projects", default_rows // 100)
        num_timesheets = default_rows

        tables = [
            Table(name="research_projects", row_count=num_projects),
            Table(name="timesheets", row_count=num_timesheets),
        ]

        columns = {
            "research_projects": [
                Column(name="project_id", type="int", distribution_params={"min": 1, "max": num_projects}),
                Column(name="project_name", type="text", distribution_params={"text_type": "company"}),
                Column(
                    name="start_date",
                    type="date",
                    distribution_params={"start": "2022-01-01", "end": "2024-01-01"},
                ),
                Column(
                    name="status",
                    type="categorical",
                    distribution_params={
                        "choices": ["planning", "active", "completed", "on-hold"],
                        "probabilities": [0.1, 0.5, 0.3, 0.1],
                    },
                ),
            ],
            "timesheets": [
                Column(name="entry_id", type="int", distribution_params={"min": 1, "max": num_timesheets}),
                Column(name="project_id", type="foreign_key", distribution_params={}),
                Column(name="employee_name", type="text", distribution_params={"text_type": "name"}),
                Column(
                    name="date",
                    type="date",
                    distribution_params={"start": "2022-01-01", "end": "2024-12-31"},
                ),
                Column(
                    name="hours",
                    type="float",
                    distribution_params={
                        "distribution": "normal",
                        "mean": 7.5,
                        "std": 1.5,
                        "min": 0.5,
                        "max": 12.0,
                        "decimals": 1,
                    },
                ),
            ],
        }

        relationships = [
            Relationship(
                parent_table="research_projects",
                child_table="timesheets",
                parent_key="project_id",
                child_key="project_id",
            ),
        ]

        return SchemaConfig(
            name="Pharma Services Dataset",
            description=f"Generated from story: {story}",
            tables=tables,
            columns=columns,
            relationships=relationships,
            events=[],
        )

    def _build_generic_schema(self, story: str, default_rows: int) -> SchemaConfig:
        """
        Build a generic schema when domain is not detected.

        Produces a single ``main_table`` with ``default_rows`` rows and no
        relationships or events. Parsed scale parameters are not used here.
        """
        tables = [
            Table(name="main_table", row_count=default_rows),
        ]

        columns = {
            "main_table": [
                Column(name="id", type="int", distribution_params={"min": 1, "max": default_rows}),
                Column(name="name", type="text", distribution_params={"text_type": "name"}),
                Column(
                    name="value",
                    type="float",
                    distribution_params={
                        "distribution": "normal",
                        "mean": 100.0,
                        "std": 20.0,
                        "decimals": 2,
                    },
                ),
                Column(
                    name="date",
                    type="date",
                    distribution_params={"start": "2022-01-01", "end": "2024-12-31"},
                ),
            ],
        }

        return SchemaConfig(
            name="Generic Dataset",
            description=f"Generated from story: {story}",
            tables=tables,
            columns=columns,
            relationships=[],
            events=[],
        )
|