misata 0.1.0b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
misata/story_parser.py ADDED
@@ -0,0 +1,425 @@
+ """
+ Story parser for converting natural language descriptions to SchemaConfig.
+
+ This module provides rule-based pattern matching to extract:
+ - Business domain (SaaS, E-commerce, Pharma, etc.)
+ - Scale parameters (number of users, transactions, etc.)
+ - Temporal patterns (growth, churn, seasonality, crashes)
+ - Data relationships
+ """
+
+ import re
+ from typing import Any, Dict, List, Optional, Tuple
+
+ from misata.schema import Column, Relationship, ScenarioEvent, SchemaConfig, Table
+
+
+ class StoryParser:
+     """
+     Parses natural language stories into SchemaConfig objects.
+
+     Uses regex patterns and template matching for the MVP version.
+     Future: can be enhanced with LLM integration.
+     """
+
+     # Pattern definitions
+     SCALE_PATTERNS = {
+         r"(\d+(?:\.\d+)?[KkMm]?)\s*users": "users",
+         r"(\d+(?:\.\d+)?[KkMm]?)\s*customers": "users",
+         r"(\d+(?:\.\d+)?[KkMm]?)\s*transactions": "transactions",
+         r"(\d+(?:\.\d+)?[KkMm]?)\s*orders": "orders",
+         r"(\d+(?:\.\d+)?[KkMm]?)\s*projects": "projects",
+     }
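+     # For example, "50K users" or "1.5M customers" in a story yields a captured
+     # size string that _parse_number later converts to an integer row count.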
+
+     TEMPORAL_PATTERNS = {
+         r"(\d+)%\s*growth": ("growth", "rate"),
+         r"(\d+)%\s*churn": ("churn", "rate"),
+         r"crash\s*in\s*((?:Q[1-4]|[A-Za-z]+)\s*\d{4})": ("crash", "date"),
+         r"seasonality": ("seasonality", None),
+         r"seasonal": ("seasonality", None),
+     }
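+     # e.g., "20% churn" -> ("churn", 0.2) and "crash in Q3 2023" -> ("crash", "Q3 2023")
+     # once _extract_temporal_events applies these patterns below.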
+
+     DOMAIN_KEYWORDS = {
+         "saas": ["saas", "subscription", "mrr", "arr", "churn"],
+         "ecommerce": ["ecommerce", "e-commerce", "orders", "cart", "products"],
+         "pharma": ["pharma", "research", "timesheet", "clinical", "trials"],
+         "fintech": ["fintech", "transactions", "payments", "wallet"],
+     }
+
+     def __init__(self):
+         """Initialize the story parser."""
+         self.detected_domain: Optional[str] = None
+         self.scale_params: Dict[str, int] = {}
+         self.temporal_events: List[Tuple[str, Any]] = []
+
+     def _parse_number(self, num_str: str) -> int:
+         """Parse number strings like '50K', '1.5M' to integers."""
+         num_str = num_str.strip().upper()
+
+         if num_str.endswith('K'):
+             return int(float(num_str[:-1]) * 1000)
+         elif num_str.endswith('M'):
+             return int(float(num_str[:-1]) * 1_000_000)
+         else:
+             return int(num_str)
+
+     def _detect_domain(self, story: str) -> Optional[str]:
+         """Detect business domain from story text."""
+         story_lower = story.lower()
+
+         for domain, keywords in self.DOMAIN_KEYWORDS.items():
+             for keyword in keywords:
+                 if keyword in story_lower:
+                     return domain
+
+         return None
+
+     def _extract_scale(self, story: str) -> Dict[str, int]:
+         """Extract scale parameters (number of records) from story."""
+         scale_params = {}
+
+         for pattern, entity_type in self.SCALE_PATTERNS.items():
+             match = re.search(pattern, story, re.IGNORECASE)
+             if match:
+                 num_str = match.group(1)
+                 scale_params[entity_type] = self._parse_number(num_str)
+
+         return scale_params
+
+     def _extract_temporal_events(self, story: str) -> List[Tuple[str, Any]]:
+         """Extract temporal patterns (growth, churn, crashes, etc.)."""
+         events = []
+
+         for pattern, (event_type, param_type) in self.TEMPORAL_PATTERNS.items():
+             matches = re.finditer(pattern, story, re.IGNORECASE)
+             for match in matches:
+                 if param_type == "rate":
+                     value = int(match.group(1))
+                     events.append((event_type, value / 100))  # Convert percentage
+                 elif param_type == "date":
+                     date_str = match.group(1)
+                     events.append((event_type, date_str))
+                 else:
+                     events.append((event_type, None))
+
+         return events
+
+     def parse(self, story: str, default_rows: int = 10000) -> SchemaConfig:
+         """
+         Parse a natural language story into a SchemaConfig.
+
+         Args:
+             story: Natural language description of the data to generate
+             default_rows: Default number of rows if not specified in story
+
+         Returns:
+             SchemaConfig object ready for data generation
+
+         Example:
+             >>> parser = StoryParser()
+             >>> config = parser.parse(
+             ...     "A SaaS company with 50K users, 20% churn in Q3 2023"
+             ... )
+         """
+         # Extract information from story
+         self.detected_domain = self._detect_domain(story)
+         self.scale_params = self._extract_scale(story)
+         self.temporal_events = self._extract_temporal_events(story)
+
+         # Build schema based on detected domain
+         if self.detected_domain == "saas":
+             return self._build_saas_schema(story, default_rows)
+         elif self.detected_domain == "ecommerce":
+             return self._build_ecommerce_schema(story, default_rows)
+         elif self.detected_domain == "pharma":
+             return self._build_pharma_schema(story, default_rows)
+         else:
+             # Generic schema
+             return self._build_generic_schema(story, default_rows)
+
+     def _build_saas_schema(self, story: str, default_rows: int) -> SchemaConfig:
+         """Build a SaaS-specific schema."""
+         num_users = self.scale_params.get("users", default_rows)
+         num_subscriptions = int(num_users * 1.2)  # Some users have multiple subs
+
+         # Define tables
+         tables = [
+             Table(name="users", row_count=num_users, description="User accounts"),
+             Table(
+                 name="subscriptions",
+                 row_count=num_subscriptions,
+                 description="User subscriptions",
+             ),
+         ]
+
+         # Define columns
+         columns = {
+             "users": [
+                 Column(name="user_id", type="int", distribution_params={"min": 1, "max": num_users}),
+                 Column(name="email", type="text", distribution_params={"text_type": "email"}),
+                 Column(name="name", type="text", distribution_params={"text_type": "name"}),
+                 Column(
+                     name="signup_date",
+                     type="date",
+                     distribution_params={"start": "2022-01-01", "end": "2024-12-31"},
+                 ),
+                 Column(
+                     name="plan",
+                     type="categorical",
+                     distribution_params={
+                         "choices": ["free", "starter", "pro", "enterprise"],
+                         "probabilities": [0.4, 0.3, 0.25, 0.05],
+                     },
+                 ),
+                 Column(name="churned", type="boolean", distribution_params={"probability": 0.15}),
+             ],
+             "subscriptions": [
+                 Column(
+                     name="subscription_id",
+                     type="int",
+                     distribution_params={"min": 1, "max": num_subscriptions},
+                 ),
+                 Column(name="user_id", type="foreign_key", distribution_params={}),
+                 Column(
+                     name="start_date",
+                     type="date",
+                     distribution_params={"start": "2022-01-01", "end": "2024-12-31"},
+                 ),
+                 Column(
+                     name="mrr",
+                     type="float",
+                     distribution_params={
+                         "distribution": "normal",
+                         "mean": 150.0,
+                         "std": 50.0,
+                         "min": 0.0,
+                         "decimals": 2,
+                     },
+                 ),
+                 Column(
+                     name="status",
+                     type="categorical",
+                     distribution_params={
+                         "choices": ["active", "cancelled", "paused"],
+                         "probabilities": [0.7, 0.2, 0.1],
+                     },
+                 ),
+             ],
+         }
+
+         # Define relationships
+         relationships = [
+             Relationship(
+                 parent_table="users",
+                 child_table="subscriptions",
+                 parent_key="user_id",
+                 child_key="user_id",
+             ),
+         ]
+
+         # Build scenario events from temporal patterns
+         events = []
+         for event_type, value in self.temporal_events:
+             if event_type == "churn":
+                 # The churn window from the story (e.g., "Q3 2023") is not parsed yet;
+                 # for simplicity, use a fixed date
+                 events.append(
+                     ScenarioEvent(
+                         name="High_Churn_Period",
+                         table="users",
+                         column="churned",
+                         condition="signup_date < '2023-06-01'",
+                         modifier_type="set",
+                         modifier_value=True,
+                         description=f"Churn rate of {value*100:.0f}%",
+                     )
+                 )
+             elif event_type == "growth":
+                 events.append(
+                     ScenarioEvent(
+                         name="MRR_Growth",
+                         table="subscriptions",
+                         column="mrr",
+                         condition="start_date > '2023-06-01'",
+                         modifier_type="multiply",
+                         modifier_value=1 + value,
+                         description=f"Growth rate of {value*100:.0f}%",
+                     )
+                 )
+
+         return SchemaConfig(
+             name="SaaS Dataset",
+             description=f"Generated from story: {story}",
+             tables=tables,
+             columns=columns,
+             relationships=relationships,
+             events=events,
+         )
+
+     def _build_ecommerce_schema(self, story: str, default_rows: int) -> SchemaConfig:
+         """Build an E-commerce-specific schema."""
+         num_customers = self.scale_params.get("users", default_rows)
+         num_orders = self.scale_params.get("orders", int(num_customers * 3))
+
+         tables = [
+             Table(name="customers", row_count=num_customers),
+             Table(name="orders", row_count=num_orders),
+         ]
+
+         columns = {
+             "customers": [
+                 Column(name="customer_id", type="int", distribution_params={"min": 1, "max": num_customers}),
+                 Column(name="email", type="text", distribution_params={"text_type": "email"}),
+                 Column(name="name", type="text", distribution_params={"text_type": "name"}),
+                 Column(
+                     name="signup_date",
+                     type="date",
+                     distribution_params={"start": "2022-01-01", "end": "2024-12-31"},
+                 ),
+             ],
+             "orders": [
+                 Column(name="order_id", type="int", distribution_params={"min": 1, "max": num_orders}),
+                 Column(name="customer_id", type="foreign_key", distribution_params={}),
+                 Column(
+                     name="order_date",
+                     type="date",
+                     distribution_params={"start": "2022-01-01", "end": "2024-12-31"},
+                 ),
+                 Column(
+                     name="amount",
+                     type="float",
+                     distribution_params={
+                         "distribution": "normal",
+                         "mean": 75.0,
+                         "std": 30.0,
+                         "min": 10.0,
+                         "decimals": 2,
+                     },
+                 ),
+             ],
+         }
+
+         relationships = [
+             Relationship(
+                 parent_table="customers",
+                 child_table="orders",
+                 parent_key="customer_id",
+                 child_key="customer_id",
+             ),
+         ]
+
+         return SchemaConfig(
+             name="E-commerce Dataset",
+             description=f"Generated from story: {story}",
+             tables=tables,
+             columns=columns,
+             relationships=relationships,
+             events=[],
+         )
+
+     def _build_pharma_schema(self, story: str, default_rows: int) -> SchemaConfig:
+         """Build a Pharma services-specific schema."""
+         num_projects = self.scale_params.get("projects", default_rows // 100)
+         num_timesheets = default_rows
+
+         tables = [
+             Table(name="research_projects", row_count=num_projects),
+             Table(name="timesheets", row_count=num_timesheets),
+         ]
+
+         columns = {
+             "research_projects": [
+                 Column(name="project_id", type="int", distribution_params={"min": 1, "max": num_projects}),
+                 Column(name="project_name", type="text", distribution_params={"text_type": "company"}),
+                 Column(
+                     name="start_date",
+                     type="date",
+                     distribution_params={"start": "2022-01-01", "end": "2024-01-01"},
+                 ),
+                 Column(
+                     name="status",
+                     type="categorical",
+                     distribution_params={
+                         "choices": ["planning", "active", "completed", "on-hold"],
+                         "probabilities": [0.1, 0.5, 0.3, 0.1],
+                     },
+                 ),
+             ],
+             "timesheets": [
+                 Column(name="entry_id", type="int", distribution_params={"min": 1, "max": num_timesheets}),
+                 Column(name="project_id", type="foreign_key", distribution_params={}),
+                 Column(name="employee_name", type="text", distribution_params={"text_type": "name"}),
+                 Column(
+                     name="date",
+                     type="date",
+                     distribution_params={"start": "2022-01-01", "end": "2024-12-31"},
+                 ),
+                 Column(
+                     name="hours",
+                     type="float",
+                     distribution_params={
+                         "distribution": "normal",
+                         "mean": 7.5,
+                         "std": 1.5,
+                         "min": 0.5,
+                         "max": 12.0,
+                         "decimals": 1,
+                     },
+                 ),
+             ],
+         }
+
+         relationships = [
+             Relationship(
+                 parent_table="research_projects",
+                 child_table="timesheets",
+                 parent_key="project_id",
+                 child_key="project_id",
+             ),
+         ]
+
+         return SchemaConfig(
+             name="Pharma Services Dataset",
+             description=f"Generated from story: {story}",
+             tables=tables,
+             columns=columns,
+             relationships=relationships,
+             events=[],
+         )
+
+     def _build_generic_schema(self, story: str, default_rows: int) -> SchemaConfig:
+         """Build a generic schema when the domain is not detected."""
+         tables = [
+             Table(name="main_table", row_count=default_rows),
+         ]
+
+         columns = {
+             "main_table": [
+                 Column(name="id", type="int", distribution_params={"min": 1, "max": default_rows}),
+                 Column(name="name", type="text", distribution_params={"text_type": "name"}),
+                 Column(
+                     name="value",
+                     type="float",
+                     distribution_params={
+                         "distribution": "normal",
+                         "mean": 100.0,
+                         "std": 20.0,
+                         "decimals": 2,
+                     },
+                 ),
+                 Column(
+                     name="date",
+                     type="date",
+                     distribution_params={"start": "2022-01-01", "end": "2024-12-31"},
+                 ),
+             ],
+         }
+
+         return SchemaConfig(
+             name="Generic Dataset",
+             description=f"Generated from story: {story}",
+             tables=tables,
+             columns=columns,
+             relationships=[],
+             events=[],
+         )