misata 0.1.0b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
misata/llm_parser.py ADDED
"""
LLM-powered schema generator using Groq Llama 3.3.

This module provides intelligent schema generation from natural language,
including:
- Reference tables with actual LLM-generated data (exercises, plans, meals)
- Transactional tables with foreign keys to reference tables
- Industry-realistic column configurations
"""

import json
import os
from pathlib import Path
from typing import Dict, Optional

from groq import Groq

from misata.curve_fitting import CurveFitter
from misata.schema import Column, Relationship, ScenarioEvent, SchemaConfig, Table


# Load .env file if it exists
def _load_env():
    """Load environment variables from the first .env file found."""
    env_paths = [
        Path.cwd() / ".env",
        Path(__file__).parent.parent / ".env",
        Path.home() / ".misata" / ".env",
    ]

    for env_path in env_paths:
        if env_path.exists():
            with open(env_path) as f:
                for line in f:
                    line = line.strip()
                    if line and not line.startswith("#") and "=" in line:
                        key, _, value = line.partition("=")
                        os.environ.setdefault(key.strip(), value.strip())
            break

_load_env()
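# A .env file is plain KEY=value lines; a minimal sketch of what the loader
# above accepts (values are illustrative placeholders, not real keys):
#
#   # comments and blank lines are skipped
#   GROQ_API_KEY=gsk_example_not_a_real_key
#   MISATA_PROVIDER=groq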


SYSTEM_PROMPT = """You are Misata, an expert synthetic data architect. Generate realistic database schemas with TWO types of tables:

## TABLE TYPES

### 1. REFERENCE TABLES (is_reference: true)
Small lookup tables with ACTUAL DATA you generate. Include realistic rows.
Examples: plans, exercises, categories, products, meal_types

For reference tables, provide:
- is_reference: true
- inline_data: Array of actual rows with realistic values

### 2. TRANSACTIONAL TABLES (is_reference: false)
Large tables generated by code using foreign keys to reference tables.
Examples: users, subscriptions, orders, workouts, payments

For transactional tables, provide:
- row_count: Number of rows to generate
- Columns with distribution parameters

## CRITICAL RULES

### Reference Table Requirements:
- ALWAYS include an "id" column (integer, sequential from 1)
- Provide 5-20 realistic rows in inline_data
- Prices in reference tables are the SOURCE OF TRUTH

### Transactional Table Requirements:
- Use foreign_key type to reference parent tables (reference tables or other transactional parents)
- Users: type="text" with text_type="name" or "email"
- Metrics use distribution parameters

### Foreign Key Rules:
- foreign_key columns reference the parent table's "id" column
- The parent can be either a reference table (plans.id) or a transactional table (users.id)

### Advanced Distributions (Optional):
Instead of guessing parameters, you can provide "control_points" to draw the shape.
Format: {"distribution": "normal", "control_points": [{"x": 10, "y": 0.1}, {"x": 50, "y": 0.9}]}
Misata will mathematically solve for the best parameters.

## OUTPUT FORMAT

{
  "name": "Dataset Name",
  "description": "Description",
  "seed": 42,
  "tables": [
    {
      "name": "plans",
      "is_reference": true,
      "inline_data": [
        {"id": 1, "name": "Free", "price": 0.0, "features": "Basic features"},
        {"id": 2, "name": "Basic", "price": 9.99, "features": "All free + analytics"},
        {"id": 3, "name": "Premium", "price": 19.99, "features": "All basic + priority support"},
        {"id": 4, "name": "Enterprise", "price": 49.99, "features": "All premium + custom integrations"}
      ]
    },
    {
      "name": "exercises",
      "is_reference": true,
      "inline_data": [
        {"id": 1, "name": "Running", "category": "Cardio", "calories_per_minute": 10},
        {"id": 2, "name": "Cycling", "category": "Cardio", "calories_per_minute": 8},
        {"id": 3, "name": "Yoga", "category": "Flexibility", "calories_per_minute": 3},
        {"id": 4, "name": "Weightlifting", "category": "Strength", "calories_per_minute": 6},
        {"id": 5, "name": "Swimming", "category": "Cardio", "calories_per_minute": 9},
        {"id": 6, "name": "HIIT", "category": "Cardio", "calories_per_minute": 12},
        {"id": 7, "name": "Pilates", "category": "Flexibility", "calories_per_minute": 4},
        {"id": 8, "name": "Boxing", "category": "Cardio", "calories_per_minute": 11}
      ]
    },
    {
      "name": "users",
      "row_count": 50000,
      "is_reference": false
    },
    {
      "name": "subscriptions",
      "row_count": 20000,
      "is_reference": false
    },
    {
      "name": "workouts",
      "row_count": 100000,
      "is_reference": false
    }
  ],
  "columns": {
    "users": [
      {"name": "id", "type": "int", "distribution_params": {"distribution": "uniform", "min": 1, "max": 50000}, "unique": true},
      {"name": "name", "type": "text", "distribution_params": {"text_type": "name"}},
      {"name": "email", "type": "text", "distribution_params": {"text_type": "email"}},
      {"name": "age", "type": "int", "distribution_params": {"distribution": "uniform", "min": 18, "max": 65}}
    ],
    "subscriptions": [
      {"name": "id", "type": "int", "distribution_params": {"distribution": "uniform", "min": 1, "max": 20000}},
      {"name": "user_id", "type": "foreign_key", "distribution_params": {}},
      {"name": "plan_id", "type": "foreign_key", "distribution_params": {}},
      {"name": "status", "type": "categorical", "distribution_params": {"choices": ["active", "cancelled", "paused"], "probabilities": [0.7, 0.2, 0.1]}},
      {"name": "start_date", "type": "date", "distribution_params": {"start": "2022-01-01", "end": "2024-12-31"}}
    ],
    "workouts": [
      {"name": "id", "type": "int", "distribution_params": {"distribution": "uniform", "min": 1, "max": 100000}},
      {"name": "user_id", "type": "foreign_key", "distribution_params": {}},
      {"name": "exercise_id", "type": "foreign_key", "distribution_params": {}},
      {"name": "duration_minutes", "type": "int", "distribution_params": {"distribution": "uniform", "min": 15, "max": 90}},
      {"name": "date", "type": "date", "distribution_params": {"start": "2023-01-01", "end": "2024-12-31"}}
    ]
  },
  "relationships": [
    {"parent_table": "users", "child_table": "subscriptions", "parent_key": "id", "child_key": "user_id"},
    {"parent_table": "plans", "child_table": "subscriptions", "parent_key": "id", "child_key": "plan_id"},
    {"parent_table": "users", "child_table": "workouts", "parent_key": "id", "child_key": "user_id"},
    {"parent_table": "exercises", "child_table": "workouts", "parent_key": "id", "child_key": "exercise_id"}
  ],
  "events": []
}

## KEY POINTS:
- Reference tables have ACTUAL DATA in inline_data (plans with real prices!)
- Transactional tables use foreign_key to REFERENCE those tables
- When workouts.exercise_id = 3, it means "Yoga" because the exercises table has {id: 3, name: "Yoga"}

Generate schemas following this exact pattern. The reference table inline_data is the source of truth."""


GRAPH_REVERSE_PROMPT = """You are Misata, an expert at reverse-engineering data patterns.
Given a description of a desired chart or graph pattern, generate a schema that will
produce data matching that EXACT pattern when plotted.

Follow the same two-tier table structure:
- Reference tables with inline_data for lookup values
- Transactional tables with foreign keys for mass data

The user will describe a chart they want. Your job is to generate data that,
when plotted, produces that exact chart."""


class LLMSchemaGenerator:
    """
    Generate realistic schemas from natural language using LLMs.

    Supports multiple providers:
    - groq: Groq Cloud (Llama 3.3) - Fast, free tier
    - openai: OpenAI (GPT-4o) - Best quality
    - ollama: Local Ollama - Free, private

    This is the "brain" of Misata - what makes it genuinely AI-powered.
    """

    # Provider configurations
    PROVIDERS = {
        "groq": {
            "base_url": None,  # Uses default
            "env_key": "GROQ_API_KEY",
            "default_model": "llama-3.3-70b-versatile",
        },
        "openai": {
            "base_url": None,
            "env_key": "OPENAI_API_KEY",
            "default_model": "gpt-4o-mini",
        },
        "ollama": {
            "base_url": "http://localhost:11434/v1",
            "env_key": None,  # No key needed for local
            "default_model": "llama3",
        },
    }
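    # Provider selection (sketch): the active provider comes from the
    # MISATA_PROVIDER env var unless passed explicitly, and each provider
    # reads its key from the env var named above, e.g.:
    #   MISATA_PROVIDER=ollama  -> local Ollama, no API key required
    #   GROQ_API_KEY=gsk_...    -> used when the provider is "groq" (the default)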

    def __init__(
        self,
        provider: Optional[str] = None,
        api_key: Optional[str] = None,
        model: Optional[str] = None,
        base_url: Optional[str] = None,
    ):
        """
        Initialize the LLM generator.

        Args:
            provider: LLM provider ("groq", "openai", "ollama").
                Defaults to MISATA_PROVIDER env var or "groq".
            api_key: API key. If not provided, reads from the provider's env var.
            model: Model name. If not provided, uses the provider default.
            base_url: Custom API base URL (for Ollama or compatible APIs).
        """
        # Determine provider (case-insensitive, whether passed in or read from env)
        self.provider = (provider or os.environ.get("MISATA_PROVIDER", "groq")).lower()

        if self.provider not in self.PROVIDERS:
            raise ValueError(f"Unknown provider: {self.provider}. Use: {list(self.PROVIDERS.keys())}")

        config = self.PROVIDERS[self.provider]

        # Get API key
        self.api_key = api_key
        if not self.api_key and config["env_key"]:
            self.api_key = os.environ.get(config["env_key"])

        if not self.api_key and self.provider != "ollama":
            env_key = config["env_key"]
            raise ValueError(
                f"{self.provider.title()} API key required. "
                f"Set {env_key} environment variable or pass api_key parameter."
            )

        # Set model
        self.model = model or config["default_model"]

        # Set base URL
        self.base_url = base_url or config["base_url"]

        # Initialize client (all providers use an OpenAI-compatible API)
        if self.provider == "groq":
            self.client = Groq(api_key=self.api_key)
        else:
            # OpenAI and Ollama use the openai package
            try:
                from openai import OpenAI
            except ImportError:
                raise ImportError(
                    f"openai package required for {self.provider}. "
                    "Install with: pip install openai"
                )

            client_kwargs = {}
            if self.api_key:
                client_kwargs["api_key"] = self.api_key
            if self.base_url:
                client_kwargs["base_url"] = self.base_url

            # Ollama doesn't need a real API key
            if self.provider == "ollama":
                client_kwargs["api_key"] = "ollama"

            self.client = OpenAI(**client_kwargs)
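    # Construction examples (sketch; model names fall back to the defaults above):
    #   LLMSchemaGenerator()                     # Groq, needs GROQ_API_KEY
    #   LLMSchemaGenerator(provider="openai")    # needs OPENAI_API_KEY
    #   LLMSchemaGenerator(provider="ollama")    # local, no key needed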

    def generate_from_story(
        self,
        story: str,
        default_rows: int = 10000,
        temperature: float = 0.3,
    ) -> SchemaConfig:
        """
        Generate a realistic schema from a natural language story.

        Args:
            story: Natural language description of the data needs
            default_rows: Default row count if not specified in the story
            temperature: LLM temperature (lower = more consistent)

        Returns:
            SchemaConfig ready for data generation
        """
        user_prompt = f"""Generate a complete synthetic data schema in JSON format for:

{story}

IMPORTANT:
1. Create REFERENCE TABLES with inline_data for: plans, exercises, categories, products, etc.
2. Create TRANSACTIONAL TABLES with row_count for: users, subscriptions, orders, workouts, etc.
3. Use foreign_key to link transactional tables to reference tables
4. Default row count for transactional tables: {default_rows}

Output valid JSON. Think about what lookup/reference data is needed, then what transactional data references it."""

        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt}
            ],
            temperature=temperature,
            max_tokens=6000,
            response_format={"type": "json_object"}
        )

        schema_dict = json.loads(response.choices[0].message.content)
        return self._parse_schema(schema_dict)

    def generate_from_graph(
        self,
        graph_description: str,
        temperature: float = 0.2,
    ) -> SchemaConfig:
        """
        REVERSE ENGINEERING: Generate a schema that produces the desired graph pattern.
        """
        user_prompt = f"""Generate a JSON schema that will produce this chart pattern:

{graph_description}

Include reference tables with inline_data for lookup values and transactional tables for mass data. Output valid JSON."""

        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": GRAPH_REVERSE_PROMPT},
                {"role": "user", "content": user_prompt}
            ],
            temperature=temperature,
            max_tokens=6000,
            response_format={"type": "json_object"}
        )

        schema_dict = json.loads(response.choices[0].message.content)
        return self._parse_schema(schema_dict)

    def _normalize_distribution_params(self, col_type: str, params: Dict) -> Dict:
        """Normalize LLM output variations in distribution_params."""
        normalized = params.copy()

        # Normalize date column parameters
        if col_type == "date":
            if "start_date" in normalized and "start" not in normalized:
                normalized["start"] = normalized.pop("start_date")
            if "end_date" in normalized and "end" not in normalized:
                normalized["end"] = normalized.pop("end_date")
            if "start" not in normalized:
                normalized["start"] = "2023-01-01"
            if "end" not in normalized:
                normalized["end"] = "2024-12-31"

        # Normalize categorical parameters
        if col_type == "categorical":
            if "options" in normalized and "choices" not in normalized:
                normalized["choices"] = normalized.pop("options")
            if "choices" not in normalized:
                normalized["choices"] = ["A", "B", "C"]

        # Curve fitting for 'control_points'
        if "control_points" in normalized:
            try:
                points = normalized.pop("control_points")
                dist_type = normalized.get("distribution", "normal")
                fitter = CurveFitter()
                fitted_params = fitter.fit_distribution(points, dist_type)
                # Fitted parameters overwrite any manually provided values
                normalized.update(fitted_params)
            except Exception:
                # If fitting fails, fall back to whatever parameters we already have
                pass

        return normalized
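    # Normalization examples (sketch, derived from the branches above):
    #   date:        {"start_date": "2023-01-01"}  -> {"start": "2023-01-01", "end": "2024-12-31"}
    #   categorical: {"options": ["a", "b"]}       -> {"choices": ["a", "b"]}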

    def _parse_schema(self, schema_dict: Dict) -> SchemaConfig:
        """Parse LLM output into a validated SchemaConfig."""

        # Parse tables
        tables = []
        for t in schema_dict.get("tables", []):
            is_ref = t.get("is_reference", False)
            inline = t.get("inline_data", None)
            row_count = t.get("row_count", len(inline) if inline else 100)

            tables.append(Table(
                name=t["name"],
                row_count=row_count,
                description=t.get("description"),
                is_reference=is_ref,
                inline_data=inline
            ))

        # Parse columns (only for transactional tables; reference tables use inline_data)
        columns = {}
        for table_name, cols in schema_dict.get("columns", {}).items():
            columns[table_name] = []
            for c in cols:
                col_type = c.get("type", "text")
                raw_params = c.get("distribution_params", {})
                normalized_params = self._normalize_distribution_params(col_type, raw_params)

                columns[table_name].append(Column(
                    name=c["name"],
                    type=col_type,
                    distribution_params=normalized_params,
                    nullable=c.get("nullable", False),
                    unique=c.get("unique", False)
                ))

        # For reference tables without columns, create columns from inline_data
        for table in tables:
            if table.is_reference and table.inline_data and table.name not in columns:
                # Infer column types from the first row of inline_data
                first_row = table.inline_data[0]
                columns[table.name] = []
                for col_name, value in first_row.items():
                    if isinstance(value, int):
                        col_type = "int"
                    elif isinstance(value, float):
                        col_type = "float"
                    else:
                        col_type = "text"
                    columns[table.name].append(Column(
                        name=col_name,
                        type=col_type,
                        distribution_params={}
                    ))

        # Parse relationships
        relationships = []
        for r in schema_dict.get("relationships", []):
            relationships.append(Relationship(
                parent_table=r["parent_table"],
                child_table=r["child_table"],
                parent_key=r["parent_key"],
                child_key=r["child_key"],
                temporal_constraint=r.get("temporal_constraint", False)
            ))

        # Parse events, skipping any that are missing required fields
        events = []
        for e in schema_dict.get("events", []):
            if not all(key in e for key in ["name", "table", "column", "condition", "modifier_type", "modifier_value"]):
                continue
            events.append(ScenarioEvent(
                name=e["name"],
                table=e["table"],
                column=e["column"],
                condition=e["condition"],
                modifier_type=e["modifier_type"],
                modifier_value=e["modifier_value"],
                description=e.get("description")
            ))

        return SchemaConfig(
            name=schema_dict.get("name", "Generated Dataset"),
            description=schema_dict.get("description"),
            tables=tables,
            columns=columns,
            relationships=relationships,
            events=events,
            seed=schema_dict.get("seed", 42)
        )


# Convenience functions
def generate_schema(story: str, api_key: Optional[str] = None) -> SchemaConfig:
    """Quick helper to generate a schema from a story."""
    generator = LLMSchemaGenerator(api_key=api_key)
    return generator.generate_from_story(story)


def generate_from_chart(description: str, api_key: Optional[str] = None) -> SchemaConfig:
    """Quick helper to reverse-engineer a schema from a chart description."""
    generator = LLMSchemaGenerator(api_key=api_key)
    return generator.generate_from_graph(description)
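
# Quick-start (sketch, assuming GROQ_API_KEY is set in the environment or a .env file):
#   from misata.llm_parser import generate_schema
#   config = generate_schema("A fitness app with users, plans, and workouts")
#   print(config.name, [t.name for t in config.tables])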