misata 0.1.0b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
misata/__init__.py ADDED
@@ -0,0 +1,48 @@
+ """
+ Misata - AI-Powered Synthetic Data Engine
+
+ Generate realistic multi-table datasets from natural language descriptions.
+ Supports OpenAI, Groq, Gemini, and Ollama for intelligent schema generation.
+
+ Usage:
+     from misata import DataSimulator, SchemaConfig
+
+     # Or use the CLI:
+     # misata generate --story "A SaaS with 50k users..."
+ """
+
+ __version__ = "0.1.0-beta"
+ __author__ = "Muhammed Rasin"
+
+ from misata.schema import (
+     Column,
+     Constraint,
+     Relationship,
+     ScenarioEvent,
+     SchemaConfig,
+     Table,
+ )
+ from misata.simulator import DataSimulator
+ from misata.generators import TextGenerator
+ from misata.noise import NoiseInjector, add_noise
+ from misata.customization import Customizer, ColumnOverride
+
+ __all__ = [
+     # Core
+     "Column",
+     "Constraint",
+     "Relationship",
+     "ScenarioEvent",
+     "SchemaConfig",
+     "Table",
+     "DataSimulator",
+     # Extensibility
+     "TextGenerator",
+     # ML-ready features
+     "NoiseInjector",
+     "add_noise",
+     "Customizer",
+     "ColumnOverride",
+ ]
+
+
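A minimal end-to-end sketch of the library API follows, inferred from how misata/api.py (below) drives the simulator. The config field names here are hypothetical placeholders; the authoritative shapes are the models in misata.schema.

    from misata import DataSimulator, SchemaConfig

    # Hypothetical minimal config; the real field shapes are defined in misata.schema.
    config = SchemaConfig(**{
        "tables": [{"name": "users", "row_count": 1000}],
        "columns": {"users": [{"name": "plan", "type": "categorical"}]},
        "relationships": [],
        "events": [],
        "seed": 42,
    })

    simulator = DataSimulator(config)
    # generate_all() yields (table_name, batch_df) pairs, as consumed in api.py below.
    for table_name, batch_df in simulator.generate_all():
        print(table_name, len(batch_df))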
misata/api.py ADDED
@@ -0,0 +1,460 @@
+ """
+ Misata API - FastAPI backend for the web UI.
+
+ Provides REST endpoints for:
+ - Story-to-schema generation (LLM-powered)
+ - Graph-to-data reverse engineering
+ - Data generation and preview
+ - Schema validation and export
+ """
+
+ import io
+ import os
+ import tempfile
+ import zipfile
+ from typing import Any, Dict, List, Optional
+
+ from fastapi import FastAPI, HTTPException, BackgroundTasks
+ from fastapi.middleware.cors import CORSMiddleware
+ from fastapi.responses import StreamingResponse
+ from pydantic import BaseModel
+
+ from misata import DataSimulator, SchemaConfig
+ from misata.llm_parser import LLMSchemaGenerator
+
+
+ # ============================================================================
+ # Request/Response Models
+ # ============================================================================
+
+ class StoryRequest(BaseModel):
+     """Request to generate schema from story."""
+     story: str
+     default_rows: int = 10000
+
+
+ class GraphRequest(BaseModel):
+     """Request to generate schema from graph description."""
+     description: str
+     chart_type: str = "line"
+
+
+ class GenerateRequest(BaseModel):
+     """Request to generate data from schema."""
+     schema_config: Dict[str, Any]
+     seed: Optional[int] = None
+
+
+ class EnhanceRequest(BaseModel):
+     """Request to enhance existing schema."""
+     schema_config: Dict[str, Any]
+     enhancement: str
+
+
+ class IndustrySuggestionsRequest(BaseModel):
+     """Request for industry-specific improvements."""
+     schema_config: Dict[str, Any]
+     industry: str
+
+
+ class SchemaResponse(BaseModel):
+     """Response containing generated schema."""
+     schema_config: Dict[str, Any]
+     tables_count: int
+     total_rows: int
+
+
+ class DataPreviewResponse(BaseModel):
+     """Response containing data preview."""
+     tables: Dict[str, List[Dict[str, Any]]]
+     stats: Dict[str, Dict[str, Any]]
+     download_id: str
+
+
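For reference, a request body matching StoryRequest serializes as below (a sketch; default_rows falls back to 10000 when omitted, and the other models follow the same field-per-key pattern):

    story_request = {
        "story": "A SaaS with 50k users, subscriptions, and monthly churn",
        "default_rows": 10000,
    }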
+ # ============================================================================
+ # FastAPI App
+ # ============================================================================
+
+ app = FastAPI(
+     title="Misata API",
+     description="AI-Powered Synthetic Data Engine",
+     version="2.0.0",
+     docs_url="/docs",
+     redoc_url="/redoc"
+ )
+
+ # CORS for web UI
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],  # In production, restrict to your domain
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
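One way to tighten the wildcard origin above is to drive the list from the environment. A sketch, not part of the package; MISATA_ALLOWED_ORIGINS is a hypothetical variable this code does not read:

    import os

    # Comma-separated trusted origins, e.g. "https://app.example.com"
    allowed_origins = os.environ.get(
        "MISATA_ALLOWED_ORIGINS", "http://localhost:3000"
    ).split(",")

    app.add_middleware(
        CORSMiddleware,
        allow_origins=allowed_origins,
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )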
+ # In-memory storage for generated data paths (ID -> directory path)
+ _generated_files: Dict[str, str] = {}
+
+
+
+ # ============================================================================
+ # Health Check
+ # ============================================================================
+
+ @app.get("/")
+ async def root():
+     """Health check and API info."""
+     return {
+         "name": "Misata API",
+         "version": "2.0.0",
+         "status": "healthy",
+         "docs": "/docs"
+     }
+
+
+ @app.get("/api/health")
+ async def health_check():
+     """Detailed health check."""
+     groq_key_set = bool(os.environ.get("GROQ_API_KEY"))
+     return {
+         "status": "healthy",
+         "groq_configured": groq_key_set,
+         "message": "Ready to generate synthetic data!" if groq_key_set else "Set GROQ_API_KEY for LLM features"
+     }
+
+
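Assuming the server is running locally on port 8000, the health endpoint can be probed with the requests library:

    import requests

    health = requests.get("http://localhost:8000/api/health", timeout=10).json()
    # e.g. {"status": "healthy", "groq_configured": False,
    #       "message": "Set GROQ_API_KEY for LLM features"}
    print(health)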
+ # ============================================================================
+ # Schema Generation Endpoints
+ # ============================================================================
+
+ @app.post("/api/generate-schema", response_model=SchemaResponse)
+ async def generate_schema_from_story(request: StoryRequest):
+     """
+     Generate a schema from a natural language story using an LLM.
+
+     This is the core AI feature: describe your data needs in plain English.
+     """
+     try:
+         llm = LLMSchemaGenerator()
+         schema = llm.generate_from_story(
+             request.story,
+             default_rows=request.default_rows
+         )
+
+         return SchemaResponse(
+             schema_config=schema.model_dump(),
+             tables_count=len(schema.tables),
+             total_rows=sum(t.row_count for t in schema.tables)
+         )
+
+     except ValueError as e:
+         raise HTTPException(status_code=400, detail=str(e))
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Schema generation failed: {str(e)}")
+
+
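A client-side sketch for this endpoint, assuming a local server with GROQ_API_KEY configured:

    import requests

    resp = requests.post(
        "http://localhost:8000/api/generate-schema",
        json={"story": "A SaaS with 50k users and monthly subscriptions",
              "default_rows": 10000},
        timeout=120,  # LLM calls can take a while
    )
    resp.raise_for_status()
    schema_config = resp.json()["schema_config"]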
+ @app.post("/api/generate-from-graph", response_model=SchemaResponse)
+ async def generate_schema_from_graph(request: GraphRequest):
+     """
+     REVERSE ENGINEERING: Generate a schema that produces the desired chart patterns.
+
+     Describe your chart, get data that matches it.
+     """
+     try:
+         llm = LLMSchemaGenerator()
+         schema = llm.generate_from_graph(request.description)
+
+         return SchemaResponse(
+             schema_config=schema.model_dump(),
+             tables_count=len(schema.tables),
+             total_rows=sum(t.row_count for t in schema.tables)
+         )
+
+     except ValueError as e:
+         raise HTTPException(status_code=400, detail=str(e))
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Graph schema generation failed: {str(e)}")
+
+
+ @app.post("/api/enhance-schema", response_model=SchemaResponse)
+ async def enhance_schema(request: EnhanceRequest):
+     """
+     Enhance an existing schema with additional requirements.
+     """
+     try:
+         llm = LLMSchemaGenerator()
+         existing = SchemaConfig(**request.schema_config)
+         enhanced = llm.enhance_schema(existing, request.enhancement)
+
+         return SchemaResponse(
+             schema_config=enhanced.model_dump(),
+             tables_count=len(enhanced.tables),
+             total_rows=sum(t.row_count for t in enhanced.tables)
+         )
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Schema enhancement failed: {str(e)}")
+
+
+ @app.post("/api/industry-suggestions")
+ async def get_industry_suggestions(request: IndustrySuggestionsRequest):
+     """
+     Get AI suggestions for making data more industry-realistic.
+     """
+     try:
+         llm = LLMSchemaGenerator()
+         schema = SchemaConfig(**request.schema_config)
+         suggestions = llm.suggest_industry_improvements(schema, request.industry)
+
+         return suggestions
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Suggestions failed: {str(e)}")
+
+
+ # ============================================================================
+ # Data Generation Endpoints
+ # ============================================================================
+
+ @app.post("/api/generate-data", response_model=DataPreviewResponse)
+ async def generate_data(request: GenerateRequest, background_tasks: BackgroundTasks):
+     """
+     Generate synthetic data from a schema configuration.
+
+     Returns a preview (first 100 rows per table) and a download ID for the full data.
+     """
+     try:
+         schema = SchemaConfig(**request.schema_config)
+
+         if request.seed is not None:
+             schema.seed = request.seed
+
+         simulator = DataSimulator(schema)
+
+         # Create temp directory for this generation
+         import uuid
+
+         download_id = str(uuid.uuid4())
+         temp_dir = tempfile.mkdtemp(prefix=f"misata_{download_id}_")
+         _generated_files[download_id] = temp_dir
+
+         # Build preview and stats
+         preview = {}
+         stats = {}
+         files_created = set()
+
+         # Generate and stream to disk
+         for table_name, batch_df in simulator.generate_all():
+             output_path = os.path.join(temp_dir, f"{table_name}.csv")
+             mode = 'a' if table_name in files_created else 'w'
+             header = table_name not in files_created
+
+             batch_df.to_csv(output_path, mode=mode, header=header, index=False)
+             files_created.add(table_name)
+
+             # Use first batch for preview/stats if we haven't seen this table yet
+             if table_name not in preview:
+                 preview_df = batch_df.head(100)
+                 preview[table_name] = preview_df.to_dict(orient="records")
+
+                 # Basic stats come from the first batch (approximate, for speed).
+                 # The true total is only known once streaming finishes, so the
+                 # row count reported is the one configured in the schema.
+                 stats[table_name] = {
+                     "row_count": next(
+                         (t.row_count for t in schema.tables if t.name == table_name),
+                         len(batch_df),
+                     ),
+                     "columns": list(batch_df.columns),
+                     "memory_mb": 0.0,  # data lives on disk, not in memory
+                     "numeric_stats": {}
+                 }
+
+                 for col in batch_df.select_dtypes(include=["number"]).columns:
+                     stats[table_name]["numeric_stats"][col] = {
+                         "mean": float(batch_df[col].mean()),
+                         "std": float(batch_df[col].std()),
+                         "min": float(batch_df[col].min()),
+                         "max": float(batch_df[col].max())
+                     }
+
+         # Clean up old data after 1 hour (in background)
+         background_tasks.add_task(cleanup_old_data, download_id, 3600)
+
+         return DataPreviewResponse(
+             tables=preview,
+             stats=stats,
+             download_id=download_id
+         )
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Data generation failed: {str(e)}")
+
+
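Continuing the client sketch from /api/generate-schema, the returned schema_config can be posted straight to this endpoint; the response carries the preview tables plus the download_id used by /api/download below:

    import requests

    resp = requests.post(
        "http://localhost:8000/api/generate-data",
        json={"schema_config": schema_config, "seed": 7},
        timeout=600,  # large datasets take time to stream to disk
    )
    resp.raise_for_status()
    body = resp.json()
    download_id = body["download_id"]
    print(body["stats"])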
+ @app.get("/api/download/{download_id}")
+ async def download_data(download_id: str, format: str = "csv"):
+     """
+     Download generated data as a ZIP of CSVs, or as JSON.
+     """
+     if download_id not in _generated_files:
+         raise HTTPException(status_code=404, detail="Data not found. It may have expired.")
+
+     temp_dir = _generated_files[download_id]
+
+     if format == "csv":
+         # Create a ZIP from the CSV files in the temp directory
+         zip_buffer = io.BytesIO()
+         with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zf:
+             for filename in os.listdir(temp_dir):
+                 if filename.endswith(".csv"):
+                     file_path = os.path.join(temp_dir, filename)
+                     zf.write(file_path, arcname=filename)
+
+         zip_buffer.seek(0)
+         return StreamingResponse(
+             zip_buffer,
+             media_type="application/zip",
+             headers={"Content-Disposition": f"attachment; filename=misata_data_{download_id[:8]}.zip"}
+         )
+
+     elif format == "json":
+         # Convert the CSVs to JSON (warning: the response can be large)
+         import pandas as pd
+         json_data = {}
+         for filename in os.listdir(temp_dir):
+             if filename.endswith(".csv"):
+                 table_name = filename[:-4]
+                 file_path = os.path.join(temp_dir, filename)
+                 df = pd.read_csv(file_path)
+                 json_data[table_name] = df.to_dict(orient="records")
+         return json_data
+
+     else:
+         raise HTTPException(status_code=400, detail=f"Unsupported format: {format}")
+
+
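Fetching the ZIP with the download_id from the previous response (a sketch; stream=True avoids buffering a large archive in client memory):

    import requests

    url = f"http://localhost:8000/api/download/{download_id}"
    with requests.get(url, params={"format": "csv"}, stream=True, timeout=600) as r:
        r.raise_for_status()
        with open("misata_data.zip", "wb") as f:
            for chunk in r.iter_content(chunk_size=1 << 20):
                f.write(chunk)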
+ async def cleanup_old_data(download_id: str, delay_seconds: int):
+     """Clean up generated data files after a delay."""
+     import asyncio
+     import shutil
+
+     # Scheduled via BackgroundTasks, so this runs after the response is sent.
+     await asyncio.sleep(delay_seconds)
+     if download_id in _generated_files:
+         temp_dir = _generated_files[download_id]
+         shutil.rmtree(temp_dir, ignore_errors=True)
+         del _generated_files[download_id]
+
+
+ # ============================================================================
+ # Validation Endpoints
+ # ============================================================================
+
+ @app.post("/api/validate-schema")
+ async def validate_schema(schema_config: Dict[str, Any]):
+     """
+     Validate a schema configuration.
+     """
+     try:
+         schema = SchemaConfig(**schema_config)
+         return {
+             "valid": True,
+             "tables": len(schema.tables),
+             "columns": sum(len(cols) for cols in schema.columns.values()),
+             "relationships": len(schema.relationships),
+             "events": len(schema.events)
+         }
+     except Exception as e:
+         return {
+             "valid": False,
+             "error": str(e)
+         }
+
+
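Because the endpoint takes the schema dict as the raw request body rather than a wrapper model, a client call passes it directly as JSON:

    import requests

    result = requests.post(
        "http://localhost:8000/api/validate-schema",
        json=schema_config,
        timeout=30,
    ).json()
    # {"valid": True, "tables": ..., ...} on success, {"valid": False, "error": "..."} otherwise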
+ @app.post("/api/preview-distribution")
+ async def preview_distribution(
+     column_type: str,
+     distribution_params: Dict[str, Any],
+     sample_size: int = 1000
+ ):
+     """
+     Preview what a distribution will look like before generating.
+     """
+     import numpy as np
+
+     rng = np.random.default_rng(42)
+
+     if column_type in ["int", "float"]:
+         dist = distribution_params.get("distribution", "normal")
+
+         if dist == "normal":
+             values = rng.normal(
+                 distribution_params.get("mean", 100),
+                 distribution_params.get("std", 20),
+                 sample_size
+             )
+         elif dist == "uniform":
+             values = rng.uniform(
+                 distribution_params.get("min", 0),
+                 distribution_params.get("max", 100),
+                 sample_size
+             )
+         elif dist == "exponential":
+             values = rng.exponential(
+                 distribution_params.get("scale", 1.0),
+                 sample_size
+             )
+         else:
+             values = rng.normal(100, 20, sample_size)
+
+         # Apply min/max constraints by clipping
+         if "min" in distribution_params:
+             values = np.maximum(values, distribution_params["min"])
+         if "max" in distribution_params:
+             values = np.minimum(values, distribution_params["max"])
+
+         if column_type == "int":
+             values = values.astype(int)
+
+         # Return histogram data
+         hist, bin_edges = np.histogram(values, bins=50)
+
+         return {
+             "histogram": {
+                 "counts": hist.tolist(),
+                 "bin_edges": bin_edges.tolist()
+             },
+             "stats": {
+                 "mean": float(values.mean()),
+                 "std": float(values.std()),
+                 "min": float(values.min()),
+                 "max": float(values.max())
+             },
+             "sample": values[:20].tolist()
+         }
+
+     elif column_type == "categorical":
+         choices = distribution_params.get("choices", ["A", "B", "C"])
+         probs = distribution_params.get("probabilities")
+
+         if probs:
+             probs = np.array(probs)
+             probs = probs / probs.sum()  # normalize so the weights sum to 1
+
+         values = rng.choice(choices, size=sample_size, p=probs)
+         unique, counts = np.unique(values, return_counts=True)
+
+         return {
+             "distribution": {choice: int(count) for choice, count in zip(unique, counts)},
+             "sample": values[:20].tolist()
+         }
+
+     else:
+         return {"error": f"Preview not supported for type: {column_type}"}
+
+
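Note how FastAPI binds the parameters above: column_type and sample_size are scalars, so they arrive as query parameters, while distribution_params, a dict, is read from the JSON body. A call sketch:

    import requests

    resp = requests.post(
        "http://localhost:8000/api/preview-distribution",
        params={"column_type": "float", "sample_size": 1000},
        json={"distribution": "normal", "mean": 50, "std": 10, "min": 0},
    )
    print(resp.json()["stats"])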
+ # ============================================================================
+ # Run Server
+ # ============================================================================
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=8000)
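For development, the same app can also be served with auto-reload via the standard uvicorn CLI instead of running the module directly:

    uvicorn misata.api:app --host 0.0.0.0 --port 8000 --reload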