misata 0.1.0b0__py3-none-any.whl → 0.3.0b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
misata/smart_values.py ADDED
@@ -0,0 +1,762 @@
1
+ """
2
+ LLM-powered smart value generation for context-aware data.
3
+
4
+ This module generates realistic domain-specific values by:
5
+ 1. Detecting semantic domain from column/table names
6
+ 2. Using LLM to generate domain-appropriate data pools
7
+ 3. Caching pools for fast repeated generation
8
+ """
9
+
10
+ import json
11
+ import os
12
+ import hashlib
13
+ from typing import Dict, List, Optional, Any
14
+ from pathlib import Path
15
+
16
+
17
class SmartValueGenerator:
    """
    Generate context-aware realistic values using LLM.

    Detects domains from column/table names and generates
    appropriate data pools using LLM or curated fallbacks.

    Pools are memoized per instance and, when they come from the LLM,
    persisted as JSON files in ``cache_dir``. Every failure on the LLM or
    cache path degrades to the curated ``FALLBACK_POOLS`` instead of raising.
    """

    # Domain detection patterns
    DOMAIN_PATTERNS = {
        # Medical
        "disease": ["disease", "diagnosis", "condition", "illness", "ailment", "disorder", "pathology"],
        "prescription": ["prescription", "medication", "drug", "medicine", "rx", "pharma"],
        "procedure": ["procedure", "surgery", "treatment", "operation", "therapy", "intervention"],
        "symptom": ["symptom", "complaint", "sign", "manifestation"],
        "medical_specialty": ["specialty", "department", "ward"],

        # Legal
        "case_type": ["case_type", "legal_matter", "litigation_type"],
        "law_firm": ["law_firm", "legal_firm", "attorney_firm"],
        "legal_status": ["legal_status", "case_status", "court_status"],

        # Retail/E-commerce
        "product": ["product", "item", "merchandise", "goods"],
        "category": ["category", "department", "section"],
        "brand": ["brand", "manufacturer", "vendor"],

        # Finance
        "transaction_type": ["transaction_type", "payment_type", "transfer_type"],
        "account_type": ["account_type", "bank_account", "financial_account"],

        # HR/Employment
        "job_title": ["job_title", "position", "role", "designation"],
        "department": ["department", "division", "unit", "team"],
        "skill": ["skill", "competency", "expertise", "qualification"],

        # NEW: Food & Restaurant
        "restaurant_name": ["restaurant", "diner", "cafe", "eatery", "bistro", "tavern"],
        "cuisine_type": ["cuisine", "food_type", "culinary"],
        "menu_item": ["menu_item", "dish", "meal", "entree", "appetizer"],

        # NEW: Education
        "course_name": ["course", "class", "lecture", "module", "subject"],
        "university": ["university", "college", "institution", "school"],
        "degree": ["degree", "certification", "diploma", "qualification"],

        # NEW: Events & Meetings
        "event_name": ["event", "conference", "meeting", "workshop", "seminar", "webinar"],
        "venue": ["venue", "location", "hall", "auditorium", "center"],

        # NEW: Projects & Work
        "project_name": ["project", "initiative", "campaign", "program"],
        "task_name": ["task", "todo", "action_item", "work_item"],
        "milestone": ["milestone", "deliverable", "goal", "objective"],

        # NEW: Reviews & Feedback
        "review_title": ["review_title", "feedback_title", "comment_title"],
        "review_text": ["review", "feedback", "comment", "testimonial", "opinion"],

        # NEW: Location
        "city": ["city", "town", "municipality", "metro"],
        "country": ["country", "nation", "region"],
        "address": ["address", "location", "street", "postal"],

        # NEW: Business
        "company_name": ["company", "organization", "business", "corporation", "enterprise", "firm"],
        "industry": ["industry", "sector", "vertical", "market"],

        # NEW: Tech/Software
        "feature_name": ["feature", "capability", "functionality"],
        "bug_type": ["bug", "issue", "defect", "error"],
        "api_endpoint": ["endpoint", "api", "route", "path"],
    }

    # Curated fallback pools (no LLM needed)
    FALLBACK_POOLS = {
        "disease": [
            "Type 2 Diabetes Mellitus", "Essential Hypertension", "Chronic Obstructive Pulmonary Disease",
            "Major Depressive Disorder", "Generalized Anxiety Disorder", "Acute Myocardial Infarction",
            "Atrial Fibrillation", "Chronic Kidney Disease Stage 3", "Rheumatoid Arthritis",
            "Osteoarthritis", "Migraine without Aura", "Asthma", "Hypothyroidism", "Hyperlipidemia",
            "Gastroesophageal Reflux Disease", "Irritable Bowel Syndrome", "Obesity", "Sleep Apnea",
            "Chronic Lower Back Pain", "Urinary Tract Infection", "Pneumonia", "Bronchitis",
            "Anemia", "Osteoporosis", "Fibromyalgia", "Seizure Disorder", "Glaucoma",
            "Allergic Rhinitis", "Eczema", "Psoriasis", "Hepatitis C", "Cirrhosis",
            "Congestive Heart Failure", "Coronary Artery Disease", "Peripheral Artery Disease",
            "Deep Vein Thrombosis", "Pulmonary Embolism", "Stroke", "Transient Ischemic Attack",
            "Multiple Sclerosis", "Parkinson's Disease", "Alzheimer's Disease", "Epilepsy",
            "Lupus", "Crohn's Disease", "Ulcerative Colitis", "Celiac Disease",
            "Polycystic Ovary Syndrome", "Endometriosis", "Benign Prostatic Hyperplasia",
        ],
        "prescription": [
            "Metformin 500mg - Take twice daily with meals",
            "Lisinopril 10mg - Take once daily",
            "Atorvastatin 20mg - Take at bedtime",
            "Levothyroxine 50mcg - Take on empty stomach",
            "Amlodipine 5mg - Take once daily",
            "Metoprolol 25mg - Take twice daily",
            "Omeprazole 20mg - Take before breakfast",
            "Sertraline 50mg - Take once daily",
            "Gabapentin 300mg - Take three times daily",
            "Tramadol 50mg - Take as needed for pain",
            "Prednisone 10mg - Take with food",
            "Albuterol Inhaler - Use as needed for breathing",
            "Fluticasone Nasal Spray - Use twice daily",
            "Insulin Glargine 20 units - Inject at bedtime",
            "Warfarin 5mg - Take as directed with INR monitoring",
            "Clopidogrel 75mg - Take once daily",
            "Furosemide 40mg - Take in the morning",
            "Losartan 50mg - Take once daily",
            "Hydrochlorothiazide 25mg - Take in the morning",
            "Duloxetine 60mg - Take once daily",
            "Escitalopram 10mg - Take once daily",
            "Alprazolam 0.5mg - Take as needed for anxiety",
            "Zolpidem 10mg - Take at bedtime as needed",
            "Simvastatin 40mg - Take at bedtime",
            "Pantoprazole 40mg - Take before breakfast",
        ],
        "procedure": [
            "Complete Blood Count", "Comprehensive Metabolic Panel", "Lipid Panel",
            "Chest X-Ray", "CT Scan - Abdomen", "MRI - Brain", "Echocardiogram",
            "Colonoscopy", "Upper Endoscopy", "Cardiac Catheterization",
            "Knee Arthroscopy", "Laparoscopic Cholecystectomy", "Appendectomy",
            "Total Hip Replacement", "Total Knee Replacement", "Coronary Artery Bypass",
            "Angioplasty with Stent Placement", "Pacemaker Implantation",
            "Lumbar Puncture", "Bone Marrow Biopsy", "Bronchoscopy",
            "Thyroidectomy", "Mastectomy", "Prostatectomy", "Hysterectomy",
            "Cataract Surgery", "LASIK Eye Surgery", "Tonsillectomy",
            "Cesarean Section", "Spinal Fusion", "Hernia Repair",
        ],
        "symptom": [
            "Chest pain", "Shortness of breath", "Fatigue", "Headache",
            "Dizziness", "Nausea", "Vomiting", "Abdominal pain", "Back pain",
            "Joint pain", "Muscle weakness", "Numbness", "Tingling sensation",
            "Blurred vision", "Hearing loss", "Cough", "Fever", "Chills",
            "Night sweats", "Weight loss", "Weight gain", "Loss of appetite",
            "Insomnia", "Excessive thirst", "Frequent urination", "Swelling",
            "Rash", "Itching", "Bruising", "Bleeding", "Difficulty swallowing",
            "Heartburn", "Constipation", "Diarrhea", "Blood in stool",
            "Difficulty concentrating", "Memory problems", "Anxiety", "Depression",
            "Palpitations", "Leg cramps", "Cold intolerance", "Heat intolerance",
        ],
        "job_title": [
            "Software Engineer", "Senior Software Engineer", "Staff Engineer",
            "Product Manager", "Senior Product Manager", "Director of Product",
            "Data Scientist", "Machine Learning Engineer", "Data Analyst",
            "UX Designer", "UI Designer", "Product Designer",
            "DevOps Engineer", "Site Reliability Engineer", "Platform Engineer",
            "Engineering Manager", "VP of Engineering", "CTO",
            "Sales Representative", "Account Executive", "Sales Manager",
            "Marketing Manager", "Content Strategist", "Growth Manager",
            "HR Manager", "Recruiter", "People Operations",
            "Financial Analyst", "Controller", "CFO",
            "Customer Success Manager", "Support Engineer", "Technical Writer",
        ],
        "department": [
            "Engineering", "Product", "Design", "Marketing", "Sales",
            "Human Resources", "Finance", "Operations", "Customer Success",
            "Research & Development", "Legal", "IT", "Security",
            "Quality Assurance", "Business Development", "Analytics",
            "Supply Chain", "Manufacturing", "Procurement",
        ],
        "product": [
            "Wireless Bluetooth Headphones", "Mechanical Gaming Keyboard",
            "Ultra HD 4K Monitor", "Ergonomic Office Chair",
            "Portable Power Bank 20000mAh", "Smart Home Speaker",
            "Fitness Tracking Smartwatch", "Noise Cancelling Earbuds",
            "USB-C Docking Station", "Laptop Cooling Pad",
            "Wireless Mouse", "Gaming Mouse Pad XL",
            "Webcam 1080p HD", "Ring Light with Tripod",
            "Desk Organizer Set", "Cable Management Kit",
        ],
        "category": [
            "Electronics", "Computers", "Office Supplies", "Home & Garden",
            "Sports & Outdoors", "Clothing", "Beauty & Personal Care",
            "Toys & Games", "Books", "Food & Grocery",
            "Automotive", "Health & Wellness", "Pet Supplies",
            "Baby & Kids", "Jewelry & Watches", "Arts & Crafts",
        ],
        # NEW DOMAIN POOLS
        "restaurant_name": [
            "The Golden Fork", "Bella Italia", "Tokyo Garden", "Blue Ocean Grill",
            "Mountain View Cafe", "The Rustic Table", "Sakura Sushi", "Le Petit Bistro",
            "Spice Route", "The Green Leaf", "Urban Kitchen", "Fire & Ice",
            "The Hungry Bear", "Sunset Terrace", "Casa del Sol", "The Laughing Lobster",
            "Emerald Thai", "Brooklyn Deli", "The Olive Branch", "Maple Street Diner",
        ],
        "cuisine_type": [
            "Italian", "Japanese", "Mexican", "Chinese", "Indian", "Thai",
            "French", "Mediterranean", "American", "Korean", "Vietnamese",
            "Greek", "Middle Eastern", "Spanish", "Brazilian", "Ethiopian",
        ],
        "menu_item": [
            "Grilled Salmon with Lemon Butter", "Margherita Pizza", "Chicken Tikka Masala",
            "Pad Thai with Shrimp", "Caesar Salad", "Beef Bourguignon",
            "Sushi Platter Deluxe", "Fish and Chips", "Vegetable Stir Fry",
            "Lobster Bisque", "BBQ Ribs", "Mushroom Risotto", "Tacos al Pastor",
            "Greek Moussaka", "Tom Yum Soup", "Eggs Benedict", "Avocado Toast",
            "Butter Chicken", "Pho Bo", "Beef Wellington", "Crème Brûlée",
        ],
        "course_name": [
            "Introduction to Machine Learning", "Advanced Data Structures",
            "Web Development Fundamentals", "Cloud Computing Essentials",
            "Digital Marketing Strategy", "Financial Accounting 101",
            "Project Management Professional", "UX Design Principles",
            "Python for Data Science", "Business Analytics", "Agile Methodology",
            "Cybersecurity Fundamentals", "Leadership and Management",
            "Public Speaking Mastery", "Creative Writing Workshop",
        ],
        "university": [
            "MIT", "Stanford University", "Harvard University", "UC Berkeley",
            "Cambridge University", "Oxford University", "ETH Zurich",
            "Carnegie Mellon University", "Georgia Tech", "University of Michigan",
            "UCLA", "Columbia University", "Yale University", "Princeton University",
            "University of Toronto", "National University of Singapore",
        ],
        "degree": [
            "Bachelor of Science in Computer Science", "Master of Business Administration",
            "Doctor of Philosophy in Physics", "Bachelor of Arts in Economics",
            "Master of Science in Data Science", "Bachelor of Engineering",
            "Master of Public Health", "Doctor of Medicine",
            "Master of Fine Arts", "Bachelor of Commerce",
            "Professional Certificate in Project Management",
        ],
        "event_name": [
            "Annual Tech Summit 2024", "Global Innovation Conference",
            "Product Launch Webinar", "Quarterly Business Review",
            "Team Building Workshop", "Customer Success Meetup",
            "Developer Conference", "Marketing Strategy Session",
            "Leadership Retreat", "Industry Networking Event",
            "Hackathon: Build the Future", "AI & ML Symposium",
        ],
        "venue": [
            "Grand Convention Center", "The Ritz-Carlton Ballroom",
            "Silicon Valley Conference Hall", "Downtown Marriott",
            "Tech Hub Auditorium", "Innovation Campus", "The Summit Club",
            "Waterfront Event Space", "Metropolitan Convention Center",
            "Hilton Garden Terrace", "The Forum", "Sunrise Pavilion",
        ],
        "project_name": [
            "Project Phoenix", "Operation Streamline", "Initiative Alpha",
            "Digital Transformation 2024", "Customer Experience Revamp",
            "Platform Migration", "Security Enhancement Program",
            "Market Expansion Initiative", "Product Innovation Sprint",
            "Process Automation Project", "Brand Refresh Campaign",
            "Infrastructure Modernization", "Data Lake Implementation",
        ],
        "task_name": [
            "Review pull request", "Update documentation", "Fix login bug",
            "Design landing page mockup", "Set up CI/CD pipeline",
            "Conduct user interviews", "Write unit tests", "Optimize database queries",
            "Create marketing copy", "Schedule team meeting", "Prepare quarterly report",
            "Refactor authentication module", "Deploy to production",
        ],
        "milestone": [
            "MVP Launch", "Beta Release", "General Availability",
            "100K Users Milestone", "Series A Funding", "Product Market Fit",
            "First Enterprise Customer", "International Expansion",
            "SOC 2 Certification", "Mobile App Launch", "API v2 Release",
        ],
        "review_title": [
            "Great product, highly recommend!", "Exceeded my expectations",
            "Solid quality for the price", "Good but room for improvement",
            "Not what I expected", "Amazing customer service",
            "Would buy again", "Mixed feelings about this",
            "Perfect for my needs", "Disappointing experience",
        ],
        "review_text": [
            "This product exceeded all my expectations. The quality is outstanding and it arrived faster than expected. Highly recommend!",
            "Solid purchase. Works exactly as described. Customer service was helpful when I had questions.",
            "Good value for the money, but the instructions could be clearer. Otherwise satisfied with my purchase.",
            "The quality is excellent and it's clear a lot of thought went into the design. Will definitely buy from this brand again.",
            "Decent product but took longer to arrive than expected. The product itself works fine.",
            "Amazing! This is exactly what I was looking for. The attention to detail is impressive.",
            "Not bad, but I've seen better at this price point. It does the job adequately.",
            "Fantastic experience from start to finish. Easy to set up and works perfectly.",
            "The product is good but packaging was damaged on arrival. Fortunately, the item was intact.",
            "Exceptional quality and great customer support. They went above and beyond to help me.",
        ],
        "city": [
            "New York", "Los Angeles", "Chicago", "Houston", "Phoenix",
            "San Francisco", "Seattle", "Boston", "Austin", "Denver",
            "Miami", "Atlanta", "Portland", "San Diego", "Dallas",
            "London", "Paris", "Tokyo", "Singapore", "Sydney",
            "Toronto", "Berlin", "Amsterdam", "Dubai", "Mumbai",
        ],
        "country": [
            "United States", "United Kingdom", "Canada", "Germany", "France",
            "Japan", "Australia", "India", "Brazil", "Mexico",
            "Italy", "Spain", "Netherlands", "Sweden", "Singapore",
            "South Korea", "United Arab Emirates", "Switzerland", "Ireland", "Israel",
        ],
        "address": [
            "123 Main Street, Suite 100", "456 Oak Avenue, Floor 3",
            "789 Innovation Drive", "1001 Tech Boulevard, Building A",
            "2500 Market Street", "350 Fifth Avenue, 21st Floor",
            "1600 Amphitheatre Parkway", "One Microsoft Way",
            "410 Terry Avenue North", "1 Infinite Loop",
        ],
        "company_name": [
            "Acme Corporation", "TechVision Inc.", "Global Dynamics",
            "Innovate Solutions", "Summit Technologies", "Blue Horizon Labs",
            "Apex Industries", "Quantum Systems", "Pioneer Analytics",
            "Stellar Ventures", "Nexus Consulting", "Atlas Enterprises",
            "Synergy Partners", "Velocity Software", "Horizon Digital",
        ],
        "industry": [
            "Technology", "Healthcare", "Finance", "E-commerce", "Manufacturing",
            "Education", "Real Estate", "Consulting", "Telecommunications",
            "Energy", "Retail", "Media & Entertainment", "Transportation",
            "Hospitality", "Insurance", "Pharmaceuticals", "Aerospace",
        ],
        "feature_name": [
            "Single Sign-On (SSO)", "Two-Factor Authentication", "Real-time Analytics",
            "Custom Dashboards", "API Integration", "Advanced Reporting",
            "Role-Based Access Control", "Automated Workflows", "Data Export",
            "Mobile App Support", "Bulk Import", "Audit Logging",
            "Collaborative Editing", "Version History", "Custom Branding",
        ],
        "bug_type": [
            "UI rendering issue", "Authentication failure", "Data sync error",
            "Performance degradation", "Memory leak", "API timeout",
            "Incorrect calculation", "Missing validation", "Broken link",
            "Cross-browser compatibility", "Mobile responsiveness issue",
            "Localization error", "Permission denied unexpectedly",
        ],
        "api_endpoint": [
            "/api/v1/users", "/api/v1/auth/login", "/api/v1/products",
            "/api/v1/orders", "/api/v1/payments", "/api/v1/analytics",
            "/api/v1/notifications", "/api/v1/settings", "/api/v1/search",
            "/api/v1/reports", "/api/v1/webhooks", "/api/v1/integrations",
        ],
        "skill": [
            "Python", "JavaScript", "SQL", "Machine Learning", "Data Analysis",
            "Project Management", "Communication", "Leadership", "Problem Solving",
            "AWS", "Docker", "Kubernetes", "React", "Node.js", "TensorFlow",
            "Agile/Scrum", "Public Speaking", "Negotiation", "Strategic Planning",
        ],
    }

    # Environment variable that holds the API key for each provider.
    _API_KEY_ENV_VARS = {"groq": "GROQ_API_KEY", "openai": "OPENAI_API_KEY"}

    # Chat model requested from each provider that has a client implementation.
    _LLM_MODELS = {"groq": "llama-3.3-70b-versatile", "openai": "gpt-4o-mini"}

    _SYSTEM_PROMPT = "You are a domain expert generating realistic test data. Output only valid JSON."

    # Suffixes appended to ~30% of values when a curated pool has to be stretched.
    _VARIATION_SUFFIXES = [" (v2)", " Pro", " Plus", " - Updated", " 2.0", ""]

    # Domain-specific templates for generate_composite_pool. Built once at class
    # creation because the "number" component alone holds 9,900 strings.
    _COMPOSITE_TEMPLATES = {
        "address": {
            "template": "{number} {street_name} {street_type}, {city}, {state}",
            "components": {
                "number": [str(i) for i in range(100, 10000)],
                "street_name": ["Oak", "Maple", "Cedar", "Pine", "Elm", "Birch", "Walnut", "Cherry", "Willow", "Aspen",
                                "Main", "First", "Second", "Third", "Park", "Lake", "River", "Hill", "Valley", "Spring"],
                "street_type": ["Street", "Avenue", "Boulevard", "Lane", "Drive", "Court", "Place", "Road", "Way", "Circle"],
                "city": ["Springfield", "Riverside", "Franklin", "Georgetown", "Clinton", "Salem", "Madison", "Bristol", "Fairview", "Newport"],
                "state": ["CA", "TX", "NY", "FL", "IL", "PA", "OH", "GA", "MI", "NC", "WA", "CO", "AZ", "MA", "VA"],
            },
        },
        "email": {
            "template": "{name_part}{separator}{domain_part}@{provider}.{tld}",
            "components": {
                "name_part": ["john", "jane", "alex", "sam", "chris", "pat", "taylor", "jordan", "casey", "morgan",
                              "mike", "lisa", "david", "emma", "ryan", "kate", "nick", "amy", "steve", "jen"],
                "separator": ["", ".", "_", ""],
                "domain_part": ["smith", "jones", "work", "mail", "pro", "dev", "biz", "123", "2024", "online"],
                "provider": ["gmail", "yahoo", "outlook", "hotmail", "icloud", "proton", "fastmail", "zoho"],
                "tld": ["com", "com", "com", "org", "net", "io", "co"],
            },
        },
        "product": {
            "template": "{adjective} {material} {item_type} - {size_color}",
            "components": {
                "adjective": ["Premium", "Ultra", "Pro", "Classic", "Modern", "Sleek", "Essential", "Deluxe", "Elite", "Smart"],
                "material": ["Stainless Steel", "Bamboo", "Ceramic", "Leather", "Cotton", "Titanium", "Wood", "Glass", "Silicone", "Carbon Fiber"],
                "item_type": ["Water Bottle", "Phone Case", "Backpack", "Wallet", "Watch Band", "Desk Lamp", "Speaker", "Charging Dock", "Notebook", "Organizer"],
                "size_color": ["Black/Large", "White/Medium", "Navy/Standard", "Gray/Compact", "Red/XL", "Brown/Regular", "Silver/Slim", "Green/Mini"],
            },
        },
        "company_name": {
            "template": "{prefix} {industry_word} {suffix}",
            "components": {
                "prefix": ["Nova", "Apex", "Prime", "Vertex", "Quantum", "Fusion", "Nexus", "Stellar", "Vector", "Atlas",
                           "Blue", "Red", "Green", "Global", "United", "First", "New", "Smart", "Tech", "Digital"],
                "industry_word": ["Solutions", "Systems", "Tech", "Labs", "Works", "Group", "Partners", "Dynamics", "Innovations", "Ventures",
                                  "Digital", "Logic", "Flow", "Wave", "Net", "Cloud", "Data", "Edge", "Core", "Sync"],
                "suffix": ["Inc", "Corp", "LLC", "Co", "Ltd", "GmbH", "Technologies", "International", "Enterprises", "Holdings"],
            },
        },
    }

    def __init__(
        self,
        provider: str = "groq",
        api_key: Optional[str] = None,
        cache_dir: Optional[str] = None,
    ):
        """
        Initialize the smart value generator.

        Args:
            provider: LLM provider. "groq" and "openai" have client
                implementations; any other value (including "ollama")
                always resolves to the curated fallback pools.
            api_key: API key for the provider. When omitted it is read from
                the provider's environment variable (GROQ_API_KEY or
                OPENAI_API_KEY; GROQ_API_KEY for unrecognized providers).
            cache_dir: Directory to cache generated pools. Defaults to
                ``~/.misata/value_cache``.
        """
        self.provider = provider
        # Look the key up per provider so an OpenAI client is never handed a Groq key.
        env_var = self._API_KEY_ENV_VARS.get(provider, "GROQ_API_KEY")
        self.api_key = api_key or os.environ.get(env_var)
        self.cache_dir = Path(cache_dir) if cache_dir else Path.home() / ".misata" / "value_cache"
        try:
            self.cache_dir.mkdir(parents=True, exist_ok=True)
        except OSError as exc:
            # Disk caching is best-effort: the load/save helpers already
            # tolerate a missing directory, so construction must not fail.
            self._log_warning("Value cache directory %s is unavailable: %s", self.cache_dir, exc)

        # In-memory cache
        self._pool_cache: Dict[str, List[str]] = {}
        self._client = None

    @staticmethod
    def _log_warning(message: str, *args: Any) -> None:
        """Emit a warning through the module logger."""
        import logging

        logging.getLogger(__name__).warning(message, *args)

    @staticmethod
    def _normalize_name(name: str) -> str:
        """Lower-case a name and turn underscores into spaces for matching."""
        return name.lower().replace("_", " ")

    def _get_client(self):
        """Lazily initialize LLM client.

        Returns:
            The provider client, or None when the provider has no client
            implementation, its SDK is not installed, or the client cannot
            be constructed (for example because no API key is available).
        """
        if self._client is not None:
            return self._client
        try:
            if self.provider == "groq":
                from groq import Groq
                self._client = Groq(api_key=self.api_key)
            elif self.provider == "openai":
                from openai import OpenAI
                self._client = OpenAI(api_key=self.api_key)
        except ImportError:
            return None
        except Exception as exc:
            # SDK constructors raise their own error types; treat any
            # failure as "no LLM available" so callers fall back.
            self._log_warning("Could not initialize %s client: %s", self.provider, exc)
            return None
        return self._client

    def detect_domain(self, column_name: str, table_name: str = "") -> Optional[str]:
        """
        Detect semantic domain from column and table names.

        Names and patterns are compared case-insensitively with underscores
        treated as spaces, using substring matching. When several patterns
        match, the winner is chosen by, in order:

        1. a match inside the column name over one that only appears once
           the table name is taken into account;
        2. the longest pattern (the most specific match);
        3. a pattern that is the domain's own name;
        4. declaration order in ``DOMAIN_PATTERNS``.

        Args:
            column_name: Name of the column
            table_name: Name of the table (optional context)

        Returns:
            Detected domain name or None
        """
        col_text = self._normalize_name(column_name)
        combined = f"{self._normalize_name(table_name)} {col_text}"

        best_domain = None
        best_rank = None
        for domain, patterns in self.DOMAIN_PATTERNS.items():
            domain_text = self._normalize_name(domain)
            for pattern in patterns:
                # Patterns are normalized like the names; otherwise entries
                # such as "job_title" could never match "job title".
                needle = self._normalize_name(pattern)
                if needle not in combined:
                    continue
                rank = (needle in col_text, len(needle), needle == domain_text)
                # Strict ">" keeps the earliest declared domain on a full tie.
                if best_rank is None or rank > best_rank:
                    best_domain, best_rank = domain, rank

        return best_domain

    def _get_cache_key(self, domain: str, context: str, size: int) -> str:
        """Generate a cache key for a pool request."""
        content = f"{domain}:{context}:{size}"
        return hashlib.md5(content.encode()).hexdigest()[:12]

    def _load_cached_pool(self, cache_key: str) -> Optional[List[str]]:
        """Load a pool from disk cache.

        Returns:
            The cached list, or None when the file is missing, unreadable,
            or does not contain a JSON list of strings.
        """
        cache_file = self.cache_dir / f"{cache_key}.json"
        try:
            with open(cache_file, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError):
            # Missing file, unreadable file, or malformed JSON/encoding.
            return None
        if isinstance(data, list) and all(isinstance(item, str) for item in data):
            return data
        return None

    def _save_pool_to_cache(self, cache_key: str, pool: List[str]) -> None:
        """Save a pool to disk cache (best-effort; failures are ignored)."""
        cache_file = self.cache_dir / f"{cache_key}.json"
        try:
            with open(cache_file, "w", encoding="utf-8") as f:
                json.dump(pool, f)
        except (OSError, TypeError, ValueError):
            pass

    def _fallback_values(self, domain: str, size: int) -> List[str]:
        """Return a fresh list with up to ``size`` curated values for ``domain``."""
        return list(self.FALLBACK_POOLS.get(domain, []))[:max(size, 0)]

    @staticmethod
    def _parse_pool_response(content: Optional[str], size: int) -> Optional[List[str]]:
        """Extract a list of unique, non-empty strings from an LLM reply.

        The JSON array is located by its outermost brackets, so markdown
        code fences (closed or not) and surrounding prose are tolerated.

        Returns:
            Up to ``size`` values, or None when nothing usable was found.
        """
        if not content:
            return None
        start = content.find("[")
        end = content.rfind("]")
        if start == -1 or end <= start:
            return None
        try:
            parsed = json.loads(content[start:end + 1])
        except ValueError:
            return None
        if not isinstance(parsed, list):
            return None

        seen = set()
        values = []
        for item in parsed:
            if not isinstance(item, str):
                continue
            value = item.strip()
            if value and value not in seen:
                seen.add(value)
                values.append(value)
        return values[:max(size, 0)] or None

    def _request_llm_pool(self, domain: str, context: str, size: int) -> Optional[List[str]]:
        """Ask the configured LLM for a pool; return None on any failure."""
        model = self._LLM_MODELS.get(self.provider)
        if model is None or size <= 0:
            return None
        client = self._get_client()
        if client is None:
            return None

        context_str = f" for a {context}" if context else ""
        prompt = (
            f"Generate exactly {size} realistic {domain.replace('_', ' ')} values{context_str}.\n"
            "\n"
            "Requirements:\n"
            "- Be specific and realistic (not generic placeholders)\n"
            "- Include variety (different types, severities, categories)\n"
            "- Use proper terminology for the domain\n"
            "- Each value should be unique\n"
            "\n"
            "Return ONLY a JSON array of strings, no explanation. Example:\n"
            '["Value 1", "Value 2", "Value 3"]'
        )
        request = {
            "model": model,
            "messages": [
                {"role": "system", "content": self._SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            "temperature": 0.7,
        }
        if self.provider == "groq":
            request["max_tokens"] = 2000

        try:
            response = client.chat.completions.create(**request)
            content = response.choices[0].message.content
        except Exception as exc:
            # Network, auth and SDK errors all mean "no LLM result".
            self._log_warning("LLM generation failed: %s", exc)
            return None
        return self._parse_pool_response(content, size)

    def generate_pool_with_llm(
        self,
        domain: str,
        context: str = "",
        size: int = 50,
    ) -> List[str]:
        """
        Generate a pool of realistic values using LLM.

        Falls back to the curated pool for ``domain`` when no client is
        available, the request fails, or the reply cannot be parsed.

        Args:
            domain: Semantic domain (e.g., "disease", "prescription")
            context: Additional context (e.g., "hospital emergency room")
            size: Number of values to generate

        Returns:
            List of generated values (at most ``size``)
        """
        pool = self._request_llm_pool(domain, context, size)
        if pool:
            return pool
        return self._fallback_values(domain, size)

    def get_pool(
        self,
        column_name: str,
        table_name: str = "",
        domain_hint: Optional[str] = None,
        context: str = "",
        size: int = 50,
        use_llm: bool = True,
    ) -> List[str]:
        """
        Get or create a value pool for a column.

        Lookup order is in-memory cache, disk cache, then generation. The
        returned list is a copy, so callers may mutate it freely.

        Args:
            column_name: Name of the column
            table_name: Name of the table
            domain_hint: Explicit domain override
            context: Additional context for LLM
            size: Pool size
            use_llm: Whether to use LLM for generation

        Returns:
            List of domain-appropriate values (empty if no domain is detected)
        """
        # Determine domain
        domain = domain_hint or self.detect_domain(column_name, table_name)
        if domain is None:
            # No domain detected, return empty
            return []

        # Build context string
        full_context = context or f"{table_name} {column_name}".strip()
        cache_key = self._get_cache_key(domain, full_context, size)

        # Check in-memory cache first, then the disk cache
        cached = self._pool_cache.get(cache_key)
        if not cached:
            cached = self._load_cached_pool(cache_key)
            if cached:
                self._pool_cache[cache_key] = cached
        if cached:
            return list(cached)

        # Generate new pool
        fallback = self._fallback_values(domain, size)
        pool = self.generate_pool_with_llm(domain, full_context, size) if use_llm else fallback

        if pool:
            self._pool_cache[cache_key] = pool
            # Only persist pools that differ from the curated fallback:
            # writing the fallback to disk would keep shadowing real LLM
            # output in later runs, once a provider is configured.
            if pool != fallback:
                self._save_pool_to_cache(cache_key, pool)

        return list(pool)

    def get_fallback_pool(self, domain: str) -> List[str]:
        """Get a copy of the curated fallback pool for a domain."""
        return list(self.FALLBACK_POOLS.get(domain, []))

    def generate_with_template(
        self,
        template: str,
        size: int,
        components: Dict[str, List[str]],
    ) -> List[str]:
        """Generate text by substituting template components.

        This creates more variety by combining parts rather than
        picking from a fixed pool. Every occurrence of a placeholder gets
        its own random pick. Placeholders with no component, or with an
        empty value list, are left in the output unchanged.

        Args:
            template: String template with {component_name} placeholders
            size: Number of values to generate
            components: Dict mapping component names to value lists

        Returns:
            List of generated strings

        Example:
            template = "{first_name} {last_name}"
            components = {
                "first_name": ["John", "Jane", "Alex"],
                "last_name": ["Smith", "Johnson", "Williams"],
            }
            values = gen.generate_with_template(template, 100, components)
            # Returns: ["John Smith", "Jane Williams", "Alex Johnson", ...]
        """
        import random
        import re

        pools = {f"{{{key}}}": values for key, values in components.items() if values}
        if not pools:
            return [template for _ in range(size)]

        # One pass over the template: substituted values are never re-scanned,
        # so a value that happens to contain "{...}" is not expanded again.
        placeholder_re = re.compile("|".join(re.escape(token) for token in pools))

        def fill(match):
            return random.choice(pools[match.group(0)])

        return [placeholder_re.sub(fill, template) for _ in range(size)]

    def generate_composite_pool(
        self,
        domain: str,
        size: int = 200,
    ) -> List[str]:
        """Generate larger pools using template composition.

        Instead of calling LLM for 200 values, we compose
        templates with varied components. Domains without a template are
        sampled from the curated pool; when that pool is smaller than
        ``size`` values are repeated, some with a variation suffix.

        Args:
            domain: Semantic domain
            size: Target pool size

        Returns:
            List of composed values (empty for a non-positive size or an
            unknown domain)
        """
        import random

        if size <= 0:
            return []

        config = self._COMPOSITE_TEMPLATES.get(domain)
        if config is not None:
            return self.generate_with_template(
                config["template"],
                size,
                config["components"],
            )

        # Fall back to curated pool with random sampling
        base_pool = self.FALLBACK_POOLS.get(domain, [])
        if len(base_pool) >= size:
            return random.sample(base_pool, size)
        if not base_pool:
            return []

        # Repeat with slight variations
        result = []
        for _ in range(size):
            value = random.choice(base_pool)
            if random.random() < 0.3:  # 30% chance to add suffix
                value += random.choice(self._VARIATION_SUFFIXES)
            result.append(value)
        return result
712
+
713
+
714
+ # ============ Template Registry ============
715
+
716
# Placeholder templates for composed identifiers, keyed by identifier kind.
COMPOSITION_TEMPLATES = dict(
    order_id="{prefix}-{year}-{number}",
    invoice_number="INV-{year}{month}-{number}",
    tracking_number="{carrier}{number}{check}",
    sku="{category}-{brand}-{variant}-{size}",
    username="{adjective}{noun}{number}",
)

# Candidate values for every placeholder used in COMPOSITION_TEMPLATES.
# NOTE: the tracking-number pool materializes one million strings when the
# module is imported.
TEMPLATE_COMPONENTS = {
    "order_id": {
        "prefix": ["ORD", "SO", "PO", "WO", "REQ"],
        "year": ["2023", "2024", "2025"],
        # Six-digit, zero-padded sequence numbers 000001..000999.
        "number": [f"{n:06d}" for n in range(1, 1000)],
    },
    "invoice_number": {
        "year": ["23", "24", "25"],
        "month": [f"{m:02d}" for m in range(1, 13)],
        # Four-digit, zero-padded sequence numbers 0001..9999.
        "number": [f"{n:04d}" for n in range(1, 10000)],
    },
    "tracking_number": {
        "carrier": ["1Z", "9400", "92", "420"],
        # Every value already has twelve digits, so no padding is required.
        "number": [str(n) for n in range(10 ** 11, 10 ** 11 + 10 ** 6)],
        "check": list("0123456789"),
    },
    "sku": {
        "category": ["ELC", "CLO", "HOM", "SPT", "TOY", "BOK"],
        "brand": ["APP", "SAM", "NIK", "ADI", "SON", "LG"],
        "variant": ["BLK", "WHT", "RED", "BLU", "GRN", "GRY"],
        "size": ["S", "M", "L", "XL", "XXL", "OS"],
    },
    "username": {
        "adjective": ["cool", "super", "mega", "ultra", "epic", "pro", "fast", "swift", "bold", "smart"],
        "noun": ["ninja", "tiger", "dragon", "wolf", "hawk", "bear", "lion", "eagle", "shark", "fox"],
        "number": list(map(str, range(1, 1000))),
    },
}
752
+
753
+
754
+ # Convenience function for quick testing
755
def smart_generate(column_name: str, table_name: str = "", size: int = 10) -> List[str]:
    """Return ``size`` values drawn at random from a detected value pool.

    A throwaway SmartValueGenerator with default settings supplies the pool,
    which is requested at twice the sample size (minimum 50). Values are
    drawn with replacement, so the result may contain repeats. An empty
    list is returned when no pool is available for the column.
    """
    import random

    pool_size = max(size * 2, 50)
    candidates = SmartValueGenerator().get_pool(column_name, table_name, size=pool_size)
    if not candidates:
        return []
    return random.choices(candidates, k=size)