misata 0.1.0b0__py3-none-any.whl → 0.3.0b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- misata/__init__.py +89 -3
- misata/cache.py +258 -0
- misata/constraints.py +307 -0
- misata/context.py +259 -0
- misata/exceptions.py +277 -0
- misata/generators/__init__.py +29 -0
- misata/generators/base.py +586 -0
- misata/llm_parser.py +41 -2
- misata/profiles.py +332 -0
- misata/quality.py +329 -0
- misata/schema.py +8 -3
- misata/simulator.py +81 -5
- misata/smart_values.py +762 -0
- misata/streaming.py +228 -0
- misata/templates/library.py +344 -0
- {misata-0.1.0b0.dist-info → misata-0.3.0b0.dist-info}/METADATA +4 -2
- misata-0.3.0b0.dist-info/RECORD +37 -0
- misata-0.3.0b0.dist-info/licenses/LICENSE +21 -0
- misata-0.1.0b0.dist-info/RECORD +0 -25
- {misata-0.1.0b0.dist-info → misata-0.3.0b0.dist-info}/WHEEL +0 -0
- {misata-0.1.0b0.dist-info → misata-0.3.0b0.dist-info}/entry_points.txt +0 -0
- {misata-0.1.0b0.dist-info → misata-0.3.0b0.dist-info}/top_level.txt +0 -0
misata/smart_values.py
ADDED
|
@@ -0,0 +1,762 @@
|
|
|
1
|
+
"""
|
|
2
|
+
LLM-powered smart value generation for context-aware data.
|
|
3
|
+
|
|
4
|
+
This module generates realistic domain-specific values by:
|
|
5
|
+
1. Detecting semantic domain from column/table names
|
|
6
|
+
2. Using LLM to generate domain-appropriate data pools
|
|
7
|
+
3. Caching pools for fast repeated generation
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
import os
|
|
12
|
+
import hashlib
|
|
13
|
+
from typing import Dict, List, Optional, Any
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class SmartValueGenerator:
|
|
18
|
+
"""
|
|
19
|
+
Generate context-aware realistic values using LLM.
|
|
20
|
+
|
|
21
|
+
Detects domains from column/table names and generates
|
|
22
|
+
appropriate data pools using LLM or curated fallbacks.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
# Domain detection patterns
|
|
26
|
+
DOMAIN_PATTERNS = {
|
|
27
|
+
# Medical
|
|
28
|
+
"disease": ["disease", "diagnosis", "condition", "illness", "ailment", "disorder", "pathology"],
|
|
29
|
+
"prescription": ["prescription", "medication", "drug", "medicine", "rx", "pharma"],
|
|
30
|
+
"procedure": ["procedure", "surgery", "treatment", "operation", "therapy", "intervention"],
|
|
31
|
+
"symptom": ["symptom", "complaint", "sign", "manifestation"],
|
|
32
|
+
"medical_specialty": ["specialty", "department", "ward"],
|
|
33
|
+
|
|
34
|
+
# Legal
|
|
35
|
+
"case_type": ["case_type", "legal_matter", "litigation_type"],
|
|
36
|
+
"law_firm": ["law_firm", "legal_firm", "attorney_firm"],
|
|
37
|
+
"legal_status": ["legal_status", "case_status", "court_status"],
|
|
38
|
+
|
|
39
|
+
# Retail/E-commerce
|
|
40
|
+
"product": ["product", "item", "merchandise", "goods"],
|
|
41
|
+
"category": ["category", "department", "section"],
|
|
42
|
+
"brand": ["brand", "manufacturer", "vendor"],
|
|
43
|
+
|
|
44
|
+
# Finance
|
|
45
|
+
"transaction_type": ["transaction_type", "payment_type", "transfer_type"],
|
|
46
|
+
"account_type": ["account_type", "bank_account", "financial_account"],
|
|
47
|
+
|
|
48
|
+
# HR/Employment
|
|
49
|
+
"job_title": ["job_title", "position", "role", "designation"],
|
|
50
|
+
"department": ["department", "division", "unit", "team"],
|
|
51
|
+
"skill": ["skill", "competency", "expertise", "qualification"],
|
|
52
|
+
|
|
53
|
+
# NEW: Food & Restaurant
|
|
54
|
+
"restaurant_name": ["restaurant", "diner", "cafe", "eatery", "bistro", "tavern"],
|
|
55
|
+
"cuisine_type": ["cuisine", "food_type", "culinary"],
|
|
56
|
+
"menu_item": ["menu_item", "dish", "meal", "entree", "appetizer"],
|
|
57
|
+
|
|
58
|
+
# NEW: Education
|
|
59
|
+
"course_name": ["course", "class", "lecture", "module", "subject"],
|
|
60
|
+
"university": ["university", "college", "institution", "school"],
|
|
61
|
+
"degree": ["degree", "certification", "diploma", "qualification"],
|
|
62
|
+
|
|
63
|
+
# NEW: Events & Meetings
|
|
64
|
+
"event_name": ["event", "conference", "meeting", "workshop", "seminar", "webinar"],
|
|
65
|
+
"venue": ["venue", "location", "hall", "auditorium", "center"],
|
|
66
|
+
|
|
67
|
+
# NEW: Projects & Work
|
|
68
|
+
"project_name": ["project", "initiative", "campaign", "program"],
|
|
69
|
+
"task_name": ["task", "todo", "action_item", "work_item"],
|
|
70
|
+
"milestone": ["milestone", "deliverable", "goal", "objective"],
|
|
71
|
+
|
|
72
|
+
# NEW: Reviews & Feedback
|
|
73
|
+
"review_title": ["review_title", "feedback_title", "comment_title"],
|
|
74
|
+
"review_text": ["review", "feedback", "comment", "testimonial", "opinion"],
|
|
75
|
+
|
|
76
|
+
# NEW: Location
|
|
77
|
+
"city": ["city", "town", "municipality", "metro"],
|
|
78
|
+
"country": ["country", "nation", "region"],
|
|
79
|
+
"address": ["address", "location", "street", "postal"],
|
|
80
|
+
|
|
81
|
+
# NEW: Business
|
|
82
|
+
"company_name": ["company", "organization", "business", "corporation", "enterprise", "firm"],
|
|
83
|
+
"industry": ["industry", "sector", "vertical", "market"],
|
|
84
|
+
|
|
85
|
+
# NEW: Tech/Software
|
|
86
|
+
"feature_name": ["feature", "capability", "functionality"],
|
|
87
|
+
"bug_type": ["bug", "issue", "defect", "error"],
|
|
88
|
+
"api_endpoint": ["endpoint", "api", "route", "path"],
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
# Curated fallback pools (no LLM needed)
|
|
92
|
+
FALLBACK_POOLS = {
|
|
93
|
+
"disease": [
|
|
94
|
+
"Type 2 Diabetes Mellitus", "Essential Hypertension", "Chronic Obstructive Pulmonary Disease",
|
|
95
|
+
"Major Depressive Disorder", "Generalized Anxiety Disorder", "Acute Myocardial Infarction",
|
|
96
|
+
"Atrial Fibrillation", "Chronic Kidney Disease Stage 3", "Rheumatoid Arthritis",
|
|
97
|
+
"Osteoarthritis", "Migraine without Aura", "Asthma", "Hypothyroidism", "Hyperlipidemia",
|
|
98
|
+
"Gastroesophageal Reflux Disease", "Irritable Bowel Syndrome", "Obesity", "Sleep Apnea",
|
|
99
|
+
"Chronic Lower Back Pain", "Urinary Tract Infection", "Pneumonia", "Bronchitis",
|
|
100
|
+
"Anemia", "Osteoporosis", "Fibromyalgia", "Seizure Disorder", "Glaucoma",
|
|
101
|
+
"Allergic Rhinitis", "Eczema", "Psoriasis", "Hepatitis C", "Cirrhosis",
|
|
102
|
+
"Congestive Heart Failure", "Coronary Artery Disease", "Peripheral Artery Disease",
|
|
103
|
+
"Deep Vein Thrombosis", "Pulmonary Embolism", "Stroke", "Transient Ischemic Attack",
|
|
104
|
+
"Multiple Sclerosis", "Parkinson's Disease", "Alzheimer's Disease", "Epilepsy",
|
|
105
|
+
"Lupus", "Crohn's Disease", "Ulcerative Colitis", "Celiac Disease",
|
|
106
|
+
"Polycystic Ovary Syndrome", "Endometriosis", "Benign Prostatic Hyperplasia",
|
|
107
|
+
],
|
|
108
|
+
"prescription": [
|
|
109
|
+
"Metformin 500mg - Take twice daily with meals",
|
|
110
|
+
"Lisinopril 10mg - Take once daily",
|
|
111
|
+
"Atorvastatin 20mg - Take at bedtime",
|
|
112
|
+
"Levothyroxine 50mcg - Take on empty stomach",
|
|
113
|
+
"Amlodipine 5mg - Take once daily",
|
|
114
|
+
"Metoprolol 25mg - Take twice daily",
|
|
115
|
+
"Omeprazole 20mg - Take before breakfast",
|
|
116
|
+
"Sertraline 50mg - Take once daily",
|
|
117
|
+
"Gabapentin 300mg - Take three times daily",
|
|
118
|
+
"Tramadol 50mg - Take as needed for pain",
|
|
119
|
+
"Prednisone 10mg - Take with food",
|
|
120
|
+
"Albuterol Inhaler - Use as needed for breathing",
|
|
121
|
+
"Fluticasone Nasal Spray - Use twice daily",
|
|
122
|
+
"Insulin Glargine 20 units - Inject at bedtime",
|
|
123
|
+
"Warfarin 5mg - Take as directed with INR monitoring",
|
|
124
|
+
"Clopidogrel 75mg - Take once daily",
|
|
125
|
+
"Furosemide 40mg - Take in the morning",
|
|
126
|
+
"Losartan 50mg - Take once daily",
|
|
127
|
+
"Hydrochlorothiazide 25mg - Take in the morning",
|
|
128
|
+
"Duloxetine 60mg - Take once daily",
|
|
129
|
+
"Escitalopram 10mg - Take once daily",
|
|
130
|
+
"Alprazolam 0.5mg - Take as needed for anxiety",
|
|
131
|
+
"Zolpidem 10mg - Take at bedtime as needed",
|
|
132
|
+
"Simvastatin 40mg - Take at bedtime",
|
|
133
|
+
"Pantoprazole 40mg - Take before breakfast",
|
|
134
|
+
],
|
|
135
|
+
"procedure": [
|
|
136
|
+
"Complete Blood Count", "Comprehensive Metabolic Panel", "Lipid Panel",
|
|
137
|
+
"Chest X-Ray", "CT Scan - Abdomen", "MRI - Brain", "Echocardiogram",
|
|
138
|
+
"Colonoscopy", "Upper Endoscopy", "Cardiac Catheterization",
|
|
139
|
+
"Knee Arthroscopy", "Laparoscopic Cholecystectomy", "Appendectomy",
|
|
140
|
+
"Total Hip Replacement", "Total Knee Replacement", "Coronary Artery Bypass",
|
|
141
|
+
"Angioplasty with Stent Placement", "Pacemaker Implantation",
|
|
142
|
+
"Lumbar Puncture", "Bone Marrow Biopsy", "Bronchoscopy",
|
|
143
|
+
"Thyroidectomy", "Mastectomy", "Prostatectomy", "Hysterectomy",
|
|
144
|
+
"Cataract Surgery", "LASIK Eye Surgery", "Tonsillectomy",
|
|
145
|
+
"Cesarean Section", "Spinal Fusion", "Hernia Repair",
|
|
146
|
+
],
|
|
147
|
+
"symptom": [
|
|
148
|
+
"Chest pain", "Shortness of breath", "Fatigue", "Headache",
|
|
149
|
+
"Dizziness", "Nausea", "Vomiting", "Abdominal pain", "Back pain",
|
|
150
|
+
"Joint pain", "Muscle weakness", "Numbness", "Tingling sensation",
|
|
151
|
+
"Blurred vision", "Hearing loss", "Cough", "Fever", "Chills",
|
|
152
|
+
"Night sweats", "Weight loss", "Weight gain", "Loss of appetite",
|
|
153
|
+
"Insomnia", "Excessive thirst", "Frequent urination", "Swelling",
|
|
154
|
+
"Rash", "Itching", "Bruising", "Bleeding", "Difficulty swallowing",
|
|
155
|
+
"Heartburn", "Constipation", "Diarrhea", "Blood in stool",
|
|
156
|
+
"Difficulty concentrating", "Memory problems", "Anxiety", "Depression",
|
|
157
|
+
"Palpitations", "Leg cramps", "Cold intolerance", "Heat intolerance",
|
|
158
|
+
],
|
|
159
|
+
"job_title": [
|
|
160
|
+
"Software Engineer", "Senior Software Engineer", "Staff Engineer",
|
|
161
|
+
"Product Manager", "Senior Product Manager", "Director of Product",
|
|
162
|
+
"Data Scientist", "Machine Learning Engineer", "Data Analyst",
|
|
163
|
+
"UX Designer", "UI Designer", "Product Designer",
|
|
164
|
+
"DevOps Engineer", "Site Reliability Engineer", "Platform Engineer",
|
|
165
|
+
"Engineering Manager", "VP of Engineering", "CTO",
|
|
166
|
+
"Sales Representative", "Account Executive", "Sales Manager",
|
|
167
|
+
"Marketing Manager", "Content Strategist", "Growth Manager",
|
|
168
|
+
"HR Manager", "Recruiter", "People Operations",
|
|
169
|
+
"Financial Analyst", "Controller", "CFO",
|
|
170
|
+
"Customer Success Manager", "Support Engineer", "Technical Writer",
|
|
171
|
+
],
|
|
172
|
+
"department": [
|
|
173
|
+
"Engineering", "Product", "Design", "Marketing", "Sales",
|
|
174
|
+
"Human Resources", "Finance", "Operations", "Customer Success",
|
|
175
|
+
"Research & Development", "Legal", "IT", "Security",
|
|
176
|
+
"Quality Assurance", "Business Development", "Analytics",
|
|
177
|
+
"Supply Chain", "Manufacturing", "Procurement",
|
|
178
|
+
],
|
|
179
|
+
"product": [
|
|
180
|
+
"Wireless Bluetooth Headphones", "Mechanical Gaming Keyboard",
|
|
181
|
+
"Ultra HD 4K Monitor", "Ergonomic Office Chair",
|
|
182
|
+
"Portable Power Bank 20000mAh", "Smart Home Speaker",
|
|
183
|
+
"Fitness Tracking Smartwatch", "Noise Cancelling Earbuds",
|
|
184
|
+
"USB-C Docking Station", "Laptop Cooling Pad",
|
|
185
|
+
"Wireless Mouse", "Gaming Mouse Pad XL",
|
|
186
|
+
"Webcam 1080p HD", "Ring Light with Tripod",
|
|
187
|
+
"Desk Organizer Set", "Cable Management Kit",
|
|
188
|
+
],
|
|
189
|
+
"category": [
|
|
190
|
+
"Electronics", "Computers", "Office Supplies", "Home & Garden",
|
|
191
|
+
"Sports & Outdoors", "Clothing", "Beauty & Personal Care",
|
|
192
|
+
"Toys & Games", "Books", "Food & Grocery",
|
|
193
|
+
"Automotive", "Health & Wellness", "Pet Supplies",
|
|
194
|
+
"Baby & Kids", "Jewelry & Watches", "Arts & Crafts",
|
|
195
|
+
],
|
|
196
|
+
# NEW DOMAIN POOLS
|
|
197
|
+
"restaurant_name": [
|
|
198
|
+
"The Golden Fork", "Bella Italia", "Tokyo Garden", "Blue Ocean Grill",
|
|
199
|
+
"Mountain View Cafe", "The Rustic Table", "Sakura Sushi", "Le Petit Bistro",
|
|
200
|
+
"Spice Route", "The Green Leaf", "Urban Kitchen", "Fire & Ice",
|
|
201
|
+
"The Hungry Bear", "Sunset Terrace", "Casa del Sol", "The Laughing Lobster",
|
|
202
|
+
"Emerald Thai", "Brooklyn Deli", "The Olive Branch", "Maple Street Diner",
|
|
203
|
+
],
|
|
204
|
+
"cuisine_type": [
|
|
205
|
+
"Italian", "Japanese", "Mexican", "Chinese", "Indian", "Thai",
|
|
206
|
+
"French", "Mediterranean", "American", "Korean", "Vietnamese",
|
|
207
|
+
"Greek", "Middle Eastern", "Spanish", "Brazilian", "Ethiopian",
|
|
208
|
+
],
|
|
209
|
+
"menu_item": [
|
|
210
|
+
"Grilled Salmon with Lemon Butter", "Margherita Pizza", "Chicken Tikka Masala",
|
|
211
|
+
"Pad Thai with Shrimp", "Caesar Salad", "Beef Bourguignon",
|
|
212
|
+
"Sushi Platter Deluxe", "Fish and Chips", "Vegetable Stir Fry",
|
|
213
|
+
"Lobster Bisque", "BBQ Ribs", "Mushroom Risotto", "Tacos al Pastor",
|
|
214
|
+
"Greek Moussaka", "Tom Yum Soup", "Eggs Benedict", "Avocado Toast",
|
|
215
|
+
"Butter Chicken", "Pho Bo", "Beef Wellington", "Crème Brûlée",
|
|
216
|
+
],
|
|
217
|
+
"course_name": [
|
|
218
|
+
"Introduction to Machine Learning", "Advanced Data Structures",
|
|
219
|
+
"Web Development Fundamentals", "Cloud Computing Essentials",
|
|
220
|
+
"Digital Marketing Strategy", "Financial Accounting 101",
|
|
221
|
+
"Project Management Professional", "UX Design Principles",
|
|
222
|
+
"Python for Data Science", "Business Analytics", "Agile Methodology",
|
|
223
|
+
"Cybersecurity Fundamentals", "Leadership and Management",
|
|
224
|
+
"Public Speaking Mastery", "Creative Writing Workshop",
|
|
225
|
+
],
|
|
226
|
+
"university": [
|
|
227
|
+
"MIT", "Stanford University", "Harvard University", "UC Berkeley",
|
|
228
|
+
"Cambridge University", "Oxford University", "ETH Zurich",
|
|
229
|
+
"Carnegie Mellon University", "Georgia Tech", "University of Michigan",
|
|
230
|
+
"UCLA", "Columbia University", "Yale University", "Princeton University",
|
|
231
|
+
"University of Toronto", "National University of Singapore",
|
|
232
|
+
],
|
|
233
|
+
"degree": [
|
|
234
|
+
"Bachelor of Science in Computer Science", "Master of Business Administration",
|
|
235
|
+
"Doctor of Philosophy in Physics", "Bachelor of Arts in Economics",
|
|
236
|
+
"Master of Science in Data Science", "Bachelor of Engineering",
|
|
237
|
+
"Master of Public Health", "Doctor of Medicine",
|
|
238
|
+
"Master of Fine Arts", "Bachelor of Commerce",
|
|
239
|
+
"Professional Certificate in Project Management",
|
|
240
|
+
],
|
|
241
|
+
"event_name": [
|
|
242
|
+
"Annual Tech Summit 2024", "Global Innovation Conference",
|
|
243
|
+
"Product Launch Webinar", "Quarterly Business Review",
|
|
244
|
+
"Team Building Workshop", "Customer Success Meetup",
|
|
245
|
+
"Developer Conference", "Marketing Strategy Session",
|
|
246
|
+
"Leadership Retreat", "Industry Networking Event",
|
|
247
|
+
"Hackathon: Build the Future", "AI & ML Symposium",
|
|
248
|
+
],
|
|
249
|
+
"venue": [
|
|
250
|
+
"Grand Convention Center", "The Ritz-Carlton Ballroom",
|
|
251
|
+
"Silicon Valley Conference Hall", "Downtown Marriott",
|
|
252
|
+
"Tech Hub Auditorium", "Innovation Campus", "The Summit Club",
|
|
253
|
+
"Waterfront Event Space", "Metropolitan Convention Center",
|
|
254
|
+
"Hilton Garden Terrace", "The Forum", "Sunrise Pavilion",
|
|
255
|
+
],
|
|
256
|
+
"project_name": [
|
|
257
|
+
"Project Phoenix", "Operation Streamline", "Initiative Alpha",
|
|
258
|
+
"Digital Transformation 2024", "Customer Experience Revamp",
|
|
259
|
+
"Platform Migration", "Security Enhancement Program",
|
|
260
|
+
"Market Expansion Initiative", "Product Innovation Sprint",
|
|
261
|
+
"Process Automation Project", "Brand Refresh Campaign",
|
|
262
|
+
"Infrastructure Modernization", "Data Lake Implementation",
|
|
263
|
+
],
|
|
264
|
+
"task_name": [
|
|
265
|
+
"Review pull request", "Update documentation", "Fix login bug",
|
|
266
|
+
"Design landing page mockup", "Set up CI/CD pipeline",
|
|
267
|
+
"Conduct user interviews", "Write unit tests", "Optimize database queries",
|
|
268
|
+
"Create marketing copy", "Schedule team meeting", "Prepare quarterly report",
|
|
269
|
+
"Refactor authentication module", "Deploy to production",
|
|
270
|
+
],
|
|
271
|
+
"milestone": [
|
|
272
|
+
"MVP Launch", "Beta Release", "General Availability",
|
|
273
|
+
"100K Users Milestone", "Series A Funding", "Product Market Fit",
|
|
274
|
+
"First Enterprise Customer", "International Expansion",
|
|
275
|
+
"SOC 2 Certification", "Mobile App Launch", "API v2 Release",
|
|
276
|
+
],
|
|
277
|
+
"review_title": [
|
|
278
|
+
"Great product, highly recommend!", "Exceeded my expectations",
|
|
279
|
+
"Solid quality for the price", "Good but room for improvement",
|
|
280
|
+
"Not what I expected", "Amazing customer service",
|
|
281
|
+
"Would buy again", "Mixed feelings about this",
|
|
282
|
+
"Perfect for my needs", "Disappointing experience",
|
|
283
|
+
],
|
|
284
|
+
"review_text": [
|
|
285
|
+
"This product exceeded all my expectations. The quality is outstanding and it arrived faster than expected. Highly recommend!",
|
|
286
|
+
"Solid purchase. Works exactly as described. Customer service was helpful when I had questions.",
|
|
287
|
+
"Good value for the money, but the instructions could be clearer. Otherwise satisfied with my purchase.",
|
|
288
|
+
"The quality is excellent and it's clear a lot of thought went into the design. Will definitely buy from this brand again.",
|
|
289
|
+
"Decent product but took longer to arrive than expected. The product itself works fine.",
|
|
290
|
+
"Amazing! This is exactly what I was looking for. The attention to detail is impressive.",
|
|
291
|
+
"Not bad, but I've seen better at this price point. It does the job adequately.",
|
|
292
|
+
"Fantastic experience from start to finish. Easy to set up and works perfectly.",
|
|
293
|
+
"The product is good but packaging was damaged on arrival. Fortunately, the item was intact.",
|
|
294
|
+
"Exceptional quality and great customer support. They went above and beyond to help me.",
|
|
295
|
+
],
|
|
296
|
+
"city": [
|
|
297
|
+
"New York", "Los Angeles", "Chicago", "Houston", "Phoenix",
|
|
298
|
+
"San Francisco", "Seattle", "Boston", "Austin", "Denver",
|
|
299
|
+
"Miami", "Atlanta", "Portland", "San Diego", "Dallas",
|
|
300
|
+
"London", "Paris", "Tokyo", "Singapore", "Sydney",
|
|
301
|
+
"Toronto", "Berlin", "Amsterdam", "Dubai", "Mumbai",
|
|
302
|
+
],
|
|
303
|
+
"country": [
|
|
304
|
+
"United States", "United Kingdom", "Canada", "Germany", "France",
|
|
305
|
+
"Japan", "Australia", "India", "Brazil", "Mexico",
|
|
306
|
+
"Italy", "Spain", "Netherlands", "Sweden", "Singapore",
|
|
307
|
+
"South Korea", "United Arab Emirates", "Switzerland", "Ireland", "Israel",
|
|
308
|
+
],
|
|
309
|
+
"address": [
|
|
310
|
+
"123 Main Street, Suite 100", "456 Oak Avenue, Floor 3",
|
|
311
|
+
"789 Innovation Drive", "1001 Tech Boulevard, Building A",
|
|
312
|
+
"2500 Market Street", "350 Fifth Avenue, 21st Floor",
|
|
313
|
+
"1600 Amphitheatre Parkway", "One Microsoft Way",
|
|
314
|
+
"410 Terry Avenue North", "1 Infinite Loop",
|
|
315
|
+
],
|
|
316
|
+
"company_name": [
|
|
317
|
+
"Acme Corporation", "TechVision Inc.", "Global Dynamics",
|
|
318
|
+
"Innovate Solutions", "Summit Technologies", "Blue Horizon Labs",
|
|
319
|
+
"Apex Industries", "Quantum Systems", "Pioneer Analytics",
|
|
320
|
+
"Stellar Ventures", "Nexus Consulting", "Atlas Enterprises",
|
|
321
|
+
"Synergy Partners", "Velocity Software", "Horizon Digital",
|
|
322
|
+
],
|
|
323
|
+
"industry": [
|
|
324
|
+
"Technology", "Healthcare", "Finance", "E-commerce", "Manufacturing",
|
|
325
|
+
"Education", "Real Estate", "Consulting", "Telecommunications",
|
|
326
|
+
"Energy", "Retail", "Media & Entertainment", "Transportation",
|
|
327
|
+
"Hospitality", "Insurance", "Pharmaceuticals", "Aerospace",
|
|
328
|
+
],
|
|
329
|
+
"feature_name": [
|
|
330
|
+
"Single Sign-On (SSO)", "Two-Factor Authentication", "Real-time Analytics",
|
|
331
|
+
"Custom Dashboards", "API Integration", "Advanced Reporting",
|
|
332
|
+
"Role-Based Access Control", "Automated Workflows", "Data Export",
|
|
333
|
+
"Mobile App Support", "Bulk Import", "Audit Logging",
|
|
334
|
+
"Collaborative Editing", "Version History", "Custom Branding",
|
|
335
|
+
],
|
|
336
|
+
"bug_type": [
|
|
337
|
+
"UI rendering issue", "Authentication failure", "Data sync error",
|
|
338
|
+
"Performance degradation", "Memory leak", "API timeout",
|
|
339
|
+
"Incorrect calculation", "Missing validation", "Broken link",
|
|
340
|
+
"Cross-browser compatibility", "Mobile responsiveness issue",
|
|
341
|
+
"Localization error", "Permission denied unexpectedly",
|
|
342
|
+
],
|
|
343
|
+
"api_endpoint": [
|
|
344
|
+
"/api/v1/users", "/api/v1/auth/login", "/api/v1/products",
|
|
345
|
+
"/api/v1/orders", "/api/v1/payments", "/api/v1/analytics",
|
|
346
|
+
"/api/v1/notifications", "/api/v1/settings", "/api/v1/search",
|
|
347
|
+
"/api/v1/reports", "/api/v1/webhooks", "/api/v1/integrations",
|
|
348
|
+
],
|
|
349
|
+
"skill": [
|
|
350
|
+
"Python", "JavaScript", "SQL", "Machine Learning", "Data Analysis",
|
|
351
|
+
"Project Management", "Communication", "Leadership", "Problem Solving",
|
|
352
|
+
"AWS", "Docker", "Kubernetes", "React", "Node.js", "TensorFlow",
|
|
353
|
+
"Agile/Scrum", "Public Speaking", "Negotiation", "Strategic Planning",
|
|
354
|
+
],
|
|
355
|
+
}
|
|
356
|
+
|
|
357
|
+
def __init__(
|
|
358
|
+
self,
|
|
359
|
+
provider: str = "groq",
|
|
360
|
+
api_key: Optional[str] = None,
|
|
361
|
+
cache_dir: Optional[str] = None,
|
|
362
|
+
):
|
|
363
|
+
"""
|
|
364
|
+
Initialize the smart value generator.
|
|
365
|
+
|
|
366
|
+
Args:
|
|
367
|
+
provider: LLM provider ("groq", "openai", "ollama")
|
|
368
|
+
api_key: API key for the provider
|
|
369
|
+
cache_dir: Directory to cache generated pools
|
|
370
|
+
"""
|
|
371
|
+
self.provider = provider
|
|
372
|
+
self.api_key = api_key or os.environ.get("GROQ_API_KEY")
|
|
373
|
+
self.cache_dir = Path(cache_dir) if cache_dir else Path.home() / ".misata" / "value_cache"
|
|
374
|
+
self.cache_dir.mkdir(parents=True, exist_ok=True)
|
|
375
|
+
|
|
376
|
+
# In-memory cache
|
|
377
|
+
self._pool_cache: Dict[str, List[str]] = {}
|
|
378
|
+
self._client = None
|
|
379
|
+
|
|
380
|
+
def _get_client(self):
|
|
381
|
+
"""Lazily initialize LLM client."""
|
|
382
|
+
if self._client is None:
|
|
383
|
+
if self.provider == "groq":
|
|
384
|
+
try:
|
|
385
|
+
from groq import Groq
|
|
386
|
+
self._client = Groq(api_key=self.api_key)
|
|
387
|
+
except ImportError:
|
|
388
|
+
return None
|
|
389
|
+
elif self.provider == "openai":
|
|
390
|
+
try:
|
|
391
|
+
from openai import OpenAI
|
|
392
|
+
self._client = OpenAI(api_key=self.api_key)
|
|
393
|
+
except ImportError:
|
|
394
|
+
return None
|
|
395
|
+
return self._client
|
|
396
|
+
|
|
397
|
+
def detect_domain(self, column_name: str, table_name: str = "") -> Optional[str]:
    """
    Detect semantic domain from column and table names.

    Names and patterns are compared case-insensitively with underscores
    treated as spaces, so the pattern "job_title" matches a column called
    "job_title", "Job_Title" or "job title". Matching is by substring on
    "<table> <column>". Domains are tried in the order they are declared
    in DOMAIN_PATTERNS and the first one with a matching pattern wins.

    Args:
        column_name: Name of the column
        table_name: Name of the table (optional context)

    Returns:
        Detected domain name or None
    """
    # Normalize names
    col_lower = column_name.lower().replace("_", " ")
    table_lower = table_name.lower().replace("_", " ")
    # The column text is a substring of this, so one containment test
    # covers both the column-only and the table+column case.
    combined = f"{table_lower} {col_lower}"

    for domain, patterns in self.DOMAIN_PATTERNS.items():
        # Patterns are declared in snake_case; apply the same normalization
        # as the names, otherwise multi-word patterns could never match.
        if any(pattern.lower().replace("_", " ") in combined for pattern in patterns):
            return domain

    return None
|
|
420
|
+
|
|
421
|
+
def _get_cache_key(self, domain: str, context: str, size: int) -> str:
|
|
422
|
+
"""Generate a cache key for a pool request."""
|
|
423
|
+
content = f"{domain}:{context}:{size}"
|
|
424
|
+
return hashlib.md5(content.encode()).hexdigest()[:12]
|
|
425
|
+
|
|
426
|
+
def _load_cached_pool(self, cache_key: str) -> Optional[List[str]]:
|
|
427
|
+
"""Load a pool from disk cache."""
|
|
428
|
+
cache_file = self.cache_dir / f"{cache_key}.json"
|
|
429
|
+
if cache_file.exists():
|
|
430
|
+
try:
|
|
431
|
+
with open(cache_file, 'r') as f:
|
|
432
|
+
return json.load(f)
|
|
433
|
+
except Exception:
|
|
434
|
+
pass
|
|
435
|
+
return None
|
|
436
|
+
|
|
437
|
+
def _save_pool_to_cache(self, cache_key: str, pool: List[str]) -> None:
|
|
438
|
+
"""Save a pool to disk cache."""
|
|
439
|
+
cache_file = self.cache_dir / f"{cache_key}.json"
|
|
440
|
+
try:
|
|
441
|
+
with open(cache_file, 'w') as f:
|
|
442
|
+
json.dump(pool, f)
|
|
443
|
+
except Exception:
|
|
444
|
+
pass
|
|
445
|
+
|
|
446
|
+
def generate_pool_with_llm(
    self,
    domain: str,
    context: str = "",
    size: int = 50,
) -> List[str]:
    """
    Generate a pool of realistic values using LLM.

    The curated pool for the domain (truncated to ``size``) is returned
    instead whenever the LLM cannot be used: no client is available, the
    provider is not "groq" or "openai", the request fails, or the reply
    does not contain a JSON array with at least one non-blank string.

    Args:
        domain: Semantic domain (e.g., "disease", "prescription")
        context: Additional context (e.g., "hospital emergency room")
        size: Number of values to generate

    Returns:
        List of generated values (at most ``size`` items)
    """
    fallback = self.FALLBACK_POOLS.get(domain, [])[:size]

    client = self._get_client()
    # Per-provider request options; both providers expose the same
    # chat.completions.create() call, only these arguments differ.
    request_options = {
        "groq": {"model": "llama-3.3-70b-versatile", "max_tokens": 2000},
        "openai": {"model": "gpt-4o-mini"},
    }.get(self.provider)
    if client is None or request_options is None:
        # Fall back to curated pools
        return fallback

    # Build prompt
    context_str = f" for a {context}" if context else ""
    prompt = f"""Generate exactly {size} realistic {domain.replace('_', ' ')} values{context_str}.

Requirements:
- Be specific and realistic (not generic placeholders)
- Include variety (different types, severities, categories)
- Use proper terminology for the domain
- Each value should be unique

Return ONLY a JSON array of strings, no explanation. Example:
["Value 1", "Value 2", "Value 3"]"""

    try:
        response = client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a domain expert generating realistic test data. Output only valid JSON."},
                {"role": "user", "content": prompt},
            ],
            temperature=0.7,
            **request_options,
        )
        content = (response.choices[0].message.content or "").strip()

        # Models often wrap the array in a markdown fence or add a line of
        # prose despite the instructions. Parsing only the outermost
        # [...] span tolerates both, with or without a closing fence.
        start = content.find("[")
        end = content.rfind("]")
        if start == -1 or end < start:
            raise ValueError("no JSON array found in LLM response")
        pool = json.loads(content[start:end + 1])

        # Keep only usable strings so the declared List[str] holds.
        values = []
        if isinstance(pool, list):
            values = [item for item in pool if isinstance(item, str) and item.strip()]
        return values[:size] if values else fallback

    except Exception as e:
        # Deliberately broad: any SDK, network or parsing failure should
        # degrade to curated data rather than abort data generation.
        print(f"LLM generation failed: {e}")
        return fallback
|
|
522
|
+
|
|
523
|
+
def get_pool(
    self,
    column_name: str,
    table_name: str = "",
    domain_hint: Optional[str] = None,
    context: str = "",
    size: int = 50,
    use_llm: bool = True,
) -> List[str]:
    """
    Get or create a value pool for a column.

    Lookup order: in-memory cache, disk cache, then generation. Curated
    fallback data is never written to disk, and is not cached at all when
    ``use_llm`` is False. Both caches are keyed only by domain, context
    and size, so a stored curated pool would otherwise be served for every
    later request and the LLM would never be asked again.

    Args:
        column_name: Name of the column
        table_name: Name of the table
        domain_hint: Explicit domain override
        context: Additional context for LLM
        size: Pool size
        use_llm: Whether to use LLM for generation

    Returns:
        List of domain-appropriate values; empty when no domain is
        detected or no data exists for the domain.
    """
    # Determine domain
    domain = domain_hint or self.detect_domain(column_name, table_name)
    if domain is None:
        # No domain detected, return empty
        return []

    # Build context string
    full_context = context or f"{table_name} {column_name}".strip()
    cache_key = self._get_cache_key(domain, full_context, size)

    # Check in-memory cache first
    if cache_key in self._pool_cache:
        return self._pool_cache[cache_key]

    # Check disk cache
    cached = self._load_cached_pool(cache_key)
    if cached:
        self._pool_cache[cache_key] = cached
        return cached

    curated = self.FALLBACK_POOLS.get(domain, [])[:size]
    if not use_llm:
        # A plain dictionary lookup needs no caching, and caching it would
        # shadow a later use_llm=True request for the same key.
        return curated

    pool = self.generate_pool_with_llm(domain, full_context, size)
    if pool:
        # Remember the result for this process so a failing LLM is not
        # retried on every call.
        self._pool_cache[cache_key] = pool
        # Persist only genuine LLM output: a result equal to the curated
        # pool means generation fell back, and writing it to disk would
        # make a transient failure permanent.
        if pool != curated:
            self._save_pool_to_cache(cache_key, pool)

    return pool
|
|
579
|
+
|
|
580
|
+
def get_fallback_pool(self, domain: str) -> List[str]:
    """Get curated fallback pool for a domain.

    Args:
        domain: Semantic domain name (a key of FALLBACK_POOLS)

    Returns:
        A new list with the curated values, or an empty list when the
        domain has no curated pool. A copy is returned so that callers may
        shuffle or extend it without altering the shared class-level data.
    """
    return list(self.FALLBACK_POOLS.get(domain, []))
|
|
583
|
+
|
|
584
|
+
def generate_with_template(
    self,
    template: str,
    size: int,
    components: Dict[str, List[str]],
) -> List[str]:
    """Generate text by substituting template components.

    This creates more variety by combining parts rather than
    picking from a fixed pool.

    Every occurrence of a placeholder is filled, each with its own random
    draw. Placeholders with no entry in ``components`` are left in the
    output unchanged. Components are applied in dictionary order.

    Args:
        template: String template with {component_name} placeholders
        size: Number of values to generate
        components: Dict mapping component names to value lists

    Returns:
        List of generated strings

    Raises:
        IndexError: If a placeholder present in the template maps to an
            empty value list.

    Example:
        template = "{first_name} {last_name}"
        components = {
            "first_name": ["John", "Jane", "Alex"],
            "last_name": ["Smith", "Johnson", "Williams"],
        }
        values = gen.generate_with_template(template, 100, components)
        # Returns: ["John Smith", "Jane Williams", "Alex Johnson", ...]
    """
    import random

    results = []
    for _ in range(size):
        text = template
        for key, values in components.items():
            placeholder = f"{{{key}}}"
            # Splitting on the placeholder finds every occurrence in one
            # pass and never re-scans text inserted by the substitution.
            segments = text.split(placeholder)
            if len(segments) == 1:
                continue
            filled = [segment + random.choice(values) for segment in segments[:-1]]
            text = "".join(filled) + segments[-1]
        results.append(text)

    return results
|
|
623
|
+
|
|
624
|
+
def generate_composite_pool(
|
|
625
|
+
self,
|
|
626
|
+
domain: str,
|
|
627
|
+
size: int = 200,
|
|
628
|
+
) -> List[str]:
|
|
629
|
+
"""Generate larger pools using template composition.
|
|
630
|
+
|
|
631
|
+
Instead of calling LLM for 200 values, we compose
|
|
632
|
+
templates with varied components.
|
|
633
|
+
|
|
634
|
+
Args:
|
|
635
|
+
domain: Semantic domain
|
|
636
|
+
size: Target pool size
|
|
637
|
+
|
|
638
|
+
Returns:
|
|
639
|
+
List of composed values
|
|
640
|
+
"""
|
|
641
|
+
import random
|
|
642
|
+
|
|
643
|
+
# Domain-specific templates
|
|
644
|
+
templates = {
|
|
645
|
+
"address": {
|
|
646
|
+
"template": "{number} {street_name} {street_type}, {city}, {state}",
|
|
647
|
+
"components": {
|
|
648
|
+
"number": [str(i) for i in range(100, 10000)],
|
|
649
|
+
"street_name": ["Oak", "Maple", "Cedar", "Pine", "Elm", "Birch", "Walnut", "Cherry", "Willow", "Aspen",
|
|
650
|
+
"Main", "First", "Second", "Third", "Park", "Lake", "River", "Hill", "Valley", "Spring"],
|
|
651
|
+
"street_type": ["Street", "Avenue", "Boulevard", "Lane", "Drive", "Court", "Place", "Road", "Way", "Circle"],
|
|
652
|
+
"city": ["Springfield", "Riverside", "Franklin", "Georgetown", "Clinton", "Salem", "Madison", "Bristol", "Fairview", "Newport"],
|
|
653
|
+
"state": ["CA", "TX", "NY", "FL", "IL", "PA", "OH", "GA", "MI", "NC", "WA", "CO", "AZ", "MA", "VA"],
|
|
654
|
+
},
|
|
655
|
+
},
|
|
656
|
+
"email": {
|
|
657
|
+
"template": "{name_part}{separator}{domain_part}@{provider}.{tld}",
|
|
658
|
+
"components": {
|
|
659
|
+
"name_part": ["john", "jane", "alex", "sam", "chris", "pat", "taylor", "jordan", "casey", "morgan",
|
|
660
|
+
"mike", "lisa", "david", "emma", "ryan", "kate", "nick", "amy", "steve", "jen"],
|
|
661
|
+
"separator": ["", ".", "_", ""],
|
|
662
|
+
"domain_part": ["smith", "jones", "work", "mail", "pro", "dev", "biz", "123", "2024", "online"],
|
|
663
|
+
"provider": ["gmail", "yahoo", "outlook", "hotmail", "icloud", "proton", "fastmail", "zoho"],
|
|
664
|
+
"tld": ["com", "com", "com", "org", "net", "io", "co"],
|
|
665
|
+
},
|
|
666
|
+
},
|
|
667
|
+
"product": {
|
|
668
|
+
"template": "{adjective} {material} {item_type} - {size_color}",
|
|
669
|
+
"components": {
|
|
670
|
+
"adjective": ["Premium", "Ultra", "Pro", "Classic", "Modern", "Sleek", "Essential", "Deluxe", "Elite", "Smart"],
|
|
671
|
+
"material": ["Stainless Steel", "Bamboo", "Ceramic", "Leather", "Cotton", "Titanium", "Wood", "Glass", "Silicone", "Carbon Fiber"],
|
|
672
|
+
"item_type": ["Water Bottle", "Phone Case", "Backpack", "Wallet", "Watch Band", "Desk Lamp", "Speaker", "Charging Dock", "Notebook", "Organizer"],
|
|
673
|
+
"size_color": ["Black/Large", "White/Medium", "Navy/Standard", "Gray/Compact", "Red/XL", "Brown/Regular", "Silver/Slim", "Green/Mini"],
|
|
674
|
+
},
|
|
675
|
+
},
|
|
676
|
+
"company_name": {
|
|
677
|
+
"template": "{prefix} {industry_word} {suffix}",
|
|
678
|
+
"components": {
|
|
679
|
+
"prefix": ["Nova", "Apex", "Prime", "Vertex", "Quantum", "Fusion", "Nexus", "Stellar", "Vector", "Atlas",
|
|
680
|
+
"Blue", "Red", "Green", "Global", "United", "First", "New", "Smart", "Tech", "Digital"],
|
|
681
|
+
"industry_word": ["Solutions", "Systems", "Tech", "Labs", "Works", "Group", "Partners", "Dynamics", "Innovations", "Ventures",
|
|
682
|
+
"Digital", "Logic", "Flow", "Wave", "Net", "Cloud", "Data", "Edge", "Core", "Sync"],
|
|
683
|
+
"suffix": ["Inc", "Corp", "LLC", "Co", "Ltd", "GmbH", "Technologies", "International", "Enterprises", "Holdings"],
|
|
684
|
+
},
|
|
685
|
+
},
|
|
686
|
+
}
|
|
687
|
+
|
|
688
|
+
if domain in templates:
|
|
689
|
+
config = templates[domain]
|
|
690
|
+
return self.generate_with_template(
|
|
691
|
+
config["template"],
|
|
692
|
+
size,
|
|
693
|
+
config["components"]
|
|
694
|
+
)
|
|
695
|
+
|
|
696
|
+
# Fall back to curated pool with random sampling
|
|
697
|
+
base_pool = self.FALLBACK_POOLS.get(domain, [])
|
|
698
|
+
if len(base_pool) >= size:
|
|
699
|
+
return random.sample(base_pool, size)
|
|
700
|
+
elif len(base_pool) > 0:
|
|
701
|
+
# Repeat with slight variations
|
|
702
|
+
result = []
|
|
703
|
+
for i in range(size):
|
|
704
|
+
base = random.choice(base_pool)
|
|
705
|
+
if random.random() < 0.3: # 30% chance to add suffix
|
|
706
|
+
suffix = random.choice([" (v2)", " Pro", " Plus", " - Updated", " 2.0", ""])
|
|
707
|
+
base = base + suffix
|
|
708
|
+
result.append(base)
|
|
709
|
+
return result
|
|
710
|
+
|
|
711
|
+
return []
|
|
712
|
+
|
|
713
|
+
|
|
714
|
+
# ============ Template Registry ============
|
|
715
|
+
|
|
716
|
+
# Identifier-style format strings, keyed by the kind of value they produce.
# Each {placeholder} names a component list; TEMPLATE_COMPONENTS carries an
# entry under the same key.
COMPOSITION_TEMPLATES = dict(
    order_id="{prefix}-{year}-{number}",
    invoice_number="INV-{year}{month}-{number}",
    tracking_number="{carrier}{number}{check}",
    sku="{category}-{brand}-{variant}-{size}",
    username="{adjective}{noun}{number}",
)
|
|
723
|
+
|
|
724
|
+
# Candidate values for each placeholder, grouped under the name of the
# template they belong to (order_id, invoice_number, tracking_number, sku,
# username).
TEMPLATE_COMPONENTS = {
    "order_id": {
        "prefix": ["ORD", "SO", "PO", "WO", "REQ"],
        "year": ["2023", "2024", "2025"],
        # Zero-padded to six digits: "000001" .. "000999".
        "number": [f"{n:06d}" for n in range(1, 1000)],
    },
    "invoice_number": {
        "year": ["23", "24", "25"],
        # Two-digit months "01" .. "12".
        "month": [f"{m:02d}" for m in range(1, 13)],
        # Zero-padded to four digits: "0001" .. "9999".
        "number": [f"{n:04d}" for n in range(1, 10000)],
    },
    "tracking_number": {
        "carrier": ["1Z", "9400", "92", "420"],
        # NOTE(review): this materialises one million 12-digit strings when
        # the module is imported; worth confirming that cost is intended.
        "number": [str(n) for n in range(100000000000, 100001000000)],
        "check": [str(digit) for digit in range(10)],
    },
    "sku": {
        "category": ["ELC", "CLO", "HOM", "SPT", "TOY", "BOK"],
        "brand": ["APP", "SAM", "NIK", "ADI", "SON", "LG"],
        "variant": ["BLK", "WHT", "RED", "BLU", "GRN", "GRY"],
        "size": ["S", "M", "L", "XL", "XXL", "OS"],
    },
    "username": {
        "adjective": [
            "cool", "super", "mega", "ultra", "epic",
            "pro", "fast", "swift", "bold", "smart",
        ],
        "noun": [
            "ninja", "tiger", "dragon", "wolf", "hawk",
            "bear", "lion", "eagle", "shark", "fox",
        ],
        "number": [str(n) for n in range(1, 1000)],
    },
}
|
|
752
|
+
|
|
753
|
+
|
|
754
|
+
# Convenience function for quick testing
|
|
755
|
+
def smart_generate(column_name: str, table_name: str = "", size: int = 10) -> List[str]:
    """Return ``size`` values for a column, drawn from a freshly built pool.

    Creates a new ``SmartValueGenerator``, asks it for a pool of
    ``max(size * 2, 50)`` candidates, and samples from that pool with
    replacement, so the result may contain repeats.

    Args:
        column_name: Column name forwarded to ``get_pool``.
        table_name: Table name forwarded to ``get_pool``.
        size: Number of values to return.

    Returns:
        A list of ``size`` values, or an empty list when no pool is
        available.
    """
    generator = SmartValueGenerator()
    requested = max(size * 2, 50)
    candidates = generator.get_pool(column_name, table_name, size=requested)
    if not candidates:
        return []

    import random

    return random.choices(candidates, k=size)
|