misata 0.2.0b0__py3-none-any.whl → 0.3.0b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
misata/profiles.py ADDED
@@ -0,0 +1,332 @@
1
+ """
2
+ Distribution Profiles for Realistic Data Generation.
3
+
4
+ Pre-configured distribution parameters that match real-world patterns
5
+ for common data types like age, salary, prices, etc.
6
+ """
7
+
8
+ from typing import Any, Dict, List, Optional, Union
9
+ import numpy as np
10
+
11
+
12
class DistributionProfile:
    """A named distribution configuration for realistic generation.

    ``distribution`` selects the sampler and ``params`` supplies its
    settings. Keys read per distribution (default used when a key is
    missing shown in parentheses):

    * ``"normal"``: ``mean`` (50), ``std`` (10)
    * ``"lognormal"``: ``mean`` (0), ``sigma`` (1)
    * ``"exponential"``: ``scale`` (1.0)
    * ``"pareto"``: ``alpha`` (2.0), ``min`` (1.0)
    * ``"beta"``: ``a`` (2), ``b`` (5), ``scale`` (1.0)
    * ``"mixture"``: ``components``, a list of dicts with ``mean`` (0),
      ``std`` (1) and ``weight`` (1); weights are normalised to sum to 1
    * ``"zipf"``: ``alpha`` (2.0)
    * ``"uniform"``: ``min`` (0), ``max`` (100)

    Any other distribution name falls back to ``uniform(0, 100)``.

    After sampling, values are clamped to ``min_value`` / ``max_value``
    (when set) and rounded to ``decimals`` places (when set). Clamping
    happens before rounding.

    Example:
        profile = DistributionProfile(
            name="age",
            distribution="mixture",
            params={
                "components": [
                    {"mean": 35, "std": 12, "weight": 0.6},  # Working age
                    {"mean": 70, "std": 8, "weight": 0.2},   # Retirees
                    {"mean": 12, "std": 4, "weight": 0.2},   # Children
                ]
            }
        )
        values = profile.generate(1000)
    """

    def __init__(
        self,
        name: str,
        distribution: str,
        params: Dict[str, Any],
        min_value: Optional[float] = None,
        max_value: Optional[float] = None,
        decimals: Optional[int] = None,
    ):
        """Store the profile configuration; nothing is sampled here.

        Args:
            name: Identifier of the profile (used as the registry key).
            distribution: Name of the sampler to use in ``generate``.
            params: Distribution-specific settings (see class docstring).
            min_value: Optional lower clamp applied to generated values.
            max_value: Optional upper clamp applied to generated values.
            decimals: Optional number of decimals to round results to.
        """
        self.name = name
        self.distribution = distribution
        self.params = params
        self.min_value = min_value
        self.max_value = max_value
        self.decimals = decimals

    def generate(
        self,
        size: int,
        rng: Optional[np.random.Generator] = None
    ) -> np.ndarray:
        """Generate values according to this profile.

        Args:
            size: Number of values to draw.
            rng: Generator to draw from; a fresh ``np.random.default_rng()``
                is created when omitted, so pass a seeded generator for
                reproducible output.

        Returns:
            Array of sampled values after clamping and rounding.

        Raises:
            ValueError: If a mixture profile has negative or non-finite
                weights, or weights that do not sum to a positive number.
        """
        if rng is None:
            rng = np.random.default_rng()

        params = self.params
        kind = self.distribution

        if kind == "normal":
            values = rng.normal(params.get("mean", 50), params.get("std", 10), size)

        elif kind == "lognormal":
            values = rng.lognormal(params.get("mean", 0), params.get("sigma", 1), size)

        elif kind == "exponential":
            values = rng.exponential(params.get("scale", 1.0), size)

        elif kind == "pareto":
            alpha = params.get("alpha", 2.0)
            min_val = params.get("min", 1.0)
            # rng.pareto starts at 0; shifting by 1 and scaling moves the
            # smallest possible value to ``min_val``.
            values = (rng.pareto(alpha, size) + 1) * min_val

        elif kind == "beta":
            scale = params.get("scale", 1.0)
            values = rng.beta(params.get("a", 2), params.get("b", 5), size) * scale

        elif kind == "mixture":
            values = self._sample_mixture(size, rng)

        elif kind == "zipf":
            # Zipf distribution for long-tail data
            values = rng.zipf(params.get("alpha", 2.0), size).astype(float)

        elif kind == "uniform":
            values = rng.uniform(params.get("min", 0), params.get("max", 100), size)

        else:
            # Unknown distribution names default to uniform
            values = rng.uniform(0, 100, size)

        return self._apply_constraints(values)

    def _sample_mixture(self, size: int, rng: np.random.Generator) -> np.ndarray:
        """Draw ``size`` values from the Gaussian mixture in ``params``."""
        components = self.params.get("components", [])
        if not components:
            return rng.normal(0, 1, size)

        weights = np.array([c.get("weight", 1) for c in components], dtype=float)
        # Validate before normalising: an all-zero weight vector would
        # otherwise divide by zero and surface as an opaque NaN error
        # from rng.choice.
        if not np.all(np.isfinite(weights)) or np.any(weights < 0):
            raise ValueError(
                f"Profile '{self.name}': mixture component weight values "
                "must be finite and non-negative"
            )
        total = weights.sum()
        if total <= 0:
            raise ValueError(
                f"Profile '{self.name}': mixture component weight values "
                "must sum to a positive number"
            )
        weights = weights / total

        # Sample which component each value comes from, then fill each
        # component's slots with draws from its own normal distribution.
        component_indices = rng.choice(len(components), size=size, p=weights)
        values = np.zeros(size)
        for i, comp in enumerate(components):
            mask = component_indices == i
            n = int(mask.sum())
            if n > 0:
                values[mask] = rng.normal(
                    comp.get("mean", 0),
                    comp.get("std", 1),
                    n,
                )
        return values

    def _apply_constraints(self, values: np.ndarray) -> np.ndarray:
        """Clamp to the configured bounds, then round to ``decimals``."""
        if self.min_value is not None:
            values = np.maximum(values, self.min_value)
        if self.max_value is not None:
            values = np.minimum(values, self.max_value)
        if self.decimals is not None:
            values = np.round(values, self.decimals)
        return values
128
+
129
+
130
+ # ============ Pre-built Profiles ============
131
+
132
# Registry of the built-in profiles, keyed by profile name.
PROFILES: Dict[str, DistributionProfile] = dict()


def _register_profile(profile: DistributionProfile) -> None:
    """Store ``profile`` in ``PROFILES`` under its own ``name``.

    A profile registered later under the same name replaces the earlier one.
    """
    PROFILES.update({profile.name: profile})
138
+
139
+
140
+ # Age distributions
141
# Adults only: three overlapping age groups, clamped to 18-100.
_register_profile(DistributionProfile(
    "age_adult",
    "mixture",
    {
        "components": [
            {"mean": mean, "std": std, "weight": weight}
            for mean, std, weight in (
                (28, 6, 0.3),    # Young adults
                (42, 10, 0.45),  # Middle age
                (62, 8, 0.25),   # Older adults
            )
        ]
    },
    min_value=18,
    max_value=100,
    decimals=0,
))

# Whole population, children included, clamped to 0-105.
_register_profile(DistributionProfile(
    "age_population",
    "mixture",
    {
        "components": [
            {"mean": mean, "std": std, "weight": weight}
            for mean, std, weight in (
                (8, 4, 0.15),    # Children
                (25, 8, 0.25),   # Young adults
                (42, 12, 0.35),  # Middle age
                (68, 10, 0.25),  # Seniors
            )
        ]
    },
    min_value=0,
    max_value=105,
    decimals=0,
))
171
+
172
+ # Salary distributions
173
# General salaries: log-normal, exp(11.0) puts the median near $60k.
_register_profile(DistributionProfile(
    "salary_usd",
    "lognormal",
    dict(mean=11.0, sigma=0.5),
    min_value=25000,
    max_value=500000,
    decimals=0,
))

# Tech salaries: one normal component per seniority band.
_register_profile(DistributionProfile(
    "salary_tech",
    "mixture",
    {
        "components": [
            {"mean": mean, "std": std, "weight": weight}
            for mean, std, weight in (
                (75000, 15000, 0.2),   # Junior
                (120000, 25000, 0.4),  # Mid
                (180000, 40000, 0.3),  # Senior
                (280000, 60000, 0.1),  # Staff+
            )
        ]
    },
    min_value=50000,
    max_value=600000,
    decimals=0,
))
197
+
198
+ # Price distributions
199
# Retail prices: log-normal, exp(3.5) puts the median near $30.
_register_profile(DistributionProfile(
    "price_retail",
    "lognormal",
    dict(mean=3.5, sigma=1.2),
    min_value=0.99,
    max_value=10000,
    decimals=2,
))

# SaaS prices: one normal component per pricing tier.
_register_profile(DistributionProfile(
    "price_saas",
    "mixture",
    {
        "components": [
            {"mean": mean, "std": std, "weight": weight}
            for mean, std, weight in (
                (15, 5, 0.3),      # Basic tier
                (49, 15, 0.4),     # Pro tier
                (199, 50, 0.25),   # Enterprise
                (999, 200, 0.05),  # Custom
            )
        ]
    },
    min_value=0,
    max_value=5000,
    decimals=0,
))
223
+
224
+ # Transaction amounts
225
# Heavy-tailed transaction amounts starting at the Pareto minimum of 10.
_register_profile(DistributionProfile(
    "transaction_amount",
    "pareto",
    dict(alpha=2.5, min=10),
    min_value=1,
    max_value=100000,
    decimals=2,
))

# Counts / quantities
_register_profile(DistributionProfile(
    "order_quantity",
    "zipf",
    dict(alpha=2.0),
    min_value=1,
    max_value=100,
    decimals=0,
))

# Time-related: log-normal, exp(5.5) puts the median near 4 minutes.
_register_profile(DistributionProfile(
    "session_duration_seconds",
    "lognormal",
    dict(mean=5.5, sigma=1.5),
    min_value=1,
    max_value=7200,  # 2 hours max
    decimals=0,
))
253
+
254
+ # Ratings and scores
255
# Star ratings: beta(5, 2) scaled to 0-5, skewed towards higher ratings.
_register_profile(DistributionProfile(
    "rating_5star",
    "beta",
    dict(a=5, b=2, scale=5),
    min_value=1,
    max_value=5,
    decimals=1,
))

# NPS answers on the 0-10 scale: one normal component per respondent group.
_register_profile(DistributionProfile(
    "nps_score",
    "mixture",
    {
        "components": [
            {"mean": mean, "std": std, "weight": weight}
            for mean, std, weight in (
                (3, 2, 0.15),   # Detractors
                (7, 1, 0.25),   # Passives
                (9, 0.8, 0.6),  # Promoters
            )
        ]
    },
    min_value=0,
    max_value=10,
    decimals=0,
))
278
+
279
+ # Percentages
280
# Conversion rate in percent: beta(2, 50) scaled to 0-100, low conversion (1-5%).
_register_profile(DistributionProfile(
    "conversion_rate",
    "beta",
    dict(a=2, b=50, scale=100),
    min_value=0,
    max_value=100,
    decimals=2,
))

# Churn rate in percent: beta(1.5, 30) scaled to 0-100, ~5% typical.
_register_profile(DistributionProfile(
    "churn_rate",
    "beta",
    dict(a=1.5, b=30, scale=100),
    min_value=0,
    max_value=100,
    decimals=2,
))
297
+
298
+
299
def get_profile(name: str) -> Optional[DistributionProfile]:
    """Look up a registered profile; return ``None`` when the name is unknown."""
    try:
        return PROFILES[name]
    except KeyError:
        return None
302
+
303
+
304
def list_profiles() -> List[str]:
    """Return the names of all registered profiles, in registration order."""
    return [profile_name for profile_name in PROFILES]
307
+
308
+
309
def generate_with_profile(
    profile_name: str,
    size: int,
    rng: Optional[np.random.Generator] = None
) -> np.ndarray:
    """Draw ``size`` values from the registered profile ``profile_name``.

    Args:
        profile_name: Name of the profile (e.g., "salary_tech")
        size: Number of values to generate
        rng: Random number generator, forwarded to the profile's
            ``generate`` method

    Returns:
        Array of generated values

    Raises:
        ValueError: If no profile is registered under ``profile_name``;
            the message lists the available names.
    """
    selected = get_profile(profile_name)
    if selected is not None:
        return selected.generate(size, rng)

    available = ", ".join(list_profiles())
    raise ValueError(f"Unknown profile: {profile_name}. Available: {available}")
misata/smart_values.py CHANGED
@@ -580,14 +580,183 @@ Return ONLY a JSON array of strings, no explanation. Example:
580
580
  def get_fallback_pool(self, domain: str) -> List[str]:
581
581
  """Get curated fallback pool for a domain."""
582
582
  return self.FALLBACK_POOLS.get(domain, [])
583
+
584
def generate_with_template(
    self,
    template: str,
    size: int,
    components: Dict[str, List[str]],
) -> List[str]:
    """Generate text by substituting template components.

    This creates more variety by combining parts rather than
    picking from a fixed pool.

    Every occurrence of a ``{component_name}`` placeholder is replaced,
    and each occurrence gets its own independent random draw, so a
    template such as ``"{word}-{word}"`` never leaks a literal
    placeholder into the output. Placeholders with no entry in
    ``components`` are left untouched, and entries in ``components``
    that the template does not use are ignored.

    Args:
        template: String template with {component_name} placeholders
        size: Number of values to generate
        components: Dict mapping component names to value lists

    Returns:
        List of generated strings

    Raises:
        IndexError: If a placeholder used by the template maps to an
            empty value list.

    Example:
        template = "{first_name} {last_name}"
        components = {
            "first_name": ["John", "Jane", "Alex"],
            "last_name": ["Smith", "Johnson", "Williams"],
        }
        values = gen.generate_with_template(template, 100, components)
        # Returns: ["John Smith", "Jane Williams", "Alex Johnson", ...]
    """
    import random

    # Build the "{key}" markers once instead of once per generated row.
    placeholders = [(f"{{{key}}}", values) for key, values in components.items()]

    results = []
    for _ in range(size):
        text = template
        for placeholder, values in placeholders:
            segments = text.split(placeholder)
            if len(segments) == 1:
                continue  # placeholder does not occur in the text
            # Re-join the segments with a fresh random value in every gap.
            pieces = [segments[0]]
            for segment in segments[1:]:
                pieces.append(random.choice(values))
                pieces.append(segment)
            text = "".join(pieces)
        results.append(text)

    return results
623
+
624
def generate_composite_pool(
    self,
    domain: str,
    size: int = 200,
) -> List[str]:
    """Build a pool of values for ``domain`` without calling the LLM.

    Domains with a built-in recipe ("address", "email", "product",
    "company_name") are composed through ``generate_with_template``.
    Any other domain is served from ``FALLBACK_POOLS``: sampled without
    replacement when the curated pool holds at least ``size`` entries,
    otherwise drawn with repetition and an occasional suffix.

    Args:
        domain: Semantic domain
        size: Target pool size

    Returns:
        List of composed values; empty when the domain has neither a
        recipe nor a fallback pool.
    """
    import random

    # Domain-specific recipes as (template, components) pairs.
    recipes = {
        "address": (
            "{number} {street_name} {street_type}, {city}, {state}",
            {
                "number": list(map(str, range(100, 10000))),
                "street_name": [
                    "Oak", "Maple", "Cedar", "Pine", "Elm",
                    "Birch", "Walnut", "Cherry", "Willow", "Aspen",
                    "Main", "First", "Second", "Third", "Park",
                    "Lake", "River", "Hill", "Valley", "Spring",
                ],
                "street_type": [
                    "Street", "Avenue", "Boulevard", "Lane", "Drive",
                    "Court", "Place", "Road", "Way", "Circle",
                ],
                "city": [
                    "Springfield", "Riverside", "Franklin", "Georgetown", "Clinton",
                    "Salem", "Madison", "Bristol", "Fairview", "Newport",
                ],
                "state": [
                    "CA", "TX", "NY", "FL", "IL", "PA", "OH", "GA",
                    "MI", "NC", "WA", "CO", "AZ", "MA", "VA",
                ],
            },
        ),
        "email": (
            "{name_part}{separator}{domain_part}@{provider}.{tld}",
            {
                "name_part": [
                    "john", "jane", "alex", "sam", "chris",
                    "pat", "taylor", "jordan", "casey", "morgan",
                    "mike", "lisa", "david", "emma", "ryan",
                    "kate", "nick", "amy", "steve", "jen",
                ],
                # Repeated entries make that choice more likely.
                "separator": ["", ".", "_", ""],
                "domain_part": [
                    "smith", "jones", "work", "mail", "pro",
                    "dev", "biz", "123", "2024", "online",
                ],
                "provider": [
                    "gmail", "yahoo", "outlook", "hotmail",
                    "icloud", "proton", "fastmail", "zoho",
                ],
                "tld": ["com", "com", "com", "org", "net", "io", "co"],
            },
        ),
        "product": (
            "{adjective} {material} {item_type} - {size_color}",
            {
                "adjective": [
                    "Premium", "Ultra", "Pro", "Classic", "Modern",
                    "Sleek", "Essential", "Deluxe", "Elite", "Smart",
                ],
                "material": [
                    "Stainless Steel", "Bamboo", "Ceramic", "Leather", "Cotton",
                    "Titanium", "Wood", "Glass", "Silicone", "Carbon Fiber",
                ],
                "item_type": [
                    "Water Bottle", "Phone Case", "Backpack", "Wallet", "Watch Band",
                    "Desk Lamp", "Speaker", "Charging Dock", "Notebook", "Organizer",
                ],
                "size_color": [
                    "Black/Large", "White/Medium", "Navy/Standard", "Gray/Compact",
                    "Red/XL", "Brown/Regular", "Silver/Slim", "Green/Mini",
                ],
            },
        ),
        "company_name": (
            "{prefix} {industry_word} {suffix}",
            {
                "prefix": [
                    "Nova", "Apex", "Prime", "Vertex", "Quantum",
                    "Fusion", "Nexus", "Stellar", "Vector", "Atlas",
                    "Blue", "Red", "Green", "Global", "United",
                    "First", "New", "Smart", "Tech", "Digital",
                ],
                "industry_word": [
                    "Solutions", "Systems", "Tech", "Labs", "Works",
                    "Group", "Partners", "Dynamics", "Innovations", "Ventures",
                    "Digital", "Logic", "Flow", "Wave", "Net",
                    "Cloud", "Data", "Edge", "Core", "Sync",
                ],
                "suffix": [
                    "Inc", "Corp", "LLC", "Co", "Ltd",
                    "GmbH", "Technologies", "International", "Enterprises", "Holdings",
                ],
            },
        ),
    }

    recipe = recipes.get(domain)
    if recipe is not None:
        pattern, parts = recipe
        return self.generate_with_template(pattern, size, parts)

    # No recipe: fall back to the curated pool with random sampling.
    base_pool = self.FALLBACK_POOLS.get(domain, [])
    available = len(base_pool)
    if available >= size:
        return random.sample(base_pool, size)
    if available == 0:
        return []

    decorations = [" (v2)", " Pro", " Plus", " - Updated", " 2.0", ""]

    def _pick_with_variation() -> str:
        """Pick a base value; 30% of the time append a random decoration."""
        chosen = random.choice(base_pool)
        if random.random() < 0.3:
            chosen = chosen + random.choice(decorations)
        return chosen

    # The pool is smaller than requested, so values repeat with slight variations.
    return [_pick_with_variation() for _ in range(size)]
712
+
713
+
714
+ # ============ Template Registry ============
715
+
716
# Identifier kind -> template string with {component} placeholders.
COMPOSITION_TEMPLATES = dict(
    order_id="{prefix}-{year}-{number}",
    invoice_number="INV-{year}{month}-{number}",
    tracking_number="{carrier}{number}{check}",
    sku="{category}-{brand}-{variant}-{size}",
    username="{adjective}{noun}{number}",
)
723
+
724
# Identifier kind -> {placeholder name -> candidate values}.
TEMPLATE_COMPONENTS = {
    "order_id": {
        "prefix": "ORD SO PO WO REQ".split(),
        "year": ["2023", "2024", "2025"],
        # Zero-padded to six digits: "000001" .. "000999".
        "number": [f"{n:06d}" for n in range(1, 1000)],
    },
    "invoice_number": {
        "year": ["23", "24", "25"],
        "month": [f"{m:02d}" for m in range(1, 13)],
        # Zero-padded to four digits: "0001" .. "9999".
        "number": [f"{n:04d}" for n in range(1, 10000)],
    },
    "tracking_number": {
        "carrier": ["1Z", "9400", "92", "420"],
        # NOTE(review): this materialises 1,000,000 strings at import time.
        "number": [f"{n:012d}" for n in range(100000000000, 100001000000)],
        "check": list("0123456789"),
    },
    "sku": {
        "category": "ELC CLO HOM SPT TOY BOK".split(),
        "brand": "APP SAM NIK ADI SON LG".split(),
        "variant": "BLK WHT RED BLU GRN GRY".split(),
        "size": "S M L XL XXL OS".split(),
    },
    "username": {
        "adjective": "cool super mega ultra epic pro fast swift bold smart".split(),
        "noun": "ninja tiger dragon wolf hawk bear lion eagle shark fox".split(),
        "number": list(map(str, range(1, 1000))),
    },
}
583
752
 
584
753
 
585
754
  # Convenience function for quick testing
586
755
  def smart_generate(column_name: str, table_name: str = "", size: int = 10) -> List[str]:
587
756
  """Quick smart value generation for testing."""
588
757
  gen = SmartValueGenerator()
589
- pool = gen.get_pool(column_name, table_name)
758
+ pool = gen.get_pool(column_name, table_name, size=max(size * 2, 50))
590
759
  if pool:
591
760
  import random
592
- return random.sample(pool, min(size, len(pool)))
761
+ return random.choices(pool, k=size)
593
762
  return []