misata 0.2.0b0__py3-none-any.whl → 0.3.1b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
misata/profiles.py ADDED
@@ -0,0 +1,332 @@
1
+ """
2
+ Distribution Profiles for Realistic Data Generation.
3
+
4
+ Pre-configured distribution parameters that match real-world patterns
5
+ for common data types like age, salary, prices, etc.
6
+ """
7
+
8
+ from typing import Any, Dict, List, Optional, Union
9
+ import numpy as np
10
+
11
+
12
class DistributionProfile:
    """A named, reusable distribution configuration for realistic generation.

    Example:
        profile = DistributionProfile(
            name="age",
            distribution="mixture",
            params={
                "components": [
                    {"mean": 35, "std": 12, "weight": 0.6},  # Working age
                    {"mean": 70, "std": 8, "weight": 0.2},   # Retirees
                    {"mean": 12, "std": 4, "weight": 0.2},   # Children
                ]
            },
        )
        values = profile.generate(1000)
    """

    def __init__(
        self,
        name: str,
        distribution: str,
        params: Dict[str, Any],
        min_value: Optional[float] = None,
        max_value: Optional[float] = None,
        decimals: Optional[int] = None,
    ):
        # Identifier the profile is registered/looked up under.
        self.name = name
        # Distribution family: normal, lognormal, exponential, pareto, beta,
        # mixture, zipf or uniform (anything else falls back to uniform).
        self.distribution = distribution
        self.params = params
        self.min_value = min_value
        self.max_value = max_value
        self.decimals = decimals

    def generate(
        self,
        size: int,
        rng: Optional[np.random.Generator] = None
    ) -> np.ndarray:
        """Draw ``size`` values according to this profile's configuration."""
        if rng is None:
            rng = np.random.default_rng()

        p = self.params
        kind = self.distribution

        if kind == "normal":
            raw = rng.normal(p.get("mean", 50), p.get("std", 10), size)
        elif kind == "lognormal":
            raw = rng.lognormal(p.get("mean", 0), p.get("sigma", 1), size)
        elif kind == "exponential":
            raw = rng.exponential(p.get("scale", 1.0), size)
        elif kind == "pareto":
            # Shift by one and scale so the support starts at params["min"].
            raw = (rng.pareto(p.get("alpha", 2.0), size) + 1) * p.get("min", 1.0)
        elif kind == "beta":
            raw = rng.beta(p.get("a", 2), p.get("b", 5), size) * p.get("scale", 1.0)
        elif kind == "mixture":
            raw = self._sample_mixture(p.get("components", []), size, rng)
        elif kind == "zipf":
            # Long-tail integer counts, cast to float for uniform downstream handling.
            raw = rng.zipf(p.get("alpha", 2.0), size).astype(float)
        elif kind == "uniform":
            raw = rng.uniform(p.get("min", 0), p.get("max", 100), size)
        else:
            # Unknown family: default to uniform over [0, 100).
            raw = rng.uniform(0, 100, size)

        return self._apply_constraints(raw)

    def _sample_mixture(self, components, size, rng):
        """Sample a Gaussian mixture; an empty component list yields N(0, 1)."""
        if not components:
            return rng.normal(0, 1, size)

        raw_weights = np.array([c.get("weight", 1) for c in components])
        probs = raw_weights / raw_weights.sum()

        # Assign each sample to a component, then fill each group from
        # that component's Gaussian.
        labels = rng.choice(
            len(components), size=size, p=probs
        )

        out = np.zeros(size)
        for idx, comp in enumerate(components):
            group = labels == idx
            count = group.sum()
            if count > 0:
                out[group] = rng.normal(
                    comp.get("mean", 0),
                    comp.get("std", 1),
                    count
                )
        return out

    def _apply_constraints(self, values: np.ndarray) -> np.ndarray:
        """Clamp to [min_value, max_value] and round to ``decimals`` if set."""
        if self.min_value is not None:
            values = np.maximum(values, self.min_value)
        if self.max_value is not None:
            values = np.minimum(values, self.max_value)
        if self.decimals is not None:
            values = np.round(values, self.decimals)
        return values
128
+
129
+
130
# ============ Pre-built Profiles ============

# Registry of ready-made profiles, keyed by profile name.
PROFILES: Dict[str, DistributionProfile] = {}


def _register_profile(profile: DistributionProfile) -> None:
    """Register a profile by name."""
    PROFILES[profile.name] = profile


# All built-in profiles, grouped by domain; registered in order below.
_BUILTIN_PROFILES = (
    # --- Age distributions ---
    DistributionProfile(
        name="age_adult",
        distribution="mixture",
        params={
            "components": [
                {"mean": 28, "std": 6, "weight": 0.3},    # Young adults
                {"mean": 42, "std": 10, "weight": 0.45},  # Middle age
                {"mean": 62, "std": 8, "weight": 0.25},   # Older adults
            ]
        },
        min_value=18,
        max_value=100,
        decimals=0,
    ),
    DistributionProfile(
        name="age_population",
        distribution="mixture",
        params={
            "components": [
                {"mean": 8, "std": 4, "weight": 0.15},    # Children
                {"mean": 25, "std": 8, "weight": 0.25},   # Young adults
                {"mean": 42, "std": 12, "weight": 0.35},  # Middle age
                {"mean": 68, "std": 10, "weight": 0.25},  # Seniors
            ]
        },
        min_value=0,
        max_value=105,
        decimals=0,
    ),
    # --- Salary distributions ---
    DistributionProfile(
        name="salary_usd",
        distribution="lognormal",
        params={"mean": 11.0, "sigma": 0.5},  # Log of ~$60k median
        min_value=25000,
        max_value=500000,
        decimals=0,
    ),
    DistributionProfile(
        name="salary_tech",
        distribution="mixture",
        params={
            "components": [
                {"mean": 75000, "std": 15000, "weight": 0.2},   # Junior
                {"mean": 120000, "std": 25000, "weight": 0.4},  # Mid
                {"mean": 180000, "std": 40000, "weight": 0.3},  # Senior
                {"mean": 280000, "std": 60000, "weight": 0.1},  # Staff+
            ]
        },
        min_value=50000,
        max_value=600000,
        decimals=0,
    ),
    # --- Price distributions ---
    DistributionProfile(
        name="price_retail",
        distribution="lognormal",
        params={"mean": 3.5, "sigma": 1.2},  # ~$30 median
        min_value=0.99,
        max_value=10000,
        decimals=2,
    ),
    DistributionProfile(
        name="price_saas",
        distribution="mixture",
        params={
            "components": [
                {"mean": 15, "std": 5, "weight": 0.3},      # Basic tier
                {"mean": 49, "std": 15, "weight": 0.4},     # Pro tier
                {"mean": 199, "std": 50, "weight": 0.25},   # Enterprise
                {"mean": 999, "std": 200, "weight": 0.05},  # Custom
            ]
        },
        min_value=0,
        max_value=5000,
        decimals=0,
    ),
    # --- Transaction amounts ---
    DistributionProfile(
        name="transaction_amount",
        distribution="pareto",
        params={"alpha": 2.5, "min": 10},
        min_value=1,
        max_value=100000,
        decimals=2,
    ),
    # --- Counts / quantities ---
    DistributionProfile(
        name="order_quantity",
        distribution="zipf",
        params={"alpha": 2.0},
        min_value=1,
        max_value=100,
        decimals=0,
    ),
    # --- Time-related ---
    DistributionProfile(
        name="session_duration_seconds",
        distribution="lognormal",
        params={"mean": 5.5, "sigma": 1.5},  # ~4 min median
        min_value=1,
        max_value=7200,  # 2 hours max
        decimals=0,
    ),
    # --- Ratings and scores ---
    DistributionProfile(
        name="rating_5star",
        distribution="beta",
        params={"a": 5, "b": 2, "scale": 5},  # Skewed towards higher ratings
        min_value=1,
        max_value=5,
        decimals=1,
    ),
    DistributionProfile(
        name="nps_score",
        distribution="mixture",
        params={
            "components": [
                {"mean": 3, "std": 2, "weight": 0.15},   # Detractors
                {"mean": 7, "std": 1, "weight": 0.25},   # Passives
                {"mean": 9, "std": 0.8, "weight": 0.6},  # Promoters
            ]
        },
        min_value=0,
        max_value=10,
        decimals=0,
    ),
    # --- Percentages ---
    DistributionProfile(
        name="conversion_rate",
        distribution="beta",
        params={"a": 2, "b": 50, "scale": 100},  # Low conversion (1-5%)
        min_value=0,
        max_value=100,
        decimals=2,
    ),
    DistributionProfile(
        name="churn_rate",
        distribution="beta",
        params={"a": 1.5, "b": 30, "scale": 100},  # ~5% typical
        min_value=0,
        max_value=100,
        decimals=2,
    ),
)

for _builtin in _BUILTIN_PROFILES:
    _register_profile(_builtin)
del _builtin
297
+
298
+
299
def get_profile(name: str) -> Optional[DistributionProfile]:
    """Look up a registered profile; return None when the name is unknown."""
    try:
        return PROFILES[name]
    except KeyError:
        return None
302
+
303
+
304
def list_profiles() -> List[str]:
    """Return the names of all registered profiles, in registration order."""
    return [profile_name for profile_name in PROFILES]
307
+
308
+
309
def generate_with_profile(
    profile_name: str,
    size: int,
    rng: Optional[np.random.Generator] = None
) -> np.ndarray:
    """Generate values using a named profile.

    Args:
        profile_name: Name of the profile (e.g., "salary_tech")
        size: Number of values to generate
        rng: Random number generator

    Returns:
        Array of generated values

    Raises:
        ValueError: If profile not found
    """
    profile = get_profile(profile_name)
    if profile is not None:
        return profile.generate(size, rng)

    # Unknown name: report it alongside every registered profile.
    available = ", ".join(list_profiles())
    raise ValueError(f"Unknown profile: {profile_name}. Available: {available}")
misata/simulator.py CHANGED
@@ -16,7 +16,9 @@ from typing import Any, Dict, List, Optional
16
16
  import numpy as np
17
17
  import pandas as pd
18
18
 
19
- from misata.generators import TextGenerator
19
+ from misata.generators.base import TextGenerator as _FactoryTextGenerator # Generator factory version
20
+ # Use the original generators.py TextGenerator which supports seed
21
+ from misata.generators_legacy import TextGenerator
20
22
  from misata.schema import Column, Relationship, ScenarioEvent, SchemaConfig
21
23
 
22
24
 
@@ -34,6 +36,10 @@ class DataSimulator:
34
36
  rng: NumPy random generator for reproducibility
35
37
  """
36
38
 
39
+ # Performance constants
40
+ MAX_CONTEXT_ROWS = 50000 # Cap context storage for memory efficiency
41
+ TEXT_POOL_SIZE = 10000 # Size of text value pools for vectorized sampling
42
+
37
43
  def __init__(self, config: SchemaConfig,
38
44
  apply_semantic_fixes: bool = True, batch_size: int = 10_000,
39
45
  smart_mode: bool = False, use_llm: bool = True):
@@ -57,6 +63,7 @@ class DataSimulator:
57
63
  self._unique_pools: Dict[str, np.ndarray] = {} # Store pre-generated unique values
58
64
  self._unique_counters: Dict[str, int] = {} # Track usage of unique pools
59
65
  self._smart_pools: Dict[str, np.ndarray] = {} # Cache smart value pools
66
+ self._text_pools: Dict[str, np.ndarray] = {} # Cache text pools for vectorized sampling
60
67
 
61
68
  # Apply semantic inference to fix column types
62
69
  if apply_semantic_fixes:
@@ -199,10 +206,24 @@ class DataSimulator:
199
206
  ctx_df = df[cols_to_store].copy()
200
207
 
201
208
  if table_name not in self.context:
209
+ # First batch: store up to MAX_CONTEXT_ROWS
210
+ if len(ctx_df) > self.MAX_CONTEXT_ROWS:
211
+ ctx_df = ctx_df.sample(n=self.MAX_CONTEXT_ROWS, random_state=self.config.seed)
202
212
  self.context[table_name] = ctx_df
203
213
  else:
204
- # Append to existing context
205
- self.context[table_name] = pd.concat([self.context[table_name], ctx_df], ignore_index=True)
214
+ # Append to existing context, but cap at MAX_CONTEXT_ROWS
215
+ current_len = len(self.context[table_name])
216
+ if current_len >= self.MAX_CONTEXT_ROWS:
217
+ # Already at capacity, use reservoir sampling for randomness
218
+ # Replace some existing rows with new ones (probability-based)
219
+ return # Skip appending, we have enough IDs
220
+
221
+ remaining_space = self.MAX_CONTEXT_ROWS - current_len
222
+ rows_to_add = ctx_df.iloc[:remaining_space]
223
+ self.context[table_name] = pd.concat(
224
+ [self.context[table_name], rows_to_add],
225
+ ignore_index=True
226
+ )
206
227
 
207
228
  def generate_column(
208
229
  self,
@@ -225,6 +246,70 @@ class DataSimulator:
225
246
  """
226
247
  params = column.distribution_params
227
248
 
249
+ # ========== CORRELATED COLUMN GENERATION ==========
250
+ # If this column depends on another column's value, use conditional distribution
251
+ if "depends_on" in params and table_data is not None:
252
+ parent_col = params["depends_on"]
253
+ mapping = params.get("mapping", {})
254
+
255
+ if parent_col in table_data.columns and mapping:
256
+ parent_values = table_data[parent_col].values
257
+
258
+ # Check if it's numeric or categorical mapping
259
+ first_val = next(iter(mapping.values()))
260
+ if isinstance(first_val, dict) and "mean" in first_val:
261
+ # Numeric conditional distribution (e.g., salary based on job_title)
262
+ # mapping = {"Intern": {"mean": 40000, "std": 5000}, "CTO": {"mean": 200000, "std": 30000}}
263
+ values = np.zeros(size)
264
+ for key, dist in mapping.items():
265
+ mask = parent_values == key
266
+ count = mask.sum()
267
+ if count > 0:
268
+ mean = dist.get("mean", 50000)
269
+ std = dist.get("std", mean * 0.1)
270
+ values[mask] = self.rng.normal(mean, std, count)
271
+
272
+ # Handle values that didn't match any key (use default)
273
+ default = params.get("default", {"mean": 50000, "std": 10000})
274
+ unmatched = ~np.isin(parent_values, list(mapping.keys()))
275
+ if unmatched.sum() > 0:
276
+ values[unmatched] = self.rng.normal(
277
+ default.get("mean", 50000),
278
+ default.get("std", 10000),
279
+ unmatched.sum()
280
+ )
281
+ return values
282
+
283
+ elif isinstance(first_val, list):
284
+ # Categorical conditional (e.g., state based on country)
285
+ # mapping = {"USA": ["CA", "TX", "NY"], "UK": ["England", "Scotland"]}
286
+ values = np.empty(size, dtype=object)
287
+ for key, choices in mapping.items():
288
+ mask = parent_values == key
289
+ count = mask.sum()
290
+ if count > 0:
291
+ values[mask] = self.rng.choice(choices, count)
292
+
293
+ # Default for unmatched
294
+ default_choices = params.get("default", ["Unknown"])
295
+ unmatched = values == None # noqa
296
+ if unmatched.sum() > 0:
297
+ values[unmatched] = self.rng.choice(default_choices, unmatched.sum())
298
+ return values
299
+
300
+ elif isinstance(first_val, (int, float)):
301
+ # Probability-based boolean (e.g., churn probability based on plan)
302
+ # mapping = {"free": 0.3, "pro": 0.1, "enterprise": 0.05}
303
+ values = np.zeros(size, dtype=bool)
304
+ for key, prob in mapping.items():
305
+ mask = parent_values == key
306
+ count = mask.sum()
307
+ if count > 0:
308
+ values[mask] = self.rng.random(count) < prob
309
+ return values
310
+
311
+ # ========== STANDARD COLUMN GENERATION ==========
312
+
228
313
  # CATEGORICAL
229
314
  if column.type == "categorical":
230
315
  choices = params.get("choices", ["A", "B", "C"])
@@ -469,23 +554,59 @@ class DataSimulator:
469
554
  return values
470
555
 
471
556
  if text_type == "name":
472
- values = np.array([self.text_gen.name() for _ in range(size)])
557
+ pool_key = "text_name"
558
+ if pool_key not in self._text_pools:
559
+ pool_size = min(size, self.TEXT_POOL_SIZE)
560
+ self._text_pools[pool_key] = np.array([self.text_gen.name() for _ in range(pool_size)])
561
+ values = self.rng.choice(self._text_pools[pool_key], size=size)
473
562
  elif text_type == "email":
474
- values = np.array([self.text_gen.email() for _ in range(size)])
563
+ pool_key = "text_email"
564
+ if pool_key not in self._text_pools:
565
+ pool_size = min(size, self.TEXT_POOL_SIZE)
566
+ self._text_pools[pool_key] = np.array([self.text_gen.email() for _ in range(pool_size)])
567
+ values = self.rng.choice(self._text_pools[pool_key], size=size)
475
568
  elif text_type == "company":
476
- values = np.array([self.text_gen.company() for _ in range(size)])
569
+ pool_key = "text_company"
570
+ if pool_key not in self._text_pools:
571
+ pool_size = min(size, self.TEXT_POOL_SIZE)
572
+ self._text_pools[pool_key] = np.array([self.text_gen.company() for _ in range(pool_size)])
573
+ values = self.rng.choice(self._text_pools[pool_key], size=size)
477
574
  elif text_type == "sentence":
478
- values = np.array([self.text_gen.sentence() for _ in range(size)])
575
+ pool_key = "text_sentence"
576
+ if pool_key not in self._text_pools:
577
+ pool_size = min(size, self.TEXT_POOL_SIZE)
578
+ self._text_pools[pool_key] = np.array([self.text_gen.sentence() for _ in range(pool_size)])
579
+ values = self.rng.choice(self._text_pools[pool_key], size=size)
479
580
  elif text_type == "word":
480
- values = np.array([self.text_gen.word() for _ in range(size)])
581
+ pool_key = "text_word"
582
+ if pool_key not in self._text_pools:
583
+ pool_size = min(size, self.TEXT_POOL_SIZE)
584
+ self._text_pools[pool_key] = np.array([self.text_gen.word() for _ in range(pool_size)])
585
+ values = self.rng.choice(self._text_pools[pool_key], size=size)
481
586
  elif text_type == "address":
482
- values = np.array([self.text_gen.full_address() for _ in range(size)])
587
+ pool_key = "text_address"
588
+ if pool_key not in self._text_pools:
589
+ pool_size = min(size, self.TEXT_POOL_SIZE)
590
+ self._text_pools[pool_key] = np.array([self.text_gen.full_address() for _ in range(pool_size)])
591
+ values = self.rng.choice(self._text_pools[pool_key], size=size)
483
592
  elif text_type == "phone":
484
- values = np.array([self.text_gen.phone_number() for _ in range(size)])
593
+ pool_key = "text_phone"
594
+ if pool_key not in self._text_pools:
595
+ pool_size = min(size, self.TEXT_POOL_SIZE)
596
+ self._text_pools[pool_key] = np.array([self.text_gen.phone_number() for _ in range(pool_size)])
597
+ values = self.rng.choice(self._text_pools[pool_key], size=size)
485
598
  elif text_type == "url":
486
- values = np.array([self.text_gen.url() for _ in range(size)])
599
+ pool_key = "text_url"
600
+ if pool_key not in self._text_pools:
601
+ pool_size = min(size, self.TEXT_POOL_SIZE)
602
+ self._text_pools[pool_key] = np.array([self.text_gen.url() for _ in range(pool_size)])
603
+ values = self.rng.choice(self._text_pools[pool_key], size=size)
487
604
  else:
488
- values = np.array([self.text_gen.sentence() for _ in range(size)])
605
+ pool_key = "text_sentence"
606
+ if pool_key not in self._text_pools:
607
+ pool_size = min(size, self.TEXT_POOL_SIZE)
608
+ self._text_pools[pool_key] = np.array([self.text_gen.sentence() for _ in range(pool_size)])
609
+ values = self.rng.choice(self._text_pools[pool_key], size=size)
489
610
 
490
611
  return values
491
612