misata 0.1.0b0__py3-none-any.whl → 0.3.0b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,586 @@
+ """
+ Base generator interface and factory for Misata.
+
+ Provides abstract base class for all generators and a factory
+ pattern for creating generators based on column type.
+ """
+
+ from abc import ABC, abstractmethod
+ from typing import Any, Callable, Dict, List, Optional, Type, Union
+
+ import numpy as np
+
+ from misata.exceptions import ColumnGenerationError
+
+
+ class BaseGenerator(ABC):
+     """Abstract base class for all data generators.
+
+     All generators must implement the `generate` method which produces
+     a numpy array of values.
+
+     Example:
+         class IntegerGenerator(BaseGenerator):
+             def generate(self, size: int, params: dict) -> np.ndarray:
+                 return np.random.randint(params['min'], params['max'], size)
+     """
+
+     @abstractmethod
+     def generate(self, size: int, params: Dict[str, Any]) -> np.ndarray:
+         """Generate an array of values.
+
+         Args:
+             size: Number of values to generate
+             params: Distribution parameters specific to this generator
+
+         Returns:
+             numpy array of generated values
+
+         Raises:
+             ColumnGenerationError: If generation fails
+         """
+         pass
+
+     def validate_params(self, params: Dict[str, Any]) -> None:
+         """Validate parameters before generation.
+
+         Override this method to add custom validation.
+
+         Args:
+             params: Parameters to validate
+
+         Raises:
+             ColumnGenerationError: If validation fails
+         """
+         pass
+
+     def inject_nulls(
+         self,
+         values: np.ndarray,
+         null_rate: float = 0.0,
+         rng: Optional[np.random.Generator] = None
+     ) -> np.ndarray:
+         """Inject null values into generated data.
+
+         Args:
+             values: Generated values array
+             null_rate: Fraction of values to make null (0.0 to 1.0)
+             rng: Random number generator for reproducibility
+
+         Returns:
+             Array with nulls injected (converted to object dtype if needed)
+         """
+         if null_rate <= 0:
+             return values
+
+         if rng is None:
+             rng = np.random.default_rng()
+
+         mask = rng.random(len(values)) < null_rate
+
+         # Convert to object dtype to support None values
+         result = values.astype(object)
+         result[mask] = None
+
+         return result
+
+     def inject_outliers(
+         self,
+         values: np.ndarray,
+         outlier_rate: float = 0.0,
+         multiplier: float = 3.0,
+         rng: Optional[np.random.Generator] = None
+     ) -> np.ndarray:
+         """Inject outlier values into numeric data.
+
+         Args:
+             values: Generated numeric values
+             outlier_rate: Fraction of values to make outliers (0.0 to 1.0)
+             multiplier: How many std devs to offset outliers
+             rng: Random number generator for reproducibility
+
+         Returns:
+             Array with outliers injected
+         """
+         if outlier_rate <= 0 or not np.issubdtype(values.dtype, np.number):
+             return values
+
+         if rng is None:
+             rng = np.random.default_rng()
+
+         mask = rng.random(len(values)) < outlier_rate
+         n_outliers = mask.sum()
+
+         if n_outliers == 0:
+             return values
+
+         mean = np.mean(values)
+         std = np.std(values)
+
+         if std == 0:
+             std = 1.0  # Avoid division by zero
+
+         # Generate outliers at mean ± multiplier * std
+         outlier_values = mean + rng.choice([-1, 1], n_outliers) * multiplier * std
+
+         result = values.copy()
+         result[mask] = outlier_values
+
+         return result
+
+     def post_process(
+         self,
+         values: np.ndarray,
+         params: Dict[str, Any],
+         rng: Optional[np.random.Generator] = None
+     ) -> np.ndarray:
+         """Apply post-processing: nulls, outliers, etc.
+
+         Args:
+             values: Generated values
+             params: Parameters including null_rate, outlier_rate
+             rng: Random number generator
+
+         Returns:
+             Post-processed values
+         """
+         null_rate = params.get("null_rate", 0.0)
+         outlier_rate = params.get("outlier_rate", 0.0)
+
+         # Apply outliers first (on numeric data)
+         if outlier_rate > 0:
+             values = self.inject_outliers(values, outlier_rate, rng=rng)
+
+         # Apply nulls last
+         if null_rate > 0:
+             values = self.inject_nulls(values, null_rate, rng=rng)
+
+         return values
+
+
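To illustrate how the base class is meant to be composed, here is a minimal usage sketch (not part of the package diff); the subclass name and seed handling are hypothetical:

    # Hypothetical subclass: generate values, then let post_process inject nulls/outliers.
    import numpy as np

    class UniformIntGenerator(BaseGenerator):  # illustrative name, not in misata
        def generate(self, size, params):
            rng = np.random.default_rng(params.get("seed"))
            values = rng.integers(params.get("min", 0), params.get("max", 100) + 1, size)
            # null_rate / outlier_rate are read from the same params dict
            return self.post_process(values, params, rng=rng)

    sample = UniformIntGenerator().generate(10, {"min": 1, "max": 5, "null_rate": 0.2})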
+ class IntegerGenerator(BaseGenerator):
+     """Generator for integer values with various distributions."""
+
+     def generate(self, size: int, params: Dict[str, Any]) -> np.ndarray:
+         distribution = params.get("distribution", "uniform")
+
+         if distribution == "sequence":
+             start = params.get("start", 1)
+             return np.arange(start, start + size)
+
+         elif distribution == "uniform":
+             min_val = params.get("min", 0)
+             max_val = params.get("max", 100)
+             return np.random.randint(min_val, max_val + 1, size)
+
+         elif distribution == "normal":
+             mean = params.get("mean", 50)
+             std = params.get("std", 10)
+             return np.clip(np.random.normal(mean, std, size).astype(int), 0, None)
+
+         elif distribution == "poisson":
+             lam = params.get("lambda", 5)
+             return np.random.poisson(lam, size)
+
+         elif distribution == "binomial":
+             n = params.get("n", 10)
+             p = params.get("p", 0.5)
+             return np.random.binomial(n, p, size)
+
+         else:
+             raise ColumnGenerationError(
+                 f"Unknown integer distribution: {distribution}",
+                 column_type="int",
+                 suggestion="Use 'uniform', 'normal', 'poisson', 'binomial', or 'sequence'"
+             )
+
+
+ class FloatGenerator(BaseGenerator):
+     """Generator for floating-point values with various distributions."""
+
+     def generate(self, size: int, params: Dict[str, Any]) -> np.ndarray:
+         distribution = params.get("distribution", "uniform")
+         decimals = params.get("decimals", 2)
+
+         if distribution == "uniform":
+             min_val = params.get("min", 0.0)
+             max_val = params.get("max", 100.0)
+             values = np.random.uniform(min_val, max_val, size)
+
+         elif distribution == "normal":
+             mean = params.get("mean", 50.0)
+             std = params.get("std", 10.0)
+             values = np.random.normal(mean, std, size)
+
+         elif distribution == "exponential":
+             scale = params.get("scale", 1.0)
+             values = np.random.exponential(scale, size)
+
+         elif distribution == "lognormal":
+             mean = params.get("mean", 0.0)
+             sigma = params.get("sigma", 1.0)
+             values = np.random.lognormal(mean, sigma, size)
+
+         elif distribution == "beta":
+             a = params.get("a", 2.0)
+             b = params.get("b", 5.0)
+             values = np.random.beta(a, b, size)
+
+         else:
+             raise ColumnGenerationError(
+                 f"Unknown float distribution: {distribution}",
+                 column_type="float",
+                 suggestion="Use 'uniform', 'normal', 'exponential', 'lognormal', or 'beta'"
+             )
+
+         return np.round(values, decimals)
+
+
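Assuming the parameter names shown above, the two numeric generators can be driven directly; a brief illustrative sketch (values are arbitrary, not from the package docs):

    # Illustrative calls only.
    ages = IntegerGenerator().generate(1000, {"distribution": "normal", "mean": 35, "std": 12})
    prices = FloatGenerator().generate(1000, {"distribution": "exponential", "scale": 50, "decimals": 2})
    order_ids = IntegerGenerator().generate(1000, {"distribution": "sequence", "start": 1})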
+ class BooleanGenerator(BaseGenerator):
+     """Generator for boolean values."""
+
+     def generate(self, size: int, params: Dict[str, Any]) -> np.ndarray:
+         probability = params.get("probability", 0.5)
+         return np.random.random(size) < probability
+
+
+ class CategoricalGenerator(BaseGenerator):
+     """Generator for categorical values with optional weights."""
+
+     def generate(self, size: int, params: Dict[str, Any]) -> np.ndarray:
+         choices = params.get("choices", [])
+         if not choices:
+             raise ColumnGenerationError(
+                 "No choices provided for categorical column",
+                 column_type="categorical",
+                 suggestion="Add 'choices' parameter with list of values"
+             )
+
+         weights = params.get("weights")
+         if weights:
+             if len(weights) != len(choices):
+                 raise ColumnGenerationError(
+                     f"Weights length ({len(weights)}) doesn't match choices length ({len(choices)})",
+                     column_type="categorical",
+                     suggestion="Ensure weights and choices have the same length"
+                 )
+             # Normalize weights
+             weights = np.array(weights) / sum(weights)
+
+         return np.random.choice(choices, size=size, p=weights)
+
+
+ class DateGenerator(BaseGenerator):
+     """Generator for date values."""
+
+     def generate(self, size: int, params: Dict[str, Any]) -> np.ndarray:
+         import pandas as pd
+
+         start = params.get("start", "2020-01-01")
+         end = params.get("end", "2024-12-31")
+         distribution = params.get("distribution", "uniform")
+
+         start_ts = pd.Timestamp(start).value // 10**9
+         end_ts = pd.Timestamp(end).value // 10**9
+
+         if distribution == "uniform":
+             timestamps = np.random.randint(start_ts, end_ts, size)
+         elif distribution == "recent":
+             # Bias towards recent dates (exponential decay away from the end of the range)
+             u = np.random.exponential(0.3, size)
+             u = np.clip(u / u.max(), 0, 1)
+             timestamps = (end_ts - (end_ts - start_ts) * u).astype(int)
+         else:
+             timestamps = np.random.randint(start_ts, end_ts, size)
+
+         return pd.to_datetime(timestamps, unit='s').strftime('%Y-%m-%d').values
+
+
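The "recent" mode biases draws toward the end of the configured range, since most exponential samples are small. A brief illustrative call (not from the package docs):

    # Dates returned as 'YYYY-MM-DD' strings, clustered near `end`.
    recent_dates = DateGenerator().generate(
        100, {"start": "2024-01-01", "end": "2024-12-31", "distribution": "recent"}
    )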
+ class TextGenerator(BaseGenerator):
+     """Generator for text values using Faker or patterns."""
+
+     def __init__(self):
+         try:
+             from faker import Faker
+             self._faker = Faker()
+         except ImportError:
+             self._faker = None
+
+     def generate(self, size: int, params: Dict[str, Any]) -> np.ndarray:
+         text_type = params.get("text_type", params.get("distribution", "uuid"))
+
+         if text_type == "uuid" or text_type == "text":
+             import uuid
+             return np.array([str(uuid.uuid4()) for _ in range(size)])
+
+         if self._faker is None:
+             # Fallback without faker
+             return np.array([f"text_{i}" for i in range(size)])
+
+         faker_methods = {
+             "name": self._faker.name,
+             "fake.name": self._faker.name,
+             "email": self._faker.email,
+             "fake.email": self._faker.email,
+             "address": self._faker.address,
+             "fake.address": self._faker.address,
+             "company": self._faker.company,
+             "fake.company": self._faker.company,
+             "phone": self._faker.phone_number,
+             "fake.phone": self._faker.phone_number,
+             "city": self._faker.city,
+             "country": self._faker.country,
+             "job": self._faker.job,
+             "sentence": self._faker.sentence,
+             "paragraph": self._faker.paragraph,
+         }
+
+         method = faker_methods.get(text_type)
+         if method:
+             return np.array([method() for _ in range(size)])
+
+         # Default to name
+         return np.array([self._faker.name() for _ in range(size)])
+
+
+ class ForeignKeyGenerator(BaseGenerator):
+     """Generator for foreign key references."""
+
+     def __init__(self, parent_ids: Optional[np.ndarray] = None):
+         self.parent_ids = parent_ids
+
+     def set_parent_ids(self, parent_ids: np.ndarray) -> None:
+         """Set the valid parent IDs for foreign key generation."""
+         self.parent_ids = parent_ids
+
+     def generate(self, size: int, params: Dict[str, Any]) -> np.ndarray:
+         if self.parent_ids is None or len(self.parent_ids) == 0:
+             raise ColumnGenerationError(
+                 "No parent IDs available for foreign key generation",
+                 column_type="foreign_key",
+                 suggestion="Ensure parent table is generated before child table"
+             )
+
+         return np.random.choice(self.parent_ids, size=size)
+
+
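Foreign keys assume the parent column already exists; a short sketch of the intended ordering (table and column names are hypothetical):

    # Generate the parent key column first, then sample child foreign keys from it.
    user_ids = IntegerGenerator().generate(100, {"distribution": "sequence", "start": 1})
    order_user_ids = ForeignKeyGenerator(parent_ids=user_ids).generate(1000, {})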
+ # ============ Generator Factory ============
+
+ class GeneratorFactory:
+     """Factory for creating generators based on column type.
+
+     Example:
+         factory = GeneratorFactory()
+         gen = factory.get_generator("int")
+         values = gen.generate(1000, {"min": 1, "max": 100})
+     """
+
+     _generators: Dict[str, Type[BaseGenerator]] = {
+         "int": IntegerGenerator,
+         "integer": IntegerGenerator,
+         "float": FloatGenerator,
+         "double": FloatGenerator,
+         "decimal": FloatGenerator,
+         "boolean": BooleanGenerator,
+         "bool": BooleanGenerator,
+         "categorical": CategoricalGenerator,
+         "category": CategoricalGenerator,
+         "date": DateGenerator,
+         "datetime": DateGenerator,
+         "text": TextGenerator,
+         "string": TextGenerator,
+         "varchar": TextGenerator,
+         "foreign_key": ForeignKeyGenerator,
+         "fk": ForeignKeyGenerator,
+     }
+
+     _instances: Dict[str, BaseGenerator] = {}
+
+     @classmethod
+     def register(cls, column_type: str, generator_class: Type[BaseGenerator]) -> None:
+         """Register a custom generator for a column type.
+
+         Args:
+             column_type: Type name (e.g., "custom_int")
+             generator_class: Generator class to use
+         """
+         cls._generators[column_type.lower()] = generator_class
+
+     @classmethod
+     def get_generator(cls, column_type: str) -> BaseGenerator:
+         """Get a generator instance for the given column type.
+
+         Args:
+             column_type: Column type (e.g., "int", "text", "date")
+
+         Returns:
+             Generator instance
+
+         Raises:
+             ColumnGenerationError: If column type is not supported
+         """
+         column_type = column_type.lower()
+
+         if column_type not in cls._generators:
+             raise ColumnGenerationError(
+                 f"Unsupported column type: {column_type}",
+                 column_type=column_type,
+                 suggestion=f"Supported types: {', '.join(cls._generators.keys())}"
+             )
+
+         # Get or create instance
+         if column_type not in cls._instances:
+             cls._instances[column_type] = cls._generators[column_type]()
+
+         return cls._instances[column_type]
+
+     @classmethod
+     def create_foreign_key_generator(cls, parent_ids: np.ndarray) -> ForeignKeyGenerator:
+         """Create a foreign key generator with parent IDs.
+
+         Args:
+             parent_ids: Array of valid parent IDs
+
+         Returns:
+             Configured ForeignKeyGenerator
+         """
+         gen = ForeignKeyGenerator(parent_ids)
+         return gen
+
+
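The `register` hook above allows plugging in project-specific generators; a hedged sketch with a hypothetical generator class (not part of misata):

    # Hypothetical custom generator registered under a new column type.
    class ZipCodeGenerator(BaseGenerator):  # illustrative only
        def generate(self, size, params):
            return np.array([f"{np.random.randint(0, 100000):05d}" for _ in range(size)])

    GeneratorFactory.register("zip_code", ZipCodeGenerator)
    zips = GeneratorFactory.get_generator("zip_code").generate(10, {})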
+ class ConditionalCategoricalGenerator(BaseGenerator):
+     """Generator for categorical values that depend on another column.
+
+     Use this for hierarchical data like state/country, department/role.
+
+     Example:
+         lookup = {
+             "USA": ["California", "Texas", "New York"],
+             "UK": ["England", "Scotland", "Wales"],
+             "Germany": ["Bavaria", "Berlin", "Hamburg"],
+         }
+         gen = ConditionalCategoricalGenerator(lookup, "country")
+         states = gen.generate(1000, {"parent_values": country_column})
+     """
+
+     def __init__(
+         self,
+         lookup: Dict[str, List[str]],
+         parent_column: str,
+         default_values: Optional[List[str]] = None
+     ):
+         """Initialize conditional generator.
+
+         Args:
+             lookup: Mapping from parent value to list of child values
+             parent_column: Name of the parent column
+             default_values: Values to use if parent not in lookup
+         """
+         self.lookup = lookup
+         self.parent_column = parent_column
+         self.default_values = default_values or (list(lookup.values())[0] if lookup else ["Unknown"])
+
+     def generate(self, size: int, params: Dict[str, Any]) -> np.ndarray:
+         """Generate values conditioned on parent column.
+
+         Args:
+             size: Number of values to generate
+             params: Must include 'parent_values' array
+
+         Returns:
+             Array of generated values
+         """
+         parent_values = params.get("parent_values")
+
+         if parent_values is None:
+             # No parent values, use uniform random from all possible values
+             all_values = []
+             for values in self.lookup.values():
+                 all_values.extend(values)
+             if not all_values:
+                 all_values = self.default_values
+             return np.random.choice(all_values, size=size)
+
+         # Convert to array if needed
+         parent_values = np.asarray(parent_values)
+
+         if len(parent_values) != size:
+             raise ColumnGenerationError(
+                 f"Parent values length ({len(parent_values)}) doesn't match size ({size})",
+                 column_type="conditional_categorical",
+                 suggestion="Ensure parent column is generated first"
+             )
+
+         # Generate conditional values
+         result = np.empty(size, dtype=object)
+         for i, parent in enumerate(parent_values):
+             choices = self.lookup.get(str(parent), self.default_values)
+             result[i] = np.random.choice(choices)
+
+         return result
+
+
+ # ============ Built-in Lookup Tables ============
+
+ CONDITIONAL_LOOKUPS = {
+     "country_to_state": {
+         "USA": ["California", "Texas", "New York", "Florida", "Illinois", "Pennsylvania", "Ohio", "Georgia", "Michigan", "North Carolina"],
+         "UK": ["England", "Scotland", "Wales", "Northern Ireland"],
+         "Germany": ["Bavaria", "Berlin", "Hamburg", "Hesse", "North Rhine-Westphalia", "Baden-Württemberg"],
+         "France": ["Île-de-France", "Provence", "Normandy", "Brittany", "Alsace"],
+         "Canada": ["Ontario", "Quebec", "British Columbia", "Alberta", "Manitoba"],
+         "Australia": ["New South Wales", "Victoria", "Queensland", "Western Australia", "South Australia"],
+         "India": ["Maharashtra", "Karnataka", "Tamil Nadu", "Delhi", "Gujarat", "Uttar Pradesh"],
+         "Japan": ["Tokyo", "Osaka", "Kyoto", "Hokkaido", "Okinawa"],
+     },
+     "department_to_role": {
+         "Engineering": ["Software Engineer", "Senior Engineer", "Staff Engineer", "Principal Engineer", "Engineering Manager"],
+         "Product": ["Product Manager", "Senior PM", "Product Director", "VP Product", "Product Analyst"],
+         "Design": ["UX Designer", "UI Designer", "Product Designer", "Design Lead", "Design Director"],
+         "Sales": ["Sales Rep", "Account Executive", "Sales Manager", "Sales Director", "VP Sales"],
+         "Marketing": ["Marketing Manager", "Content Strategist", "Growth Manager", "Marketing Director", "CMO"],
+         "HR": ["HR Manager", "Recruiter", "HR Director", "People Partner", "VP People"],
+         "Finance": ["Financial Analyst", "Accountant", "Controller", "Finance Director", "CFO"],
+     },
+     "category_to_subcategory": {
+         "Electronics": ["Smartphones", "Laptops", "Tablets", "Accessories", "Wearables"],
+         "Clothing": ["Men's Apparel", "Women's Apparel", "Kids", "Shoes", "Accessories"],
+         "Home & Garden": ["Furniture", "Decor", "Kitchen", "Outdoor", "Bedding"],
+         "Sports": ["Fitness", "Outdoor Sports", "Team Sports", "Water Sports", "Winter Sports"],
+         "Books": ["Fiction", "Non-Fiction", "Academic", "Children's", "Comics"],
+     },
+     "industry_to_company_type": {
+         "Technology": ["SaaS", "Consumer Tech", "Enterprise Software", "AI/ML", "Cybersecurity"],
+         "Healthcare": ["Hospital", "Pharmaceutical", "Biotech", "Medical Device", "Health Insurance"],
+         "Finance": ["Bank", "Investment Firm", "Insurance", "Fintech", "Credit Union"],
+         "Retail": ["E-commerce", "Brick & Mortar", "Wholesale", "Specialty Retail", "Marketplace"],
+         "Manufacturing": ["Automotive", "Electronics", "Consumer Goods", "Industrial", "Aerospace"],
+     },
+ }
+
+
+ def create_conditional_generator(
+     lookup_name: str,
+     parent_column: str
+ ) -> ConditionalCategoricalGenerator:
+     """Create a conditional generator from built-in lookup tables.
+
+     Args:
+         lookup_name: Name of the lookup (e.g., "country_to_state")
+         parent_column: Name of the parent column
+
+     Returns:
+         Configured ConditionalCategoricalGenerator
+     """
+     if lookup_name not in CONDITIONAL_LOOKUPS:
+         available = ", ".join(CONDITIONAL_LOOKUPS.keys())
+         raise ColumnGenerationError(
+             f"Unknown lookup: {lookup_name}",
+             column_type="conditional_categorical",
+             suggestion=f"Available lookups: {available}"
+         )
+
+     return ConditionalCategoricalGenerator(
+         lookup=CONDITIONAL_LOOKUPS[lookup_name],
+         parent_column=parent_column
+     )
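Tying the conditional pieces together, a brief usage sketch built on the generators and lookups defined above (column names are illustrative):

    # Parent column first, then child values conditioned on it via a built-in lookup.
    countries = CategoricalGenerator().generate(500, {"choices": ["USA", "UK", "Germany"]})
    state_gen = create_conditional_generator("country_to_state", parent_column="country")
    states = state_gen.generate(500, {"parent_values": countries})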
misata/llm_parser.py CHANGED
@@ -24,7 +24,11 @@ def _load_env():
  """Load environment variables from .env file."""
  env_paths = [
  Path.cwd() / ".env",
- Path(__file__).parent.parent / ".env",
+ Path.cwd().parent / ".env",  # apps/.env or api parent
+ Path.cwd().parent.parent / ".env",  # Misata root from apps/api
+ Path(__file__).parent.parent / ".env",  # packages/core/.env
+ Path(__file__).parent.parent.parent / ".env",  # packages/.env
+ Path(__file__).parent.parent.parent.parent / ".env",  # Misata root from packages/core/misata
  Path.home() / ".misata" / ".env",
  ]

@@ -35,7 +39,9 @@ def _load_env():
  line = line.strip()
  if line and not line.startswith("#") and "=" in line:
  key, _, value = line.partition("=")
- os.environ.setdefault(key.strip(), value.strip())
+ # Remove quotes if present
+ value = value.strip().strip("'\"")
+ os.environ.setdefault(key.strip(), value)
  break

  _load_env()
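The quote-stripping change in the hunk above can be exercised in isolation; a standalone sketch that mirrors the diffed logic rather than importing misata internals, with a made-up key name:

    import os

    def _parse_env_line(line: str) -> None:  # standalone mirror of the diffed logic
        line = line.strip()
        if line and not line.startswith("#") and "=" in line:
            key, _, value = line.partition("=")
            value = value.strip().strip("'\"")  # EXAMPLE_KEY="abc" and EXAMPLE_KEY='abc' both yield abc
            os.environ.setdefault(key.strip(), value)

    _parse_env_line('EXAMPLE_KEY="abc"')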
@@ -82,6 +88,39 @@ Instead of guessing parameters, you can provide "control_points" to draw the sha
  Format: {"distribution": "normal", "control_points": [{"x": 10, "y": 0.1}, {"x": 50, "y": 0.9}]}
  Misata will mathematically solve for the best parameters.

+ ### SMART DEFAULTS (Use These for Realistic Data):
+
+ **Age columns:**
+ - type: "int", distribution: "normal", mean: 35, std: 12, min: 18, max: 80
+
+ **Price/Amount columns:**
+ - type: "float", distribution: "exponential", scale: 50, min: 0.01, decimals: 2
+ - OR for products: uniform min: 9.99, max: 499.99
+
+ **Rating columns (1-5 stars):**
+ - type: "int", distribution: "categorical", choices: [1,2,3,4,5], probabilities: [0.05, 0.08, 0.15, 0.32, 0.40]
+
+ **Quantity/Count columns:**
+ - type: "int", distribution: "poisson", lambda: 3, min: 1
+
+ **Duration (minutes):**
+ - type: "int", distribution: "normal", mean: 45, std: 20, min: 5, max: 180
+
+ **Percentage columns:**
+ - type: "float", distribution: "uniform", min: 0.0, max: 100.0, decimals: 1
+
+ **Status columns:**
+ - type: "categorical", choices: ["active", "inactive", "pending"], probabilities: [0.70, 0.20, 0.10]
+
+ **Boolean probabilities:**
+ - is_verified: probability: 0.85
+ - is_premium: probability: 0.25
+ - is_active: probability: 0.80
+
+ **Date columns:**
+ - For recent data: bias last 30% of range with 70% of values
+ - Always use realistic date ranges (not 1970-2100)
+
  ## OUTPUT FORMAT

  {