misata 0.3.0b0__tar.gz → 0.3.1b0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. {misata-0.3.0b0 → misata-0.3.1b0}/PKG-INFO +1 -1
  2. {misata-0.3.0b0 → misata-0.3.1b0}/misata/__init__.py +1 -1
  3. {misata-0.3.0b0 → misata-0.3.1b0}/misata/simulator.py +133 -12
  4. {misata-0.3.0b0 → misata-0.3.1b0}/misata.egg-info/PKG-INFO +1 -1
  5. {misata-0.3.0b0 → misata-0.3.1b0}/misata.egg-info/SOURCES.txt +1 -1
  6. {misata-0.3.0b0 → misata-0.3.1b0}/pyproject.toml +1 -1
  7. {misata-0.3.0b0 → misata-0.3.1b0}/LICENSE +0 -0
  8. {misata-0.3.0b0 → misata-0.3.1b0}/README.md +0 -0
  9. {misata-0.3.0b0 → misata-0.3.1b0}/misata/api.py +0 -0
  10. {misata-0.3.0b0 → misata-0.3.1b0}/misata/audit.py +0 -0
  11. {misata-0.3.0b0 → misata-0.3.1b0}/misata/benchmark.py +0 -0
  12. {misata-0.3.0b0 → misata-0.3.1b0}/misata/cache.py +0 -0
  13. {misata-0.3.0b0 → misata-0.3.1b0}/misata/cli.py +0 -0
  14. {misata-0.3.0b0 → misata-0.3.1b0}/misata/codegen.py +0 -0
  15. {misata-0.3.0b0 → misata-0.3.1b0}/misata/constraints.py +0 -0
  16. {misata-0.3.0b0 → misata-0.3.1b0}/misata/context.py +0 -0
  17. {misata-0.3.0b0 → misata-0.3.1b0}/misata/curve_fitting.py +0 -0
  18. {misata-0.3.0b0 → misata-0.3.1b0}/misata/customization.py +0 -0
  19. {misata-0.3.0b0 → misata-0.3.1b0}/misata/exceptions.py +0 -0
  20. {misata-0.3.0b0 → misata-0.3.1b0}/misata/feedback.py +0 -0
  21. {misata-0.3.0b0 → misata-0.3.1b0}/misata/formulas.py +0 -0
  22. {misata-0.3.0b0 → misata-0.3.1b0}/misata/generators/__init__.py +0 -0
  23. {misata-0.3.0b0 → misata-0.3.1b0}/misata/generators/base.py +0 -0
  24. /misata-0.3.0b0/misata/generators.py → /misata-0.3.1b0/misata/generators_legacy.py +0 -0
  25. {misata-0.3.0b0 → misata-0.3.1b0}/misata/hybrid.py +0 -0
  26. {misata-0.3.0b0 → misata-0.3.1b0}/misata/llm_parser.py +0 -0
  27. {misata-0.3.0b0 → misata-0.3.1b0}/misata/noise.py +0 -0
  28. {misata-0.3.0b0 → misata-0.3.1b0}/misata/profiles.py +0 -0
  29. {misata-0.3.0b0 → misata-0.3.1b0}/misata/quality.py +0 -0
  30. {misata-0.3.0b0 → misata-0.3.1b0}/misata/schema.py +0 -0
  31. {misata-0.3.0b0 → misata-0.3.1b0}/misata/semantic.py +0 -0
  32. {misata-0.3.0b0 → misata-0.3.1b0}/misata/smart_values.py +0 -0
  33. {misata-0.3.0b0 → misata-0.3.1b0}/misata/story_parser.py +0 -0
  34. {misata-0.3.0b0 → misata-0.3.1b0}/misata/streaming.py +0 -0
  35. {misata-0.3.0b0 → misata-0.3.1b0}/misata/templates/__init__.py +0 -0
  36. {misata-0.3.0b0 → misata-0.3.1b0}/misata/templates/library.py +0 -0
  37. {misata-0.3.0b0 → misata-0.3.1b0}/misata/validation.py +0 -0
  38. {misata-0.3.0b0 → misata-0.3.1b0}/misata.egg-info/dependency_links.txt +0 -0
  39. {misata-0.3.0b0 → misata-0.3.1b0}/misata.egg-info/entry_points.txt +0 -0
  40. {misata-0.3.0b0 → misata-0.3.1b0}/misata.egg-info/requires.txt +0 -0
  41. {misata-0.3.0b0 → misata-0.3.1b0}/misata.egg-info/top_level.txt +0 -0
  42. {misata-0.3.0b0 → misata-0.3.1b0}/setup.cfg +0 -0
  43. {misata-0.3.0b0 → misata-0.3.1b0}/tests/test_api.py +0 -0
  44. {misata-0.3.0b0 → misata-0.3.1b0}/tests/test_cli.py +0 -0
  45. {misata-0.3.0b0 → misata-0.3.1b0}/tests/test_constraints.py +0 -0
  46. {misata-0.3.0b0 → misata-0.3.1b0}/tests/test_curve_fitting.py +0 -0
  47. {misata-0.3.0b0 → misata-0.3.1b0}/tests/test_enterprise.py +0 -0
  48. {misata-0.3.0b0 → misata-0.3.1b0}/tests/test_formulas.py +0 -0
  49. {misata-0.3.0b0 → misata-0.3.1b0}/tests/test_integrity.py +0 -0
  50. {misata-0.3.0b0 → misata-0.3.1b0}/tests/test_llm_parser.py +0 -0
  51. {misata-0.3.0b0 → misata-0.3.1b0}/tests/test_schema.py +0 -0
  52. {misata-0.3.0b0 → misata-0.3.1b0}/tests/test_security.py +0 -0
  53. {misata-0.3.0b0 → misata-0.3.1b0}/tests/test_semantic.py +0 -0
  54. {misata-0.3.0b0 → misata-0.3.1b0}/tests/test_simulator.py +0 -0
  55. {misata-0.3.0b0 → misata-0.3.1b0}/tests/test_templates.py +0 -0
  56. {misata-0.3.0b0 → misata-0.3.1b0}/tests/test_validation.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: misata
3
- Version: 0.3.0b0
3
+ Version: 0.3.1b0
4
4
  Summary: AI-Powered Synthetic Data Engine - Generate realistic multi-table datasets from natural language
5
5
  Author-email: Muhammed Rasin <rasinbinabdulla@gmail.com>
6
6
  License: MIT
@@ -15,7 +15,7 @@ Usage:
15
15
  config = load_template("ecommerce")
16
16
  """
17
17
 
18
- __version__ = "0.3.0b0"
18
+ __version__ = "0.3.1b0"
19
19
  __author__ = "Muhammed Rasin"
20
20
 
21
21
  from misata.schema import (
@@ -16,7 +16,9 @@ from typing import Any, Dict, List, Optional
16
16
  import numpy as np
17
17
  import pandas as pd
18
18
 
19
- from misata.generators import TextGenerator
19
+ from misata.generators.base import TextGenerator as _FactoryTextGenerator # Generator factory version
20
+ # Use the original TextGenerator (now in generators_legacy.py, formerly generators.py), which supports seed
21
+ from misata.generators_legacy import TextGenerator
20
22
  from misata.schema import Column, Relationship, ScenarioEvent, SchemaConfig
21
23
 
22
24
 
@@ -34,6 +36,10 @@ class DataSimulator:
34
36
  rng: NumPy random generator for reproducibility
35
37
  """
36
38
 
39
+ # Performance constants
40
+ MAX_CONTEXT_ROWS = 50000 # Cap context storage for memory efficiency
41
+ TEXT_POOL_SIZE = 10000 # Size of text value pools for vectorized sampling
42
+
37
43
  def __init__(self, config: SchemaConfig,
38
44
  apply_semantic_fixes: bool = True, batch_size: int = 10_000,
39
45
  smart_mode: bool = False, use_llm: bool = True):
@@ -57,6 +63,7 @@ class DataSimulator:
57
63
  self._unique_pools: Dict[str, np.ndarray] = {} # Store pre-generated unique values
58
64
  self._unique_counters: Dict[str, int] = {} # Track usage of unique pools
59
65
  self._smart_pools: Dict[str, np.ndarray] = {} # Cache smart value pools
66
+ self._text_pools: Dict[str, np.ndarray] = {} # Cache text pools for vectorized sampling
60
67
 
61
68
  # Apply semantic inference to fix column types
62
69
  if apply_semantic_fixes:
@@ -199,10 +206,24 @@ class DataSimulator:
199
206
  ctx_df = df[cols_to_store].copy()
200
207
 
201
208
  if table_name not in self.context:
209
+ # First batch: store up to MAX_CONTEXT_ROWS
210
+ if len(ctx_df) > self.MAX_CONTEXT_ROWS:
211
+ ctx_df = ctx_df.sample(n=self.MAX_CONTEXT_ROWS, random_state=self.config.seed)
202
212
  self.context[table_name] = ctx_df
203
213
  else:
204
- # Append to existing context
205
- self.context[table_name] = pd.concat([self.context[table_name], ctx_df], ignore_index=True)
214
+ # Append to existing context, but cap at MAX_CONTEXT_ROWS
215
+ current_len = len(self.context[table_name])
216
+ if current_len >= self.MAX_CONTEXT_ROWS:
217
+ # Already at capacity: keep the rows already stored and ignore this batch.
218
+ # NOTE: no reservoir sampling is performed here, so rows from batches arriving after the cap are never added.
219
+ return # Skip appending, we have enough IDs
220
+
221
+ remaining_space = self.MAX_CONTEXT_ROWS - current_len
222
+ rows_to_add = ctx_df.iloc[:remaining_space]
223
+ self.context[table_name] = pd.concat(
224
+ [self.context[table_name], rows_to_add],
225
+ ignore_index=True
226
+ )
206
227
 
207
228
  def generate_column(
208
229
  self,
@@ -225,6 +246,70 @@ class DataSimulator:
225
246
  """
226
247
  params = column.distribution_params
227
248
 
249
+ # ========== CORRELATED COLUMN GENERATION ==========
250
+ # If this column depends on another column's value, use conditional distribution
251
+ if "depends_on" in params and table_data is not None:
252
+ parent_col = params["depends_on"]
253
+ mapping = params.get("mapping", {})
254
+
255
+ if parent_col in table_data.columns and mapping:
256
+ parent_values = table_data[parent_col].values
257
+
258
+ # Check if it's numeric or categorical mapping
259
+ first_val = next(iter(mapping.values()))
260
+ if isinstance(first_val, dict) and "mean" in first_val:
261
+ # Numeric conditional distribution (e.g., salary based on job_title)
262
+ # mapping = {"Intern": {"mean": 40000, "std": 5000}, "CTO": {"mean": 200000, "std": 30000}}
263
+ values = np.zeros(size)
264
+ for key, dist in mapping.items():
265
+ mask = parent_values == key
266
+ count = mask.sum()
267
+ if count > 0:
268
+ mean = dist.get("mean", 50000)
269
+ std = dist.get("std", mean * 0.1)
270
+ values[mask] = self.rng.normal(mean, std, count)
271
+
272
+ # Handle values that didn't match any key (use default)
273
+ default = params.get("default", {"mean": 50000, "std": 10000})
274
+ unmatched = ~np.isin(parent_values, list(mapping.keys()))
275
+ if unmatched.sum() > 0:
276
+ values[unmatched] = self.rng.normal(
277
+ default.get("mean", 50000),
278
+ default.get("std", 10000),
279
+ unmatched.sum()
280
+ )
281
+ return values
282
+
283
+ elif isinstance(first_val, list):
284
+ # Categorical conditional (e.g., state based on country)
285
+ # mapping = {"USA": ["CA", "TX", "NY"], "UK": ["England", "Scotland"]}
286
+ values = np.empty(size, dtype=object)
287
+ for key, choices in mapping.items():
288
+ mask = parent_values == key
289
+ count = mask.sum()
290
+ if count > 0:
291
+ values[mask] = self.rng.choice(choices, count)
292
+
293
+ # Default for unmatched
294
+ default_choices = params.get("default", ["Unknown"])
295
+ unmatched = values == None # noqa
296
+ if unmatched.sum() > 0:
297
+ values[unmatched] = self.rng.choice(default_choices, unmatched.sum())
298
+ return values
299
+
300
+ elif isinstance(first_val, (int, float)):
301
+ # Probability-based boolean (e.g., churn probability based on plan)
302
+ # mapping = {"free": 0.3, "pro": 0.1, "enterprise": 0.05}
303
+ values = np.zeros(size, dtype=bool)
304
+ for key, prob in mapping.items():
305
+ mask = parent_values == key
306
+ count = mask.sum()
307
+ if count > 0:
308
+ values[mask] = self.rng.random(count) < prob
309
+ return values
310
+
311
+ # ========== STANDARD COLUMN GENERATION ==========
312
+
228
313
  # CATEGORICAL
229
314
  if column.type == "categorical":
230
315
  choices = params.get("choices", ["A", "B", "C"])
@@ -469,23 +554,59 @@ class DataSimulator:
469
554
  return values
470
555
 
471
556
  if text_type == "name":
472
- values = np.array([self.text_gen.name() for _ in range(size)])
557
+ pool_key = "text_name"
558
+ if pool_key not in self._text_pools:
559
+ pool_size = min(size, self.TEXT_POOL_SIZE)
560
+ self._text_pools[pool_key] = np.array([self.text_gen.name() for _ in range(pool_size)])
561
+ values = self.rng.choice(self._text_pools[pool_key], size=size)
473
562
  elif text_type == "email":
474
- values = np.array([self.text_gen.email() for _ in range(size)])
563
+ pool_key = "text_email"
564
+ if pool_key not in self._text_pools:
565
+ pool_size = min(size, self.TEXT_POOL_SIZE)
566
+ self._text_pools[pool_key] = np.array([self.text_gen.email() for _ in range(pool_size)])
567
+ values = self.rng.choice(self._text_pools[pool_key], size=size)
475
568
  elif text_type == "company":
476
- values = np.array([self.text_gen.company() for _ in range(size)])
569
+ pool_key = "text_company"
570
+ if pool_key not in self._text_pools:
571
+ pool_size = min(size, self.TEXT_POOL_SIZE)
572
+ self._text_pools[pool_key] = np.array([self.text_gen.company() for _ in range(pool_size)])
573
+ values = self.rng.choice(self._text_pools[pool_key], size=size)
477
574
  elif text_type == "sentence":
478
- values = np.array([self.text_gen.sentence() for _ in range(size)])
575
+ pool_key = "text_sentence"
576
+ if pool_key not in self._text_pools:
577
+ pool_size = min(size, self.TEXT_POOL_SIZE)
578
+ self._text_pools[pool_key] = np.array([self.text_gen.sentence() for _ in range(pool_size)])
579
+ values = self.rng.choice(self._text_pools[pool_key], size=size)
479
580
  elif text_type == "word":
480
- values = np.array([self.text_gen.word() for _ in range(size)])
581
+ pool_key = "text_word"
582
+ if pool_key not in self._text_pools:
583
+ pool_size = min(size, self.TEXT_POOL_SIZE)
584
+ self._text_pools[pool_key] = np.array([self.text_gen.word() for _ in range(pool_size)])
585
+ values = self.rng.choice(self._text_pools[pool_key], size=size)
481
586
  elif text_type == "address":
482
- values = np.array([self.text_gen.full_address() for _ in range(size)])
587
+ pool_key = "text_address"
588
+ if pool_key not in self._text_pools:
589
+ pool_size = min(size, self.TEXT_POOL_SIZE)
590
+ self._text_pools[pool_key] = np.array([self.text_gen.full_address() for _ in range(pool_size)])
591
+ values = self.rng.choice(self._text_pools[pool_key], size=size)
483
592
  elif text_type == "phone":
484
- values = np.array([self.text_gen.phone_number() for _ in range(size)])
593
+ pool_key = "text_phone"
594
+ if pool_key not in self._text_pools:
595
+ pool_size = min(size, self.TEXT_POOL_SIZE)
596
+ self._text_pools[pool_key] = np.array([self.text_gen.phone_number() for _ in range(pool_size)])
597
+ values = self.rng.choice(self._text_pools[pool_key], size=size)
485
598
  elif text_type == "url":
486
- values = np.array([self.text_gen.url() for _ in range(size)])
599
+ pool_key = "text_url"
600
+ if pool_key not in self._text_pools:
601
+ pool_size = min(size, self.TEXT_POOL_SIZE)
602
+ self._text_pools[pool_key] = np.array([self.text_gen.url() for _ in range(pool_size)])
603
+ values = self.rng.choice(self._text_pools[pool_key], size=size)
487
604
  else:
488
- values = np.array([self.text_gen.sentence() for _ in range(size)])
605
+ pool_key = "text_sentence"
606
+ if pool_key not in self._text_pools:
607
+ pool_size = min(size, self.TEXT_POOL_SIZE)
608
+ self._text_pools[pool_key] = np.array([self.text_gen.sentence() for _ in range(pool_size)])
609
+ values = self.rng.choice(self._text_pools[pool_key], size=size)
489
610
 
490
611
  return values
491
612
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: misata
3
- Version: 0.3.0b0
3
+ Version: 0.3.1b0
4
4
  Summary: AI-Powered Synthetic Data Engine - Generate realistic multi-table datasets from natural language
5
5
  Author-email: Muhammed Rasin <rasinbinabdulla@gmail.com>
6
6
  License: MIT
@@ -15,7 +15,7 @@ misata/customization.py
15
15
  misata/exceptions.py
16
16
  misata/feedback.py
17
17
  misata/formulas.py
18
- misata/generators.py
18
+ misata/generators_legacy.py
19
19
  misata/hybrid.py
20
20
  misata/llm_parser.py
21
21
  misata/noise.py
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "misata"
3
- version = "0.3.0b0"
3
+ version = "0.3.1b0"
4
4
  description = "AI-Powered Synthetic Data Engine - Generate realistic multi-table datasets from natural language"
5
5
  authors = [
6
6
  {name = "Muhammed Rasin", email = "rasinbinabdulla@gmail.com"}
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes