additory 0.1.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory/__init__.py +15 -0
- additory/analysis/__init__.py +48 -0
- additory/analysis/cardinality.py +126 -0
- additory/analysis/correlations.py +124 -0
- additory/analysis/distributions.py +376 -0
- additory/analysis/quality.py +158 -0
- additory/analysis/scan.py +400 -0
- additory/augment/__init__.py +24 -0
- additory/augment/augmentor.py +653 -0
- additory/augment/builtin_lists.py +430 -0
- additory/augment/distributions.py +22 -0
- additory/augment/forecast.py +1132 -0
- additory/augment/list_registry.py +177 -0
- additory/augment/smote.py +320 -0
- additory/augment/strategies.py +883 -0
- additory/common/__init__.py +157 -0
- additory/common/backend.py +355 -0
- additory/common/column_utils.py +191 -0
- additory/common/distributions.py +737 -0
- additory/common/exceptions.py +62 -0
- additory/common/lists.py +229 -0
- additory/common/patterns.py +240 -0
- additory/common/resolver.py +567 -0
- additory/common/sample_data.py +182 -0
- additory/common/validation.py +197 -0
- additory/core/__init__.py +27 -0
- additory/core/ast_builder.py +165 -0
- additory/core/backends/__init__.py +23 -0
- additory/core/backends/arrow_bridge.py +476 -0
- additory/core/backends/cudf_bridge.py +355 -0
- additory/core/column_positioning.py +358 -0
- additory/core/compiler_polars.py +166 -0
- additory/core/config.py +342 -0
- additory/core/enhanced_cache_manager.py +1119 -0
- additory/core/enhanced_matchers.py +473 -0
- additory/core/enhanced_version_manager.py +325 -0
- additory/core/executor.py +59 -0
- additory/core/integrity_manager.py +477 -0
- additory/core/loader.py +190 -0
- additory/core/logging.py +24 -0
- additory/core/memory_manager.py +547 -0
- additory/core/namespace_manager.py +657 -0
- additory/core/parser.py +176 -0
- additory/core/polars_expression_engine.py +551 -0
- additory/core/registry.py +176 -0
- additory/core/sample_data_manager.py +492 -0
- additory/core/user_namespace.py +751 -0
- additory/core/validator.py +27 -0
- additory/dynamic_api.py +308 -0
- additory/expressions/__init__.py +26 -0
- additory/expressions/engine.py +551 -0
- additory/expressions/parser.py +176 -0
- additory/expressions/proxy.py +546 -0
- additory/expressions/registry.py +313 -0
- additory/expressions/samples.py +492 -0
- additory/synthetic/__init__.py +101 -0
- additory/synthetic/api.py +220 -0
- additory/synthetic/common_integration.py +314 -0
- additory/synthetic/config.py +262 -0
- additory/synthetic/engines.py +529 -0
- additory/synthetic/exceptions.py +180 -0
- additory/synthetic/file_managers.py +518 -0
- additory/synthetic/generator.py +702 -0
- additory/synthetic/generator_parser.py +68 -0
- additory/synthetic/integration.py +319 -0
- additory/synthetic/models.py +241 -0
- additory/synthetic/pattern_resolver.py +573 -0
- additory/synthetic/performance.py +469 -0
- additory/synthetic/polars_integration.py +464 -0
- additory/synthetic/proxy.py +60 -0
- additory/synthetic/schema_parser.py +685 -0
- additory/synthetic/validator.py +553 -0
- additory/utilities/__init__.py +53 -0
- additory/utilities/encoding.py +600 -0
- additory/utilities/games.py +300 -0
- additory/utilities/keys.py +8 -0
- additory/utilities/lookup.py +103 -0
- additory/utilities/matchers.py +216 -0
- additory/utilities/resolvers.py +286 -0
- additory/utilities/settings.py +167 -0
- additory/utilities/units.py +746 -0
- additory/utilities/validators.py +153 -0
- additory-0.1.0a1.dist-info/METADATA +293 -0
- additory-0.1.0a1.dist-info/RECORD +87 -0
- additory-0.1.0a1.dist-info/WHEEL +5 -0
- additory-0.1.0a1.dist-info/licenses/LICENSE +21 -0
- additory-0.1.0a1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Main API interface for synthetic data generation.
|
|
3
|
+
|
|
4
|
+
Provides the primary user-facing functions for generating synthetic data
|
|
5
|
+
with support for different output engines and configuration management.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import Union, Optional, Type
|
|
9
|
+
import pandas as pd
|
|
10
|
+
import polars as pl
|
|
11
|
+
import os
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
from .config import SyntheticConfig
|
|
15
|
+
from .exceptions import SyntheticDataError, ValidationError
|
|
16
|
+
from .integration import SyntheticDataIntegrator
|
|
17
|
+
from .generator import GenerationConfig
|
|
18
|
+
from .engines import DistributionEngineFactory, DistributionEngine
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# Global configuration instance
|
|
22
|
+
config = SyntheticConfig()
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def synth(schema_path: str, rows: int = 1000,
          engine: Optional[str] = None) -> Union[pd.DataFrame, pl.DataFrame]:
    """
    Generate synthetic data from a schema file.

    Args:
        schema_path: Path to the .toml schema file
        rows: Number of rows to generate (default: 1000)
        engine: Output engine ("pandas" or "polars"). If None, uses default from config

    Returns:
        Generated DataFrame in the specified format

    Raises:
        ValueError: If rows is not positive, or engine is not 'pandas'/'polars'
        SyntheticDataError: If generation fails
        ValidationError: If schema validation fails
        FileNotFoundError: If schema file doesn't exist

    Examples:
        >>> df = synth("customer.toml", rows=5000)  # pandas DataFrame
        >>> df = synth("customer.toml", rows=5000, engine="polars")  # polars DataFrame
    """
    # Validate inputs up front, before any expensive work.
    if rows <= 0:
        raise ValueError("Number of rows must be positive")

    # Determine output engine, falling back to the configured default.
    output_engine = engine if engine is not None else config.get_default_engine()
    if output_engine not in ("pandas", "polars"):
        raise ValueError(f"Unsupported engine: {output_engine}. Must be 'pandas' or 'polars'")

    # Resolve schema path if it's relative to the configured base path.
    resolved_schema_path = _resolve_schema_path(schema_path)

    # Build the generation configuration from the global config's defaults.
    generation_config = GenerationConfig(
        batch_size=config.get_default_batch_size(),
        seed=None,  # Use random seed by default
        validate_patterns=config.is_validation_enabled()
    )

    # Create integrator and generate data.
    integrator = SyntheticDataIntegrator(generation_config)

    try:
        result = integrator.generate_from_schema_file(
            schema_path=resolved_schema_path,
            target_rows=rows,
            output_engine=output_engine
        )
        return result.dataframe

    except ValidationError:
        # Re-raise validation errors as-is so callers can catch them precisely.
        raise
    except Exception as e:
        # Wrap any other failure in the package's own error type, keeping the cause.
        raise SyntheticDataError(f"Failed to generate synthetic data: {e}") from e
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def register_distribution_engine(engine_class: Type[DistributionEngine]) -> None:
    """
    Register a custom distribution engine for synthetic data generation.

    A custom engine lets you implement distribution strategies beyond the
    built-in ones (equal, custom, categorical, high_cardinality,
    numeric_range, skewed).

    Args:
        engine_class: Class inheriting from DistributionEngine that implements
            the required methods: supports_strategy, validate_config and
            apply_distribution.

    Raises:
        ValidationError: If the engine class is invalid or already registered

    Examples:
        >>> from additory.synthetic.engines import DistributionEngine, DistributionConfig
        >>> from additory.synthetic.models import DistributionType, ValidationResult
        >>> import polars as pl
        >>>
        >>> class GaussianDistributionEngine(DistributionEngine):
        ...     def supports_strategy(self, strategy_type):
        ...         return strategy_type == DistributionType.CUSTOM and strategy_type.value == "gaussian"
        ...
        ...     def validate_config(self, config: DistributionConfig) -> ValidationResult:
        ...         result = self._validate_base_requirements(config)
        ...         # Add custom validation
        ...         return result
        ...
        ...     def apply_distribution(self, config: DistributionConfig) -> pl.Series:
        ...         # Implement gaussian distribution logic
        ...         pass
        >>>
        >>> register_distribution_engine(GaussianDistributionEngine)
    """
    # Delegate registration (and all validation) to the factory.
    DistributionEngineFactory.register_custom_engine(engine_class)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def unregister_distribution_engine(engine_class: Type[DistributionEngine]) -> None:
    """
    Remove a previously registered custom distribution engine.

    Args:
        engine_class: The engine class to unregister

    Raises:
        ValidationError: If the engine is not registered

    Examples:
        >>> unregister_distribution_engine(GaussianDistributionEngine)
    """
    # The factory owns the registry; it raises if the engine is unknown.
    DistributionEngineFactory.unregister_custom_engine(engine_class)
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def list_custom_distribution_engines() -> list:
    """
    Return all currently registered custom distribution engine classes.

    Returns:
        List of custom engine classes

    Examples:
        >>> engines = list_custom_distribution_engines()
        >>> for engine in engines:
        ...     print(engine.__name__)
    """
    # Read-only view of the factory's registry.
    return DistributionEngineFactory.list_custom_engines()
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def _resolve_schema_path(schema_path: str) -> str:
    """
    Resolve schema path, checking both absolute and relative to config base path.

    Resolution order:
      1. The path as given (absolute, or relative to the current directory).
      2. Relative to the configured schema base path.

    Args:
        schema_path: Input schema path

    Returns:
        Resolved path to an existing schema file

    Raises:
        FileNotFoundError: If schema file cannot be found
    """
    # The path as given. Note os.path.exists already resolves a relative
    # path against the current working directory, so no separate CWD probe
    # is needed later.
    if os.path.exists(schema_path):
        return schema_path

    # A missing absolute path cannot be rescued by the base-path lookup.
    if os.path.isabs(schema_path):
        raise FileNotFoundError(f"Schema file not found: {schema_path}")

    # Try resolving relative to the configured base path.
    resolved_path = config.resolve_schema_path(schema_path)
    if resolved_path.exists():
        return str(resolved_path)

    # File not found in any location
    raise FileNotFoundError(
        f"Schema file not found: {schema_path}. "
        f"Searched in: current directory, {config.get_schema_base_path()}"
    )
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def augment(df: Union[pd.DataFrame, pl.DataFrame],
            schema_path: str, **kwargs) -> Union[pd.DataFrame, pl.DataFrame]:
    """
    Augment an existing DataFrame with synthetic columns.

    Planned for a future phase to support data augmentation for class
    balancing and similar use cases.

    Args:
        df: Input DataFrame to augment
        schema_path: Path to the .toml schema file
        **kwargs: Additional augmentation parameters

    Returns:
        Augmented DataFrame in the same format as input

    Raises:
        NotImplementedError: This feature is planned for future implementation
    """
    raise NotImplementedError("augment() function is planned for future implementation")
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
# Export the config object and plugin functions for user access
|
|
213
|
+
__all__ = [
|
|
214
|
+
'synth',
|
|
215
|
+
'augment',
|
|
216
|
+
'config',
|
|
217
|
+
'register_distribution_engine',
|
|
218
|
+
'unregister_distribution_engine',
|
|
219
|
+
'list_custom_distribution_engines'
|
|
220
|
+
]
|
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Common Module Integration for Synthetic Data Generation
|
|
3
|
+
|
|
4
|
+
Provides integration between the common module (lists, patterns, resolver)
|
|
5
|
+
and the synthetic data generation system.
|
|
6
|
+
|
|
7
|
+
This module:
|
|
8
|
+
- Wraps common/resolver.py for synthetic-specific needs
|
|
9
|
+
- Handles .list and .properties file loading
|
|
10
|
+
- Implements prefer_mode logic
|
|
11
|
+
- Provides pattern type detection
|
|
12
|
+
- Maintains backward compatibility
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from typing import Union, List, Optional, Dict, Any
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
import logging
|
|
18
|
+
|
|
19
|
+
from additory.common.resolver import (
|
|
20
|
+
PatternResolver,
|
|
21
|
+
resolve_pattern,
|
|
22
|
+
PreferMode,
|
|
23
|
+
PatternResolutionResult,
|
|
24
|
+
)
|
|
25
|
+
from additory.common.lists import load_list_file, ListFileError
|
|
26
|
+
from additory.common.patterns import load_properties_file, is_regex_pattern, PatternFileError
|
|
27
|
+
|
|
28
|
+
from .exceptions import PatternResolutionError, ValidationError
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
logger = logging.getLogger(__name__)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class SyntheticPatternLoader:
    """
    Pattern loader for synthetic data generation.

    Bridges the common module (lists, patterns, resolver) and the synthetic
    generator, providing:
    - .list file loading
    - .properties file loading
    - Fallback resolution
    - Pattern type detection
    - Prefer mode support
    """

    def __init__(self, base_path: str = "reference/schema_definitions"):
        """
        Initialize the pattern loader.

        Args:
            base_path: Base directory for pattern files
        """
        self.base_path = Path(base_path)
        self.resolver = PatternResolver(base_path=str(self.base_path))

        # Caches for loaded files (emptied via clear_cache()).
        self._list_cache: Dict[str, Dict[str, List[str]]] = {}
        self._properties_cache: Dict[str, Dict[str, str]] = {}

    def load_pattern(
        self,
        pattern_value: Union[str, List[str]],
        imports: List[str],
        prefer_mode: str = "default"
    ) -> tuple[Union[List[str], str], str]:
        """
        Load a pattern, resolving references through the fallback chain.

        Args:
            pattern_value: Pattern value from TOML (string reference, array
                list, or inline regex)
            imports: List of imports from TOML (e.g., ["global", "finance"])
            prefer_mode: Resolution preference ("default", "list_only", "regex_only")

        Returns:
            Tuple of (resolved_value, pattern_type) where resolved_value is a
            list of values (for lists) or a regex string (for regex), and
            pattern_type is "list" or "regex".

        Raises:
            PatternResolutionError: If pattern cannot be resolved

        Example:
            >>> loader = SyntheticPatternLoader()
            >>> loader.load_pattern("first_names", ["global"], "default")
            (['Arjun', 'Vikram', ...], 'list')
            >>> loader.load_pattern(["Active", "Inactive"], ["global"], "default")
            (['Active', 'Inactive'], 'list')
            >>> loader.load_pattern("CUST\\d{8}", ["global"], "default")
            ('CUST\\d{8}', 'regex')
        """
        kind = self._detect_pattern_type(pattern_value)

        # Inline list (array): use as-is.
        if kind == "inline_list":
            logger.info(f"Using inline list with {len(pattern_value)} values")
            return (pattern_value, "list")

        # Inline regex (string containing special characters): use as-is.
        if kind == "inline_regex":
            logger.info(f"Using inline regex: {pattern_value}")
            return (pattern_value, "regex")

        # Reference: resolve via the common-module fallback chain.
        if kind == "reference":
            logger.info(f"Resolving reference: {pattern_value}")
            return self._resolve_reference(pattern_value, imports, prefer_mode)

        raise PatternResolutionError(
            f"Unknown pattern type for value: {pattern_value}"
        )

    def _detect_pattern_type(self, pattern_value: Union[str, List[str]]) -> str:
        """
        Classify a TOML pattern value.

        Args:
            pattern_value: Pattern value from TOML

        Returns:
            Pattern type: "inline_list", "inline_regex", or "reference"

        Raises:
            ValidationError: If the value is neither a string nor a list
        """
        # Arrays are always inline lists.
        if isinstance(pattern_value, list):
            return "inline_list"

        # Strings are either inline regexes (special chars) or references.
        if isinstance(pattern_value, str):
            return "inline_regex" if is_regex_pattern(pattern_value) else "reference"

        raise ValidationError(
            f"Invalid pattern value type: {type(pattern_value)}. "
            f"Expected string or list."
        )

    def _resolve_reference(
        self,
        pattern_name: str,
        imports: List[str],
        prefer_mode: str
    ) -> tuple[Union[List[str], str], str]:
        """
        Resolve a named pattern reference via the common resolver.

        Args:
            pattern_name: Name of pattern to resolve
            imports: List of imports
            prefer_mode: Resolution preference

        Returns:
            Tuple of (resolved_value, pattern_type)

        Raises:
            PatternResolutionError: If pattern cannot be resolved
        """
        # Map the prefer_mode string onto the enum, tolerating bad input.
        try:
            mode = PreferMode(prefer_mode)
        except ValueError:
            logger.warning(f"Invalid prefer_mode '{prefer_mode}', using DEFAULT")
            mode = PreferMode.DEFAULT

        result = self.resolver.resolve(pattern_name, imports, mode)

        if not result.found:
            raise PatternResolutionError(
                f"Pattern '{pattern_name}' not found. {result.error_message}",
                pattern_name,
                imports
            )

        # Record where the pattern came from for debugging.
        logger.info(
            f"Resolved '{pattern_name}' from {result.source} "
            f"(type: {result.pattern_type}, fallback: {result.fallback_used})"
        )

        return (result.value, result.pattern_type)

    def validate_imports(self, imports: List[str]) -> tuple[bool, List[str]]:
        """
        Check that every import has at least one backing file on disk.

        Args:
            imports: List of import names (e.g., ["global", "finance"])

        Returns:
            Tuple of (is_valid, error_messages)
        """
        problems: List[str] = []

        for name in imports:
            # Either a .list or a .properties file satisfies the import.
            has_list = (self.base_path / f"{name}.list").exists()
            has_props = (self.base_path / f"{name}.properties").exists()

            if not (has_list or has_props):
                problems.append(
                    f"Import '{name}' not found. "
                    f"Neither {name}.list nor {name}.properties exists."
                )

        return (len(problems) == 0, problems)

    def get_available_patterns(self, imports: List[str]) -> Dict[str, str]:
        """
        Collect every pattern name exposed by the given imports.

        Args:
            imports: List of import names

        Returns:
            Dictionary mapping pattern names to sources
        """
        available: Dict[str, str] = {}

        for name in imports:
            # .list entries take precedence over .properties entries.
            list_path = self.base_path / f"{name}.list"
            if list_path.exists():
                try:
                    for entry in load_list_file(str(list_path)).keys():
                        available[entry] = f"{name}.list"
                except ListFileError as e:
                    logger.warning(f"Failed to load {list_path}: {e}")

            props_path = self.base_path / f"{name}.properties"
            if props_path.exists():
                try:
                    for entry in load_properties_file(str(props_path)).keys():
                        # Don't overwrite a name already supplied by a .list file.
                        if entry not in available:
                            available[entry] = f"{name}.properties"
                except PatternFileError as e:
                    logger.warning(f"Failed to load {props_path}: {e}")

        return available

    def clear_cache(self):
        """Clear cached files (useful for testing)."""
        self._list_cache.clear()
        self._properties_cache.clear()
        self.resolver.clear_cache()
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def detect_pattern_type_from_toml(pattern_value: Any) -> str:
    """
    Classify a TOML pattern value by its shape.

    Convenience helper for schema parsing.

    Args:
        pattern_value: Value from TOML file

    Returns:
        Pattern type: "inline_list", "inline_regex", or "reference"
        ("unknown" for any other value type)

    Example:
        >>> detect_pattern_type_from_toml(["Active", "Inactive"])
        'inline_list'
        >>> detect_pattern_type_from_toml("CUST\\d{8}")
        'inline_regex'
        >>> detect_pattern_type_from_toml("first_names")
        'reference'
    """
    if isinstance(pattern_value, list):
        return "inline_list"
    if not isinstance(pattern_value, str):
        return "unknown"
    # Strings with regex metacharacters are inline regexes; plain names
    # are references to be resolved from imports.
    return "inline_regex" if is_regex_pattern(pattern_value) else "reference"
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
def convert_prefer_mode(mode_str: str) -> PreferMode:
    """
    Convert a prefer_mode string to its enum value.

    Args:
        mode_str: Mode string ("default", "list_only", "regex_only")

    Returns:
        PreferMode enum value

    Raises:
        ValueError: If mode string is invalid
    """
    try:
        return PreferMode(mode_str)
    except ValueError:
        # Replace the enum's terse error with one listing the valid options.
        message = (
            f"Invalid prefer_mode '{mode_str}'. "
            f"Valid values: default, list_only, regex_only"
        )
        raise ValueError(message)
|