PyPI - additory - Versions diffs - 0.1.0a2__py3-none-any.whl → 0.1.0a3__py3-none-any.whl - Mend

additory 0.1.0a2__py3-none-any.whl → 0.1.0a3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43)

additory/__init__.py +4 -0
additory/common/__init__.py +2 -2
additory/common/backend.py +20 -4
additory/common/distributions.py +1 -1
additory/common/sample_data.py +19 -19
additory/core/backends/arrow_bridge.py +7 -0
additory/core/polars_expression_engine.py +66 -16
additory/dynamic_api.py +42 -46
additory/expressions/proxy.py +4 -1
additory/synthetic/__init__.py +7 -95
additory/synthetic/column_name_resolver.py +149 -0
additory/{augment → synthetic}/distributions.py +2 -2
additory/{augment → synthetic}/forecast.py +1 -1
additory/synthetic/linked_list_parser.py +415 -0
additory/synthetic/namespace_lookup.py +129 -0
additory/{augment → synthetic}/smote.py +1 -1
additory/{augment → synthetic}/strategies.py +11 -44
additory/{augment/augmentor.py → synthetic/synthesizer.py} +75 -15
additory/utilities/units.py +4 -1
{additory-0.1.0a2.dist-info → additory-0.1.0a3.dist-info}/METADATA +10 -17
{additory-0.1.0a2.dist-info → additory-0.1.0a3.dist-info}/RECORD +24 -40
{additory-0.1.0a2.dist-info → additory-0.1.0a3.dist-info}/WHEEL +1 -1
additory/augment/__init__.py +0 -24
additory/augment/builtin_lists.py +0 -430
additory/augment/list_registry.py +0 -177
additory/synthetic/api.py +0 -220
additory/synthetic/common_integration.py +0 -314
additory/synthetic/config.py +0 -262
additory/synthetic/engines.py +0 -529
additory/synthetic/exceptions.py +0 -180
additory/synthetic/file_managers.py +0 -518
additory/synthetic/generator.py +0 -702
additory/synthetic/generator_parser.py +0 -68
additory/synthetic/integration.py +0 -319
additory/synthetic/models.py +0 -241
additory/synthetic/pattern_resolver.py +0 -573
additory/synthetic/performance.py +0 -469
additory/synthetic/polars_integration.py +0 -464
additory/synthetic/proxy.py +0 -60
additory/synthetic/schema_parser.py +0 -685
additory/synthetic/validator.py +0 -553
{additory-0.1.0a2.dist-info → additory-0.1.0a3.dist-info}/licenses/LICENSE +0 -0
{additory-0.1.0a2.dist-info → additory-0.1.0a3.dist-info}/top_level.txt +0 -0

additory/synthetic/exceptions.py DELETED Viewed

@@ -1,180 +0,0 @@
-"""
-Exception classes for synthetic data generation system.
-Provides a hierarchy of exceptions for different error conditions with
-clear error messages and actionable suggestions.
-"""
-from typing import Optional, List, Dict, Any
-class SyntheticDataError(Exception):
-    """Base exception for all synthetic data generation errors."""
-    def __init__(self, message: str, suggestions: Optional[List[str]] = None):
-        super().__init__(message)
-        self.suggestions = suggestions or []
-    def __str__(self) -> str:
-        msg = super().__str__()
-        if self.suggestions:
-            suggestions_text = "\n".join(f"  - {s}" for s in self.suggestions)
-            msg += f"\n\nSuggestions:\n{suggestions_text}"
-        return msg
-class PatternResolutionError(SyntheticDataError):
-    """Raised when pattern resolution fails in the hierarchy."""
-    def __init__(self, message: str, pattern_name: str, searched_sources: List[str],
-                 details: Optional[str] = None):
-        self.pattern_name = pattern_name
-        self.searched_sources = searched_sources
-        self.details = details
-        suggestions = [
-            f"Define pattern '{pattern_name}' in one of the hierarchy sources",
-            "Check if pattern name is spelled correctly",
-            "Verify import declarations in schema files",
-            "Ensure pattern files are accessible and properly formatted"
-        ]
-        if details:
-            suggestions.append(f"Additional details: {details}")
-        super().__init__(message, suggestions)
-class ValidationError(SyntheticDataError):
-    """Raised when validation fails for patterns, schemas, or configurations."""
-    def __init__(self, message: str, file_path: Optional[str] = None,
-                 line_number: Optional[int] = None, suggestions: Optional[List[str]] = None):
-        self.file_path = file_path
-        self.line_number = line_number
-        if file_path:
-            location = f" in {file_path}"
-            if line_number:
-                location += f" at line {line_number}"
-            message += location
-        super().__init__(message, suggestions)
-class DistributionError(SyntheticDataError):
-    """Raised when distribution strategy application fails."""
-    def __init__(self, strategy_type: str, column_name: str, reason: str):
-        self.strategy_type = strategy_type
-        self.column_name = column_name
-        self.reason = reason
-        message = f"Distribution strategy '{strategy_type}' failed for column '{column_name}': {reason}"
-        suggestions = [
-            f"Check if '{strategy_type}' is compatible with the data type of '{column_name}'",
-            "Verify distribution parameters are within valid ranges",
-            "Ensure the pattern generates appropriate data for the distribution"
-        ]
-        super().__init__(message, suggestions)
-class FileFormatError(SyntheticDataError):
-    """Raised when file format validation fails."""
-    def __init__(self, file_path: str, expected_format: str, actual_format: Optional[str] = None):
-        self.file_path = file_path
-        self.expected_format = expected_format
-        self.actual_format = actual_format
-        message = f"Invalid file format for '{file_path}'. Expected: {expected_format}"
-        if actual_format:
-            message += f", Got: {actual_format}"
-        suggestions = [
-            f"Ensure the file has the correct extension (.{expected_format})",
-            f"Verify the file content follows {expected_format} syntax",
-            "Check file permissions and accessibility"
-        ]
-        super().__init__(message, suggestions)
-class RegexValidationError(ValidationError):
-    """Raised when regex pattern validation fails."""
-    def __init__(self, pattern: str, regex_error: str, pattern_name: Optional[str] = None):
-        self.pattern = pattern
-        self.regex_error = regex_error
-        self.pattern_name = pattern_name
-        name_part = f" for pattern '{pattern_name}'" if pattern_name else ""
-        message = f"Invalid regex pattern{name_part}: {regex_error}"
-        suggestions = [
-            "Check regex syntax for polars compatibility",
-            "Escape special characters properly",
-            "Test the regex pattern with online validators",
-            "Refer to polars regex documentation for supported features"
-        ]
-        super().__init__(message, suggestions=suggestions)
-class SchemaParsingError(ValidationError):
-    """Raised when schema file parsing fails."""
-    def __init__(self, file_path: str, parsing_error: str, line_number: Optional[int] = None):
-        self.parsing_error = parsing_error
-        suggestions = [
-            "Check TOML syntax for proper formatting",
-            "Ensure all strings are properly quoted",
-            "Verify section headers are correctly formatted",
-            "Check for missing commas or brackets"
-        ]
-        super().__init__(f"Schema parsing failed: {parsing_error}",
-                        file_path, line_number, suggestions)
-class PatternImportError(SyntheticDataError):
-    """Raised when pattern file import fails."""
-    def __init__(self, import_name: str, file_path: Optional[str] = None, reason: str = "File not found"):
-        self.import_name = import_name
-        self.file_path = file_path
-        self.reason = reason
-        message = f"Failed to import pattern file '{import_name}': {reason}"
-        if file_path:
-            message += f" (looked for: {file_path})"
-        suggestions = [
-            f"Ensure '{import_name}.properties' exists in reference/schema_definitions/",
-            "Check file permissions and accessibility",
-            "Verify the import name matches the filename exactly",
-            "Check for typos in the import declaration"
-        ]
-        super().__init__(message, suggestions)
-class DistributionValidationError(SyntheticDataError):
-    """Raised when distribution strategy validation fails."""
-    def __init__(self, message: str, distribution_type: str, details: str = ""):
-        super().__init__(message, ["Check distribution strategy parameters and syntax"])
-        self.distribution_type = distribution_type
-        self.details = details
-    def __str__(self) -> str:
-        base_msg = super().__str__()
-        if self.details:
-            return f"{base_msg}\nDistribution Type: {self.distribution_type}\nDetails: {self.details}"
-        return f"{base_msg}\nDistribution Type: {self.distribution_type}"

additory/synthetic/file_managers.py DELETED Viewed

@@ -1,518 +0,0 @@
-"""
-File management for synthetic data generation system.
-Handles loading and validation of .properties and .toml files
-with proper error handling and syntax validation.
-"""
-import re
-import toml
-from pathlib import Path
-from typing import Dict, List, Optional, Tuple
-from dataclasses import dataclass
-from .models import ValidationResult, PatternDefinition, PatternSource, ValidationStatus
-from .exceptions import FileFormatError, ValidationError, PatternImportError, SchemaParsingError
-@dataclass
-class ParsedPropertiesFile:
-    """Result of parsing a .properties file."""
-    patterns: Dict[str, str]
-    file_path: str
-    line_count: int
-    comments: List[str]
-@dataclass
-class ParsedSchemaFile:
-    """Result of parsing a .toml schema file."""
-    imports: List[str]
-    inline_patterns: Dict[str, str]
-    schema_definitions: Dict[str, str]
-    metadata: Dict[str, str]
-    file_path: str
-class PatternFileManager:
-    """
-    Manages loading and validation of .properties pattern files.
-    Handles the parsing of .properties files with proper syntax validation,
-    comment handling, and error reporting with line numbers.
-    """
-    def __init__(self):
-        self._pattern_cache: Dict[str, ParsedPropertiesFile] = {}
-        self._validation_cache: Dict[str, ValidationResult] = {}
-    def load_properties_file(self, file_path: str) -> ParsedPropertiesFile:
-        """
-        Load and parse a .properties file.
-        Args:
-            file_path: Path to the .properties file
-        Returns:
-            ParsedPropertiesFile with patterns and metadata
-        Raises:
-            FileFormatError: If file format is invalid
-            ValidationError: If file syntax is invalid
-            FileNotFoundError: If file doesn't exist
-        """
-        path = Path(file_path)
-        # Check file extension
-        if not path.suffix == '.properties':
-            raise FileFormatError(file_path, "properties", path.suffix[1:] if path.suffix else "unknown")
-        # Check if file exists
-        if not path.exists():
-            raise FileNotFoundError(f"Properties file not found: {file_path}")
-        # Check cache
-        cache_key = str(path.absolute())
-        if cache_key in self._pattern_cache:
-            return self._pattern_cache[cache_key]
-        try:
-            content = path.read_text(encoding='utf-8')
-        except UnicodeDecodeError as e:
-            raise ValidationError(f"File encoding error: {e}", file_path)
-        # Parse the content
-        parsed = self._parse_properties_content(content, file_path)
-        # Cache the result
-        self._pattern_cache[cache_key] = parsed
-        return parsed
-    def _parse_properties_content(self, content: str, file_path: str) -> ParsedPropertiesFile:
-        """
-        Parse the content of a .properties file.
-        Args:
-            content: File content as string
-            file_path: Path for error reporting
-        Returns:
-            ParsedPropertiesFile with parsed data
-        Raises:
-            ValidationError: If syntax is invalid
-        """
-        patterns = {}
-        comments = []
-        line_number = 0
-        for line in content.splitlines():
-            line_number += 1
-            line = line.strip()
-            # Skip empty lines
-            if not line:
-                continue
-            # Handle comments
-            if line.startswith('#'):
-                comments.append(line[1:].strip())
-                continue
-            # Parse key=value pairs
-            if '=' not in line:
-                raise ValidationError(
-                    f"Invalid syntax: missing '=' separator",
-                    file_path,
-                    line_number,
-                    ["Each line should be in format: pattern_name=regex_pattern"]
-                )
-            # Split on first '=' only
-            key, value = line.split('=', 1)
-            key = key.strip()
-            value = value.strip()
-            # Validate key
-            if not key:
-                raise ValidationError(
-                    "Empty pattern name",
-                    file_path,
-                    line_number,
-                    ["Pattern names must not be empty"]
-                )
-            if not self._is_valid_pattern_name(key):
-                raise ValidationError(
-                    f"Invalid pattern name '{key}'",
-                    file_path,
-                    line_number,
-                    ["Pattern names must start with a letter and contain only letters, numbers, and underscores"]
-                )
-            # Validate value
-            if not value:
-                raise ValidationError(
-                    f"Empty regex pattern for '{key}'",
-                    file_path,
-                    line_number,
-                    ["Regex patterns must not be empty"]
-                )
-            # Check for duplicates
-            if key in patterns:
-                raise ValidationError(
-                    f"Duplicate pattern name '{key}'",
-                    file_path,
-                    line_number,
-                    [f"Pattern '{key}' is already defined in this file"]
-                )
-            patterns[key] = value
-        return ParsedPropertiesFile(
-            patterns=patterns,
-            file_path=file_path,
-            line_count=line_number,
-            comments=comments
-        )
-    def _is_valid_pattern_name(self, name: str) -> bool:
-        """
-        Validate pattern name format.
-        Args:
-            name: Pattern name to validate
-        Returns:
-            True if valid, False otherwise
-        """
-        # Pattern names must start with letter, contain only letters, numbers, underscores
-        pattern = r'^[a-zA-Z][a-zA-Z0-9_]*$'
-        return bool(re.match(pattern, name))
-    def validate_properties_syntax(self, content: str) -> ValidationResult:
-        """
-        Validate .properties file syntax without full parsing.
-        Args:
-            content: File content to validate
-        Returns:
-            ValidationResult with validation status
-        """
-        result = ValidationResult(is_valid=True)
-        line_number = 0
-        seen_keys = set()
-        for line in content.splitlines():
-            line_number += 1
-            line = line.strip()
-            # Skip empty lines and comments
-            if not line or line.startswith('#'):
-                continue
-            # Check for '=' separator
-            if '=' not in line:
-                result.add_error(
-                    f"Line {line_number}: Missing '=' separator",
-                    "Each line should be in format: pattern_name=regex_pattern"
-                )
-                continue
-            # Split and validate
-            key, value = line.split('=', 1)
-            key = key.strip()
-            value = value.strip()
-            # Validate key
-            if not key:
-                result.add_error(f"Line {line_number}: Empty pattern name")
-            elif not self._is_valid_pattern_name(key):
-                result.add_error(
-                    f"Line {line_number}: Invalid pattern name '{key}'",
-                    "Pattern names must start with a letter and contain only letters, numbers, and underscores"
-                )
-            elif key in seen_keys:
-                result.add_error(f"Line {line_number}: Duplicate pattern name '{key}'")
-            else:
-                seen_keys.add(key)
-            # Validate value
-            if not value:
-                result.add_error(f"Line {line_number}: Empty regex pattern for '{key}'")
-        return result
-    def create_pattern_definitions(self, parsed_file: ParsedPropertiesFile,
-                                 source: PatternSource) -> List[PatternDefinition]:
-        """
-        Create PatternDefinition objects from parsed file.
-        Args:
-            parsed_file: Parsed .properties file
-            source: Source type for the patterns
-        Returns:
-            List of PatternDefinition objects
-        """
-        definitions = []
-        for name, regex in parsed_file.patterns.items():
-            definition = PatternDefinition(
-                name=name,
-                regex=regex,
-                source=source,
-                validation_status=ValidationStatus.NOT_VALIDATED,
-                polars_compatible=False,  # Will be validated later
-                source_file=parsed_file.file_path
-            )
-            definitions.append(definition)
-        return definitions
-    def clear_cache(self):
-        """Clear the file cache."""
-        self._pattern_cache.clear()
-        self._validation_cache.clear()
-class SchemaFileManager:
-    """
-    Manages loading and validation of .toml schema files.
-    Handles the parsing of TOML schema files with proper structure validation,
-    import resolution, and error reporting.
-    """
-    def __init__(self):
-        self._schema_cache: Dict[str, ParsedSchemaFile] = {}
-    def load_toml_schema(self, file_path: str) -> ParsedSchemaFile:
-        """
-        Load and parse a .toml schema file.
-        Args:
-            file_path: Path to the .toml file
-        Returns:
-            ParsedSchemaFile with schema data
-        Raises:
-            FileFormatError: If file format is invalid
-            SchemaParsingError: If TOML parsing fails
-            FileNotFoundError: If file doesn't exist
-        """
-        path = Path(file_path)
-        # Check file extension
-        if not path.suffix == '.toml':
-            raise FileFormatError(file_path, "toml", path.suffix[1:] if path.suffix else "unknown")
-        # Check if file exists
-        if not path.exists():
-            raise FileNotFoundError(f"Schema file not found: {file_path}")
-        # Check cache
-        cache_key = str(path.absolute())
-        if cache_key in self._schema_cache:
-            return self._schema_cache[cache_key]
-        try:
-            content = path.read_text(encoding='utf-8')
-        except UnicodeDecodeError as e:
-            raise SchemaParsingError(file_path, f"File encoding error: {e}")
-        # Parse TOML content
-        try:
-            toml_data = toml.loads(content)
-        except toml.TomlDecodeError as e:
-            raise SchemaParsingError(file_path, f"TOML parsing error: {e}")
-        # Parse the schema structure
-        parsed = self._parse_schema_structure(toml_data, file_path)
-        # Cache the result
-        self._schema_cache[cache_key] = parsed
-        return parsed
-    def _parse_schema_structure(self, toml_data: Dict, file_path: str) -> ParsedSchemaFile:
-        """
-        Parse the structure of a TOML schema file.
-        Supports two formats:
-        1. Legacy format: [generator] section with import and inline patterns
-        2. New format: [generation] section with imports, prefer_mode, and patterns
-        Args:
-            toml_data: Parsed TOML data
-            file_path: Path for error reporting
-        Returns:
-            ParsedSchemaFile with structured data
-        Raises:
-            SchemaParsingError: If structure is invalid
-        """
-        imports = []
-        inline_patterns = {}
-        prefer_mode = "default"  # Default prefer mode
-        # Try new format first: [generation] section
-        generation_section = toml_data.get('generation', {})
-        if generation_section:
-            # Handle imports (array format)
-            import_value = generation_section.get('imports', [])
-            if isinstance(import_value, str):
-                imports = [import_value]
-            elif isinstance(import_value, list):
-                imports = import_value
-            elif import_value:  # Not None or empty
-                raise SchemaParsingError(
-                    file_path,
-                    "imports declaration must be a string or list of strings"
-                )
-            # Handle prefer_mode
-            prefer_mode_value = generation_section.get('prefer_mode', 'default')
-            if not isinstance(prefer_mode_value, str):
-                raise SchemaParsingError(
-                    file_path,
-                    "prefer_mode must be a string (default, list_only, or regex_only)"
-                )
-            if prefer_mode_value not in ['default', 'list_only', 'regex_only']:
-                raise SchemaParsingError(
-                    file_path,
-                    f"Invalid prefer_mode '{prefer_mode_value}'. Valid values: default, list_only, regex_only"
-                )
-            prefer_mode = prefer_mode_value
-            # Handle inline patterns (everything except 'imports' and 'prefer_mode')
-            for key, value in generation_section.items():
-                if key not in ['imports', 'prefer_mode']:
-                    # Support both string (regex) and array (list) patterns
-                    if isinstance(value, str):
-                        inline_patterns[key] = value
-                    elif isinstance(value, list):
-                        inline_patterns[key] = value
-                    else:
-                        raise SchemaParsingError(
-                            file_path,
-                            f"Inline pattern '{key}' must be a string (regex) or array (list)"
-                        )
-        # Fallback to legacy format: [generator] section
-        generator_section = toml_data.get('generator', {})
-        if generator_section and not generation_section:
-            # Handle imports (legacy: 'import' instead of 'imports')
-            import_value = generator_section.get('import', [])
-            if isinstance(import_value, str):
-                imports = [import_value]
-            elif isinstance(import_value, list):
-                imports = import_value
-            elif import_value:  # Not None or empty
-                raise SchemaParsingError(
-                    file_path,
-                    "Import declaration must be a string or list of strings"
-                )
-            # Handle inline patterns (everything except 'import')
-            for key, value in generator_section.items():
-                if key != 'import':
-                    if not isinstance(value, str):
-                        raise SchemaParsingError(
-                            file_path,
-                            f"Inline pattern '{key}' must be a string"
-                        )
-                    inline_patterns[key] = value
-        # Extract schema section
-        schema_section = toml_data.get('schema', {})
-        schema_definitions = {}
-        for key, value in schema_section.items():
-            # Support both string (reference/regex) and array (list) patterns
-            if isinstance(value, str):
-                schema_definitions[key] = value
-            elif isinstance(value, list):
-                schema_definitions[key] = value
-            else:
-                raise SchemaParsingError(
-                    file_path,
-                    f"Schema definition '{key}' must be a string or array"
-                )
-        # Extract metadata (any other sections)
-        metadata = {}
-        for key, value in toml_data.items():
-            if key not in ['generator', 'generation', 'schema']:
-                metadata[key] = value
-        # Store prefer_mode in metadata
-        metadata['prefer_mode'] = prefer_mode
-        return ParsedSchemaFile(
-            imports=imports,
-            inline_patterns=inline_patterns,
-            schema_definitions=schema_definitions,
-            metadata=metadata,
-            file_path=file_path
-        )
-    def validate_toml_syntax(self, content: str) -> ValidationResult:
-        """
-        Validate TOML syntax without full parsing.
-        Args:
-            content: TOML content to validate
-        Returns:
-            ValidationResult with validation status
-        """
-        result = ValidationResult(is_valid=True)
-        try:
-            toml_data = toml.loads(content)
-            # Validate expected sections
-            valid_sections = {'generator', 'schema'}
-            for section in toml_data:
-                if section not in valid_sections and not isinstance(toml_data[section], dict):
-                    result.add_warning(f"Unexpected section '{section}' - will be treated as metadata")
-            # Validate generator section structure
-            if 'generator' in toml_data:
-                generator = toml_data['generator']
-                if not isinstance(generator, dict):
-                    result.add_error("Generator section must be a table/dictionary")
-                else:
-                    # Validate import format
-                    if 'import' in generator:
-                        import_val = generator['import']
-                        if not isinstance(import_val, (str, list)):
-                            result.add_error("Import declaration must be a string or list of strings")
-            # Validate schema section structure
-            if 'schema' in toml_data:
-                schema = toml_data['schema']
-                if not isinstance(schema, dict):
-                    result.add_error("Schema section must be a table/dictionary")
-                else:
-                    for key, value in schema.items():
-                        if not isinstance(value, str):
-                            result.add_error(f"Schema definition '{key}' must be a string")
-        except toml.TomlDecodeError as e:
-            result.add_error(
-                f"TOML syntax error: {e}",
-                "Check TOML syntax for proper formatting, quotes, and brackets"
-            )
-        return result
-    def clear_cache(self):
-        """Clear the schema cache."""
-        self._schema_cache.clear()

additory 0.1.0a2__py3-none-any.whl → 0.1.0a3__py3-none-any.whl

additory 0.1.0a2__py3-none-any.whl → 0.1.0a3__py3-none-any.whl