PyPI - additory - Versions diffs - 0.1.0a1__py3-none-any.whl → 0.1.0a3__py3-none-any.whl - Mend

additory 0.1.0a1-py3-none-any.whl → 0.1.0a3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43) hide show

additory/__init__.py +4 -0
additory/common/__init__.py +2 -2
additory/common/backend.py +20 -4
additory/common/distributions.py +1 -1
additory/common/sample_data.py +19 -19
additory/core/backends/arrow_bridge.py +7 -0
additory/core/polars_expression_engine.py +66 -16
additory/dynamic_api.py +42 -46
additory/expressions/proxy.py +4 -1
additory/synthetic/__init__.py +7 -95
additory/synthetic/column_name_resolver.py +149 -0
additory/{augment → synthetic}/distributions.py +2 -2
additory/{augment → synthetic}/forecast.py +1 -1
additory/synthetic/linked_list_parser.py +415 -0
additory/synthetic/namespace_lookup.py +129 -0
additory/{augment → synthetic}/smote.py +1 -1
additory/{augment → synthetic}/strategies.py +11 -44
additory/{augment/augmentor.py → synthetic/synthesizer.py} +75 -15
additory/utilities/units.py +4 -1
{additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/METADATA +12 -17
{additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/RECORD +24 -40
{additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/WHEEL +1 -1
additory/augment/__init__.py +0 -24
additory/augment/builtin_lists.py +0 -430
additory/augment/list_registry.py +0 -177
additory/synthetic/api.py +0 -220
additory/synthetic/common_integration.py +0 -314
additory/synthetic/config.py +0 -262
additory/synthetic/engines.py +0 -529
additory/synthetic/exceptions.py +0 -180
additory/synthetic/file_managers.py +0 -518
additory/synthetic/generator.py +0 -702
additory/synthetic/generator_parser.py +0 -68
additory/synthetic/integration.py +0 -319
additory/synthetic/models.py +0 -241
additory/synthetic/pattern_resolver.py +0 -573
additory/synthetic/performance.py +0 -469
additory/synthetic/polars_integration.py +0 -464
additory/synthetic/proxy.py +0 -60
additory/synthetic/schema_parser.py +0 -685
additory/synthetic/validator.py +0 -553
{additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/licenses/LICENSE +0 -0
{additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/top_level.txt +0 -0

additory/synthetic/performance.py DELETED Viewed

@@ -1,469 +0,0 @@
-"""
-Performance monitoring and optimization for synthetic data generation.
-Provides comprehensive performance tracking, memory management, and optimization
-features for the polars-native synthetic data generation system.
-"""
-import time
-import psutil
-import gc
-from typing import Dict, List, Optional, Any, Union
-from dataclasses import dataclass, field
-from contextlib import contextmanager
-import polars as pl
-import pandas as pd
-from .exceptions import SyntheticDataError
-@dataclass
-class PerformanceMetrics:
-    """
-    Performance metrics for data generation operations.
-    One instance describes a single monitored operation. The first twelve
-    fields have no defaults and must be supplied by whoever builds the record;
-    the remaining four are optional.
-    """
-    # Label of the monitored operation.
-    operation_name: str
-    # time.time() timestamps (seconds) taken when monitoring started / ended.
-    start_time: float
-    end_time: float
-    # Elapsed time of the operation, in milliseconds.
-    duration_ms: float
-    # Memory readings in MB taken before and after the operation.
-    memory_before_mb: float
-    memory_after_mb: float
-    # Highest memory reading recorded for the operation, in MB.
-    memory_peak_mb: float
-    # memory_after_mb - memory_before_mb; negative when memory was released.
-    memory_delta_mb: float
-    # Row / column counts as reported by the caller of the monitor.
-    rows_generated: int
-    columns_generated: int
-    # Throughput; the monitor stores 0 when no rows or no elapsed time was recorded.
-    rows_per_second: float
-    # memory_delta_mb expressed in KB and divided by rows_generated (0 when rows is 0).
-    memory_per_row_kb: float
-    # Number of batches; defaults to a single batch.
-    batch_count: int = 1
-    # Count of polars operations tracked while the operation ran.
-    polars_operations: int = 0
-    # Accumulated time spent on format conversion, in milliseconds.
-    conversion_time_ms: float = 0.0
-    # Free-form extra data; default_factory gives every record its own dict.
-    metadata: Dict[str, Any] = field(default_factory=dict)
-@dataclass
-class PerformanceComparison:
-    """
-    Comparison between pandas and polars performance.
-    Only the polars measurement is required; the pandas measurement and the
-    derived ratios stay None when no pandas run is available.
-    """
-    # Name of the operation being compared.
-    operation: str
-    # Measurement taken with the polars engine.
-    polars_metrics: PerformanceMetrics
-    # Measurement taken with the pandas engine, if one exists.
-    pandas_metrics: Optional[PerformanceMetrics] = None
-    # Ratio pandas duration / polars duration (values above 1 favour polars).
-    polars_advantage_speed: Optional[float] = None  # How many times faster polars is
-    # Ratio pandas memory delta / polars memory delta (values above 1 favour polars).
-    polars_advantage_memory: Optional[float] = None  # How much less memory polars uses
-    # Human-readable recommendation text; empty until one is generated.
-    recommendation: str = ""
-class PerformanceMonitor:
-    """
-    Comprehensive performance monitoring system for synthetic data generation.
-    Tracks generation speed, memory usage, and provides optimization recommendations.
-    """
-    def __init__(self):
-        """Initialize the performance monitor."""
-        self.metrics_history: List[PerformanceMetrics] = []
-        self.comparisons: List[PerformanceComparison] = []
-        self._current_operation: Optional[str] = None
-        self._operation_start_time: Optional[float] = None
-        self._operation_start_memory: Optional[float] = None
-        self._peak_memory: float = 0.0
-        self._polars_operations_count: int = 0
-        self._conversion_time_ms: float = 0.0
-    @contextmanager
-    def monitor_operation(self, operation_name: str, rows: int = 0, columns: int = 0, **metadata):
-        """
-        Context manager for monitoring a data generation operation.
-        Args:
-            operation_name: Name of the operation being monitored
-            rows: Number of rows being generated
-            columns: Number of columns being generated
-            **metadata: Additional metadata to track
-        """
-        # Start monitoring
-        start_time = time.time()
-        start_memory = self._get_memory_usage_mb()
-        self._current_operation = operation_name
-        self._operation_start_time = start_time
-        self._operation_start_memory = start_memory
-        self._peak_memory = start_memory
-        self._polars_operations_count = 0
-        self._conversion_time_ms = 0.0
-        # Force garbage collection for accurate memory measurement
-        gc.collect()
-        try:
-            yield self
-        finally:
-            # End monitoring
-            end_time = time.time()
-            end_memory = self._get_memory_usage_mb()
-            # Calculate metrics
-            duration_ms = (end_time - start_time) * 1000
-            memory_delta = end_memory - start_memory
-            rows_per_second = rows / (duration_ms / 1000) if duration_ms > 0 and rows > 0 else 0
-            memory_per_row_kb = (memory_delta * 1024) / rows if rows > 0 else 0
-            # Create metrics object
-            metrics = PerformanceMetrics(
-                operation_name=operation_name,
-                start_time=start_time,
-                end_time=end_time,
-                duration_ms=duration_ms,
-                memory_before_mb=start_memory,
-                memory_after_mb=end_memory,
-                memory_peak_mb=self._peak_memory,
-                memory_delta_mb=memory_delta,
-                rows_generated=rows,
-                columns_generated=columns,
-                rows_per_second=rows_per_second,
-                memory_per_row_kb=memory_per_row_kb,
-                polars_operations=self._polars_operations_count,
-                conversion_time_ms=self._conversion_time_ms,
-                metadata=metadata
-            )
-            # Store metrics
-            self.metrics_history.append(metrics)
-            # Reset monitoring state
-            self._current_operation = None
-            self._operation_start_time = None
-            self._operation_start_memory = None
-            self._peak_memory = 0.0
-            self._polars_operations_count = 0
-            self._conversion_time_ms = 0.0
-    def update_peak_memory(self):
-        """Update peak memory usage during operation."""
-        if self._current_operation:
-            current_memory = self._get_memory_usage_mb()
-            self._peak_memory = max(self._peak_memory, current_memory)
-    def track_polars_operation(self):
-        """Track a polars operation for performance counting."""
-        if self._current_operation:
-            self._polars_operations_count += 1
-    def track_conversion_time(self, conversion_time_ms: float):
-        """Track time spent on format conversion."""
-        if self._current_operation:
-            self._conversion_time_ms += conversion_time_ms
-    def get_latest_metrics(self) -> Optional[PerformanceMetrics]:
-        """Get the most recent performance metrics."""
-        return self.metrics_history[-1] if self.metrics_history else None
-    def get_metrics_summary(self, operation_filter: Optional[str] = None) -> Dict[str, Any]:
-        """
-        Get a summary of performance metrics.
-        Args:
-            operation_filter: Filter metrics by operation name
-        Returns:
-            Dictionary with performance summary statistics
-        """
-        filtered_metrics = self.metrics_history
-        if operation_filter:
-            filtered_metrics = [m for m in self.metrics_history if operation_filter in m.operation_name]
-        if not filtered_metrics:
-            return {"message": "No metrics available"}
-        # Calculate summary statistics
-        total_operations = len(filtered_metrics)
-        total_rows = sum(m.rows_generated for m in filtered_metrics)
-        total_duration_ms = sum(m.duration_ms for m in filtered_metrics)
-        total_memory_delta = sum(m.memory_delta_mb for m in filtered_metrics)
-        avg_rows_per_second = sum(m.rows_per_second for m in filtered_metrics) / total_operations
-        avg_memory_per_row_kb = sum(m.memory_per_row_kb for m in filtered_metrics) / total_operations
-        avg_duration_ms = total_duration_ms / total_operations
-        max_memory_usage = max(m.memory_peak_mb for m in filtered_metrics)
-        min_memory_usage = min(m.memory_before_mb for m in filtered_metrics)
-        return {
-            "total_operations": total_operations,
-            "total_rows_generated": total_rows,
-            "total_duration_ms": total_duration_ms,
-            "total_memory_delta_mb": total_memory_delta,
-            "average_rows_per_second": avg_rows_per_second,
-            "average_memory_per_row_kb": avg_memory_per_row_kb,
-            "average_duration_ms": avg_duration_ms,
-            "peak_memory_usage_mb": max_memory_usage,
-            "min_memory_usage_mb": min_memory_usage,
-            "memory_efficiency_score": self._calculate_memory_efficiency_score(filtered_metrics)
-        }
-    def compare_engines(self, polars_metrics: PerformanceMetrics,
-                       pandas_metrics: Optional[PerformanceMetrics] = None) -> PerformanceComparison:
-        """
-        Compare performance between polars and pandas engines.
-        Args:
-            polars_metrics: Performance metrics from polars engine
-            pandas_metrics: Performance metrics from pandas engine (if available)
-        Returns:
-            PerformanceComparison with analysis and recommendations
-        """
-        comparison = PerformanceComparison(
-            operation=polars_metrics.operation_name,
-            polars_metrics=polars_metrics,
-            pandas_metrics=pandas_metrics
-        )
-        if pandas_metrics:
-            # Calculate advantages
-            if pandas_metrics.duration_ms > 0:
-                comparison.polars_advantage_speed = pandas_metrics.duration_ms / polars_metrics.duration_ms
-            if pandas_metrics.memory_delta_mb > 0:
-                comparison.polars_advantage_memory = pandas_metrics.memory_delta_mb / polars_metrics.memory_delta_mb
-            # Generate recommendation
-            comparison.recommendation = self._generate_engine_recommendation(comparison)
-        else:
-            comparison.recommendation = "Use polars engine for optimal performance"
-        self.comparisons.append(comparison)
-        return comparison
-    def get_optimization_recommendations(self) -> List[str]:
-        """
-        Get optimization recommendations based on performance history.
-        Returns:
-            List of actionable optimization recommendations
-        """
-        recommendations = []
-        if not self.metrics_history:
-            return ["No performance data available for recommendations"]
-        # Analyze recent metrics
-        recent_metrics = self.metrics_history[-10:]  # Last 10 operations
-        # Memory usage recommendations
-        avg_memory_per_row = sum(m.memory_per_row_kb for m in recent_metrics) / len(recent_metrics)
-        if avg_memory_per_row > 100:  # More than 100KB per row
-            recommendations.append(
-                f"High memory usage detected ({avg_memory_per_row:.1f}KB per row). "
-                "Consider reducing batch size or using streaming generation."
-            )
-        # Speed recommendations
-        avg_rows_per_second = sum(m.rows_per_second for m in recent_metrics) / len(recent_metrics)
-        if avg_rows_per_second < 1000:  # Less than 1000 rows per second
-            recommendations.append(
-                f"Low generation speed detected ({avg_rows_per_second:.0f} rows/sec). "
-                "Consider optimizing regex patterns or increasing batch size."
-            )
-        # Batch size recommendations
-        batch_sizes = [m.metadata.get('batch_size', 0) for m in recent_metrics if 'batch_size' in m.metadata]
-        if batch_sizes:
-            avg_batch_size = sum(batch_sizes) / len(batch_sizes)
-            if avg_batch_size < 1000:
-                recommendations.append(
-                    f"Small batch size detected ({avg_batch_size:.0f}). "
-                    "Consider increasing batch size for better performance."
-                )
-            elif avg_batch_size > 100000:
-                recommendations.append(
-                    f"Large batch size detected ({avg_batch_size:.0f}). "
-                    "Consider reducing batch size to manage memory usage."
-                )
-        # Conversion overhead recommendations
-        conversion_times = [m.conversion_time_ms for m in recent_metrics if m.conversion_time_ms > 0]
-        if conversion_times:
-            avg_conversion_time = sum(conversion_times) / len(conversion_times)
-            total_avg_time = sum(m.duration_ms for m in recent_metrics) / len(recent_metrics)
-            conversion_ratio = avg_conversion_time / total_avg_time if total_avg_time > 0 else 0
-            if conversion_ratio > 0.3:  # More than 30% of time spent on conversion
-                recommendations.append(
-                    f"High conversion overhead detected ({conversion_ratio*100:.1f}% of total time). "
-                    "Consider using polars output format for better performance."
-                )
-        return recommendations if recommendations else ["Performance looks good! No specific optimizations needed."]
-    def _get_memory_usage_mb(self) -> float:
-        """Get current memory usage in MB."""
-        try:
-            process = psutil.Process()
-            return process.memory_info().rss / 1024 / 1024
-        except Exception:
-            return 0.0
-    def _calculate_memory_efficiency_score(self, metrics: List[PerformanceMetrics]) -> float:
-        """
-        Calculate a memory efficiency score (0-100).
-        Higher scores indicate better memory efficiency.
-        """
-        if not metrics:
-            return 0.0
-        # Base score on memory per row (lower is better)
-        avg_memory_per_row = sum(m.memory_per_row_kb for m in metrics) / len(metrics)
-        # Score calculation: 100 - (memory_per_row_kb / 10)
-        # This gives 100 for 0KB/row, 90 for 1KB/row, etc.
-        score = max(0, 100 - (avg_memory_per_row / 10))
-        return min(100, score)
-    def _generate_engine_recommendation(self, comparison: PerformanceComparison) -> str:
-        """Generate engine recommendation based on performance comparison."""
-        if not comparison.pandas_metrics:
-            return "Use polars engine for optimal performance"
-        speed_advantage = comparison.polars_advantage_speed or 1.0
-        memory_advantage = comparison.polars_advantage_memory or 1.0
-        if speed_advantage > 2.0 and memory_advantage > 1.5:
-            return f"Strong recommendation: Use polars engine ({speed_advantage:.1f}x faster, {memory_advantage:.1f}x more memory efficient)"
-        elif speed_advantage > 1.5:
-            return f"Recommendation: Use polars engine ({speed_advantage:.1f}x faster)"
-        elif memory_advantage > 1.5:
-            return f"Recommendation: Use polars engine ({memory_advantage:.1f}x more memory efficient)"
-        else:
-            return "Both engines perform similarly. Use polars for consistency."
-    def clear_history(self):
-        """Clear performance metrics history."""
-        self.metrics_history.clear()
-        self.comparisons.clear()
-    def export_metrics(self, format: str = "dict") -> Union[Dict, pd.DataFrame, pl.DataFrame]:
-        """
-        Export performance metrics in various formats.
-        Args:
-            format: Export format ("dict", "pandas", "polars")
-        Returns:
-            Metrics in the requested format
-        """
-        if not self.metrics_history:
-            return {} if format == "dict" else None
-        if format == "dict":
-            return [
-                {
-                    "operation_name": m.operation_name,
-                    "duration_ms": m.duration_ms,
-                    "memory_delta_mb": m.memory_delta_mb,
-                    "rows_generated": m.rows_generated,
-                    "rows_per_second": m.rows_per_second,
-                    "memory_per_row_kb": m.memory_per_row_kb,
-                    "polars_operations": m.polars_operations,
-                    "conversion_time_ms": m.conversion_time_ms
-                }
-                for m in self.metrics_history
-            ]
-        elif format == "pandas":
-            data = self.export_metrics("dict")
-            return pd.DataFrame(data)
-        elif format == "polars":
-            data = self.export_metrics("dict")
-            return pl.DataFrame(data)
-        else:
-            raise ValueError(f"Unsupported export format: {format}")
-class PerformanceOptimizer:
-    """
-    Performance optimization utilities for synthetic data generation.
-    Provides automatic optimization recommendations and configuration tuning.
-    """
-    def __init__(self, monitor: PerformanceMonitor):
-        """Initialize the performance optimizer."""
-        self.monitor = monitor
-    def optimize_batch_size(self, target_rows: int, available_memory_mb: Optional[float] = None) -> int:
-        """
-        Calculate optimal batch size based on target rows and available memory.
-        Args:
-            target_rows: Total number of rows to generate
-            available_memory_mb: Available memory in MB (auto-detected if None)
-        Returns:
-            Recommended batch size
-        """
-        if available_memory_mb is None:
-            available_memory_mb = self._get_available_memory_mb()
-        # Get historical memory usage per row
-        recent_metrics = self.monitor.metrics_history[-5:] if self.monitor.metrics_history else []
-        if recent_metrics:
-            avg_memory_per_row_kb = sum(m.memory_per_row_kb for m in recent_metrics) / len(recent_metrics)
-        else:
-            avg_memory_per_row_kb = 10  # Default estimate: 10KB per row
-        # Calculate batch size to use ~50% of available memory
-        target_memory_mb = available_memory_mb * 0.5
-        target_memory_kb = target_memory_mb * 1024
-        optimal_batch_size = int(target_memory_kb / avg_memory_per_row_kb) if avg_memory_per_row_kb > 0 else 10000
-        # Apply constraints
-        optimal_batch_size = max(1000, min(optimal_batch_size, 100000))  # Between 1K and 100K
-        optimal_batch_size = min(optimal_batch_size, target_rows)  # Don't exceed target rows
-        return optimal_batch_size
-    def should_use_streaming(self, target_rows: int, columns: int) -> bool:
-        """
-        Determine if streaming generation should be used for large datasets.
-        Args:
-            target_rows: Number of rows to generate
-            columns: Number of columns to generate
-        Returns:
-            True if streaming is recommended
-        """
-        # Use streaming for very large datasets
-        total_cells = target_rows * columns
-        return total_cells > 10_000_000  # More than 10M cells
-    def get_memory_optimization_config(self) -> Dict[str, Any]:
-        """
-        Get configuration recommendations for memory optimization.
-        Returns:
-            Dictionary with optimization configuration
-        """
-        available_memory = self._get_available_memory_mb()
-        return {
-            "batch_size": min(50000, max(1000, int(available_memory * 100))),  # Scale with memory
-            "memory_limit_mb": available_memory * 0.8,  # Use 80% of available memory
-            "lazy_evaluation": True,
-            "garbage_collection_frequency": 5,  # GC every 5 batches
-            "streaming_threshold": 1_000_000  # Use streaming for >1M rows
-        }
-    def _get_available_memory_mb(self) -> float:
-        """Get available system memory in MB."""
-        try:
-            memory = psutil.virtual_memory()
-            return memory.available / 1024 / 1024
-        except Exception:
-            return 1024.0  # Default to 1GB if detection fails
-# Module-level shared instances, created as a side effect of importing this
-# module: one monitor, plus an optimizer bound to that same monitor so it
-# reads the monitor's metrics history.
-performance_monitor = PerformanceMonitor()
-performance_optimizer = PerformanceOptimizer(performance_monitor)

additory 0.1.0a1__py3-none-any.whl → 0.1.0a3__py3-none-any.whl

additory 0.1.0a1-py3-none-any.whl → 0.1.0a3-py3-none-any.whl