gapless-crypto-clickhouse 7.1.0 (gapless_crypto_clickhouse-7.1.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gapless_crypto_clickhouse/__init__.py +147 -0
- gapless_crypto_clickhouse/__probe__.py +349 -0
- gapless_crypto_clickhouse/api.py +1032 -0
- gapless_crypto_clickhouse/clickhouse/__init__.py +17 -0
- gapless_crypto_clickhouse/clickhouse/config.py +119 -0
- gapless_crypto_clickhouse/clickhouse/connection.py +269 -0
- gapless_crypto_clickhouse/clickhouse/schema.sql +98 -0
- gapless_crypto_clickhouse/clickhouse/schema_validator.py +312 -0
- gapless_crypto_clickhouse/clickhouse_query.py +642 -0
- gapless_crypto_clickhouse/collectors/__init__.py +21 -0
- gapless_crypto_clickhouse/collectors/binance_public_data_collector.py +1994 -0
- gapless_crypto_clickhouse/collectors/clickhouse_bulk_loader.py +446 -0
- gapless_crypto_clickhouse/collectors/concurrent_collection_orchestrator.py +407 -0
- gapless_crypto_clickhouse/collectors/csv_format_detector.py +123 -0
- gapless_crypto_clickhouse/collectors/httpx_downloader.py +395 -0
- gapless_crypto_clickhouse/collectors/hybrid_url_generator.py +316 -0
- gapless_crypto_clickhouse/exceptions.py +145 -0
- gapless_crypto_clickhouse/gap_filling/__init__.py +1 -0
- gapless_crypto_clickhouse/gap_filling/safe_file_operations.py +439 -0
- gapless_crypto_clickhouse/gap_filling/universal_gap_filler.py +757 -0
- gapless_crypto_clickhouse/llms.txt +268 -0
- gapless_crypto_clickhouse/probe.py +235 -0
- gapless_crypto_clickhouse/py.typed +0 -0
- gapless_crypto_clickhouse/query_api.py +374 -0
- gapless_crypto_clickhouse/resume/__init__.py +12 -0
- gapless_crypto_clickhouse/resume/intelligent_checkpointing.py +383 -0
- gapless_crypto_clickhouse/utils/__init__.py +29 -0
- gapless_crypto_clickhouse/utils/error_handling.py +202 -0
- gapless_crypto_clickhouse/utils/etag_cache.py +194 -0
- gapless_crypto_clickhouse/utils/timeframe_constants.py +90 -0
- gapless_crypto_clickhouse/utils/timestamp_format_analyzer.py +256 -0
- gapless_crypto_clickhouse/utils/timestamp_utils.py +130 -0
- gapless_crypto_clickhouse/validation/__init__.py +36 -0
- gapless_crypto_clickhouse/validation/csv_validator.py +677 -0
- gapless_crypto_clickhouse/validation/models.py +220 -0
- gapless_crypto_clickhouse/validation/storage.py +502 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/METADATA +1277 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/RECORD +40 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/WHEEL +4 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/licenses/LICENSE +21 -0
gapless_crypto_clickhouse/resume/intelligent_checkpointing.py
@@ -0,0 +1,383 @@
"""
Intelligent Resume System with SOTA Checkpointing

Provides bulletproof resume capabilities for large-scale cryptocurrency data collection.
Uses simple JSON state persistence with automatic resume from the last successful checkpoint.
Eliminates restart frustration for multi-symbol, multi-timeframe, multi-year collections.

Architecture:
- Symbol-level checkpointing: Resume from last completed symbol
- Timeframe-level checkpointing: Resume from last completed timeframe within symbol
- Collection-level checkpointing: Resume from last completed collection task
- Progress persistence: Maintains collection state across interruptions
- Integrity validation: Verifies checkpoint consistency before resume
"""

import hashlib
import json
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

# joblib removed - using simple JSON state persistence
from ..utils import GaplessCryptoError, get_standard_logger


class CheckpointError(GaplessCryptoError):
    """Checkpoint-specific errors"""

    pass


class IntelligentCheckpointManager:
    """
    SOTA checkpoint manager using simple JSON state persistence for disk-backed resume capabilities.

    Provides enterprise-grade resume functionality for large-scale cryptocurrency data collection
    with automatic progress tracking, integrity validation, and efficient storage.
    """

    def __init__(
        self,
        cache_dir: Optional[Union[str, Path]] = None,
        verbose: int = 1,
        compress: Union[bool, int] = True,
    ):
        """
        Initialize checkpoint manager.

        Args:
            cache_dir: Directory for checkpoint cache (default: ./.gapless_checkpoints)
            verbose: Verbosity level (kept for backward compatibility; currently ignored)
            compress: Compression level for checkpoints (kept for backward compatibility; currently ignored)
        """
        self.cache_dir = Path(cache_dir or ".gapless_checkpoints").resolve()
        self.cache_dir.mkdir(parents=True, exist_ok=True)

        # Simple JSON state persistence (joblib removed)
        # Parameters kept for backward compatibility but ignored
        self._verbose = verbose
        self._compress = compress

        self.logger = get_standard_logger("checkpoint_manager")
        self.session_id = self._generate_session_id()
        self.checkpoint_file = self.cache_dir / f"session_{self.session_id}.json"

        # Progress tracking
        self.progress_data: Dict[str, Any] = {
            "session_id": self.session_id,
            "created_at": datetime.now().isoformat(),
            "last_updated": datetime.now().isoformat(),
            "symbols_completed": [],
            "symbols_in_progress": {},
            "total_datasets_collected": 0,
            "collection_parameters": {},
            "errors": [],
        }

        self.logger.info(f"🔄 Checkpoint manager initialized: {self.cache_dir}")
        self.logger.info(f"📋 Session ID: {self.session_id}")

    def _generate_session_id(self) -> str:
        """Generate unique session identifier for checkpoint isolation."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        random_suffix = hashlib.md5(str(datetime.now().timestamp()).encode()).hexdigest()[:8]
        return f"{timestamp}_{random_suffix}"

    def save_checkpoint(self, checkpoint_data: Dict[str, Any]) -> None:
        """
        Save checkpoint with atomic write operations.

        Args:
            checkpoint_data: Checkpoint state to persist
        """
        try:
            # Update progress data
            self.progress_data.update(checkpoint_data)
            self.progress_data["last_updated"] = datetime.now().isoformat()

            # Atomic write to prevent corruption
            temp_file = self.checkpoint_file.with_suffix(".tmp")
            with open(temp_file, "w") as f:
                json.dump(self.progress_data, f, indent=2, default=str)

            # Atomic rename for consistency
            temp_file.replace(self.checkpoint_file)

            self.logger.debug(
                f"💾 Checkpoint saved: {checkpoint_data.get('current_symbol', 'unknown')}"
            )

        except Exception as e:
            raise CheckpointError(f"Failed to save checkpoint: {e}")

    def load_checkpoint(self) -> Optional[Dict[str, Any]]:
        """
        Load checkpoint with integrity validation.

        Returns:
            Checkpoint data if valid, None if no valid checkpoint exists
        """
        try:
            if not self.checkpoint_file.exists():
                self.logger.info("📂 No existing checkpoint found")
                return None

            with open(self.checkpoint_file, "r") as f:
                checkpoint_data = json.load(f)

            # Validate checkpoint integrity
            if not self._validate_checkpoint(checkpoint_data):
                self.logger.warning("⚠️ Invalid checkpoint detected, starting fresh")
                return None

            self.progress_data = checkpoint_data
            self.logger.info(f"📋 Loaded checkpoint: Session {checkpoint_data.get('session_id')}")
            self.logger.info(
                f"✅ Completed symbols: {len(checkpoint_data.get('symbols_completed', []))}"
            )

            return checkpoint_data

        except Exception as e:
            self.logger.warning(f"⚠️ Failed to load checkpoint: {e}")
            return None

    def _validate_checkpoint(self, checkpoint_data: Dict[str, Any]) -> bool:
        """Validate checkpoint data integrity and completeness."""
        required_fields = [
            "session_id",
            "created_at",
            "symbols_completed",
            "symbols_in_progress",
            "collection_parameters",
        ]

        for field in required_fields:
            if field not in checkpoint_data:
                self.logger.warning(f"❌ Missing checkpoint field: {field}")
                return False

        return True

    def get_resume_plan(
        self,
        requested_symbols: List[str],
        requested_timeframes: List[str],
        collection_params: Dict[str, Any],
    ) -> Dict[str, Any]:
        """
        Generate intelligent resume plan based on checkpoint state.

        Args:
            requested_symbols: Symbols to collect
            requested_timeframes: Timeframes to collect
            collection_params: Collection parameters (dates, output_dir, etc.)

        Returns:
            Resume plan with remaining work and progress summary
        """
        checkpoint = self.load_checkpoint()

        if not checkpoint:
            # No checkpoint - start from beginning
            return {
                "resume_required": False,
                "remaining_symbols": requested_symbols,
                "completed_symbols": [],
                "symbols_in_progress": {},
                "total_progress": 0.0,
                "message": "Starting fresh collection",
            }

        # Validate parameters match checkpoint
        checkpoint_params = checkpoint.get("collection_parameters", {})
        if not self._params_compatible(checkpoint_params, collection_params):
            self.logger.warning("⚠️ Parameters changed, starting fresh collection")
            self.clear_checkpoint()
            return {
                "resume_required": False,
                "remaining_symbols": requested_symbols,
                "completed_symbols": [],
                "symbols_in_progress": {},
                "total_progress": 0.0,
                "message": "Parameters changed - starting fresh",
            }

        # Calculate remaining work
        completed_symbols = set(checkpoint.get("symbols_completed", []))
        symbols_in_progress = checkpoint.get("symbols_in_progress", {})
        remaining_symbols = [s for s in requested_symbols if s not in completed_symbols]

        # Calculate progress
        total_tasks = len(requested_symbols) * len(requested_timeframes)
        completed_tasks = len(completed_symbols) * len(requested_timeframes)

        # Add partial progress for symbols in progress
        for symbol, progress in symbols_in_progress.items():
            completed_tasks += len(progress.get("completed_timeframes", []))

        progress_percentage = (completed_tasks / total_tasks * 100) if total_tasks > 0 else 0

        resume_plan = {
            "resume_required": len(completed_symbols) > 0 or len(symbols_in_progress) > 0,
            "remaining_symbols": remaining_symbols,
            "completed_symbols": list(completed_symbols),
            "symbols_in_progress": symbols_in_progress,
            "total_progress": progress_percentage,
            "completed_datasets": checkpoint.get("total_datasets_collected", 0),
            "message": f"Resuming from {progress_percentage:.1f}% complete",
        }

        if resume_plan["resume_required"]:
            self.logger.info(f"🔄 Resume plan: {progress_percentage:.1f}% complete")
            self.logger.info(f"✅ Completed: {len(completed_symbols)} symbols")
            self.logger.info(f"⏳ In progress: {len(symbols_in_progress)} symbols")
            self.logger.info(f"🔵 Remaining: {len(remaining_symbols)} symbols")

        return resume_plan

    def _params_compatible(
        self, checkpoint_params: Dict[str, Any], current_params: Dict[str, Any]
    ) -> bool:
        """Check if collection parameters are compatible for resume."""
        critical_params = ["start_date", "end_date", "output_dir"]

        for param in critical_params:
            checkpoint_val = checkpoint_params.get(param)
            current_val = current_params.get(param)

            if checkpoint_val != current_val:
                self.logger.debug(
                    f"Parameter mismatch: {param} changed from {checkpoint_val} to {current_val}"
                )
                return False

        return True

    def mark_symbol_start(self, symbol: str, timeframes: List[str]) -> None:
        """Mark symbol collection as started."""
        self.progress_data["symbols_in_progress"][symbol] = {
            "started_at": datetime.now().isoformat(),
            "timeframes": timeframes,
            "completed_timeframes": [],
            "failed_timeframes": [],
        }
        self.save_checkpoint({"current_symbol": symbol})

    def mark_timeframe_complete(
        self, symbol: str, timeframe: str, filepath: Path, file_size_mb: float
    ) -> None:
        """Mark timeframe collection as completed."""
        if symbol in self.progress_data["symbols_in_progress"]:
            symbol_progress = self.progress_data["symbols_in_progress"][symbol]
            symbol_progress["completed_timeframes"].append(
                {
                    "timeframe": timeframe,
                    "completed_at": datetime.now().isoformat(),
                    "filepath": str(filepath),
                    "file_size_mb": file_size_mb,
                }
            )

        self.progress_data["total_datasets_collected"] += 1
        self.save_checkpoint({})

    def mark_symbol_complete(self, symbol: str) -> None:
        """Mark symbol collection as fully completed."""
        if symbol in self.progress_data["symbols_in_progress"]:
            # Move from in_progress to completed
            self.progress_data["symbols_completed"].append(symbol)
            del self.progress_data["symbols_in_progress"][symbol]

        self.save_checkpoint({"completed_symbol": symbol})
        self.logger.info(f"✅ Symbol completed: {symbol}")

    def mark_symbol_failed(self, symbol: str, error: str) -> None:
        """Mark symbol collection as failed."""
        self.progress_data["errors"].append(
            {"symbol": symbol, "error": error, "timestamp": datetime.now().isoformat()}
        )

        if symbol in self.progress_data["symbols_in_progress"]:
            del self.progress_data["symbols_in_progress"][symbol]

        self.save_checkpoint({"failed_symbol": symbol})

    def clear_checkpoint(self) -> None:
        """Clear checkpoint and start fresh."""
        try:
            if self.checkpoint_file.exists():
                self.checkpoint_file.unlink()

            # Clear cache directory (joblib removed)
            import shutil

            cache_dir = self.cache_dir / "cache"
            if cache_dir.exists():
                shutil.rmtree(cache_dir)

            self.logger.info("🗑️ Checkpoint cleared - starting fresh")

        except Exception as e:
            self.logger.warning(f"⚠️ Failed to clear checkpoint: {e}")

    def get_cached_collection_function(self, func):
        """
        Simple wrapper for collection function (joblib caching removed).

        Args:
            func: Function to wrap (deterministic functions recommended)

        Returns:
            Original function (no caching applied)
        """
        # Return original function - caching removed for simplicity
        return func

    def cleanup_old_sessions(self, max_age_days: int = 7) -> None:
        """Clean up old checkpoint sessions."""
        try:
            cutoff_time = datetime.now().timestamp() - (max_age_days * 24 * 3600)

            for checkpoint_file in self.cache_dir.glob("session_*.json"):
                if checkpoint_file.stat().st_mtime < cutoff_time:
                    checkpoint_file.unlink()
                    self.logger.debug(f"🗑️ Cleaned up old session: {checkpoint_file.name}")

        except Exception as e:
            self.logger.warning(f"⚠️ Failed to cleanup old sessions: {e}")

    def get_progress_summary(self) -> Dict[str, Any]:
        """Get current progress summary for display."""
        return {
            "session_id": self.session_id,
            "completed_symbols": len(self.progress_data.get("symbols_completed", [])),
            "symbols_in_progress": len(self.progress_data.get("symbols_in_progress", {})),
            "total_datasets": self.progress_data.get("total_datasets_collected", 0),
            "last_updated": self.progress_data.get("last_updated"),
            "errors": len(self.progress_data.get("errors", [])),
        }

    def export_progress_report(self, output_file: Optional[Path] = None) -> Path:
        """Export detailed progress report for analysis."""
        if output_file is None:
            output_file = self.cache_dir / f"progress_report_{self.session_id}.json"

        report = {
            "progress_summary": self.get_progress_summary(),
            "detailed_progress": self.progress_data,
            "cache_info": {
                "cache_dir": str(self.cache_dir),
                "cache_size_mb": sum(
                    f.stat().st_size for f in self.cache_dir.rglob("*") if f.is_file()
                )
                / (1024 * 1024),
            },
        }

        with open(output_file, "w") as f:
            json.dump(report, f, indent=2, default=str)

        self.logger.info(f"📊 Progress report exported: {output_file}")
        return output_file
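A minimal usage sketch of the checkpoint manager above, assuming the import path from the file listing (gapless_crypto_clickhouse/resume/intelligent_checkpointing.py); the symbols, timeframes, dates, and the per-timeframe collection step are hypothetical placeholders, while every manager method shown is defined in the module.

# Hypothetical driver loop around IntelligentCheckpointManager; only the manager
# calls are real API from the module above.
from pathlib import Path

from gapless_crypto_clickhouse.resume.intelligent_checkpointing import IntelligentCheckpointManager

symbols = ["BTCUSDT", "ETHUSDT"]      # hypothetical
timeframes = ["1m", "1h"]             # hypothetical
params = {"start_date": "2024-01-01", "end_date": "2024-06-30", "output_dir": "./data"}

manager = IntelligentCheckpointManager(cache_dir="./.gapless_checkpoints")
plan = manager.get_resume_plan(symbols, timeframes, params)
print(plan["message"])                # e.g. "Starting fresh collection"

for symbol in plan["remaining_symbols"]:
    manager.mark_symbol_start(symbol, timeframes)
    try:
        for tf in timeframes:
            out = Path(f"./data/{symbol}_{tf}.csv")   # real collection call would go here
            manager.mark_timeframe_complete(symbol, tf, out, file_size_mb=1.0)
        manager.mark_symbol_complete(symbol)
    except Exception as exc:
        manager.mark_symbol_failed(symbol, str(exc))

manager.export_progress_report()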
gapless_crypto_clickhouse/utils/__init__.py
@@ -0,0 +1,29 @@
"""Utility modules for gapless-crypto-clickhouse."""

from .error_handling import (
    DataCollectionError,
    FileOperationError,
    GapFillingError,
    GaplessCryptoError,
    ValidationError,
    format_user_error,
    format_user_warning,
    get_standard_logger,
    handle_operation_error,
    safe_operation,
    validate_file_path,
)

__all__ = [
    "GaplessCryptoError",
    "DataCollectionError",
    "GapFillingError",
    "FileOperationError",
    "ValidationError",
    "get_standard_logger",
    "handle_operation_error",
    "safe_operation",
    "validate_file_path",
    "format_user_error",
    "format_user_warning",
]
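Because of these re-exports, downstream modules (including the checkpoint manager above, which does `from ..utils import GaplessCryptoError, get_standard_logger`) can import the helpers from the package-level utils namespace rather than from error_handling directly. A minimal sketch, assuming an installed copy of the package; the message and context values are placeholders.

from gapless_crypto_clickhouse.utils import ValidationError, get_standard_logger

logger = get_standard_logger("example")   # logs under the gapless_crypto_clickhouse.example namespace
try:
    raise ValidationError("bad candle row", context={"row": 42})   # hypothetical failure
except ValidationError as exc:
    logger.warning("validation failed: %s (context=%s)", exc, exc.context)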
gapless_crypto_clickhouse/utils/error_handling.py
@@ -0,0 +1,202 @@
"""
Standardized error handling framework for gapless-crypto-clickhouse.

Provides consistent exception handling, logging, and error reporting across all modules.
Eliminates duplicate error handling patterns and ensures consistent debugging experience.
"""

import logging
import traceback
from pathlib import Path
from typing import Any, Callable, Dict, Optional, Union


class GaplessCryptoError(Exception):
    """Base exception for all gapless-crypto-clickhouse errors."""

    def __init__(self, message: str, context: Optional[Dict[str, Any]] = None):
        super().__init__(message)
        self.context = context or {}


class DataCollectionError(GaplessCryptoError):
    """Errors during data collection from Binance."""

    pass


class GapFillingError(GaplessCryptoError):
    """Errors during gap detection or filling operations."""

    pass


class FileOperationError(GaplessCryptoError):
    """Errors during file I/O operations."""

    pass


class ValidationError(GaplessCryptoError):
    """Errors during data validation."""

    pass


def get_standard_logger(module_name: str) -> logging.Logger:
    """Get standardized logger for consistent formatting across modules."""
    logger = logging.getLogger(f"gapless_crypto_clickhouse.{module_name}")

    if not logger.handlers:
        handler = logging.StreamHandler()
        formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
        handler.setFormatter(formatter)
        logger.addHandler(handler)
        logger.setLevel(logging.INFO)

    return logger


def handle_operation_error(
    operation_name: str,
    exception: Exception,
    context: Optional[Dict[str, Any]] = None,
    logger: Optional[logging.Logger] = None,
    reraise: bool = False,
    default_return: Any = None,
) -> Any:
    """
    Standardized error handling for operations.

    Args:
        operation_name: Human-readable operation description
        exception: The caught exception
        context: Additional context for debugging
        logger: Logger instance (uses default if None)
        reraise: Whether to re-raise the exception after logging
        default_return: Value to return if not re-raising

    Returns:
        default_return value, or re-raises if reraise=True
    """
    if logger is None:
        logger = get_standard_logger("error_handler")

    # Format context information
    context_str = ""
    if context:
        context_items = [f"{k}={v}" for k, v in context.items()]
        context_str = f" (Context: {', '.join(context_items)})"

    # Log the error with standard format
    error_msg = f"❌ {operation_name} failed: {str(exception)}{context_str}"
    logger.error(error_msg)

    # Optionally log full traceback for debugging
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(f"Full traceback: {traceback.format_exc()}")

    if reraise:
        raise

    return default_return


def safe_operation(
    operation_name: str,
    func: Callable,
    context: Optional[Dict[str, Any]] = None,
    logger: Optional[logging.Logger] = None,
    exception_types: tuple = (Exception,),
    default_return: Any = None,
    reraise: bool = False,
) -> Any:
    """
    Execute operation with standardized error handling.

    Args:
        operation_name: Human-readable operation description
        func: Function to execute
        context: Additional context for debugging
        logger: Logger instance (uses default if None)
        exception_types: Tuple of exception types to catch
        default_return: Value to return on error
        reraise: Whether to re-raise caught exceptions

    Returns:
        Function result or default_return on error
    """
    try:
        return func()
    except exception_types as e:
        return handle_operation_error(
            operation_name=operation_name,
            exception=e,
            context=context,
            logger=logger,
            reraise=reraise,
            default_return=default_return,
        )


def validate_file_path(file_path: Union[str, Path], operation: str = "file operation") -> Path:
    """
    Validate file path with standardized error handling.

    Args:
        file_path: Path to validate
        operation: Operation description for error messages

    Returns:
        Validated Path object

    Raises:
        FileOperationError: If path is invalid
    """
    try:
        path = Path(file_path)
        if not path.exists():
            raise FileOperationError(
                f"File not found: {path}", context={"operation": operation, "path": str(path)}
            )
        return path
    except Exception as e:
        if isinstance(e, FileOperationError):
            raise
        raise FileOperationError(
            f"Invalid file path: {file_path}", context={"operation": operation, "error": str(e)}
        )


def format_user_error(message: str, suggestion: Optional[str] = None) -> str:
    """
    Format user-facing error message with consistent styling.

    Args:
        message: Error message
        suggestion: Optional suggestion for resolution

    Returns:
        Formatted error message
    """
    formatted = f"❌ ERROR: {message}"
    if suggestion:
        formatted += f"\n💡 SUGGESTION: {suggestion}"
    return formatted


def format_user_warning(message: str, suggestion: Optional[str] = None) -> str:
    """
    Format user-facing warning message with consistent styling.

    Args:
        message: Warning message
        suggestion: Optional suggestion for resolution

    Returns:
        Formatted warning message
    """
    formatted = f"⚠️ WARNING: {message}"
    if suggestion:
        formatted += f"\n💡 SUGGESTION: {suggestion}"
    return formatted
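A minimal sketch of how these helpers compose: safe_operation wraps a callable, logs any failure through handle_operation_error, and falls back to default_return, while format_user_error produces the user-facing message. The read_csv_rows helper and the file path are hypothetical; the imported functions and their parameters come from the module above.

from gapless_crypto_clickhouse.utils.error_handling import (
    format_user_error,
    safe_operation,
    validate_file_path,
)


def read_csv_rows() -> int:
    # Hypothetical path; validate_file_path raises FileOperationError if it is missing.
    path = validate_file_path("./data/BTCUSDT_1m.csv", operation="row count")
    return sum(1 for _ in path.open())


rows = safe_operation(
    operation_name="Count CSV rows",
    func=read_csv_rows,
    context={"symbol": "BTCUSDT", "timeframe": "1m"},
    default_return=0,   # error is logged and swallowed; 0 is returned on failure
)
if rows == 0:
    print(format_user_error("No rows collected", suggestion="Check the output directory path"))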