archae 2026.1.0b2__py3-none-any.whl → 2026.2.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
archae/extractor.py ADDED
@@ -0,0 +1,249 @@
+ """Archive extraction module for archae."""
+ 
+ from __future__ import annotations
+ 
+ import hashlib
+ import logging
+ import shutil
+ from typing import TYPE_CHECKING
+ 
+ import magic
+ 
+ from archae.config import apply_options, default_settings, settings
+ from archae.util.file_tracker import FileTracker
+ from archae.util.tool_manager import ToolManager
+ 
+ if TYPE_CHECKING:
+     from pathlib import Path
+ 
+     from archae.util.archiver.base_archiver import BaseArchiver
+ 
+ 
+ class WarningAccumulator(logging.Handler):
+     """Logging handler to accumulate warnings while still printing them."""
+ 
+     def __init__(self) -> None:
+         """Initialize the WarningAccumulator."""
+         super().__init__()
+         self.warnings: list[str] = []
+ 
+     def emit(self, record: logging.LogRecord) -> None:
+         """Print every record and accumulate warning messages."""
+         if record.levelno >= logging.WARNING:
+             self.warnings.append(self.format(record))
+         print(self.format(record))  # noqa: T201
+ 
+ 
+ logger = logging.getLogger("archae")
+ accumulator = WarningAccumulator()
+ logger.addHandler(accumulator)
+ logger.setLevel(logging.DEBUG)
+ 
+ 
+ class ArchiveExtractor:
+     """Handles archive extraction and file tracking."""
+ 
+     def __init__(self, extract_dir: Path) -> None:
+         """Initialize the ArchiveExtractor.
+ 
+         Args:
+             extract_dir (Path): The base directory for extraction. Any existing
+                 directory at this path is removed and recreated empty.
+         """
+         self.extract_dir = extract_dir
+         if self.extract_dir.exists() and self.extract_dir.is_dir():
+             shutil.rmtree(self.extract_dir)
+         self.extract_dir.mkdir(exist_ok=True)
+         self.file_tracker = FileTracker()
+ 
+     def handle_file(self, file_path: Path) -> None:
+         """Handle a file given its path.
+ 
+         Args:
+             file_path (Path): The path to the file.
+         """
+         self.__handle_file(file_path)
+ 
+     def __handle_file(self, file_path: Path, depth: int = 1) -> None:
+         """Internal implementation of handle_file.
+ 
+         Args:
+             file_path (Path): The path to the file.
+             depth (int): The current depth in the archive extraction tree. Defaults to 1.
+         """
+         logger.info("Starting examination of file: %s", file_path)
+ 
+         base_hash = self._sha256_hash_file(file_path)
+         file_size_bytes = file_path.stat().st_size
+         self.file_tracker.track_file(base_hash, file_size_bytes)
+         self.file_tracker.track_file_path(base_hash, file_path)
+         self.file_tracker.add_metadata_to_hash(
+             base_hash, "type", magic.from_file(file_path)
+         )
+         self.file_tracker.add_metadata_to_hash(
+             base_hash, "type_mime", magic.from_file(file_path, mime=True)
+         )
+         extension = file_path.suffix.lstrip(".").lower()
+         self.file_tracker.add_metadata_to_hash(base_hash, "extension", extension)
+         is_file_archive = self._is_archive(base_hash)
+         self.file_tracker.add_metadata_to_hash(base_hash, "is_archive", is_file_archive)
+         if is_file_archive:
+             if settings["MAX_DEPTH"] == 0 or depth < settings["MAX_DEPTH"]:
+                 archiver = self._get_archiver_for_file(base_hash)
+                 if archiver:
+                     extracted_size = archiver.get_archive_uncompressed_size(file_path)
+                     self.file_tracker.add_metadata_to_hash(
+                         base_hash, "extracted_size", extracted_size
+                     )
+                     # Ratio is compressed size / uncompressed size, matching the
+                     # MIN_ARCHIVE_RATIO option; guard against empty archives.
+                     compression_ratio = (
+                         file_size_bytes / extracted_size if extracted_size else 1.0
+                     )
+                     self.file_tracker.add_metadata_to_hash(
+                         base_hash, "overall_compression_ratio", compression_ratio
+                     )
+                     if extracted_size > settings["MAX_ARCHIVE_SIZE_BYTES"]:
+                         logger.warning(
+                             "MAX_ARCHIVE_SIZE_BYTES: Skipped archive %s because expected size %s is greater than MAX_ARCHIVE_SIZE_BYTES %s",
+                             file_path,
+                             extracted_size,
+                             settings["MAX_ARCHIVE_SIZE_BYTES"],
+                         )
+                     elif (
+                         self.file_tracker.get_tracked_file_size() + extracted_size
+                         > settings["MAX_TOTAL_SIZE_BYTES"]
+                     ):
+                         logger.warning(
+                             "MAX_TOTAL_SIZE_BYTES: Skipped archive %s because expected size %s + current tracked files %s is greater than MAX_TOTAL_SIZE_BYTES %s",
+                             file_path,
+                             extracted_size,
+                             self.file_tracker.get_tracked_file_size(),
+                             settings["MAX_TOTAL_SIZE_BYTES"],
+                         )
+                     elif compression_ratio < settings["MIN_ARCHIVE_RATIO"]:
+                         logger.warning(
+                             "MIN_ARCHIVE_RATIO: Skipped archive %s because compression ratio %.5f is less than MIN_ARCHIVE_RATIO %s",
+                             file_path,
+                             compression_ratio,
+                             settings["MIN_ARCHIVE_RATIO"],
+                         )
+                     elif (
+                         shutil.disk_usage(self.extract_dir).free - extracted_size
+                         < settings["MIN_DISK_FREE_SPACE"]
+                     ):
+                         logger.warning(
+                             "MIN_DISK_FREE_SPACE: Skipped archive %s because extracting it would leave less than MIN_DISK_FREE_SPACE %s bytes free at extraction location %s",
+                             file_path,
+                             settings["MIN_DISK_FREE_SPACE"],
+                             self.extract_dir,
+                         )
+                     else:
+                         extraction_dir = self.extract_dir / base_hash
+                         archiver.extract_archive(file_path, extraction_dir)
+                         child_files = self._list_child_files(extraction_dir)
+                         for child_file in child_files:
+                             self.__handle_file(child_file, depth + 1)
+                 else:
+                     logger.warning(
+                         "NO_ARCHIVER: No suitable archiver found for file: %s",
+                         file_path,
+                     )
+             else:
+                 logger.warning(
+                     "MAX_DEPTH: File %s is not extracted; max depth reached.", file_path
+                 )
+ 
+     def _is_archive(self, file_hash: str) -> bool:
+         """Determine whether a tracked file is an archive based on its metadata.
+ 
+         Args:
+             file_hash (str): The hash of the file.
+ 
+         Returns:
+             bool: True if the file is an archive, otherwise False.
+         """
+         metadata = self.file_tracker.get_tracked_file_metadata(file_hash)
+         mime_type = metadata.get("type_mime", "").lower()
+         extension = metadata.get("extension", "").lower()
+ 
+         for tool in ToolManager.get_tools().values():
+             if mime_type in tool.mime_types or extension in tool.file_extensions:
+                 return True
+ 
+         return False
+ 
+     def _get_archiver_for_file(self, file_hash: str) -> BaseArchiver | None:
+         """Determine the appropriate archiver for a file based on its metadata.
+ 
+         Args:
+             file_hash (str): The hash of the file.
+ 
+         Returns:
+             BaseArchiver | None: The archiver instance if a suitable tool is found, otherwise None.
+         """
+         metadata = self.file_tracker.get_tracked_file_metadata(file_hash)
+         mime_type = metadata.get("type_mime", "").lower()
+         extension = metadata.get("extension", "").lower()
+ 
+         for tool in ToolManager.get_tools().values():
+             if mime_type in tool.mime_types or extension in tool.file_extensions:
+                 return tool
+         return None
+ 
+     @staticmethod
+     def _list_child_files(directory_path: Path, pattern: str = "*") -> list[Path]:
+         """Recursively get a list of files matching a pattern in a directory.
+ 
+         Args:
+             directory_path (Path): The starting directory path.
+             pattern (str): The file pattern to match (e.g., '*.txt', '*.py').
+ 
+         Returns:
+             list[Path]: A list of Path objects for the matching files.
+         """
+         # rglob performs a recursive search
+         files = list(directory_path.rglob(pattern))
+         # Filter out directories so only regular files are returned
+         return [file for file in files if file.is_file()]
+ 
+     @staticmethod
+     def _sha256_hash_file(file_path: Path) -> str:
+         """Compute the SHA-256 hash of a file.
+ 
+         Args:
+             file_path (Path): The path to the file.
+ 
+         Returns:
+             str: The SHA-256 hash of the file in hexadecimal format.
+         """
+         try:
+             with file_path.open("rb") as f:
+                 # hashlib.file_digest requires Python 3.11+
+                 digest = hashlib.file_digest(f, "sha256")
+                 return digest.hexdigest()
+         except FileNotFoundError:
+             return "Error: File not found"
+ 
+     def get_tracked_files(self) -> dict[str, dict]:
+         """Return a deep copy of the tracked files."""
+         return self.file_tracker.get_tracked_files()
+ 
+     def get_warnings(self) -> list[str]:
+         """Return the accumulated warning messages."""
+         return accumulator.warnings
+ 
+     def get_default_settings(self) -> dict:
+         """Get the default settings from the config module.
+ 
+         Returns:
+             dict: Dictionary of default settings.
+         """
+         return dict(default_settings)
+ 
+     def apply_settings(self, option_list: list[tuple[str, str]]) -> None:
+         """Apply a list of settings options.
+ 
+         Args:
+             option_list (list[tuple[str, str]]): List of (key, value) tuples to apply.
+ 
+         Example:
+             extractor.apply_settings([("MAX_ARCHIVE_SIZE_BYTES", "5000000000")])
+         """
+         apply_options(option_list)
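
Taken together, extractor.py exposes a small driver API: construct an ArchiveExtractor, optionally apply settings, then hand it a file. A minimal usage sketch, assuming ToolManager.locate_tools() is called first to register external archivers (the package's actual entry point is not part of this diff):

```python
from pathlib import Path

from archae.extractor import ArchiveExtractor
from archae.util.tool_manager import ToolManager

ToolManager.locate_tools()  # register whichever external archivers are on PATH

extractor = ArchiveExtractor(extract_dir=Path.cwd() / "extracted")
extractor.apply_settings([("MAX_DEPTH", "3"), ("MAX_ARCHIVE_SIZE_BYTES", "1GB")])
extractor.handle_file(Path("sample.tar.gz"))  # hypothetical input file

for file_hash, info in extractor.get_tracked_files().items():
    print(file_hash, info["size"], info["metadata"].get("type_mime"))
for warning in extractor.get_warnings():
    print(warning)
```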
archae/options.yaml ADDED
@@ -0,0 +1,39 @@
+ MAX_TOTAL_SIZE_BYTES:
+   type: int
+   converter: archae.util.converter.file_size:convert
+   help: Maximum total size of all archives to extract in bytes.
+   examples:
+     - 1GB
+     - 500M
+     - 500
+ MAX_ARCHIVE_SIZE_BYTES:
+   type: int
+   converter: archae.util.converter.file_size:convert
+   help: Maximum size of a single archive to extract in bytes.
+   examples:
+     - 1GB
+     - 500M
+     - 500
+ MIN_ARCHIVE_RATIO:
+   type: float
+   converter: float
+   help: Minimum compression ratio (compressed size / uncompressed size) required to extract an archive.
+   examples:
+     - 0.001
+ MIN_DISK_FREE_SPACE:
+   type: int
+   converter: archae.util.converter.file_size:convert
+   help: Minimum required estimated disk space after extraction in bytes.
+   examples:
+     - 1GB
+     - 500M
+     - 500
+ MAX_DEPTH:
+   type: int
+   converter: int
+   help: Maximum extraction depth for nested archives. Use 0 for unlimited depth.
+   examples:
+     - 3
+     - 5
+     - 10
+     - 0
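
Each option names a converter that parses the string form; the size options accept the 1GB/500M shorthand handled by archae.util.converter.file_size, which appears later in this diff. A sketch of the expected conversions, assuming ByteScale steps in powers of 1024:

```python
from archae.util.converter.file_size import convert

# Size options accept either bare byte counts or unit suffixes.
assert convert(500) == 500
assert convert("500M") == 500 * 1024**2  # assumes ByteScale maps M -> 1024**2
assert convert("1GB") == 1024**3         # assumes ByteScale maps G -> 1024**3
```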
archae/util/archiver/base_archiver.py CHANGED
@@ -28,12 +28,12 @@ class BaseArchiver(ABC):
    @property
    def file_extensions(self) -> list[str]:
        """A non-abstract method that accesses the class impl for the file extensions."""
-       return self.file_extensions
+       return self.__class__.file_extensions  # type: ignore[return-value]

    @property
    def mime_types(self) -> list[str]:
        """A non-abstract method that accesses the class impl for the mime types."""
-       return self.mime_types
+       return self.__class__.mime_types  # type: ignore[return-value]

    @abstractmethod
    def extract_archive(self, archive_path: Path, extract_dir: Path) -> None:
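
The two one-line changes above fix a self-referencing property: `return self.file_extensions` inside the `file_extensions` property re-invokes the property itself, recursing until RecursionError whenever the property is the attribute that lookup resolves to. A self-contained illustration of the pitfall and the fix (not archae code):

```python
class Broken:
    @property
    def names(self) -> list[str]:
        return self.names  # re-enters this same property: RecursionError on access


class Fixed:
    _names = ["a", "b"]  # class-level data the property should expose

    @property
    def names(self) -> list[str]:
        return self.__class__._names  # reads the class attribute, no recursion
```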
archae/util/converter/file_size.py ADDED
@@ -0,0 +1,77 @@
+ """File size conversion utilities."""
+ 
+ import re
+ 
+ from archae.util.enum.byte_scale import ByteScale
+ 
+ 
+ def compact_value(value: float) -> str:
+     """Convert a file size number to a compact FileSize string.
+ 
+     Args:
+         value (float): The size to convert.
+ 
+     Returns:
+         str: The most compact exact byte-size representation.
+     """
+     exponent = 0
+     modulo: float = 0
+     while modulo == 0 and exponent < int(ByteScale.PETA.value):
+         modulo = value % 1024
+         if modulo == 0:
+             exponent += 1
+             value = int(value / 1024)
+     return f"{value}{ByteScale(exponent).prefix_letter}"  # type: ignore[call-arg]
+ 
+ 
+ def expand_value(value: str | int) -> int:
+     """Convert a FileSize string or int to an int.
+ 
+     Args:
+         value (str | int): The value to convert as necessary.
+ 
+     Returns:
+         int: Size in bytes.
+     """
+     try:
+         return int(value)
+     except (ValueError, TypeError):
+         pass
+ 
+     # Regex to split number and unit
+     match = re.match(r"^(\d+(?:\.\d+)?)\s*([KMGTP]B?)$", str(value), re.IGNORECASE)
+     if not match:
+         msg = f"{value} is not a valid file size (e.g., 10G, 500M)"
+         raise ValueError(msg)
+ 
+     number, unit = match.groups()
+     number = float(number)
+     unit = unit[0].upper()
+ 
+     byte_scale = 1024 ** ByteScale.from_prefix_letter(unit).value
+ 
+     # Scale the numeric part by the base-1024 unit multiplier
+     return int(number * byte_scale)
+ 
+ 
+ def convert(value: str | int) -> int:
+     """Convert a file size value to an int.
+ 
+     Args:
+         value (str | int): The value to convert as necessary.
+ 
+     Returns:
+         int: Size in bytes.
+     """
+     try:
+         return expand_value(value)
+     except ValueError as err:
+         msg = f"Could not convert {value} to file size: {err}"
+         raise ValueError(msg) from err
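
A few worked conversions in both directions, assuming ByteScale assigns exponents 0-5 to B/K/M/G/T/P and a prefix_letter of "B" for exponent 0 (the enum itself is not in this diff):

```python
from archae.util.converter.file_size import compact_value, expand_value

assert expand_value("10G") == 10 * 1024**3
assert expand_value("1.5K") == 1536      # the regex permits fractional values
assert expand_value(4096) == 4096        # ints pass through unchanged
assert compact_value(1048576) == "1M"    # exact powers of 1024 collapse fully
assert compact_value(1536) == "1536B"    # 1536 % 1024 != 0, so no collapse
```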
archae/util/file_tracker.py ADDED
@@ -0,0 +1,93 @@
+ """File tracking utilities for archae."""
+ 
+ from __future__ import annotations
+ 
+ import copy
+ from typing import Any
+ 
+ 
+ class FileTracker:
+     """Manages file tracking by hash with metadata and paths."""
+ 
+     def __init__(self) -> None:
+         """Initialize the FileTracker."""
+         self.tracked_files: dict[str, dict] = {}
+ 
+     def track_file(self, file_hash: str, file_size_bytes: int) -> None:
+         """Track a file by its hash.
+ 
+         Args:
+             file_hash (str): The hash of the file to track.
+             file_size_bytes (int): The size of the file in bytes.
+         """
+         if file_hash not in self.tracked_files:
+             self.tracked_files[file_hash] = {}
+             self.tracked_files[file_hash]["size"] = file_size_bytes
+             self.tracked_files[file_hash]["metadata"] = {}
+         elif self.tracked_files[file_hash]["size"] != file_size_bytes:
+             msg = f"Hash collision detected for hash {file_hash} with differing sizes."
+             raise RuntimeError(msg)
+ 
+     def is_file_tracked(self, file_hash: str) -> bool:
+         """Check if a file is tracked by its hash.
+ 
+         Args:
+             file_hash (str): The hash of the file to check.
+ 
+         Returns:
+             bool: True if the file is tracked, False otherwise.
+         """
+         return file_hash in self.tracked_files
+ 
+     def get_tracked_file_metadata(self, file_hash: str) -> dict:
+         """Get metadata for a tracked file by its hash.
+ 
+         Args:
+             file_hash (str): The hash of the file.
+ 
+         Returns:
+             dict: A deep copy of the tracked file's metadata.
+         """
+         return copy.deepcopy(self.tracked_files.get(file_hash, {}).get("metadata", {}))
+ 
+     def track_file_path(self, file_hash: str, file_path: Any) -> None:
+         """Track a file path by its hash.
+ 
+         Args:
+             file_hash (str): The hash of the file.
+             file_path (Any): The path to track.
+         """
+         if "paths" not in self.tracked_files[file_hash]:
+             self.tracked_files[file_hash]["paths"] = []
+ 
+         if file_path not in self.tracked_files[file_hash]["paths"]:
+             self.tracked_files[file_hash]["paths"].append(file_path)
+ 
+     def add_metadata_to_hash(self, file_hash: str, key: str, value: Any) -> None:
+         """Add metadata to a tracked file.
+ 
+         Args:
+             file_hash (str): The hash of the file.
+             key (str): The metadata key.
+             value (Any): The metadata value.
+         """
+         self.tracked_files[file_hash]["metadata"][key] = value
+ 
+     def get_tracked_file_size(self) -> int:
+         """Get the total size of all tracked files.
+ 
+         Returns:
+             int: The total size in bytes.
+         """
+         return sum(
+             self.tracked_files[file_hash].get("size", 0)
+             for file_hash in self.tracked_files
+         )
+ 
+     def get_tracked_files(self) -> dict[str, dict]:
+         """Get all tracked files. This is a deep copy to prevent external modification.
+ 
+         Returns:
+             dict[str, dict]: The tracked files dictionary.
+         """
+         return copy.deepcopy(self.tracked_files)
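
A short sketch of the FileTracker contract, using a made-up hash string:

```python
from archae.util.file_tracker import FileTracker

tracker = FileTracker()
tracker.track_file("deadbeef", 1024)
tracker.track_file_path("deadbeef", "/tmp/a.zip")
tracker.track_file_path("deadbeef", "/tmp/a.zip")  # duplicate paths are dropped
tracker.add_metadata_to_hash("deadbeef", "is_archive", True)

assert tracker.is_file_tracked("deadbeef")
assert tracker.get_tracked_file_size() == 1024
# Re-tracking the same hash with a different size raises RuntimeError.
# Accessors hand back deep copies, so callers cannot corrupt internal state:
tracker.get_tracked_files()["deadbeef"]["size"] = 0
assert tracker.get_tracked_file_size() == 1024
```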
archae/util/tool_manager.py ADDED
@@ -0,0 +1,112 @@
+ """Tool manager for locating and managing external archiving tools."""
+ 
+ from __future__ import annotations
+ 
+ import logging
+ import shutil
+ from typing import TYPE_CHECKING, ClassVar, cast
+ 
+ import archae.util.archiver
+ 
+ if TYPE_CHECKING:
+     from archae.util.archiver.base_archiver import BaseArchiver
+ 
+ logger = logging.getLogger("archae")
+ 
+ 
+ class ToolManager:
+     """Manager for locating and managing external archiving tools."""
+ 
+     __tools: ClassVar[dict[str, BaseArchiver]] = {}
+ 
+     @classmethod
+     def locate_tools(cls) -> None:
+         """Locate external tools."""
+         for archiver_cls in archae.util.archiver.BaseArchiver.__subclasses__():
+             logger.debug("Locating tool for %s", archiver_cls.archiver_name)
+             tool_path = shutil.which(str(archiver_cls.executable_name))
+             if tool_path:
+                 logger.debug("Found %s at %s", archiver_cls.archiver_name, tool_path)
+                 cls.__tools[str(archiver_cls.archiver_name)] = archiver_cls(tool_path)  # type: ignore[abstract]
+             else:
+                 logger.warning(
+                     "MISSING_ARCHIVER: Could not find %s; some archive types may not be supported",
+                     archiver_cls.archiver_name,
+                 )
+ 
+     @classmethod
+     def get_supported_extensions(cls) -> list[str]:
+         """Get a sorted list of all file extensions supported by located tools.
+ 
+         Returns:
+             list[str]: Sorted list of supported file extensions.
+         """
+         supported: set[str] = set()
+         for tool in cls.__tools.values():
+             supported.update(tool.file_extensions)
+         return sorted(supported)
+ 
+     @classmethod
+     def get_unsupported_extensions(cls) -> list[str]:
+         """Get a sorted list of all file extensions from all archiver subclasses that are not currently supported.
+ 
+         Returns:
+             list[str]: Sorted list of unsupported file extensions.
+         """
+         all_extensions: set[str] = set()
+         supported: set[str] = set()
+ 
+         # Get all extensions from all archiver classes
+         for archiver_cls in archae.util.archiver.BaseArchiver.__subclasses__():
+             all_extensions.update(cast("list[str]", archiver_cls.file_extensions))
+ 
+         # Get supported extensions from located tools
+         for tool in cls.__tools.values():
+             supported.update(tool.file_extensions)
+ 
+         # Return the difference
+         unsupported = all_extensions - supported
+         return sorted(unsupported)
+ 
+     @classmethod
+     def get_supported_mime_types(cls) -> list[str]:
+         """Get a sorted list of all MIME types supported by located tools.
+ 
+         Returns:
+             list[str]: Sorted list of supported MIME types.
+         """
+         supported: set[str] = set()
+         for tool in cls.__tools.values():
+             supported.update(tool.mime_types)
+         return sorted(supported)
+ 
+     @classmethod
+     def get_unsupported_mime_types(cls) -> list[str]:
+         """Get a sorted list of all MIME types from all archiver subclasses that are not currently supported.
+ 
+         Returns:
+             list[str]: Sorted list of unsupported MIME types.
+         """
+         all_mime_types: set[str] = set()
+         supported: set[str] = set()
+ 
+         # Get all MIME types from all archiver classes
+         for archiver_cls in archae.util.archiver.BaseArchiver.__subclasses__():
+             all_mime_types.update(cast("list[str]", archiver_cls.mime_types))
+ 
+         # Get supported MIME types from located tools
+         for tool in cls.__tools.values():
+             supported.update(tool.mime_types)
+ 
+         # Return the difference
+         unsupported = all_mime_types - supported
+         return sorted(unsupported)
+ 
+     @classmethod
+     def get_tools(cls) -> dict[str, BaseArchiver]:
+         """Get a shallow copy of the tools dictionary.
+ 
+         Returns:
+             dict[str, BaseArchiver]: A shallow copy of the tools dictionary.
+         """
+         return cls.__tools.copy()
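
Because locate_tools() relies on shutil.which(), the reported support lists depend on which executables are on the PATH at run time. A minimal query sketch:

```python
from archae.util.tool_manager import ToolManager

ToolManager.locate_tools()  # probes PATH for each BaseArchiver subclass

print("supported extensions:", ToolManager.get_supported_extensions())
print("unsupported extensions:", ToolManager.get_unsupported_extensions())
print("supported MIME types:", ToolManager.get_supported_mime_types())
```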