archae 2026.1.0b2__py3-none-any.whl → 2026.2.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
archae/extractor.py ADDED
@@ -0,0 +1,249 @@
+ """Archive extraction module for archae."""
+ 
+ from __future__ import annotations
+ 
+ import hashlib
+ import logging
+ import shutil
+ from typing import TYPE_CHECKING
+ 
+ import magic
+ 
+ from archae.config import apply_options, default_settings, settings
+ from archae.util.file_tracker import FileTracker
+ from archae.util.tool_manager import ToolManager
+ 
+ if TYPE_CHECKING:
+     from pathlib import Path
+ 
+     from archae.util.archiver.base_archiver import BaseArchiver
+ 
+ 
+ class WarningAccumulator(logging.Handler):
+     """Logging handler to accumulate warnings while still printing them."""
+ 
+     def __init__(self) -> None:
+         """Initialize the WarningAccumulator."""
+         super().__init__()
+         self.warnings: list[str] = []
+ 
+     def emit(self, record: logging.LogRecord) -> None:
+         """Print every record and accumulate warning messages."""
+         if record.levelno >= logging.WARNING:
+             self.warnings.append(self.format(record))
+         print(self.format(record))  # noqa: T201
+ 
+ 
+ logger = logging.getLogger("archae")
+ accumulator = WarningAccumulator()
+ logger.addHandler(accumulator)
+ logger.setLevel(logging.DEBUG)
+ 
+ 
+ class ArchiveExtractor:
+     """Handles archive extraction and file tracking."""
+ 
+     def __init__(self, extract_dir: Path) -> None:
+         """Initialize the ArchiveExtractor.
+ 
+         Args:
+             extract_dir (Path): The base directory for extraction. Any existing
+                 directory at this path is removed and recreated empty.
+         """
+         self.extract_dir = extract_dir
+         if self.extract_dir.exists() and self.extract_dir.is_dir():
+             shutil.rmtree(self.extract_dir)
+         self.extract_dir.mkdir(exist_ok=True)
+         self.file_tracker = FileTracker()
+ 
+     def handle_file(self, file_path: Path) -> None:
+         """Handle a file given its path.
+ 
+         Args:
+             file_path (Path): The path to the file.
+         """
+         self.__handle_file(file_path)
+ 
+     def __handle_file(self, file_path: Path, depth: int = 1) -> None:
+         """Internal implementation of handle_file.
+ 
+         Args:
+             file_path (Path): The path to the file.
+             depth (int): The current depth in the archive extraction tree. Defaults to 1.
+         """
+         logger.info("Starting examination of file: %s", file_path)
+ 
+         base_hash = self._sha256_hash_file(file_path)
+         file_size_bytes = file_path.stat().st_size
+         self.file_tracker.track_file(base_hash, file_size_bytes)
+         self.file_tracker.track_file_path(base_hash, file_path)
+         self.file_tracker.add_metadata_to_hash(
+             base_hash, "type", magic.from_file(file_path)
+         )
+         self.file_tracker.add_metadata_to_hash(
+             base_hash, "type_mime", magic.from_file(file_path, mime=True)
+         )
+         extension = file_path.suffix.lstrip(".").lower()
+         self.file_tracker.add_metadata_to_hash(base_hash, "extension", extension)
+         is_file_archive = self._is_archive(base_hash)
+         self.file_tracker.add_metadata_to_hash(base_hash, "is_archive", is_file_archive)
+         if is_file_archive:
+             if settings["MAX_DEPTH"] == 0 or depth < settings["MAX_DEPTH"]:
+                 archiver = self._get_archiver_for_file(base_hash)
+                 if archiver:
+                     extracted_size = archiver.get_archive_uncompressed_size(file_path)
+                     self.file_tracker.add_metadata_to_hash(
+                         base_hash, "extracted_size", extracted_size
+                     )
+                     # Ratio is compressed size / uncompressed size, matching the
+                     # MIN_ARCHIVE_RATIO option; guard against empty archives.
+                     compression_ratio = (
+                         file_size_bytes / extracted_size if extracted_size else 1.0
+                     )
+                     self.file_tracker.add_metadata_to_hash(
+                         base_hash, "overall_compression_ratio", compression_ratio
+                     )
+                     if extracted_size > settings["MAX_ARCHIVE_SIZE_BYTES"]:
+                         logger.warning(
+                             "MAX_ARCHIVE_SIZE_BYTES: Skipped archive %s because expected size %s is greater than MAX_ARCHIVE_SIZE_BYTES %s",
+                             file_path,
+                             extracted_size,
+                             settings["MAX_ARCHIVE_SIZE_BYTES"],
+                         )
+                     elif (
+                         self.file_tracker.get_tracked_file_size() + extracted_size
+                         > settings["MAX_TOTAL_SIZE_BYTES"]
+                     ):
+                         logger.warning(
+                             "MAX_TOTAL_SIZE_BYTES: Skipped archive %s because expected size %s + current tracked files %s is greater than MAX_TOTAL_SIZE_BYTES %s",
+                             file_path,
+                             extracted_size,
+                             self.file_tracker.get_tracked_file_size(),
+                             settings["MAX_TOTAL_SIZE_BYTES"],
+                         )
+                     elif compression_ratio < settings["MIN_ARCHIVE_RATIO"]:
+                         logger.warning(
+                             "MIN_ARCHIVE_RATIO: Skipped archive %s because compression ratio %.5f is less than MIN_ARCHIVE_RATIO %s",
+                             file_path,
+                             compression_ratio,
+                             settings["MIN_ARCHIVE_RATIO"],
+                         )
+                     elif (
+                         shutil.disk_usage(self.extract_dir).free - extracted_size
+                         < settings["MIN_DISK_FREE_SPACE"]
+                     ):
+                         logger.warning(
+                             "MIN_DISK_FREE_SPACE: Skipped archive %s because extracting it would leave less than MIN_DISK_FREE_SPACE %s bytes free at extraction location %s",
+                             file_path,
+                             settings["MIN_DISK_FREE_SPACE"],
+                             self.extract_dir,
+                         )
+                     else:
+                         extraction_dir = self.extract_dir / base_hash
+                         archiver.extract_archive(file_path, extraction_dir)
+                         child_files = self._list_child_files(extraction_dir)
+                         for child_file in child_files:
+                             self.__handle_file(child_file, depth + 1)
+                 else:
+                     logger.warning(
+                         "NO_ARCHIVER: No suitable archiver found for file: %s",
+                         file_path,
+                     )
+             else:
+                 logger.warning(
+                     "MAX_DEPTH: File %s is not extracted; max depth reached.", file_path
+                 )
+ 
+     def _is_archive(self, file_hash: str) -> bool:
+         """Determine whether a tracked file is an archive based on its metadata.
+ 
+         Args:
+             file_hash (str): The hash of the file.
+ 
+         Returns:
+             bool: True if the file is an archive, otherwise False.
+         """
+         metadata = self.file_tracker.get_tracked_file_metadata(file_hash)
+         mime_type = metadata.get("type_mime", "").lower()
+         extension = metadata.get("extension", "").lower()
+ 
+         for tool in ToolManager.get_tools().values():
+             if mime_type in tool.mime_types or extension in tool.file_extensions:
+                 return True
+ 
+         return False
+ 
+     def _get_archiver_for_file(self, file_hash: str) -> BaseArchiver | None:
+         """Determine the appropriate archiver for a file based on its metadata.
+ 
+         Args:
+             file_hash (str): The hash of the file.
+ 
+         Returns:
+             BaseArchiver | None: The archiver instance if a suitable tool is found, otherwise None.
+         """
+         metadata = self.file_tracker.get_tracked_file_metadata(file_hash)
+         mime_type = metadata.get("type_mime", "").lower()
+         extension = metadata.get("extension", "").lower()
+ 
+         for tool in ToolManager.get_tools().values():
+             if mime_type in tool.mime_types or extension in tool.file_extensions:
+                 return tool
+         return None
+ 
+     @staticmethod
+     def _list_child_files(directory_path: Path, pattern: str = "*") -> list[Path]:
+         """Recursively get a list of files matching a pattern in a directory.
+ 
+         Args:
+             directory_path (Path): The starting directory path.
+             pattern (str): The file pattern to match (e.g., '*.txt', '*.py').
+ 
+         Returns:
+             list[Path]: A list of Path objects for the matching files.
+         """
+         # rglob performs a recursive search
+         files = list(directory_path.rglob(pattern))
+         # Filter out directories so only regular files are returned
+         return [file for file in files if file.is_file()]
+ 
+     @staticmethod
+     def _sha256_hash_file(file_path: Path) -> str:
+         """Compute the SHA-256 hash of a file.
+ 
+         Args:
+             file_path (Path): The path to the file.
+ 
+         Returns:
+             str: The SHA-256 hash of the file in hexadecimal format.
+         """
+         try:
+             with file_path.open("rb") as f:
+                 # hashlib.file_digest requires Python 3.11+
+                 digest = hashlib.file_digest(f, "sha256")
+                 return digest.hexdigest()
+         except FileNotFoundError:
+             return "Error: File not found"
+ 
+     def get_tracked_files(self) -> dict[str, dict]:
+         """Return a deep copy of the tracked files."""
+         return self.file_tracker.get_tracked_files()
+ 
+     def get_warnings(self) -> list[str]:
+         """Return the accumulated warning messages."""
+         return accumulator.warnings
+ 
+     def get_default_settings(self) -> dict:
+         """Get the default settings from the config module.
+ 
+         Returns:
+             dict: Dictionary of default settings.
+         """
+         return dict(default_settings)
+ 
+     def apply_settings(self, option_list: list[tuple[str, str]]) -> None:
+         """Apply a list of settings options.
+ 
+         Args:
+             option_list (list[tuple[str, str]]): List of (key, value) tuples to apply.
+ 
+         Example:
+             extractor.apply_settings([("MAX_ARCHIVE_SIZE_BYTES", "5000000000")])
+         """
+         apply_options(option_list)
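
Taken together, extractor.py exposes a small driver API: construct an ArchiveExtractor, optionally apply settings, then hand it a file. A minimal usage sketch, assuming ToolManager.locate_tools() is called first to register external archivers (the package's actual entry point is not part of this diff):

```python
from pathlib import Path

from archae.extractor import ArchiveExtractor
from archae.util.tool_manager import ToolManager

ToolManager.locate_tools()  # register whichever external archivers are on PATH

extractor = ArchiveExtractor(extract_dir=Path.cwd() / "extracted")
extractor.apply_settings([("MAX_DEPTH", "3"), ("MAX_ARCHIVE_SIZE_BYTES", "1GB")])
extractor.handle_file(Path("sample.tar.gz"))  # hypothetical input file

for file_hash, info in extractor.get_tracked_files().items():
    print(file_hash, info["size"], info["metadata"].get("type_mime"))
for warning in extractor.get_warnings():
    print(warning)
```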
archae/options.yaml ADDED
@@ -0,0 +1,39 @@
+ MAX_TOTAL_SIZE_BYTES:
+   type: int
+   converter: archae.util.converter.file_size:convert
+   help: Maximum total size of all archives to extract in bytes.
+   examples:
+     - 1GB
+     - 500M
+     - 500
+ MAX_ARCHIVE_SIZE_BYTES:
+   type: int
+   converter: archae.util.converter.file_size:convert
+   help: Maximum size of a single archive to extract in bytes.
+   examples:
+     - 1GB
+     - 500M
+     - 500
+ MIN_ARCHIVE_RATIO:
+   type: float
+   converter: float
+   help: Minimum compression ratio (compressed size / uncompressed size) required to extract an archive.
+   examples:
+     - 0.001
+ MIN_DISK_FREE_SPACE:
+   type: int
+   converter: archae.util.converter.file_size:convert
+   help: Minimum required estimated disk space after extraction in bytes.
+   examples:
+     - 1GB
+     - 500M
+     - 500
+ MAX_DEPTH:
+   type: int
+   converter: int
+   help: Maximum extraction depth for nested archives. Use 0 for unlimited depth.
+   examples:
+     - 3
+     - 5
+     - 10
+     - 0
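
Each option names a converter that parses the string form; the size options accept the 1GB/500M shorthand handled by archae.util.converter.file_size, which appears later in this diff. A sketch of the expected conversions, assuming ByteScale steps in powers of 1024:

```python
from archae.util.converter.file_size import convert

# Size options accept either bare byte counts or unit suffixes.
assert convert(500) == 500
assert convert("500M") == 500 * 1024**2  # assumes ByteScale maps M -> 1024**2
assert convert("1GB") == 1024**3         # assumes ByteScale maps G -> 1024**3
```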
archae/util/archiver/base_archiver.py CHANGED
@@ -28,12 +28,12 @@ class BaseArchiver(ABC):
    @property
    def file_extensions(self) -> list[str]:
        """A non-abstract method that accesses the class impl for the file extensions."""
-       return self.file_extensions
+       return self.__class__.file_extensions  # type: ignore[return-value]

    @property
    def mime_types(self) -> list[str]:
        """A non-abstract method that accesses the class impl for the mime types."""
-       return self.mime_types
+       return self.__class__.mime_types  # type: ignore[return-value]

    @abstractmethod
    def extract_archive(self, archive_path: Path, extract_dir: Path) -> None:
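
The two one-line changes above fix a self-referencing property: `return self.file_extensions` inside the `file_extensions` property re-invokes the property itself, recursing until RecursionError whenever the property is the attribute that lookup resolves to. A self-contained illustration of the pitfall and the fix (not archae code):

```python
class Broken:
    @property
    def names(self) -> list[str]:
        return self.names  # re-enters this same property: RecursionError on access


class Fixed:
    _names = ["a", "b"]  # class-level data the property should expose

    @property
    def names(self) -> list[str]:
        return self.__class__._names  # reads the class attribute, no recursion
```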
archae/util/converter/file_size.py ADDED
@@ -0,0 +1,77 @@
+ """File size conversion utilities."""
+ 
+ import re
+ 
+ from archae.util.enum.byte_scale import ByteScale
+ 
+ 
+ def compact_value(value: float) -> str:
+     """Convert a file size number to a compact FileSize string.
+ 
+     Args:
+         value (float): The size to convert.
+ 
+     Returns:
+         str: The most compact exact byte-size representation.
+     """
+     exponent = 0
+     modulo: float = 0
+     while modulo == 0 and exponent < int(ByteScale.PETA.value):
+         modulo = value % 1024
+         if modulo == 0:
+             exponent += 1
+             value = int(value / 1024)
+     return f"{value}{ByteScale(exponent).prefix_letter}"  # type: ignore[call-arg]
+ 
+ 
+ def expand_value(value: str | int) -> int:
+     """Convert a FileSize string or int to an int.
+ 
+     Args:
+         value (str | int): The value to convert as necessary.
+ 
+     Returns:
+         int: Size in bytes.
+     """
+     try:
+         return int(value)
+     except (ValueError, TypeError):
+         pass
+ 
+     # Regex to split number and unit
+     match = re.match(r"^(\d+(?:\.\d+)?)\s*([KMGTP]B?)$", str(value), re.IGNORECASE)
+     if not match:
+         msg = f"{value} is not a valid file size (e.g., 10G, 500M)"
+         raise ValueError(msg)
+ 
+     number, unit = match.groups()
+     number = float(number)
+     unit = unit[0].upper()
+ 
+     byte_scale = 1024 ** ByteScale.from_prefix_letter(unit).value
+ 
+     # Scale the numeric part by the base-1024 unit multiplier
+     return int(number * byte_scale)
+ 
+ 
+ def convert(value: str | int) -> int:
+     """Convert a file size value to an int.
+ 
+     Args:
+         value (str | int): The value to convert as necessary.
+ 
+     Returns:
+         int: Size in bytes.
+     """
+     try:
+         return expand_value(value)
+     except ValueError as err:
+         msg = f"Could not convert {value} to file size: {err}"
+         raise ValueError(msg) from err
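
A few worked conversions in both directions, assuming ByteScale assigns exponents 0-5 to B/K/M/G/T/P and a prefix_letter of "B" for exponent 0 (the enum itself is not in this diff):

```python
from archae.util.converter.file_size import compact_value, expand_value

assert expand_value("10G") == 10 * 1024**3
assert expand_value("1.5K") == 1536      # the regex permits fractional values
assert expand_value(4096) == 4096        # ints pass through unchanged
assert compact_value(1048576) == "1M"    # exact powers of 1024 collapse fully
assert compact_value(1536) == "1536B"    # 1536 % 1024 != 0, so no collapse
```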
archae/util/file_tracker.py ADDED
@@ -0,0 +1,93 @@
+ """File tracking utilities for archae."""
+ 
+ from __future__ import annotations
+ 
+ import copy
+ from typing import Any
+ 
+ 
+ class FileTracker:
+     """Manages file tracking by hash with metadata and paths."""
+ 
+     def __init__(self) -> None:
+         """Initialize the FileTracker."""
+         self.tracked_files: dict[str, dict] = {}
+ 
+     def track_file(self, file_hash: str, file_size_bytes: int) -> None:
+         """Track a file by its hash.
+ 
+         Args:
+             file_hash (str): The hash of the file to track.
+             file_size_bytes (int): The size of the file in bytes.
+         """
+         if file_hash not in self.tracked_files:
+             self.tracked_files[file_hash] = {}
+             self.tracked_files[file_hash]["size"] = file_size_bytes
+             self.tracked_files[file_hash]["metadata"] = {}
+         elif self.tracked_files[file_hash]["size"] != file_size_bytes:
+             msg = f"Hash collision detected for hash {file_hash} with differing sizes."
+             raise RuntimeError(msg)
+ 
+     def is_file_tracked(self, file_hash: str) -> bool:
+         """Check if a file is tracked by its hash.
+ 
+         Args:
+             file_hash (str): The hash of the file to check.
+ 
+         Returns:
+             bool: True if the file is tracked, False otherwise.
+         """
+         return file_hash in self.tracked_files
+ 
+     def get_tracked_file_metadata(self, file_hash: str) -> dict:
+         """Get metadata for a tracked file by its hash.
+ 
+         Args:
+             file_hash (str): The hash of the file.
+ 
+         Returns:
+             dict: A deep copy of the tracked file's metadata.
+         """
+         return copy.deepcopy(self.tracked_files.get(file_hash, {}).get("metadata", {}))
+ 
+     def track_file_path(self, file_hash: str, file_path: Any) -> None:
+         """Track a file path by its hash.
+ 
+         Args:
+             file_hash (str): The hash of the file.
+             file_path (Any): The path to track.
+         """
+         if "paths" not in self.tracked_files[file_hash]:
+             self.tracked_files[file_hash]["paths"] = []
+ 
+         if file_path not in self.tracked_files[file_hash]["paths"]:
+             self.tracked_files[file_hash]["paths"].append(file_path)
+ 
+     def add_metadata_to_hash(self, file_hash: str, key: str, value: Any) -> None:
+         """Add metadata to a tracked file.
+ 
+         Args:
+             file_hash (str): The hash of the file.
+             key (str): The metadata key.
+             value (Any): The metadata value.
+         """
+         self.tracked_files[file_hash]["metadata"][key] = value
+ 
+     def get_tracked_file_size(self) -> int:
+         """Get the total size of all tracked files.
+ 
+         Returns:
+             int: The total size in bytes.
+         """
+         return sum(
+             self.tracked_files[file_hash].get("size", 0)
+             for file_hash in self.tracked_files
+         )
+ 
+     def get_tracked_files(self) -> dict[str, dict]:
+         """Get all tracked files. This is a deep copy to prevent external modification.
+ 
+         Returns:
+             dict[str, dict]: The tracked files dictionary.
+         """
+         return copy.deepcopy(self.tracked_files)
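
A short sketch of the FileTracker contract, using a made-up hash string:

```python
from archae.util.file_tracker import FileTracker

tracker = FileTracker()
tracker.track_file("deadbeef", 1024)
tracker.track_file_path("deadbeef", "/tmp/a.zip")
tracker.track_file_path("deadbeef", "/tmp/a.zip")  # duplicate paths are dropped
tracker.add_metadata_to_hash("deadbeef", "is_archive", True)

assert tracker.is_file_tracked("deadbeef")
assert tracker.get_tracked_file_size() == 1024
# Re-tracking the same hash with a different size raises RuntimeError.
# Accessors hand back deep copies, so callers cannot corrupt internal state:
tracker.get_tracked_files()["deadbeef"]["size"] = 0
assert tracker.get_tracked_file_size() == 1024
```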
archae/util/tool_manager.py ADDED
@@ -0,0 +1,112 @@
+ """Tool manager for locating and managing external archiving tools."""
+ 
+ from __future__ import annotations
+ 
+ import logging
+ import shutil
+ from typing import TYPE_CHECKING, ClassVar, cast
+ 
+ import archae.util.archiver
+ 
+ if TYPE_CHECKING:
+     from archae.util.archiver.base_archiver import BaseArchiver
+ 
+ logger = logging.getLogger("archae")
+ 
+ 
+ class ToolManager:
+     """Manager for locating and managing external archiving tools."""
+ 
+     __tools: ClassVar[dict[str, BaseArchiver]] = {}
+ 
+     @classmethod
+     def locate_tools(cls) -> None:
+         """Locate external tools."""
+         for archiver_cls in archae.util.archiver.BaseArchiver.__subclasses__():
+             logger.debug("Locating tool for %s", archiver_cls.archiver_name)
+             tool_path = shutil.which(str(archiver_cls.executable_name))
+             if tool_path:
+                 logger.debug("Found %s at %s", archiver_cls.archiver_name, tool_path)
+                 cls.__tools[str(archiver_cls.archiver_name)] = archiver_cls(tool_path)  # type: ignore[abstract]
+             else:
+                 logger.warning(
+                     "MISSING_ARCHIVER: Could not find %s; some archive types may not be supported",
+                     archiver_cls.archiver_name,
+                 )
+ 
+     @classmethod
+     def get_supported_extensions(cls) -> list[str]:
+         """Get a sorted list of all file extensions supported by located tools.
+ 
+         Returns:
+             list[str]: Sorted list of supported file extensions.
+         """
+         supported: set[str] = set()
+         for tool in cls.__tools.values():
+             supported.update(tool.file_extensions)
+         return sorted(supported)
+ 
+     @classmethod
+     def get_unsupported_extensions(cls) -> list[str]:
+         """Get a sorted list of all file extensions from all archiver subclasses that are not currently supported.
+ 
+         Returns:
+             list[str]: Sorted list of unsupported file extensions.
+         """
+         all_extensions: set[str] = set()
+         supported: set[str] = set()
+ 
+         # Get all extensions from all archiver classes
+         for archiver_cls in archae.util.archiver.BaseArchiver.__subclasses__():
+             all_extensions.update(cast("list[str]", archiver_cls.file_extensions))
+ 
+         # Get supported extensions from located tools
+         for tool in cls.__tools.values():
+             supported.update(tool.file_extensions)
+ 
+         # Return the difference
+         unsupported = all_extensions - supported
+         return sorted(unsupported)
+ 
+     @classmethod
+     def get_supported_mime_types(cls) -> list[str]:
+         """Get a sorted list of all MIME types supported by located tools.
+ 
+         Returns:
+             list[str]: Sorted list of supported MIME types.
+         """
+         supported: set[str] = set()
+         for tool in cls.__tools.values():
+             supported.update(tool.mime_types)
+         return sorted(supported)
+ 
+     @classmethod
+     def get_unsupported_mime_types(cls) -> list[str]:
+         """Get a sorted list of all MIME types from all archiver subclasses that are not currently supported.
+ 
+         Returns:
+             list[str]: Sorted list of unsupported MIME types.
+         """
+         all_mime_types: set[str] = set()
+         supported: set[str] = set()
+ 
+         # Get all MIME types from all archiver classes
+         for archiver_cls in archae.util.archiver.BaseArchiver.__subclasses__():
+             all_mime_types.update(cast("list[str]", archiver_cls.mime_types))
+ 
+         # Get supported MIME types from located tools
+         for tool in cls.__tools.values():
+             supported.update(tool.mime_types)
+ 
+         # Return the difference
+         unsupported = all_mime_types - supported
+         return sorted(unsupported)
+ 
+     @classmethod
+     def get_tools(cls) -> dict[str, BaseArchiver]:
+         """Get a shallow copy of the tools dictionary.
+ 
+         Returns:
+             dict[str, BaseArchiver]: A shallow copy of the tools dictionary.
+         """
+         return cls.__tools.copy()
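
Because locate_tools() relies on shutil.which(), the reported support lists depend on which executables are on the PATH at run time. A minimal query sketch:

```python
from archae.util.tool_manager import ToolManager

ToolManager.locate_tools()  # probes PATH for each BaseArchiver subclass

print("supported extensions:", ToolManager.get_supported_extensions())
print("unsupported extensions:", ToolManager.get_unsupported_extensions())
print("supported MIME types:", ToolManager.get_supported_mime_types())
```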