archae-2026.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- archae/__init__.py +5 -0
- archae/__main__.py +9 -0
- archae/cli.py +176 -0
- archae/config.py +107 -0
- archae/default_settings.toml +9 -0
- archae/extractor.py +249 -0
- archae/options.yaml +39 -0
- archae/py.typed +0 -0
- archae/util/__init__.py +1 -0
- archae/util/archiver/__init__.py +6 -0
- archae/util/archiver/base_archiver.py +55 -0
- archae/util/archiver/peazip.py +159 -0
- archae/util/archiver/seven_zip.py +199 -0
- archae/util/archiver/unar.py +158 -0
- archae/util/converter/file_size.py +77 -0
- archae/util/enum/__init__.py +5 -0
- archae/util/enum/byte_scale.py +55 -0
- archae/util/file_tracker.py +93 -0
- archae/util/tool_manager.py +112 -0
- archae-2026.2.0.dist-info/METADATA +161 -0
- archae-2026.2.0.dist-info/RECORD +23 -0
- archae-2026.2.0.dist-info/WHEEL +4 -0
- archae-2026.2.0.dist-info/entry_points.txt +3 -0
archae/__init__.py
ADDED
archae/__main__.py
ADDED
archae/cli.py
ADDED
@@ -0,0 +1,176 @@
"""Main CLI for archae."""

from __future__ import annotations

import logging
import pathlib
from importlib import metadata
from pathlib import Path

import rich_click as click

from archae.config import apply_options, convert_settings, get_options
from archae.extractor import ArchiveExtractor
from archae.util.tool_manager import ToolManager

logger = logging.getLogger("archae")
logger.setLevel(logging.INFO)


@click.group(
    context_settings={"help_option_names": ["-h", "--help"], "show_default": True}
)
@click.rich_config(
    help_config=click.RichHelpConfiguration(
        width=88,
        show_arguments=True,
        text_markup=True,
    ),
)
@click.version_option(metadata.version("archae"), "-v", "--version")
def cli() -> None:
    """Archae explodes archives."""


@cli.command()
@click.argument(
    "archive_path",
    type=click.Path(exists=True, dir_okay=False, readable=True, path_type=pathlib.Path),
    default=Path.cwd() / "extracted",
    help="Archive to examine",
)
@click.option(
    "-o",
    "--opt",
    "options",
    nargs=2,
    type=click.Tuple([str, str]),
    multiple=True,
    help="Set config options as key value pairs. Use 'archae listopts' to see available options.",
)
@click.option(
    "-e",
    "--extract-dir",
    "extract_dir",
    nargs=1,
    type=click.Path(
        dir_okay=True,
        file_okay=False,
        readable=True,
        writable=True,
        path_type=pathlib.Path,
    ),
    default=Path.cwd() / "extracted",
    help="Set config options as key value pairs. Use 'archae listopts' to see available options.",
)
def extract(
    archive_path: pathlib.Path,
    options: list[tuple[str, str]] | None,
    extract_dir: pathlib.Path,
) -> None:
    """Extract and analyze an archive."""
    # Apply any options from the command line, then convert any convertible settings
    if options:
        apply_options(options)
    convert_settings()

    # Locate external tools
    ToolManager.locate_tools()
    extractor = ArchiveExtractor(extract_dir=extract_dir)
    extractor.handle_file(archive_path)
    print_tracked_files(extractor.get_tracked_files())
    print_warnings(extractor.get_warnings())


@cli.command()
def listopts() -> None:
    """List all available configuration options."""
    options = get_options()

    # Load default settings
    defaults_path = Path(__file__).parent / "default_settings.toml"
    defaults_content = defaults_path.read_text()
    defaults = {}
    in_default_section = False
    for line in defaults_content.split("\n"):
        if line.strip() == "[default]":
            in_default_section = True
            continue
        if in_default_section and line.startswith("["):
            break
        if in_default_section and "=" in line:
            key, value = line.split("=", 1)
            defaults[key.strip()] = value.strip().strip('"')

    logger.info("Available configuration options:")
    logger.info("------------------------------------------------")
    for option_name, option_def in sorted(options.items()):
        logger.info("%s (%s)", option_name, option_def.get("type", "unknown"))
        logger.info(" %s", option_def.get("help", "No description available"))
        if option_name in defaults:
            logger.info(" Default: %s", defaults[option_name])


@cli.command()
def status() -> None:
    """Show archae status and available tools."""
    logger.info("Archae status:")
    logger.info("Version: %s", metadata.version("archae"))
    ToolManager.locate_tools()
    logger.info("Tools located and ready to use.")
    logger.info("------------------------------------------------")

    # Show supported extensions
    supported_ext = ToolManager.get_supported_extensions()
    logger.info("Supported file extensions (%d):", len(supported_ext))
    if supported_ext:
        logger.info(" %s", ", ".join(supported_ext))
    else:
        logger.info(" (none)")

    # Show unsupported extensions
    unsupported_ext = ToolManager.get_unsupported_extensions()
    logger.info("Unsupported file extensions (%d):", len(unsupported_ext))
    if unsupported_ext:
        logger.info(" %s", ", ".join(unsupported_ext))
    else:
        logger.info(" (none)")

    logger.info("------------------------------------------------")

    # Show supported MIME types
    supported_mime = ToolManager.get_supported_mime_types()
    logger.info("Supported MIME types (%d):", len(supported_mime))
    if supported_mime:
        logger.info(" %s", ", ".join(supported_mime))
    else:
        logger.info(" (none)")

    # Show unsupported MIME types
    unsupported_mime = ToolManager.get_unsupported_mime_types()
    logger.info("Unsupported MIME types (%d):", len(unsupported_mime))
    if unsupported_mime:
        logger.info(" %s", ", ".join(unsupported_mime))
    else:
        logger.info(" (none)")


def print_tracked_files(tracked_files: dict[str, dict]) -> None:
    """Print the tracked files for debugging purposes."""
    logger.info("------------------------------------------------")
    for hash, info in tracked_files.items():
        logger.info("Hash: %s", hash)
        logger.info(" Size: %s bytes", info.get("size", "Unknown"))
        for path in info.get("paths", []):
            logger.info(" Path: %s", path)
        logger.info(" Metadata:")
        for key, value in info.get("metadata", {}).items():
            logger.info(" %s: %s", key, value)


def print_warnings(warnings: list[str]) -> None:
    """Print accumulated warnings for debugging purposes."""
    logger.info("------------------------------------------------")
    logger.info("Accumulated Warnings:")
    for warning in warnings:  # type: ignore[attr-defined]
        logger.info(warning)
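For illustration only (this is not part of the wheel): a minimal sketch of exercising the extract command in-process with Click's test runner. The archive name sample.zip and the output directory are hypothetical, and the external extraction tools must be installed for the run to succeed.

from pathlib import Path

from click.testing import CliRunner

from archae.cli import cli

runner = CliRunner()
# "-o KEY VALUE" pairs map to the nargs=2 --opt option; MAX_DEPTH is one of the
# keys defined in archae/options.yaml.
result = runner.invoke(
    cli,
    ["extract", "sample.zip", "--extract-dir", str(Path.cwd() / "out"), "-o", "MAX_DEPTH", "3"],
)
print(result.exit_code)
print(result.output)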
archae/config.py
ADDED
@@ -0,0 +1,107 @@
"""Runtime config management (default, userconfig and envvars)."""

import importlib
import typing
from pathlib import Path

import platformdirs
import yaml
from dynaconf import Dynaconf

# Get the package directory for default settings
package_dir = Path(__file__).parent
default_settings_file = package_dir / "default_settings.toml"

# Get the config directory following XDG standards
config_dir = Path(platformdirs.user_config_dir("archae"))
config_dir.mkdir(parents=True, exist_ok=True)

# Define the user config file path
user_config_file = config_dir / "settings.toml"

# Create a default settings.toml if it doesn't exist
if not user_config_file.exists():
    user_config_file.write_text("""# Archae configuration
# Override defaults from the package here
""")

settings = Dynaconf(
    envvar_prefix="ARCHAE",
    settings_files=[
        str(default_settings_file),  # Load package defaults first
        str(user_config_file),  # User settings override defaults
    ],
    environments=True,
)

default_settings = Dynaconf(
    envvar_prefix="ARCHAE",
    settings_files=[
        str(default_settings_file),  # Load package defaults first
    ],
    environments=True,
)


options_file = package_dir / "options.yaml"


def get_options() -> dict:
    """Return the contents of options.yaml."""
    with Path.open(options_file) as f:
        return yaml.safe_load(f)


def get_converter(converter_def: str) -> typing.Callable:
    """Dynamically import and instantiate a converter class.

    Args:
        converter_def (str): Converter definition in format "module.path:ClassName" or a builtin type like "float" or "int".

    Returns:
        Converter function.
    """
    # Handle built-in types
    if converter_def == "float":
        return float
    if converter_def == "int":
        return int

    # Split the definition into module path and class name
    module_name, class_name = converter_def.split(":")

    # Import the module
    module = importlib.import_module(module_name)

    # Get the class from the module
    return getattr(module, class_name)


def apply_options(option_list: list[tuple[str, str]]) -> None:
    """Apply a list of options to the settings.

    Args:
        option_list (list[tuple[str, str]]): List of key-value pairs to apply.

    """
    options = get_options()
    for key, value in option_list:
        # Find the option definition by matching the key
        option_def = None
        for def_key in options:
            option_def = options[def_key]
            break
        if option_def:
            settings[key] = value
        else:
            pass


def convert_settings() -> None:
    """Convert settings using their defined converters."""
    options = get_options()
    for key in options:
        option_def = options[key]
        if "converter" in option_def:
            converter = get_converter(option_def["converter"])
            settings[key] = converter(settings[key])
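As a quick illustration of the "module.path:callable" convention that get_converter resolves, here is a sketch using a standard-library module rather than one of archae's own converters: the string is split on ":", the left side is imported, and the named attribute is returned.

from archae.config import get_converter

# Built-in type names are returned directly.
assert get_converter("int") is int

# "json:loads" imports the json module and returns its loads attribute,
# i.e. the equivalent of "from json import loads".
loads = get_converter("json:loads")
print(loads('{"MAX_DEPTH": 3}'))  # {'MAX_DEPTH': 3}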
archae/extractor.py
ADDED
@@ -0,0 +1,249 @@
"""Archive extraction module for archae."""

from __future__ import annotations

import hashlib
import logging
import shutil
from typing import TYPE_CHECKING

import magic

from archae.config import apply_options, default_settings, settings
from archae.util.file_tracker import FileTracker
from archae.util.tool_manager import ToolManager

if TYPE_CHECKING:
    from pathlib import Path

    from archae.util.archiver.base_archiver import BaseArchiver


class WarningAccumulator(logging.Handler):
    """Logging handler to accumulate warnings while still printing them."""

    def __init__(self) -> None:
        """Initialize the WarningAccumulator."""
        super().__init__()
        self.warnings: list[str] = []

    def emit(self, record: logging.LogRecord) -> None:
        """Print and accumulate warning messages."""
        if record.levelno >= logging.WARNING:
            self.warnings.append(self.format(record))
            print(self.format(record))  # noqa: T201


logger = logging.getLogger("archae")
logger.setLevel(logging.INFO)
accumulator = WarningAccumulator()
logger.addHandler(accumulator)
logger.setLevel(logging.DEBUG)


class ArchiveExtractor:
    """Handles archive extraction and file tracking."""

    def __init__(self, extract_dir: Path) -> None:
        """Initialize the ArchiveExtractor.

        Args:
            extract_dir (Path): The base directory for extraction. Defaults to current working directory + extracted.
        """
        self.extract_dir = extract_dir
        if self.extract_dir.exists() and self.extract_dir.is_dir():
            shutil.rmtree(self.extract_dir)
        self.extract_dir.mkdir(exist_ok=True)
        self.file_tracker = FileTracker()

    def handle_file(self, file_path: Path) -> None:
        """Handle a file given its path.

        Args:
            file_path (Path): The path to the file.
        """
        self.__handle_file(file_path)

    def __handle_file(self, file_path: Path, depth: int = 1) -> None:
        """Internal implementation of handle_file.

        Args:
            file_path (Path): The path to the file.
            depth (int): The current depth in the archive extraction tree. Defaults to 1.
        """
        logger.info("Starting examination of file: %s", file_path)

        base_hash = self._sha256_hash_file(file_path)
        file_size_bytes = file_path.stat().st_size
        self.file_tracker.track_file(base_hash, file_size_bytes)
        self.file_tracker.track_file_path(base_hash, file_path)
        self.file_tracker.add_metadata_to_hash(
            base_hash, "type", magic.from_file(file_path)
        )
        self.file_tracker.add_metadata_to_hash(
            base_hash, "type_mime", magic.from_file(file_path, mime=True)
        )
        extension = file_path.suffix.lstrip(".").lower()
        self.file_tracker.add_metadata_to_hash(base_hash, "extension", extension)
        is_file_archive = self._is_archive(base_hash)
        self.file_tracker.add_metadata_to_hash(base_hash, "is_archive", is_file_archive)
        if is_file_archive:
            if settings["MAX_DEPTH"] == 0 or depth < settings["MAX_DEPTH"]:
                archiver = self._get_archiver_for_file(base_hash)
                if archiver:
                    extracted_size = archiver.get_archive_uncompressed_size(file_path)
                    self.file_tracker.add_metadata_to_hash(
                        base_hash, "extracted_size", extracted_size
                    )
                    compression_ratio = extracted_size / file_size_bytes
                    self.file_tracker.add_metadata_to_hash(
                        base_hash, "overall_compression_ratio", compression_ratio
                    )
                    if extracted_size > settings["MAX_ARCHIVE_SIZE_BYTES"]:
                        logger.warning(
                            "MAX_ARCHIVE_SIZE_BYTES: Skipped archive %s because expected size %s is greater than MAX_ARCHIVE_SIZE_BYTES %s",
                            file_path,
                            extracted_size,
                            settings["MAX_ARCHIVE_SIZE_BYTES"],
                        )
                    elif (
                        self.file_tracker.get_tracked_file_size() + extracted_size
                        > settings["MAX_TOTAL_SIZE_BYTES"]
                    ):
                        logger.warning(
                            "MAX_TOTAL_SIZE_BYTES: Skipped archive %s because expected size %s + current tracked files %s is greater than MAX_TOTAL_SIZE_BYTES %s",
                            file_path,
                            extracted_size,
                            self.file_tracker.get_tracked_file_size(),
                            settings["MAX_TOTAL_SIZE_BYTES"],
                        )
                    elif compression_ratio < settings["MIN_ARCHIVE_RATIO"]:
                        logger.warning(
                            "MIN_ARCHIVE_RATIO: Skipped archive %s because compression ratio %.5f is less than MIN_ARCHIVE_RATIO %s",
                            file_path,
                            compression_ratio,
                            settings["MIN_ARCHIVE_RATIO"],
                        )
                    elif (
                        shutil.disk_usage(self.extract_dir).free - extracted_size
                        < settings["MIN_DISK_FREE_SPACE"]
                    ):
                        logger.warning(
                            "MIN_DISK_FREE_SPACE: Skipped archive %s because extracting it would leave less than MIN_DISK_FREE_SPACE %s bytes free at extraction location %s",
                            file_path,
                            settings["MIN_DISK_FREE_SPACE"],
                            self.extract_dir,
                        )
                    else:
                        extraction_dir = self.extract_dir / base_hash
                        archiver.extract_archive(file_path, extraction_dir)
                        child_files = self._list_child_files(extraction_dir)
                        for child_file in child_files:
                            self.__handle_file(child_file, depth + 1)
                else:
                    logger.warning(
                        "NO_ARCHIVER: No suitable archiver found for file: %s",
                        file_path,
                    )
            else:
                logger.warning(
                    "MAX_DEPTH: File %s is not extracted; max depth reached.", file_path
                )

    def _is_archive(self, file_hash: str) -> bool:
        """Determine whether a file is a supported archive based on its metadata.

        Args:
            file_hash (str): The hash of the file.

        Returns:
            bool: True if the file is an archive, otherwise False.

        """
        metadata = self.file_tracker.get_tracked_file_metadata(file_hash)
        mime_type = metadata.get("type_mime", "").lower()
        extension = metadata.get("extension", "").lower()

        for tool in ToolManager.get_tools().values():
            if mime_type in tool.mime_types or extension in tool.file_extensions:
                return True

        return False

    def _get_archiver_for_file(self, file_hash: str) -> BaseArchiver | None:
        """Determine the appropriate archiver for a file based on its metadata.

        Args:
            file_hash (str): The hash of the file.

        Returns:
            BaseArchiver | None: The archiver tool if found, otherwise None.
        """
        metadata = self.file_tracker.get_tracked_file_metadata(file_hash)
        mime_type = metadata.get("type_mime", "").lower()
        extension = metadata.get("extension", "").lower()

        for tool in ToolManager.get_tools().values():
            if mime_type in tool.mime_types or extension in tool.file_extensions:
                return tool
        return None

    @staticmethod
    def _list_child_files(directory_path: Path, pattern: str = "*") -> list[Path]:
        """Recursively get a list of files matching a pattern in a directory.

        Args:
            directory_path (Path): The starting directory path.
            pattern (str): The file pattern to match (e.g., '*.txt', '*.py').

        Returns:
            list: A list of Path objects for the matching files.
        """
        # rglob performs a recursive search
        files = list(directory_path.rglob(pattern))
        # Optionally, filter out directories if pattern='*'
        return [file for file in files if file.is_file()]

    @staticmethod
    def _sha256_hash_file(file_path: Path) -> str:
        """Computes the SHA-256 hash of a file.

        Args:
            file_path (Path): The path to the file.

        Returns:
            str: The SHA-256 hash of the file in hexadecimal format.
        """
        try:
            with file_path.open("rb") as f:
                digest = hashlib.file_digest(f, "sha256")
                return digest.hexdigest()
        except FileNotFoundError:
            return "Error: File not found"

    def get_tracked_files(self) -> dict[str, dict]:
        """Return the tracked files for debugging purposes."""
        return self.file_tracker.get_tracked_files()

    def get_warnings(self) -> list[str]:
        """Return accumulated warnings for debugging purposes."""
        return accumulator.warnings

    def get_default_settings(self) -> dict:
        """Get the default settings from the config module.

        Returns:
            dict: Dictionary of default settings.
        """
        return dict(default_settings)

    def apply_settings(self, option_list: list[tuple[str, str]]) -> None:
        """Apply a list of settings options.

        Args:
            option_list (list[tuple[str, str]]): List of (key, value) tuples to apply.

        Example:
            extractor.apply_settings([("MAX_ARCHIVE_SIZE_BYTES", "5000000000")])
        """
        apply_options(option_list)
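A minimal usage sketch of the extractor API, mirroring what cli.extract does. The archive path and extraction directory are hypothetical, and one of the supported external tools (7-Zip, unar or PeaZip) must be available for extraction to happen.

from pathlib import Path

from archae.extractor import ArchiveExtractor
from archae.util.tool_manager import ToolManager

ToolManager.locate_tools()                           # discover the external extraction tools
extractor = ArchiveExtractor(extract_dir=Path("extracted"))
extractor.handle_file(Path("sample.zip"))            # hashes, classifies and recursively extracts
for file_hash, info in extractor.get_tracked_files().items():
    print(file_hash, info.get("size"), info.get("paths"))
print(extractor.get_warnings())                      # any size/ratio/depth skips logged above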
archae/options.yaml
ADDED
@@ -0,0 +1,39 @@
MAX_TOTAL_SIZE_BYTES:
  type: int
  converter: archae.util.converter.file_size:convert
  help: Maximum total size of all archives to extract in bytes.
  examples:
    - 1GB
    - 500M
    - 500
MAX_ARCHIVE_SIZE_BYTES:
  type: int
  converter: archae.util.converter.file_size:convert
  help: Maximum size of a single archive to extract in bytes.
  examples:
    - 1GB
    - 500M
    - 500
MIN_ARCHIVE_RATIO:
  type: float
  converter: float
  help: Minimum compression ratio (compressed size / uncompressed size) required to extract an archive.
  examples:
    - 0.001
MIN_DISK_FREE_SPACE:
  type: int
  converter: archae.util.converter.file_size:convert
  help: Minimum required estimated disk space after extraction in bytes.
  examples:
    - 1GB
    - 500M
    - 500
MAX_DEPTH:
  type: int
  converter: int
  help: Maximum extraction depth for nested archives. Use 0 for unlimited depth.
  examples:
    - 3
    - 5
    - 10
    - 0
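Tying options.yaml back to config.py: each key's converter string is resolved by get_converter and run by convert_settings, so a human-readable value passed via -o is stored as a string until conversion. A rough sketch, assuming archae.util.converter.file_size:convert parses suffixed sizes as the examples above suggest:

from archae.config import apply_options, convert_settings, settings

apply_options([("MAX_ARCHIVE_SIZE_BYTES", "500M")])  # stored as the raw string "500M"
convert_settings()                                    # applies file_size:convert to it
print(settings["MAX_ARCHIVE_SIZE_BYTES"])             # an integer byte count; exact value depends on the converter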
archae/py.typed
ADDED
File without changes
archae/util/__init__.py
ADDED
@@ -0,0 +1 @@
"""Utility modules for archae."""
archae/util/archiver/base_archiver.py
ADDED
@@ -0,0 +1,55 @@
"""Base archiver class for extraction tools."""

from abc import ABC, abstractmethod
from pathlib import Path


class BaseArchiver(ABC):
    """Base class for archiver/extractor tools."""

    @abstractmethod
    def __init__(self, executable_path: str | Path) -> None:
        """Initialize the archiver.

        Args:
            executable_path: Path to the executable.
        """

    @property
    def archiver_name(self) -> str:
        """Get the archiver name."""
        return self.archiver_name

    @property
    def executable_name(self) -> str:
        """Get the executable name."""
        return self.executable_name

    @property
    def file_extensions(self) -> list[str]:
        """A non-abstract method that accesses the class impl for the file extensions."""
        return self.__class__.file_extensions  # type: ignore[return-value]

    @property
    def mime_types(self) -> list[str]:
        """A non-abstract method that accesses the class impl for the mime types."""
        return self.__class__.mime_types  # type: ignore[return-value]

    @abstractmethod
    def extract_archive(self, archive_path: Path, extract_dir: Path) -> None:
        """Extracts an archive to a specified directory.

        Args:
            archive_path (Path): The path to the archive file.
            extract_dir (Path): The directory to extract the archive to.

        """

    @abstractmethod
    def get_archive_uncompressed_size(self, archive_path: Path) -> int:
        """Get the uncompressed size of the contents.

        Args:
            archive_path (Path): The path to the archive file.

        """
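For orientation, a hypothetical subclass sketch (not one of the bundled peazip/seven_zip/unar implementations) showing the surface BaseArchiver expects from a concrete archiver. The tar handling via the standard library is illustrative only; the real tools shell out to external executables.

from __future__ import annotations

import tarfile
from pathlib import Path

from archae.util.archiver.base_archiver import BaseArchiver


class TarArchiver(BaseArchiver):
    """Illustrative archiver backed by the standard-library tarfile module."""

    # Class-level identity used by ToolManager-style lookups (names are hypothetical).
    archiver_name = "tar"
    executable_name = "tar"
    file_extensions = ["tar"]
    mime_types = ["application/x-tar"]

    def __init__(self, executable_path: str | Path) -> None:
        self.executable_path = executable_path  # unused here; real archivers invoke it

    def extract_archive(self, archive_path: Path, extract_dir: Path) -> None:
        # Extract every member into the target directory.
        with tarfile.open(archive_path) as tar:
            tar.extractall(extract_dir)

    def get_archive_uncompressed_size(self, archive_path: Path) -> int:
        # Sum of member sizes, matching what the size-limit checks expect.
        with tarfile.open(archive_path) as tar:
            return sum(member.size for member in tar.getmembers())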