archae 2026.1.0b2__py3-none-any.whl → 2026.2.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
archae/__init__.py CHANGED
@@ -1 +1,5 @@
1
1
  """Archae explodes archives."""
2
+
3
+ from archae.extractor import ArchiveExtractor
4
+
5
+ __all__ = ["ArchiveExtractor"]
archae/cli.py CHANGED
@@ -2,111 +2,22 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- import copy
6
- import hashlib
7
- import re
8
- import shutil
5
+ import logging
6
+ import pathlib
9
7
  from importlib import metadata
10
8
  from pathlib import Path
11
- from typing import TYPE_CHECKING, Any
12
9
 
13
- import magic
14
10
  import rich_click as click
15
11
 
16
- import archae.util.archiver
17
- from archae.util.enum import ByteScale
12
+ from archae.config import apply_options, convert_settings, get_options
13
+ from archae.extractor import ArchiveExtractor
14
+ from archae.util.tool_manager import ToolManager
18
15
 
19
- if TYPE_CHECKING:
20
- from archae.util.archiver.base_archiver import BaseArchiver
16
+ logger = logging.getLogger("archae")
17
+ logger.setLevel(logging.INFO)
21
18
 
22
- tools: dict[str, BaseArchiver] = {}
23
19
 
24
-
25
- class FileSizeParamType(click.ParamType):
26
- """Class to handle FileSize as a Click Param."""
27
-
28
- name = "filesize"
29
-
30
- @staticmethod
31
- def compact_value(value: float) -> str:
32
- """Convert a float of file size to a FileSizeParam string.
33
-
34
- Args:
35
- value (float): The size to convert
36
-
37
- Returns:
38
- str: A string with the most collapsed exact byte size rep.
39
-
40
- """
41
- exponent = 0
42
- modulo: float = 0
43
- while modulo == 0 and exponent < int(ByteScale.PETA.value):
44
- modulo = value % 1024
45
- if modulo == 0:
46
- exponent += 1
47
- value = int(value / 1024)
48
- return f"{value}{ByteScale(exponent).prefix_letter}" # type: ignore[call-arg]
49
-
50
- @staticmethod
51
- def expand_value(value: str | int) -> int:
52
- """Convert a FileSizeParam string or int to an int.
53
-
54
- Args:
55
- value (str | int): The value to convert as necessary.
56
-
57
- Returns:
58
- int: Size in bytes
59
-
60
- """
61
- try:
62
- return int(value)
63
- except ValueError:
64
- pass
65
-
66
- # Regex to split number and unit
67
- match = re.match(r"^(\d+(?:\.\d+)?)\s*([KMGTP]B?)$", str(value), re.IGNORECASE)
68
- if not match:
69
- msg = f"{value} is not a valid file size (e.g., 10G, 500M)"
70
- raise ValueError(msg)
71
-
72
- number, unit = match.groups()
73
- number = float(number)
74
- unit = unit[0].upper()
75
-
76
- byte_scale = 1024 ** (ByteScale.from_prefix_letter(unit).value)
77
-
78
- # Default to bytes if no specific unit multiplier, or assume B
79
- return int(number * byte_scale)
80
-
81
- def convert(self, value: click.Option, param: str, ctx: click.Context) -> int:
82
- """Convert a FileSizeParam to an int.
83
-
84
- Args:
85
- value (click.Option): The value to convert as necessary.
86
- param (str): The param we are validating.
87
- ctx (click.Context): The click Context to fail if we can't parse it.
88
-
89
- Returns:
90
- int: Size in bytes
91
-
92
- """
93
- try:
94
- return self.expand_value(value)
95
- except ValueError as err:
96
- self.fail(str(err), param, ctx)
97
- return 0
98
-
99
-
100
- defaults = {
101
- "max_total_size_bytes": FileSizeParamType.expand_value("100G"),
102
- "max_archive_size_bytes": FileSizeParamType.expand_value("10G"),
103
- "min_archive_ratio": 0.005,
104
- }
105
-
106
- config = copy.deepcopy(defaults)
107
-
108
-
109
- @click.command(
20
+ @click.group(
110
21
  context_settings={"help_option_names": ["-h", "--help"], "show_default": True}
111
22
  )
112
23
  @click.rich_config(
@@ -116,266 +27,150 @@ config = copy.deepcopy(defaults)
116
27
  text_markup=True,
117
28
  ),
118
29
  )
30
+ @click.version_option(metadata.version("archae"), "-v", "--version")
31
+ def cli() -> None:
32
+ """Archae explodes archives."""
33
+
34
+
35
+ @cli.command()
119
36
  @click.argument(
120
37
  "archive_path",
121
- type=click.Path(exists=True, dir_okay=False),
38
+ type=click.Path(exists=True, dir_okay=False, readable=True, path_type=pathlib.Path),
39
+ default=Path.cwd() / "extracted",
122
40
  help="Archive to examine",
123
41
  )
124
42
  @click.option(
125
- "--max_total_size_bytes",
126
- type=FileSizeParamType(),
127
- default=defaults["max_total_size_bytes"],
128
- help=f"Maximum total extraction size before failing, default {FileSizeParamType.compact_value(defaults['max_total_size_bytes'])}",
43
+ "-o",
44
+ "--opt",
45
+ "options",
46
+ nargs=2,
47
+ type=click.Tuple([str, str]),
48
+ multiple=True,
49
+ help="Set config options as key value pairs. Use 'archae listopts' to see available options.",
129
50
  )
130
51
  @click.option(
131
- "--max_archive_size_bytes",
132
- type=FileSizeParamType(),
133
- default=defaults["max_archive_size_bytes"],
134
- help=f"Maximum individual archive extraction size before failing, default {FileSizeParamType.compact_value(defaults['max_archive_size_bytes'])}",
135
- )
136
- @click.option(
137
- "--min_archive_ratio",
138
- type=click.FloatRange(0, 1),
139
- default=defaults["min_archive_ratio"],
140
- help=f"Minimum allowed compression ratio for an archive. A floating-point value between 0.0 and 1.0, inclusive. Default is {defaults['min_archive_ratio']}",
52
+ "-e",
53
+ "--extract-dir",
54
+ "extract_dir",
55
+ nargs=1,
56
+ type=click.Path(
57
+ dir_okay=True,
58
+ file_okay=False,
59
+ readable=True,
60
+ writable=True,
61
+ path_type=pathlib.Path,
62
+ ),
63
+ default=Path.cwd() / "extracted",
64
+ help="Set config options as key value pairs. Use 'archae listopts' to see available options.",
141
65
  )
142
- @click.version_option(metadata.version("archae"), "-v", "--version")
143
- def cli(
144
- archive_path: str,
145
- max_total_size_bytes: int,
146
- max_archive_size_bytes: int,
147
- min_archive_ratio: float,
66
+ def extract(
67
+ archive_path: pathlib.Path,
68
+ options: list[tuple[str, str]] | None,
69
+ extract_dir: pathlib.Path,
148
70
  ) -> None:
149
- """Archae explodes archives."""
150
- locate_tools()
151
- config["max_total_size_bytes"] = max_total_size_bytes
152
- config["max_archive_size_bytes"] = max_archive_size_bytes
153
- config["min_archive_ratio"] = min_archive_ratio
154
- handle_file(Path(archive_path))
155
- debug_print_tracked_files()
156
-
157
-
158
- tracked_files: dict[str, dict] = {}
159
- base_dir = Path.cwd()
160
- extract_dir = base_dir / "extracted"
161
- if extract_dir.exists() and extract_dir.is_dir():
162
- shutil.rmtree(extract_dir)
163
- extract_dir.mkdir(exist_ok=True)
164
-
165
-
166
- def locate_tools() -> None:
167
- """Locate external tools."""
168
- for cls in archae.util.archiver.BaseArchiver.__subclasses__():
169
- tool_path = shutil.which(str(cls.executable_name))
170
- if tool_path is not None:
171
- tools[str(cls.archiver_name)] = cls(tool_path) # type: ignore[abstract]
172
-
173
-
174
- def handle_file(file_path: Path) -> None:
175
- """Handle a file given its path.
176
-
177
- Args:
178
- file_path (Path): The path to the file.
179
- """
180
- click.echo(f"Starting examination of file: {file_path!s}")
181
-
182
- base_hash = sha256_hash_file(file_path)
183
- file_size_bytes = file_path.stat().st_size
184
- track_file(base_hash, file_size_bytes)
185
- track_file_path(base_hash, file_path)
186
- add_metadata_to_hash(base_hash, "type", magic.from_file(file_path))
187
- add_metadata_to_hash(base_hash, "type_mime", magic.from_file(file_path, mime=True))
188
- extension = file_path.suffix.lstrip(".").lower()
189
- add_metadata_to_hash(base_hash, "extension", extension)
190
- is_file_archive = is_archive(base_hash)
191
- add_metadata_to_hash(base_hash, "is_archive", is_file_archive)
192
- if is_file_archive:
193
- archiver = get_archiver_for_file(base_hash)
194
- if archiver:
195
- extracted_size = archiver.get_archive_uncompressed_size(file_path)
196
- add_metadata_to_hash(base_hash, "extracted_size", extracted_size)
197
- compression_ratio = extracted_size / file_size_bytes
198
- add_metadata_to_hash(
199
- base_hash, "overall_compression_ratio", compression_ratio
200
- )
201
- if extracted_size > config["max_archive_size_bytes"]:
202
- click.echo(
203
- f"Skipped archive {file_path} because expected size {extracted_size} is greater than max_archive_size_bytes {config['max_archive_size_bytes']}"
204
- )
205
- elif (
206
- get_tracked_file_size() + extracted_size
207
- > config["max_total_size_bytes"]
208
- ):
209
- click.echo(
210
- f"Skipped archive {file_path} because expected size {extracted_size} + current tracked files {get_tracked_file_size()} is greater than max_total_size_bytes {config['max_total_size_bytes']}"
211
- )
212
- elif compression_ratio < config["min_archive_ratio"]:
213
- click.echo(
214
- f"Skipped archive {file_path} because compression ratio {compression_ratio:.5f} is less than min_archive_ratio {config['min_archive_ratio']}"
215
- )
216
- else:
217
- extraction_dir = extract_dir / base_hash
218
- archiver.extract_archive(file_path, extraction_dir)
219
- child_files = list_child_files(extraction_dir)
220
- for child_file in child_files:
221
- handle_file(child_file)
222
- else:
223
- click.echo(f"No suitable archiver found for file: {file_path!s}")
224
-
225
-
226
- def is_archive(hash: str) -> bool:
227
- """Determine the appropriate archiver for a file based on its metadata.
228
-
229
- Args:
230
- hash (str): The hash of the file.
231
-
232
- Returns:
233
- bool: True if the file is an archive, otherwise False.
234
-
235
- """
236
- metadata = get_tracked_file_metadata(hash)
237
- mime_type = metadata.get("type_mime", "").lower()
238
- extension = metadata.get("extension", "").lower()
239
-
240
- for tool in tools.values():
241
- if mime_type in tool.mime_types or extension in tool.file_extensions:
242
- return True
243
-
244
- return False
245
-
246
-
247
- def get_archiver_for_file(hash: str) -> BaseArchiver | None:
248
- """Determine the appropriate archiver for a file based on its metadata.
249
-
250
- Args:
251
- hash (str): The hash of the file.
252
-
253
- Returns:
254
- str | None: The name of the archiver tool if found, otherwise None.
255
- """
256
- metadata = get_tracked_file_metadata(hash)
257
- mime_type = metadata.get("type_mime", "").lower()
258
- extension = metadata.get("extension", "").lower()
259
-
260
- for tool in tools.values():
261
- if mime_type in tool.mime_types or extension in tool.file_extensions:
262
- return tool
263
- return None
264
-
265
-
266
- def list_child_files(directory_path: Path, pattern: str = "*") -> list[Path]:
267
- """Recursively get a list of files matching a pattern in a directory.
268
-
269
- Args:
270
- directory_path (Path): The starting directory path.
271
- pattern (str): The file pattern to match (e.g., '*.txt', '*.py').
272
-
273
- Returns:
274
- list: A list of Path objects for the matching files.
275
- """
276
- # rglob performs a recursive search
277
- files = list(directory_path.rglob(pattern))
278
- # Optionally, filter out directories if pattern='*'
279
- return [file for file in files if file.is_file()]
280
-
281
-
282
- def sha256_hash_file(file_path: Path) -> str:
283
- """Computes the SHA-256 hash of a file.
284
-
285
- Args:
286
- file_path (Path): The path to the file.
287
-
288
- Returns:
289
- str: The SHA-256 hash of the file in hexadecimal format.
290
- """
291
- try:
292
- with file_path.open("rb") as f:
293
- # Use hashlib.file_digest for simplicity and efficiency in Python 3.11+
294
- digest = hashlib.file_digest(f, "sha256")
295
- return digest.hexdigest()
296
- except FileNotFoundError:
297
- return "Error: File not found"
298
-
299
-
300
- def debug_print_tracked_files() -> None:
71
+ """Extract and analyze an archive."""
72
+ # Apply any options from the command line, then convert any convertible settings
73
+ if options:
74
+ apply_options(options)
75
+ convert_settings()
76
+
77
+ # Locate external tools
78
+ ToolManager.locate_tools()
79
+ extractor = ArchiveExtractor(extract_dir=extract_dir)
80
+ extractor.handle_file(archive_path)
81
+ print_tracked_files(extractor.get_tracked_files())
82
+ print_warnings(extractor.get_warnings())
83
+
84
+
85
@cli.command()
def listopts() -> None:
    """List all available configuration options.

    Logs every option returned by ``get_options`` (sorted by name) with its
    declared type and help text, followed by the packaged default when the
    ``[default]`` table of ``default_settings.toml`` defines one.
    """
    # Function-scope import keeps this command self-contained; tomllib is
    # part of the standard library from Python 3.11.
    import tomllib

    options = get_options()

    # Read the packaged defaults with a real TOML parser rather than splitting
    # lines on "=", so comments, inline comments and quoting are handled.
    defaults_path = Path(__file__).parent / "default_settings.toml"
    with defaults_path.open("rb") as defaults_file:
        parsed_defaults = tomllib.load(defaults_file)
    defaults = parsed_defaults.get("default", {})

    logger.info("Available configuration options:")
    logger.info("------------------------------------------------")
    for option_name, option_def in sorted(options.items()):
        logger.info("%s (%s)", option_name, option_def.get("type", "unknown"))
        logger.info(" %s", option_def.get("help", "No description available"))
        if option_name in defaults:
            logger.info(" Default: %s", defaults[option_name])
112
+
113
+
114
@cli.command()
def status() -> None:
    """Show archae status and available tools.

    Logs the installed version, locates the external tools, then reports the
    supported and unsupported file extensions and MIME types.
    """
    divider = "------------------------------------------------"

    def report(heading, entries) -> None:
        """Log a counted heading, then the joined entries or a placeholder."""
        logger.info(heading, len(entries))
        if entries:
            logger.info(" %s", ", ".join(entries))
        else:
            logger.info(" (none)")

    logger.info("Archae status:")
    logger.info("Version: %s", metadata.version("archae"))
    ToolManager.locate_tools()
    logger.info("Tools located and ready to use.")
    logger.info(divider)

    # File extensions
    report(
        "Supported file extensions (%d):",
        ToolManager.get_supported_extensions(),
    )
    report(
        "Unsupported file extensions (%d):",
        ToolManager.get_unsupported_extensions(),
    )

    logger.info(divider)

    # MIME types
    report(
        "Supported MIME types (%d):",
        ToolManager.get_supported_mime_types(),
    )
    report(
        "Unsupported MIME types (%d):",
        ToolManager.get_unsupported_mime_types(),
    )
156
+
157
+
158
+ def print_tracked_files(tracked_files: dict[str, dict]) -> None:
301
159
  """Print the tracked files for debugging purposes."""
302
- click.echo("------------------------------------------------")
160
+ logger.info("------------------------------------------------")
303
161
  for hash, info in tracked_files.items():
304
- click.echo(f"Hash: {hash}")
305
- click.echo(f" Size: {info.get('size', 'Unknown')} bytes")
162
+ logger.info("Hash: %s", hash)
163
+ logger.info(" Size: %s bytes", info.get("size", "Unknown"))
306
164
  for path in info.get("paths", []):
307
- click.echo(f" Path: {path}")
308
- click.echo(" Metadata:")
165
+ logger.info(" Path: %s", path)
166
+ logger.info(" Metadata:")
309
167
  for key, value in info.get("metadata", {}).items():
310
- click.echo(f" {key}: {value}")
311
-
312
-
313
- def track_file(hash: str, file_size_bytes: int) -> None:
314
- """Track a file by its hash.
315
-
316
- Args:
317
- hash (str): The hash of the file to track.
318
- file_size_bytes (int): The size of the file in bytes.
319
- """
320
- if hash not in tracked_files:
321
- tracked_files[hash] = {}
322
- tracked_files[hash]["size"] = file_size_bytes
323
- tracked_files[hash]["metadata"] = {}
324
- elif tracked_files[hash]["size"] != file_size_bytes:
325
- msg = f"Hash collision detected for hash {hash} with differing sizes."
326
- raise RuntimeError(msg)
327
-
328
-
329
- def is_file_tracked(hash: str) -> bool:
330
- """Check if a file is tracked by its hash.
331
-
332
- Args:
333
- hash (str): The hash of the file to check.
334
- """
335
- return hash in tracked_files
336
-
337
-
338
- def get_tracked_file_metadata(hash: str) -> dict:
339
- """Get metadata for a tracked file by its hash.
340
-
341
- Args:
342
- hash (str): The hash of the file.
343
-
344
- Returns:
345
- dict: The metadata of the tracked file.
346
- """
347
- return copy.deepcopy(tracked_files.get(hash, {}).get("metadata", {}))
348
-
349
-
350
- def track_file_path(hash: str, file_path: Path) -> None:
351
- """Track a file path by its hash.
352
-
353
- Args:
354
- hash (str): The hash of the file.
355
- file_path (Path): The path to track.
356
- """
357
- if "paths" not in tracked_files[hash]:
358
- tracked_files[hash]["paths"] = []
359
-
360
- if file_path not in tracked_files[hash]["paths"]:
361
- tracked_files[hash]["paths"].append(file_path)
362
-
363
-
364
- def add_metadata_to_hash(hash: str, key: str, value: Any) -> None:
365
- """Add metadata to a tracked file.
366
-
367
- Args:
368
- hash (str): The hash of the file.
369
- key (str): The metadata key.
370
- value (Any): The metadata value.
371
- """
372
- tracked_files[hash]["metadata"][key] = value
373
-
168
+ logger.info(" %s: %s", key, value)
374
169
 
375
- def get_tracked_file_size() -> int:
376
- """Get the total size of all tracked files.
377
170
 
378
- Returns:
379
- int: The total size in bytes.
380
- """
381
- return sum(tracked_files[hash].get("size", 0) for hash in tracked_files)
171
def print_warnings(warnings: list[str]) -> None:
    """Log a divider and a heading, then each accumulated warning in order."""
    for banner in (
        "------------------------------------------------",
        "Accumulated Warnings:",
    ):
        logger.info(banner)
    for message in warnings:
        logger.info(message)
archae/config.py ADDED
@@ -0,0 +1,107 @@
1
+ """Runtime config management (default, userconfig and envvars)."""
2
+
3
+ import importlib
4
+ import typing
5
+ from pathlib import Path
6
+
7
+ import platformdirs
8
+ import yaml
9
+ from dynaconf import Dynaconf
10
+
11
# Directory containing this module; the packaged default settings file is
# looked up next to it.
package_dir = Path(__file__).parent
default_settings_file = package_dir / "default_settings.toml"

# Per-user config directory as resolved by platformdirs for the "archae" app.
# NOTE: this runs at import time, so importing this module creates the
# directory (and any missing parents) if it does not exist yet.
config_dir = Path(platformdirs.user_config_dir("archae"))
config_dir.mkdir(parents=True, exist_ok=True)

# Define the user config file path
user_config_file = config_dir / "settings.toml"

# Create a comment-only settings.toml on first import so the user has a file
# to edit; an existing file is never overwritten.
if not user_config_file.exists():
    user_config_file.write_text("""# Archae configuration
# Override defaults from the package here
""")

# Effective runtime settings: packaged defaults layered with the user's file,
# with the "ARCHAE" prefix configured for environment variables.
settings = Dynaconf(
    envvar_prefix="ARCHAE",
    settings_files=[
        str(default_settings_file),  # Load package defaults first
        str(user_config_file),  # User settings override defaults
    ],
    environments=True,
)

# Settings built from the packaged defaults file only (no user settings file).
# NOTE(review): envvar_prefix is still "ARCHAE" here, so environment variables
# presumably can still override these values - confirm this is intended.
default_settings = Dynaconf(
    envvar_prefix="ARCHAE",
    settings_files=[
        str(default_settings_file),  # Load package defaults first
    ],
    environments=True,
)


# YAML file shipped with the package; read by get_options().
options_file = package_dir / "options.yaml"
47
+
48
+
49
def get_options() -> dict:
    """Return the option definitions declared in options.yaml.

    Returns:
        dict: The parsed YAML content. An empty dict is returned when the
            file has no content, because ``yaml.safe_load`` yields ``None``
            for an empty document.
    """
    # Explicit encoding so parsing does not depend on the platform's locale.
    with options_file.open(encoding="utf-8") as f:
        return yaml.safe_load(f) or {}
53
+
54
+
55
def get_converter(converter_def: str) -> typing.Callable:
    """Resolve a converter definition to a callable.

    Args:
        converter_def (str): Either a builtin type name ("float" or "int") or
            a reference in the format "module.path:ClassName".

    Returns:
        The builtin type, or the attribute named after the colon looked up on
        the imported module.

    Raises:
        ValueError: If converter_def is not a builtin name and does not
            contain exactly one colon preceded by a module path.
        ImportError: If the referenced module cannot be imported.
        AttributeError: If the module has no attribute with the given name.
    """
    # Handle built-in types
    builtin_converters: dict[str, typing.Callable] = {"float": float, "int": int}
    if converter_def in builtin_converters:
        return builtin_converters[converter_def]

    # Split the definition into module path and attribute name, rejecting
    # malformed input up front instead of failing on tuple unpacking.
    module_name, separator, attribute_name = converter_def.partition(":")
    if not separator or not module_name or ":" in attribute_name:
        msg = (
            f"Invalid converter definition {converter_def!r}: expected "
            "'float', 'int' or 'module.path:ClassName'"
        )
        raise ValueError(msg)

    # Import the module and fetch the converter from it
    module = importlib.import_module(module_name)
    return getattr(module, attribute_name)
78
+
79
+
80
def apply_options(option_list: list[tuple[str, str]]) -> None:
    """Apply a list of options to the settings.

    Only keys that name an option returned by ``get_options`` are written to
    ``settings``; the comparison ignores case. Unknown keys are skipped and
    reported through a logged warning instead of being applied or silently
    dropped.

    Args:
        option_list (list[tuple[str, str]]): List of key-value pairs to apply.

    """
    # Function-scope import: this module does not otherwise use logging.
    import logging

    log = logging.getLogger(__name__)

    # Build the set of known option names once, outside the loop.
    known_keys = {str(def_key).lower() for def_key in get_options()}
    for key, value in option_list:
        if key.lower() in known_keys:
            settings[key] = value
        else:
            log.warning("Ignoring unknown configuration option: %s", key)
98
+
99
+
100
def convert_settings() -> None:
    """Run each option's declared converter over its current setting value.

    Options without a "converter" entry are left untouched.
    """
    for name, definition in get_options().items():
        if "converter" not in definition:
            continue
        convert = get_converter(definition["converter"])
        settings[name] = convert(settings[name])
@@ -0,0 +1,9 @@
1
+ # Default archae configuration
2
+ # Users can override these in the per-user settings.toml (e.g. ~/.config/archae/settings.toml on Linux)
3
+
4
+ [default]
5
+ MAX_TOTAL_SIZE_BYTES = "100G"
6
+ MAX_ARCHIVE_SIZE_BYTES = "10G"
7
+ MIN_ARCHIVE_RATIO = 0.005
8
+ MIN_DISK_FREE_SPACE = "10G"
9
+ MAX_DEPTH=0