archive-extractor 0.1.3__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- archive_extractor-0.2.0/CLAUDE.md +65 -0
- {archive_extractor-0.1.3 → archive_extractor-0.2.0}/PKG-INFO +2 -2
- {archive_extractor-0.1.3 → archive_extractor-0.2.0}/README.md +44 -4
- archive_extractor-0.2.0/archive_extractor/__init__.py +118 -0
- archive_extractor-0.2.0/archive_extractor/core.py +160 -0
- {archive_extractor-0.1.3 → archive_extractor-0.2.0}/pyproject.toml +3 -6
- archive_extractor-0.1.3/CLAUDE.md +0 -54
- archive_extractor-0.1.3/main.py +0 -131
- {archive_extractor-0.1.3 → archive_extractor-0.2.0}/.github/dependabot.yml +0 -0
- {archive_extractor-0.1.3 → archive_extractor-0.2.0}/.github/workflows/release.yml +0 -0
- {archive_extractor-0.1.3 → archive_extractor-0.2.0}/.gitignore +0 -0
- {archive_extractor-0.1.3 → archive_extractor-0.2.0}/.pre-commit-config.yaml +0 -0
- {archive_extractor-0.1.3 → archive_extractor-0.2.0}/LICENSE +0 -0
- {archive_extractor-0.1.3 → archive_extractor-0.2.0}/logo.png +0 -0
- {archive_extractor-0.1.3 → archive_extractor-0.2.0}/uv.lock +0 -0
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
# CLAUDE.md
|
|
2
|
+
|
|
3
|
+
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
|
4
|
+
|
|
5
|
+
## Project Overview
|
|
6
|
+
|
|
7
|
+
Archive Extractor is a Python library and CLI tool that recursively searches for and extracts ZIP and 7z archives within a directory tree. It handles password-protected archives, preserves folder structures, and prevents path traversal attacks.
|
|
8
|
+
|
|
9
|
+
## Architecture
|
|
10
|
+
|
|
11
|
+
This is a Python package (`archive_extractor/`) with the following structure:
|
|
12
|
+
|
|
13
|
+
- **`archive_extractor/__init__.py`**: Public API and CLI entry point
|
|
14
|
+
- `extract_archives()`: Main library function for programmatic use
|
|
15
|
+
- `main()`: CLI entry point for command-line usage
|
|
16
|
+
- **`archive_extractor/core.py`**: Core extraction logic
|
|
17
|
+
- `sanitize_filename()`: Path security
|
|
18
|
+
- `find_archive_files()`: Archive discovery generator
|
|
19
|
+
- `load_passwords()`: Password file parsing
|
|
20
|
+
- `extract_zip_archive()`: ZIP extraction with password support
|
|
21
|
+
- `extract_7z_archive()`: 7z extraction with password support
|
|
22
|
+
|
|
23
|
+
## Key Dependencies
|
|
24
|
+
|
|
25
|
+
- `zipfile` (stdlib): ZIP extraction
|
|
26
|
+
- `py7zr`: 7z archive extraction
|
|
27
|
+
- `tqdm`: Progress bars during extraction
|
|
28
|
+
- `lzma` (stdlib): Referenced in exception handling for 7z corruption detection
|
|
29
|
+
|
|
30
|
+
## Development Commands
|
|
31
|
+
|
|
32
|
+
**Install as a tool**:
|
|
33
|
+
```bash
|
|
34
|
+
uv tool install .
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
**Run directly**:
|
|
38
|
+
```bash
|
|
39
|
+
python -m archive_extractor /path/to/search
|
|
40
|
+
python -m archive_extractor /path/to/search --passwords passwords.txt
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
**Install in editable mode for development**:
|
|
44
|
+
```bash
|
|
45
|
+
uv pip install -e .
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
**Library usage**:
|
|
49
|
+
```python
|
|
50
|
+
from archive_extractor import extract_archives
|
|
51
|
+
results = extract_archives("/path/to/search", passwords=["pass1", "pass2"])
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Important Implementation Notes
|
|
55
|
+
|
|
56
|
+
- Extracted files are placed in directories named after each archive (without the archive extension)
|
|
57
|
+
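- Path safety is enforced at extraction time for ZIP archives: member paths are normalized, and absolute paths or paths that would climb out of the destination via a leading `..` are skipped. 7z extraction is delegated to `py7zr`'s `extractall()`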
|
|
58
|
+
- For password-protected archives, the tool tries each password in sequence and stops at the first successful extraction
|
|
59
|
+
- Error handling is intentionally broad (catching generic `Exception`) to ensure the tool continues processing other archives even if one fails
|
|
60
|
+
- The `lzma.LZMAError` exception is caught to handle corrupt 7z archives
|
|
61
|
+
- The `extract_archives()` function returns a dictionary mapping archive paths to extraction counts (-1 for failures)
|
|
62
|
+
|
|
63
|
+
## README Requirements
|
|
64
|
+
|
|
65
|
+
README.md must be kept up to date with any significant project changes, including new archive format support, command-line options, or security-related improvements.
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: archive-extractor
|
|
3
|
-
Version: 0.1.3
|
|
4
|
-
Summary:
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Recursively extract ZIP and 7z archives from directory trees, with password-cracking support
|
|
5
5
|
Author-email: Tiago Silva <eng.tiago.silva@gmail.com>
|
|
6
6
|
License-File: LICENSE
|
|
7
7
|
Requires-Python: >=3.12
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
|
|
4
4
|
# archive-extractor
|
|
5
5
|
|
|
6
|
-
[](https://www.python.org/)
|
|
7
7
|
[](LICENSE)
|
|
8
8
|
[](https://pypi.org/project/archive-extractor/)
|
|
9
9
|
|
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
|
|
15
15
|
## Overview
|
|
16
16
|
|
|
17
|
-
archive-extractor is a
|
|
17
|
+
archive-extractor is a Python library and CLI tool for bulk extraction of archives nested within directory trees. It discovers and extracts `.zip` and `.7z` files, handles password-protected archives using a wordlist, and includes security measures against path traversal attacks.
|
|
18
18
|
|
|
19
19
|
Ideal for bulk extraction tasks or forensic analysis where archives may be deeply nested or encrypted.
|
|
20
20
|
|
|
@@ -25,6 +25,7 @@ Ideal for bulk extraction tasks or forensic analysis where archives may be deepl
|
|
|
25
25
|
- **🛡️ Path traversal protection** - Sanitizes filenames and rejects unsafe paths
|
|
26
26
|
- **📊 Progress indicators** - Shows extraction progress with tqdm
|
|
27
27
|
- **📁 Preserves structure** - Extracts each archive into its own named folder
|
|
28
|
+
- **📚 Library API** - Use programmatically in your Python projects
|
|
28
29
|
|
|
29
30
|
## Installation
|
|
30
31
|
|
|
@@ -40,6 +41,8 @@ uv pip install -e .
|
|
|
40
41
|
|
|
41
42
|
## Usage
|
|
42
43
|
|
|
44
|
+
### 🖥️ CLI
|
|
45
|
+
|
|
43
46
|
Extract all archives under a directory:
|
|
44
47
|
|
|
45
48
|
```bash
|
|
@@ -52,11 +55,48 @@ Extract with a password list (one password per line):
|
|
|
52
55
|
archive-extractor /path/to/search --passwords passwords.txt
|
|
53
56
|
```
|
|
54
57
|
|
|
58
|
+
Extract to a custom output directory:
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
archive-extractor /path/to/search --output-dir /path/to/output
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Quiet mode (suppress progress output):
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
archive-extractor /path/to/search --quiet
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### 📚 Library
|
|
71
|
+
|
|
72
|
+
Use archive-extractor programmatically in your Python projects:
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
from archive_extractor import extract_archives
|
|
76
|
+
|
|
77
|
+
# Extract all archives in a directory
|
|
78
|
+
results = extract_archives("/path/to/search")
|
|
79
|
+
|
|
80
|
+
# Extract a single archive
|
|
81
|
+
results = extract_archives("/path/to/archive.zip")
|
|
82
|
+
|
|
83
|
+
# With passwords
|
|
84
|
+
results = extract_archives("/path/to/search", passwords=["pass1", "pass2"])
|
|
85
|
+
|
|
86
|
+
# Custom output directory
|
|
87
|
+
results = extract_archives("/path/to/search", output_dir="/path/to/output")
|
|
88
|
+
|
|
89
|
+
# Silent mode (no progress bars)
|
|
90
|
+
results = extract_archives("/path/to/search", show_progress=False)
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
The `extract_archives()` function returns a dictionary mapping archive paths to extraction counts (-1 indicates failure).
|
|
94
|
+
|
|
55
95
|
### Output
|
|
56
96
|
|
|
57
97
|
- Archives extract to folders named after the archive file (without extension)
|
|
58
|
-
- Success:
|
|
59
|
-
- Failure:
|
|
98
|
+
- Success: `Extracted 'archive.7z' to 'archive'.`
|
|
99
|
+
- Failure: `Could not extract 'archive.zip': no valid password found or archive is corrupt.`
|
|
60
100
|
|
|
61
101
|
## Security
|
|
62
102
|
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""Archive Extractor - Recursively extract ZIP and 7z archives.
|
|
2
|
+
|
|
3
|
+
CLI Usage:
|
|
4
|
+
archive-extractor /path/to/search
|
|
5
|
+
archive-extractor /path/to/search --passwords passwords.txt
|
|
6
|
+
|
|
7
|
+
Library Usage:
|
|
8
|
+
from archive_extractor import extract_archives
|
|
9
|
+
|
|
10
|
+
# Extract all archives in a directory
|
|
11
|
+
results = extract_archives("/path/to/search")
|
|
12
|
+
|
|
13
|
+
# Extract a single archive
|
|
14
|
+
results = extract_archives("/path/to/archive.zip")
|
|
15
|
+
|
|
16
|
+
# With passwords
|
|
17
|
+
results = extract_archives("/path/to/search", passwords=["pass1", "pass2"])
|
|
18
|
+
|
|
19
|
+
# Custom output directory
|
|
20
|
+
results = extract_archives("/path/to/search", output_dir="/path/to/output")
|
|
21
|
+
|
|
22
|
+
# Silent mode (no progress bars)
|
|
23
|
+
results = extract_archives("/path/to/search", show_progress=False)
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
import argparse
|
|
27
|
+
import os
|
|
28
|
+
|
|
29
|
+
from .core import (
|
|
30
|
+
find_archive_files,
|
|
31
|
+
load_passwords,
|
|
32
|
+
extract_zip_archive,
|
|
33
|
+
extract_7z_archive,
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
__all__ = ["extract_archives"]
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def extract_archives(
    path: str,
    output_dir: str | None = None,
    passwords: list[str] | None = None,
    show_progress: bool = True
) -> dict[str, int]:
    """Extract every ZIP and 7z archive found at ``path``.

    Args:
        path: A single archive file, or a directory that is searched
            recursively for ``.zip`` and ``.7z`` files.
        output_dir: Optional base directory for extraction output. Each
            archive is extracted into ``output_dir/<archive name without
            extension>``. If None, each archive is extracted into a sibling
            directory named after the archive.
        passwords: Optional list of password strings to try for encrypted
            archives.
        show_progress: Whether to show progress bars and print a one-line
            status message per archive.

    Returns:
        Dictionary mapping each archive path to the count reported by its
        extractor. A count of -1 indicates extraction failure, including the
        case where the extractor raised an unexpected exception.
    """
    results: dict[str, int] = {}

    for archive_path in find_archive_files(path):
        stem, ext = os.path.splitext(archive_path)
        ext = ext.lower()

        if ext == ".zip":
            extractor = extract_zip_archive
        elif ext == ".7z":
            extractor = extract_7z_archive
        else:
            continue

        # NOTE(review): with output_dir set, archives that share a base name
        # (a/data.zip, b/data.zip, data.7z) all land in the same directory.
        if output_dir:
            dest_dir = os.path.join(output_dir, os.path.basename(stem))
        else:
            dest_dir = stem

        failure_reason = "no valid password found or archive is corrupt."
        try:
            count = extractor(archive_path, dest_dir, passwords, show_progress)
        except Exception as exc:
            # One unreadable archive must not abort the rest of the batch;
            # record it as a failure and keep going.
            count = -1
            failure_reason = f"{exc}"

        results[archive_path] = count

        if show_progress:
            if count >= 0:
                print(f"Extracted '{archive_path}' to '{dest_dir}'.")
            else:
                print(f"Could not extract '{archive_path}': {failure_reason}")

    return results
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def main():
    """Run the archive-extractor command-line interface.

    Parses the positional search path plus the optional ``--passwords``,
    ``--output-dir`` and ``--quiet``/``-q`` options, loads the password file
    when one is given, and hands everything to ``extract_archives()``.
    """
    cli = argparse.ArgumentParser(
        description="Recursively extract all files from .zip and .7z archives under a given path."
    )
    cli.add_argument("path", help="Root directory or file to search for .zip/.7z files")
    cli.add_argument(
        "--passwords",
        help="Path to a file containing passwords (one per line) to try for encrypted archives",
    )
    cli.add_argument(
        "--output-dir",
        help="Base directory for extraction output (default: sibling directory of each archive)",
    )
    cli.add_argument("--quiet", "-q", action="store_true", help="Suppress progress output")
    options = cli.parse_args()

    # Only read the wordlist when the user actually supplied one.
    wordlist = None
    if options.passwords:
        wordlist = load_passwords(options.passwords)

    extract_archives(
        options.path,
        output_dir=options.output_dir,
        passwords=wordlist,
        show_progress=not options.quiet,
    )
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
"""Core extraction logic for archive-extractor."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import re
|
|
5
|
+
import zipfile
|
|
6
|
+
import lzma
|
|
7
|
+
|
|
8
|
+
import py7zr
|
|
9
|
+
from tqdm import tqdm
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def sanitize_filename(filename: str) -> str:
    """Flatten a name into a single safe path component.

    Path separators and the characters ``* ? : " < > |`` are replaced with
    underscores, so directory parts are folded into the name rather than
    dropped (``"a/b.txt"`` becomes ``"a_b.txt"``). Every ``..`` sequence is
    then removed.

    Args:
        filename: The filename to sanitize.

    Returns:
        A name with no separators and no ``..`` sequences. If nothing usable
        is left (an empty string or a lone ``"."``), ``"_"`` is returned so
        the result can always be used as a filename.
    """
    cleaned = re.sub(r'[\\/*?:"<>|]', "_", filename)
    cleaned = cleaned.replace("..", "")
    cleaned = os.path.basename(cleaned)

    # An odd run of dots collapses to "." and an even run to ""; neither
    # names a file, so substitute a harmless placeholder.
    if cleaned in ("", "."):
        return "_"
    return cleaned
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def find_archive_files(root_path: str):
    """Yield paths to the .zip and .7z files at or under ``root_path``.

    Matching is case-insensitive. Nothing is yielded when ``root_path`` is a
    file with a different extension, or when no archive is found under it.

    Args:
        root_path: Directory to search recursively, or a single archive file
            path.

    Yields:
        Archive paths built from ``root_path``: the path itself when it is an
        archive file, otherwise ``os.path.join(dirpath, filename)`` for each
        match. Paths are absolute only if ``root_path`` is absolute.
    """
    archive_suffixes = ('.zip', '.7z')

    if os.path.isfile(root_path):
        if os.path.splitext(root_path)[1].lower() in archive_suffixes:
            yield root_path
        return

    for dirpath, _, filenames in os.walk(root_path):
        for fname in filenames:
            if fname.lower().endswith(archive_suffixes):
                yield os.path.join(dirpath, fname)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def load_passwords(password_file: str) -> list[str]:
    """Load candidate passwords from a text file, one per line.

    Leading and trailing whitespace is stripped from every line, and lines
    that are empty after stripping are skipped.

    Args:
        password_file: Path to a UTF-8 text file containing passwords.

    Returns:
        List of password strings in file order.
    """
    # 'utf-8-sig' reads plain UTF-8 unchanged but drops a leading byte-order
    # mark, which would otherwise be glued to the first password.
    with open(password_file, 'r', encoding='utf-8-sig') as f:
        return [pwd for line in f if (pwd := line.strip())]
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def extract_zip_archive(
|
|
61
|
+
zip_file: str,
|
|
62
|
+
output_dir: str,
|
|
63
|
+
passwords: list[str] | None = None,
|
|
64
|
+
show_progress: bool = True
|
|
65
|
+
) -> int:
|
|
66
|
+
"""Extract a ZIP archive to the specified directory.
|
|
67
|
+
|
|
68
|
+
Args:
|
|
69
|
+
zip_file: Path to the ZIP file.
|
|
70
|
+
output_dir: Directory to extract files to.
|
|
71
|
+
passwords: Optional list of passwords to try for encrypted archives.
|
|
72
|
+
show_progress: Whether to show tqdm progress bar.
|
|
73
|
+
|
|
74
|
+
Returns:
|
|
75
|
+
Number of files extracted, or -1 on failure.
|
|
76
|
+
"""
|
|
77
|
+
if not os.path.exists(output_dir):
|
|
78
|
+
os.makedirs(output_dir)
|
|
79
|
+
|
|
80
|
+
with zipfile.ZipFile(zip_file, 'r') as zf:
|
|
81
|
+
members = zf.infolist()
|
|
82
|
+
extracted_count = 0
|
|
83
|
+
|
|
84
|
+
def extract_members(pwd_bytes=None):
|
|
85
|
+
nonlocal extracted_count
|
|
86
|
+
iterator = tqdm(members, desc=f"Extracting {os.path.basename(zip_file)}", leave=False) if show_progress else members
|
|
87
|
+
for member in iterator:
|
|
88
|
+
if member.is_dir():
|
|
89
|
+
continue
|
|
90
|
+
safe_member_path = os.path.normpath(member.filename)
|
|
91
|
+
if os.path.isabs(safe_member_path) or safe_member_path.startswith(".."):
|
|
92
|
+
continue
|
|
93
|
+
out_path = os.path.join(output_dir, safe_member_path)
|
|
94
|
+
out_dir = os.path.dirname(out_path)
|
|
95
|
+
if not os.path.exists(out_dir):
|
|
96
|
+
os.makedirs(out_dir)
|
|
97
|
+
with open(out_path, 'wb') as f:
|
|
98
|
+
f.write(zf.read(member, pwd_bytes))
|
|
99
|
+
extracted_count += 1
|
|
100
|
+
|
|
101
|
+
if not passwords:
|
|
102
|
+
try:
|
|
103
|
+
extract_members()
|
|
104
|
+
return extracted_count
|
|
105
|
+
except RuntimeError:
|
|
106
|
+
return -1
|
|
107
|
+
else:
|
|
108
|
+
for pwd in passwords:
|
|
109
|
+
extracted_count = 0
|
|
110
|
+
try:
|
|
111
|
+
extract_members(pwd.encode('utf-8'))
|
|
112
|
+
return extracted_count
|
|
113
|
+
except RuntimeError:
|
|
114
|
+
continue
|
|
115
|
+
except zipfile.BadZipFile:
|
|
116
|
+
continue
|
|
117
|
+
return -1
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def extract_7z_archive(
    archive_file: str,
    output_dir: str,
    passwords: list[str] | None = None,
    show_progress: bool = True
) -> int:
    """Extract a 7z archive into ``output_dir`` using py7zr.

    Without passwords a single attempt is made with no password. With
    passwords, each one is tried in order and the first attempt that
    completes wins.

    Args:
        archive_file: Path to the 7z file.
        output_dir: Directory to extract files to; created if missing.
        passwords: Optional list of passwords to try for encrypted archives.
        show_progress: Accepted for signature parity with the ZIP extractor;
            not used here.

    Returns:
        ``len(archive.getnames())`` for the successful attempt, or -1 when
        every attempt raised.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # A single password-less attempt is modelled as a one-item candidate list.
    candidates = passwords if passwords else [None]

    for candidate in candidates:
        try:
            with py7zr.SevenZipFile(archive_file, mode='r', password=candidate) as archive:
                archive.extractall(path=output_dir)
                return len(archive.getnames())
        except Exception:
            # Deliberately broad: PasswordRequired, Bad7zFile, LZMAError and
            # any other failure all mean "this attempt did not work".
            continue

    return -1
|
|
@@ -4,8 +4,8 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "archive-extractor"
|
|
7
|
-
version = "0.1.3"
|
|
8
|
-
description = "
|
|
7
|
+
version = "0.2.0"
|
|
8
|
+
description = "Recursively extract ZIP and 7z archives from directory trees, with password-cracking support"
|
|
9
9
|
authors = [
|
|
10
10
|
{ name = "Tiago Silva", email = "eng.tiago.silva@gmail.com" }
|
|
11
11
|
]
|
|
@@ -16,7 +16,4 @@ dependencies = [
|
|
|
16
16
|
requires-python = ">=3.12"
|
|
17
17
|
|
|
18
18
|
[project.scripts]
|
|
19
|
-
archive-extractor = "main:main"
|
|
20
|
-
|
|
21
|
-
[tool.hatch.build.targets.wheel]
|
|
22
|
-
include = ["main.py"]
|
|
19
|
+
archive-extractor = "archive_extractor:main"
|
|
@@ -1,54 +0,0 @@
|
|
|
1
|
-
# CLAUDE.md
|
|
2
|
-
|
|
3
|
-
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
|
4
|
-
|
|
5
|
-
## Project Overview
|
|
6
|
-
|
|
7
|
-
Archive Extractor is a command-line utility that recursively searches for and extracts ZIP and 7z archives within a directory tree. It handles password-protected archives, preserves folder structures, and prevents path traversal attacks.
|
|
8
|
-
|
|
9
|
-
## Architecture
|
|
10
|
-
|
|
11
|
-
This is a single-module Python CLI tool (`main.py`) with a straightforward architecture:
|
|
12
|
-
|
|
13
|
-
- **Entry point**: `main()` function parses arguments and orchestrates archive discovery and extraction
|
|
14
|
-
- **Archive discovery**: `find_archive_files()` walks the directory tree to locate .zip and .7z files
|
|
15
|
-
- **Extraction logic**: Separate functions for ZIP (`extract_zip()`) and 7z (`extract_7z()`) formats
|
|
16
|
-
- **Password handling**: `load_passwords()` reads password lists; extraction functions attempt each password sequentially until success
|
|
17
|
-
- **Security**: `sanitize_filename()` prevents directory traversal; extraction functions validate paths with `os.path.normpath()` and reject absolute paths or `..` sequences
|
|
18
|
-
|
|
19
|
-
## Key Dependencies
|
|
20
|
-
|
|
21
|
-
- `zipfile` (stdlib): ZIP extraction
|
|
22
|
-
- `py7zr`: 7z archive extraction
|
|
23
|
-
- `tqdm`: Progress bars during extraction
|
|
24
|
-
- `lzma`: Referenced in exception handling for 7z corruption detection (note: currently imported but not directly used due to py7zr wrapping it)
|
|
25
|
-
|
|
26
|
-
## Development Commands
|
|
27
|
-
|
|
28
|
-
**Install as a tool**:
|
|
29
|
-
```bash
|
|
30
|
-
uv tool install .
|
|
31
|
-
```
|
|
32
|
-
|
|
33
|
-
**Run directly**:
|
|
34
|
-
```bash
|
|
35
|
-
python main.py /path/to/search
|
|
36
|
-
python main.py /path/to/search --passwords passwords.txt
|
|
37
|
-
```
|
|
38
|
-
|
|
39
|
-
**Install in editable mode for development**:
|
|
40
|
-
```bash
|
|
41
|
-
uv pip install -e .
|
|
42
|
-
```
|
|
43
|
-
|
|
44
|
-
## Important Implementation Notes
|
|
45
|
-
|
|
46
|
-
- Extracted files are placed in directories named after each archive (without the archive extension)
|
|
47
|
-
- Path safety is enforced at extraction time: absolute paths and paths containing `..` are skipped
|
|
48
|
-
- For password-protected archives, the tool tries each password in sequence and stops at the first successful extraction
|
|
49
|
-
- Error handling is intentionally broad (catching generic `Exception`) to ensure the tool continues processing other archives even if one fails
|
|
50
|
-
- The `lzma.LZMAError` exception is caught to handle corrupt 7z archives, though `lzma` is no longer a direct dependency (handled internally by py7zr)
|
|
51
|
-
|
|
52
|
-
## README Requirements
|
|
53
|
-
|
|
54
|
-
README.md must be kept up to date with any significant project changes, including new archive format support, command-line options, or security-related improvements.
|
archive_extractor-0.1.3/main.py
DELETED
|
@@ -1,131 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
import re
|
|
3
|
-
import argparse
|
|
4
|
-
from tqdm import tqdm
|
|
5
|
-
import zipfile
|
|
6
|
-
import py7zr
|
|
7
|
-
import lzma
|
|
8
|
-
|
|
9
|
-
def sanitize_filename(filename):
|
|
10
|
-
# Remove directories and illegal characters
|
|
11
|
-
filename = re.sub(r'[\\/*?:"<>|]', "_", filename)
|
|
12
|
-
filename = filename.replace("..", "") # extra safety
|
|
13
|
-
return os.path.basename(filename)
|
|
14
|
-
|
|
15
|
-
def find_archive_files(root_path):
|
|
16
|
-
"""Recursively yield paths to all .zip and .7z files under root_path."""
|
|
17
|
-
for dirpath, _, filenames in os.walk(root_path):
|
|
18
|
-
for fname in filenames:
|
|
19
|
-
if fname.lower().endswith('.zip') or fname.lower().endswith('.7z'):
|
|
20
|
-
yield os.path.join(dirpath, fname)
|
|
21
|
-
|
|
22
|
-
def load_passwords(password_file):
|
|
23
|
-
"""Load passwords from a file, one per line, stripping whitespace."""
|
|
24
|
-
with open(password_file, 'r', encoding='utf-8') as f:
|
|
25
|
-
return [line.strip() for line in f if line.strip()]
|
|
26
|
-
|
|
27
|
-
def extract_zip(zip_file, output_dir, passwords=None):
|
|
28
|
-
if not os.path.exists(output_dir):
|
|
29
|
-
os.makedirs(output_dir)
|
|
30
|
-
|
|
31
|
-
with zipfile.ZipFile(zip_file, 'r') as zf:
|
|
32
|
-
members = zf.infolist()
|
|
33
|
-
extracted = False
|
|
34
|
-
if not passwords:
|
|
35
|
-
# No passwords provided, extract directly
|
|
36
|
-
for member in tqdm(members, desc=f"Extracting {os.path.basename(zip_file)}"):
|
|
37
|
-
if member.is_dir():
|
|
38
|
-
continue
|
|
39
|
-
safe_member_path = os.path.normpath(member.filename)
|
|
40
|
-
if os.path.isabs(safe_member_path) or safe_member_path.startswith(".."):
|
|
41
|
-
continue
|
|
42
|
-
out_path = os.path.join(output_dir, safe_member_path)
|
|
43
|
-
out_dir = os.path.dirname(out_path)
|
|
44
|
-
if not os.path.exists(out_dir):
|
|
45
|
-
os.makedirs(out_dir)
|
|
46
|
-
with open(out_path, 'wb') as f:
|
|
47
|
-
f.write(zf.read(member))
|
|
48
|
-
extracted = True
|
|
49
|
-
else:
|
|
50
|
-
# Try each password for the whole zip
|
|
51
|
-
for pwd in passwords:
|
|
52
|
-
try:
|
|
53
|
-
for member in tqdm(members, desc=f"Extracting {os.path.basename(zip_file)}", leave=False):
|
|
54
|
-
if member.is_dir():
|
|
55
|
-
continue
|
|
56
|
-
safe_member_path = os.path.normpath(member.filename)
|
|
57
|
-
if os.path.isabs(safe_member_path) or safe_member_path.startswith(".."):
|
|
58
|
-
continue
|
|
59
|
-
out_path = os.path.join(output_dir, safe_member_path)
|
|
60
|
-
out_dir = os.path.dirname(out_path)
|
|
61
|
-
if not os.path.exists(out_dir):
|
|
62
|
-
os.makedirs(out_dir)
|
|
63
|
-
with open(out_path, 'wb') as f:
|
|
64
|
-
f.write(zf.read(member, pwd.encode('utf-8')))
|
|
65
|
-
extracted = True
|
|
66
|
-
break # Stop trying passwords after success
|
|
67
|
-
except RuntimeError:
|
|
68
|
-
# Wrong password, try next
|
|
69
|
-
continue
|
|
70
|
-
except zipfile.BadZipFile:
|
|
71
|
-
continue
|
|
72
|
-
if not extracted:
|
|
73
|
-
print(f"❌ Could not extract '{zip_file}': no valid password found.")
|
|
74
|
-
else:
|
|
75
|
-
print(f"✅ Extracted {len(members)} items to '{output_dir}'.")
|
|
76
|
-
|
|
77
|
-
def extract_7z(archive_file, output_dir, passwords=None):
|
|
78
|
-
if not os.path.exists(output_dir):
|
|
79
|
-
os.makedirs(output_dir)
|
|
80
|
-
|
|
81
|
-
extracted = False
|
|
82
|
-
if not passwords:
|
|
83
|
-
try:
|
|
84
|
-
with py7zr.SevenZipFile(archive_file, mode='r') as archive:
|
|
85
|
-
archive.extractall(path=output_dir)
|
|
86
|
-
extracted = True
|
|
87
|
-
except (py7zr.exceptions.PasswordRequired, py7zr.exceptions.Bad7zFile, lzma.LZMAError):
|
|
88
|
-
pass
|
|
89
|
-
except Exception:
|
|
90
|
-
pass
|
|
91
|
-
else:
|
|
92
|
-
for pwd in passwords:
|
|
93
|
-
try:
|
|
94
|
-
with py7zr.SevenZipFile(archive_file, mode='r', password=pwd) as archive:
|
|
95
|
-
archive.extractall(path=output_dir)
|
|
96
|
-
extracted = True
|
|
97
|
-
break
|
|
98
|
-
except (py7zr.exceptions.PasswordRequired, py7zr.exceptions.Bad7zFile, lzma.LZMAError):
|
|
99
|
-
continue
|
|
100
|
-
except Exception:
|
|
101
|
-
continue
|
|
102
|
-
if not extracted:
|
|
103
|
-
print(f"❌ Could not extract '{archive_file}': no valid password found or archive is corrupt.")
|
|
104
|
-
else:
|
|
105
|
-
print(f"✅ Extracted '{archive_file}' to '{output_dir}'.")
|
|
106
|
-
|
|
107
|
-
def main():
|
|
108
|
-
parser = argparse.ArgumentParser(
|
|
109
|
-
description="Recursively extract all files from .zip and .7z archives under a given path."
|
|
110
|
-
)
|
|
111
|
-
parser.add_argument(
|
|
112
|
-
"path",
|
|
113
|
-
help="Root directory or file to search for .zip/.7z files"
|
|
114
|
-
)
|
|
115
|
-
parser.add_argument(
|
|
116
|
-
"--passwords",
|
|
117
|
-
help="Path to a file containing passwords (one per line) to try for encrypted archives"
|
|
118
|
-
)
|
|
119
|
-
args = parser.parse_args()
|
|
120
|
-
root_path = args.path
|
|
121
|
-
passwords = load_passwords(args.passwords) if args.passwords else None
|
|
122
|
-
for archive_path in find_archive_files(root_path):
|
|
123
|
-
archive_dir = os.path.splitext(archive_path)[0]
|
|
124
|
-
ext = os.path.splitext(archive_path)[1].lower()
|
|
125
|
-
if ext == ".zip":
|
|
126
|
-
extract_zip(archive_path, archive_dir, passwords)
|
|
127
|
-
elif ext == ".7z":
|
|
128
|
-
extract_7z(archive_path, archive_dir, passwords)
|
|
129
|
-
|
|
130
|
-
if __name__ == "__main__":
|
|
131
|
-
main()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|