clang-tool-chain 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of clang-tool-chain might be problematic. Click here for more details.
- clang_tool_chain/__init__.py +0 -0
- clang_tool_chain/__version__.py +4 -0
- clang_tool_chain/checksums.py +270 -0
- clang_tool_chain/cli.py +575 -0
- clang_tool_chain/downloader.py +1325 -0
- clang_tool_chain/downloads/README.md +144 -0
- clang_tool_chain/downloads/__init__.py +22 -0
- clang_tool_chain/downloads/__main__.py +11 -0
- clang_tool_chain/downloads/create_hardlink_archive.py +390 -0
- clang_tool_chain/downloads/create_iwyu_archives.py +330 -0
- clang_tool_chain/downloads/deduplicate_binaries.py +217 -0
- clang_tool_chain/downloads/download_binaries.py +463 -0
- clang_tool_chain/downloads/expand_archive.py +260 -0
- clang_tool_chain/downloads/extract_mingw_sysroot.py +349 -0
- clang_tool_chain/downloads/fetch_and_archive.py +1376 -0
- clang_tool_chain/downloads/strip_binaries.py +436 -0
- clang_tool_chain/downloads/test_compression.py +259 -0
- clang_tool_chain/fetch.py +158 -0
- clang_tool_chain/paths.py +93 -0
- clang_tool_chain/sccache_runner.py +160 -0
- clang_tool_chain/wrapper.py +1383 -0
- clang_tool_chain-1.0.2.dist-info/METADATA +1766 -0
- clang_tool_chain-1.0.2.dist-info/RECORD +26 -0
- clang_tool_chain-1.0.2.dist-info/WHEEL +4 -0
- clang_tool_chain-1.0.2.dist-info/entry_points.txt +31 -0
- clang_tool_chain-1.0.2.dist-info/licenses/LICENSE +204 -0
|
@@ -0,0 +1,436 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Strip and optimize LLVM binaries for minimal package size.
|
|
4
|
+
|
|
5
|
+
This script removes unnecessary files from downloaded LLVM distributions
|
|
6
|
+
and strips debug symbols from binaries to minimize package size.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import argparse
|
|
10
|
+
import os
|
|
11
|
+
import shutil
|
|
12
|
+
import subprocess
|
|
13
|
+
import sys
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
# Essential binaries to keep.
# Names are matched against files in ``bin`` after one trailing
# ".exe"/".dll"/".so"/".dylib" extension has been removed.
ESSENTIAL_BINARIES = {
    # Core compilation
    "clang",
    "clang++",
    "clang-cl",  # Windows only
    "clang-cpp",
    # Linkers
    "lld",
    "lld-link",
    "ld.lld",
    "ld64.lld",
    "wasm-ld",
    # Binary utilities
    "llvm-ar",
    "llvm-nm",
    "llvm-objdump",
    "llvm-objcopy",
    "llvm-ranlib",
    "llvm-strip",
    "llvm-readelf",
    "llvm-readobj",
    # Additional utilities
    "llvm-as",
    "llvm-dis",
    "clang-format",
    "clang-tidy",
    "llvm-symbolizer",
    "llvm-config",
}

# Directories to remove completely (paths relative to the LLVM root).
# NOTE(review): not referenced by the code in this module; presumably kept
# for documentation or external callers -- confirm before deleting.
REMOVE_DIRS = {
    "share/doc",
    "share/man",
    "docs",
    "share/clang",
    "share/opt-viewer",
    "share/scan-build",
    "share/scan-view",
    "python_packages",
    "libexec",  # Helper scripts usually not needed
}

# File patterns to remove.
# NOTE(review): not referenced by the code in this module -- confirm usage.
REMOVE_PATTERNS = {
    "*.a",  # Static libraries
    "*.lib",  # Windows static libraries
    "CMakeLists.txt",
    "*.cmake",
}

# Directories containing files to remove by pattern.
# NOTE(review): not referenced by the code in this module -- confirm usage.
PATTERN_REMOVE_DIRS = {
    "lib",
    "lib64",
}
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class BinaryStripper:
|
|
76
|
+
"""Strip and optimize LLVM binary distributions."""
|
|
77
|
+
|
|
78
|
+
def __init__(
|
|
79
|
+
self,
|
|
80
|
+
source_dir: Path,
|
|
81
|
+
output_dir: Path,
|
|
82
|
+
platform: str,
|
|
83
|
+
keep_headers: bool = False,
|
|
84
|
+
strip_binaries: bool = True,
|
|
85
|
+
verbose: bool = False,
|
|
86
|
+
):
|
|
87
|
+
"""
|
|
88
|
+
Initialize the binary stripper.
|
|
89
|
+
|
|
90
|
+
Args:
|
|
91
|
+
source_dir: Directory containing extracted LLVM binaries
|
|
92
|
+
output_dir: Directory to output stripped binaries
|
|
93
|
+
platform: Platform identifier (e.g., "linux-x86_64")
|
|
94
|
+
keep_headers: Whether to keep header files
|
|
95
|
+
strip_binaries: Whether to strip debug symbols
|
|
96
|
+
verbose: Whether to print verbose output
|
|
97
|
+
"""
|
|
98
|
+
self.source_dir = Path(source_dir)
|
|
99
|
+
self.output_dir = Path(output_dir)
|
|
100
|
+
self.platform = platform
|
|
101
|
+
self.keep_headers = keep_headers
|
|
102
|
+
self.strip_binaries = strip_binaries
|
|
103
|
+
self.verbose = verbose
|
|
104
|
+
|
|
105
|
+
# Statistics
|
|
106
|
+
self.original_size = 0
|
|
107
|
+
self.final_size = 0
|
|
108
|
+
self.files_removed = 0
|
|
109
|
+
self.files_kept = 0
|
|
110
|
+
|
|
111
|
+
def log(self, message: str) -> None:
|
|
112
|
+
"""Print a message if verbose mode is enabled."""
|
|
113
|
+
if self.verbose:
|
|
114
|
+
print(message)
|
|
115
|
+
|
|
116
|
+
def get_dir_size(self, path: Path) -> int:
|
|
117
|
+
"""Get total size of a directory in bytes."""
|
|
118
|
+
total = 0
|
|
119
|
+
try:
|
|
120
|
+
for entry in path.rglob("*"):
|
|
121
|
+
if entry.is_file():
|
|
122
|
+
total += entry.stat().st_size
|
|
123
|
+
except Exception as e:
|
|
124
|
+
self.log(f"Warning: Could not calculate size of {path}: {e}")
|
|
125
|
+
return total
|
|
126
|
+
|
|
127
|
+
def find_llvm_root(self) -> Path | None:
|
|
128
|
+
"""
|
|
129
|
+
Find the root directory of the LLVM installation.
|
|
130
|
+
|
|
131
|
+
Returns:
|
|
132
|
+
Path to LLVM root, or None if not found
|
|
133
|
+
"""
|
|
134
|
+
# Check if source_dir is already the root
|
|
135
|
+
if (self.source_dir / "bin").exists():
|
|
136
|
+
return self.source_dir
|
|
137
|
+
|
|
138
|
+
# Look for subdirectories that might be the root
|
|
139
|
+
for subdir in self.source_dir.iterdir():
|
|
140
|
+
if subdir.is_dir() and (subdir / "bin").exists():
|
|
141
|
+
return subdir
|
|
142
|
+
|
|
143
|
+
return None
|
|
144
|
+
|
|
145
|
+
def should_keep_binary(self, binary_name: str) -> bool:
|
|
146
|
+
"""
|
|
147
|
+
Check if a binary should be kept.
|
|
148
|
+
|
|
149
|
+
Args:
|
|
150
|
+
binary_name: Name of the binary (without extension)
|
|
151
|
+
|
|
152
|
+
Returns:
|
|
153
|
+
True if binary should be kept, False otherwise
|
|
154
|
+
"""
|
|
155
|
+
# Remove common extensions
|
|
156
|
+
name = binary_name
|
|
157
|
+
for ext in [".exe", ".dll", ".so", ".dylib"]:
|
|
158
|
+
if name.endswith(ext):
|
|
159
|
+
name = name[: -len(ext)]
|
|
160
|
+
break
|
|
161
|
+
|
|
162
|
+
return name in ESSENTIAL_BINARIES
|
|
163
|
+
|
|
164
|
+
def copy_essential_files(self, src_root: Path, dst_root: Path) -> None:
|
|
165
|
+
"""
|
|
166
|
+
Copy only essential files from source to destination.
|
|
167
|
+
|
|
168
|
+
Args:
|
|
169
|
+
src_root: Source LLVM root directory
|
|
170
|
+
dst_root: Destination directory
|
|
171
|
+
"""
|
|
172
|
+
dst_root.mkdir(parents=True, exist_ok=True)
|
|
173
|
+
|
|
174
|
+
# Copy bin directory (filtered)
|
|
175
|
+
src_bin = src_root / "bin"
|
|
176
|
+
if src_bin.exists():
|
|
177
|
+
dst_bin = dst_root / "bin"
|
|
178
|
+
dst_bin.mkdir(parents=True, exist_ok=True)
|
|
179
|
+
|
|
180
|
+
for binary in src_bin.iterdir():
|
|
181
|
+
if binary.is_file() and self.should_keep_binary(binary.name):
|
|
182
|
+
shutil.copy2(binary, dst_bin / binary.name)
|
|
183
|
+
self.files_kept += 1
|
|
184
|
+
self.log(f"Keeping binary: {binary.name}")
|
|
185
|
+
else:
|
|
186
|
+
self.files_removed += 1
|
|
187
|
+
self.log(f"Removing binary: {binary.name}")
|
|
188
|
+
|
|
189
|
+
# Copy lib directory (filtered - keep only runtime libraries)
|
|
190
|
+
for lib_dir_name in ["lib", "lib64"]:
|
|
191
|
+
src_lib = src_root / lib_dir_name
|
|
192
|
+
if not src_lib.exists():
|
|
193
|
+
continue
|
|
194
|
+
|
|
195
|
+
dst_lib = dst_root / lib_dir_name
|
|
196
|
+
dst_lib.mkdir(parents=True, exist_ok=True)
|
|
197
|
+
|
|
198
|
+
for item in src_lib.iterdir():
|
|
199
|
+
# Keep clang runtime directory
|
|
200
|
+
if item.is_dir() and item.name == "clang":
|
|
201
|
+
dst_clang = dst_lib / "clang"
|
|
202
|
+
shutil.copytree(item, dst_clang, dirs_exist_ok=True)
|
|
203
|
+
self.files_kept += 1
|
|
204
|
+
self.log(f"Keeping runtime: {item.name}")
|
|
205
|
+
# Keep dynamic libraries (.so, .dll, .dylib)
|
|
206
|
+
elif item.is_file():
|
|
207
|
+
if any(item.name.endswith(ext) for ext in [".so", ".dll", ".dylib"]):
|
|
208
|
+
# Check if it's a versioned .so file
|
|
209
|
+
if ".so." in item.name or item.suffix in [".so", ".dll", ".dylib"]:
|
|
210
|
+
shutil.copy2(item, dst_lib / item.name)
|
|
211
|
+
self.files_kept += 1
|
|
212
|
+
self.log(f"Keeping library: {item.name}")
|
|
213
|
+
# Remove static libraries
|
|
214
|
+
elif item.suffix in [".a", ".lib"]:
|
|
215
|
+
self.files_removed += 1
|
|
216
|
+
self.log(f"Removing static library: {item.name}")
|
|
217
|
+
# Keep CMake and other config files if small
|
|
218
|
+
elif item.suffix in [".cmake"] or "LLVMConfig" in item.name:
|
|
219
|
+
self.files_removed += 1
|
|
220
|
+
self.log(f"Removing config file: {item.name}")
|
|
221
|
+
else:
|
|
222
|
+
# Keep other files (might be needed)
|
|
223
|
+
shutil.copy2(item, dst_lib / item.name)
|
|
224
|
+
self.files_kept += 1
|
|
225
|
+
|
|
226
|
+
# Copy include directory only if requested
|
|
227
|
+
if self.keep_headers:
|
|
228
|
+
src_include = src_root / "include"
|
|
229
|
+
if src_include.exists():
|
|
230
|
+
dst_include = dst_root / "include"
|
|
231
|
+
shutil.copytree(src_include, dst_include, dirs_exist_ok=True)
|
|
232
|
+
self.log("Keeping include directory")
|
|
233
|
+
else:
|
|
234
|
+
self.log("Removing include directory")
|
|
235
|
+
|
|
236
|
+
# Copy license and readme files
|
|
237
|
+
for pattern in ["LICENSE*", "README*", "NOTICE*"]:
|
|
238
|
+
for item in src_root.glob(pattern):
|
|
239
|
+
if item.is_file():
|
|
240
|
+
shutil.copy2(item, dst_root / item.name)
|
|
241
|
+
self.log(f"Keeping license file: {item.name}")
|
|
242
|
+
|
|
243
|
+
def strip_binary(self, binary_path: Path) -> bool:
|
|
244
|
+
"""
|
|
245
|
+
Strip debug symbols from a binary.
|
|
246
|
+
|
|
247
|
+
Args:
|
|
248
|
+
binary_path: Path to the binary to strip
|
|
249
|
+
|
|
250
|
+
Returns:
|
|
251
|
+
True if stripping was successful, False otherwise
|
|
252
|
+
"""
|
|
253
|
+
if not self.strip_binaries:
|
|
254
|
+
return True
|
|
255
|
+
|
|
256
|
+
try:
|
|
257
|
+
# Determine strip command based on platform
|
|
258
|
+
if "win" in self.platform:
|
|
259
|
+
# On Windows, try to find llvm-strip in the output
|
|
260
|
+
llvm_strip = self.output_dir / "bin" / "llvm-strip.exe"
|
|
261
|
+
if not llvm_strip.exists():
|
|
262
|
+
self.log(f"Skipping strip for {binary_path.name}: llvm-strip not found")
|
|
263
|
+
return False
|
|
264
|
+
strip_cmd = [str(llvm_strip), "--strip-all", str(binary_path)]
|
|
265
|
+
else:
|
|
266
|
+
# On Unix, use llvm-strip from the output
|
|
267
|
+
llvm_strip = self.output_dir / "bin" / "llvm-strip"
|
|
268
|
+
if not llvm_strip.exists():
|
|
269
|
+
# Fallback to system strip
|
|
270
|
+
strip_cmd = ["strip", "--strip-all", str(binary_path)]
|
|
271
|
+
else:
|
|
272
|
+
strip_cmd = [str(llvm_strip), "--strip-all", str(binary_path)]
|
|
273
|
+
|
|
274
|
+
# Get original size
|
|
275
|
+
original_size = binary_path.stat().st_size
|
|
276
|
+
|
|
277
|
+
# Run strip command
|
|
278
|
+
result = subprocess.run(strip_cmd, capture_output=True, text=True)
|
|
279
|
+
|
|
280
|
+
if result.returncode == 0:
|
|
281
|
+
new_size = binary_path.stat().st_size
|
|
282
|
+
saved = original_size - new_size
|
|
283
|
+
saved_pct = (saved / original_size * 100) if original_size > 0 else 0
|
|
284
|
+
self.log(
|
|
285
|
+
f"Stripped {binary_path.name}: "
|
|
286
|
+
f"{original_size/1024/1024:.1f}MB -> {new_size/1024/1024:.1f}MB "
|
|
287
|
+
f"(saved {saved_pct:.1f}%)"
|
|
288
|
+
)
|
|
289
|
+
return True
|
|
290
|
+
else:
|
|
291
|
+
self.log(f"Failed to strip {binary_path.name}: {result.stderr}")
|
|
292
|
+
return False
|
|
293
|
+
|
|
294
|
+
except Exception as e:
|
|
295
|
+
self.log(f"Error stripping {binary_path.name}: {e}")
|
|
296
|
+
return False
|
|
297
|
+
|
|
298
|
+
def strip_all_binaries(self) -> None:
|
|
299
|
+
"""Strip debug symbols from all binaries in output directory."""
|
|
300
|
+
if not self.strip_binaries:
|
|
301
|
+
print("Skipping binary stripping (disabled)")
|
|
302
|
+
return
|
|
303
|
+
|
|
304
|
+
print("Stripping debug symbols from binaries...")
|
|
305
|
+
|
|
306
|
+
bin_dir = self.output_dir / "bin"
|
|
307
|
+
if not bin_dir.exists():
|
|
308
|
+
print("Warning: No bin directory found")
|
|
309
|
+
return
|
|
310
|
+
|
|
311
|
+
# Get list of binaries to strip
|
|
312
|
+
binaries = []
|
|
313
|
+
for binary in bin_dir.iterdir():
|
|
314
|
+
if binary.is_file():
|
|
315
|
+
# Check if file is executable or library
|
|
316
|
+
if "win" in self.platform:
|
|
317
|
+
if binary.suffix in [".exe", ".dll"]:
|
|
318
|
+
binaries.append(binary)
|
|
319
|
+
else:
|
|
320
|
+
# On Unix, check if file has executable bit
|
|
321
|
+
if os.access(binary, os.X_OK) or binary.suffix in [".so", ".dylib"]:
|
|
322
|
+
binaries.append(binary)
|
|
323
|
+
|
|
324
|
+
print(f"Found {len(binaries)} binaries to strip")
|
|
325
|
+
|
|
326
|
+
# Strip each binary
|
|
327
|
+
success_count = 0
|
|
328
|
+
for binary in binaries:
|
|
329
|
+
if self.strip_binary(binary):
|
|
330
|
+
success_count += 1
|
|
331
|
+
|
|
332
|
+
print(f"Successfully stripped {success_count}/{len(binaries)} binaries")
|
|
333
|
+
|
|
334
|
+
def process(self) -> bool:
|
|
335
|
+
"""
|
|
336
|
+
Process the LLVM distribution: copy essential files and strip binaries.
|
|
337
|
+
|
|
338
|
+
Returns:
|
|
339
|
+
True if processing was successful, False otherwise
|
|
340
|
+
"""
|
|
341
|
+
print(f"Processing {self.platform}...")
|
|
342
|
+
|
|
343
|
+
# Find LLVM root
|
|
344
|
+
llvm_root = self.find_llvm_root()
|
|
345
|
+
if not llvm_root:
|
|
346
|
+
print(f"Error: Could not find LLVM root in {self.source_dir}")
|
|
347
|
+
return False
|
|
348
|
+
|
|
349
|
+
print(f"Found LLVM root: {llvm_root}")
|
|
350
|
+
|
|
351
|
+
# Calculate original size
|
|
352
|
+
print("Calculating original size...")
|
|
353
|
+
self.original_size = self.get_dir_size(llvm_root)
|
|
354
|
+
print(f"Original size: {self.original_size / 1024 / 1024:.1f} MB")
|
|
355
|
+
|
|
356
|
+
# Copy essential files
|
|
357
|
+
print("Copying essential files...")
|
|
358
|
+
self.copy_essential_files(llvm_root, self.output_dir)
|
|
359
|
+
|
|
360
|
+
# Strip binaries
|
|
361
|
+
if self.strip_binaries:
|
|
362
|
+
self.strip_all_binaries()
|
|
363
|
+
|
|
364
|
+
# Calculate final size
|
|
365
|
+
print("Calculating final size...")
|
|
366
|
+
self.final_size = self.get_dir_size(self.output_dir)
|
|
367
|
+
print(f"Final size: {self.final_size / 1024 / 1024:.1f} MB")
|
|
368
|
+
|
|
369
|
+
# Print statistics
|
|
370
|
+
saved = self.original_size - self.final_size
|
|
371
|
+
saved_pct = (saved / self.original_size * 100) if self.original_size > 0 else 0
|
|
372
|
+
|
|
373
|
+
print(f"\n{'='*60}")
|
|
374
|
+
print("Statistics")
|
|
375
|
+
print(f"{'='*60}")
|
|
376
|
+
print(f"Original size: {self.original_size / 1024 / 1024:>10.1f} MB")
|
|
377
|
+
print(f"Final size: {self.final_size / 1024 / 1024:>10.1f} MB")
|
|
378
|
+
print(f"Saved: {saved / 1024 / 1024:>10.1f} MB ({saved_pct:.1f}%)")
|
|
379
|
+
print(f"Files kept: {self.files_kept:>10}")
|
|
380
|
+
print(f"Files removed: {self.files_removed:>10}")
|
|
381
|
+
print(f"{'='*60}\n")
|
|
382
|
+
|
|
383
|
+
return True
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
def main() -> None:
    """Main entry point for the strip script.

    Parses the command line, runs a BinaryStripper over the given source
    directory and exits with status 1 if processing fails.
    """
    parser = argparse.ArgumentParser(description="Strip and optimize LLVM binaries for minimal package size")
    parser.add_argument("source_dir", help="Directory containing extracted LLVM binaries")
    parser.add_argument("output_dir", help="Directory to output stripped binaries")
    parser.add_argument(
        "--platform",
        required=True,
        choices=["win-x86_64", "linux-x86_64", "linux-aarch64", "darwin-x86_64", "darwin-arm64"],
        help="Platform identifier",
    )
    parser.add_argument(
        "--keep-headers",
        action="store_true",
        help="Keep header files (increases size significantly)",
    )
    parser.add_argument(
        "--no-strip",
        action="store_true",
        help="Don't strip debug symbols from binaries",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Print verbose output",
    )

    args = parser.parse_args()

    # Create stripper and process
    stripper = BinaryStripper(
        source_dir=args.source_dir,
        output_dir=args.output_dir,
        platform=args.platform,
        keep_headers=args.keep_headers,
        strip_binaries=not args.no_strip,  # --no-strip inverts the default
        verbose=args.verbose,
    )

    success = stripper.process()

    if not success:
        print("\nError: Failed to process binaries")
        sys.exit(1)

    print("\n✓ Successfully processed binaries")


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Test various compression methods to find the smallest archive size.
|
|
4
|
+
|
|
5
|
+
Tests:
|
|
6
|
+
- gzip (levels 1-9)
|
|
7
|
+
- bzip2 (levels 1-9)
|
|
8
|
+
- xz (levels 0-9, plus extreme mode)
|
|
9
|
+
- zstd (levels 1-22)
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import os
|
|
13
|
+
import subprocess
|
|
14
|
+
import sys
|
|
15
|
+
import tarfile
|
|
16
|
+
import time
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import Any
|
|
19
|
+
|
|
20
|
+
try:
|
|
21
|
+
import zstandard as zstd
|
|
22
|
+
except ImportError:
|
|
23
|
+
print("Warning: zstandard module not available")
|
|
24
|
+
zstd = None
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def format_size(bytes_size: int) -> str:
    """Format a byte count as mebibytes with two decimals, e.g. ``"1.50 MB"``."""
    mb = bytes_size / (1024 * 1024)
    return f"{mb:.2f} MB"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def format_time(seconds: float) -> str:
    """Format seconds as human-readable string.

    Durations under a minute are shown as ``"12.3s"``; longer ones as
    ``"2m 5.0s"``.
    """
    # Round to the displayed precision first, so a value such as 59.96 is
    # reported as "1m 0.0s" rather than "60.0s" (and 119.96 as "2m 0.0s"
    # rather than "1m 60.0s").
    total = round(seconds, 1)
    if total < 60:
        return f"{total:.1f}s"
    minutes, secs = divmod(total, 60)
    return f"{int(minutes)}m {secs:.1f}s"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def test_gzip(source_dir: str, output_base: str, levels: list[int] | None = None) -> list[dict[str, Any]]:
    """Test gzip compression at various levels.

    Args:
        source_dir: Directory to archive
        output_base: Prefix for the generated ``*_gzip<level>.tar.gz`` files
        levels: Compression levels to try (defaults to 1, 6 and 9)

    Returns:
        One dict per level with keys "method", "file", "size" (bytes) and
        "time" (seconds).

    Raises:
        subprocess.CalledProcessError: If ``tar`` exits with a non-zero status.
    """
    if levels is None:
        levels = [1, 6, 9]  # Fast, default, max

    results = []
    source_path = Path(source_dir)

    for level in levels:
        output = f"{output_base}_gzip{level}.tar.gz"
        print(f"Testing gzip level {level}...", end=" ", flush=True)

        # Pass an argument list (no shell) so paths containing quotes, "$"
        # or other shell metacharacters are handled literally.
        cmd = ["tar", "-czf", output, "-C", str(source_path.parent), source_path.name]
        # The level is handed over through the GZIP environment variable.
        # NOTE(review): this relies on tar delegating to a gzip that honors
        # GZIP -- confirm on the target platform.
        env = {**os.environ, "GZIP": f"-{level}"}

        start = time.perf_counter()
        subprocess.run(cmd, env=env, check=True)
        elapsed = time.perf_counter() - start

        size = Path(output).stat().st_size
        print(f"{format_size(size)} in {format_time(elapsed)}")

        results.append({"method": f"gzip-{level}", "file": output, "size": size, "time": elapsed})

    return results
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def test_bzip2(source_dir: str, output_base: str, levels: list[int] | None = None) -> list[dict[str, Any]]:
    """Test bzip2 compression at various levels.

    Args:
        source_dir: Directory to archive
        output_base: Prefix for the generated ``*_bzip2_<level>.tar.bz2`` files
        levels: Compression levels to try (defaults to 1, 6 and 9)

    Returns:
        One dict per level with keys "method", "file", "size" (bytes) and
        "time" (seconds).

    Raises:
        subprocess.CalledProcessError: If ``tar`` exits with a non-zero status.
    """
    if levels is None:
        levels = [1, 6, 9]  # Fast, default, max

    results = []
    source_path = Path(source_dir)

    for level in levels:
        output = f"{output_base}_bzip2_{level}.tar.bz2"
        print(f"Testing bzip2 level {level}...", end=" ", flush=True)

        # Pass an argument list (no shell) so paths containing quotes, "$"
        # or other shell metacharacters are handled literally.
        cmd = ["tar", "-cjf", output, "-C", str(source_path.parent), source_path.name]
        # The level is handed over through the BZIP2 environment variable.
        # NOTE(review): this relies on tar delegating to a bzip2 that honors
        # BZIP2 -- confirm on the target platform.
        env = {**os.environ, "BZIP2": f"-{level}"}

        start = time.perf_counter()
        subprocess.run(cmd, env=env, check=True)
        elapsed = time.perf_counter() - start

        size = Path(output).stat().st_size
        print(f"{format_size(size)} in {format_time(elapsed)}")

        results.append({"method": f"bzip2-{level}", "file": output, "size": size, "time": elapsed})

    return results
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def test_xz(
    source_dir: str, output_base: str, levels: list[int] | None = None, test_extreme: bool = True
) -> list[dict[str, Any]]:
    """Test xz compression at various levels.

    Args:
        source_dir: Directory to archive
        output_base: Prefix for the generated ``*_xz<level>.tar.xz`` files
        levels: Compression presets to try (defaults to 0, 6 and 9)
        test_extreme: Also run preset 9 with the extreme flag ("-9e")

    Returns:
        One dict per run with keys "method", "file", "size" (bytes) and
        "time" (seconds); the extreme run, if any, comes last.

    Raises:
        subprocess.CalledProcessError: If ``tar`` exits with a non-zero status.
    """
    if levels is None:
        levels = [0, 6, 9]  # Fast, default, max

    # (preset tag, progress message) for every run, in execution order.
    variants = [(f"{level}", f"Testing xz level {level}...") for level in levels]
    if test_extreme:
        # Only test extreme on max level
        variants.append(("9e", "Testing xz level 9 --extreme..."))

    results = []
    source_path = Path(source_dir)

    for tag, message in variants:
        output = f"{output_base}_xz{tag}.tar.xz"
        print(message, end=" ", flush=True)

        # Pass an argument list (no shell) so paths containing quotes, "$"
        # or other shell metacharacters are handled literally.
        cmd = ["tar", "-cJf", output, "-C", str(source_path.parent), source_path.name]
        # The preset is handed to xz through the XZ_OPT environment variable.
        env = {**os.environ, "XZ_OPT": f"-{tag}"}

        start = time.perf_counter()
        subprocess.run(cmd, env=env, check=True)
        elapsed = time.perf_counter() - start

        size = Path(output).stat().st_size
        print(f"{format_size(size)} in {format_time(elapsed)}")

        results.append({"method": f"xz-{tag}", "file": output, "size": size, "time": elapsed})

    return results
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def test_zstd_python(source_dir: str, output_base: str, levels: list[int] | None = None) -> list[dict[str, Any]]:
    """Test zstd compression using Python library.

    Args:
        source_dir: Directory to archive
        output_base: Prefix for the generated ``*_zstd<level>.tar.zst`` files
        levels: Compression levels to try (defaults to 1, 3, 10, 19 and 22)

    Returns:
        One dict per level with keys "method", "file", "size" (bytes) and
        "time" (seconds); an empty list when the ``zstandard`` module could
        not be imported.
    """
    if zstd is None:
        print("Skipping zstd tests (module not available)")
        return []

    # Imported once per call rather than once per level.
    import io

    if levels is None:
        levels = [1, 3, 10, 19, 22]  # Fast, default, high, very high, ultra

    results = []
    source_path = Path(source_dir)

    for level in levels:
        output = f"{output_base}_zstd{level}.tar.zst"
        print(f"Testing zstd level {level}...", end=" ", flush=True)

        # The tar step is rebuilt and timed for every level, so the figure
        # covers archiving plus compression.
        start = time.perf_counter()

        # Create tar in memory, then compress with zstd
        tar_buffer = io.BytesIO()
        with tarfile.open(fileobj=tar_buffer, mode="w") as tar:
            tar.add(source_path, arcname=source_path.name)
        tar_data = tar_buffer.getvalue()

        # Compress with zstd
        cctx = zstd.ZstdCompressor(level=level)
        compressed = cctx.compress(tar_data)

        # Write to file
        with open(output, "wb") as f:
            f.write(compressed)

        elapsed = time.perf_counter() - start
        size = len(compressed)
        print(f"{format_size(size)} in {format_time(elapsed)}")

        results.append({"method": f"zstd-{level}", "file": output, "size": size, "time": elapsed})

    return results
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def print_results_table(all_results: list[dict[str, Any]]) -> None:
    """Print formatted results table.

    Results are listed smallest archive first, each compared with the
    smallest one.

    Args:
        all_results: Result dicts with "method", "size" (bytes) and "time"
            (seconds) keys. May be empty, in which case only a short notice
            is printed.
    """
    # Nothing to rank: avoid indexing into an empty list below.
    if not all_results:
        print("No compression results to report")
        return

    print("\n" + "=" * 80)
    print("COMPRESSION COMPARISON RESULTS")
    print("=" * 80)
    print()
    print(f"{'Method':<15} {'Size':<12} {'Time':<10} {'vs Best':<12}")
    print("-" * 80)

    # Sort by size
    sorted_results = sorted(all_results, key=lambda x: x["size"])
    best = sorted_results[0]
    worst = sorted_results[-1]
    best_size = best["size"]

    for result in sorted_results:
        size_str = format_size(result["size"])
        time_str = format_time(result["time"])

        if result["size"] == best_size:
            vs_best = "BEST"
        elif best_size > 0:
            percent_vs_best = (result["size"] / best_size - 1) * 100
            vs_best = f"+{percent_vs_best:.1f}%"
        else:
            # A zero-byte best archive makes a relative figure meaningless.
            vs_best = "n/a"

        marker = " ⭐" if result["size"] == best_size else ""
        print(f"{result['method']:<15} {size_str:<12} {time_str:<10} {vs_best:<12}{marker}")

    print()
    print(f"Best compression: {best['method']} - {format_size(best['size'])}")
    print(f"Worst compression: {worst['method']} - {format_size(worst['size'])}")
    print(f"Difference: {format_size(worst['size'] - best['size'])}")
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def main() -> None:
    """Run every compression benchmark on a directory and print a comparison.

    Command line: ``test_compression.py <directory_to_compress> [output_prefix]``.
    Exits with status 1 when the directory argument is missing or the path
    does not exist.
    """
    if len(sys.argv) < 2:
        print("Usage: python test_compression.py <directory_to_compress> [output_prefix]")
        sys.exit(1)

    source_dir = sys.argv[1]
    output_base = sys.argv[2] if len(sys.argv) > 2 else "compressed"

    if not Path(source_dir).exists():
        print(f"Error: Directory '{source_dir}' does not exist")
        sys.exit(1)

    print(f"Testing compression methods on: {source_dir}")
    print(f"Output prefix: {output_base}")
    print()

    all_results = []

    # Test gzip
    print("=" * 80)
    print("GZIP COMPRESSION")
    print("=" * 80)
    all_results.extend(test_gzip(source_dir, output_base, levels=[1, 6, 9]))

    # Test bzip2
    print("\n" + "=" * 80)
    print("BZIP2 COMPRESSION")
    print("=" * 80)
    all_results.extend(test_bzip2(source_dir, output_base, levels=[1, 6, 9]))

    # Test xz
    print("\n" + "=" * 80)
    print("XZ COMPRESSION")
    print("=" * 80)
    all_results.extend(test_xz(source_dir, output_base, levels=[0, 6, 9], test_extreme=True))

    # Test zstd (section is omitted entirely when the module is unavailable)
    if zstd is not None:
        print("\n" + "=" * 80)
        print("ZSTD COMPRESSION")
        print("=" * 80)
        all_results.extend(test_zstd_python(source_dir, output_base, levels=[1, 3, 10, 15, 19, 22]))

    # Print final results
    print_results_table(all_results)


if __name__ == "__main__":
    main()
|