clang_tool_chain-1.0.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of clang-tool-chain might be problematic.
- clang_tool_chain/__init__.py +0 -0
- clang_tool_chain/__version__.py +4 -0
- clang_tool_chain/checksums.py +270 -0
- clang_tool_chain/cli.py +575 -0
- clang_tool_chain/downloader.py +1325 -0
- clang_tool_chain/downloads/README.md +144 -0
- clang_tool_chain/downloads/__init__.py +22 -0
- clang_tool_chain/downloads/__main__.py +11 -0
- clang_tool_chain/downloads/create_hardlink_archive.py +390 -0
- clang_tool_chain/downloads/create_iwyu_archives.py +330 -0
- clang_tool_chain/downloads/deduplicate_binaries.py +217 -0
- clang_tool_chain/downloads/download_binaries.py +463 -0
- clang_tool_chain/downloads/expand_archive.py +260 -0
- clang_tool_chain/downloads/extract_mingw_sysroot.py +349 -0
- clang_tool_chain/downloads/fetch_and_archive.py +1376 -0
- clang_tool_chain/downloads/strip_binaries.py +436 -0
- clang_tool_chain/downloads/test_compression.py +259 -0
- clang_tool_chain/fetch.py +158 -0
- clang_tool_chain/paths.py +93 -0
- clang_tool_chain/sccache_runner.py +160 -0
- clang_tool_chain/wrapper.py +1383 -0
- clang_tool_chain-1.0.2.dist-info/METADATA +1766 -0
- clang_tool_chain-1.0.2.dist-info/RECORD +26 -0
- clang_tool_chain-1.0.2.dist-info/WHEEL +4 -0
- clang_tool_chain-1.0.2.dist-info/entry_points.txt +31 -0
- clang_tool_chain-1.0.2.dist-info/licenses/LICENSE +204 -0
clang_tool_chain/downloads/create_iwyu_archives.py
@@ -0,0 +1,330 @@
+#!/usr/bin/env python3
+"""
+Create IWYU archives for all platforms.
+
+This script:
+1. Scans downloads-bins/assets/iwyu/ for extracted binaries
+2. Creates tar archives with proper permissions
+3. Compresses with zstd level 22
+4. Generates SHA256 checksums
+5. Outputs archives to downloads-bins/assets/iwyu/{platform}/{arch}/
+
+Unlike the Clang toolchain, IWYU has no duplicate binaries, so no deduplication is needed.
+"""
+
+import hashlib
+import json
+import sys
+import tarfile
+from pathlib import Path
+
+
+def create_tar_archive(source_dir: Path, output_tar: Path) -> Path:
+    """
+    Create tar archive with correct permissions for IWYU.
+
+    Args:
+        source_dir: Directory containing bin/ and share/ (e.g., downloads-bins/assets/iwyu/win/x86_64/)
+        output_tar: Output tar file path
+
+    Returns:
+        Path to created tar file
+    """
+    print("\n" + "=" * 70)
+    print("CREATING TAR ARCHIVE")
+    print("=" * 70)
+    print(f"Source: {source_dir}")
+    print(f"Output: {output_tar}")
+    print()
+
+    def tar_filter(tarinfo: tarfile.TarInfo) -> tarfile.TarInfo:
+        """Filter to set correct permissions for IWYU files."""
+        if tarinfo.isfile():
+            # Python scripts and the main binary should be executable
+            if "/bin/" in tarinfo.name or tarinfo.name.startswith("bin/"):
+                if tarinfo.name.endswith((".py", "include-what-you-use", ".exe")):
+                    tarinfo.mode = 0o755  # rwxr-xr-x
+                    print(f"  Setting executable: {tarinfo.name}")
+                else:
+                    # Other files in bin/ default to readable
+                    tarinfo.mode = 0o644  # rw-r--r--
+            # Mapping files and other share/ content should be readable
+            elif "/share/" in tarinfo.name or tarinfo.name.startswith("share/"):
+                tarinfo.mode = 0o644  # rw-r--r--
+            # Other files (LICENSE, README, etc.)
+            else:
+                tarinfo.mode = 0o644  # rw-r--r--
+        return tarinfo
+
+    print("Creating tar archive...")
+    print("Setting permissions...")
+
+    # Keep the archive structure flat (bin/, share/, ...) rather than
+    # nesting it under the platform/arch directory name
+    with tarfile.open(output_tar, "w") as tar:
+        # Add bin/ directory
+        bin_dir = source_dir / "bin"
+        if bin_dir.exists():
+            tar.add(bin_dir, arcname="bin", filter=tar_filter)
+
+        # Add share/ directory
+        share_dir = source_dir / "share"
+        if share_dir.exists():
+            tar.add(share_dir, arcname="share", filter=tar_filter)
+
+        # Add any other top-level files (LICENSE, README, etc.)
+        for item in source_dir.iterdir():
+            if item.is_file():
+                tar.add(item, arcname=item.name, filter=tar_filter)
+
+    size = output_tar.stat().st_size
+    print(f"Created: {output_tar} ({size / (1024*1024):.2f} MB)")
+
+    return output_tar
+
+
+def verify_tar_permissions(tar_file: Path) -> int:
+    """Verify that files in the tar archive have correct permissions."""
+    print("\n" + "=" * 70)
+    print("VERIFYING TAR PERMISSIONS")
+    print("=" * 70)
+    print(f"Checking permissions in: {tar_file}")
+    print()
+
+    issues_found = []
+    executables_checked = 0
+    data_files_checked = 0
+
+    with tarfile.open(tar_file, "r") as tar:
+        for member in tar.getmembers():
+            if not member.isfile():
+                continue
+
+            # Check files in bin/ directory
+            if "/bin/" in member.name or member.name.startswith("bin/"):
+                # Python scripts and binaries should be executable
+                if member.name.endswith((".py", "include-what-you-use", ".exe")):
+                    executables_checked += 1
+                    if not (member.mode & 0o100):
+                        issues_found.append((member.name, oct(member.mode), "executable missing +x"))
+                        print(f"  ✗ Missing executable permission: {member.name} (mode: {oct(member.mode)})")
+                    else:
+                        print(f"  ✓ bin: {member.name} (mode: {oct(member.mode)})")
+
+            # Check files in share/ directory
+            elif "/share/" in member.name or member.name.startswith("share/"):
+                data_files_checked += 1
+                # These should NOT be executable
+                if member.mode & 0o100:
+                    issues_found.append((member.name, oct(member.mode), "data file has +x"))
+                    print(f"  ✗ Data file should not be executable: {member.name} (mode: {oct(member.mode)})")
+
+    print()
+    print(f"Total executables checked: {executables_checked}")
+    print(f"Total data files checked: {data_files_checked}")
+
+    if issues_found:
+        print(f"\n⚠️ WARNING: Found {len(issues_found)} files with incorrect permissions!")
+        print("\nFiles with issues:")
+        for name, mode, issue in issues_found:
+            print(f"  - {name} (mode: {mode}) - {issue}")
+        raise RuntimeError(f"Tar archive has {len(issues_found)} files with incorrect permissions")
+    else:
+        print("✅ All files have correct permissions")
+
+    return executables_checked + data_files_checked
+
+
+def compress_with_zstd(tar_file: Path, output_zst: Path, level: int = 22) -> Path:
+    """Compress tar with zstd."""
+    import zstandard as zstd
+
+    print("\n" + "=" * 70)
+    print(f"COMPRESSING WITH ZSTD LEVEL {level}")
+    print("=" * 70)
+    print(f"Input: {tar_file} ({tar_file.stat().st_size / (1024*1024):.2f} MB)")
+    print(f"Output: {output_zst}")
+    print()
+
+    # Use streaming compression to handle large files and allow interruption
+    print("Compressing (this may take a while)...")
+
+    import time
+
+    start = time.time()
+
+    # Create compressor with multi-threading
+    cctx = zstd.ZstdCompressor(level=level, threads=-1)
+
+    # Stream compress
+    with open(tar_file, "rb") as ifh, open(output_zst, "wb") as ofh:
+        # Read in chunks to allow interruption
+        chunk_size = 1024 * 1024  # 1MB chunks
+        reader = cctx.stream_reader(ifh, size=tar_file.stat().st_size)
+
+        while True:
+            chunk = reader.read(chunk_size)
+            if not chunk:
+                break
+            ofh.write(chunk)
+
+    elapsed = time.time() - start
+
+    original_size = tar_file.stat().st_size
+    compressed_size = output_zst.stat().st_size
+    ratio = original_size / compressed_size if compressed_size > 0 else 0
+
+    print(f"Compressed in {elapsed:.1f}s")
+    print(f"Original: {original_size / (1024*1024):.2f} MB")
+    print(f"Compressed: {compressed_size / (1024*1024):.2f} MB")
+    print(f"Ratio: {ratio:.2f}:1")
+    print(f"Reduction: {(1 - compressed_size/original_size) * 100:.1f}%")
+
+    return output_zst
+
+
+def generate_checksum(file_path: Path) -> str:
+    """Generate SHA256 checksum for a file."""
+    sha256_hash = hashlib.sha256()
+
+    with open(file_path, "rb") as f:
+        # Read in chunks to handle large files
+        for byte_block in iter(lambda: f.read(4096), b""):
+            sha256_hash.update(byte_block)
+
+    return sha256_hash.hexdigest()
+
+
+def process_platform_arch(iwyu_root: Path, platform: str, arch: str, version: str, zstd_level: int = 22) -> dict[str, str | int] | None:
+    """
+    Process a single platform/arch combination.
+
+    Args:
+        iwyu_root: Root downloads-bins/assets/iwyu directory
+        platform: Platform name (win, linux, darwin)
+        arch: Architecture (x86_64, arm64)
+        version: IWYU version (e.g., "0.25")
+        zstd_level: Zstd compression level (default: 22)
+
+    Returns:
+        Dict with archive info, or None if skipped
+    """
+    source_dir = iwyu_root / platform / arch
+
+    # Check if directory exists and has bin/
+    if not source_dir.exists() or not (source_dir / "bin").exists():
+        print(f"Skipping {platform}/{arch} - no binaries found")
+        return None
+
+    print("\n" + "=" * 70)
+    print(f"PROCESSING: {platform}/{arch}")
+    print("=" * 70)
+
+    # Create archive name
+    archive_base = f"iwyu-{version}-{platform}-{arch}"
+    tar_file = source_dir / f"{archive_base}.tar"
+    zst_file = source_dir / f"{archive_base}.tar.zst"
+
+    # Step 1: Create TAR
+    create_tar_archive(source_dir, tar_file)
+
+    # Step 2: Verify permissions
+    verify_tar_permissions(tar_file)
+
+    # Step 3: Compress with zstd
+    compress_with_zstd(tar_file, zst_file, level=zstd_level)
+
+    # Step 4: Generate checksum
+    print("\nGenerating SHA256 checksum...")
+    sha256 = generate_checksum(zst_file)
+    print(f"SHA256: {sha256}")
+
+    # Write checksum file (append .sha256; with_suffix() would mangle the multi-dot name)
+    checksum_file = zst_file.parent / (zst_file.name + ".sha256")
+    with open(checksum_file, "w") as f:
+        f.write(f"{sha256} {zst_file.name}\n")
+
+    # Clean up uncompressed tar
+    print(f"\nRemoving uncompressed tar: {tar_file}")
+    tar_file.unlink()
+
+    print("\n✅ SUCCESS!")
+    print(f"Archive: {zst_file}")
+    print(f"Size: {zst_file.stat().st_size / (1024*1024):.2f} MB")
+    print(f"SHA256: {sha256}")
+
+    return {
+        "filename": zst_file.name,
+        "path": str(zst_file.relative_to(iwyu_root)),
+        "sha256": sha256,
+        "size": zst_file.stat().st_size,
+    }
+
+
+def main() -> None:
+    """Main entry point."""
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Create IWYU archives for all platforms")
+    parser.add_argument(
+        "--iwyu-root",
+        type=Path,
+        default=Path("downloads-bins/assets/iwyu"),
+        help="Root IWYU directory (default: downloads-bins/assets/iwyu)",
+    )
+    parser.add_argument("--version", default="0.25", help="IWYU version (default: 0.25)")
+    parser.add_argument("--zstd-level", type=int, default=22, help="Zstd compression level (default: 22)")
+    parser.add_argument(
+        "--platform", help="Process only this platform (win, linux, darwin). If not specified, process all."
+    )
+    parser.add_argument("--arch", help="Process only this architecture (x86_64, arm64). If not specified, process all.")
+
+    args = parser.parse_args()
+
+    iwyu_root = args.iwyu_root.resolve()
+
+    if not iwyu_root.exists():
+        print(f"Error: IWYU root directory not found: {iwyu_root}")
+        sys.exit(1)
+
+    # Define platforms and architectures to process
+    platforms = [args.platform] if args.platform else ["win", "linux", "darwin"]
+    architectures = [args.arch] if args.arch else ["x86_64", "arm64"]
+
+    # Process each platform/arch combination
+    results = {}
+    for platform in platforms:
+        results[platform] = {}
+        for arch in architectures:
+            result = process_platform_arch(iwyu_root, platform, arch, args.version, args.zstd_level)
+            if result:
+                results[platform][arch] = result
+
+    # Print summary
+    print("\n" + "=" * 70)
+    print("SUMMARY")
+    print("=" * 70)
+
+    total_archives = sum(len(arches) for arches in results.values())
+    print(f"\nCreated {total_archives} archives:")
+
+    for platform, arches in results.items():
+        for arch, info in arches.items():
+            print(f"\n{platform}/{arch}:")
+            print(f"  File: {info['filename']}")
+            print(f"  Size: {info['size'] / (1024*1024):.2f} MB")
+            print(f"  SHA256: {info['sha256']}")
+
+    # Save results to JSON for manifest creation
+    results_file = iwyu_root / "archive_results.json"
+    with open(results_file, "w") as f:
+        json.dump(results, f, indent=2)
+
+    print(f"\nArchive info saved to: {results_file}")
+    print("\nNext steps:")
+    print("1. Create manifests with these SHA256 hashes")
+    print("2. Upload archives to GitHub")
+    print("3. Update downloader.py to support IWYU")
+
+
+if __name__ == "__main__":
+    main()
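The hunk above is only the producing side of the pipeline: tar with normalized permissions, zstd-compress, write a SHA256 sidecar. Whatever consumes these archives (the release's downloader.py, not shown in this diff) has to reverse those steps. A minimal sketch of that reversal, assuming the same zstandard package; verify_and_extract is a hypothetical name for illustration, not an API from this wheel:

import hashlib
import tarfile
from pathlib import Path

import zstandard as zstd


def verify_and_extract(zst_file: Path, dest: Path) -> None:
    """Check the .sha256 sidecar, then stream-decompress and unpack the tar."""
    # The sidecar holds "<hex digest> <archive name>", as written above.
    sidecar = zst_file.parent / (zst_file.name + ".sha256")
    expected = sidecar.read_text().split()[0]

    sha256 = hashlib.sha256()
    with open(zst_file, "rb") as f:
        for block in iter(lambda: f.read(1024 * 1024), b""):
            sha256.update(block)
    if sha256.hexdigest() != expected:
        raise RuntimeError(f"Checksum mismatch for {zst_file.name}")

    # Streaming decompression mirrors the streaming compression above.
    dctx = zstd.ZstdDecompressor()
    with open(zst_file, "rb") as compressed, dctx.stream_reader(compressed) as reader:
        with tarfile.open(fileobj=reader, mode="r|") as tar:
            tar.extractall(dest)

Archive creation itself is driven by the argparse flags in main(), e.g. python create_iwyu_archives.py --platform linux --arch arm64 --zstd-level 19.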
clang_tool_chain/downloads/deduplicate_binaries.py
@@ -0,0 +1,217 @@
+#!/usr/bin/env python3
+"""
+Deduplicate identical binaries in the toolchain by storing one copy
+and creating a manifest for expansion.
+
+This script:
+1. Identifies duplicate files by MD5 hash
+2. Keeps one "canonical" copy of each unique file
+3. Creates a manifest mapping all filenames to their canonical source
+4. Can expand the deduped structure back to full structure
+"""
+
+import hashlib
+import json
+import shutil
+from pathlib import Path
+from typing import Any
+
+
+def get_file_hash(filepath: Path | str) -> str:
+    """Calculate MD5 hash of a file."""
+    md5 = hashlib.md5()
+    with open(filepath, "rb") as f:
+        for chunk in iter(lambda: f.read(8192), b""):
+            md5.update(chunk)
+    return md5.hexdigest()
+
+
+def analyze_directory(directory: Path | str) -> tuple[dict[str, list[str]], dict[str, int]]:
+    """Analyze directory for duplicate files."""
+    directory = Path(directory)
+
+    # Map hash -> list of files
+    hash_to_files = {}
+    # Map hash -> file size
+    hash_to_size = {}
+
+    for exe_file in directory.glob("*.exe"):
+        file_hash = get_file_hash(exe_file)
+        size = exe_file.stat().st_size
+
+        if file_hash not in hash_to_files:
+            hash_to_files[file_hash] = []
+            hash_to_size[file_hash] = size
+
+        hash_to_files[file_hash].append(exe_file.name)
+
+    return hash_to_files, hash_to_size
+
+
+def calculate_savings(hash_to_files: dict[str, list[str]], hash_to_size: dict[str, int]) -> dict[str, Any]:
+    """Calculate potential space savings from deduplication."""
+    total_size = 0
+    deduped_size = 0
+    duplicate_count = 0
+
+    for file_hash, files in hash_to_files.items():
+        size = hash_to_size[file_hash]
+        total_size += size * len(files)
+        deduped_size += size  # Only count once
+
+        if len(files) > 1:
+            duplicate_count += len(files) - 1
+
+    savings = total_size - deduped_size
+
+    return {
+        "total_size": total_size,
+        "deduped_size": deduped_size,
+        "savings": savings,
+        "savings_percent": (savings / total_size * 100) if total_size > 0 else 0,
+        "duplicate_count": duplicate_count,
+    }
+
+
+def create_deduped_structure(source_dir: Path | str, dest_dir: Path | str) -> dict[str, Any]:
+    """Create deduplicated directory structure with manifest."""
+    source_dir = Path(source_dir)
+    dest_dir = Path(dest_dir)
+
+    # Create destination directories
+    bin_dir = dest_dir / "bin"
+    canonical_dir = dest_dir / "canonical"
+    bin_dir.mkdir(parents=True, exist_ok=True)
+    canonical_dir.mkdir(parents=True, exist_ok=True)
+
+    hash_to_files, hash_to_size = analyze_directory(source_dir)
+
+    # Manifest: filename -> canonical_file
+    manifest = {}
+    canonical_files = {}  # hash -> canonical filename
+
+    # Process each unique hash
+    for file_hash, files in sorted(hash_to_files.items()):
+        # First file in sorted list becomes canonical
+        canonical = sorted(files)[0]
+        canonical_path = canonical_dir / canonical
+
+        # Copy canonical file
+        shutil.copy2(source_dir / canonical, canonical_path)
+        canonical_files[file_hash] = canonical
+
+        # Map all files to this canonical
+        for filename in files:
+            manifest[filename] = canonical
+
+    # Save manifest
+    manifest_data = {
+        "manifest": manifest,
+        "canonical_files": canonical_files,
+        "stats": calculate_savings(hash_to_files, hash_to_size),
+    }
+
+    with open(dest_dir / "dedup_manifest.json", "w") as f:
+        json.dump(manifest_data, f, indent=2)
+
+    return manifest_data
+
+
+def expand_deduped_structure(deduped_dir: Path | str, output_dir: Path | str) -> None:
+    """Expand deduplicated structure back to full structure."""
+    deduped_dir = Path(deduped_dir)
+    output_dir = Path(output_dir)
+
+    # Load manifest
+    with open(deduped_dir / "dedup_manifest.json") as f:
+        manifest_data = json.load(f)
+
+    manifest = manifest_data["manifest"]
+    canonical_dir = deduped_dir / "canonical"
+    output_bin_dir = output_dir / "bin"
+    output_bin_dir.mkdir(parents=True, exist_ok=True)
+
+    # Recreate each file from its canonical copy
+    for filename, canonical in manifest.items():
+        src = canonical_dir / canonical
+        dst = output_bin_dir / filename
+
+        # Copy the file
+        shutil.copy2(src, dst)
+        print(f"Created {filename} from {canonical}")
+
+    print(f"\nExpanded {len(manifest)} files from {len(set(manifest.values()))} canonical files")
+
+
+def print_analysis(source_dir: Path | str) -> None:
+    """Print detailed analysis of duplicates."""
+    hash_to_files, hash_to_size = analyze_directory(source_dir)
+    stats = calculate_savings(hash_to_files, hash_to_size)
+
+    print("=" * 70)
+    print("BINARY DEDUPLICATION ANALYSIS")
+    print("=" * 70)
+    print()
+
+    print(f"Total uncompressed size: {stats['total_size'] / (1024*1024):.1f} MB")
+    print(f"Deduplicated size: {stats['deduped_size'] / (1024*1024):.1f} MB")
+    print(f"Space savings: {stats['savings'] / (1024*1024):.1f} MB ({stats['savings_percent']:.1f}%)")
+    print(f"Duplicate files: {stats['duplicate_count']}")
+    print()
+
+    print("Duplicate Groups:")
+    print("-" * 70)
+
+    for file_hash, files in sorted(hash_to_files.items()):
+        if len(files) > 1:
+            size_mb = hash_to_size[file_hash] / (1024 * 1024)
+            waste_mb = size_mb * (len(files) - 1)
+            print(f"\n{len(files)} identical files ({size_mb:.1f} MB each, {waste_mb:.1f} MB wasted):")
+            for filename in sorted(files):
+                canonical = "← CANONICAL" if filename == sorted(files)[0] else ""
+                print(f"  - {filename} {canonical}")
+
+
+if __name__ == "__main__":
+    import sys
+
+    if len(sys.argv) < 2:
+        print("Usage:")
+        print("  Analyze: python deduplicate_binaries.py analyze <directory>")
+        print("  Deduplicate: python deduplicate_binaries.py dedup <source_dir> <dest_dir>")
+        print("  Expand: python deduplicate_binaries.py expand <deduped_dir> <output_dir>")
+        sys.exit(1)
+
+    command = sys.argv[1]
+
+    if command == "analyze":
+        if len(sys.argv) < 3:
+            print("Error: Missing directory argument")
+            sys.exit(1)
+        print_analysis(sys.argv[2])
+
+    elif command == "dedup":
+        if len(sys.argv) < 4:
+            print("Error: Missing source or destination directory")
+            sys.exit(1)
+        source = sys.argv[2]
+        dest = sys.argv[3]
+        print("Creating deduplicated structure...")
+        manifest_data = create_deduped_structure(source, dest)
+        print("\nDeduplication complete!")
+        print(f"Original size: {manifest_data['stats']['total_size'] / (1024*1024):.1f} MB")
+        print(f"Deduped size: {manifest_data['stats']['deduped_size'] / (1024*1024):.1f} MB")
+        print(f"Saved: {manifest_data['stats']['savings'] / (1024*1024):.1f} MB")
+        print(f"Manifest saved to: {dest}/dedup_manifest.json")
+
+    elif command == "expand":
+        if len(sys.argv) < 4:
+            print("Error: Missing deduped or output directory")
+            sys.exit(1)
+        deduped = sys.argv[2]
+        output = sys.argv[3]
+        expand_deduped_structure(deduped, output)
+
+    else:
+        print(f"Unknown command: {command}")
+        sys.exit(1)
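expand_deduped_structure materializes every manifest entry as a full copy, which is simple but gives back all the space the dedup saved (the create_hardlink_archive.py script listed in this release points at the alternative). A hedged sketch of a link-first replacement for the shutil.copy2 call in the expand loop; link_or_copy is a hypothetical helper, not part of the package:

import os
import shutil
from pathlib import Path


def link_or_copy(src: Path, dst: Path) -> None:
    """Hard-link dst to src where the filesystem allows it; otherwise copy."""
    if dst.exists():
        dst.unlink()
    try:
        os.link(src, dst)  # same inode, so the duplicate costs no extra disk space
    except OSError:
        shutil.copy2(src, dst)  # e.g. cross-device link or a filesystem without hardlinks

With this in the expand loop, a group of ten identical .exe files occupies the space of one.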