clang-tool-chain 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of clang-tool-chain might be problematic. Click here for more details.

@@ -0,0 +1,1376 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Fetch and Archive LLVM/Clang Toolchain
4
+
5
+ This script automates the entire process:
6
+ 1. Downloads LLVM/Clang binaries for specified platform/architecture
7
+ 2. Strips them of unnecessary extras (keeping only essential build tools)
8
+ 3. Deduplicates identical binaries
9
+ 4. Creates hard-linked structure
10
+ 5. Compresses with zstd level 22
11
+ 6. Names according to convention: llvm-{version}-{platform}-{arch}.tar.zst
12
+ 7. Generates checksums
13
+ 8. Places final archive in downloads-bins/assets/clang/{platform}/{arch}/
14
+
15
+ Usage:
16
+ python -m clang_tool_chain.downloads.fetch_and_archive --platform win --arch x86_64
17
+ python -m clang_tool_chain.downloads.fetch_and_archive --platform linux --arch x86_64
18
+ python -m clang_tool_chain.downloads.fetch_and_archive --platform darwin --arch arm64
19
+
20
+ Requirements:
21
+ - Python 3.10+ (function signatures use PEP 604 unions such as `Path | str`)
22
+ - zstandard module: pip install zstandard
23
+ """
24
+
25
+ import argparse
26
+ import hashlib
27
+ import json
28
+ import os
29
+ import shutil
30
+ import subprocess
31
+ import sys
32
+ import tarfile
33
+ import urllib.request
34
+ from pathlib import Path
35
+ from typing import Any
36
+
37
+ # ============================================================================
38
+ # Configuration
39
+ # ============================================================================
40
+
41
LLVM_VERSION = "19.1.7"

# All official assets for this version hang off the same GitHub release tag,
# so the table below only has to spell out the asset file names.
_LLVM_RELEASE_BASE = f"https://github.com/llvm/llvm-project/releases/download/llvmorg-{LLVM_VERSION}"

# Official LLVM download URLs, keyed by (platform, arch)
LLVM_DOWNLOAD_URLS = {
    ("win", "x86_64"): f"{_LLVM_RELEASE_BASE}/LLVM-{LLVM_VERSION}-win64.exe",
    ("win", "arm64"): f"{_LLVM_RELEASE_BASE}/LLVM-{LLVM_VERSION}-woa64.exe",
    ("linux", "x86_64"): f"{_LLVM_RELEASE_BASE}/LLVM-{LLVM_VERSION}-Linux-X64.tar.xz",
    ("linux", "arm64"): f"{_LLVM_RELEASE_BASE}/clang+llvm-{LLVM_VERSION}-aarch64-linux-gnu.tar.xz",
    ("darwin", "x86_64"): f"{_LLVM_RELEASE_BASE}/LLVM-{LLVM_VERSION}-macOS-X64.tar.xz",
    ("darwin", "arm64"): f"{_LLVM_RELEASE_BASE}/LLVM-{LLVM_VERSION}-macOS-ARM64.tar.xz",
}

# Essential binaries to keep (for C/C++ compilation).
# clang-format and clang-tidy are deliberately absent: they are code quality
# tools, not needed for compilation, and leaving them out reduces archive size.
ESSENTIAL_BINARIES = {
    # Compilers
    "clang", "clang++", "clang-cl", "clang-cpp",
    # Linkers
    "lld", "lld-link", "ld.lld", "ld64.lld", "wasm-ld",
    # Archive tools
    "llvm-ar", "llvm-ranlib",
    # Binary utilities
    "llvm-nm", "llvm-objdump", "llvm-objcopy", "llvm-strip",
    "llvm-readobj", "llvm-readelf", "llvm-symbolizer",
}
98
+
99
+
100
+ # ============================================================================
101
+ # Utility Functions
102
+ # ============================================================================
103
+
104
+
105
def print_section(title: str) -> None:
    """Print ``title`` framed above and below by a 70-character ``=`` rule.

    A blank line is emitted before the upper rule so consecutive sections
    are visually separated in the console output.
    """
    rule = "=" * 70
    print(f"\n{rule}")
    print(title)
    print(rule)
110
+
111
+
112
+ def download_file(url: str, output_path: Path | str, show_progress: bool = True) -> None:
113
+ """Download a file with progress indication."""
114
+ print(f"Downloading from: {url}")
115
+ print(f"Saving to: {output_path}")
116
+
117
+ output_path = Path(output_path)
118
+ breadcrumb_path = Path(str(output_path) + ".downloading")
119
+
120
+ # Create breadcrumb file to mark download in progress
121
+ breadcrumb_path.touch()
122
+
123
+ def report_progress(block_num: int, block_size: int, total_size: int) -> None:
124
+ if show_progress and total_size > 0:
125
+ downloaded = block_num * block_size
126
+ percent = min(100, (downloaded / total_size) * 100)
127
+ mb_downloaded = downloaded / (1024 * 1024)
128
+ mb_total = total_size / (1024 * 1024)
129
+ print(f"\rProgress: {percent:5.1f}% ({mb_downloaded:6.1f} MB / {mb_total:6.1f} MB)", end="", flush=True)
130
+
131
+ try:
132
+ urllib.request.urlretrieve(url, output_path, reporthook=report_progress)
133
+ if show_progress:
134
+ print() # New line after progress
135
+ # Download completed successfully, remove breadcrumb
136
+ breadcrumb_path.unlink(missing_ok=True)
137
+ except (KeyboardInterrupt, Exception):
138
+ # Download interrupted or failed, clean up partial file and breadcrumb
139
+ if output_path.exists():
140
+ output_path.unlink()
141
+ breadcrumb_path.unlink(missing_ok=True)
142
+ raise
143
+
144
+
145
+ def get_file_hash(filepath: Path | str, algorithm: str = "md5") -> str:
146
+ """Calculate hash of a file."""
147
+ h = hashlib.new(algorithm)
148
+ with open(filepath, "rb") as f:
149
+ for chunk in iter(lambda: f.read(8192), b""):
150
+ h.update(chunk)
151
+ return h.hexdigest()
152
+
153
+
154
+ def find_binaries(directory: Path | str, extensions: list[str] | None = None) -> list[Path]:
155
+ """Find all binary files in a directory."""
156
+ if extensions is None:
157
+ extensions = [".exe", ""] # Windows executables and Unix executables (no extension)
158
+
159
+ directory = Path(directory)
160
+ binaries = []
161
+
162
+ for ext in extensions:
163
+ if ext:
164
+ binaries.extend(directory.glob(f"**/*{ext}"))
165
+ else:
166
+ # Find files without extension that are executable
167
+ for item in directory.rglob("*"):
168
+ if item.is_file() and os.access(item, os.X_OK) and not item.suffix:
169
+ binaries.append(item)
170
+
171
+ return binaries
172
+
173
+
174
+ def should_exclude_lib_file(file_path: Path | str) -> bool:
175
+ """
176
+ Determine if a library file should be excluded to reduce size.
177
+
178
+ Excludes:
179
+ - Fortran runtime libraries (libflang_rt.*) - only needed for Fortran compilation
180
+ - hwasan_symbolize binary - debugging tool, not needed for compilation
181
+
182
+ Keeps:
183
+ - Headers (.h, .inc, .modulemap, .tcc)
184
+ - Runtime libraries (including sanitizers)
185
+ - Builtins
186
+ - Directory structures
187
+ """
188
+ file_path = Path(file_path)
189
+ name = file_path.name
190
+
191
+ # Always keep directories
192
+ if file_path.is_dir():
193
+ return False
194
+
195
+ # Always keep headers and text files
196
+ if file_path.suffix in {".h", ".inc", ".modulemap", ".tcc", ".txt"}:
197
+ return False
198
+
199
+ # Exclude Fortran runtime (27 MB) - not needed for C/C++
200
+ if "libflang_rt" in name:
201
+ return True
202
+
203
+ # Exclude hwasan_symbolize binary - debugging tool only
204
+ # Keep everything else (sanitizers, builtins, headers, etc.)
205
+ return "hwasan_symbolize" in name
206
+
207
+
208
+ # ============================================================================
209
+ # Step 1: Download
210
+ # ============================================================================
211
+
212
+
213
def download_llvm(platform: str, arch: str, work_dir: Path) -> Path:
    """Download the LLVM release asset for ``platform``/``arch`` into ``work_dir``.

    An existing file with the expected name is reused. If a
    ``<file>.downloading`` marker from an interrupted earlier run is found,
    the marker and any partial file are deleted first so the download
    starts over.

    Args:
        platform: Platform key used in ``LLVM_DOWNLOAD_URLS``.
        arch: Architecture key used in ``LLVM_DOWNLOAD_URLS``.
        work_dir: Directory that receives the download; created if missing.

    Returns:
        Path of the downloaded (or already present) file.

    Raises:
        ValueError: If ``(platform, arch)`` has no entry in ``LLVM_DOWNLOAD_URLS``.
    """
    print_section("STEP 1: DOWNLOAD LLVM BINARIES")

    key = (platform, arch)
    if key not in LLVM_DOWNLOAD_URLS:
        raise ValueError(f"Unsupported platform/arch combination: {platform}/{arch}")

    url = LLVM_DOWNLOAD_URLS[key]
    filename = Path(url).name  # last URL path segment is the asset file name

    # The marker file and the download itself are created inside work_dir,
    # so make sure it exists before touching either.
    work_dir = Path(work_dir)
    work_dir.mkdir(parents=True, exist_ok=True)

    download_path = work_dir / filename
    breadcrumb_path = Path(str(download_path) + ".downloading")

    print(f"Platform: {platform}")
    print(f"Architecture: {arch}")
    print(f"LLVM Version: {LLVM_VERSION}")
    print()

    # A leftover marker means the file on disk (if any) is incomplete.
    if breadcrumb_path.exists():
        print(f"⚠️  Found incomplete download marker: {breadcrumb_path.name}")
        if download_path.exists():
            print(f"Removing partial download: {download_path}")
            download_path.unlink()
        breadcrumb_path.unlink()
        print()

    if download_path.exists():
        print(f"File already exists: {download_path}")
        print("Skipping download...")
    else:
        download_file(url, download_path)

    print(f"\nDownloaded: {download_path}")
    print(f"Size: {download_path.stat().st_size / (1024*1024):.2f} MB")

    return download_path
250
+
251
+
252
+ # ============================================================================
253
+ # Step 2: Extract
254
+ # ============================================================================
255
+
256
+
257
def _extract_tar_with_progress(tar: tarfile.TarFile, extract_dir: Path, start: float) -> None:
    """Extract every member of an open tar into ``extract_dir`` with progress output.

    A progress line is printed whenever the extracted byte count crosses
    another 5% step or at least 2 seconds have passed since the last line.

    Args:
        tar: Tar archive opened for reading.
        extract_dir: Destination directory.
        start: ``time.time()`` value used as the origin for elapsed-time and
            throughput figures.
    """
    import time  # imported locally, as in the original extraction code

    members = tar.getmembers()
    total_size = sum(member.size for member in members)

    print(f"Found {len(members)} files/directories to extract ({total_size / (1024*1024):.1f} MB)")
    print()

    extracted_count = 0
    extracted_size = 0
    last_progress = -1
    last_update_time = start
    progress_counter = 0

    for member in members:
        # NOTE(review): members are extracted without a tarfile extraction
        # filter; the archive is assumed to be the trusted release asset.
        tar.extract(member, path=extract_dir)
        extracted_count += 1
        extracted_size += member.size

        current_time = time.time()
        # Percentage is based on data size (more meaningful than file count).
        progress = int((extracted_size / total_size) * 100) if total_size > 0 else 0
        time_since_update = current_time - last_update_time

        if (progress // 5 > last_progress // 5) or (time_since_update >= 2.0):
            elapsed = current_time - start
            mb_extracted = extracted_size / (1024 * 1024)
            mb_total = total_size / (1024 * 1024)
            mb_per_sec = mb_extracted / elapsed if elapsed > 0 else 0

            progress_counter += 1
            print(
                f"  [{progress_counter:3d}] Progress: {progress:3d}% "
                f"({mb_extracted:7.1f} / {mb_total:7.1f} MB) "
                f"- {mb_per_sec:6.1f} MB/s - {elapsed:5.1f}s elapsed",
                flush=True,
            )
            last_progress = progress
            last_update_time = current_time

    elapsed = time.time() - start
    print()
    print(f"Extracted {extracted_count} files ({extracted_size / (1024*1024):.1f} MB) in {elapsed:.1f}s")


def extract_archive(archive_path: Path, extract_dir: Path) -> Path:
    """Extract the downloaded archive into ``extract_dir``.

    Supported inputs, selected by file suffix:
    - ``.exe``: Windows installer, unpacked with the external ``7z`` tool
    - ``.xz``: tar.xz, unpacked with the system ``tar`` when available and
      otherwise (or if ``tar`` fails) with Python's ``lzma`` + ``tarfile``
    - ``.gz``: tar.gz, unpacked with Python's ``gzip`` + ``tarfile``

    Args:
        archive_path: Archive to unpack.
        extract_dir: Destination directory; created if missing.

    Returns:
        ``extract_dir`` as a ``Path``.

    Raises:
        RuntimeError: If a ``.exe`` archive is given and ``7z`` is missing or fails.
        ValueError: If the suffix is not one of the supported formats.
    """
    import time  # imported locally, as in the original extraction code

    print_section("STEP 2: EXTRACT ARCHIVE")

    archive_path = Path(archive_path)
    extract_dir = Path(extract_dir)

    print(f"Archive: {archive_path}")
    print(f"Extract to: {extract_dir}")
    print()

    extract_dir.mkdir(parents=True, exist_ok=True)

    if archive_path.suffix == ".exe":
        # Windows installer - need 7z or similar
        print("Windows .exe installer detected")
        print("Using 7z to extract...")

        try:
            subprocess.run(["7z", "x", str(archive_path), f"-o{extract_dir}", "-y"], check=True)
        except (subprocess.CalledProcessError, FileNotFoundError) as e:
            raise RuntimeError(
                "7z is required to extract Windows .exe installer.\n"
                "Install 7z: https://www.7-zip.org/\n"
                "Or provide pre-extracted binaries."
            ) from e

    elif archive_path.suffix == ".xz":
        print("Extracting tar.xz archive...")
        print()

        start = time.time()

        # Prefer the external tar command; fall back to Python if it is
        # unavailable or exits with an error.
        tar_available = shutil.which("tar") is not None

        if tar_available:
            print("Using system tar command for faster extraction...")
            print("NOTE: Progress tracking not available with external tar")
            print("      The process IS working - please wait (typically 30-90 seconds for LLVM)")
            print()
            print("Extracting...")
            sys.stdout.flush()
            try:
                subprocess.run(
                    ["tar", "-xJf", str(archive_path), "-C", str(extract_dir)],
                    check=True,
                    capture_output=True,
                    text=True,
                )
                elapsed = time.time() - start
                print(f"Extraction complete in {elapsed:.1f}s")
            except subprocess.CalledProcessError as e:
                print(f"External tar failed: {e.stderr}")
                print("Falling back to Python extraction...")
                tar_available = False

        if not tar_available:
            print("Using Python built-in extraction (slower but with progress tracking)...")
            print("Reading archive index (this may take a moment)...")
            import lzma

            with lzma.open(archive_path) as xz_file, tarfile.open(fileobj=xz_file) as tar:
                _extract_tar_with_progress(tar, extract_dir, start)

    elif archive_path.suffix == ".gz":
        print("Extracting tar.gz archive...")
        print()
        import gzip

        start = time.time()

        with gzip.open(archive_path, "rb") as gz_file, tarfile.open(fileobj=gz_file) as tar:
            _extract_tar_with_progress(tar, extract_dir, start)

    else:
        raise ValueError(f"Unsupported archive format: {archive_path.suffix}")

    print("Extraction complete!")
    return extract_dir
431
+
432
+
433
+ # ============================================================================
434
+ # Step 3: Strip Extras (Keep Only Essential Binaries)
435
+ # ============================================================================
436
+
437
+
438
def strip_extras(extracted_dir: Path, output_dir: Path, platform: str) -> Path:
    """Copy only the essential toolchain files from an extracted LLVM tree.

    Copies the binaries named in ``ESSENTIAL_BINARIES`` from the extracted
    ``bin`` directory into ``output_dir/bin``, then copies the first
    ``lib/clang`` directory found into ``output_dir/lib/clang`` while leaving
    out entries rejected by ``should_exclude_lib_file``.

    Args:
        extracted_dir: Root of the extracted LLVM distribution.
        output_dir: Destination root; ``bin`` and ``lib/clang`` are created below it.
        platform: ``"win"`` selects the ``.exe`` binary suffix; any other
            value means no suffix.

    Returns:
        ``output_dir`` as a ``Path``.

    Raises:
        RuntimeError: If no ``bin`` directory exists below ``extracted_dir``.
    """
    print_section("STEP 3: STRIP UNNECESSARY FILES")

    extracted_dir = Path(extracted_dir)
    output_dir = Path(output_dir)

    # Several directories named "bin" can exist (e.g. lib/clang/<ver>/bin).
    # Take the shallowest one so the choice does not depend on glob order.
    bin_dirs = [path for path in extracted_dir.glob("**/bin") if path.is_dir()]
    if not bin_dirs:
        raise RuntimeError(f"No bin directory found in {extracted_dir}")

    bin_dir = min(bin_dirs, key=lambda path: (len(path.parts), str(path)))
    print(f"Found bin directory: {bin_dir}")

    # Create output structure
    output_bin = output_dir / "bin"
    output_bin.mkdir(parents=True, exist_ok=True)

    # Determine binary extension
    ext = ".exe" if platform == "win" else ""

    kept_count = 0
    skipped_count = 0

    print("\nKeeping essential binaries:")
    # Sorted so the log is stable from run to run (the source is a set).
    for binary_name in sorted(ESSENTIAL_BINARIES):
        binary_file = bin_dir / f"{binary_name}{ext}"

        if binary_file.exists():
            shutil.copy2(binary_file, output_bin / binary_file.name)
            print(f"  ✓ {binary_file.name}")
            kept_count += 1
        else:
            print(f"  - {binary_name}{ext} (not found)")
            skipped_count += 1

    # Copy lib/clang (builtin headers and runtime libraries). Only entries
    # rejected by should_exclude_lib_file are filtered out.
    lib_clang_dir = next((path for path in extracted_dir.glob("**/lib/clang") if path.is_dir()), None)
    lib_clang_copied = False
    excluded_count = 0
    excluded_size = 0

    if lib_clang_dir is not None:
        lib_dst = output_dir / "lib" / "clang"
        print("\nCopying essential lib/clang files (filtering out optional libraries)...")
        print(f"Source: {lib_clang_dir}")
        print(f"Dest:   {lib_dst}")

        def ignore_optional_libs(directory: str, contents: list[str]) -> list[str]:
            """copytree ``ignore`` callback: names in ``directory`` to skip."""
            nonlocal excluded_count, excluded_size
            ignored = []
            for item in contents:
                item_path = Path(directory) / item
                if not should_exclude_lib_file(item_path):
                    continue
                # Only regular files contribute to the size statistics.
                if item_path.is_file():
                    size = item_path.stat().st_size
                    excluded_size_mb = size / (1024 * 1024)
                    print(f"  Excluding: {item_path.relative_to(lib_clang_dir)} ({excluded_size_mb:.1f} MB)")
                    excluded_count += 1
                    excluded_size += size
                ignored.append(item)
            return ignored

        shutil.copytree(lib_clang_dir, lib_dst, dirs_exist_ok=True, ignore=ignore_optional_libs)
        lib_clang_copied = True

    print("\nSummary:")
    print(f"  Kept: {kept_count} binaries")
    print(f"  Skipped: {skipped_count} binaries (not found)")
    if lib_clang_copied:
        print("  Copied lib/clang directory")
        if excluded_count > 0:
            print(f"  Excluded {excluded_count} optional files ({excluded_size / (1024*1024):.1f} MB)")
            print("  (Fortran runtime removed - not needed for C/C++ compilation)")

    return output_dir
525
+
526
+
527
+ # ============================================================================
528
+ # Step 3.5: Strip Linux Binaries (Remove Debug Symbols)
529
+ # ============================================================================
530
+
531
+
532
def strip_linux_binaries(bin_dir: Path, platform: str) -> None:
    """Strip symbols from the binaries in ``bin_dir`` to reduce their size.

    Runs ``llvm-strip --strip-all`` on every regular file directly inside
    ``bin_dir`` and prints per-file and total savings. A file that fails to
    strip is reported and skipped; it does not abort the run.

    The function does nothing unless ``platform == "linux"``. If no
    ``llvm-strip`` executable can be located (on ``PATH`` or in the common
    Windows install locations) a warning is printed and the binaries are
    left untouched.

    Args:
        bin_dir: Directory containing the binaries to strip in place.
        platform: Target platform of the binaries.
    """
    if platform != "linux":
        return  # Only strip Linux binaries

    print_section("STEP 3.5: STRIP DEBUG SYMBOLS FROM LINUX BINARIES")

    bin_dir = Path(bin_dir)

    # llvm-strip can process ELF files from any host, so also look in the
    # usual Windows install locations when it is not on PATH.
    llvm_strip = shutil.which("llvm-strip")
    if not llvm_strip:
        common_paths = [
            r"C:\Program Files\LLVM\bin\llvm-strip.exe",
            r"C:\Program Files (x86)\LLVM\bin\llvm-strip.exe",
        ]
        llvm_strip = next((path for path in common_paths if Path(path).exists()), None)

    if not llvm_strip:
        print("⚠️  llvm-strip not found - skipping binary stripping")
        print("   Install LLVM to enable stripping: https://llvm.org/")
        print("   Binaries will be larger but still functional")
        return

    print(f"Using: {llvm_strip}")
    print()

    binaries = sorted(path for path in bin_dir.glob("*") if path.is_file())

    if not binaries:
        print("No binaries found to strip")
        return

    print(f"Stripping {len(binaries)} binaries...")
    print()

    total_before = 0
    total_after = 0
    stripped_count = 0

    for binary in binaries:
        size_before = binary.stat().st_size
        size_before_mb = size_before / (1024 * 1024)

        try:
            # --strip-all gives the maximum size reduction.
            subprocess.run([llvm_strip, "--strip-all", str(binary)], check=True, capture_output=True, text=True)

            size_after = binary.stat().st_size
            size_after_mb = size_after / (1024 * 1024)
            saved = size_before - size_after
            saved_mb = saved / (1024 * 1024)
            percent = (saved / size_before * 100) if size_before > 0 else 0

            print(
                f"  ✓ {binary.name:30s} {size_before_mb:7.1f} MB → {size_after_mb:7.1f} MB (saved {saved_mb:5.1f} MB, {percent:4.1f}%)"
            )

            # Totals only include binaries that were actually stripped.
            total_before += size_before
            total_after += size_after
            stripped_count += 1

        except subprocess.CalledProcessError as e:
            print(f"  ✗ {binary.name:30s} - Failed to strip: {e.stderr}")
        except Exception as e:
            print(f"  ✗ {binary.name:30s} - Error: {e}")

    total_saved = total_before - total_after
    # total_before stays 0 when every strip failed (or all files are empty);
    # guard the division so the summary cannot crash.
    total_percent = (total_saved / total_before * 100) if total_before > 0 else 0

    print()
    print("Summary:")
    print(f"  Stripped: {stripped_count} binaries")
    print(f"  Total before: {total_before / (1024*1024):.2f} MB")
    print(f"  Total after:  {total_after / (1024*1024):.2f} MB")
    print(f"  Total saved:  {total_saved / (1024*1024):.2f} MB ({total_percent:.1f}%)")
621
+
622
+
623
+ # ============================================================================
624
+ # Step 4: Deduplicate (Create Manifest)
625
+ # ============================================================================
626
+
627
+
628
def deduplicate_binaries(bin_dir: Path) -> dict[str, Any]:
    """Identify byte-identical binaries and build a deduplication manifest.

    Every regular file directly inside ``bin_dir`` is hashed with MD5. Files
    sharing a hash form a group whose alphabetically first name becomes the
    canonical file.

    Args:
        bin_dir: Directory containing the binaries to analyse.

    Returns:
        A dict with three keys:
        - ``"manifest"``: file name -> canonical file name
        - ``"canonical_files"``: hash -> canonical file name
        - ``"stats"``: ``total_size``, ``deduped_size``, ``savings``,
          ``savings_percent`` and ``duplicate_count``
    """
    print_section("STEP 4: ANALYZE AND DEDUPLICATE BINARIES")

    bin_dir = Path(bin_dir)

    binaries = sorted(path for path in bin_dir.glob("*") if path.is_file())

    print(f"Found {len(binaries)} binary files")
    print("\nCalculating MD5 hashes...")

    # Group file names by content hash; identical content implies identical
    # size, so one size per hash is enough.
    hash_to_files: dict[str, list[str]] = {}
    hash_to_size: dict[str, int] = {}

    for binary in binaries:
        file_hash = get_file_hash(binary, "md5")
        hash_to_size.setdefault(file_hash, binary.stat().st_size)
        hash_to_files.setdefault(file_hash, []).append(binary.name)

    manifest: dict[str, str] = {}
    canonical_files: dict[str, str] = {}

    for file_hash, files in sorted(hash_to_files.items()):
        # First file (alphabetically) becomes canonical
        canonical = min(files)
        canonical_files[file_hash] = canonical
        for filename in files:
            manifest[filename] = canonical

    total_files = len(binaries)
    unique_files = len(hash_to_files)
    duplicate_count = total_files - unique_files

    total_size = sum(hash_to_size[h] * len(files) for h, files in hash_to_files.items())
    deduped_size = sum(hash_to_size.values())
    savings = total_size - deduped_size
    # An empty directory (or only empty files) gives total_size == 0; compute
    # the percentage once, guarded, and use it for both the report and stats.
    savings_percent = (savings / total_size * 100) if total_size > 0 else 0

    print("\nDeduplication Analysis:")
    print(f"  Total files: {total_files}")
    print(f"  Unique files: {unique_files}")
    print(f"  Duplicates: {duplicate_count}")
    print(f"  Total size: {total_size / (1024*1024):.1f} MB")
    print(f"  Deduplicated size: {deduped_size / (1024*1024):.1f} MB")
    print(f"  Space savings: {savings / (1024*1024):.1f} MB ({savings_percent:.1f}%)")

    if duplicate_count > 0:
        print("\nDuplicate groups:")
        for file_hash, files in sorted(hash_to_files.items()):
            if len(files) > 1:
                size_mb = hash_to_size[file_hash] / (1024 * 1024)
                print(f"  {len(files)} files @ {size_mb:.1f} MB each: {', '.join(sorted(files))}")

    manifest_data = {
        "manifest": manifest,
        "canonical_files": canonical_files,
        "stats": {
            "total_size": total_size,
            "deduped_size": deduped_size,
            "savings": savings,
            "savings_percent": savings_percent,
            "duplicate_count": duplicate_count,
        },
    }

    return manifest_data
705
+
706
+
707
+ # ============================================================================
708
+ # Step 5: Create Hard-Linked Structure
709
+ # ============================================================================
710
+
711
+
712
def create_hardlink_structure(manifest_data: dict[str, Any], source_bin_dir: Path, output_dir: Path) -> Path:
    """Materialise ``output_dir/bin`` from the deduplication manifest.

    For each manifest entry (processed in file-name order) the first entry
    that refers to a given canonical file is copied from ``source_bin_dir``;
    every later entry referring to the same canonical file is created as a
    hard link to that first copy. If hard linking fails with ``OSError`` the
    file is copied instead. Entries whose canonical source is missing are
    reported and skipped.

    Args:
        manifest_data: Mapping with a ``"manifest"`` key (file name -> canonical name).
        source_bin_dir: Directory holding the canonical source binaries.
        output_dir: Destination root; a ``bin`` directory is created below it.

    Returns:
        ``output_dir`` as a ``Path``.
    """
    print_section("STEP 5: CREATE HARD-LINKED STRUCTURE")

    source_root = Path(source_bin_dir)
    output_dir = Path(output_dir)

    name_to_canonical = manifest_data["manifest"]

    target_bin = output_dir / "bin"
    target_bin.mkdir(parents=True, exist_ok=True)

    # canonical name -> path of the first copy placed in target_bin
    first_copies = {}

    print("\nCreating hard-linked structure:")
    for filename in sorted(name_to_canonical):
        canonical_name = name_to_canonical[filename]
        origin = source_root / canonical_name
        destination = target_bin / filename

        if not origin.exists():
            print(f"  Warning: {canonical_name} not found")
            continue

        existing = first_copies.get(canonical_name)
        if existing is None:
            # First occurrence of this content: a real copy.
            shutil.copy2(origin, destination)
            first_copies[canonical_name] = destination
            print(f"  Copy:     {filename} <- {canonical_name}")
            continue

        # Later occurrence: link to the copy made earlier.
        print(f"  Hardlink: {filename} -> {existing.name}")
        try:
            if destination.exists():
                destination.unlink()
            os.link(existing, destination)
        except OSError:
            # Hard link failed, copy instead
            shutil.copy2(origin, destination)
            print("    (hard link failed, used copy)")

    return output_dir
757
+
758
+
759
+ # ============================================================================
760
+ # Step 6: Create TAR Archive
761
+ # ============================================================================
762
+
763
+
764
def create_tar_archive(source_dir: Path, output_tar: Path) -> Path:
    """Pack ``source_dir`` into an uncompressed tar at ``output_tar``.

    The tree is stored under its own directory name. While adding members,
    file modes are normalised: files under ``bin/`` outside ``lib/`` become
    0o755; under ``lib/``, headers/text/static libraries become 0o644, and
    shared libraries and nested ``bin/`` executables become 0o755. Other
    members keep their mode.

    Args:
        source_dir: Directory tree to archive.
        output_tar: Path of the tar file to write.

    Returns:
        ``output_tar`` as a ``Path``.
    """
    print_section("STEP 6: CREATE TAR ARCHIVE")

    source_dir, output_tar = Path(source_dir), Path(output_tar)

    print(f"Source: {source_dir}")
    print(f"Output: {output_tar}")
    print()

    # Suffixes under lib/ that must be readable but never executable.
    plain_suffixes = (".h", ".inc", ".modulemap", ".tcc", ".txt", ".a", ".syms")
    # Suffixes that disqualify a file in lib/**/bin/ from being a binary.
    non_binary_suffixes = (".h", ".inc", ".txt", ".a", ".so", ".dylib")

    def tar_filter(tarinfo: tarfile.TarInfo) -> tarfile.TarInfo:
        """Adjust the mode of regular files; pass everything else through."""
        if not tarinfo.isfile():
            return tarinfo

        member_name = tarinfo.name
        under_bin = "/bin/" in member_name
        under_lib = "/lib/" in member_name

        if under_bin and not under_lib:
            # Main bin/ directory: always executable.
            tarinfo.mode = 0o755  # rwxr-xr-x
            print(f"  Setting executable: {member_name}")
        elif under_lib:
            if member_name.endswith(plain_suffixes):
                # Checked first so e.g. a header never ends up executable.
                tarinfo.mode = 0o644  # rw-r--r--
            elif member_name.endswith((".so", ".dylib")) or ".so." in member_name:
                tarinfo.mode = 0o755  # rwxr-xr-x for shared libraries
                print(f"  Setting executable (shared lib): {member_name}")
            elif under_bin and not member_name.endswith(non_binary_suffixes):
                # Executables in lib/clang/*/bin/ directories.
                tarinfo.mode = 0o755  # rwxr-xr-x
                print(f"  Setting executable (lib binary): {member_name}")
        return tarinfo

    print("Creating tar archive using Python tarfile module...")
    print("Setting executable permissions for binaries in bin/...")
    with tarfile.open(output_tar, "w") as archive:
        archive.add(source_dir, arcname=source_dir.name, filter=tar_filter)

    archive_bytes = output_tar.stat().st_size
    print(f"\nCreated: {output_tar}")
    print(f"Size: {archive_bytes / (1024*1024):.2f} MB")

    return output_tar
809
+
810
+
811
def verify_tar_permissions(tar_file: Path) -> int:
    """Check that archive members carry the modes ``create_tar_archive`` assigns.

    Regular files are classified with the same rules as the archive filter:
    - under ``bin/`` and not under ``lib/``: must have the owner-execute bit
    - under ``lib/`` with a header/text/static-library suffix: must NOT have it
    - under ``lib/`` and a shared library (.so, .dylib, ``.so.`` in name): must have it
    - under ``lib/**/bin/`` without a data suffix: must have it

    Args:
        tar_file: Tar archive to inspect.

    Returns:
        Number of binaries plus shared libraries that were checked.

    Raises:
        RuntimeError: If any member has an unexpected mode.
    """
    print_section("STEP 6.5: VERIFY TAR PERMISSIONS")

    tar_file = Path(tar_file)

    print(f"Checking permissions in: {tar_file}")
    print()

    issues_found: list[tuple[str, str, str]] = []
    binaries_checked = 0
    libs_checked = 0
    headers_checked = 0

    non_executable_suffixes = (".h", ".inc", ".modulemap", ".tcc", ".txt", ".a", ".syms")
    non_binary_suffixes = (".h", ".inc", ".txt", ".a", ".so", ".dylib")

    with tarfile.open(tar_file, "r") as tar:
        for member in tar.getmembers():
            if not member.isfile():
                continue

            name = member.name
            mode_text = oct(member.mode)
            is_executable = bool(member.mode & 0o100)  # owner-execute bit

            # Main bin/ directory only. Paths that are also under lib/ are
            # handled by the lib rules below, mirroring the archive filter;
            # otherwise a header in lib/**/bin/ would be flagged wrongly.
            if "/bin/" in name and "/lib/" not in name:
                binaries_checked += 1
                if not is_executable:
                    issues_found.append((name, mode_text, "binary missing executable"))
                    print(f"  ✗ Missing executable permission: {name} (mode: {mode_text})")
                elif binaries_checked % 10 == 1:
                    # Only print every 10th binary to avoid spam
                    print(f"  ✓ bin: {name} (mode: {mode_text})")

            elif "/lib/" in name:
                # Headers and static libraries should NOT be executable (check this first)
                if name.endswith(non_executable_suffixes):
                    headers_checked += 1
                    if is_executable:
                        issues_found.append((name, mode_text, "header/static lib has executable bit"))
                        print(f"  ✗ Header/static lib should not be executable: {name} (mode: {mode_text})")

                # Shared libraries (.so, .dylib) should be executable
                elif name.endswith((".so", ".dylib")) or ".so." in name:
                    libs_checked += 1
                    if not is_executable:
                        issues_found.append((name, mode_text, "shared lib missing executable"))
                        print(f"  ✗ Shared lib missing executable: {name} (mode: {mode_text})")
                    elif libs_checked % 10 == 1:
                        print(f"  ✓ lib: {name} (mode: {mode_text})")

                # Executables in lib/**/bin/ (like *symbolize)
                elif "/bin/" in name and not name.endswith(non_binary_suffixes):
                    binaries_checked += 1
                    if not is_executable:
                        issues_found.append((name, mode_text, "lib binary missing executable"))
                        print(f"  ✗ Lib binary missing executable: {name} (mode: {mode_text})")

    print()
    print(f"Total binaries checked: {binaries_checked}")
    print(f"Total shared libraries checked: {libs_checked}")
    print(f"Total headers/text files checked: {headers_checked}")

    if issues_found:
        print(f"\n⚠️  WARNING: Found {len(issues_found)} files with incorrect permissions!")
        print("\nFiles with issues:")
        for name, mode, issue in issues_found:
            print(f"  - {name} (mode: {mode}) - {issue}")
        print("\nThese files may not work correctly when extracted on Unix systems.")
        raise RuntimeError(f"Tar archive has {len(issues_found)} files with incorrect permissions")

    print("✅ All files have correct permissions")

    return binaries_checked + libs_checked
886
+
887
+
888
+ # ============================================================================
889
+ # Step 7: Compress with ZSTD
890
+ # ============================================================================
891
+
892
+
893
def compress_with_zstd(tar_file: Path, output_zst: Path, level: int = 22) -> Path:
    """Compress ``tar_file`` into ``output_zst`` with zstd, streaming 1 MB chunks.

    The input is read chunk by chunk and fed to a ``zstandard`` stream writer,
    so the whole tar file is never held in memory. A progress line is printed
    whenever another 5% of the input has been read or 2 seconds have passed
    since the previous line, whichever happens first.

    Args:
        tar_file: Uncompressed tar archive to read.
        output_zst: Destination file for the compressed stream (overwritten).
        level: Zstd compression level handed to ``ZstdCompressor``.

    Returns:
        ``output_zst`` as a ``Path``.

    Raises:
        ImportError: If the ``zstandard`` module is not installed.
        BaseException: Anything raised while streaming (including
            ``KeyboardInterrupt``) is re-raised after the partially written
            output file has been removed.
    """
    print_section(f"STEP 7: COMPRESS WITH ZSTD LEVEL {level}")

    try:
        import zstandard as zstd
    except ImportError as e:
        raise ImportError("zstandard module required!\n" "Install with: pip install zstandard") from e

    import time

    tar_file = Path(tar_file)
    output_zst = Path(output_zst)

    file_size = tar_file.stat().st_size
    print(f"Input: {tar_file} ({file_size / (1024*1024):.2f} MB)")
    print(f"Output: {output_zst}")
    print(f"Level: {level}")
    print()

    print(f"Compressing {file_size / (1024*1024):.1f} MB (streaming mode - press Ctrl+C to cancel)...")
    print()

    start = time.time()
    finalize_start = start

    # Built before the guarded region: a bad level must not trigger the
    # partial-file cleanup below, because no output has been opened yet.
    cctx = zstd.ZstdCompressor(level=level, threads=-1)

    # 1 MB chunks keep the loop responsive to Ctrl+C (notably on Windows).
    chunk_size = 1 * 1024 * 1024
    bytes_read = 0
    last_progress = -1
    last_update_time = start
    progress_counter = 0

    try:
        with (
            open(tar_file, "rb") as ifh,
            open(output_zst, "wb") as ofh,
            cctx.stream_writer(ofh, closefd=False) as compressor,
        ):
            while chunk := ifh.read(chunk_size):
                compressor.write(chunk)
                bytes_read += len(chunk)

                current_time = time.time()
                # file_size can only be 0 here if the file grew after stat().
                progress = int((bytes_read / file_size) * 100) if file_size else 100
                time_since_update = current_time - last_update_time

                # Update every 5% OR every 2 seconds (whichever comes first)
                if (progress // 5 > last_progress // 5) or (time_since_update >= 2.0):
                    elapsed = current_time - start
                    mb_read = bytes_read / (1024 * 1024)
                    mb_total = file_size / (1024 * 1024)
                    mb_per_sec = mb_read / elapsed if elapsed > 0 else 0

                    # Plain newline-terminated lines work on every terminal.
                    progress_counter += 1
                    print(
                        f" [{progress_counter:3d}] Progress: {progress:3d}% "
                        f"({mb_read:7.1f} / {mb_total:7.1f} MB) "
                        f"- {mb_per_sec:6.1f} MB/s - {elapsed:5.1f}s elapsed",
                        flush=True,
                    )
                    last_progress = progress
                    last_update_time = current_time

            print()
            print(" Data read complete. Now finalizing compression...")
            print(" NOTE: Level 22 compression requires flushing buffers - this may take 30-60 seconds...")
            print(" (The process is NOT stalled, just working hard to achieve maximum compression)")
            print()
            finalize_start = time.time()
        # Leaving the ``with`` block closes the stream writer, which performs
        # the final flush; the time between the two stamps measures it.
    except BaseException as exc:
        # Never leave a truncated archive behind under the final artifact
        # name, whether the cause was Ctrl+C or an I/O / compression error.
        if isinstance(exc, KeyboardInterrupt):
            print("\n⚠️ Compression interrupted - cleaning up partial file...")
        else:
            print("\n⚠️ Compression failed - cleaning up partial file...")
        output_zst.unlink(missing_ok=True)
        raise

    finalize_elapsed = time.time() - finalize_start
    print(f" Finalization complete! ({finalize_elapsed:.1f}s)")
    print()

    elapsed = time.time() - start

    original_size = file_size
    compressed_size = output_zst.stat().st_size
    # Guard both divisions so an empty input cannot fail after a successful write.
    ratio = original_size / compressed_size if compressed_size else 0.0
    reduction = (1 - compressed_size / original_size) * 100 if original_size else 0.0

    print("Compression complete!")
    print(f" Total time: {elapsed:.1f}s")
    print(f" Reading: {elapsed - finalize_elapsed:.1f}s")
    print(f" Finalizing: {finalize_elapsed:.1f}s")
    print(f" Original: {original_size / (1024*1024):.2f} MB")
    print(f" Compressed: {compressed_size / (1024*1024):.2f} MB")
    print(f" Ratio: {ratio:.2f}:1")
    print(f" Reduction: {reduction:.1f}%")

    return output_zst
1003
+
1004
+
1005
+ # ============================================================================
1006
+ # Step 8: Generate Checksums
1007
+ # ============================================================================
1008
+
1009
+
1010
def generate_checksums(archive_path: Path) -> tuple[str, str]:
    """Write ``<archive>.sha256`` and ``<archive>.md5`` next to the archive.

    Each checksum file holds a single line of the form ``<digest> *<name>``.
    The files are written with explicit LF line endings and UTF-8 encoding, so
    a file produced on Windows does not pick up a CRLF terminator (which would
    make ``sha256sum -c`` / ``md5sum -c`` look for a file name ending in
    ``\\r``).

    Args:
        archive_path: Archive to hash.

    Returns:
        ``(sha256, md5)`` as returned by ``get_file_hash``.
    """
    print_section("STEP 8: GENERATE CHECKSUMS")

    archive_path = Path(archive_path)

    print(f"Generating checksums for: {archive_path.name}")
    print()

    # SHA256
    print("Calculating SHA256...")
    sha256 = get_file_hash(archive_path, "sha256")
    sha256_file = archive_path.parent / f"{archive_path.name}.sha256"
    with open(sha256_file, "w", encoding="utf-8", newline="\n") as f:
        f.write(f"{sha256} *{archive_path.name}\n")
    print(f" SHA256: {sha256}")
    print(f" Saved to: {sha256_file.name}")

    # MD5
    print("\nCalculating MD5...")
    md5 = get_file_hash(archive_path, "md5")
    md5_file = archive_path.parent / f"{archive_path.name}.md5"
    with open(md5_file, "w", encoding="utf-8", newline="\n") as f:
        f.write(f"{md5} *{archive_path.name}\n")
    print(f" MD5: {md5}")
    print(f" Saved to: {md5_file.name}")

    return sha256, md5
1038
+
1039
+
1040
+ # ============================================================================
1041
+ # Step 9: Split Archive (If Needed)
1042
+ # ============================================================================
1043
+
1044
+
1045
def split_archive(archive_path: Path, max_size_mb: int = 99) -> list[Path] | None:
    """
    Split archive into parts if it exceeds max_size_mb.

    Creates files like:
    - archive.tar.zst.part1
    - archive.tar.zst.part2
    - archive.tar.zst.join (bash script to join them back)
    - archive.tar.zst.join.py (Python script to join them back)

    Each part holds at most ``max_size_mb - 1`` MB. After the parts and both
    join scripts are written, the original archive is deleted.

    Args:
        archive_path: Path to the archive file
        max_size_mb: Maximum size in MB before splitting (default: 99)

    Returns:
        List of part files created, or None if no split needed

    Raises:
        ValueError: If a split is needed but ``max_size_mb`` leaves no room
            for a part after the 1 MB margin. Nothing is written or deleted
            in that case.
    """
    print_section(f"STEP 9: CHECK IF SPLIT NEEDED (max {max_size_mb} MB)")

    archive_path = Path(archive_path)
    size_mb = archive_path.stat().st_size / (1024 * 1024)

    print(f"Archive: {archive_path.name}")
    print(f"Size: {size_mb:.2f} MB")
    print(f"Limit: {max_size_mb} MB")
    print()

    if size_mb <= max_size_mb:
        print(f"✅ Archive is under {max_size_mb} MB - no split needed")
        return None

    # Calculate part size (slightly under max to account for overhead)
    part_size = int((max_size_mb - 1) * 1024 * 1024)  # Leave 1 MB margin
    if part_size <= 0:
        # read(0) would yield no parts and the archive would then be deleted;
        # a negative size would read the whole file into one oversized part.
        raise ValueError(f"max_size_mb must be greater than 1 to split an archive (got {max_size_mb})")

    print(f"⚠️ Archive exceeds {max_size_mb} MB - splitting into parts...")
    print()

    # Read and split
    parts = []

    with open(archive_path, "rb") as f:
        part_num = 1
        while chunk := f.read(part_size):
            part_name = f"{archive_path.name}.part{part_num}"
            part_path = archive_path.parent / part_name

            with open(part_path, "wb") as pf:
                pf.write(chunk)

            part_size_mb = len(chunk) / (1024 * 1024)
            print(f" Created: {part_name} ({part_size_mb:.2f} MB)")
            parts.append(part_path)
            part_num += 1

    # Create join script for convenience
    join_script_name = f"{archive_path.name}.join"
    join_script_path = archive_path.parent / join_script_name

    # The script bodies start at column 0 because they are written out verbatim.
    shell_script = f"""#!/bin/bash
# Join script for {archive_path.name}
# This script joins the split parts back into the original archive

echo "Joining {len(parts)} parts into {archive_path.name}..."

cat {' '.join(p.name for p in parts)} > {archive_path.name}

echo "Done! Created {archive_path.name}"
echo "Size: $(du -h {archive_path.name} | cut -f1)"
echo ""
echo "To extract:"
echo " tar --zstd -xf {archive_path.name}"
"""

    # LF endings so bash accepts the script even when generated on Windows.
    with open(join_script_path, "w", newline="\n") as f:
        f.write(shell_script)

    # Make it executable on Unix-like systems (best effort)
    import contextlib

    with contextlib.suppress(Exception):
        os.chmod(join_script_path, 0o755)

    # Also create Python join script for Windows
    py_script_name = f"{archive_path.name}.join.py"
    py_script_path = archive_path.parent / py_script_name

    # Doubled braces survive this f-string and become the generated script's
    # own f-string placeholders.
    python_script = f"""#!/usr/bin/env python3
\"\"\"Join script for {archive_path.name}\"\"\"
import sys
from pathlib import Path

parts = {[p.name for p in parts]}
output = "{archive_path.name}"

print(f"Joining {{len(parts)}} parts into {{output}}...")

try:
    with open(output, 'wb') as out:
        for part in parts:
            print(f" Adding {{part}}...")
            with open(part, 'rb') as inp:
                out.write(inp.read())

    size_mb = Path(output).stat().st_size / (1024 * 1024)
    print(f"\\nDone! Created {{output}} ({{size_mb:.2f}} MB)")
    print("\\nTo extract:")
    print(f" tar --zstd -xf {{output}}")

except Exception as e:
    print(f"Error: {{e}}", file=sys.stderr)
    sys.exit(1)
"""

    with open(py_script_path, "w") as f:
        f.write(python_script)

    print()
    print("Summary:")
    print(f" Created {len(parts)} parts")
    print(f" Total size: {size_mb:.2f} MB")
    print(f" Part size: ~{max_size_mb - 1} MB each")
    print()
    print("Join scripts created:")
    print(f" {join_script_name} (for Linux/Mac)")
    print(f" {py_script_name} (for Windows/cross-platform)")
    print()
    print("To rejoin:")
    print(f" bash {join_script_name}")
    print(" or")
    print(f" python {py_script_name}")

    # Remove original archive
    print()
    print(f"Removing original archive: {archive_path.name}")
    archive_path.unlink()

    return parts
1186
+
1187
+
1188
+ # ============================================================================
1189
+ # Main Pipeline
1190
+ # ============================================================================
1191
+
1192
+
1193
def main() -> None:
    """Parse the command line and run the fetch-and-archive pipeline.

    Steps, in order: download and extract (or reuse ``--source-dir``), strip
    extras, strip Linux binaries, deduplicate, build the hard-linked tree,
    copy ``lib/clang`` if present, create the tar, verify its permissions,
    compress with zstd, generate checksums, and split the archive if needed.
    Intermediate files are removed afterwards unless ``--keep-intermediate``
    is given.

    Exits with status 130 on ``KeyboardInterrupt`` (after removing the output
    archive if its path had already been chosen and the file exists) and with
    status 1 on any other exception, after printing a traceback.
    """
    parser = argparse.ArgumentParser(
        description="Fetch and archive LLVM/Clang toolchain binaries",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python -m clang_tool_chain.downloads.fetch_and_archive --platform win --arch x86_64
python -m clang_tool_chain.downloads.fetch_and_archive --platform linux --arch x86_64
python -m clang_tool_chain.downloads.fetch_and_archive --platform darwin --arch arm64

# Use existing extracted binaries:
python -m clang_tool_chain.downloads.fetch_and_archive --platform win --arch x86_64 --source-dir ./assets/win

Note: Press Ctrl+C at any time to safely interrupt the operation.
""",
    )

    parser.add_argument("--platform", required=True, choices=["win", "linux", "darwin"], help="Target platform")
    parser.add_argument("--arch", required=True, choices=["x86_64", "arm64"], help="Target architecture")
    parser.add_argument("--version", default=LLVM_VERSION, help=f"LLVM version (default: {LLVM_VERSION})")
    parser.add_argument("--source-dir", type=Path, help="Use existing extracted binaries instead of downloading")
    parser.add_argument(
        "--work-dir", type=Path, default=Path("work"), help="Working directory for temporary files (default: work)"
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=None,
        help="Output directory (default: downloads-bins/assets/clang/{platform}/{arch})",
    )
    parser.add_argument("--zstd-level", type=int, default=22, help="Zstd compression level (default: 22)")
    parser.add_argument("--keep-intermediate", action="store_true", help="Keep intermediate files (for debugging)")

    args = parser.parse_args()

    # Use version from args
    llvm_version = args.version

    # Setup directories
    work_dir = args.work_dir
    work_dir.mkdir(parents=True, exist_ok=True)

    output_dir = args.output_dir or Path("downloads-bins/assets/clang") / args.platform / args.arch
    output_dir.mkdir(parents=True, exist_ok=True)

    # Archive name
    archive_name = f"llvm-{llvm_version}-{args.platform}-{args.arch}"

    print("=" * 70)
    print("LLVM/Clang Toolchain Fetch and Archive")
    print("=" * 70)
    print(f"Platform: {args.platform}")
    print(f"Architecture: {args.arch}")
    print(f"Version: {llvm_version}")
    print(f"Output: {output_dir}/{archive_name}.tar.zst")
    print("=" * 70)
    print("\n💡 Tip: Press Ctrl+C at any time to safely interrupt the operation.\n")

    # Stays None until step 7 picks the output path, so the interrupt handler
    # can tell "not started yet" apart from "possibly half-written" without
    # inspecting locals().
    final_archive: Path | None = None

    try:
        # Step 1: Download (or use existing)
        if args.source_dir:
            print_section("STEP 1: USING EXISTING BINARIES")
            print(f"Source directory: {args.source_dir}")
            extracted_dir = args.source_dir
        else:
            archive_path = download_llvm(args.platform, args.arch, work_dir)

            # Step 2: Extract
            extracted_dir = extract_archive(archive_path, work_dir / "extracted")

        # Step 3: Strip extras
        stripped_dir = work_dir / "stripped"
        strip_extras(extracted_dir, stripped_dir, args.platform)

        # Step 3.5: Strip Linux binaries (remove debug symbols)
        strip_linux_binaries(stripped_dir / "bin", args.platform)

        # Step 4: Deduplicate
        manifest_data = deduplicate_binaries(stripped_dir / "bin")

        # Save manifest
        manifest_file = stripped_dir / "dedup_manifest.json"
        with open(manifest_file, "w") as f:
            json.dump(manifest_data, f, indent=2)
        print(f"\nManifest saved: {manifest_file}")

        # Step 5: Create hard-linked structure
        hardlinked_dir = work_dir / "hardlinked"
        create_hardlink_structure(manifest_data, stripped_dir / "bin", hardlinked_dir)

        # Copy lib/clang directory if it exists (builtin headers only)
        lib_clang_src = stripped_dir / "lib" / "clang"
        if lib_clang_src.exists():
            lib_dst = hardlinked_dir / "lib" / "clang"
            print("\nCopying lib/clang directory (builtin headers)...")
            shutil.copytree(lib_clang_src, lib_dst, dirs_exist_ok=True)

        # Step 6: Create TAR
        tar_file = work_dir / f"{archive_name}.tar"
        create_tar_archive(hardlinked_dir, tar_file)

        # Step 6.5: Verify permissions in TAR archive
        verify_tar_permissions(tar_file)

        # Step 7: Compress with ZSTD
        # Set the path before compressing so an interrupt during compression
        # still knows which file to clean up.
        final_archive = output_dir / f"{archive_name}.tar.zst"
        compress_with_zstd(tar_file, final_archive, level=args.zstd_level)

        # Step 8: Generate checksums
        sha256, md5 = generate_checksums(final_archive)

        # Step 9: Split if too large (before cleanup, so we can remove original)
        parts = split_archive(final_archive, max_size_mb=99)

        # Cleanup
        if not args.keep_intermediate:
            print_section("CLEANUP")
            print("Removing intermediate files...")
            if tar_file.exists():
                tar_file.unlink()
                print(f" Removed: {tar_file.name}")
            if not args.source_dir:  # Don't remove if using existing source
                for item in [work_dir / "extracted", work_dir / "stripped", work_dir / "hardlinked"]:
                    if item.exists():
                        shutil.rmtree(item)
                        print(f" Removed: {item}")

        # Final summary
        print_section("SUCCESS!")

        if parts:
            # Archive was split; final_archive itself no longer exists.
            print(f"Archive split into {len(parts)} parts:")
            for i, part in enumerate(parts, 1):
                size_mb = part.stat().st_size / (1024 * 1024)
                print(f" {i}. {part.name} ({size_mb:.2f} MB)")
            print()
            print("Join scripts:")
            print(f" {final_archive.name}.join (bash)")
            print(f" {final_archive.name}.join.py (python)")
            print()
            print("To rejoin and extract:")
            print(f" python {final_archive.name}.join.py")
            print(f" tar --zstd -xf {final_archive.name}")
        else:
            # Single archive
            print(f"Archive created: {final_archive}")
            print(f"Size: {final_archive.stat().st_size / (1024*1024):.2f} MB")
            print(f"SHA256: {sha256}")
            print(f"MD5: {md5}")
            print()
            print("Files created:")
            print(f" {final_archive.name}")
            print(f" {final_archive.name}.sha256")
            print(f" {final_archive.name}.md5")

        print()
        print("✅ Done!")

    except KeyboardInterrupt:
        print("\n\n" + "=" * 70)
        print("❌ OPERATION CANCELLED BY USER")
        print("=" * 70)
        print("\nInterrupted! Cleaning up...")
        # Only remove the archive if step 7 had already chosen its path.
        if final_archive is not None and final_archive.exists():
            print(f" Removing incomplete archive: {final_archive}")
            final_archive.unlink()
        sys.exit(130)  # Standard exit code for SIGINT

    except Exception as e:
        print(f"\n❌ Error: {e}", file=sys.stderr)
        import traceback

        traceback.print_exc()
        sys.exit(1)
1373
+
1374
+
1375
# Script entry point: main() returns None, so the exit status stays 0 on success.
if __name__ == "__main__":
    sys.exit(main())