mapillary-downloader 0.5.1__tar.gz → 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/PKG-INFO +19 -7
- {mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/README.md +18 -6
- {mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/pyproject.toml +1 -1
- {mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/__main__.py +4 -4
- {mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/downloader.py +15 -7
- {mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/tar_sequences.py +24 -33
- {mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/worker.py +8 -2
- {mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/worker_pool.py +6 -8
- {mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/LICENSE.md +0 -0
- {mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/__init__.py +0 -0
- {mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/client.py +0 -0
- {mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/exif_writer.py +0 -0
- {mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/ia_check.py +0 -0
- {mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/ia_meta.py +0 -0
- {mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/logging_config.py +0 -0
- {mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/metadata_reader.py +0 -0
- {mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/utils.py +0 -0
- {mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/webp_converter.py +0 -0
{mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mapillary_downloader
-Version: 0.5.1
+Version: 0.6.0
 Summary: Download your Mapillary data before it's gone
 Author-email: Gareth Davidson <gaz@bitplane.net>
 Requires-Python: >=3.10
@@ -66,8 +66,8 @@ mapillary-downloader --output ./downloads USERNAME1
 | `--quality` | 256, 1024, 2048 or original | `original` |
 | `--bbox` | `west,south,east,north` | `None` |
 | `--no-webp` | Don't convert to WebP | `False` |
-| `--workers`
-| `--no-tar` | Don't tar
+| `--max-workers` | Maximum number of parallel download workers | `128` |
+| `--no-tar` | Don't tar bucket directories | `False` |
 | `--no-check-ia` | Don't check if exists on Internet Archive | `False` |

 The downloader will:
@@ -98,11 +98,23 @@ To disable WebP conversion and keep original JPEGs, use `--no-webp`:
 mapillary-downloader --no-webp USERNAME
 ```

-##
+## Tarballs

-
-
-
+Images are organized by sequence ID, bucketed by the first character of the
+sequence to reduce directory count:
+
+```
+mapillary-username-quality/
+  a/
+    abc123/
+      image1.webp
+      image2.webp
+```
+
+By default, these bucket directories are automatically tarred after download
+(resulting in `a.tar`, `b.tar`, etc. - about 62 tar files total). This is done
+because large collections with millions of images would otherwise create hundreds
+of thousands of tiny tars, and anger the archive gods.

 To keep individual files instead of creating tars, use the `--no-tar` flag.
{mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/README.md
RENAMED
@@ -36,8 +36,8 @@ mapillary-downloader --output ./downloads USERNAME1
 | `--quality` | 256, 1024, 2048 or original | `original` |
 | `--bbox` | `west,south,east,north` | `None` |
 | `--no-webp` | Don't convert to WebP | `False` |
-| `--workers`
-| `--no-tar` | Don't tar
+| `--max-workers` | Maximum number of parallel download workers | `128` |
+| `--no-tar` | Don't tar bucket directories | `False` |
 | `--no-check-ia` | Don't check if exists on Internet Archive | `False` |

 The downloader will:
@@ -68,11 +68,23 @@ To disable WebP conversion and keep original JPEGs, use `--no-webp`:
 mapillary-downloader --no-webp USERNAME
 ```

-##
+## Tarballs

-
-
-
+Images are organized by sequence ID, bucketed by the first character of the
+sequence to reduce directory count:
+
+```
+mapillary-username-quality/
+  a/
+    abc123/
+      image1.webp
+      image2.webp
+```
+
+By default, these bucket directories are automatically tarred after download
+(resulting in `a.tar`, `b.tar`, etc. - about 62 tar files total). This is done
+because large collections with millions of images would otherwise create hundreds
+of thousands of tiny tars, and anger the archive gods.

 To keep individual files instead of creating tars, use the `--no-tar` flag.
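A minimal sketch of the bucketing scheme the new README section describes, assuming alphanumeric sequence IDs (`bucketed_path` is a hypothetical helper for illustration; the real logic lives in `worker.py` below):

```python
from pathlib import Path

def bucketed_path(collection_dir, sequence_id, filename):
    """Derive the on-disk location for one image under the bucket layout.

    Sequence IDs are alphanumeric, so keying on the first character caps
    the top-level directory count at roughly 62 (a-z, A-Z, 0-9) no matter
    how many sequences a user has.
    """
    bucket = sequence_id[0]  # single-character bucket directory
    return Path(collection_dir) / bucket / sequence_id / filename

# bucketed_path("mapillary-username-quality", "abc123", "image1.webp")
# -> mapillary-username-quality/a/abc123/image1.webp
```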
{mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/__main__.py
RENAMED
@@ -41,10 +41,10 @@ def main():
         help="Don't convert to WebP (WebP conversion is enabled by default, saves ~70%% disk space)",
     )
     parser.add_argument(
-        "--workers",
+        "--max-workers",
         type=int,
-        default=
-        help="
+        default=128,
+        help="Maximum number of parallel workers (default: 128)",
     )
     parser.add_argument(
         "--no-tar",
@@ -114,7 +114,7 @@ def main():
         args.output,
         username,
         args.quality,
-
+        max_workers=args.max_workers,
         tar_sequences=not args.no_tar,
         convert_webp=convert_webp,
         check_ia=not args.no_check_ia,
{mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/downloader.py
RENAMED
@@ -45,7 +45,7 @@ class MapillaryDownloader:
         output_dir,
         username=None,
         quality=None,
-
+        max_workers=128,
         tar_sequences=True,
         convert_webp=False,
         check_ia=True,
@@ -57,7 +57,7 @@ class MapillaryDownloader:
             output_dir: Base directory to save downloads (final destination)
             username: Mapillary username (for collection directory)
             quality: Image quality (for collection directory)
-
+            max_workers: Maximum number of parallel workers (default: 128)
             tar_sequences: Whether to tar sequence directories after download (default: True)
             convert_webp: Whether to convert images to WebP (affects collection name)
             check_ia: Whether to check if collection exists on Internet Archive (default: True)
@@ -66,7 +66,8 @@ class MapillaryDownloader:
         self.base_output_dir = Path(output_dir)
         self.username = username
         self.quality = quality
-        self.
+        self.max_workers = max_workers
+        self.initial_workers = os.cpu_count() or 1  # Start with CPU count
         self.tar_sequences = tar_sequences
         self.convert_webp = convert_webp
         self.check_ia = check_ia
@@ -177,7 +178,7 @@ class MapillaryDownloader:
         logger.info(f"Downloading images for user: {self.username}")
         logger.info(f"Output directory: {self.output_dir}")
         logger.info(f"Quality: {self.quality}")
-        logger.info(f"
+        logger.info(f"Worker pool: {self.initial_workers} initial, {self.max_workers} max")

         start_time = time.time()

@@ -191,8 +192,10 @@ class MapillaryDownloader:

         # Step 2: Start worker pool
         # Since workers do both I/O (download) and CPU (WebP), need many more workers
-        #
-        pool = AdaptiveWorkerPool(
+        # Start with CPU count and scale up based on throughput
+        pool = AdaptiveWorkerPool(
+            worker_process, min_workers=self.initial_workers, max_workers=self.max_workers, monitoring_interval=10
+        )
        pool.start()

         # Step 3: Download images from metadata file while fetching new from API
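The comment in this hunk carries the sizing rationale: workers block on network I/O as well as CPU-bound WebP conversion, so the pool starts at CPU count and grows while throughput keeps improving. A hypothetical sketch of such a scale-up heuristic (names and thresholds are illustrative, not the actual `AdaptiveWorkerPool` internals):

```python
def next_worker_count(current, maximum, prev_tput, cur_tput,
                      growth=1.5, min_gain=1.05):
    """Decide the next pool size from two successive throughput samples."""
    if current >= maximum:
        return current                                   # hard cap from --max-workers
    if prev_tput is None or cur_tput > prev_tput * min_gain:
        return min(int(current * growth) + 1, maximum)   # still improving: grow
    return current                                       # plateaued: hold steady
```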
@@ -247,8 +250,9 @@
         # Helper to process results from queue
         def process_results():
             nonlocal downloaded_count, total_bytes, failed_count
+            # Drain ALL available results to prevent queue from filling up
             while True:
-                result = pool.get_result(timeout=0
+                result = pool.get_result(timeout=0)  # Non-blocking
                 if result is None:
                     break
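With `timeout=0` the fetch no longer blocks, so each call to `process_results()` drains whatever has accumulated and returns immediately, keeping the result queue from backing up while the main loop keeps feeding work. The same pattern with a bare `multiprocessing.Queue`, as a sketch:

```python
import queue

def drain(result_queue, handle):
    """Consume every result already in the queue without blocking."""
    while True:
        try:
            item = result_queue.get_nowait()  # non-blocking get
        except queue.Empty:
            return                            # caught up; caller retries later
        handle(item)
```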
@@ -379,6 +383,10 @@

                 last_position = f.tell()

+                # If API is already complete, we've read the whole file, so break
+                if api_fetch_complete is None:
+                    break
+
                 # Sleep briefly before next tail iteration
                 time.sleep(0.1)
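Context for this hunk: the download loop tails the metadata file while the API fetcher is still appending to it, and the new guard exits the tail once the producer is finished and the reader has caught up. The general tail-a-growing-file pattern looks roughly like this (a hypothetical helper, not the module's code):

```python
import time

def tail(path, producer_done):
    """Yield lines from a file that another task is still appending to."""
    with open(path) as f:
        while True:
            line = f.readline()
            if line:
                yield line
            elif producer_done():  # writer finished and we've caught up
                return
            else:
                time.sleep(0.1)    # brief pause before polling again
```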
{mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/tar_sequences.py
RENAMED
@@ -23,51 +23,43 @@ def tar_sequence_directories(collection_dir):
         logger.error(f"Collection directory not found: {collection_dir}")
         return 0, 0

-    # Find all
+    # Find all bucket directories (skip special dirs)
+    # Now we tar entire bucket dirs (e.g., a/, b/, etc) to get ~62 tar files
     skip_dirs = {".meta", "__pycache__"}
-
+    bucket_dirs = []

     for item in collection_dir.iterdir():
         if item.is_dir() and item.name not in skip_dirs:
-
+            # Check if this is a bucket dir (single char)
+            if len(item.name) == 1:
+                bucket_dirs.append(item)

-    if not
-    logger.info("No
+    if not bucket_dirs:
+        logger.info("No bucket directories to tar")
         return 0, 0

-    logger.info(f"Tarring {len(
+    logger.info(f"Tarring {len(bucket_dirs)} bucket directories...")

     tarred_count = 0
     total_files = 0
     total_tar_bytes = 0

-    for
-
-        tar_path = collection_dir / f"{
+    for bucket_dir in bucket_dirs:
+        bucket_name = bucket_dir.name
+        tar_path = collection_dir / f"{bucket_name}.tar"

-        #
-
-
-            counter += 1
-            tar_path = collection_dir / f"{seq_name}.{counter}.tar"
-
-        # Count files in sequence
-        files = list(seq_dir.glob("*"))
-        file_count = len([f for f in files if f.is_file()])
+        # Count files in bucket
+        files_to_tar = sorted([f for f in bucket_dir.rglob("*") if f.is_file()], key=lambda x: str(x))
+        file_count = len(files_to_tar)

         if file_count == 0:
-            logger.warning(f"Skipping empty directory: {
+            logger.warning(f"Skipping empty bucket directory: {bucket_name}")
             continue

         try:
-
-            # Sort files by name for deterministic ordering
-            files_to_tar = sorted([f for f in seq_dir.rglob("*") if f.is_file()], key=lambda x: x.name)
-
-            if not files_to_tar:
-                logger.warning(f"Skipping directory with no files: {seq_name}")
-                continue
+            logger.info(f"Tarring bucket '{bucket_name}' ({file_count} files)...")

+            # Create reproducible uncompressed tar (WebP already compressed)
             with tarfile.open(tar_path, "w") as tar:
                 for file_path in files_to_tar:
                     # Get path relative to collection_dir for tar archive
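Two details in this hunk keep the archives deterministic: files are now sorted by their full path rather than just the basename (which could collide across subdirectories), and the tar is written without compression since WebP data will not shrink further. Condensed into a standalone sketch (`tar_bucket` is a hypothetical name; the real function is `tar_sequence_directories`):

```python
import tarfile
from pathlib import Path

def tar_bucket(bucket_dir, tar_path, collection_dir):
    """Pack one bucket directory into an uncompressed, reproducible tar."""
    files = sorted((f for f in Path(bucket_dir).rglob("*") if f.is_file()),
                   key=lambda p: str(p))        # stable, path-based ordering
    with tarfile.open(tar_path, "w") as tar:    # "w" = plain tar, no gzip
        for f in files:
            # Store members relative to the collection root, as the diff does
            tar.add(f, arcname=str(f.relative_to(collection_dir)))
```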
@@ -92,33 +84,32 @@ def tar_sequence_directories(collection_dir):
                 tar_size = tar_path.stat().st_size
                 total_tar_bytes += tar_size

-                # Remove original directory
-                for file in
+                # Remove original bucket directory
+                for file in bucket_dir.rglob("*"):
                     if file.is_file():
                         file.unlink()

                 # Remove empty subdirs and main dir
-                for subdir in list(
+                for subdir in list(bucket_dir.rglob("*")):
                     if subdir.is_dir():
                         try:
                             subdir.rmdir()
                         except OSError:
                             pass  # Not empty yet

-
+                bucket_dir.rmdir()

                 tarred_count += 1
                 total_files += file_count

-
-                logger.info(f"Tarred {tarred_count}/{len(sequence_dirs)} sequences...")
+                logger.info(f"Tarred bucket '{bucket_name}': {file_count:,} files, {format_size(tar_size)}")
             else:
                 logger.error(f"Tar file empty or not created: {tar_path}")
                 if tar_path.exists():
                     tar_path.unlink()

         except Exception as e:
-            logger.error(f"Error tarring {
+            logger.error(f"Error tarring bucket {bucket_name}: {e}")
             if tar_path.exists():
                 tar_path.unlink()
{mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/worker.py
RENAMED
@@ -1,6 +1,7 @@
 """Worker process for parallel image download and conversion."""

 import os
+import signal
 import tempfile
 from pathlib import Path
 import requests
@@ -17,6 +18,9 @@ def worker_process(work_queue, result_queue, worker_id):
         result_queue: Queue to push results to
         worker_id: Unique worker identifier
     """
+    # Ignore SIGINT in worker process - parent will handle it
+    signal.signal(signal.SIGINT, signal.SIG_IGN)
+
     # Create session once per worker (reuse HTTP connections)
     session = requests.Session()

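Ignoring SIGINT in the workers means a Ctrl-C reaches only the parent process, which can then wind the pool down deliberately instead of every worker dying mid-download. A self-contained demonstration of the pattern (illustrative, not the package's code):

```python
import signal
import time
from multiprocessing import Process

def child():
    # As in worker_process: the child opts out of Ctrl-C handling entirely.
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    time.sleep(30)  # stand-in for download/convert work

if __name__ == "__main__":
    p = Process(target=child)
    p.start()
    try:
        p.join()
    except KeyboardInterrupt:
        # Only the parent lands here; the child ignored the signal.
        p.terminate()
        p.join(timeout=2)
```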
@@ -65,11 +69,13 @@ def download_and_convert_image(image_data, output_dir, quality, convert_webp, se
     if not image_url:
         return (image_id, 0, False, f"No {quality} URL")

-    # Determine final output directory
+    # Determine final output directory - organize by first char of sequence ID
     output_dir = Path(output_dir)
     sequence_id = image_data.get("sequence")
     if sequence_id:
-
+        # Use first character as bucket (gives us ~62 dirs instead of millions)
+        first_char = sequence_id[0]
+        img_dir = output_dir / first_char / sequence_id
         img_dir.mkdir(parents=True, exist_ok=True)
     else:
         img_dir = output_dir
{mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/worker_pool.py
RENAMED
@@ -185,20 +185,18 @@ class AdaptiveWorkerPool:
         else:
             logger.info(f"At optimal worker count: {current_workers} workers, {current_throughput:.1f} items/s")

-    def shutdown(self, timeout=
+    def shutdown(self, timeout=2):
         """Shutdown the worker pool gracefully."""
         logger.info("Shutting down worker pool...")
         self.running = False

-        #
-        for _ in self.workers:
-            self.work_queue.put(None)
-
-        # Wait for workers to finish
+        # Terminate all workers immediately (they ignore SIGINT so we need to be forceful)
         for p in self.workers:
-            p.join(timeout=timeout)
             if p.is_alive():
-                logger.warning(f"Worker {p.pid} did not exit cleanly, terminating")
                 p.terminate()

+        # Give them a brief moment to exit
+        for p in self.workers:
+            p.join(timeout=timeout)
+
         logger.info("Worker pool shutdown complete")
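The old shutdown queued a `None` sentinel per worker, joined with the full timeout, and only then terminated stragglers; since the workers now ignore SIGINT, the new order terminates immediately and joins briefly afterwards. The same two-phase shape in isolation, assuming `workers` is a list of `multiprocessing.Process`:

```python
def stop_workers(workers, timeout=2):
    """Terminate-then-join shutdown, mirroring the new shutdown() ordering."""
    for p in workers:
        if p.is_alive():
            p.terminate()            # forceful stop; workers ignore SIGINT
    for p in workers:
        p.join(timeout=timeout)      # bounded reap so shutdown cannot hang
```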
{mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/LICENSE.md
RENAMED
File without changes
{mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/__init__.py
RENAMED
File without changes
{mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/client.py
RENAMED
File without changes
{mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/exif_writer.py
RENAMED
File without changes
{mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/ia_check.py
RENAMED
File without changes
{mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/ia_meta.py
RENAMED
File without changes
{mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/logging_config.py
RENAMED
File without changes
{mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/metadata_reader.py
RENAMED
File without changes
{mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/utils.py
RENAMED
File without changes
{mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/webp_converter.py
RENAMED
File without changes