PyPI - mapillary-downloader - Versions diffs - 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl - Mend

mapillary-downloader 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

mapillary_downloader/__main__.py CHANGED Viewed

@@ -41,10 +41,10 @@ def main():
         help="Don't convert to WebP (WebP conversion is enabled by default, saves ~70%% disk space)",
     )
     parser.add_argument(
-        "--workers",
+        "--max-workers",
         type=int,
-        default=None,
-        help="Number of parallel workers (default: half of CPU cores)",
+        default=128,
+        help="Maximum number of parallel workers (default: 128)",
     )
     parser.add_argument(
         "--no-tar",
@@ -114,7 +114,7 @@ def main():
                 args.output,
                 username,
                 args.quality,
-                workers=args.workers,
+                max_workers=args.max_workers,
                 tar_sequences=not args.no_tar,
                 convert_webp=convert_webp,
                 check_ia=not args.no_check_ia,

mapillary_downloader/downloader.py CHANGED Viewed

@@ -45,7 +45,7 @@ class MapillaryDownloader:
         output_dir,
         username=None,
         quality=None,
-        workers=None,
+        max_workers=128,
         tar_sequences=True,
         convert_webp=False,
         check_ia=True,
@@ -57,7 +57,7 @@ class MapillaryDownloader:
             output_dir: Base directory to save downloads (final destination)
             username: Mapillary username (for collection directory)
             quality: Image quality (for collection directory)
-            workers: Number of parallel workers (default: half of cpu_count)
+            max_workers: Maximum number of parallel workers (default: 128)
             tar_sequences: Whether to tar sequence directories after download (default: True)
             convert_webp: Whether to convert images to WebP (affects collection name)
             check_ia: Whether to check if collection exists on Internet Archive (default: True)
@@ -66,7 +66,8 @@ class MapillaryDownloader:
         self.base_output_dir = Path(output_dir)
         self.username = username
         self.quality = quality
-        self.workers = workers if workers is not None else max(1, os.cpu_count() // 2)
+        self.max_workers = max_workers
+        self.initial_workers = os.cpu_count() or 1  # Start with CPU count
         self.tar_sequences = tar_sequences
         self.convert_webp = convert_webp
         self.check_ia = check_ia
@@ -177,26 +178,28 @@ class MapillaryDownloader:
         logger.info(f"Downloading images for user: {self.username}")
         logger.info(f"Output directory: {self.output_dir}")
         logger.info(f"Quality: {self.quality}")
-        logger.info(f"Using {self.workers} parallel workers")
+        logger.info(f"Worker pool: {self.initial_workers} initial, {self.max_workers} max")
         start_time = time.time()
-        # Step 1: Build seen_ids from metadata file (streaming, only IDs)
-        logger.info("Building seen_ids from metadata...")
+        # Step 1: Check if API fetch is already complete
         reader = MetadataReader(self.metadata_file)
-        seen_ids = reader.get_all_ids()
         api_complete = reader.is_complete
-        logger.info(f"Found {len(seen_ids)} existing images in metadata")
+        if api_complete:
+            logger.info("API fetch already complete, will only download")
+        else:
+            logger.info("API fetch incomplete, will fetch and download in parallel")
-        # Step 2: Start worker pool (fork AFTER building seen_ids, BEFORE downloading)
+        # Step 2: Start worker pool
+        # Since workers do both I/O (download) and CPU (WebP), need many more workers
+        # Start with CPU count and scale up based on throughput
         pool = AdaptiveWorkerPool(
-            worker_process, min_workers=max(1, self.workers // 2), max_workers=self.workers, monitoring_interval=30
+            worker_process, min_workers=self.initial_workers, max_workers=self.max_workers, monitoring_interval=10
         )
         pool.start()
-        # Step 3: Download images from existing metadata while fetching new from API
+        # Step 3: Download images from metadata file while fetching new from API
         downloaded_count = 0
-        skipped = 0
         total_bytes = 0
         failed_count = 0
         submitted = 0
@@ -218,25 +221,18 @@ class MapillaryDownloader:
                         logger.info("API fetch thread: Starting...")
                         with open(self.metadata_file, "a") as meta_f:
                             for image in self.client.get_user_images(self.username, bbox=bbox):
-                                image_id = image["id"]
-                                # Skip if we already have this in our metadata file
-                                if image_id in seen_ids:
-                                    continue
-                                seen_ids.add(image_id)
                                 new_images_count[0] += 1
-                                # Save new metadata
+                                # Save metadata (don't dedupe here, let the tailer handle it)
                                 meta_f.write(json.dumps(image) + "\n")
                                 meta_f.flush()
                                 if new_images_count[0] % 1000 == 0:
-                                    logger.info(f"API: Fetched {new_images_count[0]} new images from API")
+                                    logger.info(f"API: Fetched {new_images_count[0]} images from API")
                             # Mark as complete
                             MetadataReader.mark_complete(self.metadata_file)
-                            logger.info(f"API fetch complete: {new_images_count[0]} new images")
+                            logger.info(f"API fetch complete: {new_images_count[0]} images")
                     finally:
                         api_fetch_complete.set()
@@ -254,8 +250,9 @@ class MapillaryDownloader:
             # Helper to process results from queue
             def process_results():
                 nonlocal downloaded_count, total_bytes, failed_count
+                # Drain ALL available results to prevent queue from filling up
                 while True:
-                    result = pool.get_result(timeout=0.001)
+                    result = pool.get_result(timeout=0)  # Non-blocking
                     if result is None:
                         break
@@ -386,6 +383,10 @@ class MapillaryDownloader:
                         last_position = f.tell()
+                # If API is already complete, we've read the whole file, so break
+                if api_fetch_complete is None:
+                    break
                 # Sleep briefly before next tail iteration
                 time.sleep(0.1)
@@ -438,14 +439,7 @@ class MapillaryDownloader:
         self._save_progress()
         elapsed = time.time() - start_time
-        # Count total images in metadata
-        total_images = len(seen_ids)
-        skipped = total_images - downloaded_count - failed_count
-        logger.info(
-            f"Complete! Total {total_images} images, downloaded {downloaded_count} ({format_size(total_bytes)}), "
-            f"skipped {skipped}, failed {failed_count}"
-        )
+        logger.info(f"Complete! Downloaded {downloaded_count} ({format_size(total_bytes)}), " f"failed {failed_count}")
         logger.info(f"Total time: {format_time(elapsed)}")
         # Tar sequence directories for efficient IA uploads

mapillary_downloader/metadata_reader.py CHANGED Viewed

@@ -23,7 +23,47 @@ class MetadataReader:
             metadata_file: Path to metadata.jsonl or metadata.jsonl.gz
         """
         self.metadata_file = Path(metadata_file)
-        self.is_complete = False
+        self.is_complete = self._check_complete()
+    def _check_complete(self):
+        """Check if metadata file has completion marker.
+        Returns:
+            True if completion marker found, False otherwise
+        """
+        if not self.metadata_file.exists():
+            return False
+        # Check last few lines for completion marker (it should be at the end)
+        try:
+            if self.metadata_file.suffix == ".gz":
+                file_handle = gzip.open(self.metadata_file, "rt")
+            else:
+                file_handle = open(self.metadata_file)
+            with file_handle as f:
+                # Read last 10 lines to find completion marker
+                lines = []
+                for line in f:
+                    lines.append(line)
+                    if len(lines) > 10:
+                        lines.pop(0)
+                # Check if any of the last lines is the completion marker
+                for line in reversed(lines):
+                    line = line.strip()
+                    if not line:
+                        continue
+                    try:
+                        data = json.loads(line)
+                        if data.get("__complete__"):
+                            return True
+                    except json.JSONDecodeError:
+                        continue
+            return False
+        except Exception:
+            return False
     def iter_images(self, quality_field=None, downloaded_ids=None):
         """Stream images from metadata file with filtering.

mapillary_downloader/worker.py CHANGED Viewed

@@ -1,6 +1,7 @@
 """Worker process for parallel image download and conversion."""
 import os
+import signal
 import tempfile
 from pathlib import Path
 import requests
@@ -17,6 +18,12 @@ def worker_process(work_queue, result_queue, worker_id):
         result_queue: Queue to push results to
         worker_id: Unique worker identifier
     """
+    # Ignore SIGINT in worker process - parent will handle it
+    signal.signal(signal.SIGINT, signal.SIG_IGN)
+    # Create session once per worker (reuse HTTP connections)
+    session = requests.Session()
     while True:
         work_item = work_queue.get()
@@ -27,14 +34,17 @@ def worker_process(work_queue, result_queue, worker_id):
         # Unpack work item
         image_data, output_dir, quality, convert_webp, access_token = work_item
+        # Update session auth for this request
+        session.headers.update({"Authorization": f"OAuth {access_token}"})
         # Process the image
-        result = download_and_convert_image(image_data, output_dir, quality, convert_webp, access_token)
+        result = download_and_convert_image(image_data, output_dir, quality, convert_webp, session)
         # Push result back
         result_queue.put(result)
-def download_and_convert_image(image_data, output_dir, quality, convert_webp, access_token):
+def download_and_convert_image(image_data, output_dir, quality, convert_webp, session):
     """Download and optionally convert a single image.
     This function is designed to run in a worker process.
@@ -44,7 +54,7 @@ def download_and_convert_image(image_data, output_dir, quality, convert_webp, ac
         output_dir: Base output directory path
         quality: Quality level (256, 1024, 2048, original)
         convert_webp: Whether to convert to WebP
-        access_token: Mapillary API access token
+        session: requests.Session with auth already configured
     Returns:
         Tuple of (image_id, bytes_downloaded, success, error_msg)
@@ -78,11 +88,7 @@ def download_and_convert_image(image_data, output_dir, quality, convert_webp, ac
             jpg_path = img_dir / f"{image_id}.jpg"
             final_path = jpg_path
-        # Download image
-        # No retries for CDN images - they're cheap, just skip failures and move on
-        session = requests.Session()
-        session.headers.update({"Authorization": f"OAuth {access_token}"})
+        # Download image (using session passed from worker)
         bytes_downloaded = 0
         try:

mapillary_downloader/worker_pool.py CHANGED Viewed

@@ -17,7 +17,7 @@ class AdaptiveWorkerPool:
     - If throughput plateauing/decreasing: reduce workers
     """
-    def __init__(self, worker_func, min_workers=4, max_workers=16, monitoring_interval=30):
+    def __init__(self, worker_func, min_workers=4, max_workers=16, monitoring_interval=10):
         """Initialize adaptive worker pool.
         Args:
@@ -37,10 +37,11 @@ class AdaptiveWorkerPool:
         # Worker management
         self.workers = []
-        self.current_workers = min_workers
+        self.current_workers = min_workers  # Start small and ramp up
         # Throughput monitoring
         self.throughput_history = deque(maxlen=5)  # Last 5 measurements
+        self.worker_count_history = deque(maxlen=5)  # Track worker counts at each measurement
         self.last_processed = 0
         self.last_check_time = time.time()
@@ -86,51 +87,116 @@ class AdaptiveWorkerPool:
         elapsed = now - self.last_check_time
         if elapsed < self.monitoring_interval:
+            logger.debug(f"Throughput check skipped (elapsed {elapsed:.1f}s < {self.monitoring_interval}s)")
             return
         # Calculate current throughput (items/sec)
         items_since_check = total_processed - self.last_processed
         throughput = items_since_check / elapsed
+        current_workers = len(self.workers)
         self.throughput_history.append(throughput)
+        self.worker_count_history.append(current_workers)
         self.last_processed = total_processed
         self.last_check_time = now
-        # Need at least 3 measurements to detect trends
-        if len(self.throughput_history) < 3:
+        logger.info(
+            f"Throughput: {throughput:.1f} items/s (workers: {current_workers}/{self.max_workers}, "
+            f"history: {len(self.throughput_history)} measurements)"
+        )
+        # Need at least 2 measurements to calculate gain per worker
+        if len(self.throughput_history) < 2:
+            # First measurement - add 20% more workers
+            if current_workers < self.max_workers:
+                workers_to_add = max(1, int(current_workers * 0.2))
+                for i in range(workers_to_add):
+                    if len(self.workers) < self.max_workers:
+                        new_worker_id = len(self.workers)
+                        self._add_worker(new_worker_id)
+                        self.current_workers += 1
+                logger.info(
+                    f"Ramping up: added {workers_to_add} workers (now {self.current_workers}/{self.max_workers})"
+                )
             return
-        # Check if throughput is increasing
-        recent_avg = sum(list(self.throughput_history)[-2:]) / 2
-        older_avg = sum(list(self.throughput_history)[-4:-2]) / 2
-        if recent_avg > older_avg * 1.1 and len(self.workers) < self.max_workers:
-            # Throughput increasing by >10%, add workers
-            new_worker_id = len(self.workers)
-            self._add_worker(new_worker_id)
-            self.current_workers += 1
-            logger.info(f"Throughput increasing ({throughput:.1f} items/s), added worker (now {self.current_workers})")
-        elif recent_avg < older_avg * 0.9 and len(self.workers) > self.min_workers:
-            # Throughput decreasing by >10%, remove worker
-            # (workers will exit naturally when they finish current work)
-            self.current_workers = max(self.min_workers, self.current_workers - 1)
-            logger.info(f"Throughput plateauing ({throughput:.1f} items/s), reducing to {self.current_workers} workers")
-    def shutdown(self, timeout=30):
+        # Calculate throughput gain per worker added
+        current_throughput = self.throughput_history[-1]
+        previous_throughput = self.throughput_history[-2]
+        previous_workers = self.worker_count_history[-2]
+        throughput_gain = current_throughput - previous_throughput
+        workers_added = current_workers - previous_workers
+        logger.debug(
+            f"Trend: {previous_throughput:.1f} items/s @ {previous_workers} workers → "
+            f"{current_throughput:.1f} items/s @ {current_workers} workers "
+            f"(gain: {throughput_gain:.1f}, added: {workers_added})"
+        )
+        # If throughput decreased significantly, stop adding workers
+        if current_throughput < previous_throughput * 0.95:
+            logger.info(
+                f"Throughput decreasing ({current_throughput:.1f} vs {previous_throughput:.1f} items/s), "
+                f"stopping at {current_workers} workers"
+            )
+        # If throughput is still increasing or stable, add more workers
+        elif current_throughput >= previous_throughput * 0.95 and current_workers < self.max_workers:
+            if workers_added > 0 and throughput_gain > 0:
+                # Calculate gain per worker
+                gain_per_worker = throughput_gain / workers_added
+                logger.debug(f"Gain per worker: {gain_per_worker:.2f} items/s")
+                # Estimate how many more workers we could benefit from
+                # Assume diminishing returns, so be conservative
+                if gain_per_worker > 0.5:
+                    # Good gain per worker - add more aggressively
+                    workers_to_add = max(1, int(current_workers * 0.3))
+                elif gain_per_worker > 0.2:
+                    # Moderate gain - add moderately
+                    workers_to_add = max(1, int(current_workers * 0.2))
+                else:
+                    # Small gain - add conservatively
+                    workers_to_add = max(1, int(current_workers * 0.1))
+                added = 0
+                for i in range(workers_to_add):
+                    if len(self.workers) < self.max_workers:
+                        new_worker_id = len(self.workers)
+                        self._add_worker(new_worker_id)
+                        self.current_workers += 1
+                        added += 1
+                logger.info(
+                    f"Throughput increasing (gain: {gain_per_worker:.2f} items/s per worker), "
+                    f"added {added} workers (now {self.current_workers}/{self.max_workers})"
+                )
+            else:
+                # Fallback to 20% if we can't calculate gain per worker
+                workers_to_add = max(1, int(current_workers * 0.2))
+                added = 0
+                for i in range(workers_to_add):
+                    if len(self.workers) < self.max_workers:
+                        new_worker_id = len(self.workers)
+                        self._add_worker(new_worker_id)
+                        self.current_workers += 1
+                        added += 1
+                logger.info(f"Ramping up: added {added} workers (now {self.current_workers}/{self.max_workers})")
+        else:
+            logger.info(f"At optimal worker count: {current_workers} workers, {current_throughput:.1f} items/s")
+    def shutdown(self, timeout=2):
         """Shutdown the worker pool gracefully."""
         logger.info("Shutting down worker pool...")
         self.running = False
-        # Send stop signals
-        for _ in self.workers:
-            self.work_queue.put(None)
-        # Wait for workers to finish
+        # Terminate all workers immediately (they ignore SIGINT so we need to be forceful)
         for p in self.workers:
-            p.join(timeout=timeout)
             if p.is_alive():
-                logger.warning(f"Worker {p.pid} did not exit cleanly, terminating")
                 p.terminate()
+        # Give them a brief moment to exit
+        for p in self.workers:
+            p.join(timeout=timeout)
         logger.info("Worker pool shutdown complete")

{mapillary_downloader-0.5.0.dist-info → mapillary_downloader-0.5.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mapillary_downloader
-Version: 0.5.0
+Version: 0.5.2
 Summary: Download your Mapillary data before it's gone
 Author-email: Gareth Davidson <gaz@bitplane.net>
 Requires-Python: >=3.10

{mapillary_downloader-0.5.0.dist-info → mapillary_downloader-0.5.2.dist-info}/RECORD RENAMED Viewed

@@ -1,19 +1,19 @@
 mapillary_downloader/__init__.py,sha256=KEjiBRghXDeA7E15RJeLBfQm-yNJkowZarL59QOh_1w,120
-mapillary_downloader/__main__.py,sha256=Kjfx2woMyCvAxYAdqvtXtYJknCMviV_K2PSo0cDc8Hg,4320
+mapillary_downloader/__main__.py,sha256=G4tTNN6V7jPZn4X9gjRDx0faw-Im9zhyTLbDRTOyo3k,4325
 mapillary_downloader/client.py,sha256=a5n43FLHP45EHodEjl0ieziBK-b6Ey-rZJwYB6EFhNI,4743
-mapillary_downloader/downloader.py,sha256=F36AtB0Ro_EXR78EDOqH248llV7fGVeR4j9nZf0q7qg,19988
+mapillary_downloader/downloader.py,sha256=6JFEIVBIdhN9L4DcPJmx6UCTgAnWSuH0jO0D_8wKz_U,19886
 mapillary_downloader/exif_writer.py,sha256=K_441EG1siWyNMmFGZSfnORUCjBThkeg4JFtbg9AOsA,5120
 mapillary_downloader/ia_check.py,sha256=L2MEbG_KmlAd5NLmo2HQkO8HWvRN0brE5wXXoyNMbq8,1100
 mapillary_downloader/ia_meta.py,sha256=78rcybHIPnQDsF02KGj6RYmDXzYzrU8sdVx4Q9Y0sfI,6266
 mapillary_downloader/logging_config.py,sha256=Z-wNq34nt7aIhJWdeKc1feTY46P9-Or7HtiX7eUFjEI,2324
-mapillary_downloader/metadata_reader.py,sha256=-4BmtLVI9sldZU0LlqMc-bporiYNpk6-F2RKKMvzLu4,3560
+mapillary_downloader/metadata_reader.py,sha256=Re-HN0Vfc7Hs1eOut7uOoW7jWJ2PIbKoNzC7Ak3ah5o,4933
 mapillary_downloader/tar_sequences.py,sha256=mqs5p3N7osV_bxTkw6i34GVmxCBBEbIiKKxeh-fWNdU,4430
 mapillary_downloader/utils.py,sha256=yzVgS1mwsklDAqrimaFafgTTXtRYQUbKP98Xgh9d2KA,1174
 mapillary_downloader/webp_converter.py,sha256=vYLLQxDmdnqRz0nm7wXwRUd4x9mQZNah-DrncpA8sNs,1901
-mapillary_downloader/worker.py,sha256=RMZO8N67Kl-bhHC1qUdZg6Sx8k6RYbPRhyuLyOjr29o,4450
-mapillary_downloader/worker_pool.py,sha256=QFYIbqkgamOtB-iRyZp5kN6jdZuYw93izls61ayVIZ8,4771
-mapillary_downloader-0.5.0.dist-info/entry_points.txt,sha256=PdYtxOXHMJrUhmiPO4G-F98VuhUI4MN9D_T4KPrVZ5w,75
-mapillary_downloader-0.5.0.dist-info/licenses/LICENSE.md,sha256=7_BIuQ-veOrsF-WarH8kTkm0-xrCLvJ1PFE1C4Ebs64,146
-mapillary_downloader-0.5.0.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
-mapillary_downloader-0.5.0.dist-info/METADATA,sha256=A0AhsIjGV9FBf5vz28hSC2jugcRqz5A8gsZwMGGEw2A,4982
-mapillary_downloader-0.5.0.dist-info/RECORD,,
+mapillary_downloader/worker.py,sha256=n9m6PzSjlLOOYZJd9j1vH-2ag9aOeNndfgRlunzI14s,4637
+mapillary_downloader/worker_pool.py,sha256=iGRq5uFwBNNVQnI4vEjbKHkbKTaEVCdmvMvXcRGuDMg,8203
+mapillary_downloader-0.5.2.dist-info/entry_points.txt,sha256=PdYtxOXHMJrUhmiPO4G-F98VuhUI4MN9D_T4KPrVZ5w,75
+mapillary_downloader-0.5.2.dist-info/licenses/LICENSE.md,sha256=7_BIuQ-veOrsF-WarH8kTkm0-xrCLvJ1PFE1C4Ebs64,146
+mapillary_downloader-0.5.2.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
+mapillary_downloader-0.5.2.dist-info/METADATA,sha256=PHO4jDVxqsIo9Hs9GX3J2Cfnfc8gy_PI6xhssE9jrMk,4982
+mapillary_downloader-0.5.2.dist-info/RECORD,,

{mapillary_downloader-0.5.0.dist-info → mapillary_downloader-0.5.2.dist-info}/WHEEL RENAMED Viewed

File without changes

{mapillary_downloader-0.5.0.dist-info → mapillary_downloader-0.5.2.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{mapillary_downloader-0.5.0.dist-info → mapillary_downloader-0.5.2.dist-info}/licenses/LICENSE.md RENAMED Viewed

File without changes

mapillary-downloader 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl

mapillary-downloader 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl