mapillary-downloader 0.6.1__tar.gz → 0.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (19)
  1. {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.0}/PKG-INFO +24 -10
  2. {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.0}/README.md +23 -9
  3. {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.0}/pyproject.toml +1 -1
  4. {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.0}/src/mapillary_downloader/tar_sequences.py +29 -28
  5. {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.0}/src/mapillary_downloader/worker.py +15 -5
  6. {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.0}/LICENSE.md +0 -0
  7. {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.0}/src/mapillary_downloader/__init__.py +0 -0
  8. {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.0}/src/mapillary_downloader/__main__.py +0 -0
  9. {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.0}/src/mapillary_downloader/client.py +0 -0
  10. {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.0}/src/mapillary_downloader/downloader.py +0 -0
  11. {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.0}/src/mapillary_downloader/exif_writer.py +0 -0
  12. {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.0}/src/mapillary_downloader/ia_check.py +0 -0
  13. {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.0}/src/mapillary_downloader/ia_meta.py +0 -0
  14. {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.0}/src/mapillary_downloader/ia_stats.py +0 -0
  15. {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.0}/src/mapillary_downloader/logging_config.py +0 -0
  16. {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.0}/src/mapillary_downloader/metadata_reader.py +0 -0
  17. {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.0}/src/mapillary_downloader/utils.py +0 -0
  18. {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.0}/src/mapillary_downloader/webp_converter.py +0 -0
  19. {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.0}/src/mapillary_downloader/worker_pool.py +0 -0

{mapillary_downloader-0.6.1 → mapillary_downloader-0.7.0}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mapillary_downloader
- Version: 0.6.1
+ Version: 0.7.0
  Summary: Archive user data from Mapillary
  Author-email: Gareth Davidson <gaz@bitplane.net>
  Requires-Python: >=3.10

@@ -100,21 +100,28 @@ mapillary-downloader --no-webp USERNAME

  ## Tarballs

- Images are organized by sequence ID, bucketed by the first character of the
- sequence to reduce directory count:
+ Images are organized by capture date (YYYY-MM-DD) for incremental archiving:

  ```
  mapillary-username-quality/
-   a/
+   2024-01-15/
      abc123/
        image1.webp
        image2.webp
+     bcd456/
+       image3.webp
+   2024-01-16/
+     def789/
+       image4.webp
  ```

- By default, these bucket directories are automatically tarred after download
- (resulting in `a.tar`, `b.tar`, etc. - about 62 tar files total). This is done
- because large collections with millions of images would otherwise create hundreds
- of thousands of tiny tars, and anger the archive gods.
+ By default, these date directories are automatically tarred after download
+ (resulting in `2024-01-15.tar`, `2024-01-16.tar`, etc.). This date-based
+ organization enables:
+
+ - **Incremental uploads** - Upload each day's tar as soon as it's ready
+ - **Manageable file counts** - ~365 days/year × 10 years = 3,650 tars max
+ - **Chronological organization** - Natural sorting and progress tracking

  To keep individual files instead of creating tars, use the `--no-tar` flag.

@@ -128,8 +135,15 @@ See inlay for details:

  * [📀 rip](https://bitplane.net/dev/sh/rip)

+ ## 📊 Stats
+
+ To see overall project progress, or an estimate, use `--stats`
+
+ ```bash
+ mapillary-downloader --stats
+ ```

- ## Development
+ ## 🚧 Development

  ```bash
  make dev # Setup dev environment

@@ -138,7 +152,7 @@ make dist # Build the distribution
  make help # See other make options
  ```

- ## Links
+ ## 🔗 Links

  * [🏠 home](https://bitplane.net/dev/python/mapillary_downloader)
  * [📖 pydoc](https://bitplane.net/dev/python/mapillary_downloader/pydoc)

{mapillary_downloader-0.6.1 → mapillary_downloader-0.7.0}/README.md

@@ -70,21 +70,28 @@ mapillary-downloader --no-webp USERNAME

  ## Tarballs

- Images are organized by sequence ID, bucketed by the first character of the
- sequence to reduce directory count:
+ Images are organized by capture date (YYYY-MM-DD) for incremental archiving:

  ```
  mapillary-username-quality/
-   a/
+   2024-01-15/
      abc123/
        image1.webp
        image2.webp
+     bcd456/
+       image3.webp
+   2024-01-16/
+     def789/
+       image4.webp
  ```

- By default, these bucket directories are automatically tarred after download
- (resulting in `a.tar`, `b.tar`, etc. - about 62 tar files total). This is done
- because large collections with millions of images would otherwise create hundreds
- of thousands of tiny tars, and anger the archive gods.
+ By default, these date directories are automatically tarred after download
+ (resulting in `2024-01-15.tar`, `2024-01-16.tar`, etc.). This date-based
+ organization enables:
+
+ - **Incremental uploads** - Upload each day's tar as soon as it's ready
+ - **Manageable file counts** - ~365 days/year × 10 years = 3,650 tars max
+ - **Chronological organization** - Natural sorting and progress tracking

  To keep individual files instead of creating tars, use the `--no-tar` flag.

@@ -98,8 +105,15 @@ See inlay for details:

  * [📀 rip](https://bitplane.net/dev/sh/rip)

+ ## 📊 Stats
+
+ To see overall project progress, or an estimate, use `--stats`
+
+ ```bash
+ mapillary-downloader --stats
+ ```

- ## Development
+ ## 🚧 Development

  ```bash
  make dev # Setup dev environment

@@ -108,7 +122,7 @@ make dist # Build the distribution
  make help # See other make options
  ```

- ## Links
+ ## 🔗 Links

  * [🏠 home](https://bitplane.net/dev/python/mapillary_downloader)
  * [📖 pydoc](https://bitplane.net/dev/python/mapillary_downloader/pydoc)
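
The "natural sorting" benefit called out in the new README text follows from the fact that `YYYY-MM-DD` names order the same way lexicographically as they do chronologically. As a rough illustration only (the collection path and the print format below are invented, not part of the package), a consumer script can walk the finished per-day tars in date order with nothing but `pathlib`:

```python
from pathlib import Path

# Hypothetical collection directory produced by a download run.
collection = Path("mapillary-username-quality")

# YYYY-MM-DD names sort chronologically as plain strings, so a plain
# lexicographic sort of the tar filenames is already a date sort.
for tar_path in sorted(collection.glob("*.tar")):
    size_mb = tar_path.stat().st_size / 1_000_000
    print(f"{tar_path.name}: {size_mb:.1f} MB, ready to upload")
```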

{mapillary_downloader-0.6.1 → mapillary_downloader-0.7.0}/pyproject.toml

@@ -1,7 +1,7 @@
  [project]
  name = "mapillary_downloader"
  description = "Archive user data from Mapillary"
- version = "0.6.1"
+ version = "0.7.0"
  authors = [
      { name = "Gareth Davidson", email = "gaz@bitplane.net" }
  ]

{mapillary_downloader-0.6.1 → mapillary_downloader-0.7.0}/src/mapillary_downloader/tar_sequences.py

@@ -1,6 +1,7 @@
  """Tar sequence directories for efficient Internet Archive uploads."""

  import logging
+ import re
  import tarfile
  from pathlib import Path
  from mapillary_downloader.utils import format_size

@@ -9,7 +10,9 @@ logger = logging.getLogger("mapillary_downloader")


  def tar_sequence_directories(collection_dir):
-     """Tar all sequence directories in a collection for faster IA uploads.
+     """Tar all date directories in a collection for faster IA uploads.
+
+     Organizes by capture date (YYYY-MM-DD) for incremental archive.org uploads.

      Args:
          collection_dir: Path to collection directory (e.g., mapillary-user-quality/)

@@ -23,44 +26,44 @@ def tar_sequence_directories(collection_dir):
          logger.error(f"Collection directory not found: {collection_dir}")
          return 0, 0

-     # Find all bucket directories (skip special dirs)
-     # Now we tar entire bucket dirs (e.g., a/, b/, etc) to get ~62 tar files
+     # Find all date directories (skip special dirs)
+     # Date format: YYYY-MM-DD or unknown-date
      skip_dirs = {".meta", "__pycache__"}
-     bucket_dirs = []
+     date_dirs = []

      for item in collection_dir.iterdir():
          if item.is_dir() and item.name not in skip_dirs:
-             # Check if this is a bucket dir (single char)
-             if len(item.name) == 1:
-                 bucket_dirs.append(item)
+             # Check if this is a date dir (YYYY-MM-DD) or unknown-date
+             if re.match(r"\d{4}-\d{2}-\d{2}$", item.name) or item.name == "unknown-date":
+                 date_dirs.append(item)

-     if not bucket_dirs:
-         logger.info("No bucket directories to tar")
+     if not date_dirs:
+         logger.info("No date directories to tar")
          return 0, 0

-     # Sort bucket directories alphabetically for consistent progress tracking
-     bucket_dirs = sorted(bucket_dirs, key=lambda x: x.name)
+     # Sort date directories chronologically (YYYY-MM-DD sorts naturally)
+     date_dirs = sorted(date_dirs, key=lambda x: x.name)

-     logger.info(f"Tarring {len(bucket_dirs)} bucket directories...")
+     logger.info(f"Tarring {len(date_dirs)} date directories...")

      tarred_count = 0
      total_files = 0
      total_tar_bytes = 0

-     for bucket_dir in bucket_dirs:
-         bucket_name = bucket_dir.name
-         tar_path = collection_dir / f"{bucket_name}.tar"
+     for date_dir in date_dirs:
+         date_name = date_dir.name
+         tar_path = collection_dir / f"{date_name}.tar"

-         # Count files in bucket
-         files_to_tar = sorted([f for f in bucket_dir.rglob("*") if f.is_file()], key=lambda x: str(x))
+         # Count files in date directory
+         files_to_tar = sorted([f for f in date_dir.rglob("*") if f.is_file()], key=lambda x: str(x))
          file_count = len(files_to_tar)

          if file_count == 0:
-             logger.warning(f"Skipping empty bucket directory: {bucket_name}")
+             logger.warning(f"Skipping empty date directory: {date_name}")
              continue

          try:
-             logger.info(f"Tarring bucket '{bucket_name}' ({file_count} files)...")
+             logger.info(f"Tarring date '{date_name}' ({file_count} files)...")

              # Create reproducible uncompressed tar (WebP already compressed)
              with tarfile.open(tar_path, "w") as tar:

@@ -87,36 +90,34 @@ def tar_sequence_directories(collection_dir):
                  tar_size = tar_path.stat().st_size
                  total_tar_bytes += tar_size

-                 # Remove original bucket directory
-                 for file in bucket_dir.rglob("*"):
+                 # Remove original date directory
+                 for file in date_dir.rglob("*"):
                      if file.is_file():
                          file.unlink()

                  # Remove empty subdirs and main dir
-                 for subdir in list(bucket_dir.rglob("*")):
+                 for subdir in list(date_dir.rglob("*")):
                      if subdir.is_dir():
                          try:
                              subdir.rmdir()
                          except OSError:
                              pass  # Not empty yet

-                 bucket_dir.rmdir()
+                 date_dir.rmdir()

                  tarred_count += 1
                  total_files += file_count

-                 logger.info(f"Tarred bucket '{bucket_name}': {file_count:,} files, {format_size(tar_size)}")
+                 logger.info(f"Tarred date '{date_name}': {file_count:,} files, {format_size(tar_size)}")
              else:
                  logger.error(f"Tar file empty or not created: {tar_path}")
                  if tar_path.exists():
                      tar_path.unlink()

          except Exception as e:
-             logger.error(f"Error tarring bucket {bucket_name}: {e}")
+             logger.error(f"Error tarring date {date_name}: {e}")
              if tar_path.exists():
                  tar_path.unlink()

-     logger.info(
-         f"Tarred {tarred_count} sequences ({total_files:,} files, {format_size(total_tar_bytes)} total tar size)"
-     )
+     logger.info(f"Tarred {tarred_count} dates ({total_files:,} files, {format_size(total_tar_bytes)} total tar size)")
      return tarred_count, total_files
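
Condensed from the hunks above, the core of the new `tar_sequence_directories` behaviour is: match directory names against a `YYYY-MM-DD` pattern (plus the `unknown-date` fallback), then write one uncompressed tar per day. The sketch below restates just that logic outside the full function; the helper names, the `arcname` choice, and the reproducibility details that live in the lines not shown in this diff are assumptions, not the package's exact code:

```python
import re
import tarfile
from pathlib import Path

DATE_DIR = re.compile(r"\d{4}-\d{2}-\d{2}$")  # e.g. 2024-01-15


def find_date_dirs(collection_dir):
    """Yield date-named directories, mirroring the matching logic in the diff."""
    skip_dirs = {".meta", "__pycache__"}
    for item in Path(collection_dir).iterdir():
        if item.is_dir() and item.name not in skip_dirs:
            if DATE_DIR.match(item.name) or item.name == "unknown-date":
                yield item


def tar_one_day(collection_dir, date_dir):
    """Write one plain (uncompressed) tar per date directory - a sketch only."""
    collection_dir = Path(collection_dir)
    tar_path = collection_dir / f"{date_dir.name}.tar"
    files_to_tar = sorted(f for f in date_dir.rglob("*") if f.is_file())
    with tarfile.open(tar_path, "w") as tar:  # "w" = no compression; WebP is already compressed
        for f in files_to_tar:
            # Keeping paths relative to the collection is an assumption here.
            tar.add(f, arcname=str(f.relative_to(collection_dir)))
    return tar_path
```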

{mapillary_downloader-0.6.1 → mapillary_downloader-0.7.0}/src/mapillary_downloader/worker.py

@@ -3,6 +3,7 @@
  import os
  import signal
  import tempfile
+ from datetime import datetime
  from pathlib import Path
  import requests
  from mapillary_downloader.exif_writer import write_exif_to_image

@@ -69,16 +70,25 @@ def download_and_convert_image(image_data, output_dir, quality, convert_webp, se
      if not image_url:
          return (image_id, 0, False, f"No {quality} URL")

-     # Determine final output directory - organize by first char of sequence ID
+     # Determine final output directory - organize by capture date
      output_dir = Path(output_dir)
      sequence_id = image_data.get("sequence")
+
+     # Extract date from captured_at timestamp (milliseconds since epoch)
+     captured_at = image_data.get("captured_at")
+     if captured_at:
+         # Convert to UTC date string (YYYY-MM-DD)
+         date_str = datetime.utcfromtimestamp(captured_at / 1000).strftime("%Y-%m-%d")
+     else:
+         # Fallback for missing timestamp (should be rare per API docs)
+         date_str = "unknown-date"
+
      if sequence_id:
-         # Use first character as bucket (gives us ~62 dirs instead of millions)
-         first_char = sequence_id[0]
-         img_dir = output_dir / first_char / sequence_id
+         img_dir = output_dir / date_str / sequence_id
          img_dir.mkdir(parents=True, exist_ok=True)
      else:
-         img_dir = output_dir
+         img_dir = output_dir / date_str
+         img_dir.mkdir(parents=True, exist_ok=True)

      # If converting to WebP, use /tmp for intermediate JPEG
      # Otherwise write JPEG directly to final location
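
The worker change boils down to one mapping: Mapillary's `captured_at` field (milliseconds since the epoch) becomes a UTC `YYYY-MM-DD` directory name, with `unknown-date` as the fallback, and the sequence ID nested underneath when present. Below is a standalone sketch of that mapping; the helper name and example values are invented, and it uses the timezone-aware `datetime.fromtimestamp`, which gives the same result as the `utcfromtimestamp` call in the diff:

```python
from datetime import datetime, timezone
from pathlib import Path


def build_image_dir(output_dir, image_data):
    """Derive the per-image directory the way the new worker.py logic does (sketch)."""
    output_dir = Path(output_dir)

    captured_at = image_data.get("captured_at")  # milliseconds since epoch, per the diff's comment
    if captured_at:
        # Same date as datetime.utcfromtimestamp(captured_at / 1000) in the diff,
        # but timezone-aware and not deprecated on newer Pythons.
        date_str = datetime.fromtimestamp(captured_at / 1000, tz=timezone.utc).strftime("%Y-%m-%d")
    else:
        date_str = "unknown-date"

    sequence_id = image_data.get("sequence")
    img_dir = output_dir / date_str / sequence_id if sequence_id else output_dir / date_str
    img_dir.mkdir(parents=True, exist_ok=True)
    return img_dir


# Example: 1705312800000 ms is 2024-01-15 10:00 UTC, so this yields <out>/2024-01-15/abc123
# build_image_dir("mapillary-user-quality", {"captured_at": 1705312800000, "sequence": "abc123"})
```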