mapillary-downloader 0.5.2__tar.gz → 0.6.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (20)
  1. {mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/PKG-INFO +20 -8
  2. {mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/README.md +18 -6
  3. {mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/pyproject.toml +2 -2
  4. {mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/src/mapillary_downloader/__main__.py +18 -1
  5. {mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/src/mapillary_downloader/downloader.py +3 -8
  6. mapillary_downloader-0.6.1/src/mapillary_downloader/ia_stats.py +242 -0
  7. {mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/src/mapillary_downloader/tar_sequences.py +27 -33
  8. mapillary_downloader-0.6.1/src/mapillary_downloader/utils.py +108 -0
  9. {mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/src/mapillary_downloader/worker.py +9 -8
  10. mapillary_downloader-0.5.2/src/mapillary_downloader/utils.py +0 -47
  11. {mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/LICENSE.md +0 -0
  12. {mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/src/mapillary_downloader/__init__.py +0 -0
  13. {mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/src/mapillary_downloader/client.py +0 -0
  14. {mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/src/mapillary_downloader/exif_writer.py +0 -0
  15. {mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/src/mapillary_downloader/ia_check.py +0 -0
  16. {mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/src/mapillary_downloader/ia_meta.py +0 -0
  17. {mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/src/mapillary_downloader/logging_config.py +0 -0
  18. {mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/src/mapillary_downloader/metadata_reader.py +0 -0
  19. {mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/src/mapillary_downloader/webp_converter.py +0 -0
  20. {mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/src/mapillary_downloader/worker_pool.py +0 -0

--- mapillary_downloader-0.5.2/PKG-INFO
+++ mapillary_downloader-0.6.1/PKG-INFO
@@ -1,7 +1,7 @@
 Metadata-Version: 2.4
 Name: mapillary_downloader
-Version: 0.5.2
-Summary: Download your Mapillary data before it's gone
+Version: 0.6.1
+Summary: Archive user data from Mapillary
 Author-email: Gareth Davidson <gaz@bitplane.net>
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
@@ -66,8 +66,8 @@ mapillary-downloader --output ./downloads USERNAME1
 | `--quality` | 256, 1024, 2048 or original | `original` |
 | `--bbox` | `west,south,east,north` | `None` |
 | `--no-webp` | Don't convert to WebP | `False` |
-| `--workers` | Number of parallel download workers | Half of CPU count |
-| `--no-tar` | Don't tar sequence directories | `False` |
+| `--max-workers` | Maximum number of parallel download workers | `128` |
+| `--no-tar` | Don't tar bucket directories | `False` |
 | `--no-check-ia` | Don't check if exists on Internet Archive | `False` |
 
 The downloader will:
@@ -98,11 +98,23 @@ To disable WebP conversion and keep original JPEGs, use `--no-webp`:
 mapillary-downloader --no-webp USERNAME
 ```
 
-## Sequence Tarball Creation
+## Tarballs
 
-By default, sequence directories are automatically tarred after download because
-if they weren't, you'd spend more time setting up upload metadata than actually
-uploading files to IA.
+Images are organized by sequence ID, bucketed by the first character of the
+sequence to reduce directory count:
+
+```
+mapillary-username-quality/
+  a/
+    abc123/
+      image1.webp
+      image2.webp
+```
+
+By default, these bucket directories are automatically tarred after download
+(resulting in `a.tar`, `b.tar`, etc. - about 62 tar files total). This is done
+because large collections with millions of images would otherwise create hundreds
+of thousands of tiny tars, and anger the archive gods.
 
 To keep individual files instead of creating tars, use the `--no-tar` flag.
 

--- mapillary_downloader-0.5.2/README.md
+++ mapillary_downloader-0.6.1/README.md
@@ -36,8 +36,8 @@ mapillary-downloader --output ./downloads USERNAME1
 | `--quality` | 256, 1024, 2048 or original | `original` |
 | `--bbox` | `west,south,east,north` | `None` |
 | `--no-webp` | Don't convert to WebP | `False` |
-| `--workers` | Number of parallel download workers | Half of CPU count |
-| `--no-tar` | Don't tar sequence directories | `False` |
+| `--max-workers` | Maximum number of parallel download workers | `128` |
+| `--no-tar` | Don't tar bucket directories | `False` |
 | `--no-check-ia` | Don't check if exists on Internet Archive | `False` |
 
 The downloader will:
@@ -68,11 +68,23 @@ To disable WebP conversion and keep original JPEGs, use `--no-webp`:
 mapillary-downloader --no-webp USERNAME
 ```
 
-## Sequence Tarball Creation
+## Tarballs
 
-By default, sequence directories are automatically tarred after download because
-if they weren't, you'd spend more time setting up upload metadata than actually
-uploading files to IA.
+Images are organized by sequence ID, bucketed by the first character of the
+sequence to reduce directory count:
+
+```
+mapillary-username-quality/
+  a/
+    abc123/
+      image1.webp
+      image2.webp
+```
+
+By default, these bucket directories are automatically tarred after download
+(resulting in `a.tar`, `b.tar`, etc. - about 62 tar files total). This is done
+because large collections with millions of images would otherwise create hundreds
+of thousands of tiny tars, and anger the archive gods.
 
 To keep individual files instead of creating tars, use the `--no-tar` flag.
 
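A quick sketch of the bucketing rule the new README text describes (the helper below is illustrative only; the released code inlines this in worker.py, whose hunk appears further down). Sequence IDs are alphanumeric, so single-character buckets give at most 26 + 26 + 10 = 62 directories, which is where "about 62 tar files" comes from:

```
# Illustrative sketch, not the package's exact code.
from pathlib import Path

def bucket_path(output_dir: Path, sequence_id: str) -> Path:
    # First character of the sequence ID picks the bucket:
    # a-z, A-Z, 0-9 => at most 62 buckets, hence ~62 tars.
    return output_dir / sequence_id[0] / sequence_id

print(bucket_path(Path("mapillary-username-original"), "abc123"))
# mapillary-username-original/a/abc123
```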

--- mapillary_downloader-0.5.2/pyproject.toml
+++ mapillary_downloader-0.6.1/pyproject.toml
@@ -1,7 +1,7 @@
 [project]
 name = "mapillary_downloader"
-description = "Download your Mapillary data before it's gone"
-version = "0.5.2"
+description = "Archive user data from Mapillary"
+version = "0.6.1"
 authors = [
     { name = "Gareth Davidson", email = "gaz@bitplane.net" }
 ]

--- mapillary_downloader-0.5.2/src/mapillary_downloader/__main__.py
+++ mapillary_downloader-0.6.1/src/mapillary_downloader/__main__.py
@@ -26,7 +26,7 @@ def main():
         default=os.environ.get("MAPILLARY_TOKEN"),
         help="Mapillary API access token (or set MAPILLARY_TOKEN env var)",
     )
-    parser.add_argument("usernames", nargs="+", help="Mapillary username(s) to download")
+    parser.add_argument("usernames", nargs="*", help="Mapillary username(s) to download")
     parser.add_argument("--output", default="./mapillary_data", help="Output directory (default: ./mapillary_data)")
     parser.add_argument(
         "--quality",
@@ -61,9 +61,21 @@ def main():
         action="store_true",
         help="Enable debug logging (EXIF data, API responses, etc.)",
     )
+    parser.add_argument(
+        "--stats",
+        action="store_true",
+        help="Show statistics of collections on archive.org and exit",
+    )
 
     args = parser.parse_args()
 
+    # Handle --stats early (before token check)
+    if args.stats:
+        from mapillary_downloader.ia_stats import show_stats
+
+        show_stats()
+        sys.exit(0)
+
     # Set debug logging level if requested
     if args.debug:
         import logging
@@ -71,6 +83,11 @@
         logging.getLogger("mapillary_downloader").setLevel(logging.DEBUG)
         logger.debug("Debug logging enabled")
 
+    # Check for usernames (required unless using --stats)
+    if not args.usernames:
+        logger.error("Error: At least one username is required")
+        sys.exit(1)
+
     # Check for token
     if not args.token:
         logger.error("Error: Mapillary API token required. Use --token or set MAPILLARY_TOKEN environment variable")
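
Taken together, the new flag short-circuits before the token and username checks, so a stats-only run needs neither. It delegates to show_stats() in the new ia_stats.py (shown in full below), which can also be called directly:

```
# Programmatic equivalent of `mapillary-downloader --stats`:
from mapillary_downloader.ia_stats import show_stats

show_stats()               # query archive.org, then print aggregated stats
show_stats(refresh=False)  # offline: use only the local .stats.json cache
```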

--- mapillary_downloader-0.5.2/src/mapillary_downloader/downloader.py
+++ mapillary_downloader-0.6.1/src/mapillary_downloader/downloader.py
@@ -7,7 +7,7 @@ import os
 import shutil
 import time
 from pathlib import Path
-from mapillary_downloader.utils import format_size, format_time
+from mapillary_downloader.utils import format_size, format_time, safe_json_save
 from mapillary_downloader.ia_meta import generate_ia_metadata
 from mapillary_downloader.ia_check import check_ia_exists
 from mapillary_downloader.worker import worker_process
@@ -143,13 +143,8 @@ class MapillaryDownloader:
         # Update this quality's progress
         progress[str(self.quality)] = list(self.downloaded)
 
-        # Write atomically
-        temp_file = self.progress_file.with_suffix(".json.tmp")
-        with open(temp_file, "w") as f:
-            json.dump(progress, f)
-            f.flush()
-            os.fsync(f.fileno())
-        temp_file.replace(self.progress_file)
+        # Write atomically using utility function
+        safe_json_save(self.progress_file, progress)
 
     def download_user_data(self, bbox=None, convert_webp=False):
         """Download all images for a user using streaming queue-based architecture.

--- /dev/null
+++ mapillary_downloader-0.6.1/src/mapillary_downloader/ia_stats.py
@@ -0,0 +1,242 @@
+"""Internet Archive statistics for mapillary_downloader collections."""
+
+import json
+import logging
+import re
+from mapillary_downloader.utils import safe_json_save, http_get_with_retry, format_size
+from mapillary_downloader.downloader import get_cache_dir
+
+logger = logging.getLogger("mapillary_downloader")
+
+CACHE_FILE = get_cache_dir() / ".stats.json"
+
+
+def search_ia_collections():
+    """Search IA for all mapillary_downloader collections.
+
+    Returns:
+        List of dicts with: identifier, description, item_size, uploader
+    """
+    logger.info("Searching archive.org for mapillary_downloader collections...")
+
+    url = "https://archive.org/advancedsearch.php"
+    params = {
+        "q": "mapillary_downloader:*",
+        "fl[]": ["identifier", "description", "item_size", "uploader"],
+        "rows": 10000,
+        "output": "json",
+    }
+
+    response = http_get_with_retry(url, params=params, max_retries=3)
+    data = response.json()
+
+    collections = data["response"]["docs"]
+    logger.info(f"Found {len(collections)} collections on archive.org")
+
+    return collections
+
+
+def parse_collection_info(identifier):
+    """Parse username, quality, webp from collection identifier.
+
+    Returns:
+        dict with username, quality, is_webp or None if invalid
+    """
+    match = re.match(r"mapillary-(.+)-(256|1024|2048|original)(?:-webp)?$", identifier)
+    if match:
+        return {"username": match.group(1), "quality": match.group(2), "is_webp": "-webp" in identifier}
+    return None
+
+
+def extract_image_count(description):
+    """Extract image count from IA description field.
+
+    Description format: "Contains 12,345 images in..."
+    """
+    if not description:
+        return None
+
+    match = re.search(r"Contains ([\d,]+) images", description)
+    if match:
+        return int(match.group(1).replace(",", ""))
+    return None
+
+
+def load_cache():
+    """Load cached collection data.
+
+    Returns:
+        dict of {collection_id: {size, uploader, images, quality, username}}
+    """
+    if CACHE_FILE.exists():
+        try:
+            with open(CACHE_FILE) as f:
+                return json.load(f)
+        except Exception as e:
+            logger.warning(f"Failed to load cache: {e}")
+    return {}
+
+
+def update_cache(ia_collections):
+    """Update cache with new IA search results.
+
+    Merges new collections into existing cache.
+
+    Returns:
+        Updated cache dict
+    """
+    cache = load_cache()
+
+    for item in ia_collections:
+        identifier = item.get("identifier")
+        if not identifier:
+            continue
+
+        info = parse_collection_info(identifier)
+        if not info:
+            logger.debug(f"Skipping non-mapillary collection: {identifier}")
+            continue
+
+        # Parse item data
+        size_bytes = item.get("item_size", 0)
+        if isinstance(size_bytes, str):
+            size_bytes = int(size_bytes)
+
+        image_count = extract_image_count(item.get("description"))
+
+        # Update cache entry
+        cache[identifier] = {
+            "size": size_bytes,
+            "uploader": item.get("uploader"),
+            "images": image_count,
+            "quality": info["quality"],
+            "username": info["username"],
+            "is_webp": info["is_webp"],
+        }
+
+    # Save updated cache
+    safe_json_save(CACHE_FILE, cache)
+    logger.info(f"Updated cache with {len(cache)} collections")
+
+    return cache
+
+
+def aggregate_stats(cache):
+    """Aggregate statistics from cached collection data.
+
+    Returns:
+        dict with total and per-quality stats
+    """
+    stats = {
+        "total": {"collections": 0, "total_images": 0, "unique_images": 0, "bytes": 0},
+        "by_quality": {},
+        "users": set(),
+    }
+
+    # Track images per user for deduplication
+    user_images = {}  # {username: max_images_across_qualities}
+
+    for collection_id, data in cache.items():
+        images = data.get("images") or 0
+        size = data.get("size") or 0
+        quality = data.get("quality", "unknown")
+        username = data.get("username")
+
+        # Track user coverage
+        if username:
+            stats["users"].add(username)
+            # Keep maximum image count across all qualities for this user
+            if username not in user_images or images > user_images[username]:
+                user_images[username] = images
+
+        # Total stats (collections, total images, and bytes)
+        stats["total"]["collections"] += 1
+        stats["total"]["total_images"] += images
+        stats["total"]["bytes"] += size
+
+        # Per-quality stats
+        if quality not in stats["by_quality"]:
+            stats["by_quality"][quality] = {"collections": 0, "images": 0, "bytes": 0}
+
+        stats["by_quality"][quality]["collections"] += 1
+        stats["by_quality"][quality]["images"] += images
+        stats["by_quality"][quality]["bytes"] += size
+
+    # Unique images is sum of max images per user
+    stats["total"]["unique_images"] = sum(user_images.values())
+
+    return stats
+
+
+def format_stats(stats):
+    """Format statistics as human-readable text.
+
+    Args:
+        stats: Dict from aggregate_stats()
+
+    Returns:
+        Formatted string
+    """
+    TOTAL_MAPILLARY_IMAGES = 2_000_000_000  # 2 billion
+
+    output = []
+    output.append("=" * 70)
+    output.append("Mapillary Downloader - Archive.org Statistics")
+    output.append("=" * 70)
+    output.append("")
+
+    # Total stats
+    total = stats["total"]
+    unique_pct = (total["unique_images"] / TOTAL_MAPILLARY_IMAGES * 100) if total["unique_images"] else 0
+
+    output.append(f"Total Collections: {total['collections']:,}")
+    output.append(f"Total Users: {len(stats['users']):,}")
+    output.append(f"Total Images: {total['total_images']:,}")
+    output.append(f"Unique Images: {total['unique_images']:,} ({unique_pct:.3f}% of 2B)")
+    output.append(f"Total Size: {format_size(total['bytes'])}")
+    output.append("")
+
+    # Per-quality breakdown
+    output.append("By Quality:")
+    output.append("-" * 70)
+
+    # Sort by quality (original first, then numeric)
+    qualities = sorted(stats["by_quality"].items(), key=lambda x: (x[0] != "original", x[0]))
+
+    for quality, data in qualities:
+        pct = (data["images"] / TOTAL_MAPILLARY_IMAGES * 100) if data["images"] else 0
+        output.append(
+            f"  {quality:8s} {data['collections']:3d} collections "
+            f"{data['images']:12,d} images ({pct:.3f}%) "
+            f"{format_size(data['bytes']):>8s}"
+        )
+
+    output.append("")
+    output.append(f"Cache: {CACHE_FILE}")
+
+    return "\n".join(output)
+
+
+def show_stats(refresh=True):
+    """Show archive.org statistics for mapillary_downloader collections.
+
+    Args:
+        refresh: If True, fetch fresh data from IA. If False, use cache only.
+    """
+    if refresh:
+        try:
+            ia_collections = search_ia_collections()
+            cache = update_cache(ia_collections)
+        except Exception as e:
+            logger.error(f"Failed to fetch IA data: {e}")
+            logger.info("Using cached data...")
+            cache = load_cache()
+    else:
+        cache = load_cache()
+
+    if not cache:
+        logger.error("No cached data and failed to fetch from IA")
+        return
+
+    stats = aggregate_stats(cache)
+    print(format_stats(stats))
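
Worth noting how the identifier convention round-trips through parse_collection_info(); the results below follow directly from the regex above (the usernames are made up):

```
parse_collection_info("mapillary-alice-2048")
# -> {"username": "alice", "quality": "2048", "is_webp": False}
parse_collection_info("mapillary-bob-original-webp")
# -> {"username": "bob", "quality": "original", "is_webp": True}
parse_collection_info("not-a-mapillary-item")
# -> None
```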

--- mapillary_downloader-0.5.2/src/mapillary_downloader/tar_sequences.py
+++ mapillary_downloader-0.6.1/src/mapillary_downloader/tar_sequences.py
@@ -23,51 +23,46 @@ def tar_sequence_directories(collection_dir):
         logger.error(f"Collection directory not found: {collection_dir}")
         return 0, 0
 
-    # Find all sequence directories (skip special dirs)
+    # Find all bucket directories (skip special dirs)
+    # Now we tar entire bucket dirs (e.g., a/, b/, etc) to get ~62 tar files
     skip_dirs = {".meta", "__pycache__"}
-    sequence_dirs = []
+    bucket_dirs = []
 
     for item in collection_dir.iterdir():
         if item.is_dir() and item.name not in skip_dirs:
-            sequence_dirs.append(item)
+            # Check if this is a bucket dir (single char)
+            if len(item.name) == 1:
+                bucket_dirs.append(item)
 
-    if not sequence_dirs:
-        logger.info("No sequence directories to tar")
+    if not bucket_dirs:
+        logger.info("No bucket directories to tar")
        return 0, 0
 
-    logger.info(f"Tarring {len(sequence_dirs)} sequence directories...")
+    # Sort bucket directories alphabetically for consistent progress tracking
+    bucket_dirs = sorted(bucket_dirs, key=lambda x: x.name)
+
+    logger.info(f"Tarring {len(bucket_dirs)} bucket directories...")
 
     tarred_count = 0
     total_files = 0
     total_tar_bytes = 0
 
-    for seq_dir in sequence_dirs:
-        seq_name = seq_dir.name
-        tar_path = collection_dir / f"{seq_name}.tar"
-
-        # Handle naming collision - find next available name
-        counter = 1
-        while tar_path.exists():
-            counter += 1
-            tar_path = collection_dir / f"{seq_name}.{counter}.tar"
+    for bucket_dir in bucket_dirs:
+        bucket_name = bucket_dir.name
+        tar_path = collection_dir / f"{bucket_name}.tar"
 
-        # Count files in sequence
-        files = list(seq_dir.glob("*"))
-        file_count = len([f for f in files if f.is_file()])
+        # Count files in bucket
+        files_to_tar = sorted([f for f in bucket_dir.rglob("*") if f.is_file()], key=lambda x: str(x))
+        file_count = len(files_to_tar)
 
         if file_count == 0:
-            logger.warning(f"Skipping empty directory: {seq_name}")
+            logger.warning(f"Skipping empty bucket directory: {bucket_name}")
             continue
 
         try:
-            # Create reproducible uncompressed tar (WebP already compressed)
-            # Sort files by name for deterministic ordering
-            files_to_tar = sorted([f for f in seq_dir.rglob("*") if f.is_file()], key=lambda x: x.name)
-
-            if not files_to_tar:
-                logger.warning(f"Skipping directory with no files: {seq_name}")
-                continue
+            logger.info(f"Tarring bucket '{bucket_name}' ({file_count} files)...")
 
+            # Create reproducible uncompressed tar (WebP already compressed)
             with tarfile.open(tar_path, "w") as tar:
                 for file_path in files_to_tar:
                     # Get path relative to collection_dir for tar archive
@@ -92,33 +87,32 @@ def tar_sequence_directories(collection_dir):
                 tar_size = tar_path.stat().st_size
                 total_tar_bytes += tar_size
 
-                # Remove original directory
-                for file in seq_dir.rglob("*"):
+                # Remove original bucket directory
+                for file in bucket_dir.rglob("*"):
                     if file.is_file():
                         file.unlink()
 
                 # Remove empty subdirs and main dir
-                for subdir in list(seq_dir.rglob("*")):
+                for subdir in list(bucket_dir.rglob("*")):
                     if subdir.is_dir():
                         try:
                             subdir.rmdir()
                         except OSError:
                             pass  # Not empty yet
 
-                seq_dir.rmdir()
+                bucket_dir.rmdir()
 
                 tarred_count += 1
                 total_files += file_count
 
-                if tarred_count % 10 == 0:
-                    logger.info(f"Tarred {tarred_count}/{len(sequence_dirs)} sequences...")
+                logger.info(f"Tarred bucket '{bucket_name}': {file_count:,} files, {format_size(tar_size)}")
             else:
                 logger.error(f"Tar file empty or not created: {tar_path}")
                 if tar_path.exists():
                     tar_path.unlink()
 
         except Exception as e:
-            logger.error(f"Error tarring {seq_name}: {e}")
+            logger.error(f"Error tarring bucket {bucket_name}: {e}")
             if tar_path.exists():
                 tar_path.unlink()
 
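The hunk above keeps the "reproducible uncompressed tar" comment but cuts off before the member-adding code, so here is a hedged sketch of what deterministic tar creation typically involves: sorted input (which the diff does show) plus normalized member metadata (an assumed detail, not visible in this diff):

```
# Sketch only - the package's actual metadata normalization isn't shown above.
import tarfile
from pathlib import Path

def add_normalized(tar: tarfile.TarFile, file_path: Path, root: Path):
    # Pin everything that varies between runs so identical inputs
    # produce byte-identical tars.
    info = tar.gettarinfo(str(file_path), arcname=str(file_path.relative_to(root)))
    info.mtime = 0            # drop wall-clock timestamps
    info.uid = info.gid = 0   # drop ownership
    info.uname = info.gname = ""
    with open(file_path, "rb") as f:
        tar.addfile(info, f)
```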

--- /dev/null
+++ mapillary_downloader-0.6.1/src/mapillary_downloader/utils.py
@@ -0,0 +1,108 @@
+"""Utility functions for formatting and display."""
+
+import json
+import logging
+import os
+import time
+from pathlib import Path
+import requests
+from requests.exceptions import RequestException
+
+logger = logging.getLogger("mapillary_downloader")
+
+
+def format_size(bytes_count):
+    """Format bytes as human-readable size.
+
+    Args:
+        bytes_count: Number of bytes
+
+    Returns:
+        Formatted string (e.g. "1.23 GB", "456.78 MB")
+    """
+    if bytes_count >= 1_000_000_000:
+        return f"{bytes_count / 1_000_000_000:.2f} GB"
+    if bytes_count >= 1_000_000:
+        return f"{bytes_count / 1_000_000:.2f} MB"
+    if bytes_count >= 1_000:
+        return f"{bytes_count / 1000:.2f} KB"
+    return f"{bytes_count} B"
+
+
+def format_time(seconds):
+    """Format seconds as human-readable time.
+
+    Args:
+        seconds: Number of seconds
+
+    Returns:
+        Formatted string (e.g. "2h 15m", "45m 30s", "30s")
+    """
+    if seconds < 60:
+        return f"{int(seconds)}s"
+
+    minutes = int(seconds / 60)
+    remaining_seconds = int(seconds % 60)
+
+    if minutes < 60:
+        if remaining_seconds > 0:
+            return f"{minutes}m {remaining_seconds}s"
+        return f"{minutes}m"
+
+    hours = int(minutes / 60)
+    remaining_minutes = minutes % 60
+
+    if remaining_minutes > 0:
+        return f"{hours}h {remaining_minutes}m"
+    return f"{hours}h"
+
+
+def safe_json_save(file_path, data):
+    """Atomically save JSON data to file.
+
+    Writes to temp file, then atomic rename to prevent corruption.
+
+    Args:
+        file_path: Path to JSON file
+        data: Data to serialize to JSON
+    """
+    file_path = Path(file_path)
+    file_path.parent.mkdir(parents=True, exist_ok=True)
+
+    temp_file = file_path.with_suffix(".json.tmp")
+    with open(temp_file, "w") as f:
+        json.dump(data, f, indent=2)
+        f.flush()
+        os.fsync(f.fileno())
+    temp_file.replace(file_path)
+
+
+def http_get_with_retry(url, params=None, max_retries=5, base_delay=1.0, timeout=60):
+    """HTTP GET with exponential backoff retry.
+
+    Args:
+        url: URL to fetch
+        params: Optional query parameters
+        max_retries: Maximum retry attempts (default: 5)
+        base_delay: Initial delay in seconds (default: 1.0)
+        timeout: Request timeout in seconds (default: 60)
+
+    Returns:
+        requests.Response object
+
+    Raises:
+        requests.RequestException: If all retries exhausted
+    """
+    for attempt in range(max_retries):
+        try:
+            response = requests.get(url, params=params, timeout=timeout)
+            response.raise_for_status()
+            return response
+        except RequestException as e:
+            if attempt == max_retries - 1:
+                raise
+
+            delay = base_delay * (2**attempt)
+            logger.warning(f"Request failed (attempt {attempt + 1}/{max_retries}): {e}")
+            logger.info(f"Retrying in {delay:.1f} seconds...")
+            time.sleep(delay)
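
As a worked example of the backoff arithmetic: with the defaults (max_retries=5, base_delay=1.0), a persistently failing GET sleeps between attempts and raises after the fifth:

```
delays = [1.0 * 2**attempt for attempt in range(5 - 1)]
print(delays)       # [1.0, 2.0, 4.0, 8.0] - sleeps before retries 2..5
print(sum(delays))  # 15.0 seconds of backoff, plus up to 5 * timeout
```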

--- mapillary_downloader-0.5.2/src/mapillary_downloader/worker.py
+++ mapillary_downloader-0.6.1/src/mapillary_downloader/worker.py
@@ -5,9 +5,9 @@ import signal
 import tempfile
 from pathlib import Path
 import requests
-from requests.exceptions import RequestException
 from mapillary_downloader.exif_writer import write_exif_to_image
 from mapillary_downloader.webp_converter import convert_to_webp
+from mapillary_downloader.utils import http_get_with_retry
 
 
 def worker_process(work_queue, result_queue, worker_id):
@@ -69,11 +69,13 @@ def download_and_convert_image(image_data, output_dir, quality, convert_webp, se
     if not image_url:
         return (image_id, 0, False, f"No {quality} URL")
 
-    # Determine final output directory
+    # Determine final output directory - organize by first char of sequence ID
     output_dir = Path(output_dir)
     sequence_id = image_data.get("sequence")
     if sequence_id:
-        img_dir = output_dir / sequence_id
+        # Use first character as bucket (gives us ~62 dirs instead of millions)
+        first_char = sequence_id[0]
+        img_dir = output_dir / first_char / sequence_id
         img_dir.mkdir(parents=True, exist_ok=True)
     else:
         img_dir = output_dir
@@ -88,19 +90,18 @@ def download_and_convert_image(image_data, output_dir, quality, convert_webp, se
     jpg_path = img_dir / f"{image_id}.jpg"
     final_path = jpg_path
 
-    # Download image (using session passed from worker)
+    # Download image with retry logic
     bytes_downloaded = 0
 
     try:
-        # 60 second timeout for entire download (connection + read)
-        response = session.get(image_url, stream=True, timeout=60)
-        response.raise_for_status()
+        # Use retry logic with 3 attempts for image downloads
+        response = http_get_with_retry(image_url, max_retries=3, base_delay=1.0, timeout=60)
 
         with open(jpg_path, "wb") as f:
             for chunk in response.iter_content(chunk_size=8192):
                 f.write(chunk)
                 bytes_downloaded += len(chunk)
-    except RequestException as e:
+    except Exception as e:
         return (image_id, 0, False, f"Download failed: {e}")
 
     # Write EXIF metadata

--- mapillary_downloader-0.5.2/src/mapillary_downloader/utils.py
+++ /dev/null
@@ -1,47 +0,0 @@
-"""Utility functions for formatting and display."""
-
-
-def format_size(bytes_count):
-    """Format bytes as human-readable size.
-
-    Args:
-        bytes_count: Number of bytes
-
-    Returns:
-        Formatted string (e.g. "1.23 GB", "456.78 MB")
-    """
-    if bytes_count >= 1_000_000_000:
-        return f"{bytes_count / 1_000_000_000:.2f} GB"
-    if bytes_count >= 1_000_000:
-        return f"{bytes_count / 1_000_000:.2f} MB"
-    if bytes_count >= 1_000:
-        return f"{bytes_count / 1000:.2f} KB"
-    return f"{bytes_count} B"
-
-
-def format_time(seconds):
-    """Format seconds as human-readable time.
-
-    Args:
-        seconds: Number of seconds
-
-    Returns:
-        Formatted string (e.g. "2h 15m", "45m 30s", "30s")
-    """
-    if seconds < 60:
-        return f"{int(seconds)}s"
-
-    minutes = int(seconds / 60)
-    remaining_seconds = int(seconds % 60)
-
-    if minutes < 60:
-        if remaining_seconds > 0:
-            return f"{minutes}m {remaining_seconds}s"
-        return f"{minutes}m"
-
-    hours = int(minutes / 60)
-    remaining_minutes = minutes % 60
-
-    if remaining_minutes > 0:
-        return f"{hours}h {remaining_minutes}m"
-    return f"{hours}h"