mapillary-downloader 0.4.1__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -56,9 +56,21 @@ def main():
         action="store_true",
         help="Don't check if collection exists on Internet Archive before downloading",
     )
+    parser.add_argument(
+        "--debug",
+        action="store_true",
+        help="Enable debug logging (EXIF data, API responses, etc.)",
+    )
 
     args = parser.parse_args()
 
+    # Set debug logging level if requested
+    if args.debug:
+        import logging
+
+        logging.getLogger("mapillary_downloader").setLevel(logging.DEBUG)
+        logger.debug("Debug logging enabled")
+
     # Check for token
     if not args.token:
         logger.error("Error: Mapillary API token required. Use --token or set MAPILLARY_TOKEN environment variable")
@@ -92,8 +92,10 @@ class MapillaryClient:
             images = data.get("data", [])
             total_fetched += len(images)
             logger.info(f"Fetched metadata for {total_fetched:,} images...")
+            logger.debug(f"API response paging: {data.get('paging', {})}")
 
             for image in images:
+                logger.debug(f"Image metadata: {image}")
                 yield image
 
             # Get next page URL
@@ -7,11 +7,12 @@ import os
 import shutil
 import time
 from pathlib import Path
-from concurrent.futures import ProcessPoolExecutor, as_completed
 from mapillary_downloader.utils import format_size, format_time
 from mapillary_downloader.ia_meta import generate_ia_metadata
 from mapillary_downloader.ia_check import check_ia_exists
-from mapillary_downloader.worker import download_and_convert_image
+from mapillary_downloader.worker import worker_process
+from mapillary_downloader.worker_pool import AdaptiveWorkerPool
+from mapillary_downloader.metadata_reader import MetadataReader
 from mapillary_downloader.tar_sequences import tar_sequence_directories
 from mapillary_downloader.logging_config import add_file_handler
 
@@ -106,23 +107,51 @@ class MapillaryDownloader:
         self.downloaded = self._load_progress()
 
     def _load_progress(self):
-        """Load previously downloaded image IDs."""
+        """Load previously downloaded image IDs for this quality."""
         if self.progress_file.exists():
             with open(self.progress_file) as f:
-                return set(json.load(f).get("downloaded", []))
+                data = json.load(f)
+                # Support both old format (single list) and new format (per-quality dict)
+                if isinstance(data, dict):
+                    if "downloaded" in data:
+                        # Old format: {"downloaded": [...]}
+                        return set(data["downloaded"])
+                    else:
+                        # New format: {"256": [...], "1024": [...], ...}
+                        return set(data.get(str(self.quality), []))
+                else:
+                    # Very old format: just a list
+                    return set(data)
         return set()
 
     def _save_progress(self):
-        """Save progress to disk atomically."""
+        """Save progress to disk atomically, per-quality."""
+        # Load existing progress for all qualities
+        if self.progress_file.exists():
+            with open(self.progress_file) as f:
+                data = json.load(f)
+                # Convert old format to new format if needed
+                if isinstance(data, dict) and "downloaded" in data:
+                    # Old format: {"downloaded": [...]} - migrate to per-quality
+                    progress = {}
+                else:
+                    progress = data if isinstance(data, dict) else {}
+        else:
+            progress = {}
+
+        # Update this quality's progress
+        progress[str(self.quality)] = list(self.downloaded)
+
+        # Write atomically
         temp_file = self.progress_file.with_suffix(".json.tmp")
         with open(temp_file, "w") as f:
-            json.dump({"downloaded": list(self.downloaded)}, f)
+            json.dump(progress, f)
             f.flush()
             os.fsync(f.fileno())
         temp_file.replace(self.progress_file)
 
     def download_user_data(self, bbox=None, convert_webp=False):
-        """Download all images for a user.
+        """Download all images for a user using streaming queue-based architecture.
 
         Args:
             bbox: Optional bounding box [west, south, east, north]
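
Note: the progress file now keeps one ID list per quality. A rough sketch of the two layouts handled above (the quality keys and IDs are illustrative; only the structure comes from the diff):

    # Old format, still accepted by _load_progress:
    old_progress = {"downloaded": ["1234567890", "1234567891"]}

    # New per-quality format written by _save_progress:
    new_progress = {"256": ["1234567890"], "1024": ["1234567890", "1234567891"]}

    # _load_progress then picks the list for the current quality:
    downloaded = set(new_progress.get(str(1024), []))
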
@@ -150,85 +179,271 @@ class MapillaryDownloader:
         logger.info(f"Quality: {self.quality}")
         logger.info(f"Using {self.workers} parallel workers")
 
-        processed = 0
-        downloaded_count = 0
-        skipped = 0
-        total_bytes = 0
-        failed_count = 0
-
         start_time = time.time()
 
-        # Track which image IDs we've seen in metadata to avoid re-fetching
-        seen_ids = set()
-
-        # Collect images to download from existing metadata
-        images_to_download = []
-
-        if self.metadata_file.exists():
-            logger.info("Processing existing metadata file...")
-            with open(self.metadata_file) as f:
-                for line in f:
-                    if line.strip():
-                        image = json.loads(line)
-                        image_id = image["id"]
-                        seen_ids.add(image_id)
-                        processed += 1
-
-                        if image_id in self.downloaded:
-                            skipped += 1
-                            continue
-
-                        # Queue for download
-                        if image.get(quality_field):
-                            images_to_download.append(image)
-
-        # Download images from existing metadata in parallel
-        if images_to_download:
-            logger.info(f"Downloading {len(images_to_download)} images from existing metadata...")
-            downloaded_count, total_bytes, failed_count = self._download_images_parallel(
-                images_to_download, convert_webp
-            )
-
-        # Always check API for new images (will skip duplicates via seen_ids)
-        logger.info("Checking for new images from API...")
-        new_images = []
+        # Step 1: Build seen_ids from metadata file (streaming, only IDs)
+        logger.info("Building seen_ids from metadata...")
+        reader = MetadataReader(self.metadata_file)
+        seen_ids = reader.get_all_ids()
+        api_complete = reader.is_complete
+        logger.info(f"Found {len(seen_ids)} existing images in metadata")
 
-        with open(self.metadata_file, "a") as meta_f:
-            for image in self.client.get_user_images(self.username, bbox=bbox):
-                image_id = image["id"]
+        # Step 2: Start worker pool (fork AFTER building seen_ids, BEFORE downloading)
+        pool = AdaptiveWorkerPool(
+            worker_process, min_workers=max(1, self.workers // 2), max_workers=self.workers, monitoring_interval=30
+        )
+        pool.start()
 
-                # Skip if we already have this in our metadata file
-                if image_id in seen_ids:
+        # Step 3: Download images from existing metadata while fetching new from API
+        downloaded_count = 0
+        skipped = 0
+        total_bytes = 0
+        failed_count = 0
+        submitted = 0
+        batch_start = time.time()
+
+        logger.info("Starting parallel download and API fetch...")
+
+        try:
+            # Step 3a: Fetch metadata from API in parallel (write-only, don't block on queue)
+            if not api_complete:
+                import threading
+
+                api_fetch_complete = threading.Event()
+                new_images_count = [0]  # Mutable so thread can update it
+
+                def fetch_api_metadata():
+                    """Fetch metadata from API and write to file (runs in thread)."""
+                    try:
+                        logger.info("API fetch thread: Starting...")
+                        with open(self.metadata_file, "a") as meta_f:
+                            for image in self.client.get_user_images(self.username, bbox=bbox):
+                                image_id = image["id"]
+
+                                # Skip if we already have this in our metadata file
+                                if image_id in seen_ids:
+                                    continue
+
+                                seen_ids.add(image_id)
+                                new_images_count[0] += 1
+
+                                # Save new metadata
+                                meta_f.write(json.dumps(image) + "\n")
+                                meta_f.flush()
+
+                                if new_images_count[0] % 1000 == 0:
+                                    logger.info(f"API: Fetched {new_images_count[0]} new images from API")
+
+                        # Mark as complete
+                        MetadataReader.mark_complete(self.metadata_file)
+                        logger.info(f"API fetch complete: {new_images_count[0]} new images")
+                    finally:
+                        api_fetch_complete.set()
+
+                # Start API fetch in background thread
+                api_thread = threading.Thread(target=fetch_api_metadata, daemon=True)
+                api_thread.start()
+            else:
+                logger.info("API fetch already complete, skipping API thread")
+                api_fetch_complete = None
+
+            # Step 3b: Tail metadata file and submit to workers
+            logger.info("Starting metadata tail and download queue feeder...")
+            last_position = 0
+
+            # Helper to process results from queue
+            def process_results():
+                nonlocal downloaded_count, total_bytes, failed_count
+                while True:
+                    result = pool.get_result(timeout=0.001)
+                    if result is None:
+                        break
+
+                    image_id, bytes_dl, success, error_msg = result
+
+                    if success:
+                        self.downloaded.add(image_id)
+                        downloaded_count += 1
+                        total_bytes += bytes_dl
+
+                        # Log every download for first 10, then every 100
+                        should_log = downloaded_count <= 10 or downloaded_count % 100 == 0
+                        if should_log:
+                            elapsed = time.time() - batch_start
+                            rate = downloaded_count / elapsed if elapsed > 0 else 0
+                            logger.info(
+                                f"Downloaded: {downloaded_count} ({format_size(total_bytes)}) "
+                                f"- Rate: {rate:.1f} images/sec"
+                            )
+
+                        if downloaded_count % 100 == 0:
+                            self._save_progress()
+                            pool.check_throughput(downloaded_count)
+                    else:
+                        failed_count += 1
+                        logger.warning(f"Failed to download {image_id}: {error_msg}")
+
+            # Tail the metadata file and submit to workers
+            while True:
+                # Check if API fetch is done and we've processed everything
+                if api_fetch_complete and api_fetch_complete.is_set():
+                    # Read any remaining lines
+                    if self.metadata_file.exists():
+                        with open(self.metadata_file) as f:
+                            f.seek(last_position)
+                            for line in f:
+                                line = line.strip()
+                                if not line:
+                                    continue
+
+                                try:
+                                    image = json.loads(line)
+                                except json.JSONDecodeError:
+                                    # Incomplete line, will retry
+                                    continue
+
+                                # Skip completion marker
+                                if image.get("__complete__"):
+                                    continue
+
+                                image_id = image.get("id")
+                                if not image_id:
+                                    continue
+
+                                # Skip if already downloaded or no quality URL
+                                if image_id in self.downloaded:
+                                    continue
+                                if not image.get(quality_field):
+                                    continue
+
+                                # Submit to workers
+                                work_item = (
+                                    image,
+                                    str(self.output_dir),
+                                    self.quality,
+                                    convert_webp,
+                                    self.client.access_token,
+                                )
+                                pool.submit(work_item)
+                                submitted += 1
+
+                                if submitted % 1000 == 0:
+                                    logger.info(f"Queue: Submitted {submitted} images")
+
+                                # Process results while submitting
+                                process_results()
+
+                            last_position = f.tell()
+
+                    # API done and all lines processed, break
+                    break
+
+                # API still running or API was already complete, tail the file
+                if self.metadata_file.exists():
+                    with open(self.metadata_file) as f:
+                        f.seek(last_position)
+                        for line in f:
+                            line = line.strip()
+                            if not line:
+                                continue
+
+                            try:
+                                image = json.loads(line)
+                            except json.JSONDecodeError:
+                                # Incomplete line, will retry next iteration
+                                continue
+
+                            # Skip completion marker
+                            if image.get("__complete__"):
+                                continue
+
+                            image_id = image.get("id")
+                            if not image_id:
+                                continue
+
+                            # Skip if already downloaded or no quality URL
+                            if image_id in self.downloaded:
+                                continue
+                            if not image.get(quality_field):
+                                continue
+
+                            # Submit to workers
+                            work_item = (
+                                image,
+                                str(self.output_dir),
+                                self.quality,
+                                convert_webp,
+                                self.client.access_token,
+                            )
+                            pool.submit(work_item)
+                            submitted += 1
+
+                            if submitted % 1000 == 0:
+                                logger.info(f"Queue: Submitted {submitted} images")
+
+                            # Process results while submitting
+                            process_results()
+
+                        last_position = f.tell()
+
+                # Sleep briefly before next tail iteration
+                time.sleep(0.1)
+
+                # Process any results that came in
+                process_results()
+
+            # Send shutdown signals
+            logger.info(f"Submitted {submitted} images, waiting for workers to finish...")
+            for _ in range(pool.current_workers):
+                pool.submit(None)
+
+            # Collect remaining results
+            completed = downloaded_count + failed_count
+
+            while completed < submitted:
+                result = pool.get_result(timeout=5)
+                if result is None:
+                    # Check throughput periodically
+                    pool.check_throughput(downloaded_count)
                     continue
 
-                seen_ids.add(image_id)
-                processed += 1
+                image_id, bytes_dl, success, error_msg = result
+                completed += 1
 
-                # Save new metadata
-                meta_f.write(json.dumps(image) + "\n")
-                meta_f.flush()
+                if success:
+                    self.downloaded.add(image_id)
+                    downloaded_count += 1
+                    total_bytes += bytes_dl
 
-                # Skip if already downloaded
-                if image_id in self.downloaded:
-                    skipped += 1
-                    continue
+                    if downloaded_count % 10 == 0:
+                        elapsed = time.time() - batch_start
+                        rate = downloaded_count / elapsed if elapsed > 0 else 0
+                        remaining = submitted - completed
+                        eta_seconds = remaining / rate if rate > 0 else 0
 
-                # Queue for download
-                if image.get(quality_field):
-                    new_images.append(image)
+                        logger.info(
+                            f"Downloaded: {downloaded_count}/{submitted} ({format_size(total_bytes)}) "
+                            f"- ETA: {format_time(eta_seconds)}"
+                        )
+                        self._save_progress()
+                        pool.check_throughput(downloaded_count)
+                else:
+                    failed_count += 1
+                    logger.warning(f"Failed to download {image_id}: {error_msg}")
 
-        # Download new images in parallel
-        if new_images:
-            logger.info(f"Downloading {len(new_images)} new images...")
-            new_downloaded, new_bytes, new_failed = self._download_images_parallel(new_images, convert_webp)
-            downloaded_count += new_downloaded
-            total_bytes += new_bytes
-            failed_count += new_failed
+        finally:
+            # Shutdown worker pool
+            pool.shutdown()
 
         self._save_progress()
         elapsed = time.time() - start_time
+
+        # Count total images in metadata
+        total_images = len(seen_ids)
+        skipped = total_images - downloaded_count - failed_count
+
         logger.info(
-            f"Complete! Processed {processed} images, downloaded {downloaded_count} ({format_size(total_bytes)}), "
+            f"Complete! Total {total_images} images, downloaded {downloaded_count} ({format_size(total_bytes)}), "
             f"skipped {skipped}, failed {failed_count}"
         )
         logger.info(f"Total time: {format_time(elapsed)}")
@@ -268,59 +483,3 @@ class MapillaryDownloader:
         self.final_dir.parent.mkdir(parents=True, exist_ok=True)
         shutil.move(str(self.staging_dir), str(self.final_dir))
         logger.info(f"Collection moved to: {self.final_dir}")
-
-    def _download_images_parallel(self, images, convert_webp):
-        """Download images in parallel using worker pool.
-
-        Args:
-            images: List of image metadata dicts
-            convert_webp: Whether to convert to WebP
-
-        Returns:
-            Tuple of (downloaded_count, total_bytes, failed_count)
-        """
-        downloaded_count = 0
-        total_bytes = 0
-        failed_count = 0
-        batch_start_time = time.time()
-
-        with ProcessPoolExecutor(max_workers=self.workers) as executor:
-            # Submit all tasks
-            future_to_image = {}
-            for image in images:
-                future = executor.submit(
-                    download_and_convert_image,
-                    image,
-                    str(self.output_dir),
-                    self.quality,
-                    convert_webp,
-                    self.client.access_token,
-                )
-                future_to_image[future] = image["id"]
-
-            # Process results as they complete
-            for future in as_completed(future_to_image):
-                image_id, bytes_dl, success, error_msg = future.result()
-
-                if success:
-                    self.downloaded.add(image_id)
-                    downloaded_count += 1
-                    total_bytes += bytes_dl
-
-                    if downloaded_count % 10 == 0:
-                        # Calculate ETA
-                        elapsed = time.time() - batch_start_time
-                        rate = downloaded_count / elapsed if elapsed > 0 else 0
-                        remaining = len(images) - downloaded_count
-                        eta_seconds = remaining / rate if rate > 0 else 0
-
-                        logger.info(
-                            f"Downloaded: {downloaded_count}/{len(images)} ({format_size(total_bytes)}) "
-                            f"- ETA: {format_time(eta_seconds)}"
-                        )
-                        self._save_progress()
-                else:
-                    failed_count += 1
-                    logger.warning(f"Failed to download {image_id}: {error_msg}")
-
-        return downloaded_count, total_bytes, failed_count
@@ -1,8 +1,11 @@
 """EXIF metadata writer for Mapillary images."""
 
+import logging
 import piexif
 from datetime import datetime
 
+logger = logging.getLogger("mapillary_downloader")
+
 
 def decimal_to_dms(decimal):
     """Convert decimal degrees to degrees, minutes, seconds format for EXIF.
@@ -47,6 +50,9 @@ def write_exif_to_image(image_path, metadata):
         True if successful, False otherwise
     """
     try:
+        logger.debug(f"Writing EXIF to {image_path}")
+        logger.debug(f"Metadata: {metadata}")
+
         # Load existing EXIF data if any
         try:
             exif_dict = piexif.load(str(image_path))
@@ -99,13 +105,17 @@ def write_exif_to_image(image_path, metadata):
         # GPS Altitude - prefer computed_altitude over altitude
         altitude = metadata.get("computed_altitude") or metadata.get("altitude")
         if altitude is not None:
-            exif_dict["GPS"][piexif.GPSIFD.GPSAltitude] = (int(abs(altitude) * 100), 100)
+            altitude_val = int(abs(altitude) * 100)
+            logger.debug(f"Raw altitude value: {altitude}, calculated: {altitude_val}")
+            exif_dict["GPS"][piexif.GPSIFD.GPSAltitude] = (altitude_val, 100)
             exif_dict["GPS"][piexif.GPSIFD.GPSAltitudeRef] = 1 if altitude < 0 else 0
 
         # GPS Compass direction
         compass = metadata.get("computed_compass_angle") or metadata.get("compass_angle")
         if compass is not None:
-            exif_dict["GPS"][piexif.GPSIFD.GPSImgDirection] = (int(compass * 100), 100)
+            # Normalize compass to 0-360 range
+            compass_val = int((compass % 360) * 100)
+            exif_dict["GPS"][piexif.GPSIFD.GPSImgDirection] = (compass_val, 100)
             exif_dict["GPS"][piexif.GPSIFD.GPSImgDirectionRef] = b"T"  # True north
 
         # GPS Version
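
Note: a quick check of the new rational encoding and normalization, using made-up values (the formulas are the ones in the hunk above):

    compass = 365.5
    compass_val = int((compass % 360) * 100)   # 550 -> stored as (550, 100), i.e. 5.5 degrees

    altitude = -12.5
    altitude_val = int(abs(altitude) * 100)    # 1250 -> stored as (1250, 100), GPSAltitudeRef = 1 (below sea level)
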
@@ -115,8 +125,10 @@ def write_exif_to_image(image_path, metadata):
         exif_bytes = piexif.dump(exif_dict)
         piexif.insert(exif_bytes, str(image_path))
 
+        logger.debug(f"Successfully wrote EXIF to {image_path}")
         return True
 
     except Exception as e:
-        print(f"Warning: Failed to write EXIF data to {image_path}: {e}")
+        logger.warning(f"Failed to write EXIF data to {image_path}: {e}")
+        logger.debug(f"Full metadata: {metadata}")
         return False
@@ -0,0 +1,123 @@
+"""Streaming metadata reader with filtering."""
+
+import gzip
+import json
+import logging
+from pathlib import Path
+
+logger = logging.getLogger("mapillary_downloader")
+
+
+class MetadataReader:
+    """Streams metadata.jsonl line-by-line with filtering.
+
+    This avoids loading millions of image dicts into memory.
+    """
+
+    COMPLETION_MARKER = {"__complete__": True}
+
+    def __init__(self, metadata_file):
+        """Initialize metadata reader.
+
+        Args:
+            metadata_file: Path to metadata.jsonl or metadata.jsonl.gz
+        """
+        self.metadata_file = Path(metadata_file)
+        self.is_complete = False
+
+    def iter_images(self, quality_field=None, downloaded_ids=None):
+        """Stream images from metadata file with filtering.
+
+        Args:
+            quality_field: Optional field to check exists (e.g., 'thumb_1024_url')
+            downloaded_ids: Optional set of already downloaded IDs to skip
+
+        Yields:
+            Image metadata dicts that pass filters
+        """
+        if not self.metadata_file.exists():
+            return
+
+        # Handle gzipped files
+        if self.metadata_file.suffix == ".gz":
+            file_handle = gzip.open(self.metadata_file, "rt")
+        else:
+            file_handle = open(self.metadata_file)
+
+        with file_handle as f:
+            for line in f:
+                line = line.strip()
+                if not line:
+                    continue
+
+                image = json.loads(line)
+
+                # Check for completion marker
+                if image.get("__complete__"):
+                    self.is_complete = True
+                    logger.debug("Found API fetch completion marker")
+                    continue
+
+                image_id = image.get("id")
+                if not image_id:
+                    continue
+
+                # Filter by downloaded status
+                if downloaded_ids and image_id in downloaded_ids:
+                    continue
+
+                # Filter by quality field availability
+                if quality_field and not image.get(quality_field):
+                    continue
+
+                yield image
+
+    def get_all_ids(self):
+        """Get set of all image IDs in metadata file.
+
+        Returns:
+            Set of image IDs (for building seen_ids)
+        """
+        ids = set()
+
+        if not self.metadata_file.exists():
+            return ids
+
+        # Handle gzipped files
+        if self.metadata_file.suffix == ".gz":
+            file_handle = gzip.open(self.metadata_file, "rt")
+        else:
+            file_handle = open(self.metadata_file)
+
+        with file_handle as f:
+            for line in f:
+                line = line.strip()
+                if not line:
+                    continue
+
+                image = json.loads(line)
+
+                # Skip completion marker
+                if image.get("__complete__"):
+                    self.is_complete = True
+                    continue
+
+                image_id = image.get("id")
+                if image_id:
+                    ids.add(image_id)
+
+        return ids
+
+    @staticmethod
+    def mark_complete(metadata_file):
+        """Append completion marker to metadata file.
+
+        Args:
+            metadata_file: Path to metadata.jsonl
+        """
+        metadata_file = Path(metadata_file)
+        if metadata_file.exists():
+            with open(metadata_file, "a") as f:
+                f.write(json.dumps(MetadataReader.COMPLETION_MARKER) + "\n")
+                f.flush()
+            logger.info("Marked metadata file as complete")
@@ -9,6 +9,31 @@ from mapillary_downloader.exif_writer import write_exif_to_image
 from mapillary_downloader.webp_converter import convert_to_webp
 
 
+def worker_process(work_queue, result_queue, worker_id):
+    """Worker process that pulls from queue and processes images.
+
+    Args:
+        work_queue: Queue to pull work items from
+        result_queue: Queue to push results to
+        worker_id: Unique worker identifier
+    """
+    while True:
+        work_item = work_queue.get()
+
+        # None is the shutdown signal
+        if work_item is None:
+            break
+
+        # Unpack work item
+        image_data, output_dir, quality, convert_webp, access_token = work_item
+
+        # Process the image
+        result = download_and_convert_image(image_data, output_dir, quality, convert_webp, access_token)
+
+        # Push result back
+        result_queue.put(result)
+
+
 def download_and_convert_image(image_data, output_dir, quality, convert_webp, access_token):
     """Download and optionally convert a single image.
 
@@ -0,0 +1,136 @@
+"""Adaptive worker pool for parallel processing."""
+
+import logging
+import multiprocessing as mp
+import queue
+import time
+from collections import deque
+
+logger = logging.getLogger("mapillary_downloader")
+
+
+class AdaptiveWorkerPool:
+    """Worker pool that scales based on throughput.
+
+    Monitors throughput every 30 seconds and adjusts worker count:
+    - If throughput increasing: add workers (up to max)
+    - If throughput plateauing/decreasing: reduce workers
+    """
+
+    def __init__(self, worker_func, min_workers=4, max_workers=16, monitoring_interval=30):
+        """Initialize adaptive worker pool.
+
+        Args:
+            worker_func: Function to run in each worker (must accept work_queue, result_queue)
+            min_workers: Minimum number of workers
+            max_workers: Maximum number of workers
+            monitoring_interval: Seconds between throughput checks
+        """
+        self.worker_func = worker_func
+        self.min_workers = min_workers
+        self.max_workers = max_workers
+        self.monitoring_interval = monitoring_interval
+
+        # Queues
+        self.work_queue = mp.Queue(maxsize=max_workers)
+        self.result_queue = mp.Queue()
+
+        # Worker management
+        self.workers = []
+        self.current_workers = min_workers
+
+        # Throughput monitoring
+        self.throughput_history = deque(maxlen=5)  # Last 5 measurements
+        self.last_processed = 0
+        self.last_check_time = time.time()
+
+        self.running = False
+
+    def start(self):
+        """Start the worker pool."""
+        self.running = True
+        logger.info(f"Starting worker pool with {self.current_workers} workers")
+
+        for i in range(self.current_workers):
+            self._add_worker(i)
+
+    def _add_worker(self, worker_id):
+        """Add a new worker to the pool."""
+        p = mp.Process(target=self.worker_func, args=(self.work_queue, self.result_queue, worker_id))
+        p.start()
+        self.workers.append(p)
+        logger.debug(f"Started worker {worker_id}")
+
+    def submit(self, work_item):
+        """Submit work to the pool (blocks if queue is full)."""
+        self.work_queue.put(work_item)
+
+    def get_result(self, timeout=None):
+        """Get a result from the workers.
+
+        Returns:
+            Result from worker, or None if timeout
+        """
+        try:
+            return self.result_queue.get(timeout=timeout)
+        except queue.Empty:
+            return None
+
+    def check_throughput(self, total_processed):
+        """Check throughput and adjust workers if needed.
+
+        Args:
+            total_processed: Total number of items processed so far
+        """
+        now = time.time()
+        elapsed = now - self.last_check_time
+
+        if elapsed < self.monitoring_interval:
+            return
+
+        # Calculate current throughput (items/sec)
+        items_since_check = total_processed - self.last_processed
+        throughput = items_since_check / elapsed
+
+        self.throughput_history.append(throughput)
+        self.last_processed = total_processed
+        self.last_check_time = now
+
+        # Need at least 3 measurements to detect trends
+        if len(self.throughput_history) < 3:
+            return
+
+        # Check if throughput is increasing
+        recent_avg = sum(list(self.throughput_history)[-2:]) / 2
+        older_avg = sum(list(self.throughput_history)[-4:-2]) / 2
+
+        if recent_avg > older_avg * 1.1 and len(self.workers) < self.max_workers:
+            # Throughput increasing by >10%, add workers
+            new_worker_id = len(self.workers)
+            self._add_worker(new_worker_id)
+            self.current_workers += 1
+            logger.info(f"Throughput increasing ({throughput:.1f} items/s), added worker (now {self.current_workers})")
+
+        elif recent_avg < older_avg * 0.9 and len(self.workers) > self.min_workers:
+            # Throughput decreasing by >10%, remove worker
+            # (workers will exit naturally when they finish current work)
+            self.current_workers = max(self.min_workers, self.current_workers - 1)
+            logger.info(f"Throughput plateauing ({throughput:.1f} items/s), reducing to {self.current_workers} workers")
+
+    def shutdown(self, timeout=30):
+        """Shutdown the worker pool gracefully."""
+        logger.info("Shutting down worker pool...")
+        self.running = False
+
+        # Send stop signals
+        for _ in self.workers:
+            self.work_queue.put(None)
+
+        # Wait for workers to finish
+        for p in self.workers:
+            p.join(timeout=timeout)
+            if p.is_alive():
+                logger.warning(f"Worker {p.pid} did not exit cleanly, terminating")
+                p.terminate()
+
+        logger.info("Worker pool shutdown complete")
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mapillary_downloader
-Version: 0.4.1
+Version: 0.5.0
 Summary: Download your Mapillary data before it's gone
 Author-email: Gareth Davidson <gaz@bitplane.net>
 Requires-Python: >=3.10
@@ -0,0 +1,19 @@
+mapillary_downloader/__init__.py,sha256=KEjiBRghXDeA7E15RJeLBfQm-yNJkowZarL59QOh_1w,120
+mapillary_downloader/__main__.py,sha256=Kjfx2woMyCvAxYAdqvtXtYJknCMviV_K2PSo0cDc8Hg,4320
+mapillary_downloader/client.py,sha256=a5n43FLHP45EHodEjl0ieziBK-b6Ey-rZJwYB6EFhNI,4743
+mapillary_downloader/downloader.py,sha256=F36AtB0Ro_EXR78EDOqH248llV7fGVeR4j9nZf0q7qg,19988
+mapillary_downloader/exif_writer.py,sha256=K_441EG1siWyNMmFGZSfnORUCjBThkeg4JFtbg9AOsA,5120
+mapillary_downloader/ia_check.py,sha256=L2MEbG_KmlAd5NLmo2HQkO8HWvRN0brE5wXXoyNMbq8,1100
+mapillary_downloader/ia_meta.py,sha256=78rcybHIPnQDsF02KGj6RYmDXzYzrU8sdVx4Q9Y0sfI,6266
+mapillary_downloader/logging_config.py,sha256=Z-wNq34nt7aIhJWdeKc1feTY46P9-Or7HtiX7eUFjEI,2324
+mapillary_downloader/metadata_reader.py,sha256=-4BmtLVI9sldZU0LlqMc-bporiYNpk6-F2RKKMvzLu4,3560
+mapillary_downloader/tar_sequences.py,sha256=mqs5p3N7osV_bxTkw6i34GVmxCBBEbIiKKxeh-fWNdU,4430
+mapillary_downloader/utils.py,sha256=yzVgS1mwsklDAqrimaFafgTTXtRYQUbKP98Xgh9d2KA,1174
+mapillary_downloader/webp_converter.py,sha256=vYLLQxDmdnqRz0nm7wXwRUd4x9mQZNah-DrncpA8sNs,1901
+mapillary_downloader/worker.py,sha256=RMZO8N67Kl-bhHC1qUdZg6Sx8k6RYbPRhyuLyOjr29o,4450
+mapillary_downloader/worker_pool.py,sha256=QFYIbqkgamOtB-iRyZp5kN6jdZuYw93izls61ayVIZ8,4771
+mapillary_downloader-0.5.0.dist-info/entry_points.txt,sha256=PdYtxOXHMJrUhmiPO4G-F98VuhUI4MN9D_T4KPrVZ5w,75
+mapillary_downloader-0.5.0.dist-info/licenses/LICENSE.md,sha256=7_BIuQ-veOrsF-WarH8kTkm0-xrCLvJ1PFE1C4Ebs64,146
+mapillary_downloader-0.5.0.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
+mapillary_downloader-0.5.0.dist-info/METADATA,sha256=A0AhsIjGV9FBf5vz28hSC2jugcRqz5A8gsZwMGGEw2A,4982
+mapillary_downloader-0.5.0.dist-info/RECORD,,
@@ -1,17 +0,0 @@
-mapillary_downloader/__init__.py,sha256=KEjiBRghXDeA7E15RJeLBfQm-yNJkowZarL59QOh_1w,120
-mapillary_downloader/__main__.py,sha256=avh546grDz379HbA4JOOH2ovSH64Z69okGZO8LKciJ8,3964
-mapillary_downloader/client.py,sha256=O7JgshaM3QKUv0xXuBbe_uPqsTr4lgyuVUHYndvXTfA,4611
-mapillary_downloader/downloader.py,sha256=cVV24uIc3nQ_YXzqpwdVSr-L4fkME3sXq3pCfFS-0Ls,12476
-mapillary_downloader/exif_writer.py,sha256=Bn1u3QULfHtae86FnUGcqN450NccJwtwW9wVaSRyx9E,4615
-mapillary_downloader/ia_check.py,sha256=L2MEbG_KmlAd5NLmo2HQkO8HWvRN0brE5wXXoyNMbq8,1100
-mapillary_downloader/ia_meta.py,sha256=78rcybHIPnQDsF02KGj6RYmDXzYzrU8sdVx4Q9Y0sfI,6266
-mapillary_downloader/logging_config.py,sha256=Z-wNq34nt7aIhJWdeKc1feTY46P9-Or7HtiX7eUFjEI,2324
-mapillary_downloader/tar_sequences.py,sha256=mqs5p3N7osV_bxTkw6i34GVmxCBBEbIiKKxeh-fWNdU,4430
-mapillary_downloader/utils.py,sha256=yzVgS1mwsklDAqrimaFafgTTXtRYQUbKP98Xgh9d2KA,1174
-mapillary_downloader/webp_converter.py,sha256=vYLLQxDmdnqRz0nm7wXwRUd4x9mQZNah-DrncpA8sNs,1901
-mapillary_downloader/worker.py,sha256=eqaBhP5NE_VoJSTZfFb4OAqGyVX65xyoVUp2vosYBM8,3722
-mapillary_downloader-0.4.1.dist-info/entry_points.txt,sha256=PdYtxOXHMJrUhmiPO4G-F98VuhUI4MN9D_T4KPrVZ5w,75
-mapillary_downloader-0.4.1.dist-info/licenses/LICENSE.md,sha256=7_BIuQ-veOrsF-WarH8kTkm0-xrCLvJ1PFE1C4Ebs64,146
-mapillary_downloader-0.4.1.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
-mapillary_downloader-0.4.1.dist-info/METADATA,sha256=wgVRFgLesT4OFb-dsyQ-14KvXQTagnx7WjqHkAt2aFQ,4982
-mapillary_downloader-0.4.1.dist-info/RECORD,,