mapillary-downloader 0.4.2__tar.gz → 0.5.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (19)
  1. {mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/PKG-INFO +1 -1
  2. {mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/pyproject.toml +1 -1
  3. mapillary_downloader-0.5.1/src/mapillary_downloader/downloader.py +471 -0
  4. mapillary_downloader-0.5.1/src/mapillary_downloader/metadata_reader.py +163 -0
  5. {mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/src/mapillary_downloader/worker.py +34 -7
  6. mapillary_downloader-0.5.1/src/mapillary_downloader/worker_pool.py +204 -0
  7. mapillary_downloader-0.4.2/src/mapillary_downloader/downloader.py +0 -326
  8. {mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/LICENSE.md +0 -0
  9. {mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/README.md +0 -0
  10. {mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/src/mapillary_downloader/__init__.py +0 -0
  11. {mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/src/mapillary_downloader/__main__.py +0 -0
  12. {mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/src/mapillary_downloader/client.py +0 -0
  13. {mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/src/mapillary_downloader/exif_writer.py +0 -0
  14. {mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/src/mapillary_downloader/ia_check.py +0 -0
  15. {mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/src/mapillary_downloader/ia_meta.py +0 -0
  16. {mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/src/mapillary_downloader/logging_config.py +0 -0
  17. {mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/src/mapillary_downloader/tar_sequences.py +0 -0
  18. {mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/src/mapillary_downloader/utils.py +0 -0
  19. {mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/src/mapillary_downloader/webp_converter.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mapillary_downloader
3
- Version: 0.4.2
3
+ Version: 0.5.1
4
4
  Summary: Download your Mapillary data before it's gone
5
5
  Author-email: Gareth Davidson <gaz@bitplane.net>
6
6
  Requires-Python: >=3.10
@@ -1,7 +1,7 @@
1
1
  [project]
2
2
  name = "mapillary_downloader"
3
3
  description = "Download your Mapillary data before it's gone"
4
- version = "0.4.2"
4
+ version = "0.5.1"
5
5
  authors = [
6
6
  { name = "Gareth Davidson", email = "gaz@bitplane.net" }
7
7
  ]
@@ -0,0 +1,471 @@
1
+ """Main downloader logic."""
2
+
3
+ import gzip
4
+ import json
5
+ import logging
6
+ import os
7
+ import shutil
8
+ import time
9
+ from pathlib import Path
10
+ from mapillary_downloader.utils import format_size, format_time
11
+ from mapillary_downloader.ia_meta import generate_ia_metadata
12
+ from mapillary_downloader.ia_check import check_ia_exists
13
+ from mapillary_downloader.worker import worker_process
14
+ from mapillary_downloader.worker_pool import AdaptiveWorkerPool
15
+ from mapillary_downloader.metadata_reader import MetadataReader
16
+ from mapillary_downloader.tar_sequences import tar_sequence_directories
17
+ from mapillary_downloader.logging_config import add_file_handler
18
+
19
+ logger = logging.getLogger("mapillary_downloader")
20
+
21
+
22
+ def get_cache_dir():
23
+ """Get XDG cache directory for staging downloads.
24
+
25
+ Returns:
26
+ Path to cache directory for mapillary_downloader
27
+ """
28
+ xdg_cache = os.environ.get("XDG_CACHE_HOME")
29
+ if xdg_cache:
30
+ cache_dir = Path(xdg_cache)
31
+ else:
32
+ cache_dir = Path.home() / ".cache"
33
+
34
+ mapillary_cache = cache_dir / "mapillary_downloader"
35
+ mapillary_cache.mkdir(parents=True, exist_ok=True)
36
+ return mapillary_cache
37
+
38
+
39
+ class MapillaryDownloader:
40
+ """Handles downloading Mapillary data for a user."""
41
+
42
+ def __init__(
43
+ self,
44
+ client,
45
+ output_dir,
46
+ username=None,
47
+ quality=None,
48
+ workers=None,
49
+ tar_sequences=True,
50
+ convert_webp=False,
51
+ check_ia=True,
52
+ ):
53
+ """Initialize the downloader.
54
+
55
+ Args:
56
+ client: MapillaryClient instance
57
+ output_dir: Base directory to save downloads (final destination)
58
+ username: Mapillary username (for collection directory)
59
+ quality: Image quality (for collection directory)
60
+ workers: Number of parallel workers (default: half of cpu_count)
61
+ tar_sequences: Whether to tar sequence directories after download (default: True)
62
+ convert_webp: Whether to convert images to WebP (affects collection name)
63
+ check_ia: Whether to check if collection exists on Internet Archive (default: True)
64
+ """
65
+ self.client = client
66
+ self.base_output_dir = Path(output_dir)
67
+ self.username = username
68
+ self.quality = quality
69
+ self.workers = workers if workers is not None else max(1, os.cpu_count() // 2)
70
+ self.tar_sequences = tar_sequences
71
+ self.convert_webp = convert_webp
72
+ self.check_ia = check_ia
73
+
74
+ # Determine collection name
75
+ if username and quality:
76
+ collection_name = f"mapillary-{username}-{quality}"
77
+ if convert_webp:
78
+ collection_name += "-webp"
79
+ self.collection_name = collection_name
80
+ else:
81
+ self.collection_name = None
82
+
83
+ # Set up staging directory in cache
84
+ cache_dir = get_cache_dir()
85
+ if self.collection_name:
86
+ self.staging_dir = cache_dir / self.collection_name
87
+ self.final_dir = self.base_output_dir / self.collection_name
88
+ else:
89
+ self.staging_dir = cache_dir / "download"
90
+ self.final_dir = self.base_output_dir
91
+
92
+ # Work in staging directory during download
93
+ self.output_dir = self.staging_dir
94
+ self.output_dir.mkdir(parents=True, exist_ok=True)
95
+
96
+ logger.info(f"Staging directory: {self.staging_dir}")
97
+ logger.info(f"Final destination: {self.final_dir}")
98
+
99
+ # Set up file logging for archival with timestamp for incremental runs
100
+ timestamp = time.strftime("%Y%m%d-%H%M%S")
101
+ log_file = self.output_dir / f"download.log.{timestamp}"
102
+ add_file_handler(log_file)
103
+ logger.info(f"Logging to: {log_file}")
104
+
105
+ self.metadata_file = self.output_dir / "metadata.jsonl"
106
+ self.progress_file = self.output_dir / "progress.json"
107
+ self.downloaded = self._load_progress()
108
+
109
+ def _load_progress(self):
110
+ """Load previously downloaded image IDs for this quality."""
111
+ if self.progress_file.exists():
112
+ with open(self.progress_file) as f:
113
+ data = json.load(f)
114
+ # Support both old format (single list) and new format (per-quality dict)
115
+ if isinstance(data, dict):
116
+ if "downloaded" in data:
117
+ # Old format: {"downloaded": [...]}
118
+ return set(data["downloaded"])
119
+ else:
120
+ # New format: {"256": [...], "1024": [...], ...}
121
+ return set(data.get(str(self.quality), []))
122
+ else:
123
+ # Very old format: just a list
124
+ return set(data)
125
+ return set()
126
+
127
+ def _save_progress(self):
128
+ """Save progress to disk atomically, per-quality."""
129
+ # Load existing progress for all qualities
130
+ if self.progress_file.exists():
131
+ with open(self.progress_file) as f:
132
+ data = json.load(f)
133
+ # Convert old format to new format if needed
134
+ if isinstance(data, dict) and "downloaded" in data:
135
+ # Old format: {"downloaded": [...]} - migrate to per-quality
136
+ progress = {}
137
+ else:
138
+ progress = data if isinstance(data, dict) else {}
139
+ else:
140
+ progress = {}
141
+
142
+ # Update this quality's progress
143
+ progress[str(self.quality)] = list(self.downloaded)
144
+
145
+ # Write atomically
146
+ temp_file = self.progress_file.with_suffix(".json.tmp")
147
+ with open(temp_file, "w") as f:
148
+ json.dump(progress, f)
149
+ f.flush()
150
+ os.fsync(f.fileno())
151
+ temp_file.replace(self.progress_file)
152
+
153
+ def download_user_data(self, bbox=None, convert_webp=False):
154
+ """Download all images for a user using streaming queue-based architecture.
155
+
156
+ Args:
157
+ bbox: Optional bounding box [west, south, east, north]
158
+ convert_webp: Convert images to WebP format after download
159
+ """
160
+ if not self.username or not self.quality:
161
+ raise ValueError("Username and quality must be provided during initialization")
162
+
163
+ # Check if collection already exists on Internet Archive
164
+ if self.check_ia and self.collection_name:
165
+ logger.info(f"Checking if {self.collection_name} exists on Internet Archive...")
166
+ if check_ia_exists(self.collection_name):
167
+ logger.info("Collection already exists on archive.org, skipping download")
168
+ return
169
+
170
+ # Check if collection already exists in final destination
171
+ if self.final_dir.exists():
172
+ logger.info(f"Collection already exists at {self.final_dir}, skipping download")
173
+ return
174
+
175
+ quality_field = f"thumb_{self.quality}_url"
176
+
177
+ logger.info(f"Downloading images for user: {self.username}")
178
+ logger.info(f"Output directory: {self.output_dir}")
179
+ logger.info(f"Quality: {self.quality}")
180
+ logger.info(f"Using {self.workers} parallel workers")
181
+
182
+ start_time = time.time()
183
+
184
+ # Step 1: Check if API fetch is already complete
185
+ reader = MetadataReader(self.metadata_file)
186
+ api_complete = reader.is_complete
187
+ if api_complete:
188
+ logger.info("API fetch already complete, will only download")
189
+ else:
190
+ logger.info("API fetch incomplete, will fetch and download in parallel")
191
+
192
+ # Step 2: Start worker pool
193
+ # Since workers do both I/O (download) and CPU (WebP), need many more workers
194
+ # Cap at 128 for now - will build proper dynamic scaling on a new branch later
195
+ pool = AdaptiveWorkerPool(worker_process, min_workers=self.workers, max_workers=128, monitoring_interval=10)
196
+ pool.start()
197
+
198
+ # Step 3: Download images from metadata file while fetching new from API
199
+ downloaded_count = 0
200
+ total_bytes = 0
201
+ failed_count = 0
202
+ submitted = 0
203
+ batch_start = time.time()
204
+
205
+ logger.info("Starting parallel download and API fetch...")
206
+
207
+ try:
208
+ # Step 3a: Fetch metadata from API in parallel (write-only, don't block on queue)
209
+ if not api_complete:
210
+ import threading
211
+
212
+ api_fetch_complete = threading.Event()
213
+ new_images_count = [0] # Mutable so thread can update it
214
+
215
+ def fetch_api_metadata():
216
+ """Fetch metadata from API and write to file (runs in thread)."""
217
+ try:
218
+ logger.info("API fetch thread: Starting...")
219
+ with open(self.metadata_file, "a") as meta_f:
220
+ for image in self.client.get_user_images(self.username, bbox=bbox):
221
+ new_images_count[0] += 1
222
+
223
+ # Save metadata (don't dedupe here, let the tailer handle it)
224
+ meta_f.write(json.dumps(image) + "\n")
225
+ meta_f.flush()
226
+
227
+ if new_images_count[0] % 1000 == 0:
228
+ logger.info(f"API: Fetched {new_images_count[0]} images from API")
229
+
230
+ # Mark as complete
231
+ MetadataReader.mark_complete(self.metadata_file)
232
+ logger.info(f"API fetch complete: {new_images_count[0]} images")
233
+ finally:
234
+ api_fetch_complete.set()
235
+
236
+ # Start API fetch in background thread
237
+ api_thread = threading.Thread(target=fetch_api_metadata, daemon=True)
238
+ api_thread.start()
239
+ else:
240
+ logger.info("API fetch already complete, skipping API thread")
241
+ api_fetch_complete = None
242
+
243
+ # Step 3b: Tail metadata file and submit to workers
244
+ logger.info("Starting metadata tail and download queue feeder...")
245
+ last_position = 0
246
+
247
+ # Helper to process results from queue
248
+ def process_results():
249
+ nonlocal downloaded_count, total_bytes, failed_count
250
+ while True:
251
+ result = pool.get_result(timeout=0.001)
252
+ if result is None:
253
+ break
254
+
255
+ image_id, bytes_dl, success, error_msg = result
256
+
257
+ if success:
258
+ self.downloaded.add(image_id)
259
+ downloaded_count += 1
260
+ total_bytes += bytes_dl
261
+
262
+ # Log every download for first 10, then every 100
263
+ should_log = downloaded_count <= 10 or downloaded_count % 100 == 0
264
+ if should_log:
265
+ elapsed = time.time() - batch_start
266
+ rate = downloaded_count / elapsed if elapsed > 0 else 0
267
+ logger.info(
268
+ f"Downloaded: {downloaded_count} ({format_size(total_bytes)}) "
269
+ f"- Rate: {rate:.1f} images/sec"
270
+ )
271
+
272
+ if downloaded_count % 100 == 0:
273
+ self._save_progress()
274
+ pool.check_throughput(downloaded_count)
275
+ else:
276
+ failed_count += 1
277
+ logger.warning(f"Failed to download {image_id}: {error_msg}")
278
+
279
+ # Tail the metadata file and submit to workers
280
+ while True:
281
+ # Check if API fetch is done and we've processed everything
282
+ if api_fetch_complete and api_fetch_complete.is_set():
283
+ # Read any remaining lines
284
+ if self.metadata_file.exists():
285
+ with open(self.metadata_file) as f:
286
+ f.seek(last_position)
287
+ for line in f:
288
+ line = line.strip()
289
+ if not line:
290
+ continue
291
+
292
+ try:
293
+ image = json.loads(line)
294
+ except json.JSONDecodeError:
295
+ # Incomplete line, will retry
296
+ continue
297
+
298
+ # Skip completion marker
299
+ if image.get("__complete__"):
300
+ continue
301
+
302
+ image_id = image.get("id")
303
+ if not image_id:
304
+ continue
305
+
306
+ # Skip if already downloaded or no quality URL
307
+ if image_id in self.downloaded:
308
+ continue
309
+ if not image.get(quality_field):
310
+ continue
311
+
312
+ # Submit to workers
313
+ work_item = (
314
+ image,
315
+ str(self.output_dir),
316
+ self.quality,
317
+ convert_webp,
318
+ self.client.access_token,
319
+ )
320
+ pool.submit(work_item)
321
+ submitted += 1
322
+
323
+ if submitted % 1000 == 0:
324
+ logger.info(f"Queue: Submitted {submitted} images")
325
+
326
+ # Process results while submitting
327
+ process_results()
328
+
329
+ last_position = f.tell()
330
+
331
+ # API done and all lines processed, break
332
+ break
333
+
334
+ # API still running or API was already complete, tail the file
335
+ if self.metadata_file.exists():
336
+ with open(self.metadata_file) as f:
337
+ f.seek(last_position)
338
+ for line in f:
339
+ line = line.strip()
340
+ if not line:
341
+ continue
342
+
343
+ try:
344
+ image = json.loads(line)
345
+ except json.JSONDecodeError:
346
+ # Incomplete line, will retry next iteration
347
+ continue
348
+
349
+ # Skip completion marker
350
+ if image.get("__complete__"):
351
+ continue
352
+
353
+ image_id = image.get("id")
354
+ if not image_id:
355
+ continue
356
+
357
+ # Skip if already downloaded or no quality URL
358
+ if image_id in self.downloaded:
359
+ continue
360
+ if not image.get(quality_field):
361
+ continue
362
+
363
+ # Submit to workers
364
+ work_item = (
365
+ image,
366
+ str(self.output_dir),
367
+ self.quality,
368
+ convert_webp,
369
+ self.client.access_token,
370
+ )
371
+ pool.submit(work_item)
372
+ submitted += 1
373
+
374
+ if submitted % 1000 == 0:
375
+ logger.info(f"Queue: Submitted {submitted} images")
376
+
377
+ # Process results while submitting
378
+ process_results()
379
+
380
+ last_position = f.tell()
381
+
382
+ # Sleep briefly before next tail iteration
383
+ time.sleep(0.1)
384
+
385
+ # Process any results that came in
386
+ process_results()
387
+
388
+ # Send shutdown signals
389
+ logger.info(f"Submitted {submitted} images, waiting for workers to finish...")
390
+ for _ in range(pool.current_workers):
391
+ pool.submit(None)
392
+
393
+ # Collect remaining results
394
+ completed = downloaded_count + failed_count
395
+
396
+ while completed < submitted:
397
+ result = pool.get_result(timeout=5)
398
+ if result is None:
399
+ # Check throughput periodically
400
+ pool.check_throughput(downloaded_count)
401
+ continue
402
+
403
+ image_id, bytes_dl, success, error_msg = result
404
+ completed += 1
405
+
406
+ if success:
407
+ self.downloaded.add(image_id)
408
+ downloaded_count += 1
409
+ total_bytes += bytes_dl
410
+
411
+ if downloaded_count % 10 == 0:
412
+ elapsed = time.time() - batch_start
413
+ rate = downloaded_count / elapsed if elapsed > 0 else 0
414
+ remaining = submitted - completed
415
+ eta_seconds = remaining / rate if rate > 0 else 0
416
+
417
+ logger.info(
418
+ f"Downloaded: {downloaded_count}/{submitted} ({format_size(total_bytes)}) "
419
+ f"- ETA: {format_time(eta_seconds)}"
420
+ )
421
+ self._save_progress()
422
+ pool.check_throughput(downloaded_count)
423
+ else:
424
+ failed_count += 1
425
+ logger.warning(f"Failed to download {image_id}: {error_msg}")
426
+
427
+ finally:
428
+ # Shutdown worker pool
429
+ pool.shutdown()
430
+
431
+ self._save_progress()
432
+ elapsed = time.time() - start_time
433
+
434
+ logger.info(f"Complete! Downloaded {downloaded_count} ({format_size(total_bytes)}), " f"failed {failed_count}")
435
+ logger.info(f"Total time: {format_time(elapsed)}")
436
+
437
+ # Tar sequence directories for efficient IA uploads
438
+ if self.tar_sequences:
439
+ tar_sequence_directories(self.output_dir)
440
+
441
+ # Gzip metadata.jsonl to save space
442
+ if self.metadata_file.exists():
443
+ logger.info("Compressing metadata.jsonl...")
444
+ original_size = self.metadata_file.stat().st_size
445
+ gzipped_file = self.metadata_file.with_suffix(".jsonl.gz")
446
+
447
+ with open(self.metadata_file, "rb") as f_in:
448
+ with gzip.open(gzipped_file, "wb", compresslevel=9) as f_out:
449
+ shutil.copyfileobj(f_in, f_out)
450
+
451
+ compressed_size = gzipped_file.stat().st_size
452
+ self.metadata_file.unlink()
453
+
454
+ savings = 100 * (1 - compressed_size / original_size)
455
+ logger.info(
456
+ f"Compressed metadata: {format_size(original_size)} → {format_size(compressed_size)} "
457
+ f"({savings:.1f}% savings)"
458
+ )
459
+
460
+ # Generate IA metadata
461
+ generate_ia_metadata(self.output_dir)
462
+
463
+ # Move from staging to final destination
464
+ logger.info("Moving collection from staging to final destination...")
465
+ if self.final_dir.exists():
466
+ logger.warning(f"Destination already exists, removing: {self.final_dir}")
467
+ shutil.rmtree(self.final_dir)
468
+
469
+ self.final_dir.parent.mkdir(parents=True, exist_ok=True)
470
+ shutil.move(str(self.staging_dir), str(self.final_dir))
471
+ logger.info(f"Collection moved to: {self.final_dir}")
@@ -0,0 +1,163 @@
1
+ """Streaming metadata reader with filtering."""
2
+
3
+ import gzip
4
+ import json
5
+ import logging
6
+ from pathlib import Path
7
+
8
+ logger = logging.getLogger("mapillary_downloader")
9
+
10
+
11
+ class MetadataReader:
12
+ """Streams metadata.jsonl line-by-line with filtering.
13
+
14
+ This avoids loading millions of image dicts into memory.
15
+ """
16
+
17
+ COMPLETION_MARKER = {"__complete__": True}
18
+
19
+ def __init__(self, metadata_file):
20
+ """Initialize metadata reader.
21
+
22
+ Args:
23
+ metadata_file: Path to metadata.jsonl or metadata.jsonl.gz
24
+ """
25
+ self.metadata_file = Path(metadata_file)
26
+ self.is_complete = self._check_complete()
27
+
28
+ def _check_complete(self):
29
+ """Check if metadata file has completion marker.
30
+
31
+ Returns:
32
+ True if completion marker found, False otherwise
33
+ """
34
+ if not self.metadata_file.exists():
35
+ return False
36
+
37
+ # Check last few lines for completion marker (it should be at the end)
38
+ try:
39
+ if self.metadata_file.suffix == ".gz":
40
+ file_handle = gzip.open(self.metadata_file, "rt")
41
+ else:
42
+ file_handle = open(self.metadata_file)
43
+
44
+ with file_handle as f:
45
+ # Read last 10 lines to find completion marker
46
+ lines = []
47
+ for line in f:
48
+ lines.append(line)
49
+ if len(lines) > 10:
50
+ lines.pop(0)
51
+
52
+ # Check if any of the last lines is the completion marker
53
+ for line in reversed(lines):
54
+ line = line.strip()
55
+ if not line:
56
+ continue
57
+ try:
58
+ data = json.loads(line)
59
+ if data.get("__complete__"):
60
+ return True
61
+ except json.JSONDecodeError:
62
+ continue
63
+
64
+ return False
65
+ except Exception:
66
+ return False
67
+
68
+ def iter_images(self, quality_field=None, downloaded_ids=None):
69
+ """Stream images from metadata file with filtering.
70
+
71
+ Args:
72
+ quality_field: Optional field to check exists (e.g., 'thumb_1024_url')
73
+ downloaded_ids: Optional set of already downloaded IDs to skip
74
+
75
+ Yields:
76
+ Image metadata dicts that pass filters
77
+ """
78
+ if not self.metadata_file.exists():
79
+ return
80
+
81
+ # Handle gzipped files
82
+ if self.metadata_file.suffix == ".gz":
83
+ file_handle = gzip.open(self.metadata_file, "rt")
84
+ else:
85
+ file_handle = open(self.metadata_file)
86
+
87
+ with file_handle as f:
88
+ for line in f:
89
+ line = line.strip()
90
+ if not line:
91
+ continue
92
+
93
+ image = json.loads(line)
94
+
95
+ # Check for completion marker
96
+ if image.get("__complete__"):
97
+ self.is_complete = True
98
+ logger.debug("Found API fetch completion marker")
99
+ continue
100
+
101
+ image_id = image.get("id")
102
+ if not image_id:
103
+ continue
104
+
105
+ # Filter by downloaded status
106
+ if downloaded_ids and image_id in downloaded_ids:
107
+ continue
108
+
109
+ # Filter by quality field availability
110
+ if quality_field and not image.get(quality_field):
111
+ continue
112
+
113
+ yield image
114
+
115
+ def get_all_ids(self):
116
+ """Get set of all image IDs in metadata file.
117
+
118
+ Returns:
119
+ Set of image IDs (for building seen_ids)
120
+ """
121
+ ids = set()
122
+
123
+ if not self.metadata_file.exists():
124
+ return ids
125
+
126
+ # Handle gzipped files
127
+ if self.metadata_file.suffix == ".gz":
128
+ file_handle = gzip.open(self.metadata_file, "rt")
129
+ else:
130
+ file_handle = open(self.metadata_file)
131
+
132
+ with file_handle as f:
133
+ for line in f:
134
+ line = line.strip()
135
+ if not line:
136
+ continue
137
+
138
+ image = json.loads(line)
139
+
140
+ # Skip completion marker
141
+ if image.get("__complete__"):
142
+ self.is_complete = True
143
+ continue
144
+
145
+ image_id = image.get("id")
146
+ if image_id:
147
+ ids.add(image_id)
148
+
149
+ return ids
150
+
151
+ @staticmethod
152
+ def mark_complete(metadata_file):
153
+ """Append completion marker to metadata file.
154
+
155
+ Args:
156
+ metadata_file: Path to metadata.jsonl
157
+ """
158
+ metadata_file = Path(metadata_file)
159
+ if metadata_file.exists():
160
+ with open(metadata_file, "a") as f:
161
+ f.write(json.dumps(MetadataReader.COMPLETION_MARKER) + "\n")
162
+ f.flush()
163
+ logger.info("Marked metadata file as complete")
@@ -9,7 +9,38 @@ from mapillary_downloader.exif_writer import write_exif_to_image
9
9
  from mapillary_downloader.webp_converter import convert_to_webp
10
10
 
11
11
 
12
- def download_and_convert_image(image_data, output_dir, quality, convert_webp, access_token):
12
+ def worker_process(work_queue, result_queue, worker_id):
13
+ """Worker process that pulls from queue and processes images.
14
+
15
+ Args:
16
+ work_queue: Queue to pull work items from
17
+ result_queue: Queue to push results to
18
+ worker_id: Unique worker identifier
19
+ """
20
+ # Create session once per worker (reuse HTTP connections)
21
+ session = requests.Session()
22
+
23
+ while True:
24
+ work_item = work_queue.get()
25
+
26
+ # None is the shutdown signal
27
+ if work_item is None:
28
+ break
29
+
30
+ # Unpack work item
31
+ image_data, output_dir, quality, convert_webp, access_token = work_item
32
+
33
+ # Update session auth for this request
34
+ session.headers.update({"Authorization": f"OAuth {access_token}"})
35
+
36
+ # Process the image
37
+ result = download_and_convert_image(image_data, output_dir, quality, convert_webp, session)
38
+
39
+ # Push result back
40
+ result_queue.put(result)
41
+
42
+
43
+ def download_and_convert_image(image_data, output_dir, quality, convert_webp, session):
13
44
  """Download and optionally convert a single image.
14
45
 
15
46
  This function is designed to run in a worker process.
@@ -19,7 +50,7 @@ def download_and_convert_image(image_data, output_dir, quality, convert_webp, ac
19
50
  output_dir: Base output directory path
20
51
  quality: Quality level (256, 1024, 2048, original)
21
52
  convert_webp: Whether to convert to WebP
22
- access_token: Mapillary API access token
53
+ session: requests.Session with auth already configured
23
54
 
24
55
  Returns:
25
56
  Tuple of (image_id, bytes_downloaded, success, error_msg)
@@ -53,11 +84,7 @@ def download_and_convert_image(image_data, output_dir, quality, convert_webp, ac
53
84
  jpg_path = img_dir / f"{image_id}.jpg"
54
85
  final_path = jpg_path
55
86
 
56
- # Download image
57
- # No retries for CDN images - they're cheap, just skip failures and move on
58
- session = requests.Session()
59
- session.headers.update({"Authorization": f"OAuth {access_token}"})
60
-
87
+ # Download image (using session passed from worker)
61
88
  bytes_downloaded = 0
62
89
 
63
90
  try:
@@ -0,0 +1,204 @@
1
+ """Adaptive worker pool for parallel processing."""
2
+
3
+ import logging
4
+ import multiprocessing as mp
5
+ import queue
6
+ import time
7
+ from collections import deque
8
+
9
+ logger = logging.getLogger("mapillary_downloader")
10
+
11
+
12
+ class AdaptiveWorkerPool:
13
+ """Worker pool that scales based on throughput.
14
+
15
+ Monitors throughput every 30 seconds and adjusts worker count:
16
+ - If throughput increasing: add workers (up to max)
17
+ - If throughput plateauing/decreasing: reduce workers
18
+ """
19
+
20
+ def __init__(self, worker_func, min_workers=4, max_workers=16, monitoring_interval=10):
21
+ """Initialize adaptive worker pool.
22
+
23
+ Args:
24
+ worker_func: Function to run in each worker (must accept work_queue, result_queue)
25
+ min_workers: Minimum number of workers
26
+ max_workers: Maximum number of workers
27
+ monitoring_interval: Seconds between throughput checks
28
+ """
29
+ self.worker_func = worker_func
30
+ self.min_workers = min_workers
31
+ self.max_workers = max_workers
32
+ self.monitoring_interval = monitoring_interval
33
+
34
+ # Queues
35
+ self.work_queue = mp.Queue(maxsize=max_workers)
36
+ self.result_queue = mp.Queue()
37
+
38
+ # Worker management
39
+ self.workers = []
40
+ self.current_workers = min_workers # Start small and ramp up
41
+
42
+ # Throughput monitoring
43
+ self.throughput_history = deque(maxlen=5) # Last 5 measurements
44
+ self.worker_count_history = deque(maxlen=5) # Track worker counts at each measurement
45
+ self.last_processed = 0
46
+ self.last_check_time = time.time()
47
+
48
+ self.running = False
49
+
50
+ def start(self):
51
+ """Start the worker pool."""
52
+ self.running = True
53
+ logger.info(f"Starting worker pool with {self.current_workers} workers")
54
+
55
+ for i in range(self.current_workers):
56
+ self._add_worker(i)
57
+
58
+ def _add_worker(self, worker_id):
59
+ """Add a new worker to the pool."""
60
+ p = mp.Process(target=self.worker_func, args=(self.work_queue, self.result_queue, worker_id))
61
+ p.start()
62
+ self.workers.append(p)
63
+ logger.debug(f"Started worker {worker_id}")
64
+
65
+ def submit(self, work_item):
66
+ """Submit work to the pool (blocks if queue is full)."""
67
+ self.work_queue.put(work_item)
68
+
69
+ def get_result(self, timeout=None):
70
+ """Get a result from the workers.
71
+
72
+ Returns:
73
+ Result from worker, or None if timeout
74
+ """
75
+ try:
76
+ return self.result_queue.get(timeout=timeout)
77
+ except queue.Empty:
78
+ return None
79
+
80
+ def check_throughput(self, total_processed):
81
+ """Check throughput and adjust workers if needed.
82
+
83
+ Args:
84
+ total_processed: Total number of items processed so far
85
+ """
86
+ now = time.time()
87
+ elapsed = now - self.last_check_time
88
+
89
+ if elapsed < self.monitoring_interval:
90
+ logger.debug(f"Throughput check skipped (elapsed {elapsed:.1f}s < {self.monitoring_interval}s)")
91
+ return
92
+
93
+ # Calculate current throughput (items/sec)
94
+ items_since_check = total_processed - self.last_processed
95
+ throughput = items_since_check / elapsed
96
+
97
+ current_workers = len(self.workers)
98
+ self.throughput_history.append(throughput)
99
+ self.worker_count_history.append(current_workers)
100
+ self.last_processed = total_processed
101
+ self.last_check_time = now
102
+
103
+ logger.info(
104
+ f"Throughput: {throughput:.1f} items/s (workers: {current_workers}/{self.max_workers}, "
105
+ f"history: {len(self.throughput_history)} measurements)"
106
+ )
107
+
108
+ # Need at least 2 measurements to calculate gain per worker
109
+ if len(self.throughput_history) < 2:
110
+ # First measurement - add 20% more workers
111
+ if current_workers < self.max_workers:
112
+ workers_to_add = max(1, int(current_workers * 0.2))
113
+ for i in range(workers_to_add):
114
+ if len(self.workers) < self.max_workers:
115
+ new_worker_id = len(self.workers)
116
+ self._add_worker(new_worker_id)
117
+ self.current_workers += 1
118
+ logger.info(
119
+ f"Ramping up: added {workers_to_add} workers (now {self.current_workers}/{self.max_workers})"
120
+ )
121
+ return
122
+
123
+ # Calculate throughput gain per worker added
124
+ current_throughput = self.throughput_history[-1]
125
+ previous_throughput = self.throughput_history[-2]
126
+ previous_workers = self.worker_count_history[-2]
127
+
128
+ throughput_gain = current_throughput - previous_throughput
129
+ workers_added = current_workers - previous_workers
130
+
131
+ logger.debug(
132
+ f"Trend: {previous_throughput:.1f} items/s @ {previous_workers} workers → "
133
+ f"{current_throughput:.1f} items/s @ {current_workers} workers "
134
+ f"(gain: {throughput_gain:.1f}, added: {workers_added})"
135
+ )
136
+
137
+ # If throughput decreased significantly, stop adding workers
138
+ if current_throughput < previous_throughput * 0.95:
139
+ logger.info(
140
+ f"Throughput decreasing ({current_throughput:.1f} vs {previous_throughput:.1f} items/s), "
141
+ f"stopping at {current_workers} workers"
142
+ )
143
+ # If throughput is still increasing or stable, add more workers
144
+ elif current_throughput >= previous_throughput * 0.95 and current_workers < self.max_workers:
145
+ if workers_added > 0 and throughput_gain > 0:
146
+ # Calculate gain per worker
147
+ gain_per_worker = throughput_gain / workers_added
148
+ logger.debug(f"Gain per worker: {gain_per_worker:.2f} items/s")
149
+
150
+ # Estimate how many more workers we could benefit from
151
+ # Assume diminishing returns, so be conservative
152
+ if gain_per_worker > 0.5:
153
+ # Good gain per worker - add more aggressively
154
+ workers_to_add = max(1, int(current_workers * 0.3))
155
+ elif gain_per_worker > 0.2:
156
+ # Moderate gain - add moderately
157
+ workers_to_add = max(1, int(current_workers * 0.2))
158
+ else:
159
+ # Small gain - add conservatively
160
+ workers_to_add = max(1, int(current_workers * 0.1))
161
+
162
+ added = 0
163
+ for i in range(workers_to_add):
164
+ if len(self.workers) < self.max_workers:
165
+ new_worker_id = len(self.workers)
166
+ self._add_worker(new_worker_id)
167
+ self.current_workers += 1
168
+ added += 1
169
+
170
+ logger.info(
171
+ f"Throughput increasing (gain: {gain_per_worker:.2f} items/s per worker), "
172
+ f"added {added} workers (now {self.current_workers}/{self.max_workers})"
173
+ )
174
+ else:
175
+ # Fallback to 20% if we can't calculate gain per worker
176
+ workers_to_add = max(1, int(current_workers * 0.2))
177
+ added = 0
178
+ for i in range(workers_to_add):
179
+ if len(self.workers) < self.max_workers:
180
+ new_worker_id = len(self.workers)
181
+ self._add_worker(new_worker_id)
182
+ self.current_workers += 1
183
+ added += 1
184
+ logger.info(f"Ramping up: added {added} workers (now {self.current_workers}/{self.max_workers})")
185
+ else:
186
+ logger.info(f"At optimal worker count: {current_workers} workers, {current_throughput:.1f} items/s")
187
+
188
+ def shutdown(self, timeout=30):
189
+ """Shutdown the worker pool gracefully."""
190
+ logger.info("Shutting down worker pool...")
191
+ self.running = False
192
+
193
+ # Send stop signals
194
+ for _ in self.workers:
195
+ self.work_queue.put(None)
196
+
197
+ # Wait for workers to finish
198
+ for p in self.workers:
199
+ p.join(timeout=timeout)
200
+ if p.is_alive():
201
+ logger.warning(f"Worker {p.pid} did not exit cleanly, terminating")
202
+ p.terminate()
203
+
204
+ logger.info("Worker pool shutdown complete")
@@ -1,326 +0,0 @@
1
- """Main downloader logic."""
2
-
3
- import gzip
4
- import json
5
- import logging
6
- import os
7
- import shutil
8
- import time
9
- from pathlib import Path
10
- from concurrent.futures import ProcessPoolExecutor, as_completed
11
- from mapillary_downloader.utils import format_size, format_time
12
- from mapillary_downloader.ia_meta import generate_ia_metadata
13
- from mapillary_downloader.ia_check import check_ia_exists
14
- from mapillary_downloader.worker import download_and_convert_image
15
- from mapillary_downloader.tar_sequences import tar_sequence_directories
16
- from mapillary_downloader.logging_config import add_file_handler
17
-
18
- logger = logging.getLogger("mapillary_downloader")
19
-
20
-
21
def get_cache_dir():
    """Return the XDG cache directory used for staging downloads.

    Honors $XDG_CACHE_HOME when it is set and non-empty, otherwise falls
    back to ~/.cache, and ensures the mapillary_downloader subdirectory
    exists before returning it.

    Returns:
        Path to cache directory for mapillary_downloader
    """
    xdg_home = os.environ.get("XDG_CACHE_HOME")
    base = Path(xdg_home) if xdg_home else Path.home() / ".cache"
    staging = base / "mapillary_downloader"
    staging.mkdir(parents=True, exist_ok=True)
    return staging
36
-
37
-
38
class MapillaryDownloader:
    """Handles downloading Mapillary data for a user.

    Downloads are staged in the XDG cache directory and only moved to the
    final destination after the whole collection (images, metadata, logs,
    IA metadata) is complete, so partially-downloaded collections never
    appear at the final path.
    """

    def __init__(
        self,
        client,
        output_dir,
        username=None,
        quality=None,
        workers=None,
        tar_sequences=True,
        convert_webp=False,
        check_ia=True,
    ):
        """Initialize the downloader.

        Args:
            client: MapillaryClient instance
            output_dir: Base directory to save downloads (final destination)
            username: Mapillary username (for collection directory)
            quality: Image quality (for collection directory)
            workers: Number of parallel workers (default: half of cpu_count)
            tar_sequences: Whether to tar sequence directories after download (default: True)
            convert_webp: Whether to convert images to WebP (affects collection name)
            check_ia: Whether to check if collection exists on Internet Archive (default: True)
        """
        self.client = client
        self.base_output_dir = Path(output_dir)
        self.username = username
        self.quality = quality
        # NOTE(review): os.cpu_count() can return None, which would raise
        # TypeError on the // — assumes a platform where it is an int.
        self.workers = workers if workers is not None else max(1, os.cpu_count() // 2)
        self.tar_sequences = tar_sequences
        self.convert_webp = convert_webp
        self.check_ia = check_ia

        # Determine collection name: "mapillary-<user>-<quality>[-webp]".
        # Without both username and quality there is no per-collection dir.
        if username and quality:
            collection_name = f"mapillary-{username}-{quality}"
            if convert_webp:
                collection_name += "-webp"
            self.collection_name = collection_name
        else:
            self.collection_name = None

        # Set up staging directory in cache; final_dir is where the finished
        # collection is moved at the end of download_user_data().
        cache_dir = get_cache_dir()
        if self.collection_name:
            self.staging_dir = cache_dir / self.collection_name
            self.final_dir = self.base_output_dir / self.collection_name
        else:
            self.staging_dir = cache_dir / "download"
            self.final_dir = self.base_output_dir

        # Work in staging directory during download
        self.output_dir = self.staging_dir
        self.output_dir.mkdir(parents=True, exist_ok=True)

        logger.info(f"Staging directory: {self.staging_dir}")
        logger.info(f"Final destination: {self.final_dir}")

        # Set up file logging for archival with timestamp for incremental runs
        # (each run gets its own download.log.<timestamp> inside the staging dir).
        timestamp = time.strftime("%Y%m%d-%H%M%S")
        log_file = self.output_dir / f"download.log.{timestamp}"
        add_file_handler(log_file)
        logger.info(f"Logging to: {log_file}")

        # metadata.jsonl: one JSON object per image; progress.json: set of
        # already-downloaded image IDs used to resume interrupted runs.
        self.metadata_file = self.output_dir / "metadata.jsonl"
        self.progress_file = self.output_dir / "progress.json"
        self.downloaded = self._load_progress()

    def _load_progress(self):
        """Load previously downloaded image IDs.

        Returns:
            Set of image ID strings; empty set when no progress file exists.
        """
        if self.progress_file.exists():
            with open(self.progress_file) as f:
                return set(json.load(f).get("downloaded", []))
        return set()

    def _save_progress(self):
        """Save progress to disk atomically.

        Writes to a temp file, fsyncs, then renames over progress.json so a
        crash mid-write never leaves a truncated progress file.
        """
        temp_file = self.progress_file.with_suffix(".json.tmp")
        with open(temp_file, "w") as f:
            json.dump({"downloaded": list(self.downloaded)}, f)
            f.flush()
            os.fsync(f.fileno())
        temp_file.replace(self.progress_file)

    def download_user_data(self, bbox=None, convert_webp=False):
        """Download all images for a user.

        Resumable: first re-processes any existing metadata.jsonl, then asks
        the API for images not yet seen, downloading both sets in parallel.
        On completion, tars sequences (optional), gzips the metadata, writes
        IA metadata, and moves the staging dir to the final destination.

        Args:
            bbox: Optional bounding box [west, south, east, north]
            convert_webp: Convert images to WebP format after download

        Raises:
            ValueError: If username or quality were not provided at init.
        """
        if not self.username or not self.quality:
            raise ValueError("Username and quality must be provided during initialization")

        # Check if collection already exists on Internet Archive — skip the
        # whole download if it was already archived.
        if self.check_ia and self.collection_name:
            logger.info(f"Checking if {self.collection_name} exists on Internet Archive...")
            if check_ia_exists(self.collection_name):
                logger.info("Collection already exists on archive.org, skipping download")
                return

        # Check if collection already exists in final destination
        if self.final_dir.exists():
            logger.info(f"Collection already exists at {self.final_dir}, skipping download")
            return

        # Metadata key holding the download URL for the chosen quality,
        # e.g. "thumb_2048_url".
        quality_field = f"thumb_{self.quality}_url"

        logger.info(f"Downloading images for user: {self.username}")
        logger.info(f"Output directory: {self.output_dir}")
        logger.info(f"Quality: {self.quality}")
        logger.info(f"Using {self.workers} parallel workers")

        processed = 0
        downloaded_count = 0
        skipped = 0
        total_bytes = 0
        failed_count = 0

        start_time = time.time()

        # Track which image IDs we've seen in metadata to avoid re-fetching
        seen_ids = set()

        # Collect images to download from existing metadata
        images_to_download = []

        if self.metadata_file.exists():
            logger.info("Processing existing metadata file...")
            with open(self.metadata_file) as f:
                for line in f:
                    if line.strip():
                        image = json.loads(line)
                        image_id = image["id"]
                        seen_ids.add(image_id)
                        processed += 1

                        if image_id in self.downloaded:
                            skipped += 1
                            continue

                        # Queue for download only if a URL exists for the
                        # requested quality.
                        if image.get(quality_field):
                            images_to_download.append(image)

        # Download images from existing metadata in parallel
        if images_to_download:
            logger.info(f"Downloading {len(images_to_download)} images from existing metadata...")
            downloaded_count, total_bytes, failed_count = self._download_images_parallel(
                images_to_download, convert_webp
            )

        # Always check API for new images (will skip duplicates via seen_ids)
        logger.info("Checking for new images from API...")
        new_images = []

        # Append-mode so metadata from earlier runs is preserved; flush after
        # each record so an interrupted run loses at most the current line.
        with open(self.metadata_file, "a") as meta_f:
            for image in self.client.get_user_images(self.username, bbox=bbox):
                image_id = image["id"]

                # Skip if we already have this in our metadata file
                if image_id in seen_ids:
                    continue

                seen_ids.add(image_id)
                processed += 1

                # Save new metadata
                meta_f.write(json.dumps(image) + "\n")
                meta_f.flush()

                # Skip if already downloaded
                if image_id in self.downloaded:
                    skipped += 1
                    continue

                # Queue for download
                if image.get(quality_field):
                    new_images.append(image)

        # Download new images in parallel
        if new_images:
            logger.info(f"Downloading {len(new_images)} new images...")
            new_downloaded, new_bytes, new_failed = self._download_images_parallel(new_images, convert_webp)
            downloaded_count += new_downloaded
            total_bytes += new_bytes
            failed_count += new_failed

        self._save_progress()
        elapsed = time.time() - start_time
        logger.info(
            f"Complete! Processed {processed} images, downloaded {downloaded_count} ({format_size(total_bytes)}), "
            f"skipped {skipped}, failed {failed_count}"
        )
        logger.info(f"Total time: {format_time(elapsed)}")

        # Tar sequence directories for efficient IA uploads
        if self.tar_sequences:
            tar_sequence_directories(self.output_dir)

        # Gzip metadata.jsonl to save space (removed afterwards; the .gz is
        # what gets shipped with the collection).
        if self.metadata_file.exists():
            logger.info("Compressing metadata.jsonl...")
            original_size = self.metadata_file.stat().st_size
            gzipped_file = self.metadata_file.with_suffix(".jsonl.gz")

            with open(self.metadata_file, "rb") as f_in:
                with gzip.open(gzipped_file, "wb", compresslevel=9) as f_out:
                    shutil.copyfileobj(f_in, f_out)

            compressed_size = gzipped_file.stat().st_size
            self.metadata_file.unlink()

            # NOTE(review): divides by original_size — a zero-byte
            # metadata.jsonl would raise ZeroDivisionError here.
            savings = 100 * (1 - compressed_size / original_size)
            logger.info(
                f"Compressed metadata: {format_size(original_size)} → {format_size(compressed_size)} "
                f"({savings:.1f}% savings)"
            )

        # Generate IA metadata
        generate_ia_metadata(self.output_dir)

        # Move from staging to final destination (final_dir was checked to
        # not exist at the start; re-check in case it appeared meanwhile).
        logger.info("Moving collection from staging to final destination...")
        if self.final_dir.exists():
            logger.warning(f"Destination already exists, removing: {self.final_dir}")
            shutil.rmtree(self.final_dir)

        self.final_dir.parent.mkdir(parents=True, exist_ok=True)
        shutil.move(str(self.staging_dir), str(self.final_dir))
        logger.info(f"Collection moved to: {self.final_dir}")

    def _download_images_parallel(self, images, convert_webp):
        """Download images in parallel using worker pool.

        Progress is saved every 10 successful downloads so an interrupted
        batch can resume without re-fetching completed images.

        Args:
            images: List of image metadata dicts
            convert_webp: Whether to convert to WebP

        Returns:
            Tuple of (downloaded_count, total_bytes, failed_count)
        """
        downloaded_count = 0
        total_bytes = 0
        failed_count = 0
        batch_start_time = time.time()

        with ProcessPoolExecutor(max_workers=self.workers) as executor:
            # Submit all tasks
            future_to_image = {}
            for image in images:
                future = executor.submit(
                    download_and_convert_image,
                    image,
                    str(self.output_dir),
                    self.quality,
                    convert_webp,
                    self.client.access_token,
                )
                future_to_image[future] = image["id"]

            # Process results as they complete; each worker returns
            # (image_id, bytes_downloaded, success, error_msg).
            for future in as_completed(future_to_image):
                image_id, bytes_dl, success, error_msg = future.result()

                if success:
                    self.downloaded.add(image_id)
                    downloaded_count += 1
                    total_bytes += bytes_dl

                    if downloaded_count % 10 == 0:
                        # Calculate ETA from this batch's observed rate.
                        elapsed = time.time() - batch_start_time
                        rate = downloaded_count / elapsed if elapsed > 0 else 0
                        remaining = len(images) - downloaded_count
                        eta_seconds = remaining / rate if rate > 0 else 0

                        logger.info(
                            f"Downloaded: {downloaded_count}/{len(images)} ({format_size(total_bytes)}) "
                            f"- ETA: {format_time(eta_seconds)}"
                        )
                        self._save_progress()
                else:
                    failed_count += 1
                    logger.warning(f"Failed to download {image_id}: {error_msg}")

        return downloaded_count, total_bytes, failed_count