mapillary-downloader 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those versions as they appear in the public registry.
@@ -41,10 +41,10 @@ def main():
41
41
  help="Don't convert to WebP (WebP conversion is enabled by default, saves ~70%% disk space)",
42
42
  )
43
43
  parser.add_argument(
44
- "--workers",
44
+ "--max-workers",
45
45
  type=int,
46
- default=None,
47
- help="Number of parallel workers (default: half of CPU cores)",
46
+ default=128,
47
+ help="Maximum number of parallel workers (default: 128)",
48
48
  )
49
49
  parser.add_argument(
50
50
  "--no-tar",
@@ -114,7 +114,7 @@ def main():
114
114
  args.output,
115
115
  username,
116
116
  args.quality,
117
- workers=args.workers,
117
+ max_workers=args.max_workers,
118
118
  tar_sequences=not args.no_tar,
119
119
  convert_webp=convert_webp,
120
120
  check_ia=not args.no_check_ia,
@@ -45,7 +45,7 @@ class MapillaryDownloader:
45
45
  output_dir,
46
46
  username=None,
47
47
  quality=None,
48
- workers=None,
48
+ max_workers=128,
49
49
  tar_sequences=True,
50
50
  convert_webp=False,
51
51
  check_ia=True,
@@ -57,7 +57,7 @@ class MapillaryDownloader:
57
57
  output_dir: Base directory to save downloads (final destination)
58
58
  username: Mapillary username (for collection directory)
59
59
  quality: Image quality (for collection directory)
60
- workers: Number of parallel workers (default: half of cpu_count)
60
+ max_workers: Maximum number of parallel workers (default: 128)
61
61
  tar_sequences: Whether to tar sequence directories after download (default: True)
62
62
  convert_webp: Whether to convert images to WebP (affects collection name)
63
63
  check_ia: Whether to check if collection exists on Internet Archive (default: True)
@@ -66,7 +66,8 @@ class MapillaryDownloader:
66
66
  self.base_output_dir = Path(output_dir)
67
67
  self.username = username
68
68
  self.quality = quality
69
- self.workers = workers if workers is not None else max(1, os.cpu_count() // 2)
69
+ self.max_workers = max_workers
70
+ self.initial_workers = os.cpu_count() or 1 # Start with CPU count
70
71
  self.tar_sequences = tar_sequences
71
72
  self.convert_webp = convert_webp
72
73
  self.check_ia = check_ia
@@ -177,26 +178,28 @@ class MapillaryDownloader:
177
178
  logger.info(f"Downloading images for user: {self.username}")
178
179
  logger.info(f"Output directory: {self.output_dir}")
179
180
  logger.info(f"Quality: {self.quality}")
180
- logger.info(f"Using {self.workers} parallel workers")
181
+ logger.info(f"Worker pool: {self.initial_workers} initial, {self.max_workers} max")
181
182
 
182
183
  start_time = time.time()
183
184
 
184
- # Step 1: Build seen_ids from metadata file (streaming, only IDs)
185
- logger.info("Building seen_ids from metadata...")
185
+ # Step 1: Check if API fetch is already complete
186
186
  reader = MetadataReader(self.metadata_file)
187
- seen_ids = reader.get_all_ids()
188
187
  api_complete = reader.is_complete
189
- logger.info(f"Found {len(seen_ids)} existing images in metadata")
188
+ if api_complete:
189
+ logger.info("API fetch already complete, will only download")
190
+ else:
191
+ logger.info("API fetch incomplete, will fetch and download in parallel")
190
192
 
191
- # Step 2: Start worker pool (fork AFTER building seen_ids, BEFORE downloading)
193
+ # Step 2: Start worker pool
194
+ # Since workers do both I/O (download) and CPU (WebP), need many more workers
195
+ # Start with CPU count and scale up based on throughput
192
196
  pool = AdaptiveWorkerPool(
193
- worker_process, min_workers=max(1, self.workers // 2), max_workers=self.workers, monitoring_interval=30
197
+ worker_process, min_workers=self.initial_workers, max_workers=self.max_workers, monitoring_interval=10
194
198
  )
195
199
  pool.start()
196
200
 
197
- # Step 3: Download images from existing metadata while fetching new from API
201
+ # Step 3: Download images from metadata file while fetching new from API
198
202
  downloaded_count = 0
199
- skipped = 0
200
203
  total_bytes = 0
201
204
  failed_count = 0
202
205
  submitted = 0
@@ -218,25 +221,18 @@ class MapillaryDownloader:
218
221
  logger.info("API fetch thread: Starting...")
219
222
  with open(self.metadata_file, "a") as meta_f:
220
223
  for image in self.client.get_user_images(self.username, bbox=bbox):
221
- image_id = image["id"]
222
-
223
- # Skip if we already have this in our metadata file
224
- if image_id in seen_ids:
225
- continue
226
-
227
- seen_ids.add(image_id)
228
224
  new_images_count[0] += 1
229
225
 
230
- # Save new metadata
226
+ # Save metadata (don't dedupe here, let the tailer handle it)
231
227
  meta_f.write(json.dumps(image) + "\n")
232
228
  meta_f.flush()
233
229
 
234
230
  if new_images_count[0] % 1000 == 0:
235
- logger.info(f"API: Fetched {new_images_count[0]} new images from API")
231
+ logger.info(f"API: Fetched {new_images_count[0]} images from API")
236
232
 
237
233
  # Mark as complete
238
234
  MetadataReader.mark_complete(self.metadata_file)
239
- logger.info(f"API fetch complete: {new_images_count[0]} new images")
235
+ logger.info(f"API fetch complete: {new_images_count[0]} images")
240
236
  finally:
241
237
  api_fetch_complete.set()
242
238
 
@@ -254,8 +250,9 @@ class MapillaryDownloader:
254
250
  # Helper to process results from queue
255
251
  def process_results():
256
252
  nonlocal downloaded_count, total_bytes, failed_count
253
+ # Drain ALL available results to prevent queue from filling up
257
254
  while True:
258
- result = pool.get_result(timeout=0.001)
255
+ result = pool.get_result(timeout=0) # Non-blocking
259
256
  if result is None:
260
257
  break
261
258
 
@@ -386,6 +383,10 @@ class MapillaryDownloader:
386
383
 
387
384
  last_position = f.tell()
388
385
 
386
+ # If API is already complete, we've read the whole file, so break
387
+ if api_fetch_complete is None:
388
+ break
389
+
389
390
  # Sleep briefly before next tail iteration
390
391
  time.sleep(0.1)
391
392
 
@@ -438,14 +439,7 @@ class MapillaryDownloader:
438
439
  self._save_progress()
439
440
  elapsed = time.time() - start_time
440
441
 
441
- # Count total images in metadata
442
- total_images = len(seen_ids)
443
- skipped = total_images - downloaded_count - failed_count
444
-
445
- logger.info(
446
- f"Complete! Total {total_images} images, downloaded {downloaded_count} ({format_size(total_bytes)}), "
447
- f"skipped {skipped}, failed {failed_count}"
448
- )
442
+ logger.info(f"Complete! Downloaded {downloaded_count} ({format_size(total_bytes)}), " f"failed {failed_count}")
449
443
  logger.info(f"Total time: {format_time(elapsed)}")
450
444
 
451
445
  # Tar sequence directories for efficient IA uploads
@@ -23,7 +23,47 @@ class MetadataReader:
23
23
  metadata_file: Path to metadata.jsonl or metadata.jsonl.gz
24
24
  """
25
25
  self.metadata_file = Path(metadata_file)
26
- self.is_complete = False
26
+ self.is_complete = self._check_complete()
27
+
28
+ def _check_complete(self):
29
+ """Check if metadata file has completion marker.
30
+
31
+ Returns:
32
+ True if completion marker found, False otherwise
33
+ """
34
+ if not self.metadata_file.exists():
35
+ return False
36
+
37
+ # Check last few lines for completion marker (it should be at the end)
38
+ try:
39
+ if self.metadata_file.suffix == ".gz":
40
+ file_handle = gzip.open(self.metadata_file, "rt")
41
+ else:
42
+ file_handle = open(self.metadata_file)
43
+
44
+ with file_handle as f:
45
+ # Read last 10 lines to find completion marker
46
+ lines = []
47
+ for line in f:
48
+ lines.append(line)
49
+ if len(lines) > 10:
50
+ lines.pop(0)
51
+
52
+ # Check if any of the last lines is the completion marker
53
+ for line in reversed(lines):
54
+ line = line.strip()
55
+ if not line:
56
+ continue
57
+ try:
58
+ data = json.loads(line)
59
+ if data.get("__complete__"):
60
+ return True
61
+ except json.JSONDecodeError:
62
+ continue
63
+
64
+ return False
65
+ except Exception:
66
+ return False
27
67
 
28
68
  def iter_images(self, quality_field=None, downloaded_ids=None):
29
69
  """Stream images from metadata file with filtering.
@@ -1,6 +1,7 @@
1
1
  """Worker process for parallel image download and conversion."""
2
2
 
3
3
  import os
4
+ import signal
4
5
  import tempfile
5
6
  from pathlib import Path
6
7
  import requests
@@ -17,6 +18,12 @@ def worker_process(work_queue, result_queue, worker_id):
17
18
  result_queue: Queue to push results to
18
19
  worker_id: Unique worker identifier
19
20
  """
21
+ # Ignore SIGINT in worker process - parent will handle it
22
+ signal.signal(signal.SIGINT, signal.SIG_IGN)
23
+
24
+ # Create session once per worker (reuse HTTP connections)
25
+ session = requests.Session()
26
+
20
27
  while True:
21
28
  work_item = work_queue.get()
22
29
 
@@ -27,14 +34,17 @@ def worker_process(work_queue, result_queue, worker_id):
27
34
  # Unpack work item
28
35
  image_data, output_dir, quality, convert_webp, access_token = work_item
29
36
 
37
+ # Update session auth for this request
38
+ session.headers.update({"Authorization": f"OAuth {access_token}"})
39
+
30
40
  # Process the image
31
- result = download_and_convert_image(image_data, output_dir, quality, convert_webp, access_token)
41
+ result = download_and_convert_image(image_data, output_dir, quality, convert_webp, session)
32
42
 
33
43
  # Push result back
34
44
  result_queue.put(result)
35
45
 
36
46
 
37
- def download_and_convert_image(image_data, output_dir, quality, convert_webp, access_token):
47
+ def download_and_convert_image(image_data, output_dir, quality, convert_webp, session):
38
48
  """Download and optionally convert a single image.
39
49
 
40
50
  This function is designed to run in a worker process.
@@ -44,7 +54,7 @@ def download_and_convert_image(image_data, output_dir, quality, convert_webp, ac
44
54
  output_dir: Base output directory path
45
55
  quality: Quality level (256, 1024, 2048, original)
46
56
  convert_webp: Whether to convert to WebP
47
- access_token: Mapillary API access token
57
+ session: requests.Session with auth already configured
48
58
 
49
59
  Returns:
50
60
  Tuple of (image_id, bytes_downloaded, success, error_msg)
@@ -78,11 +88,7 @@ def download_and_convert_image(image_data, output_dir, quality, convert_webp, ac
78
88
  jpg_path = img_dir / f"{image_id}.jpg"
79
89
  final_path = jpg_path
80
90
 
81
- # Download image
82
- # No retries for CDN images - they're cheap, just skip failures and move on
83
- session = requests.Session()
84
- session.headers.update({"Authorization": f"OAuth {access_token}"})
85
-
91
+ # Download image (using session passed from worker)
86
92
  bytes_downloaded = 0
87
93
 
88
94
  try:
@@ -17,7 +17,7 @@ class AdaptiveWorkerPool:
17
17
  - If throughput plateauing/decreasing: reduce workers
18
18
  """
19
19
 
20
- def __init__(self, worker_func, min_workers=4, max_workers=16, monitoring_interval=30):
20
+ def __init__(self, worker_func, min_workers=4, max_workers=16, monitoring_interval=10):
21
21
  """Initialize adaptive worker pool.
22
22
 
23
23
  Args:
@@ -37,10 +37,11 @@ class AdaptiveWorkerPool:
37
37
 
38
38
  # Worker management
39
39
  self.workers = []
40
- self.current_workers = min_workers
40
+ self.current_workers = min_workers # Start small and ramp up
41
41
 
42
42
  # Throughput monitoring
43
43
  self.throughput_history = deque(maxlen=5) # Last 5 measurements
44
+ self.worker_count_history = deque(maxlen=5) # Track worker counts at each measurement
44
45
  self.last_processed = 0
45
46
  self.last_check_time = time.time()
46
47
 
@@ -86,51 +87,116 @@ class AdaptiveWorkerPool:
86
87
  elapsed = now - self.last_check_time
87
88
 
88
89
  if elapsed < self.monitoring_interval:
90
+ logger.debug(f"Throughput check skipped (elapsed {elapsed:.1f}s < {self.monitoring_interval}s)")
89
91
  return
90
92
 
91
93
  # Calculate current throughput (items/sec)
92
94
  items_since_check = total_processed - self.last_processed
93
95
  throughput = items_since_check / elapsed
94
96
 
97
+ current_workers = len(self.workers)
95
98
  self.throughput_history.append(throughput)
99
+ self.worker_count_history.append(current_workers)
96
100
  self.last_processed = total_processed
97
101
  self.last_check_time = now
98
102
 
99
- # Need at least 3 measurements to detect trends
100
- if len(self.throughput_history) < 3:
103
+ logger.info(
104
+ f"Throughput: {throughput:.1f} items/s (workers: {current_workers}/{self.max_workers}, "
105
+ f"history: {len(self.throughput_history)} measurements)"
106
+ )
107
+
108
+ # Need at least 2 measurements to calculate gain per worker
109
+ if len(self.throughput_history) < 2:
110
+ # First measurement - add 20% more workers
111
+ if current_workers < self.max_workers:
112
+ workers_to_add = max(1, int(current_workers * 0.2))
113
+ for i in range(workers_to_add):
114
+ if len(self.workers) < self.max_workers:
115
+ new_worker_id = len(self.workers)
116
+ self._add_worker(new_worker_id)
117
+ self.current_workers += 1
118
+ logger.info(
119
+ f"Ramping up: added {workers_to_add} workers (now {self.current_workers}/{self.max_workers})"
120
+ )
101
121
  return
102
122
 
103
- # Check if throughput is increasing
104
- recent_avg = sum(list(self.throughput_history)[-2:]) / 2
105
- older_avg = sum(list(self.throughput_history)[-4:-2]) / 2
106
-
107
- if recent_avg > older_avg * 1.1 and len(self.workers) < self.max_workers:
108
- # Throughput increasing by >10%, add workers
109
- new_worker_id = len(self.workers)
110
- self._add_worker(new_worker_id)
111
- self.current_workers += 1
112
- logger.info(f"Throughput increasing ({throughput:.1f} items/s), added worker (now {self.current_workers})")
113
-
114
- elif recent_avg < older_avg * 0.9 and len(self.workers) > self.min_workers:
115
- # Throughput decreasing by >10%, remove worker
116
- # (workers will exit naturally when they finish current work)
117
- self.current_workers = max(self.min_workers, self.current_workers - 1)
118
- logger.info(f"Throughput plateauing ({throughput:.1f} items/s), reducing to {self.current_workers} workers")
119
-
120
- def shutdown(self, timeout=30):
123
+ # Calculate throughput gain per worker added
124
+ current_throughput = self.throughput_history[-1]
125
+ previous_throughput = self.throughput_history[-2]
126
+ previous_workers = self.worker_count_history[-2]
127
+
128
+ throughput_gain = current_throughput - previous_throughput
129
+ workers_added = current_workers - previous_workers
130
+
131
+ logger.debug(
132
+ f"Trend: {previous_throughput:.1f} items/s @ {previous_workers} workers "
133
+ f"{current_throughput:.1f} items/s @ {current_workers} workers "
134
+ f"(gain: {throughput_gain:.1f}, added: {workers_added})"
135
+ )
136
+
137
+ # If throughput decreased significantly, stop adding workers
138
+ if current_throughput < previous_throughput * 0.95:
139
+ logger.info(
140
+ f"Throughput decreasing ({current_throughput:.1f} vs {previous_throughput:.1f} items/s), "
141
+ f"stopping at {current_workers} workers"
142
+ )
143
+ # If throughput is still increasing or stable, add more workers
144
+ elif current_throughput >= previous_throughput * 0.95 and current_workers < self.max_workers:
145
+ if workers_added > 0 and throughput_gain > 0:
146
+ # Calculate gain per worker
147
+ gain_per_worker = throughput_gain / workers_added
148
+ logger.debug(f"Gain per worker: {gain_per_worker:.2f} items/s")
149
+
150
+ # Estimate how many more workers we could benefit from
151
+ # Assume diminishing returns, so be conservative
152
+ if gain_per_worker > 0.5:
153
+ # Good gain per worker - add more aggressively
154
+ workers_to_add = max(1, int(current_workers * 0.3))
155
+ elif gain_per_worker > 0.2:
156
+ # Moderate gain - add moderately
157
+ workers_to_add = max(1, int(current_workers * 0.2))
158
+ else:
159
+ # Small gain - add conservatively
160
+ workers_to_add = max(1, int(current_workers * 0.1))
161
+
162
+ added = 0
163
+ for i in range(workers_to_add):
164
+ if len(self.workers) < self.max_workers:
165
+ new_worker_id = len(self.workers)
166
+ self._add_worker(new_worker_id)
167
+ self.current_workers += 1
168
+ added += 1
169
+
170
+ logger.info(
171
+ f"Throughput increasing (gain: {gain_per_worker:.2f} items/s per worker), "
172
+ f"added {added} workers (now {self.current_workers}/{self.max_workers})"
173
+ )
174
+ else:
175
+ # Fallback to 20% if we can't calculate gain per worker
176
+ workers_to_add = max(1, int(current_workers * 0.2))
177
+ added = 0
178
+ for i in range(workers_to_add):
179
+ if len(self.workers) < self.max_workers:
180
+ new_worker_id = len(self.workers)
181
+ self._add_worker(new_worker_id)
182
+ self.current_workers += 1
183
+ added += 1
184
+ logger.info(f"Ramping up: added {added} workers (now {self.current_workers}/{self.max_workers})")
185
+ else:
186
+ logger.info(f"At optimal worker count: {current_workers} workers, {current_throughput:.1f} items/s")
187
+
188
+ def shutdown(self, timeout=2):
121
189
  """Shutdown the worker pool gracefully."""
122
190
  logger.info("Shutting down worker pool...")
123
191
  self.running = False
124
192
 
125
- # Send stop signals
126
- for _ in self.workers:
127
- self.work_queue.put(None)
128
-
129
- # Wait for workers to finish
193
+ # Terminate all workers immediately (they ignore SIGINT so we need to be forceful)
130
194
  for p in self.workers:
131
- p.join(timeout=timeout)
132
195
  if p.is_alive():
133
- logger.warning(f"Worker {p.pid} did not exit cleanly, terminating")
134
196
  p.terminate()
135
197
 
198
+ # Give them a brief moment to exit
199
+ for p in self.workers:
200
+ p.join(timeout=timeout)
201
+
136
202
  logger.info("Worker pool shutdown complete")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mapillary_downloader
3
- Version: 0.5.0
3
+ Version: 0.5.2
4
4
  Summary: Download your Mapillary data before it's gone
5
5
  Author-email: Gareth Davidson <gaz@bitplane.net>
6
6
  Requires-Python: >=3.10
@@ -1,19 +1,19 @@
1
1
  mapillary_downloader/__init__.py,sha256=KEjiBRghXDeA7E15RJeLBfQm-yNJkowZarL59QOh_1w,120
2
- mapillary_downloader/__main__.py,sha256=Kjfx2woMyCvAxYAdqvtXtYJknCMviV_K2PSo0cDc8Hg,4320
2
+ mapillary_downloader/__main__.py,sha256=G4tTNN6V7jPZn4X9gjRDx0faw-Im9zhyTLbDRTOyo3k,4325
3
3
  mapillary_downloader/client.py,sha256=a5n43FLHP45EHodEjl0ieziBK-b6Ey-rZJwYB6EFhNI,4743
4
- mapillary_downloader/downloader.py,sha256=F36AtB0Ro_EXR78EDOqH248llV7fGVeR4j9nZf0q7qg,19988
4
+ mapillary_downloader/downloader.py,sha256=6JFEIVBIdhN9L4DcPJmx6UCTgAnWSuH0jO0D_8wKz_U,19886
5
5
  mapillary_downloader/exif_writer.py,sha256=K_441EG1siWyNMmFGZSfnORUCjBThkeg4JFtbg9AOsA,5120
6
6
  mapillary_downloader/ia_check.py,sha256=L2MEbG_KmlAd5NLmo2HQkO8HWvRN0brE5wXXoyNMbq8,1100
7
7
  mapillary_downloader/ia_meta.py,sha256=78rcybHIPnQDsF02KGj6RYmDXzYzrU8sdVx4Q9Y0sfI,6266
8
8
  mapillary_downloader/logging_config.py,sha256=Z-wNq34nt7aIhJWdeKc1feTY46P9-Or7HtiX7eUFjEI,2324
9
- mapillary_downloader/metadata_reader.py,sha256=-4BmtLVI9sldZU0LlqMc-bporiYNpk6-F2RKKMvzLu4,3560
9
+ mapillary_downloader/metadata_reader.py,sha256=Re-HN0Vfc7Hs1eOut7uOoW7jWJ2PIbKoNzC7Ak3ah5o,4933
10
10
  mapillary_downloader/tar_sequences.py,sha256=mqs5p3N7osV_bxTkw6i34GVmxCBBEbIiKKxeh-fWNdU,4430
11
11
  mapillary_downloader/utils.py,sha256=yzVgS1mwsklDAqrimaFafgTTXtRYQUbKP98Xgh9d2KA,1174
12
12
  mapillary_downloader/webp_converter.py,sha256=vYLLQxDmdnqRz0nm7wXwRUd4x9mQZNah-DrncpA8sNs,1901
13
- mapillary_downloader/worker.py,sha256=RMZO8N67Kl-bhHC1qUdZg6Sx8k6RYbPRhyuLyOjr29o,4450
14
- mapillary_downloader/worker_pool.py,sha256=QFYIbqkgamOtB-iRyZp5kN6jdZuYw93izls61ayVIZ8,4771
15
- mapillary_downloader-0.5.0.dist-info/entry_points.txt,sha256=PdYtxOXHMJrUhmiPO4G-F98VuhUI4MN9D_T4KPrVZ5w,75
16
- mapillary_downloader-0.5.0.dist-info/licenses/LICENSE.md,sha256=7_BIuQ-veOrsF-WarH8kTkm0-xrCLvJ1PFE1C4Ebs64,146
17
- mapillary_downloader-0.5.0.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
18
- mapillary_downloader-0.5.0.dist-info/METADATA,sha256=A0AhsIjGV9FBf5vz28hSC2jugcRqz5A8gsZwMGGEw2A,4982
19
- mapillary_downloader-0.5.0.dist-info/RECORD,,
13
+ mapillary_downloader/worker.py,sha256=n9m6PzSjlLOOYZJd9j1vH-2ag9aOeNndfgRlunzI14s,4637
14
+ mapillary_downloader/worker_pool.py,sha256=iGRq5uFwBNNVQnI4vEjbKHkbKTaEVCdmvMvXcRGuDMg,8203
15
+ mapillary_downloader-0.5.2.dist-info/entry_points.txt,sha256=PdYtxOXHMJrUhmiPO4G-F98VuhUI4MN9D_T4KPrVZ5w,75
16
+ mapillary_downloader-0.5.2.dist-info/licenses/LICENSE.md,sha256=7_BIuQ-veOrsF-WarH8kTkm0-xrCLvJ1PFE1C4Ebs64,146
17
+ mapillary_downloader-0.5.2.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
18
+ mapillary_downloader-0.5.2.dist-info/METADATA,sha256=PHO4jDVxqsIo9Hs9GX3J2Cfnfc8gy_PI6xhssE9jrMk,4982
19
+ mapillary_downloader-0.5.2.dist-info/RECORD,,