mapillary-downloader 0.5.0__tar.gz → 0.5.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (19)
  1. {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.1}/PKG-INFO +1 -1
  2. {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.1}/pyproject.toml +1 -1
  3. {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.1}/src/mapillary_downloader/downloader.py +14 -28
  4. {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.1}/src/mapillary_downloader/metadata_reader.py +41 -1
  5. {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.1}/src/mapillary_downloader/worker.py +10 -8
  6. mapillary_downloader-0.5.1/src/mapillary_downloader/worker_pool.py +204 -0
  7. mapillary_downloader-0.5.0/src/mapillary_downloader/worker_pool.py +0 -136
  8. {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.1}/LICENSE.md +0 -0
  9. {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.1}/README.md +0 -0
  10. {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.1}/src/mapillary_downloader/__init__.py +0 -0
  11. {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.1}/src/mapillary_downloader/__main__.py +0 -0
  12. {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.1}/src/mapillary_downloader/client.py +0 -0
  13. {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.1}/src/mapillary_downloader/exif_writer.py +0 -0
  14. {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.1}/src/mapillary_downloader/ia_check.py +0 -0
  15. {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.1}/src/mapillary_downloader/ia_meta.py +0 -0
  16. {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.1}/src/mapillary_downloader/logging_config.py +0 -0
  17. {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.1}/src/mapillary_downloader/tar_sequences.py +0 -0
  18. {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.1}/src/mapillary_downloader/utils.py +0 -0
  19. {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.1}/src/mapillary_downloader/webp_converter.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mapillary_downloader
- Version: 0.5.0
+ Version: 0.5.1
  Summary: Download your Mapillary data before it's gone
  Author-email: Gareth Davidson <gaz@bitplane.net>
  Requires-Python: >=3.10
pyproject.toml
@@ -1,7 +1,7 @@
  [project]
  name = "mapillary_downloader"
  description = "Download your Mapillary data before it's gone"
- version = "0.5.0"
+ version = "0.5.1"
  authors = [
  { name = "Gareth Davidson", email = "gaz@bitplane.net" }
  ]
src/mapillary_downloader/downloader.py
@@ -181,22 +181,22 @@ class MapillaryDownloader:

  start_time = time.time()

- # Step 1: Build seen_ids from metadata file (streaming, only IDs)
- logger.info("Building seen_ids from metadata...")
+ # Step 1: Check if API fetch is already complete
  reader = MetadataReader(self.metadata_file)
- seen_ids = reader.get_all_ids()
  api_complete = reader.is_complete
- logger.info(f"Found {len(seen_ids)} existing images in metadata")
+ if api_complete:
+ logger.info("API fetch already complete, will only download")
+ else:
+ logger.info("API fetch incomplete, will fetch and download in parallel")

- # Step 2: Start worker pool (fork AFTER building seen_ids, BEFORE downloading)
- pool = AdaptiveWorkerPool(
- worker_process, min_workers=max(1, self.workers // 2), max_workers=self.workers, monitoring_interval=30
- )
+ # Step 2: Start worker pool
+ # Since workers do both I/O (download) and CPU (WebP), need many more workers
+ # Cap at 128 for now - will build proper dynamic scaling on a new branch later
+ pool = AdaptiveWorkerPool(worker_process, min_workers=self.workers, max_workers=128, monitoring_interval=10)
  pool.start()

- # Step 3: Download images from existing metadata while fetching new from API
+ # Step 3: Download images from metadata file while fetching new from API
  downloaded_count = 0
- skipped = 0
  total_bytes = 0
  failed_count = 0
  submitted = 0
@@ -218,25 +218,18 @@
  logger.info("API fetch thread: Starting...")
  with open(self.metadata_file, "a") as meta_f:
  for image in self.client.get_user_images(self.username, bbox=bbox):
- image_id = image["id"]
-
- # Skip if we already have this in our metadata file
- if image_id in seen_ids:
- continue
-
- seen_ids.add(image_id)
  new_images_count[0] += 1

- # Save new metadata
+ # Save metadata (don't dedupe here, let the tailer handle it)
  meta_f.write(json.dumps(image) + "\n")
  meta_f.flush()

  if new_images_count[0] % 1000 == 0:
- logger.info(f"API: Fetched {new_images_count[0]} new images from API")
+ logger.info(f"API: Fetched {new_images_count[0]} images from API")

  # Mark as complete
  MetadataReader.mark_complete(self.metadata_file)
- logger.info(f"API fetch complete: {new_images_count[0]} new images")
+ logger.info(f"API fetch complete: {new_images_count[0]} images")
  finally:
  api_fetch_complete.set()
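A note on the new comment above: the tailer it refers to is not part of this diff. The shape it implies (follow metadata.jsonl as the fetch thread appends to it, skip duplicate ids, stop at the completion marker) might look roughly like the sketch below; the function name and details are hypothetical, and a plain uncompressed .jsonl with whole-line writes is assumed.

    import json
    import time

    def tail_metadata(path, poll_interval=0.5):
        # Hypothetical tailer: yield image records as they are appended to the JSONL
        # file, skipping duplicate ids and stopping at the completion marker.
        # Assumes the writer flushes whole lines (as the fetch thread above does).
        seen = set()
        with open(path) as f:
            while True:
                line = f.readline()
                if not line:
                    time.sleep(poll_interval)
                    continue
                record = json.loads(line)
                if record.get("__complete__"):
                    break
                if record["id"] in seen:
                    continue
                seen.add(record["id"])
                yield record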
@@ -438,14 +431,7 @@
  self._save_progress()
  elapsed = time.time() - start_time

- # Count total images in metadata
- total_images = len(seen_ids)
- skipped = total_images - downloaded_count - failed_count
-
- logger.info(
- f"Complete! Total {total_images} images, downloaded {downloaded_count} ({format_size(total_bytes)}), "
- f"skipped {skipped}, failed {failed_count}"
- )
+ logger.info(f"Complete! Downloaded {downloaded_count} ({format_size(total_bytes)}), " f"failed {failed_count}")
  logger.info(f"Total time: {format_time(elapsed)}")

  # Tar sequence directories for efficient IA uploads
src/mapillary_downloader/metadata_reader.py
@@ -23,7 +23,47 @@ class MetadataReader:
  metadata_file: Path to metadata.jsonl or metadata.jsonl.gz
  """
  self.metadata_file = Path(metadata_file)
- self.is_complete = False
+ self.is_complete = self._check_complete()
+
+ def _check_complete(self):
+ """Check if metadata file has completion marker.
+
+ Returns:
+ True if completion marker found, False otherwise
+ """
+ if not self.metadata_file.exists():
+ return False
+
+ # Check last few lines for completion marker (it should be at the end)
+ try:
+ if self.metadata_file.suffix == ".gz":
+ file_handle = gzip.open(self.metadata_file, "rt")
+ else:
+ file_handle = open(self.metadata_file)
+
+ with file_handle as f:
+ # Read last 10 lines to find completion marker
+ lines = []
+ for line in f:
+ lines.append(line)
+ if len(lines) > 10:
+ lines.pop(0)
+
+ # Check if any of the last lines is the completion marker
+ for line in reversed(lines):
+ line = line.strip()
+ if not line:
+ continue
+ try:
+ data = json.loads(line)
+ if data.get("__complete__"):
+ return True
+ except json.JSONDecodeError:
+ continue
+
+ return False
+ except Exception:
+ return False

  def iter_images(self, quality_field=None, downloaded_ids=None):
  """Stream images from metadata file with filtering.
src/mapillary_downloader/worker.py
@@ -17,6 +17,9 @@ def worker_process(work_queue, result_queue, worker_id):
  result_queue: Queue to push results to
  worker_id: Unique worker identifier
  """
+ # Create session once per worker (reuse HTTP connections)
+ session = requests.Session()
+
  while True:
  work_item = work_queue.get()
@@ -27,14 +30,17 @@ def worker_process(work_queue, result_queue, worker_id):
  # Unpack work item
  image_data, output_dir, quality, convert_webp, access_token = work_item

+ # Update session auth for this request
+ session.headers.update({"Authorization": f"OAuth {access_token}"})
+
  # Process the image
- result = download_and_convert_image(image_data, output_dir, quality, convert_webp, access_token)
+ result = download_and_convert_image(image_data, output_dir, quality, convert_webp, session)

  # Push result back
  result_queue.put(result)


- def download_and_convert_image(image_data, output_dir, quality, convert_webp, access_token):
+ def download_and_convert_image(image_data, output_dir, quality, convert_webp, session):
  """Download and optionally convert a single image.

  This function is designed to run in a worker process.
@@ -44,7 +50,7 @@ def download_and_convert_image(image_data, output_dir, quality, convert_webp, ac
  output_dir: Base output directory path
  quality: Quality level (256, 1024, 2048, original)
  convert_webp: Whether to convert to WebP
- access_token: Mapillary API access token
+ session: requests.Session with auth already configured

  Returns:
  Tuple of (image_id, bytes_downloaded, success, error_msg)
@@ -78,11 +84,7 @@
  jpg_path = img_dir / f"{image_id}.jpg"
  final_path = jpg_path

- # Download image
- # No retries for CDN images - they're cheap, just skip failures and move on
- session = requests.Session()
- session.headers.update({"Authorization": f"OAuth {access_token}"})
-
+ # Download image (using session passed from worker)
  bytes_downloaded = 0

  try:
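The net effect of the worker.py changes is that each worker process now owns one requests.Session for its lifetime, so keep-alive connections to the image CDN are reused across downloads instead of being re-established per image. A stripped-down sketch of the pattern (the queue item format here is illustrative, not the package's real work item):

    import requests

    def worker_loop(work_queue, result_queue, worker_id):
        session = requests.Session()  # created once per worker; pools TCP/TLS connections
        while True:
            item = work_queue.get()
            if item is None:  # stop signal, mirroring the pool's shutdown()
                break
            url, access_token = item
            session.headers.update({"Authorization": f"OAuth {access_token}"})
            resp = session.get(url, timeout=30)
            result_queue.put((worker_id, url, resp.status_code, len(resp.content)))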
mapillary_downloader-0.5.1/src/mapillary_downloader/worker_pool.py
@@ -0,0 +1,204 @@
+ """Adaptive worker pool for parallel processing."""
+
+ import logging
+ import multiprocessing as mp
+ import queue
+ import time
+ from collections import deque
+
+ logger = logging.getLogger("mapillary_downloader")
+
+
+ class AdaptiveWorkerPool:
+ """Worker pool that scales based on throughput.
+
+ Monitors throughput every 30 seconds and adjusts worker count:
+ - If throughput increasing: add workers (up to max)
+ - If throughput plateauing/decreasing: reduce workers
+ """
+
+ def __init__(self, worker_func, min_workers=4, max_workers=16, monitoring_interval=10):
+ """Initialize adaptive worker pool.
+
+ Args:
+ worker_func: Function to run in each worker (must accept work_queue, result_queue)
+ min_workers: Minimum number of workers
+ max_workers: Maximum number of workers
+ monitoring_interval: Seconds between throughput checks
+ """
+ self.worker_func = worker_func
+ self.min_workers = min_workers
+ self.max_workers = max_workers
+ self.monitoring_interval = monitoring_interval
+
+ # Queues
+ self.work_queue = mp.Queue(maxsize=max_workers)
+ self.result_queue = mp.Queue()
+
+ # Worker management
+ self.workers = []
+ self.current_workers = min_workers # Start small and ramp up
+
+ # Throughput monitoring
+ self.throughput_history = deque(maxlen=5) # Last 5 measurements
+ self.worker_count_history = deque(maxlen=5) # Track worker counts at each measurement
+ self.last_processed = 0
+ self.last_check_time = time.time()
+
+ self.running = False
+
+ def start(self):
+ """Start the worker pool."""
+ self.running = True
+ logger.info(f"Starting worker pool with {self.current_workers} workers")
+
+ for i in range(self.current_workers):
+ self._add_worker(i)
+
+ def _add_worker(self, worker_id):
+ """Add a new worker to the pool."""
+ p = mp.Process(target=self.worker_func, args=(self.work_queue, self.result_queue, worker_id))
+ p.start()
+ self.workers.append(p)
+ logger.debug(f"Started worker {worker_id}")
+
+ def submit(self, work_item):
+ """Submit work to the pool (blocks if queue is full)."""
+ self.work_queue.put(work_item)
+
+ def get_result(self, timeout=None):
+ """Get a result from the workers.
+
+ Returns:
+ Result from worker, or None if timeout
+ """
+ try:
+ return self.result_queue.get(timeout=timeout)
+ except queue.Empty:
+ return None
+
+ def check_throughput(self, total_processed):
+ """Check throughput and adjust workers if needed.
+
+ Args:
+ total_processed: Total number of items processed so far
+ """
+ now = time.time()
+ elapsed = now - self.last_check_time
+
+ if elapsed < self.monitoring_interval:
+ logger.debug(f"Throughput check skipped (elapsed {elapsed:.1f}s < {self.monitoring_interval}s)")
+ return
+
+ # Calculate current throughput (items/sec)
+ items_since_check = total_processed - self.last_processed
+ throughput = items_since_check / elapsed
+
+ current_workers = len(self.workers)
+ self.throughput_history.append(throughput)
+ self.worker_count_history.append(current_workers)
+ self.last_processed = total_processed
+ self.last_check_time = now
+
+ logger.info(
+ f"Throughput: {throughput:.1f} items/s (workers: {current_workers}/{self.max_workers}, "
+ f"history: {len(self.throughput_history)} measurements)"
+ )
+
+ # Need at least 2 measurements to calculate gain per worker
+ if len(self.throughput_history) < 2:
+ # First measurement - add 20% more workers
+ if current_workers < self.max_workers:
+ workers_to_add = max(1, int(current_workers * 0.2))
+ for i in range(workers_to_add):
+ if len(self.workers) < self.max_workers:
+ new_worker_id = len(self.workers)
+ self._add_worker(new_worker_id)
+ self.current_workers += 1
+ logger.info(
+ f"Ramping up: added {workers_to_add} workers (now {self.current_workers}/{self.max_workers})"
+ )
+ return
+
+ # Calculate throughput gain per worker added
+ current_throughput = self.throughput_history[-1]
+ previous_throughput = self.throughput_history[-2]
+ previous_workers = self.worker_count_history[-2]
+
+ throughput_gain = current_throughput - previous_throughput
+ workers_added = current_workers - previous_workers
+
+ logger.debug(
+ f"Trend: {previous_throughput:.1f} items/s @ {previous_workers} workers → "
+ f"{current_throughput:.1f} items/s @ {current_workers} workers "
+ f"(gain: {throughput_gain:.1f}, added: {workers_added})"
+ )
+
+ # If throughput decreased significantly, stop adding workers
+ if current_throughput < previous_throughput * 0.95:
+ logger.info(
+ f"Throughput decreasing ({current_throughput:.1f} vs {previous_throughput:.1f} items/s), "
+ f"stopping at {current_workers} workers"
+ )
+ # If throughput is still increasing or stable, add more workers
+ elif current_throughput >= previous_throughput * 0.95 and current_workers < self.max_workers:
+ if workers_added > 0 and throughput_gain > 0:
+ # Calculate gain per worker
+ gain_per_worker = throughput_gain / workers_added
+ logger.debug(f"Gain per worker: {gain_per_worker:.2f} items/s")
+
+ # Estimate how many more workers we could benefit from
+ # Assume diminishing returns, so be conservative
+ if gain_per_worker > 0.5:
+ # Good gain per worker - add more aggressively
+ workers_to_add = max(1, int(current_workers * 0.3))
+ elif gain_per_worker > 0.2:
+ # Moderate gain - add moderately
+ workers_to_add = max(1, int(current_workers * 0.2))
+ else:
+ # Small gain - add conservatively
+ workers_to_add = max(1, int(current_workers * 0.1))
+
+ added = 0
+ for i in range(workers_to_add):
+ if len(self.workers) < self.max_workers:
+ new_worker_id = len(self.workers)
+ self._add_worker(new_worker_id)
+ self.current_workers += 1
+ added += 1
+
+ logger.info(
+ f"Throughput increasing (gain: {gain_per_worker:.2f} items/s per worker), "
+ f"added {added} workers (now {self.current_workers}/{self.max_workers})"
+ )
+ else:
+ # Fallback to 20% if we can't calculate gain per worker
+ workers_to_add = max(1, int(current_workers * 0.2))
+ added = 0
+ for i in range(workers_to_add):
+ if len(self.workers) < self.max_workers:
+ new_worker_id = len(self.workers)
+ self._add_worker(new_worker_id)
+ self.current_workers += 1
+ added += 1
+ logger.info(f"Ramping up: added {added} workers (now {self.current_workers}/{self.max_workers})")
+ else:
+ logger.info(f"At optimal worker count: {current_workers} workers, {current_throughput:.1f} items/s")
+
+ def shutdown(self, timeout=30):
+ """Shutdown the worker pool gracefully."""
+ logger.info("Shutting down worker pool...")
+ self.running = False
+
+ # Send stop signals
+ for _ in self.workers:
+ self.work_queue.put(None)
+
+ # Wait for workers to finish
+ for p in self.workers:
+ p.join(timeout=timeout)
+ if p.is_alive():
+ logger.warning(f"Worker {p.pid} did not exit cleanly, terminating")
+ p.terminate()
+
+ logger.info("Worker pool shutdown complete")
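For orientation, a minimal driver for the new pool might look like the following; the echo worker and integer work items are placeholders, and the real caller in downloader.py presumably submits image tuples and calls check_throughput() as results come back:

    from mapillary_downloader.worker_pool import AdaptiveWorkerPool

    def echo_worker(work_queue, result_queue, worker_id):
        # Placeholder worker; the real worker_process downloads and converts images.
        while True:
            item = work_queue.get()
            if item is None:
                break
            result_queue.put((worker_id, item))

    if __name__ == "__main__":
        pool = AdaptiveWorkerPool(echo_worker, min_workers=2, max_workers=8, monitoring_interval=10)
        pool.start()
        for i in range(100):
            pool.submit(i)

        done = 0
        while done < 100:
            if pool.get_result(timeout=1) is not None:
                done += 1
                pool.check_throughput(done)  # returns early until monitoring_interval has elapsed
        pool.shutdown()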
mapillary_downloader-0.5.0/src/mapillary_downloader/worker_pool.py
@@ -1,136 +0,0 @@
- """Adaptive worker pool for parallel processing."""
-
- import logging
- import multiprocessing as mp
- import queue
- import time
- from collections import deque
-
- logger = logging.getLogger("mapillary_downloader")
-
-
- class AdaptiveWorkerPool:
- """Worker pool that scales based on throughput.
-
- Monitors throughput every 30 seconds and adjusts worker count:
- - If throughput increasing: add workers (up to max)
- - If throughput plateauing/decreasing: reduce workers
- """
-
- def __init__(self, worker_func, min_workers=4, max_workers=16, monitoring_interval=30):
- """Initialize adaptive worker pool.
-
- Args:
- worker_func: Function to run in each worker (must accept work_queue, result_queue)
- min_workers: Minimum number of workers
- max_workers: Maximum number of workers
- monitoring_interval: Seconds between throughput checks
- """
- self.worker_func = worker_func
- self.min_workers = min_workers
- self.max_workers = max_workers
- self.monitoring_interval = monitoring_interval
-
- # Queues
- self.work_queue = mp.Queue(maxsize=max_workers)
- self.result_queue = mp.Queue()
-
- # Worker management
- self.workers = []
- self.current_workers = min_workers
-
- # Throughput monitoring
- self.throughput_history = deque(maxlen=5) # Last 5 measurements
- self.last_processed = 0
- self.last_check_time = time.time()
-
- self.running = False
-
- def start(self):
- """Start the worker pool."""
- self.running = True
- logger.info(f"Starting worker pool with {self.current_workers} workers")
-
- for i in range(self.current_workers):
- self._add_worker(i)
-
- def _add_worker(self, worker_id):
- """Add a new worker to the pool."""
- p = mp.Process(target=self.worker_func, args=(self.work_queue, self.result_queue, worker_id))
- p.start()
- self.workers.append(p)
- logger.debug(f"Started worker {worker_id}")
-
- def submit(self, work_item):
- """Submit work to the pool (blocks if queue is full)."""
- self.work_queue.put(work_item)
-
- def get_result(self, timeout=None):
- """Get a result from the workers.
-
- Returns:
- Result from worker, or None if timeout
- """
- try:
- return self.result_queue.get(timeout=timeout)
- except queue.Empty:
- return None
-
- def check_throughput(self, total_processed):
- """Check throughput and adjust workers if needed.
-
- Args:
- total_processed: Total number of items processed so far
- """
- now = time.time()
- elapsed = now - self.last_check_time
-
- if elapsed < self.monitoring_interval:
- return
-
- # Calculate current throughput (items/sec)
- items_since_check = total_processed - self.last_processed
- throughput = items_since_check / elapsed
-
- self.throughput_history.append(throughput)
- self.last_processed = total_processed
- self.last_check_time = now
-
- # Need at least 3 measurements to detect trends
- if len(self.throughput_history) < 3:
- return
-
- # Check if throughput is increasing
- recent_avg = sum(list(self.throughput_history)[-2:]) / 2
- older_avg = sum(list(self.throughput_history)[-4:-2]) / 2
-
- if recent_avg > older_avg * 1.1 and len(self.workers) < self.max_workers:
- # Throughput increasing by >10%, add workers
- new_worker_id = len(self.workers)
- self._add_worker(new_worker_id)
- self.current_workers += 1
- logger.info(f"Throughput increasing ({throughput:.1f} items/s), added worker (now {self.current_workers})")
-
- elif recent_avg < older_avg * 0.9 and len(self.workers) > self.min_workers:
- # Throughput decreasing by >10%, remove worker
- # (workers will exit naturally when they finish current work)
- self.current_workers = max(self.min_workers, self.current_workers - 1)
- logger.info(f"Throughput plateauing ({throughput:.1f} items/s), reducing to {self.current_workers} workers")
-
- def shutdown(self, timeout=30):
- """Shutdown the worker pool gracefully."""
- logger.info("Shutting down worker pool...")
- self.running = False
-
- # Send stop signals
- for _ in self.workers:
- self.work_queue.put(None)
-
- # Wait for workers to finish
- for p in self.workers:
- p.join(timeout=timeout)
- if p.is_alive():
- logger.warning(f"Worker {p.pid} did not exit cleanly, terminating")
- p.terminate()
-
- logger.info("Worker pool shutdown complete")
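Compared with the removed version, which adjusted the worker count by at most one per check based on a moving-average comparison, the new pool's ramp-up step scales with the measured gain per worker. Restated as a standalone function for illustration (not code from the package):

    def workers_to_add(current_workers, gain_per_worker):
        """Mirror of the 0.5.1 ramp-up heuristic: larger per-worker gains justify larger steps."""
        if gain_per_worker > 0.5:
            return max(1, int(current_workers * 0.3))  # good gain: add aggressively
        if gain_per_worker > 0.2:
            return max(1, int(current_workers * 0.2))  # moderate gain: add moderately
        return max(1, int(current_workers * 0.1))      # small gain: add conservatively

    print(workers_to_add(10, 0.8))  # 3
    print(workers_to_add(10, 0.3))  # 2
    print(workers_to_add(40, 0.1))  # 4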