mapillary-downloader 0.5.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -181,22 +181,22 @@ class MapillaryDownloader:
 
  start_time = time.time()
 
- # Step 1: Build seen_ids from metadata file (streaming, only IDs)
- logger.info("Building seen_ids from metadata...")
+ # Step 1: Check if API fetch is already complete
  reader = MetadataReader(self.metadata_file)
- seen_ids = reader.get_all_ids()
  api_complete = reader.is_complete
- logger.info(f"Found {len(seen_ids)} existing images in metadata")
+ if api_complete:
+ logger.info("API fetch already complete, will only download")
+ else:
+ logger.info("API fetch incomplete, will fetch and download in parallel")
 
- # Step 2: Start worker pool (fork AFTER building seen_ids, BEFORE downloading)
- pool = AdaptiveWorkerPool(
- worker_process, min_workers=max(1, self.workers // 2), max_workers=self.workers, monitoring_interval=30
- )
+ # Step 2: Start worker pool
+ # Since workers do both I/O (download) and CPU (WebP), need many more workers
+ # Cap at 128 for now - will build proper dynamic scaling on a new branch later
+ pool = AdaptiveWorkerPool(worker_process, min_workers=self.workers, max_workers=128, monitoring_interval=10)
  pool.start()
 
- # Step 3: Download images from existing metadata while fetching new from API
+ # Step 3: Download images from metadata file while fetching new from API
  downloaded_count = 0
- skipped = 0
  total_bytes = 0
  failed_count = 0
  submitted = 0
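
The hunk above replaces the up-front seen_ids scan with a single check of the metadata completion marker: if the API fetch already finished, the run only downloads; otherwise it fetches and downloads in parallel. A minimal sketch of that control flow, with illustrative helper names (run_api_fetch and drain_metadata are placeholders, not the package's actual methods):

    import threading

    def start_run(reader, run_api_fetch, drain_metadata):
        """Illustrative control flow only: skip the fetch thread when metadata is already complete."""
        api_fetch_complete = threading.Event()
        if reader.is_complete:
            api_fetch_complete.set()          # download-only mode
        else:
            fetch_thread = threading.Thread(target=run_api_fetch, args=(api_fetch_complete,), daemon=True)
            fetch_thread.start()              # fetch and download in parallel
        drain_metadata(stop_when=api_fetch_complete)  # tail metadata.jsonl and feed the worker pool
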
@@ -218,25 +218,18 @@ class MapillaryDownloader:
  logger.info("API fetch thread: Starting...")
  with open(self.metadata_file, "a") as meta_f:
  for image in self.client.get_user_images(self.username, bbox=bbox):
- image_id = image["id"]
-
- # Skip if we already have this in our metadata file
- if image_id in seen_ids:
- continue
-
- seen_ids.add(image_id)
  new_images_count[0] += 1
 
- # Save new metadata
+ # Save metadata (don't dedupe here, let the tailer handle it)
  meta_f.write(json.dumps(image) + "\n")
  meta_f.flush()
 
  if new_images_count[0] % 1000 == 0:
- logger.info(f"API: Fetched {new_images_count[0]} new images from API")
+ logger.info(f"API: Fetched {new_images_count[0]} images from API")
 
  # Mark as complete
  MetadataReader.mark_complete(self.metadata_file)
- logger.info(f"API fetch complete: {new_images_count[0]} new images")
+ logger.info(f"API fetch complete: {new_images_count[0]} images")
  finally:
  api_fetch_complete.set()
 
@@ -438,14 +431,7 @@ class MapillaryDownloader:
  self._save_progress()
  elapsed = time.time() - start_time
 
- # Count total images in metadata
- total_images = len(seen_ids)
- skipped = total_images - downloaded_count - failed_count
-
- logger.info(
- f"Complete! Total {total_images} images, downloaded {downloaded_count} ({format_size(total_bytes)}), "
- f"skipped {skipped}, failed {failed_count}"
- )
+ logger.info(f"Complete! Downloaded {downloaded_count} ({format_size(total_bytes)}), " f"failed {failed_count}")
  logger.info(f"Total time: {format_time(elapsed)}")
 
  # Tar sequence directories for efficient IA uploads
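
The API fetch thread above now appends every record it receives and defers de-duplication to the metadata tailer, which is not included in this diff. A rough sketch of what tailing metadata.jsonl and de-duplicating before submission could look like (function and parameter names are illustrative, not the downloader's real internals):

    import json
    import time

    def tail_metadata(path, submit, api_fetch_complete):
        """Illustrative: stream metadata.jsonl, skip duplicates, submit each image once."""
        seen_ids = set()
        with open(path) as f:
            while True:
                line = f.readline()
                if not line:
                    if api_fetch_complete.is_set():
                        break             # writer finished and we reached EOF
                    time.sleep(0.5)       # wait for the fetch thread to append more
                    continue
                record = json.loads(line)
                if record.get("__complete__") or record["id"] in seen_ids:
                    continue              # completion marker or already submitted
                seen_ids.add(record["id"])
                submit(record)            # hand off to the worker pool
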
@@ -23,7 +23,47 @@ class MetadataReader:
  metadata_file: Path to metadata.jsonl or metadata.jsonl.gz
  """
  self.metadata_file = Path(metadata_file)
- self.is_complete = False
+ self.is_complete = self._check_complete()
+
+ def _check_complete(self):
+ """Check if metadata file has completion marker.
+
+ Returns:
+ True if completion marker found, False otherwise
+ """
+ if not self.metadata_file.exists():
+ return False
+
+ # Check last few lines for completion marker (it should be at the end)
+ try:
+ if self.metadata_file.suffix == ".gz":
+ file_handle = gzip.open(self.metadata_file, "rt")
+ else:
+ file_handle = open(self.metadata_file)
+
+ with file_handle as f:
+ # Read last 10 lines to find completion marker
+ lines = []
+ for line in f:
+ lines.append(line)
+ if len(lines) > 10:
+ lines.pop(0)
+
+ # Check if any of the last lines is the completion marker
+ for line in reversed(lines):
+ line = line.strip()
+ if not line:
+ continue
+ try:
+ data = json.loads(line)
+ if data.get("__complete__"):
+ return True
+ except json.JSONDecodeError:
+ continue
+
+ return False
+ except Exception:
+ return False
 
  def iter_images(self, quality_field=None, downloaded_ids=None):
  """Stream images from metadata file with filtering.
@@ -17,6 +17,9 @@ def worker_process(work_queue, result_queue, worker_id):
  result_queue: Queue to push results to
  worker_id: Unique worker identifier
  """
+ # Create session once per worker (reuse HTTP connections)
+ session = requests.Session()
+
  while True:
  work_item = work_queue.get()
 
@@ -27,14 +30,17 @@ def worker_process(work_queue, result_queue, worker_id):
  # Unpack work item
  image_data, output_dir, quality, convert_webp, access_token = work_item
 
+ # Update session auth for this request
+ session.headers.update({"Authorization": f"OAuth {access_token}"})
+
  # Process the image
- result = download_and_convert_image(image_data, output_dir, quality, convert_webp, access_token)
+ result = download_and_convert_image(image_data, output_dir, quality, convert_webp, session)
 
  # Push result back
  result_queue.put(result)
 
 
- def download_and_convert_image(image_data, output_dir, quality, convert_webp, access_token):
+ def download_and_convert_image(image_data, output_dir, quality, convert_webp, session):
  """Download and optionally convert a single image.
 
  This function is designed to run in a worker process.
@@ -44,7 +50,7 @@ download_and_convert_image(image_data, output_dir, quality, convert_webp, ac
  output_dir: Base output directory path
  quality: Quality level (256, 1024, 2048, original)
  convert_webp: Whether to convert to WebP
- access_token: Mapillary API access token
+ session: requests.Session with auth already configured
 
  Returns:
  Tuple of (image_id, bytes_downloaded, success, error_msg)
@@ -78,11 +84,7 @@ download_and_convert_image(image_data, output_dir, quality, convert_webp, ac
  jpg_path = img_dir / f"{image_id}.jpg"
  final_path = jpg_path
 
- # Download image
- # No retries for CDN images - they're cheap, just skip failures and move on
- session = requests.Session()
- session.headers.update({"Authorization": f"OAuth {access_token}"})
-
+ # Download image (using session passed from worker)
  bytes_downloaded = 0
 
  try:
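
Taken together, the worker.py hunks move Session creation out of the per-image path: each worker process now builds one requests.Session up front and reuses it, so the underlying connection pool keeps TCP/TLS connections to the image CDN open instead of re-handshaking for every download. A minimal standalone illustration of the pattern (the token value and timeout are placeholders):

    import requests

    session = requests.Session()
    session.headers.update({"Authorization": "OAuth <access_token>"})  # placeholder token

    def fetch(url):
        resp = session.get(url, timeout=60)  # reuses the pooled connection across calls
        resp.raise_for_status()
        return resp.content
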
@@ -17,7 +17,7 @@ class AdaptiveWorkerPool:
  - If throughput plateauing/decreasing: reduce workers
  """
 
- def __init__(self, worker_func, min_workers=4, max_workers=16, monitoring_interval=30):
+ def __init__(self, worker_func, min_workers=4, max_workers=16, monitoring_interval=10):
  """Initialize adaptive worker pool.
 
  Args:
@@ -37,10 +37,11 @@ class AdaptiveWorkerPool:
 
  # Worker management
  self.workers = []
- self.current_workers = min_workers
+ self.current_workers = min_workers # Start small and ramp up
 
  # Throughput monitoring
  self.throughput_history = deque(maxlen=5) # Last 5 measurements
+ self.worker_count_history = deque(maxlen=5) # Track worker counts at each measurement
  self.last_processed = 0
  self.last_check_time = time.time()
 
@@ -86,36 +87,103 @@ class AdaptiveWorkerPool:
  elapsed = now - self.last_check_time
 
  if elapsed < self.monitoring_interval:
+ logger.debug(f"Throughput check skipped (elapsed {elapsed:.1f}s < {self.monitoring_interval}s)")
  return
 
  # Calculate current throughput (items/sec)
  items_since_check = total_processed - self.last_processed
  throughput = items_since_check / elapsed
 
+ current_workers = len(self.workers)
  self.throughput_history.append(throughput)
+ self.worker_count_history.append(current_workers)
  self.last_processed = total_processed
  self.last_check_time = now
 
- # Need at least 3 measurements to detect trends
- if len(self.throughput_history) < 3:
+ logger.info(
+ f"Throughput: {throughput:.1f} items/s (workers: {current_workers}/{self.max_workers}, "
+ f"history: {len(self.throughput_history)} measurements)"
+ )
+
+ # Need at least 2 measurements to calculate gain per worker
+ if len(self.throughput_history) < 2:
+ # First measurement - add 20% more workers
+ if current_workers < self.max_workers:
+ workers_to_add = max(1, int(current_workers * 0.2))
+ for i in range(workers_to_add):
+ if len(self.workers) < self.max_workers:
+ new_worker_id = len(self.workers)
+ self._add_worker(new_worker_id)
+ self.current_workers += 1
+ logger.info(
+ f"Ramping up: added {workers_to_add} workers (now {self.current_workers}/{self.max_workers})"
+ )
  return
 
- # Check if throughput is increasing
- recent_avg = sum(list(self.throughput_history)[-2:]) / 2
- older_avg = sum(list(self.throughput_history)[-4:-2]) / 2
-
- if recent_avg > older_avg * 1.1 and len(self.workers) < self.max_workers:
- # Throughput increasing by >10%, add workers
- new_worker_id = len(self.workers)
- self._add_worker(new_worker_id)
- self.current_workers += 1
- logger.info(f"Throughput increasing ({throughput:.1f} items/s), added worker (now {self.current_workers})")
-
- elif recent_avg < older_avg * 0.9 and len(self.workers) > self.min_workers:
- # Throughput decreasing by >10%, remove worker
- # (workers will exit naturally when they finish current work)
- self.current_workers = max(self.min_workers, self.current_workers - 1)
- logger.info(f"Throughput plateauing ({throughput:.1f} items/s), reducing to {self.current_workers} workers")
+ # Calculate throughput gain per worker added
+ current_throughput = self.throughput_history[-1]
+ previous_throughput = self.throughput_history[-2]
+ previous_workers = self.worker_count_history[-2]
+
+ throughput_gain = current_throughput - previous_throughput
+ workers_added = current_workers - previous_workers
+
+ logger.debug(
+ f"Trend: {previous_throughput:.1f} items/s @ {previous_workers} workers "
+ f"{current_throughput:.1f} items/s @ {current_workers} workers "
+ f"(gain: {throughput_gain:.1f}, added: {workers_added})"
+ )
+
+ # If throughput decreased significantly, stop adding workers
+ if current_throughput < previous_throughput * 0.95:
+ logger.info(
+ f"Throughput decreasing ({current_throughput:.1f} vs {previous_throughput:.1f} items/s), "
+ f"stopping at {current_workers} workers"
+ )
+ # If throughput is still increasing or stable, add more workers
+ elif current_throughput >= previous_throughput * 0.95 and current_workers < self.max_workers:
+ if workers_added > 0 and throughput_gain > 0:
+ # Calculate gain per worker
+ gain_per_worker = throughput_gain / workers_added
+ logger.debug(f"Gain per worker: {gain_per_worker:.2f} items/s")
+
+ # Estimate how many more workers we could benefit from
+ # Assume diminishing returns, so be conservative
+ if gain_per_worker > 0.5:
+ # Good gain per worker - add more aggressively
+ workers_to_add = max(1, int(current_workers * 0.3))
+ elif gain_per_worker > 0.2:
+ # Moderate gain - add moderately
+ workers_to_add = max(1, int(current_workers * 0.2))
+ else:
+ # Small gain - add conservatively
+ workers_to_add = max(1, int(current_workers * 0.1))
+
+ added = 0
+ for i in range(workers_to_add):
+ if len(self.workers) < self.max_workers:
+ new_worker_id = len(self.workers)
+ self._add_worker(new_worker_id)
+ self.current_workers += 1
+ added += 1
+
+ logger.info(
+ f"Throughput increasing (gain: {gain_per_worker:.2f} items/s per worker), "
+ f"added {added} workers (now {self.current_workers}/{self.max_workers})"
+ )
+ else:
+ # Fallback to 20% if we can't calculate gain per worker
+ workers_to_add = max(1, int(current_workers * 0.2))
+ added = 0
+ for i in range(workers_to_add):
+ if len(self.workers) < self.max_workers:
+ new_worker_id = len(self.workers)
+ self._add_worker(new_worker_id)
+ self.current_workers += 1
+ added += 1
+ logger.info(f"Ramping up: added {added} workers (now {self.current_workers}/{self.max_workers})")
+ else:
+ logger.info(f"At optimal worker count: {current_workers} workers, {current_throughput:.1f} items/s")
 
  def shutdown(self, timeout=30):
  """Shutdown the worker pool gracefully."""
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mapillary_downloader
- Version: 0.5.0
+ Version: 0.5.1
  Summary: Download your Mapillary data before it's gone
  Author-email: Gareth Davidson <gaz@bitplane.net>
  Requires-Python: >=3.10
@@ -1,19 +1,19 @@
  mapillary_downloader/__init__.py,sha256=KEjiBRghXDeA7E15RJeLBfQm-yNJkowZarL59QOh_1w,120
  mapillary_downloader/__main__.py,sha256=Kjfx2woMyCvAxYAdqvtXtYJknCMviV_K2PSo0cDc8Hg,4320
  mapillary_downloader/client.py,sha256=a5n43FLHP45EHodEjl0ieziBK-b6Ey-rZJwYB6EFhNI,4743
- mapillary_downloader/downloader.py,sha256=F36AtB0Ro_EXR78EDOqH248llV7fGVeR4j9nZf0q7qg,19988
+ mapillary_downloader/downloader.py,sha256=v0vLovW80DMpEzQIb_tdETF1HzO2GEMbbVzD93aKnnQ,19561
  mapillary_downloader/exif_writer.py,sha256=K_441EG1siWyNMmFGZSfnORUCjBThkeg4JFtbg9AOsA,5120
  mapillary_downloader/ia_check.py,sha256=L2MEbG_KmlAd5NLmo2HQkO8HWvRN0brE5wXXoyNMbq8,1100
  mapillary_downloader/ia_meta.py,sha256=78rcybHIPnQDsF02KGj6RYmDXzYzrU8sdVx4Q9Y0sfI,6266
  mapillary_downloader/logging_config.py,sha256=Z-wNq34nt7aIhJWdeKc1feTY46P9-Or7HtiX7eUFjEI,2324
- mapillary_downloader/metadata_reader.py,sha256=-4BmtLVI9sldZU0LlqMc-bporiYNpk6-F2RKKMvzLu4,3560
+ mapillary_downloader/metadata_reader.py,sha256=Re-HN0Vfc7Hs1eOut7uOoW7jWJ2PIbKoNzC7Ak3ah5o,4933
  mapillary_downloader/tar_sequences.py,sha256=mqs5p3N7osV_bxTkw6i34GVmxCBBEbIiKKxeh-fWNdU,4430
  mapillary_downloader/utils.py,sha256=yzVgS1mwsklDAqrimaFafgTTXtRYQUbKP98Xgh9d2KA,1174
  mapillary_downloader/webp_converter.py,sha256=vYLLQxDmdnqRz0nm7wXwRUd4x9mQZNah-DrncpA8sNs,1901
- mapillary_downloader/worker.py,sha256=RMZO8N67Kl-bhHC1qUdZg6Sx8k6RYbPRhyuLyOjr29o,4450
- mapillary_downloader/worker_pool.py,sha256=QFYIbqkgamOtB-iRyZp5kN6jdZuYw93izls61ayVIZ8,4771
- mapillary_downloader-0.5.0.dist-info/entry_points.txt,sha256=PdYtxOXHMJrUhmiPO4G-F98VuhUI4MN9D_T4KPrVZ5w,75
- mapillary_downloader-0.5.0.dist-info/licenses/LICENSE.md,sha256=7_BIuQ-veOrsF-WarH8kTkm0-xrCLvJ1PFE1C4Ebs64,146
- mapillary_downloader-0.5.0.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
- mapillary_downloader-0.5.0.dist-info/METADATA,sha256=A0AhsIjGV9FBf5vz28hSC2jugcRqz5A8gsZwMGGEw2A,4982
- mapillary_downloader-0.5.0.dist-info/RECORD,,
+ mapillary_downloader/worker.py,sha256=syxsE2pPX_9EXzSGEGeUaeLFqmqZjTma-rB7S2zpYac,4511
+ mapillary_downloader/worker_pool.py,sha256=ctFl40UgFUjpLL_e6Mw5h7YNMfKwXulRhaX18r9sIkE,8257
+ mapillary_downloader-0.5.1.dist-info/entry_points.txt,sha256=PdYtxOXHMJrUhmiPO4G-F98VuhUI4MN9D_T4KPrVZ5w,75
+ mapillary_downloader-0.5.1.dist-info/licenses/LICENSE.md,sha256=7_BIuQ-veOrsF-WarH8kTkm0-xrCLvJ1PFE1C4Ebs64,146
+ mapillary_downloader-0.5.1.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
+ mapillary_downloader-0.5.1.dist-info/METADATA,sha256=1vM80Kq2WHUzfSt7lIx91hb0fycHosxpll-xnCda6JU,4982
+ mapillary_downloader-0.5.1.dist-info/RECORD,,