mapillary-downloader 0.8.0__tar.gz → 0.8.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (20)
  1. {mapillary_downloader-0.8.0 → mapillary_downloader-0.8.1}/PKG-INFO +1 -1
  2. {mapillary_downloader-0.8.0 → mapillary_downloader-0.8.1}/pyproject.toml +1 -1
  3. {mapillary_downloader-0.8.0 → mapillary_downloader-0.8.1}/src/mapillary_downloader/downloader.py +77 -110
  4. {mapillary_downloader-0.8.0 → mapillary_downloader-0.8.1}/src/mapillary_downloader/logging_config.py +5 -0
  5. {mapillary_downloader-0.8.0 → mapillary_downloader-0.8.1}/src/mapillary_downloader/webp_converter.py +0 -4
  6. {mapillary_downloader-0.8.0 → mapillary_downloader-0.8.1}/LICENSE.md +0 -0
  7. {mapillary_downloader-0.8.0 → mapillary_downloader-0.8.1}/README.md +0 -0
  8. {mapillary_downloader-0.8.0 → mapillary_downloader-0.8.1}/src/mapillary_downloader/__init__.py +0 -0
  9. {mapillary_downloader-0.8.0 → mapillary_downloader-0.8.1}/src/mapillary_downloader/__main__.py +0 -0
  10. {mapillary_downloader-0.8.0 → mapillary_downloader-0.8.1}/src/mapillary_downloader/client.py +0 -0
  11. {mapillary_downloader-0.8.0 → mapillary_downloader-0.8.1}/src/mapillary_downloader/exif_writer.py +0 -0
  12. {mapillary_downloader-0.8.0 → mapillary_downloader-0.8.1}/src/mapillary_downloader/ia_check.py +0 -0
  13. {mapillary_downloader-0.8.0 → mapillary_downloader-0.8.1}/src/mapillary_downloader/ia_meta.py +0 -0
  14. {mapillary_downloader-0.8.0 → mapillary_downloader-0.8.1}/src/mapillary_downloader/ia_stats.py +0 -0
  15. {mapillary_downloader-0.8.0 → mapillary_downloader-0.8.1}/src/mapillary_downloader/metadata_reader.py +0 -0
  16. {mapillary_downloader-0.8.0 → mapillary_downloader-0.8.1}/src/mapillary_downloader/tar_sequences.py +0 -0
  17. {mapillary_downloader-0.8.0 → mapillary_downloader-0.8.1}/src/mapillary_downloader/utils.py +0 -0
  18. {mapillary_downloader-0.8.0 → mapillary_downloader-0.8.1}/src/mapillary_downloader/worker.py +0 -0
  19. {mapillary_downloader-0.8.0 → mapillary_downloader-0.8.1}/src/mapillary_downloader/worker_pool.py +0 -0
  20. {mapillary_downloader-0.8.0 → mapillary_downloader-0.8.1}/src/mapillary_downloader/xmp_writer.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mapillary_downloader
-Version: 0.8.0
+Version: 0.8.1
 Summary: Archive user data from Mapillary
 Author-email: Gareth Davidson <gaz@bitplane.net>
 Requires-Python: >=3.10
pyproject.toml
@@ -1,7 +1,7 @@
 [project]
 name = "mapillary_downloader"
 description = "Archive user data from Mapillary"
-version = "0.8.0"
+version = "0.8.1"
 authors = [
     { name = "Gareth Davidson", email = "gaz@bitplane.net" }
 ]
src/mapillary_downloader/downloader.py
@@ -5,6 +5,7 @@ import json
 import logging
 import os
 import shutil
+import threading
 import time
 from pathlib import Path
 from mapillary_downloader.utils import format_size, format_time, safe_json_save
@@ -146,6 +147,65 @@ class MapillaryDownloader:
         # Write atomically using utility function
         safe_json_save(self.progress_file, progress)

+    def _submit_metadata_batch(self, file_handle, quality_field, pool, convert_webp, process_results, base_submitted):
+        """Read metadata lines from current position, submit to workers.
+
+        Args:
+            file_handle: Open file positioned at read point
+            quality_field: Field name for quality URL (e.g., "thumb_1024_url")
+            pool: Worker pool to submit to
+            convert_webp: Whether to convert to webp
+            process_results: Callback to drain result queue
+            base_submitted: Running total for cumulative logging
+
+        Returns:
+            tuple: (submitted_count, skipped_count) for this batch
+        """
+        submitted = 0
+        skipped = 0
+
+        for line in file_handle:
+            line = line.strip()
+            if not line:
+                continue
+
+            try:
+                image = json.loads(line)
+            except json.JSONDecodeError:
+                continue
+
+            if image.get("__complete__"):
+                continue
+
+            image_id = image.get("id")
+            if not image_id:
+                continue
+
+            if image_id in self.downloaded:
+                skipped += 1
+                continue
+
+            if not image.get(quality_field):
+                continue
+
+            work_item = (
+                image,
+                str(self.output_dir),
+                self.quality,
+                convert_webp,
+                self.client.access_token,
+            )
+            pool.submit(work_item)
+            submitted += 1
+
+            total = base_submitted + submitted
+            if total % 1000 == 0:
+                logger.info(f"Queue: submitted {total:,} images")
+
+            process_results()
+
+        return submitted, skipped
+
     def download_user_data(self, bbox=None, convert_webp=False):
         """Download all images for a user using streaming queue-based architecture.

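Note: the new `_submit_metadata_batch` helper assumes the metadata file is JSONL — one image record per line, terminated by a `{"__complete__": true}` sentinel once the API fetch finishes. A minimal sketch of a compatible producer (the writer function and file name here are illustrative, not part of the package):

```python
import json

def write_metadata(path, images):
    """Hypothetical producer for the JSONL file the batch reader tails."""
    with open(path, "a") as f:
        for image in images:
            # Each record needs an "id" and a quality URL field such as
            # "thumb_1024_url" for the reader to submit it for download.
            f.write(json.dumps(image) + "\n")
            f.flush()  # make the line visible to the tailing reader promptly
        # Sentinel object marking the end of the API fetch.
        f.write(json.dumps({"__complete__": True}) + "\n")

write_metadata("metadata.jsonl", [
    {"id": "123", "thumb_1024_url": "https://example.com/123.jpg"},
])
```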
@@ -187,13 +247,13 @@
         total_bytes = 0
         failed_count = 0
         submitted = 0
+        skipped_count = 0

         try:
             # Step 3a: Fetch metadata from API in parallel (write-only, don't block on queue)
-            if not api_complete:
-                import threading
+            api_fetch_complete = threading.Event()

-                api_fetch_complete = threading.Event()
+            if not api_complete:
                 new_images_count = [0]  # Mutable so thread can update it

                 def fetch_api_metadata():
@@ -221,7 +281,7 @@
                 api_thread = threading.Thread(target=fetch_api_metadata, daemon=True)
                 api_thread.start()
             else:
-                api_fetch_complete = None
+                api_fetch_complete.set()

             # Step 3b: Tail metadata file and submit to workers
             logger.debug("Starting metadata tail and download queue feeder")
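The two hunks above replace the `None` sentinel with a `threading.Event` that always exists: the fetch thread sets it when done, and the no-fetch path sets it immediately, so the tail loop below needs only one completion check instead of the old `is None` / `is_set()` pair. A minimal sketch of the pattern (the flag and sleeps are illustrative):

```python
import threading
import time

done = threading.Event()  # always created, never None

need_fetch = True  # stand-in for `not api_complete`
if need_fetch:
    def producer():
        time.sleep(0.5)  # stand-in for the API fetch
        done.set()       # signal completion, like the fetch thread
    threading.Thread(target=producer, daemon=True).start()
else:
    done.set()  # nothing to fetch: complete from the start

while not done.is_set():  # consumer has a single, uniform check
    time.sleep(0.1)
print("producer finished")
```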
@@ -244,9 +304,10 @@
                     total_bytes += bytes_dl

                     # Log every download for first 10, then every 100
+                    total_downloaded = len(self.downloaded)
                     should_log = downloaded_count <= 10 or downloaded_count % 100 == 0
                     if should_log:
-                        logger.info(f"Downloaded: {downloaded_count:,} ({format_size(total_bytes)})")
+                        logger.info(f"Downloaded: {total_downloaded:,} ({format_size(total_bytes)} this session)")

                     if downloaded_count % 100 == 0:
                         pool.check_throughput(downloaded_count)
@@ -260,117 +321,20 @@

             # Tail the metadata file and submit to workers
             while True:
-                # Check if API fetch is done and we've processed everything
-                if api_fetch_complete and api_fetch_complete.is_set():
-                    # Read any remaining lines
-                    if self.metadata_file.exists():
-                        with open(self.metadata_file) as f:
-                            f.seek(last_position)
-                            for line in f:
-                                line = line.strip()
-                                if not line:
-                                    continue
-
-                                try:
-                                    image = json.loads(line)
-                                except json.JSONDecodeError:
-                                    # Incomplete line, will retry
-                                    continue
-
-                                # Skip completion marker
-                                if image.get("__complete__"):
-                                    continue
-
-                                image_id = image.get("id")
-                                if not image_id:
-                                    continue
-
-                                # Skip if already downloaded or no quality URL
-                                if image_id in self.downloaded:
-                                    downloaded_count += 1
-                                    continue
-                                if not image.get(quality_field):
-                                    continue
-
-                                # Submit to workers
-                                work_item = (
-                                    image,
-                                    str(self.output_dir),
-                                    self.quality,
-                                    convert_webp,
-                                    self.client.access_token,
-                                )
-                                pool.submit(work_item)
-                                submitted += 1
-
-                                if submitted % 1000 == 0:
-                                    logger.info(f"Queue: submitted {submitted:,} images")
-
-                                # Process results while submitting
-                                process_results()
-
-                            last_position = f.tell()
-
-                    # API done and all lines processed, break
-                    break
-
-                # API still running or API was already complete, tail the file
                 if self.metadata_file.exists():
                     with open(self.metadata_file) as f:
                         f.seek(last_position)
-                        for line in f:
-                            line = line.strip()
-                            if not line:
-                                continue
-
-                            try:
-                                image = json.loads(line)
-                            except json.JSONDecodeError:
-                                # Incomplete line, will retry next iteration
-                                continue
-
-                            # Skip completion marker
-                            if image.get("__complete__"):
-                                continue
-
-                            image_id = image.get("id")
-                            if not image_id:
-                                continue
-
-                            # Skip if already downloaded or no quality URL
-                            if image_id in self.downloaded:
-                                downloaded_count += 1
-                                continue
-                            if not image.get(quality_field):
-                                continue
-
-                            # Submit to workers
-                            work_item = (
-                                image,
-                                str(self.output_dir),
-                                self.quality,
-                                convert_webp,
-                                self.client.access_token,
-                            )
-                            pool.submit(work_item)
-                            submitted += 1
-
-                            if submitted % 1000 == 0:
-                                logger.info(f"Queue: submitted {submitted:,} images")
-
-                            # Process results while submitting
-                            process_results()
-
+                        batch_submitted, batch_skipped = self._submit_metadata_batch(
+                            f, quality_field, pool, convert_webp, process_results, submitted
+                        )
+                        submitted += batch_submitted
+                        skipped_count += batch_skipped
                         last_position = f.tell()

-                # If API is already complete, we've read the whole file, so break
-                if api_fetch_complete is None:
+                if api_fetch_complete.is_set():
                     break

-                # Sleep briefly before next tail iteration
                 time.sleep(0.1)
-
-            # Process any results that came in
             process_results()

             # Send shutdown signals
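The rewritten loop is the classic tail-a-growing-file pattern: remember the byte offset, hand any new complete lines to the batch submitter, and exit once the completion event is observed. A race-free standalone sketch of the same idea, which samples the event before reading so lines flushed just before `set()` are never missed (`done` and `handle` are illustrative names):

```python
import time
from pathlib import Path

def tail_file(path, done, handle, poll=0.1):
    """Tail a growing file until `done` is set, passing new lines to `handle`."""
    last_position = 0
    while True:
        # Sample completion *before* reading: anything flushed before
        # done.set() is guaranteed to be picked up by this final pass.
        finished = done.is_set()
        p = Path(path)
        if p.exists():
            with open(p) as f:
                f.seek(last_position)
                for line in f:
                    handle(line)
                last_position = f.tell()
        if finished:
            break
        time.sleep(poll)
```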
@@ -397,7 +361,7 @@
                     total_bytes += bytes_dl

                 if downloaded_count % 100 == 0:
-                    logger.info(f"Downloaded: {downloaded_count:,} ({format_size(total_bytes)})")
+                    logger.info(f"Downloaded: {len(self.downloaded):,} ({format_size(total_bytes)} this session)")
                     pool.check_throughput(downloaded_count)
                 # Save progress every 5 minutes
                 if time.time() - self._last_save_time >= 300:
@@ -414,7 +378,10 @@
         self._save_progress()
         elapsed = time.time() - start_time

-        logger.info(f"Complete! Downloaded {downloaded_count:,} ({format_size(total_bytes)}), failed {failed_count:,}")
+        logger.info(
+            f"Complete! Downloaded {downloaded_count:,} this session ({format_size(total_bytes)}), "
+            f"{len(self.downloaded):,} total, skipped {skipped_count:,}, failed {failed_count:,}"
+        )
         logger.info(f"Total time: {format_time(elapsed)}")

         # Tar sequence directories for efficient IA uploads
src/mapillary_downloader/logging_config.py
@@ -15,6 +15,7 @@ class ColoredFormatter(logging.Formatter):
         "DEBUG": "\033[94m",  # Blue
         "RESET": "\033[0m",
     }
+    CYAN = "\033[96m"

     def __init__(self, fmt=None, datefmt=None, use_color=True):
         """Initialize the formatter.
@@ -41,6 +42,10 @@ class ColoredFormatter(logging.Formatter):
         if levelname in self.COLORS:
             record.levelname = f"{self.COLORS[levelname]}{levelname}{self.COLORS['RESET']}"

+        # Color API messages differently so they stand out
+        if record.msg.startswith("API"):
+            record.msg = f"{self.CYAN}{record.msg}{self.COLORS['RESET']}"
+
         return super().format(record)

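The formatter change colors records by message prefix using raw ANSI escapes. A self-contained sketch of the same technique (the formatter name and format string are illustrative):

```python
import logging

CYAN = "\033[96m"
RESET = "\033[0m"

class PrefixColorFormatter(logging.Formatter):
    """Color records whose message starts with "API" cyan."""

    def format(self, record):
        # Guard on str: record.msg can be any object.
        if isinstance(record.msg, str) and record.msg.startswith("API"):
            record.msg = f"{CYAN}{record.msg}{RESET}"
        return super().format(record)

handler = logging.StreamHandler()
handler.setFormatter(PrefixColorFormatter("%(levelname)s %(message)s"))
logging.basicConfig(level=logging.INFO, handlers=[handler])
logging.info("API page 3 fetched")  # rendered cyan on ANSI terminals
logging.info("Downloaded: 1,000")   # default color
```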
src/mapillary_downloader/webp_converter.py
@@ -43,7 +43,6 @@ def convert_to_webp(jpg_path, output_path=None, delete_original=True):
             ["cwebp", "-metadata", "all", str(jpg_path), "-o", str(webp_path)],
             capture_output=True,
             text=True,
-            timeout=60,
         )

         if result.returncode != 0:
@@ -55,9 +54,6 @@ def convert_to_webp(jpg_path, output_path=None, delete_original=True):
             jpg_path.unlink()
             return webp_path

-    except subprocess.TimeoutExpired:
-        logger.error(f"cwebp conversion timed out for {jpg_path}")
-        return None
     except Exception as e:
         logger.error(f"Error converting {jpg_path} to WebP: {e}")
         return None
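Both webp_converter.py hunks remove the 60-second cap: without a `timeout=` argument, `subprocess.run` blocks until cwebp exits on its own, so `TimeoutExpired` can no longer be raised and its handler goes too. The resulting call shape (file paths illustrative):

```python
import subprocess

# No timeout: a very large JPEG converts without being killed mid-run.
result = subprocess.run(
    ["cwebp", "-metadata", "all", "big_photo.jpg", "-o", "big_photo.webp"],
    capture_output=True,
    text=True,
)
if result.returncode != 0:
    print(result.stderr)
```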