mapillary-downloader 0.7.2.tar.gz → 0.7.3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (20)
  1. {mapillary_downloader-0.7.2 → mapillary_downloader-0.7.3}/PKG-INFO +1 -1
  2. {mapillary_downloader-0.7.2 → mapillary_downloader-0.7.3}/pyproject.toml +1 -1
  3. {mapillary_downloader-0.7.2 → mapillary_downloader-0.7.3}/src/mapillary_downloader/downloader.py +21 -41
  4. {mapillary_downloader-0.7.2 → mapillary_downloader-0.7.3}/src/mapillary_downloader/ia_stats.py +91 -7
  5. {mapillary_downloader-0.7.2 → mapillary_downloader-0.7.3}/src/mapillary_downloader/worker_pool.py +3 -10
  6. {mapillary_downloader-0.7.2 → mapillary_downloader-0.7.3}/LICENSE.md +0 -0
  7. {mapillary_downloader-0.7.2 → mapillary_downloader-0.7.3}/README.md +0 -0
  8. {mapillary_downloader-0.7.2 → mapillary_downloader-0.7.3}/src/mapillary_downloader/__init__.py +0 -0
  9. {mapillary_downloader-0.7.2 → mapillary_downloader-0.7.3}/src/mapillary_downloader/__main__.py +0 -0
  10. {mapillary_downloader-0.7.2 → mapillary_downloader-0.7.3}/src/mapillary_downloader/client.py +0 -0
  11. {mapillary_downloader-0.7.2 → mapillary_downloader-0.7.3}/src/mapillary_downloader/exif_writer.py +0 -0
  12. {mapillary_downloader-0.7.2 → mapillary_downloader-0.7.3}/src/mapillary_downloader/graphql_web.py +0 -0
  13. {mapillary_downloader-0.7.2 → mapillary_downloader-0.7.3}/src/mapillary_downloader/ia_check.py +0 -0
  14. {mapillary_downloader-0.7.2 → mapillary_downloader-0.7.3}/src/mapillary_downloader/ia_meta.py +0 -0
  15. {mapillary_downloader-0.7.2 → mapillary_downloader-0.7.3}/src/mapillary_downloader/logging_config.py +0 -0
  16. {mapillary_downloader-0.7.2 → mapillary_downloader-0.7.3}/src/mapillary_downloader/metadata_reader.py +0 -0
  17. {mapillary_downloader-0.7.2 → mapillary_downloader-0.7.3}/src/mapillary_downloader/tar_sequences.py +0 -0
  18. {mapillary_downloader-0.7.2 → mapillary_downloader-0.7.3}/src/mapillary_downloader/utils.py +0 -0
  19. {mapillary_downloader-0.7.2 → mapillary_downloader-0.7.3}/src/mapillary_downloader/webp_converter.py +0 -0
  20. {mapillary_downloader-0.7.2 → mapillary_downloader-0.7.3}/src/mapillary_downloader/worker.py +0 -0

PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mapillary_downloader
- Version: 0.7.2
+ Version: 0.7.3
  Summary: Archive user data from Mapillary
  Author-email: Gareth Davidson <gaz@bitplane.net>
  Requires-Python: >=3.10

pyproject.toml
@@ -1,7 +1,7 @@
  [project]
  name = "mapillary_downloader"
  description = "Archive user data from Mapillary"
- version = "0.7.2"
+ version = "0.7.3"
  authors = [
  { name = "Gareth Davidson", email = "gaz@bitplane.net" }
  ]

src/mapillary_downloader/downloader.py
@@ -99,7 +99,7 @@ class MapillaryDownloader:
  # Set up file logging for archival with timestamp for incremental runs
  timestamp = time.strftime("%Y%m%d-%H%M%S")
  log_file = self.output_dir / f"download.log.{timestamp}"
- add_file_handler(log_file)
+ self.file_handler = add_file_handler(log_file)
  logger.info(f"Logging to: {log_file}")

  self.metadata_file = self.output_dir / "metadata.jsonl"
@@ -169,24 +169,15 @@ class MapillaryDownloader:

  quality_field = f"thumb_{self.quality}_url"

- logger.info(f"Downloading images for user: {self.username}")
- logger.info(f"Output directory: {self.output_dir}")
- logger.info(f"Quality: {self.quality}")
- logger.info(f"Worker pool: max {self.max_workers} workers")
+ logger.info(f"Downloading {self.username} @ {self.quality} (max {self.max_workers} workers)")

  start_time = time.time()

  # Step 1: Check if API fetch is already complete
  reader = MetadataReader(self.metadata_file)
  api_complete = reader.is_complete
- if api_complete:
- logger.info("API fetch already complete, will only download")
- else:
- logger.info("API fetch incomplete, will fetch and download in parallel")

  # Step 2: Start worker pool
- # Since workers do both I/O (download) and CPU (WebP), need many more workers
- # Start with CPU count and scale up based on throughput
  pool = AdaptiveWorkerPool(worker_process, max_workers=self.max_workers, monitoring_interval=10)
  pool.start()

@@ -195,9 +186,6 @@
  total_bytes = 0
  failed_count = 0
  submitted = 0
- batch_start = time.time()
-
- logger.info("Starting parallel download and API fetch...")

  try:
  # Step 3a: Fetch metadata from API in parallel (write-only, don't block on queue)
@@ -210,7 +198,7 @@
  def fetch_api_metadata():
  """Fetch metadata from API and write to file (runs in thread)."""
  try:
- logger.info("API fetch thread: Starting...")
+ logger.debug("API fetch thread starting")
  with open(self.metadata_file, "a") as meta_f:
  for image in self.client.get_user_images(self.username, bbox=bbox):
  new_images_count[0] += 1
@@ -220,11 +208,11 @@
  meta_f.flush()

  if new_images_count[0] % 1000 == 0:
- logger.info(f"API: Fetched {new_images_count[0]} images from API")
+ logger.info(f"API: fetched {new_images_count[0]:,} image URLs")

  # Mark as complete
  MetadataReader.mark_complete(self.metadata_file)
- logger.info(f"API fetch complete: {new_images_count[0]} images")
+ logger.info(f"API fetch complete: {new_images_count[0]:,} images")
  finally:
  api_fetch_complete.set()

@@ -232,11 +220,10 @@
  api_thread = threading.Thread(target=fetch_api_metadata, daemon=True)
  api_thread.start()
  else:
- logger.info("API fetch already complete, skipping API thread")
  api_fetch_complete = None

  # Step 3b: Tail metadata file and submit to workers
- logger.info("Starting metadata tail and download queue feeder...")
+ logger.debug("Starting metadata tail and download queue feeder")
  last_position = 0

  # Helper to process results from queue
@@ -258,12 +245,7 @@
  # Log every download for first 10, then every 100
  should_log = downloaded_count <= 10 or downloaded_count % 100 == 0
  if should_log:
- elapsed = time.time() - batch_start
- rate = downloaded_count / elapsed if elapsed > 0 else 0
- logger.info(
- f"Downloaded: {downloaded_count} ({format_size(total_bytes)}) "
- f"- Rate: {rate:.1f} images/sec"
- )
+ logger.info(f"Downloaded: {downloaded_count:,} ({format_size(total_bytes)})")

  if downloaded_count % 100 == 0:
  self._save_progress()
@@ -301,6 +283,7 @@

  # Skip if already downloaded or no quality URL
  if image_id in self.downloaded:
+ downloaded_count += 1
  continue
  if not image.get(quality_field):
  continue
@@ -317,7 +300,7 @@
  submitted += 1

  if submitted % 1000 == 0:
- logger.info(f"Queue: Submitted {submitted} images")
+ logger.info(f"Queue: submitted {submitted:,} images")

  # Process results while submitting
  process_results()
@@ -352,6 +335,7 @@

  # Skip if already downloaded or no quality URL
  if image_id in self.downloaded:
+ downloaded_count += 1
  continue
  if not image.get(quality_field):
  continue
@@ -368,7 +352,7 @@
  submitted += 1

  if submitted % 1000 == 0:
- logger.info(f"Queue: Submitted {submitted} images")
+ logger.info(f"Queue: submitted {submitted:,} images")

  # Process results while submitting
  process_results()
@@ -386,7 +370,7 @@
  process_results()

  # Send shutdown signals
- logger.info(f"Submitted {submitted} images, waiting for workers to finish...")
+ logger.debug(f"Submitted {submitted:,} images, waiting for workers")
  for _ in range(pool.current_workers):
  pool.submit(None)

@@ -408,16 +392,8 @@
  downloaded_count += 1
  total_bytes += bytes_dl

- if downloaded_count % 10 == 0:
- elapsed = time.time() - batch_start
- rate = downloaded_count / elapsed if elapsed > 0 else 0
- remaining = submitted - completed
- eta_seconds = remaining / rate if rate > 0 else 0
-
- logger.info(
- f"Downloaded: {downloaded_count}/{submitted} ({format_size(total_bytes)}) "
- f"- ETA: {format_time(eta_seconds)}"
- )
+ if downloaded_count % 100 == 0:
+ logger.info(f"Downloaded: {downloaded_count:,} ({format_size(total_bytes)})")
  self._save_progress()
  pool.check_throughput(downloaded_count)
  else:
@@ -431,7 +407,7 @@
  self._save_progress()
  elapsed = time.time() - start_time

- logger.info(f"Complete! Downloaded {downloaded_count} ({format_size(total_bytes)}), " f"failed {failed_count}")
+ logger.info(f"Complete! Downloaded {downloaded_count:,} ({format_size(total_bytes)}), failed {failed_count:,}")
  logger.info(f"Total time: {format_time(elapsed)}")

  # Tar sequence directories for efficient IA uploads
@@ -460,12 +436,16 @@
  # Generate IA metadata
  generate_ia_metadata(self.output_dir)

+ # Close log file handler before moving directory
+ self.file_handler.close()
+ logger.removeHandler(self.file_handler)
+
  # Move from staging to final destination
- logger.info("Moving collection from staging to final destination...")
+ logger.info("Moving to final destination...")
  if self.final_dir.exists():
  logger.warning(f"Destination already exists, removing: {self.final_dir}")
  shutil.rmtree(self.final_dir)

  self.final_dir.parent.mkdir(parents=True, exist_ok=True)
  shutil.move(str(self.staging_dir), str(self.final_dir))
- logger.info(f"Collection moved to: {self.final_dir}")
+ logger.info(f"Done: {self.final_dir}")

src/mapillary_downloader/ia_stats.py
@@ -15,14 +15,14 @@ def search_ia_collections():
  """Search IA for all mapillary_downloader collections.

  Returns:
- List of dicts with: identifier, description, item_size, uploader
+ List of dicts with: identifier, description, item_size, collection
  """
  logger.info("Searching archive.org for mapillary_downloader collections...")

  url = "https://archive.org/advancedsearch.php"
  params = {
  "q": "mapillary_downloader:*",
- "fl[]": ["identifier", "description", "item_size", "uploader"],
+ "fl[]": ["identifier", "description", "item_size", "collection"],
  "rows": 10000,
  "output": "json",
  }
@@ -31,11 +31,29 @@ def search_ia_collections():
  data = response.json()

  collections = data["response"]["docs"]
- logger.info(f"Found {len(collections)} collections on archive.org")
+ logger.info(f"Found {len(collections):,} collections on archive.org")

  return collections


+ def fetch_uploader(identifier):
+ """Fetch uploader email from item metadata.
+
+ Args:
+ identifier: IA item identifier
+
+ Returns:
+ Uploader email or None
+ """
+ url = f"https://archive.org/metadata/{identifier}/metadata/uploader"
+ try:
+ response = http_get_with_retry(url, max_retries=2)
+ data = response.json()
+ return data.get("result")
+ except Exception:
+ return None
+
+
  def parse_collection_info(identifier):
  """Parse username, quality, webp from collection identifier.

@@ -104,19 +122,28 @@ def update_cache(ia_collections):

  image_count = extract_image_count(item.get("description"))

+ # Get IA collection(s) - can be a string or list
+ ia_collection = item.get("collection", [])
+ if isinstance(ia_collection, str):
+ ia_collection = [ia_collection]
+
+ # Preserve existing uploader if we have it cached
+ existing = cache.get(identifier, {})
+
  # Update cache entry
  cache[identifier] = {
  "size": size_bytes,
- "uploader": item.get("uploader"),
+ "uploader": existing.get("uploader"),  # Preserve cached uploader
  "images": image_count,
  "quality": info["quality"],
  "username": info["username"],
  "is_webp": info["is_webp"],
+ "ia_collection": ia_collection,
  }

  # Save updated cache
  safe_json_save(CACHE_FILE, cache)
- logger.info(f"Updated cache with {len(cache)} collections")
+ logger.info(f"Updated cache with {len(cache):,} collections")

  return cache

@@ -168,11 +195,12 @@ def aggregate_stats(cache):
  return stats


- def format_stats(stats):
+ def format_stats(stats, cache):
  """Format statistics as human-readable text.

  Args:
  stats: Dict from aggregate_stats()
+ cache: Dict of collection data

  Returns:
  Formatted string
@@ -212,6 +240,62 @@ def format_stats(stats):
  )

  output.append("")
+
+ # Find items not in mapillary-images and fetch uploaders
+ not_in_mapillary_images = []
+ need_uploader_fetch = []
+
+ for identifier, data in cache.items():
+ ia_collections = data.get("ia_collection", [])
+ if "mapillary-images" not in ia_collections:
+ not_in_mapillary_images.append(identifier)
+ if not data.get("uploader"):
+ need_uploader_fetch.append(identifier)
+
+ # Fetch missing uploaders
+ if need_uploader_fetch:
+ logger.info(f"Fetching uploader info for {len(need_uploader_fetch)} items...")
+ for i, identifier in enumerate(need_uploader_fetch, 1):
+ logger.info(f" [{i}/{len(need_uploader_fetch)}] {identifier}")
+ uploader = fetch_uploader(identifier)
+ if uploader:
+ cache[identifier]["uploader"] = uploader
+ # Save updated cache with uploaders
+ safe_json_save(CACHE_FILE, cache)
+
+ # Group by uploader (only for items not in mapillary-images)
+ by_uploader = {}
+ for identifier in not_in_mapillary_images:
+ uploader = cache[identifier].get("uploader") or "unknown"
+ if uploader not in by_uploader:
+ by_uploader[uploader] = {"items": [], "images": 0, "size": 0}
+ by_uploader[uploader]["items"].append(identifier)
+ by_uploader[uploader]["images"] += cache[identifier].get("images") or 0
+ by_uploader[uploader]["size"] += cache[identifier].get("size") or 0
+
+ # By uploader (only those with items outside mapillary-images)
+ if by_uploader:
+ output.append("Uploaders with items outside mapillary-images:")
+ output.append("-" * 70)
+ for uploader, data in sorted(by_uploader.items(), key=lambda x: -len(x[1]["items"])):
+ output.append(
+ f" {uploader}: {len(data['items'])} items, " f"{data['images']:,} images, {format_size(data['size'])}"
+ )
+ output.append("")
+
+ # Items not in mapillary-images, grouped by uploader
+ if not_in_mapillary_images:
+ output.append(f"Items NOT in mapillary-images ({len(not_in_mapillary_images)}):")
+ output.append("-" * 70)
+ for uploader, data in sorted(by_uploader.items(), key=lambda x: x[0].lower()):
+ output.append(f"{uploader}:")
+ for identifier in sorted(data["items"]):
+ output.append(identifier)
+ output.append("")
+ else:
+ output.append("All items are in mapillary-images collection!")
+ output.append("")
+
  output.append(f"Cache: {CACHE_FILE}")

  return "\n".join(output)
@@ -239,4 +323,4 @@ def show_stats(refresh=True):
  return

  stats = aggregate_stats(cache)
- print(format_stats(stats))
+ print(format_stats(stats, cache))
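
For reference, after update_cache() and the uploader backfill added to format_stats(), each cache entry combines the fields written above. Loaded back into Python, an entry would look roughly like this (identifier and values are made up for illustration; only the keys come from the code):

    {
        "mapillary_jsmith_2048_webp": {
            "size": 12345678901,
            "uploader": "jsmith@example.org",
            "images": 54321,
            "quality": "2048",
            "username": "jsmith",
            "is_webp": True,
            "ia_collection": ["mapillary-images"],
        }
    }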

src/mapillary_downloader/worker_pool.py
@@ -49,7 +49,7 @@ class AdaptiveWorkerPool:
  def start(self):
  """Start the worker pool."""
  self.running = True
- logger.info(f"Starting worker pool with {self.current_workers} workers")
+ logger.debug(f"Starting worker pool with {self.current_workers} workers")

  for i in range(self.current_workers):
  self._add_worker(i)
@@ -99,10 +99,7 @@ class AdaptiveWorkerPool:
  self.last_processed = total_processed
  self.last_check_time = now

- logger.info(
- f"Throughput: {throughput:.1f} items/s (workers: {current_workers}/{self.max_workers}, "
- f"history: {len(self.throughput_history)} measurements)"
- )
+ logger.info(f"Throughput: {throughput:.1f} items/s (workers: {current_workers}/{self.max_workers})")

  # Need at least 2 measurements to calculate gain per worker
  if len(self.throughput_history) < 2:
@@ -181,12 +178,10 @@ class AdaptiveWorkerPool:
  self.current_workers += 1
  added += 1
  logger.info(f"Ramping up: added {added} workers (now {self.current_workers}/{self.max_workers})")
- else:
- logger.info(f"At optimal worker count: {current_workers} workers, {current_throughput:.1f} items/s")

  def shutdown(self, timeout=2):
  """Shutdown the worker pool gracefully."""
- logger.info("Shutting down worker pool...")
+ logger.debug("Shutting down worker pool")
  self.running = False

  # Terminate all workers immediately (they ignore SIGINT so we need to be forceful)
@@ -197,5 +192,3 @@ class AdaptiveWorkerPool:
  # Give them a brief moment to exit
  for p in self.workers:
  p.join(timeout=timeout)
-
- logger.info("Worker pool shutdown complete")