mapillary-downloader 0.5.1__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -41,10 +41,10 @@ def main():
  help="Don't convert to WebP (WebP conversion is enabled by default, saves ~70%% disk space)",
  )
  parser.add_argument(
- "--workers",
+ "--max-workers",
  type=int,
- default=None,
- help="Number of parallel workers (default: half of CPU cores)",
+ default=128,
+ help="Maximum number of parallel workers (default: 128)",
  )
  parser.add_argument(
  "--no-tar",
@@ -114,7 +114,7 @@ def main():
  args.output,
  username,
  args.quality,
- workers=args.workers,
+ max_workers=args.max_workers,
  tar_sequences=not args.no_tar,
  convert_webp=convert_webp,
  check_ia=not args.no_check_ia,
@@ -45,7 +45,7 @@ class MapillaryDownloader:
  output_dir,
  username=None,
  quality=None,
- workers=None,
+ max_workers=128,
  tar_sequences=True,
  convert_webp=False,
  check_ia=True,
@@ -57,7 +57,7 @@ class MapillaryDownloader:
  output_dir: Base directory to save downloads (final destination)
  username: Mapillary username (for collection directory)
  quality: Image quality (for collection directory)
- workers: Number of parallel workers (default: half of cpu_count)
+ max_workers: Maximum number of parallel workers (default: 128)
  tar_sequences: Whether to tar sequence directories after download (default: True)
  convert_webp: Whether to convert images to WebP (affects collection name)
  check_ia: Whether to check if collection exists on Internet Archive (default: True)
@@ -66,7 +66,8 @@ class MapillaryDownloader:
  self.base_output_dir = Path(output_dir)
  self.username = username
  self.quality = quality
- self.workers = workers if workers is not None else max(1, os.cpu_count() // 2)
+ self.max_workers = max_workers
+ self.initial_workers = os.cpu_count() or 1 # Start with CPU count
  self.tar_sequences = tar_sequences
  self.convert_webp = convert_webp
  self.check_ia = check_ia
@@ -177,7 +178,7 @@ class MapillaryDownloader:
  logger.info(f"Downloading images for user: {self.username}")
  logger.info(f"Output directory: {self.output_dir}")
  logger.info(f"Quality: {self.quality}")
- logger.info(f"Using {self.workers} parallel workers")
+ logger.info(f"Worker pool: {self.initial_workers} initial, {self.max_workers} max")

  start_time = time.time()

@@ -191,8 +192,10 @@ class MapillaryDownloader:

  # Step 2: Start worker pool
  # Since workers do both I/O (download) and CPU (WebP), need many more workers
- # Cap at 128 for now - will build proper dynamic scaling on a new branch later
- pool = AdaptiveWorkerPool(worker_process, min_workers=self.workers, max_workers=128, monitoring_interval=10)
+ # Start with CPU count and scale up based on throughput
+ pool = AdaptiveWorkerPool(
+ worker_process, min_workers=self.initial_workers, max_workers=self.max_workers, monitoring_interval=10
+ )
  pool.start()

  # Step 3: Download images from metadata file while fetching new from API
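
The pool now starts at `os.cpu_count()` workers and may grow up to `max_workers` (128 by default). The `AdaptiveWorkerPool` internals are not shown in this diff; the sketch below only illustrates the general throughput-based scaling idea, using hypothetical names:

```python
import os

def next_worker_count(current, max_workers, prev_throughput, curr_throughput):
    """Toy scaling policy: add workers while throughput keeps improving."""
    if current < max_workers and curr_throughput > prev_throughput * 1.05:
        return min(current * 2, max_workers)  # still gaining: scale up
    return current  # plateaued (or at the cap): hold steady

workers = os.cpu_count() or 1  # start with CPU count, as the downloader now does
workers = next_worker_count(workers, max_workers=128,
                            prev_throughput=40.0, curr_throughput=55.0)
print(workers)
```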
@@ -247,8 +250,9 @@ class MapillaryDownloader:
  # Helper to process results from queue
  def process_results():
  nonlocal downloaded_count, total_bytes, failed_count
+ # Drain ALL available results to prevent queue from filling up
  while True:
- result = pool.get_result(timeout=0.001)
+ result = pool.get_result(timeout=0) # Non-blocking
  if result is None:
  break

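Switching `get_result` to `timeout=0` makes the drain loop non-blocking, so the caller can empty whatever is currently queued and return immediately to feeding work. A rough sketch of the pattern on a bare `multiprocessing.Queue` (the real code goes through `pool.get_result`):

```python
import queue
import time
from multiprocessing import Queue

def drain(result_queue):
    """Pull every result that is already available, without waiting."""
    results = []
    while True:
        try:
            results.append(result_queue.get_nowait())  # raises queue.Empty once drained
        except queue.Empty:
            break
    return results

q = Queue()
q.put(("image-1", 1024, True, None))
time.sleep(0.1)  # give the queue's feeder thread a moment to flush
print(drain(q))
```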
@@ -379,6 +383,10 @@ class MapillaryDownloader:

  last_position = f.tell()

+ # If API is already complete, we've read the whole file, so break
+ if api_fetch_complete is None:
+ break
+
  # Sleep briefly before next tail iteration
  time.sleep(0.1)

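The added check stops the tail loop once the API fetch is finished and the file has been read to the end. The sketch below shows the general tail-a-growing-JSONL-file pattern, assuming records are newline-terminated; `is_writer_done` is a hypothetical stand-in for the downloader's API-fetch state:

```python
import json
import time
from pathlib import Path

def tail_jsonl(path, is_writer_done):
    """Yield records from a JSONL file that another task may still be appending to."""
    last_position = 0
    path = Path(path)
    while True:
        with path.open() as f:
            f.seek(last_position)
            while True:
                line = f.readline()
                if not line:
                    break              # reached the current end of the file
                if not line.endswith("\n"):
                    break              # partial record: re-read it on the next pass
                yield json.loads(line)
                last_position = f.tell()
        if is_writer_done():
            break                      # writer finished, nothing more will appear
        time.sleep(0.1)                # brief pause before the next tail pass
```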
@@ -23,51 +23,43 @@ def tar_sequence_directories(collection_dir):
  logger.error(f"Collection directory not found: {collection_dir}")
  return 0, 0

- # Find all sequence directories (skip special dirs)
+ # Find all bucket directories (skip special dirs)
+ # Now we tar entire bucket dirs (e.g., a/, b/, etc) to get ~62 tar files
  skip_dirs = {".meta", "__pycache__"}
- sequence_dirs = []
+ bucket_dirs = []

  for item in collection_dir.iterdir():
  if item.is_dir() and item.name not in skip_dirs:
- sequence_dirs.append(item)
+ # Check if this is a bucket dir (single char)
+ if len(item.name) == 1:
+ bucket_dirs.append(item)

- if not sequence_dirs:
- logger.info("No sequence directories to tar")
+ if not bucket_dirs:
+ logger.info("No bucket directories to tar")
  return 0, 0

- logger.info(f"Tarring {len(sequence_dirs)} sequence directories...")
+ logger.info(f"Tarring {len(bucket_dirs)} bucket directories...")

  tarred_count = 0
  total_files = 0
  total_tar_bytes = 0

- for seq_dir in sequence_dirs:
- seq_name = seq_dir.name
- tar_path = collection_dir / f"{seq_name}.tar"
+ for bucket_dir in bucket_dirs:
+ bucket_name = bucket_dir.name
+ tar_path = collection_dir / f"{bucket_name}.tar"

- # Handle naming collision - find next available name
- counter = 1
- while tar_path.exists():
- counter += 1
- tar_path = collection_dir / f"{seq_name}.{counter}.tar"
-
- # Count files in sequence
- files = list(seq_dir.glob("*"))
- file_count = len([f for f in files if f.is_file()])
+ # Count files in bucket
+ files_to_tar = sorted([f for f in bucket_dir.rglob("*") if f.is_file()], key=lambda x: str(x))
+ file_count = len(files_to_tar)

  if file_count == 0:
- logger.warning(f"Skipping empty directory: {seq_name}")
+ logger.warning(f"Skipping empty bucket directory: {bucket_name}")
  continue

  try:
- # Create reproducible uncompressed tar (WebP already compressed)
- # Sort files by name for deterministic ordering
- files_to_tar = sorted([f for f in seq_dir.rglob("*") if f.is_file()], key=lambda x: x.name)
-
- if not files_to_tar:
- logger.warning(f"Skipping directory with no files: {seq_name}")
- continue
+ logger.info(f"Tarring bucket '{bucket_name}' ({file_count} files)...")

+ # Create reproducible uncompressed tar (WebP already compressed)
  with tarfile.open(tar_path, "w") as tar:
  for file_path in files_to_tar:
  # Get path relative to collection_dir for tar archive
@@ -92,33 +84,32 @@ def tar_sequence_directories(collection_dir):
  tar_size = tar_path.stat().st_size
  total_tar_bytes += tar_size

- # Remove original directory
- for file in seq_dir.rglob("*"):
+ # Remove original bucket directory
+ for file in bucket_dir.rglob("*"):
  if file.is_file():
  file.unlink()

  # Remove empty subdirs and main dir
- for subdir in list(seq_dir.rglob("*")):
+ for subdir in list(bucket_dir.rglob("*")):
  if subdir.is_dir():
  try:
  subdir.rmdir()
  except OSError:
  pass # Not empty yet

- seq_dir.rmdir()
+ bucket_dir.rmdir()

  tarred_count += 1
  total_files += file_count

- if tarred_count % 10 == 0:
- logger.info(f"Tarred {tarred_count}/{len(sequence_dirs)} sequences...")
+ logger.info(f"Tarred bucket '{bucket_name}': {file_count:,} files, {format_size(tar_size)}")
  else:
  logger.error(f"Tar file empty or not created: {tar_path}")
  if tar_path.exists():
  tar_path.unlink()

  except Exception as e:
- logger.error(f"Error tarring {seq_name}: {e}")
+ logger.error(f"Error tarring bucket {bucket_name}: {e}")
  if tar_path.exists():
  tar_path.unlink()

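Put together, the new tarring step walks each single-character bucket directory, writes one uncompressed, deterministically ordered tar per bucket, and then deletes the originals. A condensed sketch of the core archive step under those assumptions (helper name and return value are illustrative, not the package's API):

```python
import tarfile
from pathlib import Path

def tar_bucket(collection_dir, bucket_name):
    """Archive one bucket directory (e.g. 'a/') into '<bucket>.tar'."""
    collection_dir = Path(collection_dir)
    bucket_dir = collection_dir / bucket_name
    tar_path = collection_dir / f"{bucket_name}.tar"

    # Sort by full path string so the archive order is reproducible across runs
    files_to_tar = sorted((f for f in bucket_dir.rglob("*") if f.is_file()), key=str)

    with tarfile.open(tar_path, "w") as tar:  # "w" = no compression; WebP is already compressed
        for file_path in files_to_tar:
            # Store members relative to the collection dir, e.g. "a/abc123/image1.webp"
            tar.add(file_path, arcname=str(file_path.relative_to(collection_dir)))
    return tar_path, len(files_to_tar)
```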
@@ -1,6 +1,7 @@
  """Worker process for parallel image download and conversion."""

  import os
+ import signal
  import tempfile
  from pathlib import Path
  import requests
@@ -17,6 +18,9 @@ def worker_process(work_queue, result_queue, worker_id):
  result_queue: Queue to push results to
  worker_id: Unique worker identifier
  """
+ # Ignore SIGINT in worker process - parent will handle it
+ signal.signal(signal.SIGINT, signal.SIG_IGN)
+
  # Create session once per worker (reuse HTTP connections)
  session = requests.Session()

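Having workers ignore SIGINT means a Ctrl-C only interrupts the parent process, which can then shut the pool down in an orderly way instead of every child dying mid-write. A minimal, self-contained sketch of the pattern outside this package:

```python
import signal
import time
from multiprocessing import Process

def worker():
    # Ignore Ctrl-C in the child; the parent decides when workers stop.
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    while True:
        time.sleep(1)

if __name__ == "__main__":
    p = Process(target=worker)
    p.start()
    try:
        p.join()
    except KeyboardInterrupt:
        # Only the parent lands here; it can now stop workers deliberately.
        p.terminate()
        p.join(timeout=2)
```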
@@ -65,11 +69,13 @@ def download_and_convert_image(image_data, output_dir, quality, convert_webp, se
  if not image_url:
  return (image_id, 0, False, f"No {quality} URL")

- # Determine final output directory
+ # Determine final output directory - organize by first char of sequence ID
  output_dir = Path(output_dir)
  sequence_id = image_data.get("sequence")
  if sequence_id:
- img_dir = output_dir / sequence_id
+ # Use first character as bucket (gives us ~62 dirs instead of millions)
+ first_char = sequence_id[0]
+ img_dir = output_dir / first_char / sequence_id
  img_dir.mkdir(parents=True, exist_ok=True)
  else:
  img_dir = output_dir
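
The effect of the bucketing change is easiest to see on a single path; the output directory and sequence ID below are illustrative:

```python
from pathlib import Path

def image_dir(output_dir, sequence_id):
    """Bucket images under the first character of the sequence ID (~62 top-level dirs)."""
    output_dir = Path(output_dir)
    if not sequence_id:
        return output_dir
    return output_dir / sequence_id[0] / sequence_id

print(image_dir("downloads/mapillary-user-original", "abc123"))
# downloads/mapillary-user-original/a/abc123
```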
@@ -185,20 +185,18 @@ class AdaptiveWorkerPool:
  else:
  logger.info(f"At optimal worker count: {current_workers} workers, {current_throughput:.1f} items/s")

- def shutdown(self, timeout=30):
+ def shutdown(self, timeout=2):
  """Shutdown the worker pool gracefully."""
  logger.info("Shutting down worker pool...")
  self.running = False

- # Send stop signals
- for _ in self.workers:
- self.work_queue.put(None)
-
- # Wait for workers to finish
+ # Terminate all workers immediately (they ignore SIGINT so we need to be forceful)
  for p in self.workers:
- p.join(timeout=timeout)
  if p.is_alive():
- logger.warning(f"Worker {p.pid} did not exit cleanly, terminating")
  p.terminate()

+ # Give them a brief moment to exit
+ for p in self.workers:
+ p.join(timeout=timeout)
+
  logger.info("Worker pool shutdown complete")
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mapillary_downloader
- Version: 0.5.1
+ Version: 0.6.0
  Summary: Download your Mapillary data before it's gone
  Author-email: Gareth Davidson <gaz@bitplane.net>
  Requires-Python: >=3.10
@@ -66,8 +66,8 @@ mapillary-downloader --output ./downloads USERNAME1
  | `--quality` | 256, 1024, 2048 or original | `original` |
  | `--bbox` | `west,south,east,north` | `None` |
  | `--no-webp` | Don't convert to WebP | `False` |
- | `--workers` | Number of parallel download workers | Half of CPU count |
- | `--no-tar` | Don't tar sequence directories | `False` |
+ | `--max-workers` | Maximum number of parallel download workers | `128` |
+ | `--no-tar` | Don't tar bucket directories | `False` |
  | `--no-check-ia` | Don't check if exists on Internet Archive | `False` |

  The downloader will:
@@ -98,11 +98,23 @@ To disable WebP conversion and keep original JPEGs, use `--no-webp`:
  mapillary-downloader --no-webp USERNAME
  ```

- ## Sequence Tarball Creation
+ ## Tarballs

- By default, sequence directories are automatically tarred after download because
- if they weren't, you'd spend more time setting up upload metadata than actually
- uploading files to IA.
+ Images are organized by sequence ID, bucketed by the first character of the
+ sequence to reduce directory count:
+
+ ```
+ mapillary-username-quality/
+   a/
+     abc123/
+       image1.webp
+       image2.webp
+ ```
+
+ By default, these bucket directories are automatically tarred after download
+ (resulting in `a.tar`, `b.tar`, etc. - about 62 tar files total). This is done
+ because large collections with millions of images would otherwise create hundreds
+ of thousands of tiny tars, and anger the archive gods.

  To keep individual files instead of creating tars, use the `--no-tar` flag.

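For reference, after the tar step the collection directory would look roughly like this (illustrative; bucket letters depend on your sequence IDs, and the `.meta` directory is skipped by the tar step):

```
mapillary-username-quality/
  .meta/
  a.tar
  b.tar
  ...
```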
@@ -1,19 +1,19 @@
  mapillary_downloader/__init__.py,sha256=KEjiBRghXDeA7E15RJeLBfQm-yNJkowZarL59QOh_1w,120
- mapillary_downloader/__main__.py,sha256=Kjfx2woMyCvAxYAdqvtXtYJknCMviV_K2PSo0cDc8Hg,4320
+ mapillary_downloader/__main__.py,sha256=G4tTNN6V7jPZn4X9gjRDx0faw-Im9zhyTLbDRTOyo3k,4325
  mapillary_downloader/client.py,sha256=a5n43FLHP45EHodEjl0ieziBK-b6Ey-rZJwYB6EFhNI,4743
- mapillary_downloader/downloader.py,sha256=v0vLovW80DMpEzQIb_tdETF1HzO2GEMbbVzD93aKnnQ,19561
+ mapillary_downloader/downloader.py,sha256=6JFEIVBIdhN9L4DcPJmx6UCTgAnWSuH0jO0D_8wKz_U,19886
  mapillary_downloader/exif_writer.py,sha256=K_441EG1siWyNMmFGZSfnORUCjBThkeg4JFtbg9AOsA,5120
  mapillary_downloader/ia_check.py,sha256=L2MEbG_KmlAd5NLmo2HQkO8HWvRN0brE5wXXoyNMbq8,1100
  mapillary_downloader/ia_meta.py,sha256=78rcybHIPnQDsF02KGj6RYmDXzYzrU8sdVx4Q9Y0sfI,6266
  mapillary_downloader/logging_config.py,sha256=Z-wNq34nt7aIhJWdeKc1feTY46P9-Or7HtiX7eUFjEI,2324
  mapillary_downloader/metadata_reader.py,sha256=Re-HN0Vfc7Hs1eOut7uOoW7jWJ2PIbKoNzC7Ak3ah5o,4933
- mapillary_downloader/tar_sequences.py,sha256=mqs5p3N7osV_bxTkw6i34GVmxCBBEbIiKKxeh-fWNdU,4430
+ mapillary_downloader/tar_sequences.py,sha256=758yVQGSLC_x8tT7h1qzAdo8b-4OmARZYseNacM1Nv8,4223
  mapillary_downloader/utils.py,sha256=yzVgS1mwsklDAqrimaFafgTTXtRYQUbKP98Xgh9d2KA,1174
  mapillary_downloader/webp_converter.py,sha256=vYLLQxDmdnqRz0nm7wXwRUd4x9mQZNah-DrncpA8sNs,1901
- mapillary_downloader/worker.py,sha256=syxsE2pPX_9EXzSGEGeUaeLFqmqZjTma-rB7S2zpYac,4511
- mapillary_downloader/worker_pool.py,sha256=ctFl40UgFUjpLL_e6Mw5h7YNMfKwXulRhaX18r9sIkE,8257
- mapillary_downloader-0.5.1.dist-info/entry_points.txt,sha256=PdYtxOXHMJrUhmiPO4G-F98VuhUI4MN9D_T4KPrVZ5w,75
- mapillary_downloader-0.5.1.dist-info/licenses/LICENSE.md,sha256=7_BIuQ-veOrsF-WarH8kTkm0-xrCLvJ1PFE1C4Ebs64,146
- mapillary_downloader-0.5.1.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
- mapillary_downloader-0.5.1.dist-info/METADATA,sha256=1vM80Kq2WHUzfSt7lIx91hb0fycHosxpll-xnCda6JU,4982
- mapillary_downloader-0.5.1.dist-info/RECORD,,
+ mapillary_downloader/worker.py,sha256=Q82Q1mnTL_CUwNXum9GAg2Fz40dolh_gByDkeN72p9o,4814
+ mapillary_downloader/worker_pool.py,sha256=iGRq5uFwBNNVQnI4vEjbKHkbKTaEVCdmvMvXcRGuDMg,8203
+ mapillary_downloader-0.6.0.dist-info/entry_points.txt,sha256=PdYtxOXHMJrUhmiPO4G-F98VuhUI4MN9D_T4KPrVZ5w,75
+ mapillary_downloader-0.6.0.dist-info/licenses/LICENSE.md,sha256=7_BIuQ-veOrsF-WarH8kTkm0-xrCLvJ1PFE1C4Ebs64,146
+ mapillary_downloader-0.6.0.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
+ mapillary_downloader-0.6.0.dist-info/METADATA,sha256=dvPNrWfk-wB_xIFoowuIH5-17Oib14hpHpik4FpqC7k,5277
+ mapillary_downloader-0.6.0.dist-info/RECORD,,