mapillary-downloader 0.5.1__tar.gz → 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/PKG-INFO +19 -7
- {mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/README.md +18 -6
- {mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/pyproject.toml +1 -1
- {mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/__main__.py +4 -4
- {mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/downloader.py +15 -7
- {mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/tar_sequences.py +24 -33
- {mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/worker.py +8 -2
- {mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/worker_pool.py +6 -8
- {mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/LICENSE.md +0 -0
- {mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/__init__.py +0 -0
- {mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/client.py +0 -0
- {mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/exif_writer.py +0 -0
- {mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/ia_check.py +0 -0
- {mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/ia_meta.py +0 -0
- {mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/logging_config.py +0 -0
- {mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/metadata_reader.py +0 -0
- {mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/utils.py +0 -0
- {mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/webp_converter.py +0 -0
{mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mapillary_downloader
-Version: 0.5.1
+Version: 0.6.0
 Summary: Download your Mapillary data before it's gone
 Author-email: Gareth Davidson <gaz@bitplane.net>
 Requires-Python: >=3.10
@@ -66,8 +66,8 @@ mapillary-downloader --output ./downloads USERNAME1
 | `--quality` | 256, 1024, 2048 or original | `original` |
 | `--bbox` | `west,south,east,north` | `None` |
 | `--no-webp` | Don't convert to WebP | `False` |
-| `--workers`
-| `--no-tar` | Don't tar
+| `--max-workers` | Maximum number of parallel download workers | `128` |
+| `--no-tar` | Don't tar bucket directories | `False` |
 | `--no-check-ia` | Don't check if exists on Internet Archive | `False` |

 The downloader will:
@@ -98,11 +98,23 @@ To disable WebP conversion and keep original JPEGs, use `--no-webp`:
 mapillary-downloader --no-webp USERNAME
 ```

-##
+## Tarballs

-
-
-
+Images are organized by sequence ID, bucketed by the first character of the
+sequence to reduce directory count:
+
+```
+mapillary-username-quality/
+  a/
+    abc123/
+      image1.webp
+      image2.webp
+```
+
+By default, these bucket directories are automatically tarred after download
+(resulting in `a.tar`, `b.tar`, etc. - about 62 tar files total). This is done
+because large collections with millions of images would otherwise create hundreds
+of thousands of tiny tars, and anger the archive gods.

 To keep individual files instead of creating tars, use the `--no-tar` flag.
{mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/README.md
RENAMED
@@ -36,8 +36,8 @@ mapillary-downloader --output ./downloads USERNAME1
 | `--quality` | 256, 1024, 2048 or original | `original` |
 | `--bbox` | `west,south,east,north` | `None` |
 | `--no-webp` | Don't convert to WebP | `False` |
-| `--workers`
-| `--no-tar` | Don't tar
+| `--max-workers` | Maximum number of parallel download workers | `128` |
+| `--no-tar` | Don't tar bucket directories | `False` |
 | `--no-check-ia` | Don't check if exists on Internet Archive | `False` |

 The downloader will:
@@ -68,11 +68,23 @@ To disable WebP conversion and keep original JPEGs, use `--no-webp`:
 mapillary-downloader --no-webp USERNAME
 ```

-##
+## Tarballs

-
-
-
+Images are organized by sequence ID, bucketed by the first character of the
+sequence to reduce directory count:
+
+```
+mapillary-username-quality/
+  a/
+    abc123/
+      image1.webp
+      image2.webp
+```
+
+By default, these bucket directories are automatically tarred after download
+(resulting in `a.tar`, `b.tar`, etc. - about 62 tar files total). This is done
+because large collections with millions of images would otherwise create hundreds
+of thousands of tiny tars, and anger the archive gods.

 To keep individual files instead of creating tars, use the `--no-tar` flag.
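A minimal sketch of the bucketing scheme the new README section describes, assuming alphanumeric sequence IDs (`bucketed_path` is a hypothetical helper for illustration; the real logic lives in `worker.py` below):

```python
from pathlib import Path

def bucketed_path(collection_dir, sequence_id, filename):
    """Derive the on-disk location for one image under the bucket layout.

    Sequence IDs are alphanumeric, so keying on the first character caps
    the top-level directory count at roughly 62 (a-z, A-Z, 0-9) no matter
    how many sequences a user has.
    """
    bucket = sequence_id[0]  # single-character bucket directory
    return Path(collection_dir) / bucket / sequence_id / filename

# bucketed_path("mapillary-username-quality", "abc123", "image1.webp")
# -> mapillary-username-quality/a/abc123/image1.webp
```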
{mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/__main__.py
RENAMED
@@ -41,10 +41,10 @@ def main():
         help="Don't convert to WebP (WebP conversion is enabled by default, saves ~70%% disk space)",
     )
     parser.add_argument(
-        "--workers",
+        "--max-workers",
         type=int,
-        default=
-        help="
+        default=128,
+        help="Maximum number of parallel workers (default: 128)",
     )
     parser.add_argument(
         "--no-tar",
@@ -114,7 +114,7 @@ def main():
         args.output,
         username,
         args.quality,
-
+        max_workers=args.max_workers,
         tar_sequences=not args.no_tar,
         convert_webp=convert_webp,
         check_ia=not args.no_check_ia,
{mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/downloader.py
RENAMED
@@ -45,7 +45,7 @@ class MapillaryDownloader:
         output_dir,
         username=None,
         quality=None,
-
+        max_workers=128,
         tar_sequences=True,
         convert_webp=False,
         check_ia=True,
@@ -57,7 +57,7 @@ class MapillaryDownloader:
             output_dir: Base directory to save downloads (final destination)
             username: Mapillary username (for collection directory)
             quality: Image quality (for collection directory)
-
+            max_workers: Maximum number of parallel workers (default: 128)
             tar_sequences: Whether to tar sequence directories after download (default: True)
             convert_webp: Whether to convert images to WebP (affects collection name)
             check_ia: Whether to check if collection exists on Internet Archive (default: True)
@@ -66,7 +66,8 @@ class MapillaryDownloader:
         self.base_output_dir = Path(output_dir)
         self.username = username
         self.quality = quality
-        self.
+        self.max_workers = max_workers
+        self.initial_workers = os.cpu_count() or 1  # Start with CPU count
         self.tar_sequences = tar_sequences
         self.convert_webp = convert_webp
         self.check_ia = check_ia
@@ -177,7 +178,7 @@ class MapillaryDownloader:
         logger.info(f"Downloading images for user: {self.username}")
         logger.info(f"Output directory: {self.output_dir}")
         logger.info(f"Quality: {self.quality}")
-        logger.info(f"
+        logger.info(f"Worker pool: {self.initial_workers} initial, {self.max_workers} max")

         start_time = time.time()

@@ -191,8 +192,10 @@ class MapillaryDownloader:

         # Step 2: Start worker pool
         # Since workers do both I/O (download) and CPU (WebP), need many more workers
-        #
-        pool = AdaptiveWorkerPool(
+        # Start with CPU count and scale up based on throughput
+        pool = AdaptiveWorkerPool(
+            worker_process, min_workers=self.initial_workers, max_workers=self.max_workers, monitoring_interval=10
+        )
        pool.start()

         # Step 3: Download images from metadata file while fetching new from API
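The comment in this hunk carries the sizing rationale: workers block on network I/O as well as CPU-bound WebP conversion, so the pool starts at CPU count and grows while throughput keeps improving. A hypothetical sketch of such a scale-up heuristic (names and thresholds are illustrative, not the actual `AdaptiveWorkerPool` internals):

```python
def next_worker_count(current, maximum, prev_tput, cur_tput,
                      growth=1.5, min_gain=1.05):
    """Decide the next pool size from two successive throughput samples."""
    if current >= maximum:
        return current                                   # hard cap from --max-workers
    if prev_tput is None or cur_tput > prev_tput * min_gain:
        return min(int(current * growth) + 1, maximum)   # still improving: grow
    return current                                       # plateaued: hold steady
```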
@@ -247,8 +250,9 @@
         # Helper to process results from queue
         def process_results():
             nonlocal downloaded_count, total_bytes, failed_count
+            # Drain ALL available results to prevent queue from filling up
             while True:
-                result = pool.get_result(timeout=0
+                result = pool.get_result(timeout=0)  # Non-blocking
                 if result is None:
                     break
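With `timeout=0` the fetch no longer blocks, so each call to `process_results()` drains whatever has accumulated and returns immediately, keeping the result queue from backing up while the main loop keeps feeding work. The same pattern with a bare `multiprocessing.Queue`, as a sketch:

```python
import queue

def drain(result_queue, handle):
    """Consume every result already in the queue without blocking."""
    while True:
        try:
            item = result_queue.get_nowait()  # non-blocking get
        except queue.Empty:
            return                            # caught up; caller retries later
        handle(item)
```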
@@ -379,6 +383,10 @@

                 last_position = f.tell()

+                # If API is already complete, we've read the whole file, so break
+                if api_fetch_complete is None:
+                    break
+
                 # Sleep briefly before next tail iteration
                 time.sleep(0.1)
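Context for this hunk: the download loop tails the metadata file while the API fetcher is still appending to it, and the new guard exits the tail once the producer is finished and the reader has caught up. The general tail-a-growing-file pattern looks roughly like this (a hypothetical helper, not the module's code):

```python
import time

def tail(path, producer_done):
    """Yield lines from a file that another task is still appending to."""
    with open(path) as f:
        while True:
            line = f.readline()
            if line:
                yield line
            elif producer_done():  # writer finished and we've caught up
                return
            else:
                time.sleep(0.1)    # brief pause before polling again
```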
{mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/tar_sequences.py
RENAMED
@@ -23,51 +23,43 @@ def tar_sequence_directories(collection_dir):
         logger.error(f"Collection directory not found: {collection_dir}")
         return 0, 0

-    # Find all
+    # Find all bucket directories (skip special dirs)
+    # Now we tar entire bucket dirs (e.g., a/, b/, etc) to get ~62 tar files
     skip_dirs = {".meta", "__pycache__"}
-
+    bucket_dirs = []

     for item in collection_dir.iterdir():
         if item.is_dir() and item.name not in skip_dirs:
-
+            # Check if this is a bucket dir (single char)
+            if len(item.name) == 1:
+                bucket_dirs.append(item)

-    if not
-    logger.info("No
+    if not bucket_dirs:
+        logger.info("No bucket directories to tar")
         return 0, 0

-    logger.info(f"Tarring {len(
+    logger.info(f"Tarring {len(bucket_dirs)} bucket directories...")

     tarred_count = 0
     total_files = 0
     total_tar_bytes = 0

-    for
-
-        tar_path = collection_dir / f"{
+    for bucket_dir in bucket_dirs:
+        bucket_name = bucket_dir.name
+        tar_path = collection_dir / f"{bucket_name}.tar"

-        #
-
-
-            counter += 1
-            tar_path = collection_dir / f"{seq_name}.{counter}.tar"
-
-        # Count files in sequence
-        files = list(seq_dir.glob("*"))
-        file_count = len([f for f in files if f.is_file()])
+        # Count files in bucket
+        files_to_tar = sorted([f for f in bucket_dir.rglob("*") if f.is_file()], key=lambda x: str(x))
+        file_count = len(files_to_tar)

         if file_count == 0:
-            logger.warning(f"Skipping empty directory: {
+            logger.warning(f"Skipping empty bucket directory: {bucket_name}")
             continue

         try:
-
-            # Sort files by name for deterministic ordering
-            files_to_tar = sorted([f for f in seq_dir.rglob("*") if f.is_file()], key=lambda x: x.name)
-
-            if not files_to_tar:
-                logger.warning(f"Skipping directory with no files: {seq_name}")
-                continue
+            logger.info(f"Tarring bucket '{bucket_name}' ({file_count} files)...")

+            # Create reproducible uncompressed tar (WebP already compressed)
             with tarfile.open(tar_path, "w") as tar:
                 for file_path in files_to_tar:
                     # Get path relative to collection_dir for tar archive
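Two details in this hunk keep the archives deterministic: files are now sorted by their full path rather than just the basename (which could collide across subdirectories), and the tar is written without compression since WebP data will not shrink further. Condensed into a standalone sketch (`tar_bucket` is a hypothetical name; the real function is `tar_sequence_directories`):

```python
import tarfile
from pathlib import Path

def tar_bucket(bucket_dir, tar_path, collection_dir):
    """Pack one bucket directory into an uncompressed, reproducible tar."""
    files = sorted((f for f in Path(bucket_dir).rglob("*") if f.is_file()),
                   key=lambda p: str(p))        # stable, path-based ordering
    with tarfile.open(tar_path, "w") as tar:    # "w" = plain tar, no gzip
        for f in files:
            # Store members relative to the collection root, as the diff does
            tar.add(f, arcname=str(f.relative_to(collection_dir)))
```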
@@ -92,33 +84,32 @@ def tar_sequence_directories(collection_dir):
                 tar_size = tar_path.stat().st_size
                 total_tar_bytes += tar_size

-                # Remove original directory
-                for file in
+                # Remove original bucket directory
+                for file in bucket_dir.rglob("*"):
                     if file.is_file():
                         file.unlink()

                 # Remove empty subdirs and main dir
-                for subdir in list(
+                for subdir in list(bucket_dir.rglob("*")):
                     if subdir.is_dir():
                         try:
                             subdir.rmdir()
                         except OSError:
                             pass  # Not empty yet

-
+                bucket_dir.rmdir()

                 tarred_count += 1
                 total_files += file_count

-
-                logger.info(f"Tarred {tarred_count}/{len(sequence_dirs)} sequences...")
+                logger.info(f"Tarred bucket '{bucket_name}': {file_count:,} files, {format_size(tar_size)}")
             else:
                 logger.error(f"Tar file empty or not created: {tar_path}")
                 if tar_path.exists():
                     tar_path.unlink()

         except Exception as e:
-            logger.error(f"Error tarring {
+            logger.error(f"Error tarring bucket {bucket_name}: {e}")
             if tar_path.exists():
                 tar_path.unlink()
{mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/worker.py
RENAMED
@@ -1,6 +1,7 @@
 """Worker process for parallel image download and conversion."""

 import os
+import signal
 import tempfile
 from pathlib import Path
 import requests
@@ -17,6 +18,9 @@ def worker_process(work_queue, result_queue, worker_id):
         result_queue: Queue to push results to
         worker_id: Unique worker identifier
     """
+    # Ignore SIGINT in worker process - parent will handle it
+    signal.signal(signal.SIGINT, signal.SIG_IGN)
+
     # Create session once per worker (reuse HTTP connections)
     session = requests.Session()

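Ignoring SIGINT in the workers means a Ctrl-C reaches only the parent process, which can then wind the pool down deliberately instead of every worker dying mid-download. A self-contained demonstration of the pattern (illustrative, not the package's code):

```python
import signal
import time
from multiprocessing import Process

def child():
    # As in worker_process: the child opts out of Ctrl-C handling entirely.
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    time.sleep(30)  # stand-in for download/convert work

if __name__ == "__main__":
    p = Process(target=child)
    p.start()
    try:
        p.join()
    except KeyboardInterrupt:
        # Only the parent lands here; the child ignored the signal.
        p.terminate()
        p.join(timeout=2)
```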
@@ -65,11 +69,13 @@ def download_and_convert_image(image_data, output_dir, quality, convert_webp, se
     if not image_url:
         return (image_id, 0, False, f"No {quality} URL")

-    # Determine final output directory
+    # Determine final output directory - organize by first char of sequence ID
     output_dir = Path(output_dir)
     sequence_id = image_data.get("sequence")
     if sequence_id:
-
+        # Use first character as bucket (gives us ~62 dirs instead of millions)
+        first_char = sequence_id[0]
+        img_dir = output_dir / first_char / sequence_id
         img_dir.mkdir(parents=True, exist_ok=True)
     else:
         img_dir = output_dir
{mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/worker_pool.py
RENAMED
@@ -185,20 +185,18 @@ class AdaptiveWorkerPool:
         else:
             logger.info(f"At optimal worker count: {current_workers} workers, {current_throughput:.1f} items/s")

-    def shutdown(self, timeout=
+    def shutdown(self, timeout=2):
         """Shutdown the worker pool gracefully."""
         logger.info("Shutting down worker pool...")
         self.running = False

-        #
-        for _ in self.workers:
-            self.work_queue.put(None)
-
-        # Wait for workers to finish
+        # Terminate all workers immediately (they ignore SIGINT so we need to be forceful)
         for p in self.workers:
-            p.join(timeout=timeout)
             if p.is_alive():
-                logger.warning(f"Worker {p.pid} did not exit cleanly, terminating")
                 p.terminate()

+        # Give them a brief moment to exit
+        for p in self.workers:
+            p.join(timeout=timeout)
+
         logger.info("Worker pool shutdown complete")
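The old shutdown queued a `None` sentinel per worker, joined with the full timeout, and only then terminated stragglers; since the workers now ignore SIGINT, the new order terminates immediately and joins briefly afterwards. The same two-phase shape in isolation, assuming `workers` is a list of `multiprocessing.Process`:

```python
def stop_workers(workers, timeout=2):
    """Terminate-then-join shutdown, mirroring the new shutdown() ordering."""
    for p in workers:
        if p.is_alive():
            p.terminate()            # forceful stop; workers ignore SIGINT
    for p in workers:
        p.join(timeout=timeout)      # bounded reap so shutdown cannot hang
```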
{mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/LICENSE.md
RENAMED
File without changes
{mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/__init__.py
RENAMED
File without changes
{mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/client.py
RENAMED
File without changes
{mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/exif_writer.py
RENAMED
File without changes
{mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/ia_check.py
RENAMED
File without changes
{mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/ia_meta.py
RENAMED
File without changes
{mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/logging_config.py
RENAMED
File without changes
{mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/metadata_reader.py
RENAMED
File without changes
{mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/utils.py
RENAMED
File without changes
{mapillary_downloader-0.5.1 → mapillary_downloader-0.6.0}/src/mapillary_downloader/webp_converter.py
RENAMED
File without changes