mapillary-downloader 0.4.2__tar.gz → 0.5.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/PKG-INFO +1 -1
- {mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/pyproject.toml +1 -1
- mapillary_downloader-0.5.1/src/mapillary_downloader/downloader.py +471 -0
- mapillary_downloader-0.5.1/src/mapillary_downloader/metadata_reader.py +163 -0
- {mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/src/mapillary_downloader/worker.py +34 -7
- mapillary_downloader-0.5.1/src/mapillary_downloader/worker_pool.py +204 -0
- mapillary_downloader-0.4.2/src/mapillary_downloader/downloader.py +0 -326
- {mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/LICENSE.md +0 -0
- {mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/README.md +0 -0
- {mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/src/mapillary_downloader/__init__.py +0 -0
- {mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/src/mapillary_downloader/__main__.py +0 -0
- {mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/src/mapillary_downloader/client.py +0 -0
- {mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/src/mapillary_downloader/exif_writer.py +0 -0
- {mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/src/mapillary_downloader/ia_check.py +0 -0
- {mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/src/mapillary_downloader/ia_meta.py +0 -0
- {mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/src/mapillary_downloader/logging_config.py +0 -0
- {mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/src/mapillary_downloader/tar_sequences.py +0 -0
- {mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/src/mapillary_downloader/utils.py +0 -0
- {mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/src/mapillary_downloader/webp_converter.py +0 -0
mapillary_downloader-0.5.1/src/mapillary_downloader/downloader.py
ADDED
@@ -0,0 +1,471 @@
+"""Main downloader logic."""
+
+import gzip
+import json
+import logging
+import os
+import shutil
+import time
+from pathlib import Path
+from mapillary_downloader.utils import format_size, format_time
+from mapillary_downloader.ia_meta import generate_ia_metadata
+from mapillary_downloader.ia_check import check_ia_exists
+from mapillary_downloader.worker import worker_process
+from mapillary_downloader.worker_pool import AdaptiveWorkerPool
+from mapillary_downloader.metadata_reader import MetadataReader
+from mapillary_downloader.tar_sequences import tar_sequence_directories
+from mapillary_downloader.logging_config import add_file_handler
+
+logger = logging.getLogger("mapillary_downloader")
+
+
+def get_cache_dir():
+    """Get XDG cache directory for staging downloads.
+
+    Returns:
+        Path to cache directory for mapillary_downloader
+    """
+    xdg_cache = os.environ.get("XDG_CACHE_HOME")
+    if xdg_cache:
+        cache_dir = Path(xdg_cache)
+    else:
+        cache_dir = Path.home() / ".cache"
+
+    mapillary_cache = cache_dir / "mapillary_downloader"
+    mapillary_cache.mkdir(parents=True, exist_ok=True)
+    return mapillary_cache
+
+
+class MapillaryDownloader:
+    """Handles downloading Mapillary data for a user."""
+
+    def __init__(
+        self,
+        client,
+        output_dir,
+        username=None,
+        quality=None,
+        workers=None,
+        tar_sequences=True,
+        convert_webp=False,
+        check_ia=True,
+    ):
+        """Initialize the downloader.
+
+        Args:
+            client: MapillaryClient instance
+            output_dir: Base directory to save downloads (final destination)
+            username: Mapillary username (for collection directory)
+            quality: Image quality (for collection directory)
+            workers: Number of parallel workers (default: half of cpu_count)
+            tar_sequences: Whether to tar sequence directories after download (default: True)
+            convert_webp: Whether to convert images to WebP (affects collection name)
+            check_ia: Whether to check if collection exists on Internet Archive (default: True)
+        """
+        self.client = client
+        self.base_output_dir = Path(output_dir)
+        self.username = username
+        self.quality = quality
+        self.workers = workers if workers is not None else max(1, os.cpu_count() // 2)
+        self.tar_sequences = tar_sequences
+        self.convert_webp = convert_webp
+        self.check_ia = check_ia
+
+        # Determine collection name
+        if username and quality:
+            collection_name = f"mapillary-{username}-{quality}"
+            if convert_webp:
+                collection_name += "-webp"
+            self.collection_name = collection_name
+        else:
+            self.collection_name = None
+
+        # Set up staging directory in cache
+        cache_dir = get_cache_dir()
+        if self.collection_name:
+            self.staging_dir = cache_dir / self.collection_name
+            self.final_dir = self.base_output_dir / self.collection_name
+        else:
+            self.staging_dir = cache_dir / "download"
+            self.final_dir = self.base_output_dir
+
+        # Work in staging directory during download
+        self.output_dir = self.staging_dir
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+
+        logger.info(f"Staging directory: {self.staging_dir}")
+        logger.info(f"Final destination: {self.final_dir}")
+
+        # Set up file logging for archival with timestamp for incremental runs
+        timestamp = time.strftime("%Y%m%d-%H%M%S")
+        log_file = self.output_dir / f"download.log.{timestamp}"
+        add_file_handler(log_file)
+        logger.info(f"Logging to: {log_file}")
+
+        self.metadata_file = self.output_dir / "metadata.jsonl"
+        self.progress_file = self.output_dir / "progress.json"
+        self.downloaded = self._load_progress()
+
+    def _load_progress(self):
+        """Load previously downloaded image IDs for this quality."""
+        if self.progress_file.exists():
+            with open(self.progress_file) as f:
+                data = json.load(f)
+            # Support both old format (single list) and new format (per-quality dict)
+            if isinstance(data, dict):
+                if "downloaded" in data:
+                    # Old format: {"downloaded": [...]}
+                    return set(data["downloaded"])
+                else:
+                    # New format: {"256": [...], "1024": [...], ...}
+                    return set(data.get(str(self.quality), []))
+            else:
+                # Very old format: just a list
+                return set(data)
+        return set()
+
+    def _save_progress(self):
+        """Save progress to disk atomically, per-quality."""
+        # Load existing progress for all qualities
+        if self.progress_file.exists():
+            with open(self.progress_file) as f:
+                data = json.load(f)
+            # Convert old format to new format if needed
+            if isinstance(data, dict) and "downloaded" in data:
+                # Old format: {"downloaded": [...]} - migrate to per-quality
+                progress = {}
+            else:
+                progress = data if isinstance(data, dict) else {}
+        else:
+            progress = {}
+
+        # Update this quality's progress
+        progress[str(self.quality)] = list(self.downloaded)
+
+        # Write atomically
+        temp_file = self.progress_file.with_suffix(".json.tmp")
+        with open(temp_file, "w") as f:
+            json.dump(progress, f)
+            f.flush()
+            os.fsync(f.fileno())
+        temp_file.replace(self.progress_file)
+
+    def download_user_data(self, bbox=None, convert_webp=False):
+        """Download all images for a user using streaming queue-based architecture.
+
+        Args:
+            bbox: Optional bounding box [west, south, east, north]
+            convert_webp: Convert images to WebP format after download
+        """
+        if not self.username or not self.quality:
+            raise ValueError("Username and quality must be provided during initialization")
+
+        # Check if collection already exists on Internet Archive
+        if self.check_ia and self.collection_name:
+            logger.info(f"Checking if {self.collection_name} exists on Internet Archive...")
+            if check_ia_exists(self.collection_name):
+                logger.info("Collection already exists on archive.org, skipping download")
+                return
+
+        # Check if collection already exists in final destination
+        if self.final_dir.exists():
+            logger.info(f"Collection already exists at {self.final_dir}, skipping download")
+            return
+
+        quality_field = f"thumb_{self.quality}_url"
+
+        logger.info(f"Downloading images for user: {self.username}")
+        logger.info(f"Output directory: {self.output_dir}")
+        logger.info(f"Quality: {self.quality}")
+        logger.info(f"Using {self.workers} parallel workers")
+
+        start_time = time.time()
+
+        # Step 1: Check if API fetch is already complete
+        reader = MetadataReader(self.metadata_file)
+        api_complete = reader.is_complete
+        if api_complete:
+            logger.info("API fetch already complete, will only download")
+        else:
+            logger.info("API fetch incomplete, will fetch and download in parallel")
+
+        # Step 2: Start worker pool
+        # Since workers do both I/O (download) and CPU (WebP), need many more workers
+        # Cap at 128 for now - will build proper dynamic scaling on a new branch later
+        pool = AdaptiveWorkerPool(worker_process, min_workers=self.workers, max_workers=128, monitoring_interval=10)
+        pool.start()
+
+        # Step 3: Download images from metadata file while fetching new from API
+        downloaded_count = 0
+        total_bytes = 0
+        failed_count = 0
+        submitted = 0
+        batch_start = time.time()
+
+        logger.info("Starting parallel download and API fetch...")
+
+        try:
+            # Step 3a: Fetch metadata from API in parallel (write-only, don't block on queue)
+            if not api_complete:
+                import threading
+
+                api_fetch_complete = threading.Event()
+                new_images_count = [0]  # Mutable so thread can update it
+
+                def fetch_api_metadata():
+                    """Fetch metadata from API and write to file (runs in thread)."""
+                    try:
+                        logger.info("API fetch thread: Starting...")
+                        with open(self.metadata_file, "a") as meta_f:
+                            for image in self.client.get_user_images(self.username, bbox=bbox):
+                                new_images_count[0] += 1
+
+                                # Save metadata (don't dedupe here, let the tailer handle it)
+                                meta_f.write(json.dumps(image) + "\n")
+                                meta_f.flush()
+
+                                if new_images_count[0] % 1000 == 0:
+                                    logger.info(f"API: Fetched {new_images_count[0]} images from API")
+
+                        # Mark as complete
+                        MetadataReader.mark_complete(self.metadata_file)
+                        logger.info(f"API fetch complete: {new_images_count[0]} images")
+                    finally:
+                        api_fetch_complete.set()
+
+                # Start API fetch in background thread
+                api_thread = threading.Thread(target=fetch_api_metadata, daemon=True)
+                api_thread.start()
+            else:
+                logger.info("API fetch already complete, skipping API thread")
+                api_fetch_complete = None
+
+            # Step 3b: Tail metadata file and submit to workers
+            logger.info("Starting metadata tail and download queue feeder...")
+            last_position = 0
+
+            # Helper to process results from queue
+            def process_results():
+                nonlocal downloaded_count, total_bytes, failed_count
+                while True:
+                    result = pool.get_result(timeout=0.001)
+                    if result is None:
+                        break
+
+                    image_id, bytes_dl, success, error_msg = result
+
+                    if success:
+                        self.downloaded.add(image_id)
+                        downloaded_count += 1
+                        total_bytes += bytes_dl
+
+                        # Log every download for first 10, then every 100
+                        should_log = downloaded_count <= 10 or downloaded_count % 100 == 0
+                        if should_log:
+                            elapsed = time.time() - batch_start
+                            rate = downloaded_count / elapsed if elapsed > 0 else 0
+                            logger.info(
+                                f"Downloaded: {downloaded_count} ({format_size(total_bytes)}) "
+                                f"- Rate: {rate:.1f} images/sec"
+                            )
+
+                        if downloaded_count % 100 == 0:
+                            self._save_progress()
+                            pool.check_throughput(downloaded_count)
+                    else:
+                        failed_count += 1
+                        logger.warning(f"Failed to download {image_id}: {error_msg}")
+
+            # Tail the metadata file and submit to workers
+            while True:
+                # Check if API fetch is done and we've processed everything
+                if api_fetch_complete and api_fetch_complete.is_set():
+                    # Read any remaining lines
+                    if self.metadata_file.exists():
+                        with open(self.metadata_file) as f:
+                            f.seek(last_position)
+                            for line in f:
+                                line = line.strip()
+                                if not line:
+                                    continue
+
+                                try:
+                                    image = json.loads(line)
+                                except json.JSONDecodeError:
+                                    # Incomplete line, will retry
+                                    continue
+
+                                # Skip completion marker
+                                if image.get("__complete__"):
+                                    continue
+
+                                image_id = image.get("id")
+                                if not image_id:
+                                    continue
+
+                                # Skip if already downloaded or no quality URL
+                                if image_id in self.downloaded:
+                                    continue
+                                if not image.get(quality_field):
+                                    continue
+
+                                # Submit to workers
+                                work_item = (
+                                    image,
+                                    str(self.output_dir),
+                                    self.quality,
+                                    convert_webp,
+                                    self.client.access_token,
+                                )
+                                pool.submit(work_item)
+                                submitted += 1
+
+                                if submitted % 1000 == 0:
+                                    logger.info(f"Queue: Submitted {submitted} images")
+
+                                # Process results while submitting
+                                process_results()
+
+                            last_position = f.tell()
+
+                    # API done and all lines processed, break
+                    break
+
+                # API still running or API was already complete, tail the file
+                if self.metadata_file.exists():
+                    with open(self.metadata_file) as f:
+                        f.seek(last_position)
+                        for line in f:
+                            line = line.strip()
+                            if not line:
+                                continue
+
+                            try:
+                                image = json.loads(line)
+                            except json.JSONDecodeError:
+                                # Incomplete line, will retry next iteration
+                                continue
+
+                            # Skip completion marker
+                            if image.get("__complete__"):
+                                continue
+
+                            image_id = image.get("id")
+                            if not image_id:
+                                continue
+
+                            # Skip if already downloaded or no quality URL
+                            if image_id in self.downloaded:
+                                continue
+                            if not image.get(quality_field):
+                                continue
+
+                            # Submit to workers
+                            work_item = (
+                                image,
+                                str(self.output_dir),
+                                self.quality,
+                                convert_webp,
+                                self.client.access_token,
+                            )
+                            pool.submit(work_item)
+                            submitted += 1
+
+                            if submitted % 1000 == 0:
+                                logger.info(f"Queue: Submitted {submitted} images")
+
+                            # Process results while submitting
+                            process_results()
+
+                        last_position = f.tell()
+
+                # Sleep briefly before next tail iteration
+                time.sleep(0.1)
+
+                # Process any results that came in
+                process_results()
+
+            # Send shutdown signals
+            logger.info(f"Submitted {submitted} images, waiting for workers to finish...")
+            for _ in range(pool.current_workers):
+                pool.submit(None)
+
+            # Collect remaining results
+            completed = downloaded_count + failed_count
+
+            while completed < submitted:
+                result = pool.get_result(timeout=5)
+                if result is None:
+                    # Check throughput periodically
+                    pool.check_throughput(downloaded_count)
+                    continue
+
+                image_id, bytes_dl, success, error_msg = result
+                completed += 1
+
+                if success:
+                    self.downloaded.add(image_id)
+                    downloaded_count += 1
+                    total_bytes += bytes_dl
+
+                    if downloaded_count % 10 == 0:
+                        elapsed = time.time() - batch_start
+                        rate = downloaded_count / elapsed if elapsed > 0 else 0
+                        remaining = submitted - completed
+                        eta_seconds = remaining / rate if rate > 0 else 0
+
+                        logger.info(
+                            f"Downloaded: {downloaded_count}/{submitted} ({format_size(total_bytes)}) "
+                            f"- ETA: {format_time(eta_seconds)}"
+                        )
+                        self._save_progress()
+                        pool.check_throughput(downloaded_count)
+                else:
+                    failed_count += 1
+                    logger.warning(f"Failed to download {image_id}: {error_msg}")
+
+        finally:
+            # Shutdown worker pool
+            pool.shutdown()
+
+        self._save_progress()
+        elapsed = time.time() - start_time
+
+        logger.info(f"Complete! Downloaded {downloaded_count} ({format_size(total_bytes)}), " f"failed {failed_count}")
+        logger.info(f"Total time: {format_time(elapsed)}")
+
+        # Tar sequence directories for efficient IA uploads
+        if self.tar_sequences:
+            tar_sequence_directories(self.output_dir)
+
+        # Gzip metadata.jsonl to save space
+        if self.metadata_file.exists():
+            logger.info("Compressing metadata.jsonl...")
+            original_size = self.metadata_file.stat().st_size
+            gzipped_file = self.metadata_file.with_suffix(".jsonl.gz")
+
+            with open(self.metadata_file, "rb") as f_in:
+                with gzip.open(gzipped_file, "wb", compresslevel=9) as f_out:
+                    shutil.copyfileobj(f_in, f_out)
+
+            compressed_size = gzipped_file.stat().st_size
+            self.metadata_file.unlink()
+
+            savings = 100 * (1 - compressed_size / original_size)
+            logger.info(
+                f"Compressed metadata: {format_size(original_size)} → {format_size(compressed_size)} "
+                f"({savings:.1f}% savings)"
+            )
+
+        # Generate IA metadata
+        generate_ia_metadata(self.output_dir)
+
+        # Move from staging to final destination
+        logger.info("Moving collection from staging to final destination...")
+        if self.final_dir.exists():
+            logger.warning(f"Destination already exists, removing: {self.final_dir}")
+            shutil.rmtree(self.final_dir)
+
+        self.final_dir.parent.mkdir(parents=True, exist_ok=True)
+        shutil.move(str(self.staging_dir), str(self.final_dir))
+        logger.info(f"Collection moved to: {self.final_dir}")
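
For orientation, a minimal usage sketch of the new streaming downloader. The MapillaryClient constructor is not part of this diff, so the token argument below is an assumption, and all paths and names are placeholders.

# Minimal usage sketch (not from the package). Assumes MapillaryClient is
# built from an access token; its signature is not shown in this diff.
from mapillary_downloader.client import MapillaryClient
from mapillary_downloader.downloader import MapillaryDownloader

client = MapillaryClient("MLY|<access-token>")  # hypothetical constructor call
downloader = MapillaryDownloader(
    client,
    output_dir="/srv/archives",  # final destination; staging happens in the XDG cache
    username="someuser",
    quality=1024,                # collection becomes "mapillary-someuser-1024"
)
downloader.download_user_data()  # fetches metadata and downloads in parallel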
mapillary_downloader-0.5.1/src/mapillary_downloader/metadata_reader.py
ADDED
@@ -0,0 +1,163 @@
+"""Streaming metadata reader with filtering."""
+
+import gzip
+import json
+import logging
+from pathlib import Path
+
+logger = logging.getLogger("mapillary_downloader")
+
+
+class MetadataReader:
+    """Streams metadata.jsonl line-by-line with filtering.
+
+    This avoids loading millions of image dicts into memory.
+    """
+
+    COMPLETION_MARKER = {"__complete__": True}
+
+    def __init__(self, metadata_file):
+        """Initialize metadata reader.
+
+        Args:
+            metadata_file: Path to metadata.jsonl or metadata.jsonl.gz
+        """
+        self.metadata_file = Path(metadata_file)
+        self.is_complete = self._check_complete()
+
+    def _check_complete(self):
+        """Check if metadata file has completion marker.
+
+        Returns:
+            True if completion marker found, False otherwise
+        """
+        if not self.metadata_file.exists():
+            return False
+
+        # Check last few lines for completion marker (it should be at the end)
+        try:
+            if self.metadata_file.suffix == ".gz":
+                file_handle = gzip.open(self.metadata_file, "rt")
+            else:
+                file_handle = open(self.metadata_file)
+
+            with file_handle as f:
+                # Read last 10 lines to find completion marker
+                lines = []
+                for line in f:
+                    lines.append(line)
+                    if len(lines) > 10:
+                        lines.pop(0)
+
+                # Check if any of the last lines is the completion marker
+                for line in reversed(lines):
+                    line = line.strip()
+                    if not line:
+                        continue
+                    try:
+                        data = json.loads(line)
+                        if data.get("__complete__"):
+                            return True
+                    except json.JSONDecodeError:
+                        continue
+
+            return False
+        except Exception:
+            return False
+
+    def iter_images(self, quality_field=None, downloaded_ids=None):
+        """Stream images from metadata file with filtering.
+
+        Args:
+            quality_field: Optional field to check exists (e.g., 'thumb_1024_url')
+            downloaded_ids: Optional set of already downloaded IDs to skip
+
+        Yields:
+            Image metadata dicts that pass filters
+        """
+        if not self.metadata_file.exists():
+            return
+
+        # Handle gzipped files
+        if self.metadata_file.suffix == ".gz":
+            file_handle = gzip.open(self.metadata_file, "rt")
+        else:
+            file_handle = open(self.metadata_file)
+
+        with file_handle as f:
+            for line in f:
+                line = line.strip()
+                if not line:
+                    continue
+
+                image = json.loads(line)
+
+                # Check for completion marker
+                if image.get("__complete__"):
+                    self.is_complete = True
+                    logger.debug("Found API fetch completion marker")
+                    continue
+
+                image_id = image.get("id")
+                if not image_id:
+                    continue
+
+                # Filter by downloaded status
+                if downloaded_ids and image_id in downloaded_ids:
+                    continue
+
+                # Filter by quality field availability
+                if quality_field and not image.get(quality_field):
+                    continue
+
+                yield image
+
+    def get_all_ids(self):
+        """Get set of all image IDs in metadata file.
+
+        Returns:
+            Set of image IDs (for building seen_ids)
+        """
+        ids = set()
+
+        if not self.metadata_file.exists():
+            return ids
+
+        # Handle gzipped files
+        if self.metadata_file.suffix == ".gz":
+            file_handle = gzip.open(self.metadata_file, "rt")
+        else:
+            file_handle = open(self.metadata_file)
+
+        with file_handle as f:
+            for line in f:
+                line = line.strip()
+                if not line:
+                    continue
+
+                image = json.loads(line)
+
+                # Skip completion marker
+                if image.get("__complete__"):
+                    self.is_complete = True
+                    continue
+
+                image_id = image.get("id")
+                if image_id:
+                    ids.add(image_id)
+
+        return ids
+
+    @staticmethod
+    def mark_complete(metadata_file):
+        """Append completion marker to metadata file.
+
+        Args:
+            metadata_file: Path to metadata.jsonl
+        """
+        metadata_file = Path(metadata_file)
+        if metadata_file.exists():
+            with open(metadata_file, "a") as f:
+                f.write(json.dumps(MetadataReader.COMPLETION_MARKER) + "\n")
+                f.flush()
+            logger.info("Marked metadata file as complete")
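
A short sketch of how the reader is meant to be consumed; the file name and the already_downloaded set below are illustrative, not from this diff.

# Sketch: streaming a (possibly gzipped) metadata file without loading it
# all into memory. already_downloaded is an illustrative placeholder.
from mapillary_downloader.metadata_reader import MetadataReader

reader = MetadataReader("metadata.jsonl.gz")
print("API fetch finished:", reader.is_complete)

already_downloaded = {"123456789"}
for image in reader.iter_images(quality_field="thumb_1024_url",
                                downloaded_ids=already_downloaded):
    # Only images with a thumb_1024_url and not yet downloaded get here.
    print(image["id"], image["thumb_1024_url"])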
{mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/src/mapillary_downloader/worker.py
RENAMED
@@ -9,7 +9,38 @@ from mapillary_downloader.exif_writer import write_exif_to_image
 from mapillary_downloader.webp_converter import convert_to_webp
 
 
-def download_and_convert_image(image_data, output_dir, quality, convert_webp, access_token):
+def worker_process(work_queue, result_queue, worker_id):
+    """Worker process that pulls from queue and processes images.
+
+    Args:
+        work_queue: Queue to pull work items from
+        result_queue: Queue to push results to
+        worker_id: Unique worker identifier
+    """
+    # Create session once per worker (reuse HTTP connections)
+    session = requests.Session()
+
+    while True:
+        work_item = work_queue.get()
+
+        # None is the shutdown signal
+        if work_item is None:
+            break
+
+        # Unpack work item
+        image_data, output_dir, quality, convert_webp, access_token = work_item
+
+        # Update session auth for this request
+        session.headers.update({"Authorization": f"OAuth {access_token}"})
+
+        # Process the image
+        result = download_and_convert_image(image_data, output_dir, quality, convert_webp, session)
+
+        # Push result back
+        result_queue.put(result)
+
+
+def download_and_convert_image(image_data, output_dir, quality, convert_webp, session):
     """Download and optionally convert a single image.
 
     This function is designed to run in a worker process.
@@ -19,7 +50,7 @@ def download_and_convert_image(image_data, output_dir, quality, convert_webp, ac
         output_dir: Base output directory path
         quality: Quality level (256, 1024, 2048, original)
         convert_webp: Whether to convert to WebP
-        access_token: …
+        session: requests.Session with auth already configured
 
     Returns:
         Tuple of (image_id, bytes_downloaded, success, error_msg)
@@ -53,11 +84,7 @@ def download_and_convert_image(image_data, output_dir, quality, convert_webp, ac
     jpg_path = img_dir / f"{image_id}.jpg"
     final_path = jpg_path
 
-    # Download image
-    # No retries for CDN images - they're cheap, just skip failures and move on
-    session = requests.Session()
-    session.headers.update({"Authorization": f"OAuth {access_token}"})
-
+    # Download image (using session passed from worker)
     bytes_downloaded = 0
 
     try:
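
This change moves session creation out of download_and_convert_image so each worker reuses one HTTP connection pool across images. A sketch of wiring worker_process to plain multiprocessing queues, assuming worker.py imports requests at module level (only part of the file is shown in this diff):

# Sketch: one worker driven by hand. A real work item is the 5-tuple built
# in downloader.py:
#   (image_metadata_dict, output_dir, quality, convert_webp, access_token)
# where image_metadata_dict is one parsed line of metadata.jsonl. Results
# arrive on result_queue as (image_id, bytes_downloaded, success, error_msg).
import multiprocessing as mp
from mapillary_downloader.worker import worker_process

if __name__ == "__main__":
    work_q, result_q = mp.Queue(), mp.Queue()
    p = mp.Process(target=worker_process, args=(work_q, result_q, 0))
    p.start()

    work_q.put(None)  # None is the shutdown signal
    p.join()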
mapillary_downloader-0.5.1/src/mapillary_downloader/worker_pool.py
ADDED
@@ -0,0 +1,204 @@
+"""Adaptive worker pool for parallel processing."""
+
+import logging
+import multiprocessing as mp
+import queue
+import time
+from collections import deque
+
+logger = logging.getLogger("mapillary_downloader")
+
+
+class AdaptiveWorkerPool:
+    """Worker pool that scales based on throughput.
+
+    Monitors throughput at each check interval and adjusts worker count:
+    - If throughput increasing: add workers (up to max)
+    - If throughput plateauing/decreasing: stop adding workers
+    """
+
+    def __init__(self, worker_func, min_workers=4, max_workers=16, monitoring_interval=10):
+        """Initialize adaptive worker pool.
+
+        Args:
+            worker_func: Function to run in each worker (must accept work_queue, result_queue)
+            min_workers: Minimum number of workers
+            max_workers: Maximum number of workers
+            monitoring_interval: Seconds between throughput checks
+        """
+        self.worker_func = worker_func
+        self.min_workers = min_workers
+        self.max_workers = max_workers
+        self.monitoring_interval = monitoring_interval
+
+        # Queues
+        self.work_queue = mp.Queue(maxsize=max_workers)
+        self.result_queue = mp.Queue()
+
+        # Worker management
+        self.workers = []
+        self.current_workers = min_workers  # Start small and ramp up
+
+        # Throughput monitoring
+        self.throughput_history = deque(maxlen=5)  # Last 5 measurements
+        self.worker_count_history = deque(maxlen=5)  # Track worker counts at each measurement
+        self.last_processed = 0
+        self.last_check_time = time.time()
+
+        self.running = False
+
+    def start(self):
+        """Start the worker pool."""
+        self.running = True
+        logger.info(f"Starting worker pool with {self.current_workers} workers")
+
+        for i in range(self.current_workers):
+            self._add_worker(i)
+
+    def _add_worker(self, worker_id):
+        """Add a new worker to the pool."""
+        p = mp.Process(target=self.worker_func, args=(self.work_queue, self.result_queue, worker_id))
+        p.start()
+        self.workers.append(p)
+        logger.debug(f"Started worker {worker_id}")
+
+    def submit(self, work_item):
+        """Submit work to the pool (blocks if queue is full)."""
+        self.work_queue.put(work_item)
+
+    def get_result(self, timeout=None):
+        """Get a result from the workers.
+
+        Returns:
+            Result from worker, or None if timeout
+        """
+        try:
+            return self.result_queue.get(timeout=timeout)
+        except queue.Empty:
+            return None
+
+    def check_throughput(self, total_processed):
+        """Check throughput and adjust workers if needed.
+
+        Args:
+            total_processed: Total number of items processed so far
+        """
+        now = time.time()
+        elapsed = now - self.last_check_time
+
+        if elapsed < self.monitoring_interval:
+            logger.debug(f"Throughput check skipped (elapsed {elapsed:.1f}s < {self.monitoring_interval}s)")
+            return
+
+        # Calculate current throughput (items/sec)
+        items_since_check = total_processed - self.last_processed
+        throughput = items_since_check / elapsed
+
+        current_workers = len(self.workers)
+        self.throughput_history.append(throughput)
+        self.worker_count_history.append(current_workers)
+        self.last_processed = total_processed
+        self.last_check_time = now
+
+        logger.info(
+            f"Throughput: {throughput:.1f} items/s (workers: {current_workers}/{self.max_workers}, "
+            f"history: {len(self.throughput_history)} measurements)"
+        )
+
+        # Need at least 2 measurements to calculate gain per worker
+        if len(self.throughput_history) < 2:
+            # First measurement - add 20% more workers
+            if current_workers < self.max_workers:
+                workers_to_add = max(1, int(current_workers * 0.2))
+                for i in range(workers_to_add):
+                    if len(self.workers) < self.max_workers:
+                        new_worker_id = len(self.workers)
+                        self._add_worker(new_worker_id)
+                        self.current_workers += 1
+                logger.info(
+                    f"Ramping up: added {workers_to_add} workers (now {self.current_workers}/{self.max_workers})"
+                )
+            return
+
+        # Calculate throughput gain per worker added
+        current_throughput = self.throughput_history[-1]
+        previous_throughput = self.throughput_history[-2]
+        previous_workers = self.worker_count_history[-2]
+
+        throughput_gain = current_throughput - previous_throughput
+        workers_added = current_workers - previous_workers
+
+        logger.debug(
+            f"Trend: {previous_throughput:.1f} items/s @ {previous_workers} workers → "
+            f"{current_throughput:.1f} items/s @ {current_workers} workers "
+            f"(gain: {throughput_gain:.1f}, added: {workers_added})"
+        )
+
+        # If throughput decreased significantly, stop adding workers
+        if current_throughput < previous_throughput * 0.95:
+            logger.info(
+                f"Throughput decreasing ({current_throughput:.1f} vs {previous_throughput:.1f} items/s), "
+                f"stopping at {current_workers} workers"
+            )
+        # If throughput is still increasing or stable, add more workers
+        elif current_throughput >= previous_throughput * 0.95 and current_workers < self.max_workers:
+            if workers_added > 0 and throughput_gain > 0:
+                # Calculate gain per worker
+                gain_per_worker = throughput_gain / workers_added
+                logger.debug(f"Gain per worker: {gain_per_worker:.2f} items/s")
+
+                # Estimate how many more workers we could benefit from
+                # Assume diminishing returns, so be conservative
+                if gain_per_worker > 0.5:
+                    # Good gain per worker - add more aggressively
+                    workers_to_add = max(1, int(current_workers * 0.3))
+                elif gain_per_worker > 0.2:
+                    # Moderate gain - add moderately
+                    workers_to_add = max(1, int(current_workers * 0.2))
+                else:
+                    # Small gain - add conservatively
+                    workers_to_add = max(1, int(current_workers * 0.1))
+
+                added = 0
+                for i in range(workers_to_add):
+                    if len(self.workers) < self.max_workers:
+                        new_worker_id = len(self.workers)
+                        self._add_worker(new_worker_id)
+                        self.current_workers += 1
+                        added += 1
+
+                logger.info(
+                    f"Throughput increasing (gain: {gain_per_worker:.2f} items/s per worker), "
+                    f"added {added} workers (now {self.current_workers}/{self.max_workers})"
+                )
+            else:
+                # Fallback to 20% if we can't calculate gain per worker
+                workers_to_add = max(1, int(current_workers * 0.2))
+                added = 0
+                for i in range(workers_to_add):
+                    if len(self.workers) < self.max_workers:
+                        new_worker_id = len(self.workers)
+                        self._add_worker(new_worker_id)
+                        self.current_workers += 1
+                        added += 1
+                logger.info(f"Ramping up: added {added} workers (now {self.current_workers}/{self.max_workers})")
+        else:
+            logger.info(f"At optimal worker count: {current_workers} workers, {current_throughput:.1f} items/s")
+
+    def shutdown(self, timeout=30):
+        """Shutdown the worker pool gracefully."""
+        logger.info("Shutting down worker pool...")
+        self.running = False
+
+        # Send stop signals
+        for _ in self.workers:
+            self.work_queue.put(None)
+
+        # Wait for workers to finish
+        for p in self.workers:
+            p.join(timeout=timeout)
+            if p.is_alive():
+                logger.warning(f"Worker {p.pid} did not exit cleanly, terminating")
+                p.terminate()
+
+        logger.info("Worker pool shutdown complete")
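
The pool is generic over the worker function, so it can be exercised with a trivial worker. A minimal sketch (the toy worker and counts are illustrative, not from the package):

# Sketch: AdaptiveWorkerPool with a toy worker that echoes items back.
# Any top-level function accepting (work_queue, result_queue, worker_id) fits.
import multiprocessing as mp
from mapillary_downloader.worker_pool import AdaptiveWorkerPool

def toy_worker(work_queue, result_queue, worker_id):
    while True:
        item = work_queue.get()
        if item is None:  # same shutdown convention as worker_process
            break
        result_queue.put((item, 0, True, None))

if __name__ == "__main__":
    pool = AdaptiveWorkerPool(toy_worker, min_workers=2, max_workers=8)
    pool.start()
    for i in range(100):
        pool.submit(i)          # blocks when the bounded queue is full

    done = 0
    while done < 100:
        if pool.get_result(timeout=1) is not None:
            done += 1
            pool.check_throughput(done)  # rate-limited by monitoring_interval

    pool.shutdown()             # sends one None per worker, then joins them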
mapillary_downloader-0.4.2/src/mapillary_downloader/downloader.py
DELETED
@@ -1,326 +0,0 @@
-"""Main downloader logic."""
-
-import gzip
-import json
-import logging
-import os
-import shutil
-import time
-from pathlib import Path
-from concurrent.futures import ProcessPoolExecutor, as_completed
-from mapillary_downloader.utils import format_size, format_time
-from mapillary_downloader.ia_meta import generate_ia_metadata
-from mapillary_downloader.ia_check import check_ia_exists
-from mapillary_downloader.worker import download_and_convert_image
-from mapillary_downloader.tar_sequences import tar_sequence_directories
-from mapillary_downloader.logging_config import add_file_handler
-
-logger = logging.getLogger("mapillary_downloader")
-
-
-def get_cache_dir():
-    """Get XDG cache directory for staging downloads.
-
-    Returns:
-        Path to cache directory for mapillary_downloader
-    """
-    xdg_cache = os.environ.get("XDG_CACHE_HOME")
-    if xdg_cache:
-        cache_dir = Path(xdg_cache)
-    else:
-        cache_dir = Path.home() / ".cache"
-
-    mapillary_cache = cache_dir / "mapillary_downloader"
-    mapillary_cache.mkdir(parents=True, exist_ok=True)
-    return mapillary_cache
-
-
-class MapillaryDownloader:
-    """Handles downloading Mapillary data for a user."""
-
-    def __init__(
-        self,
-        client,
-        output_dir,
-        username=None,
-        quality=None,
-        workers=None,
-        tar_sequences=True,
-        convert_webp=False,
-        check_ia=True,
-    ):
-        """Initialize the downloader.
-
-        Args:
-            client: MapillaryClient instance
-            output_dir: Base directory to save downloads (final destination)
-            username: Mapillary username (for collection directory)
-            quality: Image quality (for collection directory)
-            workers: Number of parallel workers (default: half of cpu_count)
-            tar_sequences: Whether to tar sequence directories after download (default: True)
-            convert_webp: Whether to convert images to WebP (affects collection name)
-            check_ia: Whether to check if collection exists on Internet Archive (default: True)
-        """
-        self.client = client
-        self.base_output_dir = Path(output_dir)
-        self.username = username
-        self.quality = quality
-        self.workers = workers if workers is not None else max(1, os.cpu_count() // 2)
-        self.tar_sequences = tar_sequences
-        self.convert_webp = convert_webp
-        self.check_ia = check_ia
-
-        # Determine collection name
-        if username and quality:
-            collection_name = f"mapillary-{username}-{quality}"
-            if convert_webp:
-                collection_name += "-webp"
-            self.collection_name = collection_name
-        else:
-            self.collection_name = None
-
-        # Set up staging directory in cache
-        cache_dir = get_cache_dir()
-        if self.collection_name:
-            self.staging_dir = cache_dir / self.collection_name
-            self.final_dir = self.base_output_dir / self.collection_name
-        else:
-            self.staging_dir = cache_dir / "download"
-            self.final_dir = self.base_output_dir
-
-        # Work in staging directory during download
-        self.output_dir = self.staging_dir
-        self.output_dir.mkdir(parents=True, exist_ok=True)
-
-        logger.info(f"Staging directory: {self.staging_dir}")
-        logger.info(f"Final destination: {self.final_dir}")
-
-        # Set up file logging for archival with timestamp for incremental runs
-        timestamp = time.strftime("%Y%m%d-%H%M%S")
-        log_file = self.output_dir / f"download.log.{timestamp}"
-        add_file_handler(log_file)
-        logger.info(f"Logging to: {log_file}")
-
-        self.metadata_file = self.output_dir / "metadata.jsonl"
-        self.progress_file = self.output_dir / "progress.json"
-        self.downloaded = self._load_progress()
-
-    def _load_progress(self):
-        """Load previously downloaded image IDs."""
-        if self.progress_file.exists():
-            with open(self.progress_file) as f:
-                return set(json.load(f).get("downloaded", []))
-        return set()
-
-    def _save_progress(self):
-        """Save progress to disk atomically."""
-        temp_file = self.progress_file.with_suffix(".json.tmp")
-        with open(temp_file, "w") as f:
-            json.dump({"downloaded": list(self.downloaded)}, f)
-            f.flush()
-            os.fsync(f.fileno())
-        temp_file.replace(self.progress_file)
-
-    def download_user_data(self, bbox=None, convert_webp=False):
-        """Download all images for a user.
-
-        Args:
-            bbox: Optional bounding box [west, south, east, north]
-            convert_webp: Convert images to WebP format after download
-        """
-        if not self.username or not self.quality:
-            raise ValueError("Username and quality must be provided during initialization")
-
-        # Check if collection already exists on Internet Archive
-        if self.check_ia and self.collection_name:
-            logger.info(f"Checking if {self.collection_name} exists on Internet Archive...")
-            if check_ia_exists(self.collection_name):
-                logger.info("Collection already exists on archive.org, skipping download")
-                return
-
-        # Check if collection already exists in final destination
-        if self.final_dir.exists():
-            logger.info(f"Collection already exists at {self.final_dir}, skipping download")
-            return
-
-        quality_field = f"thumb_{self.quality}_url"
-
-        logger.info(f"Downloading images for user: {self.username}")
-        logger.info(f"Output directory: {self.output_dir}")
-        logger.info(f"Quality: {self.quality}")
-        logger.info(f"Using {self.workers} parallel workers")
-
-        processed = 0
-        downloaded_count = 0
-        skipped = 0
-        total_bytes = 0
-        failed_count = 0
-
-        start_time = time.time()
-
-        # Track which image IDs we've seen in metadata to avoid re-fetching
-        seen_ids = set()
-
-        # Collect images to download from existing metadata
-        images_to_download = []
-
-        if self.metadata_file.exists():
-            logger.info("Processing existing metadata file...")
-            with open(self.metadata_file) as f:
-                for line in f:
-                    if line.strip():
-                        image = json.loads(line)
-                        image_id = image["id"]
-                        seen_ids.add(image_id)
-                        processed += 1
-
-                        if image_id in self.downloaded:
-                            skipped += 1
-                            continue
-
-                        # Queue for download
-                        if image.get(quality_field):
-                            images_to_download.append(image)
-
-        # Download images from existing metadata in parallel
-        if images_to_download:
-            logger.info(f"Downloading {len(images_to_download)} images from existing metadata...")
-            downloaded_count, total_bytes, failed_count = self._download_images_parallel(
-                images_to_download, convert_webp
-            )
-
-        # Always check API for new images (will skip duplicates via seen_ids)
-        logger.info("Checking for new images from API...")
-        new_images = []
-
-        with open(self.metadata_file, "a") as meta_f:
-            for image in self.client.get_user_images(self.username, bbox=bbox):
-                image_id = image["id"]
-
-                # Skip if we already have this in our metadata file
-                if image_id in seen_ids:
-                    continue
-
-                seen_ids.add(image_id)
-                processed += 1
-
-                # Save new metadata
-                meta_f.write(json.dumps(image) + "\n")
-                meta_f.flush()
-
-                # Skip if already downloaded
-                if image_id in self.downloaded:
-                    skipped += 1
-                    continue
-
-                # Queue for download
-                if image.get(quality_field):
-                    new_images.append(image)
-
-        # Download new images in parallel
-        if new_images:
-            logger.info(f"Downloading {len(new_images)} new images...")
-            new_downloaded, new_bytes, new_failed = self._download_images_parallel(new_images, convert_webp)
-            downloaded_count += new_downloaded
-            total_bytes += new_bytes
-            failed_count += new_failed
-
-        self._save_progress()
-        elapsed = time.time() - start_time
-        logger.info(
-            f"Complete! Processed {processed} images, downloaded {downloaded_count} ({format_size(total_bytes)}), "
-            f"skipped {skipped}, failed {failed_count}"
-        )
-        logger.info(f"Total time: {format_time(elapsed)}")
-
-        # Tar sequence directories for efficient IA uploads
-        if self.tar_sequences:
-            tar_sequence_directories(self.output_dir)
-
-        # Gzip metadata.jsonl to save space
-        if self.metadata_file.exists():
-            logger.info("Compressing metadata.jsonl...")
-            original_size = self.metadata_file.stat().st_size
-            gzipped_file = self.metadata_file.with_suffix(".jsonl.gz")
-
-            with open(self.metadata_file, "rb") as f_in:
-                with gzip.open(gzipped_file, "wb", compresslevel=9) as f_out:
-                    shutil.copyfileobj(f_in, f_out)
-
-            compressed_size = gzipped_file.stat().st_size
-            self.metadata_file.unlink()
-
-            savings = 100 * (1 - compressed_size / original_size)
-            logger.info(
-                f"Compressed metadata: {format_size(original_size)} → {format_size(compressed_size)} "
-                f"({savings:.1f}% savings)"
-            )
-
-        # Generate IA metadata
-        generate_ia_metadata(self.output_dir)
-
-        # Move from staging to final destination
-        logger.info("Moving collection from staging to final destination...")
-        if self.final_dir.exists():
-            logger.warning(f"Destination already exists, removing: {self.final_dir}")
-            shutil.rmtree(self.final_dir)
-
-        self.final_dir.parent.mkdir(parents=True, exist_ok=True)
-        shutil.move(str(self.staging_dir), str(self.final_dir))
-        logger.info(f"Collection moved to: {self.final_dir}")
-
-    def _download_images_parallel(self, images, convert_webp):
-        """Download images in parallel using worker pool.
-
-        Args:
-            images: List of image metadata dicts
-            convert_webp: Whether to convert to WebP
-
-        Returns:
-            Tuple of (downloaded_count, total_bytes, failed_count)
-        """
-        downloaded_count = 0
-        total_bytes = 0
-        failed_count = 0
-        batch_start_time = time.time()
-
-        with ProcessPoolExecutor(max_workers=self.workers) as executor:
-            # Submit all tasks
-            future_to_image = {}
-            for image in images:
-                future = executor.submit(
-                    download_and_convert_image,
-                    image,
-                    str(self.output_dir),
-                    self.quality,
-                    convert_webp,
-                    self.client.access_token,
-                )
-                future_to_image[future] = image["id"]
-
-            # Process results as they complete
-            for future in as_completed(future_to_image):
-                image_id, bytes_dl, success, error_msg = future.result()
-
-                if success:
-                    self.downloaded.add(image_id)
-                    downloaded_count += 1
-                    total_bytes += bytes_dl
-
-                    if downloaded_count % 10 == 0:
-                        # Calculate ETA
-                        elapsed = time.time() - batch_start_time
-                        rate = downloaded_count / elapsed if elapsed > 0 else 0
-                        remaining = len(images) - downloaded_count
-                        eta_seconds = remaining / rate if rate > 0 else 0
-
-                        logger.info(
-                            f"Downloaded: {downloaded_count}/{len(images)} ({format_size(total_bytes)}) "
-                            f"- ETA: {format_time(eta_seconds)}"
-                        )
-                        self._save_progress()
-                else:
-                    failed_count += 1
-                    logger.warning(f"Failed to download {image_id}: {error_msg}")
-
-        return downloaded_count, total_bytes, failed_count
{mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/LICENSE.md
RENAMED
File without changes
{mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/README.md
RENAMED
File without changes
{mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/src/mapillary_downloader/__init__.py
RENAMED
File without changes
{mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/src/mapillary_downloader/__main__.py
RENAMED
File without changes
{mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/src/mapillary_downloader/client.py
RENAMED
File without changes
{mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/src/mapillary_downloader/exif_writer.py
RENAMED
File without changes
{mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/src/mapillary_downloader/ia_check.py
RENAMED
File without changes
{mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/src/mapillary_downloader/ia_meta.py
RENAMED
File without changes
{mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/src/mapillary_downloader/logging_config.py
RENAMED
File without changes
{mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/src/mapillary_downloader/tar_sequences.py
RENAMED
File without changes
{mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/src/mapillary_downloader/utils.py
RENAMED
File without changes
{mapillary_downloader-0.4.2 → mapillary_downloader-0.5.1}/src/mapillary_downloader/webp_converter.py
RENAMED
File without changes