mapillary-downloader 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mapillary_downloader/client.py +3 -5
- mapillary_downloader/downloader.py +80 -112
- mapillary_downloader/ia_check.py +3 -3
- mapillary_downloader/ia_stats.py +16 -8
- mapillary_downloader/logging_config.py +5 -0
- mapillary_downloader/metadata_reader.py +0 -47
- mapillary_downloader/utils.py +3 -5
- mapillary_downloader/webp_converter.py +4 -13
- mapillary_downloader/worker.py +1 -1
- {mapillary_downloader-0.8.0.dist-info → mapillary_downloader-0.9.0.dist-info}/METADATA +1 -1
- mapillary_downloader-0.9.0.dist-info/RECORD +21 -0
- mapillary_downloader-0.8.0.dist-info/RECORD +0 -21
- {mapillary_downloader-0.8.0.dist-info → mapillary_downloader-0.9.0.dist-info}/WHEEL +0 -0
- {mapillary_downloader-0.8.0.dist-info → mapillary_downloader-0.9.0.dist-info}/entry_points.txt +0 -0
- {mapillary_downloader-0.8.0.dist-info → mapillary_downloader-0.9.0.dist-info}/licenses/LICENSE.md +0 -0
mapillary_downloader/client.py
CHANGED
@@ -22,11 +22,12 @@ class MapillaryClient:
         self.session = requests.Session()
         self.session.headers.update({"Authorization": f"OAuth {access_token}"})

-    def get_user_images(self, username, bbox=None, limit=2000):
+    def get_user_images(self, username, quality, bbox=None, limit=2000):
         """Get images uploaded by a specific user.

         Args:
             username: Mapillary username
+            quality: Image quality (256, 1024, 2048, or original)
             bbox: Optional bounding box [west, south, east, north]
             limit: Number of results per page (max 2000)

@@ -56,10 +57,7 @@ class MapillaryClient:
                     "computed_rotation",
                     "height",
                     "width",
-                    "thumb_256_url",
-                    "thumb_1024_url",
-                    "thumb_2048_url",
-                    "thumb_original_url",
+                    f"thumb_{quality}_url",
                 ]
             ),
         }
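The quality parameter now selects a single thumbnail field at request time, so the API no longer returns every thumbnail URL for every image. A minimal sketch of the new call shape, assuming the 0.8.0 constructor signature (an access token); the token, username, bbox, and 2048 value are placeholders, not taken from this diff:

    from mapillary_downloader.client import MapillaryClient

    client = MapillaryClient("MLY|example-token")  # placeholder token
    for image in client.get_user_images("some_user", 2048, bbox=[13.3, 52.4, 13.5, 52.6]):
        url = image.get("thumb_2048_url")  # the only thumb_*_url field requested now

Since the fields list is built per request, asking for one thumb_{quality}_url shrinks both the query string and the JSON the API sends back.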
mapillary_downloader/downloader.py
CHANGED

@@ -5,8 +5,10 @@ import json
 import logging
 import os
 import shutil
+import threading
 import time
 from pathlib import Path
+import requests
 from mapillary_downloader.utils import format_size, format_time, safe_json_save
 from mapillary_downloader.ia_meta import generate_ia_metadata
 from mapillary_downloader.ia_check import check_ia_exists

@@ -146,6 +148,65 @@ class MapillaryDownloader:
         # Write atomically using utility function
         safe_json_save(self.progress_file, progress)

+    def _submit_metadata_batch(self, file_handle, quality_field, pool, convert_webp, process_results, base_submitted):
+        """Read metadata lines from current position, submit to workers.
+
+        Args:
+            file_handle: Open file positioned at read point
+            quality_field: Field name for quality URL (e.g., "thumb_1024_url")
+            pool: Worker pool to submit to
+            convert_webp: Whether to convert to webp
+            process_results: Callback to drain result queue
+            base_submitted: Running total for cumulative logging
+
+        Returns:
+            tuple: (submitted_count, skipped_count) for this batch
+        """
+        submitted = 0
+        skipped = 0
+
+        for line in file_handle:
+            line = line.strip()
+            if not line:
+                continue
+
+            try:
+                image = json.loads(line)
+            except json.JSONDecodeError:
+                continue
+
+            if image.get("__complete__"):
+                continue
+
+            image_id = image.get("id")
+            if not image_id:
+                continue
+
+            if image_id in self.downloaded:
+                skipped += 1
+                continue
+
+            if not image.get(quality_field):
+                continue
+
+            work_item = (
+                image,
+                str(self.output_dir),
+                self.quality,
+                convert_webp,
+                self.client.access_token,
+            )
+            pool.submit(work_item)
+            submitted += 1
+
+            total = base_submitted + submitted
+            if total % 1000 == 0:
+                logger.info(f"Queue: submitted {total:,} images")
+
+            process_results()
+
+        return submitted, skipped
+
     def download_user_data(self, bbox=None, convert_webp=False):
         """Download all images for a user using streaming queue-based architecture.

@@ -159,7 +220,7 @@ class MapillaryDownloader:
         # Check if collection already exists on Internet Archive
         if self.check_ia and self.collection_name:
             logger.info(f"Checking if {self.collection_name} exists on Internet Archive...")
-            if check_ia_exists(self.collection_name):
+            if check_ia_exists(requests.Session(), self.collection_name):
                 logger.info("Collection already exists on archive.org, skipping download")
                 return

@@ -187,13 +248,13 @@ class MapillaryDownloader:
         total_bytes = 0
         failed_count = 0
         submitted = 0
+        skipped_count = 0

         try:
             # Step 3a: Fetch metadata from API in parallel (write-only, don't block on queue)
-
-            import threading
+            api_fetch_complete = threading.Event()

-
+            if not api_complete:
                 new_images_count = [0]  # Mutable so thread can update it

             def fetch_api_metadata():

@@ -201,7 +262,7 @@ class MapillaryDownloader:
                 try:
                     logger.debug("API fetch thread starting")
                     with open(self.metadata_file, "a") as meta_f:
-                        for image in self.client.get_user_images(self.username, bbox=bbox):
+                        for image in self.client.get_user_images(self.username, self.quality, bbox=bbox):
                             new_images_count[0] += 1

                             # Save metadata (don't dedupe here, let the tailer handle it)

@@ -221,7 +282,7 @@ class MapillaryDownloader:
             api_thread = threading.Thread(target=fetch_api_metadata, daemon=True)
             api_thread.start()
         else:
-            api_fetch_complete = None
+            api_fetch_complete.set()

         # Step 3b: Tail metadata file and submit to workers
         logger.debug("Starting metadata tail and download queue feeder")

@@ -244,9 +305,10 @@ class MapillaryDownloader:
                     total_bytes += bytes_dl

                     # Log every download for first 10, then every 100
+                    total_downloaded = len(self.downloaded)
                     should_log = downloaded_count <= 10 or downloaded_count % 100 == 0
                     if should_log:
-                        logger.info(f"Downloaded: {downloaded_count:,} ({format_size(total_bytes)} this session)")
+                        logger.info(f"Downloaded: {total_downloaded:,} ({format_size(total_bytes)} this session)")

                     if downloaded_count % 100 == 0:
                         pool.check_throughput(downloaded_count)

@@ -260,117 +322,20 @@ class MapillaryDownloader:

             # Tail the metadata file and submit to workers
             while True:
-                # Check if API fetch is done and we've processed everything
-                if api_fetch_complete and api_fetch_complete.is_set():
-                    # Read any remaining lines
-                    if self.metadata_file.exists():
-                        with open(self.metadata_file) as f:
-                            f.seek(last_position)
-                            for line in f:
-                                line = line.strip()
-                                if not line:
-                                    continue
-
-                                try:
-                                    image = json.loads(line)
-                                except json.JSONDecodeError:
-                                    # Incomplete line, will retry
-                                    continue
-
-                                # Skip completion marker
-                                if image.get("__complete__"):
-                                    continue
-
-                                image_id = image.get("id")
-                                if not image_id:
-                                    continue
-
-                                # Skip if already downloaded or no quality URL
-                                if image_id in self.downloaded:
-                                    downloaded_count += 1
-                                    continue
-                                if not image.get(quality_field):
-                                    continue
-
-                                # Submit to workers
-                                work_item = (
-                                    image,
-                                    str(self.output_dir),
-                                    self.quality,
-                                    convert_webp,
-                                    self.client.access_token,
-                                )
-                                pool.submit(work_item)
-                                submitted += 1
-
-                                if submitted % 1000 == 0:
-                                    logger.info(f"Queue: submitted {submitted:,} images")
-
-                                # Process results while submitting
-                                process_results()
-
-                            last_position = f.tell()
-
-                    # API done and all lines processed, break
-                    break
-
-                # API still running or API was already complete, tail the file
                 if self.metadata_file.exists():
                     with open(self.metadata_file) as f:
                         f.seek(last_position)
-
-                        for line in f:
-                            line = line.strip()
-                            if not line:
-                                continue
-                            try:
-                                image = json.loads(line)
-                            except json.JSONDecodeError:
-                                # Incomplete line, will retry next iteration
-                                continue
-
-                            # Skip completion marker
-                            if image.get("__complete__"):
-                                continue
-
-                            image_id = image.get("id")
-                            if not image_id:
-                                continue
-
-                            # Skip if already downloaded or no quality URL
-                            if image_id in self.downloaded:
-                                downloaded_count += 1
-                                continue
-                            if not image.get(quality_field):
-                                continue
-
-                            # Submit to workers
-                            work_item = (
-                                image,
-                                str(self.output_dir),
-                                self.quality,
-                                convert_webp,
-                                self.client.access_token,
-                            )
-                            pool.submit(work_item)
-                            submitted += 1
-
-                            if submitted % 1000 == 0:
-                                logger.info(f"Queue: submitted {submitted:,} images")
-
-                            # Process results while submitting
-                            process_results()
-
+                        batch_submitted, batch_skipped = self._submit_metadata_batch(
+                            f, quality_field, pool, convert_webp, process_results, submitted
+                        )
+                        submitted += batch_submitted
+                        skipped_count += batch_skipped
                         last_position = f.tell()

-
-                if api_fetch_complete is None:
+                if api_fetch_complete.is_set():
                     break

-                # Sleep briefly before next tail iteration
                 time.sleep(0.1)
-
-                # Process any results that came in
                 process_results()

             # Send shutdown signals

@@ -397,7 +362,7 @@ class MapillaryDownloader:
                     total_bytes += bytes_dl

                     if downloaded_count % 100 == 0:
-                        logger.info(f"Downloaded: {downloaded_count:,} ({format_size(total_bytes)} this session)")
+                        logger.info(f"Downloaded: {len(self.downloaded):,} ({format_size(total_bytes)} this session)")
                         pool.check_throughput(downloaded_count)
             # Save progress every 5 minutes
             if time.time() - self._last_save_time >= 300:

@@ -414,7 +379,10 @@ class MapillaryDownloader:
             self._save_progress()
         elapsed = time.time() - start_time

-        logger.info(
+        logger.info(
+            f"Complete! Downloaded {downloaded_count:,} this session ({format_size(total_bytes)}), "
+            f"{len(self.downloaded):,} total, skipped {skipped_count:,}, failed {failed_count:,}"
+        )
         logger.info(f"Total time: {format_time(elapsed)}")

         # Tar sequence directories for efficient IA uploads
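The 0.9.0 change collapses two nearly identical inline loops (one for the "API finished" final drain, one for the live tail) into a single _submit_metadata_batch call. Underneath is a plain seek/tell tail of a JSONL file coordinated with a threading.Event. A self-contained toy sketch of that pattern, with the file name and payloads invented for illustration:

    import json
    import threading
    import time
    from pathlib import Path

    metadata_file = Path("metadata.jsonl")  # stand-in for the downloader's metadata file
    fetch_complete = threading.Event()

    def producer():
        """Append records the way the API fetch thread does, then signal completion."""
        with open(metadata_file, "a") as f:
            for i in range(5):
                f.write(json.dumps({"id": str(i)}) + "\n")
                f.flush()
                time.sleep(0.05)
        fetch_complete.set()

    threading.Thread(target=producer, daemon=True).start()

    last_position = 0
    while True:
        # Snapshot the event *before* draining: if it was already set, every
        # producer write happened before this point, so the drain sees them all.
        done = fetch_complete.is_set()
        if metadata_file.exists():
            with open(metadata_file) as f:
                f.seek(last_position)        # resume where the previous batch stopped
                for line in f:
                    line = line.strip()
                    if line:
                        print("submit:", json.loads(line)["id"])
                last_position = f.tell()     # remember progress for the next pass
        if done:
            break
        time.sleep(0.1)

Each pass of the while loop is one "batch"; in the real code that body is _submit_metadata_batch, which additionally dedupes against self.downloaded and feeds the worker pool.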
mapillary_downloader/ia_check.py
CHANGED
@@ -6,20 +6,20 @@ import requests

 logger = logging.getLogger("mapillary_downloader")


-def check_ia_exists(collection_name):
+def check_ia_exists(session, collection_name):
     """Check if a collection exists on Internet Archive.

     Args:
+        session: requests.Session for connection pooling
         collection_name: Name of the collection (e.g., mapillary-username-original-webp)

     Returns:
         Boolean indicating if the collection exists on IA
     """
-    # IA identifier format
     ia_url = f"https://archive.org/metadata/{collection_name}"

     try:
-        response = requests.get(ia_url, timeout=10)
+        response = session.get(ia_url, timeout=10)
         # If we get a 200, the item exists
         if response.status_code == 200:
             data = response.json()
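check_ia_exists no longer opens its own connection; the caller supplies the Session. A hedged usage sketch (the collection names are placeholders):

    import requests
    from mapillary_downloader.ia_check import check_ia_exists

    with requests.Session() as session:
        for name in ["mapillary-alice-original-webp", "mapillary-bob-original-webp"]:
            if check_ia_exists(session, name):
                print(f"{name} already on archive.org, skipping")

With a shared Session, repeated archive.org lookups reuse one pooled HTTPS connection instead of re-handshaking on every call.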
mapillary_downloader/ia_stats.py
CHANGED
@@ -3,6 +3,7 @@
 import json
 import logging
 import re
+import requests
 from mapillary_downloader.utils import safe_json_save, http_get_with_retry, format_size
 from mapillary_downloader.downloader import get_cache_dir

@@ -11,9 +12,12 @@ logger = logging.getLogger("mapillary_downloader")
 CACHE_FILE = get_cache_dir() / ".stats.json"


-def search_ia_collections():
+def search_ia_collections(session):
     """Search IA for all mapillary_downloader collections.

+    Args:
+        session: requests.Session for connection pooling
+
     Returns:
         List of dicts with: identifier, description, item_size, collection
     """

@@ -27,7 +31,7 @@ def search_ia_collections():
         "output": "json",
     }

-    response = http_get_with_retry(url, params=params, max_retries=3)
+    response = http_get_with_retry(session, url, params=params, max_retries=3)
     data = response.json()

     collections = data["response"]["docs"]

@@ -36,10 +40,11 @@ def search_ia_collections():
     return collections


-def fetch_uploader(identifier):
+def fetch_uploader(session, identifier):
     """Fetch uploader email from item metadata.

     Args:
+        session: requests.Session for connection pooling
         identifier: IA item identifier

     Returns:

@@ -47,7 +52,7 @@ def fetch_uploader(identifier):
     """
     url = f"https://archive.org/metadata/{identifier}/metadata/uploader"
     try:
-        response = http_get_with_retry(url, max_retries=2)
+        response = http_get_with_retry(session, url, max_retries=2)
         data = response.json()
         return data.get("result")
     except Exception:

@@ -195,10 +200,11 @@ def aggregate_stats(cache):
     return stats


-def format_stats(stats, cache):
+def format_stats(session, stats, cache):
     """Format statistics as human-readable text.

     Args:
+        session: requests.Session for connection pooling
         stats: Dict from aggregate_stats()
         cache: Dict of collection data

@@ -257,7 +263,7 @@ def format_stats(stats, cache):
     logger.info(f"Fetching uploader info for {len(need_uploader_fetch)} items...")
     for i, identifier in enumerate(need_uploader_fetch, 1):
         logger.info(f"  [{i}/{len(need_uploader_fetch)}] {identifier}")
-        uploader = fetch_uploader(identifier)
+        uploader = fetch_uploader(session, identifier)
         if uploader:
             cache[identifier]["uploader"] = uploader
     # Save updated cache with uploaders

@@ -307,9 +313,11 @@ def show_stats(refresh=True):
     Args:
         refresh: If True, fetch fresh data from IA. If False, use cache only.
     """
+    session = requests.Session()
+
     if refresh:
         try:
-            ia_collections = search_ia_collections()
+            ia_collections = search_ia_collections(session)
             cache = update_cache(ia_collections)
         except Exception as e:
             logger.error(f"Failed to fetch IA data: {e}")

@@ -323,4 +331,4 @@ def show_stats(refresh=True):
         return

     stats = aggregate_stats(cache)
-    print(format_stats(stats, cache))
+    print(format_stats(session, stats, cache))
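The thread through this file is the same: show_stats() now owns a single requests.Session and passes it down, so the collection search, the per-item uploader fetches, and anything format_stats() triggers all share one connection pool. A sketch of the resulting call chain (update_cache and aggregate_stats are used exactly as in the surrounding context lines):

    import requests
    from mapillary_downloader import ia_stats

    session = requests.Session()
    collections = ia_stats.search_ia_collections(session)
    cache = ia_stats.update_cache(collections)
    stats = ia_stats.aggregate_stats(cache)
    print(ia_stats.format_stats(session, stats, cache))  # may call fetch_uploader(session, ...)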
mapillary_downloader/logging_config.py
CHANGED

@@ -15,6 +15,7 @@ class ColoredFormatter(logging.Formatter):
         "DEBUG": "\033[94m",  # Blue
         "RESET": "\033[0m",
     }
+    CYAN = "\033[96m"

     def __init__(self, fmt=None, datefmt=None, use_color=True):
         """Initialize the formatter.

@@ -41,6 +42,10 @@ class ColoredFormatter(logging.Formatter):
         if levelname in self.COLORS:
             record.levelname = f"{self.COLORS[levelname]}{levelname}{self.COLORS['RESET']}"

+        # Color API messages differently so they stand out
+        if record.msg.startswith("API"):
+            record.msg = f"{self.CYAN}{record.msg}{self.COLORS['RESET']}"
+
         return super().format(record)
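A quick way to see the new highlighting; the handler wiring here is illustrative, not the package's own setup code:

    import logging
    from mapillary_downloader.logging_config import ColoredFormatter

    handler = logging.StreamHandler()
    handler.setFormatter(ColoredFormatter("%(levelname)s %(message)s"))
    logger = logging.getLogger("mapillary_downloader")
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)

    logger.info("API fetch thread starting")  # starts with "API" -> wrapped in cyan
    logger.info("Downloaded: 1,000")          # only the level name is colored

Note the check is on record.msg, the raw format string, so only messages that literally begin with "API" get the cyan wrap.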
mapillary_downloader/metadata_reader.py
CHANGED

@@ -65,53 +65,6 @@ class MetadataReader:
         except Exception:
             return False

-    def iter_images(self, quality_field=None, downloaded_ids=None):
-        """Stream images from metadata file with filtering.
-
-        Args:
-            quality_field: Optional field to check exists (e.g., 'thumb_1024_url')
-            downloaded_ids: Optional set of already downloaded IDs to skip
-
-        Yields:
-            Image metadata dicts that pass filters
-        """
-        if not self.metadata_file.exists():
-            return
-
-        # Handle gzipped files
-        if self.metadata_file.suffix == ".gz":
-            file_handle = gzip.open(self.metadata_file, "rt")
-        else:
-            file_handle = open(self.metadata_file)
-
-        with file_handle as f:
-            for line in f:
-                line = line.strip()
-                if not line:
-                    continue
-
-                image = json.loads(line)
-
-                # Check for completion marker
-                if image.get("__complete__"):
-                    self.is_complete = True
-                    logger.debug("Found API fetch completion marker")
-                    continue
-
-                image_id = image.get("id")
-                if not image_id:
-                    continue
-
-                # Filter by downloaded status
-                if downloaded_ids and image_id in downloaded_ids:
-                    continue
-
-                # Filter by quality field availability
-                if quality_field and not image.get(quality_field):
-                    continue
-
-                yield image
-
     def get_all_ids(self):
         """Get set of all image IDs in metadata file.
mapillary_downloader/utils.py
CHANGED
@@ -5,7 +5,6 @@ import logging
 import os
 import time
 from pathlib import Path
-import requests
 from requests.exceptions import RequestException

 logger = logging.getLogger("mapillary_downloader")

@@ -77,16 +76,16 @@ def safe_json_save(file_path, data):
     temp_file.replace(file_path)


-def http_get_with_retry(url, params=None, max_retries=5, base_delay=1.0, timeout=60, session=None):
+def http_get_with_retry(session, url, params=None, max_retries=5, base_delay=1.0, timeout=60):
     """HTTP GET with exponential backoff retry.

     Args:
+        session: requests.Session for connection pooling
         url: URL to fetch
         params: Optional query parameters
         max_retries: Maximum retry attempts (default: 5)
         base_delay: Initial delay in seconds (default: 1.0)
         timeout: Request timeout in seconds (default: 60)
-        session: Optional requests.Session for connection pooling

     Returns:
         requests.Response object

@@ -94,10 +93,9 @@ def http_get_with_retry(url, params=None, max_retries=5, base_delay=1.0, timeout=60, session=None):
     Raises:
         requests.RequestException: If all retries exhausted
     """
-    getter = session or requests
     for attempt in range(max_retries):
         try:
-            response = getter.get(url, params=params, timeout=timeout)
+            response = session.get(url, params=params, timeout=timeout)
             response.raise_for_status()
             return response
         except RequestException as e:
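Only the session handling changed here; the backoff logic is untouched. For reference, a sketch of the exponential-backoff shape these defaults imply (a standalone rewrite under assumed behavior, not the package's exact body):

    import time
    import requests
    from requests.exceptions import RequestException

    def get_with_backoff(session, url, params=None, max_retries=5, base_delay=1.0, timeout=60):
        """GET with exponential backoff: sleeps 1s, 2s, 4s, ... between attempts."""
        for attempt in range(max_retries):
            try:
                response = session.get(url, params=params, timeout=timeout)
                response.raise_for_status()
                return response
            except RequestException:
                if attempt == max_retries - 1:
                    raise  # retries exhausted, let the caller see the failure
                time.sleep(base_delay * (2 ** attempt))

    session = requests.Session()
    # response = get_with_backoff(session, "https://archive.org/metadata/some-item")

Making session the first required parameter (instead of a trailing session=None) turns a forgotten session into an immediate TypeError rather than a silently unpooled request.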
mapillary_downloader/webp_converter.py
CHANGED

@@ -17,25 +17,20 @@ def check_cwebp_available():
     return shutil.which("cwebp") is not None


-def convert_to_webp(jpg_path, output_path=None, delete_original=True):
+def convert_to_webp(jpg_path, output_path, delete_original=True):
     """Convert a JPG image to WebP format, preserving EXIF metadata.

     Args:
         jpg_path: Path to the JPG file
-        output_path:
+        output_path: Path for the WebP output
         delete_original: Whether to delete the original JPG after conversion (default: True)

     Returns:
         Path object to the new WebP file, or None if conversion failed
     """
     jpg_path = Path(jpg_path)
-
-    if output_path is None:
-        webp_path = jpg_path.with_suffix(".webp")
-    else:
-        webp_path = Path(output_path)
-        # Ensure output directory exists
-        webp_path.parent.mkdir(parents=True, exist_ok=True)
+    webp_path = Path(output_path)
+    webp_path.parent.mkdir(parents=True, exist_ok=True)

     try:
         # Convert with cwebp, preserving all metadata

@@ -43,7 +38,6 @@ def convert_to_webp(jpg_path, output_path=None, delete_original=True):
             ["cwebp", "-metadata", "all", str(jpg_path), "-o", str(webp_path)],
             capture_output=True,
             text=True,
-            timeout=60,
         )

     if result.returncode != 0:

@@ -55,9 +49,6 @@ def convert_to_webp(jpg_path, output_path=None, delete_original=True):
         jpg_path.unlink()
         return webp_path

-    except subprocess.TimeoutExpired:
-        logger.error(f"cwebp conversion timed out for {jpg_path}")
-        return None
     except Exception as e:
         logger.error(f"Error converting {jpg_path} to WebP: {e}")
         return None
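output_path is now required, so the old with_suffix(".webp") fallback is gone, and removing the 60-second timeout means a hung cwebp will block until the process exits. Hedged usage (the paths are placeholders):

    from mapillary_downloader.webp_converter import convert_to_webp

    # Destination is mandatory; parent directories are created automatically.
    webp = convert_to_webp("downloads/seq1/img001.jpg", "downloads/seq1/img001.webp")
    if webp is None:
        print("conversion failed; the original JPG is only deleted on success")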
mapillary_downloader/worker.py
CHANGED
@@ -106,7 +106,7 @@ def download_and_convert_image(image_data, output_dir, quality, convert_webp, session):

     try:
         # Use retry logic with 3 attempts for image downloads
-        response = http_get_with_retry(image_url, max_retries=3, base_delay=1.0, timeout=60, session=session)
+        response = http_get_with_retry(session, image_url, max_retries=3, base_delay=1.0, timeout=60)

         with open(jpg_path, "wb") as f:
             for chunk in response.iter_content(chunk_size=8192):
mapillary_downloader-0.9.0.dist-info/RECORD
ADDED

@@ -0,0 +1,21 @@
+mapillary_downloader/__init__.py,sha256=KEjiBRghXDeA7E15RJeLBfQm-yNJkowZarL59QOh_1w,120
+mapillary_downloader/__main__.py,sha256=iuDGZoFVu8q_dTvJuExSpj4Jx1x9xASSjUITRGwd0RA,4864
+mapillary_downloader/client.py,sha256=nZTcSnRNdWHjqENSOtXA3FXKfHXZQT6QPWWS59mtSyM,4707
+mapillary_downloader/downloader.py,sha256=9zpmfO3oaGQVJ52fkaCtpwNkx4vyP_0gzS4GVQ3M4I4,16788
+mapillary_downloader/exif_writer.py,sha256=Dwh6RFotCnI4HVRQNqmsuWy05BPQYh3tMOQlKUk0gL8,5213
+mapillary_downloader/ia_check.py,sha256=MmwN9QWevfSlE_XBb4Wzwbm9R6mjICLvlET7M_Ncy0A,1138
+mapillary_downloader/ia_meta.py,sha256=DTmFwIKN03aNgBaerQWF5x_hveDpjvrMBTdRAgHoFRk,6365
+mapillary_downloader/ia_stats.py,sha256=4bqXl_o6CIZX8lX9Sgl2qGafL1j7mWehRMRDXZnV-so,10978
+mapillary_downloader/logging_config.py,sha256=dkhmHY-keCQzfv8LNXr52BkJNo4mwGiogPKoED3ZdMw,2533
+mapillary_downloader/metadata_reader.py,sha256=YUVdsQtI6D8r5tbhcq5A4CgMBNGDVBLelVzeC7JgMoA,3422
+mapillary_downloader/tar_sequences.py,sha256=hh77hfj0DFSPrPRfbGrOhvnZMctKESgO0gSpJXUxCCs,4886
+mapillary_downloader/utils.py,sha256=MP_TEXwoDWb-a7vQA9F2t79w3AOHU5u7OHKiLCmjhls,2996
+mapillary_downloader/webp_converter.py,sha256=fvl5YqTh4zh7ESAJ1ONmRU6uioB2km3nv0hUKQ1jFzs,1555
+mapillary_downloader/worker.py,sha256=bVfVqzmvSIzu5OGGxbWcFfwWjq2TNhzdiFD8ZoWXkLc,5327
+mapillary_downloader/worker_pool.py,sha256=QnqYcPCi3GNu2e8GNG_qQ8v680PWzCZcGE5KeskqZxU,7868
+mapillary_downloader/xmp_writer.py,sha256=6kjAP3JpqVnknDETgjd8Ze-P7c1kMbmvuQ14GF0dMoA,5163
+mapillary_downloader-0.9.0.dist-info/entry_points.txt,sha256=PdYtxOXHMJrUhmiPO4G-F98VuhUI4MN9D_T4KPrVZ5w,75
+mapillary_downloader-0.9.0.dist-info/licenses/LICENSE.md,sha256=7_BIuQ-veOrsF-WarH8kTkm0-xrCLvJ1PFE1C4Ebs64,146
+mapillary_downloader-0.9.0.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
+mapillary_downloader-0.9.0.dist-info/METADATA,sha256=z0RrCNVh8S3Uom8anffB8ieILTP-I_mqx1bu8rJJvvw,5791
+mapillary_downloader-0.9.0.dist-info/RECORD,,
mapillary_downloader-0.8.0.dist-info/RECORD
DELETED

@@ -1,21 +0,0 @@
-mapillary_downloader/__init__.py,sha256=KEjiBRghXDeA7E15RJeLBfQm-yNJkowZarL59QOh_1w,120
-mapillary_downloader/__main__.py,sha256=iuDGZoFVu8q_dTvJuExSpj4Jx1x9xASSjUITRGwd0RA,4864
-mapillary_downloader/client.py,sha256=a5n43FLHP45EHodEjl0ieziBK-b6Ey-rZJwYB6EFhNI,4743
-mapillary_downloader/downloader.py,sha256=l6MT3dFOB-lZfoLEVVGIkioKSXcDu30Q9xc2MZ17iGI,18897
-mapillary_downloader/exif_writer.py,sha256=Dwh6RFotCnI4HVRQNqmsuWy05BPQYh3tMOQlKUk0gL8,5213
-mapillary_downloader/ia_check.py,sha256=L2MEbG_KmlAd5NLmo2HQkO8HWvRN0brE5wXXoyNMbq8,1100
-mapillary_downloader/ia_meta.py,sha256=DTmFwIKN03aNgBaerQWF5x_hveDpjvrMBTdRAgHoFRk,6365
-mapillary_downloader/ia_stats.py,sha256=kjbNUVXtZziWxTx1yi2TLTZt_F0BWjrv1WWyy6ZeCLY,10678
-mapillary_downloader/logging_config.py,sha256=Z-wNq34nt7aIhJWdeKc1feTY46P9-Or7HtiX7eUFjEI,2324
-mapillary_downloader/metadata_reader.py,sha256=Re-HN0Vfc7Hs1eOut7uOoW7jWJ2PIbKoNzC7Ak3ah5o,4933
-mapillary_downloader/tar_sequences.py,sha256=hh77hfj0DFSPrPRfbGrOhvnZMctKESgO0gSpJXUxCCs,4886
-mapillary_downloader/utils.py,sha256=qQ_ewhN0b0r4KLfBtf9tjwewF9PHVF1swLt71t8x9F0,3058
-mapillary_downloader/webp_converter.py,sha256=vYLLQxDmdnqRz0nm7wXwRUd4x9mQZNah-DrncpA8sNs,1901
-mapillary_downloader/worker.py,sha256=rMqeDj5pfLoEPwKOGN28R7yMZ_XDSzLayrL5ht0cqN0,5335
-mapillary_downloader/worker_pool.py,sha256=QnqYcPCi3GNu2e8GNG_qQ8v680PWzCZcGE5KeskqZxU,7868
-mapillary_downloader/xmp_writer.py,sha256=6kjAP3JpqVnknDETgjd8Ze-P7c1kMbmvuQ14GF0dMoA,5163
-mapillary_downloader-0.8.0.dist-info/entry_points.txt,sha256=PdYtxOXHMJrUhmiPO4G-F98VuhUI4MN9D_T4KPrVZ5w,75
-mapillary_downloader-0.8.0.dist-info/licenses/LICENSE.md,sha256=7_BIuQ-veOrsF-WarH8kTkm0-xrCLvJ1PFE1C4Ebs64,146
-mapillary_downloader-0.8.0.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
-mapillary_downloader-0.8.0.dist-info/METADATA,sha256=cF1WSbpDbCjL9zCs2R8s8w1768-qfpUWgG_R1mL_q_w,5791
-mapillary_downloader-0.8.0.dist-info/RECORD,,

{mapillary_downloader-0.8.0.dist-info → mapillary_downloader-0.9.0.dist-info}/WHEEL
RENAMED
File without changes
{mapillary_downloader-0.8.0.dist-info → mapillary_downloader-0.9.0.dist-info}/entry_points.txt
RENAMED
File without changes

{mapillary_downloader-0.8.0.dist-info → mapillary_downloader-0.9.0.dist-info}/licenses/LICENSE.md
RENAMED
File without changes