mapillary-downloader 0.7.0__tar.gz → 0.7.3__tar.gz
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- {mapillary_downloader-0.7.0 → mapillary_downloader-0.7.3}/PKG-INFO +2 -2
- {mapillary_downloader-0.7.0 → mapillary_downloader-0.7.3}/README.md +1 -1
- {mapillary_downloader-0.7.0 → mapillary_downloader-0.7.3}/pyproject.toml +1 -1
- {mapillary_downloader-0.7.0 → mapillary_downloader-0.7.3}/src/mapillary_downloader/__main__.py +2 -2
- {mapillary_downloader-0.7.0 → mapillary_downloader-0.7.3}/src/mapillary_downloader/downloader.py +22 -45
- mapillary_downloader-0.7.3/src/mapillary_downloader/graphql_web.py +193 -0
- {mapillary_downloader-0.7.0 → mapillary_downloader-0.7.3}/src/mapillary_downloader/ia_meta.py +1 -1
- {mapillary_downloader-0.7.0 → mapillary_downloader-0.7.3}/src/mapillary_downloader/ia_stats.py +91 -7
- {mapillary_downloader-0.7.0 → mapillary_downloader-0.7.3}/src/mapillary_downloader/worker_pool.py +6 -14
- {mapillary_downloader-0.7.0 → mapillary_downloader-0.7.3}/LICENSE.md +0 -0
- {mapillary_downloader-0.7.0 → mapillary_downloader-0.7.3}/src/mapillary_downloader/__init__.py +0 -0
- {mapillary_downloader-0.7.0 → mapillary_downloader-0.7.3}/src/mapillary_downloader/client.py +0 -0
- {mapillary_downloader-0.7.0 → mapillary_downloader-0.7.3}/src/mapillary_downloader/exif_writer.py +0 -0
- {mapillary_downloader-0.7.0 → mapillary_downloader-0.7.3}/src/mapillary_downloader/ia_check.py +0 -0
- {mapillary_downloader-0.7.0 → mapillary_downloader-0.7.3}/src/mapillary_downloader/logging_config.py +0 -0
- {mapillary_downloader-0.7.0 → mapillary_downloader-0.7.3}/src/mapillary_downloader/metadata_reader.py +0 -0
- {mapillary_downloader-0.7.0 → mapillary_downloader-0.7.3}/src/mapillary_downloader/tar_sequences.py +0 -0
- {mapillary_downloader-0.7.0 → mapillary_downloader-0.7.3}/src/mapillary_downloader/utils.py +0 -0
- {mapillary_downloader-0.7.0 → mapillary_downloader-0.7.3}/src/mapillary_downloader/webp_converter.py +0 -0
- {mapillary_downloader-0.7.0 → mapillary_downloader-0.7.3}/src/mapillary_downloader/worker.py +0 -0
{mapillary_downloader-0.7.0 → mapillary_downloader-0.7.3}/PKG-INFO
RENAMED

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mapillary_downloader
-Version: 0.7.0
+Version: 0.7.3
 Summary: Archive user data from Mapillary
 Author-email: Gareth Davidson <gaz@bitplane.net>
 Requires-Python: >=3.10
@@ -66,7 +66,7 @@ mapillary-downloader --output ./downloads USERNAME1
 | `--quality` | 256, 1024, 2048 or original | `original` |
 | `--bbox` | `west,south,east,north` | `None` |
 | `--no-webp` | Don't convert to WebP | `False` |
-| `--max-workers` | Maximum number of parallel download workers |
+| `--max-workers` | Maximum number of parallel download workers | CPU count |
 | `--no-tar` | Don't tar bucket directories | `False` |
 | `--no-check-ia` | Don't check if exists on Internet Archive | `False` |
```
{mapillary_downloader-0.7.0 → mapillary_downloader-0.7.3}/README.md
RENAMED

```diff
@@ -36,7 +36,7 @@ mapillary-downloader --output ./downloads USERNAME1
 | `--quality` | 256, 1024, 2048 or original | `original` |
 | `--bbox` | `west,south,east,north` | `None` |
 | `--no-webp` | Don't convert to WebP | `False` |
-| `--max-workers` | Maximum number of parallel download workers |
+| `--max-workers` | Maximum number of parallel download workers | CPU count |
 | `--no-tar` | Don't tar bucket directories | `False` |
 | `--no-check-ia` | Don't check if exists on Internet Archive | `False` |
```
{mapillary_downloader-0.7.0 → mapillary_downloader-0.7.3}/src/mapillary_downloader/__main__.py
RENAMED

```diff
@@ -43,8 +43,8 @@ def main():
     parser.add_argument(
         "--max-workers",
         type=int,
-        default=
-        help="Maximum number of parallel workers (default:
+        default=os.cpu_count() or 8,
+        help=f"Maximum number of parallel workers (default: CPU count = {os.cpu_count() or 8})",
     )
     parser.add_argument(
         "--no-tar",
```
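For context on the new default: `os.cpu_count()` can return `None` (for example in some restricted containers), which is why the diff guards it with `or 8`. A minimal standalone sketch of the same pattern (the surrounding parser setup outside the hunk is assumed):

```python
import argparse
import os

parser = argparse.ArgumentParser(prog="mapillary-downloader")
parser.add_argument(
    "--max-workers",
    type=int,
    # os.cpu_count() may return None, so fall back to 8
    default=os.cpu_count() or 8,
    help=f"Maximum number of parallel workers (default: CPU count = {os.cpu_count() or 8})",
)

args = parser.parse_args(["--max-workers", "4"])
print(args.max_workers)  # -> 4
```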
{mapillary_downloader-0.7.0 → mapillary_downloader-0.7.3}/src/mapillary_downloader/downloader.py
RENAMED

```diff
@@ -67,7 +67,6 @@ class MapillaryDownloader:
         self.username = username
         self.quality = quality
         self.max_workers = max_workers
-        self.initial_workers = os.cpu_count() or 1  # Start with CPU count
         self.tar_sequences = tar_sequences
         self.convert_webp = convert_webp
         self.check_ia = check_ia
@@ -100,7 +99,7 @@
         # Set up file logging for archival with timestamp for incremental runs
         timestamp = time.strftime("%Y%m%d-%H%M%S")
         log_file = self.output_dir / f"download.log.{timestamp}"
-        add_file_handler(log_file)
+        self.file_handler = add_file_handler(log_file)
         logger.info(f"Logging to: {log_file}")

         self.metadata_file = self.output_dir / "metadata.jsonl"
@@ -170,27 +169,16 @@

         quality_field = f"thumb_{self.quality}_url"

-        logger.info(f"Downloading
-        logger.info(f"Output directory: {self.output_dir}")
-        logger.info(f"Quality: {self.quality}")
-        logger.info(f"Worker pool: {self.initial_workers} initial, {self.max_workers} max")
+        logger.info(f"Downloading {self.username} @ {self.quality} (max {self.max_workers} workers)")

         start_time = time.time()

         # Step 1: Check if API fetch is already complete
         reader = MetadataReader(self.metadata_file)
         api_complete = reader.is_complete
-        if api_complete:
-            logger.info("API fetch already complete, will only download")
-        else:
-            logger.info("API fetch incomplete, will fetch and download in parallel")

         # Step 2: Start worker pool
-
-        # Start with CPU count and scale up based on throughput
-        pool = AdaptiveWorkerPool(
-            worker_process, min_workers=self.initial_workers, max_workers=self.max_workers, monitoring_interval=10
-        )
+        pool = AdaptiveWorkerPool(worker_process, max_workers=self.max_workers, monitoring_interval=10)
         pool.start()

         # Step 3: Download images from metadata file while fetching new from API
@@ -198,9 +186,6 @@
         total_bytes = 0
         failed_count = 0
         submitted = 0
-        batch_start = time.time()
-
-        logger.info("Starting parallel download and API fetch...")

         try:
             # Step 3a: Fetch metadata from API in parallel (write-only, don't block on queue)
@@ -213,7 +198,7 @@
             def fetch_api_metadata():
                 """Fetch metadata from API and write to file (runs in thread)."""
                 try:
-                    logger.
+                    logger.debug("API fetch thread starting")
                     with open(self.metadata_file, "a") as meta_f:
                         for image in self.client.get_user_images(self.username, bbox=bbox):
                             new_images_count[0] += 1
@@ -223,11 +208,11 @@
                             meta_f.flush()

                             if new_images_count[0] % 1000 == 0:
-                                logger.info(f"API:
+                                logger.info(f"API: fetched {new_images_count[0]:,} image URLs")

                     # Mark as complete
                     MetadataReader.mark_complete(self.metadata_file)
-                    logger.info(f"API fetch complete: {new_images_count[0]} images")
+                    logger.info(f"API fetch complete: {new_images_count[0]:,} images")
                 finally:
                     api_fetch_complete.set()
@@ -235,11 +220,10 @@
                 api_thread = threading.Thread(target=fetch_api_metadata, daemon=True)
                 api_thread.start()
             else:
-                logger.info("API fetch already complete, skipping API thread")
                 api_fetch_complete = None

             # Step 3b: Tail metadata file and submit to workers
-            logger.
+            logger.debug("Starting metadata tail and download queue feeder")
             last_position = 0

             # Helper to process results from queue
@@ -261,12 +245,7 @@
                     # Log every download for first 10, then every 100
                     should_log = downloaded_count <= 10 or downloaded_count % 100 == 0
                     if should_log:
-
-                        rate = downloaded_count / elapsed if elapsed > 0 else 0
-                        logger.info(
-                            f"Downloaded: {downloaded_count} ({format_size(total_bytes)}) "
-                            f"- Rate: {rate:.1f} images/sec"
-                        )
+                        logger.info(f"Downloaded: {downloaded_count:,} ({format_size(total_bytes)})")

                     if downloaded_count % 100 == 0:
                         self._save_progress()
@@ -304,6 +283,7 @@

                 # Skip if already downloaded or no quality URL
                 if image_id in self.downloaded:
+                    downloaded_count += 1
                     continue
                 if not image.get(quality_field):
                     continue
@@ -320,7 +300,7 @@
                 submitted += 1

                 if submitted % 1000 == 0:
-                    logger.info(f"Queue:
+                    logger.info(f"Queue: submitted {submitted:,} images")

                 # Process results while submitting
                 process_results()
@@ -355,6 +335,7 @@

                 # Skip if already downloaded or no quality URL
                 if image_id in self.downloaded:
+                    downloaded_count += 1
                     continue
                 if not image.get(quality_field):
                     continue
@@ -371,7 +352,7 @@
                 submitted += 1

                 if submitted % 1000 == 0:
-                    logger.info(f"Queue:
+                    logger.info(f"Queue: submitted {submitted:,} images")

                 # Process results while submitting
                 process_results()
@@ -389,7 +370,7 @@
             process_results()

             # Send shutdown signals
-            logger.
+            logger.debug(f"Submitted {submitted:,} images, waiting for workers")
             for _ in range(pool.current_workers):
                 pool.submit(None)
@@ -411,16 +392,8 @@
                     downloaded_count += 1
                     total_bytes += bytes_dl

-                    if downloaded_count %
-
-                        rate = downloaded_count / elapsed if elapsed > 0 else 0
-                        remaining = submitted - completed
-                        eta_seconds = remaining / rate if rate > 0 else 0
-
-                        logger.info(
-                            f"Downloaded: {downloaded_count}/{submitted} ({format_size(total_bytes)}) "
-                            f"- ETA: {format_time(eta_seconds)}"
-                        )
+                    if downloaded_count % 100 == 0:
+                        logger.info(f"Downloaded: {downloaded_count:,} ({format_size(total_bytes)})")
                         self._save_progress()
                         pool.check_throughput(downloaded_count)
                 else:
@@ -434,7 +407,7 @@
         self._save_progress()
         elapsed = time.time() - start_time

-        logger.info(f"Complete! Downloaded {downloaded_count} ({format_size(total_bytes)}),
+        logger.info(f"Complete! Downloaded {downloaded_count:,} ({format_size(total_bytes)}), failed {failed_count:,}")
         logger.info(f"Total time: {format_time(elapsed)}")

         # Tar sequence directories for efficient IA uploads
@@ -463,12 +436,16 @@
         # Generate IA metadata
         generate_ia_metadata(self.output_dir)

+        # Close log file handler before moving directory
+        self.file_handler.close()
+        logger.removeHandler(self.file_handler)
+
         # Move from staging to final destination
-        logger.info("Moving
+        logger.info("Moving to final destination...")
         if self.final_dir.exists():
             logger.warning(f"Destination already exists, removing: {self.final_dir}")
             shutil.rmtree(self.final_dir)

         self.final_dir.parent.mkdir(parents=True, exist_ok=True)
         shutil.move(str(self.staging_dir), str(self.final_dir))
-        logger.info(f"
+        logger.info(f"Done: {self.final_dir}")
```
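One subtle fix in this file: the handler returned by `add_file_handler` is now kept and explicitly closed before the staging directory is moved. An open `logging.FileHandler` holds a file descriptor inside the directory being moved, which fails outright on Windows and can leave the relocated log truncated elsewhere. A self-contained sketch of the pattern (the names here are illustrative, not the project's):

```python
import logging
import shutil
import tempfile
from pathlib import Path

logger = logging.getLogger("example")

staging = Path(tempfile.mkdtemp()) / "staging"
staging.mkdir()
handler = logging.FileHandler(staging / "download.log")
logger.addHandler(handler)
logger.warning("work happens here")

# Close and detach the handler before moving the directory it writes into
handler.close()
logger.removeHandler(handler)

final = staging.parent / "final"
shutil.move(str(staging), str(final))
print(f"Done: {final}")
```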
mapillary_downloader-0.7.3/src/mapillary_downloader/graphql_web.py
ADDED

```diff
@@ -0,0 +1,193 @@
+"""GraphQL web API utilities (unofficial, experimental).
+
+This module provides access to Mapillary's GraphQL endpoint used by the web interface.
+Unlike the official v4 REST API, this requires a public web token extracted from the
+JavaScript bundle.
+
+Use cases:
+- Get user image counts without pagination
+- Access leaderboard data
+- Check for updates to existing downloads
+
+WARNING: This is not officially documented and may break at any time.
+"""
+
+import json
+import logging
+import re
+from datetime import datetime
+from urllib.parse import urlencode, quote
+import requests
+
+logger = logging.getLogger("mapillary_downloader")
+
+# Fallback token (extracted from main JS bundle as of 2025-01-09)
+FALLBACK_TOKEN = "MLY|4223665974375089|d62822dd792b6a823d0794ef26450398"
+
+
+def extract_token_from_js():
+    """Extract public web token from Mapillary's JavaScript bundle.
+
+    This fetches the main page, finds the main JS bundle, and extracts
+    the hardcoded MLY token used for GraphQL queries.
+
+    Returns:
+        Token string (e.g., "MLY|123|abc...") or None if extraction failed
+    """
+    try:
+        # Fetch main page to find JS bundle URL
+        # Need consent cookie to get actual page (not GDPR banner)
+        logger.debug("Fetching Mapillary main page...")
+        # Generate today's date in the format YYYY_MM_DD for cookie
+        today = datetime.now().strftime("%Y_%m_%d")
+        cookies = {
+            "mly_cb": f'{{"version":"1","date":"{today}","third_party_consent":"withdrawn","categories":{{"content_and_media":"withdrawn"}},"integration_controls":{{"YOUTUBE":"withdrawn"}}}}'
+        }
+        headers = {
+            "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:144.0) Gecko/20100101 Firefox/144.0",
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+            "Accept-Language": "en-GB,en;q=0.5",
+            "Sec-GPC": "1",
+            "Upgrade-Insecure-Requests": "1",
+            "Sec-Fetch-Dest": "document",
+            "Sec-Fetch-Mode": "navigate",
+            "Sec-Fetch-Site": "none",
+            "Sec-Fetch-User": "?1",
+        }
+        response = requests.get("https://www.mapillary.com/app/", cookies=cookies, headers=headers, timeout=30)
+        response.raise_for_status()
+
+        # Find main JS file URL
+        # Pattern: <script src="main.{hash}.js" type="module"></script>
+        js_match = re.search(r'src="(main\.[a-f0-9]+\.js)"', response.text)
+        if not js_match:
+            logger.warning("Could not find main JS bundle URL in page")
+            return None
+
+        # URL is relative to /app/ base path
+        js_url = f"https://www.mapillary.com/app/{js_match.group(1)}"
+        logger.debug(f"Found JS bundle: {js_url}")
+
+        # Fetch JS bundle
+        logger.debug("Fetching JS bundle...")
+        js_response = requests.get(js_url, timeout=30)
+        js_response.raise_for_status()
+
+        # Extract token
+        # Pattern: "MLY|{client_id}|{secret}"
+        token_match = re.search(r'"(MLY\|[^"]+)"', js_response.text)
+        if not token_match:
+            logger.warning("Could not find MLY token in JS bundle")
+            return None
+
+        token = token_match.group(1)
+        logger.info(f"Extracted web token: {token[:20]}...")
+        return token
+
+    except requests.RequestException as e:
+        logger.error(f"Failed to extract web token: {e}")
+        return None
+    except Exception as e:
+        logger.error(f"Unexpected error extracting web token: {e}")
+        return None
+
+
+def get_leaderboard(key="global", token=None):
+    """Get leaderboard data from Mapillary GraphQL API.
+
+    Args:
+        key: Leaderboard key (e.g., "global", country name, etc.)
+        token: MLY token (if None, will extract from JS bundle or use fallback)
+
+    Returns:
+        Dict with leaderboard data, or None on error
+    """
+    if token is None:
+        token = extract_token_from_js()
+        if token is None:
+            logger.warning("Failed to extract token, using fallback")
+            token = FALLBACK_TOKEN
+
+    # GraphQL query for leaderboard (lifetime stats only)
+    query = """query getUserLeaderboard($key: String!) {
+  user_leaderboards(key: $key) {
+    lifetime {
+      count
+      user {
+        id
+        username
+        profile_photo_url
+        __typename
+      }
+      __typename
+    }
+    __typename
+  }
+}"""
+
+    try:
+        headers = {
+            "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:144.0) Gecko/20100101 Firefox/144.0",
+            "Accept": "*/*",
+            "Accept-Language": "en-GB,en;q=0.5",
+            "Referer": "https://www.mapillary.com/",
+            "content-type": "application/json",
+            "authorization": f"OAuth {token}",
+            "Origin": "https://www.mapillary.com",
+            "Sec-Fetch-Dest": "empty",
+            "Sec-Fetch-Mode": "cors",
+            "Sec-Fetch-Site": "same-site",
+        }
+
+        # Build query params - use quote_via=quote to get %20 instead of +
+        # Note: both 'doc' and 'query' params seem to be required (from observed curl)
+        params = {
+            "doc": query,
+            "query": query,
+            "operationName": "getUserLeaderboard",
+            "variables": json.dumps({"key": key}, separators=(',', ':')),
+        }
+
+        # Build URL with proper percent encoding (not + for spaces)
+        # Don't encode parentheses to match curl behavior
+        query_string = urlencode(params, quote_via=lambda s, safe='', encoding=None, errors=None: quote(s, safe='()!'))
+        url = f"https://graph.mapillary.com/graphql?{query_string}"
+
+        logger.debug(f"Querying leaderboard for key: {key}")
+
+        response = requests.get(
+            url,
+            headers=headers,
+            timeout=30
+        )
+        response.raise_for_status()
+
+        return response.json()
+
+    except requests.RequestException as e:
+        logger.error(f"Failed to query leaderboard: {e}")
+        return None
+    except Exception as e:
+        logger.error(f"Unexpected error querying leaderboard: {e}")
+        return None
+
+
+if __name__ == "__main__":
+    # Test the extraction and leaderboard query
+    logging.basicConfig(level=logging.DEBUG)
+
+    print("=== Extracting token ===")
+    token = extract_token_from_js()
+    if token:
+        print(f"Success! Token: {token}")
+    else:
+        print("Failed to extract token")
+        print(f"Fallback: {FALLBACK_TOKEN}")
+        token = FALLBACK_TOKEN
+
+    print("\n=== Querying global leaderboard ===")
+    data = get_leaderboard("global", token=token)
+    if data:
+        print(json.dumps(data, indent=2))
+    else:
+        print("Failed to get leaderboard data")
```
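A quick usage sketch for the new module. The response envelope (`data` → `user_leaderboards` → `lifetime`) is inferred from the GraphQL query above rather than confirmed against the live endpoint, so treat the field access as an assumption:

```python
from mapillary_downloader.graphql_web import extract_token_from_js, get_leaderboard

token = extract_token_from_js()  # may be None; get_leaderboard() falls back internally
data = get_leaderboard("global", token=token)
if data:
    # Assumed shape: standard GraphQL envelope around the query's fields
    for entry in data["data"]["user_leaderboards"]["lifetime"]:
        print(f'{entry["user"]["username"]}: {entry["count"]:,} images')
```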
{mapillary_downloader-0.7.0 → mapillary_downloader-0.7.3}/src/mapillary_downloader/ia_meta.py
RENAMED

```diff
@@ -182,7 +182,7 @@ def generate_ia_metadata(collection_dir):
     write_meta_tag(meta_dir, "coverage", f"{first_date} - {last_date}")
     write_meta_tag(meta_dir, "licenseurl", "https://creativecommons.org/licenses/by-sa/4.0/")
     write_meta_tag(meta_dir, "mediatype", "data")
-    write_meta_tag(meta_dir, "collection", "
+    write_meta_tag(meta_dir, "collection", "mapillary-images")

     # Source and scanner metadata
     write_meta_tag(meta_dir, "source", f"https://www.mapillary.com/app/user/{username}")
```
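`write_meta_tag` itself is outside this diff; a plausible reading, assuming it persists one archive.org metadata tag per file so a later upload step can pick them up (the filename convention here is a guess, not the project's confirmed layout):

```python
from pathlib import Path

def write_meta_tag(meta_dir: Path, name: str, value: str) -> None:
    """Hypothetical sketch: store one IA metadata tag as <name>.txt in the meta dir."""
    meta_dir.mkdir(parents=True, exist_ok=True)
    (meta_dir / f"{name}.txt").write_text(value + "\n")

meta_dir = Path("meta")
write_meta_tag(meta_dir, "mediatype", "data")
write_meta_tag(meta_dir, "collection", "mapillary-images")
```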
{mapillary_downloader-0.7.0 → mapillary_downloader-0.7.3}/src/mapillary_downloader/ia_stats.py
RENAMED

```diff
@@ -15,14 +15,14 @@ def search_ia_collections():
     """Search IA for all mapillary_downloader collections.

     Returns:
-        List of dicts with: identifier, description, item_size,
+        List of dicts with: identifier, description, item_size, collection
     """
     logger.info("Searching archive.org for mapillary_downloader collections...")

     url = "https://archive.org/advancedsearch.php"
     params = {
         "q": "mapillary_downloader:*",
-        "fl[]": ["identifier", "description", "item_size", "
+        "fl[]": ["identifier", "description", "item_size", "collection"],
         "rows": 10000,
         "output": "json",
     }
@@ -31,11 +31,29 @@
     data = response.json()

     collections = data["response"]["docs"]
-    logger.info(f"Found {len(collections)} collections on archive.org")
+    logger.info(f"Found {len(collections):,} collections on archive.org")

     return collections


+def fetch_uploader(identifier):
+    """Fetch uploader email from item metadata.
+
+    Args:
+        identifier: IA item identifier
+
+    Returns:
+        Uploader email or None
+    """
+    url = f"https://archive.org/metadata/{identifier}/metadata/uploader"
+    try:
+        response = http_get_with_retry(url, max_retries=2)
+        data = response.json()
+        return data.get("result")
+    except Exception:
+        return None
+
+
 def parse_collection_info(identifier):
     """Parse username, quality, webp from collection identifier.

@@ -104,19 +122,28 @@ def update_cache(ia_collections):

         image_count = extract_image_count(item.get("description"))

+        # Get IA collection(s) - can be a string or list
+        ia_collection = item.get("collection", [])
+        if isinstance(ia_collection, str):
+            ia_collection = [ia_collection]
+
+        # Preserve existing uploader if we have it cached
+        existing = cache.get(identifier, {})
+
         # Update cache entry
         cache[identifier] = {
             "size": size_bytes,
-            "uploader":
+            "uploader": existing.get("uploader"),  # Preserve cached uploader
             "images": image_count,
             "quality": info["quality"],
             "username": info["username"],
             "is_webp": info["is_webp"],
+            "ia_collection": ia_collection,
         }

     # Save updated cache
     safe_json_save(CACHE_FILE, cache)
-    logger.info(f"Updated cache with {len(cache)} collections")
+    logger.info(f"Updated cache with {len(cache):,} collections")

     return cache
@@ -168,11 +195,12 @@ def aggregate_stats(cache):
     return stats


-def format_stats(stats):
+def format_stats(stats, cache):
     """Format statistics as human-readable text.

     Args:
         stats: Dict from aggregate_stats()
+        cache: Dict of collection data

     Returns:
         Formatted string
@@ -212,6 +240,62 @@
     )

     output.append("")
+
+    # Find items not in mapillary-images and fetch uploaders
+    not_in_mapillary_images = []
+    need_uploader_fetch = []
+
+    for identifier, data in cache.items():
+        ia_collections = data.get("ia_collection", [])
+        if "mapillary-images" not in ia_collections:
+            not_in_mapillary_images.append(identifier)
+            if not data.get("uploader"):
+                need_uploader_fetch.append(identifier)
+
+    # Fetch missing uploaders
+    if need_uploader_fetch:
+        logger.info(f"Fetching uploader info for {len(need_uploader_fetch)} items...")
+        for i, identifier in enumerate(need_uploader_fetch, 1):
+            logger.info(f"  [{i}/{len(need_uploader_fetch)}] {identifier}")
+            uploader = fetch_uploader(identifier)
+            if uploader:
+                cache[identifier]["uploader"] = uploader
+        # Save updated cache with uploaders
+        safe_json_save(CACHE_FILE, cache)
+
+    # Group by uploader (only for items not in mapillary-images)
+    by_uploader = {}
+    for identifier in not_in_mapillary_images:
+        uploader = cache[identifier].get("uploader") or "unknown"
+        if uploader not in by_uploader:
+            by_uploader[uploader] = {"items": [], "images": 0, "size": 0}
+        by_uploader[uploader]["items"].append(identifier)
+        by_uploader[uploader]["images"] += cache[identifier].get("images") or 0
+        by_uploader[uploader]["size"] += cache[identifier].get("size") or 0
+
+    # By uploader (only those with items outside mapillary-images)
+    if by_uploader:
+        output.append("Uploaders with items outside mapillary-images:")
+        output.append("-" * 70)
+        for uploader, data in sorted(by_uploader.items(), key=lambda x: -len(x[1]["items"])):
+            output.append(
+                f"  {uploader}: {len(data['items'])} items, " f"{data['images']:,} images, {format_size(data['size'])}"
+            )
+        output.append("")
+
+    # Items not in mapillary-images, grouped by uploader
+    if not_in_mapillary_images:
+        output.append(f"Items NOT in mapillary-images ({len(not_in_mapillary_images)}):")
+        output.append("-" * 70)
+        for uploader, data in sorted(by_uploader.items(), key=lambda x: x[0].lower()):
+            output.append(f"{uploader}:")
+            for identifier in sorted(data["items"]):
+                output.append(identifier)
+            output.append("")
+    else:
+        output.append("All items are in mapillary-images collection!")
+        output.append("")
+
     output.append(f"Cache: {CACHE_FILE}")

     return "\n".join(output)
@@ -239,4 +323,4 @@ def show_stats(refresh=True):
         return

     stats = aggregate_stats(cache)
-    print(format_stats(stats))
+    print(format_stats(stats, cache))
```
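The new `fetch_uploader` relies on a handy property of archive.org's metadata API: a sub-path of `/metadata/{identifier}` addresses a single field and comes back wrapped as `{"result": ...}`. A standalone version using plain `requests` in place of the project's `http_get_with_retry` wrapper (the identifier below is hypothetical):

```python
import requests

def fetch_uploader(identifier: str):
    """Fetch the uploader field for one archive.org item, or None on any failure."""
    url = f"https://archive.org/metadata/{identifier}/metadata/uploader"
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return response.json().get("result")
    except (requests.RequestException, ValueError):
        return None

print(fetch_uploader("example-item-identifier"))  # hypothetical identifier
```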
{mapillary_downloader-0.7.0 → mapillary_downloader-0.7.3}/src/mapillary_downloader/worker_pool.py
RENAMED

```diff
@@ -17,17 +17,15 @@ class AdaptiveWorkerPool:
     - If throughput plateauing/decreasing: reduce workers
     """

-    def __init__(self, worker_func,
+    def __init__(self, worker_func, max_workers=16, monitoring_interval=10):
         """Initialize adaptive worker pool.

         Args:
             worker_func: Function to run in each worker (must accept work_queue, result_queue)
-            min_workers: Minimum number of workers
             max_workers: Maximum number of workers
             monitoring_interval: Seconds between throughput checks
         """
         self.worker_func = worker_func
-        self.min_workers = min_workers
         self.max_workers = max_workers
         self.monitoring_interval = monitoring_interval
@@ -37,7 +35,8 @@

         # Worker management
         self.workers = []
-
+        # Start at 25% of max_workers (at least 1)
+        self.current_workers = max(1, int(max_workers * 0.25))

         # Throughput monitoring
         self.throughput_history = deque(maxlen=5)  # Last 5 measurements
@@ -50,7 +49,7 @@
     def start(self):
         """Start the worker pool."""
         self.running = True
-        logger.
+        logger.debug(f"Starting worker pool with {self.current_workers} workers")

         for i in range(self.current_workers):
             self._add_worker(i)
@@ -100,10 +99,7 @@
         self.last_processed = total_processed
         self.last_check_time = now

-        logger.info(
-            f"Throughput: {throughput:.1f} items/s (workers: {current_workers}/{self.max_workers}, "
-            f"history: {len(self.throughput_history)} measurements)"
-        )
+        logger.info(f"Throughput: {throughput:.1f} items/s (workers: {current_workers}/{self.max_workers})")

         # Need at least 2 measurements to calculate gain per worker
         if len(self.throughput_history) < 2:
@@ -182,12 +178,10 @@
                 self.current_workers += 1
                 added += 1
             logger.info(f"Ramping up: added {added} workers (now {self.current_workers}/{self.max_workers})")
-        else:
-            logger.info(f"At optimal worker count: {current_workers} workers, {current_throughput:.1f} items/s")

     def shutdown(self, timeout=2):
         """Shutdown the worker pool gracefully."""
-        logger.
+        logger.debug("Shutting down worker pool")
         self.running = False

         # Terminate all workers immediately (they ignore SIGINT so we need to be forceful)
@@ -198,5 +192,3 @@
         # Give them a brief moment to exit
         for p in self.workers:
             p.join(timeout=timeout)
-
-        logger.info("Worker pool shutdown complete")
```
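The pool now seeds itself at 25% of `max_workers` and ramps up while throughput keeps improving; the measurement loop is only partially visible in this diff. A toy model of the idea (the two-sample gain estimate and thresholds are assumptions, not the project's exact heuristic):

```python
from collections import deque

class ThroughputScaler:
    """Toy model of adaptive scaling: add workers while each addition still pays off."""

    def __init__(self, max_workers=16):
        self.max_workers = max_workers
        self.current_workers = max(1, int(max_workers * 0.25))  # start at 25% of max
        self.history = deque(maxlen=5)  # recent (workers, items/s) samples

    def record(self, workers, throughput):
        self.history.append((workers, throughput))

    def should_scale_up(self):
        if len(self.history) < 2:
            return False  # need two samples to estimate gain per worker
        (w0, t0), (w1, t1) = self.history[-2], self.history[-1]
        if w1 <= w0 or self.current_workers >= self.max_workers:
            return False
        gain_per_worker = (t1 - t0) / (w1 - w0)
        return gain_per_worker > 0  # keep ramping while extra workers still help

scaler = ThroughputScaler(max_workers=16)
scaler.record(4, 10.0)
scaler.record(6, 14.0)
print(scaler.should_scale_up())  # True: each added worker gained ~2 items/s
```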
The remaining files (LICENSE.md, __init__.py, client.py, exif_writer.py, ia_check.py, logging_config.py, metadata_reader.py, tar_sequences.py, utils.py, webp_converter.py, worker.py) were renamed with the version bump but have no content changes.