mapillary-downloader 0.7.0__py3-none-any.whl → 0.7.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mapillary_downloader/__main__.py
@@ -43,8 +43,8 @@ def main():
     parser.add_argument(
         "--max-workers",
         type=int,
-        default=128,
-        help="Maximum number of parallel workers (default: 128)",
+        default=os.cpu_count() or 8,
+        help=f"Maximum number of parallel workers (default: CPU count = {os.cpu_count() or 8})",
     )
     parser.add_argument(
         "--no-tar",
mapillary_downloader/downloader.py
@@ -67,7 +67,6 @@ class MapillaryDownloader:
         self.username = username
         self.quality = quality
         self.max_workers = max_workers
-        self.initial_workers = os.cpu_count() or 1  # Start with CPU count
         self.tar_sequences = tar_sequences
         self.convert_webp = convert_webp
         self.check_ia = check_ia
@@ -100,7 +99,7 @@ class MapillaryDownloader:
         # Set up file logging for archival with timestamp for incremental runs
         timestamp = time.strftime("%Y%m%d-%H%M%S")
         log_file = self.output_dir / f"download.log.{timestamp}"
-        add_file_handler(log_file)
+        self.file_handler = add_file_handler(log_file)
         logger.info(f"Logging to: {log_file}")

         self.metadata_file = self.output_dir / "metadata.jsonl"
@@ -170,27 +169,16 @@ class MapillaryDownloader:

         quality_field = f"thumb_{self.quality}_url"

-        logger.info(f"Downloading images for user: {self.username}")
-        logger.info(f"Output directory: {self.output_dir}")
-        logger.info(f"Quality: {self.quality}")
-        logger.info(f"Worker pool: {self.initial_workers} initial, {self.max_workers} max")
+        logger.info(f"Downloading {self.username} @ {self.quality} (max {self.max_workers} workers)")

         start_time = time.time()

         # Step 1: Check if API fetch is already complete
         reader = MetadataReader(self.metadata_file)
         api_complete = reader.is_complete
-        if api_complete:
-            logger.info("API fetch already complete, will only download")
-        else:
-            logger.info("API fetch incomplete, will fetch and download in parallel")

         # Step 2: Start worker pool
-        # Since workers do both I/O (download) and CPU (WebP), need many more workers
-        # Start with CPU count and scale up based on throughput
-        pool = AdaptiveWorkerPool(
-            worker_process, min_workers=self.initial_workers, max_workers=self.max_workers, monitoring_interval=10
-        )
+        pool = AdaptiveWorkerPool(worker_process, max_workers=self.max_workers, monitoring_interval=10)
         pool.start()

         # Step 3: Download images from metadata file while fetching new from API
@@ -198,9 +186,6 @@ class MapillaryDownloader:
         total_bytes = 0
         failed_count = 0
         submitted = 0
-        batch_start = time.time()
-
-        logger.info("Starting parallel download and API fetch...")

         try:
             # Step 3a: Fetch metadata from API in parallel (write-only, don't block on queue)
@@ -213,7 +198,7 @@ class MapillaryDownloader:
             def fetch_api_metadata():
                 """Fetch metadata from API and write to file (runs in thread)."""
                 try:
-                    logger.info("API fetch thread: Starting...")
+                    logger.debug("API fetch thread starting")
                     with open(self.metadata_file, "a") as meta_f:
                         for image in self.client.get_user_images(self.username, bbox=bbox):
                             new_images_count[0] += 1
@@ -223,11 +208,11 @@ class MapillaryDownloader:
                             meta_f.flush()

                             if new_images_count[0] % 1000 == 0:
-                                logger.info(f"API: Fetched {new_images_count[0]} images from API")
+                                logger.info(f"API: fetched {new_images_count[0]:,} image URLs")

                     # Mark as complete
                     MetadataReader.mark_complete(self.metadata_file)
-                    logger.info(f"API fetch complete: {new_images_count[0]} images")
+                    logger.info(f"API fetch complete: {new_images_count[0]:,} images")
                 finally:
                     api_fetch_complete.set()

@@ -235,11 +220,10 @@ class MapillaryDownloader:
                 api_thread = threading.Thread(target=fetch_api_metadata, daemon=True)
                 api_thread.start()
             else:
-                logger.info("API fetch already complete, skipping API thread")
                 api_fetch_complete = None

             # Step 3b: Tail metadata file and submit to workers
-            logger.info("Starting metadata tail and download queue feeder...")
+            logger.debug("Starting metadata tail and download queue feeder")
             last_position = 0

             # Helper to process results from queue
@@ -261,12 +245,7 @@ class MapillaryDownloader:
                     # Log every download for first 10, then every 100
                     should_log = downloaded_count <= 10 or downloaded_count % 100 == 0
                     if should_log:
-                        elapsed = time.time() - batch_start
-                        rate = downloaded_count / elapsed if elapsed > 0 else 0
-                        logger.info(
-                            f"Downloaded: {downloaded_count} ({format_size(total_bytes)}) "
-                            f"- Rate: {rate:.1f} images/sec"
-                        )
+                        logger.info(f"Downloaded: {downloaded_count:,} ({format_size(total_bytes)})")

                     if downloaded_count % 100 == 0:
                         self._save_progress()
@@ -304,6 +283,7 @@ class MapillaryDownloader:

                 # Skip if already downloaded or no quality URL
                 if image_id in self.downloaded:
+                    downloaded_count += 1
                     continue
                 if not image.get(quality_field):
                     continue
@@ -320,7 +300,7 @@ class MapillaryDownloader:
                 submitted += 1

                 if submitted % 1000 == 0:
-                    logger.info(f"Queue: Submitted {submitted} images")
+                    logger.info(f"Queue: submitted {submitted:,} images")

                 # Process results while submitting
                 process_results()
@@ -355,6 +335,7 @@ class MapillaryDownloader:

                 # Skip if already downloaded or no quality URL
                 if image_id in self.downloaded:
+                    downloaded_count += 1
                     continue
                 if not image.get(quality_field):
                     continue
@@ -371,7 +352,7 @@ class MapillaryDownloader:
                 submitted += 1

                 if submitted % 1000 == 0:
-                    logger.info(f"Queue: Submitted {submitted} images")
+                    logger.info(f"Queue: submitted {submitted:,} images")

                 # Process results while submitting
                 process_results()
@@ -389,7 +370,7 @@ class MapillaryDownloader:
             process_results()

             # Send shutdown signals
-            logger.info(f"Submitted {submitted} images, waiting for workers to finish...")
+            logger.debug(f"Submitted {submitted:,} images, waiting for workers")
             for _ in range(pool.current_workers):
                 pool.submit(None)

@@ -411,16 +392,8 @@ class MapillaryDownloader:
                     downloaded_count += 1
                     total_bytes += bytes_dl

-                    if downloaded_count % 10 == 0:
-                        elapsed = time.time() - batch_start
-                        rate = downloaded_count / elapsed if elapsed > 0 else 0
-                        remaining = submitted - completed
-                        eta_seconds = remaining / rate if rate > 0 else 0
-
-                        logger.info(
-                            f"Downloaded: {downloaded_count}/{submitted} ({format_size(total_bytes)}) "
-                            f"- ETA: {format_time(eta_seconds)}"
-                        )
+                    if downloaded_count % 100 == 0:
+                        logger.info(f"Downloaded: {downloaded_count:,} ({format_size(total_bytes)})")
                         self._save_progress()
                         pool.check_throughput(downloaded_count)
                 else:
@@ -434,7 +407,7 @@ class MapillaryDownloader:
         self._save_progress()
         elapsed = time.time() - start_time

-        logger.info(f"Complete! Downloaded {downloaded_count} ({format_size(total_bytes)}), " f"failed {failed_count}")
+        logger.info(f"Complete! Downloaded {downloaded_count:,} ({format_size(total_bytes)}), failed {failed_count:,}")
         logger.info(f"Total time: {format_time(elapsed)}")

         # Tar sequence directories for efficient IA uploads
@@ -463,12 +436,16 @@ class MapillaryDownloader:
         # Generate IA metadata
         generate_ia_metadata(self.output_dir)

+        # Close log file handler before moving directory
+        self.file_handler.close()
+        logger.removeHandler(self.file_handler)
+
         # Move from staging to final destination
-        logger.info("Moving collection from staging to final destination...")
+        logger.info("Moving to final destination...")
         if self.final_dir.exists():
             logger.warning(f"Destination already exists, removing: {self.final_dir}")
             shutil.rmtree(self.final_dir)

         self.final_dir.parent.mkdir(parents=True, exist_ok=True)
         shutil.move(str(self.staging_dir), str(self.final_dir))
-        logger.info(f"Collection moved to: {self.final_dir}")
+        logger.info(f"Done: {self.final_dir}")
mapillary_downloader/graphql_web.py (new file)
@@ -0,0 +1,193 @@
+"""GraphQL web API utilities (unofficial, experimental).
+
+This module provides access to Mapillary's GraphQL endpoint used by the web interface.
+Unlike the official v4 REST API, this requires a public web token extracted from the
+JavaScript bundle.
+
+Use cases:
+- Get user image counts without pagination
+- Access leaderboard data
+- Check for updates to existing downloads
+
+WARNING: This is not officially documented and may break at any time.
+"""
+
+import json
+import logging
+import re
+from datetime import datetime
+from urllib.parse import urlencode, quote
+import requests
+
+logger = logging.getLogger("mapillary_downloader")
+
+# Fallback token (extracted from main JS bundle as of 2025-01-09)
+FALLBACK_TOKEN = "MLY|4223665974375089|d62822dd792b6a823d0794ef26450398"
+
+
+def extract_token_from_js():
+    """Extract public web token from Mapillary's JavaScript bundle.
+
+    This fetches the main page, finds the main JS bundle, and extracts
+    the hardcoded MLY token used for GraphQL queries.
+
+    Returns:
+        Token string (e.g., "MLY|123|abc...") or None if extraction failed
+    """
+    try:
+        # Fetch main page to find JS bundle URL
+        # Need consent cookie to get actual page (not GDPR banner)
+        logger.debug("Fetching Mapillary main page...")
+        # Generate today's date in the format YYYY_MM_DD for cookie
+        today = datetime.now().strftime("%Y_%m_%d")
+        cookies = {
+            "mly_cb": f'{{"version":"1","date":"{today}","third_party_consent":"withdrawn","categories":{{"content_and_media":"withdrawn"}},"integration_controls":{{"YOUTUBE":"withdrawn"}}}}'
+        }
+        headers = {
+            "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:144.0) Gecko/20100101 Firefox/144.0",
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+            "Accept-Language": "en-GB,en;q=0.5",
+            "Sec-GPC": "1",
+            "Upgrade-Insecure-Requests": "1",
+            "Sec-Fetch-Dest": "document",
+            "Sec-Fetch-Mode": "navigate",
+            "Sec-Fetch-Site": "none",
+            "Sec-Fetch-User": "?1",
+        }
+        response = requests.get("https://www.mapillary.com/app/", cookies=cookies, headers=headers, timeout=30)
+        response.raise_for_status()
+
+        # Find main JS file URL
+        # Pattern: <script src="main.{hash}.js" type="module"></script>
+        js_match = re.search(r'src="(main\.[a-f0-9]+\.js)"', response.text)
+        if not js_match:
+            logger.warning("Could not find main JS bundle URL in page")
+            return None
+
+        # URL is relative to /app/ base path
+        js_url = f"https://www.mapillary.com/app/{js_match.group(1)}"
+        logger.debug(f"Found JS bundle: {js_url}")
+
+        # Fetch JS bundle
+        logger.debug("Fetching JS bundle...")
+        js_response = requests.get(js_url, timeout=30)
+        js_response.raise_for_status()
+
+        # Extract token
+        # Pattern: "MLY|{client_id}|{secret}"
+        token_match = re.search(r'"(MLY\|[^"]+)"', js_response.text)
+        if not token_match:
+            logger.warning("Could not find MLY token in JS bundle")
+            return None
+
+        token = token_match.group(1)
+        logger.info(f"Extracted web token: {token[:20]}...")
+        return token
+
+    except requests.RequestException as e:
+        logger.error(f"Failed to extract web token: {e}")
+        return None
+    except Exception as e:
+        logger.error(f"Unexpected error extracting web token: {e}")
+        return None
+
+
+def get_leaderboard(key="global", token=None):
+    """Get leaderboard data from Mapillary GraphQL API.
+
+    Args:
+        key: Leaderboard key (e.g., "global", country name, etc.)
+        token: MLY token (if None, will extract from JS bundle or use fallback)
+
+    Returns:
+        Dict with leaderboard data, or None on error
+    """
+    if token is None:
+        token = extract_token_from_js()
+        if token is None:
+            logger.warning("Failed to extract token, using fallback")
+            token = FALLBACK_TOKEN
+
+    # GraphQL query for leaderboard (lifetime stats only)
+    query = """query getUserLeaderboard($key: String!) {
+  user_leaderboards(key: $key) {
+    lifetime {
+      count
+      user {
+        id
+        username
+        profile_photo_url
+        __typename
+      }
+      __typename
+    }
+    __typename
+  }
+}"""

+    try:
+        headers = {
+            "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:144.0) Gecko/20100101 Firefox/144.0",
+            "Accept": "*/*",
+            "Accept-Language": "en-GB,en;q=0.5",
+            "Referer": "https://www.mapillary.com/",
+            "content-type": "application/json",
+            "authorization": f"OAuth {token}",
+            "Origin": "https://www.mapillary.com",
+            "Sec-Fetch-Dest": "empty",
+            "Sec-Fetch-Mode": "cors",
+            "Sec-Fetch-Site": "same-site",
+        }
+
+        # Build query params - use quote_via=quote to get %20 instead of +
+        # Note: both 'doc' and 'query' params seem to be required (from observed curl)
+        params = {
+            "doc": query,
+            "query": query,
+            "operationName": "getUserLeaderboard",
+            "variables": json.dumps({"key": key}, separators=(',', ':')),
+        }
+
+        # Build URL with proper percent encoding (not + for spaces)
+        # Don't encode parentheses to match curl behavior
+        query_string = urlencode(params, quote_via=lambda s, safe='', encoding=None, errors=None: quote(s, safe='()!'))
+        url = f"https://graph.mapillary.com/graphql?{query_string}"
+
+        logger.debug(f"Querying leaderboard for key: {key}")
+
+        response = requests.get(
+            url,
+            headers=headers,
+            timeout=30
+        )
+        response.raise_for_status()
+
+        return response.json()
+
+    except requests.RequestException as e:
+        logger.error(f"Failed to query leaderboard: {e}")
+        return None
+    except Exception as e:
+        logger.error(f"Unexpected error querying leaderboard: {e}")
+        return None
+
+
+if __name__ == "__main__":
+    # Test the extraction and leaderboard query
+    logging.basicConfig(level=logging.DEBUG)
+
+    print("=== Extracting token ===")
+    token = extract_token_from_js()
+    if token:
+        print(f"Success! Token: {token}")
+    else:
+        print("Failed to extract token")
+        print(f"Fallback: {FALLBACK_TOKEN}")
+        token = FALLBACK_TOKEN
+
+    print("\n=== Querying global leaderboard ===")
+    data = get_leaderboard("global", token=token)
+    if data:
+        print(json.dumps(data, indent=2))
+    else:
+        print("Failed to get leaderboard data")
mapillary_downloader/ia_meta.py
@@ -182,7 +182,7 @@ def generate_ia_metadata(collection_dir):
     write_meta_tag(meta_dir, "coverage", f"{first_date} - {last_date}")
     write_meta_tag(meta_dir, "licenseurl", "https://creativecommons.org/licenses/by-sa/4.0/")
     write_meta_tag(meta_dir, "mediatype", "data")
-    write_meta_tag(meta_dir, "collection", "opensource_media")
+    write_meta_tag(meta_dir, "collection", "mapillary-images")

     # Source and scanner metadata
     write_meta_tag(meta_dir, "source", f"https://www.mapillary.com/app/user/{username}")
mapillary_downloader/ia_stats.py
@@ -15,14 +15,14 @@ def search_ia_collections():
     """Search IA for all mapillary_downloader collections.

     Returns:
-        List of dicts with: identifier, description, item_size, uploader
+        List of dicts with: identifier, description, item_size, collection
     """
     logger.info("Searching archive.org for mapillary_downloader collections...")

     url = "https://archive.org/advancedsearch.php"
     params = {
         "q": "mapillary_downloader:*",
-        "fl[]": ["identifier", "description", "item_size", "uploader"],
+        "fl[]": ["identifier", "description", "item_size", "collection"],
         "rows": 10000,
         "output": "json",
     }
@@ -31,11 +31,29 @@ def search_ia_collections():
     data = response.json()

     collections = data["response"]["docs"]
-    logger.info(f"Found {len(collections)} collections on archive.org")
+    logger.info(f"Found {len(collections):,} collections on archive.org")

     return collections


+def fetch_uploader(identifier):
+    """Fetch uploader email from item metadata.
+
+    Args:
+        identifier: IA item identifier
+
+    Returns:
+        Uploader email or None
+    """
+    url = f"https://archive.org/metadata/{identifier}/metadata/uploader"
+    try:
+        response = http_get_with_retry(url, max_retries=2)
+        data = response.json()
+        return data.get("result")
+    except Exception:
+        return None
+
+
 def parse_collection_info(identifier):
     """Parse username, quality, webp from collection identifier.

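For context on `fetch_uploader`: archive.org's metadata read API accepts a sub-path into an item's metadata tree and wraps the selected value in a `{"result": ...}` envelope, which is what `data.get("result")` unwraps. A standalone sketch of the same call with plain `requests` and a hypothetical identifier (`http_get_with_retry` above is the package's own helper):

```python
import requests

identifier = "some-ia-item"  # hypothetical; any public IA item works the same way
url = f"https://archive.org/metadata/{identifier}/metadata/uploader"
resp = requests.get(url, timeout=30)
resp.raise_for_status()
print(resp.json().get("result"))  # uploader email, or None if the field is absent
```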
@@ -104,19 +122,28 @@ def update_cache(ia_collections):

         image_count = extract_image_count(item.get("description"))

+        # Get IA collection(s) - can be a string or list
+        ia_collection = item.get("collection", [])
+        if isinstance(ia_collection, str):
+            ia_collection = [ia_collection]
+
+        # Preserve existing uploader if we have it cached
+        existing = cache.get(identifier, {})
+
         # Update cache entry
         cache[identifier] = {
             "size": size_bytes,
-            "uploader": item.get("uploader"),
+            "uploader": existing.get("uploader"),  # Preserve cached uploader
             "images": image_count,
             "quality": info["quality"],
             "username": info["username"],
             "is_webp": info["is_webp"],
+            "ia_collection": ia_collection,
         }

     # Save updated cache
     safe_json_save(CACHE_FILE, cache)
-    logger.info(f"Updated cache with {len(cache)} collections")
+    logger.info(f"Updated cache with {len(cache):,} collections")

     return cache

@@ -168,11 +195,12 @@ def aggregate_stats(cache):
     return stats


-def format_stats(stats):
+def format_stats(stats, cache):
     """Format statistics as human-readable text.

     Args:
         stats: Dict from aggregate_stats()
+        cache: Dict of collection data

     Returns:
         Formatted string
@@ -212,6 +240,62 @@ def format_stats(stats):
         )

     output.append("")
+
+    # Find items not in mapillary-images and fetch uploaders
+    not_in_mapillary_images = []
+    need_uploader_fetch = []
+
+    for identifier, data in cache.items():
+        ia_collections = data.get("ia_collection", [])
+        if "mapillary-images" not in ia_collections:
+            not_in_mapillary_images.append(identifier)
+            if not data.get("uploader"):
+                need_uploader_fetch.append(identifier)
+
+    # Fetch missing uploaders
+    if need_uploader_fetch:
+        logger.info(f"Fetching uploader info for {len(need_uploader_fetch)} items...")
+        for i, identifier in enumerate(need_uploader_fetch, 1):
+            logger.info(f" [{i}/{len(need_uploader_fetch)}] {identifier}")
+            uploader = fetch_uploader(identifier)
+            if uploader:
+                cache[identifier]["uploader"] = uploader
+        # Save updated cache with uploaders
+        safe_json_save(CACHE_FILE, cache)
+
+    # Group by uploader (only for items not in mapillary-images)
+    by_uploader = {}
+    for identifier in not_in_mapillary_images:
+        uploader = cache[identifier].get("uploader") or "unknown"
+        if uploader not in by_uploader:
+            by_uploader[uploader] = {"items": [], "images": 0, "size": 0}
+        by_uploader[uploader]["items"].append(identifier)
+        by_uploader[uploader]["images"] += cache[identifier].get("images") or 0
+        by_uploader[uploader]["size"] += cache[identifier].get("size") or 0
+
+    # By uploader (only those with items outside mapillary-images)
+    if by_uploader:
+        output.append("Uploaders with items outside mapillary-images:")
+        output.append("-" * 70)
+        for uploader, data in sorted(by_uploader.items(), key=lambda x: -len(x[1]["items"])):
+            output.append(
+                f" {uploader}: {len(data['items'])} items, " f"{data['images']:,} images, {format_size(data['size'])}"
+            )
+        output.append("")
+
+    # Items not in mapillary-images, grouped by uploader
+    if not_in_mapillary_images:
+        output.append(f"Items NOT in mapillary-images ({len(not_in_mapillary_images)}):")
+        output.append("-" * 70)
+        for uploader, data in sorted(by_uploader.items(), key=lambda x: x[0].lower()):
+            output.append(f"{uploader}:")
+            for identifier in sorted(data["items"]):
+                output.append(identifier)
+            output.append("")
+    else:
+        output.append("All items are in mapillary-images collection!")
+        output.append("")
+
     output.append(f"Cache: {CACHE_FILE}")

     return "\n".join(output)
@@ -239,4 +323,4 @@ def show_stats(refresh=True):
         return

     stats = aggregate_stats(cache)
-    print(format_stats(stats))
+    print(format_stats(stats, cache))
mapillary_downloader/worker_pool.py
@@ -17,17 +17,15 @@ class AdaptiveWorkerPool:
     - If throughput plateauing/decreasing: reduce workers
     """

-    def __init__(self, worker_func, min_workers=4, max_workers=16, monitoring_interval=10):
+    def __init__(self, worker_func, max_workers=16, monitoring_interval=10):
         """Initialize adaptive worker pool.

         Args:
             worker_func: Function to run in each worker (must accept work_queue, result_queue)
-            min_workers: Minimum number of workers
             max_workers: Maximum number of workers
             monitoring_interval: Seconds between throughput checks
         """
         self.worker_func = worker_func
-        self.min_workers = min_workers
         self.max_workers = max_workers
         self.monitoring_interval = monitoring_interval

@@ -37,7 +35,8 @@ class AdaptiveWorkerPool:

         # Worker management
         self.workers = []
-        self.current_workers = min_workers  # Start small and ramp up
+        # Start at 25% of max_workers (at least 1)
+        self.current_workers = max(1, int(max_workers * 0.25))

         # Throughput monitoring
         self.throughput_history = deque(maxlen=5)  # Last 5 measurements
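The new starting size is easy to sanity-check: 25% of `max_workers`, truncated, with a floor of 1. Combined with the new CLI default of `os.cpu_count() or 8`, an 8-core machine now starts 2 workers and ramps up, where the old default of 128 would have implied a start of 32:

```python
# Starting pool size for a few max_workers values (25% of max, at least 1).
for max_workers in (1, 4, 8, 16, 128):
    print(max_workers, "->", max(1, int(max_workers * 0.25)))
# 1 -> 1, 4 -> 1, 8 -> 2, 16 -> 4, 128 -> 32
```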
@@ -50,7 +49,7 @@ class AdaptiveWorkerPool:
     def start(self):
         """Start the worker pool."""
         self.running = True
-        logger.info(f"Starting worker pool with {self.current_workers} workers")
+        logger.debug(f"Starting worker pool with {self.current_workers} workers")

         for i in range(self.current_workers):
             self._add_worker(i)
@@ -100,10 +99,7 @@ class AdaptiveWorkerPool:
         self.last_processed = total_processed
         self.last_check_time = now

-        logger.info(
-            f"Throughput: {throughput:.1f} items/s (workers: {current_workers}/{self.max_workers}, "
-            f"history: {len(self.throughput_history)} measurements)"
-        )
+        logger.info(f"Throughput: {throughput:.1f} items/s (workers: {current_workers}/{self.max_workers})")

         # Need at least 2 measurements to calculate gain per worker
         if len(self.throughput_history) < 2:
@@ -182,12 +178,10 @@ class AdaptiveWorkerPool:
                 self.current_workers += 1
                 added += 1
             logger.info(f"Ramping up: added {added} workers (now {self.current_workers}/{self.max_workers})")
-        else:
-            logger.info(f"At optimal worker count: {current_workers} workers, {current_throughput:.1f} items/s")

     def shutdown(self, timeout=2):
         """Shutdown the worker pool gracefully."""
-        logger.info("Shutting down worker pool...")
+        logger.debug("Shutting down worker pool")
         self.running = False

         # Terminate all workers immediately (they ignore SIGINT so we need to be forceful)
@@ -198,5 +192,3 @@ class AdaptiveWorkerPool:
         # Give them a brief moment to exit
         for p in self.workers:
             p.join(timeout=timeout)
-
-        logger.info("Worker pool shutdown complete")
mapillary_downloader-0.7.3.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mapillary_downloader
-Version: 0.7.0
+Version: 0.7.3
 Summary: Archive user data from Mapillary
 Author-email: Gareth Davidson <gaz@bitplane.net>
 Requires-Python: >=3.10
@@ -66,7 +66,7 @@ mapillary-downloader --output ./downloads USERNAME1
 | `--quality` | 256, 1024, 2048 or original | `original` |
 | `--bbox` | `west,south,east,north` | `None` |
 | `--no-webp` | Don't convert to WebP | `False` |
-| `--max-workers` | Maximum number of parallel download workers | `128` |
+| `--max-workers` | Maximum number of parallel download workers | CPU count |
 | `--no-tar` | Don't tar bucket directories | `False` |
 | `--no-check-ia` | Don't check if exists on Internet Archive | `False` |

mapillary_downloader-0.7.3.dist-info/RECORD
@@ -1,20 +1,21 @@
 mapillary_downloader/__init__.py,sha256=KEjiBRghXDeA7E15RJeLBfQm-yNJkowZarL59QOh_1w,120
-mapillary_downloader/__main__.py,sha256=jK0MU9Xn9KGb_V8x5giIRuwhUjNFQ-jjlprtbeW6b94,4817
+mapillary_downloader/__main__.py,sha256=iuDGZoFVu8q_dTvJuExSpj4Jx1x9xASSjUITRGwd0RA,4864
 mapillary_downloader/client.py,sha256=a5n43FLHP45EHodEjl0ieziBK-b6Ey-rZJwYB6EFhNI,4743
-mapillary_downloader/downloader.py,sha256=TrFy9eTcZD_wyVh7L58HuVxgCIKbAYRnmr2gAWtXOuY,19738
+mapillary_downloader/downloader.py,sha256=TWk3CiPbuEWgFZoyA146xdvH-heZnmBWFJKKaBbwa40,18456
 mapillary_downloader/exif_writer.py,sha256=K_441EG1siWyNMmFGZSfnORUCjBThkeg4JFtbg9AOsA,5120
+mapillary_downloader/graphql_web.py,sha256=69pmx8xDEuu-zn7xy5CkpUBjijGTS506ZJd5I_GAepk,6671
 mapillary_downloader/ia_check.py,sha256=L2MEbG_KmlAd5NLmo2HQkO8HWvRN0brE5wXXoyNMbq8,1100
-mapillary_downloader/ia_meta.py,sha256=78rcybHIPnQDsF02KGj6RYmDXzYzrU8sdVx4Q9Y0sfI,6266
-mapillary_downloader/ia_stats.py,sha256=TSVCoaCcGFDPTYqxikGdvMo7uWtExRniYABjQQS26fw,7302
+mapillary_downloader/ia_meta.py,sha256=3aj1RunSLap2Kh3q9Mm_4WpcVT92elfR6k-um_GRTpk,6266
+mapillary_downloader/ia_stats.py,sha256=kjbNUVXtZziWxTx1yi2TLTZt_F0BWjrv1WWyy6ZeCLY,10678
 mapillary_downloader/logging_config.py,sha256=Z-wNq34nt7aIhJWdeKc1feTY46P9-Or7HtiX7eUFjEI,2324
 mapillary_downloader/metadata_reader.py,sha256=Re-HN0Vfc7Hs1eOut7uOoW7jWJ2PIbKoNzC7Ak3ah5o,4933
 mapillary_downloader/tar_sequences.py,sha256=UchKvvajBr5uaoE8xDHgyiFTkjh08EK7pPhtwkyCQXU,4416
 mapillary_downloader/utils.py,sha256=VgcwbC8yb2XlTGerTNwHBU42K2IN14VU7P-I52Vb01c,2947
 mapillary_downloader/webp_converter.py,sha256=vYLLQxDmdnqRz0nm7wXwRUd4x9mQZNah-DrncpA8sNs,1901
 mapillary_downloader/worker.py,sha256=K2DkQgFzALKs20TsG1KibNUdFiWN_v8MtVnBX_0xVyc,5162
-mapillary_downloader/worker_pool.py,sha256=iGRq5uFwBNNVQnI4vEjbKHkbKTaEVCdmvMvXcRGuDMg,8203
-mapillary_downloader-0.7.0.dist-info/entry_points.txt,sha256=PdYtxOXHMJrUhmiPO4G-F98VuhUI4MN9D_T4KPrVZ5w,75
-mapillary_downloader-0.7.0.dist-info/licenses/LICENSE.md,sha256=7_BIuQ-veOrsF-WarH8kTkm0-xrCLvJ1PFE1C4Ebs64,146
-mapillary_downloader-0.7.0.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
-mapillary_downloader-0.7.0.dist-info/METADATA,sha256=Ftc--29thU8dc-J_11_NlBUnf6SsOlvQP4r28nclsnk,5540
-mapillary_downloader-0.7.0.dist-info/RECORD,,
+mapillary_downloader/worker_pool.py,sha256=QnqYcPCi3GNu2e8GNG_qQ8v680PWzCZcGE5KeskqZxU,7868
+mapillary_downloader-0.7.3.dist-info/entry_points.txt,sha256=PdYtxOXHMJrUhmiPO4G-F98VuhUI4MN9D_T4KPrVZ5w,75
+mapillary_downloader-0.7.3.dist-info/licenses/LICENSE.md,sha256=7_BIuQ-veOrsF-WarH8kTkm0-xrCLvJ1PFE1C4Ebs64,146
+mapillary_downloader-0.7.3.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
+mapillary_downloader-0.7.3.dist-info/METADATA,sha256=G93Ph_ffq5DRZ4BpaIqKkvtQVtX7lWuzlKs05SwKV68,5540
+mapillary_downloader-0.7.3.dist-info/RECORD,,