mapillary-downloader 0.5.0__tar.gz → 0.5.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/PKG-INFO +1 -1
- {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/pyproject.toml +1 -1
- {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/src/mapillary_downloader/__main__.py +4 -4
- {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/src/mapillary_downloader/downloader.py +25 -31
- {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/src/mapillary_downloader/metadata_reader.py +41 -1
- {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/src/mapillary_downloader/worker.py +14 -8
- mapillary_downloader-0.5.2/src/mapillary_downloader/worker_pool.py +202 -0
- mapillary_downloader-0.5.0/src/mapillary_downloader/worker_pool.py +0 -136
- {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/LICENSE.md +0 -0
- {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/README.md +0 -0
- {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/src/mapillary_downloader/__init__.py +0 -0
- {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/src/mapillary_downloader/client.py +0 -0
- {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/src/mapillary_downloader/exif_writer.py +0 -0
- {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/src/mapillary_downloader/ia_check.py +0 -0
- {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/src/mapillary_downloader/ia_meta.py +0 -0
- {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/src/mapillary_downloader/logging_config.py +0 -0
- {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/src/mapillary_downloader/tar_sequences.py +0 -0
- {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/src/mapillary_downloader/utils.py +0 -0
- {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/src/mapillary_downloader/webp_converter.py +0 -0
{mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/src/mapillary_downloader/__main__.py
RENAMED
|
@@ -41,10 +41,10 @@ def main():
|
|
|
41
41
|
help="Don't convert to WebP (WebP conversion is enabled by default, saves ~70%% disk space)",
|
|
42
42
|
)
|
|
43
43
|
parser.add_argument(
|
|
44
|
-
"--workers",
|
|
44
|
+
"--max-workers",
|
|
45
45
|
type=int,
|
|
46
|
-
default=
|
|
47
|
-
help="
|
|
46
|
+
default=128,
|
|
47
|
+
help="Maximum number of parallel workers (default: 128)",
|
|
48
48
|
)
|
|
49
49
|
parser.add_argument(
|
|
50
50
|
"--no-tar",
|
|
@@ -114,7 +114,7 @@ def main():
|
|
|
114
114
|
args.output,
|
|
115
115
|
username,
|
|
116
116
|
args.quality,
|
|
117
|
-
|
|
117
|
+
max_workers=args.max_workers,
|
|
118
118
|
tar_sequences=not args.no_tar,
|
|
119
119
|
convert_webp=convert_webp,
|
|
120
120
|
check_ia=not args.no_check_ia,
|
{mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/src/mapillary_downloader/downloader.py
RENAMED
|
@@ -45,7 +45,7 @@ class MapillaryDownloader:
|
|
|
45
45
|
output_dir,
|
|
46
46
|
username=None,
|
|
47
47
|
quality=None,
|
|
48
|
-
|
|
48
|
+
max_workers=128,
|
|
49
49
|
tar_sequences=True,
|
|
50
50
|
convert_webp=False,
|
|
51
51
|
check_ia=True,
|
|
@@ -57,7 +57,7 @@ class MapillaryDownloader:
|
|
|
57
57
|
output_dir: Base directory to save downloads (final destination)
|
|
58
58
|
username: Mapillary username (for collection directory)
|
|
59
59
|
quality: Image quality (for collection directory)
|
|
60
|
-
|
|
60
|
+
max_workers: Maximum number of parallel workers (default: 128)
|
|
61
61
|
tar_sequences: Whether to tar sequence directories after download (default: True)
|
|
62
62
|
convert_webp: Whether to convert images to WebP (affects collection name)
|
|
63
63
|
check_ia: Whether to check if collection exists on Internet Archive (default: True)
|
|
@@ -66,7 +66,8 @@ class MapillaryDownloader:
|
|
|
66
66
|
self.base_output_dir = Path(output_dir)
|
|
67
67
|
self.username = username
|
|
68
68
|
self.quality = quality
|
|
69
|
-
self.
|
|
69
|
+
self.max_workers = max_workers
|
|
70
|
+
self.initial_workers = os.cpu_count() or 1 # Start with CPU count
|
|
70
71
|
self.tar_sequences = tar_sequences
|
|
71
72
|
self.convert_webp = convert_webp
|
|
72
73
|
self.check_ia = check_ia
|
|
@@ -177,26 +178,28 @@ class MapillaryDownloader:
|
|
|
177
178
|
logger.info(f"Downloading images for user: {self.username}")
|
|
178
179
|
logger.info(f"Output directory: {self.output_dir}")
|
|
179
180
|
logger.info(f"Quality: {self.quality}")
|
|
180
|
-
logger.info(f"
|
|
181
|
+
logger.info(f"Worker pool: {self.initial_workers} initial, {self.max_workers} max")
|
|
181
182
|
|
|
182
183
|
start_time = time.time()
|
|
183
184
|
|
|
184
|
-
# Step 1:
|
|
185
|
-
logger.info("Building seen_ids from metadata...")
|
|
185
|
+
# Step 1: Check if API fetch is already complete
|
|
186
186
|
reader = MetadataReader(self.metadata_file)
|
|
187
|
-
seen_ids = reader.get_all_ids()
|
|
188
187
|
api_complete = reader.is_complete
|
|
189
|
-
|
|
188
|
+
if api_complete:
|
|
189
|
+
logger.info("API fetch already complete, will only download")
|
|
190
|
+
else:
|
|
191
|
+
logger.info("API fetch incomplete, will fetch and download in parallel")
|
|
190
192
|
|
|
191
|
-
# Step 2: Start worker pool
|
|
193
|
+
# Step 2: Start worker pool
|
|
194
|
+
# Since workers do both I/O (download) and CPU (WebP), need many more workers
|
|
195
|
+
# Start with CPU count and scale up based on throughput
|
|
192
196
|
pool = AdaptiveWorkerPool(
|
|
193
|
-
worker_process, min_workers=
|
|
197
|
+
worker_process, min_workers=self.initial_workers, max_workers=self.max_workers, monitoring_interval=10
|
|
194
198
|
)
|
|
195
199
|
pool.start()
|
|
196
200
|
|
|
197
|
-
# Step 3: Download images from
|
|
201
|
+
# Step 3: Download images from metadata file while fetching new from API
|
|
198
202
|
downloaded_count = 0
|
|
199
|
-
skipped = 0
|
|
200
203
|
total_bytes = 0
|
|
201
204
|
failed_count = 0
|
|
202
205
|
submitted = 0
|
|
@@ -218,25 +221,18 @@ class MapillaryDownloader:
|
|
|
218
221
|
logger.info("API fetch thread: Starting...")
|
|
219
222
|
with open(self.metadata_file, "a") as meta_f:
|
|
220
223
|
for image in self.client.get_user_images(self.username, bbox=bbox):
|
|
221
|
-
image_id = image["id"]
|
|
222
|
-
|
|
223
|
-
# Skip if we already have this in our metadata file
|
|
224
|
-
if image_id in seen_ids:
|
|
225
|
-
continue
|
|
226
|
-
|
|
227
|
-
seen_ids.add(image_id)
|
|
228
224
|
new_images_count[0] += 1
|
|
229
225
|
|
|
230
|
-
# Save
|
|
226
|
+
# Save metadata (don't dedupe here, let the tailer handle it)
|
|
231
227
|
meta_f.write(json.dumps(image) + "\n")
|
|
232
228
|
meta_f.flush()
|
|
233
229
|
|
|
234
230
|
if new_images_count[0] % 1000 == 0:
|
|
235
|
-
logger.info(f"API: Fetched {new_images_count[0]}
|
|
231
|
+
logger.info(f"API: Fetched {new_images_count[0]} images from API")
|
|
236
232
|
|
|
237
233
|
# Mark as complete
|
|
238
234
|
MetadataReader.mark_complete(self.metadata_file)
|
|
239
|
-
logger.info(f"API fetch complete: {new_images_count[0]}
|
|
235
|
+
logger.info(f"API fetch complete: {new_images_count[0]} images")
|
|
240
236
|
finally:
|
|
241
237
|
api_fetch_complete.set()
|
|
242
238
|
|
|
@@ -254,8 +250,9 @@ class MapillaryDownloader:
|
|
|
254
250
|
# Helper to process results from queue
|
|
255
251
|
def process_results():
|
|
256
252
|
nonlocal downloaded_count, total_bytes, failed_count
|
|
253
|
+
# Drain ALL available results to prevent queue from filling up
|
|
257
254
|
while True:
|
|
258
|
-
result = pool.get_result(timeout=0
|
|
255
|
+
result = pool.get_result(timeout=0) # Non-blocking
|
|
259
256
|
if result is None:
|
|
260
257
|
break
|
|
261
258
|
|
|
@@ -386,6 +383,10 @@ class MapillaryDownloader:
|
|
|
386
383
|
|
|
387
384
|
last_position = f.tell()
|
|
388
385
|
|
|
386
|
+
# If API is already complete, we've read the whole file, so break
|
|
387
|
+
if api_fetch_complete is None:
|
|
388
|
+
break
|
|
389
|
+
|
|
389
390
|
# Sleep briefly before next tail iteration
|
|
390
391
|
time.sleep(0.1)
|
|
391
392
|
|
|
@@ -438,14 +439,7 @@ class MapillaryDownloader:
|
|
|
438
439
|
self._save_progress()
|
|
439
440
|
elapsed = time.time() - start_time
|
|
440
441
|
|
|
441
|
-
|
|
442
|
-
total_images = len(seen_ids)
|
|
443
|
-
skipped = total_images - downloaded_count - failed_count
|
|
444
|
-
|
|
445
|
-
logger.info(
|
|
446
|
-
f"Complete! Total {total_images} images, downloaded {downloaded_count} ({format_size(total_bytes)}), "
|
|
447
|
-
f"skipped {skipped}, failed {failed_count}"
|
|
448
|
-
)
|
|
442
|
+
logger.info(f"Complete! Downloaded {downloaded_count} ({format_size(total_bytes)}), " f"failed {failed_count}")
|
|
449
443
|
logger.info(f"Total time: {format_time(elapsed)}")
|
|
450
444
|
|
|
451
445
|
# Tar sequence directories for efficient IA uploads
|
|
@@ -23,7 +23,47 @@ class MetadataReader:
|
|
|
23
23
|
metadata_file: Path to metadata.jsonl or metadata.jsonl.gz
|
|
24
24
|
"""
|
|
25
25
|
self.metadata_file = Path(metadata_file)
|
|
26
|
-
self.is_complete =
|
|
26
|
+
self.is_complete = self._check_complete()
|
|
27
|
+
|
|
28
|
+
def _check_complete(self):
    """Report whether the metadata file ends with a completion marker.

    The file is streamed once (``.gz`` files are decompressed on the fly)
    while only its last ten lines are retained. Those lines are then
    scanned from the end for a JSON object carrying a truthy
    ``"__complete__"`` key. Blank lines, lines that are not valid JSON and
    JSON values that are not objects are ignored.

    Returns:
        True if a completion marker is found in the last ten lines.
        False if there is none, or if the file is missing or unreadable.
    """
    # Local import: the surrounding module's import block is not touched.
    from collections import deque

    opener = gzip.open if self.metadata_file.suffix == ".gz" else open

    try:
        # A bounded deque keeps memory constant regardless of file size.
        with opener(self.metadata_file, "rt") as f:
            tail = deque(f, maxlen=10)
    except Exception:
        # Deliberate best-effort probe: a missing, truncated or undecodable
        # file is simply reported as "not complete" instead of aborting.
        return False

    for raw_line in reversed(tail):
        text = raw_line.strip()
        if not text:
            continue
        try:
            record = json.loads(text)
        except (ValueError, RecursionError):
            # Malformed (e.g. partially written) line - keep looking.
            continue
        # Only JSON objects can be a marker; a bare number, string or list
        # must not hide a valid marker on an earlier line.
        if isinstance(record, dict) and record.get("__complete__"):
            return True

    return False
|
|
27
67
|
|
|
28
68
|
def iter_images(self, quality_field=None, downloaded_ids=None):
|
|
29
69
|
"""Stream images from metadata file with filtering.
|
{mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/src/mapillary_downloader/worker.py
RENAMED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
"""Worker process for parallel image download and conversion."""
|
|
2
2
|
|
|
3
3
|
import os
|
|
4
|
+
import signal
|
|
4
5
|
import tempfile
|
|
5
6
|
from pathlib import Path
|
|
6
7
|
import requests
|
|
@@ -17,6 +18,12 @@ def worker_process(work_queue, result_queue, worker_id):
|
|
|
17
18
|
result_queue: Queue to push results to
|
|
18
19
|
worker_id: Unique worker identifier
|
|
19
20
|
"""
|
|
21
|
+
# Ignore SIGINT in worker process - parent will handle it
|
|
22
|
+
signal.signal(signal.SIGINT, signal.SIG_IGN)
|
|
23
|
+
|
|
24
|
+
# Create session once per worker (reuse HTTP connections)
|
|
25
|
+
session = requests.Session()
|
|
26
|
+
|
|
20
27
|
while True:
|
|
21
28
|
work_item = work_queue.get()
|
|
22
29
|
|
|
@@ -27,14 +34,17 @@ def worker_process(work_queue, result_queue, worker_id):
|
|
|
27
34
|
# Unpack work item
|
|
28
35
|
image_data, output_dir, quality, convert_webp, access_token = work_item
|
|
29
36
|
|
|
37
|
+
# Update session auth for this request
|
|
38
|
+
session.headers.update({"Authorization": f"OAuth {access_token}"})
|
|
39
|
+
|
|
30
40
|
# Process the image
|
|
31
|
-
result = download_and_convert_image(image_data, output_dir, quality, convert_webp,
|
|
41
|
+
result = download_and_convert_image(image_data, output_dir, quality, convert_webp, session)
|
|
32
42
|
|
|
33
43
|
# Push result back
|
|
34
44
|
result_queue.put(result)
|
|
35
45
|
|
|
36
46
|
|
|
37
|
-
def download_and_convert_image(image_data, output_dir, quality, convert_webp,
|
|
47
|
+
def download_and_convert_image(image_data, output_dir, quality, convert_webp, session):
|
|
38
48
|
"""Download and optionally convert a single image.
|
|
39
49
|
|
|
40
50
|
This function is designed to run in a worker process.
|
|
@@ -44,7 +54,7 @@ def download_and_convert_image(image_data, output_dir, quality, convert_webp, ac
|
|
|
44
54
|
output_dir: Base output directory path
|
|
45
55
|
quality: Quality level (256, 1024, 2048, original)
|
|
46
56
|
convert_webp: Whether to convert to WebP
|
|
47
|
-
|
|
57
|
+
session: requests.Session with auth already configured
|
|
48
58
|
|
|
49
59
|
Returns:
|
|
50
60
|
Tuple of (image_id, bytes_downloaded, success, error_msg)
|
|
@@ -78,11 +88,7 @@ def download_and_convert_image(image_data, output_dir, quality, convert_webp, ac
|
|
|
78
88
|
jpg_path = img_dir / f"{image_id}.jpg"
|
|
79
89
|
final_path = jpg_path
|
|
80
90
|
|
|
81
|
-
# Download image
|
|
82
|
-
# No retries for CDN images - they're cheap, just skip failures and move on
|
|
83
|
-
session = requests.Session()
|
|
84
|
-
session.headers.update({"Authorization": f"OAuth {access_token}"})
|
|
85
|
-
|
|
91
|
+
# Download image (using session passed from worker)
|
|
86
92
|
bytes_downloaded = 0
|
|
87
93
|
|
|
88
94
|
try:
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
"""Adaptive worker pool for parallel processing."""
|
|
2
|
+
|
|
3
|
+
import logging
import multiprocessing as mp
import queue
import time
from collections import deque

logger = logging.getLogger("mapillary_downloader")


class AdaptiveWorkerPool:
    """Process pool that starts small and grows while throughput keeps up.

    The owner is expected to call ``check_throughput`` regularly. At most
    once per ``monitoring_interval`` seconds it measures items/s since the
    previous measurement and then:

    - first measurement: adds 20% more workers
    - throughput dropped by more than 5%: keeps the current worker count
    - throughput stable or increasing: adds 10-30% more workers, depending
      on the measured gain per previously added worker

    The pool never grows beyond ``max_workers`` and never removes workers;
    they run until ``shutdown`` terminates them.
    """

    # Fractions of the current worker count added per scaling step.
    _STEP_DEFAULT = 0.2
    _STEP_GOOD_GAIN = 0.3
    _STEP_SMALL_GAIN = 0.1

    def __init__(self, worker_func, min_workers=4, max_workers=16, monitoring_interval=10):
        """Initialize adaptive worker pool.

        Args:
            worker_func: Function to run in each worker process; called as
                ``worker_func(work_queue, result_queue, worker_id)``
            min_workers: Number of workers to start with (capped at max_workers)
            max_workers: Maximum number of workers
            monitoring_interval: Minimum seconds between throughput checks
        """
        self.worker_func = worker_func
        # The starting size may exceed the cap (e.g. CPU count vs. a small
        # user-supplied maximum); the cap always wins.
        self.min_workers = min(min_workers, max_workers)
        self.max_workers = max_workers
        self.monitoring_interval = monitoring_interval

        # Queues
        self.work_queue = mp.Queue(maxsize=max_workers)
        self.result_queue = mp.Queue()

        # Worker management
        self.workers = []
        self.current_workers = self.min_workers  # Start small and ramp up

        # Throughput monitoring
        self.throughput_history = deque(maxlen=5)  # Last 5 measurements
        self.worker_count_history = deque(maxlen=5)  # Worker count at each measurement
        self.last_processed = 0
        # Monotonic clock: interval measurement must not be affected by
        # wall-clock adjustments.
        self.last_check_time = time.monotonic()

        self.running = False

    def start(self):
        """Start the worker pool with the initial number of workers."""
        self.running = True
        logger.info(f"Starting worker pool with {self.current_workers} workers")

        for i in range(self.current_workers):
            self._add_worker(i)

    def _add_worker(self, worker_id):
        """Spawn one worker process and register it in ``self.workers``."""
        p = mp.Process(target=self.worker_func, args=(self.work_queue, self.result_queue, worker_id))
        p.start()
        self.workers.append(p)
        logger.debug(f"Started worker {worker_id}")

    def _scale_up(self, count):
        """Add up to ``count`` workers without exceeding ``max_workers``.

        Returns:
            Number of workers actually added
        """
        added = 0
        while added < count and len(self.workers) < self.max_workers:
            self._add_worker(len(self.workers))
            added += 1
        self.current_workers = len(self.workers)
        return added

    @staticmethod
    def _step(worker_count, fraction):
        """Return the scaling step for ``fraction`` of ``worker_count`` (at least 1)."""
        return max(1, int(worker_count * fraction))

    def submit(self, work_item):
        """Submit work to the pool (blocks if queue is full)."""
        self.work_queue.put(work_item)

    def get_result(self, timeout=None):
        """Get a result from the workers.

        Args:
            timeout: Seconds to wait; None blocks until a result arrives,
                0 returns immediately

        Returns:
            Result from worker, or None if timeout
        """
        try:
            return self.result_queue.get(timeout=timeout)
        except queue.Empty:
            return None

    def check_throughput(self, total_processed):
        """Check throughput and add workers if it is still improving.

        Does nothing until ``monitoring_interval`` seconds have passed since
        the previous measurement.

        Args:
            total_processed: Total number of items processed so far
        """
        now = time.monotonic()
        elapsed = now - self.last_check_time

        # "elapsed <= 0" guards the division below when the interval is 0.
        if elapsed < self.monitoring_interval or elapsed <= 0:
            # Called frequently, so let logging format lazily.
            logger.debug("Throughput check skipped (elapsed %.1fs < %ss)", elapsed, self.monitoring_interval)
            return

        # Calculate current throughput (items/sec)
        throughput = (total_processed - self.last_processed) / elapsed

        current_workers = len(self.workers)
        self.throughput_history.append(throughput)
        self.worker_count_history.append(current_workers)
        self.last_processed = total_processed
        self.last_check_time = now

        logger.info(
            f"Throughput: {throughput:.1f} items/s (workers: {current_workers}/{self.max_workers}, "
            f"history: {len(self.throughput_history)} measurements)"
        )

        # Need at least 2 measurements to calculate gain per worker
        if len(self.throughput_history) < 2:
            # First measurement - add 20% more workers
            if current_workers < self.max_workers:
                added = self._scale_up(self._step(current_workers, self._STEP_DEFAULT))
                logger.info(f"Ramping up: added {added} workers (now {self.current_workers}/{self.max_workers})")
            return

        # Compare against the previous measurement
        current_throughput = self.throughput_history[-1]
        previous_throughput = self.throughput_history[-2]
        previous_workers = self.worker_count_history[-2]

        throughput_gain = current_throughput - previous_throughput
        workers_added = current_workers - previous_workers

        logger.debug(
            f"Trend: {previous_throughput:.1f} items/s @ {previous_workers} workers → "
            f"{current_throughput:.1f} items/s @ {current_workers} workers "
            f"(gain: {throughput_gain:.1f}, added: {workers_added})"
        )

        # Throughput decreased significantly: stop adding workers
        if current_throughput < previous_throughput * 0.95:
            logger.info(
                f"Throughput decreasing ({current_throughput:.1f} vs {previous_throughput:.1f} items/s), "
                f"stopping at {current_workers} workers"
            )
            return

        # Stable or increasing, but already at the cap
        if current_workers >= self.max_workers:
            logger.info(f"At optimal worker count: {current_workers} workers, {current_throughput:.1f} items/s")
            return

        if workers_added > 0 and throughput_gain > 0:
            gain_per_worker = throughput_gain / workers_added
            logger.debug(f"Gain per worker: {gain_per_worker:.2f} items/s")

            # Assume diminishing returns: the smaller the gain each added
            # worker brought, the smaller the next step.
            if gain_per_worker > 0.5:
                fraction = self._STEP_GOOD_GAIN
            elif gain_per_worker > 0.2:
                fraction = self._STEP_DEFAULT
            else:
                fraction = self._STEP_SMALL_GAIN

            added = self._scale_up(self._step(current_workers, fraction))
            logger.info(
                f"Throughput increasing (gain: {gain_per_worker:.2f} items/s per worker), "
                f"added {added} workers (now {self.current_workers}/{self.max_workers})"
            )
        else:
            # Fallback to 20% if we can't calculate gain per worker
            added = self._scale_up(self._step(current_workers, self._STEP_DEFAULT))
            logger.info(f"Ramping up: added {added} workers (now {self.current_workers}/{self.max_workers})")

    def shutdown(self, timeout=2):
        """Terminate all workers and wait briefly for them to exit.

        Workers are stopped with ``Process.terminate()`` rather than asked
        to finish, so work in progress is abandoned.

        Args:
            timeout: Total seconds to wait for all workers to exit
                (None waits indefinitely)
        """
        logger.info("Shutting down worker pool...")
        self.running = False

        # Terminate all workers immediately (they ignore SIGINT so we need to be forceful)
        for p in self.workers:
            if p.is_alive():
                p.terminate()

        # One shared deadline, so the wait does not grow with the worker count
        deadline = None if timeout is None else time.monotonic() + timeout
        for p in self.workers:
            remaining = None if deadline is None else max(0.0, deadline - time.monotonic())
            p.join(timeout=remaining)
            if p.is_alive():
                logger.warning(f"Worker {p.pid} still running after terminate")

        logger.info("Worker pool shutdown complete")
|
|
@@ -1,136 +0,0 @@
|
|
|
1
|
-
"""Adaptive worker pool for parallel processing."""
|
|
2
|
-
|
|
3
|
-
import logging
|
|
4
|
-
import multiprocessing as mp
|
|
5
|
-
import queue
|
|
6
|
-
import time
|
|
7
|
-
from collections import deque
|
|
8
|
-
|
|
9
|
-
logger = logging.getLogger("mapillary_downloader")
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
class AdaptiveWorkerPool:
|
|
13
|
-
"""Worker pool that scales based on throughput.
|
|
14
|
-
|
|
15
|
-
Monitors throughput every 30 seconds and adjusts worker count:
|
|
16
|
-
- If throughput increasing: add workers (up to max)
|
|
17
|
-
- If throughput plateauing/decreasing: reduce workers
|
|
18
|
-
"""
|
|
19
|
-
|
|
20
|
-
def __init__(self, worker_func, min_workers=4, max_workers=16, monitoring_interval=30):
|
|
21
|
-
"""Initialize adaptive worker pool.
|
|
22
|
-
|
|
23
|
-
Args:
|
|
24
|
-
worker_func: Function to run in each worker (must accept work_queue, result_queue)
|
|
25
|
-
min_workers: Minimum number of workers
|
|
26
|
-
max_workers: Maximum number of workers
|
|
27
|
-
monitoring_interval: Seconds between throughput checks
|
|
28
|
-
"""
|
|
29
|
-
self.worker_func = worker_func
|
|
30
|
-
self.min_workers = min_workers
|
|
31
|
-
self.max_workers = max_workers
|
|
32
|
-
self.monitoring_interval = monitoring_interval
|
|
33
|
-
|
|
34
|
-
# Queues
|
|
35
|
-
self.work_queue = mp.Queue(maxsize=max_workers)
|
|
36
|
-
self.result_queue = mp.Queue()
|
|
37
|
-
|
|
38
|
-
# Worker management
|
|
39
|
-
self.workers = []
|
|
40
|
-
self.current_workers = min_workers
|
|
41
|
-
|
|
42
|
-
# Throughput monitoring
|
|
43
|
-
self.throughput_history = deque(maxlen=5) # Last 5 measurements
|
|
44
|
-
self.last_processed = 0
|
|
45
|
-
self.last_check_time = time.time()
|
|
46
|
-
|
|
47
|
-
self.running = False
|
|
48
|
-
|
|
49
|
-
def start(self):
|
|
50
|
-
"""Start the worker pool."""
|
|
51
|
-
self.running = True
|
|
52
|
-
logger.info(f"Starting worker pool with {self.current_workers} workers")
|
|
53
|
-
|
|
54
|
-
for i in range(self.current_workers):
|
|
55
|
-
self._add_worker(i)
|
|
56
|
-
|
|
57
|
-
def _add_worker(self, worker_id):
|
|
58
|
-
"""Add a new worker to the pool."""
|
|
59
|
-
p = mp.Process(target=self.worker_func, args=(self.work_queue, self.result_queue, worker_id))
|
|
60
|
-
p.start()
|
|
61
|
-
self.workers.append(p)
|
|
62
|
-
logger.debug(f"Started worker {worker_id}")
|
|
63
|
-
|
|
64
|
-
def submit(self, work_item):
|
|
65
|
-
"""Submit work to the pool (blocks if queue is full)."""
|
|
66
|
-
self.work_queue.put(work_item)
|
|
67
|
-
|
|
68
|
-
def get_result(self, timeout=None):
|
|
69
|
-
"""Get a result from the workers.
|
|
70
|
-
|
|
71
|
-
Returns:
|
|
72
|
-
Result from worker, or None if timeout
|
|
73
|
-
"""
|
|
74
|
-
try:
|
|
75
|
-
return self.result_queue.get(timeout=timeout)
|
|
76
|
-
except queue.Empty:
|
|
77
|
-
return None
|
|
78
|
-
|
|
79
|
-
def check_throughput(self, total_processed):
|
|
80
|
-
"""Check throughput and adjust workers if needed.
|
|
81
|
-
|
|
82
|
-
Args:
|
|
83
|
-
total_processed: Total number of items processed so far
|
|
84
|
-
"""
|
|
85
|
-
now = time.time()
|
|
86
|
-
elapsed = now - self.last_check_time
|
|
87
|
-
|
|
88
|
-
if elapsed < self.monitoring_interval:
|
|
89
|
-
return
|
|
90
|
-
|
|
91
|
-
# Calculate current throughput (items/sec)
|
|
92
|
-
items_since_check = total_processed - self.last_processed
|
|
93
|
-
throughput = items_since_check / elapsed
|
|
94
|
-
|
|
95
|
-
self.throughput_history.append(throughput)
|
|
96
|
-
self.last_processed = total_processed
|
|
97
|
-
self.last_check_time = now
|
|
98
|
-
|
|
99
|
-
# Need at least 3 measurements to detect trends
|
|
100
|
-
if len(self.throughput_history) < 3:
|
|
101
|
-
return
|
|
102
|
-
|
|
103
|
-
# Check if throughput is increasing
|
|
104
|
-
recent_avg = sum(list(self.throughput_history)[-2:]) / 2
|
|
105
|
-
older_avg = sum(list(self.throughput_history)[-4:-2]) / 2
|
|
106
|
-
|
|
107
|
-
if recent_avg > older_avg * 1.1 and len(self.workers) < self.max_workers:
|
|
108
|
-
# Throughput increasing by >10%, add workers
|
|
109
|
-
new_worker_id = len(self.workers)
|
|
110
|
-
self._add_worker(new_worker_id)
|
|
111
|
-
self.current_workers += 1
|
|
112
|
-
logger.info(f"Throughput increasing ({throughput:.1f} items/s), added worker (now {self.current_workers})")
|
|
113
|
-
|
|
114
|
-
elif recent_avg < older_avg * 0.9 and len(self.workers) > self.min_workers:
|
|
115
|
-
# Throughput decreasing by >10%, remove worker
|
|
116
|
-
# (workers will exit naturally when they finish current work)
|
|
117
|
-
self.current_workers = max(self.min_workers, self.current_workers - 1)
|
|
118
|
-
logger.info(f"Throughput plateauing ({throughput:.1f} items/s), reducing to {self.current_workers} workers")
|
|
119
|
-
|
|
120
|
-
def shutdown(self, timeout=30):
|
|
121
|
-
"""Shutdown the worker pool gracefully."""
|
|
122
|
-
logger.info("Shutting down worker pool...")
|
|
123
|
-
self.running = False
|
|
124
|
-
|
|
125
|
-
# Send stop signals
|
|
126
|
-
for _ in self.workers:
|
|
127
|
-
self.work_queue.put(None)
|
|
128
|
-
|
|
129
|
-
# Wait for workers to finish
|
|
130
|
-
for p in self.workers:
|
|
131
|
-
p.join(timeout=timeout)
|
|
132
|
-
if p.is_alive():
|
|
133
|
-
logger.warning(f"Worker {p.pid} did not exit cleanly, terminating")
|
|
134
|
-
p.terminate()
|
|
135
|
-
|
|
136
|
-
logger.info("Worker pool shutdown complete")
|
|
File without changes
|
|
File without changes
|
{mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/src/mapillary_downloader/__init__.py
RENAMED
|
File without changes
|
{mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/src/mapillary_downloader/client.py
RENAMED
|
File without changes
|
{mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/src/mapillary_downloader/exif_writer.py
RENAMED
|
File without changes
|
{mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/src/mapillary_downloader/ia_check.py
RENAMED
|
File without changes
|
{mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/src/mapillary_downloader/ia_meta.py
RENAMED
|
File without changes
|
{mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/src/mapillary_downloader/logging_config.py
RENAMED
|
File without changes
|
{mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/src/mapillary_downloader/tar_sequences.py
RENAMED
|
File without changes
|
|
File without changes
|
{mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/src/mapillary_downloader/webp_converter.py
RENAMED
|
File without changes
|