mapillary-downloader 0.5.0__tar.gz → 0.5.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (19) hide show
  1. {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/PKG-INFO +1 -1
  2. {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/pyproject.toml +1 -1
  3. {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/src/mapillary_downloader/__main__.py +4 -4
  4. {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/src/mapillary_downloader/downloader.py +25 -31
  5. {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/src/mapillary_downloader/metadata_reader.py +41 -1
  6. {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/src/mapillary_downloader/worker.py +14 -8
  7. mapillary_downloader-0.5.2/src/mapillary_downloader/worker_pool.py +202 -0
  8. mapillary_downloader-0.5.0/src/mapillary_downloader/worker_pool.py +0 -136
  9. {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/LICENSE.md +0 -0
  10. {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/README.md +0 -0
  11. {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/src/mapillary_downloader/__init__.py +0 -0
  12. {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/src/mapillary_downloader/client.py +0 -0
  13. {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/src/mapillary_downloader/exif_writer.py +0 -0
  14. {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/src/mapillary_downloader/ia_check.py +0 -0
  15. {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/src/mapillary_downloader/ia_meta.py +0 -0
  16. {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/src/mapillary_downloader/logging_config.py +0 -0
  17. {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/src/mapillary_downloader/tar_sequences.py +0 -0
  18. {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/src/mapillary_downloader/utils.py +0 -0
  19. {mapillary_downloader-0.5.0 → mapillary_downloader-0.5.2}/src/mapillary_downloader/webp_converter.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mapillary_downloader
3
- Version: 0.5.0
3
+ Version: 0.5.2
4
4
  Summary: Download your Mapillary data before it's gone
5
5
  Author-email: Gareth Davidson <gaz@bitplane.net>
6
6
  Requires-Python: >=3.10
@@ -1,7 +1,7 @@
1
1
  [project]
2
2
  name = "mapillary_downloader"
3
3
  description = "Download your Mapillary data before it's gone"
4
- version = "0.5.0"
4
+ version = "0.5.2"
5
5
  authors = [
6
6
  { name = "Gareth Davidson", email = "gaz@bitplane.net" }
7
7
  ]
@@ -41,10 +41,10 @@ def main():
41
41
  help="Don't convert to WebP (WebP conversion is enabled by default, saves ~70%% disk space)",
42
42
  )
43
43
  parser.add_argument(
44
- "--workers",
44
+ "--max-workers",
45
45
  type=int,
46
- default=None,
47
- help="Number of parallel workers (default: half of CPU cores)",
46
+ default=128,
47
+ help="Maximum number of parallel workers (default: 128)",
48
48
  )
49
49
  parser.add_argument(
50
50
  "--no-tar",
@@ -114,7 +114,7 @@ def main():
114
114
  args.output,
115
115
  username,
116
116
  args.quality,
117
- workers=args.workers,
117
+ max_workers=args.max_workers,
118
118
  tar_sequences=not args.no_tar,
119
119
  convert_webp=convert_webp,
120
120
  check_ia=not args.no_check_ia,
@@ -45,7 +45,7 @@ class MapillaryDownloader:
45
45
  output_dir,
46
46
  username=None,
47
47
  quality=None,
48
- workers=None,
48
+ max_workers=128,
49
49
  tar_sequences=True,
50
50
  convert_webp=False,
51
51
  check_ia=True,
@@ -57,7 +57,7 @@ class MapillaryDownloader:
57
57
  output_dir: Base directory to save downloads (final destination)
58
58
  username: Mapillary username (for collection directory)
59
59
  quality: Image quality (for collection directory)
60
- workers: Number of parallel workers (default: half of cpu_count)
60
+ max_workers: Maximum number of parallel workers (default: 128)
61
61
  tar_sequences: Whether to tar sequence directories after download (default: True)
62
62
  convert_webp: Whether to convert images to WebP (affects collection name)
63
63
  check_ia: Whether to check if collection exists on Internet Archive (default: True)
@@ -66,7 +66,8 @@ class MapillaryDownloader:
66
66
  self.base_output_dir = Path(output_dir)
67
67
  self.username = username
68
68
  self.quality = quality
69
- self.workers = workers if workers is not None else max(1, os.cpu_count() // 2)
69
+ self.max_workers = max_workers
70
+ self.initial_workers = os.cpu_count() or 1 # Start with CPU count
70
71
  self.tar_sequences = tar_sequences
71
72
  self.convert_webp = convert_webp
72
73
  self.check_ia = check_ia
@@ -177,26 +178,28 @@ class MapillaryDownloader:
177
178
  logger.info(f"Downloading images for user: {self.username}")
178
179
  logger.info(f"Output directory: {self.output_dir}")
179
180
  logger.info(f"Quality: {self.quality}")
180
- logger.info(f"Using {self.workers} parallel workers")
181
+ logger.info(f"Worker pool: {self.initial_workers} initial, {self.max_workers} max")
181
182
 
182
183
  start_time = time.time()
183
184
 
184
- # Step 1: Build seen_ids from metadata file (streaming, only IDs)
185
- logger.info("Building seen_ids from metadata...")
185
+ # Step 1: Check if API fetch is already complete
186
186
  reader = MetadataReader(self.metadata_file)
187
- seen_ids = reader.get_all_ids()
188
187
  api_complete = reader.is_complete
189
- logger.info(f"Found {len(seen_ids)} existing images in metadata")
188
+ if api_complete:
189
+ logger.info("API fetch already complete, will only download")
190
+ else:
191
+ logger.info("API fetch incomplete, will fetch and download in parallel")
190
192
 
191
- # Step 2: Start worker pool (fork AFTER building seen_ids, BEFORE downloading)
193
+ # Step 2: Start worker pool
194
+ # Since workers do both I/O (download) and CPU (WebP), need many more workers
195
+ # Start with CPU count and scale up based on throughput
192
196
  pool = AdaptiveWorkerPool(
193
- worker_process, min_workers=max(1, self.workers // 2), max_workers=self.workers, monitoring_interval=30
197
+ worker_process, min_workers=self.initial_workers, max_workers=self.max_workers, monitoring_interval=10
194
198
  )
195
199
  pool.start()
196
200
 
197
- # Step 3: Download images from existing metadata while fetching new from API
201
+ # Step 3: Download images from metadata file while fetching new from API
198
202
  downloaded_count = 0
199
- skipped = 0
200
203
  total_bytes = 0
201
204
  failed_count = 0
202
205
  submitted = 0
@@ -218,25 +221,18 @@ class MapillaryDownloader:
218
221
  logger.info("API fetch thread: Starting...")
219
222
  with open(self.metadata_file, "a") as meta_f:
220
223
  for image in self.client.get_user_images(self.username, bbox=bbox):
221
- image_id = image["id"]
222
-
223
- # Skip if we already have this in our metadata file
224
- if image_id in seen_ids:
225
- continue
226
-
227
- seen_ids.add(image_id)
228
224
  new_images_count[0] += 1
229
225
 
230
- # Save new metadata
226
+ # Save metadata (don't dedupe here, let the tailer handle it)
231
227
  meta_f.write(json.dumps(image) + "\n")
232
228
  meta_f.flush()
233
229
 
234
230
  if new_images_count[0] % 1000 == 0:
235
- logger.info(f"API: Fetched {new_images_count[0]} new images from API")
231
+ logger.info(f"API: Fetched {new_images_count[0]} images from API")
236
232
 
237
233
  # Mark as complete
238
234
  MetadataReader.mark_complete(self.metadata_file)
239
- logger.info(f"API fetch complete: {new_images_count[0]} new images")
235
+ logger.info(f"API fetch complete: {new_images_count[0]} images")
240
236
  finally:
241
237
  api_fetch_complete.set()
242
238
 
@@ -254,8 +250,9 @@ class MapillaryDownloader:
254
250
  # Helper to process results from queue
255
251
  def process_results():
256
252
  nonlocal downloaded_count, total_bytes, failed_count
253
+ # Drain ALL available results to prevent queue from filling up
257
254
  while True:
258
- result = pool.get_result(timeout=0.001)
255
+ result = pool.get_result(timeout=0) # Non-blocking
259
256
  if result is None:
260
257
  break
261
258
 
@@ -386,6 +383,10 @@ class MapillaryDownloader:
386
383
 
387
384
  last_position = f.tell()
388
385
 
386
+ # If API is already complete, we've read the whole file, so break
387
+ if api_fetch_complete is None:
388
+ break
389
+
389
390
  # Sleep briefly before next tail iteration
390
391
  time.sleep(0.1)
391
392
 
@@ -438,14 +439,7 @@ class MapillaryDownloader:
438
439
  self._save_progress()
439
440
  elapsed = time.time() - start_time
440
441
 
441
- # Count total images in metadata
442
- total_images = len(seen_ids)
443
- skipped = total_images - downloaded_count - failed_count
444
-
445
- logger.info(
446
- f"Complete! Total {total_images} images, downloaded {downloaded_count} ({format_size(total_bytes)}), "
447
- f"skipped {skipped}, failed {failed_count}"
448
- )
442
+ logger.info(f"Complete! Downloaded {downloaded_count} ({format_size(total_bytes)}), " f"failed {failed_count}")
449
443
  logger.info(f"Total time: {format_time(elapsed)}")
450
444
 
451
445
  # Tar sequence directories for efficient IA uploads
@@ -23,7 +23,47 @@ class MetadataReader:
23
23
  metadata_file: Path to metadata.jsonl or metadata.jsonl.gz
24
24
  """
25
25
  self.metadata_file = Path(metadata_file)
26
- self.is_complete = False
26
+ self.is_complete = self._check_complete()
27
+
28
+ def _check_complete(self):
29
+ """Check if metadata file has completion marker.
30
+
31
+ Returns:
32
+ True if completion marker found, False otherwise
33
+ """
34
+ if not self.metadata_file.exists():
35
+ return False
36
+
37
+ # Check last few lines for completion marker (it should be at the end)
38
+ try:
39
+ if self.metadata_file.suffix == ".gz":
40
+ file_handle = gzip.open(self.metadata_file, "rt")
41
+ else:
42
+ file_handle = open(self.metadata_file)
43
+
44
+ with file_handle as f:
45
+ # Read last 10 lines to find completion marker
46
+ lines = []
47
+ for line in f:
48
+ lines.append(line)
49
+ if len(lines) > 10:
50
+ lines.pop(0)
51
+
52
+ # Check if any of the last lines is the completion marker
53
+ for line in reversed(lines):
54
+ line = line.strip()
55
+ if not line:
56
+ continue
57
+ try:
58
+ data = json.loads(line)
59
+ if data.get("__complete__"):
60
+ return True
61
+ except json.JSONDecodeError:
62
+ continue
63
+
64
+ return False
65
+ except Exception:
66
+ return False
27
67
 
28
68
  def iter_images(self, quality_field=None, downloaded_ids=None):
29
69
  """Stream images from metadata file with filtering.
@@ -1,6 +1,7 @@
1
1
  """Worker process for parallel image download and conversion."""
2
2
 
3
3
  import os
4
+ import signal
4
5
  import tempfile
5
6
  from pathlib import Path
6
7
  import requests
@@ -17,6 +18,12 @@ def worker_process(work_queue, result_queue, worker_id):
17
18
  result_queue: Queue to push results to
18
19
  worker_id: Unique worker identifier
19
20
  """
21
+ # Ignore SIGINT in worker process - parent will handle it
22
+ signal.signal(signal.SIGINT, signal.SIG_IGN)
23
+
24
+ # Create session once per worker (reuse HTTP connections)
25
+ session = requests.Session()
26
+
20
27
  while True:
21
28
  work_item = work_queue.get()
22
29
 
@@ -27,14 +34,17 @@ def worker_process(work_queue, result_queue, worker_id):
27
34
  # Unpack work item
28
35
  image_data, output_dir, quality, convert_webp, access_token = work_item
29
36
 
37
+ # Update session auth for this request
38
+ session.headers.update({"Authorization": f"OAuth {access_token}"})
39
+
30
40
  # Process the image
31
- result = download_and_convert_image(image_data, output_dir, quality, convert_webp, access_token)
41
+ result = download_and_convert_image(image_data, output_dir, quality, convert_webp, session)
32
42
 
33
43
  # Push result back
34
44
  result_queue.put(result)
35
45
 
36
46
 
37
- def download_and_convert_image(image_data, output_dir, quality, convert_webp, access_token):
47
+ def download_and_convert_image(image_data, output_dir, quality, convert_webp, session):
38
48
  """Download and optionally convert a single image.
39
49
 
40
50
  This function is designed to run in a worker process.
@@ -44,7 +54,7 @@ def download_and_convert_image(image_data, output_dir, quality, convert_webp, ac
44
54
  output_dir: Base output directory path
45
55
  quality: Quality level (256, 1024, 2048, original)
46
56
  convert_webp: Whether to convert to WebP
47
- access_token: Mapillary API access token
57
+ session: requests.Session with auth already configured
48
58
 
49
59
  Returns:
50
60
  Tuple of (image_id, bytes_downloaded, success, error_msg)
@@ -78,11 +88,7 @@ def download_and_convert_image(image_data, output_dir, quality, convert_webp, ac
78
88
  jpg_path = img_dir / f"{image_id}.jpg"
79
89
  final_path = jpg_path
80
90
 
81
- # Download image
82
- # No retries for CDN images - they're cheap, just skip failures and move on
83
- session = requests.Session()
84
- session.headers.update({"Authorization": f"OAuth {access_token}"})
85
-
91
+ # Download image (using session passed from worker)
86
92
  bytes_downloaded = 0
87
93
 
88
94
  try:
@@ -0,0 +1,202 @@
1
+ """Adaptive worker pool for parallel processing."""
2
+
3
+ import logging
4
+ import multiprocessing as mp
5
+ import queue
6
+ import time
7
+ from collections import deque
8
+
9
+ logger = logging.getLogger("mapillary_downloader")
10
+
11
+
12
+ class AdaptiveWorkerPool:
13
+ """Worker pool that scales based on throughput.
14
+
15
+ Monitors throughput at each monitoring interval (default 10s) and adjusts worker count:
16
+ - If throughput increasing: add workers (up to max)
17
+ - If throughput plateauing/decreasing: stop adding workers
18
+ """
19
+
20
+ def __init__(self, worker_func, min_workers=4, max_workers=16, monitoring_interval=10):
21
+ """Initialize adaptive worker pool.
22
+
23
+ Args:
24
+ worker_func: Function to run in each worker (must accept work_queue, result_queue)
25
+ min_workers: Minimum number of workers
26
+ max_workers: Maximum number of workers
27
+ monitoring_interval: Seconds between throughput checks
28
+ """
29
+ self.worker_func = worker_func
30
+ self.min_workers = min_workers
31
+ self.max_workers = max_workers
32
+ self.monitoring_interval = monitoring_interval
33
+
34
+ # Queues
35
+ self.work_queue = mp.Queue(maxsize=max_workers)
36
+ self.result_queue = mp.Queue()
37
+
38
+ # Worker management
39
+ self.workers = []
40
+ self.current_workers = min_workers # Start small and ramp up
41
+
42
+ # Throughput monitoring
43
+ self.throughput_history = deque(maxlen=5) # Last 5 measurements
44
+ self.worker_count_history = deque(maxlen=5) # Track worker counts at each measurement
45
+ self.last_processed = 0
46
+ self.last_check_time = time.time()
47
+
48
+ self.running = False
49
+
50
+ def start(self):
51
+ """Start the worker pool."""
52
+ self.running = True
53
+ logger.info(f"Starting worker pool with {self.current_workers} workers")
54
+
55
+ for i in range(self.current_workers):
56
+ self._add_worker(i)
57
+
58
+ def _add_worker(self, worker_id):
59
+ """Add a new worker to the pool."""
60
+ p = mp.Process(target=self.worker_func, args=(self.work_queue, self.result_queue, worker_id))
61
+ p.start()
62
+ self.workers.append(p)
63
+ logger.debug(f"Started worker {worker_id}")
64
+
65
+ def submit(self, work_item):
66
+ """Submit work to the pool (blocks if queue is full)."""
67
+ self.work_queue.put(work_item)
68
+
69
+ def get_result(self, timeout=None):
70
+ """Get a result from the workers.
71
+
72
+ Returns:
73
+ Result from worker, or None if timeout
74
+ """
75
+ try:
76
+ return self.result_queue.get(timeout=timeout)
77
+ except queue.Empty:
78
+ return None
79
+
80
+ def check_throughput(self, total_processed):
81
+ """Check throughput and adjust workers if needed.
82
+
83
+ Args:
84
+ total_processed: Total number of items processed so far
85
+ """
86
+ now = time.time()
87
+ elapsed = now - self.last_check_time
88
+
89
+ if elapsed < self.monitoring_interval:
90
+ logger.debug(f"Throughput check skipped (elapsed {elapsed:.1f}s < {self.monitoring_interval}s)")
91
+ return
92
+
93
+ # Calculate current throughput (items/sec)
94
+ items_since_check = total_processed - self.last_processed
95
+ throughput = items_since_check / elapsed
96
+
97
+ current_workers = len(self.workers)
98
+ self.throughput_history.append(throughput)
99
+ self.worker_count_history.append(current_workers)
100
+ self.last_processed = total_processed
101
+ self.last_check_time = now
102
+
103
+ logger.info(
104
+ f"Throughput: {throughput:.1f} items/s (workers: {current_workers}/{self.max_workers}, "
105
+ f"history: {len(self.throughput_history)} measurements)"
106
+ )
107
+
108
+ # Need at least 2 measurements to calculate gain per worker
109
+ if len(self.throughput_history) < 2:
110
+ # First measurement - add 20% more workers
111
+ if current_workers < self.max_workers:
112
+ workers_to_add = max(1, int(current_workers * 0.2))
113
+ for i in range(workers_to_add):
114
+ if len(self.workers) < self.max_workers:
115
+ new_worker_id = len(self.workers)
116
+ self._add_worker(new_worker_id)
117
+ self.current_workers += 1
118
+ logger.info(
119
+ f"Ramping up: added {workers_to_add} workers (now {self.current_workers}/{self.max_workers})"
120
+ )
121
+ return
122
+
123
+ # Calculate throughput gain per worker added
124
+ current_throughput = self.throughput_history[-1]
125
+ previous_throughput = self.throughput_history[-2]
126
+ previous_workers = self.worker_count_history[-2]
127
+
128
+ throughput_gain = current_throughput - previous_throughput
129
+ workers_added = current_workers - previous_workers
130
+
131
+ logger.debug(
132
+ f"Trend: {previous_throughput:.1f} items/s @ {previous_workers} workers → "
133
+ f"{current_throughput:.1f} items/s @ {current_workers} workers "
134
+ f"(gain: {throughput_gain:.1f}, added: {workers_added})"
135
+ )
136
+
137
+ # If throughput decreased significantly, stop adding workers
138
+ if current_throughput < previous_throughput * 0.95:
139
+ logger.info(
140
+ f"Throughput decreasing ({current_throughput:.1f} vs {previous_throughput:.1f} items/s), "
141
+ f"stopping at {current_workers} workers"
142
+ )
143
+ # If throughput is still increasing or stable, add more workers
144
+ elif current_throughput >= previous_throughput * 0.95 and current_workers < self.max_workers:
145
+ if workers_added > 0 and throughput_gain > 0:
146
+ # Calculate gain per worker
147
+ gain_per_worker = throughput_gain / workers_added
148
+ logger.debug(f"Gain per worker: {gain_per_worker:.2f} items/s")
149
+
150
+ # Estimate how many more workers we could benefit from
151
+ # Assume diminishing returns, so be conservative
152
+ if gain_per_worker > 0.5:
153
+ # Good gain per worker - add more aggressively
154
+ workers_to_add = max(1, int(current_workers * 0.3))
155
+ elif gain_per_worker > 0.2:
156
+ # Moderate gain - add moderately
157
+ workers_to_add = max(1, int(current_workers * 0.2))
158
+ else:
159
+ # Small gain - add conservatively
160
+ workers_to_add = max(1, int(current_workers * 0.1))
161
+
162
+ added = 0
163
+ for i in range(workers_to_add):
164
+ if len(self.workers) < self.max_workers:
165
+ new_worker_id = len(self.workers)
166
+ self._add_worker(new_worker_id)
167
+ self.current_workers += 1
168
+ added += 1
169
+
170
+ logger.info(
171
+ f"Throughput increasing (gain: {gain_per_worker:.2f} items/s per worker), "
172
+ f"added {added} workers (now {self.current_workers}/{self.max_workers})"
173
+ )
174
+ else:
175
+ # Fallback to 20% if we can't calculate gain per worker
176
+ workers_to_add = max(1, int(current_workers * 0.2))
177
+ added = 0
178
+ for i in range(workers_to_add):
179
+ if len(self.workers) < self.max_workers:
180
+ new_worker_id = len(self.workers)
181
+ self._add_worker(new_worker_id)
182
+ self.current_workers += 1
183
+ added += 1
184
+ logger.info(f"Ramping up: added {added} workers (now {self.current_workers}/{self.max_workers})")
185
+ else:
186
+ logger.info(f"At optimal worker count: {current_workers} workers, {current_throughput:.1f} items/s")
187
+
188
+ def shutdown(self, timeout=2):
189
+ """Shutdown the worker pool, terminating workers forcefully (they ignore SIGINT)."""
190
+ logger.info("Shutting down worker pool...")
191
+ self.running = False
192
+
193
+ # Terminate all workers immediately (they ignore SIGINT so we need to be forceful)
194
+ for p in self.workers:
195
+ if p.is_alive():
196
+ p.terminate()
197
+
198
+ # Give them a brief moment to exit
199
+ for p in self.workers:
200
+ p.join(timeout=timeout)
201
+
202
+ logger.info("Worker pool shutdown complete")
@@ -1,136 +0,0 @@
1
- """Adaptive worker pool for parallel processing."""
2
-
3
- import logging
4
- import multiprocessing as mp
5
- import queue
6
- import time
7
- from collections import deque
8
-
9
- logger = logging.getLogger("mapillary_downloader")
10
-
11
-
12
- class AdaptiveWorkerPool:
13
- """Worker pool that scales based on throughput.
14
-
15
- Monitors throughput every 30 seconds and adjusts worker count:
16
- - If throughput increasing: add workers (up to max)
17
- - If throughput plateauing/decreasing: reduce workers
18
- """
19
-
20
- def __init__(self, worker_func, min_workers=4, max_workers=16, monitoring_interval=30):
21
- """Initialize adaptive worker pool.
22
-
23
- Args:
24
- worker_func: Function to run in each worker (must accept work_queue, result_queue)
25
- min_workers: Minimum number of workers
26
- max_workers: Maximum number of workers
27
- monitoring_interval: Seconds between throughput checks
28
- """
29
- self.worker_func = worker_func
30
- self.min_workers = min_workers
31
- self.max_workers = max_workers
32
- self.monitoring_interval = monitoring_interval
33
-
34
- # Queues
35
- self.work_queue = mp.Queue(maxsize=max_workers)
36
- self.result_queue = mp.Queue()
37
-
38
- # Worker management
39
- self.workers = []
40
- self.current_workers = min_workers
41
-
42
- # Throughput monitoring
43
- self.throughput_history = deque(maxlen=5) # Last 5 measurements
44
- self.last_processed = 0
45
- self.last_check_time = time.time()
46
-
47
- self.running = False
48
-
49
- def start(self):
50
- """Start the worker pool."""
51
- self.running = True
52
- logger.info(f"Starting worker pool with {self.current_workers} workers")
53
-
54
- for i in range(self.current_workers):
55
- self._add_worker(i)
56
-
57
- def _add_worker(self, worker_id):
58
- """Add a new worker to the pool."""
59
- p = mp.Process(target=self.worker_func, args=(self.work_queue, self.result_queue, worker_id))
60
- p.start()
61
- self.workers.append(p)
62
- logger.debug(f"Started worker {worker_id}")
63
-
64
- def submit(self, work_item):
65
- """Submit work to the pool (blocks if queue is full)."""
66
- self.work_queue.put(work_item)
67
-
68
- def get_result(self, timeout=None):
69
- """Get a result from the workers.
70
-
71
- Returns:
72
- Result from worker, or None if timeout
73
- """
74
- try:
75
- return self.result_queue.get(timeout=timeout)
76
- except queue.Empty:
77
- return None
78
-
79
- def check_throughput(self, total_processed):
80
- """Check throughput and adjust workers if needed.
81
-
82
- Args:
83
- total_processed: Total number of items processed so far
84
- """
85
- now = time.time()
86
- elapsed = now - self.last_check_time
87
-
88
- if elapsed < self.monitoring_interval:
89
- return
90
-
91
- # Calculate current throughput (items/sec)
92
- items_since_check = total_processed - self.last_processed
93
- throughput = items_since_check / elapsed
94
-
95
- self.throughput_history.append(throughput)
96
- self.last_processed = total_processed
97
- self.last_check_time = now
98
-
99
- # Need at least 3 measurements to detect trends
100
- if len(self.throughput_history) < 3:
101
- return
102
-
103
- # Check if throughput is increasing
104
- recent_avg = sum(list(self.throughput_history)[-2:]) / 2
105
- older_avg = sum(list(self.throughput_history)[-4:-2]) / 2
106
-
107
- if recent_avg > older_avg * 1.1 and len(self.workers) < self.max_workers:
108
- # Throughput increasing by >10%, add workers
109
- new_worker_id = len(self.workers)
110
- self._add_worker(new_worker_id)
111
- self.current_workers += 1
112
- logger.info(f"Throughput increasing ({throughput:.1f} items/s), added worker (now {self.current_workers})")
113
-
114
- elif recent_avg < older_avg * 0.9 and len(self.workers) > self.min_workers:
115
- # Throughput decreasing by >10%, remove worker
116
- # (workers will exit naturally when they finish current work)
117
- self.current_workers = max(self.min_workers, self.current_workers - 1)
118
- logger.info(f"Throughput plateauing ({throughput:.1f} items/s), reducing to {self.current_workers} workers")
119
-
120
- def shutdown(self, timeout=30):
121
- """Shutdown the worker pool gracefully."""
122
- logger.info("Shutting down worker pool...")
123
- self.running = False
124
-
125
- # Send stop signals
126
- for _ in self.workers:
127
- self.work_queue.put(None)
128
-
129
- # Wait for workers to finish
130
- for p in self.workers:
131
- p.join(timeout=timeout)
132
- if p.is_alive():
133
- logger.warning(f"Worker {p.pid} did not exit cleanly, terminating")
134
- p.terminate()
135
-
136
- logger.info("Worker pool shutdown complete")