mapillary-downloader 0.5.0-py3-none-any.whl → 0.5.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mapillary_downloader/downloader.py +14 -28
- mapillary_downloader/metadata_reader.py +41 -1
- mapillary_downloader/worker.py +10 -8
- mapillary_downloader/worker_pool.py +88 -20
- {mapillary_downloader-0.5.0.dist-info → mapillary_downloader-0.5.1.dist-info}/METADATA +1 -1
- {mapillary_downloader-0.5.0.dist-info → mapillary_downloader-0.5.1.dist-info}/RECORD +9 -9
- {mapillary_downloader-0.5.0.dist-info → mapillary_downloader-0.5.1.dist-info}/WHEEL +0 -0
- {mapillary_downloader-0.5.0.dist-info → mapillary_downloader-0.5.1.dist-info}/entry_points.txt +0 -0
- {mapillary_downloader-0.5.0.dist-info → mapillary_downloader-0.5.1.dist-info}/licenses/LICENSE.md +0 -0
mapillary_downloader/downloader.py
CHANGED

@@ -181,22 +181,22 @@ class MapillaryDownloader:
 
         start_time = time.time()
 
-        # Step 1:
-        logger.info("Building seen_ids from metadata...")
+        # Step 1: Check if API fetch is already complete
         reader = MetadataReader(self.metadata_file)
-        seen_ids = reader.get_all_ids()
         api_complete = reader.is_complete
-
+        if api_complete:
+            logger.info("API fetch already complete, will only download")
+        else:
+            logger.info("API fetch incomplete, will fetch and download in parallel")
 
-        # Step 2: Start worker pool
-
-
-        )
+        # Step 2: Start worker pool
+        # Since workers do both I/O (download) and CPU (WebP), need many more workers
+        # Cap at 128 for now - will build proper dynamic scaling on a new branch later
+        pool = AdaptiveWorkerPool(worker_process, min_workers=self.workers, max_workers=128, monitoring_interval=10)
         pool.start()
 
-        # Step 3: Download images from
+        # Step 3: Download images from metadata file while fetching new from API
         downloaded_count = 0
-        skipped = 0
         total_bytes = 0
         failed_count = 0
         submitted = 0
@@ -218,25 +218,18 @@ class MapillaryDownloader:
                 logger.info("API fetch thread: Starting...")
                 with open(self.metadata_file, "a") as meta_f:
                     for image in self.client.get_user_images(self.username, bbox=bbox):
-                        image_id = image["id"]
-
-                        # Skip if we already have this in our metadata file
-                        if image_id in seen_ids:
-                            continue
-
-                        seen_ids.add(image_id)
                         new_images_count[0] += 1
 
-                        # Save
+                        # Save metadata (don't dedupe here, let the tailer handle it)
                         meta_f.write(json.dumps(image) + "\n")
                         meta_f.flush()
 
                         if new_images_count[0] % 1000 == 0:
-                            logger.info(f"API: Fetched {new_images_count[0]}
+                            logger.info(f"API: Fetched {new_images_count[0]} images from API")
 
                 # Mark as complete
                 MetadataReader.mark_complete(self.metadata_file)
-                logger.info(f"API fetch complete: {new_images_count[0]}
+                logger.info(f"API fetch complete: {new_images_count[0]} images")
             finally:
                 api_fetch_complete.set()
 
@@ -438,14 +431,7 @@ class MapillaryDownloader:
         self._save_progress()
         elapsed = time.time() - start_time
 
-
-        total_images = len(seen_ids)
-        skipped = total_images - downloaded_count - failed_count
-
-        logger.info(
-            f"Complete! Total {total_images} images, downloaded {downloaded_count} ({format_size(total_bytes)}), "
-            f"skipped {skipped}, failed {failed_count}"
-        )
+        logger.info(f"Complete! Downloaded {downloaded_count} ({format_size(total_bytes)}), " f"failed {failed_count}")
         logger.info(f"Total time: {format_time(elapsed)}")
 
         # Tar sequence directories for efficient IA uploads
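Note on the downloader change: 0.5.1 drops the up-front seen_ids scan, so the API fetch thread writes every record to the metadata file and leaves deduplication to whatever tails the file on the download side. A minimal sketch of that dedupe-on-read pattern, assuming the "id" field and the "__complete__" marker seen elsewhere in this diff (the tail_metadata helper and its submit callback are hypothetical, not part of the package):

    import json

    def tail_metadata(path, submit):
        """Hypothetical tailer: stream a JSONL metadata file, dedupe by id,
        and hand each unseen image to `submit` (e.g. a work-queue put)."""
        seen = set()
        with open(path) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                record = json.loads(line)
                if record.get("__complete__"):  # completion marker written by mark_complete()
                    break
                image_id = record["id"]
                if image_id in seen:  # duplicates possible since the fetcher no longer dedupes
                    continue
                seen.add(image_id)
                submit(record)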
mapillary_downloader/metadata_reader.py
CHANGED

@@ -23,7 +23,47 @@ class MetadataReader:
             metadata_file: Path to metadata.jsonl or metadata.jsonl.gz
         """
         self.metadata_file = Path(metadata_file)
-        self.is_complete =
+        self.is_complete = self._check_complete()
+
+    def _check_complete(self):
+        """Check if metadata file has completion marker.
+
+        Returns:
+            True if completion marker found, False otherwise
+        """
+        if not self.metadata_file.exists():
+            return False
+
+        # Check last few lines for completion marker (it should be at the end)
+        try:
+            if self.metadata_file.suffix == ".gz":
+                file_handle = gzip.open(self.metadata_file, "rt")
+            else:
+                file_handle = open(self.metadata_file)
+
+            with file_handle as f:
+                # Read last 10 lines to find completion marker
+                lines = []
+                for line in f:
+                    lines.append(line)
+                    if len(lines) > 10:
+                        lines.pop(0)
+
+                # Check if any of the last lines is the completion marker
+                for line in reversed(lines):
+                    line = line.strip()
+                    if not line:
+                        continue
+                    try:
+                        data = json.loads(line)
+                        if data.get("__complete__"):
+                            return True
+                    except json.JSONDecodeError:
+                        continue
+
+                return False
+        except Exception:
+            return False
 
     def iter_images(self, quality_field=None, downloaded_ids=None):
         """Stream images from metadata file with filtering.
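The is_complete check round-trips with MetadataReader.mark_complete(), which downloader.py calls at the end of the API fetch. A small usage sketch, assuming mark_complete appends a {"__complete__": true} JSON line to the file, which is consistent with what _check_complete looks for (the body of mark_complete is not shown in this diff):

    import json
    from mapillary_downloader.metadata_reader import MetadataReader

    # Append a completion marker, as mark_complete() presumably does
    with open("metadata.jsonl", "a") as f:
        f.write(json.dumps({"__complete__": True}) + "\n")

    reader = MetadataReader("metadata.jsonl")
    assert reader.is_complete  # _check_complete() scans the last 10 lines for the marker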
mapillary_downloader/worker.py
CHANGED
@@ -17,6 +17,9 @@ def worker_process(work_queue, result_queue, worker_id):
         result_queue: Queue to push results to
         worker_id: Unique worker identifier
     """
+    # Create session once per worker (reuse HTTP connections)
+    session = requests.Session()
+
     while True:
         work_item = work_queue.get()
 
@@ -27,14 +30,17 @@ def worker_process(work_queue, result_queue, worker_id):
         # Unpack work item
         image_data, output_dir, quality, convert_webp, access_token = work_item
 
+        # Update session auth for this request
+        session.headers.update({"Authorization": f"OAuth {access_token}"})
+
         # Process the image
-        result = download_and_convert_image(image_data, output_dir, quality, convert_webp,
+        result = download_and_convert_image(image_data, output_dir, quality, convert_webp, session)
 
         # Push result back
         result_queue.put(result)
 
 
-def download_and_convert_image(image_data, output_dir, quality, convert_webp,
+def download_and_convert_image(image_data, output_dir, quality, convert_webp, session):
     """Download and optionally convert a single image.
 
     This function is designed to run in a worker process.
@@ -44,7 +50,7 @@ def download_and_convert_image(image_data, output_dir, quality, convert_webp, ac
         output_dir: Base output directory path
         quality: Quality level (256, 1024, 2048, original)
         convert_webp: Whether to convert to WebP
-
+        session: requests.Session with auth already configured
 
     Returns:
         Tuple of (image_id, bytes_downloaded, success, error_msg)
@@ -78,11 +84,7 @@ def download_and_convert_image(image_data, output_dir, quality, convert_webp, ac
     jpg_path = img_dir / f"{image_id}.jpg"
     final_path = jpg_path
 
-    # Download image
-    # No retries for CDN images - they're cheap, just skip failures and move on
-    session = requests.Session()
-    session.headers.update({"Authorization": f"OAuth {access_token}"})
-
+    # Download image (using session passed from worker)
    bytes_downloaded = 0
 
    try:
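The worker.py change hoists the requests.Session out of the per-image function: a session pools connections, so repeated downloads from the same CDN host reuse one TCP/TLS connection instead of handshaking for every image. A standalone sketch of the pattern (the URLs and token are placeholders, not values from the package):

    import requests

    session = requests.Session()  # created once per worker process
    session.headers.update({"Authorization": "OAuth <access_token>"})

    for url in ("https://example.com/img/1.jpg", "https://example.com/img/2.jpg"):
        # Keep-alive: the pooled connection is reused across iterations
        resp = session.get(url, timeout=30)
        resp.raise_for_status()
        data = resp.content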
mapillary_downloader/worker_pool.py
CHANGED

@@ -17,7 +17,7 @@ class AdaptiveWorkerPool:
     - If throughput plateauing/decreasing: reduce workers
     """
 
-    def __init__(self, worker_func, min_workers=4, max_workers=16, monitoring_interval=
+    def __init__(self, worker_func, min_workers=4, max_workers=16, monitoring_interval=10):
         """Initialize adaptive worker pool.
 
         Args:
@@ -37,10 +37,11 @@ class AdaptiveWorkerPool:
 
         # Worker management
         self.workers = []
-        self.current_workers = min_workers
+        self.current_workers = min_workers  # Start small and ramp up
 
         # Throughput monitoring
         self.throughput_history = deque(maxlen=5)  # Last 5 measurements
+        self.worker_count_history = deque(maxlen=5)  # Track worker counts at each measurement
         self.last_processed = 0
         self.last_check_time = time.time()
 
@@ -86,36 +87,103 @@ class AdaptiveWorkerPool:
         elapsed = now - self.last_check_time
 
         if elapsed < self.monitoring_interval:
+            logger.debug(f"Throughput check skipped (elapsed {elapsed:.1f}s < {self.monitoring_interval}s)")
             return
 
         # Calculate current throughput (items/sec)
         items_since_check = total_processed - self.last_processed
         throughput = items_since_check / elapsed
 
+        current_workers = len(self.workers)
         self.throughput_history.append(throughput)
+        self.worker_count_history.append(current_workers)
         self.last_processed = total_processed
         self.last_check_time = now
 
-
-
+        logger.info(
+            f"Throughput: {throughput:.1f} items/s (workers: {current_workers}/{self.max_workers}, "
+            f"history: {len(self.throughput_history)} measurements)"
+        )
+
+        # Need at least 2 measurements to calculate gain per worker
+        if len(self.throughput_history) < 2:
+            # First measurement - add 20% more workers
+            if current_workers < self.max_workers:
+                workers_to_add = max(1, int(current_workers * 0.2))
+                for i in range(workers_to_add):
+                    if len(self.workers) < self.max_workers:
+                        new_worker_id = len(self.workers)
+                        self._add_worker(new_worker_id)
+                        self.current_workers += 1
+                logger.info(
+                    f"Ramping up: added {workers_to_add} workers (now {self.current_workers}/{self.max_workers})"
+                )
            return
 
-        #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Calculate throughput gain per worker added
+        current_throughput = self.throughput_history[-1]
+        previous_throughput = self.throughput_history[-2]
+        previous_workers = self.worker_count_history[-2]
+
+        throughput_gain = current_throughput - previous_throughput
+        workers_added = current_workers - previous_workers
+
+        logger.debug(
+            f"Trend: {previous_throughput:.1f} items/s @ {previous_workers} workers → "
+            f"{current_throughput:.1f} items/s @ {current_workers} workers "
+            f"(gain: {throughput_gain:.1f}, added: {workers_added})"
+        )
+
+        # If throughput decreased significantly, stop adding workers
+        if current_throughput < previous_throughput * 0.95:
+            logger.info(
+                f"Throughput decreasing ({current_throughput:.1f} vs {previous_throughput:.1f} items/s), "
+                f"stopping at {current_workers} workers"
+            )
+        # If throughput is still increasing or stable, add more workers
+        elif current_throughput >= previous_throughput * 0.95 and current_workers < self.max_workers:
+            if workers_added > 0 and throughput_gain > 0:
+                # Calculate gain per worker
+                gain_per_worker = throughput_gain / workers_added
+                logger.debug(f"Gain per worker: {gain_per_worker:.2f} items/s")
+
+                # Estimate how many more workers we could benefit from
+                # Assume diminishing returns, so be conservative
+                if gain_per_worker > 0.5:
+                    # Good gain per worker - add more aggressively
+                    workers_to_add = max(1, int(current_workers * 0.3))
+                elif gain_per_worker > 0.2:
+                    # Moderate gain - add moderately
+                    workers_to_add = max(1, int(current_workers * 0.2))
+                else:
+                    # Small gain - add conservatively
+                    workers_to_add = max(1, int(current_workers * 0.1))
+
+                added = 0
+                for i in range(workers_to_add):
+                    if len(self.workers) < self.max_workers:
+                        new_worker_id = len(self.workers)
+                        self._add_worker(new_worker_id)
+                        self.current_workers += 1
+                        added += 1
+
+                logger.info(
+                    f"Throughput increasing (gain: {gain_per_worker:.2f} items/s per worker), "
+                    f"added {added} workers (now {self.current_workers}/{self.max_workers})"
+                )
+            else:
+                # Fallback to 20% if we can't calculate gain per worker
+                workers_to_add = max(1, int(current_workers * 0.2))
+                added = 0
+                for i in range(workers_to_add):
+                    if len(self.workers) < self.max_workers:
+                        new_worker_id = len(self.workers)
+                        self._add_worker(new_worker_id)
+                        self.current_workers += 1
+                        added += 1
+                logger.info(f"Ramping up: added {added} workers (now {self.current_workers}/{self.max_workers})")
+        else:
+            logger.info(f"At optimal worker count: {current_workers} workers, {current_throughput:.1f} items/s")
 
     def shutdown(self, timeout=30):
         """Shutdown the worker pool gracefully."""
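The ramp-up logic added above reduces to a small heuristic: compare the last two throughput samples, stop if throughput fell more than 5%, and otherwise size the next step by the observed gain per added worker. Restated as a pure function for clarity (a paraphrase of the code above, not an API exported by the package):

    def next_ramp_step(curr_tp, prev_tp, curr_workers, prev_workers, max_workers):
        """How many workers to add at the next monitoring interval."""
        if curr_workers >= max_workers or curr_tp < prev_tp * 0.95:
            return 0  # at the cap, or throughput dropped >5%: stop adding
        added = curr_workers - prev_workers
        gain = curr_tp - prev_tp
        if added > 0 and gain > 0:
            gain_per_worker = gain / added
            if gain_per_worker > 0.5:
                factor = 0.3   # good gain per worker: ramp aggressively
            elif gain_per_worker > 0.2:
                factor = 0.2   # moderate gain
            else:
                factor = 0.1   # small gain: be conservative
        else:
            factor = 0.2       # fallback when gain per worker can't be computed
        return max(1, int(curr_workers * factor))

    # e.g. 24 → 32 workers lifted throughput 50 → 60 items/s: gain 1.25/worker → add 9 more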
{mapillary_downloader-0.5.0.dist-info → mapillary_downloader-0.5.1.dist-info}/RECORD
RENAMED

@@ -1,19 +1,19 @@
 mapillary_downloader/__init__.py,sha256=KEjiBRghXDeA7E15RJeLBfQm-yNJkowZarL59QOh_1w,120
 mapillary_downloader/__main__.py,sha256=Kjfx2woMyCvAxYAdqvtXtYJknCMviV_K2PSo0cDc8Hg,4320
 mapillary_downloader/client.py,sha256=a5n43FLHP45EHodEjl0ieziBK-b6Ey-rZJwYB6EFhNI,4743
-mapillary_downloader/downloader.py,sha256=
+mapillary_downloader/downloader.py,sha256=v0vLovW80DMpEzQIb_tdETF1HzO2GEMbbVzD93aKnnQ,19561
 mapillary_downloader/exif_writer.py,sha256=K_441EG1siWyNMmFGZSfnORUCjBThkeg4JFtbg9AOsA,5120
 mapillary_downloader/ia_check.py,sha256=L2MEbG_KmlAd5NLmo2HQkO8HWvRN0brE5wXXoyNMbq8,1100
 mapillary_downloader/ia_meta.py,sha256=78rcybHIPnQDsF02KGj6RYmDXzYzrU8sdVx4Q9Y0sfI,6266
 mapillary_downloader/logging_config.py,sha256=Z-wNq34nt7aIhJWdeKc1feTY46P9-Or7HtiX7eUFjEI,2324
-mapillary_downloader/metadata_reader.py,sha256
+mapillary_downloader/metadata_reader.py,sha256=Re-HN0Vfc7Hs1eOut7uOoW7jWJ2PIbKoNzC7Ak3ah5o,4933
 mapillary_downloader/tar_sequences.py,sha256=mqs5p3N7osV_bxTkw6i34GVmxCBBEbIiKKxeh-fWNdU,4430
 mapillary_downloader/utils.py,sha256=yzVgS1mwsklDAqrimaFafgTTXtRYQUbKP98Xgh9d2KA,1174
 mapillary_downloader/webp_converter.py,sha256=vYLLQxDmdnqRz0nm7wXwRUd4x9mQZNah-DrncpA8sNs,1901
-mapillary_downloader/worker.py,sha256=
-mapillary_downloader/worker_pool.py,sha256=
-mapillary_downloader-0.5.
-mapillary_downloader-0.5.
-mapillary_downloader-0.5.
-mapillary_downloader-0.5.
-mapillary_downloader-0.5.
+mapillary_downloader/worker.py,sha256=syxsE2pPX_9EXzSGEGeUaeLFqmqZjTma-rB7S2zpYac,4511
+mapillary_downloader/worker_pool.py,sha256=ctFl40UgFUjpLL_e6Mw5h7YNMfKwXulRhaX18r9sIkE,8257
+mapillary_downloader-0.5.1.dist-info/entry_points.txt,sha256=PdYtxOXHMJrUhmiPO4G-F98VuhUI4MN9D_T4KPrVZ5w,75
+mapillary_downloader-0.5.1.dist-info/licenses/LICENSE.md,sha256=7_BIuQ-veOrsF-WarH8kTkm0-xrCLvJ1PFE1C4Ebs64,146
+mapillary_downloader-0.5.1.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
+mapillary_downloader-0.5.1.dist-info/METADATA,sha256=1vM80Kq2WHUzfSt7lIx91hb0fycHosxpll-xnCda6JU,4982
+mapillary_downloader-0.5.1.dist-info/RECORD,,

{mapillary_downloader-0.5.0.dist-info → mapillary_downloader-0.5.1.dist-info}/WHEEL
RENAMED
File without changes

{mapillary_downloader-0.5.0.dist-info → mapillary_downloader-0.5.1.dist-info}/entry_points.txt
RENAMED
File without changes

{mapillary_downloader-0.5.0.dist-info → mapillary_downloader-0.5.1.dist-info}/licenses/LICENSE.md
RENAMED
File without changes