mapillary-downloader 0.5.2__tar.gz → 0.6.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/PKG-INFO +20 -8
- {mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/README.md +18 -6
- {mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/pyproject.toml +2 -2
- {mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/src/mapillary_downloader/__main__.py +18 -1
- {mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/src/mapillary_downloader/downloader.py +3 -8
- mapillary_downloader-0.6.1/src/mapillary_downloader/ia_stats.py +242 -0
- {mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/src/mapillary_downloader/tar_sequences.py +27 -33
- mapillary_downloader-0.6.1/src/mapillary_downloader/utils.py +108 -0
- {mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/src/mapillary_downloader/worker.py +9 -8
- mapillary_downloader-0.5.2/src/mapillary_downloader/utils.py +0 -47
- {mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/LICENSE.md +0 -0
- {mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/src/mapillary_downloader/__init__.py +0 -0
- {mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/src/mapillary_downloader/client.py +0 -0
- {mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/src/mapillary_downloader/exif_writer.py +0 -0
- {mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/src/mapillary_downloader/ia_check.py +0 -0
- {mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/src/mapillary_downloader/ia_meta.py +0 -0
- {mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/src/mapillary_downloader/logging_config.py +0 -0
- {mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/src/mapillary_downloader/metadata_reader.py +0 -0
- {mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/src/mapillary_downloader/webp_converter.py +0 -0
- {mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/src/mapillary_downloader/worker_pool.py +0 -0
{mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/PKG-INFO
RENAMED
@@ -1,7 +1,7 @@
 Metadata-Version: 2.4
 Name: mapillary_downloader
-Version: 0.5.2
-Summary:
+Version: 0.6.1
+Summary: Archive user data from Mapillary
 Author-email: Gareth Davidson <gaz@bitplane.net>
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
@@ -66,8 +66,8 @@ mapillary-downloader --output ./downloads USERNAME1
 | `--quality` | 256, 1024, 2048 or original | `original` |
 | `--bbox` | `west,south,east,north` | `None` |
 | `--no-webp` | Don't convert to WebP | `False` |
-| `--workers`
-| `--no-tar` | Don't tar
+| `--max-workers` | Maximum number of parallel download workers | `128` |
+| `--no-tar` | Don't tar bucket directories | `False` |
 | `--no-check-ia` | Don't check if exists on Internet Archive | `False` |

 The downloader will:
@@ -98,11 +98,23 @@ To disable WebP conversion and keep original JPEGs, use `--no-webp`:
 mapillary-downloader --no-webp USERNAME
 ```

-##
+## Tarballs

-
-
-
+Images are organized by sequence ID, bucketed by the first character of the
+sequence to reduce directory count:
+
+```
+mapillary-username-quality/
+  a/
+    abc123/
+      image1.webp
+      image2.webp
+```
+
+By default, these bucket directories are automatically tarred after download
+(resulting in `a.tar`, `b.tar`, etc. - about 62 tar files total). This is done
+because large collections with millions of images would otherwise create hundreds
+of thousands of tiny tars, and anger the archive gods.

 To keep individual files instead of creating tars, use the `--no-tar` flag.

{mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/README.md
RENAMED
@@ -36,8 +36,8 @@ mapillary-downloader --output ./downloads USERNAME1
 | `--quality` | 256, 1024, 2048 or original | `original` |
 | `--bbox` | `west,south,east,north` | `None` |
 | `--no-webp` | Don't convert to WebP | `False` |
-| `--workers`
-| `--no-tar` | Don't tar
+| `--max-workers` | Maximum number of parallel download workers | `128` |
+| `--no-tar` | Don't tar bucket directories | `False` |
 | `--no-check-ia` | Don't check if exists on Internet Archive | `False` |

 The downloader will:
@@ -68,11 +68,23 @@ To disable WebP conversion and keep original JPEGs, use `--no-webp`:
 mapillary-downloader --no-webp USERNAME
 ```

-##
+## Tarballs

-
-
-
+Images are organized by sequence ID, bucketed by the first character of the
+sequence to reduce directory count:
+
+```
+mapillary-username-quality/
+  a/
+    abc123/
+      image1.webp
+      image2.webp
+```
+
+By default, these bucket directories are automatically tarred after download
+(resulting in `a.tar`, `b.tar`, etc. - about 62 tar files total). This is done
+because large collections with millions of images would otherwise create hundreds
+of thousands of tiny tars, and anger the archive gods.

 To keep individual files instead of creating tars, use the `--no-tar` flag.

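For readers skimming the diff, here is a minimal sketch of the bucketing scheme the new README text describes. The helper name `bucket_path_for` and the example IDs are illustrative only, not part of the package; the layout itself mirrors what `worker.py` does below (bucket by the first character of the sequence ID).

```python
from pathlib import Path


def bucket_path_for(collection_dir: Path, sequence_id: str, image_id: str) -> Path:
    """Illustrative only: first char of the sequence ID is the bucket,
    then one directory per sequence, then one file per image."""
    return collection_dir / sequence_id[0] / sequence_id / f"{image_id}.webp"


# e.g. sequence "abc123", image "42" lands in:
#   mapillary-username-quality/a/abc123/42.webp
# and after the post-download tarring step the whole "a/" bucket becomes "a.tar".
print(bucket_path_for(Path("mapillary-username-quality"), "abc123", "42"))
```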
{mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/pyproject.toml
RENAMED
@@ -1,7 +1,7 @@
 [project]
 name = "mapillary_downloader"
-description = "
-version = "0.5.2"
+description = "Archive user data from Mapillary"
+version = "0.6.1"
 authors = [
     { name = "Gareth Davidson", email = "gaz@bitplane.net" }
 ]
{mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/src/mapillary_downloader/__main__.py
RENAMED
@@ -26,7 +26,7 @@ def main():
         default=os.environ.get("MAPILLARY_TOKEN"),
         help="Mapillary API access token (or set MAPILLARY_TOKEN env var)",
     )
-    parser.add_argument("usernames", nargs="
+    parser.add_argument("usernames", nargs="*", help="Mapillary username(s) to download")
     parser.add_argument("--output", default="./mapillary_data", help="Output directory (default: ./mapillary_data)")
     parser.add_argument(
         "--quality",
@@ -61,9 +61,21 @@ def main():
         action="store_true",
         help="Enable debug logging (EXIF data, API responses, etc.)",
     )
+    parser.add_argument(
+        "--stats",
+        action="store_true",
+        help="Show statistics of collections on archive.org and exit",
+    )

     args = parser.parse_args()

+    # Handle --stats early (before token check)
+    if args.stats:
+        from mapillary_downloader.ia_stats import show_stats
+
+        show_stats()
+        sys.exit(0)
+
     # Set debug logging level if requested
     if args.debug:
         import logging
@@ -71,6 +83,11 @@ def main():
         logging.getLogger("mapillary_downloader").setLevel(logging.DEBUG)
         logger.debug("Debug logging enabled")

+    # Check for usernames (required unless using --stats)
+    if not args.usernames:
+        logger.error("Error: At least one username is required")
+        sys.exit(1)
+
     # Check for token
     if not args.token:
         logger.error("Error: Mapillary API token required. Use --token or set MAPILLARY_TOKEN environment variable")
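The `--stats` wiring above amounts to an early exit before any token handling. A rough equivalent outside the CLI, based only on the code shown in this diff, is:

```python
# Sketch of what `mapillary-downloader --stats` does, per the __main__.py diff:
# fetch collection data from archive.org, cache it, and print a summary,
# without requiring a MAPILLARY_TOKEN.
from mapillary_downloader.ia_stats import show_stats

show_stats()  # prints the aggregated archive.org statistics and returns
```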
{mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/src/mapillary_downloader/downloader.py
RENAMED
@@ -7,7 +7,7 @@ import os
 import shutil
 import time
 from pathlib import Path
-from mapillary_downloader.utils import format_size, format_time
+from mapillary_downloader.utils import format_size, format_time, safe_json_save
 from mapillary_downloader.ia_meta import generate_ia_metadata
 from mapillary_downloader.ia_check import check_ia_exists
 from mapillary_downloader.worker import worker_process
@@ -143,13 +143,8 @@ class MapillaryDownloader:
         # Update this quality's progress
         progress[str(self.quality)] = list(self.downloaded)

-        # Write atomically
-
-        with open(temp_file, "w") as f:
-            json.dump(progress, f)
-            f.flush()
-            os.fsync(f.fileno())
-        temp_file.replace(self.progress_file)
+        # Write atomically using utility function
+        safe_json_save(self.progress_file, progress)

     def download_user_data(self, bbox=None, convert_webp=False):
         """Download all images for a user using streaming queue-based architecture.
mapillary_downloader-0.6.1/src/mapillary_downloader/ia_stats.py
ADDED
@@ -0,0 +1,242 @@
+"""Internet Archive statistics for mapillary_downloader collections."""
+
+import json
+import logging
+import re
+from mapillary_downloader.utils import safe_json_save, http_get_with_retry, format_size
+from mapillary_downloader.downloader import get_cache_dir
+
+logger = logging.getLogger("mapillary_downloader")
+
+CACHE_FILE = get_cache_dir() / ".stats.json"
+
+
+def search_ia_collections():
+    """Search IA for all mapillary_downloader collections.
+
+    Returns:
+        List of dicts with: identifier, description, item_size, uploader
+    """
+    logger.info("Searching archive.org for mapillary_downloader collections...")
+
+    url = "https://archive.org/advancedsearch.php"
+    params = {
+        "q": "mapillary_downloader:*",
+        "fl[]": ["identifier", "description", "item_size", "uploader"],
+        "rows": 10000,
+        "output": "json",
+    }
+
+    response = http_get_with_retry(url, params=params, max_retries=3)
+    data = response.json()
+
+    collections = data["response"]["docs"]
+    logger.info(f"Found {len(collections)} collections on archive.org")
+
+    return collections
+
+
+def parse_collection_info(identifier):
+    """Parse username, quality, webp from collection identifier.
+
+    Returns:
+        dict with username, quality, is_webp or None if invalid
+    """
+    match = re.match(r"mapillary-(.+)-(256|1024|2048|original)(?:-webp)?$", identifier)
+    if match:
+        return {"username": match.group(1), "quality": match.group(2), "is_webp": "-webp" in identifier}
+    return None
+
+
+def extract_image_count(description):
+    """Extract image count from IA description field.
+
+    Description format: "Contains 12,345 images in..."
+    """
+    if not description:
+        return None
+
+    match = re.search(r"Contains ([\d,]+) images", description)
+    if match:
+        return int(match.group(1).replace(",", ""))
+    return None
+
+
+def load_cache():
+    """Load cached collection data.
+
+    Returns:
+        dict of {collection_id: {size, uploader, images, quality, username}}
+    """
+    if CACHE_FILE.exists():
+        try:
+            with open(CACHE_FILE) as f:
+                return json.load(f)
+        except Exception as e:
+            logger.warning(f"Failed to load cache: {e}")
+    return {}
+
+
+def update_cache(ia_collections):
+    """Update cache with new IA search results.
+
+    Merges new collections into existing cache.
+
+    Returns:
+        Updated cache dict
+    """
+    cache = load_cache()
+
+    for item in ia_collections:
+        identifier = item.get("identifier")
+        if not identifier:
+            continue
+
+        info = parse_collection_info(identifier)
+        if not info:
+            logger.debug(f"Skipping non-mapillary collection: {identifier}")
+            continue
+
+        # Parse item data
+        size_bytes = item.get("item_size", 0)
+        if isinstance(size_bytes, str):
+            size_bytes = int(size_bytes)
+
+        image_count = extract_image_count(item.get("description"))
+
+        # Update cache entry
+        cache[identifier] = {
+            "size": size_bytes,
+            "uploader": item.get("uploader"),
+            "images": image_count,
+            "quality": info["quality"],
+            "username": info["username"],
+            "is_webp": info["is_webp"],
+        }
+
+    # Save updated cache
+    safe_json_save(CACHE_FILE, cache)
+    logger.info(f"Updated cache with {len(cache)} collections")
+
+    return cache
+
+
+def aggregate_stats(cache):
+    """Aggregate statistics from cached collection data.
+
+    Returns:
+        dict with total and per-quality stats
+    """
+    stats = {
+        "total": {"collections": 0, "total_images": 0, "unique_images": 0, "bytes": 0},
+        "by_quality": {},
+        "users": set(),
+    }
+
+    # Track images per user for deduplication
+    user_images = {}  # {username: max_images_across_qualities}
+
+    for collection_id, data in cache.items():
+        images = data.get("images") or 0
+        size = data.get("size") or 0
+        quality = data.get("quality", "unknown")
+        username = data.get("username")
+
+        # Track user coverage
+        if username:
+            stats["users"].add(username)
+            # Keep maximum image count across all qualities for this user
+            if username not in user_images or images > user_images[username]:
+                user_images[username] = images
+
+        # Total stats (collections, total images, and bytes)
+        stats["total"]["collections"] += 1
+        stats["total"]["total_images"] += images
+        stats["total"]["bytes"] += size
+
+        # Per-quality stats
+        if quality not in stats["by_quality"]:
+            stats["by_quality"][quality] = {"collections": 0, "images": 0, "bytes": 0}
+
+        stats["by_quality"][quality]["collections"] += 1
+        stats["by_quality"][quality]["images"] += images
+        stats["by_quality"][quality]["bytes"] += size
+
+    # Unique images is sum of max images per user
+    stats["total"]["unique_images"] = sum(user_images.values())
+
+    return stats
+
+
+def format_stats(stats):
+    """Format statistics as human-readable text.
+
+    Args:
+        stats: Dict from aggregate_stats()
+
+    Returns:
+        Formatted string
+    """
+    TOTAL_MAPILLARY_IMAGES = 2_000_000_000  # 2 billion
+
+    output = []
+    output.append("=" * 70)
+    output.append("Mapillary Downloader - Archive.org Statistics")
+    output.append("=" * 70)
+    output.append("")
+
+    # Total stats
+    total = stats["total"]
+    unique_pct = (total["unique_images"] / TOTAL_MAPILLARY_IMAGES * 100) if total["unique_images"] else 0
+
+    output.append(f"Total Collections: {total['collections']:,}")
+    output.append(f"Total Users: {len(stats['users']):,}")
+    output.append(f"Total Images: {total['total_images']:,}")
+    output.append(f"Unique Images: {total['unique_images']:,} ({unique_pct:.3f}% of 2B)")
+    output.append(f"Total Size: {format_size(total['bytes'])}")
+    output.append("")
+
+    # Per-quality breakdown
+    output.append("By Quality:")
+    output.append("-" * 70)
+
+    # Sort by quality (original first, then numeric)
+    qualities = sorted(stats["by_quality"].items(), key=lambda x: (x[0] != "original", x[0]))
+
+    for quality, data in qualities:
+        pct = (data["images"] / TOTAL_MAPILLARY_IMAGES * 100) if data["images"] else 0
+        output.append(
+            f"  {quality:8s} {data['collections']:3d} collections "
+            f"{data['images']:12,d} images ({pct:.3f}%) "
+            f"{format_size(data['bytes']):>8s}"
+        )
+
+    output.append("")
+    output.append(f"Cache: {CACHE_FILE}")
+
+    return "\n".join(output)
+
+
+def show_stats(refresh=True):
+    """Show archive.org statistics for mapillary_downloader collections.
+
+    Args:
+        refresh: If True, fetch fresh data from IA. If False, use cache only.
+    """
+    if refresh:
+        try:
+            ia_collections = search_ia_collections()
+            cache = update_cache(ia_collections)
+        except Exception as e:
+            logger.error(f"Failed to fetch IA data: {e}")
+            logger.info("Using cached data...")
+            cache = load_cache()
+    else:
+        cache = load_cache()

+    if not cache:
+        logger.error("No cached data and failed to fetch from IA")
+        return
+
+    stats = aggregate_stats(cache)
+    print(format_stats(stats))
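Based on the regexes in `parse_collection_info` and `extract_image_count` above, identifiers and descriptions are expected to parse like this (the example identifier, username, and count are made up for illustration):

```python
from mapillary_downloader.ia_stats import parse_collection_info, extract_image_count

# Identifier pattern: mapillary-<username>-<quality>[-webp]
info = parse_collection_info("mapillary-alice-2048-webp")
# -> {"username": "alice", "quality": "2048", "is_webp": True}

# Non-matching identifiers return None and are skipped by update_cache()
assert parse_collection_info("some-other-item") is None

# Image counts come from the free-text description field
count = extract_image_count("Contains 12,345 images in 2048px quality")
# -> 12345
```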
{mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/src/mapillary_downloader/tar_sequences.py
RENAMED
@@ -23,51 +23,46 @@ def tar_sequence_directories(collection_dir):
         logger.error(f"Collection directory not found: {collection_dir}")
         return 0, 0

-    # Find all
+    # Find all bucket directories (skip special dirs)
+    # Now we tar entire bucket dirs (e.g., a/, b/, etc) to get ~62 tar files
     skip_dirs = {".meta", "__pycache__"}
-
+    bucket_dirs = []

     for item in collection_dir.iterdir():
         if item.is_dir() and item.name not in skip_dirs:
-
+            # Check if this is a bucket dir (single char)
+            if len(item.name) == 1:
+                bucket_dirs.append(item)

-    if not
-        logger.info("No
+    if not bucket_dirs:
+        logger.info("No bucket directories to tar")
         return 0, 0

-
+    # Sort bucket directories alphabetically for consistent progress tracking
+    bucket_dirs = sorted(bucket_dirs, key=lambda x: x.name)
+
+    logger.info(f"Tarring {len(bucket_dirs)} bucket directories...")

     tarred_count = 0
     total_files = 0
     total_tar_bytes = 0

-    for
-
-        tar_path = collection_dir / f"{
-
-        # Handle naming collision - find next available name
-        counter = 1
-        while tar_path.exists():
-            counter += 1
-            tar_path = collection_dir / f"{seq_name}.{counter}.tar"
+    for bucket_dir in bucket_dirs:
+        bucket_name = bucket_dir.name
+        tar_path = collection_dir / f"{bucket_name}.tar"

-        # Count files in
-
-        file_count = len(
+        # Count files in bucket
+        files_to_tar = sorted([f for f in bucket_dir.rglob("*") if f.is_file()], key=lambda x: str(x))
+        file_count = len(files_to_tar)

         if file_count == 0:
-            logger.warning(f"Skipping empty directory: {
+            logger.warning(f"Skipping empty bucket directory: {bucket_name}")
             continue

         try:
-
-            # Sort files by name for deterministic ordering
-            files_to_tar = sorted([f for f in seq_dir.rglob("*") if f.is_file()], key=lambda x: x.name)
-
-            if not files_to_tar:
-                logger.warning(f"Skipping directory with no files: {seq_name}")
-                continue
+            logger.info(f"Tarring bucket '{bucket_name}' ({file_count} files)...")

+            # Create reproducible uncompressed tar (WebP already compressed)
             with tarfile.open(tar_path, "w") as tar:
                 for file_path in files_to_tar:
                     # Get path relative to collection_dir for tar archive
@@ -92,33 +87,32 @@ def tar_sequence_directories(collection_dir):
                 tar_size = tar_path.stat().st_size
                 total_tar_bytes += tar_size

-                # Remove original directory
-                for file in
+                # Remove original bucket directory
+                for file in bucket_dir.rglob("*"):
                     if file.is_file():
                         file.unlink()

                 # Remove empty subdirs and main dir
-                for subdir in list(
+                for subdir in list(bucket_dir.rglob("*")):
                     if subdir.is_dir():
                         try:
                             subdir.rmdir()
                         except OSError:
                             pass  # Not empty yet

-
+                bucket_dir.rmdir()

                 tarred_count += 1
                 total_files += file_count

-
-                logger.info(f"Tarred {tarred_count}/{len(sequence_dirs)} sequences...")
+                logger.info(f"Tarred bucket '{bucket_name}': {file_count:,} files, {format_size(tar_size)}")
             else:
                 logger.error(f"Tar file empty or not created: {tar_path}")
                 if tar_path.exists():
                     tar_path.unlink()

         except Exception as e:
-            logger.error(f"Error tarring {
+            logger.error(f"Error tarring bucket {bucket_name}: {e}")
             if tar_path.exists():
                 tar_path.unlink()

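Since the tars store paths relative to the collection directory, the archives can be listed or unpacked back into the `a/<sequence>/` layout with the standard library alone. A consumer-side sketch (the collection path here is a made-up example, not one the tool creates for you):

```python
import tarfile
from pathlib import Path

# Illustrative only: list and optionally extract the per-bucket tars produced
# by tar_sequence_directories(). Adjust the path to your own output directory.
collection = Path("mapillary_data/mapillary-username-original")

for tar_path in sorted(collection.glob("*.tar")):
    with tarfile.open(tar_path) as tar:
        members = tar.getnames()  # e.g. ["a/abc123/image1.webp", ...]
        print(tar_path.name, len(members), "files")
        # tar.extractall(collection)  # restores the a/<sequence>/ layout
```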
mapillary_downloader-0.6.1/src/mapillary_downloader/utils.py
ADDED
@@ -0,0 +1,108 @@
+"""Utility functions for formatting and display."""
+
+import json
+import logging
+import os
+import time
+from pathlib import Path
+import requests
+from requests.exceptions import RequestException
+
+logger = logging.getLogger("mapillary_downloader")
+
+
+def format_size(bytes_count):
+    """Format bytes as human-readable size.
+
+    Args:
+        bytes_count: Number of bytes
+
+    Returns:
+        Formatted string (e.g. "1.23 GB", "456.78 MB")
+    """
+    if bytes_count >= 1_000_000_000:
+        return f"{bytes_count / 1_000_000_000:.2f} GB"
+    if bytes_count >= 1_000_000:
+        return f"{bytes_count / 1_000_000:.2f} MB"
+    if bytes_count >= 1_000:
+        return f"{bytes_count / 1000:.2f} KB"
+    return f"{bytes_count} B"
+
+
+def format_time(seconds):
+    """Format seconds as human-readable time.
+
+    Args:
+        seconds: Number of seconds
+
+    Returns:
+        Formatted string (e.g. "2h 15m", "45m 30s", "30s")
+    """
+    if seconds < 60:
+        return f"{int(seconds)}s"
+
+    minutes = int(seconds / 60)
+    remaining_seconds = int(seconds % 60)
+
+    if minutes < 60:
+        if remaining_seconds > 0:
+            return f"{minutes}m {remaining_seconds}s"
+        return f"{minutes}m"
+
+    hours = int(minutes / 60)
+    remaining_minutes = minutes % 60
+
+    if remaining_minutes > 0:
+        return f"{hours}h {remaining_minutes}m"
+    return f"{hours}h"
+
+
+def safe_json_save(file_path, data):
+    """Atomically save JSON data to file.
+
+    Writes to temp file, then atomic rename to prevent corruption.
+
+    Args:
+        file_path: Path to JSON file
+        data: Data to serialize to JSON
+    """
+    file_path = Path(file_path)
+    file_path.parent.mkdir(parents=True, exist_ok=True)
+
+    temp_file = file_path.with_suffix(".json.tmp")
+    with open(temp_file, "w") as f:
+        json.dump(data, f, indent=2)
+        f.flush()
+        os.fsync(f.fileno())
+    temp_file.replace(file_path)
+
+
+def http_get_with_retry(url, params=None, max_retries=5, base_delay=1.0, timeout=60):
+    """HTTP GET with exponential backoff retry.
+
+    Args:
+        url: URL to fetch
+        params: Optional query parameters
+        max_retries: Maximum retry attempts (default: 5)
+        base_delay: Initial delay in seconds (default: 1.0)
+        timeout: Request timeout in seconds (default: 60)
+
+    Returns:
+        requests.Response object
+
+    Raises:
+        requests.RequestException: If all retries exhausted
+    """
+    for attempt in range(max_retries):
+        try:
+            response = requests.get(url, params=params, timeout=timeout)
+            response.raise_for_status()
+            return response
+        except RequestException as e:
+            if attempt == max_retries - 1:
+                raise
+
+            delay = base_delay * (2**attempt)
+            logger.warning(f"Request failed (attempt {attempt + 1}/{max_retries}): {e}")
+            logger.info(f"Retrying in {delay:.1f} seconds...")
+            time.sleep(delay)
{mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/src/mapillary_downloader/worker.py
RENAMED
@@ -5,9 +5,9 @@ import signal
 import tempfile
 from pathlib import Path
 import requests
-from requests.exceptions import RequestException
 from mapillary_downloader.exif_writer import write_exif_to_image
 from mapillary_downloader.webp_converter import convert_to_webp
+from mapillary_downloader.utils import http_get_with_retry


 def worker_process(work_queue, result_queue, worker_id):
@@ -69,11 +69,13 @@ def download_and_convert_image(image_data, output_dir, quality, convert_webp, se
     if not image_url:
         return (image_id, 0, False, f"No {quality} URL")

-    # Determine final output directory
+    # Determine final output directory - organize by first char of sequence ID
     output_dir = Path(output_dir)
     sequence_id = image_data.get("sequence")
     if sequence_id:
-
+        # Use first character as bucket (gives us ~62 dirs instead of millions)
+        first_char = sequence_id[0]
+        img_dir = output_dir / first_char / sequence_id
         img_dir.mkdir(parents=True, exist_ok=True)
     else:
         img_dir = output_dir
@@ -88,19 +90,18 @@ def download_and_convert_image(image_data, output_dir, quality, convert_webp, se
     jpg_path = img_dir / f"{image_id}.jpg"
     final_path = jpg_path

-    # Download image
+    # Download image with retry logic
     bytes_downloaded = 0

     try:
-        #
-        response =
-        response.raise_for_status()
+        # Use retry logic with 3 attempts for image downloads
+        response = http_get_with_retry(image_url, max_retries=3, base_delay=1.0, timeout=60)

         with open(jpg_path, "wb") as f:
             for chunk in response.iter_content(chunk_size=8192):
                 f.write(chunk)
                 bytes_downloaded += len(chunk)
-    except
+    except Exception as e:
         return (image_id, 0, False, f"Download failed: {e}")

     # Write EXIF metadata
mapillary_downloader-0.5.2/src/mapillary_downloader/utils.py
REMOVED
@@ -1,47 +0,0 @@
-"""Utility functions for formatting and display."""
-
-
-def format_size(bytes_count):
-    """Format bytes as human-readable size.
-
-    Args:
-        bytes_count: Number of bytes
-
-    Returns:
-        Formatted string (e.g. "1.23 GB", "456.78 MB")
-    """
-    if bytes_count >= 1_000_000_000:
-        return f"{bytes_count / 1_000_000_000:.2f} GB"
-    if bytes_count >= 1_000_000:
-        return f"{bytes_count / 1_000_000:.2f} MB"
-    if bytes_count >= 1_000:
-        return f"{bytes_count / 1000:.2f} KB"
-    return f"{bytes_count} B"
-
-
-def format_time(seconds):
-    """Format seconds as human-readable time.
-
-    Args:
-        seconds: Number of seconds
-
-    Returns:
-        Formatted string (e.g. "2h 15m", "45m 30s", "30s")
-    """
-    if seconds < 60:
-        return f"{int(seconds)}s"
-
-    minutes = int(seconds / 60)
-    remaining_seconds = int(seconds % 60)
-
-    if minutes < 60:
-        if remaining_seconds > 0:
-            return f"{minutes}m {remaining_seconds}s"
-        return f"{minutes}m"
-
-    hours = int(minutes / 60)
-    remaining_minutes = minutes % 60
-
-    if remaining_minutes > 0:
-        return f"{hours}h {remaining_minutes}m"
-    return f"{hours}h"
{mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/LICENSE.md
RENAMED
File without changes
{mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/src/mapillary_downloader/__init__.py
RENAMED
File without changes
{mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/src/mapillary_downloader/client.py
RENAMED
File without changes
{mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/src/mapillary_downloader/exif_writer.py
RENAMED
File without changes
{mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/src/mapillary_downloader/ia_check.py
RENAMED
File without changes
{mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/src/mapillary_downloader/ia_meta.py
RENAMED
File without changes
{mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/src/mapillary_downloader/logging_config.py
RENAMED
File without changes
{mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/src/mapillary_downloader/metadata_reader.py
RENAMED
File without changes
{mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/src/mapillary_downloader/webp_converter.py
RENAMED
File without changes
{mapillary_downloader-0.5.2 → mapillary_downloader-0.6.1}/src/mapillary_downloader/worker_pool.py
RENAMED
File without changes