mapillary-downloader 0.6.1__tar.gz → 0.7.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/PKG-INFO +25 -11
- {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/README.md +24 -10
- {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/pyproject.toml +1 -1
- {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/__main__.py +2 -2
- {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/downloader.py +2 -5
- mapillary_downloader-0.7.2/src/mapillary_downloader/graphql_web.py +193 -0
- {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/ia_meta.py +1 -1
- {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/tar_sequences.py +29 -28
- {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/worker.py +15 -5
- {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/worker_pool.py +3 -4
- {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/LICENSE.md +0 -0
- {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/__init__.py +0 -0
- {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/client.py +0 -0
- {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/exif_writer.py +0 -0
- {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/ia_check.py +0 -0
- {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/ia_stats.py +0 -0
- {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/logging_config.py +0 -0
- {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/metadata_reader.py +0 -0
- {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/utils.py +0 -0
- {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/webp_converter.py +0 -0
{mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mapillary_downloader
-Version: 0.6.1
+Version: 0.7.2
 Summary: Archive user data from Mapillary
 Author-email: Gareth Davidson <gaz@bitplane.net>
 Requires-Python: >=3.10
@@ -66,7 +66,7 @@ mapillary-downloader --output ./downloads USERNAME1
 | `--quality` | 256, 1024, 2048 or original | `original` |
 | `--bbox` | `west,south,east,north` | `None` |
 | `--no-webp` | Don't convert to WebP | `False` |
-| `--max-workers` | Maximum number of parallel download workers |
+| `--max-workers` | Maximum number of parallel download workers | CPU count |
 | `--no-tar` | Don't tar bucket directories | `False` |
 | `--no-check-ia` | Don't check if exists on Internet Archive | `False` |
 
@@ -100,21 +100,28 @@ mapillary-downloader --no-webp USERNAME
 
 ## Tarballs
 
-Images are organized by
-sequence to reduce directory count:
+Images are organized by capture date (YYYY-MM-DD) for incremental archiving:
 
 ```
 mapillary-username-quality/
-
+  2024-01-15/
     abc123/
       image1.webp
       image2.webp
+    bcd456/
+      image3.webp
+  2024-01-16/
+    def789/
+      image4.webp
 ```
 
-By default, these
-(resulting in `
-
-
+By default, these date directories are automatically tarred after download
+(resulting in `2024-01-15.tar`, `2024-01-16.tar`, etc.). This date-based
+organization enables:
+
+- **Incremental uploads** - Upload each day's tar as soon as it's ready
+- **Manageable file counts** - ~365 days/year × 10 years = 3,650 tars max
+- **Chronological organization** - Natural sorting and progress tracking
 
 To keep individual files instead of creating tars, use the `--no-tar` flag.
 
@@ -128,8 +135,15 @@ See inlay for details:
 
 * [📀 rip](https://bitplane.net/dev/sh/rip)
 
+## 📊 Stats
+
+To see overall project progress, or an estimate, use `--stats`
+
+```bash
+mapillary-downloader --stats
+```
 
-## Development
+## 🚧 Development
 
 ```bash
 make dev  # Setup dev environment
@@ -138,7 +152,7 @@ make dist  # Build the distribution
 make help  # See other make options
 ```
 
-## Links
+## 🔗 Links
 
 * [🏠 home](https://bitplane.net/dev/python/mapillary_downloader)
 * [📖 pydoc](https://bitplane.net/dev/python/mapillary_downloader/pydoc)
{mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/README.md
RENAMED
@@ -36,7 +36,7 @@ mapillary-downloader --output ./downloads USERNAME1
 | `--quality` | 256, 1024, 2048 or original | `original` |
 | `--bbox` | `west,south,east,north` | `None` |
 | `--no-webp` | Don't convert to WebP | `False` |
-| `--max-workers` | Maximum number of parallel download workers |
+| `--max-workers` | Maximum number of parallel download workers | CPU count |
 | `--no-tar` | Don't tar bucket directories | `False` |
 | `--no-check-ia` | Don't check if exists on Internet Archive | `False` |
 
@@ -70,21 +70,28 @@ mapillary-downloader --no-webp USERNAME
 
 ## Tarballs
 
-Images are organized by
-sequence to reduce directory count:
+Images are organized by capture date (YYYY-MM-DD) for incremental archiving:
 
 ```
 mapillary-username-quality/
-
+  2024-01-15/
     abc123/
       image1.webp
       image2.webp
+    bcd456/
+      image3.webp
+  2024-01-16/
+    def789/
+      image4.webp
 ```
 
-By default, these
-(resulting in `
-
-
+By default, these date directories are automatically tarred after download
+(resulting in `2024-01-15.tar`, `2024-01-16.tar`, etc.). This date-based
+organization enables:
+
+- **Incremental uploads** - Upload each day's tar as soon as it's ready
+- **Manageable file counts** - ~365 days/year × 10 years = 3,650 tars max
+- **Chronological organization** - Natural sorting and progress tracking
 
 To keep individual files instead of creating tars, use the `--no-tar` flag.
 
@@ -98,8 +105,15 @@ See inlay for details:
 
 * [📀 rip](https://bitplane.net/dev/sh/rip)
 
+## 📊 Stats
+
+To see overall project progress, or an estimate, use `--stats`
+
+```bash
+mapillary-downloader --stats
+```
 
-## Development
+## 🚧 Development
 
 ```bash
 make dev  # Setup dev environment
@@ -108,7 +122,7 @@ make dist  # Build the distribution
 make help  # See other make options
 ```
 
-## Links
+## 🔗 Links
 
 * [🏠 home](https://bitplane.net/dev/python/mapillary_downloader)
 * [📖 pydoc](https://bitplane.net/dev/python/mapillary_downloader/pydoc)
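
The hunks above pin down the new on-disk contract: one `YYYY-MM-DD` directory per capture day, each replaced by a same-named tar after download. A minimal sketch of the naming rule, with illustrative paths:

```python
# One tar per date directory: <collection>/<YYYY-MM-DD>/ becomes <collection>/<YYYY-MM-DD>.tar
from pathlib import Path

collection = Path("mapillary-username-quality")  # illustrative collection dir
for date_name in ["2024-01-15", "2024-01-16"]:
    date_dir = collection / date_name
    tar_path = collection / f"{date_name}.tar"
    print(f"{date_dir}/  ->  {tar_path}")
```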
{mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/__main__.py
RENAMED
@@ -43,8 +43,8 @@ def main():
     parser.add_argument(
         "--max-workers",
         type=int,
-        default=
-        help="Maximum number of parallel workers (default:
+        default=os.cpu_count() or 8,
+        help=f"Maximum number of parallel workers (default: CPU count = {os.cpu_count() or 8})",
     )
     parser.add_argument(
         "--no-tar",
{mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/downloader.py
RENAMED
@@ -67,7 +67,6 @@ class MapillaryDownloader:
         self.username = username
         self.quality = quality
         self.max_workers = max_workers
-        self.initial_workers = os.cpu_count() or 1  # Start with CPU count
         self.tar_sequences = tar_sequences
         self.convert_webp = convert_webp
         self.check_ia = check_ia
@@ -173,7 +172,7 @@ class MapillaryDownloader:
         logger.info(f"Downloading images for user: {self.username}")
         logger.info(f"Output directory: {self.output_dir}")
         logger.info(f"Quality: {self.quality}")
-        logger.info(f"Worker pool:
+        logger.info(f"Worker pool: max {self.max_workers} workers")
 
         start_time = time.time()
 
@@ -188,9 +187,7 @@ class MapillaryDownloader:
         # Step 2: Start worker pool
         # Since workers do both I/O (download) and CPU (WebP), need many more workers
         # Start with CPU count and scale up based on throughput
-        pool = AdaptiveWorkerPool(
-            worker_process, min_workers=self.initial_workers, max_workers=self.max_workers, monitoring_interval=10
-        )
+        pool = AdaptiveWorkerPool(worker_process, max_workers=self.max_workers, monitoring_interval=10)
        pool.start()
 
         # Step 3: Download images from metadata file while fetching new from API
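
With `min_workers` gone, callers pass only `max_workers` and the pool picks its own starting size (see the `worker_pool.py` hunk below). A hedged sketch of the new call site; the stub worker body and the shutdown sentinel are illustrative, not taken from this diff:

```python
# Sketch of the simplified pool construction. worker_func must accept
# (work_queue, result_queue) per the pool's docstring; the body here is a stub.
from mapillary_downloader.worker_pool import AdaptiveWorkerPool

def worker_process(work_queue, result_queue):
    while True:
        item = work_queue.get()
        if item is None:  # assumed shutdown sentinel, not confirmed by the diff
            break
        result_queue.put(item)

pool = AdaptiveWorkerPool(worker_process, max_workers=16, monitoring_interval=10)
pool.start()
```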
mapillary_downloader-0.7.2/src/mapillary_downloader/graphql_web.py
ADDED
@@ -0,0 +1,193 @@
+"""GraphQL web API utilities (unofficial, experimental).
+
+This module provides access to Mapillary's GraphQL endpoint used by the web interface.
+Unlike the official v4 REST API, this requires a public web token extracted from the
+JavaScript bundle.
+
+Use cases:
+- Get user image counts without pagination
+- Access leaderboard data
+- Check for updates to existing downloads
+
+WARNING: This is not officially documented and may break at any time.
+"""
+
+import json
+import logging
+import re
+from datetime import datetime
+from urllib.parse import urlencode, quote
+import requests
+
+logger = logging.getLogger("mapillary_downloader")
+
+# Fallback token (extracted from main JS bundle as of 2025-01-09)
+FALLBACK_TOKEN = "MLY|4223665974375089|d62822dd792b6a823d0794ef26450398"
+
+
+def extract_token_from_js():
+    """Extract public web token from Mapillary's JavaScript bundle.
+
+    This fetches the main page, finds the main JS bundle, and extracts
+    the hardcoded MLY token used for GraphQL queries.
+
+    Returns:
+        Token string (e.g., "MLY|123|abc...") or None if extraction failed
+    """
+    try:
+        # Fetch main page to find JS bundle URL
+        # Need consent cookie to get actual page (not GDPR banner)
+        logger.debug("Fetching Mapillary main page...")
+        # Generate today's date in the format YYYY_MM_DD for cookie
+        today = datetime.now().strftime("%Y_%m_%d")
+        cookies = {
+            "mly_cb": f'{{"version":"1","date":"{today}","third_party_consent":"withdrawn","categories":{{"content_and_media":"withdrawn"}},"integration_controls":{{"YOUTUBE":"withdrawn"}}}}'
+        }
+        headers = {
+            "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:144.0) Gecko/20100101 Firefox/144.0",
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+            "Accept-Language": "en-GB,en;q=0.5",
+            "Sec-GPC": "1",
+            "Upgrade-Insecure-Requests": "1",
+            "Sec-Fetch-Dest": "document",
+            "Sec-Fetch-Mode": "navigate",
+            "Sec-Fetch-Site": "none",
+            "Sec-Fetch-User": "?1",
+        }
+        response = requests.get("https://www.mapillary.com/app/", cookies=cookies, headers=headers, timeout=30)
+        response.raise_for_status()
+
+        # Find main JS file URL
+        # Pattern: <script src="main.{hash}.js" type="module"></script>
+        js_match = re.search(r'src="(main\.[a-f0-9]+\.js)"', response.text)
+        if not js_match:
+            logger.warning("Could not find main JS bundle URL in page")
+            return None
+
+        # URL is relative to /app/ base path
+        js_url = f"https://www.mapillary.com/app/{js_match.group(1)}"
+        logger.debug(f"Found JS bundle: {js_url}")
+
+        # Fetch JS bundle
+        logger.debug("Fetching JS bundle...")
+        js_response = requests.get(js_url, timeout=30)
+        js_response.raise_for_status()
+
+        # Extract token
+        # Pattern: "MLY|{client_id}|{secret}"
+        token_match = re.search(r'"(MLY\|[^"]+)"', js_response.text)
+        if not token_match:
+            logger.warning("Could not find MLY token in JS bundle")
+            return None
+
+        token = token_match.group(1)
+        logger.info(f"Extracted web token: {token[:20]}...")
+        return token
+
+    except requests.RequestException as e:
+        logger.error(f"Failed to extract web token: {e}")
+        return None
+    except Exception as e:
+        logger.error(f"Unexpected error extracting web token: {e}")
+        return None
+
+
+def get_leaderboard(key="global", token=None):
+    """Get leaderboard data from Mapillary GraphQL API.
+
+    Args:
+        key: Leaderboard key (e.g., "global", country name, etc.)
+        token: MLY token (if None, will extract from JS bundle or use fallback)
+
+    Returns:
+        Dict with leaderboard data, or None on error
+    """
+    if token is None:
+        token = extract_token_from_js()
+        if token is None:
+            logger.warning("Failed to extract token, using fallback")
+            token = FALLBACK_TOKEN
+
+    # GraphQL query for leaderboard (lifetime stats only)
+    query = """query getUserLeaderboard($key: String!) {
+  user_leaderboards(key: $key) {
+    lifetime {
+      count
+      user {
+        id
+        username
+        profile_photo_url
+        __typename
+      }
+      __typename
+    }
+    __typename
+  }
+}"""
+
+    try:
+        headers = {
+            "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:144.0) Gecko/20100101 Firefox/144.0",
+            "Accept": "*/*",
+            "Accept-Language": "en-GB,en;q=0.5",
+            "Referer": "https://www.mapillary.com/",
+            "content-type": "application/json",
+            "authorization": f"OAuth {token}",
+            "Origin": "https://www.mapillary.com",
+            "Sec-Fetch-Dest": "empty",
+            "Sec-Fetch-Mode": "cors",
+            "Sec-Fetch-Site": "same-site",
+        }
+
+        # Build query params - use quote_via=quote to get %20 instead of +
+        # Note: both 'doc' and 'query' params seem to be required (from observed curl)
+        params = {
+            "doc": query,
+            "query": query,
+            "operationName": "getUserLeaderboard",
+            "variables": json.dumps({"key": key}, separators=(',', ':')),
+        }
+
+        # Build URL with proper percent encoding (not + for spaces)
+        # Don't encode parentheses to match curl behavior
+        query_string = urlencode(params, quote_via=lambda s, safe='', encoding=None, errors=None: quote(s, safe='()!'))
+        url = f"https://graph.mapillary.com/graphql?{query_string}"
+
+        logger.debug(f"Querying leaderboard for key: {key}")
+
+        response = requests.get(
+            url,
+            headers=headers,
+            timeout=30
+        )
+        response.raise_for_status()
+
+        return response.json()
+
+    except requests.RequestException as e:
+        logger.error(f"Failed to query leaderboard: {e}")
+        return None
+    except Exception as e:
+        logger.error(f"Unexpected error querying leaderboard: {e}")
+        return None
+
+
+if __name__ == "__main__":
+    # Test the extraction and leaderboard query
+    logging.basicConfig(level=logging.DEBUG)
+
+    print("=== Extracting token ===")
+    token = extract_token_from_js()
+    if token:
+        print(f"Success! Token: {token}")
+    else:
+        print("Failed to extract token")
+        print(f"Fallback: {FALLBACK_TOKEN}")
+        token = FALLBACK_TOKEN
+
+    print("\n=== Querying global leaderboard ===")
+    data = get_leaderboard("global", token=token)
+    if data:
+        print(json.dumps(data, indent=2))
+    else:
+        print("Failed to get leaderboard data")
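
A short usage sketch of the new module, mirroring its own `__main__` block: extract the web token (falling back to the hardcoded one), then query the global leaderboard. The response shape below is inferred from the GraphQL query and the standard GraphQL envelope, so treat it as an assumption:

```python
# Usage sketch for graphql_web (unofficial endpoint; may break at any time).
from mapillary_downloader.graphql_web import FALLBACK_TOKEN, extract_token_from_js, get_leaderboard

token = extract_token_from_js() or FALLBACK_TOKEN  # scrape, else fall back
data = get_leaderboard("global", token=token)
if data:
    # Assumed shape: {"data": {"user_leaderboards": {"lifetime": [{"count": ..., "user": {...}}]}}}
    for entry in data["data"]["user_leaderboards"]["lifetime"]:
        print(entry["count"], entry["user"]["username"])
```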
{mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/ia_meta.py
RENAMED
@@ -182,7 +182,7 @@ def generate_ia_metadata(collection_dir):
     write_meta_tag(meta_dir, "coverage", f"{first_date} - {last_date}")
     write_meta_tag(meta_dir, "licenseurl", "https://creativecommons.org/licenses/by-sa/4.0/")
     write_meta_tag(meta_dir, "mediatype", "data")
-    write_meta_tag(meta_dir, "collection", "
+    write_meta_tag(meta_dir, "collection", "mapillary-images")
 
     # Source and scanner metadata
     write_meta_tag(meta_dir, "source", f"https://www.mapillary.com/app/user/{username}")
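
`write_meta_tag` itself is untouched by this diff, so its definition is not shown; a plausible minimal equivalent (purely hypothetical, one file per Internet Archive metadata key) would look like:

```python
# Hypothetical stand-in for write_meta_tag -- NOT the package's real implementation.
from pathlib import Path

def write_meta_tag(meta_dir, name, value):
    meta_dir = Path(meta_dir)
    meta_dir.mkdir(parents=True, exist_ok=True)
    (meta_dir / name).write_text(f"{value}\n")

write_meta_tag("/tmp/meta", "collection", "mapillary-images")
```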
{mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/tar_sequences.py
RENAMED
@@ -1,6 +1,7 @@
 """Tar sequence directories for efficient Internet Archive uploads."""
 
 import logging
+import re
 import tarfile
 from pathlib import Path
 from mapillary_downloader.utils import format_size
@@ -9,7 +10,9 @@ logger = logging.getLogger("mapillary_downloader")
 
 
 def tar_sequence_directories(collection_dir):
-    """Tar all
+    """Tar all date directories in a collection for faster IA uploads.
+
+    Organizes by capture date (YYYY-MM-DD) for incremental archive.org uploads.
 
     Args:
         collection_dir: Path to collection directory (e.g., mapillary-user-quality/)
@@ -23,44 +26,44 @@ def tar_sequence_directories(collection_dir):
         logger.error(f"Collection directory not found: {collection_dir}")
         return 0, 0
 
-    # Find all
-    #
+    # Find all date directories (skip special dirs)
+    # Date format: YYYY-MM-DD or unknown-date
     skip_dirs = {".meta", "__pycache__"}
-
+    date_dirs = []
 
     for item in collection_dir.iterdir():
         if item.is_dir() and item.name not in skip_dirs:
-            # Check if this is a
-            if
-
+            # Check if this is a date dir (YYYY-MM-DD) or unknown-date
+            if re.match(r"\d{4}-\d{2}-\d{2}$", item.name) or item.name == "unknown-date":
+                date_dirs.append(item)
 
-    if not
-        logger.info("No
+    if not date_dirs:
+        logger.info("No date directories to tar")
         return 0, 0
 
-    # Sort
-
+    # Sort date directories chronologically (YYYY-MM-DD sorts naturally)
+    date_dirs = sorted(date_dirs, key=lambda x: x.name)
 
-    logger.info(f"Tarring {len(
+    logger.info(f"Tarring {len(date_dirs)} date directories...")
 
     tarred_count = 0
     total_files = 0
     total_tar_bytes = 0
 
-    for
-
-    tar_path = collection_dir / f"{
+    for date_dir in date_dirs:
+        date_name = date_dir.name
+        tar_path = collection_dir / f"{date_name}.tar"
 
-        # Count files in
-        files_to_tar = sorted([f for f in
+        # Count files in date directory
+        files_to_tar = sorted([f for f in date_dir.rglob("*") if f.is_file()], key=lambda x: str(x))
         file_count = len(files_to_tar)
 
         if file_count == 0:
-            logger.warning(f"Skipping empty
+            logger.warning(f"Skipping empty date directory: {date_name}")
             continue
 
         try:
-            logger.info(f"Tarring
+            logger.info(f"Tarring date '{date_name}' ({file_count} files)...")
 
             # Create reproducible uncompressed tar (WebP already compressed)
             with tarfile.open(tar_path, "w") as tar:
@@ -87,36 +90,34 @@ def tar_sequence_directories(collection_dir):
                 tar_size = tar_path.stat().st_size
                 total_tar_bytes += tar_size
 
-                # Remove original
-                for file in
+                # Remove original date directory
+                for file in date_dir.rglob("*"):
                     if file.is_file():
                         file.unlink()
 
                 # Remove empty subdirs and main dir
-                for subdir in list(
+                for subdir in list(date_dir.rglob("*")):
                     if subdir.is_dir():
                         try:
                             subdir.rmdir()
                         except OSError:
                             pass  # Not empty yet
 
-
+                date_dir.rmdir()
 
                 tarred_count += 1
                 total_files += file_count
 
-                logger.info(f"Tarred
+                logger.info(f"Tarred date '{date_name}': {file_count:,} files, {format_size(tar_size)}")
             else:
                 logger.error(f"Tar file empty or not created: {tar_path}")
                 if tar_path.exists():
                     tar_path.unlink()
 
         except Exception as e:
-            logger.error(f"Error tarring
+            logger.error(f"Error tarring date {date_name}: {e}")
             if tar_path.exists():
                 tar_path.unlink()
 
-    logger.info(
-        f"Tarred {tarred_count} sequences ({total_files:,} files, {format_size(total_tar_bytes)} total tar size)"
-    )
+    logger.info(f"Tarred {tarred_count} dates ({total_files:,} files, {format_size(total_tar_bytes)} total tar size)")
     return tarred_count, total_files
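
The new filter is strict: `re.match` anchors at the start and the trailing `$` pins the end, so only full `YYYY-MM-DD` names (plus the literal `unknown-date`) qualify; together with the `skip_dirs` set, this keeps stray directories out of the tar pass. A quick check of the behavior:

```python
# Which directory names the new filter treats as tarrable date buckets.
import re

def is_date_dir(name):
    return bool(re.match(r"\d{4}-\d{2}-\d{2}$", name)) or name == "unknown-date"

for name in ("2024-01-15", "unknown-date", "2024-1-5", "a", ".meta"):
    print(f"{name!r}: {is_date_dir(name)}")
# Only '2024-01-15' and 'unknown-date' print True.
```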
{mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/worker.py
RENAMED
@@ -3,6 +3,7 @@
 import os
 import signal
 import tempfile
+from datetime import datetime
 from pathlib import Path
 import requests
 from mapillary_downloader.exif_writer import write_exif_to_image
@@ -69,16 +70,25 @@ def download_and_convert_image(image_data, output_dir, quality, convert_webp, se
     if not image_url:
         return (image_id, 0, False, f"No {quality} URL")
 
-    # Determine final output directory - organize by
+    # Determine final output directory - organize by capture date
     output_dir = Path(output_dir)
     sequence_id = image_data.get("sequence")
+
+    # Extract date from captured_at timestamp (milliseconds since epoch)
+    captured_at = image_data.get("captured_at")
+    if captured_at:
+        # Convert to UTC date string (YYYY-MM-DD)
+        date_str = datetime.utcfromtimestamp(captured_at / 1000).strftime("%Y-%m-%d")
+    else:
+        # Fallback for missing timestamp (should be rare per API docs)
+        date_str = "unknown-date"
+
     if sequence_id:
-
-        first_char = sequence_id[0]
-        img_dir = output_dir / first_char / sequence_id
+        img_dir = output_dir / date_str / sequence_id
         img_dir.mkdir(parents=True, exist_ok=True)
     else:
-        img_dir = output_dir
+        img_dir = output_dir / date_str
+        img_dir.mkdir(parents=True, exist_ok=True)
 
     # If converting to WebP, use /tmp for intermediate JPEG
     # Otherwise write JPEG directly to final location
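
The bucketing turns `captured_at` (milliseconds since the epoch, per Mapillary's API) into a UTC date string. A standalone sketch of the conversion; note that `datetime.utcfromtimestamp`, as used in the diff, is deprecated since Python 3.12 in favor of the timezone-aware form shown here:

```python
# captured_at (ms since epoch) -> date bucket name, with the unknown-date fallback.
from datetime import datetime, timezone

def date_bucket(captured_at_ms):
    if not captured_at_ms:
        return "unknown-date"  # fallback when the API omits the timestamp
    # Equivalent to the diff's datetime.utcfromtimestamp(ms / 1000)
    return datetime.fromtimestamp(captured_at_ms / 1000, tz=timezone.utc).strftime("%Y-%m-%d")

print(date_bucket(1705276800000))  # 2024-01-15
print(date_bucket(None))           # unknown-date
```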
{mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/worker_pool.py
RENAMED
@@ -17,17 +17,15 @@ class AdaptiveWorkerPool:
     - If throughput plateauing/decreasing: reduce workers
     """
 
-    def __init__(self, worker_func,
+    def __init__(self, worker_func, max_workers=16, monitoring_interval=10):
         """Initialize adaptive worker pool.
 
         Args:
             worker_func: Function to run in each worker (must accept work_queue, result_queue)
-            min_workers: Minimum number of workers
             max_workers: Maximum number of workers
             monitoring_interval: Seconds between throughput checks
         """
         self.worker_func = worker_func
-        self.min_workers = min_workers
         self.max_workers = max_workers
         self.monitoring_interval = monitoring_interval
 
@@ -37,7 +35,8 @@ class AdaptiveWorkerPool:
 
         # Worker management
         self.workers = []
-
+        # Start at 25% of max_workers (at least 1)
+        self.current_workers = max(1, int(max_workers * 0.25))
 
         # Throughput monitoring
         self.throughput_history = deque(maxlen=5)  # Last 5 measurements
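
The removed `min_workers` knob is replaced by a derived starting size: a quarter of `max_workers`, never below one. The arithmetic for a few pool sizes:

```python
# Initial worker count under the new rule: max(1, int(max_workers * 0.25)).
def initial_workers(max_workers):
    return max(1, int(max_workers * 0.25))

for mw in (1, 4, 8, 16, 32):
    print(f"max_workers={mw} -> start with {initial_workers(mw)}")
# 1 -> 1, 4 -> 1, 8 -> 2, 16 -> 4, 32 -> 8
```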
{mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/LICENSE.md
RENAMED
File without changes
{mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/__init__.py
RENAMED
File without changes
{mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/client.py
RENAMED
File without changes
{mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/exif_writer.py
RENAMED
File without changes
{mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/ia_check.py
RENAMED
File without changes
{mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/ia_stats.py
RENAMED
File without changes
{mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/logging_config.py
RENAMED
File without changes
{mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/metadata_reader.py
RENAMED
File without changes
{mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/utils.py
RENAMED
File without changes
{mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/webp_converter.py
RENAMED
File without changes