mapillary-downloader 0.6.1__tar.gz → 0.7.2__tar.gz

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (20)
  1. {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/PKG-INFO +25 -11
  2. {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/README.md +24 -10
  3. {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/pyproject.toml +1 -1
  4. {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/__main__.py +2 -2
  5. {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/downloader.py +2 -5
  6. mapillary_downloader-0.7.2/src/mapillary_downloader/graphql_web.py +193 -0
  7. {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/ia_meta.py +1 -1
  8. {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/tar_sequences.py +29 -28
  9. {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/worker.py +15 -5
  10. {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/worker_pool.py +3 -4
  11. {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/LICENSE.md +0 -0
  12. {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/__init__.py +0 -0
  13. {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/client.py +0 -0
  14. {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/exif_writer.py +0 -0
  15. {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/ia_check.py +0 -0
  16. {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/ia_stats.py +0 -0
  17. {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/logging_config.py +0 -0
  18. {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/metadata_reader.py +0 -0
  19. {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/utils.py +0 -0
  20. {mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/webp_converter.py +0 -0
{mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mapillary_downloader
- Version: 0.6.1
+ Version: 0.7.2
  Summary: Archive user data from Mapillary
  Author-email: Gareth Davidson <gaz@bitplane.net>
  Requires-Python: >=3.10
@@ -66,7 +66,7 @@ mapillary-downloader --output ./downloads USERNAME1
  | `--quality` | 256, 1024, 2048 or original | `original` |
  | `--bbox` | `west,south,east,north` | `None` |
  | `--no-webp` | Don't convert to WebP | `False` |
- | `--max-workers` | Maximum number of parallel download workers | `128` |
+ | `--max-workers` | Maximum number of parallel download workers | CPU count |
  | `--no-tar` | Don't tar bucket directories | `False` |
  | `--no-check-ia` | Don't check if exists on Internet Archive | `False` |

@@ -100,21 +100,28 @@ mapillary-downloader --no-webp USERNAME

  ## Tarballs

- Images are organized by sequence ID, bucketed by the first character of the
- sequence to reduce directory count:
+ Images are organized by capture date (YYYY-MM-DD) for incremental archiving:

  ```
  mapillary-username-quality/
- a/
+ 2024-01-15/
  abc123/
  image1.webp
  image2.webp
+ bcd456/
+ image3.webp
+ 2024-01-16/
+ def789/
+ image4.webp
  ```

- By default, these bucket directories are automatically tarred after download
- (resulting in `a.tar`, `b.tar`, etc. - about 62 tar files total). This is done
- because large collections with millions of images would otherwise create hundreds
- of thousands of tiny tars, and anger the archive gods.
+ By default, these date directories are automatically tarred after download
+ (resulting in `2024-01-15.tar`, `2024-01-16.tar`, etc.). This date-based
+ organization enables:
+
+ - **Incremental uploads** - Upload each day's tar as soon as it's ready
+ - **Manageable file counts** - ~365 days/year × 10 years = 3,650 tars max
+ - **Chronological organization** - Natural sorting and progress tracking

  To keep individual files instead of creating tars, use the `--no-tar` flag.

@@ -128,8 +135,15 @@ See inlay for details:

  * [📀 rip](https://bitplane.net/dev/sh/rip)

+ ## 📊 Stats
+
+ To see overall project progress, or an estimate, use `--stats`
+
+ ```bash
+ mapillary-downloader --stats
+ ```

- ## Development
+ ## 🚧 Development

  ```bash
  make dev # Setup dev environment
@@ -138,7 +152,7 @@ make dist # Build the distribution
  make help # See other make options
  ```

- ## Links
+ ## 🔗 Links

  * [🏠 home](https://bitplane.net/dev/python/mapillary_downloader)
  * [📖 pydoc](https://bitplane.net/dev/python/mapillary_downloader/pydoc)
{mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/README.md
@@ -36,7 +36,7 @@ mapillary-downloader --output ./downloads USERNAME1
  | `--quality` | 256, 1024, 2048 or original | `original` |
  | `--bbox` | `west,south,east,north` | `None` |
  | `--no-webp` | Don't convert to WebP | `False` |
- | `--max-workers` | Maximum number of parallel download workers | `128` |
+ | `--max-workers` | Maximum number of parallel download workers | CPU count |
  | `--no-tar` | Don't tar bucket directories | `False` |
  | `--no-check-ia` | Don't check if exists on Internet Archive | `False` |

@@ -70,21 +70,28 @@ mapillary-downloader --no-webp USERNAME

  ## Tarballs

- Images are organized by sequence ID, bucketed by the first character of the
- sequence to reduce directory count:
+ Images are organized by capture date (YYYY-MM-DD) for incremental archiving:

  ```
  mapillary-username-quality/
- a/
+ 2024-01-15/
  abc123/
  image1.webp
  image2.webp
+ bcd456/
+ image3.webp
+ 2024-01-16/
+ def789/
+ image4.webp
  ```

- By default, these bucket directories are automatically tarred after download
- (resulting in `a.tar`, `b.tar`, etc. - about 62 tar files total). This is done
- because large collections with millions of images would otherwise create hundreds
- of thousands of tiny tars, and anger the archive gods.
+ By default, these date directories are automatically tarred after download
+ (resulting in `2024-01-15.tar`, `2024-01-16.tar`, etc.). This date-based
+ organization enables:
+
+ - **Incremental uploads** - Upload each day's tar as soon as it's ready
+ - **Manageable file counts** - ~365 days/year × 10 years = 3,650 tars max
+ - **Chronological organization** - Natural sorting and progress tracking

  To keep individual files instead of creating tars, use the `--no-tar` flag.

@@ -98,8 +105,15 @@ See inlay for details:

  * [📀 rip](https://bitplane.net/dev/sh/rip)

+ ## 📊 Stats
+
+ To see overall project progress, or an estimate, use `--stats`
+
+ ```bash
+ mapillary-downloader --stats
+ ```

- ## Development
+ ## 🚧 Development

  ```bash
  make dev # Setup dev environment
@@ -108,7 +122,7 @@ make dist # Build the distribution
  make help # See other make options
  ```

- ## Links
+ ## 🔗 Links

  * [🏠 home](https://bitplane.net/dev/python/mapillary_downloader)
  * [📖 pydoc](https://bitplane.net/dev/python/mapillary_downloader/pydoc)
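The date directory names in the tree above come straight from each image's `captured_at` timestamp (milliseconds since the epoch), as implemented in the `worker.py` change further down. A minimal sketch of that mapping, using the timezone-aware equivalent of the `utcfromtimestamp` call in the diff and illustrative timestamp values:

```python
from datetime import datetime, timezone

def date_dir(captured_at_ms):
    """Map a captured_at value (ms since epoch) to a UTC YYYY-MM-DD directory name."""
    if not captured_at_ms:
        return "unknown-date"  # fallback used when the timestamp is missing
    return datetime.fromtimestamp(captured_at_ms / 1000, tz=timezone.utc).strftime("%Y-%m-%d")

print(date_dir(1705312800000))  # 2024-01-15
print(date_dir(None))           # unknown-date
```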
{mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/pyproject.toml
@@ -1,7 +1,7 @@
  [project]
  name = "mapillary_downloader"
  description = "Archive user data from Mapillary"
- version = "0.6.1"
+ version = "0.7.2"
  authors = [
  { name = "Gareth Davidson", email = "gaz@bitplane.net" }
  ]
{mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/__main__.py
@@ -43,8 +43,8 @@ def main():
  parser.add_argument(
  "--max-workers",
  type=int,
- default=128,
- help="Maximum number of parallel workers (default: 128)",
+ default=os.cpu_count() or 8,
+ help=f"Maximum number of parallel workers (default: CPU count = {os.cpu_count() or 8})",
  )
  parser.add_argument(
  "--no-tar",
{mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/downloader.py
@@ -67,7 +67,6 @@ class MapillaryDownloader:
  self.username = username
  self.quality = quality
  self.max_workers = max_workers
- self.initial_workers = os.cpu_count() or 1 # Start with CPU count
  self.tar_sequences = tar_sequences
  self.convert_webp = convert_webp
  self.check_ia = check_ia
@@ -173,7 +172,7 @@ class MapillaryDownloader:
  logger.info(f"Downloading images for user: {self.username}")
  logger.info(f"Output directory: {self.output_dir}")
  logger.info(f"Quality: {self.quality}")
- logger.info(f"Worker pool: {self.initial_workers} initial, {self.max_workers} max")
+ logger.info(f"Worker pool: max {self.max_workers} workers")

  start_time = time.time()

@@ -188,9 +187,7 @@ class MapillaryDownloader:
  # Step 2: Start worker pool
  # Since workers do both I/O (download) and CPU (WebP), need many more workers
  # Start with CPU count and scale up based on throughput
- pool = AdaptiveWorkerPool(
- worker_process, min_workers=self.initial_workers, max_workers=self.max_workers, monitoring_interval=10
- )
+ pool = AdaptiveWorkerPool(worker_process, max_workers=self.max_workers, monitoring_interval=10)
  pool.start()

  # Step 3: Download images from metadata file while fetching new from API
mapillary_downloader-0.7.2/src/mapillary_downloader/graphql_web.py
@@ -0,0 +1,193 @@
+ """GraphQL web API utilities (unofficial, experimental).
+
+ This module provides access to Mapillary's GraphQL endpoint used by the web interface.
+ Unlike the official v4 REST API, this requires a public web token extracted from the
+ JavaScript bundle.
+
+ Use cases:
+ - Get user image counts without pagination
+ - Access leaderboard data
+ - Check for updates to existing downloads
+
+ WARNING: This is not officially documented and may break at any time.
+ """
+
+ import json
+ import logging
+ import re
+ from datetime import datetime
+ from urllib.parse import urlencode, quote
+ import requests
+
+ logger = logging.getLogger("mapillary_downloader")
+
+ # Fallback token (extracted from main JS bundle as of 2025-01-09)
+ FALLBACK_TOKEN = "MLY|4223665974375089|d62822dd792b6a823d0794ef26450398"
+
+
+ def extract_token_from_js():
+ """Extract public web token from Mapillary's JavaScript bundle.
+
+ This fetches the main page, finds the main JS bundle, and extracts
+ the hardcoded MLY token used for GraphQL queries.
+
+ Returns:
+ Token string (e.g., "MLY|123|abc...") or None if extraction failed
+ """
+ try:
+ # Fetch main page to find JS bundle URL
+ # Need consent cookie to get actual page (not GDPR banner)
+ logger.debug("Fetching Mapillary main page...")
+ # Generate today's date in the format YYYY_MM_DD for cookie
+ today = datetime.now().strftime("%Y_%m_%d")
+ cookies = {
+ "mly_cb": f'{{"version":"1","date":"{today}","third_party_consent":"withdrawn","categories":{{"content_and_media":"withdrawn"}},"integration_controls":{{"YOUTUBE":"withdrawn"}}}}'
+ }
+ headers = {
+ "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:144.0) Gecko/20100101 Firefox/144.0",
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+ "Accept-Language": "en-GB,en;q=0.5",
+ "Sec-GPC": "1",
+ "Upgrade-Insecure-Requests": "1",
+ "Sec-Fetch-Dest": "document",
+ "Sec-Fetch-Mode": "navigate",
+ "Sec-Fetch-Site": "none",
+ "Sec-Fetch-User": "?1",
+ }
+ response = requests.get("https://www.mapillary.com/app/", cookies=cookies, headers=headers, timeout=30)
+ response.raise_for_status()
+
+ # Find main JS file URL
+ # Pattern: <script src="main.{hash}.js" type="module"></script>
+ js_match = re.search(r'src="(main\.[a-f0-9]+\.js)"', response.text)
+ if not js_match:
+ logger.warning("Could not find main JS bundle URL in page")
+ return None
+
+ # URL is relative to /app/ base path
+ js_url = f"https://www.mapillary.com/app/{js_match.group(1)}"
+ logger.debug(f"Found JS bundle: {js_url}")
+
+ # Fetch JS bundle
+ logger.debug("Fetching JS bundle...")
+ js_response = requests.get(js_url, timeout=30)
+ js_response.raise_for_status()
+
+ # Extract token
+ # Pattern: "MLY|{client_id}|{secret}"
+ token_match = re.search(r'"(MLY\|[^"]+)"', js_response.text)
+ if not token_match:
+ logger.warning("Could not find MLY token in JS bundle")
+ return None
+
+ token = token_match.group(1)
+ logger.info(f"Extracted web token: {token[:20]}...")
+ return token
+
+ except requests.RequestException as e:
+ logger.error(f"Failed to extract web token: {e}")
+ return None
+ except Exception as e:
+ logger.error(f"Unexpected error extracting web token: {e}")
+ return None
+
+
+ def get_leaderboard(key="global", token=None):
+ """Get leaderboard data from Mapillary GraphQL API.
+
+ Args:
+ key: Leaderboard key (e.g., "global", country name, etc.)
+ token: MLY token (if None, will extract from JS bundle or use fallback)
+
+ Returns:
+ Dict with leaderboard data, or None on error
+ """
+ if token is None:
+ token = extract_token_from_js()
+ if token is None:
+ logger.warning("Failed to extract token, using fallback")
+ token = FALLBACK_TOKEN
+
+ # GraphQL query for leaderboard (lifetime stats only)
+ query = """query getUserLeaderboard($key: String!) {
+ user_leaderboards(key: $key) {
+ lifetime {
+ count
+ user {
+ id
+ username
+ profile_photo_url
+ __typename
+ }
+ __typename
+ }
+ __typename
+ }
+ }"""
+
+ try:
+ headers = {
+ "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:144.0) Gecko/20100101 Firefox/144.0",
+ "Accept": "*/*",
+ "Accept-Language": "en-GB,en;q=0.5",
+ "Referer": "https://www.mapillary.com/",
+ "content-type": "application/json",
+ "authorization": f"OAuth {token}",
+ "Origin": "https://www.mapillary.com",
+ "Sec-Fetch-Dest": "empty",
+ "Sec-Fetch-Mode": "cors",
+ "Sec-Fetch-Site": "same-site",
+ }
+
+ # Build query params - use quote_via=quote to get %20 instead of +
+ # Note: both 'doc' and 'query' params seem to be required (from observed curl)
+ params = {
+ "doc": query,
+ "query": query,
+ "operationName": "getUserLeaderboard",
+ "variables": json.dumps({"key": key}, separators=(',', ':')),
+ }
+
+ # Build URL with proper percent encoding (not + for spaces)
+ # Don't encode parentheses to match curl behavior
+ query_string = urlencode(params, quote_via=lambda s, safe='', encoding=None, errors=None: quote(s, safe='()!'))
+ url = f"https://graph.mapillary.com/graphql?{query_string}"
+
+ logger.debug(f"Querying leaderboard for key: {key}")
+
+ response = requests.get(
+ url,
+ headers=headers,
+ timeout=30
+ )
+ response.raise_for_status()
+
+ return response.json()
+
+ except requests.RequestException as e:
+ logger.error(f"Failed to query leaderboard: {e}")
+ return None
+ except Exception as e:
+ logger.error(f"Unexpected error querying leaderboard: {e}")
+ return None
+
+
+ if __name__ == "__main__":
+ # Test the extraction and leaderboard query
+ logging.basicConfig(level=logging.DEBUG)
+
+ print("=== Extracting token ===")
+ token = extract_token_from_js()
+ if token:
+ print(f"Success! Token: {token}")
+ else:
+ print("Failed to extract token")
+ print(f"Fallback: {FALLBACK_TOKEN}")
+ token = FALLBACK_TOKEN
+
+ print("\n=== Querying global leaderboard ===")
+ data = get_leaderboard("global", token=token)
+ if data:
+ print(json.dumps(data, indent=2))
+ else:
+ print("Failed to get leaderboard data")
{mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/ia_meta.py
@@ -182,7 +182,7 @@ def generate_ia_metadata(collection_dir):
  write_meta_tag(meta_dir, "coverage", f"{first_date} - {last_date}")
  write_meta_tag(meta_dir, "licenseurl", "https://creativecommons.org/licenses/by-sa/4.0/")
  write_meta_tag(meta_dir, "mediatype", "data")
- write_meta_tag(meta_dir, "collection", "opensource_media")
+ write_meta_tag(meta_dir, "collection", "mapillary-images")

  # Source and scanner metadata
  write_meta_tag(meta_dir, "source", f"https://www.mapillary.com/app/user/{username}")
{mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/tar_sequences.py
@@ -1,6 +1,7 @@
  """Tar sequence directories for efficient Internet Archive uploads."""

  import logging
+ import re
  import tarfile
  from pathlib import Path
  from mapillary_downloader.utils import format_size
@@ -9,7 +10,9 @@ logger = logging.getLogger("mapillary_downloader")


  def tar_sequence_directories(collection_dir):
- """Tar all sequence directories in a collection for faster IA uploads.
+ """Tar all date directories in a collection for faster IA uploads.
+
+ Organizes by capture date (YYYY-MM-DD) for incremental archive.org uploads.

  Args:
  collection_dir: Path to collection directory (e.g., mapillary-user-quality/)
@@ -23,44 +26,44 @@ def tar_sequence_directories(collection_dir):
  logger.error(f"Collection directory not found: {collection_dir}")
  return 0, 0

- # Find all bucket directories (skip special dirs)
- # Now we tar entire bucket dirs (e.g., a/, b/, etc) to get ~62 tar files
+ # Find all date directories (skip special dirs)
+ # Date format: YYYY-MM-DD or unknown-date
  skip_dirs = {".meta", "__pycache__"}
- bucket_dirs = []
+ date_dirs = []

  for item in collection_dir.iterdir():
  if item.is_dir() and item.name not in skip_dirs:
- # Check if this is a bucket dir (single char)
- if len(item.name) == 1:
- bucket_dirs.append(item)
+ # Check if this is a date dir (YYYY-MM-DD) or unknown-date
+ if re.match(r"\d{4}-\d{2}-\d{2}$", item.name) or item.name == "unknown-date":
+ date_dirs.append(item)

- if not bucket_dirs:
- logger.info("No bucket directories to tar")
+ if not date_dirs:
+ logger.info("No date directories to tar")
  return 0, 0

- # Sort bucket directories alphabetically for consistent progress tracking
- bucket_dirs = sorted(bucket_dirs, key=lambda x: x.name)
+ # Sort date directories chronologically (YYYY-MM-DD sorts naturally)
+ date_dirs = sorted(date_dirs, key=lambda x: x.name)

- logger.info(f"Tarring {len(bucket_dirs)} bucket directories...")
+ logger.info(f"Tarring {len(date_dirs)} date directories...")

  tarred_count = 0
  total_files = 0
  total_tar_bytes = 0

- for bucket_dir in bucket_dirs:
- bucket_name = bucket_dir.name
- tar_path = collection_dir / f"{bucket_name}.tar"
+ for date_dir in date_dirs:
+ date_name = date_dir.name
+ tar_path = collection_dir / f"{date_name}.tar"

- # Count files in bucket
- files_to_tar = sorted([f for f in bucket_dir.rglob("*") if f.is_file()], key=lambda x: str(x))
+ # Count files in date directory
+ files_to_tar = sorted([f for f in date_dir.rglob("*") if f.is_file()], key=lambda x: str(x))
  file_count = len(files_to_tar)

  if file_count == 0:
- logger.warning(f"Skipping empty bucket directory: {bucket_name}")
+ logger.warning(f"Skipping empty date directory: {date_name}")
  continue

  try:
- logger.info(f"Tarring bucket '{bucket_name}' ({file_count} files)...")
+ logger.info(f"Tarring date '{date_name}' ({file_count} files)...")

  # Create reproducible uncompressed tar (WebP already compressed)
  with tarfile.open(tar_path, "w") as tar:
@@ -87,36 +90,34 @@ def tar_sequence_directories(collection_dir):
  tar_size = tar_path.stat().st_size
  total_tar_bytes += tar_size

- # Remove original bucket directory
- for file in bucket_dir.rglob("*"):
+ # Remove original date directory
+ for file in date_dir.rglob("*"):
  if file.is_file():
  file.unlink()

  # Remove empty subdirs and main dir
- for subdir in list(bucket_dir.rglob("*")):
+ for subdir in list(date_dir.rglob("*")):
  if subdir.is_dir():
  try:
  subdir.rmdir()
  except OSError:
  pass # Not empty yet

- bucket_dir.rmdir()
+ date_dir.rmdir()

  tarred_count += 1
  total_files += file_count

- logger.info(f"Tarred bucket '{bucket_name}': {file_count:,} files, {format_size(tar_size)}")
+ logger.info(f"Tarred date '{date_name}': {file_count:,} files, {format_size(tar_size)}")
  else:
  logger.error(f"Tar file empty or not created: {tar_path}")
  if tar_path.exists():
  tar_path.unlink()

  except Exception as e:
- logger.error(f"Error tarring bucket {bucket_name}: {e}")
+ logger.error(f"Error tarring date {date_name}: {e}")
  if tar_path.exists():
  tar_path.unlink()

- logger.info(
- f"Tarred {tarred_count} sequences ({total_files:,} files, {format_size(total_tar_bytes)} total tar size)"
- )
+ logger.info(f"Tarred {tarred_count} dates ({total_files:,} files, {format_size(total_tar_bytes)} total tar size)")
  return tarred_count, total_files
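The directory selection rule above is worth spelling out: only strict `YYYY-MM-DD` names (plus the `unknown-date` fallback) get tarred, so anything else under the collection is left untouched. A small sketch of the same check against a few hypothetical directory names:

```python
import re

# Same pattern as tar_sequences.py: strict YYYY-MM-DD, or the unknown-date fallback.
DATE_RE = re.compile(r"\d{4}-\d{2}-\d{2}$")

for name in ("2024-01-15", "unknown-date", ".meta", "a", "2024-1-5"):
    tarred = bool(DATE_RE.match(name)) or name == "unknown-date"
    print(f"{name:12} -> {'tarred' if tarred else 'skipped'}")
```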
{mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/worker.py
@@ -3,6 +3,7 @@
  import os
  import signal
  import tempfile
+ from datetime import datetime
  from pathlib import Path
  import requests
  from mapillary_downloader.exif_writer import write_exif_to_image
@@ -69,16 +70,25 @@ def download_and_convert_image(image_data, output_dir, quality, convert_webp, se
  if not image_url:
  return (image_id, 0, False, f"No {quality} URL")

- # Determine final output directory - organize by first char of sequence ID
+ # Determine final output directory - organize by capture date
  output_dir = Path(output_dir)
  sequence_id = image_data.get("sequence")
+
+ # Extract date from captured_at timestamp (milliseconds since epoch)
+ captured_at = image_data.get("captured_at")
+ if captured_at:
+ # Convert to UTC date string (YYYY-MM-DD)
+ date_str = datetime.utcfromtimestamp(captured_at / 1000).strftime("%Y-%m-%d")
+ else:
+ # Fallback for missing timestamp (should be rare per API docs)
+ date_str = "unknown-date"
+
  if sequence_id:
- # Use first character as bucket (gives us ~62 dirs instead of millions)
- first_char = sequence_id[0]
- img_dir = output_dir / first_char / sequence_id
+ img_dir = output_dir / date_str / sequence_id
  img_dir.mkdir(parents=True, exist_ok=True)
  else:
- img_dir = output_dir
+ img_dir = output_dir / date_str
+ img_dir.mkdir(parents=True, exist_ok=True)

  # If converting to WebP, use /tmp for intermediate JPEG
  # Otherwise write JPEG directly to final location
{mapillary_downloader-0.6.1 → mapillary_downloader-0.7.2}/src/mapillary_downloader/worker_pool.py
@@ -17,17 +17,15 @@ class AdaptiveWorkerPool:
  - If throughput plateauing/decreasing: reduce workers
  """

- def __init__(self, worker_func, min_workers=4, max_workers=16, monitoring_interval=10):
+ def __init__(self, worker_func, max_workers=16, monitoring_interval=10):
  """Initialize adaptive worker pool.

  Args:
  worker_func: Function to run in each worker (must accept work_queue, result_queue)
- min_workers: Minimum number of workers
  max_workers: Maximum number of workers
  monitoring_interval: Seconds between throughput checks
  """
  self.worker_func = worker_func
- self.min_workers = min_workers
  self.max_workers = max_workers
  self.monitoring_interval = monitoring_interval

@@ -37,7 +35,8 @@ class AdaptiveWorkerPool:

  # Worker management
  self.workers = []
- self.current_workers = min_workers # Start small and ramp up
+ # Start at 25% of max_workers (at least 1)
+ self.current_workers = max(1, int(max_workers * 0.25))

  # Throughput monitoring
  self.throughput_history = deque(maxlen=5) # Last 5 measurements
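With `min_workers` gone, the pool's starting size is now derived from `max_workers` alone: 25% of the maximum, never below one worker. A quick sketch of what that formula gives for a few pool sizes (the example values are illustrative; only the default of CPU count or 8 comes from the diff):

```python
import os

def initial_workers(max_workers: int) -> int:
    # Same formula as AdaptiveWorkerPool.__init__: start at 25% of max, at least 1.
    return max(1, int(max_workers * 0.25))

default_max = os.cpu_count() or 8  # new --max-workers default
for max_w in (2, default_max, 32, 128):
    print(f"max_workers={max_w:<3} -> start with {initial_workers(max_w)}")
```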