mapillary-downloader 0.3.1.tar.gz → 0.4.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (16)
  1. {mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/PKG-INFO +30 -18
  2. {mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/README.md +29 -17
  3. {mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/pyproject.toml +1 -1
  4. {mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/src/mapillary_downloader/__main__.py +43 -10
  5. {mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/src/mapillary_downloader/downloader.py +113 -6
  6. mapillary_downloader-0.4.0/src/mapillary_downloader/ia_check.py +33 -0
  7. {mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/src/mapillary_downloader/ia_meta.py +12 -11
  8. {mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/src/mapillary_downloader/logging_config.py +20 -0
  9. {mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/src/mapillary_downloader/tar_sequences.py +34 -18
  10. {mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/src/mapillary_downloader/worker.py +7 -0
  11. {mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/LICENSE.md +0 -0
  12. {mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/src/mapillary_downloader/__init__.py +0 -0
  13. {mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/src/mapillary_downloader/client.py +0 -0
  14. {mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/src/mapillary_downloader/exif_writer.py +0 -0
  15. {mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/src/mapillary_downloader/utils.py +0 -0
  16. {mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/src/mapillary_downloader/webp_converter.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mapillary_downloader
- Version: 0.3.1
+ Version: 0.4.0
  Summary: Download your Mapillary data before it's gone
  Author-email: Gareth Davidson <gaz@bitplane.net>
  Requires-Python: >=3.10
@@ -47,37 +47,43 @@ First, get your Mapillary API access token from
  [the developer dashboard](https://www.mapillary.com/dashboard/developers)

  ```bash
- # Set token via environment variable
+ # Set token via environment variable (recommended)
  export MAPILLARY_TOKEN=YOUR_TOKEN
- mapillary-downloader --username SOME_USERNAME --output ./downloads
+ mapillary-downloader USERNAME1 USERNAME2 USERNAME3

  # Or pass token directly, and have it in your shell history 💩👀
- mapillary-downloader --token YOUR_TOKEN --username SOME_USERNAME --output ./downloads
+ mapillary-downloader --token YOUR_TOKEN USERNAME1 USERNAME2
+
+ # Download to specific directory
+ mapillary-downloader --output ./downloads USERNAME1
  ```

- | option | because | default |
- | ------------- | ------------------------------------- | ------------------ |
- | `--username` | Mapillary username | None (required) |
- | `--token` | Mapillary API token (or env var) | `$MAPILLARY_TOKEN` |
- | `--output` | Output directory | `./mapillary_data` |
- | `--quality` | 256, 1024, 2048 or original | `original` |
- | `--bbox` | `west,south,east,north` | `None` |
- | `--webp` | Convert to WebP (saves ~70% space) | `False` |
- | `--workers` | Number of parallel download workers | Half of CPU count |
- | `--no-tar` | Don't tar sequence directories | `False` |
+ | option | because | default |
+ | --------------- | -------------------------------------------- | ------------------ |
+ | `usernames` | One or more Mapillary usernames | (required) |
+ | `--token` | Mapillary API token (or env var) | `$MAPILLARY_TOKEN` |
+ | `--output` | Output directory | `./mapillary_data` |
+ | `--quality` | 256, 1024, 2048 or original | `original` |
+ | `--bbox` | `west,south,east,north` | `None` |
+ | `--no-webp` | Don't convert to WebP | `False` |
+ | `--workers` | Number of parallel download workers | Half of CPU count |
+ | `--no-tar` | Don't tar sequence directories | `False` |
+ | `--no-check-ia` | Don't check if exists on Internet Archive | `False` |

  The downloader will:

- * 📷 Download a user's images organized by sequence
+ * 📷 Download multiple users' images organized by sequence
  * 📜 Inject EXIF metadata (GPS coordinates, camera info, timestamps,
  compass direction)
  * 🛟 Save progress so you can safely resume if interrupted
- * 🗜️ Optionally convert to WebP to save space
+ * 🗜️ Convert to WebP by default to save ~70% disk space
  * 📦 Tar sequence directories for faster uploads
+ * 🏛️ Check Internet Archive to avoid duplicate downloads
+ * 💾 Stage downloads in cache, move atomically when complete

  ## WebP Conversion

- You'll need `cwebp` to use the `--webp` flag. So install it:
+ WebP conversion is **enabled by default** (saves ~70% disk space). You'll need the `cwebp` binary installed:

  ```bash
  # Debian/Ubuntu
@@ -87,6 +93,12 @@ sudo apt install webp
  brew install webp
  ```

+ To disable WebP conversion and keep original JPEGs, use `--no-webp`:
+
+ ```bash
+ mapillary-downloader --no-webp USERNAME
+ ```
+
  ## Sequence Tarball Creation

  By default, sequence directories are automatically tarred after download because
@@ -96,7 +108,7 @@ uploading files to IA.
  To keep individual files instead of creating tars, use the `--no-tar` flag:

  ```bash
- mapillary-downloader --username WHOEVER --no-tar
+ mapillary-downloader --no-tar USERNAME
  ```

  ## Internet Archive upload
@@ -17,37 +17,43 @@ First, get your Mapillary API access token from
  [the developer dashboard](https://www.mapillary.com/dashboard/developers)

  ```bash
- # Set token via environment variable
+ # Set token via environment variable (recommended)
  export MAPILLARY_TOKEN=YOUR_TOKEN
- mapillary-downloader --username SOME_USERNAME --output ./downloads
+ mapillary-downloader USERNAME1 USERNAME2 USERNAME3

  # Or pass token directly, and have it in your shell history 💩👀
- mapillary-downloader --token YOUR_TOKEN --username SOME_USERNAME --output ./downloads
+ mapillary-downloader --token YOUR_TOKEN USERNAME1 USERNAME2
+
+ # Download to specific directory
+ mapillary-downloader --output ./downloads USERNAME1
  ```

- | option | because | default |
- | ------------- | ------------------------------------- | ------------------ |
- | `--username` | Mapillary username | None (required) |
- | `--token` | Mapillary API token (or env var) | `$MAPILLARY_TOKEN` |
- | `--output` | Output directory | `./mapillary_data` |
- | `--quality` | 256, 1024, 2048 or original | `original` |
- | `--bbox` | `west,south,east,north` | `None` |
- | `--webp` | Convert to WebP (saves ~70% space) | `False` |
- | `--workers` | Number of parallel download workers | Half of CPU count |
- | `--no-tar` | Don't tar sequence directories | `False` |
+ | option | because | default |
+ | --------------- | -------------------------------------------- | ------------------ |
+ | `usernames` | One or more Mapillary usernames | (required) |
+ | `--token` | Mapillary API token (or env var) | `$MAPILLARY_TOKEN` |
+ | `--output` | Output directory | `./mapillary_data` |
+ | `--quality` | 256, 1024, 2048 or original | `original` |
+ | `--bbox` | `west,south,east,north` | `None` |
+ | `--no-webp` | Don't convert to WebP | `False` |
+ | `--workers` | Number of parallel download workers | Half of CPU count |
+ | `--no-tar` | Don't tar sequence directories | `False` |
+ | `--no-check-ia` | Don't check if exists on Internet Archive | `False` |

  The downloader will:

- * 📷 Download a user's images organized by sequence
+ * 📷 Download multiple users' images organized by sequence
  * 📜 Inject EXIF metadata (GPS coordinates, camera info, timestamps,
  compass direction)
  * 🛟 Save progress so you can safely resume if interrupted
- * 🗜️ Optionally convert to WebP to save space
+ * 🗜️ Convert to WebP by default to save ~70% disk space
  * 📦 Tar sequence directories for faster uploads
+ * 🏛️ Check Internet Archive to avoid duplicate downloads
+ * 💾 Stage downloads in cache, move atomically when complete

  ## WebP Conversion

- You'll need `cwebp` to use the `--webp` flag. So install it:
+ WebP conversion is **enabled by default** (saves ~70% disk space). You'll need the `cwebp` binary installed:

  ```bash
  # Debian/Ubuntu
@@ -57,6 +63,12 @@ sudo apt install webp
  brew install webp
  ```

+ To disable WebP conversion and keep original JPEGs, use `--no-webp`:
+
+ ```bash
+ mapillary-downloader --no-webp USERNAME
+ ```
+
  ## Sequence Tarball Creation

  By default, sequence directories are automatically tarred after download because
@@ -66,7 +78,7 @@ uploading files to IA.
  To keep individual files instead of creating tars, use the `--no-tar` flag:

  ```bash
- mapillary-downloader --username WHOEVER --no-tar
+ mapillary-downloader --no-tar USERNAME
  ```

  ## Internet Archive upload
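The "💾 Stage downloads in cache, move atomically when complete" bullet added to the README above boils down to the pattern sketched below. This is a minimal illustration with a hypothetical collection name and the default output directory, not the exact code; the real logic lands in `downloader.py` further down in this diff.

```python
# Sketch of the 0.4.0 staging layout (illustrative names; see downloader.py below).
import os
import shutil
from pathlib import Path

# XDG cache root, falling back to ~/.cache when XDG_CACHE_HOME is unset
cache_root = Path(os.environ.get("XDG_CACHE_HOME", Path.home() / ".cache")) / "mapillary_downloader"
collection = "mapillary-someuser-original-webp"        # hypothetical collection name

staging_dir = cache_root / collection                   # images land here while downloading
final_dir = Path("./mapillary_data") / collection       # moved here once the run completes

final_dir.parent.mkdir(parents=True, exist_ok=True)
shutil.move(str(staging_dir), str(final_dir))           # single move when everything is done
```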
@@ -1,7 +1,7 @@
  [project]
  name = "mapillary_downloader"
  description = "Download your Mapillary data before it's gone"
- version = "0.3.1"
+ version = "0.4.0"
  authors = [
  { name = "Gareth Davidson", email = "gaz@bitplane.net" }
  ]
@@ -3,6 +3,7 @@
  import argparse
  import os
  import sys
+ from importlib.metadata import version
  from mapillary_downloader.client import MapillaryClient
  from mapillary_downloader.downloader import MapillaryDownloader
  from mapillary_downloader.logging_config import setup_logging
@@ -15,12 +16,17 @@ def main():
  logger = setup_logging()

  parser = argparse.ArgumentParser(description="Download your Mapillary data before it's gone")
+ parser.add_argument(
+ "--version",
+ action="version",
+ version=f"%(prog)s {version('mapillary-downloader')}",
+ )
  parser.add_argument(
  "--token",
  default=os.environ.get("MAPILLARY_TOKEN"),
  help="Mapillary API access token (or set MAPILLARY_TOKEN env var)",
  )
- parser.add_argument("--username", required=True, help="Mapillary username")
+ parser.add_argument("usernames", nargs="+", help="Mapillary username(s) to download")
  parser.add_argument("--output", default="./mapillary_data", help="Output directory (default: ./mapillary_data)")
  parser.add_argument(
  "--quality",
@@ -30,9 +36,9 @@ def main():
  )
  parser.add_argument("--bbox", help="Bounding box: west,south,east,north")
  parser.add_argument(
- "--webp",
+ "--no-webp",
  action="store_true",
- help="Convert images to WebP format (saves ~70%% disk space, requires cwebp binary)",
+ help="Don't convert to WebP (WebP conversion is enabled by default, saves ~70%% disk space)",
  )
  parser.add_argument(
  "--workers",
@@ -45,6 +51,11 @@ def main():
  action="store_true",
  help="Don't tar sequence directories (keep individual files)",
  )
+ parser.add_argument(
+ "--no-check-ia",
+ action="store_true",
+ help="Don't check if collection exists on Internet Archive before downloading",
+ )

  args = parser.parse_args()

@@ -63,19 +74,41 @@ def main():
  logger.error("Error: bbox must be four comma-separated numbers")
  sys.exit(1)

- # Check for cwebp binary if WebP conversion is requested
- if args.webp:
+ # WebP is enabled by default, disabled with --no-webp
+ convert_webp = not args.no_webp
+
+ # Check for cwebp binary if WebP conversion is enabled
+ if convert_webp:
  if not check_cwebp_available():
- logger.error("Error: cwebp binary not found. Install webp package (e.g., apt install webp)")
+ logger.error(
+ "Error: cwebp binary not found. Install webp package (e.g., apt install webp) or use --no-webp"
+ )
  sys.exit(1)
  logger.info("WebP conversion enabled - images will be converted after download")

  try:
  client = MapillaryClient(args.token)
- downloader = MapillaryDownloader(
- client, args.output, args.username, args.quality, workers=args.workers, tar_sequences=not args.no_tar
- )
- downloader.download_user_data(bbox=bbox, convert_webp=args.webp)
+
+ # Process each username
+ for username in args.usernames:
+ logger.info("")
+ logger.info("=" * 60)
+ logger.info(f"Processing user: {username}")
+ logger.info("=" * 60)
+ logger.info("")
+
+ downloader = MapillaryDownloader(
+ client,
+ args.output,
+ username,
+ args.quality,
+ workers=args.workers,
+ tar_sequences=not args.no_tar,
+ convert_webp=convert_webp,
+ check_ia=not args.no_check_ia,
+ )
+ downloader.download_user_data(bbox=bbox, convert_webp=convert_webp)
+
  except KeyboardInterrupt:
  logger.info("\nInterrupted by user")
  sys.exit(1)
@@ -1,32 +1,65 @@
  """Main downloader logic."""

+ import gzip
  import json
  import logging
  import os
+ import shutil
  import time
  from pathlib import Path
  from concurrent.futures import ProcessPoolExecutor, as_completed
  from mapillary_downloader.utils import format_size, format_time
  from mapillary_downloader.ia_meta import generate_ia_metadata
+ from mapillary_downloader.ia_check import check_ia_exists
  from mapillary_downloader.worker import download_and_convert_image
  from mapillary_downloader.tar_sequences import tar_sequence_directories
+ from mapillary_downloader.logging_config import add_file_handler

  logger = logging.getLogger("mapillary_downloader")


+ def get_cache_dir():
+ """Get XDG cache directory for staging downloads.
+
+ Returns:
+ Path to cache directory for mapillary_downloader
+ """
+ xdg_cache = os.environ.get("XDG_CACHE_HOME")
+ if xdg_cache:
+ cache_dir = Path(xdg_cache)
+ else:
+ cache_dir = Path.home() / ".cache"
+
+ mapillary_cache = cache_dir / "mapillary_downloader"
+ mapillary_cache.mkdir(parents=True, exist_ok=True)
+ return mapillary_cache
+
+
  class MapillaryDownloader:
  """Handles downloading Mapillary data for a user."""

- def __init__(self, client, output_dir, username=None, quality=None, workers=None, tar_sequences=True):
+ def __init__(
+ self,
+ client,
+ output_dir,
+ username=None,
+ quality=None,
+ workers=None,
+ tar_sequences=True,
+ convert_webp=False,
+ check_ia=True,
+ ):
  """Initialize the downloader.

  Args:
  client: MapillaryClient instance
- output_dir: Base directory to save downloads
+ output_dir: Base directory to save downloads (final destination)
  username: Mapillary username (for collection directory)
  quality: Image quality (for collection directory)
  workers: Number of parallel workers (default: half of cpu_count)
  tar_sequences: Whether to tar sequence directories after download (default: True)
+ convert_webp: Whether to convert images to WebP (affects collection name)
+ check_ia: Whether to check if collection exists on Internet Archive (default: True)
  """
  self.client = client
  self.base_output_dir = Path(output_dir)
@@ -34,16 +67,39 @@ class MapillaryDownloader:
  self.quality = quality
  self.workers = workers if workers is not None else max(1, os.cpu_count() // 2)
  self.tar_sequences = tar_sequences
+ self.convert_webp = convert_webp
+ self.check_ia = check_ia

- # If username and quality provided, create collection directory
+ # Determine collection name
  if username and quality:
  collection_name = f"mapillary-{username}-{quality}"
- self.output_dir = self.base_output_dir / collection_name
+ if convert_webp:
+ collection_name += "-webp"
+ self.collection_name = collection_name
  else:
- self.output_dir = self.base_output_dir
+ self.collection_name = None

+ # Set up staging directory in cache
+ cache_dir = get_cache_dir()
+ if self.collection_name:
+ self.staging_dir = cache_dir / self.collection_name
+ self.final_dir = self.base_output_dir / self.collection_name
+ else:
+ self.staging_dir = cache_dir / "download"
+ self.final_dir = self.base_output_dir
+
+ # Work in staging directory during download
+ self.output_dir = self.staging_dir
  self.output_dir.mkdir(parents=True, exist_ok=True)

+ logger.info(f"Staging directory: {self.staging_dir}")
+ logger.info(f"Final destination: {self.final_dir}")
+
+ # Set up file logging for archival
+ log_file = self.output_dir / "download.log"
+ add_file_handler(log_file)
+ logger.info(f"Logging to: {log_file}")
+
  self.metadata_file = self.output_dir / "metadata.jsonl"
  self.progress_file = self.output_dir / "progress.json"
  self.downloaded = self._load_progress()
@@ -74,6 +130,18 @@ class MapillaryDownloader:
  if not self.username or not self.quality:
  raise ValueError("Username and quality must be provided during initialization")

+ # Check if collection already exists on Internet Archive
+ if self.check_ia and self.collection_name:
+ logger.info(f"Checking if {self.collection_name} exists on Internet Archive...")
+ if check_ia_exists(self.collection_name):
+ logger.info("Collection already exists on archive.org, skipping download")
+ return
+
+ # Check if collection already exists in final destination
+ if self.final_dir.exists():
+ logger.info(f"Collection already exists at {self.final_dir}, skipping download")
+ return
+
  quality_field = f"thumb_{self.quality}_url"

  logger.info(f"Downloading images for user: {self.username}")
@@ -168,9 +236,38 @@ class MapillaryDownloader:
  if self.tar_sequences:
  tar_sequence_directories(self.output_dir)

+ # Gzip metadata.jsonl to save space
+ if self.metadata_file.exists():
+ logger.info("Compressing metadata.jsonl...")
+ original_size = self.metadata_file.stat().st_size
+ gzipped_file = self.metadata_file.with_suffix(".jsonl.gz")
+
+ with open(self.metadata_file, "rb") as f_in:
+ with gzip.open(gzipped_file, "wb", compresslevel=9) as f_out:
+ shutil.copyfileobj(f_in, f_out)
+
+ compressed_size = gzipped_file.stat().st_size
+ self.metadata_file.unlink()
+
+ savings = 100 * (1 - compressed_size / original_size)
+ logger.info(
+ f"Compressed metadata: {format_size(original_size)} → {format_size(compressed_size)} "
+ f"({savings:.1f}% savings)"
+ )
+
  # Generate IA metadata
  generate_ia_metadata(self.output_dir)

+ # Move from staging to final destination
+ logger.info("Moving collection from staging to final destination...")
+ if self.final_dir.exists():
+ logger.warning(f"Destination already exists, removing: {self.final_dir}")
+ shutil.rmtree(self.final_dir)
+
+ self.final_dir.parent.mkdir(parents=True, exist_ok=True)
+ shutil.move(str(self.staging_dir), str(self.final_dir))
+ logger.info(f"Collection moved to: {self.final_dir}")
+
  def _download_images_parallel(self, images, convert_webp):
  """Download images in parallel using worker pool.

@@ -184,6 +281,7 @@ class MapillaryDownloader:
  downloaded_count = 0
  total_bytes = 0
  failed_count = 0
+ batch_start_time = time.time()

  with ProcessPoolExecutor(max_workers=self.workers) as executor:
  # Submit all tasks
@@ -209,7 +307,16 @@ class MapillaryDownloader:
  total_bytes += bytes_dl

  if downloaded_count % 10 == 0:
- logger.info(f"Downloaded: {downloaded_count}/{len(images)} ({format_size(total_bytes)})")
+ # Calculate ETA
+ elapsed = time.time() - batch_start_time
+ rate = downloaded_count / elapsed if elapsed > 0 else 0
+ remaining = len(images) - downloaded_count
+ eta_seconds = remaining / rate if rate > 0 else 0
+
+ logger.info(
+ f"Downloaded: {downloaded_count}/{len(images)} ({format_size(total_bytes)}) "
+ f"- ETA: {format_time(eta_seconds)}"
+ )
  self._save_progress()
  else:
  failed_count += 1
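Because `metadata.jsonl` is now gzipped at the end of a run, anything reading it afterwards has to go through `gzip`, as `ia_meta.py` does below. A minimal sketch of consuming the compressed metadata (the collection path is hypothetical):

```python
# Sketch: reading the gzipped metadata after a completed run (hypothetical path).
import gzip
import json
from pathlib import Path

metadata_file = Path("mapillary_data/mapillary-someuser-original-webp/metadata.jsonl.gz")

# Open in text mode ("rt") and parse one JSON object per non-empty line.
with gzip.open(metadata_file, "rt") as f:
    records = [json.loads(line) for line in f if line.strip()]

print(f"{len(records)} images in collection")
```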
@@ -0,0 +1,33 @@
+ """Check if collections exist on Internet Archive."""
+
+ import logging
+ import requests
+
+ logger = logging.getLogger("mapillary_downloader")
+
+
+ def check_ia_exists(collection_name):
+ """Check if a collection exists on Internet Archive.
+
+ Args:
+ collection_name: Name of the collection (e.g., mapillary-username-original-webp)
+
+ Returns:
+ Boolean indicating if the collection exists on IA
+ """
+ # IA identifier format
+ ia_url = f"https://archive.org/metadata/{collection_name}"
+
+ try:
+ response = requests.get(ia_url, timeout=10)
+ # If we get a 200, the item exists
+ if response.status_code == 200:
+ data = response.json()
+ # Check if it's a valid item (not just metadata for non-existent item)
+ if "metadata" in data and data.get("is_dark") is not True:
+ return True
+ return False
+ except requests.RequestException as e:
+ logger.warning(f"Failed to check IA for {collection_name}: {e}")
+ # On error, assume it doesn't exist (better to download than skip)
+ return False
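A minimal usage sketch of the new check (the collection identifier is hypothetical). Note that on a network error the function returns `False`, so a failed check falls back to downloading rather than skipping:

```python
# Minimal usage sketch of the new IA existence check (hypothetical identifier).
from mapillary_downloader.ia_check import check_ia_exists

collection = "mapillary-someuser-original-webp"

if check_ia_exists(collection):
    print(f"{collection} is already on archive.org - skipping download")
else:
    print(f"{collection} not found on archive.org - downloading")
```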
@@ -1,5 +1,6 @@
  """Internet Archive metadata generation for Mapillary collections."""

+ import gzip
  import json
  import logging
  import re
@@ -14,22 +15,22 @@ def parse_collection_name(directory):
  """Parse username and quality from directory name.

  Args:
- directory: Path to collection directory (e.g., mapillary-username-original)
+ directory: Path to collection directory (e.g., mapillary-username-original or mapillary-username-original-webp)

  Returns:
  Tuple of (username, quality) or (None, None) if parsing fails
  """
- match = re.match(r"mapillary-(.+)-(256|1024|2048|original)$", Path(directory).name)
+ match = re.match(r"mapillary-(.+)-(256|1024|2048|original)(?:-webp)?$", Path(directory).name)
  if match:
  return match.group(1), match.group(2)
  return None, None


  def get_date_range(metadata_file):
- """Get first and last captured_at dates from metadata.jsonl.
+ """Get first and last captured_at dates from metadata.jsonl.gz.

  Args:
- metadata_file: Path to metadata.jsonl file
+ metadata_file: Path to metadata.jsonl.gz file

  Returns:
  Tuple of (first_date, last_date) as ISO format strings, or (None, None)
@@ -38,7 +39,7 @@ def get_date_range(metadata_file):
  return None, None

  timestamps = []
- with open(metadata_file) as f:
+ with gzip.open(metadata_file, "rt") as f:
  for line in f:
  if line.strip():
  data = json.loads(line)
@@ -59,10 +60,10 @@ def get_date_range(metadata_file):


  def count_images(metadata_file):
- """Count number of images in metadata.jsonl.
+ """Count number of images in metadata.jsonl.gz.

  Args:
- metadata_file: Path to metadata.jsonl file
+ metadata_file: Path to metadata.jsonl.gz file

  Returns:
  Number of images
@@ -71,7 +72,7 @@ def count_images(metadata_file):
  return 0

  count = 0
- with open(metadata_file) as f:
+ with gzip.open(metadata_file, "rt") as f:
  for line in f:
  if line.strip():
  count += 1
@@ -112,9 +113,9 @@ def generate_ia_metadata(collection_dir):
  logger.error(f"Could not parse username/quality from directory: {collection_dir.name}")
  return False

- metadata_file = collection_dir / "metadata.jsonl"
+ metadata_file = collection_dir / "metadata.jsonl.gz"
  if not metadata_file.exists():
- logger.error(f"metadata.jsonl not found in {collection_dir}")
+ logger.error(f"metadata.jsonl.gz not found in {collection_dir}")
  return False

  logger.info(f"Generating IA metadata for {collection_dir.name}...")
@@ -135,7 +136,7 @@ def generate_ia_metadata(collection_dir):
  write_meta_tag(
  meta_dir,
  "title",
- f"Mapillary images by {username} ({quality} quality)",
+ f"Mapillary images by {username}",
  )

  description = (
@@ -60,3 +60,23 @@ def setup_logging(level=logging.INFO):
  logger.addHandler(handler)

  return logger
+
+
+ def add_file_handler(log_file, level=logging.INFO):
+ """Add a file handler to the logger for archival.
+
+ Args:
+ log_file: Path to log file
+ level: Logging level for file handler
+ """
+ # Use plain formatter for file (no colors)
+ formatter = logging.Formatter(fmt="%(asctime)s [%(levelname)s] %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
+
+ handler = logging.FileHandler(log_file, mode="a", encoding="utf-8")
+ handler.setFormatter(formatter)
+ handler.setLevel(level)
+
+ logger = logging.getLogger("mapillary_downloader")
+ logger.addHandler(handler)
+
+ return handler
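A short sketch of how the new handler pairs with `setup_logging`. Detaching the handler when a collection finishes is an assumed usage pattern for illustration, not something this diff shows the downloader doing itself:

```python
# Sketch: attach an archival log file for one collection, then detach it (assumed usage).
from mapillary_downloader.logging_config import setup_logging, add_file_handler

logger = setup_logging()
handler = add_file_handler("download.log")   # returned handler can be removed later

logger.info("this line goes to the console and to download.log")

logger.removeHandler(handler)                # detach once the collection is done
handler.close()
```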
@@ -1,8 +1,9 @@
  """Tar sequence directories for efficient Internet Archive uploads."""

  import logging
- import subprocess
+ import tarfile
  from pathlib import Path
+ from mapillary_downloader.utils import format_size

  logger = logging.getLogger("mapillary_downloader")

@@ -38,6 +39,7 @@ def tar_sequence_directories(collection_dir):

  tarred_count = 0
  total_files = 0
+ total_tar_bytes = 0

  for seq_dir in sequence_dirs:
  seq_name = seq_dir.name
@@ -58,22 +60,38 @@ def tar_sequence_directories(collection_dir):
  continue

  try:
- # Create uncompressed tar (WebP already compressed)
- # Use -C to change directory so paths in tar are relative
- # Use -- to prevent sequence IDs starting with - from being interpreted as options
- result = subprocess.run(
- ["tar", "-cf", str(tar_path), "-C", str(collection_dir), "--", seq_name],
- capture_output=True,
- text=True,
- timeout=300, # 5 minute timeout per tar
- )
-
- if result.returncode != 0:
- logger.error(f"Failed to tar {seq_name}: {result.stderr}")
+ # Create reproducible uncompressed tar (WebP already compressed)
+ # Sort files by name for deterministic ordering
+ files_to_tar = sorted([f for f in seq_dir.rglob("*") if f.is_file()], key=lambda x: x.name)
+
+ if not files_to_tar:
+ logger.warning(f"Skipping directory with no files: {seq_name}")
  continue

+ with tarfile.open(tar_path, "w") as tar:
+ for file_path in files_to_tar:
+ # Get path relative to collection_dir for tar archive
+ arcname = file_path.relative_to(collection_dir)
+
+ # Create TarInfo for reproducibility
+ tarinfo = tar.gettarinfo(str(file_path), arcname=str(arcname))
+
+ # Normalize for reproducibility across platforms
+ tarinfo.uid = 0
+ tarinfo.gid = 0
+ tarinfo.uname = ""
+ tarinfo.gname = ""
+ # mtime already set on file by worker, preserve it
+
+ # Add file to tar
+ with open(file_path, "rb") as f:
+ tar.addfile(tarinfo, f)
+
  # Verify tar was created and has size
  if tar_path.exists() and tar_path.stat().st_size > 0:
+ tar_size = tar_path.stat().st_size
+ total_tar_bytes += tar_size
+
  # Remove original directory
  for file in seq_dir.rglob("*"):
  if file.is_file():
@@ -99,14 +117,12 @@ def tar_sequence_directories(collection_dir):
  if tar_path.exists():
  tar_path.unlink()

- except subprocess.TimeoutExpired:
- logger.error(f"Timeout tarring {seq_name}")
- if tar_path.exists():
- tar_path.unlink()
  except Exception as e:
  logger.error(f"Error tarring {seq_name}: {e}")
  if tar_path.exists():
  tar_path.unlink()

- logger.info(f"Tarred {tarred_count} sequences ({total_files:,} files total)")
+ logger.info(
+ f"Tarred {tarred_count} sequences ({total_files:,} files, {format_size(total_tar_bytes)} total tar size)"
+ )
  return tarred_count, total_files
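The switch from shelling out to `tar` to Python's `tarfile`, with sorted members, zeroed ownership, and (per `worker.py` below) mtimes pinned to `captured_at`, aims at reproducible archives. A sketch of how one might spot-check that, with hypothetical paths; byte-identity also assumes file modes (umask) match between runs:

```python
# Sketch: compare tarballs from two runs over the same inputs (hypothetical paths).
import hashlib
from pathlib import Path

def sha256sum(path):
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        # Hash in 1 MiB chunks to keep memory use flat.
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest()

a = sha256sum(Path("run1") / "SEQUENCE_ID.tar")
b = sha256sum(Path("run2") / "SEQUENCE_ID.tar")
print("reproducible" if a == b else "differs")
```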
@@ -1,5 +1,6 @@
  """Worker process for parallel image download and conversion."""

+ import os
  import tempfile
  from pathlib import Path
  import requests
@@ -80,6 +81,12 @@ def download_and_convert_image(image_data, output_dir, quality, convert_webp, ac
  if not webp_path:
  return (image_id, bytes_downloaded, False, "WebP conversion failed")

+ # Set file mtime to captured_at timestamp for reproducibility
+ if "captured_at" in image_data:
+ # captured_at is in milliseconds, convert to seconds
+ mtime = image_data["captured_at"] / 1000
+ os.utime(final_path, (mtime, mtime))
+
  return (image_id, bytes_downloaded, True, None)

  except Exception as e: