mapillary-downloader 0.3.1__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/PKG-INFO +30 -18
- {mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/README.md +29 -17
- {mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/pyproject.toml +1 -1
- {mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/src/mapillary_downloader/__main__.py +43 -10
- {mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/src/mapillary_downloader/downloader.py +113 -6
- mapillary_downloader-0.4.0/src/mapillary_downloader/ia_check.py +33 -0
- {mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/src/mapillary_downloader/ia_meta.py +12 -11
- {mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/src/mapillary_downloader/logging_config.py +20 -0
- {mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/src/mapillary_downloader/tar_sequences.py +34 -18
- {mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/src/mapillary_downloader/worker.py +7 -0
- {mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/LICENSE.md +0 -0
- {mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/src/mapillary_downloader/__init__.py +0 -0
- {mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/src/mapillary_downloader/client.py +0 -0
- {mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/src/mapillary_downloader/exif_writer.py +0 -0
- {mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/src/mapillary_downloader/utils.py +0 -0
- {mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/src/mapillary_downloader/webp_converter.py +0 -0
{mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: mapillary_downloader
|
3
|
-
Version: 0.3.1
|
3
|
+
Version: 0.4.0
|
4
4
|
Summary: Download your Mapillary data before it's gone
|
5
5
|
Author-email: Gareth Davidson <gaz@bitplane.net>
|
6
6
|
Requires-Python: >=3.10
|
@@ -47,37 +47,43 @@ First, get your Mapillary API access token from
|
|
47
47
|
[the developer dashboard](https://www.mapillary.com/dashboard/developers)
|
48
48
|
|
49
49
|
```bash
|
50
|
-
# Set token via environment variable
|
50
|
+
# Set token via environment variable (recommended)
|
51
51
|
export MAPILLARY_TOKEN=YOUR_TOKEN
|
52
|
-
mapillary-downloader
|
52
|
+
mapillary-downloader USERNAME1 USERNAME2 USERNAME3
|
53
53
|
|
54
54
|
# Or pass token directly, and have it in your shell history 💩👀
|
55
|
-
mapillary-downloader --token YOUR_TOKEN
|
55
|
+
mapillary-downloader --token YOUR_TOKEN USERNAME1 USERNAME2
|
56
|
+
|
57
|
+
# Download to specific directory
|
58
|
+
mapillary-downloader --output ./downloads USERNAME1
|
56
59
|
```
|
57
60
|
|
58
|
-
| option
|
59
|
-
|
|
60
|
-
|
|
61
|
-
| `--token`
|
62
|
-
| `--output`
|
63
|
-
| `--quality`
|
64
|
-
| `--bbox`
|
65
|
-
| `--webp`
|
66
|
-
| `--workers`
|
67
|
-
| `--no-tar`
|
61
|
+
| option | because | default |
|
62
|
+
| --------------- | -------------------------------------------- | ------------------ |
|
63
|
+
| `usernames` | One or more Mapillary usernames | (required) |
|
64
|
+
| `--token` | Mapillary API token (or env var) | `$MAPILLARY_TOKEN` |
|
65
|
+
| `--output` | Output directory | `./mapillary_data` |
|
66
|
+
| `--quality` | 256, 1024, 2048 or original | `original` |
|
67
|
+
| `--bbox` | `west,south,east,north` | `None` |
|
68
|
+
| `--no-webp` | Don't convert to WebP | `False` |
|
69
|
+
| `--workers` | Number of parallel download workers | Half of CPU count |
|
70
|
+
| `--no-tar` | Don't tar sequence directories | `False` |
|
71
|
+
| `--no-check-ia` | Don't check if exists on Internet Archive | `False` |
|
68
72
|
|
69
73
|
The downloader will:
|
70
74
|
|
71
|
-
* 📷 Download
|
75
|
+
* 📷 Download multiple users' images organized by sequence
|
72
76
|
* 📜 Inject EXIF metadata (GPS coordinates, camera info, timestamps,
|
73
77
|
compass direction)
|
74
78
|
* 🛟 Save progress so you can safely resume if interrupted
|
75
|
-
* 🗜️
|
79
|
+
* 🗜️ Convert to WebP by default to save ~70% disk space
|
76
80
|
* 📦 Tar sequence directories for faster uploads
|
81
|
+
* 🏛️ Check Internet Archive to avoid duplicate downloads
|
82
|
+
* 💾 Stage downloads in cache, move atomically when complete
|
77
83
|
|
78
84
|
## WebP Conversion
|
79
85
|
|
80
|
-
You'll need `cwebp`
|
86
|
+
WebP conversion is **enabled by default** (saves ~70% disk space). You'll need the `cwebp` binary installed:
|
81
87
|
|
82
88
|
```bash
|
83
89
|
# Debian/Ubuntu
|
@@ -87,6 +93,12 @@ sudo apt install webp
|
|
87
93
|
brew install webp
|
88
94
|
```
|
89
95
|
|
96
|
+
To disable WebP conversion and keep original JPEGs, use `--no-webp`:
|
97
|
+
|
98
|
+
```bash
|
99
|
+
mapillary-downloader --no-webp USERNAME
|
100
|
+
```
|
101
|
+
|
90
102
|
## Sequence Tarball Creation
|
91
103
|
|
92
104
|
By default, sequence directories are automatically tarred after download because
|
@@ -96,7 +108,7 @@ uploading files to IA.
|
|
96
108
|
To keep individual files instead of creating tars, use the `--no-tar` flag:
|
97
109
|
|
98
110
|
```bash
|
99
|
-
mapillary-downloader --
|
111
|
+
mapillary-downloader --no-tar USERNAME
|
100
112
|
```
|
101
113
|
|
102
114
|
## Internet Archive upload
|
{mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/README.md
RENAMED
@@ -17,37 +17,43 @@ First, get your Mapillary API access token from
|
|
17
17
|
[the developer dashboard](https://www.mapillary.com/dashboard/developers)
|
18
18
|
|
19
19
|
```bash
|
20
|
-
# Set token via environment variable
|
20
|
+
# Set token via environment variable (recommended)
|
21
21
|
export MAPILLARY_TOKEN=YOUR_TOKEN
|
22
|
-
mapillary-downloader
|
22
|
+
mapillary-downloader USERNAME1 USERNAME2 USERNAME3
|
23
23
|
|
24
24
|
# Or pass token directly, and have it in your shell history 💩👀
|
25
|
-
mapillary-downloader --token YOUR_TOKEN
|
25
|
+
mapillary-downloader --token YOUR_TOKEN USERNAME1 USERNAME2
|
26
|
+
|
27
|
+
# Download to specific directory
|
28
|
+
mapillary-downloader --output ./downloads USERNAME1
|
26
29
|
```
|
27
30
|
|
28
|
-
| option
|
29
|
-
|
|
30
|
-
|
|
31
|
-
| `--token`
|
32
|
-
| `--output`
|
33
|
-
| `--quality`
|
34
|
-
| `--bbox`
|
35
|
-
| `--webp`
|
36
|
-
| `--workers`
|
37
|
-
| `--no-tar`
|
31
|
+
| option | because | default |
|
32
|
+
| --------------- | -------------------------------------------- | ------------------ |
|
33
|
+
| `usernames` | One or more Mapillary usernames | (required) |
|
34
|
+
| `--token` | Mapillary API token (or env var) | `$MAPILLARY_TOKEN` |
|
35
|
+
| `--output` | Output directory | `./mapillary_data` |
|
36
|
+
| `--quality` | 256, 1024, 2048 or original | `original` |
|
37
|
+
| `--bbox` | `west,south,east,north` | `None` |
|
38
|
+
| `--no-webp` | Don't convert to WebP | `False` |
|
39
|
+
| `--workers` | Number of parallel download workers | Half of CPU count |
|
40
|
+
| `--no-tar` | Don't tar sequence directories | `False` |
|
41
|
+
| `--no-check-ia` | Don't check if exists on Internet Archive | `False` |
|
38
42
|
|
39
43
|
The downloader will:
|
40
44
|
|
41
|
-
* 📷 Download
|
45
|
+
* 📷 Download multiple users' images organized by sequence
|
42
46
|
* 📜 Inject EXIF metadata (GPS coordinates, camera info, timestamps,
|
43
47
|
compass direction)
|
44
48
|
* 🛟 Save progress so you can safely resume if interrupted
|
45
|
-
* 🗜️
|
49
|
+
* 🗜️ Convert to WebP by default to save ~70% disk space
|
46
50
|
* 📦 Tar sequence directories for faster uploads
|
51
|
+
* 🏛️ Check Internet Archive to avoid duplicate downloads
|
52
|
+
* 💾 Stage downloads in cache, move atomically when complete
|
47
53
|
|
48
54
|
## WebP Conversion
|
49
55
|
|
50
|
-
You'll need `cwebp`
|
56
|
+
WebP conversion is **enabled by default** (saves ~70% disk space). You'll need the `cwebp` binary installed:
|
51
57
|
|
52
58
|
```bash
|
53
59
|
# Debian/Ubuntu
|
@@ -57,6 +63,12 @@ sudo apt install webp
|
|
57
63
|
brew install webp
|
58
64
|
```
|
59
65
|
|
66
|
+
To disable WebP conversion and keep original JPEGs, use `--no-webp`:
|
67
|
+
|
68
|
+
```bash
|
69
|
+
mapillary-downloader --no-webp USERNAME
|
70
|
+
```
|
71
|
+
|
60
72
|
## Sequence Tarball Creation
|
61
73
|
|
62
74
|
By default, sequence directories are automatically tarred after download because
|
@@ -66,7 +78,7 @@ uploading files to IA.
|
|
66
78
|
To keep individual files instead of creating tars, use the `--no-tar` flag:
|
67
79
|
|
68
80
|
```bash
|
69
|
-
mapillary-downloader --
|
81
|
+
mapillary-downloader --no-tar USERNAME
|
70
82
|
```
|
71
83
|
|
72
84
|
## Internet Archive upload
|
{mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/src/mapillary_downloader/__main__.py
RENAMED
@@ -3,6 +3,7 @@
|
|
3
3
|
import argparse
|
4
4
|
import os
|
5
5
|
import sys
|
6
|
+
from importlib.metadata import version
|
6
7
|
from mapillary_downloader.client import MapillaryClient
|
7
8
|
from mapillary_downloader.downloader import MapillaryDownloader
|
8
9
|
from mapillary_downloader.logging_config import setup_logging
|
@@ -15,12 +16,17 @@ def main():
|
|
15
16
|
logger = setup_logging()
|
16
17
|
|
17
18
|
parser = argparse.ArgumentParser(description="Download your Mapillary data before it's gone")
|
19
|
+
parser.add_argument(
|
20
|
+
"--version",
|
21
|
+
action="version",
|
22
|
+
version=f"%(prog)s {version('mapillary-downloader')}",
|
23
|
+
)
|
18
24
|
parser.add_argument(
|
19
25
|
"--token",
|
20
26
|
default=os.environ.get("MAPILLARY_TOKEN"),
|
21
27
|
help="Mapillary API access token (or set MAPILLARY_TOKEN env var)",
|
22
28
|
)
|
23
|
-
parser.add_argument("
|
29
|
+
parser.add_argument("usernames", nargs="+", help="Mapillary username(s) to download")
|
24
30
|
parser.add_argument("--output", default="./mapillary_data", help="Output directory (default: ./mapillary_data)")
|
25
31
|
parser.add_argument(
|
26
32
|
"--quality",
|
@@ -30,9 +36,9 @@ def main():
|
|
30
36
|
)
|
31
37
|
parser.add_argument("--bbox", help="Bounding box: west,south,east,north")
|
32
38
|
parser.add_argument(
|
33
|
-
"--webp",
|
39
|
+
"--no-webp",
|
34
40
|
action="store_true",
|
35
|
-
help="
|
41
|
+
help="Don't convert to WebP (WebP conversion is enabled by default, saves ~70%% disk space)",
|
36
42
|
)
|
37
43
|
parser.add_argument(
|
38
44
|
"--workers",
|
@@ -45,6 +51,11 @@ def main():
|
|
45
51
|
action="store_true",
|
46
52
|
help="Don't tar sequence directories (keep individual files)",
|
47
53
|
)
|
54
|
+
parser.add_argument(
|
55
|
+
"--no-check-ia",
|
56
|
+
action="store_true",
|
57
|
+
help="Don't check if collection exists on Internet Archive before downloading",
|
58
|
+
)
|
48
59
|
|
49
60
|
args = parser.parse_args()
|
50
61
|
|
@@ -63,19 +74,41 @@ def main():
|
|
63
74
|
logger.error("Error: bbox must be four comma-separated numbers")
|
64
75
|
sys.exit(1)
|
65
76
|
|
66
|
-
#
|
67
|
-
|
77
|
+
# WebP is enabled by default, disabled with --no-webp
|
78
|
+
convert_webp = not args.no_webp
|
79
|
+
|
80
|
+
# Check for cwebp binary if WebP conversion is enabled
|
81
|
+
if convert_webp:
|
68
82
|
if not check_cwebp_available():
|
69
|
-
logger.error(
|
83
|
+
logger.error(
|
84
|
+
"Error: cwebp binary not found. Install webp package (e.g., apt install webp) or use --no-webp"
|
85
|
+
)
|
70
86
|
sys.exit(1)
|
71
87
|
logger.info("WebP conversion enabled - images will be converted after download")
|
72
88
|
|
73
89
|
try:
|
74
90
|
client = MapillaryClient(args.token)
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
91
|
+
|
92
|
+
# Process each username
|
93
|
+
for username in args.usernames:
|
94
|
+
logger.info("")
|
95
|
+
logger.info("=" * 60)
|
96
|
+
logger.info(f"Processing user: {username}")
|
97
|
+
logger.info("=" * 60)
|
98
|
+
logger.info("")
|
99
|
+
|
100
|
+
downloader = MapillaryDownloader(
|
101
|
+
client,
|
102
|
+
args.output,
|
103
|
+
username,
|
104
|
+
args.quality,
|
105
|
+
workers=args.workers,
|
106
|
+
tar_sequences=not args.no_tar,
|
107
|
+
convert_webp=convert_webp,
|
108
|
+
check_ia=not args.no_check_ia,
|
109
|
+
)
|
110
|
+
downloader.download_user_data(bbox=bbox, convert_webp=convert_webp)
|
111
|
+
|
79
112
|
except KeyboardInterrupt:
|
80
113
|
logger.info("\nInterrupted by user")
|
81
114
|
sys.exit(1)
|
{mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/src/mapillary_downloader/downloader.py
RENAMED
@@ -1,32 +1,65 @@
|
|
1
1
|
"""Main downloader logic."""
|
2
2
|
|
3
|
+
import gzip
|
3
4
|
import json
|
4
5
|
import logging
|
5
6
|
import os
|
7
|
+
import shutil
|
6
8
|
import time
|
7
9
|
from pathlib import Path
|
8
10
|
from concurrent.futures import ProcessPoolExecutor, as_completed
|
9
11
|
from mapillary_downloader.utils import format_size, format_time
|
10
12
|
from mapillary_downloader.ia_meta import generate_ia_metadata
|
13
|
+
from mapillary_downloader.ia_check import check_ia_exists
|
11
14
|
from mapillary_downloader.worker import download_and_convert_image
|
12
15
|
from mapillary_downloader.tar_sequences import tar_sequence_directories
|
16
|
+
from mapillary_downloader.logging_config import add_file_handler
|
13
17
|
|
14
18
|
logger = logging.getLogger("mapillary_downloader")
|
15
19
|
|
16
20
|
|
21
|
+
def get_cache_dir():
|
22
|
+
"""Get XDG cache directory for staging downloads.
|
23
|
+
|
24
|
+
Returns:
|
25
|
+
Path to cache directory for mapillary_downloader
|
26
|
+
"""
|
27
|
+
xdg_cache = os.environ.get("XDG_CACHE_HOME")
|
28
|
+
if xdg_cache:
|
29
|
+
cache_dir = Path(xdg_cache)
|
30
|
+
else:
|
31
|
+
cache_dir = Path.home() / ".cache"
|
32
|
+
|
33
|
+
mapillary_cache = cache_dir / "mapillary_downloader"
|
34
|
+
mapillary_cache.mkdir(parents=True, exist_ok=True)
|
35
|
+
return mapillary_cache
|
36
|
+
|
37
|
+
|
17
38
|
class MapillaryDownloader:
|
18
39
|
"""Handles downloading Mapillary data for a user."""
|
19
40
|
|
20
|
-
def __init__(
|
41
|
+
def __init__(
|
42
|
+
self,
|
43
|
+
client,
|
44
|
+
output_dir,
|
45
|
+
username=None,
|
46
|
+
quality=None,
|
47
|
+
workers=None,
|
48
|
+
tar_sequences=True,
|
49
|
+
convert_webp=False,
|
50
|
+
check_ia=True,
|
51
|
+
):
|
21
52
|
"""Initialize the downloader.
|
22
53
|
|
23
54
|
Args:
|
24
55
|
client: MapillaryClient instance
|
25
|
-
output_dir: Base directory to save downloads
|
56
|
+
output_dir: Base directory to save downloads (final destination)
|
26
57
|
username: Mapillary username (for collection directory)
|
27
58
|
quality: Image quality (for collection directory)
|
28
59
|
workers: Number of parallel workers (default: half of cpu_count)
|
29
60
|
tar_sequences: Whether to tar sequence directories after download (default: True)
|
61
|
+
convert_webp: Whether to convert images to WebP (affects collection name)
|
62
|
+
check_ia: Whether to check if collection exists on Internet Archive (default: True)
|
30
63
|
"""
|
31
64
|
self.client = client
|
32
65
|
self.base_output_dir = Path(output_dir)
|
@@ -34,16 +67,39 @@ class MapillaryDownloader:
|
|
34
67
|
self.quality = quality
|
35
68
|
self.workers = workers if workers is not None else max(1, os.cpu_count() // 2)
|
36
69
|
self.tar_sequences = tar_sequences
|
70
|
+
self.convert_webp = convert_webp
|
71
|
+
self.check_ia = check_ia
|
37
72
|
|
38
|
-
#
|
73
|
+
# Determine collection name
|
39
74
|
if username and quality:
|
40
75
|
collection_name = f"mapillary-{username}-{quality}"
|
41
|
-
|
76
|
+
if convert_webp:
|
77
|
+
collection_name += "-webp"
|
78
|
+
self.collection_name = collection_name
|
42
79
|
else:
|
43
|
-
self.
|
80
|
+
self.collection_name = None
|
44
81
|
|
82
|
+
# Set up staging directory in cache
|
83
|
+
cache_dir = get_cache_dir()
|
84
|
+
if self.collection_name:
|
85
|
+
self.staging_dir = cache_dir / self.collection_name
|
86
|
+
self.final_dir = self.base_output_dir / self.collection_name
|
87
|
+
else:
|
88
|
+
self.staging_dir = cache_dir / "download"
|
89
|
+
self.final_dir = self.base_output_dir
|
90
|
+
|
91
|
+
# Work in staging directory during download
|
92
|
+
self.output_dir = self.staging_dir
|
45
93
|
self.output_dir.mkdir(parents=True, exist_ok=True)
|
46
94
|
|
95
|
+
logger.info(f"Staging directory: {self.staging_dir}")
|
96
|
+
logger.info(f"Final destination: {self.final_dir}")
|
97
|
+
|
98
|
+
# Set up file logging for archival
|
99
|
+
log_file = self.output_dir / "download.log"
|
100
|
+
add_file_handler(log_file)
|
101
|
+
logger.info(f"Logging to: {log_file}")
|
102
|
+
|
47
103
|
self.metadata_file = self.output_dir / "metadata.jsonl"
|
48
104
|
self.progress_file = self.output_dir / "progress.json"
|
49
105
|
self.downloaded = self._load_progress()
|
@@ -74,6 +130,18 @@ class MapillaryDownloader:
|
|
74
130
|
if not self.username or not self.quality:
|
75
131
|
raise ValueError("Username and quality must be provided during initialization")
|
76
132
|
|
133
|
+
# Check if collection already exists on Internet Archive
|
134
|
+
if self.check_ia and self.collection_name:
|
135
|
+
logger.info(f"Checking if {self.collection_name} exists on Internet Archive...")
|
136
|
+
if check_ia_exists(self.collection_name):
|
137
|
+
logger.info("Collection already exists on archive.org, skipping download")
|
138
|
+
return
|
139
|
+
|
140
|
+
# Check if collection already exists in final destination
|
141
|
+
if self.final_dir.exists():
|
142
|
+
logger.info(f"Collection already exists at {self.final_dir}, skipping download")
|
143
|
+
return
|
144
|
+
|
77
145
|
quality_field = f"thumb_{self.quality}_url"
|
78
146
|
|
79
147
|
logger.info(f"Downloading images for user: {self.username}")
|
@@ -168,9 +236,38 @@ class MapillaryDownloader:
|
|
168
236
|
if self.tar_sequences:
|
169
237
|
tar_sequence_directories(self.output_dir)
|
170
238
|
|
239
|
+
# Gzip metadata.jsonl to save space
|
240
|
+
if self.metadata_file.exists():
|
241
|
+
logger.info("Compressing metadata.jsonl...")
|
242
|
+
original_size = self.metadata_file.stat().st_size
|
243
|
+
gzipped_file = self.metadata_file.with_suffix(".jsonl.gz")
|
244
|
+
|
245
|
+
with open(self.metadata_file, "rb") as f_in:
|
246
|
+
with gzip.open(gzipped_file, "wb", compresslevel=9) as f_out:
|
247
|
+
shutil.copyfileobj(f_in, f_out)
|
248
|
+
|
249
|
+
compressed_size = gzipped_file.stat().st_size
|
250
|
+
self.metadata_file.unlink()
|
251
|
+
|
252
|
+
savings = 100 * (1 - compressed_size / original_size)
|
253
|
+
logger.info(
|
254
|
+
f"Compressed metadata: {format_size(original_size)} → {format_size(compressed_size)} "
|
255
|
+
f"({savings:.1f}% savings)"
|
256
|
+
)
|
257
|
+
|
171
258
|
# Generate IA metadata
|
172
259
|
generate_ia_metadata(self.output_dir)
|
173
260
|
|
261
|
+
# Move from staging to final destination
|
262
|
+
logger.info("Moving collection from staging to final destination...")
|
263
|
+
if self.final_dir.exists():
|
264
|
+
logger.warning(f"Destination already exists, removing: {self.final_dir}")
|
265
|
+
shutil.rmtree(self.final_dir)
|
266
|
+
|
267
|
+
self.final_dir.parent.mkdir(parents=True, exist_ok=True)
|
268
|
+
shutil.move(str(self.staging_dir), str(self.final_dir))
|
269
|
+
logger.info(f"Collection moved to: {self.final_dir}")
|
270
|
+
|
174
271
|
def _download_images_parallel(self, images, convert_webp):
|
175
272
|
"""Download images in parallel using worker pool.
|
176
273
|
|
@@ -184,6 +281,7 @@ class MapillaryDownloader:
|
|
184
281
|
downloaded_count = 0
|
185
282
|
total_bytes = 0
|
186
283
|
failed_count = 0
|
284
|
+
batch_start_time = time.time()
|
187
285
|
|
188
286
|
with ProcessPoolExecutor(max_workers=self.workers) as executor:
|
189
287
|
# Submit all tasks
|
@@ -209,7 +307,16 @@ class MapillaryDownloader:
|
|
209
307
|
total_bytes += bytes_dl
|
210
308
|
|
211
309
|
if downloaded_count % 10 == 0:
|
212
|
-
|
310
|
+
# Calculate ETA
|
311
|
+
elapsed = time.time() - batch_start_time
|
312
|
+
rate = downloaded_count / elapsed if elapsed > 0 else 0
|
313
|
+
remaining = len(images) - downloaded_count
|
314
|
+
eta_seconds = remaining / rate if rate > 0 else 0
|
315
|
+
|
316
|
+
logger.info(
|
317
|
+
f"Downloaded: {downloaded_count}/{len(images)} ({format_size(total_bytes)}) "
|
318
|
+
f"- ETA: {format_time(eta_seconds)}"
|
319
|
+
)
|
213
320
|
self._save_progress()
|
214
321
|
else:
|
215
322
|
failed_count += 1
|
mapillary_downloader-0.4.0/src/mapillary_downloader/ia_check.py
@@ -0,0 +1,33 @@
|
|
1
|
+
"""Check if collections exist on Internet Archive."""
|
2
|
+
|
3
|
+
import logging
|
4
|
+
import requests
|
5
|
+
|
6
|
+
logger = logging.getLogger("mapillary_downloader")
|
7
|
+
|
8
|
+
|
9
|
+
def check_ia_exists(collection_name):
|
10
|
+
"""Check if a collection exists on Internet Archive.
|
11
|
+
|
12
|
+
Args:
|
13
|
+
collection_name: Name of the collection (e.g., mapillary-username-original-webp)
|
14
|
+
|
15
|
+
Returns:
|
16
|
+
Boolean indicating if the collection exists on IA
|
17
|
+
"""
|
18
|
+
# IA identifier format
|
19
|
+
ia_url = f"https://archive.org/metadata/{collection_name}"
|
20
|
+
|
21
|
+
try:
|
22
|
+
response = requests.get(ia_url, timeout=10)
|
23
|
+
# If we get a 200, the item exists
|
24
|
+
if response.status_code == 200:
|
25
|
+
data = response.json()
|
26
|
+
# Check if it's a valid item (not just metadata for non-existent item)
|
27
|
+
if "metadata" in data and data.get("is_dark") is not True:
|
28
|
+
return True
|
29
|
+
return False
|
30
|
+
except requests.RequestException as e:
|
31
|
+
logger.warning(f"Failed to check IA for {collection_name}: {e}")
|
32
|
+
# On error, assume it doesn't exist (better to download than skip)
|
33
|
+
return False
|
{mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/src/mapillary_downloader/ia_meta.py
RENAMED
@@ -1,5 +1,6 @@
|
|
1
1
|
"""Internet Archive metadata generation for Mapillary collections."""
|
2
2
|
|
3
|
+
import gzip
|
3
4
|
import json
|
4
5
|
import logging
|
5
6
|
import re
|
@@ -14,22 +15,22 @@ def parse_collection_name(directory):
|
|
14
15
|
"""Parse username and quality from directory name.
|
15
16
|
|
16
17
|
Args:
|
17
|
-
directory: Path to collection directory (e.g., mapillary-username-original)
|
18
|
+
directory: Path to collection directory (e.g., mapillary-username-original or mapillary-username-original-webp)
|
18
19
|
|
19
20
|
Returns:
|
20
21
|
Tuple of (username, quality) or (None, None) if parsing fails
|
21
22
|
"""
|
22
|
-
match = re.match(r"mapillary-(.+)-(256|1024|2048|original)
|
23
|
+
match = re.match(r"mapillary-(.+)-(256|1024|2048|original)(?:-webp)?$", Path(directory).name)
|
23
24
|
if match:
|
24
25
|
return match.group(1), match.group(2)
|
25
26
|
return None, None
|
26
27
|
|
27
28
|
|
28
29
|
def get_date_range(metadata_file):
|
29
|
-
"""Get first and last captured_at dates from metadata.jsonl.
|
30
|
+
"""Get first and last captured_at dates from metadata.jsonl.gz.
|
30
31
|
|
31
32
|
Args:
|
32
|
-
metadata_file: Path to metadata.jsonl file
|
33
|
+
metadata_file: Path to metadata.jsonl.gz file
|
33
34
|
|
34
35
|
Returns:
|
35
36
|
Tuple of (first_date, last_date) as ISO format strings, or (None, None)
|
@@ -38,7 +39,7 @@ def get_date_range(metadata_file):
|
|
38
39
|
return None, None
|
39
40
|
|
40
41
|
timestamps = []
|
41
|
-
with open(metadata_file) as f:
|
42
|
+
with gzip.open(metadata_file, "rt") as f:
|
42
43
|
for line in f:
|
43
44
|
if line.strip():
|
44
45
|
data = json.loads(line)
|
@@ -59,10 +60,10 @@ def get_date_range(metadata_file):
|
|
59
60
|
|
60
61
|
|
61
62
|
def count_images(metadata_file):
|
62
|
-
"""Count number of images in metadata.jsonl.
|
63
|
+
"""Count number of images in metadata.jsonl.gz.
|
63
64
|
|
64
65
|
Args:
|
65
|
-
metadata_file: Path to metadata.jsonl file
|
66
|
+
metadata_file: Path to metadata.jsonl.gz file
|
66
67
|
|
67
68
|
Returns:
|
68
69
|
Number of images
|
@@ -71,7 +72,7 @@ def count_images(metadata_file):
|
|
71
72
|
return 0
|
72
73
|
|
73
74
|
count = 0
|
74
|
-
with open(metadata_file) as f:
|
75
|
+
with gzip.open(metadata_file, "rt") as f:
|
75
76
|
for line in f:
|
76
77
|
if line.strip():
|
77
78
|
count += 1
|
@@ -112,9 +113,9 @@ def generate_ia_metadata(collection_dir):
|
|
112
113
|
logger.error(f"Could not parse username/quality from directory: {collection_dir.name}")
|
113
114
|
return False
|
114
115
|
|
115
|
-
metadata_file = collection_dir / "metadata.jsonl"
|
116
|
+
metadata_file = collection_dir / "metadata.jsonl.gz"
|
116
117
|
if not metadata_file.exists():
|
117
|
-
logger.error(f"metadata.jsonl not found in {collection_dir}")
|
118
|
+
logger.error(f"metadata.jsonl.gz not found in {collection_dir}")
|
118
119
|
return False
|
119
120
|
|
120
121
|
logger.info(f"Generating IA metadata for {collection_dir.name}...")
|
@@ -135,7 +136,7 @@ def generate_ia_metadata(collection_dir):
|
|
135
136
|
write_meta_tag(
|
136
137
|
meta_dir,
|
137
138
|
"title",
|
138
|
-
f"Mapillary images by {username}
|
139
|
+
f"Mapillary images by {username}",
|
139
140
|
)
|
140
141
|
|
141
142
|
description = (
|
{mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/src/mapillary_downloader/logging_config.py
RENAMED
@@ -60,3 +60,23 @@ def setup_logging(level=logging.INFO):
|
|
60
60
|
logger.addHandler(handler)
|
61
61
|
|
62
62
|
return logger
|
63
|
+
|
64
|
+
|
65
|
+
def add_file_handler(log_file, level=logging.INFO):
|
66
|
+
"""Add a file handler to the logger for archival.
|
67
|
+
|
68
|
+
Args:
|
69
|
+
log_file: Path to log file
|
70
|
+
level: Logging level for file handler
|
71
|
+
"""
|
72
|
+
# Use plain formatter for file (no colors)
|
73
|
+
formatter = logging.Formatter(fmt="%(asctime)s [%(levelname)s] %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
|
74
|
+
|
75
|
+
handler = logging.FileHandler(log_file, mode="a", encoding="utf-8")
|
76
|
+
handler.setFormatter(formatter)
|
77
|
+
handler.setLevel(level)
|
78
|
+
|
79
|
+
logger = logging.getLogger("mapillary_downloader")
|
80
|
+
logger.addHandler(handler)
|
81
|
+
|
82
|
+
return handler
|
{mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/src/mapillary_downloader/tar_sequences.py
RENAMED
@@ -1,8 +1,9 @@
|
|
1
1
|
"""Tar sequence directories for efficient Internet Archive uploads."""
|
2
2
|
|
3
3
|
import logging
|
4
|
-
import subprocess
|
4
|
+
import tarfile
|
5
5
|
from pathlib import Path
|
6
|
+
from mapillary_downloader.utils import format_size
|
6
7
|
|
7
8
|
logger = logging.getLogger("mapillary_downloader")
|
8
9
|
|
@@ -38,6 +39,7 @@ def tar_sequence_directories(collection_dir):
|
|
38
39
|
|
39
40
|
tarred_count = 0
|
40
41
|
total_files = 0
|
42
|
+
total_tar_bytes = 0
|
41
43
|
|
42
44
|
for seq_dir in sequence_dirs:
|
43
45
|
seq_name = seq_dir.name
|
@@ -58,22 +60,38 @@ def tar_sequence_directories(collection_dir):
|
|
58
60
|
continue
|
59
61
|
|
60
62
|
try:
|
61
|
-
# Create uncompressed tar (WebP already compressed)
|
62
|
-
#
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
text=True,
|
68
|
-
timeout=300, # 5 minute timeout per tar
|
69
|
-
)
|
70
|
-
|
71
|
-
if result.returncode != 0:
|
72
|
-
logger.error(f"Failed to tar {seq_name}: {result.stderr}")
|
63
|
+
# Create reproducible uncompressed tar (WebP already compressed)
|
64
|
+
# Sort files by name for deterministic ordering
|
65
|
+
files_to_tar = sorted([f for f in seq_dir.rglob("*") if f.is_file()], key=lambda x: x.name)
|
66
|
+
|
67
|
+
if not files_to_tar:
|
68
|
+
logger.warning(f"Skipping directory with no files: {seq_name}")
|
73
69
|
continue
|
74
70
|
|
71
|
+
with tarfile.open(tar_path, "w") as tar:
|
72
|
+
for file_path in files_to_tar:
|
73
|
+
# Get path relative to collection_dir for tar archive
|
74
|
+
arcname = file_path.relative_to(collection_dir)
|
75
|
+
|
76
|
+
# Create TarInfo for reproducibility
|
77
|
+
tarinfo = tar.gettarinfo(str(file_path), arcname=str(arcname))
|
78
|
+
|
79
|
+
# Normalize for reproducibility across platforms
|
80
|
+
tarinfo.uid = 0
|
81
|
+
tarinfo.gid = 0
|
82
|
+
tarinfo.uname = ""
|
83
|
+
tarinfo.gname = ""
|
84
|
+
# mtime already set on file by worker, preserve it
|
85
|
+
|
86
|
+
# Add file to tar
|
87
|
+
with open(file_path, "rb") as f:
|
88
|
+
tar.addfile(tarinfo, f)
|
89
|
+
|
75
90
|
# Verify tar was created and has size
|
76
91
|
if tar_path.exists() and tar_path.stat().st_size > 0:
|
92
|
+
tar_size = tar_path.stat().st_size
|
93
|
+
total_tar_bytes += tar_size
|
94
|
+
|
77
95
|
# Remove original directory
|
78
96
|
for file in seq_dir.rglob("*"):
|
79
97
|
if file.is_file():
|
@@ -99,14 +117,12 @@ def tar_sequence_directories(collection_dir):
|
|
99
117
|
if tar_path.exists():
|
100
118
|
tar_path.unlink()
|
101
119
|
|
102
|
-
except subprocess.TimeoutExpired:
|
103
|
-
logger.error(f"Timeout tarring {seq_name}")
|
104
|
-
if tar_path.exists():
|
105
|
-
tar_path.unlink()
|
106
120
|
except Exception as e:
|
107
121
|
logger.error(f"Error tarring {seq_name}: {e}")
|
108
122
|
if tar_path.exists():
|
109
123
|
tar_path.unlink()
|
110
124
|
|
111
|
-
logger.info(
|
125
|
+
logger.info(
|
126
|
+
f"Tarred {tarred_count} sequences ({total_files:,} files, {format_size(total_tar_bytes)} total tar size)"
|
127
|
+
)
|
112
128
|
return tarred_count, total_files
|
{mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/src/mapillary_downloader/worker.py
RENAMED
@@ -1,5 +1,6 @@
|
|
1
1
|
"""Worker process for parallel image download and conversion."""
|
2
2
|
|
3
|
+
import os
|
3
4
|
import tempfile
|
4
5
|
from pathlib import Path
|
5
6
|
import requests
|
@@ -80,6 +81,12 @@ def download_and_convert_image(image_data, output_dir, quality, convert_webp, ac
|
|
80
81
|
if not webp_path:
|
81
82
|
return (image_id, bytes_downloaded, False, "WebP conversion failed")
|
82
83
|
|
84
|
+
# Set file mtime to captured_at timestamp for reproducibility
|
85
|
+
if "captured_at" in image_data:
|
86
|
+
# captured_at is in milliseconds, convert to seconds
|
87
|
+
mtime = image_data["captured_at"] / 1000
|
88
|
+
os.utime(final_path, (mtime, mtime))
|
89
|
+
|
83
90
|
return (image_id, bytes_downloaded, True, None)
|
84
91
|
|
85
92
|
except Exception as e:
|
{mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/LICENSE.md
RENAMED
File without changes
|
{mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/src/mapillary_downloader/__init__.py
RENAMED
File without changes
|
{mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/src/mapillary_downloader/client.py
RENAMED
File without changes
|
{mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/src/mapillary_downloader/exif_writer.py
RENAMED
File without changes
|
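{mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/src/mapillary_downloader/utils.py
RENAMED
File without changes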
|
{mapillary_downloader-0.3.1 → mapillary_downloader-0.4.0}/src/mapillary_downloader/webp_converter.py
RENAMED
File without changes
|