mapillary-downloader 0.5.2__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- mapillary_downloader/tar_sequences.py +24 -33
- mapillary_downloader/worker.py +4 -2
- {mapillary_downloader-0.5.2.dist-info → mapillary_downloader-0.6.0.dist-info}/METADATA +19 -7
- {mapillary_downloader-0.5.2.dist-info → mapillary_downloader-0.6.0.dist-info}/RECORD +7 -7
- {mapillary_downloader-0.5.2.dist-info → mapillary_downloader-0.6.0.dist-info}/WHEEL +0 -0
- {mapillary_downloader-0.5.2.dist-info → mapillary_downloader-0.6.0.dist-info}/entry_points.txt +0 -0
- {mapillary_downloader-0.5.2.dist-info → mapillary_downloader-0.6.0.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -23,51 +23,43 @@ def tar_sequence_directories(collection_dir):
|
|
|
23
23
|
logger.error(f"Collection directory not found: {collection_dir}")
|
|
24
24
|
return 0, 0
|
|
25
25
|
|
|
26
|
-
# Find all
|
|
26
|
+
# Find all bucket directories (skip special dirs)
|
|
27
|
+
# Now we tar entire bucket dirs (e.g., a/, b/, etc) to get ~62 tar files
|
|
27
28
|
skip_dirs = {".meta", "__pycache__"}
|
|
28
|
-
|
|
29
|
+
bucket_dirs = []
|
|
29
30
|
|
|
30
31
|
for item in collection_dir.iterdir():
|
|
31
32
|
if item.is_dir() and item.name not in skip_dirs:
|
|
32
|
-
|
|
33
|
+
# Check if this is a bucket dir (single char)
|
|
34
|
+
if len(item.name) == 1:
|
|
35
|
+
bucket_dirs.append(item)
|
|
33
36
|
|
|
34
|
-
if not
|
|
35
|
-
logger.info("No
|
|
37
|
+
if not bucket_dirs:
|
|
38
|
+
logger.info("No bucket directories to tar")
|
|
36
39
|
return 0, 0
|
|
37
40
|
|
|
38
|
-
logger.info(f"Tarring {len(
|
|
41
|
+
logger.info(f"Tarring {len(bucket_dirs)} bucket directories...")
|
|
39
42
|
|
|
40
43
|
tarred_count = 0
|
|
41
44
|
total_files = 0
|
|
42
45
|
total_tar_bytes = 0
|
|
43
46
|
|
|
44
|
-
for
|
|
45
|
-
|
|
46
|
-
tar_path = collection_dir / f"{
|
|
47
|
+
for bucket_dir in bucket_dirs:
|
|
48
|
+
bucket_name = bucket_dir.name
|
|
49
|
+
tar_path = collection_dir / f"{bucket_name}.tar"
|
|
47
50
|
|
|
48
|
-
#
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
counter += 1
|
|
52
|
-
tar_path = collection_dir / f"{seq_name}.{counter}.tar"
|
|
53
|
-
|
|
54
|
-
# Count files in sequence
|
|
55
|
-
files = list(seq_dir.glob("*"))
|
|
56
|
-
file_count = len([f for f in files if f.is_file()])
|
|
51
|
+
# Count files in bucket
|
|
52
|
+
files_to_tar = sorted([f for f in bucket_dir.rglob("*") if f.is_file()], key=lambda x: str(x))
|
|
53
|
+
file_count = len(files_to_tar)
|
|
57
54
|
|
|
58
55
|
if file_count == 0:
|
|
59
|
-
logger.warning(f"Skipping empty directory: {
|
|
56
|
+
logger.warning(f"Skipping empty bucket directory: {bucket_name}")
|
|
60
57
|
continue
|
|
61
58
|
|
|
62
59
|
try:
|
|
63
|
-
|
|
64
|
-
# Sort files by name for deterministic ordering
|
|
65
|
-
files_to_tar = sorted([f for f in seq_dir.rglob("*") if f.is_file()], key=lambda x: x.name)
|
|
66
|
-
|
|
67
|
-
if not files_to_tar:
|
|
68
|
-
logger.warning(f"Skipping directory with no files: {seq_name}")
|
|
69
|
-
continue
|
|
60
|
+
logger.info(f"Tarring bucket '{bucket_name}' ({file_count} files)...")
|
|
70
61
|
|
|
62
|
+
# Create reproducible uncompressed tar (WebP already compressed)
|
|
71
63
|
with tarfile.open(tar_path, "w") as tar:
|
|
72
64
|
for file_path in files_to_tar:
|
|
73
65
|
# Get path relative to collection_dir for tar archive
|
|
@@ -92,33 +84,32 @@ def tar_sequence_directories(collection_dir):
|
|
|
92
84
|
tar_size = tar_path.stat().st_size
|
|
93
85
|
total_tar_bytes += tar_size
|
|
94
86
|
|
|
95
|
-
# Remove original directory
|
|
96
|
-
for file in
|
|
87
|
+
# Remove original bucket directory
|
|
88
|
+
for file in bucket_dir.rglob("*"):
|
|
97
89
|
if file.is_file():
|
|
98
90
|
file.unlink()
|
|
99
91
|
|
|
100
92
|
# Remove empty subdirs and main dir
|
|
101
|
-
for subdir in list(
|
|
93
|
+
for subdir in list(bucket_dir.rglob("*")):
|
|
102
94
|
if subdir.is_dir():
|
|
103
95
|
try:
|
|
104
96
|
subdir.rmdir()
|
|
105
97
|
except OSError:
|
|
106
98
|
pass # Not empty yet
|
|
107
99
|
|
|
108
|
-
|
|
100
|
+
bucket_dir.rmdir()
|
|
109
101
|
|
|
110
102
|
tarred_count += 1
|
|
111
103
|
total_files += file_count
|
|
112
104
|
|
|
113
|
-
|
|
114
|
-
logger.info(f"Tarred {tarred_count}/{len(sequence_dirs)} sequences...")
|
|
105
|
+
logger.info(f"Tarred bucket '{bucket_name}': {file_count:,} files, {format_size(tar_size)}")
|
|
115
106
|
else:
|
|
116
107
|
logger.error(f"Tar file empty or not created: {tar_path}")
|
|
117
108
|
if tar_path.exists():
|
|
118
109
|
tar_path.unlink()
|
|
119
110
|
|
|
120
111
|
except Exception as e:
|
|
121
|
-
logger.error(f"Error tarring {
|
|
112
|
+
logger.error(f"Error tarring bucket {bucket_name}: {e}")
|
|
122
113
|
if tar_path.exists():
|
|
123
114
|
tar_path.unlink()
|
|
124
115
|
|
mapillary_downloader/worker.py
CHANGED
|
@@ -69,11 +69,13 @@ def download_and_convert_image(image_data, output_dir, quality, convert_webp, se
|
|
|
69
69
|
if not image_url:
|
|
70
70
|
return (image_id, 0, False, f"No {quality} URL")
|
|
71
71
|
|
|
72
|
-
# Determine final output directory
|
|
72
|
+
# Determine final output directory - organize by first char of sequence ID
|
|
73
73
|
output_dir = Path(output_dir)
|
|
74
74
|
sequence_id = image_data.get("sequence")
|
|
75
75
|
if sequence_id:
|
|
76
|
-
|
|
76
|
+
# Use first character as bucket (gives us ~62 dirs instead of millions)
|
|
77
|
+
first_char = sequence_id[0]
|
|
78
|
+
img_dir = output_dir / first_char / sequence_id
|
|
77
79
|
img_dir.mkdir(parents=True, exist_ok=True)
|
|
78
80
|
else:
|
|
79
81
|
img_dir = output_dir
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mapillary_downloader
|
|
3
|
-
Version: 0.5.2
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: Download your Mapillary data before it's gone
|
|
5
5
|
Author-email: Gareth Davidson <gaz@bitplane.net>
|
|
6
6
|
Requires-Python: >=3.10
|
|
@@ -66,8 +66,8 @@ mapillary-downloader --output ./downloads USERNAME1
|
|
|
66
66
|
| `--quality` | 256, 1024, 2048 or original | `original` |
|
|
67
67
|
| `--bbox` | `west,south,east,north` | `None` |
|
|
68
68
|
| `--no-webp` | Don't convert to WebP | `False` |
|
|
69
|
-
| `--workers`
|
|
70
|
-
| `--no-tar` | Don't tar
|
|
69
|
+
| `--max-workers` | Maximum number of parallel download workers | `128` |
|
|
70
|
+
| `--no-tar` | Don't tar bucket directories | `False` |
|
|
71
71
|
| `--no-check-ia` | Don't check if exists on Internet Archive | `False` |
|
|
72
72
|
|
|
73
73
|
The downloader will:
|
|
@@ -98,11 +98,23 @@ To disable WebP conversion and keep original JPEGs, use `--no-webp`:
|
|
|
98
98
|
mapillary-downloader --no-webp USERNAME
|
|
99
99
|
```
|
|
100
100
|
|
|
101
|
-
##
|
|
101
|
+
## Tarballs
|
|
102
102
|
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
103
|
+
Images are organized by sequence ID, bucketed by the first character of the
|
|
104
|
+
sequence to reduce directory count:
|
|
105
|
+
|
|
106
|
+
```
|
|
107
|
+
mapillary-username-quality/
|
|
108
|
+
a/
|
|
109
|
+
abc123/
|
|
110
|
+
image1.webp
|
|
111
|
+
image2.webp
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
By default, these bucket directories are automatically tarred after download
|
|
115
|
+
(resulting in `a.tar`, `b.tar`, etc. - about 62 tar files total). This is done
|
|
116
|
+
because large collections with millions of images would otherwise create hundreds
|
|
117
|
+
of thousands of tiny tars, and anger the archive gods.
|
|
106
118
|
|
|
107
119
|
To keep individual files instead of creating tars, use the `--no-tar` flag.
|
|
108
120
|
|
|
@@ -7,13 +7,13 @@ mapillary_downloader/ia_check.py,sha256=L2MEbG_KmlAd5NLmo2HQkO8HWvRN0brE5wXXoyNM
|
|
|
7
7
|
mapillary_downloader/ia_meta.py,sha256=78rcybHIPnQDsF02KGj6RYmDXzYzrU8sdVx4Q9Y0sfI,6266
|
|
8
8
|
mapillary_downloader/logging_config.py,sha256=Z-wNq34nt7aIhJWdeKc1feTY46P9-Or7HtiX7eUFjEI,2324
|
|
9
9
|
mapillary_downloader/metadata_reader.py,sha256=Re-HN0Vfc7Hs1eOut7uOoW7jWJ2PIbKoNzC7Ak3ah5o,4933
|
|
10
|
-
mapillary_downloader/tar_sequences.py,sha256=
|
|
10
|
+
mapillary_downloader/tar_sequences.py,sha256=758yVQGSLC_x8tT7h1qzAdo8b-4OmARZYseNacM1Nv8,4223
|
|
11
11
|
mapillary_downloader/utils.py,sha256=yzVgS1mwsklDAqrimaFafgTTXtRYQUbKP98Xgh9d2KA,1174
|
|
12
12
|
mapillary_downloader/webp_converter.py,sha256=vYLLQxDmdnqRz0nm7wXwRUd4x9mQZNah-DrncpA8sNs,1901
|
|
13
|
-
mapillary_downloader/worker.py,sha256=
|
|
13
|
+
mapillary_downloader/worker.py,sha256=Q82Q1mnTL_CUwNXum9GAg2Fz40dolh_gByDkeN72p9o,4814
|
|
14
14
|
mapillary_downloader/worker_pool.py,sha256=iGRq5uFwBNNVQnI4vEjbKHkbKTaEVCdmvMvXcRGuDMg,8203
|
|
15
|
-
mapillary_downloader-0.
|
|
16
|
-
mapillary_downloader-0.
|
|
17
|
-
mapillary_downloader-0.
|
|
18
|
-
mapillary_downloader-0.
|
|
19
|
-
mapillary_downloader-0.
|
|
15
|
+
mapillary_downloader-0.6.0.dist-info/entry_points.txt,sha256=PdYtxOXHMJrUhmiPO4G-F98VuhUI4MN9D_T4KPrVZ5w,75
|
|
16
|
+
mapillary_downloader-0.6.0.dist-info/licenses/LICENSE.md,sha256=7_BIuQ-veOrsF-WarH8kTkm0-xrCLvJ1PFE1C4Ebs64,146
|
|
17
|
+
mapillary_downloader-0.6.0.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
|
|
18
|
+
mapillary_downloader-0.6.0.dist-info/METADATA,sha256=dvPNrWfk-wB_xIFoowuIH5-17Oib14hpHpik4FpqC7k,5277
|
|
19
|
+
mapillary_downloader-0.6.0.dist-info/RECORD,,
|
|
File without changes
|
{mapillary_downloader-0.5.2.dist-info → mapillary_downloader-0.6.0.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{mapillary_downloader-0.5.2.dist-info → mapillary_downloader-0.6.0.dist-info}/licenses/LICENSE.md
RENAMED
|
File without changes
|