mapillary-downloader 0.5.2__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -23,51 +23,43 @@ def tar_sequence_directories(collection_dir):
23
23
  logger.error(f"Collection directory not found: {collection_dir}")
24
24
  return 0, 0
25
25
 
26
- # Find all sequence directories (skip special dirs)
26
+ # Find all bucket directories (skip special dirs)
27
+ # Now we tar entire bucket dirs (e.g., a/, b/, etc) to get ~62 tar files
27
28
  skip_dirs = {".meta", "__pycache__"}
28
- sequence_dirs = []
29
+ bucket_dirs = []
29
30
 
30
31
  for item in collection_dir.iterdir():
31
32
  if item.is_dir() and item.name not in skip_dirs:
32
- sequence_dirs.append(item)
33
+ # Check if this is a bucket dir (single char)
34
+ if len(item.name) == 1:
35
+ bucket_dirs.append(item)
33
36
 
34
- if not sequence_dirs:
35
- logger.info("No sequence directories to tar")
37
+ if not bucket_dirs:
38
+ logger.info("No bucket directories to tar")
36
39
  return 0, 0
37
40
 
38
- logger.info(f"Tarring {len(sequence_dirs)} sequence directories...")
41
+ logger.info(f"Tarring {len(bucket_dirs)} bucket directories...")
39
42
 
40
43
  tarred_count = 0
41
44
  total_files = 0
42
45
  total_tar_bytes = 0
43
46
 
44
- for seq_dir in sequence_dirs:
45
- seq_name = seq_dir.name
46
- tar_path = collection_dir / f"{seq_name}.tar"
47
+ for bucket_dir in bucket_dirs:
48
+ bucket_name = bucket_dir.name
49
+ tar_path = collection_dir / f"{bucket_name}.tar"
47
50
 
48
- # Handle naming collision - find next available name
49
- counter = 1
50
- while tar_path.exists():
51
- counter += 1
52
- tar_path = collection_dir / f"{seq_name}.{counter}.tar"
53
-
54
- # Count files in sequence
55
- files = list(seq_dir.glob("*"))
56
- file_count = len([f for f in files if f.is_file()])
51
+ # Count files in bucket
52
+ files_to_tar = sorted([f for f in bucket_dir.rglob("*") if f.is_file()], key=lambda x: str(x))
53
+ file_count = len(files_to_tar)
57
54
 
58
55
  if file_count == 0:
59
- logger.warning(f"Skipping empty directory: {seq_name}")
56
+ logger.warning(f"Skipping empty bucket directory: {bucket_name}")
60
57
  continue
61
58
 
62
59
  try:
63
- # Create reproducible uncompressed tar (WebP already compressed)
64
- # Sort files by name for deterministic ordering
65
- files_to_tar = sorted([f for f in seq_dir.rglob("*") if f.is_file()], key=lambda x: x.name)
66
-
67
- if not files_to_tar:
68
- logger.warning(f"Skipping directory with no files: {seq_name}")
69
- continue
60
+ logger.info(f"Tarring bucket '{bucket_name}' ({file_count} files)...")
70
61
 
62
+ # Create reproducible uncompressed tar (WebP already compressed)
71
63
  with tarfile.open(tar_path, "w") as tar:
72
64
  for file_path in files_to_tar:
73
65
  # Get path relative to collection_dir for tar archive
@@ -92,33 +84,32 @@ def tar_sequence_directories(collection_dir):
92
84
  tar_size = tar_path.stat().st_size
93
85
  total_tar_bytes += tar_size
94
86
 
95
- # Remove original directory
96
- for file in seq_dir.rglob("*"):
87
+ # Remove original bucket directory
88
+ for file in bucket_dir.rglob("*"):
97
89
  if file.is_file():
98
90
  file.unlink()
99
91
 
100
92
  # Remove empty subdirs and main dir
101
- for subdir in list(seq_dir.rglob("*")):
93
+ for subdir in list(bucket_dir.rglob("*")):
102
94
  if subdir.is_dir():
103
95
  try:
104
96
  subdir.rmdir()
105
97
  except OSError:
106
98
  pass # Not empty yet
107
99
 
108
- seq_dir.rmdir()
100
+ bucket_dir.rmdir()
109
101
 
110
102
  tarred_count += 1
111
103
  total_files += file_count
112
104
 
113
- if tarred_count % 10 == 0:
114
- logger.info(f"Tarred {tarred_count}/{len(sequence_dirs)} sequences...")
105
+ logger.info(f"Tarred bucket '{bucket_name}': {file_count:,} files, {format_size(tar_size)}")
115
106
  else:
116
107
  logger.error(f"Tar file empty or not created: {tar_path}")
117
108
  if tar_path.exists():
118
109
  tar_path.unlink()
119
110
 
120
111
  except Exception as e:
121
- logger.error(f"Error tarring {seq_name}: {e}")
112
+ logger.error(f"Error tarring bucket {bucket_name}: {e}")
122
113
  if tar_path.exists():
123
114
  tar_path.unlink()
124
115
 
@@ -69,11 +69,13 @@ def download_and_convert_image(image_data, output_dir, quality, convert_webp, se
69
69
  if not image_url:
70
70
  return (image_id, 0, False, f"No {quality} URL")
71
71
 
72
- # Determine final output directory
72
+ # Determine final output directory - organize by first char of sequence ID
73
73
  output_dir = Path(output_dir)
74
74
  sequence_id = image_data.get("sequence")
75
75
  if sequence_id:
76
- img_dir = output_dir / sequence_id
76
+ # Use first character as bucket (gives us ~62 dirs instead of millions)
77
+ first_char = sequence_id[0]
78
+ img_dir = output_dir / first_char / sequence_id
77
79
  img_dir.mkdir(parents=True, exist_ok=True)
78
80
  else:
79
81
  img_dir = output_dir
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mapillary_downloader
3
- Version: 0.5.2
3
+ Version: 0.6.0
4
4
  Summary: Download your Mapillary data before it's gone
5
5
  Author-email: Gareth Davidson <gaz@bitplane.net>
6
6
  Requires-Python: >=3.10
@@ -66,8 +66,8 @@ mapillary-downloader --output ./downloads USERNAME1
66
66
  | `--quality` | 256, 1024, 2048 or original | `original` |
67
67
  | `--bbox` | `west,south,east,north` | `None` |
68
68
  | `--no-webp` | Don't convert to WebP | `False` |
69
- | `--workers` | Number of parallel download workers | Half of CPU count |
70
- | `--no-tar` | Don't tar sequence directories | `False` |
69
+ | `--max-workers` | Maximum number of parallel download workers | `128` |
70
+ | `--no-tar` | Don't tar bucket directories | `False` |
71
71
  | `--no-check-ia` | Don't check if exists on Internet Archive | `False` |
72
72
 
73
73
  The downloader will:
@@ -98,11 +98,23 @@ To disable WebP conversion and keep original JPEGs, use `--no-webp`:
98
98
  mapillary-downloader --no-webp USERNAME
99
99
  ```
100
100
 
101
- ## Sequence Tarball Creation
101
+ ## Tarballs
102
102
 
103
- By default, sequence directories are automatically tarred after download because
104
- if they weren't, you'd spend more time setting up upload metadata than actually
105
- uploading files to IA.
103
+ Images are organized by sequence ID, bucketed by the first character of the
104
+ sequence to reduce directory count:
105
+
106
+ ```
107
+ mapillary-username-quality/
108
+ a/
109
+ abc123/
110
+ image1.webp
111
+ image2.webp
112
+ ```
113
+
114
+ By default, these bucket directories are automatically tarred after download
115
+ (resulting in `a.tar`, `b.tar`, etc. - about 62 tar files total). This is done
116
+ because large collections with millions of images would otherwise create hundreds
117
+ of thousands of tiny tars, and anger the archive gods.
106
118
 
107
119
  To keep individual files instead of creating tars, use the `--no-tar` flag.
108
120
 
@@ -7,13 +7,13 @@ mapillary_downloader/ia_check.py,sha256=L2MEbG_KmlAd5NLmo2HQkO8HWvRN0brE5wXXoyNM
7
7
  mapillary_downloader/ia_meta.py,sha256=78rcybHIPnQDsF02KGj6RYmDXzYzrU8sdVx4Q9Y0sfI,6266
8
8
  mapillary_downloader/logging_config.py,sha256=Z-wNq34nt7aIhJWdeKc1feTY46P9-Or7HtiX7eUFjEI,2324
9
9
  mapillary_downloader/metadata_reader.py,sha256=Re-HN0Vfc7Hs1eOut7uOoW7jWJ2PIbKoNzC7Ak3ah5o,4933
10
- mapillary_downloader/tar_sequences.py,sha256=mqs5p3N7osV_bxTkw6i34GVmxCBBEbIiKKxeh-fWNdU,4430
10
+ mapillary_downloader/tar_sequences.py,sha256=758yVQGSLC_x8tT7h1qzAdo8b-4OmARZYseNacM1Nv8,4223
11
11
  mapillary_downloader/utils.py,sha256=yzVgS1mwsklDAqrimaFafgTTXtRYQUbKP98Xgh9d2KA,1174
12
12
  mapillary_downloader/webp_converter.py,sha256=vYLLQxDmdnqRz0nm7wXwRUd4x9mQZNah-DrncpA8sNs,1901
13
- mapillary_downloader/worker.py,sha256=n9m6PzSjlLOOYZJd9j1vH-2ag9aOeNndfgRlunzI14s,4637
13
+ mapillary_downloader/worker.py,sha256=Q82Q1mnTL_CUwNXum9GAg2Fz40dolh_gByDkeN72p9o,4814
14
14
  mapillary_downloader/worker_pool.py,sha256=iGRq5uFwBNNVQnI4vEjbKHkbKTaEVCdmvMvXcRGuDMg,8203
15
- mapillary_downloader-0.5.2.dist-info/entry_points.txt,sha256=PdYtxOXHMJrUhmiPO4G-F98VuhUI4MN9D_T4KPrVZ5w,75
16
- mapillary_downloader-0.5.2.dist-info/licenses/LICENSE.md,sha256=7_BIuQ-veOrsF-WarH8kTkm0-xrCLvJ1PFE1C4Ebs64,146
17
- mapillary_downloader-0.5.2.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
18
- mapillary_downloader-0.5.2.dist-info/METADATA,sha256=PHO4jDVxqsIo9Hs9GX3J2Cfnfc8gy_PI6xhssE9jrMk,4982
19
- mapillary_downloader-0.5.2.dist-info/RECORD,,
15
+ mapillary_downloader-0.6.0.dist-info/entry_points.txt,sha256=PdYtxOXHMJrUhmiPO4G-F98VuhUI4MN9D_T4KPrVZ5w,75
16
+ mapillary_downloader-0.6.0.dist-info/licenses/LICENSE.md,sha256=7_BIuQ-veOrsF-WarH8kTkm0-xrCLvJ1PFE1C4Ebs64,146
17
+ mapillary_downloader-0.6.0.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
18
+ mapillary_downloader-0.6.0.dist-info/METADATA,sha256=dvPNrWfk-wB_xIFoowuIH5-17Oib14hpHpik4FpqC7k,5277
19
+ mapillary_downloader-0.6.0.dist-info/RECORD,,