mapillary-downloader 0.8.1__tar.gz → 0.9.0__tar.gz

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (20)
  1. {mapillary_downloader-0.8.1 → mapillary_downloader-0.9.0}/PKG-INFO +1 -1
  2. {mapillary_downloader-0.8.1 → mapillary_downloader-0.9.0}/pyproject.toml +1 -1
  3. {mapillary_downloader-0.8.1 → mapillary_downloader-0.9.0}/src/mapillary_downloader/client.py +3 -5
  4. {mapillary_downloader-0.8.1 → mapillary_downloader-0.9.0}/src/mapillary_downloader/downloader.py +3 -2
  5. {mapillary_downloader-0.8.1 → mapillary_downloader-0.9.0}/src/mapillary_downloader/ia_check.py +3 -3
  6. {mapillary_downloader-0.8.1 → mapillary_downloader-0.9.0}/src/mapillary_downloader/ia_stats.py +16 -8
  7. {mapillary_downloader-0.8.1 → mapillary_downloader-0.9.0}/src/mapillary_downloader/metadata_reader.py +0 -47
  8. {mapillary_downloader-0.8.1 → mapillary_downloader-0.9.0}/src/mapillary_downloader/utils.py +3 -5
  9. {mapillary_downloader-0.8.1 → mapillary_downloader-0.9.0}/src/mapillary_downloader/webp_converter.py +4 -9
  10. {mapillary_downloader-0.8.1 → mapillary_downloader-0.9.0}/src/mapillary_downloader/worker.py +1 -1
  11. {mapillary_downloader-0.8.1 → mapillary_downloader-0.9.0}/LICENSE.md +0 -0
  12. {mapillary_downloader-0.8.1 → mapillary_downloader-0.9.0}/README.md +0 -0
  13. {mapillary_downloader-0.8.1 → mapillary_downloader-0.9.0}/src/mapillary_downloader/__init__.py +0 -0
  14. {mapillary_downloader-0.8.1 → mapillary_downloader-0.9.0}/src/mapillary_downloader/__main__.py +0 -0
  15. {mapillary_downloader-0.8.1 → mapillary_downloader-0.9.0}/src/mapillary_downloader/exif_writer.py +0 -0
  16. {mapillary_downloader-0.8.1 → mapillary_downloader-0.9.0}/src/mapillary_downloader/ia_meta.py +0 -0
  17. {mapillary_downloader-0.8.1 → mapillary_downloader-0.9.0}/src/mapillary_downloader/logging_config.py +0 -0
  18. {mapillary_downloader-0.8.1 → mapillary_downloader-0.9.0}/src/mapillary_downloader/tar_sequences.py +0 -0
  19. {mapillary_downloader-0.8.1 → mapillary_downloader-0.9.0}/src/mapillary_downloader/worker_pool.py +0 -0
  20. {mapillary_downloader-0.8.1 → mapillary_downloader-0.9.0}/src/mapillary_downloader/xmp_writer.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mapillary_downloader
-Version: 0.8.1
+Version: 0.9.0
 Summary: Archive user data from Mapillary
 Author-email: Gareth Davidson <gaz@bitplane.net>
 Requires-Python: >=3.10

pyproject.toml
@@ -1,7 +1,7 @@
 [project]
 name = "mapillary_downloader"
 description = "Archive user data from Mapillary"
-version = "0.8.1"
+version = "0.9.0"
 authors = [
     { name = "Gareth Davidson", email = "gaz@bitplane.net" }
 ]

src/mapillary_downloader/client.py
@@ -22,11 +22,12 @@ class MapillaryClient:
         self.session = requests.Session()
         self.session.headers.update({"Authorization": f"OAuth {access_token}"})

-    def get_user_images(self, username, bbox=None, limit=2000):
+    def get_user_images(self, username, quality, bbox=None, limit=2000):
         """Get images uploaded by a specific user.

         Args:
             username: Mapillary username
+            quality: Image quality (256, 1024, 2048, or original)
             bbox: Optional bounding box [west, south, east, north]
             limit: Number of results per page (max 2000)

@@ -56,10 +57,7 @@ class MapillaryClient:
                     "computed_rotation",
                     "height",
                     "width",
-                    "thumb_256_url",
-                    "thumb_1024_url",
-                    "thumb_2048_url",
-                    "thumb_original_url",
+                    f"thumb_{quality}_url",
                 ]
             ),
         }
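
With only f"thumb_{quality}_url" in the requested field list, callers must pass the quality up front and the API response carries just one thumbnail URL. A minimal usage sketch of the new signature — the token and username are placeholders, and the constructor argument is inferred from the __init__ hunk above, not stated elsewhere in this diff:

    client = MapillaryClient("YOUR_ACCESS_TOKEN")
    # quality is the string used in the field name: "256", "1024", "2048", or "original"
    for image in client.get_user_images("some_user", "2048"):
        url = image["thumb_2048_url"]  # the only thumb field the API was asked for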

src/mapillary_downloader/downloader.py
@@ -8,6 +8,7 @@ import shutil
 import threading
 import time
 from pathlib import Path
+import requests
 from mapillary_downloader.utils import format_size, format_time, safe_json_save
 from mapillary_downloader.ia_meta import generate_ia_metadata
 from mapillary_downloader.ia_check import check_ia_exists
@@ -219,7 +220,7 @@ class MapillaryDownloader:
         # Check if collection already exists on Internet Archive
         if self.check_ia and self.collection_name:
             logger.info(f"Checking if {self.collection_name} exists on Internet Archive...")
-            if check_ia_exists(self.collection_name):
+            if check_ia_exists(requests.Session(), self.collection_name):
                 logger.info("Collection already exists on archive.org, skipping download")
                 return

@@ -261,7 +262,7 @@ class MapillaryDownloader:
         try:
             logger.debug("API fetch thread starting")
             with open(self.metadata_file, "a") as meta_f:
-                for image in self.client.get_user_images(self.username, bbox=bbox):
+                for image in self.client.get_user_images(self.username, self.quality, bbox=bbox):
                     new_images_count[0] += 1

                     # Save metadata (don't dedupe here, let the tailer handle it)

src/mapillary_downloader/ia_check.py
@@ -6,20 +6,20 @@ import requests
 logger = logging.getLogger("mapillary_downloader")


-def check_ia_exists(collection_name):
+def check_ia_exists(session, collection_name):
     """Check if a collection exists on Internet Archive.

     Args:
+        session: requests.Session for connection pooling
         collection_name: Name of the collection (e.g., mapillary-username-original-webp)

     Returns:
         Boolean indicating if the collection exists on IA
     """
-    # IA identifier format
     ia_url = f"https://archive.org/metadata/{collection_name}"

     try:
-        response = requests.get(ia_url, timeout=10)
+        response = session.get(ia_url, timeout=10)
         # If we get a 200, the item exists
         if response.status_code == 200:
             data = response.json()
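
The caller now owns the HTTP session: in downloader.py above, the call site passes a throwaway requests.Session(), which keeps the signature uniform even for this one-off check. A standalone sketch of the new call — the collection name is a hypothetical one following the documented format:

    import requests

    session = requests.Session()
    # e.g. mapillary-<username>-original-webp, per the docstring
    if check_ia_exists(session, "mapillary-someuser-original-webp"):
        print("already on archive.org, skipping")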

src/mapillary_downloader/ia_stats.py
@@ -3,6 +3,7 @@
 import json
 import logging
 import re
+import requests
 from mapillary_downloader.utils import safe_json_save, http_get_with_retry, format_size
 from mapillary_downloader.downloader import get_cache_dir

@@ -11,9 +12,12 @@ logger = logging.getLogger("mapillary_downloader")
 CACHE_FILE = get_cache_dir() / ".stats.json"


-def search_ia_collections():
+def search_ia_collections(session):
     """Search IA for all mapillary_downloader collections.

+    Args:
+        session: requests.Session for connection pooling
+
     Returns:
         List of dicts with: identifier, description, item_size, collection
     """
@@ -27,7 +31,7 @@ def search_ia_collections():
         "output": "json",
     }

-    response = http_get_with_retry(url, params=params, max_retries=3)
+    response = http_get_with_retry(session, url, params=params, max_retries=3)
     data = response.json()

     collections = data["response"]["docs"]
@@ -36,10 +40,11 @@
     return collections


-def fetch_uploader(identifier):
+def fetch_uploader(session, identifier):
     """Fetch uploader email from item metadata.

     Args:
+        session: requests.Session for connection pooling
         identifier: IA item identifier

     Returns:
@@ -47,7 +52,7 @@ def fetch_uploader(identifier):
     """
     url = f"https://archive.org/metadata/{identifier}/metadata/uploader"
     try:
-        response = http_get_with_retry(url, max_retries=2)
+        response = http_get_with_retry(session, url, max_retries=2)
         data = response.json()
         return data.get("result")
     except Exception:
@@ -195,10 +200,11 @@ def aggregate_stats(cache):
     return stats


-def format_stats(stats, cache):
+def format_stats(session, stats, cache):
     """Format statistics as human-readable text.

     Args:
+        session: requests.Session for connection pooling
         stats: Dict from aggregate_stats()
         cache: Dict of collection data

@@ -257,7 +263,7 @@ def format_stats(stats, cache):
        logger.info(f"Fetching uploader info for {len(need_uploader_fetch)} items...")
        for i, identifier in enumerate(need_uploader_fetch, 1):
            logger.info(f"  [{i}/{len(need_uploader_fetch)}] {identifier}")
-           uploader = fetch_uploader(identifier)
+           uploader = fetch_uploader(session, identifier)
            if uploader:
                cache[identifier]["uploader"] = uploader
        # Save updated cache with uploaders
@@ -307,9 +313,11 @@ def show_stats(refresh=True):
     Args:
         refresh: If True, fetch fresh data from IA. If False, use cache only.
     """
+    session = requests.Session()
+
     if refresh:
         try:
-            ia_collections = search_ia_collections()
+            ia_collections = search_ia_collections(session)
             cache = update_cache(ia_collections)
         except Exception as e:
             logger.error(f"Failed to fetch IA data: {e}")
@@ -323,4 +331,4 @@ def show_stats(refresh=True):
        return

    stats = aggregate_stats(cache)
-   print(format_stats(stats, cache))
+   print(format_stats(session, stats, cache))
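
Taken together, the ia_stats.py changes make show_stats() build one requests.Session and thread it through every helper, so all archive.org requests share a single connection pool instead of opening a fresh connection per call. The resulting call chain, sketched from the hunks above without the surrounding error handling:

    session = requests.Session()                     # one pool for the whole run
    ia_collections = search_ia_collections(session)  # IA search query
    cache = update_cache(ia_collections)
    stats = aggregate_stats(cache)
    print(format_stats(session, stats, cache))       # may call fetch_uploader(session, ...)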

src/mapillary_downloader/metadata_reader.py
@@ -65,53 +65,6 @@ class MetadataReader:
         except Exception:
             return False

-    def iter_images(self, quality_field=None, downloaded_ids=None):
-        """Stream images from metadata file with filtering.
-
-        Args:
-            quality_field: Optional field to check exists (e.g., 'thumb_1024_url')
-            downloaded_ids: Optional set of already downloaded IDs to skip
-
-        Yields:
-            Image metadata dicts that pass filters
-        """
-        if not self.metadata_file.exists():
-            return
-
-        # Handle gzipped files
-        if self.metadata_file.suffix == ".gz":
-            file_handle = gzip.open(self.metadata_file, "rt")
-        else:
-            file_handle = open(self.metadata_file)
-
-        with file_handle as f:
-            for line in f:
-                line = line.strip()
-                if not line:
-                    continue
-
-                image = json.loads(line)
-
-                # Check for completion marker
-                if image.get("__complete__"):
-                    self.is_complete = True
-                    logger.debug("Found API fetch completion marker")
-                    continue
-
-                image_id = image.get("id")
-                if not image_id:
-                    continue
-
-                # Filter by downloaded status
-                if downloaded_ids and image_id in downloaded_ids:
-                    continue
-
-                # Filter by quality field availability
-                if quality_field and not image.get(quality_field):
-                    continue
-
-                yield image
-
     def get_all_ids(self):
         """Get set of all image IDs in metadata file.


src/mapillary_downloader/utils.py
@@ -5,7 +5,6 @@ import logging
 import os
 import time
 from pathlib import Path
-import requests
 from requests.exceptions import RequestException

 logger = logging.getLogger("mapillary_downloader")
@@ -77,16 +76,16 @@ def safe_json_save(file_path, data):
     temp_file.replace(file_path)


-def http_get_with_retry(url, params=None, max_retries=5, base_delay=1.0, timeout=60, session=None):
+def http_get_with_retry(session, url, params=None, max_retries=5, base_delay=1.0, timeout=60):
     """HTTP GET with exponential backoff retry.

     Args:
+        session: requests.Session for connection pooling
         url: URL to fetch
         params: Optional query parameters
         max_retries: Maximum retry attempts (default: 5)
         base_delay: Initial delay in seconds (default: 1.0)
         timeout: Request timeout in seconds (default: 60)
-        session: Optional requests.Session for connection pooling

     Returns:
         requests.Response object
@@ -94,10 +93,9 @@ def http_get_with_retry(url, params=None, max_retries=5, base_delay=1.0, timeout=60, session=None):
     Raises:
         requests.RequestException: If all retries exhausted
     """
-    getter = session or requests
     for attempt in range(max_retries):
         try:
-            response = getter.get(url, params=params, timeout=timeout)
+            response = session.get(url, params=params, timeout=timeout)
             response.raise_for_status()
             return response
         except RequestException as e:
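
session is now a required first positional argument, and the getter = session or requests fallback is gone, so every caller must hold a Session. A minimal sketch of a call under the new signature — the URL is illustrative only:

    import requests

    session = requests.Session()
    # raises requests.RequestException only after all retries are exhausted
    response = http_get_with_retry(session, "https://archive.org/metadata/some-item", max_retries=3)
    data = response.json()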

src/mapillary_downloader/webp_converter.py
@@ -17,25 +17,20 @@ def check_cwebp_available():
     return shutil.which("cwebp") is not None


-def convert_to_webp(jpg_path, output_path=None, delete_original=True):
+def convert_to_webp(jpg_path, output_path, delete_original=True):
     """Convert a JPG image to WebP format, preserving EXIF metadata.

     Args:
         jpg_path: Path to the JPG file
-        output_path: Optional path for the WebP output. If None, uses jpg_path with .webp extension
+        output_path: Path for the WebP output
         delete_original: Whether to delete the original JPG after conversion (default: True)

     Returns:
         Path object to the new WebP file, or None if conversion failed
     """
     jpg_path = Path(jpg_path)
-
-    if output_path is None:
-        webp_path = jpg_path.with_suffix(".webp")
-    else:
-        webp_path = Path(output_path)
-        # Ensure output directory exists
-        webp_path.parent.mkdir(parents=True, exist_ok=True)
+    webp_path = Path(output_path)
+    webp_path.parent.mkdir(parents=True, exist_ok=True)

     try:
         # Convert with cwebp, preserving all metadata
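
With output_path now required, the old fallback to jpg_path.with_suffix(".webp") is gone: the caller decides where output lands, and the parent directory is always created. A short example with placeholder paths:

    # Both paths are illustrative; out/seq/ is created automatically.
    webp = convert_to_webp("cache/123.jpg", "out/seq/123.webp")
    if webp is None:
        ...  # conversion failed (original JPG presumably left in place)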

src/mapillary_downloader/worker.py
@@ -106,7 +106,7 @@ def download_and_convert_image(image_data, output_dir, quality, convert_webp, session):

     try:
         # Use retry logic with 3 attempts for image downloads
-        response = http_get_with_retry(image_url, max_retries=3, base_delay=1.0, timeout=60, session=session)
+        response = http_get_with_retry(session, image_url, max_retries=3, base_delay=1.0, timeout=60)

         with open(jpg_path, "wb") as f:
             for chunk in response.iter_content(chunk_size=8192):