mapillary-tools 0.14.0a2__py3-none-any.whl → 0.14.1__py3-none-any.whl

This diff shows the content changes between two package versions publicly released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (49)
  1. mapillary_tools/__init__.py +1 -1
  2. mapillary_tools/api_v4.py +66 -262
  3. mapillary_tools/authenticate.py +54 -46
  4. mapillary_tools/blackvue_parser.py +79 -22
  5. mapillary_tools/commands/__main__.py +15 -16
  6. mapillary_tools/commands/upload.py +33 -4
  7. mapillary_tools/config.py +38 -17
  8. mapillary_tools/constants.py +127 -43
  9. mapillary_tools/exceptions.py +4 -0
  10. mapillary_tools/exif_read.py +2 -1
  11. mapillary_tools/exif_write.py +3 -1
  12. mapillary_tools/exiftool_read_video.py +52 -15
  13. mapillary_tools/exiftool_runner.py +4 -24
  14. mapillary_tools/ffmpeg.py +406 -232
  15. mapillary_tools/geo.py +16 -0
  16. mapillary_tools/geotag/__init__.py +0 -0
  17. mapillary_tools/geotag/base.py +8 -4
  18. mapillary_tools/geotag/factory.py +106 -89
  19. mapillary_tools/geotag/geotag_images_from_exiftool.py +27 -20
  20. mapillary_tools/geotag/geotag_images_from_gpx.py +7 -6
  21. mapillary_tools/geotag/geotag_images_from_video.py +35 -0
  22. mapillary_tools/geotag/geotag_videos_from_exiftool.py +61 -14
  23. mapillary_tools/geotag/geotag_videos_from_gpx.py +22 -9
  24. mapillary_tools/geotag/options.py +25 -3
  25. mapillary_tools/geotag/utils.py +9 -12
  26. mapillary_tools/geotag/video_extractors/base.py +1 -1
  27. mapillary_tools/geotag/video_extractors/exiftool.py +1 -1
  28. mapillary_tools/geotag/video_extractors/gpx.py +61 -70
  29. mapillary_tools/geotag/video_extractors/native.py +34 -31
  30. mapillary_tools/history.py +128 -8
  31. mapillary_tools/http.py +211 -0
  32. mapillary_tools/mp4/construct_mp4_parser.py +8 -2
  33. mapillary_tools/process_geotag_properties.py +47 -35
  34. mapillary_tools/process_sequence_properties.py +340 -325
  35. mapillary_tools/sample_video.py +8 -8
  36. mapillary_tools/serializer/description.py +587 -0
  37. mapillary_tools/serializer/gpx.py +132 -0
  38. mapillary_tools/types.py +44 -610
  39. mapillary_tools/upload.py +327 -352
  40. mapillary_tools/upload_api_v4.py +125 -72
  41. mapillary_tools/uploader.py +797 -216
  42. mapillary_tools/utils.py +57 -5
  43. {mapillary_tools-0.14.0a2.dist-info → mapillary_tools-0.14.1.dist-info}/METADATA +91 -34
  44. mapillary_tools-0.14.1.dist-info/RECORD +76 -0
  45. {mapillary_tools-0.14.0a2.dist-info → mapillary_tools-0.14.1.dist-info}/WHEEL +1 -1
  46. mapillary_tools-0.14.0a2.dist-info/RECORD +0 -72
  47. {mapillary_tools-0.14.0a2.dist-info → mapillary_tools-0.14.1.dist-info}/entry_points.txt +0 -0
  48. {mapillary_tools-0.14.0a2.dist-info → mapillary_tools-0.14.1.dist-info}/licenses/LICENSE +0 -0
  49. {mapillary_tools-0.14.0a2.dist-info → mapillary_tools-0.14.1.dist-info}/top_level.txt +0 -0
mapillary_tools/history.py
@@ -1,12 +1,25 @@
  from __future__ import annotations

+ import contextlib
+ import dbm
  import json
  import logging
  import string
+ import threading
+ import time
  import typing as T
  from pathlib import Path

+ # dbm modules are dynamically imported, so here we explicitly import dbm.sqlite3 to make sure pyinstaller include it
+ # Otherwise you will see: ImportError: no dbm clone found; tried ['dbm.sqlite3', 'dbm.gnu', 'dbm.ndbm', 'dbm.dumb']
+ try:
+     import dbm.sqlite3  # type: ignore
+ except ImportError:
+     pass
+
+
  from . import constants, types
+ from .serializer.description import DescriptionJSONSerializer

  JSONDict = T.Dict[str, T.Union[str, int, float, None]]

@@ -35,10 +48,21 @@ def history_desc_path(md5sum: str) -> Path:
      )


- def is_uploaded(md5sum: str) -> bool:
+ def read_history_record(md5sum: str) -> None | T.Dict[str, T.Any]:
      if not constants.MAPILLARY_UPLOAD_HISTORY_PATH:
-         return False
-     return history_desc_path(md5sum).is_file()
+         return None
+
+     path = history_desc_path(md5sum)
+
+     if not path.is_file():
+         return None
+
+     with path.open("r") as fp:
+         try:
+             return json.load(fp)
+         except json.JSONDecodeError as ex:
+             LOG.error(f"Failed to read upload history {path}: {ex}")
+             return None


  def write_history(
@@ -52,11 +76,107 @@ def write_history(
      path = history_desc_path(md5sum)
      LOG.debug("Writing upload history: %s", path)
      path.resolve().parent.mkdir(parents=True, exist_ok=True)
-     history: dict[str, T.Any] = {
-         "params": params,
-         "summary": summary,
-     }
+     history: dict[str, T.Any] = {"params": params, "summary": summary}
      if metadatas is not None:
-         history["descs"] = [types.as_desc(metadata) for metadata in metadatas]
+         history["descs"] = [
+             DescriptionJSONSerializer.as_desc(metadata) for metadata in metadatas
+         ]
      with open(path, "w") as fp:
          fp.write(json.dumps(history))
+
+
+ class PersistentCache:
+     _lock: contextlib.nullcontext | threading.Lock
+
+     def __init__(self, file: str):
+         # SQLite3 backend supports concurrent access without a lock
+         if dbm.whichdb(file) == "dbm.sqlite3":
+             self._lock = contextlib.nullcontext()
+         else:
+             self._lock = threading.Lock()
+         self._file = file
+
+     def get(self, key: str) -> str | None:
+         s = time.perf_counter()
+
+         with self._lock:
+             with dbm.open(self._file, flag="c") as db:
+                 value: bytes | None = db.get(key)
+
+         if value is None:
+             return None
+
+         payload = self._decode(value)
+
+         if self._is_expired(payload):
+             return None
+
+         file_handle = payload.get("file_handle")
+
+         LOG.debug(
+             f"Found file handle for {key} in cache ({(time.perf_counter() - s) * 1000:.0f} ms)"
+         )
+
+         return T.cast(str, file_handle)
+
+     def set(self, key: str, file_handle: str, expires_in: int = 3600 * 24 * 2) -> None:
+         s = time.perf_counter()
+
+         payload = {
+             "expires_at": time.time() + expires_in,
+             "file_handle": file_handle,
+         }
+
+         value: bytes = json.dumps(payload).encode("utf-8")
+
+         with self._lock:
+             with dbm.open(self._file, flag="c") as db:
+                 db[key] = value
+
+         LOG.debug(
+             f"Cached file handle for {key} ({(time.perf_counter() - s) * 1000:.0f} ms)"
+         )
+
+     def clear_expired(self) -> list[str]:
+         s = time.perf_counter()
+
+         expired_keys: list[str] = []
+
+         with self._lock:
+             with dbm.open(self._file, flag="c") as db:
+                 if hasattr(db, "items"):
+                     items: T.Iterable[tuple[str | bytes, bytes]] = db.items()
+                 else:
+                     items = ((key, db[key]) for key in db.keys())
+
+                 for key, value in items:
+                     payload = self._decode(value)
+                     if self._is_expired(payload):
+                         del db[key]
+                         expired_keys.append(T.cast(str, key))
+
+         if expired_keys:
+             LOG.debug(
+                 f"Cleared {len(expired_keys)} expired entries from the cache ({(time.perf_counter() - s) * 1000:.0f} ms)"
+             )
+
+         return expired_keys
+
+     def _is_expired(self, payload: JSONDict) -> bool:
+         expires_at = payload.get("expires_at")
+         if isinstance(expires_at, (int, float)):
+             return expires_at is None or expires_at <= time.time()
+         return False
+
+     def _decode(self, value: bytes) -> JSONDict:
+         try:
+             payload = json.loads(value.decode("utf-8"))
+         except json.JSONDecodeError as ex:
+             LOG.warning(f"Failed to decode cache value: {ex}")
+             return {}
+
+         if not isinstance(payload, dict):
+             LOG.warning(f"Invalid cache value format: {payload}")
+             return {}
+
+         return payload
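
A minimal usage sketch of the new PersistentCache above, assuming an installed mapillary_tools; the cache file name and key below are illustrative, not the values the tool itself uses:

from mapillary_tools.history import PersistentCache

cache = PersistentCache("upload_cache.db")  # backing dbm file is created on first use (flag="c")
cache.set("md5:0123abcd", "FILE_HANDLE_FROM_UPLOAD_API", expires_in=3600)
print(cache.get("md5:0123abcd"))  # the cached file handle, or None once expired
print(cache.clear_expired())      # keys whose entries were dropped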
mapillary_tools/http.py
@@ -0,0 +1,211 @@
+ from __future__ import annotations
+
+ import logging
+
+ import ssl
+ import sys
+ import typing as T
+ from json import dumps
+
+ if sys.version_info >= (3, 12):
+     from typing import override
+ else:
+     from typing_extensions import override
+
+ import requests
+ from requests.adapters import HTTPAdapter
+
+
+ LOG = logging.getLogger(__name__)
+
+
+ class HTTPSystemCertsAdapter(HTTPAdapter):
+     """
+     This adapter uses the system's certificate store instead of the certifi module.
+
+     The implementation is based on the project https://pypi.org/project/pip-system-certs/,
+     which has a system-wide effect.
+     """
+
+     def init_poolmanager(self, *args, **kwargs):
+         ssl_context = ssl.create_default_context()
+         ssl_context.load_default_certs()
+         kwargs["ssl_context"] = ssl_context
+
+         super().init_poolmanager(*args, **kwargs)
+
+     def cert_verify(self, *args, **kwargs):
+         super().cert_verify(*args, **kwargs)
+
+         # By default Python requests uses the ca_certs from the certifi module
+         # But we want to use the certificate store instead.
+         # By clearing the ca_certs variable we force it to fall back on that behaviour (handled in urllib3)
+         if "conn" in kwargs:
+             conn = kwargs["conn"]
+         else:
+             conn = args[0]
+
+         conn.ca_certs = None
+
+
+ class Session(requests.Session):
+     # NOTE: This is a global flag that affects all Session instances
+     USE_SYSTEM_CERTS: T.ClassVar[bool] = False
+     # Instance variables
+     disable_logging_request: bool = False
+     disable_logging_response: bool = False
+     # Avoid mounting twice
+     _mounted: bool = False
+
+     @override
+     def request(self, method: str | bytes, url: str | bytes, *args, **kwargs):
+         self._log_debug_request(method, url, *args, **kwargs)
+
+         if Session.USE_SYSTEM_CERTS:
+             if not self._mounted:
+                 self.mount("https://", HTTPSystemCertsAdapter())
+                 self._mounted = True
+             resp = super().request(method, url, *args, **kwargs)
+         else:
+             try:
+                 resp = super().request(method, url, *args, **kwargs)
+             except requests.exceptions.SSLError as ex:
+                 if "SSLCertVerificationError" not in str(ex):
+                     raise ex
+                 Session.USE_SYSTEM_CERTS = True
+                 # HTTPSConnectionPool(host='graph.mapillary.com', port=443): Max retries exceeded with url: /login (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1018)')))
+                 LOG.warning(
+                     "SSL error occurred, falling back to system SSL certificates: %s",
+                     ex,
+                 )
+                 return self.request(method, url, *args, **kwargs)
+
+         self._log_debug_response(resp)
+
+         return resp
+
+     def _log_debug_request(self, method: str | bytes, url: str | bytes, **kwargs):
+         if self.disable_logging_request:
+             return
+
+         if not LOG.isEnabledFor(logging.DEBUG):
+             return
+
+         if isinstance(method, str) and isinstance(url, str):
+             msg = f"HTTP {method} {url}"
+         else:
+             msg = f"HTTP {method!r} {url!r}"
+
+         if Session.USE_SYSTEM_CERTS:
+             msg += " (w/sys_certs)"
+
+         json = kwargs.get("json")
+         if json is not None:
+             t = _truncate(dumps(_sanitize(json)))
+             msg += f" JSON={t}"
+
+         params = kwargs.get("params")
+         if params is not None:
+             msg += f" PARAMS={_sanitize(params)}"
+
+         headers = kwargs.get("headers")
+         if headers is not None:
+             msg += f" HEADERS={_sanitize(headers)}"
+
+         timeout = kwargs.get("timeout")
+         if timeout is not None:
+             msg += f" TIMEOUT={timeout}"
+
+         msg = msg.replace("\n", "\\n")
+
+         LOG.debug(msg)
+
+     def _log_debug_response(self, resp: requests.Response):
+         if self.disable_logging_response:
+             return
+
+         if not LOG.isEnabledFor(logging.DEBUG):
+             return
+
+         elapsed = resp.elapsed.total_seconds() * 1000  # Convert to milliseconds
+         msg = f"HTTP {resp.status_code} {resp.reason} ({elapsed:.0f} ms): {str(_truncate_response_content(resp))}"
+
+         LOG.debug(msg)
+
+
+ def readable_http_error(ex: requests.HTTPError) -> str:
+     return readable_http_response(ex.response)
+
+
+ def readable_http_response(resp: requests.Response) -> str:
+     return f"{resp.request.method} {resp.url} => {resp.status_code} {resp.reason}: {str(_truncate_response_content(resp))}"
+
+
+ @T.overload
+ def _truncate(s: bytes, limit: int = 256) -> bytes | str: ...
+
+
+ @T.overload
+ def _truncate(s: str, limit: int = 256) -> str: ...
+
+
+ def _truncate(s, limit=256):
+     if limit < len(s):
+         if isinstance(s, bytes):
+             try:
+                 s = s.decode("utf-8")
+             except UnicodeDecodeError:
+                 pass
+         remaining = len(s) - limit
+         if isinstance(s, bytes):
+             return s[:limit] + f"...({remaining} bytes truncated)".encode("utf-8")
+         else:
+             return str(s[:limit]) + f"...({remaining} chars truncated)"
+     else:
+         return s
+
+
+ def _sanitize(headers: T.Mapping[T.Any, T.Any]) -> T.Mapping[T.Any, T.Any]:
+     new_headers = {}
+
+     for k, v in headers.items():
+         if k.lower() in [
+             "authorization",
+             "cookie",
+             "x-fb-access-token",
+             "access-token",
+             "access_token",
+             "password",
+             "user_upload_token",
+         ]:
+             new_headers[k] = "[REDACTED]"
+         else:
+             if isinstance(v, (str, bytes)):
+                 new_headers[k] = T.cast(T.Any, _truncate(v))
+             else:
+                 new_headers[k] = v
+
+     return new_headers
+
+
+ def _truncate_response_content(resp: requests.Response) -> str | bytes:
+     try:
+         json_data = resp.json()
+     except requests.JSONDecodeError:
+         if resp.content is not None:
+             data = _truncate(resp.content)
+         else:
+             data = ""
+     else:
+         if isinstance(json_data, dict):
+             data = _truncate(dumps(_sanitize(json_data)))
+         else:
+             data = _truncate(str(json_data))
+
+     if isinstance(data, bytes):
+         return data.replace(b"\n", b"\\n")
+
+     elif isinstance(data, str):
+         return data.replace("\n", "\\n")
+
+     return data
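
A rough sketch of how the new Session and helpers above can be exercised, assuming an installed mapillary_tools; the URL is only an example endpoint:

import logging

from mapillary_tools.http import Session, readable_http_response

logging.basicConfig(level=logging.DEBUG)  # request/response lines are emitted at DEBUG level

session = Session()
# On an SSLCertVerificationError the request is retried against the system
# certificate store and Session.USE_SYSTEM_CERTS stays set for all Session instances.
resp = session.get("https://graph.mapillary.com/", timeout=10)
print(readable_http_response(resp))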
mapillary_tools/mp4/construct_mp4_parser.py
@@ -370,6 +370,10 @@ BoxHeader64 = C.Struct(
  SwitchMapType = T.Dict[BoxType, T.Union[C.Construct, "SwitchMapType"]]


+ class BoxNotFoundError(Exception):
+     pass
+
+
  class Box64ConstructBuilder:
      """
      Build a box struct that **parses** MP4 boxes with both 32-bit and 64-bit sizes.
@@ -567,7 +571,9 @@ def _new_cmap_without_boxes(
  # pyre-ignore[9]: pyre does not support recursive type SwitchMapType
  MP4_WITHOUT_STBL_CMAP: SwitchMapType = {
      # pyre-ignore[6]: pyre does not support recursive type SwitchMapType
-     b"moov": _new_cmap_without_boxes(CMAP[b"moov"], [b"stbl"]),
+     b"moov": _new_cmap_without_boxes(
+         CMAP[b"moov"], T.cast(T.Sequence[BoxType], [b"stbl"])
+     ),
  }

  # for parsing mp4 only
@@ -589,7 +595,7 @@ def find_box_at_pathx(
  ) -> BoxDict:
      found = find_box_at_path(box, path)
      if found is None:
-         raise ValueError(f"box at path {path} not found")
+         raise BoxNotFoundError(f"box at path {path} not found")
      return found


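A small sketch of what the new BoxNotFoundError means for callers of find_box_at_pathx: the class derives from Exception directly (see the hunk above), so an existing "except ValueError" no longer catches it. The raise below only imitates the parser's message; no MP4 is parsed here:

from mapillary_tools.mp4.construct_mp4_parser import BoxNotFoundError

assert not issubclass(BoxNotFoundError, ValueError)

try:
    raise BoxNotFoundError("box at path [b'moov'] not found")
except BoxNotFoundError as ex:
    print(ex)
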
mapillary_tools/process_geotag_properties.py
@@ -1,12 +1,11 @@
  from __future__ import annotations

- import collections
  import datetime
- import json
  import logging
  import typing as T
  from pathlib import Path

+ import humanize
  from tqdm import tqdm

  from . import constants, exceptions, exif_write, types, utils
@@ -17,6 +16,11 @@ from .geotag.options import (
      SourcePathOption,
      SourceType,
  )
+ from .serializer.description import (
+     DescriptionJSONSerializer,
+     validate_and_fail_metadata,
+ )
+ from .serializer.gpx import GPXSerializer

  LOG = logging.getLogger(__name__)
  DEFAULT_GEOTAG_SOURCE_OPTIONS = [
@@ -42,8 +46,10 @@ def _parse_source_options(
  ) -> list[SourceOption]:
      parsed_options: list[SourceOption] = []

-     for s in geotag_source:
-         parsed_options.extend(parse_source_option(s))
+     if video_geotag_source and geotag_source:
+         LOG.warning(
+             "Video source options will be processed BEFORE the generic source options"
+         )

      for s in video_geotag_source:
          for video_option in parse_source_option(s):
@@ -52,6 +58,9 @@
              )
              parsed_options.append(video_option)

+     for s in geotag_source:
+         parsed_options.extend(parse_source_option(s))
+
      if geotag_source_path is not None:
          for parsed_option in parsed_options:
              if parsed_option.source_path is None:
@@ -163,7 +172,7 @@ def _overwrite_exif_tags(
          metadatas,
          desc="Overwriting EXIF",
          unit="images",
-         disable=LOG.getEffectiveLevel() <= logging.DEBUG,
+         disable=LOG.isEnabledFor(logging.DEBUG),
      ):
          dt = datetime.datetime.fromtimestamp(metadata.time, datetime.timezone.utc)
          dt = dt.replace(tzinfo=datetime.timezone.utc)
@@ -200,25 +209,33 @@ def _write_metadatas(
      desc_path: str,
  ) -> None:
      if desc_path == "-":
-         descs = [types.as_desc(metadata) for metadata in metadatas]
-         print(json.dumps(descs, indent=2))
+         descs = DescriptionJSONSerializer.serialize(metadatas)
+         print(descs.decode("utf-8"))
      else:
-         descs = [types.as_desc(metadata) for metadata in metadatas]
-         with open(desc_path, "w") as fp:
-             json.dump(descs, fp)
+         normalized_suffix = Path(desc_path).suffix.strip().lower()
+         if normalized_suffix in [".gpx"]:
+             descs = GPXSerializer.serialize(metadatas)
+         else:
+             descs = DescriptionJSONSerializer.serialize(metadatas)
+         with open(desc_path, "wb") as fp:
+             fp.write(descs)
          LOG.info("Check the description file for details: %s", desc_path)


- def _is_error_skipped(error_type: str, skipped_process_errors: set[T.Type[Exception]]):
-     skipped_process_error_names = set(err.__name__ for err in skipped_process_errors)
-     skip_all = Exception in skipped_process_errors
-     return skip_all or error_type in skipped_process_error_names
+ def _is_error_skipped(
+     error_type: type[Exception], skipped_process_errors: set[type[Exception]]
+ ):
+     return (Exception in skipped_process_errors) or (
+         error_type in skipped_process_errors
+     )


  def _show_stats(
      metadatas: T.Sequence[types.MetadataOrError],
      skipped_process_errors: set[T.Type[Exception]],
  ) -> None:
+     LOG.info("==> Process summary")
+
      metadatas_by_filetype: dict[types.FileType, list[types.MetadataOrError]] = {}
      for metadata in metadatas:
          if isinstance(metadata, types.ImageMetadata):
@@ -234,9 +251,7 @@ def _show_stats(
          metadata
          for metadata in metadatas
          if isinstance(metadata, types.ErrorMetadata)
-         and not _is_error_skipped(
-             metadata.error.__class__.__name__, skipped_process_errors
-         )
+         and not _is_error_skipped(type(metadata.error), skipped_process_errors)
      ]
      if critical_error_metadatas:
          raise exceptions.MapillaryProcessError(
@@ -252,38 +267,35 @@ def _show_stats_per_filetype(
      good_metadatas: list[types.Metadata]
      good_metadatas, error_metadatas = types.separate_errors(metadatas)

-     filesize_to_upload = sum(
-         [0 if m.filesize is None else m.filesize for m in good_metadatas]
-     )
-
-     LOG.info("%8d %s(s) read in total", len(metadatas), filetype.value)
+     LOG.info(f"{len(metadatas)} {filetype.value} read in total")
      if good_metadatas:
+         total_filesize = sum(
+             [0 if m.filesize is None else m.filesize for m in good_metadatas]
+         )
          LOG.info(
-             "\t %8d %s(s) (%s MB) are ready to be uploaded",
-             len(good_metadatas),
-             filetype.value,
-             round(filesize_to_upload / 1024 / 1024, 1),
+             f"\t{len(good_metadatas)} ({humanize.naturalsize(total_filesize)}) ready"
          )

-     error_counter = collections.Counter(
-         metadata.error.__class__.__name__ for metadata in error_metadatas
-     )
+     errors_by_type: dict[type[Exception], list[types.ErrorMetadata]] = {}
+     for metadata in error_metadatas:
+         errors_by_type.setdefault(type(metadata.error), []).append(metadata)

-     for error_type, count in error_counter.items():
+     for error_type, errors in errors_by_type.items():
+         total_filesize = sum([utils.get_file_size_quietly(m.filename) for m in errors])
          if _is_error_skipped(error_type, skipped_process_errors):
              LOG.warning(
-                 "\t %8d %s(s) skipped due to %s", count, filetype.value, error_type
+                 f"\t{len(errors)} ({humanize.naturalsize(total_filesize)}) {error_type.__name__}"
              )
          else:
              LOG.error(
-                 "\t %8d %s(s) failed due to %s", count, filetype.value, error_type
+                 f"\t{len(errors)} ({humanize.naturalsize(total_filesize)}) {error_type.__name__}"
              )


  def _validate_metadatas(
      metadatas: T.Collection[types.MetadataOrError], num_processes: int | None
  ) -> list[types.MetadataOrError]:
-     LOG.debug("Validating %d metadatas", len(metadatas))
+     LOG.info(f"==> Validating {len(metadatas)} metadatas...")

      # validating metadatas is slow, hence multiprocessing

@@ -293,7 +305,7 @@
      # See https://stackoverflow.com/a/61432070
      good_metadatas, error_metadatas = types.separate_errors(metadatas)
      map_results = utils.mp_map_maybe(
-         types.validate_and_fail_metadata,
+         validate_and_fail_metadata,
          T.cast(T.Iterable[types.Metadata], good_metadatas),
          num_processes=num_processes,
      )
@@ -308,7 +320,7 @@
              )
          )

-     return validated_metadatas + error_metadatas
+     return T.cast(list[types.MetadataOrError], validated_metadatas + error_metadatas)


  def process_finalize(
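
A standalone sketch of the new output dispatch in _write_metadatas shown above: a ".gpx" suffix on the description path now selects GPXSerializer, while "-" (stdout) and every other path keep the JSON serializer. pick_serializer is a hypothetical helper written only for this illustration:

from pathlib import Path

def pick_serializer(desc_path: str) -> str:
    if desc_path == "-":
        return "DescriptionJSONSerializer"  # stdout always gets JSON
    if Path(desc_path).suffix.strip().lower() in [".gpx"]:
        return "GPXSerializer"
    return "DescriptionJSONSerializer"

assert pick_serializer("captures/track.gpx") == "GPXSerializer"
assert pick_serializer("mapillary_image_description.json") == "DescriptionJSONSerializer"
assert pick_serializer("-") == "DescriptionJSONSerializer"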