mapillary-tools 0.13.3__py3-none-any.whl → 0.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mapillary_tools/__init__.py +1 -1
- mapillary_tools/api_v4.py +198 -55
- mapillary_tools/authenticate.py +326 -64
- mapillary_tools/blackvue_parser.py +195 -0
- mapillary_tools/camm/camm_builder.py +55 -97
- mapillary_tools/camm/camm_parser.py +429 -181
- mapillary_tools/commands/__main__.py +10 -6
- mapillary_tools/commands/authenticate.py +8 -1
- mapillary_tools/commands/process.py +27 -51
- mapillary_tools/commands/process_and_upload.py +18 -5
- mapillary_tools/commands/sample_video.py +2 -3
- mapillary_tools/commands/upload.py +44 -13
- mapillary_tools/commands/video_process_and_upload.py +19 -5
- mapillary_tools/config.py +65 -26
- mapillary_tools/constants.py +141 -18
- mapillary_tools/exceptions.py +37 -34
- mapillary_tools/exif_read.py +221 -116
- mapillary_tools/exif_write.py +10 -8
- mapillary_tools/exiftool_read.py +33 -42
- mapillary_tools/exiftool_read_video.py +97 -47
- mapillary_tools/exiftool_runner.py +57 -0
- mapillary_tools/ffmpeg.py +417 -242
- mapillary_tools/geo.py +158 -118
- mapillary_tools/geotag/__init__.py +0 -1
- mapillary_tools/geotag/base.py +147 -0
- mapillary_tools/geotag/factory.py +307 -0
- mapillary_tools/geotag/geotag_images_from_exif.py +14 -131
- mapillary_tools/geotag/geotag_images_from_exiftool.py +136 -85
- mapillary_tools/geotag/geotag_images_from_gpx.py +60 -124
- mapillary_tools/geotag/geotag_images_from_gpx_file.py +13 -126
- mapillary_tools/geotag/geotag_images_from_nmea_file.py +4 -5
- mapillary_tools/geotag/geotag_images_from_video.py +88 -51
- mapillary_tools/geotag/geotag_videos_from_exiftool.py +123 -0
- mapillary_tools/geotag/geotag_videos_from_gpx.py +52 -0
- mapillary_tools/geotag/geotag_videos_from_video.py +20 -185
- mapillary_tools/geotag/image_extractors/base.py +18 -0
- mapillary_tools/geotag/image_extractors/exif.py +60 -0
- mapillary_tools/geotag/image_extractors/exiftool.py +18 -0
- mapillary_tools/geotag/options.py +182 -0
- mapillary_tools/geotag/utils.py +52 -16
- mapillary_tools/geotag/video_extractors/base.py +18 -0
- mapillary_tools/geotag/video_extractors/exiftool.py +70 -0
- mapillary_tools/geotag/video_extractors/gpx.py +116 -0
- mapillary_tools/geotag/video_extractors/native.py +160 -0
- mapillary_tools/{geotag → gpmf}/gpmf_parser.py +205 -182
- mapillary_tools/{geotag → gpmf}/gps_filter.py +5 -3
- mapillary_tools/history.py +134 -20
- mapillary_tools/mp4/construct_mp4_parser.py +17 -10
- mapillary_tools/mp4/io_utils.py +0 -1
- mapillary_tools/mp4/mp4_sample_parser.py +36 -28
- mapillary_tools/mp4/simple_mp4_builder.py +10 -9
- mapillary_tools/mp4/simple_mp4_parser.py +13 -22
- mapillary_tools/process_geotag_properties.py +184 -414
- mapillary_tools/process_sequence_properties.py +594 -225
- mapillary_tools/sample_video.py +20 -26
- mapillary_tools/serializer/description.py +587 -0
- mapillary_tools/serializer/gpx.py +132 -0
- mapillary_tools/telemetry.py +26 -13
- mapillary_tools/types.py +98 -611
- mapillary_tools/upload.py +411 -387
- mapillary_tools/upload_api_v4.py +167 -142
- mapillary_tools/uploader.py +804 -284
- mapillary_tools/utils.py +49 -18
- {mapillary_tools-0.13.3.dist-info → mapillary_tools-0.14.0.dist-info}/METADATA +93 -35
- mapillary_tools-0.14.0.dist-info/RECORD +75 -0
- {mapillary_tools-0.13.3.dist-info → mapillary_tools-0.14.0.dist-info}/WHEEL +1 -1
- mapillary_tools/geotag/blackvue_parser.py +0 -118
- mapillary_tools/geotag/geotag_from_generic.py +0 -22
- mapillary_tools/geotag/geotag_images_from_exiftool_both_image_and_video.py +0 -93
- mapillary_tools/geotag/geotag_videos_from_exiftool_video.py +0 -145
- mapillary_tools/video_data_extraction/cli_options.py +0 -22
- mapillary_tools/video_data_extraction/extract_video_data.py +0 -176
- mapillary_tools/video_data_extraction/extractors/base_parser.py +0 -75
- mapillary_tools/video_data_extraction/extractors/blackvue_parser.py +0 -34
- mapillary_tools/video_data_extraction/extractors/camm_parser.py +0 -38
- mapillary_tools/video_data_extraction/extractors/exiftool_runtime_parser.py +0 -71
- mapillary_tools/video_data_extraction/extractors/exiftool_xml_parser.py +0 -53
- mapillary_tools/video_data_extraction/extractors/generic_video_parser.py +0 -52
- mapillary_tools/video_data_extraction/extractors/gopro_parser.py +0 -43
- mapillary_tools/video_data_extraction/extractors/gpx_parser.py +0 -108
- mapillary_tools/video_data_extraction/extractors/nmea_parser.py +0 -24
- mapillary_tools/video_data_extraction/video_data_parser_factory.py +0 -39
- mapillary_tools-0.13.3.dist-info/RECORD +0 -75
- /mapillary_tools/{geotag → gpmf}/gpmf_gps_filter.py +0 -0
- {mapillary_tools-0.13.3.dist-info → mapillary_tools-0.14.0.dist-info}/entry_points.txt +0 -0
- {mapillary_tools-0.13.3.dist-info → mapillary_tools-0.14.0.dist-info/licenses}/LICENSE +0 -0
- {mapillary_tools-0.13.3.dist-info → mapillary_tools-0.14.0.dist-info}/top_level.txt +0 -0
mapillary_tools/uploader.py
CHANGED
@@ -1,8 +1,16 @@
+from __future__ import annotations
+
+import concurrent.futures
+
+import dataclasses
 import io
 import json
 import logging
 import os
+import struct
+import sys
 import tempfile
+import threading
 import time
 import typing as T
 import uuid
@@ -10,33 +18,96 @@ import zipfile
 from contextlib import contextmanager
 from pathlib import Path

-
+if sys.version_info >= (3, 11):
+    from typing import Required
+else:
+    from typing_extensions import Required
+
 import requests

-from . import
+from . import (
+    api_v4,
+    config,
+    constants,
+    exif_write,
+    geo,
+    history,
+    telemetry,
+    types,
+    upload_api_v4,
+    utils,
+)
+from .camm import camm_builder, camm_parser
+from .gpmf import gpmf_parser
+from .mp4 import simple_mp4_builder
+from .serializer.description import (
+    desc_file_to_exif,
+    DescriptionJSONSerializer,
+    validate_image_desc,
+)


 LOG = logging.getLogger(__name__)


-
-
+@dataclasses.dataclass(frozen=True)
+class UploadOptions:
+    user_items: config.UserItem
+    chunk_size: int = int(constants.UPLOAD_CHUNK_SIZE_MB * 1024 * 1024)
+    dry_run: bool = False
+    nofinish: bool = False
+    noresume: bool = False
+
+
+class UploaderProgress(T.TypedDict, total=True):
+    """
+    Progress data that Uploader cares about.
+    """
+
+    # The size, in bytes, of the last chunk that has been read and upload
     chunk_size: int

-    #
-
+    # The initial offset returned by the upload service, which is also the offset
+    # uploader start uploading from.
+    # Assert:
+    # - 0 <= begin_offset <= offset <= entity_size
+    # - Be non-None after at least a successful "upload_fetch_offset"
+    begin_offset: int | None

-    # How many bytes has been uploaded so far
+    # How many bytes of the file has been uploaded so far
     offset: int

-    # Size in bytes of the
+    # Size in bytes of the file (i.e. fp.tell() after seek to the end)
+    # NOTE: It's different from filesize in file system
+    # Assert:
+    # - offset == entity_size when "upload_end" or "upload_finished"
     entity_size: int

+    # An "upload_interrupted" will increase it. Reset to 0 if a chunk is uploaded
+    retries: int
+
+    # Cluster ID after finishing the upload
+    cluster_id: str
+
+
+class SequenceProgress(T.TypedDict, total=False):
+    """Progress data at sequence level"""
+
+    # Used to check if it is uploaded or not
+    sequence_md5sum: Required[str]
+
+    # Used to resume from the previous upload,
+    # so it has to an unique identifier (hash) of the upload content
+    upload_md5sum: str
+
+    # File type
+    file_type: Required[str]
+
     # How many sequences in total. It's always 1 when uploading Zipfile/BlackVue/CAMM
-    total_sequence_count: int
+    total_sequence_count: Required[int]

     # 0-based nth sequence. It is always 0 when uploading Zipfile/BlackVue/CAMM
-    sequence_idx: int
+    sequence_idx: Required[int]

     # How many images in the sequence. It's available only when uploading directories/Zipfiles
     sequence_image_count: int
@@ -44,20 +115,31 @@ class Progress(T.TypedDict, total=False):
     # MAPSequenceUUID. It is only available for directory uploading
     sequence_uuid: str

-    # An "upload_interrupted" will increase it. Reset to 0 if the chunk is uploaded
-    retries: int
-
-    # md5sum of the zipfile/BlackVue/CAMM in uploading
-    md5sum: str
-
     # Path to the Zipfile/BlackVue/CAMM
     import_path: str

-    # Cluster ID after finishing the upload
-    cluster_id: str

+class Progress(SequenceProgress, UploaderProgress):
+    pass
+
+
+class SequenceError(Exception):
+    """
+    Base class for sequence specific errors. These errors will cause the
+    current sequence upload to fail but will not interrupt the overall upload
+    process for other sequences.
+    """

-
+    pass
+
+
+class ExifError(SequenceError):
+    def __init__(self, message: str, image_path: Path):
+        super().__init__(message)
+        self.image_path = image_path
+
+
+class InvalidMapillaryZipFileError(SequenceError):
     pass


@@ -65,14 +147,15 @@ EventName = T.Literal[
     "upload_start",
     "upload_fetch_offset",
     "upload_progress",
+    "upload_interrupted",
     "upload_end",
+    "upload_failed",
     "upload_finished",
-    "upload_interrupted",
 ]


 class EventEmitter:
-    events:
+    events: dict[EventName, list]

     def __init__(self):
         self.events = {}
@@ -80,6 +163,7 @@ class EventEmitter:
     def on(self, event: EventName):
         def _wrap(callback):
             self.events.setdefault(event, []).append(callback)
+            return callback

         return _wrap

@@ -88,237 +172,731 @@ class EventEmitter:
             callback(*args, **kwargs)


-
-
-
-
-        emitter: T.Optional[EventEmitter] = None,
-        chunk_size: int = upload_api_v4.DEFAULT_CHUNK_SIZE,
-        dry_run=False,
-    ):
-        jsonschema.validate(instance=user_items, schema=types.UserItemSchema)
-        self.user_items = user_items
-        self.emitter = emitter
-        self.chunk_size = chunk_size
-        self.dry_run = dry_run
+@dataclasses.dataclass
+class UploadResult:
+    result: str | None = None
+    error: Exception | None = None

-
-
+
+class VideoUploader:
+    @classmethod
+    def upload_videos(
+        cls, mly_uploader: Uploader, video_metadatas: T.Sequence[types.VideoMetadata]
+    ) -> T.Generator[tuple[types.VideoMetadata, UploadResult], None, None]:
+        # If upload in a random order, then interrupted uploads has a higher chance to expire.
+        # Therefore sort videos to make sure interrupted uploads are resumed as early as possible
+        sorted_video_metadatas = sorted(video_metadatas, key=lambda m: m.filename)
+
+        for idx, video_metadata in enumerate(sorted_video_metadatas):
+            LOG.debug(f"Checksum for video {video_metadata.filename}...")
+            try:
+                video_metadata.update_md5sum()
+            except Exception as ex:
+                yield video_metadata, UploadResult(error=ex)
+                continue
+
+            assert isinstance(video_metadata.md5sum, str), "md5sum should be updated"
+
+            progress: SequenceProgress = {
+                "total_sequence_count": len(sorted_video_metadatas),
+                "sequence_idx": idx,
+                "file_type": video_metadata.filetype.value,
+                "import_path": str(video_metadata.filename),
+                "sequence_md5sum": video_metadata.md5sum,
+            }
+
+            try:
+                with cls.build_camm_stream(video_metadata) as camm_fp:
+                    # Upload the mp4 stream
+                    file_handle = mly_uploader.upload_stream(
+                        T.cast(T.IO[bytes], camm_fp),
+                        progress=T.cast(T.Dict[str, T.Any], progress),
+                    )
+
+                cluster_id = mly_uploader.finish_upload(
+                    file_handle,
+                    api_v4.ClusterFileType.CAMM,
+                    progress=T.cast(T.Dict[str, T.Any], progress),
+                )
+            except Exception as ex:
+                yield video_metadata, UploadResult(error=ex)
+            else:
+                yield video_metadata, UploadResult(result=cluster_id)
+
+    @classmethod
+    @contextmanager
+    def build_camm_stream(cls, video_metadata: types.VideoMetadata):
+        # Convert video metadata to CAMMInfo
+        camm_info = cls.prepare_camm_info(video_metadata)
+
+        # Create the CAMM sample generator
+        camm_sample_generator = camm_builder.camm_sample_generator2(camm_info)
+
+        with video_metadata.filename.open("rb") as src_fp:
+            # Build the mp4 stream with the CAMM samples
+            yield simple_mp4_builder.transform_mp4(src_fp, camm_sample_generator)
+
+    @classmethod
+    def prepare_camm_info(
+        cls, video_metadata: types.VideoMetadata
+    ) -> camm_parser.CAMMInfo:
+        camm_info = camm_parser.CAMMInfo(
+            make=video_metadata.make or "", model=video_metadata.model or ""
+        )
+
+        for point in video_metadata.points:
+            if isinstance(point, telemetry.CAMMGPSPoint):
+                if camm_info.gps is None:
+                    camm_info.gps = []
+                camm_info.gps.append(point)
+
+            elif isinstance(point, telemetry.GPSPoint):
+                # There is no proper CAMM entry for GoPro GPS
+                if camm_info.mini_gps is None:
+                    camm_info.mini_gps = []
+                camm_info.mini_gps.append(point)
+
+            elif isinstance(point, geo.Point):
+                if camm_info.mini_gps is None:
+                    camm_info.mini_gps = []
+                camm_info.mini_gps.append(point)
+            else:
+                raise ValueError(f"Unknown point type: {point}")
+
+        if constants.MAPILLARY__EXPERIMENTAL_ENABLE_IMU:
+            if video_metadata.filetype is types.FileType.GOPRO:
+                with video_metadata.filename.open("rb") as fp:
+                    gopro_info = gpmf_parser.extract_gopro_info(fp, telemetry_only=True)
+                if gopro_info is not None:
+                    camm_info.accl = gopro_info.accl or []
+                    camm_info.gyro = gopro_info.gyro or []
+                    camm_info.magn = gopro_info.magn or []
+
+        return camm_info
+
+
+class ZipUploader:
+    @classmethod
+    def upload_zipfiles(
+        cls, mly_uploader: Uploader, zip_paths: T.Sequence[Path]
+    ) -> T.Generator[tuple[Path, UploadResult], None, None]:
+        # If upload in a random order, then interrupted uploads has a higher chance to expire.
+        # Therefore sort zipfiles to make sure interrupted uploads are resumed as early as possible
+        sorted_zip_paths = sorted(zip_paths)
+
+        for idx, zip_path in enumerate(sorted_zip_paths):
+            progress: SequenceProgress = {
+                "total_sequence_count": len(sorted_zip_paths),
+                "sequence_idx": idx,
+                "import_path": str(zip_path),
+                "file_type": types.FileType.ZIP.value,
+                "sequence_md5sum": "",  # Placeholder, will be set in upload_zipfile
+            }
+            try:
+                cluster_id = cls._upload_zipfile(
+                    mly_uploader,
+                    zip_path,
+                    progress=T.cast(T.Dict[str, T.Any], progress),
+                )
+            except Exception as ex:
+                yield zip_path, UploadResult(error=ex)
+            else:
+                yield zip_path, UploadResult(result=cluster_id)
+
+    @classmethod
+    def zip_images(
+        cls, metadatas: T.Sequence[types.ImageMetadata], zip_dir: Path
+    ) -> None:
+        """
+        Group images into sequences and zip each sequence into a zipfile.
+        """
+        sequences = types.group_and_sort_images(metadatas)
+        os.makedirs(zip_dir, exist_ok=True)
+
+        for sequence_uuid, sequence in sequences.items():
+            _validate_metadatas(sequence)
+            # For atomicity we write into a WIP file and then rename to the final file
+            wip_zip_filename = zip_dir.joinpath(
+                f".mly_zip_{uuid.uuid4()}_{sequence_uuid}_{os.getpid()}_{int(time.time())}"
+            )
+            with cls._wip_file_context(wip_zip_filename) as wip_path:
+                with wip_path.open("wb") as wip_fp:
+                    cls._zip_sequence_fp(sequence, wip_fp)
+
+    @classmethod
+    def zip_images_and_upload(
+        cls, uploader: Uploader, image_metadatas: T.Sequence[types.ImageMetadata]
+    ) -> T.Generator[tuple[str, UploadResult], None, None]:
+        sequences = types.group_and_sort_images(image_metadatas)
+
+        for sequence_idx, (sequence_uuid, sequence) in enumerate(sequences.items()):
+            try:
+                _validate_metadatas(sequence)
+            except Exception as ex:
+                yield sequence_uuid, UploadResult(error=ex)
+                continue
+
+            with tempfile.NamedTemporaryFile() as fp:
+                try:
+                    sequence_md5sum = cls._zip_sequence_fp(sequence, fp)
+                except Exception as ex:
+                    yield sequence_uuid, UploadResult(error=ex)
+                    continue
+
+                sequence_progress: SequenceProgress = {
+                    "sequence_idx": sequence_idx,
+                    "total_sequence_count": len(sequences),
+                    "sequence_image_count": len(sequence),
+                    "sequence_uuid": sequence_uuid,
+                    "file_type": types.FileType.ZIP.value,
+                    "sequence_md5sum": sequence_md5sum,
+                }
+
+                try:
+                    file_handle = uploader.upload_stream(
+                        fp, progress=T.cast(T.Dict[str, T.Any], sequence_progress)
+                    )
+                    cluster_id = uploader.finish_upload(
+                        file_handle,
+                        api_v4.ClusterFileType.ZIP,
+                        progress=T.cast(T.Dict[str, T.Any], sequence_progress),
+                    )
+                except Exception as ex:
+                    yield sequence_uuid, UploadResult(error=ex)
+                    continue
+
+            yield sequence_uuid, UploadResult(result=cluster_id)
+
+    @classmethod
+    def _upload_zipfile(
+        cls,
+        uploader: Uploader,
         zip_path: Path,
-
-    ) ->
-        if
-
+        progress: dict[str, T.Any] | None = None,
+    ) -> str:
+        if progress is None:
+            progress = {}

         with zipfile.ZipFile(zip_path) as ziph:
             namelist = ziph.namelist()
             if not namelist:
-
-
+                raise InvalidMapillaryZipFileError("Zipfile has no files")
+
+        with zip_path.open("rb") as zip_fp:
+            sequence_md5sum = cls._extract_sequence_md5sum(zip_fp)

-
-
+        # Send the copy of the input progress to each upload session, to avoid modifying the original one
+        mutable_progress: SequenceProgress = {
+            **T.cast(SequenceProgress, progress),
             "sequence_image_count": len(namelist),
+            "sequence_md5sum": sequence_md5sum,
+            "file_type": types.FileType.ZIP.value,
         }

-        with zip_path.open("rb") as
-
+        with zip_path.open("rb") as zip_fp:
+            file_handle = uploader.upload_stream(
+                zip_fp, progress=T.cast(T.Dict[str, T.Any], mutable_progress)
+            )
+
+        cluster_id = uploader.finish_upload(
+            file_handle,
+            api_v4.ClusterFileType.ZIP,
+            progress=T.cast(T.Dict[str, T.Any], mutable_progress),
+        )
+
+        return cluster_id
+
+    @classmethod
+    def _zip_sequence_fp(
+        cls,
+        sequence: T.Sequence[types.ImageMetadata],
+        zip_fp: T.IO[bytes],
+    ) -> str:
+        """
+        Write a sequence of ImageMetadata into the zipfile handle.
+        The sequence has to be one sequence and sorted.
+        """
+
+        sequence_groups = types.group_and_sort_images(sequence)
+        assert len(sequence_groups) == 1, (
+            f"Only one sequence is allowed but got {len(sequence_groups)}: {list(sequence_groups.keys())}"
+        )
+
+        if sequence:
+            LOG.debug(f"Checksum for sequence {sequence[0].MAPSequenceUUID}...")
+        sequence_md5sum = types.update_sequence_md5sum(sequence)
+
+        with zipfile.ZipFile(zip_fp, "w", zipfile.ZIP_DEFLATED) as zipf:
+            for idx, metadata in enumerate(sequence):
+                # Arcname should be unique, the name does not matter
+                arcname = f"{idx}.jpg"
+                zipinfo = zipfile.ZipInfo(arcname, date_time=(1980, 1, 1, 0, 0, 0))
+                zipf.writestr(zipinfo, SingleImageUploader.dump_image_bytes(metadata))
+            assert len(sequence) == len(set(zipf.namelist()))
+            zipf.comment = json.dumps(
+                {"sequence_md5sum": sequence_md5sum},
+                sort_keys=True,
+                separators=(",", ":"),
+            ).encode("utf-8")
+
+        return sequence_md5sum
+
+    @classmethod
+    def _extract_sequence_md5sum(cls, zip_fp: T.IO[bytes]) -> str:
+        with zipfile.ZipFile(zip_fp, "r", zipfile.ZIP_DEFLATED) as ziph:
+            comment = ziph.comment
+
+        if not comment:
+            raise InvalidMapillaryZipFileError("No comment found in the zipfile")
+
+        try:
+            decoded = comment.decode("utf-8")
+            zip_metadata = json.loads(decoded)
+        except UnicodeDecodeError as ex:
+            raise InvalidMapillaryZipFileError(str(ex)) from ex
+        except json.JSONDecodeError as ex:
+            raise InvalidMapillaryZipFileError(str(ex)) from ex
+
+        sequence_md5sum = zip_metadata.get("sequence_md5sum")
+
+        if not sequence_md5sum and not isinstance(sequence_md5sum, str):
+            raise InvalidMapillaryZipFileError("No sequence_md5sum found")
+
+        return sequence_md5sum

-
-
+    @classmethod
+    @contextmanager
+    def _wip_file_context(cls, wip_path: Path):
+        try:
+            os.remove(wip_path)
+        except FileNotFoundError:
+            pass
+        try:
+            yield wip_path
+
+            with wip_path.open("rb") as fp:
                 upload_md5sum = utils.md5sum_fp(fp).hexdigest()

-
-
-                fp,
-                upload_api_v4.ClusterFileType.ZIP,
-                upload_md5sum,
-                event_payload=final_event_payload,
+            done_path = wip_path.parent.joinpath(
+                _session_key(upload_md5sum, api_v4.ClusterFileType.ZIP)
             )

-
-
-
-
-
-
-
+            try:
+                os.remove(done_path)
+            except FileNotFoundError:
+                pass
+            wip_path.rename(done_path)
+        finally:
+            try:
+                os.remove(wip_path)
+            except FileNotFoundError:
+                pass
+

-
+class ImageSequenceUploader:
+    @classmethod
+    def upload_images(
+        cls, uploader: Uploader, image_metadatas: T.Sequence[types.ImageMetadata]
+    ) -> T.Generator[tuple[str, UploadResult], None, None]:
         sequences = types.group_and_sort_images(image_metadatas)
-
+
         for sequence_idx, (sequence_uuid, sequence) in enumerate(sequences.items()):
-
-
+            LOG.debug(f"Checksum for image sequence {sequence_uuid}...")
+            sequence_md5sum = types.update_sequence_md5sum(sequence)
+
+            sequence_progress: SequenceProgress = {
                 "sequence_idx": sequence_idx,
                 "total_sequence_count": len(sequences),
                 "sequence_image_count": len(sequence),
                 "sequence_uuid": sequence_uuid,
+                "file_type": types.FileType.IMAGE.value,
+                "sequence_md5sum": sequence_md5sum,
             }
-
-
-
-
-
-
-
-
-
-
+
+            try:
+                cluster_id = cls._upload_sequence(
+                    uploader,
+                    sequence,
+                    progress=T.cast(dict[str, T.Any], sequence_progress),
+                )
+            except Exception as ex:
+                yield sequence_uuid, UploadResult(error=ex)
+            else:
+                yield sequence_uuid, UploadResult(result=cluster_id)
+
+    @classmethod
+    def _upload_sequence(
+        cls,
+        uploader: Uploader,
+        sequence: T.Sequence[types.ImageMetadata],
+        progress: dict[str, T.Any],
+    ) -> str:
+        _validate_metadatas(sequence)
+
+        progress["entity_size"] = sum(m.filesize or 0 for m in sequence)
+        uploader.emitter.emit("upload_start", progress)
+
+        single_image_uploader = SingleImageUploader(uploader, progress=progress)
+        with concurrent.futures.ThreadPoolExecutor(
+            max_workers=constants.MAX_IMAGE_UPLOAD_WORKERS
+        ) as executor:
+            image_file_handles = list(
+                executor.map(single_image_uploader.upload, sequence)
+            )
+
+        manifest_file_handle = cls._upload_manifest(uploader, image_file_handles)
+
+        uploader.emitter.emit("upload_end", progress)
+
+        cluster_id = uploader.finish_upload(
+            manifest_file_handle,
+            api_v4.ClusterFileType.MLY_BUNDLE_MANIFEST,
+            progress=progress,
+        )
+
+        return cluster_id
+
+    @classmethod
+    def _upload_manifest(
+        cls, uploader: Uploader, image_file_handles: T.Sequence[str]
+    ) -> str:
+        uploader_without_emitter = Uploader(uploader.upload_options)
+
+        manifest = {
+            "version": "1",
+            "upload_type": "images",
+            "image_handles": image_file_handles,
+        }
+
+        with io.BytesIO() as manifest_fp:
+            manifest_fp.write(
+                json.dumps(manifest, sort_keys=True, separators=(",", ":")).encode(
+                    "utf-8"
                 )
-
-
-
+            )
+            manifest_fp.seek(0, io.SEEK_SET)
+            return uploader_without_emitter.upload_stream(
+                manifest_fp, session_key=f"{_prefixed_uuid4()}.json"
+            )
+
+
+class SingleImageUploader:
+    def __init__(
+        self,
+        uploader: Uploader,
+        progress: dict[str, T.Any] | None = None,
+    ):
+        self.uploader = uploader
+        self.progress = progress or {}
+        self.lock = threading.Lock()
+        self.cache = self._maybe_create_persistent_cache_instance(
+            uploader.upload_options.user_items
+        )
+
+    def upload(self, image_metadata: types.ImageMetadata) -> str:
+        mutable_progress = {
+            **(self.progress or {}),
+            "filename": str(image_metadata.filename),
+        }
+
+        image_bytes = self.dump_image_bytes(image_metadata)
+
+        uploader_without_emitter = Uploader(self.uploader.upload_options)
+
+        session_key = uploader_without_emitter._gen_session_key(
+            io.BytesIO(image_bytes), mutable_progress
+        )
+
+        file_handle = self._file_handle_cache_get(session_key)
+
+        if file_handle is None:
+            file_handle = uploader_without_emitter.upload_stream(
+                io.BytesIO(image_bytes),
+                session_key=session_key,
+                progress=mutable_progress,
+            )
+            self._file_handle_cache_set(session_key, file_handle)
+
+        # Override chunk_size with the actual filesize
+        mutable_progress["chunk_size"] = image_metadata.filesize
+
+        with self.lock:
+            self.uploader.emitter.emit("upload_progress", mutable_progress)
+
+        return file_handle
+
+    @classmethod
+    def dump_image_bytes(cls, metadata: types.ImageMetadata) -> bytes:
+        try:
+            edit = exif_write.ExifEdit(metadata.filename)
+        except struct.error as ex:
+            raise ExifError(f"Failed to load EXIF: {ex}", metadata.filename) from ex
+
+        # The cast is to fix the type checker error
+        edit.add_image_description(
+            T.cast(
+                T.Dict, desc_file_to_exif(DescriptionJSONSerializer.as_desc(metadata))
+            )
+        )
+
+        try:
+            return edit.dump_image_bytes()
+        except struct.error as ex:
+            raise ExifError(
+                f"Failed to dump EXIF bytes: {ex}", metadata.filename
+            ) from ex
+
+    @classmethod
+    def _maybe_create_persistent_cache_instance(
+        cls, user_items: config.UserItem
+    ) -> history.PersistentCache | None:
+        if not constants.UPLOAD_CACHE_DIR:
+            LOG.debug(
+                "Upload cache directory is set empty, skipping caching upload file handles"
+            )
+            return None
+
+        cache_path_dir = (
+            Path(constants.UPLOAD_CACHE_DIR)
+            .joinpath(api_v4.MAPILLARY_CLIENT_TOKEN.replace("|", "_"))
+            .joinpath(
+                user_items.get("MAPSettingsUserKey", user_items["user_upload_token"])
+            )
+        )
+        cache_path_dir.mkdir(parents=True, exist_ok=True)
+        cache_path = cache_path_dir.joinpath("cached_file_handles")
+        LOG.debug(f"File handle cache path: {cache_path}")
+
+        cache = history.PersistentCache(str(cache_path.resolve()))
+        cache.clear_expired()
+
+        return cache
+
+    def _file_handle_cache_get(self, key: str) -> str | None:
+        if self.cache is None:
+            return None
+
+        if _is_uuid(key):
+            return None
+
+        return self.cache.get(key)
+
+    def _file_handle_cache_set(self, key: str, value: str) -> None:
+        if self.cache is None:
+            return
+
+        if _is_uuid(key):
+            return
+
+        self.cache.set(key, value)
+
+
+class Uploader:
+    def __init__(
+        self, upload_options: UploadOptions, emitter: EventEmitter | None = None
+    ):
+        self.upload_options = upload_options
+        if emitter is None:
+            # An empty event emitter that does nothing
+            self.emitter = EventEmitter()
+        else:
+            self.emitter = emitter

     def upload_stream(
         self,
         fp: T.IO[bytes],
-
-
-
-
-
-
+        session_key: str | None = None,
+        progress: dict[str, T.Any] | None = None,
+    ) -> str:
+        if progress is None:
+            progress = {}
+
+        if session_key is None:
+            session_key = self._gen_session_key(fp, progress)

         fp.seek(0, io.SEEK_END)
         entity_size = fp.tell()

-
-
-
-
-
-
-
-
-
-
-
-
-
-            cluster_filetype=cluster_filetype,
-            chunk_size=self.chunk_size,
+        progress["entity_size"] = entity_size
+        progress["chunk_size"] = self.upload_options.chunk_size
+        progress["retries"] = 0
+        progress["begin_offset"] = None
+
+        self.emitter.emit("upload_start", progress)
+
+        upload_service = self._create_upload_service(session_key)
+
+        while True:
+            try:
+                file_handle = self._upload_stream_retryable(
+                    upload_service, fp, T.cast(UploaderProgress, progress)
                 )
+            except Exception as ex:
+                self._handle_upload_exception(ex, T.cast(UploaderProgress, progress))
+            except BaseException as ex:
+                self.emitter.emit("upload_failed", progress)
+                raise ex
+            else:
+                break
+
+            progress["retries"] += 1
+
+        self.emitter.emit("upload_end", progress)
+
+        return file_handle
+
+    def finish_upload(
+        self,
+        file_handle: str,
+        cluster_filetype: api_v4.ClusterFileType,
+        progress: dict[str, T.Any] | None = None,
+    ) -> str:
+        """Finish upload with safe retries guraranteed"""
+        if progress is None:
+            progress = {}
+
+        if self.upload_options.dry_run or self.upload_options.nofinish:
+            cluster_id = "0"
+        else:
+            resp = api_v4.finish_upload(
+                self.upload_options.user_items["user_upload_token"],
+                file_handle,
+                cluster_filetype,
+                organization_id=self.upload_options.user_items.get(
+                    "MAPOrganizationKey"
+                ),
+            )
+
+            body = api_v4.jsonify_response(resp)
+            # TODO: Validate cluster_id
+            cluster_id = body.get("cluster_id")
+
+        progress["cluster_id"] = cluster_id
+        self.emitter.emit("upload_finished", progress)
+
+        return cluster_id
+
+    def _create_upload_service(self, session_key: str) -> upload_api_v4.UploadService:
+        upload_service: upload_api_v4.UploadService
+
+        if self.upload_options.dry_run:
+            upload_path = os.getenv("MAPILLARY_UPLOAD_ENDPOINT")
+            upload_service = upload_api_v4.FakeUploadService(
+                user_access_token=self.upload_options.user_items["user_upload_token"],
+                session_key=session_key,
+                upload_path=Path(upload_path) if upload_path is not None else None,
+            )
+            LOG.info(
+                "Dry run mode enabled. Data will be uploaded to %s",
+                upload_service.upload_path.joinpath(session_key),
             )
         else:
             upload_service = upload_api_v4.UploadService(
-                user_access_token=self.user_items["user_upload_token"],
+                user_access_token=self.upload_options.user_items["user_upload_token"],
                 session_key=session_key,
-                organization_id=self.user_items.get("MAPOrganizationKey"),
-                cluster_filetype=cluster_filetype,
-                chunk_size=self.chunk_size,
             )

-
-            **event_payload,  # type: ignore
-            "entity_size": entity_size,
-            "md5sum": upload_md5sum,
-        }
+        return upload_service

-
-
-
-
-
-
+    def _handle_upload_exception(
+        self, ex: Exception, progress: UploaderProgress
+    ) -> None:
+        retries = progress.get("retries", 0)
+        begin_offset = progress.get("begin_offset")
+        offset = progress.get("offset")
+
+        if retries <= constants.MAX_UPLOAD_RETRIES and _is_retriable_exception(ex):
+            self.emitter.emit("upload_interrupted", progress)
+            LOG.warning(
+                f"Error uploading at {offset=} since {begin_offset=}: {ex.__class__.__name__}: {ex}"
             )
-
-
+            # Keep things immutable here. Will increment retries in the caller
+            retries += 1
+            if _is_immediate_retriable_exception(ex):
+                sleep_for = 0
+            else:
+                sleep_for = min(2**retries, 16)
+            LOG.info(
+                f"Retrying in {sleep_for} seconds ({retries}/{constants.MAX_UPLOAD_RETRIES})"
+            )
+            if sleep_for:
+                time.sleep(sleep_for)
+        else:
+            self.emitter.emit("upload_failed", progress)
+            raise ex
+
+    def _chunk_with_progress_emitted(
+        self,
+        stream: T.IO[bytes],
+        progress: UploaderProgress,
+    ) -> T.Generator[bytes, None, None]:
+        for chunk in upload_api_v4.UploadService.chunkize_byte_stream(
+            stream, self.upload_options.chunk_size
+        ):
+            yield chunk
+
+            progress["offset"] += len(chunk)
+            progress["chunk_size"] = len(chunk)
+            # Whenever a chunk is uploaded, reset retries
+            progress["retries"] = 0
+
+            self.emitter.emit("upload_progress", progress)
+
+    def _upload_stream_retryable(
+        self,
+        upload_service: upload_api_v4.UploadService,
+        fp: T.IO[bytes],
+        progress: UploaderProgress,
+    ) -> str:
+        """Upload the stream with safe retries guraranteed"""
+
+        begin_offset = upload_service.fetch_offset()
+
+        progress["begin_offset"] = begin_offset
+        progress["offset"] = begin_offset
+
+        if not constants.MIN_UPLOAD_SPEED:
+            read_timeout = None
+        else:
+            remaining_bytes = abs(progress["entity_size"] - begin_offset)
+            read_timeout = max(
+                api_v4.REQUESTS_TIMEOUT, remaining_bytes / constants.MIN_UPLOAD_SPEED
+            )
+
+        self.emitter.emit("upload_fetch_offset", progress)
+
+        fp.seek(begin_offset, io.SEEK_SET)
+
+        shifted_chunks = self._chunk_with_progress_emitted(fp, progress)
+
+        return upload_service.upload_shifted_chunks(
+            shifted_chunks, begin_offset, read_timeout=read_timeout
+        )
+
+    def _gen_session_key(self, fp: T.IO[bytes], progress: dict[str, T.Any]) -> str:
+        if self.upload_options.noresume:
+            # Generate a unique UUID for session_key when noresume is True
+            # to prevent resuming from previous uploads
+            session_key = f"{_prefixed_uuid4()}"
+        else:
+            fp.seek(0, io.SEEK_SET)
+            session_key = utils.md5sum_fp(fp).hexdigest()
+
+        filetype = progress.get("file_type")
+        if filetype is not None:
+            session_key = _session_key(session_key, types.FileType(filetype))
+
+        return session_key


 def _validate_metadatas(metadatas: T.Sequence[types.ImageMetadata]):
     for metadata in metadatas:
-
+        validate_image_desc(DescriptionJSONSerializer.as_desc(metadata))
         if not metadata.filename.is_file():
             raise FileNotFoundError(f"No such file {metadata.filename}")


-
-def wip_file_context(wip_path: Path, done_path: Path):
-    assert wip_path != done_path, "should not be the same file"
-    try:
-        os.remove(wip_path)
-    except FileNotFoundError:
-        pass
-    try:
-        yield wip_path
-        try:
-            os.remove(done_path)
-        except FileNotFoundError:
-            pass
-        wip_path.rename(done_path)
-    finally:
-        try:
-            os.remove(wip_path)
-        except FileNotFoundError:
-            pass
-
-
-def zip_images(
-    metadatas: T.List[types.ImageMetadata],
-    zip_dir: Path,
-) -> None:
-    _validate_metadatas(metadatas)
-    sequences = types.group_and_sort_images(metadatas)
-    os.makedirs(zip_dir, exist_ok=True)
-    for sequence_uuid, sequence in sequences.items():
-        for metadata in sequence:
-            metadata.update_md5sum()
-        upload_md5sum = types.sequence_md5sum(sequence)
-        timestamp = int(time.time())
-        wip_zip_filename = zip_dir.joinpath(
-            f".mly_zip_{uuid.uuid4()}_{sequence_uuid}_{os.getpid()}_{timestamp}"
-        )
-        zip_filename = zip_dir.joinpath(f"mly_tools_{upload_md5sum}.zip")
-        with wip_file_context(wip_zip_filename, zip_filename) as wip_dir:
-            with wip_dir.open("wb") as fp:
-                _zip_sequence_fp(sequence, fp, upload_md5sum)
-
-
-def _zip_sequence_fp(
-    sequence: T.Sequence[types.ImageMetadata],
-    fp: T.IO[bytes],
-    upload_md5sum: str,
-) -> None:
-    arcname_idx = 0
-    arcnames = set()
-    with zipfile.ZipFile(fp, "w", zipfile.ZIP_DEFLATED) as ziph:
-        for metadata in sequence:
-            edit = exif_write.ExifEdit(metadata.filename)
-            # The cast is to fix the type checker error
-            edit.add_image_description(
-                T.cast(T.Dict, types.desc_file_to_exif(types.as_desc(metadata)))
-            )
-            image_bytes = edit.dump_image_bytes()
-            arcname: str = metadata.filename.name
-            # make sure the arcname is unique, otherwise zipfile.extractAll will eliminate duplicated ones
-            while arcname in arcnames:
-                arcname_idx += 1
-                arcname = (
-                    f"{metadata.filename.stem}_{arcname_idx}{metadata.filename.suffix}"
-                )
-            arcnames.add(arcname)
-            zipinfo = zipfile.ZipInfo(arcname, date_time=(1980, 1, 1, 0, 0, 0))
-            ziph.writestr(zipinfo, image_bytes)
-        ziph.comment = json.dumps({"upload_md5sum": upload_md5sum}).encode("utf-8")
-        assert len(sequence) == len(set(ziph.namelist()))
-
-
-def _extract_upload_md5sum(fp: T.IO[bytes]) -> T.Optional[str]:
-    with zipfile.ZipFile(fp, "r", zipfile.ZIP_DEFLATED) as ziph:
-        comment = ziph.comment
-        if not comment:
-            return None
-        try:
-            upload_md5sum = json.loads(comment.decode("utf-8")).get("upload_md5sum")
-        except Exception:
-            return None
-        if not upload_md5sum:
-            return None
-        return str(upload_md5sum)
-
-
-def _is_immediate_retry(ex: Exception):
+def _is_immediate_retriable_exception(ex: Exception) -> bool:
     if (
         isinstance(ex, requests.HTTPError)
         and isinstance(ex.response, requests.Response)
@@ -331,8 +909,10 @@ def _is_immediate_retry(ex: Exception):
         # resp: {"debug_info":{"retriable":true,"type":"OffsetInvalidError","message":"Request starting offset is invalid"}}
         return resp.get("debug_info", {}).get("retriable", False)

+    return False
+

-def _is_retriable_exception(ex: Exception):
+def _is_retriable_exception(ex: Exception) -> bool:
     if isinstance(ex, (requests.ConnectionError, requests.Timeout)):
         return True

@@ -351,89 +931,29 @@ def _is_retriable_exception(ex: Exception):
     return False


-def
-
-    assert isinstance(emitter, EventEmitter)
-        mutable_payload["offset"] += len(chunk)
-        mutable_payload["chunk_size"] = len(chunk)
-        emitter.emit("upload_progress", mutable_payload)
-
-        return _callback
-
-
-def _upload_stream(
-    upload_service: upload_api_v4.UploadService,
-    fp: T.IO[bytes],
-    event_payload: T.Optional[Progress] = None,
-    emitter: T.Optional[EventEmitter] = None,
+def _session_key(
+    upload_md5sum: str, filetype: api_v4.ClusterFileType | types.FileType
 ) -> str:
-
-
-
-
-
-
-
-
-
-
-
-
-    if emitter:
-        emitter.emit("upload_start", mutable_payload)
+    _SUFFIX_MAP: dict[api_v4.ClusterFileType | types.FileType, str] = {
+        api_v4.ClusterFileType.ZIP: ".zip",
+        api_v4.ClusterFileType.CAMM: ".mp4",
+        api_v4.ClusterFileType.BLACKVUE: ".mp4",
+        types.FileType.IMAGE: ".jpg",
+        types.FileType.ZIP: ".zip",
+        types.FileType.BLACKVUE: ".mp4",
+        types.FileType.CAMM: ".mp4",
+        types.FileType.GOPRO: ".mp4",
+        types.FileType.VIDEO: ".mp4",
+    }

-
-    fp.seek(0, io.SEEK_SET)
-    begin_offset: T.Optional[int] = None
-    try:
-        begin_offset = upload_service.fetch_offset()
-        upload_service.callbacks = [_reset_retries]
-        if emitter:
-            mutable_payload["offset"] = begin_offset
-            mutable_payload["retries"] = retries
-            emitter.emit("upload_fetch_offset", mutable_payload)
-        upload_service.callbacks.append(
-            _setup_callback(emitter, mutable_payload)
-        )
-        file_handle = upload_service.upload(fp, offset=begin_offset)
-    except Exception as ex:
-        if retries < constants.MAX_UPLOAD_RETRIES and _is_retriable_exception(ex):
-            if emitter:
-                emitter.emit("upload_interrupted", mutable_payload)
-            LOG.warning(
-                # use %s instead of %d because offset could be None
-                "Error uploading chunk_size %d at begin_offset %s: %s: %s",
-                upload_service.chunk_size,
-                begin_offset,
-                ex.__class__.__name__,
-                str(ex),
-            )
-            retries += 1
-            if _is_immediate_retry(ex):
-                sleep_for = 0
-            else:
-                sleep_for = min(2**retries, 16)
-            LOG.info(
-                "Retrying in %d seconds (%d/%d)",
-                sleep_for,
-                retries,
-                constants.MAX_UPLOAD_RETRIES,
-            )
-            if sleep_for:
-                time.sleep(sleep_for)
-        else:
-            raise ex
-    else:
-        break
+    return f"mly_tools_{upload_md5sum}{_SUFFIX_MAP[filetype]}"

-    if emitter:
-        emitter.emit("upload_end", mutable_payload)

-
-
+def _prefixed_uuid4():
+    prefixed = f"uuid_{uuid.uuid4().hex}"
+    assert _is_uuid(prefixed)
+    return prefixed

-    if emitter:
-        mutable_payload["cluster_id"] = cluster_id
-        emitter.emit("upload_finished", mutable_payload)

-
+def _is_uuid(session_key: str) -> bool:
+    return session_key.startswith("uuid_")