scale-nucleus 0.1.22__py3-none-any.whl → 0.6.4__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in that registry.
- cli/client.py +14 -0
- cli/datasets.py +77 -0
- cli/helpers/__init__.py +0 -0
- cli/helpers/nucleus_url.py +10 -0
- cli/helpers/web_helper.py +40 -0
- cli/install_completion.py +33 -0
- cli/jobs.py +42 -0
- cli/models.py +35 -0
- cli/nu.py +42 -0
- cli/reference.py +8 -0
- cli/slices.py +62 -0
- cli/tests.py +121 -0
- nucleus/__init__.py +453 -699
- nucleus/annotation.py +435 -80
- nucleus/autocurate.py +9 -0
- nucleus/connection.py +87 -0
- nucleus/constants.py +12 -2
- nucleus/data_transfer_object/__init__.py +0 -0
- nucleus/data_transfer_object/dataset_details.py +9 -0
- nucleus/data_transfer_object/dataset_info.py +26 -0
- nucleus/data_transfer_object/dataset_size.py +5 -0
- nucleus/data_transfer_object/scenes_list.py +18 -0
- nucleus/dataset.py +1139 -215
- nucleus/dataset_item.py +130 -26
- nucleus/dataset_item_uploader.py +297 -0
- nucleus/deprecation_warning.py +32 -0
- nucleus/errors.py +21 -1
- nucleus/job.py +71 -3
- nucleus/logger.py +9 -0
- nucleus/metadata_manager.py +45 -0
- nucleus/metrics/__init__.py +10 -0
- nucleus/metrics/base.py +117 -0
- nucleus/metrics/categorization_metrics.py +197 -0
- nucleus/metrics/errors.py +7 -0
- nucleus/metrics/filters.py +40 -0
- nucleus/metrics/geometry.py +198 -0
- nucleus/metrics/metric_utils.py +28 -0
- nucleus/metrics/polygon_metrics.py +480 -0
- nucleus/metrics/polygon_utils.py +299 -0
- nucleus/model.py +121 -15
- nucleus/model_run.py +34 -57
- nucleus/payload_constructor.py +30 -18
- nucleus/prediction.py +259 -17
- nucleus/pydantic_base.py +26 -0
- nucleus/retry_strategy.py +4 -0
- nucleus/scene.py +204 -19
- nucleus/slice.py +230 -67
- nucleus/upload_response.py +20 -9
- nucleus/url_utils.py +4 -0
- nucleus/utils.py +139 -35
- nucleus/validate/__init__.py +24 -0
- nucleus/validate/client.py +168 -0
- nucleus/validate/constants.py +20 -0
- nucleus/validate/data_transfer_objects/__init__.py +0 -0
- nucleus/validate/data_transfer_objects/eval_function.py +81 -0
- nucleus/validate/data_transfer_objects/scenario_test.py +19 -0
- nucleus/validate/data_transfer_objects/scenario_test_evaluations.py +11 -0
- nucleus/validate/data_transfer_objects/scenario_test_metric.py +12 -0
- nucleus/validate/errors.py +6 -0
- nucleus/validate/eval_functions/__init__.py +0 -0
- nucleus/validate/eval_functions/available_eval_functions.py +212 -0
- nucleus/validate/eval_functions/base_eval_function.py +60 -0
- nucleus/validate/scenario_test.py +143 -0
- nucleus/validate/scenario_test_evaluation.py +114 -0
- nucleus/validate/scenario_test_metric.py +14 -0
- nucleus/validate/utils.py +8 -0
- {scale_nucleus-0.1.22.dist-info → scale_nucleus-0.6.4.dist-info}/LICENSE +0 -0
- scale_nucleus-0.6.4.dist-info/METADATA +213 -0
- scale_nucleus-0.6.4.dist-info/RECORD +71 -0
- {scale_nucleus-0.1.22.dist-info → scale_nucleus-0.6.4.dist-info}/WHEEL +1 -1
- scale_nucleus-0.6.4.dist-info/entry_points.txt +3 -0
- scale_nucleus-0.1.22.dist-info/METADATA +0 -85
- scale_nucleus-0.1.22.dist-info/RECORD +0 -21
nucleus/dataset_item.py
CHANGED
```diff
@@ -1,36 +1,53 @@
-from collections import Counter
 import json
 import os.path
+from collections import Counter
 from dataclasses import dataclass
-from typing import Optional, Sequence, Dict, Any
 from enum import Enum
+from typing import Any, Dict, Optional, Sequence
 
-from .annotation import
+from .annotation import Point3D, is_local_path
 from .constants import (
+    CAMERA_PARAMS_KEY,
+    CX_KEY,
+    CY_KEY,
+    FX_KEY,
+    FY_KEY,
+    HEADING_KEY,
     IMAGE_URL_KEY,
     METADATA_KEY,
     ORIGINAL_IMAGE_URL_KEY,
-
+    POINTCLOUD_URL_KEY,
+    POSITION_KEY,
     REFERENCE_ID_KEY,
     TYPE_KEY,
+    UPLOAD_TO_SCALE_KEY,
     URL_KEY,
-
-    POINTCLOUD_URL_KEY,
+    W_KEY,
     X_KEY,
     Y_KEY,
     Z_KEY,
-    W_KEY,
-    POSITION_KEY,
-    HEADING_KEY,
-    FX_KEY,
-    FY_KEY,
-    CX_KEY,
-    CY_KEY,
 )
 
 
 @dataclass
 class Quaternion:
+    """Quaternion objects are used to represent rotation.
+
+    We use the Hamilton/right-handed quaternion convention, where
+    ::
+
+        i^2 = j^2 = k^2 = ijk = -1
+
+    The quaternion represented by the tuple ``(x, y, z, w)`` is equal to
+    ``w + x*i + y*j + z*k``.
+
+    Parameters:
+        x (float): The x value.
+        y (float): The y value.
+        x (float): The z value.
+        w (float): The w value.
+    """
+
     x: float
     y: float
     z: float
@@ -38,11 +55,13 @@ class Quaternion:
 
     @classmethod
     def from_json(cls, payload: Dict[str, float]):
+        """Instantiates quaternion object from schematized JSON dict payload."""
         return cls(
             payload[X_KEY], payload[Y_KEY], payload[Z_KEY], payload[W_KEY]
         )
 
     def to_payload(self) -> dict:
+        """Serializes quaternion object to schematized JSON dict."""
         return {
             X_KEY: self.x,
             Y_KEY: self.y,
@@ -53,6 +72,20 @@ class Quaternion:
 
 @dataclass
 class CameraParams:
+    """Camera position/heading used to record the image.
+
+    Args:
+        position (:class:`Point3D`): World-normalized position of the camera
+        heading (:class:`Quaternion`): Vector4 indicating the quaternion of the
+            camera direction; note that the z-axis of the camera frame
+            represents the camera's optical axis. See `Heading Examples
+            <https://docs.scale.com/reference/data-types-and-the-frame-objects#heading-examples>`_.
+        fx (float): Focal length in x direction (in pixels).
+        fy (float): Focal length in y direction (in pixels).
+        cx (float): Principal point x value.
+        cy (float): Principal point y value.
+    """
+
     position: Point3D
     heading: Quaternion
     fx: float
@@ -62,6 +95,7 @@ class CameraParams:
 
     @classmethod
     def from_json(cls, payload: Dict[str, Any]):
+        """Instantiates camera params object from schematized JSON dict payload."""
         return cls(
             Point3D.from_json(payload[POSITION_KEY]),
             Quaternion.from_json(payload[HEADING_KEY]),
@@ -72,6 +106,7 @@ class CameraParams:
         )
 
     def to_payload(self) -> dict:
+        """Serializes camera params object to schematized JSON dict."""
        return {
             POSITION_KEY: self.position.to_payload(),
             HEADING_KEY: self.heading.to_payload(),
@@ -89,14 +124,87 @@ class DatasetItemType(Enum):
 
 @dataclass  # pylint: disable=R0902
 class DatasetItem:  # pylint: disable=R0902
+    """A dataset item is an image or pointcloud that has associated metadata.
+
+    Note: for 3D data, please include a :class:`CameraParams` object under a key named
+    "camera_params" within the metadata dictionary. This will allow for projecting
+    3D annotations to any image within a scene.
+
+    Args:
+        image_location (Optional[str]): Required if pointcloud_location not present: The
+            location containing the image for the given row of data. This can be a
+            local path, or a remote URL. Remote formats supported include any URL
+            (``http://`` or ``https://``) or URIs for AWS S3, Azure, or GCS
+            (i.e. ``s3://``, ``gcs://``).
+
+        pointcloud_location (Optional[str]): Required if image_location not
+            present: The remote URL containing the pointcloud JSON. Remote
+            formats supported include any URL (``http://`` or ``https://``) or
+            URIs for AWS S3, Azure, or GCS (i.e. ``s3://``, ``gcs://``).
+
+        reference_id (Optional[str]): A user-specified identifier to reference the
+            item.
+
+        metadata (Optional[dict]): Extra information about the particular
+            dataset item. ints, floats, string values will be made searchable in
+            the query bar by the key in this dict For example, ``{"animal":
+            "dog"}`` will become searchable via ``metadata.animal = "dog"``.
+
+            Categorical data can be passed as a string and will be treated
+            categorically by Nucleus if there are less than 250 unique values in the
+            dataset. This means histograms of values in the "Insights" section and
+            autocomplete within the query bar.
+
+            Numerical metadata will generate histograms in the "Insights" section,
+            allow for sorting the results of any query, and can be used with the
+            modulo operator For example: metadata.frame_number % 5 = 0
+
+            All other types of metadata will be visible from the dataset item detail
+            view.
+
+            It is important that string and numerical metadata fields are consistent
+            - if a metadata field has a string value, then all metadata fields with
+            the same key should also have string values, and vice versa for numerical
+            metadata. If conflicting types are found, Nucleus will return an error
+            during upload!
+
+            The recommended way of adding or updating existing metadata is to re-run
+            the ingestion (dataset.append) with update=True, which will replace any
+            existing metadata with whatever your new ingestion run uses. This will
+            delete any metadata keys that are not present in the new ingestion run.
+            We have a cache based on image_location that will skip the need for a
+            re-upload of the images, so your second ingestion will be faster than
+            your first.
+
+            For 3D (sensor fusion) data, it is highly recommended to include
+            camera intrinsics the metadata of your camera image items. Nucleus
+            requires these intrinsics to create visualizations such as cuboid
+            projections. Refer to our `guide to uploading 3D data
+            <https://nucleus.scale.com/docs/uploading-3d-data>`_ for more
+            info.
+
+            .. todo ::
+                Shorten this once we have a guide migrated for metadata, or maybe link
+                from other places to here.
+
+        upload_to_scale (Optional[bool]): Set this to false in order to use
+            `privacy mode <https://nucleus.scale.com/docs/privacy-mode>`_.
+
+            Setting this to false means the actual data within the item (i.e. the
+            image or pointcloud) will not be uploaded to scale meaning that you can
+            send in links that are only accessible to certain users, and not to Scale.
+    """
+
     image_location: Optional[str] = None
-    reference_id:
+    reference_id: str = (
+        "DUMMY_VALUE"  # preserve argument ordering for backwards compatibility
+    )
     metadata: Optional[dict] = None
     pointcloud_location: Optional[str] = None
     upload_to_scale: Optional[bool] = True
 
     def __post_init__(self):
-        assert self.reference_id
+        assert self.reference_id != "DUMMY_VALUE", "reference_id is required."
         assert bool(self.image_location) != bool(
             self.pointcloud_location
         ), "Must specify exactly one of the image_location, pointcloud_location parameters"
@@ -122,30 +230,25 @@ class DatasetItem:  # pylint: disable=R0902
         )
 
     @classmethod
-    def from_json(cls, payload: dict
+    def from_json(cls, payload: dict):
+        """Instantiates dataset item object from schematized JSON dict payload."""
         image_url = payload.get(IMAGE_URL_KEY, None) or payload.get(
             ORIGINAL_IMAGE_URL_KEY, None
         )
-
-        if is_scene:
-            return cls(
-                image_location=image_url,
-                pointcloud_location=payload.get(POINTCLOUD_URL_KEY, None),
-                reference_id=payload.get(REFERENCE_ID_KEY, None),
-                metadata=payload.get(METADATA_KEY, {}),
-            )
-
         return cls(
             image_location=image_url,
+            pointcloud_location=payload.get(POINTCLOUD_URL_KEY, None),
             reference_id=payload.get(REFERENCE_ID_KEY, None),
             metadata=payload.get(METADATA_KEY, {}),
-            upload_to_scale=payload.get(UPLOAD_TO_SCALE_KEY,
+            upload_to_scale=payload.get(UPLOAD_TO_SCALE_KEY, True),
         )
 
     def local_file_exists(self):
+        # TODO: make private
        return os.path.isfile(self.image_location)
 
     def to_payload(self, is_scene=False) -> dict:
+        """Serializes dataset item object to schematized JSON dict."""
         payload: Dict[str, Any] = {
             METADATA_KEY: self.metadata or {},
         }
@@ -170,6 +273,7 @@ class DatasetItem:  # pylint: disable=R0902
         return payload
 
     def to_json(self) -> str:
+        """Serializes dataset item object to schematized JSON string."""
        return json.dumps(self.to_payload(), allow_nan=False)
 
 
```
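The expanded `DatasetItem` docstring above explains how camera intrinsics ride along in item metadata for 3D data and how metadata keys become searchable. A minimal sketch of constructing such an item with the classes from this diff is below; all field values, the bucket URI, and the reference id are invented for illustration, and the `to_payload()` serialization of the camera parameters is an assumption (the docstring only says the parameters live under a `"camera_params"` key).

```python
from nucleus.annotation import Point3D
from nucleus.dataset_item import CameraParams, DatasetItem, Quaternion

# Hypothetical intrinsics/extrinsics; only the structure follows the docstring.
camera_params = CameraParams(
    position=Point3D(0.0, 0.0, 1.5),
    heading=Quaternion(x=0.0, y=0.0, z=0.0, w=1.0),
    fx=1000.0,
    fy=1000.0,
    cx=640.0,
    cy=360.0,
)

item = DatasetItem(
    image_location="s3://my-bucket/frames/cam_front/0001.jpg",  # hypothetical URI
    reference_id="scene-0001-cam-front",  # hypothetical id
    metadata={
        # Camera parameters go under the "camera_params" key per the docstring note;
        # shown here in their to_payload() dict form.
        "camera_params": camera_params.to_payload(),
        # String metadata becomes searchable, e.g. metadata.animal = "dog".
        "animal": "dog",
    },
)
```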
nucleus/dataset_item_uploader.py
ADDED
```diff
@@ -0,0 +1,297 @@
+import asyncio
+import json
+import os
+import time
+from typing import TYPE_CHECKING, Any, List
+
+import aiohttp
+import nest_asyncio
+
+from .constants import (
+    DATASET_ID_KEY,
+    DEFAULT_NETWORK_TIMEOUT_SEC,
+    IMAGE_KEY,
+    IMAGE_URL_KEY,
+    ITEMS_KEY,
+    UPDATE_KEY,
+)
+from .dataset_item import DatasetItem
+from .errors import NotFoundError
+from .logger import logger
+from .payload_constructor import construct_append_payload
+from .retry_strategy import RetryStrategy
+from .upload_response import UploadResponse
+
+if TYPE_CHECKING:
+    from . import NucleusClient
+
+
+class DatasetItemUploader:
+    def __init__(self, dataset_id: str, client: "NucleusClient"):  # noqa: F821
+        self.dataset_id = dataset_id
+        self._client = client
+
+    def upload(
+        self,
+        dataset_items: List[DatasetItem],
+        batch_size: int = 20,
+        update: bool = False,
+    ) -> UploadResponse:
+        """
+
+        Args:
+            dataset_items: Items to Upload
+            batch_size: How many items to pool together for a single request
+            update: Update records instead of overwriting
+
+        Returns:
+
+        """
+        local_items = []
+        remote_items = []
+
+        # Check local files exist before sending requests
+        for item in dataset_items:
+            if item.local:
+                if not item.local_file_exists():
+                    raise NotFoundError()
+                local_items.append(item)
+            else:
+                remote_items.append(item)
+
+        local_batches = [
+            local_items[i : i + batch_size]
+            for i in range(0, len(local_items), batch_size)
+        ]
+
+        remote_batches = [
+            remote_items[i : i + batch_size]
+            for i in range(0, len(remote_items), batch_size)
+        ]
+
+        agg_response = UploadResponse(json={DATASET_ID_KEY: self.dataset_id})
+
+        async_responses: List[Any] = []
+
+        if local_batches:
+            tqdm_local_batches = self._client.tqdm_bar(
+                local_batches, desc="Local file batches"
+            )
+
+            for batch in tqdm_local_batches:
+                payload = construct_append_payload(batch, update)
+                responses = self._process_append_requests_local(
+                    self.dataset_id, payload, update
+                )
+                async_responses.extend(responses)
+
+        if remote_batches:
+            tqdm_remote_batches = self._client.tqdm_bar(
+                remote_batches, desc="Remote file batches"
+            )
+            for batch in tqdm_remote_batches:
+                payload = construct_append_payload(batch, update)
+                responses = self._process_append_requests(
+                    dataset_id=self.dataset_id,
+                    payload=payload,
+                    update=update,
+                    batch_size=batch_size,
+                )
+                async_responses.extend(responses)
+
+        for response in async_responses:
+            agg_response.update_response(response)
+
+        return agg_response
+
+    def _process_append_requests_local(
+        self,
+        dataset_id: str,
+        payload: dict,
+        update: bool,  # TODO: understand how to pass this in.
+        local_batch_size: int = 10,
+    ):
+        def get_files(batch):
+            for item in batch:
+                item[UPDATE_KEY] = update
+            request_payload = [
+                (
+                    ITEMS_KEY,
+                    (
+                        None,
+                        json.dumps(batch, allow_nan=False),
+                        "application/json",
+                    ),
+                )
+            ]
+            for item in batch:
+                image = open(  # pylint: disable=R1732
+                    item.get(IMAGE_URL_KEY), "rb"  # pylint: disable=R1732
+                )  # pylint: disable=R1732
+                img_name = os.path.basename(image.name)
+                img_type = (
+                    f"image/{os.path.splitext(image.name)[1].strip('.')}"
+                )
+                request_payload.append(
+                    (IMAGE_KEY, (img_name, image, img_type))
+                )
+            return request_payload
+
+        items = payload[ITEMS_KEY]
+        responses: List[Any] = []
+        files_per_request = []
+        payload_items = []
+        for i in range(0, len(items), local_batch_size):
+            batch = items[i : i + local_batch_size]
+            files_per_request.append(get_files(batch))
+            payload_items.append(batch)
+
+        future = self.make_many_files_requests_asynchronously(
+            files_per_request,
+            f"dataset/{dataset_id}/append",
+        )
+
+        try:
+            loop = asyncio.get_event_loop()
+        except RuntimeError:  # no event loop running:
+            loop = asyncio.new_event_loop()
+            responses = loop.run_until_complete(future)
+        else:
+            nest_asyncio.apply(loop)
+            return loop.run_until_complete(future)
+
+        def close_files(request_items):
+            for item in request_items:
+                # file buffer in location [1][1]
+                if item[0] == IMAGE_KEY:
+                    item[1][1].close()
+
+        # don't forget to close all open files
+        for p in files_per_request:
+            close_files(p)
+
+        return responses
+
+    async def make_many_files_requests_asynchronously(
+        self, files_per_request, route
+    ):
+        """
+        Makes an async post request with files to a Nucleus endpoint.
+
+        :param files_per_request: A list of lists of tuples (name, (filename, file_pointer, content_type))
+            name will become the name by which the multer can build an array.
+        :param route: route for the request
+        :return: awaitable list(response)
+        """
+        async with aiohttp.ClientSession() as session:
+            tasks = [
+                asyncio.ensure_future(
+                    self._make_files_request(
+                        files=files, route=route, session=session
+                    )
+                )
+                for files in files_per_request
+            ]
+            return await asyncio.gather(*tasks)
+
+    async def _make_files_request(
+        self,
+        files,
+        route: str,
+        session: aiohttp.ClientSession,
+        retry_attempt=0,
+        max_retries=3,
+        sleep_intervals=(1, 3, 9),
+    ):
+        """
+        Makes an async post request with files to a Nucleus endpoint.
+
+        :param files: A list of tuples (name, (filename, file_pointer, file_type))
+        :param route: route for the request
+        :param session: Session to use for post.
+        :return: response
+        """
+        endpoint = f"{self._client.endpoint}/{route}"
+
+        logger.info("Posting to %s", endpoint)
+
+        form = aiohttp.FormData()
+
+        for file in files:
+            form.add_field(
+                name=file[0],
+                filename=file[1][0],
+                value=file[1][1],
+                content_type=file[1][2],
+            )
+
+        for sleep_time in RetryStrategy.sleep_times + [-1]:
+
+            async with session.post(
+                endpoint,
+                data=form,
+                auth=aiohttp.BasicAuth(self._client.api_key, ""),
+                timeout=DEFAULT_NETWORK_TIMEOUT_SEC,
+            ) as response:
+                logger.info(
+                    "API request has response code %s", response.status
+                )
+
+                try:
+                    data = await response.json()
+                except aiohttp.client_exceptions.ContentTypeError:
+                    # In case of 404, the server returns text
+                    data = await response.text()
+                if (
+                    response.status in RetryStrategy.statuses
+                    and sleep_time != -1
+                ):
+                    time.sleep(sleep_time)
+                    continue
+
+                if not response.ok:
+                    if retry_attempt < max_retries:
+                        time.sleep(sleep_intervals[retry_attempt])
+                        retry_attempt += 1
+                        return self._make_files_request(
+                            files,
+                            route,
+                            session,
+                            retry_attempt,
+                            max_retries,
+                            sleep_intervals,
+                        )
+                    else:
+                        self._client.handle_bad_response(
+                            endpoint,
+                            session.post,
+                            aiohttp_response=(
+                                response.status,
+                                response.reason,
+                                data,
+                            ),
+                        )
+
+                return data
+
+    def _process_append_requests(
+        self,
+        dataset_id: str,
+        payload: dict,
+        update: bool,
+        batch_size: int = 20,
+    ):
+        items = payload[ITEMS_KEY]
+        payloads = [
+            # batch_size images per request
+            {ITEMS_KEY: items[i : i + batch_size], UPDATE_KEY: update}
+            for i in range(0, len(items), batch_size)
+        ]
+
+        return [
+            self._client.make_request(
+                payload,
+                f"dataset/{dataset_id}/append",
+            )
+            for payload in payloads
+        ]
```
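The uploader above splits items into local and remote groups, batches each, and posts to `dataset/{dataset_id}/append`, aggregating results into a single `UploadResponse`. A hedged usage sketch follows; the API key, dataset id, and file paths are invented, and direct use of `DatasetItemUploader` is shown only to illustrate the batching behavior (in practice the higher-level dataset append path described in the `DatasetItem` docstring is the usual entry point).

```python
import nucleus
from nucleus.dataset_item import DatasetItem
from nucleus.dataset_item_uploader import DatasetItemUploader

client = nucleus.NucleusClient("YOUR_NUCLEUS_API_KEY")  # hypothetical key

items = [
    DatasetItem(image_location="./imgs/0001.jpg", reference_id="img-0001"),      # local file
    DatasetItem(image_location="s3://bucket/0002.jpg", reference_id="img-0002"),  # remote URL
]

uploader = DatasetItemUploader("ds_abc123", client)  # hypothetical dataset id
# Items are grouped into batches of `batch_size`; update=True replaces existing
# metadata for matching reference_ids, as described in the DatasetItem docstring.
response = uploader.upload(items, batch_size=20, update=True)
```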
nucleus/deprecation_warning.py
ADDED
```diff
@@ -0,0 +1,32 @@
+import warnings
+from functools import wraps
+from typing import Callable
+
+
+def deprecated(msg: str):
+    """Adds a deprecation warning via the `warnings` lib which can be caught by linters.
+
+    Args:
+        msg: State reason of deprecation and point towards preferred practices
+
+    Returns:
+        Deprecation wrapped function
+    """
+
+    def decorator(func: Callable):
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            # NOTE: __qualname looks a lot better for method calls
+            name = (
+                func.__qualname__
+                if hasattr(func, "__qualname__")
+                else func.__name__
+            )
+            full_message = f"Calling {name} is deprecated: {msg}"
+            # NOTE: stacklevel=2 makes sure that the level is applied to the decorated function
+            warnings.warn(full_message, DeprecationWarning, stacklevel=2)
+            return func(*args, **kwargs)
+
+        return wrapper
+
+    return decorator
```
nucleus/errors.py
CHANGED
```diff
@@ -4,6 +4,11 @@ nucleus_client_version = pkg_resources.get_distribution(
     "scale-nucleus"
 ).version
 
+INFRA_FLAKE_MESSAGES = [
+    "downstream duration timeout",
+    "upstream connect error or disconnect/reset before headers. reset reason: local reset",
+]
+
 
 class ModelCreationError(Exception):
     def __init__(self, message="Could not create the model"):
@@ -35,7 +40,7 @@ class NucleusAPIError(Exception):
     def __init__(
         self, endpoint, command, requests_response=None, aiohttp_response=None
     ):
-        message = f"Your client is on version {nucleus_client_version}.
+        message = f"Your client is on version {nucleus_client_version}. If you have not recently done so, please make sure you have updated to the latest version of the client by running pip install --upgrade scale-nucleus\n"
         if requests_response is not None:
             message += f"Tried to {command.__name__} {endpoint}, but received {requests_response.status_code}: {requests_response.reason}."
             if hasattr(requests_response, "text"):
@@ -50,4 +55,19 @@ class NucleusAPIError(Exception):
             if data:
                 message += f"\nThe detailed error is:\n{data}"
 
+        if any(
+            infra_flake_message in message
+            for infra_flake_message in INFRA_FLAKE_MESSAGES
+        ):
+            message += "\n This likely indicates temporary downtime of the API, please try again in a minute or two"
+
         super().__init__(message)
+
+
+class NoAPIKey(Exception):
+    def __init__(
+        self,
+        message="You need to pass an API key to the NucleusClient or set the environment variable NUCLEUS_API_KEY",
+    ):
+        self.message = message
+        super().__init__(self.message)
```