hirundo 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects each version exactly as it appears in its public registry.
- hirundo/__init__.py +1 -1
- hirundo/_env.py +19 -5
- hirundo/_headers.py +8 -4
- hirundo/_http.py +14 -0
- hirundo/_iter_sse_retrying.py +2 -2
- hirundo/cli.py +80 -17
- hirundo/dataset_optimization.py +233 -53
- hirundo/git.py +17 -15
- hirundo/logger.py +10 -0
- hirundo/storage.py +57 -22
- hirundo-0.1.8.dist-info/METADATA +176 -0
- hirundo-0.1.8.dist-info/RECORD +20 -0
- {hirundo-0.1.6.dist-info → hirundo-0.1.8.dist-info}/WHEEL +1 -1
- hirundo-0.1.6.dist-info/METADATA +0 -117
- hirundo-0.1.6.dist-info/RECORD +0 -18
- {hirundo-0.1.6.dist-info → hirundo-0.1.8.dist-info}/LICENSE +0 -0
- {hirundo-0.1.6.dist-info → hirundo-0.1.8.dist-info}/entry_points.txt +0 -0
- {hirundo-0.1.6.dist-info → hirundo-0.1.8.dist-info}/top_level.txt +0 -0
hirundo/dataset_optimization.py
CHANGED
```diff
@@ -1,22 +1,29 @@
 import json
-import
+import typing
 from collections.abc import AsyncGenerator, Generator
+from enum import Enum
 from io import StringIO
-from typing import
+from typing import overload
 
 import httpx
+import numpy as np
 import pandas as pd
 import requests
+from pandas._typing import DtypeArg
 from pydantic import BaseModel, Field, model_validator
+from tqdm import tqdm
+from tqdm.contrib.logging import logging_redirect_tqdm
 
 from hirundo._env import API_HOST
-from hirundo._headers import
+from hirundo._headers import get_auth_headers, json_headers
+from hirundo._http import raise_for_status_with_reason
 from hirundo._iter_sse_retrying import aiter_sse_retrying, iter_sse_retrying
 from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
 from hirundo.enum import DatasetMetadataType, LabellingType
+from hirundo.logger import get_logger
 from hirundo.storage import StorageIntegration, StorageLink
 
-logger =
+logger = get_logger(__name__)
 
 
 class HirundoError(Exception):
@@ -30,6 +37,66 @@ class HirundoError(Exception):
 MAX_RETRIES = 200 # Max 200 retries for HTTP SSE connection
 
 
+class RunStatus(Enum):
+    STARTED = "STARTED"
+    PENDING = "PENDING"
+    SUCCESS = "SUCCESS"
+    FAILURE = "FAILURE"
+    AWAITING_MANUAL_APPROVAL = "AWAITING MANUAL APPROVAL"
+    RETRYING = "RETRYING"
+
+
+STATUS_TO_TEXT_MAP = {
+    RunStatus.STARTED.value: "Optimization run in progress. Downloading dataset",
+    RunStatus.PENDING.value: "Optimization run queued and not yet started",
+    RunStatus.SUCCESS.value: "Optimization run completed successfully",
+    RunStatus.FAILURE.value: "Optimization run failed",
+    RunStatus.AWAITING_MANUAL_APPROVAL.value: "Awaiting manual approval",
+    RunStatus.RETRYING.value: "Optimization run failed. Retrying",
+}
+STATUS_TO_PROGRESS_MAP = {
+    RunStatus.STARTED.value: 0.0,
+    RunStatus.PENDING.value: 0.0,
+    RunStatus.SUCCESS.value: 100.0,
+    RunStatus.FAILURE.value: 100.0,
+    RunStatus.AWAITING_MANUAL_APPROVAL.value: 100.0,
+    RunStatus.RETRYING.value: 0.0,
+}
+
+
+class DatasetOptimizationResults(BaseModel):
+    model_config = {"arbitrary_types_allowed": True}
+
+    suspects: pd.DataFrame
+    """
+    A pandas DataFrame containing the results of the optimization run
+    """
+    warnings_and_errors: pd.DataFrame
+    """
+    A pandas DataFrame containing the warnings and errors of the optimization run
+    """
+
+
+CUSTOMER_INTERCHANGE_DTYPES: DtypeArg = {
+    "image_path": str,
+    "label_path": str,
+    "segments_mask_path": str,
+    "segment_id": np.int32,
+    "label": str,
+    "bbox_id": str,
+    "xmin": np.int32,
+    "ymin": np.int32,
+    "xmax": np.int32,
+    "ymax": np.int32,
+    "suspect_level": np.float32,  # If exists, must be one of the values in the enum below
+    "suggested_label": str,
+    "suggested_label_conf": np.float32,
+    "status": str,
+    # ⬆️ If exists, must be one of the following:
+    # NO_LABELS/MISSING_IMAGE/INVALID_IMAGE/INVALID_BBOX/INVALID_BBOX_SIZE/INVALID_SEG/INVALID_SEG_SIZE
+}
+
+
 class OptimizationDataset(BaseModel):
     name: str
     """
@@ -42,13 +109,13 @@ class OptimizationDataset(BaseModel):
     - `LabellingType.SingleLabelClassification`: Indicates that the dataset is for classification tasks
     - `LabellingType.ObjectDetection`: Indicates that the dataset is for object detection tasks
     """
-    dataset_storage:
+    dataset_storage: typing.Optional[StorageLink]
    """
     The storage link to the dataset. This can be a link to a file or a directory containing the dataset.
     If `None`, the `dataset_id` field must be set.
     """
 
-    classes: list[str]
+    classes: typing.Optional[list[str]] = None
     """
     A full list of possible classes used in classification / object detection.
     It is currently required for clarity and performance.
@@ -66,15 +133,15 @@ class OptimizationDataset(BaseModel):
     Currently no other formats are supported. Future versions of `hirundo` may support additional formats.
     """
 
-    storage_integration_id:
+    storage_integration_id: typing.Optional[int] = Field(default=None, init=False)
     """
     The ID of the storage integration used to store the dataset and metadata.
     """
-    dataset_id:
+    dataset_id: typing.Optional[int] = Field(default=None, init=False)
     """
     The ID of the dataset created on the server.
     """
-    run_id:
+    run_id: typing.Optional[str] = Field(default=None, init=False)
     """
     The ID of the Dataset Optimization run created on the server.
     """
@@ -86,7 +153,7 @@ class OptimizationDataset(BaseModel):
         return self
 
     @staticmethod
-    def list(organization_id:
+    def list(organization_id: typing.Optional[int] = None) -> list[dict]:
         """
         Lists all the `OptimizationDataset` instances created by user's default organization
         or the `organization_id` passed
@@ -98,10 +165,10 @@ class OptimizationDataset(BaseModel):
         response = requests.get(
             f"{API_HOST}/dataset-optimization/dataset/",
             params={"dataset_organization_id": organization_id},
-            headers=
+            headers=get_auth_headers(),
             timeout=READ_TIMEOUT,
         )
-        response
+        raise_for_status_with_reason(response)
         return response.json()
 
     @staticmethod
@@ -114,10 +181,11 @@ class OptimizationDataset(BaseModel):
         """
         response = requests.delete(
             f"{API_HOST}/dataset-optimization/dataset/{dataset_id}",
-            headers=
+            headers=get_auth_headers(),
             timeout=MODIFY_TIMEOUT,
         )
-        response
+        raise_for_status_with_reason(response)
+        logger.info("Deleted dataset with ID: %s", dataset_id)
 
     def delete(self, storage_integration=True) -> None:
         """
@@ -167,14 +235,15 @@ class OptimizationDataset(BaseModel):
             },
             headers={
                 **json_headers,
-                **
+                **get_auth_headers(),
             },
             timeout=MODIFY_TIMEOUT,
         )
-        dataset_response
+        raise_for_status_with_reason(dataset_response)
         self.dataset_id = dataset_response.json()["id"]
         if not self.dataset_id:
             raise HirundoError("Failed to create the dataset")
+        logger.info("Created dataset with ID: %s", self.dataset_id)
         return self.dataset_id
 
     @staticmethod
@@ -191,10 +260,10 @@ class OptimizationDataset(BaseModel):
         """
         run_response = requests.post(
             f"{API_HOST}/dataset-optimization/run/{dataset_id}",
-            headers=
+            headers=get_auth_headers(),
             timeout=MODIFY_TIMEOUT,
         )
-        run_response
+        raise_for_status_with_reason(run_response)
         return run_response.json()["run_id"]
 
     def run_optimization(self) -> str:
@@ -210,6 +279,7 @@ class OptimizationDataset(BaseModel):
             self.dataset_id = self.create()
             run_id = self.launch_optimization_run(self.dataset_id)
             self.run_id = run_id
+            logger.info("Started the run with ID: %s", run_id)
             return run_id
         except requests.HTTPError as error:
             try:
@@ -237,30 +307,47 @@ class OptimizationDataset(BaseModel):
         self.run_id = None
 
     @staticmethod
-    def
-        if data["state"] == "SUCCESS":
-            data["result"] = pd.read_csv(StringIO(data["result"]))
-        else:
-            pass
-
-    @staticmethod
-    def check_run_by_id(run_id: str, retry=0) -> Generator[dict, None, None]:
+    def _clean_df_index(df: "pd.DataFrame") -> "pd.DataFrame":
         """
-
-
-        This generator will produce values to show progress of the run.
+        Clean the index of a dataframe in case it has unnamed columns.
 
         Args:
-
-            retry: A number used to track the number of retries to limit re-checks. *Do not* provide this value manually.
-
-        Yields:
-            Each event will be a dict, where:
-            - `"state"` is PENDING, STARTED, RETRY, FAILURE or SUCCESS
-            - `"result"` is a string describing the progress as a percentage for a PENDING state,
-            or the error for a FAILURE state or the results for a SUCCESS state
+            df (DataFrame): Dataframe to clean
 
+        Returns:
+            DataFrame: Cleaned dataframe
         """
+        index_cols = sorted(
+            [col for col in df.columns if col.startswith("Unnamed")], reverse=True
+        )
+        if len(index_cols) > 0:
+            df.set_index(index_cols.pop(), inplace=True)
+            df.rename_axis(index=None, columns=None, inplace=True)
+        if len(index_cols) > 0:
+            df.drop(columns=index_cols, inplace=True)
+
+        return df
+
+    @staticmethod
+    def _read_csvs_to_df(data: dict):
+        if data["state"] == RunStatus.SUCCESS.value:
+            data["result"]["suspects"] = OptimizationDataset._clean_df_index(
+                pd.read_csv(
+                    StringIO(data["result"]["suspects"]),
+                    dtype=CUSTOMER_INTERCHANGE_DTYPES,
+                )
+            )
+            data["result"]["warnings_and_errors"] = OptimizationDataset._clean_df_index(
+                pd.read_csv(
+                    StringIO(data["result"]["warnings_and_errors"]),
+                    dtype=CUSTOMER_INTERCHANGE_DTYPES,
+                )
+            )
+        else:
+            pass
+
+    @staticmethod
+    def _check_run_by_id(run_id: str, retry=0) -> Generator[dict, None, None]:
         if retry > MAX_RETRIES:
             raise HirundoError("Max retries reached")
         last_event = None
@@ -269,7 +356,7 @@ class OptimizationDataset(BaseModel):
             client,
             "GET",
             f"{API_HOST}/dataset-optimization/run/{run_id}",
-            headers=
+            headers=get_auth_headers(),
         ):
             if sse.event == "ping":
                 continue
@@ -284,26 +371,117 @@ class OptimizationDataset(BaseModel):
             if not last_event:
                 continue
             data = last_event["data"]
-            OptimizationDataset.
+            OptimizationDataset._read_csvs_to_df(data)
             yield data
-        if not last_event or last_event["data"]["state"] ==
-            OptimizationDataset.
+        if not last_event or last_event["data"]["state"] == RunStatus.PENDING.value:
+            OptimizationDataset._check_run_by_id(run_id, retry + 1)
+
+    @staticmethod
+    @overload
+    def check_run_by_id(
+        run_id: str, stop_on_manual_approval: typing.Literal[True]
+    ) -> typing.Optional[DatasetOptimizationResults]: ...
 
-
+    @staticmethod
+    @overload
+    def check_run_by_id(
+        run_id: str, stop_on_manual_approval: typing.Literal[False] = False
+    ) -> DatasetOptimizationResults: ...
+
+    @staticmethod
+    @overload
+    def check_run_by_id(
+        run_id: str, stop_on_manual_approval: bool
+    ) -> typing.Optional[DatasetOptimizationResults]: ...
+
+    @staticmethod
+    def check_run_by_id(
+        run_id: str, stop_on_manual_approval: bool = False
+    ) -> typing.Optional[DatasetOptimizationResults]:
         """
-        Check the status of
+        Check the status of a run given its ID
 
-
+        Args:
+            run_id: The `run_id` produced by a `run_optimization` call
+            stop_on_manual_approval: If True, the function will return `None` if the run is awaiting manual approval
 
-
-
-
-
+        Returns:
+            A DatasetOptimizationResults object with the results of the optimization run
+
+        Raises:
+            HirundoError: If the maximum number of retries is reached or if the run fails
+        """
+        logger.debug("Checking run with ID: %s", run_id)
+        with logging_redirect_tqdm():
+            t = tqdm(total=100.0)
+            for iteration in OptimizationDataset._check_run_by_id(run_id):
+                if iteration["state"] in STATUS_TO_PROGRESS_MAP:
+                    t.set_description(STATUS_TO_TEXT_MAP[iteration["state"]])
+                    t.n = STATUS_TO_PROGRESS_MAP[iteration["state"]]
+                    logger.debug("Setting progress to %s", t.n)
+                    t.refresh()
+                if iteration["state"] == RunStatus.FAILURE.value:
+                    raise HirundoError(
+                        f"Optimization run failed with error: {iteration['result']}"
+                    )
+                elif iteration["state"] == RunStatus.SUCCESS.value:
+                    t.close()
+                    return DatasetOptimizationResults(
+                        suspects=iteration["result"]["suspects"],
+                        warnings_and_errors=iteration["result"][
+                            "warnings_and_errors"
+                        ],
+                    )
+                elif (
+                    iteration["state"] == RunStatus.AWAITING_MANUAL_APPROVAL.value
+                    and stop_on_manual_approval
+                ):
+                    t.close()
+                    return None
+                elif iteration["state"] is None:
+                    if (
+                        iteration["result"]
+                        and isinstance(iteration["result"], dict)
+                        and iteration["result"]["result"]
+                        and isinstance(iteration["result"]["result"], str)
+                    ):
+                        current_progress_percentage = float(
+                            iteration["result"]["result"].removesuffix("% done")
+                        )
+                        desc = (
+                            "Optimization run completed. Uploading results"
+                            if current_progress_percentage == 100.0
+                            else "Optimization run in progress"
+                        )
+                        t.set_description(desc)
+                        t.n = current_progress_percentage
+                        logger.debug("Setting progress to %s", t.n)
+                        t.refresh()
+        raise HirundoError("Optimization run failed with an unknown error")
+
+    @overload
+    def check_run(
+        self, stop_on_manual_approval: typing.Literal[True]
+    ) -> typing.Optional[DatasetOptimizationResults]: ...
+
+    @overload
+    def check_run(
+        self, stop_on_manual_approval: typing.Literal[False] = False
+    ) -> DatasetOptimizationResults: ...
+
+    def check_run(
+        self, stop_on_manual_approval: bool = False
+    ) -> typing.Optional[DatasetOptimizationResults]:
+        """
+        Check the status of the current active instance's run.
+
+        Returns:
+            A pandas DataFrame with the results of the optimization run
 
         """
         if not self.run_id:
             raise ValueError("No run has been started")
-        return self.check_run_by_id(self.run_id)
+        return self.check_run_by_id(self.run_id, stop_on_manual_approval)
 
     @staticmethod
     async def acheck_run_by_id(run_id: str, retry=0) -> AsyncGenerator[dict, None]:
@@ -324,6 +502,7 @@ class OptimizationDataset(BaseModel):
         - `"result"` is a string describing the progress as a percentage for a PENDING state, or the error for a FAILURE state or the results for a SUCCESS state
 
         """
+        logger.debug("Checking run with ID: %s", run_id)
         if retry > MAX_RETRIES:
             raise HirundoError("Max retries reached")
         last_event = None
@@ -334,7 +513,7 @@ class OptimizationDataset(BaseModel):
             client,
             "GET",
             f"{API_HOST}/dataset-optimization/run/{run_id}",
-            headers=
+            headers=get_auth_headers(),
         )
         async for sse in async_iterator:
             if sse.event == "ping":
@@ -348,7 +527,7 @@ class OptimizationDataset(BaseModel):
                 )
             last_event = json.loads(sse.data)
             yield last_event["data"]
-        if not last_event or last_event["data"]["state"] ==
+        if not last_event or last_event["data"]["state"] == RunStatus.PENDING.value:
            OptimizationDataset.acheck_run_by_id(run_id, retry + 1)
 
     async def acheck_run(self) -> AsyncGenerator[dict, None]:
@@ -380,12 +559,13 @@ class OptimizationDataset(BaseModel):
         """
         if not run_id:
             raise ValueError("No run has been started")
+        logger.info("Cancelling run with ID: %s", run_id)
         response = requests.delete(
             f"{API_HOST}/dataset-optimization/run/{run_id}",
-            headers=
+            headers=get_auth_headers(),
             timeout=MODIFY_TIMEOUT,
         )
-        response
+        raise_for_status_with_reason(response)
 
     def cancel(self) -> None:
         """
```
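The headline change in `hirundo/dataset_optimization.py` is that `check_run` / `check_run_by_id` no longer yield raw SSE event dicts: they drive a `tqdm` progress bar internally and return a `DatasetOptimizationResults` model whose `suspects` and `warnings_and_errors` fields are pandas DataFrames parsed with `CUSTOMER_INTERCHANGE_DTYPES`. A minimal usage sketch of the new surface follows; the constructor arguments (`labelling_type` in particular) are assumptions inferred from the fields and docstrings visible above, not a verbatim example from the package docs:

```python
# Sketch of the 0.1.8 API shown in this diff. The dataset fields below
# are assumptions based on the visible docstrings; consult the package
# docs for exact construction.
from hirundo.dataset_optimization import OptimizationDataset

dataset = OptimizationDataset(
    name="my-dataset",
    labelling_type=...,      # assumed field; a LabellingType value
    dataset_storage=...,     # a StorageLink, or None when reusing dataset_id
    classes=["cat", "dog"],  # now typing.Optional and defaults to None
)

run_id = dataset.run_optimization()  # creates the dataset, starts the run

# Blocks while rendering a tqdm progress bar. Returns
# DatasetOptimizationResults on SUCCESS, raises HirundoError on FAILURE,
# and returns None when stop_on_manual_approval=True and the run reaches
# the "AWAITING MANUAL APPROVAL" state.
results = dataset.check_run(stop_on_manual_approval=True)
if results is not None:
    print(results.suspects.head())
    print(results.warnings_and_errors.head())
```

Note the `typing.overload` stack in the diff: with `stop_on_manual_approval=False` (the default) the return type narrows to `DatasetOptimizationResults`, so callers that never use manual approval need no `None` check.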
hirundo/git.py
CHANGED
```diff
@@ -1,6 +1,6 @@
-import logging
 import re
-
+import typing
+from typing import Annotated
 
 import pydantic
 import requests
@@ -8,10 +8,12 @@ from pydantic import BaseModel, field_validator
 from pydantic_core import Url
 
 from hirundo._env import API_HOST
-from hirundo._headers import
+from hirundo._headers import get_auth_headers, json_headers
+from hirundo._http import raise_for_status_with_reason
 from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
+from hirundo.logger import get_logger
 
-logger =
+logger = get_logger(__name__)
 
 
 class GitPlainAuthBase(BaseModel):
@@ -30,14 +32,14 @@ class GitSSHAuthBase(BaseModel):
     """
     The SSH key for the Git repository
     """
-    ssh_password:
+    ssh_password: typing.Optional[str]
     """
     The password for the SSH key for the Git repository.
     """
 
 
 class GitRepo(BaseModel):
-    id:
+    id: typing.Optional[int] = None
     """
     The ID of the Git repository.
     """
@@ -51,20 +53,20 @@ class GitRepo(BaseModel):
     The URL of the Git repository, it should start with `ssh://` or `https://` or be in the form `user@host:path`.
     If it is in the form `user@host:path`, it will be rewritten to `ssh://user@host:path`.
     """
-    organization_id:
+    organization_id: typing.Optional[int] = None
     """
     The ID of the organization that the Git repository belongs to.
     If not provided, it will be assigned to your default organization.
     """
 
-    plain_auth:
+    plain_auth: typing.Optional[GitPlainAuthBase] = pydantic.Field(
         default=None, examples=[None, {"username": "ben", "password": "password"}]
     )
     """
     The plain authentication details for the Git repository.
     Use this if using a special user with a username and password for authentication.
     """
-    ssh_auth:
+    ssh_auth: typing.Optional[GitSSHAuthBase] = pydantic.Field(
         default=None,
         examples=[
             {
@@ -108,11 +110,11 @@ class GitRepo(BaseModel):
             json=self.model_dump(),
             headers={
                 **json_headers,
-                **
+                **get_auth_headers(),
             },
             timeout=MODIFY_TIMEOUT,
         )
-        git_repo
+        raise_for_status_with_reason(git_repo)
         git_repo_id = git_repo.json()["id"]
         self.id = git_repo_id
         return git_repo_id
@@ -125,11 +127,11 @@ class GitRepo(BaseModel):
         git_repos = requests.get(
             f"{API_HOST}/git-repo/",
             headers={
-                **
+                **get_auth_headers(),
             },
             timeout=READ_TIMEOUT,
         )
-        git_repos
+        raise_for_status_with_reason(git_repos)
         return git_repos.json()
 
     @staticmethod
@@ -143,11 +145,11 @@ class GitRepo(BaseModel):
         git_repo = requests.delete(
             f"{API_HOST}/git-repo/{git_repo_id}",
             headers={
-                **
+                **get_auth_headers(),
             },
             timeout=MODIFY_TIMEOUT,
         )
-        git_repo
+        raise_for_status_with_reason(git_repo)
 
     def delete(self):
         """
```
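`git.py` adopts the same two patterns as `dataset_optimization.py`: implicit optional fields become explicit `typing.Optional[...]`, and the truncated `- response` / `- git_repo` call sites (presumably bare `.raise_for_status()` calls) are replaced by `raise_for_status_with_reason` from the new `hirundo/_http.py` (+14 lines, not shown in this section). Since its body isn't visible here, the following is only a plausible sketch of such a helper, assuming it surfaces the server-supplied error body alongside the status code:

```python
# Hypothetical sketch -- hirundo/_http.py is not shown in this diff, so
# this is an assumption about its behavior inferred from the call sites.
import requests


def raise_for_status_with_reason(response: requests.Response) -> None:
    """Like Response.raise_for_status(), but keep the server's error body."""
    try:
        response.raise_for_status()
    except requests.HTTPError as error:
        # Attach the response text so callers see why the request failed,
        # not just the status code.
        error.args = (*error.args, response.text)
        raise
```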
hirundo/logger.py
ADDED
```diff
@@ -0,0 +1,10 @@
+import logging
+import os
+
+
+def get_logger(name: str) -> logging.Logger:
+    logger = logging.getLogger(name)
+    log_level = os.getenv("LOG_LEVEL")
+    logger.setLevel(log_level if log_level else logging.INFO)
+    logger.addHandler(logging.StreamHandler())
+    return logger
```
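The new `hirundo/logger.py` centralizes logger construction: modules now call `get_logger(__name__)` instead of configuring `logging` themselves, and verbosity is controlled by the `LOG_LEVEL` environment variable (`Logger.setLevel` accepts standard level names such as `"DEBUG"` or `"WARNING"`; when unset, the level defaults to `INFO`). A short usage sketch:

```python
# LOG_LEVEL must be set before get_logger() is called, since the level
# is read once at construction time.
import os

os.environ["LOG_LEVEL"] = "DEBUG"

from hirundo.logger import get_logger

logger = get_logger(__name__)
logger.debug("SSE progress events are now visible")
```

One caveat of this design: `get_logger` attaches a fresh `StreamHandler` on every call, so calling it twice with the same name duplicates output lines; the package calls it once per module, where this is harmless.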