hirundo 0.1.9__py3-none-any.whl → 0.1.18__py3-none-any.whl
- hirundo/__init__.py +30 -11
- hirundo/_constraints.py +164 -53
- hirundo/_dataframe.py +43 -0
- hirundo/_env.py +2 -2
- hirundo/_headers.py +18 -2
- hirundo/_timeouts.py +1 -0
- hirundo/_urls.py +59 -0
- hirundo/cli.py +52 -0
- hirundo/dataset_enum.py +46 -0
- hirundo/dataset_optimization.py +93 -182
- hirundo/dataset_optimization_results.py +42 -0
- hirundo/git.py +12 -19
- hirundo/labeling.py +140 -0
- hirundo/storage.py +48 -67
- hirundo/unzip.py +247 -0
- {hirundo-0.1.9.dist-info → hirundo-0.1.18.dist-info}/METADATA +55 -44
- hirundo-0.1.18.dist-info/RECORD +25 -0
- {hirundo-0.1.9.dist-info → hirundo-0.1.18.dist-info}/WHEEL +1 -1
- hirundo/enum.py +0 -23
- hirundo-0.1.9.dist-info/RECORD +0 -20
- {hirundo-0.1.9.dist-info → hirundo-0.1.18.dist-info}/entry_points.txt +0 -0
- {hirundo-0.1.9.dist-info → hirundo-0.1.18.dist-info/licenses}/LICENSE +0 -0
- {hirundo-0.1.9.dist-info → hirundo-0.1.18.dist-info}/top_level.txt +0 -0
hirundo/dataset_optimization.py
CHANGED
@@ -1,30 +1,29 @@
 import datetime
 import json
 import typing
-from abc import ABC, abstractmethod
 from collections.abc import AsyncGenerator, Generator
 from enum import Enum
-from io import StringIO
 from typing import overload
 
 import httpx
-import numpy as np
-import pandas as pd
 import requests
-from pandas._typing import DtypeArg
 from pydantic import BaseModel, Field, model_validator
 from tqdm import tqdm
 from tqdm.contrib.logging import logging_redirect_tqdm
 
-from hirundo._constraints import
+from hirundo._constraints import validate_labeling_info, validate_url
 from hirundo._env import API_HOST
-from hirundo._headers import
+from hirundo._headers import get_headers
 from hirundo._http import raise_for_status_with_reason
 from hirundo._iter_sse_retrying import aiter_sse_retrying, iter_sse_retrying
 from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
-from hirundo.
+from hirundo._urls import HirundoUrl
+from hirundo.dataset_enum import DatasetMetadataType, LabelingType
+from hirundo.dataset_optimization_results import DatasetOptimizationResults
+from hirundo.labeling import YOLO, LabelingInfo
 from hirundo.logger import get_logger
 from hirundo.storage import ResponseStorageConfig, StorageConfig
+from hirundo.unzip import download_and_extract_zip
 
 logger = get_logger(__name__)
 
@@ -73,105 +72,6 @@ STATUS_TO_PROGRESS_MAP = {
 }
 
 
-class DatasetOptimizationResults(BaseModel):
-    model_config = {"arbitrary_types_allowed": True}
-
-    suspects: pd.DataFrame
-    """
-    A pandas DataFrame containing the results of the optimization run
-    """
-    warnings_and_errors: pd.DataFrame
-    """
-    A pandas DataFrame containing the warnings and errors of the optimization run
-    """
-
-
-CUSTOMER_INTERCHANGE_DTYPES: DtypeArg = {
-    "image_path": str,
-    "label_path": str,
-    "segments_mask_path": str,
-    "segment_id": np.int32,
-    "label": str,
-    "bbox_id": str,
-    "xmin": np.float32,
-    "ymin": np.float32,
-    "xmax": np.float32,
-    "ymax": np.float32,
-    "suspect_level": np.float32,  # If exists, must be one of the values in the enum below
-    "suggested_label": str,
-    "suggested_label_conf": np.float32,
-    "status": str,
-    # ⬆️ If exists, must be one of the following:
-    # NO_LABELS/MISSING_IMAGE/INVALID_IMAGE/INVALID_BBOX/INVALID_BBOX_SIZE/INVALID_SEG/INVALID_SEG_SIZE
-}
-
-
-class Metadata(BaseModel, ABC):
-    type: DatasetMetadataType
-
-    @property
-    @abstractmethod
-    def metadata_url(self) -> HirundoUrl:
-        raise NotImplementedError()
-
-
-class HirundoCSV(Metadata):
-    """
-    A dataset metadata file in the Hirundo CSV format
-    """
-
-    type: DatasetMetadataType = DatasetMetadataType.HIRUNDO_CSV
-    csv_url: HirundoUrl
-    """
-    The URL to access the dataset metadata CSV file.
-    e.g. `s3://my-bucket-name/my-folder/my-metadata.csv`, `gs://my-bucket-name/my-folder/my-metadata.csv`,
-    or `ssh://my-username@my-repo-name/my-folder/my-metadata.csv`
-    (or `file:///datasets/my-folder/my-metadata.csv` if using LOCAL storage type with on-premises installation)
-    """
-
-    @property
-    def metadata_url(self) -> HirundoUrl:
-        return self.csv_url
-
-
-class COCO(Metadata):
-    """
-    A dataset metadata file in the COCO format
-    """
-
-    type: DatasetMetadataType = DatasetMetadataType.COCO
-    json_url: HirundoUrl
-    """
-    The URL to access the dataset metadata JSON file.
-    e.g. `s3://my-bucket-name/my-folder/my-metadata.json`, `gs://my-bucket-name/my-folder/my-metadata.json`,
-    or `ssh://my-username@my-repo-name/my-folder/my-metadata.json`
-    (or `file:///datasets/my-folder/my-metadata.json` if using LOCAL storage type with on-premises installation)
-    """
-
-    @property
-    def metadata_url(self) -> HirundoUrl:
-        return self.json_url
-
-
-class YOLO(Metadata):
-    type: DatasetMetadataType = DatasetMetadataType.YOLO
-    data_yaml_url: typing.Optional[HirundoUrl] = None
-    labels_dir_url: HirundoUrl
-
-    @property
-    def metadata_url(self) -> HirundoUrl:
-        return self.labels_dir_url
-
-
-LabelingInfo = typing.Union[HirundoCSV, COCO, YOLO]
-"""
-The dataset labeling info. The dataset labeling info can be one of the following:
-- `DatasetMetadataType.HirundoCSV`: Indicates that the dataset metadata file is a CSV file with the Hirundo format
-
-Currently no other formats are supported. Future versions of `hirundo` may support additional formats.
-"""
-
-
 class VisionRunArgs(BaseModel):
     upsample: bool = False
     """
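The metadata models deleted here (Metadata, HirundoCSV, COCO, YOLO and the LabelingInfo union) now live in the new hirundo/labeling.py module listed at the top of this diff, and the pandas-only results model is superseded by the new hirundo/dataset_optimization_results.py shown further down. A minimal sketch of building labeling info against the new import path, assuming HirundoCSV moved across with the same fields as the removed code:

    from hirundo.labeling import YOLO, HirundoCSV  # HirundoCSV assumed to be exported alongside YOLO

    # Hirundo-format CSV metadata, addressed by URL exactly as before
    csv_info = HirundoCSV(csv_url="s3://my-bucket-name/my-folder/my-metadata.csv")

    # YOLO metadata: the labels directory is required, data.yaml stays optional
    yolo_info = YOLO(
        labels_dir_url="s3://my-bucket-name/my-dataset/labels",
        data_yaml_url="s3://my-bucket-name/my-dataset/data.yaml",
    )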
@@ -201,13 +101,14 @@ class VisionRunArgs(BaseModel):
 RunArgs = typing.Union[VisionRunArgs]
 
 
-class
-
-
-
-
-
-
+class AugmentationName(str, Enum):
+    RANDOM_HORIZONTAL_FLIP = "RandomHorizontalFlip"
+    RANDOM_VERTICAL_FLIP = "RandomVerticalFlip"
+    RANDOM_ROTATION = "RandomRotation"
+    RANDOM_PERSPECTIVE = "RandomPerspective"
+    GAUSSIAN_NOISE = "GaussianNoise"
+    RANDOM_GRAYSCALE = "RandomGrayscale"
+    GAUSSIAN_BLUR = "GaussianBlur"
 
 
 class Modality(str, Enum):
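Because AugmentationName subclasses str, its members compare equal to and serialize as their literal values, which is what the API payload carries:

    from hirundo.dataset_optimization import AugmentationName  # import path assumed

    assert AugmentationName.RANDOM_ROTATION == "RandomRotation"
    assert AugmentationName.GAUSSIAN_BLUR.value == "GaussianBlur"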
@@ -262,9 +163,9 @@ class OptimizationDataset(BaseModel):
     A full list of possible classes used in classification / object detection.
     It is currently required for clarity and performance.
     """
-    labeling_info: LabelingInfo
+    labeling_info: typing.Union[LabelingInfo, list[LabelingInfo]]
 
-    augmentations: typing.Optional[list[
+    augmentations: typing.Optional[list[AugmentationName]] = None
     """
     Used to define which augmentations are apply to a vision dataset.
     For audio datasets, this field is ignored.
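With labeling_info now accepting either a single entry or a list, and augmentations typed against the new enum, the two field values can be built like this (a sketch; HirundoCSV is assumed to be exported from the new hirundo.labeling module):

    from hirundo.dataset_optimization import AugmentationName
    from hirundo.labeling import HirundoCSV

    # Multiple metadata files for one dataset are now expressible as a list
    labeling_info = [
        HirundoCSV(csv_url="s3://my-bucket-name/my-folder/train.csv"),
        HirundoCSV(csv_url="s3://my-bucket-name/my-folder/val.csv"),
    ]
    # Vision-only; ignored for audio datasets, as the docstring notes
    augmentations = [
        AugmentationName.RANDOM_HORIZONTAL_FLIP,
        AugmentationName.GAUSSIAN_BLUR,
    ]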
@@ -301,16 +202,30 @@ class OptimizationDataset(BaseModel):
         ):
             raise ValueError("Language is only allowed for Speech-to-Text datasets.")
         if (
-            self.labeling_info
+            not isinstance(self.labeling_info, list)
+            and self.labeling_info.type == DatasetMetadataType.YOLO
             and isinstance(self.labeling_info, YOLO)
             and (
                 self.labeling_info.data_yaml_url is not None
                 and self.classes is not None
             )
+        ) or (
+            isinstance(self.labeling_info, list)
+            and self.classes is not None
+            and any(
+                isinstance(info, YOLO) and info.data_yaml_url is not None
+                for info in self.labeling_info
+            )
         ):
             raise ValueError(
                 "Only one of `classes` or `labeling_info.data_yaml_url` should be provided for YOLO datasets"
             )
+        if self.storage_config:
+            validate_labeling_info(
+                self.labeling_type, self.labeling_info, self.storage_config
+            )
+        if self.data_root_url and self.storage_config:
+            validate_url(self.data_root_url, self.storage_config)
         return self
 
     @staticmethod
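The list branch of the validator mirrors the single-info check: supplying classes together with any YOLO entry that carries a data_yaml_url is rejected. The condition it evaluates, restated standalone (HirundoCSV assumed exported from hirundo.labeling):

    from hirundo.labeling import YOLO, HirundoCSV

    classes = ["cat", "dog"]
    labeling_info = [
        HirundoCSV(csv_url="s3://my-bucket-name/my-folder/train.csv"),
        YOLO(
            labels_dir_url="s3://my-bucket-name/my-dataset/labels",
            data_yaml_url="s3://my-bucket-name/my-dataset/data.yaml",
        ),
    ]
    # True here, so the model_validator above would raise its ValueError
    conflict = classes is not None and any(
        isinstance(info, YOLO) and info.data_yaml_url is not None
        for info in labeling_info
    )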
@@ -323,7 +238,7 @@ class OptimizationDataset(BaseModel):
         """
         response = requests.get(
             f"{API_HOST}/dataset-optimization/dataset/{dataset_id}",
-            headers=
+            headers=get_headers(),
             timeout=READ_TIMEOUT,
         )
         raise_for_status_with_reason(response)
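From here on, every call site passes headers=get_headers() instead of splicing **json_headers and **get_auth_headers() together by hand (the old inline pattern is still visible in the create hunk below). The helper itself is not part of this diff; a plausible sketch of what it consolidates, with the environment-variable name purely hypothetical:

    # Hypothetical sketch -- hirundo/_headers.py is not shown in this diff
    import os

    def get_headers() -> dict[str, str]:
        # One place to combine the content-type and auth headers that
        # callers previously merged at each request site
        return {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {os.environ['HIRUNDO_API_KEY']}",  # hypothetical variable name
        }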
@@ -340,7 +255,7 @@ class OptimizationDataset(BaseModel):
         """
         response = requests.get(
             f"{API_HOST}/dataset-optimization/dataset/by-name/{name}",
-            headers=
+            headers=get_headers(),
             timeout=READ_TIMEOUT,
         )
         raise_for_status_with_reason(response)
@@ -361,7 +276,7 @@ class OptimizationDataset(BaseModel):
         response = requests.get(
             f"{API_HOST}/dataset-optimization/dataset/",
             params={"dataset_organization_id": organization_id},
-            headers=
+            headers=get_headers(),
             timeout=READ_TIMEOUT,
         )
         raise_for_status_with_reason(response)
@@ -388,7 +303,7 @@ class OptimizationDataset(BaseModel):
         response = requests.get(
             f"{API_HOST}/dataset-optimization/run/list",
             params={"dataset_organization_id": organization_id},
-            headers=
+            headers=get_headers(),
             timeout=READ_TIMEOUT,
         )
         raise_for_status_with_reason(response)
@@ -410,7 +325,7 @@ class OptimizationDataset(BaseModel):
         """
         response = requests.delete(
             f"{API_HOST}/dataset-optimization/dataset/{dataset_id}",
-            headers=
+            headers=get_headers(),
             timeout=MODIFY_TIMEOUT,
         )
         raise_for_status_with_reason(response)
@@ -482,10 +397,7 @@ class OptimizationDataset(BaseModel):
                 "organization_id": organization_id,
                 "replace_if_exists": replace_if_exists,
             },
-            headers=
-                **json_headers,
-                **get_auth_headers(),
-            },
+            headers=get_headers(),
             timeout=MODIFY_TIMEOUT,
         )
         raise_for_status_with_reason(dataset_response)
@@ -519,7 +431,7 @@ class OptimizationDataset(BaseModel):
         run_response = requests.post(
             f"{API_HOST}/dataset-optimization/run/{dataset_id}",
             json=run_info if len(run_info) > 0 else None,
-            headers=
+            headers=get_headers(),
             timeout=MODIFY_TIMEOUT,
         )
         raise_for_status_with_reason(run_response)
@@ -595,46 +507,6 @@ class OptimizationDataset(BaseModel):
         self.id = None
         self.run_id = None
 
-    @staticmethod
-    def _clean_df_index(df: "pd.DataFrame") -> "pd.DataFrame":
-        """
-        Clean the index of a dataframe in case it has unnamed columns.
-
-        Args:
-            df (DataFrame): Dataframe to clean
-
-        Returns:
-            DataFrame: Cleaned dataframe
-        """
-        index_cols = sorted(
-            [col for col in df.columns if col.startswith("Unnamed")], reverse=True
-        )
-        if len(index_cols) > 0:
-            df.set_index(index_cols.pop(), inplace=True)
-            df.rename_axis(index=None, columns=None, inplace=True)
-        if len(index_cols) > 0:
-            df.drop(columns=index_cols, inplace=True)
-
-        return df
-
-    @staticmethod
-    def _read_csvs_to_df(data: dict):
-        if data["state"] == RunStatus.SUCCESS.value:
-            data["result"]["suspects"] = OptimizationDataset._clean_df_index(
-                pd.read_csv(
-                    StringIO(data["result"]["suspects"]),
-                    dtype=CUSTOMER_INTERCHANGE_DTYPES,
-                )
-            )
-            data["result"]["warnings_and_errors"] = OptimizationDataset._clean_df_index(
-                pd.read_csv(
-                    StringIO(data["result"]["warnings_and_errors"]),
-                    dtype=CUSTOMER_INTERCHANGE_DTYPES,
-                )
-            )
-        else:
-            pass
-
     @staticmethod
     def _check_run_by_id(run_id: str, retry=0) -> Generator[dict, None, None]:
         if retry > MAX_RETRIES:
@@ -645,7 +517,7 @@ class OptimizationDataset(BaseModel):
             client,
             "GET",
             f"{API_HOST}/dataset-optimization/run/{run_id}",
-            headers=
+            headers=get_headers(),
         ):
             if sse.event == "ping":
                 continue
@@ -668,11 +540,21 @@ class OptimizationDataset(BaseModel):
                 raise HirundoError(last_event["reason"])
             else:
                 raise HirundoError("Unknown error")
-        OptimizationDataset._read_csvs_to_df(data)
         yield data
         if not last_event or last_event["data"]["state"] == RunStatus.PENDING.value:
             OptimizationDataset._check_run_by_id(run_id, retry + 1)
 
+    @staticmethod
+    def _handle_failure(iteration: dict):
+        if iteration["result"]:
+            raise HirundoError(
+                f"Optimization run failed with error: {iteration['result']}"
+            )
+        else:
+            raise HirundoError(
+                "Optimization run failed with an unknown error in _handle_failure"
+            )
+
     @staticmethod
     @overload
     def check_run_by_id(
@@ -722,16 +604,19 @@ class OptimizationDataset(BaseModel):
                 RunStatus.REJECTED.value,
                 RunStatus.REVOKED.value,
             ]:
-
-
+                logger.error(
+                    "State is failure, rejected, or revoked: %s",
+                    iteration["state"],
                 )
+                OptimizationDataset._handle_failure(iteration)
             elif iteration["state"] == RunStatus.SUCCESS.value:
                 t.close()
-
-
-
-
-
+                zip_temporary_url = iteration["result"]
+                logger.debug("Optimization run completed. Downloading results")
+
+                return download_and_extract_zip(
+                    run_id,
+                    zip_temporary_url,
                 )
             elif (
                 iteration["state"] == RunStatus.AWAITING_MANUAL_APPROVAL.value
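Success handling changes shape here: instead of parsing CSV strings out of the event payload into pandas DataFrames, the run result is now a temporary URL to a zip archive, which download_and_extract_zip (from the new hirundo/unzip.py) fetches and unpacks into a DatasetOptimizationResults. A consumer-side sketch, assuming check_run_by_id blocks until the run finishes:

    results = OptimizationDataset.check_run_by_id("my-run-id")
    print(results.cached_zip_path)   # where the downloaded zip was cached
    print(results.suspects)          # pandas or polars DataFrame, per what's installed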
@@ -767,7 +652,9 @@ class OptimizationDataset(BaseModel):
                     t.n = current_progress_percentage
                     logger.debug("Setting progress to %s", t.n)
                     t.refresh()
-        raise HirundoError(
+        raise HirundoError(
+            "Optimization run failed with an unknown error in check_run_by_id"
+        )
 
     @overload
     def check_run(
@@ -823,7 +710,7 @@ class OptimizationDataset(BaseModel):
             client,
             "GET",
             f"{API_HOST}/dataset-optimization/run/{run_id}",
-            headers=
+            headers=get_headers(),
         )
         async for sse in async_iterator:
             if sse.event == "ping":
@@ -867,12 +754,10 @@ class OptimizationDataset(BaseModel):
         Args:
             run_id: The ID of the run to cancel
         """
-        if not run_id:
-            raise ValueError("No run has been started")
         logger.info("Cancelling run with ID: %s", run_id)
         response = requests.delete(
             f"{API_HOST}/dataset-optimization/run/{run_id}",
-            headers=
+            headers=get_headers(),
             timeout=MODIFY_TIMEOUT,
         )
         raise_for_status_with_reason(response)
@@ -885,6 +770,30 @@ class OptimizationDataset(BaseModel):
             raise ValueError("No run has been started")
         self.cancel_by_id(self.run_id)
 
+    @staticmethod
+    def archive_run_by_id(run_id: str) -> None:
+        """
+        Archive the dataset optimization run for the given `run_id`.
+
+        Args:
+            run_id: The ID of the run to archive
+        """
+        logger.info("Archiving run with ID: %s", run_id)
+        response = requests.patch(
+            f"{API_HOST}/dataset-optimization/run/archive/{run_id}",
+            headers=get_headers(),
+            timeout=MODIFY_TIMEOUT,
+        )
+        raise_for_status_with_reason(response)
+
+    def archive(self) -> None:
+        """
+        Archive the current active instance's run.
+        """
+        if not self.run_id:
+            raise ValueError("No run has been started")
+        self.archive_run_by_id(self.run_id)
+
 
 class DataOptimizationDatasetOut(BaseModel):
     id: int
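The archive pair follows the same shape as cancel: a static by-ID method plus an instance method that requires a started run. Usage follows directly from the added code:

    # Archive any run by its ID
    OptimizationDataset.archive_run_by_id("my-run-id")

    # Or, given an OptimizationDataset instance whose run was started via this client:
    dataset.archive()  # raises ValueError("No run has been started") if run_id is unset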
@@ -897,7 +806,7 @@ class DataOptimizationDatasetOut(BaseModel):
     data_root_url: HirundoUrl
 
     classes: typing.Optional[list[str]] = None
-    labeling_info: LabelingInfo
+    labeling_info: typing.Union[LabelingInfo, list[LabelingInfo]]
 
     organization_id: typing.Optional[int]
     creator_id: typing.Optional[int]
@@ -908,7 +817,9 @@ class DataOptimizationDatasetOut(BaseModel):
 class DataOptimizationRunOut(BaseModel):
     id: int
     name: str
+    dataset_id: int
     run_id: str
     status: RunStatus
     approved: bool
     created_at: datetime.datetime
+    run_args: typing.Optional[RunArgs]
hirundo/dataset_optimization_results.py
ADDED
@@ -0,0 +1,42 @@
+import typing
+from pathlib import Path
+
+from pydantic import BaseModel
+from typing_extensions import TypeAliasType
+
+from hirundo._dataframe import has_pandas, has_polars
+
+DataFrameType = TypeAliasType("DataFrameType", None)
+
+if has_pandas:
+    from hirundo._dataframe import pd
+
+    DataFrameType = TypeAliasType("DataFrameType", typing.Union[pd.DataFrame, None])
+if has_polars:
+    from hirundo._dataframe import pl
+
+    DataFrameType = TypeAliasType("DataFrameType", typing.Union[pl.DataFrame, None])
+
+
+T = typing.TypeVar("T")
+
+
+class DatasetOptimizationResults(BaseModel, typing.Generic[T]):
+    model_config = {"arbitrary_types_allowed": True}
+
+    cached_zip_path: Path
+    """
+    The path to the cached zip file of the results
+    """
+    suspects: T
+    """
+    A polars/pandas DataFrame containing the results of the optimization run
+    """
+    object_suspects: typing.Optional[T]
+    """
+    A polars/pandas DataFrame containing the object-level results of the optimization run
+    """
+    warnings_and_errors: T
+    """
+    A polars/pandas DataFrame containing the warnings and errors of the optimization run
+    """
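The replacement results model is generic over the DataFrame type, so one class serves pandas installs, polars installs, or neither (when DataFrameType collapses to None). A construction sketch under the assumption that pandas is installed:

    from pathlib import Path

    import pandas as pd

    from hirundo.dataset_optimization_results import DatasetOptimizationResults

    results = DatasetOptimizationResults[pd.DataFrame](
        cached_zip_path=Path("/tmp/my-run.zip"),  # illustrative path
        suspects=pd.DataFrame({"image_path": [], "suspect_level": []}),
        object_suspects=None,                     # object-level results are optional
        warnings_and_errors=pd.DataFrame(),
    )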
hirundo/git.py
CHANGED
@@ -7,17 +7,17 @@ import requests
 from pydantic import BaseModel, field_validator
 from pydantic_core import Url
 
-from hirundo._constraints import RepoUrl
 from hirundo._env import API_HOST
-from hirundo._headers import
+from hirundo._headers import get_headers
 from hirundo._http import raise_for_status_with_reason
 from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
+from hirundo._urls import RepoUrl
 from hirundo.logger import get_logger
 
 logger = get_logger(__name__)
 
 
-class GitPlainAuthBase(BaseModel):
+class GitPlainAuth(BaseModel):
     username: str
     """
     The username for the Git repository
@@ -28,7 +28,7 @@ class GitPlainAuthBase(BaseModel):
     """
 
 
-class
+class GitSSHAuth(BaseModel):
     ssh_key: str
     """
     The SSH key for the Git repository
@@ -52,7 +52,7 @@ class GitRepo(BaseModel):
     repository_url: typing.Union[str, RepoUrl]
     """
     The URL of the Git repository, it should start with `ssh://` or `https://` or be in the form `user@host:path`.
-    If it is in the form `user@host:path`, it will be rewritten to `ssh://user@host
+    If it is in the form `user@host:path`, it will be rewritten to `ssh://user@host/path`.
     """
     organization_id: typing.Optional[int] = None
     """
@@ -60,14 +60,14 @@ class GitRepo(BaseModel):
     If not provided, it will be assigned to your default organization.
     """
 
-    plain_auth: typing.Optional[
+    plain_auth: typing.Optional[GitPlainAuth] = pydantic.Field(
         default=None, examples=[None, {"username": "ben", "password": "password"}]
     )
     """
     The plain authentication details for the Git repository.
     Use this if using a special user with a username and password for authentication.
     """
-    ssh_auth: typing.Optional[
+    ssh_auth: typing.Optional[GitSSHAuth] = pydantic.Field(
         default=None,
         examples=[
             {
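With the auth models named concretely, wiring up a repository looks like this (a sketch: the name field and the exact key string are assumptions, and a user@host:path URL like the one below would be rewritten to ssh:// form per the docstring above):

    from hirundo.git import GitRepo, GitSSHAuth

    repo = GitRepo(
        name="my-dataset-repo",  # field assumed from the by-name endpoint below
        repository_url="git@github.com:my-org/my-dataset-repo.git",
        ssh_auth=GitSSHAuth(ssh_key="-----BEGIN OPENSSH PRIVATE KEY-----\n..."),
    )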
@@ -124,10 +124,7 @@ class GitRepo(BaseModel):
                 **self.model_dump(mode="json"),
                 "replace_if_exists": replace_if_exists,
             },
-            headers=
-                **json_headers,
-                **get_auth_headers(),
-            },
+            headers=get_headers(),
             timeout=MODIFY_TIMEOUT,
         )
         raise_for_status_with_reason(git_repo)
@@ -145,7 +142,7 @@ class GitRepo(BaseModel):
         """
         git_repo = requests.get(
             f"{API_HOST}/git-repo/{git_repo_id}",
-            headers=
+            headers=get_headers(),
             timeout=READ_TIMEOUT,
         )
         raise_for_status_with_reason(git_repo)
@@ -163,7 +160,7 @@ class GitRepo(BaseModel):
         """
         git_repo = requests.get(
             f"{API_HOST}/git-repo/by-name/{name}",
-            headers=
+            headers=get_headers(),
             timeout=READ_TIMEOUT,
         )
         raise_for_status_with_reason(git_repo)
@@ -176,9 +173,7 @@ class GitRepo(BaseModel):
         """
         git_repos = requests.get(
             f"{API_HOST}/git-repo/",
-            headers=
-                **get_auth_headers(),
-            },
+            headers=get_headers(),
             timeout=READ_TIMEOUT,
         )
         raise_for_status_with_reason(git_repos)
@@ -200,9 +195,7 @@ class GitRepo(BaseModel):
         """
         git_repo = requests.delete(
             f"{API_HOST}/git-repo/{git_repo_id}",
-            headers=
-                **get_auth_headers(),
-            },
+            headers=get_headers(),
             timeout=MODIFY_TIMEOUT,
         )
         raise_for_status_with_reason(git_repo)