hirundo 0.1.8__py3-none-any.whl → 0.1.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,27 +1,28 @@
+ import datetime
  import json
  import typing
+ from abc import ABC, abstractmethod
  from collections.abc import AsyncGenerator, Generator
  from enum import Enum
- from io import StringIO
  from typing import overload

  import httpx
- import numpy as np
- import pandas as pd
  import requests
- from pandas._typing import DtypeArg
  from pydantic import BaseModel, Field, model_validator
  from tqdm import tqdm
  from tqdm.contrib.logging import logging_redirect_tqdm

+ from hirundo._constraints import HirundoUrl
  from hirundo._env import API_HOST
- from hirundo._headers import get_auth_headers, json_headers
+ from hirundo._headers import get_headers
  from hirundo._http import raise_for_status_with_reason
  from hirundo._iter_sse_retrying import aiter_sse_retrying, iter_sse_retrying
  from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
- from hirundo.enum import DatasetMetadataType, LabellingType
+ from hirundo.dataset_enum import DatasetMetadataType, LabelingType
+ from hirundo.dataset_optimization_results import DatasetOptimizationResults
  from hirundo.logger import get_logger
- from hirundo.storage import StorageIntegration, StorageLink
+ from hirundo.storage import ResponseStorageConfig, StorageConfig
+ from hirundo.unzip import download_and_extract_zip

  logger = get_logger(__name__)

@@ -38,12 +39,14 @@ MAX_RETRIES = 200 # Max 200 retries for HTTP SSE connection


  class RunStatus(Enum):
- STARTED = "STARTED"
  PENDING = "PENDING"
+ STARTED = "STARTED"
  SUCCESS = "SUCCESS"
  FAILURE = "FAILURE"
  AWAITING_MANUAL_APPROVAL = "AWAITING MANUAL APPROVAL"
- RETRYING = "RETRYING"
+ REVOKED = "REVOKED"
+ REJECTED = "REJECTED"
+ RETRY = "RETRY"


  STATUS_TO_TEXT_MAP = {
@@ -52,7 +55,9 @@ STATUS_TO_TEXT_MAP = {
  RunStatus.SUCCESS.value: "Optimization run completed successfully",
  RunStatus.FAILURE.value: "Optimization run failed",
  RunStatus.AWAITING_MANUAL_APPROVAL.value: "Awaiting manual approval",
- RunStatus.RETRYING.value: "Optimization run failed. Retrying",
+ RunStatus.RETRY.value: "Optimization run failed. Retrying",
+ RunStatus.REVOKED.value: "Optimization run was cancelled",
+ RunStatus.REJECTED.value: "Optimization run was rejected",
  }
  STATUS_TO_PROGRESS_MAP = {
  RunStatus.STARTED.value: 0.0,
@@ -60,100 +65,284 @@ STATUS_TO_PROGRESS_MAP = {
  RunStatus.SUCCESS.value: 100.0,
  RunStatus.FAILURE.value: 100.0,
  RunStatus.AWAITING_MANUAL_APPROVAL.value: 100.0,
- RunStatus.RETRYING.value: 0.0,
+ RunStatus.RETRY.value: 0.0,
+ RunStatus.REVOKED.value: 100.0,
+ RunStatus.REJECTED.value: 0.0,
  }


- class DatasetOptimizationResults(BaseModel):
- model_config = {"arbitrary_types_allowed": True}
+ class Metadata(BaseModel, ABC):
+ type: DatasetMetadataType
+
+ @property
+ @abstractmethod
+ def metadata_url(self) -> HirundoUrl:
+ raise NotImplementedError()

- suspects: pd.DataFrame
+
+ class HirundoCSV(Metadata):
  """
- A pandas DataFrame containing the results of the optimization run
+ A dataset metadata file in the Hirundo CSV format
  """
- warnings_and_errors: pd.DataFrame
+
+ type: DatasetMetadataType = DatasetMetadataType.HIRUNDO_CSV
+ csv_url: HirundoUrl
  """
- A pandas DataFrame containing the warnings and errors of the optimization run
+ The URL to access the dataset metadata CSV file.
+ e.g. `s3://my-bucket-name/my-folder/my-metadata.csv`, `gs://my-bucket-name/my-folder/my-metadata.csv`,
+ or `ssh://my-username@my-repo-name/my-folder/my-metadata.csv`
+ (or `file:///datasets/my-folder/my-metadata.csv` if using LOCAL storage type with on-premises installation)
  """

+ @property
+ def metadata_url(self) -> HirundoUrl:
+ return self.csv_url
+
+
+ class COCO(Metadata):
+ """
+ A dataset metadata file in the COCO format
+ """
+
+ type: DatasetMetadataType = DatasetMetadataType.COCO
+ json_url: HirundoUrl
+ """
+ The URL to access the dataset metadata JSON file.
+ e.g. `s3://my-bucket-name/my-folder/my-metadata.json`, `gs://my-bucket-name/my-folder/my-metadata.json`,
+ or `ssh://my-username@my-repo-name/my-folder/my-metadata.json`
+ (or `file:///datasets/my-folder/my-metadata.json` if using LOCAL storage type with on-premises installation)
+ """
+
+ @property
+ def metadata_url(self) -> HirundoUrl:
+ return self.json_url
+
+
+ class YOLO(Metadata):
+ type: DatasetMetadataType = DatasetMetadataType.YOLO
+ data_yaml_url: typing.Optional[HirundoUrl] = None
+ labels_dir_url: HirundoUrl
+
+ @property
+ def metadata_url(self) -> HirundoUrl:
+ return self.labels_dir_url
+
+
+ LabelingInfo = typing.Union[HirundoCSV, COCO, YOLO]
+ """
133
+ The dataset labeling info. The dataset labeling info can be one of the following:
134
+ - `DatasetMetadataType.HirundoCSV`: Indicates that the dataset metadata file is a CSV file with the Hirundo format
135
+
136
+ Currently no other formats are supported. Future versions of `hirundo` may support additional formats.
137
+ """
+
+
+ class VisionRunArgs(BaseModel):
+ upsample: bool = False
+ """
+ Whether to upsample the dataset to attempt to balance the classes.
+ """
+ min_abs_bbox_size: int = 0
+ """
+ Minimum valid size (in pixels) of a bounding box to keep it in the dataset for optimization.
+ """
+ min_abs_bbox_area: int = 0
+ """
+ Minimum valid absolute area (in pixels²) of a bounding box to keep it in the dataset for optimization.
+ """
+ min_rel_bbox_size: float = 0.0
+ """
+ Minimum valid size (as a fraction of both image height and width) for a bounding box
+ to keep it in the dataset for optimization, relative to the corresponding dimension size,
+ i.e. if the bounding box is 10% of the image width and 5% of the image height, it will be kept if this value is 0.05, but not if the
+ value is 0.06 (since both width and height are checked).
+ """
+ min_rel_bbox_area: float = 0.0
+ """
+ Minimum valid relative area (as a fraction of the image area) of a bounding box to keep it in the dataset for optimization.
+ """
+
+
+ RunArgs = typing.Union[VisionRunArgs]
+
+
+ class AugmentationName(str, Enum):
+ RANDOM_HORIZONTAL_FLIP = "RandomHorizontalFlip"
+ RANDOM_VERTICAL_FLIP = "RandomVerticalFlip"
+ RANDOM_ROTATION = "RandomRotation"
+ RANDOM_PERSPECTIVE = "RandomPerspective"
+ GAUSSIAN_NOISE = "GaussianNoise"
+ RANDOM_GRAYSCALE = "RandomGrayscale"
+ GAUSSIAN_BLUR = "GaussianBlur"

- CUSTOMER_INTERCHANGE_DTYPES: DtypeArg = {
- "image_path": str,
- "label_path": str,
- "segments_mask_path": str,
- "segment_id": np.int32,
- "label": str,
- "bbox_id": str,
- "xmin": np.int32,
- "ymin": np.int32,
- "xmax": np.int32,
- "ymax": np.int32,
- "suspect_level": np.float32, # If exists, must be one of the values in the enum below
- "suggested_label": str,
- "suggested_label_conf": np.float32,
- "status": str,
- # ⬆️ If exists, must be one of the following:
- # NO_LABELS/MISSING_IMAGE/INVALID_IMAGE/INVALID_BBOX/INVALID_BBOX_SIZE/INVALID_SEG/INVALID_SEG_SIZE
- }
+
+ class Modality(str, Enum):
+ IMAGE = "Image"
+ RADAR = "Radar"
+ EKG = "EKG"


  class OptimizationDataset(BaseModel):
+ id: typing.Optional[int] = Field(default=None)
+ """
+ The ID of the dataset created on the server.
+ """
  name: str
  """
  The name of the dataset. Used to identify it amongst the list of datasets
  belonging to your organization in `hirundo`.
  """
- labelling_type: LabellingType
+ labeling_type: LabelingType
  """
- Indicates the labelling type of the dataset. The labelling type can be one of the following:
- - `LabellingType.SingleLabelClassification`: Indicates that the dataset is for classification tasks
- - `LabellingType.ObjectDetection`: Indicates that the dataset is for object detection tasks
+ Indicates the labeling type of the dataset. The labeling type can be one of the following:
+ - `LabelingType.SINGLE_LABEL_CLASSIFICATION`: Indicates that the dataset is for classification tasks
+ - `LabelingType.OBJECT_DETECTION`: Indicates that the dataset is for object detection tasks
+ - `LabelingType.SPEECH_TO_TEXT`: Indicates that the dataset is for speech-to-text tasks
  """
- dataset_storage: typing.Optional[StorageLink]
+ language: typing.Optional[str] = None
  """
- The storage link to the dataset. This can be a link to a file or a directory containing the dataset.
- If `None`, the `dataset_id` field must be set.
+ Language of the Speech-to-Text audio dataset. This is required for Speech-to-Text datasets.
  """
-
- classes: typing.Optional[list[str]] = None
+ storage_config_id: typing.Optional[int] = None
  """
- A full list of possible classes used in classification / object detection.
- It is currently required for clarity and performance.
+ The ID of the storage config used to store the dataset and metadata.
+ """
+ storage_config: typing.Optional[
+ typing.Union[StorageConfig, ResponseStorageConfig]
+ ] = None
  """
- dataset_metadata_path: str = "metadata.csv"
+ The `StorageConfig` instance to link to.
  """
- The path to the dataset metadata file within storage integration, e.g. S3 Bucket / GCP Bucket / Azure Blob storage / Git repo.
- Note: This path will be prefixed with the `StorageLink`'s `path`.
+ data_root_url: HirundoUrl
  """
- dataset_metadata_type: DatasetMetadataType = DatasetMetadataType.HirundoCSV
+ URL for data (e.g. images) within the `StorageConfig` instance,
+ e.g. `s3://my-bucket-name/my-images-folder`, `gs://my-bucket-name/my-images-folder`,
+ or `ssh://my-username@my-repo-name/my-images-folder`
+ (or `file:///datasets/my-images-folder` if using LOCAL storage type with on-premises installation)
+
+ Note: All CSV `image_path` entries in the metadata file should be relative to this folder.
  """
- The type of dataset metadata file. The dataset metadata file can be one of the following:
- - `DatasetMetadataType.HirundoCSV`: Indicates that the dataset metadata file is a CSV file with the Hirundo format

- Currently no other formats are supported. Future versions of `hirundo` may support additional formats.
+ classes: typing.Optional[list[str]] = None
+ """
+ A full list of possible classes used in classification / object detection.
+ It is currently required for clarity and performance.
  """
+ labeling_info: LabelingInfo

- storage_integration_id: typing.Optional[int] = Field(default=None, init=False)
+ augmentations: typing.Optional[list[AugmentationName]] = None
  """
- The ID of the storage integration used to store the dataset and metadata.
+ Used to define which augmentations to apply to a vision dataset.
+ For audio datasets, this field is ignored.
+ If no value is provided, all augmentations are applied to vision datasets.
  """
- dataset_id: typing.Optional[int] = Field(default=None, init=False)
+ modality: Modality = Modality.IMAGE
  """
- The ID of the dataset created on the server.
+ Used to define the modality of the dataset.
+ Defaults to Image.
  """
+
  run_id: typing.Optional[str] = Field(default=None, init=False)
  """
  The ID of the Dataset Optimization run created on the server.
  """

+ status: typing.Optional[RunStatus] = None
+
  @model_validator(mode="after")
  def validate_dataset(self):
- if self.dataset_storage is None and self.storage_integration_id is None:
- raise ValueError("No dataset storage has been provided")
+ if self.storage_config is None and self.storage_config_id is None:
+ raise ValueError(
+ "No dataset storage has been provided. Provide one via `storage_config` or `storage_config_id`"
+ )
+ elif self.storage_config is not None and self.storage_config_id is not None:
+ raise ValueError(
+ "Both `storage_config` and `storage_config_id` have been provided. Pick one."
+ )
+ if self.labeling_type == LabelingType.SPEECH_TO_TEXT and self.language is None:
+ raise ValueError("Language is required for Speech-to-Text datasets.")
+ elif (
+ self.labeling_type != LabelingType.SPEECH_TO_TEXT
+ and self.language is not None
+ ):
+ raise ValueError("Language is only allowed for Speech-to-Text datasets.")
+ if (
+ self.labeling_info.type == DatasetMetadataType.YOLO
+ and isinstance(self.labeling_info, YOLO)
+ and (
+ self.labeling_info.data_yaml_url is not None
+ and self.classes is not None
+ )
+ ):
+ raise ValueError(
+ "Only one of `classes` or `labeling_info.data_yaml_url` should be provided for YOLO datasets"
+ )
  return self

  @staticmethod
- def list(organization_id: typing.Optional[int] = None) -> list[dict]:
+ def get_by_id(dataset_id: int) -> "OptimizationDataset":
+ """
+ Get an `OptimizationDataset` instance from the server by its ID
+
+ Args:
+ dataset_id: The ID of the `OptimizationDataset` instance to get
+ """
+ response = requests.get(
+ f"{API_HOST}/dataset-optimization/dataset/{dataset_id}",
+ headers=get_headers(),
+ timeout=READ_TIMEOUT,
+ )
+ raise_for_status_with_reason(response)
+ dataset = response.json()
+ return OptimizationDataset(**dataset)
+
+ @staticmethod
+ def get_by_name(name: str) -> "OptimizationDataset":
+ """
+ Get an `OptimizationDataset` instance from the server by its name
+
+ Args:
+ name: The name of the `OptimizationDataset` instance to get
+ """
+ response = requests.get(
+ f"{API_HOST}/dataset-optimization/dataset/by-name/{name}",
+ headers=get_headers(),
+ timeout=READ_TIMEOUT,
+ )
+ raise_for_status_with_reason(response)
+ dataset = response.json()
+ return OptimizationDataset(**dataset)
+
+ @staticmethod
+ def list_datasets(
+ organization_id: typing.Optional[int] = None,
+ ) -> list["DataOptimizationDatasetOut"]:
+ """
+ Lists all the optimization datasets created by the user's default organization
+ or the `organization_id` passed
+
+ Args:
+ organization_id: The ID of the organization to list the datasets for.
+ """
+ response = requests.get(
+ f"{API_HOST}/dataset-optimization/dataset/",
+ params={"dataset_organization_id": organization_id},
+ headers=get_headers(),
+ timeout=READ_TIMEOUT,
+ )
+ raise_for_status_with_reason(response)
+ datasets = response.json()
+ return [
+ DataOptimizationDatasetOut(
+ **ds,
+ )
+ for ds in datasets
+ ]
+
+ @staticmethod
+ def list_runs(
+ organization_id: typing.Optional[int] = None,
+ ) -> list["DataOptimizationRunOut"]:
  """
  Lists all the `OptimizationDataset` instances created by user's default organization
  or the `organization_id` passed
@@ -163,13 +352,19 @@ class OptimizationDataset(BaseModel):
  organization_id: The ID of the organization to list the datasets for.
  """
  response = requests.get(
- f"{API_HOST}/dataset-optimization/dataset/",
+ f"{API_HOST}/dataset-optimization/run/list",
  params={"dataset_organization_id": organization_id},
- headers=get_auth_headers(),
+ headers=get_headers(),
  timeout=READ_TIMEOUT,
  )
  raise_for_status_with_reason(response)
- return response.json()
+ runs = response.json()
+ return [
+ DataOptimizationRunOut(
+ **run,
+ )
+ for run in runs
+ ]

  @staticmethod
  def delete_by_id(dataset_id: int) -> None:
@@ -181,73 +376,94 @@ class OptimizationDataset(BaseModel):
  """
  response = requests.delete(
  f"{API_HOST}/dataset-optimization/dataset/{dataset_id}",
- headers=get_auth_headers(),
+ headers=get_headers(),
  timeout=MODIFY_TIMEOUT,
  )
  raise_for_status_with_reason(response)
  logger.info("Deleted dataset with ID: %s", dataset_id)

- def delete(self, storage_integration=True) -> None:
+ def delete(self, storage_config=True) -> None:
  """
  Deletes the active `OptimizationDataset` instance from the server.
  It can only be used on a `OptimizationDataset` instance that has been created.

  Args:
- storage_integration: If True, the `OptimizationDataset`'s `StorageIntegration` will also be deleted
+ storage_config: If True, the `OptimizationDataset`'s `StorageConfig` will also be deleted

- Note: If `storage_integration` is not set to `False` then the `storage_integration_id` must be set
- This can either be set manually or by creating the `StorageIntegration` instance via the `OptimizationDataset`'s
+ Note: If `storage_config` is not set to `False` then the `storage_config_id` must be set
+ This can either be set manually or by creating the `StorageConfig` instance via the `OptimizationDataset`'s
  `create` method
  """
- if storage_integration:
- if not self.storage_integration_id:
- raise ValueError("No storage integration has been created")
- StorageIntegration.delete_by_id(self.storage_integration_id)
- if not self.dataset_id:
+ if storage_config:
+ if not self.storage_config_id:
+ raise ValueError("No storage config has been created")
+ StorageConfig.delete_by_id(self.storage_config_id)
+ if not self.id:
  raise ValueError("No dataset has been created")
- self.delete_by_id(self.dataset_id)
+ self.delete_by_id(self.id)

- def create(self) -> int:
+ def create(
+ self,
+ organization_id: typing.Optional[int] = None,
+ replace_if_exists: bool = False,
+ ) -> int:
  """
  Create a `OptimizationDataset` instance on the server.
- If `storage_integration_id` is not set, it will be created.
+ If the `storage_config_id` field is not set, the storage config will also be created and the field will be set.
+
+ Args:
+ organization_id: The ID of the organization to create the dataset for.
+ replace_if_exists: If True, the dataset will be replaced if it already exists
+ (this is determined by a dataset of the same name in the same organization).
+
+ Returns:
+ The ID of the created `OptimizationDataset` instance
  """
- if not self.dataset_storage:
+ if self.storage_config is None and self.storage_config_id is None:
  raise ValueError("No dataset storage has been provided")
- if (
- self.dataset_storage
- and self.dataset_storage.storage_integration
- and not self.storage_integration_id
+ elif self.storage_config and self.storage_config_id is None:
+ if isinstance(self.storage_config, ResponseStorageConfig):
+ self.storage_config_id = self.storage_config.id
+ elif isinstance(self.storage_config, StorageConfig):
+ self.storage_config_id = self.storage_config.create(
+ replace_if_exists=replace_if_exists,
+ )
+ elif (
+ self.storage_config is not None
+ and self.storage_config_id is not None
+ and (
+ not isinstance(self.storage_config, ResponseStorageConfig)
+ or self.storage_config.id != self.storage_config_id
+ )
  ):
- self.storage_integration_id = (
- self.dataset_storage.storage_integration.create()
+ raise ValueError(
+ "Both `storage_config` and `storage_config_id` have been provided. Storage config IDs do not match."
  )
- model_dict = self.model_dump()
+ model_dict = self.model_dump(mode="json")
  # ⬆️ Get dict of model fields from Pydantic model instance
  dataset_response = requests.post(
  f"{API_HOST}/dataset-optimization/dataset/",
  json={
- "dataset_storage": {
- "storage_integration_id": self.storage_integration_id,
- "path": self.dataset_storage.path,
- },
- **{k: model_dict[k] for k in model_dict.keys() - {"dataset_storage"}},
- },
- headers={
- **json_headers,
- **get_auth_headers(),
+ **{k: model_dict[k] for k in model_dict.keys() - {"storage_config"}},
+ "organization_id": organization_id,
+ "replace_if_exists": replace_if_exists,
  },
+ headers=get_headers(),
  timeout=MODIFY_TIMEOUT,
  )
  raise_for_status_with_reason(dataset_response)
- self.dataset_id = dataset_response.json()["id"]
- if not self.dataset_id:
- raise HirundoError("Failed to create the dataset")
- logger.info("Created dataset with ID: %s", self.dataset_id)
- return self.dataset_id
+ self.id = dataset_response.json()["id"]
+ if not self.id:
+ raise HirundoError("An error ocurred while trying to create the dataset")
+ logger.info("Created dataset with ID: %s", self.id)
+ return self.id

  @staticmethod
- def launch_optimization_run(dataset_id: int) -> str:
+ def launch_optimization_run(
+ dataset_id: int,
+ organization_id: typing.Optional[int] = None,
+ run_args: typing.Optional[RunArgs] = None,
+ ) -> str:
  """
  Run the dataset optimization process on the server using the dataset with the given ID
  i.e. `dataset_id`.
@@ -258,26 +474,62 @@ class OptimizationDataset(BaseModel):
  Returns:
  ID of the run (`run_id`).
  """
+ run_info = {}
+ if organization_id:
+ run_info["organization_id"] = organization_id
+ if run_args:
+ run_info["run_args"] = run_args.model_dump(mode="json")
  run_response = requests.post(
  f"{API_HOST}/dataset-optimization/run/{dataset_id}",
- headers=get_auth_headers(),
+ json=run_info if len(run_info) > 0 else None,
+ headers=get_headers(),
  timeout=MODIFY_TIMEOUT,
  )
  raise_for_status_with_reason(run_response)
  return run_response.json()["run_id"]

- def run_optimization(self) -> str:
+ def _validate_run_args(self, run_args: RunArgs) -> None:
+ if self.labeling_type == LabelingType.SPEECH_TO_TEXT:
+ raise Exception("Speech to text cannot have `run_args` set")
+ if self.labeling_type != LabelingType.OBJECT_DETECTION and any(
+ (
+ run_args.min_abs_bbox_size != 0,
+ run_args.min_abs_bbox_area != 0,
+ run_args.min_rel_bbox_size != 0,
+ run_args.min_rel_bbox_area != 0,
+ )
+ ):
+ raise Exception(
+ "Cannot set `min_abs_bbox_size`, `min_abs_bbox_area`, "
+ + "`min_rel_bbox_size`, or `min_rel_bbox_area` for "
+ + f"labeling type {self.labeling_type}"
+ )
+
+ def run_optimization(
+ self,
+ organization_id: typing.Optional[int] = None,
+ replace_dataset_if_exists: bool = False,
+ run_args: typing.Optional[RunArgs] = None,
+ ) -> str:
  """
  If the dataset was not created on the server yet, it is created.
  Run the dataset optimization process on the server using the active `OptimizationDataset` instance

+ Args:
+ organization_id: The ID of the organization to run the optimization for.
+ replace_dataset_if_exists: If True, the dataset will be replaced if it already exists
+ (this is determined by a dataset of the same name in the same organization).
+ run_args: The run arguments to use for the optimization run
+
  Returns:
  An ID of the run (`run_id`) and stores that `run_id` on the instance
  """
  try:
- if not self.dataset_id:
- self.dataset_id = self.create()
- run_id = self.launch_optimization_run(self.dataset_id)
+ if not self.id:
+ self.id = self.create(replace_if_exists=replace_dataset_if_exists)
+ if run_args is not None:
+ self._validate_run_args(run_args)
+ run_id = self.launch_optimization_run(self.id, organization_id, run_args)
  self.run_id = run_id
  logger.info("Started the run with ID: %s", run_id)
  return run_id
@@ -293,59 +545,19 @@ class OptimizationDataset(BaseModel):
  except Exception:
  content = error.response.text
  raise HirundoError(
- f"Failed to start the run. Status code: {error.response.status_code} Content: {content}"
+ f"Unable to start the run. Status code: {error.response.status_code} Content: {content}"
  ) from error
  except Exception as error:
- raise HirundoError(f"Failed to start the run: {error}") from error
+ raise HirundoError(f"Unable to start the run: {error}") from error

  def clean_ids(self):
  """
- Reset `dataset_id`, `storage_integration_id`, and `run_id` values on the instance to default value of `None`
+ Reset `id`, `storage_config_id`, and `run_id` values on the instance to the default value of `None`
  """
- self.storage_integration_id = None
- self.dataset_id = None
+ self.storage_config_id = None
+ self.id = None
  self.run_id = None

- @staticmethod
- def _clean_df_index(df: "pd.DataFrame") -> "pd.DataFrame":
- """
- Clean the index of a dataframe in case it has unnamed columns.
-
- Args:
- df (DataFrame): Dataframe to clean
-
- Returns:
- DataFrame: Cleaned dataframe
- """
- index_cols = sorted(
- [col for col in df.columns if col.startswith("Unnamed")], reverse=True
- )
- if len(index_cols) > 0:
- df.set_index(index_cols.pop(), inplace=True)
- df.rename_axis(index=None, columns=None, inplace=True)
- if len(index_cols) > 0:
- df.drop(columns=index_cols, inplace=True)
-
- return df
-
- @staticmethod
- def _read_csvs_to_df(data: dict):
- if data["state"] == RunStatus.SUCCESS.value:
- data["result"]["suspects"] = OptimizationDataset._clean_df_index(
- pd.read_csv(
- StringIO(data["result"]["suspects"]),
- dtype=CUSTOMER_INTERCHANGE_DTYPES,
- )
- )
- data["result"]["warnings_and_errors"] = OptimizationDataset._clean_df_index(
- pd.read_csv(
- StringIO(data["result"]["warnings_and_errors"]),
- dtype=CUSTOMER_INTERCHANGE_DTYPES,
- )
- )
- else:
- pass
-
  @staticmethod
  def _check_run_by_id(run_id: str, retry=0) -> Generator[dict, None, None]:
  if retry > MAX_RETRIES:
@@ -356,7 +568,7 @@ class OptimizationDataset(BaseModel):
  client,
  "GET",
  f"{API_HOST}/dataset-optimization/run/{run_id}",
- headers=get_auth_headers(),
+ headers=get_headers(),
  ):
  if sse.event == "ping":
  continue
@@ -370,8 +582,15 @@ class OptimizationDataset(BaseModel):
  last_event = json.loads(sse.data)
  if not last_event:
  continue
- data = last_event["data"]
- OptimizationDataset._read_csvs_to_df(data)
+ if "data" in last_event:
+ data = last_event["data"]
+ else:
+ if "detail" in last_event:
+ raise HirundoError(last_event["detail"])
+ elif "reason" in last_event:
+ raise HirundoError(last_event["reason"])
+ else:
+ raise HirundoError("Unknown error")
  yield data
  if not last_event or last_event["data"]["state"] == RunStatus.PENDING.value:
  OptimizationDataset._check_run_by_id(run_id, retry + 1)
@@ -420,17 +639,22 @@ class OptimizationDataset(BaseModel):
  t.n = STATUS_TO_PROGRESS_MAP[iteration["state"]]
  logger.debug("Setting progress to %s", t.n)
  t.refresh()
- if iteration["state"] == RunStatus.FAILURE.value:
+ if iteration["state"] in [
+ RunStatus.FAILURE.value,
+ RunStatus.REJECTED.value,
+ RunStatus.REVOKED.value,
+ ]:
  raise HirundoError(
  f"Optimization run failed with error: {iteration['result']}"
  )
  elif iteration["state"] == RunStatus.SUCCESS.value:
  t.close()
- return DatasetOptimizationResults(
- suspects=iteration["result"]["suspects"],
- warnings_and_errors=iteration["result"][
- "warnings_and_errors"
- ],
+ zip_temporary_url = iteration["result"]
+ logger.debug("Optimization run completed. Downloading results")
+
+ return download_and_extract_zip(
+ run_id,
+ zip_temporary_url,
  )
  elif (
  iteration["state"] == RunStatus.AWAITING_MANUAL_APPROVAL.value
@@ -445,13 +669,22 @@ class OptimizationDataset(BaseModel):
  and iteration["result"]["result"]
  and isinstance(iteration["result"]["result"], str)
  ):
- current_progress_percentage = float(
- iteration["result"]["result"].removesuffix("% done")
- )
+ result_info = iteration["result"]["result"].split(":")
+ if len(result_info) > 1:
+ stage = result_info[0]
+ current_progress_percentage = float(
+ result_info[1].removeprefix(" ").removesuffix("% done")
+ )
+ elif len(result_info) == 1:
+ stage = result_info[0]
+ current_progress_percentage = t.n # Keep the same progress
+ else:
+ stage = "Unknown progress state"
+ current_progress_percentage = t.n # Keep the same progress
  desc = (
  "Optimization run completed. Uploading results"
  if current_progress_percentage == 100.0
- else "Optimization run in progress"
+ else stage
  )
  t.set_description(desc)
  t.n = current_progress_percentage
@@ -513,7 +746,7 @@ class OptimizationDataset(BaseModel):
  client,
  "GET",
  f"{API_HOST}/dataset-optimization/run/{run_id}",
- headers=get_auth_headers(),
+ headers=get_headers(),
  )
  async for sse in async_iterator:
  if sse.event == "ping":
@@ -562,7 +795,7 @@ class OptimizationDataset(BaseModel):
  logger.info("Cancelling run with ID: %s", run_id)
  response = requests.delete(
  f"{API_HOST}/dataset-optimization/run/{run_id}",
- headers=get_auth_headers(),
+ headers=get_headers(),
  timeout=MODIFY_TIMEOUT,
  )
  raise_for_status_with_reason(response)
@@ -574,3 +807,33 @@ class OptimizationDataset(BaseModel):
  if not self.run_id:
  raise ValueError("No run has been started")
  self.cancel_by_id(self.run_id)
+
+
+ class DataOptimizationDatasetOut(BaseModel):
+ id: int
+
+ name: str
+ labeling_type: LabelingType
+
+ storage_config: ResponseStorageConfig
+
+ data_root_url: HirundoUrl
+
+ classes: typing.Optional[list[str]] = None
+ labeling_info: LabelingInfo
+
+ organization_id: typing.Optional[int]
+ creator_id: typing.Optional[int]
+ created_at: datetime.datetime
+ updated_at: datetime.datetime
+
+
+ class DataOptimizationRunOut(BaseModel):
+ id: int
+ name: str
+ dataset_id: int
+ run_id: str
+ status: RunStatus
+ approved: bool
+ created_at: datetime.datetime
+ run_args: typing.Optional[RunArgs]
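
In practical terms, the changes above rename `LabellingType` to `LabelingType` and `StorageIntegration`/`StorageLink` to `StorageConfig`/`ResponseStorageConfig`, replace the `dataset_storage`/`dataset_metadata_path`/`dataset_metadata_type` fields with `data_root_url` plus a `labeling_info` object, and return run results as a downloaded ZIP (via `download_and_extract_zip`) instead of in-memory pandas DataFrames. The following is a minimal usage sketch against the 0.1.16 surface shown in this diff; the top-level import path for `OptimizationDataset` and `HirundoCSV` is an assumption (the diff does not show the module name), and it assumes a storage config with ID 1 already exists on the server.

```python
# Hypothetical sketch based solely on the symbols visible in this diff.
# Assumption: OptimizationDataset and HirundoCSV are importable from the top-level package.
from hirundo import HirundoCSV, OptimizationDataset
from hirundo.dataset_enum import LabelingType

dataset = OptimizationDataset(
    name="my-dataset",
    labeling_type=LabelingType.SINGLE_LABEL_CLASSIFICATION,
    storage_config_id=1,  # exactly one of storage_config / storage_config_id may be set
    data_root_url="s3://my-bucket-name/my-images-folder",
    labeling_info=HirundoCSV(
        csv_url="s3://my-bucket-name/my-folder/my-metadata.csv",
    ),
    classes=["cat", "dog"],
)

# run_optimization() creates the dataset server-side if needed (see `create` above),
# launches the run, and stores the returned run_id on the instance.
run_id = dataset.run_optimization()
print(f"Launched dataset optimization run {run_id}")
```

Note that the progress-tracking helper (whose definition lies outside the hunks shown) now resolves a successful run to a temporary ZIP URL and returns the extracted `DatasetOptimizationResults`, rather than parsing `suspects` and `warnings_and_errors` CSVs into DataFrames as in 0.1.8.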