hirundo 0.1.6__tar.gz → 0.1.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- {hirundo-0.1.6 → hirundo-0.1.7}/PKG-INFO +2 -1
- {hirundo-0.1.6 → hirundo-0.1.7}/hirundo/__init__.py +1 -1
- hirundo-0.1.7/hirundo/_env.py +15 -0
- hirundo-0.1.7/hirundo/_headers.py +13 -0
- {hirundo-0.1.6 → hirundo-0.1.7}/hirundo/cli.py +5 -1
- {hirundo-0.1.6 → hirundo-0.1.7}/hirundo/dataset_optimization.py +167 -41
- {hirundo-0.1.6 → hirundo-0.1.7}/hirundo/git.py +6 -6
- hirundo-0.1.7/hirundo/logger.py +8 -0
- {hirundo-0.1.6 → hirundo-0.1.7}/hirundo/storage.py +40 -5
- {hirundo-0.1.6 → hirundo-0.1.7}/hirundo.egg-info/PKG-INFO +2 -1
- {hirundo-0.1.6 → hirundo-0.1.7}/hirundo.egg-info/SOURCES.txt +1 -0
- {hirundo-0.1.6 → hirundo-0.1.7}/hirundo.egg-info/requires.txt +1 -0
- {hirundo-0.1.6 → hirundo-0.1.7}/pyproject.toml +2 -1
- hirundo-0.1.6/hirundo/_env.py +0 -12
- hirundo-0.1.6/hirundo/_headers.py +0 -9
- {hirundo-0.1.6 → hirundo-0.1.7}/LICENSE +0 -0
- {hirundo-0.1.6 → hirundo-0.1.7}/README.md +0 -0
- {hirundo-0.1.6 → hirundo-0.1.7}/hirundo/__main__.py +0 -0
- {hirundo-0.1.6 → hirundo-0.1.7}/hirundo/_constraints.py +0 -0
- {hirundo-0.1.6 → hirundo-0.1.7}/hirundo/_iter_sse_retrying.py +0 -0
- {hirundo-0.1.6 → hirundo-0.1.7}/hirundo/_timeouts.py +0 -0
- {hirundo-0.1.6 → hirundo-0.1.7}/hirundo/enum.py +0 -0
- {hirundo-0.1.6 → hirundo-0.1.7}/hirundo.egg-info/dependency_links.txt +0 -0
- {hirundo-0.1.6 → hirundo-0.1.7}/hirundo.egg-info/entry_points.txt +0 -0
- {hirundo-0.1.6 → hirundo-0.1.7}/hirundo.egg-info/top_level.txt +0 -0
- {hirundo-0.1.6 → hirundo-0.1.7}/setup.cfg +0 -0
{hirundo-0.1.6 → hirundo-0.1.7}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: hirundo
-Version: 0.1.6
+Version: 0.1.7
 Summary: This package is used to interface with Hirundo's platform. It provides a simple API to optimize your ML datasets.
 Author-email: Hirundo <dev@hirundo.io>
 License: MIT License
@@ -32,6 +32,7 @@ Requires-Dist: httpx>=0.27.0
 Requires-Dist: stamina>=24.2.0
 Requires-Dist: httpx-sse>=0.4.0
 Requires-Dist: pandas>=2.2.2
+Requires-Dist: tqdm>=4.66.5
 Provides-Extra: dev
 Requires-Dist: pyyaml>=6.0.1; extra == "dev"
 Requires-Dist: types-PyYAML>=6.0.12; extra == "dev"
hirundo-0.1.7/hirundo/_env.py
ADDED

@@ -0,0 +1,15 @@
+import os
+
+from dotenv import load_dotenv
+
+load_dotenv()
+
+API_HOST = os.getenv("API_HOST", "https://api.hirundo.io")
+API_KEY = os.getenv("API_KEY")
+
+
+def check_api_key():
+    if not API_KEY:
+        raise ValueError(
+            "API_KEY is not set. Please run `hirundo setup` to set the API key"
+        )
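The net effect of this new module (compare the deleted 0.1.6 version at the bottom of this diff): importing `hirundo._env` no longer raises when `API_KEY` is unset; the check is deferred to an explicit `check_api_key()` call. A minimal sketch of the new flow — the caller code here is hypothetical:

    # Hypothetical caller illustrating the 0.1.7 behaviour: the import itself
    # now succeeds even when API_KEY is missing from the environment/.env file.
    from hirundo._env import API_HOST, check_api_key

    check_api_key()  # only this call raises ValueError if API_KEY is unset
    print(f"Using API host: {API_HOST}")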
{hirundo-0.1.6 → hirundo-0.1.7}/hirundo/cli.py

@@ -14,8 +14,12 @@ hirundo_epilog = (
     else "Made with ❤️ by Hirundo. Visit https://www.hirundo.io for more information."
 )
 
+
 app = typer.Typer(
-    name="hirundo",
+    name="hirundo",
+    no_args_is_help=True,
+    rich_markup_mode="rich",
+    epilog=hirundo_epilog,
 )
 
 
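The new `typer.Typer` options are stock Typer features: `no_args_is_help=True` prints the help screen when the CLI is invoked without arguments, `rich_markup_mode="rich"` enables Rich markup in help text, and `epilog` appends `hirundo_epilog` to the bottom of `--help` output. A self-contained sketch of the same pattern on a toy CLI (not Hirundo's actual commands):

    import typer

    app = typer.Typer(
        name="demo",
        no_args_is_help=True,      # bare `demo` shows help instead of erroring
        rich_markup_mode="rich",   # allow [bold]Rich[/bold] markup in help strings
        epilog="Made with care.",  # printed after the options in --help
    )

    @app.command()
    def hello(name: str):
        typer.echo(f"Hello {name}")

    if __name__ == "__main__":
        app()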
{hirundo-0.1.6 → hirundo-0.1.7}/hirundo/dataset_optimization.py

@@ -1,22 +1,26 @@
 import json
-import
+import typing
 from collections.abc import AsyncGenerator, Generator
+from enum import Enum
 from io import StringIO
-from typing import Union
+from typing import Union, overload
 
 import httpx
 import pandas as pd
 import requests
 from pydantic import BaseModel, Field, model_validator
+from tqdm import tqdm
+from tqdm.contrib.logging import logging_redirect_tqdm
 
 from hirundo._env import API_HOST
-from hirundo._headers import
+from hirundo._headers import get_auth_headers, json_headers
 from hirundo._iter_sse_retrying import aiter_sse_retrying, iter_sse_retrying
 from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
 from hirundo.enum import DatasetMetadataType, LabellingType
+from hirundo.logger import get_logger
 from hirundo.storage import StorageIntegration, StorageLink
 
-logger =
+logger = get_logger(__name__)
 
 
 class HirundoError(Exception):
@@ -30,6 +34,14 @@ class HirundoError(Exception):
 MAX_RETRIES = 200  # Max 200 retries for HTTP SSE connection
 
 
+class RunStatus(Enum):
+    STARTED = "STARTED"
+    PENDING = "PENDING"
+    SUCCESS = "SUCCESS"
+    FAILURE = "FAILURE"
+    AWAITING_MANUAL_APPROVAL = "AWAITING MANUAL APPROVAL"
+
+
 class OptimizationDataset(BaseModel):
     name: str
     """
@@ -98,7 +110,7 @@ class OptimizationDataset(BaseModel):
         response = requests.get(
             f"{API_HOST}/dataset-optimization/dataset/",
             params={"dataset_organization_id": organization_id},
-            headers=
+            headers=get_auth_headers(),
             timeout=READ_TIMEOUT,
         )
         response.raise_for_status()
@@ -114,10 +126,11 @@ class OptimizationDataset(BaseModel):
         """
         response = requests.delete(
             f"{API_HOST}/dataset-optimization/dataset/{dataset_id}",
-            headers=
+            headers=get_auth_headers(),
             timeout=MODIFY_TIMEOUT,
         )
         response.raise_for_status()
+        logger.info("Deleted dataset with ID: %s", dataset_id)
 
     def delete(self, storage_integration=True) -> None:
         """
@@ -167,7 +180,7 @@ class OptimizationDataset(BaseModel):
             },
             headers={
                 **json_headers,
-                **
+                **get_auth_headers(),
             },
             timeout=MODIFY_TIMEOUT,
         )
@@ -175,6 +188,7 @@ class OptimizationDataset(BaseModel):
         self.dataset_id = dataset_response.json()["id"]
         if not self.dataset_id:
             raise HirundoError("Failed to create the dataset")
+        logger.info("Created dataset with ID: %s", self.dataset_id)
         return self.dataset_id
 
     @staticmethod
@@ -191,7 +205,7 @@ class OptimizationDataset(BaseModel):
         """
         run_response = requests.post(
             f"{API_HOST}/dataset-optimization/run/{dataset_id}",
-            headers=
+            headers=get_auth_headers(),
             timeout=MODIFY_TIMEOUT,
         )
         run_response.raise_for_status()
@@ -210,6 +224,7 @@ class OptimizationDataset(BaseModel):
             self.dataset_id = self.create()
             run_id = self.launch_optimization_run(self.dataset_id)
             self.run_id = run_id
+            logger.info("Started the run with ID: %s", run_id)
             return run_id
         except requests.HTTPError as error:
             try:
@@ -237,30 +252,38 @@ class OptimizationDataset(BaseModel):
         self.run_id = None
 
     @staticmethod
-    def
-        if data["state"] == "SUCCESS":
-            data["result"] = pd.read_csv(StringIO(data["result"]))
-        else:
-            pass
-
-    @staticmethod
-    def check_run_by_id(run_id: str, retry=0) -> Generator[dict, None, None]:
+    def _clean_df_index(df: "pd.DataFrame") -> "pd.DataFrame":
         """
-
-
-        This generator will produce values to show progress of the run.
+        Clean the index of a dataframe in case it has unnamed columns.
 
         Args:
-
-            retry: A number used to track the number of retries to limit re-checks. *Do not* provide this value manually.
-
-        Yields:
-            Each event will be a dict, where:
-            - `"state"` is PENDING, STARTED, RETRY, FAILURE or SUCCESS
-            - `"result"` is a string describing the progress as a percentage for a PENDING state,
-            or the error for a FAILURE state or the results for a SUCCESS state
+            df (DataFrame): Dataframe to clean
 
+        Returns:
+            DataFrame: Cleaned dataframe
         """
+        index_cols = sorted(
+            [col for col in df.columns if col.startswith("Unnamed")], reverse=True
+        )
+        if len(index_cols) > 0:
+            df.set_index(index_cols.pop(), inplace=True)
+            df.rename_axis(index=None, columns=None, inplace=True)
+        if len(index_cols) > 0:
+            df.drop(columns=index_cols, inplace=True)
+
+        return df
+
+    @staticmethod
+    def _read_csv_to_df(data: dict):
+        if data["state"] == RunStatus.SUCCESS.value:
+            data["result"] = OptimizationDataset._clean_df_index(
+                pd.read_csv(StringIO(data["result"]))
+            )
+        else:
+            pass
+
+    @staticmethod
+    def _check_run_by_id(run_id: str, retry=0) -> Generator[dict, None, None]:
         if retry > MAX_RETRIES:
             raise HirundoError("Max retries reached")
         last_event = None
@@ -269,7 +292,7 @@ class OptimizationDataset(BaseModel):
             client,
             "GET",
             f"{API_HOST}/dataset-optimization/run/{run_id}",
-            headers=
+            headers=get_auth_headers(),
         ):
             if sse.event == "ping":
                 continue
@@ -286,24 +309,125 @@ class OptimizationDataset(BaseModel):
             data = last_event["data"]
             OptimizationDataset._read_csv_to_df(data)
             yield data
-        if not last_event or last_event["data"]["state"] ==
-            OptimizationDataset.
+        if not last_event or last_event["data"]["state"] == RunStatus.PENDING.value:
+            OptimizationDataset._check_run_by_id(run_id, retry + 1)
+
+    @staticmethod
+    @overload
+    def check_run_by_id(
+        run_id: str, stop_on_manual_approval: typing.Literal[True]
+    ) -> typing.Optional[pd.DataFrame]:
+        ...
 
-
+    @staticmethod
+    @overload
+    def check_run_by_id(
+        run_id: str, stop_on_manual_approval: typing.Literal[False] = False
+    ) -> pd.DataFrame:
+        ...
+
+    @staticmethod
+    @overload
+    def check_run_by_id(
+        run_id: str, stop_on_manual_approval: bool
+    ) -> typing.Optional[pd.DataFrame]:
+        ...
+
+    @staticmethod
+    def check_run_by_id(
+        run_id: str, stop_on_manual_approval: bool = False
+    ) -> typing.Optional[pd.DataFrame]:
         """
-        Check the status of
+        Check the status of a run given its ID
 
-
+        Args:
+            run_id: The `run_id` produced by a `run_optimization` call
+            stop_on_manual_approval: If True, the function will return `None` if the run is awaiting manual approval
 
-
-
-
-
+        Returns:
+            A pandas DataFrame with the results of the optimization run
+
+        Raises:
+            HirundoError: If the maximum number of retries is reached or if the run fails
+        """
+        logger.debug("Checking run with ID: %s", run_id)
+        with logging_redirect_tqdm():
+            t = tqdm(total=100.0)
+            for iteration in OptimizationDataset._check_run_by_id(run_id):
+                if iteration["state"] == RunStatus.SUCCESS.value:
+                    t.set_description("Optimization run completed successfully")
+                    t.n = 100.0
+                    t.refresh()
+                    t.close()
+                    return iteration["result"]
+                elif iteration["state"] == RunStatus.PENDING.value:
+                    t.set_description("Optimization run queued and not yet started")
+                    t.n = 0.0
+                    t.refresh()
+                elif iteration["state"] == RunStatus.STARTED.value:
+                    t.set_description(
+                        "Optimization run in progress. Downloading dataset"
+                    )
+                    t.n = 0.0
+                    t.refresh()
+                elif iteration["state"] is None:
+                    if (
+                        iteration["result"]
+                        and isinstance(iteration["result"], dict)
+                        and iteration["result"]["result"]
+                        and isinstance(iteration["result"]["result"], str)
+                    ):
+                        current_progress_percentage = float(
+                            iteration["result"]["result"].removesuffix("% done")
+                        )
+                        desc = (
+                            "Optimization run completed. Uploading results"
+                            if current_progress_percentage == 100.0
+                            else "Optimization run in progress"
+                        )
+                        t.set_description(desc)
+                        t.n = current_progress_percentage
+                        t.refresh()
+                elif iteration["state"] == RunStatus.AWAITING_MANUAL_APPROVAL.value:
+                    t.set_description("Awaiting manual approval")
+                    t.n = 100.0
+                    t.refresh()
+                    if stop_on_manual_approval:
+                        t.close()
+                        return None
+                elif iteration["state"] == RunStatus.FAILURE.value:
+                    t.set_description("Optimization run failed")
+                    t.close()
+                    raise HirundoError(
+                        f"Optimization run failed with error: {iteration['result']}"
+                    )
+            raise HirundoError("Optimization run failed with an unknown error")
+
+    @overload
+    def check_run(
+        self, stop_on_manual_approval: typing.Literal[True]
+    ) -> typing.Union[pd.DataFrame, None]:
+        ...
+
+    @overload
+    def check_run(
+        self, stop_on_manual_approval: typing.Literal[False] = False
+    ) -> pd.DataFrame:
+        ...
+
+    def check_run(
+        self, stop_on_manual_approval: bool = False
+    ) -> typing.Union[pd.DataFrame, None]:
+        """
+        Check the status of the current active instance's run.
+
+        Returns:
+            A pandas DataFrame with the results of the optimization run
 
         """
         if not self.run_id:
             raise ValueError("No run has been started")
-        return self.check_run_by_id(self.run_id)
+        return self.check_run_by_id(self.run_id, stop_on_manual_approval)
 
     @staticmethod
     async def acheck_run_by_id(run_id: str, retry=0) -> AsyncGenerator[dict, None]:
@@ -324,6 +448,7 @@ class OptimizationDataset(BaseModel):
         - `"result"` is a string describing the progress as a percentage for a PENDING state, or the error for a FAILURE state or the results for a SUCCESS state
 
         """
+        logger.debug("Checking run with ID: %s", run_id)
         if retry > MAX_RETRIES:
             raise HirundoError("Max retries reached")
         last_event = None
@@ -334,7 +459,7 @@ class OptimizationDataset(BaseModel):
             client,
             "GET",
             f"{API_HOST}/dataset-optimization/run/{run_id}",
-            headers=
+            headers=get_auth_headers(),
         )
         async for sse in async_iterator:
             if sse.event == "ping":
@@ -348,7 +473,7 @@ class OptimizationDataset(BaseModel):
             )
             last_event = json.loads(sse.data)
             yield last_event["data"]
-        if not last_event or last_event["data"]["state"] ==
+        if not last_event or last_event["data"]["state"] == RunStatus.PENDING.value:
             OptimizationDataset.acheck_run_by_id(run_id, retry + 1)
 
     async def acheck_run(self) -> AsyncGenerator[dict, None]:
@@ -380,9 +505,10 @@ class OptimizationDataset(BaseModel):
         """
         if not run_id:
             raise ValueError("No run has been started")
+        logger.info("Cancelling run with ID: %s", run_id)
         response = requests.delete(
             f"{API_HOST}/dataset-optimization/run/{run_id}",
-            headers=
+            headers=get_auth_headers(),
             timeout=MODIFY_TIMEOUT,
         )
         response.raise_for_status()
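Taken together, these changes replace the bare SSE generator that `check_run_by_id` used to return with a blocking helper: the generator moves to the private `_check_run_by_id`, while the public method drives it, renders progress with tqdm, and returns the parsed results. A hedged usage sketch — the run ID is made up for illustration:

    # Hypothetical usage of the 0.1.7 API; "some-run-id" is a placeholder.
    from hirundo.dataset_optimization import OptimizationDataset

    # Blocks with a tqdm progress bar; returns a cleaned DataFrame on SUCCESS,
    # or None when stop_on_manual_approval=True and the run pauses for approval.
    result = OptimizationDataset.check_run_by_id(
        "some-run-id", stop_on_manual_approval=True
    )
    if result is None:
        print("Run is awaiting manual approval")
    else:
        print(result.head())  # unnamed index columns stripped by _clean_df_index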
{hirundo-0.1.6 → hirundo-0.1.7}/hirundo/git.py

@@ -1,4 +1,3 @@
-import logging
 import re
 from typing import Annotated, Union
 
@@ -8,10 +7,11 @@ from pydantic import BaseModel, field_validator
 from pydantic_core import Url
 
 from hirundo._env import API_HOST
-from hirundo._headers import
+from hirundo._headers import get_auth_headers, json_headers
 from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
+from hirundo.logger import get_logger
 
-logger =
+logger = get_logger(__name__)
 
 
 class GitPlainAuthBase(BaseModel):
@@ -108,7 +108,7 @@ class GitRepo(BaseModel):
             json=self.model_dump(),
             headers={
                 **json_headers,
-                **
+                **get_auth_headers(),
             },
             timeout=MODIFY_TIMEOUT,
         )
@@ -125,7 +125,7 @@ class GitRepo(BaseModel):
         git_repos = requests.get(
             f"{API_HOST}/git-repo/",
             headers={
-                **
+                **get_auth_headers(),
             },
             timeout=READ_TIMEOUT,
         )
@@ -143,7 +143,7 @@ class GitRepo(BaseModel):
         git_repo = requests.delete(
             f"{API_HOST}/git-repo/{git_repo_id}",
             headers={
-                **
+                **get_auth_headers(),
             },
             timeout=MODIFY_TIMEOUT,
         )
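The new `hirundo/logger.py` (+8 lines per the file list above) is not expanded in this diff, so its exact contents are unknown. Judging only from the call sites — `get_logger(__name__)` returning an object used with `.info`/`.debug` and `%s` formatting — a minimal stdlib-based helper of roughly this shape would satisfy them; this is a guess, not the actual file:

    # Hypothetical reconstruction; the real hirundo/logger.py is not shown here.
    import logging


    def get_logger(name: str) -> logging.Logger:
        # A module-scoped stdlib logger matches every usage in this diff.
        return logging.getLogger(name)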
{hirundo-0.1.6 → hirundo-0.1.7}/hirundo/storage.py

@@ -9,9 +9,12 @@ from pydantic_core import Url
 
 from hirundo._constraints import S3BucketUrl, StorageIntegrationName
 from hirundo._env import API_HOST
-from hirundo._headers import
+from hirundo._headers import get_auth_headers, json_headers
 from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
 from hirundo.git import GitRepo
+from hirundo.logger import get_logger
+
+logger = get_logger(__name__)
 
 
 class StorageS3(BaseModel):
@@ -69,6 +72,10 @@ class StorageTypes(str, Enum):
     GCP = "GCP"
     # AZURE = "Azure" TODO: Azure storage integration is coming soon
     GIT = "Git"
+    LOCAL = "Local"
+    """
+    Local storage integration is only supported for on-premises installations.
+    """
 
 
 class StorageIntegration(BaseModel):
@@ -84,7 +91,7 @@ class StorageIntegration(BaseModel):
     """
     A name to identify the `StorageIntegration` in the Hirundo system.
    """
-    type: StorageTypes = pydantic.Field(
+    type: typing.Optional[StorageTypes] = pydantic.Field(
         examples=[
             StorageTypes.S3,
             StorageTypes.GCP,
@@ -196,7 +203,7 @@ class StorageIntegration(BaseModel):
         storage_integrations = requests.get(
             f"{API_HOST}/storage-integration/",
             params={"storage_integration_organization_id": organization_id},
-            headers=
+            headers=get_auth_headers(),
             timeout=READ_TIMEOUT,
         )
         storage_integrations.raise_for_status()
@@ -212,10 +219,11 @@ class StorageIntegration(BaseModel):
         """
         storage_integration = requests.delete(
             f"{API_HOST}/storage-integration/{storage_integration_id}",
-            headers=
+            headers=get_auth_headers(),
             timeout=MODIFY_TIMEOUT,
         )
         storage_integration.raise_for_status()
+        logger.info("Deleted storage integration with ID: %s", storage_integration_id)
 
     def delete(self) -> None:
         """
@@ -236,15 +244,42 @@ class StorageIntegration(BaseModel):
             json=self.model_dump(),
             headers={
                 **json_headers,
-                **
+                **get_auth_headers(),
             },
             timeout=MODIFY_TIMEOUT,
         )
         storage_integration.raise_for_status()
         storage_integration_id = storage_integration.json()["id"]
         self.id = storage_integration_id
+        logger.info("Created storage integration with ID: %s", storage_integration_id)
         return storage_integration_id
 
+    @model_validator(mode="after")
+    def validate_storage_type(self):
+        if self.type != StorageTypes.LOCAL and (
+            [self.s3, self.gcp, self.git].count(None) != 2
+        ):
+            raise ValueError("Exactly one of S3, GCP, or Git must be provided")
+        if self.type == StorageTypes.S3 and self.s3 is None:
+            raise ValueError("S3 storage details must be provided")
+        elif self.type == StorageTypes.GCP and self.gcp is None:
+            raise ValueError("GCP storage details must be provided")
+        elif self.type == StorageTypes.GIT and self.git is None:
+            raise ValueError("Git storage details must be provided")
+        if not self.type and not any([self.s3, self.gcp, self.git]):
+            raise ValueError("Storage type must be provided")
+        elif not self.type:
+            self.type = (
+                StorageTypes.S3
+                if self.s3 is not None
+                else StorageTypes.GCP
+                if self.gcp is not None
+                else StorageTypes.GIT
+                if self.git is not None
+                else StorageTypes.LOCAL
+            )
+        return self
+
 
 class StorageLink(BaseModel):
     storage_integration: StorageIntegration
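With `type` now optional, the new `validate_storage_type` validator both enforces that exactly one backend's details are supplied and infers `type` when it is omitted. The same `model_validator(mode="after")` pattern in isolation, on a toy two-backend model rather than Hirundo's actual classes:

    # Standalone sketch of the after-validator pattern; Integration/Backend are toys.
    import typing
    from enum import Enum

    from pydantic import BaseModel, model_validator


    class Backend(str, Enum):
        S3 = "S3"
        GCP = "GCP"


    class Integration(BaseModel):
        type: typing.Optional[Backend] = None
        s3: typing.Optional[dict] = None
        gcp: typing.Optional[dict] = None

        @model_validator(mode="after")
        def infer_type(self):
            # Require exactly one backend config, then fill `type` in if omitted.
            if [self.s3, self.gcp].count(None) != 1:
                raise ValueError("Exactly one of S3 or GCP must be provided")
            if self.type is None:
                self.type = Backend.S3 if self.s3 is not None else Backend.GCP
            return self


    print(Integration(s3={"bucket": "example"}).type)  # Backend.S3 (inferred)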
{hirundo-0.1.6 → hirundo-0.1.7}/hirundo.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: hirundo
-Version: 0.1.6
+Version: 0.1.7
 Summary: This package is used to interface with Hirundo's platform. It provides a simple API to optimize your ML datasets.
 Author-email: Hirundo <dev@hirundo.io>
 License: MIT License
@@ -32,6 +32,7 @@ Requires-Dist: httpx>=0.27.0
 Requires-Dist: stamina>=24.2.0
 Requires-Dist: httpx-sse>=0.4.0
 Requires-Dist: pandas>=2.2.2
+Requires-Dist: tqdm>=4.66.5
 Provides-Extra: dev
 Requires-Dist: pyyaml>=6.0.1; extra == "dev"
 Requires-Dist: types-PyYAML>=6.0.12; extra == "dev"
@@ -7,7 +7,7 @@ packages = ["hirundo"]
|
|
|
7
7
|
|
|
8
8
|
[project]
|
|
9
9
|
name = "hirundo"
|
|
10
|
-
version = "0.1.
|
|
10
|
+
version = "0.1.7"
|
|
11
11
|
description = "This package is used to interface with Hirundo's platform. It provides a simple API to optimize your ML datasets."
|
|
12
12
|
authors = [{ name = "Hirundo", email = "dev@hirundo.io" }]
|
|
13
13
|
readme = "README.md"
|
|
@@ -36,6 +36,7 @@ dependencies = [
|
|
|
36
36
|
"stamina>=24.2.0",
|
|
37
37
|
"httpx-sse>=0.4.0",
|
|
38
38
|
"pandas>=2.2.2",
|
|
39
|
+
"tqdm>=4.66.5",
|
|
39
40
|
]
|
|
40
41
|
|
|
41
42
|
[project.scripts]
|
hirundo-0.1.6/hirundo/_env.py
DELETED

@@ -1,12 +0,0 @@
-import os
-
-from dotenv import load_dotenv
-
-load_dotenv()
-
-API_HOST = os.getenv("API_HOST", "https://api.hirundo.io")
-API_KEY = os.getenv("API_KEY")
-if not API_KEY:
-    raise ValueError(
-        "API_KEY is not set. Please run `hirundo setup` to set the API key"
-    )