hirundo 0.1.9__py3-none-any.whl → 0.1.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hirundo/__init__.py +13 -6
- hirundo/_dataframe.py +43 -0
- hirundo/_env.py +2 -2
- hirundo/_headers.py +18 -2
- hirundo/_timeouts.py +1 -0
- hirundo/cli.py +52 -0
- hirundo/dataset_optimization.py +31 -106
- hirundo/dataset_optimization_results.py +42 -0
- hirundo/git.py +11 -18
- hirundo/storage.py +13 -16
- hirundo/unzip.py +247 -0
- {hirundo-0.1.9.dist-info → hirundo-0.1.16.dist-info}/METADATA +9 -5
- hirundo-0.1.16.dist-info/RECORD +23 -0
- {hirundo-0.1.9.dist-info → hirundo-0.1.16.dist-info}/WHEEL +1 -1
- hirundo-0.1.9.dist-info/RECORD +0 -20
- /hirundo/{enum.py → dataset_enum.py} +0 -0
- {hirundo-0.1.9.dist-info → hirundo-0.1.16.dist-info}/entry_points.txt +0 -0
- {hirundo-0.1.9.dist-info → hirundo-0.1.16.dist-info/licenses}/LICENSE +0 -0
- {hirundo-0.1.9.dist-info → hirundo-0.1.16.dist-info}/top_level.txt +0 -0
hirundo/__init__.py
CHANGED
@@ -1,3 +1,7 @@
+from .dataset_enum import (
+    DatasetMetadataType,
+    LabelingType,
+)
 from .dataset_optimization import (
     COCO,
     YOLO,
@@ -7,11 +11,8 @@ from .dataset_optimization import (
     RunArgs,
     VisionRunArgs,
 )
-from .enum import (
-    DatasetMetadataType,
-    LabelingType,
-)
-from .git import GitRepo
+from .dataset_optimization_results import DatasetOptimizationResults
+from .git import GitPlainAuth, GitRepo, GitSSHAuth
 from .storage import (
     StorageConfig,
     StorageGCP,
@@ -20,6 +21,7 @@ from .storage import (
     StorageS3,
     StorageTypes,
 )
+from .unzip import load_df, load_from_zip
 
 __all__ = [
     "COCO",
@@ -31,13 +33,18 @@ __all__ = [
     "VisionRunArgs",
     "LabelingType",
     "DatasetMetadataType",
+    "GitPlainAuth",
     "GitRepo",
+    "GitSSHAuth",
     "StorageTypes",
     "StorageS3",
     "StorageGCP",
     # "StorageAzure", TODO: Azure storage is coming soon
     "StorageGit",
     "StorageConfig",
+    "DatasetOptimizationResults",
+    "load_df",
+    "load_from_zip",
 ]
 
-__version__ = "0.1.9"
+__version__ = "0.1.16"
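Taken together, the `__init__.py` changes widen the public surface: the results model, the concrete Git auth models and the zip-loading helpers are now importable from the package root. A minimal sketch (names taken from the `__all__` list above):

    from hirundo import (
        DatasetOptimizationResults,
        GitPlainAuth,
        GitRepo,
        GitSSHAuth,
        load_df,
        load_from_zip,
    )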
hirundo/_dataframe.py
ADDED
@@ -0,0 +1,43 @@
+has_pandas = False
+has_polars = False
+
+pd = None
+pl = None
+int32 = type[None]
+float32 = type[None]
+string = type[None]
+# ⬆️ These are just placeholders for the int32, float32 and string types
+# for when neither pandas nor polars are available
+
+try:
+    import numpy as np
+    import pandas as pd
+
+    has_pandas = True
+    int32 = np.int32
+    float32 = np.float32
+    string = str
+except ImportError:
+    pass
+
+try:
+    import polars as pl
+    import polars.datatypes as pl_datatypes
+
+    has_polars = True
+    int32 = pl_datatypes.Int32
+    float32 = pl_datatypes.Float32
+    string = pl_datatypes.String
+except ImportError:
+    pass
+
+
+__all__ = [
+    "has_polars",
+    "has_pandas",
+    "pd",
+    "pl",
+    "int32",
+    "float32",
+    "string",
+]
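`_dataframe.py` centralises optional DataFrame support: it probes for pandas and polars and exposes backend-appropriate `int32`/`float32`/`string` dtype aliases. A hedged sketch of how calling code can branch on whichever backend is installed (the CSV path is illustrative):

    from hirundo._dataframe import has_pandas, has_polars, pd, pl

    def read_csv_with_available_backend(path: str):
        # Prefer polars if installed, fall back to pandas, else report no backend
        if has_polars:
            return pl.read_csv(path)
        if has_pandas:
            return pd.read_csv(path)
        return None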
hirundo/_env.py
CHANGED
@@ -2,11 +2,11 @@ import enum
 import os
 from pathlib import Path
 
-from dotenv import load_dotenv
+from dotenv import find_dotenv, load_dotenv
 
 
 class EnvLocation(enum.Enum):
-    DOTENV = …
+    DOTENV = find_dotenv(".env")
     HOME = Path.home() / ".hirundo.conf"
 
 
hirundo/_headers.py
CHANGED
@@ -1,13 +1,29 @@
 from hirundo._env import API_KEY, check_api_key
 
-json_headers = {
+HIRUNDO_API_VERSION = "0.2"
+
+_json_headers = {
     "Content-Type": "application/json",
     "Accept": "application/json",
 }
 
 
-def get_auth_headers():
+def _get_auth_headers():
     check_api_key()
     return {
         "Authorization": f"Bearer {API_KEY}",
     }
+
+
+def _get_api_version_header():
+    return {
+        "HIRUNDO-API-VERSION": HIRUNDO_API_VERSION,
+    }
+
+
+def get_headers():
+    return {
+        **_json_headers,
+        **_get_auth_headers(),
+        **_get_api_version_header(),
+    }
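All request helpers now build their headers through `get_headers()`, which merges the JSON content headers, the bearer-token header and the new `HIRUNDO-API-VERSION` header. Assuming a configured API key, the composed dict looks roughly like this (the token value is a placeholder):

    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "Authorization": "Bearer <your API key>",
        "HIRUNDO-API-VERSION": "0.2",
    }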
hirundo/_timeouts.py
CHANGED
hirundo/cli.py
CHANGED
@@ -7,6 +7,8 @@ from typing import Annotated
 from urllib.parse import urlparse
 
 import typer
+from rich.console import Console
+from rich.table import Table
 
 from hirundo._env import API_HOST, EnvLocation
 
@@ -189,6 +191,56 @@ def setup(
     )
 
 
+@app.command("check-run", epilog=hirundo_epilog)
+def check_run(
+    run_id: str,
+):
+    """
+    Check the status of a run.
+    """
+    from hirundo.dataset_optimization import OptimizationDataset
+
+    results = OptimizationDataset.check_run_by_id(run_id)
+    print(f"Run results saved to {results.cached_zip_path}")
+
+
+@app.command("list-runs", epilog=hirundo_epilog)
+def list_runs():
+    """
+    List all runs available.
+    """
+    from hirundo.dataset_optimization import OptimizationDataset
+
+    runs = OptimizationDataset.list_runs()
+
+    console = Console()
+    table = Table(
+        title="Runs:",
+        expand=True,
+    )
+    cols = (
+        "Dataset name",
+        "Run ID",
+        "Status",
+        "Created At",
+        "Run Args",
+    )
+    for col in cols:
+        table.add_column(
+            col,
+            overflow="fold",
+        )
+    for run in runs:
+        table.add_row(
+            str(run.name),
+            str(run.id),
+            str(run.status),
+            run.created_at.isoformat(),
+            run.run_args.model_dump_json() if run.run_args else None,
+        )
+    console.print(table)
+
+
 typer_click_object = typer.main.get_command(app)
 
 if __name__ == "__main__":
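Both new CLI commands are thin wrappers over `OptimizationDataset`, so the equivalent calls are available from Python as well; a sketch (the run ID is a placeholder and an API key is assumed to be configured):

    from hirundo.dataset_optimization import OptimizationDataset

    runs = OptimizationDataset.list_runs()                     # backs `list-runs`
    results = OptimizationDataset.check_run_by_id("<run-id>")  # backs `check-run`
    print(results.cached_zip_path)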
hirundo/dataset_optimization.py
CHANGED
@@ -4,27 +4,25 @@ import typing
 from abc import ABC, abstractmethod
 from collections.abc import AsyncGenerator, Generator
 from enum import Enum
-from io import StringIO
 from typing import overload
 
 import httpx
-import numpy as np
-import pandas as pd
 import requests
-from pandas._typing import DtypeArg
 from pydantic import BaseModel, Field, model_validator
 from tqdm import tqdm
 from tqdm.contrib.logging import logging_redirect_tqdm
 
 from hirundo._constraints import HirundoUrl
 from hirundo._env import API_HOST
-from hirundo._headers import …
+from hirundo._headers import get_headers
 from hirundo._http import raise_for_status_with_reason
 from hirundo._iter_sse_retrying import aiter_sse_retrying, iter_sse_retrying
 from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
-from hirundo.enum import DatasetMetadataType, LabelingType
+from hirundo.dataset_enum import DatasetMetadataType, LabelingType
+from hirundo.dataset_optimization_results import DatasetOptimizationResults
 from hirundo.logger import get_logger
 from hirundo.storage import ResponseStorageConfig, StorageConfig
+from hirundo.unzip import download_and_extract_zip
 
 logger = get_logger(__name__)
 
@@ -73,39 +71,6 @@ STATUS_TO_PROGRESS_MAP = {
 }
 
 
-class DatasetOptimizationResults(BaseModel):
-    model_config = {"arbitrary_types_allowed": True}
-
-    suspects: pd.DataFrame
-    """
-    A pandas DataFrame containing the results of the optimization run
-    """
-    warnings_and_errors: pd.DataFrame
-    """
-    A pandas DataFrame containing the warnings and errors of the optimization run
-    """
-
-
-CUSTOMER_INTERCHANGE_DTYPES: DtypeArg = {
-    "image_path": str,
-    "label_path": str,
-    "segments_mask_path": str,
-    "segment_id": np.int32,
-    "label": str,
-    "bbox_id": str,
-    "xmin": np.float32,
-    "ymin": np.float32,
-    "xmax": np.float32,
-    "ymax": np.float32,
-    "suspect_level": np.float32,  # If exists, must be one of the values in the enum below
-    "suggested_label": str,
-    "suggested_label_conf": np.float32,
-    "status": str,
-    # ⬆️ If exists, must be one of the following:
-    # NO_LABELS/MISSING_IMAGE/INVALID_IMAGE/INVALID_BBOX/INVALID_BBOX_SIZE/INVALID_SEG/INVALID_SEG_SIZE
-}
-
-
 class Metadata(BaseModel, ABC):
     type: DatasetMetadataType
 
@@ -201,13 +166,14 @@ class VisionRunArgs(BaseModel):
 RunArgs = typing.Union[VisionRunArgs]
 
 
-class …
-…
-…
-…
-…
-…
-…
+class AugmentationName(str, Enum):
+    RANDOM_HORIZONTAL_FLIP = "RandomHorizontalFlip"
+    RANDOM_VERTICAL_FLIP = "RandomVerticalFlip"
+    RANDOM_ROTATION = "RandomRotation"
+    RANDOM_PERSPECTIVE = "RandomPerspective"
+    GAUSSIAN_NOISE = "GaussianNoise"
+    RANDOM_GRAYSCALE = "RandomGrayscale"
+    GAUSSIAN_BLUR = "GaussianBlur"
 
 
 class Modality(str, Enum):
@@ -264,7 +230,7 @@ class OptimizationDataset(BaseModel):
     """
     labeling_info: LabelingInfo
 
-    augmentations: typing.Optional[list[…
+    augmentations: typing.Optional[list[AugmentationName]] = None
     """
     Used to define which augmentations are apply to a vision dataset.
     For audio datasets, this field is ignored.
@@ -323,7 +289,7 @@ class OptimizationDataset(BaseModel):
         """
         response = requests.get(
            f"{API_HOST}/dataset-optimization/dataset/{dataset_id}",
-           headers=…
+           headers=get_headers(),
            timeout=READ_TIMEOUT,
        )
        raise_for_status_with_reason(response)
@@ -340,7 +306,7 @@ class OptimizationDataset(BaseModel):
         """
         response = requests.get(
            f"{API_HOST}/dataset-optimization/dataset/by-name/{name}",
-           headers=…
+           headers=get_headers(),
            timeout=READ_TIMEOUT,
        )
        raise_for_status_with_reason(response)
@@ -361,7 +327,7 @@ class OptimizationDataset(BaseModel):
         response = requests.get(
            f"{API_HOST}/dataset-optimization/dataset/",
            params={"dataset_organization_id": organization_id},
-           headers=…
+           headers=get_headers(),
            timeout=READ_TIMEOUT,
        )
        raise_for_status_with_reason(response)
@@ -388,7 +354,7 @@ class OptimizationDataset(BaseModel):
         response = requests.get(
            f"{API_HOST}/dataset-optimization/run/list",
            params={"dataset_organization_id": organization_id},
-           headers=…
+           headers=get_headers(),
            timeout=READ_TIMEOUT,
        )
        raise_for_status_with_reason(response)
@@ -410,7 +376,7 @@ class OptimizationDataset(BaseModel):
         """
         response = requests.delete(
            f"{API_HOST}/dataset-optimization/dataset/{dataset_id}",
-           headers=…
+           headers=get_headers(),
            timeout=MODIFY_TIMEOUT,
        )
        raise_for_status_with_reason(response)
@@ -482,10 +448,7 @@ class OptimizationDataset(BaseModel):
                "organization_id": organization_id,
                "replace_if_exists": replace_if_exists,
            },
-           headers={
-               **json_headers,
-               **get_auth_headers(),
-           },
+           headers=get_headers(),
            timeout=MODIFY_TIMEOUT,
        )
        raise_for_status_with_reason(dataset_response)
@@ -519,7 +482,7 @@ class OptimizationDataset(BaseModel):
         run_response = requests.post(
            f"{API_HOST}/dataset-optimization/run/{dataset_id}",
            json=run_info if len(run_info) > 0 else None,
-           headers=…
+           headers=get_headers(),
            timeout=MODIFY_TIMEOUT,
        )
        raise_for_status_with_reason(run_response)
@@ -595,46 +558,6 @@ class OptimizationDataset(BaseModel):
         self.id = None
         self.run_id = None
 
-    @staticmethod
-    def _clean_df_index(df: "pd.DataFrame") -> "pd.DataFrame":
-        """
-        Clean the index of a dataframe in case it has unnamed columns.
-
-        Args:
-            df (DataFrame): Dataframe to clean
-
-        Returns:
-            DataFrame: Cleaned dataframe
-        """
-        index_cols = sorted(
-            [col for col in df.columns if col.startswith("Unnamed")], reverse=True
-        )
-        if len(index_cols) > 0:
-            df.set_index(index_cols.pop(), inplace=True)
-            df.rename_axis(index=None, columns=None, inplace=True)
-        if len(index_cols) > 0:
-            df.drop(columns=index_cols, inplace=True)
-
-        return df
-
-    @staticmethod
-    def _read_csvs_to_df(data: dict):
-        if data["state"] == RunStatus.SUCCESS.value:
-            data["result"]["suspects"] = OptimizationDataset._clean_df_index(
-                pd.read_csv(
-                    StringIO(data["result"]["suspects"]),
-                    dtype=CUSTOMER_INTERCHANGE_DTYPES,
-                )
-            )
-            data["result"]["warnings_and_errors"] = OptimizationDataset._clean_df_index(
-                pd.read_csv(
-                    StringIO(data["result"]["warnings_and_errors"]),
-                    dtype=CUSTOMER_INTERCHANGE_DTYPES,
-                )
-            )
-        else:
-            pass
-
     @staticmethod
     def _check_run_by_id(run_id: str, retry=0) -> Generator[dict, None, None]:
         if retry > MAX_RETRIES:
@@ -645,7 +568,7 @@ class OptimizationDataset(BaseModel):
            client,
            "GET",
            f"{API_HOST}/dataset-optimization/run/{run_id}",
-           headers=…
+           headers=get_headers(),
        ):
            if sse.event == "ping":
                continue
@@ -668,7 +591,6 @@ class OptimizationDataset(BaseModel):
                    raise HirundoError(last_event["reason"])
                else:
                    raise HirundoError("Unknown error")
-           OptimizationDataset._read_csvs_to_df(data)
            yield data
        if not last_event or last_event["data"]["state"] == RunStatus.PENDING.value:
            OptimizationDataset._check_run_by_id(run_id, retry + 1)
@@ -727,11 +649,12 @@ class OptimizationDataset(BaseModel):
                    )
                elif iteration["state"] == RunStatus.SUCCESS.value:
                    t.close()
-…
-…
-…
-…
-…
+                   zip_temporary_url = iteration["result"]
+                   logger.debug("Optimization run completed. Downloading results")
+
+                   return download_and_extract_zip(
+                       run_id,
+                       zip_temporary_url,
                    )
                elif (
                    iteration["state"] == RunStatus.AWAITING_MANUAL_APPROVAL.value
@@ -823,7 +746,7 @@ class OptimizationDataset(BaseModel):
            client,
            "GET",
            f"{API_HOST}/dataset-optimization/run/{run_id}",
-           headers=…
+           headers=get_headers(),
        )
        async for sse in async_iterator:
            if sse.event == "ping":
@@ -872,7 +795,7 @@ class OptimizationDataset(BaseModel):
        logger.info("Cancelling run with ID: %s", run_id)
        response = requests.delete(
            f"{API_HOST}/dataset-optimization/run/{run_id}",
-           headers=…
+           headers=get_headers(),
            timeout=MODIFY_TIMEOUT,
        )
        raise_for_status_with_reason(response)
@@ -908,7 +831,9 @@ class DataOptimizationDatasetOut(BaseModel):
 class DataOptimizationRunOut(BaseModel):
     id: int
     name: str
+    dataset_id: int
     run_id: str
     status: RunStatus
     approved: bool
     created_at: datetime.datetime
+    run_args: typing.Optional[RunArgs]
hirundo/dataset_optimization_results.py
ADDED
@@ -0,0 +1,42 @@
+import typing
+from pathlib import Path
+
+from pydantic import BaseModel
+from typing_extensions import TypeAliasType
+
+from hirundo._dataframe import has_pandas, has_polars
+
+DataFrameType = TypeAliasType("DataFrameType", None)
+
+if has_pandas:
+    from hirundo._dataframe import pd
+
+    DataFrameType = TypeAliasType("DataFrameType", typing.Union[pd.DataFrame, None])
+if has_polars:
+    from hirundo._dataframe import pl
+
+    DataFrameType = TypeAliasType("DataFrameType", typing.Union[pl.DataFrame, None])
+
+
+T = typing.TypeVar("T")
+
+
+class DatasetOptimizationResults(BaseModel, typing.Generic[T]):
+    model_config = {"arbitrary_types_allowed": True}
+
+    cached_zip_path: Path
+    """
+    The path to the cached zip file of the results
+    """
+    suspects: T
+    """
+    A polars/pandas DataFrame containing the results of the optimization run
+    """
+    object_suspects: typing.Optional[T]
+    """
+    A polars/pandas DataFrame containing the object-level results of the optimization run
+    """
+    warnings_and_errors: T
+    """
+    A polars/pandas DataFrame containing the warnings and errors of the optimization run
+    """
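Because `DatasetOptimizationResults` is now generic over the DataFrame type, the same model is returned whether polars, pandas, or neither backend is installed. A hedged sketch of reading its fields (the run ID is a placeholder):

    from hirundo.dataset_optimization import OptimizationDataset

    results = OptimizationDataset.check_run_by_id("<run-id>")
    print(results.cached_zip_path)            # local path of the downloaded results zip
    suspects_df = results.suspects            # polars/pandas DataFrame, or None without a backend
    if results.object_suspects is not None:   # object-level suspects, e.g. for object-detection runs
        object_df = results.object_suspects
    errors_df = results.warnings_and_errors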
hirundo/git.py
CHANGED
@@ -9,7 +9,7 @@ from pydantic_core import Url
 
 from hirundo._constraints import RepoUrl
 from hirundo._env import API_HOST
-from hirundo._headers import …
+from hirundo._headers import get_headers
 from hirundo._http import raise_for_status_with_reason
 from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
 from hirundo.logger import get_logger
@@ -17,7 +17,7 @@ from hirundo.logger import get_logger
 logger = get_logger(__name__)
 
 
-class GitPlainAuthBase(BaseModel):
+class GitPlainAuth(BaseModel):
     username: str
     """
     The username for the Git repository
@@ -28,7 +28,7 @@ class GitPlainAuthBase(BaseModel):
     """
 
 
-class …
+class GitSSHAuth(BaseModel):
     ssh_key: str
     """
     The SSH key for the Git repository
@@ -52,7 +52,7 @@ class GitRepo(BaseModel):
     repository_url: typing.Union[str, RepoUrl]
     """
     The URL of the Git repository, it should start with `ssh://` or `https://` or be in the form `user@host:path`.
-    If it is in the form `user@host:path`, it will be rewritten to `ssh://user@host…
+    If it is in the form `user@host:path`, it will be rewritten to `ssh://user@host/path`.
     """
     organization_id: typing.Optional[int] = None
     """
@@ -60,14 +60,14 @@ class GitRepo(BaseModel):
     If not provided, it will be assigned to your default organization.
     """
 
-    plain_auth: typing.Optional[…
+    plain_auth: typing.Optional[GitPlainAuth] = pydantic.Field(
        default=None, examples=[None, {"username": "ben", "password": "password"}]
    )
    """
    The plain authentication details for the Git repository.
    Use this if using a special user with a username and password for authentication.
    """
-    ssh_auth: typing.Optional[…
+    ssh_auth: typing.Optional[GitSSHAuth] = pydantic.Field(
        default=None,
        examples=[
            {
@@ -124,10 +124,7 @@ class GitRepo(BaseModel):
                **self.model_dump(mode="json"),
                "replace_if_exists": replace_if_exists,
            },
-           headers={
-               **json_headers,
-               **get_auth_headers(),
-           },
+           headers=get_headers(),
            timeout=MODIFY_TIMEOUT,
        )
        raise_for_status_with_reason(git_repo)
@@ -145,7 +142,7 @@ class GitRepo(BaseModel):
        """
        git_repo = requests.get(
            f"{API_HOST}/git-repo/{git_repo_id}",
-           headers=…
+           headers=get_headers(),
            timeout=READ_TIMEOUT,
        )
        raise_for_status_with_reason(git_repo)
@@ -163,7 +160,7 @@ class GitRepo(BaseModel):
        """
        git_repo = requests.get(
            f"{API_HOST}/git-repo/by-name/{name}",
-           headers=…
+           headers=get_headers(),
            timeout=READ_TIMEOUT,
        )
        raise_for_status_with_reason(git_repo)
@@ -176,9 +173,7 @@ class GitRepo(BaseModel):
        """
        git_repos = requests.get(
            f"{API_HOST}/git-repo/",
-           headers={
-               **get_auth_headers(),
-           },
+           headers=get_headers(),
            timeout=READ_TIMEOUT,
        )
        raise_for_status_with_reason(git_repos)
@@ -200,9 +195,7 @@ class GitRepo(BaseModel):
        """
        git_repo = requests.delete(
            f"{API_HOST}/git-repo/{git_repo_id}",
-           headers={
-               **get_auth_headers(),
-           },
+           headers=get_headers(),
            timeout=MODIFY_TIMEOUT,
        )
        raise_for_status_with_reason(git_repo)
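With `GitPlainAuth` and `GitSSHAuth` exported, repository credentials are passed as the concrete models rather than the old `*Base` classes. A sketch built from the field examples shown in the diff (URL and credentials are placeholders):

    from hirundo import GitPlainAuth, GitRepo

    repo = GitRepo(
        name="my-dataset-repo",
        repository_url="https://example.com/my-org/my-dataset.git",
        plain_auth=GitPlainAuth(username="ben", password="password"),
    )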
hirundo/storage.py
CHANGED
@@ -9,7 +9,7 @@ from pydantic_core import Url
 
 from hirundo._constraints import S3BucketUrl, StorageConfigName
 from hirundo._env import API_HOST
-from hirundo._headers import …
+from hirundo._headers import get_headers
 from hirundo._http import raise_for_status_with_reason
 from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
 from hirundo.git import GitRepo, GitRepoOut
@@ -34,7 +34,7 @@ class StorageS3Base(BaseModel):
        Chains the bucket URL with the path, ensuring that the path is formatted correctly
 
        Args:
-…
+           path: The path to the file in the S3 bucket, e.g. `my-file.txt` or `/my-folder/my-file.txt`
 
        Returns:
            The full URL to the file in the S3 bucket, e.g. `s3://my-bucket/my-file.txt` or `s3://my-bucket/my-folder/my-file.txt`,
@@ -64,7 +64,7 @@ class StorageGCPBase(BaseModel):
        Chains the bucket URL with the path, ensuring that the path is formatted correctly
 
        Args:
-…
+           path: The path to the file in the GCP bucket, e.g. `my-file.txt` or `/my-folder/my-file.txt`
 
        Returns:
            The full URL to the file in the GCP bucket, e.g. `gs://my-bucket/my-file.txt` or `gs://my-bucket/my-folder/my-file.txt`,
@@ -94,7 +94,7 @@ class StorageGCPOut(StorageGCPBase):
    # Chains the container URL with the path, ensuring that the path is formatted correctly
 
    # Args:
-    #…
+    #     path: The path to the file in the Azure container, e.g. `my-file.txt` or `/my-folder/my-file.txt`
 
    # Returns:
    #     The full URL to the file in the Azure container
@@ -114,8 +114,8 @@ def get_git_repo_url(
    Chains the repository URL with the path, ensuring that the path is formatted correctly
 
    Args:
-…
-…
+       repo_url: The URL of the git repository, e.g. `https://my-git-repository.com`
+       path: The path to the file in the git repository, e.g. `my-file.txt` or `/my-folder/my-file.txt`
 
    Returns:
        The full URL to the file in the git repository, e.g. `https://my-git-repository.com/my-file.txt` or `https://my-git-repository.com/my-folder/my-file.txt`
@@ -156,7 +156,7 @@ class StorageGit(BaseModel):
        Chains the repository URL with the path, ensuring that the path is formatted correctly
 
        Args:
-…
+           path: The path to the file in the git repository, e.g. `my-file.txt` or `/my-folder/my-file.txt`
 
        Returns:
            The full URL to the file in the git repository, e.g. `https://my-git-repository.com/my-file.txt` or `https://my-git-repository.com/my-folder/my-file.txt`,
@@ -179,7 +179,7 @@ class StorageGitOut(BaseModel):
        Chains the repository URL with the path, ensuring that the path is formatted correctly
 
        Args:
-…
+           path: The path to the file in the git repository, e.g. `my-file.txt` or `/my-folder/my-file.txt`
 
        Returns:
            The full URL to the file in the git repository, e.g. `https://my-git-repository.com/my-file.txt` or `https://my-git-repository.com/my-folder/my-file.txt`,
@@ -330,7 +330,7 @@ class StorageConfig(BaseModel):
        """
        storage_config = requests.get(
            f"{API_HOST}/storage-config/{storage_config_id}",
-           headers=…
+           headers=get_headers(),
            timeout=READ_TIMEOUT,
        )
        raise_for_status_with_reason(storage_config)
@@ -349,7 +349,7 @@ class StorageConfig(BaseModel):
        """
        storage_config = requests.get(
            f"{API_HOST}/storage-config/by-name/{name}?storage_type={storage_type.value}",
-           headers=…
+           headers=get_headers(),
            timeout=READ_TIMEOUT,
        )
        raise_for_status_with_reason(storage_config)
@@ -370,7 +370,7 @@ class StorageConfig(BaseModel):
        storage_configs = requests.get(
            f"{API_HOST}/storage-config/",
            params={"storage_config_organization_id": organization_id},
-           headers=…
+           headers=get_headers(),
            timeout=READ_TIMEOUT,
        )
        raise_for_status_with_reason(storage_configs)
@@ -386,7 +386,7 @@ class StorageConfig(BaseModel):
        """
        storage_config = requests.delete(
            f"{API_HOST}/storage-config/{storage_config_id}",
-           headers=…
+           headers=get_headers(),
            timeout=MODIFY_TIMEOUT,
        )
        raise_for_status_with_reason(storage_config)
@@ -415,10 +415,7 @@ class StorageConfig(BaseModel):
                **self.model_dump(mode="json"),
                "replace_if_exists": replace_if_exists,
            },
-           headers={
-               **json_headers,
-               **get_auth_headers(),
-           },
+           headers=get_headers(),
            timeout=MODIFY_TIMEOUT,
        )
        raise_for_status_with_reason(storage_config)
hirundo/unzip.py
ADDED
@@ -0,0 +1,247 @@
+import typing
+import zipfile
+from collections.abc import Mapping
+from pathlib import Path
+from typing import IO, cast
+
+import requests
+from pydantic_core import Url
+
+from hirundo._dataframe import (
+    float32,
+    has_pandas,
+    has_polars,
+    int32,
+    pd,
+    pl,
+    string,
+)
+from hirundo._env import API_HOST
+from hirundo._headers import _get_auth_headers
+from hirundo._timeouts import DOWNLOAD_READ_TIMEOUT
+from hirundo.dataset_optimization_results import (
+    DataFrameType,
+    DatasetOptimizationResults,
+)
+from hirundo.logger import get_logger
+
+ZIP_FILE_CHUNK_SIZE = 50 * 1024 * 1024  # 50 MB
+
+Dtype = typing.Union[type[int32], type[float32], type[string]]
+
+
+CUSTOMER_INTERCHANGE_DTYPES: Mapping[str, Dtype] = {
+    "image_path": string,
+    "label_path": string,
+    "segments_mask_path": string,
+    "segment_id": int32,
+    "label": string,
+    "bbox_id": string,
+    "xmin": float32,
+    "ymin": float32,
+    "xmax": float32,
+    "ymax": float32,
+    "suspect_level": float32,  # If exists, must be one of the values in the enum below
+    "suggested_label": string,
+    "suggested_label_conf": float32,
+    "status": string,
+    # ⬆️ If exists, must be one of the following:
+    # NO_LABELS/MISSING_IMAGE/INVALID_IMAGE/INVALID_BBOX/INVALID_BBOX_SIZE/INVALID_SEG/INVALID_SEG_SIZE
+}
+
+logger = get_logger(__name__)
+
+
+def _clean_df_index(df: "pd.DataFrame") -> "pd.DataFrame":
+    """
+    Clean the index of a DataFrame in case it has unnamed columns.
+
+    Args:
+        df (DataFrame): DataFrame to clean
+
+    Returns:
+        Cleaned Pandas DataFrame
+    """
+    index_cols = sorted(
+        [col for col in df.columns if col.startswith("Unnamed")], reverse=True
+    )
+    if len(index_cols) > 0:
+        df.set_index(index_cols.pop(), inplace=True)
+        df.rename_axis(index=None, columns=None, inplace=True)
+    if len(index_cols) > 0:
+        df.drop(columns=index_cols, inplace=True)
+
+    return df
+
+
+def load_df(
+    file: "typing.Union[str, IO[bytes]]",
+) -> "DataFrameType":
+    """
+    Load a DataFrame from a CSV file.
+
+    Args:
+        file_name: The name of the CSV file to load.
+        dtypes: The data types of the columns in the DataFrame.
+
+    Returns:
+        The loaded DataFrame or `None` if neither Polars nor Pandas is available.
+    """
+    if has_polars:
+        return pl.read_csv(file, schema_overrides=CUSTOMER_INTERCHANGE_DTYPES)
+    elif has_pandas:
+        if typing.TYPE_CHECKING:
+            from pandas._typing import DtypeArg
+
+        dtype = cast("DtypeArg", CUSTOMER_INTERCHANGE_DTYPES)
+        # ⬆️ Casting since CUSTOMER_INTERCHANGE_DTYPES is a Mapping[str, Dtype] in this case
+        df = pd.read_csv(file, dtype=dtype)
+        return cast("DataFrameType", _clean_df_index(df))
+        # ⬆️ Casting since the return type is pd.DataFrame, but this is what DataFrameType is in this case
+    else:
+        return None
+
+
+def get_mislabel_suspect_filename(filenames: list[str]):
+    mislabel_suspect_filename = "mislabel_suspects.csv"
+    if mislabel_suspect_filename not in filenames:
+        mislabel_suspect_filename = "image_mislabel_suspects.csv"
+    if mislabel_suspect_filename not in filenames:
+        mislabel_suspect_filename = "suspects.csv"
+    if mislabel_suspect_filename not in filenames:
+        raise ValueError(
+            "None of mislabel_suspects.csv, image_mislabel_suspects.csv or suspects.csv were found in the zip file"
+        )
+    return mislabel_suspect_filename
+
+
+def download_and_extract_zip(
+    run_id: str, zip_url: str
+) -> DatasetOptimizationResults[DataFrameType]:
+    """
+    Download and extract the zip file from the given URL.
+
+    Note: It will only extract the `mislabel_suspects.csv` (vision - classification)
+    or `image_mislabel_suspects.csv` & `object_mislabel_suspects.csv` (vision - OD)
+    or `suspects.csv` (STT)
+    and `warnings_and_errors.csv` files from the zip file.
+
+    Args:
+        run_id: The ID of the optimization run.
+        zip_url: The URL of the zip file to download.
+
+    Returns:
+        The dataset optimization results object.
+    """
+    # Define the local file path
+    cache_dir = Path.home() / ".hirundo" / "cache"
+    cache_dir.mkdir(parents=True, exist_ok=True)
+    zip_file_path = cache_dir / f"{run_id}.zip"
+
+    headers = None
+    if Url(zip_url).scheme == "file":
+        zip_url = (
+            f"{API_HOST}/dataset-optimization/run/local-download"
+            + zip_url.replace("file://", "")
+        )
+        headers = _get_auth_headers()
+    # Stream the zip file download
+    with requests.get(
+        zip_url,
+        headers=headers,
+        timeout=DOWNLOAD_READ_TIMEOUT,
+        stream=True,
+    ) as r:
+        r.raise_for_status()
+        with open(zip_file_path, "wb") as f:
+            for chunk in r.iter_content(chunk_size=ZIP_FILE_CHUNK_SIZE):
+                f.write(chunk)
+    logger.info(
+        "Successfully downloaded the result zip file for run ID %s to %s",
+        run_id,
+        zip_file_path,
+    )
+
+    with zipfile.ZipFile(zip_file_path, "r") as z:
+        # Extract suspects file
+        suspects_df = None
+        object_suspects_df = None
+        warnings_and_errors_df = None
+
+        filenames = []
+        try:
+            filenames = [file.filename for file in z.filelist]
+        except Exception as e:
+            logger.error("Failed to get filenames from ZIP", exc_info=e)
+
+        try:
+            mislabel_suspect_filename = get_mislabel_suspect_filename(filenames)
+            with z.open(mislabel_suspect_filename) as suspects_file:
+                suspects_df = load_df(suspects_file)
+                logger.debug(
+                    "Successfully loaded mislabel suspects into DataFrame for run ID %s",
+                    run_id,
+                )
+        except Exception as e:
+            logger.error(
+                "Failed to load mislabel suspects into DataFrame", exc_info=e
+            )
+
+        object_mislabel_suspects_filename = "object_mislabel_suspects.csv"
+        if object_mislabel_suspects_filename in filenames:
+            try:
+                with z.open(
+                    object_mislabel_suspects_filename
+                ) as object_suspects_file:
+                    object_suspects_df = load_df(object_suspects_file)
+                    logger.debug(
+                        "Successfully loaded object mislabel suspects into DataFrame for run ID %s",
+                        run_id,
+                    )
+            except Exception as e:
+                logger.error(
+                    "Failed to load object mislabel suspects into DataFrame",
+                    exc_info=e,
+                )
+
+        try:
+            # Extract warnings_and_errors file
+            with z.open("warnings_and_errors.csv") as warnings_file:
+                warnings_and_errors_df = load_df(warnings_file)
+                logger.debug(
+                    "Successfully loaded warnings and errors into DataFrame for run ID %s",
+                    run_id,
+                )
+        except Exception as e:
+            logger.error(
+                "Failed to load warnings and errors into DataFrame", exc_info=e
+            )
+
+    return DatasetOptimizationResults[DataFrameType](
+        cached_zip_path=zip_file_path,
+        suspects=suspects_df,
+        object_suspects=object_suspects_df,
+        warnings_and_errors=warnings_and_errors_df,
+    )
+
+
+def load_from_zip(
+    zip_path: Path, file_name: str
+) -> "typing.Union[pd.DataFrame, pl.DataFrame, None]":
+    """
+    Load a given file from a given zip file.
+
+    Args:
+        zip_path: The path to the zip file.
+        file_name: The name of the file to load.
+
+    Returns:
+        The loaded DataFrame or `None` if neither Polars nor Pandas is available.
+    """
+    with zipfile.ZipFile(zip_path, "r") as z:
+        try:
+            with z.open(file_name) as file:
+                return load_df(file)
+        except Exception as e:
+            logger.error("Failed to load %s from zip file", file_name, exc_info=e)
+            return None
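`load_from_zip` lets you re-read any CSV from the cached results archive after a run has been fetched; a minimal sketch (the run ID in the path is a placeholder, and the cache location matches the one used by `download_and_extract_zip` above):

    from pathlib import Path
    from hirundo.unzip import load_from_zip

    zip_path = Path.home() / ".hirundo" / "cache" / "<run-id>.zip"
    df = load_from_zip(zip_path, "warnings_and_errors.csv")
    if df is None:
        print("Neither polars nor pandas is installed, so no DataFrame could be loaded")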
{hirundo-0.1.9.dist-info → hirundo-0.1.16.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.…
+Metadata-Version: 2.4
 Name: hirundo
-Version: 0.1.9
+Version: 0.1.16
 Summary: This package is used to interface with Hirundo's platform. It provides a simple API to optimize your ML datasets.
 Author-email: Hirundo <dev@hirundo.io>
 License: MIT License
@@ -31,7 +31,6 @@ Requires-Dist: typer>=0.12.3
 Requires-Dist: httpx>=0.27.0
 Requires-Dist: stamina>=24.2.0
 Requires-Dist: httpx-sse>=0.4.0
-Requires-Dist: pandas>=2.2.2
 Requires-Dist: tqdm>=4.66.5
 Provides-Extra: dev
 Requires-Dist: pyyaml>=6.0.1; extra == "dev"
@@ -50,7 +49,7 @@ Requires-Dist: pytest-asyncio>=0.23.6; extra == "dev"
 Requires-Dist: uv>=0.5.8; extra == "dev"
 Requires-Dist: pre-commit>=3.7.1; extra == "dev"
 Requires-Dist: virtualenv>=20.6.6; extra == "dev"
-Requires-Dist: ruff>=0.…
+Requires-Dist: ruff>=0.11.6; extra == "dev"
 Requires-Dist: bumpver; extra == "dev"
 Requires-Dist: platformdirs>=4.3.6; extra == "dev"
 Requires-Dist: safety>=3.2.13; extra == "dev"
@@ -64,6 +63,11 @@ Requires-Dist: sphinx-multiversion; extra == "docs"
 Requires-Dist: esbonio; extra == "docs"
 Requires-Dist: starlette>0.40.0; extra == "docs"
 Requires-Dist: markupsafe>=3.0.2; extra == "docs"
+Provides-Extra: pandas
+Requires-Dist: pandas>=2.2.2; extra == "pandas"
+Provides-Extra: polars
+Requires-Dist: polars>=1.0.0; extra == "polars"
+Dynamic: license-file
 
 # Hirundo
 
@@ -165,7 +169,7 @@ from hirundo import (
 git_storage = StorageGit(
     repo=GitRepo(
         name="BDD-100k-validation-dataset",
-        repository_url="https://…
+        repository_url="https://huggingface.co/datasets/hirundo-io/bdd100k-validation-only",
     ),
     branch="main",
 )
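Packaging note: pandas is no longer an unconditional dependency; pandas and polars are now opt-in extras. Users who want results loaded as DataFrames would install the package with one of those extras enabled (e.g. something like `pip install "hirundo[polars]"`), while a bare install leaves `load_df`/`load_from_zip` returning `None`.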
hirundo-0.1.16.dist-info/RECORD
ADDED
@@ -0,0 +1,23 @@
+hirundo/__init__.py,sha256=qKC89bNReZSjGtmf7l3PZD2JoptyVphpsD0Kf2PNXvY,1035
+hirundo/__main__.py,sha256=wcCrL4PjG51r5wVKqJhcoJPTLfHW0wNbD31DrUN0MWI,28
+hirundo/_constraints.py,sha256=gRv7fXwtjPGqYWIhkVYxu1B__3PdlYRqFyDkTpa9f74,1032
+hirundo/_dataframe.py,sha256=sXEEbCNcLi83wyU9ii884YikCzfASo_3nnrDxhuCv7U,758
+hirundo/_env.py,sha256=efX2sjvYlHkFr2Lcstelei67YSTFpVGT0l08ZsfiMuE,622
+hirundo/_headers.py,sha256=3hybpD_X4SODv3cFZPt9AjGY2vvZaag5OKT3z1SHSjA,521
+hirundo/_http.py,sha256=izlnuxStyPugjTAbD8Lo30tA4lZJ5d3kOENNduqrbX4,573
+hirundo/_iter_sse_retrying.py,sha256=U331_wZRIbVzi-jnMqo8bp9jBC8MtFBLEs-X0ZvhSDw,4634
+hirundo/_timeouts.py,sha256=gE58NU0t2e4KgKq2sk5rZcezDJAkgvRIbM5AVYFY6Ho,86
+hirundo/cli.py,sha256=5Tn0eXZGG92BR9HJYUaYozjFbS1t6UTw_I2R0tZBE04,7824
+hirundo/dataset_enum.py,sha256=ZEYBP-lrlVqfNWptlmw7JgLNhCyDirtWWPtoMvtg2AE,531
+hirundo/dataset_optimization.py,sha256=jR4ZOlKKl05jrA4cq9L1IQuKVPJ3ytXkhOJEg6efFqI,31390
+hirundo/dataset_optimization_results.py,sha256=A9YyF5zaZXVtzeDE08I_05v90dhZQADpSjDcS_6eLMc,1129
+hirundo/git.py,sha256=6h1hFPlw5FfYMGWXPCitnTqGICmBKmQtb5qKGe3Icmk,6580
+hirundo/logger.py,sha256=MUqrYp0fBlxWFhGl6P5t19_uqO7T_PNhrLN5bqY3i7s,275
+hirundo/storage.py,sha256=kO-LWlQAM3qTnALEl8s79AiFMYqCG9Sem4MIFQcyvAg,15950
+hirundo/unzip.py,sha256=XJqvt2m5pWR-G-fnzgW75VOdd-K4_Rw2r4wiEhZgKZA,8245
+hirundo-0.1.16.dist-info/licenses/LICENSE,sha256=fusGGjqT2RGlU6kbkaOk7d-gDnsjk17wq67AO0mwBZI,1065
+hirundo-0.1.16.dist-info/METADATA,sha256=CxdCbzafRuVRf1BGsS_tgjodO0g745uuNBl7y4UFMj8,8501
+hirundo-0.1.16.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
+hirundo-0.1.16.dist-info/entry_points.txt,sha256=4ZtnA_Nl1Af8fLnHp3lwjbGDEGU1S6ujb_JwtuQ7ZPM,44
+hirundo-0.1.16.dist-info/top_level.txt,sha256=cmyNqrNZOAYxnywJGFI1AJBLe4SkH8HGsfFx6ncdrbI,8
+hirundo-0.1.16.dist-info/RECORD,,
hirundo-0.1.9.dist-info/RECORD
DELETED
@@ -1,20 +0,0 @@
-hirundo/__init__.py,sha256=U_wcm3e0r1T66OQ7KHlWaOiwlPxf6e4RkTxA5uvaOOA,781
-hirundo/__main__.py,sha256=wcCrL4PjG51r5wVKqJhcoJPTLfHW0wNbD31DrUN0MWI,28
-hirundo/_constraints.py,sha256=gRv7fXwtjPGqYWIhkVYxu1B__3PdlYRqFyDkTpa9f74,1032
-hirundo/_env.py,sha256=dXUFPeEL1zPe-eBdWD4_WZvlgiY2cpWuVDzf41Qjuto,609
-hirundo/_headers.py,sha256=ggTyBwVT3nGyPidCcmYMX6pv0idzMxCI2S1BJQE-Bbs,253
-hirundo/_http.py,sha256=izlnuxStyPugjTAbD8Lo30tA4lZJ5d3kOENNduqrbX4,573
-hirundo/_iter_sse_retrying.py,sha256=U331_wZRIbVzi-jnMqo8bp9jBC8MtFBLEs-X0ZvhSDw,4634
-hirundo/_timeouts.py,sha256=IfX8-mrLp809-A_xSLv1DhIqZnO-Qvy4FcTtOtvqLog,42
-hirundo/cli.py,sha256=4-pdV483zqRJl8d-R9p_9YOGlehOnoMJzb3XAAdPRb0,6634
-hirundo/dataset_optimization.py,sha256=CuSrauzXiSa4kGBREao3nn-vmLVwMKTeHM7yEXesuso,33756
-hirundo/enum.py,sha256=ZEYBP-lrlVqfNWptlmw7JgLNhCyDirtWWPtoMvtg2AE,531
-hirundo/git.py,sha256=zzpEHGqoQXwOBQzNSmyf5lpUMc2FbomPqiokwMc4M8o,6777
-hirundo/logger.py,sha256=MUqrYp0fBlxWFhGl6P5t19_uqO7T_PNhrLN5bqY3i7s,275
-hirundo/storage.py,sha256=RsEmtbn79_iCY7pE1AKcBoAEqzXNkOc_UPUTaxSE0BM,16075
-hirundo-0.1.9.dist-info/LICENSE,sha256=fusGGjqT2RGlU6kbkaOk7d-gDnsjk17wq67AO0mwBZI,1065
-hirundo-0.1.9.dist-info/METADATA,sha256=8jjs7OGtVZZwFmyfdFGoTxC-de-1V6OLFJW26pYOB2E,8363
-hirundo-0.1.9.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-hirundo-0.1.9.dist-info/entry_points.txt,sha256=4ZtnA_Nl1Af8fLnHp3lwjbGDEGU1S6ujb_JwtuQ7ZPM,44
-hirundo-0.1.9.dist-info/top_level.txt,sha256=cmyNqrNZOAYxnywJGFI1AJBLe4SkH8HGsfFx6ncdrbI,8
-hirundo-0.1.9.dist-info/RECORD,,
Files without content changes (renamed or relocated only):
- hirundo/{enum.py → dataset_enum.py}
- {hirundo-0.1.9.dist-info → hirundo-0.1.16.dist-info}/entry_points.txt
- {hirundo-0.1.9.dist-info → hirundo-0.1.16.dist-info/licenses}/LICENSE
- {hirundo-0.1.9.dist-info → hirundo-0.1.16.dist-info}/top_level.txt