hirundo 0.1.21__py3-none-any.whl → 0.2.3.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hirundo/__init__.py CHANGED
@@ -5,8 +5,8 @@ from .dataset_enum import (
 )
 from .dataset_qa import (
     ClassificationRunArgs,
-    Domain,
     HirundoError,
+    ModalityType,
     ObjectDetectionRunArgs,
     QADataset,
     RunArgs,
@@ -30,6 +30,15 @@ from .storage import (
     StorageGit,
     StorageS3,
 )
+from .unlearning_llm import (
+    BiasRunInfo,
+    BiasType,
+    HuggingFaceTransformersModel,
+    LlmModel,
+    LlmSources,
+    LlmUnlearningRun,
+    LocalTransformersModel,
+)
 from .unzip import load_df, load_from_zip
 
 __all__ = [
@@ -43,7 +52,7 @@ __all__ = [
     "KeylabsObjSegImages",
     "KeylabsObjSegVideo",
     "QADataset",
-    "Domain",
+    "ModalityType",
     "RunArgs",
     "ClassificationRunArgs",
     "ObjectDetectionRunArgs",
@@ -59,8 +68,15 @@ __all__ = [
     "StorageGit",
     "StorageConfig",
     "DatasetQAResults",
+    "BiasRunInfo",
+    "BiasType",
+    "HuggingFaceTransformersModel",
+    "LlmModel",
+    "LlmSources",
+    "LlmUnlearningRun",
+    "LocalTransformersModel",
     "load_df",
     "load_from_zip",
 ]
 
-__version__ = "0.1.21"
+__version__ = "0.2.3.post1"
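The headline changes: the `Domain` enum is renamed to `ModalityType`, the new LLM-unlearning API is re-exported at the package root, and the version bumps to 0.2.3.post1. A minimal import sketch of the new surface (names taken from `__all__` above; constructor signatures are not part of this diff):

```python
from hirundo import (
    BiasRunInfo,
    BiasType,
    HuggingFaceTransformersModel,
    LlmModel,
    LlmSources,
    LlmUnlearningRun,
    LocalTransformersModel,
    ModalityType,  # replaces the removed Domain enum
)
```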
hirundo/_constraints.py CHANGED
@@ -1,5 +1,4 @@
 import re
-import typing
 from typing import TYPE_CHECKING
 
 from hirundo._urls import (
@@ -135,8 +134,8 @@ def validate_labeling_type(
 
 def validate_labeling_info(
     labeling_type: "LabelingType",
-    labeling_info: "typing.Union[LabelingInfo, list[LabelingInfo]]",
-    storage_config: "typing.Union[StorageConfig, ResponseStorageConfig]",
+    labeling_info: "LabelingInfo | list[LabelingInfo]",
+    storage_config: "StorageConfig | ResponseStorageConfig",
 ) -> None:
     """
     Validate the labeling info for a dataset
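This file, like `cli.py` and the SSE module below, drops `typing.Union`/`typing.Optional` in favor of PEP 604 `X | Y` unions, which also work inside the quoted (string) annotations used here. A small illustrative sketch (not from the package):

```python
# PEP 604 unions (Python 3.10+) replace typing.Union / typing.Optional;
# quoted annotations defer evaluation, so forward references still work.
def describe(value: "int | str | None" = None) -> str:
    return "empty" if value is None else str(value)
```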
hirundo/_iter_sse_retrying.py CHANGED
@@ -1,6 +1,5 @@
 import asyncio
 import time
-import typing
 import uuid
 from collections.abc import AsyncGenerator, Generator
 
@@ -15,13 +14,15 @@ from hirundo.logger import get_logger
 
 logger = get_logger(__name__)
 
+MAX_RETRIES = 50
+
 
 # Credit: https://github.com/florimondmanca/httpx-sse/blob/master/README.md#handling-reconnections
 def iter_sse_retrying(
     client: httpx.Client,
     method: str,
     url: str,
-    headers: typing.Optional[dict[str, str]] = None,
+    headers: dict[str, str] | None = None,
 ) -> Generator[ServerSentEvent, None, None]:
     if headers is None:
         headers = {}
@@ -41,7 +42,8 @@ def iter_sse_retrying(
             httpx.ReadError,
             httpx.RemoteProtocolError,
             urllib3.exceptions.ReadTimeoutError,
-        )
+        ),
+        attempts=MAX_RETRIES,
     )
     def _iter_sse():
         nonlocal last_event_id, reconnection_delay
@@ -105,7 +107,8 @@ async def aiter_sse_retrying(
             httpx.ReadError,
             httpx.RemoteProtocolError,
             urllib3.exceptions.ReadTimeoutError,
-        )
+        ),
+        attempts=MAX_RETRIES,
     )
     async def _iter_sse() -> AsyncGenerator[ServerSentEvent, None]:
         nonlocal last_event_id, reconnection_delay
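The functional change here: the retry decorator wrapping both SSE readers now receives `attempts=MAX_RETRIES`, capping reconnection attempts at 50 instead of retrying indefinitely. The decorator itself sits outside these hunks, so its library is not shown; a hand-rolled sketch of the same bounded-retry shape, for illustration only:

```python
import functools
import time


def retry(on: tuple[type[Exception], ...], attempts: int):
    """Illustrative bounded-retry decorator: retry on the given
    exceptions, re-raise after `attempts` tries."""
    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            for attempt in range(1, attempts + 1):
                try:
                    return fn(*args, **kwargs)
                except on:
                    if attempt == attempts:
                        raise
                    time.sleep(min(2**attempt, 30))  # capped backoff
        return wrapper
    return decorator
```

The real decorator also has to cope with the wrapped function being a generator, which this sketch does not attempt.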
@@ -0,0 +1,153 @@
+import importlib.util
+import tempfile
+import zipfile
+from pathlib import Path
+from typing import TYPE_CHECKING, cast
+
+from hirundo import HirundoError
+from hirundo._http import requests
+from hirundo._timeouts import DOWNLOAD_READ_TIMEOUT
+from hirundo.logger import get_logger
+
+if TYPE_CHECKING:
+    from torch import device as torch_device
+    from transformers.configuration_utils import PretrainedConfig
+    from transformers.modeling_utils import PreTrainedModel
+    from transformers.pipelines.base import Pipeline
+
+    from hirundo.unlearning_llm import LlmModel, LlmModelOut
+
+logger = get_logger(__name__)
+
+
+ZIP_FILE_CHUNK_SIZE = 50 * 1024 * 1024  # 50 MB
+REQUIRED_PACKAGES_FOR_PIPELINE = ["peft", "transformers", "accelerate"]
+
+
+def get_hf_pipeline_for_run_given_model(
+    llm: "LlmModel | LlmModelOut",
+    run_id: str,
+    config: "PretrainedConfig | None" = None,
+    device: "str | int | torch_device | None" = None,
+    device_map: str | dict[str, int | str] | None = None,
+    trust_remote_code: bool = False,
+    token: str | None = None,
+) -> "Pipeline":
+    for package in REQUIRED_PACKAGES_FOR_PIPELINE:
+        if importlib.util.find_spec(package) is None:
+            raise HirundoError(
+                f'{package} is not installed. Please install transformers extra with pip install "hirundo[transformers]"'
+            )
+    from peft import PeftModel
+    from transformers.models.auto.configuration_auto import AutoConfig
+    from transformers.models.auto.modeling_auto import (
+        MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES,
+        AutoModelForCausalLM,
+        AutoModelForImageTextToText,
+    )
+    from transformers.models.auto.tokenization_auto import AutoTokenizer
+    from transformers.pipelines import pipeline
+
+    from hirundo.unlearning_llm import (
+        HuggingFaceTransformersModel,
+        HuggingFaceTransformersModelOutput,
+        LlmUnlearningRun,
+    )
+
+    run_results = LlmUnlearningRun.check_run_by_id(run_id)
+    if run_results is None:
+        raise HirundoError("No run results found")
+    result_payload = (
+        run_results.get("result", run_results)
+        if isinstance(run_results, dict)
+        else run_results
+    )
+    if isinstance(result_payload, dict):
+        result_url = result_payload.get("result")
+    else:
+        result_url = result_payload
+    if not isinstance(result_url, str):
+        raise HirundoError("Run results did not include a download URL")
+    # Stream the zip file download
+
+    zip_file_path = tempfile.NamedTemporaryFile(delete=False).name
+    with requests.get(
+        result_url,
+        timeout=DOWNLOAD_READ_TIMEOUT,
+        stream=True,
+    ) as r:
+        r.raise_for_status()
+        with open(zip_file_path, "wb") as zip_file:
+            for chunk in r.iter_content(chunk_size=ZIP_FILE_CHUNK_SIZE):
+                zip_file.write(chunk)
+    logger.info(
+        "Successfully downloaded the result zip file for run ID %s to %s",
+        run_id,
+        zip_file_path,
+    )
+
+    with tempfile.TemporaryDirectory() as temp_dir:
+        temp_dir_path = Path(temp_dir)
+        with zipfile.ZipFile(zip_file_path, "r") as zip_file:
+            zip_file.extractall(temp_dir_path)
+        # Attempt to load the tokenizer normally
+        base_model_name = (
+            llm.model_source.model_name
+            if isinstance(
+                llm.model_source,
+                HuggingFaceTransformersModel | HuggingFaceTransformersModelOutput,
+            )
+            else llm.model_source.local_path
+        )
+        token = (
+            llm.model_source.token
+            if isinstance(
+                llm.model_source,
+                HuggingFaceTransformersModel,
+            )
+            else token
+        )
+        tokenizer = AutoTokenizer.from_pretrained(
+            base_model_name,
+            token=token,
+            trust_remote_code=trust_remote_code,
+        )
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+        config = AutoConfig.from_pretrained(
+            base_model_name,
+            token=token,
+            trust_remote_code=trust_remote_code,
+        )
+        config_dict = config.to_dict() if hasattr(config, "to_dict") else config
+        is_multimodal = (
+            config_dict.get("model_type")
+            in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.keys()
+        )
+        if is_multimodal:
+            base_model = AutoModelForImageTextToText.from_pretrained(
+                base_model_name,
+                token=token,
+                trust_remote_code=trust_remote_code,
+            )
+        else:
+            base_model = AutoModelForCausalLM.from_pretrained(
+                base_model_name,
+                token=token,
+                trust_remote_code=trust_remote_code,
+            )
+        model = cast(
+            "PreTrainedModel",
+            PeftModel.from_pretrained(
+                base_model, str(temp_dir_path / "unlearned_model_folder")
+            ),
+        )
+
+        return pipeline(
+            task="text-generation",
+            model=model,
+            tokenizer=tokenizer,
+            config=config,
+            device=device,
+            device_map=device_map,
+        )
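This new module's `get_hf_pipeline_for_run_given_model` downloads a finished unlearning run's result zip, extracts the PEFT adapter, applies it to the base model via `PeftModel.from_pretrained`, and returns a ready `transformers` pipeline (image-text-to-text models are detected from the config's `model_type`). A hypothetical usage sketch; `my_llm` and the run ID are placeholders, and the `hirundo[transformers]` extra must be installed:

```python
# my_llm: an LlmModel describing the base model (placeholder).
pipe = get_hf_pipeline_for_run_given_model(
    llm=my_llm,
    run_id="run-id-placeholder",
    device_map="auto",
)
print(pipe("Hello!", max_new_tokens=32)[0]["generated_text"])
```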
@@ -0,0 +1,283 @@
+import json
+from collections.abc import AsyncGenerator, Generator
+from enum import Enum
+
+import httpx
+from tqdm import tqdm
+
+from hirundo._iter_sse_retrying import aiter_sse_retrying, iter_sse_retrying
+from hirundo.logger import get_logger
+
+_logger = get_logger(__name__)
+
+DEFAULT_MAX_RETRIES = 200
+
+
+class RunStatus(Enum):
+    PENDING = "PENDING"
+    STARTED = "STARTED"
+    SUCCESS = "SUCCESS"
+    FAILURE = "FAILURE"
+    AWAITING_MANUAL_APPROVAL = "AWAITING MANUAL APPROVAL"
+    REVOKED = "REVOKED"
+    REJECTED = "REJECTED"
+    RETRY = "RETRY"
+
+
+STATUS_TO_PROGRESS_MAP = {
+    RunStatus.STARTED.value: 0.0,
+    RunStatus.PENDING.value: 0.0,
+    RunStatus.SUCCESS.value: 100.0,
+    RunStatus.FAILURE.value: 100.0,
+    RunStatus.AWAITING_MANUAL_APPROVAL.value: 100.0,
+    RunStatus.RETRY.value: 0.0,
+    RunStatus.REVOKED.value: 100.0,
+    RunStatus.REJECTED.value: 0.0,
+}
+
+
+def build_status_text_map(
+    run_label: str, *, started_detail: str | None = None
+) -> dict[str, str]:
+    """
+    Build a status->text mapping for a given run label.
+
+    Args:
+        run_label: Human-readable label used in status text.
+        started_detail: Optional override for the STARTED status text.
+
+    Returns:
+        Mapping of run state values to user-facing status text.
+    """
+    started_text = started_detail or f"{run_label} run in progress"
+    return {
+        RunStatus.STARTED.value: started_text,
+        RunStatus.PENDING.value: f"{run_label} run queued and not yet started",
+        RunStatus.SUCCESS.value: f"{run_label} run completed successfully",
+        RunStatus.FAILURE.value: f"{run_label} run failed",
+        RunStatus.AWAITING_MANUAL_APPROVAL.value: "Awaiting manual approval",
+        RunStatus.RETRY.value: f"{run_label} run failed. Retrying",
+        RunStatus.REVOKED.value: f"{run_label} run was cancelled",
+        RunStatus.REJECTED.value: f"{run_label} run was rejected",
+    }
+
+
+def get_state(payload: dict, status_keys: tuple[str, ...]) -> str | None:
+    """
+    Return the first non-null state value from a payload using a list of keys.
+
+    Args:
+        payload: Run payload containing state/status information.
+        status_keys: Ordered keys to search for state values.
+
+    Returns:
+        The first non-null state value, or None if none are present.
+    """
+    for key in status_keys:
+        value = payload.get(key)
+        if value is not None:
+            return value
+    return None
+
+
+def _extract_event_data(event: dict, error_cls: type[Exception]) -> dict:
+    if "data" in event:
+        return event["data"]
+    if "detail" in event:
+        raise error_cls(event["detail"])
+    if "reason" in event:
+        raise error_cls(event["reason"])
+    raise error_cls("Unknown error")
+
+
+def _should_retry_after_stream(
+    last_event: dict | None,
+    status_keys: tuple[str, ...],
+    pending_state_value: str,
+) -> bool:
+    if not last_event:
+        return True
+    data = last_event.get("data")
+    if data is None:
+        return False
+    last_state = get_state(data, status_keys)
+    return last_state == pending_state_value
+
+
+def iter_run_events(
+    url: str,
+    *,
+    headers: dict[str, str] | None = None,
+    retry: int = 0,
+    max_retries: int = DEFAULT_MAX_RETRIES,
+    pending_state_value: str = RunStatus.PENDING.value,
+    status_keys: tuple[str, ...] = ("state",),
+    error_cls: type[Exception] = RuntimeError,
+    log=_logger,
+) -> Generator[dict, None, None]:
+    """
+    Stream run events from an SSE endpoint with retries.
+
+    Args:
+        url: SSE endpoint URL.
+        headers: Optional HTTP headers.
+        retry: Internal retry counter (do not set manually).
+        max_retries: Maximum number of retry attempts.
+        pending_state_value: State value that triggers a re-check loop.
+        status_keys: Payload keys to search for the run state.
+        error_cls: Exception type to raise on errors.
+        log: Logger instance for debug output.
+
+    Yields:
+        Event payloads decoded from the SSE data field.
+    """
+    while True:
+        if retry > max_retries:
+            raise error_cls("Max retries reached")
+        last_event = None
+        with httpx.Client(timeout=httpx.Timeout(None, connect=5.0)) as client:
+            for sse in iter_sse_retrying(
+                client,
+                "GET",
+                url,
+                headers=headers,
+            ):
+                if sse.event == "ping":
+                    continue
+                log.debug(
+                    "[SYNC] received event: %s with data: %s and ID: %s and retry: %s",
+                    sse.event,
+                    sse.data,
+                    sse.id,
+                    sse.retry,
+                )
+                last_event = json.loads(sse.data)
+                if not last_event:
+                    continue
+                data = _extract_event_data(last_event, error_cls)
+                yield data
+        if _should_retry_after_stream(last_event, status_keys, pending_state_value):
+            retry += 1
+            continue
+        return
+
+
+async def aiter_run_events(
+    url: str,
+    *,
+    headers: dict[str, str] | None = None,
+    retry: int = 0,
+    max_retries: int = DEFAULT_MAX_RETRIES,
+    pending_state_value: str = RunStatus.PENDING.value,
+    status_keys: tuple[str, ...] = ("state",),
+    error_cls: type[Exception] = RuntimeError,
+    log=_logger,
+) -> AsyncGenerator[dict, None]:
+    """
+    Async stream run events from an SSE endpoint with retries.
+
+    Args:
+        url: SSE endpoint URL.
+        headers: Optional HTTP headers.
+        retry: Internal retry counter (do not set manually).
+        max_retries: Maximum number of retry attempts.
+        pending_state_value: State value that triggers a re-check loop.
+        status_keys: Payload keys to search for the run state.
+        error_cls: Exception type to raise on errors.
+        log: Logger instance for debug output.
+
+    Yields:
+        Event payloads decoded from the SSE data field.
+    """
+    while True:
+        if retry > max_retries:
+            raise error_cls("Max retries reached")
+        last_event = None
+        async with httpx.AsyncClient(
+            timeout=httpx.Timeout(None, connect=5.0)
+        ) as client:
+            async_iterator = await aiter_sse_retrying(
+                client,
+                "GET",
+                url,
+                headers=headers or {},
+            )
+            async for sse in async_iterator:
+                if sse.event == "ping":
+                    continue
+                log.debug(
+                    "[ASYNC] Received event: %s with data: %s and ID: %s and retry: %s",
+                    sse.event,
+                    sse.data,
+                    sse.id,
+                    sse.retry,
+                )
+                last_event = json.loads(sse.data)
+                data = _extract_event_data(last_event, error_cls)
+                yield data
+        if _should_retry_after_stream(last_event, status_keys, pending_state_value):
+            retry += 1
+            continue
+        return
+
+
+def update_progress_from_result(
+    iteration: dict,
+    progress: tqdm,
+    *,
+    uploading_text: str,
+    log=_logger,
+) -> bool:
+    """
+    Update a tqdm progress bar based on a serialized progress result string.
+
+    Args:
+        iteration: Payload containing a nested result string.
+        progress: tqdm instance to update.
+        uploading_text: Description to show when progress reaches 100%.
+        log: Logger instance for debug output.
+
+    Returns:
+        True if a progress update occurred, False otherwise.
+    """
+    if (
+        iteration.get("result")
+        and isinstance(iteration["result"], dict)
+        and iteration["result"].get("result")
+        and isinstance(iteration["result"]["result"], str)
+    ):
+        result_info = iteration["result"]["result"].split(":")
+        if len(result_info) > 1:
+            stage = result_info[0]
+            current_progress_percentage = float(
+                result_info[1].removeprefix(" ").removesuffix("% done")
+            )
+        elif len(result_info) == 1:
+            stage = result_info[0]
+            current_progress_percentage = progress.n
+        else:
+            stage = "Unknown progress state"
+            current_progress_percentage = progress.n
+        desc = uploading_text if current_progress_percentage == 100.0 else stage
+        progress.set_description(desc)
+        progress.n = current_progress_percentage
+        log.debug("Setting progress to %s", progress.n)
+        progress.refresh()
+        return True
+    return False
+
+
+def handle_run_failure(
+    iteration: dict, *, error_cls: type[Exception], run_label: str
+) -> None:
+    """
+    Raise a run-specific failure exception based on the iteration payload.
+
+    Args:
+        iteration: Payload containing error details.
+        error_cls: Exception type to raise.
+        run_label: Human-readable label for the run type.
+    """
+    if iteration.get("result"):
+        raise error_cls(f"{run_label} run failed with error: {iteration['result']}")
+    raise error_cls(f"{run_label} run failed with an unknown error")
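This second new module centralizes run-event handling shared by the run clients: `RunStatus` enumerates Celery-style states, `iter_run_events`/`aiter_run_events` wrap the retrying SSE readers, and `update_progress_from_result` parses `"<stage>: <pct>% done"` result strings into a tqdm bar. A hypothetical polling loop built from these helpers (the URL and token are placeholders):

```python
from tqdm import tqdm

progress = tqdm(total=100.0, desc="LLM unlearning")
for event in iter_run_events(
    "https://api.example.com/runs/run-id-placeholder/events",  # placeholder
    headers={"Authorization": "Bearer <token>"},
    error_cls=RuntimeError,
):
    update_progress_from_result(event, progress, uploading_text="Uploading results")
    state = get_state(event, ("state",))
    if state == RunStatus.FAILURE.value:
        handle_run_failure(event, error_cls=RuntimeError, run_label="LLM unlearning")
    if state == RunStatus.SUCCESS.value:
        break
progress.close()
```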
hirundo/_urls.py CHANGED
@@ -54,6 +54,7 @@ HirundoUrl = Annotated[
             "s3",
             "gs",
             "ssh",
+            "hf",
         ]
     ),
 ]
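`hf` joins the allowed URL schemes, presumably to admit Hugging Face Hub locations alongside `s3`, `gs`, and `ssh`. The exact path layout is not shown in this diff; a hypothetical example of a URL the validator would now accept:

```python
repo_url = "hf://datasets/my-org/my-dataset"  # hypothetical layout
```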
hirundo/cli.py CHANGED
@@ -1,7 +1,6 @@
 import os
 import re
 import sys
-import typing
 from pathlib import Path
 from typing import Annotated
 from urllib.parse import urlparse
@@ -28,9 +27,7 @@ app = typer.Typer(
 )
 
 
-def _upsert_env(
-    dotenv_filepath: typing.Union[str, Path], var_name: str, var_value: str
-):
+def _upsert_env(dotenv_filepath: str | Path, var_name: str, var_value: str):
     """
     Change an environment variable in the .env file.
     If the variable does not exist, it will be added.
hirundo/dataset_enum.py CHANGED
@@ -24,6 +24,7 @@ class DatasetMetadataType(str, Enum):
     HIRUNDO_CSV = "HirundoCSV"
     COCO = "COCO"
     YOLO = "YOLO"
+    HuggingFaceAudio = "HuggingFaceAudio"
     KeylabsObjDetImages = "KeylabsObjDetImages"
     KeylabsObjDetVideo = "KeylabsObjDetVideo"
     KeylabsObjSegImages = "KeylabsObjSegImages"
@@ -44,3 +45,4 @@ class StorageTypes(str, Enum):
     """
     Local storage config is only supported for on-premises installations.
     """
+    HUGGINGFACE = "HuggingFace"
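Both enums gain a Hugging Face entry: `DatasetMetadataType.HuggingFaceAudio` for audio dataset metadata and `StorageTypes.HUGGINGFACE` for the new storage backend. A quick sketch of the new members (import path taken from this file; whether they are re-exported at the package root is not shown here):

```python
from hirundo.dataset_enum import DatasetMetadataType, StorageTypes

assert DatasetMetadataType.HuggingFaceAudio.value == "HuggingFaceAudio"
assert StorageTypes.HUGGINGFACE.value == "HuggingFace"
```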