hirundo 0.1.8__tar.gz → 0.1.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hirundo-0.1.8 → hirundo-0.1.9}/PKG-INFO +77 -41
- {hirundo-0.1.8 → hirundo-0.1.9}/README.md +68 -38
- {hirundo-0.1.8 → hirundo-0.1.9}/hirundo/__init__.py +17 -9
- hirundo-0.1.9/hirundo/_constraints.py +53 -0
- hirundo-0.1.9/hirundo/_http.py +19 -0
- {hirundo-0.1.8 → hirundo-0.1.9}/hirundo/_iter_sse_retrying.py +61 -17
- {hirundo-0.1.8 → hirundo-0.1.9}/hirundo/dataset_optimization.py +421 -83
- hirundo-0.1.9/hirundo/enum.py +23 -0
- {hirundo-0.1.8 → hirundo-0.1.9}/hirundo/git.py +85 -20
- hirundo-0.1.9/hirundo/storage.py +466 -0
- {hirundo-0.1.8 → hirundo-0.1.9}/hirundo.egg-info/PKG-INFO +77 -41
- {hirundo-0.1.8 → hirundo-0.1.9}/hirundo.egg-info/requires.txt +8 -2
- {hirundo-0.1.8 → hirundo-0.1.9}/pyproject.toml +11 -3
- hirundo-0.1.8/hirundo/_constraints.py +0 -21
- hirundo-0.1.8/hirundo/_http.py +0 -14
- hirundo-0.1.8/hirundo/enum.py +0 -20
- hirundo-0.1.8/hirundo/storage.py +0 -295
- {hirundo-0.1.8 → hirundo-0.1.9}/LICENSE +0 -0
- {hirundo-0.1.8 → hirundo-0.1.9}/hirundo/__main__.py +0 -0
- {hirundo-0.1.8 → hirundo-0.1.9}/hirundo/_env.py +0 -0
- {hirundo-0.1.8 → hirundo-0.1.9}/hirundo/_headers.py +0 -0
- {hirundo-0.1.8 → hirundo-0.1.9}/hirundo/_timeouts.py +0 -0
- {hirundo-0.1.8 → hirundo-0.1.9}/hirundo/cli.py +0 -0
- {hirundo-0.1.8 → hirundo-0.1.9}/hirundo/logger.py +0 -0
- {hirundo-0.1.8 → hirundo-0.1.9}/hirundo.egg-info/SOURCES.txt +0 -0
- {hirundo-0.1.8 → hirundo-0.1.9}/hirundo.egg-info/dependency_links.txt +0 -0
- {hirundo-0.1.8 → hirundo-0.1.9}/hirundo.egg-info/entry_points.txt +0 -0
- {hirundo-0.1.8 → hirundo-0.1.9}/hirundo.egg-info/top_level.txt +0 -0
- {hirundo-0.1.8 → hirundo-0.1.9}/setup.cfg +0 -0
{hirundo-0.1.8 → hirundo-0.1.9}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: hirundo
-Version: 0.1.8
+Version: 0.1.9
 Summary: This package is used to interface with Hirundo's platform. It provides a simple API to optimize your ML datasets.
 Author-email: Hirundo <dev@hirundo.io>
 License: MIT License
@@ -47,10 +47,13 @@ Requires-Dist: stamina>=24.2.0; extra == "dev"
 Requires-Dist: httpx-sse>=0.4.0; extra == "dev"
 Requires-Dist: pytest>=8.2.0; extra == "dev"
 Requires-Dist: pytest-asyncio>=0.23.6; extra == "dev"
-Requires-Dist: uv; extra == "dev"
+Requires-Dist: uv>=0.5.8; extra == "dev"
 Requires-Dist: pre-commit>=3.7.1; extra == "dev"
-Requires-Dist:
+Requires-Dist: virtualenv>=20.6.6; extra == "dev"
+Requires-Dist: ruff>=0.8.2; extra == "dev"
 Requires-Dist: bumpver; extra == "dev"
+Requires-Dist: platformdirs>=4.3.6; extra == "dev"
+Requires-Dist: safety>=3.2.13; extra == "dev"
 Provides-Extra: docs
 Requires-Dist: sphinx>=7.4.7; extra == "docs"
 Requires-Dist: sphinx-autobuild>=2024.4.16; extra == "docs"
@@ -58,6 +61,9 @@ Requires-Dist: sphinx-click>=5.0.1; extra == "docs"
 Requires-Dist: autodoc_pydantic>=2.2.0; extra == "docs"
 Requires-Dist: furo; extra == "docs"
 Requires-Dist: sphinx-multiversion; extra == "docs"
+Requires-Dist: esbonio; extra == "docs"
+Requires-Dist: starlette>0.40.0; extra == "docs"
+Requires-Dist: markupsafe>=3.0.2; extra == "docs"

 # Hirundo

@@ -66,7 +72,7 @@ This package exposes access to Hirundo APIs for dataset optimization for Machine
 Dataset optimization is currently available for datasets labelled for classification and object detection.


-Support dataset storage
+Support dataset storage configs include:
 - Google Cloud (GCP) Storage
 - Amazon Web Services (AWS) S3
 - Git LFS (Large File Storage) repositories (e.g. GitHub or HuggingFace)
@@ -107,27 +113,33 @@ You can install the codebase with a simple `pip install hirundo` to install the
 ## Usage

 Classification example:
-```
-from hirundo
-
-
+```python
+from hirundo import (
+    HirundoCSV,
+    LabelingType,
+    OptimizationDataset,
+    StorageGCP,
+    StorageConfig,
+    StorageTypes,
+)

+gcp_bucket = StorageGCP(
+    bucket_name="cifar100bucket",
+    project="Hirundo-global",
+    credentials_json=json.loads(os.environ["GCP_CREDENTIALS"]),
+)
 test_dataset = OptimizationDataset(
     name="TEST-GCP cifar 100 classification dataset",
-
-
-
-
-
-
-
-
-
-            ),
-        ),
-        path="/pytorch-cifar/data",
+    labeling_type=LabelingType.SINGLE_LABEL_CLASSIFICATION,
+    storage_config=StorageConfig(
+        name="cifar100bucket",
+        type=StorageTypes.GCP,
+        gcp=gcp_bucket,
+    ),
+    data_root_url=gcp_bucket.get_url(path="/pytorch-cifar/data"),
+    labeling_info=HirundoCSV(
+        csv_url=gcp_bucket.get_url(path="/pytorch-cifar/data/cifar100.csv"),
     ),
-    dataset_metadata_path="cifar100.csv",
     classes=cifar100_classes,
 )

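For reference, here is the new classification snippet from this hunk assembled into a self-contained script. The `json` and `os` imports and the shortened `cifar100_classes` list are assumptions added for completeness; the README fragment uses all three without defining them.

```python
import json
import os

from hirundo import (
    HirundoCSV,
    LabelingType,
    OptimizationDataset,
    StorageConfig,
    StorageGCP,
    StorageTypes,
)

# Assumed placeholder: the real list holds all 100 CIFAR-100 class names.
cifar100_classes = ["apple", "aquarium_fish", "baby"]

gcp_bucket = StorageGCP(
    bucket_name="cifar100bucket",
    project="Hirundo-global",
    credentials_json=json.loads(os.environ["GCP_CREDENTIALS"]),
)
test_dataset = OptimizationDataset(
    name="TEST-GCP cifar 100 classification dataset",
    labeling_type=LabelingType.SINGLE_LABEL_CLASSIFICATION,
    storage_config=StorageConfig(
        name="cifar100bucket",
        type=StorageTypes.GCP,
        gcp=gcp_bucket,
    ),
    data_root_url=gcp_bucket.get_url(path="/pytorch-cifar/data"),
    labeling_info=HirundoCSV(
        csv_url=gcp_bucket.get_url(path="/pytorch-cifar/data/cifar100.csv"),
    ),
    classes=cifar100_classes,
)
test_dataset.run_optimization()
```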
@@ -139,29 +151,53 @@ print(results)

 Object detection example:

-```
-from hirundo
-
-
+```python
+from hirundo import (
+    GitRepo,
+    HirundoCSV,
+    LabelingType,
+    OptimizationDataset,
+    StorageGit,
+    StorageConfig,
+    StorageTypes,
+)

+git_storage = StorageGit(
+    repo=GitRepo(
+        name="BDD-100k-validation-dataset",
+        repository_url="https://git@hf.co/datasets/hirundo-io/bdd100k-validation-only.git",
+    ),
+    branch="main",
+)
 test_dataset = OptimizationDataset(
-    name=
-
-
-
-
-
-
-
-
-
-
-        branch="main",
-    ),
+    name="TEST-HuggingFace-BDD-100k-validation-OD-validation-dataset",
+    labeling_type=LabelingType.OBJECT_DETECTION,
+    storage_config=StorageConfig(
+        name="BDD-100k-validation-dataset",
+        type=StorageTypes.GIT,
+        git=git_storage,
+    ),
+    data_root_url=git_storage.get_url(path="/BDD100K Val from Hirundo.zip/bdd100k"),
+    labeling_info=HirundoCSV(
+        csv_url=git_storage.get_url(
+            path="/BDD100K Val from Hirundo.zip/bdd100k/bdd100k.csv"
         ),
-        path="/BDD100K Val from Hirundo.zip/bdd100k",
     ),
-
+    classes=[
+        "traffic light",
+        "traffic sign",
+        "car",
+        "pedestrian",
+        "bus",
+        "truck",
+        "rider",
+        "bicycle",
+        "motorcycle",
+        "train",
+        "other vehicle",
+        "other person",
+        "trailer",
+    ],
 )

 test_dataset.run_optimization()
@@ -173,4 +209,4 @@ Note: Currently we only support the main CPython release 3.9, 3.10 and 3.11. PyP

 ## Further documentation

-To learn about
+To learn more about how to use this library, please visit the [http://docs.hirundo.io/](documentation) or see the Google Colab examples.
{hirundo-0.1.8 → hirundo-0.1.9}/README.md

@@ -5,7 +5,7 @@ This package exposes access to Hirundo APIs for dataset optimization for Machine
 Dataset optimization is currently available for datasets labelled for classification and object detection.


-Support dataset storage
+Support dataset storage configs include:
 - Google Cloud (GCP) Storage
 - Amazon Web Services (AWS) S3
 - Git LFS (Large File Storage) repositories (e.g. GitHub or HuggingFace)
@@ -46,27 +46,33 @@ You can install the codebase with a simple `pip install hirundo` to install the
 ## Usage

 Classification example:
-```
-from hirundo
-
-
+```python
+from hirundo import (
+    HirundoCSV,
+    LabelingType,
+    OptimizationDataset,
+    StorageGCP,
+    StorageConfig,
+    StorageTypes,
+)

+gcp_bucket = StorageGCP(
+    bucket_name="cifar100bucket",
+    project="Hirundo-global",
+    credentials_json=json.loads(os.environ["GCP_CREDENTIALS"]),
+)
 test_dataset = OptimizationDataset(
     name="TEST-GCP cifar 100 classification dataset",
-
-
-
-
-
-
-
-
-
-            ),
-        ),
-        path="/pytorch-cifar/data",
+    labeling_type=LabelingType.SINGLE_LABEL_CLASSIFICATION,
+    storage_config=StorageConfig(
+        name="cifar100bucket",
+        type=StorageTypes.GCP,
+        gcp=gcp_bucket,
+    ),
+    data_root_url=gcp_bucket.get_url(path="/pytorch-cifar/data"),
+    labeling_info=HirundoCSV(
+        csv_url=gcp_bucket.get_url(path="/pytorch-cifar/data/cifar100.csv"),
     ),
-    dataset_metadata_path="cifar100.csv",
     classes=cifar100_classes,
 )

@@ -78,29 +84,53 @@ print(results)

 Object detection example:

-```
-from hirundo
-
-
+```python
+from hirundo import (
+    GitRepo,
+    HirundoCSV,
+    LabelingType,
+    OptimizationDataset,
+    StorageGit,
+    StorageConfig,
+    StorageTypes,
+)

+git_storage = StorageGit(
+    repo=GitRepo(
+        name="BDD-100k-validation-dataset",
+        repository_url="https://git@hf.co/datasets/hirundo-io/bdd100k-validation-only.git",
+    ),
+    branch="main",
+)
 test_dataset = OptimizationDataset(
-    name=
-
-
-
-
-
-
-
-
-
-
-        branch="main",
-    ),
+    name="TEST-HuggingFace-BDD-100k-validation-OD-validation-dataset",
+    labeling_type=LabelingType.OBJECT_DETECTION,
+    storage_config=StorageConfig(
+        name="BDD-100k-validation-dataset",
+        type=StorageTypes.GIT,
+        git=git_storage,
+    ),
+    data_root_url=git_storage.get_url(path="/BDD100K Val from Hirundo.zip/bdd100k"),
+    labeling_info=HirundoCSV(
+        csv_url=git_storage.get_url(
+            path="/BDD100K Val from Hirundo.zip/bdd100k/bdd100k.csv"
         ),
-        path="/BDD100K Val from Hirundo.zip/bdd100k",
     ),
-
+    classes=[
+        "traffic light",
+        "traffic sign",
+        "car",
+        "pedestrian",
+        "bus",
+        "truck",
+        "rider",
+        "bicycle",
+        "motorcycle",
+        "train",
+        "other vehicle",
+        "other person",
+        "trailer",
+    ],
 )

 test_dataset.run_optimization()
@@ -112,4 +142,4 @@ Note: Currently we only support the main CPython release 3.9, 3.10 and 3.11. PyP

 ## Further documentation

-To learn about
+To learn more about how to use this library, please visit the [http://docs.hirundo.io/](documentation) or see the Google Colab examples.
{hirundo-0.1.8 → hirundo-0.1.9}/hirundo/__init__.py

@@ -1,35 +1,43 @@
 from .dataset_optimization import (
+    COCO,
+    YOLO,
+    HirundoCSV,
     HirundoError,
     OptimizationDataset,
+    RunArgs,
+    VisionRunArgs,
 )
 from .enum import (
     DatasetMetadataType,
-
+    LabelingType,
 )
 from .git import GitRepo
 from .storage import (
+    StorageConfig,
     StorageGCP,
-    # StorageAzure, TODO: Azure storage
+    # StorageAzure, TODO: Azure storage is coming soon
     StorageGit,
-    StorageIntegration,
-    StorageLink,
     StorageS3,
     StorageTypes,
 )

 __all__ = [
+    "COCO",
+    "YOLO",
+    "HirundoCSV",
     "HirundoError",
     "OptimizationDataset",
-    "
+    "RunArgs",
+    "VisionRunArgs",
+    "LabelingType",
     "DatasetMetadataType",
     "GitRepo",
-    "StorageLink",
     "StorageTypes",
     "StorageS3",
     "StorageGCP",
-    # "StorageAzure", TODO: Azure storage
+    # "StorageAzure", TODO: Azure storage is coming soon
     "StorageGit",
-    "
+    "StorageConfig",
 ]

-__version__ = "0.1.8"
+__version__ = "0.1.9"
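The export surface changed here: `StorageIntegration` and `StorageLink` are gone, `StorageConfig` replaces them, and `COCO`, `YOLO`, `HirundoCSV`, `RunArgs`, `VisionRunArgs`, and `LabelingType` are newly exported. A hypothetical shim for downstream code that must import against both versions, assuming `StorageConfig` fills the role `StorageIntegration` did:

```python
# Hypothetical compatibility shim, not from the package: branch on the renamed
# export. Assumes StorageConfig plays the role StorageIntegration played in 0.1.8.
try:
    from hirundo import StorageConfig  # hirundo >= 0.1.9
except ImportError:  # hirundo <= 0.1.8
    from hirundo import StorageIntegration as StorageConfig  # type: ignore[no-redef]
```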
hirundo-0.1.9/hirundo/_constraints.py

@@ -0,0 +1,53 @@
+from typing import Annotated
+
+from pydantic import StringConstraints, UrlConstraints
+from pydantic_core import Url
+
+S3BucketUrl = Annotated[
+    str,
+    StringConstraints(
+        min_length=8,
+        max_length=1023,
+        pattern=r"s3?://[a-z0-9.-]{3,64}[/]?",  # Only allow real S3 bucket URLs
+    ),
+]
+
+StorageConfigName = Annotated[
+    str,
+    StringConstraints(
+        min_length=1,
+        max_length=255,
+        pattern=r"^[a-zA-Z0-9-_]+$",
+    ),
+]
+
+S3_MIN_LENGTH = 8
+S3_MAX_LENGTH = 1023
+S3_PATTERN = r"s3://[a-zA-Z0-9.-]{3,64}/[a-zA-Z0-9.-/]+"
+GCP_MIN_LENGTH = 8
+GCP_MAX_LENGTH = 1023
+GCP_PATTERN = r"gs://[a-zA-Z0-9.-]{3,64}/[a-zA-Z0-9.-/]+"
+
+RepoUrl = Annotated[
+    Url,
+    UrlConstraints(
+        allowed_schemes=[
+            "ssh",
+            "https",
+            "http",
+        ]
+    ),
+]
+HirundoUrl = Annotated[
+    Url,
+    UrlConstraints(
+        allowed_schemes=[
+            "file",
+            "https",
+            "http",
+            "s3",
+            "gs",
+            "ssh",
+        ]
+    ),
+]
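These `Annotated` aliases are pydantic v2 constraint types, so any model field typed with them is validated automatically, and a standalone `TypeAdapter` can check plain values. An illustrative check (not part of the package):

```python
from pydantic import TypeAdapter, ValidationError

from hirundo._constraints import HirundoUrl, StorageConfigName

TypeAdapter(StorageConfigName).validate_python("cifar100bucket")  # matches ^[a-zA-Z0-9-_]+$
TypeAdapter(HirundoUrl).validate_python("gs://cifar100bucket/pytorch-cifar/data")  # "gs" scheme allowed

try:
    TypeAdapter(StorageConfigName).validate_python("not valid!")  # space and "!" break the pattern
except ValidationError as exc:
    print(exc)
```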
hirundo-0.1.9/hirundo/_http.py

@@ -0,0 +1,19 @@
+from requests import Response
+
+import hirundo.logger
+
+logger = hirundo.logger.get_logger(__name__)
+
+MINIMUM_CLIENT_SERVER_ERROR_CODE = 400
+
+
+def raise_for_status_with_reason(response: Response):
+    try:
+        if response.status_code >= MINIMUM_CLIENT_SERVER_ERROR_CODE:
+            response.reason = response.json().get("reason", None)
+            if response.reason is None:
+                response.reason = response.json().get("detail", None)
+    except Exception as e:
+        logger.debug("Could not parse response as JSON: %s", e)
+
+    response.raise_for_status()
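`raise_for_status_with_reason` rewrites `response.reason` from the error body's `reason` field (falling back to `detail`, e.g. FastAPI's default error key) before delegating to `requests`' own `raise_for_status`, so the raised `HTTPError` carries the server's explanation instead of the bare status phrase. A hypothetical call site, with a placeholder URL and header:

```python
import requests

from hirundo._http import raise_for_status_with_reason

response = requests.get(
    "https://api.example.com/dataset-optimization/run/some-run-id",  # placeholder URL
    headers={"Authorization": "Bearer <API key>"},
    timeout=30,
)
raise_for_status_with_reason(response)  # HTTPError text now includes the JSON reason/detail
```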
{hirundo-0.1.8 → hirundo-0.1.9}/hirundo/_iter_sse_retrying.py

@@ -1,12 +1,20 @@
 import asyncio
 import time
 import typing
+import uuid
 from collections.abc import AsyncGenerator, Generator

 import httpx
-
+import requests
+import urllib3
+from httpx_sse import ServerSentEvent, SSEError, aconnect_sse, connect_sse
 from stamina import retry

+from hirundo._timeouts import READ_TIMEOUT
+from hirundo.logger import get_logger
+
+logger = get_logger(__name__)
+

 # Credit: https://github.com/florimondmanca/httpx-sse/blob/master/README.md#handling-reconnections
 def iter_sse_retrying(
@@ -28,7 +36,13 @@ def iter_sse_retrying(
     # This may happen when the server is overloaded and closes the connection or
     # when Kubernetes restarts / replaces a pod.
     # Likewise, this will likely be temporary, hence the retries.
-    @retry(
+    @retry(
+        on=(
+            httpx.ReadError,
+            httpx.RemoteProtocolError,
+            urllib3.exceptions.ReadTimeoutError,
+        )
+    )
     def _iter_sse():
         nonlocal last_event_id, reconnection_delay

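The retry predicate is now an explicit tuple of transient network errors. stamina's `retry` takes the exception type (or tuple) via the keyword-only `on=` argument and re-invokes the decorated function with exponential backoff. A standalone sketch, with stamina's default attempt and timeout settings assumed:

```python
import httpx
import urllib3
from stamina import retry


@retry(
    on=(
        httpx.ReadError,
        httpx.RemoteProtocolError,
        urllib3.exceptions.ReadTimeoutError,
    )
)
def fetch_once(client: httpx.Client, url: str) -> str:
    # Re-run with exponential backoff while one of the errors above escapes.
    return client.get(url).text
```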
@@ -44,13 +58,27 @@ def iter_sse_retrying(
             connect_headers["Last-Event-ID"] = last_event_id

         with connect_sse(client, method, url, headers=connect_headers) as event_source:
-
-
-
-
-
-
-
+            try:
+                for sse in event_source.iter_sse():
+                    last_event_id = sse.id
+
+                    if sse.retry is not None:
+                        reconnection_delay = sse.retry / 1000
+
+                    yield sse
+            except SSEError:
+                logger.error("SSE error occurred. Trying regular request")
+                response = requests.get(
+                    url,
+                    headers=connect_headers,
+                    timeout=READ_TIMEOUT,
+                )
+                yield ServerSentEvent(
+                    event="",
+                    data=response.text,
+                    id=uuid.uuid4().hex,
+                    retry=None,
+                )

     return _iter_sse()

@@ -72,7 +100,13 @@ async def aiter_sse_retrying(
     # This may happen when the server is overloaded and closes the connection or
     # when Kubernetes restarts / replaces a pod.
     # Likewise, this will likely be temporary, hence the retries.
-    @retry(
+    @retry(
+        on=(
+            httpx.ReadError,
+            httpx.RemoteProtocolError,
+            urllib3.exceptions.ReadTimeoutError,
+        )
+    )
     async def _iter_sse() -> AsyncGenerator[ServerSentEvent, None]:
         nonlocal last_event_id, reconnection_delay

@@ -86,12 +120,22 @@ async def aiter_sse_retrying(
         async with aconnect_sse(
             client, method, url, headers=connect_headers
         ) as event_source:
-
-
-
-
-
-
-
+            try:
+                async for sse in event_source.aiter_sse():
+                    last_event_id = sse.id
+
+                    if sse.retry is not None:
+                        reconnection_delay = sse.retry / 1000
+
+                    yield sse
+            except SSEError:
+                logger.error("SSE error occurred. Trying regular request")
+                response = await client.get(url, headers=connect_headers)
+                yield ServerSentEvent(
+                    event="",
+                    data=response.text,
+                    id=uuid.uuid4().hex,
+                    retry=None,
+                )

     return _iter_sse()
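A hypothetical consumer of the synchronous variant, following the httpx-sse README pattern the module credits; the endpoint is a placeholder, and any parameters beyond `(client, method, url)` are assumed from the `connect_sse` call above:

```python
import httpx

from hirundo._iter_sse_retrying import iter_sse_retrying

with httpx.Client(timeout=None) as client:
    for sse in iter_sse_retrying(client, "GET", "https://api.example.com/events"):
        # On an SSEError the generator falls back to one plain GET and yields
        # its body as a single synthetic event.
        print(sse.event, sse.data)
```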