hirundo-0.1.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hirundo/__init__.py +35 -0
- hirundo/__main__.py +3 -0
- hirundo/_constraints.py +21 -0
- hirundo/_env.py +12 -0
- hirundo/_headers.py +9 -0
- hirundo/_iter_sse_retrying.py +97 -0
- hirundo/_timeouts.py +2 -0
- hirundo/cli.py +132 -0
- hirundo/dataset_optimization.py +383 -0
- hirundo/enum.py +20 -0
- hirundo/git.py +158 -0
- hirundo/storage.py +260 -0
- hirundo-0.1.3.dist-info/LICENSE +9 -0
- hirundo-0.1.3.dist-info/METADATA +114 -0
- hirundo-0.1.3.dist-info/RECORD +18 -0
- hirundo-0.1.3.dist-info/WHEEL +5 -0
- hirundo-0.1.3.dist-info/entry_points.txt +2 -0
- hirundo-0.1.3.dist-info/top_level.txt +1 -0
hirundo/__init__.py
ADDED
@@ -0,0 +1,35 @@
+from .dataset_optimization import (
+    HirundoError,
+    OptimizationDataset,
+)
+from .enum import (
+    DatasetMetadataType,
+    LabellingType,
+)
+from .git import GitRepo
+from .storage import (
+    StorageGCP,
+    # StorageAzure, TODO: Azure storage integration is coming soon
+    StorageGit,
+    StorageIntegration,
+    StorageLink,
+    StorageS3,
+    StorageTypes,
+)
+
+__all__ = [
+    "HirundoError",
+    "OptimizationDataset",
+    "LabellingType",
+    "DatasetMetadataType",
+    "GitRepo",
+    "StorageLink",
+    "StorageTypes",
+    "StorageS3",
+    "StorageGCP",
+    # "StorageAzure", TODO: Azure storage integration is coming soon
+    "StorageGit",
+    "StorageIntegration",
+]
+
+__version__ = "0.1.3"
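For orientation, a minimal sketch of consuming this public surface (it assumes `API_KEY` is already configured, since importing `hirundo` triggers the check in `_env.py` below):

```python
import hirundo

print(hirundo.__version__)      # "0.1.3"
print(sorted(hirundo.__all__))  # the exports listed above
```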
hirundo/__main__.py
ADDED
hirundo/_constraints.py
ADDED
@@ -0,0 +1,21 @@
+from typing import Annotated
+
+from pydantic import StringConstraints
+
+S3BucketUrl = Annotated[
+    str,
+    StringConstraints(
+        min_length=8,
+        max_length=1023,
+        pattern=r"s3?://[a-z0-9.-]{3,64}[/]?",  # Only allow real S3 bucket URLs
+    ),
+]
+
+StorageIntegrationName = Annotated[
+    str,
+    StringConstraints(
+        min_length=1,
+        max_length=255,
+        pattern=r"^[a-zA-Z0-9-_]+$",
+    ),
+]
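These annotated types can be exercised directly with pydantic's `TypeAdapter`. One caveat worth noting: because the `3` in `s3?://` is optional, strings such as `s://...` also satisfy the bucket-URL pattern, so the constraint is looser than its comment suggests. A quick validation sketch:

```python
from pydantic import TypeAdapter

from hirundo._constraints import S3BucketUrl, StorageIntegrationName

TypeAdapter(S3BucketUrl).validate_python("s3://my-bucket")        # passes
TypeAdapter(StorageIntegrationName).validate_python("my_bucket")  # passes
# TypeAdapter(StorageIntegrationName).validate_python("my bucket")  # would raise ValidationError (space not allowed)
```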
hirundo/_env.py
ADDED
@@ -0,0 +1,12 @@
+import os
+
+from dotenv import load_dotenv
+
+load_dotenv()
+
+API_HOST = os.getenv("API_HOST", "https://api.hirundo.io")
+API_KEY = os.getenv("API_KEY")
+if not API_KEY:
+    raise ValueError(
+        "API_KEY is not set. Please run `hirundo setup` to set the API key"
+    )
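Because this check runs at import time, anything importing `hirundo` needs `API_KEY` available first, either from a `.env` file in the working directory (via `load_dotenv`) or from the process environment. A minimal sketch (the key value is a placeholder):

```python
import os

# Provide the variables hirundo/_env.py reads; a ./.env file works equally well.
os.environ.setdefault("API_KEY", "YOUR-API-KEY")
os.environ.setdefault("API_HOST", "https://api.hirundo.io")

import hirundo  # noqa: E402  # safe now that API_KEY is set
```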
hirundo/_headers.py
ADDED
hirundo/_iter_sse_retrying.py
ADDED
@@ -0,0 +1,97 @@
+import asyncio
+import time
+from collections.abc import AsyncGenerator, Generator
+from typing import Union
+
+import httpx
+from httpx_sse import ServerSentEvent, aconnect_sse, connect_sse
+from stamina import retry
+
+
+# Credit: https://github.com/florimondmanca/httpx-sse/blob/master/README.md#handling-reconnections
+def iter_sse_retrying(
+    client: httpx.Client,
+    method: str,
+    url: str,
+    headers: Union[dict[str, str], None] = None,
+) -> Generator[ServerSentEvent, None, None]:
+    if headers is None:
+        headers = {}
+    last_event_id = ""
+    reconnection_delay = 0.0
+
+    # `stamina` will apply jitter and exponential backoff on top of
+    # the `retry` reconnection delay sent by the server.
+    # httpx.ReadError is thrown when there is a network error.
+    # Some network errors may be temporary, hence the retries.
+    # httpx.RemoteProtocolError is thrown when the server closes the connection.
+    # This may happen when the server is overloaded and closes the connection or
+    # when Kubernetes restarts / replaces a pod.
+    # Likewise, this will likely be temporary, hence the retries.
+    @retry(on=(httpx.ReadError, httpx.RemoteProtocolError))
+    def _iter_sse():
+        nonlocal last_event_id, reconnection_delay
+
+        time.sleep(reconnection_delay)
+
+        connect_headers = {
+            **headers,
+            "Accept": "text/event-stream",
+            "X-Accel-Buffering": "no",
+        }
+
+        if last_event_id:
+            connect_headers["Last-Event-ID"] = last_event_id
+
+        with connect_sse(client, method, url, headers=connect_headers) as event_source:
+            for sse in event_source.iter_sse():
+                last_event_id = sse.id
+
+                if sse.retry is not None:
+                    reconnection_delay = sse.retry / 1000
+
+                yield sse
+
+    return _iter_sse()
+
+
+async def aiter_sse_retrying(
+    client: httpx.AsyncClient,
+    method: str,
+    url: str,
+    headers: dict[str, str],
+) -> AsyncGenerator[ServerSentEvent, None]:
+    last_event_id = ""
+    reconnection_delay = 0.0
+
+    # `stamina` will apply jitter and exponential backoff on top of
+    # the `retry` reconnection delay sent by the server.
+    # httpx.ReadError is thrown when there is a network error.
+    # Some network errors may be temporary, hence the retries.
+    # httpx.RemoteProtocolError is thrown when the server closes the connection.
+    # This may happen when the server is overloaded and closes the connection or
+    # when Kubernetes restarts / replaces a pod.
+    # Likewise, this will likely be temporary, hence the retries.
+    @retry(on=(httpx.ReadError, httpx.RemoteProtocolError))
+    async def _iter_sse() -> AsyncGenerator[ServerSentEvent, None]:
+        nonlocal last_event_id, reconnection_delay
+
+        await asyncio.sleep(reconnection_delay)
+
+        connect_headers = {**headers, "Accept": "text/event-stream"}
+
+        if last_event_id:
+            connect_headers["Last-Event-ID"] = last_event_id
+
+        async with aconnect_sse(
+            client, method, url, headers=connect_headers
+        ) as event_source:
+            async for sse in event_source.aiter_sse():
+                last_event_id = sse.id
+
+                if sse.retry is not None:
+                    reconnection_delay = sse.retry / 1000
+
+                yield sse
+
+    return _iter_sse()
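A hedged consumption sketch for the synchronous helper (the endpoint URL and auth header below are placeholders, not part of the package; `check_run_by_id` in `dataset_optimization.py` further down uses this same pattern against a real run URL):

```python
import httpx

from hirundo._iter_sse_retrying import iter_sse_retrying

with httpx.Client(timeout=httpx.Timeout(None, connect=5.0)) as client:
    for sse in iter_sse_retrying(
        client,
        "GET",
        "https://example.com/events",  # placeholder SSE endpoint
        headers={"Authorization": "Bearer YOUR-API-KEY"},  # placeholder auth header
    ):
        # Reconnects transparently on httpx.ReadError / httpx.RemoteProtocolError,
        # resuming from the last received event ID.
        print(sse.event, sse.data)
```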
hirundo/_timeouts.py
ADDED
hirundo/cli.py
ADDED
@@ -0,0 +1,132 @@
+import re
+import sys
+from typing import Annotated
+from urllib.parse import urlparse
+
+import typer
+
+from hirundo._env import API_HOST
+
+docs = "sphinx" in sys.modules
+hirundo_epilog = (
+    None
+    if docs
+    else "Made with ❤️ by Hirundo. Visit https://www.hirundo.io for more information."
+)
+
+app = typer.Typer(
+    name="hirundo", no_args_is_help=True, rich_markup_mode="rich", epilog=hirundo_epilog
+)
+
+
+def upsert_env(var_name: str, var_value: str):
+    """
+    Change an environment variable in the .env file.
+    If the variable does not exist, it will be added.
+
+    Args:
+        var_name: The name of the environment variable to change.
+        var_value: The new value of the environment variable.
+    """
+    dotenv = "./.env"
+    regex = re.compile(rf"^{var_name}=.*$")
+    with open(dotenv) as f:
+        lines = f.readlines()
+
+    with open(dotenv, "w") as f:
+        f.writelines(line for line in lines if not regex.search(line) and line != "\n")
+
+    with open(dotenv, "a") as f:
+        f.writelines(f"\n{var_name}={var_value}")
+
+
+def fix_api_host(api_host: str):
+    if not api_host.startswith("http") and not api_host.startswith("https"):
+        api_host = f"https://{api_host}"
+        print(
+            "API host must start with 'http://' or 'https://'. Automatically added 'https://'."
+        )
+    if (url := urlparse(api_host)) and url.path != "":
+        print("API host should not contain a path. Removing path.")
+        api_host = f"{url.scheme}://{url.hostname}"
+    return api_host
+
+
+@app.command("set-api-key", epilog=hirundo_epilog)
+def setup_api_key(
+    api_key: Annotated[
+        str,
+        typer.Option(
+            prompt="Please enter the API key value",
+            help=""
+            if docs
+            else f"Visit '{API_HOST}/api-key' to generate your API key.",
+        ),
+    ],
+):
+    """
+    Setup the API key for the Hirundo client library.
+    Values are saved to a .env file in the current directory for use by the library in requests.
+    """
+    upsert_env("API_KEY", api_key)
+    print("API key saved to .env for future use. Please do not share the .env file")
+
+
+@app.command("change-remote", epilog=hirundo_epilog)
+def change_api_remote(
+    api_host: Annotated[
+        str,  # TODO: Change to HttpUrl when https://github.com/tiangolo/typer/pull/723 is merged
+        typer.Option(
+            prompt="Please enter the API server address",
+            help=""
+            if docs
+            else f"Current API server address: '{API_HOST}'. This is the same address where you access the Hirundo web interface.",
+        ),
+    ],
+):
+    """
+    Change the API server address for the Hirundo client library.
+    This is the same address where you access the Hirundo web interface.
+    """
+    api_host = fix_api_host(api_host)
+
+    upsert_env("API_HOST", api_host)
+    print("API host saved to .env for future use. Please do not share this file")
+
+
+@app.command("setup", epilog=hirundo_epilog)
+def setup(
+    api_key: Annotated[
+        str,
+        typer.Option(
+            prompt="Please enter the API key value",
+            help=""
+            if docs
+            else f"Visit '{API_HOST}/api-key' to generate your API key.",
+        ),
+    ],
+    api_host: Annotated[
+        str,  # TODO: Change to HttpUrl as above
+        typer.Option(
+            prompt="Please enter the API server address",
+            help=""
+            if docs
+            else f"Current API server address: '{API_HOST}'. This is the same address where you access the Hirundo web interface.",
+        ),
+    ],
+):
+    """
+    Setup the Hirundo client library.
+    """
+    api_host = fix_api_host(api_host)
+    upsert_env("API_HOST", api_host)
+    upsert_env("API_KEY", api_key)
+    print(
+        "API host and API key saved to .env for future use. Please do not share this file"
+    )
+
+
+typer_click_object = typer.main.get_command(app)
+
+if __name__ == "__main__":
+    app()
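The commands can also be driven programmatically with Typer's test runner; a hedged sketch (key values are placeholders, and note two quirks visible in the code above: `hirundo._env` raises at import time if `API_KEY` is unset, and `upsert_env` reads `./.env` before rewriting it, so the file must already exist):

```python
import os
import pathlib

os.environ.setdefault("API_KEY", "placeholder")  # avoid the import-time ValueError in hirundo._env
pathlib.Path(".env").touch()                     # upsert_env expects ./.env to already exist

from typer.testing import CliRunner  # noqa: E402
from hirundo.cli import app  # noqa: E402

# Equivalent to running `hirundo set-api-key` in a shell
result = CliRunner().invoke(app, ["set-api-key", "--api-key", "YOUR-API-KEY"])
print(result.output)
```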
hirundo/dataset_optimization.py
ADDED
@@ -0,0 +1,383 @@
+import json
+import logging
+from collections.abc import AsyncGenerator, Generator
+from typing import Union
+
+import httpx
+import requests
+from pydantic import BaseModel, Field, model_validator
+
+from hirundo._env import API_HOST
+from hirundo._headers import auth_headers, json_headers
+from hirundo._iter_sse_retrying import aiter_sse_retrying, iter_sse_retrying
+from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
+from hirundo.enum import DatasetMetadataType, LabellingType
+from hirundo.storage import StorageIntegration, StorageLink
+
+logger = logging.getLogger(__name__)
+
+
+class HirundoError(Exception):
+    """
+    Custom exception used to indicate errors in `hirundo` dataset optimization runs
+    """
+
+    pass
+
+
+MAX_RETRIES = 200  # Max 200 retries for HTTP SSE connection
+
+
+class OptimizationDataset(BaseModel):
+    name: str
+    """
+    The name of the dataset. Used to identify it amongst the list of datasets
+    belonging to your organization in `hirundo`.
+    """
+    labelling_type: LabellingType
+    """
+    Indicates the labelling type of the dataset. The labelling type can be one of the following:
+    - `LabellingType.SingleLabelClassification`: Indicates that the dataset is for classification tasks
+    - `LabellingType.ObjectDetection`: Indicates that the dataset is for object detection tasks
+    """
+    dataset_storage: Union[StorageLink, None]
+    """
+    The storage link to the dataset. This can be a link to a file or a directory containing the dataset.
+    If `None`, the `dataset_id` field must be set.
+    """
+
+    classes: list[str]
+    """
+    A full list of possible classes used in classification / object detection.
+    It is currently required for clarity and performance.
+    """
+    dataset_metadata_path: str = "metadata.csv"
+    """
+    The path to the dataset metadata file within the storage integration, e.g. S3 Bucket / GCP Bucket / Azure Blob storage / Git repo.
+    Note: This path will be prefixed with the `StorageLink`'s `path`.
+    """
+    dataset_metadata_type: DatasetMetadataType = DatasetMetadataType.HirundoCSV
+    """
+    The type of dataset metadata file. The dataset metadata file can be one of the following:
+    - `DatasetMetadataType.HirundoCSV`: Indicates that the dataset metadata file is a CSV file with the Hirundo format
+
+    Currently no other formats are supported. Future versions of `hirundo` may support additional formats.
+    """
+
+    storage_integration_id: Union[int, None] = Field(default=None, init=False)
+    """
+    The ID of the storage integration used to store the dataset and metadata.
+    """
+    dataset_id: Union[int, None] = Field(default=None, init=False)
+    """
+    The ID of the dataset created on the server.
+    """
+    run_id: Union[str, None] = Field(default=None, init=False)
+    """
+    The ID of the Dataset Optimization run created on the server.
+    """
+
+    @model_validator(mode="after")
+    def validate_dataset(self):
+        if self.dataset_storage is None and self.storage_integration_id is None:
+            raise ValueError("No dataset storage has been provided")
+        return self
+
+    @staticmethod
+    def list(organization_id: Union[int, None] = None) -> list[dict]:
+        """
+        Lists all the `OptimizationDataset` instances created by the user's default organization
+        or the `organization_id` passed.
+        Note: The return type is `list[dict]` and not `list[OptimizationDataset]`
+
+        Args:
+            organization_id: The ID of the organization to list the datasets for.
+        """
+        response = requests.get(
+            f"{API_HOST}/dataset-optimization/dataset/",
+            params={"dataset_organization_id": organization_id},
+            headers=auth_headers,
+            timeout=READ_TIMEOUT,
+        )
+        response.raise_for_status()
+        return response.json()
+
+    @staticmethod
+    def delete_by_id(dataset_id: int) -> None:
+        """
+        Deletes an `OptimizationDataset` instance from the server by its ID
+
+        Args:
+            dataset_id: The ID of the `OptimizationDataset` instance to delete
+        """
+        response = requests.delete(
+            f"{API_HOST}/dataset-optimization/dataset/{dataset_id}",
+            headers=auth_headers,
+            timeout=MODIFY_TIMEOUT,
+        )
+        response.raise_for_status()
+
+    def delete(self, storage_integration=True) -> None:
+        """
+        Deletes the active `OptimizationDataset` instance from the server.
+        It can only be used on an `OptimizationDataset` instance that has been created.
+
+        Args:
+            storage_integration: If True, the `OptimizationDataset`'s `StorageIntegration` will also be deleted
+
+        Note: If `storage_integration` is not set to `False` then the `storage_integration_id` must be set.
+        This can either be set manually or by creating the `StorageIntegration` instance via the `OptimizationDataset`'s
+        `create` method
+        """
+        if storage_integration:
+            if not self.storage_integration_id:
+                raise ValueError("No storage integration has been created")
+            StorageIntegration.delete_by_id(self.storage_integration_id)
+        if not self.dataset_id:
+            raise ValueError("No dataset has been created")
+        self.delete_by_id(self.dataset_id)
+
+    def create(self) -> int:
+        """
+        Create an `OptimizationDataset` instance on the server.
+        If `storage_integration_id` is not set, it will be created.
+        """
+        if not self.dataset_storage:
+            raise ValueError("No dataset storage has been provided")
+        if (
+            self.dataset_storage
+            and self.dataset_storage.storage_integration
+            and not self.storage_integration_id
+        ):
+            self.storage_integration_id = (
+                self.dataset_storage.storage_integration.create()
+            )
+        model_dict = self.model_dump()
+        # ⬆️ Get dict of model fields from Pydantic model instance
+        dataset_response = requests.post(
+            f"{API_HOST}/dataset-optimization/dataset/",
+            json={
+                "dataset_storage": {
+                    "storage_integration_id": self.storage_integration_id,
+                    "path": self.dataset_storage.path,
+                },
+                **{k: model_dict[k] for k in model_dict.keys() - {"dataset_storage"}},
+            },
+            headers={
+                **json_headers,
+                **auth_headers,
+            },
+            timeout=MODIFY_TIMEOUT,
+        )
+        dataset_response.raise_for_status()
+        self.dataset_id = dataset_response.json()["id"]
+        if not self.dataset_id:
+            raise HirundoError("Failed to create the dataset")
+        return self.dataset_id
+
+    @staticmethod
+    def launch_optimization_run(dataset_id: int) -> str:
+        """
+        Run the dataset optimization process on the server using the dataset with the given ID,
+        i.e. `dataset_id`.
+
+        Args:
+            dataset_id: The ID of the dataset to run optimization on.
+
+        Returns:
+            ID of the run (`run_id`).
+        """
+        run_response = requests.post(
+            f"{API_HOST}/dataset-optimization/run/{dataset_id}",
+            headers=auth_headers,
+            timeout=MODIFY_TIMEOUT,
+        )
+        run_response.raise_for_status()
+        return run_response.json()["run_id"]
+
+    def run_optimization(self) -> str:
+        """
+        If the dataset was not created on the server yet, it is created.
+        Run the dataset optimization process on the server using the active `OptimizationDataset` instance
+
+        Returns:
+            An ID of the run (`run_id`) and stores that `run_id` on the instance
+        """
+        try:
+            if not self.dataset_id:
+                self.dataset_id = self.create()
+            run_id = self.launch_optimization_run(self.dataset_id)
+            self.run_id = run_id
+            return run_id
+        except requests.HTTPError as error:
+            try:
+                content = error.response.json()
+                logger.error(
+                    "HTTP Error! Status code: %s Content: %s",
+                    error.response.status_code,
+                    content,
+                )
+            except Exception:
+                content = error.response.text
+            raise HirundoError(
+                f"Failed to start the run. Status code: {error.response.status_code} Content: {content}"
+            ) from error
+        except Exception as error:
+            raise HirundoError(f"Failed to start the run: {error}") from error
+
+    def clean_ids(self):
+        """
+        Reset `dataset_id`, `storage_integration_id`, and `run_id` values on the instance to the default value of `None`
+        """
+        self.storage_integration_id = None
+        self.dataset_id = None
+        self.run_id = None
+
+    @staticmethod
+    def check_run_by_id(run_id: str, retry=0) -> Generator[dict, None, None]:
+        """
+        Check the status of a run given its ID
+
+        This generator will produce values to show progress of the run.
+
+        Args:
+            run_id: The `run_id` produced by a `run_optimization` call
+            retry: A number used to track the number of retries to limit re-checks. *Do not* provide this value manually.
+
+        Yields:
+            Each event will be a dict, where:
+            - `"state"` is PENDING, STARTED, RETRY, FAILURE or SUCCESS
+            - `"result"` is a string describing the progress as a percentage for a PENDING state,
+              or the error for a FAILURE state or the results for a SUCCESS state
+        """
+        if retry > MAX_RETRIES:
+            raise HirundoError("Max retries reached")
+        last_event = None
+        with httpx.Client(timeout=httpx.Timeout(None, connect=5.0)) as client:
+            for sse in iter_sse_retrying(
+                client,
+                "GET",
+                f"{API_HOST}/dataset-optimization/run/{run_id}",
+                headers=auth_headers,
+            ):
+                if sse.event == "ping":
+                    continue
+                logger.debug(
+                    "[SYNC] received event: %s with data: %s and ID: %s and retry: %s",
+                    sse.event,
+                    sse.data,
+                    sse.id,
+                    sse.retry,
+                )
+                last_event = json.loads(sse.data)
+                yield last_event["data"]
+        if not last_event or last_event["data"]["state"] == "PENDING":
+            # Re-check until a terminal state; `yield from` forwards the recursive generator's events
+            yield from OptimizationDataset.check_run_by_id(run_id, retry + 1)
+
+    def check_run(self) -> Generator[dict, None, None]:
+        """
+        Check the status of the current active instance's run.
+
+        This generator will produce values to show progress of the run.
+
+        Yields:
+            Each event will be a dict, where:
+            - `"state"` is PENDING, STARTED, RETRY, FAILURE or SUCCESS
+            - `"result"` is a string describing the progress as a percentage for a PENDING state, or the error for a FAILURE state or the results for a SUCCESS state
+        """
+        if not self.run_id:
+            raise ValueError("No run has been started")
+        return self.check_run_by_id(self.run_id)
+
+    @staticmethod
+    async def acheck_run_by_id(run_id: str, retry=0) -> AsyncGenerator[dict, None]:
+        """
+        Async version of :func:`check_run_by_id`
+
+        Check the status of a run given its ID.
+
+        This generator will produce values to show progress of the run.
+
+        Args:
+            run_id: The `run_id` produced by a `run_optimization` call
+            retry: A number used to track the number of retries to limit re-checks. *Do not* provide this value manually.
+
+        Yields:
+            Each event will be a dict, where:
+            - `"state"` is PENDING, STARTED, RETRY, FAILURE or SUCCESS
+            - `"result"` is a string describing the progress as a percentage for a PENDING state, or the error for a FAILURE state or the results for a SUCCESS state
+        """
+        if retry > MAX_RETRIES:
+            raise HirundoError("Max retries reached")
+        last_event = None
+        async with httpx.AsyncClient(
+            timeout=httpx.Timeout(None, connect=5.0)
+        ) as client:
+            async_iterator = await aiter_sse_retrying(
+                client,
+                "GET",
+                f"{API_HOST}/dataset-optimization/run/{run_id}",
+                headers=auth_headers,
+            )
+            async for sse in async_iterator:
+                if sse.event == "ping":
+                    continue
+                logger.debug(
+                    "[ASYNC] Received event: %s with data: %s and ID: %s and retry: %s",
+                    sse.event,
+                    sse.data,
+                    sse.id,
+                    sse.retry,
+                )
+                last_event = json.loads(sse.data)
+                yield last_event["data"]
+        if not last_event or last_event["data"]["state"] == "PENDING":
+            # Re-check until a terminal state; forward the recursive async generator's events
+            async for event in OptimizationDataset.acheck_run_by_id(run_id, retry + 1):
+                yield event
+
+    async def acheck_run(self) -> AsyncGenerator[dict, None]:
+        """
+        Async version of :func:`check_run`
+
+        Check the status of the current active instance's run.
+
+        This generator will produce values to show progress of the run.
+
+        Yields:
+            Each event will be a dict, where:
+            - `"state"` is PENDING, STARTED, RETRY, FAILURE or SUCCESS
+            - `"result"` is a string describing the progress as a percentage for a PENDING state, or the error for a FAILURE state or the results for a SUCCESS state
+        """
+        if not self.run_id:
+            raise ValueError("No run has been started")
+        async for iteration in self.acheck_run_by_id(self.run_id):
+            yield iteration
+
+    @staticmethod
+    def cancel_by_id(run_id: str) -> None:
+        """
+        Cancel the dataset optimization run for the given `run_id`.
+
+        Args:
+            run_id: The ID of the run to cancel
+        """
+        if not run_id:
+            raise ValueError("No run has been started")
+        response = requests.delete(
+            f"{API_HOST}/dataset-optimization/run/{run_id}",
+            headers=auth_headers,
+            timeout=MODIFY_TIMEOUT,
+        )
+        response.raise_for_status()
+
+    def cancel(self) -> None:
+        """
+        Cancel the current active instance's run.
+        """
+        if not self.run_id:
+            raise ValueError("No run has been started")
+        self.cancel_by_id(self.run_id)
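Taken together, the classes above support an end-to-end flow like the following hedged sketch against a live API (the bucket name, region, classes, and paths are placeholders; the calls are `StorageIntegration.create`, `OptimizationDataset.run_optimization`, and `check_run` as defined above):

```python
from hirundo import (
    LabellingType,
    OptimizationDataset,
    StorageIntegration,
    StorageLink,
    StorageS3,
    StorageTypes,
)

dataset = OptimizationDataset(
    name="my-dataset",
    labelling_type=LabellingType.SingleLabelClassification,
    dataset_storage=StorageLink(
        storage_integration=StorageIntegration(
            name="my-bucket",
            type=StorageTypes.S3,
            s3=StorageS3(bucket_url="s3://my-bucket", region_name="us-west-2"),
        ),
        path="/my-dataset-path",
    ),
    classes=["cat", "dog"],
)

run_id = dataset.run_optimization()  # creates the storage integration + dataset, then starts the run
for event in dataset.check_run():    # streams SSE progress events until a terminal state
    print(event["state"], event.get("result"))
```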
hirundo/enum.py
ADDED
@@ -0,0 +1,20 @@
+from enum import Enum
+
+
+class LabellingType(str, Enum):
+    """
+    Enum to indicate what type of labelling is used for the given dataset.
+    Supported types are:
+    """
+
+    SingleLabelClassification = "SingleLabelClassification"
+    ObjectDetection = "ObjectDetection"
+
+
+class DatasetMetadataType(str, Enum):
+    """
+    Enum to indicate what type of metadata is provided for the given dataset.
+    Supported types are:
+    """
+
+    HirundoCSV = "HirundoCSV"
hirundo/git.py
ADDED
@@ -0,0 +1,158 @@
+import logging
+import re
+from typing import Annotated, Union
+
+import pydantic
+import requests
+from pydantic import BaseModel, field_validator
+from pydantic_core import Url
+
+from hirundo._env import API_HOST
+from hirundo._headers import auth_headers, json_headers
+from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
+
+logger = logging.getLogger(__name__)
+
+
+class GitPlainAuthBase(BaseModel):
+    username: str
+    """
+    The username for the Git repository
+    """
+    password: str
+    """
+    The password for the Git repository
+    """
+
+
+class GitSSHAuthBase(BaseModel):
+    ssh_key: str
+    """
+    The SSH key for the Git repository
+    """
+    ssh_password: Union[str, None]
+    """
+    The password for the SSH key for the Git repository.
+    """
+
+
+class GitRepo(BaseModel):
+    id: Union[int, None] = None
+    """
+    The ID of the Git repository.
+    """
+
+    name: str
+    """
+    A name to identify the Git repository in the Hirundo system.
+    """
+    repository_url: Annotated[str, Url]
+    """
+    The URL of the Git repository; it should start with `ssh://` or `https://` or be in the form `user@host:path`.
+    If it is in the form `user@host:path`, it will be rewritten to `ssh://user@host:path`.
+    """
+    organization_id: Union[int, None] = None
+    """
+    The ID of the organization that the Git repository belongs to.
+    If not provided, it will be assigned to your default organization.
+    """
+
+    plain_auth: Union[GitPlainAuthBase, None] = pydantic.Field(
+        default=None, examples=[None, {"username": "ben", "password": "password"}]
+    )
+    """
+    The plain authentication details for the Git repository.
+    Use this if using a special user with a username and password for authentication.
+    """
+    ssh_auth: Union[GitSSHAuthBase, None] = pydantic.Field(
+        default=None,
+        examples=[
+            {
+                "ssh_key": "SOME_PRIVATE_SSH_KEY",
+                "ssh_password": "SOME_SSH_KEY_PASSWORD",
+            },
+            None,
+        ],
+    )
+    """
+    The SSH authentication details for the Git repository.
+    Use this if using an SSH key for authentication.
+    Optionally, you can provide a password for the SSH key.
+    """
+
+    @field_validator("repository_url", mode="before", check_fields=True)
+    @classmethod
+    def check_valid_repository_url(cls, repository_url: str):
+        # Check if the URL already has a protocol
+        if not re.match(r"^[a-z]+://", repository_url):
+            # Check if the URL has the `@` and `:` pattern with a non-numeric section before the next slash
+            match = re.match(r"([^@]+@[^:]+):([^0-9/][^/]*)/(.+)", repository_url)
+            if match:
+                user_host = match.group(1)
+                path = match.group(2) + "/" + match.group(3)
+                rewritten_url = f"ssh://{user_host}/{path}"
+                logger.info("Modified Git repo URL to add SSH protocol: %s", rewritten_url)
+                return rewritten_url
+        if not repository_url.startswith("ssh://") and not repository_url.startswith(
+            "https://"
+        ):
+            raise ValueError("Repository URL must start with 'ssh://' or 'https://'")
+        return repository_url
+
+    def create(self):
+        """
+        Create a Git repository in the Hirundo system.
+        """
+        git_repo = requests.post(
+            f"{API_HOST}/git-repo/",
+            json=self.model_dump(),
+            headers={
+                **json_headers,
+                **auth_headers,
+            },
+            timeout=MODIFY_TIMEOUT,
+        )
+        git_repo.raise_for_status()
+        git_repo_id = git_repo.json()["id"]
+        self.id = git_repo_id
+        return git_repo_id
+
+    @staticmethod
+    def list():
+        """
+        List all Git repositories in the Hirundo system.
+        """
+        git_repos = requests.get(
+            f"{API_HOST}/git-repo/",
+            headers={
+                **auth_headers,
+            },
+            timeout=READ_TIMEOUT,
+        )
+        git_repos.raise_for_status()
+        return git_repos.json()
+
+    @staticmethod
+    def delete_by_id(git_repo_id: int):
+        """
+        Delete a Git repository by its ID.
+
+        Args:
+            git_repo_id: The ID of the Git repository to delete
+        """
+        git_repo = requests.delete(
+            f"{API_HOST}/git-repo/{git_repo_id}",
+            headers={
+                **auth_headers,
+            },
+            timeout=MODIFY_TIMEOUT,
+        )
+        git_repo.raise_for_status()
+
+    def delete(self):
+        """
+        Delete the Git repository created by this instance.
+        """
+        if not self.id:
+            raise ValueError("No GitRepo has been created")
+        GitRepo.delete_by_id(self.id)
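A small sketch of the validator's URL rewriting (the repository name and URL are illustrative, and the environment must be configured as noted under `_env.py` since importing the package triggers the API-key check):

```python
from hirundo import GitRepo

repo = GitRepo(
    name="test-dataset",
    repository_url="git@github.com:Hirundo-io/test-dataset.git",  # scp-style URL
)
# The validator rewrites scp-style URLs to an explicit SSH protocol:
print(repo.repository_url)  # ssh://git@github.com/Hirundo-io/test-dataset.git
```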
hirundo/storage.py
ADDED
@@ -0,0 +1,260 @@
+import typing
+from enum import Enum
+from typing import Union
+
+import pydantic
+import requests
+from pydantic import BaseModel, model_validator
+from pydantic_core import Url
+
+from hirundo._constraints import S3BucketUrl, StorageIntegrationName
+from hirundo._env import API_HOST
+from hirundo._headers import auth_headers, json_headers
+from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
+from hirundo.git import GitRepo
+
+
+class StorageS3(BaseModel):
+    endpoint_url: Union[Url, None] = None
+    bucket_url: S3BucketUrl
+    region_name: str
+    # ⬆️ We could restrict this, but if we're allowing custom endpoints then the validation may be wrong
+    access_key_id: Union[str, None] = None
+    secret_access_key: Union[str, None] = None
+
+
+class StorageGCP(BaseModel):
+    bucket_name: str
+    project: str
+    credentials_json: Union[dict, None] = None
+
+
+# TODO: Azure storage integration is coming soon
+# class StorageAzure(BaseModel):
+#     container: str
+#     account_name: str
+#     account_key: str
+
+
+class StorageGit(BaseModel):
+    repo_id: Union[int, None] = None
+    """
+    The ID of the Git repository in the Hirundo system.
+    Either `repo_id` or `repo` must be provided.
+    """
+    repo: Union[GitRepo, None] = None
+    """
+    The Git repository to link to.
+    Either `repo_id` or `repo` must be provided.
+    """
+    branch: str
+    """
+    The branch of the Git repository to link to.
+    """
+
+    @model_validator(mode="after")
+    def validate_repo(self):
+        if self.repo_id is None and self.repo is None:
+            raise ValueError("Either repo_id or repo must be provided")
+        return self
+
+
+class StorageTypes(str, Enum):
+    """
+    Enum for the different types of storage integrations.
+    Supported types are:
+    """
+
+    S3 = "S3"
+    GCP = "GCP"
+    # AZURE = "Azure"  TODO: Azure storage integration is coming soon
+    GIT = "Git"
+
+
+class StorageIntegration(BaseModel):
+    id: Union[int, None] = None
+
+    organization_id: Union[int, None] = None
+    """
+    The ID of the organization that the `StorageIntegration` belongs to.
+    If not provided, it will be assigned to your default organization.
+    """
+
+    name: StorageIntegrationName
+    """
+    A name to identify the `StorageIntegration` in the Hirundo system.
+    """
+    type: StorageTypes = pydantic.Field(
+        examples=[
+            StorageTypes.S3,
+            StorageTypes.GCP,
+            # StorageTypes.AZURE,  TODO: Azure storage integration is coming soon
+            StorageTypes.GIT,
+        ]
+    )
+    """
+    The type of the `StorageIntegration`.
+    Supported types are:
+    - `S3`
+    - `GCP`
+    - `Azure` (coming soon)
+    - `Git`
+    """
+    s3: Union[StorageS3, None] = pydantic.Field(
+        default=None,
+        examples=[
+            {
+                "bucket_url": "s3://my-bucket",
+                "region_name": "us-west-2",
+                "access_key_id": "my-access-key",
+                "secret_access_key": "REDACTED",
+            },
+            None,
+            None,
+            None,
+        ],
+    )
+    """
+    The Amazon Web Services (AWS) S3 storage integration details.
+    Use this if you want to link to an S3 bucket.
+    """
+    gcp: Union[StorageGCP, None] = pydantic.Field(
+        default=None,
+        examples=[
+            None,
+            {
+                "bucket_name": "my-bucket",
+                "project": "my-project",
+                "credentials_json": {
+                    "type": "service_account",
+                    "project_id": "my-project",
+                    "private_key_id": "my-key-id",
+                    "private_key": "REDACTED",
+                    "client_email": "my-service-account@my-project.iam.gserviceaccount.com",
+                    "client_id": "my-id",
+                    "auth_uri": "https://accounts.google.com/o/oauth2/auth",
+                    "token_uri": "https://oauth2.googleapis.com/token",
+                    "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
+                    "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/my-service-account%40my-project.iam.gserviceaccount.com",
+                    "universe_domain": "googleapis.com",
+                },
+            },
+            None,
+            None,
+        ],
+    )
+    """
+    The Google Cloud (GCP) Storage integration details.
+    Use this if you want to link to a GCS bucket.
+    """
+    azure: None = None
+    # azure: Union[StorageAzure, None] = pydantic.Field(
+    #     default=None,
+    #     examples=[
+    #         None,
+    #         None,
+    #         {
+    #             "container": "my-container",
+    #             "account_name": "my-account-name",
+    #             "account_key": "my-account",
+    #         },
+    #         None,
+    #     ],
+    # )  TODO: Azure storage integration is coming soon
+    git: Union[StorageGit, None] = pydantic.Field(
+        default=None,
+        examples=[
+            None,
+            None,
+            None,
+            {
+                "repo_id": "my-repo-id",
+                "repo": {
+                    "name": "test-dataset",
+                    "repository_url": "https://github.com/Hirundo-io/test-dataset.git",
+                },
+                "branch": "main",
+                "path": "/my-path/to/dataset",
+            },
+        ],
+    )
+    """
+    The Git storage integration details.
+    Use this if you want to link to a Git repository.
+    """
+
+    @staticmethod
+    def list(organization_id: typing.Union[int, None] = None) -> list[dict]:
+        """
+        Lists all the `StorageIntegration`s created by the user's default organization
+        Note: The return type is `list[dict]` and not `list[StorageIntegration]`
+
+        Args:
+            organization_id: The ID of the organization to list `StorageIntegration`s for.
+            If not provided, it will list `StorageIntegration`s for the default organization.
+        """
+        storage_integrations = requests.get(
+            f"{API_HOST}/storage-integration/",
+            params={"storage_integration_organization_id": organization_id},
+            headers=auth_headers,
+            timeout=READ_TIMEOUT,
+        )
+        storage_integrations.raise_for_status()
+        return storage_integrations.json()
+
+    @staticmethod
+    def delete_by_id(storage_integration_id) -> None:
+        """
+        Deletes a `StorageIntegration` instance from the server by its ID
+
+        Args:
+            storage_integration_id: The ID of the `StorageIntegration` to delete
+        """
+        storage_integration = requests.delete(
+            f"{API_HOST}/storage-integration/{storage_integration_id}",
+            headers=auth_headers,
+            timeout=MODIFY_TIMEOUT,
+        )
+        storage_integration.raise_for_status()
+
+    def delete(self) -> None:
+        """
+        Deletes the `StorageIntegration` instance from the server
+        """
+        if not self.id:
+            raise ValueError("No StorageIntegration has been created")
+        self.delete_by_id(self.id)
+
+    def create(self) -> int:
+        """
+        Create a `StorageIntegration` instance on the server
+        """
+        if self.git and self.git.repo:
+            self.git.repo_id = self.git.repo.create()
+        storage_integration = requests.post(
+            f"{API_HOST}/storage-integration/",
+            json=self.model_dump(),
+            headers={
+                **json_headers,
+                **auth_headers,
+            },
+            timeout=MODIFY_TIMEOUT,
+        )
+        storage_integration.raise_for_status()
+        storage_integration_id = storage_integration.json()["id"]
+        self.id = storage_integration_id
+        return storage_integration_id
+
+
+class StorageLink(BaseModel):
+    storage_integration: StorageIntegration
+    """
+    The `StorageIntegration` instance to link to.
+    """
+    path: str = "/"
+    """
+    Path for the `root` to link to within the `StorageIntegration` instance,
+    e.g. a prefix path/folder within an S3 Bucket / GCP Bucket / Azure Blob storage / Git repo.
+
+    Note: Only files in this path will be retrieved and it will be used as the root for paths in the CSV.
+    """
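A hedged sketch of a Git-backed integration tying `GitRepo`, `StorageGit`, and `StorageLink` together (the names, URL, and path are illustrative):

```python
from hirundo import GitRepo, StorageGit, StorageIntegration, StorageLink, StorageTypes

git_storage = StorageIntegration(
    name="test-dataset",
    type=StorageTypes.GIT,
    git=StorageGit(
        repo=GitRepo(
            name="test-dataset",
            repository_url="https://github.com/Hirundo-io/test-dataset.git",
        ),
        branch="main",
    ),
)
link = StorageLink(storage_integration=git_storage, path="/my-path/to/dataset")
# git_storage.create() would first create the GitRepo, then POST the integration
# to f"{API_HOST}/storage-integration/" and return its server-assigned ID.
```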
hirundo-0.1.3.dist-info/LICENSE
ADDED
@@ -0,0 +1,9 @@
+MIT License
+
+Copyright (c) 2024, Hirundo
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
hirundo-0.1.3.dist-info/METADATA
ADDED
@@ -0,0 +1,114 @@
+Metadata-Version: 2.1
+Name: hirundo
+Version: 0.1.3
+Summary: This package is used to interface with Hirundo's platform. It provides a simple API to optimize your ML datasets.
+Author-email: Hirundo <dev@hirundo.io>
+License: MIT License
+
+Copyright (c) 2024, Hirundo
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+Project-URL: Homepage, https://github.com/Hirundo-io/hirundo-client
+Keywords: dataset,machine learning,data science,data engineering
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 3
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: pyyaml >=6.0.1
+Requires-Dist: types-PyYAML >=6.0.12
+Requires-Dist: pydantic >=2.7.1
+Requires-Dist: twine >=5.0.0
+Requires-Dist: python-dotenv >=1.0.1
+Requires-Dist: types-requests >=2.31.0
+Requires-Dist: typer >=0.12.3
+Requires-Dist: httpx >=0.27.0
+Requires-Dist: stamina >=24.2.0
+Requires-Dist: httpx-sse >=0.4.0
+Provides-Extra: dev
+Requires-Dist: pyyaml >=6.0.1 ; extra == 'dev'
+Requires-Dist: types-PyYAML >=6.0.12 ; extra == 'dev'
+Requires-Dist: pydantic >=2.7.1 ; extra == 'dev'
+Requires-Dist: twine >=5.0.0 ; extra == 'dev'
+Requires-Dist: python-dotenv >=1.0.1 ; extra == 'dev'
+Requires-Dist: types-requests >=2.31.0 ; extra == 'dev'
+Requires-Dist: types-setuptools >=69.5.0 ; extra == 'dev'
+Requires-Dist: typer >=0.12.3 ; extra == 'dev'
+Requires-Dist: httpx >=0.27.0 ; extra == 'dev'
+Requires-Dist: stamina >=24.2.0 ; extra == 'dev'
+Requires-Dist: httpx-sse >=0.4.0 ; extra == 'dev'
+Requires-Dist: pytest >=8.2.0 ; extra == 'dev'
+Requires-Dist: pytest-asyncio >=0.23.6 ; extra == 'dev'
+Requires-Dist: uv ; extra == 'dev'
+Requires-Dist: pre-commit >=3.7.1 ; extra == 'dev'
+Requires-Dist: ruff ; extra == 'dev'
+Requires-Dist: bumpver ; extra == 'dev'
+Provides-Extra: docs
+Requires-Dist: sphinx >=7.4.7 ; extra == 'docs'
+Requires-Dist: sphinx-autobuild >=2024.4.16 ; extra == 'docs'
+Requires-Dist: sphinx-click >=5.0.1 ; extra == 'docs'
+Requires-Dist: autodoc-pydantic >=2.2.0 ; extra == 'docs'
+Requires-Dist: furo ; extra == 'docs'
+Requires-Dist: sphinx-multiversion ; extra == 'docs'
+
+# Hirundo client
+
+This repo contains the source code for the Hirundo client library.
+
+## Usage:
+
+To learn about how to use this library, please visit the [documentation](http://docs.hirundo.io/) or see the Google Colab examples.
+
+## Development:
+
+### Install dev dependencies
+
+```bash
+pip install -r dev-requirements.txt
+```
+
+Note: You can install and use `uv` as a faster drop-in replacement for `pip`. We have it as part of our dev dependencies for this reason.
+
+### Install `git` hooks (optional)
+
+```bash
+pre-commit install
+```
+
+### Check lint and apply formatting with Ruff (optional; pre-commit hooks run this automatically)
+
+```bash
+ruff check
+ruff format
+```
+
+### Change packages
+
+#### Update `requirements.txt` files
+
+```bash
+uv pip compile pyproject.toml
+uv pip compile --extra dev -o dev-requirements.txt -c requirements.txt pyproject.toml
+uv pip compile --extra docs -o docs-requirements.txt -c requirements.txt pyproject.toml
+```
+
+#### Sync installed packages
+
+```bash
+uv pip sync dev-requirements.txt
+```
+
+### Build process
+
+To build the package, run:
+`python -m build`
+
+### Publish documentation & releases
+
+Documentation & releases are published via GitHub Actions on merges to `main`.
hirundo-0.1.3.dist-info/RECORD
ADDED
@@ -0,0 +1,18 @@
+hirundo/__init__.py,sha256=hkVyX9WRcnGZ8xis2hrGBD9WGeaYW0CAvAHLzGG1LbU,707
+hirundo/__main__.py,sha256=wcCrL4PjG51r5wVKqJhcoJPTLfHW0wNbD31DrUN0MWI,28
+hirundo/_constraints.py,sha256=-RAUV9GnCsaT9pLGSqYglKOeK0joPBBexGTo87j5nkI,425
+hirundo/_env.py,sha256=aObkRVLo9NBZiByd2FcoLrk3m8tnswuYzP4Tnj3EE-o,268
+hirundo/_headers.py,sha256=htxHRjtD91C5D0svyk-zqhKV9LwQCEZauIa4ZTAfe5k,188
+hirundo/_iter_sse_retrying.py,sha256=WLp_lw8ycBuAxoJkkGBu4y74Ajhcu11r1X-vd5_571A,3352
+hirundo/_timeouts.py,sha256=IfX8-mrLp809-A_xSLv1DhIqZnO-Qvy4FcTtOtvqLog,42
+hirundo/cli.py,sha256=qj1Txt6lOU3V10SLtzH4uEWJ4DdkdOIEQaKn8wiJMss,3922
+hirundo/dataset_optimization.py,sha256=CLo9eclW_trDwzfr6uZlJ8JQb6XpWcKtACqSTaAF_fo,14583
+hirundo/enum.py,sha256=-3w09g-_yRYIMiM8VA_Nb07WoQXf5IjyERTGonzNDs0,457
+hirundo/git.py,sha256=GtowxPL78KleVhSY3QISu7-cUPrFbWC4YWBAuzuzryw,4731
+hirundo/storage.py,sha256=CxRdSnZGf4mtzNV2Ge_hwowd9pDP7NT9-xvWTbl187M,8185
+hirundo-0.1.3.dist-info/LICENSE,sha256=fusGGjqT2RGlU6kbkaOk7d-gDnsjk17wq67AO0mwBZI,1065
+hirundo-0.1.3.dist-info/METADATA,sha256=Y_HZhL5-rlUW1KZlymYNqM2M62k_gKxewAJnH0tkB3M,4428
+hirundo-0.1.3.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+hirundo-0.1.3.dist-info/entry_points.txt,sha256=4ZtnA_Nl1Af8fLnHp3lwjbGDEGU1S6ujb_JwtuQ7ZPM,44
+hirundo-0.1.3.dist-info/top_level.txt,sha256=cmyNqrNZOAYxnywJGFI1AJBLe4SkH8HGsfFx6ncdrbI,8
+hirundo-0.1.3.dist-info/RECORD,,
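Each RECORD entry pairs a path with an urlsafe-base64 SHA-256 digest (padding stripped) and a byte size, per the wheel RECORD convention. A sketch of how such a line is derived:

```python
import base64
import hashlib
from pathlib import Path

def record_entry(path: str) -> str:
    # Hash the file bytes, encode urlsafe base64 without '=' padding, append the size.
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=").decode()
    return f"{path},sha256={digest},{len(data)}"

print(record_entry("hirundo/__init__.py"))  # e.g. hirundo/__init__.py,sha256=...,707
```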
hirundo-0.1.3.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
+hirundo