hirundo 0.1.8__py3-none-any.whl → 0.1.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hirundo/__init__.py +28 -13
- hirundo/_constraints.py +34 -2
- hirundo/_dataframe.py +43 -0
- hirundo/_env.py +2 -2
- hirundo/_headers.py +18 -2
- hirundo/_http.py +7 -2
- hirundo/_iter_sse_retrying.py +61 -17
- hirundo/_timeouts.py +1 -0
- hirundo/cli.py +52 -0
- hirundo/dataset_enum.py +23 -0
- hirundo/dataset_optimization.py +427 -164
- hirundo/dataset_optimization_results.py +42 -0
- hirundo/git.py +93 -35
- hirundo/storage.py +236 -68
- hirundo/unzip.py +247 -0
- {hirundo-0.1.8.dist-info → hirundo-0.1.16.dist-info}/METADATA +84 -44
- hirundo-0.1.16.dist-info/RECORD +23 -0
- {hirundo-0.1.8.dist-info → hirundo-0.1.16.dist-info}/WHEEL +1 -1
- hirundo/enum.py +0 -20
- hirundo-0.1.8.dist-info/RECORD +0 -20
- {hirundo-0.1.8.dist-info → hirundo-0.1.16.dist-info}/entry_points.txt +0 -0
- {hirundo-0.1.8.dist-info → hirundo-0.1.16.dist-info/licenses}/LICENSE +0 -0
- {hirundo-0.1.8.dist-info → hirundo-0.1.16.dist-info}/top_level.txt +0 -0
hirundo/__init__.py
CHANGED
|
@@ -1,35 +1,50 @@
|
|
|
1
|
+
from .dataset_enum import (
|
|
2
|
+
DatasetMetadataType,
|
|
3
|
+
LabelingType,
|
|
4
|
+
)
|
|
1
5
|
from .dataset_optimization import (
|
|
6
|
+
COCO,
|
|
7
|
+
YOLO,
|
|
8
|
+
HirundoCSV,
|
|
2
9
|
HirundoError,
|
|
3
10
|
OptimizationDataset,
|
|
11
|
+
RunArgs,
|
|
12
|
+
VisionRunArgs,
|
|
4
13
|
)
|
|
5
|
-
from .
|
|
6
|
-
|
|
7
|
-
LabellingType,
|
|
8
|
-
)
|
|
9
|
-
from .git import GitRepo
|
|
14
|
+
from .dataset_optimization_results import DatasetOptimizationResults
|
|
15
|
+
from .git import GitPlainAuth, GitRepo, GitSSHAuth
|
|
10
16
|
from .storage import (
|
|
17
|
+
StorageConfig,
|
|
11
18
|
StorageGCP,
|
|
12
|
-
# StorageAzure, TODO: Azure storage
|
|
19
|
+
# StorageAzure, TODO: Azure storage is coming soon
|
|
13
20
|
StorageGit,
|
|
14
|
-
StorageIntegration,
|
|
15
|
-
StorageLink,
|
|
16
21
|
StorageS3,
|
|
17
22
|
StorageTypes,
|
|
18
23
|
)
|
|
24
|
+
from .unzip import load_df, load_from_zip
|
|
19
25
|
|
|
20
26
|
__all__ = [
|
|
27
|
+
"COCO",
|
|
28
|
+
"YOLO",
|
|
29
|
+
"HirundoCSV",
|
|
21
30
|
"HirundoError",
|
|
22
31
|
"OptimizationDataset",
|
|
23
|
-
"
|
|
32
|
+
"RunArgs",
|
|
33
|
+
"VisionRunArgs",
|
|
34
|
+
"LabelingType",
|
|
24
35
|
"DatasetMetadataType",
|
|
36
|
+
"GitPlainAuth",
|
|
25
37
|
"GitRepo",
|
|
26
|
-
"
|
|
38
|
+
"GitSSHAuth",
|
|
27
39
|
"StorageTypes",
|
|
28
40
|
"StorageS3",
|
|
29
41
|
"StorageGCP",
|
|
30
|
-
# "StorageAzure", TODO: Azure storage
|
|
42
|
+
# "StorageAzure", TODO: Azure storage is coming soon
|
|
31
43
|
"StorageGit",
|
|
32
|
-
"
|
|
44
|
+
"StorageConfig",
|
|
45
|
+
"DatasetOptimizationResults",
|
|
46
|
+
"load_df",
|
|
47
|
+
"load_from_zip",
|
|
33
48
|
]
|
|
34
49
|
|
|
35
|
-
__version__ = "0.1.
|
|
50
|
+
__version__ = "0.1.16"
|
hirundo/_constraints.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from typing import Annotated
|
|
2
2
|
|
|
3
|
-
from pydantic import StringConstraints
|
|
3
|
+
from pydantic import StringConstraints, UrlConstraints
|
|
4
|
+
from pydantic_core import Url
|
|
4
5
|
|
|
5
6
|
S3BucketUrl = Annotated[
|
|
6
7
|
str,
|
|
@@ -11,7 +12,7 @@ S3BucketUrl = Annotated[
|
|
|
11
12
|
),
|
|
12
13
|
]
|
|
13
14
|
|
|
14
|
-
|
|
15
|
+
StorageConfigName = Annotated[
|
|
15
16
|
str,
|
|
16
17
|
StringConstraints(
|
|
17
18
|
min_length=1,
|
|
@@ -19,3 +20,34 @@ StorageIntegrationName = Annotated[
|
|
|
19
20
|
pattern=r"^[a-zA-Z0-9-_]+$",
|
|
20
21
|
),
|
|
21
22
|
]
|
|
23
|
+
|
|
24
|
+
S3_MIN_LENGTH = 8
|
|
25
|
+
S3_MAX_LENGTH = 1023
|
|
26
|
+
S3_PATTERN = r"s3://[a-zA-Z0-9.-]{3,64}/[a-zA-Z0-9.-/]+"
|
|
27
|
+
GCP_MIN_LENGTH = 8
|
|
28
|
+
GCP_MAX_LENGTH = 1023
|
|
29
|
+
GCP_PATTERN = r"gs://[a-zA-Z0-9.-]{3,64}/[a-zA-Z0-9.-/]+"
|
|
30
|
+
|
|
31
|
+
RepoUrl = Annotated[
|
|
32
|
+
Url,
|
|
33
|
+
UrlConstraints(
|
|
34
|
+
allowed_schemes=[
|
|
35
|
+
"ssh",
|
|
36
|
+
"https",
|
|
37
|
+
"http",
|
|
38
|
+
]
|
|
39
|
+
),
|
|
40
|
+
]
|
|
41
|
+
HirundoUrl = Annotated[
|
|
42
|
+
Url,
|
|
43
|
+
UrlConstraints(
|
|
44
|
+
allowed_schemes=[
|
|
45
|
+
"file",
|
|
46
|
+
"https",
|
|
47
|
+
"http",
|
|
48
|
+
"s3",
|
|
49
|
+
"gs",
|
|
50
|
+
"ssh",
|
|
51
|
+
]
|
|
52
|
+
),
|
|
53
|
+
]
|
hirundo/_dataframe.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
has_pandas = False
|
|
2
|
+
has_polars = False
|
|
3
|
+
|
|
4
|
+
pd = None
|
|
5
|
+
pl = None
|
|
6
|
+
int32 = type[None]
|
|
7
|
+
float32 = type[None]
|
|
8
|
+
string = type[None]
|
|
9
|
+
# ⬆️ These are just placeholders for the int32, float32 and string types
|
|
10
|
+
# for when neither pandas nor polars is available
|
|
11
|
+
|
|
12
|
+
try:
|
|
13
|
+
import numpy as np
|
|
14
|
+
import pandas as pd
|
|
15
|
+
|
|
16
|
+
has_pandas = True
|
|
17
|
+
int32 = np.int32
|
|
18
|
+
float32 = np.float32
|
|
19
|
+
string = str
|
|
20
|
+
except ImportError:
|
|
21
|
+
pass
|
|
22
|
+
|
|
23
|
+
try:
|
|
24
|
+
import polars as pl
|
|
25
|
+
import polars.datatypes as pl_datatypes
|
|
26
|
+
|
|
27
|
+
has_polars = True
|
|
28
|
+
int32 = pl_datatypes.Int32
|
|
29
|
+
float32 = pl_datatypes.Float32
|
|
30
|
+
string = pl_datatypes.String
|
|
31
|
+
except ImportError:
|
|
32
|
+
pass
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
__all__ = [
|
|
36
|
+
"has_polars",
|
|
37
|
+
"has_pandas",
|
|
38
|
+
"pd",
|
|
39
|
+
"pl",
|
|
40
|
+
"int32",
|
|
41
|
+
"float32",
|
|
42
|
+
"string",
|
|
43
|
+
]
|
hirundo/_env.py
CHANGED
|
@@ -2,11 +2,11 @@ import enum
|
|
|
2
2
|
import os
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
|
|
5
|
-
from dotenv import load_dotenv
|
|
5
|
+
from dotenv import find_dotenv, load_dotenv
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
class EnvLocation(enum.Enum):
|
|
9
|
-
DOTENV =
|
|
9
|
+
DOTENV = find_dotenv(".env")
|
|
10
10
|
HOME = Path.home() / ".hirundo.conf"
|
|
11
11
|
|
|
12
12
|
|
hirundo/_headers.py
CHANGED
|
@@ -1,13 +1,29 @@
|
|
|
1
1
|
from hirundo._env import API_KEY, check_api_key
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
HIRUNDO_API_VERSION = "0.2"
|
|
4
|
+
|
|
5
|
+
_json_headers = {
|
|
4
6
|
"Content-Type": "application/json",
|
|
5
7
|
"Accept": "application/json",
|
|
6
8
|
}
|
|
7
9
|
|
|
8
10
|
|
|
9
|
-
def
|
|
11
|
+
def _get_auth_headers():
|
|
10
12
|
check_api_key()
|
|
11
13
|
return {
|
|
12
14
|
"Authorization": f"Bearer {API_KEY}",
|
|
13
15
|
}
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _get_api_version_header():
|
|
19
|
+
return {
|
|
20
|
+
"HIRUNDO-API-VERSION": HIRUNDO_API_VERSION,
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def get_headers():
|
|
25
|
+
return {
|
|
26
|
+
**_json_headers,
|
|
27
|
+
**_get_auth_headers(),
|
|
28
|
+
**_get_api_version_header(),
|
|
29
|
+
}
|
hirundo/_http.py
CHANGED
|
@@ -4,11 +4,16 @@ import hirundo.logger
|
|
|
4
4
|
|
|
5
5
|
logger = hirundo.logger.get_logger(__name__)
|
|
6
6
|
|
|
7
|
+
MINIMUM_CLIENT_SERVER_ERROR_CODE = 400
|
|
8
|
+
|
|
7
9
|
|
|
8
10
|
def raise_for_status_with_reason(response: Response):
|
|
9
11
|
try:
|
|
10
|
-
response.
|
|
12
|
+
if response.status_code >= MINIMUM_CLIENT_SERVER_ERROR_CODE:
|
|
13
|
+
response.reason = response.json().get("reason", None)
|
|
14
|
+
if response.reason is None:
|
|
15
|
+
response.reason = response.json().get("detail", None)
|
|
11
16
|
except Exception as e:
|
|
12
|
-
logger.debug("
|
|
17
|
+
logger.debug("Could not parse response as JSON: %s", e)
|
|
13
18
|
|
|
14
19
|
response.raise_for_status()
|
hirundo/_iter_sse_retrying.py
CHANGED
|
@@ -1,12 +1,20 @@
|
|
|
1
1
|
import asyncio
|
|
2
2
|
import time
|
|
3
3
|
import typing
|
|
4
|
+
import uuid
|
|
4
5
|
from collections.abc import AsyncGenerator, Generator
|
|
5
6
|
|
|
6
7
|
import httpx
|
|
7
|
-
|
|
8
|
+
import requests
|
|
9
|
+
import urllib3
|
|
10
|
+
from httpx_sse import ServerSentEvent, SSEError, aconnect_sse, connect_sse
|
|
8
11
|
from stamina import retry
|
|
9
12
|
|
|
13
|
+
from hirundo._timeouts import READ_TIMEOUT
|
|
14
|
+
from hirundo.logger import get_logger
|
|
15
|
+
|
|
16
|
+
logger = get_logger(__name__)
|
|
17
|
+
|
|
10
18
|
|
|
11
19
|
# Credit: https://github.com/florimondmanca/httpx-sse/blob/master/README.md#handling-reconnections
|
|
12
20
|
def iter_sse_retrying(
|
|
@@ -28,7 +36,13 @@ def iter_sse_retrying(
|
|
|
28
36
|
# This may happen when the server is overloaded and closes the connection or
|
|
29
37
|
# when Kubernetes restarts / replaces a pod.
|
|
30
38
|
# Likewise, this will likely be temporary, hence the retries.
|
|
31
|
-
@retry(
|
|
39
|
+
@retry(
|
|
40
|
+
on=(
|
|
41
|
+
httpx.ReadError,
|
|
42
|
+
httpx.RemoteProtocolError,
|
|
43
|
+
urllib3.exceptions.ReadTimeoutError,
|
|
44
|
+
)
|
|
45
|
+
)
|
|
32
46
|
def _iter_sse():
|
|
33
47
|
nonlocal last_event_id, reconnection_delay
|
|
34
48
|
|
|
@@ -44,13 +58,27 @@ def iter_sse_retrying(
|
|
|
44
58
|
connect_headers["Last-Event-ID"] = last_event_id
|
|
45
59
|
|
|
46
60
|
with connect_sse(client, method, url, headers=connect_headers) as event_source:
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
61
|
+
try:
|
|
62
|
+
for sse in event_source.iter_sse():
|
|
63
|
+
last_event_id = sse.id
|
|
64
|
+
|
|
65
|
+
if sse.retry is not None:
|
|
66
|
+
reconnection_delay = sse.retry / 1000
|
|
67
|
+
|
|
68
|
+
yield sse
|
|
69
|
+
except SSEError:
|
|
70
|
+
logger.error("SSE error occurred. Trying regular request")
|
|
71
|
+
response = requests.get(
|
|
72
|
+
url,
|
|
73
|
+
headers=connect_headers,
|
|
74
|
+
timeout=READ_TIMEOUT,
|
|
75
|
+
)
|
|
76
|
+
yield ServerSentEvent(
|
|
77
|
+
event="",
|
|
78
|
+
data=response.text,
|
|
79
|
+
id=uuid.uuid4().hex,
|
|
80
|
+
retry=None,
|
|
81
|
+
)
|
|
54
82
|
|
|
55
83
|
return _iter_sse()
|
|
56
84
|
|
|
@@ -72,7 +100,13 @@ async def aiter_sse_retrying(
|
|
|
72
100
|
# This may happen when the server is overloaded and closes the connection or
|
|
73
101
|
# when Kubernetes restarts / replaces a pod.
|
|
74
102
|
# Likewise, this will likely be temporary, hence the retries.
|
|
75
|
-
@retry(
|
|
103
|
+
@retry(
|
|
104
|
+
on=(
|
|
105
|
+
httpx.ReadError,
|
|
106
|
+
httpx.RemoteProtocolError,
|
|
107
|
+
urllib3.exceptions.ReadTimeoutError,
|
|
108
|
+
)
|
|
109
|
+
)
|
|
76
110
|
async def _iter_sse() -> AsyncGenerator[ServerSentEvent, None]:
|
|
77
111
|
nonlocal last_event_id, reconnection_delay
|
|
78
112
|
|
|
@@ -86,12 +120,22 @@ async def aiter_sse_retrying(
|
|
|
86
120
|
async with aconnect_sse(
|
|
87
121
|
client, method, url, headers=connect_headers
|
|
88
122
|
) as event_source:
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
123
|
+
try:
|
|
124
|
+
async for sse in event_source.aiter_sse():
|
|
125
|
+
last_event_id = sse.id
|
|
126
|
+
|
|
127
|
+
if sse.retry is not None:
|
|
128
|
+
reconnection_delay = sse.retry / 1000
|
|
129
|
+
|
|
130
|
+
yield sse
|
|
131
|
+
except SSEError:
|
|
132
|
+
logger.error("SSE error occurred. Trying regular request")
|
|
133
|
+
response = await client.get(url, headers=connect_headers)
|
|
134
|
+
yield ServerSentEvent(
|
|
135
|
+
event="",
|
|
136
|
+
data=response.text,
|
|
137
|
+
id=uuid.uuid4().hex,
|
|
138
|
+
retry=None,
|
|
139
|
+
)
|
|
96
140
|
|
|
97
141
|
return _iter_sse()
|
hirundo/_timeouts.py
CHANGED
hirundo/cli.py
CHANGED
|
@@ -7,6 +7,8 @@ from typing import Annotated
|
|
|
7
7
|
from urllib.parse import urlparse
|
|
8
8
|
|
|
9
9
|
import typer
|
|
10
|
+
from rich.console import Console
|
|
11
|
+
from rich.table import Table
|
|
10
12
|
|
|
11
13
|
from hirundo._env import API_HOST, EnvLocation
|
|
12
14
|
|
|
@@ -189,6 +191,56 @@ def setup(
|
|
|
189
191
|
)
|
|
190
192
|
|
|
191
193
|
|
|
194
|
+
@app.command("check-run", epilog=hirundo_epilog)
|
|
195
|
+
def check_run(
|
|
196
|
+
run_id: str,
|
|
197
|
+
):
|
|
198
|
+
"""
|
|
199
|
+
Check the status of a run.
|
|
200
|
+
"""
|
|
201
|
+
from hirundo.dataset_optimization import OptimizationDataset
|
|
202
|
+
|
|
203
|
+
results = OptimizationDataset.check_run_by_id(run_id)
|
|
204
|
+
print(f"Run results saved to {results.cached_zip_path}")
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
@app.command("list-runs", epilog=hirundo_epilog)
|
|
208
|
+
def list_runs():
|
|
209
|
+
"""
|
|
210
|
+
List all runs available.
|
|
211
|
+
"""
|
|
212
|
+
from hirundo.dataset_optimization import OptimizationDataset
|
|
213
|
+
|
|
214
|
+
runs = OptimizationDataset.list_runs()
|
|
215
|
+
|
|
216
|
+
console = Console()
|
|
217
|
+
table = Table(
|
|
218
|
+
title="Runs:",
|
|
219
|
+
expand=True,
|
|
220
|
+
)
|
|
221
|
+
cols = (
|
|
222
|
+
"Dataset name",
|
|
223
|
+
"Run ID",
|
|
224
|
+
"Status",
|
|
225
|
+
"Created At",
|
|
226
|
+
"Run Args",
|
|
227
|
+
)
|
|
228
|
+
for col in cols:
|
|
229
|
+
table.add_column(
|
|
230
|
+
col,
|
|
231
|
+
overflow="fold",
|
|
232
|
+
)
|
|
233
|
+
for run in runs:
|
|
234
|
+
table.add_row(
|
|
235
|
+
str(run.name),
|
|
236
|
+
str(run.id),
|
|
237
|
+
str(run.status),
|
|
238
|
+
run.created_at.isoformat(),
|
|
239
|
+
run.run_args.model_dump_json() if run.run_args else None,
|
|
240
|
+
)
|
|
241
|
+
console.print(table)
|
|
242
|
+
|
|
243
|
+
|
|
192
244
|
typer_click_object = typer.main.get_command(app)
|
|
193
245
|
|
|
194
246
|
if __name__ == "__main__":
|
hirundo/dataset_enum.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from enum import Enum
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class LabelingType(str, Enum):
|
|
5
|
+
"""
|
|
6
|
+
Enum indicating what type of labeling is used for the given dataset.
|
|
7
|
+
Supported types are:
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
SINGLE_LABEL_CLASSIFICATION = "SingleLabelClassification"
|
|
11
|
+
OBJECT_DETECTION = "ObjectDetection"
|
|
12
|
+
SPEECH_TO_TEXT = "SpeechToText"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class DatasetMetadataType(str, Enum):
|
|
16
|
+
"""
|
|
17
|
+
Enum indicating what type of metadata is provided for the given dataset.
|
|
18
|
+
Supported types are:
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
HIRUNDO_CSV = "HirundoCSV"
|
|
22
|
+
COCO = "COCO"
|
|
23
|
+
YOLO = "YOLO"
|