hirundo 0.1.9__py3-none-any.whl → 0.1.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hirundo/__init__.py +13 -6
- hirundo/_dataframe.py +43 -0
- hirundo/_env.py +2 -2
- hirundo/_headers.py +18 -2
- hirundo/_timeouts.py +1 -0
- hirundo/cli.py +52 -0
- hirundo/dataset_optimization.py +31 -106
- hirundo/dataset_optimization_results.py +42 -0
- hirundo/git.py +11 -18
- hirundo/storage.py +13 -16
- hirundo/unzip.py +247 -0
- {hirundo-0.1.9.dist-info → hirundo-0.1.16.dist-info}/METADATA +9 -5
- hirundo-0.1.16.dist-info/RECORD +23 -0
- {hirundo-0.1.9.dist-info → hirundo-0.1.16.dist-info}/WHEEL +1 -1
- hirundo-0.1.9.dist-info/RECORD +0 -20
- /hirundo/{enum.py → dataset_enum.py} +0 -0
- {hirundo-0.1.9.dist-info → hirundo-0.1.16.dist-info}/entry_points.txt +0 -0
- {hirundo-0.1.9.dist-info → hirundo-0.1.16.dist-info/licenses}/LICENSE +0 -0
- {hirundo-0.1.9.dist-info → hirundo-0.1.16.dist-info}/top_level.txt +0 -0
hirundo/__init__.py
CHANGED
@@ -1,3 +1,7 @@
+from .dataset_enum import (
+    DatasetMetadataType,
+    LabelingType,
+)
 from .dataset_optimization import (
     COCO,
     YOLO,
@@ -7,11 +11,8 @@ from .dataset_optimization import (
     RunArgs,
     VisionRunArgs,
 )
-from .enum import (
-    DatasetMetadataType,
-    LabelingType,
-)
-from .git import GitRepo
+from .dataset_optimization_results import DatasetOptimizationResults
+from .git import GitPlainAuth, GitRepo, GitSSHAuth
 from .storage import (
     StorageConfig,
     StorageGCP,
@@ -20,6 +21,7 @@ from .storage import (
     StorageS3,
     StorageTypes,
 )
+from .unzip import load_df, load_from_zip
 
 __all__ = [
     "COCO",
@@ -31,13 +33,18 @@ __all__ = [
     "VisionRunArgs",
     "LabelingType",
     "DatasetMetadataType",
+    "GitPlainAuth",
     "GitRepo",
+    "GitSSHAuth",
     "StorageTypes",
     "StorageS3",
     "StorageGCP",
     # "StorageAzure", TODO: Azure storage is coming soon
     "StorageGit",
     "StorageConfig",
+    "DatasetOptimizationResults",
+    "load_df",
+    "load_from_zip",
 ]
 
-__version__ = "0.1.9"
+__version__ = "0.1.16"
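Taken together, the `__init__.py` changes widen the public surface: the results model, the concrete Git auth models and the zip-loading helpers are now importable from the package root. A minimal sketch (names taken from the `__all__` list above):

    from hirundo import (
        DatasetOptimizationResults,
        GitPlainAuth,
        GitRepo,
        GitSSHAuth,
        load_df,
        load_from_zip,
    )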
hirundo/_dataframe.py
ADDED
@@ -0,0 +1,43 @@
+has_pandas = False
+has_polars = False
+
+pd = None
+pl = None
+int32 = type[None]
+float32 = type[None]
+string = type[None]
+# ⬆️ These are just placeholders for the int32, float32 and string types
+# for when neither pandas nor polars are available
+
+try:
+    import numpy as np
+    import pandas as pd
+
+    has_pandas = True
+    int32 = np.int32
+    float32 = np.float32
+    string = str
+except ImportError:
+    pass
+
+try:
+    import polars as pl
+    import polars.datatypes as pl_datatypes
+
+    has_polars = True
+    int32 = pl_datatypes.Int32
+    float32 = pl_datatypes.Float32
+    string = pl_datatypes.String
+except ImportError:
+    pass
+
+
+__all__ = [
+    "has_polars",
+    "has_pandas",
+    "pd",
+    "pl",
+    "int32",
+    "float32",
+    "string",
+]
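`_dataframe.py` centralises optional DataFrame support: it probes for pandas and polars and exposes backend-appropriate `int32`/`float32`/`string` dtype aliases. A hedged sketch of how calling code can branch on whichever backend is installed (the CSV path is illustrative):

    from hirundo._dataframe import has_pandas, has_polars, pd, pl

    def read_csv_with_available_backend(path: str):
        # Prefer polars if installed, fall back to pandas, else report no backend
        if has_polars:
            return pl.read_csv(path)
        if has_pandas:
            return pd.read_csv(path)
        return None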
hirundo/_env.py
CHANGED
@@ -2,11 +2,11 @@ import enum
 import os
 from pathlib import Path
 
-from dotenv import load_dotenv
+from dotenv import find_dotenv, load_dotenv
 
 
 class EnvLocation(enum.Enum):
-    DOTENV = …
+    DOTENV = find_dotenv(".env")
     HOME = Path.home() / ".hirundo.conf"
 
 
hirundo/_headers.py
CHANGED
@@ -1,13 +1,29 @@
 from hirundo._env import API_KEY, check_api_key
 
-json_headers = {
+HIRUNDO_API_VERSION = "0.2"
+
+_json_headers = {
     "Content-Type": "application/json",
     "Accept": "application/json",
 }
 
 
-def get_auth_headers():
+def _get_auth_headers():
     check_api_key()
     return {
         "Authorization": f"Bearer {API_KEY}",
     }
+
+
+def _get_api_version_header():
+    return {
+        "HIRUNDO-API-VERSION": HIRUNDO_API_VERSION,
+    }
+
+
+def get_headers():
+    return {
+        **_json_headers,
+        **_get_auth_headers(),
+        **_get_api_version_header(),
+    }
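All request helpers now build their headers through `get_headers()`, which merges the JSON content headers, the bearer-token header and the new `HIRUNDO-API-VERSION` header. Assuming a configured API key, the composed dict looks roughly like this (the token value is a placeholder):

    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "Authorization": "Bearer <your API key>",
        "HIRUNDO-API-VERSION": "0.2",
    }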
hirundo/_timeouts.py
CHANGED
hirundo/cli.py
CHANGED
@@ -7,6 +7,8 @@ from typing import Annotated
 from urllib.parse import urlparse
 
 import typer
+from rich.console import Console
+from rich.table import Table
 
 from hirundo._env import API_HOST, EnvLocation
 
@@ -189,6 +191,56 @@ def setup(
     )
 
 
+@app.command("check-run", epilog=hirundo_epilog)
+def check_run(
+    run_id: str,
+):
+    """
+    Check the status of a run.
+    """
+    from hirundo.dataset_optimization import OptimizationDataset
+
+    results = OptimizationDataset.check_run_by_id(run_id)
+    print(f"Run results saved to {results.cached_zip_path}")
+
+
+@app.command("list-runs", epilog=hirundo_epilog)
+def list_runs():
+    """
+    List all runs available.
+    """
+    from hirundo.dataset_optimization import OptimizationDataset
+
+    runs = OptimizationDataset.list_runs()
+
+    console = Console()
+    table = Table(
+        title="Runs:",
+        expand=True,
+    )
+    cols = (
+        "Dataset name",
+        "Run ID",
+        "Status",
+        "Created At",
+        "Run Args",
+    )
+    for col in cols:
+        table.add_column(
+            col,
+            overflow="fold",
+        )
+    for run in runs:
+        table.add_row(
+            str(run.name),
+            str(run.id),
+            str(run.status),
+            run.created_at.isoformat(),
+            run.run_args.model_dump_json() if run.run_args else None,
+        )
+    console.print(table)
+
+
 typer_click_object = typer.main.get_command(app)
 
 if __name__ == "__main__":
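Both new CLI commands are thin wrappers over `OptimizationDataset`, so the equivalent calls are available from Python as well; a sketch (the run ID is a placeholder and an API key is assumed to be configured):

    from hirundo.dataset_optimization import OptimizationDataset

    runs = OptimizationDataset.list_runs()                     # backs `list-runs`
    results = OptimizationDataset.check_run_by_id("<run-id>")  # backs `check-run`
    print(results.cached_zip_path)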
hirundo/dataset_optimization.py
CHANGED
@@ -4,27 +4,25 @@ import typing
 from abc import ABC, abstractmethod
 from collections.abc import AsyncGenerator, Generator
 from enum import Enum
-from io import StringIO
 from typing import overload
 
 import httpx
-import numpy as np
-import pandas as pd
 import requests
-from pandas._typing import DtypeArg
 from pydantic import BaseModel, Field, model_validator
 from tqdm import tqdm
 from tqdm.contrib.logging import logging_redirect_tqdm
 
 from hirundo._constraints import HirundoUrl
 from hirundo._env import API_HOST
-from hirundo._headers import …
+from hirundo._headers import get_headers
 from hirundo._http import raise_for_status_with_reason
 from hirundo._iter_sse_retrying import aiter_sse_retrying, iter_sse_retrying
 from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
-from hirundo.enum import DatasetMetadataType, LabelingType
+from hirundo.dataset_enum import DatasetMetadataType, LabelingType
+from hirundo.dataset_optimization_results import DatasetOptimizationResults
 from hirundo.logger import get_logger
 from hirundo.storage import ResponseStorageConfig, StorageConfig
+from hirundo.unzip import download_and_extract_zip
 
 logger = get_logger(__name__)
 
@@ -73,39 +71,6 @@ STATUS_TO_PROGRESS_MAP = {
 }
 
 
-class DatasetOptimizationResults(BaseModel):
-    model_config = {"arbitrary_types_allowed": True}
-
-    suspects: pd.DataFrame
-    """
-    A pandas DataFrame containing the results of the optimization run
-    """
-    warnings_and_errors: pd.DataFrame
-    """
-    A pandas DataFrame containing the warnings and errors of the optimization run
-    """
-
-
-CUSTOMER_INTERCHANGE_DTYPES: DtypeArg = {
-    "image_path": str,
-    "label_path": str,
-    "segments_mask_path": str,
-    "segment_id": np.int32,
-    "label": str,
-    "bbox_id": str,
-    "xmin": np.float32,
-    "ymin": np.float32,
-    "xmax": np.float32,
-    "ymax": np.float32,
-    "suspect_level": np.float32,  # If exists, must be one of the values in the enum below
-    "suggested_label": str,
-    "suggested_label_conf": np.float32,
-    "status": str,
-    # ⬆️ If exists, must be one of the following:
-    # NO_LABELS/MISSING_IMAGE/INVALID_IMAGE/INVALID_BBOX/INVALID_BBOX_SIZE/INVALID_SEG/INVALID_SEG_SIZE
-}
-
-
 class Metadata(BaseModel, ABC):
     type: DatasetMetadataType
 
@@ -201,13 +166,14 @@ class VisionRunArgs(BaseModel):
 RunArgs = typing.Union[VisionRunArgs]
 
 
-class …
-…
-…
-…
-…
-…
-…
+class AugmentationName(str, Enum):
+    RANDOM_HORIZONTAL_FLIP = "RandomHorizontalFlip"
+    RANDOM_VERTICAL_FLIP = "RandomVerticalFlip"
+    RANDOM_ROTATION = "RandomRotation"
+    RANDOM_PERSPECTIVE = "RandomPerspective"
+    GAUSSIAN_NOISE = "GaussianNoise"
+    RANDOM_GRAYSCALE = "RandomGrayscale"
+    GAUSSIAN_BLUR = "GaussianBlur"
 
 
 class Modality(str, Enum):
@@ -264,7 +230,7 @@ class OptimizationDataset(BaseModel):
     """
     labeling_info: LabelingInfo
 
-    augmentations: typing.Optional[list[…
+    augmentations: typing.Optional[list[AugmentationName]] = None
     """
     Used to define which augmentations are apply to a vision dataset.
     For audio datasets, this field is ignored.
@@ -323,7 +289,7 @@ class OptimizationDataset(BaseModel):
         """
         response = requests.get(
            f"{API_HOST}/dataset-optimization/dataset/{dataset_id}",
-           headers=…
+           headers=get_headers(),
            timeout=READ_TIMEOUT,
        )
        raise_for_status_with_reason(response)
@@ -340,7 +306,7 @@ class OptimizationDataset(BaseModel):
         """
         response = requests.get(
            f"{API_HOST}/dataset-optimization/dataset/by-name/{name}",
-           headers=…
+           headers=get_headers(),
            timeout=READ_TIMEOUT,
        )
        raise_for_status_with_reason(response)
@@ -361,7 +327,7 @@ class OptimizationDataset(BaseModel):
         response = requests.get(
            f"{API_HOST}/dataset-optimization/dataset/",
            params={"dataset_organization_id": organization_id},
-           headers=…
+           headers=get_headers(),
            timeout=READ_TIMEOUT,
        )
        raise_for_status_with_reason(response)
@@ -388,7 +354,7 @@ class OptimizationDataset(BaseModel):
         response = requests.get(
            f"{API_HOST}/dataset-optimization/run/list",
            params={"dataset_organization_id": organization_id},
-           headers=…
+           headers=get_headers(),
            timeout=READ_TIMEOUT,
        )
        raise_for_status_with_reason(response)
@@ -410,7 +376,7 @@ class OptimizationDataset(BaseModel):
         """
         response = requests.delete(
            f"{API_HOST}/dataset-optimization/dataset/{dataset_id}",
-           headers=…
+           headers=get_headers(),
            timeout=MODIFY_TIMEOUT,
        )
        raise_for_status_with_reason(response)
@@ -482,10 +448,7 @@ class OptimizationDataset(BaseModel):
                "organization_id": organization_id,
                "replace_if_exists": replace_if_exists,
            },
-           headers={
-               **json_headers,
-               **get_auth_headers(),
-           },
+           headers=get_headers(),
            timeout=MODIFY_TIMEOUT,
        )
        raise_for_status_with_reason(dataset_response)
@@ -519,7 +482,7 @@ class OptimizationDataset(BaseModel):
         run_response = requests.post(
            f"{API_HOST}/dataset-optimization/run/{dataset_id}",
            json=run_info if len(run_info) > 0 else None,
-           headers=…
+           headers=get_headers(),
            timeout=MODIFY_TIMEOUT,
        )
        raise_for_status_with_reason(run_response)
@@ -595,46 +558,6 @@ class OptimizationDataset(BaseModel):
         self.id = None
         self.run_id = None
 
-    @staticmethod
-    def _clean_df_index(df: "pd.DataFrame") -> "pd.DataFrame":
-        """
-        Clean the index of a dataframe in case it has unnamed columns.
-
-        Args:
-            df (DataFrame): Dataframe to clean
-
-        Returns:
-            DataFrame: Cleaned dataframe
-        """
-        index_cols = sorted(
-            [col for col in df.columns if col.startswith("Unnamed")], reverse=True
-        )
-        if len(index_cols) > 0:
-            df.set_index(index_cols.pop(), inplace=True)
-            df.rename_axis(index=None, columns=None, inplace=True)
-        if len(index_cols) > 0:
-            df.drop(columns=index_cols, inplace=True)
-
-        return df
-
-    @staticmethod
-    def _read_csvs_to_df(data: dict):
-        if data["state"] == RunStatus.SUCCESS.value:
-            data["result"]["suspects"] = OptimizationDataset._clean_df_index(
-                pd.read_csv(
-                    StringIO(data["result"]["suspects"]),
-                    dtype=CUSTOMER_INTERCHANGE_DTYPES,
-                )
-            )
-            data["result"]["warnings_and_errors"] = OptimizationDataset._clean_df_index(
-                pd.read_csv(
-                    StringIO(data["result"]["warnings_and_errors"]),
-                    dtype=CUSTOMER_INTERCHANGE_DTYPES,
-                )
-            )
-        else:
-            pass
-
     @staticmethod
     def _check_run_by_id(run_id: str, retry=0) -> Generator[dict, None, None]:
         if retry > MAX_RETRIES:
@@ -645,7 +568,7 @@ class OptimizationDataset(BaseModel):
            client,
            "GET",
            f"{API_HOST}/dataset-optimization/run/{run_id}",
-           headers=…
+           headers=get_headers(),
        ):
            if sse.event == "ping":
                continue
@@ -668,7 +591,6 @@ class OptimizationDataset(BaseModel):
                    raise HirundoError(last_event["reason"])
                else:
                    raise HirundoError("Unknown error")
-           OptimizationDataset._read_csvs_to_df(data)
            yield data
        if not last_event or last_event["data"]["state"] == RunStatus.PENDING.value:
            OptimizationDataset._check_run_by_id(run_id, retry + 1)
@@ -727,11 +649,12 @@ class OptimizationDataset(BaseModel):
                    )
                elif iteration["state"] == RunStatus.SUCCESS.value:
                    t.close()
-…
-…
-…
-…
-…
+                   zip_temporary_url = iteration["result"]
+                   logger.debug("Optimization run completed. Downloading results")
+
+                   return download_and_extract_zip(
+                       run_id,
+                       zip_temporary_url,
                    )
                elif (
                    iteration["state"] == RunStatus.AWAITING_MANUAL_APPROVAL.value
@@ -823,7 +746,7 @@ class OptimizationDataset(BaseModel):
            client,
            "GET",
            f"{API_HOST}/dataset-optimization/run/{run_id}",
-           headers=…
+           headers=get_headers(),
        )
        async for sse in async_iterator:
            if sse.event == "ping":
@@ -872,7 +795,7 @@ class OptimizationDataset(BaseModel):
        logger.info("Cancelling run with ID: %s", run_id)
        response = requests.delete(
            f"{API_HOST}/dataset-optimization/run/{run_id}",
-           headers=…
+           headers=get_headers(),
            timeout=MODIFY_TIMEOUT,
        )
        raise_for_status_with_reason(response)
@@ -908,7 +831,9 @@ class DataOptimizationDatasetOut(BaseModel):
 class DataOptimizationRunOut(BaseModel):
     id: int
     name: str
+    dataset_id: int
     run_id: str
     status: RunStatus
     approved: bool
     created_at: datetime.datetime
+    run_args: typing.Optional[RunArgs]
hirundo/dataset_optimization_results.py
ADDED
@@ -0,0 +1,42 @@
+import typing
+from pathlib import Path
+
+from pydantic import BaseModel
+from typing_extensions import TypeAliasType
+
+from hirundo._dataframe import has_pandas, has_polars
+
+DataFrameType = TypeAliasType("DataFrameType", None)
+
+if has_pandas:
+    from hirundo._dataframe import pd
+
+    DataFrameType = TypeAliasType("DataFrameType", typing.Union[pd.DataFrame, None])
+if has_polars:
+    from hirundo._dataframe import pl
+
+    DataFrameType = TypeAliasType("DataFrameType", typing.Union[pl.DataFrame, None])
+
+
+T = typing.TypeVar("T")
+
+
+class DatasetOptimizationResults(BaseModel, typing.Generic[T]):
+    model_config = {"arbitrary_types_allowed": True}
+
+    cached_zip_path: Path
+    """
+    The path to the cached zip file of the results
+    """
+    suspects: T
+    """
+    A polars/pandas DataFrame containing the results of the optimization run
+    """
+    object_suspects: typing.Optional[T]
+    """
+    A polars/pandas DataFrame containing the object-level results of the optimization run
+    """
+    warnings_and_errors: T
+    """
+    A polars/pandas DataFrame containing the warnings and errors of the optimization run
+    """
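Because `DatasetOptimizationResults` is now generic over the DataFrame type, the same model is returned whether polars, pandas, or neither backend is installed. A hedged sketch of reading its fields (the run ID is a placeholder):

    from hirundo.dataset_optimization import OptimizationDataset

    results = OptimizationDataset.check_run_by_id("<run-id>")
    print(results.cached_zip_path)            # local path of the downloaded results zip
    suspects_df = results.suspects            # polars/pandas DataFrame, or None without a backend
    if results.object_suspects is not None:   # object-level suspects, e.g. for object-detection runs
        object_df = results.object_suspects
    errors_df = results.warnings_and_errors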
hirundo/git.py
CHANGED
@@ -9,7 +9,7 @@ from pydantic_core import Url
 
 from hirundo._constraints import RepoUrl
 from hirundo._env import API_HOST
-from hirundo._headers import …
+from hirundo._headers import get_headers
 from hirundo._http import raise_for_status_with_reason
 from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
 from hirundo.logger import get_logger
@@ -17,7 +17,7 @@ from hirundo.logger import get_logger
 logger = get_logger(__name__)
 
 
-class GitPlainAuthBase(BaseModel):
+class GitPlainAuth(BaseModel):
     username: str
     """
     The username for the Git repository
@@ -28,7 +28,7 @@ class GitPlainAuthBase(BaseModel):
     """
 
 
-class …
+class GitSSHAuth(BaseModel):
     ssh_key: str
     """
     The SSH key for the Git repository
@@ -52,7 +52,7 @@ class GitRepo(BaseModel):
     repository_url: typing.Union[str, RepoUrl]
     """
     The URL of the Git repository, it should start with `ssh://` or `https://` or be in the form `user@host:path`.
-    If it is in the form `user@host:path`, it will be rewritten to `ssh://user@host…
+    If it is in the form `user@host:path`, it will be rewritten to `ssh://user@host/path`.
     """
     organization_id: typing.Optional[int] = None
     """
@@ -60,14 +60,14 @@ class GitRepo(BaseModel):
     If not provided, it will be assigned to your default organization.
     """
 
-    plain_auth: typing.Optional[…
+    plain_auth: typing.Optional[GitPlainAuth] = pydantic.Field(
        default=None, examples=[None, {"username": "ben", "password": "password"}]
    )
    """
    The plain authentication details for the Git repository.
    Use this if using a special user with a username and password for authentication.
    """
-    ssh_auth: typing.Optional[…
+    ssh_auth: typing.Optional[GitSSHAuth] = pydantic.Field(
        default=None,
        examples=[
            {
@@ -124,10 +124,7 @@ class GitRepo(BaseModel):
                **self.model_dump(mode="json"),
                "replace_if_exists": replace_if_exists,
            },
-           headers={
-               **json_headers,
-               **get_auth_headers(),
-           },
+           headers=get_headers(),
            timeout=MODIFY_TIMEOUT,
        )
        raise_for_status_with_reason(git_repo)
@@ -145,7 +142,7 @@ class GitRepo(BaseModel):
        """
        git_repo = requests.get(
            f"{API_HOST}/git-repo/{git_repo_id}",
-           headers=…
+           headers=get_headers(),
            timeout=READ_TIMEOUT,
        )
        raise_for_status_with_reason(git_repo)
@@ -163,7 +160,7 @@ class GitRepo(BaseModel):
        """
        git_repo = requests.get(
            f"{API_HOST}/git-repo/by-name/{name}",
-           headers=…
+           headers=get_headers(),
            timeout=READ_TIMEOUT,
        )
        raise_for_status_with_reason(git_repo)
@@ -176,9 +173,7 @@ class GitRepo(BaseModel):
        """
        git_repos = requests.get(
            f"{API_HOST}/git-repo/",
-           headers={
-               **get_auth_headers(),
-           },
+           headers=get_headers(),
            timeout=READ_TIMEOUT,
        )
        raise_for_status_with_reason(git_repos)
@@ -200,9 +195,7 @@ class GitRepo(BaseModel):
        """
        git_repo = requests.delete(
            f"{API_HOST}/git-repo/{git_repo_id}",
-           headers={
-               **get_auth_headers(),
-           },
+           headers=get_headers(),
            timeout=MODIFY_TIMEOUT,
        )
        raise_for_status_with_reason(git_repo)
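With `GitPlainAuth` and `GitSSHAuth` exported, repository credentials are passed as the concrete models rather than the old `*Base` classes. A sketch built from the field examples shown in the diff (URL and credentials are placeholders):

    from hirundo import GitPlainAuth, GitRepo

    repo = GitRepo(
        name="my-dataset-repo",
        repository_url="https://example.com/my-org/my-dataset.git",
        plain_auth=GitPlainAuth(username="ben", password="password"),
    )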
hirundo/storage.py
CHANGED
@@ -9,7 +9,7 @@ from pydantic_core import Url
 
 from hirundo._constraints import S3BucketUrl, StorageConfigName
 from hirundo._env import API_HOST
-from hirundo._headers import …
+from hirundo._headers import get_headers
 from hirundo._http import raise_for_status_with_reason
 from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
 from hirundo.git import GitRepo, GitRepoOut
@@ -34,7 +34,7 @@ class StorageS3Base(BaseModel):
        Chains the bucket URL with the path, ensuring that the path is formatted correctly
 
        Args:
-…
+           path: The path to the file in the S3 bucket, e.g. `my-file.txt` or `/my-folder/my-file.txt`
 
        Returns:
            The full URL to the file in the S3 bucket, e.g. `s3://my-bucket/my-file.txt` or `s3://my-bucket/my-folder/my-file.txt`,
@@ -64,7 +64,7 @@ class StorageGCPBase(BaseModel):
        Chains the bucket URL with the path, ensuring that the path is formatted correctly
 
        Args:
-…
+           path: The path to the file in the GCP bucket, e.g. `my-file.txt` or `/my-folder/my-file.txt`
 
        Returns:
            The full URL to the file in the GCP bucket, e.g. `gs://my-bucket/my-file.txt` or `gs://my-bucket/my-folder/my-file.txt`,
@@ -94,7 +94,7 @@ class StorageGCPOut(StorageGCPBase):
    # Chains the container URL with the path, ensuring that the path is formatted correctly
 
    # Args:
-    #…
+    #     path: The path to the file in the Azure container, e.g. `my-file.txt` or `/my-folder/my-file.txt`
 
    # Returns:
    #     The full URL to the file in the Azure container
@@ -114,8 +114,8 @@ def get_git_repo_url(
    Chains the repository URL with the path, ensuring that the path is formatted correctly
 
    Args:
-…
-…
+       repo_url: The URL of the git repository, e.g. `https://my-git-repository.com`
+       path: The path to the file in the git repository, e.g. `my-file.txt` or `/my-folder/my-file.txt`
 
    Returns:
        The full URL to the file in the git repository, e.g. `https://my-git-repository.com/my-file.txt` or `https://my-git-repository.com/my-folder/my-file.txt`
@@ -156,7 +156,7 @@ class StorageGit(BaseModel):
        Chains the repository URL with the path, ensuring that the path is formatted correctly
 
        Args:
-…
+           path: The path to the file in the git repository, e.g. `my-file.txt` or `/my-folder/my-file.txt`
 
        Returns:
            The full URL to the file in the git repository, e.g. `https://my-git-repository.com/my-file.txt` or `https://my-git-repository.com/my-folder/my-file.txt`,
@@ -179,7 +179,7 @@ class StorageGitOut(BaseModel):
        Chains the repository URL with the path, ensuring that the path is formatted correctly
 
        Args:
-…
+           path: The path to the file in the git repository, e.g. `my-file.txt` or `/my-folder/my-file.txt`
 
        Returns:
            The full URL to the file in the git repository, e.g. `https://my-git-repository.com/my-file.txt` or `https://my-git-repository.com/my-folder/my-file.txt`,
@@ -330,7 +330,7 @@ class StorageConfig(BaseModel):
        """
        storage_config = requests.get(
            f"{API_HOST}/storage-config/{storage_config_id}",
-           headers=…
+           headers=get_headers(),
            timeout=READ_TIMEOUT,
        )
        raise_for_status_with_reason(storage_config)
@@ -349,7 +349,7 @@ class StorageConfig(BaseModel):
        """
        storage_config = requests.get(
            f"{API_HOST}/storage-config/by-name/{name}?storage_type={storage_type.value}",
-           headers=…
+           headers=get_headers(),
            timeout=READ_TIMEOUT,
        )
        raise_for_status_with_reason(storage_config)
@@ -370,7 +370,7 @@ class StorageConfig(BaseModel):
        storage_configs = requests.get(
            f"{API_HOST}/storage-config/",
            params={"storage_config_organization_id": organization_id},
-           headers=…
+           headers=get_headers(),
            timeout=READ_TIMEOUT,
        )
        raise_for_status_with_reason(storage_configs)
@@ -386,7 +386,7 @@ class StorageConfig(BaseModel):
        """
        storage_config = requests.delete(
            f"{API_HOST}/storage-config/{storage_config_id}",
-           headers=…
+           headers=get_headers(),
            timeout=MODIFY_TIMEOUT,
        )
        raise_for_status_with_reason(storage_config)
@@ -415,10 +415,7 @@ class StorageConfig(BaseModel):
                **self.model_dump(mode="json"),
                "replace_if_exists": replace_if_exists,
            },
-           headers={
-               **json_headers,
-               **get_auth_headers(),
-           },
+           headers=get_headers(),
            timeout=MODIFY_TIMEOUT,
        )
        raise_for_status_with_reason(storage_config)
hirundo/unzip.py
ADDED
@@ -0,0 +1,247 @@
+import typing
+import zipfile
+from collections.abc import Mapping
+from pathlib import Path
+from typing import IO, cast
+
+import requests
+from pydantic_core import Url
+
+from hirundo._dataframe import (
+    float32,
+    has_pandas,
+    has_polars,
+    int32,
+    pd,
+    pl,
+    string,
+)
+from hirundo._env import API_HOST
+from hirundo._headers import _get_auth_headers
+from hirundo._timeouts import DOWNLOAD_READ_TIMEOUT
+from hirundo.dataset_optimization_results import (
+    DataFrameType,
+    DatasetOptimizationResults,
+)
+from hirundo.logger import get_logger
+
+ZIP_FILE_CHUNK_SIZE = 50 * 1024 * 1024  # 50 MB
+
+Dtype = typing.Union[type[int32], type[float32], type[string]]
+
+
+CUSTOMER_INTERCHANGE_DTYPES: Mapping[str, Dtype] = {
+    "image_path": string,
+    "label_path": string,
+    "segments_mask_path": string,
+    "segment_id": int32,
+    "label": string,
+    "bbox_id": string,
+    "xmin": float32,
+    "ymin": float32,
+    "xmax": float32,
+    "ymax": float32,
+    "suspect_level": float32,  # If exists, must be one of the values in the enum below
+    "suggested_label": string,
+    "suggested_label_conf": float32,
+    "status": string,
+    # ⬆️ If exists, must be one of the following:
+    # NO_LABELS/MISSING_IMAGE/INVALID_IMAGE/INVALID_BBOX/INVALID_BBOX_SIZE/INVALID_SEG/INVALID_SEG_SIZE
+}
+
+logger = get_logger(__name__)
+
+
+def _clean_df_index(df: "pd.DataFrame") -> "pd.DataFrame":
+    """
+    Clean the index of a DataFrame in case it has unnamed columns.
+
+    Args:
+        df (DataFrame): DataFrame to clean
+
+    Returns:
+        Cleaned Pandas DataFrame
+    """
+    index_cols = sorted(
+        [col for col in df.columns if col.startswith("Unnamed")], reverse=True
+    )
+    if len(index_cols) > 0:
+        df.set_index(index_cols.pop(), inplace=True)
+        df.rename_axis(index=None, columns=None, inplace=True)
+    if len(index_cols) > 0:
+        df.drop(columns=index_cols, inplace=True)
+
+    return df
+
+
+def load_df(
+    file: "typing.Union[str, IO[bytes]]",
+) -> "DataFrameType":
+    """
+    Load a DataFrame from a CSV file.
+
+    Args:
+        file_name: The name of the CSV file to load.
+        dtypes: The data types of the columns in the DataFrame.
+
+    Returns:
+        The loaded DataFrame or `None` if neither Polars nor Pandas is available.
+    """
+    if has_polars:
+        return pl.read_csv(file, schema_overrides=CUSTOMER_INTERCHANGE_DTYPES)
+    elif has_pandas:
+        if typing.TYPE_CHECKING:
+            from pandas._typing import DtypeArg
+
+        dtype = cast("DtypeArg", CUSTOMER_INTERCHANGE_DTYPES)
+        # ⬆️ Casting since CUSTOMER_INTERCHANGE_DTYPES is a Mapping[str, Dtype] in this case
+        df = pd.read_csv(file, dtype=dtype)
+        return cast("DataFrameType", _clean_df_index(df))
+        # ⬆️ Casting since the return type is pd.DataFrame, but this is what DataFrameType is in this case
+    else:
+        return None
+
+
+def get_mislabel_suspect_filename(filenames: list[str]):
+    mislabel_suspect_filename = "mislabel_suspects.csv"
+    if mislabel_suspect_filename not in filenames:
+        mislabel_suspect_filename = "image_mislabel_suspects.csv"
+    if mislabel_suspect_filename not in filenames:
+        mislabel_suspect_filename = "suspects.csv"
+    if mislabel_suspect_filename not in filenames:
+        raise ValueError(
+            "None of mislabel_suspects.csv, image_mislabel_suspects.csv or suspects.csv were found in the zip file"
+        )
+    return mislabel_suspect_filename
+
+
+def download_and_extract_zip(
+    run_id: str, zip_url: str
+) -> DatasetOptimizationResults[DataFrameType]:
+    """
+    Download and extract the zip file from the given URL.
+
+    Note: It will only extract the `mislabel_suspects.csv` (vision - classification)
+    or `image_mislabel_suspects.csv` & `object_mislabel_suspects.csv` (vision - OD)
+    or `suspects.csv` (STT)
+    and `warnings_and_errors.csv` files from the zip file.
+
+    Args:
+        run_id: The ID of the optimization run.
+        zip_url: The URL of the zip file to download.
+
+    Returns:
+        The dataset optimization results object.
+    """
+    # Define the local file path
+    cache_dir = Path.home() / ".hirundo" / "cache"
+    cache_dir.mkdir(parents=True, exist_ok=True)
+    zip_file_path = cache_dir / f"{run_id}.zip"
+
+    headers = None
+    if Url(zip_url).scheme == "file":
+        zip_url = (
+            f"{API_HOST}/dataset-optimization/run/local-download"
+            + zip_url.replace("file://", "")
+        )
+        headers = _get_auth_headers()
+    # Stream the zip file download
+    with requests.get(
+        zip_url,
+        headers=headers,
+        timeout=DOWNLOAD_READ_TIMEOUT,
+        stream=True,
+    ) as r:
+        r.raise_for_status()
+        with open(zip_file_path, "wb") as f:
+            for chunk in r.iter_content(chunk_size=ZIP_FILE_CHUNK_SIZE):
+                f.write(chunk)
+    logger.info(
+        "Successfully downloaded the result zip file for run ID %s to %s",
+        run_id,
+        zip_file_path,
+    )
+
+    with zipfile.ZipFile(zip_file_path, "r") as z:
+        # Extract suspects file
+        suspects_df = None
+        object_suspects_df = None
+        warnings_and_errors_df = None
+
+        filenames = []
+        try:
+            filenames = [file.filename for file in z.filelist]
+        except Exception as e:
+            logger.error("Failed to get filenames from ZIP", exc_info=e)
+
+        try:
+            mislabel_suspect_filename = get_mislabel_suspect_filename(filenames)
+            with z.open(mislabel_suspect_filename) as suspects_file:
+                suspects_df = load_df(suspects_file)
+                logger.debug(
+                    "Successfully loaded mislabel suspects into DataFrame for run ID %s",
+                    run_id,
+                )
+        except Exception as e:
+            logger.error(
+                "Failed to load mislabel suspects into DataFrame", exc_info=e
+            )
+
+        object_mislabel_suspects_filename = "object_mislabel_suspects.csv"
+        if object_mislabel_suspects_filename in filenames:
+            try:
+                with z.open(
+                    object_mislabel_suspects_filename
+                ) as object_suspects_file:
+                    object_suspects_df = load_df(object_suspects_file)
+                    logger.debug(
+                        "Successfully loaded object mislabel suspects into DataFrame for run ID %s",
+                        run_id,
+                    )
+            except Exception as e:
+                logger.error(
+                    "Failed to load object mislabel suspects into DataFrame",
+                    exc_info=e,
+                )
+
+        try:
+            # Extract warnings_and_errors file
+            with z.open("warnings_and_errors.csv") as warnings_file:
+                warnings_and_errors_df = load_df(warnings_file)
+                logger.debug(
+                    "Successfully loaded warnings and errors into DataFrame for run ID %s",
+                    run_id,
+                )
+        except Exception as e:
+            logger.error(
+                "Failed to load warnings and errors into DataFrame", exc_info=e
+            )
+
+    return DatasetOptimizationResults[DataFrameType](
+        cached_zip_path=zip_file_path,
+        suspects=suspects_df,
+        object_suspects=object_suspects_df,
+        warnings_and_errors=warnings_and_errors_df,
+    )
+
+
+def load_from_zip(
+    zip_path: Path, file_name: str
+) -> "typing.Union[pd.DataFrame, pl.DataFrame, None]":
+    """
+    Load a given file from a given zip file.
+
+    Args:
+        zip_path: The path to the zip file.
+        file_name: The name of the file to load.
+
+    Returns:
+        The loaded DataFrame or `None` if neither Polars nor Pandas is available.
+    """
+    with zipfile.ZipFile(zip_path, "r") as z:
+        try:
+            with z.open(file_name) as file:
+                return load_df(file)
+        except Exception as e:
+            logger.error("Failed to load %s from zip file", file_name, exc_info=e)
+            return None
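`load_from_zip` lets you re-read any CSV from the cached results archive after a run has been fetched; a minimal sketch (the run ID in the path is a placeholder, and the cache location matches the one used by `download_and_extract_zip` above):

    from pathlib import Path
    from hirundo.unzip import load_from_zip

    zip_path = Path.home() / ".hirundo" / "cache" / "<run-id>.zip"
    df = load_from_zip(zip_path, "warnings_and_errors.csv")
    if df is None:
        print("Neither polars nor pandas is installed, so no DataFrame could be loaded")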
{hirundo-0.1.9.dist-info → hirundo-0.1.16.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.…
+Metadata-Version: 2.4
 Name: hirundo
-Version: 0.1.9
+Version: 0.1.16
 Summary: This package is used to interface with Hirundo's platform. It provides a simple API to optimize your ML datasets.
 Author-email: Hirundo <dev@hirundo.io>
 License: MIT License
@@ -31,7 +31,6 @@ Requires-Dist: typer>=0.12.3
 Requires-Dist: httpx>=0.27.0
 Requires-Dist: stamina>=24.2.0
 Requires-Dist: httpx-sse>=0.4.0
-Requires-Dist: pandas>=2.2.2
 Requires-Dist: tqdm>=4.66.5
 Provides-Extra: dev
 Requires-Dist: pyyaml>=6.0.1; extra == "dev"
@@ -50,7 +49,7 @@ Requires-Dist: pytest-asyncio>=0.23.6; extra == "dev"
 Requires-Dist: uv>=0.5.8; extra == "dev"
 Requires-Dist: pre-commit>=3.7.1; extra == "dev"
 Requires-Dist: virtualenv>=20.6.6; extra == "dev"
-Requires-Dist: ruff>=0.…
+Requires-Dist: ruff>=0.11.6; extra == "dev"
 Requires-Dist: bumpver; extra == "dev"
 Requires-Dist: platformdirs>=4.3.6; extra == "dev"
 Requires-Dist: safety>=3.2.13; extra == "dev"
@@ -64,6 +63,11 @@ Requires-Dist: sphinx-multiversion; extra == "docs"
 Requires-Dist: esbonio; extra == "docs"
 Requires-Dist: starlette>0.40.0; extra == "docs"
 Requires-Dist: markupsafe>=3.0.2; extra == "docs"
+Provides-Extra: pandas
+Requires-Dist: pandas>=2.2.2; extra == "pandas"
+Provides-Extra: polars
+Requires-Dist: polars>=1.0.0; extra == "polars"
+Dynamic: license-file
 
 # Hirundo
 
@@ -165,7 +169,7 @@ from hirundo import (
 git_storage = StorageGit(
     repo=GitRepo(
         name="BDD-100k-validation-dataset",
-        repository_url="https://…
+        repository_url="https://huggingface.co/datasets/hirundo-io/bdd100k-validation-only",
     ),
     branch="main",
 )
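Packaging note: pandas is no longer an unconditional dependency; pandas and polars are now opt-in extras. Users who want results loaded as DataFrames would install the package with one of those extras enabled (e.g. something like `pip install "hirundo[polars]"`), while a bare install leaves `load_df`/`load_from_zip` returning `None`.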
hirundo-0.1.16.dist-info/RECORD
ADDED
@@ -0,0 +1,23 @@
+hirundo/__init__.py,sha256=qKC89bNReZSjGtmf7l3PZD2JoptyVphpsD0Kf2PNXvY,1035
+hirundo/__main__.py,sha256=wcCrL4PjG51r5wVKqJhcoJPTLfHW0wNbD31DrUN0MWI,28
+hirundo/_constraints.py,sha256=gRv7fXwtjPGqYWIhkVYxu1B__3PdlYRqFyDkTpa9f74,1032
+hirundo/_dataframe.py,sha256=sXEEbCNcLi83wyU9ii884YikCzfASo_3nnrDxhuCv7U,758
+hirundo/_env.py,sha256=efX2sjvYlHkFr2Lcstelei67YSTFpVGT0l08ZsfiMuE,622
+hirundo/_headers.py,sha256=3hybpD_X4SODv3cFZPt9AjGY2vvZaag5OKT3z1SHSjA,521
+hirundo/_http.py,sha256=izlnuxStyPugjTAbD8Lo30tA4lZJ5d3kOENNduqrbX4,573
+hirundo/_iter_sse_retrying.py,sha256=U331_wZRIbVzi-jnMqo8bp9jBC8MtFBLEs-X0ZvhSDw,4634
+hirundo/_timeouts.py,sha256=gE58NU0t2e4KgKq2sk5rZcezDJAkgvRIbM5AVYFY6Ho,86
+hirundo/cli.py,sha256=5Tn0eXZGG92BR9HJYUaYozjFbS1t6UTw_I2R0tZBE04,7824
+hirundo/dataset_enum.py,sha256=ZEYBP-lrlVqfNWptlmw7JgLNhCyDirtWWPtoMvtg2AE,531
+hirundo/dataset_optimization.py,sha256=jR4ZOlKKl05jrA4cq9L1IQuKVPJ3ytXkhOJEg6efFqI,31390
+hirundo/dataset_optimization_results.py,sha256=A9YyF5zaZXVtzeDE08I_05v90dhZQADpSjDcS_6eLMc,1129
+hirundo/git.py,sha256=6h1hFPlw5FfYMGWXPCitnTqGICmBKmQtb5qKGe3Icmk,6580
+hirundo/logger.py,sha256=MUqrYp0fBlxWFhGl6P5t19_uqO7T_PNhrLN5bqY3i7s,275
+hirundo/storage.py,sha256=kO-LWlQAM3qTnALEl8s79AiFMYqCG9Sem4MIFQcyvAg,15950
+hirundo/unzip.py,sha256=XJqvt2m5pWR-G-fnzgW75VOdd-K4_Rw2r4wiEhZgKZA,8245
+hirundo-0.1.16.dist-info/licenses/LICENSE,sha256=fusGGjqT2RGlU6kbkaOk7d-gDnsjk17wq67AO0mwBZI,1065
+hirundo-0.1.16.dist-info/METADATA,sha256=CxdCbzafRuVRf1BGsS_tgjodO0g745uuNBl7y4UFMj8,8501
+hirundo-0.1.16.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
+hirundo-0.1.16.dist-info/entry_points.txt,sha256=4ZtnA_Nl1Af8fLnHp3lwjbGDEGU1S6ujb_JwtuQ7ZPM,44
+hirundo-0.1.16.dist-info/top_level.txt,sha256=cmyNqrNZOAYxnywJGFI1AJBLe4SkH8HGsfFx6ncdrbI,8
+hirundo-0.1.16.dist-info/RECORD,,
hirundo-0.1.9.dist-info/RECORD
DELETED
@@ -1,20 +0,0 @@
-hirundo/__init__.py,sha256=U_wcm3e0r1T66OQ7KHlWaOiwlPxf6e4RkTxA5uvaOOA,781
-hirundo/__main__.py,sha256=wcCrL4PjG51r5wVKqJhcoJPTLfHW0wNbD31DrUN0MWI,28
-hirundo/_constraints.py,sha256=gRv7fXwtjPGqYWIhkVYxu1B__3PdlYRqFyDkTpa9f74,1032
-hirundo/_env.py,sha256=dXUFPeEL1zPe-eBdWD4_WZvlgiY2cpWuVDzf41Qjuto,609
-hirundo/_headers.py,sha256=ggTyBwVT3nGyPidCcmYMX6pv0idzMxCI2S1BJQE-Bbs,253
-hirundo/_http.py,sha256=izlnuxStyPugjTAbD8Lo30tA4lZJ5d3kOENNduqrbX4,573
-hirundo/_iter_sse_retrying.py,sha256=U331_wZRIbVzi-jnMqo8bp9jBC8MtFBLEs-X0ZvhSDw,4634
-hirundo/_timeouts.py,sha256=IfX8-mrLp809-A_xSLv1DhIqZnO-Qvy4FcTtOtvqLog,42
-hirundo/cli.py,sha256=4-pdV483zqRJl8d-R9p_9YOGlehOnoMJzb3XAAdPRb0,6634
-hirundo/dataset_optimization.py,sha256=CuSrauzXiSa4kGBREao3nn-vmLVwMKTeHM7yEXesuso,33756
-hirundo/enum.py,sha256=ZEYBP-lrlVqfNWptlmw7JgLNhCyDirtWWPtoMvtg2AE,531
-hirundo/git.py,sha256=zzpEHGqoQXwOBQzNSmyf5lpUMc2FbomPqiokwMc4M8o,6777
-hirundo/logger.py,sha256=MUqrYp0fBlxWFhGl6P5t19_uqO7T_PNhrLN5bqY3i7s,275
-hirundo/storage.py,sha256=RsEmtbn79_iCY7pE1AKcBoAEqzXNkOc_UPUTaxSE0BM,16075
-hirundo-0.1.9.dist-info/LICENSE,sha256=fusGGjqT2RGlU6kbkaOk7d-gDnsjk17wq67AO0mwBZI,1065
-hirundo-0.1.9.dist-info/METADATA,sha256=8jjs7OGtVZZwFmyfdFGoTxC-de-1V6OLFJW26pYOB2E,8363
-hirundo-0.1.9.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-hirundo-0.1.9.dist-info/entry_points.txt,sha256=4ZtnA_Nl1Af8fLnHp3lwjbGDEGU1S6ujb_JwtuQ7ZPM,44
-hirundo-0.1.9.dist-info/top_level.txt,sha256=cmyNqrNZOAYxnywJGFI1AJBLe4SkH8HGsfFx6ncdrbI,8
-hirundo-0.1.9.dist-info/RECORD,,
Files without content changes (renamed or relocated only):
- hirundo/{enum.py → dataset_enum.py}
- {hirundo-0.1.9.dist-info → hirundo-0.1.16.dist-info}/entry_points.txt
- {hirundo-0.1.9.dist-info → hirundo-0.1.16.dist-info/licenses}/LICENSE
- {hirundo-0.1.9.dist-info → hirundo-0.1.16.dist-info}/top_level.txt