hirundo 0.1.18__py3-none-any.whl → 0.1.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hirundo/__init__.py +12 -8
- hirundo/_constraints.py +1 -1
- hirundo/_headers.py +1 -1
- hirundo/_http.py +53 -0
- hirundo/_iter_sse_retrying.py +1 -1
- hirundo/cli.py +7 -7
- hirundo/{dataset_optimization.py → dataset_qa.py} +149 -108
- hirundo/{dataset_optimization_results.py → dataset_qa_results.py} +4 -4
- hirundo/git.py +1 -2
- hirundo/storage.py +1 -2
- hirundo/unzip.py +9 -10
- {hirundo-0.1.18.dist-info → hirundo-0.1.21.dist-info}/METADATA +21 -14
- hirundo-0.1.21.dist-info/RECORD +25 -0
- hirundo-0.1.18.dist-info/RECORD +0 -25
- {hirundo-0.1.18.dist-info → hirundo-0.1.21.dist-info}/WHEEL +0 -0
- {hirundo-0.1.18.dist-info → hirundo-0.1.21.dist-info}/entry_points.txt +0 -0
- {hirundo-0.1.18.dist-info → hirundo-0.1.21.dist-info}/licenses/LICENSE +0 -0
- {hirundo-0.1.18.dist-info → hirundo-0.1.21.dist-info}/top_level.txt +0 -0
hirundo/__init__.py
CHANGED
@@ -3,13 +3,15 @@ from .dataset_enum import (
     LabelingType,
     StorageTypes,
 )
-from .dataset_optimization import (
+from .dataset_qa import (
+    ClassificationRunArgs,
+    Domain,
     HirundoError,
-    OptimizationDataset,
+    ObjectDetectionRunArgs,
+    QADataset,
     RunArgs,
-    VisionRunArgs,
 )
-from .dataset_optimization_results import DatasetOptimizationResults
+from .dataset_qa_results import DatasetQAResults
 from .git import GitPlainAuth, GitRepo, GitSSHAuth
 from .labeling import (
     COCO,
@@ -40,9 +42,11 @@ __all__ = [
     "KeylabsObjDetVideo",
     "KeylabsObjSegImages",
     "KeylabsObjSegVideo",
-    "OptimizationDataset",
+    "QADataset",
+    "Domain",
     "RunArgs",
-    "VisionRunArgs",
+    "ClassificationRunArgs",
+    "ObjectDetectionRunArgs",
     "DatasetMetadataType",
     "LabelingType",
     "GitPlainAuth",
@@ -54,9 +58,9 @@ __all__ = [
     # "StorageAzure", TODO: Azure storage is coming soon
     "StorageGit",
     "StorageConfig",
-    "DatasetOptimizationResults",
+    "DatasetQAResults",
     "load_df",
     "load_from_zip",
 ]

-__version__ = "0.1.18"
+__version__ = "0.1.21"
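The import-level impact of the rename is summarized below. This sketch is not from the package itself; it uses only names visible in this diff, and the commented-out 0.1.18 imports are inferred from the removed lines above.

```python
# Migration sketch based only on names shown in this diff.
# 0.1.18 (old, removed):
# from hirundo import OptimizationDataset, VisionRunArgs, DatasetOptimizationResults
# 0.1.21 (new):
from hirundo import (
    ClassificationRunArgs,
    DatasetQAResults,
    Domain,
    ObjectDetectionRunArgs,
    QADataset,
    RunArgs,
)

print(QADataset.__name__)  # the renamed entry point for dataset QA runs
```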
hirundo/_constraints.py
CHANGED
@@ -11,7 +11,7 @@ from hirundo.labeling import COCO, YOLO, HirundoCSV, Keylabs

 if TYPE_CHECKING:
     from hirundo._urls import HirundoUrl
-    from hirundo.dataset_optimization import LabelingInfo
+    from hirundo.dataset_qa import LabelingInfo
     from hirundo.storage import (
         ResponseStorageConfig,
         StorageConfig,
hirundo/_headers.py
CHANGED
hirundo/_http.py
CHANGED
@@ -1,4 +1,7 @@
+import requests as _requests
 from requests import Response
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry

 import hirundo.logger

@@ -7,6 +10,56 @@ logger = hirundo.logger.get_logger(__name__)
 MINIMUM_CLIENT_SERVER_ERROR_CODE = 400


+def _build_retrying_session() -> _requests.Session:
+    # No more than 10 tries total (including the initial attempt)
+    # urllib3 Retry.total counts retries, not total attempts, so use 9 retries
+    retries = Retry(
+        total=9,
+        backoff_factor=1.0,
+        status_forcelist=(429,),
+        allowed_methods=("HEAD", "GET", "PUT", "POST", "PATCH", "DELETE", "OPTIONS"),
+        respect_retry_after_header=True,
+        raise_on_status=False,
+    )
+    adapter = HTTPAdapter(max_retries=retries)
+    session = _requests.Session()
+    session.mount("http://", adapter)
+    session.mount("https://", adapter)
+    return session
+
+
+_SESSION = _build_retrying_session()
+
+
+class _RequestsShim:
+    """Shim exposing a subset of the requests API but backed by a retrying Session."""
+
+    HTTPError = _requests.HTTPError
+    Response = _requests.Response
+
+    def request(self, method: str, url: str, **kwargs) -> Response:
+        return _SESSION.request(method=method, url=url, **kwargs)
+
+    def get(self, url: str, **kwargs) -> Response:
+        return _SESSION.get(url, **kwargs)
+
+    def post(self, url: str, **kwargs) -> Response:
+        return _SESSION.post(url, **kwargs)
+
+    def delete(self, url: str, **kwargs) -> Response:
+        return _SESSION.delete(url, **kwargs)
+
+    def patch(self, url: str, **kwargs) -> Response:
+        return _SESSION.patch(url, **kwargs)
+
+    def put(self, url: str, **kwargs) -> Response:
+        return _SESSION.put(url, **kwargs)
+
+
+# Public shim to be imported by modules instead of the raw requests package
+requests = _RequestsShim()
+
+
 def raise_for_status_with_reason(response: Response):
     try:
         if response.status_code >= MINIMUM_CLIENT_SERVER_ERROR_CODE:
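The practical effect of this new module is that every HTTP call routed through `hirundo._http.requests` now retries HTTP 429 responses with exponential backoff (at most 10 total attempts, honouring `Retry-After`), while call sites keep the familiar requests API. A minimal usage sketch, with a hypothetical URL:

```python
# Minimal sketch; the URL is hypothetical.
from hirundo._http import raise_for_status_with_reason, requests

response = requests.get("https://api.example.com/health", timeout=30)
# Raises with the server-supplied reason on any 4xx/5xx status:
raise_for_status_with_reason(response)
print(response.status_code)
```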
hirundo/_iter_sse_retrying.py
CHANGED
@@ -5,11 +5,11 @@ import uuid
 from collections.abc import AsyncGenerator, Generator

 import httpx
-import requests
 import urllib3
 from httpx_sse import ServerSentEvent, SSEError, aconnect_sse, connect_sse
 from stamina import retry

+from hirundo._http import requests
 from hirundo._timeouts import READ_TIMEOUT
 from hirundo.logger import get_logger
hirundo/cli.py
CHANGED
@@ -88,7 +88,7 @@ def setup_api_key(
     ],
 ):
     """
-    Setup the API key for the Hirundo
+    Setup the API key for the Hirundo Python SDK.
     Values are saved to a .env file in the current directory for use by the library in requests.
     """
     saved_to = upsert_env("API_KEY", api_key)
@@ -115,7 +115,7 @@ def change_api_remote(
     ],
 ):
     """
-    Change the API server address for the Hirundo
+    Change the API server address for the Hirundo Python SDK.
     This is the same address where you access the Hirundo web interface.
     """
     api_host = fix_api_host(api_host)
@@ -151,7 +151,7 @@ def setup(
     ],
 ):
     """
-    Setup the Hirundo
+    Setup the Hirundo Python SDK.
     """
     api_host = fix_api_host(api_host)
     api_host_saved_to = upsert_env("API_HOST", api_host)
@@ -198,9 +198,9 @@ def check_run(
     """
     Check the status of a run.
     """
-    from hirundo.dataset_optimization import OptimizationDataset
+    from hirundo.dataset_qa import QADataset

-    results = OptimizationDataset.check_run_by_id(run_id)
+    results = QADataset.check_run_by_id(run_id)
     print(f"Run results saved to {results.cached_zip_path}")


@@ -209,9 +209,9 @@ def list_runs():
    """
    List all runs available.
    """
-    from hirundo.dataset_optimization import OptimizationDataset
+    from hirundo.dataset_qa import QADataset

-    runs = OptimizationDataset.list_runs()
+    runs = QADataset.list_runs()

    console = Console()
    table = Table(
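Both CLI commands now defer to `QADataset`; the programmatic equivalent of `check-run` is roughly the following sketch (the run ID is hypothetical):

```python
from hirundo.dataset_qa import QADataset

results = QADataset.check_run_by_id("run-1234")  # hypothetical run ID
print(f"Run results saved to {results.cached_zip_path}")
```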
hirundo/{dataset_optimization.py → dataset_qa.py}
RENAMED
@@ -6,7 +6,6 @@ from enum import Enum
 from typing import overload

 import httpx
-import requests
 from pydantic import BaseModel, Field, model_validator
 from tqdm import tqdm
 from tqdm.contrib.logging import logging_redirect_tqdm
@@ -14,12 +13,12 @@ from tqdm.contrib.logging import logging_redirect_tqdm
 from hirundo._constraints import validate_labeling_info, validate_url
 from hirundo._env import API_HOST
 from hirundo._headers import get_headers
-from hirundo._http import raise_for_status_with_reason
+from hirundo._http import raise_for_status_with_reason, requests
 from hirundo._iter_sse_retrying import aiter_sse_retrying, iter_sse_retrying
 from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
 from hirundo._urls import HirundoUrl
 from hirundo.dataset_enum import DatasetMetadataType, LabelingType
-from hirundo.dataset_optimization_results import DatasetOptimizationResults
+from hirundo.dataset_qa_results import DatasetQAResults
 from hirundo.labeling import YOLO, LabelingInfo
 from hirundo.logger import get_logger
 from hirundo.storage import ResponseStorageConfig, StorageConfig
@@ -30,7 +29,7 @@ logger = get_logger(__name__)

 class HirundoError(Exception):
     """
-    Custom exception used to indicate errors in `hirundo` dataset optimization runs
+    Custom exception used to indicate errors in `hirundo` dataset QA runs
     """

     pass
@@ -51,14 +50,14 @@ class RunStatus(Enum):


 STATUS_TO_TEXT_MAP = {
-    RunStatus.STARTED.value: "Dataset optimization run in progress. Downloading dataset",
-    RunStatus.PENDING.value: "Dataset optimization run queued and not yet started",
-    RunStatus.SUCCESS.value: "Dataset optimization run completed successfully",
-    RunStatus.FAILURE.value: "Dataset optimization run failed",
+    RunStatus.STARTED.value: "Dataset QA run in progress. Downloading dataset",
+    RunStatus.PENDING.value: "Dataset QA run queued and not yet started",
+    RunStatus.SUCCESS.value: "Dataset QA run completed successfully",
+    RunStatus.FAILURE.value: "Dataset QA run failed",
     RunStatus.AWAITING_MANUAL_APPROVAL.value: "Awaiting manual approval",
-    RunStatus.RETRY.value: "Dataset optimization run failed. Retrying",
-    RunStatus.REVOKED.value: "Dataset optimization run was cancelled",
-    RunStatus.REJECTED.value: "Dataset optimization run was rejected",
+    RunStatus.RETRY.value: "Dataset QA run failed. Retrying",
+    RunStatus.REVOKED.value: "Dataset QA run was cancelled",
+    RunStatus.REJECTED.value: "Dataset QA run was rejected",
 }
 STATUS_TO_PROGRESS_MAP = {
     RunStatus.STARTED.value: 0.0,
@@ -72,33 +71,51 @@ STATUS_TO_PROGRESS_MAP = {
 }


-class VisionRunArgs(BaseModel):
-    upsample: bool = False
+class ClassificationRunArgs(BaseModel):
+    image_size: typing.Optional[tuple[int, int]] = (224, 224)
+    """
+    Size (width, height) to which to resize classification images.
+    It is recommended to keep this value at (224, 224) unless your classes are differentiated by very small differences.
+    """
+    upsample: typing.Optional[bool] = False
     """
     Whether to upsample the dataset to attempt to balance the classes.
     """
-    min_abs_bbox_size: int = 0
+
+
+class ObjectDetectionRunArgs(ClassificationRunArgs):
+    min_abs_bbox_size: typing.Optional[int] = None
     """
-    Minimum valid size (in pixels) of a bounding box to keep it in the dataset for optimization.
+    Minimum valid size (in pixels) of a bounding box to keep it in the dataset for QA.
     """
-    min_abs_bbox_area: int = 0
+    min_abs_bbox_area: typing.Optional[int] = None
     """
-    Minimum valid absolute area (in pixels²) of a bounding box to keep it in the dataset for optimization.
+    Minimum valid absolute area (in pixels²) of a bounding box to keep it in the dataset for QA.
     """
-    min_rel_bbox_size: float = 0
+    min_rel_bbox_size: typing.Optional[float] = None
     """
     Minimum valid size (as a fraction of both image height and width) for a bounding box
-    to keep it in the dataset for optimization,
+    to keep it in the dataset for QA, relative to the corresponding dimension size,
     i.e. if the bounding box is 10% of the image width and 5% of the image height, it will be kept if this value is 0.05, but not if the
     value is 0.06 (since both width and height are checked).
     """
-    min_rel_bbox_area: float = 0
+    min_rel_bbox_area: typing.Optional[float] = None
     """
-    Minimum valid relative area (as a fraction of the image area) of a bounding box to keep it in the dataset for optimization.
+    Minimum valid relative area (as a fraction of the image area) of a bounding box to keep it in the dataset for QA.
+    """
+    crop_ratio: typing.Optional[float] = None
+    """
+    Ratio of the bounding box to crop.
+    Change this value at your own risk. It is recommended to keep it at 1.0 unless you know what you are doing.
+    """
+    add_mask_channel: typing.Optional[bool] = None
+    """
+    Whether to add a mask channel to the image.
+    Change at your own risk. It is recommended to keep it at False unless you know what you are doing.
     """


-RunArgs = typing.Union[
+RunArgs = typing.Union[ClassificationRunArgs, ObjectDetectionRunArgs]


 class AugmentationName(str, Enum):
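Since `ObjectDetectionRunArgs` now inherits from `ClassificationRunArgs`, object-detection runs accept the image-size/upsample fields plus the bounding-box filters. A construction sketch using only fields from this diff:

```python
from hirundo.dataset_qa import ClassificationRunArgs, ObjectDetectionRunArgs

# Classification: the defaults shown in the diff.
clf_args = ClassificationRunArgs(image_size=(224, 224), upsample=False)

# Object detection adds bbox filters; None leaves a filter unset.
od_args = ObjectDetectionRunArgs(
    min_abs_bbox_size=8,       # drop boxes narrower/shorter than 8 px
    min_rel_bbox_area=0.0001,  # drop boxes under 0.01% of the image area
)
print(od_args.model_dump(mode="json"))
```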
@@ -111,13 +128,31 @@ class AugmentationName(str, Enum):
     GAUSSIAN_BLUR = "GaussianBlur"


-class
-
-
-
+class Domain(str, Enum):
+    RADAR = "RADAR"
+    VISION = "VISION"
+    SPEECH = "SPEECH"
+    TABULAR = "TABULAR"
+
+
+DOMAIN_TO_SUPPORTED_LABELING_TYPES = {
+    Domain.RADAR: [
+        LabelingType.SINGLE_LABEL_CLASSIFICATION,
+        LabelingType.OBJECT_DETECTION,
+    ],
+    Domain.VISION: [
+        LabelingType.SINGLE_LABEL_CLASSIFICATION,
+        LabelingType.OBJECT_DETECTION,
+        LabelingType.OBJECT_SEGMENTATION,
+        LabelingType.SEMANTIC_SEGMENTATION,
+        LabelingType.PANOPTIC_SEGMENTATION,
+    ],
+    Domain.SPEECH: [LabelingType.SPEECH_TO_TEXT],
+    Domain.TABULAR: [LabelingType.SINGLE_LABEL_CLASSIFICATION],
+}


-class OptimizationDataset(BaseModel):
+class QADataset(BaseModel):
     id: typing.Optional[int] = Field(default=None)
     """
     The ID of the dataset created on the server.
@@ -171,21 +206,29 @@ class OptimizationDataset(BaseModel):
     For audio datasets, this field is ignored.
     If no value is provided, all augmentations are applied to vision datasets.
     """
-
+    domain: Domain = Domain.VISION
     """
-    Used to define the
+    Used to define the domain of the dataset.
     Defaults to Image.
     """

     run_id: typing.Optional[str] = Field(default=None, init=False)
     """
-    The ID of the Dataset Optimization run created on the server.
+    The ID of the Dataset QA run created on the server.
     """

     status: typing.Optional[RunStatus] = None

     @model_validator(mode="after")
     def validate_dataset(self):
+        if self.domain not in DOMAIN_TO_SUPPORTED_LABELING_TYPES:
+            raise ValueError(
+                f"Domain {self.domain} is not supported. Supported domains are: {list(DOMAIN_TO_SUPPORTED_LABELING_TYPES.keys())}"
+            )
+        if self.labeling_type not in DOMAIN_TO_SUPPORTED_LABELING_TYPES[self.domain]:
+            raise ValueError(
+                f"Labeling type {self.labeling_type} is not supported for domain {self.domain}. Supported labeling types are: {DOMAIN_TO_SUPPORTED_LABELING_TYPES[self.domain]}"
+            )
         if self.storage_config is None and self.storage_config_id is None:
             raise ValueError(
                 "No dataset storage has been provided. Provide one via `storage_config` or `storage_config_id`"
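The new validator enforces the domain/labeling-type matrix before any request is sent. The core check can be reproduced standalone:

```python
from hirundo.dataset_enum import LabelingType
from hirundo.dataset_qa import DOMAIN_TO_SUPPORTED_LABELING_TYPES, Domain

# e.g. object detection is valid for RADAR and VISION, but not SPEECH or TABULAR
for domain in Domain:
    supported = (
        LabelingType.OBJECT_DETECTION in DOMAIN_TO_SUPPORTED_LABELING_TYPES[domain]
    )
    print(f"{domain.value}: object detection supported = {supported}")
```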
@@ -229,52 +272,52 @@ class OptimizationDataset(BaseModel):
         return self

     @staticmethod
-    def get_by_id(dataset_id: int) -> "OptimizationDataset":
+    def get_by_id(dataset_id: int) -> "QADataset":
         """
-        Get a `OptimizationDataset` instance from the server by its ID
+        Get a `QADataset` instance from the server by its ID

         Args:
-            dataset_id: The ID of the `OptimizationDataset` instance to get
+            dataset_id: The ID of the `QADataset` instance to get
         """
         response = requests.get(
-            f"{API_HOST}/dataset-optimization/dataset/{dataset_id}",
+            f"{API_HOST}/dataset-qa/dataset/{dataset_id}",
             headers=get_headers(),
             timeout=READ_TIMEOUT,
         )
         raise_for_status_with_reason(response)
         dataset = response.json()
-        return OptimizationDataset(**dataset)
+        return QADataset(**dataset)

     @staticmethod
-    def get_by_name(name: str) -> "OptimizationDataset":
+    def get_by_name(name: str) -> "QADataset":
         """
-        Get a `OptimizationDataset` instance from the server by its name
+        Get a `QADataset` instance from the server by its name

         Args:
-            name: The name of the `OptimizationDataset` instance to get
+            name: The name of the `QADataset` instance to get
         """
         response = requests.get(
-            f"{API_HOST}/dataset-optimization/dataset/by-name/{name}",
+            f"{API_HOST}/dataset-qa/dataset/by-name/{name}",
             headers=get_headers(),
             timeout=READ_TIMEOUT,
         )
         raise_for_status_with_reason(response)
         dataset = response.json()
-        return OptimizationDataset(**dataset)
+        return QADataset(**dataset)

     @staticmethod
     def list_datasets(
         organization_id: typing.Optional[int] = None,
-    ) -> list["DataOptimizationDatasetOut"]:
+    ) -> list["QADatasetOut"]:
         """
-        Lists all the
+        Lists all the datasets created by user's default organization
         or the `organization_id` passed

         Args:
             organization_id: The ID of the organization to list the datasets for.
         """
         response = requests.get(
-            f"{API_HOST}/dataset-optimization/dataset/",
+            f"{API_HOST}/dataset-qa/dataset/",
             params={"dataset_organization_id": organization_id},
             headers=get_headers(),
             timeout=READ_TIMEOUT,
@@ -282,7 +325,7 @@ class OptimizationDataset(BaseModel):
         raise_for_status_with_reason(response)
         datasets = response.json()
         return [
-            DataOptimizationDatasetOut(
+            QADatasetOut(
                 **ds,
             )
             for ds in datasets
@@ -291,17 +334,17 @@ class OptimizationDataset(BaseModel):
     @staticmethod
     def list_runs(
         organization_id: typing.Optional[int] = None,
-    ) -> list["DataOptimizationRunOut"]:
+    ) -> list["DataQARunOut"]:
         """
-        Lists all the `OptimizationDataset` instances created by user's default organization
+        Lists all the `QADataset` instances created by user's default organization
         or the `organization_id` passed
-        Note: The return type is `list[dict]` and not `list[OptimizationDataset]`
+        Note: The return type is `list[dict]` and not `list[QADataset]`

         Args:
             organization_id: The ID of the organization to list the datasets for.
         """
         response = requests.get(
-            f"{API_HOST}/dataset-optimization/run/list",
+            f"{API_HOST}/dataset-qa/run/list",
             params={"dataset_organization_id": organization_id},
             headers=get_headers(),
             timeout=READ_TIMEOUT,
@@ -309,7 +352,7 @@ class OptimizationDataset(BaseModel):
         raise_for_status_with_reason(response)
         runs = response.json()
         return [
-            DataOptimizationRunOut(
+            DataQARunOut(
                 **run,
             )
             for run in runs
@@ -318,13 +361,13 @@ class OptimizationDataset(BaseModel):
     @staticmethod
     def delete_by_id(dataset_id: int) -> None:
         """
-        Deletes a `OptimizationDataset` instance from the server by its ID
+        Deletes a `QADataset` instance from the server by its ID

         Args:
-            dataset_id: The ID of the `OptimizationDataset` instance to delete
+            dataset_id: The ID of the `QADataset` instance to delete
         """
         response = requests.delete(
-            f"{API_HOST}/dataset-optimization/dataset/{dataset_id}",
+            f"{API_HOST}/dataset-qa/dataset/{dataset_id}",
             headers=get_headers(),
             timeout=MODIFY_TIMEOUT,
         )
@@ -333,14 +376,14 @@ class OptimizationDataset(BaseModel):

     def delete(self, storage_config=True) -> None:
         """
-        Deletes the active `OptimizationDataset` instance from the server.
-        It can only be used on a `OptimizationDataset` instance that has been created.
+        Deletes the active `QADataset` instance from the server.
+        It can only be used on a `QADataset` instance that has been created.

         Args:
-            storage_config: If True, the `OptimizationDataset`'s `StorageConfig` will also be deleted
+            storage_config: If True, the `QADataset`'s `StorageConfig` will also be deleted

         Note: If `storage_config` is not set to `False` then the `storage_config_id` must be set
-        This can either be set manually or by creating the `StorageConfig` instance via the `OptimizationDataset`'s
+        This can either be set manually or by creating the `StorageConfig` instance via the `QADataset`'s
         `create` method
         """
         if storage_config:
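All of these static helpers follow the same pattern against the renamed `/dataset-qa/...` routes. A usage sketch with a hypothetical dataset name:

```python
from hirundo.dataset_qa import QADataset

dataset = QADataset.get_by_name("my-dataset")       # hypothetical name
same_dataset = QADataset.get_by_id(dataset.id)      # round-trip by ID
print([d.name for d in QADataset.list_datasets()])  # QADatasetOut entries
QADataset.delete_by_id(dataset.id)
```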
@@ -357,7 +400,7 @@ class OptimizationDataset(BaseModel):
         replace_if_exists: bool = False,
     ) -> int:
         """
-        Create a `OptimizationDataset` instance on the server.
+        Create a `QADataset` instance on the server.
         If the `storage_config_id` field is not set, the storage config will also be created and the field will be set.

         Args:
@@ -366,7 +409,7 @@ class OptimizationDataset(BaseModel):
         (this is determined by a dataset of the same name in the same organization).

         Returns:
-            The ID of the created `OptimizationDataset` instance
+            The ID of the created `QADataset` instance
         """
         if self.storage_config is None and self.storage_config_id is None:
             raise ValueError("No dataset storage has been provided")
@@ -391,7 +434,7 @@ class OptimizationDataset(BaseModel):
         model_dict = self.model_dump(mode="json")
         # ⬆️ Get dict of model fields from Pydantic model instance
         dataset_response = requests.post(
-            f"{API_HOST}/dataset-optimization/dataset/",
+            f"{API_HOST}/dataset-qa/dataset/",
             json={
                 **{k: model_dict[k] for k in model_dict.keys() - {"storage_config"}},
                 "organization_id": organization_id,
@@ -408,17 +451,17 @@ class OptimizationDataset(BaseModel):
         return self.id

     @staticmethod
-    def launch_optimization_run(
+    def launch_qa_run(
         dataset_id: int,
         organization_id: typing.Optional[int] = None,
         run_args: typing.Optional[RunArgs] = None,
     ) -> str:
         """
-        Run the dataset optimization process on the server using the dataset with the given ID
+        Run the dataset QA process on the server using the dataset with the given ID
         i.e. `dataset_id`.

         Args:
-            dataset_id: The ID of the dataset to run optimization on.
+            dataset_id: The ID of the dataset to run QA on.

         Returns:
             ID of the run (`run_id`).
@@ -429,7 +472,7 @@ class OptimizationDataset(BaseModel):
         if run_args:
             run_info["run_args"] = run_args.model_dump(mode="json")
         run_response = requests.post(
-            f"{API_HOST}/dataset-optimization/run/{dataset_id}",
+            f"{API_HOST}/dataset-qa/run/{dataset_id}",
             json=run_info if len(run_info) > 0 else None,
             headers=get_headers(),
             timeout=MODIFY_TIMEOUT,
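Taken together, `create` registers the dataset and `launch_qa_run` starts a run against it; `run_qa` (below) wraps both. A sketch assuming `test_dataset` is an already-configured instance:

```python
from hirundo import QADataset

# Assumes `test_dataset` is a configured QADataset instance
# (see the README examples in the METADATA diff further below).
dataset_id = test_dataset.create(replace_if_exists=True)
run_id = QADataset.launch_qa_run(dataset_id)  # returns the new run's ID
print(run_id)
```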
@@ -440,12 +483,16 @@ class OptimizationDataset(BaseModel):
     def _validate_run_args(self, run_args: RunArgs) -> None:
         if self.labeling_type == LabelingType.SPEECH_TO_TEXT:
             raise Exception("Speech to text cannot have `run_args` set")
-        if
-
-
-
-
-
+        if (
+            self.labeling_type != LabelingType.OBJECT_DETECTION
+            and isinstance(run_args, ObjectDetectionRunArgs)
+            and any(
+                (
+                    run_args.min_abs_bbox_size != 0,
+                    run_args.min_abs_bbox_area != 0,
+                    run_args.min_rel_bbox_size != 0,
+                    run_args.min_rel_bbox_area != 0,
+                )
             )
         ):
             raise Exception(
@@ -454,7 +501,7 @@ class OptimizationDataset(BaseModel):
                 + f"labeling type {self.labeling_type}"
             )

-    def run_optimization(
+    def run_qa(
         self,
         organization_id: typing.Optional[int] = None,
         replace_dataset_if_exists: bool = False,
@@ -462,13 +509,13 @@ class OptimizationDataset(BaseModel):
     ) -> str:
         """
         If the dataset was not created on the server yet, it is created.
-        Run the dataset optimization process on the server using the active `OptimizationDataset` instance
+        Run the dataset QA process on the server using the active `QADataset` instance

         Args:
-            organization_id: The ID of the organization to run the optimization for.
+            organization_id: The ID of the organization to run the QA for.
             replace_dataset_if_exists: If True, the dataset will be replaced if it already exists
             (this is determined by a dataset of the same name in the same organization).
-            run_args: The run arguments to use for the optimization run
+            run_args: The run arguments to use for the QA run

         Returns:
             An ID of the run (`run_id`) and stores that `run_id` on the instance
@@ -478,7 +525,7 @@ class OptimizationDataset(BaseModel):
             self.id = self.create(replace_if_exists=replace_dataset_if_exists)
         if run_args is not None:
             self._validate_run_args(run_args)
-        run_id = self.launch_optimization_run(self.id, organization_id, run_args)
+        run_id = self.launch_qa_run(self.id, organization_id, run_args)
         self.run_id = run_id
         logger.info("Started the run with ID: %s", run_id)
         return run_id
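One detail worth noting in the rewritten guard: the bbox filters' new `None` defaults also compare unequal to `0`, so any `ObjectDetectionRunArgs` instance trips the check on a non-object-detection dataset. A small illustration of the logic as written in the diff:

```python
from hirundo.dataset_qa import ObjectDetectionRunArgs

args = ObjectDetectionRunArgs()  # all bbox filters default to None
# Each of these is True, because None != 0:
print(args.min_abs_bbox_size != 0, args.min_rel_bbox_area != 0)
# So for a dataset whose labeling_type is not OBJECT_DETECTION,
# _validate_run_args(args) raises even with untouched defaults.
```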
@@ -516,7 +563,7 @@ class OptimizationDataset(BaseModel):
         for sse in iter_sse_retrying(
             client,
             "GET",
-            f"{API_HOST}/dataset-optimization/run/{run_id}",
+            f"{API_HOST}/dataset-qa/run/{run_id}",
             headers=get_headers(),
         ):
             if sse.event == "ping":
@@ -542,50 +589,46 @@ class OptimizationDataset(BaseModel):
                 raise HirundoError("Unknown error")
             yield data
         if not last_event or last_event["data"]["state"] == RunStatus.PENDING.value:
-            OptimizationDataset._check_run_by_id(run_id, retry + 1)
+            QADataset._check_run_by_id(run_id, retry + 1)

     @staticmethod
     def _handle_failure(iteration: dict):
         if iteration["result"]:
-            raise HirundoError(
-                f"Optimization run failed with error: {iteration['result']}"
-            )
+            raise HirundoError(f"QA run failed with error: {iteration['result']}")
         else:
-            raise HirundoError(
-                "Optimization run failed with an unknown error in _handle_failure"
-            )
+            raise HirundoError("QA run failed with an unknown error in _handle_failure")

     @staticmethod
     @overload
     def check_run_by_id(
         run_id: str, stop_on_manual_approval: typing.Literal[True]
-    ) -> typing.Optional[DatasetOptimizationResults]: ...
+    ) -> typing.Optional[DatasetQAResults]: ...

     @staticmethod
     @overload
     def check_run_by_id(
         run_id: str, stop_on_manual_approval: typing.Literal[False] = False
-    ) -> DatasetOptimizationResults: ...
+    ) -> DatasetQAResults: ...

     @staticmethod
     @overload
     def check_run_by_id(
         run_id: str, stop_on_manual_approval: bool
-    ) -> typing.Optional[DatasetOptimizationResults]: ...
+    ) -> typing.Optional[DatasetQAResults]: ...

     @staticmethod
     def check_run_by_id(
         run_id: str, stop_on_manual_approval: bool = False
-    ) -> typing.Optional[DatasetOptimizationResults]:
+    ) -> typing.Optional[DatasetQAResults]:
         """
         Check the status of a run given its ID

         Args:
-            run_id: The `run_id` produced by a `run_optimization` call
+            run_id: The `run_id` produced by a `run_qa` call
             stop_on_manual_approval: If True, the function will return `None` if the run is awaiting manual approval

         Returns:
-            A DatasetOptimizationResults object with the results of the optimization run
+            A DatasetQAResults object with the results of the QA run

         Raises:
             HirundoError: If the maximum number of retries is reached or if the run fails
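The overloads encode that the return value can only be `None` when `stop_on_manual_approval=True`; a typed call site might look like this (hypothetical run ID):

```python
from hirundo.dataset_qa import QADataset

maybe_results = QADataset.check_run_by_id("run-1234", stop_on_manual_approval=True)
if maybe_results is None:
    print("Run is awaiting manual approval")  # returned early, no results yet
else:
    print(maybe_results.cached_zip_path)
```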
@@ -593,7 +636,7 @@ class OptimizationDataset(BaseModel):
         logger.debug("Checking run with ID: %s", run_id)
         with logging_redirect_tqdm():
             t = tqdm(total=100.0)
-            for iteration in OptimizationDataset._check_run_by_id(run_id):
+            for iteration in QADataset._check_run_by_id(run_id):
                 if iteration["state"] in STATUS_TO_PROGRESS_MAP:
                     t.set_description(STATUS_TO_TEXT_MAP[iteration["state"]])
                     t.n = STATUS_TO_PROGRESS_MAP[iteration["state"]]
@@ -608,11 +651,11 @@ class OptimizationDataset(BaseModel):
                         "State is failure, rejected, or revoked: %s",
                         iteration["state"],
                     )
-                    OptimizationDataset._handle_failure(iteration)
+                    QADataset._handle_failure(iteration)
                 elif iteration["state"] == RunStatus.SUCCESS.value:
                     t.close()
                     zip_temporary_url = iteration["result"]
-                    logger.debug("Optimization run completed. Downloading results")
+                    logger.debug("QA run completed. Downloading results")

                     return download_and_extract_zip(
                         run_id,
@@ -644,7 +687,7 @@ class OptimizationDataset(BaseModel):
                     stage = "Unknown progress state"
                     current_progress_percentage = t.n  # Keep the same progress
                 desc = (
-                    "Optimization run completed. Uploading results"
+                    "QA run completed. Uploading results"
                     if current_progress_percentage == 100.0
                     else stage
                 )
@@ -652,28 +695,26 @@ class OptimizationDataset(BaseModel):
                 t.n = current_progress_percentage
                 logger.debug("Setting progress to %s", t.n)
                 t.refresh()
-        raise HirundoError(
-            "Optimization run failed with an unknown error in check_run_by_id"
-        )
+        raise HirundoError("QA run failed with an unknown error in check_run_by_id")

     @overload
     def check_run(
         self, stop_on_manual_approval: typing.Literal[True]
-    ) -> typing.Optional[DatasetOptimizationResults]: ...
+    ) -> typing.Optional[DatasetQAResults]: ...

     @overload
     def check_run(
         self, stop_on_manual_approval: typing.Literal[False] = False
-    ) -> DatasetOptimizationResults: ...
+    ) -> DatasetQAResults: ...

     def check_run(
         self, stop_on_manual_approval: bool = False
-    ) -> typing.Optional[DatasetOptimizationResults]:
+    ) -> typing.Optional[DatasetQAResults]:
         """
         Check the status of the current active instance's run.

         Returns:
-            A pandas DataFrame with the results of the optimization run
+            A pandas DataFrame with the results of the QA run

         """
         if not self.run_id:
@@ -690,7 +731,7 @@ class OptimizationDataset(BaseModel):
         This generator will produce values to show progress of the run.

         Args:
-            run_id: The `run_id` produced by a `run_optimization` call
+            run_id: The `run_id` produced by a `run_qa` call
             retry: A number used to track the number of retries to limit re-checks. *Do not* provide this value manually.

         Yields:
@@ -709,7 +750,7 @@ class OptimizationDataset(BaseModel):
         async_iterator = await aiter_sse_retrying(
             client,
             "GET",
-            f"{API_HOST}/dataset-optimization/run/{run_id}",
+            f"{API_HOST}/dataset-qa/run/{run_id}",
             headers=get_headers(),
         )
         async for sse in async_iterator:
@@ -725,7 +766,7 @@ class OptimizationDataset(BaseModel):
             last_event = json.loads(sse.data)
             yield last_event["data"]
         if not last_event or last_event["data"]["state"] == RunStatus.PENDING.value:
-            OptimizationDataset.acheck_run_by_id(run_id, retry + 1)
+            QADataset.acheck_run_by_id(run_id, retry + 1)

     async def acheck_run(self) -> AsyncGenerator[dict, None]:
         """
@@ -749,14 +790,14 @@ class OptimizationDataset(BaseModel):
     @staticmethod
     def cancel_by_id(run_id: str) -> None:
         """
-        Cancel the dataset optimization run for the given `run_id`.
+        Cancel the dataset QA run for the given `run_id`.

         Args:
             run_id: The ID of the run to cancel
         """
         logger.info("Cancelling run with ID: %s", run_id)
         response = requests.delete(
-            f"{API_HOST}/dataset-optimization/run/{run_id}",
+            f"{API_HOST}/dataset-qa/run/{run_id}",
             headers=get_headers(),
             timeout=MODIFY_TIMEOUT,
         )
@@ -773,14 +814,14 @@ class OptimizationDataset(BaseModel):
     @staticmethod
     def archive_run_by_id(run_id: str) -> None:
         """
-        Archive the dataset optimization run for the given `run_id`.
+        Archive the dataset QA run for the given `run_id`.

         Args:
             run_id: The ID of the run to archive
         """
         logger.info("Archiving run with ID: %s", run_id)
         response = requests.patch(
-            f"{API_HOST}/dataset-optimization/run/archive/{run_id}",
+            f"{API_HOST}/dataset-qa/run/archive/{run_id}",
             headers=get_headers(),
             timeout=MODIFY_TIMEOUT,
         )
@@ -795,7 +836,7 @@ class OptimizationDataset(BaseModel):
         self.archive_run_by_id(self.run_id)


-class DataOptimizationDatasetOut(BaseModel):
+class QADatasetOut(BaseModel):
     id: int

     name: str
@@ -814,7 +855,7 @@ class DataOptimizationDatasetOut(BaseModel):
     updated_at: datetime.datetime


-class DataOptimizationRunOut(BaseModel):
+class DataQARunOut(BaseModel):
     id: int
     name: str
     dataset_id: int
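Run lifecycle management mirrors the URL rename; a short sketch with a hypothetical run ID:

```python
from hirundo.dataset_qa import QADataset

RUN_ID = "run-1234"  # hypothetical
QADataset.cancel_by_id(RUN_ID)       # DELETE {API_HOST}/dataset-qa/run/{run_id}
QADataset.archive_run_by_id(RUN_ID)  # PATCH  {API_HOST}/dataset-qa/run/archive/{run_id}
```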
hirundo/{dataset_optimization_results.py → dataset_qa_results.py}
RENAMED
@@ -21,7 +21,7 @@ if has_polars:
 T = typing.TypeVar("T")


-class DatasetOptimizationResults(BaseModel, typing.Generic[T]):
+class DatasetQAResults(BaseModel, typing.Generic[T]):
     model_config = {"arbitrary_types_allowed": True}

     cached_zip_path: Path
@@ -30,13 +30,13 @@ class DatasetOptimizationResults(BaseModel, typing.Generic[T]):
     """
     suspects: T
     """
-    A polars/pandas DataFrame containing the results of the dataset optimization run
+    A polars/pandas DataFrame containing the results of the data QA run
     """
     object_suspects: typing.Optional[T]
     """
-    A polars/pandas DataFrame containing the object-level results of the dataset optimization run
+    A polars/pandas DataFrame containing the object-level results of the data QA run
     """
     warnings_and_errors: T
     """
-    A polars/pandas DataFrame containing the warnings and errors of the dataset optimization run
+    A polars/pandas DataFrame containing the warnings and errors of the data QA run
     """
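Only the class name and docstrings changed here; field access is unchanged. A sketch assuming `test_dataset` is a configured `QADataset` whose run has completed:

```python
# `results` is a DatasetQAResults; `test_dataset` is hypothetical.
results = test_dataset.check_run()
print(results.cached_zip_path)           # where the downloaded zip was cached
print(results.suspects)                  # suspect-level DataFrame
if results.object_suspects is not None:  # only present for object-level runs
    print(results.object_suspects)
print(results.warnings_and_errors)
```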
hirundo/git.py
CHANGED
@@ -3,13 +3,12 @@ import re
 import typing

 import pydantic
-import requests
 from pydantic import BaseModel, field_validator
 from pydantic_core import Url

 from hirundo._env import API_HOST
 from hirundo._headers import get_headers
-from hirundo._http import raise_for_status_with_reason
+from hirundo._http import raise_for_status_with_reason, requests
 from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
 from hirundo._urls import RepoUrl
 from hirundo.logger import get_logger
hirundo/storage.py
CHANGED
@@ -2,13 +2,12 @@ import typing
 from pathlib import Path

 import pydantic
-import requests
 from pydantic import BaseModel, model_validator
 from pydantic_core import Url

 from hirundo._env import API_HOST
 from hirundo._headers import get_headers
-from hirundo._http import raise_for_status_with_reason
+from hirundo._http import raise_for_status_with_reason, requests
 from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
 from hirundo._urls import S3BucketUrl, StorageConfigName
 from hirundo.dataset_enum import StorageTypes
hirundo/unzip.py
CHANGED
@@ -4,7 +4,6 @@ from collections.abc import Mapping
 from pathlib import Path
 from typing import IO, cast

-import requests
 from pydantic_core import Url

 from hirundo._dataframe import (
@@ -18,10 +17,11 @@ from hirundo._dataframe import (
 )
 from hirundo._env import API_HOST
 from hirundo._headers import _get_auth_headers
+from hirundo._http import requests
 from hirundo._timeouts import DOWNLOAD_READ_TIMEOUT
-from hirundo.dataset_optimization_results import (
+from hirundo.dataset_qa_results import (
     DataFrameType,
-    DatasetOptimizationResults,
+    DatasetQAResults,
 )
 from hirundo.logger import get_logger

@@ -117,7 +117,7 @@ def get_mislabel_suspect_filename(filenames: list[str]):

 def download_and_extract_zip(
     run_id: str, zip_url: str
-) -> DatasetOptimizationResults[DataFrameType]:
+) -> DatasetQAResults[DataFrameType]:
     """
     Download and extract the zip file from the given URL.

@@ -127,11 +127,11 @@ def download_and_extract_zip(
     and `warnings_and_errors.csv` files from the zip file.

     Args:
-        run_id: The ID of the dataset optimization run.
+        run_id: The ID of the dataset QA run.
         zip_url: The URL of the zip file to download.

     Returns:
-        The dataset optimization results object.
+        The dataset QA results object.
     """
     # Define the local file path
     cache_dir = Path.home() / ".hirundo" / "cache"
@@ -140,9 +140,8 @@ def download_and_extract_zip(

     headers = None
     if Url(zip_url).scheme == "file":
-        zip_url = (
-            f"{API_HOST}/dataset-optimization/run/local-download"
-            + zip_url.replace("file://", "")
-        )
+        zip_url = f"{API_HOST}/dataset-qa/run/local-download" + zip_url.replace(
+            "file://", ""
+        )
         headers = _get_auth_headers()
     # Stream the zip file download
@@ -217,7 +216,7 @@ def download_and_extract_zip(
             "Failed to load warnings and errors into DataFrame", exc_info=e
         )

-    return DatasetOptimizationResults[DataFrameType](
+    return DatasetQAResults[DataFrameType](
         cached_zip_path=zip_file_path,
         suspects=suspects_df,
         object_suspects=object_suspects_df,
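The rewrapped block above only changes line wrapping; the behaviour — rewriting `file://` results to the authenticated local-download endpoint — is the same. A standalone sketch of that string logic (the API_HOST value is illustrative):

```python
API_HOST = "https://api.example.com"  # illustrative value
zip_url = "file:///tmp/results.zip"

if zip_url.startswith("file://"):
    zip_url = f"{API_HOST}/dataset-qa/run/local-download" + zip_url.replace(
        "file://", ""
    )

print(zip_url)
# https://api.example.com/dataset-qa/run/local-download/tmp/results.zip
```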
{hirundo-0.1.18.dist-info → hirundo-0.1.21.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hirundo
-Version: 0.1.18
+Version: 0.1.21
 Summary: This package is used to interface with Hirundo's platform. It provides a simple API to optimize your ML datasets.
 Author-email: Hirundo <dev@hirundo.io>
 License: MIT License
@@ -13,7 +13,7 @@ License: MIT License

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

-Project-URL: Homepage, https://github.com/Hirundo-io/hirundo-client
+Project-URL: Homepage, https://github.com/Hirundo-io/hirundo-python-sdk
 Keywords: dataset,machine learning,data science,data engineering
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python
@@ -32,6 +32,10 @@ Requires-Dist: httpx>=0.27.0
 Requires-Dist: stamina>=24.2.0
 Requires-Dist: httpx-sse>=0.4.0
 Requires-Dist: tqdm>=4.66.5
+Requires-Dist: h11>=0.16.0
+Requires-Dist: requests>=2.32.4
+Requires-Dist: urllib3>=2.5.0
+Requires-Dist: setuptools>=78.1.1
 Provides-Extra: dev
 Requires-Dist: pyyaml>=6.0.1; extra == "dev"
 Requires-Dist: types-PyYAML>=6.0.12; extra == "dev"
@@ -46,13 +50,15 @@ Requires-Dist: stamina>=24.2.0; extra == "dev"
 Requires-Dist: httpx-sse>=0.4.0; extra == "dev"
 Requires-Dist: pytest>=8.2.0; extra == "dev"
 Requires-Dist: pytest-asyncio>=0.23.6; extra == "dev"
-Requires-Dist: uv>=0.
+Requires-Dist: uv>=0.8.6; extra == "dev"
 Requires-Dist: pre-commit>=3.7.1; extra == "dev"
 Requires-Dist: virtualenv>=20.6.6; extra == "dev"
-Requires-Dist: ruff>=0.
+Requires-Dist: ruff>=0.12.0; extra == "dev"
 Requires-Dist: bumpver; extra == "dev"
 Requires-Dist: platformdirs>=4.3.6; extra == "dev"
 Requires-Dist: safety>=3.2.13; extra == "dev"
+Requires-Dist: cryptography>=44.0.1; extra == "dev"
+Requires-Dist: jinja2>=3.1.6; extra == "dev"
 Provides-Extra: docs
 Requires-Dist: sphinx>=7.4.7; extra == "docs"
 Requires-Dist: sphinx-autobuild>=2024.9.3; extra == "docs"
@@ -61,8 +67,9 @@ Requires-Dist: autodoc_pydantic>=2.2.0; extra == "docs"
 Requires-Dist: furo; extra == "docs"
 Requires-Dist: sphinx-multiversion; extra == "docs"
 Requires-Dist: esbonio; extra == "docs"
-Requires-Dist: starlette
+Requires-Dist: starlette>=0.47.2; extra == "docs"
 Requires-Dist: markupsafe>=3.0.2; extra == "docs"
+Requires-Dist: jinja2>=3.1.6; extra == "docs"
 Provides-Extra: pandas
 Requires-Dist: pandas>=2.2.3; extra == "pandas"
 Provides-Extra: polars
@@ -71,9 +78,9 @@ Dynamic: license-file

 # Hirundo

-This package exposes access to Hirundo APIs for dataset optimization for Machine Learning.
+This package exposes access to Hirundo APIs for dataset QA for Machine Learning.

-Dataset optimization is currently available for datasets labelled for classification and object detection.
+Dataset QA is currently available for datasets labelled for classification and object detection.

 Support dataset storage configs include:
@@ -144,7 +151,7 @@ Classification example:
 from hirundo import (
     HirundoCSV,
     LabelingType,
-    OptimizationDataset,
+    QADataset,
     StorageGCP,
     StorageConfig,
     StorageTypes,
@@ -155,7 +162,7 @@ gcp_bucket = StorageGCP(
     project="Hirundo-global",
     credentials_json=json.loads(os.environ["GCP_CREDENTIALS"]),
 )
-test_dataset = OptimizationDataset(
+test_dataset = QADataset(
     name="TEST-GCP cifar 100 classification dataset",
     labeling_type=LabelingType.SINGLE_LABEL_CLASSIFICATION,
     storage_config=StorageConfig(
@@ -170,7 +177,7 @@ test_dataset = OptimizationDataset(
     classes=cifar100_classes,
 )

-test_dataset.run_optimization()
+test_dataset.run_qa()
 results = test_dataset.check_run()
 print(results)
 ```
@@ -182,7 +189,7 @@ from hirundo import (
     GitRepo,
     HirundoCSV,
     LabelingType,
-    OptimizationDataset,
+    QADataset,
     StorageGit,
     StorageConfig,
     StorageTypes,
@@ -195,7 +202,7 @@ git_storage = StorageGit(
     ),
     branch="main",
 )
-test_dataset = OptimizationDataset(
+test_dataset = QADataset(
     name="TEST-HuggingFace-BDD-100k-validation-OD-validation-dataset",
     labeling_type=LabelingType.OBJECT_DETECTION,
     storage_config=StorageConfig(
@@ -211,7 +218,7 @@ test_dataset = OptimizationDataset(
     ),
 )

-test_dataset.run_optimization()
+test_dataset.run_qa()
 results = test_dataset.check_run()
 print(results)
 ```
@@ -220,4 +227,4 @@ Note: Currently we only support the main CPython release 3.9, 3.10, 3.11, 3.12 &

 ## Further documentation

-To learn more about how to use this library, please visit the [http://docs.hirundo.io/](documentation) or see the [Google Colab examples](https://github.com/Hirundo-io/hirundo-client/tree/main/notebooks).
+To learn more about how to use this library, please visit the [http://docs.hirundo.io/](documentation) or see the [Google Colab examples](https://github.com/Hirundo-io/hirundo-python-sdk/tree/main/notebooks).
hirundo-0.1.21.dist-info/RECORD
ADDED
@@ -0,0 +1,25 @@
+hirundo/__init__.py,sha256=GxRK_DHPKG1aqxNa19imqspHRAvBHSAQ5Q0fDwJPCDE,1341
+hirundo/__main__.py,sha256=wcCrL4PjG51r5wVKqJhcoJPTLfHW0wNbD31DrUN0MWI,28
+hirundo/_constraints.py,sha256=slW7Rk9Ml5fuwjnXTLUvHIhnY_9hmcUUy57v9hFog1o,6003
+hirundo/_dataframe.py,sha256=sXEEbCNcLi83wyU9ii884YikCzfASo_3nnrDxhuCv7U,758
+hirundo/_env.py,sha256=efX2sjvYlHkFr2Lcstelei67YSTFpVGT0l08ZsfiMuE,622
+hirundo/_headers.py,sha256=Cwha8gXEQNXL2lc9Lb1klLotkMLD82XOpAdX33TLVj8,521
+hirundo/_http.py,sha256=0kfoznumU3jinHhJIpB6qn5Mt4a3kso59GNXVbpWH7M,2267
+hirundo/_iter_sse_retrying.py,sha256=xNpf3W5qAHkKPJz8H4NZjKE3CrI_8b3m1iYeahdpdEc,4653
+hirundo/_timeouts.py,sha256=gE58NU0t2e4KgKq2sk5rZcezDJAkgvRIbM5AVYFY6Ho,86
+hirundo/_urls.py,sha256=0C85EbL0T-Bj25vJwjNs_obUG8ROSADpmbFdTAyhzlw,1375
+hirundo/cli.py,sha256=u-LsrN17-J7temjrq6NeUGnJ4mO04tMCiQYqVMm6el8,7752
+hirundo/dataset_enum.py,sha256=QnS3fy1OF4wvUtiIAHubKRhc611idS8huopEEolgqEM,1217
+hirundo/dataset_qa.py,sha256=U7cqV4JbYkaByXEf2XdoJrQZ_rI9pgDxrXVbQLc50R8,32470
+hirundo/dataset_qa_results.py,sha256=1F7JhRf7TQomwW9tjbNn8OBrhWHwEaWOND80r39l5uY,1104
+hirundo/git.py,sha256=cBjP7kPnaUHR77FI5ZaERst38eTUDy8q1gAQzy45EB4,6567
+hirundo/labeling.py,sha256=zXQCaqfdaLIG4qbzFGbb94L3FDdRMpdzHwbrDJE07Yk,5006
+hirundo/logger.py,sha256=MUqrYp0fBlxWFhGl6P5t19_uqO7T_PNhrLN5bqY3i7s,275
+hirundo/storage.py,sha256=MPKxkhrBmX84Yuexd4QoLDdVIJHrll9RosCLUsz5q3c,15936
+hirundo/unzip.py,sha256=3aPOsBvF-ZgAumHnQ6hq7JtbFUe9eRRRFsiI6K8cRDE,8188
+hirundo-0.1.21.dist-info/licenses/LICENSE,sha256=fusGGjqT2RGlU6kbkaOk7d-gDnsjk17wq67AO0mwBZI,1065
+hirundo-0.1.21.dist-info/METADATA,sha256=3m7R5dMN5h_C-L2Wl76lzYjpreP5upyHcEkIoAZF1lY,9497
+hirundo-0.1.21.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+hirundo-0.1.21.dist-info/entry_points.txt,sha256=4ZtnA_Nl1Af8fLnHp3lwjbGDEGU1S6ujb_JwtuQ7ZPM,44
+hirundo-0.1.21.dist-info/top_level.txt,sha256=cmyNqrNZOAYxnywJGFI1AJBLe4SkH8HGsfFx6ncdrbI,8
+hirundo-0.1.21.dist-info/RECORD,,
hirundo-0.1.18.dist-info/RECORD
DELETED
@@ -1,25 +0,0 @@
-hirundo/__init__.py,sha256=1Uy9UZhaZPQQSMfAOJ0A_Of70tM8_MDq-HHdhrmpO6g,1301
-hirundo/__main__.py,sha256=wcCrL4PjG51r5wVKqJhcoJPTLfHW0wNbD31DrUN0MWI,28
-hirundo/_constraints.py,sha256=tgJfvp7ydyXilT8ViNk837rNRlpOVXLLeCSMt_YUUYA,6013
-hirundo/_dataframe.py,sha256=sXEEbCNcLi83wyU9ii884YikCzfASo_3nnrDxhuCv7U,758
-hirundo/_env.py,sha256=efX2sjvYlHkFr2Lcstelei67YSTFpVGT0l08ZsfiMuE,622
-hirundo/_headers.py,sha256=3hybpD_X4SODv3cFZPt9AjGY2vvZaag5OKT3z1SHSjA,521
-hirundo/_http.py,sha256=izlnuxStyPugjTAbD8Lo30tA4lZJ5d3kOENNduqrbX4,573
-hirundo/_iter_sse_retrying.py,sha256=U331_wZRIbVzi-jnMqo8bp9jBC8MtFBLEs-X0ZvhSDw,4634
-hirundo/_timeouts.py,sha256=gE58NU0t2e4KgKq2sk5rZcezDJAkgvRIbM5AVYFY6Ho,86
-hirundo/_urls.py,sha256=0C85EbL0T-Bj25vJwjNs_obUG8ROSADpmbFdTAyhzlw,1375
-hirundo/cli.py,sha256=5Tn0eXZGG92BR9HJYUaYozjFbS1t6UTw_I2R0tZBE04,7824
-hirundo/dataset_enum.py,sha256=QnS3fy1OF4wvUtiIAHubKRhc611idS8huopEEolgqEM,1217
-hirundo/dataset_optimization.py,sha256=fXi8MeI0PWwSyc5NuOzCrkgXT_mz24NV-dGOHDPkBR0,31256
-hirundo/dataset_optimization_results.py,sha256=A9YyF5zaZXVtzeDE08I_05v90dhZQADpSjDcS_6eLMc,1129
-hirundo/git.py,sha256=8LVnF4WCjZsxMHoRaVxbLiDAKpGCBEwlcZp7a30n9Zo,6573
-hirundo/labeling.py,sha256=zXQCaqfdaLIG4qbzFGbb94L3FDdRMpdzHwbrDJE07Yk,5006
-hirundo/logger.py,sha256=MUqrYp0fBlxWFhGl6P5t19_uqO7T_PNhrLN5bqY3i7s,275
-hirundo/storage.py,sha256=y7cr_dngkfZq0gKnwWxrSqUXb1SycGpwFRVmS9Cn3h8,15942
-hirundo/unzip.py,sha256=XJqvt2m5pWR-G-fnzgW75VOdd-K4_Rw2r4wiEhZgKZA,8245
-hirundo-0.1.18.dist-info/licenses/LICENSE,sha256=fusGGjqT2RGlU6kbkaOk7d-gDnsjk17wq67AO0mwBZI,1065
-hirundo-0.1.18.dist-info/METADATA,sha256=F_F0-EfUxVVCcgFue_hwCtxfIfmqBlwnpvzELuhMkAc,9302
-hirundo-0.1.18.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-hirundo-0.1.18.dist-info/entry_points.txt,sha256=4ZtnA_Nl1Af8fLnHp3lwjbGDEGU1S6ujb_JwtuQ7ZPM,44
-hirundo-0.1.18.dist-info/top_level.txt,sha256=cmyNqrNZOAYxnywJGFI1AJBLe4SkH8HGsfFx6ncdrbI,8
-hirundo-0.1.18.dist-info/RECORD,,
{hirundo-0.1.18.dist-info → hirundo-0.1.21.dist-info}/WHEEL
File without changes
{hirundo-0.1.18.dist-info → hirundo-0.1.21.dist-info}/entry_points.txt
File without changes
{hirundo-0.1.18.dist-info → hirundo-0.1.21.dist-info}/licenses/LICENSE
File without changes
{hirundo-0.1.18.dist-info → hirundo-0.1.21.dist-info}/top_level.txt
File without changes