hirundo 0.1.18__py3-none-any.whl → 0.2.3.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hirundo/__init__.py +28 -8
- hirundo/_constraints.py +3 -4
- hirundo/_headers.py +1 -1
- hirundo/_http.py +53 -0
- hirundo/_iter_sse_retrying.py +8 -5
- hirundo/_llm_pipeline.py +153 -0
- hirundo/_run_checking.py +283 -0
- hirundo/_urls.py +1 -0
- hirundo/cli.py +8 -11
- hirundo/dataset_enum.py +2 -0
- hirundo/{dataset_optimization.py → dataset_qa.py} +213 -256
- hirundo/{dataset_optimization_results.py → dataset_qa_results.py} +7 -7
- hirundo/git.py +8 -10
- hirundo/labeling.py +22 -19
- hirundo/storage.py +26 -26
- hirundo/unlearning_llm.py +599 -0
- hirundo/unzip.py +12 -13
- {hirundo-0.1.18.dist-info → hirundo-0.2.3.post1.dist-info}/METADATA +59 -20
- hirundo-0.2.3.post1.dist-info/RECORD +28 -0
- {hirundo-0.1.18.dist-info → hirundo-0.2.3.post1.dist-info}/WHEEL +1 -1
- hirundo-0.1.18.dist-info/RECORD +0 -25
- {hirundo-0.1.18.dist-info → hirundo-0.2.3.post1.dist-info}/entry_points.txt +0 -0
- {hirundo-0.1.18.dist-info → hirundo-0.2.3.post1.dist-info}/licenses/LICENSE +0 -0
- {hirundo-0.1.18.dist-info → hirundo-0.2.3.post1.dist-info}/top_level.txt +0 -0
hirundo/{dataset_optimization_results.py → dataset_qa_results.py}
RENAMED

@@ -11,17 +11,17 @@ DataFrameType = TypeAliasType("DataFrameType", None)
 if has_pandas:
     from hirundo._dataframe import pd

-    DataFrameType = TypeAliasType("DataFrameType",
+    DataFrameType = TypeAliasType("DataFrameType", pd.DataFrame | None)
 if has_polars:
     from hirundo._dataframe import pl

-    DataFrameType = TypeAliasType("DataFrameType",
+    DataFrameType = TypeAliasType("DataFrameType", pl.DataFrame | None)


 T = typing.TypeVar("T")


-class DatasetOptimizationResults(BaseModel, typing.Generic[T]):
+class DatasetQAResults(BaseModel, typing.Generic[T]):
     model_config = {"arbitrary_types_allowed": True}

     cached_zip_path: Path
@@ -30,13 +30,13 @@ class DatasetOptimizationResults(BaseModel, typing.Generic[T]):
     """
     suspects: T
     """
-    A polars/pandas DataFrame containing the results of the
+    A polars/pandas DataFrame containing the results of the data QA run
     """
-    object_suspects:
+    object_suspects: T | None
     """
-    A polars/pandas DataFrame containing the object-level results of the
+    A polars/pandas DataFrame containing the object-level results of the data QA run
     """
     warnings_and_errors: T
     """
-    A polars/pandas DataFrame containing the warnings and errors of the
+    A polars/pandas DataFrame containing the warnings and errors of the data QA run
     """
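The rename from `DatasetOptimizationResults` to `DatasetQAResults` keeps the same field layout, with `object_suspects` now explicitly optional. Below is a minimal sketch of that shape, assuming the class is importable from `hirundo.dataset_qa_results` (the module name after the rename) and that pandas is installed; the sample data is invented, and in practice the object is returned by a QA run rather than built by hand.

```python
from pathlib import Path

import pandas as pd

from hirundo.dataset_qa_results import DatasetQAResults

# Hand-built instance purely to illustrate the field shapes shown in the diff above.
results = DatasetQAResults(
    cached_zip_path=Path("/tmp/qa_run_results.zip"),  # hypothetical local cache path
    suspects=pd.DataFrame({"image": ["img_001.jpg"], "suspect_score": [0.93]}),
    object_suspects=None,  # object-level results are now explicitly optional (T | None)
    warnings_and_errors=pd.DataFrame(columns=["severity", "message"]),
)

print(results.suspects.head())
```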
hirundo/git.py
CHANGED

@@ -1,15 +1,13 @@
 import datetime
 import re
-import typing

 import pydantic
-import requests
 from pydantic import BaseModel, field_validator
 from pydantic_core import Url

 from hirundo._env import API_HOST
 from hirundo._headers import get_headers
-from hirundo._http import raise_for_status_with_reason
+from hirundo._http import raise_for_status_with_reason, requests
 from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
 from hirundo._urls import RepoUrl
 from hirundo.logger import get_logger
@@ -33,14 +31,14 @@ class GitSSHAuth(BaseModel):
     """
     The SSH key for the Git repository
     """
-    ssh_password:
+    ssh_password: str | None
     """
     The password for the SSH key for the Git repository.
     """


 class GitRepo(BaseModel):
-    id:
+    id: int | None = None
     """
     The ID of the Git repository.
     """
@@ -49,25 +47,25 @@ class GitRepo(BaseModel):
     """
     A name to identify the Git repository in the Hirundo system.
     """
-    repository_url:
+    repository_url: str | RepoUrl
     """
     The URL of the Git repository, it should start with `ssh://` or `https://` or be in the form `user@host:path`.
     If it is in the form `user@host:path`, it will be rewritten to `ssh://user@host/path`.
     """
-    organization_id:
+    organization_id: int | None = None
     """
     The ID of the organization that the Git repository belongs to.
     If not provided, it will be assigned to your default organization.
     """

-    plain_auth:
+    plain_auth: GitPlainAuth | None = pydantic.Field(
         default=None, examples=[None, {"username": "ben", "password": "password"}]
     )
     """
     The plain authentication details for the Git repository.
     Use this if using a special user with a username and password for authentication.
     """
-    ssh_auth:
+    ssh_auth: GitSSHAuth | None = pydantic.Field(
         default=None,
         examples=[
             {
@@ -85,7 +83,7 @@ class GitRepo(BaseModel):

     @field_validator("repository_url", mode="before", check_fields=True)
     @classmethod
-    def check_valid_repository_url(cls, repository_url:
+    def check_valid_repository_url(cls, repository_url: str | RepoUrl):
         # Check if the URL has the `@` and `:` pattern with a non-numeric section before the next slash
         match = re.match("([^@]+@[^:]+):([^0-9/][^/]*)/(.+)", str(repository_url))
         if match:
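Besides routing `requests` through `hirundo._http`, this change makes the optional `GitRepo` fields explicit (`int | None`, `GitPlainAuth | None`, `GitSSHAuth | None`). Below is a minimal sketch of the `repository_url` rewrite described in the field docstring, assuming `GitRepo` is importable from `hirundo.git` and that constructing the model performs no API call; the repository details are invented.

```python
from hirundo.git import GitRepo

repo = GitRepo(
    name="my-dataset-repo",
    # scp-style URL; per the field docstring above it should be rewritten to ssh:// form
    repository_url="git@github.com:my-org/my-dataset.git",
)

print(repo.repository_url)  # expected: ssh://git@github.com/my-org/my-dataset.git
```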
hirundo/labeling.py
CHANGED

@@ -3,11 +3,9 @@ from abc import ABC

 from pydantic import BaseModel, Field

+from hirundo._urls import HirundoUrl
 from hirundo.dataset_enum import DatasetMetadataType

-if typing.TYPE_CHECKING:
-    from hirundo._urls import HirundoUrl
-

 class Metadata(BaseModel, ABC, frozen=True):
     type: DatasetMetadataType
@@ -21,7 +19,7 @@ class HirundoCSV(Metadata, frozen=True):
     type: typing.Literal[DatasetMetadataType.HIRUNDO_CSV] = (
         DatasetMetadataType.HIRUNDO_CSV
     )
-    csv_url:
+    csv_url: HirundoUrl
     """
     The URL to access the dataset metadata CSV file.
     e.g. `s3://my-bucket-name/my-folder/my-metadata.csv`, `gs://my-bucket-name/my-folder/my-metadata.csv`,
@@ -36,7 +34,7 @@ class COCO(Metadata, frozen=True):
     """

     type: typing.Literal[DatasetMetadataType.COCO] = DatasetMetadataType.COCO
-    json_url:
+    json_url: HirundoUrl
     """
     The URL to access the dataset metadata JSON file.
     e.g. `s3://my-bucket-name/my-folder/my-metadata.json`, `gs://my-bucket-name/my-folder/my-metadata.json`,
@@ -47,8 +45,18 @@ class COCO(Metadata, frozen=True):

 class YOLO(Metadata, frozen=True):
     type: typing.Literal[DatasetMetadataType.YOLO] = DatasetMetadataType.YOLO
-    data_yaml_url:
-    labels_dir_url:
+    data_yaml_url: HirundoUrl | None = None
+    labels_dir_url: HirundoUrl
+
+
+class HuggingFaceAudio(Metadata, frozen=True):
+    type: typing.Literal[DatasetMetadataType.HuggingFaceAudio] = (
+        DatasetMetadataType.HuggingFaceAudio
+    )
+    audio_column: str
+    text_column: str
+    subset: str | None = None
+    split: str | None = None


 class KeylabsAuth(BaseModel):
@@ -63,7 +71,7 @@ class Keylabs(Metadata, frozen=True):
     Keylabs project ID.
     """

-    labels_dir_url:
+    labels_dir_url: HirundoUrl
     """
     URL to the directory containing the Keylabs labels.
     """
@@ -73,11 +81,11 @@ class Keylabs(Metadata, frozen=True):
     Whether to include attributes in the class name.
     """

-    project_name:
+    project_name: str | None = None
     """
     Keylabs project name (optional; added to output CSV if provided).
     """
-    keylabs_auth:
+    keylabs_auth: KeylabsAuth | None = None
     """
     Keylabs authentication credentials (optional; if provided, used to provide links to each sample).
     """
@@ -107,9 +115,9 @@ class KeylabsObjSegVideo(Keylabs, frozen=True):
     )


-KeylabsInfo =
-    KeylabsObjDetImages
-
+KeylabsInfo = (
+    KeylabsObjDetImages | KeylabsObjDetVideo | KeylabsObjSegImages | KeylabsObjSegVideo
+)
 """
 The dataset labeling info for Keylabs. The dataset labeling info can be one of the following:
 - `DatasetMetadataType.KeylabsObjDetImages`: Indicates that the dataset metadata file is in the Keylabs object detection image format
@@ -118,12 +126,7 @@ The dataset labeling info for Keylabs. The dataset labeling info can be one of t
 - `DatasetMetadataType.KeylabsObjSegVideo`: Indicates that the dataset metadata file is in the Keylabs object segmentation video format
 """
 LabelingInfo = typing.Annotated[
-
-    HirundoCSV,
-    COCO,
-    YOLO,
-    KeylabsInfo,
-    ],
+    HirundoCSV | COCO | YOLO | KeylabsInfo | HuggingFaceAudio,
     Field(discriminator="type"),
 ]
 """
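The labeling unions are collapsed into plain `X | Y` unions, and a new `HuggingFaceAudio` metadata type joins `LabelingInfo`. Below is a minimal sketch of the new type, assuming it is importable from `hirundo.labeling`; the column and split names are invented.

```python
from hirundo.labeling import HuggingFaceAudio

labeling_info = HuggingFaceAudio(
    audio_column="audio",          # column holding the audio samples
    text_column="transcription",   # column holding the transcript text
    split="train",                 # subset and split are optional per the diff above
)

print(labeling_info.type)  # DatasetMetadataType.HuggingFaceAudio
```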
hirundo/storage.py
CHANGED

@@ -1,14 +1,13 @@
-import
+import datetime
 from pathlib import Path

 import pydantic
-import requests
 from pydantic import BaseModel, model_validator
 from pydantic_core import Url

 from hirundo._env import API_HOST
 from hirundo._headers import get_headers
-from hirundo._http import raise_for_status_with_reason
+from hirundo._http import raise_for_status_with_reason, requests
 from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
 from hirundo._urls import S3BucketUrl, StorageConfigName
 from hirundo.dataset_enum import StorageTypes
@@ -21,13 +20,13 @@ S3_PREFIX = "s3://"


 class StorageS3Base(BaseModel):
-    endpoint_url:
+    endpoint_url: Url | None = None
     bucket_url: S3BucketUrl
     region_name: str
     # ⬆️ We could restrict this, but if we're allowing custom endpoints then the validation may be wrong
-    access_key_id:
+    access_key_id: str | None = None

-    def get_url(self, path:
+    def get_url(self, path: str | Path) -> Url:
         """
         Get the full URL for a file in the S3 bucket

@@ -46,7 +45,7 @@ class StorageS3Base(BaseModel):


 class StorageS3(StorageS3Base):
-    secret_access_key:
+    secret_access_key: str | None = None


 class StorageS3Out(StorageS3Base):
@@ -57,7 +56,7 @@ class StorageGCPBase(BaseModel):
     bucket_name: str
     project: str

-    def get_url(self, path:
+    def get_url(self, path: str | Path) -> Url:
         """
         Get the full URL for a file in the GCP bucket

@@ -74,7 +73,7 @@ class StorageGCPBase(BaseModel):


 class StorageGCP(StorageGCPBase):
-    credentials_json:
+    credentials_json: dict | None = None


 class StorageGCPOut(StorageGCPBase):
@@ -105,9 +104,7 @@ class StorageGCPOut(StorageGCPBase):
 # account_url: str


-def get_git_repo_url(
-    repo_url: typing.Union[str, Url], path: typing.Union[str, Path]
-) -> Url:
+def get_git_repo_url(repo_url: str | Url, path: str | Path) -> Url:
     """
     Get the full URL for a file in the git repository

@@ -128,12 +125,12 @@ def get_git_repo_url(


 class StorageGit(BaseModel):
-    repo_id:
+    repo_id: int | None = None
     """
     The ID of the Git repository in the Hirundo system.
     Either :code:`repo_id` or :code:`repo` must be provided.
     """
-    repo:
+    repo: GitRepo | None = None
     """
     The Git repository to link to.
     Either :code:`repo_id` or :code:`repo` must be provided.
@@ -149,7 +146,7 @@ class StorageGit(BaseModel):
             raise ValueError("Either repo_id or repo must be provided")
         return self

-    def get_url(self, path:
+    def get_url(self, path: str | Path) -> Url:
         """
         Get the full URL for a file in the git repository

@@ -172,7 +169,7 @@ class StorageGitOut(BaseModel):
     repo: GitRepoOut
     branch: str

-    def get_url(self, path:
+    def get_url(self, path: str | Path) -> Url:
         """
         Get the full URL for a file in the git repository

@@ -190,12 +187,12 @@ class StorageGitOut(BaseModel):


 class StorageConfig(BaseModel):
-    id:
+    id: int | None = None
     """
     The ID of the :code:`StorageConfig` in the Hirundo system.
     """

-    organization_id:
+    organization_id: int | None = None
     """
     The ID of the organization that the :code:`StorageConfig` belongs to.
     If not provided, it will be assigned to your default organization.
@@ -205,7 +202,7 @@ class StorageConfig(BaseModel):
     """
     A name to identify the :code:`StorageConfig` in the Hirundo system.
     """
-    type:
+    type: StorageTypes | None = pydantic.Field(
         examples=[
             StorageTypes.S3,
             StorageTypes.GCP,
@@ -221,7 +218,7 @@ class StorageConfig(BaseModel):
     - :code:`Azure` (coming soon)
     - :code:`Git`
     """
-    s3:
+    s3: StorageS3 | None = pydantic.Field(
         default=None,
         examples=[
             {
@@ -239,7 +236,7 @@ class StorageConfig(BaseModel):
     The Amazon Web Services (AWS) S3 storage config details.
     Use this if you want to link to an S3 bucket.
     """
-    gcp:
+    gcp: StorageGCP | None = pydantic.Field(
         default=None,
         examples=[
             None,
@@ -282,7 +279,7 @@ class StorageConfig(BaseModel):
     #         None,
     #     ],
     # ) TODO: Azure storage config is coming soon
-    git:
+    git: StorageGit | None = pydantic.Field(
         default=None,
         examples=[
             None,
@@ -341,7 +338,7 @@ class StorageConfig(BaseModel):

     @staticmethod
     def list(
-        organization_id:
+        organization_id: int | None = None,
     ) -> list["ResponseStorageConfig"]:
         """
         Lists all the :code:`StorageConfig`'s created by user's default organization
@@ -441,7 +438,10 @@ class ResponseStorageConfig(BaseModel):
     type: StorageTypes
     organization_name: str
     creator_name: str
-    s3:
-    gcp:
+    s3: StorageS3Out | None
+    gcp: StorageGCPOut | None
     # azure: typing.Optional[StorageAzureOut]
-    git:
+    git: StorageGitOut | None
+
+    created_at: datetime.datetime
+    updated_at: datetime.datetime
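With `endpoint_url`, `access_key_id` and `secret_access_key` now explicitly optional, an S3 storage config can be declared without embedding credentials. Below is a minimal sketch, assuming `StorageConfig` and `StorageS3` are importable from `hirundo.storage`, that `StorageTypes` comes from `hirundo.dataset_enum` as in the imports above, and that constructing the models performs no API call; the bucket details are invented.

```python
from hirundo.dataset_enum import StorageTypes
from hirundo.storage import StorageConfig, StorageS3

storage = StorageConfig(
    name="my-s3-storage",
    type=StorageTypes.S3,
    s3=StorageS3(
        bucket_url="s3://my-bucket-name",
        region_name="us-east-1",
        # endpoint_url, access_key_id and secret_access_key are now optional,
        # e.g. for public buckets or credentials supplied out of band
    ),
)

print(storage.type)
```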