hirundo 0.1.16__tar.gz → 0.1.21__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hirundo-0.1.16 → hirundo-0.1.21}/PKG-INFO +67 -53
- {hirundo-0.1.16 → hirundo-0.1.21}/README.md +53 -46
- {hirundo-0.1.16 → hirundo-0.1.21}/hirundo/__init__.py +30 -14
- hirundo-0.1.21/hirundo/_constraints.py +164 -0
- {hirundo-0.1.16 → hirundo-0.1.21}/hirundo/_headers.py +1 -1
- hirundo-0.1.21/hirundo/_http.py +72 -0
- {hirundo-0.1.16 → hirundo-0.1.21}/hirundo/_iter_sse_retrying.py +1 -1
- hirundo-0.1.16/hirundo/_constraints.py → hirundo-0.1.21/hirundo/_urls.py +12 -6
- {hirundo-0.1.16 → hirundo-0.1.21}/hirundo/cli.py +7 -7
- hirundo-0.1.21/hirundo/dataset_enum.py +46 -0
- hirundo-0.1.16/hirundo/dataset_optimization.py → hirundo-0.1.21/hirundo/dataset_qa.py +195 -168
- hirundo-0.1.16/hirundo/dataset_optimization_results.py → hirundo-0.1.21/hirundo/dataset_qa_results.py +4 -4
- {hirundo-0.1.16 → hirundo-0.1.21}/hirundo/git.py +2 -3
- hirundo-0.1.21/hirundo/labeling.py +140 -0
- {hirundo-0.1.16 → hirundo-0.1.21}/hirundo/storage.py +43 -60
- {hirundo-0.1.16 → hirundo-0.1.21}/hirundo/unzip.py +9 -10
- {hirundo-0.1.16 → hirundo-0.1.21}/hirundo.egg-info/PKG-INFO +67 -53
- {hirundo-0.1.16 → hirundo-0.1.21}/hirundo.egg-info/SOURCES.txt +4 -2
- {hirundo-0.1.16 → hirundo-0.1.21}/hirundo.egg-info/requires.txt +12 -5
- {hirundo-0.1.16 → hirundo-0.1.21}/pyproject.toml +36 -31
- hirundo-0.1.16/hirundo/_http.py +0 -19
- hirundo-0.1.16/hirundo/dataset_enum.py +0 -23
- {hirundo-0.1.16 → hirundo-0.1.21}/LICENSE +0 -0
- {hirundo-0.1.16 → hirundo-0.1.21}/hirundo/__main__.py +0 -0
- {hirundo-0.1.16 → hirundo-0.1.21}/hirundo/_dataframe.py +0 -0
- {hirundo-0.1.16 → hirundo-0.1.21}/hirundo/_env.py +0 -0
- {hirundo-0.1.16 → hirundo-0.1.21}/hirundo/_timeouts.py +0 -0
- {hirundo-0.1.16 → hirundo-0.1.21}/hirundo/logger.py +0 -0
- {hirundo-0.1.16 → hirundo-0.1.21}/hirundo.egg-info/dependency_links.txt +0 -0
- {hirundo-0.1.16 → hirundo-0.1.21}/hirundo.egg-info/entry_points.txt +0 -0
- {hirundo-0.1.16 → hirundo-0.1.21}/hirundo.egg-info/top_level.txt +0 -0
- {hirundo-0.1.16 → hirundo-0.1.21}/setup.cfg +0 -0
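
The headline change in this release is the dataset-QA rename visible in the list above: `hirundo/dataset_optimization.py` becomes `hirundo/dataset_qa.py`, the `OptimizationDataset` class becomes `QADataset`, and the run entry point is now `run_qa()` (see the README hunks below). A minimal migration sketch of the new classification flow, assembled from the hunks in this diff; the `StorageConfig` `name` kwarg and the `QADataset` `data_root_url`/`labeling_info` kwargs are assumptions inferred from context, not confirmed signatures:

```python
# Sketch only: anything marked "assumed" is not visible in this diff.
import json
import os

from hirundo import (
    HirundoCSV,
    LabelingType,
    QADataset,  # formerly OptimizationDataset
    StorageConfig,
    StorageGCP,
    StorageTypes,
)

gcp_bucket = StorageGCP(
    bucket_name="my-bucket",  # field name confirmed by hirundo/_constraints.py below
    project="Hirundo-global",
    credentials_json=json.loads(os.environ["GCP_CREDENTIALS"]),
)
dataset = QADataset(
    name="cifar100-classification-qa",
    labeling_type=LabelingType.SINGLE_LABEL_CLASSIFICATION,
    storage_config=StorageConfig(
        name="gcp-bucket",  # assumed kwarg
        type=StorageTypes.GCP,
        gcp=gcp_bucket,
    ),
    data_root_url="gs://my-bucket/cifar100",  # assumed kwarg; named in the README
    labeling_info=HirundoCSV(  # assumed kwarg
        csv_url="gs://my-bucket/cifar100/labels.csv",  # csv_url confirmed below
    ),
    classes=["apple", "aquarium_fish"],  # full class list omitted
)

dataset.run_qa()  # replaces the pre-0.1.21 run method
results = dataset.check_run()
print(results)
```

`bucket_name` and `csv_url` are confirmed by the new `hirundo/_constraints.py` at the end of this diff; everything marked "assumed" should be checked against the real `dataset_qa.py`.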
--- hirundo-0.1.16/PKG-INFO
+++ hirundo-0.1.21/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hirundo
-Version: 0.1.16
+Version: 0.1.21
 Summary: This package is used to interface with Hirundo's platform. It provides a simple API to optimize your ML datasets.
 Author-email: Hirundo <dev@hirundo.io>
 License: MIT License
@@ -13,7 +13,7 @@ License: MIT License
 
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
-Project-URL: Homepage, https://github.com/Hirundo-io/hirundo-
+Project-URL: Homepage, https://github.com/Hirundo-io/hirundo-python-sdk
 Keywords: dataset,machine learning,data science,data engineering
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python
@@ -32,6 +32,10 @@ Requires-Dist: httpx>=0.27.0
 Requires-Dist: stamina>=24.2.0
 Requires-Dist: httpx-sse>=0.4.0
 Requires-Dist: tqdm>=4.66.5
+Requires-Dist: h11>=0.16.0
+Requires-Dist: requests>=2.32.4
+Requires-Dist: urllib3>=2.5.0
+Requires-Dist: setuptools>=78.1.1
 Provides-Extra: dev
 Requires-Dist: pyyaml>=6.0.1; extra == "dev"
 Requires-Dist: types-PyYAML>=6.0.12; extra == "dev"
@@ -46,69 +50,94 @@ Requires-Dist: stamina>=24.2.0; extra == "dev"
 Requires-Dist: httpx-sse>=0.4.0; extra == "dev"
 Requires-Dist: pytest>=8.2.0; extra == "dev"
 Requires-Dist: pytest-asyncio>=0.23.6; extra == "dev"
-Requires-Dist: uv>=0.
+Requires-Dist: uv>=0.8.6; extra == "dev"
 Requires-Dist: pre-commit>=3.7.1; extra == "dev"
 Requires-Dist: virtualenv>=20.6.6; extra == "dev"
-Requires-Dist: ruff>=0.
+Requires-Dist: ruff>=0.12.0; extra == "dev"
 Requires-Dist: bumpver; extra == "dev"
 Requires-Dist: platformdirs>=4.3.6; extra == "dev"
 Requires-Dist: safety>=3.2.13; extra == "dev"
+Requires-Dist: cryptography>=44.0.1; extra == "dev"
+Requires-Dist: jinja2>=3.1.6; extra == "dev"
 Provides-Extra: docs
 Requires-Dist: sphinx>=7.4.7; extra == "docs"
-Requires-Dist: sphinx-autobuild>=2024.
+Requires-Dist: sphinx-autobuild>=2024.9.3; extra == "docs"
 Requires-Dist: sphinx-click>=5.0.1; extra == "docs"
 Requires-Dist: autodoc_pydantic>=2.2.0; extra == "docs"
 Requires-Dist: furo; extra == "docs"
 Requires-Dist: sphinx-multiversion; extra == "docs"
 Requires-Dist: esbonio; extra == "docs"
-Requires-Dist: starlette
+Requires-Dist: starlette>=0.47.2; extra == "docs"
 Requires-Dist: markupsafe>=3.0.2; extra == "docs"
+Requires-Dist: jinja2>=3.1.6; extra == "docs"
 Provides-Extra: pandas
-Requires-Dist: pandas>=2.2.
+Requires-Dist: pandas>=2.2.3; extra == "pandas"
 Provides-Extra: polars
 Requires-Dist: polars>=1.0.0; extra == "polars"
 Dynamic: license-file
 
 # Hirundo
 
-This package exposes access to Hirundo APIs for dataset
-
-Dataset optimization is currently available for datasets labelled for classification and object detection.
+This package exposes access to Hirundo APIs for dataset QA for Machine Learning.
 
+Dataset QA is currently available for datasets labelled for classification and object detection.
 
 Support dataset storage configs include:
-
-
-
+
+- Google Cloud (GCP) Storage
+- Amazon Web Services (AWS) S3
+- Git LFS (Large File Storage) repositories (e.g. GitHub or HuggingFace)
+
+Note: This Python package must be used alongside a Hirundo server, either the SaaS platform, a custom VPC deployment or an on-premises installation.
 
 Optimizing a classification dataset
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-Currently
-
-
+Currently `hirundo` requires a CSV file with the following columns (all columns are required):
+
+- `image_path`: The location of the image within the dataset `data_root_url`
+- `class_name`: The semantic label, i.e. the class name of the class that the image was annotated as belonging to
+
+And outputs two Pandas DataFrames with the dataset columns as well as:
+
+Suspect DataFrame (filename: `mislabel_suspects.csv`) columns:
 
-
-
-
-
+- ``suspect_score``: mislabel suspect score
+- ``suspect_level``: mislabel suspect level
+- ``suspect_rank``: mislabel suspect ranking
+- ``suggested_class_name``: suggested semantic label
+- ``suggested_class_conf``: suggested semantic label confidence
+
+Errors and warnings DataFrame (filename: `invalid_data.csv`) columns:
+
+- ``status``: status message (one of ``NO_LABELS`` / ``MISSING_IMAGE`` / ``INVALID_IMAGE``)
 
 Optimizing an object detection (OD) dataset
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Currently ``hirundo`` requires a CSV file with the following columns (all columns are required):
-- ``image_path``: The location of the image within the dataset ``root``
-- ``bbox_id``: The index of the bounding box within the dataset. Used to indicate label suspects
-- ``label``: The label of the image, i.e. which the class that was annotated for this image
-- ``x1``, ``y1``, ``x2``, ``y2``: The bounding box coordinates of the object within the image
 
-
-
-
-
+- ``image_path``: The location of the image within the dataset ``data_root_url``
+- ``object_id``: The ID of the bounding box within the dataset. Used to indicate object suspects
+- ``class_name``: Object semantic label, i.e. the class name of the object that was annotated
+- ``xmin``: leftmost horizontal pixel coordinate of the object's bounding box
+- ``ymin``: uppermost vertical pixel coordinate of the object's bounding box
+- ``xmax``: rightmost horizontal pixel coordinate of the object's bounding box
+- ``ymax``: lowermost vertical pixel coordinate of the object's bounding box
 
-Note: This Python package must be used alongside a Hirundo server, either the SaaS platform, a custom VPC deployment or an on-premises installation.
 
+And outputs two Pandas DataFrames with the dataset columns as well as:
+
+Suspect DataFrame (filename: `mislabel_suspects.csv`) columns:
+
+- ``suspect_score``: object mislabel suspect score
+- ``suspect_level``: object mislabel suspect level
+- ``suspect_rank``: object mislabel suspect ranking
+- ``suggested_class_name``: suggested object semantic label
+- ``suggested_class_conf``: suggested object semantic label confidence
+
+Errors and warnings DataFrame (filename: `invalid_data.csv`) columns:
+- ``status``: status message (one of ``NO_LABELS`` / ``MISSING_IMAGE`` / ``INVALID_IMAGE`` / ``INVALID_BBOX`` / ``INVALID_BBOX_SIZE``)
 
 ## Installation
 
@@ -117,11 +146,12 @@ You can install the codebase with a simple `pip install hirundo` to install the
 ## Usage
 
 Classification example:
+
 ```python
 from hirundo import (
     HirundoCSV,
     LabelingType,
-
+    QADataset,
     StorageGCP,
     StorageConfig,
     StorageTypes,
@@ -132,7 +162,7 @@ gcp_bucket = StorageGCP(
     project="Hirundo-global",
     credentials_json=json.loads(os.environ["GCP_CREDENTIALS"]),
 )
-test_dataset = OptimizationDataset(
+test_dataset = QADataset(
     name="TEST-GCP cifar 100 classification dataset",
     labeling_type=LabelingType.SINGLE_LABEL_CLASSIFICATION,
     storage_config=StorageConfig(
@@ -147,12 +177,11 @@ test_dataset = OptimizationDataset(
     classes=cifar100_classes,
 )
 
-test_dataset.
+test_dataset.run_qa()
 results = test_dataset.check_run()
 print(results)
 ```
 
-
 Object detection example:
 
 ```python
@@ -160,7 +189,7 @@ from hirundo import (
     GitRepo,
     HirundoCSV,
     LabelingType,
-
+    QADataset,
     StorageGit,
     StorageConfig,
     StorageTypes,
@@ -173,7 +202,7 @@ git_storage = StorageGit(
     ),
     branch="main",
 )
-test_dataset = OptimizationDataset(
+test_dataset = QADataset(
     name="TEST-HuggingFace-BDD-100k-validation-OD-validation-dataset",
     labeling_type=LabelingType.OBJECT_DETECTION,
     storage_config=StorageConfig(
@@ -187,30 +216,15 @@ test_dataset = OptimizationDataset(
             path="/BDD100K Val from Hirundo.zip/bdd100k/bdd100k.csv"
         ),
     ),
-    classes=[
-        "traffic light",
-        "traffic sign",
-        "car",
-        "pedestrian",
-        "bus",
-        "truck",
-        "rider",
-        "bicycle",
-        "motorcycle",
-        "train",
-        "other vehicle",
-        "other person",
-        "trailer",
-    ],
 )
 
-test_dataset.
+test_dataset.run_qa()
 results = test_dataset.check_run()
 print(results)
 ```
 
-Note: Currently we only support the main CPython release 3.9, 3.10
+Note: Currently we only support the main CPython release 3.9, 3.10, 3.11, 3.12 & 3.13. PyPy support may be introduced in the future.
 
 ## Further documentation
 
-To learn more about how to use this library, please visit the [http://docs.hirundo.io/](documentation) or see the Google Colab examples.
+To learn more about how to use this library, please visit the [http://docs.hirundo.io/](documentation) or see the [Google Colab examples](https://github.com/Hirundo-io/hirundo-python-sdk/tree/main/notebooks).
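
The README portion of this PKG-INFO diff pins down the exact CSV schema the QA runs expect: 0.1.21 renames `bbox_id`/`label`/`x1`..`y2` to `object_id`/`class_name`/`xmin`..`ymax`. For concreteness, a hypothetical object-detection labels file conforming to the new column list could be generated like this (paths and class names are invented for illustration):

```python
import csv

# Columns exactly as listed in the new README; all are required.
FIELDS = ["image_path", "object_id", "class_name", "xmin", "ymin", "xmax", "ymax"]

rows = [
    # image_path is relative to the dataset's data_root_url; values are invented.
    {"image_path": "images/0001.jpg", "object_id": 0, "class_name": "car",
     "xmin": 12, "ymin": 40, "xmax": 118, "ymax": 96},
    {"image_path": "images/0001.jpg", "object_id": 1, "class_name": "pedestrian",
     "xmin": 130, "ymin": 22, "xmax": 152, "ymax": 90},
]

with open("bdd_style_labels.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=FIELDS)
    writer.writeheader()
    writer.writerows(rows)
```

The same README text is embedded in PKG-INFO above and changed in README.md below.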
--- hirundo-0.1.16/README.md
+++ hirundo-0.1.21/README.md
@@ -1,43 +1,65 @@
 # Hirundo
 
-This package exposes access to Hirundo APIs for dataset
-
-Dataset optimization is currently available for datasets labelled for classification and object detection.
+This package exposes access to Hirundo APIs for dataset QA for Machine Learning.
 
+Dataset QA is currently available for datasets labelled for classification and object detection.
 
 Support dataset storage configs include:
-
-
-
+
+- Google Cloud (GCP) Storage
+- Amazon Web Services (AWS) S3
+- Git LFS (Large File Storage) repositories (e.g. GitHub or HuggingFace)
+
+Note: This Python package must be used alongside a Hirundo server, either the SaaS platform, a custom VPC deployment or an on-premises installation.
 
 Optimizing a classification dataset
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-Currently
-
-
+Currently `hirundo` requires a CSV file with the following columns (all columns are required):
+
+- `image_path`: The location of the image within the dataset `data_root_url`
+- `class_name`: The semantic label, i.e. the class name of the class that the image was annotated as belonging to
+
+And outputs two Pandas DataFrames with the dataset columns as well as:
+
+Suspect DataFrame (filename: `mislabel_suspects.csv`) columns:
 
-
-
-
-
+- ``suspect_score``: mislabel suspect score
+- ``suspect_level``: mislabel suspect level
+- ``suspect_rank``: mislabel suspect ranking
+- ``suggested_class_name``: suggested semantic label
+- ``suggested_class_conf``: suggested semantic label confidence
+
+Errors and warnings DataFrame (filename: `invalid_data.csv`) columns:
+
+- ``status``: status message (one of ``NO_LABELS`` / ``MISSING_IMAGE`` / ``INVALID_IMAGE``)
 
 Optimizing an object detection (OD) dataset
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Currently ``hirundo`` requires a CSV file with the following columns (all columns are required):
-- ``image_path``: The location of the image within the dataset ``root``
-- ``bbox_id``: The index of the bounding box within the dataset. Used to indicate label suspects
-- ``label``: The label of the image, i.e. which the class that was annotated for this image
-- ``x1``, ``y1``, ``x2``, ``y2``: The bounding box coordinates of the object within the image
 
-
-
-
-
+- ``image_path``: The location of the image within the dataset ``data_root_url``
+- ``object_id``: The ID of the bounding box within the dataset. Used to indicate object suspects
+- ``class_name``: Object semantic label, i.e. the class name of the object that was annotated
+- ``xmin``: leftmost horizontal pixel coordinate of the object's bounding box
+- ``ymin``: uppermost vertical pixel coordinate of the object's bounding box
+- ``xmax``: rightmost horizontal pixel coordinate of the object's bounding box
+- ``ymax``: lowermost vertical pixel coordinate of the object's bounding box
 
-Note: This Python package must be used alongside a Hirundo server, either the SaaS platform, a custom VPC deployment or an on-premises installation.
 
+And outputs two Pandas DataFrames with the dataset columns as well as:
+
+Suspect DataFrame (filename: `mislabel_suspects.csv`) columns:
+
+- ``suspect_score``: object mislabel suspect score
+- ``suspect_level``: object mislabel suspect level
+- ``suspect_rank``: object mislabel suspect ranking
+- ``suggested_class_name``: suggested object semantic label
+- ``suggested_class_conf``: suggested object semantic label confidence
+
+Errors and warnings DataFrame (filename: `invalid_data.csv`) columns:
+- ``status``: status message (one of ``NO_LABELS`` / ``MISSING_IMAGE`` / ``INVALID_IMAGE`` / ``INVALID_BBOX`` / ``INVALID_BBOX_SIZE``)
 
 ## Installation
 
@@ -46,11 +68,12 @@ You can install the codebase with a simple `pip install hirundo` to install the
 ## Usage
 
 Classification example:
+
 ```python
 from hirundo import (
     HirundoCSV,
     LabelingType,
-
+    QADataset,
     StorageGCP,
     StorageConfig,
     StorageTypes,
@@ -61,7 +84,7 @@ gcp_bucket = StorageGCP(
     project="Hirundo-global",
     credentials_json=json.loads(os.environ["GCP_CREDENTIALS"]),
 )
-test_dataset = OptimizationDataset(
+test_dataset = QADataset(
     name="TEST-GCP cifar 100 classification dataset",
     labeling_type=LabelingType.SINGLE_LABEL_CLASSIFICATION,
     storage_config=StorageConfig(
@@ -76,12 +99,11 @@ test_dataset = OptimizationDataset(
     classes=cifar100_classes,
 )
 
-test_dataset.
+test_dataset.run_qa()
 results = test_dataset.check_run()
 print(results)
 ```
 
-
 Object detection example:
 
 ```python
@@ -89,7 +111,7 @@ from hirundo import (
     GitRepo,
     HirundoCSV,
     LabelingType,
-
+    QADataset,
     StorageGit,
     StorageConfig,
     StorageTypes,
@@ -102,7 +124,7 @@ git_storage = StorageGit(
     ),
     branch="main",
 )
-test_dataset = OptimizationDataset(
+test_dataset = QADataset(
     name="TEST-HuggingFace-BDD-100k-validation-OD-validation-dataset",
     labeling_type=LabelingType.OBJECT_DETECTION,
     storage_config=StorageConfig(
@@ -116,30 +138,15 @@ test_dataset = OptimizationDataset(
             path="/BDD100K Val from Hirundo.zip/bdd100k/bdd100k.csv"
         ),
     ),
-    classes=[
-        "traffic light",
-        "traffic sign",
-        "car",
-        "pedestrian",
-        "bus",
-        "truck",
-        "rider",
-        "bicycle",
-        "motorcycle",
-        "train",
-        "other vehicle",
-        "other person",
-        "trailer",
-    ],
 )
 
-test_dataset.
+test_dataset.run_qa()
 results = test_dataset.check_run()
 print(results)
 ```
 
-Note: Currently we only support the main CPython release 3.9, 3.10
+Note: Currently we only support the main CPython release 3.9, 3.10, 3.11, 3.12 & 3.13. PyPy support may be introduced in the future.
 
 ## Further documentation
 
-To learn more about how to use this library, please visit the [http://docs.hirundo.io/](documentation) or see the Google Colab examples.
+To learn more about how to use this library, please visit the [http://docs.hirundo.io/](documentation) or see the [Google Colab examples](https://github.com/Hirundo-io/hirundo-python-sdk/tree/main/notebooks).
--- hirundo-0.1.16/hirundo/__init__.py
+++ hirundo-0.1.21/hirundo/__init__.py
@@ -1,38 +1,54 @@
 from .dataset_enum import (
     DatasetMetadataType,
     LabelingType,
+    StorageTypes,
 )
-from .
-
-
-    HirundoCSV,
+from .dataset_qa import (
+    ClassificationRunArgs,
+    Domain,
     HirundoError,
-
+    ObjectDetectionRunArgs,
+    QADataset,
     RunArgs,
-    VisionRunArgs,
 )
-from .
+from .dataset_qa_results import DatasetQAResults
 from .git import GitPlainAuth, GitRepo, GitSSHAuth
+from .labeling import (
+    COCO,
+    YOLO,
+    HirundoCSV,
+    KeylabsAuth,
+    KeylabsObjDetImages,
+    KeylabsObjDetVideo,
+    KeylabsObjSegImages,
+    KeylabsObjSegVideo,
+)
 from .storage import (
     StorageConfig,
     StorageGCP,
     # StorageAzure, TODO: Azure storage is coming soon
     StorageGit,
     StorageS3,
-    StorageTypes,
 )
 from .unzip import load_df, load_from_zip
 
 __all__ = [
     "COCO",
     "YOLO",
-    "HirundoCSV",
     "HirundoError",
-    "
+    "HirundoCSV",
+    "KeylabsAuth",
+    "KeylabsObjDetImages",
+    "KeylabsObjDetVideo",
+    "KeylabsObjSegImages",
+    "KeylabsObjSegVideo",
+    "QADataset",
+    "Domain",
     "RunArgs",
-    "
-    "
+    "ClassificationRunArgs",
+    "ObjectDetectionRunArgs",
    "DatasetMetadataType",
+    "LabelingType",
     "GitPlainAuth",
     "GitRepo",
     "GitSSHAuth",
@@ -42,9 +58,9 @@ __all__ = [
     # "StorageAzure", TODO: Azure storage is coming soon
     "StorageGit",
     "StorageConfig",
-    "
+    "DatasetQAResults",
     "load_df",
     "load_from_zip",
 ]
 
-__version__ = "0.1.16"
+__version__ = "0.1.21"
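
The reworked `__init__.py` keeps the public surface flat even though labeling formats moved into `hirundo.labeling` and `StorageTypes` moved from `hirundo.storage` to `hirundo.dataset_enum`. A quick smoke test of the names added or relocated in this hunk:

```python
# Names new to or relocated in 0.1.21; all are re-exported at package top level.
from hirundo import (
    COCO,                   # labeling formats now live in hirundo.labeling
    YOLO,
    HirundoCSV,
    KeylabsAuth,            # new Keylabs labeling integrations
    ClassificationRunArgs,  # added alongside the removal of VisionRunArgs
    ObjectDetectionRunArgs,
    DatasetQAResults,       # dataset_optimization_results -> dataset_qa_results
    QADataset,
    StorageTypes,           # moved from hirundo.storage to hirundo.dataset_enum
)

print(QADataset, DatasetQAResults)  # import-time check that the surface resolves
```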
--- /dev/null
+++ hirundo-0.1.21/hirundo/_constraints.py
@@ -0,0 +1,164 @@
+import re
+import typing
+from typing import TYPE_CHECKING
+
+from hirundo._urls import (
+    LENGTH_CONSTRAINTS,
+    STORAGE_PATTERNS,
+)
+from hirundo.dataset_enum import DatasetMetadataType, LabelingType, StorageTypes
+from hirundo.labeling import COCO, YOLO, HirundoCSV, Keylabs
+
+if TYPE_CHECKING:
+    from hirundo._urls import HirundoUrl
+    from hirundo.dataset_qa import LabelingInfo
+    from hirundo.storage import (
+        ResponseStorageConfig,
+        StorageConfig,
+        StorageGCP,
+        StorageGCPOut,
+        StorageS3,
+        StorageS3Out,
+    )
+
+LABELING_TYPES_TO_DATASET_METADATA_TYPES = {
+    LabelingType.SINGLE_LABEL_CLASSIFICATION: [
+        DatasetMetadataType.HIRUNDO_CSV,
+    ],
+    LabelingType.OBJECT_DETECTION: [
+        DatasetMetadataType.HIRUNDO_CSV,
+        DatasetMetadataType.COCO,
+        DatasetMetadataType.YOLO,
+        DatasetMetadataType.KeylabsObjDetImages,
+        DatasetMetadataType.KeylabsObjDetVideo,
+    ],
+    LabelingType.OBJECT_SEGMENTATION: [
+        DatasetMetadataType.HIRUNDO_CSV,
+        DatasetMetadataType.KeylabsObjSegImages,
+        DatasetMetadataType.KeylabsObjSegVideo,
+    ],
+    LabelingType.SEMANTIC_SEGMENTATION: [
+        DatasetMetadataType.HIRUNDO_CSV,
+    ],
+    LabelingType.PANOPTIC_SEGMENTATION: [
+        DatasetMetadataType.HIRUNDO_CSV,
+    ],
+    LabelingType.SPEECH_TO_TEXT: [
+        DatasetMetadataType.HIRUNDO_CSV,
+    ],
+}
+
+
+def validate_s3_url(str_url: str, s3_config: "StorageS3 | StorageS3Out"):
+    if (
+        len(str_url) < LENGTH_CONSTRAINTS[StorageTypes.S3]["min_length"]
+        or len(str_url) > LENGTH_CONSTRAINTS[StorageTypes.S3]["max_length"]
+    ):
+        raise ValueError("S3 URL must be between 8 and 1023 characters")
+    elif not re.match(STORAGE_PATTERNS[StorageTypes.S3], str_url):
+        raise ValueError(
+            f"Invalid S3 URL. Pattern must match: {STORAGE_PATTERNS[StorageTypes.S3]}"
+        )
+    elif not str_url.startswith(f"{s3_config.bucket_url}/"):
+        raise ValueError(f"S3 URL must start with {s3_config.bucket_url}/")
+
+
+def validate_gcp_url(str_url: str, gcp_config: "StorageGCP | StorageGCPOut"):
+    matches = re.match(STORAGE_PATTERNS[StorageTypes.GCP], str_url)
+    if (
+        len(str_url) < LENGTH_CONSTRAINTS[StorageTypes.GCP]["min_length"]
+        or len(str_url) > LENGTH_CONSTRAINTS[StorageTypes.GCP]["max_length"]
+    ):
+        raise ValueError(
+            f"GCP URL must be between {LENGTH_CONSTRAINTS[StorageTypes.GCP]['min_length']}"
+            + f" and {LENGTH_CONSTRAINTS[StorageTypes.GCP]['max_length']} characters"
+        )
+    elif not matches:
+        raise ValueError(
+            f"Invalid GCP URL. Pattern must match: {STORAGE_PATTERNS[StorageTypes.GCP]}"
+        )
+    elif (
+        matches
+        and len(matches.group(1))
+        > LENGTH_CONSTRAINTS[StorageTypes.GCP]["bucket_max_length"]
+    ):
+        raise ValueError(
+            f"GCP bucket name must be between {LENGTH_CONSTRAINTS[StorageTypes.GCP]['bucket_min_length']} "
+            + f"and {LENGTH_CONSTRAINTS[StorageTypes.GCP]['bucket_max_length']} characters"
+        )
+    elif not str_url.startswith(f"gs://{gcp_config.bucket_name}/"):
+        raise ValueError(f"GCP URL must start with gs://{gcp_config.bucket_name}")
+
+
+def validate_url(
+    url: "HirundoUrl",
+    storage_config: "StorageConfig | ResponseStorageConfig",
+) -> "HirundoUrl":
+    s3_config = storage_config.s3
+    gcp_config = storage_config.gcp
+    git_config = storage_config.git
+    str_url = str(url)
+
+    if s3_config is not None:
+        validate_s3_url(str_url, s3_config)
+    elif gcp_config is not None:
+        validate_gcp_url(str_url, gcp_config)
+    elif (
+        git_config is not None
+        and not str_url.startswith("https://")
+        and not str_url.startswith("ssh://")
+    ):
+        raise ValueError("Git URL must start with https:// or ssh://")
+    elif storage_config.type == StorageTypes.LOCAL and not str_url.startswith(
+        "file:///datasets/"
+    ):
+        raise ValueError("Local URL must start with file:///datasets/")
+    return url
+
+
+def validate_labeling_type(
+    labeling_type: "LabelingType", labeling_info: "LabelingInfo"
+) -> None:
+    """
+    Validate that the labeling type is compatible with the labeling info
+
+    Args:
+        labeling_type: The type of labeling that will be performed
+        labeling_info: The labeling info to validate
+    """
+    dataset_metadata_types = LABELING_TYPES_TO_DATASET_METADATA_TYPES[labeling_type]
+    if labeling_info.type not in dataset_metadata_types:
+        raise ValueError(
+            f"Cannot use {labeling_info.type.name} labeling info with {labeling_type.name} datasets"
+        )
+
+
+def validate_labeling_info(
+    labeling_type: "LabelingType",
+    labeling_info: "typing.Union[LabelingInfo, list[LabelingInfo]]",
+    storage_config: "typing.Union[StorageConfig, ResponseStorageConfig]",
+) -> None:
+    """
+    Validate the labeling info for a dataset
+
+    Args:
+        labeling_type: The type of labeling that will be performed
+        labeling_info: The labeling info to validate
+        storage_config: The storage configuration for the dataset.
+            StorageConfig is used to validate the URLs in the labeling info
+    """
+    if isinstance(labeling_info, list):
+        for labeling in labeling_info:
+            validate_labeling_info(labeling_type, labeling, storage_config)
+        return
+    elif isinstance(labeling_info, HirundoCSV):
+        validate_url(labeling_info.csv_url, storage_config)
+    elif isinstance(labeling_info, COCO):
+        validate_url(labeling_info.json_url, storage_config)
+    elif isinstance(labeling_info, YOLO):
+        validate_url(labeling_info.labels_dir_url, storage_config)
+        if labeling_info.data_yaml_url is not None:
+            validate_url(labeling_info.data_yaml_url, storage_config)
+    elif isinstance(labeling_info, Keylabs):
+        validate_url(labeling_info.labels_dir_url, storage_config)
+    validate_labeling_type(labeling_type, labeling_info)