hirundo 0.1.9__tar.gz → 0.1.18__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {hirundo-0.1.9 → hirundo-0.1.18}/PKG-INFO +55 -44
  2. {hirundo-0.1.9 → hirundo-0.1.18}/README.md +46 -39
  3. {hirundo-0.1.9 → hirundo-0.1.18}/hirundo/__init__.py +30 -11
  4. hirundo-0.1.18/hirundo/_constraints.py +164 -0
  5. hirundo-0.1.18/hirundo/_dataframe.py +43 -0
  6. {hirundo-0.1.9 → hirundo-0.1.18}/hirundo/_env.py +2 -2
  7. hirundo-0.1.18/hirundo/_headers.py +29 -0
  8. hirundo-0.1.18/hirundo/_timeouts.py +3 -0
  9. hirundo-0.1.9/hirundo/_constraints.py → hirundo-0.1.18/hirundo/_urls.py +12 -6
  10. {hirundo-0.1.9 → hirundo-0.1.18}/hirundo/cli.py +52 -0
  11. hirundo-0.1.18/hirundo/dataset_enum.py +46 -0
  12. {hirundo-0.1.9 → hirundo-0.1.18}/hirundo/dataset_optimization.py +93 -182
  13. hirundo-0.1.18/hirundo/dataset_optimization_results.py +42 -0
  14. {hirundo-0.1.9 → hirundo-0.1.18}/hirundo/git.py +12 -19
  15. hirundo-0.1.18/hirundo/labeling.py +140 -0
  16. {hirundo-0.1.9 → hirundo-0.1.18}/hirundo/storage.py +48 -67
  17. hirundo-0.1.18/hirundo/unzip.py +247 -0
  18. {hirundo-0.1.9 → hirundo-0.1.18}/hirundo.egg-info/PKG-INFO +55 -44
  19. {hirundo-0.1.9 → hirundo-0.1.18}/hirundo.egg-info/SOURCES.txt +6 -1
  20. {hirundo-0.1.9 → hirundo-0.1.18}/hirundo.egg-info/requires.txt +8 -3
  21. {hirundo-0.1.9 → hirundo-0.1.18}/pyproject.toml +20 -21
  22. hirundo-0.1.9/hirundo/_headers.py +0 -13
  23. hirundo-0.1.9/hirundo/_timeouts.py +0 -2
  24. hirundo-0.1.9/hirundo/enum.py +0 -23
  25. {hirundo-0.1.9 → hirundo-0.1.18}/LICENSE +0 -0
  26. {hirundo-0.1.9 → hirundo-0.1.18}/hirundo/__main__.py +0 -0
  27. {hirundo-0.1.9 → hirundo-0.1.18}/hirundo/_http.py +0 -0
  28. {hirundo-0.1.9 → hirundo-0.1.18}/hirundo/_iter_sse_retrying.py +0 -0
  29. {hirundo-0.1.9 → hirundo-0.1.18}/hirundo/logger.py +0 -0
  30. {hirundo-0.1.9 → hirundo-0.1.18}/hirundo.egg-info/dependency_links.txt +0 -0
  31. {hirundo-0.1.9 → hirundo-0.1.18}/hirundo.egg-info/entry_points.txt +0 -0
  32. {hirundo-0.1.9 → hirundo-0.1.18}/hirundo.egg-info/top_level.txt +0 -0
  33. {hirundo-0.1.9 → hirundo-0.1.18}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.4
  Name: hirundo
- Version: 0.1.9
+ Version: 0.1.18
  Summary: This package is used to interface with Hirundo's platform. It provides a simple API to optimize your ML datasets.
  Author-email: Hirundo <dev@hirundo.io>
  License: MIT License
@@ -31,7 +31,6 @@ Requires-Dist: typer>=0.12.3
  Requires-Dist: httpx>=0.27.0
  Requires-Dist: stamina>=24.2.0
  Requires-Dist: httpx-sse>=0.4.0
- Requires-Dist: pandas>=2.2.2
  Requires-Dist: tqdm>=4.66.5
  Provides-Extra: dev
  Requires-Dist: pyyaml>=6.0.1; extra == "dev"
@@ -50,13 +49,13 @@ Requires-Dist: pytest-asyncio>=0.23.6; extra == "dev"
  Requires-Dist: uv>=0.5.8; extra == "dev"
  Requires-Dist: pre-commit>=3.7.1; extra == "dev"
  Requires-Dist: virtualenv>=20.6.6; extra == "dev"
- Requires-Dist: ruff>=0.8.2; extra == "dev"
+ Requires-Dist: ruff>=0.11.6; extra == "dev"
  Requires-Dist: bumpver; extra == "dev"
  Requires-Dist: platformdirs>=4.3.6; extra == "dev"
  Requires-Dist: safety>=3.2.13; extra == "dev"
  Provides-Extra: docs
  Requires-Dist: sphinx>=7.4.7; extra == "docs"
- Requires-Dist: sphinx-autobuild>=2024.4.16; extra == "docs"
+ Requires-Dist: sphinx-autobuild>=2024.9.3; extra == "docs"
  Requires-Dist: sphinx-click>=5.0.1; extra == "docs"
  Requires-Dist: autodoc_pydantic>=2.2.0; extra == "docs"
  Requires-Dist: furo; extra == "docs"
@@ -64,6 +63,11 @@ Requires-Dist: sphinx-multiversion; extra == "docs"
  Requires-Dist: esbonio; extra == "docs"
  Requires-Dist: starlette>0.40.0; extra == "docs"
  Requires-Dist: markupsafe>=3.0.2; extra == "docs"
+ Provides-Extra: pandas
+ Requires-Dist: pandas>=2.2.3; extra == "pandas"
+ Provides-Extra: polars
+ Requires-Dist: polars>=1.0.0; extra == "polars"
+ Dynamic: license-file

  # Hirundo

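The two new extras make the DataFrame backend opt-in rather than a hard dependency (note `pandas>=2.2.2` dropped from the core requirements above). With standard pip extras syntax, either backend can be selected at install time, e.g. `pip install "hirundo[pandas]"` or `pip install "hirundo[polars]"`; the `hirundo/_dataframe.py` shim added below picks up whichever is present.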
@@ -71,40 +75,62 @@ This package exposes access to Hirundo APIs for dataset optimization for Machine

  Dataset optimization is currently available for datasets labelled for classification and object detection.

-
  Support dataset storage configs include:
- - Google Cloud (GCP) Storage
- - Amazon Web Services (AWS) S3
- - Git LFS (Large File Storage) repositories (e.g. GitHub or HuggingFace)
+
+ - Google Cloud (GCP) Storage
+ - Amazon Web Services (AWS) S3
+ - Git LFS (Large File Storage) repositories (e.g. GitHub or HuggingFace)
+
+ Note: This Python package must be used alongside a Hirundo server, either the SaaS platform, a custom VPC deployment or an on-premises installation.

  Optimizing a classification dataset
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

- Currently ``hirundo`` requires a CSV file with the following columns (all columns are required):
- - ``image_path``: The location of the image within the dataset ``root``
- - ``label``: The label of the image, i.e. which the class that was annotated for this image
+ Currently `hirundo` requires a CSV file with the following columns (all columns are required):
+
+ - `image_path`: The location of the image within the dataset `data_root_url`
+ - `class_name`: The semantic label, i.e. the class name of the class that the image was annotated as belonging to
+
+ And outputs two Pandas DataFrames with the dataset columns as well as:
+
+ Suspect DataFrame (filename: `mislabel_suspects.csv`) columns:

- And outputs a CSV with the same columns and:
- - ``suspect_level``: mislabel suspect level
- - ``suggested_label``: suggested label
- - ``suggested_label_conf``: suggested label confidence
+ - ``suspect_score``: mislabel suspect score
+ - ``suspect_level``: mislabel suspect level
+ - ``suspect_rank``: mislabel suspect ranking
+ - ``suggested_class_name``: suggested semantic label
+ - ``suggested_class_conf``: suggested semantic label confidence
+
+ Errors and warnings DataFrame (filename: `invalid_data.csv`) columns:
+
+ - ``status``: status message (one of ``NO_LABELS`` / ``MISSING_IMAGE`` / ``INVALID_IMAGE``)

  Optimizing an object detection (OD) dataset
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

  Currently ``hirundo`` requires a CSV file with the following columns (all columns are required):
- - ``image_path``: The location of the image within the dataset ``root``
- - ``bbox_id``: The index of the bounding box within the dataset. Used to indicate label suspects
- - ``label``: The label of the image, i.e. which the class that was annotated for this image
- - ``x1``, ``y1``, ``x2``, ``y2``: The bounding box coordinates of the object within the image

- And outputs a CSV with the same columns and:
- - ``suspect_level``: object mislabel suspect level
- - ``suggested_label``: suggested object label
- - ``suggested_label_conf``: suggested object label confidence
+ - ``image_path``: The location of the image within the dataset ``data_root_url``
+ - ``object_id``: The ID of the bounding box within the dataset. Used to indicate object suspects
+ - ``class_name``: Object semantic label, i.e. the class name of the object that was annotated
+ - ``xmin``: leftmost horizontal pixel coordinate of the object's bounding box
+ - ``ymin``: uppermost vertical pixel coordinate of the object's bounding box
+ - ``xmax``: rightmost horizontal pixel coordinate of the object's bounding box
+ - ``ymax``: lowermost vertical pixel coordinate of the object's bounding box

- Note: This Python package must be used alongside a Hirundo server, either the SaaS platform, a custom VPC deployment or an on-premises installation.

+ And outputs two Pandas DataFrames with the dataset columns as well as:
+
+ Suspect DataFrame (filename: `mislabel_suspects.csv`) columns:
+
+ - ``suspect_score``: object mislabel suspect score
+ - ``suspect_level``: object mislabel suspect level
+ - ``suspect_rank``: object mislabel suspect ranking
+ - ``suggested_class_name``: suggested object semantic label
+ - ``suggested_class_conf``: suggested object semantic label confidence
+
+ Errors and warnings DataFrame (filename: `invalid_data.csv`) columns:
+ - ``status``: status message (one of ``NO_LABELS`` / ``MISSING_IMAGE`` / ``INVALID_IMAGE`` / ``INVALID_BBOX`` / ``INVALID_BBOX_SIZE``)

  ## Installation

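For concreteness, here is what minimal metadata CSVs matching the new column schemas might look like. The rows are illustrative only (made-up paths, class names and boxes, not from the package):

```python
# Hypothetical example contents for the two CSV layouts described above.
classification_csv = """\
image_path,class_name
images/0001.jpg,cat
images/0002.jpg,dog
"""

object_detection_csv = """\
image_path,object_id,class_name,xmin,ymin,xmax,ymax
images/0001.jpg,0,car,10,20,110,95
images/0001.jpg,1,pedestrian,130,40,170,150
"""
```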
@@ -113,6 +139,7 @@ You can install the codebase with a simple `pip install hirundo` to install the
  ## Usage

  Classification example:
+
  ```python
  from hirundo import (
      HirundoCSV,
@@ -148,7 +175,6 @@ results = test_dataset.check_run()
  print(results)
  ```

-
  Object detection example:

  ```python
@@ -165,7 +191,7 @@ from hirundo import (
  git_storage = StorageGit(
      repo=GitRepo(
          name="BDD-100k-validation-dataset",
-         repository_url="https://git@hf.co/datasets/hirundo-io/bdd100k-validation-only.git",
+         repository_url="https://huggingface.co/datasets/hirundo-io/bdd100k-validation-only",
      ),
      branch="main",
  )
@@ -183,21 +209,6 @@ test_dataset = OptimizationDataset(
              path="/BDD100K Val from Hirundo.zip/bdd100k/bdd100k.csv"
          ),
      ),
-     classes=[
-         "traffic light",
-         "traffic sign",
-         "car",
-         "pedestrian",
-         "bus",
-         "truck",
-         "rider",
-         "bicycle",
-         "motorcycle",
-         "train",
-         "other vehicle",
-         "other person",
-         "trailer",
-     ],
  )

  test_dataset.run_optimization()
@@ -205,8 +216,8 @@ results = test_dataset.check_run()
  print(results)
  ```

- Note: Currently we only support the main CPython release 3.9, 3.10 and 3.11. PyPy support may be introduced in the future.
+ Note: Currently we only support the main CPython release 3.9, 3.10, 3.11, 3.12 & 3.13. PyPy support may be introduced in the future.

  ## Further documentation

- To learn more about how to use this library, please visit the [http://docs.hirundo.io/](documentation) or see the Google Colab examples.
+ To learn more about how to use this library, please visit the [http://docs.hirundo.io/](documentation) or see the [Google Colab examples](https://github.com/Hirundo-io/hirundo-client/tree/main/notebooks).
@@ -4,40 +4,62 @@ This package exposes access to Hirundo APIs for dataset optimization for Machine

  Dataset optimization is currently available for datasets labelled for classification and object detection.

-
  Support dataset storage configs include:
- - Google Cloud (GCP) Storage
- - Amazon Web Services (AWS) S3
- - Git LFS (Large File Storage) repositories (e.g. GitHub or HuggingFace)
+
+ - Google Cloud (GCP) Storage
+ - Amazon Web Services (AWS) S3
+ - Git LFS (Large File Storage) repositories (e.g. GitHub or HuggingFace)
+
+ Note: This Python package must be used alongside a Hirundo server, either the SaaS platform, a custom VPC deployment or an on-premises installation.

  Optimizing a classification dataset
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

- Currently ``hirundo`` requires a CSV file with the following columns (all columns are required):
- - ``image_path``: The location of the image within the dataset ``root``
- - ``label``: The label of the image, i.e. which the class that was annotated for this image
+ Currently `hirundo` requires a CSV file with the following columns (all columns are required):
+
+ - `image_path`: The location of the image within the dataset `data_root_url`
+ - `class_name`: The semantic label, i.e. the class name of the class that the image was annotated as belonging to
+
+ And outputs two Pandas DataFrames with the dataset columns as well as:
+
+ Suspect DataFrame (filename: `mislabel_suspects.csv`) columns:

- And outputs a CSV with the same columns and:
- - ``suspect_level``: mislabel suspect level
- - ``suggested_label``: suggested label
- - ``suggested_label_conf``: suggested label confidence
+ - ``suspect_score``: mislabel suspect score
+ - ``suspect_level``: mislabel suspect level
+ - ``suspect_rank``: mislabel suspect ranking
+ - ``suggested_class_name``: suggested semantic label
+ - ``suggested_class_conf``: suggested semantic label confidence
+
+ Errors and warnings DataFrame (filename: `invalid_data.csv`) columns:
+
+ - ``status``: status message (one of ``NO_LABELS`` / ``MISSING_IMAGE`` / ``INVALID_IMAGE``)

  Optimizing an object detection (OD) dataset
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

  Currently ``hirundo`` requires a CSV file with the following columns (all columns are required):
- - ``image_path``: The location of the image within the dataset ``root``
- - ``bbox_id``: The index of the bounding box within the dataset. Used to indicate label suspects
- - ``label``: The label of the image, i.e. which the class that was annotated for this image
- - ``x1``, ``y1``, ``x2``, ``y2``: The bounding box coordinates of the object within the image

- And outputs a CSV with the same columns and:
- - ``suspect_level``: object mislabel suspect level
- - ``suggested_label``: suggested object label
- - ``suggested_label_conf``: suggested object label confidence
+ - ``image_path``: The location of the image within the dataset ``data_root_url``
+ - ``object_id``: The ID of the bounding box within the dataset. Used to indicate object suspects
+ - ``class_name``: Object semantic label, i.e. the class name of the object that was annotated
+ - ``xmin``: leftmost horizontal pixel coordinate of the object's bounding box
+ - ``ymin``: uppermost vertical pixel coordinate of the object's bounding box
+ - ``xmax``: rightmost horizontal pixel coordinate of the object's bounding box
+ - ``ymax``: lowermost vertical pixel coordinate of the object's bounding box

- Note: This Python package must be used alongside a Hirundo server, either the SaaS platform, a custom VPC deployment or an on-premises installation.

+ And outputs two Pandas DataFrames with the dataset columns as well as:
+
+ Suspect DataFrame (filename: `mislabel_suspects.csv`) columns:
+
+ - ``suspect_score``: object mislabel suspect score
+ - ``suspect_level``: object mislabel suspect level
+ - ``suspect_rank``: object mislabel suspect ranking
+ - ``suggested_class_name``: suggested object semantic label
+ - ``suggested_class_conf``: suggested object semantic label confidence
+
+ Errors and warnings DataFrame (filename: `invalid_data.csv`) columns:
+ - ``status``: status message (one of ``NO_LABELS`` / ``MISSING_IMAGE`` / ``INVALID_IMAGE`` / ``INVALID_BBOX`` / ``INVALID_BBOX_SIZE``)

  ## Installation

@@ -46,6 +68,7 @@ You can install the codebase with a simple `pip install hirundo` to install the
  ## Usage

  Classification example:
+
  ```python
  from hirundo import (
      HirundoCSV,
@@ -81,7 +104,6 @@ results = test_dataset.check_run()
  print(results)
  ```

-
  Object detection example:

  ```python
@@ -98,7 +120,7 @@ from hirundo import (
  git_storage = StorageGit(
      repo=GitRepo(
          name="BDD-100k-validation-dataset",
-         repository_url="https://git@hf.co/datasets/hirundo-io/bdd100k-validation-only.git",
+         repository_url="https://huggingface.co/datasets/hirundo-io/bdd100k-validation-only",
      ),
      branch="main",
  )
@@ -116,21 +138,6 @@ test_dataset = OptimizationDataset(
              path="/BDD100K Val from Hirundo.zip/bdd100k/bdd100k.csv"
          ),
      ),
-     classes=[
-         "traffic light",
-         "traffic sign",
-         "car",
-         "pedestrian",
-         "bus",
-         "truck",
-         "rider",
-         "bicycle",
-         "motorcycle",
-         "train",
-         "other vehicle",
-         "other person",
-         "trailer",
-     ],
  )

  test_dataset.run_optimization()
@@ -138,8 +145,8 @@ results = test_dataset.check_run()
  print(results)
  ```

- Note: Currently we only support the main CPython release 3.9, 3.10 and 3.11. PyPy support may be introduced in the future.
+ Note: Currently we only support the main CPython release 3.9, 3.10, 3.11, 3.12 & 3.13. PyPy support may be introduced in the future.

  ## Further documentation

- To learn more about how to use this library, please visit the [http://docs.hirundo.io/](documentation) or see the Google Colab examples.
+ To learn more about how to use this library, please visit the [http://docs.hirundo.io/](documentation) or see the [Google Colab examples](https://github.com/Hirundo-io/hirundo-client/tree/main/notebooks).
@@ -1,43 +1,62 @@
+ from .dataset_enum import (
+     DatasetMetadataType,
+     LabelingType,
+     StorageTypes,
+ )
  from .dataset_optimization import (
-     COCO,
-     YOLO,
-     HirundoCSV,
      HirundoError,
      OptimizationDataset,
      RunArgs,
      VisionRunArgs,
  )
- from .enum import (
-     DatasetMetadataType,
-     LabelingType,
+ from .dataset_optimization_results import DatasetOptimizationResults
+ from .git import GitPlainAuth, GitRepo, GitSSHAuth
+ from .labeling import (
+     COCO,
+     YOLO,
+     HirundoCSV,
+     KeylabsAuth,
+     KeylabsObjDetImages,
+     KeylabsObjDetVideo,
+     KeylabsObjSegImages,
+     KeylabsObjSegVideo,
  )
- from .git import GitRepo
  from .storage import (
      StorageConfig,
      StorageGCP,
      # StorageAzure, TODO: Azure storage is coming soon
      StorageGit,
      StorageS3,
-     StorageTypes,
  )
+ from .unzip import load_df, load_from_zip

  __all__ = [
      "COCO",
      "YOLO",
-     "HirundoCSV",
      "HirundoError",
+     "HirundoCSV",
+     "KeylabsAuth",
+     "KeylabsObjDetImages",
+     "KeylabsObjDetVideo",
+     "KeylabsObjSegImages",
+     "KeylabsObjSegVideo",
      "OptimizationDataset",
      "RunArgs",
      "VisionRunArgs",
-     "LabelingType",
      "DatasetMetadataType",
+     "LabelingType",
+     "GitPlainAuth",
      "GitRepo",
+     "GitSSHAuth",
      "StorageTypes",
      "StorageS3",
      "StorageGCP",
      # "StorageAzure", TODO: Azure storage is coming soon
      "StorageGit",
      "StorageConfig",
+     "DatasetOptimizationResults",
+     "load_df",
+     "load_from_zip",
  ]

- __version__ = "0.1.9"
+ __version__ = "0.1.18"
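As a quick orientation to the reshuffled public API: everything in the new `__all__` imports from the package root. A sketch using a few of the newly exported names:

```python
# Names taken from the __all__ list above; labeling classes now live in
# hirundo.labeling but are re-exported from the package root.
from hirundo import (
    DatasetOptimizationResults,
    GitPlainAuth,
    GitSSHAuth,
    KeylabsObjDetImages,
    load_df,
    load_from_zip,
)
```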
@@ -0,0 +1,164 @@
+ import re
+ import typing
+ from typing import TYPE_CHECKING
+
+ from hirundo._urls import (
+     LENGTH_CONSTRAINTS,
+     STORAGE_PATTERNS,
+ )
+ from hirundo.dataset_enum import DatasetMetadataType, LabelingType, StorageTypes
+ from hirundo.labeling import COCO, YOLO, HirundoCSV, Keylabs
+
+ if TYPE_CHECKING:
+     from hirundo._urls import HirundoUrl
+     from hirundo.dataset_optimization import LabelingInfo
+     from hirundo.storage import (
+         ResponseStorageConfig,
+         StorageConfig,
+         StorageGCP,
+         StorageGCPOut,
+         StorageS3,
+         StorageS3Out,
+     )
+
+ LABELING_TYPES_TO_DATASET_METADATA_TYPES = {
+     LabelingType.SINGLE_LABEL_CLASSIFICATION: [
+         DatasetMetadataType.HIRUNDO_CSV,
+     ],
+     LabelingType.OBJECT_DETECTION: [
+         DatasetMetadataType.HIRUNDO_CSV,
+         DatasetMetadataType.COCO,
+         DatasetMetadataType.YOLO,
+         DatasetMetadataType.KeylabsObjDetImages,
+         DatasetMetadataType.KeylabsObjDetVideo,
+     ],
+     LabelingType.OBJECT_SEGMENTATION: [
+         DatasetMetadataType.HIRUNDO_CSV,
+         DatasetMetadataType.KeylabsObjSegImages,
+         DatasetMetadataType.KeylabsObjSegVideo,
+     ],
+     LabelingType.SEMANTIC_SEGMENTATION: [
+         DatasetMetadataType.HIRUNDO_CSV,
+     ],
+     LabelingType.PANOPTIC_SEGMENTATION: [
+         DatasetMetadataType.HIRUNDO_CSV,
+     ],
+     LabelingType.SPEECH_TO_TEXT: [
+         DatasetMetadataType.HIRUNDO_CSV,
+     ],
+ }
+
+
+ def validate_s3_url(str_url: str, s3_config: "StorageS3 | StorageS3Out"):
+     if (
+         len(str_url) < LENGTH_CONSTRAINTS[StorageTypes.S3]["min_length"]
+         or len(str_url) > LENGTH_CONSTRAINTS[StorageTypes.S3]["max_length"]
+     ):
+         raise ValueError("S3 URL must be between 8 and 1023 characters")
+     elif not re.match(STORAGE_PATTERNS[StorageTypes.S3], str_url):
+         raise ValueError(
+             f"Invalid S3 URL. Pattern must match: {STORAGE_PATTERNS[StorageTypes.S3]}"
+         )
+     elif not str_url.startswith(f"{s3_config.bucket_url}/"):
+         raise ValueError(f"S3 URL must start with {s3_config.bucket_url}/")
+
+
+ def validate_gcp_url(str_url: str, gcp_config: "StorageGCP | StorageGCPOut"):
+     matches = re.match(STORAGE_PATTERNS[StorageTypes.GCP], str_url)
+     if (
+         len(str_url) < LENGTH_CONSTRAINTS[StorageTypes.GCP]["min_length"]
+         or len(str_url) > LENGTH_CONSTRAINTS[StorageTypes.GCP]["max_length"]
+     ):
+         raise ValueError(
+             f"GCP URL must be between {LENGTH_CONSTRAINTS[StorageTypes.GCP]['min_length']}"
+             + f" and {LENGTH_CONSTRAINTS[StorageTypes.GCP]['max_length']} characters"
+         )
+     elif not matches:
+         raise ValueError(
+             f"Invalid GCP URL. Pattern must match: {STORAGE_PATTERNS[StorageTypes.GCP]}"
+         )
+     elif (
+         matches
+         and len(matches.group(1))
+         > LENGTH_CONSTRAINTS[StorageTypes.GCP]["bucket_max_length"]
+     ):
+         raise ValueError(
+             f"GCP bucket name must be between {LENGTH_CONSTRAINTS[StorageTypes.GCP]['bucket_min_length']} "
+             + f"and {LENGTH_CONSTRAINTS[StorageTypes.GCP]['bucket_max_length']} characters"
+         )
+     elif not str_url.startswith(f"gs://{gcp_config.bucket_name}/"):
+         raise ValueError(f"GCP URL must start with gs://{gcp_config.bucket_name}")
+
+
+ def validate_url(
+     url: "HirundoUrl",
+     storage_config: "StorageConfig | ResponseStorageConfig",
+ ) -> "HirundoUrl":
+     s3_config = storage_config.s3
+     gcp_config = storage_config.gcp
+     git_config = storage_config.git
+     str_url = str(url)
+
+     if s3_config is not None:
+         validate_s3_url(str_url, s3_config)
+     elif gcp_config is not None:
+         validate_gcp_url(str_url, gcp_config)
+     elif (
+         git_config is not None
+         and not str_url.startswith("https://")
+         and not str_url.startswith("ssh://")
+     ):
+         raise ValueError("Git URL must start with https:// or ssh://")
+     elif storage_config.type == StorageTypes.LOCAL and not str_url.startswith(
+         "file:///datasets/"
+     ):
+         raise ValueError("Local URL must start with file:///datasets/")
+     return url
+
+
+ def validate_labeling_type(
+     labeling_type: "LabelingType", labeling_info: "LabelingInfo"
+ ) -> None:
+     """
+     Validate that the labeling type is compatible with the labeling info
+
+     Args:
+         labeling_type: The type of labeling that will be performed
+         labeling_info: The labeling info to validate
+     """
+     dataset_metadata_types = LABELING_TYPES_TO_DATASET_METADATA_TYPES[labeling_type]
+     if labeling_info.type not in dataset_metadata_types:
+         raise ValueError(
+             f"Cannot use {labeling_info.type.name} labeling info with {labeling_type.name} datasets"
+         )
+
+
+ def validate_labeling_info(
+     labeling_type: "LabelingType",
+     labeling_info: "typing.Union[LabelingInfo, list[LabelingInfo]]",
+     storage_config: "typing.Union[StorageConfig, ResponseStorageConfig]",
+ ) -> None:
+     """
+     Validate the labeling info for a dataset
+
+     Args:
+         labeling_type: The type of labeling that will be performed
+         labeling_info: The labeling info to validate
+         storage_config: The storage configuration for the dataset.
+             StorageConfig is used to validate the URLs in the labeling info
+     """
+     if isinstance(labeling_info, list):
+         for labeling in labeling_info:
+             validate_labeling_info(labeling_type, labeling, storage_config)
+         return
+     elif isinstance(labeling_info, HirundoCSV):
+         validate_url(labeling_info.csv_url, storage_config)
+     elif isinstance(labeling_info, COCO):
+         validate_url(labeling_info.json_url, storage_config)
+     elif isinstance(labeling_info, YOLO):
+         validate_url(labeling_info.labels_dir_url, storage_config)
+         if labeling_info.data_yaml_url is not None:
+             validate_url(labeling_info.data_yaml_url, storage_config)
+     elif isinstance(labeling_info, Keylabs):
+         validate_url(labeling_info.labels_dir_url, storage_config)
+     validate_labeling_type(labeling_type, labeling_info)
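A minimal sketch of the compatibility check above. `SimpleNamespace` stands in for a real labeling-info object here (hypothetical; real callers pass `HirundoCSV`/`COCO`/etc. instances), but the allowed and forbidden pairs follow the `LABELING_TYPES_TO_DATASET_METADATA_TYPES` table directly:

```python
from types import SimpleNamespace

from hirundo._constraints import validate_labeling_type
from hirundo.dataset_enum import DatasetMetadataType, LabelingType

coco_like = SimpleNamespace(type=DatasetMetadataType.COCO)

# COCO is listed under OBJECT_DETECTION, so this returns without raising.
validate_labeling_type(LabelingType.OBJECT_DETECTION, coco_like)

# COCO is not listed under SINGLE_LABEL_CLASSIFICATION, so this raises:
# ValueError: Cannot use COCO labeling info with SINGLE_LABEL_CLASSIFICATION datasets
validate_labeling_type(LabelingType.SINGLE_LABEL_CLASSIFICATION, coco_like)
```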
@@ -0,0 +1,43 @@
+ has_pandas = False
+ has_polars = False
+
+ pd = None
+ pl = None
+ int32 = type[None]
+ float32 = type[None]
+ string = type[None]
+ # ⬆️ These are just placeholders for the int32, float32 and string types
+ # for when neither pandas nor polars are available
+
+ try:
+     import numpy as np
+     import pandas as pd
+
+     has_pandas = True
+     int32 = np.int32
+     float32 = np.float32
+     string = str
+ except ImportError:
+     pass
+
+ try:
+     import polars as pl
+     import polars.datatypes as pl_datatypes
+
+     has_polars = True
+     int32 = pl_datatypes.Int32
+     float32 = pl_datatypes.Float32
+     string = pl_datatypes.String
+ except ImportError:
+     pass
+
+
+ __all__ = [
+     "has_polars",
+     "has_pandas",
+     "pd",
+     "pl",
+     "int32",
+     "float32",
+     "string",
+ ]
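A sketch of how downstream code can rely on this shim instead of writing its own guarded imports, assuming one of the `pandas`/`polars` extras is installed:

```python
from hirundo._dataframe import has_pandas, has_polars, pd, pl

# Prefer polars when available, fall back to pandas, else fail loudly.
if has_polars:
    df = pl.DataFrame({"image_path": ["images/0001.jpg"], "class_name": ["cat"]})
elif has_pandas:
    df = pd.DataFrame({"image_path": ["images/0001.jpg"], "class_name": ["cat"]})
else:
    raise ImportError('Install hirundo with the "pandas" or "polars" extra')
```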
@@ -2,11 +2,11 @@ import enum
  import os
  from pathlib import Path

- from dotenv import load_dotenv
+ from dotenv import find_dotenv, load_dotenv


  class EnvLocation(enum.Enum):
-     DOTENV = Path.cwd() / ".env"
+     DOTENV = find_dotenv(".env")
      HOME = Path.home() / ".hirundo.conf"


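The switch from a fixed `Path.cwd() / ".env"` to `find_dotenv(".env")` means the config file no longer has to sit in the exact working directory: per python-dotenv's documented behavior, `find_dotenv` searches upward through parent directories and returns an empty string when nothing is found. A small sketch:

```python
from dotenv import find_dotenv, load_dotenv

# Walks up the directory tree looking for ".env"; returns "" when absent.
dotenv_path = find_dotenv(".env")
if dotenv_path:
    load_dotenv(dotenv_path)
```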
@@ -0,0 +1,29 @@
+ from hirundo._env import API_KEY, check_api_key
+
+ HIRUNDO_API_VERSION = "0.2"
+
+ _json_headers = {
+     "Content-Type": "application/json",
+     "Accept": "application/json",
+ }
+
+
+ def _get_auth_headers():
+     check_api_key()
+     return {
+         "Authorization": f"Bearer {API_KEY}",
+     }
+
+
+ def _get_api_version_header():
+     return {
+         "HIRUNDO-API-VERSION": HIRUNDO_API_VERSION,
+     }
+
+
+ def get_headers():
+     return {
+         **_json_headers,
+         **_get_auth_headers(),
+         **_get_api_version_header(),
+     }
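A sketch of the intended call pattern: `get_headers()` merges the static JSON headers, the Bearer token from `hirundo._env`, and the `HIRUNDO-API-VERSION` header into a single dict suitable for `httpx` (a declared dependency). The URL below is a placeholder, not a real endpoint:

```python
import httpx

from hirundo._headers import get_headers

# check_api_key() runs inside get_headers(), so this fails fast if no key is set.
response = httpx.get("https://example.invalid/some-endpoint", headers=get_headers())
```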
@@ -0,0 +1,3 @@
+ READ_TIMEOUT = 30.0
+ MODIFY_TIMEOUT = 60.0
+ DOWNLOAD_READ_TIMEOUT = 600.0  # 10 minutes
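One plausible mapping of these constants onto `httpx` timeouts; the diff only defines the constants, so treat this pairing as an assumption for illustration:

```python
import httpx

from hirundo._timeouts import DOWNLOAD_READ_TIMEOUT, MODIFY_TIMEOUT, READ_TIMEOUT

api_timeout = httpx.Timeout(READ_TIMEOUT)  # ordinary API reads: 30s
write_timeout = httpx.Timeout(MODIFY_TIMEOUT)  # create/update calls: 60s
download_timeout = httpx.Timeout(READ_TIMEOUT, read=DOWNLOAD_READ_TIMEOUT)  # result downloads
```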