hirundo 0.1.16__tar.gz → 0.1.18__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. {hirundo-0.1.16 → hirundo-0.1.18}/PKG-INFO +48 -41
  2. {hirundo-0.1.16 → hirundo-0.1.18}/README.md +45 -38
  3. {hirundo-0.1.16 → hirundo-0.1.18}/hirundo/__init__.py +19 -7
  4. hirundo-0.1.18/hirundo/_constraints.py +164 -0
  5. hirundo-0.1.16/hirundo/_constraints.py → hirundo-0.1.18/hirundo/_urls.py +12 -6
  6. hirundo-0.1.18/hirundo/dataset_enum.py +46 -0
  7. {hirundo-0.1.16 → hirundo-0.1.18}/hirundo/dataset_optimization.py +62 -76
  8. {hirundo-0.1.16 → hirundo-0.1.18}/hirundo/git.py +1 -1
  9. hirundo-0.1.18/hirundo/labeling.py +140 -0
  10. {hirundo-0.1.16 → hirundo-0.1.18}/hirundo/storage.py +42 -58
  11. {hirundo-0.1.16 → hirundo-0.1.18}/hirundo.egg-info/PKG-INFO +48 -41
  12. {hirundo-0.1.16 → hirundo-0.1.18}/hirundo.egg-info/SOURCES.txt +2 -0
  13. {hirundo-0.1.16 → hirundo-0.1.18}/hirundo.egg-info/requires.txt +2 -2
  14. {hirundo-0.1.16 → hirundo-0.1.18}/pyproject.toml +15 -25
  15. hirundo-0.1.16/hirundo/dataset_enum.py +0 -23
  16. {hirundo-0.1.16 → hirundo-0.1.18}/LICENSE +0 -0
  17. {hirundo-0.1.16 → hirundo-0.1.18}/hirundo/__main__.py +0 -0
  18. {hirundo-0.1.16 → hirundo-0.1.18}/hirundo/_dataframe.py +0 -0
  19. {hirundo-0.1.16 → hirundo-0.1.18}/hirundo/_env.py +0 -0
  20. {hirundo-0.1.16 → hirundo-0.1.18}/hirundo/_headers.py +0 -0
  21. {hirundo-0.1.16 → hirundo-0.1.18}/hirundo/_http.py +0 -0
  22. {hirundo-0.1.16 → hirundo-0.1.18}/hirundo/_iter_sse_retrying.py +0 -0
  23. {hirundo-0.1.16 → hirundo-0.1.18}/hirundo/_timeouts.py +0 -0
  24. {hirundo-0.1.16 → hirundo-0.1.18}/hirundo/cli.py +0 -0
  25. {hirundo-0.1.16 → hirundo-0.1.18}/hirundo/dataset_optimization_results.py +0 -0
  26. {hirundo-0.1.16 → hirundo-0.1.18}/hirundo/logger.py +0 -0
  27. {hirundo-0.1.16 → hirundo-0.1.18}/hirundo/unzip.py +0 -0
  28. {hirundo-0.1.16 → hirundo-0.1.18}/hirundo.egg-info/dependency_links.txt +0 -0
  29. {hirundo-0.1.16 → hirundo-0.1.18}/hirundo.egg-info/entry_points.txt +0 -0
  30. {hirundo-0.1.16 → hirundo-0.1.18}/hirundo.egg-info/top_level.txt +0 -0
  31. {hirundo-0.1.16 → hirundo-0.1.18}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: hirundo
- Version: 0.1.16
+ Version: 0.1.18
  Summary: This package is used to interface with Hirundo's platform. It provides a simple API to optimize your ML datasets.
  Author-email: Hirundo <dev@hirundo.io>
  License: MIT License
@@ -55,7 +55,7 @@ Requires-Dist: platformdirs>=4.3.6; extra == "dev"
  Requires-Dist: safety>=3.2.13; extra == "dev"
  Provides-Extra: docs
  Requires-Dist: sphinx>=7.4.7; extra == "docs"
- Requires-Dist: sphinx-autobuild>=2024.4.16; extra == "docs"
+ Requires-Dist: sphinx-autobuild>=2024.9.3; extra == "docs"
  Requires-Dist: sphinx-click>=5.0.1; extra == "docs"
  Requires-Dist: autodoc_pydantic>=2.2.0; extra == "docs"
  Requires-Dist: furo; extra == "docs"
@@ -64,7 +64,7 @@ Requires-Dist: esbonio; extra == "docs"
  Requires-Dist: starlette>0.40.0; extra == "docs"
  Requires-Dist: markupsafe>=3.0.2; extra == "docs"
  Provides-Extra: pandas
- Requires-Dist: pandas>=2.2.2; extra == "pandas"
+ Requires-Dist: pandas>=2.2.3; extra == "pandas"
  Provides-Extra: polars
  Requires-Dist: polars>=1.0.0; extra == "polars"
  Dynamic: license-file
@@ -75,40 +75,62 @@ This package exposes access to Hirundo APIs for dataset optimization for Machine

  Dataset optimization is currently available for datasets labelled for classification and object detection.

- 
  Support dataset storage configs include:
- - Google Cloud (GCP) Storage
- - Amazon Web Services (AWS) S3
- - Git LFS (Large File Storage) repositories (e.g. GitHub or HuggingFace)
+ 
+ - Google Cloud (GCP) Storage
+ - Amazon Web Services (AWS) S3
+ - Git LFS (Large File Storage) repositories (e.g. GitHub or HuggingFace)
+ 
+ Note: This Python package must be used alongside a Hirundo server, either the SaaS platform, a custom VPC deployment or an on-premises installation.

  Optimizing a classification dataset
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

- Currently ``hirundo`` requires a CSV file with the following columns (all columns are required):
- - ``image_path``: The location of the image within the dataset ``root``
- - ``label``: The label of the image, i.e. which the class that was annotated for this image
+ Currently `hirundo` requires a CSV file with the following columns (all columns are required):
+ 
+ - `image_path`: The location of the image within the dataset `data_root_url`
+ - `class_name`: The semantic label, i.e. the class name of the class that the image was annotated as belonging to
+ 
+ And outputs two Pandas DataFrames with the dataset columns as well as:
+ 
+ Suspect DataFrame (filename: `mislabel_suspects.csv`) columns:

- And outputs a CSV with the same columns and:
- - ``suspect_level``: mislabel suspect level
- - ``suggested_label``: suggested label
- - ``suggested_label_conf``: suggested label confidence
+ - ``suspect_score``: mislabel suspect score
+ - ``suspect_level``: mislabel suspect level
+ - ``suspect_rank``: mislabel suspect ranking
+ - ``suggested_class_name``: suggested semantic label
+ - ``suggested_class_conf``: suggested semantic label confidence
+ 
+ Errors and warnings DataFrame (filename: `invalid_data.csv`) columns:
+ 
+ - ``status``: status message (one of ``NO_LABELS`` / ``MISSING_IMAGE`` / ``INVALID_IMAGE``)

  Optimizing an object detection (OD) dataset
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

  Currently ``hirundo`` requires a CSV file with the following columns (all columns are required):
- - ``image_path``: The location of the image within the dataset ``root``
- - ``bbox_id``: The index of the bounding box within the dataset. Used to indicate label suspects
- - ``label``: The label of the image, i.e. which the class that was annotated for this image
- - ``x1``, ``y1``, ``x2``, ``y2``: The bounding box coordinates of the object within the image

- And outputs a CSV with the same columns and:
- - ``suspect_level``: object mislabel suspect level
- - ``suggested_label``: suggested object label
- - ``suggested_label_conf``: suggested object label confidence
+ - ``image_path``: The location of the image within the dataset ``data_root_url``
+ - ``object_id``: The ID of the bounding box within the dataset. Used to indicate object suspects
+ - ``class_name``: Object semantic label, i.e. the class name of the object that was annotated
+ - ``xmin``: leftmost horizontal pixel coordinate of the object's bounding box
+ - ``ymin``: uppermost vertical pixel coordinate of the object's bounding box
+ - ``xmax``: rightmost horizontal pixel coordinate of the object's bounding box
+ - ``ymax``: lowermost vertical pixel coordinate of the object's bounding box

- Note: This Python package must be used alongside a Hirundo server, either the SaaS platform, a custom VPC deployment or an on-premises installation.

+ And outputs two Pandas DataFrames with the dataset columns as well as:
+ 
+ Suspect DataFrame (filename: `mislabel_suspects.csv`) columns:
+ 
+ - ``suspect_score``: object mislabel suspect score
+ - ``suspect_level``: object mislabel suspect level
+ - ``suspect_rank``: object mislabel suspect ranking
+ - ``suggested_class_name``: suggested object semantic label
+ - ``suggested_class_conf``: suggested object semantic label confidence
+ 
+ Errors and warnings DataFrame (filename: `invalid_data.csv`) columns:
+ - ``status``: status message (one of ``NO_LABELS`` / ``MISSING_IMAGE`` / ``INVALID_IMAGE`` / ``INVALID_BBOX`` / ``INVALID_BBOX_SIZE``)

  ## Installation

@@ -117,6 +139,7 @@ You can install the codebase with a simple `pip install hirundo` to install the
  ## Usage

  Classification example:
+ 
  ```python
  from hirundo import (
      HirundoCSV,
@@ -152,7 +175,6 @@ results = test_dataset.check_run()
  print(results)
  ```

- 
  Object detection example:

  ```python
@@ -187,21 +209,6 @@ test_dataset = OptimizationDataset(
              path="/BDD100K Val from Hirundo.zip/bdd100k/bdd100k.csv"
          ),
      ),
-     classes=[
-         "traffic light",
-         "traffic sign",
-         "car",
-         "pedestrian",
-         "bus",
-         "truck",
-         "rider",
-         "bicycle",
-         "motorcycle",
-         "train",
-         "other vehicle",
-         "other person",
-         "trailer",
-     ],
  )

  test_dataset.run_optimization()
@@ -209,8 +216,8 @@ results = test_dataset.check_run()
  print(results)
  ```

- Note: Currently we only support the main CPython release 3.9, 3.10 and 3.11. PyPy support may be introduced in the future.
+ Note: Currently we only support the main CPython release 3.9, 3.10, 3.11, 3.12 & 3.13. PyPy support may be introduced in the future.

  ## Further documentation

- To learn more about how to use this library, please visit the [http://docs.hirundo.io/](documentation) or see the Google Colab examples.
+ To learn more about how to use this library, please visit the [http://docs.hirundo.io/](documentation) or see the [Google Colab examples](https://github.com/Hirundo-io/hirundo-client/tree/main/notebooks).
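
For readers tracking the schema change above: the old `label`, `bbox_id` and `x1`/`y1`/`x2`/`y2` columns give way to `class_name`, `object_id` and `xmin`/`ymin`/`xmax`/`ymax`. A minimal sketch of input CSVs matching the new column names; the file names and sample rows here are invented for illustration, only the columns come from the README text:

```python
# Illustrative input CSVs for the 0.1.18 Hirundo CSV schema.
import pandas as pd

# Classification input: one row per image.
classification_df = pd.DataFrame(
    {
        "image_path": ["train/cat_001.jpg", "train/dog_042.jpg"],
        "class_name": ["cat", "dog"],
    }
)
classification_df.to_csv("classification_labels.csv", index=False)

# Object detection input: one row per annotated bounding box,
# with pixel coordinates for each box.
detection_df = pd.DataFrame(
    {
        "image_path": ["val/0001.jpg", "val/0001.jpg"],
        "object_id": [0, 1],
        "class_name": ["car", "pedestrian"],
        "xmin": [10, 220],
        "ymin": [32, 80],
        "xmax": [128, 300],
        "ymax": [96, 240],
    }
)
detection_df.to_csv("detection_labels.csv", index=False)
```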
@@ -4,40 +4,62 @@ This package exposes access to Hirundo APIs for dataset optimization for Machine

  Dataset optimization is currently available for datasets labelled for classification and object detection.

- 
  Support dataset storage configs include:
- - Google Cloud (GCP) Storage
- - Amazon Web Services (AWS) S3
- - Git LFS (Large File Storage) repositories (e.g. GitHub or HuggingFace)
+ 
+ - Google Cloud (GCP) Storage
+ - Amazon Web Services (AWS) S3
+ - Git LFS (Large File Storage) repositories (e.g. GitHub or HuggingFace)
+ 
+ Note: This Python package must be used alongside a Hirundo server, either the SaaS platform, a custom VPC deployment or an on-premises installation.

  Optimizing a classification dataset
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

- Currently ``hirundo`` requires a CSV file with the following columns (all columns are required):
- - ``image_path``: The location of the image within the dataset ``root``
- - ``label``: The label of the image, i.e. which the class that was annotated for this image
+ Currently `hirundo` requires a CSV file with the following columns (all columns are required):
+ 
+ - `image_path`: The location of the image within the dataset `data_root_url`
+ - `class_name`: The semantic label, i.e. the class name of the class that the image was annotated as belonging to
+ 
+ And outputs two Pandas DataFrames with the dataset columns as well as:
+ 
+ Suspect DataFrame (filename: `mislabel_suspects.csv`) columns:

- And outputs a CSV with the same columns and:
- - ``suspect_level``: mislabel suspect level
- - ``suggested_label``: suggested label
- - ``suggested_label_conf``: suggested label confidence
+ - ``suspect_score``: mislabel suspect score
+ - ``suspect_level``: mislabel suspect level
+ - ``suspect_rank``: mislabel suspect ranking
+ - ``suggested_class_name``: suggested semantic label
+ - ``suggested_class_conf``: suggested semantic label confidence
+ 
+ Errors and warnings DataFrame (filename: `invalid_data.csv`) columns:
+ 
+ - ``status``: status message (one of ``NO_LABELS`` / ``MISSING_IMAGE`` / ``INVALID_IMAGE``)

  Optimizing an object detection (OD) dataset
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

  Currently ``hirundo`` requires a CSV file with the following columns (all columns are required):
- - ``image_path``: The location of the image within the dataset ``root``
- - ``bbox_id``: The index of the bounding box within the dataset. Used to indicate label suspects
- - ``label``: The label of the image, i.e. which the class that was annotated for this image
- - ``x1``, ``y1``, ``x2``, ``y2``: The bounding box coordinates of the object within the image

- And outputs a CSV with the same columns and:
- - ``suspect_level``: object mislabel suspect level
- - ``suggested_label``: suggested object label
- - ``suggested_label_conf``: suggested object label confidence
+ - ``image_path``: The location of the image within the dataset ``data_root_url``
+ - ``object_id``: The ID of the bounding box within the dataset. Used to indicate object suspects
+ - ``class_name``: Object semantic label, i.e. the class name of the object that was annotated
+ - ``xmin``: leftmost horizontal pixel coordinate of the object's bounding box
+ - ``ymin``: uppermost vertical pixel coordinate of the object's bounding box
+ - ``xmax``: rightmost horizontal pixel coordinate of the object's bounding box
+ - ``ymax``: lowermost vertical pixel coordinate of the object's bounding box

- Note: This Python package must be used alongside a Hirundo server, either the SaaS platform, a custom VPC deployment or an on-premises installation.

+ And outputs two Pandas DataFrames with the dataset columns as well as:
+ 
+ Suspect DataFrame (filename: `mislabel_suspects.csv`) columns:
+ 
+ - ``suspect_score``: object mislabel suspect score
+ - ``suspect_level``: object mislabel suspect level
+ - ``suspect_rank``: object mislabel suspect ranking
+ - ``suggested_class_name``: suggested object semantic label
+ - ``suggested_class_conf``: suggested object semantic label confidence
+ 
+ Errors and warnings DataFrame (filename: `invalid_data.csv`) columns:
+ - ``status``: status message (one of ``NO_LABELS`` / ``MISSING_IMAGE`` / ``INVALID_IMAGE`` / ``INVALID_BBOX`` / ``INVALID_BBOX_SIZE``)

  ## Installation

@@ -46,6 +68,7 @@ You can install the codebase with a simple `pip install hirundo` to install the
  ## Usage

  Classification example:
+ 
  ```python
  from hirundo import (
      HirundoCSV,
@@ -81,7 +104,6 @@ results = test_dataset.check_run()
  print(results)
  ```

- 
  Object detection example:

  ```python
@@ -116,21 +138,6 @@ test_dataset = OptimizationDataset(
              path="/BDD100K Val from Hirundo.zip/bdd100k/bdd100k.csv"
          ),
      ),
-     classes=[
-         "traffic light",
-         "traffic sign",
-         "car",
-         "pedestrian",
-         "bus",
-         "truck",
-         "rider",
-         "bicycle",
-         "motorcycle",
-         "train",
-         "other vehicle",
-         "other person",
-         "trailer",
-     ],
  )

  test_dataset.run_optimization()
@@ -138,8 +145,8 @@ results = test_dataset.check_run()
  print(results)
  ```

- Note: Currently we only support the main CPython release 3.9, 3.10 and 3.11. PyPy support may be introduced in the future.
+ Note: Currently we only support the main CPython release 3.9, 3.10, 3.11, 3.12 & 3.13. PyPy support may be introduced in the future.

  ## Further documentation

- To learn more about how to use this library, please visit the [http://docs.hirundo.io/](documentation) or see the Google Colab examples.
+ To learn more about how to use this library, please visit the [http://docs.hirundo.io/](documentation) or see the [Google Colab examples](https://github.com/Hirundo-io/hirundo-client/tree/main/notebooks).
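
The README now documents two output DataFrames rather than a single CSV. A hedged sketch of inspecting them with pandas, assuming the results have already been extracted locally; the paths are illustrative and only the file and column names come from the README text above:

```python
# Inspect the two result files named in the 0.1.18 README.
import pandas as pd

suspects = pd.read_csv("results/mislabel_suspects.csv")
invalid = pd.read_csv("results/invalid_data.csv")

# Highest-ranked mislabel suspects first, per the documented columns.
top = suspects.sort_values("suspect_rank").head(10)
print(top[["image_path", "class_name", "suggested_class_name", "suggested_class_conf"]])

# Count validation problems by status (e.g. MISSING_IMAGE, INVALID_IMAGE).
print(invalid["status"].value_counts())
```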
@@ -1,11 +1,9 @@
  from .dataset_enum import (
      DatasetMetadataType,
      LabelingType,
+     StorageTypes,
  )
  from .dataset_optimization import (
-     COCO,
-     YOLO,
-     HirundoCSV,
      HirundoError,
      OptimizationDataset,
      RunArgs,
@@ -13,26 +11,40 @@ from .dataset_optimization import (
  )
  from .dataset_optimization_results import DatasetOptimizationResults
  from .git import GitPlainAuth, GitRepo, GitSSHAuth
+ from .labeling import (
+     COCO,
+     YOLO,
+     HirundoCSV,
+     KeylabsAuth,
+     KeylabsObjDetImages,
+     KeylabsObjDetVideo,
+     KeylabsObjSegImages,
+     KeylabsObjSegVideo,
+ )
  from .storage import (
      StorageConfig,
      StorageGCP,
      # StorageAzure, TODO: Azure storage is coming soon
      StorageGit,
      StorageS3,
-     StorageTypes,
  )
  from .unzip import load_df, load_from_zip

  __all__ = [
      "COCO",
      "YOLO",
-     "HirundoCSV",
      "HirundoError",
+     "HirundoCSV",
+     "KeylabsAuth",
+     "KeylabsObjDetImages",
+     "KeylabsObjDetVideo",
+     "KeylabsObjSegImages",
+     "KeylabsObjSegVideo",
      "OptimizationDataset",
      "RunArgs",
      "VisionRunArgs",
-     "LabelingType",
      "DatasetMetadataType",
+     "LabelingType",
      "GitPlainAuth",
      "GitRepo",
      "GitSSHAuth",
@@ -47,4 +59,4 @@ __all__ = [
      "load_from_zip",
  ]

- __version__ = "0.1.16"
+ __version__ = "0.1.18"
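
The upshot of the `__init__.py` changes: the labeling classes now live in `hirundo.labeling` and `StorageTypes` moves to `hirundo.dataset_enum`, but all of them remain re-exported from the package root. A minimal sketch of the 0.1.18 import surface, based only on the `__all__` list above:

```python
# All of these imports come straight from the 0.1.18 __all__ list.
from hirundo import (
    COCO,
    YOLO,
    HirundoCSV,
    KeylabsObjDetImages,
    LabelingType,
    StorageTypes,
)

print(StorageTypes.S3, LabelingType.OBJECT_DETECTION)
```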
@@ -0,0 +1,164 @@
+ import re
+ import typing
+ from typing import TYPE_CHECKING
+ 
+ from hirundo._urls import (
+     LENGTH_CONSTRAINTS,
+     STORAGE_PATTERNS,
+ )
+ from hirundo.dataset_enum import DatasetMetadataType, LabelingType, StorageTypes
+ from hirundo.labeling import COCO, YOLO, HirundoCSV, Keylabs
+ 
+ if TYPE_CHECKING:
+     from hirundo._urls import HirundoUrl
+     from hirundo.dataset_optimization import LabelingInfo
+     from hirundo.storage import (
+         ResponseStorageConfig,
+         StorageConfig,
+         StorageGCP,
+         StorageGCPOut,
+         StorageS3,
+         StorageS3Out,
+     )
+ 
+ LABELING_TYPES_TO_DATASET_METADATA_TYPES = {
+     LabelingType.SINGLE_LABEL_CLASSIFICATION: [
+         DatasetMetadataType.HIRUNDO_CSV,
+     ],
+     LabelingType.OBJECT_DETECTION: [
+         DatasetMetadataType.HIRUNDO_CSV,
+         DatasetMetadataType.COCO,
+         DatasetMetadataType.YOLO,
+         DatasetMetadataType.KeylabsObjDetImages,
+         DatasetMetadataType.KeylabsObjDetVideo,
+     ],
+     LabelingType.OBJECT_SEGMENTATION: [
+         DatasetMetadataType.HIRUNDO_CSV,
+         DatasetMetadataType.KeylabsObjSegImages,
+         DatasetMetadataType.KeylabsObjSegVideo,
+     ],
+     LabelingType.SEMANTIC_SEGMENTATION: [
+         DatasetMetadataType.HIRUNDO_CSV,
+     ],
+     LabelingType.PANOPTIC_SEGMENTATION: [
+         DatasetMetadataType.HIRUNDO_CSV,
+     ],
+     LabelingType.SPEECH_TO_TEXT: [
+         DatasetMetadataType.HIRUNDO_CSV,
+     ],
+ }
+ 
+ 
+ def validate_s3_url(str_url: str, s3_config: "StorageS3 | StorageS3Out"):
+     if (
+         len(str_url) < LENGTH_CONSTRAINTS[StorageTypes.S3]["min_length"]
+         or len(str_url) > LENGTH_CONSTRAINTS[StorageTypes.S3]["max_length"]
+     ):
+         raise ValueError("S3 URL must be between 8 and 1023 characters")
+     elif not re.match(STORAGE_PATTERNS[StorageTypes.S3], str_url):
+         raise ValueError(
+             f"Invalid S3 URL. Pattern must match: {STORAGE_PATTERNS[StorageTypes.S3]}"
+         )
+     elif not str_url.startswith(f"{s3_config.bucket_url}/"):
+         raise ValueError(f"S3 URL must start with {s3_config.bucket_url}/")
+ 
+ 
+ def validate_gcp_url(str_url: str, gcp_config: "StorageGCP | StorageGCPOut"):
+     matches = re.match(STORAGE_PATTERNS[StorageTypes.GCP], str_url)
+     if (
+         len(str_url) < LENGTH_CONSTRAINTS[StorageTypes.GCP]["min_length"]
+         or len(str_url) > LENGTH_CONSTRAINTS[StorageTypes.GCP]["max_length"]
+     ):
+         raise ValueError(
+             f"GCP URL must be between {LENGTH_CONSTRAINTS[StorageTypes.GCP]['min_length']}"
+             + f" and {LENGTH_CONSTRAINTS[StorageTypes.GCP]['max_length']} characters"
+         )
+     elif not matches:
+         raise ValueError(
+             f"Invalid GCP URL. Pattern must match: {STORAGE_PATTERNS[StorageTypes.GCP]}"
+         )
+     elif (
+         matches
+         and len(matches.group(1))
+         > LENGTH_CONSTRAINTS[StorageTypes.GCP]["bucket_max_length"]
+     ):
+         raise ValueError(
+             f"GCP bucket name must be between {LENGTH_CONSTRAINTS[StorageTypes.GCP]['bucket_min_length']} "
+             + f"and {LENGTH_CONSTRAINTS[StorageTypes.GCP]['bucket_max_length']} characters"
+         )
+     elif not str_url.startswith(f"gs://{gcp_config.bucket_name}/"):
+         raise ValueError(f"GCP URL must start with gs://{gcp_config.bucket_name}")
+ 
+ 
+ def validate_url(
+     url: "HirundoUrl",
+     storage_config: "StorageConfig | ResponseStorageConfig",
+ ) -> "HirundoUrl":
+     s3_config = storage_config.s3
+     gcp_config = storage_config.gcp
+     git_config = storage_config.git
+     str_url = str(url)
+ 
+     if s3_config is not None:
+         validate_s3_url(str_url, s3_config)
+     elif gcp_config is not None:
+         validate_gcp_url(str_url, gcp_config)
+     elif (
+         git_config is not None
+         and not str_url.startswith("https://")
+         and not str_url.startswith("ssh://")
+     ):
+         raise ValueError("Git URL must start with https:// or ssh://")
+     elif storage_config.type == StorageTypes.LOCAL and not str_url.startswith(
+         "file:///datasets/"
+     ):
+         raise ValueError("Local URL must start with file:///datasets/")
+     return url
+ 
+ 
+ def validate_labeling_type(
+     labeling_type: "LabelingType", labeling_info: "LabelingInfo"
+ ) -> None:
+     """
+     Validate that the labeling type is compatible with the labeling info
+ 
+     Args:
+         labeling_type: The type of labeling that will be performed
+         labeling_info: The labeling info to validate
+     """
+     dataset_metadata_types = LABELING_TYPES_TO_DATASET_METADATA_TYPES[labeling_type]
+     if labeling_info.type not in dataset_metadata_types:
+         raise ValueError(
+             f"Cannot use {labeling_info.type.name} labeling info with {labeling_type.name} datasets"
+         )
+ 
+ 
+ def validate_labeling_info(
+     labeling_type: "LabelingType",
+     labeling_info: "typing.Union[LabelingInfo, list[LabelingInfo]]",
+     storage_config: "typing.Union[StorageConfig, ResponseStorageConfig]",
+ ) -> None:
+     """
+     Validate the labeling info for a dataset
+ 
+     Args:
+         labeling_type: The type of labeling that will be performed
+         labeling_info: The labeling info to validate
+         storage_config: The storage configuration for the dataset.
+             StorageConfig is used to validate the URLs in the labeling info
+     """
+     if isinstance(labeling_info, list):
+         for labeling in labeling_info:
+             validate_labeling_info(labeling_type, labeling, storage_config)
+         return
+     elif isinstance(labeling_info, HirundoCSV):
+         validate_url(labeling_info.csv_url, storage_config)
+     elif isinstance(labeling_info, COCO):
+         validate_url(labeling_info.json_url, storage_config)
+     elif isinstance(labeling_info, YOLO):
+         validate_url(labeling_info.labels_dir_url, storage_config)
+         if labeling_info.data_yaml_url is not None:
+             validate_url(labeling_info.data_yaml_url, storage_config)
+     elif isinstance(labeling_info, Keylabs):
+         validate_url(labeling_info.labels_dir_url, storage_config)
+     validate_labeling_type(labeling_type, labeling_info)
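
The new `_constraints.py` centralizes which labeling formats are valid for each task. A small sketch of how the compatibility table behaves; note that `hirundo._constraints` is a private module, so this is illustrative rather than a supported API:

```python
# The dict and enum names below come directly from the new module above.
from hirundo._constraints import LABELING_TYPES_TO_DATASET_METADATA_TYPES
from hirundo.dataset_enum import DatasetMetadataType, LabelingType

allowed = LABELING_TYPES_TO_DATASET_METADATA_TYPES[LabelingType.OBJECT_DETECTION]
print(DatasetMetadataType.YOLO in allowed)  # True: YOLO labels fit OD datasets

stt = LABELING_TYPES_TO_DATASET_METADATA_TYPES[LabelingType.SPEECH_TO_TEXT]
print(DatasetMetadataType.YOLO in stt)  # False: validate_labeling_type would raise
```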
@@ -3,6 +3,8 @@ from typing import Annotated
  from pydantic import StringConstraints, UrlConstraints
  from pydantic_core import Url

+ from hirundo.dataset_enum import StorageTypes
+ 
  S3BucketUrl = Annotated[
      str,
      StringConstraints(
@@ -21,12 +23,16 @@ StorageConfigName = Annotated[
      ),
  ]

- S3_MIN_LENGTH = 8
- S3_MAX_LENGTH = 1023
- S3_PATTERN = r"s3://[a-zA-Z0-9.-]{3,64}/[a-zA-Z0-9.-/]+"
- GCP_MIN_LENGTH = 8
- GCP_MAX_LENGTH = 1023
- GCP_PATTERN = r"gs://[a-zA-Z0-9.-]{3,64}/[a-zA-Z0-9.-/]+"
+ STORAGE_PATTERNS: dict[StorageTypes, str] = {
+     StorageTypes.S3: r"^s3:\/\/[a-z0-9\.\-]{3,63}/[a-zA-Z0-9!\-\/_\.\*'\(\)]+$",
+     StorageTypes.GCP: r"^gs:\/\/([a-z0-9][a-z0-9_-]{1,61}[a-z0-9](\.[a-z0-9][a-z0-9_-]{1,61}[a-z0-9])*)\/[^\x00-\x1F\x7F-\x9F\r\n]*$",
+ }
+ 
+ 
+ LENGTH_CONSTRAINTS: dict[StorageTypes, dict] = {
+     StorageTypes.S3: {"min_length": 8, "max_length": 1023, "bucket_max_length": None},
+     StorageTypes.GCP: {"min_length": 8, "max_length": 1023, "bucket_max_length": 222},
+ }

  RepoUrl = Annotated[
      Url,
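
The loose `S3_PATTERN`/`GCP_PATTERN` constants are replaced by a `STORAGE_PATTERNS` dict keyed by `StorageTypes`, with stricter anchored regexes: S3 bucket names are now lowercase-only, and GCP object paths reject control characters. A quick illustration with `re`; the URLs are invented examples, and `hirundo._urls` is a private module:

```python
import re

from hirundo._urls import STORAGE_PATTERNS
from hirundo.dataset_enum import StorageTypes

# Lowercase bucket with a normal key: matches.
print(bool(re.match(STORAGE_PATTERNS[StorageTypes.S3], "s3://my-bucket/path/to/data.csv")))   # True
# Uppercase bucket name: rejected by the new, stricter pattern.
print(bool(re.match(STORAGE_PATTERNS[StorageTypes.S3], "s3://MY_BUCKET/data.csv")))           # False
# GCP bucket plus object path: matches.
print(bool(re.match(STORAGE_PATTERNS[StorageTypes.GCP], "gs://my-bucket/labels/train.csv")))  # True
```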
@@ -0,0 +1,46 @@
+ from enum import Enum
+ 
+ 
+ class LabelingType(str, Enum):
+     """
+     Enum indicate what type of labeling is used for the given dataset.
+     Supported types are:
+     """
+ 
+     SINGLE_LABEL_CLASSIFICATION = "SingleLabelClassification"
+     OBJECT_DETECTION = "ObjectDetection"
+     SPEECH_TO_TEXT = "SpeechToText"
+     OBJECT_SEGMENTATION = "ObjectSegmentation"
+     SEMANTIC_SEGMENTATION = "SemanticSegmentation"
+     PANOPTIC_SEGMENTATION = "PanopticSegmentation"
+ 
+ 
+ class DatasetMetadataType(str, Enum):
+     """
+     Enum indicate what type of metadata is provided for the given dataset.
+     Supported types are:
+     """
+ 
+     HIRUNDO_CSV = "HirundoCSV"
+     COCO = "COCO"
+     YOLO = "YOLO"
+     KeylabsObjDetImages = "KeylabsObjDetImages"
+     KeylabsObjDetVideo = "KeylabsObjDetVideo"
+     KeylabsObjSegImages = "KeylabsObjSegImages"
+     KeylabsObjSegVideo = "KeylabsObjSegVideo"
+ 
+ 
+ class StorageTypes(str, Enum):
+     """
+     Enum for the different types of storage configs.
+     Supported types are:
+     """
+ 
+     S3 = "S3"
+     GCP = "GCP"
+     # AZURE = "Azure" TODO: Azure storage config is coming soon
+     GIT = "Git"
+     LOCAL = "Local"
+     """
+     Local storage config is only supported for on-premises installations.
+     """