hirundo-0.1.16-py3-none-any.whl → hirundo-0.1.18-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hirundo/__init__.py CHANGED
@@ -1,11 +1,9 @@
 from .dataset_enum import (
     DatasetMetadataType,
     LabelingType,
+    StorageTypes,
 )
 from .dataset_optimization import (
-    COCO,
-    YOLO,
-    HirundoCSV,
     HirundoError,
     OptimizationDataset,
     RunArgs,
@@ -13,26 +11,40 @@ from .dataset_optimization import (
 )
 from .dataset_optimization_results import DatasetOptimizationResults
 from .git import GitPlainAuth, GitRepo, GitSSHAuth
+from .labeling import (
+    COCO,
+    YOLO,
+    HirundoCSV,
+    KeylabsAuth,
+    KeylabsObjDetImages,
+    KeylabsObjDetVideo,
+    KeylabsObjSegImages,
+    KeylabsObjSegVideo,
+)
 from .storage import (
     StorageConfig,
     StorageGCP,
     # StorageAzure, TODO: Azure storage is coming soon
     StorageGit,
     StorageS3,
-    StorageTypes,
 )
 from .unzip import load_df, load_from_zip
 
 __all__ = [
     "COCO",
     "YOLO",
-    "HirundoCSV",
     "HirundoError",
+    "HirundoCSV",
+    "KeylabsAuth",
+    "KeylabsObjDetImages",
+    "KeylabsObjDetVideo",
+    "KeylabsObjSegImages",
+    "KeylabsObjSegVideo",
     "OptimizationDataset",
     "RunArgs",
     "VisionRunArgs",
-    "LabelingType",
     "DatasetMetadataType",
+    "LabelingType",
     "GitPlainAuth",
     "GitRepo",
     "GitSSHAuth",
@@ -47,4 +59,4 @@ __all__ = [
     "load_from_zip",
 ]
 
-__version__ = "0.1.16"
+__version__ = "0.1.18"
hirundo/_constraints.py CHANGED
@@ -1,53 +1,164 @@
-from typing import Annotated
-
-from pydantic import StringConstraints, UrlConstraints
-from pydantic_core import Url
-
-S3BucketUrl = Annotated[
-    str,
-    StringConstraints(
-        min_length=8,
-        max_length=1023,
-        pattern=r"s3?://[a-z0-9.-]{3,64}[/]?",  # Only allow real S3 bucket URLs
-    ),
-]
-
-StorageConfigName = Annotated[
-    str,
-    StringConstraints(
-        min_length=1,
-        max_length=255,
-        pattern=r"^[a-zA-Z0-9-_]+$",
-    ),
-]
-
-S3_MIN_LENGTH = 8
-S3_MAX_LENGTH = 1023
-S3_PATTERN = r"s3://[a-zA-Z0-9.-]{3,64}/[a-zA-Z0-9.-/]+"
-GCP_MIN_LENGTH = 8
-GCP_MAX_LENGTH = 1023
-GCP_PATTERN = r"gs://[a-zA-Z0-9.-]{3,64}/[a-zA-Z0-9.-/]+"
-
-RepoUrl = Annotated[
-    Url,
-    UrlConstraints(
-        allowed_schemes=[
-            "ssh",
-            "https",
-            "http",
-        ]
-    ),
-]
-HirundoUrl = Annotated[
-    Url,
-    UrlConstraints(
-        allowed_schemes=[
-            "file",
-            "https",
-            "http",
-            "s3",
-            "gs",
-            "ssh",
-        ]
-    ),
-]
+import re
+import typing
+from typing import TYPE_CHECKING
+
+from hirundo._urls import (
+    LENGTH_CONSTRAINTS,
+    STORAGE_PATTERNS,
+)
+from hirundo.dataset_enum import DatasetMetadataType, LabelingType, StorageTypes
+from hirundo.labeling import COCO, YOLO, HirundoCSV, Keylabs
+
+if TYPE_CHECKING:
+    from hirundo._urls import HirundoUrl
+    from hirundo.dataset_optimization import LabelingInfo
+    from hirundo.storage import (
+        ResponseStorageConfig,
+        StorageConfig,
+        StorageGCP,
+        StorageGCPOut,
+        StorageS3,
+        StorageS3Out,
+    )
+
+LABELING_TYPES_TO_DATASET_METADATA_TYPES = {
+    LabelingType.SINGLE_LABEL_CLASSIFICATION: [
+        DatasetMetadataType.HIRUNDO_CSV,
+    ],
+    LabelingType.OBJECT_DETECTION: [
+        DatasetMetadataType.HIRUNDO_CSV,
+        DatasetMetadataType.COCO,
+        DatasetMetadataType.YOLO,
+        DatasetMetadataType.KeylabsObjDetImages,
+        DatasetMetadataType.KeylabsObjDetVideo,
+    ],
+    LabelingType.OBJECT_SEGMENTATION: [
+        DatasetMetadataType.HIRUNDO_CSV,
+        DatasetMetadataType.KeylabsObjSegImages,
+        DatasetMetadataType.KeylabsObjSegVideo,
+    ],
+    LabelingType.SEMANTIC_SEGMENTATION: [
+        DatasetMetadataType.HIRUNDO_CSV,
+    ],
+    LabelingType.PANOPTIC_SEGMENTATION: [
+        DatasetMetadataType.HIRUNDO_CSV,
+    ],
+    LabelingType.SPEECH_TO_TEXT: [
+        DatasetMetadataType.HIRUNDO_CSV,
+    ],
+}
+
+
+def validate_s3_url(str_url: str, s3_config: "StorageS3 | StorageS3Out"):
+    if (
+        len(str_url) < LENGTH_CONSTRAINTS[StorageTypes.S3]["min_length"]
+        or len(str_url) > LENGTH_CONSTRAINTS[StorageTypes.S3]["max_length"]
+    ):
+        raise ValueError("S3 URL must be between 8 and 1023 characters")
+    elif not re.match(STORAGE_PATTERNS[StorageTypes.S3], str_url):
+        raise ValueError(
+            f"Invalid S3 URL. Pattern must match: {STORAGE_PATTERNS[StorageTypes.S3]}"
+        )
+    elif not str_url.startswith(f"{s3_config.bucket_url}/"):
+        raise ValueError(f"S3 URL must start with {s3_config.bucket_url}/")
+
+
+def validate_gcp_url(str_url: str, gcp_config: "StorageGCP | StorageGCPOut"):
+    matches = re.match(STORAGE_PATTERNS[StorageTypes.GCP], str_url)
+    if (
+        len(str_url) < LENGTH_CONSTRAINTS[StorageTypes.GCP]["min_length"]
+        or len(str_url) > LENGTH_CONSTRAINTS[StorageTypes.GCP]["max_length"]
+    ):
+        raise ValueError(
+            f"GCP URL must be between {LENGTH_CONSTRAINTS[StorageTypes.GCP]['min_length']}"
+            + f" and {LENGTH_CONSTRAINTS[StorageTypes.GCP]['max_length']} characters"
+        )
+    elif not matches:
+        raise ValueError(
+            f"Invalid GCP URL. Pattern must match: {STORAGE_PATTERNS[StorageTypes.GCP]}"
+        )
+    elif (
+        matches
+        and len(matches.group(1))
+        > LENGTH_CONSTRAINTS[StorageTypes.GCP]["bucket_max_length"]
+    ):
+        raise ValueError(
+            f"GCP bucket name must be between {LENGTH_CONSTRAINTS[StorageTypes.GCP]['bucket_min_length']} "
+            + f"and {LENGTH_CONSTRAINTS[StorageTypes.GCP]['bucket_max_length']} characters"
+        )
+    elif not str_url.startswith(f"gs://{gcp_config.bucket_name}/"):
+        raise ValueError(f"GCP URL must start with gs://{gcp_config.bucket_name}")
+
+
+def validate_url(
+    url: "HirundoUrl",
+    storage_config: "StorageConfig | ResponseStorageConfig",
+) -> "HirundoUrl":
+    s3_config = storage_config.s3
+    gcp_config = storage_config.gcp
+    git_config = storage_config.git
+    str_url = str(url)
+
+    if s3_config is not None:
+        validate_s3_url(str_url, s3_config)
+    elif gcp_config is not None:
+        validate_gcp_url(str_url, gcp_config)
+    elif (
+        git_config is not None
+        and not str_url.startswith("https://")
+        and not str_url.startswith("ssh://")
+    ):
+        raise ValueError("Git URL must start with https:// or ssh://")
+    elif storage_config.type == StorageTypes.LOCAL and not str_url.startswith(
+        "file:///datasets/"
+    ):
+        raise ValueError("Local URL must start with file:///datasets/")
+    return url
+
+
+def validate_labeling_type(
+    labeling_type: "LabelingType", labeling_info: "LabelingInfo"
+) -> None:
+    """
+    Validate that the labeling type is compatible with the labeling info
+
+    Args:
+        labeling_type: The type of labeling that will be performed
+        labeling_info: The labeling info to validate
+    """
+    dataset_metadata_types = LABELING_TYPES_TO_DATASET_METADATA_TYPES[labeling_type]
+    if labeling_info.type not in dataset_metadata_types:
+        raise ValueError(
+            f"Cannot use {labeling_info.type.name} labeling info with {labeling_type.name} datasets"
+        )
+
+
+def validate_labeling_info(
+    labeling_type: "LabelingType",
+    labeling_info: "typing.Union[LabelingInfo, list[LabelingInfo]]",
+    storage_config: "typing.Union[StorageConfig, ResponseStorageConfig]",
+) -> None:
+    """
+    Validate the labeling info for a dataset
+
+    Args:
+        labeling_type: The type of labeling that will be performed
+        labeling_info: The labeling info to validate
+        storage_config: The storage configuration for the dataset.
+            StorageConfig is used to validate the URLs in the labeling info
+    """
+    if isinstance(labeling_info, list):
+        for labeling in labeling_info:
+            validate_labeling_info(labeling_type, labeling, storage_config)
+        return
+    elif isinstance(labeling_info, HirundoCSV):
+        validate_url(labeling_info.csv_url, storage_config)
+    elif isinstance(labeling_info, COCO):
+        validate_url(labeling_info.json_url, storage_config)
+    elif isinstance(labeling_info, YOLO):
+        validate_url(labeling_info.labels_dir_url, storage_config)
+        if labeling_info.data_yaml_url is not None:
+            validate_url(labeling_info.data_yaml_url, storage_config)
+    elif isinstance(labeling_info, Keylabs):
+        validate_url(labeling_info.labels_dir_url, storage_config)
+    validate_labeling_type(labeling_type, labeling_info)
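The compatibility table added here is easy to illustrate: a labeling format that is not listed for a given `LabelingType` is rejected with a `ValueError`. A small sketch using the internal helper (shown for illustration only; `_constraints` is a private module):

```python
# Sketch of the new labeling-type / metadata-type compatibility check.
from hirundo import COCO, LabelingType
from hirundo._constraints import validate_labeling_type  # internal helper

coco = COCO(json_url="gs://my-bucket-name/annotations/instances.json")  # placeholder URL

# COCO is listed for OBJECT_DETECTION, so this passes silently.
validate_labeling_type(LabelingType.OBJECT_DETECTION, coco)

# COCO is not listed for OBJECT_SEGMENTATION, so this raises ValueError.
try:
    validate_labeling_type(LabelingType.OBJECT_SEGMENTATION, coco)
except ValueError as err:
    print(err)
```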
hirundo/_urls.py ADDED
@@ -0,0 +1,59 @@
+from typing import Annotated
+
+from pydantic import StringConstraints, UrlConstraints
+from pydantic_core import Url
+
+from hirundo.dataset_enum import StorageTypes
+
+S3BucketUrl = Annotated[
+    str,
+    StringConstraints(
+        min_length=8,
+        max_length=1023,
+        pattern=r"s3?://[a-z0-9.-]{3,64}[/]?",  # Only allow real S3 bucket URLs
+    ),
+]
+
+StorageConfigName = Annotated[
+    str,
+    StringConstraints(
+        min_length=1,
+        max_length=255,
+        pattern=r"^[a-zA-Z0-9-_]+$",
+    ),
+]
+
+STORAGE_PATTERNS: dict[StorageTypes, str] = {
+    StorageTypes.S3: r"^s3:\/\/[a-z0-9\.\-]{3,63}/[a-zA-Z0-9!\-\/_\.\*'\(\)]+$",
+    StorageTypes.GCP: r"^gs:\/\/([a-z0-9][a-z0-9_-]{1,61}[a-z0-9](\.[a-z0-9][a-z0-9_-]{1,61}[a-z0-9])*)\/[^\x00-\x1F\x7F-\x9F\r\n]*$",
+}
+
+
+LENGTH_CONSTRAINTS: dict[StorageTypes, dict] = {
+    StorageTypes.S3: {"min_length": 8, "max_length": 1023, "bucket_max_length": None},
+    StorageTypes.GCP: {"min_length": 8, "max_length": 1023, "bucket_max_length": 222},
+}
+
+RepoUrl = Annotated[
+    Url,
+    UrlConstraints(
+        allowed_schemes=[
+            "ssh",
+            "https",
+            "http",
+        ]
+    ),
+]
+HirundoUrl = Annotated[
+    Url,
+    UrlConstraints(
+        allowed_schemes=[
+            "file",
+            "https",
+            "http",
+            "s3",
+            "gs",
+            "ssh",
+        ]
+    ),
+]
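The two `STORAGE_PATTERNS` regexes above are what the new S3/GCP validators match against. A quick standalone check of their behaviour (patterns copied from the module; the URLs are placeholders):

```python
import re

# Same patterns as STORAGE_PATTERNS in hirundo._urls, shown standalone.
S3_PATTERN = r"^s3:\/\/[a-z0-9\.\-]{3,63}/[a-zA-Z0-9!\-\/_\.\*'\(\)]+$"
GCP_PATTERN = r"^gs:\/\/([a-z0-9][a-z0-9_-]{1,61}[a-z0-9](\.[a-z0-9][a-z0-9_-]{1,61}[a-z0-9])*)\/[^\x00-\x1F\x7F-\x9F\r\n]*$"

print(bool(re.match(S3_PATTERN, "s3://my-bucket/my-folder/metadata.csv")))  # True
print(bool(re.match(S3_PATTERN, "s3://my-bucket")))  # False: an object key is required

match = re.match(GCP_PATTERN, "gs://my-bucket/my-folder/metadata.csv")
print(match.group(1) if match else None)  # "my-bucket": the captured bucket name
```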
hirundo/dataset_enum.py CHANGED
@@ -10,6 +10,9 @@ class LabelingType(str, Enum):
     SINGLE_LABEL_CLASSIFICATION = "SingleLabelClassification"
     OBJECT_DETECTION = "ObjectDetection"
     SPEECH_TO_TEXT = "SpeechToText"
+    OBJECT_SEGMENTATION = "ObjectSegmentation"
+    SEMANTIC_SEGMENTATION = "SemanticSegmentation"
+    PANOPTIC_SEGMENTATION = "PanopticSegmentation"
 
 
 class DatasetMetadataType(str, Enum):
@@ -21,3 +24,23 @@ class DatasetMetadataType(str, Enum):
     HIRUNDO_CSV = "HirundoCSV"
     COCO = "COCO"
     YOLO = "YOLO"
+    KeylabsObjDetImages = "KeylabsObjDetImages"
+    KeylabsObjDetVideo = "KeylabsObjDetVideo"
+    KeylabsObjSegImages = "KeylabsObjSegImages"
+    KeylabsObjSegVideo = "KeylabsObjSegVideo"
+
+
+class StorageTypes(str, Enum):
+    """
+    Enum for the different types of storage configs.
+    Supported types are:
+    """
+
+    S3 = "S3"
+    GCP = "GCP"
+    # AZURE = "Azure" TODO: Azure storage config is coming soon
+    GIT = "Git"
+    LOCAL = "Local"
+    """
+    Local storage config is only supported for on-premises installations.
+    """
hirundo/dataset_optimization.py CHANGED
@@ -1,7 +1,6 @@
 import datetime
 import json
 import typing
-from abc import ABC, abstractmethod
 from collections.abc import AsyncGenerator, Generator
 from enum import Enum
 from typing import overload
@@ -12,14 +11,16 @@ from pydantic import BaseModel, Field, model_validator
 from tqdm import tqdm
 from tqdm.contrib.logging import logging_redirect_tqdm
 
-from hirundo._constraints import HirundoUrl
+from hirundo._constraints import validate_labeling_info, validate_url
 from hirundo._env import API_HOST
 from hirundo._headers import get_headers
 from hirundo._http import raise_for_status_with_reason
 from hirundo._iter_sse_retrying import aiter_sse_retrying, iter_sse_retrying
 from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
+from hirundo._urls import HirundoUrl
 from hirundo.dataset_enum import DatasetMetadataType, LabelingType
 from hirundo.dataset_optimization_results import DatasetOptimizationResults
+from hirundo.labeling import YOLO, LabelingInfo
 from hirundo.logger import get_logger
 from hirundo.storage import ResponseStorageConfig, StorageConfig
 from hirundo.unzip import download_and_extract_zip
@@ -71,72 +72,6 @@ STATUS_TO_PROGRESS_MAP = {
 }
 
 
-class Metadata(BaseModel, ABC):
-    type: DatasetMetadataType
-
-    @property
-    @abstractmethod
-    def metadata_url(self) -> HirundoUrl:
-        raise NotImplementedError()
-
-
-class HirundoCSV(Metadata):
-    """
-    A dataset metadata file in the Hirundo CSV format
-    """
-
-    type: DatasetMetadataType = DatasetMetadataType.HIRUNDO_CSV
-    csv_url: HirundoUrl
-    """
-    The URL to access the dataset metadata CSV file.
-    e.g. `s3://my-bucket-name/my-folder/my-metadata.csv`, `gs://my-bucket-name/my-folder/my-metadata.csv`,
-    or `ssh://my-username@my-repo-name/my-folder/my-metadata.csv`
-    (or `file:///datasets/my-folder/my-metadata.csv` if using LOCAL storage type with on-premises installation)
-    """
-
-    @property
-    def metadata_url(self) -> HirundoUrl:
-        return self.csv_url
-
-
-class COCO(Metadata):
-    """
-    A dataset metadata file in the COCO format
-    """
-
-    type: DatasetMetadataType = DatasetMetadataType.COCO
-    json_url: HirundoUrl
-    """
-    The URL to access the dataset metadata JSON file.
-    e.g. `s3://my-bucket-name/my-folder/my-metadata.json`, `gs://my-bucket-name/my-folder/my-metadata.json`,
-    or `ssh://my-username@my-repo-name/my-folder/my-metadata.json`
-    (or `file:///datasets/my-folder/my-metadata.json` if using LOCAL storage type with on-premises installation)
-    """
-
-    @property
-    def metadata_url(self) -> HirundoUrl:
-        return self.json_url
-
-
-class YOLO(Metadata):
-    type: DatasetMetadataType = DatasetMetadataType.YOLO
-    data_yaml_url: typing.Optional[HirundoUrl] = None
-    labels_dir_url: HirundoUrl
-
-    @property
-    def metadata_url(self) -> HirundoUrl:
-        return self.labels_dir_url
-
-
-LabelingInfo = typing.Union[HirundoCSV, COCO, YOLO]
-"""
-The dataset labeling info. The dataset labeling info can be one of the following:
-- `DatasetMetadataType.HirundoCSV`: Indicates that the dataset metadata file is a CSV file with the Hirundo format
-
-Currently no other formats are supported. Future versions of `hirundo` may support additional formats.
-"""
-
-
 class VisionRunArgs(BaseModel):
     upsample: bool = False
     """
@@ -228,7 +163,7 @@ class OptimizationDataset(BaseModel):
     A full list of possible classes used in classification / object detection.
     It is currently required for clarity and performance.
     """
-    labeling_info: LabelingInfo
+    labeling_info: typing.Union[LabelingInfo, list[LabelingInfo]]
 
     augmentations: typing.Optional[list[AugmentationName]] = None
     """
@@ -267,16 +202,30 @@ class OptimizationDataset(BaseModel):
         ):
             raise ValueError("Language is only allowed for Speech-to-Text datasets.")
         if (
-            self.labeling_info.type == DatasetMetadataType.YOLO
+            not isinstance(self.labeling_info, list)
+            and self.labeling_info.type == DatasetMetadataType.YOLO
             and isinstance(self.labeling_info, YOLO)
             and (
                 self.labeling_info.data_yaml_url is not None
                 and self.classes is not None
            )
+        ) or (
+            isinstance(self.labeling_info, list)
+            and self.classes is not None
+            and any(
+                isinstance(info, YOLO) and info.data_yaml_url is not None
+                for info in self.labeling_info
+            )
         ):
             raise ValueError(
                 "Only one of `classes` or `labeling_info.data_yaml_url` should be provided for YOLO datasets"
             )
+        if self.storage_config:
+            validate_labeling_info(
+                self.labeling_type, self.labeling_info, self.storage_config
+            )
+        if self.data_root_url and self.storage_config:
+            validate_url(self.data_root_url, self.storage_config)
         return self
 
     @staticmethod
@@ -595,6 +544,17 @@ class OptimizationDataset(BaseModel):
         if not last_event or last_event["data"]["state"] == RunStatus.PENDING.value:
             OptimizationDataset._check_run_by_id(run_id, retry + 1)
 
+    @staticmethod
+    def _handle_failure(iteration: dict):
+        if iteration["result"]:
+            raise HirundoError(
+                f"Optimization run failed with error: {iteration['result']}"
+            )
+        else:
+            raise HirundoError(
+                "Optimization run failed with an unknown error in _handle_failure"
+            )
+
     @staticmethod
     @overload
     def check_run_by_id(
@@ -644,9 +604,11 @@ class OptimizationDataset(BaseModel):
                 RunStatus.REJECTED.value,
                 RunStatus.REVOKED.value,
             ]:
-                raise HirundoError(
-                    f"Optimization run failed with error: {iteration['result']}"
+                logger.error(
+                    "State is failure, rejected, or revoked: %s",
+                    iteration["state"],
                 )
+                OptimizationDataset._handle_failure(iteration)
             elif iteration["state"] == RunStatus.SUCCESS.value:
                 t.close()
                 zip_temporary_url = iteration["result"]
@@ -690,7 +652,9 @@ class OptimizationDataset(BaseModel):
                     t.n = current_progress_percentage
                    logger.debug("Setting progress to %s", t.n)
                    t.refresh()
-        raise HirundoError("Optimization run failed with an unknown error")
+        raise HirundoError(
+            "Optimization run failed with an unknown error in check_run_by_id"
+        )
 
     @overload
     def check_run(
@@ -790,8 +754,6 @@ class OptimizationDataset(BaseModel):
         Args:
             run_id: The ID of the run to cancel
         """
-        if not run_id:
-            raise ValueError("No run has been started")
         logger.info("Cancelling run with ID: %s", run_id)
         response = requests.delete(
             f"{API_HOST}/dataset-optimization/run/{run_id}",
@@ -808,6 +770,30 @@ class OptimizationDataset(BaseModel):
             raise ValueError("No run has been started")
         self.cancel_by_id(self.run_id)
 
+    @staticmethod
+    def archive_run_by_id(run_id: str) -> None:
+        """
+        Archive the dataset optimization run for the given `run_id`.
+
+        Args:
+            run_id: The ID of the run to archive
+        """
+        logger.info("Archiving run with ID: %s", run_id)
+        response = requests.patch(
+            f"{API_HOST}/dataset-optimization/run/archive/{run_id}",
+            headers=get_headers(),
+            timeout=MODIFY_TIMEOUT,
+        )
+        raise_for_status_with_reason(response)
+
+    def archive(self) -> None:
+        """
+        Archive the current active instance's run.
+        """
+        if not self.run_id:
+            raise ValueError("No run has been started")
+        self.archive_run_by_id(self.run_id)
+
 
 class DataOptimizationDatasetOut(BaseModel):
     id: int
@@ -820,7 +806,7 @@ class DataOptimizationDatasetOut(BaseModel):
     data_root_url: HirundoUrl
 
     classes: typing.Optional[list[str]] = None
-    labeling_info: LabelingInfo
+    labeling_info: typing.Union[LabelingInfo, list[LabelingInfo]]
 
     organization_id: typing.Optional[int]
     creator_id: typing.Optional[int]
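Beyond the new validation hooks, the user-visible additions here are list-valued `labeling_info` and the archive methods, which mirror the existing cancel API. A hedged sketch of how they might be called against a running Hirundo server (the run ID is a placeholder):

```python
from hirundo import OptimizationDataset

# Archive a finished run directly by ID (static method, like cancel_by_id).
# Requires API credentials and network access to the Hirundo server.
OptimizationDataset.archive_run_by_id("00000000-0000-0000-0000-000000000000")

# Or via an instance that has already started a run:
# dataset.run_optimization()
# dataset.archive()  # raises ValueError if no run has been started
```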
hirundo/git.py CHANGED
@@ -7,11 +7,11 @@ import requests
 from pydantic import BaseModel, field_validator
 from pydantic_core import Url
 
-from hirundo._constraints import RepoUrl
 from hirundo._env import API_HOST
 from hirundo._headers import get_headers
 from hirundo._http import raise_for_status_with_reason
 from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
+from hirundo._urls import RepoUrl
 from hirundo.logger import get_logger
 
 logger = get_logger(__name__)
hirundo/labeling.py ADDED
@@ -0,0 +1,140 @@
+import typing
+from abc import ABC
+
+from pydantic import BaseModel, Field
+
+from hirundo.dataset_enum import DatasetMetadataType
+
+if typing.TYPE_CHECKING:
+    from hirundo._urls import HirundoUrl
+
+
+class Metadata(BaseModel, ABC, frozen=True):
+    type: DatasetMetadataType
+
+
+class HirundoCSV(Metadata, frozen=True):
+    """
+    A dataset metadata file in the Hirundo CSV format
+    """
+
+    type: typing.Literal[DatasetMetadataType.HIRUNDO_CSV] = (
+        DatasetMetadataType.HIRUNDO_CSV
+    )
+    csv_url: "HirundoUrl"
+    """
+    The URL to access the dataset metadata CSV file.
+    e.g. `s3://my-bucket-name/my-folder/my-metadata.csv`, `gs://my-bucket-name/my-folder/my-metadata.csv`,
+    or `ssh://my-username@my-repo-name/my-folder/my-metadata.csv`
+    (or `file:///datasets/my-folder/my-metadata.csv` if using LOCAL storage type with on-premises installation)
+    """
+
+
+class COCO(Metadata, frozen=True):
+    """
+    A dataset metadata file in the COCO format
+    """
+
+    type: typing.Literal[DatasetMetadataType.COCO] = DatasetMetadataType.COCO
+    json_url: "HirundoUrl"
+    """
+    The URL to access the dataset metadata JSON file.
+    e.g. `s3://my-bucket-name/my-folder/my-metadata.json`, `gs://my-bucket-name/my-folder/my-metadata.json`,
+    or `ssh://my-username@my-repo-name/my-folder/my-metadata.json`
+    (or `file:///datasets/my-folder/my-metadata.json` if using LOCAL storage type with on-premises installation)
+    """
+
+
+class YOLO(Metadata, frozen=True):
+    type: typing.Literal[DatasetMetadataType.YOLO] = DatasetMetadataType.YOLO
+    data_yaml_url: "typing.Optional[HirundoUrl]" = None
+    labels_dir_url: "HirundoUrl"
+
+
+class KeylabsAuth(BaseModel):
+    username: str
+    password: str
+    instance: str
+
+
+class Keylabs(Metadata, frozen=True):
+    project_id: str
+    """
+    Keylabs project ID.
+    """
+
+    labels_dir_url: "HirundoUrl"
+    """
+    URL to the directory containing the Keylabs labels.
+    """
+
+    with_attributes: bool = True
+    """
+    Whether to include attributes in the class name.
+    """
+
+    project_name: typing.Optional[str] = None
+    """
+    Keylabs project name (optional; added to output CSV if provided).
+    """
+    keylabs_auth: typing.Optional[KeylabsAuth] = None
+    """
+    Keylabs authentication credentials (optional; if provided, used to provide links to each sample).
+    """
+
+
+class KeylabsObjDetImages(Keylabs, frozen=True):
+    type: typing.Literal[DatasetMetadataType.KeylabsObjDetImages] = (
+        DatasetMetadataType.KeylabsObjDetImages
+    )
+
+
+class KeylabsObjDetVideo(Keylabs, frozen=True):
+    type: typing.Literal[DatasetMetadataType.KeylabsObjDetVideo] = (
+        DatasetMetadataType.KeylabsObjDetVideo
+    )
+
+
+class KeylabsObjSegImages(Keylabs, frozen=True):
+    type: typing.Literal[DatasetMetadataType.KeylabsObjSegImages] = (
+        DatasetMetadataType.KeylabsObjSegImages
+    )
+
+
+class KeylabsObjSegVideo(Keylabs, frozen=True):
+    type: typing.Literal[DatasetMetadataType.KeylabsObjSegVideo] = (
+        DatasetMetadataType.KeylabsObjSegVideo
+    )
+
+
+KeylabsInfo = typing.Union[
+    KeylabsObjDetImages, KeylabsObjDetVideo, KeylabsObjSegImages, KeylabsObjSegVideo
+]
+"""
+The dataset labeling info for Keylabs. The dataset labeling info can be one of the following:
+- `DatasetMetadataType.KeylabsObjDetImages`: Indicates that the dataset metadata file is in the Keylabs object detection image format
+- `DatasetMetadataType.KeylabsObjDetVideo`: Indicates that the dataset metadata file is in the Keylabs object detection video format
+- `DatasetMetadataType.KeylabsObjSegImages`: Indicates that the dataset metadata file is in the Keylabs object segmentation image format
+- `DatasetMetadataType.KeylabsObjSegVideo`: Indicates that the dataset metadata file is in the Keylabs object segmentation video format
+"""
+LabelingInfo = typing.Annotated[
+    typing.Union[
+        HirundoCSV,
+        COCO,
+        YOLO,
+        KeylabsInfo,
+    ],
+    Field(discriminator="type"),
+]
+"""
+The dataset labeling info. The dataset labeling info can be one of the following:
+- `DatasetMetadataType.HirundoCSV`: Indicates that the dataset metadata file is a CSV file with the Hirundo format
+- `DatasetMetadataType.COCO`: Indicates that the dataset metadata file is a JSON file with the COCO format
+- `DatasetMetadataType.YOLO`: Indicates that the dataset metadata file is in the YOLO format
+- `DatasetMetadataType.KeylabsObjDetImages`: Indicates that the dataset metadata file is in the Keylabs object detection image format
+- `DatasetMetadataType.KeylabsObjDetVideo`: Indicates that the dataset metadata file is in the Keylabs object detection video format
+- `DatasetMetadataType.KeylabsObjSegImages`: Indicates that the dataset metadata file is in the Keylabs object segmentation image format
+- `DatasetMetadataType.KeylabsObjSegVideo`: Indicates that the dataset metadata file is in the Keylabs object segmentation video format
+
+Currently no other formats are supported. Future versions of `hirundo` may support additional formats.
+"""
hirundo/storage.py CHANGED
@@ -1,5 +1,4 @@
 import typing
-from enum import Enum
 from pathlib import Path
 
 import pydantic
@@ -7,11 +6,12 @@ import requests
 from pydantic import BaseModel, model_validator
 from pydantic_core import Url
 
-from hirundo._constraints import S3BucketUrl, StorageConfigName
 from hirundo._env import API_HOST
 from hirundo._headers import get_headers
 from hirundo._http import raise_for_status_with_reason
 from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
+from hirundo._urls import S3BucketUrl, StorageConfigName
+from hirundo.dataset_enum import StorageTypes
 from hirundo.git import GitRepo, GitRepoOut
 from hirundo.logger import get_logger
 
@@ -34,11 +34,11 @@ class StorageS3Base(BaseModel):
         Chains the bucket URL with the path, ensuring that the path is formatted correctly
 
         Args:
-            path: The path to the file in the S3 bucket, e.g. `my-file.txt` or `/my-folder/my-file.txt`
+            path: The path to the file in the S3 bucket, e.g. :file:`my-file.txt` or :file:`/my-folder/my-file.txt`
 
         Returns:
-            The full URL to the file in the S3 bucket, e.g. `s3://my-bucket/my-file.txt` or `s3://my-bucket/my-folder/my-file.txt`,
-            where `s3://my-bucket` is the bucket URL provided in the S3 storage config
+            The full URL to the file in the S3 bucket, e.g. :file:`s3://my-bucket/my-file.txt` or :file:`s3://my-bucket/my-folder/my-file.txt`,
+            where :file:`s3://my-bucket` is the bucket URL provided in the S3 storage config
         """
         return Url(
             f"{S3_PREFIX}{self.bucket_url.removeprefix(S3_PREFIX).removesuffix('/')}/{str(path).removeprefix('/')}"
@@ -64,11 +64,11 @@ class StorageGCPBase(BaseModel):
         Chains the bucket URL with the path, ensuring that the path is formatted correctly
 
         Args:
-            path: The path to the file in the GCP bucket, e.g. `my-file.txt` or `/my-folder/my-file.txt`
+            path: The path to the file in the GCP bucket, e.g. :file:`my-file.txt` or :file:`/my-folder/my-file.txt`
 
         Returns:
-            The full URL to the file in the GCP bucket, e.g. `gs://my-bucket/my-file.txt` or `gs://my-bucket/my-folder/my-file.txt`,
-            where `my-bucket` is the bucket name provided in the GCP storage config
+            The full URL to the file in the GCP bucket, e.g. :file:`gs://my-bucket/my-file.txt` or :file:`gs://my-bucket/my-folder/my-file.txt`,
+            where :file:`my-bucket` is the bucket name provided in the GCP storage config
         """
         return Url(f"gs://{self.bucket_name}/{str(path).removeprefix('/')}")
 
@@ -94,7 +94,7 @@ class StorageGCPOut(StorageGCPBase):
     # Chains the container URL with the path, ensuring that the path is formatted correctly
 
     # Args:
-    #     path: The path to the file in the Azure container, e.g. `my-file.txt` or `/my-folder/my-file.txt`
+    #     path: The path to the file in the Azure container, e.g. :file:`my-file.txt` or :file:`/my-folder/my-file.txt`
 
     # Returns:
     #     The full URL to the file in the Azure container
@@ -114,11 +114,11 @@ def get_git_repo_url(
     Chains the repository URL with the path, ensuring that the path is formatted correctly
 
     Args:
-        repo_url: The URL of the git repository, e.g. `https://my-git-repository.com`
-        path: The path to the file in the git repository, e.g. `my-file.txt` or `/my-folder/my-file.txt`
+        repo_url: The URL of the git repository, e.g. :file:`https://my-git-repository.com`
+        path: The path to the file in the git repository, e.g. :file:`my-file.txt` or :file:`/my-folder/my-file.txt`
 
     Returns:
-        The full URL to the file in the git repository, e.g. `https://my-git-repository.com/my-file.txt` or `https://my-git-repository.com/my-folder/my-file.txt`
+        The full URL to the file in the git repository, e.g. :file:`https://my-git-repository.com/my-file.txt` or :file:`https://my-git-repository.com/my-folder/my-file.txt`
     """
     if not isinstance(repo_url, Url):
         repo_url = Url(repo_url)
@@ -131,12 +131,12 @@ class StorageGit(BaseModel):
     repo_id: typing.Optional[int] = None
     """
     The ID of the Git repository in the Hirundo system.
-    Either `repo_id` or `repo` must be provided.
+    Either :code:`repo_id` or :code:`repo` must be provided.
     """
     repo: typing.Optional[GitRepo] = None
     """
     The Git repository to link to.
-    Either `repo_id` or `repo` must be provided.
+    Either :code:`repo_id` or :code:`repo` must be provided.
     """
     branch: str
     """
@@ -156,11 +156,11 @@ class StorageGit(BaseModel):
         Chains the repository URL with the path, ensuring that the path is formatted correctly
 
         Args:
-            path: The path to the file in the git repository, e.g. `my-file.txt` or `/my-folder/my-file.txt`
+            path: The path to the file in the git repository, e.g. :file:`my-file.txt` or :file:`/my-folder/my-file.txt`
 
         Returns:
-            The full URL to the file in the git repository, e.g. `https://my-git-repository.com/my-file.txt` or `https://my-git-repository.com/my-folder/my-file.txt`,
-            where `https://my-git-repository.com` is the repository URL provided in the git storage config's git repo
+            The full URL to the file in the git repository, e.g. :file:`https://my-git-repository.com/my-file.txt` or :file:`https://my-git-repository.com/my-folder/my-file.txt`,
+            where :file:`https://my-git-repository.com` is the repository URL provided in the git storage config's git repo
         """
         if not self.repo:
             raise ValueError("Repo must be provided to use `get_url`")
@@ -179,47 +179,31 @@ class StorageGitOut(BaseModel):
         Chains the repository URL with the path, ensuring that the path is formatted correctly
 
         Args:
-            path: The path to the file in the git repository, e.g. `my-file.txt` or `/my-folder/my-file.txt`
+            path: The path to the file in the git repository, e.g. :file:`my-file.txt` or :file:`/my-folder/my-file.txt`
 
         Returns:
-            The full URL to the file in the git repository, e.g. `https://my-git-repository.com/my-file.txt` or `https://my-git-repository.com/my-folder/my-file.txt`,
-            where `https://my-git-repository.com` is the repository URL provided in the git storage config's git repo
+            The full URL to the file in the git repository, e.g. :file:`https://my-git-repository.com/my-file.txt` or :file:`https://my-git-repository.com/my-folder/my-file.txt`,
+            where :file:`https://my-git-repository.com` is the repository URL provided in the git storage config's git repo
         """
         repo_url = self.repo.repository_url
         return get_git_repo_url(repo_url, path)
 
 
-class StorageTypes(str, Enum):
-    """
-    Enum for the different types of storage configs.
-    Supported types are:
-    """
-
-    S3 = "S3"
-    GCP = "GCP"
-    # AZURE = "Azure" TODO: Azure storage config is coming soon
-    GIT = "Git"
-    LOCAL = "Local"
-    """
-    Local storage config is only supported for on-premises installations.
-    """
-
-
 class StorageConfig(BaseModel):
     id: typing.Optional[int] = None
     """
-    The ID of the `StorageConfig` in the Hirundo system.
+    The ID of the :code:`StorageConfig` in the Hirundo system.
     """
 
     organization_id: typing.Optional[int] = None
     """
-    The ID of the organization that the `StorageConfig` belongs to.
+    The ID of the organization that the :code:`StorageConfig` belongs to.
     If not provided, it will be assigned to your default organization.
     """
 
     name: StorageConfigName
     """
-    A name to identify the `StorageConfig` in the Hirundo system.
+    A name to identify the :code:`StorageConfig` in the Hirundo system.
     """
     type: typing.Optional[StorageTypes] = pydantic.Field(
         examples=[
@@ -230,12 +214,12 @@ class StorageConfig(BaseModel):
         ]
     )
     """
-    The type of the `StorageConfig`.
+    The type of the :code:`StorageConfig`.
     Supported types are:
-    - `S3`
-    - `GCP`
-    - `Azure` (coming soon)
-    - `Git`
+    - :code:`S3`
+    - :code:`GCP`
+    - :code:`Azure` (coming soon)
+    - :code:`Git`
     """
     s3: typing.Optional[StorageS3] = pydantic.Field(
         default=None,
@@ -323,10 +307,10 @@ class StorageConfig(BaseModel):
     @staticmethod
     def get_by_id(storage_config_id: int) -> "ResponseStorageConfig":
         """
-        Retrieves a `StorageConfig` instance from the server by its ID
+        Retrieves a :code:`StorageConfig` instance from the server by its ID
 
         Args:
-            storage_config_id: The ID of the `StorageConfig` to retrieve
+            storage_config_id: The ID of the :code:`StorageConfig` to retrieve
         """
         storage_config = requests.get(
             f"{API_HOST}/storage-config/{storage_config_id}",
@@ -339,11 +323,11 @@ class StorageConfig(BaseModel):
     @staticmethod
     def get_by_name(name: str, storage_type: StorageTypes) -> "ResponseStorageConfig":
         """
-        Retrieves a `StorageConfig` instance from the server by its name
+        Retrieves a :code:`StorageConfig` instance from the server by its name
 
         Args:
-            name: The name of the `StorageConfig` to retrieve
-            storage_type: The type of the `StorageConfig` to retrieve
+            name: The name of the :code:`StorageConfig` to retrieve
+            storage_type: The type of the :code:`StorageConfig` to retrieve
 
         Note: The type is required because the name is not unique across different storage types
         """
@@ -360,12 +344,12 @@ class StorageConfig(BaseModel):
         organization_id: typing.Optional[int] = None,
     ) -> list["ResponseStorageConfig"]:
         """
-        Lists all the `StorageConfig`'s created by user's default organization
-        Note: The return type is `list[dict]` and not `list[StorageConfig]`
+        Lists all the :code:`StorageConfig`'s created by user's default organization
+        Note: The return type is :code:`list[dict]` and not :code:`list[StorageConfig]`
 
         Args:
-            organization_id: The ID of the organization to list `StorageConfig`'s for.
-                If not provided, it will list `StorageConfig`'s for the default organization.
+            organization_id: The ID of the organization to list :code:`StorageConfig`'s for.
+                If not provided, it will list :code:`StorageConfig`'s for the default organization.
         """
         storage_configs = requests.get(
             f"{API_HOST}/storage-config/",
@@ -379,10 +363,10 @@ class StorageConfig(BaseModel):
     @staticmethod
     def delete_by_id(storage_config_id) -> None:
         """
-        Deletes a `StorageConfig` instance from the server by its ID
+        Deletes a :code:`StorageConfig` instance from the server by its ID
 
         Args:
-            storage_config_id: The ID of the `StorageConfig` to delete
+            storage_config_id: The ID of the :code:`StorageConfig` to delete
         """
         storage_config = requests.delete(
             f"{API_HOST}/storage-config/{storage_config_id}",
@@ -394,7 +378,7 @@ class StorageConfig(BaseModel):
 
     def delete(self) -> None:
         """
-        Deletes the `StorageConfig` instance from the server
+        Deletes the :code:`StorageConfig` instance from the server
         """
         if not self.id:
             raise ValueError("No StorageConfig has been created")
@@ -402,10 +386,10 @@ class StorageConfig(BaseModel):
 
     def create(self, replace_if_exists: bool = False) -> int:
         """
-        Create a `StorageConfig` instance on the server
+        Create a :code:`StorageConfig` instance on the server
 
         Args:
-            replace_if_exists: If a `StorageConfig` with the same name and type already exists, replace it.
+            replace_if_exists: If a :code:`StorageConfig` with the same name and type already exists, replace it.
         """
         if self.git and self.git.repo:
            self.git.repo_id = self.git.repo.create(replace_if_exists=replace_if_exists)
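Since `StorageTypes` now comes from `hirundo.dataset_enum` (still re-exported at the package root), lookups that take a storage type keep working unchanged. A short sketch against a running Hirundo server (the config name is a placeholder):

```python
from hirundo import StorageConfig, StorageTypes

# Fetch an existing storage config by name; the type disambiguates configs
# that share a name across storage backends. Requires API credentials.
storage_config = StorageConfig.get_by_name("my-s3-config", StorageTypes.S3)
print(storage_config)
```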
hirundo-0.1.18.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hirundo
-Version: 0.1.16
+Version: 0.1.18
 Summary: This package is used to interface with Hirundo's platform. It provides a simple API to optimize your ML datasets.
 Author-email: Hirundo <dev@hirundo.io>
 License: MIT License
@@ -55,7 +55,7 @@ Requires-Dist: platformdirs>=4.3.6; extra == "dev"
 Requires-Dist: safety>=3.2.13; extra == "dev"
 Provides-Extra: docs
 Requires-Dist: sphinx>=7.4.7; extra == "docs"
-Requires-Dist: sphinx-autobuild>=2024.4.16; extra == "docs"
+Requires-Dist: sphinx-autobuild>=2024.9.3; extra == "docs"
 Requires-Dist: sphinx-click>=5.0.1; extra == "docs"
 Requires-Dist: autodoc_pydantic>=2.2.0; extra == "docs"
 Requires-Dist: furo; extra == "docs"
@@ -64,7 +64,7 @@ Requires-Dist: esbonio; extra == "docs"
 Requires-Dist: starlette>0.40.0; extra == "docs"
 Requires-Dist: markupsafe>=3.0.2; extra == "docs"
 Provides-Extra: pandas
-Requires-Dist: pandas>=2.2.2; extra == "pandas"
+Requires-Dist: pandas>=2.2.3; extra == "pandas"
 Provides-Extra: polars
 Requires-Dist: polars>=1.0.0; extra == "polars"
 Dynamic: license-file
@@ -75,40 +75,62 @@ This package exposes access to Hirundo APIs for dataset optimization for Machine
 
 Dataset optimization is currently available for datasets labelled for classification and object detection.
 
-
 Support dataset storage configs include:
-- Google Cloud (GCP) Storage
-- Amazon Web Services (AWS) S3
-- Git LFS (Large File Storage) repositories (e.g. GitHub or HuggingFace)
+
+- Google Cloud (GCP) Storage
+- Amazon Web Services (AWS) S3
+- Git LFS (Large File Storage) repositories (e.g. GitHub or HuggingFace)
+
+Note: This Python package must be used alongside a Hirundo server, either the SaaS platform, a custom VPC deployment or an on-premises installation.
 
 Optimizing a classification dataset
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-Currently ``hirundo`` requires a CSV file with the following columns (all columns are required):
-- ``image_path``: The location of the image within the dataset ``root``
-- ``label``: The label of the image, i.e. which the class that was annotated for this image
+Currently `hirundo` requires a CSV file with the following columns (all columns are required):
+
+- `image_path`: The location of the image within the dataset `data_root_url`
+- `class_name`: The semantic label, i.e. the class name of the class that the image was annotated as belonging to
+
+And outputs two Pandas DataFrames with the dataset columns as well as:
+
+Suspect DataFrame (filename: `mislabel_suspects.csv`) columns:
 
-And outputs a CSV with the same columns and:
-- ``suspect_level``: mislabel suspect level
-- ``suggested_label``: suggested label
-- ``suggested_label_conf``: suggested label confidence
+- ``suspect_score``: mislabel suspect score
+- ``suspect_level``: mislabel suspect level
+- ``suspect_rank``: mislabel suspect ranking
+- ``suggested_class_name``: suggested semantic label
+- ``suggested_class_conf``: suggested semantic label confidence
+
+Errors and warnings DataFrame (filename: `invalid_data.csv`) columns:
+
+- ``status``: status message (one of ``NO_LABELS`` / ``MISSING_IMAGE`` / ``INVALID_IMAGE``)
 
 Optimizing an object detection (OD) dataset
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Currently ``hirundo`` requires a CSV file with the following columns (all columns are required):
-- ``image_path``: The location of the image within the dataset ``root``
-- ``bbox_id``: The index of the bounding box within the dataset. Used to indicate label suspects
-- ``label``: The label of the image, i.e. which the class that was annotated for this image
-- ``x1``, ``y1``, ``x2``, ``y2``: The bounding box coordinates of the object within the image
 
-And outputs a CSV with the same columns and:
-- ``suspect_level``: object mislabel suspect level
-- ``suggested_label``: suggested object label
-- ``suggested_label_conf``: suggested object label confidence
+- ``image_path``: The location of the image within the dataset ``data_root_url``
+- ``object_id``: The ID of the bounding box within the dataset. Used to indicate object suspects
+- ``class_name``: Object semantic label, i.e. the class name of the object that was annotated
+- ``xmin``: leftmost horizontal pixel coordinate of the object's bounding box
+- ``ymin``: uppermost vertical pixel coordinate of the object's bounding box
+- ``xmax``: rightmost horizontal pixel coordinate of the object's bounding box
+- ``ymax``: lowermost vertical pixel coordinate of the object's bounding box
 
-Note: This Python package must be used alongside a Hirundo server, either the SaaS platform, a custom VPC deployment or an on-premises installation.
 
+And outputs two Pandas DataFrames with the dataset columns as well as:
+
+Suspect DataFrame (filename: `mislabel_suspects.csv`) columns:
+
+- ``suspect_score``: object mislabel suspect score
+- ``suspect_level``: object mislabel suspect level
+- ``suspect_rank``: object mislabel suspect ranking
+- ``suggested_class_name``: suggested object semantic label
+- ``suggested_class_conf``: suggested object semantic label confidence
+
+Errors and warnings DataFrame (filename: `invalid_data.csv`) columns:
+- ``status``: status message (one of ``NO_LABELS`` / ``MISSING_IMAGE`` / ``INVALID_IMAGE`` / ``INVALID_BBOX`` / ``INVALID_BBOX_SIZE``)
 
 ## Installation
 
@@ -117,6 +139,7 @@ You can install the codebase with a simple `pip install hirundo` to install the
 ## Usage
 
 Classification example:
+
 ```python
 from hirundo import (
     HirundoCSV,
@@ -152,7 +175,6 @@ results = test_dataset.check_run()
 print(results)
 ```
 
-
 Object detection example:
 
 ```python
@@ -187,21 +209,6 @@ test_dataset = OptimizationDataset(
             path="/BDD100K Val from Hirundo.zip/bdd100k/bdd100k.csv"
         ),
     ),
-    classes=[
-        "traffic light",
-        "traffic sign",
-        "car",
-        "pedestrian",
-        "bus",
-        "truck",
-        "rider",
-        "bicycle",
-        "motorcycle",
-        "train",
-        "other vehicle",
-        "other person",
-        "trailer",
-    ],
 )
 
 test_dataset.run_optimization()
@@ -209,8 +216,8 @@ results = test_dataset.check_run()
 print(results)
 ```
 
-Note: Currently we only support the main CPython release 3.9, 3.10 and 3.11. PyPy support may be introduced in the future.
+Note: Currently we only support the main CPython release 3.9, 3.10, 3.11, 3.12 & 3.13. PyPy support may be introduced in the future.
 
 ## Further documentation
 
-To learn more about how to use this library, please visit the [http://docs.hirundo.io/](documentation) or see the Google Colab examples.
+To learn more about how to use this library, please visit the [http://docs.hirundo.io/](documentation) or see the [Google Colab examples](https://github.com/Hirundo-io/hirundo-client/tree/main/notebooks).
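For reference, the reworked classification columns described in the README amount to a metadata CSV with `image_path` and `class_name` columns. A tiny sketch that writes one (paths and class names are made up):

```python
import csv

# Minimal illustration of the classification metadata layout: one row per image,
# with image_path relative to data_root_url and class_name holding the label.
rows = [
    {"image_path": "train/img_0001.jpg", "class_name": "cat"},
    {"image_path": "train/img_0002.jpg", "class_name": "dog"},
]
with open("my-metadata.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["image_path", "class_name"])
    writer.writeheader()
    writer.writerows(rows)
```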
hirundo-0.1.18.dist-info/RECORD ADDED
@@ -0,0 +1,25 @@
+hirundo/__init__.py,sha256=1Uy9UZhaZPQQSMfAOJ0A_Of70tM8_MDq-HHdhrmpO6g,1301
+hirundo/__main__.py,sha256=wcCrL4PjG51r5wVKqJhcoJPTLfHW0wNbD31DrUN0MWI,28
+hirundo/_constraints.py,sha256=tgJfvp7ydyXilT8ViNk837rNRlpOVXLLeCSMt_YUUYA,6013
+hirundo/_dataframe.py,sha256=sXEEbCNcLi83wyU9ii884YikCzfASo_3nnrDxhuCv7U,758
+hirundo/_env.py,sha256=efX2sjvYlHkFr2Lcstelei67YSTFpVGT0l08ZsfiMuE,622
+hirundo/_headers.py,sha256=3hybpD_X4SODv3cFZPt9AjGY2vvZaag5OKT3z1SHSjA,521
+hirundo/_http.py,sha256=izlnuxStyPugjTAbD8Lo30tA4lZJ5d3kOENNduqrbX4,573
+hirundo/_iter_sse_retrying.py,sha256=U331_wZRIbVzi-jnMqo8bp9jBC8MtFBLEs-X0ZvhSDw,4634
+hirundo/_timeouts.py,sha256=gE58NU0t2e4KgKq2sk5rZcezDJAkgvRIbM5AVYFY6Ho,86
+hirundo/_urls.py,sha256=0C85EbL0T-Bj25vJwjNs_obUG8ROSADpmbFdTAyhzlw,1375
+hirundo/cli.py,sha256=5Tn0eXZGG92BR9HJYUaYozjFbS1t6UTw_I2R0tZBE04,7824
+hirundo/dataset_enum.py,sha256=QnS3fy1OF4wvUtiIAHubKRhc611idS8huopEEolgqEM,1217
+hirundo/dataset_optimization.py,sha256=fXi8MeI0PWwSyc5NuOzCrkgXT_mz24NV-dGOHDPkBR0,31256
+hirundo/dataset_optimization_results.py,sha256=A9YyF5zaZXVtzeDE08I_05v90dhZQADpSjDcS_6eLMc,1129
+hirundo/git.py,sha256=8LVnF4WCjZsxMHoRaVxbLiDAKpGCBEwlcZp7a30n9Zo,6573
+hirundo/labeling.py,sha256=zXQCaqfdaLIG4qbzFGbb94L3FDdRMpdzHwbrDJE07Yk,5006
+hirundo/logger.py,sha256=MUqrYp0fBlxWFhGl6P5t19_uqO7T_PNhrLN5bqY3i7s,275
+hirundo/storage.py,sha256=y7cr_dngkfZq0gKnwWxrSqUXb1SycGpwFRVmS9Cn3h8,15942
+hirundo/unzip.py,sha256=XJqvt2m5pWR-G-fnzgW75VOdd-K4_Rw2r4wiEhZgKZA,8245
+hirundo-0.1.18.dist-info/licenses/LICENSE,sha256=fusGGjqT2RGlU6kbkaOk7d-gDnsjk17wq67AO0mwBZI,1065
+hirundo-0.1.18.dist-info/METADATA,sha256=F_F0-EfUxVVCcgFue_hwCtxfIfmqBlwnpvzELuhMkAc,9302
+hirundo-0.1.18.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+hirundo-0.1.18.dist-info/entry_points.txt,sha256=4ZtnA_Nl1Af8fLnHp3lwjbGDEGU1S6ujb_JwtuQ7ZPM,44
+hirundo-0.1.18.dist-info/top_level.txt,sha256=cmyNqrNZOAYxnywJGFI1AJBLe4SkH8HGsfFx6ncdrbI,8
+hirundo-0.1.18.dist-info/RECORD,,
hirundo-0.1.18.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (79.0.1)
+Generator: setuptools (80.9.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
 
hirundo-0.1.16.dist-info/RECORD REMOVED
@@ -1,23 +0,0 @@
-hirundo/__init__.py,sha256=qKC89bNReZSjGtmf7l3PZD2JoptyVphpsD0Kf2PNXvY,1035
-hirundo/__main__.py,sha256=wcCrL4PjG51r5wVKqJhcoJPTLfHW0wNbD31DrUN0MWI,28
-hirundo/_constraints.py,sha256=gRv7fXwtjPGqYWIhkVYxu1B__3PdlYRqFyDkTpa9f74,1032
-hirundo/_dataframe.py,sha256=sXEEbCNcLi83wyU9ii884YikCzfASo_3nnrDxhuCv7U,758
-hirundo/_env.py,sha256=efX2sjvYlHkFr2Lcstelei67YSTFpVGT0l08ZsfiMuE,622
-hirundo/_headers.py,sha256=3hybpD_X4SODv3cFZPt9AjGY2vvZaag5OKT3z1SHSjA,521
-hirundo/_http.py,sha256=izlnuxStyPugjTAbD8Lo30tA4lZJ5d3kOENNduqrbX4,573
-hirundo/_iter_sse_retrying.py,sha256=U331_wZRIbVzi-jnMqo8bp9jBC8MtFBLEs-X0ZvhSDw,4634
-hirundo/_timeouts.py,sha256=gE58NU0t2e4KgKq2sk5rZcezDJAkgvRIbM5AVYFY6Ho,86
-hirundo/cli.py,sha256=5Tn0eXZGG92BR9HJYUaYozjFbS1t6UTw_I2R0tZBE04,7824
-hirundo/dataset_enum.py,sha256=ZEYBP-lrlVqfNWptlmw7JgLNhCyDirtWWPtoMvtg2AE,531
-hirundo/dataset_optimization.py,sha256=jR4ZOlKKl05jrA4cq9L1IQuKVPJ3ytXkhOJEg6efFqI,31390
-hirundo/dataset_optimization_results.py,sha256=A9YyF5zaZXVtzeDE08I_05v90dhZQADpSjDcS_6eLMc,1129
-hirundo/git.py,sha256=6h1hFPlw5FfYMGWXPCitnTqGICmBKmQtb5qKGe3Icmk,6580
-hirundo/logger.py,sha256=MUqrYp0fBlxWFhGl6P5t19_uqO7T_PNhrLN5bqY3i7s,275
-hirundo/storage.py,sha256=kO-LWlQAM3qTnALEl8s79AiFMYqCG9Sem4MIFQcyvAg,15950
-hirundo/unzip.py,sha256=XJqvt2m5pWR-G-fnzgW75VOdd-K4_Rw2r4wiEhZgKZA,8245
-hirundo-0.1.16.dist-info/licenses/LICENSE,sha256=fusGGjqT2RGlU6kbkaOk7d-gDnsjk17wq67AO0mwBZI,1065
-hirundo-0.1.16.dist-info/METADATA,sha256=CxdCbzafRuVRf1BGsS_tgjodO0g745uuNBl7y4UFMj8,8501
-hirundo-0.1.16.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
-hirundo-0.1.16.dist-info/entry_points.txt,sha256=4ZtnA_Nl1Af8fLnHp3lwjbGDEGU1S6ujb_JwtuQ7ZPM,44
-hirundo-0.1.16.dist-info/top_level.txt,sha256=cmyNqrNZOAYxnywJGFI1AJBLe4SkH8HGsfFx6ncdrbI,8
-hirundo-0.1.16.dist-info/RECORD,,