hirundo 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hirundo/storage.py CHANGED
@@ -1,51 +1,139 @@
1
1
  import typing
2
2
  from enum import Enum
3
- from typing import Union
3
+ from pathlib import Path
4
4
 
5
5
  import pydantic
6
6
  import requests
7
7
  from pydantic import BaseModel, model_validator
8
8
  from pydantic_core import Url
9
9
 
10
- from hirundo._constraints import S3BucketUrl, StorageIntegrationName
10
+ from hirundo._constraints import S3BucketUrl, StorageConfigName
11
11
  from hirundo._env import API_HOST
12
12
  from hirundo._headers import get_auth_headers, json_headers
13
+ from hirundo._http import raise_for_status_with_reason
13
14
  from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
14
- from hirundo.git import GitRepo
15
+ from hirundo.git import GitRepo, GitRepoOut
15
16
  from hirundo.logger import get_logger
16
17
 
17
18
  logger = get_logger(__name__)
18
19
 
20
+ S3_PREFIX = "s3://"
19
21
 
20
- class StorageS3(BaseModel):
21
- endpoint_url: Union[Url, None] = None
22
+
23
+ class StorageS3Base(BaseModel):
24
+ endpoint_url: typing.Optional[Url] = None
22
25
  bucket_url: S3BucketUrl
23
26
  region_name: str
24
27
  # ⬆️ We could restrict this, but if we're allowing custom endpoints then the validation may be wrong
25
- access_key_id: Union[str, None] = None
26
- secret_access_key: Union[str, None] = None
28
+ access_key_id: typing.Optional[str] = None
29
+
30
+ def get_url(self, path: typing.Union[str, Path]) -> Url:
31
+ """
32
+ Get the full URL for a file in the S3 bucket
33
+
34
+ Chains the bucket URL with the path, ensuring that the path is formatted correctly
35
+
36
+ Args:
37
+ - path: The path to the file in the S3 bucket, e.g. `my-file.txt` or `/my-folder/my-file.txt`
38
+
39
+ Returns:
40
+ The full URL to the file in the S3 bucket, e.g. `s3://my-bucket/my-file.txt` or `s3://my-bucket/my-folder/my-file.txt`,
41
+ where `s3://my-bucket` is the bucket URL provided in the S3 storage config
42
+ """
43
+ return Url(
44
+ f"{S3_PREFIX}{self.bucket_url.removeprefix(S3_PREFIX).removesuffix('/')}/{str(path).removeprefix('/')}"
45
+ )
46
+
47
+
48
+ class StorageS3(StorageS3Base):
49
+ secret_access_key: typing.Optional[str] = None
27
50
 
28
51
 
29
- class StorageGCP(BaseModel):
52
+ class StorageS3Out(StorageS3Base):
53
+ pass
54
+
55
+
56
+ class StorageGCPBase(BaseModel):
30
57
  bucket_name: str
31
58
  project: str
32
- credentials_json: Union[dict, None] = None
33
59
 
60
+ def get_url(self, path: typing.Union[str, Path]) -> Url:
61
+ """
62
+ Get the full URL for a file in the GCP bucket
63
+
64
+ Chains the bucket URL with the path, ensuring that the path is formatted correctly
65
+
66
+ Args:
67
+ - path: The path to the file in the GCP bucket, e.g. `my-file.txt` or `/my-folder/my-file.txt`
68
+
69
+ Returns:
70
+ The full URL to the file in the GCP bucket, e.g. `gs://my-bucket/my-file.txt` or `gs://my-bucket/my-folder/my-file.txt`,
71
+ where `my-bucket` is the bucket name provided in the GCP storage config
72
+ """
73
+ return Url(f"gs://{self.bucket_name}/{str(path).removeprefix('/')}")
74
+
75
+
76
+ class StorageGCP(StorageGCPBase):
77
+ credentials_json: typing.Optional[dict] = None
34
78
 
35
- # TODO: Azure storage integration is coming soon
79
+
80
+ class StorageGCPOut(StorageGCPBase):
81
+ pass
82
+
83
+
84
+ # TODO: Azure storage config is coming soon
36
85
  # class StorageAzure(BaseModel):
86
+ # account_url: HttpUrl
87
+ # container_name: str
88
+ # tenant_id: str
89
+
90
+ # def get_url(self, path: typing.Union[str, Path]) -> Url:
91
+ # """
92
+ # Get the full URL for a file in the Azure container
93
+
94
+ # Chains the container URL with the path, ensuring that the path is formatted correctly
95
+
96
+ # Args:
97
+ # - path: The path to the file in the Azure container, e.g. `my-file.txt` or `/my-folder/my-file.txt`
98
+
99
+ # Returns:
100
+ # The full URL to the file in the Azure container
101
+ # """
102
+ # return Url(f"{str(self.account_url)}/{self.container_name}/{str(path).removeprefix('/')}")
103
+ # class StorageAzureOut(BaseModel):
37
104
  # container: str
38
- # account_name: str
39
- # account_key: str
105
+ # account_url: str
106
+
107
+
108
+ def get_git_repo_url(
109
+ repo_url: typing.Union[str, Url], path: typing.Union[str, Path]
110
+ ) -> Url:
111
+ """
112
+ Get the full URL for a file in the git repository
113
+
114
+ Chains the repository URL with the path, ensuring that the path is formatted correctly
115
+
116
+ Args:
117
+ - repo_url: The URL of the git repository, e.g. `https://my-git-repository.com`
118
+ - path: The path to the file in the git repository, e.g. `my-file.txt` or `/my-folder/my-file.txt`
119
+
120
+ Returns:
121
+ The full URL to the file in the git repository, e.g. `https://my-git-repository.com/my-file.txt` or `https://my-git-repository.com/my-folder/my-file.txt`
122
+ """
123
+ if not isinstance(repo_url, Url):
124
+ repo_url = Url(repo_url)
125
+ return Url(
126
+ f"{repo_url.scheme}{str(repo_url).removeprefix(repo_url.scheme)}/{str(path).removeprefix('/')}"
127
+ )
40
128
 
41
129
 
42
130
  class StorageGit(BaseModel):
43
- repo_id: Union[int, None] = None
131
+ repo_id: typing.Optional[int] = None
44
132
  """
45
133
  The ID of the Git repository in the Hirundo system.
46
134
  Either `repo_id` or `repo` must be provided.
47
135
  """
48
- repo: Union[GitRepo, None] = None
136
+ repo: typing.Optional[GitRepo] = None
49
137
  """
50
138
  The Git repository to link to.
51
139
  Either `repo_id` or `repo` must be provided.
@@ -61,53 +149,95 @@ class StorageGit(BaseModel):
61
149
  raise ValueError("Either repo_id or repo must be provided")
62
150
  return self
63
151
 
152
+ def get_url(self, path: typing.Union[str, Path]) -> Url:
153
+ """
154
+ Get the full URL for a file in the git repository
155
+
156
+ Chains the repository URL with the path, ensuring that the path is formatted correctly
157
+
158
+ Args:
159
+ - path: The path to the file in the git repository, e.g. `my-file.txt` or `/my-folder/my-file.txt`
160
+
161
+ Returns:
162
+ The full URL to the file in the git repository, e.g. `https://my-git-repository.com/my-file.txt` or `https://my-git-repository.com/my-folder/my-file.txt`,
163
+ where `https://my-git-repository.com` is the repository URL provided in the git storage config's git repo
164
+ """
165
+ if not self.repo:
166
+ raise ValueError("Repo must be provided to use `get_url`")
167
+ repo_url = self.repo.repository_url
168
+ return get_git_repo_url(repo_url, path)
169
+
170
+
171
+ class StorageGitOut(BaseModel):
172
+ repo: GitRepoOut
173
+ branch: str
174
+
175
+ def get_url(self, path: typing.Union[str, Path]) -> Url:
176
+ """
177
+ Get the full URL for a file in the git repository
178
+
179
+ Chains the repository URL with the path, ensuring that the path is formatted correctly
180
+
181
+ Args:
182
+ - path: The path to the file in the git repository, e.g. `my-file.txt` or `/my-folder/my-file.txt`
183
+
184
+ Returns:
185
+ The full URL to the file in the git repository, e.g. `https://my-git-repository.com/my-file.txt` or `https://my-git-repository.com/my-folder/my-file.txt`,
186
+ where `https://my-git-repository.com` is the repository URL provided in the git storage config's git repo
187
+ """
188
+ repo_url = self.repo.repository_url
189
+ return get_git_repo_url(repo_url, path)
190
+
64
191
 
65
192
  class StorageTypes(str, Enum):
66
193
  """
67
- Enum for the different types of storage integrations.
194
+ Enum for the different types of storage configs.
68
195
  Supported types are:
69
196
  """
70
197
 
71
198
  S3 = "S3"
72
199
  GCP = "GCP"
73
- # AZURE = "Azure" TODO: Azure storage integration is coming soon
200
+ # AZURE = "Azure" TODO: Azure storage config is coming soon
74
201
  GIT = "Git"
75
202
  LOCAL = "Local"
76
203
  """
77
- Local storage integration is only supported for on-premises installations.
204
+ Local storage config is only supported for on-premises installations.
78
205
  """
79
206
 
80
207
 
81
- class StorageIntegration(BaseModel):
82
- id: Union[int, None] = None
208
+ class StorageConfig(BaseModel):
209
+ id: typing.Optional[int] = None
210
+ """
211
+ The ID of the `StorageConfig` in the Hirundo system.
212
+ """
83
213
 
84
- organization_id: Union[int, None] = None
214
+ organization_id: typing.Optional[int] = None
85
215
  """
86
- The ID of the organization that the `StorageIntegration` belongs to.
216
+ The ID of the organization that the `StorageConfig` belongs to.
87
217
  If not provided, it will be assigned to your default organization.
88
218
  """
89
219
 
90
- name: StorageIntegrationName
220
+ name: StorageConfigName
91
221
  """
92
- A name to identify the `StorageIntegration` in the Hirundo system.
222
+ A name to identify the `StorageConfig` in the Hirundo system.
93
223
  """
94
224
  type: typing.Optional[StorageTypes] = pydantic.Field(
95
225
  examples=[
96
226
  StorageTypes.S3,
97
227
  StorageTypes.GCP,
98
- # StorageTypes.AZURE, TODO: Azure storage integration is coming soon
228
+ # StorageTypes.AZURE, TODO: Azure storage is coming soon
99
229
  StorageTypes.GIT,
100
230
  ]
101
231
  )
102
232
  """
103
- The type of the `StorageIntegration`.
233
+ The type of the `StorageConfig`.
104
234
  Supported types are:
105
235
  - `S3`
106
236
  - `GCP`
107
237
  - `Azure` (coming soon)
108
238
  - `Git`
109
239
  """
110
- s3: Union[StorageS3, None] = pydantic.Field(
240
+ s3: typing.Optional[StorageS3] = pydantic.Field(
111
241
  default=None,
112
242
  examples=[
113
243
  {
@@ -122,10 +252,10 @@ class StorageIntegration(BaseModel):
122
252
  ],
123
253
  )
124
254
  """
125
- The Amazon Web Services (AWS) S3 storage integration details.
255
+ The Amazon Web Services (AWS) S3 storage config details.
126
256
  Use this if you want to link to an S3 bucket.
127
257
  """
128
- gcp: Union[StorageGCP, None] = pydantic.Field(
258
+ gcp: typing.Optional[StorageGCP] = pydantic.Field(
129
259
  default=None,
130
260
  examples=[
131
261
  None,
@@ -151,11 +281,11 @@ class StorageIntegration(BaseModel):
151
281
  ],
152
282
  )
153
283
  """
154
- The Google Cloud (GCP) Storage integration details.
284
+ The Google Cloud (GCP) Storage config details.
155
285
  Use this if you want to link to a GCS bucket.
156
286
  """
157
287
  azure: None = None
158
- # azure: Union[StorageAzure, None] = pydantic.Field(
288
+ # azure: typing.Optional[StorageAzure] = pydantic.Field(
159
289
  # default=None,
160
290
  # examples=[
161
291
  # None,
@@ -167,8 +297,8 @@ class StorageIntegration(BaseModel):
167
297
  # },
168
298
  # None,
169
299
  # ],
170
- # ) TODO: Azure storage integration is coming soon
171
- git: Union[StorageGit, None] = pydantic.Field(
300
+ # ) TODO: Azure storage config is coming soon
301
+ git: typing.Optional[StorageGit] = pydantic.Field(
172
302
  default=None,
173
303
  examples=[
174
304
  None,
@@ -186,73 +316,116 @@ class StorageIntegration(BaseModel):
186
316
  ],
187
317
  )
188
318
  """
189
- The Git storage integration details.
319
+ The Git storage config details.
190
320
  Use this if you want to link to a Git repository.
191
321
  """
192
322
 
193
323
  @staticmethod
194
- def list(organization_id: typing.Union[int, None] = None) -> list[dict]:
324
+ def get_by_id(storage_config_id: int) -> "ResponseStorageConfig":
195
325
  """
196
- Lists all the `StorageIntegration`'s created by user's default organization
197
- Note: The return type is `list[dict]` and not `list[StorageIntegration]`
326
+ Retrieves a `StorageConfig` instance from the server by its ID
198
327
 
199
328
  Args:
200
- organization_id: The ID of the organization to list `StorageIntegration`'s for.
201
- If not provided, it will list `StorageIntegration`'s for the default organization.
329
+ storage_config_id: The ID of the `StorageConfig` to retrieve
202
330
  """
203
- storage_integrations = requests.get(
204
- f"{API_HOST}/storage-integration/",
205
- params={"storage_integration_organization_id": organization_id},
331
+ storage_config = requests.get(
332
+ f"{API_HOST}/storage-config/{storage_config_id}",
206
333
  headers=get_auth_headers(),
207
334
  timeout=READ_TIMEOUT,
208
335
  )
209
- storage_integrations.raise_for_status()
210
- return storage_integrations.json()
336
+ raise_for_status_with_reason(storage_config)
337
+ return ResponseStorageConfig(**storage_config.json())
211
338
 
212
339
  @staticmethod
213
- def delete_by_id(storage_integration_id) -> None:
340
+ def get_by_name(name: str, storage_type: StorageTypes) -> "ResponseStorageConfig":
214
341
  """
215
- Deletes a `StorageIntegration` instance from the server by its ID
342
+ Retrieves a `StorageConfig` instance from the server by its name
216
343
 
217
344
  Args:
218
- storage_integration_id: The ID of the `StorageIntegration` to delete
345
+ name: The name of the `StorageConfig` to retrieve
346
+ storage_type: The type of the `StorageConfig` to retrieve
347
+
348
+ Note: The type is required because the name is not unique across different storage types
219
349
  """
220
- storage_integration = requests.delete(
221
- f"{API_HOST}/storage-integration/{storage_integration_id}",
350
+ storage_config = requests.get(
351
+ f"{API_HOST}/storage-config/by-name/{name}?storage_type={storage_type.value}",
352
+ headers=get_auth_headers(),
353
+ timeout=READ_TIMEOUT,
354
+ )
355
+ raise_for_status_with_reason(storage_config)
356
+ return ResponseStorageConfig(**storage_config.json())
357
+
358
+ @staticmethod
359
+ def list(
360
+ organization_id: typing.Optional[int] = None,
361
+ ) -> list["ResponseStorageConfig"]:
362
+ """
363
+ Lists all the `StorageConfig`'s created by user's default organization
364
 + Note: Each returned item is parsed into a `ResponseStorageConfig` instance
365
+
366
+ Args:
367
+ organization_id: The ID of the organization to list `StorageConfig`'s for.
368
+ If not provided, it will list `StorageConfig`'s for the default organization.
369
+ """
370
+ storage_configs = requests.get(
371
+ f"{API_HOST}/storage-config/",
372
+ params={"storage_config_organization_id": organization_id},
373
+ headers=get_auth_headers(),
374
+ timeout=READ_TIMEOUT,
375
+ )
376
+ raise_for_status_with_reason(storage_configs)
377
+ return [ResponseStorageConfig(**si) for si in storage_configs.json()]
378
+
379
+ @staticmethod
380
+ def delete_by_id(storage_config_id) -> None:
381
+ """
382
+ Deletes a `StorageConfig` instance from the server by its ID
383
+
384
+ Args:
385
+ storage_config_id: The ID of the `StorageConfig` to delete
386
+ """
387
+ storage_config = requests.delete(
388
+ f"{API_HOST}/storage-config/{storage_config_id}",
222
389
  headers=get_auth_headers(),
223
390
  timeout=MODIFY_TIMEOUT,
224
391
  )
225
- storage_integration.raise_for_status()
226
- logger.info("Deleted storage integration with ID: %s", storage_integration_id)
392
+ raise_for_status_with_reason(storage_config)
393
+ logger.info("Deleted storage config with ID: %s", storage_config_id)
227
394
 
228
395
  def delete(self) -> None:
229
396
  """
230
- Deletes the `StorageIntegration` instance from the server
397
+ Deletes the `StorageConfig` instance from the server
231
398
  """
232
399
  if not self.id:
233
- raise ValueError("No StorageIntegration has been created")
400
+ raise ValueError("No StorageConfig has been created")
234
401
  self.delete_by_id(self.id)
235
402
 
236
- def create(self) -> int:
403
+ def create(self, replace_if_exists: bool = False) -> int:
237
404
  """
238
- Create a `StorageIntegration` instance on the server
405
+ Create a `StorageConfig` instance on the server
406
+
407
+ Args:
408
+ replace_if_exists: If a `StorageConfig` with the same name and type already exists, replace it.
239
409
  """
240
410
  if self.git and self.git.repo:
241
- self.git.repo_id = self.git.repo.create()
242
- storage_integration = requests.post(
243
- f"{API_HOST}/storage-integration/",
244
- json=self.model_dump(),
411
+ self.git.repo_id = self.git.repo.create(replace_if_exists=replace_if_exists)
412
+ storage_config = requests.post(
413
+ f"{API_HOST}/storage-config/",
414
+ json={
415
+ **self.model_dump(mode="json"),
416
+ "replace_if_exists": replace_if_exists,
417
+ },
245
418
  headers={
246
419
  **json_headers,
247
420
  **get_auth_headers(),
248
421
  },
249
422
  timeout=MODIFY_TIMEOUT,
250
423
  )
251
- storage_integration.raise_for_status()
252
- storage_integration_id = storage_integration.json()["id"]
253
- self.id = storage_integration_id
254
- logger.info("Created storage integration with ID: %s", storage_integration_id)
255
- return storage_integration_id
424
+ raise_for_status_with_reason(storage_config)
425
+ storage_config_id = storage_config.json()["id"]
426
+ self.id = storage_config_id
427
+ logger.info("Created storage config with ID: %s", storage_config_id)
428
+ return storage_config_id
256
429
 
257
430
  @model_validator(mode="after")
258
431
  def validate_storage_type(self):
@@ -281,15 +454,13 @@ class StorageIntegration(BaseModel):
281
454
  return self
282
455
 
283
456
 
284
- class StorageLink(BaseModel):
285
- storage_integration: StorageIntegration
286
- """
287
- The `StorageIntegration` instance to link to.
288
- """
289
- path: str = "/"
290
- """
291
- Path for the `root` to link to within the `StorageIntegration` instance,
292
- e.g. a prefix path/folder within an S3 Bucket / GCP Bucket / Azure Blob storage / Git repo.
293
-
294
- Note: Only files in this path will be retrieved and it will be used as the root for paths in the CSV.
295
- """
457
+ class ResponseStorageConfig(BaseModel):
458
+ id: int
459
+ name: StorageConfigName
460
+ type: StorageTypes
461
+ organization_name: str
462
+ creator_name: str
463
+ s3: typing.Optional[StorageS3Out]
464
+ gcp: typing.Optional[StorageGCPOut]
465
+ # azure: typing.Optional[StorageAzureOut]
466
+ git: typing.Optional[StorageGitOut]
@@ -0,0 +1,212 @@
1
+ Metadata-Version: 2.1
2
+ Name: hirundo
3
+ Version: 0.1.9
4
+ Summary: This package is used to interface with Hirundo's platform. It provides a simple API to optimize your ML datasets.
5
+ Author-email: Hirundo <dev@hirundo.io>
6
+ License: MIT License
7
+
8
+ Copyright (c) 2024, Hirundo
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
15
+
16
+ Project-URL: Homepage, https://github.com/Hirundo-io/hirundo-client
17
+ Keywords: dataset,machine learning,data science,data engineering
18
+ Classifier: License :: OSI Approved :: MIT License
19
+ Classifier: Programming Language :: Python
20
+ Classifier: Programming Language :: Python :: 3
21
+ Requires-Python: >=3.9
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE
24
+ Requires-Dist: pyyaml>=6.0.1
25
+ Requires-Dist: types-PyYAML>=6.0.12
26
+ Requires-Dist: pydantic>=2.7.1
27
+ Requires-Dist: twine>=5.0.0
28
+ Requires-Dist: python-dotenv>=1.0.1
29
+ Requires-Dist: types-requests>=2.31.0
30
+ Requires-Dist: typer>=0.12.3
31
+ Requires-Dist: httpx>=0.27.0
32
+ Requires-Dist: stamina>=24.2.0
33
+ Requires-Dist: httpx-sse>=0.4.0
34
+ Requires-Dist: pandas>=2.2.2
35
+ Requires-Dist: tqdm>=4.66.5
36
+ Provides-Extra: dev
37
+ Requires-Dist: pyyaml>=6.0.1; extra == "dev"
38
+ Requires-Dist: types-PyYAML>=6.0.12; extra == "dev"
39
+ Requires-Dist: pydantic>=2.7.1; extra == "dev"
40
+ Requires-Dist: twine>=5.0.0; extra == "dev"
41
+ Requires-Dist: python-dotenv>=1.0.1; extra == "dev"
42
+ Requires-Dist: types-requests>=2.31.0; extra == "dev"
43
+ Requires-Dist: types-setuptools>=69.5.0; extra == "dev"
44
+ Requires-Dist: typer>=0.12.3; extra == "dev"
45
+ Requires-Dist: httpx>=0.27.0; extra == "dev"
46
+ Requires-Dist: stamina>=24.2.0; extra == "dev"
47
+ Requires-Dist: httpx-sse>=0.4.0; extra == "dev"
48
+ Requires-Dist: pytest>=8.2.0; extra == "dev"
49
+ Requires-Dist: pytest-asyncio>=0.23.6; extra == "dev"
50
+ Requires-Dist: uv>=0.5.8; extra == "dev"
51
+ Requires-Dist: pre-commit>=3.7.1; extra == "dev"
52
+ Requires-Dist: virtualenv>=20.6.6; extra == "dev"
53
+ Requires-Dist: ruff>=0.8.2; extra == "dev"
54
+ Requires-Dist: bumpver; extra == "dev"
55
+ Requires-Dist: platformdirs>=4.3.6; extra == "dev"
56
+ Requires-Dist: safety>=3.2.13; extra == "dev"
57
+ Provides-Extra: docs
58
+ Requires-Dist: sphinx>=7.4.7; extra == "docs"
59
+ Requires-Dist: sphinx-autobuild>=2024.4.16; extra == "docs"
60
+ Requires-Dist: sphinx-click>=5.0.1; extra == "docs"
61
+ Requires-Dist: autodoc_pydantic>=2.2.0; extra == "docs"
62
+ Requires-Dist: furo; extra == "docs"
63
+ Requires-Dist: sphinx-multiversion; extra == "docs"
64
+ Requires-Dist: esbonio; extra == "docs"
65
+ Requires-Dist: starlette>0.40.0; extra == "docs"
66
+ Requires-Dist: markupsafe>=3.0.2; extra == "docs"
67
+
68
+ # Hirundo
69
+
70
+ This package exposes access to Hirundo APIs for dataset optimization for Machine Learning.
71
+
72
+ Dataset optimization is currently available for datasets labelled for classification and object detection.
73
+
74
+
75
 + Supported dataset storage configs include:
76
+ - Google Cloud (GCP) Storage
77
+ - Amazon Web Services (AWS) S3
78
+ - Git LFS (Large File Storage) repositories (e.g. GitHub or HuggingFace)
79
+
80
+ Optimizing a classification dataset
81
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
82
+
83
+ Currently ``hirundo`` requires a CSV file with the following columns (all columns are required):
84
+ - ``image_path``: The location of the image within the dataset ``root``
85
+ - ``label``: The label of the image, i.e. which the class that was annotated for this image
86
+
87
+ And outputs a CSV with the same columns and:
88
+ - ``suspect_level``: mislabel suspect level
89
+ - ``suggested_label``: suggested label
90
+ - ``suggested_label_conf``: suggested label confidence
91
+
92
+ Optimizing an object detection (OD) dataset
93
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
94
+
95
+ Currently ``hirundo`` requires a CSV file with the following columns (all columns are required):
96
+ - ``image_path``: The location of the image within the dataset ``root``
97
+ - ``bbox_id``: The index of the bounding box within the dataset. Used to indicate label suspects
98
+ - ``label``: The label of the image, i.e. which the class that was annotated for this image
99
+ - ``x1``, ``y1``, ``x2``, ``y2``: The bounding box coordinates of the object within the image
100
+
101
+ And outputs a CSV with the same columns and:
102
+ - ``suspect_level``: object mislabel suspect level
103
+ - ``suggested_label``: suggested object label
104
+ - ``suggested_label_conf``: suggested object label confidence
105
+
106
+ Note: This Python package must be used alongside a Hirundo server, either the SaaS platform, a custom VPC deployment or an on-premises installation.
107
+
108
+
109
+ ## Installation
110
+
111
+ You can install the codebase with a simple `pip install hirundo` to install the latest version of this package. If you prefer to install from the Git repository and/or need a specific version or branch, you can simply clone the repository, check out the relevant commit and then run `pip install .` to install that version. A full list of dependencies can be found in `requirements.txt`, but these will be installed automatically by either of these commands.
112
+
113
+ ## Usage
114
+
115
+ Classification example:
116
+ ```python
117
+ from hirundo import (
118
+ HirundoCSV,
119
+ LabelingType,
120
+ OptimizationDataset,
121
+ StorageGCP,
122
+ StorageConfig,
123
+ StorageTypes,
124
+ )
125
+
126
+ gcp_bucket = StorageGCP(
127
+ bucket_name="cifar100bucket",
128
+ project="Hirundo-global",
129
+ credentials_json=json.loads(os.environ["GCP_CREDENTIALS"]),
130
+ )
131
+ test_dataset = OptimizationDataset(
132
+ name="TEST-GCP cifar 100 classification dataset",
133
+ labeling_type=LabelingType.SINGLE_LABEL_CLASSIFICATION,
134
+ storage_config=StorageConfig(
135
+ name="cifar100bucket",
136
+ type=StorageTypes.GCP,
137
+ gcp=gcp_bucket,
138
+ ),
139
+ data_root_url=gcp_bucket.get_url(path="/pytorch-cifar/data"),
140
+ labeling_info=HirundoCSV(
141
+ csv_url=gcp_bucket.get_url(path="/pytorch-cifar/data/cifar100.csv"),
142
+ ),
143
+ classes=cifar100_classes,
144
+ )
145
+
146
+ test_dataset.run_optimization()
147
+ results = test_dataset.check_run()
148
+ print(results)
149
+ ```
150
+
151
+
152
+ Object detection example:
153
+
154
+ ```python
155
+ from hirundo import (
156
+ GitRepo,
157
+ HirundoCSV,
158
+ LabelingType,
159
+ OptimizationDataset,
160
+ StorageGit,
161
+ StorageConfig,
162
+ StorageTypes,
163
+ )
164
+
165
+ git_storage = StorageGit(
166
+ repo=GitRepo(
167
+ name="BDD-100k-validation-dataset",
168
+ repository_url="https://git@hf.co/datasets/hirundo-io/bdd100k-validation-only.git",
169
+ ),
170
+ branch="main",
171
+ )
172
+ test_dataset = OptimizationDataset(
173
+ name="TEST-HuggingFace-BDD-100k-validation-OD-validation-dataset",
174
+ labeling_type=LabelingType.OBJECT_DETECTION,
175
+ storage_config=StorageConfig(
176
+ name="BDD-100k-validation-dataset",
177
+ type=StorageTypes.GIT,
178
+ git=git_storage,
179
+ ),
180
+ data_root_url=git_storage.get_url(path="/BDD100K Val from Hirundo.zip/bdd100k"),
181
+ labeling_info=HirundoCSV(
182
+ csv_url=git_storage.get_url(
183
+ path="/BDD100K Val from Hirundo.zip/bdd100k/bdd100k.csv"
184
+ ),
185
+ ),
186
+ classes=[
187
+ "traffic light",
188
+ "traffic sign",
189
+ "car",
190
+ "pedestrian",
191
+ "bus",
192
+ "truck",
193
+ "rider",
194
+ "bicycle",
195
+ "motorcycle",
196
+ "train",
197
+ "other vehicle",
198
+ "other person",
199
+ "trailer",
200
+ ],
201
+ )
202
+
203
+ test_dataset.run_optimization()
204
+ results = test_dataset.check_run()
205
+ print(results)
206
+ ```
207
+
208
 + Note: Currently we only support the main CPython releases 3.9, 3.10 and 3.11. PyPy support may be introduced in the future.
209
+
210
+ ## Further documentation
211
+
212
 + To learn more about how to use this library, please visit the [documentation](http://docs.hirundo.io/) or see the Google Colab examples.