hirundo 0.1.8__py3-none-any.whl → 0.1.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hirundo/storage.py CHANGED
@@ -1,42 +1,130 @@
  import typing
  from enum import Enum
+ from pathlib import Path

  import pydantic
  import requests
  from pydantic import BaseModel, model_validator
  from pydantic_core import Url

- from hirundo._constraints import S3BucketUrl, StorageIntegrationName
+ from hirundo._constraints import S3BucketUrl, StorageConfigName
  from hirundo._env import API_HOST
- from hirundo._headers import get_auth_headers, json_headers
+ from hirundo._headers import get_headers
  from hirundo._http import raise_for_status_with_reason
  from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
- from hirundo.git import GitRepo
+ from hirundo.git import GitRepo, GitRepoOut
  from hirundo.logger import get_logger

  logger = get_logger(__name__)

+ S3_PREFIX = "s3://"

- class StorageS3(BaseModel):
+
+ class StorageS3Base(BaseModel):
      endpoint_url: typing.Optional[Url] = None
      bucket_url: S3BucketUrl
      region_name: str
      # ⬆️ We could restrict this, but if we're allowing custom endpoints then the validation may be wrong
      access_key_id: typing.Optional[str] = None
+
+     def get_url(self, path: typing.Union[str, Path]) -> Url:
+         """
+         Get the full URL for a file in the S3 bucket
+
+         Chains the bucket URL with the path, ensuring that the path is formatted correctly
+
+         Args:
+             path: The path to the file in the S3 bucket, e.g. `my-file.txt` or `/my-folder/my-file.txt`
+
+         Returns:
+             The full URL to the file in the S3 bucket, e.g. `s3://my-bucket/my-file.txt` or `s3://my-bucket/my-folder/my-file.txt`,
+             where `s3://my-bucket` is the bucket URL provided in the S3 storage config
+         """
+         return Url(
+             f"{S3_PREFIX}{self.bucket_url.removeprefix(S3_PREFIX).removesuffix('/')}/{str(path).removeprefix('/')}"
+         )
+
+
+ class StorageS3(StorageS3Base):
      secret_access_key: typing.Optional[str] = None


- class StorageGCP(BaseModel):
+ class StorageS3Out(StorageS3Base):
+     pass
+
+
+ class StorageGCPBase(BaseModel):
      bucket_name: str
      project: str
+
+     def get_url(self, path: typing.Union[str, Path]) -> Url:
+         """
+         Get the full URL for a file in the GCP bucket
+
+         Chains the bucket URL with the path, ensuring that the path is formatted correctly
+
+         Args:
+             path: The path to the file in the GCP bucket, e.g. `my-file.txt` or `/my-folder/my-file.txt`
+
+         Returns:
+             The full URL to the file in the GCP bucket, e.g. `gs://my-bucket/my-file.txt` or `gs://my-bucket/my-folder/my-file.txt`,
+             where `my-bucket` is the bucket name provided in the GCP storage config
+         """
+         return Url(f"gs://{self.bucket_name}/{str(path).removeprefix('/')}")
+
+
+ class StorageGCP(StorageGCPBase):
      credentials_json: typing.Optional[dict] = None


- # TODO: Azure storage integration is coming soon
+ class StorageGCPOut(StorageGCPBase):
+     pass
+
+
+ # TODO: Azure storage config is coming soon
  # class StorageAzure(BaseModel):
+ #     account_url: HttpUrl
+ #     container_name: str
+ #     tenant_id: str
+
+ #     def get_url(self, path: typing.Union[str, Path]) -> Url:
+ #         """
+ #         Get the full URL for a file in the Azure container
+
+ #         Chains the container URL with the path, ensuring that the path is formatted correctly
+
+ #         Args:
+ #             path: The path to the file in the Azure container, e.g. `my-file.txt` or `/my-folder/my-file.txt`
+
+ #         Returns:
+ #             The full URL to the file in the Azure container
+ #         """
+ #         return Url(f"{str(self.account_url)}/{self.container_name}/{str(path).removeprefix('/')}")
+ # class StorageAzureOut(BaseModel):
  #     container: str
- #     account_name: str
- #     account_key: str
+ #     account_url: str
+
+
+ def get_git_repo_url(
+     repo_url: typing.Union[str, Url], path: typing.Union[str, Path]
+ ) -> Url:
+     """
+     Get the full URL for a file in the git repository
+
+     Chains the repository URL with the path, ensuring that the path is formatted correctly
+
+     Args:
+         repo_url: The URL of the git repository, e.g. `https://my-git-repository.com`
+         path: The path to the file in the git repository, e.g. `my-file.txt` or `/my-folder/my-file.txt`
+
+     Returns:
+         The full URL to the file in the git repository, e.g. `https://my-git-repository.com/my-file.txt` or `https://my-git-repository.com/my-folder/my-file.txt`
+     """
+     if not isinstance(repo_url, Url):
+         repo_url = Url(repo_url)
+     return Url(
+         f"{repo_url.scheme}{str(repo_url).removeprefix(repo_url.scheme)}/{str(path).removeprefix('/')}"
+     )


  class StorageGit(BaseModel):
@@ -61,46 +149,88 @@ class StorageGit(BaseModel):
              raise ValueError("Either repo_id or repo must be provided")
          return self

+     def get_url(self, path: typing.Union[str, Path]) -> Url:
+         """
+         Get the full URL for a file in the git repository
+
+         Chains the repository URL with the path, ensuring that the path is formatted correctly
+
+         Args:
+             path: The path to the file in the git repository, e.g. `my-file.txt` or `/my-folder/my-file.txt`
+
+         Returns:
+             The full URL to the file in the git repository, e.g. `https://my-git-repository.com/my-file.txt` or `https://my-git-repository.com/my-folder/my-file.txt`,
+             where `https://my-git-repository.com` is the repository URL provided in the git storage config's git repo
+         """
+         if not self.repo:
+             raise ValueError("Repo must be provided to use `get_url`")
+         repo_url = self.repo.repository_url
+         return get_git_repo_url(repo_url, path)
+
+
+ class StorageGitOut(BaseModel):
+     repo: GitRepoOut
+     branch: str
+
+     def get_url(self, path: typing.Union[str, Path]) -> Url:
+         """
+         Get the full URL for a file in the git repository
+
+         Chains the repository URL with the path, ensuring that the path is formatted correctly
+
+         Args:
+             path: The path to the file in the git repository, e.g. `my-file.txt` or `/my-folder/my-file.txt`
+
+         Returns:
+             The full URL to the file in the git repository, e.g. `https://my-git-repository.com/my-file.txt` or `https://my-git-repository.com/my-folder/my-file.txt`,
+             where `https://my-git-repository.com` is the repository URL provided in the git storage config's git repo
+         """
+         repo_url = self.repo.repository_url
+         return get_git_repo_url(repo_url, path)
+

  class StorageTypes(str, Enum):
      """
-     Enum for the different types of storage integrations.
+     Enum for the different types of storage configs.
      Supported types are:
      """

      S3 = "S3"
      GCP = "GCP"
-     # AZURE = "Azure" TODO: Azure storage integration is coming soon
+     # AZURE = "Azure" TODO: Azure storage config is coming soon
      GIT = "Git"
      LOCAL = "Local"
      """
-     Local storage integration is only supported for on-premises installations.
+     Local storage config is only supported for on-premises installations.
      """


- class StorageIntegration(BaseModel):
+ class StorageConfig(BaseModel):
      id: typing.Optional[int] = None
+     """
+     The ID of the `StorageConfig` in the Hirundo system.
+     """

      organization_id: typing.Optional[int] = None
      """
-     The ID of the organization that the `StorageIntegration` belongs to.
+     The ID of the organization that the `StorageConfig` belongs to.
      If not provided, it will be assigned to your default organization.
      """

-     name: StorageIntegrationName
+     name: StorageConfigName
      """
-     A name to identify the `StorageIntegration` in the Hirundo system.
+     A name to identify the `StorageConfig` in the Hirundo system.
      """
      type: typing.Optional[StorageTypes] = pydantic.Field(
          examples=[
              StorageTypes.S3,
              StorageTypes.GCP,
-             # StorageTypes.AZURE, TODO: Azure storage integration is coming soon
+             # StorageTypes.AZURE, TODO: Azure storage is coming soon
              StorageTypes.GIT,
          ]
      )
      """
-     The type of the `StorageIntegration`.
+     The type of the `StorageConfig`.
      Supported types are:
      - `S3`
      - `GCP`
@@ -122,7 +252,7 @@ class StorageIntegration(BaseModel):
          ],
      )
      """
-     The Amazon Web Services (AWS) S3 storage integration details.
+     The Amazon Web Services (AWS) S3 storage config details.
      Use this if you want to link to an S3 bucket.
      """
      gcp: typing.Optional[StorageGCP] = pydantic.Field(
@@ -151,7 +281,7 @@ class StorageIntegration(BaseModel):
          ],
      )
      """
-     The Google Cloud (GCP) Storage integration details.
+     The Google Cloud (GCP) Storage config details.
      Use this if you want to link to an GCS bucket.
      """
      azure: None = None
@@ -167,7 +297,7 @@ class StorageIntegration(BaseModel):
      #         },
      #         None,
      #     ],
-     # ) TODO: Azure storage integration is coming soon
+     # ) TODO: Azure storage config is coming soon
      git: typing.Optional[StorageGit] = pydantic.Field(
          default=None,
          examples=[
@@ -186,73 +316,113 @@ class StorageIntegration(BaseModel):
          ],
      )
      """
-     The Git storage integration details.
+     The Git storage config details.
      Use this if you want to link to a Git repository.
      """

      @staticmethod
-     def list(organization_id: typing.Optional[int] = None) -> list[dict]:
+     def get_by_id(storage_config_id: int) -> "ResponseStorageConfig":
          """
-         Lists all the `StorageIntegration`'s created by user's default organization
-         Note: The return type is `list[dict]` and not `list[StorageIntegration]`
+         Retrieves a `StorageConfig` instance from the server by its ID

          Args:
-             organization_id: The ID of the organization to list `StorageIntegration`'s for.
-                 If not provided, it will list `StorageIntegration`'s for the default organization.
+             storage_config_id: The ID of the `StorageConfig` to retrieve
          """
-         storage_integrations = requests.get(
-             f"{API_HOST}/storage-integration/",
-             params={"storage_integration_organization_id": organization_id},
-             headers=get_auth_headers(),
+         storage_config = requests.get(
+             f"{API_HOST}/storage-config/{storage_config_id}",
+             headers=get_headers(),
              timeout=READ_TIMEOUT,
          )
-         raise_for_status_with_reason(storage_integrations)
-         return storage_integrations.json()
+         raise_for_status_with_reason(storage_config)
+         return ResponseStorageConfig(**storage_config.json())

      @staticmethod
-     def delete_by_id(storage_integration_id) -> None:
+     def get_by_name(name: str, storage_type: StorageTypes) -> "ResponseStorageConfig":
          """
-         Deletes a `StorageIntegration` instance from the server by its ID
+         Retrieves a `StorageConfig` instance from the server by its name

          Args:
-             storage_integration_id: The ID of the `StorageIntegration` to delete
+             name: The name of the `StorageConfig` to retrieve
+             storage_type: The type of the `StorageConfig` to retrieve
+
+         Note: The type is required because the name is not unique across different storage types
          """
-         storage_integration = requests.delete(
-             f"{API_HOST}/storage-integration/{storage_integration_id}",
-             headers=get_auth_headers(),
+         storage_config = requests.get(
+             f"{API_HOST}/storage-config/by-name/{name}?storage_type={storage_type.value}",
+             headers=get_headers(),
+             timeout=READ_TIMEOUT,
+         )
+         raise_for_status_with_reason(storage_config)
+         return ResponseStorageConfig(**storage_config.json())
+
+     @staticmethod
+     def list(
+         organization_id: typing.Optional[int] = None,
+     ) -> list["ResponseStorageConfig"]:
+         """
+         Lists all the `StorageConfig`'s created by the user's default organization
+         Note: The return type is `list[ResponseStorageConfig]` and not `list[StorageConfig]`
+
+         Args:
+             organization_id: The ID of the organization to list `StorageConfig`'s for.
+                 If not provided, it will list `StorageConfig`'s for the default organization.
+         """
+         storage_configs = requests.get(
+             f"{API_HOST}/storage-config/",
+             params={"storage_config_organization_id": organization_id},
+             headers=get_headers(),
+             timeout=READ_TIMEOUT,
+         )
+         raise_for_status_with_reason(storage_configs)
+         return [ResponseStorageConfig(**si) for si in storage_configs.json()]
+
+     @staticmethod
+     def delete_by_id(storage_config_id) -> None:
+         """
+         Deletes a `StorageConfig` instance from the server by its ID
+
+         Args:
+             storage_config_id: The ID of the `StorageConfig` to delete
+         """
+         storage_config = requests.delete(
+             f"{API_HOST}/storage-config/{storage_config_id}",
+             headers=get_headers(),
              timeout=MODIFY_TIMEOUT,
          )
-         raise_for_status_with_reason(storage_integration)
-         logger.info("Deleted storage integration with ID: %s", storage_integration_id)
+         raise_for_status_with_reason(storage_config)
+         logger.info("Deleted storage config with ID: %s", storage_config_id)

      def delete(self) -> None:
          """
-         Deletes the `StorageIntegration` instance from the server
+         Deletes the `StorageConfig` instance from the server
          """
          if not self.id:
-             raise ValueError("No StorageIntegration has been created")
+             raise ValueError("No StorageConfig has been created")
          self.delete_by_id(self.id)

-     def create(self) -> int:
+     def create(self, replace_if_exists: bool = False) -> int:
          """
-         Create a `StorageIntegration` instance on the server
+         Create a `StorageConfig` instance on the server
+
+         Args:
+             replace_if_exists: If a `StorageConfig` with the same name and type already exists, replace it.
          """
          if self.git and self.git.repo:
-             self.git.repo_id = self.git.repo.create()
-         storage_integration = requests.post(
-             f"{API_HOST}/storage-integration/",
-             json=self.model_dump(),
-             headers={
-                 **json_headers,
-                 **get_auth_headers(),
+             self.git.repo_id = self.git.repo.create(replace_if_exists=replace_if_exists)
+         storage_config = requests.post(
+             f"{API_HOST}/storage-config/",
+             json={
+                 **self.model_dump(mode="json"),
+                 "replace_if_exists": replace_if_exists,
              },
+             headers=get_headers(),
              timeout=MODIFY_TIMEOUT,
          )
-         raise_for_status_with_reason(storage_integration)
-         storage_integration_id = storage_integration.json()["id"]
-         self.id = storage_integration_id
-         logger.info("Created storage integration with ID: %s", storage_integration_id)
-         return storage_integration_id
+         raise_for_status_with_reason(storage_config)
+         storage_config_id = storage_config.json()["id"]
+         self.id = storage_config_id
+         logger.info("Created storage config with ID: %s", storage_config_id)
+         return storage_config_id

      @model_validator(mode="after")
      def validate_storage_type(self):
@@ -281,15 +451,13 @@ class StorageIntegration(BaseModel):
          return self


- class StorageLink(BaseModel):
-     storage_integration: StorageIntegration
-     """
-     The `StorageIntegration` instance to link to.
-     """
-     path: str = "/"
-     """
-     Path for the `root` to link to within the `StorageIntegration` instance,
-     e.g. a prefix path/folder within an S3 Bucket / GCP Bucket / Azure Blob storage / Git repo.
-
-     Note: Only files in this path will be retrieved and it will be used as the root for paths in the CSV.
-     """
+ class ResponseStorageConfig(BaseModel):
+     id: int
+     name: StorageConfigName
+     type: StorageTypes
+     organization_name: str
+     creator_name: str
+     s3: typing.Optional[StorageS3Out]
+     gcp: typing.Optional[StorageGCPOut]
+     # azure: typing.Optional[StorageAzureOut]
+     git: typing.Optional[StorageGitOut]
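
For orientation, a minimal usage sketch of the renamed API above (StorageIntegration → StorageConfig). The bucket URL, region, and config name below are placeholders, and the hirundo.storage import path is assumed from this file; treat it as illustrative rather than as documented package usage.

from hirundo.storage import StorageConfig, StorageS3, StorageTypes

# Placeholder bucket and name for illustration only
config = StorageConfig(
    name="my-s3-config",
    type=StorageTypes.S3,
    s3=StorageS3(
        bucket_url="s3://example-bucket",
        region_name="us-east-1",
    ),
)

# New in this version: replace an existing config with the same name and type
config_id = config.create(replace_if_exists=True)

# get_url chains the bucket URL with a path inside the bucket
csv_url = config.s3.get_url("annotations/train.csv")
# -> s3://example-bucket/annotations/train.csv

# Server lookups now return ResponseStorageConfig instances
fetched = StorageConfig.get_by_name("my-s3-config", storage_type=StorageTypes.S3)
all_configs = StorageConfig.list()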
hirundo/unzip.py ADDED
@@ -0,0 +1,247 @@
+ import typing
+ import zipfile
+ from collections.abc import Mapping
+ from pathlib import Path
+ from typing import IO, cast
+
+ import requests
+ from pydantic_core import Url
+
+ from hirundo._dataframe import (
+     float32,
+     has_pandas,
+     has_polars,
+     int32,
+     pd,
+     pl,
+     string,
+ )
+ from hirundo._env import API_HOST
+ from hirundo._headers import _get_auth_headers
+ from hirundo._timeouts import DOWNLOAD_READ_TIMEOUT
+ from hirundo.dataset_optimization_results import (
+     DataFrameType,
+     DatasetOptimizationResults,
+ )
+ from hirundo.logger import get_logger
+
+ ZIP_FILE_CHUNK_SIZE = 50 * 1024 * 1024  # 50 MB
+
+ Dtype = typing.Union[type[int32], type[float32], type[string]]
+
+
+ CUSTOMER_INTERCHANGE_DTYPES: Mapping[str, Dtype] = {
+     "image_path": string,
+     "label_path": string,
+     "segments_mask_path": string,
+     "segment_id": int32,
+     "label": string,
+     "bbox_id": string,
+     "xmin": float32,
+     "ymin": float32,
+     "xmax": float32,
+     "ymax": float32,
+     "suspect_level": float32,  # If exists, must be one of the values in the enum below
+     "suggested_label": string,
+     "suggested_label_conf": float32,
+     "status": string,
+     # ⬆️ If exists, must be one of the following:
+     # NO_LABELS/MISSING_IMAGE/INVALID_IMAGE/INVALID_BBOX/INVALID_BBOX_SIZE/INVALID_SEG/INVALID_SEG_SIZE
+ }
+
+ logger = get_logger(__name__)
+
+
+ def _clean_df_index(df: "pd.DataFrame") -> "pd.DataFrame":
+     """
+     Clean the index of a DataFrame in case it has unnamed columns.
+
+     Args:
+         df (DataFrame): DataFrame to clean
+
+     Returns:
+         Cleaned Pandas DataFrame
+     """
+     index_cols = sorted(
+         [col for col in df.columns if col.startswith("Unnamed")], reverse=True
+     )
+     if len(index_cols) > 0:
+         df.set_index(index_cols.pop(), inplace=True)
+         df.rename_axis(index=None, columns=None, inplace=True)
+     if len(index_cols) > 0:
+         df.drop(columns=index_cols, inplace=True)
+
+     return df
+
+
+ def load_df(
+     file: "typing.Union[str, IO[bytes]]",
+ ) -> "DataFrameType":
+     """
+     Load a DataFrame from a CSV file.
+
+     Args:
+         file: The path to the CSV file or an open binary file object to load.
+             Column dtypes are taken from `CUSTOMER_INTERCHANGE_DTYPES`.
+
+     Returns:
+         The loaded DataFrame or `None` if neither Polars nor Pandas is available.
+     """
+     if has_polars:
+         return pl.read_csv(file, schema_overrides=CUSTOMER_INTERCHANGE_DTYPES)
+     elif has_pandas:
+         if typing.TYPE_CHECKING:
+             from pandas._typing import DtypeArg
+
+         dtype = cast("DtypeArg", CUSTOMER_INTERCHANGE_DTYPES)
+         # ⬆️ Casting since CUSTOMER_INTERCHANGE_DTYPES is a Mapping[str, Dtype] in this case
+         df = pd.read_csv(file, dtype=dtype)
+         return cast("DataFrameType", _clean_df_index(df))
+         # ⬆️ Casting since the return type is pd.DataFrame, but this is what DataFrameType is in this case
+     else:
+         return None
+
+
+ def get_mislabel_suspect_filename(filenames: list[str]):
+     mislabel_suspect_filename = "mislabel_suspects.csv"
+     if mislabel_suspect_filename not in filenames:
+         mislabel_suspect_filename = "image_mislabel_suspects.csv"
+     if mislabel_suspect_filename not in filenames:
+         mislabel_suspect_filename = "suspects.csv"
+     if mislabel_suspect_filename not in filenames:
+         raise ValueError(
+             "None of mislabel_suspects.csv, image_mislabel_suspects.csv or suspects.csv were found in the zip file"
+         )
+     return mislabel_suspect_filename
+
+
+ def download_and_extract_zip(
+     run_id: str, zip_url: str
+ ) -> DatasetOptimizationResults[DataFrameType]:
+     """
+     Download and extract the zip file from the given URL.
+
+     Note: It will only extract the `mislabel_suspects.csv` (vision - classification)
+     or `image_mislabel_suspects.csv` & `object_mislabel_suspects.csv` (vision - OD)
+     or `suspects.csv` (STT)
+     and `warnings_and_errors.csv` files from the zip file.
+
+     Args:
+         run_id: The ID of the optimization run.
+         zip_url: The URL of the zip file to download.
+
+     Returns:
+         The dataset optimization results object.
+     """
+     # Define the local file path
+     cache_dir = Path.home() / ".hirundo" / "cache"
+     cache_dir.mkdir(parents=True, exist_ok=True)
+     zip_file_path = cache_dir / f"{run_id}.zip"
+
+     headers = None
+     if Url(zip_url).scheme == "file":
+         zip_url = (
+             f"{API_HOST}/dataset-optimization/run/local-download"
+             + zip_url.replace("file://", "")
+         )
+         headers = _get_auth_headers()
+     # Stream the zip file download
+     with requests.get(
+         zip_url,
+         headers=headers,
+         timeout=DOWNLOAD_READ_TIMEOUT,
+         stream=True,
+     ) as r:
+         r.raise_for_status()
+         with open(zip_file_path, "wb") as f:
+             for chunk in r.iter_content(chunk_size=ZIP_FILE_CHUNK_SIZE):
+                 f.write(chunk)
+         logger.info(
+             "Successfully downloaded the result zip file for run ID %s to %s",
+             run_id,
+             zip_file_path,
+         )
+
+     with zipfile.ZipFile(zip_file_path, "r") as z:
+         # Extract suspects file
+         suspects_df = None
+         object_suspects_df = None
+         warnings_and_errors_df = None
+
+         filenames = []
+         try:
+             filenames = [file.filename for file in z.filelist]
+         except Exception as e:
+             logger.error("Failed to get filenames from ZIP", exc_info=e)
+
+         try:
+             mislabel_suspect_filename = get_mislabel_suspect_filename(filenames)
+             with z.open(mislabel_suspect_filename) as suspects_file:
+                 suspects_df = load_df(suspects_file)
+                 logger.debug(
+                     "Successfully loaded mislabel suspects into DataFrame for run ID %s",
+                     run_id,
+                 )
+         except Exception as e:
+             logger.error(
+                 "Failed to load mislabel suspects into DataFrame", exc_info=e
+             )
+
+         object_mislabel_suspects_filename = "object_mislabel_suspects.csv"
+         if object_mislabel_suspects_filename in filenames:
+             try:
+                 with z.open(
+                     object_mislabel_suspects_filename
+                 ) as object_suspects_file:
+                     object_suspects_df = load_df(object_suspects_file)
+                     logger.debug(
+                         "Successfully loaded object mislabel suspects into DataFrame for run ID %s",
+                         run_id,
+                     )
+             except Exception as e:
+                 logger.error(
+                     "Failed to load object mislabel suspects into DataFrame",
+                     exc_info=e,
+                 )
+
+         try:
+             # Extract warnings_and_errors file
+             with z.open("warnings_and_errors.csv") as warnings_file:
+                 warnings_and_errors_df = load_df(warnings_file)
+                 logger.debug(
+                     "Successfully loaded warnings and errors into DataFrame for run ID %s",
+                     run_id,
+                 )
+         except Exception as e:
+             logger.error(
+                 "Failed to load warnings and errors into DataFrame", exc_info=e
+             )
+
+     return DatasetOptimizationResults[DataFrameType](
+         cached_zip_path=zip_file_path,
+         suspects=suspects_df,
+         object_suspects=object_suspects_df,
+         warnings_and_errors=warnings_and_errors_df,
+     )
+
+
+ def load_from_zip(
+     zip_path: Path, file_name: str
+ ) -> "typing.Union[pd.DataFrame, pl.DataFrame, None]":
+     """
+     Load a given file from a given zip file.
+
+     Args:
+         zip_path: The path to the zip file.
+         file_name: The name of the file to load.
+
+     Returns:
+         The loaded DataFrame or `None` if neither Polars nor Pandas is available.
+     """
+     with zipfile.ZipFile(zip_path, "r") as z:
+         try:
+             with z.open(file_name) as file:
+                 return load_df(file)
+         except Exception as e:
+             logger.error("Failed to load %s from zip file", file_name, exc_info=e)
+             return None
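
A minimal sketch of how the new unzip helpers might be called. The run ID and zip URL are placeholders, and attribute access on the results object (suspects, warnings_and_errors, cached_zip_path) is assumed from the DatasetOptimizationResults constructor shown above; this is illustrative only.

from hirundo.unzip import download_and_extract_zip, load_from_zip

# Placeholder run ID and results-zip URL for illustration only
results = download_and_extract_zip("run-123", "https://example.com/results/run-123.zip")

print(results.suspects)             # mislabel suspects DataFrame (Polars or Pandas), or None
print(results.warnings_and_errors)  # contents of warnings_and_errors.csv, or None

# Any other CSV can be pulled later from the cached zip on demand
extra_df = load_from_zip(results.cached_zip_path, "object_mislabel_suspects.csv")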