rapidata 2.7.0__py3-none-any.whl → 2.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rapidata might be problematic. Click here for more details.

@@ -1,6 +1,7 @@
1
- """Media Asset Module
1
+ """Media Asset Module with Lazy Loading
2
2
 
3
3
  Defines the MediaAsset class for handling media file paths within assets.
4
+ Implements lazy loading for URL-based media to prevent unnecessary downloads.
4
5
  """
5
6
 
6
7
  import os
@@ -12,18 +13,20 @@ from PIL import Image
12
13
  from tinytag import TinyTag
13
14
  import tempfile
14
15
  from pydantic import StrictStr, StrictBytes
15
- from typing import Optional
16
+ from typing import Optional, cast
16
17
  import logging
18
+ from functools import cached_property
17
19
 
18
20
 
19
21
  class MediaAsset(BaseAsset):
20
- """MediaAsset Class
22
+ """MediaAsset Class with Lazy Loading
21
23
 
22
- Represents a media asset by storing the file path.
24
+ Represents a media asset by storing the file path or URL.
25
+ Only downloads URL content when needed.
23
26
  Supports local files and URLs for images, MP3, and MP4.
24
27
 
25
28
  Args:
26
- path (str): The file system path to the media asset.
29
+ path (str): The file system path to the media asset or URL.
27
30
 
28
31
  Raises:
29
32
  FileNotFoundError: If the provided file path does not exist.
@@ -67,23 +70,40 @@ class MediaAsset(BaseAsset):
67
70
 
68
71
  Raises:
69
72
  FileNotFoundError: If the provided file path does not exist.
70
- ValueError: If media type is unsupported or duration exceeds 25 seconds.
73
+ ValueError: If path is not a string.
71
74
  """
72
75
  if not isinstance(path, str):
73
76
  raise ValueError("Media must be a string, either a local file path or a URL")
74
77
 
78
+ self._url = None
79
+ self._content = None
80
+
75
81
  if re.match(r'^https?://', path):
76
- self.path = self.__get_media_bytes(path)
82
+ self._url = path
77
83
  self.name = path.split('/')[-1]
78
84
  self.name = self.__check_name_ending(self.name)
85
+ self.path = None # Will be set when content is downloaded
79
86
  return
80
87
 
81
88
  if not os.path.exists(path):
82
89
  raise FileNotFoundError(f"File not found: {path}")
83
90
 
84
- self.path: str | bytes = path
91
+ self.path = path
85
92
  self.name = path
86
93
 
94
+ @cached_property
95
+ def content(self) -> bytes:
96
+ """
97
+ Lazy loader for URL content. Only downloads when first accessed.
98
+ Uses cached_property to store the result after first download.
99
+ """
100
+ if self._url is None:
101
+ self.path = cast(str, self.path)
102
+ with open(self.path, 'rb') as f:
103
+ return f.read()
104
+
105
+ return self.__get_media_bytes(self._url)
106
+
87
107
  def get_duration(self) -> int:
88
108
  """
89
109
  Get the duration of audio/video files in milliseconds.
@@ -97,27 +117,22 @@ class MediaAsset(BaseAsset):
97
117
  """
98
118
  path_to_check = self.name.lower()
99
119
 
100
- # Return 0 for other static images
120
+ # Return 0 for static images
101
121
  if any(path_to_check.endswith(ext) for ext in ('.jpg', '.jpeg', '.png', '.webp', '.gif')):
102
122
  return 0
103
123
 
104
124
  try:
105
- # For URL downloads (bytes), write to temporary file first
106
- if isinstance(self.path, bytes):
107
- with tempfile.NamedTemporaryFile(suffix=os.path.splitext(self.name)[1], delete=False) as tmp:
108
- tmp.write(self.path)
109
- tmp.flush()
110
- # Close the file so it can be read
111
- tmp_path = tmp.name
125
+ # Create temporary file from content
126
+ with tempfile.NamedTemporaryFile(suffix=os.path.splitext(self.name)[1], delete=False) as tmp:
127
+ tmp.write(self.content)
128
+ tmp.flush()
129
+ tmp_path = tmp.name
112
130
 
113
131
  try:
114
132
  tag = TinyTag.get(tmp_path)
115
133
  finally:
116
134
  # Clean up the temporary file
117
135
  os.unlink(tmp_path)
118
- else:
119
- # For local files, use path directly
120
- tag = TinyTag.get(self.path)
121
136
 
122
137
  if tag.duration is None:
123
138
  raise ValueError("Could not read duration from file")
@@ -136,17 +151,14 @@ class MediaAsset(BaseAsset):
136
151
  return None
137
152
 
138
153
  try:
139
- if isinstance(self.path, bytes):
140
- img = Image.open(BytesIO(self.path))
141
- else:
142
- img = Image.open(self.path)
154
+ img = Image.open(BytesIO(self.content))
143
155
  return img.size
144
156
  except Exception:
145
157
  return None
146
158
 
147
159
  def set_custom_name(self, name: str) -> 'MediaAsset':
148
160
  """Set a custom name for the media asset (only works with URLs)."""
149
- if isinstance(self.path, bytes):
161
+ if self._url is not None:
150
162
  self.name = self.__check_name_ending(name)
151
163
  else:
152
164
  raise ValueError("Custom name can only be set for URLs.")
@@ -265,8 +277,10 @@ class MediaAsset(BaseAsset):
265
277
  self._logger.error(error_msg)
266
278
  raise ValueError(error_msg)
267
279
 
268
- def to_file(self) -> StrictStr | tuple[StrictStr, StrictBytes] | StrictBytes: # types for autogenerated models
269
- if isinstance(self.path, str):
280
+ def to_file(self) -> StrictStr | tuple[StrictStr, StrictBytes] | StrictBytes:
281
+ """Convert the media asset to a file representation."""
282
+ if self._url is None:
283
+ self.path = cast(str, self.path)
270
284
  return self.path
271
- else: # isinstance(self.path, bytes)
272
- return (self.name, self.path)
285
+ else:
286
+ return (self.name, self.content)
@@ -36,6 +36,8 @@ from rapidata.api_client.models.root_filter import RootFilter
36
36
  from rapidata.api_client.models.filter import Filter
37
37
  from rapidata.api_client.models.sort_criterion import SortCriterion
38
38
 
39
+ from tqdm import tqdm
40
+
39
41
 
40
42
  class RapidataOrderManager:
41
43
  """
@@ -58,7 +60,7 @@ class RapidataOrderManager:
58
60
  return [ValidationSelection(validation_set_id=validation_set_id), LabelingSelection(amount=labeling_amount-1)]
59
61
  return [LabelingSelection(amount=labeling_amount)]
60
62
 
61
- def __create_general_order(self,
63
+ def _create_general_order(self,
62
64
  name: str,
63
65
  workflow: Workflow,
64
66
  assets: list[MediaAsset] | list[TextAsset] | list[MultiAsset],
@@ -168,7 +170,7 @@ class RapidataOrderManager:
168
170
  else:
169
171
  raise ValueError(f"Unsupported data type: {data_type}, must be one of {RapidataDataTypes._possible_values()}")
170
172
 
171
- return self.__create_general_order(
173
+ return self._create_general_order(
172
174
  name=name,
173
175
  workflow=ClassifyWorkflow(
174
176
  instruction=instruction,
@@ -226,7 +228,7 @@ class RapidataOrderManager:
226
228
  else:
227
229
  raise ValueError(f"Unsupported data type: {data_type}, must be one of {RapidataDataTypes._possible_values()}")
228
230
 
229
- return self.__create_general_order(
231
+ return self._create_general_order(
230
232
  name=name,
231
233
  workflow=CompareWorkflow(
232
234
  instruction=instruction
@@ -273,7 +275,7 @@ class RapidataOrderManager:
273
275
  else:
274
276
  raise ValueError(f"Unsupported data type: {data_type}, must be one of {RapidataDataTypes._possible_values()}")
275
277
 
276
- return self.__create_general_order(
278
+ return self._create_general_order(
277
279
  name=name,
278
280
  workflow=FreeTextWorkflow(
279
281
  instruction=instruction
@@ -316,7 +318,7 @@ class RapidataOrderManager:
316
318
 
317
319
  assets = [MediaAsset(path=path) for path in datapoints]
318
320
 
319
- return self.__create_general_order(
321
+ return self._create_general_order(
320
322
  name=name,
321
323
  workflow=SelectWordsWorkflow(
322
324
  instruction=instruction
@@ -361,7 +363,7 @@ class RapidataOrderManager:
361
363
 
362
364
  assets = [MediaAsset(path=path) for path in datapoints]
363
365
 
364
- return self.__create_general_order(
366
+ return self._create_general_order(
365
367
  name=name,
366
368
  workflow=LocateWorkflow(target=instruction),
367
369
  assets=assets,
@@ -403,7 +405,7 @@ class RapidataOrderManager:
403
405
 
404
406
  assets = [MediaAsset(path=path) for path in datapoints]
405
407
 
406
- return self.__create_general_order(
408
+ return self._create_general_order(
407
409
  name=name,
408
410
  workflow=DrawWorkflow(target=instruction),
409
411
  assets=assets,
@@ -444,11 +446,11 @@ class RapidataOrderManager:
444
446
 
445
447
  assets = [MediaAsset(path=path) for path in datapoints]
446
448
 
447
- for asset in assets:
449
+ for asset in tqdm(assets, desc="Downloading assets and checking duration"):
448
450
  if not asset.get_duration():
449
451
  raise ValueError("The datapoints for this order must have a duration. (e.g. video or audio)")
450
452
 
451
- return self.__create_general_order(
453
+ return self._create_general_order(
452
454
  name=name,
453
455
  workflow=TimestampWorkflow(
454
456
  instruction=instruction
@@ -15,6 +15,7 @@ from urllib3._collections import HTTPHeaderDict
15
15
  from rapidata.rapidata_client.validation.rapids.box import Box
16
16
 
17
17
  from rapidata.api_client.models.query_validation_set_model import QueryValidationSetModel
18
+ from tqdm import tqdm
18
19
 
19
20
 
20
21
  class ValidationSetManager:
@@ -404,7 +405,7 @@ class ValidationSetManager:
404
405
  openapi_service=self.__openapi_service
405
406
  )
406
407
 
407
- for rapid in rapids:
408
+ for rapid in tqdm(rapids, desc="Uploading validation tasks"):
408
409
  validation_set.add_rapid(rapid)
409
410
 
410
411
  return validation_set
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: rapidata
3
- Version: 2.7.0
3
+ Version: 2.7.2
4
4
  Summary: Rapidata package containing the Rapidata Python Client to interact with the Rapidata Web API in an easy way.
5
5
  License: Apache-2.0
6
6
  Author: Rapidata AG
@@ -366,7 +366,7 @@ rapidata/api_client_README.md,sha256=IM9O5YglCc-JICZZpH4XZvBI9vM5ZuZXU_4-ot_oulQ
366
366
  rapidata/rapidata_client/__init__.py,sha256=kkT6FMU4P8rTdYKTJgZNcyptr-Rq1iZmsyLIyRCwpYA,896
367
367
  rapidata/rapidata_client/assets/__init__.py,sha256=hKgrOSn8gJcBSULaf4auYhH1S1N5AfcwIhBSq1BOKwQ,323
368
368
  rapidata/rapidata_client/assets/_base_asset.py,sha256=B2YWH1NgaeYUYHDW3OPpHM_bqawHbH4EjnRCE2BYwiM,298
369
- rapidata/rapidata_client/assets/_media_asset.py,sha256=s7_bfhvYEFLIvpZvECsjtVSLnvZmi19xLJoOsDShzgk,9391
369
+ rapidata/rapidata_client/assets/_media_asset.py,sha256=hzopr2J8_T2EJs0nIwNNtAzIqXR8xHiVi870lOPRrW0,9771
370
370
  rapidata/rapidata_client/assets/_multi_asset.py,sha256=o4_-OvovADSVpl0tr6fPzRR_gHCcIQKfeZKcoFDFBLo,1667
371
371
  rapidata/rapidata_client/assets/_text_asset.py,sha256=itoe3vicn41LbdJ7UeydImORUo9iDL0SZu-ptOlbMRM,618
372
372
  rapidata/rapidata_client/assets/data_type_enum.py,sha256=ELC-ymeKnQlfNAzfqsI7MmUuRiGYamCHVcTc0qR6Fm4,185
@@ -397,7 +397,7 @@ rapidata/rapidata_client/order/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5
397
397
  rapidata/rapidata_client/order/_rapidata_dataset.py,sha256=t1OFfHXz3Pe0S8qVGdReZm4aGVvxpVWH-VUgfnNtcgQ,5300
398
398
  rapidata/rapidata_client/order/_rapidata_order_builder.py,sha256=N6mqmgneJSsb_no_Ps9BG3EhDekLgKxFYpjDCN-VVeg,13095
399
399
  rapidata/rapidata_client/order/rapidata_order.py,sha256=Yf-GSlkFRRnC_dOh6VKKO_XJENu7sym77iO6u-q0R2Y,8497
400
- rapidata/rapidata_client/order/rapidata_order_manager.py,sha256=0sbQWNHUjwKPTwAB-a_fOeOLuJYK7xl6rAj54BTn2gg,27201
400
+ rapidata/rapidata_client/order/rapidata_order_manager.py,sha256=4wW5xtmEtdewFGGwMWSMbdLZpHvygtLCfYNv4lcHGg8,27271
401
401
  rapidata/rapidata_client/rapidata_client.py,sha256=A9mnSX6wzVF9TxS1YH87hTi4jCn75dIuP3KZj5Y_vFg,1957
402
402
  rapidata/rapidata_client/referee/__init__.py,sha256=q0Hv9nmfEpyChejtyMLT8hWKL0vTTf_UgUXPYNJ-H6M,153
403
403
  rapidata/rapidata_client/referee/_base_referee.py,sha256=MdFOhdxt3sRnWXLDKLJZKFdVpjBGn9jypPnWWQ6msQA,496
@@ -429,7 +429,7 @@ rapidata/rapidata_client/validation/rapids/__init__.py,sha256=WU5PPwtTJlte6U90MD
429
429
  rapidata/rapidata_client/validation/rapids/box.py,sha256=t3_Kn6doKXdnJdtbwefXnYKPiTKHneJl9E2inkDSqL8,589
430
430
  rapidata/rapidata_client/validation/rapids/rapids.py,sha256=aWkcjWR6Pr2BFwz8a0MfqEkXnEJPkAbKMeQtGXnsags,4440
431
431
  rapidata/rapidata_client/validation/rapids/rapids_manager.py,sha256=4HOX6c42sCOfpGR2aDoiMgrfjontM8z2KEJwQp4ir6A,14247
432
- rapidata/rapidata_client/validation/validation_set_manager.py,sha256=RR3lP3VtOVCtmbKjEAFYVbQxZyUCXvOuWMqw8dGz-WA,22730
432
+ rapidata/rapidata_client/validation/validation_set_manager.py,sha256=X3T1L41v-B1s2JpE8szOpQNOj2ns6leA3NvomNn9TS0,22793
433
433
  rapidata/rapidata_client/workflow/__init__.py,sha256=eFRx0fm280alXpds6hYcnxN_yERlabF9B5sTdPFsL1g,430
434
434
  rapidata/rapidata_client/workflow/_base_workflow.py,sha256=XyIZFKS_RxAuwIHS848S3AyLEHqd07oTD_5jm2oUbsw,762
435
435
  rapidata/rapidata_client/workflow/_classify_workflow.py,sha256=9bT54wxVJgxC-zLk6MVNbseFpzYrvFPjt7DHvxqYfnk,1736
@@ -445,7 +445,7 @@ rapidata/service/credential_manager.py,sha256=Of0BQs_V1T7rkrWX9groLX790nOknaARwn
445
445
  rapidata/service/local_file_service.py,sha256=pgorvlWcx52Uh3cEG6VrdMK_t__7dacQ_5AnfY14BW8,877
446
446
  rapidata/service/openapi_service.py,sha256=Z4NrAuilLlIWBdGOv6otz36tHS_vvU36w5jmvOUTmqo,3198
447
447
  rapidata/service/token_manager.py,sha256=JZ5YbR5Di8dO3H4kK11d0kzWlrXxjgCmeNkHA4AapCM,6425
448
- rapidata-2.7.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
449
- rapidata-2.7.0.dist-info/METADATA,sha256=30UXvN02zlclEO7PGLNXf8LzcTHC6L25hUqL2TEyMhY,1107
450
- rapidata-2.7.0.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
451
- rapidata-2.7.0.dist-info/RECORD,,
448
+ rapidata-2.7.2.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
449
+ rapidata-2.7.2.dist-info/METADATA,sha256=HeqJI37CBd0dICVGEGPSTYst8fLtuz7N5O-OE5iNF6c,1107
450
+ rapidata-2.7.2.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
451
+ rapidata-2.7.2.dist-info/RECORD,,