rapidata 2.4.0__py3-none-any.whl → 2.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rapidata might be problematic. Click here for more details.

@@ -12,6 +12,8 @@ from PIL import Image
12
12
  from tinytag import TinyTag
13
13
  import tempfile
14
14
  from pydantic import StrictStr, StrictBytes
15
+ from typing import Optional
16
+ import logging
15
17
 
16
18
 
17
19
  class MediaAsset(BaseAsset):
@@ -26,6 +28,7 @@ class MediaAsset(BaseAsset):
26
28
  Raises:
27
29
  FileNotFoundError: If the provided file path does not exist.
28
30
  """
31
+ _logger = logging.getLogger(__name__ + '.MediaAsset')
29
32
 
30
33
  ALLOWED_TYPES = [
31
34
  'image/',
@@ -33,6 +36,28 @@ class MediaAsset(BaseAsset):
33
36
  'video/mp4', # MP4
34
37
  ]
35
38
 
39
+ MIME_TYPES = {
40
+ 'jpg': 'image/jpeg',
41
+ 'jpeg': 'image/jpeg',
42
+ 'png': 'image/png',
43
+ 'gif': 'image/gif',
44
+ 'webp': 'image/webp',
45
+ 'mp3': 'audio/mp3',
46
+ 'mp4': 'video/mp4'
47
+ }
48
+
49
+ FILE_SIGNATURES = {
50
+ b'\xFF\xD8\xFF': 'image/jpeg',
51
+ b'\x89PNG\r\n\x1a\n': 'image/png',
52
+ b'GIF87a': 'image/gif',
53
+ b'GIF89a': 'image/gif',
54
+ b'RIFF': 'image/webp',
55
+ b'ID3': 'audio/mp3',
56
+ b'\xFF\xFB': 'audio/mp3',
57
+ b'\xFF\xF3': 'audio/mp3',
58
+ b'ftyp': 'video/mp4',
59
+ }
60
+
36
61
  def __init__(self, path: str):
37
62
  """
38
63
  Initialize a MediaAsset instance.
@@ -134,35 +159,111 @@ class MediaAsset(BaseAsset):
134
159
  name = name + '.jpg'
135
160
  return name
136
161
 
162
def __get_media_type_from_extension(self, url: str) -> Optional[str]:
    """
    Determine media type from URL file extension.

    Only the path component of the URL is inspected, so a query string
    (``?size=large``) or a fragment (``#preview``) following the file
    name does not hide the extension, and a dot inside the host name is
    never mistaken for an extension separator.

    Args:
        url: The URL to check

    Returns:
        Optional[str]: MIME type if valid extension found, None otherwise
    """
    from urllib.parse import urlsplit

    try:
        path = urlsplit(url).path
    except ValueError:
        # urlsplit rejects some malformed URLs (e.g. an unbalanced IPv6
        # bracket); such a URL simply has no usable extension.
        return None

    # Look at the last path segment only, so dots in directory names are ignored.
    file_name = path.rsplit('/', 1)[-1]
    if '.' not in file_name:
        return None

    ext = file_name.rsplit('.', 1)[-1].lower()
    return self.MIME_TYPES.get(ext)
177
+
178
def __validate_image_content(self, content: bytes) -> bool:
    """
    Check whether the given bytes can be opened and verified by PIL.

    Any exception raised while opening or verifying is logged at debug
    level and reported as a failed validation rather than propagated.

    Args:
        content: Image bytes to validate

    Returns:
        bool: True if valid image, False otherwise
    """
    try:
        Image.open(BytesIO(content)).verify()
    except Exception as err:
        self._logger.debug(f"Image validation failed: {str(err)}")
        return False
    return True
195
+
196
def __get_media_type_from_signature(self, content: bytes) -> Optional[str]:
    """
    Determine media type from file signature.

    Signatures from ``FILE_SIGNATURES`` are matched at the position where
    the corresponding format actually stores them, instead of anywhere in
    the first bytes, so unrelated data that merely contains a short magic
    sequence is not misclassified:

    * ``ftyp`` is expected at offset 4, directly after the 4-byte box size.
    * ``RIFF`` is a generic container header (also used by WAV and AVI);
      it is accepted only when the form type at offset 8 is ``WEBP``.
    * every other signature must be the very first bytes of the content.

    Args:
        content: File content bytes

    Returns:
        Optional[str]: MIME type if valid signature found, None otherwise
    """
    header = content[:32]
    for signature, mime_type in self.FILE_SIGNATURES.items():
        if signature == b'ftyp':
            if header[4:8] == signature:
                return mime_type
            continue

        if not header.startswith(signature):
            continue

        if signature == b'RIFF' and header[8:12] != b'WEBP':
            # Some other RIFF-based format; keep looking.
            continue

        return mime_type
    return None
211
+
137
212
  def __get_media_bytes(self, url: str) -> bytes:
138
213
  """
139
- Downloads media files from URL and validates type and duration.
214
+ Downloads and validates media files from URL.
140
215
 
141
216
  Args:
142
217
  url: URL of the media file
143
-
218
+
144
219
  Returns:
145
- bytes: Media data
220
+ bytes: Validated media content
146
221
 
147
222
  Raises:
148
- ValueError: If media type is unsupported or duration exceeds limit
223
+ ValueError: If media type is unsupported or content validation fails
149
224
  requests.exceptions.RequestException: If download fails
150
225
  """
151
- response = requests.get(url, stream=False) # Don't stream, we need full file
152
- response.raise_for_status()
226
+ try:
227
+ response = requests.get(url, stream=False)
228
+ response.raise_for_status()
229
+ except requests.exceptions.RequestException as e:
230
+ self._logger.error(f"Failed to download media from {url}: {str(e)}")
231
+ raise
153
232
 
233
+ content = response.content
154
234
  content_type = response.headers.get('content-type', '').lower()
155
-
156
- # Validate content type
157
- if not any(content_type.startswith(t) for t in self.ALLOWED_TYPES):
158
- raise ValueError(
159
- f'URL does not point to an allowed media type.\n'
160
- f'Content-Type: {content_type}\n'
161
- f'Allowed types: {self.ALLOWED_TYPES}'
162
- )
163
-
164
- content = BytesIO(response.content)
165
- return content.getvalue()
235
+
236
+ # Case 1: Content-type is already allowed
237
+ if any(content_type.startswith(t) for t in self.ALLOWED_TYPES):
238
+ self._logger.debug(f"Content-type {content_type} is allowed")
239
+ return content
240
+
241
+ # Case 2: Try to validate based on extension
242
+ mime_type = self.__get_media_type_from_extension(url)
243
+ if mime_type and mime_type.startswith(tuple(self.ALLOWED_TYPES)):
244
+ self._logger.debug(f"Found valid mime type from extension: {mime_type}")
245
+ return content
246
+
247
+ # Case 3: Try to validate based on file signature
248
+ mime_type = self.__get_media_type_from_signature(content)
249
+ if mime_type and mime_type.startswith(tuple(self.ALLOWED_TYPES)):
250
+ self._logger.debug(f"Found valid mime type from signature: {mime_type}")
251
+ return content
252
+
253
+ # Case 4: Last resort - try direct image validation
254
+ if self.__validate_image_content(content):
255
+ self._logger.debug("Content validated as image through direct validation")
256
+ return content
257
+
258
+ # If we get here, validation failed
259
+ error_msg = (
260
+ f'Could not validate media type from content.\n'
261
+ f'Content-Type: {content_type}\n'
262
+ f'URL extension: {url.split("?")[0].split(".")[-1]}\n'
263
+ f'Allowed types: {self.ALLOWED_TYPES}'
264
+ )
265
+ self._logger.error(error_msg)
266
+ raise ValueError(error_msg)
166
267
 
167
268
  def to_file(self) -> StrictStr | tuple[StrictStr, StrictBytes] | StrictBytes: # types for autogenerated models
168
269
  if isinstance(self.path, str):
@@ -1,3 +1,6 @@
1
+ from rapidata.rapidata_client.validation.rapids.rapids import Rapid
2
+ from rapidata.service.openapi_service import OpenAPIService
3
+
1
4
  class RapidataValidationSet:
2
5
  """A class for interacting with a Rapidata validation set.
3
6
 
@@ -10,9 +13,18 @@ class RapidataValidationSet:
10
13
  name (str): The name of the validation set.
11
14
  """
12
15
 
13
- def __init__(self, validation_set_id, name: str):
16
+ def __init__(self, validation_set_id, name: str, openapi_service: OpenAPIService):
14
17
  self.id = validation_set_id
15
18
  self.name = name
19
+ self.__openapi_service = openapi_service
20
+
21
def add_rapid(self, rapid: Rapid):
    """Attach a single Rapid to this validation set.

    The work is delegated to the rapid itself, which receives this set's
    ID and the OpenAPI service held by this instance.

    Args:
        rapid (Rapid): The Rapid to add to the validation set.
    """
    add_to_set = rapid._add_to_validation_set
    add_to_set(self.id, self.__openapi_service)
16
28
 
17
29
  def __str__(self):
18
30
  return f"name: '{self.name}' id: {self.id}"
@@ -20,6 +20,7 @@ from rapidata.api_client.models.datapoint_metadata_model_metadata_inner import (
20
20
  DatapointMetadataModelMetadataInner,
21
21
  )
22
22
 
23
+ from rapidata.service.openapi_service import OpenAPIService
23
24
 
24
25
  class Rapid():
25
26
  def __init__(self, asset: MediaAsset | TextAsset | MultiAsset, metadata: Sequence[Metadata], payload: Any, truth: Any, randomCorrectProbability: float, explanation: str | None):
@@ -30,7 +31,22 @@ class Rapid():
30
31
  self.randomCorrectProbability = randomCorrectProbability
31
32
  self.explanation = explanation
32
33
 
33
- def to_media_model(self, validationSetId: str) -> tuple[AddValidationRapidModel, list[StrictStr | tuple[StrictStr, StrictBytes] | StrictBytes]]:
34
def _add_to_validation_set(self, validationSetId: str, openapi_service: OpenAPIService):
    """Send this rapid to the validation API, picking the endpoint by asset type.

    Text assets go through the text-rapid endpoint; media assets go through
    the media-rapid endpoint together with their files. A MultiAsset is
    classified by the type of its first contained asset.

    Args:
        validationSetId (str): ID of the validation set the rapid is added to.
        openapi_service (OpenAPIService): Service whose ``validation_api`` is called.

    Raises:
        TypeError: If the asset is neither text nor media (directly or as
            the first element of a MultiAsset).
    """
    def _asset_is(kind) -> bool:
        # Either the asset itself is of this kind, or it is a MultiAsset
        # whose first member is.
        if isinstance(self.asset, kind):
            return True
        return isinstance(self.asset, MultiAsset) and isinstance(self.asset.assets[0], kind)

    if _asset_is(TextAsset):
        openapi_service.validation_api.validation_add_validation_text_rapid_post(
            add_validation_text_rapid_model=self.__to_text_model(validationSetId)
        )
        return

    if _asset_is(MediaAsset):
        media_model, media_files = self.__to_media_model(validationSetId)
        openapi_service.validation_api.validation_add_validation_rapid_post(
            model=media_model, files=media_files
        )
        return

    raise TypeError("The asset must be a MediaAsset, TextAsset, or MultiAsset")
48
+
49
+ def __to_media_model(self, validationSetId: str) -> tuple[AddValidationRapidModel, list[StrictStr | tuple[StrictStr, StrictBytes] | StrictBytes]]:
34
50
  assets: list[MediaAsset] = []
35
51
  if isinstance(self.asset, MultiAsset):
36
52
  for asset in self.asset.assets:
@@ -57,7 +73,7 @@ class Rapid():
57
73
  explanation=self.explanation
58
74
  ), [asset.to_file() for asset in assets])
59
75
 
60
- def to_text_model(self, validationSetId: str) -> AddValidationTextRapidModel:
76
+ def __to_text_model(self, validationSetId: str) -> AddValidationTextRapidModel:
61
77
  texts: list[str] = []
62
78
  if isinstance(self.asset, MultiAsset):
63
79
  for asset in self.asset.assets:
@@ -132,7 +132,7 @@ class RapidsManager:
132
132
  asset = MediaAsset(datapoint)
133
133
  transcription_words = [
134
134
  TranscriptionWord(word=word, wordIndex=i)
135
- for i, word in enumerate(sentence)
135
+ for i, word in enumerate(sentence.split(" "))
136
136
  ]
137
137
 
138
138
  correct_transcription_words: list[TranscriptionWord] = []
@@ -85,7 +85,7 @@ class ValidationSetManager:
85
85
  )
86
86
  )
87
87
 
88
- return self._submit(name=name, data_type=data_type, rapids=rapids, print_confirmation=print_confirmation)
88
+ return self._submit(name=name, rapids=rapids, print_confirmation=print_confirmation)
89
89
 
90
90
  def create_compare_set(self,
91
91
  name: str,
@@ -142,7 +142,7 @@ class ValidationSetManager:
142
142
  )
143
143
  )
144
144
 
145
- return self._submit(name=name, data_type=data_type, rapids=rapids, print_confirmation=print_confirmation)
145
+ return self._submit(name=name, rapids=rapids, print_confirmation=print_confirmation)
146
146
 
147
147
  def create_select_words_set(self,
148
148
  name: str,
@@ -197,7 +197,7 @@ class ValidationSetManager:
197
197
  )
198
198
  )
199
199
 
200
- return self._submit(name=name, data_type=RapidataDataTypes.MEDIA, rapids=rapids, print_confirmation=print_confirmation)
200
+ return self._submit(name=name, rapids=rapids, print_confirmation=print_confirmation)
201
201
 
202
202
  def create_locate_set(self,
203
203
  name: str,
@@ -249,7 +249,7 @@ class ValidationSetManager:
249
249
  )
250
250
  )
251
251
 
252
- return self._submit(name=name, data_type=RapidataDataTypes.MEDIA, rapids=rapids, print_confirmation=print_confirmation)
252
+ return self._submit(name=name, rapids=rapids, print_confirmation=print_confirmation)
253
253
 
254
254
  def create_draw_set(self,
255
255
  name: str,
@@ -300,7 +300,7 @@ class ValidationSetManager:
300
300
  )
301
301
  )
302
302
 
303
- return self._submit(name=name, data_type=RapidataDataTypes.MEDIA, rapids=rapids, print_confirmation=print_confirmation)
303
+ return self._submit(name=name, rapids=rapids, print_confirmation=print_confirmation)
304
304
 
305
305
  def create_timestamp_set(self,
306
306
  name: str,
@@ -352,11 +352,10 @@ class ValidationSetManager:
352
352
  )
353
353
  )
354
354
 
355
- return self._submit(name=name, data_type=RapidataDataTypes.MEDIA, rapids=rapids, print_confirmation=print_confirmation)
355
+ return self._submit(name=name, rapids=rapids, print_confirmation=print_confirmation)
356
356
 
357
357
  def create_mixed_set(self,
358
358
  name: str,
359
- data_type: str,
360
359
  rapids: list[Rapid],
361
360
  print_confirmation: bool = True
362
361
  ) -> RapidataValidationSet:
@@ -368,7 +367,7 @@ class ValidationSetManager:
368
367
  print_confirmation (bool, optional): Whether to print a confirmation message that validation set has been created. Defaults to True.
369
368
  """
370
369
 
371
- return self._submit(name, data_type, rapids, print_confirmation)
370
+ return self._submit(name, rapids, print_confirmation)
372
371
 
373
372
  def get_validation_set_by_id(self, validation_set_id: str) -> RapidataValidationSet:
374
373
  """Get a validation set by ID.
@@ -384,9 +383,9 @@ class ValidationSetManager:
384
383
  except Exception:
385
384
  raise ValueError(f"ValidationSet with ID {validation_set_id} not found.")
386
385
 
387
- return RapidataValidationSet(validation_set_id, validation_set.name)
386
+ return RapidataValidationSet(validation_set_id, validation_set.name, self.__openapi_service)
388
387
 
389
- def _submit(self, name: str, data_type: str, rapids: list[Rapid], print_confirmation: bool) -> RapidataValidationSet:
388
+ def _submit(self, name: str, rapids: list[Rapid], print_confirmation: bool) -> RapidataValidationSet:
390
389
  validation_set_id = (
391
390
  self.__openapi_service.validation_api.validation_create_validation_set_post(
392
391
  name=name
@@ -399,22 +398,17 @@ class ValidationSetManager:
399
398
  if print_confirmation:
400
399
  print(f"Validation set '{name}' created with ID {validation_set_id}")
401
400
 
402
- for rapid in rapids:
403
- if data_type == RapidataDataTypes.TEXT:
404
- self.__openapi_service.validation_api.validation_add_validation_text_rapid_post(
405
- add_validation_text_rapid_model=rapid.to_text_model(validation_set_id)
406
- )
407
- else:
408
- model = rapid.to_media_model(validation_set_id)
409
- self.__openapi_service.validation_api.validation_add_validation_rapid_post(
410
- model=model[0], files=model[1]
411
- )
412
-
413
- return RapidataValidationSet(
401
+ validation_set = RapidataValidationSet(
414
402
  name=name,
415
403
  validation_set_id=validation_set_id,
404
+ openapi_service=self.__openapi_service
416
405
  )
417
406
 
407
+ for rapid in rapids:
408
+ validation_set.add_rapid(rapid)
409
+
410
+ return validation_set
411
+
418
412
 
419
413
  def find_validation_sets(self, name: str = "", amount: int = 1) -> list[RapidataValidationSet]:
420
414
  """Find validation sets by name.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: rapidata
3
- Version: 2.4.0
3
+ Version: 2.5.0
4
4
  Summary: Rapidata package containing the Rapidata Python Client to interact with the Rapidata Web API in an easy way.
5
5
  License: Apache-2.0
6
6
  Author: Rapidata AG
@@ -349,7 +349,7 @@ rapidata/api_client_README.md,sha256=TwdMWXKwwnzD5CyLjXOqAwKVyMxTJcKewudONv31p40
349
349
  rapidata/rapidata_client/__init__.py,sha256=kkT6FMU4P8rTdYKTJgZNcyptr-Rq1iZmsyLIyRCwpYA,896
350
350
  rapidata/rapidata_client/assets/__init__.py,sha256=hKgrOSn8gJcBSULaf4auYhH1S1N5AfcwIhBSq1BOKwQ,323
351
351
  rapidata/rapidata_client/assets/_base_asset.py,sha256=B2YWH1NgaeYUYHDW3OPpHM_bqawHbH4EjnRCE2BYwiM,298
352
- rapidata/rapidata_client/assets/_media_asset.py,sha256=dji8rM2W6TtzIPDiVVyaHfEDNTmu8uo3rsgrPUcbqDE,5949
352
+ rapidata/rapidata_client/assets/_media_asset.py,sha256=s7_bfhvYEFLIvpZvECsjtVSLnvZmi19xLJoOsDShzgk,9391
353
353
  rapidata/rapidata_client/assets/_multi_asset.py,sha256=o4_-OvovADSVpl0tr6fPzRR_gHCcIQKfeZKcoFDFBLo,1667
354
354
  rapidata/rapidata_client/assets/_text_asset.py,sha256=itoe3vicn41LbdJ7UeydImORUo9iDL0SZu-ptOlbMRM,618
355
355
  rapidata/rapidata_client/assets/data_type_enum.py,sha256=ELC-ymeKnQlfNAzfqsI7MmUuRiGYamCHVcTc0qR6Fm4,185
@@ -406,12 +406,12 @@ rapidata/rapidata_client/settings/play_video_until_the_end.py,sha256=LLHx2_72k5Z
406
406
  rapidata/rapidata_client/settings/rapidata_settings.py,sha256=Kjxm4GStrpfLKylx84BiKdEQOjvNfO74DiSChmH63fg,1165
407
407
  rapidata/rapidata_client/settings/translation_behaviour.py,sha256=i9n_H0eKJyKW6m3MKH_Cm1XEKWVEWsAV_79xGmGIC-4,742
408
408
  rapidata/rapidata_client/validation/__init__.py,sha256=s5wHVtcJkncXSFuL9I0zNwccNOKpWAqxqUjkeohzi2E,24
409
- rapidata/rapidata_client/validation/rapidata_validation_set.py,sha256=c4ywPef1yGwx0D9wTzj7465p-VQa1V6WnHJUU5ReK7Q,646
409
+ rapidata/rapidata_client/validation/rapidata_validation_set.py,sha256=IkHOdQX3ihBBHDvG2EE6wu1ZSG7evOWMYyOx9Lva0tw,1111
410
410
  rapidata/rapidata_client/validation/rapids/__init__.py,sha256=WU5PPwtTJlte6U90MDakzx4I8Y0laj7siw9teeXj5R0,21
411
411
  rapidata/rapidata_client/validation/rapids/box.py,sha256=t3_Kn6doKXdnJdtbwefXnYKPiTKHneJl9E2inkDSqL8,589
412
- rapidata/rapidata_client/validation/rapids/rapids.py,sha256=sZEzKj4FdVChm_rUwcmG_v7X-yO5Eh4q1g54T4c4puc,3507
413
- rapidata/rapidata_client/validation/rapids/rapids_manager.py,sha256=rDmxYkv067O-S95TNc789FktfCPBCudyzBRLSArmfus,14236
414
- rapidata/rapidata_client/validation/validation_set_manager.py,sha256=I2ICynEOjgmp5KDUceXCKGyKGSt21N_GTn1Mi4-6DTc,23295
412
+ rapidata/rapidata_client/validation/rapids/rapids.py,sha256=aWkcjWR6Pr2BFwz8a0MfqEkXnEJPkAbKMeQtGXnsags,4440
413
+ rapidata/rapidata_client/validation/rapids/rapids_manager.py,sha256=4HOX6c42sCOfpGR2aDoiMgrfjontM8z2KEJwQp4ir6A,14247
414
+ rapidata/rapidata_client/validation/validation_set_manager.py,sha256=RR3lP3VtOVCtmbKjEAFYVbQxZyUCXvOuWMqw8dGz-WA,22730
415
415
  rapidata/rapidata_client/workflow/__init__.py,sha256=eFRx0fm280alXpds6hYcnxN_yERlabF9B5sTdPFsL1g,430
416
416
  rapidata/rapidata_client/workflow/_base_workflow.py,sha256=XyIZFKS_RxAuwIHS848S3AyLEHqd07oTD_5jm2oUbsw,762
417
417
  rapidata/rapidata_client/workflow/_classify_workflow.py,sha256=9bT54wxVJgxC-zLk6MVNbseFpzYrvFPjt7DHvxqYfnk,1736
@@ -427,7 +427,7 @@ rapidata/service/credential_manager.py,sha256=Of0BQs_V1T7rkrWX9groLX790nOknaARwn
427
427
  rapidata/service/local_file_service.py,sha256=pgorvlWcx52Uh3cEG6VrdMK_t__7dacQ_5AnfY14BW8,877
428
428
  rapidata/service/openapi_service.py,sha256=Z4NrAuilLlIWBdGOv6otz36tHS_vvU36w5jmvOUTmqo,3198
429
429
  rapidata/service/token_manager.py,sha256=JZ5YbR5Di8dO3H4kK11d0kzWlrXxjgCmeNkHA4AapCM,6425
430
- rapidata-2.4.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
431
- rapidata-2.4.0.dist-info/METADATA,sha256=ZbXMkT2eRglrqdHTuUgOJlRhKOg8BneVJ2rPRs2-U6k,1107
432
- rapidata-2.4.0.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
433
- rapidata-2.4.0.dist-info/RECORD,,
430
+ rapidata-2.5.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
431
+ rapidata-2.5.0.dist-info/METADATA,sha256=Bi3-kOaNpfnjqHS9QwC0ua5dnnlhluiMZdopp0fx08U,1107
432
+ rapidata-2.5.0.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
433
+ rapidata-2.5.0.dist-info/RECORD,,