rapidata 2.4.0__py3-none-any.whl → 2.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,6 +12,8 @@ from PIL import Image
12
12
  from tinytag import TinyTag
13
13
  import tempfile
14
14
  from pydantic import StrictStr, StrictBytes
15
+ from typing import Optional
16
+ import logging
15
17
 
16
18
 
17
19
  class MediaAsset(BaseAsset):
@@ -26,6 +28,7 @@ class MediaAsset(BaseAsset):
26
28
  Raises:
27
29
  FileNotFoundError: If the provided file path does not exist.
28
30
  """
31
+ _logger = logging.getLogger(__name__ + '.MediaAsset')
29
32
 
30
33
  ALLOWED_TYPES = [
31
34
  'image/',
@@ -33,6 +36,28 @@ class MediaAsset(BaseAsset):
33
36
  'video/mp4', # MP4
34
37
  ]
35
38
 
39
+ MIME_TYPES = {
40
+ 'jpg': 'image/jpeg',
41
+ 'jpeg': 'image/jpeg',
42
+ 'png': 'image/png',
43
+ 'gif': 'image/gif',
44
+ 'webp': 'image/webp',
45
+ 'mp3': 'audio/mp3',
46
+ 'mp4': 'video/mp4'
47
+ }
48
+
49
+ FILE_SIGNATURES = {
50
+ b'\xFF\xD8\xFF': 'image/jpeg',
51
+ b'\x89PNG\r\n\x1a\n': 'image/png',
52
+ b'GIF87a': 'image/gif',
53
+ b'GIF89a': 'image/gif',
54
+ b'RIFF': 'image/webp',
55
+ b'ID3': 'audio/mp3',
56
+ b'\xFF\xFB': 'audio/mp3',
57
+ b'\xFF\xF3': 'audio/mp3',
58
+ b'ftyp': 'video/mp4',
59
+ }
60
+
36
61
  def __init__(self, path: str):
37
62
  """
38
63
  Initialize a MediaAsset instance.
@@ -134,35 +159,111 @@ class MediaAsset(BaseAsset):
134
159
  name = name + '.jpg'
135
160
  return name
136
161
 
162
def __get_media_type_from_extension(self, url: str) -> Optional[str]:
    """
    Determine media type from URL file extension.

    Only the path component of the URL is inspected, so query strings,
    fragments and dots in the host name are never mistaken for a file
    extension. The lookup is case-insensitive.

    Args:
        url: The URL to check

    Returns:
        Optional[str]: MIME type if valid extension found, None otherwise
    """
    # Function-scope imports: only this method needs them.
    from posixpath import splitext
    from urllib.parse import urlsplit

    try:
        path = urlsplit(url).path
    except ValueError:
        # urlsplit rejects some malformed URLs (e.g. an unbalanced IPv6 host).
        return None

    # splitext yields '' when there is no extension, otherwise '.ext'.
    extension = splitext(path)[1]
    return self.MIME_TYPES.get(extension[1:].lower())
177
+
178
def __validate_image_content(self, content: bytes) -> bool:
    """
    Check whether the given bytes can be opened and verified as an image by PIL.

    Args:
        content: Raw bytes of the candidate image

    Returns:
        bool: True if PIL opens and verifies the data, False on any failure
    """
    try:
        Image.open(BytesIO(content)).verify()
    except Exception as err:
        # Any failure simply means "not a valid image"; record why at debug level.
        self._logger.debug(f"Image validation failed: {str(err)}")
        return False
    return True
195
+
196
def __get_media_type_from_signature(self, content: bytes) -> Optional[str]:
    """
    Determine media type from file signature.

    Signatures from FILE_SIGNATURES are matched at the position where the
    format defines them, not anywhere near the start of the data:

    * 'ftyp' is the box type of an ISO base media file and sits at
      offset 4, after the 4-byte box size.
    * 'RIFF' is a generic container header shared with WAV and AVI; it
      only counts when the form type at bytes 8-12 is 'WEBP'.
    * every other signature must appear at offset 0.

    Args:
        content: File content bytes

    Returns:
        Optional[str]: MIME type if valid signature found, None otherwise
    """
    header = content[:32]
    for signature, mime_type in self.FILE_SIGNATURES.items():
        if signature == b'ftyp':
            matched = header[4:8] == signature
        elif signature == b'RIFF':
            matched = header.startswith(signature) and header[8:12] == b'WEBP'
        else:
            matched = header.startswith(signature)
        if matched:
            return mime_type
    return None
211
+
137
212
  def __get_media_bytes(self, url: str) -> bytes:
138
213
  """
139
- Downloads media files from URL and validates type and duration.
214
+ Downloads and validates media files from URL.
140
215
 
141
216
  Args:
142
217
  url: URL of the media file
143
-
218
+
144
219
  Returns:
145
- bytes: Media data
220
+ bytes: Validated media content
146
221
 
147
222
  Raises:
148
- ValueError: If media type is unsupported or duration exceeds limit
223
+ ValueError: If media type is unsupported or content validation fails
149
224
  requests.exceptions.RequestException: If download fails
150
225
  """
151
- response = requests.get(url, stream=False) # Don't stream, we need full file
152
- response.raise_for_status()
226
+ try:
227
+ response = requests.get(url, stream=False)
228
+ response.raise_for_status()
229
+ except requests.exceptions.RequestException as e:
230
+ self._logger.error(f"Failed to download media from {url}: {str(e)}")
231
+ raise
153
232
 
233
+ content = response.content
154
234
  content_type = response.headers.get('content-type', '').lower()
155
-
156
- # Validate content type
157
- if not any(content_type.startswith(t) for t in self.ALLOWED_TYPES):
158
- raise ValueError(
159
- f'URL does not point to an allowed media type.\n'
160
- f'Content-Type: {content_type}\n'
161
- f'Allowed types: {self.ALLOWED_TYPES}'
162
- )
163
-
164
- content = BytesIO(response.content)
165
- return content.getvalue()
235
+
236
+ # Case 1: Content-type is already allowed
237
+ if any(content_type.startswith(t) for t in self.ALLOWED_TYPES):
238
+ self._logger.debug(f"Content-type {content_type} is allowed")
239
+ return content
240
+
241
+ # Case 2: Try to validate based on extension
242
+ mime_type = self.__get_media_type_from_extension(url)
243
+ if mime_type and mime_type.startswith(tuple(self.ALLOWED_TYPES)):
244
+ self._logger.debug(f"Found valid mime type from extension: {mime_type}")
245
+ return content
246
+
247
+ # Case 3: Try to validate based on file signature
248
+ mime_type = self.__get_media_type_from_signature(content)
249
+ if mime_type and mime_type.startswith(tuple(self.ALLOWED_TYPES)):
250
+ self._logger.debug(f"Found valid mime type from signature: {mime_type}")
251
+ return content
252
+
253
+ # Case 4: Last resort - try direct image validation
254
+ if self.__validate_image_content(content):
255
+ self._logger.debug("Content validated as image through direct validation")
256
+ return content
257
+
258
+ # If we get here, validation failed
259
+ error_msg = (
260
+ f'Could not validate media type from content.\n'
261
+ f'Content-Type: {content_type}\n'
262
+ f'URL extension: {url.split("?")[0].split(".")[-1]}\n'
263
+ f'Allowed types: {self.ALLOWED_TYPES}'
264
+ )
265
+ self._logger.error(error_msg)
266
+ raise ValueError(error_msg)
166
267
 
167
268
  def to_file(self) -> StrictStr | tuple[StrictStr, StrictBytes] | StrictBytes: # types for autogenerated models
168
269
  if isinstance(self.path, str):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: rapidata
3
- Version: 2.4.0
3
+ Version: 2.4.1
4
4
  Summary: Rapidata package containing the Rapidata Python Client to interact with the Rapidata Web API in an easy way.
5
5
  License: Apache-2.0
6
6
  Author: Rapidata AG
@@ -349,7 +349,7 @@ rapidata/api_client_README.md,sha256=TwdMWXKwwnzD5CyLjXOqAwKVyMxTJcKewudONv31p40
349
349
  rapidata/rapidata_client/__init__.py,sha256=kkT6FMU4P8rTdYKTJgZNcyptr-Rq1iZmsyLIyRCwpYA,896
350
350
  rapidata/rapidata_client/assets/__init__.py,sha256=hKgrOSn8gJcBSULaf4auYhH1S1N5AfcwIhBSq1BOKwQ,323
351
351
  rapidata/rapidata_client/assets/_base_asset.py,sha256=B2YWH1NgaeYUYHDW3OPpHM_bqawHbH4EjnRCE2BYwiM,298
352
- rapidata/rapidata_client/assets/_media_asset.py,sha256=dji8rM2W6TtzIPDiVVyaHfEDNTmu8uo3rsgrPUcbqDE,5949
352
+ rapidata/rapidata_client/assets/_media_asset.py,sha256=s7_bfhvYEFLIvpZvECsjtVSLnvZmi19xLJoOsDShzgk,9391
353
353
  rapidata/rapidata_client/assets/_multi_asset.py,sha256=o4_-OvovADSVpl0tr6fPzRR_gHCcIQKfeZKcoFDFBLo,1667
354
354
  rapidata/rapidata_client/assets/_text_asset.py,sha256=itoe3vicn41LbdJ7UeydImORUo9iDL0SZu-ptOlbMRM,618
355
355
  rapidata/rapidata_client/assets/data_type_enum.py,sha256=ELC-ymeKnQlfNAzfqsI7MmUuRiGYamCHVcTc0qR6Fm4,185
@@ -427,7 +427,7 @@ rapidata/service/credential_manager.py,sha256=Of0BQs_V1T7rkrWX9groLX790nOknaARwn
427
427
  rapidata/service/local_file_service.py,sha256=pgorvlWcx52Uh3cEG6VrdMK_t__7dacQ_5AnfY14BW8,877
428
428
  rapidata/service/openapi_service.py,sha256=Z4NrAuilLlIWBdGOv6otz36tHS_vvU36w5jmvOUTmqo,3198
429
429
  rapidata/service/token_manager.py,sha256=JZ5YbR5Di8dO3H4kK11d0kzWlrXxjgCmeNkHA4AapCM,6425
430
- rapidata-2.4.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
431
- rapidata-2.4.0.dist-info/METADATA,sha256=ZbXMkT2eRglrqdHTuUgOJlRhKOg8BneVJ2rPRs2-U6k,1107
432
- rapidata-2.4.0.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
433
- rapidata-2.4.0.dist-info/RECORD,,
430
+ rapidata-2.4.1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
431
+ rapidata-2.4.1.dist-info/METADATA,sha256=nOQUJTI2cPvoUj7WS7N1Y2d5L_ZW1Tx81tb8SawafqQ,1107
432
+ rapidata-2.4.1.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
433
+ rapidata-2.4.1.dist-info/RECORD,,