rapidata 2.36.1__py3-none-any.whl → 2.36.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rapidata might be problematic. Click here for more details.

Files changed (95)
  1. rapidata/__init__.py +2 -2
  2. rapidata/api_client/__init__.py +2 -4
  3. rapidata/api_client/api/validation_set_api.py +54 -31
  4. rapidata/api_client/models/__init__.py +2 -4
  5. rapidata/api_client/models/add_validation_rapid_model.py +17 -2
  6. rapidata/api_client/models/asset_metadata.py +9 -1
  7. rapidata/api_client/models/boost_query_result.py +5 -17
  8. rapidata/api_client/models/campaign_query_result.py +3 -9
  9. rapidata/api_client/models/classification_metadata.py +12 -1
  10. rapidata/api_client/models/compare_workflow_config.py +22 -12
  11. rapidata/api_client/models/compare_workflow_config_model.py +12 -2
  12. rapidata/api_client/models/compare_workflow_model.py +12 -2
  13. rapidata/api_client/models/count_metadata.py +12 -1
  14. rapidata/api_client/models/create_demographic_rapid_model.py +18 -3
  15. rapidata/api_client/models/create_order_model.py +6 -48
  16. rapidata/api_client/models/effort_capped_selection.py +2 -11
  17. rapidata/api_client/models/evaluation_workflow_config.py +13 -3
  18. rapidata/api_client/models/evaluation_workflow_model.py +13 -3
  19. rapidata/api_client/models/file_type_metadata.py +11 -6
  20. rapidata/api_client/models/file_type_metadata_model.py +2 -8
  21. rapidata/api_client/models/filter.py +5 -23
  22. rapidata/api_client/models/get_datapoint_by_id_result.py +3 -9
  23. rapidata/api_client/models/get_rapid_responses_result.py +3 -9
  24. rapidata/api_client/models/get_recommended_validation_set_result.py +95 -0
  25. rapidata/api_client/models/get_standing_by_id_result.py +3 -9
  26. rapidata/api_client/models/get_validation_rapids_result.py +3 -9
  27. rapidata/api_client/models/get_workflow_progress_result.py +3 -9
  28. rapidata/api_client/models/get_workflow_results_result.py +3 -9
  29. rapidata/api_client/models/image_dimension_metadata.py +12 -1
  30. rapidata/api_client/models/labeling_selection.py +2 -11
  31. rapidata/api_client/models/location_metadata.py +12 -1
  32. rapidata/api_client/models/order_model.py +3 -9
  33. rapidata/api_client/models/original_filename_metadata.py +12 -1
  34. rapidata/api_client/models/participant_by_benchmark.py +3 -9
  35. rapidata/api_client/models/prompt_metadata.py +12 -1
  36. rapidata/api_client/models/rapid_model.py +3 -9
  37. rapidata/api_client/models/report_model.py +3 -9
  38. rapidata/api_client/models/response_count_filter.py +2 -8
  39. rapidata/api_client/models/response_count_user_filter_model.py +2 -8
  40. rapidata/api_client/models/root_filter.py +3 -12
  41. rapidata/api_client/models/runs_by_leaderboard_result.py +3 -9
  42. rapidata/api_client/models/simple_workflow_config.py +13 -3
  43. rapidata/api_client/models/simple_workflow_config_model.py +11 -3
  44. rapidata/api_client/models/simple_workflow_model.py +13 -3
  45. rapidata/api_client/models/sort_criterion.py +3 -9
  46. rapidata/api_client/models/source_url_metadata.py +12 -1
  47. rapidata/api_client/models/standing_by_leaderboard.py +3 -9
  48. rapidata/api_client/models/streams_metadata.py +12 -1
  49. rapidata/api_client/models/text_metadata.py +12 -1
  50. rapidata/api_client/models/transcription_metadata.py +9 -1
  51. rapidata/api_client/models/update_should_alert_model.py +1 -1
  52. rapidata/api_client/models/validation_set_model.py +12 -24
  53. rapidata/api_client/models/video_duration_metadata.py +12 -1
  54. rapidata/api_client/models/workflow_aggregation_step_model.py +3 -12
  55. rapidata/api_client_README.md +2 -4
  56. rapidata/rapidata_client/__init__.py +1 -1
  57. rapidata/rapidata_client/benchmark/participant/_participant.py +5 -5
  58. rapidata/rapidata_client/benchmark/rapidata_benchmark.py +2 -1
  59. rapidata/rapidata_client/benchmark/rapidata_benchmark_manager.py +10 -2
  60. rapidata/rapidata_client/config/__init__.py +1 -1
  61. rapidata/rapidata_client/config/rapidata_config.py +31 -0
  62. rapidata/rapidata_client/datapoints/__init__.py +10 -2
  63. rapidata/rapidata_client/datapoints/{datapoint.py → _datapoint.py} +105 -17
  64. rapidata/rapidata_client/datapoints/assets/_media_asset.py +80 -68
  65. rapidata/rapidata_client/datapoints/assets/_sessions.py +3 -3
  66. rapidata/rapidata_client/datapoints/assets/constants.py +7 -0
  67. rapidata/rapidata_client/exceptions/failed_upload_exception.py +42 -13
  68. rapidata/rapidata_client/filter/response_count_filter.py +16 -11
  69. rapidata/rapidata_client/order/_rapidata_dataset.py +8 -8
  70. rapidata/rapidata_client/order/_rapidata_order_builder.py +87 -8
  71. rapidata/rapidata_client/order/rapidata_order_manager.py +18 -4
  72. rapidata/rapidata_client/rapidata_client.py +6 -0
  73. rapidata/rapidata_client/selection/__init__.py +1 -1
  74. rapidata/rapidata_client/selection/effort_selection.py +18 -7
  75. rapidata/rapidata_client/selection/labeling_selection.py +19 -7
  76. rapidata/rapidata_client/selection/{retrieval_modes.py → rapidata_retrieval_modes.py} +7 -4
  77. rapidata/rapidata_client/validation/rapidata_validation_set.py +26 -1
  78. rapidata/rapidata_client/validation/rapids/rapids.py +46 -19
  79. rapidata/rapidata_client/validation/validation_set_manager.py +41 -4
  80. rapidata/rapidata_client/workflow/_base_workflow.py +27 -0
  81. rapidata/rapidata_client/workflow/_classify_workflow.py +25 -9
  82. rapidata/rapidata_client/workflow/_compare_workflow.py +11 -0
  83. rapidata/rapidata_client/workflow/_draw_workflow.py +15 -7
  84. rapidata/rapidata_client/workflow/_evaluation_workflow.py +8 -1
  85. rapidata/rapidata_client/workflow/_free_text_workflow.py +11 -0
  86. rapidata/rapidata_client/workflow/_locate_workflow.py +15 -7
  87. rapidata/rapidata_client/workflow/_ranking_workflow.py +39 -15
  88. rapidata/rapidata_client/workflow/_select_words_workflow.py +41 -7
  89. rapidata/rapidata_client/workflow/_timestamp_workflow.py +17 -8
  90. rapidata/service/openapi_service.py +1 -1
  91. {rapidata-2.36.1.dist-info → rapidata-2.36.2.dist-info}/METADATA +1 -1
  92. {rapidata-2.36.1.dist-info → rapidata-2.36.2.dist-info}/RECORD +94 -92
  93. rapidata/rapidata_client/config/config.py +0 -33
  94. {rapidata-2.36.1.dist-info → rapidata-2.36.2.dist-info}/LICENSE +0 -0
  95. {rapidata-2.36.1.dist-info → rapidata-2.36.2.dist-info}/WHEEL +0 -0
@@ -1,3 +1,11 @@
1
- from .datapoint import Datapoint
1
+ from ._datapoint import Datapoint
2
2
  from .assets import MediaAsset, MultiAsset, TextAsset
3
- from .metadata import Metadata, PromptMetadata, PrivateTextMetadata, PublicTextMetadata, SelectWordsMetadata, MediaAssetMetadata, PromptIdentifierMetadata
3
+ from .metadata import (
4
+ Metadata,
5
+ PromptMetadata,
6
+ PrivateTextMetadata,
7
+ PublicTextMetadata,
8
+ SelectWordsMetadata,
9
+ MediaAssetMetadata,
10
+ PromptIdentifierMetadata,
11
+ )
@@ -1,21 +1,49 @@
1
1
  from typing import Sequence, cast
2
- from rapidata.rapidata_client.datapoints.assets import MediaAsset, TextAsset, MultiAsset, BaseAsset
2
+ from rapidata.rapidata_client.datapoints.assets import (
3
+ MediaAsset,
4
+ TextAsset,
5
+ MultiAsset,
6
+ BaseAsset,
7
+ )
8
+ from rapidata.rapidata_client.datapoints.assets.constants import (
9
+ ALLOWED_VIDEO_EXTENSIONS,
10
+ ALLOWED_IMAGE_EXTENSIONS,
11
+ ALLOWED_AUDIO_EXTENSIONS,
12
+ )
3
13
  from rapidata.rapidata_client.datapoints.metadata import Metadata
4
- from rapidata.api_client.models.dataset_dataset_id_datapoints_post_request_metadata_inner import DatasetDatasetIdDatapointsPostRequestMetadataInner
5
- from rapidata.api_client.models.create_datapoint_from_text_sources_model import CreateDatapointFromTextSourcesModel
14
+ from rapidata.api_client.models.dataset_dataset_id_datapoints_post_request_metadata_inner import (
15
+ DatasetDatasetIdDatapointsPostRequestMetadataInner,
16
+ )
17
+ from rapidata.api_client.models.create_datapoint_from_text_sources_model import (
18
+ CreateDatapointFromTextSourcesModel,
19
+ )
6
20
  from pydantic import StrictStr, StrictBytes
21
+ from rapidata.api_client.models.asset_type import AssetType
22
+ from rapidata.api_client.models.prompt_type import PromptType
23
+ from rapidata.rapidata_client.datapoints.metadata._media_asset_metadata import (
24
+ MediaAssetMetadata,
25
+ )
26
+ from rapidata.rapidata_client.datapoints.metadata._prompt_metadata import PromptMetadata
27
+ from rapidata.rapidata_client.logging import logger
28
+
7
29
 
8
30
  class Datapoint:
9
- def __init__(self, asset: MediaAsset | TextAsset | MultiAsset, metadata: Sequence[Metadata] | None = None):
31
+ def __init__(
32
+ self,
33
+ asset: MediaAsset | TextAsset | MultiAsset,
34
+ metadata: Sequence[Metadata] | None = None,
35
+ ):
10
36
  if not isinstance(asset, (MediaAsset, TextAsset, MultiAsset)):
11
- raise TypeError("Asset must be of type MediaAsset, TextAsset, or MultiAsset.")
12
-
37
+ raise TypeError(
38
+ "Asset must be of type MediaAsset, TextAsset, or MultiAsset."
39
+ )
40
+
13
41
  if metadata and not isinstance(metadata, Sequence):
14
42
  raise TypeError("Metadata must be a list of Metadata objects.")
15
-
43
+
16
44
  if metadata and not all(isinstance(m, Metadata) for m in metadata):
17
45
  raise TypeError("All metadata objects must be of type Metadata.")
18
-
46
+
19
47
  self.asset = asset
20
48
  self.metadata = metadata
21
49
 
@@ -35,6 +63,45 @@ class Datapoint:
35
63
  effective_type = self._get_effective_asset_type()
36
64
  return issubclass(effective_type, TextAsset)
37
65
 
66
+ def get_asset_type(self) -> AssetType:
67
+ """Get the asset type of the datapoint."""
68
+ if self.is_text_asset():
69
+ return AssetType.TEXT
70
+ elif self.is_media_asset():
71
+ if isinstance(self.asset, MultiAsset):
72
+ asset = self.asset.assets[0]
73
+ else:
74
+ asset = self.asset
75
+ assert isinstance(asset, MediaAsset)
76
+ if any(asset.path.endswith(ext) for ext in ALLOWED_IMAGE_EXTENSIONS):
77
+ return AssetType.IMAGE
78
+ elif any(asset.path.endswith(ext) for ext in ALLOWED_VIDEO_EXTENSIONS):
79
+ return AssetType.VIDEO
80
+ elif any(asset.path.endswith(ext) for ext in ALLOWED_AUDIO_EXTENSIONS):
81
+ return AssetType.AUDIO
82
+ else:
83
+ logger.debug(
84
+ f"Cannot get asset type for asset type: {type(self.asset)}"
85
+ )
86
+ return AssetType.NONE
87
+ else:
88
+ logger.debug(f"Cannot get asset type for asset type: {type(self.asset)}")
89
+ return AssetType.NONE
90
+
91
+ def get_prompt_type(self) -> list[PromptType]:
92
+ """Get the prompt type of the datapoint."""
93
+ prompt_types = []
94
+ for metadata in self.metadata or []:
95
+ if isinstance(metadata, MediaAssetMetadata):
96
+ prompt_types.append(PromptType.ASSET)
97
+ elif isinstance(metadata, PromptMetadata):
98
+ prompt_types.append(PromptType.TEXT)
99
+
100
+ if len(prompt_types) == 0:
101
+ return [PromptType.NONE]
102
+
103
+ return prompt_types
104
+
38
105
  def get_texts(self) -> list[str]:
39
106
  """Extract text content from the asset(s)."""
40
107
  if isinstance(self.asset, TextAsset):
@@ -59,13 +126,17 @@ class Datapoint:
59
126
  media_assets.append(asset)
60
127
  return media_assets
61
128
  else:
62
- raise ValueError(f"Cannot extract media assets from asset type: {type(self.asset)}")
129
+ raise ValueError(
130
+ f"Cannot extract media assets from asset type: {type(self.asset)}"
131
+ )
63
132
 
64
- def get_local_file_paths(self) -> list[StrictStr | tuple[StrictStr, StrictBytes] | StrictBytes]:
133
+ def get_local_file_paths(
134
+ self,
135
+ ) -> list[StrictStr | tuple[StrictStr, StrictBytes] | StrictBytes]:
65
136
  """Get local file paths for media assets that are stored locally."""
66
137
  if not self.is_media_asset():
67
138
  return []
68
-
139
+
69
140
  media_assets = self.get_media_assets()
70
141
  return [asset.to_file() for asset in media_assets if asset.is_local()]
71
142
 
@@ -73,34 +144,51 @@ class Datapoint:
73
144
  """Get URLs for media assets that are remote."""
74
145
  if not self.is_media_asset():
75
146
  return []
76
-
147
+
77
148
  media_assets = self.get_media_assets()
78
149
  return [asset.path for asset in media_assets if not asset.is_local()]
79
150
 
80
- def get_prepared_metadata(self) -> list[DatasetDatasetIdDatapointsPostRequestMetadataInner]:
151
+ def get_prepared_metadata(
152
+ self,
153
+ ) -> list[DatasetDatasetIdDatapointsPostRequestMetadataInner]:
81
154
  """Prepare metadata for API upload."""
82
155
  metadata: list[DatasetDatasetIdDatapointsPostRequestMetadataInner] = []
83
156
  if self.metadata:
84
157
  for meta in self.metadata:
85
158
  meta_model = meta.to_model() if meta else None
86
159
  if meta_model:
87
- metadata.append(DatasetDatasetIdDatapointsPostRequestMetadataInner(meta_model))
160
+ metadata.append(
161
+ DatasetDatasetIdDatapointsPostRequestMetadataInner(meta_model)
162
+ )
88
163
  return metadata
89
164
 
90
- def create_text_upload_model(self, index: int) -> CreateDatapointFromTextSourcesModel:
165
+ def create_text_upload_model(
166
+ self, index: int
167
+ ) -> CreateDatapointFromTextSourcesModel:
91
168
  """Create the model for uploading text datapoints."""
92
169
  if not self.is_text_asset():
93
170
  raise ValueError("Cannot create text upload model for non-text asset")
94
-
171
+
95
172
  texts = self.get_texts()
96
173
  metadata = self.get_prepared_metadata()
97
-
174
+
98
175
  return CreateDatapointFromTextSourcesModel(
99
176
  textSources=texts,
100
177
  sortIndex=index,
101
178
  metadata=metadata,
102
179
  )
103
180
 
181
+ def get_datapoint_string(self) -> str:
182
+ """Get the datapoint string for the datapoint."""
183
+ if isinstance(self.asset, MediaAsset):
184
+ return self.asset.path
185
+ elif isinstance(self.asset, TextAsset):
186
+ return self.asset.text
187
+ else:
188
+ raise ValueError(
189
+ f"Cannot get datapoint string for asset type: {type(self.asset)}"
190
+ )
191
+
104
192
  def __str__(self):
105
193
  return f"Datapoint(asset={self.asset})"
106
194
 
@@ -18,6 +18,11 @@ import logging
18
18
  from functools import cached_property
19
19
  from rapidata.rapidata_client.datapoints.assets._sessions import SessionManager
20
20
  from rapidata.rapidata_client.logging import logger
21
+ from rapidata.rapidata_client.datapoints.assets.constants import (
22
+ ALLOWED_IMAGE_EXTENSIONS,
23
+ ALLOWED_MEDIA_EXTENSIONS,
24
+ )
25
+
21
26
 
22
27
  class MediaAsset(BaseAsset):
23
28
  """MediaAsset Class with Lazy Loading
@@ -32,34 +37,35 @@ class MediaAsset(BaseAsset):
32
37
  Raises:
33
38
  FileNotFoundError: If the provided file path does not exist.
34
39
  """
35
- _logger = logging.getLogger(__name__ + '.MediaAsset')
40
+
41
+ _logger = logging.getLogger(__name__ + ".MediaAsset")
36
42
 
37
43
  ALLOWED_TYPES = [
38
- 'image/',
39
- 'audio/mp3', # MP3
40
- 'video/mp4', # MP4
44
+ "image/",
45
+ "audio/mp3", # MP3
46
+ "video/mp4", # MP4
41
47
  ]
42
48
 
43
49
  MIME_TYPES = {
44
- 'jpg': 'image/jpeg',
45
- 'jpeg': 'image/jpeg',
46
- 'png': 'image/png',
47
- 'gif': 'image/gif',
48
- 'webp': 'image/webp',
49
- 'mp3': 'audio/mp3',
50
- 'mp4': 'video/mp4'
50
+ "jpg": "image/jpeg",
51
+ "jpeg": "image/jpeg",
52
+ "png": "image/png",
53
+ "gif": "image/gif",
54
+ "webp": "image/webp",
55
+ "mp3": "audio/mp3",
56
+ "mp4": "video/mp4",
51
57
  }
52
58
 
53
59
  FILE_SIGNATURES = {
54
- b'\xFF\xD8\xFF': 'image/jpeg',
55
- b'\x89PNG\r\n\x1a\n': 'image/png',
56
- b'GIF87a': 'image/gif',
57
- b'GIF89a': 'image/gif',
58
- b'RIFF': 'image/webp',
59
- b'ID3': 'audio/mp3',
60
- b'\xFF\xFB': 'audio/mp3',
61
- b'\xFF\xF3': 'audio/mp3',
62
- b'ftyp': 'video/mp4',
60
+ b"\xFF\xD8\xFF": "image/jpeg",
61
+ b"\x89PNG\r\n\x1a\n": "image/png",
62
+ b"GIF87a": "image/gif",
63
+ b"GIF89a": "image/gif",
64
+ b"RIFF": "image/webp",
65
+ b"ID3": "audio/mp3",
66
+ b"\xFF\xFB": "audio/mp3",
67
+ b"\xFF\xF3": "audio/mp3",
68
+ b"ftyp": "video/mp4",
63
69
  }
64
70
 
65
71
  def __init__(self, path: str):
@@ -74,22 +80,24 @@ class MediaAsset(BaseAsset):
74
80
  ValueError: If path is not a string.
75
81
  """
76
82
  if not isinstance(path, str):
77
- raise ValueError(f"Media must be a string, either a local file path or a URL, got {type(path)}")
78
-
83
+ raise ValueError(
84
+ f"Media must be a string, either a local file path or a URL, got {type(path)}"
85
+ )
86
+
79
87
  self._url = None
80
88
  self._content = None
81
- self.session: requests.Session = SessionManager.get_session()
82
-
83
- if re.match(r'^https?://', path):
89
+ self.session: requests.Session = SessionManager.get_session()
90
+
91
+ if re.match(r"^https?://", path):
84
92
  self._url = path
85
- self.name = path.split('/')[-1]
93
+ self.name = path.split("/")[-1]
86
94
  self.name = self.__check_name_ending(self.name)
87
95
  self.path = path
88
96
  return
89
-
97
+
90
98
  if not os.path.exists(path):
91
99
  raise FileNotFoundError(f"File not found: {path}")
92
-
100
+
93
101
  self.path = path
94
102
  self.name = path
95
103
 
@@ -101,9 +109,9 @@ class MediaAsset(BaseAsset):
101
109
  """
102
110
  if self._url is None:
103
111
  self.path = cast(str, self.path)
104
- with open(self.path, 'rb') as f:
112
+ with open(self.path, "rb") as f:
105
113
  return f.read()
106
-
114
+
107
115
  return self.__get_media_bytes(self._url)
108
116
 
109
117
  def get_duration(self) -> int:
@@ -118,29 +126,31 @@ class MediaAsset(BaseAsset):
118
126
  ValueError: If the duration cannot be determined
119
127
  """
120
128
  path_to_check = self.name.lower()
121
-
129
+
122
130
  # Return 0 for static images
123
- if any(path_to_check.endswith(ext) for ext in ('.jpg', '.jpeg', '.png', '.webp', '.gif')):
131
+ if any(path_to_check.endswith(ext) for ext in ALLOWED_IMAGE_EXTENSIONS):
124
132
  return 0
125
133
 
126
134
  try:
127
135
  # Create temporary file from content
128
- with tempfile.NamedTemporaryFile(suffix=os.path.splitext(self.name)[1], delete=False) as tmp:
136
+ with tempfile.NamedTemporaryFile(
137
+ suffix=os.path.splitext(self.name)[1], delete=False
138
+ ) as tmp:
129
139
  tmp.write(self.content)
130
140
  tmp.flush()
131
141
  tmp_path = tmp.name
132
-
142
+
133
143
  try:
134
144
  tag = TinyTag.get(tmp_path)
135
145
  finally:
136
146
  # Clean up the temporary file
137
147
  os.unlink(tmp_path)
138
-
148
+
139
149
  if tag.duration is None:
140
150
  raise ValueError("Could not read duration from file")
141
-
151
+
142
152
  return int(tag.duration * 1000) # Convert to milliseconds
143
-
153
+
144
154
  except Exception as e:
145
155
  raise ValueError(f"Could not determine media duration: {str(e)}")
146
156
 
@@ -149,53 +159,55 @@ class MediaAsset(BaseAsset):
149
159
  Get the dimensions (width, height) of an image file.
150
160
  Returns None for non-image files or if dimensions can't be determined.
151
161
  """
152
- if not any(self.name.lower().endswith(ext) for ext in ('.jpg', '.jpeg', '.png', '.gif', '.webp')):
162
+ if not any(self.name.lower().endswith(ext) for ext in ALLOWED_IMAGE_EXTENSIONS):
153
163
  return None
154
-
164
+
155
165
  try:
156
166
  img = Image.open(BytesIO(self.content))
157
167
  return img.size
158
168
  except Exception:
159
169
  return None
160
170
 
161
- def set_custom_name(self, name: str) -> 'MediaAsset':
171
+ def set_custom_name(self, name: str) -> "MediaAsset":
162
172
  """Set a custom name for the media asset (only works with URLs)."""
163
173
  if self._url is not None:
164
174
  self.name = self.__check_name_ending(name)
165
175
  else:
166
176
  raise ValueError("Custom name can only be set for URLs.")
167
177
  return self
168
-
178
+
169
179
  def __check_name_ending(self, name: str) -> str:
170
180
  """Check if the media path is valid."""
171
- if not name.endswith(('.jpg', '.jpeg', '.png', '.gif', '.mp3', '.mp4', '.webp')):
172
- logger.warning("Warning: Supported file types: jpg, jpeg, png, gif, mp3, mp4. Image might not be displayed correctly.")
173
- name = name + '.jpg'
181
+ if not any(name.endswith(ext) for ext in ALLOWED_MEDIA_EXTENSIONS):
182
+ logger.warning(
183
+ f"Warning: Supported file types: {ALLOWED_MEDIA_EXTENSIONS}. Image might not be displayed correctly."
184
+ )
185
+ name = name + ".jpg"
174
186
  return name
175
187
 
176
188
  def __get_media_type_from_extension(self, url: str) -> Optional[str]:
177
189
  """
178
190
  Determine media type from URL file extension.
179
-
191
+
180
192
  Args:
181
193
  url: The URL to check
182
-
194
+
183
195
  Returns:
184
196
  Optional[str]: MIME type if valid extension found, None otherwise
185
197
  """
186
198
  try:
187
- ext = url.lower().split('?')[0].split('.')[-1]
199
+ ext = url.lower().split("?")[0].split(".")[-1]
188
200
  return self.MIME_TYPES.get(ext)
189
201
  except IndexError:
190
202
  return None
191
-
203
+
192
204
  def __validate_image_content(self, content: bytes) -> bool:
193
205
  """
194
206
  Validate image content using PIL.
195
-
207
+
196
208
  Args:
197
209
  content: Image bytes to validate
198
-
210
+
199
211
  Returns:
200
212
  bool: True if valid image, False otherwise
201
213
  """
@@ -206,14 +218,14 @@ class MediaAsset(BaseAsset):
206
218
  except Exception as e:
207
219
  self._logger.debug(f"Image validation failed: {str(e)}")
208
220
  return False
209
-
221
+
210
222
  def __get_media_type_from_signature(self, content: bytes) -> Optional[str]:
211
223
  """
212
224
  Determine media type from file signature.
213
-
225
+
214
226
  Args:
215
227
  content: File content bytes
216
-
228
+
217
229
  Returns:
218
230
  Optional[str]: MIME type if valid signature found, None otherwise
219
231
  """
@@ -226,13 +238,13 @@ class MediaAsset(BaseAsset):
226
238
  def __get_media_bytes(self, url: str) -> bytes:
227
239
  """
228
240
  Downloads and validates media files from URL with retry logic and session reuse.
229
-
241
+
230
242
  Args:
231
243
  url: URL of the media file
232
-
244
+
233
245
  Returns:
234
246
  bytes: Validated media content
235
-
247
+
236
248
  Raises:
237
249
  ValueError: If media type is unsupported or content validation fails
238
250
  requests.exceptions.RequestException: If download fails after all retries
@@ -243,17 +255,17 @@ class MediaAsset(BaseAsset):
243
255
 
244
256
  try:
245
257
  response = self.session.get(
246
- url,
247
- stream=False,
248
- timeout=(5, 30) # (connect timeout, read timeout)
258
+ url, stream=False, timeout=(5, 30) # (connect timeout, read timeout)
249
259
  )
250
260
  response.raise_for_status()
251
261
  except requests.exceptions.RequestException as e:
252
- self._logger.error(f"Failed to download media from {url} after retries: {str(e)}")
262
+ self._logger.error(
263
+ f"Failed to download media from {url} after retries: {str(e)}"
264
+ )
253
265
  raise
254
266
 
255
267
  content = response.content
256
- content_type = response.headers.get('content-type', '').lower()
268
+ content_type = response.headers.get("content-type", "").lower()
257
269
 
258
270
  # Case 1: Content-type is already allowed
259
271
  if any(content_type.startswith(t) for t in self.ALLOWED_TYPES):
@@ -279,18 +291,18 @@ class MediaAsset(BaseAsset):
279
291
 
280
292
  # If we get here, validation failed
281
293
  error_msg = (
282
- f'Could not validate media type from content.\n'
283
- f'Content-Type: {content_type}\n'
294
+ f"Could not validate media type from content.\n"
295
+ f"Content-Type: {content_type}\n"
284
296
  f'URL extension: {url.split("?")[0].split(".")[-1]}\n'
285
- f'Allowed types: {self.ALLOWED_TYPES}'
297
+ f"Allowed types: {self.ALLOWED_TYPES}"
286
298
  )
287
299
  self._logger.error(error_msg)
288
300
  raise ValueError(error_msg)
289
-
301
+
290
302
  def is_local(self) -> bool:
291
303
  """Check if the media asset is a local file."""
292
304
  return self._url is None
293
-
305
+
294
306
  def to_file(self) -> StrictStr | tuple[StrictStr, StrictBytes] | StrictBytes:
295
307
  """Convert the media asset to a file representation."""
296
308
  if self._url is None:
@@ -298,9 +310,9 @@ class MediaAsset(BaseAsset):
298
310
  return self.path
299
311
  else:
300
312
  return (self.name, self.content)
301
-
313
+
302
314
  def __str__(self) -> str:
303
315
  return f"MediaAsset(path={self.path})"
304
-
316
+
305
317
  def __repr__(self) -> str:
306
318
  return f"MediaAsset(path={self.path})"
@@ -2,7 +2,7 @@ import requests
2
2
  from requests.adapters import HTTPAdapter
3
3
  from urllib3.util.retry import Retry
4
4
 
5
- from rapidata.rapidata_client.config.config import rapidata_config
5
+ from rapidata.rapidata_client.config.rapidata_config import rapidata_config
6
6
 
7
7
 
8
8
  class SessionManager:
@@ -18,8 +18,8 @@ class SessionManager:
18
18
  requests.Session: A singleton requests session with retry logic.
19
19
  """
20
20
  if cls._session is None:
21
- max_retries: int = rapidata_config.upload_max_retries
22
- max_workers: int = rapidata_config.max_upload_workers
21
+ max_retries: int = rapidata_config.uploadMaxRetries
22
+ max_workers: int = rapidata_config.maxUploadWorkers
23
23
  cls._session = requests.Session()
24
24
  retries = Retry(
25
25
  total=max_retries,
@@ -0,0 +1,7 @@
1
+ ALLOWED_IMAGE_EXTENSIONS = [".jpg", ".jpeg", ".png", ".gif", ".webp"]
2
+ ALLOWED_AUDIO_EXTENSIONS = [".mp3"]
3
+ ALLOWED_VIDEO_EXTENSIONS = [".mp4"]
4
+
5
+ ALLOWED_MEDIA_EXTENSIONS = (
6
+ ALLOWED_IMAGE_EXTENSIONS + ALLOWED_AUDIO_EXTENSIONS + ALLOWED_VIDEO_EXTENSIONS
7
+ )
@@ -1,21 +1,27 @@
1
1
  from typing import cast
2
2
  from rapidata.api_client.models.file_asset_model import FileAssetModel
3
- from rapidata.api_client.models.get_failed_datapoints_result import GetFailedDatapointsResult
3
+ from rapidata.api_client.models.get_failed_datapoints_result import (
4
+ GetFailedDatapointsResult,
5
+ )
4
6
  from rapidata.api_client.models.multi_asset_model import MultiAssetModel
5
- from rapidata.api_client.models.original_filename_metadata_model import OriginalFilenameMetadataModel
7
+ from rapidata.api_client.models.original_filename_metadata_model import (
8
+ OriginalFilenameMetadataModel,
9
+ )
6
10
  from rapidata.api_client.models.source_url_metadata_model import SourceUrlMetadataModel
7
11
  from rapidata.rapidata_client.datapoints.assets import MediaAsset, MultiAsset
8
- from rapidata.rapidata_client.datapoints.datapoint import Datapoint
12
+ from rapidata.rapidata_client.datapoints._datapoint import Datapoint
9
13
  from rapidata.rapidata_client.order._rapidata_dataset import RapidataDataset
10
14
  from rapidata.rapidata_client.order.rapidata_order import RapidataOrder
11
15
 
16
+
12
17
  class FailedUploadException(Exception):
13
18
  """Custom error class for Failed Uploads to the Rapidata order."""
19
+
14
20
  def __init__(
15
- self,
21
+ self,
16
22
  dataset: RapidataDataset,
17
23
  order: RapidataOrder,
18
- failed_uploads: list[Datapoint]
24
+ failed_uploads: list[Datapoint],
19
25
  ):
20
26
  self.dataset = dataset
21
27
  self.order = order
@@ -30,24 +36,47 @@ def _parse_failed_uploads(failed_uploads: GetFailedDatapointsResult) -> list[Dat
30
36
  if not failed_datapoints:
31
37
  return []
32
38
  if isinstance(failed_datapoints[0].asset.actual_instance, FileAssetModel):
33
- failed_assets = [MediaAsset(__get_asset_name(cast(FileAssetModel, datapoint.asset.actual_instance))) for datapoint in failed_datapoints]
39
+ failed_assets = [
40
+ MediaAsset(
41
+ __get_asset_name(cast(FileAssetModel, datapoint.asset.actual_instance))
42
+ )
43
+ for datapoint in failed_datapoints
44
+ ]
34
45
  elif isinstance(failed_datapoints[0].asset.actual_instance, MultiAssetModel):
35
46
  failed_assets = []
36
- backend_assets = [cast(MultiAssetModel, failed_upload.asset.actual_instance).assets for failed_upload in failed_datapoints]
47
+ backend_assets = [
48
+ cast(MultiAssetModel, failed_upload.asset.actual_instance).assets
49
+ for failed_upload in failed_datapoints
50
+ ]
37
51
  for assets in backend_assets:
38
- failed_assets.append(MultiAsset([MediaAsset(__get_asset_name(cast(FileAssetModel, asset.actual_instance))) for asset in assets if isinstance(asset.actual_instance, FileAssetModel)]))
52
+ failed_assets.append(
53
+ MultiAsset(
54
+ [
55
+ MediaAsset(
56
+ __get_asset_name(
57
+ cast(FileAssetModel, asset.actual_instance)
58
+ )
59
+ )
60
+ for asset in assets
61
+ if isinstance(asset.actual_instance, FileAssetModel)
62
+ ]
63
+ )
64
+ )
39
65
  else:
40
- raise ValueError(f"Unsupported asset type: {type(failed_datapoints[0].asset.actual_instance)}")
41
-
66
+ raise ValueError(
67
+ f"Unsupported asset type: {type(failed_datapoints[0].asset.actual_instance)}"
68
+ )
69
+
42
70
  return [Datapoint(asset=asset) for asset in failed_assets]
43
71
 
72
+
44
73
  def __get_asset_name(failed_datapoint: FileAssetModel) -> str:
45
74
  metadata = failed_datapoint.metadata
46
75
  if "sourceUrl" in metadata:
47
76
  return cast(SourceUrlMetadataModel, metadata["sourceUrl"].actual_instance).url
48
77
  elif "originalFilename" in metadata:
49
- return cast(OriginalFilenameMetadataModel, metadata["originalFilename"].actual_instance).original_filename
78
+ return cast(
79
+ OriginalFilenameMetadataModel, metadata["originalFilename"].actual_instance
80
+ ).original_filename
50
81
  else:
51
82
  return ""
52
-
53
-