rapidata 2.36.0__py3-none-any.whl → 2.36.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of rapidata might be problematic. Click here for more details.
- rapidata/__init__.py +2 -2
- rapidata/api_client/__init__.py +3 -2
- rapidata/api_client/api/leaderboard_api.py +295 -6
- rapidata/api_client/api/validation_set_api.py +561 -313
- rapidata/api_client/models/__init__.py +3 -2
- rapidata/api_client/models/add_validation_rapid_model.py +29 -4
- rapidata/api_client/models/add_validation_rapid_model_truth.py +7 -1
- rapidata/api_client/models/asset_metadata.py +9 -1
- rapidata/api_client/models/asset_type.py +40 -0
- rapidata/api_client/models/boost_query_result.py +5 -17
- rapidata/api_client/models/campaign_query_result.py +3 -9
- rapidata/api_client/models/classification_metadata.py +12 -1
- rapidata/api_client/models/compare_workflow_config.py +22 -12
- rapidata/api_client/models/compare_workflow_config_model.py +12 -2
- rapidata/api_client/models/compare_workflow_model.py +12 -2
- rapidata/api_client/models/count_metadata.py +12 -1
- rapidata/api_client/models/create_demographic_rapid_model.py +18 -3
- rapidata/api_client/models/create_order_model.py +6 -48
- rapidata/api_client/models/effort_capped_selection.py +2 -11
- rapidata/api_client/models/evaluation_workflow_config.py +13 -3
- rapidata/api_client/models/evaluation_workflow_model.py +13 -3
- rapidata/api_client/models/file_type_metadata.py +11 -6
- rapidata/api_client/models/file_type_metadata_model.py +2 -8
- rapidata/api_client/models/filter.py +5 -23
- rapidata/api_client/models/get_datapoint_by_id_result.py +3 -9
- rapidata/api_client/models/get_rapid_responses_result.py +3 -9
- rapidata/api_client/models/get_recommended_validation_set_result.py +95 -0
- rapidata/api_client/models/get_standing_by_id_result.py +3 -9
- rapidata/api_client/models/get_validation_rapids_result.py +3 -9
- rapidata/api_client/models/get_workflow_progress_result.py +3 -9
- rapidata/api_client/models/get_workflow_results_result.py +3 -9
- rapidata/api_client/models/image_dimension_metadata.py +12 -1
- rapidata/api_client/models/labeling_selection.py +2 -11
- rapidata/api_client/models/location_metadata.py +12 -1
- rapidata/api_client/models/order_model.py +3 -9
- rapidata/api_client/models/original_filename_metadata.py +12 -1
- rapidata/api_client/models/participant_by_benchmark.py +3 -9
- rapidata/api_client/models/prompt_metadata.py +12 -1
- rapidata/api_client/models/prompt_type.py +38 -0
- rapidata/api_client/models/rapid_modality.py +46 -0
- rapidata/api_client/models/rapid_model.py +3 -9
- rapidata/api_client/models/report_model.py +3 -9
- rapidata/api_client/models/response_count_filter.py +2 -8
- rapidata/api_client/models/response_count_user_filter_model.py +2 -8
- rapidata/api_client/models/root_filter.py +3 -12
- rapidata/api_client/models/runs_by_leaderboard_result.py +3 -9
- rapidata/api_client/models/simple_workflow_config.py +13 -3
- rapidata/api_client/models/simple_workflow_config_model.py +11 -3
- rapidata/api_client/models/simple_workflow_model.py +13 -3
- rapidata/api_client/models/sort_criterion.py +3 -9
- rapidata/api_client/models/source_url_metadata.py +12 -1
- rapidata/api_client/models/standing_by_leaderboard.py +3 -9
- rapidata/api_client/models/streams_metadata.py +12 -1
- rapidata/api_client/models/text_metadata.py +12 -1
- rapidata/api_client/models/transcription_metadata.py +9 -1
- rapidata/api_client/models/update_leaderboard_model.py +91 -0
- rapidata/api_client/models/update_should_alert_model.py +1 -1
- rapidata/api_client/models/validation_set_model.py +42 -3
- rapidata/api_client/models/video_duration_metadata.py +12 -1
- rapidata/api_client/models/workflow_aggregation_step_model.py +3 -12
- rapidata/api_client_README.md +8 -5
- rapidata/rapidata_client/__init__.py +1 -1
- rapidata/rapidata_client/benchmark/participant/_participant.py +5 -5
- rapidata/rapidata_client/benchmark/rapidata_benchmark.py +2 -1
- rapidata/rapidata_client/benchmark/rapidata_benchmark_manager.py +10 -2
- rapidata/rapidata_client/config/__init__.py +1 -1
- rapidata/rapidata_client/config/rapidata_config.py +31 -0
- rapidata/rapidata_client/datapoints/__init__.py +10 -2
- rapidata/rapidata_client/datapoints/{datapoint.py → _datapoint.py} +105 -17
- rapidata/rapidata_client/datapoints/assets/_media_asset.py +80 -68
- rapidata/rapidata_client/datapoints/assets/_sessions.py +3 -3
- rapidata/rapidata_client/datapoints/assets/constants.py +7 -0
- rapidata/rapidata_client/exceptions/failed_upload_exception.py +42 -13
- rapidata/rapidata_client/filter/response_count_filter.py +16 -11
- rapidata/rapidata_client/order/_rapidata_dataset.py +8 -8
- rapidata/rapidata_client/order/_rapidata_order_builder.py +87 -8
- rapidata/rapidata_client/order/rapidata_order_manager.py +18 -4
- rapidata/rapidata_client/rapidata_client.py +6 -0
- rapidata/rapidata_client/selection/__init__.py +1 -1
- rapidata/rapidata_client/selection/effort_selection.py +18 -7
- rapidata/rapidata_client/selection/labeling_selection.py +19 -7
- rapidata/rapidata_client/selection/{retrieval_modes.py → rapidata_retrieval_modes.py} +7 -4
- rapidata/rapidata_client/validation/rapidata_validation_set.py +26 -1
- rapidata/rapidata_client/validation/rapids/rapids.py +46 -19
- rapidata/rapidata_client/validation/validation_set_manager.py +41 -4
- rapidata/rapidata_client/workflow/_base_workflow.py +27 -0
- rapidata/rapidata_client/workflow/_classify_workflow.py +25 -9
- rapidata/rapidata_client/workflow/_compare_workflow.py +11 -0
- rapidata/rapidata_client/workflow/_draw_workflow.py +15 -7
- rapidata/rapidata_client/workflow/_evaluation_workflow.py +8 -1
- rapidata/rapidata_client/workflow/_free_text_workflow.py +20 -2
- rapidata/rapidata_client/workflow/_locate_workflow.py +15 -7
- rapidata/rapidata_client/workflow/_ranking_workflow.py +39 -15
- rapidata/rapidata_client/workflow/_select_words_workflow.py +41 -7
- rapidata/rapidata_client/workflow/_timestamp_workflow.py +17 -8
- rapidata/service/openapi_service.py +1 -1
- {rapidata-2.36.0.dist-info → rapidata-2.36.2.dist-info}/METADATA +1 -1
- {rapidata-2.36.0.dist-info → rapidata-2.36.2.dist-info}/RECORD +100 -94
- rapidata/rapidata_client/config/config.py +0 -33
- {rapidata-2.36.0.dist-info → rapidata-2.36.2.dist-info}/LICENSE +0 -0
- {rapidata-2.36.0.dist-info → rapidata-2.36.2.dist-info}/WHEEL +0 -0
|
@@ -1,21 +1,49 @@
|
|
|
1
1
|
from typing import Sequence, cast
|
|
2
|
-
from rapidata.rapidata_client.datapoints.assets import
|
|
2
|
+
from rapidata.rapidata_client.datapoints.assets import (
|
|
3
|
+
MediaAsset,
|
|
4
|
+
TextAsset,
|
|
5
|
+
MultiAsset,
|
|
6
|
+
BaseAsset,
|
|
7
|
+
)
|
|
8
|
+
from rapidata.rapidata_client.datapoints.assets.constants import (
|
|
9
|
+
ALLOWED_VIDEO_EXTENSIONS,
|
|
10
|
+
ALLOWED_IMAGE_EXTENSIONS,
|
|
11
|
+
ALLOWED_AUDIO_EXTENSIONS,
|
|
12
|
+
)
|
|
3
13
|
from rapidata.rapidata_client.datapoints.metadata import Metadata
|
|
4
|
-
from rapidata.api_client.models.dataset_dataset_id_datapoints_post_request_metadata_inner import
|
|
5
|
-
|
|
14
|
+
from rapidata.api_client.models.dataset_dataset_id_datapoints_post_request_metadata_inner import (
|
|
15
|
+
DatasetDatasetIdDatapointsPostRequestMetadataInner,
|
|
16
|
+
)
|
|
17
|
+
from rapidata.api_client.models.create_datapoint_from_text_sources_model import (
|
|
18
|
+
CreateDatapointFromTextSourcesModel,
|
|
19
|
+
)
|
|
6
20
|
from pydantic import StrictStr, StrictBytes
|
|
21
|
+
from rapidata.api_client.models.asset_type import AssetType
|
|
22
|
+
from rapidata.api_client.models.prompt_type import PromptType
|
|
23
|
+
from rapidata.rapidata_client.datapoints.metadata._media_asset_metadata import (
|
|
24
|
+
MediaAssetMetadata,
|
|
25
|
+
)
|
|
26
|
+
from rapidata.rapidata_client.datapoints.metadata._prompt_metadata import PromptMetadata
|
|
27
|
+
from rapidata.rapidata_client.logging import logger
|
|
28
|
+
|
|
7
29
|
|
|
8
30
|
class Datapoint:
|
|
9
|
-
def __init__(
|
|
31
|
+
def __init__(
|
|
32
|
+
self,
|
|
33
|
+
asset: MediaAsset | TextAsset | MultiAsset,
|
|
34
|
+
metadata: Sequence[Metadata] | None = None,
|
|
35
|
+
):
|
|
10
36
|
if not isinstance(asset, (MediaAsset, TextAsset, MultiAsset)):
|
|
11
|
-
raise TypeError(
|
|
12
|
-
|
|
37
|
+
raise TypeError(
|
|
38
|
+
"Asset must be of type MediaAsset, TextAsset, or MultiAsset."
|
|
39
|
+
)
|
|
40
|
+
|
|
13
41
|
if metadata and not isinstance(metadata, Sequence):
|
|
14
42
|
raise TypeError("Metadata must be a list of Metadata objects.")
|
|
15
|
-
|
|
43
|
+
|
|
16
44
|
if metadata and not all(isinstance(m, Metadata) for m in metadata):
|
|
17
45
|
raise TypeError("All metadata objects must be of type Metadata.")
|
|
18
|
-
|
|
46
|
+
|
|
19
47
|
self.asset = asset
|
|
20
48
|
self.metadata = metadata
|
|
21
49
|
|
|
@@ -35,6 +63,45 @@ class Datapoint:
|
|
|
35
63
|
effective_type = self._get_effective_asset_type()
|
|
36
64
|
return issubclass(effective_type, TextAsset)
|
|
37
65
|
|
|
66
|
+
def get_asset_type(self) -> AssetType:
|
|
67
|
+
"""Get the asset type of the datapoint."""
|
|
68
|
+
if self.is_text_asset():
|
|
69
|
+
return AssetType.TEXT
|
|
70
|
+
elif self.is_media_asset():
|
|
71
|
+
if isinstance(self.asset, MultiAsset):
|
|
72
|
+
asset = self.asset.assets[0]
|
|
73
|
+
else:
|
|
74
|
+
asset = self.asset
|
|
75
|
+
assert isinstance(asset, MediaAsset)
|
|
76
|
+
if any(asset.path.endswith(ext) for ext in ALLOWED_IMAGE_EXTENSIONS):
|
|
77
|
+
return AssetType.IMAGE
|
|
78
|
+
elif any(asset.path.endswith(ext) for ext in ALLOWED_VIDEO_EXTENSIONS):
|
|
79
|
+
return AssetType.VIDEO
|
|
80
|
+
elif any(asset.path.endswith(ext) for ext in ALLOWED_AUDIO_EXTENSIONS):
|
|
81
|
+
return AssetType.AUDIO
|
|
82
|
+
else:
|
|
83
|
+
logger.debug(
|
|
84
|
+
f"Cannot get asset type for asset type: {type(self.asset)}"
|
|
85
|
+
)
|
|
86
|
+
return AssetType.NONE
|
|
87
|
+
else:
|
|
88
|
+
logger.debug(f"Cannot get asset type for asset type: {type(self.asset)}")
|
|
89
|
+
return AssetType.NONE
|
|
90
|
+
|
|
91
|
+
def get_prompt_type(self) -> list[PromptType]:
|
|
92
|
+
"""Get the prompt type of the datapoint."""
|
|
93
|
+
prompt_types = []
|
|
94
|
+
for metadata in self.metadata or []:
|
|
95
|
+
if isinstance(metadata, MediaAssetMetadata):
|
|
96
|
+
prompt_types.append(PromptType.ASSET)
|
|
97
|
+
elif isinstance(metadata, PromptMetadata):
|
|
98
|
+
prompt_types.append(PromptType.TEXT)
|
|
99
|
+
|
|
100
|
+
if len(prompt_types) == 0:
|
|
101
|
+
return [PromptType.NONE]
|
|
102
|
+
|
|
103
|
+
return prompt_types
|
|
104
|
+
|
|
38
105
|
def get_texts(self) -> list[str]:
|
|
39
106
|
"""Extract text content from the asset(s)."""
|
|
40
107
|
if isinstance(self.asset, TextAsset):
|
|
@@ -59,13 +126,17 @@ class Datapoint:
|
|
|
59
126
|
media_assets.append(asset)
|
|
60
127
|
return media_assets
|
|
61
128
|
else:
|
|
62
|
-
raise ValueError(
|
|
129
|
+
raise ValueError(
|
|
130
|
+
f"Cannot extract media assets from asset type: {type(self.asset)}"
|
|
131
|
+
)
|
|
63
132
|
|
|
64
|
-
def get_local_file_paths(
|
|
133
|
+
def get_local_file_paths(
|
|
134
|
+
self,
|
|
135
|
+
) -> list[StrictStr | tuple[StrictStr, StrictBytes] | StrictBytes]:
|
|
65
136
|
"""Get local file paths for media assets that are stored locally."""
|
|
66
137
|
if not self.is_media_asset():
|
|
67
138
|
return []
|
|
68
|
-
|
|
139
|
+
|
|
69
140
|
media_assets = self.get_media_assets()
|
|
70
141
|
return [asset.to_file() for asset in media_assets if asset.is_local()]
|
|
71
142
|
|
|
@@ -73,34 +144,51 @@ class Datapoint:
|
|
|
73
144
|
"""Get URLs for media assets that are remote."""
|
|
74
145
|
if not self.is_media_asset():
|
|
75
146
|
return []
|
|
76
|
-
|
|
147
|
+
|
|
77
148
|
media_assets = self.get_media_assets()
|
|
78
149
|
return [asset.path for asset in media_assets if not asset.is_local()]
|
|
79
150
|
|
|
80
|
-
def get_prepared_metadata(
|
|
151
|
+
def get_prepared_metadata(
|
|
152
|
+
self,
|
|
153
|
+
) -> list[DatasetDatasetIdDatapointsPostRequestMetadataInner]:
|
|
81
154
|
"""Prepare metadata for API upload."""
|
|
82
155
|
metadata: list[DatasetDatasetIdDatapointsPostRequestMetadataInner] = []
|
|
83
156
|
if self.metadata:
|
|
84
157
|
for meta in self.metadata:
|
|
85
158
|
meta_model = meta.to_model() if meta else None
|
|
86
159
|
if meta_model:
|
|
87
|
-
metadata.append(
|
|
160
|
+
metadata.append(
|
|
161
|
+
DatasetDatasetIdDatapointsPostRequestMetadataInner(meta_model)
|
|
162
|
+
)
|
|
88
163
|
return metadata
|
|
89
164
|
|
|
90
|
-
def create_text_upload_model(
|
|
165
|
+
def create_text_upload_model(
|
|
166
|
+
self, index: int
|
|
167
|
+
) -> CreateDatapointFromTextSourcesModel:
|
|
91
168
|
"""Create the model for uploading text datapoints."""
|
|
92
169
|
if not self.is_text_asset():
|
|
93
170
|
raise ValueError("Cannot create text upload model for non-text asset")
|
|
94
|
-
|
|
171
|
+
|
|
95
172
|
texts = self.get_texts()
|
|
96
173
|
metadata = self.get_prepared_metadata()
|
|
97
|
-
|
|
174
|
+
|
|
98
175
|
return CreateDatapointFromTextSourcesModel(
|
|
99
176
|
textSources=texts,
|
|
100
177
|
sortIndex=index,
|
|
101
178
|
metadata=metadata,
|
|
102
179
|
)
|
|
103
180
|
|
|
181
|
+
def get_datapoint_string(self) -> str:
|
|
182
|
+
"""Get the datapoint string for the datapoint."""
|
|
183
|
+
if isinstance(self.asset, MediaAsset):
|
|
184
|
+
return self.asset.path
|
|
185
|
+
elif isinstance(self.asset, TextAsset):
|
|
186
|
+
return self.asset.text
|
|
187
|
+
else:
|
|
188
|
+
raise ValueError(
|
|
189
|
+
f"Cannot get datapoint string for asset type: {type(self.asset)}"
|
|
190
|
+
)
|
|
191
|
+
|
|
104
192
|
def __str__(self):
|
|
105
193
|
return f"Datapoint(asset={self.asset})"
|
|
106
194
|
|
|
@@ -18,6 +18,11 @@ import logging
|
|
|
18
18
|
from functools import cached_property
|
|
19
19
|
from rapidata.rapidata_client.datapoints.assets._sessions import SessionManager
|
|
20
20
|
from rapidata.rapidata_client.logging import logger
|
|
21
|
+
from rapidata.rapidata_client.datapoints.assets.constants import (
|
|
22
|
+
ALLOWED_IMAGE_EXTENSIONS,
|
|
23
|
+
ALLOWED_MEDIA_EXTENSIONS,
|
|
24
|
+
)
|
|
25
|
+
|
|
21
26
|
|
|
22
27
|
class MediaAsset(BaseAsset):
|
|
23
28
|
"""MediaAsset Class with Lazy Loading
|
|
@@ -32,34 +37,35 @@ class MediaAsset(BaseAsset):
|
|
|
32
37
|
Raises:
|
|
33
38
|
FileNotFoundError: If the provided file path does not exist.
|
|
34
39
|
"""
|
|
35
|
-
|
|
40
|
+
|
|
41
|
+
_logger = logging.getLogger(__name__ + ".MediaAsset")
|
|
36
42
|
|
|
37
43
|
ALLOWED_TYPES = [
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
44
|
+
"image/",
|
|
45
|
+
"audio/mp3", # MP3
|
|
46
|
+
"video/mp4", # MP4
|
|
41
47
|
]
|
|
42
48
|
|
|
43
49
|
MIME_TYPES = {
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
50
|
+
"jpg": "image/jpeg",
|
|
51
|
+
"jpeg": "image/jpeg",
|
|
52
|
+
"png": "image/png",
|
|
53
|
+
"gif": "image/gif",
|
|
54
|
+
"webp": "image/webp",
|
|
55
|
+
"mp3": "audio/mp3",
|
|
56
|
+
"mp4": "video/mp4",
|
|
51
57
|
}
|
|
52
58
|
|
|
53
59
|
FILE_SIGNATURES = {
|
|
54
|
-
b
|
|
55
|
-
b
|
|
56
|
-
b
|
|
57
|
-
b
|
|
58
|
-
b
|
|
59
|
-
b
|
|
60
|
-
b
|
|
61
|
-
b
|
|
62
|
-
b
|
|
60
|
+
b"\xFF\xD8\xFF": "image/jpeg",
|
|
61
|
+
b"\x89PNG\r\n\x1a\n": "image/png",
|
|
62
|
+
b"GIF87a": "image/gif",
|
|
63
|
+
b"GIF89a": "image/gif",
|
|
64
|
+
b"RIFF": "image/webp",
|
|
65
|
+
b"ID3": "audio/mp3",
|
|
66
|
+
b"\xFF\xFB": "audio/mp3",
|
|
67
|
+
b"\xFF\xF3": "audio/mp3",
|
|
68
|
+
b"ftyp": "video/mp4",
|
|
63
69
|
}
|
|
64
70
|
|
|
65
71
|
def __init__(self, path: str):
|
|
@@ -74,22 +80,24 @@ class MediaAsset(BaseAsset):
|
|
|
74
80
|
ValueError: If path is not a string.
|
|
75
81
|
"""
|
|
76
82
|
if not isinstance(path, str):
|
|
77
|
-
raise ValueError(
|
|
78
|
-
|
|
83
|
+
raise ValueError(
|
|
84
|
+
f"Media must be a string, either a local file path or a URL, got {type(path)}"
|
|
85
|
+
)
|
|
86
|
+
|
|
79
87
|
self._url = None
|
|
80
88
|
self._content = None
|
|
81
|
-
self.session: requests.Session
|
|
82
|
-
|
|
83
|
-
if re.match(r
|
|
89
|
+
self.session: requests.Session = SessionManager.get_session()
|
|
90
|
+
|
|
91
|
+
if re.match(r"^https?://", path):
|
|
84
92
|
self._url = path
|
|
85
|
-
self.name = path.split(
|
|
93
|
+
self.name = path.split("/")[-1]
|
|
86
94
|
self.name = self.__check_name_ending(self.name)
|
|
87
95
|
self.path = path
|
|
88
96
|
return
|
|
89
|
-
|
|
97
|
+
|
|
90
98
|
if not os.path.exists(path):
|
|
91
99
|
raise FileNotFoundError(f"File not found: {path}")
|
|
92
|
-
|
|
100
|
+
|
|
93
101
|
self.path = path
|
|
94
102
|
self.name = path
|
|
95
103
|
|
|
@@ -101,9 +109,9 @@ class MediaAsset(BaseAsset):
|
|
|
101
109
|
"""
|
|
102
110
|
if self._url is None:
|
|
103
111
|
self.path = cast(str, self.path)
|
|
104
|
-
with open(self.path,
|
|
112
|
+
with open(self.path, "rb") as f:
|
|
105
113
|
return f.read()
|
|
106
|
-
|
|
114
|
+
|
|
107
115
|
return self.__get_media_bytes(self._url)
|
|
108
116
|
|
|
109
117
|
def get_duration(self) -> int:
|
|
@@ -118,29 +126,31 @@ class MediaAsset(BaseAsset):
|
|
|
118
126
|
ValueError: If the duration cannot be determined
|
|
119
127
|
"""
|
|
120
128
|
path_to_check = self.name.lower()
|
|
121
|
-
|
|
129
|
+
|
|
122
130
|
# Return 0 for static images
|
|
123
|
-
if any(path_to_check.endswith(ext) for ext in
|
|
131
|
+
if any(path_to_check.endswith(ext) for ext in ALLOWED_IMAGE_EXTENSIONS):
|
|
124
132
|
return 0
|
|
125
133
|
|
|
126
134
|
try:
|
|
127
135
|
# Create temporary file from content
|
|
128
|
-
with tempfile.NamedTemporaryFile(
|
|
136
|
+
with tempfile.NamedTemporaryFile(
|
|
137
|
+
suffix=os.path.splitext(self.name)[1], delete=False
|
|
138
|
+
) as tmp:
|
|
129
139
|
tmp.write(self.content)
|
|
130
140
|
tmp.flush()
|
|
131
141
|
tmp_path = tmp.name
|
|
132
|
-
|
|
142
|
+
|
|
133
143
|
try:
|
|
134
144
|
tag = TinyTag.get(tmp_path)
|
|
135
145
|
finally:
|
|
136
146
|
# Clean up the temporary file
|
|
137
147
|
os.unlink(tmp_path)
|
|
138
|
-
|
|
148
|
+
|
|
139
149
|
if tag.duration is None:
|
|
140
150
|
raise ValueError("Could not read duration from file")
|
|
141
|
-
|
|
151
|
+
|
|
142
152
|
return int(tag.duration * 1000) # Convert to milliseconds
|
|
143
|
-
|
|
153
|
+
|
|
144
154
|
except Exception as e:
|
|
145
155
|
raise ValueError(f"Could not determine media duration: {str(e)}")
|
|
146
156
|
|
|
@@ -149,53 +159,55 @@ class MediaAsset(BaseAsset):
|
|
|
149
159
|
Get the dimensions (width, height) of an image file.
|
|
150
160
|
Returns None for non-image files or if dimensions can't be determined.
|
|
151
161
|
"""
|
|
152
|
-
if not any(self.name.lower().endswith(ext) for ext in
|
|
162
|
+
if not any(self.name.lower().endswith(ext) for ext in ALLOWED_IMAGE_EXTENSIONS):
|
|
153
163
|
return None
|
|
154
|
-
|
|
164
|
+
|
|
155
165
|
try:
|
|
156
166
|
img = Image.open(BytesIO(self.content))
|
|
157
167
|
return img.size
|
|
158
168
|
except Exception:
|
|
159
169
|
return None
|
|
160
170
|
|
|
161
|
-
def set_custom_name(self, name: str) ->
|
|
171
|
+
def set_custom_name(self, name: str) -> "MediaAsset":
|
|
162
172
|
"""Set a custom name for the media asset (only works with URLs)."""
|
|
163
173
|
if self._url is not None:
|
|
164
174
|
self.name = self.__check_name_ending(name)
|
|
165
175
|
else:
|
|
166
176
|
raise ValueError("Custom name can only be set for URLs.")
|
|
167
177
|
return self
|
|
168
|
-
|
|
178
|
+
|
|
169
179
|
def __check_name_ending(self, name: str) -> str:
|
|
170
180
|
"""Check if the media path is valid."""
|
|
171
|
-
if not name.endswith(
|
|
172
|
-
logger.warning(
|
|
173
|
-
|
|
181
|
+
if not any(name.endswith(ext) for ext in ALLOWED_MEDIA_EXTENSIONS):
|
|
182
|
+
logger.warning(
|
|
183
|
+
f"Warning: Supported file types: {ALLOWED_MEDIA_EXTENSIONS}. Image might not be displayed correctly."
|
|
184
|
+
)
|
|
185
|
+
name = name + ".jpg"
|
|
174
186
|
return name
|
|
175
187
|
|
|
176
188
|
def __get_media_type_from_extension(self, url: str) -> Optional[str]:
|
|
177
189
|
"""
|
|
178
190
|
Determine media type from URL file extension.
|
|
179
|
-
|
|
191
|
+
|
|
180
192
|
Args:
|
|
181
193
|
url: The URL to check
|
|
182
|
-
|
|
194
|
+
|
|
183
195
|
Returns:
|
|
184
196
|
Optional[str]: MIME type if valid extension found, None otherwise
|
|
185
197
|
"""
|
|
186
198
|
try:
|
|
187
|
-
ext = url.lower().split(
|
|
199
|
+
ext = url.lower().split("?")[0].split(".")[-1]
|
|
188
200
|
return self.MIME_TYPES.get(ext)
|
|
189
201
|
except IndexError:
|
|
190
202
|
return None
|
|
191
|
-
|
|
203
|
+
|
|
192
204
|
def __validate_image_content(self, content: bytes) -> bool:
|
|
193
205
|
"""
|
|
194
206
|
Validate image content using PIL.
|
|
195
|
-
|
|
207
|
+
|
|
196
208
|
Args:
|
|
197
209
|
content: Image bytes to validate
|
|
198
|
-
|
|
210
|
+
|
|
199
211
|
Returns:
|
|
200
212
|
bool: True if valid image, False otherwise
|
|
201
213
|
"""
|
|
@@ -206,14 +218,14 @@ class MediaAsset(BaseAsset):
|
|
|
206
218
|
except Exception as e:
|
|
207
219
|
self._logger.debug(f"Image validation failed: {str(e)}")
|
|
208
220
|
return False
|
|
209
|
-
|
|
221
|
+
|
|
210
222
|
def __get_media_type_from_signature(self, content: bytes) -> Optional[str]:
|
|
211
223
|
"""
|
|
212
224
|
Determine media type from file signature.
|
|
213
|
-
|
|
225
|
+
|
|
214
226
|
Args:
|
|
215
227
|
content: File content bytes
|
|
216
|
-
|
|
228
|
+
|
|
217
229
|
Returns:
|
|
218
230
|
Optional[str]: MIME type if valid signature found, None otherwise
|
|
219
231
|
"""
|
|
@@ -226,13 +238,13 @@ class MediaAsset(BaseAsset):
|
|
|
226
238
|
def __get_media_bytes(self, url: str) -> bytes:
|
|
227
239
|
"""
|
|
228
240
|
Downloads and validates media files from URL with retry logic and session reuse.
|
|
229
|
-
|
|
241
|
+
|
|
230
242
|
Args:
|
|
231
243
|
url: URL of the media file
|
|
232
|
-
|
|
244
|
+
|
|
233
245
|
Returns:
|
|
234
246
|
bytes: Validated media content
|
|
235
|
-
|
|
247
|
+
|
|
236
248
|
Raises:
|
|
237
249
|
ValueError: If media type is unsupported or content validation fails
|
|
238
250
|
requests.exceptions.RequestException: If download fails after all retries
|
|
@@ -243,17 +255,17 @@ class MediaAsset(BaseAsset):
|
|
|
243
255
|
|
|
244
256
|
try:
|
|
245
257
|
response = self.session.get(
|
|
246
|
-
url,
|
|
247
|
-
stream=False,
|
|
248
|
-
timeout=(5, 30) # (connect timeout, read timeout)
|
|
258
|
+
url, stream=False, timeout=(5, 30) # (connect timeout, read timeout)
|
|
249
259
|
)
|
|
250
260
|
response.raise_for_status()
|
|
251
261
|
except requests.exceptions.RequestException as e:
|
|
252
|
-
self._logger.error(
|
|
262
|
+
self._logger.error(
|
|
263
|
+
f"Failed to download media from {url} after retries: {str(e)}"
|
|
264
|
+
)
|
|
253
265
|
raise
|
|
254
266
|
|
|
255
267
|
content = response.content
|
|
256
|
-
content_type = response.headers.get(
|
|
268
|
+
content_type = response.headers.get("content-type", "").lower()
|
|
257
269
|
|
|
258
270
|
# Case 1: Content-type is already allowed
|
|
259
271
|
if any(content_type.startswith(t) for t in self.ALLOWED_TYPES):
|
|
@@ -279,18 +291,18 @@ class MediaAsset(BaseAsset):
|
|
|
279
291
|
|
|
280
292
|
# If we get here, validation failed
|
|
281
293
|
error_msg = (
|
|
282
|
-
f
|
|
283
|
-
f
|
|
294
|
+
f"Could not validate media type from content.\n"
|
|
295
|
+
f"Content-Type: {content_type}\n"
|
|
284
296
|
f'URL extension: {url.split("?")[0].split(".")[-1]}\n'
|
|
285
|
-
f
|
|
297
|
+
f"Allowed types: {self.ALLOWED_TYPES}"
|
|
286
298
|
)
|
|
287
299
|
self._logger.error(error_msg)
|
|
288
300
|
raise ValueError(error_msg)
|
|
289
|
-
|
|
301
|
+
|
|
290
302
|
def is_local(self) -> bool:
|
|
291
303
|
"""Check if the media asset is a local file."""
|
|
292
304
|
return self._url is None
|
|
293
|
-
|
|
305
|
+
|
|
294
306
|
def to_file(self) -> StrictStr | tuple[StrictStr, StrictBytes] | StrictBytes:
|
|
295
307
|
"""Convert the media asset to a file representation."""
|
|
296
308
|
if self._url is None:
|
|
@@ -298,9 +310,9 @@ class MediaAsset(BaseAsset):
|
|
|
298
310
|
return self.path
|
|
299
311
|
else:
|
|
300
312
|
return (self.name, self.content)
|
|
301
|
-
|
|
313
|
+
|
|
302
314
|
def __str__(self) -> str:
|
|
303
315
|
return f"MediaAsset(path={self.path})"
|
|
304
|
-
|
|
316
|
+
|
|
305
317
|
def __repr__(self) -> str:
|
|
306
318
|
return f"MediaAsset(path={self.path})"
|
|
@@ -2,7 +2,7 @@ import requests
|
|
|
2
2
|
from requests.adapters import HTTPAdapter
|
|
3
3
|
from urllib3.util.retry import Retry
|
|
4
4
|
|
|
5
|
-
from rapidata.rapidata_client.config.
|
|
5
|
+
from rapidata.rapidata_client.config.rapidata_config import rapidata_config
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
class SessionManager:
|
|
@@ -18,8 +18,8 @@ class SessionManager:
|
|
|
18
18
|
requests.Session: A singleton requests session with retry logic.
|
|
19
19
|
"""
|
|
20
20
|
if cls._session is None:
|
|
21
|
-
max_retries: int = rapidata_config.
|
|
22
|
-
max_workers: int = rapidata_config.
|
|
21
|
+
max_retries: int = rapidata_config.uploadMaxRetries
|
|
22
|
+
max_workers: int = rapidata_config.maxUploadWorkers
|
|
23
23
|
cls._session = requests.Session()
|
|
24
24
|
retries = Retry(
|
|
25
25
|
total=max_retries,
|
|
@@ -1,21 +1,27 @@
|
|
|
1
1
|
from typing import cast
|
|
2
2
|
from rapidata.api_client.models.file_asset_model import FileAssetModel
|
|
3
|
-
from rapidata.api_client.models.get_failed_datapoints_result import
|
|
3
|
+
from rapidata.api_client.models.get_failed_datapoints_result import (
|
|
4
|
+
GetFailedDatapointsResult,
|
|
5
|
+
)
|
|
4
6
|
from rapidata.api_client.models.multi_asset_model import MultiAssetModel
|
|
5
|
-
from rapidata.api_client.models.original_filename_metadata_model import
|
|
7
|
+
from rapidata.api_client.models.original_filename_metadata_model import (
|
|
8
|
+
OriginalFilenameMetadataModel,
|
|
9
|
+
)
|
|
6
10
|
from rapidata.api_client.models.source_url_metadata_model import SourceUrlMetadataModel
|
|
7
11
|
from rapidata.rapidata_client.datapoints.assets import MediaAsset, MultiAsset
|
|
8
|
-
from rapidata.rapidata_client.datapoints.
|
|
12
|
+
from rapidata.rapidata_client.datapoints._datapoint import Datapoint
|
|
9
13
|
from rapidata.rapidata_client.order._rapidata_dataset import RapidataDataset
|
|
10
14
|
from rapidata.rapidata_client.order.rapidata_order import RapidataOrder
|
|
11
15
|
|
|
16
|
+
|
|
12
17
|
class FailedUploadException(Exception):
|
|
13
18
|
"""Custom error class for Failed Uploads to the Rapidata order."""
|
|
19
|
+
|
|
14
20
|
def __init__(
|
|
15
|
-
self,
|
|
21
|
+
self,
|
|
16
22
|
dataset: RapidataDataset,
|
|
17
23
|
order: RapidataOrder,
|
|
18
|
-
failed_uploads: list[Datapoint]
|
|
24
|
+
failed_uploads: list[Datapoint],
|
|
19
25
|
):
|
|
20
26
|
self.dataset = dataset
|
|
21
27
|
self.order = order
|
|
@@ -30,24 +36,47 @@ def _parse_failed_uploads(failed_uploads: GetFailedDatapointsResult) -> list[Dat
|
|
|
30
36
|
if not failed_datapoints:
|
|
31
37
|
return []
|
|
32
38
|
if isinstance(failed_datapoints[0].asset.actual_instance, FileAssetModel):
|
|
33
|
-
failed_assets = [
|
|
39
|
+
failed_assets = [
|
|
40
|
+
MediaAsset(
|
|
41
|
+
__get_asset_name(cast(FileAssetModel, datapoint.asset.actual_instance))
|
|
42
|
+
)
|
|
43
|
+
for datapoint in failed_datapoints
|
|
44
|
+
]
|
|
34
45
|
elif isinstance(failed_datapoints[0].asset.actual_instance, MultiAssetModel):
|
|
35
46
|
failed_assets = []
|
|
36
|
-
backend_assets = [
|
|
47
|
+
backend_assets = [
|
|
48
|
+
cast(MultiAssetModel, failed_upload.asset.actual_instance).assets
|
|
49
|
+
for failed_upload in failed_datapoints
|
|
50
|
+
]
|
|
37
51
|
for assets in backend_assets:
|
|
38
|
-
failed_assets.append(
|
|
52
|
+
failed_assets.append(
|
|
53
|
+
MultiAsset(
|
|
54
|
+
[
|
|
55
|
+
MediaAsset(
|
|
56
|
+
__get_asset_name(
|
|
57
|
+
cast(FileAssetModel, asset.actual_instance)
|
|
58
|
+
)
|
|
59
|
+
)
|
|
60
|
+
for asset in assets
|
|
61
|
+
if isinstance(asset.actual_instance, FileAssetModel)
|
|
62
|
+
]
|
|
63
|
+
)
|
|
64
|
+
)
|
|
39
65
|
else:
|
|
40
|
-
raise ValueError(
|
|
41
|
-
|
|
66
|
+
raise ValueError(
|
|
67
|
+
f"Unsupported asset type: {type(failed_datapoints[0].asset.actual_instance)}"
|
|
68
|
+
)
|
|
69
|
+
|
|
42
70
|
return [Datapoint(asset=asset) for asset in failed_assets]
|
|
43
71
|
|
|
72
|
+
|
|
44
73
|
def __get_asset_name(failed_datapoint: FileAssetModel) -> str:
|
|
45
74
|
metadata = failed_datapoint.metadata
|
|
46
75
|
if "sourceUrl" in metadata:
|
|
47
76
|
return cast(SourceUrlMetadataModel, metadata["sourceUrl"].actual_instance).url
|
|
48
77
|
elif "originalFilename" in metadata:
|
|
49
|
-
return cast(
|
|
78
|
+
return cast(
|
|
79
|
+
OriginalFilenameMetadataModel, metadata["originalFilename"].actual_instance
|
|
80
|
+
).original_filename
|
|
50
81
|
else:
|
|
51
82
|
return ""
|
|
52
|
-
|
|
53
|
-
|
|
@@ -3,6 +3,8 @@ from rapidata.rapidata_client.filter._base_filter import RapidataFilter
|
|
|
3
3
|
from rapidata.api_client.models.response_count_user_filter_model import (
|
|
4
4
|
ResponseCountUserFilterModel,
|
|
5
5
|
)
|
|
6
|
+
from rapidata.api_client.models.comparison_operator import ComparisonOperator
|
|
7
|
+
|
|
6
8
|
|
|
7
9
|
class ResponseCountFilter(RapidataFilter):
|
|
8
10
|
"""ResponseCountFilter Class
|
|
@@ -11,12 +13,12 @@ class ResponseCountFilter(RapidataFilter):
|
|
|
11
13
|
response_count (int): The number of user responses to filter by.
|
|
12
14
|
dimension (str): The dimension to apply the filter on (e.g. "default", "electrical", etc.).
|
|
13
15
|
operator (str): The comparison operator to use. Must be one of:
|
|
14
|
-
-
|
|
15
|
-
-
|
|
16
|
-
-
|
|
17
|
-
-
|
|
18
|
-
-
|
|
19
|
-
-
|
|
16
|
+
- ComparisonOperator.EQUAL
|
|
17
|
+
- ComparisonOperator.NOTEQUAL
|
|
18
|
+
- ComparisonOperator.LESSTHAN
|
|
19
|
+
- ComparisonOperator.LESSTHANOREQUAL
|
|
20
|
+
- ComparisonOperator.GREATERTHAN
|
|
21
|
+
- ComparisonOperator.GREATERTHANOREQUAL
|
|
20
22
|
|
|
21
23
|
Raises:
|
|
22
24
|
ValueError: If `response_count` is not an integer.
|
|
@@ -27,14 +29,14 @@ class ResponseCountFilter(RapidataFilter):
|
|
|
27
29
|
```python
|
|
28
30
|
from rapidata import ResponseCountFilter
|
|
29
31
|
|
|
30
|
-
filter = ResponseCountFilter(response_count=10, dimension="electrical", operator=
|
|
32
|
+
filter = ResponseCountFilter(response_count=10, dimension="electrical", operator=ComparisonOperator.GREATERTHAN)
|
|
31
33
|
```
|
|
32
34
|
This will filter users who have a response count greater than 10 for the "electrical" dimension.
|
|
33
35
|
"""
|
|
34
36
|
|
|
35
|
-
def __init__(
|
|
36
|
-
|
|
37
|
-
|
|
37
|
+
def __init__(
|
|
38
|
+
self, response_count: int, dimension: str, operator: ComparisonOperator
|
|
39
|
+
):
|
|
38
40
|
|
|
39
41
|
self.response_count = response_count
|
|
40
42
|
self.dimension = dimension
|
|
@@ -42,5 +44,8 @@ class ResponseCountFilter(RapidataFilter):
|
|
|
42
44
|
|
|
43
45
|
def _to_model(self):
|
|
44
46
|
return ResponseCountUserFilterModel(
|
|
45
|
-
_t="ResponseCountFilter",
|
|
47
|
+
_t="ResponseCountFilter",
|
|
48
|
+
responseCount=self.response_count,
|
|
49
|
+
dimension=self.dimension,
|
|
50
|
+
operator=self.operator,
|
|
46
51
|
)
|