rapidata 2.27.4__py3-none-any.whl → 2.27.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rapidata might be problematic. Click here for more details.

rapidata/__init__.py CHANGED
@@ -1,4 +1,4 @@
1
- __version__ = "2.27.4"
1
+ __version__ = "2.27.5"
2
2
 
3
3
  from .rapidata_client import (
4
4
  RapidataClient,
rapidata/rapidata_client/order/_rapidata_dataset.py CHANGED
@@ -3,20 +3,19 @@ from itertools import zip_longest
3
3
  from rapidata.api_client.models.create_datapoint_from_text_sources_model import CreateDatapointFromTextSourcesModel
4
4
  from rapidata.api_client.models.dataset_dataset_id_datapoints_post_request_metadata_inner import DatasetDatasetIdDatapointsPostRequestMetadataInner
5
5
  from rapidata.rapidata_client.metadata._base_metadata import Metadata
6
- from rapidata.rapidata_client.assets import TextAsset, MediaAsset, MultiAsset
6
+ from rapidata.rapidata_client.assets import TextAsset, MediaAsset, MultiAsset, BaseAsset
7
7
  from rapidata.service import LocalFileService
8
8
  from rapidata.service.openapi_service import OpenAPIService
9
9
  from concurrent.futures import ThreadPoolExecutor, as_completed
10
10
  from tqdm import tqdm
11
11
 
12
12
  from typing import cast, Sequence, Generator
13
- from rapidata.rapidata_client.logging import logger, RapidataOutputManager
13
+ from rapidata.rapidata_client.logging import logger, managed_print, RapidataOutputManager
14
14
  import time
15
15
  import threading
16
16
 
17
17
 
18
18
  def chunk_list(lst: list, chunk_size: int) -> Generator:
19
- """Split list into chunks to prevent resource exhaustion"""
20
19
  for i in range(0, len(lst), chunk_size):
21
20
  yield lst[i:i + chunk_size]
22
21
 
@@ -27,6 +26,43 @@ class RapidataDataset:
27
26
  self.openapi_service = openapi_service
28
27
  self.local_file_service = LocalFileService()
29
28
 
29
+ def _get_effective_asset_type(self, datapoints: Sequence[BaseAsset]) -> type:
30
+ if not datapoints:
31
+ raise ValueError("Cannot determine asset type from empty datapoints list.")
32
+
33
+ first_item = datapoints[0]
34
+
35
+ if isinstance(first_item, MultiAsset):
36
+ if not first_item.assets:
37
+ raise ValueError("MultiAsset cannot be empty.")
38
+ return type(first_item.assets[0])
39
+
40
+ return type(first_item)
41
+
42
+ def _add_datapoints(
43
+ self,
44
+ datapoints: Sequence[BaseAsset],
45
+ metadata_list: Sequence[Sequence[Metadata]] | None = None,
46
+ max_workers: int = 10,
47
+ ):
48
+ effective_asset_type = self._get_effective_asset_type(datapoints)
49
+
50
+ for item in datapoints:
51
+ if isinstance(item, MultiAsset):
52
+ if not all(isinstance(asset, effective_asset_type) for asset in item.assets):
53
+ raise ValueError("All MultiAssets must contain the same type of assets.")
54
+ elif not isinstance(item, (MediaAsset, TextAsset, MultiAsset)):
55
+ raise ValueError("All datapoints must be MediaAsset, TextAsset, or MultiAsset.")
56
+
57
+ if issubclass(effective_asset_type, MediaAsset):
58
+ media_datapoints = cast(list[MediaAsset] | list[MultiAsset], datapoints)
59
+ self._add_media_from_paths(media_datapoints, metadata_list, max_workers)
60
+ elif issubclass(effective_asset_type, TextAsset):
61
+ text_datapoints = cast(list[TextAsset] | list[MultiAsset], datapoints)
62
+ self._add_texts(text_datapoints, metadata_list)
63
+ else:
64
+ raise ValueError(f"Unsupported asset type: {effective_asset_type}")
65
+
30
66
  def _add_texts(
31
67
  self,
32
68
  text_assets: list[TextAsset] | list[MultiAsset],
@@ -60,7 +96,7 @@ class RapidataDataset:
60
96
  metadata=metadata,
61
97
  )
62
98
 
63
- upload_response = self.openapi_service.dataset_api.dataset_dataset_id_datapoints_texts_post(dataset_id=self.dataset_id, create_datapoint_from_text_sources_model=model)
99
+ self.openapi_service.dataset_api.dataset_dataset_id_datapoints_texts_post(dataset_id=self.dataset_id, create_datapoint_from_text_sources_model=model)
64
100
 
65
101
  total_uploads = len(text_assets)
66
102
  with ThreadPoolExecutor(max_workers=max_workers) as executor:
@@ -79,14 +115,16 @@ class RapidataDataset:
79
115
  media_asset: MediaAsset | MultiAsset,
80
116
  meta_list: Sequence[Metadata] | None,
81
117
  index: int,
118
+ max_retries: int = 3,
82
119
  ) -> tuple[list[str], list[str]]:
83
120
  """
84
- Process single upload with error tracking.
121
+ Process single upload with retry logic and error tracking.
85
122
 
86
123
  Args:
87
124
  media_asset: MediaAsset or MultiAsset to upload
88
125
  meta_list: Optional sequence of metadata for the asset
89
126
  index: Sort index for the upload
127
+ max_retries: Maximum number of retry attempts (default: 3)
90
128
 
91
129
  Returns:
92
130
  tuple[list[str], list[str]]: Lists of successful and failed identifiers
@@ -95,44 +133,56 @@ class RapidataDataset:
95
133
  local_failed: list[str] = []
96
134
  identifiers_to_track: list[str] = []
97
135
 
98
- try:
99
- # Get identifier for this upload (URL or file path)
100
- if isinstance(media_asset, MediaAsset):
101
- assets = [media_asset]
102
- identifier = media_asset._url if media_asset._url else media_asset.path
103
- identifiers_to_track = [identifier] if identifier else []
104
- elif isinstance(media_asset, MultiAsset):
105
- assets = cast(list[MediaAsset], media_asset.assets)
106
- identifiers_to_track = [
107
- (asset._url if asset._url else cast(str, asset.path))
108
- for asset in assets
109
- ]
110
- else:
111
- raise ValueError(f"Unsupported asset type: {type(media_asset)}")
112
-
113
- metadata: list[DatasetDatasetIdDatapointsPostRequestMetadataInner] = []
114
- if meta_list:
115
- for meta in meta_list:
116
- meta_model = meta.to_model() if meta else None
117
- if meta_model:
118
- metadata.append(DatasetDatasetIdDatapointsPostRequestMetadataInner(meta_model))
119
-
120
- local_paths = [asset.to_file() for asset in assets if asset.is_local()]
121
- urls = [asset.path for asset in assets if not asset.is_local()]
136
+ # Get identifier for this upload (URL or file path)
137
+ if isinstance(media_asset, MediaAsset):
138
+ assets = [media_asset]
139
+ identifier = media_asset._url if media_asset._url else media_asset.path
140
+ identifiers_to_track = [identifier] if identifier else []
141
+ elif isinstance(media_asset, MultiAsset):
142
+ assets = cast(list[MediaAsset], media_asset.assets)
143
+ identifiers_to_track = [
144
+ (asset._url if asset._url else cast(str, asset.path))
145
+ for asset in assets
146
+ ]
147
+ else:
148
+ raise ValueError(f"Unsupported asset type: {type(media_asset)}")
122
149
 
123
- self.openapi_service.dataset_api.dataset_dataset_id_datapoints_post(
124
- dataset_id=self.dataset_id,
125
- file=local_paths,
126
- url=urls,
127
- metadata=metadata,
128
- sort_index=index,
129
- )
150
+ metadata: list[DatasetDatasetIdDatapointsPostRequestMetadataInner] = []
151
+ if meta_list:
152
+ for meta in meta_list:
153
+ meta_model = meta.to_model() if meta else None
154
+ if meta_model:
155
+ metadata.append(DatasetDatasetIdDatapointsPostRequestMetadataInner(meta_model))
130
156
 
131
- local_successful.extend(identifiers_to_track)
157
+ local_paths = [asset.to_file() for asset in assets if asset.is_local()]
158
+ urls = [asset.path for asset in assets if not asset.is_local()]
132
159
 
133
- except Exception as e:
134
- logger.error(f"\nUpload failed for {identifiers_to_track}: {str(e)}") # \n to avoid same line as tqdm
135
- local_failed.extend(identifiers_to_track)
160
+ last_exception = None
161
+ for attempt in range(max_retries):
162
+ try:
163
+ self.openapi_service.dataset_api.dataset_dataset_id_datapoints_post(
164
+ dataset_id=self.dataset_id,
165
+ file=local_paths,
166
+ url=urls,
167
+ metadata=metadata,
168
+ sort_index=index,
169
+ )
170
+
171
+ # If we get here, the upload was successful
172
+ local_successful.extend(identifiers_to_track)
173
+ return local_successful, local_failed
174
+
175
+ except Exception as e:
176
+ last_exception = e
177
+ if attempt < max_retries - 1:
178
+ # Exponential backoff: wait 1s, then 2s, then 4s
179
+ retry_delay = 2 ** attempt
180
+ time.sleep(retry_delay)
181
+ managed_print(f"\nRetrying {attempt + 1} of {max_retries}...\n")
182
+
183
+ # If we get here, all retries failed
184
+ logger.error(f"\nUpload failed for {identifiers_to_track} after {max_retries} attempts. Final error: {str(last_exception)}")
185
+ local_failed.extend(identifiers_to_track)
136
186
 
137
187
  return local_successful, local_failed
138
188
 
@@ -341,7 +391,7 @@ class RapidataDataset:
341
391
  self,
342
392
  media_paths: list[MediaAsset] | list[MultiAsset],
343
393
  multi_metadata: Sequence[Sequence[Metadata]] | None = None,
344
- max_workers: int = 5,
394
+ max_workers: int = 10,
345
395
  chunk_size: int = 50,
346
396
  progress_poll_interval: float = 0.5,
347
397
  ) -> tuple[list[str], list[str]]:
rapidata/rapidata_client/order/_rapidata_order_builder.py CHANGED
@@ -149,50 +149,8 @@ class RapidataOrderBuilder:
149
149
  logger.debug(f"Order created: {order}")
150
150
  logger.debug("Adding media to the order.")
151
151
 
152
- if all(isinstance(item, MediaAsset) for item in self.__assets) and self.__dataset:
153
- assets = cast(list[MediaAsset], self.__assets)
154
- self.__dataset._add_media_from_paths(assets, self.__multi_metadata, max_upload_workers)
155
-
156
- elif (
157
- all(isinstance(item, TextAsset) for item in self.__assets) and self.__dataset
158
- ):
159
- assets = cast(list[TextAsset], self.__assets)
160
- self.__dataset._add_texts(assets, self.__multi_metadata)
161
-
162
- elif (
163
- all(isinstance(item, MultiAsset) for item in self.__assets) and self.__dataset
164
- ):
165
- multi_assets = cast(list[MultiAsset], self.__assets)
166
-
167
- # Check if all MultiAssets contain the same type of assets
168
- first_asset_type = type(multi_assets[0].assets[0])
169
- if not all(
170
- isinstance(asset, first_asset_type)
171
- for multi_asset in multi_assets
172
- for asset in multi_asset.assets
173
- ):
174
- raise ValueError(
175
- "All MultiAssets must contain the same type of assets (either all MediaAssets or all TextAssets)."
176
- )
177
-
178
- # Process based on the asset type
179
- if issubclass(first_asset_type, MediaAsset):
180
- self.__dataset._add_media_from_paths(
181
- multi_assets, self.__multi_metadata, max_upload_workers
182
- )
183
-
184
- elif issubclass(first_asset_type, TextAsset):
185
- self.__dataset._add_texts(multi_assets, self.__multi_metadata)
186
-
187
- else:
188
- raise ValueError(
189
- "MultiAsset must contain MediaAssets or TextAssets objects."
190
- )
191
-
192
- elif self.__dataset:
193
- raise ValueError(
194
- "Media paths must all be of the same type: MediaAsset, TextAsset, or MultiAsset."
195
- )
152
+ if self.__dataset:
153
+ self.__dataset._add_datapoints(self.__assets, self.__multi_metadata, max_upload_workers)
196
154
 
197
155
  logger.debug("Media added to the order.")
198
156
  logger.debug("Setting order to preview")
rapidata-2.27.4.dist-info/METADATA → rapidata-2.27.5.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: rapidata
3
- Version: 2.27.4
3
+ Version: 2.27.5
4
4
  Summary: Rapidata package containing the Rapidata Python Client to interact with the Rapidata Web API in an easy way.
5
5
  License: Apache-2.0
6
6
  Author: Rapidata AG
rapidata-2.27.4.dist-info/RECORD → rapidata-2.27.5.dist-info/RECORD CHANGED
@@ -1,4 +1,4 @@
1
- rapidata/__init__.py,sha256=uidfaMpj0lDuv2SvZct-SvOTbMCiPvsW_0Tpb5nbl2A,812
1
+ rapidata/__init__.py,sha256=tzVc51y8cqb39UGrgV8v880SavSz0vU49N4Fgcl5NU0,812
2
2
  rapidata/api_client/__init__.py,sha256=W-O8FZAfCEJR4U25Uw43g6PxmJTCAhB6v7SKcXwNuGA,28865
3
3
  rapidata/api_client/api/__init__.py,sha256=Dv6v1tCJS4BLVM5BN9k5iRMNMyglhqZ4n8vyoqkLZZw,1292
4
4
  rapidata/api_client/api/campaign_api.py,sha256=ZEYXEp8_mzsElbklLXBLGnKEfPB1mx8-G5CXfSnibq0,80791
@@ -489,8 +489,8 @@ rapidata/rapidata_client/metadata/_prompt_metadata.py,sha256=ecycAq_t2HCEptxgNxy
489
489
  rapidata/rapidata_client/metadata/_public_text_metadata.py,sha256=uXavDp1ucy_9u5n0girqWD_SkFr7tplGMK_2aqyyHIA,529
490
490
  rapidata/rapidata_client/metadata/_select_words_metadata.py,sha256=-MK5yQDi_G3BKEes6aaVyCcobB-sEy29b6bfo5f4pic,594
491
491
  rapidata/rapidata_client/order/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
492
- rapidata/rapidata_client/order/_rapidata_dataset.py,sha256=JVGsiBa_BL0E7UZOHZ4uABr1jDVpbTiAQFO9pF2isVA,18891
493
- rapidata/rapidata_client/order/_rapidata_order_builder.py,sha256=x-2lpW6Jwlq-9XRz91beWNv2TgdEA-q4b_RhOSr7vhQ,14405
492
+ rapidata/rapidata_client/order/_rapidata_dataset.py,sha256=v5b86EDF0ITIOV4k4QU8gQ4eFPz2ow-4HV_mmC9tb4c,21264
493
+ rapidata/rapidata_client/order/_rapidata_order_builder.py,sha256=ioNGmWQF4KMdzvm-GIfAeflK8AgKaczZ1FfKkrZ1xXY,12649
494
494
  rapidata/rapidata_client/order/rapidata_order.py,sha256=uk2p6Hx2KTN4Oq2S35esdip7yLR44y-kkamS-5TBPFE,12752
495
495
  rapidata/rapidata_client/order/rapidata_order_manager.py,sha256=K9Nc66UmBfFQjSe78SMdZTOa4z3j5TRyvBnEX0Cs0u4,38306
496
496
  rapidata/rapidata_client/order/rapidata_results.py,sha256=UllYpuqpm2inKdRNhClaUwApuxsMLrvrGDsrHA5KqbY,8111
@@ -544,7 +544,7 @@ rapidata/service/__init__.py,sha256=s9bS1AJZaWIhLtJX_ZA40_CK39rAAkwdAmymTMbeWl4,
544
544
  rapidata/service/credential_manager.py,sha256=pUEEtp6VrFWYhfUUtyqmS0AlRqe2Y0kFkY6o22IT4KM,8682
545
545
  rapidata/service/local_file_service.py,sha256=pgorvlWcx52Uh3cEG6VrdMK_t__7dacQ_5AnfY14BW8,877
546
546
  rapidata/service/openapi_service.py,sha256=J07TB4P3cz9KCU7k_fwuMQwGXlq_nJx_m1_xHbZoCg0,4867
547
- rapidata-2.27.4.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
548
- rapidata-2.27.4.dist-info/METADATA,sha256=7IhQtevjPvtNarZYThRc4704zkbIazsscmR5XmRUfg0,1264
549
- rapidata-2.27.4.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
550
- rapidata-2.27.4.dist-info/RECORD,,
547
+ rapidata-2.27.5.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
548
+ rapidata-2.27.5.dist-info/METADATA,sha256=-bVQ6j4mkAw7lgIa9u2WCRiKHIuedG1J84ekG5LEcyE,1264
549
+ rapidata-2.27.5.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
550
+ rapidata-2.27.5.dist-info/RECORD,,