rapidata 2.40.1__py3-none-any.whl → 2.40.3__py3-none-any.whl

This diff shows the changes between publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.

Potentially problematic release: this version of rapidata might be problematic.

rapidata/__init__.py CHANGED
@@ -1,4 +1,4 @@
- __version__ = "2.40.1"
+ __version__ = "2.40.3"

  from .rapidata_client import (
      RapidataClient,

rapidata/rapidata_client/config/upload_config.py CHANGED
@@ -12,3 +12,4 @@ class UploadConfig(BaseModel):

      maxWorkers: int = Field(default=10)
      maxRetries: int = Field(default=3)
+     chunkSize: int = Field(default=50)
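
Note: the new `chunkSize` field joins `maxWorkers` and `maxRetries` on `UploadConfig`. A minimal sketch of tuning it through the global `rapidata_config` object (the import path is taken from the dataset module further down; the values are illustrative, not recommendations):

    from rapidata.rapidata_client.config.rapidata_config import rapidata_config

    # Media uploads are now submitted in batches of `chunkSize` datapoints,
    # fanned out over `maxWorkers` threads, each with `maxRetries` attempts.
    rapidata_config.upload.chunkSize = 100
    rapidata_config.upload.maxWorkers = 10
    rapidata_config.upload.maxRetries = 3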

rapidata/rapidata_client/order/_rapidata_order_builder.py CHANGED
@@ -10,7 +10,7 @@ from rapidata.api_client.models.original_filename_metadata_model import (
  from rapidata.api_client.models.source_url_metadata_model import SourceUrlMetadataModel
  from rapidata.rapidata_client.datapoints.assets import MediaAsset, MultiAsset
  from rapidata.rapidata_client.datapoints._datapoint import Datapoint
- from rapidata.rapidata_client.order._rapidata_dataset import RapidataDataset
+ from rapidata.rapidata_client.order.dataset._rapidata_dataset import RapidataDataset
  from rapidata.rapidata_client.order.rapidata_order import RapidataOrder


@@ -32,7 +32,7 @@ from rapidata.rapidata_client.config import (
  from rapidata.rapidata_client.validation.validation_set_manager import (
      ValidationSetManager,
  )
- from rapidata.rapidata_client.order._rapidata_dataset import RapidataDataset
+ from rapidata.rapidata_client.order.dataset._rapidata_dataset import RapidataDataset
  from rapidata.rapidata_client.order.rapidata_order import RapidataOrder
  from rapidata.rapidata_client.referee import Referee
  from rapidata.rapidata_client.referee._naive_referee import NaiveReferee
@@ -235,6 +235,10 @@ class RapidataOrderBuilder:
              + f"Please open this URL in your browser: '{encoded_url}'"
              + Fore.RESET
          )
+         managed_print(
+             "If you want to avoid the automatic validation set creation in the future, set `rapidata_config.order.autoValidationSetCreation = False`."
+         )
+         managed_print()

          self.__dataset = (
              RapidataDataset(result.dataset_id, self.__openapi_service)
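
Note: the new message documents an opt-out for the automatic validation set creation. Going by the printed hint alone, disabling it up front would look like this sketch (assuming the same global `rapidata_config` object as above):

    from rapidata.rapidata_client.config.rapidata_config import rapidata_config

    # Opt out of automatic validation set creation, as the hint above suggests.
    rapidata_config.order.autoValidationSetCreation = False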
@@ -253,7 +257,7 @@ class RapidataOrderBuilder:
          )

          logger.debug("Order created: %s", order)
-         logger.debug("Adding media to the order.")
+         logger.debug("Adding datapoints to the order.")

          if self.__dataset:
              with tracer.start_as_current_span("add_datapoints"):
@@ -267,7 +271,7 @@ class RapidataOrderBuilder:
                  f"No dataset created for this order. order_id: {self.order_id}"
              )

-         logger.debug("Media added to the order.")
+         logger.debug("Datapoints added to the order.")
          logger.debug("Setting order to preview")
          try:
              self.__openapi_service.order_api.order_order_id_preview_post(self.order_id)

rapidata/rapidata_client/order/dataset/_progress_tracker.py ADDED
@@ -0,0 +1,100 @@
+ import threading
+ import time
+ from tqdm import tqdm
+
+ from rapidata.service.openapi_service import OpenAPIService
+ from rapidata.rapidata_client.config import logger, rapidata_config
+
+
+ class ProgressTracker:
+     """
+     Track dataset upload progress in a background thread with shallow indentation.
+
+     This class encapsulates the progress polling loop to keep methods in
+     `RapidataDataset` simpler and below the maximum indentation depth.
+     """
+
+     def __init__(
+         self,
+         dataset_id: str,
+         openapi_service: OpenAPIService,
+         total_uploads: int,
+         progress_poll_interval: float,
+     ) -> None:
+         self.dataset_id = dataset_id
+         self.openapi_service = openapi_service
+         self.total_uploads = total_uploads
+         self.progress_poll_interval = progress_poll_interval
+         self.upload_complete = False
+
+     def _get_progress_or_none(self):
+         try:
+             return self.openapi_service.dataset_api.dataset_dataset_id_progress_get(
+                 self.dataset_id
+             )
+         except Exception:
+             return None
+
+     def complete(self) -> None:
+         logger.debug("Upload complete, setting upload_complete to True")
+         self.upload_complete = True
+
+     def run(self) -> None:
+         try:
+             with tqdm(
+                 total=self.total_uploads,
+                 desc="Uploading datapoints",
+                 disable=rapidata_config.logging.silent_mode,
+             ) as pbar:
+                 final_pass = False
+                 while True:
+                     current_progress = self._get_progress_or_none()
+                     if current_progress is None:
+                         time.sleep(self.progress_poll_interval)
+                         logger.debug(
+                             "No progress yet, sleeping for %s seconds",
+                             self.progress_poll_interval,
+                         )
+                         continue
+
+                     total_completed = current_progress.ready + current_progress.failed
+
+                     pbar.n = total_completed
+                     pbar.refresh()
+
+                     time.sleep(self.progress_poll_interval)
+                     if total_completed >= self.total_uploads:
+                         break
+
+                     if self.upload_complete and current_progress.pending == 0:
+                         if not final_pass:
+                             logger.debug("Final pass")
+                             time.sleep(self.progress_poll_interval)
+                             final_pass = True
+                             continue
+                         logger.debug("Final pass done, breaking out of loop")
+                         break
+
+                 pbar.close()
+
+             success_rate = (
+                 round((current_progress.ready / self.total_uploads * 100), 2)
+                 if self.total_uploads > 0
+                 else 0
+             )
+
+             logger.info(
+                 "Upload complete: %s ready, %s failed, %s pending (%s%% success rate)",
+                 current_progress.ready,
+                 current_progress.failed,
+                 current_progress.pending,
+                 success_rate,
+             )
+         except Exception as e:
+             logger.error("Progress tracking thread error: %s", str(e))
+             raise RuntimeError("Progress tracking failed, aborting uploads")
+
+     def create_thread(self) -> threading.Thread:
+         thread = threading.Thread(target=self.run)
+         thread.daemon = True
+         return thread
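
Note: `ProgressTracker` replaces the event-based `_get_progress_tracker` closure removed at the bottom of this diff. A hedged usage sketch mirroring `_add_media_from_paths` in the next file (`dataset`, `openapi_service`, `datapoints`, and `upload_all` are stand-ins, not names from this release):

    tracker = ProgressTracker(
        dataset_id=dataset.id,
        openapi_service=openapi_service,
        total_uploads=len(datapoints),
        progress_poll_interval=0.5,
    )
    thread = tracker.create_thread()  # daemon thread running tracker.run()
    thread.start()
    try:
        upload_all(datapoints)  # placeholder for the chunked upload loop
    finally:
        tracker.complete()  # lets run() do one final polling pass, then exit
        thread.join(10)     # bounded wait so tqdm can finish rendering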

rapidata/rapidata_client/order/dataset/_rapidata_dataset.py ADDED
@@ -0,0 +1,286 @@
+ from rapidata.rapidata_client.datapoints._datapoint import Datapoint
+ from rapidata.rapidata_client.datapoints.assets import TextAsset, MediaAsset
+ from rapidata.service import LocalFileService
+ from rapidata.service.openapi_service import OpenAPIService
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from tqdm import tqdm
+
+ from typing import Generator
+ from rapidata.rapidata_client.config import logger
+ import time
+ import threading
+ from rapidata.rapidata_client.api.rapidata_api_client import (
+     suppress_rapidata_error_logging,
+ )
+ from rapidata.rapidata_client.config.rapidata_config import rapidata_config
+ from rapidata.rapidata_client.order.dataset._progress_tracker import ProgressTracker
+
+ # Add OpenTelemetry context imports for thread propagation
+ from opentelemetry import context as otel_context
+
+
+ def chunk_list(lst: list, chunk_size: int) -> Generator:
+     for i in range(0, len(lst), chunk_size):
+         yield lst[i : i + chunk_size]
+
+
+ class RapidataDataset:
+     def __init__(self, dataset_id: str, openapi_service: OpenAPIService):
+         self.id = dataset_id
+         self.openapi_service = openapi_service
+         self.local_file_service = LocalFileService()
+
+     def add_datapoints(
+         self,
+         datapoints: list[Datapoint],
+     ) -> tuple[list[Datapoint], list[Datapoint]]:
+         if not datapoints:
+             return [], []
+
+         effective_asset_type = datapoints[0]._get_effective_asset_type()
+
+         logger.debug(f"Config for datapoint upload: {rapidata_config}")
+
+         if issubclass(effective_asset_type, MediaAsset):
+             return self._add_media_from_paths(
+                 datapoints,
+             )
+         elif issubclass(effective_asset_type, TextAsset):
+             return self._add_texts(datapoints)
+         else:
+             raise ValueError(f"Unsupported asset type: {effective_asset_type}")
+
+     def _add_texts(
+         self, datapoints: list[Datapoint]
+     ) -> tuple[list[Datapoint], list[Datapoint]]:
+
+         def upload_text_datapoint(datapoint: Datapoint, index: int) -> Datapoint:
+             model = datapoint.create_text_upload_model(index)
+
+             self.openapi_service.dataset_api.dataset_dataset_id_datapoints_texts_post(
+                 dataset_id=self.id, create_datapoint_from_text_sources_model=model
+             )
+             return datapoint
+
+         def upload_with_context(
+             context: otel_context.Context, datapoint: Datapoint, index: int
+         ) -> Datapoint:
+             """Wrapper function that runs upload_text_datapoint with the provided context."""
+             token = otel_context.attach(context)
+             try:
+                 return upload_text_datapoint(datapoint, index)
+             finally:
+                 otel_context.detach(token)
+
+         successful_uploads: list[Datapoint] = []
+         failed_uploads: list[Datapoint] = []
+
+         # Capture the current OpenTelemetry context before creating threads
+         current_context = otel_context.get_current()
+
+         total_uploads = len(datapoints)
+         with ThreadPoolExecutor(
+             max_workers=rapidata_config.upload.maxWorkers
+         ) as executor:
+             future_to_datapoint = {
+                 executor.submit(
+                     upload_with_context, current_context, datapoint, i
+                 ): datapoint
+                 for i, datapoint in enumerate(datapoints)
+             }
+
+             with tqdm(
+                 total=total_uploads,
+                 desc="Uploading text datapoints",
+                 disable=rapidata_config.logging.silent_mode,
+             ) as pbar:
+                 for future in as_completed(future_to_datapoint.keys()):
+                     datapoint = future_to_datapoint[future]
+                     try:
+                         result = future.result()
+                         pbar.update(1)
+                         successful_uploads.append(result)
+                     except Exception as e:
+                         failed_uploads.append(datapoint)
+                         logger.error("Upload failed for %s: %s", datapoint, str(e))
+
+         return successful_uploads, failed_uploads
+
+     def _process_single_upload(
+         self,
+         datapoint: Datapoint,
+         index: int,
+     ) -> tuple[list[Datapoint], list[Datapoint]]:
+         """
+         Process single upload with retry logic and error tracking.
+
+         Args:
+             media_asset: MediaAsset or MultiAsset to upload
+             meta_list: Optional sequence of metadata for the asset
+             index: Sort index for the upload
+             max_retries: Maximum number of retry attempts (default: 3)
+
+         Returns:
+             tuple[list[Datapoint], list[Datapoint]]: Lists of successful and failed datapoints
+         """
+         logger.debug("Processing single upload for %s with index %s", datapoint, index)
+
+         local_successful: list[Datapoint] = []
+         local_failed: list[Datapoint] = []
+
+         metadata = datapoint.get_prepared_metadata()
+
+         local_paths = datapoint.get_local_file_paths()
+         urls = datapoint.get_urls()
+
+         last_exception = None
+         for attempt in range(rapidata_config.upload.maxRetries):
+             try:
+                 with suppress_rapidata_error_logging():
+                     self.openapi_service.dataset_api.dataset_dataset_id_datapoints_post(
+                         dataset_id=self.id,
+                         file=local_paths,
+                         url=urls,
+                         metadata=metadata,
+                         sort_index=index,
+                     )
+
+                 local_successful.append(datapoint)
+
+                 return local_successful, local_failed
+
+             except Exception as e:
+                 last_exception = e
+                 if attempt < rapidata_config.upload.maxRetries - 1:
+                     # Exponential backoff: wait 1s, then 2s, then 4s
+                     retry_delay = 2**attempt
+                     time.sleep(retry_delay)
+                     logger.debug("Error: %s", str(last_exception))
+                     logger.debug(
+                         "Retrying %s of %s...",
+                         attempt + 1,
+                         rapidata_config.upload.maxRetries,
+                     )
+
+         # If we get here, all retries failed
+         local_failed.append(datapoint)
+         tqdm.write(
+             f"Upload failed for {datapoint} after {rapidata_config.upload.maxRetries} attempts. \nFinal error: \n{str(last_exception)}"
+         )
+
+         return local_successful, local_failed
+
+     def _process_uploads_in_chunks(
+         self,
+         datapoints: list[Datapoint],
+     ) -> tuple[list[Datapoint], list[Datapoint]]:
+         """
+         Process uploads in chunks with a ThreadPoolExecutor.
+
+         Args:
+             media_paths: List of assets to upload
+             multi_metadata: Optional sequence of sequences of metadata
+             chunk_size: Number of items to process in each batch
+
+         Returns:
+             tuple[list[str], list[str]]: Lists of successful and failed uploads
+         """
+         successful_uploads: list[Datapoint] = []
+         failed_uploads: list[Datapoint] = []
+
+         def process_upload_with_context(
+             context: otel_context.Context, datapoint: Datapoint, index: int
+         ) -> tuple[list[Datapoint], list[Datapoint]]:
+             """Wrapper function that runs _process_single_upload with the provided context."""
+             token = otel_context.attach(context)
+             try:
+                 return self._process_single_upload(datapoint, index)
+             finally:
+                 otel_context.detach(token)
+
+         # Capture the current OpenTelemetry context before creating threads
+         current_context = otel_context.get_current()
+
+         with ThreadPoolExecutor(
+             max_workers=rapidata_config.upload.maxWorkers
+         ) as executor:
+             # Process uploads in chunks to avoid overwhelming the system
+             for chunk_idx, chunk in enumerate(
+                 chunk_list(datapoints, rapidata_config.upload.chunkSize)
+             ):
+                 futures = [
+                     executor.submit(
+                         process_upload_with_context,
+                         current_context,
+                         datapoint,
+                         chunk_idx * rapidata_config.upload.chunkSize + i,
+                     )
+                     for i, datapoint in enumerate(chunk)
+                 ]
+
+                 # Wait for this chunk to complete before starting the next one
+                 for future in as_completed(futures):
+                     try:
+                         chunk_successful, chunk_failed = future.result()
+                         successful_uploads.extend(chunk_successful)
+                         failed_uploads.extend(chunk_failed)
+                     except Exception as e:
+                         logger.error("Future execution failed: %s", str(e))
+
+         return successful_uploads, failed_uploads
+
+     def _add_media_from_paths(
+         self,
+         datapoints: list[Datapoint],
+         progress_poll_interval: float = 0.5,
+     ) -> tuple[list[Datapoint], list[Datapoint]]:
+         """
+         Upload media paths in chunks with managed resources.
+
+         Args:
+             datapoints: List of Datapoint objects to upload
+             chunk_size: Number of items to process in each batch
+             progress_poll_interval: Time in seconds between progress checks
+         Returns:
+             tuple[list[Datapoint], list[Datapoint]]: Lists of successful and failed datapoints
+
+         Raises:
+             ValueError: If multi_metadata lengths don't match media_paths length
+         """
+
+         # Setup tracking variables
+         total_uploads = len(datapoints)
+
+         # Create and start progress tracking thread
+         progress_tracker = ProgressTracker(
+             dataset_id=self.id,
+             openapi_service=self.openapi_service,
+             total_uploads=total_uploads,
+             progress_poll_interval=progress_poll_interval,
+         )
+         progress_thread = progress_tracker.create_thread()
+         progress_thread.start()
+
+         # Process uploads in chunks
+         try:
+             successful_uploads, failed_uploads = self._process_uploads_in_chunks(
+                 datapoints,
+             )
+         finally:
+             progress_tracker.complete()
+             progress_thread.join(10)
+
+         if failed_uploads:
+             logger.error(
+                 "Upload failed for %s datapoints: %s",
+                 len(failed_uploads),
+                 failed_uploads,
+             )
+
+         return successful_uploads, failed_uploads
+
+     def __str__(self) -> str:
+         return f"RapidataDataset(id={self.id})"
+
+     def __repr__(self) -> str:
+         return self.__str__()
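
Note: in the rewrite, each datapoint's `sort_index` is reconstructed as `chunk_idx * chunkSize + i`, so global ordering survives the chunking. A small self-contained check of that arithmetic (restating `chunk_list` from the file above; the 120-element list is a stand-in payload):

    def chunk_list(lst, chunk_size):
        for i in range(0, len(lst), chunk_size):
            yield lst[i : i + chunk_size]

    datapoints = list(range(120))
    indices = [
        chunk_idx * 50 + i  # 50 is the default chunkSize
        for chunk_idx, chunk in enumerate(chunk_list(datapoints, 50))
        for i, _ in enumerate(chunk)
    ]
    assert indices == datapoints  # sort indices stay globally sequential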

rapidata-2.40.1.dist-info/METADATA → rapidata-2.40.3.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: rapidata
- Version: 2.40.1
+ Version: 2.40.3
  Summary: Rapidata package containing the Rapidata Python Client to interact with the Rapidata Web API in an easy way.
  License: Apache-2.0
  Author: Rapidata AG

rapidata-2.40.1.dist-info/RECORD → rapidata-2.40.3.dist-info/RECORD RENAMED
@@ -1,4 +1,4 @@
- rapidata/__init__.py,sha256=56QmFS7POsslVrmIEWRk1hMRZpZmKzHDBRzESibZS0Q,917
+ rapidata/__init__.py,sha256=gptg5UE0WwAeX8240QsUFhS-lQFHXQui1t08HArcW-s,917
  rapidata/api_client/__init__.py,sha256=utY2iWepKJQO_iGz6aIg_qSoqoDkV9pBMAA58pIFE4M,36016
  rapidata/api_client/api/__init__.py,sha256=07qqwzQiBYt5V2BtnzbXhZL2cmVHATyZmCSGshIXLck,1603
  rapidata/api_client/api/benchmark_api.py,sha256=Mlx2qDDJcgPjWvaBnps9dxvVd0re1knG0SyoLUiHKSc,119756
@@ -587,7 +587,7 @@ rapidata/rapidata_client/config/managed_print.py,sha256=2T6dwgR1EZzFAdOEyPp_BBUs
  rapidata/rapidata_client/config/order_config.py,sha256=XxRZERzUUA9md6-PVlV__eCw8DD2kPbT_UmMwG1mAS4,615
  rapidata/rapidata_client/config/rapidata_config.py,sha256=mURnKdl5-2sE4e_IYY9-aBkix6a12t47otEErGE_q0c,1507
  rapidata/rapidata_client/config/tracer.py,sha256=h3GXzaX79HPcip4fBhLaLW0mRlXttR7D3KA78ZT0KVw,4736
- rapidata/rapidata_client/config/upload_config.py,sha256=AYba-Nw9fddLFyfGB4ar2G8zZIVOHrCL_HZjTmvrKGQ,434
+ rapidata/rapidata_client/config/upload_config.py,sha256=hjefl-w9WaCNeCEe6hdnrAQEMjgDy-r1zgUUIFR68wk,473
  rapidata/rapidata_client/country_codes/__init__.py,sha256=FB9Dcks44J6C6YBSYmTmNZ71tE130x6NO_3aLJ8fKzQ,40
  rapidata/rapidata_client/country_codes/country_codes.py,sha256=ePHqeb7y9DWQZAnddBzPx1puYBcrgUjdR2sbFijuFD8,283
  rapidata/rapidata_client/datapoints/__init__.py,sha256=YiXWlFKSi3ABP35zDukL7_z5uEdRrCMriquM6BoX6-s,276
@@ -611,7 +611,7 @@ rapidata/rapidata_client/datapoints/metadata/_select_words_metadata.py,sha256=T8
  rapidata/rapidata_client/demographic/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  rapidata/rapidata_client/demographic/demographic_manager.py,sha256=x0kQdgqMXAx7VuZJiP2HeI_dtKEd-W-hcY3URDcEfrU,1089
  rapidata/rapidata_client/exceptions/__init__.py,sha256=2hbWRgjlCGuoLPVDloQmmH81uzm9F2OAX2iFGCJyRu8,59
- rapidata/rapidata_client/exceptions/failed_upload_exception.py,sha256=iN0RqEw_mw4Cl3CMI7A3ljj9EFjGdlr9LpnMIwGOH6g,3109
+ rapidata/rapidata_client/exceptions/failed_upload_exception.py,sha256=jsd2foR3c8X5g4hgljgMAY5X_JTdmUuhBPWaL12938E,3117
  rapidata/rapidata_client/filter/__init__.py,sha256=j_Kfz_asNVxwp56SAN2saB7ZAHg3smL5_W2sSitmuJY,548
  rapidata/rapidata_client/filter/_base_filter.py,sha256=NVa2oWgtXD9kmXWyMkYZZ-2RYzgcN0hO76uGrEXXLEs,2384
  rapidata/rapidata_client/filter/age_filter.py,sha256=mVZaKyBoK-mml_oFox97l1yUXvINPk-2cEimuU_FJac,908
@@ -631,8 +631,9 @@ rapidata/rapidata_client/filter/rapidata_filters.py,sha256=B8ptQsaAn1e14Grv8xBYQ
  rapidata/rapidata_client/filter/response_count_filter.py,sha256=i2u2YQD3_RLQRZyqAceAGLQS3es97Q2n8KTlgfDYMko,2332
  rapidata/rapidata_client/filter/user_score_filter.py,sha256=4B3Zzp7aosDFmte3nLPTlXMN4zatT6Wcq5QLIoXqhgI,1910
  rapidata/rapidata_client/order/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- rapidata/rapidata_client/order/_rapidata_dataset.py,sha256=ftJD0czFX79AG9It7q2qdrQmUIGoyGq713AoUoC6nfU,18976
- rapidata/rapidata_client/order/_rapidata_order_builder.py,sha256=e2W_aH-TxsZHcApCsdbps4lqYCwRh5uhldy0mxv2oDc,16878
+ rapidata/rapidata_client/order/_rapidata_order_builder.py,sha256=C-TbKELNuLjQiZt9Gsl6LdtzIUtsu0sNLKGIcLvJEHk,17120
+ rapidata/rapidata_client/order/dataset/_progress_tracker.py,sha256=2e9aVMwpdBSO9P3LYn5lygNbtIGzOaJD3b9J7jYaxko,3570
+ rapidata/rapidata_client/order/dataset/_rapidata_dataset.py,sha256=ONH56htEvoVZvkSItuTi3_88kaDWohmYYABSoAPEn4Q,10724
  rapidata/rapidata_client/order/rapidata_order.py,sha256=FvZi3t4dARRNsKWvYiNxVvM50AzPwQYR3AzI4utD6OI,14497
  rapidata/rapidata_client/order/rapidata_order_manager.py,sha256=XiV_BpJxG6d8o0rFDYhnB3_mb576CQG5hY-qVXlJZKY,42592
  rapidata/rapidata_client/order/rapidata_results.py,sha256=weL4S14fzug3ZOJbQk9Oj-4tv2jx5aZAMp7VJ-a6Qq4,8437
@@ -689,7 +690,7 @@ rapidata/service/credential_manager.py,sha256=T3yL4tXVnibRytxjQkOC-ex3kFGQR5KcKU
  rapidata/service/local_file_service.py,sha256=0Q4LdoEtPFKzgXK2oZ1cQ-X7FipakscjGnnBH8dRFRQ,855
  rapidata/service/openapi_service.py,sha256=k3V4eMNcAjBcxEv17lDivK8LV5TEjRTL9B_5KBlhcas,5482
  rapidata/types/__init__.py,sha256=gSGrmWV5gEA6pPfAR5vwSy_DvibO5IjCZDiB7LtlMOQ,6134
- rapidata-2.40.1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- rapidata-2.40.1.dist-info/METADATA,sha256=-BOtvVuVwtdqSq-KcoEi9TYV2irzM9oFRjaIKd3U5Js,1406
- rapidata-2.40.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
- rapidata-2.40.1.dist-info/RECORD,,
+ rapidata-2.40.3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ rapidata-2.40.3.dist-info/METADATA,sha256=6hbAMN-hwKhPKu7APD4_GaZ0dBr6GDVCmEMJ2XgRv1Q,1406
+ rapidata-2.40.3.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+ rapidata-2.40.3.dist-info/RECORD,,

rapidata/rapidata_client/order/_rapidata_dataset.py DELETED
@@ -1,475 +0,0 @@
- from rapidata.rapidata_client.datapoints._datapoint import Datapoint
- from rapidata.rapidata_client.datapoints.assets import TextAsset, MediaAsset
- from rapidata.service import LocalFileService
- from rapidata.service.openapi_service import OpenAPIService
- from concurrent.futures import ThreadPoolExecutor, as_completed
- from tqdm import tqdm
-
- from typing import Generator
- from rapidata.rapidata_client.config import logger, managed_print
- import time
- import threading
- from rapidata.rapidata_client.api.rapidata_api_client import (
-     suppress_rapidata_error_logging,
- )
- from rapidata.rapidata_client.config.rapidata_config import rapidata_config
-
- # Add OpenTelemetry context imports for thread propagation
- from opentelemetry import context as otel_context
-
-
- def chunk_list(lst: list, chunk_size: int) -> Generator:
-     for i in range(0, len(lst), chunk_size):
-         yield lst[i : i + chunk_size]
-
-
- class RapidataDataset:
-     def __init__(self, dataset_id: str, openapi_service: OpenAPIService):
-         self.id = dataset_id
-         self.openapi_service = openapi_service
-         self.local_file_service = LocalFileService()
-
-     def add_datapoints(
-         self,
-         datapoints: list[Datapoint],
-     ) -> tuple[list[Datapoint], list[Datapoint]]:
-         if not datapoints:
-             return [], []
-
-         effective_asset_type = datapoints[0]._get_effective_asset_type()
-
-         logger.debug(f"Config for datapoint upload: {rapidata_config}")
-
-         if issubclass(effective_asset_type, MediaAsset):
-             return self._add_media_from_paths(
-                 datapoints,
-             )
-         elif issubclass(effective_asset_type, TextAsset):
-             return self._add_texts(datapoints)
-         else:
-             raise ValueError(f"Unsupported asset type: {effective_asset_type}")
-
-     def _add_texts(
-         self, datapoints: list[Datapoint]
-     ) -> tuple[list[Datapoint], list[Datapoint]]:
-
-         def upload_text_datapoint(datapoint: Datapoint, index: int) -> Datapoint:
-             model = datapoint.create_text_upload_model(index)
-
-             self.openapi_service.dataset_api.dataset_dataset_id_datapoints_texts_post(
-                 dataset_id=self.id, create_datapoint_from_text_sources_model=model
-             )
-             return datapoint
-
-         def upload_with_context(
-             context: otel_context.Context, datapoint: Datapoint, index: int
-         ) -> Datapoint:
-             """Wrapper function that runs upload_text_datapoint with the provided context."""
-             token = otel_context.attach(context)
-             try:
-                 return upload_text_datapoint(datapoint, index)
-             finally:
-                 otel_context.detach(token)
-
-         successful_uploads: list[Datapoint] = []
-         failed_uploads: list[Datapoint] = []
-
-         # Capture the current OpenTelemetry context before creating threads
-         current_context = otel_context.get_current()
-
-         total_uploads = len(datapoints)
-         with ThreadPoolExecutor(
-             max_workers=rapidata_config.upload.maxWorkers
-         ) as executor:
-             future_to_datapoint = {
-                 executor.submit(
-                     upload_with_context, current_context, datapoint, i
-                 ): datapoint
-                 for i, datapoint in enumerate(datapoints)
-             }
-
-             with tqdm(
-                 total=total_uploads,
-                 desc="Uploading text datapoints",
-                 disable=rapidata_config.logging.silent_mode,
-             ) as pbar:
-                 for future in as_completed(future_to_datapoint.keys()):
-                     datapoint = future_to_datapoint[future]
-                     try:
-                         result = future.result()
-                         pbar.update(1)
-                         successful_uploads.append(result)
-                     except Exception as e:
-                         failed_uploads.append(datapoint)
-                         logger.error("Upload failed for %s: %s", datapoint, str(e))
-
-         return successful_uploads, failed_uploads
-
-     def _process_single_upload(
-         self,
-         datapoint: Datapoint,
-         index: int,
-     ) -> tuple[list[Datapoint], list[Datapoint]]:
-         """
-         Process single upload with retry logic and error tracking.
-
-         Args:
-             media_asset: MediaAsset or MultiAsset to upload
-             meta_list: Optional sequence of metadata for the asset
-             index: Sort index for the upload
-             max_retries: Maximum number of retry attempts (default: 3)
-
-         Returns:
-             tuple[list[Datapoint], list[Datapoint]]: Lists of successful and failed datapoints
-         """
-         logger.debug("Processing single upload for %s with index %s", datapoint, index)
-
-         local_successful: list[Datapoint] = []
-         local_failed: list[Datapoint] = []
-
-         metadata = datapoint.get_prepared_metadata()
-
-         local_paths = datapoint.get_local_file_paths()
-         urls = datapoint.get_urls()
-
-         last_exception = None
-         for attempt in range(rapidata_config.upload.maxRetries):
-             try:
-                 with suppress_rapidata_error_logging():
-                     self.openapi_service.dataset_api.dataset_dataset_id_datapoints_post(
-                         dataset_id=self.id,
-                         file=local_paths,
-                         url=urls,
-                         metadata=metadata,
-                         sort_index=index,
-                     )
-
-                 local_successful.append(datapoint)
-
-                 return local_successful, local_failed
-
-             except Exception as e:
-                 last_exception = e
-                 if attempt < rapidata_config.upload.maxRetries - 1:
-                     # Exponential backoff: wait 1s, then 2s, then 4s
-                     retry_delay = 2**attempt
-                     time.sleep(retry_delay)
-                     logger.debug("Error: %s", str(last_exception))
-                     logger.debug(
-                         "Retrying %s of %s...",
-                         attempt + 1,
-                         rapidata_config.upload.maxRetries,
-                     )
-
-         # If we get here, all retries failed
-         local_failed.append(datapoint)
-         tqdm.write(
-             f"Upload failed for {datapoint} after {rapidata_config.upload.maxRetries} attempts. \nFinal error: \n{str(last_exception)}"
-         )
-
-         return local_successful, local_failed
-
-     def _get_progress_tracker(
-         self,
-         total_uploads: int,
-         stop_event: threading.Event,
-         progress_error_event: threading.Event,
-         progress_poll_interval: float,
-     ) -> threading.Thread:
-         """
-         Create and return a progress tracking thread that shows actual API progress.
-
-         Args:
-             total_uploads: Total number of uploads to track
-             initial_ready: Initial number of ready items
-             initial_progress: Initial progress state
-             stop_event: Event to signal thread to stop
-             progress_error_event: Event to signal an error in progress tracking
-             progress_poll_interval: Time between progress checks
-
-         Returns:
-             threading.Thread: The progress tracking thread
-         """
-
-         def progress_tracking_thread():
-             try:
-                 # Initialize progress bar with 0 completions
-                 with tqdm(
-                     total=total_uploads,
-                     desc="Uploading datapoints",
-                     disable=rapidata_config.logging.silent_mode,
-                 ) as pbar:
-                     prev_ready = 0
-                     prev_failed = 0
-                     stall_count = 0
-                     last_progress_time = time.time()
-
-                     # We'll wait for all uploads to finish + some extra time
-                     # for the backend to fully process everything
-                     all_uploads_complete = threading.Event()
-
-                     while not stop_event.is_set() or not all_uploads_complete.is_set():
-                         try:
-                             current_progress = self.openapi_service.dataset_api.dataset_dataset_id_progress_get(
-                                 self.id
-                             )
-
-                             # Calculate items completed since our initialization
-                             completed_ready = current_progress.ready
-                             completed_failed = current_progress.failed
-                             total_completed = completed_ready + completed_failed
-
-                             # Calculate newly completed items since our last check
-                             new_ready = current_progress.ready - prev_ready
-                             new_failed = current_progress.failed - prev_failed
-
-                             # Update progress bar position to show actual completed items
-                             # First reset to match the actual completed count
-                             pbar.n = total_completed
-                             pbar.refresh()
-
-                             if new_ready > 0 or new_failed > 0:
-                                 # We saw progress
-                                 stall_count = 0
-                                 last_progress_time = time.time()
-                             else:
-                                 stall_count += 1
-
-                             # Update our tracking variables
-                             prev_ready = current_progress.ready
-                             prev_failed = current_progress.failed or 0
-
-                             # Check if stop_event was set (all uploads submitted)
-                             if stop_event.is_set():
-                                 elapsed_since_last_progress = (
-                                     time.time() - last_progress_time
-                                 )
-
-                                 # If we haven't seen progress for a while after all uploads were submitted
-                                 if elapsed_since_last_progress > 5.0:
-                                     # If we're at 100%, we're done
-                                     if total_completed >= total_uploads:
-                                         all_uploads_complete.set()
-                                         break
-
-                                     # If we're not at 100% but it's been a while with no progress
-                                     if stall_count > 5:
-                                         # We've polled several times with no progress, assume we're done
-                                         logger.warning(
-                                             "\nProgress seems stalled at %s/%s.",
-                                             total_completed,
-                                             total_uploads,
-                                         )
-                                         break
-
-                         except Exception as e:
-                             logger.error("\nError checking progress: %s", str(e))
-                             stall_count += 1
-
-                             if stall_count > 10:  # Too many consecutive errors
-                                 progress_error_event.set()
-                                 break
-
-                         # Sleep before next poll
-                         time.sleep(progress_poll_interval)
-
-             except Exception as e:
-                 logger.error("Progress tracking thread error: %s", str(e))
-                 progress_error_event.set()
-
-         # Create and return the thread
-         progress_thread = threading.Thread(target=progress_tracking_thread)
-         progress_thread.daemon = True
-         return progress_thread
-
-     def _process_uploads_in_chunks(
-         self,
-         datapoints: list[Datapoint],
-         chunk_size: int,
-         stop_progress_tracking: threading.Event,
-         progress_tracking_error: threading.Event,
-     ) -> tuple[list[Datapoint], list[Datapoint]]:
-         """
-         Process uploads in chunks with a ThreadPoolExecutor.
-
-         Args:
-             media_paths: List of assets to upload
-             multi_metadata: Optional sequence of sequences of metadata
-             chunk_size: Number of items to process in each batch
-             stop_progress_tracking: Event to signal progress tracking to stop
-             progress_tracking_error: Event to detect progress tracking errors
-
-         Returns:
-             tuple[list[str], list[str]]: Lists of successful and failed uploads
-         """
-         successful_uploads: list[Datapoint] = []
-         failed_uploads: list[Datapoint] = []
-
-         def process_upload_with_context(
-             context: otel_context.Context, datapoint: Datapoint, index: int
-         ) -> tuple[list[Datapoint], list[Datapoint]]:
-             """Wrapper function that runs _process_single_upload with the provided context."""
-             token = otel_context.attach(context)
-             try:
-                 return self._process_single_upload(datapoint, index)
-             finally:
-                 otel_context.detach(token)
-
-         # Capture the current OpenTelemetry context before creating threads
-         current_context = otel_context.get_current()
-
-         try:
-             with ThreadPoolExecutor(
-                 max_workers=rapidata_config.upload.maxWorkers
-             ) as executor:
-                 # Process uploads in chunks to avoid overwhelming the system
-                 for chunk_idx, chunk in enumerate(chunk_list(datapoints, chunk_size)):
-                     futures = [
-                         executor.submit(
-                             process_upload_with_context,
-                             current_context,
-                             datapoint,
-                             chunk_idx * chunk_size + i,
-                         )
-                         for i, datapoint in enumerate(chunk)
-                     ]
-
-                     # Wait for this chunk to complete before starting the next one
-                     for future in as_completed(futures):
-                         if progress_tracking_error.is_set():
-                             raise RuntimeError(
-                                 "Progress tracking failed, aborting uploads"
-                             )
-
-                         try:
-                             chunk_successful, chunk_failed = future.result()
-                             successful_uploads.extend(chunk_successful)
-                             failed_uploads.extend(chunk_failed)
-                         except Exception as e:
-                             logger.error("Future execution failed: %s", str(e))
-         finally:
-             # Signal to the progress tracking thread that all uploads have been submitted
-             stop_progress_tracking.set()
-
-         return successful_uploads, failed_uploads
-
-     def _log_final_progress(
-         self,
-         total_uploads: int,
-         progress_poll_interval: float,
-         successful_uploads: list[Datapoint],
-         failed_uploads: list[Datapoint],
-     ) -> None:
-         """
-         Log the final progress of the upload operation.
-
-         Args:
-             total_uploads: Total number of uploads
-             initial_ready: Initial number of ready items
-             initial_progress: Initial progress state
-             progress_poll_interval: Time between progress checks
-             successful_uploads: List of successful uploads for fallback reporting
-             failed_uploads: List of failed uploads for fallback reporting
-         """
-         try:
-             # Get final progress
-             final_progress = (
-                 self.openapi_service.dataset_api.dataset_dataset_id_progress_get(
-                     self.id
-                 )
-             )
-             total_ready = final_progress.ready
-             total_failed = final_progress.failed
-
-             # Make sure we account for all uploads
-             if total_ready + total_failed < total_uploads:
-                 # Try one more time after a longer wait
-                 time.sleep(5 * progress_poll_interval)
-                 final_progress = (
-                     self.openapi_service.dataset_api.dataset_dataset_id_progress_get(
-                         self.id
-                     )
-                 )
-                 total_ready = final_progress.ready
-                 total_failed = final_progress.failed
-
-             success_rate = (
-                 (total_ready / total_uploads * 100) if total_uploads > 0 else 0
-             )
-
-             logger.info(
-                 "Upload complete: %s ready, %s failed (%s%% success rate)",
-                 total_ready,
-                 total_uploads - total_ready,
-                 success_rate,
-             )
-         except Exception as e:
-             logger.error("Error getting final progress: %s", str(e))
-             logger.info(
-                 "Upload summary from local tracking: %s succeeded, %s failed",
-                 len(successful_uploads),
-                 len(failed_uploads),
-             )
-
-         if failed_uploads:
-             logger.error("Failed uploads: %s", failed_uploads)
-
-     def _add_media_from_paths(
-         self,
-         datapoints: list[Datapoint],
-         chunk_size: int = 50,
-         progress_poll_interval: float = 0.5,
-     ) -> tuple[list[Datapoint], list[Datapoint]]:
-         """
-         Upload media paths in chunks with managed resources.
-
-         Args:
-             datapoints: List of Datapoint objects to upload
-             chunk_size: Number of items to process in each batch
-             progress_poll_interval: Time in seconds between progress checks
-         Returns:
-             tuple[list[Datapoint], list[Datapoint]]: Lists of successful and failed datapoints
-
-         Raises:
-             ValueError: If multi_metadata lengths don't match media_paths length
-         """
-
-         # Setup tracking variables
-         total_uploads = len(datapoints)
-
-         # Create thread control events
-         stop_progress_tracking = threading.Event()
-         progress_tracking_error = threading.Event()
-
-         # Create and start progress tracking thread
-         progress_thread = self._get_progress_tracker(
-             total_uploads,
-             stop_progress_tracking,
-             progress_tracking_error,
-             progress_poll_interval,
-         )
-         progress_thread.start()
-
-         # Process uploads in chunks
-         try:
-             successful_uploads, failed_uploads = self._process_uploads_in_chunks(
-                 datapoints,
-                 chunk_size,
-                 stop_progress_tracking,
-                 progress_tracking_error,
-             )
-         finally:
-             progress_thread.join(10)  # Add margin to the timeout for tqdm
-
-         # Log final progress
-         self._log_final_progress(
-             total_uploads, progress_poll_interval, successful_uploads, failed_uploads
-         )
-
-         return successful_uploads, failed_uploads
-
-     def __str__(self) -> str:
-         return f"RapidataDataset(id={self.id})"
-
-     def __repr__(self) -> str:
-         return self.__str__()