rapidata 2.35.1__py3-none-any.whl → 2.35.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of rapidata might be problematic.
- rapidata/__init__.py +1 -1
- rapidata/api_client/api/leaderboard_api.py +3 -3
- rapidata/api_client_README.md +1 -1
- rapidata/rapidata_client/api/rapidata_exception.py +61 -32
- rapidata/rapidata_client/datapoints/assets/_multi_asset.py +7 -7
- rapidata/rapidata_client/order/_rapidata_dataset.py +158 -97
- rapidata/rapidata_client/order/_rapidata_order_builder.py +54 -22
- rapidata/rapidata_client/order/rapidata_order.py +109 -48
- rapidata/rapidata_client/rapidata_client.py +19 -14
- rapidata/rapidata_client/validation/rapidata_validation_set.py +13 -7
- rapidata/rapidata_client/validation/validation_set_manager.py +167 -98
- rapidata/service/credential_manager.py +13 -13
- rapidata/service/openapi_service.py +22 -13
- {rapidata-2.35.1.dist-info → rapidata-2.35.2.dist-info}/METADATA +1 -1
- {rapidata-2.35.1.dist-info → rapidata-2.35.2.dist-info}/RECORD +17 -17
- {rapidata-2.35.1.dist-info → rapidata-2.35.2.dist-info}/LICENSE +0 -0
- {rapidata-2.35.1.dist-info → rapidata-2.35.2.dist-info}/WHEEL +0 -0
rapidata/rapidata_client/order/_rapidata_dataset.py +158 -97

```diff
@@ -1,23 +1,41 @@
 from itertools import zip_longest
 
-from rapidata.api_client.models.create_datapoint_from_text_sources_model import CreateDatapointFromTextSourcesModel
-from rapidata.api_client.models.dataset_dataset_id_datapoints_post_request_metadata_inner import DatasetDatasetIdDatapointsPostRequestMetadataInner
+from rapidata.api_client.models.create_datapoint_from_text_sources_model import (
+    CreateDatapointFromTextSourcesModel,
+)
+from rapidata.api_client.models.dataset_dataset_id_datapoints_post_request_metadata_inner import (
+    DatasetDatasetIdDatapointsPostRequestMetadataInner,
+)
 from rapidata.rapidata_client.datapoints.datapoint import Datapoint
 from rapidata.rapidata_client.datapoints.metadata import Metadata
-from rapidata.rapidata_client.datapoints.assets import TextAsset, MediaAsset, MultiAsset, BaseAsset
+from rapidata.rapidata_client.datapoints.assets import (
+    TextAsset,
+    MediaAsset,
+    MultiAsset,
+    BaseAsset,
+)
 from rapidata.service import LocalFileService
 from rapidata.service.openapi_service import OpenAPIService
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from tqdm import tqdm
 
 from typing import cast, Sequence, Generator
-from rapidata.rapidata_client.logging import logger, managed_print, RapidataOutputManager
+from rapidata.rapidata_client.logging import (
+    logger,
+    managed_print,
+    RapidataOutputManager,
+)
 import time
 import threading
+from rapidata.rapidata_client.api.rapidata_exception import (
+    suppress_rapidata_error_logging,
+)
+
 
 def chunk_list(lst: list, chunk_size: int) -> Generator:
     for i in range(0, len(lst), chunk_size):
-        yield lst[i:i + chunk_size]
+        yield lst[i : i + chunk_size]
+
 
 class RapidataDataset:
     def __init__(self, dataset_id: str, openapi_service: OpenAPIService):
```
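The only change to `chunk_list` is Black's slice spacing; the batching behavior is identical. A minimal standalone sketch of what it does:

```python
from typing import Generator

def chunk_list(lst: list, chunk_size: int) -> Generator:
    # Yield successive chunk_size-sized slices; the last chunk may be shorter.
    for i in range(0, len(lst), chunk_size):
        yield lst[i : i + chunk_size]

assert list(chunk_list(list(range(7)), 3)) == [[0, 1, 2], [3, 4, 5], [6]]
```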
```diff
@@ -31,9 +49,9 @@ class RapidataDataset:
     ) -> tuple[list[Datapoint], list[Datapoint]]:
         if not datapoints:
             return [], []
-
+
         effective_asset_type = datapoints[0]._get_effective_asset_type()
-
+
         if issubclass(effective_asset_type, MediaAsset):
             return self._add_media_from_paths(datapoints)
         elif issubclass(effective_asset_type, TextAsset):
@@ -46,11 +64,13 @@ class RapidataDataset:
         datapoints: list[Datapoint],
         max_workers: int = 10,
     ) -> tuple[list[Datapoint], list[Datapoint]]:
-
+
         def upload_text_datapoint(datapoint: Datapoint, index: int) -> Datapoint:
             model = datapoint.create_text_upload_model(index)
-
-            self.openapi_service.dataset_api.dataset_dataset_id_datapoints_texts_post(dataset_id=self.id, create_datapoint_from_text_sources_model=model)
+
+            self.openapi_service.dataset_api.dataset_dataset_id_datapoints_texts_post(
+                dataset_id=self.id, create_datapoint_from_text_sources_model=model
+            )
             return datapoint
 
         successful_uploads: list[Datapoint] = []
@@ -63,7 +83,11 @@ class RapidataDataset:
             for i, datapoint in enumerate(datapoints)
         }
 
-        with tqdm(total=total_uploads, desc="Uploading text datapoints", disable=RapidataOutputManager.silent_mode) as pbar:
+        with tqdm(
+            total=total_uploads,
+            desc="Uploading text datapoints",
+            disable=RapidataOutputManager.silent_mode,
+        ) as pbar:
             for future in as_completed(future_to_datapoint.keys()):
                 datapoint = future_to_datapoint[future]
                 try:
@@ -72,7 +96,7 @@ class RapidataDataset:
                     successful_uploads.append(result)
                 except Exception as e:
                     failed_uploads.append(datapoint)
-                    logger.error(
+                    logger.error("Upload failed for %s: %s", datapoint, str(e))
 
         return successful_uploads, failed_uploads
 
```
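This release also rewrites log calls from eager formatting to lazy %-style arguments (here and in several hunks below), so the message is only built if the record is actually emitted. A small sketch of the difference, using a stand-in logger rather than the SDK's own:

```python
import logging

logging.basicConfig(level=logging.ERROR)
logger = logging.getLogger("rapidata.demo")  # stand-in for the SDK logger

datapoint, e = "datapoint-1", ValueError("boom")
# Lazy: the arguments are interpolated only when an ERROR record is emitted.
logger.error("Upload failed for %s: %s", datapoint, str(e))
# Eager: the f-string is built even though DEBUG is filtered out here.
logger.debug(f"Upload failed for {datapoint}: {str(e)}")
```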
```diff
@@ -84,16 +108,18 @@ class RapidataDataset:
     ) -> tuple[list[Datapoint], list[Datapoint]]:
         """
         Process single upload with retry logic and error tracking.
-
+
         Args:
             media_asset: MediaAsset or MultiAsset to upload
             meta_list: Optional sequence of metadata for the asset
             index: Sort index for the upload
             max_retries: Maximum number of retry attempts (default: 3)
-
+
         Returns:
             tuple[list[Datapoint], list[Datapoint]]: Lists of successful and failed datapoints
         """
+        logger.debug("Processing single upload for %s with index %s", datapoint, index)
+
         local_successful: list[Datapoint] = []
         local_failed: list[Datapoint] = []
 
@@ -105,42 +131,46 @@ class RapidataDataset:
         last_exception = None
         for attempt in range(max_retries):
             try:
-                self.openapi_service.dataset_api.dataset_dataset_id_datapoints_post(
-                    dataset_id=self.id,
-                    file=local_paths,
-                    url=urls,
-                    metadata=metadata,
-                    sort_index=index,
-                )
-
+                with suppress_rapidata_error_logging():
+                    self.openapi_service.dataset_api.dataset_dataset_id_datapoints_post(
+                        dataset_id=self.id,
+                        file=local_paths,
+                        url=urls,
+                        metadata=metadata,
+                        sort_index=index,
+                    )
+
                 local_successful.append(datapoint)
 
                 return local_successful, local_failed
-
+
             except Exception as e:
                 last_exception = e
                 if attempt < max_retries - 1:
                     # Exponential backoff: wait 1s, then 2s, then 4s
-                    retry_delay = 2
+                    retry_delay = 2**attempt
                     time.sleep(retry_delay)
-
-
+                    logger.debug("Error: %s", str(last_exception))
+                    logger.debug("Retrying %s of %s...", attempt + 1, max_retries)
+
         # If we get here, all retries failed
         local_failed.append(datapoint)
-
+        tqdm.write(
+            f"Upload failed for {datapoint} after {max_retries} attempts. \nFinal error: \n{str(last_exception)}"
+        )
 
         return local_successful, local_failed
 
     def _get_progress_tracker(
-        self,
-        total_uploads: int,
-        stop_event: threading.Event,
+        self,
+        total_uploads: int,
+        stop_event: threading.Event,
         progress_error_event: threading.Event,
         progress_poll_interval: float,
     ) -> threading.Thread:
         """
         Create and return a progress tracking thread that shows actual API progress.
-
+
         Args:
             total_uploads: Total number of uploads to track
             initial_ready: Initial number of ready items
```
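The retry hunk does two things: the new `retry_delay = 2**attempt` yields the 1s/2s/4s schedule the comment promises, and the POST is now wrapped in `suppress_rapidata_error_logging()` so failures that will be retried are not logged as errors mid-loop. A self-contained sketch of the backoff pattern, with a hypothetical `unreliable_upload` standing in for the datapoints POST:

```python
import random
import time

def unreliable_upload() -> None:
    # Hypothetical stand-in for the datapoints POST in the diff.
    if random.random() < 0.5:
        raise RuntimeError("transient upload error")

max_retries = 3
for attempt in range(max_retries):
    try:
        unreliable_upload()
        break
    except RuntimeError:
        if attempt < max_retries - 1:
            # 2**attempt -> 1s, 2s, 4s, ... (only the 1s and 2s delays are
            # reachable before the third and final attempt here).
            time.sleep(2**attempt)
```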
```diff
@@ -148,84 +178,97 @@ class RapidataDataset:
             stop_event: Event to signal thread to stop
             progress_error_event: Event to signal an error in progress tracking
             progress_poll_interval: Time between progress checks
-
+
         Returns:
             threading.Thread: The progress tracking thread
         """
+
         def progress_tracking_thread():
             try:
                 # Initialize progress bar with 0 completions
-                with tqdm(total=total_uploads, desc="Uploading datapoints", disable=RapidataOutputManager.silent_mode) as pbar:
+                with tqdm(
+                    total=total_uploads,
+                    desc="Uploading datapoints",
+                    disable=RapidataOutputManager.silent_mode,
+                ) as pbar:
                     prev_ready = 0
                     prev_failed = 0
                     stall_count = 0
                     last_progress_time = time.time()
-
+
                     # We'll wait for all uploads to finish + some extra time
                     # for the backend to fully process everything
                     all_uploads_complete = threading.Event()
-
+
                     while not stop_event.is_set() or not all_uploads_complete.is_set():
                         try:
-                            current_progress = self.openapi_service.dataset_api.dataset_dataset_id_progress_get(self.id)
-
+                            current_progress = self.openapi_service.dataset_api.dataset_dataset_id_progress_get(
+                                self.id
+                            )
+
                             # Calculate items completed since our initialization
                             completed_ready = current_progress.ready
                             completed_failed = current_progress.failed
                             total_completed = completed_ready + completed_failed
-
+
                             # Calculate newly completed items since our last check
                             new_ready = current_progress.ready - prev_ready
                             new_failed = current_progress.failed - prev_failed
-
+
                             # Update progress bar position to show actual completed items
                             # First reset to match the actual completed count
                             pbar.n = total_completed
                             pbar.refresh()
-
+
                             if new_ready > 0 or new_failed > 0:
                                 # We saw progress
                                 stall_count = 0
                                 last_progress_time = time.time()
                             else:
                                 stall_count += 1
-
+
                             # Update our tracking variables
                             prev_ready = current_progress.ready
                             prev_failed = current_progress.failed or 0
-
+
                             # Check if stop_event was set (all uploads submitted)
                             if stop_event.is_set():
-                                elapsed_since_last_progress = time.time() - last_progress_time
-
+                                elapsed_since_last_progress = (
+                                    time.time() - last_progress_time
+                                )
+
                                 # If we haven't seen progress for a while after all uploads were submitted
                                 if elapsed_since_last_progress > 5.0:
                                     # If we're at 100%, we're done
                                     if total_completed >= total_uploads:
                                         all_uploads_complete.set()
                                         break
-
+
                                     # If we're not at 100% but it's been a while with no progress
                                     if stall_count > 5:
                                         # We've polled several times with no progress, assume we're done
-                                        logger.warning(
+                                        logger.warning(
+                                            "\nProgress seems stalled at %s/%s.",
+                                            total_completed,
+                                            total_uploads,
+                                        )
                                         break
-
+
                         except Exception as e:
-                            logger.error(
+                            logger.error("\nError checking progress: %s", str(e))
                             stall_count += 1
-
+
                             if stall_count > 10: # Too many consecutive errors
                                 progress_error_event.set()
                                 break
-
+
                         # Sleep before next poll
                         time.sleep(progress_poll_interval)
-
+
             except Exception as e:
-                logger.error(
+                logger.error("Progress tracking thread error: %s", str(e))
                 progress_error_event.set()
-
+
         # Create and return the thread
         progress_thread = threading.Thread(target=progress_tracking_thread)
         progress_thread.daemon = True
```
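The tracker thread drives tqdm by assigning `pbar.n` from the backend's reported counts instead of calling `update()`, and `disable=RapidataOutputManager.silent_mode` turns the bar into a no-op in silent mode. A minimal sketch of the same polling pattern; `SILENT` and the simulated progress source are stand-ins, not SDK names:

```python
import threading
import time

from tqdm import tqdm

SILENT = False  # stand-in for RapidataOutputManager.silent_mode

def poll_progress(total: int, stop: threading.Event, poll_interval: float = 0.05) -> None:
    done = 0
    # disable=True makes tqdm render nothing, which is how silent mode hooks in.
    with tqdm(total=total, desc="Uploading datapoints", disable=SILENT) as pbar:
        while not stop.is_set():
            done = min(done + 1, total)  # simulate the backend's progress report
            pbar.n = done  # set the absolute position, as the diff does
            pbar.refresh()
            if done >= total:
                break
            time.sleep(poll_interval)

stop = threading.Event()
thread = threading.Thread(target=poll_progress, args=(5, stop))
thread.daemon = True  # don't block interpreter exit, mirroring the diff
thread.start()
thread.join(2)
```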
```diff
@@ -237,11 +280,11 @@ class RapidataDataset:
         max_workers: int,
         chunk_size: int,
         stop_progress_tracking: threading.Event,
-        progress_tracking_error: threading.Event
+        progress_tracking_error: threading.Event,
     ) -> tuple[list[Datapoint], list[Datapoint]]:
         """
         Process uploads in chunks with a ThreadPoolExecutor.
-
+
         Args:
             media_paths: List of assets to upload
             multi_metadata: Optional sequence of sequences of metadata
@@ -249,53 +292,55 @@ class RapidataDataset:
             chunk_size: Number of items to process in each batch
             stop_progress_tracking: Event to signal progress tracking to stop
             progress_tracking_error: Event to detect progress tracking errors
-
+
         Returns:
             tuple[list[str], list[str]]: Lists of successful and failed uploads
         """
         successful_uploads: list[Datapoint] = []
         failed_uploads: list[Datapoint] = []
-
+
         try:
             with ThreadPoolExecutor(max_workers=max_workers) as executor:
                 # Process uploads in chunks to avoid overwhelming the system
                 for chunk_idx, chunk in enumerate(chunk_list(datapoints, chunk_size)):
                     futures = [
                         executor.submit(
-                            self._process_single_upload,
-                            datapoint,
-                            index=(chunk_idx * chunk_size + i)
+                            self._process_single_upload,
+                            datapoint,
+                            index=(chunk_idx * chunk_size + i),
                         )
                         for i, datapoint in enumerate(chunk)
                     ]
-
+
                     # Wait for this chunk to complete before starting the next one
                     for future in as_completed(futures):
                         if progress_tracking_error.is_set():
-                            raise RuntimeError("Progress tracking failed, aborting uploads")
-
+                            raise RuntimeError(
+                                "Progress tracking failed, aborting uploads"
+                            )
+
                         try:
                             chunk_successful, chunk_failed = future.result()
                             successful_uploads.extend(chunk_successful)
                             failed_uploads.extend(chunk_failed)
                         except Exception as e:
-                            logger.error(
+                            logger.error("Future execution failed: %s", str(e))
         finally:
             # Signal to the progress tracking thread that all uploads have been submitted
             stop_progress_tracking.set()
-
+
         return successful_uploads, failed_uploads
 
     def _log_final_progress(
-        self,
-        total_uploads: int,
+        self,
+        total_uploads: int,
         progress_poll_interval: float,
         successful_uploads: list[Datapoint],
-        failed_uploads: list[Datapoint]
+        failed_uploads: list[Datapoint],
     ) -> None:
         """
         Log the final progress of the upload operation.
-
+
         Args:
             total_uploads: Total number of uploads
             initial_ready: Initial number of ready items
```
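Submission stays chunked: each chunk's futures are drained with `as_completed` before the next chunk is submitted, bounding in-flight work, while `chunk_idx * chunk_size + i` preserves a global sort index across chunks. A runnable sketch with a trivial stand-in for `_process_single_upload`:

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

def chunk_list(lst: list, chunk_size: int):
    for i in range(0, len(lst), chunk_size):
        yield lst[i : i + chunk_size]

def upload(item: int, index: int) -> int:
    return index  # stand-in for self._process_single_upload

items, chunk_size, results = list(range(10)), 4, []
with ThreadPoolExecutor(max_workers=3) as executor:
    for chunk_idx, chunk in enumerate(chunk_list(items, chunk_size)):
        futures = [
            # The global sort index survives chunking: chunk_idx * chunk_size + i.
            executor.submit(upload, item, index=chunk_idx * chunk_size + i)
            for i, item in enumerate(chunk)
        ]
        # Drain this chunk before submitting the next one.
        for future in as_completed(futures):
            results.append(future.result())

assert sorted(results) == list(range(10))
```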
```diff
@@ -304,29 +349,48 @@ class RapidataDataset:
             successful_uploads: List of successful uploads for fallback reporting
             failed_uploads: List of failed uploads for fallback reporting
         """
-        try:
+        try:
             # Get final progress
-            final_progress = self.openapi_service.dataset_api.dataset_dataset_id_progress_get(self.id)
+            final_progress = (
+                self.openapi_service.dataset_api.dataset_dataset_id_progress_get(
+                    self.id
+                )
+            )
             total_ready = final_progress.ready
             total_failed = final_progress.failed
-
+
             # Make sure we account for all uploads
             if total_ready + total_failed < total_uploads:
                 # Try one more time after a longer wait
                 time.sleep(5 * progress_poll_interval)
-                final_progress = self.openapi_service.dataset_api.dataset_dataset_id_progress_get(self.id)
+                final_progress = (
+                    self.openapi_service.dataset_api.dataset_dataset_id_progress_get(
+                        self.id
+                    )
+                )
                 total_ready = final_progress.ready
                 total_failed = final_progress.failed
-
-            success_rate = (
-
-
+
+            success_rate = (
+                (total_ready / total_uploads * 100) if total_uploads > 0 else 0
+            )
+
+            logger.info(
+                "Upload complete: %s ready, %s failed (%s%% success rate)",
+                total_ready,
+                total_uploads - total_ready,
+                success_rate,
+            )
         except Exception as e:
-            logger.error(
-            logger.info(
+            logger.error("Error getting final progress: %s", str(e))
+            logger.info(
+                "Upload summary from local tracking: %s succeeded, %s failed",
+                len(successful_uploads),
+                len(failed_uploads),
+            )
 
         if failed_uploads:
-            logger.error(
+            logger.error("Failed uploads: %s", failed_uploads)
 
     def _add_media_from_paths(
         self,
```
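The reformatted `success_rate` expression in the hunk above keeps its zero-division guard: with nothing uploaded it reports 0 rather than raising. In isolation:

```python
def success_rate(total_ready: int, total_uploads: int) -> float:
    # Guard the division: an empty upload batch yields 0, not ZeroDivisionError.
    return (total_ready / total_uploads * 100) if total_uploads > 0 else 0

assert success_rate(9, 10) == 90.0
assert success_rate(0, 0) == 0
```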
```diff
@@ -337,36 +401,36 @@ class RapidataDataset:
     ) -> tuple[list[Datapoint], list[Datapoint]]:
         """
         Upload media paths in chunks with managed resources.
-
+
         Args:
             datapoints: List of Datapoint objects to upload
             max_workers: Maximum number of concurrent upload workers
             chunk_size: Number of items to process in each batch
             progress_poll_interval: Time in seconds between progress checks
-
+
         Returns:
             tuple[list[Datapoint], list[Datapoint]]: Lists of successful and failed datapoints
-
+
         Raises:
             ValueError: If multi_metadata lengths don't match media_paths length
         """
-
+
         # Setup tracking variables
         total_uploads = len(datapoints)
-
+
         # Create thread control events
         stop_progress_tracking = threading.Event()
         progress_tracking_error = threading.Event()
-
+
         # Create and start progress tracking thread
         progress_thread = self._get_progress_tracker(
-            total_uploads,
-            stop_progress_tracking,
+            total_uploads,
+            stop_progress_tracking,
             progress_tracking_error,
-            progress_poll_interval
+            progress_poll_interval,
         )
         progress_thread.start()
-
+
         # Process uploads in chunks
         try:
             successful_uploads, failed_uploads = self._process_uploads_in_chunks(
@@ -374,23 +438,20 @@ class RapidataDataset:
                 max_workers,
                 chunk_size,
                 stop_progress_tracking,
-                progress_tracking_error
+                progress_tracking_error,
             )
         finally:
             progress_thread.join(10)  # Add margin to the timeout for tqdm
-
+
         # Log final progress
         self._log_final_progress(
-            total_uploads,
-            progress_poll_interval,
-            successful_uploads,
-            failed_uploads
+            total_uploads, progress_poll_interval, successful_uploads, failed_uploads
         )
 
         return successful_uploads, failed_uploads
 
     def __str__(self) -> str:
         return f"RapidataDataset(id={self.id})"
-
+
     def __repr__(self) -> str:
         return self.__str__()
```
rapidata/rapidata_client/order/_rapidata_order_builder.py +54 -22

```diff
@@ -1,13 +1,22 @@
 from typing import Literal, Optional, cast, Sequence
 
 from rapidata.api_client.models.ab_test_selection_a_inner import AbTestSelectionAInner
-from rapidata.api_client.models.and_user_filter_model_filters_inner import AndUserFilterModelFiltersInner
+from rapidata.api_client.models.and_user_filter_model_filters_inner import (
+    AndUserFilterModelFiltersInner,
+)
 from rapidata.api_client.models.create_order_model import CreateOrderModel
-from rapidata.api_client.models.create_order_model_referee import CreateOrderModelReferee
-
+from rapidata.api_client.models.create_order_model_referee import (
+    CreateOrderModelReferee,
+)
+from rapidata.api_client.models.create_order_model_workflow import (
+    CreateOrderModelWorkflow,
+)
 
 from rapidata.rapidata_client.datapoints.datapoint import Datapoint
-from rapidata.rapidata_client.exceptions.failed_upload_exception import FailedUploadException, _parse_failed_uploads
+from rapidata.rapidata_client.exceptions.failed_upload_exception import (
+    FailedUploadException,
+    _parse_failed_uploads,
+)
 from rapidata.rapidata_client.filter import RapidataFilter
 from rapidata.rapidata_client.logging import logger, managed_print
 from rapidata.rapidata_client.order._rapidata_dataset import RapidataDataset
@@ -108,14 +117,14 @@ class RapidataOrderBuilder:
             RapidataOrder: The created RapidataOrder instance.
         """
         order_model = self._to_model()
-        logger.debug(
+        logger.debug("Creating order with model: %s", order_model)
 
         result = self.__openapi_service.order_api.order_post(
             create_order_model=order_model
         )
 
         self.order_id = str(result.order_id)
-        logger.debug(
+        logger.debug("Order created with ID: %s", self.order_id)
 
         self.__dataset = (
             RapidataDataset(result.dataset_id, self.__openapi_service)
@@ -123,7 +132,7 @@ class RapidataOrderBuilder:
             else None
         )
         if self.__dataset:
-            logger.debug(
+            logger.debug("Dataset created with ID: %s", self.__dataset.id)
         else:
             logger.warning("No dataset created for this order.")
 
@@ -133,25 +142,34 @@ class RapidataOrderBuilder:
             name=self._name,
         )
 
-        logger.debug(
+        logger.debug("Order created: %s", order)
         logger.debug("Adding media to the order.")
 
         if self.__dataset:
             _, failed_uploads = self.__dataset.add_datapoints(self.__datapoints)
-
+
             if failed_uploads:
                 raise FailedUploadException(self.__dataset, order, failed_uploads)
-
+
         else:
-            raise RuntimeError(f"No dataset created for this order. order_id: {self.order_id}")
-
+            raise RuntimeError(
+                f"No dataset created for this order. order_id: {self.order_id}"
+            )
+
         logger.debug("Media added to the order.")
         logger.debug("Setting order to preview")
         try:
             self.__openapi_service.order_api.order_order_id_preview_post(self.order_id)
         except Exception:
-            failed_uploads = _parse_failed_uploads(
-
+            failed_uploads = _parse_failed_uploads(
+                self.__openapi_service.dataset_api.dataset_dataset_id_datapoints_failed_get(
+                    self.__dataset.id
+                )
+            )
+            logger.error(
+                "Internal download error for datapoints: %s\nWARNING: Failed Datapoints in error do not contain metadata.",
+                failed_uploads,
+            )
             raise FailedUploadException(self.__dataset, order, failed_uploads)
         return order
 
@@ -201,7 +219,9 @@ class RapidataOrderBuilder:
             RapidataOrderBuilder: The updated RapidataOrderBuilder instance.
         """
         if not isinstance(datapoints, list):
-            raise TypeError("Datapoints must be provided as a list of Datapoint objects.")
+            raise TypeError(
+                "Datapoints must be provided as a list of Datapoint objects."
+            )
 
         self.__datapoints = datapoints
         return self
@@ -219,7 +239,7 @@ class RapidataOrderBuilder:
 
         if not isinstance(settings, list):
             raise TypeError("Settings must be provided as a list of Setting objects.")
-
+
         for s in settings:
             if not isinstance(s, RapidataSetting):
                 raise TypeError("The settings list must only contain Setting objects.")
@@ -250,7 +270,9 @@ class RapidataOrderBuilder:
         self.__user_filters = filters
         return self
 
-    def _validation_set_id(self, validation_set_id: str | None = None) -> "RapidataOrderBuilder":
+    def _validation_set_id(
+        self, validation_set_id: str | None = None
+    ) -> "RapidataOrderBuilder":
         """
         Set the validation set ID for the order.
 
@@ -281,7 +303,9 @@ class RapidataOrderBuilder:
         """
         raise NotImplementedError("Not implemented yet.")
 
-    def _selections(self, selections: Sequence[RapidataSelection]) -> "RapidataOrderBuilder":
+    def _selections(
+        self, selections: Sequence[RapidataSelection]
+    ) -> "RapidataOrderBuilder":
         """
         Set the selections for the order.
 
@@ -318,13 +342,21 @@ class RapidataOrderBuilder:
 
         self.__priority = priority
         return self
-
-    def _sticky_state(self, sticky_state: Literal["None", "Temporary", "Permanent"] | None = None) -> "RapidataOrderBuilder":
+
+    def _sticky_state(
+        self, sticky_state: Literal["None", "Temporary", "Permanent"] | None = None
+    ) -> "RapidataOrderBuilder":
         """
         Set the sticky state for the order.
         """
-        if sticky_state is not None and sticky_state not in ["None", "Temporary", "Permanent"]:
-            raise TypeError("Sticky state must be of type Literal['None', 'Temporary', 'Permanent'].")
+        if sticky_state is not None and sticky_state not in [
+            "None",
+            "Temporary",
+            "Permanent",
+        ]:
+            raise TypeError(
+                "Sticky state must be of type Literal['None', 'Temporary', 'Permanent']."
+            )
 
         self.__sticky_state = sticky_state
         return self
```