rapidata 2.40.1__py3-none-any.whl → 2.40.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of rapidata might be problematic.
- rapidata/__init__.py +1 -1
- rapidata/rapidata_client/config/upload_config.py +1 -0
- rapidata/rapidata_client/exceptions/failed_upload_exception.py +1 -1
- rapidata/rapidata_client/order/_rapidata_order_builder.py +7 -3
- rapidata/rapidata_client/order/dataset/_progress_tracker.py +91 -0
- rapidata/rapidata_client/order/dataset/_rapidata_dataset.py +286 -0
- {rapidata-2.40.1.dist-info → rapidata-2.40.2.dist-info}/METADATA +1 -1
- {rapidata-2.40.1.dist-info → rapidata-2.40.2.dist-info}/RECORD +10 -9
- rapidata/rapidata_client/order/_rapidata_dataset.py +0 -475
- {rapidata-2.40.1.dist-info → rapidata-2.40.2.dist-info}/LICENSE +0 -0
- {rapidata-2.40.1.dist-info → rapidata-2.40.2.dist-info}/WHEEL +0 -0
rapidata/__init__.py
CHANGED
@@ -10,7 +10,7 @@ from rapidata.api_client.models.original_filename_metadata_model import (
 from rapidata.api_client.models.source_url_metadata_model import SourceUrlMetadataModel
 from rapidata.rapidata_client.datapoints.assets import MediaAsset, MultiAsset
 from rapidata.rapidata_client.datapoints._datapoint import Datapoint
-from rapidata.rapidata_client.order._rapidata_dataset import RapidataDataset
+from rapidata.rapidata_client.order.dataset._rapidata_dataset import RapidataDataset
 from rapidata.rapidata_client.order.rapidata_order import RapidataOrder
@@ -32,7 +32,7 @@ from rapidata.rapidata_client.config import (
 from rapidata.rapidata_client.validation.validation_set_manager import (
     ValidationSetManager,
 )
-from rapidata.rapidata_client.order._rapidata_dataset import RapidataDataset
+from rapidata.rapidata_client.order.dataset._rapidata_dataset import RapidataDataset
 from rapidata.rapidata_client.order.rapidata_order import RapidataOrder
 from rapidata.rapidata_client.referee import Referee
 from rapidata.rapidata_client.referee._naive_referee import NaiveReferee
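Both import sites in __init__.py now point at the relocated dataset module. As a small illustration (not part of the diff itself), code that previously imported the class through the old private path would switch to the new package location:

    from rapidata.rapidata_client.order.dataset._rapidata_dataset import RapidataDataset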
rapidata/rapidata_client/order/_rapidata_order_builder.py
CHANGED
@@ -235,6 +235,10 @@ class RapidataOrderBuilder:
                 + f"Please open this URL in your browser: '{encoded_url}'"
                 + Fore.RESET
             )
+            managed_print(
+                "If you want to avoid the automatic validation set creation in the future, set `rapidata_config.order.autoValidationSetCreation = False`."
+            )
+            managed_print()
 
         self.__dataset = (
             RapidataDataset(result.dataset_id, self.__openapi_service)
@@ -253,7 +257,7 @@
         )
 
         logger.debug("Order created: %s", order)
-        logger.debug("Adding
+        logger.debug("Adding datapoints to the order.")
 
         if self.__dataset:
             with tracer.start_as_current_span("add_datapoints"):
@@ -267,7 +271,7 @@
                 f"No dataset created for this order. order_id: {self.order_id}"
             )
 
-        logger.debug("
+        logger.debug("Datapoints added to the order.")
         logger.debug("Setting order to preview")
         try:
             self.__openapi_service.order_api.order_order_id_preview_post(self.order_id)
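The new managed_print hint above names a configuration switch. A minimal sketch of setting it ahead of time (assuming rapidata_config is imported the same way the new dataset modules in this diff import it):

    from rapidata.rapidata_client.config import rapidata_config

    # Opt out of the automatic validation set creation for future orders,
    # as suggested by the printed hint.
    rapidata_config.order.autoValidationSetCreation = False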
rapidata/rapidata_client/order/dataset/_progress_tracker.py
ADDED
@@ -0,0 +1,91 @@
+import threading
+import time
+from tqdm import tqdm
+
+from rapidata.service.openapi_service import OpenAPIService
+from rapidata.rapidata_client.config import logger, rapidata_config
+
+
+class ProgressTracker:
+    """
+    Track dataset upload progress in a background thread with shallow indentation.
+
+    This class encapsulates the progress polling loop to keep methods in
+    `RapidataDataset` simpler and below the maximum indentation depth.
+    """
+
+    def __init__(
+        self,
+        dataset_id: str,
+        openapi_service: OpenAPIService,
+        total_uploads: int,
+        progress_poll_interval: float,
+    ) -> None:
+        self.dataset_id = dataset_id
+        self.openapi_service = openapi_service
+        self.total_uploads = total_uploads
+        self.progress_poll_interval = progress_poll_interval
+        self.upload_complete = False
+
+    def _get_progress_or_none(self):
+        try:
+            return self.openapi_service.dataset_api.dataset_dataset_id_progress_get(
+                self.dataset_id
+            )
+        except Exception:  # noqa: BLE001
+            return None
+
+    def complete(self) -> None:
+        self.upload_complete = True
+
+    def run(self) -> None:
+        try:
+            with tqdm(
+                total=self.total_uploads,
+                desc="Uploading datapoints",
+                disable=rapidata_config.logging.silent_mode,
+            ) as pbar:
+                while True:
+                    current_progress = self._get_progress_or_none()
+                    if current_progress is None:
+                        time.sleep(self.progress_poll_interval)
+                        logger.debug(
+                            "No progress yet, sleeping for %s seconds",
+                            self.progress_poll_interval,
+                        )
+                        continue
+
+                    total_completed = current_progress.ready + current_progress.failed
+
+                    pbar.n = total_completed
+                    pbar.refresh()
+
+                    time.sleep(self.progress_poll_interval)
+                    if total_completed >= self.total_uploads:
+                        break
+
+                    if self.upload_complete and current_progress.pending == 0:
+                        break
+
+                pbar.close()
+
+            success_rate = (
+                round((current_progress.ready / self.total_uploads * 100), 2)
+                if self.total_uploads > 0
+                else 0
+            )
+
+            logger.info(
+                "Upload complete: %s ready, %s failed (%s%% success rate)",
+                current_progress.ready,
+                current_progress.failed,
+                success_rate,
+            )
+        except Exception as e:  # noqa: BLE001
+            logger.error("Progress tracking thread error: %s", str(e))
+            raise RuntimeError("Progress tracking failed, aborting uploads")
+
+    def create_thread(self) -> threading.Thread:
+        thread = threading.Thread(target=self.run)
+        thread.daemon = True
+        return thread
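For orientation, the new RapidataDataset._add_media_from_paths (added later in this diff) drives the tracker roughly like this; a condensed sketch, not the verbatim code:

    tracker = ProgressTracker(
        dataset_id=dataset_id,              # hypothetical local variables
        openapi_service=openapi_service,
        total_uploads=len(datapoints),
        progress_poll_interval=0.5,
    )
    progress_thread = tracker.create_thread()   # daemon thread running tracker.run()
    progress_thread.start()
    try:
        pass                                    # submit the uploads here
    finally:
        tracker.complete()                      # signal that all uploads were submitted
        progress_thread.join(10)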
rapidata/rapidata_client/order/dataset/_rapidata_dataset.py
ADDED
@@ -0,0 +1,286 @@
+from rapidata.rapidata_client.datapoints._datapoint import Datapoint
+from rapidata.rapidata_client.datapoints.assets import TextAsset, MediaAsset
+from rapidata.service import LocalFileService
+from rapidata.service.openapi_service import OpenAPIService
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from tqdm import tqdm
+
+from typing import Generator
+from rapidata.rapidata_client.config import logger
+import time
+import threading
+from rapidata.rapidata_client.api.rapidata_api_client import (
+    suppress_rapidata_error_logging,
+)
+from rapidata.rapidata_client.config.rapidata_config import rapidata_config
+from rapidata.rapidata_client.order.dataset._progress_tracker import ProgressTracker
+
+# Add OpenTelemetry context imports for thread propagation
+from opentelemetry import context as otel_context
+
+
+def chunk_list(lst: list, chunk_size: int) -> Generator:
+    for i in range(0, len(lst), chunk_size):
+        yield lst[i : i + chunk_size]
+
+
+class RapidataDataset:
+    def __init__(self, dataset_id: str, openapi_service: OpenAPIService):
+        self.id = dataset_id
+        self.openapi_service = openapi_service
+        self.local_file_service = LocalFileService()
+
+    def add_datapoints(
+        self,
+        datapoints: list[Datapoint],
+    ) -> tuple[list[Datapoint], list[Datapoint]]:
+        if not datapoints:
+            return [], []
+
+        effective_asset_type = datapoints[0]._get_effective_asset_type()
+
+        logger.debug(f"Config for datapoint upload: {rapidata_config}")
+
+        if issubclass(effective_asset_type, MediaAsset):
+            return self._add_media_from_paths(
+                datapoints,
+            )
+        elif issubclass(effective_asset_type, TextAsset):
+            return self._add_texts(datapoints)
+        else:
+            raise ValueError(f"Unsupported asset type: {effective_asset_type}")
+
+    def _add_texts(
+        self, datapoints: list[Datapoint]
+    ) -> tuple[list[Datapoint], list[Datapoint]]:
+
+        def upload_text_datapoint(datapoint: Datapoint, index: int) -> Datapoint:
+            model = datapoint.create_text_upload_model(index)
+
+            self.openapi_service.dataset_api.dataset_dataset_id_datapoints_texts_post(
+                dataset_id=self.id, create_datapoint_from_text_sources_model=model
+            )
+            return datapoint
+
+        def upload_with_context(
+            context: otel_context.Context, datapoint: Datapoint, index: int
+        ) -> Datapoint:
+            """Wrapper function that runs upload_text_datapoint with the provided context."""
+            token = otel_context.attach(context)
+            try:
+                return upload_text_datapoint(datapoint, index)
+            finally:
+                otel_context.detach(token)
+
+        successful_uploads: list[Datapoint] = []
+        failed_uploads: list[Datapoint] = []
+
+        # Capture the current OpenTelemetry context before creating threads
+        current_context = otel_context.get_current()
+
+        total_uploads = len(datapoints)
+        with ThreadPoolExecutor(
+            max_workers=rapidata_config.upload.maxWorkers
+        ) as executor:
+            future_to_datapoint = {
+                executor.submit(
+                    upload_with_context, current_context, datapoint, i
+                ): datapoint
+                for i, datapoint in enumerate(datapoints)
+            }
+
+            with tqdm(
+                total=total_uploads,
+                desc="Uploading text datapoints",
+                disable=rapidata_config.logging.silent_mode,
+            ) as pbar:
+                for future in as_completed(future_to_datapoint.keys()):
+                    datapoint = future_to_datapoint[future]
+                    try:
+                        result = future.result()
+                        pbar.update(1)
+                        successful_uploads.append(result)
+                    except Exception as e:
+                        failed_uploads.append(datapoint)
+                        logger.error("Upload failed for %s: %s", datapoint, str(e))
+
+        return successful_uploads, failed_uploads
+
+    def _process_single_upload(
+        self,
+        datapoint: Datapoint,
+        index: int,
+    ) -> tuple[list[Datapoint], list[Datapoint]]:
+        """
+        Process single upload with retry logic and error tracking.
+
+        Args:
+            media_asset: MediaAsset or MultiAsset to upload
+            meta_list: Optional sequence of metadata for the asset
+            index: Sort index for the upload
+            max_retries: Maximum number of retry attempts (default: 3)
+
+        Returns:
+            tuple[list[Datapoint], list[Datapoint]]: Lists of successful and failed datapoints
+        """
+        logger.debug("Processing single upload for %s with index %s", datapoint, index)
+
+        local_successful: list[Datapoint] = []
+        local_failed: list[Datapoint] = []
+
+        metadata = datapoint.get_prepared_metadata()
+
+        local_paths = datapoint.get_local_file_paths()
+        urls = datapoint.get_urls()
+
+        last_exception = None
+        for attempt in range(rapidata_config.upload.maxRetries):
+            try:
+                with suppress_rapidata_error_logging():
+                    self.openapi_service.dataset_api.dataset_dataset_id_datapoints_post(
+                        dataset_id=self.id,
+                        file=local_paths,
+                        url=urls,
+                        metadata=metadata,
+                        sort_index=index,
+                    )
+
+                local_successful.append(datapoint)
+
+                return local_successful, local_failed
+
+            except Exception as e:
+                last_exception = e
+                if attempt < rapidata_config.upload.maxRetries - 1:
+                    # Exponential backoff: wait 1s, then 2s, then 4s
+                    retry_delay = 2**attempt
+                    time.sleep(retry_delay)
+                    logger.debug("Error: %s", str(last_exception))
+                    logger.debug(
+                        "Retrying %s of %s...",
+                        attempt + 1,
+                        rapidata_config.upload.maxRetries,
+                    )
+
+        # If we get here, all retries failed
+        local_failed.append(datapoint)
+        tqdm.write(
+            f"Upload failed for {datapoint} after {rapidata_config.upload.maxRetries} attempts. \nFinal error: \n{str(last_exception)}"
+        )
+
+        return local_successful, local_failed
+
+    def _process_uploads_in_chunks(
+        self,
+        datapoints: list[Datapoint],
+    ) -> tuple[list[Datapoint], list[Datapoint]]:
+        """
+        Process uploads in chunks with a ThreadPoolExecutor.
+
+        Args:
+            media_paths: List of assets to upload
+            multi_metadata: Optional sequence of sequences of metadata
+            chunk_size: Number of items to process in each batch
+
+        Returns:
+            tuple[list[str], list[str]]: Lists of successful and failed uploads
+        """
+        successful_uploads: list[Datapoint] = []
+        failed_uploads: list[Datapoint] = []
+
+        def process_upload_with_context(
+            context: otel_context.Context, datapoint: Datapoint, index: int
+        ) -> tuple[list[Datapoint], list[Datapoint]]:
+            """Wrapper function that runs _process_single_upload with the provided context."""
+            token = otel_context.attach(context)
+            try:
+                return self._process_single_upload(datapoint, index)
+            finally:
+                otel_context.detach(token)
+
+        # Capture the current OpenTelemetry context before creating threads
+        current_context = otel_context.get_current()
+
+        with ThreadPoolExecutor(
+            max_workers=rapidata_config.upload.maxWorkers
+        ) as executor:
+            # Process uploads in chunks to avoid overwhelming the system
+            for chunk_idx, chunk in enumerate(
+                chunk_list(datapoints, rapidata_config.upload.chunkSize)
+            ):
+                futures = [
+                    executor.submit(
+                        process_upload_with_context,
+                        current_context,
+                        datapoint,
+                        chunk_idx * rapidata_config.upload.chunkSize + i,
+                    )
+                    for i, datapoint in enumerate(chunk)
+                ]
+
+                # Wait for this chunk to complete before starting the next one
+                for future in as_completed(futures):
+                    try:
+                        chunk_successful, chunk_failed = future.result()
+                        successful_uploads.extend(chunk_successful)
+                        failed_uploads.extend(chunk_failed)
+                    except Exception as e:
+                        logger.error("Future execution failed: %s", str(e))
+
+        return successful_uploads, failed_uploads
+
+    def _add_media_from_paths(
+        self,
+        datapoints: list[Datapoint],
+        progress_poll_interval: float = 0.5,
+    ) -> tuple[list[Datapoint], list[Datapoint]]:
+        """
+        Upload media paths in chunks with managed resources.
+
+        Args:
+            datapoints: List of Datapoint objects to upload
+            chunk_size: Number of items to process in each batch
+            progress_poll_interval: Time in seconds between progress checks
+        Returns:
+            tuple[list[Datapoint], list[Datapoint]]: Lists of successful and failed datapoints
+
+        Raises:
+            ValueError: If multi_metadata lengths don't match media_paths length
+        """
+
+        # Setup tracking variables
+        total_uploads = len(datapoints)
+
+        # Create and start progress tracking thread
+        progress_tracker = ProgressTracker(
+            dataset_id=self.id,
+            openapi_service=self.openapi_service,
+            total_uploads=total_uploads,
+            progress_poll_interval=progress_poll_interval,
+        )
+        progress_thread = progress_tracker.create_thread()
+        progress_thread.start()
+
+        # Process uploads in chunks
+        try:
+            successful_uploads, failed_uploads = self._process_uploads_in_chunks(
+                datapoints,
+            )
+        finally:
+            progress_tracker.complete()
+            progress_thread.join(10)
+
+        if failed_uploads:
+            logger.error(
+                "Upload failed for %s datapoints: %s",
+                len(failed_uploads),
+                failed_uploads,
+            )
+
+        return successful_uploads, failed_uploads
+
+    def __str__(self) -> str:
+        return f"RapidataDataset(id={self.id})"
+
+    def __repr__(self) -> str:
+        return self.__str__()
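Two small standalone illustrations of the helpers above (a sketch; the real module reads the chunk size and retry count from rapidata_config):

    # chunk_list splits the datapoint list into fixed-size batches
    def chunk_list(lst, chunk_size):
        for i in range(0, len(lst), chunk_size):
            yield lst[i : i + chunk_size]

    print(list(chunk_list(list(range(7)), 3)))     # [[0, 1, 2], [3, 4, 5], [6]]

    # _process_single_upload retries with exponential backoff between attempts
    print([2 ** attempt for attempt in range(3)])  # [1, 2, 4] seconds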
{rapidata-2.40.1.dist-info → rapidata-2.40.2.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
-rapidata/__init__.py,sha256=
+rapidata/__init__.py,sha256=nNNJT2nQfhHYe5yS9T3V-0MeyarzPFCtE_66Wnit6ho,917
 rapidata/api_client/__init__.py,sha256=utY2iWepKJQO_iGz6aIg_qSoqoDkV9pBMAA58pIFE4M,36016
 rapidata/api_client/api/__init__.py,sha256=07qqwzQiBYt5V2BtnzbXhZL2cmVHATyZmCSGshIXLck,1603
 rapidata/api_client/api/benchmark_api.py,sha256=Mlx2qDDJcgPjWvaBnps9dxvVd0re1knG0SyoLUiHKSc,119756
@@ -587,7 +587,7 @@ rapidata/rapidata_client/config/managed_print.py,sha256=2T6dwgR1EZzFAdOEyPp_BBUs
 rapidata/rapidata_client/config/order_config.py,sha256=XxRZERzUUA9md6-PVlV__eCw8DD2kPbT_UmMwG1mAS4,615
 rapidata/rapidata_client/config/rapidata_config.py,sha256=mURnKdl5-2sE4e_IYY9-aBkix6a12t47otEErGE_q0c,1507
 rapidata/rapidata_client/config/tracer.py,sha256=h3GXzaX79HPcip4fBhLaLW0mRlXttR7D3KA78ZT0KVw,4736
-rapidata/rapidata_client/config/upload_config.py,sha256=
+rapidata/rapidata_client/config/upload_config.py,sha256=hjefl-w9WaCNeCEe6hdnrAQEMjgDy-r1zgUUIFR68wk,473
 rapidata/rapidata_client/country_codes/__init__.py,sha256=FB9Dcks44J6C6YBSYmTmNZ71tE130x6NO_3aLJ8fKzQ,40
 rapidata/rapidata_client/country_codes/country_codes.py,sha256=ePHqeb7y9DWQZAnddBzPx1puYBcrgUjdR2sbFijuFD8,283
 rapidata/rapidata_client/datapoints/__init__.py,sha256=YiXWlFKSi3ABP35zDukL7_z5uEdRrCMriquM6BoX6-s,276
@@ -611,7 +611,7 @@ rapidata/rapidata_client/datapoints/metadata/_select_words_metadata.py,sha256=T8
 rapidata/rapidata_client/demographic/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 rapidata/rapidata_client/demographic/demographic_manager.py,sha256=x0kQdgqMXAx7VuZJiP2HeI_dtKEd-W-hcY3URDcEfrU,1089
 rapidata/rapidata_client/exceptions/__init__.py,sha256=2hbWRgjlCGuoLPVDloQmmH81uzm9F2OAX2iFGCJyRu8,59
-rapidata/rapidata_client/exceptions/failed_upload_exception.py,sha256=
+rapidata/rapidata_client/exceptions/failed_upload_exception.py,sha256=jsd2foR3c8X5g4hgljgMAY5X_JTdmUuhBPWaL12938E,3117
 rapidata/rapidata_client/filter/__init__.py,sha256=j_Kfz_asNVxwp56SAN2saB7ZAHg3smL5_W2sSitmuJY,548
 rapidata/rapidata_client/filter/_base_filter.py,sha256=NVa2oWgtXD9kmXWyMkYZZ-2RYzgcN0hO76uGrEXXLEs,2384
 rapidata/rapidata_client/filter/age_filter.py,sha256=mVZaKyBoK-mml_oFox97l1yUXvINPk-2cEimuU_FJac,908
@@ -631,8 +631,9 @@ rapidata/rapidata_client/filter/rapidata_filters.py,sha256=B8ptQsaAn1e14Grv8xBYQ
 rapidata/rapidata_client/filter/response_count_filter.py,sha256=i2u2YQD3_RLQRZyqAceAGLQS3es97Q2n8KTlgfDYMko,2332
 rapidata/rapidata_client/filter/user_score_filter.py,sha256=4B3Zzp7aosDFmte3nLPTlXMN4zatT6Wcq5QLIoXqhgI,1910
 rapidata/rapidata_client/order/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-rapidata/rapidata_client/order/
-rapidata/rapidata_client/order/
+rapidata/rapidata_client/order/_rapidata_order_builder.py,sha256=C-TbKELNuLjQiZt9Gsl6LdtzIUtsu0sNLKGIcLvJEHk,17120
+rapidata/rapidata_client/order/dataset/_progress_tracker.py,sha256=PkTSYrLVNgtXjklXj5ikBEcgF6qaYXoafYyUZQHRn9M,3109
+rapidata/rapidata_client/order/dataset/_rapidata_dataset.py,sha256=ONH56htEvoVZvkSItuTi3_88kaDWohmYYABSoAPEn4Q,10724
 rapidata/rapidata_client/order/rapidata_order.py,sha256=FvZi3t4dARRNsKWvYiNxVvM50AzPwQYR3AzI4utD6OI,14497
 rapidata/rapidata_client/order/rapidata_order_manager.py,sha256=XiV_BpJxG6d8o0rFDYhnB3_mb576CQG5hY-qVXlJZKY,42592
 rapidata/rapidata_client/order/rapidata_results.py,sha256=weL4S14fzug3ZOJbQk9Oj-4tv2jx5aZAMp7VJ-a6Qq4,8437
@@ -689,7 +690,7 @@ rapidata/service/credential_manager.py,sha256=T3yL4tXVnibRytxjQkOC-ex3kFGQR5KcKU
 rapidata/service/local_file_service.py,sha256=0Q4LdoEtPFKzgXK2oZ1cQ-X7FipakscjGnnBH8dRFRQ,855
 rapidata/service/openapi_service.py,sha256=k3V4eMNcAjBcxEv17lDivK8LV5TEjRTL9B_5KBlhcas,5482
 rapidata/types/__init__.py,sha256=gSGrmWV5gEA6pPfAR5vwSy_DvibO5IjCZDiB7LtlMOQ,6134
-rapidata-2.40.
-rapidata-2.40.
-rapidata-2.40.
-rapidata-2.40.
+rapidata-2.40.2.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+rapidata-2.40.2.dist-info/METADATA,sha256=j6edMuFl30ALoelOUbusmBXYPwMCJjfUZbWfzk2GsdY,1406
+rapidata-2.40.2.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+rapidata-2.40.2.dist-info/RECORD,,
rapidata/rapidata_client/order/_rapidata_dataset.py
DELETED
@@ -1,475 +0,0 @@
-from rapidata.rapidata_client.datapoints._datapoint import Datapoint
-from rapidata.rapidata_client.datapoints.assets import TextAsset, MediaAsset
-from rapidata.service import LocalFileService
-from rapidata.service.openapi_service import OpenAPIService
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from tqdm import tqdm
-
-from typing import Generator
-from rapidata.rapidata_client.config import logger, managed_print
-import time
-import threading
-from rapidata.rapidata_client.api.rapidata_api_client import (
-    suppress_rapidata_error_logging,
-)
-from rapidata.rapidata_client.config.rapidata_config import rapidata_config
-
-# Add OpenTelemetry context imports for thread propagation
-from opentelemetry import context as otel_context
-
-
-def chunk_list(lst: list, chunk_size: int) -> Generator:
-    for i in range(0, len(lst), chunk_size):
-        yield lst[i : i + chunk_size]
-
-
-class RapidataDataset:
-    def __init__(self, dataset_id: str, openapi_service: OpenAPIService):
-        self.id = dataset_id
-        self.openapi_service = openapi_service
-        self.local_file_service = LocalFileService()
-
-    def add_datapoints(
-        self,
-        datapoints: list[Datapoint],
-    ) -> tuple[list[Datapoint], list[Datapoint]]:
-        if not datapoints:
-            return [], []
-
-        effective_asset_type = datapoints[0]._get_effective_asset_type()
-
-        logger.debug(f"Config for datapoint upload: {rapidata_config}")
-
-        if issubclass(effective_asset_type, MediaAsset):
-            return self._add_media_from_paths(
-                datapoints,
-            )
-        elif issubclass(effective_asset_type, TextAsset):
-            return self._add_texts(datapoints)
-        else:
-            raise ValueError(f"Unsupported asset type: {effective_asset_type}")
-
-    def _add_texts(
-        self, datapoints: list[Datapoint]
-    ) -> tuple[list[Datapoint], list[Datapoint]]:
-
-        def upload_text_datapoint(datapoint: Datapoint, index: int) -> Datapoint:
-            model = datapoint.create_text_upload_model(index)
-
-            self.openapi_service.dataset_api.dataset_dataset_id_datapoints_texts_post(
-                dataset_id=self.id, create_datapoint_from_text_sources_model=model
-            )
-            return datapoint
-
-        def upload_with_context(
-            context: otel_context.Context, datapoint: Datapoint, index: int
-        ) -> Datapoint:
-            """Wrapper function that runs upload_text_datapoint with the provided context."""
-            token = otel_context.attach(context)
-            try:
-                return upload_text_datapoint(datapoint, index)
-            finally:
-                otel_context.detach(token)
-
-        successful_uploads: list[Datapoint] = []
-        failed_uploads: list[Datapoint] = []
-
-        # Capture the current OpenTelemetry context before creating threads
-        current_context = otel_context.get_current()
-
-        total_uploads = len(datapoints)
-        with ThreadPoolExecutor(
-            max_workers=rapidata_config.upload.maxWorkers
-        ) as executor:
-            future_to_datapoint = {
-                executor.submit(
-                    upload_with_context, current_context, datapoint, i
-                ): datapoint
-                for i, datapoint in enumerate(datapoints)
-            }
-
-            with tqdm(
-                total=total_uploads,
-                desc="Uploading text datapoints",
-                disable=rapidata_config.logging.silent_mode,
-            ) as pbar:
-                for future in as_completed(future_to_datapoint.keys()):
-                    datapoint = future_to_datapoint[future]
-                    try:
-                        result = future.result()
-                        pbar.update(1)
-                        successful_uploads.append(result)
-                    except Exception as e:
-                        failed_uploads.append(datapoint)
-                        logger.error("Upload failed for %s: %s", datapoint, str(e))
-
-        return successful_uploads, failed_uploads
-
-    def _process_single_upload(
-        self,
-        datapoint: Datapoint,
-        index: int,
-    ) -> tuple[list[Datapoint], list[Datapoint]]:
-        """
-        Process single upload with retry logic and error tracking.
-
-        Args:
-            media_asset: MediaAsset or MultiAsset to upload
-            meta_list: Optional sequence of metadata for the asset
-            index: Sort index for the upload
-            max_retries: Maximum number of retry attempts (default: 3)
-
-        Returns:
-            tuple[list[Datapoint], list[Datapoint]]: Lists of successful and failed datapoints
-        """
-        logger.debug("Processing single upload for %s with index %s", datapoint, index)
-
-        local_successful: list[Datapoint] = []
-        local_failed: list[Datapoint] = []
-
-        metadata = datapoint.get_prepared_metadata()
-
-        local_paths = datapoint.get_local_file_paths()
-        urls = datapoint.get_urls()
-
-        last_exception = None
-        for attempt in range(rapidata_config.upload.maxRetries):
-            try:
-                with suppress_rapidata_error_logging():
-                    self.openapi_service.dataset_api.dataset_dataset_id_datapoints_post(
-                        dataset_id=self.id,
-                        file=local_paths,
-                        url=urls,
-                        metadata=metadata,
-                        sort_index=index,
-                    )
-
-                local_successful.append(datapoint)
-
-                return local_successful, local_failed
-
-            except Exception as e:
-                last_exception = e
-                if attempt < rapidata_config.upload.maxRetries - 1:
-                    # Exponential backoff: wait 1s, then 2s, then 4s
-                    retry_delay = 2**attempt
-                    time.sleep(retry_delay)
-                    logger.debug("Error: %s", str(last_exception))
-                    logger.debug(
-                        "Retrying %s of %s...",
-                        attempt + 1,
-                        rapidata_config.upload.maxRetries,
-                    )
-
-        # If we get here, all retries failed
-        local_failed.append(datapoint)
-        tqdm.write(
-            f"Upload failed for {datapoint} after {rapidata_config.upload.maxRetries} attempts. \nFinal error: \n{str(last_exception)}"
-        )
-
-        return local_successful, local_failed
-
-    def _get_progress_tracker(
-        self,
-        total_uploads: int,
-        stop_event: threading.Event,
-        progress_error_event: threading.Event,
-        progress_poll_interval: float,
-    ) -> threading.Thread:
-        """
-        Create and return a progress tracking thread that shows actual API progress.
-
-        Args:
-            total_uploads: Total number of uploads to track
-            initial_ready: Initial number of ready items
-            initial_progress: Initial progress state
-            stop_event: Event to signal thread to stop
-            progress_error_event: Event to signal an error in progress tracking
-            progress_poll_interval: Time between progress checks
-
-        Returns:
-            threading.Thread: The progress tracking thread
-        """
-
-        def progress_tracking_thread():
-            try:
-                # Initialize progress bar with 0 completions
-                with tqdm(
-                    total=total_uploads,
-                    desc="Uploading datapoints",
-                    disable=rapidata_config.logging.silent_mode,
-                ) as pbar:
-                    prev_ready = 0
-                    prev_failed = 0
-                    stall_count = 0
-                    last_progress_time = time.time()
-
-                    # We'll wait for all uploads to finish + some extra time
-                    # for the backend to fully process everything
-                    all_uploads_complete = threading.Event()
-
-                    while not stop_event.is_set() or not all_uploads_complete.is_set():
-                        try:
-                            current_progress = self.openapi_service.dataset_api.dataset_dataset_id_progress_get(
-                                self.id
-                            )
-
-                            # Calculate items completed since our initialization
-                            completed_ready = current_progress.ready
-                            completed_failed = current_progress.failed
-                            total_completed = completed_ready + completed_failed
-
-                            # Calculate newly completed items since our last check
-                            new_ready = current_progress.ready - prev_ready
-                            new_failed = current_progress.failed - prev_failed
-
-                            # Update progress bar position to show actual completed items
-                            # First reset to match the actual completed count
-                            pbar.n = total_completed
-                            pbar.refresh()
-
-                            if new_ready > 0 or new_failed > 0:
-                                # We saw progress
-                                stall_count = 0
-                                last_progress_time = time.time()
-                            else:
-                                stall_count += 1
-
-                            # Update our tracking variables
-                            prev_ready = current_progress.ready
-                            prev_failed = current_progress.failed or 0
-
-                            # Check if stop_event was set (all uploads submitted)
-                            if stop_event.is_set():
-                                elapsed_since_last_progress = (
-                                    time.time() - last_progress_time
-                                )
-
-                                # If we haven't seen progress for a while after all uploads were submitted
-                                if elapsed_since_last_progress > 5.0:
-                                    # If we're at 100%, we're done
-                                    if total_completed >= total_uploads:
-                                        all_uploads_complete.set()
-                                        break
-
-                                    # If we're not at 100% but it's been a while with no progress
-                                    if stall_count > 5:
-                                        # We've polled several times with no progress, assume we're done
-                                        logger.warning(
-                                            "\nProgress seems stalled at %s/%s.",
-                                            total_completed,
-                                            total_uploads,
-                                        )
-                                        break
-
-                        except Exception as e:
-                            logger.error("\nError checking progress: %s", str(e))
-                            stall_count += 1
-
-                            if stall_count > 10:  # Too many consecutive errors
-                                progress_error_event.set()
-                                break
-
-                        # Sleep before next poll
-                        time.sleep(progress_poll_interval)
-
-            except Exception as e:
-                logger.error("Progress tracking thread error: %s", str(e))
-                progress_error_event.set()
-
-        # Create and return the thread
-        progress_thread = threading.Thread(target=progress_tracking_thread)
-        progress_thread.daemon = True
-        return progress_thread
-
-    def _process_uploads_in_chunks(
-        self,
-        datapoints: list[Datapoint],
-        chunk_size: int,
-        stop_progress_tracking: threading.Event,
-        progress_tracking_error: threading.Event,
-    ) -> tuple[list[Datapoint], list[Datapoint]]:
-        """
-        Process uploads in chunks with a ThreadPoolExecutor.
-
-        Args:
-            media_paths: List of assets to upload
-            multi_metadata: Optional sequence of sequences of metadata
-            chunk_size: Number of items to process in each batch
-            stop_progress_tracking: Event to signal progress tracking to stop
-            progress_tracking_error: Event to detect progress tracking errors
-
-        Returns:
-            tuple[list[str], list[str]]: Lists of successful and failed uploads
-        """
-        successful_uploads: list[Datapoint] = []
-        failed_uploads: list[Datapoint] = []
-
-        def process_upload_with_context(
-            context: otel_context.Context, datapoint: Datapoint, index: int
-        ) -> tuple[list[Datapoint], list[Datapoint]]:
-            """Wrapper function that runs _process_single_upload with the provided context."""
-            token = otel_context.attach(context)
-            try:
-                return self._process_single_upload(datapoint, index)
-            finally:
-                otel_context.detach(token)
-
-        # Capture the current OpenTelemetry context before creating threads
-        current_context = otel_context.get_current()
-
-        try:
-            with ThreadPoolExecutor(
-                max_workers=rapidata_config.upload.maxWorkers
-            ) as executor:
-                # Process uploads in chunks to avoid overwhelming the system
-                for chunk_idx, chunk in enumerate(chunk_list(datapoints, chunk_size)):
-                    futures = [
-                        executor.submit(
-                            process_upload_with_context,
-                            current_context,
-                            datapoint,
-                            chunk_idx * chunk_size + i,
-                        )
-                        for i, datapoint in enumerate(chunk)
-                    ]
-
-                    # Wait for this chunk to complete before starting the next one
-                    for future in as_completed(futures):
-                        if progress_tracking_error.is_set():
-                            raise RuntimeError(
-                                "Progress tracking failed, aborting uploads"
-                            )
-
-                        try:
-                            chunk_successful, chunk_failed = future.result()
-                            successful_uploads.extend(chunk_successful)
-                            failed_uploads.extend(chunk_failed)
-                        except Exception as e:
-                            logger.error("Future execution failed: %s", str(e))
-        finally:
-            # Signal to the progress tracking thread that all uploads have been submitted
-            stop_progress_tracking.set()
-
-        return successful_uploads, failed_uploads
-
-    def _log_final_progress(
-        self,
-        total_uploads: int,
-        progress_poll_interval: float,
-        successful_uploads: list[Datapoint],
-        failed_uploads: list[Datapoint],
-    ) -> None:
-        """
-        Log the final progress of the upload operation.
-
-        Args:
-            total_uploads: Total number of uploads
-            initial_ready: Initial number of ready items
-            initial_progress: Initial progress state
-            progress_poll_interval: Time between progress checks
-            successful_uploads: List of successful uploads for fallback reporting
-            failed_uploads: List of failed uploads for fallback reporting
-        """
-        try:
-            # Get final progress
-            final_progress = (
-                self.openapi_service.dataset_api.dataset_dataset_id_progress_get(
-                    self.id
-                )
-            )
-            total_ready = final_progress.ready
-            total_failed = final_progress.failed
-
-            # Make sure we account for all uploads
-            if total_ready + total_failed < total_uploads:
-                # Try one more time after a longer wait
-                time.sleep(5 * progress_poll_interval)
-                final_progress = (
-                    self.openapi_service.dataset_api.dataset_dataset_id_progress_get(
-                        self.id
-                    )
-                )
-                total_ready = final_progress.ready
-                total_failed = final_progress.failed
-
-            success_rate = (
-                (total_ready / total_uploads * 100) if total_uploads > 0 else 0
-            )
-
-            logger.info(
-                "Upload complete: %s ready, %s failed (%s%% success rate)",
-                total_ready,
-                total_uploads - total_ready,
-                success_rate,
-            )
-        except Exception as e:
-            logger.error("Error getting final progress: %s", str(e))
-            logger.info(
-                "Upload summary from local tracking: %s succeeded, %s failed",
-                len(successful_uploads),
-                len(failed_uploads),
-            )
-
-            if failed_uploads:
-                logger.error("Failed uploads: %s", failed_uploads)
-
-    def _add_media_from_paths(
-        self,
-        datapoints: list[Datapoint],
-        chunk_size: int = 50,
-        progress_poll_interval: float = 0.5,
-    ) -> tuple[list[Datapoint], list[Datapoint]]:
-        """
-        Upload media paths in chunks with managed resources.
-
-        Args:
-            datapoints: List of Datapoint objects to upload
-            chunk_size: Number of items to process in each batch
-            progress_poll_interval: Time in seconds between progress checks
-        Returns:
-            tuple[list[Datapoint], list[Datapoint]]: Lists of successful and failed datapoints
-
-        Raises:
-            ValueError: If multi_metadata lengths don't match media_paths length
-        """
-
-        # Setup tracking variables
-        total_uploads = len(datapoints)
-
-        # Create thread control events
-        stop_progress_tracking = threading.Event()
-        progress_tracking_error = threading.Event()
-
-        # Create and start progress tracking thread
-        progress_thread = self._get_progress_tracker(
-            total_uploads,
-            stop_progress_tracking,
-            progress_tracking_error,
-            progress_poll_interval,
-        )
-        progress_thread.start()
-
-        # Process uploads in chunks
-        try:
-            successful_uploads, failed_uploads = self._process_uploads_in_chunks(
-                datapoints,
-                chunk_size,
-                stop_progress_tracking,
-                progress_tracking_error,
-            )
-        finally:
-            progress_thread.join(10)  # Add margin to the timeout for tqdm
-
-        # Log final progress
-        self._log_final_progress(
-            total_uploads, progress_poll_interval, successful_uploads, failed_uploads
-        )
-
-        return successful_uploads, failed_uploads
-
-    def __str__(self) -> str:
-        return f"RapidataDataset(id={self.id})"
-
-    def __repr__(self) -> str:
-        return self.__str__()
{rapidata-2.40.1.dist-info → rapidata-2.40.2.dist-info}/LICENSE
File without changes
{rapidata-2.40.1.dist-info → rapidata-2.40.2.dist-info}/WHEEL
File without changes