rapidata 2.41.2__py3-none-any.whl → 2.42.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of rapidata might be problematic. Click here for more details.
- rapidata/__init__.py +1 -5
- rapidata/api_client/__init__.py +14 -14
- rapidata/api_client/api/__init__.py +1 -0
- rapidata/api_client/api/asset_api.py +851 -0
- rapidata/api_client/api/benchmark_api.py +298 -0
- rapidata/api_client/api/customer_rapid_api.py +29 -43
- rapidata/api_client/api/dataset_api.py +163 -1143
- rapidata/api_client/api/participant_api.py +28 -74
- rapidata/api_client/api/validation_set_api.py +283 -0
- rapidata/api_client/models/__init__.py +13 -14
- rapidata/api_client/models/add_validation_rapid_model.py +3 -3
- rapidata/api_client/models/add_validation_rapid_new_model.py +152 -0
- rapidata/api_client/models/add_validation_rapid_new_model_asset.py +182 -0
- rapidata/api_client/models/compare_workflow_model.py +3 -3
- rapidata/api_client/models/create_datapoint_from_files_model.py +3 -3
- rapidata/api_client/models/create_datapoint_from_text_sources_model.py +3 -3
- rapidata/api_client/models/create_datapoint_from_urls_model.py +3 -3
- rapidata/api_client/models/create_datapoint_model.py +108 -0
- rapidata/api_client/models/create_datapoint_model_asset.py +182 -0
- rapidata/api_client/models/create_demographic_rapid_model.py +13 -2
- rapidata/api_client/models/create_demographic_rapid_model_asset.py +188 -0
- rapidata/api_client/models/create_demographic_rapid_model_new.py +119 -0
- rapidata/api_client/models/create_sample_model.py +8 -2
- rapidata/api_client/models/create_sample_model_asset.py +182 -0
- rapidata/api_client/models/create_sample_model_obsolete.py +87 -0
- rapidata/api_client/models/file_asset_input_file.py +8 -22
- rapidata/api_client/models/fork_benchmark_result.py +87 -0
- rapidata/api_client/models/form_file_wrapper.py +17 -2
- rapidata/api_client/models/get_asset_metadata_result.py +100 -0
- rapidata/api_client/models/multi_asset_input_assets_inner.py +10 -24
- rapidata/api_client/models/prompt_asset_metadata_input.py +3 -3
- rapidata/api_client/models/proxy_file_wrapper.py +17 -2
- rapidata/api_client/models/stream_file_wrapper.py +25 -3
- rapidata/api_client/models/submit_prompt_model.py +3 -3
- rapidata/api_client/models/text_metadata.py +6 -1
- rapidata/api_client/models/text_metadata_model.py +7 -2
- rapidata/api_client/models/upload_file_from_url_result.py +87 -0
- rapidata/api_client/models/upload_file_result.py +87 -0
- rapidata/api_client/models/zip_entry_file_wrapper.py +33 -2
- rapidata/api_client_README.md +28 -25
- rapidata/rapidata_client/__init__.py +0 -1
- rapidata/rapidata_client/benchmark/participant/_participant.py +24 -22
- rapidata/rapidata_client/benchmark/rapidata_benchmark.py +89 -102
- rapidata/rapidata_client/datapoints/__init__.py +0 -1
- rapidata/rapidata_client/datapoints/_asset_uploader.py +71 -0
- rapidata/rapidata_client/datapoints/_datapoint.py +58 -171
- rapidata/rapidata_client/datapoints/_datapoint_uploader.py +95 -0
- rapidata/rapidata_client/datapoints/assets/__init__.py +0 -11
- rapidata/rapidata_client/datapoints/metadata/_media_asset_metadata.py +10 -7
- rapidata/rapidata_client/demographic/demographic_manager.py +21 -8
- rapidata/rapidata_client/exceptions/failed_upload_exception.py +0 -62
- rapidata/rapidata_client/order/_rapidata_order_builder.py +0 -10
- rapidata/rapidata_client/order/dataset/_rapidata_dataset.py +67 -187
- rapidata/rapidata_client/order/rapidata_order_manager.py +58 -116
- rapidata/rapidata_client/settings/translation_behaviour.py +1 -1
- rapidata/rapidata_client/validation/rapidata_validation_set.py +9 -5
- rapidata/rapidata_client/validation/rapids/_validation_rapid_uploader.py +101 -0
- rapidata/rapidata_client/validation/rapids/box.py +35 -11
- rapidata/rapidata_client/validation/rapids/rapids.py +26 -128
- rapidata/rapidata_client/validation/rapids/rapids_manager.py +123 -104
- rapidata/rapidata_client/validation/validation_set_manager.py +25 -34
- rapidata/rapidata_client/workflow/_ranking_workflow.py +14 -17
- rapidata/rapidata_client/workflow/_select_words_workflow.py +3 -16
- rapidata/service/openapi_service.py +8 -3
- {rapidata-2.41.2.dist-info → rapidata-2.42.0.dist-info}/METADATA +1 -1
- {rapidata-2.41.2.dist-info → rapidata-2.42.0.dist-info}/RECORD +68 -59
- {rapidata-2.41.2.dist-info → rapidata-2.42.0.dist-info}/WHEEL +1 -1
- rapidata/rapidata_client/datapoints/assets/_base_asset.py +0 -13
- rapidata/rapidata_client/datapoints/assets/_media_asset.py +0 -318
- rapidata/rapidata_client/datapoints/assets/_multi_asset.py +0 -61
- rapidata/rapidata_client/datapoints/assets/_sessions.py +0 -40
- rapidata/rapidata_client/datapoints/assets/_text_asset.py +0 -34
- rapidata/rapidata_client/datapoints/assets/data_type_enum.py +0 -8
- rapidata/rapidata_client/order/dataset/_progress_tracker.py +0 -100
- {rapidata-2.41.2.dist-info → rapidata-2.42.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
from rapidata.api_client.models.text_asset_input import TextAssetInput
|
|
2
|
+
from rapidata.rapidata_client.datapoints._datapoint import Datapoint
|
|
3
|
+
from rapidata.service.openapi_service import OpenAPIService
|
|
4
|
+
from rapidata.api_client.models.multi_asset_input_assets_inner import (
|
|
5
|
+
MultiAssetInput,
|
|
6
|
+
MultiAssetInputAssetsInner,
|
|
7
|
+
)
|
|
8
|
+
from rapidata.api_client.models.create_datapoint_model import CreateDatapointModel
|
|
9
|
+
from rapidata.api_client.models.create_datapoint_model_asset import (
|
|
10
|
+
CreateDatapointModelAsset,
|
|
11
|
+
)
|
|
12
|
+
from rapidata.api_client.models.create_datapoint_result import CreateDatapointResult
|
|
13
|
+
from rapidata.api_client.models.create_datapoint_from_files_model_metadata_inner import (
|
|
14
|
+
CreateDatapointFromFilesModelMetadataInner,
|
|
15
|
+
)
|
|
16
|
+
from rapidata.api_client.models.existing_asset_input import ExistingAssetInput
|
|
17
|
+
from rapidata.rapidata_client.datapoints._asset_uploader import AssetUploader
|
|
18
|
+
from rapidata.rapidata_client.datapoints.metadata import (
|
|
19
|
+
PromptMetadata,
|
|
20
|
+
MediaAssetMetadata,
|
|
21
|
+
PrivateTextMetadata,
|
|
22
|
+
SelectWordsMetadata,
|
|
23
|
+
Metadata,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class DatapointUploader:
    """Create a single datapoint in a dataset through the OpenAPI service.

    The uploader turns a ``Datapoint`` into a ``CreateDatapointModel``:
    the datapoint's asset is converted by the ``AssetUploader`` and the
    optional context / sentence / media context / private note fields are
    converted into metadata entries.
    """

    def __init__(self, openapi_service: OpenAPIService):
        """Store the service and build the ``AssetUploader`` that shares it."""
        self.openapi_service = openapi_service
        self.asset_uploader = AssetUploader(openapi_service)

    def upload_datapoint(
        self, datapoint: Datapoint, dataset_id: str, index: int
    ) -> CreateDatapointResult:
        """Create ``datapoint`` in the dataset identified by ``dataset_id``.

        Args:
            datapoint: The datapoint to create.
            dataset_id: Id of the dataset the datapoint is posted to.
            index: Value sent as the datapoint's ``sortIndex``.

        Returns:
            The result of the dataset API's datapoint POST call.
        """
        # Metadata is prepared first; this may already upload the media context.
        prepared_metadata = self._get_metadata(datapoint)

        # Anything that is not tagged "media" is handled as text.
        if datapoint.data_type == "media":
            asset_input = self._handle_media_datapoint(datapoint)
        else:
            asset_input = self._handle_text_datapoint(datapoint)

        request_model = CreateDatapointModel(
            asset=asset_input,
            metadata=prepared_metadata,
            sortIndex=index,
        )
        return self.openapi_service.dataset_api.dataset_dataset_id_datapoint_post(
            dataset_id=dataset_id,
            create_datapoint_model=request_model,
        )

    def _get_metadata(
        self, datapoint: Datapoint
    ) -> list[CreateDatapointFromFilesModelMetadataInner]:
        """Build the API metadata entries for the optional datapoint fields.

        Only truthy fields contribute an entry. The media context is uploaded
        through the asset uploader and referenced by the value it returns.
        """
        collected: list[Metadata] = []

        if datapoint.context:
            collected.append(PromptMetadata(prompt=datapoint.context))

        if datapoint.sentence:
            collected.append(SelectWordsMetadata(select_words=datapoint.sentence))

        if datapoint.media_context:
            uploaded_name = self.asset_uploader.upload_asset(datapoint.media_context)
            collected.append(MediaAssetMetadata(internal_file_name=uploaded_name))

        if datapoint.private_note:
            collected.append(PrivateTextMetadata(text=datapoint.private_note))

        # Wrap every entry's model in the union type the request model expects.
        return [
            CreateDatapointFromFilesModelMetadataInner(actual_instance=entry.to_model())
            for entry in collected
        ]

    def _handle_text_datapoint(self, datapoint: Datapoint) -> CreateDatapointModelAsset:
        """Wrap the asset uploader's text input for ``datapoint.asset``."""
        text_input = self.asset_uploader.get_uploaded_text_input(datapoint.asset)
        return CreateDatapointModelAsset(actual_instance=text_input)

    def _handle_media_datapoint(
        self, datapoint: Datapoint
    ) -> CreateDatapointModelAsset:
        """Wrap the asset uploader's asset input for ``datapoint.asset``."""
        media_input = self.asset_uploader.get_uploaded_asset_input(datapoint.asset)
        return CreateDatapointModelAsset(actual_instance=media_input)
|
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
"""Assets Package
|
|
2
|
-
|
|
3
|
-
This package provides classes for different types of assets, including MediaAsset, TextAsset, and MultiAsset.
|
|
4
|
-
"""
|
|
5
|
-
|
|
6
|
-
from ._base_asset import BaseAsset
|
|
7
|
-
from ._media_asset import MediaAsset
|
|
8
|
-
from ._text_asset import TextAsset
|
|
9
|
-
from ._multi_asset import MultiAsset
|
|
10
|
-
from .data_type_enum import RapidataDataTypes
|
|
11
|
-
from ._sessions import SessionManager
|
|
@@ -1,23 +1,26 @@
|
|
|
1
1
|
from rapidata.api_client.models.prompt_asset_metadata_input import (
|
|
2
2
|
PromptAssetMetadataInput,
|
|
3
3
|
)
|
|
4
|
-
from rapidata.api_client.models.url_asset_input import UrlAssetInput
|
|
5
4
|
from rapidata.rapidata_client.datapoints.metadata._base_metadata import Metadata
|
|
6
|
-
from rapidata.api_client.models.
|
|
7
|
-
|
|
5
|
+
from rapidata.api_client.models.multi_asset_input_assets_inner import (
|
|
6
|
+
ExistingAssetInput,
|
|
7
|
+
MultiAssetInputAssetsInner,
|
|
8
8
|
)
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
class MediaAssetMetadata(Metadata):
    """Metadata entry that points at an asset by its internal file name."""

    def __init__(self, internal_file_name: str):
        """Remember the internal file name used when building the model."""
        super().__init__()
        self._internal_file_name = internal_file_name

    def to_model(self):
        """Return a ``PromptAssetMetadataInput`` referencing the stored name.

        The name is sent as an ``ExistingAssetInput`` wrapped in a
        ``MultiAssetInputAssetsInner``.
        """
        existing_asset = ExistingAssetInput(
            _t="ExistingAssetInput",
            name=self._internal_file_name,
        )
        wrapped_asset = MultiAssetInputAssetsInner(actual_instance=existing_asset)
        return PromptAssetMetadataInput(
            _t="PromptAssetMetadataInput",
            asset=wrapped_asset,
        )
|
|
@@ -1,34 +1,47 @@
|
|
|
1
|
-
from
|
|
2
|
-
from rapidata.
|
|
3
|
-
from rapidata.api_client.models.
|
|
4
|
-
|
|
1
|
+
from argparse import Action
|
|
2
|
+
from rapidata.api_client import ExistingAssetInput
|
|
3
|
+
from rapidata.api_client.models.create_demographic_rapid_model_asset import (
|
|
4
|
+
CreateDemographicRapidModelAsset,
|
|
5
5
|
)
|
|
6
|
+
from rapidata.service.openapi_service import OpenAPIService
|
|
6
7
|
from rapidata.api_client.models.classify_payload import ClassifyPayload
|
|
7
8
|
from rapidata.rapidata_client.config import logger
|
|
9
|
+
from rapidata.api_client.models.create_demographic_rapid_model_new import (
|
|
10
|
+
CreateDemographicRapidModelNew,
|
|
11
|
+
)
|
|
12
|
+
from rapidata.rapidata_client.datapoints._asset_uploader import AssetUploader
|
|
8
13
|
|
|
9
14
|
|
|
10
15
|
class DemographicManager:
    """Create demographic rapids through the Rapidata API."""

    def __init__(self, openapi_service: OpenAPIService):
        """Keep the service and create an ``AssetUploader`` that shares it."""
        self._openapi_service = openapi_service
        self._asset_uploader = AssetUploader(openapi_service)
        logger.debug("DemographicManager initialized")

    def create_demographic_rapid(
        self, instruction: str, answer_options: list[str], datapoint: str, key: str
    ):
        """Create a classification-style demographic rapid.

        Args:
            instruction: Sent as the title of the classify payload.
            answer_options: Sent as the payload's possible categories.
            datapoint: Passed to the asset uploader; the value it returns is
                used as the name of the rapid's existing-asset input.
            key: Sent as the ``key`` of the request model.

        Returns:
            The ``rapid_id`` of the result returned by the API call.
        """
        # The payload is built before the upload, so an invalid payload
        # fails before anything is uploaded.
        classify_payload = ClassifyPayload(
            _t="ClassifyPayload",
            possibleCategories=answer_options,
            title=instruction,
        )

        uploaded_name = self._asset_uploader.upload_asset(datapoint)
        existing_asset = ExistingAssetInput(
            _t="ExistingAssetInput",
            name=uploaded_name,
        )

        model = CreateDemographicRapidModelNew(
            key=key,
            payload=classify_payload,
            asset=CreateDemographicRapidModelAsset(actual_instance=existing_asset),
        )

        result = self._openapi_service.rapid_api.rapid_demographic_new_post(
            create_demographic_rapid_model_new=model
        )
        logger.info(f"Demographic Rapid created: {result.rapid_id}")
        return result.rapid_id

    def __str__(self) -> str:
        """Return the fixed display name of the manager."""
        return "DemographicManager"
|
|
@@ -1,14 +1,3 @@
|
|
|
1
|
-
from typing import cast
|
|
2
|
-
from rapidata.api_client.models.file_asset_model import FileAssetModel
|
|
3
|
-
from rapidata.api_client.models.get_failed_datapoints_result import (
|
|
4
|
-
GetFailedDatapointsResult,
|
|
5
|
-
)
|
|
6
|
-
from rapidata.api_client.models.multi_asset_model import MultiAssetModel
|
|
7
|
-
from rapidata.api_client.models.original_filename_metadata_model import (
|
|
8
|
-
OriginalFilenameMetadataModel,
|
|
9
|
-
)
|
|
10
|
-
from rapidata.api_client.models.source_url_metadata_model import SourceUrlMetadataModel
|
|
11
|
-
from rapidata.rapidata_client.datapoints.assets import MediaAsset, MultiAsset
|
|
12
1
|
from rapidata.rapidata_client.datapoints._datapoint import Datapoint
|
|
13
2
|
from rapidata.rapidata_client.order.dataset._rapidata_dataset import RapidataDataset
|
|
14
3
|
from rapidata.rapidata_client.order.rapidata_order import RapidataOrder
|
|
@@ -29,54 +18,3 @@ class FailedUploadException(Exception):
|
|
|
29
18
|
|
|
30
19
|
def __str__(self) -> str:
|
|
31
20
|
return f"Failed to upload {self.failed_uploads}"
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
def _parse_failed_uploads(failed_uploads: GetFailedDatapointsResult) -> list[Datapoint]:
|
|
35
|
-
failed_datapoints = failed_uploads.datapoints
|
|
36
|
-
if not failed_datapoints:
|
|
37
|
-
return []
|
|
38
|
-
if isinstance(failed_datapoints[0].asset.actual_instance, FileAssetModel):
|
|
39
|
-
failed_assets = [
|
|
40
|
-
MediaAsset(
|
|
41
|
-
__get_asset_name(cast(FileAssetModel, datapoint.asset.actual_instance))
|
|
42
|
-
)
|
|
43
|
-
for datapoint in failed_datapoints
|
|
44
|
-
]
|
|
45
|
-
elif isinstance(failed_datapoints[0].asset.actual_instance, MultiAssetModel):
|
|
46
|
-
failed_assets = []
|
|
47
|
-
backend_assets = [
|
|
48
|
-
cast(MultiAssetModel, failed_upload.asset.actual_instance).assets
|
|
49
|
-
for failed_upload in failed_datapoints
|
|
50
|
-
]
|
|
51
|
-
for assets in backend_assets:
|
|
52
|
-
failed_assets.append(
|
|
53
|
-
MultiAsset(
|
|
54
|
-
[
|
|
55
|
-
MediaAsset(
|
|
56
|
-
__get_asset_name(
|
|
57
|
-
cast(FileAssetModel, asset.actual_instance)
|
|
58
|
-
)
|
|
59
|
-
)
|
|
60
|
-
for asset in assets
|
|
61
|
-
if isinstance(asset.actual_instance, FileAssetModel)
|
|
62
|
-
]
|
|
63
|
-
)
|
|
64
|
-
)
|
|
65
|
-
else:
|
|
66
|
-
raise ValueError(
|
|
67
|
-
f"Unsupported asset type: {type(failed_datapoints[0].asset.actual_instance)}"
|
|
68
|
-
)
|
|
69
|
-
|
|
70
|
-
return [Datapoint(asset=asset) for asset in failed_assets]
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
def __get_asset_name(failed_datapoint: FileAssetModel) -> str:
|
|
74
|
-
metadata = failed_datapoint.metadata
|
|
75
|
-
if "sourceUrl" in metadata:
|
|
76
|
-
return cast(SourceUrlMetadataModel, metadata["sourceUrl"].actual_instance).url
|
|
77
|
-
elif "originalFilename" in metadata:
|
|
78
|
-
return cast(
|
|
79
|
-
OriginalFilenameMetadataModel, metadata["originalFilename"].actual_instance
|
|
80
|
-
).original_filename
|
|
81
|
-
else:
|
|
82
|
-
return ""
|
|
@@ -20,7 +20,6 @@ from rapidata.api_client.models.sticky_state import StickyState
|
|
|
20
20
|
from rapidata.rapidata_client.datapoints._datapoint import Datapoint
|
|
21
21
|
from rapidata.rapidata_client.exceptions.failed_upload_exception import (
|
|
22
22
|
FailedUploadException,
|
|
23
|
-
_parse_failed_uploads,
|
|
24
23
|
)
|
|
25
24
|
from rapidata.rapidata_client.filter import RapidataFilter
|
|
26
25
|
from rapidata.rapidata_client.config import (
|
|
@@ -280,15 +279,6 @@ class RapidataOrderBuilder:
|
|
|
280
279
|
try:
|
|
281
280
|
self.__openapi_service.order_api.order_order_id_preview_post(self.order_id)
|
|
282
281
|
except Exception:
|
|
283
|
-
failed_uploads = _parse_failed_uploads(
|
|
284
|
-
self.__openapi_service.dataset_api.dataset_dataset_id_datapoints_failed_get(
|
|
285
|
-
self.__dataset.id
|
|
286
|
-
)
|
|
287
|
-
)
|
|
288
|
-
logger.error(
|
|
289
|
-
"Internal download error for datapoints: %s\nWARNING: Failed Datapoints in error do not contain metadata.",
|
|
290
|
-
failed_uploads,
|
|
291
|
-
)
|
|
292
282
|
raise FailedUploadException(self.__dataset, order, failed_uploads)
|
|
293
283
|
return order
|
|
294
284
|
|
|
@@ -1,6 +1,4 @@
|
|
|
1
1
|
from rapidata.rapidata_client.datapoints._datapoint import Datapoint
|
|
2
|
-
from rapidata.rapidata_client.datapoints.assets import TextAsset, MediaAsset
|
|
3
|
-
from rapidata.service import LocalFileService
|
|
4
2
|
from rapidata.service.openapi_service import OpenAPIService
|
|
5
3
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
6
4
|
from tqdm import tqdm
|
|
@@ -8,12 +6,11 @@ from tqdm import tqdm
|
|
|
8
6
|
from typing import Generator
|
|
9
7
|
from rapidata.rapidata_client.config import logger
|
|
10
8
|
import time
|
|
11
|
-
import threading
|
|
12
9
|
from rapidata.rapidata_client.api.rapidata_api_client import (
|
|
13
10
|
suppress_rapidata_error_logging,
|
|
14
11
|
)
|
|
15
12
|
from rapidata.rapidata_client.config.rapidata_config import rapidata_config
|
|
16
|
-
from rapidata.rapidata_client.
|
|
13
|
+
from rapidata.rapidata_client.datapoints._datapoint_uploader import DatapointUploader
|
|
17
14
|
|
|
18
15
|
# Add OpenTelemetry context imports for thread propagation
|
|
19
16
|
from opentelemetry import context as otel_context
|
|
@@ -28,81 +25,80 @@ class RapidataDataset:
|
|
|
28
25
|
def __init__(self, dataset_id: str, openapi_service: OpenAPIService):
    """Bind the dataset id and service, and create the datapoint uploader.

    Args:
        dataset_id: Stored as ``self.id``.
        openapi_service: Stored on the instance and handed to the
            ``DatapointUploader`` created here.
    """
    self.id = dataset_id
    self.openapi_service = openapi_service
    # One uploader per dataset object; it shares the same service.
    self.datapoint_uploader = DatapointUploader(openapi_service)
|
|
32
29
|
|
|
33
30
|
def add_datapoints(
    self,
    datapoints: list[Datapoint],
) -> tuple[list[Datapoint], list[Datapoint]]:
    """
    Upload datapoints in chunks with a ThreadPoolExecutor.

    The worker count and chunk size are read from
    ``rapidata_config.upload``. Each chunk is fully processed before the
    next one is submitted, and a progress bar is advanced as datapoints
    finish (disabled when ``rapidata_config.logging.silent_mode`` is set).

    Args:
        datapoints: Datapoints to upload. Each one is uploaded with an index
            derived from its chunk number and its offset in that chunk
            (presumably its position in this list; depends on
            ``chunk_list`` yielding consecutive full-size chunks).

    Returns:
        tuple[list[Datapoint], list[Datapoint]]: The successfully uploaded
        datapoints and the datapoints whose upload failed.
    """
    successful_uploads: list[Datapoint] = []
    failed_uploads: list[Datapoint] = []

    def process_upload_with_context(
        context: otel_context.Context, datapoint: Datapoint, index: int
    ) -> tuple[list[Datapoint], list[Datapoint]]:
        """Wrapper function that runs _process_single_upload with the provided context."""
        token = otel_context.attach(context)
        try:
            return self._process_single_upload(datapoint, index)
        finally:
            otel_context.detach(token)

    # Capture the current OpenTelemetry context before creating threads
    current_context = otel_context.get_current()

    # Read once so chunking and index computation use the same value.
    chunk_size = rapidata_config.upload.chunkSize

    with tqdm(
        total=len(datapoints),
        desc="Uploading datapoints",
        disable=rapidata_config.logging.silent_mode,
    ) as progress_bar:
        with ThreadPoolExecutor(
            max_workers=rapidata_config.upload.maxWorkers
        ) as executor:
            # Process uploads in chunks to avoid overwhelming the system
            for chunk_idx, chunk in enumerate(chunk_list(datapoints, chunk_size)):
                # Remember which datapoint each future belongs to so that a
                # crashed future can still be reported as a failed upload.
                future_to_datapoint = {
                    executor.submit(
                        process_upload_with_context,
                        current_context,
                        datapoint,
                        chunk_idx * chunk_size + offset,
                    ): datapoint
                    for offset, datapoint in enumerate(chunk)
                }

                # Wait for this chunk to complete before starting the next one
                for future in as_completed(future_to_datapoint):
                    try:
                        chunk_successful, chunk_failed = future.result()
                    except Exception as e:
                        # Without this the datapoint would appear in neither
                        # result list and the progress bar would stall.
                        failed_uploads.append(future_to_datapoint[future])
                        progress_bar.update(1)
                        logger.error("Future execution failed: %s", str(e))
                    else:
                        successful_uploads.extend(chunk_successful)
                        failed_uploads.extend(chunk_failed)
                        progress_bar.update(
                            len(chunk_successful) + len(chunk_failed)
                        )

    if failed_uploads:
        logger.error(
            "Upload failed for %s datapoints: %s",
            len(failed_uploads),
            failed_uploads,
        )

    return successful_uploads, failed_uploads
|
|
108
104
|
|
|
@@ -128,21 +124,14 @@ class RapidataDataset:
|
|
|
128
124
|
local_successful: list[Datapoint] = []
|
|
129
125
|
local_failed: list[Datapoint] = []
|
|
130
126
|
|
|
131
|
-
metadata = datapoint.get_prepared_metadata()
|
|
132
|
-
|
|
133
|
-
local_paths = datapoint.get_local_file_paths()
|
|
134
|
-
urls = datapoint.get_urls()
|
|
135
|
-
|
|
136
127
|
last_exception = None
|
|
137
128
|
for attempt in range(rapidata_config.upload.maxRetries):
|
|
138
129
|
try:
|
|
139
130
|
with suppress_rapidata_error_logging():
|
|
140
|
-
self.
|
|
131
|
+
self.datapoint_uploader.upload_datapoint(
|
|
141
132
|
dataset_id=self.id,
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
metadata=metadata,
|
|
145
|
-
sort_index=index,
|
|
133
|
+
datapoint=datapoint,
|
|
134
|
+
index=index,
|
|
146
135
|
)
|
|
147
136
|
|
|
148
137
|
local_successful.append(datapoint)
|
|
@@ -170,115 +159,6 @@ class RapidataDataset:
|
|
|
170
159
|
|
|
171
160
|
return local_successful, local_failed
|
|
172
161
|
|
|
173
|
-
def _process_uploads_in_chunks(
|
|
174
|
-
self,
|
|
175
|
-
datapoints: list[Datapoint],
|
|
176
|
-
) -> tuple[list[Datapoint], list[Datapoint]]:
|
|
177
|
-
"""
|
|
178
|
-
Process uploads in chunks with a ThreadPoolExecutor.
|
|
179
|
-
|
|
180
|
-
Args:
|
|
181
|
-
media_paths: List of assets to upload
|
|
182
|
-
multi_metadata: Optional sequence of sequences of metadata
|
|
183
|
-
chunk_size: Number of items to process in each batch
|
|
184
|
-
|
|
185
|
-
Returns:
|
|
186
|
-
tuple[list[str], list[str]]: Lists of successful and failed uploads
|
|
187
|
-
"""
|
|
188
|
-
successful_uploads: list[Datapoint] = []
|
|
189
|
-
failed_uploads: list[Datapoint] = []
|
|
190
|
-
|
|
191
|
-
def process_upload_with_context(
|
|
192
|
-
context: otel_context.Context, datapoint: Datapoint, index: int
|
|
193
|
-
) -> tuple[list[Datapoint], list[Datapoint]]:
|
|
194
|
-
"""Wrapper function that runs _process_single_upload with the provided context."""
|
|
195
|
-
token = otel_context.attach(context)
|
|
196
|
-
try:
|
|
197
|
-
return self._process_single_upload(datapoint, index)
|
|
198
|
-
finally:
|
|
199
|
-
otel_context.detach(token)
|
|
200
|
-
|
|
201
|
-
# Capture the current OpenTelemetry context before creating threads
|
|
202
|
-
current_context = otel_context.get_current()
|
|
203
|
-
|
|
204
|
-
with ThreadPoolExecutor(
|
|
205
|
-
max_workers=rapidata_config.upload.maxWorkers
|
|
206
|
-
) as executor:
|
|
207
|
-
# Process uploads in chunks to avoid overwhelming the system
|
|
208
|
-
for chunk_idx, chunk in enumerate(
|
|
209
|
-
chunk_list(datapoints, rapidata_config.upload.chunkSize)
|
|
210
|
-
):
|
|
211
|
-
futures = [
|
|
212
|
-
executor.submit(
|
|
213
|
-
process_upload_with_context,
|
|
214
|
-
current_context,
|
|
215
|
-
datapoint,
|
|
216
|
-
chunk_idx * rapidata_config.upload.chunkSize + i,
|
|
217
|
-
)
|
|
218
|
-
for i, datapoint in enumerate(chunk)
|
|
219
|
-
]
|
|
220
|
-
|
|
221
|
-
# Wait for this chunk to complete before starting the next one
|
|
222
|
-
for future in as_completed(futures):
|
|
223
|
-
try:
|
|
224
|
-
chunk_successful, chunk_failed = future.result()
|
|
225
|
-
successful_uploads.extend(chunk_successful)
|
|
226
|
-
failed_uploads.extend(chunk_failed)
|
|
227
|
-
except Exception as e:
|
|
228
|
-
logger.error("Future execution failed: %s", str(e))
|
|
229
|
-
|
|
230
|
-
return successful_uploads, failed_uploads
|
|
231
|
-
|
|
232
|
-
def _add_media_from_paths(
|
|
233
|
-
self,
|
|
234
|
-
datapoints: list[Datapoint],
|
|
235
|
-
progress_poll_interval: float = 0.5,
|
|
236
|
-
) -> tuple[list[Datapoint], list[Datapoint]]:
|
|
237
|
-
"""
|
|
238
|
-
Upload media paths in chunks with managed resources.
|
|
239
|
-
|
|
240
|
-
Args:
|
|
241
|
-
datapoints: List of Datapoint objects to upload
|
|
242
|
-
chunk_size: Number of items to process in each batch
|
|
243
|
-
progress_poll_interval: Time in seconds between progress checks
|
|
244
|
-
Returns:
|
|
245
|
-
tuple[list[Datapoint], list[Datapoint]]: Lists of successful and failed datapoints
|
|
246
|
-
|
|
247
|
-
Raises:
|
|
248
|
-
ValueError: If multi_metadata lengths don't match media_paths length
|
|
249
|
-
"""
|
|
250
|
-
|
|
251
|
-
# Setup tracking variables
|
|
252
|
-
total_uploads = len(datapoints)
|
|
253
|
-
|
|
254
|
-
# Create and start progress tracking thread
|
|
255
|
-
progress_tracker = ProgressTracker(
|
|
256
|
-
dataset_id=self.id,
|
|
257
|
-
openapi_service=self.openapi_service,
|
|
258
|
-
total_uploads=total_uploads,
|
|
259
|
-
progress_poll_interval=progress_poll_interval,
|
|
260
|
-
)
|
|
261
|
-
progress_thread = progress_tracker.create_thread()
|
|
262
|
-
progress_thread.start()
|
|
263
|
-
|
|
264
|
-
# Process uploads in chunks
|
|
265
|
-
try:
|
|
266
|
-
successful_uploads, failed_uploads = self._process_uploads_in_chunks(
|
|
267
|
-
datapoints,
|
|
268
|
-
)
|
|
269
|
-
finally:
|
|
270
|
-
progress_tracker.complete()
|
|
271
|
-
progress_thread.join(10)
|
|
272
|
-
|
|
273
|
-
if failed_uploads:
|
|
274
|
-
logger.error(
|
|
275
|
-
"Upload failed for %s datapoints: %s",
|
|
276
|
-
len(failed_uploads),
|
|
277
|
-
failed_uploads,
|
|
278
|
-
)
|
|
279
|
-
|
|
280
|
-
return successful_uploads, failed_uploads
|
|
281
|
-
|
|
282
162
|
def __str__(self) -> str:
    """Return a short description of the dataset that includes its id."""
    return f"RapidataDataset(id={self.id})"
|
|
284
164
|
|