rapidata-2.41.3-py3-none-any.whl → rapidata-2.42.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rapidata might be problematic. More details are available on the original diff page.

Files changed (74)
  1. rapidata/__init__.py +1 -5
  2. rapidata/api_client/__init__.py +14 -14
  3. rapidata/api_client/api/__init__.py +1 -0
  4. rapidata/api_client/api/asset_api.py +851 -0
  5. rapidata/api_client/api/benchmark_api.py +298 -0
  6. rapidata/api_client/api/customer_rapid_api.py +29 -43
  7. rapidata/api_client/api/dataset_api.py +163 -1143
  8. rapidata/api_client/api/participant_api.py +28 -74
  9. rapidata/api_client/api/validation_set_api.py +283 -0
  10. rapidata/api_client/models/__init__.py +13 -14
  11. rapidata/api_client/models/add_validation_rapid_model.py +3 -3
  12. rapidata/api_client/models/add_validation_rapid_new_model.py +152 -0
  13. rapidata/api_client/models/add_validation_rapid_new_model_asset.py +182 -0
  14. rapidata/api_client/models/compare_workflow_model.py +3 -3
  15. rapidata/api_client/models/create_datapoint_from_files_model.py +3 -3
  16. rapidata/api_client/models/create_datapoint_from_text_sources_model.py +3 -3
  17. rapidata/api_client/models/create_datapoint_from_urls_model.py +3 -3
  18. rapidata/api_client/models/create_datapoint_model.py +108 -0
  19. rapidata/api_client/models/create_datapoint_model_asset.py +182 -0
  20. rapidata/api_client/models/create_demographic_rapid_model.py +13 -2
  21. rapidata/api_client/models/create_demographic_rapid_model_asset.py +188 -0
  22. rapidata/api_client/models/create_demographic_rapid_model_new.py +119 -0
  23. rapidata/api_client/models/create_sample_model.py +8 -2
  24. rapidata/api_client/models/create_sample_model_asset.py +182 -0
  25. rapidata/api_client/models/create_sample_model_obsolete.py +87 -0
  26. rapidata/api_client/models/file_asset_input_file.py +8 -22
  27. rapidata/api_client/models/fork_benchmark_result.py +87 -0
  28. rapidata/api_client/models/form_file_wrapper.py +17 -2
  29. rapidata/api_client/models/get_asset_metadata_result.py +100 -0
  30. rapidata/api_client/models/multi_asset_input_assets_inner.py +10 -24
  31. rapidata/api_client/models/prompt_asset_metadata_input.py +3 -3
  32. rapidata/api_client/models/proxy_file_wrapper.py +17 -2
  33. rapidata/api_client/models/stream_file_wrapper.py +25 -3
  34. rapidata/api_client/models/submit_prompt_model.py +3 -3
  35. rapidata/api_client/models/text_metadata.py +6 -1
  36. rapidata/api_client/models/text_metadata_model.py +7 -2
  37. rapidata/api_client/models/upload_file_from_url_result.py +87 -0
  38. rapidata/api_client/models/upload_file_result.py +87 -0
  39. rapidata/api_client/models/zip_entry_file_wrapper.py +33 -2
  40. rapidata/api_client_README.md +28 -25
  41. rapidata/rapidata_client/__init__.py +0 -1
  42. rapidata/rapidata_client/benchmark/participant/_participant.py +25 -24
  43. rapidata/rapidata_client/benchmark/rapidata_benchmark.py +89 -102
  44. rapidata/rapidata_client/datapoints/__init__.py +0 -1
  45. rapidata/rapidata_client/datapoints/_asset_uploader.py +71 -0
  46. rapidata/rapidata_client/datapoints/_datapoint.py +58 -171
  47. rapidata/rapidata_client/datapoints/_datapoint_uploader.py +95 -0
  48. rapidata/rapidata_client/datapoints/assets/__init__.py +0 -11
  49. rapidata/rapidata_client/datapoints/metadata/_media_asset_metadata.py +10 -7
  50. rapidata/rapidata_client/demographic/demographic_manager.py +21 -8
  51. rapidata/rapidata_client/exceptions/failed_upload_exception.py +0 -62
  52. rapidata/rapidata_client/order/_rapidata_order_builder.py +0 -10
  53. rapidata/rapidata_client/order/dataset/_rapidata_dataset.py +65 -187
  54. rapidata/rapidata_client/order/rapidata_order_manager.py +62 -124
  55. rapidata/rapidata_client/validation/rapidata_validation_set.py +9 -5
  56. rapidata/rapidata_client/validation/rapids/_validation_rapid_uploader.py +101 -0
  57. rapidata/rapidata_client/validation/rapids/box.py +35 -11
  58. rapidata/rapidata_client/validation/rapids/rapids.py +26 -128
  59. rapidata/rapidata_client/validation/rapids/rapids_manager.py +123 -104
  60. rapidata/rapidata_client/validation/validation_set_manager.py +41 -38
  61. rapidata/rapidata_client/workflow/_ranking_workflow.py +14 -17
  62. rapidata/rapidata_client/workflow/_select_words_workflow.py +3 -16
  63. rapidata/service/openapi_service.py +8 -3
  64. {rapidata-2.41.3.dist-info → rapidata-2.42.1.dist-info}/METADATA +1 -1
  65. {rapidata-2.41.3.dist-info → rapidata-2.42.1.dist-info}/RECORD +67 -58
  66. {rapidata-2.41.3.dist-info → rapidata-2.42.1.dist-info}/WHEEL +1 -1
  67. rapidata/rapidata_client/datapoints/assets/_base_asset.py +0 -13
  68. rapidata/rapidata_client/datapoints/assets/_media_asset.py +0 -318
  69. rapidata/rapidata_client/datapoints/assets/_multi_asset.py +0 -61
  70. rapidata/rapidata_client/datapoints/assets/_sessions.py +0 -40
  71. rapidata/rapidata_client/datapoints/assets/_text_asset.py +0 -34
  72. rapidata/rapidata_client/datapoints/assets/data_type_enum.py +0 -8
  73. rapidata/rapidata_client/order/dataset/_progress_tracker.py +0 -100
  74. {rapidata-2.41.3.dist-info → rapidata-2.42.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,95 @@
1
+ from rapidata.api_client.models.text_asset_input import TextAssetInput
2
+ from rapidata.rapidata_client.datapoints._datapoint import Datapoint
3
+ from rapidata.service.openapi_service import OpenAPIService
4
+ from rapidata.api_client.models.multi_asset_input_assets_inner import (
5
+ MultiAssetInput,
6
+ MultiAssetInputAssetsInner,
7
+ )
8
+ from rapidata.api_client.models.create_datapoint_model import CreateDatapointModel
9
+ from rapidata.api_client.models.create_datapoint_model_asset import (
10
+ CreateDatapointModelAsset,
11
+ )
12
+ from rapidata.api_client.models.create_datapoint_result import CreateDatapointResult
13
+ from rapidata.api_client.models.create_datapoint_from_files_model_metadata_inner import (
14
+ CreateDatapointFromFilesModelMetadataInner,
15
+ )
16
+ from rapidata.api_client.models.existing_asset_input import ExistingAssetInput
17
+ from rapidata.rapidata_client.datapoints._asset_uploader import AssetUploader
18
+ from rapidata.rapidata_client.datapoints.metadata import (
19
+ PromptMetadata,
20
+ MediaAssetMetadata,
21
+ PrivateTextMetadata,
22
+ SelectWordsMetadata,
23
+ Metadata,
24
+ )
25
+
26
+
27
+ class DatapointUploader:
28
+ def __init__(self, openapi_service: OpenAPIService):
29
+ self.openapi_service = openapi_service
30
+ self.asset_uploader = AssetUploader(openapi_service)
31
+
32
+ def upload_datapoint(
33
+ self, datapoint: Datapoint, dataset_id: str, index: int
34
+ ) -> CreateDatapointResult:
35
+ metadata = self._get_metadata(datapoint)
36
+
37
+ uploaded_asset = (
38
+ self._handle_media_datapoint(datapoint)
39
+ if datapoint.data_type == "media"
40
+ else self._handle_text_datapoint(datapoint)
41
+ )
42
+ return self.openapi_service.dataset_api.dataset_dataset_id_datapoint_post(
43
+ dataset_id=dataset_id,
44
+ create_datapoint_model=CreateDatapointModel(
45
+ asset=uploaded_asset,
46
+ metadata=metadata,
47
+ sortIndex=index,
48
+ ),
49
+ )
50
+
51
+ def _get_metadata(
52
+ self, datapoint: Datapoint
53
+ ) -> list[CreateDatapointFromFilesModelMetadataInner]:
54
+ datapoint_metadata: list[Metadata] = []
55
+ if datapoint.context:
56
+ datapoint_metadata.append(PromptMetadata(prompt=datapoint.context))
57
+ if datapoint.sentence:
58
+ datapoint_metadata.append(
59
+ SelectWordsMetadata(select_words=datapoint.sentence)
60
+ )
61
+ if datapoint.media_context:
62
+ datapoint_metadata.append(
63
+ MediaAssetMetadata(
64
+ internal_file_name=self.asset_uploader.upload_asset(
65
+ datapoint.media_context
66
+ )
67
+ )
68
+ )
69
+ if datapoint.private_note:
70
+ datapoint_metadata.append(PrivateTextMetadata(text=datapoint.private_note))
71
+
72
+ metadata = [
73
+ CreateDatapointFromFilesModelMetadataInner(
74
+ actual_instance=metadata.to_model()
75
+ )
76
+ for metadata in datapoint_metadata
77
+ ]
78
+
79
+ return metadata
80
+
81
+ def _handle_text_datapoint(self, datapoint: Datapoint) -> CreateDatapointModelAsset:
82
+ return CreateDatapointModelAsset(
83
+ actual_instance=self.asset_uploader.get_uploaded_text_input(
84
+ datapoint.asset
85
+ ),
86
+ )
87
+
88
+ def _handle_media_datapoint(
89
+ self, datapoint: Datapoint
90
+ ) -> CreateDatapointModelAsset:
91
+ return CreateDatapointModelAsset(
92
+ actual_instance=self.asset_uploader.get_uploaded_asset_input(
93
+ datapoint.asset
94
+ ),
95
+ )
@@ -1,11 +0,0 @@
1
- """Assets Package
2
-
3
- This package provides classes for different types of assets, including MediaAsset, TextAsset, and MultiAsset.
4
- """
5
-
6
- from ._base_asset import BaseAsset
7
- from ._media_asset import MediaAsset
8
- from ._text_asset import TextAsset
9
- from ._multi_asset import MultiAsset
10
- from .data_type_enum import RapidataDataTypes
11
- from ._sessions import SessionManager
@@ -1,23 +1,26 @@
1
1
  from rapidata.api_client.models.prompt_asset_metadata_input import (
2
2
  PromptAssetMetadataInput,
3
3
  )
4
- from rapidata.api_client.models.url_asset_input import UrlAssetInput
5
4
  from rapidata.rapidata_client.datapoints.metadata._base_metadata import Metadata
6
- from rapidata.api_client.models.prompt_asset_metadata_input_asset import (
7
- PromptAssetMetadataInputAsset,
5
+ from rapidata.api_client.models.multi_asset_input_assets_inner import (
6
+ ExistingAssetInput,
7
+ MultiAssetInputAssetsInner,
8
8
  )
9
9
 
10
10
 
11
11
  class MediaAssetMetadata(Metadata):
12
12
 
13
- def __init__(self, url: str):
13
+ def __init__(self, internal_file_name: str):
14
14
  super().__init__()
15
- self._url = url
15
+ self._internal_file_name = internal_file_name
16
16
 
17
17
  def to_model(self):
18
18
  return PromptAssetMetadataInput(
19
19
  _t="PromptAssetMetadataInput",
20
- asset=PromptAssetMetadataInputAsset(
21
- actual_instance=UrlAssetInput(_t="UrlAssetInput", url=self._url)
20
+ asset=MultiAssetInputAssetsInner(
21
+ actual_instance=ExistingAssetInput(
22
+ _t="ExistingAssetInput",
23
+ name=self._internal_file_name,
24
+ ),
22
25
  ),
23
26
  )
@@ -1,34 +1,47 @@
1
- from rapidata.service.openapi_service import OpenAPIService
2
- from rapidata.rapidata_client.datapoints.assets import MediaAsset
3
- from rapidata.api_client.models.create_demographic_rapid_model import (
4
- CreateDemographicRapidModel,
1
+ from argparse import Action
2
+ from rapidata.api_client import ExistingAssetInput
3
+ from rapidata.api_client.models.create_demographic_rapid_model_asset import (
4
+ CreateDemographicRapidModelAsset,
5
5
  )
6
+ from rapidata.service.openapi_service import OpenAPIService
6
7
  from rapidata.api_client.models.classify_payload import ClassifyPayload
7
8
  from rapidata.rapidata_client.config import logger
9
+ from rapidata.api_client.models.create_demographic_rapid_model_new import (
10
+ CreateDemographicRapidModelNew,
11
+ )
12
+ from rapidata.rapidata_client.datapoints._asset_uploader import AssetUploader
8
13
 
9
14
 
10
15
  class DemographicManager:
11
16
  def __init__(self, openapi_service: OpenAPIService):
12
17
  self._openapi_service = openapi_service
18
+ self._asset_uploader = AssetUploader(openapi_service)
13
19
  logger.debug("DemographicManager initialized")
14
20
 
15
21
  def create_demographic_rapid(
16
22
  self, instruction: str, answer_options: list[str], datapoint: str, key: str
17
23
  ):
18
24
 
19
- media = MediaAsset(path=datapoint)
20
- model = CreateDemographicRapidModel(
25
+ model = CreateDemographicRapidModelNew(
21
26
  key=key,
22
27
  payload=ClassifyPayload(
23
28
  _t="ClassifyPayload",
24
29
  possibleCategories=answer_options,
25
30
  title=instruction,
26
31
  ),
32
+ asset=CreateDemographicRapidModelAsset(
33
+ actual_instance=ExistingAssetInput(
34
+ _t="ExistingAssetInput",
35
+ name=self._asset_uploader.upload_asset(datapoint),
36
+ ),
37
+ ),
27
38
  )
28
39
 
29
- self._openapi_service.rapid_api.rapid_demographic_post(
30
- model=model, file=[media.to_file()]
40
+ result = self._openapi_service.rapid_api.rapid_demographic_new_post(
41
+ create_demographic_rapid_model_new=model
31
42
  )
43
+ logger.info(f"Demographic Rapid created: {result.rapid_id}")
44
+ return result.rapid_id
32
45
 
33
46
  def __str__(self) -> str:
34
47
  return "DemographicManager"
@@ -1,14 +1,3 @@
1
- from typing import cast
2
- from rapidata.api_client.models.file_asset_model import FileAssetModel
3
- from rapidata.api_client.models.get_failed_datapoints_result import (
4
- GetFailedDatapointsResult,
5
- )
6
- from rapidata.api_client.models.multi_asset_model import MultiAssetModel
7
- from rapidata.api_client.models.original_filename_metadata_model import (
8
- OriginalFilenameMetadataModel,
9
- )
10
- from rapidata.api_client.models.source_url_metadata_model import SourceUrlMetadataModel
11
- from rapidata.rapidata_client.datapoints.assets import MediaAsset, MultiAsset
12
1
  from rapidata.rapidata_client.datapoints._datapoint import Datapoint
13
2
  from rapidata.rapidata_client.order.dataset._rapidata_dataset import RapidataDataset
14
3
  from rapidata.rapidata_client.order.rapidata_order import RapidataOrder
@@ -29,54 +18,3 @@ class FailedUploadException(Exception):
29
18
 
30
19
  def __str__(self) -> str:
31
20
  return f"Failed to upload {self.failed_uploads}"
32
-
33
-
34
- def _parse_failed_uploads(failed_uploads: GetFailedDatapointsResult) -> list[Datapoint]:
35
- failed_datapoints = failed_uploads.datapoints
36
- if not failed_datapoints:
37
- return []
38
- if isinstance(failed_datapoints[0].asset.actual_instance, FileAssetModel):
39
- failed_assets = [
40
- MediaAsset(
41
- __get_asset_name(cast(FileAssetModel, datapoint.asset.actual_instance))
42
- )
43
- for datapoint in failed_datapoints
44
- ]
45
- elif isinstance(failed_datapoints[0].asset.actual_instance, MultiAssetModel):
46
- failed_assets = []
47
- backend_assets = [
48
- cast(MultiAssetModel, failed_upload.asset.actual_instance).assets
49
- for failed_upload in failed_datapoints
50
- ]
51
- for assets in backend_assets:
52
- failed_assets.append(
53
- MultiAsset(
54
- [
55
- MediaAsset(
56
- __get_asset_name(
57
- cast(FileAssetModel, asset.actual_instance)
58
- )
59
- )
60
- for asset in assets
61
- if isinstance(asset.actual_instance, FileAssetModel)
62
- ]
63
- )
64
- )
65
- else:
66
- raise ValueError(
67
- f"Unsupported asset type: {type(failed_datapoints[0].asset.actual_instance)}"
68
- )
69
-
70
- return [Datapoint(asset=asset) for asset in failed_assets]
71
-
72
-
73
- def __get_asset_name(failed_datapoint: FileAssetModel) -> str:
74
- metadata = failed_datapoint.metadata
75
- if "sourceUrl" in metadata:
76
- return cast(SourceUrlMetadataModel, metadata["sourceUrl"].actual_instance).url
77
- elif "originalFilename" in metadata:
78
- return cast(
79
- OriginalFilenameMetadataModel, metadata["originalFilename"].actual_instance
80
- ).original_filename
81
- else:
82
- return ""
@@ -20,7 +20,6 @@ from rapidata.api_client.models.sticky_state import StickyState
20
20
  from rapidata.rapidata_client.datapoints._datapoint import Datapoint
21
21
  from rapidata.rapidata_client.exceptions.failed_upload_exception import (
22
22
  FailedUploadException,
23
- _parse_failed_uploads,
24
23
  )
25
24
  from rapidata.rapidata_client.filter import RapidataFilter
26
25
  from rapidata.rapidata_client.config import (
@@ -280,15 +279,6 @@ class RapidataOrderBuilder:
280
279
  try:
281
280
  self.__openapi_service.order_api.order_order_id_preview_post(self.order_id)
282
281
  except Exception:
283
- failed_uploads = _parse_failed_uploads(
284
- self.__openapi_service.dataset_api.dataset_dataset_id_datapoints_failed_get(
285
- self.__dataset.id
286
- )
287
- )
288
- logger.error(
289
- "Internal download error for datapoints: %s\nWARNING: Failed Datapoints in error do not contain metadata.",
290
- failed_uploads,
291
- )
292
282
  raise FailedUploadException(self.__dataset, order, failed_uploads)
293
283
  return order
294
284
 
@@ -1,6 +1,4 @@
1
1
  from rapidata.rapidata_client.datapoints._datapoint import Datapoint
2
- from rapidata.rapidata_client.datapoints.assets import TextAsset, MediaAsset
3
- from rapidata.service import LocalFileService
4
2
  from rapidata.service.openapi_service import OpenAPIService
5
3
  from concurrent.futures import ThreadPoolExecutor, as_completed
6
4
  from tqdm import tqdm
@@ -8,12 +6,11 @@ from tqdm import tqdm
8
6
  from typing import Generator
9
7
  from rapidata.rapidata_client.config import logger
10
8
  import time
11
- import threading
12
9
  from rapidata.rapidata_client.api.rapidata_api_client import (
13
10
  suppress_rapidata_error_logging,
14
11
  )
15
12
  from rapidata.rapidata_client.config.rapidata_config import rapidata_config
16
- from rapidata.rapidata_client.order.dataset._progress_tracker import ProgressTracker
13
+ from rapidata.rapidata_client.datapoints._datapoint_uploader import DatapointUploader
17
14
 
18
15
  # Add OpenTelemetry context imports for thread propagation
19
16
  from opentelemetry import context as otel_context
@@ -28,81 +25,78 @@ class RapidataDataset:
28
25
  def __init__(self, dataset_id: str, openapi_service: OpenAPIService):
29
26
  self.id = dataset_id
30
27
  self.openapi_service = openapi_service
31
- self.local_file_service = LocalFileService()
28
+ self.datapoint_uploader = DatapointUploader(openapi_service)
32
29
 
33
30
  def add_datapoints(
34
31
  self,
35
32
  datapoints: list[Datapoint],
36
33
  ) -> tuple[list[Datapoint], list[Datapoint]]:
37
- if not datapoints:
38
- return [], []
39
-
40
- effective_asset_type = datapoints[0]._get_effective_asset_type()
41
-
42
- logger.debug(f"Config for datapoint upload: {rapidata_config}")
43
-
44
- if issubclass(effective_asset_type, MediaAsset):
45
- return self._add_media_from_paths(
46
- datapoints,
47
- )
48
- elif issubclass(effective_asset_type, TextAsset):
49
- return self._add_texts(datapoints)
50
- else:
51
- raise ValueError(f"Unsupported asset type: {effective_asset_type}")
52
-
53
- def _add_texts(
54
- self, datapoints: list[Datapoint]
55
- ) -> tuple[list[Datapoint], list[Datapoint]]:
56
-
57
- def upload_text_datapoint(datapoint: Datapoint, index: int) -> Datapoint:
58
- model = datapoint.create_text_upload_model(index)
59
-
60
- self.openapi_service.dataset_api.dataset_dataset_id_datapoints_texts_post(
61
- dataset_id=self.id, create_datapoint_from_text_sources_model=model
62
- )
63
- return datapoint
34
+ """
35
+ Process uploads in chunks with a ThreadPoolExecutor.
64
36
 
65
- def upload_with_context(
66
- context: otel_context.Context, datapoint: Datapoint, index: int
67
- ) -> Datapoint:
68
- """Wrapper function that runs upload_text_datapoint with the provided context."""
69
- token = otel_context.attach(context)
70
- try:
71
- return upload_text_datapoint(datapoint, index)
72
- finally:
73
- otel_context.detach(token)
37
+ Args:
38
+ media_paths: List of assets to upload
39
+ multi_metadata: Optional sequence of sequences of metadata
40
+ chunk_size: Number of items to process in each batch
74
41
 
42
+ Returns:
43
+ tuple[list[str], list[str]]: Lists of successful and failed uploads
44
+ """
75
45
  successful_uploads: list[Datapoint] = []
76
46
  failed_uploads: list[Datapoint] = []
77
47
 
78
- # Capture the current OpenTelemetry context before creating threads
79
- current_context = otel_context.get_current()
80
-
81
- total_uploads = len(datapoints)
82
- with ThreadPoolExecutor(
83
- max_workers=rapidata_config.upload.maxWorkers
84
- ) as executor:
85
- future_to_datapoint = {
86
- executor.submit(
87
- upload_with_context, current_context, datapoint, i
88
- ): datapoint
89
- for i, datapoint in enumerate(datapoints)
90
- }
48
+ with tqdm(
49
+ total=len(datapoints),
50
+ desc="Uploading datapoints",
51
+ disable=rapidata_config.logging.silent_mode,
52
+ ) as progress_bar:
53
+
54
+ def process_upload_with_context(
55
+ context: otel_context.Context, datapoint: Datapoint, index: int
56
+ ) -> tuple[list[Datapoint], list[Datapoint]]:
57
+ """Wrapper function that runs _process_single_upload with the provided context."""
58
+ token = otel_context.attach(context)
59
+ try:
60
+ return self._process_single_upload(datapoint, index)
61
+ finally:
62
+ otel_context.detach(token)
63
+
64
+ # Capture the current OpenTelemetry context before creating threads
65
+ current_context = otel_context.get_current()
66
+
67
+ with ThreadPoolExecutor(
68
+ max_workers=rapidata_config.upload.maxWorkers
69
+ ) as executor:
70
+ # Process uploads in chunks to avoid overwhelming the system
71
+ for chunk_idx, chunk in enumerate(
72
+ chunk_list(datapoints, rapidata_config.upload.chunkSize)
73
+ ):
74
+ futures = [
75
+ executor.submit(
76
+ process_upload_with_context,
77
+ current_context,
78
+ datapoint,
79
+ chunk_idx * rapidata_config.upload.chunkSize + i,
80
+ )
81
+ for i, datapoint in enumerate(chunk)
82
+ ]
83
+
84
+ # Wait for this chunk to complete before starting the next one
85
+ for future in as_completed(futures):
86
+ try:
87
+ chunk_successful, chunk_failed = future.result()
88
+ successful_uploads.extend(chunk_successful)
89
+ failed_uploads.extend(chunk_failed)
90
+ progress_bar.update(len(chunk_successful))
91
+ except Exception as e:
92
+ logger.error("Future execution failed: %s", str(e))
91
93
 
92
- with tqdm(
93
- total=total_uploads,
94
- desc="Uploading text datapoints",
95
- disable=rapidata_config.logging.silent_mode,
96
- ) as pbar:
97
- for future in as_completed(future_to_datapoint.keys()):
98
- datapoint = future_to_datapoint[future]
99
- try:
100
- result = future.result()
101
- pbar.update(1)
102
- successful_uploads.append(result)
103
- except Exception as e:
104
- failed_uploads.append(datapoint)
105
- logger.error("Upload failed for %s: %s", datapoint, str(e))
94
+ if failed_uploads:
95
+ logger.error(
96
+ "Upload failed for %s datapoints: %s",
97
+ len(failed_uploads),
98
+ failed_uploads,
99
+ )
106
100
 
107
101
  return successful_uploads, failed_uploads
108
102
 
@@ -128,21 +122,14 @@ class RapidataDataset:
128
122
  local_successful: list[Datapoint] = []
129
123
  local_failed: list[Datapoint] = []
130
124
 
131
- metadata = datapoint.get_prepared_metadata()
132
-
133
- local_paths = datapoint.get_local_file_paths()
134
- urls = datapoint.get_urls()
135
-
136
125
  last_exception = None
137
126
  for attempt in range(rapidata_config.upload.maxRetries):
138
127
  try:
139
128
  with suppress_rapidata_error_logging():
140
- self.openapi_service.dataset_api.dataset_dataset_id_datapoints_post(
129
+ self.datapoint_uploader.upload_datapoint(
141
130
  dataset_id=self.id,
142
- file=local_paths,
143
- url=urls,
144
- metadata=metadata,
145
- sort_index=index,
131
+ datapoint=datapoint,
132
+ index=index,
146
133
  )
147
134
 
148
135
  local_successful.append(datapoint)
@@ -170,115 +157,6 @@ class RapidataDataset:
170
157
 
171
158
  return local_successful, local_failed
172
159
 
173
- def _process_uploads_in_chunks(
174
- self,
175
- datapoints: list[Datapoint],
176
- ) -> tuple[list[Datapoint], list[Datapoint]]:
177
- """
178
- Process uploads in chunks with a ThreadPoolExecutor.
179
-
180
- Args:
181
- media_paths: List of assets to upload
182
- multi_metadata: Optional sequence of sequences of metadata
183
- chunk_size: Number of items to process in each batch
184
-
185
- Returns:
186
- tuple[list[str], list[str]]: Lists of successful and failed uploads
187
- """
188
- successful_uploads: list[Datapoint] = []
189
- failed_uploads: list[Datapoint] = []
190
-
191
- def process_upload_with_context(
192
- context: otel_context.Context, datapoint: Datapoint, index: int
193
- ) -> tuple[list[Datapoint], list[Datapoint]]:
194
- """Wrapper function that runs _process_single_upload with the provided context."""
195
- token = otel_context.attach(context)
196
- try:
197
- return self._process_single_upload(datapoint, index)
198
- finally:
199
- otel_context.detach(token)
200
-
201
- # Capture the current OpenTelemetry context before creating threads
202
- current_context = otel_context.get_current()
203
-
204
- with ThreadPoolExecutor(
205
- max_workers=rapidata_config.upload.maxWorkers
206
- ) as executor:
207
- # Process uploads in chunks to avoid overwhelming the system
208
- for chunk_idx, chunk in enumerate(
209
- chunk_list(datapoints, rapidata_config.upload.chunkSize)
210
- ):
211
- futures = [
212
- executor.submit(
213
- process_upload_with_context,
214
- current_context,
215
- datapoint,
216
- chunk_idx * rapidata_config.upload.chunkSize + i,
217
- )
218
- for i, datapoint in enumerate(chunk)
219
- ]
220
-
221
- # Wait for this chunk to complete before starting the next one
222
- for future in as_completed(futures):
223
- try:
224
- chunk_successful, chunk_failed = future.result()
225
- successful_uploads.extend(chunk_successful)
226
- failed_uploads.extend(chunk_failed)
227
- except Exception as e:
228
- logger.error("Future execution failed: %s", str(e))
229
-
230
- return successful_uploads, failed_uploads
231
-
232
- def _add_media_from_paths(
233
- self,
234
- datapoints: list[Datapoint],
235
- progress_poll_interval: float = 0.5,
236
- ) -> tuple[list[Datapoint], list[Datapoint]]:
237
- """
238
- Upload media paths in chunks with managed resources.
239
-
240
- Args:
241
- datapoints: List of Datapoint objects to upload
242
- chunk_size: Number of items to process in each batch
243
- progress_poll_interval: Time in seconds between progress checks
244
- Returns:
245
- tuple[list[Datapoint], list[Datapoint]]: Lists of successful and failed datapoints
246
-
247
- Raises:
248
- ValueError: If multi_metadata lengths don't match media_paths length
249
- """
250
-
251
- # Setup tracking variables
252
- total_uploads = len(datapoints)
253
-
254
- # Create and start progress tracking thread
255
- progress_tracker = ProgressTracker(
256
- dataset_id=self.id,
257
- openapi_service=self.openapi_service,
258
- total_uploads=total_uploads,
259
- progress_poll_interval=progress_poll_interval,
260
- )
261
- progress_thread = progress_tracker.create_thread()
262
- progress_thread.start()
263
-
264
- # Process uploads in chunks
265
- try:
266
- successful_uploads, failed_uploads = self._process_uploads_in_chunks(
267
- datapoints,
268
- )
269
- finally:
270
- progress_tracker.complete()
271
- progress_thread.join(10)
272
-
273
- if failed_uploads:
274
- logger.error(
275
- "Upload failed for %s datapoints: %s",
276
- len(failed_uploads),
277
- failed_uploads,
278
- )
279
-
280
- return successful_uploads, failed_uploads
281
-
282
160
  def __str__(self) -> str:
283
161
  return f"RapidataDataset(id={self.id})"
284
162