rapidata 2.40.1__py3-none-any.whl → 2.40.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of rapidata might be problematic.
- rapidata/__init__.py +1 -1
- rapidata/rapidata_client/config/upload_config.py +1 -0
- rapidata/rapidata_client/exceptions/failed_upload_exception.py +1 -1
- rapidata/rapidata_client/order/_rapidata_order_builder.py +7 -3
- rapidata/rapidata_client/order/dataset/_progress_tracker.py +91 -0
- rapidata/rapidata_client/order/dataset/_rapidata_dataset.py +286 -0
- {rapidata-2.40.1.dist-info → rapidata-2.40.2.dist-info}/METADATA +1 -1
- {rapidata-2.40.1.dist-info → rapidata-2.40.2.dist-info}/RECORD +10 -9
- rapidata/rapidata_client/order/_rapidata_dataset.py +0 -475
- {rapidata-2.40.1.dist-info → rapidata-2.40.2.dist-info}/LICENSE +0 -0
- {rapidata-2.40.1.dist-info → rapidata-2.40.2.dist-info}/WHEEL +0 -0
rapidata/__init__.py
CHANGED
@@ -10,7 +10,7 @@ from rapidata.api_client.models.original_filename_metadata_model import (
 from rapidata.api_client.models.source_url_metadata_model import SourceUrlMetadataModel
 from rapidata.rapidata_client.datapoints.assets import MediaAsset, MultiAsset
 from rapidata.rapidata_client.datapoints._datapoint import Datapoint
-from rapidata.rapidata_client.order._rapidata_dataset import RapidataDataset
+from rapidata.rapidata_client.order.dataset._rapidata_dataset import RapidataDataset
 from rapidata.rapidata_client.order.rapidata_order import RapidataOrder
@@ -32,7 +32,7 @@ from rapidata.rapidata_client.config import (
 from rapidata.rapidata_client.validation.validation_set_manager import (
     ValidationSetManager,
 )
-from rapidata.rapidata_client.order._rapidata_dataset import RapidataDataset
+from rapidata.rapidata_client.order.dataset._rapidata_dataset import RapidataDataset
 from rapidata.rapidata_client.order.rapidata_order import RapidataOrder
 from rapidata.rapidata_client.referee import Referee
 from rapidata.rapidata_client.referee._naive_referee import NaiveReferee
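Both import sites in __init__.py now point at the relocated dataset module. As a small illustration (not part of the diff itself), code that previously imported the class through the old private path would switch to the new package location:

    from rapidata.rapidata_client.order.dataset._rapidata_dataset import RapidataDataset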
rapidata/rapidata_client/order/_rapidata_order_builder.py
CHANGED
@@ -235,6 +235,10 @@ class RapidataOrderBuilder:
                 + f"Please open this URL in your browser: '{encoded_url}'"
                 + Fore.RESET
             )
+            managed_print(
+                "If you want to avoid the automatic validation set creation in the future, set `rapidata_config.order.autoValidationSetCreation = False`."
+            )
+            managed_print()
 
         self.__dataset = (
             RapidataDataset(result.dataset_id, self.__openapi_service)
@@ -253,7 +257,7 @@
         )
 
         logger.debug("Order created: %s", order)
-        logger.debug("Adding
+        logger.debug("Adding datapoints to the order.")
 
         if self.__dataset:
             with tracer.start_as_current_span("add_datapoints"):
@@ -267,7 +271,7 @@
                 f"No dataset created for this order. order_id: {self.order_id}"
             )
 
-        logger.debug("
+        logger.debug("Datapoints added to the order.")
         logger.debug("Setting order to preview")
         try:
             self.__openapi_service.order_api.order_order_id_preview_post(self.order_id)
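The new managed_print hint above names a configuration switch. A minimal sketch of setting it ahead of time (assuming rapidata_config is imported the same way the new dataset modules in this diff import it):

    from rapidata.rapidata_client.config import rapidata_config

    # Opt out of the automatic validation set creation for future orders,
    # as suggested by the printed hint.
    rapidata_config.order.autoValidationSetCreation = False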
rapidata/rapidata_client/order/dataset/_progress_tracker.py
ADDED
@@ -0,0 +1,91 @@
+import threading
+import time
+from tqdm import tqdm
+
+from rapidata.service.openapi_service import OpenAPIService
+from rapidata.rapidata_client.config import logger, rapidata_config
+
+
+class ProgressTracker:
+    """
+    Track dataset upload progress in a background thread with shallow indentation.
+
+    This class encapsulates the progress polling loop to keep methods in
+    `RapidataDataset` simpler and below the maximum indentation depth.
+    """
+
+    def __init__(
+        self,
+        dataset_id: str,
+        openapi_service: OpenAPIService,
+        total_uploads: int,
+        progress_poll_interval: float,
+    ) -> None:
+        self.dataset_id = dataset_id
+        self.openapi_service = openapi_service
+        self.total_uploads = total_uploads
+        self.progress_poll_interval = progress_poll_interval
+        self.upload_complete = False
+
+    def _get_progress_or_none(self):
+        try:
+            return self.openapi_service.dataset_api.dataset_dataset_id_progress_get(
+                self.dataset_id
+            )
+        except Exception:  # noqa: BLE001
+            return None
+
+    def complete(self) -> None:
+        self.upload_complete = True
+
+    def run(self) -> None:
+        try:
+            with tqdm(
+                total=self.total_uploads,
+                desc="Uploading datapoints",
+                disable=rapidata_config.logging.silent_mode,
+            ) as pbar:
+                while True:
+                    current_progress = self._get_progress_or_none()
+                    if current_progress is None:
+                        time.sleep(self.progress_poll_interval)
+                        logger.debug(
+                            "No progress yet, sleeping for %s seconds",
+                            self.progress_poll_interval,
+                        )
+                        continue
+
+                    total_completed = current_progress.ready + current_progress.failed
+
+                    pbar.n = total_completed
+                    pbar.refresh()
+
+                    time.sleep(self.progress_poll_interval)
+                    if total_completed >= self.total_uploads:
+                        break
+
+                    if self.upload_complete and current_progress.pending == 0:
+                        break
+
+                pbar.close()
+
+            success_rate = (
+                round((current_progress.ready / self.total_uploads * 100), 2)
+                if self.total_uploads > 0
+                else 0
+            )
+
+            logger.info(
+                "Upload complete: %s ready, %s failed (%s%% success rate)",
+                current_progress.ready,
+                current_progress.failed,
+                success_rate,
+            )
+        except Exception as e:  # noqa: BLE001
+            logger.error("Progress tracking thread error: %s", str(e))
+            raise RuntimeError("Progress tracking failed, aborting uploads")
+
+    def create_thread(self) -> threading.Thread:
+        thread = threading.Thread(target=self.run)
+        thread.daemon = True
+        return thread
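For orientation, the new RapidataDataset._add_media_from_paths (added later in this diff) drives the tracker roughly like this; a condensed sketch, not the verbatim code:

    tracker = ProgressTracker(
        dataset_id=dataset_id,              # hypothetical local variables
        openapi_service=openapi_service,
        total_uploads=len(datapoints),
        progress_poll_interval=0.5,
    )
    progress_thread = tracker.create_thread()   # daemon thread running tracker.run()
    progress_thread.start()
    try:
        pass                                    # submit the uploads here
    finally:
        tracker.complete()                      # signal that all uploads were submitted
        progress_thread.join(10)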
rapidata/rapidata_client/order/dataset/_rapidata_dataset.py
ADDED
@@ -0,0 +1,286 @@
+from rapidata.rapidata_client.datapoints._datapoint import Datapoint
+from rapidata.rapidata_client.datapoints.assets import TextAsset, MediaAsset
+from rapidata.service import LocalFileService
+from rapidata.service.openapi_service import OpenAPIService
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from tqdm import tqdm
+
+from typing import Generator
+from rapidata.rapidata_client.config import logger
+import time
+import threading
+from rapidata.rapidata_client.api.rapidata_api_client import (
+    suppress_rapidata_error_logging,
+)
+from rapidata.rapidata_client.config.rapidata_config import rapidata_config
+from rapidata.rapidata_client.order.dataset._progress_tracker import ProgressTracker
+
+# Add OpenTelemetry context imports for thread propagation
+from opentelemetry import context as otel_context
+
+
+def chunk_list(lst: list, chunk_size: int) -> Generator:
+    for i in range(0, len(lst), chunk_size):
+        yield lst[i : i + chunk_size]
+
+
+class RapidataDataset:
+    def __init__(self, dataset_id: str, openapi_service: OpenAPIService):
+        self.id = dataset_id
+        self.openapi_service = openapi_service
+        self.local_file_service = LocalFileService()
+
+    def add_datapoints(
+        self,
+        datapoints: list[Datapoint],
+    ) -> tuple[list[Datapoint], list[Datapoint]]:
+        if not datapoints:
+            return [], []
+
+        effective_asset_type = datapoints[0]._get_effective_asset_type()
+
+        logger.debug(f"Config for datapoint upload: {rapidata_config}")
+
+        if issubclass(effective_asset_type, MediaAsset):
+            return self._add_media_from_paths(
+                datapoints,
+            )
+        elif issubclass(effective_asset_type, TextAsset):
+            return self._add_texts(datapoints)
+        else:
+            raise ValueError(f"Unsupported asset type: {effective_asset_type}")
+
+    def _add_texts(
+        self, datapoints: list[Datapoint]
+    ) -> tuple[list[Datapoint], list[Datapoint]]:
+
+        def upload_text_datapoint(datapoint: Datapoint, index: int) -> Datapoint:
+            model = datapoint.create_text_upload_model(index)
+
+            self.openapi_service.dataset_api.dataset_dataset_id_datapoints_texts_post(
+                dataset_id=self.id, create_datapoint_from_text_sources_model=model
+            )
+            return datapoint
+
+        def upload_with_context(
+            context: otel_context.Context, datapoint: Datapoint, index: int
+        ) -> Datapoint:
+            """Wrapper function that runs upload_text_datapoint with the provided context."""
+            token = otel_context.attach(context)
+            try:
+                return upload_text_datapoint(datapoint, index)
+            finally:
+                otel_context.detach(token)
+
+        successful_uploads: list[Datapoint] = []
+        failed_uploads: list[Datapoint] = []
+
+        # Capture the current OpenTelemetry context before creating threads
+        current_context = otel_context.get_current()
+
+        total_uploads = len(datapoints)
+        with ThreadPoolExecutor(
+            max_workers=rapidata_config.upload.maxWorkers
+        ) as executor:
+            future_to_datapoint = {
+                executor.submit(
+                    upload_with_context, current_context, datapoint, i
+                ): datapoint
+                for i, datapoint in enumerate(datapoints)
+            }
+
+            with tqdm(
+                total=total_uploads,
+                desc="Uploading text datapoints",
+                disable=rapidata_config.logging.silent_mode,
+            ) as pbar:
+                for future in as_completed(future_to_datapoint.keys()):
+                    datapoint = future_to_datapoint[future]
+                    try:
+                        result = future.result()
+                        pbar.update(1)
+                        successful_uploads.append(result)
+                    except Exception as e:
+                        failed_uploads.append(datapoint)
+                        logger.error("Upload failed for %s: %s", datapoint, str(e))
+
+        return successful_uploads, failed_uploads
+
+    def _process_single_upload(
+        self,
+        datapoint: Datapoint,
+        index: int,
+    ) -> tuple[list[Datapoint], list[Datapoint]]:
+        """
+        Process single upload with retry logic and error tracking.
+
+        Args:
+            media_asset: MediaAsset or MultiAsset to upload
+            meta_list: Optional sequence of metadata for the asset
+            index: Sort index for the upload
+            max_retries: Maximum number of retry attempts (default: 3)
+
+        Returns:
+            tuple[list[Datapoint], list[Datapoint]]: Lists of successful and failed datapoints
+        """
+        logger.debug("Processing single upload for %s with index %s", datapoint, index)
+
+        local_successful: list[Datapoint] = []
+        local_failed: list[Datapoint] = []
+
+        metadata = datapoint.get_prepared_metadata()
+
+        local_paths = datapoint.get_local_file_paths()
+        urls = datapoint.get_urls()
+
+        last_exception = None
+        for attempt in range(rapidata_config.upload.maxRetries):
+            try:
+                with suppress_rapidata_error_logging():
+                    self.openapi_service.dataset_api.dataset_dataset_id_datapoints_post(
+                        dataset_id=self.id,
+                        file=local_paths,
+                        url=urls,
+                        metadata=metadata,
+                        sort_index=index,
+                    )
+
+                local_successful.append(datapoint)
+
+                return local_successful, local_failed
+
+            except Exception as e:
+                last_exception = e
+                if attempt < rapidata_config.upload.maxRetries - 1:
+                    # Exponential backoff: wait 1s, then 2s, then 4s
+                    retry_delay = 2**attempt
+                    time.sleep(retry_delay)
+                    logger.debug("Error: %s", str(last_exception))
+                    logger.debug(
+                        "Retrying %s of %s...",
+                        attempt + 1,
+                        rapidata_config.upload.maxRetries,
+                    )
+
+        # If we get here, all retries failed
+        local_failed.append(datapoint)
+        tqdm.write(
+            f"Upload failed for {datapoint} after {rapidata_config.upload.maxRetries} attempts. \nFinal error: \n{str(last_exception)}"
+        )
+
+        return local_successful, local_failed
+
+    def _process_uploads_in_chunks(
+        self,
+        datapoints: list[Datapoint],
+    ) -> tuple[list[Datapoint], list[Datapoint]]:
+        """
+        Process uploads in chunks with a ThreadPoolExecutor.
+
+        Args:
+            media_paths: List of assets to upload
+            multi_metadata: Optional sequence of sequences of metadata
+            chunk_size: Number of items to process in each batch
+
+        Returns:
+            tuple[list[str], list[str]]: Lists of successful and failed uploads
+        """
+        successful_uploads: list[Datapoint] = []
+        failed_uploads: list[Datapoint] = []
+
+        def process_upload_with_context(
+            context: otel_context.Context, datapoint: Datapoint, index: int
+        ) -> tuple[list[Datapoint], list[Datapoint]]:
+            """Wrapper function that runs _process_single_upload with the provided context."""
+            token = otel_context.attach(context)
+            try:
+                return self._process_single_upload(datapoint, index)
+            finally:
+                otel_context.detach(token)
+
+        # Capture the current OpenTelemetry context before creating threads
+        current_context = otel_context.get_current()
+
+        with ThreadPoolExecutor(
+            max_workers=rapidata_config.upload.maxWorkers
+        ) as executor:
+            # Process uploads in chunks to avoid overwhelming the system
+            for chunk_idx, chunk in enumerate(
+                chunk_list(datapoints, rapidata_config.upload.chunkSize)
+            ):
+                futures = [
+                    executor.submit(
+                        process_upload_with_context,
+                        current_context,
+                        datapoint,
+                        chunk_idx * rapidata_config.upload.chunkSize + i,
+                    )
+                    for i, datapoint in enumerate(chunk)
+                ]
+
+                # Wait for this chunk to complete before starting the next one
+                for future in as_completed(futures):
+                    try:
+                        chunk_successful, chunk_failed = future.result()
+                        successful_uploads.extend(chunk_successful)
+                        failed_uploads.extend(chunk_failed)
+                    except Exception as e:
+                        logger.error("Future execution failed: %s", str(e))
+
+        return successful_uploads, failed_uploads
+
+    def _add_media_from_paths(
+        self,
+        datapoints: list[Datapoint],
+        progress_poll_interval: float = 0.5,
+    ) -> tuple[list[Datapoint], list[Datapoint]]:
+        """
+        Upload media paths in chunks with managed resources.
+
+        Args:
+            datapoints: List of Datapoint objects to upload
+            chunk_size: Number of items to process in each batch
+            progress_poll_interval: Time in seconds between progress checks
+        Returns:
+            tuple[list[Datapoint], list[Datapoint]]: Lists of successful and failed datapoints
+
+        Raises:
+            ValueError: If multi_metadata lengths don't match media_paths length
+        """
+
+        # Setup tracking variables
+        total_uploads = len(datapoints)
+
+        # Create and start progress tracking thread
+        progress_tracker = ProgressTracker(
+            dataset_id=self.id,
+            openapi_service=self.openapi_service,
+            total_uploads=total_uploads,
+            progress_poll_interval=progress_poll_interval,
+        )
+        progress_thread = progress_tracker.create_thread()
+        progress_thread.start()
+
+        # Process uploads in chunks
+        try:
+            successful_uploads, failed_uploads = self._process_uploads_in_chunks(
+                datapoints,
+            )
+        finally:
+            progress_tracker.complete()
+            progress_thread.join(10)
+
+        if failed_uploads:
+            logger.error(
+                "Upload failed for %s datapoints: %s",
+                len(failed_uploads),
+                failed_uploads,
+            )
+
+        return successful_uploads, failed_uploads
+
+    def __str__(self) -> str:
+        return f"RapidataDataset(id={self.id})"
+
+    def __repr__(self) -> str:
+        return self.__str__()
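Two small standalone illustrations of the helpers above (a sketch; the real module reads the chunk size and retry count from rapidata_config):

    # chunk_list splits the datapoint list into fixed-size batches
    def chunk_list(lst, chunk_size):
        for i in range(0, len(lst), chunk_size):
            yield lst[i : i + chunk_size]

    print(list(chunk_list(list(range(7)), 3)))     # [[0, 1, 2], [3, 4, 5], [6]]

    # _process_single_upload retries with exponential backoff between attempts
    print([2 ** attempt for attempt in range(3)])  # [1, 2, 4] seconds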
{rapidata-2.40.1.dist-info → rapidata-2.40.2.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
-rapidata/__init__.py,sha256=
+rapidata/__init__.py,sha256=nNNJT2nQfhHYe5yS9T3V-0MeyarzPFCtE_66Wnit6ho,917
 rapidata/api_client/__init__.py,sha256=utY2iWepKJQO_iGz6aIg_qSoqoDkV9pBMAA58pIFE4M,36016
 rapidata/api_client/api/__init__.py,sha256=07qqwzQiBYt5V2BtnzbXhZL2cmVHATyZmCSGshIXLck,1603
 rapidata/api_client/api/benchmark_api.py,sha256=Mlx2qDDJcgPjWvaBnps9dxvVd0re1knG0SyoLUiHKSc,119756
@@ -587,7 +587,7 @@ rapidata/rapidata_client/config/managed_print.py,sha256=2T6dwgR1EZzFAdOEyPp_BBUs
 rapidata/rapidata_client/config/order_config.py,sha256=XxRZERzUUA9md6-PVlV__eCw8DD2kPbT_UmMwG1mAS4,615
 rapidata/rapidata_client/config/rapidata_config.py,sha256=mURnKdl5-2sE4e_IYY9-aBkix6a12t47otEErGE_q0c,1507
 rapidata/rapidata_client/config/tracer.py,sha256=h3GXzaX79HPcip4fBhLaLW0mRlXttR7D3KA78ZT0KVw,4736
-rapidata/rapidata_client/config/upload_config.py,sha256=
+rapidata/rapidata_client/config/upload_config.py,sha256=hjefl-w9WaCNeCEe6hdnrAQEMjgDy-r1zgUUIFR68wk,473
 rapidata/rapidata_client/country_codes/__init__.py,sha256=FB9Dcks44J6C6YBSYmTmNZ71tE130x6NO_3aLJ8fKzQ,40
 rapidata/rapidata_client/country_codes/country_codes.py,sha256=ePHqeb7y9DWQZAnddBzPx1puYBcrgUjdR2sbFijuFD8,283
 rapidata/rapidata_client/datapoints/__init__.py,sha256=YiXWlFKSi3ABP35zDukL7_z5uEdRrCMriquM6BoX6-s,276
@@ -611,7 +611,7 @@ rapidata/rapidata_client/datapoints/metadata/_select_words_metadata.py,sha256=T8
 rapidata/rapidata_client/demographic/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 rapidata/rapidata_client/demographic/demographic_manager.py,sha256=x0kQdgqMXAx7VuZJiP2HeI_dtKEd-W-hcY3URDcEfrU,1089
 rapidata/rapidata_client/exceptions/__init__.py,sha256=2hbWRgjlCGuoLPVDloQmmH81uzm9F2OAX2iFGCJyRu8,59
-rapidata/rapidata_client/exceptions/failed_upload_exception.py,sha256=
+rapidata/rapidata_client/exceptions/failed_upload_exception.py,sha256=jsd2foR3c8X5g4hgljgMAY5X_JTdmUuhBPWaL12938E,3117
 rapidata/rapidata_client/filter/__init__.py,sha256=j_Kfz_asNVxwp56SAN2saB7ZAHg3smL5_W2sSitmuJY,548
 rapidata/rapidata_client/filter/_base_filter.py,sha256=NVa2oWgtXD9kmXWyMkYZZ-2RYzgcN0hO76uGrEXXLEs,2384
 rapidata/rapidata_client/filter/age_filter.py,sha256=mVZaKyBoK-mml_oFox97l1yUXvINPk-2cEimuU_FJac,908
@@ -631,8 +631,9 @@ rapidata/rapidata_client/filter/rapidata_filters.py,sha256=B8ptQsaAn1e14Grv8xBYQ
 rapidata/rapidata_client/filter/response_count_filter.py,sha256=i2u2YQD3_RLQRZyqAceAGLQS3es97Q2n8KTlgfDYMko,2332
 rapidata/rapidata_client/filter/user_score_filter.py,sha256=4B3Zzp7aosDFmte3nLPTlXMN4zatT6Wcq5QLIoXqhgI,1910
 rapidata/rapidata_client/order/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-rapidata/rapidata_client/order/
-rapidata/rapidata_client/order/
+rapidata/rapidata_client/order/_rapidata_order_builder.py,sha256=C-TbKELNuLjQiZt9Gsl6LdtzIUtsu0sNLKGIcLvJEHk,17120
+rapidata/rapidata_client/order/dataset/_progress_tracker.py,sha256=PkTSYrLVNgtXjklXj5ikBEcgF6qaYXoafYyUZQHRn9M,3109
+rapidata/rapidata_client/order/dataset/_rapidata_dataset.py,sha256=ONH56htEvoVZvkSItuTi3_88kaDWohmYYABSoAPEn4Q,10724
 rapidata/rapidata_client/order/rapidata_order.py,sha256=FvZi3t4dARRNsKWvYiNxVvM50AzPwQYR3AzI4utD6OI,14497
 rapidata/rapidata_client/order/rapidata_order_manager.py,sha256=XiV_BpJxG6d8o0rFDYhnB3_mb576CQG5hY-qVXlJZKY,42592
 rapidata/rapidata_client/order/rapidata_results.py,sha256=weL4S14fzug3ZOJbQk9Oj-4tv2jx5aZAMp7VJ-a6Qq4,8437
@@ -689,7 +690,7 @@ rapidata/service/credential_manager.py,sha256=T3yL4tXVnibRytxjQkOC-ex3kFGQR5KcKU
 rapidata/service/local_file_service.py,sha256=0Q4LdoEtPFKzgXK2oZ1cQ-X7FipakscjGnnBH8dRFRQ,855
 rapidata/service/openapi_service.py,sha256=k3V4eMNcAjBcxEv17lDivK8LV5TEjRTL9B_5KBlhcas,5482
 rapidata/types/__init__.py,sha256=gSGrmWV5gEA6pPfAR5vwSy_DvibO5IjCZDiB7LtlMOQ,6134
-rapidata-2.40.
-rapidata-2.40.
-rapidata-2.40.
-rapidata-2.40.
+rapidata-2.40.2.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+rapidata-2.40.2.dist-info/METADATA,sha256=j6edMuFl30ALoelOUbusmBXYPwMCJjfUZbWfzk2GsdY,1406
+rapidata-2.40.2.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+rapidata-2.40.2.dist-info/RECORD,,
rapidata/rapidata_client/order/_rapidata_dataset.py
DELETED
@@ -1,475 +0,0 @@
-from rapidata.rapidata_client.datapoints._datapoint import Datapoint
-from rapidata.rapidata_client.datapoints.assets import TextAsset, MediaAsset
-from rapidata.service import LocalFileService
-from rapidata.service.openapi_service import OpenAPIService
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from tqdm import tqdm
-
-from typing import Generator
-from rapidata.rapidata_client.config import logger, managed_print
-import time
-import threading
-from rapidata.rapidata_client.api.rapidata_api_client import (
-    suppress_rapidata_error_logging,
-)
-from rapidata.rapidata_client.config.rapidata_config import rapidata_config
-
-# Add OpenTelemetry context imports for thread propagation
-from opentelemetry import context as otel_context
-
-
-def chunk_list(lst: list, chunk_size: int) -> Generator:
-    for i in range(0, len(lst), chunk_size):
-        yield lst[i : i + chunk_size]
-
-
-class RapidataDataset:
-    def __init__(self, dataset_id: str, openapi_service: OpenAPIService):
-        self.id = dataset_id
-        self.openapi_service = openapi_service
-        self.local_file_service = LocalFileService()
-
-    def add_datapoints(
-        self,
-        datapoints: list[Datapoint],
-    ) -> tuple[list[Datapoint], list[Datapoint]]:
-        if not datapoints:
-            return [], []
-
-        effective_asset_type = datapoints[0]._get_effective_asset_type()
-
-        logger.debug(f"Config for datapoint upload: {rapidata_config}")
-
-        if issubclass(effective_asset_type, MediaAsset):
-            return self._add_media_from_paths(
-                datapoints,
-            )
-        elif issubclass(effective_asset_type, TextAsset):
-            return self._add_texts(datapoints)
-        else:
-            raise ValueError(f"Unsupported asset type: {effective_asset_type}")
-
-    def _add_texts(
-        self, datapoints: list[Datapoint]
-    ) -> tuple[list[Datapoint], list[Datapoint]]:
-
-        def upload_text_datapoint(datapoint: Datapoint, index: int) -> Datapoint:
-            model = datapoint.create_text_upload_model(index)
-
-            self.openapi_service.dataset_api.dataset_dataset_id_datapoints_texts_post(
-                dataset_id=self.id, create_datapoint_from_text_sources_model=model
-            )
-            return datapoint
-
-        def upload_with_context(
-            context: otel_context.Context, datapoint: Datapoint, index: int
-        ) -> Datapoint:
-            """Wrapper function that runs upload_text_datapoint with the provided context."""
-            token = otel_context.attach(context)
-            try:
-                return upload_text_datapoint(datapoint, index)
-            finally:
-                otel_context.detach(token)
-
-        successful_uploads: list[Datapoint] = []
-        failed_uploads: list[Datapoint] = []
-
-        # Capture the current OpenTelemetry context before creating threads
-        current_context = otel_context.get_current()
-
-        total_uploads = len(datapoints)
-        with ThreadPoolExecutor(
-            max_workers=rapidata_config.upload.maxWorkers
-        ) as executor:
-            future_to_datapoint = {
-                executor.submit(
-                    upload_with_context, current_context, datapoint, i
-                ): datapoint
-                for i, datapoint in enumerate(datapoints)
-            }
-
-            with tqdm(
-                total=total_uploads,
-                desc="Uploading text datapoints",
-                disable=rapidata_config.logging.silent_mode,
-            ) as pbar:
-                for future in as_completed(future_to_datapoint.keys()):
-                    datapoint = future_to_datapoint[future]
-                    try:
-                        result = future.result()
-                        pbar.update(1)
-                        successful_uploads.append(result)
-                    except Exception as e:
-                        failed_uploads.append(datapoint)
-                        logger.error("Upload failed for %s: %s", datapoint, str(e))
-
-        return successful_uploads, failed_uploads
-
-    def _process_single_upload(
-        self,
-        datapoint: Datapoint,
-        index: int,
-    ) -> tuple[list[Datapoint], list[Datapoint]]:
-        """
-        Process single upload with retry logic and error tracking.
-
-        Args:
-            media_asset: MediaAsset or MultiAsset to upload
-            meta_list: Optional sequence of metadata for the asset
-            index: Sort index for the upload
-            max_retries: Maximum number of retry attempts (default: 3)
-
-        Returns:
-            tuple[list[Datapoint], list[Datapoint]]: Lists of successful and failed datapoints
-        """
-        logger.debug("Processing single upload for %s with index %s", datapoint, index)
-
-        local_successful: list[Datapoint] = []
-        local_failed: list[Datapoint] = []
-
-        metadata = datapoint.get_prepared_metadata()
-
-        local_paths = datapoint.get_local_file_paths()
-        urls = datapoint.get_urls()
-
-        last_exception = None
-        for attempt in range(rapidata_config.upload.maxRetries):
-            try:
-                with suppress_rapidata_error_logging():
-                    self.openapi_service.dataset_api.dataset_dataset_id_datapoints_post(
-                        dataset_id=self.id,
-                        file=local_paths,
-                        url=urls,
-                        metadata=metadata,
-                        sort_index=index,
-                    )
-
-                local_successful.append(datapoint)
-
-                return local_successful, local_failed
-
-            except Exception as e:
-                last_exception = e
-                if attempt < rapidata_config.upload.maxRetries - 1:
-                    # Exponential backoff: wait 1s, then 2s, then 4s
-                    retry_delay = 2**attempt
-                    time.sleep(retry_delay)
-                    logger.debug("Error: %s", str(last_exception))
-                    logger.debug(
-                        "Retrying %s of %s...",
-                        attempt + 1,
-                        rapidata_config.upload.maxRetries,
-                    )
-
-        # If we get here, all retries failed
-        local_failed.append(datapoint)
-        tqdm.write(
-            f"Upload failed for {datapoint} after {rapidata_config.upload.maxRetries} attempts. \nFinal error: \n{str(last_exception)}"
-        )
-
-        return local_successful, local_failed
-
-    def _get_progress_tracker(
-        self,
-        total_uploads: int,
-        stop_event: threading.Event,
-        progress_error_event: threading.Event,
-        progress_poll_interval: float,
-    ) -> threading.Thread:
-        """
-        Create and return a progress tracking thread that shows actual API progress.
-
-        Args:
-            total_uploads: Total number of uploads to track
-            initial_ready: Initial number of ready items
-            initial_progress: Initial progress state
-            stop_event: Event to signal thread to stop
-            progress_error_event: Event to signal an error in progress tracking
-            progress_poll_interval: Time between progress checks
-
-        Returns:
-            threading.Thread: The progress tracking thread
-        """
-
-        def progress_tracking_thread():
-            try:
-                # Initialize progress bar with 0 completions
-                with tqdm(
-                    total=total_uploads,
-                    desc="Uploading datapoints",
-                    disable=rapidata_config.logging.silent_mode,
-                ) as pbar:
-                    prev_ready = 0
-                    prev_failed = 0
-                    stall_count = 0
-                    last_progress_time = time.time()
-
-                    # We'll wait for all uploads to finish + some extra time
-                    # for the backend to fully process everything
-                    all_uploads_complete = threading.Event()
-
-                    while not stop_event.is_set() or not all_uploads_complete.is_set():
-                        try:
-                            current_progress = self.openapi_service.dataset_api.dataset_dataset_id_progress_get(
-                                self.id
-                            )
-
-                            # Calculate items completed since our initialization
-                            completed_ready = current_progress.ready
-                            completed_failed = current_progress.failed
-                            total_completed = completed_ready + completed_failed
-
-                            # Calculate newly completed items since our last check
-                            new_ready = current_progress.ready - prev_ready
-                            new_failed = current_progress.failed - prev_failed
-
-                            # Update progress bar position to show actual completed items
-                            # First reset to match the actual completed count
-                            pbar.n = total_completed
-                            pbar.refresh()
-
-                            if new_ready > 0 or new_failed > 0:
-                                # We saw progress
-                                stall_count = 0
-                                last_progress_time = time.time()
-                            else:
-                                stall_count += 1
-
-                            # Update our tracking variables
-                            prev_ready = current_progress.ready
-                            prev_failed = current_progress.failed or 0
-
-                            # Check if stop_event was set (all uploads submitted)
-                            if stop_event.is_set():
-                                elapsed_since_last_progress = (
-                                    time.time() - last_progress_time
-                                )
-
-                                # If we haven't seen progress for a while after all uploads were submitted
-                                if elapsed_since_last_progress > 5.0:
-                                    # If we're at 100%, we're done
-                                    if total_completed >= total_uploads:
-                                        all_uploads_complete.set()
-                                        break
-
-                                    # If we're not at 100% but it's been a while with no progress
-                                    if stall_count > 5:
-                                        # We've polled several times with no progress, assume we're done
-                                        logger.warning(
-                                            "\nProgress seems stalled at %s/%s.",
-                                            total_completed,
-                                            total_uploads,
-                                        )
-                                        break
-
-                        except Exception as e:
-                            logger.error("\nError checking progress: %s", str(e))
-                            stall_count += 1
-
-                            if stall_count > 10:  # Too many consecutive errors
-                                progress_error_event.set()
-                                break
-
-                        # Sleep before next poll
-                        time.sleep(progress_poll_interval)
-
-            except Exception as e:
-                logger.error("Progress tracking thread error: %s", str(e))
-                progress_error_event.set()
-
-        # Create and return the thread
-        progress_thread = threading.Thread(target=progress_tracking_thread)
-        progress_thread.daemon = True
-        return progress_thread
-
-    def _process_uploads_in_chunks(
-        self,
-        datapoints: list[Datapoint],
-        chunk_size: int,
-        stop_progress_tracking: threading.Event,
-        progress_tracking_error: threading.Event,
-    ) -> tuple[list[Datapoint], list[Datapoint]]:
-        """
-        Process uploads in chunks with a ThreadPoolExecutor.
-
-        Args:
-            media_paths: List of assets to upload
-            multi_metadata: Optional sequence of sequences of metadata
-            chunk_size: Number of items to process in each batch
-            stop_progress_tracking: Event to signal progress tracking to stop
-            progress_tracking_error: Event to detect progress tracking errors
-
-        Returns:
-            tuple[list[str], list[str]]: Lists of successful and failed uploads
-        """
-        successful_uploads: list[Datapoint] = []
-        failed_uploads: list[Datapoint] = []
-
-        def process_upload_with_context(
-            context: otel_context.Context, datapoint: Datapoint, index: int
-        ) -> tuple[list[Datapoint], list[Datapoint]]:
-            """Wrapper function that runs _process_single_upload with the provided context."""
-            token = otel_context.attach(context)
-            try:
-                return self._process_single_upload(datapoint, index)
-            finally:
-                otel_context.detach(token)
-
-        # Capture the current OpenTelemetry context before creating threads
-        current_context = otel_context.get_current()
-
-        try:
-            with ThreadPoolExecutor(
-                max_workers=rapidata_config.upload.maxWorkers
-            ) as executor:
-                # Process uploads in chunks to avoid overwhelming the system
-                for chunk_idx, chunk in enumerate(chunk_list(datapoints, chunk_size)):
-                    futures = [
-                        executor.submit(
-                            process_upload_with_context,
-                            current_context,
-                            datapoint,
-                            chunk_idx * chunk_size + i,
-                        )
-                        for i, datapoint in enumerate(chunk)
-                    ]
-
-                    # Wait for this chunk to complete before starting the next one
-                    for future in as_completed(futures):
-                        if progress_tracking_error.is_set():
-                            raise RuntimeError(
-                                "Progress tracking failed, aborting uploads"
-                            )
-
-                        try:
-                            chunk_successful, chunk_failed = future.result()
-                            successful_uploads.extend(chunk_successful)
-                            failed_uploads.extend(chunk_failed)
-                        except Exception as e:
-                            logger.error("Future execution failed: %s", str(e))
-        finally:
-            # Signal to the progress tracking thread that all uploads have been submitted
-            stop_progress_tracking.set()
-
-        return successful_uploads, failed_uploads
-
-    def _log_final_progress(
-        self,
-        total_uploads: int,
-        progress_poll_interval: float,
-        successful_uploads: list[Datapoint],
-        failed_uploads: list[Datapoint],
-    ) -> None:
-        """
-        Log the final progress of the upload operation.
-
-        Args:
-            total_uploads: Total number of uploads
-            initial_ready: Initial number of ready items
-            initial_progress: Initial progress state
-            progress_poll_interval: Time between progress checks
-            successful_uploads: List of successful uploads for fallback reporting
-            failed_uploads: List of failed uploads for fallback reporting
-        """
-        try:
-            # Get final progress
-            final_progress = (
-                self.openapi_service.dataset_api.dataset_dataset_id_progress_get(
-                    self.id
-                )
-            )
-            total_ready = final_progress.ready
-            total_failed = final_progress.failed
-
-            # Make sure we account for all uploads
-            if total_ready + total_failed < total_uploads:
-                # Try one more time after a longer wait
-                time.sleep(5 * progress_poll_interval)
-                final_progress = (
-                    self.openapi_service.dataset_api.dataset_dataset_id_progress_get(
-                        self.id
-                    )
-                )
-                total_ready = final_progress.ready
-                total_failed = final_progress.failed
-
-            success_rate = (
-                (total_ready / total_uploads * 100) if total_uploads > 0 else 0
-            )
-
-            logger.info(
-                "Upload complete: %s ready, %s failed (%s%% success rate)",
-                total_ready,
-                total_uploads - total_ready,
-                success_rate,
-            )
-        except Exception as e:
-            logger.error("Error getting final progress: %s", str(e))
-            logger.info(
-                "Upload summary from local tracking: %s succeeded, %s failed",
-                len(successful_uploads),
-                len(failed_uploads),
-            )
-
-            if failed_uploads:
-                logger.error("Failed uploads: %s", failed_uploads)
-
-    def _add_media_from_paths(
-        self,
-        datapoints: list[Datapoint],
-        chunk_size: int = 50,
-        progress_poll_interval: float = 0.5,
-    ) -> tuple[list[Datapoint], list[Datapoint]]:
-        """
-        Upload media paths in chunks with managed resources.
-
-        Args:
-            datapoints: List of Datapoint objects to upload
-            chunk_size: Number of items to process in each batch
-            progress_poll_interval: Time in seconds between progress checks
-        Returns:
-            tuple[list[Datapoint], list[Datapoint]]: Lists of successful and failed datapoints
-
-        Raises:
-            ValueError: If multi_metadata lengths don't match media_paths length
-        """
-
-        # Setup tracking variables
-        total_uploads = len(datapoints)
-
-        # Create thread control events
-        stop_progress_tracking = threading.Event()
-        progress_tracking_error = threading.Event()
-
-        # Create and start progress tracking thread
-        progress_thread = self._get_progress_tracker(
-            total_uploads,
-            stop_progress_tracking,
-            progress_tracking_error,
-            progress_poll_interval,
-        )
-        progress_thread.start()
-
-        # Process uploads in chunks
-        try:
-            successful_uploads, failed_uploads = self._process_uploads_in_chunks(
-                datapoints,
-                chunk_size,
-                stop_progress_tracking,
-                progress_tracking_error,
-            )
-        finally:
-            progress_thread.join(10)  # Add margin to the timeout for tqdm
-
-        # Log final progress
-        self._log_final_progress(
-            total_uploads, progress_poll_interval, successful_uploads, failed_uploads
-        )
-
-        return successful_uploads, failed_uploads
-
-    def __str__(self) -> str:
-        return f"RapidataDataset(id={self.id})"
-
-    def __repr__(self) -> str:
-        return self.__str__()
{rapidata-2.40.1.dist-info → rapidata-2.40.2.dist-info}/LICENSE
File without changes
{rapidata-2.40.1.dist-info → rapidata-2.40.2.dist-info}/WHEEL
File without changes