rapidata 2.35.1__py3-none-any.whl → 2.35.3__py3-none-any.whl

This diff shows the contents of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.

This version of rapidata might be problematic.

@@ -2,34 +2,39 @@ import requests
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 
+from rapidata.rapidata_client.config.config import rapidata_config
+
+
 class SessionManager:
     _session = None
-
+
     @classmethod
-    def get_session(cls, ) -> requests.Session:
+    def get_session(
+        cls,
+    ) -> requests.Session:
         """Get a singleton requests session with retry logic.
 
         Returns:
             requests.Session: A singleton requests session with retry logic.
         """
         if cls._session is None:
-            max_retries: int = 5
-            max_workers: int = 10
+            max_retries: int = rapidata_config.upload_max_retries
+            max_workers: int = rapidata_config.max_upload_workers
             cls._session = requests.Session()
             retries = Retry(
                 total=max_retries,
                 backoff_factor=1,
                 status_forcelist=[500, 502, 503, 504],
                 allowed_methods=["GET"],
-                respect_retry_after_header=True
+                respect_retry_after_header=True,
             )
 
             adapter = HTTPAdapter(
                 pool_connections=max_workers * 2,
                 pool_maxsize=max_workers * 4,
-                max_retries=retries
+                max_retries=retries,
            )
-            cls._session.mount('http://', adapter)
-            cls._session.mount('https://', adapter)
+            cls._session.mount("http://", adapter)
+            cls._session.mount("https://", adapter)
 
         return cls._session
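
The hunk above replaces the hard-coded retry and worker counts with values read from rapidata_config. A minimal, self-contained sketch of the same session-with-retries pattern, using a hypothetical UploadConfig dataclass in place of rapidata_config:

from dataclasses import dataclass

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


@dataclass
class UploadConfig:  # hypothetical stand-in for rapidata_config
    upload_max_retries: int = 5
    max_upload_workers: int = 10


def build_session(config: UploadConfig) -> requests.Session:
    # Retries and pool sizes are driven by config, mirroring the change above.
    session = requests.Session()
    retries = Retry(
        total=config.upload_max_retries,
        backoff_factor=1,
        status_forcelist=[500, 502, 503, 504],
        allowed_methods=["GET"],
        respect_retry_after_header=True,
    )
    adapter = HTTPAdapter(
        pool_connections=config.max_upload_workers * 2,
        pool_maxsize=config.max_upload_workers * 4,
        max_retries=retries,
    )
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session
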
@@ -1,23 +1,28 @@
-from itertools import zip_longest
-
-from rapidata.api_client.models.create_datapoint_from_text_sources_model import CreateDatapointFromTextSourcesModel
-from rapidata.api_client.models.dataset_dataset_id_datapoints_post_request_metadata_inner import DatasetDatasetIdDatapointsPostRequestMetadataInner
 from rapidata.rapidata_client.datapoints.datapoint import Datapoint
-from rapidata.rapidata_client.datapoints.metadata import Metadata
-from rapidata.rapidata_client.datapoints.assets import TextAsset, MediaAsset, MultiAsset, BaseAsset
+from rapidata.rapidata_client.datapoints.assets import TextAsset, MediaAsset
 from rapidata.service import LocalFileService
 from rapidata.service.openapi_service import OpenAPIService
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from tqdm import tqdm
 
-from typing import cast, Sequence, Generator
-from rapidata.rapidata_client.logging import logger, managed_print, RapidataOutputManager
+from typing import Generator
+from rapidata.rapidata_client.logging import (
+    logger,
+    managed_print,
+    RapidataOutputManager,
+)
 import time
 import threading
+from rapidata.rapidata_client.api.rapidata_exception import (
+    suppress_rapidata_error_logging,
+)
+from rapidata.rapidata_client.config.config import rapidata_config
+
 
 def chunk_list(lst: list, chunk_size: int) -> Generator:
     for i in range(0, len(lst), chunk_size):
-        yield lst[i:i + chunk_size]
+        yield lst[i : i + chunk_size]
+
 
 class RapidataDataset:
     def __init__(self, dataset_id: str, openapi_service: OpenAPIService):
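
The reformatted chunk_list helper above simply yields consecutive slices of at most chunk_size items; a quick usage example:

def chunk_list(lst: list, chunk_size: int):
    for i in range(0, len(lst), chunk_size):
        yield lst[i : i + chunk_size]


# [[1, 2], [3, 4], [5]]
print(list(chunk_list([1, 2, 3, 4, 5], 2)))
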
@@ -31,39 +36,49 @@ class RapidataDataset:
     ) -> tuple[list[Datapoint], list[Datapoint]]:
         if not datapoints:
             return [], []
-
+
         effective_asset_type = datapoints[0]._get_effective_asset_type()
-
+
+        logger.debug(f"Config for datapoint upload: {rapidata_config}")
+
         if issubclass(effective_asset_type, MediaAsset):
-            return self._add_media_from_paths(datapoints)
+            return self._add_media_from_paths(
+                datapoints,
+            )
         elif issubclass(effective_asset_type, TextAsset):
             return self._add_texts(datapoints)
         else:
             raise ValueError(f"Unsupported asset type: {effective_asset_type}")
 
     def _add_texts(
-        self,
-        datapoints: list[Datapoint],
-        max_workers: int = 10,
+        self, datapoints: list[Datapoint]
     ) -> tuple[list[Datapoint], list[Datapoint]]:
-
+
         def upload_text_datapoint(datapoint: Datapoint, index: int) -> Datapoint:
             model = datapoint.create_text_upload_model(index)
-
-            self.openapi_service.dataset_api.dataset_dataset_id_datapoints_texts_post(dataset_id=self.id, create_datapoint_from_text_sources_model=model)
+
+            self.openapi_service.dataset_api.dataset_dataset_id_datapoints_texts_post(
+                dataset_id=self.id, create_datapoint_from_text_sources_model=model
+            )
             return datapoint
 
         successful_uploads: list[Datapoint] = []
         failed_uploads: list[Datapoint] = []
 
         total_uploads = len(datapoints)
-        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        with ThreadPoolExecutor(
+            max_workers=rapidata_config.max_upload_workers
+        ) as executor:
             future_to_datapoint = {
                 executor.submit(upload_text_datapoint, datapoint, index=i): datapoint
                 for i, datapoint in enumerate(datapoints)
             }
 
-            with tqdm(total=total_uploads, desc="Uploading text datapoints", disable=RapidataOutputManager.silent_mode) as pbar:
+            with tqdm(
+                total=total_uploads,
+                desc="Uploading text datapoints",
+                disable=RapidataOutputManager.silent_mode,
+            ) as pbar:
                 for future in as_completed(future_to_datapoint.keys()):
                     datapoint = future_to_datapoint[future]
                     try:
@@ -72,7 +87,7 @@ class RapidataDataset:
                         successful_uploads.append(result)
                     except Exception as e:
                         failed_uploads.append(datapoint)
-                        logger.error(f"Upload failed for {datapoint}: {str(e)}")
+                        logger.error("Upload failed for %s: %s", datapoint, str(e))
 
         return successful_uploads, failed_uploads
 
@@ -80,20 +95,21 @@ class RapidataDataset:
         self,
         datapoint: Datapoint,
         index: int,
-        max_retries: int = 3,
     ) -> tuple[list[Datapoint], list[Datapoint]]:
         """
         Process single upload with retry logic and error tracking.
-
+
         Args:
             media_asset: MediaAsset or MultiAsset to upload
             meta_list: Optional sequence of metadata for the asset
             index: Sort index for the upload
             max_retries: Maximum number of retry attempts (default: 3)
-
+
         Returns:
             tuple[list[Datapoint], list[Datapoint]]: Lists of successful and failed datapoints
         """
+        logger.debug("Processing single upload for %s with index %s", datapoint, index)
+
         local_successful: list[Datapoint] = []
         local_failed: list[Datapoint] = []
 
@@ -103,44 +119,52 @@ class RapidataDataset:
         urls = datapoint.get_urls()
 
         last_exception = None
-        for attempt in range(max_retries):
+        for attempt in range(rapidata_config.upload_max_retries):
             try:
-                self.openapi_service.dataset_api.dataset_dataset_id_datapoints_post(
-                    dataset_id=self.id,
-                    file=local_paths,
-                    url=urls,
-                    metadata=metadata,
-                    sort_index=index,
-                )
-
+                with suppress_rapidata_error_logging():
+                    self.openapi_service.dataset_api.dataset_dataset_id_datapoints_post(
+                        dataset_id=self.id,
+                        file=local_paths,
+                        url=urls,
+                        metadata=metadata,
+                        sort_index=index,
+                    )
+
                 local_successful.append(datapoint)
 
                 return local_successful, local_failed
-
+
             except Exception as e:
                 last_exception = e
-                if attempt < max_retries - 1:
+                if attempt < rapidata_config.upload_max_retries - 1:
                     # Exponential backoff: wait 1s, then 2s, then 4s
-                    retry_delay = 2 ** attempt
+                    retry_delay = 2**attempt
                     time.sleep(retry_delay)
-                    managed_print(f"\nRetrying {attempt + 1} of {max_retries}...\n")
-
+                    logger.debug("Error: %s", str(last_exception))
+                    logger.debug(
+                        "Retrying %s of %s...",
+                        attempt + 1,
+                        rapidata_config.upload_max_retries,
+                    )
+
         # If we get here, all retries failed
         local_failed.append(datapoint)
-        logger.error(f"\nUpload failed for {datapoint} after {max_retries} attempts. Final error: {str(last_exception)}")
+        tqdm.write(
+            f"Upload failed for {datapoint} after {rapidata_config.upload_max_retries} attempts. \nFinal error: \n{str(last_exception)}"
+        )
 
         return local_successful, local_failed
 
     def _get_progress_tracker(
-        self,
-        total_uploads: int,
-        stop_event: threading.Event,
+        self,
+        total_uploads: int,
+        stop_event: threading.Event,
         progress_error_event: threading.Event,
         progress_poll_interval: float,
     ) -> threading.Thread:
         """
         Create and return a progress tracking thread that shows actual API progress.
-
+
         Args:
             total_uploads: Total number of uploads to track
             initial_ready: Initial number of ready items
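
The retry loop above now takes its attempt count from rapidata_config.upload_max_retries and wraps the POST in suppress_rapidata_error_logging, retrying with exponential backoff (1s, 2s, 4s, ...). A minimal sketch of that backoff pattern, with upload_once as a hypothetical placeholder for the actual datapoint POST:

import time


def upload_with_backoff(upload_once, max_retries: int = 3) -> bool:
    last_exception = None
    for attempt in range(max_retries):
        try:
            upload_once()
            return True
        except Exception as exc:
            last_exception = exc
            if attempt < max_retries - 1:
                # Exponential backoff: 1s, 2s, 4s, ...
                time.sleep(2**attempt)
    print(f"Upload failed after {max_retries} attempts: {last_exception}")
    return False
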
@@ -148,84 +172,97 @@ class RapidataDataset:
             stop_event: Event to signal thread to stop
             progress_error_event: Event to signal an error in progress tracking
             progress_poll_interval: Time between progress checks
-
+
         Returns:
             threading.Thread: The progress tracking thread
         """
+
         def progress_tracking_thread():
             try:
                 # Initialize progress bar with 0 completions
-                with tqdm(total=total_uploads, desc="Uploading datapoints", disable=RapidataOutputManager.silent_mode) as pbar:
+                with tqdm(
+                    total=total_uploads,
+                    desc="Uploading datapoints",
+                    disable=RapidataOutputManager.silent_mode,
+                ) as pbar:
                     prev_ready = 0
                     prev_failed = 0
                     stall_count = 0
                     last_progress_time = time.time()
-
+
                     # We'll wait for all uploads to finish + some extra time
                     # for the backend to fully process everything
                     all_uploads_complete = threading.Event()
-
+
                     while not stop_event.is_set() or not all_uploads_complete.is_set():
                         try:
-                            current_progress = self.openapi_service.dataset_api.dataset_dataset_id_progress_get(self.id)
-
+                            current_progress = self.openapi_service.dataset_api.dataset_dataset_id_progress_get(
+                                self.id
+                            )
+
                             # Calculate items completed since our initialization
                             completed_ready = current_progress.ready
                             completed_failed = current_progress.failed
                             total_completed = completed_ready + completed_failed
-
+
                             # Calculate newly completed items since our last check
                             new_ready = current_progress.ready - prev_ready
                             new_failed = current_progress.failed - prev_failed
-
+
                             # Update progress bar position to show actual completed items
                             # First reset to match the actual completed count
                             pbar.n = total_completed
                             pbar.refresh()
-
+
                             if new_ready > 0 or new_failed > 0:
                                 # We saw progress
                                 stall_count = 0
                                 last_progress_time = time.time()
                             else:
                                 stall_count += 1
-
+
                             # Update our tracking variables
                             prev_ready = current_progress.ready
                             prev_failed = current_progress.failed or 0
-
+
                             # Check if stop_event was set (all uploads submitted)
                             if stop_event.is_set():
-                                elapsed_since_last_progress = time.time() - last_progress_time
-
+                                elapsed_since_last_progress = (
+                                    time.time() - last_progress_time
+                                )
+
                                 # If we haven't seen progress for a while after all uploads were submitted
                                 if elapsed_since_last_progress > 5.0:
                                     # If we're at 100%, we're done
                                     if total_completed >= total_uploads:
                                         all_uploads_complete.set()
                                         break
-
+
                                     # If we're not at 100% but it's been a while with no progress
                                     if stall_count > 5:
                                         # We've polled several times with no progress, assume we're done
-                                        logger.warning(f"\nProgress seems stalled at {total_completed}/{total_uploads}. Please try again.")
+                                        logger.warning(
+                                            "\nProgress seems stalled at %s/%s.",
+                                            total_completed,
+                                            total_uploads,
+                                        )
                                         break
-
+
                         except Exception as e:
-                            logger.error(f"\nError checking progress: {str(e)}")
+                            logger.error("\nError checking progress: %s", str(e))
                             stall_count += 1
-
+
                             if stall_count > 10:  # Too many consecutive errors
                                 progress_error_event.set()
                                 break
-
+
                         # Sleep before next poll
                         time.sleep(progress_poll_interval)
-
+
             except Exception as e:
-                logger.error(f"Progress tracking thread error: {str(e)}")
+                logger.error("Progress tracking thread error: %s", str(e))
                 progress_error_event.set()
-
+
         # Create and return the thread
         progress_thread = threading.Thread(target=progress_tracking_thread)
         progress_thread.daemon = True
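
The hunk above mostly reflows the progress-tracking thread and switches its log calls to lazy %-style formatting. The underlying pattern is a daemon thread that polls a progress endpoint until a stop event is set and the work is accounted for; a simplified sketch, with get_progress as a hypothetical callable returning the number of completed items:

import threading
import time


def start_progress_tracker(get_progress, total: int, stop_event: threading.Event,
                           poll_interval: float = 0.5) -> threading.Thread:
    def poll():
        while True:
            done = get_progress()
            # Stop once all uploads were submitted and everything is accounted for.
            if stop_event.is_set() and done >= total:
                break
            time.sleep(poll_interval)

    thread = threading.Thread(target=poll, daemon=True)
    thread.start()
    return thread
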
@@ -234,68 +271,70 @@ class RapidataDataset:
     def _process_uploads_in_chunks(
         self,
         datapoints: list[Datapoint],
-        max_workers: int,
         chunk_size: int,
         stop_progress_tracking: threading.Event,
-        progress_tracking_error: threading.Event
+        progress_tracking_error: threading.Event,
     ) -> tuple[list[Datapoint], list[Datapoint]]:
         """
         Process uploads in chunks with a ThreadPoolExecutor.
-
+
         Args:
             media_paths: List of assets to upload
             multi_metadata: Optional sequence of sequences of metadata
-            max_workers: Maximum number of concurrent workers
             chunk_size: Number of items to process in each batch
             stop_progress_tracking: Event to signal progress tracking to stop
             progress_tracking_error: Event to detect progress tracking errors
-
+
         Returns:
             tuple[list[str], list[str]]: Lists of successful and failed uploads
         """
         successful_uploads: list[Datapoint] = []
         failed_uploads: list[Datapoint] = []
-
+
         try:
-            with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            with ThreadPoolExecutor(
+                max_workers=rapidata_config.max_upload_workers
+            ) as executor:
                 # Process uploads in chunks to avoid overwhelming the system
                 for chunk_idx, chunk in enumerate(chunk_list(datapoints, chunk_size)):
                     futures = [
                         executor.submit(
-                            self._process_single_upload,
-                            datapoint,
-                            index=(chunk_idx * chunk_size + i)
+                            self._process_single_upload,
+                            datapoint,
+                            index=(chunk_idx * chunk_size + i),
                         )
                         for i, datapoint in enumerate(chunk)
                     ]
-
+
                     # Wait for this chunk to complete before starting the next one
                     for future in as_completed(futures):
                         if progress_tracking_error.is_set():
-                            raise RuntimeError("Progress tracking failed, aborting uploads")
-
+                            raise RuntimeError(
+                                "Progress tracking failed, aborting uploads"
+                            )
+
                         try:
                             chunk_successful, chunk_failed = future.result()
                             successful_uploads.extend(chunk_successful)
                             failed_uploads.extend(chunk_failed)
                         except Exception as e:
-                            logger.error(f"Future execution failed: {str(e)}")
+                            logger.error("Future execution failed: %s", str(e))
         finally:
             # Signal to the progress tracking thread that all uploads have been submitted
             stop_progress_tracking.set()
-
+
         return successful_uploads, failed_uploads
 
     def _log_final_progress(
-        self,
-        total_uploads: int,
+        self,
+        total_uploads: int,
         progress_poll_interval: float,
         successful_uploads: list[Datapoint],
-        failed_uploads: list[Datapoint]
+        failed_uploads: list[Datapoint],
    ) -> None:
         """
         Log the final progress of the upload operation.
-
+
         Args:
             total_uploads: Total number of uploads
             initial_ready: Initial number of ready items
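
_process_uploads_in_chunks now sizes its ThreadPoolExecutor from rapidata_config.max_upload_workers instead of a max_workers argument; the chunking itself is unchanged: each batch of chunk_size datapoints completes before the next is submitted. A standalone sketch of that pattern, with process_item as a hypothetical worker function:

from concurrent.futures import ThreadPoolExecutor, as_completed


def process_in_chunks(items, process_item, max_workers=10, chunk_size=50):
    results, failures = [], []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for start in range(0, len(items), chunk_size):
            chunk = items[start : start + chunk_size]
            futures = [executor.submit(process_item, item) for item in chunk]
            # Wait for the whole chunk before submitting the next one.
            for future in as_completed(futures):
                try:
                    results.append(future.result())
                except Exception as exc:
                    failures.append(exc)
    return results, failures
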
@@ -304,93 +343,105 @@ class RapidataDataset:
             successful_uploads: List of successful uploads for fallback reporting
             failed_uploads: List of failed uploads for fallback reporting
         """
-        try:
+        try:
             # Get final progress
-            final_progress = self.openapi_service.dataset_api.dataset_dataset_id_progress_get(self.id)
+            final_progress = (
+                self.openapi_service.dataset_api.dataset_dataset_id_progress_get(
+                    self.id
+                )
+            )
             total_ready = final_progress.ready
             total_failed = final_progress.failed
-
+
             # Make sure we account for all uploads
             if total_ready + total_failed < total_uploads:
                 # Try one more time after a longer wait
                 time.sleep(5 * progress_poll_interval)
-                final_progress = self.openapi_service.dataset_api.dataset_dataset_id_progress_get(self.id)
+                final_progress = (
+                    self.openapi_service.dataset_api.dataset_dataset_id_progress_get(
+                        self.id
+                    )
+                )
                 total_ready = final_progress.ready
                 total_failed = final_progress.failed
-
-            success_rate = (total_ready / total_uploads * 100) if total_uploads > 0 else 0
-
-            logger.info(f"Upload complete: {total_ready} ready, {total_uploads-total_ready} failed ({success_rate:.1f}% success rate)")
+
+            success_rate = (
+                (total_ready / total_uploads * 100) if total_uploads > 0 else 0
+            )
+
+            logger.info(
+                "Upload complete: %s ready, %s failed (%s%% success rate)",
+                total_ready,
+                total_uploads - total_ready,
+                success_rate,
+            )
         except Exception as e:
-            logger.error(f"Error getting final progress: {str(e)}")
-            logger.info(f"Upload summary from local tracking: {len(successful_uploads)} succeeded, {len(failed_uploads)} failed")
+            logger.error("Error getting final progress: %s", str(e))
+            logger.info(
+                "Upload summary from local tracking: %s succeeded, %s failed",
+                len(successful_uploads),
+                len(failed_uploads),
+            )
 
         if failed_uploads:
-            logger.error(f"Failed uploads: {failed_uploads}")
+            logger.error("Failed uploads: %s", failed_uploads)
 
     def _add_media_from_paths(
         self,
         datapoints: list[Datapoint],
-        max_workers: int = 10,
         chunk_size: int = 50,
         progress_poll_interval: float = 0.5,
     ) -> tuple[list[Datapoint], list[Datapoint]]:
         """
         Upload media paths in chunks with managed resources.
-
+
         Args:
             datapoints: List of Datapoint objects to upload
-            max_workers: Maximum number of concurrent upload workers
             chunk_size: Number of items to process in each batch
             progress_poll_interval: Time in seconds between progress checks
-
         Returns:
             tuple[list[Datapoint], list[Datapoint]]: Lists of successful and failed datapoints
-
+
         Raises:
             ValueError: If multi_metadata lengths don't match media_paths length
         """
-
+
         # Setup tracking variables
         total_uploads = len(datapoints)
-
+
         # Create thread control events
         stop_progress_tracking = threading.Event()
         progress_tracking_error = threading.Event()
-
+
         # Create and start progress tracking thread
         progress_thread = self._get_progress_tracker(
-            total_uploads,
-            stop_progress_tracking,
+            total_uploads,
+            stop_progress_tracking,
             progress_tracking_error,
-            progress_poll_interval
+            progress_poll_interval,
        )
         progress_thread.start()
-
+
         # Process uploads in chunks
         try:
             successful_uploads, failed_uploads = self._process_uploads_in_chunks(
                 datapoints,
-                max_workers,
                 chunk_size,
                 stop_progress_tracking,
-                progress_tracking_error
+                progress_tracking_error,
             )
         finally:
             progress_thread.join(10)  # Add margin to the timeout for tqdm
-
+
         # Log final progress
         self._log_final_progress(
-            total_uploads,
-            progress_poll_interval,
-            successful_uploads,
-            failed_uploads
+            total_uploads, progress_poll_interval, successful_uploads, failed_uploads
         )
 
         return successful_uploads, failed_uploads
 
     def __str__(self) -> str:
         return f"RapidataDataset(id={self.id})"
-
+
     def __repr__(self) -> str:
         return self.__str__()