PyPI - rapidata - Versions diffs - 2.3.2__py3-none-any.whl → 2.4.1__py3-none-any.whl - Mend

rapidata 2.3.2__py3-none-any.whl → 2.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22)

rapidata/__init__.py +1 -1
rapidata/api_client/__init__.py +2 -0
rapidata/api_client/api/rapid_api.py +268 -0
rapidata/api_client/models/__init__.py +2 -0
rapidata/api_client/models/add_validation_rapid_model.py +9 -2
rapidata/api_client/models/add_validation_text_rapid_model.py +9 -2
rapidata/api_client/models/query_validation_rapids_result.py +7 -3
rapidata/api_client/models/rapid_issue.py +41 -0
rapidata/api_client/models/report_model.py +103 -0
rapidata/api_client_README.md +3 -0
rapidata/rapidata_client/assets/_media_asset.py +118 -18
rapidata/rapidata_client/selection/rapidata_selections.py +11 -1
rapidata/rapidata_client/validation/rapidata_validation_set.py +2 -294
rapidata/rapidata_client/validation/rapids/rapids.py +72 -125
rapidata/rapidata_client/validation/rapids/rapids_manager.py +206 -42
rapidata/rapidata_client/validation/validation_set_manager.py +108 -69
{rapidata-2.3.2.dist-info → rapidata-2.4.1.dist-info}/METADATA +1 -1
{rapidata-2.3.2.dist-info → rapidata-2.4.1.dist-info}/RECORD +20 -20
rapidata/rapidata_client/validation/_validation_rapid_parts.py +0 -61
rapidata/rapidata_client/validation/_validation_set_builder.py +0 -481
{rapidata-2.3.2.dist-info → rapidata-2.4.1.dist-info}/LICENSE +0 -0
{rapidata-2.3.2.dist-info → rapidata-2.4.1.dist-info}/WHEEL +0 -0

rapidata/rapidata_client/assets/_media_asset.py CHANGED Viewed

@@ -12,6 +12,8 @@ from PIL import Image
 from tinytag import TinyTag
 import tempfile
 from pydantic import StrictStr, StrictBytes
+from typing import Optional
+import logging
 class MediaAsset(BaseAsset):
@@ -26,6 +28,7 @@ class MediaAsset(BaseAsset):
     Raises:
         FileNotFoundError: If the provided file path does not exist.
     """
+    _logger = logging.getLogger(__name__ + '.MediaAsset')
     ALLOWED_TYPES = [
         'image/',
@@ -33,6 +36,28 @@ class MediaAsset(BaseAsset):
         'video/mp4',       # MP4
     ]
+    MIME_TYPES = {
+        'jpg': 'image/jpeg',
+        'jpeg': 'image/jpeg',
+        'png': 'image/png',
+        'gif': 'image/gif',
+        'webp': 'image/webp',
+        'mp3': 'audio/mp3',
+        'mp4': 'video/mp4'
+    }
+    FILE_SIGNATURES = {
+        b'\xFF\xD8\xFF': 'image/jpeg',
+        b'\x89PNG\r\n\x1a\n': 'image/png',
+        b'GIF87a': 'image/gif',
+        b'GIF89a': 'image/gif',
+        b'RIFF': 'image/webp',
+        b'ID3': 'audio/mp3',
+        b'\xFF\xFB': 'audio/mp3',
+        b'\xFF\xF3': 'audio/mp3',
+        b'ftyp': 'video/mp4',
+    }
     def __init__(self, path: str):
         """
         Initialize a MediaAsset instance.
@@ -134,38 +159,113 @@ class MediaAsset(BaseAsset):
             name = name + '.jpg'
         return name
+    def __get_media_type_from_extension(self, url: str) -> Optional[str]:
+        """
+        Determine media type from URL file extension.
+        Args:
+            url: The URL to check
+        Returns:
+            Optional[str]: MIME type if valid extension found, None otherwise
+        """
+        try:
+            ext = url.lower().split('?')[0].split('.')[-1]
+            return self.MIME_TYPES.get(ext)
+        except IndexError:
+            return None
+    def __validate_image_content(self, content: bytes) -> bool:
+        """
+        Validate image content using PIL.
+        Args:
+            content: Image bytes to validate
+        Returns:
+            bool: True if valid image, False otherwise
+        """
+        try:
+            img = Image.open(BytesIO(content))
+            img.verify()
+            return True
+        except Exception as e:
+            self._logger.debug(f"Image validation failed: {str(e)}")
+            return False
+    def __get_media_type_from_signature(self, content: bytes) -> Optional[str]:
+        """
+        Determine media type from file signature.
+        Args:
+            content: File content bytes
+        Returns:
+            Optional[str]: MIME type if valid signature found, None otherwise
+        """
+        file_start = content[:32]
+        for signature, mime_type in self.FILE_SIGNATURES.items():
+            if file_start.startswith(signature) or (signature in file_start[:10]):
+                return mime_type
+        return None
     def __get_media_bytes(self, url: str) -> bytes:
         """
-        Downloads media files from URL and validates type and duration.
+        Downloads and validates media files from URL.
         Args:
             url: URL of the media file
         Returns:
-            bytes: Media data
+            bytes: Validated media content
         Raises:
-            ValueError: If media type is unsupported or duration exceeds limit
+            ValueError: If media type is unsupported or content validation fails
             requests.exceptions.RequestException: If download fails
         """
-        response = requests.get(url, stream=False)  # Don't stream, we need full file
-        response.raise_for_status()
+        try:
+            response = requests.get(url, stream=False)
+            response.raise_for_status()
+        except requests.exceptions.RequestException as e:
+            self._logger.error(f"Failed to download media from {url}: {str(e)}")
+            raise
+        content = response.content
         content_type = response.headers.get('content-type', '').lower()
-        # Validate content type
-        if not any(content_type.startswith(t) for t in self.ALLOWED_TYPES):
-            raise ValueError(
-                f'URL does not point to an allowed media type.\n'
-                f'Content-Type: {content_type}\n'
-                f'Allowed types: {self.ALLOWED_TYPES}'
-            )
-        content = BytesIO(response.content)
-        return content.getvalue()
+        # Case 1: Content-type is already allowed
+        if any(content_type.startswith(t) for t in self.ALLOWED_TYPES):
+            self._logger.debug(f"Content-type {content_type} is allowed")
+            return content
+        # Case 2: Try to validate based on extension
+        mime_type = self.__get_media_type_from_extension(url)
+        if mime_type and mime_type.startswith(tuple(self.ALLOWED_TYPES)):
+            self._logger.debug(f"Found valid mime type from extension: {mime_type}")
+            return content
+        # Case 3: Try to validate based on file signature
+        mime_type = self.__get_media_type_from_signature(content)
+        if mime_type and mime_type.startswith(tuple(self.ALLOWED_TYPES)):
+            self._logger.debug(f"Found valid mime type from signature: {mime_type}")
+            return content
+        # Case 4: Last resort - try direct image validation
+        if self.__validate_image_content(content):
+            self._logger.debug("Content validated as image through direct validation")
+            return content
+        # If we get here, validation failed
+        error_msg = (
+            f'Could not validate media type from content.\n'
+            f'Content-Type: {content_type}\n'
+            f'URL extension: {url.split("?")[0].split(".")[-1]}\n'
+            f'Allowed types: {self.ALLOWED_TYPES}'
+        )
+        self._logger.error(error_msg)
+        raise ValueError(error_msg)
     def to_file(self) -> StrictStr | tuple[StrictStr, StrictBytes] | StrictBytes: # types for autogenerated models
-        files = []
         if isinstance(self.path, str):
             return self.path
         else: # isinstance(self.path, bytes)

rapidata/rapidata_client/selection/rapidata_selections.py CHANGED Viewed

@@ -8,7 +8,16 @@ from rapidata.rapidata_client.selection import (
 class RapidataSelections:
     """RapidataSelections Classes
-    Selections are used to define what type of tasks and in what order they are shown to the user.
+    Selections are used to define what type of tasks and in what order they are shown to the user.
+    All Tasks are called a "Session". A session can contain multiple tasks of different types.
+    Example:
+    ```python
+        selections=[ValidationSelection("your-validation-set-id", 1),
+                    LabelingSelection(2)]
+    ```
+    The above example will create a session with a validation task followed by two labeling tasks.
     Attributes:
         labeling (LabelingSelection): The LabelingSelection instance.
@@ -16,6 +25,7 @@ class RapidataSelections:
         conditional_validation (ConditionalValidationSelection): The ConditionalValidationSelection instance.
         demographic (DemographicSelection): The DemographicSelection instance.
         capped (CappedSelection): The CappedSelection instance."""
     labeling = LabelingSelection
     validation = ValidationSelection
     conditional_validation = ConditionalValidationSelection

rapidata/rapidata_client/validation/rapidata_validation_set.py CHANGED Viewed

@@ -1,50 +1,3 @@
-import os
-from typing import Any
-from rapidata.api_client.models.add_validation_rapid_model import (
-    AddValidationRapidModel,
-)
-from rapidata.api_client.models.add_validation_text_rapid_model import (
-    AddValidationTextRapidModel,
-)
-from rapidata.api_client.models.add_validation_rapid_model_payload import (
-    AddValidationRapidModelPayload,
-)
-from rapidata.api_client.models.add_validation_rapid_model_truth import (
-    AddValidationRapidModelTruth,
-)
-from rapidata.api_client.models.attach_category_truth import AttachCategoryTruth
-from rapidata.api_client.models.bounding_box_payload import BoundingBoxPayload
-from rapidata.api_client.models.bounding_box_truth import BoundingBoxTruth
-from rapidata.api_client.models.classify_payload import ClassifyPayload
-from rapidata.api_client.models.compare_payload import ComparePayload
-from rapidata.api_client.models.compare_truth import CompareTruth
-from rapidata.api_client.models.datapoint_metadata_model_metadata_inner import (
-    DatapointMetadataModelMetadataInner,
-)
-from rapidata.api_client.models.empty_validation_truth import EmptyValidationTruth
-from rapidata.api_client.models.free_text_payload import FreeTextPayload
-from rapidata.api_client.models.line_payload import LinePayload
-from rapidata.api_client.models.line_truth import LineTruth
-from rapidata.api_client.models.locate_box_truth import LocateBoxTruth
-from rapidata.api_client.models.locate_payload import LocatePayload
-from rapidata.api_client.models.named_entity_payload import NamedEntityPayload
-from rapidata.api_client.models.named_entity_truth import NamedEntityTruth
-from rapidata.api_client.models.polygon_payload import PolygonPayload
-from rapidata.api_client.models.polygon_truth import PolygonTruth
-from rapidata.api_client.models.transcription_payload import TranscriptionPayload
-from rapidata.api_client.models.transcription_truth import TranscriptionTruth
-from rapidata.api_client.models.transcription_word import TranscriptionWord
-from rapidata.api_client.models.scrub_payload import ScrubPayload
-from rapidata.api_client.models.scrub_truth import ScrubTruth
-from rapidata.rapidata_client.assets._media_asset import MediaAsset
-from rapidata.rapidata_client.assets._multi_asset import MultiAsset
-from rapidata.rapidata_client.assets._text_asset import TextAsset
-from rapidata.rapidata_client.metadata._base_metadata import Metadata
-from rapidata.service.openapi_service import OpenAPIService
-from typing import Sequence
 class RapidataValidationSet:
     """A class for interacting with a Rapidata validation set.
@@ -57,257 +10,12 @@ class RapidataValidationSet:
         name (str): The name of the validation set.
     """
-    def __init__(self, validation_set_id, openapi_service: OpenAPIService, name: str):
+    def __init__(self, validation_set_id, name: str):
         self.id = validation_set_id
         self.name = name
-        self.__openapi_service = openapi_service
-    def __upload_files(self, model: AddValidationRapidModel, assets: list[MediaAsset]):
-        """Upload a file to the validation set.
-        Args:
-            assets: list[(MediaAsset)]: The asset to upload.
-        """
-        files = []
-        for asset in assets:
-            files.append(asset.to_file())
-        self.__openapi_service.validation_api.validation_add_validation_rapid_post(
-            model=model, files=files
-        )
-    def _add_general_validation_rapid(
-        self,
-        payload: (
-            BoundingBoxPayload
-            | ClassifyPayload
-            | ComparePayload
-            | FreeTextPayload
-            | LinePayload
-            | LocatePayload
-            | NamedEntityPayload
-            | PolygonPayload
-            | TranscriptionPayload
-            | ScrubPayload
-        ),
-        truths: (
-            AttachCategoryTruth
-            | BoundingBoxTruth
-            | CompareTruth
-            | EmptyValidationTruth
-            | LineTruth
-            | LocateBoxTruth
-            | NamedEntityTruth
-            | PolygonTruth
-            | TranscriptionTruth
-            | ScrubTruth
-        ),
-        metadata: Sequence[Metadata],
-        asset: MediaAsset | TextAsset | MultiAsset,
-        randomCorrectProbability: float,
-    ) -> None:
-        """Add a validation rapid to the validation set.
-        Args:
-            payload: The payload for the rapid.
-            truths: The truths for the rapid.
-            metadata (list[Metadata]): The metadata for the rapid.
-            asset: The asset(s) for the rapid.
-            randomCorrectProbability (float): The random correct probability for the rapid.
-        Returns:
-            None
-        Raises:
-            ValueError: If an invalid asset type is provided.
-        """
-        model = AddValidationRapidModel(
-            validationSetId=self.id,
-            payload=AddValidationRapidModelPayload(payload),
-            truth=AddValidationRapidModelTruth(truths),
-            metadata=[
-                DatapointMetadataModelMetadataInner(meta._to_model())
-                for meta in metadata
-            ],
-            randomCorrectProbability=randomCorrectProbability,
-        )
-        if isinstance(asset, MediaAsset):
-            self.__upload_files(model=model, assets=[asset])
-        elif isinstance(asset, TextAsset):
-            model = AddValidationTextRapidModel(
-                validationSetId=self.id,
-                payload=AddValidationRapidModelPayload(payload),
-                truth=AddValidationRapidModelTruth(truths),
-                metadata=[
-                    DatapointMetadataModelMetadataInner(meta._to_model())
-                    for meta in metadata
-                ],
-                randomCorrectProbability=randomCorrectProbability,
-                texts=[asset.text],
-            )
-            self.__openapi_service.validation_api.validation_add_validation_text_rapid_post(
-                add_validation_text_rapid_model=model
-            )
-        elif isinstance(asset, MultiAsset):
-            files = [a for a in asset if isinstance(a, MediaAsset)]
-            texts = [a.text for a in asset if isinstance(a, TextAsset)]
-            if files:
-                self.__upload_files(model=model, assets=files)
-            if texts:
-                model = AddValidationTextRapidModel(
-                    validationSetId=self.id,
-                    payload=AddValidationRapidModelPayload(payload),
-                    truth=AddValidationRapidModelTruth(truths),
-                    metadata=[
-                        DatapointMetadataModelMetadataInner(meta._to_model())
-                        for meta in metadata
-                    ],
-                    randomCorrectProbability=randomCorrectProbability,
-                    texts=texts,
-                )
-                self.__openapi_service.validation_api.validation_add_validation_text_rapid_post(
-                    add_validation_text_rapid_model=model
-                )
-        else:
-            raise ValueError("Invalid asset type")
-    def _add_classify_rapid(
-        self,
-        asset: MediaAsset | TextAsset,
-        instruction: str,
-        categories: list[str],
-        truths: list[str],
-        metadata: Sequence[Metadata] = [],
-    ) -> None:
-        """Add a classify rapid to the validation set.
-        Args:
-            asset (MediaAsset | TextAsset): The asset for the rapid.
-            instruction (str): The instruction for the rapid.
-            categories (list[str]): The list of categories for the rapid.
-            truths (list[str]): The list of truths for the rapid.
-            metadata (Sequence[Metadata], optional): The metadata for the rapid. Defaults to an empty list.
-        Returns:
-            None
-        """
-        payload = ClassifyPayload(
-            _t="ClassifyPayload", possibleCategories=categories, title=instruction
-        )
-        model_truth = AttachCategoryTruth(
-            correctCategories=truths, _t="AttachCategoryTruth"
-        )
-        self._add_general_validation_rapid(
-            payload=payload,
-            truths=model_truth,
-            metadata=metadata,
-            asset=asset,
-            randomCorrectProbability=len(truths) / len(categories),
-        )
-    def _add_compare_rapid(
-        self,
-        asset: MultiAsset,
-        instruction: str,
-        truth: str,
-        metadata: Sequence[Metadata] = [],
-    ) -> None:
-        """Add a compare rapid to the validation set.
-        Args:
-            asset (MultiAsset): The assets for the rapid.
-            instruction (str): The instruction for the rapid.
-            truth (str): The path to the truth file.
-            metadata (Sequence[Metadata], optional): The metadata for the rapid. Defaults to an empty list.
-        Returns:
-            None
-        Raises:
-            ValueError: If the number of assets is not exactly two.
-        """
-        payload = ComparePayload(_t="ComparePayload", criteria=instruction)
-        # take only last part of truth path
-        truth = os.path.basename(truth)
-        model_truth = CompareTruth(_t="CompareTruth", winnerId=truth)
-        if len(asset) != 2:
-            raise ValueError("Compare rapid requires exactly two media paths")
-        self._add_general_validation_rapid(
-            payload=payload,
-            truths=model_truth,
-            metadata=metadata,
-            asset=asset,
-            randomCorrectProbability=1 / len(asset),
-        )
-    def _add_transcription_rapid(
-        self,
-        asset: MediaAsset | TextAsset,
-        instruction: str,
-        text: list[str],
-        correct_words: list[str],
-        required_precision: float = 1,
-        required_completeness: float = 1,
-        metadata: Sequence[Metadata] = [],
-    ) -> None:
-        """Add a transcription rapid to the validation set.
-        Args:
-            asset (MediaAsset | TextAsset): The asset for the rapid.
-            instruction (str): The instruction for the rapid.
-            text (list[str]): The text for the rapid.
-            correct_words (list[str]): The list of correct words for the rapid.
-            required_precision (float, optional): The required precision for the rapid. Defaults to 1.
-            required_completeness (float, optional): The required completeness for the rapid. Defaults to 1.
-            metadata (Sequence[Metadata], optional): The metadata for the rapid. Defaults to an empty list.
-        Returns:
-            None
-        Raises:
-            ValueError: If a correct word is not found in the transcription.
-        """
-        transcription_words = [
-            TranscriptionWord(word=word, wordIndex=i)
-            for i, word in enumerate(text)
-        ]
-        correct_transcription_words = []
-        for word in correct_words:
-            if word not in text:
-                raise ValueError(f"Correct word '{word}' not found in transcription")
-            correct_transcription_words.append(
-                TranscriptionWord(word=word, wordIndex=text.index(word))
-            )
-        payload = TranscriptionPayload(
-            _t="TranscriptionPayload", title=instruction, transcription=transcription_words
-        )
-        model_truth = TranscriptionTruth(
-            _t="TranscriptionTruth",
-            correctWords=correct_transcription_words,
-            requiredPrecision=required_precision,
-            requiredCompleteness=required_completeness,
-        )
-        self._add_general_validation_rapid(
-            payload=payload,
-            truths=model_truth,
-            metadata=metadata,
-            asset=asset,
-            randomCorrectProbability=len(correct_words) / len(text),
-        )
     def __str__(self):
         return f"name: '{self.name}' id: {self.id}"
     def __repr__(self):
         return f"name: '{self.name}' id: {self.id}"

rapidata 2.3.2__py3-none-any.whl → 2.4.1__py3-none-any.whl

rapidata 2.3.2__py3-none-any.whl → 2.4.1__py3-none-any.whl