PyPI - arize - Versions diffs - 8.0.0a13__py3-none-any.whl → 8.0.0a15__py3-none-any.whl - Mend

arize 8.0.0a13__py3-none-any.whl → 8.0.0a15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

arize/_exporter/client.py +18 -3
arize/_flight/client.py +6 -2
arize/datasets/client.py +88 -83
arize/{utils → models}/casting.py +12 -12
arize/models/client.py +330 -5
arize/{utils → models}/proto.py +1 -369
arize/spans/client.py +30 -6
arize/utils/arrow.py +4 -4
arize/version.py +1 -1
{arize-8.0.0a13.dist-info → arize-8.0.0a15.dist-info}/METADATA +11 -3
{arize-8.0.0a13.dist-info → arize-8.0.0a15.dist-info}/RECORD +13 -13
{arize-8.0.0a13.dist-info → arize-8.0.0a15.dist-info}/WHEEL +0 -0
{arize-8.0.0a13.dist-info → arize-8.0.0a15.dist-info}/licenses/LICENSE.md +0 -0

arize/models/client.py CHANGED Viewed

@@ -31,13 +31,17 @@ from arize.exceptions.parameters import (
 from arize.exceptions.spaces import MissingSpaceIDError
 from arize.logging import get_truncation_warning_message
 from arize.models.bounded_executor import BoundedExecutor
+from arize.models.casting import cast_dictionary, cast_typed_columns
 from arize.models.stream_validation import (
     validate_and_convert_prediction_id,
     validate_label,
 )
 from arize.types import (
+    CATEGORICAL_MODEL_TYPES,
+    NUMERIC_MODEL_TYPES,
     ActualLabelTypes,
     BaseSchema,
+    CorpusSchema,
     Embedding,
     Environments,
     LLMRunMetadata,
@@ -51,7 +55,6 @@ from arize.types import (
     convert_element,
     is_list_of,
 )
-from arize.utils.casting import cast_dictionary, cast_typed_columns
 if TYPE_CHECKING:
     import concurrent.futures as cf
@@ -63,6 +66,11 @@ if TYPE_CHECKING:
     from arize._generated.protocol.rec import public_pb2 as pb2
     from arize.config import SDKConfiguration
+    from arize.types import (
+        EmbeddingColumnNames,
+        Schema,
+    )
 logger = logging.getLogger(__name__)
@@ -122,7 +130,7 @@ class MLModelsClient:
     ) -> cf.Future:
         require(_STREAM_EXTRA, _STREAM_DEPS)
         from arize._generated.protocol.rec import public_pb2 as pb2
-        from arize.utils.proto import (
+        from arize.models.proto import (
             get_pb_dictionary,
             get_pb_label,
             get_pb_timestamp,
@@ -469,7 +477,6 @@ class MLModelsClient:
         from arize.models.batch_validation.validator import Validator
         from arize.utils.arrow import post_arrow_table
         from arize.utils.dataframe import remove_extraneous_columns
-        from arize.utils.proto import get_pb_schema, get_pb_schema_corpus
         # This method requires a space_id and project_name
         if not space_id:
@@ -620,12 +627,12 @@ class MLModelsClient:
             )
         if environment == Environments.CORPUS:
-            proto_schema = get_pb_schema_corpus(
+            proto_schema = _get_pb_schema_corpus(
                 schema=schema,
                 model_id=model_name,
             )
         else:
-            proto_schema = get_pb_schema(
+            proto_schema = _get_pb_schema(
                 schema=schema,
                 model_id=model_name,
                 model_version=model_version,
@@ -803,3 +810,321 @@ def _is_timestamp_in_range(now: int, ts: int):
     max_time = now + (MAX_FUTURE_YEARS_FROM_CURRENT_TIME * 365 * 24 * 60 * 60)
     min_time = now - (MAX_PAST_YEARS_FROM_CURRENT_TIME * 365 * 24 * 60 * 60)
     return min_time <= ts <= max_time
+def _get_pb_schema(
+    schema: Schema,
+    model_id: str,
+    model_version: str | None,
+    model_type: ModelTypes,
+    environment: Environments,
+    batch_id: str | None,
+) -> pb2.Schema:
+    """Build the protobuf ``Schema`` message describing a batch upload.
+
+    Copies the model-level constants (id, version, environment, model type,
+    batch id) and every column name that is set on ``schema`` into a new
+    ``pb2.Schema``. Attributes of ``schema`` that are ``None`` are skipped.
+
+    Args:
+        schema: User-facing schema whose column-name attributes are read.
+        model_id: Written to ``constants.model_id``.
+        model_version: Written to ``constants.model_version`` unless ``None``.
+        model_type: External model type, mapped to a ``pb2.Schema.ModelType``
+            value.
+        environment: One of PRODUCTION, VALIDATION or TRAINING.
+        batch_id: Written to ``constants.batch_id`` unless ``None``.
+
+    Returns:
+        The populated ``pb2.Schema`` message.
+
+    Raises:
+        ValueError: If ``environment`` is not one of the three handled values.
+    """
+    # Runtime imports: at module level these names are only imported under
+    # ``TYPE_CHECKING``, so the ``pb2.Schema()`` call and the ``isinstance``
+    # checks below would otherwise raise ``NameError``.
+    from arize._generated.protocol.rec import public_pb2 as pb2
+    from arize.types import EmbeddingColumnNames
+
+    s = pb2.Schema()
+    s.constants.model_id = model_id
+    if model_version is not None:
+        s.constants.model_version = model_version
+    if environment == Environments.PRODUCTION:
+        s.constants.environment = pb2.Schema.Environment.PRODUCTION
+    elif environment == Environments.VALIDATION:
+        s.constants.environment = pb2.Schema.Environment.VALIDATION
+    elif environment == Environments.TRAINING:
+        s.constants.environment = pb2.Schema.Environment.TRAINING
+    else:
+        raise ValueError(f"unexpected environment: {environment}")
+    # Map user-friendly external model types -> internal model types when sending to Arize
+    # NOTE(review): a model type matching none of the branches below leaves
+    # ``constants.model_type`` unset (no error is raised) -- confirm intended.
+    if model_type in NUMERIC_MODEL_TYPES:
+        s.constants.model_type = pb2.Schema.ModelType.NUMERIC
+    elif model_type in CATEGORICAL_MODEL_TYPES:
+        s.constants.model_type = pb2.Schema.ModelType.SCORE_CATEGORICAL
+    elif model_type == ModelTypes.RANKING:
+        s.constants.model_type = pb2.Schema.ModelType.RANKING
+    elif model_type == ModelTypes.OBJECT_DETECTION:
+        s.constants.model_type = pb2.Schema.ModelType.OBJECT_DETECTION
+    elif model_type == ModelTypes.GENERATIVE_LLM:
+        s.constants.model_type = pb2.Schema.ModelType.GENERATIVE_LLM
+    elif model_type == ModelTypes.MULTI_CLASS:
+        s.constants.model_type = pb2.Schema.ModelType.MULTI_CLASS
+    if batch_id is not None:
+        s.constants.batch_id = batch_id
+    if schema.prediction_id_column_name is not None:
+        s.arrow_schema.prediction_id_column_name = (
+            schema.prediction_id_column_name
+        )
+    if schema.timestamp_column_name is not None:
+        s.arrow_schema.timestamp_column_name = schema.timestamp_column_name
+    if schema.prediction_label_column_name is not None:
+        s.arrow_schema.prediction_label_column_name = (
+            schema.prediction_label_column_name
+        )
+    # Object-detection / segmentation prediction columns.
+    if model_type == ModelTypes.OBJECT_DETECTION:
+        if schema.object_detection_prediction_column_names is not None:
+            s.arrow_schema.prediction_object_detection_label_column_names.bboxes_coordinates_column_name = (
+                schema.object_detection_prediction_column_names.bounding_boxes_coordinates_column_name  # noqa: E501
+            )
+            s.arrow_schema.prediction_object_detection_label_column_names.bboxes_categories_column_name = (
+                schema.object_detection_prediction_column_names.categories_column_name  # noqa: E501
+            )
+            if (
+                schema.object_detection_prediction_column_names.scores_column_name
+                is not None
+            ):
+                s.arrow_schema.prediction_object_detection_label_column_names.bboxes_scores_column_name = (
+                    schema.object_detection_prediction_column_names.scores_column_name  # noqa: E501
+                )
+        if schema.semantic_segmentation_prediction_column_names is not None:
+            s.arrow_schema.prediction_semantic_segmentation_label_column_names.polygons_coordinates_column_name = (  # noqa: E501
+                schema.semantic_segmentation_prediction_column_names.polygon_coordinates_column_name
+            )
+            s.arrow_schema.prediction_semantic_segmentation_label_column_names.polygons_categories_column_name = (  # noqa: E501
+                schema.semantic_segmentation_prediction_column_names.categories_column_name
+            )
+        if schema.instance_segmentation_prediction_column_names is not None:
+            s.arrow_schema.prediction_instance_segmentation_label_column_names.polygons_coordinates_column_name = (  # noqa: E501
+                schema.instance_segmentation_prediction_column_names.polygon_coordinates_column_name
+            )
+            s.arrow_schema.prediction_instance_segmentation_label_column_names.polygons_categories_column_name = (  # noqa: E501
+                schema.instance_segmentation_prediction_column_names.categories_column_name
+            )
+            if (
+                schema.instance_segmentation_prediction_column_names.scores_column_name
+                is not None
+            ):
+                s.arrow_schema.prediction_instance_segmentation_label_column_names.polygons_scores_column_name = (  # noqa: E501
+                    schema.instance_segmentation_prediction_column_names.scores_column_name
+                )
+            if (
+                schema.instance_segmentation_prediction_column_names.bounding_boxes_coordinates_column_name
+                is not None
+            ):
+                s.arrow_schema.prediction_instance_segmentation_label_column_names.bboxes_coordinates_column_name = (  # noqa: E501
+                    schema.instance_segmentation_prediction_column_names.bounding_boxes_coordinates_column_name
+                )
+    if schema.prediction_score_column_name is not None:
+        if model_type in NUMERIC_MODEL_TYPES:
+            # allow numeric prediction to be sent in as either prediction_label (legacy) or
+            # prediction_score. This overwrites any prediction label column set above.
+            s.arrow_schema.prediction_label_column_name = (
+                schema.prediction_score_column_name
+            )
+        else:
+            s.arrow_schema.prediction_score_column_name = (
+                schema.prediction_score_column_name
+            )
+    if schema.feature_column_names is not None:
+        s.arrow_schema.feature_column_names.extend(schema.feature_column_names)
+    if schema.embedding_feature_column_names is not None:
+        for (
+            emb_name,
+            emb_col_names,
+        ) in schema.embedding_feature_column_names.items():
+            # emb_name is how it will show in the UI
+            s.arrow_schema.embedding_feature_column_names_map[
+                emb_name
+            ].vector_column_name = emb_col_names.vector_column_name
+            if emb_col_names.data_column_name:
+                s.arrow_schema.embedding_feature_column_names_map[
+                    emb_name
+                ].data_column_name = emb_col_names.data_column_name
+            if emb_col_names.link_to_data_column_name:
+                s.arrow_schema.embedding_feature_column_names_map[
+                    emb_name
+                ].link_to_data_column_name = (
+                    emb_col_names.link_to_data_column_name
+                )
+    # Prompt and response are stored in the embedding-feature map under the
+    # fixed keys "prompt" and "response". A plain string is treated as the
+    # data column name; an EmbeddingColumnNames supplies the vector column
+    # and, when set, the data column.
+    if schema.prompt_column_names is not None:
+        if isinstance(schema.prompt_column_names, str):
+            s.arrow_schema.embedding_feature_column_names_map[
+                "prompt"
+            ].data_column_name = schema.prompt_column_names
+        elif isinstance(schema.prompt_column_names, EmbeddingColumnNames):
+            col_names = schema.prompt_column_names
+            s.arrow_schema.embedding_feature_column_names_map[
+                "prompt"
+            ].vector_column_name = col_names.vector_column_name
+            if col_names.data_column_name:
+                s.arrow_schema.embedding_feature_column_names_map[
+                    "prompt"
+                ].data_column_name = col_names.data_column_name
+    if schema.response_column_names is not None:
+        if isinstance(schema.response_column_names, str):
+            s.arrow_schema.embedding_feature_column_names_map[
+                "response"
+            ].data_column_name = schema.response_column_names
+        elif isinstance(schema.response_column_names, EmbeddingColumnNames):
+            col_names = schema.response_column_names
+            s.arrow_schema.embedding_feature_column_names_map[
+                "response"
+            ].vector_column_name = col_names.vector_column_name
+            if col_names.data_column_name:
+                s.arrow_schema.embedding_feature_column_names_map[
+                    "response"
+                ].data_column_name = col_names.data_column_name
+    if schema.tag_column_names is not None:
+        s.arrow_schema.tag_column_names.extend(schema.tag_column_names)
+    # Actual label: for ranking models the relevance-labels column wins, then
+    # the attributions column; otherwise the plain actual label column is used.
+    if (
+        model_type == ModelTypes.RANKING
+        and schema.relevance_labels_column_name is not None
+    ):
+        s.arrow_schema.actual_label_column_name = (
+            schema.relevance_labels_column_name
+        )
+    elif (
+        model_type == ModelTypes.RANKING
+        and schema.attributions_column_name is not None
+    ):
+        s.arrow_schema.actual_label_column_name = (
+            schema.attributions_column_name
+        )
+    elif schema.actual_label_column_name is not None:
+        s.arrow_schema.actual_label_column_name = (
+            schema.actual_label_column_name
+        )
+    if (
+        model_type == ModelTypes.RANKING
+        and schema.relevance_score_column_name is not None
+    ):
+        s.arrow_schema.actual_score_column_name = (
+            schema.relevance_score_column_name
+        )
+    elif schema.actual_score_column_name is not None:
+        if model_type in NUMERIC_MODEL_TYPES:
+            # Numeric models: the actual score column is written to the
+            # actual_label slot, mirroring the prediction handling above. This
+            # overwrites any actual label column set above.
+            s.arrow_schema.actual_label_column_name = (
+                schema.actual_score_column_name
+            )
+        else:
+            s.arrow_schema.actual_score_column_name = (
+                schema.actual_score_column_name
+            )
+    if schema.shap_values_column_names is not None:
+        s.arrow_schema.shap_values_column_names.update(
+            schema.shap_values_column_names
+        )
+    if schema.prediction_group_id_column_name is not None:
+        s.arrow_schema.prediction_group_id_column_name = (
+            schema.prediction_group_id_column_name
+        )
+    if schema.rank_column_name is not None:
+        s.arrow_schema.rank_column_name = schema.rank_column_name
+    # Object-detection / segmentation actual columns.
+    if model_type == ModelTypes.OBJECT_DETECTION:
+        if schema.object_detection_actual_column_names is not None:
+            s.arrow_schema.actual_object_detection_label_column_names.bboxes_coordinates_column_name = (  # noqa: E501
+                schema.object_detection_actual_column_names.bounding_boxes_coordinates_column_name
+            )
+            s.arrow_schema.actual_object_detection_label_column_names.bboxes_categories_column_name = (  # noqa: E501
+                schema.object_detection_actual_column_names.categories_column_name
+            )
+            if (
+                schema.object_detection_actual_column_names.scores_column_name
+                is not None
+            ):
+                s.arrow_schema.actual_object_detection_label_column_names.bboxes_scores_column_name = (  # noqa: E501
+                    schema.object_detection_actual_column_names.scores_column_name
+                )
+        if schema.semantic_segmentation_actual_column_names is not None:
+            s.arrow_schema.actual_semantic_segmentation_label_column_names.polygons_coordinates_column_name = (  # noqa: E501
+                schema.semantic_segmentation_actual_column_names.polygon_coordinates_column_name
+            )
+            s.arrow_schema.actual_semantic_segmentation_label_column_names.polygons_categories_column_name = (  # noqa: E501
+                schema.semantic_segmentation_actual_column_names.categories_column_name
+            )
+        if schema.instance_segmentation_actual_column_names is not None:
+            s.arrow_schema.actual_instance_segmentation_label_column_names.polygons_coordinates_column_name = (  # noqa: E501
+                schema.instance_segmentation_actual_column_names.polygon_coordinates_column_name
+            )
+            s.arrow_schema.actual_instance_segmentation_label_column_names.polygons_categories_column_name = (  # noqa: E501
+                schema.instance_segmentation_actual_column_names.categories_column_name
+            )
+            if (
+                schema.instance_segmentation_actual_column_names.bounding_boxes_coordinates_column_name
+                is not None
+            ):
+                s.arrow_schema.actual_instance_segmentation_label_column_names.bboxes_coordinates_column_name = (  # noqa: E501
+                    schema.instance_segmentation_actual_column_names.bounding_boxes_coordinates_column_name
+                )
+    if model_type == ModelTypes.GENERATIVE_LLM:
+        if schema.prompt_template_column_names is not None:
+            s.arrow_schema.prompt_template_column_names.template_column_name = (
+                schema.prompt_template_column_names.template_column_name
+            )
+            s.arrow_schema.prompt_template_column_names.template_version_column_name = (  # noqa: E501
+                schema.prompt_template_column_names.template_version_column_name
+            )
+        if schema.llm_config_column_names is not None:
+            s.arrow_schema.llm_config_column_names.model_column_name = (
+                schema.llm_config_column_names.model_column_name
+            )
+            s.arrow_schema.llm_config_column_names.params_map_column_name = (
+                schema.llm_config_column_names.params_column_name
+            )
+        if schema.retrieved_document_ids_column_name is not None:
+            s.arrow_schema.retrieved_document_ids_column_name = (
+                schema.retrieved_document_ids_column_name
+            )
+    if model_type == ModelTypes.MULTI_CLASS:
+        if schema.prediction_score_column_name is not None:
+            s.arrow_schema.prediction_score_column_name = (
+                schema.prediction_score_column_name
+            )
+        if schema.multi_class_threshold_scores_column_name is not None:
+            s.arrow_schema.multi_class_threshold_scores_column_name = (
+                schema.multi_class_threshold_scores_column_name
+            )
+        if schema.actual_score_column_name is not None:
+            s.arrow_schema.actual_score_column_name = (
+                schema.actual_score_column_name
+            )
+    return s
+def _get_pb_schema_corpus(
+    schema: CorpusSchema,
+    model_id: str,
+) -> pb2.Schema:
+    """Build the protobuf ``Schema`` message for a corpus upload.
+
+    The environment is always set to CORPUS and the model type to
+    GENERATIVE_LLM. Document column names that are ``None`` on ``schema``
+    are skipped.
+
+    Args:
+        schema: Corpus schema whose document column names are read.
+        model_id: Written to ``constants.model_id``.
+
+    Returns:
+        The populated ``pb2.Schema`` message.
+    """
+    # Runtime import: at module level ``pb2`` is only imported under
+    # ``TYPE_CHECKING``, so ``pb2.Schema()`` would otherwise raise ``NameError``.
+    from arize._generated.protocol.rec import public_pb2 as pb2
+
+    s = pb2.Schema()
+    s.constants.model_id = model_id
+    s.constants.environment = pb2.Schema.Environment.CORPUS
+    s.constants.model_type = pb2.Schema.ModelType.GENERATIVE_LLM
+    if schema.document_id_column_name is not None:
+        s.arrow_schema.document_column_names.id_column_name = (
+            schema.document_id_column_name
+        )
+    if schema.document_version_column_name is not None:
+        s.arrow_schema.document_column_names.version_column_name = (
+            schema.document_version_column_name
+        )
+    if schema.document_text_embedding_column_names is not None:
+        s.arrow_schema.document_column_names.text_column_name.vector_column_name = schema.document_text_embedding_column_names.vector_column_name  # noqa: E501
+        # Guarded like link_to_data_column_name below; presumably the data
+        # column is optional on the embedding column names -- TODO confirm.
+        if (
+            schema.document_text_embedding_column_names.data_column_name
+            is not None
+        ):
+            s.arrow_schema.document_column_names.text_column_name.data_column_name = schema.document_text_embedding_column_names.data_column_name  # noqa: E501
+        if (
+            schema.document_text_embedding_column_names.link_to_data_column_name
+            is not None
+        ):
+            s.arrow_schema.document_column_names.text_column_name.link_to_data_column_name = schema.document_text_embedding_column_names.link_to_data_column_name  # noqa: E501
+    return s

arize 8.0.0a13__py3-none-any.whl → 8.0.0a15__py3-none-any.whl

arize 8.0.0a13__py3-none-any.whl → 8.0.0a15__py3-none-any.whl