arize-phoenix 3.25.0__py3-none-any.whl → 4.0.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Note: this release of arize-phoenix has been flagged as potentially problematic.

Files changed (113)
  1. {arize_phoenix-3.25.0.dist-info → arize_phoenix-4.0.1.dist-info}/METADATA +26 -4
  2. {arize_phoenix-3.25.0.dist-info → arize_phoenix-4.0.1.dist-info}/RECORD +80 -75
  3. phoenix/__init__.py +9 -5
  4. phoenix/config.py +109 -53
  5. phoenix/datetime_utils.py +18 -1
  6. phoenix/db/README.md +25 -0
  7. phoenix/db/__init__.py +4 -0
  8. phoenix/db/alembic.ini +119 -0
  9. phoenix/db/bulk_inserter.py +206 -0
  10. phoenix/db/engines.py +152 -0
  11. phoenix/db/helpers.py +47 -0
  12. phoenix/db/insertion/evaluation.py +209 -0
  13. phoenix/db/insertion/helpers.py +51 -0
  14. phoenix/db/insertion/span.py +142 -0
  15. phoenix/db/migrate.py +71 -0
  16. phoenix/db/migrations/env.py +121 -0
  17. phoenix/db/migrations/script.py.mako +26 -0
  18. phoenix/db/migrations/versions/cf03bd6bae1d_init.py +280 -0
  19. phoenix/db/models.py +371 -0
  20. phoenix/exceptions.py +5 -1
  21. phoenix/server/api/context.py +40 -3
  22. phoenix/server/api/dataloaders/__init__.py +97 -0
  23. phoenix/server/api/dataloaders/cache/__init__.py +3 -0
  24. phoenix/server/api/dataloaders/cache/two_tier_cache.py +67 -0
  25. phoenix/server/api/dataloaders/document_evaluation_summaries.py +152 -0
  26. phoenix/server/api/dataloaders/document_evaluations.py +37 -0
  27. phoenix/server/api/dataloaders/document_retrieval_metrics.py +98 -0
  28. phoenix/server/api/dataloaders/evaluation_summaries.py +151 -0
  29. phoenix/server/api/dataloaders/latency_ms_quantile.py +198 -0
  30. phoenix/server/api/dataloaders/min_start_or_max_end_times.py +93 -0
  31. phoenix/server/api/dataloaders/record_counts.py +125 -0
  32. phoenix/server/api/dataloaders/span_descendants.py +64 -0
  33. phoenix/server/api/dataloaders/span_evaluations.py +37 -0
  34. phoenix/server/api/dataloaders/token_counts.py +138 -0
  35. phoenix/server/api/dataloaders/trace_evaluations.py +37 -0
  36. phoenix/server/api/input_types/SpanSort.py +138 -68
  37. phoenix/server/api/routers/v1/__init__.py +11 -0
  38. phoenix/server/api/routers/v1/evaluations.py +275 -0
  39. phoenix/server/api/routers/v1/spans.py +126 -0
  40. phoenix/server/api/routers/v1/traces.py +82 -0
  41. phoenix/server/api/schema.py +112 -48
  42. phoenix/server/api/types/DocumentEvaluationSummary.py +1 -1
  43. phoenix/server/api/types/Evaluation.py +29 -12
  44. phoenix/server/api/types/EvaluationSummary.py +29 -44
  45. phoenix/server/api/types/MimeType.py +2 -2
  46. phoenix/server/api/types/Model.py +9 -9
  47. phoenix/server/api/types/Project.py +240 -171
  48. phoenix/server/api/types/Span.py +87 -131
  49. phoenix/server/api/types/Trace.py +29 -20
  50. phoenix/server/api/types/pagination.py +151 -10
  51. phoenix/server/app.py +263 -35
  52. phoenix/server/grpc_server.py +93 -0
  53. phoenix/server/main.py +75 -60
  54. phoenix/server/openapi/docs.py +218 -0
  55. phoenix/server/prometheus.py +23 -7
  56. phoenix/server/static/index.js +662 -643
  57. phoenix/server/telemetry.py +68 -0
  58. phoenix/services.py +4 -0
  59. phoenix/session/client.py +34 -30
  60. phoenix/session/data_extractor.py +8 -3
  61. phoenix/session/session.py +176 -155
  62. phoenix/settings.py +13 -0
  63. phoenix/trace/attributes.py +349 -0
  64. phoenix/trace/dsl/README.md +116 -0
  65. phoenix/trace/dsl/filter.py +660 -192
  66. phoenix/trace/dsl/helpers.py +24 -5
  67. phoenix/trace/dsl/query.py +562 -185
  68. phoenix/trace/fixtures.py +69 -7
  69. phoenix/trace/otel.py +44 -200
  70. phoenix/trace/schemas.py +14 -8
  71. phoenix/trace/span_evaluations.py +5 -2
  72. phoenix/utilities/__init__.py +0 -26
  73. phoenix/utilities/span_store.py +0 -23
  74. phoenix/version.py +1 -1
  75. phoenix/core/project.py +0 -773
  76. phoenix/core/traces.py +0 -96
  77. phoenix/datasets/dataset.py +0 -214
  78. phoenix/datasets/fixtures.py +0 -24
  79. phoenix/datasets/schema.py +0 -31
  80. phoenix/experimental/evals/__init__.py +0 -73
  81. phoenix/experimental/evals/evaluators.py +0 -413
  82. phoenix/experimental/evals/functions/__init__.py +0 -4
  83. phoenix/experimental/evals/functions/classify.py +0 -453
  84. phoenix/experimental/evals/functions/executor.py +0 -353
  85. phoenix/experimental/evals/functions/generate.py +0 -138
  86. phoenix/experimental/evals/functions/processing.py +0 -76
  87. phoenix/experimental/evals/models/__init__.py +0 -14
  88. phoenix/experimental/evals/models/anthropic.py +0 -175
  89. phoenix/experimental/evals/models/base.py +0 -170
  90. phoenix/experimental/evals/models/bedrock.py +0 -221
  91. phoenix/experimental/evals/models/litellm.py +0 -134
  92. phoenix/experimental/evals/models/openai.py +0 -453
  93. phoenix/experimental/evals/models/rate_limiters.py +0 -246
  94. phoenix/experimental/evals/models/vertex.py +0 -173
  95. phoenix/experimental/evals/models/vertexai.py +0 -186
  96. phoenix/experimental/evals/retrievals.py +0 -96
  97. phoenix/experimental/evals/templates/__init__.py +0 -50
  98. phoenix/experimental/evals/templates/default_templates.py +0 -472
  99. phoenix/experimental/evals/templates/template.py +0 -195
  100. phoenix/experimental/evals/utils/__init__.py +0 -172
  101. phoenix/experimental/evals/utils/threads.py +0 -27
  102. phoenix/server/api/routers/evaluation_handler.py +0 -110
  103. phoenix/server/api/routers/span_handler.py +0 -70
  104. phoenix/server/api/routers/trace_handler.py +0 -60
  105. phoenix/storage/span_store/__init__.py +0 -23
  106. phoenix/storage/span_store/text_file.py +0 -85
  107. phoenix/trace/dsl/missing.py +0 -60
  108. {arize_phoenix-3.25.0.dist-info → arize_phoenix-4.0.1.dist-info}/WHEEL +0 -0
  109. {arize_phoenix-3.25.0.dist-info → arize_phoenix-4.0.1.dist-info}/licenses/IP_NOTICE +0 -0
  110. {arize_phoenix-3.25.0.dist-info → arize_phoenix-4.0.1.dist-info}/licenses/LICENSE +0 -0
  111. /phoenix/{datasets → db/insertion}/__init__.py +0 -0
  112. /phoenix/{experimental → db/migrations}/__init__.py +0 -0
  113. /phoenix/{storage → server/openapi}/__init__.py +0 -0
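
Expanded diff for phoenix/experimental/evals/evaluators.py (413 lines removed):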
@@ -1,413 +0,0 @@
- from textwrap import indent
- from typing import List, Mapping, Optional, Tuple, Type
-
- from phoenix.experimental.evals.models import set_verbosity
- from phoenix.experimental.evals.utils import (
-     NOT_PARSABLE,
-     openai_function_call_kwargs,
-     parse_openai_function_call,
-     snap_to_rail,
- )
- from phoenix.utilities.logging import printif
-
- from .models import BaseEvalModel, OpenAIModel
- from .templates import ClassificationTemplate, EvalCriteria, PromptOptions, PromptTemplate
-
- Record = Mapping[str, str]
- _TAB = " " * 4
-
-
- class LLMEvaluator:
-     """
-     Leverages an LLM to evaluate individual records.
-     """
-
-     def __init__(
-         self,
-         model: BaseEvalModel,
-         template: ClassificationTemplate,
-     ) -> None:
-         """Initializer for LLMEvaluator.
-
-         Args:
-             model (BaseEvalModel): The LLM model to use for evaluation.
-             template (ClassificationTemplate): The evaluation template.
-         """
-         self._model = model
-         self._template = template
-
-     @property
-     def default_concurrency(self) -> int:
-         return self._model.default_concurrency
-
-     def reload_client(self) -> None:
-         self._model.reload_client()
-
-     def evaluate(
-         self,
-         record: Record,
-         provide_explanation: bool = False,
-         use_function_calling_if_available: bool = True,
-         verbose: bool = False,
-     ) -> Tuple[str, Optional[float], Optional[str]]:
-         """
-         Evaluates a single record.
-
-         Args:
-             record (Record): The record to evaluate.
-
-             provide_explanation (bool, optional): Whether to provide an
-             explanation.
-
-             use_function_calling_if_available (bool, optional): If True, use
-             function calling (if available) as a means to constrain the LLM
-             outputs. With function calling, the LLM is instructed to provide its
-             response as a structured JSON object, which is easier to parse.
-
-             use_function_calling_if_available (bool, optional): If True, use
-             function calling (if available) as a means to constrain the LLM
-             outputs. With function calling, the LLM is instructed to provide its
-             response as a structured JSON object, which is easier to parse.
-
-             verbose (bool, optional): Whether to print verbose output.
-
-         Returns:
-             Tuple[str, Optional[float], Optional[str]]: A tuple containing:
-                 - label
-                 - score (if scores for each label are specified by the template)
-                 - explanation (if requested)
-         """
-         use_openai_function_call = (
-             use_function_calling_if_available
-             and isinstance(self._model, OpenAIModel)
-             and self._model.supports_function_calling
-         )
-         prompt = self._template.format(
-             record, options=PromptOptions(provide_explanation=provide_explanation)
-         )
-         with set_verbosity(self._model, verbose) as verbose_model:
-             unparsed_output = verbose_model(
-                 prompt,
-                 **(
-                     openai_function_call_kwargs(self._template.rails, provide_explanation)
-                     if use_openai_function_call
-                     else {}
-                 ),
-             )
-         label, explanation = _extract_label_and_explanation(
-             unparsed_output=unparsed_output,
-             template=self._template,
-             provide_explanation=provide_explanation,
-             use_openai_function_call=use_openai_function_call,
-             verbose=verbose,
-         )
-         score = self._template.score(label)
-         return label, score, explanation
-
-     async def aevaluate(
-         self,
-         record: Record,
-         provide_explanation: bool = False,
-         use_function_calling_if_available: bool = True,
-         verbose: bool = False,
-     ) -> Tuple[str, Optional[float], Optional[str]]:
-         """
-         Evaluates a single record.
-
-         Args:
-             record (Record): The record to evaluate.
-
-             provide_explanation (bool, optional): Whether to provide an
-             explanation.
-
-             use_function_calling_if_available (bool, optional): If True, use
-             function calling (if available) as a means to constrain the LLM
-             outputs. With function calling, the LLM is instructed to provide its
-             response as a structured JSON object, which is easier to parse.
-
-             verbose (bool, optional): Whether to print verbose output.
-
-         Returns:
-             Tuple[str, Optional[float], Optional[str]]: A tuple containing:
-                 - label
-                 - score (if scores for each label are specified by the template)
-                 - explanation (if requested)
-         """
-         use_openai_function_call = (
-             use_function_calling_if_available
-             and isinstance(self._model, OpenAIModel)
-             and self._model.supports_function_calling
-         )
-         prompt = self._template.format(
-             record, options=PromptOptions(provide_explanation=provide_explanation)
-         )
-         with set_verbosity(self._model, verbose) as verbose_model:
-             unparsed_output = await verbose_model._async_generate(
-                 prompt,
-                 **(
-                     openai_function_call_kwargs(self._template.rails, provide_explanation)
-                     if use_openai_function_call
-                     else {}
-                 ),
-             )
-         label, explanation = _extract_label_and_explanation(
-             unparsed_output=unparsed_output,
-             template=self._template,
-             provide_explanation=provide_explanation,
-             use_openai_function_call=use_openai_function_call,
-             verbose=verbose,
-         )
-         score = self._template.score(label)
-         return label, score, explanation
-
-
- def _create_llm_evaluator_subclass(
-     class_name: str, template: ClassificationTemplate, docstring: str
- ) -> Type[LLMEvaluator]:
-     """A factory method that dynamically creates subclasses of LLMEvaluator.
-
-     Args:
-         class_name (str): Name of the class to be created (should match the name
-         of the assignment variable).
-
-         template (ClassificationTemplate): The classification template to use
-         for evaluation.
-
-         docstring (str): The docstring that will be attached to the subclass.
-
-     Returns:
-         Type[LLMEvaluator]: The dynamically created subclass.
-     """
-
-     def __init__(self: LLMEvaluator, model: BaseEvalModel) -> None:
-         LLMEvaluator.__init__(self, model, template)
-
-     __init__.__doc__ = f"""
-     Initializer for {class_name}.
-
-     Args:
-         model (BaseEvalModel): The LLM model to use for evaluation."""
-
-     docstring += f" Outputs railed classes {', '.join(template.rails)}."
-     docstring += "\n\nThe template used for evaluation (without explanation) is:\n\n"
-     docstring += indent(template.template, 2 * _TAB)
-
-     return type(class_name, (LLMEvaluator,), {"__init__": __init__, "__doc__": docstring})
-
-
- (
-     HallucinationEvaluator,
-     RelevanceEvaluator,
-     ToxicityEvaluator,
-     QAEvaluator,
-     SummarizationEvaluator,
- ) = map(
-     lambda args: _create_llm_evaluator_subclass(*args),
-     (
-         (
-             "HallucinationEvaluator",
-             EvalCriteria.HALLUCINATION.value,
-             'Leverages an LLM to evaluate whether a response (stored under an "output" column) is a hallucination given a query (stored under an "input" column) and one or more retrieved documents (stored under a "reference" column).',  # noqa: E501
-         ),
-         (
-             "RelevanceEvaluator",
-             EvalCriteria.RELEVANCE.value,
-             'Leverages an LLM to evaluate whether a retrieved document (stored under a "reference" column) is relevant or irrelevant to the corresponding query (stored under the "input" column).',  # noqa: E501
-         ),
-         (
-             "ToxicityEvaluator",
-             EvalCriteria.TOXICITY.value,
-             'Leverages an LLM to evaluate whether the string stored under the "input" column contains racist, sexist, chauvinistic, biased, or otherwise toxic content.',  # noqa: E501
-         ),
-         (
-             "QAEvaluator",
-             EvalCriteria.QA.value,
-             'Leverages an LLM to evaluate whether a response (stored under an "output" column) is correct or incorrect given a query (stored under an "input" column) and one or more retrieved documents (stored under a "reference" column).',  # noqa: E501
-         ),
-         (
-             "SummarizationEvaluator",
-             EvalCriteria.SUMMARIZATION.value,
-             'Leverages an LLM to evaluate whether a summary (stored under an "output" column) provides an accurate synopsis of an input document (stored under a "input" column).',  # noqa: E501
-         ),
-     ),
- )
-
-
- class MapReducer:
-     """
-     Evaluates data that is too large to fit into a single context window using a
-     map-reduce strategy. The data must first be divided into "chunks" that
-     individually fit into an LLM's context window. Each chunk of data is
-     individually evaluated (the "map" step), producing intermediate outputs that
-     are combined into a single result (the "reduce" step).
-
-     This is the simplest strategy for evaluating long-context data.
-     """
-
-     def __init__(
-         self,
-         model: BaseEvalModel,
-         map_prompt_template: PromptTemplate,
-         reduce_prompt_template: PromptTemplate,
-     ) -> None:
-         """Initializes an instance.
-
-         Args:
-             model (BaseEvalModel): The LLM model to use for evaluation.
-
-             map_prompt_template (PromptTemplate): The template that is mapped
-             over each chunk to produce intermediate outputs. Must contain the
-             {chunk} placeholder.
-
-             reduce_prompt_template (PromptTemplate): The template that combines
-             the intermediate outputs into a single result. Must contain the
-             {mapped} placeholder, which will be formatted as a list of the
-             intermediate outputs produced by the map step.
-         """
-         self._model = model
-         self._map_prompt_template = map_prompt_template
-         self._reduce_prompt_template = reduce_prompt_template
-
-     def evaluate(self, chunks: List[str]) -> str:
-         """Evaluates a list of two or more chunks.
-
-         Args:
-             chunks (List[str]): A list of chunks to be evaluated. Each chunk is
-             inserted into the map_prompt_template and must therefore fit within
-             the LLM's context window and still leave room for the rest of the
-             prompt.
-
-         Returns:
-             str: The output of the map-reduce process.
-         """
-         if len(chunks) < 2:
-             raise ValueError(
-                 "The map-reduce strategy is not needed to evaluate data "
-                 "that fits within a single context window. "
-                 "Consider using llm_classify instead."
-             )
-         model = self._model
-         mapped_records = []
-         for chunk in chunks:
-             map_prompt = self._map_prompt_template.format({"chunk": chunk})
-             intermediate_output = model(map_prompt)
-             mapped_records.append(intermediate_output)
-         reduce_prompt = self._reduce_prompt_template.format({"mapped": repr(mapped_records)})
-         return model(reduce_prompt)
-
-
- class Refiner:
-     """
-     Evaluates data that is too large to fit into a single context window using a
-     refine strategy. The data must first be divided into "chunks" that
-     individually fit into an LLM's context window. An initial "accumulator" is
-     generated from the first chunk of data. The accumulator is subsequently
-     refined by iteratively updating and incorporating new information from each
-     subsequent chunk. An optional synthesis step can be used to synthesize the
-     final accumulator into a desired format.
-     """
-
-     def __init__(
-         self,
-         model: BaseEvalModel,
-         initial_prompt_template: PromptTemplate,
-         refine_prompt_template: PromptTemplate,
-         synthesize_prompt_template: Optional[PromptTemplate] = None,
-     ) -> None:
-         """Initializes an instance.
-
-         Args:
-             model (BaseEvalModel): The LLM model to use for evaluation.
-
-             initial_prompt_template (PromptTemplate): The template for the
-             initial invocation of the model that will generate the initial
-             accumulator. Should contain the {chunk} placeholder.
-
-             refine_prompt_template (PromptTemplate): The template for refining
-             the accumulator across all subsequence chunks. Must contain the
-             {chunk} and {accumulator} placeholders.
-
-             synthesize_prompt_template (Optional[PromptTemplate], optional): An
-             optional template to synthesize the final version of the
-             accumulator. Must contain the {accumulator} placeholder.
-         """
-         self._model = model
-         self._initial_prompt_template = initial_prompt_template
-         self._refine_prompt_template = refine_prompt_template
-         self._synthesize_prompt_template = synthesize_prompt_template
-
-     def evaluate(self, chunks: List[str]) -> str:
-         """Evaluates a list of two or more chunks.
-
-         Args:
-             chunks (List[str]): A list of chunks to be evaluated. Each chunk is
-             inserted into the initial_prompt_template and refine_prompt_template
-             and must therefore fit within the LLM's context window and still
-             leave room for the rest of the prompt.
-
-         Returns:
-             str: The output of the refine process.
-         """
-         if len(chunks) < 2:
-             raise ValueError(
-                 "The refine strategy is not needed to evaluate data "
-                 "that fits within a single context window. "
-                 "Consider using llm_classify instead."
-             )
-         model = self._model
-         initial_prompt = self._initial_prompt_template.format({"chunk": chunks[0]})
-         accumulator = model(initial_prompt)
-         for chunk in chunks[1:]:
-             refine_prompt = self._refine_prompt_template.format(
-                 {"accumulator": accumulator, "chunk": chunk}
-             )
-             accumulator = model(refine_prompt)
-         if not self._synthesize_prompt_template:
-             return accumulator
-         reduce_prompt = self._synthesize_prompt_template.format({"accumulator": accumulator})
-         return model(reduce_prompt)
-
-
- def _extract_label_and_explanation(
-     unparsed_output: str,
-     template: ClassificationTemplate,
-     provide_explanation: bool,
-     use_openai_function_call: bool,
-     verbose: bool,
- ) -> Tuple[str, Optional[str]]:
-     """
-     Extracts the label and explanation from the unparsed output.
-
-     Args:
-         unparsed_output (str): The raw output to be parsed.
-
-         template (ClassificationTemplate): The template used to generate the
-         output.
-
-         provide_explanation (bool): Whether the output includes an explanation.
-
-         use_openai_function_call (bool): Whether the output was generated using
-         function calling.
-
-         verbose (bool): If True, print verbose output to stdout.
-
-     Returns:
-         Tuple[str, Optional[str]]: A tuple containing the label and an
-         explanation (if one is provided).
-     """
-     if not use_openai_function_call:
-         if provide_explanation:
-             unrailed_label, explanation = (
-                 template.extract_label_from_explanation(unparsed_output),
-                 unparsed_output,
-             )
-             printif(
-                 verbose and unrailed_label == NOT_PARSABLE,
-                 f"- Could not parse {repr(unparsed_output)}",
-             )
-         else:
-             unrailed_label = unparsed_output
-             explanation = None
-     else:
-         unrailed_label, explanation = parse_openai_function_call(unparsed_output)
-     return snap_to_rail(unrailed_label, template.rails, verbose=verbose), explanation
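
Expanded diff for phoenix/experimental/evals/functions/__init__.py (4 lines removed):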
@@ -1,4 +0,0 @@
- from .classify import llm_classify, run_evals, run_relevance_eval
- from .generate import llm_generate
-
- __all__ = ["llm_classify", "run_relevance_eval", "llm_generate", "run_evals"]
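
The two expanded diffs above correspond to the removal of the phoenix.experimental.evals package; in 4.x that functionality is distributed separately as the arize-phoenix-evals package (imported as phoenix.evals), so code that depends on these modules should switch distributions rather than pin to 3.25.0. For reference, the sketch below shows how the deleted 3.x evaluator API was typically driven. It is reconstructed only from the deleted code shown above; the OpenAIModel keyword argument and the record values are illustrative assumptions, not taken from the package.

    # Sketch of the removed 3.x evaluator API, based on the deleted evaluators.py above.
    from phoenix.experimental.evals.evaluators import HallucinationEvaluator
    from phoenix.experimental.evals.models import OpenAIModel

    model = OpenAIModel(model="gpt-4")  # constructor kwarg assumed; requires OpenAI credentials
    evaluator = HallucinationEvaluator(model)  # subclass generated from the HALLUCINATION template

    # Per its docstring, the evaluator reads the "input", "reference", and "output" keys.
    label, score, explanation = evaluator.evaluate(
        record={
            "input": "What is Arize Phoenix?",
            "reference": "Phoenix is an open-source LLM tracing and evaluation tool.",
            "output": "Phoenix is a managed relational database service.",
        },
        provide_explanation=True,
    )
    print(label, score, explanation)

The deleted MapReducer and Refiner classes implemented the two long-context strategies described in their docstrings. A minimal map-reduce sketch, again using only names visible in the diff and assuming PromptTemplate takes the template text as its first argument:

    from phoenix.experimental.evals.evaluators import MapReducer
    from phoenix.experimental.evals.models import OpenAIModel
    from phoenix.experimental.evals.templates import PromptTemplate

    mapper = MapReducer(
        model=OpenAIModel(model="gpt-4"),  # kwarg assumed
        # The map template must contain {chunk}; the reduce template must contain {mapped}.
        map_prompt_template=PromptTemplate("Summarize this transcript chunk:\n\n{chunk}"),
        reduce_prompt_template=PromptTemplate(
            "Combine these chunk summaries into a single summary:\n\n{mapped}"
        ),
    )
    summary = mapper.evaluate(["first chunk ...", "second chunk ..."])  # needs two or more chunks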