arize-phoenix 2.5.0__tar.gz → 2.7.0__tar.gz
This diff compares the contents of two publicly released versions of this package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registries.
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/PKG-INFO +1 -1
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/config.py +32 -7
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/core/evals.py +53 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/datasets/fixtures.py +46 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/experimental/evals/evaluators.py +4 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/experimental/evals/functions/classify.py +16 -6
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/experimental/evals/functions/generate.py +6 -3
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/experimental/evals/models/anthropic.py +3 -4
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/experimental/evals/models/base.py +1 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/experimental/evals/models/bedrock.py +4 -2
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/experimental/evals/models/openai.py +2 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/experimental/evals/models/vertex.py +6 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/experimental/evals/templates/default_templates.py +0 -7
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/static/index.js +1 -1
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/session/evaluation.py +16 -10
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/session/session.py +19 -0
- arize_phoenix-2.7.0/src/phoenix/trace/errors.py +5 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/trace/span_evaluations.py +46 -61
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/trace/trace_dataset.py +111 -4
- arize_phoenix-2.7.0/src/phoenix/version.py +1 -0
- arize_phoenix-2.5.0/src/phoenix/version.py +0 -1
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/.gitignore +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/IP_NOTICE +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/LICENSE +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/README.md +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/pyproject.toml +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/__init__.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/core/__init__.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/core/embedding_dimension.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/core/model.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/core/model_schema.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/core/model_schema_adapter.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/core/traces.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/datasets/__init__.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/datasets/dataset.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/datasets/errors.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/datasets/schema.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/datasets/validation.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/datetime_utils.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/exceptions.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/experimental/__init__.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/experimental/evals/__init__.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/experimental/evals/functions/__init__.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/experimental/evals/functions/executor.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/experimental/evals/functions/processing.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/experimental/evals/models/__init__.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/experimental/evals/models/litellm.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/experimental/evals/models/rate_limiters.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/experimental/evals/models/vertexai.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/experimental/evals/retrievals.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/experimental/evals/templates/__init__.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/experimental/evals/templates/template.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/experimental/evals/utils/__init__.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/experimental/evals/utils/threads.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/metrics/README.md +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/metrics/__init__.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/metrics/binning.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/metrics/metrics.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/metrics/mixins.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/metrics/retrieval_metrics.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/metrics/timeseries.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/metrics/wrappers.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/pointcloud/__init__.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/pointcloud/clustering.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/pointcloud/pointcloud.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/pointcloud/projectors.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/pointcloud/umap_parameters.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/py.typed +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/__init__.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/__init__.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/context.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/helpers.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/input_types/ClusterInput.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/input_types/Coordinates.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/input_types/DataQualityMetricInput.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/input_types/DimensionFilter.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/input_types/DimensionInput.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/input_types/Granularity.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/input_types/PerformanceMetricInput.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/input_types/SpanSort.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/input_types/TimeRange.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/input_types/__init__.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/interceptor.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/schema.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/types/Cluster.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/types/DataQualityMetric.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/types/Dataset.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/types/DatasetInfo.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/types/DatasetRole.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/types/DatasetValues.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/types/Dimension.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/types/DimensionDataType.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/types/DimensionShape.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/types/DimensionType.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/types/DimensionWithValue.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/types/DocumentEvaluationSummary.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/types/DocumentRetrievalMetrics.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/types/EmbeddingDimension.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/types/EmbeddingMetadata.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/types/Evaluation.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/types/EvaluationSummary.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/types/Event.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/types/EventMetadata.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/types/ExportEventsMutation.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/types/ExportedFile.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/types/Functionality.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/types/MimeType.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/types/Model.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/types/NumericRange.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/types/PerformanceMetric.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/types/PromptResponse.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/types/Retrieval.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/types/ScalarDriftMetricEnum.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/types/Segments.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/types/SortDir.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/types/Span.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/types/TimeSeries.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/types/UMAPPoints.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/types/ValidationResult.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/types/VectorDriftMetricEnum.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/types/__init__.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/types/node.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/api/types/pagination.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/app.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/evaluation_handler.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/main.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/span_handler.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/static/apple-touch-icon-114x114.png +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/static/apple-touch-icon-120x120.png +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/static/apple-touch-icon-144x144.png +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/static/apple-touch-icon-152x152.png +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/static/apple-touch-icon-180x180.png +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/static/apple-touch-icon-72x72.png +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/static/apple-touch-icon-76x76.png +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/static/apple-touch-icon.png +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/static/favicon.ico +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/static/index.css +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/static/modernizr.js +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/templates/__init__.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/templates/index.html +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/thread_server.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/trace_handler.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/services.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/session/__init__.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/trace/__init__.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/trace/dsl/__init__.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/trace/dsl/filter.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/trace/dsl/helpers.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/trace/dsl/missing.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/trace/dsl/query.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/trace/evaluation_conventions.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/trace/exporter.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/trace/fixtures.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/trace/langchain/__init__.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/trace/langchain/instrumentor.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/trace/langchain/tracer.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/trace/llama_index/__init__.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/trace/llama_index/callback.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/trace/llama_index/debug_callback.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/trace/llama_index/streaming.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/trace/openai/__init__.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/trace/openai/instrumentor.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/trace/otel.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/trace/schemas.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/trace/semantic_conventions.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/trace/span_json_decoder.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/trace/span_json_encoder.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/trace/tracer.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/trace/utils.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/trace/v1/__init__.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/trace/v1/evaluation_pb2.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/trace/v1/evaluation_pb2.pyi +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/utilities/__init__.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/utilities/error_handling.py +0 -0
- {arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/utilities/logging.py +0 -0
{arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/config.py
RENAMED
@@ -12,6 +12,11 @@ ENV_PHOENIX_COLLECTOR_ENDPOINT = "PHOENIX_COLLECTOR_ENDPOINT"
 The endpoint traces and evals are sent to. This must be set if the Phoenix
 server is running on a remote instance.
 """
+ENV_WORKING_DIR = "PHOENIX_WORKING_DIR"
+"""
+The directory in which to save, load, and export datasets. This directory must
+be accessible by both the Phoenix server and the notebook environment.
+"""
 
 
 def _get_temp_path() -> Path:
@@ -36,13 +41,16 @@ def get_running_pid() -> Optional[int]:
     return None
 
 
-
-
-
-
-
-
-
+def get_working_dir() -> Path:
+    """
+    Get the working directory for saving, loading, and exporting datasets.
+    """
+    working_dir_str = os.getenv(ENV_WORKING_DIR)
+    if working_dir_str is not None:
+        return Path(working_dir_str)
+    # Fall back to ~/.phoenix if PHOENIX_WORKING_DIR is not set
+    return Path.home().resolve() / ".phoenix"
+
 
 PHOENIX_DIR = Path(__file__).resolve().parent
 # Server config
@@ -53,6 +61,23 @@ HOST = "0.0.0.0"
 PORT = 6006
 # The prefix of datasets that are auto-assigned a name
 GENERATED_DATASET_NAME_PREFIX = "phoenix_dataset_"
+# The work directory for saving, loading, and exporting datasets
+WORKING_DIR = get_working_dir()
+
+try:
+    for path in (
+        ROOT_DIR := WORKING_DIR,
+        EXPORT_DIR := ROOT_DIR / "exports",
+        DATASET_DIR := ROOT_DIR / "datasets",
+        TRACE_DATASET_DIR := ROOT_DIR / "trace_datasets",
+    ):
+        path.mkdir(parents=True, exist_ok=True)
+except Exception as e:
+    print(
+        f"⚠️ Failed to initialize the working directory at {WORKING_DIR} due to an error: {str(e)}"
+    )
+    print("⚠️ While phoenix will still run, you will not be able to save, load, or export data")
+    print("ℹ️ To change, set the `{ENV_WORKING_DIR}` environment variable before importing phoenix.")
 
 
 def get_exported_files(directory: Path) -> List[Path]:
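The config.py changes make the server's data directory configurable: PHOENIX_WORKING_DIR is read once, the directory tree (exports, datasets, trace_datasets) is created eagerly at import time, and a failure only disables persistence rather than breaking the import. A minimal usage sketch, assuming arize-phoenix 2.7.0 is installed; the path below is an arbitrary example, and the variable must be set before phoenix is imported, as the warning text above notes:

    import os
    from pathlib import Path

    # Must happen before importing phoenix: the directories are created at import time.
    os.environ["PHOENIX_WORKING_DIR"] = str(Path.home() / "phoenix_data")  # example path

    import phoenix.config as config

    print(config.WORKING_DIR)        # .../phoenix_data
    print(config.TRACE_DATASET_DIR)  # .../phoenix_data/trace_datasets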
{arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/core/evals.py
RENAMED
@@ -9,10 +9,12 @@ from typing import DefaultDict, Dict, List, Optional, Set, Tuple
 
 import numpy as np
 from google.protobuf.json_format import MessageToDict
+from pandas import DataFrame, Index, MultiIndex
 from typing_extensions import TypeAlias, assert_never
 
 import phoenix.trace.v1 as pb
 from phoenix.trace.schemas import SpanID, TraceID
+from phoenix.trace.span_evaluations import DocumentEvaluations, Evaluations, SpanEvaluations
 
 logger = logging.getLogger(__name__)
 logger.addHandler(logging.NullHandler())
@@ -171,3 +173,54 @@ class Evals:
         if result.HasField("score") and document_position < num_documents:
             scores[document_position] = result.score.value
         return scores
+
+    def export_evaluations(self) -> List[Evaluations]:
+        evaluations: List[Evaluations] = []
+        evaluations.extend(self._export_span_evaluations())
+        evaluations.extend(self._export_document_evaluations())
+        return evaluations
+
+    def _export_span_evaluations(self) -> List[SpanEvaluations]:
+        span_evaluations = []
+        with self._lock:
+            span_evaluations_by_name = tuple(self._span_evaluations_by_name.items())
+        for eval_name, _span_evaluations_by_id in span_evaluations_by_name:
+            span_ids = []
+            rows = []
+            with self._lock:
+                span_evaluations_by_id = tuple(_span_evaluations_by_id.items())
+            for span_id, pb_eval in span_evaluations_by_id:
+                span_ids.append(span_id)
+                rows.append(MessageToDict(pb_eval.result))
+            dataframe = DataFrame(rows, index=Index(span_ids, name="context.span_id"))
+            span_evaluations.append(SpanEvaluations(eval_name, dataframe))
+        return span_evaluations
+
+    def _export_document_evaluations(self) -> List[DocumentEvaluations]:
+        evaluations = []
+        with self._lock:
+            document_evaluations_by_name = tuple(self._document_evaluations_by_name.items())
+        for eval_name, _document_evaluations_by_id in document_evaluations_by_name:
+            span_ids = []
+            document_positions = []
+            rows = []
+            with self._lock:
+                document_evaluations_by_id = tuple(_document_evaluations_by_id.items())
+            for span_id, _document_evaluations_by_position in document_evaluations_by_id:
+                with self._lock:
+                    document_evaluations_by_position = sorted(
+                        _document_evaluations_by_position.items()
+                    )  # ensure the evals are sorted by document position
+                for document_position, pb_eval in document_evaluations_by_position:
+                    span_ids.append(span_id)
+                    document_positions.append(document_position)
+                    rows.append(MessageToDict(pb_eval.result))
+            dataframe = DataFrame(
+                rows,
+                index=MultiIndex.from_arrays(
+                    (span_ids, document_positions),
+                    names=("context.span_id", "document_position"),
+                ),
+            )
+            evaluations.append(DocumentEvaluations(eval_name, dataframe))
+        return evaluations
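export_evaluations inverts the ingest path: the protobuf results held in the Evals store are flattened back into pandas dataframes, keyed by span ID for SpanEvaluations and by (span ID, document position) for DocumentEvaluations. A sketch of the resulting index shapes, with made-up values:

    import pandas as pd

    # SpanEvaluations dataframe: one row per span.
    span_evals = pd.DataFrame(
        {"label": ["factual", "hallucinated"], "score": [1.0, 0.0]},
        index=pd.Index(["span-1", "span-2"], name="context.span_id"),
    )

    # DocumentEvaluations dataframe: one row per retrieved document.
    doc_evals = pd.DataFrame(
        {"score": [0.9, 0.2]},
        index=pd.MultiIndex.from_arrays(
            (["span-1", "span-1"], [0, 1]),
            names=("context.span_id", "document_position"),
        ),
    )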
{arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/datasets/fixtures.py
RENAMED
@@ -240,6 +240,51 @@ click_through_rate_fixture = Fixture(
     reference_file_name="click_through_rate_train.parquet",
 )
 
+chatbot_queries_schema = Schema(
+    prediction_id_column_name="id",
+    prompt_column_names=RetrievalEmbeddingColumnNames(
+        vector_column_name="prompt",
+        raw_data_column_name="prompt_text",
+        context_retrieval_ids_column_name="document_ids",
+        context_retrieval_scores_column_name="document_scores",
+    ),
+    response_column_names="response",
+    tag_column_names=[
+        "answer_relevancy",
+        "context_relevancy",
+        "faithfulness",
+        "document_similarity_0",
+        "document_similarity_1",
+        "openai_relevance_0",
+        "openai_relevance_1",
+        "user_feedback",
+    ],
+)
+
+chatbot_database_schema = Schema(
+    prediction_id_column_name="document_id",
+    prompt_column_names=EmbeddingColumnNames(
+        vector_column_name="text_vector",
+        raw_data_column_name="text",
+    ),
+)
+
+chatbot_fixture = Fixture(
+    name="chatbot",
+    description="""
+Investigate RAG performance for a chatbot built on top of Arize's documentation.
+This use-case highlights how embedding visualizations for a RAG application can
+highlight issues with the application's retrieval and performance.
+
+The data contains relevance metrics generated by LLM Evals as well as RAGAS.
+""",
+    primary_schema=chatbot_queries_schema,
+    corpus_schema=chatbot_database_schema,
+    prefix="unstructured/llm/chatbot",
+    primary_file_name="chatbot_queries_with_ragas.parquet",
+    corpus_file_name="chatbot_database_ds.parquet",
+)
+
 wide_data_primary_schema = Schema(
     actual_label_column_name="actual_label",
     prediction_label_column_name="predicted_label",
@@ -363,6 +408,7 @@ FIXTURES: Tuple[Fixture, ...] = (
     deep_data_fixture,
     llm_summarization_fixture,
     wikipedia_fixture,
+    chatbot_fixture,
 )
 NAME_TO_FIXTURE = {fixture.name: fixture for fixture in FIXTURES}
 
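The chatbot fixture is the first in this module to pair a primary (queries) dataset with a corpus (document database) dataset. A hedged sketch of loading it; `load_example` and the attribute names on its return value are assumed from phoenix.datasets.fixtures and may differ by version:

    import phoenix as px
    from phoenix.datasets.fixtures import load_example  # assumed entry point

    datasets = load_example("chatbot")  # downloads the parquet files listed above
    px.launch_app(primary=datasets.primary, corpus=datasets.corpus)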
{arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/experimental/evals/functions/classify.py
RENAMED
@@ -73,7 +73,7 @@ def llm_classify(
     include_prompt: bool = False,
     include_response: bool = False,
     run_sync: bool = False,
-    concurrency: int =
+    concurrency: Optional[int] = None,
 ) -> pd.DataFrame:
     """Classifies each input row of the dataframe using an LLM. Returns a pandas.DataFrame
     where the first column is named `label` and contains the classification labels. An optional
@@ -116,8 +116,9 @@ def llm_classify(
         run_sync (bool, default=False): If True, forces synchronous request submission. Otherwise
             evaluations will be run asynchronously if possible.
 
-        concurrency (int, default=
-            possible.
+        concurrency (Optional[int], default=None): The number of concurrent evals if async
+            submission is possible. If not provided, a recommended default concurrency is set on a
+            per-model basis.
 
     Returns:
         pandas.DataFrame: A dataframe where the `label` column (at column position 0) contains
@@ -127,6 +128,7 @@ def llm_classify(
         from the entries in the rails argument or "NOT_PARSABLE" if the model's output could
         not be parsed.
     """
+    concurrency = concurrency or model.default_concurrency
     # clients need to be reloaded to ensure that async evals work properly
     model.reload_client()
 
@@ -353,7 +355,7 @@ def run_evals(
     provide_explanation: bool = False,
     use_function_calling_if_available: bool = True,
     verbose: bool = False,
-    concurrency: int =
+    concurrency: Optional[int] = None,
 ) -> List[DataFrame]:
     """
     Applies a list of evaluators to a dataframe. Outputs a list of dataframes in
@@ -381,13 +383,21 @@ def run_evals(
         as model invocation parameters and details about retries and snapping to
         rails.
 
-        concurrency (int,
-            submission is possible.
+        concurrency (Optional[int], default=None): The number of concurrent evals if async
+            submission is possible. If not provided, a recommended default concurrency is set on a
+            per-model basis.
 
     Returns:
         List[DataFrame]: A list of dataframes, one for each evaluator, all of
         which have the same number of rows as the input dataframe.
     """
+    # use the minimum default concurrency of all the models
+    if concurrency is None:
+        if len(evaluators) == 0:
+            concurrency = 1
+        else:
+            concurrency = min(evaluator.default_concurrency for evaluator in evaluators)
+
     # clients need to be reloaded to ensure that async evals work properly
     for evaluator in evaluators:
         evaluator.reload_client()
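With concurrency defaulting to None, llm_classify now falls back to model.default_concurrency, and run_evals takes the minimum default across its evaluators' models, so the most conservative backend (for example the Gemini cap of 5 added in vertex.py below) wins. A sketch of a call that relies on the new fallback; the `model_name` keyword is assumed for the 2.x OpenAIModel constructor:

    import pandas as pd
    from phoenix.experimental.evals import (
        HALLUCINATION_PROMPT_RAILS_MAP,
        HALLUCINATION_PROMPT_TEMPLATE,
        OpenAIModel,
        llm_classify,
    )

    df = pd.DataFrame(
        {
            "input": ["What is Phoenix?"],
            "reference": ["Phoenix is an open-source ML observability library."],
            "output": ["Phoenix is an observability library."],
        }
    )

    labels = llm_classify(
        dataframe=df,
        model=OpenAIModel(model_name="gpt-4"),  # model_name assumed for the 2.x API
        template=HALLUCINATION_PROMPT_TEMPLATE,
        rails=list(HALLUCINATION_PROMPT_RAILS_MAP.values()),
        # concurrency omitted: resolves to model.default_concurrency as of 2.7.0
    )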
{arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/experimental/evals/functions/generate.py
RENAMED
@@ -31,7 +31,7 @@ def llm_generate(
     include_prompt: bool = False,
     include_response: bool = False,
     run_sync: bool = False,
-    concurrency: int =
+    concurrency: Optional[int] = None,
 ) -> pd.DataFrame:
     """
     Generates a text using a template using an LLM. This function is useful
@@ -70,14 +70,17 @@ def llm_generate(
     run_sync (bool, default=False): If True, forces synchronous request submission. Otherwise
         evaluations will be run asynchronously if possible.
 
-    concurrency (int, default=
-        possible.
+    concurrency (Optional[int], default=None): The number of concurrent evals if async
+        submission is possible. If not provided, a recommended default concurrency is set on a
+        per-model basis.
 
     Returns:
         generations_dataframe (pandas.DataFrame): A dataframe where each row
         represents the generated output
 
     """
+    concurrency = concurrency or model.default_concurrency
+
     # clients need to be reloaded to ensure that async evals work properly
     model.reload_client()
 
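llm_generate gets the same treatment, resolving a missing concurrency to the model's default before reloading the client; note that the `or` fallback also treats an explicit concurrency=0 as unset. A minimal sketch under the same `model_name` assumption as above:

    import pandas as pd
    from phoenix.experimental.evals import OpenAIModel, llm_generate

    df = pd.DataFrame({"topic": ["tracing", "evals"]})
    generations = llm_generate(
        dataframe=df,
        template="Write one sentence about {topic}.",
        model=OpenAIModel(model_name="gpt-3.5-turbo"),  # model_name assumed
        # concurrency omitted: resolves to model.default_concurrency as of 2.7.0
    )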
{arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/experimental/evals/models/anthropic.py
RENAMED
@@ -1,4 +1,3 @@
-import logging
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, Dict, List, Optional
 
@@ -8,8 +7,6 @@ from phoenix.experimental.evals.models.rate_limiters import RateLimiter
 if TYPE_CHECKING:
     from tiktoken import Encoding
 
-logger = logging.getLogger(__name__)
-
 MODEL_TOKEN_LIMIT_MAPPING = {
     "claude-2.1": 200000,
     "claude-2.0": 100000,
@@ -80,7 +77,6 @@ class AnthropicModel(BaseEvalModel):
         try:
             encoding = self._tiktoken.encoding_for_model(self.model)
         except KeyError:
-            logger.warning("Warning: model not found. Using cl100k_base encoding.")
             encoding = self._tiktoken.get_encoding("cl100k_base")
         self._tiktoken_encoding = encoding
 
@@ -149,6 +145,9 @@ class AnthropicModel(BaseEvalModel):
         return _completion_with_retry(**kwargs)
 
     async def _async_generate(self, prompt: str, **kwargs: Dict[str, Any]) -> str:
+        # instruction is an invalid input to Anthropic models, it is passed in by
+        # BaseEvalModel.__call__ and needs to be removed
+        kwargs.pop("instruction", None)
         invocation_parameters = self.invocation_parameters()
         invocation_parameters.update(kwargs)
         response = await self._async_generate_with_retry(
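The async path previously forwarded every kwarg to the Anthropic SDK, including the `instruction` key injected by BaseEvalModel.__call__, which the API rejects. The fix strips it before merging kwargs into the invocation parameters. A generic sketch of the pattern; the helper name is illustrative, not the package's:

    from typing import Any, Dict

    def drop_unsupported_kwargs(
        kwargs: Dict[str, Any], unsupported: tuple = ("instruction",)
    ) -> Dict[str, Any]:
        """Remove keys a backend SDK does not accept before forwarding kwargs."""
        for key in unsupported:
            kwargs.pop(key, None)  # no-op when the key is absent
        return kwargs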
{arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/experimental/evals/models/bedrock.py
RENAMED
@@ -87,7 +87,6 @@ class BedrockModel(BaseEvalModel):
         try:
             encoding = self._tiktoken.encoding_for_model(self.model_id)
         except KeyError:
-            logger.warning("Warning: model not found. Using cl100k_base encoding.")
             encoding = self._tiktoken.get_encoding("cl100k_base")
         self._tiktoken_encoding = encoding
 
@@ -165,7 +164,7 @@ class BedrockModel(BaseEvalModel):
                     "temperature": self.temperature,
                     "topP": self.top_p,
                     "maxTokens": self.max_tokens,
-                    "stopSequences":
+                    "stopSequences": self.stop_sequences,
                 },
                 **self.extra_parameters,
             }
@@ -204,6 +203,9 @@ class BedrockModel(BaseEvalModel):
         elif self.model_id.startswith("anthropic"):
             body = json.loads(response.get("body").read().decode())
             return body.get("completion")
+        elif self.model_id.startswith("amazon"):
+            body = json.loads(response.get("body").read())
+            return body.get("results")[0].get("outputText")
         else:
             body = json.loads(response.get("body").read())
             return body.get("results")[0].get("data").get("outputText")
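Bedrock multiplexes several providers behind one invoke API, so response parsing branches on the model-id prefix; the new `amazon` branch reads Titan-style output from results[0].outputText. A sketch of that parsing against a hand-built payload; the JSON shape is inferred from the diff above, not quoted from AWS documentation:

    import io
    import json

    # Simulate the streaming `body` a Bedrock invoke_model response carries.
    response = {"body": io.BytesIO(json.dumps({"results": [{"outputText": "hello"}]}).encode())}

    body = json.loads(response.get("body").read())
    print(body.get("results")[0].get("outputText"))  # hello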
{arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/experimental/evals/models/openai.py
RENAMED
@@ -31,6 +31,8 @@ MODEL_TOKEN_LIMIT_MAPPING = {
     "gpt-4-0613": 8192,  # Current gpt-4 default
     "gpt-4-32k-0314": 32768,
     "gpt-4-32k-0613": 32768,
+    "gpt-4-1106-preview": 128000,
+    "gpt-4-vision-preview": 128000,
 }
 LEGACY_COMPLETION_API_MODELS = ("gpt-3.5-turbo-instruct",)
 logger = logging.getLogger(__name__)
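The two new entries register the 128K context windows of the GPT-4 Turbo preview models, so token accounting uses the correct limit for them. A hedged usage sketch; the `model_name` keyword is assumed for the 2.x constructor:

    from phoenix.experimental.evals import OpenAIModel

    model = OpenAIModel(model_name="gpt-4-1106-preview")  # model_name assumed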
{arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/experimental/evals/models/vertex.py
RENAMED
@@ -21,6 +21,9 @@ MODEL_TOKEN_LIMIT_MAPPING = {
 
 @dataclass
 class GeminiModel(BaseEvalModel):
+    # The vertex SDK runs into connection pool limits at high concurrency
+    default_concurrency: int = 5
+
     model: str = "gemini-pro"
     """The model name to use."""
     temperature: float = 0.0
@@ -50,6 +53,9 @@ class GeminiModel(BaseEvalModel):
             max_retries=self.max_retries,
         )
 
+    def reload_client(self) -> None:
+        self._init_client()
+
     def _init_client(self) -> None:
         try:
             from google.api_core import exceptions  # type:ignore
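GeminiModel both opts into a lower default concurrency (the field the classify.py and generate.py changes consult) and gains a reload_client override so async eval runs can reinitialize the Vertex client like the other backends do. A sketch, assuming GeminiModel is exported from phoenix.experimental.evals:

    from phoenix.experimental.evals import GeminiModel  # export path assumed

    model = GeminiModel()  # defaults to gemini-pro
    print(model.default_concurrency)  # 5: eval runs will not submit more than this concurrently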
{arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/experimental/evals/templates/default_templates.py
RENAMED
@@ -73,13 +73,6 @@ your response.
 [END DATA]
 
 Is the answer above factual or hallucinated based on the query and reference text?
-
-Your response should be a single word: either "factual" or "hallucinated", and
-it should not include any other text or characters. "hallucinated" indicates that the answer
-provides factually inaccurate information to the query based on the reference text. "factual"
-indicates that the answer to the question is correct relative to the reference text, and does not
-contain made up information. Please read the query and reference text carefully before determining
-your response.
 """
 HALLUCINATION_PROMPT_TEMPLATE_WITH_EXPLANATION = """
 In this task, you will be presented with a query, a reference text and an answer. The answer is
{arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/server/static/index.js
RENAMED
@@ -6717,7 +6717,7 @@ fragment SpanEvaluationsTable_evals on Span {
     gap: var(--ac-global-dimension-static-size-200);
 `,children:i.map((o,l)=>x("li",{children:_(ft,{padding:"size-200",backgroundColor:"purple-100",borderColor:"purple-700",borderWidth:"thin",borderRadius:"medium",children:[x(Me,{color:"text-700",fontStyle:"italic",children:"embedded text"}),x("pre",{css:ee`
     margin: var(--ac-global-dimension-static-size-100) 0;
-`,children:o[mtt]})]})},l))})}):null})}function Xxn(t){let{spanAttributes:e}=t,n=(0,br.useMemo)(()=>{let l=e[wr.tool];return typeof l=="object"?l:{}},[e]);if(!(Object.keys(n).length>0))return null;let r=n[vB.name],a=n[vB.description],o=n[vB.parameters];return x(Be,{direction:"column",gap:"size-200",children:x(uu,{title:"Tool"+(typeof r=="string"?`: ${r}`:""),...eg,children:_(Be,{direction:"column",children:[a!=null?x(ft,{paddingStart:"size-200",paddingEnd:"size-200",paddingTop:"size-100",paddingBottom:"size-100",borderBottomColor:"dark",borderBottomWidth:"thin",backgroundColor:"light",children:_(Be,{direction:"column",alignItems:"start",gap:"size-50",children:[x(Me,{color:"text-700",fontStyle:"italic",children:"Description"}),x(Me,{children:a})]})}):null,o!=null?x(ft,{paddingStart:"size-200",paddingEnd:"size-200",paddingTop:"size-100",paddingBottom:"size-100",borderBottomColor:"dark",borderBottomWidth:"thin",children:_(Be,{direction:"column",alignItems:"start",width:"100%",children:[x(Me,{color:"text-700",fontStyle:"italic",children:"Parameters"}),x(Tc,{value:JSON.stringify(o),mimeType:"json"})]})}):null]})})})}var Sxn=["irrelevant"];function Gse({document:t,documentEvaluations:e,backgroundColor:n,borderColor:i,labelColor:r}){let a=t[htt],o=e&&e.length;return x(ft,{borderRadius:"medium",backgroundColor:n,borderColor:i,borderWidth:"thin",children:_(Be,{direction:"column",children:[x(ft,{width:"100%",borderBottomWidth:"thin",borderBottomColor:i,children:_(Be,{direction:"row",justifyContent:"space-between",margin:"size-200",alignItems:"center",children:[_(Be,{direction:"row",gap:"size-50",alignItems:"center",children:[x(pt,{svg:x(Et.FileOutline,{})}),_(Nn,{level:4,children:["document ",t[Itt]]})]}),typeof t[Wse]=="number"&&x(Zs,{color:r,children:`score ${mh(t[Wse])}`})]})}),x("pre",{css:ee`
+`,children:o[mtt]})]})},l))})}):null})}function Xxn(t){let{spanAttributes:e}=t,n=(0,br.useMemo)(()=>{let l=e[wr.tool];return typeof l=="object"?l:{}},[e]);if(!(Object.keys(n).length>0))return null;let r=n[vB.name],a=n[vB.description],o=n[vB.parameters];return x(Be,{direction:"column",gap:"size-200",children:x(uu,{title:"Tool"+(typeof r=="string"?`: ${r}`:""),...eg,children:_(Be,{direction:"column",children:[a!=null?x(ft,{paddingStart:"size-200",paddingEnd:"size-200",paddingTop:"size-100",paddingBottom:"size-100",borderBottomColor:"dark",borderBottomWidth:"thin",backgroundColor:"light",children:_(Be,{direction:"column",alignItems:"start",gap:"size-50",children:[x(Me,{color:"text-700",fontStyle:"italic",children:"Description"}),x(Me,{children:a})]})}):null,o!=null?x(ft,{paddingStart:"size-200",paddingEnd:"size-200",paddingTop:"size-100",paddingBottom:"size-100",borderBottomColor:"dark",borderBottomWidth:"thin",children:_(Be,{direction:"column",alignItems:"start",width:"100%",children:[x(Me,{color:"text-700",fontStyle:"italic",children:"Parameters"}),x(Tc,{value:JSON.stringify(o),mimeType:"json"})]})}):null]})})})}var Sxn=["irrelevant","unrelated"];function Gse({document:t,documentEvaluations:e,backgroundColor:n,borderColor:i,labelColor:r}){let a=t[htt],o=e&&e.length;return x(ft,{borderRadius:"medium",backgroundColor:n,borderColor:i,borderWidth:"thin",children:_(Be,{direction:"column",children:[x(ft,{width:"100%",borderBottomWidth:"thin",borderBottomColor:i,children:_(Be,{direction:"row",justifyContent:"space-between",margin:"size-200",alignItems:"center",children:[_(Be,{direction:"row",gap:"size-50",alignItems:"center",children:[x(pt,{svg:x(Et.FileOutline,{})}),_(Nn,{level:4,children:["document ",t[Itt]]})]}),typeof t[Wse]=="number"&&x(Zs,{color:r,children:`score ${mh(t[Wse])}`})]})}),x("pre",{css:ee`
     padding: var(--ac-global-dimension-static-size-200);
     white-space: normal;
     margin: 0;
{arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/session/evaluation.py
RENAMED
@@ -9,6 +9,7 @@ import math
 from time import sleep
 from typing import (
     Any,
+    Iterator,
     Optional,
     Sequence,
     Tuple,
@@ -33,24 +34,29 @@ __all__ = [
 from phoenix.trace.span_evaluations import Evaluations
 
 
-def
-
-evaluations
-
-
-    index_names = evaluations.index.names
-    for index, row in evaluations.iterrows():
+def encode_evaluations(evaluations: Evaluations) -> Iterator[pb.Evaluation]:
+    dataframe = evaluations.dataframe
+    eval_name = evaluations.eval_name
+    index_names = dataframe.index.names
+    for index, row in dataframe.iterrows():
         subject_id = _extract_subject_id_from_index(
             index_names,
             cast(Union[str, Tuple[Any]], index),
         )
         if (result := _extract_result(row)) is None:
             continue
-
-            name=
+        yield pb.Evaluation(
+            name=eval_name,
             result=result,
             subject_id=subject_id,
         )
+
+
+def add_evaluations(
+    exporter: HttpExporter,
+    evaluations: Evaluations,
+) -> None:
+    for evaluation in encode_evaluations(evaluations):
         exporter.export(evaluation)
 
 
@@ -130,7 +136,7 @@ def log_evaluations(
         return
     exporter = HttpExporter(endpoint=endpoint, host=host, port=port)
     for eval in filter(bool, evals):
-        add_evaluations(exporter, eval
+        add_evaluations(exporter, eval)
     with tqdm(total=n, desc="Sending Evaluations") as pbar:
         while n:
             sleep(0.1)
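The refactor splits the old helper in two: encode_evaluations is now a reusable generator that turns an Evaluations object into pb.Evaluation messages, taking the eval name from the object itself rather than a separate argument, and add_evaluations simply streams those messages into an exporter. A sketch of the generator on its own, with a hand-built SpanEvaluations (positional constructor arguments as used in core/evals.py above):

    import pandas as pd
    from phoenix.session.evaluation import encode_evaluations
    from phoenix.trace.span_evaluations import SpanEvaluations

    evals = SpanEvaluations(
        "hallucination",
        pd.DataFrame(
            {"label": ["factual"], "score": [1.0]},
            index=pd.Index(["span-1"], name="context.span_id"),
        ),
    )
    for pb_evaluation in encode_evaluations(evals):
        print(pb_evaluation.name)  # hallucination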
{arize_phoenix-2.5.0 → arize_phoenix-2.7.0}/src/phoenix/session/session.py
RENAMED
@@ -30,6 +30,7 @@ from phoenix.pointcloud.umap_parameters import get_umap_parameters
 from phoenix.server.app import create_app
 from phoenix.server.thread_server import ThreadServer
 from phoenix.services import AppService
+from phoenix.session.evaluation import encode_evaluations
 from phoenix.trace.dsl import SpanFilter
 from phoenix.trace.dsl.query import SpanQuery
 from phoenix.trace.otel import encode
@@ -46,6 +47,8 @@ logger = logging.getLogger(__name__)
 # type workaround
 # https://github.com/python/mypy/issues/5264#issuecomment-399407428
 if TYPE_CHECKING:
+    from phoenix.trace import Evaluations
+
     _BaseList = UserList[pd.DataFrame]
 else:
     _BaseList = UserList
@@ -123,6 +126,10 @@ class Session(ABC):
                 self.traces.put(encode(span))
 
         self.evals: Evals = Evals()
        if trace_dataset:
            for evaluations in trace_dataset.evaluations:
                for pb_evaluation in encode_evaluations(evaluations):
                    self.evals.put(pb_evaluation)
 
         self.host = host or get_env_host()
         self.port = port or get_env_port()
@@ -213,6 +220,15 @@ class Session(ABC):
             return None
         return pd.json_normalize(data, max_level=1).set_index("context.span_id", drop=False)
 
+    def get_evaluations(self) -> List["Evaluations"]:
+        return self.evals.export_evaluations()
+
+    def get_trace_dataset(self) -> Optional[TraceDataset]:
+        if (dataframe := self.get_spans_dataframe()) is None:
+            return None
+        evaluations = self.get_evaluations()
+        return TraceDataset(dataframe=dataframe, evaluations=evaluations)
+
 
 _session: Optional[Session] = None
 
@@ -479,6 +495,9 @@ def _get_url(host: str, port: int, notebook_env: NotebookEnvironment) -> str:
     if notebook_env == NotebookEnvironment.DATABRICKS:
         context = _get_databricks_context()
         return f"{_get_databricks_notebook_base_url(context)}/{port}/"
+    if host == "0.0.0.0" or host == "127.0.0.1":
+        # The app is running locally, so use localhost
+        return f"http://localhost:{port}/"
     return f"http://{host}:{port}/"
 
 
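Together with Evals.export_evaluations above, these Session additions close the loop: evaluations attached to a TraceDataset are replayed into the eval store at launch, and get_trace_dataset pulls spans and evals back out as a single object, presumably so it can be persisted via the TraceDataset save/load support added in trace_dataset.py. A sketch of the round trip; the eval_name attribute is assumed from the span_evaluations refactor:

    import phoenix as px

    session = px.launch_app()
    # ... send traces and log evaluations to the session ...

    tds = session.get_trace_dataset()  # None until at least one span is received
    if tds is not None:
        print(len(tds.dataframe), "spans")
        print([evals.eval_name for evals in tds.evaluations])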