arize-phoenix 0.0.48__py3-none-any.whl → 0.0.50rc0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

arize_phoenix-{0.0.48 → 0.0.50rc0}.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: arize-phoenix
-Version: 0.0.48
+Version: 0.0.50rc0
 Summary: ML Observability in your notebook
 Project-URL: Documentation, https://docs.arize.com/phoenix/
 Project-URL: Issues, https://github.com/Arize-ai/phoenix/issues
@@ -196,6 +196,7 @@ Launch Phoenix in a notebook and view the traces of your LangChain application i
 ```python
 import phoenix as px
 import pandas as pd
+import numpy as np
 
 # Launch phoenix
 session = px.launch_app()
@@ -219,7 +220,7 @@ documents_df = pd.read_parquet(
     "http://storage.googleapis.com/arize-assets/phoenix/datasets/unstructured/llm/context-retrieval/langchain-pinecone/database.parquet"
 )
 knn_retriever = KNNRetriever(
-    index=np.stack(df["text_vector"]),
+    index=np.stack(documents_df["text_vector"]),
     texts=documents_df["text"].tolist(),
     embeddings=OpenAIEmbeddings(),
 )
@@ -270,7 +271,7 @@ from phoenix.experimental.evals import (
     RAG_RELEVANCY_PROMPT_RAILS_MAP,
     OpenAIModel,
     download_benchmark_dataset,
-    llm_eval_binary,
+    llm_classify,
 )
 from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, ConfusionMatrixDisplay
 
@@ -291,7 +292,7 @@ model = OpenAIModel(
     temperature=0.0,
 )
 rails =list(RAG_RELEVANCY_PROMPT_RAILS_MAP.values())
-df["eval_relevance"] = llm_eval_binary(df, model, RAG_RELEVANCY_PROMPT_TEMPLATE_STR, rails)
+df["eval_relevance"] = llm_classify(df, model, RAG_RELEVANCY_PROMPT_TEMPLATE_STR, rails)
 #Golden dataset has True/False map to -> "irrelevant" / "relevant"
 #we can then scikit compare to output of template - same format
 y_true = df["relevant"].map({True: "relevant", False: "irrelevant"})
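Note on the README change above: `llm_eval_binary` has been renamed to `llm_classify`; the old name stays importable as a deprecated alias (see the shim in classify.py further down). A minimal migration sketch, assuming a dataframe whose columns match the template's variables — the column names, values, and the `RAG_RELEVANCY_PROMPT_TEMPLATE_STR` import path are illustrative assumptions, not taken from the package:

```python
import pandas as pd

from phoenix.experimental.evals import (
    RAG_RELEVANCY_PROMPT_RAILS_MAP,
    RAG_RELEVANCY_PROMPT_TEMPLATE_STR,  # assumed to be exported alongside the rails map
    OpenAIModel,
    llm_classify,
)

# Illustrative dataframe; column names are assumed to match the template variables.
df = pd.DataFrame(
    {
        "query": ["What is Phoenix?", "Who won the 1998 World Cup?"],
        "reference": ["Phoenix is an ML observability library.", "Phoenix runs in a notebook."],
    }
)

model = OpenAIModel(temperature=0.0)
rails = list(RAG_RELEVANCY_PROMPT_RAILS_MAP.values())

# Before: df["eval_relevance"] = llm_eval_binary(df, model, RAG_RELEVANCY_PROMPT_TEMPLATE_STR, rails)
# After: same signature, new name.
df["eval_relevance"] = llm_classify(df, model, RAG_RELEVANCY_PROMPT_TEMPLATE_STR, rails)
```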
arize_phoenix-{0.0.48 → 0.0.50rc0}.dist-info/RECORD RENAMED
@@ -1,4 +1,4 @@
-phoenix/__init__.py,sha256=XiFQGe2_k_n1IFl8jBHCUKHEU17stN-vsgIK_epR2s8,1255
+phoenix/__init__.py,sha256=culPUmrte05JPLfFqXv4_jGHDnkRB6AEuvZYZTqLpFQ,1257
 phoenix/config.py,sha256=TdMKmU7V490I38x_hvB1s14Y8pV3ldLSpJTKq6crzBY,1952
 phoenix/datetime_utils.py,sha256=D955QLrkgrrSdUM6NyqbCeAu2SMsjhR5rHVQEsVUdng,2773
 phoenix/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
@@ -16,17 +16,17 @@ phoenix/datasets/fixtures.py,sha256=0_PacL3dw49zulKpFpPdhvxJxeGmHTguqIyf2VXkBkk,
 phoenix/datasets/schema.py,sha256=bF1d2Md6NyqQZuC4Ym5A52f2_IcazkyxGFZ11HPqSg0,6668
 phoenix/datasets/validation.py,sha256=dZ9lCFUV0EY7HCkQkQBrs-GLAEIZdpOqUxwD5l4dp88,8294
 phoenix/experimental/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-phoenix/experimental/evals/__init__.py,sha256=YvB_OMcKbDCh_qAwWGa8HhkVoT-reYS4dRNpqC1pmPU,1128
+phoenix/experimental/evals/__init__.py,sha256=IqCg4owQosfk_QchEsUdx6lkHPbaPXlL2ce8juQSge8,1162
 phoenix/experimental/evals/retrievals.py,sha256=Y3YupYrrzt_orTMEFFW3eDBrHcMnBsqTqEQu7BWAUlk,3828
 phoenix/experimental/evals/utils.py,sha256=ivrYuX5Xotjh12BWOpYk9O7TgOt8uGDfdnRpYfrybmQ,1102
-phoenix/experimental/evals/functions/__init__.py,sha256=v12PUUlxc6JhD87yuc4mMmSK5-dTZIAxnCP_pbf2e4k,160
-phoenix/experimental/evals/functions/binary.py,sha256=15kNsQ-9PkRYhutQn8NXKYR_3p355IPX7NenpeGR49E,11850
+phoenix/experimental/evals/functions/__init__.py,sha256=gHt8xJklzefPFGOcgiselXrjqfJAQ0HL8xanxP3zD-A,192
+phoenix/experimental/evals/functions/classify.py,sha256=svn1yLmwdhnuzaZEA39b9s-GcDl4DgYTyWcbmSl-U9M,11924
 phoenix/experimental/evals/functions/generate.py,sha256=pxoL-D4sCvwVKAD-5nSs1qauDbAqYkJ1t9RCj7SanWA,2053
 phoenix/experimental/evals/functions/processing.py,sha256=F4xtLsulLV4a8CkuLldRddsCim75dSTIShEJUYN6I6w,1823
 phoenix/experimental/evals/models/__init__.py,sha256=Ek98LMKD8hzy4baHjZ0hy_JSxIJm21aLdH6BdQR1OW4,193
 phoenix/experimental/evals/models/base.py,sha256=A6T9F5ucr0WXKGwO6y1hfIyy1_ArTObbimEJR3nBZR8,6978
 phoenix/experimental/evals/models/bedrock.py,sha256=xppB9YaehlapGeyQqWAUEMJUWd7Z18g9MxzL7OEAP0M,7322
-phoenix/experimental/evals/models/openai.py,sha256=TpfnUrL6WBbwvG4re0JoEvz5lIr2wZ5j9uKmsvuEnP8,11357
+phoenix/experimental/evals/models/openai.py,sha256=_d_i0g3zLhn1y80tkUXjGWaNEN-kH2hvdrBBlhhBVGM,11358
 phoenix/experimental/evals/models/vertexai.py,sha256=K6yDGWIkavSoIoXuGc6czp-arz0eh42cWGiRmuvrGcs,5443
 phoenix/experimental/evals/templates/__init__.py,sha256=Tf1gzN-dkgv-szgU08SIj7oZrX-r7VjQ3dcXqoN0Gec,831
 phoenix/experimental/evals/templates/default_templates.py,sha256=0X_NoQZC-dqPeDfhoqo_7-stCfnxFmdOizCSGsNlAlA,6160
@@ -112,7 +112,7 @@ phoenix/server/static/apple-touch-icon-76x76.png,sha256=CT_xT12I0u2i0WU8JzBZBuOQ
 phoenix/server/static/apple-touch-icon.png,sha256=fOfpjqGpWYbJ0eAurKsyoZP1EAs6ZVooBJ_SGk2ZkDs,3801
 phoenix/server/static/favicon.ico,sha256=bY0vvCKRftemZfPShwZtE93DiiQdaYaozkPGwNFr6H8,34494
 phoenix/server/static/index.css,sha256=KKGpx4iwF91VGRm0YN-4cn8oC-oIqC6HecoPf0x3ZM8,1885
-phoenix/server/static/index.js,sha256=tB4m-Zx7moP-d68Md3NIXPs4ltr1vh9WvrQleHmU3Bc,3145906
+phoenix/server/static/index.js,sha256=gzxpSo53BVLk0omGaRucmCkfWvA8n5HcyspNTHgfk_g,3146701
 phoenix/server/static/modernizr.js,sha256=mvK-XtkNqjOral-QvzoqsyOMECXIMu5BQwSVN_wcU9c,2564
 phoenix/server/templates/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 phoenix/server/templates/index.html,sha256=TrupcsIB_TfFhnaG0fDQLfxTpuWc2zQo3RY1xx3k1Fg,1335
@@ -143,8 +143,8 @@ phoenix/trace/v1/trace_pb2.pyi,sha256=2JpgiYz3s8HrxnVIi5Brk7c3RJB4LqDGzwRYonhliR
 phoenix/utilities/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 phoenix/utilities/error_handling.py,sha256=7b5rpGFj9EWZ8yrZK1IHvxB89suWk3lggDayUQcvZds,1946
 phoenix/utilities/logging.py,sha256=D5-NAPYDEc7VD2babENVgKr9AeXUjl7ofDGiLNrWXyw,189
-arize_phoenix-0.0.48.dist-info/METADATA,sha256=_Q6SEbugYODjJm_kb8Id3ZJFs1I6_YFQsKqF5X5S7DM,25556
-arize_phoenix-0.0.48.dist-info/WHEEL,sha256=9QBuHhg6FNW7lppboF2vKVbCGTVzsFykgRQjjlajrhA,87
-arize_phoenix-0.0.48.dist-info/licenses/IP_NOTICE,sha256=JBqyyCYYxGDfzQ0TtsQgjts41IJoa-hiwDrBjCb9gHM,469
-arize_phoenix-0.0.48.dist-info/licenses/LICENSE,sha256=HFkW9REuMOkvKRACuwLPT0hRydHb3zNg-fdFt94td18,3794
-arize_phoenix-0.0.48.dist-info/RECORD,,
+arize_phoenix-0.0.50rc0.dist-info/METADATA,sha256=8Q_DS7hlY6H7LvI_MLd75cjMn9ZMctkhdViKXZDeC7A,25582
+arize_phoenix-0.0.50rc0.dist-info/WHEEL,sha256=9QBuHhg6FNW7lppboF2vKVbCGTVzsFykgRQjjlajrhA,87
+arize_phoenix-0.0.50rc0.dist-info/licenses/IP_NOTICE,sha256=JBqyyCYYxGDfzQ0TtsQgjts41IJoa-hiwDrBjCb9gHM,469
+arize_phoenix-0.0.50rc0.dist-info/licenses/LICENSE,sha256=HFkW9REuMOkvKRACuwLPT0hRydHb3zNg-fdFt94td18,3794
+arize_phoenix-0.0.50rc0.dist-info/RECORD,,
phoenix/__init__.py CHANGED
@@ -5,7 +5,7 @@ from .session.session import Session, active_session, close_app, launch_app
 from .trace.fixtures import load_example_traces
 from .trace.trace_dataset import TraceDataset
 
-__version__ = "0.0.48"
+__version__ = "0.0.50rc"
 
 # module level doc-string
 __doc__ = """
phoenix/experimental/evals/__init__.py CHANGED
@@ -1,4 +1,4 @@
-from .functions import llm_eval_binary, llm_generate, run_relevance_eval
+from .functions import llm_classify, llm_eval_binary, llm_generate, run_relevance_eval
 from .models import OpenAIModel, VertexAIModel
 from .retrievals import compute_precisions_at_k
 from .templates import (
@@ -18,6 +18,7 @@ from .utils.downloads import download_benchmark_dataset
 __all__ = [
     "compute_precisions_at_k",
     "download_benchmark_dataset",
+    "llm_classify",
     "llm_eval_binary",
     "llm_generate",
     "OpenAIModel",
phoenix/experimental/evals/functions/__init__.py CHANGED
@@ -1,4 +1,4 @@
-from .binary import llm_eval_binary, run_relevance_eval
+from .classify import llm_classify, llm_eval_binary, run_relevance_eval
 from .generate import llm_generate
 
-__all__ = ["llm_eval_binary", "run_relevance_eval", "llm_generate"]
+__all__ = ["llm_classify", "llm_eval_binary", "run_relevance_eval", "llm_generate"]
phoenix/experimental/evals/functions/{binary.py → classify.py} RENAMED
@@ -1,5 +1,6 @@
 import logging
-from typing import Any, Iterable, List, Optional, Set, Union, cast
+import warnings
+from typing import Any, Iterable, List, Optional, Union, cast
 
 import pandas as pd
 
@@ -22,7 +23,7 @@ OPENINFERENCE_QUERY_COLUMN_NAME = "attributes." + INPUT_VALUE
 OPENINFERENCE_DOCUMENT_COLUMN_NAME = "attributes." + RETRIEVAL_DOCUMENTS
 
 
-def llm_eval_binary(
+def llm_classify(
     dataframe: pd.DataFrame,
     model: BaseEvalModel,
     template: Union[PromptTemplate, str],
@@ -30,7 +31,7 @@ def llm_eval_binary(
     system_instruction: Optional[str] = None,
     verbose: bool = False,
 ) -> List[str]:
-    """Runs binary classifications using an LLM.
+    """Classifies each input row of the dataframe using an LLM.
 
     Args:
         dataframe (pandas.DataFrame): A pandas dataframe in which each row represents a record to be
@@ -62,9 +63,62 @@
     eval_template = normalize_template(template)
     prompts = map_template(dataframe, eval_template)
     responses = verbose_model.generate(prompts.to_list(), instruction=system_instruction)
-    rails_set = set(rails)
-    printif(verbose, f"Snapping {len(responses)} responses to rails: {rails_set}")
-    return [_snap_to_rail(response, rails_set, verbose=verbose) for response in responses]
+    printif(verbose, f"Snapping {len(responses)} responses to rails: {rails}")
+    return [_snap_to_rail(response, rails, verbose=verbose) for response in responses]
+
+
+def llm_eval_binary(
+    dataframe: pd.DataFrame,
+    model: BaseEvalModel,
+    template: Union[PromptTemplate, str],
+    rails: List[str],
+    system_instruction: Optional[str] = None,
+    verbose: bool = False,
+) -> List[str]:
+    """Performs a binary classification on the rows of the input dataframe using an LLM.
+
+    Args:
+        dataframe (pandas.DataFrame): A pandas dataframe in which each row represents a record to be
+        classified. All template variable names must appear as column names in the dataframe (extra
+        columns unrelated to the template are permitted).
+
+        template (Union[PromptTemplate, str]): The prompt template as either an instance of
+        PromptTemplate or a string. If the latter, the variable names should be surrounded by
+        curly braces so that a call to `.format` can be made to substitute variable values.
+
+        model (BaseEvalModel): An LLM model class.
+
+        rails (List[str]): A list of strings representing the possible output classes of the model's
+        predictions.
+
+        system_instruction (Optional[str], optional): An optional system message.
+
+        verbose (bool, optional): If True, prints detailed info to stdout such as model invocation
+        parameters and details about retries and snapping to rails. Default False.
+
+    Returns:
+        List[str]: A list of strings representing the predicted class for each record in the
+        dataframe. The list should have the same length as the input dataframe and its values should
+        be the entries in the rails argument or "NOT_PARSABLE" if the model's prediction could not
+        be parsed.
+    """
+
+    warnings.warn(
+        "This function will soon be deprecated. "
+        "Use llm_classify instead, which has the same function signature "
+        "and provides support for multi-class classification "
+        "in addition to binary classification.",
+        category=DeprecationWarning,
+        stacklevel=2,
+    )
+    return llm_classify(
+        dataframe=dataframe,
+        model=model,
+        template=template,
+        rails=rails,
+        system_instruction=system_instruction,
+        verbose=verbose,
+    )
 
 
 def run_relevance_eval(
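The retained `llm_eval_binary` above is now a thin shim: it emits a `DeprecationWarning` and delegates to `llm_classify` with an unchanged signature. A quick sketch of how the warning surfaces to callers — `df`, `model`, `template`, and `rails` are placeholders for arguments like those in the README example:

```python
import warnings

from phoenix.experimental.evals import llm_eval_binary

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # Same arguments as llm_classify; placeholders here.
    labels = llm_eval_binary(df, model, template, rails)

# The shim forwards to llm_classify, so the results are identical,
# but a DeprecationWarning is recorded for the caller.
assert any(issubclass(w.category, DeprecationWarning) for w in caught)
```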
@@ -161,7 +215,7 @@ def run_relevance_eval(
             indexes.append(index)
             expanded_queries.append(query)
             expanded_documents.append(document)
-    predictions = llm_eval_binary(
+    predictions = llm_classify(
         dataframe=pd.DataFrame(
             {
                 query_column_name: expanded_queries,
@@ -188,92 +242,33 @@ def _get_contents_from_openinference_documents(documents: Iterable[Any]) -> List
     return [doc.get(DOCUMENT_CONTENT) if isinstance(doc, dict) else None for doc in documents]
 
 
-def _snap_to_rail(string: str, rails: Set[str], verbose: bool = False) -> str:
+def _snap_to_rail(raw_string: str, rails: List[str], verbose: bool = False) -> str:
     """
-    Snaps a string to the nearest rail, or returns None if the string cannot be snapped to a
-    rail.
+    Snaps a string to the nearest rail, or returns None if the string cannot be
+    snapped to a rail.
 
     Args:
-        string (str): An input to be snapped to a rail.
+        raw_string (str): An input to be snapped to a rail.
 
-        rails (Set[str]): The target set of strings to snap to.
+        rails (List[str]): The target set of strings to snap to.
 
     Returns:
-        str: A string from the rails argument or None if the input string could not be snapped.
+        str: A string from the rails argument or "UNPARSABLE" if the input
+        string could not be snapped.
     """
 
-    processed_string = string.strip()
-    rails_list = list(rails)
-    rail = _extract_rail(processed_string, rails_list[0], rails_list[1])
-    if not rail:
-        printif(verbose, f"- Cannot snap {repr(string)} to rails: {rails}")
-        logger.warning(
-            f"LLM output cannot be snapped to rails {list(rails)}, returning {NOT_PARSABLE}. "
-            f'Output: "{string}"'
-        )
+    snap_string = raw_string.lower()
+    rails = list(set(rails))
+    rails = [rail.lower() for rail in rails]
+    rails.sort(key=len, reverse=True)
+    found_rails = set()
+    for rail in rails:
+        if rail in snap_string:
+            found_rails.add(rail)
+            snap_string = snap_string.replace(rail, "")
+    if len(found_rails) != 1:
+        printif(verbose, f"- Cannot snap {repr(raw_string)} to rails")
         return NOT_PARSABLE
-    else:
-        printif(verbose, f"- Snapped {repr(string)} to rail: {rail}")
+    rail = list(found_rails)[0]
+    printif(verbose, f"- Snapped {repr(raw_string)} to rail: {rail}")
     return rail
-
-
-def _extract_rail(string: str, positive_rail: str, negative_rail: str) -> Optional[str]:
-    """
-    Extracts the right rails text from the llm output. If the rails have overlapping characters,
-    (e.x. "regular" and "irregular"), it also ensures that the correct rail is returned.
-
-    Args:
-        string (str): An input to be snapped to a rail.
-
-        positive_rail (str): The positive rail (e.x. toxic)
-
-        negative_rail (str): The negative rail. (e.x. non-toxic)
-
-    Returns:
-        str: A string from the rails or None if the input string could not be extracted.
-
-    Examples:
-        given: positive_rail = "irregular", negative_rail = "regular"
-
-        string = "irregular"
-        Output: "irregular"
-
-        string = "regular"
-        Output: "regular"
-
-        string = "regular,:....random"
-        Output: "regular"
-
-        string = "regular..irregular" - contains both rails
-        Output: None
-
-        string = "Irregular"
-        Output: "irregular"
-    """
-
-    # Convert the inputs to lowercase for case-insensitive matching
-    string = string.lower()
-    positive_rail = positive_rail.lower()
-    negative_rail = negative_rail.lower()
-
-    positive_pos, negative_pos = string.find(positive_rail), string.find(negative_rail)
-
-    # If both positive and negative rails are in the string
-    if positive_pos != -1 and negative_pos != -1:
-        # If either one is a substring of the other, return the longer one
-        # e.x. "regular" and "irregular"
-        if positive_pos < negative_pos < positive_pos + len(
-            positive_rail
-        ) or negative_pos < positive_pos < negative_pos + len(negative_rail):
-            # Return the longer of the rails since it means the LLM returned the longer one
-            return max(positive_rail, negative_rail, key=len)
-        else:
-            # If both rails values are in the string, we cannot determine which to return
-            return None
-    # If only positive is in string
-    elif positive_pos != -1:
-        return positive_rail
-    # If only negative is in the string
-    elif negative_pos != -1:
-        return negative_rail
-    return None
- return None
@@ -56,7 +56,7 @@ class OpenAIModel(BaseEvalModel):
56
56
  """Batch size to use when passing multiple documents to generate."""
57
57
  request_timeout: Optional[Union[float, Tuple[float, float]]] = None
58
58
  """Timeout for requests to OpenAI completion API. Default is 600 seconds."""
59
- max_retries: int = 6
59
+ max_retries: int = 20
60
60
  """Maximum number of retries to make when generating."""
61
61
  retry_min_seconds: int = 10
62
62
  """Minimum number of seconds to wait when retrying."""