arize-phoenix 0.0.48__py3-none-any.whl → 0.0.50rc0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

arize_phoenix-{0.0.48 → 0.0.50rc0}.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: arize-phoenix
-Version: 0.0.48
+Version: 0.0.50rc0
 Summary: ML Observability in your notebook
 Project-URL: Documentation, https://docs.arize.com/phoenix/
 Project-URL: Issues, https://github.com/Arize-ai/phoenix/issues
@@ -196,6 +196,7 @@ Launch Phoenix in a notebook and view the traces of your LangChain application i
 ```python
 import phoenix as px
 import pandas as pd
+import numpy as np
 
 # Launch phoenix
 session = px.launch_app()
@@ -219,7 +220,7 @@ documents_df = pd.read_parquet(
     "http://storage.googleapis.com/arize-assets/phoenix/datasets/unstructured/llm/context-retrieval/langchain-pinecone/database.parquet"
 )
 knn_retriever = KNNRetriever(
-    index=np.stack(df["text_vector"]),
+    index=np.stack(documents_df["text_vector"]),
     texts=documents_df["text"].tolist(),
     embeddings=OpenAIEmbeddings(),
 )
@@ -270,7 +271,7 @@ from phoenix.experimental.evals import (
     RAG_RELEVANCY_PROMPT_RAILS_MAP,
     OpenAIModel,
     download_benchmark_dataset,
-    llm_eval_binary,
+    llm_classify,
 )
 from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, ConfusionMatrixDisplay
 
@@ -291,7 +292,7 @@ model = OpenAIModel(
     temperature=0.0,
 )
 rails =list(RAG_RELEVANCY_PROMPT_RAILS_MAP.values())
-df["eval_relevance"] = llm_eval_binary(df, model, RAG_RELEVANCY_PROMPT_TEMPLATE_STR, rails)
+df["eval_relevance"] = llm_classify(df, model, RAG_RELEVANCY_PROMPT_TEMPLATE_STR, rails)
 #Golden dataset has True/False map to -> "irrelevant" / "relevant"
 #we can then scikit compare to output of template - same format
 y_true = df["relevant"].map({True: "relevant", False: "irrelevant"})
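Note on the README change above: `llm_eval_binary` has been renamed to `llm_classify`; the old name stays importable as a deprecated alias (see the shim in classify.py further down). A minimal migration sketch, assuming a dataframe whose columns match the template's variables — the column names, values, and the `RAG_RELEVANCY_PROMPT_TEMPLATE_STR` import path are illustrative assumptions, not taken from the package:

```python
import pandas as pd

from phoenix.experimental.evals import (
    RAG_RELEVANCY_PROMPT_RAILS_MAP,
    RAG_RELEVANCY_PROMPT_TEMPLATE_STR,  # assumed to be exported alongside the rails map
    OpenAIModel,
    llm_classify,
)

# Illustrative dataframe; column names are assumed to match the template variables.
df = pd.DataFrame(
    {
        "query": ["What is Phoenix?", "Who won the 1998 World Cup?"],
        "reference": ["Phoenix is an ML observability library.", "Phoenix runs in a notebook."],
    }
)

model = OpenAIModel(temperature=0.0)
rails = list(RAG_RELEVANCY_PROMPT_RAILS_MAP.values())

# Before: df["eval_relevance"] = llm_eval_binary(df, model, RAG_RELEVANCY_PROMPT_TEMPLATE_STR, rails)
# After: same signature, new name.
df["eval_relevance"] = llm_classify(df, model, RAG_RELEVANCY_PROMPT_TEMPLATE_STR, rails)
```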
arize_phoenix-{0.0.48 → 0.0.50rc0}.dist-info/RECORD RENAMED
@@ -1,4 +1,4 @@
-phoenix/__init__.py,sha256=XiFQGe2_k_n1IFl8jBHCUKHEU17stN-vsgIK_epR2s8,1255
+phoenix/__init__.py,sha256=culPUmrte05JPLfFqXv4_jGHDnkRB6AEuvZYZTqLpFQ,1257
 phoenix/config.py,sha256=TdMKmU7V490I38x_hvB1s14Y8pV3ldLSpJTKq6crzBY,1952
 phoenix/datetime_utils.py,sha256=D955QLrkgrrSdUM6NyqbCeAu2SMsjhR5rHVQEsVUdng,2773
 phoenix/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
@@ -16,17 +16,17 @@ phoenix/datasets/fixtures.py,sha256=0_PacL3dw49zulKpFpPdhvxJxeGmHTguqIyf2VXkBkk,
 phoenix/datasets/schema.py,sha256=bF1d2Md6NyqQZuC4Ym5A52f2_IcazkyxGFZ11HPqSg0,6668
 phoenix/datasets/validation.py,sha256=dZ9lCFUV0EY7HCkQkQBrs-GLAEIZdpOqUxwD5l4dp88,8294
 phoenix/experimental/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-phoenix/experimental/evals/__init__.py,sha256=YvB_OMcKbDCh_qAwWGa8HhkVoT-reYS4dRNpqC1pmPU,1128
+phoenix/experimental/evals/__init__.py,sha256=IqCg4owQosfk_QchEsUdx6lkHPbaPXlL2ce8juQSge8,1162
 phoenix/experimental/evals/retrievals.py,sha256=Y3YupYrrzt_orTMEFFW3eDBrHcMnBsqTqEQu7BWAUlk,3828
 phoenix/experimental/evals/utils.py,sha256=ivrYuX5Xotjh12BWOpYk9O7TgOt8uGDfdnRpYfrybmQ,1102
-phoenix/experimental/evals/functions/__init__.py,sha256=v12PUUlxc6JhD87yuc4mMmSK5-dTZIAxnCP_pbf2e4k,160
-phoenix/experimental/evals/functions/binary.py,sha256=15kNsQ-9PkRYhutQn8NXKYR_3p355IPX7NenpeGR49E,11850
+phoenix/experimental/evals/functions/__init__.py,sha256=gHt8xJklzefPFGOcgiselXrjqfJAQ0HL8xanxP3zD-A,192
+phoenix/experimental/evals/functions/classify.py,sha256=svn1yLmwdhnuzaZEA39b9s-GcDl4DgYTyWcbmSl-U9M,11924
 phoenix/experimental/evals/functions/generate.py,sha256=pxoL-D4sCvwVKAD-5nSs1qauDbAqYkJ1t9RCj7SanWA,2053
 phoenix/experimental/evals/functions/processing.py,sha256=F4xtLsulLV4a8CkuLldRddsCim75dSTIShEJUYN6I6w,1823
 phoenix/experimental/evals/models/__init__.py,sha256=Ek98LMKD8hzy4baHjZ0hy_JSxIJm21aLdH6BdQR1OW4,193
 phoenix/experimental/evals/models/base.py,sha256=A6T9F5ucr0WXKGwO6y1hfIyy1_ArTObbimEJR3nBZR8,6978
 phoenix/experimental/evals/models/bedrock.py,sha256=xppB9YaehlapGeyQqWAUEMJUWd7Z18g9MxzL7OEAP0M,7322
-phoenix/experimental/evals/models/openai.py,sha256=TpfnUrL6WBbwvG4re0JoEvz5lIr2wZ5j9uKmsvuEnP8,11357
+phoenix/experimental/evals/models/openai.py,sha256=_d_i0g3zLhn1y80tkUXjGWaNEN-kH2hvdrBBlhhBVGM,11358
 phoenix/experimental/evals/models/vertexai.py,sha256=K6yDGWIkavSoIoXuGc6czp-arz0eh42cWGiRmuvrGcs,5443
 phoenix/experimental/evals/templates/__init__.py,sha256=Tf1gzN-dkgv-szgU08SIj7oZrX-r7VjQ3dcXqoN0Gec,831
 phoenix/experimental/evals/templates/default_templates.py,sha256=0X_NoQZC-dqPeDfhoqo_7-stCfnxFmdOizCSGsNlAlA,6160
@@ -112,7 +112,7 @@ phoenix/server/static/apple-touch-icon-76x76.png,sha256=CT_xT12I0u2i0WU8JzBZBuOQ
 phoenix/server/static/apple-touch-icon.png,sha256=fOfpjqGpWYbJ0eAurKsyoZP1EAs6ZVooBJ_SGk2ZkDs,3801
 phoenix/server/static/favicon.ico,sha256=bY0vvCKRftemZfPShwZtE93DiiQdaYaozkPGwNFr6H8,34494
 phoenix/server/static/index.css,sha256=KKGpx4iwF91VGRm0YN-4cn8oC-oIqC6HecoPf0x3ZM8,1885
-phoenix/server/static/index.js,sha256=tB4m-Zx7moP-d68Md3NIXPs4ltr1vh9WvrQleHmU3Bc,3145906
+phoenix/server/static/index.js,sha256=gzxpSo53BVLk0omGaRucmCkfWvA8n5HcyspNTHgfk_g,3146701
 phoenix/server/static/modernizr.js,sha256=mvK-XtkNqjOral-QvzoqsyOMECXIMu5BQwSVN_wcU9c,2564
 phoenix/server/templates/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 phoenix/server/templates/index.html,sha256=TrupcsIB_TfFhnaG0fDQLfxTpuWc2zQo3RY1xx3k1Fg,1335
@@ -143,8 +143,8 @@ phoenix/trace/v1/trace_pb2.pyi,sha256=2JpgiYz3s8HrxnVIi5Brk7c3RJB4LqDGzwRYonhliR
 phoenix/utilities/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 phoenix/utilities/error_handling.py,sha256=7b5rpGFj9EWZ8yrZK1IHvxB89suWk3lggDayUQcvZds,1946
 phoenix/utilities/logging.py,sha256=D5-NAPYDEc7VD2babENVgKr9AeXUjl7ofDGiLNrWXyw,189
-arize_phoenix-0.0.48.dist-info/METADATA,sha256=_Q6SEbugYODjJm_kb8Id3ZJFs1I6_YFQsKqF5X5S7DM,25556
-arize_phoenix-0.0.48.dist-info/WHEEL,sha256=9QBuHhg6FNW7lppboF2vKVbCGTVzsFykgRQjjlajrhA,87
-arize_phoenix-0.0.48.dist-info/licenses/IP_NOTICE,sha256=JBqyyCYYxGDfzQ0TtsQgjts41IJoa-hiwDrBjCb9gHM,469
-arize_phoenix-0.0.48.dist-info/licenses/LICENSE,sha256=HFkW9REuMOkvKRACuwLPT0hRydHb3zNg-fdFt94td18,3794
-arize_phoenix-0.0.48.dist-info/RECORD,,
+arize_phoenix-0.0.50rc0.dist-info/METADATA,sha256=8Q_DS7hlY6H7LvI_MLd75cjMn9ZMctkhdViKXZDeC7A,25582
+arize_phoenix-0.0.50rc0.dist-info/WHEEL,sha256=9QBuHhg6FNW7lppboF2vKVbCGTVzsFykgRQjjlajrhA,87
+arize_phoenix-0.0.50rc0.dist-info/licenses/IP_NOTICE,sha256=JBqyyCYYxGDfzQ0TtsQgjts41IJoa-hiwDrBjCb9gHM,469
+arize_phoenix-0.0.50rc0.dist-info/licenses/LICENSE,sha256=HFkW9REuMOkvKRACuwLPT0hRydHb3zNg-fdFt94td18,3794
+arize_phoenix-0.0.50rc0.dist-info/RECORD,,
phoenix/__init__.py CHANGED
@@ -5,7 +5,7 @@ from .session.session import Session, active_session, close_app, launch_app
 from .trace.fixtures import load_example_traces
 from .trace.trace_dataset import TraceDataset
 
-__version__ = "0.0.48"
+__version__ = "0.0.50rc"
 
 # module level doc-string
 __doc__ = """
phoenix/experimental/evals/__init__.py CHANGED
@@ -1,4 +1,4 @@
-from .functions import llm_eval_binary, llm_generate, run_relevance_eval
+from .functions import llm_classify, llm_eval_binary, llm_generate, run_relevance_eval
 from .models import OpenAIModel, VertexAIModel
 from .retrievals import compute_precisions_at_k
 from .templates import (
@@ -18,6 +18,7 @@ from .utils.downloads import download_benchmark_dataset
 __all__ = [
     "compute_precisions_at_k",
     "download_benchmark_dataset",
+    "llm_classify",
     "llm_eval_binary",
     "llm_generate",
     "OpenAIModel",
phoenix/experimental/evals/functions/__init__.py CHANGED
@@ -1,4 +1,4 @@
-from .binary import llm_eval_binary, run_relevance_eval
+from .classify import llm_classify, llm_eval_binary, run_relevance_eval
 from .generate import llm_generate
 
-__all__ = ["llm_eval_binary", "run_relevance_eval", "llm_generate"]
+__all__ = ["llm_classify", "llm_eval_binary", "run_relevance_eval", "llm_generate"]
phoenix/experimental/evals/functions/{binary.py → classify.py} RENAMED
@@ -1,5 +1,6 @@
 import logging
-from typing import Any, Iterable, List, Optional, Set, Union, cast
+import warnings
+from typing import Any, Iterable, List, Optional, Union, cast
 
 import pandas as pd
 
@@ -22,7 +23,7 @@ OPENINFERENCE_QUERY_COLUMN_NAME = "attributes." + INPUT_VALUE
 OPENINFERENCE_DOCUMENT_COLUMN_NAME = "attributes." + RETRIEVAL_DOCUMENTS
 
 
-def llm_eval_binary(
+def llm_classify(
     dataframe: pd.DataFrame,
     model: BaseEvalModel,
     template: Union[PromptTemplate, str],
@@ -30,7 +31,7 @@ def llm_eval_binary(
     system_instruction: Optional[str] = None,
     verbose: bool = False,
 ) -> List[str]:
-    """Runs binary classifications using an LLM.
+    """Classifies each input row of the dataframe using an LLM.
 
     Args:
         dataframe (pandas.DataFrame): A pandas dataframe in which each row represents a record to be
@@ -62,9 +63,62 @@
     eval_template = normalize_template(template)
     prompts = map_template(dataframe, eval_template)
     responses = verbose_model.generate(prompts.to_list(), instruction=system_instruction)
-    rails_set = set(rails)
-    printif(verbose, f"Snapping {len(responses)} responses to rails: {rails_set}")
-    return [_snap_to_rail(response, rails_set, verbose=verbose) for response in responses]
+    printif(verbose, f"Snapping {len(responses)} responses to rails: {rails}")
+    return [_snap_to_rail(response, rails, verbose=verbose) for response in responses]
+
+
+def llm_eval_binary(
+    dataframe: pd.DataFrame,
+    model: BaseEvalModel,
+    template: Union[PromptTemplate, str],
+    rails: List[str],
+    system_instruction: Optional[str] = None,
+    verbose: bool = False,
+) -> List[str]:
+    """Performs a binary classification on the rows of the input dataframe using an LLM.
+
+    Args:
+        dataframe (pandas.DataFrame): A pandas dataframe in which each row represents a record to be
+        classified. All template variable names must appear as column names in the dataframe (extra
+        columns unrelated to the template are permitted).
+
+        template (Union[PromptTemplate, str]): The prompt template as either an instance of
+        PromptTemplate or a string. If the latter, the variable names should be surrounded by
+        curly braces so that a call to `.format` can be made to substitute variable values.
+
+        model (BaseEvalModel): An LLM model class.
+
+        rails (List[str]): A list of strings representing the possible output classes of the model's
+        predictions.
+
+        system_instruction (Optional[str], optional): An optional system message.
+
+        verbose (bool, optional): If True, prints detailed info to stdout such as model invocation
+        parameters and details about retries and snapping to rails. Default False.
+
+    Returns:
+        List[str]: A list of strings representing the predicted class for each record in the
+        dataframe. The list should have the same length as the input dataframe and its values should
+        be the entries in the rails argument or "NOT_PARSABLE" if the model's prediction could not
+        be parsed.
+    """
+
+    warnings.warn(
+        "This function will soon be deprecated. "
+        "Use llm_classify instead, which has the same function signature "
+        "and provides support for multi-class classification "
+        "in addition to binary classification.",
+        category=DeprecationWarning,
+        stacklevel=2,
+    )
+    return llm_classify(
+        dataframe=dataframe,
+        model=model,
+        template=template,
+        rails=rails,
+        system_instruction=system_instruction,
+        verbose=verbose,
+    )
 
 
 def run_relevance_eval(
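The retained `llm_eval_binary` above is now a thin shim: it emits a `DeprecationWarning` and delegates to `llm_classify` with an unchanged signature. A quick sketch of how the warning surfaces to callers — `df`, `model`, `template`, and `rails` are placeholders for arguments like those in the README example:

```python
import warnings

from phoenix.experimental.evals import llm_eval_binary

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # Same arguments as llm_classify; placeholders here.
    labels = llm_eval_binary(df, model, template, rails)

# The shim forwards to llm_classify, so the results are identical,
# but a DeprecationWarning is recorded for the caller.
assert any(issubclass(w.category, DeprecationWarning) for w in caught)
```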
@@ -161,7 +215,7 @@ def run_relevance_eval(
             indexes.append(index)
             expanded_queries.append(query)
             expanded_documents.append(document)
-    predictions = llm_eval_binary(
+    predictions = llm_classify(
         dataframe=pd.DataFrame(
             {
                 query_column_name: expanded_queries,
@@ -188,92 +242,33 @@ def _get_contents_from_openinference_documents(documents: Iterable[Any]) -> List
     return [doc.get(DOCUMENT_CONTENT) if isinstance(doc, dict) else None for doc in documents]
 
 
-def _snap_to_rail(string: str, rails: Set[str], verbose: bool = False) -> str:
+def _snap_to_rail(raw_string: str, rails: List[str], verbose: bool = False) -> str:
     """
-    Snaps a string to the nearest rail, or returns None if the string cannot be snapped to a
-    rail.
+    Snaps a string to the nearest rail, or returns None if the string cannot be
+    snapped to a rail.
 
     Args:
-        string (str): An input to be snapped to a rail.
+        raw_string (str): An input to be snapped to a rail.
 
-        rails (Set[str]): The target set of strings to snap to.
+        rails (List[str]): The target set of strings to snap to.
 
     Returns:
-        str: A string from the rails argument or None if the input string could not be snapped.
+        str: A string from the rails argument or "UNPARSABLE" if the input
+        string could not be snapped.
     """
 
-    processed_string = string.strip()
-    rails_list = list(rails)
-    rail = _extract_rail(processed_string, rails_list[0], rails_list[1])
-    if not rail:
-        printif(verbose, f"- Cannot snap {repr(string)} to rails: {rails}")
-        logger.warning(
-            f"LLM output cannot be snapped to rails {list(rails)}, returning {NOT_PARSABLE}. "
-            f'Output: "{string}"'
-        )
+    snap_string = raw_string.lower()
+    rails = list(set(rails))
+    rails = [rail.lower() for rail in rails]
+    rails.sort(key=len, reverse=True)
+    found_rails = set()
+    for rail in rails:
+        if rail in snap_string:
+            found_rails.add(rail)
+            snap_string = snap_string.replace(rail, "")
+    if len(found_rails) != 1:
+        printif(verbose, f"- Cannot snap {repr(raw_string)} to rails")
         return NOT_PARSABLE
-    else:
-        printif(verbose, f"- Snapped {repr(string)} to rail: {rail}")
+    rail = list(found_rails)[0]
+    printif(verbose, f"- Snapped {repr(raw_string)} to rail: {rail}")
     return rail
-
-
-def _extract_rail(string: str, positive_rail: str, negative_rail: str) -> Optional[str]:
-    """
-    Extracts the right rails text from the llm output. If the rails have overlapping characters,
-    (e.x. "regular" and "irregular"), it also ensures that the correct rail is returned.
-
-    Args:
-        string (str): An input to be snapped to a rail.
-
-        positive_rail (str): The positive rail (e.x. toxic)
-
-        negative_rail (str): The negative rail. (e.x. non-toxic)
-
-    Returns:
-        str: A string from the rails or None if the input string could not be extracted.
-
-    Examples:
-        given: positive_rail = "irregular", negative_rail = "regular"
-
-        string = "irregular"
-        Output: "irregular"
-
-        string = "regular"
-        Output: "regular"
-
-        string = "regular,:....random"
-        Output: "regular"
-
-        string = "regular..irregular" - contains both rails
-        Output: None
-
-        string = "Irregular"
-        Output: "irregular"
-    """
-
-    # Convert the inputs to lowercase for case-insensitive matching
-    string = string.lower()
-    positive_rail = positive_rail.lower()
-    negative_rail = negative_rail.lower()
-
-    positive_pos, negative_pos = string.find(positive_rail), string.find(negative_rail)
-
-    # If both positive and negative rails are in the string
-    if positive_pos != -1 and negative_pos != -1:
-        # If either one is a substring of the other, return the longer one
-        # e.x. "regular" and "irregular"
-        if positive_pos < negative_pos < positive_pos + len(
-            positive_rail
-        ) or negative_pos < positive_pos < negative_pos + len(negative_rail):
-            # Return the longer of the rails since it means the LLM returned the longer one
-            return max(positive_rail, negative_rail, key=len)
-        else:
-            # If both rails values are in the string, we cannot determine which to return
-            return None
-    # If only positive is in string
-    elif positive_pos != -1:
-        return positive_rail
-    # If only negative is in the string
-    elif negative_pos != -1:
-        return negative_rail
-    return None
- return None
@@ -56,7 +56,7 @@ class OpenAIModel(BaseEvalModel):
56
56
  """Batch size to use when passing multiple documents to generate."""
57
57
  request_timeout: Optional[Union[float, Tuple[float, float]]] = None
58
58
  """Timeout for requests to OpenAI completion API. Default is 600 seconds."""
59
- max_retries: int = 6
59
+ max_retries: int = 20
60
60
  """Maximum number of retries to make when generating."""
61
61
  retry_min_seconds: int = 10
62
62
  """Minimum number of seconds to wait when retrying."""