arize-phoenix 4.4.4rc3__py3-none-any.whl → 4.4.4rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: arize-phoenix
3
- Version: 4.4.4rc3
3
+ Version: 4.4.4rc4
4
4
  Summary: AI Observability and Evaluation
5
5
  Project-URL: Documentation, https://docs.arize.com/phoenix/
6
6
  Project-URL: Issues, https://github.com/Arize-ai/phoenix/issues
@@ -5,17 +5,20 @@ phoenix/exceptions.py,sha256=n2L2KKuecrdflB9MsCdAYCiSEvGJptIsfRkXMoJle7A,169
5
5
  phoenix/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
6
6
  phoenix/services.py,sha256=aTxhcOA1pZHB6U-B3TEcp6fqDF5oT0xCUvEUNMZVTUQ,5175
7
7
  phoenix/settings.py,sha256=cO-qgis_S27nHirTobYI9hHPfZH18R--WMmxNdsVUwc,273
8
- phoenix/version.py,sha256=vkzG2Z0dkYNWJYkiDnpu7yJxir6A-qjTBfFVeklU7TY,25
8
+ phoenix/version.py,sha256=NZ2gYPUT2LKOK3V9-dZJ34v1J27mnLmDtx-pKAXd1W0,25
9
9
  phoenix/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  phoenix/core/embedding_dimension.py,sha256=zKGbcvwOXgLf-yrJBpQyKtd-LEOPRKHnUToyAU8Owis,87
11
11
  phoenix/core/model.py,sha256=km_a--PBHOuA337ClRw9xqhOHhrUT6Rl9pz_zV0JYkQ,4843
12
12
  phoenix/core/model_schema.py,sha256=F2dbbVnkDLsPYoyZDv1q03uhvP8LcU1wXp0g-exiWs0,50551
13
13
  phoenix/core/model_schema_adapter.py,sha256=0Tm_Y_gV-WED8fKBCaFXAEFwE3CTEZS1dowqnTZ7x7g,8426
14
14
  phoenix/datasets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
- phoenix/datasets/evaluators.py,sha256=_ezCRB6cyhuJsxsQJRFt2CKg3vqV-KgtBi9NNtkdeJQ,10410
16
- phoenix/datasets/experiments.py,sha256=D1gmdCbOC7tkjrFjyC_WPNFyY41YrqKESsPE0CebTtc,19223
15
+ phoenix/datasets/experiments.py,sha256=MhuhJWJ-bBqZ_aR3FewudEeo6RUrLgm0hmDlGjWVsrU,19314
17
16
  phoenix/datasets/tracing.py,sha256=Ieb2Uo-9qHpmv65uf1VsFSsWo5Yxj6VHwGS6dxu9NHQ,2248
18
17
  phoenix/datasets/types.py,sha256=w0KoSP7AdlcFlV3I6qVtvKOOWoK0yiY6_s4CvH0flcs,5753
18
+ phoenix/datasets/evaluators/__init__.py,sha256=KSr9fNG4O93swYxNdPj_UihP9Itl_5mj0a492wi_4_0,465
19
+ phoenix/datasets/evaluators/_utils.py,sha256=-MaNdoN1hA3FLzLyIDplUUkUtmM56BMIV83Gh-sgAsU,436
20
+ phoenix/datasets/evaluators/code_evaluators.py,sha256=fwoKfyHD7_xBaHY8Ax78xcry7PtB8Y1FxIn82guAV5M,4640
21
+ phoenix/datasets/evaluators/llm_evaluators.py,sha256=Ghg3bIBtQCdd6LuQ6VdcbkNQKI9ouZXwjlJV5GcdxOg,8675
19
22
  phoenix/db/README.md,sha256=IvKaZyf9ECbGBYYePaRhBveKZwDbxAc-c7BMxJYZh6Q,595
20
23
  phoenix/db/__init__.py,sha256=pDjEFXukHmJBM-1D8RjmXkvLsz85YWNxMQczt81ec3A,118
21
24
  phoenix/db/alembic.ini,sha256=p8DjVqGUs_tTx8oU56JP7qj-rMUebNFizItUSv_hPhs,3763
@@ -173,7 +176,7 @@ phoenix/server/api/types/Retrieval.py,sha256=OhMK2ncjoyp5h1yjKhjlKpoTbQrMHuxmgSF
173
176
  phoenix/server/api/types/ScalarDriftMetricEnum.py,sha256=IUAcRPpgL41WdoIgK6cNk2Te38SspXGyEs-S1fY23_A,232
174
177
  phoenix/server/api/types/Segments.py,sha256=m2yoegrxA1Tn7ZAy1rMjjD1isc752MaAXMoffkBlvrM,2921
175
178
  phoenix/server/api/types/SortDir.py,sha256=OUpXhlCzCxPoXSDkJJygEs9Rw9pMymfaZUG5zPTrw4Y,152
176
- phoenix/server/api/types/Span.py,sha256=Nk0Of6JyHSI7OqrEodyV3d5UUvzCWnDkNSZUcmCvq-I,13837
179
+ phoenix/server/api/types/Span.py,sha256=W4Rsg85bgqbDhgYwpjgOTrIQKbkwpFQPpL6nqMyzhCs,13865
177
180
  phoenix/server/api/types/TimeSeries.py,sha256=wjzuxHFqCey0O7Ys25qiXyuqXK8an-osyNWUE8A_8G4,5227
178
181
  phoenix/server/api/types/Trace.py,sha256=ep-mPexub1ijxAnBvc2KrGsNVXO2SfDR1WxqER2wcD8,2376
179
182
  phoenix/server/api/types/UMAPPoints.py,sha256=5sOuruzM8saXa8C2XiyUfk2XPrkVGmhqKpclMYRw1dk,1656
@@ -194,7 +197,7 @@ phoenix/server/static/apple-touch-icon-76x76.png,sha256=CT_xT12I0u2i0WU8JzBZBuOQ
194
197
  phoenix/server/static/apple-touch-icon.png,sha256=fOfpjqGpWYbJ0eAurKsyoZP1EAs6ZVooBJ_SGk2ZkDs,3801
195
198
  phoenix/server/static/favicon.ico,sha256=bY0vvCKRftemZfPShwZtE93DiiQdaYaozkPGwNFr6H8,34494
196
199
  phoenix/server/static/index.css,sha256=KKGpx4iwF91VGRm0YN-4cn8oC-oIqC6HecoPf0x3ZM8,1885
197
- phoenix/server/static/index.js,sha256=88OQ_pBKrFdD5usFU6Frpm1vBzxL19zO4JS9ChoHWEo,3487681
200
+ phoenix/server/static/index.js,sha256=n8qF_l7ijW-7E8m63oViD8SpXOYjN3wvZUhgB8H6ZLo,3489949
198
201
  phoenix/server/static/modernizr.js,sha256=mvK-XtkNqjOral-QvzoqsyOMECXIMu5BQwSVN_wcU9c,2564
199
202
  phoenix/server/templates/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
200
203
  phoenix/server/templates/index.html,sha256=S4z7qSoNSwnKFAH9r96AR-YJEyoKMd-VMWVlJ_IdzME,2039
@@ -211,7 +214,7 @@ phoenix/trace/exporter.py,sha256=eAYemdvDCHMugDJiaR29BFFMTQBdf3oerdkz34Cl3hE,473
211
214
  phoenix/trace/fixtures.py,sha256=gBGFG2gkcBsSDzolzzR9AJDrB_fdOQfUaGgHV-EHdco,14204
212
215
  phoenix/trace/otel.py,sha256=WA720jvRadiZBAKjsYoPyXzypHwbyEK2OZRVUwtbjB8,9976
213
216
  phoenix/trace/projects.py,sha256=2BwlNjFE-uwpqYtCu5YyBiYZk9wRPpM13vh3-Cv7GkA,2157
214
- phoenix/trace/schemas.py,sha256=JiFKhGD2JF6Eai7UOhPF5urcuKGkpMLHc3Vltbe1msk,5967
217
+ phoenix/trace/schemas.py,sha256=Mjc6fD9OyeMnEk5wPPSbveqnNUYWK3p3BxpOvSGanHU,5950
215
218
  phoenix/trace/span_evaluations.py,sha256=GaADtJLi2njra4aYaie0BIwkSgdxPB_SNseglI4ykZA,13104
216
219
  phoenix/trace/span_json_decoder.py,sha256=IAFakPRqSMYxTPKYFMiXYxm7U-FipdN8_xbvapDS0Qc,3131
217
220
  phoenix/trace/span_json_encoder.py,sha256=tzSCIQJbeFBm33K68G8A5M12n_86tCDyuU0WAobxEz4,2010
@@ -239,8 +242,8 @@ phoenix/utilities/logging.py,sha256=lDXd6EGaamBNcQxL4vP1au9-i_SXe0OraUDiJOcszSw,
239
242
  phoenix/utilities/project.py,sha256=qWsvKnG1oKhOFUowXf9qiOL2ia7jaFe_ijFFHEt8GJo,431
240
243
  phoenix/utilities/re.py,sha256=PDve_OLjRTM8yQQJHC8-n3HdIONi7aNils3ZKRZ5uBM,2045
241
244
  phoenix/utilities/span_store.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
242
- arize_phoenix-4.4.4rc3.dist-info/METADATA,sha256=VuX8kXsqxcbsdYmi9-jCDMHgMJ182JMbDYCY-3N74jU,11012
243
- arize_phoenix-4.4.4rc3.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
244
- arize_phoenix-4.4.4rc3.dist-info/licenses/IP_NOTICE,sha256=JBqyyCYYxGDfzQ0TtsQgjts41IJoa-hiwDrBjCb9gHM,469
245
- arize_phoenix-4.4.4rc3.dist-info/licenses/LICENSE,sha256=HFkW9REuMOkvKRACuwLPT0hRydHb3zNg-fdFt94td18,3794
246
- arize_phoenix-4.4.4rc3.dist-info/RECORD,,
245
+ arize_phoenix-4.4.4rc4.dist-info/METADATA,sha256=YEUoxXSRba4zRgzM8-lcq7TIp9GNPZSjY_QGoyIJN-w,11012
246
+ arize_phoenix-4.4.4rc4.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
247
+ arize_phoenix-4.4.4rc4.dist-info/licenses/IP_NOTICE,sha256=JBqyyCYYxGDfzQ0TtsQgjts41IJoa-hiwDrBjCb9gHM,469
248
+ arize_phoenix-4.4.4rc4.dist-info/licenses/LICENSE,sha256=HFkW9REuMOkvKRACuwLPT0hRydHb3zNg-fdFt94td18,3794
249
+ arize_phoenix-4.4.4rc4.dist-info/RECORD,,
@@ -0,0 +1,18 @@
1
+ from phoenix.datasets.evaluators.code_evaluators import ContainsKeyword, JSONParsable
2
+ from phoenix.datasets.evaluators.llm_evaluators import (
3
+ CoherenceEvaluator,
4
+ ConcisenessEvaluator,
5
+ HelpfulnessEvaluator,
6
+ LLMCriteriaEvaluator,
7
+ RelevanceEvaluator,
8
+ )
9
+
10
+ __all__ = [
11
+ "ContainsKeyword",
12
+ "JSONParsable",
13
+ "CoherenceEvaluator",
14
+ "ConcisenessEvaluator",
15
+ "LLMCriteriaEvaluator",
16
+ "HelpfulnessEvaluator",
17
+ "RelevanceEvaluator",
18
+ ]
@@ -0,0 +1,13 @@
1
+ from phoenix.datasets.types import JSONSerializable
2
+
3
+
4
+ def _unwrap_json(obj: JSONSerializable) -> JSONSerializable:
5
+ if isinstance(obj, dict):
6
+ if len(obj) == 1:
7
+ key = next(iter(obj.keys()))
8
+ output = obj[key]
9
+ assert isinstance(
10
+ output, (dict, list, str, int, float, bool, type(None))
11
+ ), "Output must be JSON serializable"
12
+ return output
13
+ return obj
@@ -0,0 +1,127 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import re
5
+ from typing import TYPE_CHECKING, List, Optional, Union
6
+
7
+ from phoenix.datasets.evaluators._utils import _unwrap_json
8
+ from phoenix.datasets.types import EvaluationResult, Example, ExperimentEvaluator, ExperimentRun
9
+
10
+
11
class JSONParsable:
    """Code evaluator that scores whether a run's output string parses as JSON."""

    annotator_kind = "CODE"
    name = "JSONParsable"

    def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
        """Score 1 if the run output is parsable JSON text, otherwise 0.

        Args:
            example: The dataset example; not used by this evaluator.
            exp_run: The experiment run whose ``output.result`` is checked
                after being passed through ``_unwrap_json``.

        Returns:
            An ``EvaluationResult`` whose ``score`` is ``1`` when
            ``json.loads`` accepts the output and ``0`` when it does not.

        Raises:
            AssertionError: If the run has no output, or the unwrapped output
                is not a string.
        """
        assert exp_run.output is not None
        output = _unwrap_json(exp_run.output.result)
        assert isinstance(output, str), "Experiment run output must be a string"
        try:
            json.loads(output)
        except (ValueError, RecursionError):
            # json.JSONDecodeError subclasses ValueError; RecursionError covers
            # pathologically nested documents. KeyboardInterrupt, SystemExit
            # and MemoryError are deliberately no longer swallowed here.
            json_parsable = False
        else:
            json_parsable = True
        return EvaluationResult(
            score=int(json_parsable),
        )
27
+
28
+
29
class ContainsKeyword:
    """Code evaluator that checks a run's output string for one substring."""

    annotator_kind = "CODE"

    def __init__(self, keyword: str, name: Optional[str] = None) -> None:
        """Store the keyword and pick a display name.

        Args:
            keyword: Substring to look for in the run output.
            name: Evaluator name; when falsy, ``Contains(<repr of keyword>)``
                is used instead.
        """
        self.keyword = keyword
        self.name = name if name else f"Contains({keyword!r})"

    def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
        """Score 1.0 when the keyword occurs in the run output, else 0.0.

        Args:
            example: The dataset example; not used by this evaluator.
            exp_run: The experiment run whose ``output.result`` is searched
                after being passed through ``_unwrap_json``.

        Returns:
            An ``EvaluationResult`` carrying the score and a one-line
            explanation stating whether the keyword was found.

        Raises:
            AssertionError: If the run has no output, or the unwrapped output
                is not a string.
        """
        assert exp_run.output is not None
        text = _unwrap_json(exp_run.output.result)
        assert isinstance(text, str), "Experiment run output must be a string"
        is_present = self.keyword in text
        verdict = "found" if is_present else "not found"
        return EvaluationResult(
            score=float(is_present),
            explanation=f"the string {self.keyword!r} was {verdict} in the output",
        )
48
+
49
+
50
class ContainsAnyKeyword:
    """Code evaluator that passes when at least one keyword is in the output."""

    annotator_kind = "CODE"

    def __init__(self, keywords: List[str], name: Optional[str] = None) -> None:
        """Store the keywords and pick a display name.

        Args:
            keywords: Substrings to look for; the list is stored as given,
                not copied.
            name: Evaluator name; when falsy, ``ContainsAny(<keywords>)`` is
                used instead.
        """
        self.keywords = keywords
        self.name = name if name else f"ContainsAny({keywords})"

    def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
        """Score 1.0 when any keyword occurs in the run output, else 0.0.

        Args:
            example: The dataset example; not used by this evaluator.
            exp_run: The experiment run whose ``output.result`` is searched
                after being passed through ``_unwrap_json``.

        Returns:
            An ``EvaluationResult`` whose explanation lists the keywords that
            were found, or states that none of them were.

        Raises:
            AssertionError: If the run has no output, or the unwrapped output
                is not a string.
        """
        assert exp_run.output is not None
        text = _unwrap_json(exp_run.output.result)
        assert isinstance(text, str), "Experiment run output must be a string"
        hits = [candidate for candidate in self.keywords if candidate in text]
        explanation = (
            f"the keywords {hits} were found in the output"
            if hits
            else f"none of the keywords {self.keywords} were found in the output"
        )
        return EvaluationResult(score=float(bool(hits)), explanation=explanation)
70
+
71
+
72
class ContainsAllKeywords:
    """Code evaluator that passes only when every keyword is in the output."""

    annotator_kind = "CODE"

    def __init__(self, keywords: List[str], name: Optional[str] = None) -> None:
        """Store the keywords and pick a display name.

        Args:
            keywords: Substrings that must all be present; the list is stored
                as given, not copied.
            name: Evaluator name; when falsy, ``ContainsAll(<keywords>)`` is
                used instead.
        """
        self.keywords = keywords
        self.name = name if name else f"ContainsAll({keywords})"

    def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
        """Score 1.0 when no keyword is missing from the run output, else 0.0.

        Args:
            example: The dataset example; not used by this evaluator.
            exp_run: The experiment run whose ``output.result`` is searched
                after being passed through ``_unwrap_json``.

        Returns:
            An ``EvaluationResult`` whose explanation lists the missing
            keywords, or states that all of them were found.

        Raises:
            AssertionError: If the run has no output, or the unwrapped output
                is not a string.
        """
        assert exp_run.output is not None
        text = _unwrap_json(exp_run.output.result)
        assert isinstance(text, str), "Experiment run output must be a string"
        missing = [candidate for candidate in self.keywords if candidate not in text]
        if missing:
            return EvaluationResult(
                score=float(False),
                explanation=f"the keywords {missing} were not found in the output",
            )
        return EvaluationResult(
            score=float(True),
            explanation=f"all of the keywords {self.keywords} were found in the output",
        )
94
+
95
+
96
class MatchesRegex:
    """Code evaluator that passes when a regex finds a match in the output."""

    annotator_kind = "CODE"

    def __init__(self, pattern: Union[str, re.Pattern[str]], name: Optional[str] = None) -> None:
        """Compile the pattern if needed and pick a display name.

        Args:
            pattern: A regex source string (compiled with ``re.compile``) or
                an already compiled pattern.
            name: Evaluator name; when falsy, ``matches_(<compiled pattern>)``
                is used, formatted from the compiled pattern object.

        Raises:
            AssertionError: If ``pattern`` is neither a string nor a compiled
                pattern.
        """
        compiled = re.compile(pattern) if isinstance(pattern, str) else pattern
        self.pattern = compiled
        assert isinstance(compiled, re.Pattern)
        self.name = name if name else f"matches_({compiled})"

    def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
        """Score 1.0 when ``findall`` returns a non-empty list, else 0.0.

        Args:
            example: The dataset example; not used by this evaluator.
            exp_run: The experiment run whose ``output.result`` is searched
                after being passed through ``_unwrap_json``.

        Returns:
            An ``EvaluationResult`` whose explanation lists the ``findall``
            results, or states that nothing matched.

        Raises:
            AssertionError: If the run has no output, or the unwrapped output
                is not a string.
        """
        assert exp_run.output is not None
        text = _unwrap_json(exp_run.output.result)
        assert isinstance(text, str), "Experiment run output must be a string"
        found = self.pattern.findall(text)
        regex_source = self.pattern.pattern
        explanation = (
            f"the substrings {found} matched the regex pattern {regex_source}"
            if found
            else f"no substrings matched the regex pattern {regex_source}"
        )
        return EvaluationResult(score=float(bool(found)), explanation=explanation)
121
+
122
+
123
# Someday we'll do type checking in unit tests.
# Until then, these assignments run only under a static type checker
# (TYPE_CHECKING is False at runtime) and verify that each evaluator defined
# in this module is assignable to ExperimentEvaluator.
if TYPE_CHECKING:
    _: ExperimentEvaluator
    _ = JSONParsable()
    _ = ContainsKeyword("test")
    _ = ContainsAnyKeyword(["test"])
    _ = ContainsAllKeywords(["test"])
    _ = MatchesRegex("test")
@@ -1,70 +1,12 @@
1
- import json
2
1
  import re
3
- from typing import TYPE_CHECKING, Callable, Optional, Type
2
+ from typing import Callable, Optional, Type
4
3
 
5
- from phoenix.datasets.types import (
6
- EvaluationResult,
7
- Example,
8
- ExperimentEvaluator,
9
- ExperimentRun,
10
- JSONSerializable,
11
- )
4
+ from phoenix.datasets.evaluators._utils import _unwrap_json
5
+ from phoenix.datasets.types import EvaluationResult, Example, ExperimentEvaluator, ExperimentRun
12
6
  from phoenix.evals.models.base import BaseModel as LLMBaseModel
13
7
  from phoenix.evals.utils import snap_to_rail
14
8
 
15
9
 
16
- def _unwrap_json(obj: JSONSerializable) -> JSONSerializable:
17
- if isinstance(obj, dict):
18
- if len(obj) == 1:
19
- key = next(iter(obj.keys()))
20
- output = obj[key]
21
- assert isinstance(
22
- output, (dict, list, str, int, float, bool, type(None))
23
- ), "Output must be JSON serializable"
24
- return output
25
- return obj
26
-
27
-
28
- class JSONParsable:
29
- annotator_kind = "CODE"
30
- name = "JSONParsable"
31
-
32
- def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
33
- assert exp_run.output is not None
34
- output = _unwrap_json(exp_run.output.result)
35
- assert isinstance(output, str), "Experiment run output must be a string"
36
- try:
37
- json.loads(output)
38
- json_parsable = True
39
- except BaseException:
40
- json_parsable = False
41
- return EvaluationResult(
42
- score=int(json_parsable),
43
- )
44
-
45
-
46
- class ContainsKeyword:
47
- annotator_kind = "CODE"
48
-
49
- def __init__(self, keyword: str) -> None:
50
- super().__init__()
51
- self.keyword = keyword
52
- self.name = f"ContainsKeyword({keyword})"
53
-
54
- def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
55
- assert exp_run.output is not None
56
- result = _unwrap_json(exp_run.output.result)
57
- assert isinstance(result, str), "Experiment run output must be a string"
58
- found = self.keyword in result
59
- return EvaluationResult(
60
- score=float(found),
61
- explanation=(
62
- f"the string {repr(self.keyword)} was "
63
- f"{'found' if found else 'not found'} in the output"
64
- ),
65
- )
66
-
67
-
68
10
  class LLMCriteriaEvaluator:
69
11
  annotator_kind = "LLM"
70
12
  _base_template = (
@@ -77,7 +19,7 @@ class LLMCriteriaEvaluator:
77
19
  "EXPLANATION: *a step by step explanation of your reasoning for whether the text meets "
78
20
  "the criteria*\n"
79
21
  "LABEL: *true or false*\n\n"
80
- "Follow this template for the following text:\n\n"
22
+ "Follow this template for the following example:\n\n"
81
23
  "CRITERIA: the text is '{criteria}'\n"
82
24
  "TEXT: {text}\n"
83
25
  "EXPLANATION: "
@@ -142,40 +84,43 @@ class LLMCriteriaEvaluator:
142
84
 
143
85
 
144
86
  def criteria_evaluator_factory(
145
- class_name: str, criteria: str, description: str
87
+ class_name: str, criteria: str, description: str, default_name: str
146
88
  ) -> Type[ExperimentEvaluator]:
89
+ def _init(self, model: LLMBaseModel, name: str = default_name) -> None: # type: ignore
90
+ LLMCriteriaEvaluator.__init__(self, model, criteria, description, name=name)
91
+
147
92
  return type(
148
93
  class_name,
149
94
  (LLMCriteriaEvaluator,),
150
95
  {
151
- "__init__": lambda self, model: LLMCriteriaEvaluator.__init__(
152
- self, model, criteria, description, name=class_name
153
- ),
96
+ "__init__": _init,
154
97
  "__module__": __name__,
155
- "name": class_name,
156
98
  "template": LLMCriteriaEvaluator._format_base_template(criteria, description),
157
99
  },
158
100
  )
159
101
 
160
102
 
161
- LLMConcisenessEvaluator = criteria_evaluator_factory(
162
- class_name="LLMConcisenessEvaluator",
103
+ ConcisenessEvaluator = criteria_evaluator_factory(
104
+ class_name="ConcisenessEvaluator",
163
105
  criteria="concise",
164
106
  description="is just a few sentences and easy to follow",
107
+ default_name="Conciseness",
165
108
  )
166
109
 
167
110
 
168
- LLMHelpfulnessEvaluator = criteria_evaluator_factory(
169
- class_name="LLMHelpfulnessEvaluator",
111
+ HelpfulnessEvaluator = criteria_evaluator_factory(
112
+ class_name="HelpfulnessEvaluator",
170
113
  criteria="helpful",
171
114
  description="provides useful information",
115
+ default_name="Helpfulness",
172
116
  )
173
117
 
174
118
 
175
- LLMCoherenceEvaluator = criteria_evaluator_factory(
176
- class_name="LLMCoherenceEvaluator",
119
+ CoherenceEvaluator = criteria_evaluator_factory(
120
+ class_name="CoherenceEvaluator",
177
121
  criteria="coherent",
178
- description="is coherent, well-structured, and organized",
122
+ description="is coherent, well-structured, and logically sound",
123
+ default_name="Coherence",
179
124
  )
180
125
 
181
126
 
@@ -266,10 +211,3 @@ class RelevanceEvaluator:
266
211
  formatted_template = self._format_eval_template(example, exp_run)
267
212
  unparsed_response = await self.model._async_generate(formatted_template)
268
213
  return self._parse_eval_output(unparsed_response)
269
-
270
-
271
- # Someday we'll do typing checking in unit tests.
272
- if TYPE_CHECKING:
273
- _: ExperimentEvaluator
274
- _ = JSONParsable()
275
- _ = ContainsKeyword("test")
@@ -458,6 +458,7 @@ def _evaluate_experiment(
458
458
  max_retries=0,
459
459
  exit_on_error=False,
460
460
  fallback_return_value=None,
461
+ tqdm_bar_format=get_tqdm_progress_bar_formatter("running experiment evaluations"),
461
462
  )
462
463
  evaluation_payloads, _execution_details = executor.run(evaluation_inputs)
463
464
  for payload in evaluation_payloads:
@@ -59,6 +59,7 @@ class SpanKind(Enum):
59
59
  embedding = "EMBEDDING"
60
60
  agent = "AGENT"
61
61
  reranker = "RERANKER"
62
+ evaluator = "EVALUATOR"
62
63
  unknown = "UNKNOWN"
63
64
 
64
65
  @classmethod