arize-phoenix 2.5.0__py3-none-any.whl → 2.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {arize_phoenix-2.5.0.dist-info → arize_phoenix-2.7.0.dist-info}/METADATA +1 -1
- {arize_phoenix-2.5.0.dist-info → arize_phoenix-2.7.0.dist-info}/RECORD +24 -23
- phoenix/config.py +32 -7
- phoenix/core/evals.py +53 -0
- phoenix/datasets/fixtures.py +46 -0
- phoenix/experimental/evals/evaluators.py +4 -0
- phoenix/experimental/evals/functions/classify.py +16 -6
- phoenix/experimental/evals/functions/generate.py +6 -3
- phoenix/experimental/evals/models/anthropic.py +3 -4
- phoenix/experimental/evals/models/base.py +1 -0
- phoenix/experimental/evals/models/bedrock.py +4 -2
- phoenix/experimental/evals/models/openai.py +2 -0
- phoenix/experimental/evals/models/vertex.py +6 -0
- phoenix/experimental/evals/templates/default_templates.py +0 -7
- phoenix/server/static/index.js +1 -1
- phoenix/session/evaluation.py +16 -10
- phoenix/session/session.py +19 -0
- phoenix/trace/errors.py +5 -0
- phoenix/trace/span_evaluations.py +46 -61
- phoenix/trace/trace_dataset.py +111 -4
- phoenix/version.py +1 -1
- {arize_phoenix-2.5.0.dist-info → arize_phoenix-2.7.0.dist-info}/WHEEL +0 -0
- {arize_phoenix-2.5.0.dist-info → arize_phoenix-2.7.0.dist-info}/licenses/IP_NOTICE +0 -0
- {arize_phoenix-2.5.0.dist-info → arize_phoenix-2.7.0.dist-info}/licenses/LICENSE +0 -0

{arize_phoenix-2.5.0.dist-info → arize_phoenix-2.7.0.dist-info}/RECORD
CHANGED

@@ -1,13 +1,13 @@
 phoenix/__init__.py,sha256=EEh0vZGRQS8686h34GQ64OjQoZ7neKYO_iO5j6Oa9Jw,1402
-phoenix/config.py,sha256=
+phoenix/config.py,sha256=RbQw8AkVyI4SSo5CD520AjUNcwkDNOGZA6_ErE48R7A,3454
 phoenix/datetime_utils.py,sha256=D955QLrkgrrSdUM6NyqbCeAu2SMsjhR5rHVQEsVUdng,2773
 phoenix/exceptions.py,sha256=igIWGAg3m8jm5YwQDeCY1p8ml_60A7zaGVXJ1yZhY9s,44
 phoenix/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
 phoenix/services.py,sha256=f6AeyKTuOpy9RCcTCjVH3gx5nYZhbTMFOuv1WSUOB5o,4992
-phoenix/version.py,sha256=
+phoenix/version.py,sha256=EtKWW0Hnl5oWglRNH0HZigvcDT2FEs58ek8buJdwW1E,22
 phoenix/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 phoenix/core/embedding_dimension.py,sha256=zKGbcvwOXgLf-yrJBpQyKtd-LEOPRKHnUToyAU8Owis,87
-phoenix/core/evals.py,sha256=
+phoenix/core/evals.py,sha256=gJyqQzpud5YjtoY8h4pgXvHDsdubGfqmEewLuZHPPmQ,10224
 phoenix/core/model.py,sha256=vQ6RxpUPlncezJvur5u6xBN0Lkrk2gW0cTyb-qqaSqA,4713
 phoenix/core/model_schema.py,sha256=rR9VdhL_oXxbprDTPQJBXs5hw5sMPQmzx__m6Kwsxug,50394
 phoenix/core/model_schema_adapter.py,sha256=3GkyzqUST4fYi-Bgs8qAam5hwMCdQRZTDLjZ9Bnzdm4,8268
@@ -15,29 +15,29 @@ phoenix/core/traces.py,sha256=O01L6qwQfHxHUHNZemKBBsAgqDo1tAIO5-1fK2g0NwE,14618
 phoenix/datasets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 phoenix/datasets/dataset.py,sha256=scKVZ7zc6Dpc_ntt-pWhzY-KWqOJEwKePuyNnKSVTGE,30515
 phoenix/datasets/errors.py,sha256=cGp9vxnw4SewFoWBV3ZGMkhE0Kh73lPIv3Ppz_H_RoA,8261
-phoenix/datasets/fixtures.py,sha256=
+phoenix/datasets/fixtures.py,sha256=rGnVnufPvt25cyrlat0vKKtlu08olOuZvbp7EnR33aU,20668
 phoenix/datasets/schema.py,sha256=bF1d2Md6NyqQZuC4Ym5A52f2_IcazkyxGFZ11HPqSg0,6668
 phoenix/datasets/validation.py,sha256=dZ9lCFUV0EY7HCkQkQBrs-GLAEIZdpOqUxwD5l4dp88,8294
 phoenix/experimental/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 phoenix/experimental/evals/__init__.py,sha256=q96YKLMt2GJD9zL8sjugvWx1INfw40Wa7E9OsHo2S4s,1885
-phoenix/experimental/evals/evaluators.py,sha256=
+phoenix/experimental/evals/evaluators.py,sha256=r7fXrS-l4gn58SUhLAZSfY3P8lxysouSVJwHddrZJ_Q,15956
 phoenix/experimental/evals/retrievals.py,sha256=o3fqrsYbYZjyGj_jWkN_9VQVyXjLkDKDw5Ws7l8bwdI,3828
 phoenix/experimental/evals/functions/__init__.py,sha256=NNd0-_cmIopdV7vm3rspjfgM726qoQJ4DPq_vqbnaxQ,180
-phoenix/experimental/evals/functions/classify.py,sha256=
+phoenix/experimental/evals/functions/classify.py,sha256=A-seuYrwiNFdc4IK9WJkQVKY78YdBHxaCMSDPL4_SXE,19523
 phoenix/experimental/evals/functions/executor.py,sha256=bM7PI2rcPukQQzZ2rWqN_-Kfo_a935YJj0bh1Red8Ps,13406
-phoenix/experimental/evals/functions/generate.py,sha256=
+phoenix/experimental/evals/functions/generate.py,sha256=8LnnPAjBM9yxitdkaGZ67OabuDTOWBF3fvinJ_uCFRg,5584
 phoenix/experimental/evals/functions/processing.py,sha256=F4xtLsulLV4a8CkuLldRddsCim75dSTIShEJUYN6I6w,1823
 phoenix/experimental/evals/models/__init__.py,sha256=j1N7DhiOPbcaemtVBONcQ0miNnGQwEXz4u3P3Vwe6-4,320
-phoenix/experimental/evals/models/anthropic.py,sha256=
-phoenix/experimental/evals/models/base.py,sha256=
-phoenix/experimental/evals/models/bedrock.py,sha256=
+phoenix/experimental/evals/models/anthropic.py,sha256=VRYYbZr8ZFvC-19VxScMNux_Yp_9DzSRXiSmWUuhlOc,6309
+phoenix/experimental/evals/models/base.py,sha256=z8xB18s6JI_Weihq2yG22Rte2RBde_cdHq9rINAXHYw,8086
+phoenix/experimental/evals/models/bedrock.py,sha256=VrLNifBxmgHVMFqp6j9d1aGQIvDDuw8yjBM8CdIZCH4,8009
 phoenix/experimental/evals/models/litellm.py,sha256=YvlYeAV-gG0IxFoVJ_OuRYwVwQ0LEtYBuWmp-uPGrNU,4368
-phoenix/experimental/evals/models/openai.py,sha256=
+phoenix/experimental/evals/models/openai.py,sha256=Yht-AZDq2iiwMUlkG3ghv3tCxZY8p-L7xxhSeGPtfaM,17238
 phoenix/experimental/evals/models/rate_limiters.py,sha256=5GVN0RQKt36Przg3-9jLgocRmyg-tbeO-cdbuLIx89w,10160
-phoenix/experimental/evals/models/vertex.py,sha256=
+phoenix/experimental/evals/models/vertex.py,sha256=52A1g8j54_VkahjQmLj0eguPKJdQj0xtI4dAlrLsgtY,6592
 phoenix/experimental/evals/models/vertexai.py,sha256=NfBpQq0l7XzP-wDEDsK27IRiQBzA1GXEdfwlAf8leX4,5609
 phoenix/experimental/evals/templates/__init__.py,sha256=GSJSoWJ4jwyoUANniidmWMUtXQhNQYbTJbfFqCvuYuo,1470
-phoenix/experimental/evals/templates/default_templates.py,sha256=
+phoenix/experimental/evals/templates/default_templates.py,sha256=dVKmoLwqgAyGcRuezz9WKnXSHhw7-qk1R8j6wSmqh0s,20722
 phoenix/experimental/evals/templates/template.py,sha256=ImFSaTPo9oalPNwq7cNdOCndrvuwLuIyIFKsgDVcoJE,6715
 phoenix/experimental/evals/utils/__init__.py,sha256=608EX7sG0f5oDG__II16J8xnFJiNpY9dI9AC8vXwR00,5601
 phoenix/experimental/evals/utils/threads.py,sha256=ksI-egarPnlxit0qKKjtjZ2L82qGLxqxZ6s92O0eBA4,1005
@@ -125,24 +125,25 @@ phoenix/server/static/apple-touch-icon-76x76.png,sha256=CT_xT12I0u2i0WU8JzBZBuOQ
 phoenix/server/static/apple-touch-icon.png,sha256=fOfpjqGpWYbJ0eAurKsyoZP1EAs6ZVooBJ_SGk2ZkDs,3801
 phoenix/server/static/favicon.ico,sha256=bY0vvCKRftemZfPShwZtE93DiiQdaYaozkPGwNFr6H8,34494
 phoenix/server/static/index.css,sha256=KKGpx4iwF91VGRm0YN-4cn8oC-oIqC6HecoPf0x3ZM8,1885
-phoenix/server/static/index.js,sha256=
+phoenix/server/static/index.js,sha256=4MEBiTUm4u7QrSnPE7OJrBEYSkFjmyZPugfrowtQOCI,3259882
 phoenix/server/static/modernizr.js,sha256=mvK-XtkNqjOral-QvzoqsyOMECXIMu5BQwSVN_wcU9c,2564
 phoenix/server/templates/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 phoenix/server/templates/index.html,sha256=DlfcGoq1V5C2QkJWqP1j4Nu6_kPfsOzOrtzYF3ogghE,1900
 phoenix/session/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-phoenix/session/evaluation.py,sha256=
-phoenix/session/session.py,sha256=
+phoenix/session/evaluation.py,sha256=DaAtA0XYJbXRJO_StGywa-9APlz2ORSmCXzxrtn3rvI,4997
+phoenix/session/session.py,sha256=94hRilOwlEWo6npLNjutaYRCevDPLPnAQdnuP07qeGc,20826
 phoenix/trace/__init__.py,sha256=4d_MqzUIFmlY9WWcFeTONJ4xL5mPGoWZaPM2TJ0ZDBQ,266
+phoenix/trace/errors.py,sha256=DbXSJnNErV7305tKv7pUWLD6jcVHJ6EBdSu4mZJ6IM4,112
 phoenix/trace/evaluation_conventions.py,sha256=t8jydM3U0-T5YpiQKRJ3tWdWGlHtzKyttYdw-ddvPOk,1048
 phoenix/trace/exporter.py,sha256=z3xrGJhIRh7XMy4Q1FkR3KmFZym-GX0XxLTZ6eSnN0Q,4347
 phoenix/trace/fixtures.py,sha256=GGNOVi8Cjj9eduxOenyYLF8mhl-XTbXHtnraP5vLlxQ,6341
 phoenix/trace/otel.py,sha256=Efc6S0IuvI-NEJ_Mv1VWEzQS94-lR_6nJ3ecTzwmyQ4,13933
 phoenix/trace/schemas.py,sha256=m1wVlYFT6qL3FovD3TtTYsEgN6OHvv52gNdJkoPCmuY,5400
 phoenix/trace/semantic_conventions.py,sha256=u6NG85ZhbreriZr8cqJaddldM_jUcew7JilszY7JUk8,4652
-phoenix/trace/span_evaluations.py,sha256=
+phoenix/trace/span_evaluations.py,sha256=asGug9lUHUufBwK1nL_PnHIDKsOc5X4ws7cur9lfoyI,12421
 phoenix/trace/span_json_decoder.py,sha256=Xv-0uCsHgwzQb0dqTa7CuuDeXAPaXjQICyCFK3ZQaSs,3089
 phoenix/trace/span_json_encoder.py,sha256=C5y7rkyOcV08oJC5t8TZqVxsKCZMJKad7bBQzAgLoDs,1763
-phoenix/trace/trace_dataset.py,sha256=
+phoenix/trace/trace_dataset.py,sha256=KW0TzmhlKuX8PUPLV172iTK08myYE0QXUC75KiIqJ7k,13204
 phoenix/trace/tracer.py,sha256=S8UfhI4Qhl_uulD9bj9qFdSB5vwcB42hXd8-qURGcmo,3662
 phoenix/trace/utils.py,sha256=7LurVGXn245cjj4MJsc7v6jq4DSJkpK6YGBfIaSywuw,1307
 phoenix/trace/dsl/__init__.py,sha256=WIQIjJg362XD3s50OsPJJ0xbDsGp41bSv7vDllLrPuA,144
@@ -165,8 +166,8 @@ phoenix/trace/v1/evaluation_pb2.pyi,sha256=cCbbx06gwQmaH14s3J1X25TtaARh-k1abbxQd
 phoenix/utilities/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 phoenix/utilities/error_handling.py,sha256=7b5rpGFj9EWZ8yrZK1IHvxB89suWk3lggDayUQcvZds,1946
 phoenix/utilities/logging.py,sha256=lDXd6EGaamBNcQxL4vP1au9-i_SXe0OraUDiJOcszSw,222
-arize_phoenix-2.
-arize_phoenix-2.
-arize_phoenix-2.
-arize_phoenix-2.
-arize_phoenix-2.
+arize_phoenix-2.7.0.dist-info/METADATA,sha256=G2XhPSpRh7gJHrTc5_MhOvrpFBTWv0_mjb_mZueDuWI,26479
+arize_phoenix-2.7.0.dist-info/WHEEL,sha256=mRYSEL3Ih6g5a_CVMIcwiF__0Ae4_gLYh01YFNwiq1k,87
+arize_phoenix-2.7.0.dist-info/licenses/IP_NOTICE,sha256=JBqyyCYYxGDfzQ0TtsQgjts41IJoa-hiwDrBjCb9gHM,469
+arize_phoenix-2.7.0.dist-info/licenses/LICENSE,sha256=HFkW9REuMOkvKRACuwLPT0hRydHb3zNg-fdFt94td18,3794
+arize_phoenix-2.7.0.dist-info/RECORD,,
phoenix/config.py
CHANGED

@@ -12,6 +12,11 @@ ENV_PHOENIX_COLLECTOR_ENDPOINT = "PHOENIX_COLLECTOR_ENDPOINT"
 The endpoint traces and evals are sent to. This must be set if the Phoenix
 server is running on a remote instance.
 """
+ENV_WORKING_DIR = "PHOENIX_WORKING_DIR"
+"""
+The directory in which to save, load, and export datasets. This directory must
+be accessible by both the Phoenix server and the notebook environment.
+"""
 
 
 def _get_temp_path() -> Path:
@@ -36,13 +41,16 @@ def get_running_pid() -> Optional[int]:
     return None
 
 
-
-
-
-
-
-
-
+def get_working_dir() -> Path:
+    """
+    Get the working directory for saving, loading, and exporting datasets.
+    """
+    working_dir_str = os.getenv(ENV_WORKING_DIR)
+    if working_dir_str is not None:
+        return Path(working_dir_str)
+    # Fall back to ~/.phoenix if PHOENIX_WORKING_DIR is not set
+    return Path.home().resolve() / ".phoenix"
+
 
 PHOENIX_DIR = Path(__file__).resolve().parent
 # Server config
@@ -53,6 +61,23 @@ HOST = "0.0.0.0"
 PORT = 6006
 # The prefix of datasets that are auto-assigned a name
 GENERATED_DATASET_NAME_PREFIX = "phoenix_dataset_"
+# The work directory for saving, loading, and exporting datasets
+WORKING_DIR = get_working_dir()
+
+try:
+    for path in (
+        ROOT_DIR := WORKING_DIR,
+        EXPORT_DIR := ROOT_DIR / "exports",
+        DATASET_DIR := ROOT_DIR / "datasets",
+        TRACE_DATASET_DIR := ROOT_DIR / "trace_datasets",
+    ):
+        path.mkdir(parents=True, exist_ok=True)
+except Exception as e:
+    print(
+        f"⚠️ Failed to initialize the working directory at {WORKING_DIR} due to an error: {str(e)}"
+    )
+    print("⚠️ While phoenix will still run, you will not be able to save, load, or export data")
+    print("ℹ️ To change, set the `{ENV_WORKING_DIR}` environment variable before importing phoenix.")
 
 
 def get_exported_files(directory: Path) -> List[Path]:
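
Because the subdirectories are created at import time, the override has to happen before phoenix is first imported. A minimal sketch (the path is illustrative):

    import os

    # PHOENIX_WORKING_DIR must be set before phoenix is imported, because the
    # exports/, datasets/, and trace_datasets/ subdirectories are created at
    # import time. The path below is illustrative.
    os.environ["PHOENIX_WORKING_DIR"] = "/mnt/shared/phoenix"

    from phoenix.config import WORKING_DIR  # noqa: E402

    print(WORKING_DIR)  # /mnt/shared/phoenix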
phoenix/core/evals.py
CHANGED

@@ -9,10 +9,12 @@ from typing import DefaultDict, Dict, List, Optional, Set, Tuple
 
 import numpy as np
 from google.protobuf.json_format import MessageToDict
+from pandas import DataFrame, Index, MultiIndex
 from typing_extensions import TypeAlias, assert_never
 
 import phoenix.trace.v1 as pb
 from phoenix.trace.schemas import SpanID, TraceID
+from phoenix.trace.span_evaluations import DocumentEvaluations, Evaluations, SpanEvaluations
 
 logger = logging.getLogger(__name__)
 logger.addHandler(logging.NullHandler())
@@ -171,3 +173,54 @@ class Evals:
             if result.HasField("score") and document_position < num_documents:
                 scores[document_position] = result.score.value
         return scores
+
+    def export_evaluations(self) -> List[Evaluations]:
+        evaluations: List[Evaluations] = []
+        evaluations.extend(self._export_span_evaluations())
+        evaluations.extend(self._export_document_evaluations())
+        return evaluations
+
+    def _export_span_evaluations(self) -> List[SpanEvaluations]:
+        span_evaluations = []
+        with self._lock:
+            span_evaluations_by_name = tuple(self._span_evaluations_by_name.items())
+        for eval_name, _span_evaluations_by_id in span_evaluations_by_name:
+            span_ids = []
+            rows = []
+            with self._lock:
+                span_evaluations_by_id = tuple(_span_evaluations_by_id.items())
+            for span_id, pb_eval in span_evaluations_by_id:
+                span_ids.append(span_id)
+                rows.append(MessageToDict(pb_eval.result))
+            dataframe = DataFrame(rows, index=Index(span_ids, name="context.span_id"))
+            span_evaluations.append(SpanEvaluations(eval_name, dataframe))
+        return span_evaluations
+
+    def _export_document_evaluations(self) -> List[DocumentEvaluations]:
+        evaluations = []
+        with self._lock:
+            document_evaluations_by_name = tuple(self._document_evaluations_by_name.items())
+        for eval_name, _document_evaluations_by_id in document_evaluations_by_name:
+            span_ids = []
+            document_positions = []
+            rows = []
+            with self._lock:
+                document_evaluations_by_id = tuple(_document_evaluations_by_id.items())
+            for span_id, _document_evaluations_by_position in document_evaluations_by_id:
+                with self._lock:
+                    document_evaluations_by_position = sorted(
+                        _document_evaluations_by_position.items()
+                    )  # ensure the evals are sorted by document position
+                for document_position, pb_eval in document_evaluations_by_position:
+                    span_ids.append(span_id)
+                    document_positions.append(document_position)
+                    rows.append(MessageToDict(pb_eval.result))
+            dataframe = DataFrame(
+                rows,
+                index=MultiIndex.from_arrays(
+                    (span_ids, document_positions),
+                    names=("context.span_id", "document_position"),
+                ),
+            )
+            evaluations.append(DocumentEvaluations(eval_name, dataframe))
+        return evaluations
phoenix/datasets/fixtures.py
CHANGED

@@ -240,6 +240,51 @@ click_through_rate_fixture = Fixture(
     reference_file_name="click_through_rate_train.parquet",
 )
 
+chatbot_queries_schema = Schema(
+    prediction_id_column_name="id",
+    prompt_column_names=RetrievalEmbeddingColumnNames(
+        vector_column_name="prompt",
+        raw_data_column_name="prompt_text",
+        context_retrieval_ids_column_name="document_ids",
+        context_retrieval_scores_column_name="document_scores",
+    ),
+    response_column_names="response",
+    tag_column_names=[
+        "answer_relevancy",
+        "context_relevancy",
+        "faithfulness",
+        "document_similarity_0",
+        "document_similarity_1",
+        "openai_relevance_0",
+        "openai_relevance_1",
+        "user_feedback",
+    ],
+)
+
+chatbot_database_schema = Schema(
+    prediction_id_column_name="document_id",
+    prompt_column_names=EmbeddingColumnNames(
+        vector_column_name="text_vector",
+        raw_data_column_name="text",
+    ),
+)
+
+chatbot_fixture = Fixture(
+    name="chatbot",
+    description="""
+    Investigate RAG performance for a chatbot built on top of Arize's documentation.
+    This use-case highlights how embedding visualizations for a RAG application can
+    highlight issues with the application's retrieval and performance.
+
+    The data contains relevance metrics generated by LLM Evals as well as RAGAS.
+    """,
+    primary_schema=chatbot_queries_schema,
+    corpus_schema=chatbot_database_schema,
+    prefix="unstructured/llm/chatbot",
+    primary_file_name="chatbot_queries_with_ragas.parquet",
+    corpus_file_name="chatbot_database_ds.parquet",
+)
+
 wide_data_primary_schema = Schema(
     actual_label_column_name="actual_label",
     prediction_label_column_name="predicted_label",
@@ -363,6 +408,7 @@ FIXTURES: Tuple[Fixture, ...] = (
     deep_data_fixture,
     llm_summarization_fixture,
     wikipedia_fixture,
+    chatbot_fixture,
 )
 NAME_TO_FIXTURE = {fixture.name: fixture for fixture in FIXTURES}
 
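
A sketch of how the new fixture might be pulled up, assuming the load_example helper exposed by Phoenix releases of this era resolves fixture names through NAME_TO_FIXTURE and returns the fixture's primary and corpus datasets:

    import phoenix as px

    # "chatbot" is the fixture name registered above; fetching the two
    # parquet files requires network access.
    datasets = px.load_example("chatbot")
    session = px.launch_app(primary=datasets.primary, corpus=datasets.corpus)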
phoenix/experimental/evals/functions/classify.py
CHANGED

@@ -73,7 +73,7 @@ def llm_classify(
     include_prompt: bool = False,
     include_response: bool = False,
     run_sync: bool = False,
-    concurrency: int =
+    concurrency: Optional[int] = None,
 ) -> pd.DataFrame:
     """Classifies each input row of the dataframe using an LLM. Returns a pandas.DataFrame
     where the first column is named `label` and contains the classification labels. An optional
@@ -116,8 +116,9 @@ def llm_classify(
         run_sync (bool, default=False): If True, forces synchronous request submission. Otherwise
             evaluations will be run asynchronously if possible.
 
-        concurrency (int, default=
-            possible.
+        concurrency (Optional[int], default=None): The number of concurrent evals if async
+            submission is possible. If not provided, a recommended default concurrency is set on a
+            per-model basis.
 
     Returns:
         pandas.DataFrame: A dataframe where the `label` column (at column position 0) contains
@@ -127,6 +128,7 @@ def llm_classify(
         from the entries in the rails argument or "NOT_PARSABLE" if the model's output could
         not be parsed.
     """
+    concurrency = concurrency or model.default_concurrency
     # clients need to be reloaded to ensure that async evals work properly
    model.reload_client()
 
@@ -353,7 +355,7 @@ def run_evals(
     provide_explanation: bool = False,
     use_function_calling_if_available: bool = True,
     verbose: bool = False,
-    concurrency: int =
+    concurrency: Optional[int] = None,
 ) -> List[DataFrame]:
     """
     Applies a list of evaluators to a dataframe. Outputs a list of dataframes in
@@ -381,13 +383,21 @@ def run_evals(
         as model invocation parameters and details about retries and snapping to
         rails.
 
-        concurrency (int,
-            submission is possible.
+        concurrency (Optional[int], default=None): The number of concurrent evals if async
+            submission is possible. If not provided, a recommended default concurrency is set on a
+            per-model basis.
 
     Returns:
         List[DataFrame]: A list of dataframes, one for each evaluator, all of
         which have the same number of rows as the input dataframe.
     """
+    # use the minimum default concurrency of all the models
+    if concurrency is None:
+        if len(evaluators) == 0:
+            concurrency = 1
+        else:
+            concurrency = min(evaluator.default_concurrency for evaluator in evaluators)
+
     # clients need to be reloaded to ensure that async evals work properly
     for evaluator in evaluators:
         evaluator.reload_client()
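
A sketch of the resulting calling convention; the toy template and the OpenAIModel model_name parameter are assumptions for illustration, and an OPENAI_API_KEY is required in the environment:

    import pandas as pd
    from phoenix.experimental.evals import OpenAIModel, llm_classify

    # A toy template; the dataframe must supply a column for each template variable.
    template = (
        "Is the following statement true of the physical world? "
        "Answer 'factual' or 'hallucinated'.\nStatement: {statement}"
    )
    df = pd.DataFrame({"statement": ["The earth orbits the sun."]})

    # concurrency is omitted, so model.default_concurrency is used.
    labels = llm_classify(
        dataframe=df,
        model=OpenAIModel(model_name="gpt-4"),
        template=template,
        rails=["factual", "hallucinated"],
    )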
phoenix/experimental/evals/functions/generate.py
CHANGED

@@ -31,7 +31,7 @@ def llm_generate(
     include_prompt: bool = False,
     include_response: bool = False,
     run_sync: bool = False,
-    concurrency: int =
+    concurrency: Optional[int] = None,
 ) -> pd.DataFrame:
     """
     Generates a text using a template using an LLM. This function is useful
@@ -70,14 +70,17 @@ def llm_generate(
         run_sync (bool, default=False): If True, forces synchronous request submission. Otherwise
             evaluations will be run asynchronously if possible.
 
-        concurrency (int, default=
-            possible.
+        concurrency (Optional[int], default=None): The number of concurrent evals if async
+            submission is possible. If not provided, a recommended default concurrency is set on a
+            per-model basis.
 
     Returns:
         generations_dataframe (pandas.DataFrame): A dataframe where each row
             represents the generated output
 
     """
+    concurrency = concurrency or model.default_concurrency
+
     # clients need to be reloaded to ensure that async evals work properly
     model.reload_client()
 
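
The same fallback applies to generation; a short sketch under the same assumptions as the llm_classify example above:

    import pandas as pd
    from phoenix.experimental.evals import OpenAIModel, llm_generate

    df = pd.DataFrame({"topic": ["tracing", "evals"]})

    # concurrency is omitted, so model.default_concurrency is used here too.
    generations = llm_generate(
        dataframe=df,
        template="Write one sentence about {topic}.",
        model=OpenAIModel(model_name="gpt-4"),
    )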
phoenix/experimental/evals/models/anthropic.py
CHANGED

@@ -1,4 +1,3 @@
-import logging
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, Dict, List, Optional
 
@@ -8,8 +7,6 @@ from phoenix.experimental.evals.models.rate_limiters import RateLimiter
 if TYPE_CHECKING:
     from tiktoken import Encoding
 
-logger = logging.getLogger(__name__)
-
 MODEL_TOKEN_LIMIT_MAPPING = {
     "claude-2.1": 200000,
     "claude-2.0": 100000,
@@ -80,7 +77,6 @@ class AnthropicModel(BaseEvalModel):
         try:
             encoding = self._tiktoken.encoding_for_model(self.model)
         except KeyError:
-            logger.warning("Warning: model not found. Using cl100k_base encoding.")
             encoding = self._tiktoken.get_encoding("cl100k_base")
         self._tiktoken_encoding = encoding
 
@@ -149,6 +145,9 @@ class AnthropicModel(BaseEvalModel):
         return _completion_with_retry(**kwargs)
 
     async def _async_generate(self, prompt: str, **kwargs: Dict[str, Any]) -> str:
+        # instruction is an invalid input to Anthropic models, it is passed in by
+        # BaseEvalModel.__call__ and needs to be removed
+        kwargs.pop("instruction", None)
         invocation_parameters = self.invocation_parameters()
         invocation_parameters.update(kwargs)
         response = await self._async_generate_with_retry(
phoenix/experimental/evals/models/bedrock.py
CHANGED

@@ -87,7 +87,6 @@ class BedrockModel(BaseEvalModel):
         try:
             encoding = self._tiktoken.encoding_for_model(self.model_id)
         except KeyError:
-            logger.warning("Warning: model not found. Using cl100k_base encoding.")
             encoding = self._tiktoken.get_encoding("cl100k_base")
         self._tiktoken_encoding = encoding
 
@@ -165,7 +164,7 @@ class BedrockModel(BaseEvalModel):
                 "temperature": self.temperature,
                 "topP": self.top_p,
                 "maxTokens": self.max_tokens,
-                "stopSequences":
+                "stopSequences": self.stop_sequences,
             },
             **self.extra_parameters,
         }
@@ -204,6 +203,9 @@ class BedrockModel(BaseEvalModel):
         elif self.model_id.startswith("anthropic"):
             body = json.loads(response.get("body").read().decode())
             return body.get("completion")
+        elif self.model_id.startswith("amazon"):
+            body = json.loads(response.get("body").read())
+            return body.get("results")[0].get("outputText")
         else:
             body = json.loads(response.get("body").read())
             return body.get("results")[0].get("data").get("outputText")
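
The new branch targets Amazon Titan text models, whose response bodies carry completions under results[0].outputText. A hand-written illustration of the payload shape (not captured output):

    import json

    # Illustrative Titan-style payload; mirrors the parsing in the new
    # "amazon" branch above.
    raw_body = json.dumps({"results": [{"outputText": "The answer is relevant."}]})
    body = json.loads(raw_body)
    print(body.get("results")[0].get("outputText"))  # The answer is relevant.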
phoenix/experimental/evals/models/openai.py
CHANGED

@@ -31,6 +31,8 @@ MODEL_TOKEN_LIMIT_MAPPING = {
     "gpt-4-0613": 8192,  # Current gpt-4 default
     "gpt-4-32k-0314": 32768,
     "gpt-4-32k-0613": 32768,
+    "gpt-4-1106-preview": 128000,
+    "gpt-4-vision-preview": 128000,
 }
 LEGACY_COMPLETION_API_MODELS = ("gpt-3.5-turbo-instruct",)
 logger = logging.getLogger(__name__)
phoenix/experimental/evals/models/vertex.py
CHANGED

@@ -21,6 +21,9 @@ MODEL_TOKEN_LIMIT_MAPPING = {
 
 @dataclass
 class GeminiModel(BaseEvalModel):
+    # The vertex SDK runs into connection pool limits at high concurrency
+    default_concurrency: int = 5
+
     model: str = "gemini-pro"
     """The model name to use."""
     temperature: float = 0.0
@@ -50,6 +53,9 @@ class GeminiModel(BaseEvalModel):
         max_retries=self.max_retries,
     )
 
+    def reload_client(self) -> None:
+        self._init_client()
+
     def _init_client(self) -> None:
         try:
             from google.api_core import exceptions  # type:ignore
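
Combined with the run_evals change in classify.py, the lower Gemini ceiling propagates automatically: with concurrency unset, run_evals takes the minimum default_concurrency across its evaluators. A sketch, where the evaluator class, its expected column names, and the GCP credentials GeminiModel needs are assumptions:

    import pandas as pd
    from phoenix.experimental.evals import GeminiModel, HallucinationEvaluator, run_evals

    # Column names are assumptions about what HallucinationEvaluator expects.
    df = pd.DataFrame(
        {
            "input": ["What is Phoenix?"],
            "reference": ["Phoenix is an ML observability library."],
            "output": ["Phoenix is a database."],
        }
    )

    model = GeminiModel()  # default_concurrency == 5, per the change above

    # With concurrency=None, run_evals uses
    # min(evaluator.default_concurrency for evaluator in evaluators) == 5.
    (hallucination_df,) = run_evals(dataframe=df, evaluators=[HallucinationEvaluator(model)])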
phoenix/experimental/evals/templates/default_templates.py
CHANGED

@@ -73,13 +73,6 @@ your response.
 [END DATA]
 
 Is the answer above factual or hallucinated based on the query and reference text?
-
-Your response should be a single word: either "factual" or "hallucinated", and
-it should not include any other text or characters. "hallucinated" indicates that the answer
-provides factually inaccurate information to the query based on the reference text. "factual"
-indicates that the answer to the question is correct relative to the reference text, and does not
-contain made up information. Please read the query and reference text carefully before determining
-your response.
 """
 HALLUCINATION_PROMPT_TEMPLATE_WITH_EXPLANATION = """
 In this task, you will be presented with a query, a reference text and an answer. The answer is
phoenix/server/static/index.js
CHANGED

@@ -6717,7 +6717,7 @@ fragment SpanEvaluationsTable_evals on Span {
   gap: var(--ac-global-dimension-static-size-200);
 `,children:i.map((o,l)=>x("li",{children:_(ft,{padding:"size-200",backgroundColor:"purple-100",borderColor:"purple-700",borderWidth:"thin",borderRadius:"medium",children:[x(Me,{color:"text-700",fontStyle:"italic",children:"embedded text"}),x("pre",{css:ee`
   margin: var(--ac-global-dimension-static-size-100) 0;
-`,children:o[mtt]})]})},l))})}):null})}function Xxn(t){let{spanAttributes:e}=t,n=(0,br.useMemo)(()=>{let l=e[wr.tool];return typeof l=="object"?l:{}},[e]);if(!(Object.keys(n).length>0))return null;let r=n[vB.name],a=n[vB.description],o=n[vB.parameters];return x(Be,{direction:"column",gap:"size-200",children:x(uu,{title:"Tool"+(typeof r=="string"?`: ${r}`:""),...eg,children:_(Be,{direction:"column",children:[a!=null?x(ft,{paddingStart:"size-200",paddingEnd:"size-200",paddingTop:"size-100",paddingBottom:"size-100",borderBottomColor:"dark",borderBottomWidth:"thin",backgroundColor:"light",children:_(Be,{direction:"column",alignItems:"start",gap:"size-50",children:[x(Me,{color:"text-700",fontStyle:"italic",children:"Description"}),x(Me,{children:a})]})}):null,o!=null?x(ft,{paddingStart:"size-200",paddingEnd:"size-200",paddingTop:"size-100",paddingBottom:"size-100",borderBottomColor:"dark",borderBottomWidth:"thin",children:_(Be,{direction:"column",alignItems:"start",width:"100%",children:[x(Me,{color:"text-700",fontStyle:"italic",children:"Parameters"}),x(Tc,{value:JSON.stringify(o),mimeType:"json"})]})}):null]})})})}var Sxn=["irrelevant"];function Gse({document:t,documentEvaluations:e,backgroundColor:n,borderColor:i,labelColor:r}){let a=t[htt],o=e&&e.length;return x(ft,{borderRadius:"medium",backgroundColor:n,borderColor:i,borderWidth:"thin",children:_(Be,{direction:"column",children:[x(ft,{width:"100%",borderBottomWidth:"thin",borderBottomColor:i,children:_(Be,{direction:"row",justifyContent:"space-between",margin:"size-200",alignItems:"center",children:[_(Be,{direction:"row",gap:"size-50",alignItems:"center",children:[x(pt,{svg:x(Et.FileOutline,{})}),_(Nn,{level:4,children:["document ",t[Itt]]})]}),typeof t[Wse]=="number"&&x(Zs,{color:r,children:`score ${mh(t[Wse])}`})]})}),x("pre",{css:ee`
+`,children:o[mtt]})]})},l))})}):null})}function Xxn(t){let{spanAttributes:e}=t,n=(0,br.useMemo)(()=>{let l=e[wr.tool];return typeof l=="object"?l:{}},[e]);if(!(Object.keys(n).length>0))return null;let r=n[vB.name],a=n[vB.description],o=n[vB.parameters];return x(Be,{direction:"column",gap:"size-200",children:x(uu,{title:"Tool"+(typeof r=="string"?`: ${r}`:""),...eg,children:_(Be,{direction:"column",children:[a!=null?x(ft,{paddingStart:"size-200",paddingEnd:"size-200",paddingTop:"size-100",paddingBottom:"size-100",borderBottomColor:"dark",borderBottomWidth:"thin",backgroundColor:"light",children:_(Be,{direction:"column",alignItems:"start",gap:"size-50",children:[x(Me,{color:"text-700",fontStyle:"italic",children:"Description"}),x(Me,{children:a})]})}):null,o!=null?x(ft,{paddingStart:"size-200",paddingEnd:"size-200",paddingTop:"size-100",paddingBottom:"size-100",borderBottomColor:"dark",borderBottomWidth:"thin",children:_(Be,{direction:"column",alignItems:"start",width:"100%",children:[x(Me,{color:"text-700",fontStyle:"italic",children:"Parameters"}),x(Tc,{value:JSON.stringify(o),mimeType:"json"})]})}):null]})})})}var Sxn=["irrelevant","unrelated"];function Gse({document:t,documentEvaluations:e,backgroundColor:n,borderColor:i,labelColor:r}){let a=t[htt],o=e&&e.length;return x(ft,{borderRadius:"medium",backgroundColor:n,borderColor:i,borderWidth:"thin",children:_(Be,{direction:"column",children:[x(ft,{width:"100%",borderBottomWidth:"thin",borderBottomColor:i,children:_(Be,{direction:"row",justifyContent:"space-between",margin:"size-200",alignItems:"center",children:[_(Be,{direction:"row",gap:"size-50",alignItems:"center",children:[x(pt,{svg:x(Et.FileOutline,{})}),_(Nn,{level:4,children:["document ",t[Itt]]})]}),typeof t[Wse]=="number"&&x(Zs,{color:r,children:`score ${mh(t[Wse])}`})]})}),x("pre",{css:ee`
   padding: var(--ac-global-dimension-static-size-200);
   white-space: normal;
   margin: 0;
phoenix/session/evaluation.py
CHANGED

@@ -9,6 +9,7 @@ import math
 from time import sleep
 from typing import (
     Any,
+    Iterator,
     Optional,
     Sequence,
     Tuple,
@@ -33,24 +34,29 @@ __all__ = [
 from phoenix.trace.span_evaluations import Evaluations
 
 
-def
-
-evaluations
-
-
-    index_names = evaluations.index.names
-    for index, row in evaluations.iterrows():
+def encode_evaluations(evaluations: Evaluations) -> Iterator[pb.Evaluation]:
+    dataframe = evaluations.dataframe
+    eval_name = evaluations.eval_name
+    index_names = dataframe.index.names
+    for index, row in dataframe.iterrows():
         subject_id = _extract_subject_id_from_index(
             index_names,
             cast(Union[str, Tuple[Any]], index),
         )
         if (result := _extract_result(row)) is None:
             continue
-
-        name=
+        yield pb.Evaluation(
+            name=eval_name,
             result=result,
             subject_id=subject_id,
         )
+
+
+def add_evaluations(
+    exporter: HttpExporter,
+    evaluations: Evaluations,
+) -> None:
+    for evaluation in encode_evaluations(evaluations):
         exporter.export(evaluation)
 
 
@@ -130,7 +136,7 @@ def log_evaluations(
         return
     exporter = HttpExporter(endpoint=endpoint, host=host, port=port)
     for eval in filter(bool, evals):
-        add_evaluations(exporter, eval
+        add_evaluations(exporter, eval)
     with tqdm(total=n, desc="Sending Evaluations") as pbar:
         while n:
             sleep(0.1)
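
A sketch of the refactored pipeline end to end: build a SpanEvaluations object and ship it to a running Phoenix collector via log_evaluations, which iterates encode_evaluations under the hood (the span ID and scores are made up):

    import pandas as pd
    from phoenix.session.evaluation import log_evaluations
    from phoenix.trace.span_evaluations import SpanEvaluations

    evals_df = pd.DataFrame(
        {"label": ["relevant"], "score": [1.0]},
        index=pd.Index(["7e2f08cb43bbbf29"], name="context.span_id"),
    )

    # Each row is encoded as a pb.Evaluation and exported over HTTP.
    log_evaluations(SpanEvaluations(eval_name="relevance", dataframe=evals_df))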
phoenix/session/session.py
CHANGED

@@ -30,6 +30,7 @@ from phoenix.pointcloud.umap_parameters import get_umap_parameters
 from phoenix.server.app import create_app
 from phoenix.server.thread_server import ThreadServer
 from phoenix.services import AppService
+from phoenix.session.evaluation import encode_evaluations
 from phoenix.trace.dsl import SpanFilter
 from phoenix.trace.dsl.query import SpanQuery
 from phoenix.trace.otel import encode
@@ -46,6 +47,8 @@ logger = logging.getLogger(__name__)
 # type workaround
 # https://github.com/python/mypy/issues/5264#issuecomment-399407428
 if TYPE_CHECKING:
+    from phoenix.trace import Evaluations
+
     _BaseList = UserList[pd.DataFrame]
 else:
     _BaseList = UserList
@@ -123,6 +126,10 @@ class Session(ABC):
                 self.traces.put(encode(span))
 
         self.evals: Evals = Evals()
+        if trace_dataset:
+            for evaluations in trace_dataset.evaluations:
+                for pb_evaluation in encode_evaluations(evaluations):
+                    self.evals.put(pb_evaluation)
 
         self.host = host or get_env_host()
         self.port = port or get_env_port()
@@ -213,6 +220,15 @@ class Session(ABC):
             return None
         return pd.json_normalize(data, max_level=1).set_index("context.span_id", drop=False)
 
+    def get_evaluations(self) -> List["Evaluations"]:
+        return self.evals.export_evaluations()
+
+    def get_trace_dataset(self) -> Optional[TraceDataset]:
+        if (dataframe := self.get_spans_dataframe()) is None:
+            return None
+        evaluations = self.get_evaluations()
+        return TraceDataset(dataframe=dataframe, evaluations=evaluations)
+
 
 _session: Optional[Session] = None
 
@@ -479,6 +495,9 @@ def _get_url(host: str, port: int, notebook_env: NotebookEnvironment) -> str:
     if notebook_env == NotebookEnvironment.DATABRICKS:
         context = _get_databricks_context()
         return f"{_get_databricks_notebook_base_url(context)}/{port}/"
+    if host == "0.0.0.0" or host == "127.0.0.1":
+        # The app is running locally, so use localhost
+        return f"http://localhost:{port}/"
     return f"http://{host}:{port}/"
 
 
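
Together these additions let a user snapshot a running session, evals included. A minimal sketch:

    import phoenix as px

    session = px.active_session()  # assumes an app was launched earlier
    if (trace_ds := session.get_trace_dataset()) is not None:
        dataset_id = trace_ds.save()  # persists spans plus attached evaluations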
phoenix/trace/errors.py
ADDED

phoenix/trace/span_evaluations.py
CHANGED

@@ -4,21 +4,18 @@ from dataclasses import dataclass, field
 from itertools import product
 from pathlib import Path
 from types import MappingProxyType
-from typing import Any, Callable,
+from typing import Any, Callable, List, Mapping, Optional, Sequence, Set, Tuple, Type, Union
 from uuid import UUID, uuid4
 
 import pandas as pd
 from pandas.api.types import is_integer_dtype, is_numeric_dtype, is_string_dtype
-from pyarrow import Table, parquet
+from pyarrow import Schema, Table, parquet
 
 from phoenix.config import TRACE_DATASET_DIR
-from phoenix.
+from phoenix.trace.errors import InvalidParquetMetadataError
 
 EVAL_NAME_COLUMN_PREFIX = "eval."
-
-
-class InvalidParquetMetadataError(PhoenixException):
-    pass
+EVAL_PARQUET_FILE_NAME = "evaluations-{id}.parquet"
 
 
 class NeedsNamedIndex(ABC):
@@ -164,20 +161,21 @@ class Evaluations(NeedsNamedIndex, NeedsResultColumns, ABC):
         tuple(sorted(prod)) for prod in product(*cls.index_names.keys())
     )
 
-    def
-        """
+    def save(self, directory: Optional[Union[str, Path]] = None) -> UUID:
+        """
+        Persists the evaluations to disk.
 
         Args:
             directory (Optional[Union[str, Path]], optional): An optional path
-                to a directory where the
-
+                to a directory where the data will be saved. If not provided, the
+                data will be saved to a default location.
 
         Returns:
-
-
+            UUID: The ID of the evaluations, which can be used as a key to load
+                the evaluations from disk using `load`.
         """
         directory = Path(directory) if directory else TRACE_DATASET_DIR
-        path = directory /
+        path = directory / EVAL_PARQUET_FILE_NAME.format(id=self.id)
         table = Table.from_pandas(self.dataframe)
         table = table.replace_schema_metadata(
             {
@@ -193,23 +191,38 @@ class Evaluations(NeedsNamedIndex, NeedsResultColumns, ABC):
             }
         )
         parquet.write_table(table, path)
-        return
+        return self.id
 
     @classmethod
-    def
-
+    def load(
+        cls, id: Union[str, UUID], directory: Optional[Union[str, Path]] = None
+    ) -> "Evaluations":
+        """
+        Loads the evaluations from disk.
 
         Args:
-
-
+            id (Union[str, UUID]): The ID of the evaluations to load.
+
+            directory(Optional[Union[str, Path]], optional): The path to the
+                directory containing the persisted evaluations. If not provided, the
+                parquet file will be loaded from the same default location used by
+                `save`.
 
         Returns:
             Evaluations: The loaded evaluations. The type of the returned
                 evaluations will be the same as the type of the evaluations that
                 were originally persisted.
         """
+        if not isinstance(id, UUID):
+            id = UUID(id)
+        path = Path(directory or TRACE_DATASET_DIR) / EVAL_PARQUET_FILE_NAME.format(id=id)
         schema = parquet.read_schema(path)
-        eval_id, eval_name, evaluations_cls = _parse_schema_metadata(schema
+        eval_id, eval_name, evaluations_cls = _parse_schema_metadata(schema)
+        if id != eval_id:
+            raise InvalidParquetMetadataError(
+                f"The input id {id} does not match the id {eval_id} in the parquet metadata. "
+                "Ensure that you have not renamed the parquet file."
+            )
         table = parquet.read_table(path)
         dataframe = table.to_pandas()
         evaluations = evaluations_cls(eval_name=eval_name, dataframe=dataframe)
@@ -301,48 +314,20 @@ class TraceEvaluations(
     ...
 
 
-def _parse_schema_metadata(
-    """
-
-
-    Args:
-        metadata (Dict[bytes, Any]): A dictionary of schema metadata from a
-            parquet file.
-
-    Returns:
-        Tuple[str, ModuleType]: The evaluation name and the evaluations class.
+def _parse_schema_metadata(schema: Schema) -> Tuple[UUID, str, Type[Evaluations]]:
+    """
+    Validates and parses the pyarrow schema metadata.
     """
-    if not (arize_metadata_json := metadata.get(b"arize")):
-        raise InvalidParquetMetadataError('Schema metadata is missing "arize" key')
     try:
-
-
+        metadata = schema.metadata
+        arize_metadata = json.loads(metadata[b"arize"])
+        eval_classes = {subclass.__name__: subclass for subclass in Evaluations.__subclasses__()}
+        eval_id = UUID(arize_metadata["eval_id"])
+        if not isinstance((eval_name := arize_metadata["eval_name"]), str):
+            raise ValueError('Arize metadata must contain a string value for key "eval_name"')
+        evaluations_cls = eval_classes[arize_metadata["eval_type"]]
+        return eval_id, eval_name, evaluations_cls
+    except Exception as err:
         raise InvalidParquetMetadataError(
-
+            "An error occurred while parsing parquet schema metadata"
         ) from err
-    evaluations_classes = {subclass.__name__: subclass for subclass in Evaluations.__subclasses__()}
-    if not (
-        isinstance(arize_metadata, dict)
-        and (eval_id := _to_uuid(arize_metadata.get("eval_id")))
-        and isinstance(eval_name := arize_metadata.get("eval_name"), str)
-        and (eval_type := arize_metadata.get("eval_type"))
-        and (evaluations_cls := evaluations_classes.get(eval_type))
-    ):
-        raise InvalidParquetMetadataError(f"Invalid Arize metadata: {arize_metadata}")
-    return eval_id, eval_name, evaluations_cls
-
-
-def _to_uuid(value: Any) -> Optional[UUID]:
-    """
-    Converts an input to a UUID if possible, otherwise returns None.
-
-    Args:
-        value (Any): The value to convert to a UUID.
-
-    Returns:
-        Optional[UUID]: A UUID if the value could be converted, otherwise None.
-    """
-    try:
-        return UUID(value)
-    except Exception:
-        return None
phoenix/trace/trace_dataset.py
CHANGED

@@ -1,14 +1,18 @@
 import json
-import uuid
 from datetime import datetime
-from
+from pathlib import Path
+from typing import Any, Iterable, Iterator, List, Optional, Tuple, Union, cast
+from uuid import UUID, uuid4
+from warnings import warn
 
 import pandas as pd
 from pandas import DataFrame, read_parquet
+from pyarrow import Schema, Table, parquet
 
 from phoenix.datetime_utils import normalize_timestamps
+from phoenix.trace.errors import InvalidParquetMetadataError
 
-from ..config import DATASET_DIR, GENERATED_DATASET_NAME_PREFIX
+from ..config import DATASET_DIR, GENERATED_DATASET_NAME_PREFIX, TRACE_DATASET_DIR
 from .schemas import ATTRIBUTE_PREFIX, CONTEXT_PREFIX, Span
 from .semantic_conventions import (
     DOCUMENT_METADATA,
@@ -43,6 +47,8 @@ DOCUMENT_COLUMNS = [
     RERANKER_OUTPUT_DOCUMENTS_COLUMN_NAME,
 ]
 
+TRACE_DATASET_PARQUET_FILE_NAME = "trace_dataset-{id}.parquet"
+
 
 def normalize_dataframe(dataframe: DataFrame) -> "DataFrame":
     """Makes the dataframe have appropriate data types"""
@@ -94,6 +100,7 @@ class TraceDataset:
     name: str
     dataframe: pd.DataFrame
     evaluations: List[Evaluations] = []
+    _id: UUID = uuid4()
     _data_file_name: str = "data.parquet"
 
     def __init__(
@@ -122,7 +129,7 @@ class TraceDataset:
                 f"The dataframe is missing some required columns: {', '.join(missing_columns)}"
             )
         self.dataframe = normalize_dataframe(dataframe)
-        self.name = name or f"{GENERATED_DATASET_NAME_PREFIX}{str(
+        self.name = name or f"{GENERATED_DATASET_NAME_PREFIX}{str(uuid4())}"
         self.evaluations = list(evaluations)
 
     @classmethod
@@ -199,6 +206,89 @@ class TraceDataset:
             coerce_timestamps="ms",
         )
 
+    def save(self, directory: Optional[Union[str, Path]] = None) -> UUID:
+        """
+        Writes the trace dataset to disk. If any evaluations have been appended
+        to the dataset, those evaluations will be saved to separate files within
+        the same directory.
+
+        Args:
+            directory (Optional[Union[str, Path]], optional): An optional path
+                to a directory where the data will be written. If not provided, the
+                data will be written to a default location.
+
+        Returns:
+            UUID: The id of the trace dataset, which can be used as key to load
+                the dataset from disk using `load`.
+        """
+        directory = Path(directory or TRACE_DATASET_DIR)
+        for evals in self.evaluations:
+            evals.save(directory)
+        path = directory / TRACE_DATASET_PARQUET_FILE_NAME.format(id=self._id)
+        dataframe = get_serializable_spans_dataframe(self.dataframe)
+        dataframe.to_parquet(
+            path,
+            allow_truncated_timestamps=True,
+            coerce_timestamps="ms",
+        )
+        table = Table.from_pandas(self.dataframe)
+        table = table.replace_schema_metadata(
+            {
+                **(table.schema.metadata or {}),
+                # explicitly encode keys and values, which are automatically encoded regardless
+                b"arize": json.dumps(
+                    {
+                        "dataset_id": str(self._id),
+                        "dataset_name": self.name,
+                        "eval_ids": [str(evals.id) for evals in self.evaluations],
+                    }
+                ).encode("utf-8"),
+            }
+        )
+        parquet.write_table(table, path)
+        return self._id
+
+    @classmethod
+    def load(
+        cls, id: Union[str, UUID], directory: Optional[Union[str, Path]] = None
+    ) -> "TraceDataset":
+        """
+        Reads in a trace dataset from disk. Any associated evaluations will
+        automatically be read from disk and attached to the trace dataset.
+
+        Args:
+            id (Union[str, UUID]): The ID of the trace dataset to be loaded.
+
+            directory (Optional[Union[str, Path]], optional): The path to the
+                directory containing the persisted trace dataset parquet file. If
+                not provided, the parquet file will be loaded from the same default
+                location used by `save`.
+
+        Returns:
+            TraceDataset: The loaded trace dataset.
+        """
+        if not isinstance(id, UUID):
+            id = UUID(id)
+        path = Path(directory or TRACE_DATASET_DIR) / TRACE_DATASET_PARQUET_FILE_NAME.format(id=id)
+        schema = parquet.read_schema(path)
+        dataset_id, dataset_name, eval_ids = _parse_schema_metadata(schema)
+        if id != dataset_id:
+            raise InvalidParquetMetadataError(
+                f"The input id {id} does not match the id {dataset_id} in the parquet metadata. "
+                "Ensure that you have not renamed the parquet file."
+            )
+        evaluations = []
+        for eval_id in eval_ids:
+            try:
+                evaluations.append(Evaluations.load(eval_id, path.parent))
+            except Exception:
+                warn(f'Failed to load evaluations with id: "{eval_id}"')
+        table = parquet.read_table(path)
+        dataframe = table.to_pandas()
+        ds = cls(dataframe, dataset_name, evaluations)
+        ds._id = dataset_id
+        return ds
+
     def append_evaluations(self, evaluations: Evaluations) -> None:
         """adds an evaluation to the traces"""
         # Append the evaluations to the list of evaluations
@@ -233,3 +323,20 @@ class TraceDataset:
         # Make sure the index is set to the span_id
         df = self.dataframe.set_index("context.span_id", drop=False)
         return pd.concat([df, evals_df], axis=1)
+
+
+def _parse_schema_metadata(schema: Schema) -> Tuple[UUID, str, List[UUID]]:
+    """
+    Returns parsed metadata from a parquet schema or raises an exception if the
+    metadata is invalid.
+    """
+    try:
+        metadata = schema.metadata
+        arize_metadata = json.loads(metadata[b"arize"])
+        dataset_id = UUID(arize_metadata["dataset_id"])
+        if not isinstance(dataset_name := arize_metadata["dataset_name"], str):
+            raise ValueError("Arize metadata must contain a dataset_name key with string value")
+        eval_ids = [UUID(eval_id) for eval_id in arize_metadata["eval_ids"]]
+        return dataset_id, dataset_name, eval_ids
+    except Exception as err:
+        raise InvalidParquetMetadataError("Unable to parse parquet metadata") from err
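
The dataset-level API follows the same ID-keyed pattern; a sketch of the intended round trip, assuming ds is an existing TraceDataset with evaluations appended via append_evaluations:

    from phoenix.trace.trace_dataset import TraceDataset

    dataset_id = ds.save()  # evaluations are written alongside the dataset parquet

    # Later, e.g. in a fresh process, spans and evaluations come back together.
    restored = TraceDataset.load(dataset_id)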
phoenix/version.py
CHANGED

@@ -1 +1 @@
-__version__ = "2.5.0"
+__version__ = "2.7.0"

{arize_phoenix-2.5.0.dist-info → arize_phoenix-2.7.0.dist-info}/WHEEL
File without changes

{arize_phoenix-2.5.0.dist-info → arize_phoenix-2.7.0.dist-info}/licenses/IP_NOTICE
File without changes

{arize_phoenix-2.5.0.dist-info → arize_phoenix-2.7.0.dist-info}/licenses/LICENSE
File without changes