arize-phoenix 4.12.1rc1__py3-none-any.whl → 4.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- {arize_phoenix-4.12.1rc1.dist-info → arize_phoenix-4.15.0.dist-info}/METADATA +10 -6
- {arize_phoenix-4.12.1rc1.dist-info → arize_phoenix-4.15.0.dist-info}/RECORD +70 -68
- phoenix/db/bulk_inserter.py +5 -4
- phoenix/db/engines.py +2 -1
- phoenix/experiments/evaluators/base.py +4 -0
- phoenix/experiments/evaluators/code_evaluators.py +80 -0
- phoenix/experiments/evaluators/llm_evaluators.py +77 -1
- phoenix/experiments/evaluators/utils.py +70 -21
- phoenix/experiments/functions.py +17 -16
- phoenix/server/api/context.py +5 -3
- phoenix/server/api/dataloaders/__init__.py +2 -0
- phoenix/server/api/dataloaders/average_experiment_run_latency.py +25 -25
- phoenix/server/api/dataloaders/dataset_example_revisions.py +2 -4
- phoenix/server/api/dataloaders/dataset_example_spans.py +2 -4
- phoenix/server/api/dataloaders/document_evaluation_summaries.py +2 -4
- phoenix/server/api/dataloaders/document_evaluations.py +2 -4
- phoenix/server/api/dataloaders/document_retrieval_metrics.py +2 -4
- phoenix/server/api/dataloaders/evaluation_summaries.py +2 -4
- phoenix/server/api/dataloaders/experiment_annotation_summaries.py +2 -4
- phoenix/server/api/dataloaders/experiment_error_rates.py +32 -14
- phoenix/server/api/dataloaders/experiment_run_counts.py +20 -9
- phoenix/server/api/dataloaders/experiment_sequence_number.py +2 -4
- phoenix/server/api/dataloaders/latency_ms_quantile.py +2 -3
- phoenix/server/api/dataloaders/min_start_or_max_end_times.py +2 -4
- phoenix/server/api/dataloaders/project_by_name.py +3 -3
- phoenix/server/api/dataloaders/record_counts.py +2 -4
- phoenix/server/api/dataloaders/span_annotations.py +2 -4
- phoenix/server/api/dataloaders/span_dataset_examples.py +36 -0
- phoenix/server/api/dataloaders/span_descendants.py +2 -4
- phoenix/server/api/dataloaders/span_evaluations.py +2 -4
- phoenix/server/api/dataloaders/span_projects.py +3 -3
- phoenix/server/api/dataloaders/token_counts.py +2 -4
- phoenix/server/api/dataloaders/trace_evaluations.py +2 -4
- phoenix/server/api/dataloaders/trace_row_ids.py +2 -4
- phoenix/server/api/input_types/{CreateSpanAnnotationsInput.py → CreateSpanAnnotationInput.py} +4 -2
- phoenix/server/api/input_types/{CreateTraceAnnotationsInput.py → CreateTraceAnnotationInput.py} +4 -2
- phoenix/server/api/input_types/{PatchAnnotationsInput.py → PatchAnnotationInput.py} +4 -2
- phoenix/server/api/mutations/span_annotations_mutations.py +20 -9
- phoenix/server/api/mutations/trace_annotations_mutations.py +20 -9
- phoenix/server/api/routers/v1/datasets.py +132 -10
- phoenix/server/api/routers/v1/evaluations.py +3 -5
- phoenix/server/api/routers/v1/experiments.py +1 -1
- phoenix/server/api/types/Experiment.py +2 -2
- phoenix/server/api/types/Inferences.py +1 -2
- phoenix/server/api/types/Model.py +1 -2
- phoenix/server/api/types/Span.py +5 -0
- phoenix/server/api/utils.py +4 -4
- phoenix/server/app.py +21 -18
- phoenix/server/grpc_server.py +2 -2
- phoenix/server/main.py +5 -9
- phoenix/server/static/.vite/manifest.json +31 -31
- phoenix/server/static/assets/{components-C8sm_r1F.js → components-kGgeFkHp.js} +150 -110
- phoenix/server/static/assets/index-BctFO6S7.js +100 -0
- phoenix/server/static/assets/{pages-bN7juCjh.js → pages-DabDCmVd.js} +432 -255
- phoenix/server/static/assets/{vendor-CUDAPm8e.js → vendor-CP0b0YG0.js} +2 -2
- phoenix/server/static/assets/{vendor-arizeai-Do2HOmcL.js → vendor-arizeai-B5Hti8OB.js} +27 -27
- phoenix/server/static/assets/vendor-codemirror-DtdPDzrv.js +15 -0
- phoenix/server/static/assets/{vendor-recharts-PKRvByVe.js → vendor-recharts-A0DA1O99.js} +1 -1
- phoenix/server/types.py +18 -0
- phoenix/session/client.py +9 -6
- phoenix/session/session.py +2 -2
- phoenix/trace/dsl/filter.py +40 -25
- phoenix/trace/fixtures.py +17 -23
- phoenix/trace/utils.py +23 -0
- phoenix/utilities/client.py +116 -0
- phoenix/utilities/project.py +1 -1
- phoenix/version.py +1 -1
- phoenix/server/api/routers/v1/dataset_examples.py +0 -157
- phoenix/server/static/assets/index-BEKPzgQs.js +0 -100
- phoenix/server/static/assets/vendor-codemirror-CrdxOlMs.js +0 -12
- {arize_phoenix-4.12.1rc1.dist-info → arize_phoenix-4.15.0.dist-info}/WHEEL +0 -0
- {arize_phoenix-4.12.1rc1.dist-info → arize_phoenix-4.15.0.dist-info}/licenses/IP_NOTICE +0 -0
- {arize_phoenix-4.12.1rc1.dist-info → arize_phoenix-4.15.0.dist-info}/licenses/LICENSE +0 -0
phoenix/experiments/evaluators/llm_evaluators.py
CHANGED

@@ -18,6 +18,31 @@ from phoenix.experiments.types import (
 
 
 class LLMCriteriaEvaluator(LLMEvaluator):
+    """
+    An experiment evaluator that uses an LLM to evaluate whether the text meets a custom criteria.
+
+    This evaluator uses the chain-of-thought technique to perform a binary evaluation of text based
+    on a custom criteria and description. When used as an experiment evaluator,
+    `LLMCriteriaEvaluator` will return a score of 1.0 if the text meets the criteria and a score of
+    0.0 if not. The explanation produced by the chain-of-thought technique will be included in the
+    experiment evaluation as well.
+
+    Example criteria and descriptions:
+    - "thoughtfulness" - "shows careful consideration and fair judgement"
+    - "clarity" - "is easy to understand and follow"
+    - "professionalism" - "is respectful and appropriate for a formal setting"
+
+    Args:
+        model: The LLM model wrapper to use for evaluation. Compatible models can be imported from
+            the `phoenix.evals` module.
+        criteria: The criteria to evaluate the text against, the criteria should be able to be used
+            as a noun in a sentence.
+        description (str): A description of the criteria, used to clarify instructions to the LLM.
+            The description should complete this sentence: "{criteria} means the text
+            {description}".
+        name (str): The name of the evaluator
+    """
+
     _base_template = (
         "Determine if the following text is {criteria}. {description}"
         "First, explain step-by-step why you think the text is or is not {criteria}. Then provide "
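For orientation, here is a minimal usage sketch of the evaluator documented above. The import paths and the `OpenAIModel` choice are assumptions based on the docstring ("Compatible models can be imported from the `phoenix.evals` module"), not something this diff shows.

```python
# Sketch only: model wrapper, criteria, and description below are illustrative.
from phoenix.evals import OpenAIModel
from phoenix.experiments.evaluators import LLMCriteriaEvaluator

politeness = LLMCriteriaEvaluator(
    model=OpenAIModel(model="gpt-4o"),            # hypothetical model choice
    criteria="politeness",                        # used as a noun in the prompt
    description="is courteous and respectful",    # completes "{criteria} means the text {description}"
    name="Politeness",
)
# Passed to run_experiment(...) as an evaluator, it yields a 1.0/0.0 score plus
# the chain-of-thought explanation described in the docstring.
```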
@@ -117,6 +142,14 @@ ConcisenessEvaluator = criteria_evaluator_factory(
     description="is just a few sentences and easy to follow",
     default_name="Conciseness",
 )
+"""
+An experiment evaluator that uses an LLM to evaluate whether the text is concise.
+
+Args:
+    model: The LLM model wrapper to use for evaluation. Compatible models can be imported from
+        the `phoenix.evals` module.
+    name (str, optional): The name of the evaluator, defaults to "Conciseness".
+"""
 
 
 HelpfulnessEvaluator = criteria_evaluator_factory(
@@ -125,6 +158,14 @@ HelpfulnessEvaluator = criteria_evaluator_factory(
     description="provides useful information",
     default_name="Helpfulness",
 )
+"""
+An experiment evaluator that uses an LLM to evaluate whether the text is helpful.
+
+Args:
+    model: The LLM model wrapper to use for evaluation. Compatible models can be imported from
+        the `phoenix.evals` module.
+    name (str, optional): The name of the evaluator, defaults to "Helpfulness".
+"""
 
 
 CoherenceEvaluator = criteria_evaluator_factory(
@@ -133,6 +174,14 @@ CoherenceEvaluator = criteria_evaluator_factory(
     description="is coherent, well-structured, and logically sound",
     default_name="Coherence",
 )
+"""
+An experiment evaluator that uses an LLM to evaluate whether the text is coherent.
+
+Args:
+    model: The LLM model wrapper to use for evaluation. Compatible models can be imported from
+        the `phoenix.evals` module.
+    name (str, optional): The name of the evaluator, defaults to "Coherence".
+"""
 
 
 def _parse_label_from_explanation(raw_string: str) -> str:
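The three factory-built evaluators above need only a model (and optionally a name); a brief sketch, again assuming the import paths implied by the docstrings:

```python
# Sketch only: import paths and model choice are assumptions.
from phoenix.evals import OpenAIModel
from phoenix.experiments.evaluators import CoherenceEvaluator, ConcisenessEvaluator

model = OpenAIModel(model="gpt-4o")
evaluators = [
    ConcisenessEvaluator(model=model),              # name defaults to "Conciseness"
    CoherenceEvaluator(model=model, name="Logic"),  # the default name can be overridden
]
```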
@@ -149,6 +198,33 @@ def _parse_label_from_explanation(raw_string: str) -> str:
 
 
 class RelevanceEvaluator(LLMEvaluator):
+    """
+    An experiment evaluator that uses an LLM to evaluate whether a response is relevant to a query.
+
+    This evaluator uses the chain-of-thought technique to perform a binary evaluation of whether
+    the output "response" of an experiment is relevant to its input "query". When used as an
+    experiment evaluator, `RelevanceEvaluator` will return a score of 1.0 if the response is
+    relevant to the query and a score of 0.0 if not. The explanation produced by the
+    chain-of-thought technique will be included in the experiment evaluation as well.
+
+    Optionally, you can provide custom functions to extract the query and response from the input
+    and output of the experiment task. By default, the evaluator will use the dataset example as
+    the input and the output of the experiment task as the response.
+
+    Args:
+        model: The LLM model wrapper to use for evaluation. Compatible models can be imported from
+            the `phoenix.evals` module.
+        get_query (callable, optional): A function that extracts the query from the input of the
+            experiment task. The function should take the input and metadata of the dataset example
+            and return a string. By default, the function will return the string representation of
+            the input.
+        get_response (callable, optional): A function that extracts the response from the output of
+            the experiment task. The function should take the output and metadata of the experiment
+            task and return a string. By default, the function will return the string representation
+            of the output.
+        name (str, optional): The name of the evaluator. Defaults to "Relevance".
+    """
+
     template = (
         "Determine if the following response is relevant to the query. In this context, "
         "'relevance' means that the response directly addresses the core question or topic of the "
@@ -174,7 +250,7 @@ class RelevanceEvaluator(LLMEvaluator):
         model: LLMBaseModel,
         get_query: Optional[Callable[[ExampleInput, ExampleMetadata], str]] = None,
         get_response: Optional[Callable[[Optional[TaskOutput], ExampleMetadata], str]] = None,
-        name: str = "
+        name: str = "Relevance",
     ):
         self.model = model
         self._name = name
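A usage sketch for the documented `get_query`/`get_response` hooks; the JSON field names are hypothetical:

```python
# Sketch only: assumes structured example inputs/outputs with "question"/"answer" keys.
from phoenix.evals import OpenAIModel
from phoenix.experiments.evaluators import RelevanceEvaluator

relevance = RelevanceEvaluator(
    model=OpenAIModel(model="gpt-4o"),
    # extract the query from the dataset example input instead of using str(input)
    get_query=lambda example_input, metadata: example_input["question"],
    # extract the response text from the task output instead of using str(output)
    get_response=lambda task_output, metadata: task_output["answer"],
)  # name now defaults to "Relevance", per the fix above
```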
phoenix/experiments/evaluators/utils.py
CHANGED

@@ -1,6 +1,5 @@
 import functools
 import inspect
-from itertools import chain, islice, repeat
 from typing import TYPE_CHECKING, Any, Callable, Optional, Union
 
 from phoenix.experiments.types import (
@@ -75,6 +74,72 @@ def create_evaluator(
     name: Optional[str] = None,
     scorer: Optional[Callable[[Any], EvaluationResult]] = None,
 ) -> Callable[[Callable[..., Any]], "Evaluator"]:
+    """
+    A decorator that configures a sync or async function to be used as an experiment evaluator.
+
+    If the `evaluator` is a function of one argument then that argument will be
+    bound to the `output` of an experiment task. Alternatively, the `evaluator` can be a function
+    of any combination of specific argument names that will be bound to special values:
+        `input`: The input field of the dataset example
+        `output`: The output of an experiment task
+        `expected`: The expected or reference output of the dataset example
+        `reference`: An alias for `expected`
+        `metadata`: Metadata associated with the dataset example
+
+    Args:
+        kind (str | AnnotatorKind): Broadly indicates how the evaluator scores an experiment run.
+            Valid kinds are: "CODE", "LLM". Defaults to "CODE".
+        name (str, optional): The name of the evaluator. If not provided, the name of the function
+            will be used.
+        scorer (callable, optional): An optional function that converts the output of the wrapped
+            function into an `EvaluationResult`. This allows configuring the evaluation
+            payload by setting a label, score and explanation. By default, numeric outputs will
+            be recorded as scores, boolean outputs will be recorded as scores and labels, and
+            string outputs will be recorded as labels. If the output is a 2-tuple, the first item
+            will be recorded as the score and the second item will recorded as the explanation.
+
+    Examples:
+        Configuring an evaluator that returns a boolean
+
+        .. code-block:: python
+            @create_evaluator(kind="CODE", name="exact-match")
+            def match(output: str, expected: str) -> bool:
+                return output == expected
+
+        Configuring an evaluator that returns a label
+
+        .. code-block:: python
+            client = openai.Client()
+
+            @create_evaluator(kind="LLM")
+            def label(output: str) -> str:
+                res = client.chat.completions.create(
+                    model = "gpt-4",
+                    messages = [
+                        {
+                            "role": "user",
+                            "content": (
+                                "in one word, characterize the sentiment of the following customer "
+                                f"request: {output}"
+                            )
+                        },
+                    ],
+                )
+                label = res.choices[0].message.content
+                return label
+
+        Configuring an evaluator that returns a score and explanation
+
+        .. code-block:: python
+            from textdistance import levenshtein
+
+            @create_evaluator(kind="CODE", name="levenshtein-distance")
+            def ld(output: str, expected: str) -> Tuple[float, str]:
+                return (
+                    levenshtein(output, expected),
+                    f"Levenshtein distance between {output} and {expected}"
+                )
+    """
     if scorer is None:
         scorer = _default_eval_scorer
 
@@ -163,24 +228,8 @@ def _default_eval_scorer(result: Any) -> EvaluationResult:
         return EvaluationResult(score=float(result))
     if isinstance(result, str):
         return EvaluationResult(label=result)
-    if isinstance(result, (tuple, list)) and
-        #
-        #
-
-        # - 1-tuple: (Score, ) or (Label, )
-        # Note that (Score, Label) conflicts with (Score, Explanation) and we
-        # pick the latter because it's probably more prevalent. To get
-        # (Score, Label), use a 3-tuple instead, i.e. (Score, Label, None).
-        a, b, c = islice(chain(result, repeat(None)), 3)
-        score, label, explanation = None, a, b
-        if hasattr(a, "__float__"):
-            try:
-                score = float(a)
-            except ValueError:
-                pass
-            else:
-                label, explanation = (None, b) if len(result) < 3 else (b, c)
-        return EvaluationResult(score=score, label=label, explanation=explanation)
-    if result is None:
-        return EvaluationResult(score=0)
+    if isinstance(result, (tuple, list)) and len(result) == 2:
+        # If the result is a 2-tuple, the first item will be recorded as the score
+        # and the second item will recorded as the explanation.
+        return EvaluationResult(score=float(result[0]), explanation=str(result[1]))
     raise ValueError(f"Unsupported evaluation result type: {type(result)}")
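The net effect is a stricter default scorer: only 2-tuples are special-cased now. A sketch of what a decorated evaluator should therefore return (assuming `create_evaluator` is re-exported from `phoenix.experiments.evaluators`):

```python
# Sketch only: the evaluator body is illustrative.
from phoenix.experiments.evaluators import create_evaluator

@create_evaluator(kind="CODE", name="has-citation")
def has_citation(output: str) -> tuple:
    score = 1.0 if "[1]" in output else 0.0
    return score, f"looked for a '[1]' marker in: {output[:40]}"  # (score, explanation)

# A (score, explanation) 2-tuple maps to EvaluationResult(score=..., explanation=...);
# 1-tuples, 3-tuples, and bare None are no longer accepted and now raise ValueError.
```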
phoenix/experiments/functions.py
CHANGED

@@ -72,15 +72,16 @@ from phoenix.experiments.types import (
 )
 from phoenix.experiments.utils import get_dataset_experiments_url, get_experiment_url, get_func_name
 from phoenix.trace.attributes import flatten
+from phoenix.utilities.client import VersionedAsyncClient, VersionedClient
 from phoenix.utilities.json import jsonify
 
 
 def _phoenix_clients() -> Tuple[httpx.Client, httpx.AsyncClient]:
     headers = get_env_client_headers()
-    return
+    return VersionedClient(
         base_url=get_base_url(),
         headers=headers,
-    ),
+    ), VersionedAsyncClient(
         base_url=get_base_url(),
         headers=headers,
     )
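`phoenix/utilities/client.py` is new in this release but its body is not part of this excerpt; the sketch below only illustrates the usual shape of such a wrapper (an `httpx.Client` subclass that injects default headers) and is not Phoenix's actual implementation.

```python
# Hypothetical stand-in for the pattern; the header name and value are assumptions.
import httpx

class VersionedClientSketch(httpx.Client):
    def __init__(self, *args, **kwargs):
        headers = dict(kwargs.pop("headers", None) or {})
        headers.setdefault("x-client-version", "4.15.0")  # hypothetical header
        super().__init__(*args, headers=headers, **kwargs)
```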
@@ -120,21 +121,23 @@ def run_experiment(
     output. If the `task` is a function of one argument then that argument will be bound to the
     `input` field of the dataset example. Alternatively, the `task` can be a function of any
     combination of specific argument names that will be bound to special values:
-
-
-
-
-
+
+    - `input`: The input field of the dataset example
+    - `expected`: The expected or reference output of the dataset example
+    - `reference`: An alias for `expected`
+    - `metadata`: Metadata associated with the dataset example
+    - `example`: The dataset `Example` object with all associated fields
 
     An `evaluator` is either a synchronous or asynchronous function that returns either a boolean
     or numeric "score". If the `evaluator` is a function of one argument then that argument will be
     bound to the `output` of the task. Alternatively, the `evaluator` can be a function of any
     combination of specific argument names that will be bound to special values:
-
-
-
-
-
+
+    - `input`: The input field of the dataset example
+    - `output`: The output of the task
+    - `expected`: The expected or reference output of the dataset example
+    - `reference`: An alias for `expected`
+    - `metadata`: Metadata associated with the dataset example
 
     Phoenix also provides pre-built evaluators in the `phoenix.experiments.evaluators` module.
 
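A sketch of a task that relies on the argument-name binding described above; the dataset fields are hypothetical:

```python
# Sketch only: parameter names are bound by run_experiment as documented above.
def task(input, expected, metadata) -> str:
    # `input` and `expected` come from the dataset example; `metadata` is its metadata dict
    question = input["question"]
    return f"({metadata.get('source', 'n/a')}) draft answer to: {question}"

# experiment = run_experiment(dataset, task, evaluators=[relevance])
```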
@@ -366,10 +369,9 @@ def run_experiment(
         return exp_run
 
     _errors: Tuple[Type[BaseException], ...]
-    if not
+    if not isinstance(rate_limit_errors, Sequence):
         _errors = (rate_limit_errors,) if rate_limit_errors is not None else ()
     else:
-        rate_limit_errors = cast(Sequence[Type[BaseException]], rate_limit_errors)
         _errors = tuple(filter(None, rate_limit_errors))
     rate_limiters = [RateLimiter(rate_limit_error=rate_limit_error) for rate_limit_error in _errors]
 
@@ -606,10 +608,9 @@ def evaluate_experiment(
         return eval_run
 
     _errors: Tuple[Type[BaseException], ...]
-    if not
+    if not isinstance(rate_limit_errors, Sequence):
         _errors = (rate_limit_errors,) if rate_limit_errors is not None else ()
     else:
-        rate_limit_errors = cast(Sequence[Type[BaseException]], rate_limit_errors)
         _errors = tuple(filter(None, rate_limit_errors))
     rate_limiters = [RateLimiter(rate_limit_error=rate_limit_error) for rate_limit_error in _errors]
 
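The same normalization now appears in both `run_experiment` and `evaluate_experiment`; restated standalone for clarity (this is an illustration, not Phoenix code):

```python
from collections.abc import Sequence

def normalize(rate_limit_errors):
    # a bare exception type (or None) vs. a sequence of types, as in the hunks above
    if not isinstance(rate_limit_errors, Sequence):
        return (rate_limit_errors,) if rate_limit_errors is not None else ()
    return tuple(filter(None, rate_limit_errors))

assert normalize(TimeoutError) == (TimeoutError,)
assert normalize([TimeoutError, None, ValueError]) == (TimeoutError, ValueError)
assert normalize(None) == ()
```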
phoenix/server/api/context.py
CHANGED

@@ -1,9 +1,8 @@
 from dataclasses import dataclass
 from datetime import datetime
 from pathlib import Path
-from typing import
+from typing import Callable, Optional
 
-from sqlalchemy.ext.asyncio import AsyncSession
 from strawberry.fastapi import BaseContext
 from typing_extensions import TypeAlias
 
@@ -26,6 +25,7 @@ from phoenix.server.api.dataloaders import (
     ProjectByNameDataLoader,
     RecordCountDataLoader,
     SpanAnnotationsDataLoader,
+    SpanDatasetExamplesDataLoader,
     SpanDescendantsDataLoader,
     SpanEvaluationsDataLoader,
     SpanProjectsDataLoader,
@@ -33,6 +33,7 @@ from phoenix.server.api.dataloaders import (
     TraceEvaluationsDataLoader,
     TraceRowIdsDataLoader,
 )
+from phoenix.server.types import DbSessionFactory
 
 
 @dataclass
@@ -51,6 +52,7 @@ class DataLoaders:
     latency_ms_quantile: LatencyMsQuantileDataLoader
     min_start_or_max_end_times: MinStartOrMaxEndTimeDataLoader
     record_counts: RecordCountDataLoader
+    span_dataset_examples: SpanDatasetExamplesDataLoader
     span_descendants: SpanDescendantsDataLoader
     span_evaluations: SpanEvaluationsDataLoader
     span_projects: SpanProjectsDataLoader
@@ -66,7 +68,7 @@ ProjectRowId: TypeAlias = int
 
 @dataclass
 class Context(BaseContext):
-    db:
+    db: DbSessionFactory
     data_loaders: DataLoaders
     cache_for_dataloaders: Optional[CacheForDataLoaders]
     model: Model
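The body of `phoenix/server/types.py` is not shown in this excerpt, but the call sites in the dataloaders below (`async with self._db() as session:`) imply that `DbSessionFactory` is a callable returning an async session context manager. A sketch of that shape, under that assumption:

```python
# Sketch only: an assumed minimal shape inferred from the dataloaders' call sites.
from contextlib import asynccontextmanager
from typing import AsyncContextManager, Callable

from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
from typing_extensions import TypeAlias

DbSessionFactorySketch: TypeAlias = Callable[[], AsyncContextManager[AsyncSession]]

engine = create_async_engine("sqlite+aiosqlite:///:memory:")  # illustrative engine
_session_maker = async_sessionmaker(engine, expire_on_commit=False)

@asynccontextmanager
async def db():  # satisfies DbSessionFactorySketch
    async with _session_maker() as session:
        yield session
```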
phoenix/server/api/dataloaders/__init__.py
CHANGED

@@ -27,6 +27,7 @@ from .min_start_or_max_end_times import MinStartOrMaxEndTimeCache, MinStartOrMax
 from .project_by_name import ProjectByNameDataLoader
 from .record_counts import RecordCountCache, RecordCountDataLoader
 from .span_annotations import SpanAnnotationsDataLoader
+from .span_dataset_examples import SpanDatasetExamplesDataLoader
 from .span_descendants import SpanDescendantsDataLoader
 from .span_evaluations import SpanEvaluationsDataLoader
 from .span_projects import SpanProjectsDataLoader
@@ -50,6 +51,7 @@ __all__ = [
     "LatencyMsQuantileDataLoader",
     "MinStartOrMaxEndTimeDataLoader",
     "RecordCountDataLoader",
+    "SpanDatasetExamplesDataLoader",
     "SpanDescendantsDataLoader",
     "SpanEvaluationsDataLoader",
     "SpanProjectsDataLoader",
phoenix/server/api/dataloaders/average_experiment_run_latency.py
CHANGED

@@ -1,18 +1,14 @@
-from typing import (
-    AsyncContextManager,
-    Callable,
-    List,
-)
+from typing import List, Optional
 
 from sqlalchemy import func, select
-from sqlalchemy.ext.asyncio import AsyncSession
 from strawberry.dataloader import DataLoader
 from typing_extensions import TypeAlias
 
 from phoenix.db import models
+from phoenix.server.types import DbSessionFactory
 
 ExperimentID: TypeAlias = int
-RunLatency: TypeAlias = float
+RunLatency: TypeAlias = Optional[float]
 Key: TypeAlias = ExperimentID
 Result: TypeAlias = RunLatency
 
@@ -20,33 +16,37 @@ Result: TypeAlias = RunLatency
 class AverageExperimentRunLatencyDataLoader(DataLoader[Key, Result]):
     def __init__(
         self,
-        db:
+        db: DbSessionFactory,
     ) -> None:
         super().__init__(load_fn=self._load_fn)
         self._db = db
 
     async def _load_fn(self, keys: List[Key]) -> List[Result]:
         experiment_ids = keys
+        resolved_experiment_ids = (
+            select(models.Experiment.id)
+            .where(models.Experiment.id.in_(set(experiment_ids)))
+            .subquery()
+        )
+        query = (
+            select(
+                resolved_experiment_ids.c.id,
+                func.avg(
+                    func.extract("epoch", models.ExperimentRun.end_time)
+                    - func.extract("epoch", models.ExperimentRun.start_time)
+                ),
+            )
+            .outerjoin_from(
+                from_=resolved_experiment_ids,
+                target=models.ExperimentRun,
+                onclause=resolved_experiment_ids.c.id == models.ExperimentRun.experiment_id,
+            )
+            .group_by(resolved_experiment_ids.c.id)
+        )
         async with self._db() as session:
             avg_latencies = {
                 experiment_id: avg_latency
-                async for experiment_id, avg_latency in await session.stream(
-                    select(
-                        models.ExperimentRun.experiment_id,
-                        func.avg(
-                            func.extract(
-                                "epoch",
-                                models.ExperimentRun.end_time,
-                            )
-                            - func.extract(
-                                "epoch",
-                                models.ExperimentRun.start_time,
-                            )
-                        ),
-                    )
-                    .where(models.ExperimentRun.experiment_id.in_(set(experiment_ids)))
-                    .group_by(models.ExperimentRun.experiment_id)
-                )
+                async for experiment_id, avg_latency in await session.stream(query)
             }
         return [
             avg_latencies.get(experiment_id, ValueError(f"Unknown experiment: {experiment_id}"))
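Because the query now LEFT OUTER JOINs from the requested experiment IDs, an experiment with no runs yields a NULL average instead of being absent from the result, which is why `RunLatency` became `Optional[float]`. A made-up illustration of the loader's per-key results:

```python
# Illustrative values only.
avg_latencies = {101: 12.5, 102: None}   # experiment 102 has no runs, so its average is NULL/None
assert [avg_latencies.get(k) for k in [101, 102]] == [12.5, None]
# Before this change, 102 would have had no row at all and fallen through to the
# ValueError placeholder in the return statement above.
```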
phoenix/server/api/dataloaders/dataset_example_revisions.py
CHANGED

@@ -1,6 +1,4 @@
 from typing import (
-    AsyncContextManager,
-    Callable,
     List,
     Optional,
     Tuple,
@@ -8,12 +6,12 @@ from typing import (
 )
 
 from sqlalchemy import Integer, case, func, literal, or_, select, union
-from sqlalchemy.ext.asyncio import AsyncSession
 from strawberry.dataloader import DataLoader
 from typing_extensions import TypeAlias
 
 from phoenix.db import models
 from phoenix.server.api.types.DatasetExampleRevision import DatasetExampleRevision
+from phoenix.server.types import DbSessionFactory
 
 ExampleID: TypeAlias = int
 VersionID: TypeAlias = Optional[int]
@@ -22,7 +20,7 @@ Result: TypeAlias = DatasetExampleRevision
 
 
 class DatasetExampleRevisionsDataLoader(DataLoader[Key, Result]):
-    def __init__(self, db:
+    def __init__(self, db: DbSessionFactory) -> None:
         super().__init__(load_fn=self._load_fn)
         self._db = db
 
phoenix/server/api/dataloaders/dataset_example_spans.py
CHANGED

@@ -1,17 +1,15 @@
 from typing import (
-    AsyncContextManager,
-    Callable,
     List,
     Optional,
 )
 
 from sqlalchemy import select
-from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm import joinedload
 from strawberry.dataloader import DataLoader
 from typing_extensions import TypeAlias
 
 from phoenix.db import models
+from phoenix.server.types import DbSessionFactory
 
 ExampleID: TypeAlias = int
 Key: TypeAlias = ExampleID
@@ -19,7 +17,7 @@ Result: TypeAlias = Optional[models.Span]
 
 
 class DatasetExampleSpansDataLoader(DataLoader[Key, Result]):
-    def __init__(self, db:
+    def __init__(self, db: DbSessionFactory) -> None:
         super().__init__(load_fn=self._load_fn)
         self._db = db
 
phoenix/server/api/dataloaders/document_evaluation_summaries.py
CHANGED

@@ -2,8 +2,6 @@ from collections import defaultdict
 from datetime import datetime
 from typing import (
     Any,
-    AsyncContextManager,
-    Callable,
     DefaultDict,
     List,
     Optional,
@@ -14,7 +12,6 @@ import numpy as np
 from aioitertools.itertools import groupby
 from cachetools import LFUCache, TTLCache
 from sqlalchemy import Select, select
-from sqlalchemy.ext.asyncio import AsyncSession
 from strawberry.dataloader import AbstractCache, DataLoader
 from typing_extensions import TypeAlias
 
@@ -24,6 +21,7 @@ from phoenix.metrics.retrieval_metrics import RetrievalMetrics
 from phoenix.server.api.dataloaders.cache import TwoTierCache
 from phoenix.server.api.input_types.TimeRange import TimeRange
 from phoenix.server.api.types.DocumentEvaluationSummary import DocumentEvaluationSummary
+from phoenix.server.types import DbSessionFactory
 from phoenix.trace.dsl import SpanFilter
 
 ProjectRowId: TypeAlias = int
@@ -77,7 +75,7 @@ class DocumentEvaluationSummaryCache(
 class DocumentEvaluationSummaryDataLoader(DataLoader[Key, Result]):
     def __init__(
         self,
-        db:
+        db: DbSessionFactory,
         cache_map: Optional[AbstractCache[Key, Result]] = None,
     ) -> None:
         super().__init__(
phoenix/server/api/dataloaders/document_evaluations.py
CHANGED

@@ -1,25 +1,23 @@
 from collections import defaultdict
 from typing import (
-    AsyncContextManager,
-    Callable,
     DefaultDict,
     List,
 )
 
 from sqlalchemy import select
-from sqlalchemy.ext.asyncio import AsyncSession
 from strawberry.dataloader import DataLoader
 from typing_extensions import TypeAlias
 
 from phoenix.db import models
 from phoenix.server.api.types.Evaluation import DocumentEvaluation
+from phoenix.server.types import DbSessionFactory
 
 Key: TypeAlias = int
 Result: TypeAlias = List[DocumentEvaluation]
 
 
 class DocumentEvaluationsDataLoader(DataLoader[Key, Result]):
-    def __init__(self, db:
+    def __init__(self, db: DbSessionFactory) -> None:
         super().__init__(load_fn=self._load_fn)
         self._db = db
 
phoenix/server/api/dataloaders/document_retrieval_metrics.py
CHANGED

@@ -1,7 +1,5 @@
 from collections import defaultdict
 from typing import (
-    AsyncContextManager,
-    Callable,
     DefaultDict,
     Dict,
     List,
@@ -13,13 +11,13 @@ from typing import (
 import numpy as np
 from aioitertools.itertools import groupby
 from sqlalchemy import select
-from sqlalchemy.ext.asyncio import AsyncSession
 from strawberry.dataloader import DataLoader
 from typing_extensions import TypeAlias
 
 from phoenix.db import models
 from phoenix.metrics.retrieval_metrics import RetrievalMetrics
 from phoenix.server.api.types.DocumentRetrievalMetrics import DocumentRetrievalMetrics
+from phoenix.server.types import DbSessionFactory
 
 RowId: TypeAlias = int
 NumDocs: TypeAlias = int
@@ -30,7 +28,7 @@ Result: TypeAlias = List[DocumentRetrievalMetrics]
 
 
 class DocumentRetrievalMetricsDataLoader(DataLoader[Key, Result]):
-    def __init__(self, db:
+    def __init__(self, db: DbSessionFactory) -> None:
         super().__init__(load_fn=self._load_fn)
         self._db = db
 
phoenix/server/api/dataloaders/evaluation_summaries.py
CHANGED

@@ -2,8 +2,6 @@ from collections import defaultdict
 from datetime import datetime
 from typing import (
     Any,
-    AsyncContextManager,
-    Callable,
     DefaultDict,
     List,
     Literal,
@@ -15,7 +13,6 @@ import pandas as pd
 from aioitertools.itertools import groupby
 from cachetools import LFUCache, TTLCache
 from sqlalchemy import Select, func, or_, select
-from sqlalchemy.ext.asyncio import AsyncSession
 from strawberry.dataloader import AbstractCache, DataLoader
 from typing_extensions import TypeAlias, assert_never
 
@@ -23,6 +20,7 @@ from phoenix.db import models
 from phoenix.server.api.dataloaders.cache import TwoTierCache
 from phoenix.server.api.input_types.TimeRange import TimeRange
 from phoenix.server.api.types.EvaluationSummary import EvaluationSummary
+from phoenix.server.types import DbSessionFactory
 from phoenix.trace.dsl import SpanFilter
 
 Kind: TypeAlias = Literal["span", "trace"]
@@ -77,7 +75,7 @@ class EvaluationSummaryCache(
 class EvaluationSummaryDataLoader(DataLoader[Key, Result]):
     def __init__(
         self,
-        db:
+        db: DbSessionFactory,
         cache_map: Optional[AbstractCache[Key, Result]] = None,
     ) -> None:
         super().__init__(
phoenix/server/api/dataloaders/experiment_annotation_summaries.py
CHANGED

@@ -1,19 +1,17 @@
 from collections import defaultdict
 from dataclasses import dataclass
 from typing import (
-    AsyncContextManager,
-    Callable,
     DefaultDict,
     List,
     Optional,
 )
 
 from sqlalchemy import func, select
-from sqlalchemy.ext.asyncio import AsyncSession
 from strawberry.dataloader import AbstractCache, DataLoader
 from typing_extensions import TypeAlias
 
 from phoenix.db import models
+from phoenix.server.types import DbSessionFactory
 
 
 @dataclass
@@ -34,7 +32,7 @@ Result: TypeAlias = List[ExperimentAnnotationSummary]
 class ExperimentAnnotationSummaryDataLoader(DataLoader[Key, Result]):
     def __init__(
         self,
-        db:
+        db: DbSessionFactory,
         cache_map: Optional[AbstractCache[Key, Result]] = None,
     ) -> None:
         super().__init__(load_fn=self._load_fn)