arize-phoenix 4.12.1rc1__py3-none-any.whl → 4.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of arize-phoenix might be problematic.

Files changed (73)
  1. {arize_phoenix-4.12.1rc1.dist-info → arize_phoenix-4.15.0.dist-info}/METADATA +10 -6
  2. {arize_phoenix-4.12.1rc1.dist-info → arize_phoenix-4.15.0.dist-info}/RECORD +70 -68
  3. phoenix/db/bulk_inserter.py +5 -4
  4. phoenix/db/engines.py +2 -1
  5. phoenix/experiments/evaluators/base.py +4 -0
  6. phoenix/experiments/evaluators/code_evaluators.py +80 -0
  7. phoenix/experiments/evaluators/llm_evaluators.py +77 -1
  8. phoenix/experiments/evaluators/utils.py +70 -21
  9. phoenix/experiments/functions.py +17 -16
  10. phoenix/server/api/context.py +5 -3
  11. phoenix/server/api/dataloaders/__init__.py +2 -0
  12. phoenix/server/api/dataloaders/average_experiment_run_latency.py +25 -25
  13. phoenix/server/api/dataloaders/dataset_example_revisions.py +2 -4
  14. phoenix/server/api/dataloaders/dataset_example_spans.py +2 -4
  15. phoenix/server/api/dataloaders/document_evaluation_summaries.py +2 -4
  16. phoenix/server/api/dataloaders/document_evaluations.py +2 -4
  17. phoenix/server/api/dataloaders/document_retrieval_metrics.py +2 -4
  18. phoenix/server/api/dataloaders/evaluation_summaries.py +2 -4
  19. phoenix/server/api/dataloaders/experiment_annotation_summaries.py +2 -4
  20. phoenix/server/api/dataloaders/experiment_error_rates.py +32 -14
  21. phoenix/server/api/dataloaders/experiment_run_counts.py +20 -9
  22. phoenix/server/api/dataloaders/experiment_sequence_number.py +2 -4
  23. phoenix/server/api/dataloaders/latency_ms_quantile.py +2 -3
  24. phoenix/server/api/dataloaders/min_start_or_max_end_times.py +2 -4
  25. phoenix/server/api/dataloaders/project_by_name.py +3 -3
  26. phoenix/server/api/dataloaders/record_counts.py +2 -4
  27. phoenix/server/api/dataloaders/span_annotations.py +2 -4
  28. phoenix/server/api/dataloaders/span_dataset_examples.py +36 -0
  29. phoenix/server/api/dataloaders/span_descendants.py +2 -4
  30. phoenix/server/api/dataloaders/span_evaluations.py +2 -4
  31. phoenix/server/api/dataloaders/span_projects.py +3 -3
  32. phoenix/server/api/dataloaders/token_counts.py +2 -4
  33. phoenix/server/api/dataloaders/trace_evaluations.py +2 -4
  34. phoenix/server/api/dataloaders/trace_row_ids.py +2 -4
  35. phoenix/server/api/input_types/{CreateSpanAnnotationsInput.py → CreateSpanAnnotationInput.py} +4 -2
  36. phoenix/server/api/input_types/{CreateTraceAnnotationsInput.py → CreateTraceAnnotationInput.py} +4 -2
  37. phoenix/server/api/input_types/{PatchAnnotationsInput.py → PatchAnnotationInput.py} +4 -2
  38. phoenix/server/api/mutations/span_annotations_mutations.py +20 -9
  39. phoenix/server/api/mutations/trace_annotations_mutations.py +20 -9
  40. phoenix/server/api/routers/v1/datasets.py +132 -10
  41. phoenix/server/api/routers/v1/evaluations.py +3 -5
  42. phoenix/server/api/routers/v1/experiments.py +1 -1
  43. phoenix/server/api/types/Experiment.py +2 -2
  44. phoenix/server/api/types/Inferences.py +1 -2
  45. phoenix/server/api/types/Model.py +1 -2
  46. phoenix/server/api/types/Span.py +5 -0
  47. phoenix/server/api/utils.py +4 -4
  48. phoenix/server/app.py +21 -18
  49. phoenix/server/grpc_server.py +2 -2
  50. phoenix/server/main.py +5 -9
  51. phoenix/server/static/.vite/manifest.json +31 -31
  52. phoenix/server/static/assets/{components-C8sm_r1F.js → components-kGgeFkHp.js} +150 -110
  53. phoenix/server/static/assets/index-BctFO6S7.js +100 -0
  54. phoenix/server/static/assets/{pages-bN7juCjh.js → pages-DabDCmVd.js} +432 -255
  55. phoenix/server/static/assets/{vendor-CUDAPm8e.js → vendor-CP0b0YG0.js} +2 -2
  56. phoenix/server/static/assets/{vendor-arizeai-Do2HOmcL.js → vendor-arizeai-B5Hti8OB.js} +27 -27
  57. phoenix/server/static/assets/vendor-codemirror-DtdPDzrv.js +15 -0
  58. phoenix/server/static/assets/{vendor-recharts-PKRvByVe.js → vendor-recharts-A0DA1O99.js} +1 -1
  59. phoenix/server/types.py +18 -0
  60. phoenix/session/client.py +9 -6
  61. phoenix/session/session.py +2 -2
  62. phoenix/trace/dsl/filter.py +40 -25
  63. phoenix/trace/fixtures.py +17 -23
  64. phoenix/trace/utils.py +23 -0
  65. phoenix/utilities/client.py +116 -0
  66. phoenix/utilities/project.py +1 -1
  67. phoenix/version.py +1 -1
  68. phoenix/server/api/routers/v1/dataset_examples.py +0 -157
  69. phoenix/server/static/assets/index-BEKPzgQs.js +0 -100
  70. phoenix/server/static/assets/vendor-codemirror-CrdxOlMs.js +0 -12
  71. {arize_phoenix-4.12.1rc1.dist-info → arize_phoenix-4.15.0.dist-info}/WHEEL +0 -0
  72. {arize_phoenix-4.12.1rc1.dist-info → arize_phoenix-4.15.0.dist-info}/licenses/IP_NOTICE +0 -0
  73. {arize_phoenix-4.12.1rc1.dist-info → arize_phoenix-4.15.0.dist-info}/licenses/LICENSE +0 -0
phoenix/experiments/evaluators/llm_evaluators.py

@@ -18,6 +18,31 @@ from phoenix.experiments.types import (
 
 
 class LLMCriteriaEvaluator(LLMEvaluator):
+    """
+    An experiment evaluator that uses an LLM to evaluate whether the text meets a custom criteria.
+
+    This evaluator uses the chain-of-thought technique to perform a binary evaluation of text based
+    on a custom criteria and description. When used as an experiment evaluator,
+    `LLMCriteriaEvaluator` will return a score of 1.0 if the text meets the criteria and a score of
+    0.0 if not. The explanation produced by the chain-of-thought technique will be included in the
+    experiment evaluation as well.
+
+    Example criteria and descriptions:
+    - "thoughtfulness" - "shows careful consideration and fair judgement"
+    - "clarity" - "is easy to understand and follow"
+    - "professionalism" - "is respectful and appropriate for a formal setting"
+
+    Args:
+        model: The LLM model wrapper to use for evaluation. Compatible models can be imported from
+            the `phoenix.evals` module.
+        criteria: The criteria to evaluate the text against, the criteria should be able to be used
+            as a noun in a sentence.
+        description (str): A description of the criteria, used to clarify instructions to the LLM.
+            The description should complete this sentence: "{criteria} means the text
+            {description}".
+        name (str): The name of the evaluator
+    """
+
     _base_template = (
         "Determine if the following text is {criteria}. {description}"
         "First, explain step-by-step why you think the text is or is not {criteria}. Then provide "
@@ -117,6 +142,14 @@ ConcisenessEvaluator = criteria_evaluator_factory(
     description="is just a few sentences and easy to follow",
     default_name="Conciseness",
 )
+"""
+An experiment evaluator that uses an LLM to evaluate whether the text is concise.
+
+Args:
+    model: The LLM model wrapper to use for evaluation. Compatible models can be imported from
+        the `phoenix.evals` module.
+    name (str, optional): The name of the evaluator, defaults to "Conciseness".
+"""
 
 
 HelpfulnessEvaluator = criteria_evaluator_factory(
@@ -125,6 +158,14 @@ HelpfulnessEvaluator = criteria_evaluator_factory(
     description="provides useful information",
     default_name="Helpfulness",
 )
+"""
+An experiment evaluator that uses an LLM to evaluate whether the text is helpful.
+
+Args:
+    model: The LLM model wrapper to use for evaluation. Compatible models can be imported from
+        the `phoenix.evals` module.
+    name (str, optional): The name of the evaluator, defaults to "Helpfulness".
+"""
 
 
 CoherenceEvaluator = criteria_evaluator_factory(
@@ -133,6 +174,14 @@ CoherenceEvaluator = criteria_evaluator_factory(
     description="is coherent, well-structured, and logically sound",
     default_name="Coherence",
 )
+"""
+An experiment evaluator that uses an LLM to evaluate whether the text is coherent.
+
+Args:
+    model: The LLM model wrapper to use for evaluation. Compatible models can be imported from
+        the `phoenix.evals` module.
+    name (str, optional): The name of the evaluator, defaults to "Coherence".
+"""
 
 
 def _parse_label_from_explanation(raw_string: str) -> str:
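
For illustration, a minimal usage sketch of the evaluators documented in the hunks above. The model wrapper, model name, and criteria strings are illustrative, and the imports assume these classes are re-exported from `phoenix.experiments.evaluators`; only the constructor arguments come from the docstrings in this diff.

    from phoenix.evals import OpenAIModel
    from phoenix.experiments.evaluators import ConcisenessEvaluator, LLMCriteriaEvaluator

    model = OpenAIModel(model="gpt-4o")  # any phoenix.evals model wrapper

    # custom binary criteria check (chain-of-thought; score 1.0 or 0.0 plus explanation)
    thoughtful = LLMCriteriaEvaluator(
        model=model,
        criteria="thoughtfulness",
        description="shows careful consideration and fair judgement",
        name="Thoughtfulness",
    )

    # pre-built criteria evaluator; name defaults to "Conciseness"
    concise = ConcisenessEvaluator(model=model)

    # both would then be passed to run_experiment(..., evaluators=[thoughtful, concise])
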
@@ -149,6 +198,33 @@ def _parse_label_from_explanation(raw_string: str) -> str:
 
 
 class RelevanceEvaluator(LLMEvaluator):
+    """
+    An experiment evaluator that uses an LLM to evaluate whether a response is relevant to a query.
+
+    This evaluator uses the chain-of-thought technique to perform a binary evaluation of whether
+    the output "response" of an experiment is relevant to its input "query". When used as an
+    experiment evaluator, `RelevanceEvaluator` will return a score of 1.0 if the response is
+    relevant to the query and a score of 0.0 if not. The explanation produced by the
+    chain-of-thought technique will be included in the experiment evaluation as well.
+
+    Optionally, you can provide custom functions to extract the query and response from the input
+    and output of the experiment task. By default, the evaluator will use the dataset example as
+    the input and the output of the experiment task as the response.
+
+    Args:
+        model: The LLM model wrapper to use for evaluation. Compatible models can be imported from
+            the `phoenix.evals` module.
+        get_query (callable, optional): A function that extracts the query from the input of the
+            experiment task. The function should take the input and metadata of the dataset example
+            and return a string. By default, the function will return the string representation of
+            the input.
+        get_response (callable, optional): A function that extracts the response from the output of
+            the experiment task. The function should take the output and metadata of the experiment
+            task and return a string. By default, the function will return the string representation
+            of the output.
+        name (str, optional): The name of the evaluator. Defaults to "Relevance".
+    """
+
     template = (
         "Determine if the following response is relevant to the query. In this context, "
         "'relevance' means that the response directly addresses the core question or topic of the "
@@ -174,7 +250,7 @@ class RelevanceEvaluator(LLMEvaluator):
         model: LLMBaseModel,
         get_query: Optional[Callable[[ExampleInput, ExampleMetadata], str]] = None,
         get_response: Optional[Callable[[Optional[TaskOutput], ExampleMetadata], str]] = None,
-        name: str = "RelevanceEvaluator",
+        name: str = "Relevance",
     ):
         self.model = model
         self._name = name
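
A sketch of the customization described in the docstring above, assuming a dataset example whose input holds the query under a "question" key and a task whose output is a dict with an "answer" key; those keys, the model wrapper, and the model name are placeholders, while the `get_query` and `get_response` signatures follow the docstring.

    from phoenix.evals import OpenAIModel
    from phoenix.experiments.evaluators import RelevanceEvaluator

    relevance = RelevanceEvaluator(
        model=OpenAIModel(model="gpt-4o"),
        # extract the query from the example input and the response from the task output
        get_query=lambda example_input, metadata: str(example_input["question"]),
        get_response=lambda task_output, metadata: str(task_output["answer"]),
    )  # name now defaults to "Relevance" rather than "RelevanceEvaluator"
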
phoenix/experiments/evaluators/utils.py

@@ -1,6 +1,5 @@
 import functools
 import inspect
-from itertools import chain, islice, repeat
 from typing import TYPE_CHECKING, Any, Callable, Optional, Union
 
 from phoenix.experiments.types import (
@@ -75,6 +74,72 @@ def create_evaluator(
     name: Optional[str] = None,
     scorer: Optional[Callable[[Any], EvaluationResult]] = None,
 ) -> Callable[[Callable[..., Any]], "Evaluator"]:
+    """
+    A decorator that configures a sync or async function to be used as an experiment evaluator.
+
+    If the `evaluator` is a function of one argument then that argument will be
+    bound to the `output` of an experiment task. Alternatively, the `evaluator` can be a function
+    of any combination of specific argument names that will be bound to special values:
+        `input`: The input field of the dataset example
+        `output`: The output of an experiment task
+        `expected`: The expected or reference output of the dataset example
+        `reference`: An alias for `expected`
+        `metadata`: Metadata associated with the dataset example
+
+    Args:
+        kind (str | AnnotatorKind): Broadly indicates how the evaluator scores an experiment run.
+            Valid kinds are: "CODE", "LLM". Defaults to "CODE".
+        name (str, optional): The name of the evaluator. If not provided, the name of the function
+            will be used.
+        scorer (callable, optional): An optional function that converts the output of the wrapped
+            function into an `EvaluationResult`. This allows configuring the evaluation
+            payload by setting a label, score and explanation. By default, numeric outputs will
+            be recorded as scores, boolean outputs will be recorded as scores and labels, and
+            string outputs will be recorded as labels. If the output is a 2-tuple, the first item
+            will be recorded as the score and the second item will be recorded as the explanation.
+
+    Examples:
+        Configuring an evaluator that returns a boolean
+
+        .. code-block:: python
+            @create_evaluator(kind="CODE", name="exact-match")
+            def match(output: str, expected: str) -> bool:
+                return output == expected
+
+        Configuring an evaluator that returns a label
+
+        .. code-block:: python
+            client = openai.Client()
+
+            @create_evaluator(kind="LLM")
+            def label(output: str) -> str:
+                res = client.chat.completions.create(
+                    model = "gpt-4",
+                    messages = [
+                        {
+                            "role": "user",
+                            "content": (
+                                "in one word, characterize the sentiment of the following customer "
+                                f"request: {output}"
+                            )
+                        },
+                    ],
+                )
+                label = res.choices[0].message.content
+                return label
+
+        Configuring an evaluator that returns a score and explanation
+
+        .. code-block:: python
+            from textdistance import levenshtein
+
+            @create_evaluator(kind="CODE", name="levenshtein-distance")
+            def ld(output: str, expected: str) -> Tuple[float, str]:
+                return (
+                    levenshtein(output, expected),
+                    f"Levenshtein distance between {output} and {expected}"
+                )
+    """
     if scorer is None:
         scorer = _default_eval_scorer
 
@@ -163,24 +228,8 @@ def _default_eval_scorer(result: Any) -> EvaluationResult:
         return EvaluationResult(score=float(result))
     if isinstance(result, str):
         return EvaluationResult(label=result)
-    if isinstance(result, (tuple, list)) and 0 < len(result) <= 3:
-        # Possible interpretations are:
-        # - 3-tuple: (Score, Label, Explanation)
-        # - 2-tuple: (Score, Explanation) or (Label, Explanation)
-        # - 1-tuple: (Score, ) or (Label, )
-        # Note that (Score, Label) conflicts with (Score, Explanation) and we
-        # pick the latter because it's probably more prevalent. To get
-        # (Score, Label), use a 3-tuple instead, i.e. (Score, Label, None).
-        a, b, c = islice(chain(result, repeat(None)), 3)
-        score, label, explanation = None, a, b
-        if hasattr(a, "__float__"):
-            try:
-                score = float(a)
-            except ValueError:
-                pass
-        else:
-            label, explanation = (None, b) if len(result) < 3 else (b, c)
-        return EvaluationResult(score=score, label=label, explanation=explanation)
-    if result is None:
-        return EvaluationResult(score=0)
+    if isinstance(result, (tuple, list)) and len(result) == 2:
+        # If the result is a 2-tuple, the first item will be recorded as the score
+        # and the second item will be recorded as the explanation.
+        return EvaluationResult(score=float(result[0]), explanation=str(result[1]))
     raise ValueError(f"Unsupported evaluation result type: {type(result)}")
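
Under the simplified default scorer above, only bare numbers, booleans, strings, and 2-tuples are accepted, and a 2-tuple is now always interpreted as (score, explanation). A hedged sketch of an evaluator relying on this behavior (the function body is illustrative):

    from phoenix.experiments.evaluators import create_evaluator

    @create_evaluator(kind="CODE", name="exact-match-with-explanation")
    def match(output: str, expected: str):
        # a 2-tuple return becomes EvaluationResult(score=..., explanation=...)
        score = 1.0 if output == expected else 0.0
        return score, f"compared {output!r} against {expected!r}"

    # a bare float is recorded as a score, a str as a label, and a bool as both;
    # the earlier 1-tuple / 3-tuple and None interpretations have been removed
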
phoenix/experiments/functions.py

@@ -72,15 +72,16 @@ from phoenix.experiments.types import (
 )
 from phoenix.experiments.utils import get_dataset_experiments_url, get_experiment_url, get_func_name
 from phoenix.trace.attributes import flatten
+from phoenix.utilities.client import VersionedAsyncClient, VersionedClient
 from phoenix.utilities.json import jsonify
 
 
 def _phoenix_clients() -> Tuple[httpx.Client, httpx.AsyncClient]:
     headers = get_env_client_headers()
-    return httpx.Client(
+    return VersionedClient(
         base_url=get_base_url(),
         headers=headers,
-    ), httpx.AsyncClient(
+    ), VersionedAsyncClient(
         base_url=get_base_url(),
         headers=headers,
     )
@@ -120,21 +121,23 @@ def run_experiment(
     output. If the `task` is a function of one argument then that argument will be bound to the
     `input` field of the dataset example. Alternatively, the `task` can be a function of any
     combination of specific argument names that will be bound to special values:
-        `input`: The input field of the dataset example
-        `expected`: The expected or reference output of the dataset example
-        `reference`: An alias for `expected`
-        `metadata`: Metadata associated with the dataset example
-        `example`: The dataset `Example` object with all associated fields
+
+    - `input`: The input field of the dataset example
+    - `expected`: The expected or reference output of the dataset example
+    - `reference`: An alias for `expected`
+    - `metadata`: Metadata associated with the dataset example
+    - `example`: The dataset `Example` object with all associated fields
 
     An `evaluator` is either a synchronous or asynchronous function that returns either a boolean
     or numeric "score". If the `evaluator` is a function of one argument then that argument will be
     bound to the `output` of the task. Alternatively, the `evaluator` can be a function of any
     combination of specific argument names that will be bound to special values:
-        `input`: The input field of the dataset example
-        `output`: The output of the task
-        `expected`: The expected or reference output of the dataset example
-        `reference`: An alias for `expected`
-        `metadata`: Metadata associated with the dataset example
+
+    - `input`: The input field of the dataset example
+    - `output`: The output of the task
+    - `expected`: The expected or reference output of the dataset example
+    - `reference`: An alias for `expected`
+    - `metadata`: Metadata associated with the dataset example
 
     Phoenix also provides pre-built evaluators in the `phoenix.experiments.evaluators` module.
 
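
For illustration, a minimal sketch of the by-name argument binding described in the docstring above; the dataset object, task logic, and helper function are placeholders.

    from phoenix.experiments import run_experiment

    # task parameters are bound by name: `input` and `metadata` come from the dataset example
    def task(input, metadata):
        return answer_question(input)  # placeholder task logic

    # evaluator parameters are bound the same way: `output` is the task output,
    # `expected` is the example's reference output
    def exact_match(output, expected) -> bool:
        return output == expected

    experiment = run_experiment(dataset, task=task, evaluators=[exact_match])
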
@@ -366,10 +369,9 @@ def run_experiment(
         return exp_run
 
     _errors: Tuple[Type[BaseException], ...]
-    if not hasattr(rate_limit_errors, "__iter__"):
+    if not isinstance(rate_limit_errors, Sequence):
         _errors = (rate_limit_errors,) if rate_limit_errors is not None else ()
     else:
-        rate_limit_errors = cast(Sequence[Type[BaseException]], rate_limit_errors)
         _errors = tuple(filter(None, rate_limit_errors))
     rate_limiters = [RateLimiter(rate_limit_error=rate_limit_error) for rate_limit_error in _errors]
 
@@ -606,10 +608,9 @@ def evaluate_experiment(
         return eval_run
 
     _errors: Tuple[Type[BaseException], ...]
-    if not hasattr(rate_limit_errors, "__iter__"):
+    if not isinstance(rate_limit_errors, Sequence):
         _errors = (rate_limit_errors,) if rate_limit_errors is not None else ()
     else:
-        rate_limit_errors = cast(Sequence[Type[BaseException]], rate_limit_errors)
         _errors = tuple(filter(None, rate_limit_errors))
     rate_limiters = [RateLimiter(rate_limit_error=rate_limit_error) for rate_limit_error in _errors]
 
phoenix/server/api/context.py

@@ -1,9 +1,8 @@
 from dataclasses import dataclass
 from datetime import datetime
 from pathlib import Path
-from typing import AsyncContextManager, Callable, Optional
+from typing import Callable, Optional
 
-from sqlalchemy.ext.asyncio import AsyncSession
 from strawberry.fastapi import BaseContext
 from typing_extensions import TypeAlias
 
@@ -26,6 +25,7 @@ from phoenix.server.api.dataloaders import (
     ProjectByNameDataLoader,
     RecordCountDataLoader,
     SpanAnnotationsDataLoader,
+    SpanDatasetExamplesDataLoader,
     SpanDescendantsDataLoader,
     SpanEvaluationsDataLoader,
     SpanProjectsDataLoader,
@@ -33,6 +33,7 @@ from phoenix.server.api.dataloaders import (
     TraceEvaluationsDataLoader,
     TraceRowIdsDataLoader,
 )
+from phoenix.server.types import DbSessionFactory
 
 
 @dataclass
@@ -51,6 +52,7 @@ class DataLoaders:
     latency_ms_quantile: LatencyMsQuantileDataLoader
     min_start_or_max_end_times: MinStartOrMaxEndTimeDataLoader
     record_counts: RecordCountDataLoader
+    span_dataset_examples: SpanDatasetExamplesDataLoader
     span_descendants: SpanDescendantsDataLoader
     span_evaluations: SpanEvaluationsDataLoader
     span_projects: SpanProjectsDataLoader
@@ -66,7 +68,7 @@ ProjectRowId: TypeAlias = int
 
 @dataclass
 class Context(BaseContext):
-    db: Callable[[], AsyncContextManager[AsyncSession]]
+    db: DbSessionFactory
    data_loaders: DataLoaders
     cache_for_dataloaders: Optional[CacheForDataLoaders]
     model: Model
phoenix/server/api/dataloaders/__init__.py

@@ -27,6 +27,7 @@ from .min_start_or_max_end_times import MinStartOrMaxEndTimeCache, MinStartOrMax
 from .project_by_name import ProjectByNameDataLoader
 from .record_counts import RecordCountCache, RecordCountDataLoader
 from .span_annotations import SpanAnnotationsDataLoader
+from .span_dataset_examples import SpanDatasetExamplesDataLoader
 from .span_descendants import SpanDescendantsDataLoader
 from .span_evaluations import SpanEvaluationsDataLoader
 from .span_projects import SpanProjectsDataLoader
@@ -50,6 +51,7 @@ __all__ = [
     "LatencyMsQuantileDataLoader",
     "MinStartOrMaxEndTimeDataLoader",
     "RecordCountDataLoader",
+    "SpanDatasetExamplesDataLoader",
     "SpanDescendantsDataLoader",
     "SpanEvaluationsDataLoader",
     "SpanProjectsDataLoader",
phoenix/server/api/dataloaders/average_experiment_run_latency.py

@@ -1,18 +1,14 @@
-from typing import (
-    AsyncContextManager,
-    Callable,
-    List,
-)
+from typing import List, Optional
 
 from sqlalchemy import func, select
-from sqlalchemy.ext.asyncio import AsyncSession
 from strawberry.dataloader import DataLoader
 from typing_extensions import TypeAlias
 
 from phoenix.db import models
+from phoenix.server.types import DbSessionFactory
 
 ExperimentID: TypeAlias = int
-RunLatency: TypeAlias = float
+RunLatency: TypeAlias = Optional[float]
 Key: TypeAlias = ExperimentID
 Result: TypeAlias = RunLatency
 
@@ -20,33 +16,37 @@ Result: TypeAlias = RunLatency
 class AverageExperimentRunLatencyDataLoader(DataLoader[Key, Result]):
     def __init__(
         self,
-        db: Callable[[], AsyncContextManager[AsyncSession]],
+        db: DbSessionFactory,
     ) -> None:
         super().__init__(load_fn=self._load_fn)
         self._db = db
 
     async def _load_fn(self, keys: List[Key]) -> List[Result]:
         experiment_ids = keys
+        resolved_experiment_ids = (
+            select(models.Experiment.id)
+            .where(models.Experiment.id.in_(set(experiment_ids)))
+            .subquery()
+        )
+        query = (
+            select(
+                resolved_experiment_ids.c.id,
+                func.avg(
+                    func.extract("epoch", models.ExperimentRun.end_time)
+                    - func.extract("epoch", models.ExperimentRun.start_time)
+                ),
+            )
+            .outerjoin_from(
+                from_=resolved_experiment_ids,
+                target=models.ExperimentRun,
+                onclause=resolved_experiment_ids.c.id == models.ExperimentRun.experiment_id,
+            )
+            .group_by(resolved_experiment_ids.c.id)
+        )
         async with self._db() as session:
             avg_latencies = {
                 experiment_id: avg_latency
-                async for experiment_id, avg_latency in await session.stream(
-                    select(
-                        models.ExperimentRun.experiment_id,
-                        func.avg(
-                            func.extract(
-                                "epoch",
-                                models.ExperimentRun.end_time,
-                            )
-                            - func.extract(
-                                "epoch",
-                                models.ExperimentRun.start_time,
-                            )
-                        ),
-                    )
-                    .where(models.ExperimentRun.experiment_id.in_(set(experiment_ids)))
-                    .group_by(models.ExperimentRun.experiment_id)
-                )
+                async for experiment_id, avg_latency in await session.stream(query)
             }
         return [
             avg_latencies.get(experiment_id, ValueError(f"Unknown experiment: {experiment_id}"))
phoenix/server/api/dataloaders/dataset_example_revisions.py

@@ -1,6 +1,4 @@
 from typing import (
-    AsyncContextManager,
-    Callable,
     List,
     Optional,
     Tuple,
@@ -8,12 +6,12 @@ from typing import (
 )
 
 from sqlalchemy import Integer, case, func, literal, or_, select, union
-from sqlalchemy.ext.asyncio import AsyncSession
 from strawberry.dataloader import DataLoader
 from typing_extensions import TypeAlias
 
 from phoenix.db import models
 from phoenix.server.api.types.DatasetExampleRevision import DatasetExampleRevision
+from phoenix.server.types import DbSessionFactory
 
 ExampleID: TypeAlias = int
 VersionID: TypeAlias = Optional[int]
@@ -22,7 +20,7 @@ Result: TypeAlias = DatasetExampleRevision
 
 
 class DatasetExampleRevisionsDataLoader(DataLoader[Key, Result]):
-    def __init__(self, db: Callable[[], AsyncContextManager[AsyncSession]]) -> None:
+    def __init__(self, db: DbSessionFactory) -> None:
         super().__init__(load_fn=self._load_fn)
         self._db = db
 
phoenix/server/api/dataloaders/dataset_example_spans.py

@@ -1,17 +1,15 @@
 from typing import (
-    AsyncContextManager,
-    Callable,
     List,
     Optional,
 )
 
 from sqlalchemy import select
-from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm import joinedload
 from strawberry.dataloader import DataLoader
 from typing_extensions import TypeAlias
 
 from phoenix.db import models
+from phoenix.server.types import DbSessionFactory
 
 ExampleID: TypeAlias = int
 Key: TypeAlias = ExampleID
@@ -19,7 +17,7 @@ Result: TypeAlias = Optional[models.Span]
 
 
 class DatasetExampleSpansDataLoader(DataLoader[Key, Result]):
-    def __init__(self, db: Callable[[], AsyncContextManager[AsyncSession]]) -> None:
+    def __init__(self, db: DbSessionFactory) -> None:
         super().__init__(load_fn=self._load_fn)
         self._db = db
 
phoenix/server/api/dataloaders/document_evaluation_summaries.py

@@ -2,8 +2,6 @@ from collections import defaultdict
 from datetime import datetime
 from typing import (
     Any,
-    AsyncContextManager,
-    Callable,
     DefaultDict,
     List,
     Optional,
@@ -14,7 +12,6 @@ import numpy as np
 from aioitertools.itertools import groupby
 from cachetools import LFUCache, TTLCache
 from sqlalchemy import Select, select
-from sqlalchemy.ext.asyncio import AsyncSession
 from strawberry.dataloader import AbstractCache, DataLoader
 from typing_extensions import TypeAlias
 
@@ -24,6 +21,7 @@ from phoenix.metrics.retrieval_metrics import RetrievalMetrics
 from phoenix.server.api.dataloaders.cache import TwoTierCache
 from phoenix.server.api.input_types.TimeRange import TimeRange
 from phoenix.server.api.types.DocumentEvaluationSummary import DocumentEvaluationSummary
+from phoenix.server.types import DbSessionFactory
 from phoenix.trace.dsl import SpanFilter
 
 ProjectRowId: TypeAlias = int
@@ -77,7 +75,7 @@ class DocumentEvaluationSummaryCache(
 class DocumentEvaluationSummaryDataLoader(DataLoader[Key, Result]):
     def __init__(
         self,
-        db: Callable[[], AsyncContextManager[AsyncSession]],
+        db: DbSessionFactory,
         cache_map: Optional[AbstractCache[Key, Result]] = None,
     ) -> None:
         super().__init__(
phoenix/server/api/dataloaders/document_evaluations.py

@@ -1,25 +1,23 @@
 from collections import defaultdict
 from typing import (
-    AsyncContextManager,
-    Callable,
     DefaultDict,
     List,
 )
 
 from sqlalchemy import select
-from sqlalchemy.ext.asyncio import AsyncSession
 from strawberry.dataloader import DataLoader
 from typing_extensions import TypeAlias
 
 from phoenix.db import models
 from phoenix.server.api.types.Evaluation import DocumentEvaluation
+from phoenix.server.types import DbSessionFactory
 
 Key: TypeAlias = int
 Result: TypeAlias = List[DocumentEvaluation]
 
 
 class DocumentEvaluationsDataLoader(DataLoader[Key, Result]):
-    def __init__(self, db: Callable[[], AsyncContextManager[AsyncSession]]) -> None:
+    def __init__(self, db: DbSessionFactory) -> None:
         super().__init__(load_fn=self._load_fn)
         self._db = db
 
phoenix/server/api/dataloaders/document_retrieval_metrics.py

@@ -1,7 +1,5 @@
 from collections import defaultdict
 from typing import (
-    AsyncContextManager,
-    Callable,
     DefaultDict,
     Dict,
     List,
@@ -13,13 +11,13 @@ from typing import (
 import numpy as np
 from aioitertools.itertools import groupby
 from sqlalchemy import select
-from sqlalchemy.ext.asyncio import AsyncSession
 from strawberry.dataloader import DataLoader
 from typing_extensions import TypeAlias
 
 from phoenix.db import models
 from phoenix.metrics.retrieval_metrics import RetrievalMetrics
 from phoenix.server.api.types.DocumentRetrievalMetrics import DocumentRetrievalMetrics
+from phoenix.server.types import DbSessionFactory
 
 RowId: TypeAlias = int
 NumDocs: TypeAlias = int
@@ -30,7 +28,7 @@ Result: TypeAlias = List[DocumentRetrievalMetrics]
 
 
 class DocumentRetrievalMetricsDataLoader(DataLoader[Key, Result]):
-    def __init__(self, db: Callable[[], AsyncContextManager[AsyncSession]]) -> None:
+    def __init__(self, db: DbSessionFactory) -> None:
         super().__init__(load_fn=self._load_fn)
         self._db = db
 
phoenix/server/api/dataloaders/evaluation_summaries.py

@@ -2,8 +2,6 @@ from collections import defaultdict
 from datetime import datetime
 from typing import (
     Any,
-    AsyncContextManager,
-    Callable,
     DefaultDict,
     List,
     Literal,
@@ -15,7 +13,6 @@ import pandas as pd
 from aioitertools.itertools import groupby
 from cachetools import LFUCache, TTLCache
 from sqlalchemy import Select, func, or_, select
-from sqlalchemy.ext.asyncio import AsyncSession
 from strawberry.dataloader import AbstractCache, DataLoader
 from typing_extensions import TypeAlias, assert_never
 
@@ -23,6 +20,7 @@ from phoenix.db import models
 from phoenix.server.api.dataloaders.cache import TwoTierCache
 from phoenix.server.api.input_types.TimeRange import TimeRange
 from phoenix.server.api.types.EvaluationSummary import EvaluationSummary
+from phoenix.server.types import DbSessionFactory
 from phoenix.trace.dsl import SpanFilter
 
 Kind: TypeAlias = Literal["span", "trace"]
@@ -77,7 +75,7 @@ class EvaluationSummaryCache(
 class EvaluationSummaryDataLoader(DataLoader[Key, Result]):
     def __init__(
         self,
-        db: Callable[[], AsyncContextManager[AsyncSession]],
+        db: DbSessionFactory,
         cache_map: Optional[AbstractCache[Key, Result]] = None,
     ) -> None:
         super().__init__(
phoenix/server/api/dataloaders/experiment_annotation_summaries.py

@@ -1,19 +1,17 @@
 from collections import defaultdict
 from dataclasses import dataclass
 from typing import (
-    AsyncContextManager,
-    Callable,
     DefaultDict,
     List,
     Optional,
 )
 
 from sqlalchemy import func, select
-from sqlalchemy.ext.asyncio import AsyncSession
 from strawberry.dataloader import AbstractCache, DataLoader
 from typing_extensions import TypeAlias
 
 from phoenix.db import models
+from phoenix.server.types import DbSessionFactory
 
 
 @dataclass
@@ -34,7 +32,7 @@ Result: TypeAlias = List[ExperimentAnnotationSummary]
 class ExperimentAnnotationSummaryDataLoader(DataLoader[Key, Result]):
     def __init__(
         self,
-        db: Callable[[], AsyncContextManager[AsyncSession]],
+        db: DbSessionFactory,
         cache_map: Optional[AbstractCache[Key, Result]] = None,
     ) -> None:
         super().__init__(load_fn=self._load_fn)