arize-phoenix 4.4.4rc5__py3-none-any.whl → 4.4.4rc6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of arize-phoenix might be problematic. Click here for more details.

Files changed (42)
  1. {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.4.4rc6.dist-info}/METADATA +11 -5
  2. {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.4.4rc6.dist-info}/RECORD +39 -36
  3. phoenix/config.py +21 -0
  4. phoenix/datetime_utils.py +4 -0
  5. phoenix/db/insertion/evaluation.py +4 -4
  6. phoenix/db/insertion/helpers.py +4 -12
  7. phoenix/db/insertion/span.py +3 -3
  8. phoenix/db/models.py +1 -1
  9. phoenix/experiments/__init__.py +6 -0
  10. phoenix/experiments/evaluators/__init__.py +29 -0
  11. phoenix/experiments/evaluators/base.py +153 -0
  12. phoenix/{datasets → experiments}/evaluators/code_evaluators.py +7 -7
  13. phoenix/{datasets → experiments}/evaluators/llm_evaluators.py +9 -9
  14. phoenix/{datasets → experiments}/evaluators/utils.py +38 -141
  15. phoenix/{datasets/experiments.py → experiments/functions.py} +248 -182
  16. phoenix/experiments/types.py +722 -0
  17. phoenix/experiments/utils.py +9 -0
  18. phoenix/server/api/context.py +2 -0
  19. phoenix/server/api/dataloaders/__init__.py +2 -0
  20. phoenix/server/api/dataloaders/average_experiment_run_latency.py +54 -0
  21. phoenix/server/api/routers/v1/__init__.py +1 -1
  22. phoenix/server/api/routers/v1/dataset_examples.py +10 -10
  23. phoenix/server/api/routers/v1/datasets.py +6 -6
  24. phoenix/server/api/routers/v1/evaluations.py +4 -11
  25. phoenix/server/api/routers/v1/experiment_evaluations.py +22 -23
  26. phoenix/server/api/routers/v1/experiment_runs.py +4 -16
  27. phoenix/server/api/routers/v1/experiments.py +5 -5
  28. phoenix/server/api/routers/v1/spans.py +6 -4
  29. phoenix/server/api/types/Experiment.py +7 -0
  30. phoenix/server/app.py +2 -0
  31. phoenix/server/static/index.js +648 -570
  32. phoenix/session/client.py +256 -85
  33. phoenix/trace/fixtures.py +6 -6
  34. phoenix/utilities/json.py +8 -8
  35. phoenix/version.py +1 -1
  36. phoenix/datasets/__init__.py +0 -0
  37. phoenix/datasets/evaluators/__init__.py +0 -18
  38. phoenix/datasets/types.py +0 -178
  39. {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.4.4rc6.dist-info}/WHEEL +0 -0
  40. {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.4.4rc6.dist-info}/licenses/IP_NOTICE +0 -0
  41. {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.4.4rc6.dist-info}/licenses/LICENSE +0 -0
  42. phoenix/{datasets → experiments}/tracing.py +0 -0
@@ -0,0 +1,9 @@
1
+ from phoenix.config import get_web_base_url
2
+
3
+
4
+ def get_experiment_url(*, dataset_id: str, experiment_id: str) -> str:
5
+ return f"{get_web_base_url()}datasets/{dataset_id}/compare?experimentId={experiment_id}"
6
+
7
+
8
+ def get_dataset_experiments_url(*, dataset_id: str) -> str:
9
+ return f"{get_web_base_url()}datasets/{dataset_id}/experiments"
@@ -11,6 +11,7 @@ from typing_extensions import TypeAlias
11
11
 
12
12
  from phoenix.core.model_schema import Model
13
13
  from phoenix.server.api.dataloaders import (
14
+ AverageExperimentRunLatencyDataLoader,
14
15
  CacheForDataLoaders,
15
16
  DatasetExampleRevisionsDataLoader,
16
17
  DatasetExampleSpansDataLoader,
@@ -37,6 +38,7 @@ from phoenix.server.api.dataloaders import (
37
38
 
38
39
  @dataclass
39
40
  class DataLoaders:
41
+ average_experiment_run_latency: AverageExperimentRunLatencyDataLoader
40
42
  dataset_example_revisions: DatasetExampleRevisionsDataLoader
41
43
  dataset_example_spans: DatasetExampleSpansDataLoader
42
44
  document_evaluation_summaries: DocumentEvaluationSummaryDataLoader
@@ -8,6 +8,7 @@ from phoenix.db.insertion.evaluation import (
8
8
  )
9
9
  from phoenix.db.insertion.span import ClearProjectSpansEvent, SpanInsertionEvent
10
10
 
11
+ from .average_experiment_run_latency import AverageExperimentRunLatencyDataLoader
11
12
  from .dataset_example_revisions import DatasetExampleRevisionsDataLoader
12
13
  from .dataset_example_spans import DatasetExampleSpansDataLoader
13
14
  from .document_evaluation_summaries import (
@@ -34,6 +35,7 @@ from .trace_row_ids import TraceRowIdsDataLoader
34
35
 
35
36
  __all__ = [
36
37
  "CacheForDataLoaders",
38
+ "AverageExperimentRunLatencyDataLoader",
37
39
  "DatasetExampleRevisionsDataLoader",
38
40
  "DatasetExampleSpansDataLoader",
39
41
  "DocumentEvaluationSummaryDataLoader",
@@ -0,0 +1,54 @@
1
+ from typing import (
2
+ AsyncContextManager,
3
+ Callable,
4
+ List,
5
+ )
6
+
7
+ from sqlalchemy import func, select
8
+ from sqlalchemy.ext.asyncio import AsyncSession
9
+ from strawberry.dataloader import DataLoader
10
+ from typing_extensions import TypeAlias
11
+
12
+ from phoenix.db import models
13
+
14
+ ExperimentID: TypeAlias = int
15
+ RunLatency: TypeAlias = float
16
+ Key: TypeAlias = ExperimentID
17
+ Result: TypeAlias = RunLatency
18
+
19
+
20
+ class AverageExperimentRunLatencyDataLoader(DataLoader[Key, Result]):
21
+ def __init__(
22
+ self,
23
+ db: Callable[[], AsyncContextManager[AsyncSession]],
24
+ ) -> None:
25
+ super().__init__(load_fn=self._load_fn)
26
+ self._db = db
27
+
28
+ async def _load_fn(self, keys: List[Key]) -> List[Result]:
29
+ experiment_ids = keys
30
+ async with self._db() as session:
31
+ avg_latencies = {
32
+ experiment_id: avg_latency
33
+ async for experiment_id, avg_latency in await session.stream(
34
+ select(
35
+ models.ExperimentRun.experiment_id,
36
+ func.avg(
37
+ func.extract(
38
+ "epoch",
39
+ models.ExperimentRun.end_time,
40
+ )
41
+ - func.extract(
42
+ "epoch",
43
+ models.ExperimentRun.start_time,
44
+ )
45
+ ),
46
+ )
47
+ .where(models.ExperimentRun.experiment_id.in_(set(experiment_ids)))
48
+ .group_by(models.ExperimentRun.experiment_id)
49
+ )
50
+ }
51
+ return [
52
+ avg_latencies.get(experiment_id, ValueError(f"Unknown experiment: {experiment_id}"))
53
+ for experiment_id in experiment_ids
54
+ ]
@@ -80,7 +80,7 @@ V1_ROUTES = [
80
80
  ),
81
81
  Route(
82
82
  "/v1/experiment_evaluations",
83
- experiment_evaluations.create_experiment_evaluation,
83
+ experiment_evaluations.upsert_experiment_evaluation,
84
84
  methods=["POST"],
85
85
  ),
86
86
  ]
@@ -21,7 +21,7 @@ async def list_dataset_examples(request: Request) -> Response:
21
21
  type: string
22
22
  description: Dataset ID
23
23
  - in: query
24
- name: version-id
24
+ name: version_id
25
25
  schema:
26
26
  type: string
27
27
  description: Dataset version ID. If omitted, returns the latest version.
@@ -79,7 +79,7 @@ async def list_dataset_examples(request: Request) -> Response:
79
79
  description: Dataset does not exist.
80
80
  """
81
81
  dataset_id = GlobalID.from_id(request.path_params["id"])
82
- raw_version_id = request.query_params.get("version-id")
82
+ raw_version_id = request.query_params.get("version_id")
83
83
  version_id = GlobalID.from_id(raw_version_id) if raw_version_id else None
84
84
 
85
85
  if (dataset_type := dataset_id.type_name) != "Dataset":
@@ -167,12 +167,12 @@ async def list_dataset_examples(request: Request) -> Response:
167
167
  }
168
168
  async for example, revision in await session.stream(query)
169
169
  ]
170
- return JSONResponse(
171
- {
172
- "data": {
173
- "dataset_id": str(GlobalID("Dataset", str(resolved_dataset_id))),
174
- "version_id": str(GlobalID("DatasetVersion", str(resolved_version_id))),
175
- "examples": examples,
176
- }
170
+ return JSONResponse(
171
+ {
172
+ "data": {
173
+ "dataset_id": str(GlobalID("Dataset", str(resolved_dataset_id))),
174
+ "version_id": str(GlobalID("DatasetVersion", str(resolved_version_id))),
175
+ "examples": examples,
177
176
  }
178
- )
177
+ }
178
+ )
@@ -233,7 +233,7 @@ async def get_dataset_by_id(request: Request) -> Response:
233
233
  "updated_at": dataset.updated_at.isoformat(),
234
234
  "example_count": example_count,
235
235
  }
236
- return JSONResponse(content=output_dict)
236
+ return JSONResponse(content={"data": output_dict})
237
237
 
238
238
 
239
239
  async def get_dataset_versions(request: Request) -> Response:
@@ -713,7 +713,7 @@ async def get_dataset_csv(request: Request) -> Response:
713
713
  type: string
714
714
  description: Dataset ID
715
715
  - in: query
716
- name: version
716
+ name: version_id
717
717
  schema:
718
718
  type: string
719
719
  description: Dataset version ID. If omitted, returns the latest version.
@@ -762,7 +762,7 @@ async def get_dataset_jsonl_openai_ft(request: Request) -> Response:
762
762
  type: string
763
763
  description: Dataset ID
764
764
  - in: query
765
- name: version
765
+ name: version_id
766
766
  schema:
767
767
  type: string
768
768
  description: Dataset version ID. If omitted, returns the latest version.
@@ -811,7 +811,7 @@ async def get_dataset_jsonl_openai_evals(request: Request) -> Response:
811
811
  type: string
812
812
  description: Dataset ID
813
813
  - in: query
814
- name: version
814
+ name: version_id
815
815
  schema:
816
816
  type: string
817
817
  description: Dataset version ID. If omitted, returns the latest version.
@@ -915,9 +915,9 @@ async def _get_db_examples(request: Request) -> Tuple[str, List[models.DatasetEx
915
915
  raise ValueError("Missing Dataset ID")
916
916
  dataset_id = from_global_id_with_expected_type(GlobalID.from_id(id_), Dataset.__name__)
917
917
  dataset_version_id: Optional[int] = None
918
- if version := request.query_params.get("version"):
918
+ if version_id := request.query_params.get("version_id"):
919
919
  dataset_version_id = from_global_id_with_expected_type(
920
- GlobalID.from_id(version),
920
+ GlobalID.from_id(version_id),
921
921
  DatasetVersion.__name__,
922
922
  )
923
923
  latest_version = (
@@ -45,13 +45,6 @@ async def post_evaluations(request: Request) -> Response:
45
45
  operationId: addEvaluations
46
46
  tags:
47
47
  - private
48
- parameters:
49
- - name: project-name
50
- in: query
51
- schema:
52
- type: string
53
- default: default
54
- description: The project name to add the evaluation to
55
48
  requestBody:
56
49
  required: true
57
50
  content:
@@ -107,7 +100,7 @@ async def get_evaluations(request: Request) -> Response:
107
100
  tags:
108
101
  - private
109
102
  parameters:
110
- - name: project-name
103
+ - name: project_name
111
104
  in: query
112
105
  schema:
113
106
  type: string
@@ -122,9 +115,9 @@ async def get_evaluations(request: Request) -> Response:
122
115
  description: Not found
123
116
  """
124
117
  project_name = (
125
- request.query_params.get("project-name")
126
- # read from headers for backwards compatibility
127
- or request.headers.get("project-name")
118
+ request.query_params.get("project_name")
119
+ or request.query_params.get("project-name") # for backward compatibility
120
+ or request.headers.get("project-name") # read from headers for backwards compatibility
128
121
  or DEFAULT_PROJECT_NAME
129
122
  )
130
123
 
@@ -5,13 +5,13 @@ from starlette.responses import JSONResponse, Response
5
5
  from starlette.status import HTTP_404_NOT_FOUND
6
6
  from strawberry.relay import GlobalID
7
7
 
8
- from phoenix.datasets.types import EvaluationResult, ExperimentEvaluationRun
9
8
  from phoenix.db import models
9
+ from phoenix.db.helpers import SupportedSQLDialect
10
+ from phoenix.db.insertion.helpers import OnConflict, insert_on_conflict
10
11
  from phoenix.server.api.types.node import from_global_id_with_expected_type
11
- from phoenix.utilities.json import jsonify
12
12
 
13
13
 
14
- async def create_experiment_evaluation(request: Request) -> Response:
14
+ async def upsert_experiment_evaluation(request: Request) -> Response:
15
15
  payload = await request.json()
16
16
  experiment_run_gid = GlobalID.from_id(payload["experiment_run_id"])
17
17
  try:
@@ -32,7 +32,7 @@ async def create_experiment_evaluation(request: Request) -> Response:
32
32
  start_time = payload["start_time"]
33
33
  end_time = payload["end_time"]
34
34
  async with request.app.state.db() as session:
35
- exp_eval_run = models.ExperimentRunAnnotation(
35
+ values = dict(
36
36
  experiment_run_id=experiment_run_id,
37
37
  name=name,
38
38
  annotator_kind=annotator_kind,
@@ -40,27 +40,26 @@ async def create_experiment_evaluation(request: Request) -> Response:
40
40
  score=score,
41
41
  explanation=explanation,
42
42
  error=error,
43
- metadata_=metadata,
43
+ metadata_=metadata, # `metadata_` must match database
44
44
  start_time=datetime.fromisoformat(start_time),
45
45
  end_time=datetime.fromisoformat(end_time),
46
46
  trace_id=payload.get("trace_id"),
47
47
  )
48
- session.add(exp_eval_run)
49
- await session.flush()
50
- evaluation_gid = GlobalID("ExperimentEvaluation", str(exp_eval_run.id))
51
- eval_payload = ExperimentEvaluationRun(
52
- id=str(evaluation_gid),
53
- experiment_run_id=str(experiment_run_gid),
54
- start_time=exp_eval_run.start_time,
55
- end_time=exp_eval_run.end_time,
56
- name=exp_eval_run.name,
57
- annotator_kind=exp_eval_run.annotator_kind,
58
- error=exp_eval_run.error,
59
- result=EvaluationResult(
60
- label=exp_eval_run.label,
61
- score=exp_eval_run.score,
62
- explanation=exp_eval_run.explanation,
63
- metadata=exp_eval_run.metadata_,
64
- ),
48
+ set_ = {
49
+ **{k: v for k, v in values.items() if k != "metadata_"},
50
+ "metadata": values["metadata_"], # `metadata` must match database
51
+ }
52
+ dialect = SupportedSQLDialect(session.bind.dialect.name)
53
+ exp_eval_run = await session.scalar(
54
+ insert_on_conflict(
55
+ dialect=dialect,
56
+ table=models.ExperimentRunAnnotation,
57
+ values=values,
58
+ constraint="uq_experiment_run_annotations_experiment_run_id_name",
59
+ column_names=("experiment_run_id", "name"),
60
+ on_conflict=OnConflict.DO_UPDATE,
61
+ set_=set_,
62
+ ).returning(models.ExperimentRunAnnotation)
65
63
  )
66
- return JSONResponse(content=jsonify(eval_payload), status_code=200)
64
+ evaluation_gid = GlobalID("ExperimentEvaluation", str(exp_eval_run.id))
65
+ return JSONResponse(content={"data": {"id": str(evaluation_gid)}})
@@ -6,8 +6,8 @@ from starlette.responses import JSONResponse, Response
6
6
  from starlette.status import HTTP_404_NOT_FOUND
7
7
  from strawberry.relay import GlobalID
8
8
 
9
- from phoenix.datasets.types import ExperimentResult, ExperimentRun
10
9
  from phoenix.db import models
10
+ from phoenix.experiments.types import ExperimentResult, ExperimentRun
11
11
  from phoenix.server.api.types.node import from_global_id_with_expected_type
12
12
  from phoenix.utilities.json import jsonify
13
13
 
@@ -53,20 +53,8 @@ async def create_experiment_run(request: Request) -> Response:
53
53
  )
54
54
  session.add(exp_run)
55
55
  await session.flush()
56
-
57
- run_gid = GlobalID("ExperimentRun", str(exp_run.id))
58
- run_payload = ExperimentRun(
59
- start_time=exp_run.start_time,
60
- end_time=exp_run.end_time,
61
- experiment_id=str(experiment_gid),
62
- dataset_example_id=str(example_gid),
63
- repetition_number=exp_run.repetition_number,
64
- output=ExperimentResult.from_dict(exp_run.output) if exp_run.output else None,
65
- error=exp_run.error,
66
- id=str(run_gid),
67
- trace_id=exp_run.trace_id,
68
- )
69
- return JSONResponse(content=jsonify(run_payload), status_code=200)
56
+ run_gid = GlobalID("ExperimentRun", str(exp_run.id))
57
+ return JSONResponse(content={"data": {"id": str(run_gid)}})
70
58
 
71
59
 
72
60
  async def list_experiment_runs(request: Request) -> Response:
@@ -105,4 +93,4 @@ async def list_experiment_runs(request: Request) -> Response:
105
93
  trace_id=exp_run.trace_id,
106
94
  )
107
95
  )
108
- return JSONResponse(content=jsonify(runs), status_code=200)
96
+ return JSONResponse(content={"data": jsonify(runs)}, status_code=200)
@@ -8,7 +8,7 @@ from strawberry.relay import GlobalID
8
8
 
9
9
  from phoenix.db import models
10
10
  from phoenix.db.helpers import SupportedSQLDialect
11
- from phoenix.db.insertion.helpers import insert_stmt
11
+ from phoenix.db.insertion.helpers import insert_on_conflict
12
12
  from phoenix.server.api.types.node import from_global_id_with_expected_type
13
13
 
14
14
 
@@ -37,7 +37,7 @@ async def create_experiment(request: Request) -> Response:
37
37
  payload = await request.json()
38
38
  repetitions = payload.get("repetitions", 1)
39
39
  metadata = payload.get("metadata") or {}
40
- dataset_version_globalid_str = payload.get("version-id")
40
+ dataset_version_globalid_str = payload.get("version_id")
41
41
  if dataset_version_globalid_str is not None:
42
42
  try:
43
43
  dataset_version_globalid = GlobalID.from_id(dataset_version_globalid_str)
@@ -105,7 +105,7 @@ async def create_experiment(request: Request) -> Response:
105
105
 
106
106
  dialect = SupportedSQLDialect(session.bind.dialect.name)
107
107
  project_rowid = await session.scalar(
108
- insert_stmt(
108
+ insert_on_conflict(
109
109
  dialect=dialect,
110
110
  table=models.Project,
111
111
  constraint="uq_projects_name",
@@ -135,7 +135,7 @@ async def create_experiment(request: Request) -> Response:
135
135
  "created_at": experiment.created_at.isoformat(),
136
136
  "updated_at": experiment.updated_at.isoformat(),
137
137
  }
138
- return JSONResponse(content=experiment_payload, status_code=200)
138
+ return JSONResponse(content={"data": experiment_payload})
139
139
 
140
140
 
141
141
  async def read_experiment(request: Request) -> Response:
@@ -171,4 +171,4 @@ async def read_experiment(request: Request) -> Response:
171
171
  "created_at": experiment.created_at.isoformat(),
172
172
  "updated_at": experiment.updated_at.isoformat(),
173
173
  }
174
- return JSONResponse(content=experiment_payload, status_code=200)
174
+ return JSONResponse(content={"data": experiment_payload})
@@ -21,7 +21,7 @@ async def query_spans_handler(request: Request) -> Response:
21
21
  tags:
22
22
  - private
23
23
  parameters:
24
- - name: project-name
24
+ - name: project_name
25
25
  in: query
26
26
  schema:
27
27
  type: string
@@ -78,9 +78,11 @@ async def query_spans_handler(request: Request) -> Response:
78
78
  payload = await request.json()
79
79
  queries = payload.pop("queries", [])
80
80
  project_name = (
81
- request.query_params.get("project-name")
82
- # read from headers/payload for backward-compatibility
83
- or request.headers.get("project-name")
81
+ request.query_params.get("project_name")
82
+ or request.query_params.get("project-name") # for backward compatibility
83
+ or request.headers.get(
84
+ "project-name"
85
+ ) # read from headers/payload for backward-compatibility
84
86
  or payload.get("project_name")
85
87
  or DEFAULT_PROJECT_NAME
86
88
  )
@@ -103,6 +103,13 @@ class Experiment(Node):
103
103
  async def error_rate(self, info: Info[Context, None]) -> Optional[float]:
104
104
  return await info.context.data_loaders.experiment_error_rates.load(self.id_attr)
105
105
 
106
+ @strawberry.field
107
+ async def average_run_latency_ms(self, info: Info[Context, None]) -> float:
108
+ latency_seconds = await info.context.data_loaders.average_experiment_run_latency.load(
109
+ self.id_attr
110
+ )
111
+ return latency_seconds * 1000
112
+
106
113
  @strawberry.field
107
114
  async def project(self, info: Info[Context, None]) -> Optional[Project]:
108
115
  if self.project_name is None:
phoenix/server/app.py CHANGED
@@ -56,6 +56,7 @@ from phoenix.exceptions import PhoenixMigrationError
56
56
  from phoenix.pointcloud.umap_parameters import UMAPParameters
57
57
  from phoenix.server.api.context import Context, DataLoaders
58
58
  from phoenix.server.api.dataloaders import (
59
+ AverageExperimentRunLatencyDataLoader,
59
60
  CacheForDataLoaders,
60
61
  DatasetExampleRevisionsDataLoader,
61
62
  DatasetExampleSpansDataLoader,
@@ -191,6 +192,7 @@ class GraphQLWithContext(GraphQL): # type: ignore
191
192
  export_path=self.export_path,
192
193
  streaming_last_updated_at=self.streaming_last_updated_at,
193
194
  data_loaders=DataLoaders(
195
+ average_experiment_run_latency=AverageExperimentRunLatencyDataLoader(self.db),
194
196
  dataset_example_revisions=DatasetExampleRevisionsDataLoader(self.db),
195
197
  dataset_example_spans=DatasetExampleSpansDataLoader(self.db),
196
198
  document_evaluation_summaries=DocumentEvaluationSummaryDataLoader(