arize-phoenix 4.4.4rc4__py3-none-any.whl → 4.4.4rc6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of arize-phoenix might be problematic; see the registry's advisory page for more details.

Files changed (52)
  1. {arize_phoenix-4.4.4rc4.dist-info → arize_phoenix-4.4.4rc6.dist-info}/METADATA +12 -6
  2. {arize_phoenix-4.4.4rc4.dist-info → arize_phoenix-4.4.4rc6.dist-info}/RECORD +47 -42
  3. phoenix/config.py +21 -0
  4. phoenix/datetime_utils.py +4 -0
  5. phoenix/db/insertion/dataset.py +19 -16
  6. phoenix/db/insertion/evaluation.py +4 -4
  7. phoenix/db/insertion/helpers.py +4 -12
  8. phoenix/db/insertion/span.py +3 -3
  9. phoenix/db/migrations/versions/10460e46d750_datasets.py +2 -2
  10. phoenix/db/models.py +8 -3
  11. phoenix/experiments/__init__.py +6 -0
  12. phoenix/experiments/evaluators/__init__.py +29 -0
  13. phoenix/experiments/evaluators/base.py +153 -0
  14. phoenix/{datasets → experiments}/evaluators/code_evaluators.py +25 -53
  15. phoenix/{datasets → experiments}/evaluators/llm_evaluators.py +62 -31
  16. phoenix/experiments/evaluators/utils.py +189 -0
  17. phoenix/experiments/functions.py +616 -0
  18. phoenix/{datasets → experiments}/tracing.py +19 -0
  19. phoenix/experiments/types.py +722 -0
  20. phoenix/experiments/utils.py +9 -0
  21. phoenix/server/api/context.py +4 -0
  22. phoenix/server/api/dataloaders/__init__.py +4 -0
  23. phoenix/server/api/dataloaders/average_experiment_run_latency.py +54 -0
  24. phoenix/server/api/dataloaders/experiment_run_counts.py +42 -0
  25. phoenix/server/api/helpers/dataset_helpers.py +8 -7
  26. phoenix/server/api/input_types/ClearProjectInput.py +15 -0
  27. phoenix/server/api/mutations/project_mutations.py +9 -4
  28. phoenix/server/api/routers/v1/__init__.py +1 -1
  29. phoenix/server/api/routers/v1/dataset_examples.py +10 -10
  30. phoenix/server/api/routers/v1/datasets.py +152 -48
  31. phoenix/server/api/routers/v1/evaluations.py +4 -11
  32. phoenix/server/api/routers/v1/experiment_evaluations.py +23 -23
  33. phoenix/server/api/routers/v1/experiment_runs.py +5 -17
  34. phoenix/server/api/routers/v1/experiments.py +5 -5
  35. phoenix/server/api/routers/v1/spans.py +6 -4
  36. phoenix/server/api/types/Experiment.py +12 -0
  37. phoenix/server/api/types/ExperimentRun.py +1 -1
  38. phoenix/server/api/types/ExperimentRunAnnotation.py +1 -1
  39. phoenix/server/app.py +4 -0
  40. phoenix/server/static/index.js +712 -588
  41. phoenix/session/client.py +321 -28
  42. phoenix/trace/fixtures.py +6 -6
  43. phoenix/utilities/json.py +8 -8
  44. phoenix/version.py +1 -1
  45. phoenix/datasets/__init__.py +0 -0
  46. phoenix/datasets/evaluators/__init__.py +0 -18
  47. phoenix/datasets/evaluators/_utils.py +0 -13
  48. phoenix/datasets/experiments.py +0 -485
  49. phoenix/datasets/types.py +0 -212
  50. {arize_phoenix-4.4.4rc4.dist-info → arize_phoenix-4.4.4rc6.dist-info}/WHEEL +0 -0
  51. {arize_phoenix-4.4.4rc4.dist-info → arize_phoenix-4.4.4rc6.dist-info}/licenses/IP_NOTICE +0 -0
  52. {arize_phoenix-4.4.4rc4.dist-info → arize_phoenix-4.4.4rc6.dist-info}/licenses/LICENSE +0 -0
@@ -5,13 +5,13 @@ from starlette.responses import JSONResponse, Response
5
5
  from starlette.status import HTTP_404_NOT_FOUND
6
6
  from strawberry.relay import GlobalID
7
7
 
8
- from phoenix.datasets.types import EvaluationResult, ExperimentEvaluationRun
9
8
  from phoenix.db import models
9
+ from phoenix.db.helpers import SupportedSQLDialect
10
+ from phoenix.db.insertion.helpers import OnConflict, insert_on_conflict
10
11
  from phoenix.server.api.types.node import from_global_id_with_expected_type
11
- from phoenix.utilities.json import jsonify
12
12
 
13
13
 
14
- async def create_experiment_evaluation(request: Request) -> Response:
14
+ async def upsert_experiment_evaluation(request: Request) -> Response:
15
15
  payload = await request.json()
16
16
  experiment_run_gid = GlobalID.from_id(payload["experiment_run_id"])
17
17
  try:
@@ -32,7 +32,7 @@ async def create_experiment_evaluation(request: Request) -> Response:
32
32
  start_time = payload["start_time"]
33
33
  end_time = payload["end_time"]
34
34
  async with request.app.state.db() as session:
35
- exp_eval_run = models.ExperimentRunAnnotation(
35
+ values = dict(
36
36
  experiment_run_id=experiment_run_id,
37
37
  name=name,
38
38
  annotator_kind=annotator_kind,
@@ -40,26 +40,26 @@ async def create_experiment_evaluation(request: Request) -> Response:
40
40
  score=score,
41
41
  explanation=explanation,
42
42
  error=error,
43
- metadata_=metadata,
43
+ metadata_=metadata, # `metadata_` must match database
44
44
  start_time=datetime.fromisoformat(start_time),
45
45
  end_time=datetime.fromisoformat(end_time),
46
+ trace_id=payload.get("trace_id"),
46
47
  )
47
- session.add(exp_eval_run)
48
- await session.flush()
49
- evaluation_gid = GlobalID("ExperimentEvaluation", str(exp_eval_run.id))
50
- eval_payload = ExperimentEvaluationRun(
51
- id=str(evaluation_gid),
52
- experiment_run_id=str(experiment_run_gid),
53
- start_time=exp_eval_run.start_time,
54
- end_time=exp_eval_run.end_time,
55
- name=exp_eval_run.name,
56
- annotator_kind=exp_eval_run.annotator_kind,
57
- error=exp_eval_run.error,
58
- result=EvaluationResult(
59
- label=exp_eval_run.label,
60
- score=exp_eval_run.score,
61
- explanation=exp_eval_run.explanation,
62
- metadata=exp_eval_run.metadata_,
63
- ),
48
+ set_ = {
49
+ **{k: v for k, v in values.items() if k != "metadata_"},
50
+ "metadata": values["metadata_"], # `metadata` must match database
51
+ }
52
+ dialect = SupportedSQLDialect(session.bind.dialect.name)
53
+ exp_eval_run = await session.scalar(
54
+ insert_on_conflict(
55
+ dialect=dialect,
56
+ table=models.ExperimentRunAnnotation,
57
+ values=values,
58
+ constraint="uq_experiment_run_annotations_experiment_run_id_name",
59
+ column_names=("experiment_run_id", "name"),
60
+ on_conflict=OnConflict.DO_UPDATE,
61
+ set_=set_,
62
+ ).returning(models.ExperimentRunAnnotation)
64
63
  )
65
- return JSONResponse(content=jsonify(eval_payload), status_code=200)
64
+ evaluation_gid = GlobalID("ExperimentEvaluation", str(exp_eval_run.id))
65
+ return JSONResponse(content={"data": {"id": str(evaluation_gid)}})
@@ -6,8 +6,8 @@ from starlette.responses import JSONResponse, Response
6
6
  from starlette.status import HTTP_404_NOT_FOUND
7
7
  from strawberry.relay import GlobalID
8
8
 
9
- from phoenix.datasets.types import ExperimentResult, ExperimentRun
10
9
  from phoenix.db import models
10
+ from phoenix.experiments.types import ExperimentResult, ExperimentRun
11
11
  from phoenix.server.api.types.node import from_global_id_with_expected_type
12
12
  from phoenix.utilities.json import jsonify
13
13
 
@@ -53,20 +53,8 @@ async def create_experiment_run(request: Request) -> Response:
53
53
  )
54
54
  session.add(exp_run)
55
55
  await session.flush()
56
-
57
- run_gid = GlobalID("ExperimentRun", str(exp_run.id))
58
- run_payload = ExperimentRun(
59
- start_time=exp_run.start_time,
60
- end_time=exp_run.end_time,
61
- experiment_id=str(experiment_gid),
62
- dataset_example_id=str(example_gid),
63
- repetition_number=exp_run.repetition_number,
64
- output=ExperimentResult(result=exp_run.output),
65
- error=exp_run.error,
66
- id=str(run_gid),
67
- trace_id=exp_run.trace_id,
68
- )
69
- return JSONResponse(content=jsonify(run_payload), status_code=200)
56
+ run_gid = GlobalID("ExperimentRun", str(exp_run.id))
57
+ return JSONResponse(content={"data": {"id": str(run_gid)}})
70
58
 
71
59
 
72
60
  async def list_experiment_runs(request: Request) -> Response:
@@ -99,10 +87,10 @@ async def list_experiment_runs(request: Request) -> Response:
99
87
  experiment_id=str(experiment_gid),
100
88
  dataset_example_id=str(example_gid),
101
89
  repetition_number=exp_run.repetition_number,
102
- output=ExperimentResult(result=exp_run.output),
90
+ output=ExperimentResult.from_dict(exp_run.output) if exp_run.output else None,
103
91
  error=exp_run.error,
104
92
  id=str(run_gid),
105
93
  trace_id=exp_run.trace_id,
106
94
  )
107
95
  )
108
- return JSONResponse(content=jsonify(runs), status_code=200)
96
+ return JSONResponse(content={"data": jsonify(runs)}, status_code=200)
@@ -8,7 +8,7 @@ from strawberry.relay import GlobalID
8
8
 
9
9
  from phoenix.db import models
10
10
  from phoenix.db.helpers import SupportedSQLDialect
11
- from phoenix.db.insertion.helpers import insert_stmt
11
+ from phoenix.db.insertion.helpers import insert_on_conflict
12
12
  from phoenix.server.api.types.node import from_global_id_with_expected_type
13
13
 
14
14
 
@@ -37,7 +37,7 @@ async def create_experiment(request: Request) -> Response:
37
37
  payload = await request.json()
38
38
  repetitions = payload.get("repetitions", 1)
39
39
  metadata = payload.get("metadata") or {}
40
- dataset_version_globalid_str = payload.get("version-id")
40
+ dataset_version_globalid_str = payload.get("version_id")
41
41
  if dataset_version_globalid_str is not None:
42
42
  try:
43
43
  dataset_version_globalid = GlobalID.from_id(dataset_version_globalid_str)
@@ -105,7 +105,7 @@ async def create_experiment(request: Request) -> Response:
105
105
 
106
106
  dialect = SupportedSQLDialect(session.bind.dialect.name)
107
107
  project_rowid = await session.scalar(
108
- insert_stmt(
108
+ insert_on_conflict(
109
109
  dialect=dialect,
110
110
  table=models.Project,
111
111
  constraint="uq_projects_name",
@@ -135,7 +135,7 @@ async def create_experiment(request: Request) -> Response:
135
135
  "created_at": experiment.created_at.isoformat(),
136
136
  "updated_at": experiment.updated_at.isoformat(),
137
137
  }
138
- return JSONResponse(content=experiment_payload, status_code=200)
138
+ return JSONResponse(content={"data": experiment_payload})
139
139
 
140
140
 
141
141
  async def read_experiment(request: Request) -> Response:
@@ -171,4 +171,4 @@ async def read_experiment(request: Request) -> Response:
171
171
  "created_at": experiment.created_at.isoformat(),
172
172
  "updated_at": experiment.updated_at.isoformat(),
173
173
  }
174
- return JSONResponse(content=experiment_payload, status_code=200)
174
+ return JSONResponse(content={"data": experiment_payload})
@@ -21,7 +21,7 @@ async def query_spans_handler(request: Request) -> Response:
21
21
  tags:
22
22
  - private
23
23
  parameters:
24
- - name: project-name
24
+ - name: project_name
25
25
  in: query
26
26
  schema:
27
27
  type: string
@@ -78,9 +78,11 @@ async def query_spans_handler(request: Request) -> Response:
78
78
  payload = await request.json()
79
79
  queries = payload.pop("queries", [])
80
80
  project_name = (
81
- request.query_params.get("project-name")
82
- # read from headers/payload for backward-compatibility
83
- or request.headers.get("project-name")
81
+ request.query_params.get("project_name")
82
+ or request.query_params.get("project-name") # for backward compatibility
83
+ or request.headers.get(
84
+ "project-name"
85
+ ) # read from headers/payload for backward-compatibility
84
86
  or payload.get("project_name")
85
87
  or DEFAULT_PROJECT_NAME
86
88
  )
@@ -75,6 +75,11 @@ class Experiment(Node):
75
75
  ).all()
76
76
  return connection_from_list([to_gql_experiment_run(run) for run in runs], args)
77
77
 
78
+ @strawberry.field
79
+ async def run_count(self, info: Info[Context, None]) -> int:
80
+ experiment_id = self.id_attr
81
+ return await info.context.data_loaders.experiment_run_counts.load(experiment_id)
82
+
78
83
  @strawberry.field
79
84
  async def annotation_summaries(
80
85
  self, info: Info[Context, None]
@@ -98,6 +103,13 @@ class Experiment(Node):
98
103
  async def error_rate(self, info: Info[Context, None]) -> Optional[float]:
99
104
  return await info.context.data_loaders.experiment_error_rates.load(self.id_attr)
100
105
 
106
+ @strawberry.field
107
+ async def average_run_latency_ms(self, info: Info[Context, None]) -> float:
108
+ latency_seconds = await info.context.data_loaders.average_experiment_run_latency.load(
109
+ self.id_attr
110
+ )
111
+ return latency_seconds * 1000
112
+
101
113
  @strawberry.field
102
114
  async def project(self, info: Info[Context, None]) -> Optional[Project]:
103
115
  if self.project_name is None:
@@ -84,7 +84,7 @@ def to_gql_experiment_run(run: models.ExperimentRun) -> ExperimentRun:
84
84
  trace_id=trace_id
85
85
  if (trace := run.trace) and (trace_id := trace.trace_id) is not None
86
86
  else None,
87
- output=run.output,
87
+ output=run.output.get("result"),
88
88
  start_time=run.start_time,
89
89
  end_time=run.end_time,
90
90
  error=run.error,
@@ -33,7 +33,7 @@ class ExperimentRunAnnotation(Node):
33
33
  if (trace := await dataloader.load(self.trace_id)) is None:
34
34
  return None
35
35
  trace_row_id, project_row_id = trace
36
- return Trace(id_attr=trace_row_id, trace_id=trace.trace_id, project_rowid=project_row_id)
36
+ return Trace(id_attr=trace_row_id, trace_id=self.trace_id, project_rowid=project_row_id)
37
37
 
38
38
 
39
39
  def to_gql_experiment_run_annotation(
phoenix/server/app.py CHANGED
@@ -56,6 +56,7 @@ from phoenix.exceptions import PhoenixMigrationError
56
56
  from phoenix.pointcloud.umap_parameters import UMAPParameters
57
57
  from phoenix.server.api.context import Context, DataLoaders
58
58
  from phoenix.server.api.dataloaders import (
59
+ AverageExperimentRunLatencyDataLoader,
59
60
  CacheForDataLoaders,
60
61
  DatasetExampleRevisionsDataLoader,
61
62
  DatasetExampleSpansDataLoader,
@@ -65,6 +66,7 @@ from phoenix.server.api.dataloaders import (
65
66
  EvaluationSummaryDataLoader,
66
67
  ExperimentAnnotationSummaryDataLoader,
67
68
  ExperimentErrorRatesDataLoader,
69
+ ExperimentRunCountsDataLoader,
68
70
  ExperimentSequenceNumberDataLoader,
69
71
  LatencyMsQuantileDataLoader,
70
72
  MinStartOrMaxEndTimeDataLoader,
@@ -190,6 +192,7 @@ class GraphQLWithContext(GraphQL): # type: ignore
190
192
  export_path=self.export_path,
191
193
  streaming_last_updated_at=self.streaming_last_updated_at,
192
194
  data_loaders=DataLoaders(
195
+ average_experiment_run_latency=AverageExperimentRunLatencyDataLoader(self.db),
193
196
  dataset_example_revisions=DatasetExampleRevisionsDataLoader(self.db),
194
197
  dataset_example_spans=DatasetExampleSpansDataLoader(self.db),
195
198
  document_evaluation_summaries=DocumentEvaluationSummaryDataLoader(
@@ -208,6 +211,7 @@ class GraphQLWithContext(GraphQL): # type: ignore
208
211
  ),
209
212
  experiment_annotation_summaries=ExperimentAnnotationSummaryDataLoader(self.db),
210
213
  experiment_error_rates=ExperimentErrorRatesDataLoader(self.db),
214
+ experiment_run_counts=ExperimentRunCountsDataLoader(self.db),
211
215
  experiment_sequence_number=ExperimentSequenceNumberDataLoader(self.db),
212
216
  latency_ms_quantile=LatencyMsQuantileDataLoader(
213
217
  self.db,