arize-phoenix 4.4.4rc4__py3-none-any.whl → 4.4.4rc6__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of arize-phoenix might be problematic. Click here for more details.
- {arize_phoenix-4.4.4rc4.dist-info → arize_phoenix-4.4.4rc6.dist-info}/METADATA +12 -6
- {arize_phoenix-4.4.4rc4.dist-info → arize_phoenix-4.4.4rc6.dist-info}/RECORD +47 -42
- phoenix/config.py +21 -0
- phoenix/datetime_utils.py +4 -0
- phoenix/db/insertion/dataset.py +19 -16
- phoenix/db/insertion/evaluation.py +4 -4
- phoenix/db/insertion/helpers.py +4 -12
- phoenix/db/insertion/span.py +3 -3
- phoenix/db/migrations/versions/10460e46d750_datasets.py +2 -2
- phoenix/db/models.py +8 -3
- phoenix/experiments/__init__.py +6 -0
- phoenix/experiments/evaluators/__init__.py +29 -0
- phoenix/experiments/evaluators/base.py +153 -0
- phoenix/{datasets → experiments}/evaluators/code_evaluators.py +25 -53
- phoenix/{datasets → experiments}/evaluators/llm_evaluators.py +62 -31
- phoenix/experiments/evaluators/utils.py +189 -0
- phoenix/experiments/functions.py +616 -0
- phoenix/{datasets → experiments}/tracing.py +19 -0
- phoenix/experiments/types.py +722 -0
- phoenix/experiments/utils.py +9 -0
- phoenix/server/api/context.py +4 -0
- phoenix/server/api/dataloaders/__init__.py +4 -0
- phoenix/server/api/dataloaders/average_experiment_run_latency.py +54 -0
- phoenix/server/api/dataloaders/experiment_run_counts.py +42 -0
- phoenix/server/api/helpers/dataset_helpers.py +8 -7
- phoenix/server/api/input_types/ClearProjectInput.py +15 -0
- phoenix/server/api/mutations/project_mutations.py +9 -4
- phoenix/server/api/routers/v1/__init__.py +1 -1
- phoenix/server/api/routers/v1/dataset_examples.py +10 -10
- phoenix/server/api/routers/v1/datasets.py +152 -48
- phoenix/server/api/routers/v1/evaluations.py +4 -11
- phoenix/server/api/routers/v1/experiment_evaluations.py +23 -23
- phoenix/server/api/routers/v1/experiment_runs.py +5 -17
- phoenix/server/api/routers/v1/experiments.py +5 -5
- phoenix/server/api/routers/v1/spans.py +6 -4
- phoenix/server/api/types/Experiment.py +12 -0
- phoenix/server/api/types/ExperimentRun.py +1 -1
- phoenix/server/api/types/ExperimentRunAnnotation.py +1 -1
- phoenix/server/app.py +4 -0
- phoenix/server/static/index.js +712 -588
- phoenix/session/client.py +321 -28
- phoenix/trace/fixtures.py +6 -6
- phoenix/utilities/json.py +8 -8
- phoenix/version.py +1 -1
- phoenix/datasets/__init__.py +0 -0
- phoenix/datasets/evaluators/__init__.py +0 -18
- phoenix/datasets/evaluators/_utils.py +0 -13
- phoenix/datasets/experiments.py +0 -485
- phoenix/datasets/types.py +0 -212
- {arize_phoenix-4.4.4rc4.dist-info → arize_phoenix-4.4.4rc6.dist-info}/WHEEL +0 -0
- {arize_phoenix-4.4.4rc4.dist-info → arize_phoenix-4.4.4rc6.dist-info}/licenses/IP_NOTICE +0 -0
- {arize_phoenix-4.4.4rc4.dist-info → arize_phoenix-4.4.4rc6.dist-info}/licenses/LICENSE +0 -0
|
@@ -5,13 +5,13 @@ from starlette.responses import JSONResponse, Response
|
|
|
5
5
|
from starlette.status import HTTP_404_NOT_FOUND
|
|
6
6
|
from strawberry.relay import GlobalID
|
|
7
7
|
|
|
8
|
-
from phoenix.datasets.types import EvaluationResult, ExperimentEvaluationRun
|
|
9
8
|
from phoenix.db import models
|
|
9
|
+
from phoenix.db.helpers import SupportedSQLDialect
|
|
10
|
+
from phoenix.db.insertion.helpers import OnConflict, insert_on_conflict
|
|
10
11
|
from phoenix.server.api.types.node import from_global_id_with_expected_type
|
|
11
|
-
from phoenix.utilities.json import jsonify
|
|
12
12
|
|
|
13
13
|
|
|
14
|
-
async def
|
|
14
|
+
async def upsert_experiment_evaluation(request: Request) -> Response:
|
|
15
15
|
payload = await request.json()
|
|
16
16
|
experiment_run_gid = GlobalID.from_id(payload["experiment_run_id"])
|
|
17
17
|
try:
|
|
@@ -32,7 +32,7 @@ async def create_experiment_evaluation(request: Request) -> Response:
|
|
|
32
32
|
start_time = payload["start_time"]
|
|
33
33
|
end_time = payload["end_time"]
|
|
34
34
|
async with request.app.state.db() as session:
|
|
35
|
-
|
|
35
|
+
values = dict(
|
|
36
36
|
experiment_run_id=experiment_run_id,
|
|
37
37
|
name=name,
|
|
38
38
|
annotator_kind=annotator_kind,
|
|
@@ -40,26 +40,26 @@ async def create_experiment_evaluation(request: Request) -> Response:
|
|
|
40
40
|
score=score,
|
|
41
41
|
explanation=explanation,
|
|
42
42
|
error=error,
|
|
43
|
-
metadata_=metadata,
|
|
43
|
+
metadata_=metadata, # `metadata_` must match database
|
|
44
44
|
start_time=datetime.fromisoformat(start_time),
|
|
45
45
|
end_time=datetime.fromisoformat(end_time),
|
|
46
|
+
trace_id=payload.get("trace_id"),
|
|
46
47
|
)
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
metadata=exp_eval_run.metadata_,
|
|
63
|
-
),
|
|
48
|
+
set_ = {
|
|
49
|
+
**{k: v for k, v in values.items() if k != "metadata_"},
|
|
50
|
+
"metadata": values["metadata_"], # `metadata` must match database
|
|
51
|
+
}
|
|
52
|
+
dialect = SupportedSQLDialect(session.bind.dialect.name)
|
|
53
|
+
exp_eval_run = await session.scalar(
|
|
54
|
+
insert_on_conflict(
|
|
55
|
+
dialect=dialect,
|
|
56
|
+
table=models.ExperimentRunAnnotation,
|
|
57
|
+
values=values,
|
|
58
|
+
constraint="uq_experiment_run_annotations_experiment_run_id_name",
|
|
59
|
+
column_names=("experiment_run_id", "name"),
|
|
60
|
+
on_conflict=OnConflict.DO_UPDATE,
|
|
61
|
+
set_=set_,
|
|
62
|
+
).returning(models.ExperimentRunAnnotation)
|
|
64
63
|
)
|
|
65
|
-
|
|
64
|
+
evaluation_gid = GlobalID("ExperimentEvaluation", str(exp_eval_run.id))
|
|
65
|
+
return JSONResponse(content={"data": {"id": str(evaluation_gid)}})
|
|
@@ -6,8 +6,8 @@ from starlette.responses import JSONResponse, Response
|
|
|
6
6
|
from starlette.status import HTTP_404_NOT_FOUND
|
|
7
7
|
from strawberry.relay import GlobalID
|
|
8
8
|
|
|
9
|
-
from phoenix.datasets.types import ExperimentResult, ExperimentRun
|
|
10
9
|
from phoenix.db import models
|
|
10
|
+
from phoenix.experiments.types import ExperimentResult, ExperimentRun
|
|
11
11
|
from phoenix.server.api.types.node import from_global_id_with_expected_type
|
|
12
12
|
from phoenix.utilities.json import jsonify
|
|
13
13
|
|
|
@@ -53,20 +53,8 @@ async def create_experiment_run(request: Request) -> Response:
|
|
|
53
53
|
)
|
|
54
54
|
session.add(exp_run)
|
|
55
55
|
await session.flush()
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
run_payload = ExperimentRun(
|
|
59
|
-
start_time=exp_run.start_time,
|
|
60
|
-
end_time=exp_run.end_time,
|
|
61
|
-
experiment_id=str(experiment_gid),
|
|
62
|
-
dataset_example_id=str(example_gid),
|
|
63
|
-
repetition_number=exp_run.repetition_number,
|
|
64
|
-
output=ExperimentResult(result=exp_run.output),
|
|
65
|
-
error=exp_run.error,
|
|
66
|
-
id=str(run_gid),
|
|
67
|
-
trace_id=exp_run.trace_id,
|
|
68
|
-
)
|
|
69
|
-
return JSONResponse(content=jsonify(run_payload), status_code=200)
|
|
56
|
+
run_gid = GlobalID("ExperimentRun", str(exp_run.id))
|
|
57
|
+
return JSONResponse(content={"data": {"id": str(run_gid)}})
|
|
70
58
|
|
|
71
59
|
|
|
72
60
|
async def list_experiment_runs(request: Request) -> Response:
|
|
@@ -99,10 +87,10 @@ async def list_experiment_runs(request: Request) -> Response:
|
|
|
99
87
|
experiment_id=str(experiment_gid),
|
|
100
88
|
dataset_example_id=str(example_gid),
|
|
101
89
|
repetition_number=exp_run.repetition_number,
|
|
102
|
-
output=ExperimentResult(
|
|
90
|
+
output=ExperimentResult.from_dict(exp_run.output) if exp_run.output else None,
|
|
103
91
|
error=exp_run.error,
|
|
104
92
|
id=str(run_gid),
|
|
105
93
|
trace_id=exp_run.trace_id,
|
|
106
94
|
)
|
|
107
95
|
)
|
|
108
|
-
|
|
96
|
+
return JSONResponse(content={"data": jsonify(runs)}, status_code=200)
|
|
@@ -8,7 +8,7 @@ from strawberry.relay import GlobalID
|
|
|
8
8
|
|
|
9
9
|
from phoenix.db import models
|
|
10
10
|
from phoenix.db.helpers import SupportedSQLDialect
|
|
11
|
-
from phoenix.db.insertion.helpers import
|
|
11
|
+
from phoenix.db.insertion.helpers import insert_on_conflict
|
|
12
12
|
from phoenix.server.api.types.node import from_global_id_with_expected_type
|
|
13
13
|
|
|
14
14
|
|
|
@@ -37,7 +37,7 @@ async def create_experiment(request: Request) -> Response:
|
|
|
37
37
|
payload = await request.json()
|
|
38
38
|
repetitions = payload.get("repetitions", 1)
|
|
39
39
|
metadata = payload.get("metadata") or {}
|
|
40
|
-
dataset_version_globalid_str = payload.get("
|
|
40
|
+
dataset_version_globalid_str = payload.get("version_id")
|
|
41
41
|
if dataset_version_globalid_str is not None:
|
|
42
42
|
try:
|
|
43
43
|
dataset_version_globalid = GlobalID.from_id(dataset_version_globalid_str)
|
|
@@ -105,7 +105,7 @@ async def create_experiment(request: Request) -> Response:
|
|
|
105
105
|
|
|
106
106
|
dialect = SupportedSQLDialect(session.bind.dialect.name)
|
|
107
107
|
project_rowid = await session.scalar(
|
|
108
|
-
|
|
108
|
+
insert_on_conflict(
|
|
109
109
|
dialect=dialect,
|
|
110
110
|
table=models.Project,
|
|
111
111
|
constraint="uq_projects_name",
|
|
@@ -135,7 +135,7 @@ async def create_experiment(request: Request) -> Response:
|
|
|
135
135
|
"created_at": experiment.created_at.isoformat(),
|
|
136
136
|
"updated_at": experiment.updated_at.isoformat(),
|
|
137
137
|
}
|
|
138
|
-
|
|
138
|
+
return JSONResponse(content={"data": experiment_payload})
|
|
139
139
|
|
|
140
140
|
|
|
141
141
|
async def read_experiment(request: Request) -> Response:
|
|
@@ -171,4 +171,4 @@ async def read_experiment(request: Request) -> Response:
|
|
|
171
171
|
"created_at": experiment.created_at.isoformat(),
|
|
172
172
|
"updated_at": experiment.updated_at.isoformat(),
|
|
173
173
|
}
|
|
174
|
-
|
|
174
|
+
return JSONResponse(content={"data": experiment_payload})
|
|
@@ -21,7 +21,7 @@ async def query_spans_handler(request: Request) -> Response:
|
|
|
21
21
|
tags:
|
|
22
22
|
- private
|
|
23
23
|
parameters:
|
|
24
|
-
- name:
|
|
24
|
+
- name: project_name
|
|
25
25
|
in: query
|
|
26
26
|
schema:
|
|
27
27
|
type: string
|
|
@@ -78,9 +78,11 @@ async def query_spans_handler(request: Request) -> Response:
|
|
|
78
78
|
payload = await request.json()
|
|
79
79
|
queries = payload.pop("queries", [])
|
|
80
80
|
project_name = (
|
|
81
|
-
request.query_params.get("
|
|
82
|
-
#
|
|
83
|
-
or request.headers.get(
|
|
81
|
+
request.query_params.get("project_name")
|
|
82
|
+
or request.query_params.get("project-name") # for backward compatibility
|
|
83
|
+
or request.headers.get(
|
|
84
|
+
"project-name"
|
|
85
|
+
) # read from headers/payload for backward-compatibility
|
|
84
86
|
or payload.get("project_name")
|
|
85
87
|
or DEFAULT_PROJECT_NAME
|
|
86
88
|
)
|
|
@@ -75,6 +75,11 @@ class Experiment(Node):
|
|
|
75
75
|
).all()
|
|
76
76
|
return connection_from_list([to_gql_experiment_run(run) for run in runs], args)
|
|
77
77
|
|
|
78
|
+
@strawberry.field
|
|
79
|
+
async def run_count(self, info: Info[Context, None]) -> int:
|
|
80
|
+
experiment_id = self.id_attr
|
|
81
|
+
return await info.context.data_loaders.experiment_run_counts.load(experiment_id)
|
|
82
|
+
|
|
78
83
|
@strawberry.field
|
|
79
84
|
async def annotation_summaries(
|
|
80
85
|
self, info: Info[Context, None]
|
|
@@ -98,6 +103,13 @@ class Experiment(Node):
|
|
|
98
103
|
async def error_rate(self, info: Info[Context, None]) -> Optional[float]:
|
|
99
104
|
return await info.context.data_loaders.experiment_error_rates.load(self.id_attr)
|
|
100
105
|
|
|
106
|
+
@strawberry.field
|
|
107
|
+
async def average_run_latency_ms(self, info: Info[Context, None]) -> float:
|
|
108
|
+
latency_seconds = await info.context.data_loaders.average_experiment_run_latency.load(
|
|
109
|
+
self.id_attr
|
|
110
|
+
)
|
|
111
|
+
return latency_seconds * 1000
|
|
112
|
+
|
|
101
113
|
@strawberry.field
|
|
102
114
|
async def project(self, info: Info[Context, None]) -> Optional[Project]:
|
|
103
115
|
if self.project_name is None:
|
|
@@ -84,7 +84,7 @@ def to_gql_experiment_run(run: models.ExperimentRun) -> ExperimentRun:
|
|
|
84
84
|
trace_id=trace_id
|
|
85
85
|
if (trace := run.trace) and (trace_id := trace.trace_id) is not None
|
|
86
86
|
else None,
|
|
87
|
-
output=run.output,
|
|
87
|
+
output=run.output.get("result"),
|
|
88
88
|
start_time=run.start_time,
|
|
89
89
|
end_time=run.end_time,
|
|
90
90
|
error=run.error,
|
|
@@ -33,7 +33,7 @@ class ExperimentRunAnnotation(Node):
|
|
|
33
33
|
if (trace := await dataloader.load(self.trace_id)) is None:
|
|
34
34
|
return None
|
|
35
35
|
trace_row_id, project_row_id = trace
|
|
36
|
-
return Trace(id_attr=trace_row_id, trace_id=
|
|
36
|
+
return Trace(id_attr=trace_row_id, trace_id=self.trace_id, project_rowid=project_row_id)
|
|
37
37
|
|
|
38
38
|
|
|
39
39
|
def to_gql_experiment_run_annotation(
|
phoenix/server/app.py
CHANGED
|
@@ -56,6 +56,7 @@ from phoenix.exceptions import PhoenixMigrationError
|
|
|
56
56
|
from phoenix.pointcloud.umap_parameters import UMAPParameters
|
|
57
57
|
from phoenix.server.api.context import Context, DataLoaders
|
|
58
58
|
from phoenix.server.api.dataloaders import (
|
|
59
|
+
AverageExperimentRunLatencyDataLoader,
|
|
59
60
|
CacheForDataLoaders,
|
|
60
61
|
DatasetExampleRevisionsDataLoader,
|
|
61
62
|
DatasetExampleSpansDataLoader,
|
|
@@ -65,6 +66,7 @@ from phoenix.server.api.dataloaders import (
|
|
|
65
66
|
EvaluationSummaryDataLoader,
|
|
66
67
|
ExperimentAnnotationSummaryDataLoader,
|
|
67
68
|
ExperimentErrorRatesDataLoader,
|
|
69
|
+
ExperimentRunCountsDataLoader,
|
|
68
70
|
ExperimentSequenceNumberDataLoader,
|
|
69
71
|
LatencyMsQuantileDataLoader,
|
|
70
72
|
MinStartOrMaxEndTimeDataLoader,
|
|
@@ -190,6 +192,7 @@ class GraphQLWithContext(GraphQL): # type: ignore
|
|
|
190
192
|
export_path=self.export_path,
|
|
191
193
|
streaming_last_updated_at=self.streaming_last_updated_at,
|
|
192
194
|
data_loaders=DataLoaders(
|
|
195
|
+
average_experiment_run_latency=AverageExperimentRunLatencyDataLoader(self.db),
|
|
193
196
|
dataset_example_revisions=DatasetExampleRevisionsDataLoader(self.db),
|
|
194
197
|
dataset_example_spans=DatasetExampleSpansDataLoader(self.db),
|
|
195
198
|
document_evaluation_summaries=DocumentEvaluationSummaryDataLoader(
|
|
@@ -208,6 +211,7 @@ class GraphQLWithContext(GraphQL): # type: ignore
|
|
|
208
211
|
),
|
|
209
212
|
experiment_annotation_summaries=ExperimentAnnotationSummaryDataLoader(self.db),
|
|
210
213
|
experiment_error_rates=ExperimentErrorRatesDataLoader(self.db),
|
|
214
|
+
experiment_run_counts=ExperimentRunCountsDataLoader(self.db),
|
|
211
215
|
experiment_sequence_number=ExperimentSequenceNumberDataLoader(self.db),
|
|
212
216
|
latency_ms_quantile=LatencyMsQuantileDataLoader(
|
|
213
217
|
self.db,
|