arize-phoenix 11.17.0__py3-none-any.whl → 11.19.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of arize-phoenix might be problematic. Click here for more details.
- {arize_phoenix-11.17.0.dist-info → arize_phoenix-11.19.0.dist-info}/METADATA +2 -2
- {arize_phoenix-11.17.0.dist-info → arize_phoenix-11.19.0.dist-info}/RECORD +23 -23
- phoenix/db/helpers.py +27 -0
- phoenix/server/api/helpers/playground_clients.py +2 -0
- phoenix/server/api/queries.py +454 -7
- phoenix/server/api/routers/v1/spans.py +128 -3
- phoenix/server/api/routers/v1/traces.py +36 -15
- phoenix/server/prometheus.py +1 -0
- phoenix/server/static/.vite/manifest.json +51 -45
- phoenix/server/static/assets/{components-B7NKnJXz.js → components-C4HZjMqd.js} +529 -340
- phoenix/server/static/assets/{index-9n9lXgT6.js → index-DwyN9UfD.js} +2 -2
- phoenix/server/static/assets/{pages-CvqPVUA3.js → pages-B1S5DLvL.js} +583 -515
- phoenix/server/static/assets/vendor-BbqekBfb.js +905 -0
- phoenix/server/static/assets/vendor-arizeai-CEwHhYfL.js +168 -0
- phoenix/server/static/assets/vendor-codemirror-CHApHLLJ.js +25 -0
- phoenix/server/static/assets/{vendor-recharts-Cu431IpB.js → vendor-recharts-Bqf7C6Cm.js} +6 -6
- phoenix/server/static/assets/vendor-shiki-BQ88Q1b1.js +5 -0
- phoenix/server/static/assets/{vendor-three-C5WAXd5r.js → vendor-three-BLWp5bic.js} +154 -154
- phoenix/version.py +1 -1
- phoenix/server/static/assets/vendor-_6rG8OMg.js +0 -936
- phoenix/server/static/assets/vendor-arizeai-BznCmJFh.js +0 -168
- phoenix/server/static/assets/vendor-codemirror-29fWLPAy.js +0 -27
- phoenix/server/static/assets/vendor-shiki-Ce9e01lU.js +0 -5
- {arize_phoenix-11.17.0.dist-info → arize_phoenix-11.19.0.dist-info}/WHEEL +0 -0
- {arize_phoenix-11.17.0.dist-info → arize_phoenix-11.19.0.dist-info}/entry_points.txt +0 -0
- {arize_phoenix-11.17.0.dist-info → arize_phoenix-11.19.0.dist-info}/licenses/IP_NOTICE +0 -0
- {arize_phoenix-11.17.0.dist-info → arize_phoenix-11.19.0.dist-info}/licenses/LICENSE +0 -0
phoenix/server/api/queries.py
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
import re
|
|
2
2
|
from collections import defaultdict
|
|
3
3
|
from datetime import datetime
|
|
4
|
-
from typing import Iterable, Iterator, Optional, Union
|
|
4
|
+
from typing import Any, Iterable, Iterator, Optional, Union
|
|
5
5
|
from typing import cast as type_cast
|
|
6
6
|
|
|
7
7
|
import numpy as np
|
|
8
8
|
import numpy.typing as npt
|
|
9
9
|
import strawberry
|
|
10
|
-
from sqlalchemy import String, and_, cast, distinct, func, select, text
|
|
11
|
-
from sqlalchemy.orm import joinedload
|
|
10
|
+
from sqlalchemy import ColumnElement, String, and_, case, cast, distinct, func, select, text
|
|
11
|
+
from sqlalchemy.orm import aliased, joinedload
|
|
12
12
|
from starlette.authentication import UnauthenticatedUser
|
|
13
13
|
from strawberry import ID, UNSET
|
|
14
14
|
from strawberry.relay import Connection, GlobalID, Node
|
|
@@ -23,6 +23,7 @@ from phoenix.config import (
|
|
|
23
23
|
from phoenix.db import models
|
|
24
24
|
from phoenix.db.constants import DEFAULT_PROJECT_TRACE_RETENTION_POLICY_ID
|
|
25
25
|
from phoenix.db.helpers import SupportedSQLDialect, exclude_experiment_projects
|
|
26
|
+
from phoenix.db.models import LatencyMs
|
|
26
27
|
from phoenix.pointcloud.clustering import Hdbscan
|
|
27
28
|
from phoenix.server.api.auth import MSG_ADMIN_ONLY, IsAdmin
|
|
28
29
|
from phoenix.server.api.context import Context
|
|
@@ -106,6 +107,32 @@ class DbTableStats:
|
|
|
106
107
|
num_bytes: float
|
|
107
108
|
|
|
108
109
|
|
|
110
|
+
@strawberry.type
class MetricCounts:
    # Tallies describing how one metric differs between the runs of a base
    # experiment and the runs of a single compare experiment.
    num_increases: int  # rows where the base value is less than the compare value
    num_decreases: int  # rows where the base value is greater than the compare value
    num_equal: int  # rows where the base and compare values are equal
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
@strawberry.type
class CompareExperimentRunMetricCounts:
    # Per-metric comparison tallies for one compare experiment, measured
    # against the base experiment passed to the resolver.
    compare_experiment_id: GlobalID  # the compare experiment these counts belong to
    latency: MetricCounts  # comparison of run latency in milliseconds
    prompt_token_count: MetricCounts  # comparison of summed prompt tokens
    completion_token_count: MetricCounts  # comparison of summed completion tokens
    total_token_count: MetricCounts  # comparison of summed total tokens
    total_cost: MetricCounts  # comparison of summed total cost
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
@strawberry.type
class CompareExperimentRunAnnotationMetricCounts:
    # Score comparison tallies for one annotation name and one compare
    # experiment, measured against the base experiment's annotations.
    annotation_name: str  # name of the experiment run annotation being compared
    compare_experiment_id: GlobalID  # the compare experiment these counts belong to
    num_increases: int  # rows where the base score is less than the compare score
    num_decreases: int  # rows where the base score is greater than the compare score
    num_equal: int  # rows where the base and compare scores are equal
|
|
134
|
+
|
|
135
|
+
|
|
109
136
|
@strawberry.type
|
|
110
137
|
class Query:
|
|
111
138
|
@strawberry.field
|
|
@@ -338,19 +365,19 @@ class Query:
|
|
|
338
365
|
async def compare_experiments(
|
|
339
366
|
self,
|
|
340
367
|
info: Info[Context, None],
|
|
341
|
-
|
|
368
|
+
base_experiment_id: GlobalID,
|
|
342
369
|
compare_experiment_ids: list[GlobalID],
|
|
343
370
|
first: Optional[int] = 50,
|
|
344
371
|
after: Optional[CursorString] = UNSET,
|
|
345
372
|
filter_condition: Optional[str] = UNSET,
|
|
346
373
|
) -> Connection[ExperimentComparison]:
|
|
347
|
-
if
|
|
348
|
-
raise BadRequest("Compare experiment IDs cannot contain the
|
|
374
|
+
if base_experiment_id in compare_experiment_ids:
|
|
375
|
+
raise BadRequest("Compare experiment IDs cannot contain the base experiment ID")
|
|
349
376
|
if len(set(compare_experiment_ids)) < len(compare_experiment_ids):
|
|
350
377
|
raise BadRequest("Compare experiment IDs must be unique")
|
|
351
378
|
experiment_ids = [
|
|
352
379
|
from_global_id_with_expected_type(experiment_id, models.Experiment.__name__)
|
|
353
|
-
for experiment_id in (
|
|
380
|
+
for experiment_id in (base_experiment_id, *compare_experiment_ids)
|
|
354
381
|
]
|
|
355
382
|
cursor = Cursor.from_string(after) if after else None
|
|
356
383
|
page_size = first or 50
|
|
@@ -481,6 +508,409 @@ class Query:
|
|
|
481
508
|
has_next_page=has_next_page,
|
|
482
509
|
)
|
|
483
510
|
|
|
511
|
+
@strawberry.field
async def compare_experiment_run_metric_counts(
    self,
    info: Info[Context, None],
    base_experiment_id: GlobalID,
    compare_experiment_ids: list[GlobalID],
) -> list[CompareExperimentRunMetricCounts]:
    """
    Count, for each compare experiment, how many joined run rows have a higher,
    lower, or equal metric value relative to the base experiment.

    Runs are matched on `dataset_example_id`. The metrics are run latency and
    the per-trace sums of prompt tokens, completion tokens, total tokens, and
    total cost taken from span costs.

    Args:
        info: GraphQL resolver context; provides the database session factory.
        base_experiment_id: Global ID of the experiment used as the baseline.
        compare_experiment_ids: Global IDs of the experiments to compare
            against the baseline. Must be non-empty, unique, and must not
            contain the base experiment ID.

    Returns:
        One `CompareExperimentRunMetricCounts` per compare experiment, in the
        same order as `compare_experiment_ids`.

    Raises:
        BadRequest: If the ID lists fail validation or an ID is not a valid
            experiment global ID.
    """
    if base_experiment_id in compare_experiment_ids:
        raise BadRequest("Compare experiment IDs cannot contain the base experiment ID")
    if not compare_experiment_ids:
        raise BadRequest("At least one compare experiment ID must be provided")
    if len(set(compare_experiment_ids)) < len(compare_experiment_ids):
        raise BadRequest("Compare experiment IDs must be unique")

    try:
        base_experiment_rowid = from_global_id_with_expected_type(
            base_experiment_id, models.Experiment.__name__
        )
    except ValueError:
        raise BadRequest(f"Invalid base experiment ID: {base_experiment_id}")

    compare_experiment_rowids = []
    for compare_experiment_id in compare_experiment_ids:
        try:
            compare_experiment_rowids.append(
                from_global_id_with_expected_type(
                    compare_experiment_id, models.Experiment.__name__
                )
            )
        except ValueError:
            raise BadRequest(f"Invalid compare experiment ID: {compare_experiment_id}")

    base_experiment_runs = (
        select(models.ExperimentRun)
        .where(models.ExperimentRun.experiment_id == base_experiment_rowid)
        .subquery()
        .alias("base_experiment_runs")
    )
    base_experiment_traces = aliased(models.Trace, name="base_experiment_traces")
    base_experiment_span_costs = (
        select(
            models.SpanCost.trace_rowid,
            func.coalesce(func.sum(models.SpanCost.total_tokens), 0).label("total_tokens"),
            func.coalesce(func.sum(models.SpanCost.prompt_tokens), 0).label("prompt_tokens"),
            func.coalesce(func.sum(models.SpanCost.completion_tokens), 0).label(
                "completion_tokens"
            ),
            func.coalesce(func.sum(models.SpanCost.total_cost), 0).label("total_cost"),
        )
        .select_from(models.SpanCost)
        .group_by(models.SpanCost.trace_rowid)
        .subquery()
        .alias("base_experiment_span_costs")
    )

    query = (
        select()  # count columns are added per compare experiment below
        .select_from(base_experiment_runs)
        .join(
            base_experiment_traces,
            onclause=base_experiment_runs.c.trace_id == base_experiment_traces.trace_id,
            isouter=True,
        )
        .join(
            base_experiment_span_costs,
            onclause=base_experiment_traces.id == base_experiment_span_costs.c.trace_rowid,
            isouter=True,
        )
    )

    # (metric name used in column labels, base-side expression). The order here
    # fixes the order of the selected columns and of the result unpacking below.
    base_metrics: tuple[tuple[str, ColumnElement[Any]], ...] = (
        (
            "latency",
            LatencyMs(base_experiment_runs.c.start_time, base_experiment_runs.c.end_time).label(
                "base_experiment_run_latency_ms"
            ),
        ),
        ("prompt_token_count", base_experiment_span_costs.c.prompt_tokens),
        ("completion_token_count", base_experiment_span_costs.c.completion_tokens),
        ("total_token_count", base_experiment_span_costs.c.total_tokens),
        ("total_cost", base_experiment_span_costs.c.total_cost),
    )
    num_comparisons = 3  # increased, decreased, equal
    num_columns_per_compare_experiment = num_comparisons * len(base_metrics)

    for compare_experiment_index, compare_experiment_rowid in enumerate(
        compare_experiment_rowids
    ):
        compare_experiment_runs = (
            select(models.ExperimentRun)
            .where(models.ExperimentRun.experiment_id == compare_experiment_rowid)
            .subquery()
            .alias(f"comp_exp_{compare_experiment_index}_runs")
        )
        compare_experiment_traces = aliased(
            models.Trace, name=f"comp_exp_{compare_experiment_index}_traces"
        )
        compare_experiment_span_costs = (
            select(
                models.SpanCost.trace_rowid,
                func.coalesce(func.sum(models.SpanCost.total_tokens), 0).label("total_tokens"),
                func.coalesce(func.sum(models.SpanCost.prompt_tokens), 0).label(
                    "prompt_tokens"
                ),
                func.coalesce(func.sum(models.SpanCost.completion_tokens), 0).label(
                    "completion_tokens"
                ),
                func.coalesce(func.sum(models.SpanCost.total_cost), 0).label("total_cost"),
            )
            .select_from(models.SpanCost)
            .group_by(models.SpanCost.trace_rowid)
            .subquery()
            .alias(f"comp_exp_{compare_experiment_index}_span_costs")
        )

        # Compare-side expressions, in the same order as `base_metrics`.
        compare_values: tuple[ColumnElement[Any], ...] = (
            LatencyMs(
                compare_experiment_runs.c.start_time, compare_experiment_runs.c.end_time
            ).label(f"comp_exp_{compare_experiment_index}_run_latency_ms"),
            compare_experiment_span_costs.c.prompt_tokens,
            compare_experiment_span_costs.c.completion_tokens,
            compare_experiment_span_costs.c.total_tokens,
            compare_experiment_span_costs.c.total_cost,
        )

        # Three count columns per metric, always in the order
        # increased / decreased / equal.
        label_prefix = f"comp_exp_{compare_experiment_index}_num_runs"
        count_columns = []
        for (metric_name, base_value), compare_value in zip(base_metrics, compare_values):
            count_columns.extend(
                (
                    _count_rows(base_value < compare_value).label(
                        f"{label_prefix}_increased_{metric_name}"
                    ),
                    _count_rows(base_value > compare_value).label(
                        f"{label_prefix}_decreased_{metric_name}"
                    ),
                    _count_rows(base_value == compare_value).label(
                        f"{label_prefix}_equal_{metric_name}"
                    ),
                )
            )

        query = (
            query.add_columns(*count_columns)
            .join(
                compare_experiment_runs,
                onclause=base_experiment_runs.c.dataset_example_id
                == compare_experiment_runs.c.dataset_example_id,
                isouter=True,
            )
            .join(
                compare_experiment_traces,
                onclause=compare_experiment_runs.c.trace_id
                == compare_experiment_traces.trace_id,
                isouter=True,
            )
            .join(
                compare_experiment_span_costs,
                onclause=compare_experiment_traces.id
                == compare_experiment_span_costs.c.trace_rowid,
                isouter=True,
            )
        )

    async with info.context.db() as session:
        # The statement selects only aggregates and has no GROUP BY, so exactly
        # one row is expected; `one()` raises instead of relying on an assert.
        result = (await session.execute(query)).one()

    counts = []
    for compare_experiment_index, compare_experiment_id in enumerate(compare_experiment_ids):
        start_index = compare_experiment_index * num_columns_per_compare_experiment
        end_index = start_index + num_columns_per_compare_experiment
        experiment_counts = result[start_index:end_index]
        (
            latency,
            prompt_token_count,
            completion_token_count,
            total_token_count,
            total_cost,
        ) = (
            MetricCounts(
                num_increases=experiment_counts[offset],
                num_decreases=experiment_counts[offset + 1],
                num_equal=experiment_counts[offset + 2],
            )
            for offset in range(0, num_columns_per_compare_experiment, num_comparisons)
        )
        counts.append(
            CompareExperimentRunMetricCounts(
                compare_experiment_id=compare_experiment_id,
                latency=latency,
                prompt_token_count=prompt_token_count,
                completion_token_count=completion_token_count,
                total_token_count=total_token_count,
                total_cost=total_cost,
            )
        )
    return counts
|
|
784
|
+
|
|
785
|
+
@strawberry.field
async def compare_experiment_run_annotation_metric_counts(
    self,
    info: Info[Context, None],
    base_experiment_id: GlobalID,
    compare_experiment_ids: list[GlobalID],
) -> list[CompareExperimentRunAnnotationMetricCounts]:
    """
    Count, per annotation name and per compare experiment, how many joined
    rows have a higher, lower, or equal annotation score relative to the base
    experiment.

    Runs are matched on `dataset_example_id`, and annotations are matched on
    annotation name.

    Args:
        info: GraphQL resolver context; provides the database session factory.
        base_experiment_id: Global ID of the experiment used as the baseline.
        compare_experiment_ids: Global IDs of the experiments to compare
            against the baseline. Must be non-empty, unique, and must not
            contain the base experiment ID.

    Returns:
        One `CompareExperimentRunAnnotationMetricCounts` per (annotation name,
        compare experiment) pair, ordered by annotation name and then by the
        order of `compare_experiment_ids`.

    Raises:
        BadRequest: If the ID lists fail validation or an ID is not a valid
            experiment global ID.
    """
    if base_experiment_id in compare_experiment_ids:
        raise BadRequest("Compare experiment IDs cannot contain the base experiment ID")
    if not compare_experiment_ids:
        raise BadRequest("At least one compare experiment ID must be provided")
    if len(set(compare_experiment_ids)) < len(compare_experiment_ids):
        raise BadRequest("Compare experiment IDs must be unique")

    try:
        base_experiment_rowid = from_global_id_with_expected_type(
            base_experiment_id, models.Experiment.__name__
        )
    except ValueError:
        raise BadRequest(f"Invalid base experiment ID: {base_experiment_id}")

    compare_experiment_rowids = []
    for compare_experiment_id in compare_experiment_ids:
        try:
            compare_experiment_rowids.append(
                from_global_id_with_expected_type(
                    compare_experiment_id, models.Experiment.__name__
                )
            )
        except ValueError:
            raise BadRequest(f"Invalid compare experiment ID: {compare_experiment_id}")

    base_experiment_runs = (
        select(models.ExperimentRun)
        .where(
            models.ExperimentRun.experiment_id == base_experiment_rowid,
        )
        .subquery()
        .alias("base_experiment_runs")
    )
    base_experiment_run_annotations = aliased(
        models.ExperimentRunAnnotation, name="base_experiment_run_annotations"
    )
    # The first selected column is the annotation name; each compare experiment
    # then appends its own group of count columns.
    query = (
        select(base_experiment_run_annotations.name)
        .select_from(base_experiment_runs)
        .join(
            base_experiment_run_annotations,
            onclause=base_experiment_runs.c.id
            == base_experiment_run_annotations.experiment_run_id,
            isouter=True,
        )
        .group_by(base_experiment_run_annotations.name)
        .order_by(base_experiment_run_annotations.name)
    )

    # increased / decreased / equal score counts are added per compare experiment
    num_columns_per_compare_experiment = 3

    base_score = base_experiment_run_annotations.score
    for compare_experiment_index, compare_experiment_rowid in enumerate(
        compare_experiment_rowids
    ):
        compare_experiment_runs = (
            select(models.ExperimentRun)
            .where(
                models.ExperimentRun.experiment_id == compare_experiment_rowid,
            )
            .subquery()
            .alias(f"comp_exp_{compare_experiment_index}_runs")
        )
        compare_experiment_run_annotations = aliased(
            models.ExperimentRunAnnotation,
            name=f"comp_exp_{compare_experiment_index}_run_annotations",
        )
        compare_score = compare_experiment_run_annotations.score
        query = (
            query.add_columns(
                _count_rows(base_score < compare_score).label(
                    f"comp_exp_{compare_experiment_index}_num_runs_increased_score"
                ),
                _count_rows(base_score > compare_score).label(
                    f"comp_exp_{compare_experiment_index}_num_runs_decreased_score"
                ),
                _count_rows(base_score == compare_score).label(
                    f"comp_exp_{compare_experiment_index}_num_runs_equal_score"
                ),
            )
            .join(
                compare_experiment_runs,
                onclause=base_experiment_runs.c.dataset_example_id
                == compare_experiment_runs.c.dataset_example_id,
                isouter=True,
            )
            .join(
                compare_experiment_run_annotations,
                onclause=compare_experiment_runs.c.id
                == compare_experiment_run_annotations.experiment_run_id,
                isouter=True,
            )
            # Only compare annotations that share a name with the base annotation.
            .where(
                base_experiment_run_annotations.name == compare_experiment_run_annotations.name
            )
        )

    async with info.context.db() as session:
        # `all()` always returns a (possibly empty) list, so no None check is needed.
        result = (await session.execute(query)).all()

    metric_counts = []
    for record in result:
        annotation_name, *counts = record
        for compare_experiment_index, compare_experiment_id in enumerate(
            compare_experiment_ids
        ):
            start_index = compare_experiment_index * num_columns_per_compare_experiment
            end_index = start_index + num_columns_per_compare_experiment
            (
                num_runs_with_increased_score,
                num_runs_with_decreased_score,
                num_runs_with_equal_score,
            ) = counts[start_index:end_index]
            metric_counts.append(
                CompareExperimentRunAnnotationMetricCounts(
                    annotation_name=annotation_name,
                    compare_experiment_id=compare_experiment_id,
                    num_increases=num_runs_with_increased_score,
                    num_decreases=num_runs_with_decreased_score,
                    num_equal=num_runs_with_equal_score,
                )
            )
    return metric_counts
|
|
913
|
+
|
|
484
914
|
@strawberry.field
|
|
485
915
|
async def validate_experiment_run_filter_condition(
|
|
486
916
|
self,
|
|
@@ -1106,3 +1536,20 @@ def _longest_matching_prefix(s: str, prefixes: Iterable[str]) -> str:
|
|
|
1106
1536
|
if s.startswith(prefix) and len(prefix) > len(longest):
|
|
1107
1537
|
longest = prefix
|
|
1108
1538
|
return longest
|
|
1539
|
+
|
|
1540
|
+
|
|
1541
|
+
def _count_rows(
    condition: ColumnElement[Any],
) -> ColumnElement[Any]:
    """
    Build an aggregate expression giving the number of rows for which
    `condition` holds.

    Each row contributes 1 when the condition is true and 0 otherwise; the
    contributions are summed, and a NULL sum is replaced by 0.
    """
    one_if_matching = case(
        (condition, 1),
        else_=0,
    )
    matching_total = func.sum(one_if_matching)
    return func.coalesce(matching_total, 0)
|
|
@@ -8,9 +8,10 @@ from secrets import token_urlsafe
|
|
|
8
8
|
from typing import Annotated, Any, Literal, Optional, Union
|
|
9
9
|
|
|
10
10
|
import pandas as pd
|
|
11
|
+
import sqlalchemy as sa
|
|
11
12
|
from fastapi import APIRouter, Depends, Header, HTTPException, Path, Query
|
|
12
13
|
from pydantic import BaseModel, Field
|
|
13
|
-
from sqlalchemy import select
|
|
14
|
+
from sqlalchemy import exists, select, update
|
|
14
15
|
from starlette.requests import Request
|
|
15
16
|
from starlette.responses import Response, StreamingResponse
|
|
16
17
|
from starlette.status import (
|
|
@@ -24,13 +25,14 @@ from strawberry.relay import GlobalID
|
|
|
24
25
|
from phoenix.config import DEFAULT_PROJECT_NAME
|
|
25
26
|
from phoenix.datetime_utils import normalize_datetime
|
|
26
27
|
from phoenix.db import models
|
|
27
|
-
from phoenix.db.helpers import SupportedSQLDialect
|
|
28
|
+
from phoenix.db.helpers import SupportedSQLDialect, get_ancestor_span_rowids
|
|
28
29
|
from phoenix.db.insertion.helpers import as_kv, insert_on_conflict
|
|
29
30
|
from phoenix.db.insertion.types import Precursors
|
|
30
31
|
from phoenix.server.api.routers.utils import df_to_bytes
|
|
32
|
+
from phoenix.server.api.types.node import from_global_id_with_expected_type
|
|
31
33
|
from phoenix.server.authorization import is_not_locked
|
|
32
34
|
from phoenix.server.bearer_auth import PhoenixUser
|
|
33
|
-
from phoenix.server.dml_event import SpanAnnotationInsertEvent
|
|
35
|
+
from phoenix.server.dml_event import SpanAnnotationInsertEvent, SpanDeleteEvent
|
|
34
36
|
from phoenix.trace.attributes import flatten
|
|
35
37
|
from phoenix.trace.dsl import SpanQuery as SpanQuery_
|
|
36
38
|
from phoenix.trace.schemas import (
|
|
@@ -1119,3 +1121,126 @@ async def create_spans(
|
|
|
1119
1121
|
total_received=total_received,
|
|
1120
1122
|
total_queued=len(spans_to_queue),
|
|
1121
1123
|
)
|
|
1124
|
+
|
|
1125
|
+
|
|
1126
|
+
@router.delete(
    "/spans/{span_identifier}",
    dependencies=[Depends(is_not_locked)],
    operation_id="deleteSpan",
    summary="Delete a span by span_identifier",
    description=(
        """
        Delete a single span by identifier.

        **Important**: This operation deletes ONLY the specified span itself and does NOT
        delete its descendants/children. All child spans will remain in the trace and
        become orphaned (their parent_id will point to a non-existent span).

        Behavior:
        - Deletes only the target span (preserves all descendant spans)
        - If this was the last span in the trace, the trace record is also deleted
        - If the deleted span had a parent, its cumulative metrics (error count, token counts)
          are subtracted from all ancestor spans in the chain

        **Note**: This operation is irreversible and may create orphaned spans.
        """
    ),
    responses=add_errors_to_responses([HTTP_404_NOT_FOUND]),
    status_code=204,  # No Content for successful deletion
)
async def delete_span(
    request: Request,
    span_identifier: str = Path(
        description="The span identifier: either a relay GlobalID or OpenTelemetry span_id"
    ),
) -> None:
    """
    Delete a single span by identifier.

    This operation deletes ONLY the specified span and preserves all its descendants,
    which may become orphaned (parent_id pointing to non-existent span).

    Steps:
    1. Find the target span to delete (supports both GlobalID and OpenTelemetry span_id)
    2. Delete only the target span (all descendants remain untouched)
    3. If trace becomes empty, delete the trace record
    4. If deleted span had a parent, subtract its cumulative metrics from ancestor chain
    5. Return 204 No Content on success

    Args:
        request: FastAPI request object
        span_identifier: Either relay GlobalID or OpenTelemetry span_id

    Raises:
        HTTPException(404): If span not found

    Returns:
        None (204 No Content status)
    """
    async with request.app.state.db() as session:
        # Step 1: Determine the predicate for deletion based on identifier type.
        # The identifier is first parsed as a relay GlobalID for a "Span"; if any
        # part of that parsing fails, it is treated as an OpenTelemetry span_id.
        try:
            span_rowid = from_global_id_with_expected_type(
                GlobalID.from_id(span_identifier),
                "Span",
            )
            predicate = models.Span.id == span_rowid
            error_detail = f"Span with relay ID '{span_identifier}' not found"
        except Exception:
            # NOTE(review): the broad catch is what makes the span_id fallback work
            # for every parse failure; narrowing it requires knowing exactly which
            # exceptions GlobalID.from_id can raise — confirm before changing.
            predicate = models.Span.span_id == span_identifier
            error_detail = f"Span with span_id '{span_identifier}' not found"

        # Step 2: Delete the span and return its data in one operation
        target_span = await session.scalar(
            sa.delete(models.Span).where(predicate).returning(models.Span)
        )

        if target_span is None:
            raise HTTPException(
                status_code=HTTP_404_NOT_FOUND,
                detail=error_detail,
            )

        # Store values needed for later operations
        trace_rowid = target_span.trace_rowid
        parent_id = target_span.parent_id
        cumulative_error_count = target_span.cumulative_error_count
        cumulative_llm_token_count_prompt = target_span.cumulative_llm_token_count_prompt
        cumulative_llm_token_count_completion = target_span.cumulative_llm_token_count_completion

        # Step 3: Check if trace is empty—if so, delete the trace record
        trace_is_empty = await session.scalar(
            select(~exists().where(models.Span.trace_rowid == trace_rowid))
        )

        if trace_is_empty:
            # Trace is empty, delete the trace record
            await session.execute(sa.delete(models.Trace).where(models.Trace.id == trace_rowid))

        # Step 4: Propagate negative cumulative values up ancestor chain if parent_id is not null
        # (skipped when the trace is empty, since no spans remain in the trace to update)
        if not trace_is_empty and parent_id is not None:
            # Use the helper function to get all ancestor span IDs
            ancestor_ids_query = get_ancestor_span_rowids(parent_id)

            # Propagate negative cumulative values to ancestors
            await session.execute(
                update(models.Span)
                .where(models.Span.id.in_(ancestor_ids_query))
                .values(
                    cumulative_error_count=(
                        models.Span.cumulative_error_count - cumulative_error_count
                    ),
                    cumulative_llm_token_count_prompt=(
                        models.Span.cumulative_llm_token_count_prompt
                        - cumulative_llm_token_count_prompt
                    ),
                    cumulative_llm_token_count_completion=(
                        models.Span.cumulative_llm_token_count_completion
                        - cumulative_llm_token_count_completion
                    ),
                )
            )
    # Trigger cache invalidation event
    # NOTE(review): the event carries trace_rowid; confirm this is the kind of ID
    # that SpanDeleteEvent consumers expect.
    request.state.event_queue.put(SpanDeleteEvent((trace_rowid,)))

    return None
|