arize-phoenix 11.17.0__py3-none-any.whl → 11.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of arize-phoenix might be problematic. See the package registry's advisory page for more details.

Files changed (27)
  1. {arize_phoenix-11.17.0.dist-info → arize_phoenix-11.19.0.dist-info}/METADATA +2 -2
  2. {arize_phoenix-11.17.0.dist-info → arize_phoenix-11.19.0.dist-info}/RECORD +23 -23
  3. phoenix/db/helpers.py +27 -0
  4. phoenix/server/api/helpers/playground_clients.py +2 -0
  5. phoenix/server/api/queries.py +454 -7
  6. phoenix/server/api/routers/v1/spans.py +128 -3
  7. phoenix/server/api/routers/v1/traces.py +36 -15
  8. phoenix/server/prometheus.py +1 -0
  9. phoenix/server/static/.vite/manifest.json +51 -45
  10. phoenix/server/static/assets/{components-B7NKnJXz.js → components-C4HZjMqd.js} +529 -340
  11. phoenix/server/static/assets/{index-9n9lXgT6.js → index-DwyN9UfD.js} +2 -2
  12. phoenix/server/static/assets/{pages-CvqPVUA3.js → pages-B1S5DLvL.js} +583 -515
  13. phoenix/server/static/assets/vendor-BbqekBfb.js +905 -0
  14. phoenix/server/static/assets/vendor-arizeai-CEwHhYfL.js +168 -0
  15. phoenix/server/static/assets/vendor-codemirror-CHApHLLJ.js +25 -0
  16. phoenix/server/static/assets/{vendor-recharts-Cu431IpB.js → vendor-recharts-Bqf7C6Cm.js} +6 -6
  17. phoenix/server/static/assets/vendor-shiki-BQ88Q1b1.js +5 -0
  18. phoenix/server/static/assets/{vendor-three-C5WAXd5r.js → vendor-three-BLWp5bic.js} +154 -154
  19. phoenix/version.py +1 -1
  20. phoenix/server/static/assets/vendor-_6rG8OMg.js +0 -936
  21. phoenix/server/static/assets/vendor-arizeai-BznCmJFh.js +0 -168
  22. phoenix/server/static/assets/vendor-codemirror-29fWLPAy.js +0 -27
  23. phoenix/server/static/assets/vendor-shiki-Ce9e01lU.js +0 -5
  24. {arize_phoenix-11.17.0.dist-info → arize_phoenix-11.19.0.dist-info}/WHEEL +0 -0
  25. {arize_phoenix-11.17.0.dist-info → arize_phoenix-11.19.0.dist-info}/entry_points.txt +0 -0
  26. {arize_phoenix-11.17.0.dist-info → arize_phoenix-11.19.0.dist-info}/licenses/IP_NOTICE +0 -0
  27. {arize_phoenix-11.17.0.dist-info → arize_phoenix-11.19.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,14 +1,14 @@
1
1
  import re
2
2
  from collections import defaultdict
3
3
  from datetime import datetime
4
- from typing import Iterable, Iterator, Optional, Union
4
+ from typing import Any, Iterable, Iterator, Optional, Union
5
5
  from typing import cast as type_cast
6
6
 
7
7
  import numpy as np
8
8
  import numpy.typing as npt
9
9
  import strawberry
10
- from sqlalchemy import String, and_, cast, distinct, func, select, text
11
- from sqlalchemy.orm import joinedload
10
+ from sqlalchemy import ColumnElement, String, and_, case, cast, distinct, func, select, text
11
+ from sqlalchemy.orm import aliased, joinedload
12
12
  from starlette.authentication import UnauthenticatedUser
13
13
  from strawberry import ID, UNSET
14
14
  from strawberry.relay import Connection, GlobalID, Node
@@ -23,6 +23,7 @@ from phoenix.config import (
23
23
  from phoenix.db import models
24
24
  from phoenix.db.constants import DEFAULT_PROJECT_TRACE_RETENTION_POLICY_ID
25
25
  from phoenix.db.helpers import SupportedSQLDialect, exclude_experiment_projects
26
+ from phoenix.db.models import LatencyMs
26
27
  from phoenix.pointcloud.clustering import Hdbscan
27
28
  from phoenix.server.api.auth import MSG_ADMIN_ONLY, IsAdmin
28
29
  from phoenix.server.api.context import Context
@@ -106,6 +107,32 @@ class DbTableStats:
106
107
  num_bytes: float
107
108
 
108
109
 
110
+ @strawberry.type
111
+ class MetricCounts:
112
+ num_increases: int
113
+ num_decreases: int
114
+ num_equal: int
115
+
116
+
117
+ @strawberry.type
118
+ class CompareExperimentRunMetricCounts:
119
+ compare_experiment_id: GlobalID
120
+ latency: MetricCounts
121
+ prompt_token_count: MetricCounts
122
+ completion_token_count: MetricCounts
123
+ total_token_count: MetricCounts
124
+ total_cost: MetricCounts
125
+
126
+
127
+ @strawberry.type
128
+ class CompareExperimentRunAnnotationMetricCounts:
129
+ annotation_name: str
130
+ compare_experiment_id: GlobalID
131
+ num_increases: int
132
+ num_decreases: int
133
+ num_equal: int
134
+
135
+
109
136
  @strawberry.type
110
137
  class Query:
111
138
  @strawberry.field
@@ -338,19 +365,19 @@ class Query:
338
365
  async def compare_experiments(
339
366
  self,
340
367
  info: Info[Context, None],
341
- baseline_experiment_id: GlobalID,
368
+ base_experiment_id: GlobalID,
342
369
  compare_experiment_ids: list[GlobalID],
343
370
  first: Optional[int] = 50,
344
371
  after: Optional[CursorString] = UNSET,
345
372
  filter_condition: Optional[str] = UNSET,
346
373
  ) -> Connection[ExperimentComparison]:
347
- if baseline_experiment_id in compare_experiment_ids:
348
- raise BadRequest("Compare experiment IDs cannot contain the baseline experiment ID")
374
+ if base_experiment_id in compare_experiment_ids:
375
+ raise BadRequest("Compare experiment IDs cannot contain the base experiment ID")
349
376
  if len(set(compare_experiment_ids)) < len(compare_experiment_ids):
350
377
  raise BadRequest("Compare experiment IDs must be unique")
351
378
  experiment_ids = [
352
379
  from_global_id_with_expected_type(experiment_id, models.Experiment.__name__)
353
- for experiment_id in (baseline_experiment_id, *compare_experiment_ids)
380
+ for experiment_id in (base_experiment_id, *compare_experiment_ids)
354
381
  ]
355
382
  cursor = Cursor.from_string(after) if after else None
356
383
  page_size = first or 50
@@ -481,6 +508,409 @@ class Query:
481
508
  has_next_page=has_next_page,
482
509
  )
483
510
 
511
+ @strawberry.field
512
+ async def compare_experiment_run_metric_counts(
513
+ self,
514
+ info: Info[Context, None],
515
+ base_experiment_id: GlobalID,
516
+ compare_experiment_ids: list[GlobalID],
517
+ ) -> list[CompareExperimentRunMetricCounts]:
518
+ if base_experiment_id in compare_experiment_ids:
519
+ raise BadRequest("Compare experiment IDs cannot contain the base experiment ID")
520
+ if not compare_experiment_ids:
521
+ raise BadRequest("At least one compare experiment ID must be provided")
522
+ if len(set(compare_experiment_ids)) < len(compare_experiment_ids):
523
+ raise BadRequest("Compare experiment IDs must be unique")
524
+
525
+ try:
526
+ base_experiment_rowid = from_global_id_with_expected_type(
527
+ base_experiment_id, models.Experiment.__name__
528
+ )
529
+ except ValueError:
530
+ raise BadRequest(f"Invalid base experiment ID: {base_experiment_id}")
531
+
532
+ compare_experiment_rowids = []
533
+ for compare_experiment_id in compare_experiment_ids:
534
+ try:
535
+ compare_experiment_rowids.append(
536
+ from_global_id_with_expected_type(
537
+ compare_experiment_id, models.Experiment.__name__
538
+ )
539
+ )
540
+ except ValueError:
541
+ raise BadRequest(f"Invalid compare experiment ID: {compare_experiment_id}")
542
+
543
+ base_experiment_runs = (
544
+ select(models.ExperimentRun)
545
+ .where(models.ExperimentRun.experiment_id == base_experiment_rowid)
546
+ .subquery()
547
+ .alias("base_experiment_runs")
548
+ )
549
+ base_experiment_traces = aliased(models.Trace, name="base_experiment_traces")
550
+ base_experiment_span_costs = (
551
+ select(
552
+ models.SpanCost.trace_rowid,
553
+ func.coalesce(func.sum(models.SpanCost.total_tokens), 0).label("total_tokens"),
554
+ func.coalesce(func.sum(models.SpanCost.prompt_tokens), 0).label("prompt_tokens"),
555
+ func.coalesce(func.sum(models.SpanCost.completion_tokens), 0).label(
556
+ "completion_tokens"
557
+ ),
558
+ func.coalesce(func.sum(models.SpanCost.total_cost), 0).label("total_cost"),
559
+ )
560
+ .select_from(models.SpanCost)
561
+ .group_by(
562
+ models.SpanCost.trace_rowid,
563
+ )
564
+ .subquery()
565
+ .alias("base_experiment_span_costs")
566
+ )
567
+
568
+ query = (
569
+ select() # add selected columns below
570
+ .select_from(base_experiment_runs)
571
+ .join(
572
+ base_experiment_traces,
573
+ onclause=base_experiment_runs.c.trace_id == base_experiment_traces.trace_id,
574
+ isouter=True,
575
+ )
576
+ .join(
577
+ base_experiment_span_costs,
578
+ onclause=base_experiment_traces.id == base_experiment_span_costs.c.trace_rowid,
579
+ isouter=True,
580
+ )
581
+ )
582
+
583
+ base_experiment_run_latency = LatencyMs(
584
+ base_experiment_runs.c.start_time, base_experiment_runs.c.end_time
585
+ ).label("base_experiment_run_latency_ms")
586
+ base_experiment_run_prompt_token_count = base_experiment_span_costs.c.prompt_tokens
587
+ base_experiment_run_completion_token_count = base_experiment_span_costs.c.completion_tokens
588
+ base_experiment_run_total_token_count = base_experiment_span_costs.c.total_tokens
589
+ base_experiment_run_total_cost = base_experiment_span_costs.c.total_cost
590
+
591
+ for compare_experiment_index, compare_experiment_rowid in enumerate(
592
+ compare_experiment_rowids
593
+ ):
594
+ compare_experiment_runs = (
595
+ select(models.ExperimentRun)
596
+ .where(models.ExperimentRun.experiment_id == compare_experiment_rowid)
597
+ .subquery()
598
+ .alias(f"comp_exp_{compare_experiment_index}_runs")
599
+ )
600
+ compare_experiment_traces = aliased(
601
+ models.Trace, name=f"comp_exp_{compare_experiment_index}_traces"
602
+ )
603
+ compare_experiment_span_costs = (
604
+ select(
605
+ models.SpanCost.trace_rowid,
606
+ func.coalesce(func.sum(models.SpanCost.total_tokens), 0).label("total_tokens"),
607
+ func.coalesce(func.sum(models.SpanCost.prompt_tokens), 0).label(
608
+ "prompt_tokens"
609
+ ),
610
+ func.coalesce(func.sum(models.SpanCost.completion_tokens), 0).label(
611
+ "completion_tokens"
612
+ ),
613
+ func.coalesce(func.sum(models.SpanCost.total_cost), 0).label("total_cost"),
614
+ )
615
+ .select_from(models.SpanCost)
616
+ .group_by(models.SpanCost.trace_rowid)
617
+ .subquery()
618
+ .alias(f"comp_exp_{compare_experiment_index}_span_costs")
619
+ )
620
+ compare_experiment_run_latency = LatencyMs(
621
+ compare_experiment_runs.c.start_time, compare_experiment_runs.c.end_time
622
+ ).label(f"comp_exp_{compare_experiment_index}_run_latency_ms")
623
+ compare_experiment_run_prompt_token_count = (
624
+ compare_experiment_span_costs.c.prompt_tokens
625
+ )
626
+ compare_experiment_run_completion_token_count = (
627
+ compare_experiment_span_costs.c.completion_tokens
628
+ )
629
+ compare_experiment_run_total_token_count = compare_experiment_span_costs.c.total_tokens
630
+ compare_experiment_run_total_cost = compare_experiment_span_costs.c.total_cost
631
+
632
+ query = (
633
+ query.add_columns(
634
+ _count_rows(
635
+ base_experiment_run_latency < compare_experiment_run_latency,
636
+ ).label(f"comp_exp_{compare_experiment_index}_num_runs_increased_latency"),
637
+ _count_rows(
638
+ base_experiment_run_latency > compare_experiment_run_latency,
639
+ ).label(f"comp_exp_{compare_experiment_index}_num_runs_decreased_latency"),
640
+ _count_rows(
641
+ base_experiment_run_latency == compare_experiment_run_latency,
642
+ ).label(f"comp_exp_{compare_experiment_index}_num_runs_equal_latency"),
643
+ _count_rows(
644
+ base_experiment_run_prompt_token_count
645
+ < compare_experiment_run_prompt_token_count,
646
+ ).label(
647
+ f"comp_exp_{compare_experiment_index}_num_runs_increased_prompt_token_count"
648
+ ),
649
+ _count_rows(
650
+ base_experiment_run_prompt_token_count
651
+ > compare_experiment_run_prompt_token_count,
652
+ ).label(
653
+ f"comp_exp_{compare_experiment_index}_num_runs_decreased_prompt_token_count"
654
+ ),
655
+ _count_rows(
656
+ base_experiment_run_prompt_token_count
657
+ == compare_experiment_run_prompt_token_count,
658
+ ).label(
659
+ f"comp_exp_{compare_experiment_index}_num_runs_equal_prompt_token_count"
660
+ ),
661
+ _count_rows(
662
+ base_experiment_run_completion_token_count
663
+ < compare_experiment_run_completion_token_count,
664
+ ).label(
665
+ f"comp_exp_{compare_experiment_index}_num_runs_increased_completion_token_count"
666
+ ),
667
+ _count_rows(
668
+ base_experiment_run_completion_token_count
669
+ > compare_experiment_run_completion_token_count,
670
+ ).label(
671
+ f"comp_exp_{compare_experiment_index}_num_runs_decreased_completion_token_count"
672
+ ),
673
+ _count_rows(
674
+ base_experiment_run_completion_token_count
675
+ == compare_experiment_run_completion_token_count,
676
+ ).label(
677
+ f"comp_exp_{compare_experiment_index}_num_runs_equal_completion_token_count"
678
+ ),
679
+ _count_rows(
680
+ base_experiment_run_total_token_count
681
+ < compare_experiment_run_total_token_count,
682
+ ).label(
683
+ f"comp_exp_{compare_experiment_index}_num_runs_increased_total_token_count"
684
+ ),
685
+ _count_rows(
686
+ base_experiment_run_total_token_count
687
+ > compare_experiment_run_total_token_count,
688
+ ).label(
689
+ f"comp_exp_{compare_experiment_index}_num_runs_decreased_total_token_count"
690
+ ),
691
+ _count_rows(
692
+ base_experiment_run_total_token_count
693
+ == compare_experiment_run_total_token_count,
694
+ ).label(
695
+ f"comp_exp_{compare_experiment_index}_num_runs_equal_total_token_count"
696
+ ),
697
+ _count_rows(
698
+ base_experiment_run_total_cost < compare_experiment_run_total_cost,
699
+ ).label(f"comp_exp_{compare_experiment_index}_num_runs_increased_total_cost"),
700
+ _count_rows(
701
+ base_experiment_run_total_cost > compare_experiment_run_total_cost,
702
+ ).label(f"comp_exp_{compare_experiment_index}_num_runs_decreased_total_cost"),
703
+ _count_rows(
704
+ base_experiment_run_total_cost == compare_experiment_run_total_cost,
705
+ ).label(f"comp_exp_{compare_experiment_index}_num_runs_equal_total_cost"),
706
+ )
707
+ .join(
708
+ compare_experiment_runs,
709
+ onclause=base_experiment_runs.c.dataset_example_id
710
+ == compare_experiment_runs.c.dataset_example_id,
711
+ isouter=True,
712
+ )
713
+ .join(
714
+ compare_experiment_traces,
715
+ onclause=compare_experiment_runs.c.trace_id
716
+ == compare_experiment_traces.trace_id,
717
+ isouter=True,
718
+ )
719
+ .join(
720
+ compare_experiment_span_costs,
721
+ onclause=compare_experiment_traces.id
722
+ == compare_experiment_span_costs.c.trace_rowid,
723
+ isouter=True,
724
+ )
725
+ )
726
+
727
+ async with info.context.db() as session:
728
+ result = (await session.execute(query)).first()
729
+ assert result is not None
730
+
731
+ num_columns_per_compare_experiment = len(query.columns) // len(compare_experiment_ids)
732
+ counts = []
733
+ for compare_experiment_index, compare_experiment_id in enumerate(compare_experiment_ids):
734
+ start_index = compare_experiment_index * num_columns_per_compare_experiment
735
+ end_index = start_index + num_columns_per_compare_experiment
736
+ (
737
+ num_runs_with_increased_latency,
738
+ num_runs_with_decreased_latency,
739
+ num_runs_with_equal_latency,
740
+ num_runs_with_increased_prompt_token_count,
741
+ num_runs_with_decreased_prompt_token_count,
742
+ num_runs_with_equal_prompt_token_count,
743
+ num_runs_with_increased_completion_token_count,
744
+ num_runs_with_decreased_completion_token_count,
745
+ num_runs_with_equal_completion_token_count,
746
+ num_runs_with_increased_total_token_count,
747
+ num_runs_with_decreased_total_token_count,
748
+ num_runs_with_equal_total_token_count,
749
+ num_runs_with_increased_total_cost,
750
+ num_runs_with_decreased_total_cost,
751
+ num_runs_with_equal_total_cost,
752
+ ) = result[start_index:end_index]
753
+ counts.append(
754
+ CompareExperimentRunMetricCounts(
755
+ compare_experiment_id=compare_experiment_id,
756
+ latency=MetricCounts(
757
+ num_increases=num_runs_with_increased_latency,
758
+ num_decreases=num_runs_with_decreased_latency,
759
+ num_equal=num_runs_with_equal_latency,
760
+ ),
761
+ prompt_token_count=MetricCounts(
762
+ num_increases=num_runs_with_increased_prompt_token_count,
763
+ num_decreases=num_runs_with_decreased_prompt_token_count,
764
+ num_equal=num_runs_with_equal_prompt_token_count,
765
+ ),
766
+ completion_token_count=MetricCounts(
767
+ num_increases=num_runs_with_increased_completion_token_count,
768
+ num_decreases=num_runs_with_decreased_completion_token_count,
769
+ num_equal=num_runs_with_equal_completion_token_count,
770
+ ),
771
+ total_token_count=MetricCounts(
772
+ num_increases=num_runs_with_increased_total_token_count,
773
+ num_decreases=num_runs_with_decreased_total_token_count,
774
+ num_equal=num_runs_with_equal_total_token_count,
775
+ ),
776
+ total_cost=MetricCounts(
777
+ num_increases=num_runs_with_increased_total_cost,
778
+ num_decreases=num_runs_with_decreased_total_cost,
779
+ num_equal=num_runs_with_equal_total_cost,
780
+ ),
781
+ )
782
+ )
783
+ return counts
784
+
785
+ @strawberry.field
786
+ async def compare_experiment_run_annotation_metric_counts(
787
+ self,
788
+ info: Info[Context, None],
789
+ base_experiment_id: GlobalID,
790
+ compare_experiment_ids: list[GlobalID],
791
+ ) -> list[CompareExperimentRunAnnotationMetricCounts]:
792
+ if base_experiment_id in compare_experiment_ids:
793
+ raise BadRequest("Compare experiment IDs cannot contain the base experiment ID")
794
+ if not compare_experiment_ids:
795
+ raise BadRequest("At least one compare experiment ID must be provided")
796
+ if len(set(compare_experiment_ids)) < len(compare_experiment_ids):
797
+ raise BadRequest("Compare experiment IDs must be unique")
798
+
799
+ try:
800
+ base_experiment_rowid = from_global_id_with_expected_type(
801
+ base_experiment_id, models.Experiment.__name__
802
+ )
803
+ except ValueError:
804
+ raise BadRequest(f"Invalid base experiment ID: {base_experiment_id}")
805
+
806
+ compare_experiment_rowids = []
807
+ for compare_experiment_id in compare_experiment_ids:
808
+ try:
809
+ compare_experiment_rowids.append(
810
+ from_global_id_with_expected_type(
811
+ compare_experiment_id, models.Experiment.__name__
812
+ )
813
+ )
814
+ except ValueError:
815
+ raise BadRequest(f"Invalid compare experiment ID: {compare_experiment_id}")
816
+
817
+ base_experiment_runs = (
818
+ select(models.ExperimentRun)
819
+ .where(
820
+ models.ExperimentRun.experiment_id == base_experiment_rowid,
821
+ )
822
+ .subquery()
823
+ .alias("base_experiment_runs")
824
+ )
825
+ base_experiment_run_annotations = aliased(
826
+ models.ExperimentRunAnnotation, name="base_experiment_run_annotations"
827
+ )
828
+ query = (
829
+ select(base_experiment_run_annotations.name)
830
+ .select_from(base_experiment_runs)
831
+ .join(
832
+ base_experiment_run_annotations,
833
+ onclause=base_experiment_runs.c.id
834
+ == base_experiment_run_annotations.experiment_run_id,
835
+ isouter=True,
836
+ )
837
+ .group_by(base_experiment_run_annotations.name)
838
+ .order_by(base_experiment_run_annotations.name)
839
+ )
840
+ for compare_experiment_index, compare_experiment_rowid in enumerate(
841
+ compare_experiment_rowids
842
+ ):
843
+ compare_experiment_runs = (
844
+ select(models.ExperimentRun)
845
+ .where(
846
+ models.ExperimentRun.experiment_id == compare_experiment_rowid,
847
+ )
848
+ .subquery()
849
+ .alias(f"comp_exp_{compare_experiment_index}_runs")
850
+ )
851
+ compare_experiment_run_annotations = aliased(
852
+ models.ExperimentRunAnnotation,
853
+ name=f"comp_exp_{compare_experiment_index}_run_annotations",
854
+ )
855
+ query = (
856
+ query.add_columns(
857
+ _count_rows(
858
+ base_experiment_run_annotations.score
859
+ < compare_experiment_run_annotations.score,
860
+ ).label(f"comp_exp_{compare_experiment_index}_num_runs_increased_score"),
861
+ _count_rows(
862
+ base_experiment_run_annotations.score
863
+ > compare_experiment_run_annotations.score,
864
+ ).label(f"comp_exp_{compare_experiment_index}_num_runs_decreased_score"),
865
+ _count_rows(
866
+ base_experiment_run_annotations.score
867
+ == compare_experiment_run_annotations.score,
868
+ ).label(f"comp_exp_{compare_experiment_index}_num_runs_equal_score"),
869
+ )
870
+ .join(
871
+ compare_experiment_runs,
872
+ onclause=base_experiment_runs.c.dataset_example_id
873
+ == compare_experiment_runs.c.dataset_example_id,
874
+ isouter=True,
875
+ )
876
+ .join(
877
+ compare_experiment_run_annotations,
878
+ onclause=compare_experiment_runs.c.id
879
+ == compare_experiment_run_annotations.experiment_run_id,
880
+ isouter=True,
881
+ )
882
+ .where(
883
+ base_experiment_run_annotations.name == compare_experiment_run_annotations.name
884
+ )
885
+ )
886
+ async with info.context.db() as session:
887
+ result = (await session.execute(query)).all()
888
+ assert result is not None
889
+ num_columns_per_compare_experiment = (len(query.columns) - 1) // len(compare_experiment_ids)
890
+ metric_counts = []
891
+ for record in result:
892
+ annotation_name, *counts = record
893
+ for compare_experiment_index, compare_experiment_id in enumerate(
894
+ compare_experiment_ids
895
+ ):
896
+ start_index = compare_experiment_index * num_columns_per_compare_experiment
897
+ end_index = start_index + num_columns_per_compare_experiment
898
+ (
899
+ num_runs_with_increased_score,
900
+ num_runs_with_decreased_score,
901
+ num_runs_with_equal_score,
902
+ ) = counts[start_index:end_index]
903
+ metric_counts.append(
904
+ CompareExperimentRunAnnotationMetricCounts(
905
+ annotation_name=annotation_name,
906
+ compare_experiment_id=compare_experiment_id,
907
+ num_increases=num_runs_with_increased_score,
908
+ num_decreases=num_runs_with_decreased_score,
909
+ num_equal=num_runs_with_equal_score,
910
+ )
911
+ )
912
+ return metric_counts
913
+
484
914
  @strawberry.field
485
915
  async def validate_experiment_run_filter_condition(
486
916
  self,
@@ -1106,3 +1536,20 @@ def _longest_matching_prefix(s: str, prefixes: Iterable[str]) -> str:
1106
1536
  if s.startswith(prefix) and len(prefix) > len(longest):
1107
1537
  longest = prefix
1108
1538
  return longest
1539
+
1540
+
1541
+ def _count_rows(
1542
+ condition: ColumnElement[Any],
1543
+ ) -> ColumnElement[Any]:
1544
+ """
1545
+ Returns an expression that counts the number of rows satisfying the condition.
1546
+ """
1547
+ return func.coalesce(
1548
+ func.sum(
1549
+ case(
1550
+ (condition, 1),
1551
+ else_=0,
1552
+ )
1553
+ ),
1554
+ 0,
1555
+ )
@@ -8,9 +8,10 @@ from secrets import token_urlsafe
8
8
  from typing import Annotated, Any, Literal, Optional, Union
9
9
 
10
10
  import pandas as pd
11
+ import sqlalchemy as sa
11
12
  from fastapi import APIRouter, Depends, Header, HTTPException, Path, Query
12
13
  from pydantic import BaseModel, Field
13
- from sqlalchemy import select
14
+ from sqlalchemy import exists, select, update
14
15
  from starlette.requests import Request
15
16
  from starlette.responses import Response, StreamingResponse
16
17
  from starlette.status import (
@@ -24,13 +25,14 @@ from strawberry.relay import GlobalID
24
25
  from phoenix.config import DEFAULT_PROJECT_NAME
25
26
  from phoenix.datetime_utils import normalize_datetime
26
27
  from phoenix.db import models
27
- from phoenix.db.helpers import SupportedSQLDialect
28
+ from phoenix.db.helpers import SupportedSQLDialect, get_ancestor_span_rowids
28
29
  from phoenix.db.insertion.helpers import as_kv, insert_on_conflict
29
30
  from phoenix.db.insertion.types import Precursors
30
31
  from phoenix.server.api.routers.utils import df_to_bytes
32
+ from phoenix.server.api.types.node import from_global_id_with_expected_type
31
33
  from phoenix.server.authorization import is_not_locked
32
34
  from phoenix.server.bearer_auth import PhoenixUser
33
- from phoenix.server.dml_event import SpanAnnotationInsertEvent
35
+ from phoenix.server.dml_event import SpanAnnotationInsertEvent, SpanDeleteEvent
34
36
  from phoenix.trace.attributes import flatten
35
37
  from phoenix.trace.dsl import SpanQuery as SpanQuery_
36
38
  from phoenix.trace.schemas import (
@@ -1119,3 +1121,126 @@ async def create_spans(
1119
1121
  total_received=total_received,
1120
1122
  total_queued=len(spans_to_queue),
1121
1123
  )
1124
+
1125
+
1126
+ @router.delete(
1127
+ "/spans/{span_identifier}",
1128
+ dependencies=[Depends(is_not_locked)],
1129
+ operation_id="deleteSpan",
1130
+ summary="Delete a span by span_identifier",
1131
+ description=(
1132
+ """
1133
+ Delete a single span by identifier.
1134
+
1135
+ **Important**: This operation deletes ONLY the specified span itself and does NOT
1136
+ delete its descendants/children. All child spans will remain in the trace and
1137
+ become orphaned (their parent_id will point to a non-existent span).
1138
+
1139
+ Behavior:
1140
+ - Deletes only the target span (preserves all descendant spans)
1141
+ - If this was the last span in the trace, the trace record is also deleted
1142
+ - If the deleted span had a parent, its cumulative metrics (error count, token counts)
1143
+ are subtracted from all ancestor spans in the chain
1144
+
1145
+ **Note**: This operation is irreversible and may create orphaned spans.
1146
+ """
1147
+ ),
1148
+ responses=add_errors_to_responses([HTTP_404_NOT_FOUND]),
1149
+ status_code=204, # No Content for successful deletion
1150
+ )
1151
+ async def delete_span(
1152
+ request: Request,
1153
+ span_identifier: str = Path(
1154
+ description="The span identifier: either a relay GlobalID or OpenTelemetry span_id"
1155
+ ),
1156
+ ) -> None:
1157
+ """
1158
+ Delete a single span by identifier.
1159
+
1160
+ This operation deletes ONLY the specified span and preserves all its descendants,
1161
+ which may become orphaned (parent_id pointing to non-existent span).
1162
+
1163
+ Steps:
1164
+ 1. Find the target span to delete (supports both GlobalID and OpenTelemetry span_id)
1165
+ 2. Delete only the target span (all descendants remain untouched)
1166
+ 3. If trace becomes empty, delete the trace record
1167
+ 4. If deleted span had a parent, subtract its cumulative metrics from ancestor chain
1168
+ 5. Return 204 No Content on success
1169
+
1170
+ Args:
1171
+ request: FastAPI request object
1172
+ span_identifier: Either relay GlobalID or OpenTelemetry span_id
1173
+
1174
+ Raises:
1175
+ HTTPException(404): If span not found
1176
+
1177
+ Returns:
1178
+ None (204 No Content status)
1179
+ """
1180
+ async with request.app.state.db() as session:
1181
+ # Determine the predicate for deletion based on identifier type
1182
+ try:
1183
+ span_rowid = from_global_id_with_expected_type(
1184
+ GlobalID.from_id(span_identifier),
1185
+ "Span",
1186
+ )
1187
+ predicate = models.Span.id == span_rowid
1188
+ error_detail = f"Span with relay ID '{span_identifier}' not found"
1189
+ except Exception:
1190
+ predicate = models.Span.span_id == span_identifier
1191
+ error_detail = f"Span with span_id '{span_identifier}' not found"
1192
+
1193
+ # Delete the span and return its data in one operation
1194
+ target_span = await session.scalar(
1195
+ sa.delete(models.Span).where(predicate).returning(models.Span)
1196
+ )
1197
+
1198
+ if target_span is None:
1199
+ raise HTTPException(
1200
+ status_code=HTTP_404_NOT_FOUND,
1201
+ detail=error_detail,
1202
+ )
1203
+
1204
+ # Store values needed for later operations
1205
+ trace_rowid = target_span.trace_rowid
1206
+ parent_id = target_span.parent_id
1207
+ cumulative_error_count = target_span.cumulative_error_count
1208
+ cumulative_llm_token_count_prompt = target_span.cumulative_llm_token_count_prompt
1209
+ cumulative_llm_token_count_completion = target_span.cumulative_llm_token_count_completion
1210
+
1211
+ # Step 2: Check if trace is empty—if so, delete the trace record
1212
+ trace_is_empty = await session.scalar(
1213
+ select(~exists().where(models.Span.trace_rowid == trace_rowid))
1214
+ )
1215
+
1216
+ if trace_is_empty:
1217
+ # Trace is empty, delete the trace record
1218
+ await session.execute(sa.delete(models.Trace).where(models.Trace.id == trace_rowid))
1219
+
1220
+ # Step 3: Propagate negative cumulative values up ancestor chain if parent_id is not null
1221
+ if not trace_is_empty and parent_id is not None:
1222
+ # Use the helper function to get all ancestor span IDs
1223
+ ancestor_ids_query = get_ancestor_span_rowids(parent_id)
1224
+
1225
+ # Propagate negative cumulative values to ancestors
1226
+ await session.execute(
1227
+ update(models.Span)
1228
+ .where(models.Span.id.in_(ancestor_ids_query))
1229
+ .values(
1230
+ cumulative_error_count=(
1231
+ models.Span.cumulative_error_count - cumulative_error_count
1232
+ ),
1233
+ cumulative_llm_token_count_prompt=(
1234
+ models.Span.cumulative_llm_token_count_prompt
1235
+ - cumulative_llm_token_count_prompt
1236
+ ),
1237
+ cumulative_llm_token_count_completion=(
1238
+ models.Span.cumulative_llm_token_count_completion
1239
+ - cumulative_llm_token_count_completion
1240
+ ),
1241
+ )
1242
+ )
1243
+ # Trigger cache invalidation event
1244
+ request.state.event_queue.put(SpanDeleteEvent((trace_rowid,)))
1245
+
1246
+ return None