arize-phoenix 11.23.1__py3-none-any.whl → 12.28.1__py3-none-any.whl

This diff shows the differences between the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (221)
  1. {arize_phoenix-11.23.1.dist-info → arize_phoenix-12.28.1.dist-info}/METADATA +61 -36
  2. {arize_phoenix-11.23.1.dist-info → arize_phoenix-12.28.1.dist-info}/RECORD +212 -162
  3. {arize_phoenix-11.23.1.dist-info → arize_phoenix-12.28.1.dist-info}/WHEEL +1 -1
  4. {arize_phoenix-11.23.1.dist-info → arize_phoenix-12.28.1.dist-info}/licenses/IP_NOTICE +1 -1
  5. phoenix/__generated__/__init__.py +0 -0
  6. phoenix/__generated__/classification_evaluator_configs/__init__.py +20 -0
  7. phoenix/__generated__/classification_evaluator_configs/_document_relevance_classification_evaluator_config.py +17 -0
  8. phoenix/__generated__/classification_evaluator_configs/_hallucination_classification_evaluator_config.py +17 -0
  9. phoenix/__generated__/classification_evaluator_configs/_models.py +18 -0
  10. phoenix/__generated__/classification_evaluator_configs/_tool_selection_classification_evaluator_config.py +17 -0
  11. phoenix/__init__.py +2 -1
  12. phoenix/auth.py +27 -2
  13. phoenix/config.py +1594 -81
  14. phoenix/db/README.md +546 -28
  15. phoenix/db/bulk_inserter.py +119 -116
  16. phoenix/db/engines.py +140 -33
  17. phoenix/db/facilitator.py +22 -1
  18. phoenix/db/helpers.py +818 -65
  19. phoenix/db/iam_auth.py +64 -0
  20. phoenix/db/insertion/dataset.py +133 -1
  21. phoenix/db/insertion/document_annotation.py +9 -6
  22. phoenix/db/insertion/evaluation.py +2 -3
  23. phoenix/db/insertion/helpers.py +2 -2
  24. phoenix/db/insertion/session_annotation.py +176 -0
  25. phoenix/db/insertion/span_annotation.py +3 -4
  26. phoenix/db/insertion/trace_annotation.py +3 -4
  27. phoenix/db/insertion/types.py +41 -18
  28. phoenix/db/migrations/versions/01a8342c9cdf_add_user_id_on_datasets.py +40 -0
  29. phoenix/db/migrations/versions/0df286449799_add_session_annotations_table.py +105 -0
  30. phoenix/db/migrations/versions/272b66ff50f8_drop_single_indices.py +119 -0
  31. phoenix/db/migrations/versions/58228d933c91_dataset_labels.py +67 -0
  32. phoenix/db/migrations/versions/699f655af132_experiment_tags.py +57 -0
  33. phoenix/db/migrations/versions/735d3d93c33e_add_composite_indices.py +41 -0
  34. phoenix/db/migrations/versions/ab513d89518b_add_user_id_on_dataset_versions.py +40 -0
  35. phoenix/db/migrations/versions/d0690a79ea51_users_on_experiments.py +40 -0
  36. phoenix/db/migrations/versions/deb2c81c0bb2_dataset_splits.py +139 -0
  37. phoenix/db/migrations/versions/e76cbd66ffc3_add_experiments_dataset_examples.py +87 -0
  38. phoenix/db/models.py +364 -56
  39. phoenix/db/pg_config.py +10 -0
  40. phoenix/db/types/trace_retention.py +7 -6
  41. phoenix/experiments/functions.py +69 -19
  42. phoenix/inferences/inferences.py +1 -2
  43. phoenix/server/api/auth.py +9 -0
  44. phoenix/server/api/auth_messages.py +46 -0
  45. phoenix/server/api/context.py +60 -0
  46. phoenix/server/api/dataloaders/__init__.py +36 -0
  47. phoenix/server/api/dataloaders/annotation_summaries.py +60 -8
  48. phoenix/server/api/dataloaders/average_experiment_repeated_run_group_latency.py +50 -0
  49. phoenix/server/api/dataloaders/average_experiment_run_latency.py +17 -24
  50. phoenix/server/api/dataloaders/cache/two_tier_cache.py +1 -2
  51. phoenix/server/api/dataloaders/dataset_dataset_splits.py +52 -0
  52. phoenix/server/api/dataloaders/dataset_example_revisions.py +0 -1
  53. phoenix/server/api/dataloaders/dataset_example_splits.py +40 -0
  54. phoenix/server/api/dataloaders/dataset_examples_and_versions_by_experiment_run.py +47 -0
  55. phoenix/server/api/dataloaders/dataset_labels.py +36 -0
  56. phoenix/server/api/dataloaders/document_evaluation_summaries.py +2 -2
  57. phoenix/server/api/dataloaders/document_evaluations.py +6 -9
  58. phoenix/server/api/dataloaders/experiment_annotation_summaries.py +88 -34
  59. phoenix/server/api/dataloaders/experiment_dataset_splits.py +43 -0
  60. phoenix/server/api/dataloaders/experiment_error_rates.py +21 -28
  61. phoenix/server/api/dataloaders/experiment_repeated_run_group_annotation_summaries.py +77 -0
  62. phoenix/server/api/dataloaders/experiment_repeated_run_groups.py +57 -0
  63. phoenix/server/api/dataloaders/experiment_runs_by_experiment_and_example.py +44 -0
  64. phoenix/server/api/dataloaders/latency_ms_quantile.py +40 -8
  65. phoenix/server/api/dataloaders/record_counts.py +37 -10
  66. phoenix/server/api/dataloaders/session_annotations_by_session.py +29 -0
  67. phoenix/server/api/dataloaders/span_cost_summary_by_experiment_repeated_run_group.py +64 -0
  68. phoenix/server/api/dataloaders/span_cost_summary_by_project.py +28 -14
  69. phoenix/server/api/dataloaders/span_costs.py +3 -9
  70. phoenix/server/api/dataloaders/table_fields.py +2 -2
  71. phoenix/server/api/dataloaders/token_prices_by_model.py +30 -0
  72. phoenix/server/api/dataloaders/trace_annotations_by_trace.py +27 -0
  73. phoenix/server/api/exceptions.py +5 -1
  74. phoenix/server/api/helpers/playground_clients.py +263 -83
  75. phoenix/server/api/helpers/playground_spans.py +2 -1
  76. phoenix/server/api/helpers/playground_users.py +26 -0
  77. phoenix/server/api/helpers/prompts/conversions/google.py +103 -0
  78. phoenix/server/api/helpers/prompts/models.py +61 -19
  79. phoenix/server/api/input_types/{SpanAnnotationFilter.py → AnnotationFilter.py} +22 -14
  80. phoenix/server/api/input_types/ChatCompletionInput.py +3 -0
  81. phoenix/server/api/input_types/CreateProjectSessionAnnotationInput.py +37 -0
  82. phoenix/server/api/input_types/DatasetFilter.py +5 -2
  83. phoenix/server/api/input_types/ExperimentRunSort.py +237 -0
  84. phoenix/server/api/input_types/GenerativeModelInput.py +3 -0
  85. phoenix/server/api/input_types/ProjectSessionSort.py +158 -1
  86. phoenix/server/api/input_types/PromptVersionInput.py +47 -1
  87. phoenix/server/api/input_types/SpanSort.py +3 -2
  88. phoenix/server/api/input_types/UpdateAnnotationInput.py +34 -0
  89. phoenix/server/api/input_types/UserRoleInput.py +1 -0
  90. phoenix/server/api/mutations/__init__.py +8 -0
  91. phoenix/server/api/mutations/annotation_config_mutations.py +8 -8
  92. phoenix/server/api/mutations/api_key_mutations.py +15 -20
  93. phoenix/server/api/mutations/chat_mutations.py +106 -37
  94. phoenix/server/api/mutations/dataset_label_mutations.py +243 -0
  95. phoenix/server/api/mutations/dataset_mutations.py +21 -16
  96. phoenix/server/api/mutations/dataset_split_mutations.py +351 -0
  97. phoenix/server/api/mutations/experiment_mutations.py +2 -2
  98. phoenix/server/api/mutations/export_events_mutations.py +3 -3
  99. phoenix/server/api/mutations/model_mutations.py +11 -9
  100. phoenix/server/api/mutations/project_mutations.py +4 -4
  101. phoenix/server/api/mutations/project_session_annotations_mutations.py +158 -0
  102. phoenix/server/api/mutations/project_trace_retention_policy_mutations.py +8 -4
  103. phoenix/server/api/mutations/prompt_label_mutations.py +74 -65
  104. phoenix/server/api/mutations/prompt_mutations.py +65 -129
  105. phoenix/server/api/mutations/prompt_version_tag_mutations.py +11 -8
  106. phoenix/server/api/mutations/span_annotations_mutations.py +15 -10
  107. phoenix/server/api/mutations/trace_annotations_mutations.py +13 -8
  108. phoenix/server/api/mutations/trace_mutations.py +3 -3
  109. phoenix/server/api/mutations/user_mutations.py +55 -26
  110. phoenix/server/api/queries.py +501 -617
  111. phoenix/server/api/routers/__init__.py +2 -2
  112. phoenix/server/api/routers/auth.py +141 -87
  113. phoenix/server/api/routers/ldap.py +229 -0
  114. phoenix/server/api/routers/oauth2.py +349 -101
  115. phoenix/server/api/routers/v1/__init__.py +22 -4
  116. phoenix/server/api/routers/v1/annotation_configs.py +19 -30
  117. phoenix/server/api/routers/v1/annotations.py +455 -13
  118. phoenix/server/api/routers/v1/datasets.py +355 -68
  119. phoenix/server/api/routers/v1/documents.py +142 -0
  120. phoenix/server/api/routers/v1/evaluations.py +20 -28
  121. phoenix/server/api/routers/v1/experiment_evaluations.py +16 -6
  122. phoenix/server/api/routers/v1/experiment_runs.py +335 -59
  123. phoenix/server/api/routers/v1/experiments.py +475 -47
  124. phoenix/server/api/routers/v1/projects.py +16 -50
  125. phoenix/server/api/routers/v1/prompts.py +50 -39
  126. phoenix/server/api/routers/v1/sessions.py +108 -0
  127. phoenix/server/api/routers/v1/spans.py +156 -96
  128. phoenix/server/api/routers/v1/traces.py +51 -77
  129. phoenix/server/api/routers/v1/users.py +64 -24
  130. phoenix/server/api/routers/v1/utils.py +3 -7
  131. phoenix/server/api/subscriptions.py +257 -93
  132. phoenix/server/api/types/Annotation.py +90 -23
  133. phoenix/server/api/types/ApiKey.py +13 -17
  134. phoenix/server/api/types/AuthMethod.py +1 -0
  135. phoenix/server/api/types/ChatCompletionSubscriptionPayload.py +1 -0
  136. phoenix/server/api/types/Dataset.py +199 -72
  137. phoenix/server/api/types/DatasetExample.py +88 -18
  138. phoenix/server/api/types/DatasetExperimentAnnotationSummary.py +10 -0
  139. phoenix/server/api/types/DatasetLabel.py +57 -0
  140. phoenix/server/api/types/DatasetSplit.py +98 -0
  141. phoenix/server/api/types/DatasetVersion.py +49 -4
  142. phoenix/server/api/types/DocumentAnnotation.py +212 -0
  143. phoenix/server/api/types/Experiment.py +215 -68
  144. phoenix/server/api/types/ExperimentComparison.py +3 -9
  145. phoenix/server/api/types/ExperimentRepeatedRunGroup.py +155 -0
  146. phoenix/server/api/types/ExperimentRepeatedRunGroupAnnotationSummary.py +9 -0
  147. phoenix/server/api/types/ExperimentRun.py +120 -70
  148. phoenix/server/api/types/ExperimentRunAnnotation.py +158 -39
  149. phoenix/server/api/types/GenerativeModel.py +95 -42
  150. phoenix/server/api/types/GenerativeProvider.py +1 -1
  151. phoenix/server/api/types/ModelInterface.py +7 -2
  152. phoenix/server/api/types/PlaygroundModel.py +12 -2
  153. phoenix/server/api/types/Project.py +218 -185
  154. phoenix/server/api/types/ProjectSession.py +146 -29
  155. phoenix/server/api/types/ProjectSessionAnnotation.py +187 -0
  156. phoenix/server/api/types/ProjectTraceRetentionPolicy.py +1 -1
  157. phoenix/server/api/types/Prompt.py +119 -39
  158. phoenix/server/api/types/PromptLabel.py +42 -25
  159. phoenix/server/api/types/PromptVersion.py +11 -8
  160. phoenix/server/api/types/PromptVersionTag.py +65 -25
  161. phoenix/server/api/types/Span.py +130 -123
  162. phoenix/server/api/types/SpanAnnotation.py +189 -42
  163. phoenix/server/api/types/SystemApiKey.py +65 -1
  164. phoenix/server/api/types/Trace.py +184 -53
  165. phoenix/server/api/types/TraceAnnotation.py +149 -50
  166. phoenix/server/api/types/User.py +128 -33
  167. phoenix/server/api/types/UserApiKey.py +73 -26
  168. phoenix/server/api/types/node.py +10 -0
  169. phoenix/server/api/types/pagination.py +11 -2
  170. phoenix/server/app.py +154 -36
  171. phoenix/server/authorization.py +5 -4
  172. phoenix/server/bearer_auth.py +13 -5
  173. phoenix/server/cost_tracking/cost_model_lookup.py +42 -14
  174. phoenix/server/cost_tracking/model_cost_manifest.json +1085 -194
  175. phoenix/server/daemons/generative_model_store.py +61 -9
  176. phoenix/server/daemons/span_cost_calculator.py +10 -8
  177. phoenix/server/dml_event.py +13 -0
  178. phoenix/server/email/sender.py +29 -2
  179. phoenix/server/grpc_server.py +9 -9
  180. phoenix/server/jwt_store.py +8 -6
  181. phoenix/server/ldap.py +1449 -0
  182. phoenix/server/main.py +9 -3
  183. phoenix/server/oauth2.py +330 -12
  184. phoenix/server/prometheus.py +43 -6
  185. phoenix/server/rate_limiters.py +4 -9
  186. phoenix/server/retention.py +33 -20
  187. phoenix/server/session_filters.py +49 -0
  188. phoenix/server/static/.vite/manifest.json +51 -53
  189. phoenix/server/static/assets/components-BreFUQQa.js +6702 -0
  190. phoenix/server/static/assets/{index-BPCwGQr8.js → index-CTQoemZv.js} +42 -35
  191. phoenix/server/static/assets/pages-DBE5iYM3.js +9524 -0
  192. phoenix/server/static/assets/vendor-BGzfc4EU.css +1 -0
  193. phoenix/server/static/assets/vendor-DCE4v-Ot.js +920 -0
  194. phoenix/server/static/assets/vendor-codemirror-D5f205eT.js +25 -0
  195. phoenix/server/static/assets/{vendor-recharts-Bw30oz1A.js → vendor-recharts-V9cwpXsm.js} +7 -7
  196. phoenix/server/static/assets/{vendor-shiki-DZajAPeq.js → vendor-shiki-Do--csgv.js} +1 -1
  197. phoenix/server/static/assets/vendor-three-CmB8bl_y.js +3840 -0
  198. phoenix/server/templates/index.html +7 -1
  199. phoenix/server/thread_server.py +1 -2
  200. phoenix/server/utils.py +74 -0
  201. phoenix/session/client.py +55 -1
  202. phoenix/session/data_extractor.py +5 -0
  203. phoenix/session/evaluation.py +8 -4
  204. phoenix/session/session.py +44 -8
  205. phoenix/settings.py +2 -0
  206. phoenix/trace/attributes.py +80 -13
  207. phoenix/trace/dsl/query.py +2 -0
  208. phoenix/trace/projects.py +5 -0
  209. phoenix/utilities/template_formatters.py +1 -1
  210. phoenix/version.py +1 -1
  211. phoenix/server/api/types/Evaluation.py +0 -39
  212. phoenix/server/static/assets/components-D0DWAf0l.js +0 -5650
  213. phoenix/server/static/assets/pages-Creyamao.js +0 -8612
  214. phoenix/server/static/assets/vendor-CU36oj8y.js +0 -905
  215. phoenix/server/static/assets/vendor-CqDb5u4o.css +0 -1
  216. phoenix/server/static/assets/vendor-arizeai-Ctgw0e1G.js +0 -168
  217. phoenix/server/static/assets/vendor-codemirror-Cojjzqb9.js +0 -25
  218. phoenix/server/static/assets/vendor-three-BLWp5bic.js +0 -2998
  219. phoenix/utilities/deprecation.py +0 -31
  220. {arize_phoenix-11.23.1.dist-info → arize_phoenix-12.28.1.dist-info}/entry_points.txt +0 -0
  221. {arize_phoenix-11.23.1.dist-info → arize_phoenix-12.28.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,33 +1,34 @@
1
+ import json
1
2
  from datetime import datetime
2
3
  from typing import Any, Optional
3
4
 
4
- from fastapi import APIRouter, Depends, HTTPException
5
+ from fastapi import APIRouter, Depends, HTTPException, Query
5
6
  from pydantic import Field
6
7
  from sqlalchemy import select
7
- from sqlalchemy.exc import IntegrityError as PostgreSQLIntegrityError
8
- from sqlean.dbapi2 import IntegrityError as SQLiteIntegrityError # type: ignore[import-untyped]
9
8
  from starlette.requests import Request
10
- from starlette.status import HTTP_404_NOT_FOUND, HTTP_409_CONFLICT
11
9
  from strawberry.relay import GlobalID
12
10
 
13
11
  from phoenix.db import models
12
+ from phoenix.db.helpers import get_runs_with_incomplete_evaluations_query
13
+ from phoenix.db.insertion.helpers import OnConflict, insert_on_conflict
14
14
  from phoenix.db.models import ExperimentRunOutput
15
+ from phoenix.server.api.routers.v1.datasets import DatasetExample
15
16
  from phoenix.server.api.types.node import from_global_id_with_expected_type
16
17
  from phoenix.server.authorization import is_not_locked
17
18
  from phoenix.server.dml_event import ExperimentRunInsertEvent
18
19
 
19
20
  from .models import V1RoutesBaseModel
20
- from .utils import ResponseBody, add_errors_to_responses
21
+ from .utils import PaginatedResponseBody, ResponseBody, add_errors_to_responses
21
22
 
22
23
  router = APIRouter(tags=["experiments"], include_in_schema=True)
23
24
 
24
25
 
25
- class ExperimentRun(V1RoutesBaseModel):
26
+ class ExperimentRunData(V1RoutesBaseModel):
26
27
  dataset_example_id: str = Field(
27
28
  description="The ID of the dataset example used in the experiment run"
28
29
  )
29
30
  output: Any = Field(description="The output of the experiment task")
30
- repetition_number: int = Field(description="The repetition number of the experiment run")
31
+ repetition_number: int = Field(description="The repetition number of the experiment run", gt=0)
31
32
  start_time: datetime = Field(description="The start time of the experiment run")
32
33
  end_time: datetime = Field(description="The end time of the experiment run")
33
34
  trace_id: Optional[str] = Field(
@@ -39,7 +40,7 @@ class ExperimentRun(V1RoutesBaseModel):
39
40
  )
40
41
 
41
42
 
42
- class CreateExperimentRunRequestBody(ExperimentRun):
43
+ class CreateExperimentRunRequestBody(ExperimentRunData):
43
44
  pass
44
45
 
45
46
 
@@ -60,12 +61,14 @@ class CreateExperimentRunResponseBody(ResponseBody[CreateExperimentRunResponseBo
60
61
  responses=add_errors_to_responses(
61
62
  [
62
63
  {
63
- "status_code": HTTP_404_NOT_FOUND,
64
+ "status_code": 404,
64
65
  "description": "Experiment or dataset example not found",
65
66
  },
66
67
  {
67
- "status_code": HTTP_409_CONFLICT,
68
- "description": "This experiment run has already been submitted",
68
+ "status_code": 409,
69
+ "description": (
70
+ "Experiment run already exists with a successful result and cannot be updated"
71
+ ),
69
72
  },
70
73
  ]
71
74
  ),
@@ -79,7 +82,7 @@ async def create_experiment_run(
79
82
  except ValueError:
80
83
  raise HTTPException(
81
84
  detail=f"Experiment with ID {experiment_gid} does not exist",
82
- status_code=HTTP_404_NOT_FOUND,
85
+ status_code=404,
83
86
  )
84
87
 
85
88
  example_gid = GlobalID.from_id(request_body.dataset_example_id)
@@ -88,7 +91,7 @@ async def create_experiment_run(
88
91
  except ValueError:
89
92
  raise HTTPException(
90
93
  detail=f"DatasetExample with ID {example_gid} does not exist",
91
- status_code=HTTP_404_NOT_FOUND,
94
+ status_code=404,
92
95
  )
93
96
 
94
97
  trace_id = request_body.trace_id
@@ -99,37 +102,72 @@ async def create_experiment_run(
99
102
  error = request_body.error
100
103
 
101
104
  async with request.app.state.db() as session:
102
- exp_run = models.ExperimentRun(
103
- experiment_id=experiment_rowid,
104
- dataset_example_id=dataset_example_id,
105
- trace_id=trace_id,
106
- output=ExperimentRunOutput(task_output=task_output),
107
- repetition_number=repetition_number,
108
- start_time=start_time,
109
- end_time=end_time,
110
- error=error,
105
+ # Check if a record already exists
106
+ existing_run = await session.scalar(
107
+ select(models.ExperimentRun)
108
+ .where(models.ExperimentRun.experiment_id == experiment_rowid)
109
+ .where(models.ExperimentRun.dataset_example_id == dataset_example_id)
110
+ .where(models.ExperimentRun.repetition_number == repetition_number)
111
111
  )
112
- try:
113
- session.add(exp_run)
114
- await session.flush()
115
- except (PostgreSQLIntegrityError, SQLiteIntegrityError):
112
+
113
+ if existing_run is not None and existing_run.error is None:
114
+ # Record exists and has no error - reject the update
115
+ run_gid = GlobalID("ExperimentRun", str(existing_run.id))
116
116
  raise HTTPException(
117
- detail="This experiment run has already been submitted",
118
- status_code=HTTP_409_CONFLICT,
117
+ status_code=409,
118
+ detail=(
119
+ f"Experiment run {run_gid} already exists with a successful result "
120
+ "and cannot be updated"
121
+ ),
119
122
  )
120
- request.state.event_queue.put(ExperimentRunInsertEvent((exp_run.id,)))
121
- run_gid = GlobalID("ExperimentRun", str(exp_run.id))
123
+ # Either no record exists, or existing record has an error - proceed with upsert
124
+ stmt = insert_on_conflict(
125
+ {
126
+ "experiment_id": experiment_rowid,
127
+ "dataset_example_id": dataset_example_id,
128
+ "trace_id": trace_id,
129
+ "output": ExperimentRunOutput(task_output=task_output),
130
+ "repetition_number": repetition_number,
131
+ "start_time": start_time,
132
+ "end_time": end_time,
133
+ "error": error,
134
+ },
135
+ table=models.ExperimentRun,
136
+ dialect=request.app.state.db.dialect,
137
+ unique_by=["experiment_id", "dataset_example_id", "repetition_number"],
138
+ on_conflict=OnConflict.DO_UPDATE,
139
+ ).returning(models.ExperimentRun.id)
140
+ id_ = await session.scalar(stmt)
141
+
142
+ request.state.event_queue.put(ExperimentRunInsertEvent((id_,)))
143
+ run_gid = GlobalID("ExperimentRun", str(id_))
122
144
  return CreateExperimentRunResponseBody(
123
145
  data=CreateExperimentRunResponseBodyData(id=str(run_gid))
124
146
  )
125
147
 
126
148
 
127
- class ExperimentRunResponse(ExperimentRun):
149
+ class ExperimentRun(ExperimentRunData):
128
150
  id: str = Field(description="The ID of the experiment run")
129
151
  experiment_id: str = Field(description="The ID of the experiment")
130
152
 
131
153
 
132
- class ListExperimentRunsResponseBody(ResponseBody[list[ExperimentRunResponse]]):
154
+ class ListExperimentRunsResponseBody(PaginatedResponseBody[ExperimentRun]):
155
+ pass
156
+
157
+
158
+ class IncompleteExperimentEvaluation(V1RoutesBaseModel):
159
+ """
160
+ Information about an experiment run with incomplete evaluations
161
+ """
162
+
163
+ experiment_run: ExperimentRun = Field(description="The experiment run")
164
+ dataset_example: DatasetExample = Field(description="The dataset example")
165
+ evaluation_names: list[str] = Field(
166
+ description="List of evaluation names that are incomplete (either missing or failed)"
167
+ )
168
+
169
+
170
+ class GetIncompleteEvaluationsResponseBody(PaginatedResponseBody[IncompleteExperimentEvaluation]):
133
171
  pass
134
172
 
135
173
 
@@ -137,47 +175,285 @@ class ListExperimentRunsResponseBody(ResponseBody[list[ExperimentRunResponse]]):
137
175
  "/experiments/{experiment_id}/runs",
138
176
  operation_id="listExperimentRuns",
139
177
  summary="List runs for an experiment",
178
+ description="Retrieve a paginated list of runs for an experiment",
140
179
  response_description="Experiment runs retrieved successfully",
141
180
  responses=add_errors_to_responses(
142
- [{"status_code": HTTP_404_NOT_FOUND, "description": "Experiment not found"}]
181
+ [
182
+ {"status_code": 404, "description": "Experiment not found"},
183
+ {"status_code": 422, "description": "Invalid cursor format"},
184
+ ]
143
185
  ),
144
186
  )
145
187
  async def list_experiment_runs(
146
- request: Request, experiment_id: str
188
+ request: Request,
189
+ experiment_id: str,
190
+ cursor: Optional[str] = Query(
191
+ default=None,
192
+ description="Cursor for pagination (base64-encoded experiment run ID)",
193
+ ),
194
+ limit: Optional[int] = Query(
195
+ default=None,
196
+ description="The max number of experiment runs to return at a time. "
197
+ "If not specified, returns all results.",
198
+ gt=0,
199
+ ),
147
200
  ) -> ListExperimentRunsResponseBody:
148
- experiment_gid = GlobalID.from_id(experiment_id)
201
+ try:
202
+ experiment_gid = GlobalID.from_id(experiment_id)
203
+ except Exception as e:
204
+ raise HTTPException(
205
+ detail=f"Invalid experiment ID format: {experiment_id}",
206
+ status_code=422,
207
+ ) from e
149
208
  try:
150
209
  experiment_rowid = from_global_id_with_expected_type(experiment_gid, "Experiment")
151
210
  except ValueError:
152
211
  raise HTTPException(
153
212
  detail=f"Experiment with ID {experiment_gid} does not exist",
154
- status_code=HTTP_404_NOT_FOUND,
213
+ status_code=404,
155
214
  )
156
215
 
216
+ stmt = (
217
+ select(models.ExperimentRun)
218
+ .filter_by(experiment_id=experiment_rowid)
219
+ .order_by(models.ExperimentRun.id.desc())
220
+ )
221
+
222
+ if cursor:
223
+ try:
224
+ cursor_id = GlobalID.from_id(cursor).node_id
225
+ stmt = stmt.where(models.ExperimentRun.id <= int(cursor_id))
226
+ except ValueError:
227
+ raise HTTPException(
228
+ detail=f"Invalid cursor format: {cursor}",
229
+ status_code=422,
230
+ )
231
+
232
+ # Apply limit only if specified for pagination
233
+ if limit is not None:
234
+ stmt = stmt.limit(limit + 1)
235
+
157
236
  async with request.app.state.db() as session:
158
- experiment_runs = await session.execute(
159
- select(models.ExperimentRun)
160
- .where(models.ExperimentRun.experiment_id == experiment_rowid)
161
- # order by dataset_example_id to be consistent with `list_dataset_examples`
162
- .order_by(models.ExperimentRun.dataset_example_id.asc())
237
+ experiment_runs = (await session.scalars(stmt)).all()
238
+
239
+ if not experiment_runs:
240
+ return ListExperimentRunsResponseBody(next_cursor=None, data=[])
241
+
242
+ next_cursor = None
243
+ # Only check for next cursor if limit was specified
244
+ if limit is not None and len(experiment_runs) == limit + 1:
245
+ last_run = experiment_runs[-1]
246
+ next_cursor = str(GlobalID("ExperimentRun", str(last_run.id)))
247
+ experiment_runs = experiment_runs[:-1]
248
+
249
+ runs = []
250
+ for exp_run in experiment_runs:
251
+ run_gid = GlobalID("ExperimentRun", str(exp_run.id))
252
+ experiment_gid = GlobalID("Experiment", str(exp_run.experiment_id))
253
+ example_gid = GlobalID("DatasetExample", str(exp_run.dataset_example_id))
254
+ runs.append(
255
+ ExperimentRun(
256
+ start_time=exp_run.start_time,
257
+ end_time=exp_run.end_time,
258
+ experiment_id=str(experiment_gid),
259
+ dataset_example_id=str(example_gid),
260
+ repetition_number=exp_run.repetition_number,
261
+ output=exp_run.output.get("task_output"),
262
+ error=exp_run.error,
263
+ id=str(run_gid),
264
+ trace_id=exp_run.trace_id,
265
+ )
266
+ )
267
+ return ListExperimentRunsResponseBody(data=runs, next_cursor=next_cursor)
268
+
269
+
270
+ @router.get(
271
+ "/experiments/{experiment_id}/incomplete-evaluations",
272
+ operation_id="getIncompleteExperimentEvaluations",
273
+ summary="Get incomplete evaluations for an experiment",
274
+ responses=add_errors_to_responses(
275
+ [
276
+ {"status_code": 400, "description": "No evaluator names provided"},
277
+ {"status_code": 404, "description": "Experiment not found"},
278
+ {"status_code": 422, "description": "Invalid cursor format"},
279
+ ]
280
+ ),
281
+ response_description="Incomplete evaluations retrieved successfully",
282
+ )
283
+ async def get_incomplete_evaluations(
284
+ request: Request,
285
+ experiment_id: str,
286
+ evaluation_name: list[str] = Query(default=[], description="Evaluation names to check"),
287
+ cursor: Optional[str] = Query(default=None, description="Cursor for pagination"),
288
+ limit: int = Query(
289
+ default=50, description="Maximum number of runs with incomplete evaluations to return", gt=0
290
+ ),
291
+ ) -> GetIncompleteEvaluationsResponseBody:
292
+ """
293
+ Get experiment runs that have incomplete evaluations.
294
+
295
+ Returns runs with:
296
+ - Missing evaluations (evaluator has not been run)
297
+ - Failed evaluations (evaluator ran but has errors)
298
+
299
+ Args:
300
+ experiment_id: The ID of the experiment
301
+ evaluation_name: List of evaluation names to check (required, at least one)
302
+ cursor: Cursor for pagination
303
+ limit: Maximum number of results to return
304
+
305
+ Returns:
306
+ Paginated list of runs with incomplete evaluations
307
+ """
308
+ try:
309
+ experiment_globalid = GlobalID.from_id(experiment_id)
310
+ except Exception as e:
311
+ raise HTTPException(
312
+ detail=f"Invalid experiment ID format: {experiment_id}",
313
+ status_code=422,
314
+ ) from e
315
+ try:
316
+ experiment_rowid = from_global_id_with_expected_type(experiment_globalid, "Experiment")
317
+ except ValueError:
318
+ raise HTTPException(
319
+ detail=f"Experiment with ID {experiment_globalid} does not exist",
320
+ status_code=404,
321
+ )
322
+
323
+ # Parse cursor if provided
324
+ cursor_run_rowid: Optional[int] = None
325
+ if cursor:
326
+ try:
327
+ cursor_gid = GlobalID.from_id(cursor)
328
+ cursor_run_rowid = from_global_id_with_expected_type(cursor_gid, "ExperimentRun")
329
+ except (ValueError, AttributeError):
330
+ raise HTTPException(
331
+ detail=f"Invalid cursor format: {cursor}",
332
+ status_code=422,
333
+ )
334
+
335
+ # Deduplicate evaluation names
336
+ evaluation_name = list(set(name.strip() for name in evaluation_name if name.strip()))
337
+
338
+ # Require at least one evaluation name
339
+ if not evaluation_name:
340
+ raise HTTPException(
341
+ detail="At least one evaluation_name must be provided",
342
+ status_code=400,
343
+ )
344
+
345
+ # Validate evaluation names - reject null bytes which are invalid in PostgreSQL
346
+ for name in evaluation_name:
347
+ if "\x00" in name:
348
+ raise HTTPException(
349
+ detail="Invalid evaluation name: null bytes are not allowed",
350
+ status_code=400,
351
+ )
352
+
353
+ async with request.app.state.db() as session:
354
+ # Verify experiment exists
355
+ experiment_result = await session.execute(
356
+ select(models.Experiment).filter_by(id=experiment_rowid)
163
357
  )
164
- experiment_runs = experiment_runs.scalars().all()
165
- runs = []
166
- for exp_run in experiment_runs:
167
- run_gid = GlobalID("ExperimentRun", str(exp_run.id))
168
- experiment_gid = GlobalID("Experiment", str(exp_run.experiment_id))
169
- example_gid = GlobalID("DatasetExample", str(exp_run.dataset_example_id))
170
- runs.append(
171
- ExperimentRunResponse(
172
- start_time=exp_run.start_time,
173
- end_time=exp_run.end_time,
174
- experiment_id=str(experiment_gid),
175
- dataset_example_id=str(example_gid),
176
- repetition_number=exp_run.repetition_number,
177
- output=exp_run.output.get("task_output"),
178
- error=exp_run.error,
179
- id=str(run_gid),
180
- trace_id=exp_run.trace_id,
358
+ experiment = experiment_result.scalar()
359
+ if not experiment:
360
+ raise HTTPException(
361
+ detail=f"Experiment with ID {experiment_globalid} does not exist",
362
+ status_code=404,
363
+ )
364
+
365
+ # Query for runs with incomplete evaluations in a single query
366
+ # This fetches runs, revisions, and annotations together to minimize round-trips
367
+ # A run has incomplete evaluations if:
368
+ # 1. It's missing an annotation for any of the requested evaluators
369
+ # 2. It has a failed annotation (error IS NOT NULL) for any evaluator
370
+
371
+ # Get dialect for SQL generation
372
+ dialect = request.app.state.db.dialect
373
+
374
+ # Single query: Get runs with incomplete evaluations + their revisions + annotations
375
+ combined_query = get_runs_with_incomplete_evaluations_query(
376
+ experiment_rowid,
377
+ evaluation_name,
378
+ dialect,
379
+ cursor_run_rowid=cursor_run_rowid,
380
+ limit=limit,
381
+ include_annotations_and_revisions=True,
382
+ )
383
+
384
+ combined_result = await session.execute(combined_query)
385
+ all_rows = combined_result.all()
386
+
387
+ if not all_rows:
388
+ return GetIncompleteEvaluationsResponseBody(data=[], next_cursor=None)
389
+
390
+ # Parse rows - now each row is a single run with successful annotations as JSON array
391
+ # Each row: (ExperimentRun, revision_id, DatasetExampleRevision, annotations_json)
392
+ runs_data: list[tuple[models.ExperimentRun, models.DatasetExampleRevision, set[str]]] = []
393
+
394
+ for row in all_rows:
395
+ run = row[0] # ExperimentRun
396
+ revision = row[2] # DatasetExampleRevision
397
+ annotations_json = row[3] # JSON string or None
398
+
399
+ # Parse successful annotation names (just a list of strings now)
400
+ successful_eval_names: set[str] = set()
401
+ if annotations_json:
402
+ successful_eval_names = set(json.loads(annotations_json))
403
+
404
+ runs_data.append((run, revision, successful_eval_names))
405
+
406
+ # Apply pagination limit
407
+ has_more = len(runs_data) > limit
408
+ if has_more:
409
+ runs_to_process = runs_data[:limit]
410
+ else:
411
+ runs_to_process = runs_data
412
+
413
+ # Build response
414
+ incomplete_evaluations_list: list[IncompleteExperimentEvaluation] = []
415
+ for run, revision, successful_eval_names in runs_to_process:
416
+ # Determine incomplete evaluation names for this run
417
+ # Any evaluation not in the successful set is incomplete (either missing or failed)
418
+ incomplete_evaluation_names = sorted(
419
+ name for name in evaluation_name if name not in successful_eval_names
420
+ )
421
+
422
+ run_globalid = GlobalID("ExperimentRun", str(run.id))
423
+ example_globalid = GlobalID("DatasetExample", str(run.dataset_example_id))
424
+
425
+ incomplete_evaluations_list.append(
426
+ IncompleteExperimentEvaluation(
427
+ experiment_run=ExperimentRun(
428
+ id=str(run_globalid),
429
+ experiment_id=str(experiment_globalid),
430
+ dataset_example_id=str(example_globalid),
431
+ output=run.output.get("task_output"),
432
+ repetition_number=run.repetition_number,
433
+ start_time=run.start_time,
434
+ end_time=run.end_time,
435
+ trace_id=run.trace_id,
436
+ error=run.error,
437
+ ),
438
+ dataset_example=DatasetExample(
439
+ id=str(example_globalid),
440
+ input=revision.input,
441
+ output=revision.output,
442
+ metadata=revision.metadata_,
443
+ updated_at=revision.created_at,
444
+ ),
445
+ evaluation_names=incomplete_evaluation_names,
181
446
  )
182
447
  )
183
- return ListExperimentRunsResponseBody(data=runs)
448
+
449
+ # Set next cursor if we have more results
450
+ next_cursor = None
451
+ if has_more:
452
+ # Cursor is the ID of the next item to fetch
453
+ # (the extra item we fetched but didn't process)
454
+ next_run, _, _ = runs_data[limit] # First item after our limit
455
+ next_cursor = str(GlobalID("ExperimentRun", str(next_run.id)))
456
+
457
+ return GetIncompleteEvaluationsResponseBody(
458
+ data=incomplete_evaluations_list, next_cursor=next_cursor
459
+ )