arize-phoenix 10.0.4__py3-none-any.whl → 12.28.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (276)
  1. {arize_phoenix-10.0.4.dist-info → arize_phoenix-12.28.1.dist-info}/METADATA +124 -72
  2. arize_phoenix-12.28.1.dist-info/RECORD +499 -0
  3. {arize_phoenix-10.0.4.dist-info → arize_phoenix-12.28.1.dist-info}/WHEEL +1 -1
  4. {arize_phoenix-10.0.4.dist-info → arize_phoenix-12.28.1.dist-info}/licenses/IP_NOTICE +1 -1
  5. phoenix/__generated__/__init__.py +0 -0
  6. phoenix/__generated__/classification_evaluator_configs/__init__.py +20 -0
  7. phoenix/__generated__/classification_evaluator_configs/_document_relevance_classification_evaluator_config.py +17 -0
  8. phoenix/__generated__/classification_evaluator_configs/_hallucination_classification_evaluator_config.py +17 -0
  9. phoenix/__generated__/classification_evaluator_configs/_models.py +18 -0
  10. phoenix/__generated__/classification_evaluator_configs/_tool_selection_classification_evaluator_config.py +17 -0
  11. phoenix/__init__.py +5 -4
  12. phoenix/auth.py +39 -2
  13. phoenix/config.py +1763 -91
  14. phoenix/datetime_utils.py +120 -2
  15. phoenix/db/README.md +595 -25
  16. phoenix/db/bulk_inserter.py +145 -103
  17. phoenix/db/engines.py +140 -33
  18. phoenix/db/enums.py +3 -12
  19. phoenix/db/facilitator.py +302 -35
  20. phoenix/db/helpers.py +1000 -65
  21. phoenix/db/iam_auth.py +64 -0
  22. phoenix/db/insertion/dataset.py +135 -2
  23. phoenix/db/insertion/document_annotation.py +9 -6
  24. phoenix/db/insertion/evaluation.py +2 -3
  25. phoenix/db/insertion/helpers.py +17 -2
  26. phoenix/db/insertion/session_annotation.py +176 -0
  27. phoenix/db/insertion/span.py +15 -11
  28. phoenix/db/insertion/span_annotation.py +3 -4
  29. phoenix/db/insertion/trace_annotation.py +3 -4
  30. phoenix/db/insertion/types.py +50 -20
  31. phoenix/db/migrations/versions/01a8342c9cdf_add_user_id_on_datasets.py +40 -0
  32. phoenix/db/migrations/versions/0df286449799_add_session_annotations_table.py +105 -0
  33. phoenix/db/migrations/versions/272b66ff50f8_drop_single_indices.py +119 -0
  34. phoenix/db/migrations/versions/58228d933c91_dataset_labels.py +67 -0
  35. phoenix/db/migrations/versions/699f655af132_experiment_tags.py +57 -0
  36. phoenix/db/migrations/versions/735d3d93c33e_add_composite_indices.py +41 -0
  37. phoenix/db/migrations/versions/a20694b15f82_cost.py +196 -0
  38. phoenix/db/migrations/versions/ab513d89518b_add_user_id_on_dataset_versions.py +40 -0
  39. phoenix/db/migrations/versions/d0690a79ea51_users_on_experiments.py +40 -0
  40. phoenix/db/migrations/versions/deb2c81c0bb2_dataset_splits.py +139 -0
  41. phoenix/db/migrations/versions/e76cbd66ffc3_add_experiments_dataset_examples.py +87 -0
  42. phoenix/db/models.py +669 -56
  43. phoenix/db/pg_config.py +10 -0
  44. phoenix/db/types/model_provider.py +4 -0
  45. phoenix/db/types/token_price_customization.py +29 -0
  46. phoenix/db/types/trace_retention.py +23 -15
  47. phoenix/experiments/evaluators/utils.py +3 -3
  48. phoenix/experiments/functions.py +160 -52
  49. phoenix/experiments/tracing.py +2 -2
  50. phoenix/experiments/types.py +1 -1
  51. phoenix/inferences/inferences.py +1 -2
  52. phoenix/server/api/auth.py +38 -7
  53. phoenix/server/api/auth_messages.py +46 -0
  54. phoenix/server/api/context.py +100 -4
  55. phoenix/server/api/dataloaders/__init__.py +79 -5
  56. phoenix/server/api/dataloaders/annotation_configs_by_project.py +31 -0
  57. phoenix/server/api/dataloaders/annotation_summaries.py +60 -8
  58. phoenix/server/api/dataloaders/average_experiment_repeated_run_group_latency.py +50 -0
  59. phoenix/server/api/dataloaders/average_experiment_run_latency.py +17 -24
  60. phoenix/server/api/dataloaders/cache/two_tier_cache.py +1 -2
  61. phoenix/server/api/dataloaders/dataset_dataset_splits.py +52 -0
  62. phoenix/server/api/dataloaders/dataset_example_revisions.py +0 -1
  63. phoenix/server/api/dataloaders/dataset_example_splits.py +40 -0
  64. phoenix/server/api/dataloaders/dataset_examples_and_versions_by_experiment_run.py +47 -0
  65. phoenix/server/api/dataloaders/dataset_labels.py +36 -0
  66. phoenix/server/api/dataloaders/document_evaluation_summaries.py +2 -2
  67. phoenix/server/api/dataloaders/document_evaluations.py +6 -9
  68. phoenix/server/api/dataloaders/experiment_annotation_summaries.py +88 -34
  69. phoenix/server/api/dataloaders/experiment_dataset_splits.py +43 -0
  70. phoenix/server/api/dataloaders/experiment_error_rates.py +21 -28
  71. phoenix/server/api/dataloaders/experiment_repeated_run_group_annotation_summaries.py +77 -0
  72. phoenix/server/api/dataloaders/experiment_repeated_run_groups.py +57 -0
  73. phoenix/server/api/dataloaders/experiment_runs_by_experiment_and_example.py +44 -0
  74. phoenix/server/api/dataloaders/last_used_times_by_generative_model_id.py +35 -0
  75. phoenix/server/api/dataloaders/latency_ms_quantile.py +40 -8
  76. phoenix/server/api/dataloaders/record_counts.py +37 -10
  77. phoenix/server/api/dataloaders/session_annotations_by_session.py +29 -0
  78. phoenix/server/api/dataloaders/span_cost_by_span.py +24 -0
  79. phoenix/server/api/dataloaders/span_cost_detail_summary_entries_by_generative_model.py +56 -0
  80. phoenix/server/api/dataloaders/span_cost_detail_summary_entries_by_project_session.py +57 -0
  81. phoenix/server/api/dataloaders/span_cost_detail_summary_entries_by_span.py +43 -0
  82. phoenix/server/api/dataloaders/span_cost_detail_summary_entries_by_trace.py +56 -0
  83. phoenix/server/api/dataloaders/span_cost_details_by_span_cost.py +27 -0
  84. phoenix/server/api/dataloaders/span_cost_summary_by_experiment.py +57 -0
  85. phoenix/server/api/dataloaders/span_cost_summary_by_experiment_repeated_run_group.py +64 -0
  86. phoenix/server/api/dataloaders/span_cost_summary_by_experiment_run.py +58 -0
  87. phoenix/server/api/dataloaders/span_cost_summary_by_generative_model.py +55 -0
  88. phoenix/server/api/dataloaders/span_cost_summary_by_project.py +152 -0
  89. phoenix/server/api/dataloaders/span_cost_summary_by_project_session.py +56 -0
  90. phoenix/server/api/dataloaders/span_cost_summary_by_trace.py +55 -0
  91. phoenix/server/api/dataloaders/span_costs.py +29 -0
  92. phoenix/server/api/dataloaders/table_fields.py +2 -2
  93. phoenix/server/api/dataloaders/token_prices_by_model.py +30 -0
  94. phoenix/server/api/dataloaders/trace_annotations_by_trace.py +27 -0
  95. phoenix/server/api/dataloaders/types.py +29 -0
  96. phoenix/server/api/exceptions.py +11 -1
  97. phoenix/server/api/helpers/dataset_helpers.py +5 -1
  98. phoenix/server/api/helpers/playground_clients.py +1243 -292
  99. phoenix/server/api/helpers/playground_registry.py +2 -2
  100. phoenix/server/api/helpers/playground_spans.py +8 -4
  101. phoenix/server/api/helpers/playground_users.py +26 -0
  102. phoenix/server/api/helpers/prompts/conversions/aws.py +83 -0
  103. phoenix/server/api/helpers/prompts/conversions/google.py +103 -0
  104. phoenix/server/api/helpers/prompts/models.py +205 -22
  105. phoenix/server/api/input_types/{SpanAnnotationFilter.py → AnnotationFilter.py} +22 -14
  106. phoenix/server/api/input_types/ChatCompletionInput.py +6 -2
  107. phoenix/server/api/input_types/CreateProjectInput.py +27 -0
  108. phoenix/server/api/input_types/CreateProjectSessionAnnotationInput.py +37 -0
  109. phoenix/server/api/input_types/DatasetFilter.py +17 -0
  110. phoenix/server/api/input_types/ExperimentRunSort.py +237 -0
  111. phoenix/server/api/input_types/GenerativeCredentialInput.py +9 -0
  112. phoenix/server/api/input_types/GenerativeModelInput.py +5 -0
  113. phoenix/server/api/input_types/ProjectSessionSort.py +161 -1
  114. phoenix/server/api/input_types/PromptFilter.py +14 -0
  115. phoenix/server/api/input_types/PromptVersionInput.py +52 -1
  116. phoenix/server/api/input_types/SpanSort.py +44 -7
  117. phoenix/server/api/input_types/TimeBinConfig.py +23 -0
  118. phoenix/server/api/input_types/UpdateAnnotationInput.py +34 -0
  119. phoenix/server/api/input_types/UserRoleInput.py +1 -0
  120. phoenix/server/api/mutations/__init__.py +10 -0
  121. phoenix/server/api/mutations/annotation_config_mutations.py +8 -8
  122. phoenix/server/api/mutations/api_key_mutations.py +19 -23
  123. phoenix/server/api/mutations/chat_mutations.py +154 -47
  124. phoenix/server/api/mutations/dataset_label_mutations.py +243 -0
  125. phoenix/server/api/mutations/dataset_mutations.py +21 -16
  126. phoenix/server/api/mutations/dataset_split_mutations.py +351 -0
  127. phoenix/server/api/mutations/experiment_mutations.py +2 -2
  128. phoenix/server/api/mutations/export_events_mutations.py +3 -3
  129. phoenix/server/api/mutations/model_mutations.py +210 -0
  130. phoenix/server/api/mutations/project_mutations.py +49 -10
  131. phoenix/server/api/mutations/project_session_annotations_mutations.py +158 -0
  132. phoenix/server/api/mutations/project_trace_retention_policy_mutations.py +8 -4
  133. phoenix/server/api/mutations/prompt_label_mutations.py +74 -65
  134. phoenix/server/api/mutations/prompt_mutations.py +65 -129
  135. phoenix/server/api/mutations/prompt_version_tag_mutations.py +11 -8
  136. phoenix/server/api/mutations/span_annotations_mutations.py +15 -10
  137. phoenix/server/api/mutations/trace_annotations_mutations.py +14 -10
  138. phoenix/server/api/mutations/trace_mutations.py +47 -3
  139. phoenix/server/api/mutations/user_mutations.py +66 -41
  140. phoenix/server/api/queries.py +768 -293
  141. phoenix/server/api/routers/__init__.py +2 -2
  142. phoenix/server/api/routers/auth.py +154 -88
  143. phoenix/server/api/routers/ldap.py +229 -0
  144. phoenix/server/api/routers/oauth2.py +369 -106
  145. phoenix/server/api/routers/v1/__init__.py +24 -4
  146. phoenix/server/api/routers/v1/annotation_configs.py +23 -31
  147. phoenix/server/api/routers/v1/annotations.py +481 -17
  148. phoenix/server/api/routers/v1/datasets.py +395 -81
  149. phoenix/server/api/routers/v1/documents.py +142 -0
  150. phoenix/server/api/routers/v1/evaluations.py +24 -31
  151. phoenix/server/api/routers/v1/experiment_evaluations.py +19 -8
  152. phoenix/server/api/routers/v1/experiment_runs.py +337 -59
  153. phoenix/server/api/routers/v1/experiments.py +479 -48
  154. phoenix/server/api/routers/v1/models.py +7 -0
  155. phoenix/server/api/routers/v1/projects.py +18 -49
  156. phoenix/server/api/routers/v1/prompts.py +54 -40
  157. phoenix/server/api/routers/v1/sessions.py +108 -0
  158. phoenix/server/api/routers/v1/spans.py +1091 -81
  159. phoenix/server/api/routers/v1/traces.py +132 -78
  160. phoenix/server/api/routers/v1/users.py +389 -0
  161. phoenix/server/api/routers/v1/utils.py +3 -7
  162. phoenix/server/api/subscriptions.py +305 -88
  163. phoenix/server/api/types/Annotation.py +90 -23
  164. phoenix/server/api/types/ApiKey.py +13 -17
  165. phoenix/server/api/types/AuthMethod.py +1 -0
  166. phoenix/server/api/types/ChatCompletionSubscriptionPayload.py +1 -0
  167. phoenix/server/api/types/CostBreakdown.py +12 -0
  168. phoenix/server/api/types/Dataset.py +226 -72
  169. phoenix/server/api/types/DatasetExample.py +88 -18
  170. phoenix/server/api/types/DatasetExperimentAnnotationSummary.py +10 -0
  171. phoenix/server/api/types/DatasetLabel.py +57 -0
  172. phoenix/server/api/types/DatasetSplit.py +98 -0
  173. phoenix/server/api/types/DatasetVersion.py +49 -4
  174. phoenix/server/api/types/DocumentAnnotation.py +212 -0
  175. phoenix/server/api/types/Experiment.py +264 -59
  176. phoenix/server/api/types/ExperimentComparison.py +5 -10
  177. phoenix/server/api/types/ExperimentRepeatedRunGroup.py +155 -0
  178. phoenix/server/api/types/ExperimentRepeatedRunGroupAnnotationSummary.py +9 -0
  179. phoenix/server/api/types/ExperimentRun.py +169 -65
  180. phoenix/server/api/types/ExperimentRunAnnotation.py +158 -39
  181. phoenix/server/api/types/GenerativeModel.py +245 -3
  182. phoenix/server/api/types/GenerativeProvider.py +70 -11
  183. phoenix/server/api/types/{Model.py → InferenceModel.py} +1 -1
  184. phoenix/server/api/types/ModelInterface.py +16 -0
  185. phoenix/server/api/types/PlaygroundModel.py +20 -0
  186. phoenix/server/api/types/Project.py +1278 -216
  187. phoenix/server/api/types/ProjectSession.py +188 -28
  188. phoenix/server/api/types/ProjectSessionAnnotation.py +187 -0
  189. phoenix/server/api/types/ProjectTraceRetentionPolicy.py +1 -1
  190. phoenix/server/api/types/Prompt.py +119 -39
  191. phoenix/server/api/types/PromptLabel.py +42 -25
  192. phoenix/server/api/types/PromptVersion.py +11 -8
  193. phoenix/server/api/types/PromptVersionTag.py +65 -25
  194. phoenix/server/api/types/ServerStatus.py +6 -0
  195. phoenix/server/api/types/Span.py +167 -123
  196. phoenix/server/api/types/SpanAnnotation.py +189 -42
  197. phoenix/server/api/types/SpanCostDetailSummaryEntry.py +10 -0
  198. phoenix/server/api/types/SpanCostSummary.py +10 -0
  199. phoenix/server/api/types/SystemApiKey.py +65 -1
  200. phoenix/server/api/types/TokenPrice.py +16 -0
  201. phoenix/server/api/types/TokenUsage.py +3 -3
  202. phoenix/server/api/types/Trace.py +223 -51
  203. phoenix/server/api/types/TraceAnnotation.py +149 -50
  204. phoenix/server/api/types/User.py +137 -32
  205. phoenix/server/api/types/UserApiKey.py +73 -26
  206. phoenix/server/api/types/node.py +10 -0
  207. phoenix/server/api/types/pagination.py +11 -2
  208. phoenix/server/app.py +290 -45
  209. phoenix/server/authorization.py +38 -3
  210. phoenix/server/bearer_auth.py +34 -24
  211. phoenix/server/cost_tracking/cost_details_calculator.py +196 -0
  212. phoenix/server/cost_tracking/cost_model_lookup.py +179 -0
  213. phoenix/server/cost_tracking/helpers.py +68 -0
  214. phoenix/server/cost_tracking/model_cost_manifest.json +3657 -830
  215. phoenix/server/cost_tracking/regex_specificity.py +397 -0
  216. phoenix/server/cost_tracking/token_cost_calculator.py +57 -0
  217. phoenix/server/daemons/__init__.py +0 -0
  218. phoenix/server/daemons/db_disk_usage_monitor.py +214 -0
  219. phoenix/server/daemons/generative_model_store.py +103 -0
  220. phoenix/server/daemons/span_cost_calculator.py +99 -0
  221. phoenix/server/dml_event.py +17 -0
  222. phoenix/server/dml_event_handler.py +5 -0
  223. phoenix/server/email/sender.py +56 -3
  224. phoenix/server/email/templates/db_disk_usage_notification.html +19 -0
  225. phoenix/server/email/types.py +11 -0
  226. phoenix/server/experiments/__init__.py +0 -0
  227. phoenix/server/experiments/utils.py +14 -0
  228. phoenix/server/grpc_server.py +11 -11
  229. phoenix/server/jwt_store.py +17 -15
  230. phoenix/server/ldap.py +1449 -0
  231. phoenix/server/main.py +26 -10
  232. phoenix/server/oauth2.py +330 -12
  233. phoenix/server/prometheus.py +66 -6
  234. phoenix/server/rate_limiters.py +4 -9
  235. phoenix/server/retention.py +33 -20
  236. phoenix/server/session_filters.py +49 -0
  237. phoenix/server/static/.vite/manifest.json +55 -51
  238. phoenix/server/static/assets/components-BreFUQQa.js +6702 -0
  239. phoenix/server/static/assets/{index-E0M82BdE.js → index-CTQoemZv.js} +140 -56
  240. phoenix/server/static/assets/pages-DBE5iYM3.js +9524 -0
  241. phoenix/server/static/assets/vendor-BGzfc4EU.css +1 -0
  242. phoenix/server/static/assets/vendor-DCE4v-Ot.js +920 -0
  243. phoenix/server/static/assets/vendor-codemirror-D5f205eT.js +25 -0
  244. phoenix/server/static/assets/vendor-recharts-V9cwpXsm.js +37 -0
  245. phoenix/server/static/assets/vendor-shiki-Do--csgv.js +5 -0
  246. phoenix/server/static/assets/vendor-three-CmB8bl_y.js +3840 -0
  247. phoenix/server/templates/index.html +40 -6
  248. phoenix/server/thread_server.py +1 -2
  249. phoenix/server/types.py +14 -4
  250. phoenix/server/utils.py +74 -0
  251. phoenix/session/client.py +56 -3
  252. phoenix/session/data_extractor.py +5 -0
  253. phoenix/session/evaluation.py +14 -5
  254. phoenix/session/session.py +45 -9
  255. phoenix/settings.py +5 -0
  256. phoenix/trace/attributes.py +80 -13
  257. phoenix/trace/dsl/helpers.py +90 -1
  258. phoenix/trace/dsl/query.py +8 -6
  259. phoenix/trace/projects.py +5 -0
  260. phoenix/utilities/template_formatters.py +1 -1
  261. phoenix/version.py +1 -1
  262. arize_phoenix-10.0.4.dist-info/RECORD +0 -405
  263. phoenix/server/api/types/Evaluation.py +0 -39
  264. phoenix/server/cost_tracking/cost_lookup.py +0 -255
  265. phoenix/server/static/assets/components-DULKeDfL.js +0 -4365
  266. phoenix/server/static/assets/pages-Cl0A-0U2.js +0 -7430
  267. phoenix/server/static/assets/vendor-WIZid84E.css +0 -1
  268. phoenix/server/static/assets/vendor-arizeai-Dy-0mSNw.js +0 -649
  269. phoenix/server/static/assets/vendor-codemirror-DBtifKNr.js +0 -33
  270. phoenix/server/static/assets/vendor-oB4u9zuV.js +0 -905
  271. phoenix/server/static/assets/vendor-recharts-D-T4KPz2.js +0 -59
  272. phoenix/server/static/assets/vendor-shiki-BMn4O_9F.js +0 -5
  273. phoenix/server/static/assets/vendor-three-C5WAXd5r.js +0 -2998
  274. phoenix/utilities/deprecation.py +0 -31
  275. {arize_phoenix-10.0.4.dist-info → arize_phoenix-12.28.1.dist-info}/entry_points.txt +0 -0
  276. {arize_phoenix-10.0.4.dist-info → arize_phoenix-12.28.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,32 +1,34 @@
1
+ import json
1
2
  from datetime import datetime
2
3
  from typing import Any, Optional
3
4
 
4
- from fastapi import APIRouter, HTTPException
5
+ from fastapi import APIRouter, Depends, HTTPException, Query
5
6
  from pydantic import Field
6
7
  from sqlalchemy import select
7
- from sqlalchemy.exc import IntegrityError as PostgreSQLIntegrityError
8
- from sqlean.dbapi2 import IntegrityError as SQLiteIntegrityError # type: ignore[import-untyped]
9
8
  from starlette.requests import Request
10
- from starlette.status import HTTP_404_NOT_FOUND, HTTP_409_CONFLICT
11
9
  from strawberry.relay import GlobalID
12
10
 
13
11
  from phoenix.db import models
12
+ from phoenix.db.helpers import get_runs_with_incomplete_evaluations_query
13
+ from phoenix.db.insertion.helpers import OnConflict, insert_on_conflict
14
14
  from phoenix.db.models import ExperimentRunOutput
15
+ from phoenix.server.api.routers.v1.datasets import DatasetExample
15
16
  from phoenix.server.api.types.node import from_global_id_with_expected_type
17
+ from phoenix.server.authorization import is_not_locked
16
18
  from phoenix.server.dml_event import ExperimentRunInsertEvent
17
19
 
18
20
  from .models import V1RoutesBaseModel
19
- from .utils import ResponseBody, add_errors_to_responses
21
+ from .utils import PaginatedResponseBody, ResponseBody, add_errors_to_responses
20
22
 
21
23
  router = APIRouter(tags=["experiments"], include_in_schema=True)
22
24
 
23
25
 
24
- class ExperimentRun(V1RoutesBaseModel):
26
+ class ExperimentRunData(V1RoutesBaseModel):
25
27
  dataset_example_id: str = Field(
26
28
  description="The ID of the dataset example used in the experiment run"
27
29
  )
28
30
  output: Any = Field(description="The output of the experiment task")
29
- repetition_number: int = Field(description="The repetition number of the experiment run")
31
+ repetition_number: int = Field(description="The repetition number of the experiment run", gt=0)
30
32
  start_time: datetime = Field(description="The start time of the experiment run")
31
33
  end_time: datetime = Field(description="The end time of the experiment run")
32
34
  trace_id: Optional[str] = Field(
@@ -38,7 +40,7 @@ class ExperimentRun(V1RoutesBaseModel):
38
40
  )
39
41
 
40
42
 
41
- class CreateExperimentRunRequestBody(ExperimentRun):
43
+ class CreateExperimentRunRequestBody(ExperimentRunData):
42
44
  pass
43
45
 
44
46
 
@@ -52,18 +54,21 @@ class CreateExperimentRunResponseBody(ResponseBody[CreateExperimentRunResponseBo
52
54
 
53
55
  @router.post(
54
56
  "/experiments/{experiment_id}/runs",
57
+ dependencies=[Depends(is_not_locked)],
55
58
  operation_id="createExperimentRun",
56
59
  summary="Create run for an experiment",
57
60
  response_description="Experiment run created successfully",
58
61
  responses=add_errors_to_responses(
59
62
  [
60
63
  {
61
- "status_code": HTTP_404_NOT_FOUND,
64
+ "status_code": 404,
62
65
  "description": "Experiment or dataset example not found",
63
66
  },
64
67
  {
65
- "status_code": HTTP_409_CONFLICT,
66
- "description": "This experiment run has already been submitted",
68
+ "status_code": 409,
69
+ "description": (
70
+ "Experiment run already exists with a successful result and cannot be updated"
71
+ ),
67
72
  },
68
73
  ]
69
74
  ),
@@ -77,7 +82,7 @@ async def create_experiment_run(
77
82
  except ValueError:
78
83
  raise HTTPException(
79
84
  detail=f"Experiment with ID {experiment_gid} does not exist",
80
- status_code=HTTP_404_NOT_FOUND,
85
+ status_code=404,
81
86
  )
82
87
 
83
88
  example_gid = GlobalID.from_id(request_body.dataset_example_id)
@@ -86,7 +91,7 @@ async def create_experiment_run(
86
91
  except ValueError:
87
92
  raise HTTPException(
88
93
  detail=f"DatasetExample with ID {example_gid} does not exist",
89
- status_code=HTTP_404_NOT_FOUND,
94
+ status_code=404,
90
95
  )
91
96
 
92
97
  trace_id = request_body.trace_id
@@ -97,37 +102,72 @@ async def create_experiment_run(
97
102
  error = request_body.error
98
103
 
99
104
  async with request.app.state.db() as session:
100
- exp_run = models.ExperimentRun(
101
- experiment_id=experiment_rowid,
102
- dataset_example_id=dataset_example_id,
103
- trace_id=trace_id,
104
- output=ExperimentRunOutput(task_output=task_output),
105
- repetition_number=repetition_number,
106
- start_time=start_time,
107
- end_time=end_time,
108
- error=error,
105
+ # Check if a record already exists
106
+ existing_run = await session.scalar(
107
+ select(models.ExperimentRun)
108
+ .where(models.ExperimentRun.experiment_id == experiment_rowid)
109
+ .where(models.ExperimentRun.dataset_example_id == dataset_example_id)
110
+ .where(models.ExperimentRun.repetition_number == repetition_number)
109
111
  )
110
- try:
111
- session.add(exp_run)
112
- await session.flush()
113
- except (PostgreSQLIntegrityError, SQLiteIntegrityError):
112
+
113
+ if existing_run is not None and existing_run.error is None:
114
+ # Record exists and has no error - reject the update
115
+ run_gid = GlobalID("ExperimentRun", str(existing_run.id))
114
116
  raise HTTPException(
115
- detail="This experiment run has already been submitted",
116
- status_code=HTTP_409_CONFLICT,
117
+ status_code=409,
118
+ detail=(
119
+ f"Experiment run {run_gid} already exists with a successful result "
120
+ "and cannot be updated"
121
+ ),
117
122
  )
118
- request.state.event_queue.put(ExperimentRunInsertEvent((exp_run.id,)))
119
- run_gid = GlobalID("ExperimentRun", str(exp_run.id))
123
+ # Either no record exists, or existing record has an error - proceed with upsert
124
+ stmt = insert_on_conflict(
125
+ {
126
+ "experiment_id": experiment_rowid,
127
+ "dataset_example_id": dataset_example_id,
128
+ "trace_id": trace_id,
129
+ "output": ExperimentRunOutput(task_output=task_output),
130
+ "repetition_number": repetition_number,
131
+ "start_time": start_time,
132
+ "end_time": end_time,
133
+ "error": error,
134
+ },
135
+ table=models.ExperimentRun,
136
+ dialect=request.app.state.db.dialect,
137
+ unique_by=["experiment_id", "dataset_example_id", "repetition_number"],
138
+ on_conflict=OnConflict.DO_UPDATE,
139
+ ).returning(models.ExperimentRun.id)
140
+ id_ = await session.scalar(stmt)
141
+
142
+ request.state.event_queue.put(ExperimentRunInsertEvent((id_,)))
143
+ run_gid = GlobalID("ExperimentRun", str(id_))
120
144
  return CreateExperimentRunResponseBody(
121
145
  data=CreateExperimentRunResponseBodyData(id=str(run_gid))
122
146
  )
123
147
 
124
148
 
125
- class ExperimentRunResponse(ExperimentRun):
149
+ class ExperimentRun(ExperimentRunData):
126
150
  id: str = Field(description="The ID of the experiment run")
127
151
  experiment_id: str = Field(description="The ID of the experiment")
128
152
 
129
153
 
130
- class ListExperimentRunsResponseBody(ResponseBody[list[ExperimentRunResponse]]):
154
+ class ListExperimentRunsResponseBody(PaginatedResponseBody[ExperimentRun]):
155
+ pass
156
+
157
+
158
+ class IncompleteExperimentEvaluation(V1RoutesBaseModel):
159
+ """
160
+ Information about an experiment run with incomplete evaluations
161
+ """
162
+
163
+ experiment_run: ExperimentRun = Field(description="The experiment run")
164
+ dataset_example: DatasetExample = Field(description="The dataset example")
165
+ evaluation_names: list[str] = Field(
166
+ description="List of evaluation names that are incomplete (either missing or failed)"
167
+ )
168
+
169
+
170
+ class GetIncompleteEvaluationsResponseBody(PaginatedResponseBody[IncompleteExperimentEvaluation]):
131
171
  pass
132
172
 
133
173
 
@@ -135,47 +175,285 @@ class ListExperimentRunsResponseBody(ResponseBody[list[ExperimentRunResponse]]):
135
175
  "/experiments/{experiment_id}/runs",
136
176
  operation_id="listExperimentRuns",
137
177
  summary="List runs for an experiment",
178
+ description="Retrieve a paginated list of runs for an experiment",
138
179
  response_description="Experiment runs retrieved successfully",
139
180
  responses=add_errors_to_responses(
140
- [{"status_code": HTTP_404_NOT_FOUND, "description": "Experiment not found"}]
181
+ [
182
+ {"status_code": 404, "description": "Experiment not found"},
183
+ {"status_code": 422, "description": "Invalid cursor format"},
184
+ ]
141
185
  ),
142
186
  )
143
187
  async def list_experiment_runs(
144
- request: Request, experiment_id: str
188
+ request: Request,
189
+ experiment_id: str,
190
+ cursor: Optional[str] = Query(
191
+ default=None,
192
+ description="Cursor for pagination (base64-encoded experiment run ID)",
193
+ ),
194
+ limit: Optional[int] = Query(
195
+ default=None,
196
+ description="The max number of experiment runs to return at a time. "
197
+ "If not specified, returns all results.",
198
+ gt=0,
199
+ ),
145
200
  ) -> ListExperimentRunsResponseBody:
146
- experiment_gid = GlobalID.from_id(experiment_id)
201
+ try:
202
+ experiment_gid = GlobalID.from_id(experiment_id)
203
+ except Exception as e:
204
+ raise HTTPException(
205
+ detail=f"Invalid experiment ID format: {experiment_id}",
206
+ status_code=422,
207
+ ) from e
147
208
  try:
148
209
  experiment_rowid = from_global_id_with_expected_type(experiment_gid, "Experiment")
149
210
  except ValueError:
150
211
  raise HTTPException(
151
212
  detail=f"Experiment with ID {experiment_gid} does not exist",
152
- status_code=HTTP_404_NOT_FOUND,
213
+ status_code=404,
153
214
  )
154
215
 
216
+ stmt = (
217
+ select(models.ExperimentRun)
218
+ .filter_by(experiment_id=experiment_rowid)
219
+ .order_by(models.ExperimentRun.id.desc())
220
+ )
221
+
222
+ if cursor:
223
+ try:
224
+ cursor_id = GlobalID.from_id(cursor).node_id
225
+ stmt = stmt.where(models.ExperimentRun.id <= int(cursor_id))
226
+ except ValueError:
227
+ raise HTTPException(
228
+ detail=f"Invalid cursor format: {cursor}",
229
+ status_code=422,
230
+ )
231
+
232
+ # Apply limit only if specified for pagination
233
+ if limit is not None:
234
+ stmt = stmt.limit(limit + 1)
235
+
155
236
  async with request.app.state.db() as session:
156
- experiment_runs = await session.execute(
157
- select(models.ExperimentRun)
158
- .where(models.ExperimentRun.experiment_id == experiment_rowid)
159
- # order by dataset_example_id to be consistent with `list_dataset_examples`
160
- .order_by(models.ExperimentRun.dataset_example_id.asc())
237
+ experiment_runs = (await session.scalars(stmt)).all()
238
+
239
+ if not experiment_runs:
240
+ return ListExperimentRunsResponseBody(next_cursor=None, data=[])
241
+
242
+ next_cursor = None
243
+ # Only check for next cursor if limit was specified
244
+ if limit is not None and len(experiment_runs) == limit + 1:
245
+ last_run = experiment_runs[-1]
246
+ next_cursor = str(GlobalID("ExperimentRun", str(last_run.id)))
247
+ experiment_runs = experiment_runs[:-1]
248
+
249
+ runs = []
250
+ for exp_run in experiment_runs:
251
+ run_gid = GlobalID("ExperimentRun", str(exp_run.id))
252
+ experiment_gid = GlobalID("Experiment", str(exp_run.experiment_id))
253
+ example_gid = GlobalID("DatasetExample", str(exp_run.dataset_example_id))
254
+ runs.append(
255
+ ExperimentRun(
256
+ start_time=exp_run.start_time,
257
+ end_time=exp_run.end_time,
258
+ experiment_id=str(experiment_gid),
259
+ dataset_example_id=str(example_gid),
260
+ repetition_number=exp_run.repetition_number,
261
+ output=exp_run.output.get("task_output"),
262
+ error=exp_run.error,
263
+ id=str(run_gid),
264
+ trace_id=exp_run.trace_id,
265
+ )
266
+ )
267
+ return ListExperimentRunsResponseBody(data=runs, next_cursor=next_cursor)
268
+
269
+
270
+ @router.get(
271
+ "/experiments/{experiment_id}/incomplete-evaluations",
272
+ operation_id="getIncompleteExperimentEvaluations",
273
+ summary="Get incomplete evaluations for an experiment",
274
+ responses=add_errors_to_responses(
275
+ [
276
+ {"status_code": 400, "description": "No evaluator names provided"},
277
+ {"status_code": 404, "description": "Experiment not found"},
278
+ {"status_code": 422, "description": "Invalid cursor format"},
279
+ ]
280
+ ),
281
+ response_description="Incomplete evaluations retrieved successfully",
282
+ )
283
+ async def get_incomplete_evaluations(
284
+ request: Request,
285
+ experiment_id: str,
286
+ evaluation_name: list[str] = Query(default=[], description="Evaluation names to check"),
287
+ cursor: Optional[str] = Query(default=None, description="Cursor for pagination"),
288
+ limit: int = Query(
289
+ default=50, description="Maximum number of runs with incomplete evaluations to return", gt=0
290
+ ),
291
+ ) -> GetIncompleteEvaluationsResponseBody:
292
+ """
293
+ Get experiment runs that have incomplete evaluations.
294
+
295
+ Returns runs with:
296
+ - Missing evaluations (evaluator has not been run)
297
+ - Failed evaluations (evaluator ran but has errors)
298
+
299
+ Args:
300
+ experiment_id: The ID of the experiment
301
+ evaluation_name: List of evaluation names to check (required, at least one)
302
+ cursor: Cursor for pagination
303
+ limit: Maximum number of results to return
304
+
305
+ Returns:
306
+ Paginated list of runs with incomplete evaluations
307
+ """
308
+ try:
309
+ experiment_globalid = GlobalID.from_id(experiment_id)
310
+ except Exception as e:
311
+ raise HTTPException(
312
+ detail=f"Invalid experiment ID format: {experiment_id}",
313
+ status_code=422,
314
+ ) from e
315
+ try:
316
+ experiment_rowid = from_global_id_with_expected_type(experiment_globalid, "Experiment")
317
+ except ValueError:
318
+ raise HTTPException(
319
+ detail=f"Experiment with ID {experiment_globalid} does not exist",
320
+ status_code=404,
321
+ )
322
+
323
+ # Parse cursor if provided
324
+ cursor_run_rowid: Optional[int] = None
325
+ if cursor:
326
+ try:
327
+ cursor_gid = GlobalID.from_id(cursor)
328
+ cursor_run_rowid = from_global_id_with_expected_type(cursor_gid, "ExperimentRun")
329
+ except (ValueError, AttributeError):
330
+ raise HTTPException(
331
+ detail=f"Invalid cursor format: {cursor}",
332
+ status_code=422,
333
+ )
334
+
335
+ # Deduplicate evaluation names
336
+ evaluation_name = list(set(name.strip() for name in evaluation_name if name.strip()))
337
+
338
+ # Require at least one evaluation name
339
+ if not evaluation_name:
340
+ raise HTTPException(
341
+ detail="At least one evaluation_name must be provided",
342
+ status_code=400,
343
+ )
344
+
345
+ # Validate evaluation names - reject null bytes which are invalid in PostgreSQL
346
+ for name in evaluation_name:
347
+ if "\x00" in name:
348
+ raise HTTPException(
349
+ detail="Invalid evaluation name: null bytes are not allowed",
350
+ status_code=400,
351
+ )
352
+
353
+ async with request.app.state.db() as session:
354
+ # Verify experiment exists
355
+ experiment_result = await session.execute(
356
+ select(models.Experiment).filter_by(id=experiment_rowid)
161
357
  )
162
- experiment_runs = experiment_runs.scalars().all()
163
- runs = []
164
- for exp_run in experiment_runs:
165
- run_gid = GlobalID("ExperimentRun", str(exp_run.id))
166
- experiment_gid = GlobalID("Experiment", str(exp_run.experiment_id))
167
- example_gid = GlobalID("DatasetExample", str(exp_run.dataset_example_id))
168
- runs.append(
169
- ExperimentRunResponse(
170
- start_time=exp_run.start_time,
171
- end_time=exp_run.end_time,
172
- experiment_id=str(experiment_gid),
173
- dataset_example_id=str(example_gid),
174
- repetition_number=exp_run.repetition_number,
175
- output=exp_run.output.get("task_output"),
176
- error=exp_run.error,
177
- id=str(run_gid),
178
- trace_id=exp_run.trace_id,
358
+ experiment = experiment_result.scalar()
359
+ if not experiment:
360
+ raise HTTPException(
361
+ detail=f"Experiment with ID {experiment_globalid} does not exist",
362
+ status_code=404,
363
+ )
364
+
365
+ # Query for runs with incomplete evaluations in a single query
366
+ # This fetches runs, revisions, and annotations together to minimize round-trips
367
+ # A run has incomplete evaluations if:
368
+ # 1. It's missing an annotation for any of the requested evaluators
369
+ # 2. It has a failed annotation (error IS NOT NULL) for any evaluator
370
+
371
+ # Get dialect for SQL generation
372
+ dialect = request.app.state.db.dialect
373
+
374
+ # Single query: Get runs with incomplete evaluations + their revisions + annotations
375
+ combined_query = get_runs_with_incomplete_evaluations_query(
376
+ experiment_rowid,
377
+ evaluation_name,
378
+ dialect,
379
+ cursor_run_rowid=cursor_run_rowid,
380
+ limit=limit,
381
+ include_annotations_and_revisions=True,
382
+ )
383
+
384
+ combined_result = await session.execute(combined_query)
385
+ all_rows = combined_result.all()
386
+
387
+ if not all_rows:
388
+ return GetIncompleteEvaluationsResponseBody(data=[], next_cursor=None)
389
+
390
+ # Parse rows - now each row is a single run with successful annotations as JSON array
391
+ # Each row: (ExperimentRun, revision_id, DatasetExampleRevision, annotations_json)
392
+ runs_data: list[tuple[models.ExperimentRun, models.DatasetExampleRevision, set[str]]] = []
393
+
394
+ for row in all_rows:
395
+ run = row[0] # ExperimentRun
396
+ revision = row[2] # DatasetExampleRevision
397
+ annotations_json = row[3] # JSON string or None
398
+
399
+ # Parse successful annotation names (just a list of strings now)
400
+ successful_eval_names: set[str] = set()
401
+ if annotations_json:
402
+ successful_eval_names = set(json.loads(annotations_json))
403
+
404
+ runs_data.append((run, revision, successful_eval_names))
405
+
406
+ # Apply pagination limit
407
+ has_more = len(runs_data) > limit
408
+ if has_more:
409
+ runs_to_process = runs_data[:limit]
410
+ else:
411
+ runs_to_process = runs_data
412
+
413
+ # Build response
414
+ incomplete_evaluations_list: list[IncompleteExperimentEvaluation] = []
415
+ for run, revision, successful_eval_names in runs_to_process:
416
+ # Determine incomplete evaluation names for this run
417
+ # Any evaluation not in the successful set is incomplete (either missing or failed)
418
+ incomplete_evaluation_names = sorted(
419
+ name for name in evaluation_name if name not in successful_eval_names
420
+ )
421
+
422
+ run_globalid = GlobalID("ExperimentRun", str(run.id))
423
+ example_globalid = GlobalID("DatasetExample", str(run.dataset_example_id))
424
+
425
+ incomplete_evaluations_list.append(
426
+ IncompleteExperimentEvaluation(
427
+ experiment_run=ExperimentRun(
428
+ id=str(run_globalid),
429
+ experiment_id=str(experiment_globalid),
430
+ dataset_example_id=str(example_globalid),
431
+ output=run.output.get("task_output"),
432
+ repetition_number=run.repetition_number,
433
+ start_time=run.start_time,
434
+ end_time=run.end_time,
435
+ trace_id=run.trace_id,
436
+ error=run.error,
437
+ ),
438
+ dataset_example=DatasetExample(
439
+ id=str(example_globalid),
440
+ input=revision.input,
441
+ output=revision.output,
442
+ metadata=revision.metadata_,
443
+ updated_at=revision.created_at,
444
+ ),
445
+ evaluation_names=incomplete_evaluation_names,
179
446
  )
180
447
  )
181
- return ListExperimentRunsResponseBody(data=runs)
448
+
449
+ # Set next cursor if we have more results
450
+ next_cursor = None
451
+ if has_more:
452
+ # Cursor is the ID of the next item to fetch
453
+ # (the extra item we fetched but didn't process)
454
+ next_run, _, _ = runs_data[limit] # First item after our limit
455
+ next_cursor = str(GlobalID("ExperimentRun", str(next_run.id)))
456
+
457
+ return GetIncompleteEvaluationsResponseBody(
458
+ data=incomplete_evaluations_list, next_cursor=next_cursor
459
+ )