arize-phoenix 10.0.4__py3-none-any.whl → 12.28.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {arize_phoenix-10.0.4.dist-info → arize_phoenix-12.28.1.dist-info}/METADATA +124 -72
- arize_phoenix-12.28.1.dist-info/RECORD +499 -0
- {arize_phoenix-10.0.4.dist-info → arize_phoenix-12.28.1.dist-info}/WHEEL +1 -1
- {arize_phoenix-10.0.4.dist-info → arize_phoenix-12.28.1.dist-info}/licenses/IP_NOTICE +1 -1
- phoenix/__generated__/__init__.py +0 -0
- phoenix/__generated__/classification_evaluator_configs/__init__.py +20 -0
- phoenix/__generated__/classification_evaluator_configs/_document_relevance_classification_evaluator_config.py +17 -0
- phoenix/__generated__/classification_evaluator_configs/_hallucination_classification_evaluator_config.py +17 -0
- phoenix/__generated__/classification_evaluator_configs/_models.py +18 -0
- phoenix/__generated__/classification_evaluator_configs/_tool_selection_classification_evaluator_config.py +17 -0
- phoenix/__init__.py +5 -4
- phoenix/auth.py +39 -2
- phoenix/config.py +1763 -91
- phoenix/datetime_utils.py +120 -2
- phoenix/db/README.md +595 -25
- phoenix/db/bulk_inserter.py +145 -103
- phoenix/db/engines.py +140 -33
- phoenix/db/enums.py +3 -12
- phoenix/db/facilitator.py +302 -35
- phoenix/db/helpers.py +1000 -65
- phoenix/db/iam_auth.py +64 -0
- phoenix/db/insertion/dataset.py +135 -2
- phoenix/db/insertion/document_annotation.py +9 -6
- phoenix/db/insertion/evaluation.py +2 -3
- phoenix/db/insertion/helpers.py +17 -2
- phoenix/db/insertion/session_annotation.py +176 -0
- phoenix/db/insertion/span.py +15 -11
- phoenix/db/insertion/span_annotation.py +3 -4
- phoenix/db/insertion/trace_annotation.py +3 -4
- phoenix/db/insertion/types.py +50 -20
- phoenix/db/migrations/versions/01a8342c9cdf_add_user_id_on_datasets.py +40 -0
- phoenix/db/migrations/versions/0df286449799_add_session_annotations_table.py +105 -0
- phoenix/db/migrations/versions/272b66ff50f8_drop_single_indices.py +119 -0
- phoenix/db/migrations/versions/58228d933c91_dataset_labels.py +67 -0
- phoenix/db/migrations/versions/699f655af132_experiment_tags.py +57 -0
- phoenix/db/migrations/versions/735d3d93c33e_add_composite_indices.py +41 -0
- phoenix/db/migrations/versions/a20694b15f82_cost.py +196 -0
- phoenix/db/migrations/versions/ab513d89518b_add_user_id_on_dataset_versions.py +40 -0
- phoenix/db/migrations/versions/d0690a79ea51_users_on_experiments.py +40 -0
- phoenix/db/migrations/versions/deb2c81c0bb2_dataset_splits.py +139 -0
- phoenix/db/migrations/versions/e76cbd66ffc3_add_experiments_dataset_examples.py +87 -0
- phoenix/db/models.py +669 -56
- phoenix/db/pg_config.py +10 -0
- phoenix/db/types/model_provider.py +4 -0
- phoenix/db/types/token_price_customization.py +29 -0
- phoenix/db/types/trace_retention.py +23 -15
- phoenix/experiments/evaluators/utils.py +3 -3
- phoenix/experiments/functions.py +160 -52
- phoenix/experiments/tracing.py +2 -2
- phoenix/experiments/types.py +1 -1
- phoenix/inferences/inferences.py +1 -2
- phoenix/server/api/auth.py +38 -7
- phoenix/server/api/auth_messages.py +46 -0
- phoenix/server/api/context.py +100 -4
- phoenix/server/api/dataloaders/__init__.py +79 -5
- phoenix/server/api/dataloaders/annotation_configs_by_project.py +31 -0
- phoenix/server/api/dataloaders/annotation_summaries.py +60 -8
- phoenix/server/api/dataloaders/average_experiment_repeated_run_group_latency.py +50 -0
- phoenix/server/api/dataloaders/average_experiment_run_latency.py +17 -24
- phoenix/server/api/dataloaders/cache/two_tier_cache.py +1 -2
- phoenix/server/api/dataloaders/dataset_dataset_splits.py +52 -0
- phoenix/server/api/dataloaders/dataset_example_revisions.py +0 -1
- phoenix/server/api/dataloaders/dataset_example_splits.py +40 -0
- phoenix/server/api/dataloaders/dataset_examples_and_versions_by_experiment_run.py +47 -0
- phoenix/server/api/dataloaders/dataset_labels.py +36 -0
- phoenix/server/api/dataloaders/document_evaluation_summaries.py +2 -2
- phoenix/server/api/dataloaders/document_evaluations.py +6 -9
- phoenix/server/api/dataloaders/experiment_annotation_summaries.py +88 -34
- phoenix/server/api/dataloaders/experiment_dataset_splits.py +43 -0
- phoenix/server/api/dataloaders/experiment_error_rates.py +21 -28
- phoenix/server/api/dataloaders/experiment_repeated_run_group_annotation_summaries.py +77 -0
- phoenix/server/api/dataloaders/experiment_repeated_run_groups.py +57 -0
- phoenix/server/api/dataloaders/experiment_runs_by_experiment_and_example.py +44 -0
- phoenix/server/api/dataloaders/last_used_times_by_generative_model_id.py +35 -0
- phoenix/server/api/dataloaders/latency_ms_quantile.py +40 -8
- phoenix/server/api/dataloaders/record_counts.py +37 -10
- phoenix/server/api/dataloaders/session_annotations_by_session.py +29 -0
- phoenix/server/api/dataloaders/span_cost_by_span.py +24 -0
- phoenix/server/api/dataloaders/span_cost_detail_summary_entries_by_generative_model.py +56 -0
- phoenix/server/api/dataloaders/span_cost_detail_summary_entries_by_project_session.py +57 -0
- phoenix/server/api/dataloaders/span_cost_detail_summary_entries_by_span.py +43 -0
- phoenix/server/api/dataloaders/span_cost_detail_summary_entries_by_trace.py +56 -0
- phoenix/server/api/dataloaders/span_cost_details_by_span_cost.py +27 -0
- phoenix/server/api/dataloaders/span_cost_summary_by_experiment.py +57 -0
- phoenix/server/api/dataloaders/span_cost_summary_by_experiment_repeated_run_group.py +64 -0
- phoenix/server/api/dataloaders/span_cost_summary_by_experiment_run.py +58 -0
- phoenix/server/api/dataloaders/span_cost_summary_by_generative_model.py +55 -0
- phoenix/server/api/dataloaders/span_cost_summary_by_project.py +152 -0
- phoenix/server/api/dataloaders/span_cost_summary_by_project_session.py +56 -0
- phoenix/server/api/dataloaders/span_cost_summary_by_trace.py +55 -0
- phoenix/server/api/dataloaders/span_costs.py +29 -0
- phoenix/server/api/dataloaders/table_fields.py +2 -2
- phoenix/server/api/dataloaders/token_prices_by_model.py +30 -0
- phoenix/server/api/dataloaders/trace_annotations_by_trace.py +27 -0
- phoenix/server/api/dataloaders/types.py +29 -0
- phoenix/server/api/exceptions.py +11 -1
- phoenix/server/api/helpers/dataset_helpers.py +5 -1
- phoenix/server/api/helpers/playground_clients.py +1243 -292
- phoenix/server/api/helpers/playground_registry.py +2 -2
- phoenix/server/api/helpers/playground_spans.py +8 -4
- phoenix/server/api/helpers/playground_users.py +26 -0
- phoenix/server/api/helpers/prompts/conversions/aws.py +83 -0
- phoenix/server/api/helpers/prompts/conversions/google.py +103 -0
- phoenix/server/api/helpers/prompts/models.py +205 -22
- phoenix/server/api/input_types/{SpanAnnotationFilter.py → AnnotationFilter.py} +22 -14
- phoenix/server/api/input_types/ChatCompletionInput.py +6 -2
- phoenix/server/api/input_types/CreateProjectInput.py +27 -0
- phoenix/server/api/input_types/CreateProjectSessionAnnotationInput.py +37 -0
- phoenix/server/api/input_types/DatasetFilter.py +17 -0
- phoenix/server/api/input_types/ExperimentRunSort.py +237 -0
- phoenix/server/api/input_types/GenerativeCredentialInput.py +9 -0
- phoenix/server/api/input_types/GenerativeModelInput.py +5 -0
- phoenix/server/api/input_types/ProjectSessionSort.py +161 -1
- phoenix/server/api/input_types/PromptFilter.py +14 -0
- phoenix/server/api/input_types/PromptVersionInput.py +52 -1
- phoenix/server/api/input_types/SpanSort.py +44 -7
- phoenix/server/api/input_types/TimeBinConfig.py +23 -0
- phoenix/server/api/input_types/UpdateAnnotationInput.py +34 -0
- phoenix/server/api/input_types/UserRoleInput.py +1 -0
- phoenix/server/api/mutations/__init__.py +10 -0
- phoenix/server/api/mutations/annotation_config_mutations.py +8 -8
- phoenix/server/api/mutations/api_key_mutations.py +19 -23
- phoenix/server/api/mutations/chat_mutations.py +154 -47
- phoenix/server/api/mutations/dataset_label_mutations.py +243 -0
- phoenix/server/api/mutations/dataset_mutations.py +21 -16
- phoenix/server/api/mutations/dataset_split_mutations.py +351 -0
- phoenix/server/api/mutations/experiment_mutations.py +2 -2
- phoenix/server/api/mutations/export_events_mutations.py +3 -3
- phoenix/server/api/mutations/model_mutations.py +210 -0
- phoenix/server/api/mutations/project_mutations.py +49 -10
- phoenix/server/api/mutations/project_session_annotations_mutations.py +158 -0
- phoenix/server/api/mutations/project_trace_retention_policy_mutations.py +8 -4
- phoenix/server/api/mutations/prompt_label_mutations.py +74 -65
- phoenix/server/api/mutations/prompt_mutations.py +65 -129
- phoenix/server/api/mutations/prompt_version_tag_mutations.py +11 -8
- phoenix/server/api/mutations/span_annotations_mutations.py +15 -10
- phoenix/server/api/mutations/trace_annotations_mutations.py +14 -10
- phoenix/server/api/mutations/trace_mutations.py +47 -3
- phoenix/server/api/mutations/user_mutations.py +66 -41
- phoenix/server/api/queries.py +768 -293
- phoenix/server/api/routers/__init__.py +2 -2
- phoenix/server/api/routers/auth.py +154 -88
- phoenix/server/api/routers/ldap.py +229 -0
- phoenix/server/api/routers/oauth2.py +369 -106
- phoenix/server/api/routers/v1/__init__.py +24 -4
- phoenix/server/api/routers/v1/annotation_configs.py +23 -31
- phoenix/server/api/routers/v1/annotations.py +481 -17
- phoenix/server/api/routers/v1/datasets.py +395 -81
- phoenix/server/api/routers/v1/documents.py +142 -0
- phoenix/server/api/routers/v1/evaluations.py +24 -31
- phoenix/server/api/routers/v1/experiment_evaluations.py +19 -8
- phoenix/server/api/routers/v1/experiment_runs.py +337 -59
- phoenix/server/api/routers/v1/experiments.py +479 -48
- phoenix/server/api/routers/v1/models.py +7 -0
- phoenix/server/api/routers/v1/projects.py +18 -49
- phoenix/server/api/routers/v1/prompts.py +54 -40
- phoenix/server/api/routers/v1/sessions.py +108 -0
- phoenix/server/api/routers/v1/spans.py +1091 -81
- phoenix/server/api/routers/v1/traces.py +132 -78
- phoenix/server/api/routers/v1/users.py +389 -0
- phoenix/server/api/routers/v1/utils.py +3 -7
- phoenix/server/api/subscriptions.py +305 -88
- phoenix/server/api/types/Annotation.py +90 -23
- phoenix/server/api/types/ApiKey.py +13 -17
- phoenix/server/api/types/AuthMethod.py +1 -0
- phoenix/server/api/types/ChatCompletionSubscriptionPayload.py +1 -0
- phoenix/server/api/types/CostBreakdown.py +12 -0
- phoenix/server/api/types/Dataset.py +226 -72
- phoenix/server/api/types/DatasetExample.py +88 -18
- phoenix/server/api/types/DatasetExperimentAnnotationSummary.py +10 -0
- phoenix/server/api/types/DatasetLabel.py +57 -0
- phoenix/server/api/types/DatasetSplit.py +98 -0
- phoenix/server/api/types/DatasetVersion.py +49 -4
- phoenix/server/api/types/DocumentAnnotation.py +212 -0
- phoenix/server/api/types/Experiment.py +264 -59
- phoenix/server/api/types/ExperimentComparison.py +5 -10
- phoenix/server/api/types/ExperimentRepeatedRunGroup.py +155 -0
- phoenix/server/api/types/ExperimentRepeatedRunGroupAnnotationSummary.py +9 -0
- phoenix/server/api/types/ExperimentRun.py +169 -65
- phoenix/server/api/types/ExperimentRunAnnotation.py +158 -39
- phoenix/server/api/types/GenerativeModel.py +245 -3
- phoenix/server/api/types/GenerativeProvider.py +70 -11
- phoenix/server/api/types/{Model.py → InferenceModel.py} +1 -1
- phoenix/server/api/types/ModelInterface.py +16 -0
- phoenix/server/api/types/PlaygroundModel.py +20 -0
- phoenix/server/api/types/Project.py +1278 -216
- phoenix/server/api/types/ProjectSession.py +188 -28
- phoenix/server/api/types/ProjectSessionAnnotation.py +187 -0
- phoenix/server/api/types/ProjectTraceRetentionPolicy.py +1 -1
- phoenix/server/api/types/Prompt.py +119 -39
- phoenix/server/api/types/PromptLabel.py +42 -25
- phoenix/server/api/types/PromptVersion.py +11 -8
- phoenix/server/api/types/PromptVersionTag.py +65 -25
- phoenix/server/api/types/ServerStatus.py +6 -0
- phoenix/server/api/types/Span.py +167 -123
- phoenix/server/api/types/SpanAnnotation.py +189 -42
- phoenix/server/api/types/SpanCostDetailSummaryEntry.py +10 -0
- phoenix/server/api/types/SpanCostSummary.py +10 -0
- phoenix/server/api/types/SystemApiKey.py +65 -1
- phoenix/server/api/types/TokenPrice.py +16 -0
- phoenix/server/api/types/TokenUsage.py +3 -3
- phoenix/server/api/types/Trace.py +223 -51
- phoenix/server/api/types/TraceAnnotation.py +149 -50
- phoenix/server/api/types/User.py +137 -32
- phoenix/server/api/types/UserApiKey.py +73 -26
- phoenix/server/api/types/node.py +10 -0
- phoenix/server/api/types/pagination.py +11 -2
- phoenix/server/app.py +290 -45
- phoenix/server/authorization.py +38 -3
- phoenix/server/bearer_auth.py +34 -24
- phoenix/server/cost_tracking/cost_details_calculator.py +196 -0
- phoenix/server/cost_tracking/cost_model_lookup.py +179 -0
- phoenix/server/cost_tracking/helpers.py +68 -0
- phoenix/server/cost_tracking/model_cost_manifest.json +3657 -830
- phoenix/server/cost_tracking/regex_specificity.py +397 -0
- phoenix/server/cost_tracking/token_cost_calculator.py +57 -0
- phoenix/server/daemons/__init__.py +0 -0
- phoenix/server/daemons/db_disk_usage_monitor.py +214 -0
- phoenix/server/daemons/generative_model_store.py +103 -0
- phoenix/server/daemons/span_cost_calculator.py +99 -0
- phoenix/server/dml_event.py +17 -0
- phoenix/server/dml_event_handler.py +5 -0
- phoenix/server/email/sender.py +56 -3
- phoenix/server/email/templates/db_disk_usage_notification.html +19 -0
- phoenix/server/email/types.py +11 -0
- phoenix/server/experiments/__init__.py +0 -0
- phoenix/server/experiments/utils.py +14 -0
- phoenix/server/grpc_server.py +11 -11
- phoenix/server/jwt_store.py +17 -15
- phoenix/server/ldap.py +1449 -0
- phoenix/server/main.py +26 -10
- phoenix/server/oauth2.py +330 -12
- phoenix/server/prometheus.py +66 -6
- phoenix/server/rate_limiters.py +4 -9
- phoenix/server/retention.py +33 -20
- phoenix/server/session_filters.py +49 -0
- phoenix/server/static/.vite/manifest.json +55 -51
- phoenix/server/static/assets/components-BreFUQQa.js +6702 -0
- phoenix/server/static/assets/{index-E0M82BdE.js → index-CTQoemZv.js} +140 -56
- phoenix/server/static/assets/pages-DBE5iYM3.js +9524 -0
- phoenix/server/static/assets/vendor-BGzfc4EU.css +1 -0
- phoenix/server/static/assets/vendor-DCE4v-Ot.js +920 -0
- phoenix/server/static/assets/vendor-codemirror-D5f205eT.js +25 -0
- phoenix/server/static/assets/vendor-recharts-V9cwpXsm.js +37 -0
- phoenix/server/static/assets/vendor-shiki-Do--csgv.js +5 -0
- phoenix/server/static/assets/vendor-three-CmB8bl_y.js +3840 -0
- phoenix/server/templates/index.html +40 -6
- phoenix/server/thread_server.py +1 -2
- phoenix/server/types.py +14 -4
- phoenix/server/utils.py +74 -0
- phoenix/session/client.py +56 -3
- phoenix/session/data_extractor.py +5 -0
- phoenix/session/evaluation.py +14 -5
- phoenix/session/session.py +45 -9
- phoenix/settings.py +5 -0
- phoenix/trace/attributes.py +80 -13
- phoenix/trace/dsl/helpers.py +90 -1
- phoenix/trace/dsl/query.py +8 -6
- phoenix/trace/projects.py +5 -0
- phoenix/utilities/template_formatters.py +1 -1
- phoenix/version.py +1 -1
- arize_phoenix-10.0.4.dist-info/RECORD +0 -405
- phoenix/server/api/types/Evaluation.py +0 -39
- phoenix/server/cost_tracking/cost_lookup.py +0 -255
- phoenix/server/static/assets/components-DULKeDfL.js +0 -4365
- phoenix/server/static/assets/pages-Cl0A-0U2.js +0 -7430
- phoenix/server/static/assets/vendor-WIZid84E.css +0 -1
- phoenix/server/static/assets/vendor-arizeai-Dy-0mSNw.js +0 -649
- phoenix/server/static/assets/vendor-codemirror-DBtifKNr.js +0 -33
- phoenix/server/static/assets/vendor-oB4u9zuV.js +0 -905
- phoenix/server/static/assets/vendor-recharts-D-T4KPz2.js +0 -59
- phoenix/server/static/assets/vendor-shiki-BMn4O_9F.js +0 -5
- phoenix/server/static/assets/vendor-three-C5WAXd5r.js +0 -2998
- phoenix/utilities/deprecation.py +0 -31
- {arize_phoenix-10.0.4.dist-info → arize_phoenix-12.28.1.dist-info}/entry_points.txt +0 -0
- {arize_phoenix-10.0.4.dist-info → arize_phoenix-12.28.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,32 +1,34 @@
|
|
|
1
|
+
import json
|
|
1
2
|
from datetime import datetime
|
|
2
3
|
from typing import Any, Optional
|
|
3
4
|
|
|
4
|
-
from fastapi import APIRouter, HTTPException
|
|
5
|
+
from fastapi import APIRouter, Depends, HTTPException, Query
|
|
5
6
|
from pydantic import Field
|
|
6
7
|
from sqlalchemy import select
|
|
7
|
-
from sqlalchemy.exc import IntegrityError as PostgreSQLIntegrityError
|
|
8
|
-
from sqlean.dbapi2 import IntegrityError as SQLiteIntegrityError # type: ignore[import-untyped]
|
|
9
8
|
from starlette.requests import Request
|
|
10
|
-
from starlette.status import HTTP_404_NOT_FOUND, HTTP_409_CONFLICT
|
|
11
9
|
from strawberry.relay import GlobalID
|
|
12
10
|
|
|
13
11
|
from phoenix.db import models
|
|
12
|
+
from phoenix.db.helpers import get_runs_with_incomplete_evaluations_query
|
|
13
|
+
from phoenix.db.insertion.helpers import OnConflict, insert_on_conflict
|
|
14
14
|
from phoenix.db.models import ExperimentRunOutput
|
|
15
|
+
from phoenix.server.api.routers.v1.datasets import DatasetExample
|
|
15
16
|
from phoenix.server.api.types.node import from_global_id_with_expected_type
|
|
17
|
+
from phoenix.server.authorization import is_not_locked
|
|
16
18
|
from phoenix.server.dml_event import ExperimentRunInsertEvent
|
|
17
19
|
|
|
18
20
|
from .models import V1RoutesBaseModel
|
|
19
|
-
from .utils import ResponseBody, add_errors_to_responses
|
|
21
|
+
from .utils import PaginatedResponseBody, ResponseBody, add_errors_to_responses
|
|
20
22
|
|
|
21
23
|
router = APIRouter(tags=["experiments"], include_in_schema=True)
|
|
22
24
|
|
|
23
25
|
|
|
24
|
-
class
|
|
26
|
+
class ExperimentRunData(V1RoutesBaseModel):
|
|
25
27
|
dataset_example_id: str = Field(
|
|
26
28
|
description="The ID of the dataset example used in the experiment run"
|
|
27
29
|
)
|
|
28
30
|
output: Any = Field(description="The output of the experiment task")
|
|
29
|
-
repetition_number: int = Field(description="The repetition number of the experiment run")
|
|
31
|
+
repetition_number: int = Field(description="The repetition number of the experiment run", gt=0)
|
|
30
32
|
start_time: datetime = Field(description="The start time of the experiment run")
|
|
31
33
|
end_time: datetime = Field(description="The end time of the experiment run")
|
|
32
34
|
trace_id: Optional[str] = Field(
|
|
@@ -38,7 +40,7 @@ class ExperimentRun(V1RoutesBaseModel):
|
|
|
38
40
|
)
|
|
39
41
|
|
|
40
42
|
|
|
41
|
-
class CreateExperimentRunRequestBody(
|
|
43
|
+
class CreateExperimentRunRequestBody(ExperimentRunData):
|
|
42
44
|
pass
|
|
43
45
|
|
|
44
46
|
|
|
@@ -52,18 +54,21 @@ class CreateExperimentRunResponseBody(ResponseBody[CreateExperimentRunResponseBo
|
|
|
52
54
|
|
|
53
55
|
@router.post(
|
|
54
56
|
"/experiments/{experiment_id}/runs",
|
|
57
|
+
dependencies=[Depends(is_not_locked)],
|
|
55
58
|
operation_id="createExperimentRun",
|
|
56
59
|
summary="Create run for an experiment",
|
|
57
60
|
response_description="Experiment run created successfully",
|
|
58
61
|
responses=add_errors_to_responses(
|
|
59
62
|
[
|
|
60
63
|
{
|
|
61
|
-
"status_code":
|
|
64
|
+
"status_code": 404,
|
|
62
65
|
"description": "Experiment or dataset example not found",
|
|
63
66
|
},
|
|
64
67
|
{
|
|
65
|
-
"status_code":
|
|
66
|
-
"description":
|
|
68
|
+
"status_code": 409,
|
|
69
|
+
"description": (
|
|
70
|
+
"Experiment run already exists with a successful result and cannot be updated"
|
|
71
|
+
),
|
|
67
72
|
},
|
|
68
73
|
]
|
|
69
74
|
),
|
|
@@ -77,7 +82,7 @@ async def create_experiment_run(
|
|
|
77
82
|
except ValueError:
|
|
78
83
|
raise HTTPException(
|
|
79
84
|
detail=f"Experiment with ID {experiment_gid} does not exist",
|
|
80
|
-
status_code=
|
|
85
|
+
status_code=404,
|
|
81
86
|
)
|
|
82
87
|
|
|
83
88
|
example_gid = GlobalID.from_id(request_body.dataset_example_id)
|
|
@@ -86,7 +91,7 @@ async def create_experiment_run(
|
|
|
86
91
|
except ValueError:
|
|
87
92
|
raise HTTPException(
|
|
88
93
|
detail=f"DatasetExample with ID {example_gid} does not exist",
|
|
89
|
-
status_code=
|
|
94
|
+
status_code=404,
|
|
90
95
|
)
|
|
91
96
|
|
|
92
97
|
trace_id = request_body.trace_id
|
|
@@ -97,37 +102,72 @@ async def create_experiment_run(
|
|
|
97
102
|
error = request_body.error
|
|
98
103
|
|
|
99
104
|
async with request.app.state.db() as session:
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
repetition_number
|
|
106
|
-
start_time=start_time,
|
|
107
|
-
end_time=end_time,
|
|
108
|
-
error=error,
|
|
105
|
+
# Check if a record already exists
|
|
106
|
+
existing_run = await session.scalar(
|
|
107
|
+
select(models.ExperimentRun)
|
|
108
|
+
.where(models.ExperimentRun.experiment_id == experiment_rowid)
|
|
109
|
+
.where(models.ExperimentRun.dataset_example_id == dataset_example_id)
|
|
110
|
+
.where(models.ExperimentRun.repetition_number == repetition_number)
|
|
109
111
|
)
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
112
|
+
|
|
113
|
+
if existing_run is not None and existing_run.error is None:
|
|
114
|
+
# Record exists and has no error - reject the update
|
|
115
|
+
run_gid = GlobalID("ExperimentRun", str(existing_run.id))
|
|
114
116
|
raise HTTPException(
|
|
115
|
-
|
|
116
|
-
|
|
117
|
+
status_code=409,
|
|
118
|
+
detail=(
|
|
119
|
+
f"Experiment run {run_gid} already exists with a successful result "
|
|
120
|
+
"and cannot be updated"
|
|
121
|
+
),
|
|
117
122
|
)
|
|
118
|
-
|
|
119
|
-
|
|
123
|
+
# Either no record exists, or existing record has an error - proceed with upsert
|
|
124
|
+
stmt = insert_on_conflict(
|
|
125
|
+
{
|
|
126
|
+
"experiment_id": experiment_rowid,
|
|
127
|
+
"dataset_example_id": dataset_example_id,
|
|
128
|
+
"trace_id": trace_id,
|
|
129
|
+
"output": ExperimentRunOutput(task_output=task_output),
|
|
130
|
+
"repetition_number": repetition_number,
|
|
131
|
+
"start_time": start_time,
|
|
132
|
+
"end_time": end_time,
|
|
133
|
+
"error": error,
|
|
134
|
+
},
|
|
135
|
+
table=models.ExperimentRun,
|
|
136
|
+
dialect=request.app.state.db.dialect,
|
|
137
|
+
unique_by=["experiment_id", "dataset_example_id", "repetition_number"],
|
|
138
|
+
on_conflict=OnConflict.DO_UPDATE,
|
|
139
|
+
).returning(models.ExperimentRun.id)
|
|
140
|
+
id_ = await session.scalar(stmt)
|
|
141
|
+
|
|
142
|
+
request.state.event_queue.put(ExperimentRunInsertEvent((id_,)))
|
|
143
|
+
run_gid = GlobalID("ExperimentRun", str(id_))
|
|
120
144
|
return CreateExperimentRunResponseBody(
|
|
121
145
|
data=CreateExperimentRunResponseBodyData(id=str(run_gid))
|
|
122
146
|
)
|
|
123
147
|
|
|
124
148
|
|
|
125
|
-
class
|
|
149
|
+
class ExperimentRun(ExperimentRunData):
|
|
126
150
|
id: str = Field(description="The ID of the experiment run")
|
|
127
151
|
experiment_id: str = Field(description="The ID of the experiment")
|
|
128
152
|
|
|
129
153
|
|
|
130
|
-
class ListExperimentRunsResponseBody(
|
|
154
|
+
class ListExperimentRunsResponseBody(PaginatedResponseBody[ExperimentRun]):
|
|
155
|
+
pass
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
class IncompleteExperimentEvaluation(V1RoutesBaseModel):
|
|
159
|
+
"""
|
|
160
|
+
Information about an experiment run with incomplete evaluations
|
|
161
|
+
"""
|
|
162
|
+
|
|
163
|
+
experiment_run: ExperimentRun = Field(description="The experiment run")
|
|
164
|
+
dataset_example: DatasetExample = Field(description="The dataset example")
|
|
165
|
+
evaluation_names: list[str] = Field(
|
|
166
|
+
description="List of evaluation names that are incomplete (either missing or failed)"
|
|
167
|
+
)
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
class GetIncompleteEvaluationsResponseBody(PaginatedResponseBody[IncompleteExperimentEvaluation]):
|
|
131
171
|
pass
|
|
132
172
|
|
|
133
173
|
|
|
@@ -135,47 +175,285 @@ class ListExperimentRunsResponseBody(ResponseBody[list[ExperimentRunResponse]]):
|
|
|
135
175
|
"/experiments/{experiment_id}/runs",
|
|
136
176
|
operation_id="listExperimentRuns",
|
|
137
177
|
summary="List runs for an experiment",
|
|
178
|
+
description="Retrieve a paginated list of runs for an experiment",
|
|
138
179
|
response_description="Experiment runs retrieved successfully",
|
|
139
180
|
responses=add_errors_to_responses(
|
|
140
|
-
[
|
|
181
|
+
[
|
|
182
|
+
{"status_code": 404, "description": "Experiment not found"},
|
|
183
|
+
{"status_code": 422, "description": "Invalid cursor format"},
|
|
184
|
+
]
|
|
141
185
|
),
|
|
142
186
|
)
|
|
143
187
|
async def list_experiment_runs(
|
|
144
|
-
request: Request,
|
|
188
|
+
request: Request,
|
|
189
|
+
experiment_id: str,
|
|
190
|
+
cursor: Optional[str] = Query(
|
|
191
|
+
default=None,
|
|
192
|
+
description="Cursor for pagination (base64-encoded experiment run ID)",
|
|
193
|
+
),
|
|
194
|
+
limit: Optional[int] = Query(
|
|
195
|
+
default=None,
|
|
196
|
+
description="The max number of experiment runs to return at a time. "
|
|
197
|
+
"If not specified, returns all results.",
|
|
198
|
+
gt=0,
|
|
199
|
+
),
|
|
145
200
|
) -> ListExperimentRunsResponseBody:
|
|
146
|
-
|
|
201
|
+
try:
|
|
202
|
+
experiment_gid = GlobalID.from_id(experiment_id)
|
|
203
|
+
except Exception as e:
|
|
204
|
+
raise HTTPException(
|
|
205
|
+
detail=f"Invalid experiment ID format: {experiment_id}",
|
|
206
|
+
status_code=422,
|
|
207
|
+
) from e
|
|
147
208
|
try:
|
|
148
209
|
experiment_rowid = from_global_id_with_expected_type(experiment_gid, "Experiment")
|
|
149
210
|
except ValueError:
|
|
150
211
|
raise HTTPException(
|
|
151
212
|
detail=f"Experiment with ID {experiment_gid} does not exist",
|
|
152
|
-
status_code=
|
|
213
|
+
status_code=404,
|
|
153
214
|
)
|
|
154
215
|
|
|
216
|
+
stmt = (
|
|
217
|
+
select(models.ExperimentRun)
|
|
218
|
+
.filter_by(experiment_id=experiment_rowid)
|
|
219
|
+
.order_by(models.ExperimentRun.id.desc())
|
|
220
|
+
)
|
|
221
|
+
|
|
222
|
+
if cursor:
|
|
223
|
+
try:
|
|
224
|
+
cursor_id = GlobalID.from_id(cursor).node_id
|
|
225
|
+
stmt = stmt.where(models.ExperimentRun.id <= int(cursor_id))
|
|
226
|
+
except ValueError:
|
|
227
|
+
raise HTTPException(
|
|
228
|
+
detail=f"Invalid cursor format: {cursor}",
|
|
229
|
+
status_code=422,
|
|
230
|
+
)
|
|
231
|
+
|
|
232
|
+
# Apply limit only if specified for pagination
|
|
233
|
+
if limit is not None:
|
|
234
|
+
stmt = stmt.limit(limit + 1)
|
|
235
|
+
|
|
155
236
|
async with request.app.state.db() as session:
|
|
156
|
-
experiment_runs = await session.
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
237
|
+
experiment_runs = (await session.scalars(stmt)).all()
|
|
238
|
+
|
|
239
|
+
if not experiment_runs:
|
|
240
|
+
return ListExperimentRunsResponseBody(next_cursor=None, data=[])
|
|
241
|
+
|
|
242
|
+
next_cursor = None
|
|
243
|
+
# Only check for next cursor if limit was specified
|
|
244
|
+
if limit is not None and len(experiment_runs) == limit + 1:
|
|
245
|
+
last_run = experiment_runs[-1]
|
|
246
|
+
next_cursor = str(GlobalID("ExperimentRun", str(last_run.id)))
|
|
247
|
+
experiment_runs = experiment_runs[:-1]
|
|
248
|
+
|
|
249
|
+
runs = []
|
|
250
|
+
for exp_run in experiment_runs:
|
|
251
|
+
run_gid = GlobalID("ExperimentRun", str(exp_run.id))
|
|
252
|
+
experiment_gid = GlobalID("Experiment", str(exp_run.experiment_id))
|
|
253
|
+
example_gid = GlobalID("DatasetExample", str(exp_run.dataset_example_id))
|
|
254
|
+
runs.append(
|
|
255
|
+
ExperimentRun(
|
|
256
|
+
start_time=exp_run.start_time,
|
|
257
|
+
end_time=exp_run.end_time,
|
|
258
|
+
experiment_id=str(experiment_gid),
|
|
259
|
+
dataset_example_id=str(example_gid),
|
|
260
|
+
repetition_number=exp_run.repetition_number,
|
|
261
|
+
output=exp_run.output.get("task_output"),
|
|
262
|
+
error=exp_run.error,
|
|
263
|
+
id=str(run_gid),
|
|
264
|
+
trace_id=exp_run.trace_id,
|
|
265
|
+
)
|
|
266
|
+
)
|
|
267
|
+
return ListExperimentRunsResponseBody(data=runs, next_cursor=next_cursor)
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
@router.get(
|
|
271
|
+
"/experiments/{experiment_id}/incomplete-evaluations",
|
|
272
|
+
operation_id="getIncompleteExperimentEvaluations",
|
|
273
|
+
summary="Get incomplete evaluations for an experiment",
|
|
274
|
+
responses=add_errors_to_responses(
|
|
275
|
+
[
|
|
276
|
+
{"status_code": 400, "description": "No evaluator names provided"},
|
|
277
|
+
{"status_code": 404, "description": "Experiment not found"},
|
|
278
|
+
{"status_code": 422, "description": "Invalid cursor format"},
|
|
279
|
+
]
|
|
280
|
+
),
|
|
281
|
+
response_description="Incomplete evaluations retrieved successfully",
|
|
282
|
+
)
|
|
283
|
+
async def get_incomplete_evaluations(
|
|
284
|
+
request: Request,
|
|
285
|
+
experiment_id: str,
|
|
286
|
+
evaluation_name: list[str] = Query(default=[], description="Evaluation names to check"),
|
|
287
|
+
cursor: Optional[str] = Query(default=None, description="Cursor for pagination"),
|
|
288
|
+
limit: int = Query(
|
|
289
|
+
default=50, description="Maximum number of runs with incomplete evaluations to return", gt=0
|
|
290
|
+
),
|
|
291
|
+
) -> GetIncompleteEvaluationsResponseBody:
|
|
292
|
+
"""
|
|
293
|
+
Get experiment runs that have incomplete evaluations.
|
|
294
|
+
|
|
295
|
+
Returns runs with:
|
|
296
|
+
- Missing evaluations (evaluator has not been run)
|
|
297
|
+
- Failed evaluations (evaluator ran but has errors)
|
|
298
|
+
|
|
299
|
+
Args:
|
|
300
|
+
experiment_id: The ID of the experiment
|
|
301
|
+
evaluation_name: List of evaluation names to check (required, at least one)
|
|
302
|
+
cursor: Cursor for pagination
|
|
303
|
+
limit: Maximum number of results to return
|
|
304
|
+
|
|
305
|
+
Returns:
|
|
306
|
+
Paginated list of runs with incomplete evaluations
|
|
307
|
+
"""
|
|
308
|
+
try:
|
|
309
|
+
experiment_globalid = GlobalID.from_id(experiment_id)
|
|
310
|
+
except Exception as e:
|
|
311
|
+
raise HTTPException(
|
|
312
|
+
detail=f"Invalid experiment ID format: {experiment_id}",
|
|
313
|
+
status_code=422,
|
|
314
|
+
) from e
|
|
315
|
+
try:
|
|
316
|
+
experiment_rowid = from_global_id_with_expected_type(experiment_globalid, "Experiment")
|
|
317
|
+
except ValueError:
|
|
318
|
+
raise HTTPException(
|
|
319
|
+
detail=f"Experiment with ID {experiment_globalid} does not exist",
|
|
320
|
+
status_code=404,
|
|
321
|
+
)
|
|
322
|
+
|
|
323
|
+
# Parse cursor if provided
|
|
324
|
+
cursor_run_rowid: Optional[int] = None
|
|
325
|
+
if cursor:
|
|
326
|
+
try:
|
|
327
|
+
cursor_gid = GlobalID.from_id(cursor)
|
|
328
|
+
cursor_run_rowid = from_global_id_with_expected_type(cursor_gid, "ExperimentRun")
|
|
329
|
+
except (ValueError, AttributeError):
|
|
330
|
+
raise HTTPException(
|
|
331
|
+
detail=f"Invalid cursor format: {cursor}",
|
|
332
|
+
status_code=422,
|
|
333
|
+
)
|
|
334
|
+
|
|
335
|
+
# Deduplicate evaluation names
|
|
336
|
+
evaluation_name = list(set(name.strip() for name in evaluation_name if name.strip()))
|
|
337
|
+
|
|
338
|
+
# Require at least one evaluation name
|
|
339
|
+
if not evaluation_name:
|
|
340
|
+
raise HTTPException(
|
|
341
|
+
detail="At least one evaluation_name must be provided",
|
|
342
|
+
status_code=400,
|
|
343
|
+
)
|
|
344
|
+
|
|
345
|
+
# Validate evaluation names - reject null bytes which are invalid in PostgreSQL
|
|
346
|
+
for name in evaluation_name:
|
|
347
|
+
if "\x00" in name:
|
|
348
|
+
raise HTTPException(
|
|
349
|
+
detail="Invalid evaluation name: null bytes are not allowed",
|
|
350
|
+
status_code=400,
|
|
351
|
+
)
|
|
352
|
+
|
|
353
|
+
async with request.app.state.db() as session:
|
|
354
|
+
# Verify experiment exists
|
|
355
|
+
experiment_result = await session.execute(
|
|
356
|
+
select(models.Experiment).filter_by(id=experiment_rowid)
|
|
161
357
|
)
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
358
|
+
experiment = experiment_result.scalar()
|
|
359
|
+
if not experiment:
|
|
360
|
+
raise HTTPException(
|
|
361
|
+
detail=f"Experiment with ID {experiment_globalid} does not exist",
|
|
362
|
+
status_code=404,
|
|
363
|
+
)
|
|
364
|
+
|
|
365
|
+
# Query for runs with incomplete evaluations in a single query
|
|
366
|
+
# This fetches runs, revisions, and annotations together to minimize round-trips
|
|
367
|
+
# A run has incomplete evaluations if:
|
|
368
|
+
# 1. It's missing an annotation for any of the requested evaluators
|
|
369
|
+
# 2. It has a failed annotation (error IS NOT NULL) for any evaluator
|
|
370
|
+
|
|
371
|
+
# Get dialect for SQL generation
|
|
372
|
+
dialect = request.app.state.db.dialect
|
|
373
|
+
|
|
374
|
+
# Single query: Get runs with incomplete evaluations + their revisions + annotations
|
|
375
|
+
combined_query = get_runs_with_incomplete_evaluations_query(
|
|
376
|
+
experiment_rowid,
|
|
377
|
+
evaluation_name,
|
|
378
|
+
dialect,
|
|
379
|
+
cursor_run_rowid=cursor_run_rowid,
|
|
380
|
+
limit=limit,
|
|
381
|
+
include_annotations_and_revisions=True,
|
|
382
|
+
)
|
|
383
|
+
|
|
384
|
+
combined_result = await session.execute(combined_query)
|
|
385
|
+
all_rows = combined_result.all()
|
|
386
|
+
|
|
387
|
+
if not all_rows:
|
|
388
|
+
return GetIncompleteEvaluationsResponseBody(data=[], next_cursor=None)
|
|
389
|
+
|
|
390
|
+
# Parse rows - now each row is a single run with successful annotations as JSON array
|
|
391
|
+
# Each row: (ExperimentRun, revision_id, DatasetExampleRevision, annotations_json)
|
|
392
|
+
runs_data: list[tuple[models.ExperimentRun, models.DatasetExampleRevision, set[str]]] = []
|
|
393
|
+
|
|
394
|
+
for row in all_rows:
|
|
395
|
+
run = row[0] # ExperimentRun
|
|
396
|
+
revision = row[2] # DatasetExampleRevision
|
|
397
|
+
annotations_json = row[3] # JSON string or None
|
|
398
|
+
|
|
399
|
+
# Parse successful annotation names (just a list of strings now)
|
|
400
|
+
successful_eval_names: set[str] = set()
|
|
401
|
+
if annotations_json:
|
|
402
|
+
successful_eval_names = set(json.loads(annotations_json))
|
|
403
|
+
|
|
404
|
+
runs_data.append((run, revision, successful_eval_names))
|
|
405
|
+
|
|
406
|
+
# Apply pagination limit
|
|
407
|
+
has_more = len(runs_data) > limit
|
|
408
|
+
if has_more:
|
|
409
|
+
runs_to_process = runs_data[:limit]
|
|
410
|
+
else:
|
|
411
|
+
runs_to_process = runs_data
|
|
412
|
+
|
|
413
|
+
# Build response
|
|
414
|
+
incomplete_evaluations_list: list[IncompleteExperimentEvaluation] = []
|
|
415
|
+
for run, revision, successful_eval_names in runs_to_process:
|
|
416
|
+
# Determine incomplete evaluation names for this run
|
|
417
|
+
# Any evaluation not in the successful set is incomplete (either missing or failed)
|
|
418
|
+
incomplete_evaluation_names = sorted(
|
|
419
|
+
name for name in evaluation_name if name not in successful_eval_names
|
|
420
|
+
)
|
|
421
|
+
|
|
422
|
+
run_globalid = GlobalID("ExperimentRun", str(run.id))
|
|
423
|
+
example_globalid = GlobalID("DatasetExample", str(run.dataset_example_id))
|
|
424
|
+
|
|
425
|
+
incomplete_evaluations_list.append(
|
|
426
|
+
IncompleteExperimentEvaluation(
|
|
427
|
+
experiment_run=ExperimentRun(
|
|
428
|
+
id=str(run_globalid),
|
|
429
|
+
experiment_id=str(experiment_globalid),
|
|
430
|
+
dataset_example_id=str(example_globalid),
|
|
431
|
+
output=run.output.get("task_output"),
|
|
432
|
+
repetition_number=run.repetition_number,
|
|
433
|
+
start_time=run.start_time,
|
|
434
|
+
end_time=run.end_time,
|
|
435
|
+
trace_id=run.trace_id,
|
|
436
|
+
error=run.error,
|
|
437
|
+
),
|
|
438
|
+
dataset_example=DatasetExample(
|
|
439
|
+
id=str(example_globalid),
|
|
440
|
+
input=revision.input,
|
|
441
|
+
output=revision.output,
|
|
442
|
+
metadata=revision.metadata_,
|
|
443
|
+
updated_at=revision.created_at,
|
|
444
|
+
),
|
|
445
|
+
evaluation_names=incomplete_evaluation_names,
|
|
179
446
|
)
|
|
180
447
|
)
|
|
181
|
-
|
|
448
|
+
|
|
449
|
+
# Set next cursor if we have more results
|
|
450
|
+
next_cursor = None
|
|
451
|
+
if has_more:
|
|
452
|
+
# Cursor is the ID of the next item to fetch
|
|
453
|
+
# (the extra item we fetched but didn't process)
|
|
454
|
+
next_run, _, _ = runs_data[limit] # First item after our limit
|
|
455
|
+
next_cursor = str(GlobalID("ExperimentRun", str(next_run.id)))
|
|
456
|
+
|
|
457
|
+
return GetIncompleteEvaluationsResponseBody(
|
|
458
|
+
data=incomplete_evaluations_list, next_cursor=next_cursor
|
|
459
|
+
)
|