arize-phoenix 3.16.1__py3-none-any.whl → 7.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of arize-phoenix might be problematic.
- arize_phoenix-7.7.1.dist-info/METADATA +261 -0
- arize_phoenix-7.7.1.dist-info/RECORD +345 -0
- {arize_phoenix-3.16.1.dist-info → arize_phoenix-7.7.1.dist-info}/WHEEL +1 -1
- arize_phoenix-7.7.1.dist-info/entry_points.txt +3 -0
- phoenix/__init__.py +86 -14
- phoenix/auth.py +309 -0
- phoenix/config.py +675 -45
- phoenix/core/model.py +32 -30
- phoenix/core/model_schema.py +102 -109
- phoenix/core/model_schema_adapter.py +48 -45
- phoenix/datetime_utils.py +24 -3
- phoenix/db/README.md +54 -0
- phoenix/db/__init__.py +4 -0
- phoenix/db/alembic.ini +85 -0
- phoenix/db/bulk_inserter.py +294 -0
- phoenix/db/engines.py +208 -0
- phoenix/db/enums.py +20 -0
- phoenix/db/facilitator.py +113 -0
- phoenix/db/helpers.py +159 -0
- phoenix/db/insertion/constants.py +2 -0
- phoenix/db/insertion/dataset.py +227 -0
- phoenix/db/insertion/document_annotation.py +171 -0
- phoenix/db/insertion/evaluation.py +191 -0
- phoenix/db/insertion/helpers.py +98 -0
- phoenix/db/insertion/span.py +193 -0
- phoenix/db/insertion/span_annotation.py +158 -0
- phoenix/db/insertion/trace_annotation.py +158 -0
- phoenix/db/insertion/types.py +256 -0
- phoenix/db/migrate.py +86 -0
- phoenix/db/migrations/data_migration_scripts/populate_project_sessions.py +199 -0
- phoenix/db/migrations/env.py +114 -0
- phoenix/db/migrations/script.py.mako +26 -0
- phoenix/db/migrations/versions/10460e46d750_datasets.py +317 -0
- phoenix/db/migrations/versions/3be8647b87d8_add_token_columns_to_spans_table.py +126 -0
- phoenix/db/migrations/versions/4ded9e43755f_create_project_sessions_table.py +66 -0
- phoenix/db/migrations/versions/cd164e83824f_users_and_tokens.py +157 -0
- phoenix/db/migrations/versions/cf03bd6bae1d_init.py +280 -0
- phoenix/db/models.py +807 -0
- phoenix/exceptions.py +5 -1
- phoenix/experiments/__init__.py +6 -0
- phoenix/experiments/evaluators/__init__.py +29 -0
- phoenix/experiments/evaluators/base.py +158 -0
- phoenix/experiments/evaluators/code_evaluators.py +184 -0
- phoenix/experiments/evaluators/llm_evaluators.py +473 -0
- phoenix/experiments/evaluators/utils.py +236 -0
- phoenix/experiments/functions.py +772 -0
- phoenix/experiments/tracing.py +86 -0
- phoenix/experiments/types.py +726 -0
- phoenix/experiments/utils.py +25 -0
- phoenix/inferences/__init__.py +0 -0
- phoenix/{datasets → inferences}/errors.py +6 -5
- phoenix/{datasets → inferences}/fixtures.py +49 -42
- phoenix/{datasets/dataset.py → inferences/inferences.py} +121 -105
- phoenix/{datasets → inferences}/schema.py +11 -11
- phoenix/{datasets → inferences}/validation.py +13 -14
- phoenix/logging/__init__.py +3 -0
- phoenix/logging/_config.py +90 -0
- phoenix/logging/_filter.py +6 -0
- phoenix/logging/_formatter.py +69 -0
- phoenix/metrics/__init__.py +5 -4
- phoenix/metrics/binning.py +4 -3
- phoenix/metrics/metrics.py +2 -1
- phoenix/metrics/mixins.py +7 -6
- phoenix/metrics/retrieval_metrics.py +2 -1
- phoenix/metrics/timeseries.py +5 -4
- phoenix/metrics/wrappers.py +9 -3
- phoenix/pointcloud/clustering.py +5 -5
- phoenix/pointcloud/pointcloud.py +7 -5
- phoenix/pointcloud/projectors.py +5 -6
- phoenix/pointcloud/umap_parameters.py +53 -52
- phoenix/server/api/README.md +28 -0
- phoenix/server/api/auth.py +44 -0
- phoenix/server/api/context.py +152 -9
- phoenix/server/api/dataloaders/__init__.py +91 -0
- phoenix/server/api/dataloaders/annotation_summaries.py +139 -0
- phoenix/server/api/dataloaders/average_experiment_run_latency.py +54 -0
- phoenix/server/api/dataloaders/cache/__init__.py +3 -0
- phoenix/server/api/dataloaders/cache/two_tier_cache.py +68 -0
- phoenix/server/api/dataloaders/dataset_example_revisions.py +131 -0
- phoenix/server/api/dataloaders/dataset_example_spans.py +38 -0
- phoenix/server/api/dataloaders/document_evaluation_summaries.py +144 -0
- phoenix/server/api/dataloaders/document_evaluations.py +31 -0
- phoenix/server/api/dataloaders/document_retrieval_metrics.py +89 -0
- phoenix/server/api/dataloaders/experiment_annotation_summaries.py +79 -0
- phoenix/server/api/dataloaders/experiment_error_rates.py +58 -0
- phoenix/server/api/dataloaders/experiment_run_annotations.py +36 -0
- phoenix/server/api/dataloaders/experiment_run_counts.py +49 -0
- phoenix/server/api/dataloaders/experiment_sequence_number.py +44 -0
- phoenix/server/api/dataloaders/latency_ms_quantile.py +188 -0
- phoenix/server/api/dataloaders/min_start_or_max_end_times.py +85 -0
- phoenix/server/api/dataloaders/project_by_name.py +31 -0
- phoenix/server/api/dataloaders/record_counts.py +116 -0
- phoenix/server/api/dataloaders/session_io.py +79 -0
- phoenix/server/api/dataloaders/session_num_traces.py +30 -0
- phoenix/server/api/dataloaders/session_num_traces_with_error.py +32 -0
- phoenix/server/api/dataloaders/session_token_usages.py +41 -0
- phoenix/server/api/dataloaders/session_trace_latency_ms_quantile.py +55 -0
- phoenix/server/api/dataloaders/span_annotations.py +26 -0
- phoenix/server/api/dataloaders/span_dataset_examples.py +31 -0
- phoenix/server/api/dataloaders/span_descendants.py +57 -0
- phoenix/server/api/dataloaders/span_projects.py +33 -0
- phoenix/server/api/dataloaders/token_counts.py +124 -0
- phoenix/server/api/dataloaders/trace_by_trace_ids.py +25 -0
- phoenix/server/api/dataloaders/trace_root_spans.py +32 -0
- phoenix/server/api/dataloaders/user_roles.py +30 -0
- phoenix/server/api/dataloaders/users.py +33 -0
- phoenix/server/api/exceptions.py +48 -0
- phoenix/server/api/helpers/__init__.py +12 -0
- phoenix/server/api/helpers/dataset_helpers.py +217 -0
- phoenix/server/api/helpers/experiment_run_filters.py +763 -0
- phoenix/server/api/helpers/playground_clients.py +948 -0
- phoenix/server/api/helpers/playground_registry.py +70 -0
- phoenix/server/api/helpers/playground_spans.py +455 -0
- phoenix/server/api/input_types/AddExamplesToDatasetInput.py +16 -0
- phoenix/server/api/input_types/AddSpansToDatasetInput.py +14 -0
- phoenix/server/api/input_types/ChatCompletionInput.py +38 -0
- phoenix/server/api/input_types/ChatCompletionMessageInput.py +24 -0
- phoenix/server/api/input_types/ClearProjectInput.py +15 -0
- phoenix/server/api/input_types/ClusterInput.py +2 -2
- phoenix/server/api/input_types/CreateDatasetInput.py +12 -0
- phoenix/server/api/input_types/CreateSpanAnnotationInput.py +18 -0
- phoenix/server/api/input_types/CreateTraceAnnotationInput.py +18 -0
- phoenix/server/api/input_types/DataQualityMetricInput.py +5 -2
- phoenix/server/api/input_types/DatasetExampleInput.py +14 -0
- phoenix/server/api/input_types/DatasetSort.py +17 -0
- phoenix/server/api/input_types/DatasetVersionSort.py +16 -0
- phoenix/server/api/input_types/DeleteAnnotationsInput.py +7 -0
- phoenix/server/api/input_types/DeleteDatasetExamplesInput.py +13 -0
- phoenix/server/api/input_types/DeleteDatasetInput.py +7 -0
- phoenix/server/api/input_types/DeleteExperimentsInput.py +7 -0
- phoenix/server/api/input_types/DimensionFilter.py +4 -4
- phoenix/server/api/input_types/GenerativeModelInput.py +17 -0
- phoenix/server/api/input_types/Granularity.py +1 -1
- phoenix/server/api/input_types/InvocationParameters.py +162 -0
- phoenix/server/api/input_types/PatchAnnotationInput.py +19 -0
- phoenix/server/api/input_types/PatchDatasetExamplesInput.py +35 -0
- phoenix/server/api/input_types/PatchDatasetInput.py +14 -0
- phoenix/server/api/input_types/PerformanceMetricInput.py +5 -2
- phoenix/server/api/input_types/ProjectSessionSort.py +29 -0
- phoenix/server/api/input_types/SpanAnnotationSort.py +17 -0
- phoenix/server/api/input_types/SpanSort.py +134 -69
- phoenix/server/api/input_types/TemplateOptions.py +10 -0
- phoenix/server/api/input_types/TraceAnnotationSort.py +17 -0
- phoenix/server/api/input_types/UserRoleInput.py +9 -0
- phoenix/server/api/mutations/__init__.py +28 -0
- phoenix/server/api/mutations/api_key_mutations.py +167 -0
- phoenix/server/api/mutations/chat_mutations.py +593 -0
- phoenix/server/api/mutations/dataset_mutations.py +591 -0
- phoenix/server/api/mutations/experiment_mutations.py +75 -0
- phoenix/server/api/{types/ExportEventsMutation.py → mutations/export_events_mutations.py} +21 -18
- phoenix/server/api/mutations/project_mutations.py +57 -0
- phoenix/server/api/mutations/span_annotations_mutations.py +128 -0
- phoenix/server/api/mutations/trace_annotations_mutations.py +127 -0
- phoenix/server/api/mutations/user_mutations.py +329 -0
- phoenix/server/api/openapi/__init__.py +0 -0
- phoenix/server/api/openapi/main.py +17 -0
- phoenix/server/api/openapi/schema.py +16 -0
- phoenix/server/api/queries.py +738 -0
- phoenix/server/api/routers/__init__.py +11 -0
- phoenix/server/api/routers/auth.py +284 -0
- phoenix/server/api/routers/embeddings.py +26 -0
- phoenix/server/api/routers/oauth2.py +488 -0
- phoenix/server/api/routers/v1/__init__.py +64 -0
- phoenix/server/api/routers/v1/datasets.py +1017 -0
- phoenix/server/api/routers/v1/evaluations.py +362 -0
- phoenix/server/api/routers/v1/experiment_evaluations.py +115 -0
- phoenix/server/api/routers/v1/experiment_runs.py +167 -0
- phoenix/server/api/routers/v1/experiments.py +308 -0
- phoenix/server/api/routers/v1/pydantic_compat.py +78 -0
- phoenix/server/api/routers/v1/spans.py +267 -0
- phoenix/server/api/routers/v1/traces.py +208 -0
- phoenix/server/api/routers/v1/utils.py +95 -0
- phoenix/server/api/schema.py +44 -241
- phoenix/server/api/subscriptions.py +597 -0
- phoenix/server/api/types/Annotation.py +21 -0
- phoenix/server/api/types/AnnotationSummary.py +55 -0
- phoenix/server/api/types/AnnotatorKind.py +16 -0
- phoenix/server/api/types/ApiKey.py +27 -0
- phoenix/server/api/types/AuthMethod.py +9 -0
- phoenix/server/api/types/ChatCompletionMessageRole.py +11 -0
- phoenix/server/api/types/ChatCompletionSubscriptionPayload.py +46 -0
- phoenix/server/api/types/Cluster.py +25 -24
- phoenix/server/api/types/CreateDatasetPayload.py +8 -0
- phoenix/server/api/types/DataQualityMetric.py +31 -13
- phoenix/server/api/types/Dataset.py +288 -63
- phoenix/server/api/types/DatasetExample.py +85 -0
- phoenix/server/api/types/DatasetExampleRevision.py +34 -0
- phoenix/server/api/types/DatasetVersion.py +14 -0
- phoenix/server/api/types/Dimension.py +32 -31
- phoenix/server/api/types/DocumentEvaluationSummary.py +9 -8
- phoenix/server/api/types/EmbeddingDimension.py +56 -49
- phoenix/server/api/types/Evaluation.py +25 -31
- phoenix/server/api/types/EvaluationSummary.py +30 -50
- phoenix/server/api/types/Event.py +20 -20
- phoenix/server/api/types/ExampleRevisionInterface.py +14 -0
- phoenix/server/api/types/Experiment.py +152 -0
- phoenix/server/api/types/ExperimentAnnotationSummary.py +13 -0
- phoenix/server/api/types/ExperimentComparison.py +17 -0
- phoenix/server/api/types/ExperimentRun.py +119 -0
- phoenix/server/api/types/ExperimentRunAnnotation.py +56 -0
- phoenix/server/api/types/GenerativeModel.py +9 -0
- phoenix/server/api/types/GenerativeProvider.py +85 -0
- phoenix/server/api/types/Inferences.py +80 -0
- phoenix/server/api/types/InferencesRole.py +23 -0
- phoenix/server/api/types/LabelFraction.py +7 -0
- phoenix/server/api/types/MimeType.py +2 -2
- phoenix/server/api/types/Model.py +54 -54
- phoenix/server/api/types/PerformanceMetric.py +8 -5
- phoenix/server/api/types/Project.py +407 -142
- phoenix/server/api/types/ProjectSession.py +139 -0
- phoenix/server/api/types/Segments.py +4 -4
- phoenix/server/api/types/Span.py +221 -176
- phoenix/server/api/types/SpanAnnotation.py +43 -0
- phoenix/server/api/types/SpanIOValue.py +15 -0
- phoenix/server/api/types/SystemApiKey.py +9 -0
- phoenix/server/api/types/TemplateLanguage.py +10 -0
- phoenix/server/api/types/TimeSeries.py +19 -15
- phoenix/server/api/types/TokenUsage.py +11 -0
- phoenix/server/api/types/Trace.py +154 -0
- phoenix/server/api/types/TraceAnnotation.py +45 -0
- phoenix/server/api/types/UMAPPoints.py +7 -7
- phoenix/server/api/types/User.py +60 -0
- phoenix/server/api/types/UserApiKey.py +45 -0
- phoenix/server/api/types/UserRole.py +15 -0
- phoenix/server/api/types/node.py +4 -112
- phoenix/server/api/types/pagination.py +156 -57
- phoenix/server/api/utils.py +34 -0
- phoenix/server/app.py +864 -115
- phoenix/server/bearer_auth.py +163 -0
- phoenix/server/dml_event.py +136 -0
- phoenix/server/dml_event_handler.py +256 -0
- phoenix/server/email/__init__.py +0 -0
- phoenix/server/email/sender.py +97 -0
- phoenix/server/email/templates/__init__.py +0 -0
- phoenix/server/email/templates/password_reset.html +19 -0
- phoenix/server/email/types.py +11 -0
- phoenix/server/grpc_server.py +102 -0
- phoenix/server/jwt_store.py +505 -0
- phoenix/server/main.py +305 -116
- phoenix/server/oauth2.py +52 -0
- phoenix/server/openapi/__init__.py +0 -0
- phoenix/server/prometheus.py +111 -0
- phoenix/server/rate_limiters.py +188 -0
- phoenix/server/static/.vite/manifest.json +87 -0
- phoenix/server/static/assets/components-Cy9nwIvF.js +2125 -0
- phoenix/server/static/assets/index-BKvHIxkk.js +113 -0
- phoenix/server/static/assets/pages-CUi2xCVQ.js +4449 -0
- phoenix/server/static/assets/vendor-DvC8cT4X.js +894 -0
- phoenix/server/static/assets/vendor-DxkFTwjz.css +1 -0
- phoenix/server/static/assets/vendor-arizeai-Do1793cv.js +662 -0
- phoenix/server/static/assets/vendor-codemirror-BzwZPyJM.js +24 -0
- phoenix/server/static/assets/vendor-recharts-_Jb7JjhG.js +59 -0
- phoenix/server/static/assets/vendor-shiki-Cl9QBraO.js +5 -0
- phoenix/server/static/assets/vendor-three-DwGkEfCM.js +2998 -0
- phoenix/server/telemetry.py +68 -0
- phoenix/server/templates/index.html +82 -23
- phoenix/server/thread_server.py +3 -3
- phoenix/server/types.py +275 -0
- phoenix/services.py +27 -18
- phoenix/session/client.py +743 -68
- phoenix/session/data_extractor.py +31 -7
- phoenix/session/evaluation.py +3 -9
- phoenix/session/session.py +263 -219
- phoenix/settings.py +22 -0
- phoenix/trace/__init__.py +2 -22
- phoenix/trace/attributes.py +338 -0
- phoenix/trace/dsl/README.md +116 -0
- phoenix/trace/dsl/filter.py +663 -213
- phoenix/trace/dsl/helpers.py +73 -21
- phoenix/trace/dsl/query.py +574 -201
- phoenix/trace/exporter.py +24 -19
- phoenix/trace/fixtures.py +368 -32
- phoenix/trace/otel.py +71 -219
- phoenix/trace/projects.py +3 -2
- phoenix/trace/schemas.py +33 -11
- phoenix/trace/span_evaluations.py +21 -16
- phoenix/trace/span_json_decoder.py +6 -4
- phoenix/trace/span_json_encoder.py +2 -2
- phoenix/trace/trace_dataset.py +47 -32
- phoenix/trace/utils.py +21 -4
- phoenix/utilities/__init__.py +0 -26
- phoenix/utilities/client.py +132 -0
- phoenix/utilities/deprecation.py +31 -0
- phoenix/utilities/error_handling.py +3 -2
- phoenix/utilities/json.py +109 -0
- phoenix/utilities/logging.py +8 -0
- phoenix/utilities/project.py +2 -2
- phoenix/utilities/re.py +49 -0
- phoenix/utilities/span_store.py +0 -23
- phoenix/utilities/template_formatters.py +99 -0
- phoenix/version.py +1 -1
- arize_phoenix-3.16.1.dist-info/METADATA +0 -495
- arize_phoenix-3.16.1.dist-info/RECORD +0 -178
- phoenix/core/project.py +0 -619
- phoenix/core/traces.py +0 -96
- phoenix/experimental/evals/__init__.py +0 -73
- phoenix/experimental/evals/evaluators.py +0 -413
- phoenix/experimental/evals/functions/__init__.py +0 -4
- phoenix/experimental/evals/functions/classify.py +0 -453
- phoenix/experimental/evals/functions/executor.py +0 -353
- phoenix/experimental/evals/functions/generate.py +0 -138
- phoenix/experimental/evals/functions/processing.py +0 -76
- phoenix/experimental/evals/models/__init__.py +0 -14
- phoenix/experimental/evals/models/anthropic.py +0 -175
- phoenix/experimental/evals/models/base.py +0 -170
- phoenix/experimental/evals/models/bedrock.py +0 -221
- phoenix/experimental/evals/models/litellm.py +0 -134
- phoenix/experimental/evals/models/openai.py +0 -448
- phoenix/experimental/evals/models/rate_limiters.py +0 -246
- phoenix/experimental/evals/models/vertex.py +0 -173
- phoenix/experimental/evals/models/vertexai.py +0 -186
- phoenix/experimental/evals/retrievals.py +0 -96
- phoenix/experimental/evals/templates/__init__.py +0 -50
- phoenix/experimental/evals/templates/default_templates.py +0 -472
- phoenix/experimental/evals/templates/template.py +0 -195
- phoenix/experimental/evals/utils/__init__.py +0 -172
- phoenix/experimental/evals/utils/threads.py +0 -27
- phoenix/server/api/helpers.py +0 -11
- phoenix/server/api/routers/evaluation_handler.py +0 -109
- phoenix/server/api/routers/span_handler.py +0 -70
- phoenix/server/api/routers/trace_handler.py +0 -60
- phoenix/server/api/types/DatasetRole.py +0 -23
- phoenix/server/static/index.css +0 -6
- phoenix/server/static/index.js +0 -7447
- phoenix/storage/span_store/__init__.py +0 -23
- phoenix/storage/span_store/text_file.py +0 -85
- phoenix/trace/dsl/missing.py +0 -60
- phoenix/trace/langchain/__init__.py +0 -3
- phoenix/trace/langchain/instrumentor.py +0 -35
- phoenix/trace/llama_index/__init__.py +0 -3
- phoenix/trace/llama_index/callback.py +0 -102
- phoenix/trace/openai/__init__.py +0 -3
- phoenix/trace/openai/instrumentor.py +0 -30
- {arize_phoenix-3.16.1.dist-info → arize_phoenix-7.7.1.dist-info}/licenses/IP_NOTICE +0 -0
- {arize_phoenix-3.16.1.dist-info → arize_phoenix-7.7.1.dist-info}/licenses/LICENSE +0 -0
- /phoenix/{datasets → db/insertion}/__init__.py +0 -0
- /phoenix/{experimental → db/migrations}/__init__.py +0 -0
- /phoenix/{storage → db/migrations/data_migration_scripts}/__init__.py +0 -0
phoenix/experiments/functions.py
@@ -0,0 +1,772 @@
+import asyncio
+import functools
+import inspect
+import json
+import traceback
+from binascii import hexlify
+from collections.abc import Awaitable, Mapping, Sequence
+from contextlib import ExitStack
+from copy import deepcopy
+from dataclasses import replace
+from datetime import datetime, timezone
+from itertools import product
+from typing import Any, Literal, Optional, Union, cast
+from urllib.parse import urljoin
+
+import httpx
+import opentelemetry.sdk.trace as trace_sdk
+import pandas as pd
+from openinference.semconv.resource import ResourceAttributes
+from openinference.semconv.trace import (
+    OpenInferenceMimeTypeValues,
+    OpenInferenceSpanKindValues,
+    SpanAttributes,
+)
+from opentelemetry.context import Context
+from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
+from opentelemetry.sdk.resources import Resource
+from opentelemetry.sdk.trace import Span
+from opentelemetry.sdk.trace.export import SimpleSpanProcessor
+from opentelemetry.trace import Status, StatusCode, Tracer
+from typing_extensions import TypeAlias
+
+from phoenix.config import get_base_url
+from phoenix.evals.executors import get_executor_on_sync_context
+from phoenix.evals.models.rate_limiters import RateLimiter
+from phoenix.evals.utils import get_tqdm_progress_bar_formatter
+from phoenix.experiments.evaluators import create_evaluator
+from phoenix.experiments.evaluators.base import (
+    Evaluator,
+    ExperimentEvaluator,
+)
+from phoenix.experiments.tracing import capture_spans
+from phoenix.experiments.types import (
+    DRY_RUN,
+    Dataset,
+    EvaluationParameters,
+    EvaluationResult,
+    EvaluationSummary,
+    EvaluatorName,
+    Example,
+    Experiment,
+    ExperimentEvaluationRun,
+    ExperimentParameters,
+    ExperimentRun,
+    ExperimentTask,
+    RanExperiment,
+    TaskSummary,
+    TestCase,
+    _asdict,
+    _replace,
+)
+from phoenix.experiments.utils import get_dataset_experiments_url, get_experiment_url, get_func_name
+from phoenix.trace.attributes import flatten
+from phoenix.utilities.client import VersionedAsyncClient, VersionedClient
+from phoenix.utilities.json import jsonify
+
+
+def _phoenix_clients() -> tuple[httpx.Client, httpx.AsyncClient]:
+    return VersionedClient(
+        base_url=get_base_url(),
+    ), VersionedAsyncClient(
+        base_url=get_base_url(),
+    )
+
+
+Evaluators: TypeAlias = Union[
+    ExperimentEvaluator,
+    Sequence[ExperimentEvaluator],
+    Mapping[EvaluatorName, ExperimentEvaluator],
+]
+
+
+RateLimitErrors: TypeAlias = Union[type[BaseException], Sequence[type[BaseException]]]
+
+
+def run_experiment(
+    dataset: Dataset,
+    task: ExperimentTask,
+    evaluators: Optional[Evaluators] = None,
+    *,
+    experiment_name: Optional[str] = None,
+    experiment_description: Optional[str] = None,
+    experiment_metadata: Optional[Mapping[str, Any]] = None,
+    rate_limit_errors: Optional[RateLimitErrors] = None,
+    dry_run: Union[bool, int] = False,
+    print_summary: bool = True,
+    concurrency: int = 3,
+) -> RanExperiment:
+    """
+    Runs an experiment using a given set of dataset of examples.
+
+    An experiment is a user-defined task that runs on each example in a dataset. The results from
+    each experiment can be evaluated using any number of evaluators to measure the behavior of the
+    task. The experiment and evaluation results are stored in the Phoenix database for comparison
+    and analysis.
+
+    A `task` is either a synchronous or asynchronous function that returns a JSON serializable
+    output. If the `task` is a function of one argument then that argument will be bound to the
+    `input` field of the dataset example. Alternatively, the `task` can be a function of any
+    combination of specific argument names that will be bound to special values:
+
+    - `input`: The input field of the dataset example
+    - `expected`: The expected or reference output of the dataset example
+    - `reference`: An alias for `expected`
+    - `metadata`: Metadata associated with the dataset example
+    - `example`: The dataset `Example` object with all associated fields
+
+    An `evaluator` is either a synchronous or asynchronous function that returns either a boolean
+    or numeric "score". If the `evaluator` is a function of one argument then that argument will be
+    bound to the `output` of the task. Alternatively, the `evaluator` can be a function of any
+    combination of specific argument names that will be bound to special values:
+
+    - `input`: The input field of the dataset example
+    - `output`: The output of the task
+    - `expected`: The expected or reference output of the dataset example
+    - `reference`: An alias for `expected`
+    - `metadata`: Metadata associated with the dataset example
+
+    Phoenix also provides pre-built evaluators in the `phoenix.experiments.evaluators` module.
+
+    Args:
+        dataset (Dataset): The dataset on which to run the experiment.
+        task (ExperimentTask): The task to run on each example in the dataset.
+        evaluators (Optional[Evaluators]): A single evaluator or sequence of evaluators used to
+            evaluate the results of the experiment. Defaults to None.
+        experiment_name (Optional[str]): The name of the experiment. Defaults to None.
+        experiment_description (Optional[str]): A description of the experiment. Defaults to None.
+        experiment_metadata (Optional[Mapping[str, Any]]): Metadata to associate with the
+            experiment. Defaults to None.
+        rate_limit_errors (Optional[BaseException | Sequence[BaseException]]): An exception or
+            sequence of exceptions to adaptively throttle on. Defaults to None.
+        dry_run (bool | int): Run the experiment in dry-run mode. When set, experiment results will
+            not be recorded in Phoenix. If True, the experiment will run on a random dataset
+            example. If an integer, the experiment will run on a random sample of the dataset
+            examples of the given size. Defaults to False.
+        print_summary (bool): Whether to print a summary of the experiment and evaluation results.
+            Defaults to True.
+        concurrency (int): Specifies the concurrency for task execution. In order to enable
+            concurrent task execution, the task callable must be a coroutine function.
+            Defaults to 3.
+
+    Returns:
+        RanExperiment: The results of the experiment and evaluation. Additional evaluations can be
+            added to the experiment using the `evaluate_experiment` function.
+    """
+    task_signature = inspect.signature(task)
+    _validate_task_signature(task_signature)
+
+    if not dataset.examples:
+        raise ValueError(f"Dataset has no examples: {dataset.id=}, {dataset.version_id=}")
+    # Add this to the params once supported in the UI
+    repetitions = 1
+    assert repetitions > 0, "Must run the experiment at least once."
+    evaluators_by_name = _evaluators_by_name(evaluators)
+
+    sync_client, async_client = _phoenix_clients()
+
+    payload = {
+        "version_id": dataset.version_id,
+        "name": experiment_name,
+        "description": experiment_description,
+        "metadata": experiment_metadata,
+        "repetitions": repetitions,
+    }
+    if not dry_run:
+        experiment_response = sync_client.post(
+            f"/v1/datasets/{dataset.id}/experiments",
+            json=payload,
+        )
+        experiment_response.raise_for_status()
+        exp_json = experiment_response.json()["data"]
+        project_name = exp_json["project_name"]
+        experiment = Experiment(
+            dataset_id=dataset.id,
+            dataset_version_id=dataset.version_id,
+            repetitions=repetitions,
+            id=exp_json["id"],
+            project_name=project_name,
+        )
+    else:
+        experiment = Experiment(
+            dataset_id=dataset.id,
+            dataset_version_id=dataset.version_id,
+            repetitions=repetitions,
+            id=DRY_RUN,
+            project_name="",
+        )
+
+    tracer, resource = _get_tracer(experiment.project_name)
+    root_span_name = f"Task: {get_func_name(task)}"
+    root_span_kind = CHAIN
+
+    print("🧪 Experiment started.")
+    if dry_run:
+        examples = {
+            (ex := dataset[i]).id: ex
+            for i in pd.Series(range(len(dataset)))
+            .sample(min(len(dataset), int(dry_run)), random_state=42)
+            .sort_values()
+        }
+        id_selection = "\n".join(examples)
+        print(f"🌵️ This is a dry-run for these example IDs:\n{id_selection}")
+        dataset = replace(dataset, examples=examples)
+    else:
+        dataset_experiments_url = get_dataset_experiments_url(dataset_id=dataset.id)
+        experiment_compare_url = get_experiment_url(
+            dataset_id=dataset.id,
+            experiment_id=experiment.id,
+        )
+        print(f"📺 View dataset experiments: {dataset_experiments_url}")
+        print(f"🔗 View this experiment: {experiment_compare_url}")
+
+    def sync_run_experiment(test_case: TestCase) -> ExperimentRun:
+        example, repetition_number = test_case.example, test_case.repetition_number
+        output = None
+        error: Optional[BaseException] = None
+        status = Status(StatusCode.OK)
+        with ExitStack() as stack:
+            span: Span = stack.enter_context(
+                tracer.start_as_current_span(root_span_name, context=Context())
+            )
+            stack.enter_context(capture_spans(resource))
+            try:
+                # Do not use keyword arguments, which can fail at runtime
+                # even when function obeys protocol, because keyword arguments
+                # are implementation details.
+                bound_task_args = _bind_task_signature(task_signature, example)
+                _output = task(*bound_task_args.args, **bound_task_args.kwargs)
+                if isinstance(_output, Awaitable):
+                    sync_error_message = (
+                        "Task is async and cannot be run within an existing event loop. "
+                        "Consider the following options:\n\n"
+                        "1. Pass in a synchronous task callable.\n"
+                        "2. Use `nest_asyncio.apply()` to allow nesting event loops."
+                    )
+                    raise RuntimeError(sync_error_message)
+                else:
+                    output = _output
+            except BaseException as exc:
+                span.record_exception(exc)
+                status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
+                error = exc
+                _print_experiment_error(
+                    exc,
+                    example_id=example.id,
+                    repetition_number=repetition_number,
+                    kind="task",
+                )
+            output = jsonify(output)
+            span.set_attribute(INPUT_VALUE, json.dumps(example.input, ensure_ascii=False))
+            span.set_attribute(INPUT_MIME_TYPE, JSON.value)
+            if output is not None:
+                if isinstance(output, str):
+                    span.set_attribute(OUTPUT_VALUE, output)
+                else:
+                    span.set_attribute(OUTPUT_VALUE, json.dumps(output, ensure_ascii=False))
+                    span.set_attribute(OUTPUT_MIME_TYPE, JSON.value)
+            span.set_attribute(SpanAttributes.OPENINFERENCE_SPAN_KIND, root_span_kind)
+            span.set_status(status)
+
+        assert isinstance(
+            output, (dict, list, str, int, float, bool, type(None))
+        ), "Output must be JSON serializable"
+        exp_run = ExperimentRun(
+            start_time=_decode_unix_nano(cast(int, span.start_time)),
+            end_time=_decode_unix_nano(cast(int, span.end_time)),
+            experiment_id=experiment.id,
+            dataset_example_id=example.id,
+            repetition_number=repetition_number,
+            output=output,
+            error=repr(error) if error else None,
+            trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
+        )
+        if not dry_run:
+            resp = sync_client.post(f"/v1/experiments/{experiment.id}/runs", json=jsonify(exp_run))
+            resp.raise_for_status()
+            exp_run = replace(exp_run, id=resp.json()["data"]["id"])
+        return exp_run
+
+    async def async_run_experiment(test_case: TestCase) -> ExperimentRun:
+        example, repetition_number = test_case.example, test_case.repetition_number
+        output = None
+        error: Optional[BaseException] = None
+        status = Status(StatusCode.OK)
+        with ExitStack() as stack:
+            span: Span = stack.enter_context(
+                tracer.start_as_current_span(root_span_name, context=Context())
+            )
+            stack.enter_context(capture_spans(resource))
+            try:
+                # Do not use keyword arguments, which can fail at runtime
+                # even when function obeys protocol, because keyword arguments
+                # are implementation details.
+                bound_task_args = _bind_task_signature(task_signature, example)
+                _output = task(*bound_task_args.args, **bound_task_args.kwargs)
+                if isinstance(_output, Awaitable):
+                    output = await _output
+                else:
+                    output = _output
+            except BaseException as exc:
+                span.record_exception(exc)
+                status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
+                error = exc
+                _print_experiment_error(
+                    exc,
+                    example_id=example.id,
+                    repetition_number=repetition_number,
+                    kind="task",
+                )
+            output = jsonify(output)
+            span.set_attribute(INPUT_VALUE, json.dumps(example.input, ensure_ascii=False))
+            span.set_attribute(INPUT_MIME_TYPE, JSON.value)
+            if output is not None:
+                if isinstance(output, str):
+                    span.set_attribute(OUTPUT_VALUE, output)
+                else:
+                    span.set_attribute(OUTPUT_VALUE, json.dumps(output, ensure_ascii=False))
+                    span.set_attribute(OUTPUT_MIME_TYPE, JSON.value)
+            span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
+            span.set_status(status)
+
+        assert isinstance(
+            output, (dict, list, str, int, float, bool, type(None))
+        ), "Output must be JSON serializable"
+        exp_run = ExperimentRun(
+            start_time=_decode_unix_nano(cast(int, span.start_time)),
+            end_time=_decode_unix_nano(cast(int, span.end_time)),
+            experiment_id=experiment.id,
+            dataset_example_id=example.id,
+            repetition_number=repetition_number,
+            output=output,
+            error=repr(error) if error else None,
+            trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
+        )
+        if not dry_run:
+            # Below is a workaround to avoid timeout errors sometimes
+            # encountered when the task is a synchronous function that
+            # blocks for too long.
+            resp = await asyncio.get_running_loop().run_in_executor(
+                None,
+                functools.partial(
+                    sync_client.post,
+                    url=f"/v1/experiments/{experiment.id}/runs",
+                    json=jsonify(exp_run),
+                ),
+            )
+            resp.raise_for_status()
+            exp_run = replace(exp_run, id=resp.json()["data"]["id"])
+        return exp_run
+
+    _errors: tuple[type[BaseException], ...]
+    if not isinstance(rate_limit_errors, Sequence):
+        _errors = (rate_limit_errors,) if rate_limit_errors is not None else ()
+    else:
+        _errors = tuple(filter(None, rate_limit_errors))
+    rate_limiters = [RateLimiter(rate_limit_error=rate_limit_error) for rate_limit_error in _errors]
+
+    rate_limited_sync_run_experiment = functools.reduce(
+        lambda fn, limiter: limiter.limit(fn), rate_limiters, sync_run_experiment
+    )
+    rate_limited_async_run_experiment = functools.reduce(
+        lambda fn, limiter: limiter.alimit(fn), rate_limiters, async_run_experiment
+    )
+
+    executor = get_executor_on_sync_context(
+        rate_limited_sync_run_experiment,
+        rate_limited_async_run_experiment,
+        max_retries=0,
+        exit_on_error=False,
+        fallback_return_value=None,
+        tqdm_bar_format=get_tqdm_progress_bar_formatter("running tasks"),
+        concurrency=concurrency,
+    )
+
+    test_cases = [
+        TestCase(example=deepcopy(ex), repetition_number=rep)
+        for ex, rep in product(dataset.examples.values(), range(1, repetitions + 1))
+    ]
+    task_runs, _execution_details = executor.run(test_cases)
+    print("✅ Task runs completed.")
+    params = ExperimentParameters(n_examples=len(dataset.examples), n_repetitions=repetitions)
+    task_summary = TaskSummary.from_task_runs(params, task_runs)
+    ran_experiment: RanExperiment = object.__new__(RanExperiment)
+    ran_experiment.__init__(  # type: ignore[misc]
+        params=params,
+        dataset=dataset,
+        runs={r.id: r for r in task_runs if r is not None},
+        task_summary=task_summary,
+        **_asdict(experiment),
+    )
+    if evaluators_by_name:
+        return evaluate_experiment(
+            ran_experiment,
+            evaluators=evaluators_by_name,
+            dry_run=dry_run,
+            print_summary=print_summary,
+            rate_limit_errors=rate_limit_errors,
+            concurrency=concurrency,
+        )
+    if print_summary:
+        print(ran_experiment)
+    return ran_experiment
+
+
+def evaluate_experiment(
+    experiment: Experiment,
+    evaluators: Evaluators,
+    *,
+    dry_run: Union[bool, int] = False,
+    print_summary: bool = True,
+    rate_limit_errors: Optional[RateLimitErrors] = None,
+    concurrency: int = 3,
+) -> RanExperiment:
+    if not dry_run and _is_dry_run(experiment):
+        dry_run = True
+    evaluators_by_name = _evaluators_by_name(evaluators)
+    if not evaluators_by_name:
+        raise ValueError("Must specify at least one Evaluator")
+    sync_client, async_client = _phoenix_clients()
+    dataset_id = experiment.dataset_id
+    dataset_version_id = experiment.dataset_version_id
+    if isinstance(experiment, RanExperiment):
+        ran_experiment: RanExperiment = experiment
+    else:
+        dataset = Dataset.from_dict(
+            sync_client.get(
+                f"/v1/datasets/{dataset_id}/examples",
+                params={"version_id": str(dataset_version_id)},
+            ).json()["data"]
+        )
+        if not dataset.examples:
+            raise ValueError(f"Dataset has no examples: {dataset_id=}, {dataset_version_id=}")
+        experiment_runs = {
+            exp_run["id"]: ExperimentRun.from_dict(exp_run)
+            for exp_run in sync_client.get(f"/v1/experiments/{experiment.id}/runs").json()["data"]
+        }
+        if not experiment_runs:
+            raise ValueError("Experiment has not been run")
+        params = ExperimentParameters(n_examples=len(dataset.examples))
+        task_summary = TaskSummary.from_task_runs(params, experiment_runs.values())
+        ran_experiment = object.__new__(RanExperiment)
+        ran_experiment.__init__(  # type: ignore[misc]
+            dataset=dataset,
+            params=params,
+            runs=experiment_runs,
+            task_summary=task_summary,
+            **_asdict(experiment),
+        )
+    print("🧠 Evaluation started.")
+    examples = ran_experiment.dataset.examples
+    if dry_run:
+        if not _is_dry_run(ran_experiment):
+            dataset = ran_experiment.dataset
+            examples = {
+                (ex := dataset[i]).id: ex
+                for i in pd.Series(range(len(dataset)))
+                .sample(min(len(dataset), int(dry_run)), random_state=42)
+                .sort_values()
+            }
+            dataset = replace(ran_experiment.dataset, examples=examples)
+            ran_experiment = _replace(ran_experiment, id=DRY_RUN, dataset=dataset)
+        id_selection = "\n".join(examples)
+        print(f"🌵️ This is a dry-run for these example IDs:\n{id_selection}")
+    # not all dataset examples have associated experiment runs, so we need to pair them up
+    example_run_pairs = []
+    examples = ran_experiment.dataset.examples
+    for exp_run in ran_experiment.runs.values():
+        example = examples.get(exp_run.dataset_example_id)
+        if example:
+            example_run_pairs.append((deepcopy(example), exp_run))
+    evaluation_input = [
+        (example, run, evaluator)
+        for (example, run), evaluator in product(example_run_pairs, evaluators_by_name.values())
+    ]
+
+    tracer, resource = _get_tracer(None if dry_run else "evaluators")
+    root_span_kind = EVALUATOR
+
+    def sync_evaluate_run(
+        obj: tuple[Example, ExperimentRun, Evaluator],
+    ) -> ExperimentEvaluationRun:
+        example, experiment_run, evaluator = obj
+        result: Optional[EvaluationResult] = None
+        error: Optional[BaseException] = None
+        status = Status(StatusCode.OK)
+        root_span_name = f"Evaluation: {evaluator.name}"
+        with ExitStack() as stack:
+            span: Span = stack.enter_context(
+                tracer.start_as_current_span(root_span_name, context=Context())
+            )
+            stack.enter_context(capture_spans(resource))
+            try:
+                result = evaluator.evaluate(
+                    output=deepcopy(experiment_run.output),
+                    expected=example.output,
+                    reference=example.output,
+                    input=example.input,
+                    metadata=example.metadata,
+                )
+            except BaseException as exc:
+                span.record_exception(exc)
+                status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
+                error = exc
+                _print_experiment_error(
+                    exc,
+                    example_id=example.id,
+                    repetition_number=experiment_run.repetition_number,
+                    kind="evaluator",
+                )
+            if result:
+                span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
+            span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
+            span.set_status(status)
+
+        eval_run = ExperimentEvaluationRun(
+            experiment_run_id=experiment_run.id,
+            start_time=_decode_unix_nano(cast(int, span.start_time)),
+            end_time=_decode_unix_nano(cast(int, span.end_time)),
+            name=evaluator.name,
+            annotator_kind=evaluator.kind,
+            error=repr(error) if error else None,
+            result=result,
+            trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
+        )
+        if not dry_run:
+            resp = sync_client.post("/v1/experiment_evaluations", json=jsonify(eval_run))
+            resp.raise_for_status()
+            eval_run = replace(eval_run, id=resp.json()["data"]["id"])
+        return eval_run
+
+    async def async_evaluate_run(
+        obj: tuple[Example, ExperimentRun, Evaluator],
+    ) -> ExperimentEvaluationRun:
+        example, experiment_run, evaluator = obj
+        result: Optional[EvaluationResult] = None
+        error: Optional[BaseException] = None
+        status = Status(StatusCode.OK)
+        root_span_name = f"Evaluation: {evaluator.name}"
+        with ExitStack() as stack:
+            span: Span = stack.enter_context(
+                tracer.start_as_current_span(root_span_name, context=Context())
+            )
+            stack.enter_context(capture_spans(resource))
+            try:
+                result = await evaluator.async_evaluate(
+                    output=deepcopy(experiment_run.output),
+                    expected=example.output,
+                    reference=example.output,
+                    input=example.input,
+                    metadata=example.metadata,
+                )
+            except BaseException as exc:
+                span.record_exception(exc)
+                status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
+                error = exc
+                _print_experiment_error(
+                    exc,
+                    example_id=example.id,
+                    repetition_number=experiment_run.repetition_number,
+                    kind="evaluator",
+                )
+            if result:
+                span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
+            span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
+            span.set_status(status)
+
+        eval_run = ExperimentEvaluationRun(
+            experiment_run_id=experiment_run.id,
+            start_time=_decode_unix_nano(cast(int, span.start_time)),
+            end_time=_decode_unix_nano(cast(int, span.end_time)),
+            name=evaluator.name,
+            annotator_kind=evaluator.kind,
+            error=repr(error) if error else None,
+            result=result,
+            trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
+        )
+        if not dry_run:
+            # Below is a workaround to avoid timeout errors sometimes
+            # encountered when the evaluator is a synchronous function
+            # that blocks for too long.
+            resp = await asyncio.get_running_loop().run_in_executor(
+                None,
+                functools.partial(
+                    sync_client.post,
+                    url="/v1/experiment_evaluations",
+                    json=jsonify(eval_run),
+                ),
+            )
+            resp.raise_for_status()
+            eval_run = replace(eval_run, id=resp.json()["data"]["id"])
+        return eval_run
+
+    _errors: tuple[type[BaseException], ...]
+    if not isinstance(rate_limit_errors, Sequence):
+        _errors = (rate_limit_errors,) if rate_limit_errors is not None else ()
+    else:
+        _errors = tuple(filter(None, rate_limit_errors))
+    rate_limiters = [RateLimiter(rate_limit_error=rate_limit_error) for rate_limit_error in _errors]
+
+    rate_limited_sync_evaluate_run = functools.reduce(
+        lambda fn, limiter: limiter.limit(fn), rate_limiters, sync_evaluate_run
+    )
+    rate_limited_async_evaluate_run = functools.reduce(
+        lambda fn, limiter: limiter.alimit(fn), rate_limiters, async_evaluate_run
+    )
+
+    executor = get_executor_on_sync_context(
+        rate_limited_sync_evaluate_run,
+        rate_limited_async_evaluate_run,
+        max_retries=0,
+        exit_on_error=False,
+        fallback_return_value=None,
+        tqdm_bar_format=get_tqdm_progress_bar_formatter("running experiment evaluations"),
+        concurrency=concurrency,
+    )
+    eval_runs, _execution_details = executor.run(evaluation_input)
+    eval_summary = EvaluationSummary.from_eval_runs(
+        EvaluationParameters(
+            eval_names=frozenset(evaluators_by_name),
+            exp_params=ran_experiment.params,
+        ),
+        *eval_runs,
+    )
+    ran_experiment = ran_experiment.add(eval_summary, *eval_runs)
+    if print_summary:
+        print(ran_experiment)
+    return ran_experiment
+
+
+def _evaluators_by_name(obj: Optional[Evaluators]) -> Mapping[EvaluatorName, Evaluator]:
+    evaluators_by_name: dict[EvaluatorName, Evaluator] = {}
+    if obj is None:
+        return evaluators_by_name
+    if isinstance(mapping := obj, Mapping):
+        for name, value in mapping.items():
+            evaluator = (
+                create_evaluator(name=name)(value) if not isinstance(value, Evaluator) else value
+            )
+            name = evaluator.name
+            if name in evaluators_by_name:
+                raise ValueError(f"Two evaluators have the same name: {name}")
+            evaluators_by_name[name] = evaluator
+    elif isinstance(seq := obj, Sequence):
+        for value in seq:
+            evaluator = create_evaluator()(value) if not isinstance(value, Evaluator) else value
+            name = evaluator.name
+            if name in evaluators_by_name:
+                raise ValueError(f"Two evaluators have the same name: {name}")
+            evaluators_by_name[name] = evaluator
+    else:
+        assert not isinstance(obj, Mapping) and not isinstance(obj, Sequence)
+        evaluator = create_evaluator()(obj) if not isinstance(obj, Evaluator) else obj
+        name = evaluator.name
+        if name in evaluators_by_name:
+            raise ValueError(f"Two evaluators have the same name: {name}")
+        evaluators_by_name[name] = evaluator
+    return evaluators_by_name
+
+
+def _get_tracer(project_name: Optional[str] = None) -> tuple[Tracer, Resource]:
+    resource = Resource({ResourceAttributes.PROJECT_NAME: project_name} if project_name else {})
+    tracer_provider = trace_sdk.TracerProvider(resource=resource)
+    span_processor = (
+        SimpleSpanProcessor(OTLPSpanExporter(urljoin(f"{get_base_url()}", "v1/traces")))
+        if project_name
+        else _NoOpProcessor()
+    )
+    tracer_provider.add_span_processor(span_processor)
+    return tracer_provider.get_tracer(__name__), resource
+
+
+def _str_trace_id(id_: int) -> str:
+    return hexlify(id_.to_bytes(16, "big")).decode()
+
+
+def _decode_unix_nano(time_unix_nano: int) -> datetime:
+    return datetime.fromtimestamp(time_unix_nano / 1e9, tz=timezone.utc)
+
+
+def _is_dry_run(obj: Any) -> bool:
+    return hasattr(obj, "id") and isinstance(obj.id, str) and obj.id.startswith(DRY_RUN)
+
+
+def _validate_task_signature(sig: inspect.Signature) -> None:
+    # Check that the function signature has a valid signature for use as a task
+    # If it does not, raise an error to exit early before running an experiment
+    params = sig.parameters
+    valid_named_params = {"input", "expected", "reference", "metadata", "example"}
+    if len(params) == 0:
+        raise ValueError("Task function must have at least one parameter.")
+    if len(params) > 1:
+        for not_found in set(params) - valid_named_params:
+            param = params[not_found]
+            if (
+                param.kind is inspect.Parameter.VAR_KEYWORD
+                or param.default is not inspect.Parameter.empty
+            ):
+                continue
+            raise ValueError(
+                (
+                    f"Invalid parameter names in task function: {', '.join(not_found)}. "
+                    "Parameters names for multi-argument functions must be "
+                    f"any of: {', '.join(valid_named_params)}."
+                )
+            )
+
+
+def _bind_task_signature(sig: inspect.Signature, example: Example) -> inspect.BoundArguments:
+    parameter_mapping = {
+        "input": example.input,
+        "expected": example.output,
+        "reference": example.output,  # Alias for "expected"
+        "metadata": example.metadata,
+        "example": example,
+    }
+    params = sig.parameters
+    if len(params) == 1:
+        parameter_name = next(iter(params))
+        if parameter_name in parameter_mapping:
+            return sig.bind(parameter_mapping[parameter_name])
+        else:
+            return sig.bind(parameter_mapping["input"])
+    return sig.bind_partial(
+        **{name: parameter_mapping[name] for name in set(parameter_mapping).intersection(params)}
+    )
+
+
+def _print_experiment_error(
+    error: BaseException,
+    /,
+    *,
+    example_id: str,
+    repetition_number: int,
+    kind: Literal["evaluator", "task"],
+) -> None:
+    """
+    Prints an experiment error.
+    """
+    display_error = RuntimeError(
+        f"{kind} failed for example id {repr(example_id)}, " f"repetition {repr(repetition_number)}"
+    )
+    display_error.__cause__ = error
+    formatted_exception = "".join(
+        traceback.format_exception(type(display_error), display_error, display_error.__traceback__)
+    )
+    print("\033[91m" + formatted_exception + "\033[0m")  # prints in red
+
+
+class _NoOpProcessor(trace_sdk.SpanProcessor):
+    def force_flush(self, *_: Any) -> bool:
+        return True
+
+
+INPUT_VALUE = SpanAttributes.INPUT_VALUE
+OUTPUT_VALUE = SpanAttributes.OUTPUT_VALUE
+INPUT_MIME_TYPE = SpanAttributes.INPUT_MIME_TYPE
+OUTPUT_MIME_TYPE = SpanAttributes.OUTPUT_MIME_TYPE
+OPENINFERENCE_SPAN_KIND = SpanAttributes.OPENINFERENCE_SPAN_KIND
+
+CHAIN = OpenInferenceSpanKindValues.CHAIN.value
+EVALUATOR = OpenInferenceSpanKindValues.EVALUATOR.value
+JSON = OpenInferenceMimeTypeValues.JSON