arize-phoenix 3.16.0__py3-none-any.whl → 7.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of arize-phoenix might be problematic. Click here for more details.
- arize_phoenix-7.7.0.dist-info/METADATA +261 -0
- arize_phoenix-7.7.0.dist-info/RECORD +345 -0
- {arize_phoenix-3.16.0.dist-info → arize_phoenix-7.7.0.dist-info}/WHEEL +1 -1
- arize_phoenix-7.7.0.dist-info/entry_points.txt +3 -0
- phoenix/__init__.py +86 -14
- phoenix/auth.py +309 -0
- phoenix/config.py +675 -45
- phoenix/core/model.py +32 -30
- phoenix/core/model_schema.py +102 -109
- phoenix/core/model_schema_adapter.py +48 -45
- phoenix/datetime_utils.py +24 -3
- phoenix/db/README.md +54 -0
- phoenix/db/__init__.py +4 -0
- phoenix/db/alembic.ini +85 -0
- phoenix/db/bulk_inserter.py +294 -0
- phoenix/db/engines.py +208 -0
- phoenix/db/enums.py +20 -0
- phoenix/db/facilitator.py +113 -0
- phoenix/db/helpers.py +159 -0
- phoenix/db/insertion/constants.py +2 -0
- phoenix/db/insertion/dataset.py +227 -0
- phoenix/db/insertion/document_annotation.py +171 -0
- phoenix/db/insertion/evaluation.py +191 -0
- phoenix/db/insertion/helpers.py +98 -0
- phoenix/db/insertion/span.py +193 -0
- phoenix/db/insertion/span_annotation.py +158 -0
- phoenix/db/insertion/trace_annotation.py +158 -0
- phoenix/db/insertion/types.py +256 -0
- phoenix/db/migrate.py +86 -0
- phoenix/db/migrations/data_migration_scripts/populate_project_sessions.py +199 -0
- phoenix/db/migrations/env.py +114 -0
- phoenix/db/migrations/script.py.mako +26 -0
- phoenix/db/migrations/versions/10460e46d750_datasets.py +317 -0
- phoenix/db/migrations/versions/3be8647b87d8_add_token_columns_to_spans_table.py +126 -0
- phoenix/db/migrations/versions/4ded9e43755f_create_project_sessions_table.py +66 -0
- phoenix/db/migrations/versions/cd164e83824f_users_and_tokens.py +157 -0
- phoenix/db/migrations/versions/cf03bd6bae1d_init.py +280 -0
- phoenix/db/models.py +807 -0
- phoenix/exceptions.py +5 -1
- phoenix/experiments/__init__.py +6 -0
- phoenix/experiments/evaluators/__init__.py +29 -0
- phoenix/experiments/evaluators/base.py +158 -0
- phoenix/experiments/evaluators/code_evaluators.py +184 -0
- phoenix/experiments/evaluators/llm_evaluators.py +473 -0
- phoenix/experiments/evaluators/utils.py +236 -0
- phoenix/experiments/functions.py +772 -0
- phoenix/experiments/tracing.py +86 -0
- phoenix/experiments/types.py +726 -0
- phoenix/experiments/utils.py +25 -0
- phoenix/inferences/__init__.py +0 -0
- phoenix/{datasets → inferences}/errors.py +6 -5
- phoenix/{datasets → inferences}/fixtures.py +49 -42
- phoenix/{datasets/dataset.py → inferences/inferences.py} +121 -105
- phoenix/{datasets → inferences}/schema.py +11 -11
- phoenix/{datasets → inferences}/validation.py +13 -14
- phoenix/logging/__init__.py +3 -0
- phoenix/logging/_config.py +90 -0
- phoenix/logging/_filter.py +6 -0
- phoenix/logging/_formatter.py +69 -0
- phoenix/metrics/__init__.py +5 -4
- phoenix/metrics/binning.py +4 -3
- phoenix/metrics/metrics.py +2 -1
- phoenix/metrics/mixins.py +7 -6
- phoenix/metrics/retrieval_metrics.py +2 -1
- phoenix/metrics/timeseries.py +5 -4
- phoenix/metrics/wrappers.py +9 -3
- phoenix/pointcloud/clustering.py +5 -5
- phoenix/pointcloud/pointcloud.py +7 -5
- phoenix/pointcloud/projectors.py +5 -6
- phoenix/pointcloud/umap_parameters.py +53 -52
- phoenix/server/api/README.md +28 -0
- phoenix/server/api/auth.py +44 -0
- phoenix/server/api/context.py +152 -9
- phoenix/server/api/dataloaders/__init__.py +91 -0
- phoenix/server/api/dataloaders/annotation_summaries.py +139 -0
- phoenix/server/api/dataloaders/average_experiment_run_latency.py +54 -0
- phoenix/server/api/dataloaders/cache/__init__.py +3 -0
- phoenix/server/api/dataloaders/cache/two_tier_cache.py +68 -0
- phoenix/server/api/dataloaders/dataset_example_revisions.py +131 -0
- phoenix/server/api/dataloaders/dataset_example_spans.py +38 -0
- phoenix/server/api/dataloaders/document_evaluation_summaries.py +144 -0
- phoenix/server/api/dataloaders/document_evaluations.py +31 -0
- phoenix/server/api/dataloaders/document_retrieval_metrics.py +89 -0
- phoenix/server/api/dataloaders/experiment_annotation_summaries.py +79 -0
- phoenix/server/api/dataloaders/experiment_error_rates.py +58 -0
- phoenix/server/api/dataloaders/experiment_run_annotations.py +36 -0
- phoenix/server/api/dataloaders/experiment_run_counts.py +49 -0
- phoenix/server/api/dataloaders/experiment_sequence_number.py +44 -0
- phoenix/server/api/dataloaders/latency_ms_quantile.py +188 -0
- phoenix/server/api/dataloaders/min_start_or_max_end_times.py +85 -0
- phoenix/server/api/dataloaders/project_by_name.py +31 -0
- phoenix/server/api/dataloaders/record_counts.py +116 -0
- phoenix/server/api/dataloaders/session_io.py +79 -0
- phoenix/server/api/dataloaders/session_num_traces.py +30 -0
- phoenix/server/api/dataloaders/session_num_traces_with_error.py +32 -0
- phoenix/server/api/dataloaders/session_token_usages.py +41 -0
- phoenix/server/api/dataloaders/session_trace_latency_ms_quantile.py +55 -0
- phoenix/server/api/dataloaders/span_annotations.py +26 -0
- phoenix/server/api/dataloaders/span_dataset_examples.py +31 -0
- phoenix/server/api/dataloaders/span_descendants.py +57 -0
- phoenix/server/api/dataloaders/span_projects.py +33 -0
- phoenix/server/api/dataloaders/token_counts.py +124 -0
- phoenix/server/api/dataloaders/trace_by_trace_ids.py +25 -0
- phoenix/server/api/dataloaders/trace_root_spans.py +32 -0
- phoenix/server/api/dataloaders/user_roles.py +30 -0
- phoenix/server/api/dataloaders/users.py +33 -0
- phoenix/server/api/exceptions.py +48 -0
- phoenix/server/api/helpers/__init__.py +12 -0
- phoenix/server/api/helpers/dataset_helpers.py +217 -0
- phoenix/server/api/helpers/experiment_run_filters.py +763 -0
- phoenix/server/api/helpers/playground_clients.py +948 -0
- phoenix/server/api/helpers/playground_registry.py +70 -0
- phoenix/server/api/helpers/playground_spans.py +455 -0
- phoenix/server/api/input_types/AddExamplesToDatasetInput.py +16 -0
- phoenix/server/api/input_types/AddSpansToDatasetInput.py +14 -0
- phoenix/server/api/input_types/ChatCompletionInput.py +38 -0
- phoenix/server/api/input_types/ChatCompletionMessageInput.py +24 -0
- phoenix/server/api/input_types/ClearProjectInput.py +15 -0
- phoenix/server/api/input_types/ClusterInput.py +2 -2
- phoenix/server/api/input_types/CreateDatasetInput.py +12 -0
- phoenix/server/api/input_types/CreateSpanAnnotationInput.py +18 -0
- phoenix/server/api/input_types/CreateTraceAnnotationInput.py +18 -0
- phoenix/server/api/input_types/DataQualityMetricInput.py +5 -2
- phoenix/server/api/input_types/DatasetExampleInput.py +14 -0
- phoenix/server/api/input_types/DatasetSort.py +17 -0
- phoenix/server/api/input_types/DatasetVersionSort.py +16 -0
- phoenix/server/api/input_types/DeleteAnnotationsInput.py +7 -0
- phoenix/server/api/input_types/DeleteDatasetExamplesInput.py +13 -0
- phoenix/server/api/input_types/DeleteDatasetInput.py +7 -0
- phoenix/server/api/input_types/DeleteExperimentsInput.py +7 -0
- phoenix/server/api/input_types/DimensionFilter.py +4 -4
- phoenix/server/api/input_types/GenerativeModelInput.py +17 -0
- phoenix/server/api/input_types/Granularity.py +1 -1
- phoenix/server/api/input_types/InvocationParameters.py +162 -0
- phoenix/server/api/input_types/PatchAnnotationInput.py +19 -0
- phoenix/server/api/input_types/PatchDatasetExamplesInput.py +35 -0
- phoenix/server/api/input_types/PatchDatasetInput.py +14 -0
- phoenix/server/api/input_types/PerformanceMetricInput.py +5 -2
- phoenix/server/api/input_types/ProjectSessionSort.py +29 -0
- phoenix/server/api/input_types/SpanAnnotationSort.py +17 -0
- phoenix/server/api/input_types/SpanSort.py +134 -69
- phoenix/server/api/input_types/TemplateOptions.py +10 -0
- phoenix/server/api/input_types/TraceAnnotationSort.py +17 -0
- phoenix/server/api/input_types/UserRoleInput.py +9 -0
- phoenix/server/api/mutations/__init__.py +28 -0
- phoenix/server/api/mutations/api_key_mutations.py +167 -0
- phoenix/server/api/mutations/chat_mutations.py +593 -0
- phoenix/server/api/mutations/dataset_mutations.py +591 -0
- phoenix/server/api/mutations/experiment_mutations.py +75 -0
- phoenix/server/api/{types/ExportEventsMutation.py → mutations/export_events_mutations.py} +21 -18
- phoenix/server/api/mutations/project_mutations.py +57 -0
- phoenix/server/api/mutations/span_annotations_mutations.py +128 -0
- phoenix/server/api/mutations/trace_annotations_mutations.py +127 -0
- phoenix/server/api/mutations/user_mutations.py +329 -0
- phoenix/server/api/openapi/__init__.py +0 -0
- phoenix/server/api/openapi/main.py +17 -0
- phoenix/server/api/openapi/schema.py +16 -0
- phoenix/server/api/queries.py +738 -0
- phoenix/server/api/routers/__init__.py +11 -0
- phoenix/server/api/routers/auth.py +284 -0
- phoenix/server/api/routers/embeddings.py +26 -0
- phoenix/server/api/routers/oauth2.py +488 -0
- phoenix/server/api/routers/v1/__init__.py +64 -0
- phoenix/server/api/routers/v1/datasets.py +1017 -0
- phoenix/server/api/routers/v1/evaluations.py +362 -0
- phoenix/server/api/routers/v1/experiment_evaluations.py +115 -0
- phoenix/server/api/routers/v1/experiment_runs.py +167 -0
- phoenix/server/api/routers/v1/experiments.py +308 -0
- phoenix/server/api/routers/v1/pydantic_compat.py +78 -0
- phoenix/server/api/routers/v1/spans.py +267 -0
- phoenix/server/api/routers/v1/traces.py +208 -0
- phoenix/server/api/routers/v1/utils.py +95 -0
- phoenix/server/api/schema.py +44 -247
- phoenix/server/api/subscriptions.py +597 -0
- phoenix/server/api/types/Annotation.py +21 -0
- phoenix/server/api/types/AnnotationSummary.py +55 -0
- phoenix/server/api/types/AnnotatorKind.py +16 -0
- phoenix/server/api/types/ApiKey.py +27 -0
- phoenix/server/api/types/AuthMethod.py +9 -0
- phoenix/server/api/types/ChatCompletionMessageRole.py +11 -0
- phoenix/server/api/types/ChatCompletionSubscriptionPayload.py +46 -0
- phoenix/server/api/types/Cluster.py +25 -24
- phoenix/server/api/types/CreateDatasetPayload.py +8 -0
- phoenix/server/api/types/DataQualityMetric.py +31 -13
- phoenix/server/api/types/Dataset.py +288 -63
- phoenix/server/api/types/DatasetExample.py +85 -0
- phoenix/server/api/types/DatasetExampleRevision.py +34 -0
- phoenix/server/api/types/DatasetVersion.py +14 -0
- phoenix/server/api/types/Dimension.py +32 -31
- phoenix/server/api/types/DocumentEvaluationSummary.py +9 -8
- phoenix/server/api/types/EmbeddingDimension.py +56 -49
- phoenix/server/api/types/Evaluation.py +25 -31
- phoenix/server/api/types/EvaluationSummary.py +30 -50
- phoenix/server/api/types/Event.py +20 -20
- phoenix/server/api/types/ExampleRevisionInterface.py +14 -0
- phoenix/server/api/types/Experiment.py +152 -0
- phoenix/server/api/types/ExperimentAnnotationSummary.py +13 -0
- phoenix/server/api/types/ExperimentComparison.py +17 -0
- phoenix/server/api/types/ExperimentRun.py +119 -0
- phoenix/server/api/types/ExperimentRunAnnotation.py +56 -0
- phoenix/server/api/types/GenerativeModel.py +9 -0
- phoenix/server/api/types/GenerativeProvider.py +85 -0
- phoenix/server/api/types/Inferences.py +80 -0
- phoenix/server/api/types/InferencesRole.py +23 -0
- phoenix/server/api/types/LabelFraction.py +7 -0
- phoenix/server/api/types/MimeType.py +2 -2
- phoenix/server/api/types/Model.py +54 -54
- phoenix/server/api/types/PerformanceMetric.py +8 -5
- phoenix/server/api/types/Project.py +407 -142
- phoenix/server/api/types/ProjectSession.py +139 -0
- phoenix/server/api/types/Segments.py +4 -4
- phoenix/server/api/types/Span.py +221 -176
- phoenix/server/api/types/SpanAnnotation.py +43 -0
- phoenix/server/api/types/SpanIOValue.py +15 -0
- phoenix/server/api/types/SystemApiKey.py +9 -0
- phoenix/server/api/types/TemplateLanguage.py +10 -0
- phoenix/server/api/types/TimeSeries.py +19 -15
- phoenix/server/api/types/TokenUsage.py +11 -0
- phoenix/server/api/types/Trace.py +154 -0
- phoenix/server/api/types/TraceAnnotation.py +45 -0
- phoenix/server/api/types/UMAPPoints.py +7 -7
- phoenix/server/api/types/User.py +60 -0
- phoenix/server/api/types/UserApiKey.py +45 -0
- phoenix/server/api/types/UserRole.py +15 -0
- phoenix/server/api/types/node.py +13 -107
- phoenix/server/api/types/pagination.py +156 -57
- phoenix/server/api/utils.py +34 -0
- phoenix/server/app.py +864 -115
- phoenix/server/bearer_auth.py +163 -0
- phoenix/server/dml_event.py +136 -0
- phoenix/server/dml_event_handler.py +256 -0
- phoenix/server/email/__init__.py +0 -0
- phoenix/server/email/sender.py +97 -0
- phoenix/server/email/templates/__init__.py +0 -0
- phoenix/server/email/templates/password_reset.html +19 -0
- phoenix/server/email/types.py +11 -0
- phoenix/server/grpc_server.py +102 -0
- phoenix/server/jwt_store.py +505 -0
- phoenix/server/main.py +305 -116
- phoenix/server/oauth2.py +52 -0
- phoenix/server/openapi/__init__.py +0 -0
- phoenix/server/prometheus.py +111 -0
- phoenix/server/rate_limiters.py +188 -0
- phoenix/server/static/.vite/manifest.json +87 -0
- phoenix/server/static/assets/components-Cy9nwIvF.js +2125 -0
- phoenix/server/static/assets/index-BKvHIxkk.js +113 -0
- phoenix/server/static/assets/pages-CUi2xCVQ.js +4449 -0
- phoenix/server/static/assets/vendor-DvC8cT4X.js +894 -0
- phoenix/server/static/assets/vendor-DxkFTwjz.css +1 -0
- phoenix/server/static/assets/vendor-arizeai-Do1793cv.js +662 -0
- phoenix/server/static/assets/vendor-codemirror-BzwZPyJM.js +24 -0
- phoenix/server/static/assets/vendor-recharts-_Jb7JjhG.js +59 -0
- phoenix/server/static/assets/vendor-shiki-Cl9QBraO.js +5 -0
- phoenix/server/static/assets/vendor-three-DwGkEfCM.js +2998 -0
- phoenix/server/telemetry.py +68 -0
- phoenix/server/templates/index.html +82 -23
- phoenix/server/thread_server.py +3 -3
- phoenix/server/types.py +275 -0
- phoenix/services.py +27 -18
- phoenix/session/client.py +743 -68
- phoenix/session/data_extractor.py +31 -7
- phoenix/session/evaluation.py +3 -9
- phoenix/session/session.py +263 -219
- phoenix/settings.py +22 -0
- phoenix/trace/__init__.py +2 -22
- phoenix/trace/attributes.py +338 -0
- phoenix/trace/dsl/README.md +116 -0
- phoenix/trace/dsl/filter.py +663 -213
- phoenix/trace/dsl/helpers.py +73 -21
- phoenix/trace/dsl/query.py +574 -201
- phoenix/trace/exporter.py +24 -19
- phoenix/trace/fixtures.py +368 -32
- phoenix/trace/otel.py +71 -219
- phoenix/trace/projects.py +3 -2
- phoenix/trace/schemas.py +33 -11
- phoenix/trace/span_evaluations.py +21 -16
- phoenix/trace/span_json_decoder.py +6 -4
- phoenix/trace/span_json_encoder.py +2 -2
- phoenix/trace/trace_dataset.py +47 -32
- phoenix/trace/utils.py +21 -4
- phoenix/utilities/__init__.py +0 -26
- phoenix/utilities/client.py +132 -0
- phoenix/utilities/deprecation.py +31 -0
- phoenix/utilities/error_handling.py +3 -2
- phoenix/utilities/json.py +109 -0
- phoenix/utilities/logging.py +8 -0
- phoenix/utilities/project.py +2 -2
- phoenix/utilities/re.py +49 -0
- phoenix/utilities/span_store.py +0 -23
- phoenix/utilities/template_formatters.py +99 -0
- phoenix/version.py +1 -1
- arize_phoenix-3.16.0.dist-info/METADATA +0 -495
- arize_phoenix-3.16.0.dist-info/RECORD +0 -178
- phoenix/core/project.py +0 -617
- phoenix/core/traces.py +0 -100
- phoenix/experimental/evals/__init__.py +0 -73
- phoenix/experimental/evals/evaluators.py +0 -413
- phoenix/experimental/evals/functions/__init__.py +0 -4
- phoenix/experimental/evals/functions/classify.py +0 -453
- phoenix/experimental/evals/functions/executor.py +0 -353
- phoenix/experimental/evals/functions/generate.py +0 -138
- phoenix/experimental/evals/functions/processing.py +0 -76
- phoenix/experimental/evals/models/__init__.py +0 -14
- phoenix/experimental/evals/models/anthropic.py +0 -175
- phoenix/experimental/evals/models/base.py +0 -170
- phoenix/experimental/evals/models/bedrock.py +0 -221
- phoenix/experimental/evals/models/litellm.py +0 -134
- phoenix/experimental/evals/models/openai.py +0 -448
- phoenix/experimental/evals/models/rate_limiters.py +0 -246
- phoenix/experimental/evals/models/vertex.py +0 -173
- phoenix/experimental/evals/models/vertexai.py +0 -186
- phoenix/experimental/evals/retrievals.py +0 -96
- phoenix/experimental/evals/templates/__init__.py +0 -50
- phoenix/experimental/evals/templates/default_templates.py +0 -472
- phoenix/experimental/evals/templates/template.py +0 -195
- phoenix/experimental/evals/utils/__init__.py +0 -172
- phoenix/experimental/evals/utils/threads.py +0 -27
- phoenix/server/api/helpers.py +0 -11
- phoenix/server/api/routers/evaluation_handler.py +0 -109
- phoenix/server/api/routers/span_handler.py +0 -70
- phoenix/server/api/routers/trace_handler.py +0 -60
- phoenix/server/api/types/DatasetRole.py +0 -23
- phoenix/server/static/index.css +0 -6
- phoenix/server/static/index.js +0 -7447
- phoenix/storage/span_store/__init__.py +0 -23
- phoenix/storage/span_store/text_file.py +0 -85
- phoenix/trace/dsl/missing.py +0 -60
- phoenix/trace/langchain/__init__.py +0 -3
- phoenix/trace/langchain/instrumentor.py +0 -35
- phoenix/trace/llama_index/__init__.py +0 -3
- phoenix/trace/llama_index/callback.py +0 -102
- phoenix/trace/openai/__init__.py +0 -3
- phoenix/trace/openai/instrumentor.py +0 -30
- {arize_phoenix-3.16.0.dist-info → arize_phoenix-7.7.0.dist-info}/licenses/IP_NOTICE +0 -0
- {arize_phoenix-3.16.0.dist-info → arize_phoenix-7.7.0.dist-info}/licenses/LICENSE +0 -0
- /phoenix/{datasets → db/insertion}/__init__.py +0 -0
- /phoenix/{experimental → db/migrations}/__init__.py +0 -0
- /phoenix/{storage → db/migrations/data_migration_scripts}/__init__.py +0 -0
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
import functools
|
|
2
|
+
import inspect
|
|
3
|
+
from collections.abc import Callable
|
|
4
|
+
from typing import TYPE_CHECKING, Any, Optional, Union
|
|
5
|
+
|
|
6
|
+
from phoenix.experiments.types import (
|
|
7
|
+
AnnotatorKind,
|
|
8
|
+
EvaluationResult,
|
|
9
|
+
JSONSerializable,
|
|
10
|
+
)
|
|
11
|
+
from phoenix.experiments.utils import get_func_name
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from phoenix.experiments.evaluators.base import Evaluator
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def unwrap_json(obj: JSONSerializable) -> JSONSerializable:
    """
    Unwrap a dictionary that holds exactly one entry.

    Args:
        obj: Any JSON-serializable value.

    Returns:
        The sole value when ``obj`` is a dict with exactly one key; otherwise
        ``obj`` itself, unchanged.

    Raises:
        AssertionError: If the sole value of a one-entry dict is not a dict, list,
            str, int, float, bool or None.
    """
    # Anything that is not a one-entry dict passes straight through.
    if not isinstance(obj, dict) or len(obj) != 1:
        return obj
    sole_key = next(iter(obj.keys()))
    sole_value = obj[sole_key]
    assert isinstance(
        sole_value, (dict, list, str, int, float, bool, type(None))
    ), "Output must be JSON serializable"
    return sole_value
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def validate_evaluator_signature(sig: inspect.Signature) -> None:
    """
    Check that a function signature is usable as an evaluator.

    The check is meant to fail early, before any evaluations are run.

    Rules enforced:
        - The signature must have at least one parameter.
        - A signature with exactly one parameter is accepted whatever the
          parameter is called.
        - With two or more parameters, every parameter must be named one of
          ``input``, ``output``, ``expected``, ``reference`` or ``metadata``,
          unless it is a ``**kwargs`` catch-all or has a default value.

    Args:
        sig: The signature of the function being wrapped as an evaluator.

    Raises:
        ValueError: If the signature has no parameters, or if a multi-parameter
            signature contains required parameters with unsupported names. The
            message lists every offending name.
    """
    params = sig.parameters
    valid_named_params = {"input", "output", "expected", "reference", "metadata"}
    if len(params) == 0:
        raise ValueError("Evaluation function must have at least one parameter.")
    if len(params) == 1:
        # Names are only checked for multi-parameter functions.
        return

    # Collect every offender (in signature order) so the user can fix them all at once.
    invalid_names = [
        name
        for name, param in params.items()
        if name not in valid_named_params
        and param.kind is not inspect.Parameter.VAR_KEYWORD
        and param.default is inspect.Parameter.empty
    ]
    if invalid_names:
        raise ValueError(
            (
                # Join the list of names; joining a single name would split it
                # into individual characters.
                f"Invalid parameter names in evaluation function: {', '.join(invalid_names)}. "
                "Parameters names for multi-argument functions must be "
                # Sorted so the message is deterministic across runs.
                f"any of: {', '.join(sorted(valid_named_params))}."
            )
        )
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _bind_evaluator_signature(sig: inspect.Signature, **kwargs: Any) -> inspect.BoundArguments:
|
|
54
|
+
parameter_mapping = {
|
|
55
|
+
"input": kwargs.get("input"),
|
|
56
|
+
"output": kwargs.get("output"),
|
|
57
|
+
"expected": kwargs.get("expected"),
|
|
58
|
+
"reference": kwargs.get("reference"), # `reference` is an alias for `expected`
|
|
59
|
+
"metadata": kwargs.get("metadata"),
|
|
60
|
+
}
|
|
61
|
+
params = sig.parameters
|
|
62
|
+
if len(params) == 1:
|
|
63
|
+
parameter_name = next(iter(params))
|
|
64
|
+
if parameter_name in parameter_mapping:
|
|
65
|
+
return sig.bind(parameter_mapping[parameter_name])
|
|
66
|
+
else:
|
|
67
|
+
return sig.bind(parameter_mapping["output"])
|
|
68
|
+
return sig.bind_partial(
|
|
69
|
+
**{name: parameter_mapping[name] for name in set(parameter_mapping).intersection(params)}
|
|
70
|
+
)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def create_evaluator(
    kind: Union[str, AnnotatorKind] = AnnotatorKind.CODE,
    name: Optional[str] = None,
    scorer: Optional[Callable[[Any], EvaluationResult]] = None,
) -> Callable[[Callable[..., Any]], "Evaluator"]:
    """
    A decorator that configures a sync or async function to be used as an experiment evaluator.

    If the `evaluator` is a function of one argument then that argument will be
    bound to the `output` of an experiment task. Alternatively, the `evaluator` can be a function
    of any combination of specific argument names that will be bound to special values:
        `input`: The input field of the dataset example
        `output`: The output of an experiment task
        `expected`: The expected or reference output of the dataset example
        `reference`: An alias for `expected`
        `metadata`: Metadata associated with the dataset example

    Args:
        kind (str | AnnotatorKind): Broadly indicates how the evaluator scores an experiment run.
            Valid kinds are: "CODE", "LLM". Defaults to "CODE".
        name (str, optional): The name of the evaluator. If not provided, the name of the function
            will be used.
        scorer (callable, optional): An optional function that converts the output of the wrapped
            function into an `EvaluationResult`. This allows configuring the evaluation
            payload by setting a label, score and explanation. By default, numeric outputs will
            be recorded as scores, boolean outputs will be recorded as scores and labels, and
            string outputs will be recorded as labels. If the output is a 2-tuple, the first item
            will be recorded as the score and the second item will be recorded as the explanation.

    Returns:
        A decorator that takes the evaluation function and returns an `Evaluator`.

    Raises:
        ValueError: Raised by the returned decorator (via signature validation) when the
            decorated function's parameters cannot be used for an evaluator.

    Examples:
        Configuring an evaluator that returns a boolean

        .. code-block:: python
            @create_evaluator(kind="CODE", name="exact-match")
            def match(output: str, expected: str) -> bool:
                return output == expected

        Configuring an evaluator that returns a label

        .. code-block:: python
            client = openai.Client()

            @create_evaluator(kind="LLM")
            def label(output: str) -> str:
                res = client.chat.completions.create(
                    model = "gpt-4",
                    messages = [
                        {
                            "role": "user",
                            "content": (
                                "in one word, characterize the sentiment of the following customer "
                                f"request: {output}"
                            )
                        },
                    ],
                )
                label = res.choices[0].message.content
                return label

        Configuring an evaluator that returns a score and explanation

        .. code-block:: python
            from textdistance import levenshtein

            @create_evaluator(kind="CODE", name="levenshtein-distance")
            def ld(output: str, expected: str) -> tuple[float, str]:
                return (
                    levenshtein(output, expected),
                    f"Levenshtein distance between {output} and {expected}"
                )
    """
    convert_to_score = _default_eval_scorer if scorer is None else scorer
    annotator_kind = AnnotatorKind(kind.upper()) if isinstance(kind, str) else kind

    def wrapper(func: Callable[..., Any]) -> "Evaluator":
        # Resolve the name per decorated function. Writing it back to the enclosing
        # `name` would make a reused decorator object label every later function
        # with the name of the first function it decorated.
        evaluator_name = name or get_func_name(func)
        assert evaluator_name is not None

        wrapped_signature = inspect.signature(func)
        validate_evaluator_signature(wrapped_signature)

        if inspect.iscoroutinefunction(func):
            return _wrap_coroutine_evaluation_function(
                evaluator_name, annotator_kind, wrapped_signature, convert_to_score
            )(func)
        return _wrap_sync_evaluation_function(
            evaluator_name, annotator_kind, wrapped_signature, convert_to_score
        )(func)

    return wrapper
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def _wrap_coroutine_evaluation_function(
    name: str,
    annotator_kind: AnnotatorKind,
    sig: inspect.Signature,
    convert_to_score: Callable[[Any], EvaluationResult],
) -> Callable[[Callable[..., Any]], "Evaluator"]:
    """
    Build a decorator that turns a coroutine function into an ``Evaluator`` instance.

    Args:
        name: Stored on the evaluator as ``_name``.
        annotator_kind: Stored on the evaluator as ``_kind``.
        sig: Signature of the function to be wrapped; used to bind evaluation values.
        convert_to_score: Converts the awaited result of the wrapped function into an
            ``EvaluationResult``.

    Returns:
        A decorator accepting the coroutine function and returning the evaluator.
    """
    # Imported at call time rather than at module import time.
    from phoenix.experiments.evaluators.base import Evaluator

    def wrapper(func: Callable[..., Any]) -> "Evaluator":
        class AsyncEvaluator(Evaluator):
            def __init__(self) -> None:
                self._name = name
                self._kind = annotator_kind

            # Calling the evaluator directly forwards to the wrapped coroutine as-is.
            @functools.wraps(func)
            async def __call__(self, *args: Any, **kwargs: Any) -> Any:
                return await func(*args, **kwargs)

            # Evaluation path: bind the values by parameter name, await, then score.
            async def async_evaluate(self, **kwargs: Any) -> EvaluationResult:
                bound = _bind_evaluator_signature(sig, **kwargs)
                positional, keywords = bound.args, bound.kwargs
                raw_result = await func(*positional, **keywords)
                return convert_to_score(raw_result)

        return AsyncEvaluator()

    return wrapper
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def _wrap_sync_evaluation_function(
    name: str,
    annotator_kind: AnnotatorKind,
    sig: inspect.Signature,
    convert_to_score: Callable[[Any], EvaluationResult],
) -> Callable[[Callable[..., Any]], "Evaluator"]:
    """
    Build a decorator that turns a regular (non-coroutine) function into an ``Evaluator``.

    Args:
        name: Stored on the evaluator as ``_name``.
        annotator_kind: Stored on the evaluator as ``_kind``.
        sig: Signature of the function to be wrapped; used to bind evaluation values.
        convert_to_score: Converts the return value of the wrapped function into an
            ``EvaluationResult``.

    Returns:
        A decorator accepting the function and returning the evaluator.
    """
    # Imported at call time rather than at module import time.
    from phoenix.experiments.evaluators.base import Evaluator

    def wrapper(func: Callable[..., Any]) -> "Evaluator":
        class SyncEvaluator(Evaluator):
            def __init__(self) -> None:
                self._name = name
                self._kind = annotator_kind

            # Calling the evaluator directly forwards to the wrapped function as-is.
            @functools.wraps(func)
            def __call__(self, *args: Any, **kwargs: Any) -> Any:
                return func(*args, **kwargs)

            # Evaluation path: bind the values by parameter name, call, then score.
            def evaluate(self, **kwargs: Any) -> EvaluationResult:
                bound = _bind_evaluator_signature(sig, **kwargs)
                positional, keywords = bound.args, bound.kwargs
                raw_result = func(*positional, **keywords)
                return convert_to_score(raw_result)

        return SyncEvaluator()

    return wrapper
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def _default_eval_scorer(result: Any) -> EvaluationResult:
    """
    Convert the raw return value of an evaluation function into an ``EvaluationResult``.

    Conversions, tried in this order:
        - ``EvaluationResult``: returned unchanged.
        - ``bool``: recorded as both a score (``float(result)``) and a label (``str(result)``).
        - anything exposing ``__float__``: recorded as a score.
        - ``str``: recorded as a label.
        - tuple or list of length 2: first item is the score, second the explanation.

    Args:
        result: The value returned by the wrapped evaluation function.

    Returns:
        The corresponding ``EvaluationResult``.

    Raises:
        ValueError: If ``result`` matches none of the supported shapes.
    """
    if isinstance(result, EvaluationResult):
        evaluation = result
    elif isinstance(result, bool):
        # Checked before the generic numeric branch so booleans also receive a label.
        evaluation = EvaluationResult(score=float(result), label=str(result))
    elif hasattr(result, "__float__"):
        evaluation = EvaluationResult(score=float(result))
    elif isinstance(result, str):
        evaluation = EvaluationResult(label=result)
    elif isinstance(result, (tuple, list)) and len(result) == 2:
        # Pair convention: (score, explanation).
        raw_score, raw_explanation = result
        evaluation = EvaluationResult(score=float(raw_score), explanation=str(raw_explanation))
    else:
        raise ValueError(f"Unsupported evaluation result type: {type(result)}")
    return evaluation
|