arize-phoenix 3.16.0__py3-none-any.whl → 7.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of arize-phoenix might be problematic.
- arize_phoenix-7.7.0.dist-info/METADATA +261 -0
- arize_phoenix-7.7.0.dist-info/RECORD +345 -0
- {arize_phoenix-3.16.0.dist-info → arize_phoenix-7.7.0.dist-info}/WHEEL +1 -1
- arize_phoenix-7.7.0.dist-info/entry_points.txt +3 -0
- phoenix/__init__.py +86 -14
- phoenix/auth.py +309 -0
- phoenix/config.py +675 -45
- phoenix/core/model.py +32 -30
- phoenix/core/model_schema.py +102 -109
- phoenix/core/model_schema_adapter.py +48 -45
- phoenix/datetime_utils.py +24 -3
- phoenix/db/README.md +54 -0
- phoenix/db/__init__.py +4 -0
- phoenix/db/alembic.ini +85 -0
- phoenix/db/bulk_inserter.py +294 -0
- phoenix/db/engines.py +208 -0
- phoenix/db/enums.py +20 -0
- phoenix/db/facilitator.py +113 -0
- phoenix/db/helpers.py +159 -0
- phoenix/db/insertion/constants.py +2 -0
- phoenix/db/insertion/dataset.py +227 -0
- phoenix/db/insertion/document_annotation.py +171 -0
- phoenix/db/insertion/evaluation.py +191 -0
- phoenix/db/insertion/helpers.py +98 -0
- phoenix/db/insertion/span.py +193 -0
- phoenix/db/insertion/span_annotation.py +158 -0
- phoenix/db/insertion/trace_annotation.py +158 -0
- phoenix/db/insertion/types.py +256 -0
- phoenix/db/migrate.py +86 -0
- phoenix/db/migrations/data_migration_scripts/populate_project_sessions.py +199 -0
- phoenix/db/migrations/env.py +114 -0
- phoenix/db/migrations/script.py.mako +26 -0
- phoenix/db/migrations/versions/10460e46d750_datasets.py +317 -0
- phoenix/db/migrations/versions/3be8647b87d8_add_token_columns_to_spans_table.py +126 -0
- phoenix/db/migrations/versions/4ded9e43755f_create_project_sessions_table.py +66 -0
- phoenix/db/migrations/versions/cd164e83824f_users_and_tokens.py +157 -0
- phoenix/db/migrations/versions/cf03bd6bae1d_init.py +280 -0
- phoenix/db/models.py +807 -0
- phoenix/exceptions.py +5 -1
- phoenix/experiments/__init__.py +6 -0
- phoenix/experiments/evaluators/__init__.py +29 -0
- phoenix/experiments/evaluators/base.py +158 -0
- phoenix/experiments/evaluators/code_evaluators.py +184 -0
- phoenix/experiments/evaluators/llm_evaluators.py +473 -0
- phoenix/experiments/evaluators/utils.py +236 -0
- phoenix/experiments/functions.py +772 -0
- phoenix/experiments/tracing.py +86 -0
- phoenix/experiments/types.py +726 -0
- phoenix/experiments/utils.py +25 -0
- phoenix/inferences/__init__.py +0 -0
- phoenix/{datasets → inferences}/errors.py +6 -5
- phoenix/{datasets → inferences}/fixtures.py +49 -42
- phoenix/{datasets/dataset.py → inferences/inferences.py} +121 -105
- phoenix/{datasets → inferences}/schema.py +11 -11
- phoenix/{datasets → inferences}/validation.py +13 -14
- phoenix/logging/__init__.py +3 -0
- phoenix/logging/_config.py +90 -0
- phoenix/logging/_filter.py +6 -0
- phoenix/logging/_formatter.py +69 -0
- phoenix/metrics/__init__.py +5 -4
- phoenix/metrics/binning.py +4 -3
- phoenix/metrics/metrics.py +2 -1
- phoenix/metrics/mixins.py +7 -6
- phoenix/metrics/retrieval_metrics.py +2 -1
- phoenix/metrics/timeseries.py +5 -4
- phoenix/metrics/wrappers.py +9 -3
- phoenix/pointcloud/clustering.py +5 -5
- phoenix/pointcloud/pointcloud.py +7 -5
- phoenix/pointcloud/projectors.py +5 -6
- phoenix/pointcloud/umap_parameters.py +53 -52
- phoenix/server/api/README.md +28 -0
- phoenix/server/api/auth.py +44 -0
- phoenix/server/api/context.py +152 -9
- phoenix/server/api/dataloaders/__init__.py +91 -0
- phoenix/server/api/dataloaders/annotation_summaries.py +139 -0
- phoenix/server/api/dataloaders/average_experiment_run_latency.py +54 -0
- phoenix/server/api/dataloaders/cache/__init__.py +3 -0
- phoenix/server/api/dataloaders/cache/two_tier_cache.py +68 -0
- phoenix/server/api/dataloaders/dataset_example_revisions.py +131 -0
- phoenix/server/api/dataloaders/dataset_example_spans.py +38 -0
- phoenix/server/api/dataloaders/document_evaluation_summaries.py +144 -0
- phoenix/server/api/dataloaders/document_evaluations.py +31 -0
- phoenix/server/api/dataloaders/document_retrieval_metrics.py +89 -0
- phoenix/server/api/dataloaders/experiment_annotation_summaries.py +79 -0
- phoenix/server/api/dataloaders/experiment_error_rates.py +58 -0
- phoenix/server/api/dataloaders/experiment_run_annotations.py +36 -0
- phoenix/server/api/dataloaders/experiment_run_counts.py +49 -0
- phoenix/server/api/dataloaders/experiment_sequence_number.py +44 -0
- phoenix/server/api/dataloaders/latency_ms_quantile.py +188 -0
- phoenix/server/api/dataloaders/min_start_or_max_end_times.py +85 -0
- phoenix/server/api/dataloaders/project_by_name.py +31 -0
- phoenix/server/api/dataloaders/record_counts.py +116 -0
- phoenix/server/api/dataloaders/session_io.py +79 -0
- phoenix/server/api/dataloaders/session_num_traces.py +30 -0
- phoenix/server/api/dataloaders/session_num_traces_with_error.py +32 -0
- phoenix/server/api/dataloaders/session_token_usages.py +41 -0
- phoenix/server/api/dataloaders/session_trace_latency_ms_quantile.py +55 -0
- phoenix/server/api/dataloaders/span_annotations.py +26 -0
- phoenix/server/api/dataloaders/span_dataset_examples.py +31 -0
- phoenix/server/api/dataloaders/span_descendants.py +57 -0
- phoenix/server/api/dataloaders/span_projects.py +33 -0
- phoenix/server/api/dataloaders/token_counts.py +124 -0
- phoenix/server/api/dataloaders/trace_by_trace_ids.py +25 -0
- phoenix/server/api/dataloaders/trace_root_spans.py +32 -0
- phoenix/server/api/dataloaders/user_roles.py +30 -0
- phoenix/server/api/dataloaders/users.py +33 -0
- phoenix/server/api/exceptions.py +48 -0
- phoenix/server/api/helpers/__init__.py +12 -0
- phoenix/server/api/helpers/dataset_helpers.py +217 -0
- phoenix/server/api/helpers/experiment_run_filters.py +763 -0
- phoenix/server/api/helpers/playground_clients.py +948 -0
- phoenix/server/api/helpers/playground_registry.py +70 -0
- phoenix/server/api/helpers/playground_spans.py +455 -0
- phoenix/server/api/input_types/AddExamplesToDatasetInput.py +16 -0
- phoenix/server/api/input_types/AddSpansToDatasetInput.py +14 -0
- phoenix/server/api/input_types/ChatCompletionInput.py +38 -0
- phoenix/server/api/input_types/ChatCompletionMessageInput.py +24 -0
- phoenix/server/api/input_types/ClearProjectInput.py +15 -0
- phoenix/server/api/input_types/ClusterInput.py +2 -2
- phoenix/server/api/input_types/CreateDatasetInput.py +12 -0
- phoenix/server/api/input_types/CreateSpanAnnotationInput.py +18 -0
- phoenix/server/api/input_types/CreateTraceAnnotationInput.py +18 -0
- phoenix/server/api/input_types/DataQualityMetricInput.py +5 -2
- phoenix/server/api/input_types/DatasetExampleInput.py +14 -0
- phoenix/server/api/input_types/DatasetSort.py +17 -0
- phoenix/server/api/input_types/DatasetVersionSort.py +16 -0
- phoenix/server/api/input_types/DeleteAnnotationsInput.py +7 -0
- phoenix/server/api/input_types/DeleteDatasetExamplesInput.py +13 -0
- phoenix/server/api/input_types/DeleteDatasetInput.py +7 -0
- phoenix/server/api/input_types/DeleteExperimentsInput.py +7 -0
- phoenix/server/api/input_types/DimensionFilter.py +4 -4
- phoenix/server/api/input_types/GenerativeModelInput.py +17 -0
- phoenix/server/api/input_types/Granularity.py +1 -1
- phoenix/server/api/input_types/InvocationParameters.py +162 -0
- phoenix/server/api/input_types/PatchAnnotationInput.py +19 -0
- phoenix/server/api/input_types/PatchDatasetExamplesInput.py +35 -0
- phoenix/server/api/input_types/PatchDatasetInput.py +14 -0
- phoenix/server/api/input_types/PerformanceMetricInput.py +5 -2
- phoenix/server/api/input_types/ProjectSessionSort.py +29 -0
- phoenix/server/api/input_types/SpanAnnotationSort.py +17 -0
- phoenix/server/api/input_types/SpanSort.py +134 -69
- phoenix/server/api/input_types/TemplateOptions.py +10 -0
- phoenix/server/api/input_types/TraceAnnotationSort.py +17 -0
- phoenix/server/api/input_types/UserRoleInput.py +9 -0
- phoenix/server/api/mutations/__init__.py +28 -0
- phoenix/server/api/mutations/api_key_mutations.py +167 -0
- phoenix/server/api/mutations/chat_mutations.py +593 -0
- phoenix/server/api/mutations/dataset_mutations.py +591 -0
- phoenix/server/api/mutations/experiment_mutations.py +75 -0
- phoenix/server/api/{types/ExportEventsMutation.py → mutations/export_events_mutations.py} +21 -18
- phoenix/server/api/mutations/project_mutations.py +57 -0
- phoenix/server/api/mutations/span_annotations_mutations.py +128 -0
- phoenix/server/api/mutations/trace_annotations_mutations.py +127 -0
- phoenix/server/api/mutations/user_mutations.py +329 -0
- phoenix/server/api/openapi/__init__.py +0 -0
- phoenix/server/api/openapi/main.py +17 -0
- phoenix/server/api/openapi/schema.py +16 -0
- phoenix/server/api/queries.py +738 -0
- phoenix/server/api/routers/__init__.py +11 -0
- phoenix/server/api/routers/auth.py +284 -0
- phoenix/server/api/routers/embeddings.py +26 -0
- phoenix/server/api/routers/oauth2.py +488 -0
- phoenix/server/api/routers/v1/__init__.py +64 -0
- phoenix/server/api/routers/v1/datasets.py +1017 -0
- phoenix/server/api/routers/v1/evaluations.py +362 -0
- phoenix/server/api/routers/v1/experiment_evaluations.py +115 -0
- phoenix/server/api/routers/v1/experiment_runs.py +167 -0
- phoenix/server/api/routers/v1/experiments.py +308 -0
- phoenix/server/api/routers/v1/pydantic_compat.py +78 -0
- phoenix/server/api/routers/v1/spans.py +267 -0
- phoenix/server/api/routers/v1/traces.py +208 -0
- phoenix/server/api/routers/v1/utils.py +95 -0
- phoenix/server/api/schema.py +44 -247
- phoenix/server/api/subscriptions.py +597 -0
- phoenix/server/api/types/Annotation.py +21 -0
- phoenix/server/api/types/AnnotationSummary.py +55 -0
- phoenix/server/api/types/AnnotatorKind.py +16 -0
- phoenix/server/api/types/ApiKey.py +27 -0
- phoenix/server/api/types/AuthMethod.py +9 -0
- phoenix/server/api/types/ChatCompletionMessageRole.py +11 -0
- phoenix/server/api/types/ChatCompletionSubscriptionPayload.py +46 -0
- phoenix/server/api/types/Cluster.py +25 -24
- phoenix/server/api/types/CreateDatasetPayload.py +8 -0
- phoenix/server/api/types/DataQualityMetric.py +31 -13
- phoenix/server/api/types/Dataset.py +288 -63
- phoenix/server/api/types/DatasetExample.py +85 -0
- phoenix/server/api/types/DatasetExampleRevision.py +34 -0
- phoenix/server/api/types/DatasetVersion.py +14 -0
- phoenix/server/api/types/Dimension.py +32 -31
- phoenix/server/api/types/DocumentEvaluationSummary.py +9 -8
- phoenix/server/api/types/EmbeddingDimension.py +56 -49
- phoenix/server/api/types/Evaluation.py +25 -31
- phoenix/server/api/types/EvaluationSummary.py +30 -50
- phoenix/server/api/types/Event.py +20 -20
- phoenix/server/api/types/ExampleRevisionInterface.py +14 -0
- phoenix/server/api/types/Experiment.py +152 -0
- phoenix/server/api/types/ExperimentAnnotationSummary.py +13 -0
- phoenix/server/api/types/ExperimentComparison.py +17 -0
- phoenix/server/api/types/ExperimentRun.py +119 -0
- phoenix/server/api/types/ExperimentRunAnnotation.py +56 -0
- phoenix/server/api/types/GenerativeModel.py +9 -0
- phoenix/server/api/types/GenerativeProvider.py +85 -0
- phoenix/server/api/types/Inferences.py +80 -0
- phoenix/server/api/types/InferencesRole.py +23 -0
- phoenix/server/api/types/LabelFraction.py +7 -0
- phoenix/server/api/types/MimeType.py +2 -2
- phoenix/server/api/types/Model.py +54 -54
- phoenix/server/api/types/PerformanceMetric.py +8 -5
- phoenix/server/api/types/Project.py +407 -142
- phoenix/server/api/types/ProjectSession.py +139 -0
- phoenix/server/api/types/Segments.py +4 -4
- phoenix/server/api/types/Span.py +221 -176
- phoenix/server/api/types/SpanAnnotation.py +43 -0
- phoenix/server/api/types/SpanIOValue.py +15 -0
- phoenix/server/api/types/SystemApiKey.py +9 -0
- phoenix/server/api/types/TemplateLanguage.py +10 -0
- phoenix/server/api/types/TimeSeries.py +19 -15
- phoenix/server/api/types/TokenUsage.py +11 -0
- phoenix/server/api/types/Trace.py +154 -0
- phoenix/server/api/types/TraceAnnotation.py +45 -0
- phoenix/server/api/types/UMAPPoints.py +7 -7
- phoenix/server/api/types/User.py +60 -0
- phoenix/server/api/types/UserApiKey.py +45 -0
- phoenix/server/api/types/UserRole.py +15 -0
- phoenix/server/api/types/node.py +13 -107
- phoenix/server/api/types/pagination.py +156 -57
- phoenix/server/api/utils.py +34 -0
- phoenix/server/app.py +864 -115
- phoenix/server/bearer_auth.py +163 -0
- phoenix/server/dml_event.py +136 -0
- phoenix/server/dml_event_handler.py +256 -0
- phoenix/server/email/__init__.py +0 -0
- phoenix/server/email/sender.py +97 -0
- phoenix/server/email/templates/__init__.py +0 -0
- phoenix/server/email/templates/password_reset.html +19 -0
- phoenix/server/email/types.py +11 -0
- phoenix/server/grpc_server.py +102 -0
- phoenix/server/jwt_store.py +505 -0
- phoenix/server/main.py +305 -116
- phoenix/server/oauth2.py +52 -0
- phoenix/server/openapi/__init__.py +0 -0
- phoenix/server/prometheus.py +111 -0
- phoenix/server/rate_limiters.py +188 -0
- phoenix/server/static/.vite/manifest.json +87 -0
- phoenix/server/static/assets/components-Cy9nwIvF.js +2125 -0
- phoenix/server/static/assets/index-BKvHIxkk.js +113 -0
- phoenix/server/static/assets/pages-CUi2xCVQ.js +4449 -0
- phoenix/server/static/assets/vendor-DvC8cT4X.js +894 -0
- phoenix/server/static/assets/vendor-DxkFTwjz.css +1 -0
- phoenix/server/static/assets/vendor-arizeai-Do1793cv.js +662 -0
- phoenix/server/static/assets/vendor-codemirror-BzwZPyJM.js +24 -0
- phoenix/server/static/assets/vendor-recharts-_Jb7JjhG.js +59 -0
- phoenix/server/static/assets/vendor-shiki-Cl9QBraO.js +5 -0
- phoenix/server/static/assets/vendor-three-DwGkEfCM.js +2998 -0
- phoenix/server/telemetry.py +68 -0
- phoenix/server/templates/index.html +82 -23
- phoenix/server/thread_server.py +3 -3
- phoenix/server/types.py +275 -0
- phoenix/services.py +27 -18
- phoenix/session/client.py +743 -68
- phoenix/session/data_extractor.py +31 -7
- phoenix/session/evaluation.py +3 -9
- phoenix/session/session.py +263 -219
- phoenix/settings.py +22 -0
- phoenix/trace/__init__.py +2 -22
- phoenix/trace/attributes.py +338 -0
- phoenix/trace/dsl/README.md +116 -0
- phoenix/trace/dsl/filter.py +663 -213
- phoenix/trace/dsl/helpers.py +73 -21
- phoenix/trace/dsl/query.py +574 -201
- phoenix/trace/exporter.py +24 -19
- phoenix/trace/fixtures.py +368 -32
- phoenix/trace/otel.py +71 -219
- phoenix/trace/projects.py +3 -2
- phoenix/trace/schemas.py +33 -11
- phoenix/trace/span_evaluations.py +21 -16
- phoenix/trace/span_json_decoder.py +6 -4
- phoenix/trace/span_json_encoder.py +2 -2
- phoenix/trace/trace_dataset.py +47 -32
- phoenix/trace/utils.py +21 -4
- phoenix/utilities/__init__.py +0 -26
- phoenix/utilities/client.py +132 -0
- phoenix/utilities/deprecation.py +31 -0
- phoenix/utilities/error_handling.py +3 -2
- phoenix/utilities/json.py +109 -0
- phoenix/utilities/logging.py +8 -0
- phoenix/utilities/project.py +2 -2
- phoenix/utilities/re.py +49 -0
- phoenix/utilities/span_store.py +0 -23
- phoenix/utilities/template_formatters.py +99 -0
- phoenix/version.py +1 -1
- arize_phoenix-3.16.0.dist-info/METADATA +0 -495
- arize_phoenix-3.16.0.dist-info/RECORD +0 -178
- phoenix/core/project.py +0 -617
- phoenix/core/traces.py +0 -100
- phoenix/experimental/evals/__init__.py +0 -73
- phoenix/experimental/evals/evaluators.py +0 -413
- phoenix/experimental/evals/functions/__init__.py +0 -4
- phoenix/experimental/evals/functions/classify.py +0 -453
- phoenix/experimental/evals/functions/executor.py +0 -353
- phoenix/experimental/evals/functions/generate.py +0 -138
- phoenix/experimental/evals/functions/processing.py +0 -76
- phoenix/experimental/evals/models/__init__.py +0 -14
- phoenix/experimental/evals/models/anthropic.py +0 -175
- phoenix/experimental/evals/models/base.py +0 -170
- phoenix/experimental/evals/models/bedrock.py +0 -221
- phoenix/experimental/evals/models/litellm.py +0 -134
- phoenix/experimental/evals/models/openai.py +0 -448
- phoenix/experimental/evals/models/rate_limiters.py +0 -246
- phoenix/experimental/evals/models/vertex.py +0 -173
- phoenix/experimental/evals/models/vertexai.py +0 -186
- phoenix/experimental/evals/retrievals.py +0 -96
- phoenix/experimental/evals/templates/__init__.py +0 -50
- phoenix/experimental/evals/templates/default_templates.py +0 -472
- phoenix/experimental/evals/templates/template.py +0 -195
- phoenix/experimental/evals/utils/__init__.py +0 -172
- phoenix/experimental/evals/utils/threads.py +0 -27
- phoenix/server/api/helpers.py +0 -11
- phoenix/server/api/routers/evaluation_handler.py +0 -109
- phoenix/server/api/routers/span_handler.py +0 -70
- phoenix/server/api/routers/trace_handler.py +0 -60
- phoenix/server/api/types/DatasetRole.py +0 -23
- phoenix/server/static/index.css +0 -6
- phoenix/server/static/index.js +0 -7447
- phoenix/storage/span_store/__init__.py +0 -23
- phoenix/storage/span_store/text_file.py +0 -85
- phoenix/trace/dsl/missing.py +0 -60
- phoenix/trace/langchain/__init__.py +0 -3
- phoenix/trace/langchain/instrumentor.py +0 -35
- phoenix/trace/llama_index/__init__.py +0 -3
- phoenix/trace/llama_index/callback.py +0 -102
- phoenix/trace/openai/__init__.py +0 -3
- phoenix/trace/openai/instrumentor.py +0 -30
- {arize_phoenix-3.16.0.dist-info → arize_phoenix-7.7.0.dist-info}/licenses/IP_NOTICE +0 -0
- {arize_phoenix-3.16.0.dist-info → arize_phoenix-7.7.0.dist-info}/licenses/LICENSE +0 -0
- /phoenix/{datasets → db/insertion}/__init__.py +0 -0
- /phoenix/{experimental → db/migrations}/__init__.py +0 -0
- /phoenix/{storage → db/migrations/data_migration_scripts}/__init__.py +0 -0
--- /dev/null
+++ b/phoenix/experiments/evaluators/llm_evaluators.py
@@ -0,0 +1,473 @@
+import re
+from collections.abc import Callable
+from types import MappingProxyType
+from typing import Any, Optional
+
+from phoenix.evals.models.base import BaseModel as LLMBaseModel
+from phoenix.evals.utils import snap_to_rail
+from phoenix.experiments.evaluators.base import (
+    ExperimentEvaluator,
+    LLMEvaluator,
+)
+from phoenix.experiments.evaluators.utils import unwrap_json
+from phoenix.experiments.types import (
+    EvaluationResult,
+    ExampleInput,
+    ExampleMetadata,
+    TaskOutput,
+)
+
+
+class LLMCriteriaEvaluator(LLMEvaluator):
+    """
+    An experiment evaluator that uses an LLM to evaluate whether the text meets a custom criteria.
+
+    This evaluator uses the chain-of-thought technique to perform a binary evaluation of text based
+    on a custom criteria and description. When used as an experiment evaluator,
+    `LLMCriteriaEvaluator` will return a score of 1.0 if the text meets the criteria and a score of
+    0.0 if not. The explanation produced by the chain-of-thought technique will be included in the
+    experiment evaluation as well.
+
+    Example criteria and descriptions:
+    - "thoughtfulness" - "shows careful consideration and fair judgement"
+    - "clarity" - "is easy to understand and follow"
+    - "professionalism" - "is respectful and appropriate for a formal setting"
+
+    Args:
+        model: The LLM model wrapper to use for evaluation. Compatible models can be imported from
+            the `phoenix.evals` module.
+        criteria: The criteria to evaluate the text against, the criteria should be able to be used
+            as a noun in a sentence.
+        description (str): A description of the criteria, used to clarify instructions to the LLM.
+            The description should complete this sentence: "{criteria} means the text
+            {description}".
+        name (str): The name of the evaluator
+    """
+
+    _base_template = (
+        "Determine if the following text is {criteria}. {description}"
+        "First, explain step-by-step why you think the text is or is not {criteria}. Then provide "
+        "a single word label; 'true' if the text is {criteria} or 'false' if the text is not "
+        "{criteria}. Here is an example template for whether the text meets a criteria:\n\n"
+        "CRITERIA: the text is '{criteria}'\n"
+        "TEXT: *the provided text to evaluate*\n"
+        "EXPLANATION: *a step by step explanation of your reasoning for whether the text meets "
+        "the criteria*\n"
+        "LABEL: *true or false*\n\n"
+        "Follow this template for the following example:\n\n"
+        "CRITERIA: the text is '{criteria}'\n"
+        "TEXT: {text}\n"
+        "EXPLANATION: "
+    )
+    _description = "In this context, '{criteria}' means the text '{description}'. "
+
+    def __init__(
+        self,
+        model: LLMBaseModel,
+        criteria: str,
+        description: str,
+        name: str,
+    ):
+        self.model = model
+        self.criteria = criteria
+        self.description = description
+        self.template = self._format_base_template(self.criteria, self.description)
+        self._name = name
+
+    def evaluate(self, *, output: Optional[TaskOutput] = None, **_: Any) -> EvaluationResult:
+        formatted_template = self._format_eval_template(output)
+        unparsed_response = self.model._generate(formatted_template)
+        return self._parse_eval_output(unparsed_response)
+
+    async def async_evaluate(
+        self, *, output: Optional[TaskOutput] = None, **_: Any
+    ) -> EvaluationResult:
+        formatted_template = self._format_eval_template(output)
+        unparsed_response = await self.model._async_generate(formatted_template)
+        return self._parse_eval_output(unparsed_response)
+
+    def _format_eval_template(self, output: TaskOutput) -> str:
+        assert output is not None
+        result = unwrap_json(output)
+        return self.template.format(text=str(result))
+
+    def _parse_eval_output(self, unparsed_response: str) -> EvaluationResult:
+        raw_label, explanation = (
+            _parse_label_from_explanation(unparsed_response),
+            unparsed_response,
+        )
+        label = snap_to_rail(raw_label, ["true", "false"])
+        if label == "true":
+            score = 1.0
+        elif label == "false":
+            score = 0.0
+        else:
+            raise RuntimeError(f"Could not parse LLM evaluation: {unparsed_response}")
+        return EvaluationResult(
+            score=score,
+            explanation=explanation,
+            metadata={},
+        )
+
+    @classmethod
+    def _format_base_template(cls, criteria: str, description: Optional[str] = None) -> str:
+        formatted_description = cls._description.format(criteria=criteria, description=description)
+        formatted_template = cls._base_template.format(
+            criteria=criteria,
+            description=formatted_description,
+            text="{text}",  # leave the text field as a placeholder
+        )
+        return formatted_template
+
+
+def criteria_evaluator_factory(
+    class_name: str, criteria: str, description: str, default_name: str
+) -> type[ExperimentEvaluator]:
+    def _init(self, model: LLMBaseModel, name: str = default_name) -> None:  # type: ignore
+        LLMCriteriaEvaluator.__init__(self, model, criteria, description, name=name)
+
+    return type(
+        class_name,
+        (LLMCriteriaEvaluator,),
+        {
+            "__init__": _init,
+            "__module__": __name__,
+            "template": LLMCriteriaEvaluator._format_base_template(criteria, description),
+        },
+    )
+
+
+ConcisenessEvaluator = criteria_evaluator_factory(
+    class_name="ConcisenessEvaluator",
+    criteria="concise",
+    description="is just a few sentences and easy to follow",
+    default_name="Conciseness",
+)
+"""
+An experiment evaluator that uses an LLM to evaluate whether the text is concise.
+
+Args:
+    model: The LLM model wrapper to use for evaluation. Compatible models can be imported from
+        the `phoenix.evals` module.
+    name (str, optional): The name of the evaluator, defaults to "Conciseness".
+"""
+
+
+HelpfulnessEvaluator = criteria_evaluator_factory(
+    class_name="HelpfulnessEvaluator",
+    criteria="helpful",
+    description="provides useful information",
+    default_name="Helpfulness",
+)
+"""
+An experiment evaluator that uses an LLM to evaluate whether the text is helpful.
+
+Args:
+    model: The LLM model wrapper to use for evaluation. Compatible models can be imported from
+        the `phoenix.evals` module.
+    name (str, optional): The name of the evaluator, defaults to "Helpfulness".
+"""
+
+
+CoherenceEvaluator = criteria_evaluator_factory(
+    class_name="CoherenceEvaluator",
+    criteria="coherent",
+    description="is coherent, well-structured, and logically sound",
+    default_name="Coherence",
+)
+"""
+An experiment evaluator that uses an LLM to evaluate whether the text is coherent.
+
+Args:
+    model: The LLM model wrapper to use for evaluation. Compatible models can be imported from
+        the `phoenix.evals` module.
+    name (str, optional): The name of the evaluator, defaults to "Coherence".
+"""
+
+
+def _parse_label_from_explanation(raw_string: str) -> str:
+    label_delimiter = r"(\W*label\W*)"
+    parts = re.split(label_delimiter, raw_string, flags=re.IGNORECASE)
+    if len(parts) > 1:
+        # Find the last occurrence of the delimiter and take the part after it
+        last_index = len(parts) - 1
+        while last_index > 0:
+            if re.match(label_delimiter, parts[last_index - 1], flags=re.IGNORECASE):
+                return parts[last_index].strip()
+            last_index -= 1
+    return raw_string
+
+
+class RelevanceEvaluator(LLMEvaluator):
+    """
+    An experiment evaluator that uses an LLM to evaluate whether a response is relevant to a query.
+
+    This evaluator uses the chain-of-thought technique to perform a binary evaluation of whether
+    the output "response" of an experiment is relevant to its input "query". When used as an
+    experiment evaluator, `RelevanceEvaluator` will return a score of 1.0 if the response is
+    relevant to the query and a score of 0.0 if not. The explanation produced by the
+    chain-of-thought technique will be included in the experiment evaluation as well.
+
+    Optionally, you can provide custom functions to extract the query and response from the input
+    and output of the experiment task. By default, the evaluator will use the dataset example as
+    the input and the output of the experiment task as the response.
+
+    Args:
+        model: The LLM model wrapper to use for evaluation. Compatible models can be imported from
+            the `phoenix.evals` module.
+        get_query (callable, optional): A function that extracts the query from the input of the
+            experiment task. The function should take the input and metadata of the dataset example
+            and return a string. By default, the function will return the string representation of
+            the input.
+        get_response (callable, optional): A function that extracts the response from the output of
+            the experiment task. The function should take the output and metadata of the experiment
+            task and return a string. By default, the function will return the string representation
+            of the output.
+        name (str, optional): The name of the evaluator. Defaults to "Relevance".
+    """
+
+    template = (
+        "Determine if the following response is relevant to the query. In this context, "
+        "'relevance' means that the response directly addresses the core question or topic of the "
+        "query. First, explain step-by-step why you think the text is or is not relevant. "
+        "Then provide a single word label; 'true' if the text is relevant or 'false' if the text "
+        "is not relevant. "
+        "Here is an example template for your response:\n\n"
+        "CRITERIA: the response is 'relevant' to the query\n"
+        "QUERY: *text that contains a query*\n"
+        "RESPONSE: *a response that may or may not be relevant to the query*\n"
+        "EXPLANATION: *a step by step explanation of your reasoning for whether or not the "
+        "response is relevant to the query*\n"
+        "LABEL: *true or false*\n\n"
+        "Follow this template for the following example:\n\n"
+        "CRITERIA: the response is 'relevant' to the query\n"
+        "QUERY: {query}\n"
+        "RESPONSE: {response}\n"
+        "EXPLANATION: "
+    )
+
+    def __init__(
+        self,
+        model: LLMBaseModel,
+        get_query: Optional[Callable[[ExampleInput, ExampleMetadata], str]] = None,
+        get_response: Optional[Callable[[Optional[TaskOutput], ExampleMetadata], str]] = None,
+        name: str = "Relevance",
+    ):
+        self.model = model
+        self._name = name
+        self.get_query = get_query or self._default_get_query
+        self.get_response = get_response or self._default_get_response
+
+    def _format_eval_template(
+        self,
+        output: Optional[TaskOutput] = None,
+        input: ExampleInput = MappingProxyType({}),
+        metadata: ExampleMetadata = MappingProxyType({}),
+    ) -> str:
+        assert output is not None
+        query = self.get_query(input, metadata)
+        response = self.get_response(output, metadata)
+        return self.template.format(query=query, response=response)
+
+    def _parse_eval_output(self, unparsed_response: str) -> EvaluationResult:
+        raw_label, explanation = (
+            _parse_label_from_explanation(unparsed_response),
+            unparsed_response,
+        )
+        label = snap_to_rail(raw_label, ["true", "false"])
+        if label == "true":
+            score = 1.0
+        elif label == "false":
+            score = 0.0
+        else:
+            raise RuntimeError(f"Could not parse LLM evaluation: {unparsed_response}")
+        return EvaluationResult(
+            score=score,
+            explanation=explanation,
+            metadata={},
+        )
+
+    def _default_get_query(self, input: ExampleInput, *args: Any, **kwargs: Any) -> str:
+        return str(input)
+
+    def _default_get_response(
+        self, output: Optional[TaskOutput] = None, *args: Any, **kwargs: Any
+    ) -> str:
+        assert output is not None
+        return str(unwrap_json(output))
+
+    def evaluate(
+        self,
+        *,
+        output: Optional[TaskOutput] = None,
+        metadata: ExampleMetadata = MappingProxyType({}),
+        input: ExampleInput = MappingProxyType({}),
+        **_: Any,
+    ) -> EvaluationResult:
+        formatted_template = self._format_eval_template(output, input, metadata)
+        unparsed_response = self.model._generate(formatted_template)
+        return self._parse_eval_output(unparsed_response)
+
+    async def async_evaluate(
+        self,
+        *,
+        output: Optional[TaskOutput] = None,
+        metadata: ExampleMetadata = MappingProxyType({}),
+        input: ExampleInput = MappingProxyType({}),
+        **_: Any,
+    ) -> EvaluationResult:
+        formatted_template = self._format_eval_template(output, input, metadata)
+        unparsed_response = await self.model._async_generate(formatted_template)
+        return self._parse_eval_output(unparsed_response)
+
+
+class LLMRelationalEvaluator(LLMEvaluator):
+    """
+    An LLM experiment evaluator that checks how a response is related to reference text.
+
+    `LLMRelationalEvaluator` uses the chain-of-thought technique to perform a binary evaluation of
+    how a response is related to reference text in a specified manner. When used as an experiment
+    evaluator, `LLMRelationalEvaluator` will return a score of 1.0 if the response is related to
+    the reference text in the specified manner and a score of 0.0 if not. The explanation
+    produced by the chain-of-thought technique will be included in the experiment evaluation as
+    well.
+
+    In order to evaluate how a response is related to reference text, a specific relation and
+    description of that relation must be specified. The relation should be a phrase that can be
+    used in the following manner: "The response '{relation}' the reference". The description
+    should complete the sentence "In this context, '{relation}' means the response {description}".
+
+    Example relations and descriptions:
+    - "is a good summary of" - "the response clearly concisely summarizes the reference"
+    - "directly quotes" - "the response contains specific information from the reference"
+    - "professionally addresses" - "the response is respectful and relevant to the reference"
+
+    Args:
+        model: The LLM model wrapper to use for evaluation. Compatible models can be imported from
+            the `phoenix.evals` module.
+        relation: The relation to evaluate the text against, the relation should be a phrase that
+            can be used in the following manner: "The response '{relation}' the reference".
+        description (str): A description of the relation, used to clarify instructions to the LLM.
+            The description should complete the sentence "In this context, '{relation}'
+            means {description}". It is helpful to specifically use the words "response" and
+            "reference" to describe the relation.
+        name (str): The name of the evaluator
+        get_reference (callable, optional): A function that extracts the reference from the input of
+            the experiment task. The function should take the input and metadata of the dataset
+            example and return a string. By default, the function will return the string
+            representation of the input.
+        get_response (callable, optional): A function that extracts the response from the output of
+            the experiment task. The function should take the output and metadata of the experiment
+            task and return a string. By default, the function will return the string representation
+            of the output.
+    """
+
+    _base_template = (
+        "Determine if the following response '{relation}' the reference. {description}"
+        "First, explain step-by-step why you think the response '{relation}' the reference. "
+        "Then provide a single word label; 'true' if the response '{relation}' the reference or "
+        "'false' if the text is not '{relation}' to the reference. "
+        "Here is an example template for your response:\n\n"
+        "CRITERIA: the response '{relation}' the reference\n"
+        "REFERENCE: *text that contains a reference*\n"
+        "RESPONSE: *a response that may or may not be '{relation}' to the reference*\n"
+        "EXPLANATION: *a step by step explanation of your reasoning for whether or not the "
+        "response '{relation}' the reference*\n"
+        "LABEL: *true or false*\n\n"
+        "Follow this template for the following example:\n\n"
+        "CRITERIA: the response '{relation}' the reference\n"
+        "REFERENCE: {reference}\n"
+        "RESPONSE: {response}\n"
+        "EXPLANATION: "
+    )
+    _description = "In this context, '{relation}' means '{description}'. "
+
+    def __init__(
+        self,
+        model: LLMBaseModel,
+        relation: str,
+        description: str,
+        name: str,
+        get_reference: Optional[Callable[[ExampleInput, ExampleMetadata], str]] = None,
+        get_response: Optional[Callable[[Optional[TaskOutput], ExampleMetadata], str]] = None,
+    ):
+        self.model = model
+        self._name = name
+        self.relation = relation
+        self.description = description
+        self.template = self._format_base_template(self.relation, self.description)
+        self.get_reference = get_reference or self._default_get_reference
+        self.get_response = get_response or self._default_get_response
+
+    @classmethod
+    def _format_base_template(cls, relation: str, description: Optional[str] = None) -> str:
+        formatted_description = cls._description.format(relation=relation, description=description)
+        formatted_template = cls._base_template.format(
+            relation=relation,
+            description=formatted_description,
+            response="{response}",  # leave the response field as a placeholder
+            reference="{reference}",  # leave the reference field as a placeholder
+        )
+        return formatted_template
+
+    def _format_eval_template(
+        self,
+        output: Optional[TaskOutput] = None,
+        input: ExampleInput = MappingProxyType({}),
+        metadata: ExampleMetadata = MappingProxyType({}),
+    ) -> str:
+        assert output is not None
+        reference = self.get_reference(input, metadata)
+        response = self.get_response(output, metadata)
+        return self.template.format(reference=reference, response=response)
+
+    def _parse_eval_output(self, unparsed_response: str) -> EvaluationResult:
+        raw_label, explanation = (
+            _parse_label_from_explanation(unparsed_response),
+            unparsed_response,
+        )
+        label = snap_to_rail(raw_label, ["true", "false"])
+        if label == "true":
+            score = 1.0
+        elif label == "false":
+            score = 0.0
+        else:
+            raise RuntimeError(f"Could not parse LLM evaluation: {unparsed_response}")
+        return EvaluationResult(
+            score=score,
+            explanation=explanation,
+            metadata={},
+        )
+
+    def _default_get_reference(self, input: ExampleInput, *args: Any, **kwargs: Any) -> str:
+        return str(input)
+
+    def _default_get_response(
+        self, output: Optional[TaskOutput] = None, *args: Any, **kwargs: Any
+    ) -> str:
+        assert output is not None
+        return str(unwrap_json(output))
+
+    def evaluate(
+        self,
+        *,
+        output: Optional[TaskOutput] = None,
+        metadata: ExampleMetadata = MappingProxyType({}),
+        input: ExampleInput = MappingProxyType({}),
+        **_: Any,
+    ) -> EvaluationResult:
+        formatted_template = self._format_eval_template(output, input, metadata)
+        unparsed_response = self.model._generate(formatted_template)
+        return self._parse_eval_output(unparsed_response)
+
+    async def async_evaluate(
+        self,
+        *,
+        output: Optional[TaskOutput] = None,
+        metadata: ExampleMetadata = MappingProxyType({}),
+        input: ExampleInput = MappingProxyType({}),
+        **_: Any,
+    ) -> EvaluationResult:
+        formatted_template = self._format_eval_template(output, input, metadata)
+        unparsed_response = await self.model._async_generate(formatted_template)
+        return self._parse_eval_output(unparsed_response)
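For orientation, a minimal usage sketch (not part of the diff) that drives the new ConcisenessEvaluator end to end. The _EchoModel stub below is hypothetical and stands in for the LLM wrapper from `phoenix.evals` that the docstrings above expect; everything else follows the signatures shown in the hunk, importing directly from the new phoenix/experiments/evaluators/llm_evaluators.py module.

# Sketch only: exercise ConcisenessEvaluator with a stand-in model object.
from phoenix.experiments.evaluators.llm_evaluators import ConcisenessEvaluator


class _EchoModel:
    # Hypothetical stand-in for a `phoenix.evals` model wrapper; only the
    # `_generate` method is needed for the synchronous evaluate() path.
    def _generate(self, prompt: str) -> str:
        # Pretend the LLM followed the chain-of-thought template and ended with a LABEL line.
        return "The text is short and easy to follow.\nLABEL: true"


evaluator = ConcisenessEvaluator(model=_EchoModel())  # name defaults to "Conciseness"
result = evaluator.evaluate(output="Phoenix is an open-source LLM observability library.")
print(result.score)        # 1.0, because the parsed label snapped to "true"
print(result.explanation)  # the full model response is kept as the explanation

The same pattern applies to HelpfulnessEvaluator, CoherenceEvaluator, RelevanceEvaluator, and LLMRelationalEvaluator; in real experiments the score is computed from the model's LABEL line exactly as in _parse_eval_output above.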