arize-phoenix 3.16.1__py3-none-any.whl → 7.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of arize-phoenix might be problematic. Click here for more details.
- arize_phoenix-7.7.0.dist-info/METADATA +261 -0
- arize_phoenix-7.7.0.dist-info/RECORD +345 -0
- {arize_phoenix-3.16.1.dist-info → arize_phoenix-7.7.0.dist-info}/WHEEL +1 -1
- arize_phoenix-7.7.0.dist-info/entry_points.txt +3 -0
- phoenix/__init__.py +86 -14
- phoenix/auth.py +309 -0
- phoenix/config.py +675 -45
- phoenix/core/model.py +32 -30
- phoenix/core/model_schema.py +102 -109
- phoenix/core/model_schema_adapter.py +48 -45
- phoenix/datetime_utils.py +24 -3
- phoenix/db/README.md +54 -0
- phoenix/db/__init__.py +4 -0
- phoenix/db/alembic.ini +85 -0
- phoenix/db/bulk_inserter.py +294 -0
- phoenix/db/engines.py +208 -0
- phoenix/db/enums.py +20 -0
- phoenix/db/facilitator.py +113 -0
- phoenix/db/helpers.py +159 -0
- phoenix/db/insertion/constants.py +2 -0
- phoenix/db/insertion/dataset.py +227 -0
- phoenix/db/insertion/document_annotation.py +171 -0
- phoenix/db/insertion/evaluation.py +191 -0
- phoenix/db/insertion/helpers.py +98 -0
- phoenix/db/insertion/span.py +193 -0
- phoenix/db/insertion/span_annotation.py +158 -0
- phoenix/db/insertion/trace_annotation.py +158 -0
- phoenix/db/insertion/types.py +256 -0
- phoenix/db/migrate.py +86 -0
- phoenix/db/migrations/data_migration_scripts/populate_project_sessions.py +199 -0
- phoenix/db/migrations/env.py +114 -0
- phoenix/db/migrations/script.py.mako +26 -0
- phoenix/db/migrations/versions/10460e46d750_datasets.py +317 -0
- phoenix/db/migrations/versions/3be8647b87d8_add_token_columns_to_spans_table.py +126 -0
- phoenix/db/migrations/versions/4ded9e43755f_create_project_sessions_table.py +66 -0
- phoenix/db/migrations/versions/cd164e83824f_users_and_tokens.py +157 -0
- phoenix/db/migrations/versions/cf03bd6bae1d_init.py +280 -0
- phoenix/db/models.py +807 -0
- phoenix/exceptions.py +5 -1
- phoenix/experiments/__init__.py +6 -0
- phoenix/experiments/evaluators/__init__.py +29 -0
- phoenix/experiments/evaluators/base.py +158 -0
- phoenix/experiments/evaluators/code_evaluators.py +184 -0
- phoenix/experiments/evaluators/llm_evaluators.py +473 -0
- phoenix/experiments/evaluators/utils.py +236 -0
- phoenix/experiments/functions.py +772 -0
- phoenix/experiments/tracing.py +86 -0
- phoenix/experiments/types.py +726 -0
- phoenix/experiments/utils.py +25 -0
- phoenix/inferences/__init__.py +0 -0
- phoenix/{datasets → inferences}/errors.py +6 -5
- phoenix/{datasets → inferences}/fixtures.py +49 -42
- phoenix/{datasets/dataset.py → inferences/inferences.py} +121 -105
- phoenix/{datasets → inferences}/schema.py +11 -11
- phoenix/{datasets → inferences}/validation.py +13 -14
- phoenix/logging/__init__.py +3 -0
- phoenix/logging/_config.py +90 -0
- phoenix/logging/_filter.py +6 -0
- phoenix/logging/_formatter.py +69 -0
- phoenix/metrics/__init__.py +5 -4
- phoenix/metrics/binning.py +4 -3
- phoenix/metrics/metrics.py +2 -1
- phoenix/metrics/mixins.py +7 -6
- phoenix/metrics/retrieval_metrics.py +2 -1
- phoenix/metrics/timeseries.py +5 -4
- phoenix/metrics/wrappers.py +9 -3
- phoenix/pointcloud/clustering.py +5 -5
- phoenix/pointcloud/pointcloud.py +7 -5
- phoenix/pointcloud/projectors.py +5 -6
- phoenix/pointcloud/umap_parameters.py +53 -52
- phoenix/server/api/README.md +28 -0
- phoenix/server/api/auth.py +44 -0
- phoenix/server/api/context.py +152 -9
- phoenix/server/api/dataloaders/__init__.py +91 -0
- phoenix/server/api/dataloaders/annotation_summaries.py +139 -0
- phoenix/server/api/dataloaders/average_experiment_run_latency.py +54 -0
- phoenix/server/api/dataloaders/cache/__init__.py +3 -0
- phoenix/server/api/dataloaders/cache/two_tier_cache.py +68 -0
- phoenix/server/api/dataloaders/dataset_example_revisions.py +131 -0
- phoenix/server/api/dataloaders/dataset_example_spans.py +38 -0
- phoenix/server/api/dataloaders/document_evaluation_summaries.py +144 -0
- phoenix/server/api/dataloaders/document_evaluations.py +31 -0
- phoenix/server/api/dataloaders/document_retrieval_metrics.py +89 -0
- phoenix/server/api/dataloaders/experiment_annotation_summaries.py +79 -0
- phoenix/server/api/dataloaders/experiment_error_rates.py +58 -0
- phoenix/server/api/dataloaders/experiment_run_annotations.py +36 -0
- phoenix/server/api/dataloaders/experiment_run_counts.py +49 -0
- phoenix/server/api/dataloaders/experiment_sequence_number.py +44 -0
- phoenix/server/api/dataloaders/latency_ms_quantile.py +188 -0
- phoenix/server/api/dataloaders/min_start_or_max_end_times.py +85 -0
- phoenix/server/api/dataloaders/project_by_name.py +31 -0
- phoenix/server/api/dataloaders/record_counts.py +116 -0
- phoenix/server/api/dataloaders/session_io.py +79 -0
- phoenix/server/api/dataloaders/session_num_traces.py +30 -0
- phoenix/server/api/dataloaders/session_num_traces_with_error.py +32 -0
- phoenix/server/api/dataloaders/session_token_usages.py +41 -0
- phoenix/server/api/dataloaders/session_trace_latency_ms_quantile.py +55 -0
- phoenix/server/api/dataloaders/span_annotations.py +26 -0
- phoenix/server/api/dataloaders/span_dataset_examples.py +31 -0
- phoenix/server/api/dataloaders/span_descendants.py +57 -0
- phoenix/server/api/dataloaders/span_projects.py +33 -0
- phoenix/server/api/dataloaders/token_counts.py +124 -0
- phoenix/server/api/dataloaders/trace_by_trace_ids.py +25 -0
- phoenix/server/api/dataloaders/trace_root_spans.py +32 -0
- phoenix/server/api/dataloaders/user_roles.py +30 -0
- phoenix/server/api/dataloaders/users.py +33 -0
- phoenix/server/api/exceptions.py +48 -0
- phoenix/server/api/helpers/__init__.py +12 -0
- phoenix/server/api/helpers/dataset_helpers.py +217 -0
- phoenix/server/api/helpers/experiment_run_filters.py +763 -0
- phoenix/server/api/helpers/playground_clients.py +948 -0
- phoenix/server/api/helpers/playground_registry.py +70 -0
- phoenix/server/api/helpers/playground_spans.py +455 -0
- phoenix/server/api/input_types/AddExamplesToDatasetInput.py +16 -0
- phoenix/server/api/input_types/AddSpansToDatasetInput.py +14 -0
- phoenix/server/api/input_types/ChatCompletionInput.py +38 -0
- phoenix/server/api/input_types/ChatCompletionMessageInput.py +24 -0
- phoenix/server/api/input_types/ClearProjectInput.py +15 -0
- phoenix/server/api/input_types/ClusterInput.py +2 -2
- phoenix/server/api/input_types/CreateDatasetInput.py +12 -0
- phoenix/server/api/input_types/CreateSpanAnnotationInput.py +18 -0
- phoenix/server/api/input_types/CreateTraceAnnotationInput.py +18 -0
- phoenix/server/api/input_types/DataQualityMetricInput.py +5 -2
- phoenix/server/api/input_types/DatasetExampleInput.py +14 -0
- phoenix/server/api/input_types/DatasetSort.py +17 -0
- phoenix/server/api/input_types/DatasetVersionSort.py +16 -0
- phoenix/server/api/input_types/DeleteAnnotationsInput.py +7 -0
- phoenix/server/api/input_types/DeleteDatasetExamplesInput.py +13 -0
- phoenix/server/api/input_types/DeleteDatasetInput.py +7 -0
- phoenix/server/api/input_types/DeleteExperimentsInput.py +7 -0
- phoenix/server/api/input_types/DimensionFilter.py +4 -4
- phoenix/server/api/input_types/GenerativeModelInput.py +17 -0
- phoenix/server/api/input_types/Granularity.py +1 -1
- phoenix/server/api/input_types/InvocationParameters.py +162 -0
- phoenix/server/api/input_types/PatchAnnotationInput.py +19 -0
- phoenix/server/api/input_types/PatchDatasetExamplesInput.py +35 -0
- phoenix/server/api/input_types/PatchDatasetInput.py +14 -0
- phoenix/server/api/input_types/PerformanceMetricInput.py +5 -2
- phoenix/server/api/input_types/ProjectSessionSort.py +29 -0
- phoenix/server/api/input_types/SpanAnnotationSort.py +17 -0
- phoenix/server/api/input_types/SpanSort.py +134 -69
- phoenix/server/api/input_types/TemplateOptions.py +10 -0
- phoenix/server/api/input_types/TraceAnnotationSort.py +17 -0
- phoenix/server/api/input_types/UserRoleInput.py +9 -0
- phoenix/server/api/mutations/__init__.py +28 -0
- phoenix/server/api/mutations/api_key_mutations.py +167 -0
- phoenix/server/api/mutations/chat_mutations.py +593 -0
- phoenix/server/api/mutations/dataset_mutations.py +591 -0
- phoenix/server/api/mutations/experiment_mutations.py +75 -0
- phoenix/server/api/{types/ExportEventsMutation.py → mutations/export_events_mutations.py} +21 -18
- phoenix/server/api/mutations/project_mutations.py +57 -0
- phoenix/server/api/mutations/span_annotations_mutations.py +128 -0
- phoenix/server/api/mutations/trace_annotations_mutations.py +127 -0
- phoenix/server/api/mutations/user_mutations.py +329 -0
- phoenix/server/api/openapi/__init__.py +0 -0
- phoenix/server/api/openapi/main.py +17 -0
- phoenix/server/api/openapi/schema.py +16 -0
- phoenix/server/api/queries.py +738 -0
- phoenix/server/api/routers/__init__.py +11 -0
- phoenix/server/api/routers/auth.py +284 -0
- phoenix/server/api/routers/embeddings.py +26 -0
- phoenix/server/api/routers/oauth2.py +488 -0
- phoenix/server/api/routers/v1/__init__.py +64 -0
- phoenix/server/api/routers/v1/datasets.py +1017 -0
- phoenix/server/api/routers/v1/evaluations.py +362 -0
- phoenix/server/api/routers/v1/experiment_evaluations.py +115 -0
- phoenix/server/api/routers/v1/experiment_runs.py +167 -0
- phoenix/server/api/routers/v1/experiments.py +308 -0
- phoenix/server/api/routers/v1/pydantic_compat.py +78 -0
- phoenix/server/api/routers/v1/spans.py +267 -0
- phoenix/server/api/routers/v1/traces.py +208 -0
- phoenix/server/api/routers/v1/utils.py +95 -0
- phoenix/server/api/schema.py +44 -241
- phoenix/server/api/subscriptions.py +597 -0
- phoenix/server/api/types/Annotation.py +21 -0
- phoenix/server/api/types/AnnotationSummary.py +55 -0
- phoenix/server/api/types/AnnotatorKind.py +16 -0
- phoenix/server/api/types/ApiKey.py +27 -0
- phoenix/server/api/types/AuthMethod.py +9 -0
- phoenix/server/api/types/ChatCompletionMessageRole.py +11 -0
- phoenix/server/api/types/ChatCompletionSubscriptionPayload.py +46 -0
- phoenix/server/api/types/Cluster.py +25 -24
- phoenix/server/api/types/CreateDatasetPayload.py +8 -0
- phoenix/server/api/types/DataQualityMetric.py +31 -13
- phoenix/server/api/types/Dataset.py +288 -63
- phoenix/server/api/types/DatasetExample.py +85 -0
- phoenix/server/api/types/DatasetExampleRevision.py +34 -0
- phoenix/server/api/types/DatasetVersion.py +14 -0
- phoenix/server/api/types/Dimension.py +32 -31
- phoenix/server/api/types/DocumentEvaluationSummary.py +9 -8
- phoenix/server/api/types/EmbeddingDimension.py +56 -49
- phoenix/server/api/types/Evaluation.py +25 -31
- phoenix/server/api/types/EvaluationSummary.py +30 -50
- phoenix/server/api/types/Event.py +20 -20
- phoenix/server/api/types/ExampleRevisionInterface.py +14 -0
- phoenix/server/api/types/Experiment.py +152 -0
- phoenix/server/api/types/ExperimentAnnotationSummary.py +13 -0
- phoenix/server/api/types/ExperimentComparison.py +17 -0
- phoenix/server/api/types/ExperimentRun.py +119 -0
- phoenix/server/api/types/ExperimentRunAnnotation.py +56 -0
- phoenix/server/api/types/GenerativeModel.py +9 -0
- phoenix/server/api/types/GenerativeProvider.py +85 -0
- phoenix/server/api/types/Inferences.py +80 -0
- phoenix/server/api/types/InferencesRole.py +23 -0
- phoenix/server/api/types/LabelFraction.py +7 -0
- phoenix/server/api/types/MimeType.py +2 -2
- phoenix/server/api/types/Model.py +54 -54
- phoenix/server/api/types/PerformanceMetric.py +8 -5
- phoenix/server/api/types/Project.py +407 -142
- phoenix/server/api/types/ProjectSession.py +139 -0
- phoenix/server/api/types/Segments.py +4 -4
- phoenix/server/api/types/Span.py +221 -176
- phoenix/server/api/types/SpanAnnotation.py +43 -0
- phoenix/server/api/types/SpanIOValue.py +15 -0
- phoenix/server/api/types/SystemApiKey.py +9 -0
- phoenix/server/api/types/TemplateLanguage.py +10 -0
- phoenix/server/api/types/TimeSeries.py +19 -15
- phoenix/server/api/types/TokenUsage.py +11 -0
- phoenix/server/api/types/Trace.py +154 -0
- phoenix/server/api/types/TraceAnnotation.py +45 -0
- phoenix/server/api/types/UMAPPoints.py +7 -7
- phoenix/server/api/types/User.py +60 -0
- phoenix/server/api/types/UserApiKey.py +45 -0
- phoenix/server/api/types/UserRole.py +15 -0
- phoenix/server/api/types/node.py +4 -112
- phoenix/server/api/types/pagination.py +156 -57
- phoenix/server/api/utils.py +34 -0
- phoenix/server/app.py +864 -115
- phoenix/server/bearer_auth.py +163 -0
- phoenix/server/dml_event.py +136 -0
- phoenix/server/dml_event_handler.py +256 -0
- phoenix/server/email/__init__.py +0 -0
- phoenix/server/email/sender.py +97 -0
- phoenix/server/email/templates/__init__.py +0 -0
- phoenix/server/email/templates/password_reset.html +19 -0
- phoenix/server/email/types.py +11 -0
- phoenix/server/grpc_server.py +102 -0
- phoenix/server/jwt_store.py +505 -0
- phoenix/server/main.py +305 -116
- phoenix/server/oauth2.py +52 -0
- phoenix/server/openapi/__init__.py +0 -0
- phoenix/server/prometheus.py +111 -0
- phoenix/server/rate_limiters.py +188 -0
- phoenix/server/static/.vite/manifest.json +87 -0
- phoenix/server/static/assets/components-Cy9nwIvF.js +2125 -0
- phoenix/server/static/assets/index-BKvHIxkk.js +113 -0
- phoenix/server/static/assets/pages-CUi2xCVQ.js +4449 -0
- phoenix/server/static/assets/vendor-DvC8cT4X.js +894 -0
- phoenix/server/static/assets/vendor-DxkFTwjz.css +1 -0
- phoenix/server/static/assets/vendor-arizeai-Do1793cv.js +662 -0
- phoenix/server/static/assets/vendor-codemirror-BzwZPyJM.js +24 -0
- phoenix/server/static/assets/vendor-recharts-_Jb7JjhG.js +59 -0
- phoenix/server/static/assets/vendor-shiki-Cl9QBraO.js +5 -0
- phoenix/server/static/assets/vendor-three-DwGkEfCM.js +2998 -0
- phoenix/server/telemetry.py +68 -0
- phoenix/server/templates/index.html +82 -23
- phoenix/server/thread_server.py +3 -3
- phoenix/server/types.py +275 -0
- phoenix/services.py +27 -18
- phoenix/session/client.py +743 -68
- phoenix/session/data_extractor.py +31 -7
- phoenix/session/evaluation.py +3 -9
- phoenix/session/session.py +263 -219
- phoenix/settings.py +22 -0
- phoenix/trace/__init__.py +2 -22
- phoenix/trace/attributes.py +338 -0
- phoenix/trace/dsl/README.md +116 -0
- phoenix/trace/dsl/filter.py +663 -213
- phoenix/trace/dsl/helpers.py +73 -21
- phoenix/trace/dsl/query.py +574 -201
- phoenix/trace/exporter.py +24 -19
- phoenix/trace/fixtures.py +368 -32
- phoenix/trace/otel.py +71 -219
- phoenix/trace/projects.py +3 -2
- phoenix/trace/schemas.py +33 -11
- phoenix/trace/span_evaluations.py +21 -16
- phoenix/trace/span_json_decoder.py +6 -4
- phoenix/trace/span_json_encoder.py +2 -2
- phoenix/trace/trace_dataset.py +47 -32
- phoenix/trace/utils.py +21 -4
- phoenix/utilities/__init__.py +0 -26
- phoenix/utilities/client.py +132 -0
- phoenix/utilities/deprecation.py +31 -0
- phoenix/utilities/error_handling.py +3 -2
- phoenix/utilities/json.py +109 -0
- phoenix/utilities/logging.py +8 -0
- phoenix/utilities/project.py +2 -2
- phoenix/utilities/re.py +49 -0
- phoenix/utilities/span_store.py +0 -23
- phoenix/utilities/template_formatters.py +99 -0
- phoenix/version.py +1 -1
- arize_phoenix-3.16.1.dist-info/METADATA +0 -495
- arize_phoenix-3.16.1.dist-info/RECORD +0 -178
- phoenix/core/project.py +0 -619
- phoenix/core/traces.py +0 -96
- phoenix/experimental/evals/__init__.py +0 -73
- phoenix/experimental/evals/evaluators.py +0 -413
- phoenix/experimental/evals/functions/__init__.py +0 -4
- phoenix/experimental/evals/functions/classify.py +0 -453
- phoenix/experimental/evals/functions/executor.py +0 -353
- phoenix/experimental/evals/functions/generate.py +0 -138
- phoenix/experimental/evals/functions/processing.py +0 -76
- phoenix/experimental/evals/models/__init__.py +0 -14
- phoenix/experimental/evals/models/anthropic.py +0 -175
- phoenix/experimental/evals/models/base.py +0 -170
- phoenix/experimental/evals/models/bedrock.py +0 -221
- phoenix/experimental/evals/models/litellm.py +0 -134
- phoenix/experimental/evals/models/openai.py +0 -448
- phoenix/experimental/evals/models/rate_limiters.py +0 -246
- phoenix/experimental/evals/models/vertex.py +0 -173
- phoenix/experimental/evals/models/vertexai.py +0 -186
- phoenix/experimental/evals/retrievals.py +0 -96
- phoenix/experimental/evals/templates/__init__.py +0 -50
- phoenix/experimental/evals/templates/default_templates.py +0 -472
- phoenix/experimental/evals/templates/template.py +0 -195
- phoenix/experimental/evals/utils/__init__.py +0 -172
- phoenix/experimental/evals/utils/threads.py +0 -27
- phoenix/server/api/helpers.py +0 -11
- phoenix/server/api/routers/evaluation_handler.py +0 -109
- phoenix/server/api/routers/span_handler.py +0 -70
- phoenix/server/api/routers/trace_handler.py +0 -60
- phoenix/server/api/types/DatasetRole.py +0 -23
- phoenix/server/static/index.css +0 -6
- phoenix/server/static/index.js +0 -7447
- phoenix/storage/span_store/__init__.py +0 -23
- phoenix/storage/span_store/text_file.py +0 -85
- phoenix/trace/dsl/missing.py +0 -60
- phoenix/trace/langchain/__init__.py +0 -3
- phoenix/trace/langchain/instrumentor.py +0 -35
- phoenix/trace/llama_index/__init__.py +0 -3
- phoenix/trace/llama_index/callback.py +0 -102
- phoenix/trace/openai/__init__.py +0 -3
- phoenix/trace/openai/instrumentor.py +0 -30
- {arize_phoenix-3.16.1.dist-info → arize_phoenix-7.7.0.dist-info}/licenses/IP_NOTICE +0 -0
- {arize_phoenix-3.16.1.dist-info → arize_phoenix-7.7.0.dist-info}/licenses/LICENSE +0 -0
- /phoenix/{datasets → db/insertion}/__init__.py +0 -0
- /phoenix/{experimental → db/migrations}/__init__.py +0 -0
- /phoenix/{storage → db/migrations/data_migration_scripts}/__init__.py +0 -0
phoenix/exceptions.py
CHANGED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from phoenix.experiments.evaluators.code_evaluators import (
|
|
2
|
+
ContainsAllKeywords,
|
|
3
|
+
ContainsAnyKeyword,
|
|
4
|
+
ContainsKeyword,
|
|
5
|
+
JSONParsable,
|
|
6
|
+
MatchesRegex,
|
|
7
|
+
)
|
|
8
|
+
from phoenix.experiments.evaluators.llm_evaluators import (
|
|
9
|
+
CoherenceEvaluator,
|
|
10
|
+
ConcisenessEvaluator,
|
|
11
|
+
HelpfulnessEvaluator,
|
|
12
|
+
LLMCriteriaEvaluator,
|
|
13
|
+
RelevanceEvaluator,
|
|
14
|
+
)
|
|
15
|
+
from phoenix.experiments.evaluators.utils import create_evaluator
|
|
16
|
+
|
|
17
|
+
# Public API of the evaluators package: the evaluator factory first, then the
# code-based evaluators, then the LLM-based evaluators.
__all__ = [
    # factory
    "create_evaluator",
    # code evaluators
    "ContainsAllKeywords",
    "ContainsAnyKeyword",
    "ContainsKeyword",
    "JSONParsable",
    "MatchesRegex",
    # LLM evaluators
    "CoherenceEvaluator",
    "ConcisenessEvaluator",
    "LLMCriteriaEvaluator",
    "HelpfulnessEvaluator",
    "RelevanceEvaluator",
]
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
import functools
|
|
2
|
+
import inspect
|
|
3
|
+
from abc import ABC
|
|
4
|
+
from collections.abc import Awaitable, Callable
|
|
5
|
+
from types import MappingProxyType
|
|
6
|
+
from typing import Any, Optional, Union
|
|
7
|
+
|
|
8
|
+
from typing_extensions import TypeAlias
|
|
9
|
+
|
|
10
|
+
from phoenix.experiments.evaluators.utils import validate_evaluator_signature
|
|
11
|
+
from phoenix.experiments.types import (
|
|
12
|
+
AnnotatorKind,
|
|
13
|
+
EvaluationResult,
|
|
14
|
+
EvaluatorKind,
|
|
15
|
+
EvaluatorName,
|
|
16
|
+
EvaluatorOutput,
|
|
17
|
+
ExampleInput,
|
|
18
|
+
ExampleMetadata,
|
|
19
|
+
ExampleOutput,
|
|
20
|
+
TaskOutput,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class Evaluator(ABC):
    """
    Base class that guides the implementation of an experiment evaluator.

    A concrete subclass has to define `evaluate`, `async_evaluate`, or both. This is
    enforced at class-creation time by `__init_subclass__`, which also validates the
    signature of whichever method it finds first. Defining both is recommended but
    not required.

    This class is meant to be subclassed; instantiating it directly raises `TypeError`.
    """

    _kind: AnnotatorKind
    _name: EvaluatorName

    @functools.cached_property
    def name(self) -> EvaluatorName:
        """Return `_name` when the instance has one, otherwise the class name."""
        return self._name if hasattr(self, "_name") else self.__class__.__name__

    @functools.cached_property
    def kind(self) -> EvaluatorKind:
        """Return the value of `_kind` when set, otherwise the value of `AnnotatorKind.CODE`."""
        annotator_kind = self._kind if hasattr(self, "_kind") else AnnotatorKind.CODE
        return annotator_kind.value

    def __new__(cls, *args: Any, **kwargs: Any) -> "Evaluator":
        # Only subclasses may be instantiated; the base class itself is abstract.
        if cls is not Evaluator:
            return object.__new__(cls)
        raise TypeError(f"{cls.__name__} is an abstract class and should not be instantiated.")

    def evaluate(
        self,
        *,
        output: Optional[TaskOutput] = None,
        expected: Optional[ExampleOutput] = None,
        metadata: ExampleMetadata = MappingProxyType({}),
        input: ExampleInput = MappingProxyType({}),
        **kwargs: Any,
    ) -> EvaluationResult:
        """
        Synchronous evaluation hook.

        Subclasses override either this method or `async_evaluate` (overriding both
        is recommended but not required). The base implementation always raises
        `NotImplementedError`.
        """
        raise NotImplementedError

    async def async_evaluate(
        self,
        *,
        output: Optional[TaskOutput] = None,
        expected: Optional[ExampleOutput] = None,
        metadata: ExampleMetadata = MappingProxyType({}),
        input: ExampleInput = MappingProxyType({}),
        **kwargs: Any,
    ) -> EvaluationResult:
        """
        Asynchronous evaluation hook.

        Subclasses override either this method or `evaluate` (overriding both is
        recommended but not required). The base implementation forwards every
        argument to the synchronous `evaluate` and returns its result.
        """
        forwarded = dict(output=output, expected=expected, metadata=metadata, input=input)
        return self.evaluate(**forwarded, **kwargs)

    def __init_subclass__(cls, is_abstract: bool = False, **kwargs: Any) -> None:
        """
        Check that a concrete subclass defines a usable evaluation method.

        Args:
            is_abstract: When true, the subclass is treated as an intermediate
                abstract class and no check is performed.
            **kwargs: Forwarded to the parent `__init_subclass__`.

        Raises:
            ValueError: If no class in the MRO ahead of `LLMEvaluator`/`Evaluator`
                defines `evaluate` or `async_evaluate`.
        """
        super().__init_subclass__(**kwargs)
        if is_abstract:
            return
        # (attribute to look up, label passed to the validator, assertion message);
        # within each class `evaluate` is checked before `async_evaluate`.
        candidates = (
            (
                Evaluator.evaluate.__name__,
                "evaluate",
                "`evaluate()` method should be callable",
            ),
            (
                Evaluator.async_evaluate.__name__,
                "async_evaluate",
                "`async_evaluate()` method should be callable",
            ),
        )
        for klass in inspect.getmro(cls):
            # Stop before the base classes: their default methods do not count
            # as an implementation.
            if klass in (LLMEvaluator, Evaluator):
                break
            for attr_name, label, not_callable_msg in candidates:
                method = klass.__dict__.get(attr_name)
                if not method:
                    continue
                if isinstance(method, classmethod):
                    method = method.__func__
                assert callable(method), not_callable_msg
                # need to remove the first param, i.e. `self`
                _validate_sig(functools.partial(method, None), label)
                return
        evaluate_fn_signature = inspect.signature(Evaluator.evaluate)
        raise ValueError(
            f"Evaluator must implement either "
            f"`def evaluate{evaluate_fn_signature}` or "
            f"`async def async_evaluate{evaluate_fn_signature}`"
        )
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _validate_sig(fn: Callable[..., Any], fn_name: str) -> None:
    """
    Validate the signature of an evaluator method.

    The signature is first passed to `validate_evaluator_signature`; afterwards it
    must contain a variadic keyword parameter (`**kwargs`).

    Args:
        fn: The callable whose signature is inspected.
        fn_name: Name used for the callable in the error message.

    Raises:
        ValueError: If `fn` does not declare a `**kwargs` parameter.
    """
    signature = inspect.signature(fn)
    validate_evaluator_signature(signature)
    accepts_var_keyword = any(
        parameter.kind is inspect.Parameter.VAR_KEYWORD
        for parameter in signature.parameters.values()
    )
    if not accepts_var_keyword:
        raise ValueError(f"`{fn_name}` should allow variadic keyword arguments `**kwargs`")
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
class CodeEvaluator(Evaluator, ABC, is_abstract=True):
    """
    Convenience base class for code evaluators; sets `_kind` to `AnnotatorKind.CODE`.

    Subclass it rather than instantiating it: creating a `CodeEvaluator` directly
    raises `TypeError`.
    """

    _kind = AnnotatorKind.CODE

    def __new__(cls, *args: Any, **kwargs: Any) -> "CodeEvaluator":
        # Only subclasses may be instantiated.
        if cls is not CodeEvaluator:
            return object.__new__(cls)
        raise TypeError(f"{cls.__name__} is an abstract class and should not be instantiated.")
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
class LLMEvaluator(Evaluator, ABC, is_abstract=True):
    """
    Convenience base class for LLM evaluators; sets `_kind` to `AnnotatorKind.LLM`.

    Subclass it rather than instantiating it: creating an `LLMEvaluator` directly
    raises `TypeError`.
    """

    _kind = AnnotatorKind.LLM

    def __new__(cls, *args: Any, **kwargs: Any) -> "LLMEvaluator":
        # Only subclasses may be instantiated.
        if cls is not LLMEvaluator:
            return object.__new__(cls)
        raise TypeError(f"{cls.__name__} is an abstract class and should not be instantiated.")
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
# Anything accepted as an experiment evaluator: an `Evaluator` instance, a plain
# callable returning an `EvaluatorOutput`, or a callable returning an awaitable
# of one.
ExperimentEvaluator: TypeAlias = Union[
    Evaluator, Callable[..., EvaluatorOutput], Callable[..., Awaitable[EvaluatorOutput]]
]
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import re
|
|
5
|
+
from typing import (
|
|
6
|
+
Any,
|
|
7
|
+
Optional,
|
|
8
|
+
Pattern, # import from re module when we drop support for 3.8
|
|
9
|
+
Union,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
from phoenix.experiments.evaluators.base import CodeEvaluator
|
|
13
|
+
from phoenix.experiments.types import EvaluationResult, TaskOutput
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class JSONParsable(CodeEvaluator):
    """
    An evaluator that checks if the output of an experiment run is a JSON-parsable string.

    `evaluate` is a classmethod, so the class itself can be passed as an evaluator
    without being instantiated.

    Example:

        .. code-block:: python
            from phoenix.experiments import run_experiment
            from phoenix.experiments.evaluators import JSONParsable

            run_experiment(dataset, task, evaluators=[JSONParsable])
    """

    @classmethod
    def evaluate(cls, *, output: Optional[TaskOutput] = None, **_: Any) -> EvaluationResult:
        """
        Score whether `output` can be parsed as JSON.

        Args:
            output: The experiment run output; must be a string.
            **_: Remaining evaluator keyword arguments, ignored.

        Returns:
            An `EvaluationResult` whose score is 1 when `json.loads(output)`
            succeeds and 0 when parsing fails.

        Raises:
            AssertionError: If `output` is not a string.
        """
        # Kept as an assertion so that callers observe the same exception type
        # as before for non-string output.
        assert isinstance(output, str), "Experiment run output must be a string"
        try:
            json.loads(output)
        except (ValueError, RecursionError):
            # `json.JSONDecodeError` subclasses `ValueError`; `RecursionError` covers
            # pathologically nested documents. Anything else (e.g. KeyboardInterrupt,
            # SystemExit, task cancellation) must propagate instead of being
            # reported as "not JSON".
            json_parsable = False
        else:
            json_parsable = True
        return EvaluationResult(
            score=int(json_parsable),
        )
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class ContainsKeyword(CodeEvaluator):
    """
    An evaluator that checks whether a keyword occurs in the output of an experiment run.

    Args:
        keyword (str): The keyword to look for in the output.
        name (str, optional): An optional name for the evaluator. Defaults to "Contains(<keyword>)",
            where the keyword is rendered with `repr`.

    Example:

        .. code-block:: python
            from phoenix.experiments import run_experiment
            from phoenix.experiments.evaluators import ContainsKeyword

            run_experiment(dataset, task, evaluators=[ContainsKeyword("foo")])
    """

    def __init__(self, keyword: str, name: Optional[str] = None) -> None:
        default_name = f"Contains({repr(keyword)})"
        self.keyword = keyword
        self._name = name or default_name

    def evaluate(self, *, output: Optional[TaskOutput] = None, **_: Any) -> EvaluationResult:
        """
        Score 1.0 when the keyword is a substring of `output`, else 0.0.

        Raises:
            AssertionError: If `output` is not a string.
        """
        assert isinstance(output, str), "Experiment run output must be a string"
        is_present = self.keyword in output
        if is_present:
            status = "found"
        else:
            status = "not found"
        explanation = f"the string {repr(self.keyword)} was " f"{status} in the output"
        return EvaluationResult(score=float(is_present), explanation=explanation)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class ContainsAnyKeyword(CodeEvaluator):
    """
    An evaluator that checks whether at least one keyword occurs in the output of an
    experiment run.

    Args:
        keywords (list[str]): The keywords to look for in the output.
        name (str, optional): An optional name for the evaluator. Defaults to
            "ContainsAny(<keywords>)".

    Example:

        .. code-block:: python
            from phoenix.experiments import run_experiment
            from phoenix.experiments.evaluators import ContainsAnyKeyword

            run_experiment(dataset, task, evaluators=[ContainsAnyKeyword(["foo", "bar"])])
    """

    def __init__(self, keywords: list[str], name: Optional[str] = None) -> None:
        default_name = f"ContainsAny({keywords})"
        self.keywords = keywords
        self._name = name or default_name

    def evaluate(self, *, output: Optional[TaskOutput] = None, **_: Any) -> EvaluationResult:
        """
        Score 1.0 when any keyword is a substring of `output`, else 0.0.

        The explanation lists the keywords that were found, or all keywords when
        none were.

        Raises:
            AssertionError: If `output` is not a string.
        """
        assert isinstance(output, str), "Experiment run output must be a string"
        present = [candidate for candidate in self.keywords if candidate in output]
        any_present = bool(present)
        explanation = (
            f"the keywords {present} were found in the output"
            if any_present
            else f"none of the keywords {self.keywords} were found in the output"
        )
        return EvaluationResult(score=float(any_present), explanation=explanation)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
class ContainsAllKeywords(CodeEvaluator):
    """
    An evaluator that checks whether every keyword occurs in the output of an experiment run.

    Args:
        keywords (list[str]): The keywords to look for in the output.
        name (str, optional): An optional name for the evaluator. Defaults to
            "ContainsAll(<keywords>)".

    Example:
        .. code-block:: python

            from phoenix.experiments import run_experiment
            from phoenix.experiments.evaluators import ContainsAllKeywords

            run_experiment(dataset, task, evaluators=[ContainsAllKeywords(["foo", "bar"])])
    """

    def __init__(self, keywords: list[str], name: Optional[str] = None) -> None:
        default_name = f"ContainsAll({keywords})"
        self.keywords = keywords
        self._name = name or default_name

    def evaluate(self, *, output: Optional[TaskOutput] = None, **_: Any) -> EvaluationResult:
        """
        Score 1.0 when every keyword is a substring of `output`, else 0.0.

        The explanation lists the missing keywords, or all keywords when none
        are missing.

        Raises:
            AssertionError: If `output` is not a string.
        """
        assert isinstance(output, str), "Experiment run output must be a string"
        missing = [candidate for candidate in self.keywords if candidate not in output]
        contains_all = not missing
        explanation = (
            f"all of the keywords {self.keywords} were found in the output"
            if contains_all
            else f"the keywords {missing} were not found in the output"
        )
        return EvaluationResult(score=float(contains_all), explanation=explanation)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
class MatchesRegex(CodeEvaluator):
    r"""
    An experiment evaluator that checks whether the output of an experiment run contains
    a match for a regex pattern.

    Args:
        pattern (Union[str, Pattern[str]]): The regex pattern to search the output for.
            A string is compiled with `re.compile`.
        name (str, optional): An optional name for the evaluator. Defaults to
            "matches_(<pattern>)", where <pattern> is the string form of the compiled
            pattern object rather than the bare pattern text.

    Example:
        .. code-block:: python

            from phoenix.experiments import run_experiment
            from phoenix.experiments.evaluators import MatchesRegex

            phone_number_evaluator = MatchesRegex(r"\d{3}-\d{3}-\d{4}", name="valid-phone-number")
            run_experiment(dataset, task, evaluators=[phone_number_evaluator])
    """

    def __init__(self, pattern: Union[str, Pattern[str]], name: Optional[str] = None) -> None:
        # Normalise to a compiled pattern so `evaluate` can use it directly.
        pattern = re.compile(pattern) if isinstance(pattern, str) else pattern
        self.pattern = pattern
        assert isinstance(pattern, re.Pattern)
        default_name = f"matches_({pattern})"
        self._name = name or default_name

    def evaluate(self, *, output: Optional[TaskOutput] = None, **_: Any) -> EvaluationResult:
        """
        Score 1.0 when `findall` on the pattern returns at least one match in
        `output`, else 0.0.

        Raises:
            AssertionError: If `output` is not a string.
        """
        assert isinstance(output, str), "Experiment run output must be a string"
        matches = self.pattern.findall(output)
        has_match = bool(matches)
        if not has_match:
            explanation = f"no substrings matched the regex pattern {self.pattern.pattern}"
        else:
            explanation = (
                f"the substrings {matches} matched the regex pattern {self.pattern.pattern}"
            )
        return EvaluationResult(score=float(has_match), explanation=explanation)
|