arize-phoenix 2.0.0__tar.gz → 2.2.0rc0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of arize-phoenix might be problematic. Click here for more details.
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/PKG-INFO +5 -1
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/pyproject.toml +7 -3
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/__init__.py +2 -2
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/core/evals.py +29 -8
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/core/traces.py +45 -34
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/experimental/evals/__init__.py +4 -1
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/experimental/evals/evaluators.py +85 -8
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/experimental/evals/functions/classify.py +16 -41
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/experimental/evals/functions/executor.py +1 -0
- arize_phoenix-2.2.0rc0/src/phoenix/experimental/evals/models/anthropic.py +171 -0
- arize_phoenix-2.2.0rc0/src/phoenix/experimental/evals/models/vertex.py +155 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/experimental/evals/templates/__init__.py +2 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/experimental/evals/templates/default_templates.py +12 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/experimental/evals/utils/__init__.py +64 -2
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/schema.py +24 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/app.py +6 -5
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/main.py +6 -7
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/span_handler.py +7 -7
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/static/index.js +586 -499
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/templates/index.html +5 -1
- arize_phoenix-2.2.0rc0/src/phoenix/server/trace_handler.py +56 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/session/session.py +2 -1
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/trace/exporter.py +4 -3
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/trace/langchain/tracer.py +14 -4
- arize_phoenix-2.2.0rc0/src/phoenix/trace/otel.py +409 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/trace/semantic_conventions.py +2 -0
- arize_phoenix-2.2.0rc0/src/phoenix/trace/v1/__init__.py +5 -0
- arize_phoenix-2.2.0rc0/src/phoenix/version.py +1 -0
- arize_phoenix-2.0.0/src/phoenix/trace/v1/__init__.py +0 -9
- arize_phoenix-2.0.0/src/phoenix/trace/v1/trace_pb2.py +0 -54
- arize_phoenix-2.0.0/src/phoenix/trace/v1/trace_pb2.pyi +0 -361
- arize_phoenix-2.0.0/src/phoenix/trace/v1/utils.py +0 -538
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/.gitignore +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/IP_NOTICE +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/LICENSE +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/README.md +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/config.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/core/__init__.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/core/embedding_dimension.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/core/model.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/core/model_schema.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/core/model_schema_adapter.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/datasets/__init__.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/datasets/dataset.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/datasets/errors.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/datasets/fixtures.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/datasets/schema.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/datasets/validation.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/datetime_utils.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/exceptions.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/experimental/__init__.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/experimental/evals/functions/__init__.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/experimental/evals/functions/generate.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/experimental/evals/functions/processing.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/experimental/evals/models/__init__.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/experimental/evals/models/base.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/experimental/evals/models/bedrock.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/experimental/evals/models/litellm.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/experimental/evals/models/openai.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/experimental/evals/models/rate_limiters.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/experimental/evals/models/vertexai.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/experimental/evals/retrievals.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/experimental/evals/templates/template.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/experimental/evals/utils/threads.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/metrics/README.md +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/metrics/__init__.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/metrics/binning.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/metrics/metrics.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/metrics/mixins.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/metrics/retrieval_metrics.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/metrics/timeseries.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/metrics/wrappers.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/pointcloud/__init__.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/pointcloud/clustering.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/pointcloud/pointcloud.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/pointcloud/projectors.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/pointcloud/umap_parameters.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/py.typed +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/__init__.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/__init__.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/context.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/helpers.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/input_types/ClusterInput.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/input_types/Coordinates.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/input_types/DataQualityMetricInput.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/input_types/DimensionFilter.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/input_types/DimensionInput.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/input_types/Granularity.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/input_types/PerformanceMetricInput.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/input_types/SpanSort.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/input_types/TimeRange.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/input_types/__init__.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/interceptor.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/types/Cluster.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/types/DataQualityMetric.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/types/Dataset.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/types/DatasetInfo.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/types/DatasetRole.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/types/DatasetValues.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/types/Dimension.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/types/DimensionDataType.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/types/DimensionShape.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/types/DimensionType.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/types/DimensionWithValue.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/types/DocumentEvaluationSummary.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/types/DocumentRetrievalMetrics.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/types/EmbeddingDimension.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/types/EmbeddingMetadata.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/types/Evaluation.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/types/EvaluationSummary.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/types/Event.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/types/EventMetadata.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/types/ExportEventsMutation.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/types/ExportedFile.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/types/Functionality.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/types/MimeType.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/types/Model.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/types/NumericRange.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/types/PerformanceMetric.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/types/PromptResponse.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/types/Retrieval.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/types/ScalarDriftMetricEnum.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/types/Segments.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/types/SortDir.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/types/Span.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/types/TimeSeries.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/types/UMAPPoints.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/types/ValidationResult.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/types/VectorDriftMetricEnum.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/types/__init__.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/types/node.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/api/types/pagination.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/evaluation_handler.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/static/apple-touch-icon-114x114.png +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/static/apple-touch-icon-120x120.png +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/static/apple-touch-icon-144x144.png +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/static/apple-touch-icon-152x152.png +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/static/apple-touch-icon-180x180.png +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/static/apple-touch-icon-72x72.png +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/static/apple-touch-icon-76x76.png +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/static/apple-touch-icon.png +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/static/favicon.ico +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/static/index.css +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/static/modernizr.js +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/templates/__init__.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/server/thread_server.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/services.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/session/__init__.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/session/evaluation.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/trace/__init__.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/trace/dsl/__init__.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/trace/dsl/filter.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/trace/dsl/helpers.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/trace/dsl/missing.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/trace/dsl/query.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/trace/evaluation_conventions.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/trace/fixtures.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/trace/langchain/__init__.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/trace/langchain/instrumentor.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/trace/llama_index/__init__.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/trace/llama_index/callback.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/trace/llama_index/debug_callback.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/trace/llama_index/streaming.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/trace/openai/__init__.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/trace/openai/instrumentor.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/trace/schemas.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/trace/span_evaluations.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/trace/span_json_decoder.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/trace/span_json_encoder.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/trace/trace_dataset.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/trace/tracer.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/trace/utils.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/trace/v1/evaluation_pb2.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/trace/v1/evaluation_pb2.pyi +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/utilities/__init__.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/utilities/error_handling.py +0 -0
- {arize_phoenix-2.0.0 → arize_phoenix-2.2.0rc0}/src/phoenix/utilities/logging.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: arize-phoenix
|
|
3
|
-
Version: 2.0.0
|
|
3
|
+
Version: 2.2.0rc0
|
|
4
4
|
Summary: ML Observability in your notebook
|
|
5
5
|
Project-URL: Documentation, https://docs.arize.com/phoenix/
|
|
6
6
|
Project-URL: Issues, https://github.com/Arize-ai/phoenix/issues
|
|
@@ -20,6 +20,8 @@ Requires-Dist: ddsketch
|
|
|
20
20
|
Requires-Dist: hdbscan<1.0.0,>=0.8.33
|
|
21
21
|
Requires-Dist: jinja2
|
|
22
22
|
Requires-Dist: numpy
|
|
23
|
+
Requires-Dist: opentelemetry-proto
|
|
24
|
+
Requires-Dist: opentelemetry-sdk
|
|
23
25
|
Requires-Dist: pandas
|
|
24
26
|
Requires-Dist: protobuf<5.0,>=3.20
|
|
25
27
|
Requires-Dist: psutil
|
|
@@ -36,8 +38,10 @@ Requires-Dist: umap-learn
|
|
|
36
38
|
Requires-Dist: uvicorn
|
|
37
39
|
Requires-Dist: wrapt
|
|
38
40
|
Provides-Extra: dev
|
|
41
|
+
Requires-Dist: anthropic; extra == 'dev'
|
|
39
42
|
Requires-Dist: arize[autoembeddings,llm-evaluation]; extra == 'dev'
|
|
40
43
|
Requires-Dist: gcsfs; extra == 'dev'
|
|
44
|
+
Requires-Dist: google-cloud-aiplatform>=1.3; extra == 'dev'
|
|
41
45
|
Requires-Dist: hatch; extra == 'dev'
|
|
42
46
|
Requires-Dist: jupyter; extra == 'dev'
|
|
43
47
|
Requires-Dist: langchain>=0.0.334; extra == 'dev'
|
|
@@ -40,6 +40,8 @@ dependencies = [
|
|
|
40
40
|
"ddsketch",
|
|
41
41
|
"tqdm",
|
|
42
42
|
"requests",
|
|
43
|
+
"opentelemetry-sdk",
|
|
44
|
+
"opentelemetry-proto",
|
|
43
45
|
]
|
|
44
46
|
dynamic = ["version"]
|
|
45
47
|
|
|
@@ -60,7 +62,9 @@ dev = [
|
|
|
60
62
|
"arize[AutoEmbeddings, LLM_Evaluation]",
|
|
61
63
|
"llama-index>=0.9.14",
|
|
62
64
|
"langchain>=0.0.334",
|
|
63
|
-
"litellm>=1.0.3"
|
|
65
|
+
"litellm>=1.0.3",
|
|
66
|
+
"google-cloud-aiplatform>=1.3",
|
|
67
|
+
"anthropic",
|
|
64
68
|
]
|
|
65
69
|
experimental = [
|
|
66
70
|
"tenacity",
|
|
@@ -75,7 +79,7 @@ Issues = "https://github.com/Arize-ai/phoenix/issues"
|
|
|
75
79
|
Source = "https://github.com/Arize-ai/phoenix"
|
|
76
80
|
|
|
77
81
|
[tool.hatch.version]
|
|
78
|
-
path = "src/phoenix/__init__.py"
|
|
82
|
+
path = "src/phoenix/version.py"
|
|
79
83
|
|
|
80
84
|
[build-system]
|
|
81
85
|
requires = ["hatchling"]
|
|
@@ -242,7 +246,6 @@ dependencies = [
|
|
|
242
246
|
|
|
243
247
|
[tool.hatch.envs.proto.scripts]
|
|
244
248
|
recompile = """
|
|
245
|
-
python -m grpc_tools.protoc -I src/phoenix/proto --python_out=src/phoenix --mypy_out=src/phoenix src/phoenix/proto/trace/v1/trace.proto &&
|
|
246
249
|
python -m grpc_tools.protoc -I src/phoenix/proto --python_out=src/phoenix --mypy_out=src/phoenix src/phoenix/proto/trace/v1/evaluation.proto
|
|
247
250
|
"""
|
|
248
251
|
|
|
@@ -288,6 +291,7 @@ module = [
|
|
|
288
291
|
"langchain.*",
|
|
289
292
|
"litellm",
|
|
290
293
|
"nest_asyncio",
|
|
294
|
+
"opentelemetry.*",
|
|
291
295
|
]
|
|
292
296
|
ignore_missing_imports = true
|
|
293
297
|
|
|
@@ -5,8 +5,7 @@ from .session.evaluation import log_evaluations
|
|
|
5
5
|
from .session.session import NotebookEnvironment, Session, active_session, close_app, launch_app
|
|
6
6
|
from .trace.fixtures import load_example_traces
|
|
7
7
|
from .trace.trace_dataset import TraceDataset
|
|
8
|
-
|
|
9
|
-
__version__ = "2.0.0"
|
|
8
|
+
from .version import __version__
|
|
10
9
|
|
|
11
10
|
# module level doc-string
|
|
12
11
|
__doc__ = """
|
|
@@ -25,6 +24,7 @@ Here are just a few of the things that phoenix does well:
|
|
|
25
24
|
"""
|
|
26
25
|
|
|
27
26
|
__all__ = [
|
|
27
|
+
"__version__",
|
|
28
28
|
"Dataset",
|
|
29
29
|
"EmbeddingColumnNames",
|
|
30
30
|
"RetrievalEmbeddingColumnNames",
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
import weakref
|
|
3
3
|
from collections import defaultdict
|
|
4
|
+
from datetime import datetime, timezone
|
|
4
5
|
from queue import SimpleQueue
|
|
5
6
|
from threading import RLock, Thread
|
|
6
7
|
from types import MethodType
|
|
@@ -46,6 +47,7 @@ class Evals:
|
|
|
46
47
|
self._document_evaluations_by_name: DefaultDict[
|
|
47
48
|
EvaluationName, DefaultDict[SpanID, Dict[DocumentPosition, pb.Evaluation]]
|
|
48
49
|
] = defaultdict(lambda: defaultdict(dict))
|
|
50
|
+
self._last_updated_at: Optional[datetime] = None
|
|
49
51
|
self._start_consumer()
|
|
50
52
|
|
|
51
53
|
def put(self, evaluation: pb.Evaluation) -> None:
|
|
@@ -92,10 +94,16 @@ class Evals:
|
|
|
92
94
|
)
|
|
93
95
|
else:
|
|
94
96
|
assert_never(subject_id_kind)
|
|
97
|
+
self._last_updated_at = datetime.now(timezone.utc)
|
|
98
|
+
|
|
99
|
+
@property
|
|
100
|
+
def last_updated_at(self) -> Optional[datetime]:
|
|
101
|
+
return self._last_updated_at
|
|
95
102
|
|
|
96
103
|
def get_span_evaluation(self, span_id: SpanID, name: str) -> Optional[pb.Evaluation]:
|
|
97
104
|
with self._lock:
|
|
98
|
-
|
|
105
|
+
span_evaluations = self._evaluations_by_span_id.get(span_id)
|
|
106
|
+
return span_evaluations.get(name) if span_evaluations else None
|
|
99
107
|
|
|
100
108
|
def get_span_evaluation_names(self) -> List[EvaluationName]:
|
|
101
109
|
with self._lock:
|
|
@@ -108,28 +116,36 @@ class Evals:
|
|
|
108
116
|
with self._lock:
|
|
109
117
|
if span_id is None:
|
|
110
118
|
return list(self._document_evaluations_by_name)
|
|
111
|
-
|
|
119
|
+
document_evaluations = self._document_evaluations_by_span_id.get(span_id)
|
|
120
|
+
return list(document_evaluations) if document_evaluations else []
|
|
112
121
|
|
|
113
122
|
def get_span_evaluation_labels(self, name: EvaluationName) -> Tuple[str, ...]:
|
|
114
123
|
with self._lock:
|
|
115
|
-
|
|
124
|
+
labels = self._span_evaluation_labels.get(name)
|
|
125
|
+
return tuple(labels) if labels else ()
|
|
116
126
|
|
|
117
127
|
def get_span_evaluation_span_ids(self, name: EvaluationName) -> Tuple[SpanID, ...]:
|
|
118
128
|
with self._lock:
|
|
119
|
-
|
|
129
|
+
span_evaluations = self._span_evaluations_by_name.get(name)
|
|
130
|
+
return tuple(span_evaluations.keys()) if span_evaluations else ()
|
|
120
131
|
|
|
121
132
|
def get_evaluations_by_span_id(self, span_id: SpanID) -> List[pb.Evaluation]:
|
|
122
133
|
with self._lock:
|
|
123
|
-
|
|
134
|
+
evaluations = self._evaluations_by_span_id.get(span_id)
|
|
135
|
+
return list(evaluations.values()) if evaluations else []
|
|
124
136
|
|
|
125
137
|
def get_document_evaluation_span_ids(self, name: EvaluationName) -> Tuple[SpanID, ...]:
|
|
126
138
|
with self._lock:
|
|
127
|
-
|
|
139
|
+
document_evaluations = self._document_evaluations_by_name.get(name)
|
|
140
|
+
return tuple(document_evaluations.keys()) if document_evaluations else ()
|
|
128
141
|
|
|
129
142
|
def get_document_evaluations_by_span_id(self, span_id: SpanID) -> List[pb.Evaluation]:
|
|
130
143
|
all_evaluations: List[pb.Evaluation] = []
|
|
131
144
|
with self._lock:
|
|
132
|
-
|
|
145
|
+
document_evaluations = self._document_evaluations_by_span_id.get(span_id)
|
|
146
|
+
if not document_evaluations:
|
|
147
|
+
return all_evaluations
|
|
148
|
+
for evaluations in document_evaluations.values():
|
|
133
149
|
all_evaluations.extend(evaluations.values())
|
|
134
150
|
return all_evaluations
|
|
135
151
|
|
|
@@ -144,7 +160,12 @@ class Evals:
|
|
|
144
160
|
# of one trillion, we would not want to create a result that large.
|
|
145
161
|
scores: List[float] = [np.nan] * num_documents
|
|
146
162
|
with self._lock:
|
|
147
|
-
|
|
163
|
+
document_evaluations = self._document_evaluations_by_span_id.get(span_id)
|
|
164
|
+
if not document_evaluations:
|
|
165
|
+
return scores
|
|
166
|
+
evaluations = document_evaluations.get(evaluation_name)
|
|
167
|
+
if not evaluations:
|
|
168
|
+
return scores
|
|
148
169
|
for document_position, evaluation in evaluations.items():
|
|
149
170
|
result = evaluation.result
|
|
150
171
|
if result.HasField("score") and document_position < num_documents:
|
|
@@ -13,20 +13,21 @@ from typing import (
|
|
|
13
13
|
Iterator,
|
|
14
14
|
List,
|
|
15
15
|
Optional,
|
|
16
|
+
Set,
|
|
16
17
|
SupportsFloat,
|
|
17
18
|
Tuple,
|
|
18
|
-
Union,
|
|
19
19
|
cast,
|
|
20
20
|
)
|
|
21
21
|
|
|
22
|
+
import opentelemetry.proto.trace.v1.trace_pb2 as otlp
|
|
22
23
|
from ddsketch import DDSketch
|
|
23
24
|
from sortedcontainers import SortedKeyList
|
|
24
25
|
from typing_extensions import TypeAlias
|
|
25
26
|
from wrapt import ObjectProxy
|
|
26
27
|
|
|
27
|
-
import phoenix.trace.v1 as pb
|
|
28
28
|
from phoenix.datetime_utils import right_open_time_range
|
|
29
29
|
from phoenix.trace import semantic_conventions
|
|
30
|
+
from phoenix.trace.otel import decode
|
|
30
31
|
from phoenix.trace.schemas import (
|
|
31
32
|
ATTRIBUTE_PREFIX,
|
|
32
33
|
COMPUTED_PREFIX,
|
|
@@ -34,9 +35,10 @@ from phoenix.trace.schemas import (
|
|
|
34
35
|
Span,
|
|
35
36
|
SpanAttributes,
|
|
36
37
|
SpanID,
|
|
38
|
+
SpanStatusCode,
|
|
37
39
|
TraceID,
|
|
38
40
|
)
|
|
39
|
-
from phoenix.trace.
|
|
41
|
+
from phoenix.trace.semantic_conventions import RETRIEVAL_DOCUMENTS
|
|
40
42
|
|
|
41
43
|
END_OF_QUEUE = None # sentinel value for queue termination
|
|
42
44
|
|
|
@@ -74,15 +76,15 @@ class ReadableSpan(ObjectProxy): # type: ignore
|
|
|
74
76
|
are ingested, and would need to be re-computed on the fly.
|
|
75
77
|
"""
|
|
76
78
|
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
def __init__(self, span: pb.Span) -> None:
|
|
79
|
+
def __init__(self, otlp_span: otlp.Span) -> None:
|
|
80
|
+
span = decode(otlp_span)
|
|
80
81
|
super().__init__(span)
|
|
82
|
+
self._self_otlp_span = otlp_span
|
|
81
83
|
self._self_computed_values: Dict[str, SupportsFloat] = {}
|
|
82
84
|
|
|
83
85
|
@property
|
|
84
86
|
def span(self) -> Span:
|
|
85
|
-
span = decode(self.
|
|
87
|
+
span = decode(self._self_otlp_span)
|
|
86
88
|
span.attributes.update(cast(SpanAttributes, self._self_computed_values))
|
|
87
89
|
# TODO: compute latency rank percent (which can change depending on how
|
|
88
90
|
# many spans already ingested).
|
|
@@ -96,9 +98,7 @@ class ReadableSpan(ObjectProxy): # type: ignore
|
|
|
96
98
|
return getattr(self.__wrapped__.context, suffix_key, None)
|
|
97
99
|
if key.startswith(ATTRIBUTE_PREFIX):
|
|
98
100
|
suffix_key = key[len(ATTRIBUTE_PREFIX) :]
|
|
99
|
-
|
|
100
|
-
return None
|
|
101
|
-
return self.__wrapped__.attributes[suffix_key]
|
|
101
|
+
return self.__wrapped__.attributes.get(suffix_key)
|
|
102
102
|
return getattr(self.__wrapped__, key, None)
|
|
103
103
|
|
|
104
104
|
def __setitem__(self, key: str, value: Any) -> None:
|
|
@@ -113,21 +113,21 @@ ChildSpanID: TypeAlias = SpanID
|
|
|
113
113
|
|
|
114
114
|
class Traces:
|
|
115
115
|
def __init__(self) -> None:
|
|
116
|
-
self._queue: "SimpleQueue[Optional[
|
|
116
|
+
self._queue: "SimpleQueue[Optional[otlp.Span]]" = SimpleQueue()
|
|
117
117
|
# Putting `None` as the sentinel value for queue termination.
|
|
118
118
|
weakref.finalize(self, self._queue.put, END_OF_QUEUE)
|
|
119
119
|
self._lock = RLock()
|
|
120
120
|
self._spans: Dict[SpanID, ReadableSpan] = {}
|
|
121
121
|
self._parent_span_ids: Dict[SpanID, ParentSpanID] = {}
|
|
122
|
-
self._traces:
|
|
123
|
-
self._child_span_ids: DefaultDict[SpanID,
|
|
124
|
-
self._orphan_spans: DefaultDict[ParentSpanID, List[
|
|
122
|
+
self._traces: DefaultDict[TraceID, List[SpanID]] = defaultdict(list)
|
|
123
|
+
self._child_span_ids: DefaultDict[SpanID, Set[ChildSpanID]] = defaultdict(set)
|
|
124
|
+
self._orphan_spans: DefaultDict[ParentSpanID, List[otlp.Span]] = defaultdict(list)
|
|
125
125
|
self._num_documents: DefaultDict[SpanID, int] = defaultdict(int)
|
|
126
126
|
self._start_time_sorted_span_ids: SortedKeyList[SpanID] = SortedKeyList(
|
|
127
|
-
key=lambda span_id: self._spans[span_id].start_time
|
|
127
|
+
key=lambda span_id: self._spans[span_id].start_time,
|
|
128
128
|
)
|
|
129
129
|
self._start_time_sorted_root_span_ids: SortedKeyList[SpanID] = SortedKeyList(
|
|
130
|
-
key=lambda span_id: self._spans[span_id].start_time
|
|
130
|
+
key=lambda span_id: self._spans[span_id].start_time,
|
|
131
131
|
)
|
|
132
132
|
self._latency_sorted_root_span_ids: SortedKeyList[SpanID] = SortedKeyList(
|
|
133
133
|
key=lambda span_id: self._spans[span_id][ComputedAttributes.LATENCY_MS.value],
|
|
@@ -136,15 +136,18 @@ class Traces:
|
|
|
136
136
|
self._min_start_time: Optional[datetime] = None
|
|
137
137
|
self._max_start_time: Optional[datetime] = None
|
|
138
138
|
self._token_count_total: int = 0
|
|
139
|
+
self._last_updated_at: Optional[datetime] = None
|
|
139
140
|
self._start_consumer()
|
|
140
141
|
|
|
141
|
-
def put(self, span: Optional[
|
|
142
|
-
self._queue.put(
|
|
142
|
+
def put(self, span: Optional[otlp.Span] = None) -> None:
|
|
143
|
+
self._queue.put(span)
|
|
143
144
|
|
|
144
145
|
def get_trace(self, trace_id: TraceID) -> Iterator[Span]:
|
|
145
146
|
with self._lock:
|
|
146
147
|
# make a copy because source data can mutate during iteration
|
|
147
|
-
|
|
148
|
+
if not (trace := self._traces.get(trace_id)):
|
|
149
|
+
return
|
|
150
|
+
span_ids = tuple(trace)
|
|
148
151
|
for span_id in span_ids:
|
|
149
152
|
if span := self[span_id]:
|
|
150
153
|
yield span
|
|
@@ -194,7 +197,7 @@ class Traces:
|
|
|
194
197
|
|
|
195
198
|
def get_num_documents(self, span_id: SpanID) -> int:
|
|
196
199
|
with self._lock:
|
|
197
|
-
return self._num_documents
|
|
200
|
+
return self._num_documents.get(span_id) or 0
|
|
198
201
|
|
|
199
202
|
def latency_rank_percent(self, latency_ms: float) -> Optional[float]:
|
|
200
203
|
"""
|
|
@@ -221,11 +224,17 @@ class Traces:
|
|
|
221
224
|
def get_descendant_span_ids(self, span_id: SpanID) -> Iterator[SpanID]:
|
|
222
225
|
with self._lock:
|
|
223
226
|
# make a copy because source data can mutate during iteration
|
|
224
|
-
|
|
227
|
+
if not (child_span_ids := self._child_span_ids.get(span_id)):
|
|
228
|
+
return
|
|
229
|
+
span_ids = tuple(child_span_ids)
|
|
225
230
|
for child_span_id in span_ids:
|
|
226
231
|
yield child_span_id
|
|
227
232
|
yield from self.get_descendant_span_ids(child_span_id)
|
|
228
233
|
|
|
234
|
+
@property
|
|
235
|
+
def last_updated_at(self) -> Optional[datetime]:
|
|
236
|
+
return self._last_updated_at
|
|
237
|
+
|
|
229
238
|
@property
|
|
230
239
|
def span_count(self) -> int:
|
|
231
240
|
"""Total number of spans (excluding orphan spans if any)"""
|
|
@@ -259,24 +268,24 @@ class Traces:
|
|
|
259
268
|
with self._lock:
|
|
260
269
|
self._process_span(item)
|
|
261
270
|
|
|
262
|
-
def _process_span(self, span:
|
|
263
|
-
|
|
271
|
+
def _process_span(self, span: otlp.Span) -> None:
|
|
272
|
+
new_span = ReadableSpan(span)
|
|
273
|
+
span_id = new_span.context.span_id
|
|
264
274
|
existing_span = self._spans.get(span_id)
|
|
265
|
-
if existing_span and existing_span.
|
|
275
|
+
if existing_span and existing_span.end_time:
|
|
266
276
|
# Reject updates if span has ended.
|
|
267
277
|
return
|
|
268
|
-
is_root_span = not
|
|
278
|
+
is_root_span = not new_span.parent_id
|
|
269
279
|
if not is_root_span:
|
|
270
|
-
parent_span_id =
|
|
280
|
+
parent_span_id = new_span.parent_id
|
|
271
281
|
if parent_span_id not in self._spans:
|
|
272
282
|
# Span can't be processed before its parent.
|
|
273
283
|
self._orphan_spans[parent_span_id].append(span)
|
|
274
284
|
return
|
|
275
|
-
self._child_span_ids[parent_span_id].
|
|
285
|
+
self._child_span_ids[parent_span_id].add(span_id)
|
|
276
286
|
self._parent_span_ids[span_id] = parent_span_id
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
end_time = span.end_time.ToDatetime(timezone.utc) if span.HasField("end_time") else None
|
|
287
|
+
start_time = new_span.start_time
|
|
288
|
+
end_time = new_span.end_time
|
|
280
289
|
if end_time:
|
|
281
290
|
new_span[ComputedAttributes.LATENCY_MS.value] = latency = (
|
|
282
291
|
end_time - start_time
|
|
@@ -287,7 +296,7 @@ class Traces:
|
|
|
287
296
|
if is_root_span and end_time:
|
|
288
297
|
self._latency_sorted_root_span_ids.add(span_id)
|
|
289
298
|
if not existing_span:
|
|
290
|
-
trace_id =
|
|
299
|
+
trace_id = new_span.context.trace_id
|
|
291
300
|
self._traces[trace_id].append(span_id)
|
|
292
301
|
if is_root_span:
|
|
293
302
|
self._start_time_sorted_root_span_ids.add(span_id)
|
|
@@ -303,7 +312,7 @@ class Traces:
|
|
|
303
312
|
else max(self._max_start_time, start_time)
|
|
304
313
|
)
|
|
305
314
|
new_span[ComputedAttributes.ERROR_COUNT.value] = int(
|
|
306
|
-
|
|
315
|
+
new_span.status_code is SpanStatusCode.ERROR
|
|
307
316
|
)
|
|
308
317
|
# Update cumulative values for span's ancestors.
|
|
309
318
|
for attribute_name, cumulative_attribute_name in (
|
|
@@ -336,14 +345,16 @@ class Traces:
|
|
|
336
345
|
self._token_count_total -= existing_span[LLM_TOKEN_COUNT_TOTAL] or 0
|
|
337
346
|
self._token_count_total += new_span[LLM_TOKEN_COUNT_TOTAL] or 0
|
|
338
347
|
# Update number of documents
|
|
339
|
-
num_documents_update = len(
|
|
348
|
+
num_documents_update = len(new_span.attributes.get(RETRIEVAL_DOCUMENTS) or ())
|
|
340
349
|
if existing_span:
|
|
341
|
-
num_documents_update -= len(existing_span.
|
|
350
|
+
num_documents_update -= len(existing_span.attributes.get(RETRIEVAL_DOCUMENTS) or ())
|
|
342
351
|
if num_documents_update:
|
|
343
352
|
self._num_documents[span_id] += num_documents_update
|
|
344
353
|
# Process previously orphaned spans, if any.
|
|
345
354
|
for orphan_span in self._orphan_spans.pop(span_id, ()):
|
|
346
355
|
self._process_span(orphan_span)
|
|
356
|
+
# Update last updated timestamp
|
|
357
|
+
self._last_updated_at = datetime.now(timezone.utc)
|
|
347
358
|
|
|
348
359
|
def _add_value_to_span_ancestors(
|
|
349
360
|
self,
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from .evaluators import LLMEvaluator
|
|
1
|
+
from .evaluators import InvalidEvalCriteriaError, LLMEvaluator
|
|
2
2
|
from .functions import llm_classify, llm_generate, run_relevance_eval
|
|
3
3
|
from .models import BedrockModel, LiteLLMModel, OpenAIModel, VertexAIModel
|
|
4
4
|
from .retrievals import compute_precisions_at_k
|
|
@@ -16,11 +16,13 @@ from .templates import (
|
|
|
16
16
|
TOXICITY_PROMPT_RAILS_MAP,
|
|
17
17
|
TOXICITY_PROMPT_TEMPLATE,
|
|
18
18
|
ClassificationTemplate,
|
|
19
|
+
EvalCriteria,
|
|
19
20
|
PromptTemplate,
|
|
20
21
|
)
|
|
21
22
|
from .utils import NOT_PARSABLE, download_benchmark_dataset
|
|
22
23
|
|
|
23
24
|
__all__ = [
|
|
25
|
+
"EvalCriteria",
|
|
24
26
|
"compute_precisions_at_k",
|
|
25
27
|
"download_benchmark_dataset",
|
|
26
28
|
"llm_classify",
|
|
@@ -46,4 +48,5 @@ __all__ = [
|
|
|
46
48
|
"QA_PROMPT_TEMPLATE",
|
|
47
49
|
"NOT_PARSABLE",
|
|
48
50
|
"run_relevance_eval",
|
|
51
|
+
"InvalidEvalCriteriaError",
|
|
49
52
|
]
|
|
@@ -1,15 +1,26 @@
|
|
|
1
1
|
from typing import List, Mapping, Optional, Tuple
|
|
2
2
|
|
|
3
|
+
from phoenix.exceptions import PhoenixException
|
|
3
4
|
from phoenix.experimental.evals.models import set_verbosity
|
|
4
|
-
from phoenix.experimental.evals.
|
|
5
|
+
from phoenix.experimental.evals.templates.default_templates import (
|
|
6
|
+
EvalCriteria,
|
|
7
|
+
)
|
|
8
|
+
from phoenix.experimental.evals.utils import (
|
|
9
|
+
NOT_PARSABLE,
|
|
10
|
+
openai_function_call_kwargs,
|
|
11
|
+
parse_openai_function_call,
|
|
12
|
+
snap_to_rail,
|
|
13
|
+
)
|
|
5
14
|
from phoenix.utilities.logging import printif
|
|
6
15
|
|
|
7
|
-
from .models import BaseEvalModel
|
|
16
|
+
from .models import BaseEvalModel, OpenAIModel
|
|
8
17
|
from .templates import ClassificationTemplate, PromptOptions, PromptTemplate
|
|
9
18
|
|
|
10
19
|
Record = Mapping[str, str]
|
|
11
20
|
|
|
12
|
-
|
|
21
|
+
|
|
22
|
+
class InvalidEvalCriteriaError(PhoenixException):
|
|
23
|
+
pass
|
|
13
24
|
|
|
14
25
|
|
|
15
26
|
class LLMEvaluator:
|
|
@@ -35,6 +46,7 @@ class LLMEvaluator:
|
|
|
35
46
|
self,
|
|
36
47
|
record: Record,
|
|
37
48
|
provide_explanation: bool = False,
|
|
49
|
+
use_function_calling_if_available: bool = True,
|
|
38
50
|
verbose: bool = False,
|
|
39
51
|
) -> Tuple[str, Optional[str]]:
|
|
40
52
|
"""
|
|
@@ -46,27 +58,53 @@ class LLMEvaluator:
|
|
|
46
58
|
provide_explanation (bool, optional): Whether to provide an
|
|
47
59
|
explanation.
|
|
48
60
|
|
|
61
|
+
use_function_calling_if_available (bool, optional): If True, use
|
|
62
|
+
function calling (if available) as a means to constrain the LLM
|
|
63
|
+
outputs. With function calling, the LLM is instructed to provide its
|
|
64
|
+
response as a structured JSON object, which is easier to parse.
|
|
65
|
+
|
|
66
|
+
use_function_calling_if_available (bool, optional): If True, use
|
|
67
|
+
function calling (if available) as a means to constrain the LLM
|
|
68
|
+
outputs. With function calling, the LLM is instructed to provide its
|
|
69
|
+
response as a structured JSON object, which is easier to parse.
|
|
70
|
+
|
|
49
71
|
verbose (bool, optional): Whether to print verbose output.
|
|
50
72
|
|
|
51
73
|
Returns:
|
|
52
74
|
Tuple[str, Optional[str]]: The label and explanation (if provided).
|
|
53
75
|
"""
|
|
76
|
+
use_openai_function_call = (
|
|
77
|
+
use_function_calling_if_available
|
|
78
|
+
and isinstance(self._model, OpenAIModel)
|
|
79
|
+
and self._model.supports_function_calling
|
|
80
|
+
)
|
|
54
81
|
prompt = self._template.format(
|
|
55
82
|
record, options=PromptOptions(provide_explanation=provide_explanation)
|
|
56
83
|
)
|
|
57
84
|
with set_verbosity(self._model, verbose) as verbose_model:
|
|
58
|
-
unparsed_output = verbose_model(
|
|
85
|
+
unparsed_output = verbose_model(
|
|
86
|
+
prompt,
|
|
87
|
+
**(
|
|
88
|
+
openai_function_call_kwargs(self._template.rails, provide_explanation)
|
|
89
|
+
if use_openai_function_call
|
|
90
|
+
else {}
|
|
91
|
+
),
|
|
92
|
+
)
|
|
59
93
|
label, explanation = _extract_label_and_explanation(
|
|
60
94
|
unparsed_output=unparsed_output,
|
|
61
95
|
template=self._template,
|
|
62
|
-
use_openai_function_call=False,
|
|
63
96
|
provide_explanation=provide_explanation,
|
|
97
|
+
use_openai_function_call=use_openai_function_call,
|
|
64
98
|
verbose=verbose,
|
|
65
99
|
)
|
|
66
100
|
return label, explanation
|
|
67
101
|
|
|
68
102
|
async def aevaluate(
|
|
69
|
-
self,
|
|
103
|
+
self,
|
|
104
|
+
record: Record,
|
|
105
|
+
provide_explanation: bool = False,
|
|
106
|
+
use_function_calling_if_available: bool = True,
|
|
107
|
+
verbose: bool = False,
|
|
70
108
|
) -> Tuple[str, Optional[str]]:
|
|
71
109
|
"""
|
|
72
110
|
Evaluates a single record.
|
|
@@ -77,25 +115,64 @@ class LLMEvaluator:
|
|
|
77
115
|
provide_explanation (bool, optional): Whether to provide an
|
|
78
116
|
explanation.
|
|
79
117
|
|
|
118
|
+
use_function_calling_if_available (bool, optional): If True, use
|
|
119
|
+
function calling (if available) as a means to constrain the LLM
|
|
120
|
+
outputs. With function calling, the LLM is instructed to provide its
|
|
121
|
+
response as a structured JSON object, which is easier to parse.
|
|
122
|
+
|
|
80
123
|
verbose (bool, optional): Whether to print verbose output.
|
|
81
124
|
|
|
82
125
|
Returns:
|
|
83
126
|
Tuple[str, Optional[str]]: The label and explanation (if provided).
|
|
84
127
|
"""
|
|
128
|
+
use_openai_function_call = (
|
|
129
|
+
use_function_calling_if_available
|
|
130
|
+
and isinstance(self._model, OpenAIModel)
|
|
131
|
+
and self._model.supports_function_calling
|
|
132
|
+
)
|
|
85
133
|
prompt = self._template.format(
|
|
86
134
|
record, options=PromptOptions(provide_explanation=provide_explanation)
|
|
87
135
|
)
|
|
88
136
|
with set_verbosity(self._model, verbose) as verbose_model:
|
|
89
|
-
unparsed_output = await verbose_model._async_generate(
|
|
137
|
+
unparsed_output = await verbose_model._async_generate(
|
|
138
|
+
prompt,
|
|
139
|
+
**(
|
|
140
|
+
openai_function_call_kwargs(self._template.rails, provide_explanation)
|
|
141
|
+
if use_openai_function_call
|
|
142
|
+
else {}
|
|
143
|
+
),
|
|
144
|
+
)
|
|
90
145
|
label, explanation = _extract_label_and_explanation(
|
|
91
146
|
unparsed_output=unparsed_output,
|
|
92
147
|
template=self._template,
|
|
93
|
-
use_openai_function_call=False,
|
|
94
148
|
provide_explanation=provide_explanation,
|
|
149
|
+
use_openai_function_call=use_openai_function_call,
|
|
95
150
|
verbose=verbose,
|
|
96
151
|
)
|
|
97
152
|
return label, explanation
|
|
98
153
|
|
|
154
|
+
@classmethod
|
|
155
|
+
def from_criteria(
|
|
156
|
+
cls,
|
|
157
|
+
criteria: EvalCriteria,
|
|
158
|
+
model: BaseEvalModel,
|
|
159
|
+
) -> "LLMEvaluator":
|
|
160
|
+
"""
|
|
161
|
+
Instantiates an LLMEvaluator from an eval criteria.
|
|
162
|
+
|
|
163
|
+
Args:
|
|
164
|
+
criteria (EvalCriteria): The eval criteria.
|
|
165
|
+
|
|
166
|
+
model (BaseEvalModel): The model to use for evaluation.
|
|
167
|
+
|
|
168
|
+
Returns:
|
|
169
|
+
LLMEvaluator: The instantiated evaluator.
|
|
170
|
+
"""
|
|
171
|
+
return cls(
|
|
172
|
+
model=model,
|
|
173
|
+
template=criteria.value,
|
|
174
|
+
)
|
|
175
|
+
|
|
99
176
|
|
|
100
177
|
class MapReducer:
|
|
101
178
|
"""
|