arize-phoenix 4.4.4rc3__tar.gz → 4.4.4rc5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of arize-phoenix might be problematic. Click here for more details.
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/PKG-INFO +2 -2
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/pyproject.toml +2 -1
- arize_phoenix-4.4.4rc5/src/phoenix/datasets/evaluators/__init__.py +18 -0
- arize_phoenix-4.4.4rc5/src/phoenix/datasets/evaluators/code_evaluators.py +99 -0
- arize_phoenix-4.4.4rc3/src/phoenix/datasets/evaluators.py → arize_phoenix-4.4.4rc5/src/phoenix/datasets/evaluators/llm_evaluators.py +75 -106
- arize_phoenix-4.4.4rc5/src/phoenix/datasets/evaluators/utils.py +292 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/datasets/experiments.py +148 -82
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/datasets/tracing.py +19 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/datasets/types.py +18 -52
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/db/insertion/dataset.py +19 -16
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/db/migrations/versions/10460e46d750_datasets.py +2 -2
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/db/models.py +8 -3
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/context.py +2 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/__init__.py +2 -0
- arize_phoenix-4.4.4rc5/src/phoenix/server/api/dataloaders/experiment_run_counts.py +42 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/helpers/dataset_helpers.py +8 -7
- arize_phoenix-4.4.4rc5/src/phoenix/server/api/input_types/ClearProjectInput.py +15 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/mutations/project_mutations.py +9 -4
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/routers/v1/datasets.py +146 -42
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/routers/v1/experiment_evaluations.py +1 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/routers/v1/experiment_runs.py +2 -2
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/Experiment.py +5 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/ExperimentRun.py +1 -1
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/ExperimentRunAnnotation.py +1 -1
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/Span.py +1 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/app.py +2 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/static/index.js +638 -588
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/session/client.py +124 -2
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/schemas.py +1 -2
- arize_phoenix-4.4.4rc5/src/phoenix/version.py +1 -0
- arize_phoenix-4.4.4rc3/src/phoenix/version.py +0 -1
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/.gitignore +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/IP_NOTICE +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/LICENSE +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/README.md +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/examples/manually-instrumented-chatbot/chat-service/chat/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/examples/manually-instrumented-chatbot/chat-service/chat/app.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/examples/manually-instrumented-chatbot/chat-service/chat/types.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/examples/manually-instrumented-chatbot/frontend/Dockerfile +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/examples/manually-instrumented-chatbot/frontend/Makefile +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/examples/manually-instrumented-chatbot/frontend/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/examples/manually-instrumented-chatbot/frontend/pyproject.toml +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/examples/manually-instrumented-chatbot/frontend/requirements.txt +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/examples/manually-instrumented-chatbot/frontend/schema.json +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/config.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/core/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/core/embedding_dimension.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/core/model.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/core/model_schema.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/core/model_schema_adapter.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/datasets/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/datetime_utils.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/db/README.md +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/db/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/db/alembic.ini +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/db/bulk_inserter.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/db/engines.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/db/helpers.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/db/insertion/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/db/insertion/evaluation.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/db/insertion/helpers.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/db/insertion/span.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/db/migrate.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/db/migrations/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/db/migrations/env.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/db/migrations/script.py.mako +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/db/migrations/types.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/db/migrations/versions/cf03bd6bae1d_init.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/exceptions.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/inferences/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/inferences/errors.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/inferences/fixtures.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/inferences/inferences.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/inferences/schema.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/inferences/validation.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/metrics/README.md +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/metrics/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/metrics/binning.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/metrics/metrics.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/metrics/mixins.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/metrics/retrieval_metrics.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/metrics/timeseries.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/metrics/wrappers.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/pointcloud/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/pointcloud/clustering.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/pointcloud/pointcloud.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/pointcloud/projectors.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/pointcloud/umap_parameters.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/py.typed +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/cache/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/cache/two_tier_cache.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/dataset_example_revisions.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/dataset_example_spans.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/document_evaluation_summaries.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/document_evaluations.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/document_retrieval_metrics.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/evaluation_summaries.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/experiment_annotation_summaries.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/experiment_error_rates.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/experiment_sequence_number.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/latency_ms_quantile.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/min_start_or_max_end_times.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/project_by_name.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/record_counts.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/span_descendants.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/span_evaluations.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/span_projects.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/token_counts.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/trace_evaluations.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/trace_row_ids.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/helpers/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/AddExamplesToDatasetInput.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/AddSpansToDatasetInput.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/ClusterInput.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/Coordinates.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/CreateDatasetInput.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/DataQualityMetricInput.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/DatasetExampleInput.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/DatasetSort.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/DatasetVersionSort.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/DeleteDatasetExamplesInput.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/DeleteDatasetInput.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/DeleteExperimentsInput.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/DimensionFilter.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/DimensionInput.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/Granularity.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/PatchDatasetExamplesInput.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/PatchDatasetInput.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/PerformanceMetricInput.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/SpanSort.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/TimeRange.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/interceptor.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/mutations/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/mutations/auth.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/mutations/dataset_mutations.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/mutations/experiment_mutations.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/mutations/export_events_mutations.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/openapi/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/openapi/main.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/openapi/schema.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/queries.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/routers/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/routers/utils.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/routers/v1/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/routers/v1/dataset_examples.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/routers/v1/evaluations.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/routers/v1/experiments.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/routers/v1/spans.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/routers/v1/traces.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/schema.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/AnnotatorKind.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/Cluster.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/CreateDatasetPayload.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/DataQualityMetric.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/Dataset.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/DatasetExample.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/DatasetExampleRevision.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/DatasetValues.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/DatasetVersion.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/Dimension.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/DimensionDataType.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/DimensionShape.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/DimensionType.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/DimensionWithValue.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/DocumentEvaluationSummary.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/DocumentRetrievalMetrics.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/EmbeddingDimension.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/EmbeddingMetadata.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/Evaluation.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/EvaluationSummary.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/Event.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/EventMetadata.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/ExampleRevisionInterface.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/ExperimentAnnotationSummary.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/ExperimentComparison.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/ExportedFile.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/Functionality.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/Inferences.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/InferencesRole.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/MimeType.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/Model.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/NumericRange.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/PerformanceMetric.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/Project.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/PromptResponse.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/Retrieval.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/ScalarDriftMetricEnum.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/Segments.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/SortDir.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/TimeSeries.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/Trace.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/UMAPPoints.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/ValidationResult.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/VectorDriftMetricEnum.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/node.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/pagination.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/grpc_server.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/main.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/openapi/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/openapi/docs.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/prometheus.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/static/apple-touch-icon-114x114.png +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/static/apple-touch-icon-120x120.png +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/static/apple-touch-icon-144x144.png +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/static/apple-touch-icon-152x152.png +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/static/apple-touch-icon-180x180.png +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/static/apple-touch-icon-72x72.png +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/static/apple-touch-icon-76x76.png +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/static/apple-touch-icon.png +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/static/favicon.ico +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/static/index.css +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/static/modernizr.js +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/telemetry.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/templates/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/templates/index.html +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/thread_server.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/services.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/session/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/session/data_extractor.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/session/evaluation.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/session/session.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/settings.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/attributes.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/dsl/README.md +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/dsl/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/dsl/filter.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/dsl/helpers.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/dsl/query.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/errors.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/evaluation_conventions.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/exporter.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/fixtures.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/langchain/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/langchain/instrumentor.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/llama_index/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/llama_index/callback.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/openai/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/openai/instrumentor.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/otel.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/projects.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/span_evaluations.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/span_json_decoder.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/span_json_encoder.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/trace_dataset.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/utils.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/v1/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/v1/evaluation_pb2.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/v1/evaluation_pb2.pyi +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/utilities/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/utilities/deprecation.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/utilities/error_handling.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/utilities/json.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/utilities/logging.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/utilities/project.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/utilities/re.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/utilities/span_store.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: arize-phoenix
|
|
3
|
-
Version: 4.4.4rc3
|
|
3
|
+
Version: 4.4.4rc5
|
|
4
4
|
Summary: AI Observability and Evaluation
|
|
5
5
|
Project-URL: Documentation, https://docs.arize.com/phoenix/
|
|
6
6
|
Project-URL: Issues, https://github.com/Arize-ai/phoenix/issues
|
|
@@ -31,7 +31,7 @@ Requires-Dist: openinference-instrumentation
|
|
|
31
31
|
Requires-Dist: openinference-instrumentation-langchain>=0.1.12
|
|
32
32
|
Requires-Dist: openinference-instrumentation-llama-index>=1.2.0
|
|
33
33
|
Requires-Dist: openinference-instrumentation-openai>=0.1.4
|
|
34
|
-
Requires-Dist: openinference-semantic-conventions>=0.1.
|
|
34
|
+
Requires-Dist: openinference-semantic-conventions>=0.1.9
|
|
35
35
|
Requires-Dist: opentelemetry-exporter-otlp
|
|
36
36
|
Requires-Dist: opentelemetry-proto>=1.12.0
|
|
37
37
|
Requires-Dist: opentelemetry-sdk
|
|
@@ -46,7 +46,7 @@ dependencies = [
|
|
|
46
46
|
"opentelemetry-proto>=1.12.0", # needed to avoid this issue: https://github.com/Arize-ai/phoenix/issues/2695
|
|
47
47
|
"opentelemetry-exporter-otlp",
|
|
48
48
|
"opentelemetry-semantic-conventions",
|
|
49
|
-
"openinference-semantic-conventions>=0.1.
|
|
49
|
+
"openinference-semantic-conventions>=0.1.9",
|
|
50
50
|
"openinference-instrumentation",
|
|
51
51
|
"openinference-instrumentation-langchain>=0.1.12",
|
|
52
52
|
"openinference-instrumentation-llama-index>=1.2.0",
|
|
@@ -206,6 +206,7 @@ dependencies = [
|
|
|
206
206
|
[tool.hatch.envs.docs]
|
|
207
207
|
detached = true
|
|
208
208
|
dependencies = [
|
|
209
|
+
"pyment",
|
|
209
210
|
"interrogate",
|
|
210
211
|
]
|
|
211
212
|
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
from phoenix.datasets.evaluators.code_evaluators import ContainsKeyword, JSONParsable
|
|
2
|
+
from phoenix.datasets.evaluators.llm_evaluators import (
|
|
3
|
+
CoherenceEvaluator,
|
|
4
|
+
ConcisenessEvaluator,
|
|
5
|
+
HelpfulnessEvaluator,
|
|
6
|
+
LLMCriteriaEvaluator,
|
|
7
|
+
RelevanceEvaluator,
|
|
8
|
+
)
|
|
9
|
+
|
|
10
|
+
__all__ = [
|
|
11
|
+
"ContainsKeyword",
|
|
12
|
+
"JSONParsable",
|
|
13
|
+
"CoherenceEvaluator",
|
|
14
|
+
"ConcisenessEvaluator",
|
|
15
|
+
"LLMCriteriaEvaluator",
|
|
16
|
+
"HelpfulnessEvaluator",
|
|
17
|
+
"RelevanceEvaluator",
|
|
18
|
+
]
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import re
|
|
5
|
+
from typing import Any, List, Optional, Union
|
|
6
|
+
|
|
7
|
+
from phoenix.datasets.evaluators.utils import Evaluator
|
|
8
|
+
from phoenix.datasets.types import EvaluationResult, TaskOutput
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class JSONParsable(Evaluator):
|
|
12
|
+
def evaluate(self, *, output: Optional[TaskOutput] = None, **_: Any) -> EvaluationResult:
|
|
13
|
+
assert isinstance(output, str), "Experiment run output must be a string"
|
|
14
|
+
try:
|
|
15
|
+
json.loads(output)
|
|
16
|
+
json_parsable = True
|
|
17
|
+
except BaseException:
|
|
18
|
+
json_parsable = False
|
|
19
|
+
return EvaluationResult(
|
|
20
|
+
score=int(json_parsable),
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class ContainsKeyword(Evaluator):
|
|
25
|
+
def __init__(self, keyword: str, name: Optional[str] = None) -> None:
|
|
26
|
+
self.keyword = keyword
|
|
27
|
+
self._name = name or f"Contains({repr(keyword)})"
|
|
28
|
+
|
|
29
|
+
def evaluate(self, *, output: Optional[TaskOutput] = None, **_: Any) -> EvaluationResult:
|
|
30
|
+
assert isinstance(output, str), "Experiment run output must be a string"
|
|
31
|
+
found = self.keyword in output
|
|
32
|
+
return EvaluationResult(
|
|
33
|
+
score=float(found),
|
|
34
|
+
explanation=(
|
|
35
|
+
f"the string {repr(self.keyword)} was "
|
|
36
|
+
f"{'found' if found else 'not found'} in the output"
|
|
37
|
+
),
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class ContainsAnyKeyword(Evaluator):
|
|
42
|
+
def __init__(self, keywords: List[str], name: Optional[str] = None) -> None:
|
|
43
|
+
self.keywords = keywords
|
|
44
|
+
self._name = name or f"ContainsAny({keywords})"
|
|
45
|
+
|
|
46
|
+
def evaluate(self, *, output: Optional[TaskOutput] = None, **_: Any) -> EvaluationResult:
|
|
47
|
+
assert isinstance(output, str), "Experiment run output must be a string"
|
|
48
|
+
found = [keyword for keyword in self.keywords if keyword in output]
|
|
49
|
+
if found:
|
|
50
|
+
explanation = f"the keywords {found} were found in the output"
|
|
51
|
+
else:
|
|
52
|
+
explanation = f"none of the keywords {self.keywords} were found in the output"
|
|
53
|
+
return EvaluationResult(
|
|
54
|
+
score=float(bool(found)),
|
|
55
|
+
explanation=explanation,
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class ContainsAllKeywords(Evaluator):
    """Score a run by whether its output contains every one of several keywords.

    Each keyword is tested with a plain, case-sensitive substring check.
    """

    def __init__(self, keywords: List[str], name: Optional[str] = None) -> None:
        """Remember the keywords and pick the evaluator's name.

        Args:
            keywords: Substrings that must all appear in the run output.
            name: Optional explicit name; when falsy, a name of the form
                ``ContainsAll([...])`` is generated from the keyword list.
        """
        self.keywords = keywords
        # NOTE(review): `_name` is presumably consumed by the `Evaluator` base
        # class -- confirm there.
        self._name = name if name else f"ContainsAll({keywords})"

    def evaluate(self, *, output: Optional[TaskOutput] = None, **_: Any) -> EvaluationResult:
        """Return a score of 1.0 if no keyword is missing from ``output``, else 0.0.

        The explanation lists the missing keywords on failure, or all
        configured keywords on success.

        Args:
            output: The run output; must be a ``str``.
            **_: Any other keyword arguments are accepted and ignored.

        Raises:
            AssertionError: If ``output`` is not a string.
        """
        assert isinstance(output, str), "Experiment run output must be a string"
        missing = [candidate for candidate in self.keywords if candidate not in output]
        if missing:
            return EvaluationResult(
                score=0.0,
                explanation=f"the keywords {missing} were not found in the output",
            )
        return EvaluationResult(
            score=1.0,
            explanation=f"all of the keywords {self.keywords} were found in the output",
        )
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class MatchesRegex(Evaluator):
    """Score a run by whether a regular expression matches anywhere in its output."""

    def __init__(self, pattern: Union[str, re.Pattern[str]], name: Optional[str] = None) -> None:
        """Compile the pattern if needed and pick the evaluator's name.

        Args:
            pattern: A regex source string (compiled with ``re.compile``) or an
                already compiled pattern.
            name: Optional explicit name; when falsy, a name of the form
                ``matches_(<compiled pattern>)`` is generated.

        Raises:
            AssertionError: If ``pattern`` is neither a string nor a compiled
                pattern.
        """
        if isinstance(pattern, str):
            pattern = re.compile(pattern)
        self.pattern = pattern
        assert isinstance(pattern, re.Pattern)
        # NOTE(review): `_name` is presumably consumed by the `Evaluator` base
        # class -- confirm there.
        self._name = name or f"matches_({pattern})"

    def evaluate(self, *, output: Optional[TaskOutput] = None, **_: Any) -> EvaluationResult:
        """Return a score of 1.0 if the pattern matches somewhere in ``output``, else 0.0.

        The explanation lists every non-overlapping matched substring.

        Args:
            output: The run output; must be a ``str``.
            **_: Any other keyword arguments are accepted and ignored.

        Raises:
            AssertionError: If ``output`` is not a string.
        """
        assert isinstance(output, str), "Experiment run output must be a string"
        # `findall` returns the capture groups (or tuples of them) instead of
        # the matched text when the pattern contains groups, which made the
        # explanation report things like [''] as "substrings". Collecting
        # `group(0)` from `finditer` always yields the full matched text, and
        # produces a non-empty list exactly when `findall` would.
        matches = [match.group(0) for match in self.pattern.finditer(output)]
        if matches:
            explanation = (
                f"the substrings {matches} matched the regex pattern {self.pattern.pattern}"
            )
        else:
            explanation = f"no substrings matched the regex pattern {self.pattern.pattern}"
        return EvaluationResult(
            score=float(bool(matches)),
            explanation=explanation,
        )
|
|
@@ -1,72 +1,23 @@
|
|
|
1
|
-
import json
|
|
2
1
|
import re
|
|
3
|
-
from
|
|
2
|
+
from types import MappingProxyType
|
|
3
|
+
from typing import Any, Callable, Optional, Type
|
|
4
4
|
|
|
5
|
+
from phoenix.datasets.evaluators.utils import (
|
|
6
|
+
ExampleInput,
|
|
7
|
+
ExampleMetadata,
|
|
8
|
+
ExperimentEvaluator,
|
|
9
|
+
LLMEvaluator,
|
|
10
|
+
_unwrap_json,
|
|
11
|
+
)
|
|
5
12
|
from phoenix.datasets.types import (
|
|
6
13
|
EvaluationResult,
|
|
7
|
-
|
|
8
|
-
ExperimentEvaluator,
|
|
9
|
-
ExperimentRun,
|
|
10
|
-
JSONSerializable,
|
|
14
|
+
TaskOutput,
|
|
11
15
|
)
|
|
12
16
|
from phoenix.evals.models.base import BaseModel as LLMBaseModel
|
|
13
17
|
from phoenix.evals.utils import snap_to_rail
|
|
14
18
|
|
|
15
19
|
|
|
16
|
-
|
|
17
|
-
if isinstance(obj, dict):
|
|
18
|
-
if len(obj) == 1:
|
|
19
|
-
key = next(iter(obj.keys()))
|
|
20
|
-
output = obj[key]
|
|
21
|
-
assert isinstance(
|
|
22
|
-
output, (dict, list, str, int, float, bool, type(None))
|
|
23
|
-
), "Output must be JSON serializable"
|
|
24
|
-
return output
|
|
25
|
-
return obj
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
class JSONParsable:
|
|
29
|
-
annotator_kind = "CODE"
|
|
30
|
-
name = "JSONParsable"
|
|
31
|
-
|
|
32
|
-
def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
|
|
33
|
-
assert exp_run.output is not None
|
|
34
|
-
output = _unwrap_json(exp_run.output.result)
|
|
35
|
-
assert isinstance(output, str), "Experiment run output must be a string"
|
|
36
|
-
try:
|
|
37
|
-
json.loads(output)
|
|
38
|
-
json_parsable = True
|
|
39
|
-
except BaseException:
|
|
40
|
-
json_parsable = False
|
|
41
|
-
return EvaluationResult(
|
|
42
|
-
score=int(json_parsable),
|
|
43
|
-
)
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
class ContainsKeyword:
|
|
47
|
-
annotator_kind = "CODE"
|
|
48
|
-
|
|
49
|
-
def __init__(self, keyword: str) -> None:
|
|
50
|
-
super().__init__()
|
|
51
|
-
self.keyword = keyword
|
|
52
|
-
self.name = f"ContainsKeyword({keyword})"
|
|
53
|
-
|
|
54
|
-
def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
|
|
55
|
-
assert exp_run.output is not None
|
|
56
|
-
result = _unwrap_json(exp_run.output.result)
|
|
57
|
-
assert isinstance(result, str), "Experiment run output must be a string"
|
|
58
|
-
found = self.keyword in result
|
|
59
|
-
return EvaluationResult(
|
|
60
|
-
score=float(found),
|
|
61
|
-
explanation=(
|
|
62
|
-
f"the string {repr(self.keyword)} was "
|
|
63
|
-
f"{'found' if found else 'not found'} in the output"
|
|
64
|
-
),
|
|
65
|
-
)
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
class LLMCriteriaEvaluator:
|
|
69
|
-
annotator_kind = "LLM"
|
|
20
|
+
class LLMCriteriaEvaluator(LLMEvaluator):
|
|
70
21
|
_base_template = (
|
|
71
22
|
"Determine if the following text is {criteria}. {description}"
|
|
72
23
|
"First, explain step-by-step why you think the text is or is not {criteria}. Then provide "
|
|
@@ -77,7 +28,7 @@ class LLMCriteriaEvaluator:
|
|
|
77
28
|
"EXPLANATION: *a step by step explanation of your reasoning for whether the text meets "
|
|
78
29
|
"the criteria*\n"
|
|
79
30
|
"LABEL: *true or false*\n\n"
|
|
80
|
-
"Follow this template for the following
|
|
31
|
+
"Follow this template for the following example:\n\n"
|
|
81
32
|
"CRITERIA: the text is '{criteria}'\n"
|
|
82
33
|
"TEXT: {text}\n"
|
|
83
34
|
"EXPLANATION: "
|
|
@@ -95,21 +46,23 @@ class LLMCriteriaEvaluator:
|
|
|
95
46
|
self.criteria = criteria
|
|
96
47
|
self.description = description
|
|
97
48
|
self.template = self._format_base_template(self.criteria, self.description)
|
|
98
|
-
self.
|
|
49
|
+
self._name = name
|
|
99
50
|
|
|
100
|
-
def evaluate(self,
|
|
101
|
-
formatted_template = self._format_eval_template(
|
|
51
|
+
def evaluate(self, *, output: Optional[TaskOutput] = None, **_: Any) -> EvaluationResult:
|
|
52
|
+
formatted_template = self._format_eval_template(output)
|
|
102
53
|
unparsed_response = self.model._generate(formatted_template)
|
|
103
54
|
return self._parse_eval_output(unparsed_response)
|
|
104
55
|
|
|
105
|
-
async def async_evaluate(
|
|
106
|
-
|
|
56
|
+
async def async_evaluate(
|
|
57
|
+
self, *, output: Optional[TaskOutput] = None, **_: Any
|
|
58
|
+
) -> EvaluationResult:
|
|
59
|
+
formatted_template = self._format_eval_template(output)
|
|
107
60
|
unparsed_response = await self.model._async_generate(formatted_template)
|
|
108
61
|
return self._parse_eval_output(unparsed_response)
|
|
109
62
|
|
|
110
|
-
def _format_eval_template(self,
|
|
111
|
-
assert
|
|
112
|
-
result = _unwrap_json(
|
|
63
|
+
def _format_eval_template(self, output: TaskOutput) -> str:
|
|
64
|
+
assert output is not None
|
|
65
|
+
result = _unwrap_json(output)
|
|
113
66
|
return self.template.format(text=str(result))
|
|
114
67
|
|
|
115
68
|
def _parse_eval_output(self, unparsed_response: str) -> EvaluationResult:
|
|
@@ -142,40 +95,43 @@ class LLMCriteriaEvaluator:
|
|
|
142
95
|
|
|
143
96
|
|
|
144
97
|
def criteria_evaluator_factory(
|
|
145
|
-
class_name: str, criteria: str, description: str
|
|
98
|
+
class_name: str, criteria: str, description: str, default_name: str
|
|
146
99
|
) -> Type[ExperimentEvaluator]:
|
|
100
|
+
def _init(self, model: LLMBaseModel, name: str = default_name) -> None: # type: ignore
|
|
101
|
+
LLMCriteriaEvaluator.__init__(self, model, criteria, description, name=name)
|
|
102
|
+
|
|
147
103
|
return type(
|
|
148
104
|
class_name,
|
|
149
105
|
(LLMCriteriaEvaluator,),
|
|
150
106
|
{
|
|
151
|
-
"__init__":
|
|
152
|
-
self, model, criteria, description, name=class_name
|
|
153
|
-
),
|
|
107
|
+
"__init__": _init,
|
|
154
108
|
"__module__": __name__,
|
|
155
|
-
"name": class_name,
|
|
156
109
|
"template": LLMCriteriaEvaluator._format_base_template(criteria, description),
|
|
157
110
|
},
|
|
158
111
|
)
|
|
159
112
|
|
|
160
113
|
|
|
161
|
-
|
|
162
|
-
class_name="
|
|
114
|
+
ConcisenessEvaluator = criteria_evaluator_factory(
|
|
115
|
+
class_name="ConcisenessEvaluator",
|
|
163
116
|
criteria="concise",
|
|
164
117
|
description="is just a few sentences and easy to follow",
|
|
118
|
+
default_name="Conciseness",
|
|
165
119
|
)
|
|
166
120
|
|
|
167
121
|
|
|
168
|
-
|
|
169
|
-
class_name="
|
|
122
|
+
HelpfulnessEvaluator = criteria_evaluator_factory(
|
|
123
|
+
class_name="HelpfulnessEvaluator",
|
|
170
124
|
criteria="helpful",
|
|
171
125
|
description="provides useful information",
|
|
126
|
+
default_name="Helpfulness",
|
|
172
127
|
)
|
|
173
128
|
|
|
174
129
|
|
|
175
|
-
|
|
176
|
-
class_name="
|
|
130
|
+
CoherenceEvaluator = criteria_evaluator_factory(
|
|
131
|
+
class_name="CoherenceEvaluator",
|
|
177
132
|
criteria="coherent",
|
|
178
|
-
description="is coherent, well-structured, and
|
|
133
|
+
description="is coherent, well-structured, and logically sound",
|
|
134
|
+
default_name="Coherence",
|
|
179
135
|
)
|
|
180
136
|
|
|
181
137
|
|
|
@@ -192,8 +148,7 @@ def _parse_label_from_explanation(raw_string: str) -> str:
|
|
|
192
148
|
return raw_string
|
|
193
149
|
|
|
194
150
|
|
|
195
|
-
class RelevanceEvaluator:
|
|
196
|
-
annotator_kind = "LLM"
|
|
151
|
+
class RelevanceEvaluator(LLMEvaluator):
|
|
197
152
|
template = (
|
|
198
153
|
"Determine if the following response is relevant to the query. In this context, "
|
|
199
154
|
"'relevance' means that the response directly addresses the core question or topic of the "
|
|
@@ -217,19 +172,24 @@ class RelevanceEvaluator:
|
|
|
217
172
|
def __init__(
|
|
218
173
|
self,
|
|
219
174
|
model: LLMBaseModel,
|
|
220
|
-
get_query: Optional[Callable[[
|
|
221
|
-
get_response: Optional[Callable[[
|
|
175
|
+
get_query: Optional[Callable[[ExampleInput, ExampleMetadata], str]] = None,
|
|
176
|
+
get_response: Optional[Callable[[Optional[TaskOutput], ExampleMetadata], str]] = None,
|
|
222
177
|
name: str = "RelevanceEvaluator",
|
|
223
178
|
):
|
|
224
179
|
self.model = model
|
|
225
|
-
self.
|
|
180
|
+
self._name = name
|
|
226
181
|
self.get_query = get_query or self._default_get_query
|
|
227
182
|
self.get_response = get_response or self._default_get_response
|
|
228
183
|
|
|
229
|
-
def _format_eval_template(
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
184
|
+
def _format_eval_template(
|
|
185
|
+
self,
|
|
186
|
+
output: Optional[TaskOutput] = None,
|
|
187
|
+
input: ExampleInput = MappingProxyType({}),
|
|
188
|
+
metadata: ExampleMetadata = MappingProxyType({}),
|
|
189
|
+
) -> str:
|
|
190
|
+
assert output is not None
|
|
191
|
+
query = self.get_query(input, metadata)
|
|
192
|
+
response = self.get_response(output, metadata)
|
|
233
193
|
return self.template.format(query=query, response=response)
|
|
234
194
|
|
|
235
195
|
def _parse_eval_output(self, unparsed_response: str) -> EvaluationResult:
|
|
@@ -250,26 +210,35 @@ class RelevanceEvaluator:
|
|
|
250
210
|
metadata={},
|
|
251
211
|
)
|
|
252
212
|
|
|
253
|
-
def _default_get_query(self,
|
|
254
|
-
return str(
|
|
213
|
+
def _default_get_query(self, input: ExampleInput, *args: Any, **kwargs: Any) -> str:
|
|
214
|
+
return str(input)
|
|
255
215
|
|
|
256
|
-
def _default_get_response(
|
|
257
|
-
|
|
258
|
-
|
|
216
|
+
def _default_get_response(
|
|
217
|
+
self, output: Optional[TaskOutput] = None, *args: Any, **kwargs: Any
|
|
218
|
+
) -> str:
|
|
219
|
+
assert output is not None
|
|
220
|
+
return str(_unwrap_json(output))
|
|
259
221
|
|
|
260
|
-
def evaluate(
|
|
261
|
-
|
|
222
|
+
def evaluate(
|
|
223
|
+
self,
|
|
224
|
+
*,
|
|
225
|
+
output: Optional[TaskOutput] = None,
|
|
226
|
+
metadata: ExampleMetadata = MappingProxyType({}),
|
|
227
|
+
input: ExampleInput = MappingProxyType({}),
|
|
228
|
+
**_: Any,
|
|
229
|
+
) -> EvaluationResult:
|
|
230
|
+
formatted_template = self._format_eval_template(output, input, metadata)
|
|
262
231
|
unparsed_response = self.model._generate(formatted_template)
|
|
263
232
|
return self._parse_eval_output(unparsed_response)
|
|
264
233
|
|
|
265
|
-
async def async_evaluate(
|
|
266
|
-
|
|
234
|
+
async def async_evaluate(
|
|
235
|
+
self,
|
|
236
|
+
*,
|
|
237
|
+
output: Optional[TaskOutput] = None,
|
|
238
|
+
metadata: ExampleMetadata = MappingProxyType({}),
|
|
239
|
+
input: ExampleInput = MappingProxyType({}),
|
|
240
|
+
**_: Any,
|
|
241
|
+
) -> EvaluationResult:
|
|
242
|
+
formatted_template = self._format_eval_template(output, input, metadata)
|
|
267
243
|
unparsed_response = await self.model._async_generate(formatted_template)
|
|
268
244
|
return self._parse_eval_output(unparsed_response)
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
# Someday we'll do typing checking in unit tests.
|
|
272
|
-
if TYPE_CHECKING:
|
|
273
|
-
_: ExperimentEvaluator
|
|
274
|
-
_ = JSONParsable()
|
|
275
|
-
_ = ContainsKeyword("test")
|