arize-phoenix 4.4.4rc3__tar.gz → 4.4.4rc4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of arize-phoenix might be problematic. Click here for more details.
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/PKG-INFO +1 -1
- arize_phoenix-4.4.4rc4/src/phoenix/datasets/evaluators/__init__.py +18 -0
- arize_phoenix-4.4.4rc4/src/phoenix/datasets/evaluators/_utils.py +13 -0
- arize_phoenix-4.4.4rc4/src/phoenix/datasets/evaluators/code_evaluators.py +127 -0
- arize_phoenix-4.4.4rc3/src/phoenix/datasets/evaluators.py → arize_phoenix-4.4.4rc4/src/phoenix/datasets/evaluators/llm_evaluators.py +19 -81
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/datasets/experiments.py +1 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/Span.py +1 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/static/index.js +519 -515
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/schemas.py +1 -2
- arize_phoenix-4.4.4rc4/src/phoenix/version.py +1 -0
- arize_phoenix-4.4.4rc3/src/phoenix/version.py +0 -1
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/.gitignore +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/IP_NOTICE +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/LICENSE +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/README.md +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/examples/manually-instrumented-chatbot/chat-service/chat/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/examples/manually-instrumented-chatbot/chat-service/chat/app.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/examples/manually-instrumented-chatbot/chat-service/chat/types.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/examples/manually-instrumented-chatbot/frontend/Dockerfile +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/examples/manually-instrumented-chatbot/frontend/Makefile +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/examples/manually-instrumented-chatbot/frontend/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/examples/manually-instrumented-chatbot/frontend/pyproject.toml +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/examples/manually-instrumented-chatbot/frontend/requirements.txt +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/examples/manually-instrumented-chatbot/frontend/schema.json +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/pyproject.toml +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/config.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/core/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/core/embedding_dimension.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/core/model.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/core/model_schema.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/core/model_schema_adapter.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/datasets/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/datasets/tracing.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/datasets/types.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/datetime_utils.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/db/README.md +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/db/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/db/alembic.ini +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/db/bulk_inserter.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/db/engines.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/db/helpers.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/db/insertion/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/db/insertion/dataset.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/db/insertion/evaluation.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/db/insertion/helpers.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/db/insertion/span.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/db/migrate.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/db/migrations/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/db/migrations/env.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/db/migrations/script.py.mako +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/db/migrations/types.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/db/migrations/versions/10460e46d750_datasets.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/db/migrations/versions/cf03bd6bae1d_init.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/db/models.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/exceptions.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/inferences/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/inferences/errors.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/inferences/fixtures.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/inferences/inferences.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/inferences/schema.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/inferences/validation.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/metrics/README.md +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/metrics/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/metrics/binning.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/metrics/metrics.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/metrics/mixins.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/metrics/retrieval_metrics.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/metrics/timeseries.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/metrics/wrappers.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/pointcloud/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/pointcloud/clustering.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/pointcloud/pointcloud.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/pointcloud/projectors.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/pointcloud/umap_parameters.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/py.typed +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/context.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/cache/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/cache/two_tier_cache.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/dataset_example_revisions.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/dataset_example_spans.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/document_evaluation_summaries.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/document_evaluations.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/document_retrieval_metrics.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/evaluation_summaries.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/experiment_annotation_summaries.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/experiment_error_rates.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/experiment_sequence_number.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/latency_ms_quantile.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/min_start_or_max_end_times.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/project_by_name.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/record_counts.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/span_descendants.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/span_evaluations.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/span_projects.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/token_counts.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/trace_evaluations.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/trace_row_ids.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/helpers/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/helpers/dataset_helpers.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/AddExamplesToDatasetInput.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/AddSpansToDatasetInput.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/ClusterInput.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/Coordinates.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/CreateDatasetInput.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/DataQualityMetricInput.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/DatasetExampleInput.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/DatasetSort.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/DatasetVersionSort.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/DeleteDatasetExamplesInput.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/DeleteDatasetInput.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/DeleteExperimentsInput.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/DimensionFilter.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/DimensionInput.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/Granularity.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/PatchDatasetExamplesInput.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/PatchDatasetInput.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/PerformanceMetricInput.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/SpanSort.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/TimeRange.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/interceptor.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/mutations/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/mutations/auth.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/mutations/dataset_mutations.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/mutations/experiment_mutations.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/mutations/export_events_mutations.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/mutations/project_mutations.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/openapi/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/openapi/main.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/openapi/schema.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/queries.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/routers/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/routers/utils.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/routers/v1/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/routers/v1/dataset_examples.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/routers/v1/datasets.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/routers/v1/evaluations.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/routers/v1/experiment_evaluations.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/routers/v1/experiment_runs.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/routers/v1/experiments.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/routers/v1/spans.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/routers/v1/traces.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/schema.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/AnnotatorKind.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/Cluster.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/CreateDatasetPayload.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/DataQualityMetric.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/Dataset.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/DatasetExample.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/DatasetExampleRevision.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/DatasetValues.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/DatasetVersion.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/Dimension.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/DimensionDataType.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/DimensionShape.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/DimensionType.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/DimensionWithValue.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/DocumentEvaluationSummary.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/DocumentRetrievalMetrics.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/EmbeddingDimension.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/EmbeddingMetadata.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/Evaluation.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/EvaluationSummary.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/Event.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/EventMetadata.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/ExampleRevisionInterface.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/Experiment.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/ExperimentAnnotationSummary.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/ExperimentComparison.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/ExperimentRun.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/ExperimentRunAnnotation.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/ExportedFile.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/Functionality.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/Inferences.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/InferencesRole.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/MimeType.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/Model.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/NumericRange.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/PerformanceMetric.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/Project.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/PromptResponse.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/Retrieval.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/ScalarDriftMetricEnum.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/Segments.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/SortDir.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/TimeSeries.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/Trace.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/UMAPPoints.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/ValidationResult.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/VectorDriftMetricEnum.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/node.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/pagination.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/app.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/grpc_server.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/main.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/openapi/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/openapi/docs.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/prometheus.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/static/apple-touch-icon-114x114.png +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/static/apple-touch-icon-120x120.png +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/static/apple-touch-icon-144x144.png +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/static/apple-touch-icon-152x152.png +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/static/apple-touch-icon-180x180.png +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/static/apple-touch-icon-72x72.png +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/static/apple-touch-icon-76x76.png +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/static/apple-touch-icon.png +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/static/favicon.ico +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/static/index.css +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/static/modernizr.js +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/telemetry.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/templates/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/templates/index.html +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/server/thread_server.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/services.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/session/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/session/client.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/session/data_extractor.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/session/evaluation.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/session/session.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/settings.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/attributes.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/dsl/README.md +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/dsl/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/dsl/filter.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/dsl/helpers.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/dsl/query.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/errors.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/evaluation_conventions.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/exporter.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/fixtures.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/langchain/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/langchain/instrumentor.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/llama_index/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/llama_index/callback.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/openai/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/openai/instrumentor.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/otel.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/projects.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/span_evaluations.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/span_json_decoder.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/span_json_encoder.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/trace_dataset.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/utils.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/v1/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/v1/evaluation_pb2.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/v1/evaluation_pb2.pyi +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/utilities/__init__.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/utilities/deprecation.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/utilities/error_handling.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/utilities/json.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/utilities/logging.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/utilities/project.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/utilities/re.py +0 -0
- {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc4}/src/phoenix/utilities/span_store.py +0 -0
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# Public surface of the evaluators package: every code-based evaluator defined
# in code_evaluators and every LLM-based evaluator imported below is re-exported
# here so callers can import them from the package directly.
from phoenix.datasets.evaluators.code_evaluators import (
    ContainsAllKeywords,
    ContainsAnyKeyword,
    ContainsKeyword,
    JSONParsable,
    MatchesRegex,
)
from phoenix.datasets.evaluators.llm_evaluators import (
    CoherenceEvaluator,
    ConcisenessEvaluator,
    HelpfulnessEvaluator,
    LLMCriteriaEvaluator,
    RelevanceEvaluator,
)

__all__ = [
    "ContainsKeyword",
    "JSONParsable",
    "ContainsAnyKeyword",
    "ContainsAllKeywords",
    "MatchesRegex",
    "CoherenceEvaluator",
    "ConcisenessEvaluator",
    "LLMCriteriaEvaluator",
    "HelpfulnessEvaluator",
    "RelevanceEvaluator",
]
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from phoenix.datasets.types import JSONSerializable
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def _unwrap_json(obj: JSONSerializable) -> JSONSerializable:
    """Return the sole value of a one-entry dict; return anything else as-is.

    A dict with exactly one key is replaced by that key's value. Dicts of any
    other size, and all non-dict inputs, are returned unchanged.
    """
    # Guard clause: only a dict holding exactly one entry gets unwrapped.
    if not isinstance(obj, dict) or len(obj) != 1:
        return obj
    (output,) = obj.values()
    # Narrows the value back to a JSON type before returning it.
    assert isinstance(
        output, (dict, list, str, int, float, bool, type(None))
    ), "Output must be JSON serializable"
    return output
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import re
|
|
5
|
+
from typing import TYPE_CHECKING, List, Optional, Union
|
|
6
|
+
|
|
7
|
+
from phoenix.datasets.evaluators._utils import _unwrap_json
|
|
8
|
+
from phoenix.datasets.types import EvaluationResult, Example, ExperimentEvaluator, ExperimentRun
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class JSONParsable:
    """Code evaluator that scores whether a run's output string parses as JSON."""

    annotator_kind = "CODE"
    name = "JSONParsable"

    def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
        """Score the run's output: 1 if ``json.loads`` accepts it, else 0.

        The output is first passed through ``_unwrap_json`` and must then be a
        string.

        Raises:
            AssertionError: if the run has no output, or the unwrapped output
                is not a string.
        """
        assert exp_run.output is not None
        output = _unwrap_json(exp_run.output.result)
        assert isinstance(output, str), "Experiment run output must be a string"
        try:
            json.loads(output)
        except (ValueError, RecursionError):
            # json.JSONDecodeError subclasses ValueError; RecursionError covers
            # pathologically nested input. Anything else (KeyboardInterrupt,
            # SystemExit, MemoryError, ...) is not a parse verdict and must
            # propagate instead of being recorded as "not JSON".
            json_parsable = False
        else:
            json_parsable = True
        return EvaluationResult(
            score=int(json_parsable),
        )
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class ContainsKeyword:
    """Code evaluator that checks whether a keyword occurs in the run's output string."""

    annotator_kind = "CODE"

    def __init__(self, keyword: str, name: Optional[str] = None) -> None:
        """Store the keyword; the evaluator name falls back to one built from its repr."""
        self.keyword = keyword
        default_name = f"Contains({repr(keyword)})"
        self.name = name if name else default_name

    def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
        """Score 1.0 when the keyword is a substring of the output, else 0.0.

        Raises:
            AssertionError: if the run has no output, or the unwrapped output
                is not a string.
        """
        assert exp_run.output is not None
        text = _unwrap_json(exp_run.output.result)
        assert isinstance(text, str), "Experiment run output must be a string"
        is_present = self.keyword in text
        verdict = "found" if is_present else "not found"
        message = f"the string {repr(self.keyword)} was {verdict} in the output"
        return EvaluationResult(score=float(is_present), explanation=message)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class ContainsAnyKeyword:
    """Code evaluator that checks whether at least one keyword occurs in the output string."""

    annotator_kind = "CODE"

    def __init__(self, keywords: List[str], name: Optional[str] = None) -> None:
        """Store the keywords; the evaluator name falls back to one built from the list."""
        self.keywords = keywords
        default_name = f"ContainsAny({keywords})"
        self.name = name if name else default_name

    def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
        """Score 1.0 when any keyword is a substring of the output, else 0.0.

        The explanation lists the keywords that were found, or all keywords
        when none were.

        Raises:
            AssertionError: if the run has no output, or the unwrapped output
                is not a string.
        """
        assert exp_run.output is not None
        text = _unwrap_json(exp_run.output.result)
        assert isinstance(text, str), "Experiment run output must be a string"
        hits = [candidate for candidate in self.keywords if candidate in text]
        explanation = (
            f"the keywords {hits} were found in the output"
            if hits
            else f"none of the keywords {self.keywords} were found in the output"
        )
        return EvaluationResult(score=float(bool(hits)), explanation=explanation)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class ContainsAllKeywords:
    """Code evaluator that checks whether every keyword occurs in the output string."""

    annotator_kind = "CODE"

    def __init__(self, keywords: List[str], name: Optional[str] = None) -> None:
        """Store the keywords; the evaluator name falls back to one built from the list."""
        self.keywords = keywords
        default_name = f"ContainsAll({keywords})"
        self.name = name if name else default_name

    def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
        """Score 1.0 when no keyword is missing from the output, else 0.0.

        The explanation lists the missing keywords, or all keywords when
        every one was found.

        Raises:
            AssertionError: if the run has no output, or the unwrapped output
                is not a string.
        """
        assert exp_run.output is not None
        text = _unwrap_json(exp_run.output.result)
        assert isinstance(text, str), "Experiment run output must be a string"
        missing = [candidate for candidate in self.keywords if candidate not in text]
        contains_all = not missing
        explanation = (
            f"all of the keywords {self.keywords} were found in the output"
            if contains_all
            else f"the keywords {missing} were not found in the output"
        )
        return EvaluationResult(score=float(contains_all), explanation=explanation)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class MatchesRegex:
    """Code evaluator that checks whether a regex finds any match in the output string."""

    annotator_kind = "CODE"

    def __init__(self, pattern: Union[str, re.Pattern[str]], name: Optional[str] = None) -> None:
        """Accept a pattern string (compiled here) or an already compiled pattern.

        The evaluator name falls back to one built from the compiled pattern.
        """
        pattern = re.compile(pattern) if isinstance(pattern, str) else pattern
        self.pattern = pattern
        assert isinstance(pattern, re.Pattern)
        self.name = name if name else f"matches_({pattern})"

    def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
        """Score 1.0 when ``findall`` returns at least one match, else 0.0.

        Raises:
            AssertionError: if the run has no output, or the unwrapped output
                is not a string.
        """
        assert exp_run.output is not None
        text = _unwrap_json(exp_run.output.result)
        assert isinstance(text, str), "Experiment run output must be a string"
        matches = self.pattern.findall(text)
        if not matches:
            explanation = f"no substrings matched the regex pattern {self.pattern.pattern}"
        else:
            explanation = (
                f"the substrings {matches} matched the regex pattern {self.pattern.pattern}"
            )
        return EvaluationResult(score=float(bool(matches)), explanation=explanation)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
# Someday we'll do type checking in unit tests.
|
|
124
|
+
if TYPE_CHECKING:
|
|
125
|
+
_: ExperimentEvaluator
|
|
126
|
+
_ = JSONParsable()
|
|
127
|
+
_ = ContainsKeyword("test")
|
|
@@ -1,70 +1,12 @@
|
|
|
1
|
-
import json
|
|
2
1
|
import re
|
|
3
|
-
from typing import
|
|
2
|
+
from typing import Callable, Optional, Type
|
|
4
3
|
|
|
5
|
-
from phoenix.datasets.
|
|
6
|
-
|
|
7
|
-
Example,
|
|
8
|
-
ExperimentEvaluator,
|
|
9
|
-
ExperimentRun,
|
|
10
|
-
JSONSerializable,
|
|
11
|
-
)
|
|
4
|
+
from phoenix.datasets.evaluators._utils import _unwrap_json
|
|
5
|
+
from phoenix.datasets.types import EvaluationResult, Example, ExperimentEvaluator, ExperimentRun
|
|
12
6
|
from phoenix.evals.models.base import BaseModel as LLMBaseModel
|
|
13
7
|
from phoenix.evals.utils import snap_to_rail
|
|
14
8
|
|
|
15
9
|
|
|
16
|
-
def _unwrap_json(obj: JSONSerializable) -> JSONSerializable:
|
|
17
|
-
if isinstance(obj, dict):
|
|
18
|
-
if len(obj) == 1:
|
|
19
|
-
key = next(iter(obj.keys()))
|
|
20
|
-
output = obj[key]
|
|
21
|
-
assert isinstance(
|
|
22
|
-
output, (dict, list, str, int, float, bool, type(None))
|
|
23
|
-
), "Output must be JSON serializable"
|
|
24
|
-
return output
|
|
25
|
-
return obj
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
class JSONParsable:
|
|
29
|
-
annotator_kind = "CODE"
|
|
30
|
-
name = "JSONParsable"
|
|
31
|
-
|
|
32
|
-
def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
|
|
33
|
-
assert exp_run.output is not None
|
|
34
|
-
output = _unwrap_json(exp_run.output.result)
|
|
35
|
-
assert isinstance(output, str), "Experiment run output must be a string"
|
|
36
|
-
try:
|
|
37
|
-
json.loads(output)
|
|
38
|
-
json_parsable = True
|
|
39
|
-
except BaseException:
|
|
40
|
-
json_parsable = False
|
|
41
|
-
return EvaluationResult(
|
|
42
|
-
score=int(json_parsable),
|
|
43
|
-
)
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
class ContainsKeyword:
|
|
47
|
-
annotator_kind = "CODE"
|
|
48
|
-
|
|
49
|
-
def __init__(self, keyword: str) -> None:
|
|
50
|
-
super().__init__()
|
|
51
|
-
self.keyword = keyword
|
|
52
|
-
self.name = f"ContainsKeyword({keyword})"
|
|
53
|
-
|
|
54
|
-
def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
|
|
55
|
-
assert exp_run.output is not None
|
|
56
|
-
result = _unwrap_json(exp_run.output.result)
|
|
57
|
-
assert isinstance(result, str), "Experiment run output must be a string"
|
|
58
|
-
found = self.keyword in result
|
|
59
|
-
return EvaluationResult(
|
|
60
|
-
score=float(found),
|
|
61
|
-
explanation=(
|
|
62
|
-
f"the string {repr(self.keyword)} was "
|
|
63
|
-
f"{'found' if found else 'not found'} in the output"
|
|
64
|
-
),
|
|
65
|
-
)
|
|
66
|
-
|
|
67
|
-
|
|
68
10
|
class LLMCriteriaEvaluator:
|
|
69
11
|
annotator_kind = "LLM"
|
|
70
12
|
_base_template = (
|
|
@@ -77,7 +19,7 @@ class LLMCriteriaEvaluator:
|
|
|
77
19
|
"EXPLANATION: *a step by step explanation of your reasoning for whether the text meets "
|
|
78
20
|
"the criteria*\n"
|
|
79
21
|
"LABEL: *true or false*\n\n"
|
|
80
|
-
"Follow this template for the following
|
|
22
|
+
"Follow this template for the following example:\n\n"
|
|
81
23
|
"CRITERIA: the text is '{criteria}'\n"
|
|
82
24
|
"TEXT: {text}\n"
|
|
83
25
|
"EXPLANATION: "
|
|
@@ -142,40 +84,43 @@ class LLMCriteriaEvaluator:
|
|
|
142
84
|
|
|
143
85
|
|
|
144
86
|
def criteria_evaluator_factory(
|
|
145
|
-
class_name: str, criteria: str, description: str
|
|
87
|
+
class_name: str, criteria: str, description: str, default_name: str
|
|
146
88
|
) -> Type[ExperimentEvaluator]:
|
|
89
|
+
def _init(self, model: LLMBaseModel, name: str = default_name) -> None: # type: ignore
|
|
90
|
+
LLMCriteriaEvaluator.__init__(self, model, criteria, description, name=name)
|
|
91
|
+
|
|
147
92
|
return type(
|
|
148
93
|
class_name,
|
|
149
94
|
(LLMCriteriaEvaluator,),
|
|
150
95
|
{
|
|
151
|
-
"__init__":
|
|
152
|
-
self, model, criteria, description, name=class_name
|
|
153
|
-
),
|
|
96
|
+
"__init__": _init,
|
|
154
97
|
"__module__": __name__,
|
|
155
|
-
"name": class_name,
|
|
156
98
|
"template": LLMCriteriaEvaluator._format_base_template(criteria, description),
|
|
157
99
|
},
|
|
158
100
|
)
|
|
159
101
|
|
|
160
102
|
|
|
161
|
-
|
|
162
|
-
class_name="
|
|
103
|
+
ConcisenessEvaluator = criteria_evaluator_factory(
|
|
104
|
+
class_name="ConcisenessEvaluator",
|
|
163
105
|
criteria="concise",
|
|
164
106
|
description="is just a few sentences and easy to follow",
|
|
107
|
+
default_name="Conciseness",
|
|
165
108
|
)
|
|
166
109
|
|
|
167
110
|
|
|
168
|
-
|
|
169
|
-
class_name="
|
|
111
|
+
HelpfulnessEvaluator = criteria_evaluator_factory(
|
|
112
|
+
class_name="HelpfulnessEvaluator",
|
|
170
113
|
criteria="helpful",
|
|
171
114
|
description="provides useful information",
|
|
115
|
+
default_name="Helpfulness",
|
|
172
116
|
)
|
|
173
117
|
|
|
174
118
|
|
|
175
|
-
|
|
176
|
-
class_name="
|
|
119
|
+
CoherenceEvaluator = criteria_evaluator_factory(
|
|
120
|
+
class_name="CoherenceEvaluator",
|
|
177
121
|
criteria="coherent",
|
|
178
|
-
description="is coherent, well-structured, and
|
|
122
|
+
description="is coherent, well-structured, and logically sound",
|
|
123
|
+
default_name="Coherence",
|
|
179
124
|
)
|
|
180
125
|
|
|
181
126
|
|
|
@@ -266,10 +211,3 @@ class RelevanceEvaluator:
|
|
|
266
211
|
formatted_template = self._format_eval_template(example, exp_run)
|
|
267
212
|
unparsed_response = await self.model._async_generate(formatted_template)
|
|
268
213
|
return self._parse_eval_output(unparsed_response)
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
# Someday we'll do typing checking in unit tests.
|
|
272
|
-
if TYPE_CHECKING:
|
|
273
|
-
_: ExperimentEvaluator
|
|
274
|
-
_ = JSONParsable()
|
|
275
|
-
_ = ContainsKeyword("test")
|
|
@@ -458,6 +458,7 @@ def _evaluate_experiment(
|
|
|
458
458
|
max_retries=0,
|
|
459
459
|
exit_on_error=False,
|
|
460
460
|
fallback_return_value=None,
|
|
461
|
+
tqdm_bar_format=get_tqdm_progress_bar_formatter("running experiment evaluations"),
|
|
461
462
|
)
|
|
462
463
|
evaluation_payloads, _execution_details = executor.run(evaluation_inputs)
|
|
463
464
|
for payload in evaluation_payloads:
|