arize-phoenix 4.4.4rc2__tar.gz → 4.4.4rc4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of arize-phoenix might be problematic. Click here for more details.
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/PKG-INFO +1 -1
- arize_phoenix-4.4.4rc4/src/phoenix/datasets/evaluators/__init__.py +18 -0
- arize_phoenix-4.4.4rc4/src/phoenix/datasets/evaluators/_utils.py +13 -0
- arize_phoenix-4.4.4rc4/src/phoenix/datasets/evaluators/code_evaluators.py +127 -0
- arize_phoenix-4.4.4rc2/src/phoenix/datasets/evaluators.py → arize_phoenix-4.4.4rc4/src/phoenix/datasets/evaluators/llm_evaluators.py +19 -81
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/datasets/experiments.py +20 -4
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/openapi/schema.py +2 -1
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/Span.py +1 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/static/index.js +519 -515
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/schemas.py +1 -2
- arize_phoenix-4.4.4rc4/src/phoenix/version.py +1 -0
- arize_phoenix-4.4.4rc2/src/phoenix/version.py +0 -1
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/.gitignore +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/IP_NOTICE +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/LICENSE +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/README.md +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/examples/manually-instrumented-chatbot/chat-service/chat/__init__.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/examples/manually-instrumented-chatbot/chat-service/chat/app.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/examples/manually-instrumented-chatbot/chat-service/chat/types.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/examples/manually-instrumented-chatbot/frontend/Dockerfile +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/examples/manually-instrumented-chatbot/frontend/Makefile +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/examples/manually-instrumented-chatbot/frontend/__init__.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/examples/manually-instrumented-chatbot/frontend/pyproject.toml +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/examples/manually-instrumented-chatbot/frontend/requirements.txt +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/examples/manually-instrumented-chatbot/frontend/schema.json +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/pyproject.toml +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/__init__.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/config.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/core/__init__.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/core/embedding_dimension.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/core/model.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/core/model_schema.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/core/model_schema_adapter.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/datasets/__init__.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/datasets/tracing.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/datasets/types.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/datetime_utils.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/db/README.md +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/db/__init__.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/db/alembic.ini +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/db/bulk_inserter.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/db/engines.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/db/helpers.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/db/insertion/__init__.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/db/insertion/dataset.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/db/insertion/evaluation.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/db/insertion/helpers.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/db/insertion/span.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/db/migrate.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/db/migrations/__init__.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/db/migrations/env.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/db/migrations/script.py.mako +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/db/migrations/types.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/db/migrations/versions/10460e46d750_datasets.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/db/migrations/versions/cf03bd6bae1d_init.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/db/models.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/exceptions.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/inferences/__init__.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/inferences/errors.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/inferences/fixtures.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/inferences/inferences.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/inferences/schema.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/inferences/validation.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/metrics/README.md +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/metrics/__init__.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/metrics/binning.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/metrics/metrics.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/metrics/mixins.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/metrics/retrieval_metrics.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/metrics/timeseries.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/metrics/wrappers.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/pointcloud/__init__.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/pointcloud/clustering.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/pointcloud/pointcloud.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/pointcloud/projectors.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/pointcloud/umap_parameters.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/py.typed +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/__init__.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/__init__.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/context.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/__init__.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/cache/__init__.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/cache/two_tier_cache.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/dataset_example_revisions.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/dataset_example_spans.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/document_evaluation_summaries.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/document_evaluations.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/document_retrieval_metrics.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/evaluation_summaries.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/experiment_annotation_summaries.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/experiment_error_rates.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/experiment_sequence_number.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/latency_ms_quantile.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/min_start_or_max_end_times.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/project_by_name.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/record_counts.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/span_descendants.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/span_evaluations.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/span_projects.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/token_counts.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/trace_evaluations.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/dataloaders/trace_row_ids.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/helpers/__init__.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/helpers/dataset_helpers.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/AddExamplesToDatasetInput.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/AddSpansToDatasetInput.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/ClusterInput.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/Coordinates.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/CreateDatasetInput.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/DataQualityMetricInput.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/DatasetExampleInput.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/DatasetSort.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/DatasetVersionSort.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/DeleteDatasetExamplesInput.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/DeleteDatasetInput.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/DeleteExperimentsInput.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/DimensionFilter.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/DimensionInput.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/Granularity.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/PatchDatasetExamplesInput.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/PatchDatasetInput.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/PerformanceMetricInput.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/SpanSort.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/TimeRange.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/input_types/__init__.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/interceptor.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/mutations/__init__.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/mutations/auth.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/mutations/dataset_mutations.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/mutations/experiment_mutations.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/mutations/export_events_mutations.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/mutations/project_mutations.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/openapi/__init__.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/openapi/main.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/queries.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/routers/__init__.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/routers/utils.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/routers/v1/__init__.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/routers/v1/dataset_examples.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/routers/v1/datasets.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/routers/v1/evaluations.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/routers/v1/experiment_evaluations.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/routers/v1/experiment_runs.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/routers/v1/experiments.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/routers/v1/spans.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/routers/v1/traces.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/schema.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/AnnotatorKind.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/Cluster.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/CreateDatasetPayload.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/DataQualityMetric.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/Dataset.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/DatasetExample.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/DatasetExampleRevision.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/DatasetValues.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/DatasetVersion.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/Dimension.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/DimensionDataType.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/DimensionShape.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/DimensionType.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/DimensionWithValue.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/DocumentEvaluationSummary.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/DocumentRetrievalMetrics.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/EmbeddingDimension.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/EmbeddingMetadata.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/Evaluation.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/EvaluationSummary.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/Event.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/EventMetadata.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/ExampleRevisionInterface.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/Experiment.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/ExperimentAnnotationSummary.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/ExperimentComparison.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/ExperimentRun.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/ExperimentRunAnnotation.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/ExportedFile.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/Functionality.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/Inferences.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/InferencesRole.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/MimeType.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/Model.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/NumericRange.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/PerformanceMetric.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/Project.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/PromptResponse.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/Retrieval.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/ScalarDriftMetricEnum.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/Segments.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/SortDir.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/TimeSeries.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/Trace.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/UMAPPoints.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/ValidationResult.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/VectorDriftMetricEnum.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/__init__.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/node.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/api/types/pagination.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/app.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/grpc_server.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/main.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/openapi/__init__.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/openapi/docs.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/prometheus.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/static/apple-touch-icon-114x114.png +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/static/apple-touch-icon-120x120.png +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/static/apple-touch-icon-144x144.png +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/static/apple-touch-icon-152x152.png +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/static/apple-touch-icon-180x180.png +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/static/apple-touch-icon-72x72.png +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/static/apple-touch-icon-76x76.png +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/static/apple-touch-icon.png +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/static/favicon.ico +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/static/index.css +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/static/modernizr.js +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/telemetry.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/templates/__init__.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/templates/index.html +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/server/thread_server.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/services.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/session/__init__.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/session/client.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/session/data_extractor.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/session/evaluation.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/session/session.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/settings.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/__init__.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/attributes.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/dsl/README.md +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/dsl/__init__.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/dsl/filter.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/dsl/helpers.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/dsl/query.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/errors.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/evaluation_conventions.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/exporter.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/fixtures.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/langchain/__init__.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/langchain/instrumentor.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/llama_index/__init__.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/llama_index/callback.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/openai/__init__.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/openai/instrumentor.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/otel.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/projects.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/span_evaluations.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/span_json_decoder.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/span_json_encoder.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/trace_dataset.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/utils.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/v1/__init__.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/v1/evaluation_pb2.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/trace/v1/evaluation_pb2.pyi +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/utilities/__init__.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/utilities/deprecation.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/utilities/error_handling.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/utilities/json.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/utilities/logging.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/utilities/project.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/utilities/re.py +0 -0
- {arize_phoenix-4.4.4rc2 → arize_phoenix-4.4.4rc4}/src/phoenix/utilities/span_store.py +0 -0
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
from phoenix.datasets.evaluators.code_evaluators import ContainsKeyword, JSONParsable
|
|
2
|
+
from phoenix.datasets.evaluators.llm_evaluators import (
|
|
3
|
+
CoherenceEvaluator,
|
|
4
|
+
ConcisenessEvaluator,
|
|
5
|
+
HelpfulnessEvaluator,
|
|
6
|
+
LLMCriteriaEvaluator,
|
|
7
|
+
RelevanceEvaluator,
|
|
8
|
+
)
|
|
9
|
+
|
|
10
|
+
__all__ = [
|
|
11
|
+
"ContainsKeyword",
|
|
12
|
+
"JSONParsable",
|
|
13
|
+
"CoherenceEvaluator",
|
|
14
|
+
"ConcisenessEvaluator",
|
|
15
|
+
"LLMCriteriaEvaluator",
|
|
16
|
+
"HelpfulnessEvaluator",
|
|
17
|
+
"RelevanceEvaluator",
|
|
18
|
+
]
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from phoenix.datasets.types import JSONSerializable
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def _unwrap_json(obj: JSONSerializable) -> JSONSerializable:
|
|
5
|
+
if isinstance(obj, dict):
|
|
6
|
+
if len(obj) == 1:
|
|
7
|
+
key = next(iter(obj.keys()))
|
|
8
|
+
output = obj[key]
|
|
9
|
+
assert isinstance(
|
|
10
|
+
output, (dict, list, str, int, float, bool, type(None))
|
|
11
|
+
), "Output must be JSON serializable"
|
|
12
|
+
return output
|
|
13
|
+
return obj
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import re
|
|
5
|
+
from typing import TYPE_CHECKING, List, Optional, Union
|
|
6
|
+
|
|
7
|
+
from phoenix.datasets.evaluators._utils import _unwrap_json
|
|
8
|
+
from phoenix.datasets.types import EvaluationResult, Example, ExperimentEvaluator, ExperimentRun
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class JSONParsable:
    """Code evaluator that scores whether a run's output string parses as JSON."""

    annotator_kind = "CODE"
    name = "JSONParsable"

    def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
        """Score the experiment run output by attempting to parse it as JSON.

        Args:
            example: The dataset example; not used by this evaluator.
            exp_run: The experiment run whose ``output.result`` is checked.

        Returns:
            An ``EvaluationResult`` with ``score`` 1 when the output parses
            as JSON and 0 otherwise.

        Raises:
            AssertionError: If the run has no output, or the output (after
                unwrapping a single-entry dict) is not a string.
        """
        assert exp_run.output is not None
        output = _unwrap_json(exp_run.output.result)
        assert isinstance(output, str), "Experiment run output must be a string"
        try:
            json.loads(output)
        except (ValueError, RecursionError):
            # json.JSONDecodeError subclasses ValueError; RecursionError covers
            # pathologically nested documents. Catching BaseException would
            # also swallow KeyboardInterrupt and SystemExit.
            json_parsable = False
        else:
            json_parsable = True
        return EvaluationResult(
            score=int(json_parsable),
        )
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class ContainsKeyword:
    """Code evaluator that checks whether the run output contains one keyword."""

    annotator_kind = "CODE"

    def __init__(self, keyword: str, name: Optional[str] = None) -> None:
        """Store the keyword and derive a display name.

        Args:
            keyword: The substring to look for in the output.
            name: Optional evaluator name; when omitted or empty, a name is
                built from the repr of ``keyword``.
        """
        self.keyword = keyword
        self.name = name if name else f"Contains({keyword!r})"

    def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
        """Score 1.0 when the keyword occurs in the output string, else 0.0.

        Args:
            example: The dataset example; not used by this evaluator.
            exp_run: The experiment run whose ``output.result`` is searched.

        Returns:
            An ``EvaluationResult`` carrying the score and an explanation
            stating whether the keyword was found.

        Raises:
            AssertionError: If the run has no output, or the output (after
                unwrapping a single-entry dict) is not a string.
        """
        assert exp_run.output is not None
        text = _unwrap_json(exp_run.output.result)
        assert isinstance(text, str), "Experiment run output must be a string"
        is_present = self.keyword in text
        outcome = "found" if is_present else "not found"
        return EvaluationResult(
            score=float(is_present),
            explanation=f"the string {self.keyword!r} was {outcome} in the output",
        )
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class ContainsAnyKeyword:
    """Code evaluator that checks whether the output contains any listed keyword."""

    annotator_kind = "CODE"

    def __init__(self, keywords: List[str], name: Optional[str] = None) -> None:
        """Store the keywords and derive a display name.

        Args:
            keywords: Substrings to look for in the output.
            name: Optional evaluator name; when omitted or empty, a name is
                built from the formatted ``keywords`` list.
        """
        self.keywords = keywords
        self.name = name if name else f"ContainsAny({keywords})"

    def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
        """Score 1.0 when at least one keyword occurs in the output, else 0.0.

        Args:
            example: The dataset example; not used by this evaluator.
            exp_run: The experiment run whose ``output.result`` is searched.

        Returns:
            An ``EvaluationResult`` whose explanation lists the keywords that
            were found, or reports that none were.

        Raises:
            AssertionError: If the run has no output, or the output (after
                unwrapping a single-entry dict) is not a string.
        """
        assert exp_run.output is not None
        text = _unwrap_json(exp_run.output.result)
        assert isinstance(text, str), "Experiment run output must be a string"
        hits = [candidate for candidate in self.keywords if candidate in text]
        explanation = (
            f"the keywords {hits} were found in the output"
            if hits
            else f"none of the keywords {self.keywords} were found in the output"
        )
        return EvaluationResult(
            score=float(bool(hits)),
            explanation=explanation,
        )
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class ContainsAllKeywords:
    """Code evaluator that checks whether the output contains every listed keyword."""

    annotator_kind = "CODE"

    def __init__(self, keywords: List[str], name: Optional[str] = None) -> None:
        """Store the keywords and derive a display name.

        Args:
            keywords: Substrings that must all appear in the output.
            name: Optional evaluator name; when omitted or empty, a name is
                built from the formatted ``keywords`` list.
        """
        self.keywords = keywords
        self.name = name if name else f"ContainsAll({keywords})"

    def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
        """Score 1.0 when every keyword occurs in the output, else 0.0.

        Args:
            example: The dataset example; not used by this evaluator.
            exp_run: The experiment run whose ``output.result`` is searched.

        Returns:
            An ``EvaluationResult`` whose explanation lists the missing
            keywords, or reports that all were found.

        Raises:
            AssertionError: If the run has no output, or the output (after
                unwrapping a single-entry dict) is not a string.
        """
        assert exp_run.output is not None
        text = _unwrap_json(exp_run.output.result)
        assert isinstance(text, str), "Experiment run output must be a string"
        missing = [candidate for candidate in self.keywords if candidate not in text]
        # An empty "missing" list means every keyword was present.
        contains_all = not missing
        explanation = (
            f"all of the keywords {self.keywords} were found in the output"
            if contains_all
            else f"the keywords {missing} were not found in the output"
        )
        return EvaluationResult(
            score=float(contains_all),
            explanation=explanation,
        )
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class MatchesRegex:
    """Code evaluator that checks whether the run output matches a regex."""

    annotator_kind = "CODE"

    def __init__(self, pattern: Union[str, re.Pattern[str]], name: Optional[str] = None) -> None:
        """Compile the pattern if needed and derive a display name.

        Args:
            pattern: A regex source string or an already compiled pattern.
            name: Optional evaluator name; when omitted or empty, a name is
                built from the compiled pattern object.

        Raises:
            AssertionError: If ``pattern`` is neither a string nor a compiled
                pattern.
        """
        if isinstance(pattern, str):
            pattern = re.compile(pattern)
        assert isinstance(pattern, re.Pattern)
        self.pattern = pattern
        # The default name formats the compiled pattern object itself, not
        # just its source string.
        self.name = name or f"matches_({pattern})"

    def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
        """Score 1.0 when the pattern matches anywhere in the output, else 0.0.

        Args:
            example: The dataset example; not used by this evaluator.
            exp_run: The experiment run whose ``output.result`` is searched.

        Returns:
            An ``EvaluationResult`` whose explanation lists the matched
            substrings, or reports that nothing matched.

        Raises:
            AssertionError: If the run has no output, or the output (after
                unwrapping a single-entry dict) is not a string.
        """
        assert exp_run.output is not None
        result = _unwrap_json(exp_run.output.result)
        assert isinstance(result, str), "Experiment run output must be a string"
        # findall() returns capture-group contents when the pattern has
        # groups; group(0) always yields the full matched substring, which is
        # what the explanation reports.
        matches = [match.group(0) for match in self.pattern.finditer(result)]
        if matches:
            explanation = (
                f"the substrings {matches} matched the regex pattern {self.pattern.pattern}"
            )
        else:
            explanation = f"no substrings matched the regex pattern {self.pattern.pattern}"
        return EvaluationResult(
            score=float(bool(matches)),
            explanation=explanation,
        )
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
# Someday we'll do type checking in unit tests.
|
|
124
|
+
if TYPE_CHECKING:
|
|
125
|
+
_: ExperimentEvaluator
|
|
126
|
+
_ = JSONParsable()
|
|
127
|
+
_ = ContainsKeyword("test")
|
|
@@ -1,70 +1,12 @@
|
|
|
1
|
-
import json
|
|
2
1
|
import re
|
|
3
|
-
from typing import
|
|
2
|
+
from typing import Callable, Optional, Type
|
|
4
3
|
|
|
5
|
-
from phoenix.datasets.
|
|
6
|
-
|
|
7
|
-
Example,
|
|
8
|
-
ExperimentEvaluator,
|
|
9
|
-
ExperimentRun,
|
|
10
|
-
JSONSerializable,
|
|
11
|
-
)
|
|
4
|
+
from phoenix.datasets.evaluators._utils import _unwrap_json
|
|
5
|
+
from phoenix.datasets.types import EvaluationResult, Example, ExperimentEvaluator, ExperimentRun
|
|
12
6
|
from phoenix.evals.models.base import BaseModel as LLMBaseModel
|
|
13
7
|
from phoenix.evals.utils import snap_to_rail
|
|
14
8
|
|
|
15
9
|
|
|
16
|
-
def _unwrap_json(obj: JSONSerializable) -> JSONSerializable:
|
|
17
|
-
if isinstance(obj, dict):
|
|
18
|
-
if len(obj) == 1:
|
|
19
|
-
key = next(iter(obj.keys()))
|
|
20
|
-
output = obj[key]
|
|
21
|
-
assert isinstance(
|
|
22
|
-
output, (dict, list, str, int, float, bool, type(None))
|
|
23
|
-
), "Output must be JSON serializable"
|
|
24
|
-
return output
|
|
25
|
-
return obj
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
class JSONParsable:
|
|
29
|
-
annotator_kind = "CODE"
|
|
30
|
-
name = "JSONParsable"
|
|
31
|
-
|
|
32
|
-
def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
|
|
33
|
-
assert exp_run.output is not None
|
|
34
|
-
output = _unwrap_json(exp_run.output.result)
|
|
35
|
-
assert isinstance(output, str), "Experiment run output must be a string"
|
|
36
|
-
try:
|
|
37
|
-
json.loads(output)
|
|
38
|
-
json_parsable = True
|
|
39
|
-
except BaseException:
|
|
40
|
-
json_parsable = False
|
|
41
|
-
return EvaluationResult(
|
|
42
|
-
score=int(json_parsable),
|
|
43
|
-
)
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
class ContainsKeyword:
|
|
47
|
-
annotator_kind = "CODE"
|
|
48
|
-
|
|
49
|
-
def __init__(self, keyword: str) -> None:
|
|
50
|
-
super().__init__()
|
|
51
|
-
self.keyword = keyword
|
|
52
|
-
self.name = f"ContainsKeyword({keyword})"
|
|
53
|
-
|
|
54
|
-
def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
|
|
55
|
-
assert exp_run.output is not None
|
|
56
|
-
result = _unwrap_json(exp_run.output.result)
|
|
57
|
-
assert isinstance(result, str), "Experiment run output must be a string"
|
|
58
|
-
found = self.keyword in result
|
|
59
|
-
return EvaluationResult(
|
|
60
|
-
score=float(found),
|
|
61
|
-
explanation=(
|
|
62
|
-
f"the string {repr(self.keyword)} was "
|
|
63
|
-
f"{'found' if found else 'not found'} in the output"
|
|
64
|
-
),
|
|
65
|
-
)
|
|
66
|
-
|
|
67
|
-
|
|
68
10
|
class LLMCriteriaEvaluator:
|
|
69
11
|
annotator_kind = "LLM"
|
|
70
12
|
_base_template = (
|
|
@@ -77,7 +19,7 @@ class LLMCriteriaEvaluator:
|
|
|
77
19
|
"EXPLANATION: *a step by step explanation of your reasoning for whether the text meets "
|
|
78
20
|
"the criteria*\n"
|
|
79
21
|
"LABEL: *true or false*\n\n"
|
|
80
|
-
"Follow this template for the following
|
|
22
|
+
"Follow this template for the following example:\n\n"
|
|
81
23
|
"CRITERIA: the text is '{criteria}'\n"
|
|
82
24
|
"TEXT: {text}\n"
|
|
83
25
|
"EXPLANATION: "
|
|
@@ -142,40 +84,43 @@ class LLMCriteriaEvaluator:
|
|
|
142
84
|
|
|
143
85
|
|
|
144
86
|
def criteria_evaluator_factory(
    class_name: str, criteria: str, description: str, default_name: str
) -> Type[ExperimentEvaluator]:
    """Create an ``LLMCriteriaEvaluator`` subclass bound to a single criterion.

    Args:
        class_name: name given to the generated class.
        criteria: criterion forwarded to ``LLMCriteriaEvaluator``.
        description: description of the criterion, forwarded likewise.
        default_name: evaluator name used when the generated class is
            instantiated without an explicit ``name``.

    Returns:
        The newly created subclass of ``LLMCriteriaEvaluator``.
    """

    def _init(self, model: LLMBaseModel, name: str = default_name) -> None:  # type: ignore
        # The criterion and description are fixed by the factory; only the
        # model and (optionally) the name are supplied per instance.
        LLMCriteriaEvaluator.__init__(self, model, criteria, description, name=name)

    namespace = {
        "__init__": _init,
        "__module__": __name__,
        "template": LLMCriteriaEvaluator._format_base_template(criteria, description),
    }
    return type(class_name, (LLMCriteriaEvaluator,), namespace)
|
|
159
101
|
|
|
160
102
|
|
|
161
|
-
|
|
162
|
-
class_name="
|
|
103
|
+
ConcisenessEvaluator = criteria_evaluator_factory(
|
|
104
|
+
class_name="ConcisenessEvaluator",
|
|
163
105
|
criteria="concise",
|
|
164
106
|
description="is just a few sentences and easy to follow",
|
|
107
|
+
default_name="Conciseness",
|
|
165
108
|
)
|
|
166
109
|
|
|
167
110
|
|
|
168
|
-
|
|
169
|
-
class_name="
|
|
111
|
+
HelpfulnessEvaluator = criteria_evaluator_factory(
|
|
112
|
+
class_name="HelpfulnessEvaluator",
|
|
170
113
|
criteria="helpful",
|
|
171
114
|
description="provides useful information",
|
|
115
|
+
default_name="Helpfulness",
|
|
172
116
|
)
|
|
173
117
|
|
|
174
118
|
|
|
175
|
-
|
|
176
|
-
class_name="
|
|
119
|
+
CoherenceEvaluator = criteria_evaluator_factory(
|
|
120
|
+
class_name="CoherenceEvaluator",
|
|
177
121
|
criteria="coherent",
|
|
178
|
-
description="is coherent, well-structured, and
|
|
122
|
+
description="is coherent, well-structured, and logically sound",
|
|
123
|
+
default_name="Coherence",
|
|
179
124
|
)
|
|
180
125
|
|
|
181
126
|
|
|
@@ -266,10 +211,3 @@ class RelevanceEvaluator:
|
|
|
266
211
|
formatted_template = self._format_eval_template(example, exp_run)
|
|
267
212
|
unparsed_response = await self.model._async_generate(formatted_template)
|
|
268
213
|
return self._parse_eval_output(unparsed_response)
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
# Someday we'll do typing checking in unit tests.
|
|
272
|
-
if TYPE_CHECKING:
|
|
273
|
-
_: ExperimentEvaluator
|
|
274
|
-
_ = JSONParsable()
|
|
275
|
-
_ = ContainsKeyword("test")
|
|
@@ -61,6 +61,7 @@ from phoenix.datasets.types import (
|
|
|
61
61
|
from phoenix.evals.executors import get_executor_on_sync_context
|
|
62
62
|
from phoenix.evals.models.rate_limiters import RateLimiter
|
|
63
63
|
from phoenix.evals.utils import get_tqdm_progress_bar_formatter
|
|
64
|
+
from phoenix.session.session import active_session
|
|
64
65
|
from phoenix.trace.attributes import flatten
|
|
65
66
|
from phoenix.utilities.json import jsonify
|
|
66
67
|
|
|
@@ -78,12 +79,23 @@ def _get_base_url() -> str:
|
|
|
78
79
|
return base_url if base_url.endswith("/") else base_url + "/"
|
|
79
80
|
|
|
80
81
|
|
|
82
|
+
def _get_web_base_url() -> str:
    """Return the web UI base URL, always ending with a slash.

    The URL of the active session is preferred; without an active session
    the value of ``_get_base_url()`` is used.

    Returns:
        str: the web UI base URL with a trailing "/"
    """
    session = active_session()
    url = session.url if session else _get_base_url()
    # Callers in this module append path segments directly to the result, so
    # guarantee the trailing slash on both paths rather than relying on the
    # session URL already carrying one.
    return url if url.endswith("/") else url + "/"
|
|
91
|
+
|
|
92
|
+
|
|
81
93
|
def _get_experiment_url(*, dataset_id: str, experiment_id: str) -> str:
    """Build the web UI link to a dataset's compare view for one experiment."""
    web_root = _get_web_base_url()
    return f"{web_root}datasets/{dataset_id}/compare?experimentId={experiment_id}"
|
|
83
95
|
|
|
84
96
|
|
|
85
97
|
def _get_dataset_experiments_url(*, dataset_id: str) -> str:
    """Build the web UI link to the experiments page of a dataset."""
    web_root = _get_web_base_url()
    return f"{web_root}datasets/{dataset_id}/experiments"
|
|
87
99
|
|
|
88
100
|
|
|
89
101
|
def _phoenix_client() -> httpx.Client:
|
|
@@ -134,7 +146,9 @@ def run_experiment(
|
|
|
134
146
|
|
|
135
147
|
dataset_experiments_url = _get_dataset_experiments_url(dataset_id=dataset.id)
|
|
136
148
|
experiment_compare_url = _get_experiment_url(dataset_id=dataset.id, experiment_id=experiment_id)
|
|
137
|
-
print(
|
|
149
|
+
print("🧪 Experiment started.")
|
|
150
|
+
print(f"📺 View dataset experiments: {dataset_experiments_url}")
|
|
151
|
+
print(f"🔗 View this experiment: {experiment_compare_url}")
|
|
138
152
|
|
|
139
153
|
errors: Tuple[Optional[Type[BaseException]], ...]
|
|
140
154
|
if not hasattr(rate_limit_errors, "__iter__"):
|
|
@@ -278,7 +292,8 @@ def run_experiment(
|
|
|
278
292
|
project_name=project_name,
|
|
279
293
|
)
|
|
280
294
|
|
|
281
|
-
print(
|
|
295
|
+
print("✅ Task runs completed.")
|
|
296
|
+
print("🧠 Evaluation started.")
|
|
282
297
|
|
|
283
298
|
if evaluators is not None:
|
|
284
299
|
_evaluate_experiment(experiment, evaluators, dataset.examples, client)
|
|
@@ -443,6 +458,7 @@ def _evaluate_experiment(
|
|
|
443
458
|
max_retries=0,
|
|
444
459
|
exit_on_error=False,
|
|
445
460
|
fallback_return_value=None,
|
|
461
|
+
tqdm_bar_format=get_tqdm_progress_bar_formatter("running experiment evaluations"),
|
|
446
462
|
)
|
|
447
463
|
evaluation_payloads, _execution_details = executor.run(evaluation_inputs)
|
|
448
464
|
for payload in evaluation_payloads:
|
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
from typing import Any
|
|
2
2
|
|
|
3
|
-
from phoenix.server.api.routers.v1 import V1_ROUTES
|
|
4
3
|
from starlette.schemas import SchemaGenerator
|
|
5
4
|
|
|
5
|
+
from phoenix.server.api.routers.v1 import V1_ROUTES
|
|
6
|
+
|
|
6
7
|
OPENAPI_SCHEMA_GENERATOR = SchemaGenerator(
|
|
7
8
|
{"openapi": "3.0.0", "info": {"title": "Arize-Phoenix API", "version": "1.0"}}
|
|
8
9
|
)
|