arize-phoenix 3.16.1__py3-none-any.whl → 7.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of arize-phoenix might be problematic.
- arize_phoenix-7.7.1.dist-info/METADATA +261 -0
- arize_phoenix-7.7.1.dist-info/RECORD +345 -0
- {arize_phoenix-3.16.1.dist-info → arize_phoenix-7.7.1.dist-info}/WHEEL +1 -1
- arize_phoenix-7.7.1.dist-info/entry_points.txt +3 -0
- phoenix/__init__.py +86 -14
- phoenix/auth.py +309 -0
- phoenix/config.py +675 -45
- phoenix/core/model.py +32 -30
- phoenix/core/model_schema.py +102 -109
- phoenix/core/model_schema_adapter.py +48 -45
- phoenix/datetime_utils.py +24 -3
- phoenix/db/README.md +54 -0
- phoenix/db/__init__.py +4 -0
- phoenix/db/alembic.ini +85 -0
- phoenix/db/bulk_inserter.py +294 -0
- phoenix/db/engines.py +208 -0
- phoenix/db/enums.py +20 -0
- phoenix/db/facilitator.py +113 -0
- phoenix/db/helpers.py +159 -0
- phoenix/db/insertion/constants.py +2 -0
- phoenix/db/insertion/dataset.py +227 -0
- phoenix/db/insertion/document_annotation.py +171 -0
- phoenix/db/insertion/evaluation.py +191 -0
- phoenix/db/insertion/helpers.py +98 -0
- phoenix/db/insertion/span.py +193 -0
- phoenix/db/insertion/span_annotation.py +158 -0
- phoenix/db/insertion/trace_annotation.py +158 -0
- phoenix/db/insertion/types.py +256 -0
- phoenix/db/migrate.py +86 -0
- phoenix/db/migrations/data_migration_scripts/populate_project_sessions.py +199 -0
- phoenix/db/migrations/env.py +114 -0
- phoenix/db/migrations/script.py.mako +26 -0
- phoenix/db/migrations/versions/10460e46d750_datasets.py +317 -0
- phoenix/db/migrations/versions/3be8647b87d8_add_token_columns_to_spans_table.py +126 -0
- phoenix/db/migrations/versions/4ded9e43755f_create_project_sessions_table.py +66 -0
- phoenix/db/migrations/versions/cd164e83824f_users_and_tokens.py +157 -0
- phoenix/db/migrations/versions/cf03bd6bae1d_init.py +280 -0
- phoenix/db/models.py +807 -0
- phoenix/exceptions.py +5 -1
- phoenix/experiments/__init__.py +6 -0
- phoenix/experiments/evaluators/__init__.py +29 -0
- phoenix/experiments/evaluators/base.py +158 -0
- phoenix/experiments/evaluators/code_evaluators.py +184 -0
- phoenix/experiments/evaluators/llm_evaluators.py +473 -0
- phoenix/experiments/evaluators/utils.py +236 -0
- phoenix/experiments/functions.py +772 -0
- phoenix/experiments/tracing.py +86 -0
- phoenix/experiments/types.py +726 -0
- phoenix/experiments/utils.py +25 -0
- phoenix/inferences/__init__.py +0 -0
- phoenix/{datasets → inferences}/errors.py +6 -5
- phoenix/{datasets → inferences}/fixtures.py +49 -42
- phoenix/{datasets/dataset.py → inferences/inferences.py} +121 -105
- phoenix/{datasets → inferences}/schema.py +11 -11
- phoenix/{datasets → inferences}/validation.py +13 -14
- phoenix/logging/__init__.py +3 -0
- phoenix/logging/_config.py +90 -0
- phoenix/logging/_filter.py +6 -0
- phoenix/logging/_formatter.py +69 -0
- phoenix/metrics/__init__.py +5 -4
- phoenix/metrics/binning.py +4 -3
- phoenix/metrics/metrics.py +2 -1
- phoenix/metrics/mixins.py +7 -6
- phoenix/metrics/retrieval_metrics.py +2 -1
- phoenix/metrics/timeseries.py +5 -4
- phoenix/metrics/wrappers.py +9 -3
- phoenix/pointcloud/clustering.py +5 -5
- phoenix/pointcloud/pointcloud.py +7 -5
- phoenix/pointcloud/projectors.py +5 -6
- phoenix/pointcloud/umap_parameters.py +53 -52
- phoenix/server/api/README.md +28 -0
- phoenix/server/api/auth.py +44 -0
- phoenix/server/api/context.py +152 -9
- phoenix/server/api/dataloaders/__init__.py +91 -0
- phoenix/server/api/dataloaders/annotation_summaries.py +139 -0
- phoenix/server/api/dataloaders/average_experiment_run_latency.py +54 -0
- phoenix/server/api/dataloaders/cache/__init__.py +3 -0
- phoenix/server/api/dataloaders/cache/two_tier_cache.py +68 -0
- phoenix/server/api/dataloaders/dataset_example_revisions.py +131 -0
- phoenix/server/api/dataloaders/dataset_example_spans.py +38 -0
- phoenix/server/api/dataloaders/document_evaluation_summaries.py +144 -0
- phoenix/server/api/dataloaders/document_evaluations.py +31 -0
- phoenix/server/api/dataloaders/document_retrieval_metrics.py +89 -0
- phoenix/server/api/dataloaders/experiment_annotation_summaries.py +79 -0
- phoenix/server/api/dataloaders/experiment_error_rates.py +58 -0
- phoenix/server/api/dataloaders/experiment_run_annotations.py +36 -0
- phoenix/server/api/dataloaders/experiment_run_counts.py +49 -0
- phoenix/server/api/dataloaders/experiment_sequence_number.py +44 -0
- phoenix/server/api/dataloaders/latency_ms_quantile.py +188 -0
- phoenix/server/api/dataloaders/min_start_or_max_end_times.py +85 -0
- phoenix/server/api/dataloaders/project_by_name.py +31 -0
- phoenix/server/api/dataloaders/record_counts.py +116 -0
- phoenix/server/api/dataloaders/session_io.py +79 -0
- phoenix/server/api/dataloaders/session_num_traces.py +30 -0
- phoenix/server/api/dataloaders/session_num_traces_with_error.py +32 -0
- phoenix/server/api/dataloaders/session_token_usages.py +41 -0
- phoenix/server/api/dataloaders/session_trace_latency_ms_quantile.py +55 -0
- phoenix/server/api/dataloaders/span_annotations.py +26 -0
- phoenix/server/api/dataloaders/span_dataset_examples.py +31 -0
- phoenix/server/api/dataloaders/span_descendants.py +57 -0
- phoenix/server/api/dataloaders/span_projects.py +33 -0
- phoenix/server/api/dataloaders/token_counts.py +124 -0
- phoenix/server/api/dataloaders/trace_by_trace_ids.py +25 -0
- phoenix/server/api/dataloaders/trace_root_spans.py +32 -0
- phoenix/server/api/dataloaders/user_roles.py +30 -0
- phoenix/server/api/dataloaders/users.py +33 -0
- phoenix/server/api/exceptions.py +48 -0
- phoenix/server/api/helpers/__init__.py +12 -0
- phoenix/server/api/helpers/dataset_helpers.py +217 -0
- phoenix/server/api/helpers/experiment_run_filters.py +763 -0
- phoenix/server/api/helpers/playground_clients.py +948 -0
- phoenix/server/api/helpers/playground_registry.py +70 -0
- phoenix/server/api/helpers/playground_spans.py +455 -0
- phoenix/server/api/input_types/AddExamplesToDatasetInput.py +16 -0
- phoenix/server/api/input_types/AddSpansToDatasetInput.py +14 -0
- phoenix/server/api/input_types/ChatCompletionInput.py +38 -0
- phoenix/server/api/input_types/ChatCompletionMessageInput.py +24 -0
- phoenix/server/api/input_types/ClearProjectInput.py +15 -0
- phoenix/server/api/input_types/ClusterInput.py +2 -2
- phoenix/server/api/input_types/CreateDatasetInput.py +12 -0
- phoenix/server/api/input_types/CreateSpanAnnotationInput.py +18 -0
- phoenix/server/api/input_types/CreateTraceAnnotationInput.py +18 -0
- phoenix/server/api/input_types/DataQualityMetricInput.py +5 -2
- phoenix/server/api/input_types/DatasetExampleInput.py +14 -0
- phoenix/server/api/input_types/DatasetSort.py +17 -0
- phoenix/server/api/input_types/DatasetVersionSort.py +16 -0
- phoenix/server/api/input_types/DeleteAnnotationsInput.py +7 -0
- phoenix/server/api/input_types/DeleteDatasetExamplesInput.py +13 -0
- phoenix/server/api/input_types/DeleteDatasetInput.py +7 -0
- phoenix/server/api/input_types/DeleteExperimentsInput.py +7 -0
- phoenix/server/api/input_types/DimensionFilter.py +4 -4
- phoenix/server/api/input_types/GenerativeModelInput.py +17 -0
- phoenix/server/api/input_types/Granularity.py +1 -1
- phoenix/server/api/input_types/InvocationParameters.py +162 -0
- phoenix/server/api/input_types/PatchAnnotationInput.py +19 -0
- phoenix/server/api/input_types/PatchDatasetExamplesInput.py +35 -0
- phoenix/server/api/input_types/PatchDatasetInput.py +14 -0
- phoenix/server/api/input_types/PerformanceMetricInput.py +5 -2
- phoenix/server/api/input_types/ProjectSessionSort.py +29 -0
- phoenix/server/api/input_types/SpanAnnotationSort.py +17 -0
- phoenix/server/api/input_types/SpanSort.py +134 -69
- phoenix/server/api/input_types/TemplateOptions.py +10 -0
- phoenix/server/api/input_types/TraceAnnotationSort.py +17 -0
- phoenix/server/api/input_types/UserRoleInput.py +9 -0
- phoenix/server/api/mutations/__init__.py +28 -0
- phoenix/server/api/mutations/api_key_mutations.py +167 -0
- phoenix/server/api/mutations/chat_mutations.py +593 -0
- phoenix/server/api/mutations/dataset_mutations.py +591 -0
- phoenix/server/api/mutations/experiment_mutations.py +75 -0
- phoenix/server/api/{types/ExportEventsMutation.py → mutations/export_events_mutations.py} +21 -18
- phoenix/server/api/mutations/project_mutations.py +57 -0
- phoenix/server/api/mutations/span_annotations_mutations.py +128 -0
- phoenix/server/api/mutations/trace_annotations_mutations.py +127 -0
- phoenix/server/api/mutations/user_mutations.py +329 -0
- phoenix/server/api/openapi/__init__.py +0 -0
- phoenix/server/api/openapi/main.py +17 -0
- phoenix/server/api/openapi/schema.py +16 -0
- phoenix/server/api/queries.py +738 -0
- phoenix/server/api/routers/__init__.py +11 -0
- phoenix/server/api/routers/auth.py +284 -0
- phoenix/server/api/routers/embeddings.py +26 -0
- phoenix/server/api/routers/oauth2.py +488 -0
- phoenix/server/api/routers/v1/__init__.py +64 -0
- phoenix/server/api/routers/v1/datasets.py +1017 -0
- phoenix/server/api/routers/v1/evaluations.py +362 -0
- phoenix/server/api/routers/v1/experiment_evaluations.py +115 -0
- phoenix/server/api/routers/v1/experiment_runs.py +167 -0
- phoenix/server/api/routers/v1/experiments.py +308 -0
- phoenix/server/api/routers/v1/pydantic_compat.py +78 -0
- phoenix/server/api/routers/v1/spans.py +267 -0
- phoenix/server/api/routers/v1/traces.py +208 -0
- phoenix/server/api/routers/v1/utils.py +95 -0
- phoenix/server/api/schema.py +44 -241
- phoenix/server/api/subscriptions.py +597 -0
- phoenix/server/api/types/Annotation.py +21 -0
- phoenix/server/api/types/AnnotationSummary.py +55 -0
- phoenix/server/api/types/AnnotatorKind.py +16 -0
- phoenix/server/api/types/ApiKey.py +27 -0
- phoenix/server/api/types/AuthMethod.py +9 -0
- phoenix/server/api/types/ChatCompletionMessageRole.py +11 -0
- phoenix/server/api/types/ChatCompletionSubscriptionPayload.py +46 -0
- phoenix/server/api/types/Cluster.py +25 -24
- phoenix/server/api/types/CreateDatasetPayload.py +8 -0
- phoenix/server/api/types/DataQualityMetric.py +31 -13
- phoenix/server/api/types/Dataset.py +288 -63
- phoenix/server/api/types/DatasetExample.py +85 -0
- phoenix/server/api/types/DatasetExampleRevision.py +34 -0
- phoenix/server/api/types/DatasetVersion.py +14 -0
- phoenix/server/api/types/Dimension.py +32 -31
- phoenix/server/api/types/DocumentEvaluationSummary.py +9 -8
- phoenix/server/api/types/EmbeddingDimension.py +56 -49
- phoenix/server/api/types/Evaluation.py +25 -31
- phoenix/server/api/types/EvaluationSummary.py +30 -50
- phoenix/server/api/types/Event.py +20 -20
- phoenix/server/api/types/ExampleRevisionInterface.py +14 -0
- phoenix/server/api/types/Experiment.py +152 -0
- phoenix/server/api/types/ExperimentAnnotationSummary.py +13 -0
- phoenix/server/api/types/ExperimentComparison.py +17 -0
- phoenix/server/api/types/ExperimentRun.py +119 -0
- phoenix/server/api/types/ExperimentRunAnnotation.py +56 -0
- phoenix/server/api/types/GenerativeModel.py +9 -0
- phoenix/server/api/types/GenerativeProvider.py +85 -0
- phoenix/server/api/types/Inferences.py +80 -0
- phoenix/server/api/types/InferencesRole.py +23 -0
- phoenix/server/api/types/LabelFraction.py +7 -0
- phoenix/server/api/types/MimeType.py +2 -2
- phoenix/server/api/types/Model.py +54 -54
- phoenix/server/api/types/PerformanceMetric.py +8 -5
- phoenix/server/api/types/Project.py +407 -142
- phoenix/server/api/types/ProjectSession.py +139 -0
- phoenix/server/api/types/Segments.py +4 -4
- phoenix/server/api/types/Span.py +221 -176
- phoenix/server/api/types/SpanAnnotation.py +43 -0
- phoenix/server/api/types/SpanIOValue.py +15 -0
- phoenix/server/api/types/SystemApiKey.py +9 -0
- phoenix/server/api/types/TemplateLanguage.py +10 -0
- phoenix/server/api/types/TimeSeries.py +19 -15
- phoenix/server/api/types/TokenUsage.py +11 -0
- phoenix/server/api/types/Trace.py +154 -0
- phoenix/server/api/types/TraceAnnotation.py +45 -0
- phoenix/server/api/types/UMAPPoints.py +7 -7
- phoenix/server/api/types/User.py +60 -0
- phoenix/server/api/types/UserApiKey.py +45 -0
- phoenix/server/api/types/UserRole.py +15 -0
- phoenix/server/api/types/node.py +4 -112
- phoenix/server/api/types/pagination.py +156 -57
- phoenix/server/api/utils.py +34 -0
- phoenix/server/app.py +864 -115
- phoenix/server/bearer_auth.py +163 -0
- phoenix/server/dml_event.py +136 -0
- phoenix/server/dml_event_handler.py +256 -0
- phoenix/server/email/__init__.py +0 -0
- phoenix/server/email/sender.py +97 -0
- phoenix/server/email/templates/__init__.py +0 -0
- phoenix/server/email/templates/password_reset.html +19 -0
- phoenix/server/email/types.py +11 -0
- phoenix/server/grpc_server.py +102 -0
- phoenix/server/jwt_store.py +505 -0
- phoenix/server/main.py +305 -116
- phoenix/server/oauth2.py +52 -0
- phoenix/server/openapi/__init__.py +0 -0
- phoenix/server/prometheus.py +111 -0
- phoenix/server/rate_limiters.py +188 -0
- phoenix/server/static/.vite/manifest.json +87 -0
- phoenix/server/static/assets/components-Cy9nwIvF.js +2125 -0
- phoenix/server/static/assets/index-BKvHIxkk.js +113 -0
- phoenix/server/static/assets/pages-CUi2xCVQ.js +4449 -0
- phoenix/server/static/assets/vendor-DvC8cT4X.js +894 -0
- phoenix/server/static/assets/vendor-DxkFTwjz.css +1 -0
- phoenix/server/static/assets/vendor-arizeai-Do1793cv.js +662 -0
- phoenix/server/static/assets/vendor-codemirror-BzwZPyJM.js +24 -0
- phoenix/server/static/assets/vendor-recharts-_Jb7JjhG.js +59 -0
- phoenix/server/static/assets/vendor-shiki-Cl9QBraO.js +5 -0
- phoenix/server/static/assets/vendor-three-DwGkEfCM.js +2998 -0
- phoenix/server/telemetry.py +68 -0
- phoenix/server/templates/index.html +82 -23
- phoenix/server/thread_server.py +3 -3
- phoenix/server/types.py +275 -0
- phoenix/services.py +27 -18
- phoenix/session/client.py +743 -68
- phoenix/session/data_extractor.py +31 -7
- phoenix/session/evaluation.py +3 -9
- phoenix/session/session.py +263 -219
- phoenix/settings.py +22 -0
- phoenix/trace/__init__.py +2 -22
- phoenix/trace/attributes.py +338 -0
- phoenix/trace/dsl/README.md +116 -0
- phoenix/trace/dsl/filter.py +663 -213
- phoenix/trace/dsl/helpers.py +73 -21
- phoenix/trace/dsl/query.py +574 -201
- phoenix/trace/exporter.py +24 -19
- phoenix/trace/fixtures.py +368 -32
- phoenix/trace/otel.py +71 -219
- phoenix/trace/projects.py +3 -2
- phoenix/trace/schemas.py +33 -11
- phoenix/trace/span_evaluations.py +21 -16
- phoenix/trace/span_json_decoder.py +6 -4
- phoenix/trace/span_json_encoder.py +2 -2
- phoenix/trace/trace_dataset.py +47 -32
- phoenix/trace/utils.py +21 -4
- phoenix/utilities/__init__.py +0 -26
- phoenix/utilities/client.py +132 -0
- phoenix/utilities/deprecation.py +31 -0
- phoenix/utilities/error_handling.py +3 -2
- phoenix/utilities/json.py +109 -0
- phoenix/utilities/logging.py +8 -0
- phoenix/utilities/project.py +2 -2
- phoenix/utilities/re.py +49 -0
- phoenix/utilities/span_store.py +0 -23
- phoenix/utilities/template_formatters.py +99 -0
- phoenix/version.py +1 -1
- arize_phoenix-3.16.1.dist-info/METADATA +0 -495
- arize_phoenix-3.16.1.dist-info/RECORD +0 -178
- phoenix/core/project.py +0 -619
- phoenix/core/traces.py +0 -96
- phoenix/experimental/evals/__init__.py +0 -73
- phoenix/experimental/evals/evaluators.py +0 -413
- phoenix/experimental/evals/functions/__init__.py +0 -4
- phoenix/experimental/evals/functions/classify.py +0 -453
- phoenix/experimental/evals/functions/executor.py +0 -353
- phoenix/experimental/evals/functions/generate.py +0 -138
- phoenix/experimental/evals/functions/processing.py +0 -76
- phoenix/experimental/evals/models/__init__.py +0 -14
- phoenix/experimental/evals/models/anthropic.py +0 -175
- phoenix/experimental/evals/models/base.py +0 -170
- phoenix/experimental/evals/models/bedrock.py +0 -221
- phoenix/experimental/evals/models/litellm.py +0 -134
- phoenix/experimental/evals/models/openai.py +0 -448
- phoenix/experimental/evals/models/rate_limiters.py +0 -246
- phoenix/experimental/evals/models/vertex.py +0 -173
- phoenix/experimental/evals/models/vertexai.py +0 -186
- phoenix/experimental/evals/retrievals.py +0 -96
- phoenix/experimental/evals/templates/__init__.py +0 -50
- phoenix/experimental/evals/templates/default_templates.py +0 -472
- phoenix/experimental/evals/templates/template.py +0 -195
- phoenix/experimental/evals/utils/__init__.py +0 -172
- phoenix/experimental/evals/utils/threads.py +0 -27
- phoenix/server/api/helpers.py +0 -11
- phoenix/server/api/routers/evaluation_handler.py +0 -109
- phoenix/server/api/routers/span_handler.py +0 -70
- phoenix/server/api/routers/trace_handler.py +0 -60
- phoenix/server/api/types/DatasetRole.py +0 -23
- phoenix/server/static/index.css +0 -6
- phoenix/server/static/index.js +0 -7447
- phoenix/storage/span_store/__init__.py +0 -23
- phoenix/storage/span_store/text_file.py +0 -85
- phoenix/trace/dsl/missing.py +0 -60
- phoenix/trace/langchain/__init__.py +0 -3
- phoenix/trace/langchain/instrumentor.py +0 -35
- phoenix/trace/llama_index/__init__.py +0 -3
- phoenix/trace/llama_index/callback.py +0 -102
- phoenix/trace/openai/__init__.py +0 -3
- phoenix/trace/openai/instrumentor.py +0 -30
- {arize_phoenix-3.16.1.dist-info → arize_phoenix-7.7.1.dist-info}/licenses/IP_NOTICE +0 -0
- {arize_phoenix-3.16.1.dist-info → arize_phoenix-7.7.1.dist-info}/licenses/LICENSE +0 -0
- /phoenix/{datasets → db/insertion}/__init__.py +0 -0
- /phoenix/{experimental → db/migrations}/__init__.py +0 -0
- /phoenix/{storage → db/migrations/data_migration_scripts}/__init__.py +0 -0
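
The renames above imply import-path changes for downstream code. The sketch below is a rough before/after inferred only from the moved file paths (for example phoenix/{datasets/dataset.py → inferences/inferences.py}); the class names shown are assumptions rather than something this diff states, and the phoenix/experimental/evals tree listed under the deletions is gone from this wheel entirely (see the deleted classify.py hunk below).

# Hedged sketch of the module rename implied by the file listing; the class
# names (Dataset, Schema, Inferences) are inferred from the renamed files.

# arize-phoenix 3.16.1 (paths removed by this release):
#   from phoenix.datasets.dataset import Dataset
#   from phoenix.datasets.schema import Schema

# arize-phoenix 7.7.1 (renamed module):
from phoenix.inferences.inferences import Inferences
from phoenix.inferences.schema import Schema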
--- a/phoenix/experimental/evals/functions/classify.py
+++ /dev/null
@@ -1,453 +0,0 @@
-from __future__ import annotations
-
-import logging
-import warnings
-from collections import defaultdict
-from itertools import product
-from typing import (
-    Any,
-    DefaultDict,
-    Dict,
-    Iterable,
-    List,
-    Mapping,
-    NamedTuple,
-    Optional,
-    Tuple,
-    Union,
-    cast,
-)
-
-import pandas as pd
-from openinference.semconv.trace import DocumentAttributes, SpanAttributes
-from pandas import DataFrame
-from typing_extensions import TypeAlias
-
-from phoenix.experimental.evals.evaluators import LLMEvaluator
-from phoenix.experimental.evals.functions.executor import get_executor_on_sync_context
-from phoenix.experimental.evals.models import BaseEvalModel, OpenAIModel, set_verbosity
-from phoenix.experimental.evals.templates import (
-    RAG_RELEVANCY_PROMPT_RAILS_MAP,
-    RAG_RELEVANCY_PROMPT_TEMPLATE,
-    ClassificationTemplate,
-    PromptOptions,
-    PromptTemplate,
-    map_template,
-    normalize_classification_template,
-)
-from phoenix.experimental.evals.utils import (
-    NOT_PARSABLE,
-    get_tqdm_progress_bar_formatter,
-    openai_function_call_kwargs,
-    parse_openai_function_call,
-    snap_to_rail,
-)
-from phoenix.utilities.logging import printif
-
-DOCUMENT_CONTENT = DocumentAttributes.DOCUMENT_CONTENT
-INPUT_VALUE = SpanAttributes.INPUT_VALUE
-RETRIEVAL_DOCUMENTS = SpanAttributes.RETRIEVAL_DOCUMENTS
-
-logger = logging.getLogger(__name__)
-
-
-OPENINFERENCE_QUERY_COLUMN_NAME = "attributes." + INPUT_VALUE
-OPENINFERENCE_DOCUMENT_COLUMN_NAME = "attributes." + RETRIEVAL_DOCUMENTS
-
-ColumnName: TypeAlias = str
-Label: TypeAlias = str
-Score: TypeAlias = Optional[float]
-Explanation: TypeAlias = Optional[str]
-Record: TypeAlias = Mapping[str, Any]
-Index: TypeAlias = int
-
-# snapped_response, explanation, response
-ParsedLLMResponse: TypeAlias = Tuple[Optional[str], Optional[str], str]
-
-
-def llm_classify(
-    dataframe: pd.DataFrame,
-    model: BaseEvalModel,
-    template: Union[ClassificationTemplate, PromptTemplate, str],
-    rails: List[str],
-    system_instruction: Optional[str] = None,
-    verbose: bool = False,
-    use_function_calling_if_available: bool = True,
-    provide_explanation: bool = False,
-    include_prompt: bool = False,
-    include_response: bool = False,
-    run_sync: bool = False,
-    concurrency: Optional[int] = None,
-) -> pd.DataFrame:
-    """Classifies each input row of the dataframe using an LLM. Returns a pandas.DataFrame
-    where the first column is named `label` and contains the classification labels. An optional
-    column named `explanation` is added when `provide_explanation=True`.
-
-    Args:
-        dataframe (pandas.DataFrame): A pandas dataframe in which each row represents a record to be
-        classified. All template variable names must appear as column names in the dataframe (extra
-        columns unrelated to the template are permitted).
-
-        template (Union[ClassificationTemplate, PromptTemplate, str]): The prompt template as
-        either an instance of PromptTemplate, ClassificationTemplate or a string. If a string, the
-        variable names should be surrounded by curly braces so that a call to `.format` can be made
-        to substitute variable values.
-
-        model (BaseEvalModel): An LLM model class.
-
-        rails (List[str]): A list of strings representing the possible output classes of the model's
-        predictions.
-
-        system_instruction (Optional[str], optional): An optional system message.
-
-        verbose (bool, optional): If True, prints detailed info to stdout such as model invocation
-        parameters and details about retries and snapping to rails. Default False.
-
-        use_function_calling_if_available (bool, default=True): If True, use function calling
-        (if available) as a means to constrain the LLM outputs. With function calling, the LLM
-        is instructed to provide its response as a structured JSON object, which is easier
-        to parse.
-
-        provide_explanation (bool, default=False): If True, provides an explanation for each
-        classification label. A column named `explanation` is added to the output dataframe.
-
-        include_prompt (bool, default=False): If True, includes a column named `prompt` in the
-        output dataframe containing the prompt used for each classification.
-
-        include_response (bool, default=False): If True, includes a column named `response` in the
-        output dataframe containing the raw response from the LLM.
-
-        run_sync (bool, default=False): If True, forces synchronous request submission. Otherwise
-        evaluations will be run asynchronously if possible.
-
-        concurrency (Optional[int], default=None): The number of concurrent evals if async
-        submission is possible. If not provided, a recommended default concurrency is set on a
-        per-model basis.
-
-    Returns:
-        pandas.DataFrame: A dataframe where the `label` column (at column position 0) contains
-        the classification labels. If provide_explanation=True, then an additional column named
-        `explanation` is added to contain the explanation for each label. The dataframe has
-        the same length and index as the input dataframe. The classification label values are
-        from the entries in the rails argument or "NOT_PARSABLE" if the model's output could
-        not be parsed.
-    """
-    concurrency = concurrency or model.default_concurrency
-    # clients need to be reloaded to ensure that async evals work properly
-    model.reload_client()
-
-    tqdm_bar_format = get_tqdm_progress_bar_formatter("llm_classify")
-    use_openai_function_call = (
-        use_function_calling_if_available
-        and isinstance(model, OpenAIModel)
-        and model.supports_function_calling
-    )
-
-    model_kwargs = (
-        openai_function_call_kwargs(rails, provide_explanation) if use_openai_function_call else {}
-    )
-
-    eval_template = normalize_classification_template(rails=rails, template=template)
-
-    prompt_options = PromptOptions(provide_explanation=provide_explanation)
-    prompts = map_template(dataframe, eval_template, options=prompt_options)
-
-    labels: List[Optional[str]] = [None] * len(dataframe)
-    explanations: List[Optional[str]] = [None] * len(dataframe)
-
-    printif(verbose, f"Using prompt:\n\n{eval_template.prompt(prompt_options)}")
-    if generation_info := model.verbose_generation_info():
-        printif(verbose, generation_info)
-
-    def _process_response(response: str) -> Tuple[str, Optional[str]]:
-        if not use_openai_function_call:
-            if provide_explanation:
-                unrailed_label, explanation = (
-                    eval_template.extract_label_from_explanation(response),
-                    response,
-                )
-                printif(
-                    verbose and unrailed_label == NOT_PARSABLE,
-                    f"- Could not parse {repr(response)}",
-                )
-            else:
-                unrailed_label = response
-                explanation = None
-        else:
-            unrailed_label, explanation = parse_openai_function_call(response)
-        return snap_to_rail(unrailed_label, rails, verbose=verbose), explanation
-
-    async def _run_llm_classification_async(prompt: str) -> ParsedLLMResponse:
-        with set_verbosity(model, verbose) as verbose_model:
-            response = await verbose_model._async_generate(
-                prompt, instruction=system_instruction, **model_kwargs
-            )
-        inference, explanation = _process_response(response)
-        return inference, explanation, response
-
-    def _run_llm_classification_sync(prompt: str) -> ParsedLLMResponse:
-        with set_verbosity(model, verbose) as verbose_model:
-            response = verbose_model._generate(
-                prompt, instruction=system_instruction, **model_kwargs
-            )
-        inference, explanation = _process_response(response)
-        return inference, explanation, response
-
-    fallback_return_value: ParsedLLMResponse = (None, None, "")
-
-    executor = get_executor_on_sync_context(
-        _run_llm_classification_sync,
-        _run_llm_classification_async,
-        run_sync=run_sync,
-        concurrency=concurrency,
-        tqdm_bar_format=tqdm_bar_format,
-        exit_on_error=True,
-        fallback_return_value=fallback_return_value,
-    )
-
-    results = executor.run(prompts.tolist())
-    labels, explanations, responses = zip(*results)
-
-    return pd.DataFrame(
-        data={
-            "label": labels,
-            **({"explanation": explanations} if provide_explanation else {}),
-            **({"prompt": prompts} if include_prompt else {}),
-            **({"response": responses} if include_response else {}),
-        },
-        index=dataframe.index,
-    )
-
-
-def run_relevance_eval(
-    dataframe: pd.DataFrame,
-    model: BaseEvalModel,
-    template: Union[ClassificationTemplate, str] = RAG_RELEVANCY_PROMPT_TEMPLATE,
-    rails: List[str] = list(RAG_RELEVANCY_PROMPT_RAILS_MAP.values()),
-    system_instruction: Optional[str] = None,
-    query_column_name: str = "input",
-    document_column_name: str = "reference",
-    verbose: bool = False,
-) -> List[List[str]]:
-    """
-    Given a pandas dataframe containing queries and retrieved documents, classifies the relevance of
-    each retrieved document to the corresponding query using an LLM.
-
-    Args:
-        dataframe (pd.DataFrame): A pandas dataframe containing queries and retrieved documents. If
-        both query_column_name and reference_column_name are present in the input dataframe, those
-        columns are used as inputs and should appear in the following format:
-
-        - The entries of the query column must be strings.
-        - The entries of the documents column must be lists of strings. Each list may contain an
-          arbitrary number of document texts retrieved for the corresponding query.
-
-        If the input dataframe is lacking either query_column_name or reference_column_name but has
-        query and retrieved document columns in OpenInference trace format named
-        "attributes.input.value" and "attributes.retrieval.documents", respectively, then those
-        columns are used as inputs and should appear in the following format:
-
-        - The entries of the query column must be strings.
-        - The entries of the document column must be lists of OpenInference document objects, each
-          object being a dictionary that stores the document text under the key "document.content".
-
-        This latter format is intended for running evaluations on exported OpenInference trace
-        dataframes. For more information on the OpenInference tracing specification, see
-        https://github.com/Arize-ai/openinference/.
-
-        model (BaseEvalModel): The model used for evaluation.
-
-        template (Union[PromptTemplate, str], optional): The template used for evaluation.
-
-        rails (List[str], optional): A list of strings representing the possible output classes of
-        the model's predictions.
-
-        query_column_name (str, optional): The name of the query column in the dataframe, which
-        should also be a template variable.
-
-        reference_column_name (str, optional): The name of the document column in the dataframe,
-        which should also be a template variable.
-
-        system_instruction (Optional[str], optional): An optional system message.
-
-        verbose (bool, optional): If True, prints detailed information to stdout such as model
-        invocation parameters and retry info. Default False.
-
-    Returns:
-        List[List[str]]: A list of relevant and not relevant classifications. The "shape" of the
-        list should mirror the "shape" of the retrieved documents column, in the sense that it has
-        the same length as the input dataframe and each sub-list has the same length as the
-        corresponding list in the retrieved documents column. The values in the sub-lists are either
-        entries from the rails argument or "NOT_PARSABLE" in the case where the LLM output could not
-        be parsed.
-    """
-
-    warnings.warn(
-        "run_relevance_eval will soon be deprecated. "
-        "Use run_evals with HallucinationEvaluator instead.",
-        DeprecationWarning,
-    )
-
-    with set_verbosity(model, verbose) as verbose_model:
-        query_column = dataframe.get(query_column_name)
-        document_column = dataframe.get(document_column_name)
-        if query_column is None or document_column is None:
-            openinference_query_column = dataframe.get(OPENINFERENCE_QUERY_COLUMN_NAME)
-            openinference_document_column = dataframe.get(OPENINFERENCE_DOCUMENT_COLUMN_NAME)
-            if openinference_query_column is None or openinference_document_column is None:
-                raise ValueError(
-                    f'Dataframe columns must include either "{query_column_name}" and '
-                    f'"{document_column_name}", or "{OPENINFERENCE_QUERY_COLUMN_NAME}" and '
-                    f'"{OPENINFERENCE_DOCUMENT_COLUMN_NAME}".'
-                )
-            query_column = openinference_query_column
-            document_column = openinference_document_column.map(
-                lambda docs: _get_contents_from_openinference_documents(docs)
-                if docs is not None
-                else None
-            )
-
-        queries = cast("pd.Series[str]", query_column).tolist()
-        document_lists = cast("pd.Series[str]", document_column).tolist()
-        indexes = []
-        expanded_queries = []
-        expanded_documents = []
-        for index, (query, documents) in enumerate(zip(queries, document_lists)):
-            if query is None or documents is None:
-                continue
-            for document in documents:
-                indexes.append(index)
-                expanded_queries.append(query)
-                expanded_documents.append(document)
-        predictions = llm_classify(
-            dataframe=pd.DataFrame(
-                {
-                    query_column_name: expanded_queries,
-                    document_column_name: expanded_documents,
-                }
-            ),
-            model=verbose_model,
-            template=template,
-            rails=rails,
-            system_instruction=system_instruction,
-            verbose=verbose,
-        ).iloc[:, 0]
-        outputs: List[List[str]] = [[] for _ in range(len(dataframe))]
-        for index, prediction in zip(indexes, predictions):
-            outputs[index].append(prediction)
-        return outputs
-
-
-def _get_contents_from_openinference_documents(documents: Iterable[Any]) -> List[Optional[str]]:
-    """
-    Get document contents from an iterable of OpenInference document objects, which are dictionaries
-    containing the document text under the "document.content" key.
-    """
-    return [doc.get(DOCUMENT_CONTENT) if isinstance(doc, dict) else None for doc in documents]
-
-
-class RunEvalsPayload(NamedTuple):
-    evaluator: LLMEvaluator
-    record: Record
-
-
-def run_evals(
-    dataframe: DataFrame,
-    evaluators: List[LLMEvaluator],
-    provide_explanation: bool = False,
-    use_function_calling_if_available: bool = True,
-    verbose: bool = False,
-    concurrency: Optional[int] = None,
-) -> List[DataFrame]:
-    """
-    Applies a list of evaluators to a dataframe. Outputs a list of dataframes in
-    which each dataframe contains the outputs of the corresponding evaluator
-    applied to the input dataframe.
-
-    Args:
-        dataframe (DataFrame): A pandas dataframe in which each row represents a
-        record to be evaluated. All template variable names must appear as
-        column names in the dataframe (extra columns unrelated to the template
-        are permitted).
-
-        evaluators (List[LLMEvaluator]): A list of evaluators.
-
-        provide_explanation (bool, optional): If True, provides an explanation
-        for each evaluation. A column named "explanation" is added to each
-        output dataframe.
-
-        use_function_calling_if_available (bool, optional): If True, use
-        function calling (if available) as a means to constrain the LLM outputs.
-        With function calling, the LLM is instructed to provide its response as
-        a structured JSON object, which is easier to parse.
-
-        verbose (bool, optional): If True, prints detailed info to stdout such
-        as model invocation parameters and details about retries and snapping to
-        rails.
-
-        concurrency (Optional[int], default=None): The number of concurrent evals if async
-        submission is possible. If not provided, a recommended default concurrency is set on a
-        per-model basis.
-
-    Returns:
-        List[DataFrame]: A list of dataframes, one for each evaluator, all of
-        which have the same number of rows as the input dataframe.
-    """
-    # use the minimum default concurrency of all the models
-    if concurrency is None:
-        if len(evaluators) == 0:
-            concurrency = 1
-        else:
-            concurrency = min(evaluator.default_concurrency for evaluator in evaluators)
-
-    # clients need to be reloaded to ensure that async evals work properly
-    for evaluator in evaluators:
-        evaluator.reload_client()
-
-    async def _arun_eval(
-        payload: RunEvalsPayload,
-    ) -> Tuple[Label, Score, Explanation]:
-        return await payload.evaluator.aevaluate(
-            payload.record,
-            provide_explanation=provide_explanation,
-            use_function_calling_if_available=use_function_calling_if_available,
-        )
-
-    def _run_eval(
-        payload: RunEvalsPayload,
-    ) -> Tuple[Label, Score, Explanation]:
-        return payload.evaluator.evaluate(
-            payload.record,
-            provide_explanation=provide_explanation,
-            use_function_calling_if_available=use_function_calling_if_available,
-        )
-
-    executor = get_executor_on_sync_context(
-        _run_eval,
-        _arun_eval,
-        concurrency=concurrency,
-        tqdm_bar_format=get_tqdm_progress_bar_formatter("run_evals"),
-        exit_on_error=True,
-        fallback_return_value=(None, None, None),
-    )
-
-    total_records = len(dataframe)
-    payloads = [
-        RunEvalsPayload(evaluator=evaluator, record=row)
-        for evaluator, (_, row) in product(evaluators, dataframe.iterrows())
-    ]
-    eval_results: List[DefaultDict[Index, Dict[ColumnName, Union[Label, Explanation]]]] = [
-        defaultdict(dict) for _ in range(len(evaluators))
-    ]
-    for index, (label, score, explanation) in enumerate(executor.run(payloads)):
-        evaluator_index = index // total_records
-        row_index = index % total_records
-        eval_results[evaluator_index][row_index]["label"] = label
-        eval_results[evaluator_index][row_index]["score"] = score
-        if provide_explanation:
-            eval_results[evaluator_index][row_index]["explanation"] = explanation
-    eval_dataframes: List[DataFrame] = []
-    for eval_result in eval_results:
-        eval_data = [eval_result[row_index] for row_index in range(len(eval_result))]
-        eval_dataframes.append(DataFrame(eval_data, index=dataframe.index))
-    return eval_dataframes