arize-phoenix 3.16.0__py3-none-any.whl → 7.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This release of arize-phoenix has been flagged as potentially problematic.
- arize_phoenix-7.7.0.dist-info/METADATA +261 -0
- arize_phoenix-7.7.0.dist-info/RECORD +345 -0
- {arize_phoenix-3.16.0.dist-info → arize_phoenix-7.7.0.dist-info}/WHEEL +1 -1
- arize_phoenix-7.7.0.dist-info/entry_points.txt +3 -0
- phoenix/__init__.py +86 -14
- phoenix/auth.py +309 -0
- phoenix/config.py +675 -45
- phoenix/core/model.py +32 -30
- phoenix/core/model_schema.py +102 -109
- phoenix/core/model_schema_adapter.py +48 -45
- phoenix/datetime_utils.py +24 -3
- phoenix/db/README.md +54 -0
- phoenix/db/__init__.py +4 -0
- phoenix/db/alembic.ini +85 -0
- phoenix/db/bulk_inserter.py +294 -0
- phoenix/db/engines.py +208 -0
- phoenix/db/enums.py +20 -0
- phoenix/db/facilitator.py +113 -0
- phoenix/db/helpers.py +159 -0
- phoenix/db/insertion/constants.py +2 -0
- phoenix/db/insertion/dataset.py +227 -0
- phoenix/db/insertion/document_annotation.py +171 -0
- phoenix/db/insertion/evaluation.py +191 -0
- phoenix/db/insertion/helpers.py +98 -0
- phoenix/db/insertion/span.py +193 -0
- phoenix/db/insertion/span_annotation.py +158 -0
- phoenix/db/insertion/trace_annotation.py +158 -0
- phoenix/db/insertion/types.py +256 -0
- phoenix/db/migrate.py +86 -0
- phoenix/db/migrations/data_migration_scripts/populate_project_sessions.py +199 -0
- phoenix/db/migrations/env.py +114 -0
- phoenix/db/migrations/script.py.mako +26 -0
- phoenix/db/migrations/versions/10460e46d750_datasets.py +317 -0
- phoenix/db/migrations/versions/3be8647b87d8_add_token_columns_to_spans_table.py +126 -0
- phoenix/db/migrations/versions/4ded9e43755f_create_project_sessions_table.py +66 -0
- phoenix/db/migrations/versions/cd164e83824f_users_and_tokens.py +157 -0
- phoenix/db/migrations/versions/cf03bd6bae1d_init.py +280 -0
- phoenix/db/models.py +807 -0
- phoenix/exceptions.py +5 -1
- phoenix/experiments/__init__.py +6 -0
- phoenix/experiments/evaluators/__init__.py +29 -0
- phoenix/experiments/evaluators/base.py +158 -0
- phoenix/experiments/evaluators/code_evaluators.py +184 -0
- phoenix/experiments/evaluators/llm_evaluators.py +473 -0
- phoenix/experiments/evaluators/utils.py +236 -0
- phoenix/experiments/functions.py +772 -0
- phoenix/experiments/tracing.py +86 -0
- phoenix/experiments/types.py +726 -0
- phoenix/experiments/utils.py +25 -0
- phoenix/inferences/__init__.py +0 -0
- phoenix/{datasets → inferences}/errors.py +6 -5
- phoenix/{datasets → inferences}/fixtures.py +49 -42
- phoenix/{datasets/dataset.py → inferences/inferences.py} +121 -105
- phoenix/{datasets → inferences}/schema.py +11 -11
- phoenix/{datasets → inferences}/validation.py +13 -14
- phoenix/logging/__init__.py +3 -0
- phoenix/logging/_config.py +90 -0
- phoenix/logging/_filter.py +6 -0
- phoenix/logging/_formatter.py +69 -0
- phoenix/metrics/__init__.py +5 -4
- phoenix/metrics/binning.py +4 -3
- phoenix/metrics/metrics.py +2 -1
- phoenix/metrics/mixins.py +7 -6
- phoenix/metrics/retrieval_metrics.py +2 -1
- phoenix/metrics/timeseries.py +5 -4
- phoenix/metrics/wrappers.py +9 -3
- phoenix/pointcloud/clustering.py +5 -5
- phoenix/pointcloud/pointcloud.py +7 -5
- phoenix/pointcloud/projectors.py +5 -6
- phoenix/pointcloud/umap_parameters.py +53 -52
- phoenix/server/api/README.md +28 -0
- phoenix/server/api/auth.py +44 -0
- phoenix/server/api/context.py +152 -9
- phoenix/server/api/dataloaders/__init__.py +91 -0
- phoenix/server/api/dataloaders/annotation_summaries.py +139 -0
- phoenix/server/api/dataloaders/average_experiment_run_latency.py +54 -0
- phoenix/server/api/dataloaders/cache/__init__.py +3 -0
- phoenix/server/api/dataloaders/cache/two_tier_cache.py +68 -0
- phoenix/server/api/dataloaders/dataset_example_revisions.py +131 -0
- phoenix/server/api/dataloaders/dataset_example_spans.py +38 -0
- phoenix/server/api/dataloaders/document_evaluation_summaries.py +144 -0
- phoenix/server/api/dataloaders/document_evaluations.py +31 -0
- phoenix/server/api/dataloaders/document_retrieval_metrics.py +89 -0
- phoenix/server/api/dataloaders/experiment_annotation_summaries.py +79 -0
- phoenix/server/api/dataloaders/experiment_error_rates.py +58 -0
- phoenix/server/api/dataloaders/experiment_run_annotations.py +36 -0
- phoenix/server/api/dataloaders/experiment_run_counts.py +49 -0
- phoenix/server/api/dataloaders/experiment_sequence_number.py +44 -0
- phoenix/server/api/dataloaders/latency_ms_quantile.py +188 -0
- phoenix/server/api/dataloaders/min_start_or_max_end_times.py +85 -0
- phoenix/server/api/dataloaders/project_by_name.py +31 -0
- phoenix/server/api/dataloaders/record_counts.py +116 -0
- phoenix/server/api/dataloaders/session_io.py +79 -0
- phoenix/server/api/dataloaders/session_num_traces.py +30 -0
- phoenix/server/api/dataloaders/session_num_traces_with_error.py +32 -0
- phoenix/server/api/dataloaders/session_token_usages.py +41 -0
- phoenix/server/api/dataloaders/session_trace_latency_ms_quantile.py +55 -0
- phoenix/server/api/dataloaders/span_annotations.py +26 -0
- phoenix/server/api/dataloaders/span_dataset_examples.py +31 -0
- phoenix/server/api/dataloaders/span_descendants.py +57 -0
- phoenix/server/api/dataloaders/span_projects.py +33 -0
- phoenix/server/api/dataloaders/token_counts.py +124 -0
- phoenix/server/api/dataloaders/trace_by_trace_ids.py +25 -0
- phoenix/server/api/dataloaders/trace_root_spans.py +32 -0
- phoenix/server/api/dataloaders/user_roles.py +30 -0
- phoenix/server/api/dataloaders/users.py +33 -0
- phoenix/server/api/exceptions.py +48 -0
- phoenix/server/api/helpers/__init__.py +12 -0
- phoenix/server/api/helpers/dataset_helpers.py +217 -0
- phoenix/server/api/helpers/experiment_run_filters.py +763 -0
- phoenix/server/api/helpers/playground_clients.py +948 -0
- phoenix/server/api/helpers/playground_registry.py +70 -0
- phoenix/server/api/helpers/playground_spans.py +455 -0
- phoenix/server/api/input_types/AddExamplesToDatasetInput.py +16 -0
- phoenix/server/api/input_types/AddSpansToDatasetInput.py +14 -0
- phoenix/server/api/input_types/ChatCompletionInput.py +38 -0
- phoenix/server/api/input_types/ChatCompletionMessageInput.py +24 -0
- phoenix/server/api/input_types/ClearProjectInput.py +15 -0
- phoenix/server/api/input_types/ClusterInput.py +2 -2
- phoenix/server/api/input_types/CreateDatasetInput.py +12 -0
- phoenix/server/api/input_types/CreateSpanAnnotationInput.py +18 -0
- phoenix/server/api/input_types/CreateTraceAnnotationInput.py +18 -0
- phoenix/server/api/input_types/DataQualityMetricInput.py +5 -2
- phoenix/server/api/input_types/DatasetExampleInput.py +14 -0
- phoenix/server/api/input_types/DatasetSort.py +17 -0
- phoenix/server/api/input_types/DatasetVersionSort.py +16 -0
- phoenix/server/api/input_types/DeleteAnnotationsInput.py +7 -0
- phoenix/server/api/input_types/DeleteDatasetExamplesInput.py +13 -0
- phoenix/server/api/input_types/DeleteDatasetInput.py +7 -0
- phoenix/server/api/input_types/DeleteExperimentsInput.py +7 -0
- phoenix/server/api/input_types/DimensionFilter.py +4 -4
- phoenix/server/api/input_types/GenerativeModelInput.py +17 -0
- phoenix/server/api/input_types/Granularity.py +1 -1
- phoenix/server/api/input_types/InvocationParameters.py +162 -0
- phoenix/server/api/input_types/PatchAnnotationInput.py +19 -0
- phoenix/server/api/input_types/PatchDatasetExamplesInput.py +35 -0
- phoenix/server/api/input_types/PatchDatasetInput.py +14 -0
- phoenix/server/api/input_types/PerformanceMetricInput.py +5 -2
- phoenix/server/api/input_types/ProjectSessionSort.py +29 -0
- phoenix/server/api/input_types/SpanAnnotationSort.py +17 -0
- phoenix/server/api/input_types/SpanSort.py +134 -69
- phoenix/server/api/input_types/TemplateOptions.py +10 -0
- phoenix/server/api/input_types/TraceAnnotationSort.py +17 -0
- phoenix/server/api/input_types/UserRoleInput.py +9 -0
- phoenix/server/api/mutations/__init__.py +28 -0
- phoenix/server/api/mutations/api_key_mutations.py +167 -0
- phoenix/server/api/mutations/chat_mutations.py +593 -0
- phoenix/server/api/mutations/dataset_mutations.py +591 -0
- phoenix/server/api/mutations/experiment_mutations.py +75 -0
- phoenix/server/api/{types/ExportEventsMutation.py → mutations/export_events_mutations.py} +21 -18
- phoenix/server/api/mutations/project_mutations.py +57 -0
- phoenix/server/api/mutations/span_annotations_mutations.py +128 -0
- phoenix/server/api/mutations/trace_annotations_mutations.py +127 -0
- phoenix/server/api/mutations/user_mutations.py +329 -0
- phoenix/server/api/openapi/__init__.py +0 -0
- phoenix/server/api/openapi/main.py +17 -0
- phoenix/server/api/openapi/schema.py +16 -0
- phoenix/server/api/queries.py +738 -0
- phoenix/server/api/routers/__init__.py +11 -0
- phoenix/server/api/routers/auth.py +284 -0
- phoenix/server/api/routers/embeddings.py +26 -0
- phoenix/server/api/routers/oauth2.py +488 -0
- phoenix/server/api/routers/v1/__init__.py +64 -0
- phoenix/server/api/routers/v1/datasets.py +1017 -0
- phoenix/server/api/routers/v1/evaluations.py +362 -0
- phoenix/server/api/routers/v1/experiment_evaluations.py +115 -0
- phoenix/server/api/routers/v1/experiment_runs.py +167 -0
- phoenix/server/api/routers/v1/experiments.py +308 -0
- phoenix/server/api/routers/v1/pydantic_compat.py +78 -0
- phoenix/server/api/routers/v1/spans.py +267 -0
- phoenix/server/api/routers/v1/traces.py +208 -0
- phoenix/server/api/routers/v1/utils.py +95 -0
- phoenix/server/api/schema.py +44 -247
- phoenix/server/api/subscriptions.py +597 -0
- phoenix/server/api/types/Annotation.py +21 -0
- phoenix/server/api/types/AnnotationSummary.py +55 -0
- phoenix/server/api/types/AnnotatorKind.py +16 -0
- phoenix/server/api/types/ApiKey.py +27 -0
- phoenix/server/api/types/AuthMethod.py +9 -0
- phoenix/server/api/types/ChatCompletionMessageRole.py +11 -0
- phoenix/server/api/types/ChatCompletionSubscriptionPayload.py +46 -0
- phoenix/server/api/types/Cluster.py +25 -24
- phoenix/server/api/types/CreateDatasetPayload.py +8 -0
- phoenix/server/api/types/DataQualityMetric.py +31 -13
- phoenix/server/api/types/Dataset.py +288 -63
- phoenix/server/api/types/DatasetExample.py +85 -0
- phoenix/server/api/types/DatasetExampleRevision.py +34 -0
- phoenix/server/api/types/DatasetVersion.py +14 -0
- phoenix/server/api/types/Dimension.py +32 -31
- phoenix/server/api/types/DocumentEvaluationSummary.py +9 -8
- phoenix/server/api/types/EmbeddingDimension.py +56 -49
- phoenix/server/api/types/Evaluation.py +25 -31
- phoenix/server/api/types/EvaluationSummary.py +30 -50
- phoenix/server/api/types/Event.py +20 -20
- phoenix/server/api/types/ExampleRevisionInterface.py +14 -0
- phoenix/server/api/types/Experiment.py +152 -0
- phoenix/server/api/types/ExperimentAnnotationSummary.py +13 -0
- phoenix/server/api/types/ExperimentComparison.py +17 -0
- phoenix/server/api/types/ExperimentRun.py +119 -0
- phoenix/server/api/types/ExperimentRunAnnotation.py +56 -0
- phoenix/server/api/types/GenerativeModel.py +9 -0
- phoenix/server/api/types/GenerativeProvider.py +85 -0
- phoenix/server/api/types/Inferences.py +80 -0
- phoenix/server/api/types/InferencesRole.py +23 -0
- phoenix/server/api/types/LabelFraction.py +7 -0
- phoenix/server/api/types/MimeType.py +2 -2
- phoenix/server/api/types/Model.py +54 -54
- phoenix/server/api/types/PerformanceMetric.py +8 -5
- phoenix/server/api/types/Project.py +407 -142
- phoenix/server/api/types/ProjectSession.py +139 -0
- phoenix/server/api/types/Segments.py +4 -4
- phoenix/server/api/types/Span.py +221 -176
- phoenix/server/api/types/SpanAnnotation.py +43 -0
- phoenix/server/api/types/SpanIOValue.py +15 -0
- phoenix/server/api/types/SystemApiKey.py +9 -0
- phoenix/server/api/types/TemplateLanguage.py +10 -0
- phoenix/server/api/types/TimeSeries.py +19 -15
- phoenix/server/api/types/TokenUsage.py +11 -0
- phoenix/server/api/types/Trace.py +154 -0
- phoenix/server/api/types/TraceAnnotation.py +45 -0
- phoenix/server/api/types/UMAPPoints.py +7 -7
- phoenix/server/api/types/User.py +60 -0
- phoenix/server/api/types/UserApiKey.py +45 -0
- phoenix/server/api/types/UserRole.py +15 -0
- phoenix/server/api/types/node.py +13 -107
- phoenix/server/api/types/pagination.py +156 -57
- phoenix/server/api/utils.py +34 -0
- phoenix/server/app.py +864 -115
- phoenix/server/bearer_auth.py +163 -0
- phoenix/server/dml_event.py +136 -0
- phoenix/server/dml_event_handler.py +256 -0
- phoenix/server/email/__init__.py +0 -0
- phoenix/server/email/sender.py +97 -0
- phoenix/server/email/templates/__init__.py +0 -0
- phoenix/server/email/templates/password_reset.html +19 -0
- phoenix/server/email/types.py +11 -0
- phoenix/server/grpc_server.py +102 -0
- phoenix/server/jwt_store.py +505 -0
- phoenix/server/main.py +305 -116
- phoenix/server/oauth2.py +52 -0
- phoenix/server/openapi/__init__.py +0 -0
- phoenix/server/prometheus.py +111 -0
- phoenix/server/rate_limiters.py +188 -0
- phoenix/server/static/.vite/manifest.json +87 -0
- phoenix/server/static/assets/components-Cy9nwIvF.js +2125 -0
- phoenix/server/static/assets/index-BKvHIxkk.js +113 -0
- phoenix/server/static/assets/pages-CUi2xCVQ.js +4449 -0
- phoenix/server/static/assets/vendor-DvC8cT4X.js +894 -0
- phoenix/server/static/assets/vendor-DxkFTwjz.css +1 -0
- phoenix/server/static/assets/vendor-arizeai-Do1793cv.js +662 -0
- phoenix/server/static/assets/vendor-codemirror-BzwZPyJM.js +24 -0
- phoenix/server/static/assets/vendor-recharts-_Jb7JjhG.js +59 -0
- phoenix/server/static/assets/vendor-shiki-Cl9QBraO.js +5 -0
- phoenix/server/static/assets/vendor-three-DwGkEfCM.js +2998 -0
- phoenix/server/telemetry.py +68 -0
- phoenix/server/templates/index.html +82 -23
- phoenix/server/thread_server.py +3 -3
- phoenix/server/types.py +275 -0
- phoenix/services.py +27 -18
- phoenix/session/client.py +743 -68
- phoenix/session/data_extractor.py +31 -7
- phoenix/session/evaluation.py +3 -9
- phoenix/session/session.py +263 -219
- phoenix/settings.py +22 -0
- phoenix/trace/__init__.py +2 -22
- phoenix/trace/attributes.py +338 -0
- phoenix/trace/dsl/README.md +116 -0
- phoenix/trace/dsl/filter.py +663 -213
- phoenix/trace/dsl/helpers.py +73 -21
- phoenix/trace/dsl/query.py +574 -201
- phoenix/trace/exporter.py +24 -19
- phoenix/trace/fixtures.py +368 -32
- phoenix/trace/otel.py +71 -219
- phoenix/trace/projects.py +3 -2
- phoenix/trace/schemas.py +33 -11
- phoenix/trace/span_evaluations.py +21 -16
- phoenix/trace/span_json_decoder.py +6 -4
- phoenix/trace/span_json_encoder.py +2 -2
- phoenix/trace/trace_dataset.py +47 -32
- phoenix/trace/utils.py +21 -4
- phoenix/utilities/__init__.py +0 -26
- phoenix/utilities/client.py +132 -0
- phoenix/utilities/deprecation.py +31 -0
- phoenix/utilities/error_handling.py +3 -2
- phoenix/utilities/json.py +109 -0
- phoenix/utilities/logging.py +8 -0
- phoenix/utilities/project.py +2 -2
- phoenix/utilities/re.py +49 -0
- phoenix/utilities/span_store.py +0 -23
- phoenix/utilities/template_formatters.py +99 -0
- phoenix/version.py +1 -1
- arize_phoenix-3.16.0.dist-info/METADATA +0 -495
- arize_phoenix-3.16.0.dist-info/RECORD +0 -178
- phoenix/core/project.py +0 -617
- phoenix/core/traces.py +0 -100
- phoenix/experimental/evals/__init__.py +0 -73
- phoenix/experimental/evals/evaluators.py +0 -413
- phoenix/experimental/evals/functions/__init__.py +0 -4
- phoenix/experimental/evals/functions/classify.py +0 -453
- phoenix/experimental/evals/functions/executor.py +0 -353
- phoenix/experimental/evals/functions/generate.py +0 -138
- phoenix/experimental/evals/functions/processing.py +0 -76
- phoenix/experimental/evals/models/__init__.py +0 -14
- phoenix/experimental/evals/models/anthropic.py +0 -175
- phoenix/experimental/evals/models/base.py +0 -170
- phoenix/experimental/evals/models/bedrock.py +0 -221
- phoenix/experimental/evals/models/litellm.py +0 -134
- phoenix/experimental/evals/models/openai.py +0 -448
- phoenix/experimental/evals/models/rate_limiters.py +0 -246
- phoenix/experimental/evals/models/vertex.py +0 -173
- phoenix/experimental/evals/models/vertexai.py +0 -186
- phoenix/experimental/evals/retrievals.py +0 -96
- phoenix/experimental/evals/templates/__init__.py +0 -50
- phoenix/experimental/evals/templates/default_templates.py +0 -472
- phoenix/experimental/evals/templates/template.py +0 -195
- phoenix/experimental/evals/utils/__init__.py +0 -172
- phoenix/experimental/evals/utils/threads.py +0 -27
- phoenix/server/api/helpers.py +0 -11
- phoenix/server/api/routers/evaluation_handler.py +0 -109
- phoenix/server/api/routers/span_handler.py +0 -70
- phoenix/server/api/routers/trace_handler.py +0 -60
- phoenix/server/api/types/DatasetRole.py +0 -23
- phoenix/server/static/index.css +0 -6
- phoenix/server/static/index.js +0 -7447
- phoenix/storage/span_store/__init__.py +0 -23
- phoenix/storage/span_store/text_file.py +0 -85
- phoenix/trace/dsl/missing.py +0 -60
- phoenix/trace/langchain/__init__.py +0 -3
- phoenix/trace/langchain/instrumentor.py +0 -35
- phoenix/trace/llama_index/__init__.py +0 -3
- phoenix/trace/llama_index/callback.py +0 -102
- phoenix/trace/openai/__init__.py +0 -3
- phoenix/trace/openai/instrumentor.py +0 -30
- {arize_phoenix-3.16.0.dist-info → arize_phoenix-7.7.0.dist-info}/licenses/IP_NOTICE +0 -0
- {arize_phoenix-3.16.0.dist-info → arize_phoenix-7.7.0.dist-info}/licenses/LICENSE +0 -0
- /phoenix/{datasets → db/insertion}/__init__.py +0 -0
- /phoenix/{experimental → db/migrations}/__init__.py +0 -0
- /phoenix/{storage → db/migrations/data_migration_scripts}/__init__.py +0 -0
phoenix/core/traces.py
DELETED
@@ -1,100 +0,0 @@
-import weakref
-from collections import defaultdict
-from queue import SimpleQueue
-from threading import RLock, Thread
-from types import MethodType
-from typing import DefaultDict, Iterator, Optional, Tuple, Union
-
-from typing_extensions import assert_never
-
-import phoenix.trace.v1 as pb
-from phoenix.config import DEFAULT_PROJECT_NAME
-from phoenix.core.project import (
-    END_OF_QUEUE,
-    Project,
-    _ProjectName,
-)
-from phoenix.trace.schemas import Span
-
-_SpanItem = Tuple[Span, _ProjectName]
-_EvalItem = Tuple[pb.Evaluation, _ProjectName]
-
-
-class Traces:
-    def __init__(self) -> None:
-        self._span_queue: "SimpleQueue[Optional[_SpanItem]]" = SimpleQueue()
-        self._eval_queue: "SimpleQueue[Optional[_EvalItem]]" = SimpleQueue()
-        # Putting `None` as the sentinel value for queue termination.
-        weakref.finalize(self, self._span_queue.put, END_OF_QUEUE)
-        weakref.finalize(self, self._eval_queue.put, END_OF_QUEUE)
-        self._lock = RLock()
-        self._projects: DefaultDict[_ProjectName, "Project"] = defaultdict(
-            Project,
-            {DEFAULT_PROJECT_NAME: Project()},
-        )
-        self._start_consumers()
-
-    def get_project(self, project_name: str) -> Optional["Project"]:
-        with self._lock:
-            return self._projects.get(project_name)
-
-    def get_projects(self) -> Iterator[Tuple[int, str, "Project"]]:
-        with self._lock:
-            for project_id, (project_name, project) in enumerate(self._projects.items()):
-                if project.is_archived:
-                    continue
-                yield project_id, project_name, project
-
-    def archive_project(self, id: int) -> Optional["Project"]:
-        with self._lock:
-            active_projects = {
-                project_id: project
-                for project_id, _, project in self.get_projects()
-                if not project.is_archived
-            }
-            if len(active_projects) <= 1:
-                return None
-            if project := active_projects.get(id):
-                project.archive()
-                return project
-        return None
-
-    def put(
-        self,
-        item: Union[Span, pb.Evaluation],
-        project_name: Optional[str] = None,
-    ) -> None:
-        if not project_name:
-            project_name = DEFAULT_PROJECT_NAME
-        if isinstance(item, Span):
-            self._span_queue.put((item, project_name))
-        elif isinstance(item, pb.Evaluation):
-            self._eval_queue.put((item, project_name))
-        else:
-            assert_never(item)
-
-    def _start_consumers(self) -> None:
-        Thread(
-            target=MethodType(self.__class__._consume_spans, weakref.proxy(self)),
-            args=(self._span_queue,),
-            daemon=True,
-        ).start()
-        Thread(
-            target=MethodType(self.__class__._consume_evals, weakref.proxy(self)),
-            args=(self._eval_queue,),
-            daemon=True,
-        ).start()
-
-    def _consume_spans(self, queue: "SimpleQueue[Optional[_SpanItem]]") -> None:
-        while (item := queue.get()) is not END_OF_QUEUE:
-            span, project_name = item
-            with self._lock:
-                project = self._projects[project_name]
-                project.add_span(span)
-
-    def _consume_evals(self, queue: "SimpleQueue[Optional[_EvalItem]]") -> None:
-        while (item := queue.get()) is not END_OF_QUEUE:
-            pb_eval, project_name = item
-            with self._lock:
-                project = self._projects[project_name]
-                project.add_eval(pb_eval)
phoenix/experimental/evals/__init__.py
DELETED
@@ -1,73 +0,0 @@
-import logging
-
-from .evaluators import (
-    HallucinationEvaluator,
-    LLMEvaluator,
-    QAEvaluator,
-    RelevanceEvaluator,
-    SummarizationEvaluator,
-    ToxicityEvaluator,
-)
-from .functions import llm_classify, llm_generate, run_evals, run_relevance_eval
-from .models import BedrockModel, LiteLLMModel, OpenAIModel, VertexAIModel
-from .retrievals import compute_precisions_at_k
-from .templates import (
-    CODE_READABILITY_PROMPT_RAILS_MAP,
-    CODE_READABILITY_PROMPT_TEMPLATE,
-    HALLUCINATION_PROMPT_RAILS_MAP,
-    HALLUCINATION_PROMPT_TEMPLATE,
-    HUMAN_VS_AI_PROMPT_RAILS_MAP,
-    HUMAN_VS_AI_PROMPT_TEMPLATE,
-    QA_PROMPT_RAILS_MAP,
-    QA_PROMPT_TEMPLATE,
-    RAG_RELEVANCY_PROMPT_RAILS_MAP,
-    RAG_RELEVANCY_PROMPT_TEMPLATE,
-    TOXICITY_PROMPT_RAILS_MAP,
-    TOXICITY_PROMPT_TEMPLATE,
-    ClassificationTemplate,
-    PromptTemplate,
-)
-from .utils import NOT_PARSABLE, download_benchmark_dataset
-
-logger = logging.getLogger(__name__)
-
-__all__ = [
-    "compute_precisions_at_k",
-    "download_benchmark_dataset",
-    "llm_classify",
-    "llm_generate",
-    "OpenAIModel",
-    "VertexAIModel",
-    "BedrockModel",
-    "LiteLLMModel",
-    "PromptTemplate",
-    "ClassificationTemplate",
-    "CODE_READABILITY_PROMPT_RAILS_MAP",
-    "CODE_READABILITY_PROMPT_TEMPLATE",
-    "HALLUCINATION_PROMPT_RAILS_MAP",
-    "HALLUCINATION_PROMPT_TEMPLATE",
-    "RAG_RELEVANCY_PROMPT_RAILS_MAP",
-    "RAG_RELEVANCY_PROMPT_TEMPLATE",
-    "TOXICITY_PROMPT_RAILS_MAP",
-    "TOXICITY_PROMPT_TEMPLATE",
-    "HUMAN_VS_AI_PROMPT_RAILS_MAP",
-    "HUMAN_VS_AI_PROMPT_TEMPLATE",
-    "QA_PROMPT_RAILS_MAP",
-    "QA_PROMPT_TEMPLATE",
-    "NOT_PARSABLE",
-    "run_relevance_eval",
-    "run_evals",
-    "LLMEvaluator",
-    "HallucinationEvaluator",
-    "QAEvaluator",
-    "RelevanceEvaluator",
-    "SummarizationEvaluator",
-    "ToxicityEvaluator",
-]
-
-
-logger.warning(
-    "Evals are moving out of experimental. "
-    "Install the evals extra with `pip install arize-phoenix[evals]` and import `phoenix.evals`. "
-    "For more info, see the [migration guide](https://github.com/Arize-ai/phoenix/blob/main/MIGRATION.md)."
-)
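The deprecation warning above describes the replacement path. A minimal migration sketch, assuming the evals extra is installed; the two imported names are examples taken from the re-exports in the deleted __all__, not an exhaustive list:

# Migration sketch (not part of this diff); requires: pip install 'arize-phoenix[evals]'
from phoenix.evals import OpenAIModel, llm_classify  # formerly under phoenix.experimental.evals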
phoenix/experimental/evals/evaluators.py
DELETED
@@ -1,413 +0,0 @@
-from textwrap import indent
-from typing import List, Mapping, Optional, Tuple, Type
-
-from phoenix.experimental.evals.models import set_verbosity
-from phoenix.experimental.evals.utils import (
-    NOT_PARSABLE,
-    openai_function_call_kwargs,
-    parse_openai_function_call,
-    snap_to_rail,
-)
-from phoenix.utilities.logging import printif
-
-from .models import BaseEvalModel, OpenAIModel
-from .templates import ClassificationTemplate, EvalCriteria, PromptOptions, PromptTemplate
-
-Record = Mapping[str, str]
-_TAB = " " * 4
-
-
-class LLMEvaluator:
-    """
-    Leverages an LLM to evaluate individual records.
-    """
-
-    def __init__(
-        self,
-        model: BaseEvalModel,
-        template: ClassificationTemplate,
-    ) -> None:
-        """Initializer for LLMEvaluator.
-
-        Args:
-            model (BaseEvalModel): The LLM model to use for evaluation.
-            template (ClassificationTemplate): The evaluation template.
-        """
-        self._model = model
-        self._template = template
-
-    @property
-    def default_concurrency(self) -> int:
-        return self._model.default_concurrency
-
-    def reload_client(self) -> None:
-        self._model.reload_client()
-
-    def evaluate(
-        self,
-        record: Record,
-        provide_explanation: bool = False,
-        use_function_calling_if_available: bool = True,
-        verbose: bool = False,
-    ) -> Tuple[str, Optional[float], Optional[str]]:
-        """
-        Evaluates a single record.
-
-        Args:
-            record (Record): The record to evaluate.
-
-            provide_explanation (bool, optional): Whether to provide an
-            explanation.
-
-            use_function_calling_if_available (bool, optional): If True, use
-            function calling (if available) as a means to constrain the LLM
-            outputs. With function calling, the LLM is instructed to provide its
-            response as a structured JSON object, which is easier to parse.
-
-            use_function_calling_if_available (bool, optional): If True, use
-            function calling (if available) as a means to constrain the LLM
-            outputs. With function calling, the LLM is instructed to provide its
-            response as a structured JSON object, which is easier to parse.
-
-            verbose (bool, optional): Whether to print verbose output.
-
-        Returns:
-            Tuple[str, Optional[float], Optional[str]]: A tuple containing:
-            - label
-            - score (if scores for each label are specified by the template)
-            - explanation (if requested)
-        """
-        use_openai_function_call = (
-            use_function_calling_if_available
-            and isinstance(self._model, OpenAIModel)
-            and self._model.supports_function_calling
-        )
-        prompt = self._template.format(
-            record, options=PromptOptions(provide_explanation=provide_explanation)
-        )
-        with set_verbosity(self._model, verbose) as verbose_model:
-            unparsed_output = verbose_model(
-                prompt,
-                **(
-                    openai_function_call_kwargs(self._template.rails, provide_explanation)
-                    if use_openai_function_call
-                    else {}
-                ),
-            )
-        label, explanation = _extract_label_and_explanation(
-            unparsed_output=unparsed_output,
-            template=self._template,
-            provide_explanation=provide_explanation,
-            use_openai_function_call=use_openai_function_call,
-            verbose=verbose,
-        )
-        score = self._template.score(label)
-        return label, score, explanation
-
-    async def aevaluate(
-        self,
-        record: Record,
-        provide_explanation: bool = False,
-        use_function_calling_if_available: bool = True,
-        verbose: bool = False,
-    ) -> Tuple[str, Optional[float], Optional[str]]:
-        """
-        Evaluates a single record.
-
-        Args:
-            record (Record): The record to evaluate.
-
-            provide_explanation (bool, optional): Whether to provide an
-            explanation.
-
-            use_function_calling_if_available (bool, optional): If True, use
-            function calling (if available) as a means to constrain the LLM
-            outputs. With function calling, the LLM is instructed to provide its
-            response as a structured JSON object, which is easier to parse.
-
-            verbose (bool, optional): Whether to print verbose output.
-
-        Returns:
-            Tuple[str, Optional[float], Optional[str]]: A tuple containing:
-            - label
-            - score (if scores for each label are specified by the template)
-            - explanation (if requested)
-        """
-        use_openai_function_call = (
-            use_function_calling_if_available
-            and isinstance(self._model, OpenAIModel)
-            and self._model.supports_function_calling
-        )
-        prompt = self._template.format(
-            record, options=PromptOptions(provide_explanation=provide_explanation)
-        )
-        with set_verbosity(self._model, verbose) as verbose_model:
-            unparsed_output = await verbose_model._async_generate(
-                prompt,
-                **(
-                    openai_function_call_kwargs(self._template.rails, provide_explanation)
-                    if use_openai_function_call
-                    else {}
-                ),
-            )
-        label, explanation = _extract_label_and_explanation(
-            unparsed_output=unparsed_output,
-            template=self._template,
-            provide_explanation=provide_explanation,
-            use_openai_function_call=use_openai_function_call,
-            verbose=verbose,
-        )
-        score = self._template.score(label)
-        return label, score, explanation
-
-
-def _create_llm_evaluator_subclass(
-    class_name: str, template: ClassificationTemplate, docstring: str
-) -> Type[LLMEvaluator]:
-    """A factory method that dynamically creates subclasses of LLMEvaluator.
-
-    Args:
-        class_name (str): Name of the class to be created (should match the name
-        of the assignment variable).
-
-        template (ClassificationTemplate): The classification template to use
-        for evaluation.
-
-        docstring (str): The docstring that will be attached to the subclass.
-
-    Returns:
-        Type[LLMEvaluator]: The dynamically created subclass.
-    """
-
-    def __init__(self: LLMEvaluator, model: BaseEvalModel) -> None:
-        LLMEvaluator.__init__(self, model, template)
-
-    __init__.__doc__ = f"""
-    Initializer for {class_name}.
-
-    Args:
-        model (BaseEvalModel): The LLM model to use for evaluation."""
-
-    docstring += f" Outputs railed classes {', '.join(template.rails)}."
-    docstring += "\n\nThe template used for evaluation (without explanation) is:\n\n"
-    docstring += indent(template.template, 2 * _TAB)
-
-    return type(class_name, (LLMEvaluator,), {"__init__": __init__, "__doc__": docstring})
-
-
-(
-    HallucinationEvaluator,
-    RelevanceEvaluator,
-    ToxicityEvaluator,
-    QAEvaluator,
-    SummarizationEvaluator,
-) = map(
-    lambda args: _create_llm_evaluator_subclass(*args),
-    (
-        (
-            "HallucinationEvaluator",
-            EvalCriteria.HALLUCINATION.value,
-            'Leverages an LLM to evaluate whether a response (stored under an "output" column) is a hallucination given a query (stored under an "input" column) and one or more retrieved documents (stored under a "reference" column).',  # noqa: E501
-        ),
-        (
-            "RelevanceEvaluator",
-            EvalCriteria.RELEVANCE.value,
-            'Leverages an LLM to evaluate whether a retrieved document (stored under a "reference" column) is relevant or irrelevant to the corresponding query (stored under the "input" column).',  # noqa: E501
-        ),
-        (
-            "ToxicityEvaluator",
-            EvalCriteria.TOXICITY.value,
-            'Leverages an LLM to evaluate whether the string stored under the "input" column contains racist, sexist, chauvinistic, biased, or otherwise toxic content.',  # noqa: E501
-        ),
-        (
-            "QAEvaluator",
-            EvalCriteria.QA.value,
-            'Leverages an LLM to evaluate whether a response (stored under an "output" column) is correct or incorrect given a query (stored under an "input" column) and one or more retrieved documents (stored under a "reference" column).',  # noqa: E501
-        ),
-        (
-            "SummarizationEvaluator",
-            EvalCriteria.SUMMARIZATION.value,
-            'Leverages an LLM to evaluate whether a summary (stored under an "output" column) provides an accurate synopsis of an input document (stored under a "input" column).',  # noqa: E501
-        ),
-    ),
-)
-
-
-class MapReducer:
-    """
-    Evaluates data that is too large to fit into a single context window using a
-    map-reduce strategy. The data must first be divided into "chunks" that
-    individually fit into an LLM's context window. Each chunk of data is
-    individually evaluated (the "map" step), producing intermediate outputs that
-    are combined into a single result (the "reduce" step).
-
-    This is the simplest strategy for evaluating long-context data.
-    """
-
-    def __init__(
-        self,
-        model: BaseEvalModel,
-        map_prompt_template: PromptTemplate,
-        reduce_prompt_template: PromptTemplate,
-    ) -> None:
-        """Initializes an instance.
-
-        Args:
-            model (BaseEvalModel): The LLM model to use for evaluation.
-
-            map_prompt_template (PromptTemplate): The template that is mapped
-            over each chunk to produce intermediate outputs. Must contain the
-            {chunk} placeholder.
-
-            reduce_prompt_template (PromptTemplate): The template that combines
-            the intermediate outputs into a single result. Must contain the
-            {mapped} placeholder, which will be formatted as a list of the
-            intermediate outputs produced by the map step.
-        """
-        self._model = model
-        self._map_prompt_template = map_prompt_template
-        self._reduce_prompt_template = reduce_prompt_template
-
-    def evaluate(self, chunks: List[str]) -> str:
-        """Evaluates a list of two or more chunks.
-
-        Args:
-            chunks (List[str]): A list of chunks to be evaluated. Each chunk is
-            inserted into the map_prompt_template and must therefore fit within
-            the LLM's context window and still leave room for the rest of the
-            prompt.
-
-        Returns:
-            str: The output of the map-reduce process.
-        """
-        if len(chunks) < 2:
-            raise ValueError(
-                "The map-reduce strategy is not needed to evaluate data "
-                "that fits within a single context window. "
-                "Consider using llm_classify instead."
-            )
-        model = self._model
-        mapped_records = []
-        for chunk in chunks:
-            map_prompt = self._map_prompt_template.format({"chunk": chunk})
-            intermediate_output = model(map_prompt)
-            mapped_records.append(intermediate_output)
-        reduce_prompt = self._reduce_prompt_template.format({"mapped": repr(mapped_records)})
-        return model(reduce_prompt)
-
-
-class Refiner:
-    """
-    Evaluates data that is too large to fit into a single context window using a
-    refine strategy. The data must first be divided into "chunks" that
-    individually fit into an LLM's context window. An initial "accumulator" is
-    generated from the first chunk of data. The accumulator is subsequently
-    refined by iteratively updating and incorporating new information from each
-    subsequent chunk. An optional synthesis step can be used to synthesize the
-    final accumulator into a desired format.
-    """
-
-    def __init__(
-        self,
-        model: BaseEvalModel,
-        initial_prompt_template: PromptTemplate,
-        refine_prompt_template: PromptTemplate,
-        synthesize_prompt_template: Optional[PromptTemplate] = None,
-    ) -> None:
-        """Initializes an instance.
-
-        Args:
-            model (BaseEvalModel): The LLM model to use for evaluation.
-
-            initial_prompt_template (PromptTemplate): The template for the
-            initial invocation of the model that will generate the initial
-            accumulator. Should contain the {chunk} placeholder.
-
-            refine_prompt_template (PromptTemplate): The template for refining
-            the accumulator across all subsequence chunks. Must contain the
-            {chunk} and {accumulator} placeholders.
-
-            synthesize_prompt_template (Optional[PromptTemplate], optional): An
-            optional template to synthesize the final version of the
-            accumulator. Must contain the {accumulator} placeholder.
-        """
-        self._model = model
-        self._initial_prompt_template = initial_prompt_template
-        self._refine_prompt_template = refine_prompt_template
-        self._synthesize_prompt_template = synthesize_prompt_template

-    def evaluate(self, chunks: List[str]) -> str:
-        """Evaluates a list of two or more chunks.
-
-        Args:
-            chunks (List[str]): A list of chunks to be evaluated. Each chunk is
-            inserted into the initial_prompt_template and refine_prompt_template
-            and must therefore fit within the LLM's context window and still
-            leave room for the rest of the prompt.
-
-        Returns:
-            str: The output of the refine process.
-        """
-        if len(chunks) < 2:
-            raise ValueError(
-                "The refine strategy is not needed to evaluate data "
-                "that fits within a single context window. "
-                "Consider using llm_classify instead."
-            )
-        model = self._model
-        initial_prompt = self._initial_prompt_template.format({"chunk": chunks[0]})
-        accumulator = model(initial_prompt)
-        for chunk in chunks[1:]:
-            refine_prompt = self._refine_prompt_template.format(
-                {"accumulator": accumulator, "chunk": chunk}
-            )
-            accumulator = model(refine_prompt)
-        if not self._synthesize_prompt_template:
-            return accumulator
-        reduce_prompt = self._synthesize_prompt_template.format({"accumulator": accumulator})
-        return model(reduce_prompt)
-
-
-def _extract_label_and_explanation(
-    unparsed_output: str,
-    template: ClassificationTemplate,
-    provide_explanation: bool,
-    use_openai_function_call: bool,
-    verbose: bool,
-) -> Tuple[str, Optional[str]]:
-    """
-    Extracts the label and explanation from the unparsed output.
-
-    Args:
-        unparsed_output (str): The raw output to be parsed.
-
-        template (ClassificationTemplate): The template used to generate the
-        output.
-
-        provide_explanation (bool): Whether the output includes an explanation.
-
-        use_openai_function_call (bool): Whether the output was generated using
-        function calling.
-
-        verbose (bool): If True, print verbose output to stdout.
-
-    Returns:
-        Tuple[str, Optional[str]]: A tuple containing the label and an
-        explanation (if one is provided).
-    """
-    if not use_openai_function_call:
-        if provide_explanation:
-            unrailed_label, explanation = (
-                template.extract_label_from_explanation(unparsed_output),
-                unparsed_output,
-            )
-            printif(
-                verbose and unrailed_label == NOT_PARSABLE,
-                f"- Could not parse {repr(unparsed_output)}",
-            )
-        else:
-            unrailed_label = unparsed_output
-            explanation = None
-    else:
-        unrailed_label, explanation = parse_openai_function_call(unparsed_output)
-    return snap_to_rail(unrailed_label, template.rails, verbose=verbose), explanation
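For reference, a minimal usage sketch of the evaluator API removed above, valid only against the 3.x releases. The record column names, the constructor taking a model, and the (label, score, explanation) return tuple follow the docstrings in the deleted module; the OpenAIModel construction and the record values are illustrative assumptions.

# Usage sketch of the removed API (arize-phoenix 3.x only; illustrative values)
from phoenix.experimental.evals import HallucinationEvaluator, OpenAIModel

model = OpenAIModel(model_name="gpt-4")  # assumption: any BaseEvalModel subclass works here
evaluator = HallucinationEvaluator(model)
label, score, explanation = evaluator.evaluate(
    {
        "input": "What is the capital of France?",  # the query
        "output": "The capital of France is Berlin.",  # the response being judged
        "reference": "Paris is the capital of France.",  # the retrieved document
    },
    provide_explanation=True,
)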