arize-phoenix 3.16.1__py3-none-any.whl → 7.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of arize-phoenix might be problematic. Click here for more details.
- arize_phoenix-7.7.0.dist-info/METADATA +261 -0
- arize_phoenix-7.7.0.dist-info/RECORD +345 -0
- {arize_phoenix-3.16.1.dist-info → arize_phoenix-7.7.0.dist-info}/WHEEL +1 -1
- arize_phoenix-7.7.0.dist-info/entry_points.txt +3 -0
- phoenix/__init__.py +86 -14
- phoenix/auth.py +309 -0
- phoenix/config.py +675 -45
- phoenix/core/model.py +32 -30
- phoenix/core/model_schema.py +102 -109
- phoenix/core/model_schema_adapter.py +48 -45
- phoenix/datetime_utils.py +24 -3
- phoenix/db/README.md +54 -0
- phoenix/db/__init__.py +4 -0
- phoenix/db/alembic.ini +85 -0
- phoenix/db/bulk_inserter.py +294 -0
- phoenix/db/engines.py +208 -0
- phoenix/db/enums.py +20 -0
- phoenix/db/facilitator.py +113 -0
- phoenix/db/helpers.py +159 -0
- phoenix/db/insertion/constants.py +2 -0
- phoenix/db/insertion/dataset.py +227 -0
- phoenix/db/insertion/document_annotation.py +171 -0
- phoenix/db/insertion/evaluation.py +191 -0
- phoenix/db/insertion/helpers.py +98 -0
- phoenix/db/insertion/span.py +193 -0
- phoenix/db/insertion/span_annotation.py +158 -0
- phoenix/db/insertion/trace_annotation.py +158 -0
- phoenix/db/insertion/types.py +256 -0
- phoenix/db/migrate.py +86 -0
- phoenix/db/migrations/data_migration_scripts/populate_project_sessions.py +199 -0
- phoenix/db/migrations/env.py +114 -0
- phoenix/db/migrations/script.py.mako +26 -0
- phoenix/db/migrations/versions/10460e46d750_datasets.py +317 -0
- phoenix/db/migrations/versions/3be8647b87d8_add_token_columns_to_spans_table.py +126 -0
- phoenix/db/migrations/versions/4ded9e43755f_create_project_sessions_table.py +66 -0
- phoenix/db/migrations/versions/cd164e83824f_users_and_tokens.py +157 -0
- phoenix/db/migrations/versions/cf03bd6bae1d_init.py +280 -0
- phoenix/db/models.py +807 -0
- phoenix/exceptions.py +5 -1
- phoenix/experiments/__init__.py +6 -0
- phoenix/experiments/evaluators/__init__.py +29 -0
- phoenix/experiments/evaluators/base.py +158 -0
- phoenix/experiments/evaluators/code_evaluators.py +184 -0
- phoenix/experiments/evaluators/llm_evaluators.py +473 -0
- phoenix/experiments/evaluators/utils.py +236 -0
- phoenix/experiments/functions.py +772 -0
- phoenix/experiments/tracing.py +86 -0
- phoenix/experiments/types.py +726 -0
- phoenix/experiments/utils.py +25 -0
- phoenix/inferences/__init__.py +0 -0
- phoenix/{datasets → inferences}/errors.py +6 -5
- phoenix/{datasets → inferences}/fixtures.py +49 -42
- phoenix/{datasets/dataset.py → inferences/inferences.py} +121 -105
- phoenix/{datasets → inferences}/schema.py +11 -11
- phoenix/{datasets → inferences}/validation.py +13 -14
- phoenix/logging/__init__.py +3 -0
- phoenix/logging/_config.py +90 -0
- phoenix/logging/_filter.py +6 -0
- phoenix/logging/_formatter.py +69 -0
- phoenix/metrics/__init__.py +5 -4
- phoenix/metrics/binning.py +4 -3
- phoenix/metrics/metrics.py +2 -1
- phoenix/metrics/mixins.py +7 -6
- phoenix/metrics/retrieval_metrics.py +2 -1
- phoenix/metrics/timeseries.py +5 -4
- phoenix/metrics/wrappers.py +9 -3
- phoenix/pointcloud/clustering.py +5 -5
- phoenix/pointcloud/pointcloud.py +7 -5
- phoenix/pointcloud/projectors.py +5 -6
- phoenix/pointcloud/umap_parameters.py +53 -52
- phoenix/server/api/README.md +28 -0
- phoenix/server/api/auth.py +44 -0
- phoenix/server/api/context.py +152 -9
- phoenix/server/api/dataloaders/__init__.py +91 -0
- phoenix/server/api/dataloaders/annotation_summaries.py +139 -0
- phoenix/server/api/dataloaders/average_experiment_run_latency.py +54 -0
- phoenix/server/api/dataloaders/cache/__init__.py +3 -0
- phoenix/server/api/dataloaders/cache/two_tier_cache.py +68 -0
- phoenix/server/api/dataloaders/dataset_example_revisions.py +131 -0
- phoenix/server/api/dataloaders/dataset_example_spans.py +38 -0
- phoenix/server/api/dataloaders/document_evaluation_summaries.py +144 -0
- phoenix/server/api/dataloaders/document_evaluations.py +31 -0
- phoenix/server/api/dataloaders/document_retrieval_metrics.py +89 -0
- phoenix/server/api/dataloaders/experiment_annotation_summaries.py +79 -0
- phoenix/server/api/dataloaders/experiment_error_rates.py +58 -0
- phoenix/server/api/dataloaders/experiment_run_annotations.py +36 -0
- phoenix/server/api/dataloaders/experiment_run_counts.py +49 -0
- phoenix/server/api/dataloaders/experiment_sequence_number.py +44 -0
- phoenix/server/api/dataloaders/latency_ms_quantile.py +188 -0
- phoenix/server/api/dataloaders/min_start_or_max_end_times.py +85 -0
- phoenix/server/api/dataloaders/project_by_name.py +31 -0
- phoenix/server/api/dataloaders/record_counts.py +116 -0
- phoenix/server/api/dataloaders/session_io.py +79 -0
- phoenix/server/api/dataloaders/session_num_traces.py +30 -0
- phoenix/server/api/dataloaders/session_num_traces_with_error.py +32 -0
- phoenix/server/api/dataloaders/session_token_usages.py +41 -0
- phoenix/server/api/dataloaders/session_trace_latency_ms_quantile.py +55 -0
- phoenix/server/api/dataloaders/span_annotations.py +26 -0
- phoenix/server/api/dataloaders/span_dataset_examples.py +31 -0
- phoenix/server/api/dataloaders/span_descendants.py +57 -0
- phoenix/server/api/dataloaders/span_projects.py +33 -0
- phoenix/server/api/dataloaders/token_counts.py +124 -0
- phoenix/server/api/dataloaders/trace_by_trace_ids.py +25 -0
- phoenix/server/api/dataloaders/trace_root_spans.py +32 -0
- phoenix/server/api/dataloaders/user_roles.py +30 -0
- phoenix/server/api/dataloaders/users.py +33 -0
- phoenix/server/api/exceptions.py +48 -0
- phoenix/server/api/helpers/__init__.py +12 -0
- phoenix/server/api/helpers/dataset_helpers.py +217 -0
- phoenix/server/api/helpers/experiment_run_filters.py +763 -0
- phoenix/server/api/helpers/playground_clients.py +948 -0
- phoenix/server/api/helpers/playground_registry.py +70 -0
- phoenix/server/api/helpers/playground_spans.py +455 -0
- phoenix/server/api/input_types/AddExamplesToDatasetInput.py +16 -0
- phoenix/server/api/input_types/AddSpansToDatasetInput.py +14 -0
- phoenix/server/api/input_types/ChatCompletionInput.py +38 -0
- phoenix/server/api/input_types/ChatCompletionMessageInput.py +24 -0
- phoenix/server/api/input_types/ClearProjectInput.py +15 -0
- phoenix/server/api/input_types/ClusterInput.py +2 -2
- phoenix/server/api/input_types/CreateDatasetInput.py +12 -0
- phoenix/server/api/input_types/CreateSpanAnnotationInput.py +18 -0
- phoenix/server/api/input_types/CreateTraceAnnotationInput.py +18 -0
- phoenix/server/api/input_types/DataQualityMetricInput.py +5 -2
- phoenix/server/api/input_types/DatasetExampleInput.py +14 -0
- phoenix/server/api/input_types/DatasetSort.py +17 -0
- phoenix/server/api/input_types/DatasetVersionSort.py +16 -0
- phoenix/server/api/input_types/DeleteAnnotationsInput.py +7 -0
- phoenix/server/api/input_types/DeleteDatasetExamplesInput.py +13 -0
- phoenix/server/api/input_types/DeleteDatasetInput.py +7 -0
- phoenix/server/api/input_types/DeleteExperimentsInput.py +7 -0
- phoenix/server/api/input_types/DimensionFilter.py +4 -4
- phoenix/server/api/input_types/GenerativeModelInput.py +17 -0
- phoenix/server/api/input_types/Granularity.py +1 -1
- phoenix/server/api/input_types/InvocationParameters.py +162 -0
- phoenix/server/api/input_types/PatchAnnotationInput.py +19 -0
- phoenix/server/api/input_types/PatchDatasetExamplesInput.py +35 -0
- phoenix/server/api/input_types/PatchDatasetInput.py +14 -0
- phoenix/server/api/input_types/PerformanceMetricInput.py +5 -2
- phoenix/server/api/input_types/ProjectSessionSort.py +29 -0
- phoenix/server/api/input_types/SpanAnnotationSort.py +17 -0
- phoenix/server/api/input_types/SpanSort.py +134 -69
- phoenix/server/api/input_types/TemplateOptions.py +10 -0
- phoenix/server/api/input_types/TraceAnnotationSort.py +17 -0
- phoenix/server/api/input_types/UserRoleInput.py +9 -0
- phoenix/server/api/mutations/__init__.py +28 -0
- phoenix/server/api/mutations/api_key_mutations.py +167 -0
- phoenix/server/api/mutations/chat_mutations.py +593 -0
- phoenix/server/api/mutations/dataset_mutations.py +591 -0
- phoenix/server/api/mutations/experiment_mutations.py +75 -0
- phoenix/server/api/{types/ExportEventsMutation.py → mutations/export_events_mutations.py} +21 -18
- phoenix/server/api/mutations/project_mutations.py +57 -0
- phoenix/server/api/mutations/span_annotations_mutations.py +128 -0
- phoenix/server/api/mutations/trace_annotations_mutations.py +127 -0
- phoenix/server/api/mutations/user_mutations.py +329 -0
- phoenix/server/api/openapi/__init__.py +0 -0
- phoenix/server/api/openapi/main.py +17 -0
- phoenix/server/api/openapi/schema.py +16 -0
- phoenix/server/api/queries.py +738 -0
- phoenix/server/api/routers/__init__.py +11 -0
- phoenix/server/api/routers/auth.py +284 -0
- phoenix/server/api/routers/embeddings.py +26 -0
- phoenix/server/api/routers/oauth2.py +488 -0
- phoenix/server/api/routers/v1/__init__.py +64 -0
- phoenix/server/api/routers/v1/datasets.py +1017 -0
- phoenix/server/api/routers/v1/evaluations.py +362 -0
- phoenix/server/api/routers/v1/experiment_evaluations.py +115 -0
- phoenix/server/api/routers/v1/experiment_runs.py +167 -0
- phoenix/server/api/routers/v1/experiments.py +308 -0
- phoenix/server/api/routers/v1/pydantic_compat.py +78 -0
- phoenix/server/api/routers/v1/spans.py +267 -0
- phoenix/server/api/routers/v1/traces.py +208 -0
- phoenix/server/api/routers/v1/utils.py +95 -0
- phoenix/server/api/schema.py +44 -241
- phoenix/server/api/subscriptions.py +597 -0
- phoenix/server/api/types/Annotation.py +21 -0
- phoenix/server/api/types/AnnotationSummary.py +55 -0
- phoenix/server/api/types/AnnotatorKind.py +16 -0
- phoenix/server/api/types/ApiKey.py +27 -0
- phoenix/server/api/types/AuthMethod.py +9 -0
- phoenix/server/api/types/ChatCompletionMessageRole.py +11 -0
- phoenix/server/api/types/ChatCompletionSubscriptionPayload.py +46 -0
- phoenix/server/api/types/Cluster.py +25 -24
- phoenix/server/api/types/CreateDatasetPayload.py +8 -0
- phoenix/server/api/types/DataQualityMetric.py +31 -13
- phoenix/server/api/types/Dataset.py +288 -63
- phoenix/server/api/types/DatasetExample.py +85 -0
- phoenix/server/api/types/DatasetExampleRevision.py +34 -0
- phoenix/server/api/types/DatasetVersion.py +14 -0
- phoenix/server/api/types/Dimension.py +32 -31
- phoenix/server/api/types/DocumentEvaluationSummary.py +9 -8
- phoenix/server/api/types/EmbeddingDimension.py +56 -49
- phoenix/server/api/types/Evaluation.py +25 -31
- phoenix/server/api/types/EvaluationSummary.py +30 -50
- phoenix/server/api/types/Event.py +20 -20
- phoenix/server/api/types/ExampleRevisionInterface.py +14 -0
- phoenix/server/api/types/Experiment.py +152 -0
- phoenix/server/api/types/ExperimentAnnotationSummary.py +13 -0
- phoenix/server/api/types/ExperimentComparison.py +17 -0
- phoenix/server/api/types/ExperimentRun.py +119 -0
- phoenix/server/api/types/ExperimentRunAnnotation.py +56 -0
- phoenix/server/api/types/GenerativeModel.py +9 -0
- phoenix/server/api/types/GenerativeProvider.py +85 -0
- phoenix/server/api/types/Inferences.py +80 -0
- phoenix/server/api/types/InferencesRole.py +23 -0
- phoenix/server/api/types/LabelFraction.py +7 -0
- phoenix/server/api/types/MimeType.py +2 -2
- phoenix/server/api/types/Model.py +54 -54
- phoenix/server/api/types/PerformanceMetric.py +8 -5
- phoenix/server/api/types/Project.py +407 -142
- phoenix/server/api/types/ProjectSession.py +139 -0
- phoenix/server/api/types/Segments.py +4 -4
- phoenix/server/api/types/Span.py +221 -176
- phoenix/server/api/types/SpanAnnotation.py +43 -0
- phoenix/server/api/types/SpanIOValue.py +15 -0
- phoenix/server/api/types/SystemApiKey.py +9 -0
- phoenix/server/api/types/TemplateLanguage.py +10 -0
- phoenix/server/api/types/TimeSeries.py +19 -15
- phoenix/server/api/types/TokenUsage.py +11 -0
- phoenix/server/api/types/Trace.py +154 -0
- phoenix/server/api/types/TraceAnnotation.py +45 -0
- phoenix/server/api/types/UMAPPoints.py +7 -7
- phoenix/server/api/types/User.py +60 -0
- phoenix/server/api/types/UserApiKey.py +45 -0
- phoenix/server/api/types/UserRole.py +15 -0
- phoenix/server/api/types/node.py +4 -112
- phoenix/server/api/types/pagination.py +156 -57
- phoenix/server/api/utils.py +34 -0
- phoenix/server/app.py +864 -115
- phoenix/server/bearer_auth.py +163 -0
- phoenix/server/dml_event.py +136 -0
- phoenix/server/dml_event_handler.py +256 -0
- phoenix/server/email/__init__.py +0 -0
- phoenix/server/email/sender.py +97 -0
- phoenix/server/email/templates/__init__.py +0 -0
- phoenix/server/email/templates/password_reset.html +19 -0
- phoenix/server/email/types.py +11 -0
- phoenix/server/grpc_server.py +102 -0
- phoenix/server/jwt_store.py +505 -0
- phoenix/server/main.py +305 -116
- phoenix/server/oauth2.py +52 -0
- phoenix/server/openapi/__init__.py +0 -0
- phoenix/server/prometheus.py +111 -0
- phoenix/server/rate_limiters.py +188 -0
- phoenix/server/static/.vite/manifest.json +87 -0
- phoenix/server/static/assets/components-Cy9nwIvF.js +2125 -0
- phoenix/server/static/assets/index-BKvHIxkk.js +113 -0
- phoenix/server/static/assets/pages-CUi2xCVQ.js +4449 -0
- phoenix/server/static/assets/vendor-DvC8cT4X.js +894 -0
- phoenix/server/static/assets/vendor-DxkFTwjz.css +1 -0
- phoenix/server/static/assets/vendor-arizeai-Do1793cv.js +662 -0
- phoenix/server/static/assets/vendor-codemirror-BzwZPyJM.js +24 -0
- phoenix/server/static/assets/vendor-recharts-_Jb7JjhG.js +59 -0
- phoenix/server/static/assets/vendor-shiki-Cl9QBraO.js +5 -0
- phoenix/server/static/assets/vendor-three-DwGkEfCM.js +2998 -0
- phoenix/server/telemetry.py +68 -0
- phoenix/server/templates/index.html +82 -23
- phoenix/server/thread_server.py +3 -3
- phoenix/server/types.py +275 -0
- phoenix/services.py +27 -18
- phoenix/session/client.py +743 -68
- phoenix/session/data_extractor.py +31 -7
- phoenix/session/evaluation.py +3 -9
- phoenix/session/session.py +263 -219
- phoenix/settings.py +22 -0
- phoenix/trace/__init__.py +2 -22
- phoenix/trace/attributes.py +338 -0
- phoenix/trace/dsl/README.md +116 -0
- phoenix/trace/dsl/filter.py +663 -213
- phoenix/trace/dsl/helpers.py +73 -21
- phoenix/trace/dsl/query.py +574 -201
- phoenix/trace/exporter.py +24 -19
- phoenix/trace/fixtures.py +368 -32
- phoenix/trace/otel.py +71 -219
- phoenix/trace/projects.py +3 -2
- phoenix/trace/schemas.py +33 -11
- phoenix/trace/span_evaluations.py +21 -16
- phoenix/trace/span_json_decoder.py +6 -4
- phoenix/trace/span_json_encoder.py +2 -2
- phoenix/trace/trace_dataset.py +47 -32
- phoenix/trace/utils.py +21 -4
- phoenix/utilities/__init__.py +0 -26
- phoenix/utilities/client.py +132 -0
- phoenix/utilities/deprecation.py +31 -0
- phoenix/utilities/error_handling.py +3 -2
- phoenix/utilities/json.py +109 -0
- phoenix/utilities/logging.py +8 -0
- phoenix/utilities/project.py +2 -2
- phoenix/utilities/re.py +49 -0
- phoenix/utilities/span_store.py +0 -23
- phoenix/utilities/template_formatters.py +99 -0
- phoenix/version.py +1 -1
- arize_phoenix-3.16.1.dist-info/METADATA +0 -495
- arize_phoenix-3.16.1.dist-info/RECORD +0 -178
- phoenix/core/project.py +0 -619
- phoenix/core/traces.py +0 -96
- phoenix/experimental/evals/__init__.py +0 -73
- phoenix/experimental/evals/evaluators.py +0 -413
- phoenix/experimental/evals/functions/__init__.py +0 -4
- phoenix/experimental/evals/functions/classify.py +0 -453
- phoenix/experimental/evals/functions/executor.py +0 -353
- phoenix/experimental/evals/functions/generate.py +0 -138
- phoenix/experimental/evals/functions/processing.py +0 -76
- phoenix/experimental/evals/models/__init__.py +0 -14
- phoenix/experimental/evals/models/anthropic.py +0 -175
- phoenix/experimental/evals/models/base.py +0 -170
- phoenix/experimental/evals/models/bedrock.py +0 -221
- phoenix/experimental/evals/models/litellm.py +0 -134
- phoenix/experimental/evals/models/openai.py +0 -448
- phoenix/experimental/evals/models/rate_limiters.py +0 -246
- phoenix/experimental/evals/models/vertex.py +0 -173
- phoenix/experimental/evals/models/vertexai.py +0 -186
- phoenix/experimental/evals/retrievals.py +0 -96
- phoenix/experimental/evals/templates/__init__.py +0 -50
- phoenix/experimental/evals/templates/default_templates.py +0 -472
- phoenix/experimental/evals/templates/template.py +0 -195
- phoenix/experimental/evals/utils/__init__.py +0 -172
- phoenix/experimental/evals/utils/threads.py +0 -27
- phoenix/server/api/helpers.py +0 -11
- phoenix/server/api/routers/evaluation_handler.py +0 -109
- phoenix/server/api/routers/span_handler.py +0 -70
- phoenix/server/api/routers/trace_handler.py +0 -60
- phoenix/server/api/types/DatasetRole.py +0 -23
- phoenix/server/static/index.css +0 -6
- phoenix/server/static/index.js +0 -7447
- phoenix/storage/span_store/__init__.py +0 -23
- phoenix/storage/span_store/text_file.py +0 -85
- phoenix/trace/dsl/missing.py +0 -60
- phoenix/trace/langchain/__init__.py +0 -3
- phoenix/trace/langchain/instrumentor.py +0 -35
- phoenix/trace/llama_index/__init__.py +0 -3
- phoenix/trace/llama_index/callback.py +0 -102
- phoenix/trace/openai/__init__.py +0 -3
- phoenix/trace/openai/instrumentor.py +0 -30
- {arize_phoenix-3.16.1.dist-info → arize_phoenix-7.7.0.dist-info}/licenses/IP_NOTICE +0 -0
- {arize_phoenix-3.16.1.dist-info → arize_phoenix-7.7.0.dist-info}/licenses/LICENSE +0 -0
- /phoenix/{datasets → db/insertion}/__init__.py +0 -0
- /phoenix/{experimental → db/migrations}/__init__.py +0 -0
- /phoenix/{storage → db/migrations/data_migration_scripts}/__init__.py +0 -0
|
@@ -0,0 +1,726 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import textwrap
|
|
5
|
+
from collections import Counter
|
|
6
|
+
from collections.abc import (
|
|
7
|
+
Awaitable,
|
|
8
|
+
Callable,
|
|
9
|
+
Iterable,
|
|
10
|
+
Iterator,
|
|
11
|
+
Mapping,
|
|
12
|
+
)
|
|
13
|
+
from copy import copy, deepcopy
|
|
14
|
+
from dataclasses import dataclass, field, fields
|
|
15
|
+
from datetime import datetime
|
|
16
|
+
from enum import Enum
|
|
17
|
+
from functools import cached_property
|
|
18
|
+
from importlib.metadata import version
|
|
19
|
+
from random import getrandbits
|
|
20
|
+
from typing import Any, Optional, TypeVar, Union, cast, overload
|
|
21
|
+
|
|
22
|
+
import pandas as pd
|
|
23
|
+
from typing_extensions import TypeAlias
|
|
24
|
+
from wrapt import ObjectProxy
|
|
25
|
+
|
|
26
|
+
from phoenix.datetime_utils import local_now
|
|
27
|
+
from phoenix.experiments.utils import get_experiment_url
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class AnnotatorKind(Enum):
    """Enumerates the supported annotator kinds; each value equals its name."""

    CODE = "CODE"
    LLM = "LLM"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
JSONSerializable: TypeAlias = Optional[Union[dict[str, Any], list[Any], str, int, float, bool]]
|
|
36
|
+
ExperimentId: TypeAlias = str
|
|
37
|
+
DatasetId: TypeAlias = str
|
|
38
|
+
DatasetVersionId: TypeAlias = str
|
|
39
|
+
ExampleId: TypeAlias = str
|
|
40
|
+
RepetitionNumber: TypeAlias = int
|
|
41
|
+
ExperimentRunId: TypeAlias = str
|
|
42
|
+
TraceId: TypeAlias = str
|
|
43
|
+
|
|
44
|
+
TaskOutput: TypeAlias = JSONSerializable
|
|
45
|
+
|
|
46
|
+
ExampleOutput: TypeAlias = Mapping[str, JSONSerializable]
|
|
47
|
+
ExampleMetadata: TypeAlias = Mapping[str, JSONSerializable]
|
|
48
|
+
ExampleInput: TypeAlias = Mapping[str, JSONSerializable]
|
|
49
|
+
|
|
50
|
+
Score: TypeAlias = Optional[Union[bool, int, float]]
|
|
51
|
+
Label: TypeAlias = Optional[str]
|
|
52
|
+
Explanation: TypeAlias = Optional[str]
|
|
53
|
+
|
|
54
|
+
EvaluatorName: TypeAlias = str
|
|
55
|
+
EvaluatorKind: TypeAlias = str
|
|
56
|
+
EvaluatorOutput: TypeAlias = Union[
|
|
57
|
+
"EvaluationResult", bool, int, float, str, tuple[Score, Label, Explanation]
|
|
58
|
+
]
|
|
59
|
+
|
|
60
|
+
DRY_RUN: ExperimentId = "DRY_RUN"
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _dry_run_id() -> str:
    """Return ``DRY_RUN`` joined by an underscore to six random hex digits."""
    # 24 random bits rendered as zero-padded lowercase hex; this is the same
    # text as the hex dump of the value's 3-byte big-endian encoding.
    return f"{DRY_RUN}_{getrandbits(24):06x}"
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@dataclass(frozen=True)
class Example:
    """
    A frozen record holding an example's id, update timestamp and its
    ``input`` / ``output`` / ``metadata`` mappings.

    Right after construction each of the three mappings is replaced by the
    result of ``_make_read_only``.
    """

    id: ExampleId
    updated_at: datetime
    input: Mapping[str, JSONSerializable] = field(default_factory=dict)
    output: Mapping[str, JSONSerializable] = field(default_factory=dict)
    metadata: Mapping[str, JSONSerializable] = field(default_factory=dict)

    def __post_init__(self) -> None:
        # The dataclass is frozen, so attributes are rebound via object.__setattr__.
        for attr in ("input", "output", "metadata"):
            object.__setattr__(self, attr, _make_read_only(getattr(self, attr)))

    @classmethod
    def from_dict(cls, obj: Mapping[str, Any]) -> Example:
        """
        Build an ``Example`` from a mapping.

        "input", "output", "id" and "updated_at" are required keys; a missing
        or falsy "metadata" becomes an empty dict.
        """
        kwargs = {
            "input": obj["input"],
            "output": obj["output"],
            "metadata": obj.get("metadata") or {},
            "id": obj["id"],
            "updated_at": obj["updated_at"],
        }
        return cls(**kwargs)

    def __repr__(self) -> str:
        """Render the id plus every non-empty mapping as indented, shortened JSON."""
        indent = " " * 4
        lines = [f"{self.__class__.__name__}(", f'{indent}id="{self.id}",']
        for key in ("input", "output", "metadata"):
            value = getattr(self, key, None)
            if not value:
                # Empty mappings are left out of the representation.
                continue
            prefix = f"{indent}{_blue(key)}="
            dumped = json.dumps(
                _shorten(value),
                ensure_ascii=False,
                sort_keys=True,
                indent=len(indent),
            )
            # Shift the JSON body one level right so it nests under the class name.
            dumped = dumped.replace("\n", f"\n{indent}")
            dumped = dumped.replace(' "..."\n', " ...\n")
            lines.append(prefix + dumped + ",")
        lines.append(")")
        return "\n".join(lines)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
@dataclass(frozen=True)
class Dataset:
    """
    A frozen collection of examples keyed by example id.

    Supports ``len()``, iteration over the examples, and positional access by
    integer index or slice (in the iteration order of ``examples``).
    """

    id: DatasetId
    version_id: DatasetVersionId
    examples: Mapping[ExampleId, Example] = field(repr=False, default_factory=dict)

    def __post_init__(self) -> None:
        # The dataclass is frozen, so the attribute is rebound via object.__setattr__.
        object.__setattr__(self, "examples", _ReadOnly(self.examples))

    def __len__(self) -> int:
        return len(self.examples)

    def __iter__(self) -> Iterator[Example]:
        return iter(self.examples.values())

    @cached_property
    def _keys(self) -> tuple[str, ...]:
        # Snapshot of the example ids, computed once, used for positional lookup.
        return tuple(self.examples.keys())

    @overload
    def __getitem__(self, key: int) -> Example: ...
    @overload
    def __getitem__(self, key: slice) -> list[Example]: ...
    def __getitem__(self, key: Union[int, slice]) -> Union[Example, list[Example]]:
        """Return the example at position ``key``, or a list of examples for a slice."""
        if isinstance(key, int):
            return self.examples[self._keys[key]]
        return [self.examples[k] for k in self._keys[key]]

    def as_dataframe(self, drop_empty_columns: bool = True) -> pd.DataFrame:
        """
        Return the examples as a DataFrame indexed by ``example_id``.

        Each row carries deep copies of the example's ``input``, ``output``
        and ``metadata``. When ``drop_empty_columns`` is true, columns whose
        values are all falsy are omitted. A dataset without examples yields
        an empty DataFrame.
        """
        records = [
            {
                "example_id": example.id,
                "input": deepcopy(example.input),
                "output": deepcopy(example.output),
                "metadata": deepcopy(example.metadata),
            }
            for example in self.examples.values()
        ]
        # Naming the columns explicitly keeps "example_id" available for
        # set_index even when there are no records at all.
        df = pd.DataFrame.from_records(
            records,
            columns=["example_id", "input", "output", "metadata"],
        ).set_index("example_id")
        if drop_empty_columns:
            return df.reindex([k for k, v in df.items() if v.astype(bool).any()], axis=1)
        return df

    @classmethod
    def from_dict(cls, obj: Mapping[str, Any]) -> Dataset:
        """
        Build a ``Dataset`` from a mapping with "dataset_id" and "version_id"
        keys and an optional "examples" sequence of example mappings.
        """
        examples = tuple(map(Example.from_dict, obj.get("examples") or ()))
        return cls(
            id=obj["dataset_id"],
            version_id=obj["version_id"],
            examples={ex.id: ex for ex in examples},
        )
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
@dataclass(frozen=True)
class TestCase:
    """Frozen pairing of an example with a repetition number."""

    example: Example
    repetition_number: RepetitionNumber
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
@dataclass(frozen=True)
class Experiment:
    """Frozen record of an experiment; ``project_name`` is left out of the repr."""

    id: ExperimentId
    dataset_id: DatasetId
    dataset_version_id: DatasetVersionId
    repetitions: int
    project_name: str = field(repr=False)

    @classmethod
    def from_dict(cls, obj: Mapping[str, Any]) -> Experiment:
        """
        Build an ``Experiment`` from a mapping.

        "id", "dataset_id" and "dataset_version_id" are required keys. A
        missing or falsy "repetitions" becomes 1 and a missing or falsy
        "project_name" becomes the empty string.
        """
        kwargs = {
            "id": obj["id"],
            "dataset_id": obj["dataset_id"],
            "dataset_version_id": obj["dataset_version_id"],
            "repetitions": obj.get("repetitions") or 1,
            "project_name": obj.get("project_name") or "",
        }
        return cls(**kwargs)
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
@dataclass(frozen=True)
class ExperimentRun:
    """
    Frozen record of one run against a dataset example.

    Construction fails with ``ValueError`` when both ``output`` and ``error``
    are ``None``. When no ``id`` is given, one is produced by ``_dry_run_id``.
    """

    start_time: datetime
    end_time: datetime
    experiment_id: ExperimentId
    dataset_example_id: ExampleId
    repetition_number: RepetitionNumber
    output: JSONSerializable
    error: Optional[str] = None
    id: ExperimentRunId = field(default_factory=_dry_run_id)
    trace_id: Optional[TraceId] = None

    @classmethod
    def from_dict(cls, obj: Mapping[str, Any]) -> ExperimentRun:
        """
        Build an ``ExperimentRun`` from a mapping.

        A missing or falsy "repetition_number" becomes 1; "output" is passed
        through ``_make_read_only``; "error" and "trace_id" are optional.
        """
        kwargs = {
            "start_time": obj["start_time"],
            "end_time": obj["end_time"],
            "experiment_id": obj["experiment_id"],
            "dataset_example_id": obj["dataset_example_id"],
            "repetition_number": obj.get("repetition_number") or 1,
            "output": _make_read_only(obj.get("output")),
            "error": obj.get("error"),
            "id": obj["id"],
            "trace_id": obj.get("trace_id"),
        }
        return cls(**kwargs)

    def __post_init__(self) -> None:
        # A run is acceptable as soon as either an output or an error is present.
        if self.output is not None or self.error is not None:
            return
        raise ValueError("Must specify exactly one of experiment_run_output or error")
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
@dataclass(frozen=True)
class EvaluationResult:
    """
    Frozen result of an evaluation: an optional score, label and explanation
    plus a metadata mapping.

    Construction fails with ``ValueError`` unless a score or a non-empty
    label is given. ``label`` and ``explanation`` are converted with ``str()``
    when not ``None``, and an empty string is stored as ``None``.
    """

    score: Optional[float] = None
    label: Optional[str] = None
    explanation: Optional[str] = None
    metadata: Mapping[str, JSONSerializable] = field(default_factory=dict)

    @classmethod
    def from_dict(cls, obj: Optional[Mapping[str, Any]]) -> Optional[EvaluationResult]:
        """
        Build an ``EvaluationResult`` from a mapping.

        Returns ``None`` when ``obj`` is ``None`` or empty. A missing or falsy
        "metadata" becomes an empty dict.
        """
        if not obj:
            return None
        return cls(
            score=obj.get("score"),
            label=obj.get("label"),
            explanation=obj.get("explanation"),
            metadata=obj.get("metadata") or {},
        )

    def __post_init__(self) -> None:
        if self.score is None and not self.label:
            raise ValueError("Must specify score or label, or both")
        # The dataclass is frozen, so normalized values are written through
        # object.__setattr__.
        for k in ("label", "explanation"):
            if (v := getattr(self, k, None)) is not None:
                object.__setattr__(self, k, str(v) or None)
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
@dataclass(frozen=True)
class ExperimentEvaluationRun:
    """
    Frozen record of one evaluation of an experiment run.

    Construction fails with ``ValueError`` when both ``result`` and ``error``
    are ``None``. When no ``id`` is given, one is produced by ``_dry_run_id``.
    """

    experiment_run_id: ExperimentRunId
    start_time: datetime
    end_time: datetime
    name: str
    annotator_kind: str
    error: Optional[str] = None
    result: Optional[EvaluationResult] = None
    id: str = field(default_factory=_dry_run_id)
    trace_id: Optional[TraceId] = None

    @classmethod
    def from_dict(cls, obj: Mapping[str, Any]) -> ExperimentEvaluationRun:
        """
        Build an ``ExperimentEvaluationRun`` from a mapping.

        "result" is parsed with ``EvaluationResult.from_dict``; "error" and
        "trace_id" are optional, all other keys are required.
        """
        kwargs = {
            "experiment_run_id": obj["experiment_run_id"],
            "start_time": obj["start_time"],
            "end_time": obj["end_time"],
            "name": obj["name"],
            "annotator_kind": obj["annotator_kind"],
            "error": obj.get("error"),
            "result": EvaluationResult.from_dict(obj.get("result")),
            "id": obj["id"],
            "trace_id": obj.get("trace_id"),
        }
        return cls(**kwargs)

    def __post_init__(self) -> None:
        # Acceptable as soon as either a result or an error is present.
        if self.result is not None or self.error is not None:
            return
        raise ValueError("Must specify either result or error")
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
ExperimentTask: TypeAlias = Union[
|
|
283
|
+
Callable[[Example], TaskOutput],
|
|
284
|
+
Callable[[Example], Awaitable[TaskOutput]],
|
|
285
|
+
]
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
@dataclass(frozen=True)
class ExperimentParameters:
    """Frozen pair of counts: number of examples and repetitions (default 1)."""

    n_examples: int
    n_repetitions: int = 1

    @property
    def count(self) -> int:
        """Product of ``n_examples`` and ``n_repetitions``."""
        total = self.n_examples * self.n_repetitions
        return total
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
@dataclass(frozen=True)
class EvaluationParameters:
    """Frozen pairing of a set of evaluator names with experiment parameters."""

    eval_names: frozenset[str]
    exp_params: ExperimentParameters
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
@dataclass(frozen=True)
class _HasStats:
    """
    Base for summaries that carry a ``stats`` DataFrame and render it under a
    timestamped title.
    """

    _title: str = field(repr=False, default="")
    _timestamp: datetime = field(repr=False, default_factory=local_now)
    stats: pd.DataFrame = field(repr=False, default_factory=pd.DataFrame)

    @property
    def title(self) -> str:
        """The title followed by the formatted timestamp in parentheses."""
        return f"{self._title} ({self._timestamp:%x %I:%M %p %z})"

    def __str__(self) -> str:
        """
        Return the title, an underline of dashes, and the stats table.

        The table is rendered with ``DataFrame.to_markdown()`` when the
        installed pandas is at least 1.0 and ``tabulate`` is importable;
        otherwise the DataFrame's plain string form is used.
        """
        try:
            pandas_major = int(version("pandas").split(".")[0])
            # `tabulate` is used by pandas >= 1.0 in DataFrame.to_markdown()
            import tabulate  # noqa: F401
        except ImportError:
            # Also covers PackageNotFoundError, which subclasses ImportError.
            can_render_markdown = False
        else:
            # Explicit comparison rather than `assert`, so that the version
            # gate is still enforced when Python runs with -O.
            can_render_markdown = pandas_major >= 1
        if can_render_markdown:
            text = self.stats.to_markdown(index=False)
        else:
            text = self.stats.__str__()
        heading = self.title
        return f"{heading}\n{'-' * len(heading)}\n" + text
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
@dataclass(frozen=True)
|
|
327
|
+
class EvaluationSummary(_HasStats):
|
|
328
|
+
"""
|
|
329
|
+
Summary statistics of experiment evaluations.
|
|
330
|
+
|
|
331
|
+
Users should not instantiate this directly.
|
|
332
|
+
"""
|
|
333
|
+
|
|
334
|
+
_title: str = "Experiment Summary"
|
|
335
|
+
|
|
336
|
+
@classmethod
|
|
337
|
+
def from_eval_runs(
|
|
338
|
+
cls,
|
|
339
|
+
params: EvaluationParameters,
|
|
340
|
+
*eval_runs: Optional[ExperimentEvaluationRun],
|
|
341
|
+
) -> EvaluationSummary:
|
|
342
|
+
df = pd.DataFrame.from_records(
|
|
343
|
+
[
|
|
344
|
+
{
|
|
345
|
+
"evaluator": run.name,
|
|
346
|
+
"error": run.error,
|
|
347
|
+
"score": run.result.score if run.result else None,
|
|
348
|
+
"label": run.result.label if run.result else None,
|
|
349
|
+
}
|
|
350
|
+
for run in eval_runs
|
|
351
|
+
if run is not None
|
|
352
|
+
]
|
|
353
|
+
)
|
|
354
|
+
if df.empty:
|
|
355
|
+
df = pd.DataFrame.from_records(
|
|
356
|
+
[
|
|
357
|
+
{"evaluator": name, "error": None, "score": None, "label": None}
|
|
358
|
+
for name in params.eval_names
|
|
359
|
+
]
|
|
360
|
+
)
|
|
361
|
+
has_error = bool(df.loc[:, "error"].astype(bool).sum())
|
|
362
|
+
has_score = bool(df.loc[:, "score"].dropna().count())
|
|
363
|
+
has_label = bool(df.loc[:, "label"].astype(bool).sum())
|
|
364
|
+
agg = {
|
|
365
|
+
**(
|
|
366
|
+
dict(n_errors=("error", "count"), top_error=("error", _top_string))
|
|
367
|
+
if has_error
|
|
368
|
+
else {}
|
|
369
|
+
),
|
|
370
|
+
**(dict(n_scores=("score", "count"), avg_score=("score", "mean")) if has_score else {}),
|
|
371
|
+
**(
|
|
372
|
+
dict(
|
|
373
|
+
n_labels=("label", "count"),
|
|
374
|
+
top_2_labels=(
|
|
375
|
+
"label",
|
|
376
|
+
lambda s: (dict(Counter(s.dropna()).most_common(2)) or None),
|
|
377
|
+
),
|
|
378
|
+
)
|
|
379
|
+
if has_label
|
|
380
|
+
else {}
|
|
381
|
+
),
|
|
382
|
+
}
|
|
383
|
+
stats = (
|
|
384
|
+
df.groupby("evaluator").agg(**agg) # type: ignore[call-overload]
|
|
385
|
+
if agg
|
|
386
|
+
else pd.DataFrame()
|
|
387
|
+
)
|
|
388
|
+
sorted_eval_names = sorted(params.eval_names)
|
|
389
|
+
eval_names = pd.DataFrame(
|
|
390
|
+
{
|
|
391
|
+
"evaluator": sorted_eval_names,
|
|
392
|
+
"n": [params.exp_params.count] * len(sorted_eval_names),
|
|
393
|
+
}
|
|
394
|
+
).set_index("evaluator")
|
|
395
|
+
stats = pd.concat([eval_names, stats], axis=1).reset_index()
|
|
396
|
+
summary: EvaluationSummary = object.__new__(cls)
|
|
397
|
+
summary.__init__(stats=stats) # type: ignore[misc]
|
|
398
|
+
return summary
|
|
399
|
+
|
|
400
|
+
@classmethod
def __new__(cls, *args: Any, **kwargs: Any) -> Any:
    """
    Refuse direct construction of this class.

    Direct instantiation by users is discouraged, so any call that reaches
    this method raises. The factory classmethod of this class sidesteps it
    by allocating with `object.__new__(cls)` and calling `__init__` itself.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError
|
|
404
|
+
|
|
405
|
+
@classmethod
def __init_subclass__(cls, **kwargs: Any) -> None:
    """
    Refuse sub-classing of this class.

    Direct sub-classing by users is discouraged, so defining a subclass
    triggers this hook and fails.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError
|
|
409
|
+
|
|
410
|
+
|
|
411
|
+
@dataclass(frozen=True)
class TaskSummary(_HasStats):
    """
    Summary statistics of experiment task executions.

    The statistics are a single-row DataFrame holding the number of examples,
    the number of task runs, the number of runs that reported an error and,
    when at least one error occurred, the most frequent error message.

    **Users should not instantiate this object directly.**
    """

    _title: str = "Tasks Summary"

    @classmethod
    def from_task_runs(
        cls,
        params: ExperimentParameters,
        task_runs: Iterable[Optional[ExperimentRun]],
    ) -> "TaskSummary":
        """
        Build the summary from the task runs of an experiment.

        Args:
            params: experiment parameters; `params.count` is reported as
                `n_examples`.
            task_runs: the task runs to summarize. `None` entries are ignored.

        Returns:
            A `TaskSummary` whose stats frame has one row with `n_examples`,
            `n_runs`, `n_errors` and, only if there were errors, `top_error`.
        """
        records = [
            {"example_id": run.dataset_example_id, "error": run.error}
            for run in task_runs
            if run is not None
        ]
        df = pd.DataFrame.from_records(records)
        if df.empty:
            # No runs means no "error" column to inspect.
            n_errors = 0
        else:
            # A run counts as failed when its error value is truthy.
            n_errors = df.loc[:, "error"].astype(bool).sum()
        record: dict[str, Any] = {
            "n_examples": params.count,
            "n_runs": len(df),
            "n_errors": n_errors,
        }
        if n_errors:
            record["top_error"] = _top_string(df.loc[:, "error"])
        stats = pd.DataFrame.from_records([record])
        # `__new__` is blocked below, so allocate through `object` and
        # initialize the instance by hand.
        summary: TaskSummary = object.__new__(cls)
        summary.__init__(stats=stats)  # type: ignore[misc]
        return summary

    @classmethod
    def __new__(cls, *args: Any, **kwargs: Any) -> Any:
        # Direct instantiation by users is discouraged.
        raise NotImplementedError

    @classmethod
    def __init_subclass__(cls, **kwargs: Any) -> None:
        # Direct sub-classing by users is discouraged.
        raise NotImplementedError
|
|
459
|
+
|
|
460
|
+
|
|
461
|
+
def _top_string(s: "pd.Series[Any]", length: int = 100) -> Optional[str]:
|
|
462
|
+
if (cnt := s.dropna().str.slice(0, length).value_counts()).empty:
|
|
463
|
+
return None
|
|
464
|
+
return cast(str, cnt.sort_values(ascending=False).index[0])
|
|
465
|
+
|
|
466
|
+
|
|
467
|
+
@dataclass(frozen=True)
class RanExperiment(Experiment):
    """
    An experiment that has been run.

    Bundles the task runs, the dataset they were run against, the task
    summary, and any evaluation runs and evaluation summaries added so far.
    The object behaves like a read-only sequence of its task runs (`len`,
    iteration, integer and slice indexing).

    **Users should not instantiate this object directly.**
    """

    params: ExperimentParameters = field(repr=False)
    dataset: Dataset = field(repr=False)
    runs: Mapping[ExperimentRunId, ExperimentRun] = field(repr=False)
    task_summary: TaskSummary = field(repr=False)
    eval_runs: tuple[ExperimentEvaluationRun, ...] = field(repr=False, default=())
    eval_summaries: tuple[EvaluationSummary, ...] = field(repr=False, default=())

    @property
    def url(self) -> str:
        """URL of this experiment, as built by `get_experiment_url`."""
        return get_experiment_url(dataset_id=self.dataset.id, experiment_id=self.id)

    @property
    def info(self) -> str:
        """One-line message pointing at `url`."""
        return f"🔗 View this experiment: {self.url}"

    def __post_init__(self) -> None:
        # Pair every run with its dataset example when the example is known;
        # runs whose example is missing from the dataset are kept as they are.
        runs = {
            id_: (
                _ExperimentRunWithExample(run, example)
                if (example := self.dataset.examples.get(run.dataset_example_id))
                else run
            )
            for id_, run in self.runs.items()
        }
        # The dataclass is frozen, so the attribute is set through `object`.
        object.__setattr__(self, "runs", runs)

    def __len__(self) -> int:
        return len(self.runs)

    def __iter__(self) -> Iterator[ExperimentRun]:
        return iter(self.runs.values())

    @cached_property
    def _keys(self) -> tuple[str, ...]:
        # Snapshot of the run ids, used for positional indexing.
        return tuple(self.runs.keys())

    @overload
    def __getitem__(self, key: int) -> ExperimentRun: ...
    @overload
    def __getitem__(self, key: slice) -> list[ExperimentRun]: ...
    def __getitem__(self, key: Union[int, slice]) -> Union[ExperimentRun, list[ExperimentRun]]:
        """Return the run at position `key`, or the list of runs for a slice."""
        if isinstance(key, int):
            return self.runs[self._keys[key]]
        return [self.runs[k] for k in self._keys[key]]

    @staticmethod
    def _drop_blank_columns(df: pd.DataFrame) -> pd.DataFrame:
        """
        Return `df` restricted to the columns that hold at least one value.

        A cell is blank when it is null (`None`/NaN) or an empty string or
        container. Numbers and booleans always count as values, so a column of
        all-zero scores or all-`False` outputs is kept.
        """

        def has_value(value: Any) -> bool:
            if isinstance(value, (str, bytes)):
                return bool(value)
            if pd.api.types.is_scalar(value):
                return not pd.isna(value)
            try:
                return len(value) > 0
            except TypeError:
                # Not sized: an arbitrary object counts as a value.
                return True

        kept = [name for name, column in df.items() if any(map(has_value, column))]
        return df.reindex(kept, axis=1)

    def get_evaluations(
        self,
        drop_empty_columns: bool = True,
    ) -> pd.DataFrame:
        """
        Return one row per evaluation run, joined with the task-run data.

        The frame is indexed by `run_id` and holds the evaluator `name`,
        `error`, `score`, `label` and `explanation`, followed by the columns
        of `as_dataframe()` for the matching task run.

        Args:
            drop_empty_columns: when true, evaluation columns without any
                value are removed before joining.

        Returns:
            The joined DataFrame. It is empty (but still indexed by `run_id`)
            when there are no evaluation runs. If both the evaluation and the
            task-run side carry an `error` column, the task-run one is
            renamed `error_run`.
        """
        columns = ["run_id", "name", "error", "score", "label", "explanation"]
        records = [
            {
                "run_id": run.experiment_run_id,
                "name": run.name,
                "error": run.error,
                "score": run.result.score if run.result else None,
                "label": run.result.label if run.result else None,
                "explanation": run.result.explanation if run.result else None,
            }
            for run in self.eval_runs
        ]
        # Without records there would be no "run_id" column to index on.
        df = pd.DataFrame.from_records(records) if records else pd.DataFrame(columns=columns)
        df = df.set_index("run_id")
        if drop_empty_columns:
            df = self._drop_blank_columns(df)
        # Both sides may contain "error"; the suffix avoids the overlap error.
        return df.join(self.as_dataframe(), rsuffix="_run")

    def as_dataframe(self, drop_empty_columns: bool = True) -> pd.DataFrame:
        """
        Return one row per task run, indexed by `run_id`.

        Columns are `error`, `output`, `input`, `expected`, `metadata` and
        `example_id`. Run outputs and example payloads are deep-copied so the
        frame can be modified freely. For a run whose example is missing from
        the dataset, `input`, `expected` and `metadata` are `None`.

        Args:
            drop_empty_columns: when true, columns without any value are
                removed.
        """
        columns = ["run_id", "error", "output", "input", "expected", "metadata", "example_id"]
        examples = self.dataset.examples
        records = []
        for run in self.runs.values():
            example = examples.get(run.dataset_example_id)
            records.append(
                {
                    "run_id": run.id,
                    "error": run.error,
                    "output": deepcopy(run.output),
                    "input": deepcopy(example.input) if example is not None else None,
                    "expected": deepcopy(example.output) if example is not None else None,
                    "metadata": deepcopy(example.metadata) if example is not None else None,
                    "example_id": run.dataset_example_id,
                }
            )
        # Without records there would be no "run_id" column to index on.
        df = pd.DataFrame.from_records(records) if records else pd.DataFrame(columns=columns)
        df = df.set_index("run_id")
        if drop_empty_columns:
            return self._drop_blank_columns(df)
        return df

    def add(
        self,
        eval_summary: EvaluationSummary,
        *eval_runs: Optional[ExperimentEvaluationRun],
    ) -> "RanExperiment":
        """
        Return a copy of this experiment extended with more evaluations.

        Args:
            eval_summary: summary appended to `eval_summaries`.
            *eval_runs: evaluation runs appended to `eval_runs`; falsy entries
                (such as `None`) are skipped.
        """
        return _replace(
            self,
            eval_runs=(*self.eval_runs, *filter(bool, eval_runs)),
            eval_summaries=(*self.eval_summaries, eval_summary),
        )

    def __str__(self) -> str:
        # Most recently added evaluation summary first, task summary last.
        summaries = (*reversed(self.eval_summaries), self.task_summary)
        return (
            "\n"
            + ("" if self.id.startswith(DRY_RUN) else f"{self.info}\n\n")
            + "\n\n".join(map(str, summaries))
        )

    @classmethod
    def __new__(cls, *args: Any, **kwargs: Any) -> Any:
        # Direct instantiation by users is discouraged.
        raise NotImplementedError

    @classmethod
    def __init_subclass__(cls, **kwargs: Any) -> None:
        # Direct sub-classing by users is discouraged.
        raise NotImplementedError
|
|
588
|
+
|
|
589
|
+
|
|
590
|
+
def _asdict(dc: Any) -> dict[str, Any]:
|
|
591
|
+
# non-recursive version of `dataclasses.asdict()`
|
|
592
|
+
return {field.name: getattr(dc, field.name) for field in fields(dc)}
|
|
593
|
+
|
|
594
|
+
|
|
595
|
+
T = TypeVar("T")
|
|
596
|
+
|
|
597
|
+
|
|
598
|
+
def _replace(obj: T, **kwargs: Any) -> T:
|
|
599
|
+
new_obj = object.__new__(obj.__class__)
|
|
600
|
+
new_obj.__init__(**{**_asdict(obj), **kwargs}) # type: ignore[misc]
|
|
601
|
+
return new_obj
|
|
602
|
+
|
|
603
|
+
|
|
604
|
+
def _shorten(obj: Any, width: int = 50) -> Any:
|
|
605
|
+
if isinstance(obj, str):
|
|
606
|
+
return textwrap.shorten(obj, width=width, placeholder="...")
|
|
607
|
+
if isinstance(obj, dict):
|
|
608
|
+
return {k: _shorten(v) for k, v in obj.items()}
|
|
609
|
+
if isinstance(obj, list):
|
|
610
|
+
if len(obj) > 2:
|
|
611
|
+
return [_shorten(v) for v in obj[:2]] + ["..."]
|
|
612
|
+
return [_shorten(v) for v in obj]
|
|
613
|
+
return obj
|
|
614
|
+
|
|
615
|
+
|
|
616
|
+
def _make_read_only(obj: Any) -> Any:
|
|
617
|
+
if isinstance(obj, dict):
|
|
618
|
+
return _ReadOnly({k: _make_read_only(v) for k, v in obj.items()})
|
|
619
|
+
if isinstance(obj, str):
|
|
620
|
+
return obj
|
|
621
|
+
if isinstance(obj, list):
|
|
622
|
+
return _ReadOnly(list(map(_make_read_only, obj)))
|
|
623
|
+
return obj
|
|
624
|
+
|
|
625
|
+
|
|
626
|
+
class _ReadOnly(ObjectProxy):  # type: ignore[misc]
    """
    Proxy around a dict or list that rejects in-place modification.

    Reads are forwarded to the wrapped object, while every mutating operation
    of `dict` and `list` raises `NotImplementedError`. Copying (`copy` /
    `deepcopy`) returns a plain, mutable copy of the wrapped object, and
    `repr` / `str` show the wrapped object itself.
    """

    def _reject_mutation(self, *args: Any, **kwargs: Any) -> Any:
        raise NotImplementedError

    # Item assignment/deletion and in-place operators (`+=`, `*=`, `|=`).
    __setitem__ = _reject_mutation
    __delitem__ = _reject_mutation
    __iadd__ = _reject_mutation
    __imul__ = _reject_mutation
    __ior__ = _reject_mutation

    # Mutators shared by dict and list.
    pop = _reject_mutation
    clear = _reject_mutation

    # dict-only mutators.
    update = _reject_mutation
    setdefault = _reject_mutation
    popitem = _reject_mutation

    # list-only mutators.
    append = _reject_mutation
    extend = _reject_mutation
    insert = _reject_mutation
    remove = _reject_mutation
    sort = _reject_mutation
    reverse = _reject_mutation

    def __copy__(self, *args: Any, **kwargs: Any) -> Any:
        # Copies are of the wrapped object, i.e. no longer read-only.
        return copy(self.__wrapped__)

    def __deepcopy__(self, *args: Any, **kwargs: Any) -> Any:
        return deepcopy(self.__wrapped__)

    def __repr__(self) -> str:
        return repr(self.__wrapped__)

    def __str__(self) -> str:
        return str(self.__wrapped__)
|
|
653
|
+
|
|
654
|
+
|
|
655
|
+
class _ExperimentRunWithExample(ObjectProxy):  # type: ignore[misc]
    """
    Proxy for an experiment run that also exposes its dataset example.

    Attribute access is forwarded to the wrapped run; in addition the
    example's `input`, `metadata` and output (as both `expected` and
    `reference`) are available as properties. Each property returns a deep
    copy, so callers cannot alter the example through it.
    """

    def __init__(self, wrapped: ExperimentRun, example: Example) -> None:
        super().__init__(wrapped)
        # Stored under a `_self_` name so it lives on the proxy rather than on
        # the wrapped run (wrapt naming convention).
        self._self_example = example

    @property
    def expected(self) -> ExampleOutput:
        return deepcopy(self._self_example.output)

    @property
    def reference(self) -> ExampleOutput:
        return deepcopy(self._self_example.output)

    @property
    def input(self) -> ExampleInput:
        return deepcopy(self._self_example.input)

    @property
    def metadata(self) -> ExampleMetadata:
        return deepcopy(self._self_example.metadata)

    def __repr__(self) -> str:
        """
        Return a multi-line, colorized description of the run.

        Shows the run id and example id, then either the error (if the run
        has one) or the abbreviated JSON output, then one abbreviated JSON
        entry per non-empty example alias (expected, reference, input,
        metadata).
        """
        spaces = " " * 4
        # NOTE: deliberately `self.__class__`, not `type(self)`; the proxy base
        # class is expected to report the wrapped object's class here.
        name = self.__class__.__name__

        def render(value: Any) -> str:
            # Abbreviated JSON rendering of `value`.
            return json.dumps(
                _shorten(value),
                ensure_ascii=False,
                sort_keys=True,
                indent=len(spaces),
            )

        def reindent(text: str) -> str:
            # Indent continuation lines and unquote the "..." list marker.
            return text.replace("\n", f"\n{spaces}").replace(' "..."\n', " ...\n")

        lines = [
            f"{name}(",
            f'{spaces}id="{self.id}",',
            f'{spaces}example_id="{self.dataset_example_id}",',
        ]
        if self.error:
            lines.append(f'{spaces}error="{self.error}",')
        else:
            lines.append(f"{spaces}{_blue('output')}=" + reindent(render(self.output)))
        aliases = (
            ("expected", self.expected, f"alias for the example.{_blue('output')} dict"),
            ("reference", self.reference, f"alias for the example.{_blue('output')} dict"),
            ("input", self.input, f"alias for the example.{_blue('input')} dict"),
            ("metadata", self.metadata, f"alias for the example.{_blue('metadata')} dict"),
        )
        for alias, value, comment in aliases:
            if not value:
                continue
            opening = spaces + f"{_blue(alias)}={{" + (f" # {comment}" if comment else "")
            # The JSON's own leading "{" is dropped: it is already part of `opening`.
            lines.append(opening + reindent(render(value)[1:]) + ",")
        lines.append(")")
        return "\n".join(lines)
|
|
723
|
+
|
|
724
|
+
|
|
725
|
+
def _blue(text: str) -> str:
|
|
726
|
+
return f"\033[1m\033[94m{text}\033[0m"
|