arize-phoenix 3.16.1__py3-none-any.whl → 7.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of arize-phoenix might be problematic. Click here for more details.

Files changed (338) hide show
  1. arize_phoenix-7.7.1.dist-info/METADATA +261 -0
  2. arize_phoenix-7.7.1.dist-info/RECORD +345 -0
  3. {arize_phoenix-3.16.1.dist-info → arize_phoenix-7.7.1.dist-info}/WHEEL +1 -1
  4. arize_phoenix-7.7.1.dist-info/entry_points.txt +3 -0
  5. phoenix/__init__.py +86 -14
  6. phoenix/auth.py +309 -0
  7. phoenix/config.py +675 -45
  8. phoenix/core/model.py +32 -30
  9. phoenix/core/model_schema.py +102 -109
  10. phoenix/core/model_schema_adapter.py +48 -45
  11. phoenix/datetime_utils.py +24 -3
  12. phoenix/db/README.md +54 -0
  13. phoenix/db/__init__.py +4 -0
  14. phoenix/db/alembic.ini +85 -0
  15. phoenix/db/bulk_inserter.py +294 -0
  16. phoenix/db/engines.py +208 -0
  17. phoenix/db/enums.py +20 -0
  18. phoenix/db/facilitator.py +113 -0
  19. phoenix/db/helpers.py +159 -0
  20. phoenix/db/insertion/constants.py +2 -0
  21. phoenix/db/insertion/dataset.py +227 -0
  22. phoenix/db/insertion/document_annotation.py +171 -0
  23. phoenix/db/insertion/evaluation.py +191 -0
  24. phoenix/db/insertion/helpers.py +98 -0
  25. phoenix/db/insertion/span.py +193 -0
  26. phoenix/db/insertion/span_annotation.py +158 -0
  27. phoenix/db/insertion/trace_annotation.py +158 -0
  28. phoenix/db/insertion/types.py +256 -0
  29. phoenix/db/migrate.py +86 -0
  30. phoenix/db/migrations/data_migration_scripts/populate_project_sessions.py +199 -0
  31. phoenix/db/migrations/env.py +114 -0
  32. phoenix/db/migrations/script.py.mako +26 -0
  33. phoenix/db/migrations/versions/10460e46d750_datasets.py +317 -0
  34. phoenix/db/migrations/versions/3be8647b87d8_add_token_columns_to_spans_table.py +126 -0
  35. phoenix/db/migrations/versions/4ded9e43755f_create_project_sessions_table.py +66 -0
  36. phoenix/db/migrations/versions/cd164e83824f_users_and_tokens.py +157 -0
  37. phoenix/db/migrations/versions/cf03bd6bae1d_init.py +280 -0
  38. phoenix/db/models.py +807 -0
  39. phoenix/exceptions.py +5 -1
  40. phoenix/experiments/__init__.py +6 -0
  41. phoenix/experiments/evaluators/__init__.py +29 -0
  42. phoenix/experiments/evaluators/base.py +158 -0
  43. phoenix/experiments/evaluators/code_evaluators.py +184 -0
  44. phoenix/experiments/evaluators/llm_evaluators.py +473 -0
  45. phoenix/experiments/evaluators/utils.py +236 -0
  46. phoenix/experiments/functions.py +772 -0
  47. phoenix/experiments/tracing.py +86 -0
  48. phoenix/experiments/types.py +726 -0
  49. phoenix/experiments/utils.py +25 -0
  50. phoenix/inferences/__init__.py +0 -0
  51. phoenix/{datasets → inferences}/errors.py +6 -5
  52. phoenix/{datasets → inferences}/fixtures.py +49 -42
  53. phoenix/{datasets/dataset.py → inferences/inferences.py} +121 -105
  54. phoenix/{datasets → inferences}/schema.py +11 -11
  55. phoenix/{datasets → inferences}/validation.py +13 -14
  56. phoenix/logging/__init__.py +3 -0
  57. phoenix/logging/_config.py +90 -0
  58. phoenix/logging/_filter.py +6 -0
  59. phoenix/logging/_formatter.py +69 -0
  60. phoenix/metrics/__init__.py +5 -4
  61. phoenix/metrics/binning.py +4 -3
  62. phoenix/metrics/metrics.py +2 -1
  63. phoenix/metrics/mixins.py +7 -6
  64. phoenix/metrics/retrieval_metrics.py +2 -1
  65. phoenix/metrics/timeseries.py +5 -4
  66. phoenix/metrics/wrappers.py +9 -3
  67. phoenix/pointcloud/clustering.py +5 -5
  68. phoenix/pointcloud/pointcloud.py +7 -5
  69. phoenix/pointcloud/projectors.py +5 -6
  70. phoenix/pointcloud/umap_parameters.py +53 -52
  71. phoenix/server/api/README.md +28 -0
  72. phoenix/server/api/auth.py +44 -0
  73. phoenix/server/api/context.py +152 -9
  74. phoenix/server/api/dataloaders/__init__.py +91 -0
  75. phoenix/server/api/dataloaders/annotation_summaries.py +139 -0
  76. phoenix/server/api/dataloaders/average_experiment_run_latency.py +54 -0
  77. phoenix/server/api/dataloaders/cache/__init__.py +3 -0
  78. phoenix/server/api/dataloaders/cache/two_tier_cache.py +68 -0
  79. phoenix/server/api/dataloaders/dataset_example_revisions.py +131 -0
  80. phoenix/server/api/dataloaders/dataset_example_spans.py +38 -0
  81. phoenix/server/api/dataloaders/document_evaluation_summaries.py +144 -0
  82. phoenix/server/api/dataloaders/document_evaluations.py +31 -0
  83. phoenix/server/api/dataloaders/document_retrieval_metrics.py +89 -0
  84. phoenix/server/api/dataloaders/experiment_annotation_summaries.py +79 -0
  85. phoenix/server/api/dataloaders/experiment_error_rates.py +58 -0
  86. phoenix/server/api/dataloaders/experiment_run_annotations.py +36 -0
  87. phoenix/server/api/dataloaders/experiment_run_counts.py +49 -0
  88. phoenix/server/api/dataloaders/experiment_sequence_number.py +44 -0
  89. phoenix/server/api/dataloaders/latency_ms_quantile.py +188 -0
  90. phoenix/server/api/dataloaders/min_start_or_max_end_times.py +85 -0
  91. phoenix/server/api/dataloaders/project_by_name.py +31 -0
  92. phoenix/server/api/dataloaders/record_counts.py +116 -0
  93. phoenix/server/api/dataloaders/session_io.py +79 -0
  94. phoenix/server/api/dataloaders/session_num_traces.py +30 -0
  95. phoenix/server/api/dataloaders/session_num_traces_with_error.py +32 -0
  96. phoenix/server/api/dataloaders/session_token_usages.py +41 -0
  97. phoenix/server/api/dataloaders/session_trace_latency_ms_quantile.py +55 -0
  98. phoenix/server/api/dataloaders/span_annotations.py +26 -0
  99. phoenix/server/api/dataloaders/span_dataset_examples.py +31 -0
  100. phoenix/server/api/dataloaders/span_descendants.py +57 -0
  101. phoenix/server/api/dataloaders/span_projects.py +33 -0
  102. phoenix/server/api/dataloaders/token_counts.py +124 -0
  103. phoenix/server/api/dataloaders/trace_by_trace_ids.py +25 -0
  104. phoenix/server/api/dataloaders/trace_root_spans.py +32 -0
  105. phoenix/server/api/dataloaders/user_roles.py +30 -0
  106. phoenix/server/api/dataloaders/users.py +33 -0
  107. phoenix/server/api/exceptions.py +48 -0
  108. phoenix/server/api/helpers/__init__.py +12 -0
  109. phoenix/server/api/helpers/dataset_helpers.py +217 -0
  110. phoenix/server/api/helpers/experiment_run_filters.py +763 -0
  111. phoenix/server/api/helpers/playground_clients.py +948 -0
  112. phoenix/server/api/helpers/playground_registry.py +70 -0
  113. phoenix/server/api/helpers/playground_spans.py +455 -0
  114. phoenix/server/api/input_types/AddExamplesToDatasetInput.py +16 -0
  115. phoenix/server/api/input_types/AddSpansToDatasetInput.py +14 -0
  116. phoenix/server/api/input_types/ChatCompletionInput.py +38 -0
  117. phoenix/server/api/input_types/ChatCompletionMessageInput.py +24 -0
  118. phoenix/server/api/input_types/ClearProjectInput.py +15 -0
  119. phoenix/server/api/input_types/ClusterInput.py +2 -2
  120. phoenix/server/api/input_types/CreateDatasetInput.py +12 -0
  121. phoenix/server/api/input_types/CreateSpanAnnotationInput.py +18 -0
  122. phoenix/server/api/input_types/CreateTraceAnnotationInput.py +18 -0
  123. phoenix/server/api/input_types/DataQualityMetricInput.py +5 -2
  124. phoenix/server/api/input_types/DatasetExampleInput.py +14 -0
  125. phoenix/server/api/input_types/DatasetSort.py +17 -0
  126. phoenix/server/api/input_types/DatasetVersionSort.py +16 -0
  127. phoenix/server/api/input_types/DeleteAnnotationsInput.py +7 -0
  128. phoenix/server/api/input_types/DeleteDatasetExamplesInput.py +13 -0
  129. phoenix/server/api/input_types/DeleteDatasetInput.py +7 -0
  130. phoenix/server/api/input_types/DeleteExperimentsInput.py +7 -0
  131. phoenix/server/api/input_types/DimensionFilter.py +4 -4
  132. phoenix/server/api/input_types/GenerativeModelInput.py +17 -0
  133. phoenix/server/api/input_types/Granularity.py +1 -1
  134. phoenix/server/api/input_types/InvocationParameters.py +162 -0
  135. phoenix/server/api/input_types/PatchAnnotationInput.py +19 -0
  136. phoenix/server/api/input_types/PatchDatasetExamplesInput.py +35 -0
  137. phoenix/server/api/input_types/PatchDatasetInput.py +14 -0
  138. phoenix/server/api/input_types/PerformanceMetricInput.py +5 -2
  139. phoenix/server/api/input_types/ProjectSessionSort.py +29 -0
  140. phoenix/server/api/input_types/SpanAnnotationSort.py +17 -0
  141. phoenix/server/api/input_types/SpanSort.py +134 -69
  142. phoenix/server/api/input_types/TemplateOptions.py +10 -0
  143. phoenix/server/api/input_types/TraceAnnotationSort.py +17 -0
  144. phoenix/server/api/input_types/UserRoleInput.py +9 -0
  145. phoenix/server/api/mutations/__init__.py +28 -0
  146. phoenix/server/api/mutations/api_key_mutations.py +167 -0
  147. phoenix/server/api/mutations/chat_mutations.py +593 -0
  148. phoenix/server/api/mutations/dataset_mutations.py +591 -0
  149. phoenix/server/api/mutations/experiment_mutations.py +75 -0
  150. phoenix/server/api/{types/ExportEventsMutation.py → mutations/export_events_mutations.py} +21 -18
  151. phoenix/server/api/mutations/project_mutations.py +57 -0
  152. phoenix/server/api/mutations/span_annotations_mutations.py +128 -0
  153. phoenix/server/api/mutations/trace_annotations_mutations.py +127 -0
  154. phoenix/server/api/mutations/user_mutations.py +329 -0
  155. phoenix/server/api/openapi/__init__.py +0 -0
  156. phoenix/server/api/openapi/main.py +17 -0
  157. phoenix/server/api/openapi/schema.py +16 -0
  158. phoenix/server/api/queries.py +738 -0
  159. phoenix/server/api/routers/__init__.py +11 -0
  160. phoenix/server/api/routers/auth.py +284 -0
  161. phoenix/server/api/routers/embeddings.py +26 -0
  162. phoenix/server/api/routers/oauth2.py +488 -0
  163. phoenix/server/api/routers/v1/__init__.py +64 -0
  164. phoenix/server/api/routers/v1/datasets.py +1017 -0
  165. phoenix/server/api/routers/v1/evaluations.py +362 -0
  166. phoenix/server/api/routers/v1/experiment_evaluations.py +115 -0
  167. phoenix/server/api/routers/v1/experiment_runs.py +167 -0
  168. phoenix/server/api/routers/v1/experiments.py +308 -0
  169. phoenix/server/api/routers/v1/pydantic_compat.py +78 -0
  170. phoenix/server/api/routers/v1/spans.py +267 -0
  171. phoenix/server/api/routers/v1/traces.py +208 -0
  172. phoenix/server/api/routers/v1/utils.py +95 -0
  173. phoenix/server/api/schema.py +44 -241
  174. phoenix/server/api/subscriptions.py +597 -0
  175. phoenix/server/api/types/Annotation.py +21 -0
  176. phoenix/server/api/types/AnnotationSummary.py +55 -0
  177. phoenix/server/api/types/AnnotatorKind.py +16 -0
  178. phoenix/server/api/types/ApiKey.py +27 -0
  179. phoenix/server/api/types/AuthMethod.py +9 -0
  180. phoenix/server/api/types/ChatCompletionMessageRole.py +11 -0
  181. phoenix/server/api/types/ChatCompletionSubscriptionPayload.py +46 -0
  182. phoenix/server/api/types/Cluster.py +25 -24
  183. phoenix/server/api/types/CreateDatasetPayload.py +8 -0
  184. phoenix/server/api/types/DataQualityMetric.py +31 -13
  185. phoenix/server/api/types/Dataset.py +288 -63
  186. phoenix/server/api/types/DatasetExample.py +85 -0
  187. phoenix/server/api/types/DatasetExampleRevision.py +34 -0
  188. phoenix/server/api/types/DatasetVersion.py +14 -0
  189. phoenix/server/api/types/Dimension.py +32 -31
  190. phoenix/server/api/types/DocumentEvaluationSummary.py +9 -8
  191. phoenix/server/api/types/EmbeddingDimension.py +56 -49
  192. phoenix/server/api/types/Evaluation.py +25 -31
  193. phoenix/server/api/types/EvaluationSummary.py +30 -50
  194. phoenix/server/api/types/Event.py +20 -20
  195. phoenix/server/api/types/ExampleRevisionInterface.py +14 -0
  196. phoenix/server/api/types/Experiment.py +152 -0
  197. phoenix/server/api/types/ExperimentAnnotationSummary.py +13 -0
  198. phoenix/server/api/types/ExperimentComparison.py +17 -0
  199. phoenix/server/api/types/ExperimentRun.py +119 -0
  200. phoenix/server/api/types/ExperimentRunAnnotation.py +56 -0
  201. phoenix/server/api/types/GenerativeModel.py +9 -0
  202. phoenix/server/api/types/GenerativeProvider.py +85 -0
  203. phoenix/server/api/types/Inferences.py +80 -0
  204. phoenix/server/api/types/InferencesRole.py +23 -0
  205. phoenix/server/api/types/LabelFraction.py +7 -0
  206. phoenix/server/api/types/MimeType.py +2 -2
  207. phoenix/server/api/types/Model.py +54 -54
  208. phoenix/server/api/types/PerformanceMetric.py +8 -5
  209. phoenix/server/api/types/Project.py +407 -142
  210. phoenix/server/api/types/ProjectSession.py +139 -0
  211. phoenix/server/api/types/Segments.py +4 -4
  212. phoenix/server/api/types/Span.py +221 -176
  213. phoenix/server/api/types/SpanAnnotation.py +43 -0
  214. phoenix/server/api/types/SpanIOValue.py +15 -0
  215. phoenix/server/api/types/SystemApiKey.py +9 -0
  216. phoenix/server/api/types/TemplateLanguage.py +10 -0
  217. phoenix/server/api/types/TimeSeries.py +19 -15
  218. phoenix/server/api/types/TokenUsage.py +11 -0
  219. phoenix/server/api/types/Trace.py +154 -0
  220. phoenix/server/api/types/TraceAnnotation.py +45 -0
  221. phoenix/server/api/types/UMAPPoints.py +7 -7
  222. phoenix/server/api/types/User.py +60 -0
  223. phoenix/server/api/types/UserApiKey.py +45 -0
  224. phoenix/server/api/types/UserRole.py +15 -0
  225. phoenix/server/api/types/node.py +4 -112
  226. phoenix/server/api/types/pagination.py +156 -57
  227. phoenix/server/api/utils.py +34 -0
  228. phoenix/server/app.py +864 -115
  229. phoenix/server/bearer_auth.py +163 -0
  230. phoenix/server/dml_event.py +136 -0
  231. phoenix/server/dml_event_handler.py +256 -0
  232. phoenix/server/email/__init__.py +0 -0
  233. phoenix/server/email/sender.py +97 -0
  234. phoenix/server/email/templates/__init__.py +0 -0
  235. phoenix/server/email/templates/password_reset.html +19 -0
  236. phoenix/server/email/types.py +11 -0
  237. phoenix/server/grpc_server.py +102 -0
  238. phoenix/server/jwt_store.py +505 -0
  239. phoenix/server/main.py +305 -116
  240. phoenix/server/oauth2.py +52 -0
  241. phoenix/server/openapi/__init__.py +0 -0
  242. phoenix/server/prometheus.py +111 -0
  243. phoenix/server/rate_limiters.py +188 -0
  244. phoenix/server/static/.vite/manifest.json +87 -0
  245. phoenix/server/static/assets/components-Cy9nwIvF.js +2125 -0
  246. phoenix/server/static/assets/index-BKvHIxkk.js +113 -0
  247. phoenix/server/static/assets/pages-CUi2xCVQ.js +4449 -0
  248. phoenix/server/static/assets/vendor-DvC8cT4X.js +894 -0
  249. phoenix/server/static/assets/vendor-DxkFTwjz.css +1 -0
  250. phoenix/server/static/assets/vendor-arizeai-Do1793cv.js +662 -0
  251. phoenix/server/static/assets/vendor-codemirror-BzwZPyJM.js +24 -0
  252. phoenix/server/static/assets/vendor-recharts-_Jb7JjhG.js +59 -0
  253. phoenix/server/static/assets/vendor-shiki-Cl9QBraO.js +5 -0
  254. phoenix/server/static/assets/vendor-three-DwGkEfCM.js +2998 -0
  255. phoenix/server/telemetry.py +68 -0
  256. phoenix/server/templates/index.html +82 -23
  257. phoenix/server/thread_server.py +3 -3
  258. phoenix/server/types.py +275 -0
  259. phoenix/services.py +27 -18
  260. phoenix/session/client.py +743 -68
  261. phoenix/session/data_extractor.py +31 -7
  262. phoenix/session/evaluation.py +3 -9
  263. phoenix/session/session.py +263 -219
  264. phoenix/settings.py +22 -0
  265. phoenix/trace/__init__.py +2 -22
  266. phoenix/trace/attributes.py +338 -0
  267. phoenix/trace/dsl/README.md +116 -0
  268. phoenix/trace/dsl/filter.py +663 -213
  269. phoenix/trace/dsl/helpers.py +73 -21
  270. phoenix/trace/dsl/query.py +574 -201
  271. phoenix/trace/exporter.py +24 -19
  272. phoenix/trace/fixtures.py +368 -32
  273. phoenix/trace/otel.py +71 -219
  274. phoenix/trace/projects.py +3 -2
  275. phoenix/trace/schemas.py +33 -11
  276. phoenix/trace/span_evaluations.py +21 -16
  277. phoenix/trace/span_json_decoder.py +6 -4
  278. phoenix/trace/span_json_encoder.py +2 -2
  279. phoenix/trace/trace_dataset.py +47 -32
  280. phoenix/trace/utils.py +21 -4
  281. phoenix/utilities/__init__.py +0 -26
  282. phoenix/utilities/client.py +132 -0
  283. phoenix/utilities/deprecation.py +31 -0
  284. phoenix/utilities/error_handling.py +3 -2
  285. phoenix/utilities/json.py +109 -0
  286. phoenix/utilities/logging.py +8 -0
  287. phoenix/utilities/project.py +2 -2
  288. phoenix/utilities/re.py +49 -0
  289. phoenix/utilities/span_store.py +0 -23
  290. phoenix/utilities/template_formatters.py +99 -0
  291. phoenix/version.py +1 -1
  292. arize_phoenix-3.16.1.dist-info/METADATA +0 -495
  293. arize_phoenix-3.16.1.dist-info/RECORD +0 -178
  294. phoenix/core/project.py +0 -619
  295. phoenix/core/traces.py +0 -96
  296. phoenix/experimental/evals/__init__.py +0 -73
  297. phoenix/experimental/evals/evaluators.py +0 -413
  298. phoenix/experimental/evals/functions/__init__.py +0 -4
  299. phoenix/experimental/evals/functions/classify.py +0 -453
  300. phoenix/experimental/evals/functions/executor.py +0 -353
  301. phoenix/experimental/evals/functions/generate.py +0 -138
  302. phoenix/experimental/evals/functions/processing.py +0 -76
  303. phoenix/experimental/evals/models/__init__.py +0 -14
  304. phoenix/experimental/evals/models/anthropic.py +0 -175
  305. phoenix/experimental/evals/models/base.py +0 -170
  306. phoenix/experimental/evals/models/bedrock.py +0 -221
  307. phoenix/experimental/evals/models/litellm.py +0 -134
  308. phoenix/experimental/evals/models/openai.py +0 -448
  309. phoenix/experimental/evals/models/rate_limiters.py +0 -246
  310. phoenix/experimental/evals/models/vertex.py +0 -173
  311. phoenix/experimental/evals/models/vertexai.py +0 -186
  312. phoenix/experimental/evals/retrievals.py +0 -96
  313. phoenix/experimental/evals/templates/__init__.py +0 -50
  314. phoenix/experimental/evals/templates/default_templates.py +0 -472
  315. phoenix/experimental/evals/templates/template.py +0 -195
  316. phoenix/experimental/evals/utils/__init__.py +0 -172
  317. phoenix/experimental/evals/utils/threads.py +0 -27
  318. phoenix/server/api/helpers.py +0 -11
  319. phoenix/server/api/routers/evaluation_handler.py +0 -109
  320. phoenix/server/api/routers/span_handler.py +0 -70
  321. phoenix/server/api/routers/trace_handler.py +0 -60
  322. phoenix/server/api/types/DatasetRole.py +0 -23
  323. phoenix/server/static/index.css +0 -6
  324. phoenix/server/static/index.js +0 -7447
  325. phoenix/storage/span_store/__init__.py +0 -23
  326. phoenix/storage/span_store/text_file.py +0 -85
  327. phoenix/trace/dsl/missing.py +0 -60
  328. phoenix/trace/langchain/__init__.py +0 -3
  329. phoenix/trace/langchain/instrumentor.py +0 -35
  330. phoenix/trace/llama_index/__init__.py +0 -3
  331. phoenix/trace/llama_index/callback.py +0 -102
  332. phoenix/trace/openai/__init__.py +0 -3
  333. phoenix/trace/openai/instrumentor.py +0 -30
  334. {arize_phoenix-3.16.1.dist-info → arize_phoenix-7.7.1.dist-info}/licenses/IP_NOTICE +0 -0
  335. {arize_phoenix-3.16.1.dist-info → arize_phoenix-7.7.1.dist-info}/licenses/LICENSE +0 -0
  336. /phoenix/{datasets → db/insertion}/__init__.py +0 -0
  337. /phoenix/{experimental → db/migrations}/__init__.py +0 -0
  338. /phoenix/{storage → db/migrations/data_migration_scripts}/__init__.py +0 -0
@@ -0,0 +1,726 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import textwrap
5
+ from collections import Counter
6
+ from collections.abc import (
7
+ Awaitable,
8
+ Callable,
9
+ Iterable,
10
+ Iterator,
11
+ Mapping,
12
+ )
13
+ from copy import copy, deepcopy
14
+ from dataclasses import dataclass, field, fields
15
+ from datetime import datetime
16
+ from enum import Enum
17
+ from functools import cached_property
18
+ from importlib.metadata import version
19
+ from random import getrandbits
20
+ from typing import Any, Optional, TypeVar, Union, cast, overload
21
+
22
+ import pandas as pd
23
+ from typing_extensions import TypeAlias
24
+ from wrapt import ObjectProxy
25
+
26
+ from phoenix.datetime_utils import local_now
27
+ from phoenix.experiments.utils import get_experiment_url
28
+
29
+
30
class AnnotatorKind(Enum):
    """Enumerates the two annotator kinds, ``CODE`` and ``LLM``."""

    CODE = "CODE"
    LLM = "LLM"
33
+
34
+
35
# A JSON-compatible value: dict, list, str, int, float, bool, or None.
JSONSerializable: TypeAlias = Optional[Union[dict[str, Any], list[Any], str, int, float, bool]]
# Identifier aliases; all are plain strings except RepetitionNumber (an int).
ExperimentId: TypeAlias = str
DatasetId: TypeAlias = str
DatasetVersionId: TypeAlias = str
ExampleId: TypeAlias = str
RepetitionNumber: TypeAlias = int
ExperimentRunId: TypeAlias = str
TraceId: TypeAlias = str

# Value produced by an experiment task.
TaskOutput: TypeAlias = JSONSerializable

# The three per-example payloads are string-keyed mappings of JSON values.
ExampleOutput: TypeAlias = Mapping[str, JSONSerializable]
ExampleMetadata: TypeAlias = Mapping[str, JSONSerializable]
ExampleInput: TypeAlias = Mapping[str, JSONSerializable]

# Components of an evaluation outcome; each may be None.
Score: TypeAlias = Optional[Union[bool, int, float]]
Label: TypeAlias = Optional[str]
Explanation: TypeAlias = Optional[str]

EvaluatorName: TypeAlias = str
EvaluatorKind: TypeAlias = str
# Accepted evaluator return shapes: a full result object, a bare scalar or
# string, or a (score, label, explanation) tuple.
EvaluatorOutput: TypeAlias = Union[
    "EvaluationResult", bool, int, float, str, tuple[Score, Label, Explanation]
]

# Sentinel experiment ID; also the prefix of the IDs built by _dry_run_id().
DRY_RUN: ExperimentId = "DRY_RUN"
61
+
62
+
63
def _dry_run_id() -> str:
    """Return ``DRY_RUN`` followed by an underscore and six random hex digits."""
    # 24 random bits -> 3 big-endian bytes -> 6 hexadecimal characters.
    random_bytes = getrandbits(24).to_bytes(3, "big")
    return "_".join((DRY_RUN, random_bytes.hex()))
66
+
67
+
68
@dataclass(frozen=True)
class Example:
    """
    A single example with ``input``, ``output`` and ``metadata`` mappings.

    Each of the three mappings is passed through ``_make_read_only`` right
    after initialization.
    """

    id: ExampleId
    updated_at: datetime
    input: Mapping[str, JSONSerializable] = field(default_factory=dict)
    output: Mapping[str, JSONSerializable] = field(default_factory=dict)
    metadata: Mapping[str, JSONSerializable] = field(default_factory=dict)

    def __post_init__(self) -> None:
        # The dataclass is frozen, so assignment has to bypass its __setattr__.
        for attr in ("input", "output", "metadata"):
            object.__setattr__(self, attr, _make_read_only(getattr(self, attr)))

    @classmethod
    def from_dict(cls, obj: Mapping[str, Any]) -> Example:
        """
        Build an ``Example`` from a mapping.

        ``input``, ``output``, ``id`` and ``updated_at`` are required keys;
        a missing or falsy ``metadata`` is replaced with an empty dict.
        """
        kwargs = {
            "input": obj["input"],
            "output": obj["output"],
            "metadata": obj.get("metadata") or {},
            "id": obj["id"],
            "updated_at": obj["updated_at"],
        }
        return cls(**kwargs)

    def __repr__(self) -> str:
        """Render the id plus every non-empty payload as indented JSON."""
        indent = " " * 4
        lines = [f"{self.__class__.__name__}(", f'{indent}id="{self.id}",']
        for key in ("input", "output", "metadata"):
            value = getattr(self, key, None)
            if not value:
                # Empty payloads are left out of the representation.
                continue
            label = _blue(key)
            dumped = json.dumps(
                _shorten(value),
                ensure_ascii=False,
                sort_keys=True,
                indent=len(indent),
            )
            # Indent continuation lines, then unquote the "..." placeholder.
            dumped = dumped.replace("\n", f"\n{indent}")
            dumped = dumped.replace(' "..."\n', " ...\n")
            lines.append(indent + f"{label}=" + dumped + ",")
        lines.append(")")
        return "\n".join(lines)
111
+
112
+
113
@dataclass(frozen=True)
class Dataset:
    """
    An immutable collection of examples keyed by example ID.

    Supports ``len()``, iteration over the examples, and positional access by
    integer index or slice, following the iteration order of ``examples``.
    """

    id: DatasetId
    version_id: DatasetVersionId
    examples: Mapping[ExampleId, Example] = field(repr=False, default_factory=dict)

    def __post_init__(self) -> None:
        # The dataclass is frozen, so assignment has to bypass its __setattr__.
        object.__setattr__(self, "examples", _ReadOnly(self.examples))

    def __len__(self) -> int:
        return len(self.examples)

    def __iter__(self) -> Iterator[Example]:
        return iter(self.examples.values())

    @cached_property
    def _keys(self) -> tuple[str, ...]:
        # Snapshot of the example IDs, used for positional indexing.
        return tuple(self.examples.keys())

    @overload
    def __getitem__(self, key: int) -> Example: ...
    @overload
    def __getitem__(self, key: slice) -> list[Example]: ...
    def __getitem__(self, key: Union[int, slice]) -> Union[Example, list[Example]]:
        """Return the example at position ``key``, or a list of examples for a slice."""
        if isinstance(key, int):
            return self.examples[self._keys[key]]
        return [self.examples[k] for k in self._keys[key]]

    def as_dataframe(self, drop_empty_columns: bool = True) -> pd.DataFrame:
        """
        Return the examples as a DataFrame indexed by ``example_id``.

        Args:
            drop_empty_columns: when True, columns whose values are all falsy
                (e.g. empty mappings) are removed from the result.

        Returns:
            A DataFrame with ``input``, ``output`` and ``metadata`` columns
            holding deep copies of each example's payloads. A dataset with no
            examples yields an empty frame instead of raising ``KeyError``.
        """
        columns = ["example_id", "input", "output", "metadata"]
        records = [
            {
                "example_id": example.id,
                "input": deepcopy(example.input),
                "output": deepcopy(example.output),
                "metadata": deepcopy(example.metadata),
            }
            for example in self.examples.values()
        ]
        # Naming the columns explicitly guarantees that "example_id" exists
        # for set_index() even when there are no records at all.
        df = pd.DataFrame.from_records(records, columns=columns).set_index("example_id")
        if drop_empty_columns:
            return df.reindex([k for k, v in df.items() if v.astype(bool).any()], axis=1)
        return df

    @classmethod
    def from_dict(cls, obj: Mapping[str, Any]) -> Dataset:
        """
        Build a ``Dataset`` from a mapping with ``dataset_id`` and ``version_id``.

        A missing or falsy ``examples`` entry results in an empty dataset.
        """
        examples = tuple(map(Example.from_dict, obj.get("examples") or ()))
        return cls(
            id=obj["dataset_id"],
            version_id=obj["version_id"],
            examples={ex.id: ex for ex in examples},
        )
165
+
166
+
167
@dataclass(frozen=True)
class TestCase:
    """Pairs an example with a repetition number."""

    example: Example
    repetition_number: RepetitionNumber
171
+
172
+
173
@dataclass(frozen=True)
class Experiment:
    """Identifies an experiment and the dataset version it refers to."""

    id: ExperimentId
    dataset_id: DatasetId
    dataset_version_id: DatasetVersionId
    repetitions: int
    project_name: str = field(repr=False)

    @classmethod
    def from_dict(cls, obj: Mapping[str, Any]) -> Experiment:
        """
        Build an ``Experiment`` from a mapping.

        ``id``, ``dataset_id`` and ``dataset_version_id`` are required keys.
        A missing or falsy ``repetitions`` becomes 1, and a missing or falsy
        ``project_name`` becomes the empty string.
        """
        kwargs = {
            "id": obj["id"],
            "dataset_id": obj["dataset_id"],
            "dataset_version_id": obj["dataset_version_id"],
            "repetitions": obj.get("repetitions") or 1,
            "project_name": obj.get("project_name") or "",
        }
        return cls(**kwargs)
190
+
191
+
192
@dataclass(frozen=True)
class ExperimentRun:
    """
    Record of one task execution against one dataset example.

    When no ``id`` is supplied, one is generated by ``_dry_run_id``.
    """

    start_time: datetime
    end_time: datetime
    experiment_id: ExperimentId
    dataset_example_id: ExampleId
    repetition_number: RepetitionNumber
    output: JSONSerializable
    error: Optional[str] = None
    id: ExperimentRunId = field(default_factory=_dry_run_id)
    trace_id: Optional[TraceId] = None

    @classmethod
    def from_dict(cls, obj: Mapping[str, Any]) -> ExperimentRun:
        """
        Build an ``ExperimentRun`` from a mapping.

        A missing or falsy ``repetition_number`` becomes 1, and ``output`` is
        passed through ``_make_read_only``.
        """
        kwargs = {
            "start_time": obj["start_time"],
            "end_time": obj["end_time"],
            "experiment_id": obj["experiment_id"],
            "dataset_example_id": obj["dataset_example_id"],
            "repetition_number": obj.get("repetition_number") or 1,
            "output": _make_read_only(obj.get("output")),
            "error": obj.get("error"),
            "id": obj["id"],
            "trace_id": obj.get("trace_id"),
        }
        return cls(**kwargs)

    def __post_init__(self) -> None:
        # A run is only rejected when it carries neither an output nor an error.
        if self.output is not None or self.error is not None:
            return
        raise ValueError("Must specify exactly one of experiment_run_output or error")
221
+
222
+
223
@dataclass(frozen=True)
class EvaluationResult:
    """
    Outcome of an evaluation: a score, a label, or both, plus optional details.

    Raises:
        ValueError: if ``score`` is None and ``label`` is empty or None.
    """

    score: Optional[float] = None
    label: Optional[str] = None
    explanation: Optional[str] = None
    metadata: Mapping[str, JSONSerializable] = field(default_factory=dict)

    @classmethod
    def from_dict(cls, obj: Optional[Mapping[str, Any]]) -> Optional[EvaluationResult]:
        """Build a result from a mapping; return None when ``obj`` is None or empty."""
        if not obj:
            return None
        return cls(
            score=obj.get("score"),
            label=obj.get("label"),
            explanation=obj.get("explanation"),
            metadata=obj.get("metadata") or {},
        )

    def __post_init__(self) -> None:
        if self.score is None and not self.label:
            raise ValueError("Must specify score or label, or both")
        # Coerce label and explanation to strings; an empty string collapses to
        # None. The dataclass is frozen, hence object.__setattr__.
        for k in ("label", "explanation"):
            if (v := getattr(self, k, None)) is not None:
                object.__setattr__(self, k, str(v) or None)
249
+
250
+
251
@dataclass(frozen=True)
class ExperimentEvaluationRun:
    """
    Record of one evaluator execution against one experiment run.

    When no ``id`` is supplied, one is generated by ``_dry_run_id``.
    """

    experiment_run_id: ExperimentRunId
    start_time: datetime
    end_time: datetime
    name: str
    annotator_kind: str
    error: Optional[str] = None
    result: Optional[EvaluationResult] = None
    id: str = field(default_factory=_dry_run_id)
    trace_id: Optional[TraceId] = None

    @classmethod
    def from_dict(cls, obj: Mapping[str, Any]) -> ExperimentEvaluationRun:
        """
        Build an ``ExperimentEvaluationRun`` from a mapping.

        The ``result`` entry, if any, is converted with
        ``EvaluationResult.from_dict``.
        """
        kwargs = {
            "experiment_run_id": obj["experiment_run_id"],
            "start_time": obj["start_time"],
            "end_time": obj["end_time"],
            "name": obj["name"],
            "annotator_kind": obj["annotator_kind"],
            "error": obj.get("error"),
            "result": EvaluationResult.from_dict(obj.get("result")),
            "id": obj["id"],
            "trace_id": obj.get("trace_id"),
        }
        return cls(**kwargs)

    def __post_init__(self) -> None:
        # At least one of result / error has to be present.
        if self.result is not None or self.error is not None:
            return
        raise ValueError("Must specify either result or error")
280
+
281
+
282
# A task is a callable taking an Example and returning a TaskOutput, either
# directly or as an awaitable.
ExperimentTask: TypeAlias = Union[
    Callable[[Example], TaskOutput],
    Callable[[Example], Awaitable[TaskOutput]],
]
286
+
287
+
288
@dataclass(frozen=True)
class ExperimentParameters:
    """Number of examples and number of repetitions for an experiment."""

    n_examples: int
    n_repetitions: int = 1

    @property
    def count(self) -> int:
        """Product of ``n_examples`` and ``n_repetitions``."""
        total = self.n_examples * self.n_repetitions
        return total
296
+
297
+
298
@dataclass(frozen=True)
class EvaluationParameters:
    """Evaluator names together with the parameters of the experiment."""

    eval_names: frozenset[str]
    exp_params: ExperimentParameters
302
+
303
+
304
@dataclass(frozen=True)
class _HasStats:
    """
    Base for summaries that carry a ``stats`` DataFrame and a timestamped title.

    ``str()`` renders the title, a dashed underline of the same length, and
    the stats table.
    """

    _title: str = field(repr=False, default="")
    _timestamp: datetime = field(repr=False, default_factory=local_now)
    stats: pd.DataFrame = field(repr=False, default_factory=pd.DataFrame)

    @property
    def title(self) -> str:
        """The title followed by the formatted timestamp in parentheses."""
        return f"{self._title} ({self._timestamp:%x %I:%M %p %z})"

    def __str__(self) -> str:
        # An explicit comparison replaces the former `assert`, which is
        # stripped under `python -O` and would have skipped the version check.
        use_markdown = int(version("pandas").split(".")[0]) >= 1
        if use_markdown:
            try:
                # `tabulate` is used by pandas >= 1.0 in DataFrame.to_markdown()
                import tabulate  # noqa: F401
            except ImportError:
                use_markdown = False
        if use_markdown:
            text = self.stats.to_markdown(index=False)
        else:
            text = self.stats.__str__()
        title = self.title
        return f"{title}\n{'-' * len(title)}\n" + text
324
+
325
+
326
@dataclass(frozen=True)
class EvaluationSummary(_HasStats):
    """
    Summary statistics of experiment evaluations.

    Users should not instantiate this directly.
    """

    _title: str = "Experiment Summary"

    @classmethod
    def from_eval_runs(
        cls,
        params: EvaluationParameters,
        *eval_runs: Optional[ExperimentEvaluationRun],
    ) -> EvaluationSummary:
        """
        Aggregate evaluation runs into per-evaluator statistics.

        Args:
            params: supplies the evaluator names and the expected run count.
            *eval_runs: evaluation runs; ``None`` entries are ignored.

        Returns:
            A summary whose ``stats`` has one row per evaluator name, with
            error, score and label columns included only when at least one
            run provides such a value.
        """
        # Naming the columns explicitly keeps the column lookups below valid
        # even when there are neither runs nor evaluator names.
        columns = ["evaluator", "error", "score", "label"]
        df = pd.DataFrame.from_records(
            [
                {
                    "evaluator": run.name,
                    "error": run.error,
                    "score": run.result.score if run.result else None,
                    "label": run.result.label if run.result else None,
                }
                for run in eval_runs
                if run is not None
            ],
            columns=columns,
        )
        if df.empty:
            # No runs: fall back to one placeholder row per evaluator name.
            df = pd.DataFrame.from_records(
                [
                    {"evaluator": name, "error": None, "score": None, "label": None}
                    for name in params.eval_names
                ],
                columns=columns,
            )
        has_error = bool(df.loc[:, "error"].astype(bool).sum())
        has_score = bool(df.loc[:, "score"].dropna().count())
        has_label = bool(df.loc[:, "label"].astype(bool).sum())
        agg = {
            **(
                dict(n_errors=("error", "count"), top_error=("error", _top_string))
                if has_error
                else {}
            ),
            **(
                dict(n_scores=("score", "count"), avg_score=("score", "mean"))
                if has_score
                else {}
            ),
            **(
                dict(
                    n_labels=("label", "count"),
                    top_2_labels=(
                        "label",
                        lambda s: (dict(Counter(s.dropna()).most_common(2)) or None),
                    ),
                )
                if has_label
                else {}
            ),
        }
        stats = (
            df.groupby("evaluator").agg(**agg)  # type: ignore[call-overload]
            if agg
            else pd.DataFrame()
        )
        sorted_eval_names = sorted(params.eval_names)
        eval_names = pd.DataFrame(
            {
                "evaluator": sorted_eval_names,
                "n": [params.exp_params.count] * len(sorted_eval_names),
            }
        ).set_index("evaluator")
        stats = pd.concat([eval_names, stats], axis=1).reset_index()
        # __new__ is blocked below, so the instance is allocated through
        # object.__new__ and initialized by hand.
        summary: EvaluationSummary = object.__new__(cls)
        summary.__init__(stats=stats)  # type: ignore[misc]
        return summary

    @classmethod
    def __new__(cls, *args: Any, **kwargs: Any) -> Any:
        # Direct instantiation by users is discouraged.
        raise NotImplementedError

    @classmethod
    def __init_subclass__(cls, **kwargs: Any) -> None:
        # Direct sub-classing by users is discouraged.
        raise NotImplementedError
409
+
410
+
411
@dataclass(frozen=True)
class TaskSummary(_HasStats):
    """
    Aggregate counts describing how the task runs of an experiment went.

    **Users should not instantiate this object directly.**
    """

    _title: str = "Tasks Summary"

    @classmethod
    def from_task_runs(
        cls,
        params: ExperimentParameters,
        task_runs: Iterable[Optional[ExperimentRun]],
    ) -> "TaskSummary":
        """
        Build a summary holding a single-row statistics table.

        Args:
            params: experiment parameters; ``params.count`` is reported as
                ``n_examples``.
            task_runs: task runs to tally; ``None`` entries are skipped.

        Returns:
            A ``TaskSummary`` whose ``stats`` frame has the columns
            ``n_examples``, ``n_runs``, ``n_errors`` and, only when at least
            one error was seen, ``top_error``.
        """
        records = [
            {"example_id": task_run.dataset_example_id, "error": task_run.error}
            for task_run in task_runs
            if task_run is not None
        ]
        df = pd.DataFrame.from_records(records)
        n_runs = len(df)
        if df.empty:
            # Without rows there is no "error" column to inspect.
            n_errors = 0
        else:
            # A run counts as failed when its error value is truthy.
            n_errors = df.loc[:, "error"].astype(bool).sum()
        record: dict[str, Any] = {
            "n_examples": params.count,
            "n_runs": n_runs,
            "n_errors": n_errors,
        }
        if n_errors:
            record["top_error"] = _top_string(df.loc[:, "error"])
        stats = pd.DataFrame.from_records([record])
        # `__new__` below always raises, so allocate through `object` instead.
        summary: TaskSummary = object.__new__(cls)
        summary.__init__(stats=stats)  # type: ignore[misc]
        return summary

    @classmethod
    def __new__(cls, *args: Any, **kwargs: Any) -> Any:
        """Reject ordinary construction; always raises ``NotImplementedError``."""
        # Direct instantiation by users is discouraged.
        raise NotImplementedError

    @classmethod
    def __init_subclass__(cls, **kwargs: Any) -> None:
        """Reject subclassing; always raises ``NotImplementedError``."""
        # Direct sub-classing by users is discouraged.
        raise NotImplementedError
459
+
460
+
461
def _top_string(s: "pd.Series[Any]", length: int = 100) -> Optional[str]:
    """
    Return the most frequent non-null value of ``s`` after truncation.

    Args:
        s: values to tally; null entries are ignored.
        length: number of leading characters kept from each value before
            counting, so values sharing that prefix are counted together.

    Returns:
        The most common truncated string, or ``None`` when ``s`` holds no
        non-null values.
    """
    values = s.dropna()
    if values.empty:
        # Return before touching `.str`: the accessor raises AttributeError on
        # an empty non-object Series (e.g. an all-NaN float column).
        return None
    # `value_counts()` already orders by descending frequency, so its first
    # index entry is the winner and no further sorting is required.
    counts = values.astype(str).str.slice(0, length).value_counts()
    return cast(str, counts.index[0])
465
+
466
+
467
@dataclass(frozen=True)
class RanExperiment(Experiment):
    """
    An experiment that has been run.

    Iterating, indexing and ``len()`` operate on the task runs in ``runs``.

    **Users should not instantiate this object directly.**
    """

    params: ExperimentParameters = field(repr=False)
    dataset: Dataset = field(repr=False)
    runs: Mapping[ExperimentRunId, ExperimentRun] = field(repr=False)
    task_summary: TaskSummary = field(repr=False)
    eval_runs: tuple[ExperimentEvaluationRun, ...] = field(repr=False, default=())
    eval_summaries: tuple[EvaluationSummary, ...] = field(repr=False, default=())

    @property
    def url(self) -> str:
        """URL of this experiment, derived from the dataset id and experiment id."""
        return get_experiment_url(dataset_id=self.dataset.id, experiment_id=self.id)

    @property
    def info(self) -> str:
        """One-line, human-readable pointer to the experiment URL."""
        return f"🔗 View this experiment: {self.url}"

    def __post_init__(self) -> None:
        """Pair every run with its dataset example, when the example is available."""
        runs = {}
        for id_, run in self.runs.items():
            example = self.dataset.examples.get(run.dataset_example_id)
            if example:
                # `_replace()` feeds already-wrapped runs back through
                # `__init__`; unwrap them first so repeated `add()` calls do
                # not pile one proxy layer on top of another.
                base = run.__wrapped__ if isinstance(run, _ExperimentRunWithExample) else run
                run = _ExperimentRunWithExample(base, example)
            runs[id_] = run
        # The dataclass is frozen, so bypass its `__setattr__`.
        object.__setattr__(self, "runs", runs)

    def __len__(self) -> int:
        """Number of task runs."""
        return len(self.runs)

    def __iter__(self) -> Iterator[ExperimentRun]:
        """Iterate over the task runs in mapping order."""
        return iter(self.runs.values())

    @cached_property
    def _keys(self) -> tuple[str, ...]:
        # Snapshot of the run ids that backs positional indexing.
        return tuple(self.runs.keys())

    @overload
    def __getitem__(self, key: int) -> ExperimentRun: ...
    @overload
    def __getitem__(self, key: slice) -> list[ExperimentRun]: ...
    def __getitem__(self, key: Union[int, slice]) -> Union[ExperimentRun, list[ExperimentRun]]:
        """Return the run at position ``key``, or the list of runs for a slice."""
        if isinstance(key, int):
            return self.runs[self._keys[key]]
        return [self.runs[k] for k in self._keys[key]]

    def get_evaluations(
        self,
        drop_empty_columns: bool = True,
    ) -> pd.DataFrame:
        """
        Return the evaluation runs as a table joined with the task-run table.

        Args:
            drop_empty_columns: when true, evaluation columns without any
                truthy value are removed before joining.

        Returns:
            A frame indexed by ``run_id`` with one row per evaluation run,
            extended with the columns of ``as_dataframe()``. The frame is empty
            (rather than raising ``KeyError``) when there are no evaluation
            runs.
        """
        records = [
            {
                "run_id": run.experiment_run_id,
                "name": run.name,
                "error": run.error,
                "score": run.result.score if run.result else None,
                "label": run.result.label if run.result else None,
                "explanation": run.result.explanation if run.result else None,
            }
            for run in self.eval_runs
        ]
        # Naming the columns explicitly keeps "run_id" present even when there
        # are no records, so `set_index` cannot fail on an empty experiment.
        df = pd.DataFrame(
            records,
            columns=["run_id", "name", "error", "score", "label", "explanation"],
        ).set_index("run_id")
        if drop_empty_columns:
            # NOTE(review): emptiness is judged by truthiness, so a column that
            # only holds falsy values such as 0 is dropped as well.
            df = df.reindex([k for k, v in df.items() if v.astype(bool).any()], axis=1)
        return df.join(self.as_dataframe())

    def as_dataframe(self, drop_empty_columns: bool = True) -> pd.DataFrame:
        """
        Return the task runs, with their example data, as a table.

        Args:
            drop_empty_columns: when true, columns without any truthy value
                are removed.

        Returns:
            A frame indexed by ``run_id`` holding deep copies of each run's
            output and of the matching example's input, expected output and
            metadata. The frame is empty (rather than raising ``KeyError``)
            when there are no runs.

        Raises:
            KeyError: if a run refers to an example missing from the dataset.
        """
        records = []
        for run in self.runs.values():
            example = self.dataset.examples[run.dataset_example_id]
            records.append(
                {
                    "run_id": run.id,
                    "error": run.error,
                    "output": deepcopy(run.output),
                    "input": deepcopy(example.input),
                    "expected": deepcopy(example.output),
                    "metadata": deepcopy(example.metadata),
                    "example_id": run.dataset_example_id,
                }
            )
        # Explicit columns keep "run_id" available for `set_index` when there
        # are no runs at all.
        df = pd.DataFrame(
            records,
            columns=["run_id", "error", "output", "input", "expected", "metadata", "example_id"],
        ).set_index("run_id")
        if drop_empty_columns:
            return df.reindex([k for k, v in df.items() if v.astype(bool).any()], axis=1)
        return df

    def add(
        self,
        eval_summary: EvaluationSummary,
        *eval_runs: Optional[ExperimentEvaluationRun],
    ) -> "RanExperiment":
        """
        Return a copy of this experiment extended with more evaluation results.

        Args:
            eval_summary: summary appended to ``eval_summaries``.
            *eval_runs: evaluation runs appended to ``eval_runs``; falsy
                entries (such as ``None``) are skipped.
        """
        return _replace(
            self,
            eval_runs=(*self.eval_runs, *filter(bool, eval_runs)),
            eval_summaries=(*self.eval_summaries, eval_summary),
        )

    def __str__(self) -> str:
        """Render the summaries, newest evaluation first and the task summary last."""
        summaries = (*reversed(self.eval_summaries), self.task_summary)
        return (
            "\n"
            # The link line is omitted when the experiment id marks a dry run.
            + ("" if self.id.startswith(DRY_RUN) else f"{self.info}\n\n")
            + "\n\n".join(map(str, summaries))
        )

    @classmethod
    def __new__(cls, *args: Any, **kwargs: Any) -> Any:
        """Reject ordinary construction; always raises ``NotImplementedError``."""
        # Direct instantiation by users is discouraged.
        raise NotImplementedError

    @classmethod
    def __init_subclass__(cls, **kwargs: Any) -> None:
        """Reject subclassing; always raises ``NotImplementedError``."""
        # Direct sub-classing by users is discouraged.
        raise NotImplementedError
588
+
589
+
590
def _asdict(dc: Any) -> dict[str, Any]:
    """
    Map each dataclass field name of ``dc`` to its current attribute value.

    This is a non-recursive version of ``dataclasses.asdict()``: values are
    returned as they are, without being copied or converted.
    """
    field_names = (spec.name for spec in fields(dc))
    return {name: getattr(dc, name) for name in field_names}
593
+
594
+
595
# Ties the return type of `_replace` to the type of the object it is given.
T = TypeVar("T")


def _replace(obj: T, **kwargs: Any) -> T:
    """
    Create a new object of ``obj``'s class with some fields overridden.

    The instance is allocated with ``object.__new__`` and then initialised by
    calling ``__init__`` with the current field values of ``obj`` (as given by
    ``_asdict``) updated by ``kwargs``, so any ``__new__`` defined on the
    class itself is not invoked.

    Args:
        obj: dataclass instance to copy from.
        **kwargs: field values that take precedence over those of ``obj``.

    Returns:
        The newly initialised instance; ``obj`` itself is not modified here.
    """
    init_kwargs = _asdict(obj)
    init_kwargs.update(kwargs)
    duplicate = object.__new__(obj.__class__)
    duplicate.__init__(**init_kwargs)  # type: ignore[misc]
    return duplicate
602
+
603
+
604
def _shorten(obj: Any, width: int = 50) -> Any:
    """
    Return an abbreviated copy of ``obj`` suitable for compact display.

    Args:
        obj: value to abbreviate.
        width: maximum length of each shortened string; it applies to strings
            at every nesting level.

    Returns:
        * for a string, the result of ``textwrap.shorten`` with ``"..."`` as
          the placeholder;
        * for a dict, a new dict with every value abbreviated;
        * for a list, at most the first two items abbreviated, followed by
          ``"..."`` when the list had more than two items;
        * any other object unchanged.
    """
    if isinstance(obj, str):
        return textwrap.shorten(obj, width=width, placeholder="...")
    if isinstance(obj, dict):
        # Forward `width` so nested strings honour the caller's limit.
        return {k: _shorten(v, width) for k, v in obj.items()}
    if isinstance(obj, list):
        head = [_shorten(v, width) for v in obj[:2]]
        return head + ["..."] if len(obj) > 2 else head
    return obj
614
+
615
+
616
def _make_read_only(obj: Any) -> Any:
    """
    Wrap dicts and lists, at every nesting level, in ``_ReadOnly``.

    Dict values and list items are processed recursively and placed in a new
    container before wrapping. Strings and all other objects are returned
    unchanged.
    """
    if isinstance(obj, dict):
        converted = {key: _make_read_only(value) for key, value in obj.items()}
        return _ReadOnly(converted)
    if isinstance(obj, list):
        return _ReadOnly([_make_read_only(item) for item in obj])
    # Strings and any other kind of object need no wrapping.
    return obj
624
+
625
+
626
class _ReadOnly(ObjectProxy):  # type: ignore[misc]
    # Proxy whose item assignment, item deletion, in-place addition, `pop` and
    # `append` raise NotImplementedError. Copying, `repr` and `str` operate on
    # the wrapped object.

    def _refuse(self, *args: Any, **kwargs: Any) -> Any:
        raise NotImplementedError

    __setitem__ = _refuse
    __delitem__ = _refuse
    __iadd__ = _refuse
    pop = _refuse
    append = _refuse

    # The helper only exists to define the aliases above.
    del _refuse

    def __copy__(self, *args: Any, **kwargs: Any) -> Any:
        # Copies are made from the wrapped object, not from the proxy.
        target = self.__wrapped__
        return copy(target)

    def __deepcopy__(self, *args: Any, **kwargs: Any) -> Any:
        # As with `__copy__`, the deep copy is taken of the wrapped object.
        target = self.__wrapped__
        return deepcopy(target)

    def __repr__(self) -> str:
        target = self.__wrapped__
        return repr(target)

    def __str__(self) -> str:
        target = self.__wrapped__
        return str(target)
653
+
654
+
655
class _ExperimentRunWithExample(ObjectProxy):  # type: ignore[misc]
    # Proxy around an experiment run that also carries the dataset example the
    # run was produced from, exposing deep copies of the example's data.

    def __init__(self, wrapped: ExperimentRun, example: Example) -> None:
        super().__init__(wrapped)
        self._self_example = example

    @property
    def expected(self) -> ExampleOutput:
        """Deep copy of the example's output."""
        example = self._self_example
        return deepcopy(example.output)

    @property
    def reference(self) -> ExampleOutput:
        """Deep copy of the example's output (same data as ``expected``)."""
        example = self._self_example
        return deepcopy(example.output)

    @property
    def input(self) -> ExampleInput:
        """Deep copy of the example's input."""
        example = self._self_example
        return deepcopy(example.input)

    @property
    def metadata(self) -> ExampleMetadata:
        """Deep copy of the example's metadata."""
        example = self._self_example
        return deepcopy(example.metadata)

    def __repr__(self) -> str:
        """Multi-line description: ids, then the error or the output, then example data."""
        pad = " " * 4

        def render(payload: Any, start: int = 0) -> str:
            # Shortened, pretty-printed JSON with continuation lines indented;
            # `start` drops leading characters of the dump (the opening brace
            # when the caller has already emitted one).
            dumped = json.dumps(
                _shorten(payload),
                ensure_ascii=False,
                sort_keys=True,
                indent=len(pad),
            )
            return dumped[start:].replace("\n", f"\n{pad}").replace(' "..."\n', " ...\n")

        lines = [
            f"{self.__class__.__name__}(",
            f'{pad}id="{self.id}",',
            f'{pad}example_id="{self.dataset_example_id}",',
        ]
        # A run shows either its error or its output, never both.
        if self.error:
            lines.append(f'{pad}error="{self.error}",')
        else:
            lines.append(f"{pad}{_blue('output')}=" + render(self.output))
        aliases = (
            ("expected", self.expected, "output"),
            ("reference", self.reference, "output"),
            ("input", self.input, "input"),
            ("metadata", self.metadata, "metadata"),
        )
        for alias, value, source in aliases:
            if not value:
                # Empty example data is left out of the description.
                continue
            comment = f"alias for the example.{_blue(source)} dict"
            lines.append(f"{pad}{_blue(alias)}={{ # {comment}" + render(value, 1) + ",")
        lines.append(")")
        return "\n".join(lines)
723
+
724
+
725
def _blue(text: str) -> str:
    """Surround ``text`` with the escape sequences ``\\033[1m\\033[94m`` and ``\\033[0m``."""
    template = "\033[1m\033[94m{}\033[0m"
    return template.format(text)