arize-phoenix 3.16.0__py3-none-any.whl → 7.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (338)
  1. arize_phoenix-7.7.0.dist-info/METADATA +261 -0
  2. arize_phoenix-7.7.0.dist-info/RECORD +345 -0
  3. {arize_phoenix-3.16.0.dist-info → arize_phoenix-7.7.0.dist-info}/WHEEL +1 -1
  4. arize_phoenix-7.7.0.dist-info/entry_points.txt +3 -0
  5. phoenix/__init__.py +86 -14
  6. phoenix/auth.py +309 -0
  7. phoenix/config.py +675 -45
  8. phoenix/core/model.py +32 -30
  9. phoenix/core/model_schema.py +102 -109
  10. phoenix/core/model_schema_adapter.py +48 -45
  11. phoenix/datetime_utils.py +24 -3
  12. phoenix/db/README.md +54 -0
  13. phoenix/db/__init__.py +4 -0
  14. phoenix/db/alembic.ini +85 -0
  15. phoenix/db/bulk_inserter.py +294 -0
  16. phoenix/db/engines.py +208 -0
  17. phoenix/db/enums.py +20 -0
  18. phoenix/db/facilitator.py +113 -0
  19. phoenix/db/helpers.py +159 -0
  20. phoenix/db/insertion/constants.py +2 -0
  21. phoenix/db/insertion/dataset.py +227 -0
  22. phoenix/db/insertion/document_annotation.py +171 -0
  23. phoenix/db/insertion/evaluation.py +191 -0
  24. phoenix/db/insertion/helpers.py +98 -0
  25. phoenix/db/insertion/span.py +193 -0
  26. phoenix/db/insertion/span_annotation.py +158 -0
  27. phoenix/db/insertion/trace_annotation.py +158 -0
  28. phoenix/db/insertion/types.py +256 -0
  29. phoenix/db/migrate.py +86 -0
  30. phoenix/db/migrations/data_migration_scripts/populate_project_sessions.py +199 -0
  31. phoenix/db/migrations/env.py +114 -0
  32. phoenix/db/migrations/script.py.mako +26 -0
  33. phoenix/db/migrations/versions/10460e46d750_datasets.py +317 -0
  34. phoenix/db/migrations/versions/3be8647b87d8_add_token_columns_to_spans_table.py +126 -0
  35. phoenix/db/migrations/versions/4ded9e43755f_create_project_sessions_table.py +66 -0
  36. phoenix/db/migrations/versions/cd164e83824f_users_and_tokens.py +157 -0
  37. phoenix/db/migrations/versions/cf03bd6bae1d_init.py +280 -0
  38. phoenix/db/models.py +807 -0
  39. phoenix/exceptions.py +5 -1
  40. phoenix/experiments/__init__.py +6 -0
  41. phoenix/experiments/evaluators/__init__.py +29 -0
  42. phoenix/experiments/evaluators/base.py +158 -0
  43. phoenix/experiments/evaluators/code_evaluators.py +184 -0
  44. phoenix/experiments/evaluators/llm_evaluators.py +473 -0
  45. phoenix/experiments/evaluators/utils.py +236 -0
  46. phoenix/experiments/functions.py +772 -0
  47. phoenix/experiments/tracing.py +86 -0
  48. phoenix/experiments/types.py +726 -0
  49. phoenix/experiments/utils.py +25 -0
  50. phoenix/inferences/__init__.py +0 -0
  51. phoenix/{datasets → inferences}/errors.py +6 -5
  52. phoenix/{datasets → inferences}/fixtures.py +49 -42
  53. phoenix/{datasets/dataset.py → inferences/inferences.py} +121 -105
  54. phoenix/{datasets → inferences}/schema.py +11 -11
  55. phoenix/{datasets → inferences}/validation.py +13 -14
  56. phoenix/logging/__init__.py +3 -0
  57. phoenix/logging/_config.py +90 -0
  58. phoenix/logging/_filter.py +6 -0
  59. phoenix/logging/_formatter.py +69 -0
  60. phoenix/metrics/__init__.py +5 -4
  61. phoenix/metrics/binning.py +4 -3
  62. phoenix/metrics/metrics.py +2 -1
  63. phoenix/metrics/mixins.py +7 -6
  64. phoenix/metrics/retrieval_metrics.py +2 -1
  65. phoenix/metrics/timeseries.py +5 -4
  66. phoenix/metrics/wrappers.py +9 -3
  67. phoenix/pointcloud/clustering.py +5 -5
  68. phoenix/pointcloud/pointcloud.py +7 -5
  69. phoenix/pointcloud/projectors.py +5 -6
  70. phoenix/pointcloud/umap_parameters.py +53 -52
  71. phoenix/server/api/README.md +28 -0
  72. phoenix/server/api/auth.py +44 -0
  73. phoenix/server/api/context.py +152 -9
  74. phoenix/server/api/dataloaders/__init__.py +91 -0
  75. phoenix/server/api/dataloaders/annotation_summaries.py +139 -0
  76. phoenix/server/api/dataloaders/average_experiment_run_latency.py +54 -0
  77. phoenix/server/api/dataloaders/cache/__init__.py +3 -0
  78. phoenix/server/api/dataloaders/cache/two_tier_cache.py +68 -0
  79. phoenix/server/api/dataloaders/dataset_example_revisions.py +131 -0
  80. phoenix/server/api/dataloaders/dataset_example_spans.py +38 -0
  81. phoenix/server/api/dataloaders/document_evaluation_summaries.py +144 -0
  82. phoenix/server/api/dataloaders/document_evaluations.py +31 -0
  83. phoenix/server/api/dataloaders/document_retrieval_metrics.py +89 -0
  84. phoenix/server/api/dataloaders/experiment_annotation_summaries.py +79 -0
  85. phoenix/server/api/dataloaders/experiment_error_rates.py +58 -0
  86. phoenix/server/api/dataloaders/experiment_run_annotations.py +36 -0
  87. phoenix/server/api/dataloaders/experiment_run_counts.py +49 -0
  88. phoenix/server/api/dataloaders/experiment_sequence_number.py +44 -0
  89. phoenix/server/api/dataloaders/latency_ms_quantile.py +188 -0
  90. phoenix/server/api/dataloaders/min_start_or_max_end_times.py +85 -0
  91. phoenix/server/api/dataloaders/project_by_name.py +31 -0
  92. phoenix/server/api/dataloaders/record_counts.py +116 -0
  93. phoenix/server/api/dataloaders/session_io.py +79 -0
  94. phoenix/server/api/dataloaders/session_num_traces.py +30 -0
  95. phoenix/server/api/dataloaders/session_num_traces_with_error.py +32 -0
  96. phoenix/server/api/dataloaders/session_token_usages.py +41 -0
  97. phoenix/server/api/dataloaders/session_trace_latency_ms_quantile.py +55 -0
  98. phoenix/server/api/dataloaders/span_annotations.py +26 -0
  99. phoenix/server/api/dataloaders/span_dataset_examples.py +31 -0
  100. phoenix/server/api/dataloaders/span_descendants.py +57 -0
  101. phoenix/server/api/dataloaders/span_projects.py +33 -0
  102. phoenix/server/api/dataloaders/token_counts.py +124 -0
  103. phoenix/server/api/dataloaders/trace_by_trace_ids.py +25 -0
  104. phoenix/server/api/dataloaders/trace_root_spans.py +32 -0
  105. phoenix/server/api/dataloaders/user_roles.py +30 -0
  106. phoenix/server/api/dataloaders/users.py +33 -0
  107. phoenix/server/api/exceptions.py +48 -0
  108. phoenix/server/api/helpers/__init__.py +12 -0
  109. phoenix/server/api/helpers/dataset_helpers.py +217 -0
  110. phoenix/server/api/helpers/experiment_run_filters.py +763 -0
  111. phoenix/server/api/helpers/playground_clients.py +948 -0
  112. phoenix/server/api/helpers/playground_registry.py +70 -0
  113. phoenix/server/api/helpers/playground_spans.py +455 -0
  114. phoenix/server/api/input_types/AddExamplesToDatasetInput.py +16 -0
  115. phoenix/server/api/input_types/AddSpansToDatasetInput.py +14 -0
  116. phoenix/server/api/input_types/ChatCompletionInput.py +38 -0
  117. phoenix/server/api/input_types/ChatCompletionMessageInput.py +24 -0
  118. phoenix/server/api/input_types/ClearProjectInput.py +15 -0
  119. phoenix/server/api/input_types/ClusterInput.py +2 -2
  120. phoenix/server/api/input_types/CreateDatasetInput.py +12 -0
  121. phoenix/server/api/input_types/CreateSpanAnnotationInput.py +18 -0
  122. phoenix/server/api/input_types/CreateTraceAnnotationInput.py +18 -0
  123. phoenix/server/api/input_types/DataQualityMetricInput.py +5 -2
  124. phoenix/server/api/input_types/DatasetExampleInput.py +14 -0
  125. phoenix/server/api/input_types/DatasetSort.py +17 -0
  126. phoenix/server/api/input_types/DatasetVersionSort.py +16 -0
  127. phoenix/server/api/input_types/DeleteAnnotationsInput.py +7 -0
  128. phoenix/server/api/input_types/DeleteDatasetExamplesInput.py +13 -0
  129. phoenix/server/api/input_types/DeleteDatasetInput.py +7 -0
  130. phoenix/server/api/input_types/DeleteExperimentsInput.py +7 -0
  131. phoenix/server/api/input_types/DimensionFilter.py +4 -4
  132. phoenix/server/api/input_types/GenerativeModelInput.py +17 -0
  133. phoenix/server/api/input_types/Granularity.py +1 -1
  134. phoenix/server/api/input_types/InvocationParameters.py +162 -0
  135. phoenix/server/api/input_types/PatchAnnotationInput.py +19 -0
  136. phoenix/server/api/input_types/PatchDatasetExamplesInput.py +35 -0
  137. phoenix/server/api/input_types/PatchDatasetInput.py +14 -0
  138. phoenix/server/api/input_types/PerformanceMetricInput.py +5 -2
  139. phoenix/server/api/input_types/ProjectSessionSort.py +29 -0
  140. phoenix/server/api/input_types/SpanAnnotationSort.py +17 -0
  141. phoenix/server/api/input_types/SpanSort.py +134 -69
  142. phoenix/server/api/input_types/TemplateOptions.py +10 -0
  143. phoenix/server/api/input_types/TraceAnnotationSort.py +17 -0
  144. phoenix/server/api/input_types/UserRoleInput.py +9 -0
  145. phoenix/server/api/mutations/__init__.py +28 -0
  146. phoenix/server/api/mutations/api_key_mutations.py +167 -0
  147. phoenix/server/api/mutations/chat_mutations.py +593 -0
  148. phoenix/server/api/mutations/dataset_mutations.py +591 -0
  149. phoenix/server/api/mutations/experiment_mutations.py +75 -0
  150. phoenix/server/api/{types/ExportEventsMutation.py → mutations/export_events_mutations.py} +21 -18
  151. phoenix/server/api/mutations/project_mutations.py +57 -0
  152. phoenix/server/api/mutations/span_annotations_mutations.py +128 -0
  153. phoenix/server/api/mutations/trace_annotations_mutations.py +127 -0
  154. phoenix/server/api/mutations/user_mutations.py +329 -0
  155. phoenix/server/api/openapi/__init__.py +0 -0
  156. phoenix/server/api/openapi/main.py +17 -0
  157. phoenix/server/api/openapi/schema.py +16 -0
  158. phoenix/server/api/queries.py +738 -0
  159. phoenix/server/api/routers/__init__.py +11 -0
  160. phoenix/server/api/routers/auth.py +284 -0
  161. phoenix/server/api/routers/embeddings.py +26 -0
  162. phoenix/server/api/routers/oauth2.py +488 -0
  163. phoenix/server/api/routers/v1/__init__.py +64 -0
  164. phoenix/server/api/routers/v1/datasets.py +1017 -0
  165. phoenix/server/api/routers/v1/evaluations.py +362 -0
  166. phoenix/server/api/routers/v1/experiment_evaluations.py +115 -0
  167. phoenix/server/api/routers/v1/experiment_runs.py +167 -0
  168. phoenix/server/api/routers/v1/experiments.py +308 -0
  169. phoenix/server/api/routers/v1/pydantic_compat.py +78 -0
  170. phoenix/server/api/routers/v1/spans.py +267 -0
  171. phoenix/server/api/routers/v1/traces.py +208 -0
  172. phoenix/server/api/routers/v1/utils.py +95 -0
  173. phoenix/server/api/schema.py +44 -247
  174. phoenix/server/api/subscriptions.py +597 -0
  175. phoenix/server/api/types/Annotation.py +21 -0
  176. phoenix/server/api/types/AnnotationSummary.py +55 -0
  177. phoenix/server/api/types/AnnotatorKind.py +16 -0
  178. phoenix/server/api/types/ApiKey.py +27 -0
  179. phoenix/server/api/types/AuthMethod.py +9 -0
  180. phoenix/server/api/types/ChatCompletionMessageRole.py +11 -0
  181. phoenix/server/api/types/ChatCompletionSubscriptionPayload.py +46 -0
  182. phoenix/server/api/types/Cluster.py +25 -24
  183. phoenix/server/api/types/CreateDatasetPayload.py +8 -0
  184. phoenix/server/api/types/DataQualityMetric.py +31 -13
  185. phoenix/server/api/types/Dataset.py +288 -63
  186. phoenix/server/api/types/DatasetExample.py +85 -0
  187. phoenix/server/api/types/DatasetExampleRevision.py +34 -0
  188. phoenix/server/api/types/DatasetVersion.py +14 -0
  189. phoenix/server/api/types/Dimension.py +32 -31
  190. phoenix/server/api/types/DocumentEvaluationSummary.py +9 -8
  191. phoenix/server/api/types/EmbeddingDimension.py +56 -49
  192. phoenix/server/api/types/Evaluation.py +25 -31
  193. phoenix/server/api/types/EvaluationSummary.py +30 -50
  194. phoenix/server/api/types/Event.py +20 -20
  195. phoenix/server/api/types/ExampleRevisionInterface.py +14 -0
  196. phoenix/server/api/types/Experiment.py +152 -0
  197. phoenix/server/api/types/ExperimentAnnotationSummary.py +13 -0
  198. phoenix/server/api/types/ExperimentComparison.py +17 -0
  199. phoenix/server/api/types/ExperimentRun.py +119 -0
  200. phoenix/server/api/types/ExperimentRunAnnotation.py +56 -0
  201. phoenix/server/api/types/GenerativeModel.py +9 -0
  202. phoenix/server/api/types/GenerativeProvider.py +85 -0
  203. phoenix/server/api/types/Inferences.py +80 -0
  204. phoenix/server/api/types/InferencesRole.py +23 -0
  205. phoenix/server/api/types/LabelFraction.py +7 -0
  206. phoenix/server/api/types/MimeType.py +2 -2
  207. phoenix/server/api/types/Model.py +54 -54
  208. phoenix/server/api/types/PerformanceMetric.py +8 -5
  209. phoenix/server/api/types/Project.py +407 -142
  210. phoenix/server/api/types/ProjectSession.py +139 -0
  211. phoenix/server/api/types/Segments.py +4 -4
  212. phoenix/server/api/types/Span.py +221 -176
  213. phoenix/server/api/types/SpanAnnotation.py +43 -0
  214. phoenix/server/api/types/SpanIOValue.py +15 -0
  215. phoenix/server/api/types/SystemApiKey.py +9 -0
  216. phoenix/server/api/types/TemplateLanguage.py +10 -0
  217. phoenix/server/api/types/TimeSeries.py +19 -15
  218. phoenix/server/api/types/TokenUsage.py +11 -0
  219. phoenix/server/api/types/Trace.py +154 -0
  220. phoenix/server/api/types/TraceAnnotation.py +45 -0
  221. phoenix/server/api/types/UMAPPoints.py +7 -7
  222. phoenix/server/api/types/User.py +60 -0
  223. phoenix/server/api/types/UserApiKey.py +45 -0
  224. phoenix/server/api/types/UserRole.py +15 -0
  225. phoenix/server/api/types/node.py +13 -107
  226. phoenix/server/api/types/pagination.py +156 -57
  227. phoenix/server/api/utils.py +34 -0
  228. phoenix/server/app.py +864 -115
  229. phoenix/server/bearer_auth.py +163 -0
  230. phoenix/server/dml_event.py +136 -0
  231. phoenix/server/dml_event_handler.py +256 -0
  232. phoenix/server/email/__init__.py +0 -0
  233. phoenix/server/email/sender.py +97 -0
  234. phoenix/server/email/templates/__init__.py +0 -0
  235. phoenix/server/email/templates/password_reset.html +19 -0
  236. phoenix/server/email/types.py +11 -0
  237. phoenix/server/grpc_server.py +102 -0
  238. phoenix/server/jwt_store.py +505 -0
  239. phoenix/server/main.py +305 -116
  240. phoenix/server/oauth2.py +52 -0
  241. phoenix/server/openapi/__init__.py +0 -0
  242. phoenix/server/prometheus.py +111 -0
  243. phoenix/server/rate_limiters.py +188 -0
  244. phoenix/server/static/.vite/manifest.json +87 -0
  245. phoenix/server/static/assets/components-Cy9nwIvF.js +2125 -0
  246. phoenix/server/static/assets/index-BKvHIxkk.js +113 -0
  247. phoenix/server/static/assets/pages-CUi2xCVQ.js +4449 -0
  248. phoenix/server/static/assets/vendor-DvC8cT4X.js +894 -0
  249. phoenix/server/static/assets/vendor-DxkFTwjz.css +1 -0
  250. phoenix/server/static/assets/vendor-arizeai-Do1793cv.js +662 -0
  251. phoenix/server/static/assets/vendor-codemirror-BzwZPyJM.js +24 -0
  252. phoenix/server/static/assets/vendor-recharts-_Jb7JjhG.js +59 -0
  253. phoenix/server/static/assets/vendor-shiki-Cl9QBraO.js +5 -0
  254. phoenix/server/static/assets/vendor-three-DwGkEfCM.js +2998 -0
  255. phoenix/server/telemetry.py +68 -0
  256. phoenix/server/templates/index.html +82 -23
  257. phoenix/server/thread_server.py +3 -3
  258. phoenix/server/types.py +275 -0
  259. phoenix/services.py +27 -18
  260. phoenix/session/client.py +743 -68
  261. phoenix/session/data_extractor.py +31 -7
  262. phoenix/session/evaluation.py +3 -9
  263. phoenix/session/session.py +263 -219
  264. phoenix/settings.py +22 -0
  265. phoenix/trace/__init__.py +2 -22
  266. phoenix/trace/attributes.py +338 -0
  267. phoenix/trace/dsl/README.md +116 -0
  268. phoenix/trace/dsl/filter.py +663 -213
  269. phoenix/trace/dsl/helpers.py +73 -21
  270. phoenix/trace/dsl/query.py +574 -201
  271. phoenix/trace/exporter.py +24 -19
  272. phoenix/trace/fixtures.py +368 -32
  273. phoenix/trace/otel.py +71 -219
  274. phoenix/trace/projects.py +3 -2
  275. phoenix/trace/schemas.py +33 -11
  276. phoenix/trace/span_evaluations.py +21 -16
  277. phoenix/trace/span_json_decoder.py +6 -4
  278. phoenix/trace/span_json_encoder.py +2 -2
  279. phoenix/trace/trace_dataset.py +47 -32
  280. phoenix/trace/utils.py +21 -4
  281. phoenix/utilities/__init__.py +0 -26
  282. phoenix/utilities/client.py +132 -0
  283. phoenix/utilities/deprecation.py +31 -0
  284. phoenix/utilities/error_handling.py +3 -2
  285. phoenix/utilities/json.py +109 -0
  286. phoenix/utilities/logging.py +8 -0
  287. phoenix/utilities/project.py +2 -2
  288. phoenix/utilities/re.py +49 -0
  289. phoenix/utilities/span_store.py +0 -23
  290. phoenix/utilities/template_formatters.py +99 -0
  291. phoenix/version.py +1 -1
  292. arize_phoenix-3.16.0.dist-info/METADATA +0 -495
  293. arize_phoenix-3.16.0.dist-info/RECORD +0 -178
  294. phoenix/core/project.py +0 -617
  295. phoenix/core/traces.py +0 -100
  296. phoenix/experimental/evals/__init__.py +0 -73
  297. phoenix/experimental/evals/evaluators.py +0 -413
  298. phoenix/experimental/evals/functions/__init__.py +0 -4
  299. phoenix/experimental/evals/functions/classify.py +0 -453
  300. phoenix/experimental/evals/functions/executor.py +0 -353
  301. phoenix/experimental/evals/functions/generate.py +0 -138
  302. phoenix/experimental/evals/functions/processing.py +0 -76
  303. phoenix/experimental/evals/models/__init__.py +0 -14
  304. phoenix/experimental/evals/models/anthropic.py +0 -175
  305. phoenix/experimental/evals/models/base.py +0 -170
  306. phoenix/experimental/evals/models/bedrock.py +0 -221
  307. phoenix/experimental/evals/models/litellm.py +0 -134
  308. phoenix/experimental/evals/models/openai.py +0 -448
  309. phoenix/experimental/evals/models/rate_limiters.py +0 -246
  310. phoenix/experimental/evals/models/vertex.py +0 -173
  311. phoenix/experimental/evals/models/vertexai.py +0 -186
  312. phoenix/experimental/evals/retrievals.py +0 -96
  313. phoenix/experimental/evals/templates/__init__.py +0 -50
  314. phoenix/experimental/evals/templates/default_templates.py +0 -472
  315. phoenix/experimental/evals/templates/template.py +0 -195
  316. phoenix/experimental/evals/utils/__init__.py +0 -172
  317. phoenix/experimental/evals/utils/threads.py +0 -27
  318. phoenix/server/api/helpers.py +0 -11
  319. phoenix/server/api/routers/evaluation_handler.py +0 -109
  320. phoenix/server/api/routers/span_handler.py +0 -70
  321. phoenix/server/api/routers/trace_handler.py +0 -60
  322. phoenix/server/api/types/DatasetRole.py +0 -23
  323. phoenix/server/static/index.css +0 -6
  324. phoenix/server/static/index.js +0 -7447
  325. phoenix/storage/span_store/__init__.py +0 -23
  326. phoenix/storage/span_store/text_file.py +0 -85
  327. phoenix/trace/dsl/missing.py +0 -60
  328. phoenix/trace/langchain/__init__.py +0 -3
  329. phoenix/trace/langchain/instrumentor.py +0 -35
  330. phoenix/trace/llama_index/__init__.py +0 -3
  331. phoenix/trace/llama_index/callback.py +0 -102
  332. phoenix/trace/openai/__init__.py +0 -3
  333. phoenix/trace/openai/instrumentor.py +0 -30
  334. {arize_phoenix-3.16.0.dist-info → arize_phoenix-7.7.0.dist-info}/licenses/IP_NOTICE +0 -0
  335. {arize_phoenix-3.16.0.dist-info → arize_phoenix-7.7.0.dist-info}/licenses/LICENSE +0 -0
  336. /phoenix/{datasets → db/insertion}/__init__.py +0 -0
  337. /phoenix/{experimental → db/migrations}/__init__.py +0 -0
  338. /phoenix/{storage → db/migrations/data_migration_scripts}/__init__.py +0 -0
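
Among the renames in the list above, entries 50-55 move phoenix/datasets to phoenix/inferences (with dataset.py becoming inferences.py). A sketch of the corresponding import change for downstream code, assuming the public classes moved with their modules as the renames suggest:

# arize-phoenix 3.x
from phoenix.datasets.schema import Schema

# arize-phoenix 7.x
from phoenix.inferences.schema import Schema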
phoenix/experiments/functions.py
@@ -0,0 +1,772 @@
+ import asyncio
+ import functools
+ import inspect
+ import json
+ import traceback
+ from binascii import hexlify
+ from collections.abc import Awaitable, Mapping, Sequence
+ from contextlib import ExitStack
+ from copy import deepcopy
+ from dataclasses import replace
+ from datetime import datetime, timezone
+ from itertools import product
+ from typing import Any, Literal, Optional, Union, cast
+ from urllib.parse import urljoin
+
+ import httpx
+ import opentelemetry.sdk.trace as trace_sdk
+ import pandas as pd
+ from openinference.semconv.resource import ResourceAttributes
+ from openinference.semconv.trace import (
+     OpenInferenceMimeTypeValues,
+     OpenInferenceSpanKindValues,
+     SpanAttributes,
+ )
+ from opentelemetry.context import Context
+ from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
+ from opentelemetry.sdk.resources import Resource
+ from opentelemetry.sdk.trace import Span
+ from opentelemetry.sdk.trace.export import SimpleSpanProcessor
+ from opentelemetry.trace import Status, StatusCode, Tracer
+ from typing_extensions import TypeAlias
+
+ from phoenix.config import get_base_url
+ from phoenix.evals.executors import get_executor_on_sync_context
+ from phoenix.evals.models.rate_limiters import RateLimiter
+ from phoenix.evals.utils import get_tqdm_progress_bar_formatter
+ from phoenix.experiments.evaluators import create_evaluator
+ from phoenix.experiments.evaluators.base import (
+     Evaluator,
+     ExperimentEvaluator,
+ )
+ from phoenix.experiments.tracing import capture_spans
+ from phoenix.experiments.types import (
+     DRY_RUN,
+     Dataset,
+     EvaluationParameters,
+     EvaluationResult,
+     EvaluationSummary,
+     EvaluatorName,
+     Example,
+     Experiment,
+     ExperimentEvaluationRun,
+     ExperimentParameters,
+     ExperimentRun,
+     ExperimentTask,
+     RanExperiment,
+     TaskSummary,
+     TestCase,
+     _asdict,
+     _replace,
+ )
+ from phoenix.experiments.utils import get_dataset_experiments_url, get_experiment_url, get_func_name
+ from phoenix.trace.attributes import flatten
+ from phoenix.utilities.client import VersionedAsyncClient, VersionedClient
+ from phoenix.utilities.json import jsonify
+
+
+ def _phoenix_clients() -> tuple[httpx.Client, httpx.AsyncClient]:
+     return VersionedClient(
+         base_url=get_base_url(),
+     ), VersionedAsyncClient(
+         base_url=get_base_url(),
+     )
+
+
+ Evaluators: TypeAlias = Union[
+     ExperimentEvaluator,
+     Sequence[ExperimentEvaluator],
+     Mapping[EvaluatorName, ExperimentEvaluator],
+ ]
+
+
+ RateLimitErrors: TypeAlias = Union[type[BaseException], Sequence[type[BaseException]]]
+
+
+ def run_experiment(
+     dataset: Dataset,
+     task: ExperimentTask,
+     evaluators: Optional[Evaluators] = None,
+     *,
+     experiment_name: Optional[str] = None,
+     experiment_description: Optional[str] = None,
+     experiment_metadata: Optional[Mapping[str, Any]] = None,
+     rate_limit_errors: Optional[RateLimitErrors] = None,
+     dry_run: Union[bool, int] = False,
+     print_summary: bool = True,
+     concurrency: int = 3,
+ ) -> RanExperiment:
+     """
+     Runs an experiment on a given dataset of examples.
+
+     An experiment is a user-defined task that runs on each example in a dataset. The results from
+     each experiment can be evaluated using any number of evaluators to measure the behavior of the
+     task. The experiment and evaluation results are stored in the Phoenix database for comparison
+     and analysis.
+
+     A `task` is either a synchronous or asynchronous function that returns a JSON serializable
+     output. If the `task` is a function of one argument then that argument will be bound to the
+     `input` field of the dataset example. Alternatively, the `task` can be a function of any
+     combination of specific argument names that will be bound to special values:
+
+     - `input`: The input field of the dataset example
+     - `expected`: The expected or reference output of the dataset example
+     - `reference`: An alias for `expected`
+     - `metadata`: Metadata associated with the dataset example
+     - `example`: The dataset `Example` object with all associated fields
+
+     An `evaluator` is either a synchronous or asynchronous function that returns either a boolean
+     or numeric "score". If the `evaluator` is a function of one argument then that argument will be
+     bound to the `output` of the task. Alternatively, the `evaluator` can be a function of any
+     combination of specific argument names that will be bound to special values:
+
+     - `input`: The input field of the dataset example
+     - `output`: The output of the task
+     - `expected`: The expected or reference output of the dataset example
+     - `reference`: An alias for `expected`
+     - `metadata`: Metadata associated with the dataset example
+
+     Phoenix also provides pre-built evaluators in the `phoenix.experiments.evaluators` module.
+
+     Args:
+         dataset (Dataset): The dataset on which to run the experiment.
+         task (ExperimentTask): The task to run on each example in the dataset.
+         evaluators (Optional[Evaluators]): A single evaluator or sequence of evaluators used to
+             evaluate the results of the experiment. Defaults to None.
+         experiment_name (Optional[str]): The name of the experiment. Defaults to None.
+         experiment_description (Optional[str]): A description of the experiment. Defaults to None.
+         experiment_metadata (Optional[Mapping[str, Any]]): Metadata to associate with the
+             experiment. Defaults to None.
+         rate_limit_errors (Optional[RateLimitErrors]): An exception type or sequence of exception
+             types to adaptively throttle on. Defaults to None.
+         dry_run (bool | int): Run the experiment in dry-run mode. When set, experiment results will
+             not be recorded in Phoenix. If True, the experiment will run on a random dataset
+             example. If an integer, the experiment will run on a random sample of the dataset
+             examples of the given size. Defaults to False.
+         print_summary (bool): Whether to print a summary of the experiment and evaluation results.
+             Defaults to True.
+         concurrency (int): Specifies the concurrency for task execution. In order to enable
+             concurrent task execution, the task callable must be a coroutine function.
+             Defaults to 3.
+
+     Returns:
+         RanExperiment: The results of the experiment and evaluation. Additional evaluations can be
+             added to the experiment using the `evaluate_experiment` function.
+     """
+     task_signature = inspect.signature(task)
+     _validate_task_signature(task_signature)
+
+     if not dataset.examples:
+         raise ValueError(f"Dataset has no examples: {dataset.id=}, {dataset.version_id=}")
+     # Add this to the params once supported in the UI
+     repetitions = 1
+     assert repetitions > 0, "Must run the experiment at least once."
+     evaluators_by_name = _evaluators_by_name(evaluators)
+
+     sync_client, async_client = _phoenix_clients()
+
+     payload = {
+         "version_id": dataset.version_id,
+         "name": experiment_name,
+         "description": experiment_description,
+         "metadata": experiment_metadata,
+         "repetitions": repetitions,
+     }
+     if not dry_run:
+         experiment_response = sync_client.post(
+             f"/v1/datasets/{dataset.id}/experiments",
+             json=payload,
+         )
+         experiment_response.raise_for_status()
+         exp_json = experiment_response.json()["data"]
+         project_name = exp_json["project_name"]
+         experiment = Experiment(
+             dataset_id=dataset.id,
+             dataset_version_id=dataset.version_id,
+             repetitions=repetitions,
+             id=exp_json["id"],
+             project_name=project_name,
+         )
+     else:
+         experiment = Experiment(
+             dataset_id=dataset.id,
+             dataset_version_id=dataset.version_id,
+             repetitions=repetitions,
+             id=DRY_RUN,
+             project_name="",
+         )
+
+     tracer, resource = _get_tracer(experiment.project_name)
+     root_span_name = f"Task: {get_func_name(task)}"
+     root_span_kind = CHAIN
+
+     print("🧪 Experiment started.")
+     if dry_run:
+         examples = {
+             (ex := dataset[i]).id: ex
+             for i in pd.Series(range(len(dataset)))
+             .sample(min(len(dataset), int(dry_run)), random_state=42)
+             .sort_values()
+         }
+         id_selection = "\n".join(examples)
+         print(f"🌵️ This is a dry-run for these example IDs:\n{id_selection}")
+         dataset = replace(dataset, examples=examples)
+     else:
+         dataset_experiments_url = get_dataset_experiments_url(dataset_id=dataset.id)
+         experiment_compare_url = get_experiment_url(
+             dataset_id=dataset.id,
+             experiment_id=experiment.id,
+         )
+         print(f"📺 View dataset experiments: {dataset_experiments_url}")
+         print(f"🔗 View this experiment: {experiment_compare_url}")
+
+     def sync_run_experiment(test_case: TestCase) -> ExperimentRun:
+         example, repetition_number = test_case.example, test_case.repetition_number
+         output = None
+         error: Optional[BaseException] = None
+         status = Status(StatusCode.OK)
+         with ExitStack() as stack:
+             span: Span = stack.enter_context(
+                 tracer.start_as_current_span(root_span_name, context=Context())
+             )
+             stack.enter_context(capture_spans(resource))
+             try:
+                 # Do not use keyword arguments, which can fail at runtime
+                 # even when function obeys protocol, because keyword arguments
+                 # are implementation details.
+                 bound_task_args = _bind_task_signature(task_signature, example)
+                 _output = task(*bound_task_args.args, **bound_task_args.kwargs)
+                 if isinstance(_output, Awaitable):
+                     sync_error_message = (
+                         "Task is async and cannot be run within an existing event loop. "
+                         "Consider the following options:\n\n"
+                         "1. Pass in a synchronous task callable.\n"
+                         "2. Use `nest_asyncio.apply()` to allow nesting event loops."
+                     )
+                     raise RuntimeError(sync_error_message)
+                 else:
+                     output = _output
+             except BaseException as exc:
+                 span.record_exception(exc)
+                 status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
+                 error = exc
+                 _print_experiment_error(
+                     exc,
+                     example_id=example.id,
+                     repetition_number=repetition_number,
+                     kind="task",
+                 )
+             output = jsonify(output)
+             span.set_attribute(INPUT_VALUE, json.dumps(example.input, ensure_ascii=False))
+             span.set_attribute(INPUT_MIME_TYPE, JSON.value)
+             if output is not None:
+                 if isinstance(output, str):
+                     span.set_attribute(OUTPUT_VALUE, output)
+                 else:
+                     span.set_attribute(OUTPUT_VALUE, json.dumps(output, ensure_ascii=False))
+                     span.set_attribute(OUTPUT_MIME_TYPE, JSON.value)
+             span.set_attribute(SpanAttributes.OPENINFERENCE_SPAN_KIND, root_span_kind)
+             span.set_status(status)
+
+         assert isinstance(
+             output, (dict, list, str, int, float, bool, type(None))
+         ), "Output must be JSON serializable"
+         exp_run = ExperimentRun(
+             start_time=_decode_unix_nano(cast(int, span.start_time)),
+             end_time=_decode_unix_nano(cast(int, span.end_time)),
+             experiment_id=experiment.id,
+             dataset_example_id=example.id,
+             repetition_number=repetition_number,
+             output=output,
+             error=repr(error) if error else None,
+             trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
+         )
+         if not dry_run:
+             resp = sync_client.post(f"/v1/experiments/{experiment.id}/runs", json=jsonify(exp_run))
+             resp.raise_for_status()
+             exp_run = replace(exp_run, id=resp.json()["data"]["id"])
+         return exp_run
+
+     async def async_run_experiment(test_case: TestCase) -> ExperimentRun:
+         example, repetition_number = test_case.example, test_case.repetition_number
+         output = None
+         error: Optional[BaseException] = None
+         status = Status(StatusCode.OK)
+         with ExitStack() as stack:
+             span: Span = stack.enter_context(
+                 tracer.start_as_current_span(root_span_name, context=Context())
+             )
+             stack.enter_context(capture_spans(resource))
+             try:
+                 # Do not use keyword arguments, which can fail at runtime
+                 # even when function obeys protocol, because keyword arguments
+                 # are implementation details.
+                 bound_task_args = _bind_task_signature(task_signature, example)
+                 _output = task(*bound_task_args.args, **bound_task_args.kwargs)
+                 if isinstance(_output, Awaitable):
+                     output = await _output
+                 else:
+                     output = _output
+             except BaseException as exc:
+                 span.record_exception(exc)
+                 status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
+                 error = exc
+                 _print_experiment_error(
+                     exc,
+                     example_id=example.id,
+                     repetition_number=repetition_number,
+                     kind="task",
+                 )
+             output = jsonify(output)
+             span.set_attribute(INPUT_VALUE, json.dumps(example.input, ensure_ascii=False))
+             span.set_attribute(INPUT_MIME_TYPE, JSON.value)
+             if output is not None:
+                 if isinstance(output, str):
+                     span.set_attribute(OUTPUT_VALUE, output)
+                 else:
+                     span.set_attribute(OUTPUT_VALUE, json.dumps(output, ensure_ascii=False))
+                     span.set_attribute(OUTPUT_MIME_TYPE, JSON.value)
+             span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
+             span.set_status(status)
+
+         assert isinstance(
+             output, (dict, list, str, int, float, bool, type(None))
+         ), "Output must be JSON serializable"
+         exp_run = ExperimentRun(
+             start_time=_decode_unix_nano(cast(int, span.start_time)),
+             end_time=_decode_unix_nano(cast(int, span.end_time)),
+             experiment_id=experiment.id,
+             dataset_example_id=example.id,
+             repetition_number=repetition_number,
+             output=output,
+             error=repr(error) if error else None,
+             trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
+         )
+         if not dry_run:
+             # Below is a workaround to avoid timeout errors sometimes
+             # encountered when the task is a synchronous function that
+             # blocks for too long.
+             resp = await asyncio.get_running_loop().run_in_executor(
+                 None,
+                 functools.partial(
+                     sync_client.post,
+                     url=f"/v1/experiments/{experiment.id}/runs",
+                     json=jsonify(exp_run),
+                 ),
+             )
+             resp.raise_for_status()
+             exp_run = replace(exp_run, id=resp.json()["data"]["id"])
+         return exp_run
+
+     _errors: tuple[type[BaseException], ...]
+     if not isinstance(rate_limit_errors, Sequence):
+         _errors = (rate_limit_errors,) if rate_limit_errors is not None else ()
+     else:
+         _errors = tuple(filter(None, rate_limit_errors))
+     rate_limiters = [RateLimiter(rate_limit_error=rate_limit_error) for rate_limit_error in _errors]
+
+     rate_limited_sync_run_experiment = functools.reduce(
+         lambda fn, limiter: limiter.limit(fn), rate_limiters, sync_run_experiment
+     )
+     rate_limited_async_run_experiment = functools.reduce(
+         lambda fn, limiter: limiter.alimit(fn), rate_limiters, async_run_experiment
+     )
+
+     executor = get_executor_on_sync_context(
+         rate_limited_sync_run_experiment,
+         rate_limited_async_run_experiment,
+         max_retries=0,
+         exit_on_error=False,
+         fallback_return_value=None,
+         tqdm_bar_format=get_tqdm_progress_bar_formatter("running tasks"),
+         concurrency=concurrency,
+     )
+
+     test_cases = [
+         TestCase(example=deepcopy(ex), repetition_number=rep)
+         for ex, rep in product(dataset.examples.values(), range(1, repetitions + 1))
+     ]
+     task_runs, _execution_details = executor.run(test_cases)
+     print("✅ Task runs completed.")
+     params = ExperimentParameters(n_examples=len(dataset.examples), n_repetitions=repetitions)
+     task_summary = TaskSummary.from_task_runs(params, task_runs)
+     ran_experiment: RanExperiment = object.__new__(RanExperiment)
+     ran_experiment.__init__(  # type: ignore[misc]
+         params=params,
+         dataset=dataset,
+         runs={r.id: r for r in task_runs if r is not None},
+         task_summary=task_summary,
+         **_asdict(experiment),
+     )
+     if evaluators_by_name:
+         return evaluate_experiment(
+             ran_experiment,
+             evaluators=evaluators_by_name,
+             dry_run=dry_run,
+             print_summary=print_summary,
+             rate_limit_errors=rate_limit_errors,
+             concurrency=concurrency,
+         )
+     if print_summary:
+         print(ran_experiment)
+     return ran_experiment
+
+
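For orientation, a minimal sketch of how run_experiment might be invoked against a running Phoenix server; the dataset name, task, and evaluator below are hypothetical and assume the dataset was uploaded beforehand:

import phoenix as px
from phoenix.experiments import run_experiment

def task(input):
    # A single-argument task is bound to the example's `input` field.
    return str(input).upper()

def exact_match(output, expected):
    # Evaluator arguments are bound by name (`output`, `expected`, ...).
    return output == expected

dataset = px.Client().get_dataset(name="my-dataset")  # hypothetical dataset name
experiment = run_experiment(
    dataset,
    task,
    evaluators=[exact_match],
    experiment_name="uppercase-demo",
    dry_run=True,  # sample one random example; nothing is recorded in Phoenix
)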
+ def evaluate_experiment(
+     experiment: Experiment,
+     evaluators: Evaluators,
+     *,
+     dry_run: Union[bool, int] = False,
+     print_summary: bool = True,
+     rate_limit_errors: Optional[RateLimitErrors] = None,
+     concurrency: int = 3,
+ ) -> RanExperiment:
+     if not dry_run and _is_dry_run(experiment):
+         dry_run = True
+     evaluators_by_name = _evaluators_by_name(evaluators)
+     if not evaluators_by_name:
+         raise ValueError("Must specify at least one Evaluator")
+     sync_client, async_client = _phoenix_clients()
+     dataset_id = experiment.dataset_id
+     dataset_version_id = experiment.dataset_version_id
+     if isinstance(experiment, RanExperiment):
+         ran_experiment: RanExperiment = experiment
+     else:
+         dataset = Dataset.from_dict(
+             sync_client.get(
+                 f"/v1/datasets/{dataset_id}/examples",
+                 params={"version_id": str(dataset_version_id)},
+             ).json()["data"]
+         )
+         if not dataset.examples:
+             raise ValueError(f"Dataset has no examples: {dataset_id=}, {dataset_version_id=}")
+         experiment_runs = {
+             exp_run["id"]: ExperimentRun.from_dict(exp_run)
+             for exp_run in sync_client.get(f"/v1/experiments/{experiment.id}/runs").json()["data"]
+         }
+         if not experiment_runs:
+             raise ValueError("Experiment has not been run")
+         params = ExperimentParameters(n_examples=len(dataset.examples))
+         task_summary = TaskSummary.from_task_runs(params, experiment_runs.values())
+         ran_experiment = object.__new__(RanExperiment)
+         ran_experiment.__init__(  # type: ignore[misc]
+             dataset=dataset,
+             params=params,
+             runs=experiment_runs,
+             task_summary=task_summary,
+             **_asdict(experiment),
+         )
+     print("🧠 Evaluation started.")
+     examples = ran_experiment.dataset.examples
+     if dry_run:
+         if not _is_dry_run(ran_experiment):
+             dataset = ran_experiment.dataset
+             examples = {
+                 (ex := dataset[i]).id: ex
+                 for i in pd.Series(range(len(dataset)))
+                 .sample(min(len(dataset), int(dry_run)), random_state=42)
+                 .sort_values()
+             }
+             dataset = replace(ran_experiment.dataset, examples=examples)
+             ran_experiment = _replace(ran_experiment, id=DRY_RUN, dataset=dataset)
+         id_selection = "\n".join(examples)
+         print(f"🌵️ This is a dry-run for these example IDs:\n{id_selection}")
+     # not all dataset examples have associated experiment runs, so we need to pair them up
+     example_run_pairs = []
+     examples = ran_experiment.dataset.examples
+     for exp_run in ran_experiment.runs.values():
+         example = examples.get(exp_run.dataset_example_id)
+         if example:
+             example_run_pairs.append((deepcopy(example), exp_run))
+     evaluation_input = [
+         (example, run, evaluator)
+         for (example, run), evaluator in product(example_run_pairs, evaluators_by_name.values())
+     ]
+
+     tracer, resource = _get_tracer(None if dry_run else "evaluators")
+     root_span_kind = EVALUATOR
+
+     def sync_evaluate_run(
+         obj: tuple[Example, ExperimentRun, Evaluator],
+     ) -> ExperimentEvaluationRun:
+         example, experiment_run, evaluator = obj
+         result: Optional[EvaluationResult] = None
+         error: Optional[BaseException] = None
+         status = Status(StatusCode.OK)
+         root_span_name = f"Evaluation: {evaluator.name}"
+         with ExitStack() as stack:
+             span: Span = stack.enter_context(
+                 tracer.start_as_current_span(root_span_name, context=Context())
+             )
+             stack.enter_context(capture_spans(resource))
+             try:
+                 result = evaluator.evaluate(
+                     output=deepcopy(experiment_run.output),
+                     expected=example.output,
+                     reference=example.output,
+                     input=example.input,
+                     metadata=example.metadata,
+                 )
+             except BaseException as exc:
+                 span.record_exception(exc)
+                 status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
+                 error = exc
+                 _print_experiment_error(
+                     exc,
+                     example_id=example.id,
+                     repetition_number=experiment_run.repetition_number,
+                     kind="evaluator",
+                 )
+             if result:
+                 span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
+             span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
+             span.set_status(status)
+
+         eval_run = ExperimentEvaluationRun(
+             experiment_run_id=experiment_run.id,
+             start_time=_decode_unix_nano(cast(int, span.start_time)),
+             end_time=_decode_unix_nano(cast(int, span.end_time)),
+             name=evaluator.name,
+             annotator_kind=evaluator.kind,
+             error=repr(error) if error else None,
+             result=result,
+             trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
+         )
+         if not dry_run:
+             resp = sync_client.post("/v1/experiment_evaluations", json=jsonify(eval_run))
+             resp.raise_for_status()
+             eval_run = replace(eval_run, id=resp.json()["data"]["id"])
+         return eval_run
+
+     async def async_evaluate_run(
+         obj: tuple[Example, ExperimentRun, Evaluator],
+     ) -> ExperimentEvaluationRun:
+         example, experiment_run, evaluator = obj
+         result: Optional[EvaluationResult] = None
+         error: Optional[BaseException] = None
+         status = Status(StatusCode.OK)
+         root_span_name = f"Evaluation: {evaluator.name}"
+         with ExitStack() as stack:
+             span: Span = stack.enter_context(
+                 tracer.start_as_current_span(root_span_name, context=Context())
+             )
+             stack.enter_context(capture_spans(resource))
+             try:
+                 result = await evaluator.async_evaluate(
+                     output=deepcopy(experiment_run.output),
+                     expected=example.output,
+                     reference=example.output,
+                     input=example.input,
+                     metadata=example.metadata,
+                 )
+             except BaseException as exc:
+                 span.record_exception(exc)
+                 status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
+                 error = exc
+                 _print_experiment_error(
+                     exc,
+                     example_id=example.id,
+                     repetition_number=experiment_run.repetition_number,
+                     kind="evaluator",
+                 )
+             if result:
+                 span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
+             span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
+             span.set_status(status)
+
+         eval_run = ExperimentEvaluationRun(
+             experiment_run_id=experiment_run.id,
+             start_time=_decode_unix_nano(cast(int, span.start_time)),
+             end_time=_decode_unix_nano(cast(int, span.end_time)),
+             name=evaluator.name,
+             annotator_kind=evaluator.kind,
+             error=repr(error) if error else None,
+             result=result,
+             trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
+         )
+         if not dry_run:
+             # Below is a workaround to avoid timeout errors sometimes
+             # encountered when the evaluator is a synchronous function
+             # that blocks for too long.
+             resp = await asyncio.get_running_loop().run_in_executor(
+                 None,
+                 functools.partial(
+                     sync_client.post,
+                     url="/v1/experiment_evaluations",
+                     json=jsonify(eval_run),
+                 ),
+             )
+             resp.raise_for_status()
+             eval_run = replace(eval_run, id=resp.json()["data"]["id"])
+         return eval_run
+
+     _errors: tuple[type[BaseException], ...]
+     if not isinstance(rate_limit_errors, Sequence):
+         _errors = (rate_limit_errors,) if rate_limit_errors is not None else ()
+     else:
+         _errors = tuple(filter(None, rate_limit_errors))
+     rate_limiters = [RateLimiter(rate_limit_error=rate_limit_error) for rate_limit_error in _errors]
+
+     rate_limited_sync_evaluate_run = functools.reduce(
+         lambda fn, limiter: limiter.limit(fn), rate_limiters, sync_evaluate_run
+     )
+     rate_limited_async_evaluate_run = functools.reduce(
+         lambda fn, limiter: limiter.alimit(fn), rate_limiters, async_evaluate_run
+     )
+
+     executor = get_executor_on_sync_context(
+         rate_limited_sync_evaluate_run,
+         rate_limited_async_evaluate_run,
+         max_retries=0,
+         exit_on_error=False,
+         fallback_return_value=None,
+         tqdm_bar_format=get_tqdm_progress_bar_formatter("running experiment evaluations"),
+         concurrency=concurrency,
+     )
+     eval_runs, _execution_details = executor.run(evaluation_input)
+     eval_summary = EvaluationSummary.from_eval_runs(
+         EvaluationParameters(
+             eval_names=frozenset(evaluators_by_name),
+             exp_params=ran_experiment.params,
+         ),
+         *eval_runs,
+     )
+     ran_experiment = ran_experiment.add(eval_summary, *eval_runs)
+     if print_summary:
+         print(ran_experiment)
+     return ran_experiment
+
+
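Since evaluate_experiment accepts either the RanExperiment returned by run_experiment or a bare Experiment (in which case it refetches the dataset and runs from the server), additional evaluators can be attached after the fact. A minimal sketch, with a hypothetical string-containment evaluator:

from phoenix.experiments import evaluate_experiment

def contains_expected(output, expected):
    # Hypothetical check: does the task output mention the expected answer?
    return expected in str(output)

experiment = evaluate_experiment(experiment, evaluators=[contains_expected])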
+ def _evaluators_by_name(obj: Optional[Evaluators]) -> Mapping[EvaluatorName, Evaluator]:
+     evaluators_by_name: dict[EvaluatorName, Evaluator] = {}
+     if obj is None:
+         return evaluators_by_name
+     if isinstance(mapping := obj, Mapping):
+         for name, value in mapping.items():
+             evaluator = (
+                 create_evaluator(name=name)(value) if not isinstance(value, Evaluator) else value
+             )
+             name = evaluator.name
+             if name in evaluators_by_name:
+                 raise ValueError(f"Two evaluators have the same name: {name}")
+             evaluators_by_name[name] = evaluator
+     elif isinstance(seq := obj, Sequence):
+         for value in seq:
+             evaluator = create_evaluator()(value) if not isinstance(value, Evaluator) else value
+             name = evaluator.name
+             if name in evaluators_by_name:
+                 raise ValueError(f"Two evaluators have the same name: {name}")
+             evaluators_by_name[name] = evaluator
+     else:
+         assert not isinstance(obj, Mapping) and not isinstance(obj, Sequence)
+         evaluator = create_evaluator()(obj) if not isinstance(obj, Evaluator) else obj
+         name = evaluator.name
+         if name in evaluators_by_name:
+             raise ValueError(f"Two evaluators have the same name: {name}")
+         evaluators_by_name[name] = evaluator
+     return evaluators_by_name
+
+
+ def _get_tracer(project_name: Optional[str] = None) -> tuple[Tracer, Resource]:
+     resource = Resource({ResourceAttributes.PROJECT_NAME: project_name} if project_name else {})
+     tracer_provider = trace_sdk.TracerProvider(resource=resource)
+     span_processor = (
+         SimpleSpanProcessor(OTLPSpanExporter(urljoin(f"{get_base_url()}", "v1/traces")))
+         if project_name
+         else _NoOpProcessor()
+     )
+     tracer_provider.add_span_processor(span_processor)
+     return tracer_provider.get_tracer(__name__), resource
+
+
+ def _str_trace_id(id_: int) -> str:
+     return hexlify(id_.to_bytes(16, "big")).decode()
+
+
+ def _decode_unix_nano(time_unix_nano: int) -> datetime:
+     return datetime.fromtimestamp(time_unix_nano / 1e9, tz=timezone.utc)
+
+
+ def _is_dry_run(obj: Any) -> bool:
+     return hasattr(obj, "id") and isinstance(obj.id, str) and obj.id.startswith(DRY_RUN)
+
+
+ def _validate_task_signature(sig: inspect.Signature) -> None:
+     # Check that the function signature has a valid signature for use as a task
+     # If it does not, raise an error to exit early before running an experiment
+     params = sig.parameters
+     valid_named_params = {"input", "expected", "reference", "metadata", "example"}
+     if len(params) == 0:
+         raise ValueError("Task function must have at least one parameter.")
+     if len(params) > 1:
+         for not_found in set(params) - valid_named_params:
+             param = params[not_found]
+             if (
+                 param.kind is inspect.Parameter.VAR_KEYWORD
+                 or param.default is not inspect.Parameter.empty
+             ):
+                 continue
+             raise ValueError(
+                 (
+                     f"Invalid parameter name in task function: {not_found}. "
+                     "Parameter names for multi-argument functions must be "
+                     f"any of: {', '.join(valid_named_params)}."
+                 )
+             )
+
+
+ def _bind_task_signature(sig: inspect.Signature, example: Example) -> inspect.BoundArguments:
+     parameter_mapping = {
+         "input": example.input,
+         "expected": example.output,
+         "reference": example.output,  # Alias for "expected"
+         "metadata": example.metadata,
+         "example": example,
+     }
+     params = sig.parameters
+     if len(params) == 1:
+         parameter_name = next(iter(params))
+         if parameter_name in parameter_mapping:
+             return sig.bind(parameter_mapping[parameter_name])
+         else:
+             return sig.bind(parameter_mapping["input"])
+     return sig.bind_partial(
+         **{name: parameter_mapping[name] for name in set(parameter_mapping).intersection(params)}
+     )
+
+
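To see the binding rules in isolation, a small self-contained sketch that mirrors the multi-argument branch of _bind_task_signature against a stand-in example object (StubExample is a simplified stand-in, not the real phoenix.experiments.types.Example):

import inspect
from dataclasses import dataclass, field
from typing import Any

@dataclass
class StubExample:
    # Simplified stand-in for phoenix.experiments.types.Example.
    input: Any = "2 + 2"
    output: Any = "4"
    metadata: dict = field(default_factory=dict)

def task(input, expected):
    # Multi-argument tasks are bound by parameter name.
    return f"{input} -> {expected}"

ex = StubExample()
sig = inspect.signature(task)
parameter_mapping = {"input": ex.input, "expected": ex.output, "reference": ex.output,
                     "metadata": ex.metadata, "example": ex}
bound = sig.bind_partial(
    **{name: parameter_mapping[name] for name in set(parameter_mapping) & set(sig.parameters)}
)
print(task(*bound.args, **bound.kwargs))  # prints: 2 + 2 -> 4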
+ def _print_experiment_error(
+     error: BaseException,
+     /,
+     *,
+     example_id: str,
+     repetition_number: int,
+     kind: Literal["evaluator", "task"],
+ ) -> None:
+     """
+     Prints an experiment error.
+     """
+     display_error = RuntimeError(
+         f"{kind} failed for example id {repr(example_id)}, repetition {repr(repetition_number)}"
+     )
+     display_error.__cause__ = error
+     formatted_exception = "".join(
+         traceback.format_exception(type(display_error), display_error, display_error.__traceback__)
+     )
+     print("\033[91m" + formatted_exception + "\033[0m")  # prints in red
+
+
+ class _NoOpProcessor(trace_sdk.SpanProcessor):
+     def force_flush(self, *_: Any) -> bool:
+         return True
+
+
+ INPUT_VALUE = SpanAttributes.INPUT_VALUE
+ OUTPUT_VALUE = SpanAttributes.OUTPUT_VALUE
+ INPUT_MIME_TYPE = SpanAttributes.INPUT_MIME_TYPE
+ OUTPUT_MIME_TYPE = SpanAttributes.OUTPUT_MIME_TYPE
+ OPENINFERENCE_SPAN_KIND = SpanAttributes.OPENINFERENCE_SPAN_KIND
+
+ CHAIN = OpenInferenceSpanKindValues.CHAIN.value
+ EVALUATOR = OpenInferenceSpanKindValues.EVALUATOR.value
+ JSON = OpenInferenceMimeTypeValues.JSON