arize-phoenix 4.4.4rc6__py3-none-any.whl → 4.6.1__py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.

Potentially problematic release: this version of arize-phoenix might be problematic.

METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: arize-phoenix
-Version: 4.4.4rc6
+Version: 4.6.1
 Summary: AI Observability and Evaluation
 Project-URL: Documentation, https://docs.arize.com/phoenix/
 Project-URL: Issues, https://github.com/Arize-ai/phoenix/issues
@@ -41,6 +41,7 @@ Requires-Dist: protobuf<6.0,>=3.20
 Requires-Dist: psutil
 Requires-Dist: pyarrow
 Requires-Dist: python-multipart
+Requires-Dist: pyyaml
 Requires-Dist: scikit-learn
 Requires-Dist: scipy
 Requires-Dist: sqlalchemy[asyncio]<3,>=2.0.4
@@ -94,9 +95,10 @@ Requires-Dist: types-tabulate; extra == 'dev'
 Provides-Extra: evals
 Provides-Extra: experimental
 Provides-Extra: llama-index
-Requires-Dist: llama-index-embeddings-openai; extra == 'llama-index'
-Requires-Dist: llama-index-llms-openai; extra == 'llama-index'
-Requires-Dist: llama-index-readers-file; extra == 'llama-index'
+Requires-Dist: llama-index-agent-openai==0.2.7; extra == 'llama-index'
+Requires-Dist: llama-index-embeddings-openai==0.1.10; extra == 'llama-index'
+Requires-Dist: llama-index-llms-openai==0.1.24; extra == 'llama-index'
+Requires-Dist: llama-index-readers-file==0.1.25; extra == 'llama-index'
 Requires-Dist: llama-index==0.10.51; extra == 'llama-index'
 Provides-Extra: pg
 Requires-Dist: asyncpg; extra == 'pg'
RECORD CHANGED
@@ -5,7 +5,7 @@ phoenix/exceptions.py,sha256=n2L2KKuecrdflB9MsCdAYCiSEvGJptIsfRkXMoJle7A,169
 phoenix/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
 phoenix/services.py,sha256=aTxhcOA1pZHB6U-B3TEcp6fqDF5oT0xCUvEUNMZVTUQ,5175
 phoenix/settings.py,sha256=cO-qgis_S27nHirTobYI9hHPfZH18R--WMmxNdsVUwc,273
-phoenix/version.py,sha256=rZ0Z9PgUs79kMn4HpCH3vAEVOqqPCzzD7Xz8N5sa7qI,25
+phoenix/version.py,sha256=49swO1xv7jkVATRWLWBzrlaZyF15JMuWMQ7j2xkWsTY,22
 phoenix/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 phoenix/core/embedding_dimension.py,sha256=zKGbcvwOXgLf-yrJBpQyKtd-LEOPRKHnUToyAU8Owis,87
 phoenix/core/model.py,sha256=km_a--PBHOuA337ClRw9xqhOHhrUT6Rl9pz_zV0JYkQ,4843
@@ -18,7 +18,7 @@ phoenix/db/bulk_inserter.py,sha256=zbZGWZFDybKaGLGzpxgLwxAS5sC0_wXcvM0be4kUhh8,1
 phoenix/db/engines.py,sha256=vLWaZlToMtDI7rJDxSidYkfOoojamxaZxaz8ND3zTus,4770
 phoenix/db/helpers.py,sha256=L2_jP1iIWpUREhKLYYb4_vf_6v_BiU1E73Z2PczGm6s,1589
 phoenix/db/migrate.py,sha256=MuhtNWnR24riROvarvKfbRb4_D5xuQi6P760vBUKl1E,2270
-phoenix/db/models.py,sha256=zFtdhVuQFOvquyKsto62aqAVaTRUlq9gxU0j1M1yLdg,20408
+phoenix/db/models.py,sha256=7DBWbxY3cx3ve2P1I0kkDKXzlt04zEFJuRPJWsVpH-I,20422
 phoenix/db/insertion/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 phoenix/db/insertion/dataset.py,sha256=_vxy5e6W5jEuvO2fMKbbNCn9JvHkwI4LRKk_10eKFVg,7171
 phoenix/db/insertion/evaluation.py,sha256=HoUncZN9ZlIr1QO0uA37SbWhrjmwQVYVJlgFX2VefY8,7211
@@ -31,15 +31,15 @@ phoenix/db/migrations/types.py,sha256=Frq1AKSyBKQQ0FLzON-EmgTqE4kNkOpHMsbWnI-WgC
 phoenix/db/migrations/versions/10460e46d750_datasets.py,sha256=l69yZfScFrjfZZpY0gnqwhsDUEctLeo02qMgA_aOGDg,8155
 phoenix/db/migrations/versions/cf03bd6bae1d_init.py,sha256=CbWT3ZTR0CZqeT3zWLoTWhboFmnOy3Ju1z6Ztpq8WIM,8122
 phoenix/experiments/__init__.py,sha256=6JGwgUd7xCbGpuHqYZlsmErmYvVgv7N_j43bn3dUqsk,123
-phoenix/experiments/functions.py,sha256=w0A6BK80avoupxd3sPJZ_btftV1pXrkbZj4omR_H214,24723
+phoenix/experiments/functions.py,sha256=t0c4lCrK1wTjMlkXAXo1iLF0AYNneevzPur6gof_q8s,31643
 phoenix/experiments/tracing.py,sha256=wVpt8Ie9WNPoi1djJdcrkwCokHdTO0bicXViLg3O-1Y,2831
-phoenix/experiments/types.py,sha256=tj7DxfsU_nQP5bNe_h6p4KvRjkXKaaB3FeaIerAi_iA,22790
-phoenix/experiments/utils.py,sha256=ZZajvIrZTURhOX5Nx4nyogJEbI18sKCHYiYwOxz2vYU,340
+phoenix/experiments/types.py,sha256=HQ9k7dUTlOLZl0iGtZOnToUtZBYGos6afwvO44subAM,24035
+phoenix/experiments/utils.py,sha256=wLu5Kvt1b4a8rGPRWq5G8RQ9XSiV8fCIVm51zWBI3-g,758
 phoenix/experiments/evaluators/__init__.py,sha256=j63fi3fa3U7-itVPHa82GowhjQRU-wO6yhO34u_lhsA,714
-phoenix/experiments/evaluators/base.py,sha256=uhO4R06YWBbTxdpvXLldANnTxTA5r2h_Ktj-ZMLH57c,5305
+phoenix/experiments/evaluators/base.py,sha256=ani0F2TN7DMN0KLhV89LIr9-W4g-ccEl2YQJgfp44Js,5325
 phoenix/experiments/evaluators/code_evaluators.py,sha256=0qIKQS14Knze50ziJEPVEnNeV3QIs4g1IXtCmaWZu7o,3923
 phoenix/experiments/evaluators/llm_evaluators.py,sha256=EFce6LKZwUZDBa5ZozvcdqeZpdWM6n6bmq7_oIzM2Nw,9211
-phoenix/experiments/evaluators/utils.py,sha256=o84UTWN7fzjCGZDTS-KpGZ2VBrk2iSuO3X2LoC7pr3Y,6966
+phoenix/experiments/evaluators/utils.py,sha256=SroMoxmPZIFCi2MbEOvXlBAFJbEZY2IWgQvNFp3JP3A,6978
 phoenix/inferences/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 phoenix/inferences/errors.py,sha256=cGp9vxnw4SewFoWBV3ZGMkhE0Kh73lPIv3Ppz_H_RoA,8261
 phoenix/inferences/fixtures.py,sha256=FC2eRL4dpobKQHYOilFtDexUWFkMZ_w6jun_4WkbMk0,20792
@@ -134,9 +134,9 @@ phoenix/server/api/routers/v1/__init__.py,sha256=vvdpUa2LJPWEg8HbvDm_ANkBAwubPIF
 phoenix/server/api/routers/v1/dataset_examples.py,sha256=XfqOvDKF1oxb0pkeYfBycwwGt3LnSyyGdMLKC5VKoGQ,6690
 phoenix/server/api/routers/v1/datasets.py,sha256=f2gLG-geu-_wtEw4mKSzNWK2cFb5TYOyRL3tQ7Fl7Es,31544
 phoenix/server/api/routers/v1/evaluations.py,sha256=8g6P_e2BweV3RDU0esFmpkb0L5fCwonQPXiJ0y6HLwg,9126
-phoenix/server/api/routers/v1/experiment_evaluations.py,sha256=HeyV3PXS1BxQpzNOUBpQlX_0JH_jbjZjTxrqy2ujwJQ,2746
-phoenix/server/api/routers/v1/experiment_runs.py,sha256=_c7qmPIja_gpvoVaf_t7KtNc9Zz-0m9da9MS-EcbPBo,3918
-phoenix/server/api/routers/v1/experiments.py,sha256=ntb0lRV2h90mFepWiZfQ1MIAJhOaK9tkWzTejmpwed0,7243
+phoenix/server/api/routers/v1/experiment_evaluations.py,sha256=H_psVyuGUQImo0oxdEAKAMQ-oyVwkVIq5yaMHzHIiPc,5455
+phoenix/server/api/routers/v1/experiment_runs.py,sha256=u4Kgz1i5AffmCF2LHtC9Oo1hlGscZ3Dm8JlTRhM55yU,8307
+phoenix/server/api/routers/v1/experiments.py,sha256=cG-LyIGRdB1jVTL42Xi2__nsXibVe9Up7m3hFiTIYYY,11886
 phoenix/server/api/routers/v1/spans.py,sha256=PFeS3ayKj4cUle0CH-f-CpM1fRi-JicEG7BEtkANzAo,4074
 phoenix/server/api/routers/v1/traces.py,sha256=dYEf5pThenAQCgfQljHdrnwd4tC_tAXm6Kvk6GphPYs,2774
 phoenix/server/api/types/AnnotatorKind.py,sha256=UmYb2KG0JfxdX0mW1qrXrUoIgjMOncRJr1i8mJki1sE,141
@@ -165,7 +165,7 @@ phoenix/server/api/types/ExampleRevisionInterface.py,sha256=gV3Gt9-3Oi5wjaVtepC6
 phoenix/server/api/types/Experiment.py,sha256=ELYdYFKwgBllxx3cZ_X0XicHjLtshZl0bFqqJdVGXRQ,5177
 phoenix/server/api/types/ExperimentAnnotationSummary.py,sha256=Uk3JtxIrsMoZT5tqc4nJdUOM3XegVzjUyoV3pkjNotE,256
 phoenix/server/api/types/ExperimentComparison.py,sha256=0sFz6MoBDw39dds0qVyaqhVs9qqO5rkG1FMSjmfBeCc,441
-phoenix/server/api/types/ExperimentRun.py,sha256=8jUIi3ApVCqQHwnYe59CYhrmh5iZ6-QmlH5WpF7UWtM,2990
+phoenix/server/api/types/ExperimentRun.py,sha256=f_3qLeeMQpzjhuI1zOnXDXQlJyDied-7vBPGBPEOEAs,2995
 phoenix/server/api/types/ExperimentRunAnnotation.py,sha256=zGstMbS5OxNikEhD8VouY7Ls7YbxKm-0EmqvGeY3-DI,1773
 phoenix/server/api/types/ExportedFile.py,sha256=e3GTn7B5LgsTbqiwjhMCQH7VsiqXitrBO4aCMS1lHsg,163
 phoenix/server/api/types/Functionality.py,sha256=tzV9xdhB8zqfsjWxP66NDC7EZsplYkYO7jRbLWJIeeg,382
@@ -207,7 +207,7 @@ phoenix/server/static/modernizr.js,sha256=mvK-XtkNqjOral-QvzoqsyOMECXIMu5BQwSVN_
 phoenix/server/templates/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 phoenix/server/templates/index.html,sha256=S4z7qSoNSwnKFAH9r96AR-YJEyoKMd-VMWVlJ_IdzME,2039
 phoenix/session/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-phoenix/session/client.py,sha256=5mnWVqMFbC8NYbX4m2oRla1VvlmrgabD1oT2UdwDRJ8,33201
+phoenix/session/client.py,sha256=43MmopBHxPq2MprbSXixAzQyGr0VRhHEYZZ6WvITq1I,32343
 phoenix/session/data_extractor.py,sha256=dwhiDu-ISaXr8UI9I-CszZhB5BlUNmdDopjFZvMIXMw,2101
 phoenix/session/evaluation.py,sha256=aKeV8UVOyq3b7CYOwt3cWuLz0xzvMjX7vlEPILJ_fcs,5311
 phoenix/session/session.py,sha256=rjIuSSK2gAYIUPQTJc4E2ebew5o6I070FWRoFn4W3EI,26620
@@ -247,8 +247,8 @@ phoenix/utilities/logging.py,sha256=lDXd6EGaamBNcQxL4vP1au9-i_SXe0OraUDiJOcszSw,
 phoenix/utilities/project.py,sha256=qWsvKnG1oKhOFUowXf9qiOL2ia7jaFe_ijFFHEt8GJo,431
 phoenix/utilities/re.py,sha256=PDve_OLjRTM8yQQJHC8-n3HdIONi7aNils3ZKRZ5uBM,2045
 phoenix/utilities/span_store.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-arize_phoenix-4.4.4rc6.dist-info/METADATA,sha256=gyc5KyS4aFqefmGcezl1eC_8lCZ5DF0iHdSDh0V41f8,11337
-arize_phoenix-4.4.4rc6.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
-arize_phoenix-4.4.4rc6.dist-info/licenses/IP_NOTICE,sha256=JBqyyCYYxGDfzQ0TtsQgjts41IJoa-hiwDrBjCb9gHM,469
-arize_phoenix-4.4.4rc6.dist-info/licenses/LICENSE,sha256=HFkW9REuMOkvKRACuwLPT0hRydHb3zNg-fdFt94td18,3794
-arize_phoenix-4.4.4rc6.dist-info/RECORD,,
+arize_phoenix-4.6.1.dist-info/METADATA,sha256=OwKH-IBGxd43sS76T-D1Ix1LioGlWiw7WhQx13UyY-k,11451
+arize_phoenix-4.6.1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+arize_phoenix-4.6.1.dist-info/licenses/IP_NOTICE,sha256=JBqyyCYYxGDfzQ0TtsQgjts41IJoa-hiwDrBjCb9gHM,469
+arize_phoenix-4.6.1.dist-info/licenses/LICENSE,sha256=HFkW9REuMOkvKRACuwLPT0hRydHb3zNg-fdFt94td18,3794
+arize_phoenix-4.6.1.dist-info/RECORD,,
phoenix/db/models.py CHANGED
@@ -91,8 +91,8 @@ class UtcTimeStamp(TypeDecorator[datetime]):
         return normalize_datetime(value, timezone.utc)
 
 
-class ExperimentResult(TypedDict, total=False):
-    result: Any
+class ExperimentRunOutput(TypedDict, total=False):
+    task_output: Any
 
 
 class Base(DeclarativeBase):
@@ -110,7 +110,7 @@ class Base(DeclarativeBase):
     type_annotation_map = {
         Dict[str, Any]: JsonDict,
         List[Dict[str, Any]]: JsonList,
-        ExperimentResult: JsonDict,
+        ExperimentRunOutput: JsonDict,
     }
 
 
@@ -561,7 +561,7 @@ class ExperimentRun(Base):
     )
     repetition_number: Mapped[int]
     trace_id: Mapped[Optional[str]]
-    output: Mapped[ExperimentResult]
+    output: Mapped[ExperimentRunOutput]
    start_time: Mapped[datetime] = mapped_column(UtcTimeStamp)
    end_time: Mapped[datetime] = mapped_column(UtcTimeStamp)
    prompt_token_count: Mapped[Optional[int]]
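
In practice the rename only changes the key stored in the run's `output` JSON column: `{"result": ...}` becomes `{"task_output": ...}`. A minimal sketch of the new shape (sample values are illustrative):

    from typing import Any
    from typing_extensions import TypedDict

    class ExperimentRunOutput(TypedDict, total=False):
        task_output: Any  # was `result` on the old ExperimentResult TypedDict

    # What a run's output column now deserializes to:
    stored: ExperimentRunOutput = {"task_output": {"answer": "4"}}
    assert stored.get("task_output") == {"answer": "4"}  # old reads used .get("result")
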
phoenix/experiments/evaluators/base.py CHANGED
@@ -6,7 +6,7 @@ from typing import Any, Awaitable, Callable, Optional, Union
 
 from typing_extensions import TypeAlias
 
-from phoenix.experiments.evaluators.utils import validate_signature
+from phoenix.experiments.evaluators.utils import validate_evaluator_signature
 from phoenix.experiments.types import (
     AnnotatorKind,
     EvaluationResult,
@@ -108,7 +108,7 @@ class Evaluator(ABC):
 
 def _validate_sig(fn: Callable[..., Any], fn_name: str) -> None:
     sig = inspect.signature(fn)
-    validate_signature(sig)
+    validate_evaluator_signature(sig)
     for param in sig.parameters.values():
         if param.kind is inspect.Parameter.VAR_KEYWORD:
             return
phoenix/experiments/evaluators/utils.py CHANGED
@@ -8,6 +8,7 @@ from phoenix.experiments.types import (
     EvaluationResult,
     JSONSerializable,
 )
+from phoenix.experiments.utils import get_func_name
 
 if TYPE_CHECKING:
     from phoenix.experiments.evaluators.base import Evaluator
@@ -25,11 +26,11 @@ def unwrap_json(obj: JSONSerializable) -> JSONSerializable:
     return obj
 
 
-def validate_signature(sig: inspect.Signature) -> None:
+def validate_evaluator_signature(sig: inspect.Signature) -> None:
     # Check that the wrapped function has a valid signature for use as an evaluator
     # If it does not, raise an error to exit early before running evaluations
     params = sig.parameters
-    valid_named_params = {"input", "output", "expected", "metadata"}
+    valid_named_params = {"input", "output", "expected", "reference", "metadata"}
     if len(params) == 0:
         raise ValueError("Evaluation function must have at least one parameter.")
     if len(params) > 1:
@@ -49,11 +50,12 @@ def validate_signature(sig: inspect.Signature) -> None:
         )
 
 
-def _bind_signature(sig: inspect.Signature, **kwargs: Any) -> inspect.BoundArguments:
+def _bind_evaluator_signature(sig: inspect.Signature, **kwargs: Any) -> inspect.BoundArguments:
     parameter_mapping = {
         "input": kwargs.get("input"),
         "output": kwargs.get("output"),
         "expected": kwargs.get("expected"),
+        "reference": kwargs.get("reference"),  # `reference` is an alias for `expected`
         "metadata": kwargs.get("metadata"),
     }
     params = sig.parameters
@@ -82,16 +84,11 @@ def create_evaluator(
     def wrapper(func: Callable[..., Any]) -> "Evaluator":
         nonlocal name
         if not name:
-            if hasattr(func, "__self__"):
-                name = func.__self__.__class__.__name__
-            elif hasattr(func, "__name__"):
-                name = func.__name__
-            else:
-                name = str(func)
+            name = get_func_name(func)
         assert name is not None
 
         wrapped_signature = inspect.signature(func)
-        validate_signature(wrapped_signature)
+        validate_evaluator_signature(wrapped_signature)
 
         if inspect.iscoroutinefunction(func):
             return _wrap_coroutine_evaluation_function(name, kind, wrapped_signature, scorer)(func)
@@ -120,7 +117,7 @@ def _wrap_coroutine_evaluation_function(
             return await func(*args, **kwargs)
 
         async def async_evaluate(self, **kwargs: Any) -> EvaluationResult:
-            bound_signature = _bind_signature(sig, **kwargs)
+            bound_signature = _bind_evaluator_signature(sig, **kwargs)
             result = await func(*bound_signature.args, **bound_signature.kwargs)
             return convert_to_score(result)
 
@@ -148,7 +145,7 @@ def _wrap_sync_evaluation_function(
             return func(*args, **kwargs)
 
         def evaluate(self, **kwargs: Any) -> EvaluationResult:
-            bound_signature = _bind_signature(sig, **kwargs)
+            bound_signature = _bind_evaluator_signature(sig, **kwargs)
             result = func(*bound_signature.args, **bound_signature.kwargs)
             return convert_to_score(result)
phoenix/experiments/functions.py CHANGED
@@ -1,5 +1,7 @@
 import functools
+import inspect
 import json
+import traceback
 from binascii import hexlify
 from contextlib import ExitStack
 from copy import deepcopy
@@ -10,6 +12,7 @@ from typing import (
     Any,
     Awaitable,
     Dict,
+    Literal,
     Mapping,
     Optional,
     Sequence,
@@ -58,8 +61,8 @@ from phoenix.experiments.types import (
     Experiment,
     ExperimentEvaluationRun,
     ExperimentParameters,
-    ExperimentResult,
     ExperimentRun,
+    ExperimentRunOutput,
     ExperimentTask,
     RanExperiment,
     TaskSummary,
@@ -67,7 +70,7 @@ from phoenix.experiments.types import (
     _asdict,
     _replace,
 )
-from phoenix.experiments.utils import get_dataset_experiments_url, get_experiment_url
+from phoenix.experiments.utils import get_dataset_experiments_url, get_experiment_url, get_func_name
 from phoenix.trace.attributes import flatten
 from phoenix.utilities.json import jsonify
 
@@ -105,6 +108,61 @@ def run_experiment(
     dry_run: Union[bool, int] = False,
     print_summary: bool = True,
 ) -> RanExperiment:
+    """
+    Runs an experiment using a given dataset of examples.
+
+    An experiment is a user-defined task that runs on each example in a dataset. The results from
+    each experiment can be evaluated using any number of evaluators to measure the behavior of the
+    task. The experiment and evaluation results are stored in the Phoenix database for comparison
+    and analysis.
+
+    A `task` is either a synchronous or asynchronous function that returns a JSON serializable
+    output. If the `task` is a function of one argument then that argument will be bound to the
+    `input` field of the dataset example. Alternatively, the `task` can be a function of any
+    combination of specific argument names that will be bound to special values:
+        `input`: The input field of the dataset example
+        `expected`: The expected or reference output of the dataset example
+        `reference`: An alias for `expected`
+        `metadata`: Metadata associated with the dataset example
+        `example`: The dataset `Example` object with all associated fields
+
+    An `evaluator` is either a synchronous or asynchronous function that returns either a boolean
+    or numeric "score". If the `evaluator` is a function of one argument then that argument will be
+    bound to the `output` of the task. Alternatively, the `evaluator` can be a function of any
+    combination of specific argument names that will be bound to special values:
+        `input`: The input field of the dataset example
+        `output`: The output of the task
+        `expected`: The expected or reference output of the dataset example
+        `reference`: An alias for `expected`
+        `metadata`: Metadata associated with the dataset example
+
+    Phoenix also provides pre-built evaluators in the `phoenix.experiments.evaluators` module.
+
+    Args:
+        dataset (Dataset): The dataset on which to run the experiment.
+        task (ExperimentTask): The task to run on each example in the dataset.
+        evaluators (Optional[Evaluators]): A single evaluator or sequence of evaluators used to
+            evaluate the results of the experiment. Defaults to None.
+        experiment_name (Optional[str]): The name of the experiment. Defaults to None.
+        experiment_description (Optional[str]): A description of the experiment. Defaults to None.
+        experiment_metadata (Optional[Mapping[str, Any]]): Metadata to associate with the
+            experiment. Defaults to None.
+        rate_limit_errors (Optional[BaseException | Sequence[BaseException]]): An exception or
+            sequence of exceptions to adaptively throttle on. Defaults to None.
+        dry_run (bool | int): Run the experiment in dry-run mode. When set, experiment results will
+            not be recorded in Phoenix. If True, the experiment will run on a random dataset
+            example. If an integer, the experiment will run on a random sample of the dataset
+            examples of the given size. Defaults to False.
+        print_summary (bool): Whether to print a summary of the experiment and evaluation results.
+            Defaults to True.
+
+    Returns:
+        RanExperiment: The results of the experiment and evaluation. Additional evaluations can be
+            added to the experiment using the `evaluate_experiment` function.
+    """
+    task_signature = inspect.signature(task)
+    _validate_task_signature(task_signature)
+
     if not dataset.examples:
         raise ValueError(f"Dataset has no examples: {dataset.id=}, {dataset.version_id=}")
     # Add this to the params once supported in the UI
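
A hedged usage sketch of the signature binding the docstring describes; the dataset name, task body, and evaluator are illustrative:

    import phoenix as px
    from phoenix.experiments import run_experiment

    def task(input, reference):
        # bound by name: `input` is example.input, `reference` aliases `expected`
        return f"predicted: {reference['answer']}"

    def matches(output, expected):
        return str(expected["answer"]) in output

    dataset = px.Client().get_dataset(name="my-dataset")  # assumed dataset name
    experiment = run_experiment(
        dataset,
        task,
        evaluators=[matches],
        experiment_name="signature-binding-demo",
        dry_run=3,  # sample three examples; nothing is recorded in Phoenix
    )
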
@@ -146,7 +204,7 @@ def run_experiment(
     )
 
     tracer, resource = _get_tracer(experiment.project_name)
-    root_span_name = f"Task: {_get_task_name(task)}"
+    root_span_name = f"Task: {get_func_name(task)}"
     root_span_kind = CHAIN
 
     print("🧪 Experiment started.")
@@ -183,25 +241,37 @@ def run_experiment(
                 # Do not use keyword arguments, which can fail at runtime
                 # even when function obeys protocol, because keyword arguments
                 # are implementation details.
-                _output = task(example)
+                bound_task_args = _bind_task_signature(task_signature, example)
+                _output = task(*bound_task_args.args, **bound_task_args.kwargs)
                 if isinstance(_output, Awaitable):
-                    raise RuntimeError("Task is async but running in sync context")
+                    sync_error_message = (
+                        "Task is async and cannot be run within an existing event loop. "
+                        "Consider the following options:\n\n"
+                        "1. Pass in a synchronous task callable.\n"
+                        "2. Use `nest_asyncio.apply()` to allow nesting event loops."
+                    )
+                    raise RuntimeError(sync_error_message)
                 else:
                     output = _output
             except BaseException as exc:
                 span.record_exception(exc)
                 status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
                 error = exc
+                _print_experiment_error(
+                    exc,
+                    example_id=example.id,
+                    repetition_number=repetition_number,
+                    kind="task",
+                )
             output = jsonify(output)
             span.set_attribute(INPUT_VALUE, json.dumps(example.input, ensure_ascii=False))
             span.set_attribute(INPUT_MIME_TYPE, JSON.value)
-            if result := ExperimentResult(result=output) if output is not None else None:
+            if output is not None:
                 if isinstance(output, str):
                     span.set_attribute(OUTPUT_VALUE, output)
                 else:
                     span.set_attribute(OUTPUT_VALUE, json.dumps(output, ensure_ascii=False))
                     span.set_attribute(OUTPUT_MIME_TYPE, JSON.value)
-            span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
             span.set_attribute(SpanAttributes.OPENINFERENCE_SPAN_KIND, root_span_kind)
             span.set_status(status)
 
@@ -214,7 +284,7 @@ def run_experiment(
             experiment_id=experiment.id,
             dataset_example_id=example.id,
             repetition_number=repetition_number,
-            output=result,
+            experiment_run_output=ExperimentRunOutput(task_output=output),
             error=repr(error) if error else None,
             trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
         )
@@ -238,7 +308,8 @@ def run_experiment(
                 # Do not use keyword arguments, which can fail at runtime
                 # even when function obeys protocol, because keyword arguments
                 # are implementation details.
-                _output = task(example)
+                bound_task_args = _bind_task_signature(task_signature, example)
+                _output = task(*bound_task_args.args, **bound_task_args.kwargs)
                 if isinstance(_output, Awaitable):
                     output = await _output
                 else:
@@ -247,16 +318,21 @@ def run_experiment(
                 span.record_exception(exc)
                 status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
                 error = exc
+                _print_experiment_error(
+                    exc,
+                    example_id=example.id,
+                    repetition_number=repetition_number,
+                    kind="task",
+                )
             output = jsonify(output)
             span.set_attribute(INPUT_VALUE, json.dumps(example.input, ensure_ascii=False))
             span.set_attribute(INPUT_MIME_TYPE, JSON.value)
-            if result := ExperimentResult(result=output) if output is not None else None:
+            if output is not None:
                 if isinstance(output, str):
                     span.set_attribute(OUTPUT_VALUE, output)
                 else:
                     span.set_attribute(OUTPUT_VALUE, json.dumps(output, ensure_ascii=False))
                     span.set_attribute(OUTPUT_MIME_TYPE, JSON.value)
-            span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
             span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
             span.set_status(status)
 
@@ -269,7 +345,7 @@ def run_experiment(
             experiment_id=experiment.id,
             dataset_example_id=example.id,
             repetition_number=repetition_number,
-            output=result,
+            experiment_run_output=ExperimentRunOutput(task_output=output),
             error=repr(error) if error else None,
             trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
         )
@@ -422,8 +498,9 @@ def evaluate_experiment(
             stack.enter_context(capture_spans(resource))
             try:
                 result = evaluator.evaluate(
-                    output=experiment_run.task_output,
+                    output=experiment_run.output,
                     expected=example.output,
+                    reference=example.output,
                     input=example.input,
                     metadata=example.metadata,
                 )
@@ -431,6 +508,12 @@ def evaluate_experiment(
                 span.record_exception(exc)
                 status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
                 error = exc
+                _print_experiment_error(
+                    exc,
+                    example_id=example.id,
+                    repetition_number=experiment_run.repetition_number,
+                    kind="evaluator",
+                )
             if result:
                 span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
             span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
@@ -467,8 +550,9 @@ def evaluate_experiment(
             stack.enter_context(capture_spans(resource))
             try:
                 result = await evaluator.async_evaluate(
-                    output=experiment_run.task_output,
+                    output=experiment_run.output,
                     expected=example.output,
+                    reference=example.output,
                     input=example.input,
                     metadata=example.metadata,
                 )
@@ -476,6 +560,12 @@ def evaluate_experiment(
                 span.record_exception(exc)
                 status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
                 error = exc
+                _print_experiment_error(
+                    exc,
+                    example_id=example.id,
+                    repetition_number=experiment_run.repetition_number,
+                    kind="evaluator",
+                )
             if result:
                 span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
             span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
@@ -584,20 +674,71 @@ def _decode_unix_nano(time_unix_nano: int) -> datetime:
     return datetime.fromtimestamp(time_unix_nano / 1e9, tz=timezone.utc)
 
 
-def _get_task_name(task: ExperimentTask) -> str:
-    """
-    Makes a best-effort attempt to get the name of the task.
-    """
+def _is_dry_run(obj: Any) -> bool:
+    return hasattr(obj, "id") and isinstance(obj.id, str) and obj.id.startswith(DRY_RUN)
 
-    if isinstance(task, functools.partial):
-        return task.func.__qualname__
-    if hasattr(task, "__qualname__"):
-        return task.__qualname__
-    return str(task)
 
+def _validate_task_signature(sig: inspect.Signature) -> None:
+    # Check that the function signature has a valid signature for use as a task
+    # If it does not, raise an error to exit early before running an experiment
+    params = sig.parameters
+    valid_named_params = {"input", "expected", "reference", "metadata", "example"}
+    if len(params) == 0:
+        raise ValueError("Task function must have at least one parameter.")
+    if len(params) > 1:
+        for not_found in set(params) - valid_named_params:
+            param = params[not_found]
+            if (
+                param.kind is inspect.Parameter.VAR_KEYWORD
+                or param.default is not inspect.Parameter.empty
+            ):
+                continue
+            raise ValueError(
+                (
+                    f"Invalid parameter names in task function: {', '.join(not_found)}. "
+                    "Parameters names for multi-argument functions must be "
+                    f"any of: {', '.join(valid_named_params)}."
+                )
+            )
 
-def _is_dry_run(obj: Any) -> bool:
-    return hasattr(obj, "id") and isinstance(obj.id, str) and obj.id.startswith(DRY_RUN)
+
+def _bind_task_signature(sig: inspect.Signature, example: Example) -> inspect.BoundArguments:
+    parameter_mapping = {
+        "input": example.input,
+        "expected": example.output,
+        "reference": example.output,  # Alias for "expected"
+        "metadata": example.metadata,
+        "example": example,
+    }
+    params = sig.parameters
+    if len(params) == 1:
+        parameter_name = next(iter(params))
+        if parameter_name in parameter_mapping:
+            return sig.bind(parameter_mapping[parameter_name])
+        else:
+            return sig.bind(parameter_mapping["input"])
+    return sig.bind_partial(
+        **{name: parameter_mapping[name] for name in set(parameter_mapping).intersection(params)}
+    )
+
+
+def _print_experiment_error(
+    error: BaseException,
+    /,
+    *,
+    example_id: str,
+    repetition_number: int,
+    kind: Literal["evaluator", "task"],
+) -> None:
+    """
+    Prints an experiment error.
+    """
+    display_error = RuntimeError(
+        f"{kind} failed for example id {repr(example_id)}, " f"repetition {repr(repetition_number)}"
+    )
+    display_error.__cause__ = error
+    formatted_exception = "".join(traceback.format_exception(display_error))  # type: ignore[arg-type, call-arg, unused-ignore]
+    print("\033[91m" + formatted_exception + "\033[0m")  # prints in red
 
 
 class _NoOpProcessor(trace_sdk.SpanProcessor):
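
The new helpers mirror the evaluator utilities: `_validate_task_signature` rejects unknown required parameter names up front, and `_bind_task_signature` binds a lone parameter to `example.input` whatever it is called. A small sketch of the binding rule using only the standard library (the example object is a stand-in; the real `Example` carries more fields):

    import inspect

    class FakeExample:  # stand-in with just the attributes the binder reads
        input = {"question": "2+2?"}
        output = {"answer": "4"}
        metadata = {"split": "test"}

    mapping = {
        "input": FakeExample.input,
        "expected": FakeExample.output,
        "reference": FakeExample.output,  # alias for "expected"
        "metadata": FakeExample.metadata,
        "example": FakeExample,
    }

    def task(prompt):  # single parameter: bound positionally to example.input
        return prompt

    sig = inspect.signature(task)
    print(sig.bind(mapping["input"]).args)  # ({'question': '2+2?'},)

    def task2(input, reference):  # multiple parameters: keyword-bound by name
        return input, reference

    sig2 = inspect.signature(task2)
    bound = sig2.bind_partial(**{k: mapping[k] for k in set(mapping) & set(sig2.parameters)})
    print(sorted(bound.arguments))  # ['input', 'reference']
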
phoenix/experiments/types.py CHANGED
@@ -103,9 +103,9 @@ class Example:
         identifiers = [f'{spaces}id="{self.id}",']
         contents = [
             spaces
-            + f"{k}="
+            + f"{_blue(key)}="
             + json.dumps(
-                _shorten(v),
+                _shorten(value),
                 ensure_ascii=False,
                 sort_keys=True,
                 indent=len(spaces),
@@ -113,8 +113,8 @@ class Example:
             .replace("\n", f"\n{spaces}")
             .replace(' "..."\n', " ...\n")
             + ","
-            for k in ("input", "output", "metadata")
-            if (v := getattr(self, k, None))
+            for key in ("input", "output", "metadata")
+            if (value := getattr(self, key, None))
         ]
         return "\n".join([f"{name}(", *identifiers, *contents, ")"])
 
@@ -199,17 +199,17 @@ class Experiment:
 
 
 @dataclass(frozen=True)
-class ExperimentResult:
-    result: TaskOutput
+class ExperimentRunOutput:
+    task_output: TaskOutput
 
     def __post_init__(self) -> None:
-        object.__setattr__(self, "result", _make_read_only(self.result))
+        object.__setattr__(self, "task_output", _make_read_only(self.task_output))
 
     @classmethod
-    def from_dict(cls, obj: Optional[Mapping[str, Any]]) -> Optional[ExperimentResult]:
+    def from_dict(cls, obj: Optional[Mapping[str, Any]]) -> ExperimentRunOutput:
         if not obj:
-            return None
-        return cls(result=obj["result"])
+            return cls(task_output=None)
+        return cls(task_output=obj["task_output"])
 
 
 @dataclass(frozen=True)
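
Note the behavioral change in `from_dict`: an empty payload used to produce `None`, whereas it now produces an `ExperimentRunOutput` whose `task_output` is `None`, so callers such as the `ExperimentRun.output` property below no longer need a null check. For instance:

    from phoenix.experiments.types import ExperimentRunOutput

    ExperimentRunOutput.from_dict(None)                   # ExperimentRunOutput(task_output=None)
    ExperimentRunOutput.from_dict({"task_output": "hi"})  # ExperimentRunOutput(task_output='hi')
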
@@ -219,14 +219,14 @@ class ExperimentRun:
     experiment_id: ExperimentId
     dataset_example_id: ExampleId
     repetition_number: RepetitionNumber
-    output: Optional[ExperimentResult] = None
+    experiment_run_output: ExperimentRunOutput
     error: Optional[str] = None
     id: ExperimentRunId = field(default_factory=_dry_run_id)
     trace_id: Optional[TraceId] = None
 
     @property
-    def task_output(self) -> Optional[TaskOutput]:
-        return deepcopy(self.output.result) if self.output else None
+    def output(self) -> Optional[TaskOutput]:
+        return deepcopy(self.experiment_run_output.task_output)
 
     @classmethod
     def from_dict(cls, obj: Mapping[str, Any]) -> ExperimentRun:
@@ -236,15 +236,15 @@ class ExperimentRun:
             experiment_id=obj["experiment_id"],
             dataset_example_id=obj["dataset_example_id"],
             repetition_number=obj.get("repetition_number") or 1,
-            output=ExperimentResult.from_dict(obj["output"]),
+            experiment_run_output=ExperimentRunOutput.from_dict(obj["experiment_run_output"]),
             error=obj.get("error"),
             id=obj["id"],
             trace_id=obj.get("trace_id"),
         )
 
     def __post_init__(self) -> None:
-        if bool(self.output) == bool(self.error):
-            ValueError("Must specify either result or error")
+        if bool(self.experiment_run_output) == bool(self.error):
+            ValueError("Must specify exactly one of experiment_run_output or error")
 
 
 @dataclass(frozen=True)
@@ -571,7 +571,7 @@ class RanExperiment(Experiment):
                 {
                     "run_id": run.id,
                     "error": run.error,
-                    "result": deepcopy(run.output.result) if run.output else None,
+                    "output": deepcopy(run.experiment_run_output.task_output),
                     "input": deepcopy((ex := self.dataset.examples[run.dataset_example_id]).input),
                     "expected": deepcopy(ex.output),
                     "metadata": deepcopy(ex.metadata),
@@ -688,6 +688,10 @@ class _ExperimentRunWithExample(ObjectProxy):  # type: ignore[misc]
     def expected(self) -> ExampleOutput:
         return deepcopy(self._self_example.output)
 
+    @property
+    def reference(self) -> ExampleOutput:
+        return deepcopy(self._self_example.output)
+
     @property
     def input(self) -> ExampleInput:
         return deepcopy(self._self_example.input)
@@ -703,20 +707,47 @@ class _ExperimentRunWithExample(ObjectProxy):  # type: ignore[misc]
             f'{spaces}id="{self.id}",',
             f'{spaces}example_id="{self.dataset_example_id}",',
         ]
-        contents = [
+        outputs = [
+            *([f'{spaces}error="{self.error}",'] if self.error else []),
+            *(
+                [
+                    f"{spaces}{_blue('output')}="
+                    + json.dumps(
+                        _shorten(self.output),
+                        ensure_ascii=False,
+                        sort_keys=True,
+                        indent=len(spaces),
+                    )
+                    .replace("\n", f"\n{spaces}")
+                    .replace(' "..."\n', " ...\n")
+                ]
+                if not self.error
+                else []
+            ),
+        ]
+        dicts = [
             spaces
-            + f"{k}="
-            + json.dumps(_shorten(v), ensure_ascii=False, sort_keys=True, indent=len(spaces))
+            + f"{_blue(alias)}={{"
+            + (f" # {comment}" if comment else "")
+            + json.dumps(
+                _shorten(value),
+                ensure_ascii=False,
+                sort_keys=True,
+                indent=len(spaces),
+            )[1:]
             .replace("\n", f"\n{spaces}")
             .replace(' "..."\n', " ...\n")
             + ","
-            for k, v in {
-                "error": self.error,
-                "output": self.task_output,
-                "expected": self.expected,
-                "input": self.input,
-                "metadata": self.metadata,
-            }.items()
-            if v
+            for alias, value, comment in (
+                ("expected", self.expected, f"alias for the example.{_blue('output')} dict"),
+                ("reference", self.reference, f"alias for the example.{_blue('output')} dict"),
+                ("input", self.input, f"alias for the example.{_blue('input')} dict"),
+                ("metadata", self.metadata, f"alias for the example.{_blue('metadata')} dict"),
+            )
+            if value
         ]
-        return "\n".join([f"{name}(", *identifiers, *contents, ")"])
+        return "\n".join([f"{name}(", *identifiers, *outputs, *dicts, ")"])
+
+
+def _blue(text: str) -> str:
+    return f"\033[1m\033[94m{text}\033[0m"
phoenix/experiments/utils.py CHANGED
@@ -1,3 +1,6 @@
+import functools
+from typing import Any, Callable
+
 from phoenix.config import get_web_base_url
 
 
@@ -7,3 +10,15 @@ def get_experiment_url(*, dataset_id: str, experiment_id: str) -> str:
 
 def get_dataset_experiments_url(*, dataset_id: str) -> str:
     return f"{get_web_base_url()}datasets/{dataset_id}/experiments"
+
+
+def get_func_name(fn: Callable[..., Any]) -> str:
+    """
+    Makes a best-effort attempt to get the name of the function.
+    """
+
+    if isinstance(fn, functools.partial):
+        return fn.func.__qualname__
+    if hasattr(fn, "__qualname__") and not fn.__qualname__.endswith("<lambda>"):
+        return fn.__qualname__.split(".<locals>.")[-1]
+    return str(fn)
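
Compared with the removed `_get_task_name`, the new helper also unwraps `.<locals>.` qualnames and falls back to `str(fn)` for lambdas. What the logic above implies for common callables:

    import functools
    from phoenix.experiments.utils import get_func_name

    def outer():
        def inner(x):
            return x
        return inner

    print(get_func_name(functools.partial(outer)))  # "outer" (partials are unwrapped)
    print(get_func_name(outer()))                   # "inner" (".<locals>." prefix stripped)
    print(get_func_name(lambda x: x))               # "<function <lambda> at 0x...>" via str(fn)
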
phoenix/server/api/routers/v1/experiment_evaluations.py CHANGED
@@ -12,6 +12,84 @@ from phoenix.server.api.types.node import from_global_id_with_expected_type
 
 
 async def upsert_experiment_evaluation(request: Request) -> Response:
+    """
+    summary: Create an evaluation for a specific experiment run
+    operationId: upsertExperimentEvaluation
+    tags:
+      - private
+    requestBody:
+      description: Details of the experiment evaluation to be upserted
+      required: true
+      content:
+        application/json:
+          schema:
+            type: object
+            properties:
+              experiment_run_id:
+                type: string
+                description: The ID of the experiment run being evaluated
+              name:
+                type: string
+                description: The name of the evaluation
+              annotator_kind:
+                type: string
+                description: The kind of annotator used for the evaluation
+              result:
+                type: object
+                description: The result of the evaluation
+                properties:
+                  label:
+                    type: string
+                    description: The label assigned by the evaluation
+                  score:
+                    type: number
+                    format: float
+                    description: The score assigned by the evaluation
+                  explanation:
+                    type: string
+                    description: Explanation of the evaluation result
+              error:
+                type: string
+                description: Optional error message if the evaluation encountered an error
+              metadata:
+                type: object
+                description: Metadata for the evaluation
+                additionalProperties:
+                  type: string
+              start_time:
+                type: string
+                format: date-time
+                description: The start time of the evaluation in ISO format
+              end_time:
+                type: string
+                format: date-time
+                description: The end time of the evaluation in ISO format
+              trace_id:
+                type: string
+                description: Optional trace ID for tracking
+            required:
+              - experiment_run_id
+              - name
+              - annotator_kind
+              - start_time
+              - end_time
+    responses:
+      200:
+        description: Experiment evaluation upserted successfully
+        content:
+          application/json:
+            schema:
+              type: object
+              properties:
+                data:
+                  type: object
+                  properties:
+                    id:
+                      type: string
+                      description: The ID of the upserted experiment evaluation
+      404:
+        description: ExperimentRun not found
+    """
     payload = await request.json()
     experiment_run_gid = GlobalID.from_id(payload["experiment_run_id"])
     try:
phoenix/server/api/routers/v1/experiment_runs.py CHANGED
@@ -7,12 +7,79 @@ from starlette.status import HTTP_404_NOT_FOUND
 from strawberry.relay import GlobalID
 
 from phoenix.db import models
-from phoenix.experiments.types import ExperimentResult, ExperimentRun
+from phoenix.experiments.types import ExperimentRun, ExperimentRunOutput
 from phoenix.server.api.types.node import from_global_id_with_expected_type
 from phoenix.utilities.json import jsonify
 
 
 async def create_experiment_run(request: Request) -> Response:
+    """
+    summary: Create a new experiment run for a specific experiment
+    operationId: createExperimentRun
+    tags:
+      - private
+    parameters:
+      - in: path
+        name: experiment_id
+        required: true
+        description: The ID of the experiment for which the run is being created
+        schema:
+          type: string
+    requestBody:
+      description: Details of the experiment run to be created
+      required: true
+      content:
+        application/json:
+          schema:
+            type: object
+            properties:
+              dataset_example_id:
+                type: string
+                description: The ID of the dataset example used in the experiment run
+              trace_id:
+                type: string
+                description: Optional trace ID for tracking
+              experiment_run_output:
+                type: object
+                description: The output of the experiment run
+              repetition_number:
+                type: integer
+                description: The repetition number of the experiment run
+              start_time:
+                type: string
+                format: date-time
+                description: The start time of the experiment run in ISO format
+              end_time:
+                type: string
+                format: date-time
+                description: The end time of the experiment run in ISO format
+              error:
+                type: string
+                description: Optional error message if the experiment run encountered an error
+                nullable: true
+            required:
+              - dataset_example_id
+              - output
+              - repetition_number
+              - start_time
+              - end_time
+    responses:
+      200:
+        description: Experiment run created successfully
+        content:
+          application/json:
+            schema:
+              type: object
+              properties:
+                data:
+                  type: object
+                  properties:
+                    id:
+                      type: string
+                      description: The ID of the created experiment run
+      404:
+        description: Experiment or DatasetExample not found
+    """
     experiment_gid = GlobalID.from_id(request.path_params["experiment_id"])
     try:
         experiment_id = from_global_id_with_expected_type(experiment_gid, "Experiment")
@@ -34,7 +101,7 @@ async def create_experiment_run(request: Request) -> Response:
     )
 
     trace_id = payload.get("trace_id", None)
-    output = payload["output"]
+    output = payload["experiment_run_output"]
     repetition_number = payload["repetition_number"]
     start_time = payload["start_time"]
     end_time = payload["end_time"]
@@ -58,6 +125,63 @@ async def create_experiment_run(request: Request) -> Response:
 
 
 async def list_experiment_runs(request: Request) -> Response:
+    """
+    summary: List all runs for a specific experiment
+    operationId: listExperimentRuns
+    tags:
+      - private
+    parameters:
+      - in: path
+        name: experiment_id
+        required: true
+        description: The ID of the experiment to list runs for
+        schema:
+          type: string
+    responses:
+      200:
+        description: Experiment runs retrieved successfully
+        content:
+          application/json:
+            schema:
+              type: object
+              properties:
+                data:
+                  type: array
+                  items:
+                    type: object
+                    properties:
+                      id:
+                        type: string
+                        description: The ID of the experiment run
+                      experiment_id:
+                        type: string
+                        description: The ID of the experiment
+                      dataset_example_id:
+                        type: string
+                        description: The ID of the dataset example
+                      repetition_number:
+                        type: integer
+                        description: The repetition number of the experiment run
+                      start_time:
+                        type: string
+                        format: date-time
+                        description: The start time of the experiment run in ISO format
+                      end_time:
+                        type: string
+                        format: date-time
+                        description: The end time of the experiment run in ISO format
+                      experiment_run_output:
+                        type: object
+                        description: The output of the experiment run
+                      error:
+                        type: string
+                        description: Error message if the experiment run encountered an error
+                      trace_id:
+                        type: string
+                        description: Optional trace ID for tracking
+      404:
+        description: Experiment not found
+    """
     experiment_gid = GlobalID.from_id(request.path_params["experiment_id"])
     try:
         experiment_id = from_global_id_with_expected_type(experiment_gid, "Experiment")
@@ -87,7 +211,7 @@ async def list_experiment_runs(request: Request) -> Response:
             experiment_id=str(experiment_gid),
             dataset_example_id=str(example_gid),
             repetition_number=exp_run.repetition_number,
-            output=ExperimentResult.from_dict(exp_run.output) if exp_run.output else None,
+            experiment_run_output=ExperimentRunOutput.from_dict(exp_run.output),
             error=exp_run.error,
             id=str(run_gid),
             trace_id=exp_run.trace_id,
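
A hedged sketch of the new request body for `createExperimentRun`; the route path, host, and GlobalIDs are assumptions, while the field names come from the OpenAPI docstring above:

    import httpx

    payload = {
        "dataset_example_id": "RGF0YXNldEV4YW1wbGU6MQ==",   # hypothetical GlobalID
        "experiment_run_output": {"task_output": "hello"},  # key was "output" before 4.6
        "repetition_number": 1,
        "start_time": "2024-07-01T00:00:00+00:00",
        "end_time": "2024-07-01T00:00:01+00:00",
        "error": None,
    }
    resp = httpx.post(
        "http://localhost:6006/v1/experiments/RXhwZXJpbWVudDox/runs",  # assumed route
        json=payload,
    )
    resp.raise_for_status()
    print(resp.json()["data"]["id"])
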
phoenix/server/api/routers/v1/experiments.py CHANGED
@@ -25,6 +25,80 @@ def _generate_experiment_name(dataset_name: str) -> str:
 
 
 async def create_experiment(request: Request) -> Response:
+    """
+    summary: Create an experiment using a dataset
+    operationId: createExperiment
+    tags:
+      - private
+    parameters:
+      - in: path
+        name: dataset_id
+        required: true
+        description: The ID of the dataset to create an experiment for
+        schema:
+          type: string
+    requestBody:
+      description: Details of the experiment to be created
+      required: true
+      content:
+        application/json:
+          schema:
+            type: object
+            properties:
+              repetitions:
+                type: integer
+                description: Number of times the experiment should be repeated for each example
+                default: 1
+              metadata:
+                type: object
+                description: Metadata for the experiment
+                additionalProperties:
+                  type: string
+              version_id:
+                type: string
+                description: ID of the dataset version to use
+    responses:
+      200:
+        description: Experiment retrieved successfully
+        content:
+          application/json:
+            schema:
+              type: object
+              properties:
+                data:
+                  type: object
+                  properties:
+                    id:
+                      type: string
+                      description: The ID of the experiment
+                    dataset_id:
+                      type: string
+                      description: The ID of the dataset associated with the experiment
+                    dataset_version_id:
+                      type: string
+                      description: The ID of the dataset version associated with the experiment
+                    repetitions:
+                      type: integer
+                      description: Number of times the experiment is repeated
+                    metadata:
+                      type: object
+                      description: Metadata of the experiment
+                      additionalProperties:
+                        type: string
+                    project_name:
+                      type: string
+                      description: The name of the project associated with the experiment
+                    created_at:
+                      type: string
+                      format: date-time
+                      description: The creation timestamp of the experiment
+                    updated_at:
+                      type: string
+                      format: date-time
+                      description: The last update timestamp of the experiment
+      404:
+        description: Dataset or DatasetVersion not found
+    """
     dataset_globalid = GlobalID.from_id(request.path_params["dataset_id"])
     try:
         dataset_id = from_global_id_with_expected_type(dataset_globalid, "Dataset")
@@ -139,6 +213,60 @@ async def create_experiment(request: Request) -> Response:
 
 
 async def read_experiment(request: Request) -> Response:
+    """
+    summary: Get details of a specific experiment
+    operationId: getExperiment
+    tags:
+      - private
+    parameters:
+      - in: path
+        name: experiment_id
+        required: true
+        description: The ID of the experiment to retrieve
+        schema:
+          type: string
+    responses:
+      200:
+        description: Experiment retrieved successfully
+        content:
+          application/json:
+            schema:
+              type: object
+              properties:
+                data:
+                  type: object
+                  properties:
+                    id:
+                      type: string
+                      description: The ID of the experiment
+                    dataset_id:
+                      type: string
+                      description: The ID of the dataset associated with the experiment
+                    dataset_version_id:
+                      type: string
+                      description: The ID of the dataset version associated with the experiment
+                    repetitions:
+                      type: integer
+                      description: Number of times the experiment is repeated
+                    metadata:
+                      type: object
+                      description: Metadata of the experiment
+                      additionalProperties:
+                        type: string
+                    project_name:
+                      type: string
+                      description: The name of the project associated with the experiment
+                    created_at:
+                      type: string
+                      format: date-time
+                      description: The creation timestamp of the experiment
+                    updated_at:
+                      type: string
+                      format: date-time
+                      description: The last update timestamp of the experiment
+      404:
+        description: Experiment not found
+    """
     experiment_globalid = GlobalID.from_id(request.path_params["experiment_id"])
     try:
         experiment_id = from_global_id_with_expected_type(experiment_globalid, "Experiment")
phoenix/server/api/types/ExperimentRun.py CHANGED
@@ -84,7 +84,7 @@ def to_gql_experiment_run(run: models.ExperimentRun) -> ExperimentRun:
         trace_id=trace_id
         if (trace := run.trace) and (trace_id := trace.trace_id) is not None
         else None,
-        output=run.output.get("result"),
+        output=run.output.get("task_output"),
         start_time=run.start_time,
         end_time=run.end_time,
         error=run.error,
phoenix/session/client.py CHANGED
@@ -5,7 +5,7 @@ import re
 import weakref
 from collections import Counter
 from datetime import datetime
-from io import BytesIO, StringIO
+from io import BytesIO
 from pathlib import Path
 from typing import (
     Any,
@@ -406,35 +406,6 @@ class Client(TraceDataExtractor):
         df["created_at"] = pd.to_datetime(df.created_at)
         return df
 
-    def download_dataset_examples(
-        self,
-        dataset_id: str,
-        /,
-        *,
-        dataset_version_id: Optional[str] = None,
-    ) -> pd.DataFrame:
-        """
-        Download dataset examples as pandas DataFrame.
-
-        Args:
-            dataset_id (str): dataset ID
-            dataset_version_id (Optional[str]): dataset version ID, if omitted,
-                the latest version is returned.
-
-        Returns:
-            pandas DataFrame
-        """
-        url = f"v1/datasets/{dataset_id}/csv"
-        response = httpx.get(
-            url=urljoin(self._base_url, url),
-            params={"version_id": dataset_version_id} if dataset_version_id else {},
-        )
-        response.raise_for_status()
-        return pd.read_csv(
-            StringIO(response.content.decode()),
-            index_col="example_id",
-        )
-
     def upload_dataset(
         self,
         *,
@@ -808,7 +779,7 @@ def _prepare_pyarrow(
     return "pandas", file, "application/x-pandas-pyarrow", {}
 
 
-_response_header = re.compile(r"(?i)(response|answer)s*$")
+_response_header = re.compile(r"(?i)(response|answer|output)s*$")
 
 
 def _infer_keys(
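
The widened `_response_header` pattern means dataset columns named like "output" are now inferred as response keys alongside "response" and "answer" (the trailing `s*` tolerates plural forms). A quick check:

    import re

    _response_header = re.compile(r"(?i)(response|answer|output)s*$")

    for header in ("Response", "answers", "model_output", "outputs", "question"):
        print(header, bool(_response_header.search(header)))
    # Response True, answers True, model_output True, outputs True, question False
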
phoenix/version.py CHANGED
@@ -1 +1 @@
-__version__ = "4.4.4rc6"
+__version__ = "4.6.1"