arize-phoenix 4.4.4rc6__py3-none-any.whl → 4.6.1__py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.

Potentially problematic release: this version of arize-phoenix might be problematic.

METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: arize-phoenix
-Version: 4.4.4rc6
+Version: 4.6.1
 Summary: AI Observability and Evaluation
 Project-URL: Documentation, https://docs.arize.com/phoenix/
 Project-URL: Issues, https://github.com/Arize-ai/phoenix/issues
@@ -41,6 +41,7 @@ Requires-Dist: protobuf<6.0,>=3.20
 Requires-Dist: psutil
 Requires-Dist: pyarrow
 Requires-Dist: python-multipart
+Requires-Dist: pyyaml
 Requires-Dist: scikit-learn
 Requires-Dist: scipy
 Requires-Dist: sqlalchemy[asyncio]<3,>=2.0.4
@@ -94,9 +95,10 @@ Requires-Dist: types-tabulate; extra == 'dev'
 Provides-Extra: evals
 Provides-Extra: experimental
 Provides-Extra: llama-index
-Requires-Dist: llama-index-embeddings-openai; extra == 'llama-index'
-Requires-Dist: llama-index-llms-openai; extra == 'llama-index'
-Requires-Dist: llama-index-readers-file; extra == 'llama-index'
+Requires-Dist: llama-index-agent-openai==0.2.7; extra == 'llama-index'
+Requires-Dist: llama-index-embeddings-openai==0.1.10; extra == 'llama-index'
+Requires-Dist: llama-index-llms-openai==0.1.24; extra == 'llama-index'
+Requires-Dist: llama-index-readers-file==0.1.25; extra == 'llama-index'
 Requires-Dist: llama-index==0.10.51; extra == 'llama-index'
 Provides-Extra: pg
 Requires-Dist: asyncpg; extra == 'pg'
RECORD CHANGED
@@ -5,7 +5,7 @@ phoenix/exceptions.py,sha256=n2L2KKuecrdflB9MsCdAYCiSEvGJptIsfRkXMoJle7A,169
 phoenix/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
 phoenix/services.py,sha256=aTxhcOA1pZHB6U-B3TEcp6fqDF5oT0xCUvEUNMZVTUQ,5175
 phoenix/settings.py,sha256=cO-qgis_S27nHirTobYI9hHPfZH18R--WMmxNdsVUwc,273
-phoenix/version.py,sha256=rZ0Z9PgUs79kMn4HpCH3vAEVOqqPCzzD7Xz8N5sa7qI,25
+phoenix/version.py,sha256=49swO1xv7jkVATRWLWBzrlaZyF15JMuWMQ7j2xkWsTY,22
 phoenix/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 phoenix/core/embedding_dimension.py,sha256=zKGbcvwOXgLf-yrJBpQyKtd-LEOPRKHnUToyAU8Owis,87
 phoenix/core/model.py,sha256=km_a--PBHOuA337ClRw9xqhOHhrUT6Rl9pz_zV0JYkQ,4843
@@ -18,7 +18,7 @@ phoenix/db/bulk_inserter.py,sha256=zbZGWZFDybKaGLGzpxgLwxAS5sC0_wXcvM0be4kUhh8,1
 phoenix/db/engines.py,sha256=vLWaZlToMtDI7rJDxSidYkfOoojamxaZxaz8ND3zTus,4770
 phoenix/db/helpers.py,sha256=L2_jP1iIWpUREhKLYYb4_vf_6v_BiU1E73Z2PczGm6s,1589
 phoenix/db/migrate.py,sha256=MuhtNWnR24riROvarvKfbRb4_D5xuQi6P760vBUKl1E,2270
-phoenix/db/models.py,sha256=zFtdhVuQFOvquyKsto62aqAVaTRUlq9gxU0j1M1yLdg,20408
+phoenix/db/models.py,sha256=7DBWbxY3cx3ve2P1I0kkDKXzlt04zEFJuRPJWsVpH-I,20422
 phoenix/db/insertion/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 phoenix/db/insertion/dataset.py,sha256=_vxy5e6W5jEuvO2fMKbbNCn9JvHkwI4LRKk_10eKFVg,7171
 phoenix/db/insertion/evaluation.py,sha256=HoUncZN9ZlIr1QO0uA37SbWhrjmwQVYVJlgFX2VefY8,7211
@@ -31,15 +31,15 @@ phoenix/db/migrations/types.py,sha256=Frq1AKSyBKQQ0FLzON-EmgTqE4kNkOpHMsbWnI-WgC
 phoenix/db/migrations/versions/10460e46d750_datasets.py,sha256=l69yZfScFrjfZZpY0gnqwhsDUEctLeo02qMgA_aOGDg,8155
 phoenix/db/migrations/versions/cf03bd6bae1d_init.py,sha256=CbWT3ZTR0CZqeT3zWLoTWhboFmnOy3Ju1z6Ztpq8WIM,8122
 phoenix/experiments/__init__.py,sha256=6JGwgUd7xCbGpuHqYZlsmErmYvVgv7N_j43bn3dUqsk,123
-phoenix/experiments/functions.py,sha256=w0A6BK80avoupxd3sPJZ_btftV1pXrkbZj4omR_H214,24723
+phoenix/experiments/functions.py,sha256=t0c4lCrK1wTjMlkXAXo1iLF0AYNneevzPur6gof_q8s,31643
 phoenix/experiments/tracing.py,sha256=wVpt8Ie9WNPoi1djJdcrkwCokHdTO0bicXViLg3O-1Y,2831
-phoenix/experiments/types.py,sha256=tj7DxfsU_nQP5bNe_h6p4KvRjkXKaaB3FeaIerAi_iA,22790
-phoenix/experiments/utils.py,sha256=ZZajvIrZTURhOX5Nx4nyogJEbI18sKCHYiYwOxz2vYU,340
+phoenix/experiments/types.py,sha256=HQ9k7dUTlOLZl0iGtZOnToUtZBYGos6afwvO44subAM,24035
+phoenix/experiments/utils.py,sha256=wLu5Kvt1b4a8rGPRWq5G8RQ9XSiV8fCIVm51zWBI3-g,758
 phoenix/experiments/evaluators/__init__.py,sha256=j63fi3fa3U7-itVPHa82GowhjQRU-wO6yhO34u_lhsA,714
-phoenix/experiments/evaluators/base.py,sha256=uhO4R06YWBbTxdpvXLldANnTxTA5r2h_Ktj-ZMLH57c,5305
+phoenix/experiments/evaluators/base.py,sha256=ani0F2TN7DMN0KLhV89LIr9-W4g-ccEl2YQJgfp44Js,5325
 phoenix/experiments/evaluators/code_evaluators.py,sha256=0qIKQS14Knze50ziJEPVEnNeV3QIs4g1IXtCmaWZu7o,3923
 phoenix/experiments/evaluators/llm_evaluators.py,sha256=EFce6LKZwUZDBa5ZozvcdqeZpdWM6n6bmq7_oIzM2Nw,9211
-phoenix/experiments/evaluators/utils.py,sha256=o84UTWN7fzjCGZDTS-KpGZ2VBrk2iSuO3X2LoC7pr3Y,6966
+phoenix/experiments/evaluators/utils.py,sha256=SroMoxmPZIFCi2MbEOvXlBAFJbEZY2IWgQvNFp3JP3A,6978
 phoenix/inferences/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 phoenix/inferences/errors.py,sha256=cGp9vxnw4SewFoWBV3ZGMkhE0Kh73lPIv3Ppz_H_RoA,8261
 phoenix/inferences/fixtures.py,sha256=FC2eRL4dpobKQHYOilFtDexUWFkMZ_w6jun_4WkbMk0,20792
@@ -134,9 +134,9 @@ phoenix/server/api/routers/v1/__init__.py,sha256=vvdpUa2LJPWEg8HbvDm_ANkBAwubPIF
 phoenix/server/api/routers/v1/dataset_examples.py,sha256=XfqOvDKF1oxb0pkeYfBycwwGt3LnSyyGdMLKC5VKoGQ,6690
 phoenix/server/api/routers/v1/datasets.py,sha256=f2gLG-geu-_wtEw4mKSzNWK2cFb5TYOyRL3tQ7Fl7Es,31544
 phoenix/server/api/routers/v1/evaluations.py,sha256=8g6P_e2BweV3RDU0esFmpkb0L5fCwonQPXiJ0y6HLwg,9126
-phoenix/server/api/routers/v1/experiment_evaluations.py,sha256=HeyV3PXS1BxQpzNOUBpQlX_0JH_jbjZjTxrqy2ujwJQ,2746
-phoenix/server/api/routers/v1/experiment_runs.py,sha256=_c7qmPIja_gpvoVaf_t7KtNc9Zz-0m9da9MS-EcbPBo,3918
-phoenix/server/api/routers/v1/experiments.py,sha256=ntb0lRV2h90mFepWiZfQ1MIAJhOaK9tkWzTejmpwed0,7243
+phoenix/server/api/routers/v1/experiment_evaluations.py,sha256=H_psVyuGUQImo0oxdEAKAMQ-oyVwkVIq5yaMHzHIiPc,5455
+phoenix/server/api/routers/v1/experiment_runs.py,sha256=u4Kgz1i5AffmCF2LHtC9Oo1hlGscZ3Dm8JlTRhM55yU,8307
+phoenix/server/api/routers/v1/experiments.py,sha256=cG-LyIGRdB1jVTL42Xi2__nsXibVe9Up7m3hFiTIYYY,11886
 phoenix/server/api/routers/v1/spans.py,sha256=PFeS3ayKj4cUle0CH-f-CpM1fRi-JicEG7BEtkANzAo,4074
 phoenix/server/api/routers/v1/traces.py,sha256=dYEf5pThenAQCgfQljHdrnwd4tC_tAXm6Kvk6GphPYs,2774
 phoenix/server/api/types/AnnotatorKind.py,sha256=UmYb2KG0JfxdX0mW1qrXrUoIgjMOncRJr1i8mJki1sE,141
@@ -165,7 +165,7 @@ phoenix/server/api/types/ExampleRevisionInterface.py,sha256=gV3Gt9-3Oi5wjaVtepC6
 phoenix/server/api/types/Experiment.py,sha256=ELYdYFKwgBllxx3cZ_X0XicHjLtshZl0bFqqJdVGXRQ,5177
 phoenix/server/api/types/ExperimentAnnotationSummary.py,sha256=Uk3JtxIrsMoZT5tqc4nJdUOM3XegVzjUyoV3pkjNotE,256
 phoenix/server/api/types/ExperimentComparison.py,sha256=0sFz6MoBDw39dds0qVyaqhVs9qqO5rkG1FMSjmfBeCc,441
-phoenix/server/api/types/ExperimentRun.py,sha256=8jUIi3ApVCqQHwnYe59CYhrmh5iZ6-QmlH5WpF7UWtM,2990
+phoenix/server/api/types/ExperimentRun.py,sha256=f_3qLeeMQpzjhuI1zOnXDXQlJyDied-7vBPGBPEOEAs,2995
 phoenix/server/api/types/ExperimentRunAnnotation.py,sha256=zGstMbS5OxNikEhD8VouY7Ls7YbxKm-0EmqvGeY3-DI,1773
 phoenix/server/api/types/ExportedFile.py,sha256=e3GTn7B5LgsTbqiwjhMCQH7VsiqXitrBO4aCMS1lHsg,163
 phoenix/server/api/types/Functionality.py,sha256=tzV9xdhB8zqfsjWxP66NDC7EZsplYkYO7jRbLWJIeeg,382
@@ -207,7 +207,7 @@ phoenix/server/static/modernizr.js,sha256=mvK-XtkNqjOral-QvzoqsyOMECXIMu5BQwSVN_
 phoenix/server/templates/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 phoenix/server/templates/index.html,sha256=S4z7qSoNSwnKFAH9r96AR-YJEyoKMd-VMWVlJ_IdzME,2039
 phoenix/session/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-phoenix/session/client.py,sha256=5mnWVqMFbC8NYbX4m2oRla1VvlmrgabD1oT2UdwDRJ8,33201
+phoenix/session/client.py,sha256=43MmopBHxPq2MprbSXixAzQyGr0VRhHEYZZ6WvITq1I,32343
 phoenix/session/data_extractor.py,sha256=dwhiDu-ISaXr8UI9I-CszZhB5BlUNmdDopjFZvMIXMw,2101
 phoenix/session/evaluation.py,sha256=aKeV8UVOyq3b7CYOwt3cWuLz0xzvMjX7vlEPILJ_fcs,5311
 phoenix/session/session.py,sha256=rjIuSSK2gAYIUPQTJc4E2ebew5o6I070FWRoFn4W3EI,26620
@@ -247,8 +247,8 @@ phoenix/utilities/logging.py,sha256=lDXd6EGaamBNcQxL4vP1au9-i_SXe0OraUDiJOcszSw,
 phoenix/utilities/project.py,sha256=qWsvKnG1oKhOFUowXf9qiOL2ia7jaFe_ijFFHEt8GJo,431
 phoenix/utilities/re.py,sha256=PDve_OLjRTM8yQQJHC8-n3HdIONi7aNils3ZKRZ5uBM,2045
 phoenix/utilities/span_store.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-arize_phoenix-4.4.4rc6.dist-info/METADATA,sha256=gyc5KyS4aFqefmGcezl1eC_8lCZ5DF0iHdSDh0V41f8,11337
-arize_phoenix-4.4.4rc6.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
-arize_phoenix-4.4.4rc6.dist-info/licenses/IP_NOTICE,sha256=JBqyyCYYxGDfzQ0TtsQgjts41IJoa-hiwDrBjCb9gHM,469
-arize_phoenix-4.4.4rc6.dist-info/licenses/LICENSE,sha256=HFkW9REuMOkvKRACuwLPT0hRydHb3zNg-fdFt94td18,3794
-arize_phoenix-4.4.4rc6.dist-info/RECORD,,
+arize_phoenix-4.6.1.dist-info/METADATA,sha256=OwKH-IBGxd43sS76T-D1Ix1LioGlWiw7WhQx13UyY-k,11451
+arize_phoenix-4.6.1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+arize_phoenix-4.6.1.dist-info/licenses/IP_NOTICE,sha256=JBqyyCYYxGDfzQ0TtsQgjts41IJoa-hiwDrBjCb9gHM,469
+arize_phoenix-4.6.1.dist-info/licenses/LICENSE,sha256=HFkW9REuMOkvKRACuwLPT0hRydHb3zNg-fdFt94td18,3794
+arize_phoenix-4.6.1.dist-info/RECORD,,
phoenix/db/models.py CHANGED
@@ -91,8 +91,8 @@ class UtcTimeStamp(TypeDecorator[datetime]):
         return normalize_datetime(value, timezone.utc)
 
 
-class ExperimentResult(TypedDict, total=False):
-    result: Any
+class ExperimentRunOutput(TypedDict, total=False):
+    task_output: Any
 
 
 class Base(DeclarativeBase):
@@ -110,7 +110,7 @@ class Base(DeclarativeBase):
     type_annotation_map = {
         Dict[str, Any]: JsonDict,
         List[Dict[str, Any]]: JsonList,
-        ExperimentResult: JsonDict,
+        ExperimentRunOutput: JsonDict,
     }
 
 
@@ -561,7 +561,7 @@ class ExperimentRun(Base):
     )
     repetition_number: Mapped[int]
     trace_id: Mapped[Optional[str]]
-    output: Mapped[ExperimentResult]
+    output: Mapped[ExperimentRunOutput]
    start_time: Mapped[datetime] = mapped_column(UtcTimeStamp)
    end_time: Mapped[datetime] = mapped_column(UtcTimeStamp)
    prompt_token_count: Mapped[Optional[int]]
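
In practice the rename only changes the key stored in the run's `output` JSON column: `{"result": ...}` becomes `{"task_output": ...}`. A minimal sketch of the new shape (sample values are illustrative):

    from typing import Any
    from typing_extensions import TypedDict

    class ExperimentRunOutput(TypedDict, total=False):
        task_output: Any  # was `result` on the old ExperimentResult TypedDict

    # What a run's output column now deserializes to:
    stored: ExperimentRunOutput = {"task_output": {"answer": "4"}}
    assert stored.get("task_output") == {"answer": "4"}  # old reads used .get("result")
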
phoenix/experiments/evaluators/base.py CHANGED
@@ -6,7 +6,7 @@ from typing import Any, Awaitable, Callable, Optional, Union
 
 from typing_extensions import TypeAlias
 
-from phoenix.experiments.evaluators.utils import validate_signature
+from phoenix.experiments.evaluators.utils import validate_evaluator_signature
 from phoenix.experiments.types import (
     AnnotatorKind,
     EvaluationResult,
@@ -108,7 +108,7 @@ class Evaluator(ABC):
 
 def _validate_sig(fn: Callable[..., Any], fn_name: str) -> None:
     sig = inspect.signature(fn)
-    validate_signature(sig)
+    validate_evaluator_signature(sig)
     for param in sig.parameters.values():
         if param.kind is inspect.Parameter.VAR_KEYWORD:
             return
phoenix/experiments/evaluators/utils.py CHANGED
@@ -8,6 +8,7 @@ from phoenix.experiments.types import (
     EvaluationResult,
     JSONSerializable,
 )
+from phoenix.experiments.utils import get_func_name
 
 if TYPE_CHECKING:
     from phoenix.experiments.evaluators.base import Evaluator
@@ -25,11 +26,11 @@ def unwrap_json(obj: JSONSerializable) -> JSONSerializable:
     return obj
 
 
-def validate_signature(sig: inspect.Signature) -> None:
+def validate_evaluator_signature(sig: inspect.Signature) -> None:
     # Check that the wrapped function has a valid signature for use as an evaluator
     # If it does not, raise an error to exit early before running evaluations
     params = sig.parameters
-    valid_named_params = {"input", "output", "expected", "metadata"}
+    valid_named_params = {"input", "output", "expected", "reference", "metadata"}
     if len(params) == 0:
         raise ValueError("Evaluation function must have at least one parameter.")
     if len(params) > 1:
@@ -49,11 +50,12 @@ def validate_signature(sig: inspect.Signature) -> None:
         )
 
 
-def _bind_signature(sig: inspect.Signature, **kwargs: Any) -> inspect.BoundArguments:
+def _bind_evaluator_signature(sig: inspect.Signature, **kwargs: Any) -> inspect.BoundArguments:
     parameter_mapping = {
         "input": kwargs.get("input"),
         "output": kwargs.get("output"),
         "expected": kwargs.get("expected"),
+        "reference": kwargs.get("reference"),  # `reference` is an alias for `expected`
         "metadata": kwargs.get("metadata"),
     }
     params = sig.parameters
@@ -82,16 +84,11 @@ def create_evaluator(
     def wrapper(func: Callable[..., Any]) -> "Evaluator":
         nonlocal name
         if not name:
-            if hasattr(func, "__self__"):
-                name = func.__self__.__class__.__name__
-            elif hasattr(func, "__name__"):
-                name = func.__name__
-            else:
-                name = str(func)
+            name = get_func_name(func)
         assert name is not None
 
         wrapped_signature = inspect.signature(func)
-        validate_signature(wrapped_signature)
+        validate_evaluator_signature(wrapped_signature)
 
         if inspect.iscoroutinefunction(func):
             return _wrap_coroutine_evaluation_function(name, kind, wrapped_signature, scorer)(func)
@@ -120,7 +117,7 @@ def _wrap_coroutine_evaluation_function(
             return await func(*args, **kwargs)
 
         async def async_evaluate(self, **kwargs: Any) -> EvaluationResult:
-            bound_signature = _bind_signature(sig, **kwargs)
+            bound_signature = _bind_evaluator_signature(sig, **kwargs)
             result = await func(*bound_signature.args, **bound_signature.kwargs)
             return convert_to_score(result)
 
@@ -148,7 +145,7 @@ def _wrap_sync_evaluation_function(
             return func(*args, **kwargs)
 
         def evaluate(self, **kwargs: Any) -> EvaluationResult:
-            bound_signature = _bind_signature(sig, **kwargs)
+            bound_signature = _bind_evaluator_signature(sig, **kwargs)
             result = func(*bound_signature.args, **bound_signature.kwargs)
             return convert_to_score(result)
phoenix/experiments/functions.py CHANGED
@@ -1,5 +1,7 @@
 import functools
+import inspect
 import json
+import traceback
 from binascii import hexlify
 from contextlib import ExitStack
 from copy import deepcopy
@@ -10,6 +12,7 @@ from typing import (
     Any,
     Awaitable,
     Dict,
+    Literal,
     Mapping,
     Optional,
     Sequence,
@@ -58,8 +61,8 @@ from phoenix.experiments.types import (
     Experiment,
     ExperimentEvaluationRun,
     ExperimentParameters,
-    ExperimentResult,
     ExperimentRun,
+    ExperimentRunOutput,
     ExperimentTask,
     RanExperiment,
     TaskSummary,
@@ -67,7 +70,7 @@ from phoenix.experiments.types import (
     _asdict,
     _replace,
 )
-from phoenix.experiments.utils import get_dataset_experiments_url, get_experiment_url
+from phoenix.experiments.utils import get_dataset_experiments_url, get_experiment_url, get_func_name
 from phoenix.trace.attributes import flatten
 from phoenix.utilities.json import jsonify
 
@@ -105,6 +108,61 @@ def run_experiment(
     dry_run: Union[bool, int] = False,
     print_summary: bool = True,
 ) -> RanExperiment:
+    """
+    Runs an experiment using a given dataset of examples.
+
+    An experiment is a user-defined task that runs on each example in a dataset. The results from
+    each experiment can be evaluated using any number of evaluators to measure the behavior of the
+    task. The experiment and evaluation results are stored in the Phoenix database for comparison
+    and analysis.
+
+    A `task` is either a synchronous or asynchronous function that returns a JSON serializable
+    output. If the `task` is a function of one argument then that argument will be bound to the
+    `input` field of the dataset example. Alternatively, the `task` can be a function of any
+    combination of specific argument names that will be bound to special values:
+        `input`: The input field of the dataset example
+        `expected`: The expected or reference output of the dataset example
+        `reference`: An alias for `expected`
+        `metadata`: Metadata associated with the dataset example
+        `example`: The dataset `Example` object with all associated fields
+
+    An `evaluator` is either a synchronous or asynchronous function that returns either a boolean
+    or numeric "score". If the `evaluator` is a function of one argument then that argument will be
+    bound to the `output` of the task. Alternatively, the `evaluator` can be a function of any
+    combination of specific argument names that will be bound to special values:
+        `input`: The input field of the dataset example
+        `output`: The output of the task
+        `expected`: The expected or reference output of the dataset example
+        `reference`: An alias for `expected`
+        `metadata`: Metadata associated with the dataset example
+
+    Phoenix also provides pre-built evaluators in the `phoenix.experiments.evaluators` module.
+
+    Args:
+        dataset (Dataset): The dataset on which to run the experiment.
+        task (ExperimentTask): The task to run on each example in the dataset.
+        evaluators (Optional[Evaluators]): A single evaluator or sequence of evaluators used to
+            evaluate the results of the experiment. Defaults to None.
+        experiment_name (Optional[str]): The name of the experiment. Defaults to None.
+        experiment_description (Optional[str]): A description of the experiment. Defaults to None.
+        experiment_metadata (Optional[Mapping[str, Any]]): Metadata to associate with the
+            experiment. Defaults to None.
+        rate_limit_errors (Optional[BaseException | Sequence[BaseException]]): An exception or
+            sequence of exceptions to adaptively throttle on. Defaults to None.
+        dry_run (bool | int): Run the experiment in dry-run mode. When set, experiment results will
+            not be recorded in Phoenix. If True, the experiment will run on a random dataset
+            example. If an integer, the experiment will run on a random sample of the dataset
+            examples of the given size. Defaults to False.
+        print_summary (bool): Whether to print a summary of the experiment and evaluation results.
+            Defaults to True.
+
+    Returns:
+        RanExperiment: The results of the experiment and evaluation. Additional evaluations can be
+            added to the experiment using the `evaluate_experiment` function.
+    """
+    task_signature = inspect.signature(task)
+    _validate_task_signature(task_signature)
+
     if not dataset.examples:
         raise ValueError(f"Dataset has no examples: {dataset.id=}, {dataset.version_id=}")
     # Add this to the params once supported in the UI
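
A hedged usage sketch of the signature binding the docstring describes; the dataset name, task body, and evaluator are illustrative:

    import phoenix as px
    from phoenix.experiments import run_experiment

    def task(input, reference):
        # bound by name: `input` is example.input, `reference` aliases `expected`
        return f"predicted: {reference['answer']}"

    def matches(output, expected):
        return str(expected["answer"]) in output

    dataset = px.Client().get_dataset(name="my-dataset")  # assumed dataset name
    experiment = run_experiment(
        dataset,
        task,
        evaluators=[matches],
        experiment_name="signature-binding-demo",
        dry_run=3,  # sample three examples; nothing is recorded in Phoenix
    )
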
@@ -146,7 +204,7 @@ def run_experiment(
     )
 
     tracer, resource = _get_tracer(experiment.project_name)
-    root_span_name = f"Task: {_get_task_name(task)}"
+    root_span_name = f"Task: {get_func_name(task)}"
     root_span_kind = CHAIN
 
     print("🧪 Experiment started.")
@@ -183,25 +241,37 @@ def run_experiment(
                 # Do not use keyword arguments, which can fail at runtime
                 # even when function obeys protocol, because keyword arguments
                 # are implementation details.
-                _output = task(example)
+                bound_task_args = _bind_task_signature(task_signature, example)
+                _output = task(*bound_task_args.args, **bound_task_args.kwargs)
                 if isinstance(_output, Awaitable):
-                    raise RuntimeError("Task is async but running in sync context")
+                    sync_error_message = (
+                        "Task is async and cannot be run within an existing event loop. "
+                        "Consider the following options:\n\n"
+                        "1. Pass in a synchronous task callable.\n"
+                        "2. Use `nest_asyncio.apply()` to allow nesting event loops."
+                    )
+                    raise RuntimeError(sync_error_message)
                 else:
                     output = _output
             except BaseException as exc:
                 span.record_exception(exc)
                 status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
                 error = exc
+                _print_experiment_error(
+                    exc,
+                    example_id=example.id,
+                    repetition_number=repetition_number,
+                    kind="task",
+                )
             output = jsonify(output)
             span.set_attribute(INPUT_VALUE, json.dumps(example.input, ensure_ascii=False))
             span.set_attribute(INPUT_MIME_TYPE, JSON.value)
-            if result := ExperimentResult(result=output) if output is not None else None:
+            if output is not None:
                 if isinstance(output, str):
                     span.set_attribute(OUTPUT_VALUE, output)
                 else:
                     span.set_attribute(OUTPUT_VALUE, json.dumps(output, ensure_ascii=False))
                     span.set_attribute(OUTPUT_MIME_TYPE, JSON.value)
-            span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
             span.set_attribute(SpanAttributes.OPENINFERENCE_SPAN_KIND, root_span_kind)
             span.set_status(status)
 
@@ -214,7 +284,7 @@ def run_experiment(
             experiment_id=experiment.id,
             dataset_example_id=example.id,
             repetition_number=repetition_number,
-            output=result,
+            experiment_run_output=ExperimentRunOutput(task_output=output),
             error=repr(error) if error else None,
             trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
         )
@@ -238,7 +308,8 @@ def run_experiment(
                 # Do not use keyword arguments, which can fail at runtime
                 # even when function obeys protocol, because keyword arguments
                 # are implementation details.
-                _output = task(example)
+                bound_task_args = _bind_task_signature(task_signature, example)
+                _output = task(*bound_task_args.args, **bound_task_args.kwargs)
                 if isinstance(_output, Awaitable):
                     output = await _output
                 else:
@@ -247,16 +318,21 @@ def run_experiment(
                 span.record_exception(exc)
                 status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
                 error = exc
+                _print_experiment_error(
+                    exc,
+                    example_id=example.id,
+                    repetition_number=repetition_number,
+                    kind="task",
+                )
             output = jsonify(output)
             span.set_attribute(INPUT_VALUE, json.dumps(example.input, ensure_ascii=False))
             span.set_attribute(INPUT_MIME_TYPE, JSON.value)
-            if result := ExperimentResult(result=output) if output is not None else None:
+            if output is not None:
                 if isinstance(output, str):
                     span.set_attribute(OUTPUT_VALUE, output)
                 else:
                     span.set_attribute(OUTPUT_VALUE, json.dumps(output, ensure_ascii=False))
                     span.set_attribute(OUTPUT_MIME_TYPE, JSON.value)
-            span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
             span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
             span.set_status(status)
 
@@ -269,7 +345,7 @@ def run_experiment(
             experiment_id=experiment.id,
             dataset_example_id=example.id,
             repetition_number=repetition_number,
-            output=result,
+            experiment_run_output=ExperimentRunOutput(task_output=output),
             error=repr(error) if error else None,
             trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
         )
@@ -422,8 +498,9 @@ def evaluate_experiment(
             stack.enter_context(capture_spans(resource))
             try:
                 result = evaluator.evaluate(
-                    output=experiment_run.task_output,
+                    output=experiment_run.output,
                     expected=example.output,
+                    reference=example.output,
                     input=example.input,
                     metadata=example.metadata,
                 )
@@ -431,6 +508,12 @@ def evaluate_experiment(
                 span.record_exception(exc)
                 status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
                 error = exc
+                _print_experiment_error(
+                    exc,
+                    example_id=example.id,
+                    repetition_number=experiment_run.repetition_number,
+                    kind="evaluator",
+                )
             if result:
                 span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
             span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
@@ -467,8 +550,9 @@ def evaluate_experiment(
             stack.enter_context(capture_spans(resource))
             try:
                 result = await evaluator.async_evaluate(
-                    output=experiment_run.task_output,
+                    output=experiment_run.output,
                     expected=example.output,
+                    reference=example.output,
                     input=example.input,
                     metadata=example.metadata,
                 )
@@ -476,6 +560,12 @@ def evaluate_experiment(
                 span.record_exception(exc)
                 status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
                 error = exc
+                _print_experiment_error(
+                    exc,
+                    example_id=example.id,
+                    repetition_number=experiment_run.repetition_number,
+                    kind="evaluator",
+                )
             if result:
                 span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
             span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
@@ -584,20 +674,71 @@ def _decode_unix_nano(time_unix_nano: int) -> datetime:
     return datetime.fromtimestamp(time_unix_nano / 1e9, tz=timezone.utc)
 
 
-def _get_task_name(task: ExperimentTask) -> str:
-    """
-    Makes a best-effort attempt to get the name of the task.
-    """
+def _is_dry_run(obj: Any) -> bool:
+    return hasattr(obj, "id") and isinstance(obj.id, str) and obj.id.startswith(DRY_RUN)
 
-    if isinstance(task, functools.partial):
-        return task.func.__qualname__
-    if hasattr(task, "__qualname__"):
-        return task.__qualname__
-    return str(task)
 
+def _validate_task_signature(sig: inspect.Signature) -> None:
+    # Check that the function signature has a valid signature for use as a task
+    # If it does not, raise an error to exit early before running an experiment
+    params = sig.parameters
+    valid_named_params = {"input", "expected", "reference", "metadata", "example"}
+    if len(params) == 0:
+        raise ValueError("Task function must have at least one parameter.")
+    if len(params) > 1:
+        for not_found in set(params) - valid_named_params:
+            param = params[not_found]
+            if (
+                param.kind is inspect.Parameter.VAR_KEYWORD
+                or param.default is not inspect.Parameter.empty
+            ):
+                continue
+            raise ValueError(
+                (
+                    f"Invalid parameter names in task function: {', '.join(not_found)}. "
+                    "Parameters names for multi-argument functions must be "
+                    f"any of: {', '.join(valid_named_params)}."
+                )
+            )
 
-def _is_dry_run(obj: Any) -> bool:
-    return hasattr(obj, "id") and isinstance(obj.id, str) and obj.id.startswith(DRY_RUN)
+
+def _bind_task_signature(sig: inspect.Signature, example: Example) -> inspect.BoundArguments:
+    parameter_mapping = {
+        "input": example.input,
+        "expected": example.output,
+        "reference": example.output,  # Alias for "expected"
+        "metadata": example.metadata,
+        "example": example,
+    }
+    params = sig.parameters
+    if len(params) == 1:
+        parameter_name = next(iter(params))
+        if parameter_name in parameter_mapping:
+            return sig.bind(parameter_mapping[parameter_name])
+        else:
+            return sig.bind(parameter_mapping["input"])
+    return sig.bind_partial(
+        **{name: parameter_mapping[name] for name in set(parameter_mapping).intersection(params)}
+    )
+
+
+def _print_experiment_error(
+    error: BaseException,
+    /,
+    *,
+    example_id: str,
+    repetition_number: int,
+    kind: Literal["evaluator", "task"],
+) -> None:
+    """
+    Prints an experiment error.
+    """
+    display_error = RuntimeError(
+        f"{kind} failed for example id {repr(example_id)}, " f"repetition {repr(repetition_number)}"
+    )
+    display_error.__cause__ = error
+    formatted_exception = "".join(traceback.format_exception(display_error))  # type: ignore[arg-type, call-arg, unused-ignore]
+    print("\033[91m" + formatted_exception + "\033[0m")  # prints in red
 
 
 class _NoOpProcessor(trace_sdk.SpanProcessor):
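
The new helpers mirror the evaluator utilities: `_validate_task_signature` rejects unknown required parameter names up front, and `_bind_task_signature` binds a lone parameter to `example.input` whatever it is called. A small sketch of the binding rule using only the standard library (the example object is a stand-in; the real `Example` carries more fields):

    import inspect

    class FakeExample:  # stand-in with just the attributes the binder reads
        input = {"question": "2+2?"}
        output = {"answer": "4"}
        metadata = {"split": "test"}

    mapping = {
        "input": FakeExample.input,
        "expected": FakeExample.output,
        "reference": FakeExample.output,  # alias for "expected"
        "metadata": FakeExample.metadata,
        "example": FakeExample,
    }

    def task(prompt):  # single parameter: bound positionally to example.input
        return prompt

    sig = inspect.signature(task)
    print(sig.bind(mapping["input"]).args)  # ({'question': '2+2?'},)

    def task2(input, reference):  # multiple parameters: keyword-bound by name
        return input, reference

    sig2 = inspect.signature(task2)
    bound = sig2.bind_partial(**{k: mapping[k] for k in set(mapping) & set(sig2.parameters)})
    print(sorted(bound.arguments))  # ['input', 'reference']
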
phoenix/experiments/types.py CHANGED
@@ -103,9 +103,9 @@ class Example:
         identifiers = [f'{spaces}id="{self.id}",']
         contents = [
             spaces
-            + f"{k}="
+            + f"{_blue(key)}="
             + json.dumps(
-                _shorten(v),
+                _shorten(value),
                 ensure_ascii=False,
                 sort_keys=True,
                 indent=len(spaces),
@@ -113,8 +113,8 @@ class Example:
             .replace("\n", f"\n{spaces}")
             .replace(' "..."\n', " ...\n")
             + ","
-            for k in ("input", "output", "metadata")
-            if (v := getattr(self, k, None))
+            for key in ("input", "output", "metadata")
+            if (value := getattr(self, key, None))
         ]
         return "\n".join([f"{name}(", *identifiers, *contents, ")"])
 
@@ -199,17 +199,17 @@ class Experiment:
 
 
 @dataclass(frozen=True)
-class ExperimentResult:
-    result: TaskOutput
+class ExperimentRunOutput:
+    task_output: TaskOutput
 
     def __post_init__(self) -> None:
-        object.__setattr__(self, "result", _make_read_only(self.result))
+        object.__setattr__(self, "task_output", _make_read_only(self.task_output))
 
     @classmethod
-    def from_dict(cls, obj: Optional[Mapping[str, Any]]) -> Optional[ExperimentResult]:
+    def from_dict(cls, obj: Optional[Mapping[str, Any]]) -> ExperimentRunOutput:
         if not obj:
-            return None
-        return cls(result=obj["result"])
+            return cls(task_output=None)
+        return cls(task_output=obj["task_output"])
 
 
 @dataclass(frozen=True)
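
Note the behavioral change in `from_dict`: an empty payload used to produce `None`, whereas it now produces an `ExperimentRunOutput` whose `task_output` is `None`, so callers such as the `ExperimentRun.output` property below no longer need a null check. For instance:

    from phoenix.experiments.types import ExperimentRunOutput

    ExperimentRunOutput.from_dict(None)                   # ExperimentRunOutput(task_output=None)
    ExperimentRunOutput.from_dict({"task_output": "hi"})  # ExperimentRunOutput(task_output='hi')
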
@@ -219,14 +219,14 @@ class ExperimentRun:
     experiment_id: ExperimentId
     dataset_example_id: ExampleId
     repetition_number: RepetitionNumber
-    output: Optional[ExperimentResult] = None
+    experiment_run_output: ExperimentRunOutput
     error: Optional[str] = None
     id: ExperimentRunId = field(default_factory=_dry_run_id)
     trace_id: Optional[TraceId] = None
 
     @property
-    def task_output(self) -> Optional[TaskOutput]:
-        return deepcopy(self.output.result) if self.output else None
+    def output(self) -> Optional[TaskOutput]:
+        return deepcopy(self.experiment_run_output.task_output)
 
     @classmethod
     def from_dict(cls, obj: Mapping[str, Any]) -> ExperimentRun:
@@ -236,15 +236,15 @@ class ExperimentRun:
             experiment_id=obj["experiment_id"],
             dataset_example_id=obj["dataset_example_id"],
             repetition_number=obj.get("repetition_number") or 1,
-            output=ExperimentResult.from_dict(obj["output"]),
+            experiment_run_output=ExperimentRunOutput.from_dict(obj["experiment_run_output"]),
             error=obj.get("error"),
             id=obj["id"],
             trace_id=obj.get("trace_id"),
         )
 
     def __post_init__(self) -> None:
-        if bool(self.output) == bool(self.error):
-            ValueError("Must specify either result or error")
+        if bool(self.experiment_run_output) == bool(self.error):
+            ValueError("Must specify exactly one of experiment_run_output or error")
 
 
 @dataclass(frozen=True)
@@ -571,7 +571,7 @@ class RanExperiment(Experiment):
                 {
                     "run_id": run.id,
                     "error": run.error,
-                    "result": deepcopy(run.output.result) if run.output else None,
+                    "output": deepcopy(run.experiment_run_output.task_output),
                     "input": deepcopy((ex := self.dataset.examples[run.dataset_example_id]).input),
                     "expected": deepcopy(ex.output),
                     "metadata": deepcopy(ex.metadata),
@@ -688,6 +688,10 @@ class _ExperimentRunWithExample(ObjectProxy):  # type: ignore[misc]
     def expected(self) -> ExampleOutput:
         return deepcopy(self._self_example.output)
 
+    @property
+    def reference(self) -> ExampleOutput:
+        return deepcopy(self._self_example.output)
+
     @property
     def input(self) -> ExampleInput:
         return deepcopy(self._self_example.input)
@@ -703,20 +707,47 @@ class _ExperimentRunWithExample(ObjectProxy):  # type: ignore[misc]
             f'{spaces}id="{self.id}",',
             f'{spaces}example_id="{self.dataset_example_id}",',
         ]
-        contents = [
+        outputs = [
+            *([f'{spaces}error="{self.error}",'] if self.error else []),
+            *(
+                [
+                    f"{spaces}{_blue('output')}="
+                    + json.dumps(
+                        _shorten(self.output),
+                        ensure_ascii=False,
+                        sort_keys=True,
+                        indent=len(spaces),
+                    )
+                    .replace("\n", f"\n{spaces}")
+                    .replace(' "..."\n', " ...\n")
+                ]
+                if not self.error
+                else []
+            ),
+        ]
+        dicts = [
             spaces
-            + f"{k}="
-            + json.dumps(_shorten(v), ensure_ascii=False, sort_keys=True, indent=len(spaces))
+            + f"{_blue(alias)}={{"
+            + (f" # {comment}" if comment else "")
+            + json.dumps(
+                _shorten(value),
+                ensure_ascii=False,
+                sort_keys=True,
+                indent=len(spaces),
+            )[1:]
             .replace("\n", f"\n{spaces}")
             .replace(' "..."\n', " ...\n")
             + ","
-            for k, v in {
-                "error": self.error,
-                "output": self.task_output,
-                "expected": self.expected,
-                "input": self.input,
-                "metadata": self.metadata,
-            }.items()
-            if v
+            for alias, value, comment in (
+                ("expected", self.expected, f"alias for the example.{_blue('output')} dict"),
+                ("reference", self.reference, f"alias for the example.{_blue('output')} dict"),
+                ("input", self.input, f"alias for the example.{_blue('input')} dict"),
+                ("metadata", self.metadata, f"alias for the example.{_blue('metadata')} dict"),
+            )
+            if value
         ]
-        return "\n".join([f"{name}(", *identifiers, *contents, ")"])
+        return "\n".join([f"{name}(", *identifiers, *outputs, *dicts, ")"])
+
+
+def _blue(text: str) -> str:
+    return f"\033[1m\033[94m{text}\033[0m"
phoenix/experiments/utils.py CHANGED
@@ -1,3 +1,6 @@
+import functools
+from typing import Any, Callable
+
 from phoenix.config import get_web_base_url
 
 
@@ -7,3 +10,15 @@ def get_experiment_url(*, dataset_id: str, experiment_id: str) -> str:
 
 def get_dataset_experiments_url(*, dataset_id: str) -> str:
     return f"{get_web_base_url()}datasets/{dataset_id}/experiments"
+
+
+def get_func_name(fn: Callable[..., Any]) -> str:
+    """
+    Makes a best-effort attempt to get the name of the function.
+    """
+
+    if isinstance(fn, functools.partial):
+        return fn.func.__qualname__
+    if hasattr(fn, "__qualname__") and not fn.__qualname__.endswith("<lambda>"):
+        return fn.__qualname__.split(".<locals>.")[-1]
+    return str(fn)
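
Compared with the removed `_get_task_name`, the new helper also unwraps `.<locals>.` qualnames and falls back to `str(fn)` for lambdas. What the logic above implies for common callables:

    import functools
    from phoenix.experiments.utils import get_func_name

    def outer():
        def inner(x):
            return x
        return inner

    print(get_func_name(functools.partial(outer)))  # "outer" (partials are unwrapped)
    print(get_func_name(outer()))                   # "inner" (".<locals>." prefix stripped)
    print(get_func_name(lambda x: x))               # "<function <lambda> at 0x...>" via str(fn)
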
phoenix/server/api/routers/v1/experiment_evaluations.py CHANGED
@@ -12,6 +12,84 @@ from phoenix.server.api.types.node import from_global_id_with_expected_type
 
 
 async def upsert_experiment_evaluation(request: Request) -> Response:
+    """
+    summary: Create an evaluation for a specific experiment run
+    operationId: upsertExperimentEvaluation
+    tags:
+      - private
+    requestBody:
+      description: Details of the experiment evaluation to be upserted
+      required: true
+      content:
+        application/json:
+          schema:
+            type: object
+            properties:
+              experiment_run_id:
+                type: string
+                description: The ID of the experiment run being evaluated
+              name:
+                type: string
+                description: The name of the evaluation
+              annotator_kind:
+                type: string
+                description: The kind of annotator used for the evaluation
+              result:
+                type: object
+                description: The result of the evaluation
+                properties:
+                  label:
+                    type: string
+                    description: The label assigned by the evaluation
+                  score:
+                    type: number
+                    format: float
+                    description: The score assigned by the evaluation
+                  explanation:
+                    type: string
+                    description: Explanation of the evaluation result
+              error:
+                type: string
+                description: Optional error message if the evaluation encountered an error
+              metadata:
+                type: object
+                description: Metadata for the evaluation
+                additionalProperties:
+                  type: string
+              start_time:
+                type: string
+                format: date-time
+                description: The start time of the evaluation in ISO format
+              end_time:
+                type: string
+                format: date-time
+                description: The end time of the evaluation in ISO format
+              trace_id:
+                type: string
+                description: Optional trace ID for tracking
+            required:
+              - experiment_run_id
+              - name
+              - annotator_kind
+              - start_time
+              - end_time
+    responses:
+      200:
+        description: Experiment evaluation upserted successfully
+        content:
+          application/json:
+            schema:
+              type: object
+              properties:
+                data:
+                  type: object
+                  properties:
+                    id:
+                      type: string
+                      description: The ID of the upserted experiment evaluation
+      404:
+        description: ExperimentRun not found
+    """
     payload = await request.json()
     experiment_run_gid = GlobalID.from_id(payload["experiment_run_id"])
     try:
phoenix/server/api/routers/v1/experiment_runs.py CHANGED
@@ -7,12 +7,79 @@ from starlette.status import HTTP_404_NOT_FOUND
 from strawberry.relay import GlobalID
 
 from phoenix.db import models
-from phoenix.experiments.types import ExperimentResult, ExperimentRun
+from phoenix.experiments.types import ExperimentRun, ExperimentRunOutput
 from phoenix.server.api.types.node import from_global_id_with_expected_type
 from phoenix.utilities.json import jsonify
 
 
 async def create_experiment_run(request: Request) -> Response:
+    """
+    summary: Create a new experiment run for a specific experiment
+    operationId: createExperimentRun
+    tags:
+      - private
+    parameters:
+      - in: path
+        name: experiment_id
+        required: true
+        description: The ID of the experiment for which the run is being created
+        schema:
+          type: string
+    requestBody:
+      description: Details of the experiment run to be created
+      required: true
+      content:
+        application/json:
+          schema:
+            type: object
+            properties:
+              dataset_example_id:
+                type: string
+                description: The ID of the dataset example used in the experiment run
+              trace_id:
+                type: string
+                description: Optional trace ID for tracking
+              experiment_run_output:
+                type: object
+                description: The output of the experiment run
+              repetition_number:
+                type: integer
+                description: The repetition number of the experiment run
+              start_time:
+                type: string
+                format: date-time
+                description: The start time of the experiment run in ISO format
+              end_time:
+                type: string
+                format: date-time
+                description: The end time of the experiment run in ISO format
+              error:
+                type: string
+                description: Optional error message if the experiment run encountered an error
+                nullable: true
+            required:
+              - dataset_example_id
+              - output
+              - repetition_number
+              - start_time
+              - end_time
+    responses:
+      200:
+        description: Experiment run created successfully
+        content:
+          application/json:
+            schema:
+              type: object
+              properties:
+                data:
+                  type: object
+                  properties:
+                    id:
+                      type: string
+                      description: The ID of the created experiment run
+      404:
+        description: Experiment or DatasetExample not found
+    """
     experiment_gid = GlobalID.from_id(request.path_params["experiment_id"])
     try:
         experiment_id = from_global_id_with_expected_type(experiment_gid, "Experiment")
@@ -34,7 +101,7 @@ async def create_experiment_run(request: Request) -> Response:
     )
 
     trace_id = payload.get("trace_id", None)
-    output = payload["output"]
+    output = payload["experiment_run_output"]
     repetition_number = payload["repetition_number"]
     start_time = payload["start_time"]
     end_time = payload["end_time"]
@@ -58,6 +125,63 @@ async def create_experiment_run(request: Request) -> Response:
 
 
 async def list_experiment_runs(request: Request) -> Response:
+    """
+    summary: List all runs for a specific experiment
+    operationId: listExperimentRuns
+    tags:
+      - private
+    parameters:
+      - in: path
+        name: experiment_id
+        required: true
+        description: The ID of the experiment to list runs for
+        schema:
+          type: string
+    responses:
+      200:
+        description: Experiment runs retrieved successfully
+        content:
+          application/json:
+            schema:
+              type: object
+              properties:
+                data:
+                  type: array
+                  items:
+                    type: object
+                    properties:
+                      id:
+                        type: string
+                        description: The ID of the experiment run
+                      experiment_id:
+                        type: string
+                        description: The ID of the experiment
+                      dataset_example_id:
+                        type: string
+                        description: The ID of the dataset example
+                      repetition_number:
+                        type: integer
+                        description: The repetition number of the experiment run
+                      start_time:
+                        type: string
+                        format: date-time
+                        description: The start time of the experiment run in ISO format
+                      end_time:
+                        type: string
+                        format: date-time
+                        description: The end time of the experiment run in ISO format
+                      experiment_run_output:
+                        type: object
+                        description: The output of the experiment run
+                      error:
+                        type: string
+                        description: Error message if the experiment run encountered an error
+                      trace_id:
+                        type: string
+                        description: Optional trace ID for tracking
+      404:
+        description: Experiment not found
+    """
     experiment_gid = GlobalID.from_id(request.path_params["experiment_id"])
     try:
         experiment_id = from_global_id_with_expected_type(experiment_gid, "Experiment")
@@ -87,7 +211,7 @@ async def list_experiment_runs(request: Request) -> Response:
             experiment_id=str(experiment_gid),
             dataset_example_id=str(example_gid),
             repetition_number=exp_run.repetition_number,
-            output=ExperimentResult.from_dict(exp_run.output) if exp_run.output else None,
+            experiment_run_output=ExperimentRunOutput.from_dict(exp_run.output),
             error=exp_run.error,
             id=str(run_gid),
             trace_id=exp_run.trace_id,
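
A hedged sketch of the new request body for `createExperimentRun`; the route path, host, and GlobalIDs are assumptions, while the field names come from the OpenAPI docstring above:

    import httpx

    payload = {
        "dataset_example_id": "RGF0YXNldEV4YW1wbGU6MQ==",   # hypothetical GlobalID
        "experiment_run_output": {"task_output": "hello"},  # key was "output" before 4.6
        "repetition_number": 1,
        "start_time": "2024-07-01T00:00:00+00:00",
        "end_time": "2024-07-01T00:00:01+00:00",
        "error": None,
    }
    resp = httpx.post(
        "http://localhost:6006/v1/experiments/RXhwZXJpbWVudDox/runs",  # assumed route
        json=payload,
    )
    resp.raise_for_status()
    print(resp.json()["data"]["id"])
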
phoenix/server/api/routers/v1/experiments.py CHANGED
@@ -25,6 +25,80 @@ def _generate_experiment_name(dataset_name: str) -> str:
 
 
 async def create_experiment(request: Request) -> Response:
+    """
+    summary: Create an experiment using a dataset
+    operationId: createExperiment
+    tags:
+      - private
+    parameters:
+      - in: path
+        name: dataset_id
+        required: true
+        description: The ID of the dataset to create an experiment for
+        schema:
+          type: string
+    requestBody:
+      description: Details of the experiment to be created
+      required: true
+      content:
+        application/json:
+          schema:
+            type: object
+            properties:
+              repetitions:
+                type: integer
+                description: Number of times the experiment should be repeated for each example
+                default: 1
+              metadata:
+                type: object
+                description: Metadata for the experiment
+                additionalProperties:
+                  type: string
+              version_id:
+                type: string
+                description: ID of the dataset version to use
+    responses:
+      200:
+        description: Experiment retrieved successfully
+        content:
+          application/json:
+            schema:
+              type: object
+              properties:
+                data:
+                  type: object
+                  properties:
+                    id:
+                      type: string
+                      description: The ID of the experiment
+                    dataset_id:
+                      type: string
+                      description: The ID of the dataset associated with the experiment
+                    dataset_version_id:
+                      type: string
+                      description: The ID of the dataset version associated with the experiment
+                    repetitions:
+                      type: integer
+                      description: Number of times the experiment is repeated
+                    metadata:
+                      type: object
+                      description: Metadata of the experiment
+                      additionalProperties:
+                        type: string
+                    project_name:
+                      type: string
+                      description: The name of the project associated with the experiment
+                    created_at:
+                      type: string
+                      format: date-time
+                      description: The creation timestamp of the experiment
+                    updated_at:
+                      type: string
+                      format: date-time
+                      description: The last update timestamp of the experiment
+      404:
+        description: Dataset or DatasetVersion not found
+    """
     dataset_globalid = GlobalID.from_id(request.path_params["dataset_id"])
     try:
         dataset_id = from_global_id_with_expected_type(dataset_globalid, "Dataset")
@@ -139,6 +213,60 @@ async def create_experiment(request: Request) -> Response:
 
 
 async def read_experiment(request: Request) -> Response:
+    """
+    summary: Get details of a specific experiment
+    operationId: getExperiment
+    tags:
+      - private
+    parameters:
+      - in: path
+        name: experiment_id
+        required: true
+        description: The ID of the experiment to retrieve
+        schema:
+          type: string
+    responses:
+      200:
+        description: Experiment retrieved successfully
+        content:
+          application/json:
+            schema:
+              type: object
+              properties:
+                data:
+                  type: object
+                  properties:
+                    id:
+                      type: string
+                      description: The ID of the experiment
+                    dataset_id:
+                      type: string
+                      description: The ID of the dataset associated with the experiment
+                    dataset_version_id:
+                      type: string
+                      description: The ID of the dataset version associated with the experiment
+                    repetitions:
+                      type: integer
+                      description: Number of times the experiment is repeated
+                    metadata:
+                      type: object
+                      description: Metadata of the experiment
+                      additionalProperties:
+                        type: string
+                    project_name:
+                      type: string
+                      description: The name of the project associated with the experiment
+                    created_at:
+                      type: string
+                      format: date-time
+                      description: The creation timestamp of the experiment
+                    updated_at:
+                      type: string
+                      format: date-time
+                      description: The last update timestamp of the experiment
+      404:
+        description: Experiment not found
+    """
     experiment_globalid = GlobalID.from_id(request.path_params["experiment_id"])
     try:
         experiment_id = from_global_id_with_expected_type(experiment_globalid, "Experiment")
phoenix/server/api/types/ExperimentRun.py CHANGED
@@ -84,7 +84,7 @@ def to_gql_experiment_run(run: models.ExperimentRun) -> ExperimentRun:
         trace_id=trace_id
         if (trace := run.trace) and (trace_id := trace.trace_id) is not None
         else None,
-        output=run.output.get("result"),
+        output=run.output.get("task_output"),
         start_time=run.start_time,
         end_time=run.end_time,
         error=run.error,
phoenix/session/client.py CHANGED
@@ -5,7 +5,7 @@ import re
 import weakref
 from collections import Counter
 from datetime import datetime
-from io import BytesIO, StringIO
+from io import BytesIO
 from pathlib import Path
 from typing import (
     Any,
@@ -406,35 +406,6 @@ class Client(TraceDataExtractor):
         df["created_at"] = pd.to_datetime(df.created_at)
         return df
 
-    def download_dataset_examples(
-        self,
-        dataset_id: str,
-        /,
-        *,
-        dataset_version_id: Optional[str] = None,
-    ) -> pd.DataFrame:
-        """
-        Download dataset examples as pandas DataFrame.
-
-        Args:
-            dataset_id (str): dataset ID
-            dataset_version_id (Optional[str]): dataset version ID, if omitted,
-                the latest version is returned.
-
-        Returns:
-            pandas DataFrame
-        """
-        url = f"v1/datasets/{dataset_id}/csv"
-        response = httpx.get(
-            url=urljoin(self._base_url, url),
-            params={"version_id": dataset_version_id} if dataset_version_id else {},
-        )
-        response.raise_for_status()
-        return pd.read_csv(
-            StringIO(response.content.decode()),
-            index_col="example_id",
-        )
-
     def upload_dataset(
         self,
         *,
@@ -808,7 +779,7 @@ def _prepare_pyarrow(
     return "pandas", file, "application/x-pandas-pyarrow", {}
 
 
-_response_header = re.compile(r"(?i)(response|answer)s*$")
+_response_header = re.compile(r"(?i)(response|answer|output)s*$")
 
 
 def _infer_keys(
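
The widened `_response_header` pattern means dataset columns named like "output" are now inferred as response keys alongside "response" and "answer" (the trailing `s*` tolerates plural forms). A quick check:

    import re

    _response_header = re.compile(r"(?i)(response|answer|output)s*$")

    for header in ("Response", "answers", "model_output", "outputs", "question"):
        print(header, bool(_response_header.search(header)))
    # Response True, answers True, model_output True, outputs True, question False
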
phoenix/version.py CHANGED
@@ -1 +1 @@
-__version__ = "4.4.4rc6"
+__version__ = "4.6.1"