arize-phoenix 4.12.0rc1__py3-none-any.whl → 4.14.1__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of arize-phoenix might be problematic; more details are linked from the original diff page.

Files changed (36):
  1. {arize_phoenix-4.12.0rc1.dist-info → arize_phoenix-4.14.1.dist-info}/METADATA +10 -6
  2. {arize_phoenix-4.12.0rc1.dist-info → arize_phoenix-4.14.1.dist-info}/RECORD +35 -35
  3. phoenix/db/bulk_inserter.py +3 -1
  4. phoenix/experiments/evaluators/base.py +4 -0
  5. phoenix/experiments/evaluators/code_evaluators.py +80 -0
  6. phoenix/experiments/evaluators/llm_evaluators.py +77 -1
  7. phoenix/experiments/evaluators/utils.py +70 -21
  8. phoenix/experiments/functions.py +14 -14
  9. phoenix/server/api/dataloaders/average_experiment_run_latency.py +23 -23
  10. phoenix/server/api/dataloaders/experiment_error_rates.py +30 -10
  11. phoenix/server/api/dataloaders/experiment_run_counts.py +18 -5
  12. phoenix/server/api/input_types/{CreateSpanAnnotationsInput.py → CreateSpanAnnotationInput.py} +4 -2
  13. phoenix/server/api/input_types/{CreateTraceAnnotationsInput.py → CreateTraceAnnotationInput.py} +4 -2
  14. phoenix/server/api/input_types/{PatchAnnotationsInput.py → PatchAnnotationInput.py} +4 -2
  15. phoenix/server/api/mutations/span_annotations_mutations.py +12 -6
  16. phoenix/server/api/mutations/trace_annotations_mutations.py +12 -6
  17. phoenix/server/api/types/Experiment.py +2 -2
  18. phoenix/server/api/types/Inferences.py +1 -2
  19. phoenix/server/api/types/Model.py +1 -2
  20. phoenix/server/app.py +3 -7
  21. phoenix/server/static/.vite/manifest.json +31 -31
  22. phoenix/server/static/assets/{components-C8sm_r1F.js → components-DeS0YEmv.js} +2 -2
  23. phoenix/server/static/assets/index-CQgXRwU0.js +100 -0
  24. phoenix/server/static/assets/{pages-bN7juCjh.js → pages-hdjlFZhO.js} +275 -198
  25. phoenix/server/static/assets/{vendor-CUDAPm8e.js → vendor-DPvSDRn3.js} +1 -1
  26. phoenix/server/static/assets/{vendor-arizeai-Do2HOmcL.js → vendor-arizeai-CkvPT67c.js} +2 -2
  27. phoenix/server/static/assets/{vendor-codemirror-CrdxOlMs.js → vendor-codemirror-Cqwpwlua.js} +1 -1
  28. phoenix/server/static/assets/{vendor-recharts-PKRvByVe.js → vendor-recharts-5jlNaZuF.js} +1 -1
  29. phoenix/server/templates/index.html +51 -43
  30. phoenix/session/client.py +7 -5
  31. phoenix/trace/dsl/filter.py +40 -25
  32. phoenix/version.py +1 -1
  33. phoenix/server/static/assets/index-BEKPzgQs.js +0 -100
  34. {arize_phoenix-4.12.0rc1.dist-info → arize_phoenix-4.14.1.dist-info}/WHEEL +0 -0
  35. {arize_phoenix-4.12.0rc1.dist-info → arize_phoenix-4.14.1.dist-info}/licenses/IP_NOTICE +0 -0
  36. {arize_phoenix-4.12.0rc1.dist-info → arize_phoenix-4.14.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: arize-phoenix
3
- Version: 4.12.0rc1
3
+ Version: 4.14.1
4
4
  Summary: AI Observability and Evaluation
5
5
  Project-URL: Documentation, https://docs.arize.com/phoenix/
6
6
  Project-URL: Issues, https://github.com/Arize-ai/phoenix/issues
@@ -47,7 +47,7 @@ Requires-Dist: scipy
47
47
  Requires-Dist: sqlalchemy[asyncio]<3,>=2.0.4
48
48
  Requires-Dist: sqlean-py>=3.45.1
49
49
  Requires-Dist: starlette
50
- Requires-Dist: strawberry-graphql==0.235.0
50
+ Requires-Dist: strawberry-graphql==0.236.0
51
51
  Requires-Dist: tqdm
52
52
  Requires-Dist: typing-extensions>=4.5; python_version < '3.12'
53
53
  Requires-Dist: typing-extensions>=4.6; python_version >= '3.12'
@@ -64,11 +64,12 @@ Requires-Dist: opentelemetry-sdk; extra == 'container'
64
64
  Requires-Dist: opentelemetry-semantic-conventions; extra == 'container'
65
65
  Requires-Dist: prometheus-client; extra == 'container'
66
66
  Requires-Dist: py-grpc-prometheus; extra == 'container'
67
- Requires-Dist: strawberry-graphql[opentelemetry]==0.235.0; extra == 'container'
67
+ Requires-Dist: strawberry-graphql[opentelemetry]==0.236.0; extra == 'container'
68
68
  Requires-Dist: uvloop; (platform_system != 'Windows') and extra == 'container'
69
69
  Provides-Extra: dev
70
70
  Requires-Dist: anthropic; extra == 'dev'
71
71
  Requires-Dist: arize[autoembeddings,llm-evaluation]; extra == 'dev'
72
+ Requires-Dist: asgi-lifespan; extra == 'dev'
72
73
  Requires-Dist: asyncpg; extra == 'dev'
73
74
  Requires-Dist: gcsfs; extra == 'dev'
74
75
  Requires-Dist: google-cloud-aiplatform>=1.3; extra == 'dev'
@@ -77,6 +78,7 @@ Requires-Dist: jupyter; extra == 'dev'
77
78
  Requires-Dist: langchain>=0.0.334; extra == 'dev'
78
79
  Requires-Dist: litellm>=1.0.3; extra == 'dev'
79
80
  Requires-Dist: llama-index>=0.10.3; extra == 'dev'
81
+ Requires-Dist: mypy==1.11.0; extra == 'dev'
80
82
  Requires-Dist: nbqa; extra == 'dev'
81
83
  Requires-Dist: pandas-stubs==2.0.3.230814; (python_version < '3.9') and extra == 'dev'
82
84
  Requires-Dist: pandas-stubs==2.2.2.240603; (python_version >= '3.9') and extra == 'dev'
@@ -87,9 +89,9 @@ Requires-Dist: psycopg[binary]; extra == 'dev'
87
89
  Requires-Dist: pytest-asyncio; extra == 'dev'
88
90
  Requires-Dist: pytest-cov; extra == 'dev'
89
91
  Requires-Dist: pytest-postgresql; extra == 'dev'
90
- Requires-Dist: pytest==8.2.2; extra == 'dev'
91
- Requires-Dist: ruff==0.4.9; extra == 'dev'
92
- Requires-Dist: strawberry-graphql[debug-server,opentelemetry]==0.235.0; extra == 'dev'
92
+ Requires-Dist: pytest==8.3.1; extra == 'dev'
93
+ Requires-Dist: ruff==0.5.4; extra == 'dev'
94
+ Requires-Dist: strawberry-graphql[debug-server,opentelemetry]==0.236.0; extra == 'dev'
93
95
  Requires-Dist: tabulate; extra == 'dev'
94
96
  Requires-Dist: types-tabulate; extra == 'dev'
95
97
  Provides-Extra: evals
@@ -137,6 +139,8 @@ Phoenix is an open-source AI observability platform designed for experimentation
137
139
 
138
140
  - **_Tracing_** - Trace your LLM application's runtime using OpenTelemetry-based instrumentation.
139
141
  - **_Evaluation_** - Leverage LLMs to benchmark your application's performance using response and retrieval evals.
142
+ - **_Datasets_** - Create versioned datasets of examples for experimentation, evaluation, and fine-tuning.
143
+ - **_Experiments_** - Track and evaluate changes to prompts, LLMs, and retrieval.
140
144
  - **_Inference Analysis_** - Visualize inferences and embeddings using dimensionality reduction and clustering to identify drift and performance degradation.
141
145
 
142
146
  Phoenix is vendor and language agnostic with out-of-the-box support for popular frameworks (🦙LlamaIndex, 🦜⛓LangChain, 🧩DSPy) and LLM providers (OpenAI, Bedrock, and more). For details on auto-instrumentation, check out the [OpenInference](https://github.com/Arize-ai/openinference) project.
@@ -5,7 +5,7 @@ phoenix/exceptions.py,sha256=n2L2KKuecrdflB9MsCdAYCiSEvGJptIsfRkXMoJle7A,169
5
5
  phoenix/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
6
6
  phoenix/services.py,sha256=aTxhcOA1pZHB6U-B3TEcp6fqDF5oT0xCUvEUNMZVTUQ,5175
7
7
  phoenix/settings.py,sha256=cO-qgis_S27nHirTobYI9hHPfZH18R--WMmxNdsVUwc,273
8
- phoenix/version.py,sha256=vDTXhJ8GZFVcgTBE3Q26tNDA2kuyeZwK6HnXlO91H70,26
8
+ phoenix/version.py,sha256=9Wn8BwD7EU7A8hupiqYQO3QNgiGcpbdgKgkCwHlUb-o,23
9
9
  phoenix/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  phoenix/core/embedding_dimension.py,sha256=zKGbcvwOXgLf-yrJBpQyKtd-LEOPRKHnUToyAU8Owis,87
11
11
  phoenix/core/model.py,sha256=km_a--PBHOuA337ClRw9xqhOHhrUT6Rl9pz_zV0JYkQ,4843
@@ -14,7 +14,7 @@ phoenix/core/model_schema_adapter.py,sha256=0Tm_Y_gV-WED8fKBCaFXAEFwE3CTEZS1dowq
14
14
  phoenix/db/README.md,sha256=IvKaZyf9ECbGBYYePaRhBveKZwDbxAc-c7BMxJYZh6Q,595
15
15
  phoenix/db/__init__.py,sha256=pDjEFXukHmJBM-1D8RjmXkvLsz85YWNxMQczt81ec3A,118
16
16
  phoenix/db/alembic.ini,sha256=p8DjVqGUs_tTx8oU56JP7qj-rMUebNFizItUSv_hPhs,3763
17
- phoenix/db/bulk_inserter.py,sha256=zbZGWZFDybKaGLGzpxgLwxAS5sC0_wXcvM0be4kUhh8,11286
17
+ phoenix/db/bulk_inserter.py,sha256=pqyfgwBHU7as5ll56q-NEzADuqYzQ2P-Z7-X9JHM35U,11339
18
18
  phoenix/db/engines.py,sha256=vLWaZlToMtDI7rJDxSidYkfOoojamxaZxaz8ND3zTus,4770
19
19
  phoenix/db/helpers.py,sha256=mTBhPzdy_aU9gD7hNzUZJkAnV77ko5CdaXyoWH3snPA,2982
20
20
  phoenix/db/migrate.py,sha256=MuhtNWnR24riROvarvKfbRb4_D5xuQi6P760vBUKl1E,2270
@@ -31,15 +31,15 @@ phoenix/db/migrations/types.py,sha256=Frq1AKSyBKQQ0FLzON-EmgTqE4kNkOpHMsbWnI-WgC
31
31
  phoenix/db/migrations/versions/10460e46d750_datasets.py,sha256=l69yZfScFrjfZZpY0gnqwhsDUEctLeo02qMgA_aOGDg,8155
32
32
  phoenix/db/migrations/versions/cf03bd6bae1d_init.py,sha256=CbWT3ZTR0CZqeT3zWLoTWhboFmnOy3Ju1z6Ztpq8WIM,8122
33
33
  phoenix/experiments/__init__.py,sha256=6JGwgUd7xCbGpuHqYZlsmErmYvVgv7N_j43bn3dUqsk,123
34
- phoenix/experiments/functions.py,sha256=2ZWCcIs0dh_UNB21t0J_PzWcYhu_32Ai3J6sEMjbBGE,32310
34
+ phoenix/experiments/functions.py,sha256=lz5Add19Hf8EQnfkTLfHFtRJsXfruPC4tXhXUpglGMc,32128
35
35
  phoenix/experiments/tracing.py,sha256=wVpt8Ie9WNPoi1djJdcrkwCokHdTO0bicXViLg3O-1Y,2831
36
36
  phoenix/experiments/types.py,sha256=VuvDCcvUGeHIQuXS_xpz7Jq5xHdt3qu-O_C7IQ3DvF8,23397
37
37
  phoenix/experiments/utils.py,sha256=wLu5Kvt1b4a8rGPRWq5G8RQ9XSiV8fCIVm51zWBI3-g,758
38
38
  phoenix/experiments/evaluators/__init__.py,sha256=j63fi3fa3U7-itVPHa82GowhjQRU-wO6yhO34u_lhsA,714
39
- phoenix/experiments/evaluators/base.py,sha256=ani0F2TN7DMN0KLhV89LIr9-W4g-ccEl2YQJgfp44Js,5325
40
- phoenix/experiments/evaluators/code_evaluators.py,sha256=0qIKQS14Knze50ziJEPVEnNeV3QIs4g1IXtCmaWZu7o,3923
41
- phoenix/experiments/evaluators/llm_evaluators.py,sha256=EFce6LKZwUZDBa5ZozvcdqeZpdWM6n6bmq7_oIzM2Nw,9211
42
- phoenix/experiments/evaluators/utils.py,sha256=SroMoxmPZIFCi2MbEOvXlBAFJbEZY2IWgQvNFp3JP3A,6978
39
+ phoenix/experiments/evaluators/base.py,sha256=jAwJs-V7jCp2UBChL0S3813Xyd9GN4rU4IEhX0nkFGs,5549
40
+ phoenix/experiments/evaluators/code_evaluators.py,sha256=O7ZtFk7ZEf3OjgrZeJTIDKeYfcQet8omlGG0s9vEywQ,6683
41
+ phoenix/experiments/evaluators/llm_evaluators.py,sha256=zyGhxXBDNi1qoj_8I95PRSwjfVaCzpFoAVUQeFT0XSM,13176
42
+ phoenix/experiments/evaluators/utils.py,sha256=XYqB0bOljyR0GewmR_mm9Ndl_q95EkjjDqfXd7YVqTk,9303
43
43
  phoenix/inferences/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
44
44
  phoenix/inferences/errors.py,sha256=cGp9vxnw4SewFoWBV3ZGMkhE0Kh73lPIv3Ppz_H_RoA,8261
45
45
  phoenix/inferences/fixtures.py,sha256=FC2eRL4dpobKQHYOilFtDexUWFkMZ_w6jun_4WkbMk0,20792
@@ -60,7 +60,7 @@ phoenix/pointcloud/pointcloud.py,sha256=4zAIkKs2xOUbchpj4XDAV-iPMXrfAJ15TG6rlIYG
60
60
  phoenix/pointcloud/projectors.py,sha256=zO_RrtDYSv2rqVOfIP2_9Cv11Dc8EmcZR94xhFcBYPU,1057
61
61
  phoenix/pointcloud/umap_parameters.py,sha256=3UQSjrysVOvq2V4KNpTMqNqNiK0BsTZnPBHWZ4fyJtQ,1708
62
62
  phoenix/server/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
63
- phoenix/server/app.py,sha256=u8s2AEUUDsY1uIpk3d8cMVHnAL6N57Ulb-k88kVvbMA,19673
63
+ phoenix/server/app.py,sha256=Agr0XLJGAEyBtV34qbsdBhabHxpPIxZL9SCaoFMeh2g,19479
64
64
  phoenix/server/grpc_server.py,sha256=faktLxEtWGlCB1bPR4QwwTsRoQloahKMx0hAWqRGI5s,3379
65
65
  phoenix/server/main.py,sha256=dRyODpwkNi_3as14fnZ8LWW_JLWtpXHldRy9SNjNtws,11251
66
66
  phoenix/server/prometheus.py,sha256=j9DHB2fERuq_ZKmwVaqR-9wx5WcPPuU1Cm5Bhg5241Y,2996
@@ -73,7 +73,7 @@ phoenix/server/api/queries.py,sha256=eq2xHaQF-x4k6AGSY6b6mU2pie9bj-AJML6P2Mr0_DM
73
73
  phoenix/server/api/schema.py,sha256=BcxdqO5CSGqpKd-AAJHMjFlzaK9oJA8GJuxmMfcdjn4,434
74
74
  phoenix/server/api/utils.py,sha256=Y1lGu8J8r8BSBX9OzffgewI8QMziovbG-ePDvZrrwGI,949
75
75
  phoenix/server/api/dataloaders/__init__.py,sha256=F6-8dwb-aQ_T8LeRYg3LPR7T94__8Xe4ysM0VGQeQUQ,4936
76
- phoenix/server/api/dataloaders/average_experiment_run_latency.py,sha256=RiO0AKC6Y5byafsV0zTJEIOt8Nudjte73f1T78cBe1k,1817
76
+ phoenix/server/api/dataloaders/average_experiment_run_latency.py,sha256=ITbbwEWuFqqAxY1CLuuG7VtZYfNXxjjibigNDYf7Yl0,1887
77
77
  phoenix/server/api/dataloaders/dataset_example_revisions.py,sha256=Vpr5IEKSR4QnAVxE5NM7u92fPNgeHQV2ieYc6JakCj0,3788
78
78
  phoenix/server/api/dataloaders/dataset_example_spans.py,sha256=_jLlo0KdUS65d4PNTtE9aXVyG_NZWgA7VcpNC9udQ8U,1484
79
79
  phoenix/server/api/dataloaders/document_evaluation_summaries.py,sha256=dgAAlD0n8X6oAPLaD-czoefNkDqP338MouWsKaW8bOY,5684
@@ -81,8 +81,8 @@ phoenix/server/api/dataloaders/document_evaluations.py,sha256=V-y8eyAA0sZpQTjHvm
81
81
  phoenix/server/api/dataloaders/document_retrieval_metrics.py,sha256=8tZYMNLZ7zxUmyTHHZRUTZTumvw6lK2tYOpFbATIPdI,4270
82
82
  phoenix/server/api/dataloaders/evaluation_summaries.py,sha256=z9aal3IQL_t30aNqpAS7x4tjq0xNkuEG8dWW-bhqZmo,5724
83
83
  phoenix/server/api/dataloaders/experiment_annotation_summaries.py,sha256=RsQ-o84kWVTYgIlh9VKkyw2kDMWIlHCRpS7RE2aw9vs,2881
84
- phoenix/server/api/dataloaders/experiment_error_rates.py,sha256=EHlTdZi8F94vo-qJUcnnXFvuSh_d0fTT0Xg4SfW_A70,1397
85
- phoenix/server/api/dataloaders/experiment_run_counts.py,sha256=wxHv08aZELJ91KTjHdt_x33M3wGDDa9GfbFHeRyOyGk,1343
84
+ phoenix/server/api/dataloaders/experiment_error_rates.py,sha256=Q7Cga0IRnzJy1IW26LWQmUu8pdLlBVYj3p6CJL7fcIk,2017
85
+ phoenix/server/api/dataloaders/experiment_run_counts.py,sha256=lnsX4GYll1EXaGYHxRL0HJol9DtqYYwLcMnoh-h994w,1729
86
86
  phoenix/server/api/dataloaders/experiment_sequence_number.py,sha256=Va1KuoHOd-wzvrlKykoV4kLRFW4JsJvGp_DUI4HYZX4,1631
87
87
  phoenix/server/api/dataloaders/latency_ms_quantile.py,sha256=pEc7QjB2iiNOQm_Fmo99F5O_DKOJWgGmcnT0OADJzYE,7423
88
88
  phoenix/server/api/dataloaders/min_start_or_max_end_times.py,sha256=IoFX5PtSpvQdMk_7-oB8TpIse3Q4PMxep4qKggkHpzo,2902
@@ -105,8 +105,8 @@ phoenix/server/api/input_types/ClearProjectInput.py,sha256=cpPFRyQ3ffy2dLbCZgYpw
105
105
  phoenix/server/api/input_types/ClusterInput.py,sha256=EL4ftvZxQ8mVdruUPcdhMhByORmSmM8S-X6RPqU6GX0,179
106
106
  phoenix/server/api/input_types/Coordinates.py,sha256=meTwbIjwTfqx5DGD2DBlH9wQzdQVNM5a8x9dp1FfIgA,173
107
107
  phoenix/server/api/input_types/CreateDatasetInput.py,sha256=Q3MwouIx9jTQBRWDju75iMQXEGJCrL4aD4ESQp771nc,248
108
- phoenix/server/api/input_types/CreateSpanAnnotationsInput.py,sha256=sTs6YdExpaZpTi6ql1BSpUbyT-ArnU5nz4kZaBs3i1w,399
109
- phoenix/server/api/input_types/CreateTraceAnnotationsInput.py,sha256=wc93I3ZLtzJL237UDUz96w6UY1UwLX76sr0FMeyGD4c,401
108
+ phoenix/server/api/input_types/CreateSpanAnnotationInput.py,sha256=bKgT7bdA9-gYpJmqnMq9TEfjNDEYoldc17EjAglXVlU,474
109
+ phoenix/server/api/input_types/CreateTraceAnnotationInput.py,sha256=iSukKAxt-gTTykpkttse3MVOTD3AOk6fWD-N5PaZ2yY,476
110
110
  phoenix/server/api/input_types/DataQualityMetricInput.py,sha256=LazvmQCCM5m9SDZTpyxQXO1rYF4cmsc3lsR2S9S65X4,1292
111
111
  phoenix/server/api/input_types/DatasetExampleInput.py,sha256=9oJ6pCFxFd02IWJuK4YAUvz-jCgFGDUCDDb2--GAzCw,289
112
112
  phoenix/server/api/input_types/DatasetSort.py,sha256=KDKjx5L8WFNwx7O-g1pDzCMMwY-ErgDd1_HkkZBAvCY,333
@@ -118,7 +118,7 @@ phoenix/server/api/input_types/DeleteExperimentsInput.py,sha256=yUbwMckIBvIL-R9t
118
118
  phoenix/server/api/input_types/DimensionFilter.py,sha256=vcXgglSnZcB5pGh-6oEtRmGx95hISgFUR7BEPw01g7U,3143
119
119
  phoenix/server/api/input_types/DimensionInput.py,sha256=Vfx5FmiMKey4-EHDQsQRPzSAMRJMN5oVMLDUl4NKAa8,164
120
120
  phoenix/server/api/input_types/Granularity.py,sha256=6SVfZ5yTZYq1PI6vdpjfkBUc4YilLSkF-k6okuSNbbQ,2301
121
- phoenix/server/api/input_types/PatchAnnotationsInput.py,sha256=NYlZtZG_ssgvG19FaZpqwUaJ2C1jv1hItU-ERX2_sI8,445
121
+ phoenix/server/api/input_types/PatchAnnotationInput.py,sha256=NWhkcbcGNPwfOYsN3wm5YFNNrSc5T-8Y5my74RK99HE,520
122
122
  phoenix/server/api/input_types/PatchDatasetExamplesInput.py,sha256=E86aBGXDBC83jiEGwV5rilnoeQf6eqCfZ0aAVeIt2VI,890
123
123
  phoenix/server/api/input_types/PatchDatasetInput.py,sha256=OURtTVY8Z_oFEDtKwT1LCMaOK5D4QYo5TVQ6mDrex-g,328
124
124
  phoenix/server/api/input_types/PerformanceMetricInput.py,sha256=fElsLTSEYYgGFGMYTEGcYid39tXUKFdV_JkdHavMcbA,591
@@ -131,8 +131,8 @@ phoenix/server/api/mutations/dataset_mutations.py,sha256=CuKhxsYfvwVcdN_9EXhKxB6
131
131
  phoenix/server/api/mutations/experiment_mutations.py,sha256=vV2lbJ7ccXZqe-LY7nXx6QxWqhKQE4UNZAFcML-KQ8I,3011
132
132
  phoenix/server/api/mutations/export_events_mutations.py,sha256=t_wYBxaqvBJYRoHslh3Bmoxmwlzoy0u8SsBKWIKN5hE,4028
133
133
  phoenix/server/api/mutations/project_mutations.py,sha256=d_xtYkYfZ5flpVgEkGknKB8rsEux-zZraczzqAs4e8A,2255
134
- phoenix/server/api/mutations/span_annotations_mutations.py,sha256=Pfaq4y-FGskPit4z_9GvsyWeBwK1g3CDi2UhGKxyjFE,4973
135
- phoenix/server/api/mutations/trace_annotations_mutations.py,sha256=4xm-zg8PEjwjfVRsWzBD_iiOS94JI6UYtK3fV2dtA4M,5013
134
+ phoenix/server/api/mutations/span_annotations_mutations.py,sha256=Kig5hdH-Jw0UZBhQAqyHvF7HdCHCqKZaoLR-jCOVJUA,5197
135
+ phoenix/server/api/mutations/trace_annotations_mutations.py,sha256=X0k49Ysu-su_hJbfjb_q3-G7qB9o_mpC6UXrDYhV5Sw,5237
136
136
  phoenix/server/api/openapi/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
137
137
  phoenix/server/api/openapi/main.py,sha256=WY0pj3B7siQyyYqKyhqnzWC7P8MtEtiukOBUjGwLXfw,153
138
138
  phoenix/server/api/openapi/schema.py,sha256=uuSYe1Ecu72aXRgTNjyMu-9ZPE13DAHJPKtedS-MsSs,451
@@ -171,17 +171,17 @@ phoenix/server/api/types/EvaluationSummary.py,sha256=EFucuzAhcxR9sdEn6WNAtmAGJk-
171
171
  phoenix/server/api/types/Event.py,sha256=XdYgaIxcVIW-YFViCkxj5l9OaVNepyIrCtm5Iqg2le8,3989
172
172
  phoenix/server/api/types/EventMetadata.py,sha256=-J0tYF9eZTHwCjwxQHY7Gckr2_MNW5OoWT1mydweZNM,635
173
173
  phoenix/server/api/types/ExampleRevisionInterface.py,sha256=gV3Gt9-3Oi5wjaVtepC6nOt3FzTzZFD1KebNnqiw56E,294
174
- phoenix/server/api/types/Experiment.py,sha256=ELYdYFKwgBllxx3cZ_X0XicHjLtshZl0bFqqJdVGXRQ,5177
174
+ phoenix/server/api/types/Experiment.py,sha256=K-3w6dniPRSMO4v-4ToDRwH2xr4fPaDumoyeT4We7g4,5228
175
175
  phoenix/server/api/types/ExperimentAnnotationSummary.py,sha256=Uk3JtxIrsMoZT5tqc4nJdUOM3XegVzjUyoV3pkjNotE,256
176
176
  phoenix/server/api/types/ExperimentComparison.py,sha256=0sFz6MoBDw39dds0qVyaqhVs9qqO5rkG1FMSjmfBeCc,441
177
177
  phoenix/server/api/types/ExperimentRun.py,sha256=122_SID7SLKPUq2dJ2Y4BBw40DNUtcxo6QCZuO8UbBs,2997
178
178
  phoenix/server/api/types/ExperimentRunAnnotation.py,sha256=iBxDaD9DgiF-Qymp5QyxWfJRGYXM1_CeWA_qzsZBqkI,1812
179
179
  phoenix/server/api/types/ExportedFile.py,sha256=e3GTn7B5LgsTbqiwjhMCQH7VsiqXitrBO4aCMS1lHsg,163
180
180
  phoenix/server/api/types/Functionality.py,sha256=tzV9xdhB8zqfsjWxP66NDC7EZsplYkYO7jRbLWJIeeg,382
181
- phoenix/server/api/types/Inferences.py,sha256=HWuDZZrXPWVoEy_pA3bRsAOUYsCKgAxf9zshasGqu5Y,3403
181
+ phoenix/server/api/types/Inferences.py,sha256=BOMlOSsRtUV9XQxpnjmZxdBcJ1w-t9PiFfVOSTS160E,3367
182
182
  phoenix/server/api/types/InferencesRole.py,sha256=Kj9aiXOpGhpeg9PHd9MDU7aXVIT28EjJxr4P6xybfzc,601
183
183
  phoenix/server/api/types/MimeType.py,sha256=Zpi6zCalkSFgsvhzvOs-O1gYA04usAi9H__QZUmFlO0,365
184
- phoenix/server/api/types/Model.py,sha256=991I3wKNFTwEOGR9PCmEVBg2Y9DuQR65SovU_8kzkVs,8085
184
+ phoenix/server/api/types/Model.py,sha256=BRIzH5xSGiDrAUYvhwDpwxT6--ddS3Xr3vCvP8_vzdo,8051
185
185
  phoenix/server/api/types/NumericRange.py,sha256=afEjgF97Go_OvmjMggbPBt-zGM8IONewAyEiKEHRds0,192
186
186
  phoenix/server/api/types/PerformanceMetric.py,sha256=W92B7OghEOgzFvmY0LCqpgavHaQggTGshdgfD0yqHX4,350
187
187
  phoenix/server/api/types/Project.py,sha256=R2_nart3H4m8QYRbBe-SRnYvIjH4BCipcj_tKe6GaC8,14516
@@ -213,20 +213,20 @@ phoenix/server/static/apple-touch-icon-76x76.png,sha256=CT_xT12I0u2i0WU8JzBZBuOQ
213
213
  phoenix/server/static/apple-touch-icon.png,sha256=fOfpjqGpWYbJ0eAurKsyoZP1EAs6ZVooBJ_SGk2ZkDs,3801
214
214
  phoenix/server/static/favicon.ico,sha256=bY0vvCKRftemZfPShwZtE93DiiQdaYaozkPGwNFr6H8,34494
215
215
  phoenix/server/static/modernizr.js,sha256=mvK-XtkNqjOral-QvzoqsyOMECXIMu5BQwSVN_wcU9c,2564
216
- phoenix/server/static/.vite/manifest.json,sha256=HBf8Uvnr1-ERuZTBCdHFEF8tgiPQzhLFaK84vo0TmNQ,1929
217
- phoenix/server/static/assets/components-C8sm_r1F.js,sha256=eU0wbaPT17BDumNQKBL-VMI4GcXSd_OmNKr7jkDNKJY,160991
218
- phoenix/server/static/assets/index-BEKPzgQs.js,sha256=eo_7Yc0oaqrRnlXKnz8k3adfq0bFNH_mhO_JyHq7Dvc,7362
219
- phoenix/server/static/assets/pages-bN7juCjh.js,sha256=nFAVGriGoDe-udTjwGwz31kU7dg3GETzC1snnT0fq8A,406291
220
- phoenix/server/static/assets/vendor-CUDAPm8e.js,sha256=vgwuODjGR2Mb8FeNurZOmbQrvxkBnWMaHtdQ69soFqA,1355429
216
+ phoenix/server/static/.vite/manifest.json,sha256=10o8Ytfii6SAgHd91u81MQsDMIwhoNgzav27nED77Ow,1929
217
+ phoenix/server/static/assets/components-DeS0YEmv.js,sha256=eE0JsxZZ0MI9DS9b8nDmBpmzCXNxqBGy8qVssaeHdj4,160991
218
+ phoenix/server/static/assets/index-CQgXRwU0.js,sha256=hzyJ3S10rmimJlZny1IO3l0_noAurwHdJw0Nc38B1Bc,6342
219
+ phoenix/server/static/assets/pages-hdjlFZhO.js,sha256=CRfgXRtrvdut-V-KmrKFtezUb_azfs-eCIbTrIPKX04,422269
220
+ phoenix/server/static/assets/vendor-DPvSDRn3.js,sha256=Hc-RClavSPr5CtZbFYbQNrIPim9LJrD4e51QMOxF3Io,1355429
221
221
  phoenix/server/static/assets/vendor-DxkFTwjz.css,sha256=nZrkr0u6NNElFGvpWHk9GTHeGoibCXCli1bE7mXZGZg,1816
222
- phoenix/server/static/assets/vendor-arizeai-Do2HOmcL.js,sha256=YDx2XE1Zwly6oS6Xv_-B792_jpNZ0bN_gvRaHY35btA,290966
223
- phoenix/server/static/assets/vendor-codemirror-CrdxOlMs.js,sha256=KMJHeNKQyBjzBzKZEiDvIpx1hjhlb6zgALnE7CCgfCs,357576
224
- phoenix/server/static/assets/vendor-recharts-PKRvByVe.js,sha256=Uul1I5FtZKipkx1ku4y2OqWd86GiO4aB4dA9o_LJbvM,282859
222
+ phoenix/server/static/assets/vendor-arizeai-CkvPT67c.js,sha256=sD4eqJrzqLBhFUAe2TNFUkoGv2nJl_gv3cK3Lo_Iiqk,290966
223
+ phoenix/server/static/assets/vendor-codemirror-Cqwpwlua.js,sha256=RSF9c9RG5ol0VSYXxItT5llkabRIUO1gIZVrF4-nX1o,357576
224
+ phoenix/server/static/assets/vendor-recharts-5jlNaZuF.js,sha256=3LZ6-as7BKqU7ulx_f_2wE8LVc8IjFrIxJNTxrkPfps,282859
225
225
  phoenix/server/static/assets/vendor-three-DwGkEfCM.js,sha256=0D12ZgKzfKCTSdSTKJBFR2RZO_xxeMXrqDp0AszZqHY,620972
226
226
  phoenix/server/templates/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
227
- phoenix/server/templates/index.html,sha256=lgWe7Smx5KT6XYqnLC2-ZJYtxCJIC-if0-_6TesO1_Q,3884
227
+ phoenix/server/templates/index.html,sha256=gVpjB8pCMiubdMh2DA9mTCtV5AVTXJH_9u5PmG2t7Vk,4238
228
228
  phoenix/session/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
229
- phoenix/session/client.py,sha256=aq5AhgyoI-rJU5eTNFAk8SzdBl6hiZvj7fWrC09s4aI,32523
229
+ phoenix/session/client.py,sha256=AEqBnjWl1D1KounuUj5T269cqPAZIV_KdWezknj6nC0,32577
230
230
  phoenix/session/data_extractor.py,sha256=gkEM3WWZAlWGMfRgQopAQlid4cSi6GNco-sdrGir0qc,2788
231
231
  phoenix/session/evaluation.py,sha256=aKeV8UVOyq3b7CYOwt3cWuLz0xzvMjX7vlEPILJ_fcs,5311
232
232
  phoenix/session/session.py,sha256=1ZGR0pBmah8bqX353MDf4sq7XuK904EfxNLo0B9z_sU,26714
@@ -246,7 +246,7 @@ phoenix/trace/trace_dataset.py,sha256=Wq89jJ4hYQ1Qt-Uj11ZNzKQYQeKmGY6NqWStQiiTlM
246
246
  phoenix/trace/utils.py,sha256=7LurVGXn245cjj4MJsc7v6jq4DSJkpK6YGBfIaSywuw,1307
247
247
  phoenix/trace/dsl/README.md,sha256=ihmP9zGUC5V-TDbzKla76LuyDqPDQIBUH2BORwxNI68,2902
248
248
  phoenix/trace/dsl/__init__.py,sha256=WIQIjJg362XD3s50OsPJJ0xbDsGp41bSv7vDllLrPuA,144
249
- phoenix/trace/dsl/filter.py,sha256=HCx9FjnmgkzsYDW7ptPz7ozVGbbSnW59tzJdYCKojQI,32170
249
+ phoenix/trace/dsl/filter.py,sha256=DTDERSAexxDbTy5QvC48NSKhnr2pfIRDZ4PS-s3ZW80,32642
250
250
  phoenix/trace/dsl/helpers.py,sha256=ULAhqWULPqYWCSNX7y50DVKIqfySx86nqb6hDvZPnVk,3896
251
251
  phoenix/trace/dsl/query.py,sha256=W0t-tiXh2WIVb96lzFAGQOQ-U46uKux78d4KL3rW-PE,30316
252
252
  phoenix/trace/langchain/__init__.py,sha256=F37GfD1pd5Kuw7R7iRUM1zXXpO8xEcycNZh5dwqBXNk,109
@@ -266,8 +266,8 @@ phoenix/utilities/logging.py,sha256=lDXd6EGaamBNcQxL4vP1au9-i_SXe0OraUDiJOcszSw,
266
266
  phoenix/utilities/project.py,sha256=qWsvKnG1oKhOFUowXf9qiOL2ia7jaFe_ijFFHEt8GJo,431
267
267
  phoenix/utilities/re.py,sha256=PDve_OLjRTM8yQQJHC8-n3HdIONi7aNils3ZKRZ5uBM,2045
268
268
  phoenix/utilities/span_store.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
269
- arize_phoenix-4.12.0rc1.dist-info/METADATA,sha256=RrJgXQpDjiI30wmDKUZrcbBoM6vb3f2suCGJC4mtuQ0,11455
270
- arize_phoenix-4.12.0rc1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
271
- arize_phoenix-4.12.0rc1.dist-info/licenses/IP_NOTICE,sha256=JBqyyCYYxGDfzQ0TtsQgjts41IJoa-hiwDrBjCb9gHM,469
272
- arize_phoenix-4.12.0rc1.dist-info/licenses/LICENSE,sha256=HFkW9REuMOkvKRACuwLPT0hRydHb3zNg-fdFt94td18,3794
273
- arize_phoenix-4.12.0rc1.dist-info/RECORD,,
269
+ arize_phoenix-4.14.1.dist-info/METADATA,sha256=FgEpDDxRhJOIbBjm9IWUQxHltEu7XB1XjtsEHMl23W8,11736
270
+ arize_phoenix-4.14.1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
271
+ arize_phoenix-4.14.1.dist-info/licenses/IP_NOTICE,sha256=JBqyyCYYxGDfzQ0TtsQgjts41IJoa-hiwDrBjCb9gHM,469
272
+ arize_phoenix-4.14.1.dist-info/licenses/LICENSE,sha256=HFkW9REuMOkvKRACuwLPT0hRydHb3zNg-fdFt94td18,3794
273
+ arize_phoenix-4.14.1.dist-info/RECORD,,
@@ -105,8 +105,10 @@ class BulkInserter:
105
105
  )
106
106
 
107
107
  async def __aexit__(self, *args: Any) -> None:
108
- self._operations = None
109
108
  self._running = False
109
+ if self._task:
110
+ self._task.cancel()
111
+ self._task = None
110
112
 
111
113
  def _enqueue_operation(self, operation: DataManipulation) -> None:
112
114
  cast("Queue[DataManipulation]", self._operations).put_nowait(operation)
@@ -90,11 +90,15 @@ class Evaluator(ABC):
90
90
  if super_cls in (LLMEvaluator, Evaluator):
91
91
  break
92
92
  if evaluate := super_cls.__dict__.get(Evaluator.evaluate.__name__):
93
+ if isinstance(evaluate, classmethod):
94
+ evaluate = evaluate.__func__
93
95
  assert callable(evaluate), "`evaluate()` method should be callable"
94
96
  # need to remove the first param, i.e. `self`
95
97
  _validate_sig(functools.partial(evaluate, None), "evaluate")
96
98
  return
97
99
  if async_evaluate := super_cls.__dict__.get(Evaluator.async_evaluate.__name__):
100
+ if isinstance(async_evaluate, classmethod):
101
+ async_evaluate = async_evaluate.__func__
98
102
  assert callable(async_evaluate), "`async_evaluate()` method should be callable"
99
103
  # need to remove the first param, i.e. `self`
100
104
  _validate_sig(functools.partial(async_evaluate, None), "async_evaluate")
@@ -9,6 +9,19 @@ from phoenix.experiments.types import EvaluationResult, TaskOutput
9
9
 
10
10
 
11
11
  class JSONParsable(CodeEvaluator):
12
+ """
13
+ An evaluator that checks if the output of an experiment run is a JSON-parsable string.
14
+
15
+ Example:
16
+
17
+ .. code-block:: python
18
+ from phoenix.experiments import run_experiment
19
+ from phoenix.experiments.evaluators import JSONParsable
20
+
21
+ run_experiment(dataset, task, evaluators=[JSONParsable])
22
+ """
23
+
24
+ @classmethod
12
25
  def evaluate(self, *, output: Optional[TaskOutput] = None, **_: Any) -> EvaluationResult:
13
26
  assert isinstance(output, str), "Experiment run output must be a string"
14
27
  try:
@@ -22,6 +35,22 @@ class JSONParsable(CodeEvaluator):
22
35
 
23
36
 
24
37
  class ContainsKeyword(CodeEvaluator):
38
+ """
39
+ An evaluator that checks if a keyword is present in the output of an experiment run.
40
+
41
+ Args:
42
+ keyword (str): The keyword to search for in the output.
43
+ name (str, optional): An optional name for the evaluator. Defaults to "Contains(<keyword>)".
44
+
45
+ Example:
46
+
47
+ .. code-block:: python
48
+ from phoenix.experiments import run_experiment
49
+ from phoenix.experiments.evaluators import ContainsKeyword
50
+
51
+ run_experiment(dataset, task, evaluators=[ContainsKeyword("foo")])
52
+ """
53
+
25
54
  def __init__(self, keyword: str, name: Optional[str] = None) -> None:
26
55
  self.keyword = keyword
27
56
  self._name = name or f"Contains({repr(keyword)})"
@@ -39,6 +68,23 @@ class ContainsKeyword(CodeEvaluator):
39
68
 
40
69
 
41
70
  class ContainsAnyKeyword(CodeEvaluator):
71
+ """
72
+ An evaluator that checks if any of the keywords are present in the output of an experiment run.
73
+
74
+ Args:
75
+ keywords (List[str]): The keywords to search for in the output.
76
+ name (str, optional): An optional name for the evaluator. Defaults to
77
+ "ContainsAny(<keywords>)".
78
+
79
+ Example:
80
+
81
+ .. code-block:: python
82
+ from phoenix.experiments import run_experiment
83
+ from phoenix.experiments.evaluators import ContainsAnyKeyword
84
+
85
+ run_experiment(dataset, task, evaluators=[ContainsAnyKeyword(["foo", "bar"])])
86
+ """
87
+
42
88
  def __init__(self, keywords: List[str], name: Optional[str] = None) -> None:
43
89
  self.keywords = keywords
44
90
  self._name = name or f"ContainsAny({keywords})"
@@ -57,6 +103,23 @@ class ContainsAnyKeyword(CodeEvaluator):
57
103
 
58
104
 
59
105
  class ContainsAllKeywords(CodeEvaluator):
106
+ """
107
+ An evaluator that checks if all of the keywords are present in the output of an experiment run.
108
+
109
+ Args:
110
+ keywords (List[str]): The keywords to search for in the output.
111
+ name (str, optional): An optional name for the evaluator. Defaults to
112
+ "ContainsAll(<keywords>)".
113
+
114
+ Example:
115
+ .. code-block:: python
116
+
117
+ from phoenix.experiments import run_experiment
118
+ from phoenix.experiments.evaluators import ContainsAllKeywords
119
+
120
+ run_experiment(dataset, task, evaluators=[ContainsAllKeywords(["foo", "bar"])])
121
+ """
122
+
60
123
  def __init__(self, keywords: List[str], name: Optional[str] = None) -> None:
61
124
  self.keywords = keywords
62
125
  self._name = name or f"ContainsAll({keywords})"
@@ -77,6 +140,23 @@ class ContainsAllKeywords(CodeEvaluator):
77
140
 
78
141
 
79
142
  class MatchesRegex(CodeEvaluator):
143
+ r"""
144
+ An experiment evaluator that checks if the output of an experiment run matches a regex pattern.
145
+
146
+ Args:
147
+ pattern (Union[str, re.Pattern[str]]): The regex pattern to match the output against.
148
+ name (str, optional): An optional name for the evaluator. Defaults to "matches_({pattern})".
149
+
150
+ Example:
151
+ .. code-block:: python
152
+
153
+ from phoenix.experiments import run_experiment
154
+ from phoenix.experiments.evaluators import MatchesRegex
155
+
156
+ phone_number_evaluator = MatchesRegex(r"\d{3}-\d{3}-\d{4}", name="valid-phone-number")
157
+ run_experiment(dataset, task, evaluators=[phone_number_evaluator])
158
+ """
159
+
80
160
  def __init__(self, pattern: Union[str, re.Pattern[str]], name: Optional[str] = None) -> None:
81
161
  if isinstance(pattern, str):
82
162
  pattern = re.compile(pattern)
@@ -18,6 +18,31 @@ from phoenix.experiments.types import (
18
18
 
19
19
 
20
20
  class LLMCriteriaEvaluator(LLMEvaluator):
21
+ """
22
+ An experiment evaluator that uses an LLM to evaluate whether the text meets a custom criteria.
23
+
24
+ This evaluator uses the chain-of-thought technique to perform a binary evaluation of text based
25
+ on a custom criteria and description. When used as an experiment evaluator,
26
+ `LLMCriteriaEvaluator` will return a score of 1.0 if the text meets the criteria and a score of
27
+ 0.0 if not. The explanation produced by the chain-of-thought technique will be included in the
28
+ experiment evaluation as well.
29
+
30
+ Example criteria and descriptions:
31
+ - "thoughtfulness" - "shows careful consideration and fair judgement"
32
+ - "clarity" - "is easy to understand and follow"
33
+ - "professionalism" - "is respectful and appropriate for a formal setting"
34
+
35
+ Args:
36
+ model: The LLM model wrapper to use for evaluation. Compatible models can be imported from
37
+ the `phoenix.evals` module.
38
+ criteria: The criteria to evaluate the text against, the criteria should be able to be used
39
+ as a noun in a sentence.
40
+ description (str): A description of the criteria, used to clarify instructions to the LLM.
41
+ The description should complete this sentence: "{criteria} means the text
42
+ {description}".
43
+ name (str): The name of the evaluator
44
+ """
45
+
21
46
  _base_template = (
22
47
  "Determine if the following text is {criteria}. {description}"
23
48
  "First, explain step-by-step why you think the text is or is not {criteria}. Then provide "
@@ -117,6 +142,14 @@ ConcisenessEvaluator = criteria_evaluator_factory(
117
142
  description="is just a few sentences and easy to follow",
118
143
  default_name="Conciseness",
119
144
  )
145
+ """
146
+ An experiment evaluator that uses an LLM to evaluate whether the text is concise.
147
+
148
+ Args:
149
+ model: The LLM model wrapper to use for evaluation. Compatible models can be imported from
150
+ the `phoenix.evals` module.
151
+ name (str, optional): The name of the evaluator, defaults to "Conciseness".
152
+ """
120
153
 
121
154
 
122
155
  HelpfulnessEvaluator = criteria_evaluator_factory(
@@ -125,6 +158,14 @@ HelpfulnessEvaluator = criteria_evaluator_factory(
125
158
  description="provides useful information",
126
159
  default_name="Helpfulness",
127
160
  )
161
+ """
162
+ An experiment evaluator that uses an LLM to evaluate whether the text is helpful.
163
+
164
+ Args:
165
+ model: The LLM model wrapper to use for evaluation. Compatible models can be imported from
166
+ the `phoenix.evals` module.
167
+ name (str, optional): The name of the evaluator, defaults to "Helpfulness".
168
+ """
128
169
 
129
170
 
130
171
  CoherenceEvaluator = criteria_evaluator_factory(
@@ -133,6 +174,14 @@ CoherenceEvaluator = criteria_evaluator_factory(
133
174
  description="is coherent, well-structured, and logically sound",
134
175
  default_name="Coherence",
135
176
  )
177
+ """
178
+ An experiment evaluator that uses an LLM to evaluate whether the text is coherent.
179
+
180
+ Args:
181
+ model: The LLM model wrapper to use for evaluation. Compatible models can be imported from
182
+ the `phoenix.evals` module.
183
+ name (str, optional): The name of the evaluator, defaults to "Coherence".
184
+ """
136
185
 
137
186
 
138
187
  def _parse_label_from_explanation(raw_string: str) -> str:
@@ -149,6 +198,33 @@ def _parse_label_from_explanation(raw_string: str) -> str:
149
198
 
150
199
 
151
200
  class RelevanceEvaluator(LLMEvaluator):
201
+ """
202
+ An experiment evaluator that uses an LLM to evaluate whether a response is relevant to a query.
203
+
204
+ This evaluator uses the chain-of-thought technique to perform a binary evaluation of whether
205
+ the output "response" of an experiment is relevant to its input "query". When used as an
206
+ experiment evaluator, `RelevanceEvaluator` will return a score of 1.0 if the response is
207
+ relevant to the query and a score of 0.0 if not. The explanation produced by the
208
+ chain-of-thought technique will be included in the experiment evaluation as well.
209
+
210
+ Optionally, you can provide custom functions to extract the query and response from the input
211
+ and output of the experiment task. By default, the evaluator will use the dataset example as
212
+ the input and the output of the experiment task as the response.
213
+
214
+ Args:
215
+ model: The LLM model wrapper to use for evaluation. Compatible models can be imported from
216
+ the `phoenix.evals` module.
217
+ get_query (callable, optional): A function that extracts the query from the input of the
218
+ experiment task. The function should take the input and metadata of the dataset example
219
+ and return a string. By default, the function will return the string representation of
220
+ the input.
221
+ get_response (callable, optional): A function that extracts the response from the output of
222
+ the experiment task. The function should take the output and metadata of the experiment
223
+ task and return a string. By default, the function will return the string representation
224
+ of the output.
225
+ name (str, optional): The name of the evaluator. Defaults to "Relevance".
226
+ """
227
+
152
228
  template = (
153
229
  "Determine if the following response is relevant to the query. In this context, "
154
230
  "'relevance' means that the response directly addresses the core question or topic of the "
@@ -174,7 +250,7 @@ class RelevanceEvaluator(LLMEvaluator):
174
250
  model: LLMBaseModel,
175
251
  get_query: Optional[Callable[[ExampleInput, ExampleMetadata], str]] = None,
176
252
  get_response: Optional[Callable[[Optional[TaskOutput], ExampleMetadata], str]] = None,
177
- name: str = "RelevanceEvaluator",
253
+ name: str = "Relevance",
178
254
  ):
179
255
  self.model = model
180
256
  self._name = name
@@ -1,6 +1,5 @@
1
1
  import functools
2
2
  import inspect
3
- from itertools import chain, islice, repeat
4
3
  from typing import TYPE_CHECKING, Any, Callable, Optional, Union
5
4
 
6
5
  from phoenix.experiments.types import (
@@ -75,6 +74,72 @@ def create_evaluator(
75
74
  name: Optional[str] = None,
76
75
  scorer: Optional[Callable[[Any], EvaluationResult]] = None,
77
76
  ) -> Callable[[Callable[..., Any]], "Evaluator"]:
77
+ """
78
+ A decorator that configures a sync or async function to be used as an experiment evaluator.
79
+
80
+ If the `evaluator` is a function of one argument then that argument will be
81
+ bound to the `output` of an experiment task. Alternatively, the `evaluator` can be a function
82
+ of any combination of specific argument names that will be bound to special values:
83
+ `input`: The input field of the dataset example
84
+ `output`: The output of an experiment task
85
+ `expected`: The expected or reference output of the dataset example
86
+ `reference`: An alias for `expected`
87
+ `metadata`: Metadata associated with the dataset example
88
+
89
+ Args:
90
+ kind (str | AnnotatorKind): Broadly indicates how the evaluator scores an experiment run.
91
+ Valid kinds are: "CODE", "LLM". Defaults to "CODE".
92
+ name (str, optional): The name of the evaluator. If not provided, the name of the function
93
+ will be used.
94
+ scorer (callable, optional): An optional function that converts the output of the wrapped
95
+ function into an `EvaluationResult`. This allows configuring the evaluation
96
+ payload by setting a label, score and explanation. By default, numeric outputs will
97
+ be recorded as scores, boolean outputs will be recorded as scores and labels, and
98
+ string outputs will be recorded as labels. If the output is a 2-tuple, the first item
99
+ will be recorded as the score and the second item will be recorded as the explanation.
100
+
101
+ Examples:
102
+ Configuring an evaluator that returns a boolean
103
+
104
+ .. code-block:: python
105
+ @create_evaluator(kind="CODE", name="exact-match")
106
+ def match(output: str, expected: str) -> bool:
107
+ return output == expected
108
+
109
+ Configuring an evaluator that returns a label
110
+
111
+ .. code-block:: python
112
+ client = openai.Client()
113
+
114
+ @create_evaluator(kind="LLM")
115
+ def label(output: str) -> str:
116
+ res = client.chat.completions.create(
117
+ model = "gpt-4",
118
+ messages = [
119
+ {
120
+ "role": "user",
121
+ "content": (
122
+ "in one word, characterize the sentiment of the following customer "
123
+ f"request: {output}"
124
+ )
125
+ },
126
+ ],
127
+ )
128
+ label = res.choices[0].message.content
129
+ return label
130
+
131
+ Configuring an evaluator that returns a score and explanation
132
+
133
+ .. code-block:: python
134
+ from textdistance import levenshtein
135
+
136
+ @create_evaluator(kind="CODE", name="levenshtein-distance")
137
+ def ld(output: str, expected: str) -> Tuple[float, str]:
138
+ return (
139
+ levenshtein(output, expected),
140
+ f"Levenshtein distance between {output} and {expected}"
141
+ )
142
+ """
78
143
  if scorer is None:
79
144
  scorer = _default_eval_scorer
80
145
 
@@ -163,24 +228,8 @@ def _default_eval_scorer(result: Any) -> EvaluationResult:
163
228
  return EvaluationResult(score=float(result))
164
229
  if isinstance(result, str):
165
230
  return EvaluationResult(label=result)
166
- if isinstance(result, (tuple, list)) and 0 < len(result) <= 3:
167
- # Possible interpretations are:
168
- # - 3-tuple: (Score, Label, Explanation)
169
- # - 2-tuple: (Score, Explanation) or (Label, Explanation)
170
- # - 1-tuple: (Score, ) or (Label, )
171
- # Note that (Score, Label) conflicts with (Score, Explanation) and we
172
- # pick the latter because it's probably more prevalent. To get
173
- # (Score, Label), use a 3-tuple instead, i.e. (Score, Label, None).
174
- a, b, c = islice(chain(result, repeat(None)), 3)
175
- score, label, explanation = None, a, b
176
- if hasattr(a, "__float__"):
177
- try:
178
- score = float(a)
179
- except ValueError:
180
- pass
181
- else:
182
- label, explanation = (None, b) if len(result) < 3 else (b, c)
183
- return EvaluationResult(score=score, label=label, explanation=explanation)
184
- if result is None:
185
- return EvaluationResult(score=0)
231
+ if isinstance(result, (tuple, list)) and len(result) == 2:
232
+ # If the result is a 2-tuple, the first item will be recorded as the score
233
+ # and the second item will be recorded as the explanation.
234
+ return EvaluationResult(score=float(result[0]), explanation=str(result[1]))
186
235
  raise ValueError(f"Unsupported evaluation result type: {type(result)}")