arize-phoenix: arize_phoenix-4.4.4rc3-py3-none-any.whl → arize_phoenix-4.4.4rc5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of arize-phoenix might be problematic. See the package's release page on the registry for more details.

Files changed (33):
  1. {arize_phoenix-4.4.4rc3.dist-info → arize_phoenix-4.4.4rc5.dist-info}/METADATA +2 -2
  2. {arize_phoenix-4.4.4rc3.dist-info → arize_phoenix-4.4.4rc5.dist-info}/RECORD +33 -28
  3. phoenix/datasets/evaluators/__init__.py +18 -0
  4. phoenix/datasets/evaluators/code_evaluators.py +99 -0
  5. phoenix/datasets/{evaluators.py → evaluators/llm_evaluators.py} +75 -106
  6. phoenix/datasets/evaluators/utils.py +292 -0
  7. phoenix/datasets/experiments.py +148 -82
  8. phoenix/datasets/tracing.py +19 -0
  9. phoenix/datasets/types.py +18 -52
  10. phoenix/db/insertion/dataset.py +19 -16
  11. phoenix/db/migrations/versions/10460e46d750_datasets.py +2 -2
  12. phoenix/db/models.py +8 -3
  13. phoenix/server/api/context.py +2 -0
  14. phoenix/server/api/dataloaders/__init__.py +2 -0
  15. phoenix/server/api/dataloaders/experiment_run_counts.py +42 -0
  16. phoenix/server/api/helpers/dataset_helpers.py +8 -7
  17. phoenix/server/api/input_types/ClearProjectInput.py +15 -0
  18. phoenix/server/api/mutations/project_mutations.py +9 -4
  19. phoenix/server/api/routers/v1/datasets.py +146 -42
  20. phoenix/server/api/routers/v1/experiment_evaluations.py +1 -0
  21. phoenix/server/api/routers/v1/experiment_runs.py +2 -2
  22. phoenix/server/api/types/Experiment.py +5 -0
  23. phoenix/server/api/types/ExperimentRun.py +1 -1
  24. phoenix/server/api/types/ExperimentRunAnnotation.py +1 -1
  25. phoenix/server/api/types/Span.py +1 -0
  26. phoenix/server/app.py +2 -0
  27. phoenix/server/static/index.js +638 -588
  28. phoenix/session/client.py +124 -2
  29. phoenix/trace/schemas.py +1 -2
  30. phoenix/version.py +1 -1
  31. {arize_phoenix-4.4.4rc3.dist-info → arize_phoenix-4.4.4rc5.dist-info}/WHEEL +0 -0
  32. {arize_phoenix-4.4.4rc3.dist-info → arize_phoenix-4.4.4rc5.dist-info}/licenses/IP_NOTICE +0 -0
  33. {arize_phoenix-4.4.4rc3.dist-info → arize_phoenix-4.4.4rc5.dist-info}/licenses/LICENSE +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: arize-phoenix
3
- Version: 4.4.4rc3
3
+ Version: 4.4.4rc5
4
4
  Summary: AI Observability and Evaluation
5
5
  Project-URL: Documentation, https://docs.arize.com/phoenix/
6
6
  Project-URL: Issues, https://github.com/Arize-ai/phoenix/issues
@@ -31,7 +31,7 @@ Requires-Dist: openinference-instrumentation
31
31
  Requires-Dist: openinference-instrumentation-langchain>=0.1.12
32
32
  Requires-Dist: openinference-instrumentation-llama-index>=1.2.0
33
33
  Requires-Dist: openinference-instrumentation-openai>=0.1.4
34
- Requires-Dist: openinference-semantic-conventions>=0.1.5
34
+ Requires-Dist: openinference-semantic-conventions>=0.1.9
35
35
  Requires-Dist: opentelemetry-exporter-otlp
36
36
  Requires-Dist: opentelemetry-proto>=1.12.0
37
37
  Requires-Dist: opentelemetry-sdk
@@ -5,17 +5,20 @@ phoenix/exceptions.py,sha256=n2L2KKuecrdflB9MsCdAYCiSEvGJptIsfRkXMoJle7A,169
5
5
  phoenix/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
6
6
  phoenix/services.py,sha256=aTxhcOA1pZHB6U-B3TEcp6fqDF5oT0xCUvEUNMZVTUQ,5175
7
7
  phoenix/settings.py,sha256=cO-qgis_S27nHirTobYI9hHPfZH18R--WMmxNdsVUwc,273
8
- phoenix/version.py,sha256=vkzG2Z0dkYNWJYkiDnpu7yJxir6A-qjTBfFVeklU7TY,25
8
+ phoenix/version.py,sha256=-Vg_bLotyeJdv0gFqG5-A64nsG-6AR0xZSp3sDDsV_w,25
9
9
  phoenix/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  phoenix/core/embedding_dimension.py,sha256=zKGbcvwOXgLf-yrJBpQyKtd-LEOPRKHnUToyAU8Owis,87
11
11
  phoenix/core/model.py,sha256=km_a--PBHOuA337ClRw9xqhOHhrUT6Rl9pz_zV0JYkQ,4843
12
12
  phoenix/core/model_schema.py,sha256=F2dbbVnkDLsPYoyZDv1q03uhvP8LcU1wXp0g-exiWs0,50551
13
13
  phoenix/core/model_schema_adapter.py,sha256=0Tm_Y_gV-WED8fKBCaFXAEFwE3CTEZS1dowqnTZ7x7g,8426
14
14
  phoenix/datasets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
- phoenix/datasets/evaluators.py,sha256=_ezCRB6cyhuJsxsQJRFt2CKg3vqV-KgtBi9NNtkdeJQ,10410
16
- phoenix/datasets/experiments.py,sha256=D1gmdCbOC7tkjrFjyC_WPNFyY41YrqKESsPE0CebTtc,19223
17
- phoenix/datasets/tracing.py,sha256=Ieb2Uo-9qHpmv65uf1VsFSsWo5Yxj6VHwGS6dxu9NHQ,2248
18
- phoenix/datasets/types.py,sha256=w0KoSP7AdlcFlV3I6qVtvKOOWoK0yiY6_s4CvH0flcs,5753
15
+ phoenix/datasets/experiments.py,sha256=RzZezHQcTpPcr7gY9rGtoYlfoesFNhNV7EO5f_oHNFk,21198
16
+ phoenix/datasets/tracing.py,sha256=wVpt8Ie9WNPoi1djJdcrkwCokHdTO0bicXViLg3O-1Y,2831
17
+ phoenix/datasets/types.py,sha256=N17mnnVwmu1k3bnmbyROPt_6TxPaZY_QkOZmCOR5_jE,4835
18
+ phoenix/datasets/evaluators/__init__.py,sha256=KSr9fNG4O93swYxNdPj_UihP9Itl_5mj0a492wi_4_0,465
19
+ phoenix/datasets/evaluators/code_evaluators.py,sha256=DdCcAi274t_TLs_aARd-GmWWpJrxVeNEAegMFEAfe0E,3894
20
+ phoenix/datasets/evaluators/llm_evaluators.py,sha256=aVfAHOWhskBiy0IVeq_ACTs7B37uXTTtDoNBS0XenIc,9165
21
+ phoenix/datasets/evaluators/utils.py,sha256=S7OGrb1sBWg5l9K35X29OKJe5wZ3k7xMhxJBclzxta0,10452
19
22
  phoenix/db/README.md,sha256=IvKaZyf9ECbGBYYePaRhBveKZwDbxAc-c7BMxJYZh6Q,595
20
23
  phoenix/db/__init__.py,sha256=pDjEFXukHmJBM-1D8RjmXkvLsz85YWNxMQczt81ec3A,118
21
24
  phoenix/db/alembic.ini,sha256=p8DjVqGUs_tTx8oU56JP7qj-rMUebNFizItUSv_hPhs,3763
@@ -23,9 +26,9 @@ phoenix/db/bulk_inserter.py,sha256=zbZGWZFDybKaGLGzpxgLwxAS5sC0_wXcvM0be4kUhh8,1
23
26
  phoenix/db/engines.py,sha256=vLWaZlToMtDI7rJDxSidYkfOoojamxaZxaz8ND3zTus,4770
24
27
  phoenix/db/helpers.py,sha256=L2_jP1iIWpUREhKLYYb4_vf_6v_BiU1E73Z2PczGm6s,1589
25
28
  phoenix/db/migrate.py,sha256=MuhtNWnR24riROvarvKfbRb4_D5xuQi6P760vBUKl1E,2270
26
- phoenix/db/models.py,sha256=zzZHXh1NpS3LyOOFp1BS7aVyrU1Qx3gcBY-H8ouoyjg,20282
29
+ phoenix/db/models.py,sha256=lYzI3tCDUl8njXb3Vf3R8e6y56-MErprjjfBE-o9Kao,20419
27
30
  phoenix/db/insertion/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
28
- phoenix/db/insertion/dataset.py,sha256=2aBOTgjwRkmJqjE1FEQp7BTu1Jz4-bS1bKyeJgvSxfg,7305
31
+ phoenix/db/insertion/dataset.py,sha256=_vxy5e6W5jEuvO2fMKbbNCn9JvHkwI4LRKk_10eKFVg,7171
29
32
  phoenix/db/insertion/evaluation.py,sha256=fAerUy3QGf2wID_tiVmPvzxBDFGiONPl3pmpZDgJDWQ,7183
30
33
  phoenix/db/insertion/helpers.py,sha256=7tf6qQyJ05nn3IXaZEpj2b4Jz5boGLWT8tzlMaJ9tQY,2337
31
34
  phoenix/db/insertion/span.py,sha256=DNBjSrx5g2W5KuTB1dkHwtkb0SFnMIxN1jB-BAdGKFY,5634
@@ -33,7 +36,7 @@ phoenix/db/migrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3h
33
36
  phoenix/db/migrations/env.py,sha256=QbzB5zrRs6XQQmrYeUpuzeilcMlM-MsbaAgHHYcIHTI,3626
34
37
  phoenix/db/migrations/script.py.mako,sha256=MEqL-2qATlST9TAOeYgscMn1uy6HUS9NFvDgl93dMj8,635
35
38
  phoenix/db/migrations/types.py,sha256=Frq1AKSyBKQQ0FLzON-EmgTqE4kNkOpHMsbWnI-WgCE,605
36
- phoenix/db/migrations/versions/10460e46d750_datasets.py,sha256=RapdD9Sud_Gq45Vpz7VnDQB_toG6B6yHlwS93qAh_0c,8133
39
+ phoenix/db/migrations/versions/10460e46d750_datasets.py,sha256=l69yZfScFrjfZZpY0gnqwhsDUEctLeo02qMgA_aOGDg,8155
37
40
  phoenix/db/migrations/versions/cf03bd6bae1d_init.py,sha256=CbWT3ZTR0CZqeT3zWLoTWhboFmnOy3Ju1z6Ztpq8WIM,8122
38
41
  phoenix/inferences/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
39
42
  phoenix/inferences/errors.py,sha256=cGp9vxnw4SewFoWBV3ZGMkhE0Kh73lPIv3Ppz_H_RoA,8261
@@ -55,18 +58,18 @@ phoenix/pointcloud/pointcloud.py,sha256=4zAIkKs2xOUbchpj4XDAV-iPMXrfAJ15TG6rlIYG
55
58
  phoenix/pointcloud/projectors.py,sha256=zO_RrtDYSv2rqVOfIP2_9Cv11Dc8EmcZR94xhFcBYPU,1057
56
59
  phoenix/pointcloud/umap_parameters.py,sha256=lJsEOrbSuSiqI7g4Yt6xj7kgYxEqoep4ZHWLr6VWBqw,1760
57
60
  phoenix/server/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
58
- phoenix/server/app.py,sha256=_D2DgupKJHv8DmS6VgWxvygumSM75qdlDg6qSj61PRU,18227
61
+ phoenix/server/app.py,sha256=LQrHWt5HG_pWqnR9Ozb3-vnAGiiRGuZ3uV_9-886Yxw,18340
59
62
  phoenix/server/grpc_server.py,sha256=faktLxEtWGlCB1bPR4QwwTsRoQloahKMx0hAWqRGI5s,3379
60
63
  phoenix/server/main.py,sha256=mtzH_2Kyvuy3AHiiKfqiCdUQ6SGFzeT4q9fefbV6GLg,11114
61
64
  phoenix/server/prometheus.py,sha256=j9DHB2fERuq_ZKmwVaqR-9wx5WcPPuU1Cm5Bhg5241Y,2996
62
65
  phoenix/server/telemetry.py,sha256=T_2OKrxNViAeaANlNspEekg_Y5uZIFWvKAnpz8Aoqvk,2762
63
66
  phoenix/server/thread_server.py,sha256=dP6cm6Cf08jNhDA1TRlVZpziu1YgtPDmaeIJMm725eI,2154
64
67
  phoenix/server/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
65
- phoenix/server/api/context.py,sha256=GfAD9QHg5erKwYGpqDj_8bL2GwmccARDZQc8yO-4Fm0,2669
68
+ phoenix/server/api/context.py,sha256=jb69SVdb5hpVbfM4U0pZi4sGa2a-0VKOJWcBjjS7l4s,2761
66
69
  phoenix/server/api/interceptor.py,sha256=ykDnoC_apUd-llVli3m1CW18kNSIgjz2qZ6m5JmPDu8,1294
67
70
  phoenix/server/api/queries.py,sha256=wp5BlapuxDIoaQJm7mzG0dURfVxR32vXSJVC0JqG4_Y,19845
68
71
  phoenix/server/api/schema.py,sha256=BcxdqO5CSGqpKd-AAJHMjFlzaK9oJA8GJuxmMfcdjn4,434
69
- phoenix/server/api/dataloaders/__init__.py,sha256=9fFjDNlCtOYTOKJi0uPIRh7xJMpCnrBOhoWGEdv1BrI,4618
72
+ phoenix/server/api/dataloaders/__init__.py,sha256=urbG3M-k2cpj2ymMLYQ28tzIXAG1edECxM-tJ22ylqE,4720
70
73
  phoenix/server/api/dataloaders/dataset_example_revisions.py,sha256=Vpr5IEKSR4QnAVxE5NM7u92fPNgeHQV2ieYc6JakCj0,3788
71
74
  phoenix/server/api/dataloaders/dataset_example_spans.py,sha256=_jLlo0KdUS65d4PNTtE9aXVyG_NZWgA7VcpNC9udQ8U,1484
72
75
  phoenix/server/api/dataloaders/document_evaluation_summaries.py,sha256=dgAAlD0n8X6oAPLaD-czoefNkDqP338MouWsKaW8bOY,5684
@@ -75,6 +78,7 @@ phoenix/server/api/dataloaders/document_retrieval_metrics.py,sha256=8tZYMNLZ7zxU
75
78
  phoenix/server/api/dataloaders/evaluation_summaries.py,sha256=z9aal3IQL_t30aNqpAS7x4tjq0xNkuEG8dWW-bhqZmo,5724
76
79
  phoenix/server/api/dataloaders/experiment_annotation_summaries.py,sha256=RsQ-o84kWVTYgIlh9VKkyw2kDMWIlHCRpS7RE2aw9vs,2881
77
80
  phoenix/server/api/dataloaders/experiment_error_rates.py,sha256=EHlTdZi8F94vo-qJUcnnXFvuSh_d0fTT0Xg4SfW_A70,1397
81
+ phoenix/server/api/dataloaders/experiment_run_counts.py,sha256=wxHv08aZELJ91KTjHdt_x33M3wGDDa9GfbFHeRyOyGk,1343
78
82
  phoenix/server/api/dataloaders/experiment_sequence_number.py,sha256=Va1KuoHOd-wzvrlKykoV4kLRFW4JsJvGp_DUI4HYZX4,1631
79
83
  phoenix/server/api/dataloaders/latency_ms_quantile.py,sha256=pEc7QjB2iiNOQm_Fmo99F5O_DKOJWgGmcnT0OADJzYE,7423
80
84
  phoenix/server/api/dataloaders/min_start_or_max_end_times.py,sha256=IoFX5PtSpvQdMk_7-oB8TpIse3Q4PMxep4qKggkHpzo,2902
@@ -89,9 +93,10 @@ phoenix/server/api/dataloaders/trace_row_ids.py,sha256=yAWuVFWUjDdmmwfXsGs_l6LuG
89
93
  phoenix/server/api/dataloaders/cache/__init__.py,sha256=SYoOM9n8FJaMdQarma5d1blu-jIg2GB8Shqg5ezSzZ8,106
90
94
  phoenix/server/api/dataloaders/cache/two_tier_cache.py,sha256=I38L1RsOis98OQftE7n1Q9QBZfFJO6OW_qIINkuJllo,2295
91
95
  phoenix/server/api/helpers/__init__.py,sha256=_V1eVkchZmTkhOfRC4QqR1sUB2xtIxdsMJkDouZq_IE,251
92
- phoenix/server/api/helpers/dataset_helpers.py,sha256=kIo_kPrV8O40CUypB57JCB5Ek3GJmZXPlz6NIULIsSM,6875
96
+ phoenix/server/api/helpers/dataset_helpers.py,sha256=A6UzEyAb4gFtyc_AV63_yl9OpN0vn8Vw1BBCTNjg9J0,6875
93
97
  phoenix/server/api/input_types/AddExamplesToDatasetInput.py,sha256=ZGXMV0H3DYHi4DdqGhejDzaWdFinyem1Mc8DVA7iCh0,436
94
98
  phoenix/server/api/input_types/AddSpansToDatasetInput.py,sha256=C4oZ0WqYqca1kleNOCMIM2_aY6Qnc5n1xXG51_C1V0w,368
99
+ phoenix/server/api/input_types/ClearProjectInput.py,sha256=cpPFRyQ3ffy2dLbCZgYpway-mCzhdm4QqnUg8caOBfQ,382
95
100
  phoenix/server/api/input_types/ClusterInput.py,sha256=EL4ftvZxQ8mVdruUPcdhMhByORmSmM8S-X6RPqU6GX0,179
96
101
  phoenix/server/api/input_types/Coordinates.py,sha256=meTwbIjwTfqx5DGD2DBlH9wQzdQVNM5a8x9dp1FfIgA,173
97
102
  phoenix/server/api/input_types/CreateDatasetInput.py,sha256=Q3MwouIx9jTQBRWDju75iMQXEGJCrL4aD4ESQp771nc,248
@@ -116,7 +121,7 @@ phoenix/server/api/mutations/auth.py,sha256=vPRFoj7J6PV6QeODewG4K0PhoOebS5AfMRpb
116
121
  phoenix/server/api/mutations/dataset_mutations.py,sha256=Zp2sFWyGyubILUQboR6bafRWafsfeRO2ffUWnkLlfgI,22532
117
122
  phoenix/server/api/mutations/experiment_mutations.py,sha256=Fw_yEdITGJ6A33M5JZ-2YnBTDoBqZUUFON6vy8JoVjE,2569
118
123
  phoenix/server/api/mutations/export_events_mutations.py,sha256=t_wYBxaqvBJYRoHslh3Bmoxmwlzoy0u8SsBKWIKN5hE,4028
119
- phoenix/server/api/mutations/project_mutations.py,sha256=6A7BS3651iaeAwUszKXQB3NK4QJY_tGpALBMNw1bqp8,2021
124
+ phoenix/server/api/mutations/project_mutations.py,sha256=3SVDCZqxB0Iv60cOwBL8c-rY3QUUPs8PXbp-C_K1mWY,2267
120
125
  phoenix/server/api/openapi/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
121
126
  phoenix/server/api/openapi/main.py,sha256=WY0pj3B7siQyyYqKyhqnzWC7P8MtEtiukOBUjGwLXfw,153
122
127
  phoenix/server/api/openapi/schema.py,sha256=uuSYe1Ecu72aXRgTNjyMu-9ZPE13DAHJPKtedS-MsSs,451
@@ -124,10 +129,10 @@ phoenix/server/api/routers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJ
124
129
  phoenix/server/api/routers/utils.py,sha256=M41BoH-fl37izhRuN2aX7lWm7jOC20A_3uClv9TVUUY,583
125
130
  phoenix/server/api/routers/v1/__init__.py,sha256=B5eSaylPI7MoYia1-VgKrU8rDi-69r_hRwPU5yMLUTE,2808
126
131
  phoenix/server/api/routers/v1/dataset_examples.py,sha256=wtplRUv2ee9xGTrcEMgTn-7L4NX_73IcwUXkCMZEFc4,6726
127
- phoenix/server/api/routers/v1/datasets.py,sha256=2wkBOLqo8ttSN1VNVEcnPcLCitkSLGp62AjWlxJhV4Y,27605
132
+ phoenix/server/api/routers/v1/datasets.py,sha256=ws2Guou9mspwFx3-cBFZoD2VuTwWGoFZmtt2Sr3zg6k,31516
128
133
  phoenix/server/api/routers/v1/evaluations.py,sha256=rwSVg-rpujhsMcDVFt-VAr0Ix9TgvLcY_bSxeh8PzJI,9241
129
- phoenix/server/api/routers/v1/experiment_evaluations.py,sha256=xemnZ10WL5pErndP7jbaDipUj3Tkl813XSIjx7X5MBY,2656
130
- phoenix/server/api/routers/v1/experiment_runs.py,sha256=o6IvcyFDY-cy3KqeO9FIKy3XAgbIJhx7SFUoxML-MeY,4337
134
+ phoenix/server/api/routers/v1/experiment_evaluations.py,sha256=xhrkPUc_4ncIBm24aUyzu47UU0CN1tGlbisn-oLqt_Y,2702
135
+ phoenix/server/api/routers/v1/experiment_runs.py,sha256=0AUNHA5nvpGDeoJUGK8VxP2TFN3iPwhMW3D9QmHstPk,4399
131
136
  phoenix/server/api/routers/v1/experiments.py,sha256=5Rh7q6sHswmk11PZSJ7KMrtqfIE16X_xSKkKSASK9-I,7251
132
137
  phoenix/server/api/routers/v1/spans.py,sha256=FEnmlRPBPl71BSGNBuPrz14fk8nmxJQYsKECdDbdUdw,3977
133
138
  phoenix/server/api/routers/v1/traces.py,sha256=dYEf5pThenAQCgfQljHdrnwd4tC_tAXm6Kvk6GphPYs,2774
@@ -154,11 +159,11 @@ phoenix/server/api/types/EvaluationSummary.py,sha256=EFucuzAhcxR9sdEn6WNAtmAGJk-
154
159
  phoenix/server/api/types/Event.py,sha256=XdYgaIxcVIW-YFViCkxj5l9OaVNepyIrCtm5Iqg2le8,3989
155
160
  phoenix/server/api/types/EventMetadata.py,sha256=-J0tYF9eZTHwCjwxQHY7Gckr2_MNW5OoWT1mydweZNM,635
156
161
  phoenix/server/api/types/ExampleRevisionInterface.py,sha256=gV3Gt9-3Oi5wjaVtepC6nOt3FzTzZFD1KebNnqiw56E,294
157
- phoenix/server/api/types/Experiment.py,sha256=Lon2ZNZYdWXQmj3nLr_TXN8CCtZtC-AXYfyJuoqI2DM,4692
162
+ phoenix/server/api/types/Experiment.py,sha256=Cs0EKhVLI5l5LKFI0hQA-ekZuaiJcOHT88JGFBa2deU,4906
158
163
  phoenix/server/api/types/ExperimentAnnotationSummary.py,sha256=Uk3JtxIrsMoZT5tqc4nJdUOM3XegVzjUyoV3pkjNotE,256
159
164
  phoenix/server/api/types/ExperimentComparison.py,sha256=0sFz6MoBDw39dds0qVyaqhVs9qqO5rkG1FMSjmfBeCc,441
160
- phoenix/server/api/types/ExperimentRun.py,sha256=uM7HxaC8nEjtO7yLr8WjLEfYRvEvbX6ibR8I0fVzdeU,2976
161
- phoenix/server/api/types/ExperimentRunAnnotation.py,sha256=GvWY6wukBhSr2Tk9Ef0R5bH5yCMxVakqeypoyYoUb6o,1774
165
+ phoenix/server/api/types/ExperimentRun.py,sha256=8jUIi3ApVCqQHwnYe59CYhrmh5iZ6-QmlH5WpF7UWtM,2990
166
+ phoenix/server/api/types/ExperimentRunAnnotation.py,sha256=zGstMbS5OxNikEhD8VouY7Ls7YbxKm-0EmqvGeY3-DI,1773
162
167
  phoenix/server/api/types/ExportedFile.py,sha256=e3GTn7B5LgsTbqiwjhMCQH7VsiqXitrBO4aCMS1lHsg,163
163
168
  phoenix/server/api/types/Functionality.py,sha256=tzV9xdhB8zqfsjWxP66NDC7EZsplYkYO7jRbLWJIeeg,382
164
169
  phoenix/server/api/types/Inferences.py,sha256=HWuDZZrXPWVoEy_pA3bRsAOUYsCKgAxf9zshasGqu5Y,3403
@@ -173,7 +178,7 @@ phoenix/server/api/types/Retrieval.py,sha256=OhMK2ncjoyp5h1yjKhjlKpoTbQrMHuxmgSF
173
178
  phoenix/server/api/types/ScalarDriftMetricEnum.py,sha256=IUAcRPpgL41WdoIgK6cNk2Te38SspXGyEs-S1fY23_A,232
174
179
  phoenix/server/api/types/Segments.py,sha256=m2yoegrxA1Tn7ZAy1rMjjD1isc752MaAXMoffkBlvrM,2921
175
180
  phoenix/server/api/types/SortDir.py,sha256=OUpXhlCzCxPoXSDkJJygEs9Rw9pMymfaZUG5zPTrw4Y,152
176
- phoenix/server/api/types/Span.py,sha256=Nk0Of6JyHSI7OqrEodyV3d5UUvzCWnDkNSZUcmCvq-I,13837
181
+ phoenix/server/api/types/Span.py,sha256=W4Rsg85bgqbDhgYwpjgOTrIQKbkwpFQPpL6nqMyzhCs,13865
177
182
  phoenix/server/api/types/TimeSeries.py,sha256=wjzuxHFqCey0O7Ys25qiXyuqXK8an-osyNWUE8A_8G4,5227
178
183
  phoenix/server/api/types/Trace.py,sha256=ep-mPexub1ijxAnBvc2KrGsNVXO2SfDR1WxqER2wcD8,2376
179
184
  phoenix/server/api/types/UMAPPoints.py,sha256=5sOuruzM8saXa8C2XiyUfk2XPrkVGmhqKpclMYRw1dk,1656
@@ -194,12 +199,12 @@ phoenix/server/static/apple-touch-icon-76x76.png,sha256=CT_xT12I0u2i0WU8JzBZBuOQ
194
199
  phoenix/server/static/apple-touch-icon.png,sha256=fOfpjqGpWYbJ0eAurKsyoZP1EAs6ZVooBJ_SGk2ZkDs,3801
195
200
  phoenix/server/static/favicon.ico,sha256=bY0vvCKRftemZfPShwZtE93DiiQdaYaozkPGwNFr6H8,34494
196
201
  phoenix/server/static/index.css,sha256=KKGpx4iwF91VGRm0YN-4cn8oC-oIqC6HecoPf0x3ZM8,1885
197
- phoenix/server/static/index.js,sha256=88OQ_pBKrFdD5usFU6Frpm1vBzxL19zO4JS9ChoHWEo,3487681
202
+ phoenix/server/static/index.js,sha256=I9Y8svcPruUrXklKcZUxFz5HfLB0vOwczYLSwLAs_04,3500011
198
203
  phoenix/server/static/modernizr.js,sha256=mvK-XtkNqjOral-QvzoqsyOMECXIMu5BQwSVN_wcU9c,2564
199
204
  phoenix/server/templates/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
200
205
  phoenix/server/templates/index.html,sha256=S4z7qSoNSwnKFAH9r96AR-YJEyoKMd-VMWVlJ_IdzME,2039
201
206
  phoenix/session/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
202
- phoenix/session/client.py,sha256=tq2qghwYa_mxNYLD41UNfD7n57msB9lYIe5H16lBqoo,20333
207
+ phoenix/session/client.py,sha256=R7dV38yjkIQa522nhG6jhDllWcXft2JJ7RlcPYpqiiQ,24846
203
208
  phoenix/session/data_extractor.py,sha256=dwhiDu-ISaXr8UI9I-CszZhB5BlUNmdDopjFZvMIXMw,2101
204
209
  phoenix/session/evaluation.py,sha256=aKeV8UVOyq3b7CYOwt3cWuLz0xzvMjX7vlEPILJ_fcs,5311
205
210
  phoenix/session/session.py,sha256=rjIuSSK2gAYIUPQTJc4E2ebew5o6I070FWRoFn4W3EI,26620
@@ -211,7 +216,7 @@ phoenix/trace/exporter.py,sha256=eAYemdvDCHMugDJiaR29BFFMTQBdf3oerdkz34Cl3hE,473
211
216
  phoenix/trace/fixtures.py,sha256=gBGFG2gkcBsSDzolzzR9AJDrB_fdOQfUaGgHV-EHdco,14204
212
217
  phoenix/trace/otel.py,sha256=WA720jvRadiZBAKjsYoPyXzypHwbyEK2OZRVUwtbjB8,9976
213
218
  phoenix/trace/projects.py,sha256=2BwlNjFE-uwpqYtCu5YyBiYZk9wRPpM13vh3-Cv7GkA,2157
214
- phoenix/trace/schemas.py,sha256=JiFKhGD2JF6Eai7UOhPF5urcuKGkpMLHc3Vltbe1msk,5967
219
+ phoenix/trace/schemas.py,sha256=Mjc6fD9OyeMnEk5wPPSbveqnNUYWK3p3BxpOvSGanHU,5950
215
220
  phoenix/trace/span_evaluations.py,sha256=GaADtJLi2njra4aYaie0BIwkSgdxPB_SNseglI4ykZA,13104
216
221
  phoenix/trace/span_json_decoder.py,sha256=IAFakPRqSMYxTPKYFMiXYxm7U-FipdN8_xbvapDS0Qc,3131
217
222
  phoenix/trace/span_json_encoder.py,sha256=tzSCIQJbeFBm33K68G8A5M12n_86tCDyuU0WAobxEz4,2010
@@ -239,8 +244,8 @@ phoenix/utilities/logging.py,sha256=lDXd6EGaamBNcQxL4vP1au9-i_SXe0OraUDiJOcszSw,
239
244
  phoenix/utilities/project.py,sha256=qWsvKnG1oKhOFUowXf9qiOL2ia7jaFe_ijFFHEt8GJo,431
240
245
  phoenix/utilities/re.py,sha256=PDve_OLjRTM8yQQJHC8-n3HdIONi7aNils3ZKRZ5uBM,2045
241
246
  phoenix/utilities/span_store.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
242
- arize_phoenix-4.4.4rc3.dist-info/METADATA,sha256=VuX8kXsqxcbsdYmi9-jCDMHgMJ182JMbDYCY-3N74jU,11012
243
- arize_phoenix-4.4.4rc3.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
244
- arize_phoenix-4.4.4rc3.dist-info/licenses/IP_NOTICE,sha256=JBqyyCYYxGDfzQ0TtsQgjts41IJoa-hiwDrBjCb9gHM,469
245
- arize_phoenix-4.4.4rc3.dist-info/licenses/LICENSE,sha256=HFkW9REuMOkvKRACuwLPT0hRydHb3zNg-fdFt94td18,3794
246
- arize_phoenix-4.4.4rc3.dist-info/RECORD,,
247
+ arize_phoenix-4.4.4rc5.dist-info/METADATA,sha256=yT0gbMlPkiRkZeC8Yj_eLyaufriREVn3jxz5-qTKDjI,11012
248
+ arize_phoenix-4.4.4rc5.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
249
+ arize_phoenix-4.4.4rc5.dist-info/licenses/IP_NOTICE,sha256=JBqyyCYYxGDfzQ0TtsQgjts41IJoa-hiwDrBjCb9gHM,469
250
+ arize_phoenix-4.4.4rc5.dist-info/licenses/LICENSE,sha256=HFkW9REuMOkvKRACuwLPT0hRydHb3zNg-fdFt94td18,3794
251
+ arize_phoenix-4.4.4rc5.dist-info/RECORD,,
@@ -0,0 +1,18 @@
1
+ from phoenix.datasets.evaluators.code_evaluators import ContainsKeyword, JSONParsable
2
+ from phoenix.datasets.evaluators.llm_evaluators import (
3
+ CoherenceEvaluator,
4
+ ConcisenessEvaluator,
5
+ HelpfulnessEvaluator,
6
+ LLMCriteriaEvaluator,
7
+ RelevanceEvaluator,
8
+ )
9
+
10
+ __all__ = [
11
+ "ContainsKeyword",
12
+ "JSONParsable",
13
+ "CoherenceEvaluator",
14
+ "ConcisenessEvaluator",
15
+ "LLMCriteriaEvaluator",
16
+ "HelpfulnessEvaluator",
17
+ "RelevanceEvaluator",
18
+ ]
@@ -0,0 +1,99 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import re
5
+ from typing import Any, List, Optional, Union
6
+
7
+ from phoenix.datasets.evaluators.utils import Evaluator
8
+ from phoenix.datasets.types import EvaluationResult, TaskOutput
9
+
10
+
11
+ class JSONParsable(Evaluator):
12
+ def evaluate(self, *, output: Optional[TaskOutput] = None, **_: Any) -> EvaluationResult:
13
+ assert isinstance(output, str), "Experiment run output must be a string"
14
+ try:
15
+ json.loads(output)
16
+ json_parsable = True
17
+ except BaseException:
18
+ json_parsable = False
19
+ return EvaluationResult(
20
+ score=int(json_parsable),
21
+ )
22
+
23
+
24
+ class ContainsKeyword(Evaluator):
25
+ def __init__(self, keyword: str, name: Optional[str] = None) -> None:
26
+ self.keyword = keyword
27
+ self._name = name or f"Contains({repr(keyword)})"
28
+
29
+ def evaluate(self, *, output: Optional[TaskOutput] = None, **_: Any) -> EvaluationResult:
30
+ assert isinstance(output, str), "Experiment run output must be a string"
31
+ found = self.keyword in output
32
+ return EvaluationResult(
33
+ score=float(found),
34
+ explanation=(
35
+ f"the string {repr(self.keyword)} was "
36
+ f"{'found' if found else 'not found'} in the output"
37
+ ),
38
+ )
39
+
40
+
41
+ class ContainsAnyKeyword(Evaluator):
42
+ def __init__(self, keywords: List[str], name: Optional[str] = None) -> None:
43
+ self.keywords = keywords
44
+ self._name = name or f"ContainsAny({keywords})"
45
+
46
+ def evaluate(self, *, output: Optional[TaskOutput] = None, **_: Any) -> EvaluationResult:
47
+ assert isinstance(output, str), "Experiment run output must be a string"
48
+ found = [keyword for keyword in self.keywords if keyword in output]
49
+ if found:
50
+ explanation = f"the keywords {found} were found in the output"
51
+ else:
52
+ explanation = f"none of the keywords {self.keywords} were found in the output"
53
+ return EvaluationResult(
54
+ score=float(bool(found)),
55
+ explanation=explanation,
56
+ )
57
+
58
+
59
+ class ContainsAllKeywords(Evaluator):
60
+ def __init__(self, keywords: List[str], name: Optional[str] = None) -> None:
61
+ self.keywords = keywords
62
+ self._name = name or f"ContainsAll({keywords})"
63
+
64
+ def evaluate(self, *, output: Optional[TaskOutput] = None, **_: Any) -> EvaluationResult:
65
+ assert isinstance(output, str), "Experiment run output must be a string"
66
+ not_found = [keyword for keyword in self.keywords if keyword not in output]
67
+ if not_found:
68
+ contains_all = False
69
+ explanation = f"the keywords {not_found} were not found in the output"
70
+ else:
71
+ contains_all = True
72
+ explanation = f"all of the keywords {self.keywords} were found in the output"
73
+ return EvaluationResult(
74
+ score=float(contains_all),
75
+ explanation=explanation,
76
+ )
77
+
78
+
79
+ class MatchesRegex(Evaluator):
80
+ def __init__(self, pattern: Union[str, re.Pattern[str]], name: Optional[str] = None) -> None:
81
+ if isinstance(pattern, str):
82
+ pattern = re.compile(pattern)
83
+ self.pattern = pattern
84
+ assert isinstance(pattern, re.Pattern)
85
+ self._name = name or f"matches_({pattern})"
86
+
87
+ def evaluate(self, *, output: Optional[TaskOutput] = None, **_: Any) -> EvaluationResult:
88
+ assert isinstance(output, str), "Experiment run output must be a string"
89
+ matches = self.pattern.findall(output)
90
+ if matches:
91
+ explanation = (
92
+ f"the substrings {matches} matched the regex pattern {self.pattern.pattern}"
93
+ )
94
+ else:
95
+ explanation = f"no substrings matched the regex pattern {self.pattern.pattern}"
96
+ return EvaluationResult(
97
+ score=float(bool(matches)),
98
+ explanation=explanation,
99
+ )
@@ -1,72 +1,23 @@
1
- import json
2
1
  import re
3
- from typing import TYPE_CHECKING, Callable, Optional, Type
2
+ from types import MappingProxyType
3
+ from typing import Any, Callable, Optional, Type
4
4
 
5
+ from phoenix.datasets.evaluators.utils import (
6
+ ExampleInput,
7
+ ExampleMetadata,
8
+ ExperimentEvaluator,
9
+ LLMEvaluator,
10
+ _unwrap_json,
11
+ )
5
12
  from phoenix.datasets.types import (
6
13
  EvaluationResult,
7
- Example,
8
- ExperimentEvaluator,
9
- ExperimentRun,
10
- JSONSerializable,
14
+ TaskOutput,
11
15
  )
12
16
  from phoenix.evals.models.base import BaseModel as LLMBaseModel
13
17
  from phoenix.evals.utils import snap_to_rail
14
18
 
15
19
 
16
- def _unwrap_json(obj: JSONSerializable) -> JSONSerializable:
17
- if isinstance(obj, dict):
18
- if len(obj) == 1:
19
- key = next(iter(obj.keys()))
20
- output = obj[key]
21
- assert isinstance(
22
- output, (dict, list, str, int, float, bool, type(None))
23
- ), "Output must be JSON serializable"
24
- return output
25
- return obj
26
-
27
-
28
- class JSONParsable:
29
- annotator_kind = "CODE"
30
- name = "JSONParsable"
31
-
32
- def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
33
- assert exp_run.output is not None
34
- output = _unwrap_json(exp_run.output.result)
35
- assert isinstance(output, str), "Experiment run output must be a string"
36
- try:
37
- json.loads(output)
38
- json_parsable = True
39
- except BaseException:
40
- json_parsable = False
41
- return EvaluationResult(
42
- score=int(json_parsable),
43
- )
44
-
45
-
46
- class ContainsKeyword:
47
- annotator_kind = "CODE"
48
-
49
- def __init__(self, keyword: str) -> None:
50
- super().__init__()
51
- self.keyword = keyword
52
- self.name = f"ContainsKeyword({keyword})"
53
-
54
- def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
55
- assert exp_run.output is not None
56
- result = _unwrap_json(exp_run.output.result)
57
- assert isinstance(result, str), "Experiment run output must be a string"
58
- found = self.keyword in result
59
- return EvaluationResult(
60
- score=float(found),
61
- explanation=(
62
- f"the string {repr(self.keyword)} was "
63
- f"{'found' if found else 'not found'} in the output"
64
- ),
65
- )
66
-
67
-
68
- class LLMCriteriaEvaluator:
69
- annotator_kind = "LLM"
20
+ class LLMCriteriaEvaluator(LLMEvaluator):
70
21
  _base_template = (
71
22
  "Determine if the following text is {criteria}. {description}"
72
23
  "First, explain step-by-step why you think the text is or is not {criteria}. Then provide "
@@ -77,7 +28,7 @@ class LLMCriteriaEvaluator:
77
28
  "EXPLANATION: *a step by step explanation of your reasoning for whether the text meets "
78
29
  "the criteria*\n"
79
30
  "LABEL: *true or false*\n\n"
80
- "Follow this template for the following text:\n\n"
31
+ "Follow this template for the following example:\n\n"
81
32
  "CRITERIA: the text is '{criteria}'\n"
82
33
  "TEXT: {text}\n"
83
34
  "EXPLANATION: "
@@ -95,21 +46,23 @@ class LLMCriteriaEvaluator:
95
46
  self.criteria = criteria
96
47
  self.description = description
97
48
  self.template = self._format_base_template(self.criteria, self.description)
98
- self.name = name
49
+ self._name = name
99
50
 
100
- def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
101
- formatted_template = self._format_eval_template(exp_run)
51
+ def evaluate(self, *, output: Optional[TaskOutput] = None, **_: Any) -> EvaluationResult:
52
+ formatted_template = self._format_eval_template(output)
102
53
  unparsed_response = self.model._generate(formatted_template)
103
54
  return self._parse_eval_output(unparsed_response)
104
55
 
105
- async def async_evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
106
- formatted_template = self._format_eval_template(exp_run)
56
+ async def async_evaluate(
57
+ self, *, output: Optional[TaskOutput] = None, **_: Any
58
+ ) -> EvaluationResult:
59
+ formatted_template = self._format_eval_template(output)
107
60
  unparsed_response = await self.model._async_generate(formatted_template)
108
61
  return self._parse_eval_output(unparsed_response)
109
62
 
110
- def _format_eval_template(self, experiment_run: ExperimentRun) -> str:
111
- assert experiment_run.output is not None
112
- result = _unwrap_json(experiment_run.output.result)
63
+ def _format_eval_template(self, output: TaskOutput) -> str:
64
+ assert output is not None
65
+ result = _unwrap_json(output)
113
66
  return self.template.format(text=str(result))
114
67
 
115
68
  def _parse_eval_output(self, unparsed_response: str) -> EvaluationResult:
@@ -142,40 +95,43 @@ class LLMCriteriaEvaluator:
142
95
 
143
96
 
144
97
  def criteria_evaluator_factory(
145
- class_name: str, criteria: str, description: str
98
+ class_name: str, criteria: str, description: str, default_name: str
146
99
  ) -> Type[ExperimentEvaluator]:
100
+ def _init(self, model: LLMBaseModel, name: str = default_name) -> None: # type: ignore
101
+ LLMCriteriaEvaluator.__init__(self, model, criteria, description, name=name)
102
+
147
103
  return type(
148
104
  class_name,
149
105
  (LLMCriteriaEvaluator,),
150
106
  {
151
- "__init__": lambda self, model: LLMCriteriaEvaluator.__init__(
152
- self, model, criteria, description, name=class_name
153
- ),
107
+ "__init__": _init,
154
108
  "__module__": __name__,
155
- "name": class_name,
156
109
  "template": LLMCriteriaEvaluator._format_base_template(criteria, description),
157
110
  },
158
111
  )
159
112
 
160
113
 
161
- LLMConcisenessEvaluator = criteria_evaluator_factory(
162
- class_name="LLMConcisenessEvaluator",
114
+ ConcisenessEvaluator = criteria_evaluator_factory(
115
+ class_name="ConcisenessEvaluator",
163
116
  criteria="concise",
164
117
  description="is just a few sentences and easy to follow",
118
+ default_name="Conciseness",
165
119
  )
166
120
 
167
121
 
168
- LLMHelpfulnessEvaluator = criteria_evaluator_factory(
169
- class_name="LLMHelpfulnessEvaluator",
122
+ HelpfulnessEvaluator = criteria_evaluator_factory(
123
+ class_name="HelpfulnessEvaluator",
170
124
  criteria="helpful",
171
125
  description="provides useful information",
126
+ default_name="Helpfulness",
172
127
  )
173
128
 
174
129
 
175
- LLMCoherenceEvaluator = criteria_evaluator_factory(
176
- class_name="LLMCoherenceEvaluator",
130
+ CoherenceEvaluator = criteria_evaluator_factory(
131
+ class_name="CoherenceEvaluator",
177
132
  criteria="coherent",
178
- description="is coherent, well-structured, and organized",
133
+ description="is coherent, well-structured, and logically sound",
134
+ default_name="Coherence",
179
135
  )
180
136
 
181
137
 
@@ -192,8 +148,7 @@ def _parse_label_from_explanation(raw_string: str) -> str:
192
148
  return raw_string
193
149
 
194
150
 
195
- class RelevanceEvaluator:
196
- annotator_kind = "LLM"
151
+ class RelevanceEvaluator(LLMEvaluator):
197
152
  template = (
198
153
  "Determine if the following response is relevant to the query. In this context, "
199
154
  "'relevance' means that the response directly addresses the core question or topic of the "
@@ -217,19 +172,24 @@ class RelevanceEvaluator:
217
172
  def __init__(
218
173
  self,
219
174
  model: LLMBaseModel,
220
- get_query: Optional[Callable[[Example, ExperimentRun], str]] = None,
221
- get_response: Optional[Callable[[Example, ExperimentRun], str]] = None,
175
+ get_query: Optional[Callable[[ExampleInput, ExampleMetadata], str]] = None,
176
+ get_response: Optional[Callable[[Optional[TaskOutput], ExampleMetadata], str]] = None,
222
177
  name: str = "RelevanceEvaluator",
223
178
  ):
224
179
  self.model = model
225
- self.name = name
180
+ self._name = name
226
181
  self.get_query = get_query or self._default_get_query
227
182
  self.get_response = get_response or self._default_get_response
228
183
 
229
- def _format_eval_template(self, example: Example, experiment_run: ExperimentRun) -> str:
230
- assert experiment_run.output is not None
231
- query = self.get_query(example, experiment_run)
232
- response = self.get_response(example, experiment_run)
184
+ def _format_eval_template(
185
+ self,
186
+ output: Optional[TaskOutput] = None,
187
+ input: ExampleInput = MappingProxyType({}),
188
+ metadata: ExampleMetadata = MappingProxyType({}),
189
+ ) -> str:
190
+ assert output is not None
191
+ query = self.get_query(input, metadata)
192
+ response = self.get_response(output, metadata)
233
193
  return self.template.format(query=query, response=response)
234
194
 
235
195
  def _parse_eval_output(self, unparsed_response: str) -> EvaluationResult:
@@ -250,26 +210,35 @@ class RelevanceEvaluator:
250
210
  metadata={},
251
211
  )
252
212
 
253
- def _default_get_query(self, example: Example, experiment_run: ExperimentRun) -> str:
254
- return str(example.input)
213
+ def _default_get_query(self, input: ExampleInput, *args: Any, **kwargs: Any) -> str:
214
+ return str(input)
255
215
 
256
- def _default_get_response(self, example: Example, experiment_run: ExperimentRun) -> str:
257
- assert experiment_run.output is not None
258
- return str(_unwrap_json(experiment_run.output.result))
216
+ def _default_get_response(
217
+ self, output: Optional[TaskOutput] = None, *args: Any, **kwargs: Any
218
+ ) -> str:
219
+ assert output is not None
220
+ return str(_unwrap_json(output))
259
221
 
260
- def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
261
- formatted_template = self._format_eval_template(example, exp_run)
222
+ def evaluate(
223
+ self,
224
+ *,
225
+ output: Optional[TaskOutput] = None,
226
+ metadata: ExampleMetadata = MappingProxyType({}),
227
+ input: ExampleInput = MappingProxyType({}),
228
+ **_: Any,
229
+ ) -> EvaluationResult:
230
+ formatted_template = self._format_eval_template(output, input, metadata)
262
231
  unparsed_response = self.model._generate(formatted_template)
263
232
  return self._parse_eval_output(unparsed_response)
264
233
 
265
- async def async_evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
266
- formatted_template = self._format_eval_template(example, exp_run)
234
+ async def async_evaluate(
235
+ self,
236
+ *,
237
+ output: Optional[TaskOutput] = None,
238
+ metadata: ExampleMetadata = MappingProxyType({}),
239
+ input: ExampleInput = MappingProxyType({}),
240
+ **_: Any,
241
+ ) -> EvaluationResult:
242
+ formatted_template = self._format_eval_template(output, input, metadata)
267
243
  unparsed_response = await self.model._async_generate(formatted_template)
268
244
  return self._parse_eval_output(unparsed_response)
269
-
270
-
271
- # Someday we'll do typing checking in unit tests.
272
- if TYPE_CHECKING:
273
- _: ExperimentEvaluator
274
- _ = JSONParsable()
275
- _ = ContainsKeyword("test")