arize-phoenix 4.4.4rc3__tar.gz → 4.4.4rc5__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of arize-phoenix might be problematic. Click here for more details.

Files changed (262)
  1. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/PKG-INFO +2 -2
  2. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/pyproject.toml +2 -1
  3. arize_phoenix-4.4.4rc5/src/phoenix/datasets/evaluators/__init__.py +18 -0
  4. arize_phoenix-4.4.4rc5/src/phoenix/datasets/evaluators/code_evaluators.py +99 -0
  5. arize_phoenix-4.4.4rc3/src/phoenix/datasets/evaluators.py → arize_phoenix-4.4.4rc5/src/phoenix/datasets/evaluators/llm_evaluators.py +75 -106
  6. arize_phoenix-4.4.4rc5/src/phoenix/datasets/evaluators/utils.py +292 -0
  7. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/datasets/experiments.py +148 -82
  8. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/datasets/tracing.py +19 -0
  9. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/datasets/types.py +18 -52
  10. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/db/insertion/dataset.py +19 -16
  11. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/db/migrations/versions/10460e46d750_datasets.py +2 -2
  12. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/db/models.py +8 -3
  13. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/context.py +2 -0
  14. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/__init__.py +2 -0
  15. arize_phoenix-4.4.4rc5/src/phoenix/server/api/dataloaders/experiment_run_counts.py +42 -0
  16. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/helpers/dataset_helpers.py +8 -7
  17. arize_phoenix-4.4.4rc5/src/phoenix/server/api/input_types/ClearProjectInput.py +15 -0
  18. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/mutations/project_mutations.py +9 -4
  19. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/routers/v1/datasets.py +146 -42
  20. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/routers/v1/experiment_evaluations.py +1 -0
  21. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/routers/v1/experiment_runs.py +2 -2
  22. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/Experiment.py +5 -0
  23. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/ExperimentRun.py +1 -1
  24. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/ExperimentRunAnnotation.py +1 -1
  25. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/Span.py +1 -0
  26. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/app.py +2 -0
  27. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/static/index.js +638 -588
  28. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/session/client.py +124 -2
  29. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/schemas.py +1 -2
  30. arize_phoenix-4.4.4rc5/src/phoenix/version.py +1 -0
  31. arize_phoenix-4.4.4rc3/src/phoenix/version.py +0 -1
  32. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/.gitignore +0 -0
  33. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/IP_NOTICE +0 -0
  34. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/LICENSE +0 -0
  35. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/README.md +0 -0
  36. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/examples/manually-instrumented-chatbot/chat-service/chat/__init__.py +0 -0
  37. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/examples/manually-instrumented-chatbot/chat-service/chat/app.py +0 -0
  38. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/examples/manually-instrumented-chatbot/chat-service/chat/types.py +0 -0
  39. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/examples/manually-instrumented-chatbot/frontend/Dockerfile +0 -0
  40. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/examples/manually-instrumented-chatbot/frontend/Makefile +0 -0
  41. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/examples/manually-instrumented-chatbot/frontend/__init__.py +0 -0
  42. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/examples/manually-instrumented-chatbot/frontend/pyproject.toml +0 -0
  43. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/examples/manually-instrumented-chatbot/frontend/requirements.txt +0 -0
  44. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/examples/manually-instrumented-chatbot/frontend/schema.json +0 -0
  45. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/__init__.py +0 -0
  46. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/config.py +0 -0
  47. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/core/__init__.py +0 -0
  48. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/core/embedding_dimension.py +0 -0
  49. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/core/model.py +0 -0
  50. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/core/model_schema.py +0 -0
  51. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/core/model_schema_adapter.py +0 -0
  52. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/datasets/__init__.py +0 -0
  53. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/datetime_utils.py +0 -0
  54. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/db/README.md +0 -0
  55. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/db/__init__.py +0 -0
  56. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/db/alembic.ini +0 -0
  57. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/db/bulk_inserter.py +0 -0
  58. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/db/engines.py +0 -0
  59. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/db/helpers.py +0 -0
  60. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/db/insertion/__init__.py +0 -0
  61. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/db/insertion/evaluation.py +0 -0
  62. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/db/insertion/helpers.py +0 -0
  63. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/db/insertion/span.py +0 -0
  64. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/db/migrate.py +0 -0
  65. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/db/migrations/__init__.py +0 -0
  66. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/db/migrations/env.py +0 -0
  67. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/db/migrations/script.py.mako +0 -0
  68. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/db/migrations/types.py +0 -0
  69. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/db/migrations/versions/cf03bd6bae1d_init.py +0 -0
  70. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/exceptions.py +0 -0
  71. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/inferences/__init__.py +0 -0
  72. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/inferences/errors.py +0 -0
  73. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/inferences/fixtures.py +0 -0
  74. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/inferences/inferences.py +0 -0
  75. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/inferences/schema.py +0 -0
  76. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/inferences/validation.py +0 -0
  77. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/metrics/README.md +0 -0
  78. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/metrics/__init__.py +0 -0
  79. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/metrics/binning.py +0 -0
  80. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/metrics/metrics.py +0 -0
  81. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/metrics/mixins.py +0 -0
  82. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/metrics/retrieval_metrics.py +0 -0
  83. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/metrics/timeseries.py +0 -0
  84. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/metrics/wrappers.py +0 -0
  85. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/pointcloud/__init__.py +0 -0
  86. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/pointcloud/clustering.py +0 -0
  87. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/pointcloud/pointcloud.py +0 -0
  88. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/pointcloud/projectors.py +0 -0
  89. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/pointcloud/umap_parameters.py +0 -0
  90. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/py.typed +0 -0
  91. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/__init__.py +0 -0
  92. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/__init__.py +0 -0
  93. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/cache/__init__.py +0 -0
  94. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/cache/two_tier_cache.py +0 -0
  95. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/dataset_example_revisions.py +0 -0
  96. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/dataset_example_spans.py +0 -0
  97. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/document_evaluation_summaries.py +0 -0
  98. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/document_evaluations.py +0 -0
  99. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/document_retrieval_metrics.py +0 -0
  100. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/evaluation_summaries.py +0 -0
  101. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/experiment_annotation_summaries.py +0 -0
  102. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/experiment_error_rates.py +0 -0
  103. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/experiment_sequence_number.py +0 -0
  104. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/latency_ms_quantile.py +0 -0
  105. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/min_start_or_max_end_times.py +0 -0
  106. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/project_by_name.py +0 -0
  107. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/record_counts.py +0 -0
  108. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/span_descendants.py +0 -0
  109. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/span_evaluations.py +0 -0
  110. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/span_projects.py +0 -0
  111. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/token_counts.py +0 -0
  112. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/trace_evaluations.py +0 -0
  113. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/dataloaders/trace_row_ids.py +0 -0
  114. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/helpers/__init__.py +0 -0
  115. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/AddExamplesToDatasetInput.py +0 -0
  116. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/AddSpansToDatasetInput.py +0 -0
  117. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/ClusterInput.py +0 -0
  118. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/Coordinates.py +0 -0
  119. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/CreateDatasetInput.py +0 -0
  120. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/DataQualityMetricInput.py +0 -0
  121. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/DatasetExampleInput.py +0 -0
  122. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/DatasetSort.py +0 -0
  123. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/DatasetVersionSort.py +0 -0
  124. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/DeleteDatasetExamplesInput.py +0 -0
  125. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/DeleteDatasetInput.py +0 -0
  126. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/DeleteExperimentsInput.py +0 -0
  127. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/DimensionFilter.py +0 -0
  128. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/DimensionInput.py +0 -0
  129. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/Granularity.py +0 -0
  130. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/PatchDatasetExamplesInput.py +0 -0
  131. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/PatchDatasetInput.py +0 -0
  132. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/PerformanceMetricInput.py +0 -0
  133. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/SpanSort.py +0 -0
  134. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/TimeRange.py +0 -0
  135. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/input_types/__init__.py +0 -0
  136. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/interceptor.py +0 -0
  137. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/mutations/__init__.py +0 -0
  138. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/mutations/auth.py +0 -0
  139. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/mutations/dataset_mutations.py +0 -0
  140. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/mutations/experiment_mutations.py +0 -0
  141. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/mutations/export_events_mutations.py +0 -0
  142. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/openapi/__init__.py +0 -0
  143. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/openapi/main.py +0 -0
  144. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/openapi/schema.py +0 -0
  145. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/queries.py +0 -0
  146. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/routers/__init__.py +0 -0
  147. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/routers/utils.py +0 -0
  148. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/routers/v1/__init__.py +0 -0
  149. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/routers/v1/dataset_examples.py +0 -0
  150. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/routers/v1/evaluations.py +0 -0
  151. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/routers/v1/experiments.py +0 -0
  152. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/routers/v1/spans.py +0 -0
  153. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/routers/v1/traces.py +0 -0
  154. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/schema.py +0 -0
  155. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/AnnotatorKind.py +0 -0
  156. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/Cluster.py +0 -0
  157. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/CreateDatasetPayload.py +0 -0
  158. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/DataQualityMetric.py +0 -0
  159. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/Dataset.py +0 -0
  160. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/DatasetExample.py +0 -0
  161. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/DatasetExampleRevision.py +0 -0
  162. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/DatasetValues.py +0 -0
  163. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/DatasetVersion.py +0 -0
  164. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/Dimension.py +0 -0
  165. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/DimensionDataType.py +0 -0
  166. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/DimensionShape.py +0 -0
  167. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/DimensionType.py +0 -0
  168. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/DimensionWithValue.py +0 -0
  169. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/DocumentEvaluationSummary.py +0 -0
  170. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/DocumentRetrievalMetrics.py +0 -0
  171. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/EmbeddingDimension.py +0 -0
  172. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/EmbeddingMetadata.py +0 -0
  173. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/Evaluation.py +0 -0
  174. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/EvaluationSummary.py +0 -0
  175. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/Event.py +0 -0
  176. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/EventMetadata.py +0 -0
  177. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/ExampleRevisionInterface.py +0 -0
  178. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/ExperimentAnnotationSummary.py +0 -0
  179. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/ExperimentComparison.py +0 -0
  180. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/ExportedFile.py +0 -0
  181. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/Functionality.py +0 -0
  182. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/Inferences.py +0 -0
  183. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/InferencesRole.py +0 -0
  184. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/MimeType.py +0 -0
  185. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/Model.py +0 -0
  186. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/NumericRange.py +0 -0
  187. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/PerformanceMetric.py +0 -0
  188. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/Project.py +0 -0
  189. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/PromptResponse.py +0 -0
  190. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/Retrieval.py +0 -0
  191. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/ScalarDriftMetricEnum.py +0 -0
  192. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/Segments.py +0 -0
  193. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/SortDir.py +0 -0
  194. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/TimeSeries.py +0 -0
  195. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/Trace.py +0 -0
  196. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/UMAPPoints.py +0 -0
  197. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/ValidationResult.py +0 -0
  198. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/VectorDriftMetricEnum.py +0 -0
  199. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/__init__.py +0 -0
  200. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/node.py +0 -0
  201. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/api/types/pagination.py +0 -0
  202. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/grpc_server.py +0 -0
  203. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/main.py +0 -0
  204. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/openapi/__init__.py +0 -0
  205. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/openapi/docs.py +0 -0
  206. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/prometheus.py +0 -0
  207. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/static/apple-touch-icon-114x114.png +0 -0
  208. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/static/apple-touch-icon-120x120.png +0 -0
  209. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/static/apple-touch-icon-144x144.png +0 -0
  210. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/static/apple-touch-icon-152x152.png +0 -0
  211. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/static/apple-touch-icon-180x180.png +0 -0
  212. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/static/apple-touch-icon-72x72.png +0 -0
  213. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/static/apple-touch-icon-76x76.png +0 -0
  214. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/static/apple-touch-icon.png +0 -0
  215. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/static/favicon.ico +0 -0
  216. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/static/index.css +0 -0
  217. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/static/modernizr.js +0 -0
  218. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/telemetry.py +0 -0
  219. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/templates/__init__.py +0 -0
  220. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/templates/index.html +0 -0
  221. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/server/thread_server.py +0 -0
  222. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/services.py +0 -0
  223. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/session/__init__.py +0 -0
  224. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/session/data_extractor.py +0 -0
  225. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/session/evaluation.py +0 -0
  226. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/session/session.py +0 -0
  227. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/settings.py +0 -0
  228. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/__init__.py +0 -0
  229. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/attributes.py +0 -0
  230. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/dsl/README.md +0 -0
  231. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/dsl/__init__.py +0 -0
  232. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/dsl/filter.py +0 -0
  233. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/dsl/helpers.py +0 -0
  234. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/dsl/query.py +0 -0
  235. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/errors.py +0 -0
  236. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/evaluation_conventions.py +0 -0
  237. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/exporter.py +0 -0
  238. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/fixtures.py +0 -0
  239. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/langchain/__init__.py +0 -0
  240. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/langchain/instrumentor.py +0 -0
  241. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/llama_index/__init__.py +0 -0
  242. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/llama_index/callback.py +0 -0
  243. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/openai/__init__.py +0 -0
  244. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/openai/instrumentor.py +0 -0
  245. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/otel.py +0 -0
  246. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/projects.py +0 -0
  247. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/span_evaluations.py +0 -0
  248. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/span_json_decoder.py +0 -0
  249. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/span_json_encoder.py +0 -0
  250. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/trace_dataset.py +0 -0
  251. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/utils.py +0 -0
  252. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/v1/__init__.py +0 -0
  253. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/v1/evaluation_pb2.py +0 -0
  254. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/trace/v1/evaluation_pb2.pyi +0 -0
  255. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/utilities/__init__.py +0 -0
  256. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/utilities/deprecation.py +0 -0
  257. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/utilities/error_handling.py +0 -0
  258. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/utilities/json.py +0 -0
  259. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/utilities/logging.py +0 -0
  260. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/utilities/project.py +0 -0
  261. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/utilities/re.py +0 -0
  262. {arize_phoenix-4.4.4rc3 → arize_phoenix-4.4.4rc5}/src/phoenix/utilities/span_store.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: arize-phoenix
3
- Version: 4.4.4rc3
3
+ Version: 4.4.4rc5
4
4
  Summary: AI Observability and Evaluation
5
5
  Project-URL: Documentation, https://docs.arize.com/phoenix/
6
6
  Project-URL: Issues, https://github.com/Arize-ai/phoenix/issues
@@ -31,7 +31,7 @@ Requires-Dist: openinference-instrumentation
31
31
  Requires-Dist: openinference-instrumentation-langchain>=0.1.12
32
32
  Requires-Dist: openinference-instrumentation-llama-index>=1.2.0
33
33
  Requires-Dist: openinference-instrumentation-openai>=0.1.4
34
- Requires-Dist: openinference-semantic-conventions>=0.1.5
34
+ Requires-Dist: openinference-semantic-conventions>=0.1.9
35
35
  Requires-Dist: opentelemetry-exporter-otlp
36
36
  Requires-Dist: opentelemetry-proto>=1.12.0
37
37
  Requires-Dist: opentelemetry-sdk
@@ -46,7 +46,7 @@ dependencies = [
46
46
  "opentelemetry-proto>=1.12.0", # needed to avoid this issue: https://github.com/Arize-ai/phoenix/issues/2695
47
47
  "opentelemetry-exporter-otlp",
48
48
  "opentelemetry-semantic-conventions",
49
- "openinference-semantic-conventions>=0.1.5",
49
+ "openinference-semantic-conventions>=0.1.9",
50
50
  "openinference-instrumentation",
51
51
  "openinference-instrumentation-langchain>=0.1.12",
52
52
  "openinference-instrumentation-llama-index>=1.2.0",
@@ -206,6 +206,7 @@ dependencies = [
206
206
  [tool.hatch.envs.docs]
207
207
  detached = true
208
208
  dependencies = [
209
+ "pyment",
209
210
  "interrogate",
210
211
  ]
211
212
 
@@ -0,0 +1,18 @@
1
+ from phoenix.datasets.evaluators.code_evaluators import ContainsKeyword, JSONParsable
2
+ from phoenix.datasets.evaluators.llm_evaluators import (
3
+ CoherenceEvaluator,
4
+ ConcisenessEvaluator,
5
+ HelpfulnessEvaluator,
6
+ LLMCriteriaEvaluator,
7
+ RelevanceEvaluator,
8
+ )
9
+
10
+ __all__ = [
11
+ "ContainsKeyword",
12
+ "JSONParsable",
13
+ "CoherenceEvaluator",
14
+ "ConcisenessEvaluator",
15
+ "LLMCriteriaEvaluator",
16
+ "HelpfulnessEvaluator",
17
+ "RelevanceEvaluator",
18
+ ]
@@ -0,0 +1,99 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import re
5
+ from typing import Any, List, Optional, Union
6
+
7
+ from phoenix.datasets.evaluators.utils import Evaluator
8
+ from phoenix.datasets.types import EvaluationResult, TaskOutput
9
+
10
+
11
+ class JSONParsable(Evaluator):
12
+ def evaluate(self, *, output: Optional[TaskOutput] = None, **_: Any) -> EvaluationResult:
13
+ assert isinstance(output, str), "Experiment run output must be a string"
14
+ try:
15
+ json.loads(output)
16
+ json_parsable = True
17
+ except BaseException:
18
+ json_parsable = False
19
+ return EvaluationResult(
20
+ score=int(json_parsable),
21
+ )
22
+
23
+
24
+ class ContainsKeyword(Evaluator):
25
+ def __init__(self, keyword: str, name: Optional[str] = None) -> None:
26
+ self.keyword = keyword
27
+ self._name = name or f"Contains({repr(keyword)})"
28
+
29
+ def evaluate(self, *, output: Optional[TaskOutput] = None, **_: Any) -> EvaluationResult:
30
+ assert isinstance(output, str), "Experiment run output must be a string"
31
+ found = self.keyword in output
32
+ return EvaluationResult(
33
+ score=float(found),
34
+ explanation=(
35
+ f"the string {repr(self.keyword)} was "
36
+ f"{'found' if found else 'not found'} in the output"
37
+ ),
38
+ )
39
+
40
+
41
+ class ContainsAnyKeyword(Evaluator):
42
+ def __init__(self, keywords: List[str], name: Optional[str] = None) -> None:
43
+ self.keywords = keywords
44
+ self._name = name or f"ContainsAny({keywords})"
45
+
46
+ def evaluate(self, *, output: Optional[TaskOutput] = None, **_: Any) -> EvaluationResult:
47
+ assert isinstance(output, str), "Experiment run output must be a string"
48
+ found = [keyword for keyword in self.keywords if keyword in output]
49
+ if found:
50
+ explanation = f"the keywords {found} were found in the output"
51
+ else:
52
+ explanation = f"none of the keywords {self.keywords} were found in the output"
53
+ return EvaluationResult(
54
+ score=float(bool(found)),
55
+ explanation=explanation,
56
+ )
57
+
58
+
59
+ class ContainsAllKeywords(Evaluator):
60
+ def __init__(self, keywords: List[str], name: Optional[str] = None) -> None:
61
+ self.keywords = keywords
62
+ self._name = name or f"ContainsAll({keywords})"
63
+
64
+ def evaluate(self, *, output: Optional[TaskOutput] = None, **_: Any) -> EvaluationResult:
65
+ assert isinstance(output, str), "Experiment run output must be a string"
66
+ not_found = [keyword for keyword in self.keywords if keyword not in output]
67
+ if not_found:
68
+ contains_all = False
69
+ explanation = f"the keywords {not_found} were not found in the output"
70
+ else:
71
+ contains_all = True
72
+ explanation = f"all of the keywords {self.keywords} were found in the output"
73
+ return EvaluationResult(
74
+ score=float(contains_all),
75
+ explanation=explanation,
76
+ )
77
+
78
+
79
+ class MatchesRegex(Evaluator):
80
+ def __init__(self, pattern: Union[str, re.Pattern[str]], name: Optional[str] = None) -> None:
81
+ if isinstance(pattern, str):
82
+ pattern = re.compile(pattern)
83
+ self.pattern = pattern
84
+ assert isinstance(pattern, re.Pattern)
85
+ self._name = name or f"matches_({pattern})"
86
+
87
+ def evaluate(self, *, output: Optional[TaskOutput] = None, **_: Any) -> EvaluationResult:
88
+ assert isinstance(output, str), "Experiment run output must be a string"
89
+ matches = self.pattern.findall(output)
90
+ if matches:
91
+ explanation = (
92
+ f"the substrings {matches} matched the regex pattern {self.pattern.pattern}"
93
+ )
94
+ else:
95
+ explanation = f"no substrings matched the regex pattern {self.pattern.pattern}"
96
+ return EvaluationResult(
97
+ score=float(bool(matches)),
98
+ explanation=explanation,
99
+ )
@@ -1,72 +1,23 @@
1
- import json
2
1
  import re
3
- from typing import TYPE_CHECKING, Callable, Optional, Type
2
+ from types import MappingProxyType
3
+ from typing import Any, Callable, Optional, Type
4
4
 
5
+ from phoenix.datasets.evaluators.utils import (
6
+ ExampleInput,
7
+ ExampleMetadata,
8
+ ExperimentEvaluator,
9
+ LLMEvaluator,
10
+ _unwrap_json,
11
+ )
5
12
  from phoenix.datasets.types import (
6
13
  EvaluationResult,
7
- Example,
8
- ExperimentEvaluator,
9
- ExperimentRun,
10
- JSONSerializable,
14
+ TaskOutput,
11
15
  )
12
16
  from phoenix.evals.models.base import BaseModel as LLMBaseModel
13
17
  from phoenix.evals.utils import snap_to_rail
14
18
 
15
19
 
16
- def _unwrap_json(obj: JSONSerializable) -> JSONSerializable:
17
- if isinstance(obj, dict):
18
- if len(obj) == 1:
19
- key = next(iter(obj.keys()))
20
- output = obj[key]
21
- assert isinstance(
22
- output, (dict, list, str, int, float, bool, type(None))
23
- ), "Output must be JSON serializable"
24
- return output
25
- return obj
26
-
27
-
28
- class JSONParsable:
29
- annotator_kind = "CODE"
30
- name = "JSONParsable"
31
-
32
- def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
33
- assert exp_run.output is not None
34
- output = _unwrap_json(exp_run.output.result)
35
- assert isinstance(output, str), "Experiment run output must be a string"
36
- try:
37
- json.loads(output)
38
- json_parsable = True
39
- except BaseException:
40
- json_parsable = False
41
- return EvaluationResult(
42
- score=int(json_parsable),
43
- )
44
-
45
-
46
- class ContainsKeyword:
47
- annotator_kind = "CODE"
48
-
49
- def __init__(self, keyword: str) -> None:
50
- super().__init__()
51
- self.keyword = keyword
52
- self.name = f"ContainsKeyword({keyword})"
53
-
54
- def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
55
- assert exp_run.output is not None
56
- result = _unwrap_json(exp_run.output.result)
57
- assert isinstance(result, str), "Experiment run output must be a string"
58
- found = self.keyword in result
59
- return EvaluationResult(
60
- score=float(found),
61
- explanation=(
62
- f"the string {repr(self.keyword)} was "
63
- f"{'found' if found else 'not found'} in the output"
64
- ),
65
- )
66
-
67
-
68
- class LLMCriteriaEvaluator:
69
- annotator_kind = "LLM"
20
+ class LLMCriteriaEvaluator(LLMEvaluator):
70
21
  _base_template = (
71
22
  "Determine if the following text is {criteria}. {description}"
72
23
  "First, explain step-by-step why you think the text is or is not {criteria}. Then provide "
@@ -77,7 +28,7 @@ class LLMCriteriaEvaluator:
77
28
  "EXPLANATION: *a step by step explanation of your reasoning for whether the text meets "
78
29
  "the criteria*\n"
79
30
  "LABEL: *true or false*\n\n"
80
- "Follow this template for the following text:\n\n"
31
+ "Follow this template for the following example:\n\n"
81
32
  "CRITERIA: the text is '{criteria}'\n"
82
33
  "TEXT: {text}\n"
83
34
  "EXPLANATION: "
@@ -95,21 +46,23 @@ class LLMCriteriaEvaluator:
95
46
  self.criteria = criteria
96
47
  self.description = description
97
48
  self.template = self._format_base_template(self.criteria, self.description)
98
- self.name = name
49
+ self._name = name
99
50
 
100
- def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
101
- formatted_template = self._format_eval_template(exp_run)
51
def evaluate(self, *, output: Optional[TaskOutput] = None, **_: Any) -> EvaluationResult:
    """Evaluate ``output`` with a synchronous model call.

    The output is rendered into a prompt by ``_format_eval_template``, the
    prompt is passed to ``self.model._generate``, and the raw completion is
    converted by ``_parse_eval_output``. Other keyword arguments are ignored.
    """
    prompt = self._format_eval_template(output)
    return self._parse_eval_output(self.model._generate(prompt))
104
55
 
105
- async def async_evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
106
- formatted_template = self._format_eval_template(exp_run)
56
async def async_evaluate(
    self, *, output: Optional[TaskOutput] = None, **_: Any
) -> EvaluationResult:
    """Evaluate ``output`` with an awaited model call.

    Same steps as ``evaluate``, except that the prompt is sent through
    ``self.model._async_generate``. Other keyword arguments are ignored.
    """
    prompt = self._format_eval_template(output)
    raw_completion = await self.model._async_generate(prompt)
    return self._parse_eval_output(raw_completion)
109
62
 
110
- def _format_eval_template(self, experiment_run: ExperimentRun) -> str:
111
- assert experiment_run.output is not None
112
- result = _unwrap_json(experiment_run.output.result)
63
def _format_eval_template(self, output: TaskOutput) -> str:
    """Fill the ``text`` field of ``self.template`` with the task output.

    The output is first passed through ``_unwrap_json`` and then converted
    with ``str``.

    Raises:
        AssertionError: If ``output`` is ``None``.
    """
    assert output is not None
    return self.template.format(text=str(_unwrap_json(output)))
114
67
 
115
68
  def _parse_eval_output(self, unparsed_response: str) -> EvaluationResult:
@@ -142,40 +95,43 @@ class LLMCriteriaEvaluator:
142
95
 
143
96
 
144
97
def criteria_evaluator_factory(
    class_name: str,
    criteria: str,
    description: str,
    default_name: Optional[str] = None,
) -> Type[ExperimentEvaluator]:
    """Create an ``LLMCriteriaEvaluator`` subclass bound to a single criterion.

    Args:
        class_name: Name of the generated class.
        criteria: Criterion forwarded to ``LLMCriteriaEvaluator.__init__`` and
            to ``LLMCriteriaEvaluator._format_base_template``.
        description: Description forwarded alongside ``criteria``.
        default_name: Default for the ``name`` argument of the generated
            class's ``__init__``. When omitted, ``class_name`` is used, so the
            three-argument call form keeps working.

    Returns:
        A new class whose ``__init__`` takes a model and an optional name.
    """
    resolved_name = class_name if default_name is None else default_name

    def _init(self, model: LLMBaseModel, name: str = resolved_name) -> None:  # type: ignore
        LLMCriteriaEvaluator.__init__(self, model, criteria, description, name=name)

    namespace = {
        "__init__": _init,
        "__module__": __name__,
        # Pre-rendered at class-creation time so the template is also
        # available as a class attribute.
        "template": LLMCriteriaEvaluator._format_base_template(criteria, description),
    }
    return type(class_name, (LLMCriteriaEvaluator,), namespace)
159
112
 
160
113
 
161
- LLMConcisenessEvaluator = criteria_evaluator_factory(
162
- class_name="LLMConcisenessEvaluator",
114
# Ready-made criteria evaluators. Each call produces an LLMCriteriaEvaluator
# subclass with its criterion, description, and default name fixed.

# Criterion: "concise".
ConcisenessEvaluator = criteria_evaluator_factory(
    class_name="ConcisenessEvaluator",
    criteria="concise",
    description="is just a few sentences and easy to follow",
    default_name="Conciseness",
)


# Criterion: "helpful".
HelpfulnessEvaluator = criteria_evaluator_factory(
    class_name="HelpfulnessEvaluator",
    criteria="helpful",
    description="provides useful information",
    default_name="Helpfulness",
)


# Criterion: "coherent".
CoherenceEvaluator = criteria_evaluator_factory(
    class_name="CoherenceEvaluator",
    criteria="coherent",
    description="is coherent, well-structured, and logically sound",
    default_name="Coherence",
)
180
136
 
181
137
 
@@ -192,8 +148,7 @@ def _parse_label_from_explanation(raw_string: str) -> str:
192
148
  return raw_string
193
149
 
194
150
 
195
- class RelevanceEvaluator:
196
- annotator_kind = "LLM"
151
+ class RelevanceEvaluator(LLMEvaluator):
197
152
  template = (
198
153
  "Determine if the following response is relevant to the query. In this context, "
199
154
  "'relevance' means that the response directly addresses the core question or topic of the "
@@ -217,19 +172,24 @@ class RelevanceEvaluator:
217
172
def __init__(
    self,
    model: LLMBaseModel,
    get_query: Optional[Callable[[ExampleInput, ExampleMetadata], str]] = None,
    get_response: Optional[Callable[[Optional[TaskOutput], ExampleMetadata], str]] = None,
    name: str = "RelevanceEvaluator",
):
    """Configure the evaluator.

    Args:
        model: Model used to generate the evaluation.
        get_query: Callable that builds the query string from the example
            input and metadata; falls back to ``_default_get_query``.
        get_response: Callable that builds the response string from the task
            output and metadata; falls back to ``_default_get_response``.
        name: Name stored on the evaluator.
    """
    self.model = model
    self._name = name
    # A falsy extractor (including None) selects the built-in default.
    self.get_query = get_query if get_query else self._default_get_query
    self.get_response = get_response if get_response else self._default_get_response
228
183
 
229
- def _format_eval_template(self, example: Example, experiment_run: ExperimentRun) -> str:
230
- assert experiment_run.output is not None
231
- query = self.get_query(example, experiment_run)
232
- response = self.get_response(example, experiment_run)
184
def _format_eval_template(
    self,
    output: Optional[TaskOutput] = None,
    input: ExampleInput = MappingProxyType({}),
    metadata: ExampleMetadata = MappingProxyType({}),
) -> str:
    """Fill the ``query`` and ``response`` fields of ``self.template``.

    The query comes from ``self.get_query(input, metadata)`` and the response
    from ``self.get_response(output, metadata)``.

    Raises:
        AssertionError: If ``output`` is ``None``.
    """
    assert output is not None
    query_text = self.get_query(input, metadata)
    response_text = self.get_response(output, metadata)
    return self.template.format(query=query_text, response=response_text)
234
194
 
235
195
  def _parse_eval_output(self, unparsed_response: str) -> EvaluationResult:
@@ -250,26 +210,35 @@ class RelevanceEvaluator:
250
210
  metadata={},
251
211
  )
252
212
 
253
- def _default_get_query(self, example: Example, experiment_run: ExperimentRun) -> str:
254
- return str(example.input)
213
def _default_get_query(self, input: ExampleInput, *args: Any, **kwargs: Any) -> str:
    """Return the example input converted with ``str``; extra arguments are ignored."""
    query_text = str(input)
    return query_text
255
215
 
256
- def _default_get_response(self, example: Example, experiment_run: ExperimentRun) -> str:
257
- assert experiment_run.output is not None
258
- return str(_unwrap_json(experiment_run.output.result))
216
def _default_get_response(
    self, output: Optional[TaskOutput] = None, *args: Any, **kwargs: Any
) -> str:
    """Return the task output, passed through ``_unwrap_json``, as a string.

    Extra positional and keyword arguments are ignored.

    Raises:
        AssertionError: If ``output`` is ``None``.
    """
    assert output is not None
    unwrapped = _unwrap_json(output)
    return str(unwrapped)
259
221
 
260
- def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
261
- formatted_template = self._format_eval_template(example, exp_run)
222
def evaluate(
    self,
    *,
    output: Optional[TaskOutput] = None,
    metadata: ExampleMetadata = MappingProxyType({}),
    input: ExampleInput = MappingProxyType({}),
    **_: Any,
) -> EvaluationResult:
    """Evaluate the output against the input with a synchronous model call.

    The prompt is built by ``_format_eval_template``, sent through
    ``self.model._generate``, and the raw completion is converted by
    ``_parse_eval_output``. Other keyword arguments are ignored.
    """
    prompt = self._format_eval_template(output, input, metadata)
    return self._parse_eval_output(self.model._generate(prompt))
264
233
 
265
- async def async_evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
266
- formatted_template = self._format_eval_template(example, exp_run)
234
async def async_evaluate(
    self,
    *,
    output: Optional[TaskOutput] = None,
    metadata: ExampleMetadata = MappingProxyType({}),
    input: ExampleInput = MappingProxyType({}),
    **_: Any,
) -> EvaluationResult:
    """Evaluate the output against the input with an awaited model call.

    Same steps as ``evaluate``, except that the prompt is sent through
    ``self.model._async_generate``. Other keyword arguments are ignored.
    """
    prompt = self._format_eval_template(output, input, metadata)
    raw_completion = await self.model._async_generate(prompt)
    return self._parse_eval_output(raw_completion)
269
-
270
-
271
- # Someday we'll do typing checking in unit tests.
272
- if TYPE_CHECKING:
273
- _: ExperimentEvaluator
274
- _ = JSONParsable()
275
- _ = ContainsKeyword("test")