arize-phoenix 4.4.4rc6__tar.gz → 4.6.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of arize-phoenix might be problematic. Click here for more details.

Files changed (267)
  1. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/PKG-INFO +6 -4
  2. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/pyproject.toml +5 -3
  3. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/db/models.py +4 -4
  4. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/experiments/evaluators/base.py +2 -2
  5. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/experiments/evaluators/utils.py +9 -12
  6. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/experiments/functions.py +166 -25
  7. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/experiments/types.py +60 -29
  8. arize_phoenix-4.6.1/src/phoenix/experiments/utils.py +24 -0
  9. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/routers/v1/experiment_evaluations.py +78 -0
  10. arize_phoenix-4.6.1/src/phoenix/server/api/routers/v1/experiment_runs.py +220 -0
  11. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/routers/v1/experiments.py +128 -0
  12. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/ExperimentRun.py +1 -1
  13. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/session/client.py +2 -31
  14. arize_phoenix-4.6.1/src/phoenix/version.py +1 -0
  15. arize_phoenix-4.4.4rc6/src/phoenix/experiments/utils.py +0 -9
  16. arize_phoenix-4.4.4rc6/src/phoenix/server/api/routers/v1/experiment_runs.py +0 -96
  17. arize_phoenix-4.4.4rc6/src/phoenix/version.py +0 -1
  18. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/.gitignore +0 -0
  19. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/IP_NOTICE +0 -0
  20. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/LICENSE +0 -0
  21. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/README.md +0 -0
  22. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/examples/manually-instrumented-chatbot/chat-service/chat/__init__.py +0 -0
  23. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/examples/manually-instrumented-chatbot/chat-service/chat/app.py +0 -0
  24. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/examples/manually-instrumented-chatbot/chat-service/chat/types.py +0 -0
  25. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/examples/manually-instrumented-chatbot/frontend/Dockerfile +0 -0
  26. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/examples/manually-instrumented-chatbot/frontend/Makefile +0 -0
  27. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/examples/manually-instrumented-chatbot/frontend/__init__.py +0 -0
  28. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/examples/manually-instrumented-chatbot/frontend/pyproject.toml +0 -0
  29. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/examples/manually-instrumented-chatbot/frontend/requirements.txt +0 -0
  30. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/examples/manually-instrumented-chatbot/frontend/schema.json +0 -0
  31. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/__init__.py +0 -0
  32. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/config.py +0 -0
  33. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/core/__init__.py +0 -0
  34. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/core/embedding_dimension.py +0 -0
  35. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/core/model.py +0 -0
  36. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/core/model_schema.py +0 -0
  37. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/core/model_schema_adapter.py +0 -0
  38. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/datetime_utils.py +0 -0
  39. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/db/README.md +0 -0
  40. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/db/__init__.py +0 -0
  41. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/db/alembic.ini +0 -0
  42. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/db/bulk_inserter.py +0 -0
  43. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/db/engines.py +0 -0
  44. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/db/helpers.py +0 -0
  45. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/db/insertion/__init__.py +0 -0
  46. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/db/insertion/dataset.py +0 -0
  47. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/db/insertion/evaluation.py +0 -0
  48. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/db/insertion/helpers.py +0 -0
  49. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/db/insertion/span.py +0 -0
  50. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/db/migrate.py +0 -0
  51. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/db/migrations/__init__.py +0 -0
  52. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/db/migrations/env.py +0 -0
  53. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/db/migrations/script.py.mako +0 -0
  54. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/db/migrations/types.py +0 -0
  55. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/db/migrations/versions/10460e46d750_datasets.py +0 -0
  56. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/db/migrations/versions/cf03bd6bae1d_init.py +0 -0
  57. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/exceptions.py +0 -0
  58. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/experiments/__init__.py +0 -0
  59. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/experiments/evaluators/__init__.py +0 -0
  60. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/experiments/evaluators/code_evaluators.py +0 -0
  61. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/experiments/evaluators/llm_evaluators.py +0 -0
  62. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/experiments/tracing.py +0 -0
  63. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/inferences/__init__.py +0 -0
  64. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/inferences/errors.py +0 -0
  65. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/inferences/fixtures.py +0 -0
  66. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/inferences/inferences.py +0 -0
  67. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/inferences/schema.py +0 -0
  68. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/inferences/validation.py +0 -0
  69. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/metrics/README.md +0 -0
  70. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/metrics/__init__.py +0 -0
  71. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/metrics/binning.py +0 -0
  72. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/metrics/metrics.py +0 -0
  73. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/metrics/mixins.py +0 -0
  74. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/metrics/retrieval_metrics.py +0 -0
  75. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/metrics/timeseries.py +0 -0
  76. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/metrics/wrappers.py +0 -0
  77. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/pointcloud/__init__.py +0 -0
  78. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/pointcloud/clustering.py +0 -0
  79. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/pointcloud/pointcloud.py +0 -0
  80. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/pointcloud/projectors.py +0 -0
  81. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/pointcloud/umap_parameters.py +0 -0
  82. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/py.typed +0 -0
  83. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/__init__.py +0 -0
  84. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/__init__.py +0 -0
  85. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/context.py +0 -0
  86. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/dataloaders/__init__.py +0 -0
  87. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/dataloaders/average_experiment_run_latency.py +0 -0
  88. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/dataloaders/cache/__init__.py +0 -0
  89. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/dataloaders/cache/two_tier_cache.py +0 -0
  90. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/dataloaders/dataset_example_revisions.py +0 -0
  91. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/dataloaders/dataset_example_spans.py +0 -0
  92. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/dataloaders/document_evaluation_summaries.py +0 -0
  93. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/dataloaders/document_evaluations.py +0 -0
  94. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/dataloaders/document_retrieval_metrics.py +0 -0
  95. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/dataloaders/evaluation_summaries.py +0 -0
  96. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/dataloaders/experiment_annotation_summaries.py +0 -0
  97. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/dataloaders/experiment_error_rates.py +0 -0
  98. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/dataloaders/experiment_run_counts.py +0 -0
  99. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/dataloaders/experiment_sequence_number.py +0 -0
  100. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/dataloaders/latency_ms_quantile.py +0 -0
  101. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/dataloaders/min_start_or_max_end_times.py +0 -0
  102. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/dataloaders/project_by_name.py +0 -0
  103. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/dataloaders/record_counts.py +0 -0
  104. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/dataloaders/span_descendants.py +0 -0
  105. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/dataloaders/span_evaluations.py +0 -0
  106. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/dataloaders/span_projects.py +0 -0
  107. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/dataloaders/token_counts.py +0 -0
  108. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/dataloaders/trace_evaluations.py +0 -0
  109. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/dataloaders/trace_row_ids.py +0 -0
  110. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/helpers/__init__.py +0 -0
  111. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/helpers/dataset_helpers.py +0 -0
  112. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/input_types/AddExamplesToDatasetInput.py +0 -0
  113. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/input_types/AddSpansToDatasetInput.py +0 -0
  114. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/input_types/ClearProjectInput.py +0 -0
  115. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/input_types/ClusterInput.py +0 -0
  116. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/input_types/Coordinates.py +0 -0
  117. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/input_types/CreateDatasetInput.py +0 -0
  118. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/input_types/DataQualityMetricInput.py +0 -0
  119. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/input_types/DatasetExampleInput.py +0 -0
  120. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/input_types/DatasetSort.py +0 -0
  121. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/input_types/DatasetVersionSort.py +0 -0
  122. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/input_types/DeleteDatasetExamplesInput.py +0 -0
  123. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/input_types/DeleteDatasetInput.py +0 -0
  124. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/input_types/DeleteExperimentsInput.py +0 -0
  125. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/input_types/DimensionFilter.py +0 -0
  126. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/input_types/DimensionInput.py +0 -0
  127. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/input_types/Granularity.py +0 -0
  128. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/input_types/PatchDatasetExamplesInput.py +0 -0
  129. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/input_types/PatchDatasetInput.py +0 -0
  130. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/input_types/PerformanceMetricInput.py +0 -0
  131. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/input_types/SpanSort.py +0 -0
  132. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/input_types/TimeRange.py +0 -0
  133. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/input_types/__init__.py +0 -0
  134. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/interceptor.py +0 -0
  135. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/mutations/__init__.py +0 -0
  136. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/mutations/auth.py +0 -0
  137. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/mutations/dataset_mutations.py +0 -0
  138. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/mutations/experiment_mutations.py +0 -0
  139. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/mutations/export_events_mutations.py +0 -0
  140. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/mutations/project_mutations.py +0 -0
  141. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/openapi/__init__.py +0 -0
  142. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/openapi/main.py +0 -0
  143. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/openapi/schema.py +0 -0
  144. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/queries.py +0 -0
  145. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/routers/__init__.py +0 -0
  146. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/routers/utils.py +0 -0
  147. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/routers/v1/__init__.py +0 -0
  148. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/routers/v1/dataset_examples.py +0 -0
  149. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/routers/v1/datasets.py +0 -0
  150. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/routers/v1/evaluations.py +0 -0
  151. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/routers/v1/spans.py +0 -0
  152. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/routers/v1/traces.py +0 -0
  153. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/schema.py +0 -0
  154. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/AnnotatorKind.py +0 -0
  155. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/Cluster.py +0 -0
  156. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/CreateDatasetPayload.py +0 -0
  157. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/DataQualityMetric.py +0 -0
  158. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/Dataset.py +0 -0
  159. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/DatasetExample.py +0 -0
  160. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/DatasetExampleRevision.py +0 -0
  161. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/DatasetValues.py +0 -0
  162. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/DatasetVersion.py +0 -0
  163. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/Dimension.py +0 -0
  164. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/DimensionDataType.py +0 -0
  165. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/DimensionShape.py +0 -0
  166. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/DimensionType.py +0 -0
  167. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/DimensionWithValue.py +0 -0
  168. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/DocumentEvaluationSummary.py +0 -0
  169. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/DocumentRetrievalMetrics.py +0 -0
  170. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/EmbeddingDimension.py +0 -0
  171. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/EmbeddingMetadata.py +0 -0
  172. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/Evaluation.py +0 -0
  173. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/EvaluationSummary.py +0 -0
  174. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/Event.py +0 -0
  175. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/EventMetadata.py +0 -0
  176. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/ExampleRevisionInterface.py +0 -0
  177. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/Experiment.py +0 -0
  178. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/ExperimentAnnotationSummary.py +0 -0
  179. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/ExperimentComparison.py +0 -0
  180. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/ExperimentRunAnnotation.py +0 -0
  181. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/ExportedFile.py +0 -0
  182. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/Functionality.py +0 -0
  183. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/Inferences.py +0 -0
  184. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/InferencesRole.py +0 -0
  185. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/MimeType.py +0 -0
  186. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/Model.py +0 -0
  187. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/NumericRange.py +0 -0
  188. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/PerformanceMetric.py +0 -0
  189. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/Project.py +0 -0
  190. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/PromptResponse.py +0 -0
  191. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/Retrieval.py +0 -0
  192. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/ScalarDriftMetricEnum.py +0 -0
  193. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/Segments.py +0 -0
  194. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/SortDir.py +0 -0
  195. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/Span.py +0 -0
  196. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/TimeSeries.py +0 -0
  197. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/Trace.py +0 -0
  198. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/UMAPPoints.py +0 -0
  199. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/ValidationResult.py +0 -0
  200. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/VectorDriftMetricEnum.py +0 -0
  201. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/__init__.py +0 -0
  202. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/node.py +0 -0
  203. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/api/types/pagination.py +0 -0
  204. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/app.py +0 -0
  205. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/grpc_server.py +0 -0
  206. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/main.py +0 -0
  207. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/openapi/__init__.py +0 -0
  208. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/openapi/docs.py +0 -0
  209. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/prometheus.py +0 -0
  210. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/static/apple-touch-icon-114x114.png +0 -0
  211. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/static/apple-touch-icon-120x120.png +0 -0
  212. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/static/apple-touch-icon-144x144.png +0 -0
  213. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/static/apple-touch-icon-152x152.png +0 -0
  214. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/static/apple-touch-icon-180x180.png +0 -0
  215. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/static/apple-touch-icon-72x72.png +0 -0
  216. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/static/apple-touch-icon-76x76.png +0 -0
  217. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/static/apple-touch-icon.png +0 -0
  218. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/static/favicon.ico +0 -0
  219. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/static/index.css +0 -0
  220. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/static/index.js +0 -0
  221. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/static/modernizr.js +0 -0
  222. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/telemetry.py +0 -0
  223. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/templates/__init__.py +0 -0
  224. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/templates/index.html +0 -0
  225. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/server/thread_server.py +0 -0
  226. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/services.py +0 -0
  227. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/session/__init__.py +0 -0
  228. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/session/data_extractor.py +0 -0
  229. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/session/evaluation.py +0 -0
  230. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/session/session.py +0 -0
  231. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/settings.py +0 -0
  232. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/trace/__init__.py +0 -0
  233. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/trace/attributes.py +0 -0
  234. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/trace/dsl/README.md +0 -0
  235. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/trace/dsl/__init__.py +0 -0
  236. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/trace/dsl/filter.py +0 -0
  237. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/trace/dsl/helpers.py +0 -0
  238. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/trace/dsl/query.py +0 -0
  239. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/trace/errors.py +0 -0
  240. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/trace/evaluation_conventions.py +0 -0
  241. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/trace/exporter.py +0 -0
  242. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/trace/fixtures.py +0 -0
  243. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/trace/langchain/__init__.py +0 -0
  244. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/trace/langchain/instrumentor.py +0 -0
  245. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/trace/llama_index/__init__.py +0 -0
  246. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/trace/llama_index/callback.py +0 -0
  247. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/trace/openai/__init__.py +0 -0
  248. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/trace/openai/instrumentor.py +0 -0
  249. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/trace/otel.py +0 -0
  250. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/trace/projects.py +0 -0
  251. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/trace/schemas.py +0 -0
  252. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/trace/span_evaluations.py +0 -0
  253. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/trace/span_json_decoder.py +0 -0
  254. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/trace/span_json_encoder.py +0 -0
  255. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/trace/trace_dataset.py +0 -0
  256. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/trace/utils.py +0 -0
  257. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/trace/v1/__init__.py +0 -0
  258. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/trace/v1/evaluation_pb2.py +0 -0
  259. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/trace/v1/evaluation_pb2.pyi +0 -0
  260. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/utilities/__init__.py +0 -0
  261. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/utilities/deprecation.py +0 -0
  262. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/utilities/error_handling.py +0 -0
  263. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/utilities/json.py +0 -0
  264. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/utilities/logging.py +0 -0
  265. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/utilities/project.py +0 -0
  266. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/utilities/re.py +0 -0
  267. {arize_phoenix-4.4.4rc6 → arize_phoenix-4.6.1}/src/phoenix/utilities/span_store.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: arize-phoenix
3
- Version: 4.4.4rc6
3
+ Version: 4.6.1
4
4
  Summary: AI Observability and Evaluation
5
5
  Project-URL: Documentation, https://docs.arize.com/phoenix/
6
6
  Project-URL: Issues, https://github.com/Arize-ai/phoenix/issues
@@ -41,6 +41,7 @@ Requires-Dist: protobuf<6.0,>=3.20
41
41
  Requires-Dist: psutil
42
42
  Requires-Dist: pyarrow
43
43
  Requires-Dist: python-multipart
44
+ Requires-Dist: pyyaml
44
45
  Requires-Dist: scikit-learn
45
46
  Requires-Dist: scipy
46
47
  Requires-Dist: sqlalchemy[asyncio]<3,>=2.0.4
@@ -94,9 +95,10 @@ Requires-Dist: types-tabulate; extra == 'dev'
94
95
  Provides-Extra: evals
95
96
  Provides-Extra: experimental
96
97
  Provides-Extra: llama-index
97
- Requires-Dist: llama-index-embeddings-openai; extra == 'llama-index'
98
- Requires-Dist: llama-index-llms-openai; extra == 'llama-index'
99
- Requires-Dist: llama-index-readers-file; extra == 'llama-index'
98
+ Requires-Dist: llama-index-agent-openai==0.2.7; extra == 'llama-index'
99
+ Requires-Dist: llama-index-embeddings-openai==0.1.10; extra == 'llama-index'
100
+ Requires-Dist: llama-index-llms-openai==0.1.24; extra == 'llama-index'
101
+ Requires-Dist: llama-index-readers-file==0.1.25; extra == 'llama-index'
100
102
  Requires-Dist: llama-index==0.10.51; extra == 'llama-index'
101
103
  Provides-Extra: pg
102
104
  Requires-Dist: asyncpg; extra == 'pg'
@@ -59,6 +59,7 @@ dependencies = [
59
59
  "cachetools",
60
60
  "python-multipart", # see https://www.starlette.io/#dependencies
61
61
  "arize-phoenix-evals>=0.13.1",
62
+ "pyyaml", # for OpenAPI
62
63
  ]
63
64
  dynamic = ["version"]
64
65
 
@@ -94,9 +95,10 @@ evals = []
94
95
  experimental = []
95
96
  llama-index = [
96
97
  "llama-index==0.10.51", # always pin to a version that keeps our notebooks working
97
- "llama-index-readers-file",
98
- "llama-index-llms-openai",
99
- "llama-index-embeddings-openai",
98
+ "llama-index-readers-file==0.1.25",
99
+ "llama-index-llms-openai==0.1.24",
100
+ "llama-index-embeddings-openai==0.1.10",
101
+ "llama-index-agent-openai==0.2.7",
100
102
  ]
101
103
  pg = [
102
104
  "asyncpg",
@@ -91,8 +91,8 @@ class UtcTimeStamp(TypeDecorator[datetime]):
91
91
  return normalize_datetime(value, timezone.utc)
92
92
 
93
93
 
94
- class ExperimentResult(TypedDict, total=False):
95
- result: Any
94
+ class ExperimentRunOutput(TypedDict, total=False):
95
+ task_output: Any
96
96
 
97
97
 
98
98
  class Base(DeclarativeBase):
@@ -110,7 +110,7 @@ class Base(DeclarativeBase):
110
110
  type_annotation_map = {
111
111
  Dict[str, Any]: JsonDict,
112
112
  List[Dict[str, Any]]: JsonList,
113
- ExperimentResult: JsonDict,
113
+ ExperimentRunOutput: JsonDict,
114
114
  }
115
115
 
116
116
 
@@ -561,7 +561,7 @@ class ExperimentRun(Base):
561
561
  )
562
562
  repetition_number: Mapped[int]
563
563
  trace_id: Mapped[Optional[str]]
564
- output: Mapped[ExperimentResult]
564
+ output: Mapped[ExperimentRunOutput]
565
565
  start_time: Mapped[datetime] = mapped_column(UtcTimeStamp)
566
566
  end_time: Mapped[datetime] = mapped_column(UtcTimeStamp)
567
567
  prompt_token_count: Mapped[Optional[int]]
@@ -6,7 +6,7 @@ from typing import Any, Awaitable, Callable, Optional, Union
6
6
 
7
7
  from typing_extensions import TypeAlias
8
8
 
9
- from phoenix.experiments.evaluators.utils import validate_signature
9
+ from phoenix.experiments.evaluators.utils import validate_evaluator_signature
10
10
  from phoenix.experiments.types import (
11
11
  AnnotatorKind,
12
12
  EvaluationResult,
@@ -108,7 +108,7 @@ class Evaluator(ABC):
108
108
 
109
109
  def _validate_sig(fn: Callable[..., Any], fn_name: str) -> None:
110
110
  sig = inspect.signature(fn)
111
- validate_signature(sig)
111
+ validate_evaluator_signature(sig)
112
112
  for param in sig.parameters.values():
113
113
  if param.kind is inspect.Parameter.VAR_KEYWORD:
114
114
  return
@@ -8,6 +8,7 @@ from phoenix.experiments.types import (
8
8
  EvaluationResult,
9
9
  JSONSerializable,
10
10
  )
11
+ from phoenix.experiments.utils import get_func_name
11
12
 
12
13
  if TYPE_CHECKING:
13
14
  from phoenix.experiments.evaluators.base import Evaluator
@@ -25,11 +26,11 @@ def unwrap_json(obj: JSONSerializable) -> JSONSerializable:
25
26
  return obj
26
27
 
27
28
 
28
- def validate_signature(sig: inspect.Signature) -> None:
29
+ def validate_evaluator_signature(sig: inspect.Signature) -> None:
29
30
  # Check that the wrapped function has a valid signature for use as an evaluator
30
31
  # If it does not, raise an error to exit early before running evaluations
31
32
  params = sig.parameters
32
- valid_named_params = {"input", "output", "expected", "metadata"}
33
+ valid_named_params = {"input", "output", "expected", "reference", "metadata"}
33
34
  if len(params) == 0:
34
35
  raise ValueError("Evaluation function must have at least one parameter.")
35
36
  if len(params) > 1:
@@ -49,11 +50,12 @@ def validate_signature(sig: inspect.Signature) -> None:
49
50
  )
50
51
 
51
52
 
52
- def _bind_signature(sig: inspect.Signature, **kwargs: Any) -> inspect.BoundArguments:
53
+ def _bind_evaluator_signature(sig: inspect.Signature, **kwargs: Any) -> inspect.BoundArguments:
53
54
  parameter_mapping = {
54
55
  "input": kwargs.get("input"),
55
56
  "output": kwargs.get("output"),
56
57
  "expected": kwargs.get("expected"),
58
+ "reference": kwargs.get("reference"), # `reference` is an alias for `expected`
57
59
  "metadata": kwargs.get("metadata"),
58
60
  }
59
61
  params = sig.parameters
@@ -82,16 +84,11 @@ def create_evaluator(
82
84
  def wrapper(func: Callable[..., Any]) -> "Evaluator":
83
85
  nonlocal name
84
86
  if not name:
85
- if hasattr(func, "__self__"):
86
- name = func.__self__.__class__.__name__
87
- elif hasattr(func, "__name__"):
88
- name = func.__name__
89
- else:
90
- name = str(func)
87
+ name = get_func_name(func)
91
88
  assert name is not None
92
89
 
93
90
  wrapped_signature = inspect.signature(func)
94
- validate_signature(wrapped_signature)
91
+ validate_evaluator_signature(wrapped_signature)
95
92
 
96
93
  if inspect.iscoroutinefunction(func):
97
94
  return _wrap_coroutine_evaluation_function(name, kind, wrapped_signature, scorer)(func)
@@ -120,7 +117,7 @@ def _wrap_coroutine_evaluation_function(
120
117
  return await func(*args, **kwargs)
121
118
 
122
119
  async def async_evaluate(self, **kwargs: Any) -> EvaluationResult:
123
- bound_signature = _bind_signature(sig, **kwargs)
120
+ bound_signature = _bind_evaluator_signature(sig, **kwargs)
124
121
  result = await func(*bound_signature.args, **bound_signature.kwargs)
125
122
  return convert_to_score(result)
126
123
 
@@ -148,7 +145,7 @@ def _wrap_sync_evaluation_function(
148
145
  return func(*args, **kwargs)
149
146
 
150
147
  def evaluate(self, **kwargs: Any) -> EvaluationResult:
151
- bound_signature = _bind_signature(sig, **kwargs)
148
+ bound_signature = _bind_evaluator_signature(sig, **kwargs)
152
149
  result = func(*bound_signature.args, **bound_signature.kwargs)
153
150
  return convert_to_score(result)
154
151
 
@@ -1,5 +1,7 @@
1
1
  import functools
2
+ import inspect
2
3
  import json
4
+ import traceback
3
5
  from binascii import hexlify
4
6
  from contextlib import ExitStack
5
7
  from copy import deepcopy
@@ -10,6 +12,7 @@ from typing import (
10
12
  Any,
11
13
  Awaitable,
12
14
  Dict,
15
+ Literal,
13
16
  Mapping,
14
17
  Optional,
15
18
  Sequence,
@@ -58,8 +61,8 @@ from phoenix.experiments.types import (
58
61
  Experiment,
59
62
  ExperimentEvaluationRun,
60
63
  ExperimentParameters,
61
- ExperimentResult,
62
64
  ExperimentRun,
65
+ ExperimentRunOutput,
63
66
  ExperimentTask,
64
67
  RanExperiment,
65
68
  TaskSummary,
@@ -67,7 +70,7 @@ from phoenix.experiments.types import (
67
70
  _asdict,
68
71
  _replace,
69
72
  )
70
- from phoenix.experiments.utils import get_dataset_experiments_url, get_experiment_url
73
+ from phoenix.experiments.utils import get_dataset_experiments_url, get_experiment_url, get_func_name
71
74
  from phoenix.trace.attributes import flatten
72
75
  from phoenix.utilities.json import jsonify
73
76
 
@@ -105,6 +108,61 @@ def run_experiment(
105
108
  dry_run: Union[bool, int] = False,
106
109
  print_summary: bool = True,
107
110
  ) -> RanExperiment:
111
+ """
112
+ Runs an experiment on a given dataset of examples.
113
+
114
+ An experiment is a user-defined task that runs on each example in a dataset. The results from
115
+ each experiment can be evaluated using any number of evaluators to measure the behavior of the
116
+ task. The experiment and evaluation results are stored in the Phoenix database for comparison
117
+ and analysis.
118
+
119
+ A `task` is either a synchronous or asynchronous function that returns a JSON serializable
120
+ output. If the `task` is a function of one argument then that argument will be bound to the
121
+ `input` field of the dataset example. Alternatively, the `task` can be a function of any
122
+ combination of specific argument names that will be bound to special values:
123
+ `input`: The input field of the dataset example
124
+ `expected`: The expected or reference output of the dataset example
125
+ `reference`: An alias for `expected`
126
+ `metadata`: Metadata associated with the dataset example
127
+ `example`: The dataset `Example` object with all associated fields
128
+
129
+ An `evaluator` is either a synchronous or asynchronous function that returns either a boolean
130
+ or numeric "score". If the `evaluator` is a function of one argument then that argument will be
131
+ bound to the `output` of the task. Alternatively, the `evaluator` can be a function of any
132
+ combination of specific argument names that will be bound to special values:
133
+ `input`: The input field of the dataset example
134
+ `output`: The output of the task
135
+ `expected`: The expected or reference output of the dataset example
136
+ `reference`: An alias for `expected`
137
+ `metadata`: Metadata associated with the dataset example
138
+
139
+ Phoenix also provides pre-built evaluators in the `phoenix.experiments.evaluators` module.
140
+
141
+ Args:
142
+ dataset (Dataset): The dataset on which to run the experiment.
143
+ task (ExperimentTask): The task to run on each example in the dataset.
144
+ evaluators (Optional[Evaluators]): A single evaluator or sequence of evaluators used to
145
+ evaluate the results of the experiment. Defaults to None.
146
+ experiment_name (Optional[str]): The name of the experiment. Defaults to None.
147
+ experiment_description (Optional[str]): A description of the experiment. Defaults to None.
148
+ experiment_metadata (Optional[Mapping[str, Any]]): Metadata to associate with the
149
+ experiment. Defaults to None.
150
+ rate_limit_errors (Optional[BaseException | Sequence[BaseException]]): An exception or
151
+ sequence of exceptions to adaptively throttle on. Defaults to None.
152
+ dry_run (bool | int): Run the experiment in dry-run mode. When set, experiment results will
153
+ not be recorded in Phoenix. If True, the experiment will run on a random dataset
154
+ example. If an integer, the experiment will run on a random sample of the dataset
155
+ examples of the given size. Defaults to False.
156
+ print_summary (bool): Whether to print a summary of the experiment and evaluation results.
157
+ Defaults to True.
158
+
159
+ Returns:
160
+ RanExperiment: The results of the experiment and evaluation. Additional evaluations can be
161
+ added to the experiment using the `evaluate_experiment` function.
162
+ """
163
+ task_signature = inspect.signature(task)
164
+ _validate_task_signature(task_signature)
165
+
108
166
  if not dataset.examples:
109
167
  raise ValueError(f"Dataset has no examples: {dataset.id=}, {dataset.version_id=}")
110
168
  # Add this to the params once supported in the UI
@@ -146,7 +204,7 @@ def run_experiment(
146
204
  )
147
205
 
148
206
  tracer, resource = _get_tracer(experiment.project_name)
149
- root_span_name = f"Task: {_get_task_name(task)}"
207
+ root_span_name = f"Task: {get_func_name(task)}"
150
208
  root_span_kind = CHAIN
151
209
 
152
210
  print("🧪 Experiment started.")
@@ -183,25 +241,37 @@ def run_experiment(
183
241
  # Do not use keyword arguments, which can fail at runtime
184
242
  # even when function obeys protocol, because keyword arguments
185
243
  # are implementation details.
186
- _output = task(example)
244
+ bound_task_args = _bind_task_signature(task_signature, example)
245
+ _output = task(*bound_task_args.args, **bound_task_args.kwargs)
187
246
  if isinstance(_output, Awaitable):
188
- raise RuntimeError("Task is async but running in sync context")
247
+ sync_error_message = (
248
+ "Task is async and cannot be run within an existing event loop. "
249
+ "Consider the following options:\n\n"
250
+ "1. Pass in a synchronous task callable.\n"
251
+ "2. Use `nest_asyncio.apply()` to allow nesting event loops."
252
+ )
253
+ raise RuntimeError(sync_error_message)
189
254
  else:
190
255
  output = _output
191
256
  except BaseException as exc:
192
257
  span.record_exception(exc)
193
258
  status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
194
259
  error = exc
260
+ _print_experiment_error(
261
+ exc,
262
+ example_id=example.id,
263
+ repetition_number=repetition_number,
264
+ kind="task",
265
+ )
195
266
  output = jsonify(output)
196
267
  span.set_attribute(INPUT_VALUE, json.dumps(example.input, ensure_ascii=False))
197
268
  span.set_attribute(INPUT_MIME_TYPE, JSON.value)
198
- if result := ExperimentResult(result=output) if output is not None else None:
269
+ if output is not None:
199
270
  if isinstance(output, str):
200
271
  span.set_attribute(OUTPUT_VALUE, output)
201
272
  else:
202
273
  span.set_attribute(OUTPUT_VALUE, json.dumps(output, ensure_ascii=False))
203
274
  span.set_attribute(OUTPUT_MIME_TYPE, JSON.value)
204
- span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
205
275
  span.set_attribute(SpanAttributes.OPENINFERENCE_SPAN_KIND, root_span_kind)
206
276
  span.set_status(status)
207
277
 
@@ -214,7 +284,7 @@ def run_experiment(
214
284
  experiment_id=experiment.id,
215
285
  dataset_example_id=example.id,
216
286
  repetition_number=repetition_number,
217
- output=result,
287
+ experiment_run_output=ExperimentRunOutput(task_output=output),
218
288
  error=repr(error) if error else None,
219
289
  trace_id=_str_trace_id(span.get_span_context().trace_id), # type: ignore[no-untyped-call]
220
290
  )
@@ -238,7 +308,8 @@ def run_experiment(
238
308
  # Do not use keyword arguments, which can fail at runtime
239
309
  # even when function obeys protocol, because keyword arguments
240
310
  # are implementation details.
241
- _output = task(example)
311
+ bound_task_args = _bind_task_signature(task_signature, example)
312
+ _output = task(*bound_task_args.args, **bound_task_args.kwargs)
242
313
  if isinstance(_output, Awaitable):
243
314
  output = await _output
244
315
  else:
@@ -247,16 +318,21 @@ def run_experiment(
247
318
  span.record_exception(exc)
248
319
  status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
249
320
  error = exc
321
+ _print_experiment_error(
322
+ exc,
323
+ example_id=example.id,
324
+ repetition_number=repetition_number,
325
+ kind="task",
326
+ )
250
327
  output = jsonify(output)
251
328
  span.set_attribute(INPUT_VALUE, json.dumps(example.input, ensure_ascii=False))
252
329
  span.set_attribute(INPUT_MIME_TYPE, JSON.value)
253
- if result := ExperimentResult(result=output) if output is not None else None:
330
+ if output is not None:
254
331
  if isinstance(output, str):
255
332
  span.set_attribute(OUTPUT_VALUE, output)
256
333
  else:
257
334
  span.set_attribute(OUTPUT_VALUE, json.dumps(output, ensure_ascii=False))
258
335
  span.set_attribute(OUTPUT_MIME_TYPE, JSON.value)
259
- span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
260
336
  span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
261
337
  span.set_status(status)
262
338
 
@@ -269,7 +345,7 @@ def run_experiment(
269
345
  experiment_id=experiment.id,
270
346
  dataset_example_id=example.id,
271
347
  repetition_number=repetition_number,
272
- output=result,
348
+ experiment_run_output=ExperimentRunOutput(task_output=output),
273
349
  error=repr(error) if error else None,
274
350
  trace_id=_str_trace_id(span.get_span_context().trace_id), # type: ignore[no-untyped-call]
275
351
  )
@@ -422,8 +498,9 @@ def evaluate_experiment(
422
498
  stack.enter_context(capture_spans(resource))
423
499
  try:
424
500
  result = evaluator.evaluate(
425
- output=experiment_run.task_output,
501
+ output=experiment_run.output,
426
502
  expected=example.output,
503
+ reference=example.output,
427
504
  input=example.input,
428
505
  metadata=example.metadata,
429
506
  )
@@ -431,6 +508,12 @@ def evaluate_experiment(
431
508
  span.record_exception(exc)
432
509
  status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
433
510
  error = exc
511
+ _print_experiment_error(
512
+ exc,
513
+ example_id=example.id,
514
+ repetition_number=experiment_run.repetition_number,
515
+ kind="evaluator",
516
+ )
434
517
  if result:
435
518
  span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
436
519
  span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
@@ -467,8 +550,9 @@ def evaluate_experiment(
467
550
  stack.enter_context(capture_spans(resource))
468
551
  try:
469
552
  result = await evaluator.async_evaluate(
470
- output=experiment_run.task_output,
553
+ output=experiment_run.output,
471
554
  expected=example.output,
555
+ reference=example.output,
472
556
  input=example.input,
473
557
  metadata=example.metadata,
474
558
  )
@@ -476,6 +560,12 @@ def evaluate_experiment(
476
560
  span.record_exception(exc)
477
561
  status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
478
562
  error = exc
563
+ _print_experiment_error(
564
+ exc,
565
+ example_id=example.id,
566
+ repetition_number=experiment_run.repetition_number,
567
+ kind="evaluator",
568
+ )
479
569
  if result:
480
570
  span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
481
571
  span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
@@ -584,20 +674,71 @@ def _decode_unix_nano(time_unix_nano: int) -> datetime:
584
674
  return datetime.fromtimestamp(time_unix_nano / 1e9, tz=timezone.utc)
585
675
 
586
676
 
587
- def _get_task_name(task: ExperimentTask) -> str:
588
- """
589
- Makes a best-effort attempt to get the name of the task.
590
- """
677
def _is_dry_run(obj: Any) -> bool:
    """
    Report whether ``obj`` carries a dry-run identifier.

    Returns True only when ``obj`` has an ``id`` attribute that is a string
    starting with the module-level ``DRY_RUN`` prefix; otherwise False.
    """
    if not hasattr(obj, "id"):
        return False
    identifier = obj.id
    if not isinstance(identifier, str):
        return False
    return identifier.startswith(DRY_RUN)
591
679
 
592
- if isinstance(task, functools.partial):
593
- return task.func.__qualname__
594
- if hasattr(task, "__qualname__"):
595
- return task.__qualname__
596
- return str(task)
597
680
 
681
def _validate_task_signature(sig: inspect.Signature) -> None:
    """
    Check that a function signature is usable as an experiment task.

    Validation happens up front so that a badly shaped task fails early,
    before any example of the experiment is run.

    Rules:
      * The task must declare at least one parameter.
      * A single-parameter task may use any parameter name.
      * A multi-parameter task may only use the recognized names below.
        Parameters that have a default value, and a ``**kwargs`` catch-all,
        are exempt because the task can still be called without them.

    Args:
        sig: The signature of the task callable.

    Raises:
        ValueError: If the signature has no parameters, or if a
            multi-parameter signature contains unrecognized required
            parameter names. All offending names are listed in the message.
    """
    # A tuple (not a set) keeps the order of the names in the error message stable.
    valid_named_params = ("input", "expected", "reference", "metadata", "example")
    params = sig.parameters
    if not params:
        raise ValueError("Task function must have at least one parameter.")
    if len(params) == 1:
        return
    # Collect every offending name (in declaration order) so the user can fix
    # them all at once instead of discovering them one error at a time.
    invalid_names = [
        name
        for name, param in params.items()
        if name not in valid_named_params
        and param.kind is not inspect.Parameter.VAR_KEYWORD
        and param.default is inspect.Parameter.empty
    ]
    if invalid_names:
        raise ValueError(
            f"Invalid parameter names in task function: {', '.join(invalid_names)}. "
            "Parameters names for multi-argument functions must be "
            f"any of: {', '.join(valid_named_params)}."
        )
598
703
 
599
- def _is_dry_run(obj: Any) -> bool:
600
- return hasattr(obj, "id") and isinstance(obj.id, str) and obj.id.startswith(DRY_RUN)
704
+
705
def _bind_task_signature(sig: inspect.Signature, example: "Example") -> inspect.BoundArguments:
    """
    Bind the fields of a dataset example to the parameters of a task.

    Recognized parameter names and the values they receive:
      * ``input``: ``example.input``
      * ``expected``: ``example.output``
      * ``reference``: ``example.output`` (alias for ``expected``)
      * ``metadata``: ``example.metadata``
      * ``example``: the example object itself

    A task with exactly one parameter receives the value matching its name,
    or ``example.input`` when the name is not one of the recognized names.
    A task with several parameters receives only the recognized names it
    declares; any other parameter is left unbound.

    Args:
        sig: The signature of the task callable.
        example: The dataset example supplying the values.

    Returns:
        The bound arguments; call the task with ``*bound.args, **bound.kwargs``.
    """
    values_by_name = {
        "input": example.input,
        "expected": example.output,
        "reference": example.output,  # Alias for "expected"
        "metadata": example.metadata,
        "example": example,
    }
    params = sig.parameters
    if len(params) == 1:
        name, param = next(iter(params.items()))
        value = values_by_name.get(name, example.input)
        if param.kind is inspect.Parameter.KEYWORD_ONLY:
            # A keyword-only parameter cannot be bound positionally; passing the
            # value by position would make `sig.bind` raise a TypeError.
            return sig.bind(**{name: value})
        return sig.bind(value)
    # `bind_partial` tolerates parameters left unbound (e.g. ones with defaults).
    return sig.bind_partial(
        **{name: values_by_name[name] for name in params if name in values_by_name}
    )
723
+
724
+
725
def _print_experiment_error(
    error: BaseException,
    /,
    *,
    example_id: str,
    repetition_number: int,
    kind: Literal["evaluator", "task"],
) -> None:
    """
    Print a task or evaluator failure to stdout in red.

    The original exception is attached as the cause of a synthetic
    ``RuntimeError`` that names the failing example and repetition, so the
    printed traceback shows both the underlying error and where in the
    experiment it occurred.

    Args:
        error: The exception raised by the task or evaluator.
        example_id: Identifier of the dataset example being processed.
        repetition_number: Repetition of the run that failed.
        kind: Whether a ``"task"`` or an ``"evaluator"`` failed.
    """
    display_error = RuntimeError(
        f"{kind} failed for example id {repr(example_id)}, "
        f"repetition {repr(repetition_number)}"
    )
    display_error.__cause__ = error
    # Use the three-argument form: passing only the exception instance is
    # accepted from Python 3.10 onwards and raises TypeError on older versions.
    formatted_exception = "".join(
        traceback.format_exception(
            type(display_error),
            display_error,
            display_error.__traceback__,
        )
    )
    print("\033[91m" + formatted_exception + "\033[0m")  # prints in red
601
742
 
602
743
 
603
744
  class _NoOpProcessor(trace_sdk.SpanProcessor):
@@ -103,9 +103,9 @@ class Example:
103
103
  identifiers = [f'{spaces}id="{self.id}",']
104
104
  contents = [
105
105
  spaces
106
- + f"{k}="
106
+ + f"{_blue(key)}="
107
107
  + json.dumps(
108
- _shorten(v),
108
+ _shorten(value),
109
109
  ensure_ascii=False,
110
110
  sort_keys=True,
111
111
  indent=len(spaces),
@@ -113,8 +113,8 @@ class Example:
113
113
  .replace("\n", f"\n{spaces}")
114
114
  .replace(' "..."\n', " ...\n")
115
115
  + ","
116
- for k in ("input", "output", "metadata")
117
- if (v := getattr(self, k, None))
116
+ for key in ("input", "output", "metadata")
117
+ if (value := getattr(self, key, None))
118
118
  ]
119
119
  return "\n".join([f"{name}(", *identifiers, *contents, ")"])
120
120
 
@@ -199,17 +199,17 @@ class Experiment:
199
199
 
200
200
 
201
201
  @dataclass(frozen=True)
202
- class ExperimentResult:
203
- result: TaskOutput
202
+ class ExperimentRunOutput:
203
+ task_output: TaskOutput
204
204
 
205
205
  def __post_init__(self) -> None:
206
- object.__setattr__(self, "result", _make_read_only(self.result))
206
+ object.__setattr__(self, "task_output", _make_read_only(self.task_output))
207
207
 
208
208
  @classmethod
209
- def from_dict(cls, obj: Optional[Mapping[str, Any]]) -> Optional[ExperimentResult]:
209
+ def from_dict(cls, obj: Optional[Mapping[str, Any]]) -> ExperimentRunOutput:
210
210
  if not obj:
211
- return None
212
- return cls(result=obj["result"])
211
+ return cls(task_output=None)
212
+ return cls(task_output=obj["task_output"])
213
213
 
214
214
 
215
215
  @dataclass(frozen=True)
@@ -219,14 +219,14 @@ class ExperimentRun:
219
219
  experiment_id: ExperimentId
220
220
  dataset_example_id: ExampleId
221
221
  repetition_number: RepetitionNumber
222
- output: Optional[ExperimentResult] = None
222
+ experiment_run_output: ExperimentRunOutput
223
223
  error: Optional[str] = None
224
224
  id: ExperimentRunId = field(default_factory=_dry_run_id)
225
225
  trace_id: Optional[TraceId] = None
226
226
 
227
227
  @property
228
- def task_output(self) -> Optional[TaskOutput]:
229
- return deepcopy(self.output.result) if self.output else None
228
+ def output(self) -> Optional[TaskOutput]:
229
+ return deepcopy(self.experiment_run_output.task_output)
230
230
 
231
231
  @classmethod
232
232
  def from_dict(cls, obj: Mapping[str, Any]) -> ExperimentRun:
@@ -236,15 +236,15 @@ class ExperimentRun:
236
236
  experiment_id=obj["experiment_id"],
237
237
  dataset_example_id=obj["dataset_example_id"],
238
238
  repetition_number=obj.get("repetition_number") or 1,
239
- output=ExperimentResult.from_dict(obj["output"]),
239
+ experiment_run_output=ExperimentRunOutput.from_dict(obj["experiment_run_output"]),
240
240
  error=obj.get("error"),
241
241
  id=obj["id"],
242
242
  trace_id=obj.get("trace_id"),
243
243
  )
244
244
 
245
245
  def __post_init__(self) -> None:
246
- if bool(self.output) == bool(self.error):
247
- ValueError("Must specify either result or error")
246
+ if bool(self.experiment_run_output) == bool(self.error):
247
+ ValueError("Must specify exactly one of experiment_run_output or error")
248
248
 
249
249
 
250
250
  @dataclass(frozen=True)
@@ -571,7 +571,7 @@ class RanExperiment(Experiment):
571
571
  {
572
572
  "run_id": run.id,
573
573
  "error": run.error,
574
- "result": deepcopy(run.output.result) if run.output else None,
574
+ "output": deepcopy(run.experiment_run_output.task_output),
575
575
  "input": deepcopy((ex := self.dataset.examples[run.dataset_example_id]).input),
576
576
  "expected": deepcopy(ex.output),
577
577
  "metadata": deepcopy(ex.metadata),
@@ -688,6 +688,10 @@ class _ExperimentRunWithExample(ObjectProxy): # type: ignore[misc]
688
688
  def expected(self) -> ExampleOutput:
689
689
  return deepcopy(self._self_example.output)
690
690
 
691
+ @property
692
+ def reference(self) -> ExampleOutput:
693
+ return deepcopy(self._self_example.output)
694
+
691
695
  @property
692
696
  def input(self) -> ExampleInput:
693
697
  return deepcopy(self._self_example.input)
@@ -703,20 +707,47 @@ class _ExperimentRunWithExample(ObjectProxy): # type: ignore[misc]
703
707
  f'{spaces}id="{self.id}",',
704
708
  f'{spaces}example_id="{self.dataset_example_id}",',
705
709
  ]
706
- contents = [
710
+ outputs = [
711
+ *([f'{spaces}error="{self.error}",'] if self.error else []),
712
+ *(
713
+ [
714
+ f"{spaces}{_blue('output')}="
715
+ + json.dumps(
716
+ _shorten(self.output),
717
+ ensure_ascii=False,
718
+ sort_keys=True,
719
+ indent=len(spaces),
720
+ )
721
+ .replace("\n", f"\n{spaces}")
722
+ .replace(' "..."\n', " ...\n")
723
+ ]
724
+ if not self.error
725
+ else []
726
+ ),
727
+ ]
728
+ dicts = [
707
729
  spaces
708
- + f"{k}="
709
- + json.dumps(_shorten(v), ensure_ascii=False, sort_keys=True, indent=len(spaces))
730
+ + f"{_blue(alias)}={{"
731
+ + (f" # {comment}" if comment else "")
732
+ + json.dumps(
733
+ _shorten(value),
734
+ ensure_ascii=False,
735
+ sort_keys=True,
736
+ indent=len(spaces),
737
+ )[1:]
710
738
  .replace("\n", f"\n{spaces}")
711
739
  .replace(' "..."\n', " ...\n")
712
740
  + ","
713
- for k, v in {
714
- "error": self.error,
715
- "output": self.task_output,
716
- "expected": self.expected,
717
- "input": self.input,
718
- "metadata": self.metadata,
719
- }.items()
720
- if v
741
+ for alias, value, comment in (
742
+ ("expected", self.expected, f"alias for the example.{_blue('output')} dict"),
743
+ ("reference", self.reference, f"alias for the example.{_blue('output')} dict"),
744
+ ("input", self.input, f"alias for the example.{_blue('input')} dict"),
745
+ ("metadata", self.metadata, f"alias for the example.{_blue('metadata')} dict"),
746
+ )
747
+ if value
721
748
  ]
722
- return "\n".join([f"{name}(", *identifiers, *contents, ")"])
749
+ return "\n".join([f"{name}(", *identifiers, *outputs, *dicts, ")"])
750
+
751
+
752
def _blue(text: str) -> str:
    """Return ``text`` wrapped in ANSI codes for bold, bright-blue output, then reset."""
    bold_bright_blue = "\033[1m\033[94m"
    reset = "\033[0m"
    return "{}{}{}".format(bold_bright_blue, text, reset)