arize-phoenix 3.16.1__py3-none-any.whl → 7.7.0__py3-none-any.whl

This diff shows the changes between publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.

Potentially problematic release.


This version of arize-phoenix might be problematic.

Files changed (338)
  1. arize_phoenix-7.7.0.dist-info/METADATA +261 -0
  2. arize_phoenix-7.7.0.dist-info/RECORD +345 -0
  3. {arize_phoenix-3.16.1.dist-info → arize_phoenix-7.7.0.dist-info}/WHEEL +1 -1
  4. arize_phoenix-7.7.0.dist-info/entry_points.txt +3 -0
  5. phoenix/__init__.py +86 -14
  6. phoenix/auth.py +309 -0
  7. phoenix/config.py +675 -45
  8. phoenix/core/model.py +32 -30
  9. phoenix/core/model_schema.py +102 -109
  10. phoenix/core/model_schema_adapter.py +48 -45
  11. phoenix/datetime_utils.py +24 -3
  12. phoenix/db/README.md +54 -0
  13. phoenix/db/__init__.py +4 -0
  14. phoenix/db/alembic.ini +85 -0
  15. phoenix/db/bulk_inserter.py +294 -0
  16. phoenix/db/engines.py +208 -0
  17. phoenix/db/enums.py +20 -0
  18. phoenix/db/facilitator.py +113 -0
  19. phoenix/db/helpers.py +159 -0
  20. phoenix/db/insertion/constants.py +2 -0
  21. phoenix/db/insertion/dataset.py +227 -0
  22. phoenix/db/insertion/document_annotation.py +171 -0
  23. phoenix/db/insertion/evaluation.py +191 -0
  24. phoenix/db/insertion/helpers.py +98 -0
  25. phoenix/db/insertion/span.py +193 -0
  26. phoenix/db/insertion/span_annotation.py +158 -0
  27. phoenix/db/insertion/trace_annotation.py +158 -0
  28. phoenix/db/insertion/types.py +256 -0
  29. phoenix/db/migrate.py +86 -0
  30. phoenix/db/migrations/data_migration_scripts/populate_project_sessions.py +199 -0
  31. phoenix/db/migrations/env.py +114 -0
  32. phoenix/db/migrations/script.py.mako +26 -0
  33. phoenix/db/migrations/versions/10460e46d750_datasets.py +317 -0
  34. phoenix/db/migrations/versions/3be8647b87d8_add_token_columns_to_spans_table.py +126 -0
  35. phoenix/db/migrations/versions/4ded9e43755f_create_project_sessions_table.py +66 -0
  36. phoenix/db/migrations/versions/cd164e83824f_users_and_tokens.py +157 -0
  37. phoenix/db/migrations/versions/cf03bd6bae1d_init.py +280 -0
  38. phoenix/db/models.py +807 -0
  39. phoenix/exceptions.py +5 -1
  40. phoenix/experiments/__init__.py +6 -0
  41. phoenix/experiments/evaluators/__init__.py +29 -0
  42. phoenix/experiments/evaluators/base.py +158 -0
  43. phoenix/experiments/evaluators/code_evaluators.py +184 -0
  44. phoenix/experiments/evaluators/llm_evaluators.py +473 -0
  45. phoenix/experiments/evaluators/utils.py +236 -0
  46. phoenix/experiments/functions.py +772 -0
  47. phoenix/experiments/tracing.py +86 -0
  48. phoenix/experiments/types.py +726 -0
  49. phoenix/experiments/utils.py +25 -0
  50. phoenix/inferences/__init__.py +0 -0
  51. phoenix/{datasets → inferences}/errors.py +6 -5
  52. phoenix/{datasets → inferences}/fixtures.py +49 -42
  53. phoenix/{datasets/dataset.py → inferences/inferences.py} +121 -105
  54. phoenix/{datasets → inferences}/schema.py +11 -11
  55. phoenix/{datasets → inferences}/validation.py +13 -14
  56. phoenix/logging/__init__.py +3 -0
  57. phoenix/logging/_config.py +90 -0
  58. phoenix/logging/_filter.py +6 -0
  59. phoenix/logging/_formatter.py +69 -0
  60. phoenix/metrics/__init__.py +5 -4
  61. phoenix/metrics/binning.py +4 -3
  62. phoenix/metrics/metrics.py +2 -1
  63. phoenix/metrics/mixins.py +7 -6
  64. phoenix/metrics/retrieval_metrics.py +2 -1
  65. phoenix/metrics/timeseries.py +5 -4
  66. phoenix/metrics/wrappers.py +9 -3
  67. phoenix/pointcloud/clustering.py +5 -5
  68. phoenix/pointcloud/pointcloud.py +7 -5
  69. phoenix/pointcloud/projectors.py +5 -6
  70. phoenix/pointcloud/umap_parameters.py +53 -52
  71. phoenix/server/api/README.md +28 -0
  72. phoenix/server/api/auth.py +44 -0
  73. phoenix/server/api/context.py +152 -9
  74. phoenix/server/api/dataloaders/__init__.py +91 -0
  75. phoenix/server/api/dataloaders/annotation_summaries.py +139 -0
  76. phoenix/server/api/dataloaders/average_experiment_run_latency.py +54 -0
  77. phoenix/server/api/dataloaders/cache/__init__.py +3 -0
  78. phoenix/server/api/dataloaders/cache/two_tier_cache.py +68 -0
  79. phoenix/server/api/dataloaders/dataset_example_revisions.py +131 -0
  80. phoenix/server/api/dataloaders/dataset_example_spans.py +38 -0
  81. phoenix/server/api/dataloaders/document_evaluation_summaries.py +144 -0
  82. phoenix/server/api/dataloaders/document_evaluations.py +31 -0
  83. phoenix/server/api/dataloaders/document_retrieval_metrics.py +89 -0
  84. phoenix/server/api/dataloaders/experiment_annotation_summaries.py +79 -0
  85. phoenix/server/api/dataloaders/experiment_error_rates.py +58 -0
  86. phoenix/server/api/dataloaders/experiment_run_annotations.py +36 -0
  87. phoenix/server/api/dataloaders/experiment_run_counts.py +49 -0
  88. phoenix/server/api/dataloaders/experiment_sequence_number.py +44 -0
  89. phoenix/server/api/dataloaders/latency_ms_quantile.py +188 -0
  90. phoenix/server/api/dataloaders/min_start_or_max_end_times.py +85 -0
  91. phoenix/server/api/dataloaders/project_by_name.py +31 -0
  92. phoenix/server/api/dataloaders/record_counts.py +116 -0
  93. phoenix/server/api/dataloaders/session_io.py +79 -0
  94. phoenix/server/api/dataloaders/session_num_traces.py +30 -0
  95. phoenix/server/api/dataloaders/session_num_traces_with_error.py +32 -0
  96. phoenix/server/api/dataloaders/session_token_usages.py +41 -0
  97. phoenix/server/api/dataloaders/session_trace_latency_ms_quantile.py +55 -0
  98. phoenix/server/api/dataloaders/span_annotations.py +26 -0
  99. phoenix/server/api/dataloaders/span_dataset_examples.py +31 -0
  100. phoenix/server/api/dataloaders/span_descendants.py +57 -0
  101. phoenix/server/api/dataloaders/span_projects.py +33 -0
  102. phoenix/server/api/dataloaders/token_counts.py +124 -0
  103. phoenix/server/api/dataloaders/trace_by_trace_ids.py +25 -0
  104. phoenix/server/api/dataloaders/trace_root_spans.py +32 -0
  105. phoenix/server/api/dataloaders/user_roles.py +30 -0
  106. phoenix/server/api/dataloaders/users.py +33 -0
  107. phoenix/server/api/exceptions.py +48 -0
  108. phoenix/server/api/helpers/__init__.py +12 -0
  109. phoenix/server/api/helpers/dataset_helpers.py +217 -0
  110. phoenix/server/api/helpers/experiment_run_filters.py +763 -0
  111. phoenix/server/api/helpers/playground_clients.py +948 -0
  112. phoenix/server/api/helpers/playground_registry.py +70 -0
  113. phoenix/server/api/helpers/playground_spans.py +455 -0
  114. phoenix/server/api/input_types/AddExamplesToDatasetInput.py +16 -0
  115. phoenix/server/api/input_types/AddSpansToDatasetInput.py +14 -0
  116. phoenix/server/api/input_types/ChatCompletionInput.py +38 -0
  117. phoenix/server/api/input_types/ChatCompletionMessageInput.py +24 -0
  118. phoenix/server/api/input_types/ClearProjectInput.py +15 -0
  119. phoenix/server/api/input_types/ClusterInput.py +2 -2
  120. phoenix/server/api/input_types/CreateDatasetInput.py +12 -0
  121. phoenix/server/api/input_types/CreateSpanAnnotationInput.py +18 -0
  122. phoenix/server/api/input_types/CreateTraceAnnotationInput.py +18 -0
  123. phoenix/server/api/input_types/DataQualityMetricInput.py +5 -2
  124. phoenix/server/api/input_types/DatasetExampleInput.py +14 -0
  125. phoenix/server/api/input_types/DatasetSort.py +17 -0
  126. phoenix/server/api/input_types/DatasetVersionSort.py +16 -0
  127. phoenix/server/api/input_types/DeleteAnnotationsInput.py +7 -0
  128. phoenix/server/api/input_types/DeleteDatasetExamplesInput.py +13 -0
  129. phoenix/server/api/input_types/DeleteDatasetInput.py +7 -0
  130. phoenix/server/api/input_types/DeleteExperimentsInput.py +7 -0
  131. phoenix/server/api/input_types/DimensionFilter.py +4 -4
  132. phoenix/server/api/input_types/GenerativeModelInput.py +17 -0
  133. phoenix/server/api/input_types/Granularity.py +1 -1
  134. phoenix/server/api/input_types/InvocationParameters.py +162 -0
  135. phoenix/server/api/input_types/PatchAnnotationInput.py +19 -0
  136. phoenix/server/api/input_types/PatchDatasetExamplesInput.py +35 -0
  137. phoenix/server/api/input_types/PatchDatasetInput.py +14 -0
  138. phoenix/server/api/input_types/PerformanceMetricInput.py +5 -2
  139. phoenix/server/api/input_types/ProjectSessionSort.py +29 -0
  140. phoenix/server/api/input_types/SpanAnnotationSort.py +17 -0
  141. phoenix/server/api/input_types/SpanSort.py +134 -69
  142. phoenix/server/api/input_types/TemplateOptions.py +10 -0
  143. phoenix/server/api/input_types/TraceAnnotationSort.py +17 -0
  144. phoenix/server/api/input_types/UserRoleInput.py +9 -0
  145. phoenix/server/api/mutations/__init__.py +28 -0
  146. phoenix/server/api/mutations/api_key_mutations.py +167 -0
  147. phoenix/server/api/mutations/chat_mutations.py +593 -0
  148. phoenix/server/api/mutations/dataset_mutations.py +591 -0
  149. phoenix/server/api/mutations/experiment_mutations.py +75 -0
  150. phoenix/server/api/{types/ExportEventsMutation.py → mutations/export_events_mutations.py} +21 -18
  151. phoenix/server/api/mutations/project_mutations.py +57 -0
  152. phoenix/server/api/mutations/span_annotations_mutations.py +128 -0
  153. phoenix/server/api/mutations/trace_annotations_mutations.py +127 -0
  154. phoenix/server/api/mutations/user_mutations.py +329 -0
  155. phoenix/server/api/openapi/__init__.py +0 -0
  156. phoenix/server/api/openapi/main.py +17 -0
  157. phoenix/server/api/openapi/schema.py +16 -0
  158. phoenix/server/api/queries.py +738 -0
  159. phoenix/server/api/routers/__init__.py +11 -0
  160. phoenix/server/api/routers/auth.py +284 -0
  161. phoenix/server/api/routers/embeddings.py +26 -0
  162. phoenix/server/api/routers/oauth2.py +488 -0
  163. phoenix/server/api/routers/v1/__init__.py +64 -0
  164. phoenix/server/api/routers/v1/datasets.py +1017 -0
  165. phoenix/server/api/routers/v1/evaluations.py +362 -0
  166. phoenix/server/api/routers/v1/experiment_evaluations.py +115 -0
  167. phoenix/server/api/routers/v1/experiment_runs.py +167 -0
  168. phoenix/server/api/routers/v1/experiments.py +308 -0
  169. phoenix/server/api/routers/v1/pydantic_compat.py +78 -0
  170. phoenix/server/api/routers/v1/spans.py +267 -0
  171. phoenix/server/api/routers/v1/traces.py +208 -0
  172. phoenix/server/api/routers/v1/utils.py +95 -0
  173. phoenix/server/api/schema.py +44 -241
  174. phoenix/server/api/subscriptions.py +597 -0
  175. phoenix/server/api/types/Annotation.py +21 -0
  176. phoenix/server/api/types/AnnotationSummary.py +55 -0
  177. phoenix/server/api/types/AnnotatorKind.py +16 -0
  178. phoenix/server/api/types/ApiKey.py +27 -0
  179. phoenix/server/api/types/AuthMethod.py +9 -0
  180. phoenix/server/api/types/ChatCompletionMessageRole.py +11 -0
  181. phoenix/server/api/types/ChatCompletionSubscriptionPayload.py +46 -0
  182. phoenix/server/api/types/Cluster.py +25 -24
  183. phoenix/server/api/types/CreateDatasetPayload.py +8 -0
  184. phoenix/server/api/types/DataQualityMetric.py +31 -13
  185. phoenix/server/api/types/Dataset.py +288 -63
  186. phoenix/server/api/types/DatasetExample.py +85 -0
  187. phoenix/server/api/types/DatasetExampleRevision.py +34 -0
  188. phoenix/server/api/types/DatasetVersion.py +14 -0
  189. phoenix/server/api/types/Dimension.py +32 -31
  190. phoenix/server/api/types/DocumentEvaluationSummary.py +9 -8
  191. phoenix/server/api/types/EmbeddingDimension.py +56 -49
  192. phoenix/server/api/types/Evaluation.py +25 -31
  193. phoenix/server/api/types/EvaluationSummary.py +30 -50
  194. phoenix/server/api/types/Event.py +20 -20
  195. phoenix/server/api/types/ExampleRevisionInterface.py +14 -0
  196. phoenix/server/api/types/Experiment.py +152 -0
  197. phoenix/server/api/types/ExperimentAnnotationSummary.py +13 -0
  198. phoenix/server/api/types/ExperimentComparison.py +17 -0
  199. phoenix/server/api/types/ExperimentRun.py +119 -0
  200. phoenix/server/api/types/ExperimentRunAnnotation.py +56 -0
  201. phoenix/server/api/types/GenerativeModel.py +9 -0
  202. phoenix/server/api/types/GenerativeProvider.py +85 -0
  203. phoenix/server/api/types/Inferences.py +80 -0
  204. phoenix/server/api/types/InferencesRole.py +23 -0
  205. phoenix/server/api/types/LabelFraction.py +7 -0
  206. phoenix/server/api/types/MimeType.py +2 -2
  207. phoenix/server/api/types/Model.py +54 -54
  208. phoenix/server/api/types/PerformanceMetric.py +8 -5
  209. phoenix/server/api/types/Project.py +407 -142
  210. phoenix/server/api/types/ProjectSession.py +139 -0
  211. phoenix/server/api/types/Segments.py +4 -4
  212. phoenix/server/api/types/Span.py +221 -176
  213. phoenix/server/api/types/SpanAnnotation.py +43 -0
  214. phoenix/server/api/types/SpanIOValue.py +15 -0
  215. phoenix/server/api/types/SystemApiKey.py +9 -0
  216. phoenix/server/api/types/TemplateLanguage.py +10 -0
  217. phoenix/server/api/types/TimeSeries.py +19 -15
  218. phoenix/server/api/types/TokenUsage.py +11 -0
  219. phoenix/server/api/types/Trace.py +154 -0
  220. phoenix/server/api/types/TraceAnnotation.py +45 -0
  221. phoenix/server/api/types/UMAPPoints.py +7 -7
  222. phoenix/server/api/types/User.py +60 -0
  223. phoenix/server/api/types/UserApiKey.py +45 -0
  224. phoenix/server/api/types/UserRole.py +15 -0
  225. phoenix/server/api/types/node.py +4 -112
  226. phoenix/server/api/types/pagination.py +156 -57
  227. phoenix/server/api/utils.py +34 -0
  228. phoenix/server/app.py +864 -115
  229. phoenix/server/bearer_auth.py +163 -0
  230. phoenix/server/dml_event.py +136 -0
  231. phoenix/server/dml_event_handler.py +256 -0
  232. phoenix/server/email/__init__.py +0 -0
  233. phoenix/server/email/sender.py +97 -0
  234. phoenix/server/email/templates/__init__.py +0 -0
  235. phoenix/server/email/templates/password_reset.html +19 -0
  236. phoenix/server/email/types.py +11 -0
  237. phoenix/server/grpc_server.py +102 -0
  238. phoenix/server/jwt_store.py +505 -0
  239. phoenix/server/main.py +305 -116
  240. phoenix/server/oauth2.py +52 -0
  241. phoenix/server/openapi/__init__.py +0 -0
  242. phoenix/server/prometheus.py +111 -0
  243. phoenix/server/rate_limiters.py +188 -0
  244. phoenix/server/static/.vite/manifest.json +87 -0
  245. phoenix/server/static/assets/components-Cy9nwIvF.js +2125 -0
  246. phoenix/server/static/assets/index-BKvHIxkk.js +113 -0
  247. phoenix/server/static/assets/pages-CUi2xCVQ.js +4449 -0
  248. phoenix/server/static/assets/vendor-DvC8cT4X.js +894 -0
  249. phoenix/server/static/assets/vendor-DxkFTwjz.css +1 -0
  250. phoenix/server/static/assets/vendor-arizeai-Do1793cv.js +662 -0
  251. phoenix/server/static/assets/vendor-codemirror-BzwZPyJM.js +24 -0
  252. phoenix/server/static/assets/vendor-recharts-_Jb7JjhG.js +59 -0
  253. phoenix/server/static/assets/vendor-shiki-Cl9QBraO.js +5 -0
  254. phoenix/server/static/assets/vendor-three-DwGkEfCM.js +2998 -0
  255. phoenix/server/telemetry.py +68 -0
  256. phoenix/server/templates/index.html +82 -23
  257. phoenix/server/thread_server.py +3 -3
  258. phoenix/server/types.py +275 -0
  259. phoenix/services.py +27 -18
  260. phoenix/session/client.py +743 -68
  261. phoenix/session/data_extractor.py +31 -7
  262. phoenix/session/evaluation.py +3 -9
  263. phoenix/session/session.py +263 -219
  264. phoenix/settings.py +22 -0
  265. phoenix/trace/__init__.py +2 -22
  266. phoenix/trace/attributes.py +338 -0
  267. phoenix/trace/dsl/README.md +116 -0
  268. phoenix/trace/dsl/filter.py +663 -213
  269. phoenix/trace/dsl/helpers.py +73 -21
  270. phoenix/trace/dsl/query.py +574 -201
  271. phoenix/trace/exporter.py +24 -19
  272. phoenix/trace/fixtures.py +368 -32
  273. phoenix/trace/otel.py +71 -219
  274. phoenix/trace/projects.py +3 -2
  275. phoenix/trace/schemas.py +33 -11
  276. phoenix/trace/span_evaluations.py +21 -16
  277. phoenix/trace/span_json_decoder.py +6 -4
  278. phoenix/trace/span_json_encoder.py +2 -2
  279. phoenix/trace/trace_dataset.py +47 -32
  280. phoenix/trace/utils.py +21 -4
  281. phoenix/utilities/__init__.py +0 -26
  282. phoenix/utilities/client.py +132 -0
  283. phoenix/utilities/deprecation.py +31 -0
  284. phoenix/utilities/error_handling.py +3 -2
  285. phoenix/utilities/json.py +109 -0
  286. phoenix/utilities/logging.py +8 -0
  287. phoenix/utilities/project.py +2 -2
  288. phoenix/utilities/re.py +49 -0
  289. phoenix/utilities/span_store.py +0 -23
  290. phoenix/utilities/template_formatters.py +99 -0
  291. phoenix/version.py +1 -1
  292. arize_phoenix-3.16.1.dist-info/METADATA +0 -495
  293. arize_phoenix-3.16.1.dist-info/RECORD +0 -178
  294. phoenix/core/project.py +0 -619
  295. phoenix/core/traces.py +0 -96
  296. phoenix/experimental/evals/__init__.py +0 -73
  297. phoenix/experimental/evals/evaluators.py +0 -413
  298. phoenix/experimental/evals/functions/__init__.py +0 -4
  299. phoenix/experimental/evals/functions/classify.py +0 -453
  300. phoenix/experimental/evals/functions/executor.py +0 -353
  301. phoenix/experimental/evals/functions/generate.py +0 -138
  302. phoenix/experimental/evals/functions/processing.py +0 -76
  303. phoenix/experimental/evals/models/__init__.py +0 -14
  304. phoenix/experimental/evals/models/anthropic.py +0 -175
  305. phoenix/experimental/evals/models/base.py +0 -170
  306. phoenix/experimental/evals/models/bedrock.py +0 -221
  307. phoenix/experimental/evals/models/litellm.py +0 -134
  308. phoenix/experimental/evals/models/openai.py +0 -448
  309. phoenix/experimental/evals/models/rate_limiters.py +0 -246
  310. phoenix/experimental/evals/models/vertex.py +0 -173
  311. phoenix/experimental/evals/models/vertexai.py +0 -186
  312. phoenix/experimental/evals/retrievals.py +0 -96
  313. phoenix/experimental/evals/templates/__init__.py +0 -50
  314. phoenix/experimental/evals/templates/default_templates.py +0 -472
  315. phoenix/experimental/evals/templates/template.py +0 -195
  316. phoenix/experimental/evals/utils/__init__.py +0 -172
  317. phoenix/experimental/evals/utils/threads.py +0 -27
  318. phoenix/server/api/helpers.py +0 -11
  319. phoenix/server/api/routers/evaluation_handler.py +0 -109
  320. phoenix/server/api/routers/span_handler.py +0 -70
  321. phoenix/server/api/routers/trace_handler.py +0 -60
  322. phoenix/server/api/types/DatasetRole.py +0 -23
  323. phoenix/server/static/index.css +0 -6
  324. phoenix/server/static/index.js +0 -7447
  325. phoenix/storage/span_store/__init__.py +0 -23
  326. phoenix/storage/span_store/text_file.py +0 -85
  327. phoenix/trace/dsl/missing.py +0 -60
  328. phoenix/trace/langchain/__init__.py +0 -3
  329. phoenix/trace/langchain/instrumentor.py +0 -35
  330. phoenix/trace/llama_index/__init__.py +0 -3
  331. phoenix/trace/llama_index/callback.py +0 -102
  332. phoenix/trace/openai/__init__.py +0 -3
  333. phoenix/trace/openai/instrumentor.py +0 -30
  334. {arize_phoenix-3.16.1.dist-info → arize_phoenix-7.7.0.dist-info}/licenses/IP_NOTICE +0 -0
  335. {arize_phoenix-3.16.1.dist-info → arize_phoenix-7.7.0.dist-info}/licenses/LICENSE +0 -0
  336. /phoenix/{datasets → db/insertion}/__init__.py +0 -0
  337. /phoenix/{experimental → db/migrations}/__init__.py +0 -0
  338. /phoenix/{storage → db/migrations/data_migration_scripts}/__init__.py +0 -0
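The single code diff reproduced on this page follows: it is the deletion of phoenix/experimental/evals/functions/classify.py (entry 299 above, -453 lines). The phoenix.experimental.evals module was dropped from the main wheel in this release range; its evaluation entry points (llm_classify, run_evals, and the already-deprecated run_relevance_eval) moved to the separately published arize-phoenix-evals package. A migration sketch follows the diff.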
@@ -1,453 +0,0 @@
- from __future__ import annotations
-
- import logging
- import warnings
- from collections import defaultdict
- from itertools import product
- from typing import (
-     Any,
-     DefaultDict,
-     Dict,
-     Iterable,
-     List,
-     Mapping,
-     NamedTuple,
-     Optional,
-     Tuple,
-     Union,
-     cast,
- )
-
- import pandas as pd
- from openinference.semconv.trace import DocumentAttributes, SpanAttributes
- from pandas import DataFrame
- from typing_extensions import TypeAlias
-
- from phoenix.experimental.evals.evaluators import LLMEvaluator
- from phoenix.experimental.evals.functions.executor import get_executor_on_sync_context
- from phoenix.experimental.evals.models import BaseEvalModel, OpenAIModel, set_verbosity
- from phoenix.experimental.evals.templates import (
-     RAG_RELEVANCY_PROMPT_RAILS_MAP,
-     RAG_RELEVANCY_PROMPT_TEMPLATE,
-     ClassificationTemplate,
-     PromptOptions,
-     PromptTemplate,
-     map_template,
-     normalize_classification_template,
- )
- from phoenix.experimental.evals.utils import (
-     NOT_PARSABLE,
-     get_tqdm_progress_bar_formatter,
-     openai_function_call_kwargs,
-     parse_openai_function_call,
-     snap_to_rail,
- )
- from phoenix.utilities.logging import printif
-
- DOCUMENT_CONTENT = DocumentAttributes.DOCUMENT_CONTENT
- INPUT_VALUE = SpanAttributes.INPUT_VALUE
- RETRIEVAL_DOCUMENTS = SpanAttributes.RETRIEVAL_DOCUMENTS
-
- logger = logging.getLogger(__name__)
-
-
- OPENINFERENCE_QUERY_COLUMN_NAME = "attributes." + INPUT_VALUE
- OPENINFERENCE_DOCUMENT_COLUMN_NAME = "attributes." + RETRIEVAL_DOCUMENTS
-
- ColumnName: TypeAlias = str
- Label: TypeAlias = str
- Score: TypeAlias = Optional[float]
- Explanation: TypeAlias = Optional[str]
- Record: TypeAlias = Mapping[str, Any]
- Index: TypeAlias = int
-
- # snapped_response, explanation, response
- ParsedLLMResponse: TypeAlias = Tuple[Optional[str], Optional[str], str]
-
-
- def llm_classify(
-     dataframe: pd.DataFrame,
-     model: BaseEvalModel,
-     template: Union[ClassificationTemplate, PromptTemplate, str],
-     rails: List[str],
-     system_instruction: Optional[str] = None,
-     verbose: bool = False,
-     use_function_calling_if_available: bool = True,
-     provide_explanation: bool = False,
-     include_prompt: bool = False,
-     include_response: bool = False,
-     run_sync: bool = False,
-     concurrency: Optional[int] = None,
- ) -> pd.DataFrame:
-     """Classifies each input row of the dataframe using an LLM. Returns a pandas.DataFrame
-     where the first column is named `label` and contains the classification labels. An optional
-     column named `explanation` is added when `provide_explanation=True`.
-
-     Args:
-         dataframe (pandas.DataFrame): A pandas dataframe in which each row represents a record to be
-             classified. All template variable names must appear as column names in the dataframe (extra
-             columns unrelated to the template are permitted).
-
-         template (Union[ClassificationTemplate, PromptTemplate, str]): The prompt template as
-             either an instance of PromptTemplate, ClassificationTemplate or a string. If a string, the
-             variable names should be surrounded by curly braces so that a call to `.format` can be made
-             to substitute variable values.
-
-         model (BaseEvalModel): An LLM model class.
-
-         rails (List[str]): A list of strings representing the possible output classes of the model's
-             predictions.
-
-         system_instruction (Optional[str], optional): An optional system message.
-
-         verbose (bool, optional): If True, prints detailed info to stdout such as model invocation
-             parameters and details about retries and snapping to rails. Default False.
-
-         use_function_calling_if_available (bool, default=True): If True, use function calling
-             (if available) as a means to constrain the LLM outputs. With function calling, the LLM
-             is instructed to provide its response as a structured JSON object, which is easier
-             to parse.
-
-         provide_explanation (bool, default=False): If True, provides an explanation for each
-             classification label. A column named `explanation` is added to the output dataframe.
-
-         include_prompt (bool, default=False): If True, includes a column named `prompt` in the
-             output dataframe containing the prompt used for each classification.
-
-         include_response (bool, default=False): If True, includes a column named `response` in the
-             output dataframe containing the raw response from the LLM.
-
-         run_sync (bool, default=False): If True, forces synchronous request submission. Otherwise
-             evaluations will be run asynchronously if possible.
-
-         concurrency (Optional[int], default=None): The number of concurrent evals if async
-             submission is possible. If not provided, a recommended default concurrency is set on a
-             per-model basis.
-
-     Returns:
-         pandas.DataFrame: A dataframe where the `label` column (at column position 0) contains
-             the classification labels. If provide_explanation=True, then an additional column named
-             `explanation` is added to contain the explanation for each label. The dataframe has
-             the same length and index as the input dataframe. The classification label values are
-             from the entries in the rails argument or "NOT_PARSABLE" if the model's output could
-             not be parsed.
-     """
-     concurrency = concurrency or model.default_concurrency
-     # clients need to be reloaded to ensure that async evals work properly
-     model.reload_client()
-
-     tqdm_bar_format = get_tqdm_progress_bar_formatter("llm_classify")
-     use_openai_function_call = (
-         use_function_calling_if_available
-         and isinstance(model, OpenAIModel)
-         and model.supports_function_calling
-     )
-
-     model_kwargs = (
-         openai_function_call_kwargs(rails, provide_explanation) if use_openai_function_call else {}
-     )
-
-     eval_template = normalize_classification_template(rails=rails, template=template)
-
-     prompt_options = PromptOptions(provide_explanation=provide_explanation)
-     prompts = map_template(dataframe, eval_template, options=prompt_options)
-
-     labels: List[Optional[str]] = [None] * len(dataframe)
-     explanations: List[Optional[str]] = [None] * len(dataframe)
-
-     printif(verbose, f"Using prompt:\n\n{eval_template.prompt(prompt_options)}")
-     if generation_info := model.verbose_generation_info():
-         printif(verbose, generation_info)
-
-     def _process_response(response: str) -> Tuple[str, Optional[str]]:
-         if not use_openai_function_call:
-             if provide_explanation:
-                 unrailed_label, explanation = (
-                     eval_template.extract_label_from_explanation(response),
-                     response,
-                 )
-                 printif(
-                     verbose and unrailed_label == NOT_PARSABLE,
-                     f"- Could not parse {repr(response)}",
-                 )
-             else:
-                 unrailed_label = response
-                 explanation = None
-         else:
-             unrailed_label, explanation = parse_openai_function_call(response)
-         return snap_to_rail(unrailed_label, rails, verbose=verbose), explanation
-
-     async def _run_llm_classification_async(prompt: str) -> ParsedLLMResponse:
-         with set_verbosity(model, verbose) as verbose_model:
-             response = await verbose_model._async_generate(
-                 prompt, instruction=system_instruction, **model_kwargs
-             )
-         inference, explanation = _process_response(response)
-         return inference, explanation, response
-
-     def _run_llm_classification_sync(prompt: str) -> ParsedLLMResponse:
-         with set_verbosity(model, verbose) as verbose_model:
-             response = verbose_model._generate(
-                 prompt, instruction=system_instruction, **model_kwargs
-             )
-         inference, explanation = _process_response(response)
-         return inference, explanation, response
-
-     fallback_return_value: ParsedLLMResponse = (None, None, "")
-
-     executor = get_executor_on_sync_context(
-         _run_llm_classification_sync,
-         _run_llm_classification_async,
-         run_sync=run_sync,
-         concurrency=concurrency,
-         tqdm_bar_format=tqdm_bar_format,
-         exit_on_error=True,
-         fallback_return_value=fallback_return_value,
-     )
-
-     results = executor.run(prompts.tolist())
-     labels, explanations, responses = zip(*results)
-
-     return pd.DataFrame(
-         data={
-             "label": labels,
-             **({"explanation": explanations} if provide_explanation else {}),
-             **({"prompt": prompts} if include_prompt else {}),
-             **({"response": responses} if include_response else {}),
-         },
-         index=dataframe.index,
-     )
-
-
- def run_relevance_eval(
-     dataframe: pd.DataFrame,
-     model: BaseEvalModel,
-     template: Union[ClassificationTemplate, str] = RAG_RELEVANCY_PROMPT_TEMPLATE,
-     rails: List[str] = list(RAG_RELEVANCY_PROMPT_RAILS_MAP.values()),
-     system_instruction: Optional[str] = None,
-     query_column_name: str = "input",
-     document_column_name: str = "reference",
-     verbose: bool = False,
- ) -> List[List[str]]:
-     """
-     Given a pandas dataframe containing queries and retrieved documents, classifies the relevance of
-     each retrieved document to the corresponding query using an LLM.
-
-     Args:
-         dataframe (pd.DataFrame): A pandas dataframe containing queries and retrieved documents. If
-             both query_column_name and reference_column_name are present in the input dataframe, those
-             columns are used as inputs and should appear in the following format:
-
-             - The entries of the query column must be strings.
-             - The entries of the documents column must be lists of strings. Each list may contain an
-               arbitrary number of document texts retrieved for the corresponding query.
-
-             If the input dataframe is lacking either query_column_name or reference_column_name but has
-             query and retrieved document columns in OpenInference trace format named
-             "attributes.input.value" and "attributes.retrieval.documents", respectively, then those
-             columns are used as inputs and should appear in the following format:
-
-             - The entries of the query column must be strings.
-             - The entries of the document column must be lists of OpenInference document objects, each
-               object being a dictionary that stores the document text under the key "document.content".
-
-             This latter format is intended for running evaluations on exported OpenInference trace
-             dataframes. For more information on the OpenInference tracing specification, see
-             https://github.com/Arize-ai/openinference/.
-
-         model (BaseEvalModel): The model used for evaluation.
-
-         template (Union[PromptTemplate, str], optional): The template used for evaluation.
-
-         rails (List[str], optional): A list of strings representing the possible output classes of
-             the model's predictions.
-
-         query_column_name (str, optional): The name of the query column in the dataframe, which
-             should also be a template variable.
-
-         reference_column_name (str, optional): The name of the document column in the dataframe,
-             which should also be a template variable.
-
-         system_instruction (Optional[str], optional): An optional system message.
-
-         verbose (bool, optional): If True, prints detailed information to stdout such as model
-             invocation parameters and retry info. Default False.
-
-     Returns:
-         List[List[str]]: A list of relevant and not relevant classifications. The "shape" of the
-             list should mirror the "shape" of the retrieved documents column, in the sense that it has
-             the same length as the input dataframe and each sub-list has the same length as the
-             corresponding list in the retrieved documents column. The values in the sub-lists are either
-             entries from the rails argument or "NOT_PARSABLE" in the case where the LLM output could not
-             be parsed.
-     """
-
-     warnings.warn(
-         "run_relevance_eval will soon be deprecated. "
-         "Use run_evals with HallucinationEvaluator instead.",
-         DeprecationWarning,
-     )
-
-     with set_verbosity(model, verbose) as verbose_model:
-         query_column = dataframe.get(query_column_name)
-         document_column = dataframe.get(document_column_name)
-         if query_column is None or document_column is None:
-             openinference_query_column = dataframe.get(OPENINFERENCE_QUERY_COLUMN_NAME)
-             openinference_document_column = dataframe.get(OPENINFERENCE_DOCUMENT_COLUMN_NAME)
-             if openinference_query_column is None or openinference_document_column is None:
-                 raise ValueError(
-                     f'Dataframe columns must include either "{query_column_name}" and '
-                     f'"{document_column_name}", or "{OPENINFERENCE_QUERY_COLUMN_NAME}" and '
-                     f'"{OPENINFERENCE_DOCUMENT_COLUMN_NAME}".'
-                 )
-             query_column = openinference_query_column
-             document_column = openinference_document_column.map(
-                 lambda docs: _get_contents_from_openinference_documents(docs)
-                 if docs is not None
-                 else None
-             )
-
-         queries = cast("pd.Series[str]", query_column).tolist()
-         document_lists = cast("pd.Series[str]", document_column).tolist()
-         indexes = []
-         expanded_queries = []
-         expanded_documents = []
-         for index, (query, documents) in enumerate(zip(queries, document_lists)):
-             if query is None or documents is None:
-                 continue
-             for document in documents:
-                 indexes.append(index)
-                 expanded_queries.append(query)
-                 expanded_documents.append(document)
-         predictions = llm_classify(
-             dataframe=pd.DataFrame(
-                 {
-                     query_column_name: expanded_queries,
-                     document_column_name: expanded_documents,
-                 }
-             ),
-             model=verbose_model,
-             template=template,
-             rails=rails,
-             system_instruction=system_instruction,
-             verbose=verbose,
-         ).iloc[:, 0]
-         outputs: List[List[str]] = [[] for _ in range(len(dataframe))]
-         for index, prediction in zip(indexes, predictions):
-             outputs[index].append(prediction)
-         return outputs
-
-
- def _get_contents_from_openinference_documents(documents: Iterable[Any]) -> List[Optional[str]]:
-     """
-     Get document contents from an iterable of OpenInference document objects, which are dictionaries
-     containing the document text under the "document.content" key.
-     """
-     return [doc.get(DOCUMENT_CONTENT) if isinstance(doc, dict) else None for doc in documents]
-
-
- class RunEvalsPayload(NamedTuple):
-     evaluator: LLMEvaluator
-     record: Record
-
-
- def run_evals(
-     dataframe: DataFrame,
-     evaluators: List[LLMEvaluator],
-     provide_explanation: bool = False,
-     use_function_calling_if_available: bool = True,
-     verbose: bool = False,
-     concurrency: Optional[int] = None,
- ) -> List[DataFrame]:
-     """
-     Applies a list of evaluators to a dataframe. Outputs a list of dataframes in
-     which each dataframe contains the outputs of the corresponding evaluator
-     applied to the input dataframe.
-
-     Args:
-         dataframe (DataFrame): A pandas dataframe in which each row represents a
-             record to be evaluated. All template variable names must appear as
-             column names in the dataframe (extra columns unrelated to the template
-             are permitted).
-
-         evaluators (List[LLMEvaluator]): A list of evaluators.
-
-         provide_explanation (bool, optional): If True, provides an explanation
-             for each evaluation. A column named "explanation" is added to each
-             output dataframe.
-
-         use_function_calling_if_available (bool, optional): If True, use
-             function calling (if available) as a means to constrain the LLM outputs.
-             With function calling, the LLM is instructed to provide its response as
-             a structured JSON object, which is easier to parse.
-
-         verbose (bool, optional): If True, prints detailed info to stdout such
-             as model invocation parameters and details about retries and snapping to
-             rails.
-
-         concurrency (Optional[int], default=None): The number of concurrent evals if async
-             submission is possible. If not provided, a recommended default concurrency is set on a
-             per-model basis.
-
-     Returns:
-         List[DataFrame]: A list of dataframes, one for each evaluator, all of
-             which have the same number of rows as the input dataframe.
-     """
-     # use the minimum default concurrency of all the models
-     if concurrency is None:
-         if len(evaluators) == 0:
-             concurrency = 1
-         else:
-             concurrency = min(evaluator.default_concurrency for evaluator in evaluators)
-
-     # clients need to be reloaded to ensure that async evals work properly
-     for evaluator in evaluators:
-         evaluator.reload_client()
-
-     async def _arun_eval(
-         payload: RunEvalsPayload,
-     ) -> Tuple[Label, Score, Explanation]:
-         return await payload.evaluator.aevaluate(
-             payload.record,
-             provide_explanation=provide_explanation,
-             use_function_calling_if_available=use_function_calling_if_available,
-         )
-
-     def _run_eval(
-         payload: RunEvalsPayload,
-     ) -> Tuple[Label, Score, Explanation]:
-         return payload.evaluator.evaluate(
-             payload.record,
-             provide_explanation=provide_explanation,
-             use_function_calling_if_available=use_function_calling_if_available,
-         )
-
-     executor = get_executor_on_sync_context(
-         _run_eval,
-         _arun_eval,
-         concurrency=concurrency,
-         tqdm_bar_format=get_tqdm_progress_bar_formatter("run_evals"),
-         exit_on_error=True,
-         fallback_return_value=(None, None, None),
-     )
-
-     total_records = len(dataframe)
-     payloads = [
-         RunEvalsPayload(evaluator=evaluator, record=row)
-         for evaluator, (_, row) in product(evaluators, dataframe.iterrows())
-     ]
-     eval_results: List[DefaultDict[Index, Dict[ColumnName, Union[Label, Explanation]]]] = [
-         defaultdict(dict) for _ in range(len(evaluators))
-     ]
-     for index, (label, score, explanation) in enumerate(executor.run(payloads)):
-         evaluator_index = index // total_records
-         row_index = index % total_records
-         eval_results[evaluator_index][row_index]["label"] = label
-         eval_results[evaluator_index][row_index]["score"] = score
-         if provide_explanation:
-             eval_results[evaluator_index][row_index]["explanation"] = explanation
-     eval_dataframes: List[DataFrame] = []
-     for eval_result in eval_results:
-         eval_data = [eval_result[row_index] for row_index in range(len(eval_result))]
-         eval_dataframes.append(DataFrame(eval_data, index=dataframe.index))
-     return eval_dataframes
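
The deletion above removes the public entry points llm_classify, run_relevance_eval, and run_evals from the main wheel. For code that imported them, a minimal migration sketch follows. It assumes the replacement API in the separately published arize-phoenix-evals package (module phoenix.evals), which kept the llm_classify signature documented in the deleted file; the model name, template text, and dataframe below are illustrative, not taken from this diff.

# Hypothetical migration sketch; assumes `pip install arize-phoenix-evals openai`
# and an OPENAI_API_KEY in the environment.
import pandas as pd

from phoenix.evals import OpenAIModel, llm_classify  # was: from phoenix.experimental.evals import ...

# One row per record to classify; column names must match the template variables.
df = pd.DataFrame(
    {
        "input": ["What is Arize Phoenix?"],
        "reference": ["Phoenix is an open-source LLM observability library."],
    }
)

result = llm_classify(
    dataframe=df,
    model=OpenAIModel(model="gpt-4o-mini"),  # assumed model name; swap as needed
    template=(
        "Is the reference relevant to the input? "
        "Answer with a single word: 'relevant' or 'unrelated'.\n\n"
        "Input: {input}\nReference: {reference}"
    ),
    rails=["relevant", "unrelated"],  # constrains the label column, as documented above
    provide_explanation=True,  # adds an explanation column
)
print(result[["label", "explanation"]])

run_evals moved the same way. run_relevance_eval was already emitting a DeprecationWarning in the deleted code, which points to run_evals with an evaluator as its replacement.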