arize-phoenix 3.16.1__py3-none-any.whl → 7.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of arize-phoenix has been flagged as potentially problematic.

Files changed (338)
  1. arize_phoenix-7.7.0.dist-info/METADATA +261 -0
  2. arize_phoenix-7.7.0.dist-info/RECORD +345 -0
  3. {arize_phoenix-3.16.1.dist-info → arize_phoenix-7.7.0.dist-info}/WHEEL +1 -1
  4. arize_phoenix-7.7.0.dist-info/entry_points.txt +3 -0
  5. phoenix/__init__.py +86 -14
  6. phoenix/auth.py +309 -0
  7. phoenix/config.py +675 -45
  8. phoenix/core/model.py +32 -30
  9. phoenix/core/model_schema.py +102 -109
  10. phoenix/core/model_schema_adapter.py +48 -45
  11. phoenix/datetime_utils.py +24 -3
  12. phoenix/db/README.md +54 -0
  13. phoenix/db/__init__.py +4 -0
  14. phoenix/db/alembic.ini +85 -0
  15. phoenix/db/bulk_inserter.py +294 -0
  16. phoenix/db/engines.py +208 -0
  17. phoenix/db/enums.py +20 -0
  18. phoenix/db/facilitator.py +113 -0
  19. phoenix/db/helpers.py +159 -0
  20. phoenix/db/insertion/constants.py +2 -0
  21. phoenix/db/insertion/dataset.py +227 -0
  22. phoenix/db/insertion/document_annotation.py +171 -0
  23. phoenix/db/insertion/evaluation.py +191 -0
  24. phoenix/db/insertion/helpers.py +98 -0
  25. phoenix/db/insertion/span.py +193 -0
  26. phoenix/db/insertion/span_annotation.py +158 -0
  27. phoenix/db/insertion/trace_annotation.py +158 -0
  28. phoenix/db/insertion/types.py +256 -0
  29. phoenix/db/migrate.py +86 -0
  30. phoenix/db/migrations/data_migration_scripts/populate_project_sessions.py +199 -0
  31. phoenix/db/migrations/env.py +114 -0
  32. phoenix/db/migrations/script.py.mako +26 -0
  33. phoenix/db/migrations/versions/10460e46d750_datasets.py +317 -0
  34. phoenix/db/migrations/versions/3be8647b87d8_add_token_columns_to_spans_table.py +126 -0
  35. phoenix/db/migrations/versions/4ded9e43755f_create_project_sessions_table.py +66 -0
  36. phoenix/db/migrations/versions/cd164e83824f_users_and_tokens.py +157 -0
  37. phoenix/db/migrations/versions/cf03bd6bae1d_init.py +280 -0
  38. phoenix/db/models.py +807 -0
  39. phoenix/exceptions.py +5 -1
  40. phoenix/experiments/__init__.py +6 -0
  41. phoenix/experiments/evaluators/__init__.py +29 -0
  42. phoenix/experiments/evaluators/base.py +158 -0
  43. phoenix/experiments/evaluators/code_evaluators.py +184 -0
  44. phoenix/experiments/evaluators/llm_evaluators.py +473 -0
  45. phoenix/experiments/evaluators/utils.py +236 -0
  46. phoenix/experiments/functions.py +772 -0
  47. phoenix/experiments/tracing.py +86 -0
  48. phoenix/experiments/types.py +726 -0
  49. phoenix/experiments/utils.py +25 -0
  50. phoenix/inferences/__init__.py +0 -0
  51. phoenix/{datasets → inferences}/errors.py +6 -5
  52. phoenix/{datasets → inferences}/fixtures.py +49 -42
  53. phoenix/{datasets/dataset.py → inferences/inferences.py} +121 -105
  54. phoenix/{datasets → inferences}/schema.py +11 -11
  55. phoenix/{datasets → inferences}/validation.py +13 -14
  56. phoenix/logging/__init__.py +3 -0
  57. phoenix/logging/_config.py +90 -0
  58. phoenix/logging/_filter.py +6 -0
  59. phoenix/logging/_formatter.py +69 -0
  60. phoenix/metrics/__init__.py +5 -4
  61. phoenix/metrics/binning.py +4 -3
  62. phoenix/metrics/metrics.py +2 -1
  63. phoenix/metrics/mixins.py +7 -6
  64. phoenix/metrics/retrieval_metrics.py +2 -1
  65. phoenix/metrics/timeseries.py +5 -4
  66. phoenix/metrics/wrappers.py +9 -3
  67. phoenix/pointcloud/clustering.py +5 -5
  68. phoenix/pointcloud/pointcloud.py +7 -5
  69. phoenix/pointcloud/projectors.py +5 -6
  70. phoenix/pointcloud/umap_parameters.py +53 -52
  71. phoenix/server/api/README.md +28 -0
  72. phoenix/server/api/auth.py +44 -0
  73. phoenix/server/api/context.py +152 -9
  74. phoenix/server/api/dataloaders/__init__.py +91 -0
  75. phoenix/server/api/dataloaders/annotation_summaries.py +139 -0
  76. phoenix/server/api/dataloaders/average_experiment_run_latency.py +54 -0
  77. phoenix/server/api/dataloaders/cache/__init__.py +3 -0
  78. phoenix/server/api/dataloaders/cache/two_tier_cache.py +68 -0
  79. phoenix/server/api/dataloaders/dataset_example_revisions.py +131 -0
  80. phoenix/server/api/dataloaders/dataset_example_spans.py +38 -0
  81. phoenix/server/api/dataloaders/document_evaluation_summaries.py +144 -0
  82. phoenix/server/api/dataloaders/document_evaluations.py +31 -0
  83. phoenix/server/api/dataloaders/document_retrieval_metrics.py +89 -0
  84. phoenix/server/api/dataloaders/experiment_annotation_summaries.py +79 -0
  85. phoenix/server/api/dataloaders/experiment_error_rates.py +58 -0
  86. phoenix/server/api/dataloaders/experiment_run_annotations.py +36 -0
  87. phoenix/server/api/dataloaders/experiment_run_counts.py +49 -0
  88. phoenix/server/api/dataloaders/experiment_sequence_number.py +44 -0
  89. phoenix/server/api/dataloaders/latency_ms_quantile.py +188 -0
  90. phoenix/server/api/dataloaders/min_start_or_max_end_times.py +85 -0
  91. phoenix/server/api/dataloaders/project_by_name.py +31 -0
  92. phoenix/server/api/dataloaders/record_counts.py +116 -0
  93. phoenix/server/api/dataloaders/session_io.py +79 -0
  94. phoenix/server/api/dataloaders/session_num_traces.py +30 -0
  95. phoenix/server/api/dataloaders/session_num_traces_with_error.py +32 -0
  96. phoenix/server/api/dataloaders/session_token_usages.py +41 -0
  97. phoenix/server/api/dataloaders/session_trace_latency_ms_quantile.py +55 -0
  98. phoenix/server/api/dataloaders/span_annotations.py +26 -0
  99. phoenix/server/api/dataloaders/span_dataset_examples.py +31 -0
  100. phoenix/server/api/dataloaders/span_descendants.py +57 -0
  101. phoenix/server/api/dataloaders/span_projects.py +33 -0
  102. phoenix/server/api/dataloaders/token_counts.py +124 -0
  103. phoenix/server/api/dataloaders/trace_by_trace_ids.py +25 -0
  104. phoenix/server/api/dataloaders/trace_root_spans.py +32 -0
  105. phoenix/server/api/dataloaders/user_roles.py +30 -0
  106. phoenix/server/api/dataloaders/users.py +33 -0
  107. phoenix/server/api/exceptions.py +48 -0
  108. phoenix/server/api/helpers/__init__.py +12 -0
  109. phoenix/server/api/helpers/dataset_helpers.py +217 -0
  110. phoenix/server/api/helpers/experiment_run_filters.py +763 -0
  111. phoenix/server/api/helpers/playground_clients.py +948 -0
  112. phoenix/server/api/helpers/playground_registry.py +70 -0
  113. phoenix/server/api/helpers/playground_spans.py +455 -0
  114. phoenix/server/api/input_types/AddExamplesToDatasetInput.py +16 -0
  115. phoenix/server/api/input_types/AddSpansToDatasetInput.py +14 -0
  116. phoenix/server/api/input_types/ChatCompletionInput.py +38 -0
  117. phoenix/server/api/input_types/ChatCompletionMessageInput.py +24 -0
  118. phoenix/server/api/input_types/ClearProjectInput.py +15 -0
  119. phoenix/server/api/input_types/ClusterInput.py +2 -2
  120. phoenix/server/api/input_types/CreateDatasetInput.py +12 -0
  121. phoenix/server/api/input_types/CreateSpanAnnotationInput.py +18 -0
  122. phoenix/server/api/input_types/CreateTraceAnnotationInput.py +18 -0
  123. phoenix/server/api/input_types/DataQualityMetricInput.py +5 -2
  124. phoenix/server/api/input_types/DatasetExampleInput.py +14 -0
  125. phoenix/server/api/input_types/DatasetSort.py +17 -0
  126. phoenix/server/api/input_types/DatasetVersionSort.py +16 -0
  127. phoenix/server/api/input_types/DeleteAnnotationsInput.py +7 -0
  128. phoenix/server/api/input_types/DeleteDatasetExamplesInput.py +13 -0
  129. phoenix/server/api/input_types/DeleteDatasetInput.py +7 -0
  130. phoenix/server/api/input_types/DeleteExperimentsInput.py +7 -0
  131. phoenix/server/api/input_types/DimensionFilter.py +4 -4
  132. phoenix/server/api/input_types/GenerativeModelInput.py +17 -0
  133. phoenix/server/api/input_types/Granularity.py +1 -1
  134. phoenix/server/api/input_types/InvocationParameters.py +162 -0
  135. phoenix/server/api/input_types/PatchAnnotationInput.py +19 -0
  136. phoenix/server/api/input_types/PatchDatasetExamplesInput.py +35 -0
  137. phoenix/server/api/input_types/PatchDatasetInput.py +14 -0
  138. phoenix/server/api/input_types/PerformanceMetricInput.py +5 -2
  139. phoenix/server/api/input_types/ProjectSessionSort.py +29 -0
  140. phoenix/server/api/input_types/SpanAnnotationSort.py +17 -0
  141. phoenix/server/api/input_types/SpanSort.py +134 -69
  142. phoenix/server/api/input_types/TemplateOptions.py +10 -0
  143. phoenix/server/api/input_types/TraceAnnotationSort.py +17 -0
  144. phoenix/server/api/input_types/UserRoleInput.py +9 -0
  145. phoenix/server/api/mutations/__init__.py +28 -0
  146. phoenix/server/api/mutations/api_key_mutations.py +167 -0
  147. phoenix/server/api/mutations/chat_mutations.py +593 -0
  148. phoenix/server/api/mutations/dataset_mutations.py +591 -0
  149. phoenix/server/api/mutations/experiment_mutations.py +75 -0
  150. phoenix/server/api/{types/ExportEventsMutation.py → mutations/export_events_mutations.py} +21 -18
  151. phoenix/server/api/mutations/project_mutations.py +57 -0
  152. phoenix/server/api/mutations/span_annotations_mutations.py +128 -0
  153. phoenix/server/api/mutations/trace_annotations_mutations.py +127 -0
  154. phoenix/server/api/mutations/user_mutations.py +329 -0
  155. phoenix/server/api/openapi/__init__.py +0 -0
  156. phoenix/server/api/openapi/main.py +17 -0
  157. phoenix/server/api/openapi/schema.py +16 -0
  158. phoenix/server/api/queries.py +738 -0
  159. phoenix/server/api/routers/__init__.py +11 -0
  160. phoenix/server/api/routers/auth.py +284 -0
  161. phoenix/server/api/routers/embeddings.py +26 -0
  162. phoenix/server/api/routers/oauth2.py +488 -0
  163. phoenix/server/api/routers/v1/__init__.py +64 -0
  164. phoenix/server/api/routers/v1/datasets.py +1017 -0
  165. phoenix/server/api/routers/v1/evaluations.py +362 -0
  166. phoenix/server/api/routers/v1/experiment_evaluations.py +115 -0
  167. phoenix/server/api/routers/v1/experiment_runs.py +167 -0
  168. phoenix/server/api/routers/v1/experiments.py +308 -0
  169. phoenix/server/api/routers/v1/pydantic_compat.py +78 -0
  170. phoenix/server/api/routers/v1/spans.py +267 -0
  171. phoenix/server/api/routers/v1/traces.py +208 -0
  172. phoenix/server/api/routers/v1/utils.py +95 -0
  173. phoenix/server/api/schema.py +44 -241
  174. phoenix/server/api/subscriptions.py +597 -0
  175. phoenix/server/api/types/Annotation.py +21 -0
  176. phoenix/server/api/types/AnnotationSummary.py +55 -0
  177. phoenix/server/api/types/AnnotatorKind.py +16 -0
  178. phoenix/server/api/types/ApiKey.py +27 -0
  179. phoenix/server/api/types/AuthMethod.py +9 -0
  180. phoenix/server/api/types/ChatCompletionMessageRole.py +11 -0
  181. phoenix/server/api/types/ChatCompletionSubscriptionPayload.py +46 -0
  182. phoenix/server/api/types/Cluster.py +25 -24
  183. phoenix/server/api/types/CreateDatasetPayload.py +8 -0
  184. phoenix/server/api/types/DataQualityMetric.py +31 -13
  185. phoenix/server/api/types/Dataset.py +288 -63
  186. phoenix/server/api/types/DatasetExample.py +85 -0
  187. phoenix/server/api/types/DatasetExampleRevision.py +34 -0
  188. phoenix/server/api/types/DatasetVersion.py +14 -0
  189. phoenix/server/api/types/Dimension.py +32 -31
  190. phoenix/server/api/types/DocumentEvaluationSummary.py +9 -8
  191. phoenix/server/api/types/EmbeddingDimension.py +56 -49
  192. phoenix/server/api/types/Evaluation.py +25 -31
  193. phoenix/server/api/types/EvaluationSummary.py +30 -50
  194. phoenix/server/api/types/Event.py +20 -20
  195. phoenix/server/api/types/ExampleRevisionInterface.py +14 -0
  196. phoenix/server/api/types/Experiment.py +152 -0
  197. phoenix/server/api/types/ExperimentAnnotationSummary.py +13 -0
  198. phoenix/server/api/types/ExperimentComparison.py +17 -0
  199. phoenix/server/api/types/ExperimentRun.py +119 -0
  200. phoenix/server/api/types/ExperimentRunAnnotation.py +56 -0
  201. phoenix/server/api/types/GenerativeModel.py +9 -0
  202. phoenix/server/api/types/GenerativeProvider.py +85 -0
  203. phoenix/server/api/types/Inferences.py +80 -0
  204. phoenix/server/api/types/InferencesRole.py +23 -0
  205. phoenix/server/api/types/LabelFraction.py +7 -0
  206. phoenix/server/api/types/MimeType.py +2 -2
  207. phoenix/server/api/types/Model.py +54 -54
  208. phoenix/server/api/types/PerformanceMetric.py +8 -5
  209. phoenix/server/api/types/Project.py +407 -142
  210. phoenix/server/api/types/ProjectSession.py +139 -0
  211. phoenix/server/api/types/Segments.py +4 -4
  212. phoenix/server/api/types/Span.py +221 -176
  213. phoenix/server/api/types/SpanAnnotation.py +43 -0
  214. phoenix/server/api/types/SpanIOValue.py +15 -0
  215. phoenix/server/api/types/SystemApiKey.py +9 -0
  216. phoenix/server/api/types/TemplateLanguage.py +10 -0
  217. phoenix/server/api/types/TimeSeries.py +19 -15
  218. phoenix/server/api/types/TokenUsage.py +11 -0
  219. phoenix/server/api/types/Trace.py +154 -0
  220. phoenix/server/api/types/TraceAnnotation.py +45 -0
  221. phoenix/server/api/types/UMAPPoints.py +7 -7
  222. phoenix/server/api/types/User.py +60 -0
  223. phoenix/server/api/types/UserApiKey.py +45 -0
  224. phoenix/server/api/types/UserRole.py +15 -0
  225. phoenix/server/api/types/node.py +4 -112
  226. phoenix/server/api/types/pagination.py +156 -57
  227. phoenix/server/api/utils.py +34 -0
  228. phoenix/server/app.py +864 -115
  229. phoenix/server/bearer_auth.py +163 -0
  230. phoenix/server/dml_event.py +136 -0
  231. phoenix/server/dml_event_handler.py +256 -0
  232. phoenix/server/email/__init__.py +0 -0
  233. phoenix/server/email/sender.py +97 -0
  234. phoenix/server/email/templates/__init__.py +0 -0
  235. phoenix/server/email/templates/password_reset.html +19 -0
  236. phoenix/server/email/types.py +11 -0
  237. phoenix/server/grpc_server.py +102 -0
  238. phoenix/server/jwt_store.py +505 -0
  239. phoenix/server/main.py +305 -116
  240. phoenix/server/oauth2.py +52 -0
  241. phoenix/server/openapi/__init__.py +0 -0
  242. phoenix/server/prometheus.py +111 -0
  243. phoenix/server/rate_limiters.py +188 -0
  244. phoenix/server/static/.vite/manifest.json +87 -0
  245. phoenix/server/static/assets/components-Cy9nwIvF.js +2125 -0
  246. phoenix/server/static/assets/index-BKvHIxkk.js +113 -0
  247. phoenix/server/static/assets/pages-CUi2xCVQ.js +4449 -0
  248. phoenix/server/static/assets/vendor-DvC8cT4X.js +894 -0
  249. phoenix/server/static/assets/vendor-DxkFTwjz.css +1 -0
  250. phoenix/server/static/assets/vendor-arizeai-Do1793cv.js +662 -0
  251. phoenix/server/static/assets/vendor-codemirror-BzwZPyJM.js +24 -0
  252. phoenix/server/static/assets/vendor-recharts-_Jb7JjhG.js +59 -0
  253. phoenix/server/static/assets/vendor-shiki-Cl9QBraO.js +5 -0
  254. phoenix/server/static/assets/vendor-three-DwGkEfCM.js +2998 -0
  255. phoenix/server/telemetry.py +68 -0
  256. phoenix/server/templates/index.html +82 -23
  257. phoenix/server/thread_server.py +3 -3
  258. phoenix/server/types.py +275 -0
  259. phoenix/services.py +27 -18
  260. phoenix/session/client.py +743 -68
  261. phoenix/session/data_extractor.py +31 -7
  262. phoenix/session/evaluation.py +3 -9
  263. phoenix/session/session.py +263 -219
  264. phoenix/settings.py +22 -0
  265. phoenix/trace/__init__.py +2 -22
  266. phoenix/trace/attributes.py +338 -0
  267. phoenix/trace/dsl/README.md +116 -0
  268. phoenix/trace/dsl/filter.py +663 -213
  269. phoenix/trace/dsl/helpers.py +73 -21
  270. phoenix/trace/dsl/query.py +574 -201
  271. phoenix/trace/exporter.py +24 -19
  272. phoenix/trace/fixtures.py +368 -32
  273. phoenix/trace/otel.py +71 -219
  274. phoenix/trace/projects.py +3 -2
  275. phoenix/trace/schemas.py +33 -11
  276. phoenix/trace/span_evaluations.py +21 -16
  277. phoenix/trace/span_json_decoder.py +6 -4
  278. phoenix/trace/span_json_encoder.py +2 -2
  279. phoenix/trace/trace_dataset.py +47 -32
  280. phoenix/trace/utils.py +21 -4
  281. phoenix/utilities/__init__.py +0 -26
  282. phoenix/utilities/client.py +132 -0
  283. phoenix/utilities/deprecation.py +31 -0
  284. phoenix/utilities/error_handling.py +3 -2
  285. phoenix/utilities/json.py +109 -0
  286. phoenix/utilities/logging.py +8 -0
  287. phoenix/utilities/project.py +2 -2
  288. phoenix/utilities/re.py +49 -0
  289. phoenix/utilities/span_store.py +0 -23
  290. phoenix/utilities/template_formatters.py +99 -0
  291. phoenix/version.py +1 -1
  292. arize_phoenix-3.16.1.dist-info/METADATA +0 -495
  293. arize_phoenix-3.16.1.dist-info/RECORD +0 -178
  294. phoenix/core/project.py +0 -619
  295. phoenix/core/traces.py +0 -96
  296. phoenix/experimental/evals/__init__.py +0 -73
  297. phoenix/experimental/evals/evaluators.py +0 -413
  298. phoenix/experimental/evals/functions/__init__.py +0 -4
  299. phoenix/experimental/evals/functions/classify.py +0 -453
  300. phoenix/experimental/evals/functions/executor.py +0 -353
  301. phoenix/experimental/evals/functions/generate.py +0 -138
  302. phoenix/experimental/evals/functions/processing.py +0 -76
  303. phoenix/experimental/evals/models/__init__.py +0 -14
  304. phoenix/experimental/evals/models/anthropic.py +0 -175
  305. phoenix/experimental/evals/models/base.py +0 -170
  306. phoenix/experimental/evals/models/bedrock.py +0 -221
  307. phoenix/experimental/evals/models/litellm.py +0 -134
  308. phoenix/experimental/evals/models/openai.py +0 -448
  309. phoenix/experimental/evals/models/rate_limiters.py +0 -246
  310. phoenix/experimental/evals/models/vertex.py +0 -173
  311. phoenix/experimental/evals/models/vertexai.py +0 -186
  312. phoenix/experimental/evals/retrievals.py +0 -96
  313. phoenix/experimental/evals/templates/__init__.py +0 -50
  314. phoenix/experimental/evals/templates/default_templates.py +0 -472
  315. phoenix/experimental/evals/templates/template.py +0 -195
  316. phoenix/experimental/evals/utils/__init__.py +0 -172
  317. phoenix/experimental/evals/utils/threads.py +0 -27
  318. phoenix/server/api/helpers.py +0 -11
  319. phoenix/server/api/routers/evaluation_handler.py +0 -109
  320. phoenix/server/api/routers/span_handler.py +0 -70
  321. phoenix/server/api/routers/trace_handler.py +0 -60
  322. phoenix/server/api/types/DatasetRole.py +0 -23
  323. phoenix/server/static/index.css +0 -6
  324. phoenix/server/static/index.js +0 -7447
  325. phoenix/storage/span_store/__init__.py +0 -23
  326. phoenix/storage/span_store/text_file.py +0 -85
  327. phoenix/trace/dsl/missing.py +0 -60
  328. phoenix/trace/langchain/__init__.py +0 -3
  329. phoenix/trace/langchain/instrumentor.py +0 -35
  330. phoenix/trace/llama_index/__init__.py +0 -3
  331. phoenix/trace/llama_index/callback.py +0 -102
  332. phoenix/trace/openai/__init__.py +0 -3
  333. phoenix/trace/openai/instrumentor.py +0 -30
  334. {arize_phoenix-3.16.1.dist-info → arize_phoenix-7.7.0.dist-info}/licenses/IP_NOTICE +0 -0
  335. {arize_phoenix-3.16.1.dist-info → arize_phoenix-7.7.0.dist-info}/licenses/LICENSE +0 -0
  336. /phoenix/{datasets → db/insertion}/__init__.py +0 -0
  337. /phoenix/{experimental → db/migrations}/__init__.py +0 -0
  338. /phoenix/{storage → db/migrations/data_migration_scripts}/__init__.py +0 -0
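
Items 51-55 above are a wholesale rename of the phoenix/datasets/ package to phoenix/inferences/, with dataset.py becoming inferences.py; for downstream code this is primarily an import-path change. Below is a minimal, hedged before/after sketch: the module paths come from the renames in this diff, while the imported class names (Dataset, Schema, Inferences) are assumptions based on the file names and are not spelled out in the diff itself.

# Hypothetical import migration for code written against arize-phoenix 3.x.
#
# 3.16.1:
#   from phoenix.datasets.dataset import Dataset
#   from phoenix.datasets.schema import Schema
#
# 7.7.0:
from phoenix.inferences.inferences import Inferences  # was phoenix/datasets/dataset.py
from phoenix.inferences.schema import Schema          # was phoenix/datasets/schema.py

The removal of the phoenix/experimental/evals/ package (items 296-317) is covered by the deletion warning further below.
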
phoenix/core/traces.py DELETED
@@ -1,96 +0,0 @@
- import weakref
- from collections import defaultdict
- from queue import SimpleQueue
- from threading import RLock, Thread
- from types import MethodType
- from typing import DefaultDict, Iterator, Optional, Tuple, Union
-
- from typing_extensions import assert_never
-
- import phoenix.trace.v1 as pb
- from phoenix.config import DEFAULT_PROJECT_NAME
- from phoenix.core.project import (
-     END_OF_QUEUE,
-     Project,
-     _ProjectName,
- )
- from phoenix.trace.schemas import Span
-
- _SpanItem = Tuple[Span, _ProjectName]
- _EvalItem = Tuple[pb.Evaluation, _ProjectName]
-
-
- class Traces:
-     def __init__(self) -> None:
-         self._span_queue: "SimpleQueue[Optional[_SpanItem]]" = SimpleQueue()
-         self._eval_queue: "SimpleQueue[Optional[_EvalItem]]" = SimpleQueue()
-         # Putting `None` as the sentinel value for queue termination.
-         weakref.finalize(self, self._span_queue.put, END_OF_QUEUE)
-         weakref.finalize(self, self._eval_queue.put, END_OF_QUEUE)
-         self._lock = RLock()
-         self._projects: DefaultDict[_ProjectName, "Project"] = defaultdict(
-             Project,
-             {DEFAULT_PROJECT_NAME: Project()},
-         )
-         self._start_consumers()
-
-     def get_project(self, project_name: str) -> Optional["Project"]:
-         with self._lock:
-             return self._projects.get(project_name)
-
-     def get_projects(self) -> Iterator[Tuple[int, str, "Project"]]:
-         with self._lock:
-             for project_id, (project_name, project) in enumerate(self._projects.items()):
-                 if project.is_archived:
-                     continue
-                 yield project_id, project_name, project
-
-     def archive_project(self, id: int) -> Optional["Project"]:
-         if id == 0:
-             raise ValueError("Cannot archive the default project")
-         with self._lock:
-             for project_id, _, project in self.get_projects():
-                 if id == project_id:
-                     project.archive()
-                     return project
-         return None
-
-     def put(
-         self,
-         item: Union[Span, pb.Evaluation],
-         project_name: Optional[str] = None,
-     ) -> None:
-         if not project_name:
-             project_name = DEFAULT_PROJECT_NAME
-         if isinstance(item, Span):
-             self._span_queue.put((item, project_name))
-         elif isinstance(item, pb.Evaluation):
-             self._eval_queue.put((item, project_name))
-         else:
-             assert_never(item)
-
-     def _start_consumers(self) -> None:
-         Thread(
-             target=MethodType(self.__class__._consume_spans, weakref.proxy(self)),
-             args=(self._span_queue,),
-             daemon=True,
-         ).start()
-         Thread(
-             target=MethodType(self.__class__._consume_evals, weakref.proxy(self)),
-             args=(self._eval_queue,),
-             daemon=True,
-         ).start()
-
-     def _consume_spans(self, queue: "SimpleQueue[Optional[_SpanItem]]") -> None:
-         while (item := queue.get()) is not END_OF_QUEUE:
-             span, project_name = item
-             with self._lock:
-                 project = self._projects[project_name]
-                 project.add_span(span)
-
-     def _consume_evals(self, queue: "SimpleQueue[Optional[_EvalItem]]") -> None:
-         while (item := queue.get()) is not END_OF_QUEUE:
-             pb_eval, project_name = item
-             with self._lock:
-                 project = self._projects[project_name]
-                 project.add_eval(pb_eval)
phoenix/experimental/evals/__init__.py DELETED
@@ -1,73 +0,0 @@
- import logging
-
- from .evaluators import (
-     HallucinationEvaluator,
-     LLMEvaluator,
-     QAEvaluator,
-     RelevanceEvaluator,
-     SummarizationEvaluator,
-     ToxicityEvaluator,
- )
- from .functions import llm_classify, llm_generate, run_evals, run_relevance_eval
- from .models import BedrockModel, LiteLLMModel, OpenAIModel, VertexAIModel
- from .retrievals import compute_precisions_at_k
- from .templates import (
-     CODE_READABILITY_PROMPT_RAILS_MAP,
-     CODE_READABILITY_PROMPT_TEMPLATE,
-     HALLUCINATION_PROMPT_RAILS_MAP,
-     HALLUCINATION_PROMPT_TEMPLATE,
-     HUMAN_VS_AI_PROMPT_RAILS_MAP,
-     HUMAN_VS_AI_PROMPT_TEMPLATE,
-     QA_PROMPT_RAILS_MAP,
-     QA_PROMPT_TEMPLATE,
-     RAG_RELEVANCY_PROMPT_RAILS_MAP,
-     RAG_RELEVANCY_PROMPT_TEMPLATE,
-     TOXICITY_PROMPT_RAILS_MAP,
-     TOXICITY_PROMPT_TEMPLATE,
-     ClassificationTemplate,
-     PromptTemplate,
- )
- from .utils import NOT_PARSABLE, download_benchmark_dataset
-
- logger = logging.getLogger(__name__)
-
- __all__ = [
-     "compute_precisions_at_k",
-     "download_benchmark_dataset",
-     "llm_classify",
-     "llm_generate",
-     "OpenAIModel",
-     "VertexAIModel",
-     "BedrockModel",
-     "LiteLLMModel",
-     "PromptTemplate",
-     "ClassificationTemplate",
-     "CODE_READABILITY_PROMPT_RAILS_MAP",
-     "CODE_READABILITY_PROMPT_TEMPLATE",
-     "HALLUCINATION_PROMPT_RAILS_MAP",
-     "HALLUCINATION_PROMPT_TEMPLATE",
-     "RAG_RELEVANCY_PROMPT_RAILS_MAP",
-     "RAG_RELEVANCY_PROMPT_TEMPLATE",
-     "TOXICITY_PROMPT_RAILS_MAP",
-     "TOXICITY_PROMPT_TEMPLATE",
-     "HUMAN_VS_AI_PROMPT_RAILS_MAP",
-     "HUMAN_VS_AI_PROMPT_TEMPLATE",
-     "QA_PROMPT_RAILS_MAP",
-     "QA_PROMPT_TEMPLATE",
-     "NOT_PARSABLE",
-     "run_relevance_eval",
-     "run_evals",
-     "LLMEvaluator",
-     "HallucinationEvaluator",
-     "QAEvaluator",
-     "RelevanceEvaluator",
-     "SummarizationEvaluator",
-     "ToxicityEvaluator",
- ]
-
-
- logger.warning(
-     "Evals are moving out of experimental. "
-     "Install the evals extra with `pip install arize-phoenix[evals]` and import `phoenix.evals`. "
-     "For more info, see the [migration guide](https://github.com/Arize-ai/phoenix/blob/main/MIGRATION.md)."
- )
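
The warning above is the migration path for the deleted phoenix.experimental.evals package: its evaluators, models, and templates moved to the separately installable phoenix.evals package. A short, hedged usage sketch of the new location follows; the symbol names are carried over from the deleted module, the new package is not part of this wheel, and the OpenAI model name shown is illustrative only.

# Assumes: pip install "arize-phoenix[evals]" and an OPENAI_API_KEY in the environment.
import pandas as pd

from phoenix.evals import (
    OpenAIModel,
    RAG_RELEVANCY_PROMPT_RAILS_MAP,
    RAG_RELEVANCY_PROMPT_TEMPLATE,
    llm_classify,
)

# The RAG relevancy template reads the "input" (query) and "reference"
# (retrieved document) columns of the dataframe.
df = pd.DataFrame(
    {
        "input": ["What is Arize Phoenix?"],
        "reference": ["Phoenix is an open-source AI observability library."],
    }
)

model = OpenAIModel(model="gpt-4o-mini")  # illustrative model choice
rails = list(RAG_RELEVANCY_PROMPT_RAILS_MAP.values())
relevance = llm_classify(
    dataframe=df,
    template=RAG_RELEVANCY_PROMPT_TEMPLATE,
    model=model,
    rails=rails,
)
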
phoenix/experimental/evals/evaluators.py DELETED
@@ -1,413 +0,0 @@
- from textwrap import indent
- from typing import List, Mapping, Optional, Tuple, Type
-
- from phoenix.experimental.evals.models import set_verbosity
- from phoenix.experimental.evals.utils import (
-     NOT_PARSABLE,
-     openai_function_call_kwargs,
-     parse_openai_function_call,
-     snap_to_rail,
- )
- from phoenix.utilities.logging import printif
-
- from .models import BaseEvalModel, OpenAIModel
- from .templates import ClassificationTemplate, EvalCriteria, PromptOptions, PromptTemplate
-
- Record = Mapping[str, str]
- _TAB = " " * 4
-
-
- class LLMEvaluator:
-     """
-     Leverages an LLM to evaluate individual records.
-     """
-
-     def __init__(
-         self,
-         model: BaseEvalModel,
-         template: ClassificationTemplate,
-     ) -> None:
-         """Initializer for LLMEvaluator.
-
-         Args:
-             model (BaseEvalModel): The LLM model to use for evaluation.
-             template (ClassificationTemplate): The evaluation template.
-         """
-         self._model = model
-         self._template = template
-
-     @property
-     def default_concurrency(self) -> int:
-         return self._model.default_concurrency
-
-     def reload_client(self) -> None:
-         self._model.reload_client()
-
-     def evaluate(
-         self,
-         record: Record,
-         provide_explanation: bool = False,
-         use_function_calling_if_available: bool = True,
-         verbose: bool = False,
-     ) -> Tuple[str, Optional[float], Optional[str]]:
-         """
-         Evaluates a single record.
-
-         Args:
-             record (Record): The record to evaluate.
-
-             provide_explanation (bool, optional): Whether to provide an
-                 explanation.
-
-             use_function_calling_if_available (bool, optional): If True, use
-                 function calling (if available) as a means to constrain the LLM
-                 outputs. With function calling, the LLM is instructed to provide its
-                 response as a structured JSON object, which is easier to parse.
-
-             use_function_calling_if_available (bool, optional): If True, use
-                 function calling (if available) as a means to constrain the LLM
-                 outputs. With function calling, the LLM is instructed to provide its
-                 response as a structured JSON object, which is easier to parse.
-
-             verbose (bool, optional): Whether to print verbose output.
-
-         Returns:
-             Tuple[str, Optional[float], Optional[str]]: A tuple containing:
-                 - label
-                 - score (if scores for each label are specified by the template)
-                 - explanation (if requested)
-         """
-         use_openai_function_call = (
-             use_function_calling_if_available
-             and isinstance(self._model, OpenAIModel)
-             and self._model.supports_function_calling
-         )
-         prompt = self._template.format(
-             record, options=PromptOptions(provide_explanation=provide_explanation)
-         )
-         with set_verbosity(self._model, verbose) as verbose_model:
-             unparsed_output = verbose_model(
-                 prompt,
-                 **(
-                     openai_function_call_kwargs(self._template.rails, provide_explanation)
-                     if use_openai_function_call
-                     else {}
-                 ),
-             )
-         label, explanation = _extract_label_and_explanation(
-             unparsed_output=unparsed_output,
-             template=self._template,
-             provide_explanation=provide_explanation,
-             use_openai_function_call=use_openai_function_call,
-             verbose=verbose,
-         )
-         score = self._template.score(label)
-         return label, score, explanation
-
-     async def aevaluate(
-         self,
-         record: Record,
-         provide_explanation: bool = False,
-         use_function_calling_if_available: bool = True,
-         verbose: bool = False,
-     ) -> Tuple[str, Optional[float], Optional[str]]:
-         """
-         Evaluates a single record.
-
-         Args:
-             record (Record): The record to evaluate.
-
-             provide_explanation (bool, optional): Whether to provide an
-                 explanation.
-
-             use_function_calling_if_available (bool, optional): If True, use
-                 function calling (if available) as a means to constrain the LLM
-                 outputs. With function calling, the LLM is instructed to provide its
-                 response as a structured JSON object, which is easier to parse.
-
-             verbose (bool, optional): Whether to print verbose output.
-
-         Returns:
-             Tuple[str, Optional[float], Optional[str]]: A tuple containing:
-                 - label
-                 - score (if scores for each label are specified by the template)
-                 - explanation (if requested)
-         """
-         use_openai_function_call = (
-             use_function_calling_if_available
-             and isinstance(self._model, OpenAIModel)
-             and self._model.supports_function_calling
-         )
-         prompt = self._template.format(
-             record, options=PromptOptions(provide_explanation=provide_explanation)
-         )
-         with set_verbosity(self._model, verbose) as verbose_model:
-             unparsed_output = await verbose_model._async_generate(
-                 prompt,
-                 **(
-                     openai_function_call_kwargs(self._template.rails, provide_explanation)
-                     if use_openai_function_call
-                     else {}
-                 ),
-             )
-         label, explanation = _extract_label_and_explanation(
-             unparsed_output=unparsed_output,
-             template=self._template,
-             provide_explanation=provide_explanation,
-             use_openai_function_call=use_openai_function_call,
-             verbose=verbose,
-         )
-         score = self._template.score(label)
-         return label, score, explanation
-
-
- def _create_llm_evaluator_subclass(
-     class_name: str, template: ClassificationTemplate, docstring: str
- ) -> Type[LLMEvaluator]:
-     """A factory method that dynamically creates subclasses of LLMEvaluator.
-
-     Args:
-         class_name (str): Name of the class to be created (should match the name
-             of the assignment variable).
-
-         template (ClassificationTemplate): The classification template to use
-             for evaluation.
-
-         docstring (str): The docstring that will be attached to the subclass.
-
-     Returns:
-         Type[LLMEvaluator]: The dynamically created subclass.
-     """
-
-     def __init__(self: LLMEvaluator, model: BaseEvalModel) -> None:
-         LLMEvaluator.__init__(self, model, template)
-
-     __init__.__doc__ = f"""
-     Initializer for {class_name}.
-
-     Args:
-         model (BaseEvalModel): The LLM model to use for evaluation."""
-
-     docstring += f" Outputs railed classes {', '.join(template.rails)}."
-     docstring += "\n\nThe template used for evaluation (without explanation) is:\n\n"
-     docstring += indent(template.template, 2 * _TAB)
-
-     return type(class_name, (LLMEvaluator,), {"__init__": __init__, "__doc__": docstring})
-
-
- (
-     HallucinationEvaluator,
-     RelevanceEvaluator,
-     ToxicityEvaluator,
-     QAEvaluator,
-     SummarizationEvaluator,
- ) = map(
-     lambda args: _create_llm_evaluator_subclass(*args),
-     (
-         (
-             "HallucinationEvaluator",
-             EvalCriteria.HALLUCINATION.value,
-             'Leverages an LLM to evaluate whether a response (stored under an "output" column) is a hallucination given a query (stored under an "input" column) and one or more retrieved documents (stored under a "reference" column).',  # noqa: E501
-         ),
-         (
-             "RelevanceEvaluator",
-             EvalCriteria.RELEVANCE.value,
-             'Leverages an LLM to evaluate whether a retrieved document (stored under a "reference" column) is relevant or irrelevant to the corresponding query (stored under the "input" column).',  # noqa: E501
-         ),
-         (
-             "ToxicityEvaluator",
-             EvalCriteria.TOXICITY.value,
-             'Leverages an LLM to evaluate whether the string stored under the "input" column contains racist, sexist, chauvinistic, biased, or otherwise toxic content.',  # noqa: E501
-         ),
-         (
-             "QAEvaluator",
-             EvalCriteria.QA.value,
-             'Leverages an LLM to evaluate whether a response (stored under an "output" column) is correct or incorrect given a query (stored under an "input" column) and one or more retrieved documents (stored under a "reference" column).',  # noqa: E501
-         ),
-         (
-             "SummarizationEvaluator",
-             EvalCriteria.SUMMARIZATION.value,
-             'Leverages an LLM to evaluate whether a summary (stored under an "output" column) provides an accurate synopsis of an input document (stored under a "input" column).',  # noqa: E501
-         ),
-     ),
- )
-
-
- class MapReducer:
-     """
-     Evaluates data that is too large to fit into a single context window using a
-     map-reduce strategy. The data must first be divided into "chunks" that
-     individually fit into an LLM's context window. Each chunk of data is
-     individually evaluated (the "map" step), producing intermediate outputs that
-     are combined into a single result (the "reduce" step).
-
-     This is the simplest strategy for evaluating long-context data.
-     """
-
-     def __init__(
-         self,
-         model: BaseEvalModel,
-         map_prompt_template: PromptTemplate,
-         reduce_prompt_template: PromptTemplate,
-     ) -> None:
-         """Initializes an instance.
-
-         Args:
-             model (BaseEvalModel): The LLM model to use for evaluation.
-
-             map_prompt_template (PromptTemplate): The template that is mapped
-                 over each chunk to produce intermediate outputs. Must contain the
-                 {chunk} placeholder.
-
-             reduce_prompt_template (PromptTemplate): The template that combines
-                 the intermediate outputs into a single result. Must contain the
-                 {mapped} placeholder, which will be formatted as a list of the
-                 intermediate outputs produced by the map step.
-         """
-         self._model = model
-         self._map_prompt_template = map_prompt_template
-         self._reduce_prompt_template = reduce_prompt_template
-
-     def evaluate(self, chunks: List[str]) -> str:
-         """Evaluates a list of two or more chunks.
-
-         Args:
-             chunks (List[str]): A list of chunks to be evaluated. Each chunk is
-                 inserted into the map_prompt_template and must therefore fit within
-                 the LLM's context window and still leave room for the rest of the
-                 prompt.
-
-         Returns:
-             str: The output of the map-reduce process.
-         """
-         if len(chunks) < 2:
-             raise ValueError(
-                 "The map-reduce strategy is not needed to evaluate data "
-                 "that fits within a single context window. "
-                 "Consider using llm_classify instead."
-             )
-         model = self._model
-         mapped_records = []
-         for chunk in chunks:
-             map_prompt = self._map_prompt_template.format({"chunk": chunk})
-             intermediate_output = model(map_prompt)
-             mapped_records.append(intermediate_output)
-         reduce_prompt = self._reduce_prompt_template.format({"mapped": repr(mapped_records)})
-         return model(reduce_prompt)
-
-
- class Refiner:
-     """
-     Evaluates data that is too large to fit into a single context window using a
-     refine strategy. The data must first be divided into "chunks" that
-     individually fit into an LLM's context window. An initial "accumulator" is
-     generated from the first chunk of data. The accumulator is subsequently
-     refined by iteratively updating and incorporating new information from each
-     subsequent chunk. An optional synthesis step can be used to synthesize the
-     final accumulator into a desired format.
-     """
-
-     def __init__(
-         self,
-         model: BaseEvalModel,
-         initial_prompt_template: PromptTemplate,
-         refine_prompt_template: PromptTemplate,
-         synthesize_prompt_template: Optional[PromptTemplate] = None,
-     ) -> None:
-         """Initializes an instance.
-
-         Args:
-             model (BaseEvalModel): The LLM model to use for evaluation.
-
-             initial_prompt_template (PromptTemplate): The template for the
-                 initial invocation of the model that will generate the initial
-                 accumulator. Should contain the {chunk} placeholder.
-
-             refine_prompt_template (PromptTemplate): The template for refining
-                 the accumulator across all subsequence chunks. Must contain the
-                 {chunk} and {accumulator} placeholders.
-
-             synthesize_prompt_template (Optional[PromptTemplate], optional): An
-                 optional template to synthesize the final version of the
-                 accumulator. Must contain the {accumulator} placeholder.
-         """
-         self._model = model
-         self._initial_prompt_template = initial_prompt_template
-         self._refine_prompt_template = refine_prompt_template
-         self._synthesize_prompt_template = synthesize_prompt_template
-
-     def evaluate(self, chunks: List[str]) -> str:
-         """Evaluates a list of two or more chunks.
-
-         Args:
-             chunks (List[str]): A list of chunks to be evaluated. Each chunk is
-                 inserted into the initial_prompt_template and refine_prompt_template
-                 and must therefore fit within the LLM's context window and still
-                 leave room for the rest of the prompt.
-
-         Returns:
-             str: The output of the refine process.
-         """
-         if len(chunks) < 2:
-             raise ValueError(
-                 "The refine strategy is not needed to evaluate data "
-                 "that fits within a single context window. "
-                 "Consider using llm_classify instead."
-             )
-         model = self._model
-         initial_prompt = self._initial_prompt_template.format({"chunk": chunks[0]})
-         accumulator = model(initial_prompt)
-         for chunk in chunks[1:]:
-             refine_prompt = self._refine_prompt_template.format(
-                 {"accumulator": accumulator, "chunk": chunk}
-             )
-             accumulator = model(refine_prompt)
-         if not self._synthesize_prompt_template:
-             return accumulator
-         reduce_prompt = self._synthesize_prompt_template.format({"accumulator": accumulator})
-         return model(reduce_prompt)
-
-
- def _extract_label_and_explanation(
-     unparsed_output: str,
-     template: ClassificationTemplate,
-     provide_explanation: bool,
-     use_openai_function_call: bool,
-     verbose: bool,
- ) -> Tuple[str, Optional[str]]:
-     """
-     Extracts the label and explanation from the unparsed output.
-
-     Args:
-         unparsed_output (str): The raw output to be parsed.
-
-         template (ClassificationTemplate): The template used to generate the
-             output.
-
-         provide_explanation (bool): Whether the output includes an explanation.
-
-         use_openai_function_call (bool): Whether the output was generated using
-             function calling.
-
-         verbose (bool): If True, print verbose output to stdout.
-
-     Returns:
-         Tuple[str, Optional[str]]: A tuple containing the label and an
-             explanation (if one is provided).
-     """
-     if not use_openai_function_call:
-         if provide_explanation:
-             unrailed_label, explanation = (
-                 template.extract_label_from_explanation(unparsed_output),
-                 unparsed_output,
-             )
-             printif(
-                 verbose and unrailed_label == NOT_PARSABLE,
-                 f"- Could not parse {repr(unparsed_output)}",
-             )
-         else:
-             unrailed_label = unparsed_output
-             explanation = None
-     else:
-         unrailed_label, explanation = parse_openai_function_call(unparsed_output)
-     return snap_to_rail(unrailed_label, template.rails, verbose=verbose), explanation
phoenix/experimental/evals/functions/__init__.py DELETED
@@ -1,4 +0,0 @@
- from .classify import llm_classify, run_evals, run_relevance_eval
- from .generate import llm_generate
-
- __all__ = ["llm_classify", "run_relevance_eval", "llm_generate", "run_evals"]