arize-phoenix 3.25.0__py3-none-any.whl → 4.0.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Note: this release of arize-phoenix has been flagged as potentially problematic.

Files changed (113)
  1. {arize_phoenix-3.25.0.dist-info → arize_phoenix-4.0.1.dist-info}/METADATA +26 -4
  2. {arize_phoenix-3.25.0.dist-info → arize_phoenix-4.0.1.dist-info}/RECORD +80 -75
  3. phoenix/__init__.py +9 -5
  4. phoenix/config.py +109 -53
  5. phoenix/datetime_utils.py +18 -1
  6. phoenix/db/README.md +25 -0
  7. phoenix/db/__init__.py +4 -0
  8. phoenix/db/alembic.ini +119 -0
  9. phoenix/db/bulk_inserter.py +206 -0
  10. phoenix/db/engines.py +152 -0
  11. phoenix/db/helpers.py +47 -0
  12. phoenix/db/insertion/evaluation.py +209 -0
  13. phoenix/db/insertion/helpers.py +51 -0
  14. phoenix/db/insertion/span.py +142 -0
  15. phoenix/db/migrate.py +71 -0
  16. phoenix/db/migrations/env.py +121 -0
  17. phoenix/db/migrations/script.py.mako +26 -0
  18. phoenix/db/migrations/versions/cf03bd6bae1d_init.py +280 -0
  19. phoenix/db/models.py +371 -0
  20. phoenix/exceptions.py +5 -1
  21. phoenix/server/api/context.py +40 -3
  22. phoenix/server/api/dataloaders/__init__.py +97 -0
  23. phoenix/server/api/dataloaders/cache/__init__.py +3 -0
  24. phoenix/server/api/dataloaders/cache/two_tier_cache.py +67 -0
  25. phoenix/server/api/dataloaders/document_evaluation_summaries.py +152 -0
  26. phoenix/server/api/dataloaders/document_evaluations.py +37 -0
  27. phoenix/server/api/dataloaders/document_retrieval_metrics.py +98 -0
  28. phoenix/server/api/dataloaders/evaluation_summaries.py +151 -0
  29. phoenix/server/api/dataloaders/latency_ms_quantile.py +198 -0
  30. phoenix/server/api/dataloaders/min_start_or_max_end_times.py +93 -0
  31. phoenix/server/api/dataloaders/record_counts.py +125 -0
  32. phoenix/server/api/dataloaders/span_descendants.py +64 -0
  33. phoenix/server/api/dataloaders/span_evaluations.py +37 -0
  34. phoenix/server/api/dataloaders/token_counts.py +138 -0
  35. phoenix/server/api/dataloaders/trace_evaluations.py +37 -0
  36. phoenix/server/api/input_types/SpanSort.py +138 -68
  37. phoenix/server/api/routers/v1/__init__.py +11 -0
  38. phoenix/server/api/routers/v1/evaluations.py +275 -0
  39. phoenix/server/api/routers/v1/spans.py +126 -0
  40. phoenix/server/api/routers/v1/traces.py +82 -0
  41. phoenix/server/api/schema.py +112 -48
  42. phoenix/server/api/types/DocumentEvaluationSummary.py +1 -1
  43. phoenix/server/api/types/Evaluation.py +29 -12
  44. phoenix/server/api/types/EvaluationSummary.py +29 -44
  45. phoenix/server/api/types/MimeType.py +2 -2
  46. phoenix/server/api/types/Model.py +9 -9
  47. phoenix/server/api/types/Project.py +240 -171
  48. phoenix/server/api/types/Span.py +87 -131
  49. phoenix/server/api/types/Trace.py +29 -20
  50. phoenix/server/api/types/pagination.py +151 -10
  51. phoenix/server/app.py +263 -35
  52. phoenix/server/grpc_server.py +93 -0
  53. phoenix/server/main.py +75 -60
  54. phoenix/server/openapi/docs.py +218 -0
  55. phoenix/server/prometheus.py +23 -7
  56. phoenix/server/static/index.js +662 -643
  57. phoenix/server/telemetry.py +68 -0
  58. phoenix/services.py +4 -0
  59. phoenix/session/client.py +34 -30
  60. phoenix/session/data_extractor.py +8 -3
  61. phoenix/session/session.py +176 -155
  62. phoenix/settings.py +13 -0
  63. phoenix/trace/attributes.py +349 -0
  64. phoenix/trace/dsl/README.md +116 -0
  65. phoenix/trace/dsl/filter.py +660 -192
  66. phoenix/trace/dsl/helpers.py +24 -5
  67. phoenix/trace/dsl/query.py +562 -185
  68. phoenix/trace/fixtures.py +69 -7
  69. phoenix/trace/otel.py +44 -200
  70. phoenix/trace/schemas.py +14 -8
  71. phoenix/trace/span_evaluations.py +5 -2
  72. phoenix/utilities/__init__.py +0 -26
  73. phoenix/utilities/span_store.py +0 -23
  74. phoenix/version.py +1 -1
  75. phoenix/core/project.py +0 -773
  76. phoenix/core/traces.py +0 -96
  77. phoenix/datasets/dataset.py +0 -214
  78. phoenix/datasets/fixtures.py +0 -24
  79. phoenix/datasets/schema.py +0 -31
  80. phoenix/experimental/evals/__init__.py +0 -73
  81. phoenix/experimental/evals/evaluators.py +0 -413
  82. phoenix/experimental/evals/functions/__init__.py +0 -4
  83. phoenix/experimental/evals/functions/classify.py +0 -453
  84. phoenix/experimental/evals/functions/executor.py +0 -353
  85. phoenix/experimental/evals/functions/generate.py +0 -138
  86. phoenix/experimental/evals/functions/processing.py +0 -76
  87. phoenix/experimental/evals/models/__init__.py +0 -14
  88. phoenix/experimental/evals/models/anthropic.py +0 -175
  89. phoenix/experimental/evals/models/base.py +0 -170
  90. phoenix/experimental/evals/models/bedrock.py +0 -221
  91. phoenix/experimental/evals/models/litellm.py +0 -134
  92. phoenix/experimental/evals/models/openai.py +0 -453
  93. phoenix/experimental/evals/models/rate_limiters.py +0 -246
  94. phoenix/experimental/evals/models/vertex.py +0 -173
  95. phoenix/experimental/evals/models/vertexai.py +0 -186
  96. phoenix/experimental/evals/retrievals.py +0 -96
  97. phoenix/experimental/evals/templates/__init__.py +0 -50
  98. phoenix/experimental/evals/templates/default_templates.py +0 -472
  99. phoenix/experimental/evals/templates/template.py +0 -195
  100. phoenix/experimental/evals/utils/__init__.py +0 -172
  101. phoenix/experimental/evals/utils/threads.py +0 -27
  102. phoenix/server/api/routers/evaluation_handler.py +0 -110
  103. phoenix/server/api/routers/span_handler.py +0 -70
  104. phoenix/server/api/routers/trace_handler.py +0 -60
  105. phoenix/storage/span_store/__init__.py +0 -23
  106. phoenix/storage/span_store/text_file.py +0 -85
  107. phoenix/trace/dsl/missing.py +0 -60
  108. {arize_phoenix-3.25.0.dist-info → arize_phoenix-4.0.1.dist-info}/WHEEL +0 -0
  109. {arize_phoenix-3.25.0.dist-info → arize_phoenix-4.0.1.dist-info}/licenses/IP_NOTICE +0 -0
  110. {arize_phoenix-3.25.0.dist-info → arize_phoenix-4.0.1.dist-info}/licenses/LICENSE +0 -0
  111. /phoenix/{datasets → db/insertion}/__init__.py +0 -0
  112. /phoenix/{experimental → db/migrations}/__init__.py +0 -0
  113. /phoenix/{storage → server/openapi}/__init__.py +0 -0
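
Expanded diff for phoenix/experimental/evals/evaluators.py (413 lines removed):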
@@ -1,413 +0,0 @@
- from textwrap import indent
- from typing import List, Mapping, Optional, Tuple, Type
-
- from phoenix.experimental.evals.models import set_verbosity
- from phoenix.experimental.evals.utils import (
-     NOT_PARSABLE,
-     openai_function_call_kwargs,
-     parse_openai_function_call,
-     snap_to_rail,
- )
- from phoenix.utilities.logging import printif
-
- from .models import BaseEvalModel, OpenAIModel
- from .templates import ClassificationTemplate, EvalCriteria, PromptOptions, PromptTemplate
-
- Record = Mapping[str, str]
- _TAB = " " * 4
-
-
- class LLMEvaluator:
-     """
-     Leverages an LLM to evaluate individual records.
-     """
-
-     def __init__(
-         self,
-         model: BaseEvalModel,
-         template: ClassificationTemplate,
-     ) -> None:
-         """Initializer for LLMEvaluator.
-
-         Args:
-             model (BaseEvalModel): The LLM model to use for evaluation.
-             template (ClassificationTemplate): The evaluation template.
-         """
-         self._model = model
-         self._template = template
-
-     @property
-     def default_concurrency(self) -> int:
-         return self._model.default_concurrency
-
-     def reload_client(self) -> None:
-         self._model.reload_client()
-
-     def evaluate(
-         self,
-         record: Record,
-         provide_explanation: bool = False,
-         use_function_calling_if_available: bool = True,
-         verbose: bool = False,
-     ) -> Tuple[str, Optional[float], Optional[str]]:
-         """
-         Evaluates a single record.
-
-         Args:
-             record (Record): The record to evaluate.
-
-             provide_explanation (bool, optional): Whether to provide an
-             explanation.
-
-             use_function_calling_if_available (bool, optional): If True, use
-             function calling (if available) as a means to constrain the LLM
-             outputs. With function calling, the LLM is instructed to provide its
-             response as a structured JSON object, which is easier to parse.
-
-             use_function_calling_if_available (bool, optional): If True, use
-             function calling (if available) as a means to constrain the LLM
-             outputs. With function calling, the LLM is instructed to provide its
-             response as a structured JSON object, which is easier to parse.
-
-             verbose (bool, optional): Whether to print verbose output.
-
-         Returns:
-             Tuple[str, Optional[float], Optional[str]]: A tuple containing:
-                 - label
-                 - score (if scores for each label are specified by the template)
-                 - explanation (if requested)
-         """
-         use_openai_function_call = (
-             use_function_calling_if_available
-             and isinstance(self._model, OpenAIModel)
-             and self._model.supports_function_calling
-         )
-         prompt = self._template.format(
-             record, options=PromptOptions(provide_explanation=provide_explanation)
-         )
-         with set_verbosity(self._model, verbose) as verbose_model:
-             unparsed_output = verbose_model(
-                 prompt,
-                 **(
-                     openai_function_call_kwargs(self._template.rails, provide_explanation)
-                     if use_openai_function_call
-                     else {}
-                 ),
-             )
-         label, explanation = _extract_label_and_explanation(
-             unparsed_output=unparsed_output,
-             template=self._template,
-             provide_explanation=provide_explanation,
-             use_openai_function_call=use_openai_function_call,
-             verbose=verbose,
-         )
-         score = self._template.score(label)
-         return label, score, explanation
-
-     async def aevaluate(
-         self,
-         record: Record,
-         provide_explanation: bool = False,
-         use_function_calling_if_available: bool = True,
-         verbose: bool = False,
-     ) -> Tuple[str, Optional[float], Optional[str]]:
-         """
-         Evaluates a single record.
-
-         Args:
-             record (Record): The record to evaluate.
-
-             provide_explanation (bool, optional): Whether to provide an
-             explanation.
-
-             use_function_calling_if_available (bool, optional): If True, use
-             function calling (if available) as a means to constrain the LLM
-             outputs. With function calling, the LLM is instructed to provide its
-             response as a structured JSON object, which is easier to parse.
-
-             verbose (bool, optional): Whether to print verbose output.
-
-         Returns:
-             Tuple[str, Optional[float], Optional[str]]: A tuple containing:
-                 - label
-                 - score (if scores for each label are specified by the template)
-                 - explanation (if requested)
-         """
-         use_openai_function_call = (
-             use_function_calling_if_available
-             and isinstance(self._model, OpenAIModel)
-             and self._model.supports_function_calling
-         )
-         prompt = self._template.format(
-             record, options=PromptOptions(provide_explanation=provide_explanation)
-         )
-         with set_verbosity(self._model, verbose) as verbose_model:
-             unparsed_output = await verbose_model._async_generate(
-                 prompt,
-                 **(
-                     openai_function_call_kwargs(self._template.rails, provide_explanation)
-                     if use_openai_function_call
-                     else {}
-                 ),
-             )
-         label, explanation = _extract_label_and_explanation(
-             unparsed_output=unparsed_output,
-             template=self._template,
-             provide_explanation=provide_explanation,
-             use_openai_function_call=use_openai_function_call,
-             verbose=verbose,
-         )
-         score = self._template.score(label)
-         return label, score, explanation
-
-
- def _create_llm_evaluator_subclass(
-     class_name: str, template: ClassificationTemplate, docstring: str
- ) -> Type[LLMEvaluator]:
-     """A factory method that dynamically creates subclasses of LLMEvaluator.
-
-     Args:
-         class_name (str): Name of the class to be created (should match the name
-         of the assignment variable).
-
-         template (ClassificationTemplate): The classification template to use
-         for evaluation.
-
-         docstring (str): The docstring that will be attached to the subclass.
-
-     Returns:
-         Type[LLMEvaluator]: The dynamically created subclass.
-     """
-
-     def __init__(self: LLMEvaluator, model: BaseEvalModel) -> None:
-         LLMEvaluator.__init__(self, model, template)
-
-     __init__.__doc__ = f"""
-     Initializer for {class_name}.
-
-     Args:
-         model (BaseEvalModel): The LLM model to use for evaluation."""
-
-     docstring += f" Outputs railed classes {', '.join(template.rails)}."
-     docstring += "\n\nThe template used for evaluation (without explanation) is:\n\n"
-     docstring += indent(template.template, 2 * _TAB)
-
-     return type(class_name, (LLMEvaluator,), {"__init__": __init__, "__doc__": docstring})
-
-
- (
-     HallucinationEvaluator,
-     RelevanceEvaluator,
-     ToxicityEvaluator,
-     QAEvaluator,
-     SummarizationEvaluator,
- ) = map(
-     lambda args: _create_llm_evaluator_subclass(*args),
-     (
-         (
-             "HallucinationEvaluator",
-             EvalCriteria.HALLUCINATION.value,
-             'Leverages an LLM to evaluate whether a response (stored under an "output" column) is a hallucination given a query (stored under an "input" column) and one or more retrieved documents (stored under a "reference" column).',  # noqa: E501
-         ),
-         (
-             "RelevanceEvaluator",
-             EvalCriteria.RELEVANCE.value,
-             'Leverages an LLM to evaluate whether a retrieved document (stored under a "reference" column) is relevant or irrelevant to the corresponding query (stored under the "input" column).',  # noqa: E501
-         ),
-         (
-             "ToxicityEvaluator",
-             EvalCriteria.TOXICITY.value,
-             'Leverages an LLM to evaluate whether the string stored under the "input" column contains racist, sexist, chauvinistic, biased, or otherwise toxic content.',  # noqa: E501
-         ),
-         (
-             "QAEvaluator",
-             EvalCriteria.QA.value,
-             'Leverages an LLM to evaluate whether a response (stored under an "output" column) is correct or incorrect given a query (stored under an "input" column) and one or more retrieved documents (stored under a "reference" column).',  # noqa: E501
-         ),
-         (
-             "SummarizationEvaluator",
-             EvalCriteria.SUMMARIZATION.value,
-             'Leverages an LLM to evaluate whether a summary (stored under an "output" column) provides an accurate synopsis of an input document (stored under a "input" column).',  # noqa: E501
-         ),
-     ),
- )
-
-
- class MapReducer:
-     """
-     Evaluates data that is too large to fit into a single context window using a
-     map-reduce strategy. The data must first be divided into "chunks" that
-     individually fit into an LLM's context window. Each chunk of data is
-     individually evaluated (the "map" step), producing intermediate outputs that
-     are combined into a single result (the "reduce" step).
-
-     This is the simplest strategy for evaluating long-context data.
-     """
-
-     def __init__(
-         self,
-         model: BaseEvalModel,
-         map_prompt_template: PromptTemplate,
-         reduce_prompt_template: PromptTemplate,
-     ) -> None:
-         """Initializes an instance.
-
-         Args:
-             model (BaseEvalModel): The LLM model to use for evaluation.
-
-             map_prompt_template (PromptTemplate): The template that is mapped
-             over each chunk to produce intermediate outputs. Must contain the
-             {chunk} placeholder.
-
-             reduce_prompt_template (PromptTemplate): The template that combines
-             the intermediate outputs into a single result. Must contain the
-             {mapped} placeholder, which will be formatted as a list of the
-             intermediate outputs produced by the map step.
-         """
-         self._model = model
-         self._map_prompt_template = map_prompt_template
-         self._reduce_prompt_template = reduce_prompt_template
-
-     def evaluate(self, chunks: List[str]) -> str:
-         """Evaluates a list of two or more chunks.
-
-         Args:
-             chunks (List[str]): A list of chunks to be evaluated. Each chunk is
-             inserted into the map_prompt_template and must therefore fit within
-             the LLM's context window and still leave room for the rest of the
-             prompt.
-
-         Returns:
-             str: The output of the map-reduce process.
-         """
-         if len(chunks) < 2:
-             raise ValueError(
-                 "The map-reduce strategy is not needed to evaluate data "
-                 "that fits within a single context window. "
-                 "Consider using llm_classify instead."
-             )
-         model = self._model
-         mapped_records = []
-         for chunk in chunks:
-             map_prompt = self._map_prompt_template.format({"chunk": chunk})
-             intermediate_output = model(map_prompt)
-             mapped_records.append(intermediate_output)
-         reduce_prompt = self._reduce_prompt_template.format({"mapped": repr(mapped_records)})
-         return model(reduce_prompt)
-
-
- class Refiner:
-     """
-     Evaluates data that is too large to fit into a single context window using a
-     refine strategy. The data must first be divided into "chunks" that
-     individually fit into an LLM's context window. An initial "accumulator" is
-     generated from the first chunk of data. The accumulator is subsequently
-     refined by iteratively updating and incorporating new information from each
-     subsequent chunk. An optional synthesis step can be used to synthesize the
-     final accumulator into a desired format.
-     """
-
-     def __init__(
-         self,
-         model: BaseEvalModel,
-         initial_prompt_template: PromptTemplate,
-         refine_prompt_template: PromptTemplate,
-         synthesize_prompt_template: Optional[PromptTemplate] = None,
-     ) -> None:
-         """Initializes an instance.
-
-         Args:
-             model (BaseEvalModel): The LLM model to use for evaluation.
-
-             initial_prompt_template (PromptTemplate): The template for the
-             initial invocation of the model that will generate the initial
-             accumulator. Should contain the {chunk} placeholder.
-
-             refine_prompt_template (PromptTemplate): The template for refining
-             the accumulator across all subsequence chunks. Must contain the
-             {chunk} and {accumulator} placeholders.
-
-             synthesize_prompt_template (Optional[PromptTemplate], optional): An
-             optional template to synthesize the final version of the
-             accumulator. Must contain the {accumulator} placeholder.
-         """
-         self._model = model
-         self._initial_prompt_template = initial_prompt_template
-         self._refine_prompt_template = refine_prompt_template
-         self._synthesize_prompt_template = synthesize_prompt_template
-
-     def evaluate(self, chunks: List[str]) -> str:
-         """Evaluates a list of two or more chunks.
-
-         Args:
-             chunks (List[str]): A list of chunks to be evaluated. Each chunk is
-             inserted into the initial_prompt_template and refine_prompt_template
-             and must therefore fit within the LLM's context window and still
-             leave room for the rest of the prompt.
-
-         Returns:
-             str: The output of the refine process.
-         """
-         if len(chunks) < 2:
-             raise ValueError(
-                 "The refine strategy is not needed to evaluate data "
-                 "that fits within a single context window. "
-                 "Consider using llm_classify instead."
-             )
-         model = self._model
-         initial_prompt = self._initial_prompt_template.format({"chunk": chunks[0]})
-         accumulator = model(initial_prompt)
-         for chunk in chunks[1:]:
-             refine_prompt = self._refine_prompt_template.format(
-                 {"accumulator": accumulator, "chunk": chunk}
-             )
-             accumulator = model(refine_prompt)
-         if not self._synthesize_prompt_template:
-             return accumulator
-         reduce_prompt = self._synthesize_prompt_template.format({"accumulator": accumulator})
-         return model(reduce_prompt)
-
-
- def _extract_label_and_explanation(
-     unparsed_output: str,
-     template: ClassificationTemplate,
-     provide_explanation: bool,
-     use_openai_function_call: bool,
-     verbose: bool,
- ) -> Tuple[str, Optional[str]]:
-     """
-     Extracts the label and explanation from the unparsed output.
-
-     Args:
-         unparsed_output (str): The raw output to be parsed.
-
-         template (ClassificationTemplate): The template used to generate the
-         output.
-
-         provide_explanation (bool): Whether the output includes an explanation.
-
-         use_openai_function_call (bool): Whether the output was generated using
-         function calling.
-
-         verbose (bool): If True, print verbose output to stdout.
-
-     Returns:
-         Tuple[str, Optional[str]]: A tuple containing the label and an
-         explanation (if one is provided).
-     """
-     if not use_openai_function_call:
-         if provide_explanation:
-             unrailed_label, explanation = (
-                 template.extract_label_from_explanation(unparsed_output),
-                 unparsed_output,
-             )
-             printif(
-                 verbose and unrailed_label == NOT_PARSABLE,
-                 f"- Could not parse {repr(unparsed_output)}",
-             )
-         else:
-             unrailed_label = unparsed_output
-             explanation = None
-     else:
-         unrailed_label, explanation = parse_openai_function_call(unparsed_output)
-     return snap_to_rail(unrailed_label, template.rails, verbose=verbose), explanation
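
Expanded diff for phoenix/experimental/evals/functions/__init__.py (4 lines removed):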
@@ -1,4 +0,0 @@
- from .classify import llm_classify, run_evals, run_relevance_eval
- from .generate import llm_generate
-
- __all__ = ["llm_classify", "run_relevance_eval", "llm_generate", "run_evals"]
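
The two expanded diffs above correspond to the removal of the phoenix.experimental.evals package; in 4.x that functionality is distributed separately as the arize-phoenix-evals package (imported as phoenix.evals), so code that depends on these modules should switch distributions rather than pin to 3.25.0. For reference, the sketch below shows how the deleted 3.x evaluator API was typically driven. It is reconstructed only from the deleted code shown above; the OpenAIModel keyword argument and the record values are illustrative assumptions, not taken from the package.

    # Sketch of the removed 3.x evaluator API, based on the deleted evaluators.py above.
    from phoenix.experimental.evals.evaluators import HallucinationEvaluator
    from phoenix.experimental.evals.models import OpenAIModel

    model = OpenAIModel(model="gpt-4")  # constructor kwarg assumed; requires OpenAI credentials
    evaluator = HallucinationEvaluator(model)  # subclass generated from the HALLUCINATION template

    # Per its docstring, the evaluator reads the "input", "reference", and "output" keys.
    label, score, explanation = evaluator.evaluate(
        record={
            "input": "What is Arize Phoenix?",
            "reference": "Phoenix is an open-source LLM tracing and evaluation tool.",
            "output": "Phoenix is a managed relational database service.",
        },
        provide_explanation=True,
    )
    print(label, score, explanation)

The deleted MapReducer and Refiner classes implemented the two long-context strategies described in their docstrings. A minimal map-reduce sketch, again using only names visible in the diff and assuming PromptTemplate takes the template text as its first argument:

    from phoenix.experimental.evals.evaluators import MapReducer
    from phoenix.experimental.evals.models import OpenAIModel
    from phoenix.experimental.evals.templates import PromptTemplate

    mapper = MapReducer(
        model=OpenAIModel(model="gpt-4"),  # kwarg assumed
        # The map template must contain {chunk}; the reduce template must contain {mapped}.
        map_prompt_template=PromptTemplate("Summarize this transcript chunk:\n\n{chunk}"),
        reduce_prompt_template=PromptTemplate(
            "Combine these chunk summaries into a single summary:\n\n{mapped}"
        ),
    )
    summary = mapper.evaluate(["first chunk ...", "second chunk ..."])  # needs two or more chunks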