arize-phoenix 5.5.2__py3-none-any.whl → 5.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of arize-phoenix might be problematic.
- {arize_phoenix-5.5.2.dist-info → arize_phoenix-5.7.0.dist-info}/METADATA +4 -7
- arize_phoenix-5.7.0.dist-info/RECORD +330 -0
- phoenix/config.py +50 -8
- phoenix/core/model.py +3 -3
- phoenix/core/model_schema.py +41 -50
- phoenix/core/model_schema_adapter.py +17 -16
- phoenix/datetime_utils.py +2 -2
- phoenix/db/bulk_inserter.py +10 -20
- phoenix/db/engines.py +2 -1
- phoenix/db/enums.py +2 -2
- phoenix/db/helpers.py +8 -7
- phoenix/db/insertion/dataset.py +9 -19
- phoenix/db/insertion/document_annotation.py +14 -13
- phoenix/db/insertion/helpers.py +6 -16
- phoenix/db/insertion/span_annotation.py +14 -13
- phoenix/db/insertion/trace_annotation.py +14 -13
- phoenix/db/insertion/types.py +19 -30
- phoenix/db/migrations/versions/3be8647b87d8_add_token_columns_to_spans_table.py +8 -8
- phoenix/db/models.py +28 -28
- phoenix/experiments/evaluators/base.py +2 -1
- phoenix/experiments/evaluators/code_evaluators.py +4 -5
- phoenix/experiments/evaluators/llm_evaluators.py +157 -4
- phoenix/experiments/evaluators/utils.py +3 -2
- phoenix/experiments/functions.py +10 -21
- phoenix/experiments/tracing.py +2 -1
- phoenix/experiments/types.py +20 -29
- phoenix/experiments/utils.py +2 -1
- phoenix/inferences/errors.py +6 -5
- phoenix/inferences/fixtures.py +6 -5
- phoenix/inferences/inferences.py +37 -37
- phoenix/inferences/schema.py +11 -10
- phoenix/inferences/validation.py +13 -14
- phoenix/logging/_formatter.py +3 -3
- phoenix/metrics/__init__.py +5 -4
- phoenix/metrics/binning.py +2 -1
- phoenix/metrics/metrics.py +2 -1
- phoenix/metrics/mixins.py +7 -6
- phoenix/metrics/retrieval_metrics.py +2 -1
- phoenix/metrics/timeseries.py +5 -4
- phoenix/metrics/wrappers.py +2 -2
- phoenix/pointcloud/clustering.py +3 -4
- phoenix/pointcloud/pointcloud.py +7 -5
- phoenix/pointcloud/umap_parameters.py +2 -1
- phoenix/server/api/dataloaders/annotation_summaries.py +12 -19
- phoenix/server/api/dataloaders/average_experiment_run_latency.py +2 -2
- phoenix/server/api/dataloaders/cache/two_tier_cache.py +3 -2
- phoenix/server/api/dataloaders/dataset_example_revisions.py +3 -8
- phoenix/server/api/dataloaders/dataset_example_spans.py +2 -5
- phoenix/server/api/dataloaders/document_evaluation_summaries.py +12 -18
- phoenix/server/api/dataloaders/document_evaluations.py +3 -7
- phoenix/server/api/dataloaders/document_retrieval_metrics.py +6 -13
- phoenix/server/api/dataloaders/experiment_annotation_summaries.py +4 -8
- phoenix/server/api/dataloaders/experiment_error_rates.py +2 -5
- phoenix/server/api/dataloaders/experiment_run_annotations.py +3 -7
- phoenix/server/api/dataloaders/experiment_run_counts.py +1 -5
- phoenix/server/api/dataloaders/experiment_sequence_number.py +2 -5
- phoenix/server/api/dataloaders/latency_ms_quantile.py +21 -30
- phoenix/server/api/dataloaders/min_start_or_max_end_times.py +7 -13
- phoenix/server/api/dataloaders/project_by_name.py +3 -3
- phoenix/server/api/dataloaders/record_counts.py +11 -18
- phoenix/server/api/dataloaders/span_annotations.py +3 -7
- phoenix/server/api/dataloaders/span_dataset_examples.py +3 -8
- phoenix/server/api/dataloaders/span_descendants.py +3 -7
- phoenix/server/api/dataloaders/span_projects.py +2 -2
- phoenix/server/api/dataloaders/token_counts.py +12 -19
- phoenix/server/api/dataloaders/trace_row_ids.py +3 -7
- phoenix/server/api/dataloaders/user_roles.py +3 -3
- phoenix/server/api/dataloaders/users.py +3 -3
- phoenix/server/api/helpers/__init__.py +4 -3
- phoenix/server/api/helpers/dataset_helpers.py +10 -9
- phoenix/server/api/helpers/playground_clients.py +671 -0
- phoenix/server/api/helpers/playground_registry.py +70 -0
- phoenix/server/api/helpers/playground_spans.py +325 -0
- phoenix/server/api/input_types/AddExamplesToDatasetInput.py +2 -2
- phoenix/server/api/input_types/AddSpansToDatasetInput.py +2 -2
- phoenix/server/api/input_types/ChatCompletionInput.py +38 -0
- phoenix/server/api/input_types/ChatCompletionMessageInput.py +13 -1
- phoenix/server/api/input_types/ClusterInput.py +2 -2
- phoenix/server/api/input_types/DeleteAnnotationsInput.py +1 -3
- phoenix/server/api/input_types/DeleteDatasetExamplesInput.py +2 -2
- phoenix/server/api/input_types/DeleteExperimentsInput.py +1 -3
- phoenix/server/api/input_types/DimensionFilter.py +4 -4
- phoenix/server/api/input_types/GenerativeModelInput.py +17 -0
- phoenix/server/api/input_types/Granularity.py +1 -1
- phoenix/server/api/input_types/InvocationParameters.py +156 -13
- phoenix/server/api/input_types/PatchDatasetExamplesInput.py +2 -2
- phoenix/server/api/input_types/TemplateOptions.py +10 -0
- phoenix/server/api/mutations/__init__.py +4 -0
- phoenix/server/api/mutations/chat_mutations.py +374 -0
- phoenix/server/api/mutations/dataset_mutations.py +4 -4
- phoenix/server/api/mutations/experiment_mutations.py +1 -2
- phoenix/server/api/mutations/export_events_mutations.py +7 -7
- phoenix/server/api/mutations/span_annotations_mutations.py +4 -4
- phoenix/server/api/mutations/trace_annotations_mutations.py +4 -4
- phoenix/server/api/mutations/user_mutations.py +4 -4
- phoenix/server/api/openapi/schema.py +2 -2
- phoenix/server/api/queries.py +61 -72
- phoenix/server/api/routers/oauth2.py +4 -4
- phoenix/server/api/routers/v1/datasets.py +22 -36
- phoenix/server/api/routers/v1/evaluations.py +6 -5
- phoenix/server/api/routers/v1/experiment_evaluations.py +2 -2
- phoenix/server/api/routers/v1/experiment_runs.py +2 -2
- phoenix/server/api/routers/v1/experiments.py +4 -4
- phoenix/server/api/routers/v1/spans.py +13 -12
- phoenix/server/api/routers/v1/traces.py +5 -5
- phoenix/server/api/routers/v1/utils.py +5 -5
- phoenix/server/api/schema.py +42 -10
- phoenix/server/api/subscriptions.py +347 -494
- phoenix/server/api/types/AnnotationSummary.py +3 -3
- phoenix/server/api/types/ChatCompletionSubscriptionPayload.py +44 -0
- phoenix/server/api/types/Cluster.py +8 -7
- phoenix/server/api/types/Dataset.py +5 -4
- phoenix/server/api/types/Dimension.py +3 -3
- phoenix/server/api/types/DocumentEvaluationSummary.py +8 -7
- phoenix/server/api/types/EmbeddingDimension.py +6 -5
- phoenix/server/api/types/EvaluationSummary.py +3 -3
- phoenix/server/api/types/Event.py +7 -7
- phoenix/server/api/types/Experiment.py +3 -3
- phoenix/server/api/types/ExperimentComparison.py +2 -4
- phoenix/server/api/types/GenerativeProvider.py +27 -3
- phoenix/server/api/types/Inferences.py +9 -8
- phoenix/server/api/types/InferencesRole.py +2 -2
- phoenix/server/api/types/Model.py +2 -2
- phoenix/server/api/types/Project.py +11 -18
- phoenix/server/api/types/Segments.py +3 -3
- phoenix/server/api/types/Span.py +45 -7
- phoenix/server/api/types/TemplateLanguage.py +9 -0
- phoenix/server/api/types/TimeSeries.py +8 -7
- phoenix/server/api/types/Trace.py +2 -2
- phoenix/server/api/types/UMAPPoints.py +6 -6
- phoenix/server/api/types/User.py +3 -3
- phoenix/server/api/types/node.py +1 -3
- phoenix/server/api/types/pagination.py +4 -4
- phoenix/server/api/utils.py +2 -4
- phoenix/server/app.py +76 -37
- phoenix/server/bearer_auth.py +4 -10
- phoenix/server/dml_event.py +3 -3
- phoenix/server/dml_event_handler.py +10 -24
- phoenix/server/grpc_server.py +3 -2
- phoenix/server/jwt_store.py +22 -21
- phoenix/server/main.py +17 -4
- phoenix/server/oauth2.py +3 -2
- phoenix/server/rate_limiters.py +5 -8
- phoenix/server/static/.vite/manifest.json +31 -31
- phoenix/server/static/assets/components-Csu8UKOs.js +1612 -0
- phoenix/server/static/assets/{index-DCzakdJq.js → index-Bk5C9EA7.js} +2 -2
- phoenix/server/static/assets/{pages-CAL1FDMt.js → pages-UeWaKXNs.js} +337 -442
- phoenix/server/static/assets/{vendor-6IcPAw_j.js → vendor-CtqfhlbC.js} +6 -6
- phoenix/server/static/assets/{vendor-arizeai-DRZuoyuF.js → vendor-arizeai-C_3SBz56.js} +2 -2
- phoenix/server/static/assets/{vendor-codemirror-DVE2_WBr.js → vendor-codemirror-wfdk9cjp.js} +1 -1
- phoenix/server/static/assets/{vendor-recharts-DwrexFA4.js → vendor-recharts-BiVnSv90.js} +1 -1
- phoenix/server/templates/index.html +1 -0
- phoenix/server/thread_server.py +1 -1
- phoenix/server/types.py +17 -29
- phoenix/services.py +8 -3
- phoenix/session/client.py +12 -24
- phoenix/session/data_extractor.py +3 -3
- phoenix/session/evaluation.py +1 -2
- phoenix/session/session.py +26 -21
- phoenix/trace/attributes.py +16 -28
- phoenix/trace/dsl/filter.py +17 -21
- phoenix/trace/dsl/helpers.py +3 -3
- phoenix/trace/dsl/query.py +13 -22
- phoenix/trace/fixtures.py +11 -17
- phoenix/trace/otel.py +5 -15
- phoenix/trace/projects.py +3 -2
- phoenix/trace/schemas.py +2 -2
- phoenix/trace/span_evaluations.py +9 -8
- phoenix/trace/span_json_decoder.py +3 -3
- phoenix/trace/span_json_encoder.py +2 -2
- phoenix/trace/trace_dataset.py +6 -5
- phoenix/trace/utils.py +6 -6
- phoenix/utilities/deprecation.py +3 -2
- phoenix/utilities/error_handling.py +3 -2
- phoenix/utilities/json.py +2 -1
- phoenix/utilities/logging.py +2 -2
- phoenix/utilities/project.py +1 -1
- phoenix/utilities/re.py +3 -4
- phoenix/utilities/template_formatters.py +16 -5
- phoenix/version.py +1 -1
- arize_phoenix-5.5.2.dist-info/RECORD +0 -321
- phoenix/server/static/assets/components-hX0LgYz3.js +0 -1428
- {arize_phoenix-5.5.2.dist-info → arize_phoenix-5.7.0.dist-info}/WHEEL +0 -0
- {arize_phoenix-5.5.2.dist-info → arize_phoenix-5.7.0.dist-info}/entry_points.txt +0 -0
- {arize_phoenix-5.5.2.dist-info → arize_phoenix-5.7.0.dist-info}/licenses/IP_NOTICE +0 -0
- {arize_phoenix-5.5.2.dist-info → arize_phoenix-5.7.0.dist-info}/licenses/LICENSE +0 -0
phoenix/experiments/evaluators/llm_evaluators.py
CHANGED

@@ -1,6 +1,7 @@
 import re
+from collections.abc import Callable
 from types import MappingProxyType
-from typing import Any,
+from typing import Any, Optional

 from phoenix.evals.models.base import BaseModel as LLMBaseModel
 from phoenix.evals.utils import snap_to_rail
@@ -121,7 +122,7 @@ class LLMCriteriaEvaluator(LLMEvaluator):

 def criteria_evaluator_factory(
     class_name: str, criteria: str, description: str, default_name: str
-) ->
+) -> type[ExperimentEvaluator]:
     def _init(self, model: LLMBaseModel, name: str = default_name) -> None:  # type: ignore
         LLMCriteriaEvaluator.__init__(self, model, criteria, description, name=name)

@@ -240,8 +241,8 @@ class RelevanceEvaluator(LLMEvaluator):
         "LABEL: *true or false*\n\n"
         "Follow this template for the following example:\n\n"
         "CRITERIA: the response is 'relevant' to the query\n"
-        "QUERY: {
-        "RESPONSE: {
+        "QUERY: {query}\n"
+        "RESPONSE: {response}\n"
         "EXPLANATION: "
     )

@@ -318,3 +319,155 @@ class RelevanceEvaluator(LLMEvaluator):
         formatted_template = self._format_eval_template(output, input, metadata)
         unparsed_response = await self.model._async_generate(formatted_template)
         return self._parse_eval_output(unparsed_response)
+
+
+class LLMRelationalEvaluator(LLMEvaluator):
+    """
+    An LLM experiment evaluator that checks how a response is related to reference text.
+
+    `LLMRelationalEvaluator` uses the chain-of-thought technique to perform a binary evaluation of
+    how a response is related to reference text in a specified manner. When used as an experiment
+    evaluator, `LLMRelationalEvaluator` will return a score of 1.0 if the response is related to
+    the reference text in the specified manner and a score of 0.0 if not. The explanation
+    produced by the chain-of-thought technique will be included in the experiment evaluation as
+    well.
+
+    In order to evaluate how a response is related to reference text, a specific relation and
+    description of that relation must be specified. The relation should be a phrase that can be
+    used in the following manner: "The response '{relation}' the reference". The description
+    should complete the sentence "In this context, '{relation}' means the response {description}".
+
+    Example relations and descriptions:
+    - "is a good summary of" - "the response clearly concisely summarizes the reference"
+    - "directly quotes" - "the response contains specific information from the reference"
+    - "professionally addresses" - "the response is respectful and relevant to the reference"
+
+    Args:
+        model: The LLM model wrapper to use for evaluation. Compatible models can be imported from
+            the `phoenix.evals` module.
+        relation: The relation to evaluate the text against, the relation should be a phrase that
+            can be used in the following manner: "The response '{relation}' the reference".
+        description (str): A description of the relation, used to clarify instructions to the LLM.
+            The description should complete the sentence "In this context, '{relation}'
+            means {description}". It is helpful to specifically use the words "response" and
+            "reference" to describe the relation.
+        name (str): The name of the evaluator
+        get_reference (callable, optional): A function that extracts the reference from the input of
+            the experiment task. The function should take the input and metadata of the dataset
+            example and return a string. By default, the function will return the string
+            representation of the input.
+        get_response (callable, optional): A function that extracts the response from the output of
+            the experiment task. The function should take the output and metadata of the experiment
+            task and return a string. By default, the function will return the string representation
+            of the output.
+    """
+
+    _base_template = (
+        "Determine if the following response '{relation}' the reference. {description}"
+        "First, explain step-by-step why you think the response '{relation}' the reference. "
+        "Then provide a single word label; 'true' if the response '{relation}' the reference or "
+        "'false' if the text is not '{relation}' to the reference. "
+        "Here is an example template for your reponse:\n\n"
+        "CRITERIA: the response '{relation}' the reference\n"
+        "REFERENCE: *text that contains a reference*\n"
+        "RESPONSE: *a response that may or may not be '{relation}' to the reference*\n"
+        "EXPLANATION: *a step by step explanation of your reasoning for whether or not the "
+        "response '{relation}' the reference*\n"
+        "LABEL: *true or false*\n\n"
+        "Follow this template for the following example:\n\n"
+        "CRITERIA: the response '{relation}' the reference\n"
+        "REFERENCE: {reference}\n"
+        "RESPONSE: {response}\n"
+        "EXPLANATION: "
+    )
+    _description = "In this context, '{relation}' means '{description}'. "
+
+    def __init__(
+        self,
+        model: LLMBaseModel,
+        relation: str,
+        description: str,
+        name: str,
+        get_reference: Optional[Callable[[ExampleInput, ExampleMetadata], str]] = None,
+        get_response: Optional[Callable[[Optional[TaskOutput], ExampleMetadata], str]] = None,
+    ):
+        self.model = model
+        self._name = name
+        self.relation = relation
+        self.description = description
+        self.template = self._format_base_template(self.relation, self.description)
+        self.get_reference = get_reference or self._default_get_reference
+        self.get_response = get_response or self._default_get_response
+
+    @classmethod
+    def _format_base_template(cls, relation: str, description: Optional[str] = None) -> str:
+        formatted_description = cls._description.format(relation=relation, description=description)
+        formatted_template = cls._base_template.format(
+            relation=relation,
+            description=formatted_description,
+            response="{response}",  # leave the response field as a placeholder
+            reference="{reference}",  # leave the reference field as a placeholder
+        )
+        return formatted_template
+
+    def _format_eval_template(
+        self,
+        output: Optional[TaskOutput] = None,
+        input: ExampleInput = MappingProxyType({}),
+        metadata: ExampleMetadata = MappingProxyType({}),
+    ) -> str:
+        assert output is not None
+        reference = self.get_reference(input, metadata)
+        response = self.get_response(output, metadata)
+        return self.template.format(reference=reference, response=response)
+
+    def _parse_eval_output(self, unparsed_response: str) -> EvaluationResult:
+        raw_label, explanation = (
+            _parse_label_from_explanation(unparsed_response),
+            unparsed_response,
+        )
+        label = snap_to_rail(raw_label, ["true", "false"])
+        if label == "true":
+            score = 1.0
+        elif label == "false":
+            score = 0.0
+        else:
+            raise RuntimeError(f"Could not parse LLM evaluation: {unparsed_response}")
+        return EvaluationResult(
+            score=score,
+            explanation=explanation,
+            metadata={},
+        )
+
+    def _default_get_reference(self, input: ExampleInput, *args: Any, **kwargs: Any) -> str:
+        return str(input)
+
+    def _default_get_response(
+        self, output: Optional[TaskOutput] = None, *args: Any, **kwargs: Any
+    ) -> str:
+        assert output is not None
+        return str(unwrap_json(output))
+
+    def evaluate(
+        self,
+        *,
+        output: Optional[TaskOutput] = None,
+        metadata: ExampleMetadata = MappingProxyType({}),
+        input: ExampleInput = MappingProxyType({}),
+        **_: Any,
+    ) -> EvaluationResult:
+        formatted_template = self._format_eval_template(output, input, metadata)
+        unparsed_response = self.model._generate(formatted_template)
+        return self._parse_eval_output(unparsed_response)
+
+    async def async_evaluate(
+        self,
+        *,
+        output: Optional[TaskOutput] = None,
+        metadata: ExampleMetadata = MappingProxyType({}),
+        input: ExampleInput = MappingProxyType({}),
+        **_: Any,
+    ) -> EvaluationResult:
+        formatted_template = self._format_eval_template(output, input, metadata)
+        unparsed_response = await self.model._async_generate(formatted_template)
+        return self._parse_eval_output(unparsed_response)
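The docstring above spells out how `relation` and `description` plug into the eval template. A minimal usage sketch under stated assumptions — the `OpenAIModel` wrapper from `phoenix.evals`, the model name, and the `dataset`/`task` variables are illustrative choices, not taken from this diff:

    # Hedged sketch of the new LLMRelationalEvaluator; names marked "assumed"
    # are illustrative, not part of the diff above.
    from phoenix.evals import OpenAIModel  # assumed LLM wrapper from phoenix.evals
    from phoenix.experiments import run_experiment
    from phoenix.experiments.evaluators.llm_evaluators import LLMRelationalEvaluator

    # Reads as: "The response 'is a good summary of' the reference."
    summary_eval = LLMRelationalEvaluator(
        model=OpenAIModel(model="gpt-4o-mini"),  # assumed model name
        relation="is a good summary of",
        description="the response clearly and concisely summarizes the reference",
        name="summary-quality",
    )

    # Each run is scored 1.0 ("true") or 0.0 ("false"), with the
    # chain-of-thought explanation attached to the evaluation.
    experiment = run_experiment(
        dataset,  # assumed: a phoenix experiments Dataset
        task,     # assumed: the task callable whose output is graded
        evaluators=[summary_eval],
    )

Like the existing `RelevanceEvaluator`, the new class snaps the chain-of-thought label to "true"/"false" via `snap_to_rail` and raises if neither label can be parsed.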
phoenix/experiments/evaluators/utils.py
CHANGED

@@ -1,6 +1,7 @@
 import functools
 import inspect
-from
+from collections.abc import Callable
+from typing import TYPE_CHECKING, Any, Optional, Union

 from phoenix.experiments.types import (
     AnnotatorKind,
@@ -134,7 +135,7 @@ def create_evaluator(
         from textdistance import levenshtein

         @create_evaluator(kind="CODE", name="levenshtein-distance")
-        def ld(output: str, expected: str) ->
+        def ld(output: str, expected: str) -> tuple[float, str]:
             return (
                 levenshtein(output, expected),
                 f"Levenshtein distance between {output} and {expected}"
phoenix/experiments/functions.py
CHANGED

@@ -4,24 +4,13 @@ import inspect
 import json
 import traceback
 from binascii import hexlify
+from collections.abc import Awaitable, Mapping, Sequence
 from contextlib import ExitStack
 from copy import deepcopy
 from dataclasses import replace
 from datetime import datetime, timezone
 from itertools import product
-from typing import
-    Any,
-    Awaitable,
-    Dict,
-    Literal,
-    Mapping,
-    Optional,
-    Sequence,
-    Tuple,
-    Type,
-    Union,
-    cast,
-)
+from typing import Any, Literal, Optional, Union, cast
 from urllib.parse import urljoin

 import httpx
@@ -76,7 +65,7 @@ from phoenix.utilities.client import VersionedAsyncClient, VersionedClient
 from phoenix.utilities.json import jsonify


-def _phoenix_clients() ->
+def _phoenix_clients() -> tuple[httpx.Client, httpx.AsyncClient]:
     return VersionedClient(
         base_url=get_base_url(),
     ), VersionedAsyncClient(
@@ -91,7 +80,7 @@ Evaluators: TypeAlias = Union[
 ]


-RateLimitErrors: TypeAlias = Union[
+RateLimitErrors: TypeAlias = Union[type[BaseException], Sequence[type[BaseException]]]


 def run_experiment(
@@ -369,7 +358,7 @@ def run_experiment(
         exp_run = replace(exp_run, id=resp.json()["data"]["id"])
         return exp_run

-    _errors:
+    _errors: tuple[type[BaseException], ...]
     if not isinstance(rate_limit_errors, Sequence):
         _errors = (rate_limit_errors,) if rate_limit_errors is not None else ()
     else:
@@ -498,7 +487,7 @@ def evaluate_experiment(
         root_span_kind = EVALUATOR

     def sync_evaluate_run(
-        obj:
+        obj: tuple[Example, ExperimentRun, Evaluator],
     ) -> ExperimentEvaluationRun:
         example, experiment_run, evaluator = obj
         result: Optional[EvaluationResult] = None
@@ -550,7 +539,7 @@ def evaluate_experiment(
         return eval_run

     async def async_evaluate_run(
-        obj:
+        obj: tuple[Example, ExperimentRun, Evaluator],
     ) -> ExperimentEvaluationRun:
         example, experiment_run, evaluator = obj
         result: Optional[EvaluationResult] = None
@@ -611,7 +600,7 @@ def evaluate_experiment(
         eval_run = replace(eval_run, id=resp.json()["data"]["id"])
         return eval_run

-    _errors:
+    _errors: tuple[type[BaseException], ...]
     if not isinstance(rate_limit_errors, Sequence):
         _errors = (rate_limit_errors,) if rate_limit_errors is not None else ()
     else:
@@ -649,7 +638,7 @@ def evaluate_experiment(


 def _evaluators_by_name(obj: Optional[Evaluators]) -> Mapping[EvaluatorName, Evaluator]:
-    evaluators_by_name:
+    evaluators_by_name: dict[EvaluatorName, Evaluator] = {}
     if obj is None:
         return evaluators_by_name
     if isinstance(mapping := obj, Mapping):
@@ -678,7 +667,7 @@ def _evaluators_by_name(obj: Optional[Evaluators]) -> Mapping[EvaluatorName, Eva
     return evaluators_by_name


-def _get_tracer(project_name: Optional[str] = None) ->
+def _get_tracer(project_name: Optional[str] = None) -> tuple[Tracer, Resource]:
     resource = Resource({ResourceAttributes.PROJECT_NAME: project_name} if project_name else {})
     tracer_provider = trace_sdk.TracerProvider(resource=resource)
     span_processor = (
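The tightened `RateLimitErrors` alias and the `_errors` normalization above mean the `rate_limit_errors` argument accepts either a single exception type or a sequence of them. A hedged sketch of both call forms (`openai.RateLimitError` is just an illustrative exception type, not something this diff requires):

    # Sketch only: both forms satisfy
    # RateLimitErrors = Union[type[BaseException], Sequence[type[BaseException]]].
    import openai  # assumed installed; any BaseException subclass works

    # a single exception type...
    run_experiment(dataset, task, rate_limit_errors=openai.RateLimitError)

    # ...or a sequence of exception types
    run_experiment(dataset, task, rate_limit_errors=(openai.RateLimitError, TimeoutError))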
phoenix/experiments/tracing.py
CHANGED

@@ -1,9 +1,10 @@
 from __future__ import annotations

+from collections.abc import Callable, Iterator
 from contextlib import contextmanager
 from contextvars import ContextVar
 from threading import Lock
-from typing import Any,
+from typing import Any, Optional

 from opentelemetry.sdk.resources import Resource
 from opentelemetry.sdk.trace import ReadableSpan
phoenix/experiments/types.py
CHANGED

@@ -3,6 +3,13 @@ from __future__ import annotations
 import json
 import textwrap
 from collections import Counter
+from collections.abc import (
+    Awaitable,
+    Callable,
+    Iterable,
+    Iterator,
+    Mapping,
+)
 from copy import copy, deepcopy
 from dataclasses import dataclass, field, fields
 from datetime import datetime
@@ -10,23 +17,7 @@ from enum import Enum
 from functools import cached_property
 from importlib.metadata import version
 from random import getrandbits
-from typing import
-    Any,
-    Awaitable,
-    Callable,
-    Dict,
-    FrozenSet,
-    Iterable,
-    Iterator,
-    List,
-    Mapping,
-    Optional,
-    Tuple,
-    TypeVar,
-    Union,
-    cast,
-    overload,
-)
+from typing import Any, Optional, TypeVar, Union, cast, overload

 import pandas as pd
 from typing_extensions import TypeAlias
@@ -41,7 +32,7 @@ class AnnotatorKind(Enum):
     LLM = "LLM"


-JSONSerializable: TypeAlias = Optional[Union[
+JSONSerializable: TypeAlias = Optional[Union[dict[str, Any], list[Any], str, int, float, bool]]
 ExperimentId: TypeAlias = str
 DatasetId: TypeAlias = str
 DatasetVersionId: TypeAlias = str
@@ -63,7 +54,7 @@ Explanation: TypeAlias = Optional[str]
 EvaluatorName: TypeAlias = str
 EvaluatorKind: TypeAlias = str
 EvaluatorOutput: TypeAlias = Union[
-    "EvaluationResult", bool, int, float, str,
+    "EvaluationResult", bool, int, float, str, tuple[Score, Label, Explanation]
 ]

 DRY_RUN: ExperimentId = "DRY_RUN"
@@ -135,14 +126,14 @@ class Dataset:
         return iter(self.examples.values())

     @cached_property
-    def _keys(self) ->
+    def _keys(self) -> tuple[str, ...]:
         return tuple(self.examples.keys())

     @overload
     def __getitem__(self, key: int) -> Example: ...
     @overload
-    def __getitem__(self, key: slice) ->
-    def __getitem__(self, key: Union[int, slice]) -> Union[Example,
+    def __getitem__(self, key: slice) -> list[Example]: ...
+    def __getitem__(self, key: Union[int, slice]) -> Union[Example, list[Example]]:
         if isinstance(key, int):
             return self.examples[self._keys[key]]
         return [self.examples[k] for k in self._keys[key]]
@@ -306,7 +297,7 @@ class ExperimentParameters:

 @dataclass(frozen=True)
 class EvaluationParameters:
-    eval_names:
+    eval_names: frozenset[str]
     exp_params: ExperimentParameters


@@ -485,8 +476,8 @@ class RanExperiment(Experiment):
     dataset: Dataset = field(repr=False)
     runs: Mapping[ExperimentRunId, ExperimentRun] = field(repr=False)
     task_summary: TaskSummary = field(repr=False)
-    eval_runs:
-    eval_summaries:
+    eval_runs: tuple[ExperimentEvaluationRun, ...] = field(repr=False, default=())
+    eval_summaries: tuple[EvaluationSummary, ...] = field(repr=False, default=())

     @property
     def url(self) -> str:
@@ -514,14 +505,14 @@ class RanExperiment(Experiment):
         return iter(self.runs.values())

     @cached_property
-    def _keys(self) ->
+    def _keys(self) -> tuple[str, ...]:
         return tuple(self.runs.keys())

     @overload
     def __getitem__(self, key: int) -> ExperimentRun: ...
     @overload
-    def __getitem__(self, key: slice) ->
-    def __getitem__(self, key: Union[int, slice]) -> Union[ExperimentRun,
+    def __getitem__(self, key: slice) -> list[ExperimentRun]: ...
+    def __getitem__(self, key: Union[int, slice]) -> Union[ExperimentRun, list[ExperimentRun]]:
         if isinstance(key, int):
             return self.runs[self._keys[key]]
         return [self.runs[k] for k in self._keys[key]]
@@ -596,7 +587,7 @@ class RanExperiment(Experiment):
         raise NotImplementedError


-def _asdict(dc: Any) ->
+def _asdict(dc: Any) -> dict[str, Any]:
     # non-recursive version of `dataclasses.asdict()`
     return {field.name: getattr(dc, field.name) for field in fields(dc)}

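One consequence of the `__getitem__` overloads above: integer indexing returns a single object while slicing returns a plain `list`, now typed with the builtin generics. A quick sketch (the `ds` and `ran_experiment` variables are assumed):

    # Sketch: indexing behavior implied by the overloads in the diff above.
    example = ds[0]             # -> Example
    first_ten = ds[:10]         # -> list[Example]
    run = ran_experiment[0]     # -> ExperimentRun
    runs = ran_experiment[0:5]  # -> list[ExperimentRun]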
phoenix/experiments/utils.py
CHANGED
phoenix/inferences/errors.py
CHANGED

@@ -1,5 +1,6 @@
 from abc import abstractmethod
-from
+from collections.abc import Iterable
+from typing import Any, Union


 class ValidationError(Exception):
@@ -57,8 +58,8 @@ class InvalidSchemaError(ValidationError):
 class DatasetError(Exception):
     """An error raised when the dataset is invalid or incomplete"""

-    def __init__(self, errors: Union[ValidationError,
-        self.errors:
+    def __init__(self, errors: Union[ValidationError, list[ValidationError]]):
+        self.errors: list[ValidationError] = errors if isinstance(errors, list) else [errors]

     def __str__(self) -> str:
         return "\n".join(map(str, self.errors))
@@ -142,7 +143,7 @@ class EmbeddingVectorSizeMismatch(ValidationError):
     vector lengths"""

     def __init__(
-        self, embedding_feature_name: str, vector_column_name: str, vector_lengths:
+        self, embedding_feature_name: str, vector_column_name: str, vector_lengths: list[int]
     ) -> None:
         self.embedding_feature_name = embedding_feature_name
         self.vector_column_name = vector_column_name
@@ -238,5 +239,5 @@ class MissingTimestampColumnName(ValidationError):
 class SchemaError(Exception):
     """An error raised when the Schema is invalid or incomplete"""

-    def __init__(self, errors: Union[ValidationError,
+    def __init__(self, errors: Union[ValidationError, list[ValidationError]]):
         self.errors = errors
phoenix/inferences/fixtures.py
CHANGED

@@ -1,9 +1,10 @@
 import json
 import logging
+from collections.abc import Iterator
 from dataclasses import dataclass, replace
 from enum import Enum, auto
 from pathlib import Path
-from typing import
+from typing import NamedTuple, Optional
 from urllib import request
 from urllib.parse import quote, urljoin

@@ -39,7 +40,7 @@ class Fixture:
     corpus_file_name: Optional[str] = None
     corpus_schema: Optional[Schema] = None

-    def paths(self) -> Iterator[
+    def paths(self) -> Iterator[tuple[InferencesRole, Path]]:
         return (
             (role, Path(self.prefix) / name)
             for role, name in zip(
@@ -397,7 +398,7 @@ wikipedia_fixture = Fixture(
     corpus_file_name="corpus.parquet",
 )

-FIXTURES:
+FIXTURES: tuple[Fixture, ...] = (
     sentiment_classification_language_drift_fixture,
     image_classification_fixture,
     fashion_mnist_fixture,
@@ -416,7 +417,7 @@ NAME_TO_FIXTURE = {fixture.name: fixture for fixture in FIXTURES}
 def get_inferences(
     fixture_name: str,
     no_internet: bool = False,
-) ->
+) -> tuple[Inferences, Optional[Inferences], Optional[Inferences]]:
     """
     Downloads primary and reference inferences for a fixture if they are not found
     locally.
@@ -550,7 +551,7 @@ class GCSAssets(NamedTuple):
     )


-def _download(fixture: Fixture, location: Path) -> Iterator[
+def _download(fixture: Fixture, location: Path) -> Iterator[tuple[InferencesRole, Path]]:
     for role, path in fixture.paths():
         yield role, GCSAssets().metadata(path).save_artifact(location)
