arize-phoenix 3.25.0__py3-none-any.whl → 4.0.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of arize-phoenix may be problematic.
- {arize_phoenix-3.25.0.dist-info → arize_phoenix-4.0.1.dist-info}/METADATA +26 -4
- {arize_phoenix-3.25.0.dist-info → arize_phoenix-4.0.1.dist-info}/RECORD +80 -75
- phoenix/__init__.py +9 -5
- phoenix/config.py +109 -53
- phoenix/datetime_utils.py +18 -1
- phoenix/db/README.md +25 -0
- phoenix/db/__init__.py +4 -0
- phoenix/db/alembic.ini +119 -0
- phoenix/db/bulk_inserter.py +206 -0
- phoenix/db/engines.py +152 -0
- phoenix/db/helpers.py +47 -0
- phoenix/db/insertion/evaluation.py +209 -0
- phoenix/db/insertion/helpers.py +51 -0
- phoenix/db/insertion/span.py +142 -0
- phoenix/db/migrate.py +71 -0
- phoenix/db/migrations/env.py +121 -0
- phoenix/db/migrations/script.py.mako +26 -0
- phoenix/db/migrations/versions/cf03bd6bae1d_init.py +280 -0
- phoenix/db/models.py +371 -0
- phoenix/exceptions.py +5 -1
- phoenix/server/api/context.py +40 -3
- phoenix/server/api/dataloaders/__init__.py +97 -0
- phoenix/server/api/dataloaders/cache/__init__.py +3 -0
- phoenix/server/api/dataloaders/cache/two_tier_cache.py +67 -0
- phoenix/server/api/dataloaders/document_evaluation_summaries.py +152 -0
- phoenix/server/api/dataloaders/document_evaluations.py +37 -0
- phoenix/server/api/dataloaders/document_retrieval_metrics.py +98 -0
- phoenix/server/api/dataloaders/evaluation_summaries.py +151 -0
- phoenix/server/api/dataloaders/latency_ms_quantile.py +198 -0
- phoenix/server/api/dataloaders/min_start_or_max_end_times.py +93 -0
- phoenix/server/api/dataloaders/record_counts.py +125 -0
- phoenix/server/api/dataloaders/span_descendants.py +64 -0
- phoenix/server/api/dataloaders/span_evaluations.py +37 -0
- phoenix/server/api/dataloaders/token_counts.py +138 -0
- phoenix/server/api/dataloaders/trace_evaluations.py +37 -0
- phoenix/server/api/input_types/SpanSort.py +138 -68
- phoenix/server/api/routers/v1/__init__.py +11 -0
- phoenix/server/api/routers/v1/evaluations.py +275 -0
- phoenix/server/api/routers/v1/spans.py +126 -0
- phoenix/server/api/routers/v1/traces.py +82 -0
- phoenix/server/api/schema.py +112 -48
- phoenix/server/api/types/DocumentEvaluationSummary.py +1 -1
- phoenix/server/api/types/Evaluation.py +29 -12
- phoenix/server/api/types/EvaluationSummary.py +29 -44
- phoenix/server/api/types/MimeType.py +2 -2
- phoenix/server/api/types/Model.py +9 -9
- phoenix/server/api/types/Project.py +240 -171
- phoenix/server/api/types/Span.py +87 -131
- phoenix/server/api/types/Trace.py +29 -20
- phoenix/server/api/types/pagination.py +151 -10
- phoenix/server/app.py +263 -35
- phoenix/server/grpc_server.py +93 -0
- phoenix/server/main.py +75 -60
- phoenix/server/openapi/docs.py +218 -0
- phoenix/server/prometheus.py +23 -7
- phoenix/server/static/index.js +662 -643
- phoenix/server/telemetry.py +68 -0
- phoenix/services.py +4 -0
- phoenix/session/client.py +34 -30
- phoenix/session/data_extractor.py +8 -3
- phoenix/session/session.py +176 -155
- phoenix/settings.py +13 -0
- phoenix/trace/attributes.py +349 -0
- phoenix/trace/dsl/README.md +116 -0
- phoenix/trace/dsl/filter.py +660 -192
- phoenix/trace/dsl/helpers.py +24 -5
- phoenix/trace/dsl/query.py +562 -185
- phoenix/trace/fixtures.py +69 -7
- phoenix/trace/otel.py +44 -200
- phoenix/trace/schemas.py +14 -8
- phoenix/trace/span_evaluations.py +5 -2
- phoenix/utilities/__init__.py +0 -26
- phoenix/utilities/span_store.py +0 -23
- phoenix/version.py +1 -1
- phoenix/core/project.py +0 -773
- phoenix/core/traces.py +0 -96
- phoenix/datasets/dataset.py +0 -214
- phoenix/datasets/fixtures.py +0 -24
- phoenix/datasets/schema.py +0 -31
- phoenix/experimental/evals/__init__.py +0 -73
- phoenix/experimental/evals/evaluators.py +0 -413
- phoenix/experimental/evals/functions/__init__.py +0 -4
- phoenix/experimental/evals/functions/classify.py +0 -453
- phoenix/experimental/evals/functions/executor.py +0 -353
- phoenix/experimental/evals/functions/generate.py +0 -138
- phoenix/experimental/evals/functions/processing.py +0 -76
- phoenix/experimental/evals/models/__init__.py +0 -14
- phoenix/experimental/evals/models/anthropic.py +0 -175
- phoenix/experimental/evals/models/base.py +0 -170
- phoenix/experimental/evals/models/bedrock.py +0 -221
- phoenix/experimental/evals/models/litellm.py +0 -134
- phoenix/experimental/evals/models/openai.py +0 -453
- phoenix/experimental/evals/models/rate_limiters.py +0 -246
- phoenix/experimental/evals/models/vertex.py +0 -173
- phoenix/experimental/evals/models/vertexai.py +0 -186
- phoenix/experimental/evals/retrievals.py +0 -96
- phoenix/experimental/evals/templates/__init__.py +0 -50
- phoenix/experimental/evals/templates/default_templates.py +0 -472
- phoenix/experimental/evals/templates/template.py +0 -195
- phoenix/experimental/evals/utils/__init__.py +0 -172
- phoenix/experimental/evals/utils/threads.py +0 -27
- phoenix/server/api/routers/evaluation_handler.py +0 -110
- phoenix/server/api/routers/span_handler.py +0 -70
- phoenix/server/api/routers/trace_handler.py +0 -60
- phoenix/storage/span_store/__init__.py +0 -23
- phoenix/storage/span_store/text_file.py +0 -85
- phoenix/trace/dsl/missing.py +0 -60
- {arize_phoenix-3.25.0.dist-info → arize_phoenix-4.0.1.dist-info}/WHEEL +0 -0
- {arize_phoenix-3.25.0.dist-info → arize_phoenix-4.0.1.dist-info}/licenses/IP_NOTICE +0 -0
- {arize_phoenix-3.25.0.dist-info → arize_phoenix-4.0.1.dist-info}/licenses/LICENSE +0 -0
- /phoenix/{datasets → db/insertion}/__init__.py +0 -0
- /phoenix/{experimental → db/migrations}/__init__.py +0 -0
- /phoenix/{storage → server/openapi}/__init__.py +0 -0
phoenix/server/api/context.py
CHANGED
```diff
@@ -1,20 +1,57 @@
 from dataclasses import dataclass
+from datetime import datetime
 from pathlib import Path
-from typing import Optional, Union
+from typing import AsyncContextManager, Callable, Optional, Union
 
+from sqlalchemy.ext.asyncio import AsyncSession
 from starlette.requests import Request
 from starlette.responses import Response
 from starlette.websockets import WebSocket
+from typing_extensions import TypeAlias
 
 from phoenix.core.model_schema import Model
-from phoenix.
+from phoenix.server.api.dataloaders import (
+    CacheForDataLoaders,
+    DocumentEvaluationsDataLoader,
+    DocumentEvaluationSummaryDataLoader,
+    DocumentRetrievalMetricsDataLoader,
+    EvaluationSummaryDataLoader,
+    LatencyMsQuantileDataLoader,
+    MinStartOrMaxEndTimeDataLoader,
+    RecordCountDataLoader,
+    SpanDescendantsDataLoader,
+    SpanEvaluationsDataLoader,
+    TokenCountDataLoader,
+    TraceEvaluationsDataLoader,
+)
+
+
+@dataclass
+class DataLoaders:
+    document_evaluation_summaries: DocumentEvaluationSummaryDataLoader
+    document_evaluations: DocumentEvaluationsDataLoader
+    document_retrieval_metrics: DocumentRetrievalMetricsDataLoader
+    evaluation_summaries: EvaluationSummaryDataLoader
+    latency_ms_quantile: LatencyMsQuantileDataLoader
+    min_start_or_max_end_times: MinStartOrMaxEndTimeDataLoader
+    record_counts: RecordCountDataLoader
+    span_descendants: SpanDescendantsDataLoader
+    span_evaluations: SpanEvaluationsDataLoader
+    token_counts: TokenCountDataLoader
+    trace_evaluations: TraceEvaluationsDataLoader
+
+
+ProjectRowId: TypeAlias = int
 
 
 @dataclass
 class Context:
     request: Union[Request, WebSocket]
     response: Optional[Response]
+    db: Callable[[], AsyncContextManager[AsyncSession]]
+    data_loaders: DataLoaders
+    cache_for_dataloaders: Optional[CacheForDataLoaders]
     model: Model
     export_path: Path
     corpus: Optional[Model] = None
-
+    streaming_last_updated_at: Callable[[ProjectRowId], Optional[datetime]] = lambda _: None
```
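The new `db` field on `Context` is a zero-argument callable returning an async session context manager, so resolvers and dataloaders can open a short-lived `AsyncSession` with `async with`. A hypothetical wiring sketch, not Phoenix's actual startup code, showing one way to build such a callable with SQLAlchemy's async API (the engine URL is a placeholder):

```python
# Hypothetical wiring (assumed names), matching Callable[[], AsyncContextManager[AsyncSession]].
from contextlib import asynccontextmanager
from typing import AsyncIterator

from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine

engine = create_async_engine("sqlite+aiosqlite:///:memory:")  # placeholder URL
Session = async_sessionmaker(engine, expire_on_commit=False)

@asynccontextmanager
async def db() -> AsyncIterator[AsyncSession]:
    # Each call hands out a fresh session that closes when the block exits.
    async with Session() as session:
        yield session

# A resolver would then use it as:
#     async with context.db() as session:
#         rows = await session.execute(stmt)
```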
phoenix/server/api/dataloaders/__init__.py
ADDED
```diff
@@ -0,0 +1,97 @@
+from dataclasses import dataclass, field
+from functools import singledispatchmethod
+
+from phoenix.db.insertion.evaluation import (
+    DocumentEvaluationInsertionEvent,
+    SpanEvaluationInsertionEvent,
+    TraceEvaluationInsertionEvent,
+)
+from phoenix.db.insertion.span import ClearProjectSpansEvent, SpanInsertionEvent
+
+from .document_evaluation_summaries import (
+    DocumentEvaluationSummaryCache,
+    DocumentEvaluationSummaryDataLoader,
+)
+from .document_evaluations import DocumentEvaluationsDataLoader
+from .document_retrieval_metrics import DocumentRetrievalMetricsDataLoader
+from .evaluation_summaries import EvaluationSummaryCache, EvaluationSummaryDataLoader
+from .latency_ms_quantile import LatencyMsQuantileCache, LatencyMsQuantileDataLoader
+from .min_start_or_max_end_times import MinStartOrMaxEndTimeCache, MinStartOrMaxEndTimeDataLoader
+from .record_counts import RecordCountCache, RecordCountDataLoader
+from .span_descendants import SpanDescendantsDataLoader
+from .span_evaluations import SpanEvaluationsDataLoader
+from .token_counts import TokenCountCache, TokenCountDataLoader
+from .trace_evaluations import TraceEvaluationsDataLoader
+
+__all__ = [
+    "CacheForDataLoaders",
+    "DocumentEvaluationSummaryDataLoader",
+    "DocumentEvaluationsDataLoader",
+    "DocumentRetrievalMetricsDataLoader",
+    "EvaluationSummaryDataLoader",
+    "LatencyMsQuantileDataLoader",
+    "MinStartOrMaxEndTimeDataLoader",
+    "RecordCountDataLoader",
+    "SpanDescendantsDataLoader",
+    "SpanEvaluationsDataLoader",
+    "TokenCountDataLoader",
+    "TraceEvaluationsDataLoader",
+]
+
+
+@dataclass(frozen=True)
+class CacheForDataLoaders:
+    document_evaluation_summary: DocumentEvaluationSummaryCache = field(
+        default_factory=DocumentEvaluationSummaryCache,
+    )
+    evaluation_summary: EvaluationSummaryCache = field(
+        default_factory=EvaluationSummaryCache,
+    )
+    latency_ms_quantile: LatencyMsQuantileCache = field(
+        default_factory=LatencyMsQuantileCache,
+    )
+    min_start_or_max_end_time: MinStartOrMaxEndTimeCache = field(
+        default_factory=MinStartOrMaxEndTimeCache,
+    )
+    record_count: RecordCountCache = field(
+        default_factory=RecordCountCache,
+    )
+    token_count: TokenCountCache = field(
+        default_factory=TokenCountCache,
+    )
+
+    def _update_spans(self, project_rowid: int) -> None:
+        self.latency_ms_quantile.invalidate(project_rowid)
+        self.token_count.invalidate(project_rowid)
+        self.record_count.invalidate(project_rowid)
+        self.min_start_or_max_end_time.invalidate(project_rowid)
+
+    def _clear_spans(self, project_rowid: int) -> None:
+        self._update_spans(project_rowid)
+        self.evaluation_summary.invalidate_project(project_rowid)
+        self.document_evaluation_summary.invalidate_project(project_rowid)
+
+    @singledispatchmethod
+    def invalidate(self, event: SpanInsertionEvent) -> None:
+        project_rowid, *_ = event
+        self._update_spans(project_rowid)
+
+    @invalidate.register
+    def _(self, event: ClearProjectSpansEvent) -> None:
+        project_rowid, *_ = event
+        self._clear_spans(project_rowid)
+
+    @invalidate.register
+    def _(self, event: DocumentEvaluationInsertionEvent) -> None:
+        project_rowid, evaluation_name = event
+        self.document_evaluation_summary.invalidate((project_rowid, evaluation_name))
+
+    @invalidate.register
+    def _(self, event: SpanEvaluationInsertionEvent) -> None:
+        project_rowid, evaluation_name = event
+        self.evaluation_summary.invalidate((project_rowid, evaluation_name, "span"))
+
+    @invalidate.register
+    def _(self, event: TraceEvaluationInsertionEvent) -> None:
+        project_rowid, evaluation_name = event
+        self.evaluation_summary.invalidate((project_rowid, evaluation_name, "trace"))
```
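`CacheForDataLoaders.invalidate` fans out on the concrete insertion-event type via `functools.singledispatchmethod`: the undecorated method is the fallback (plain span insertions), and each `@invalidate.register` overload handles one event class. A self-contained illustration of that dispatch pattern, using made-up stand-in event types rather than the real `phoenix.db.insertion` classes:

```python
# Stand-in event types (hypothetical), only to show how singledispatchmethod routes calls.
from functools import singledispatchmethod
from typing import NamedTuple


class SpanInserted(NamedTuple):
    project_rowid: int


class TraceEvalInserted(NamedTuple):
    project_rowid: int
    evaluation_name: str


class Invalidator:
    @singledispatchmethod
    def invalidate(self, event: SpanInserted) -> None:
        # Fallback: any unregistered event type lands here.
        print(f"drop span-derived caches for project {event.project_rowid}")

    @invalidate.register
    def _(self, event: TraceEvalInserted) -> None:
        print(f"drop trace-eval summary ({event.project_rowid}, {event.evaluation_name!r})")


inv = Invalidator()
inv.invalidate(SpanInserted(1))                         # -> fallback overload
inv.invalidate(TraceEvalInserted(1, "hallucination"))   # -> registered overload
```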
phoenix/server/api/dataloaders/cache/two_tier_cache.py
ADDED
```diff
@@ -0,0 +1,67 @@
+"""
+The primary intent of a two-tier system is to make cache invalidation more efficient,
+because the cache keys are typically tuples such as (project_id, time_interval, ...),
+but we need to invalidate subsets of keys, e.g. all those associated with a
+specific project, very frequently (i.e. essentially at each span insertion). In a
+single-tier system we would need to check all the keys to see if they are in the
+subset that we want to invalidate.
+"""
+
+from abc import ABC, abstractmethod
+from asyncio import Future
+from typing import Any, Callable, Generic, Optional, Tuple, TypeVar
+
+from cachetools import Cache
+from strawberry.dataloader import AbstractCache
+
+_Key = TypeVar("_Key")
+_Result = TypeVar("_Result")
+
+_Section = TypeVar("_Section")
+_SubKey = TypeVar("_SubKey")
+
+
+class TwoTierCache(
+    AbstractCache[_Key, _Result],
+    Generic[_Key, _Result, _Section, _SubKey],
+    ABC,
+):
+    def __init__(
+        self,
+        main_cache: "Cache[_Section, Cache[_SubKey, Future[_Result]]]",
+        sub_cache_factory: Callable[[], "Cache[_SubKey, Future[_Result]]"],
+        *args: Any,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__(*args, **kwargs)
+        self._cache = main_cache
+        self._sub_cache_factory = sub_cache_factory
+
+    @abstractmethod
+    def _cache_key(self, key: _Key) -> Tuple[_Section, _SubKey]: ...
+
+    def invalidate(self, section: _Section) -> None:
+        if sub_cache := self._cache.get(section):
+            sub_cache.clear()
+
+    def get(self, key: _Key) -> Optional["Future[_Result]"]:
+        section, sub_key = self._cache_key(key)
+        if not (sub_cache := self._cache.get(section)):
+            return None
+        return sub_cache.get(sub_key)
+
+    def set(self, key: _Key, value: "Future[_Result]") -> None:
+        section, sub_key = self._cache_key(key)
+        if (sub_cache := self._cache.get(section)) is None:
+            self._cache[section] = sub_cache = self._sub_cache_factory()
+        sub_cache[sub_key] = value
+
+    def delete(self, key: _Key) -> None:
+        section, sub_key = self._cache_key(key)
+        if sub_cache := self._cache.get(section):
+            del sub_cache[sub_key]
+            if not sub_cache:
+                del self._cache[section]
+
+    def clear(self) -> None:
+        self._cache.clear()
```
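The module docstring above is the whole design rationale: put the "section" (e.g. the project) in the outer key so that invalidating a project is one lookup rather than a scan of every cached key. A minimal, self-contained sketch of that idea with plain dicts (not the actual `TwoTierCache`, `cachetools`, or strawberry types):

```python
# Plain-dict sketch of a two-tier cache; section = project id (illustrative only).
from typing import Any, Dict, Hashable, Tuple


class TwoTierDict:
    def __init__(self) -> None:
        self._cache: Dict[Hashable, Dict[Hashable, Any]] = {}

    def _split(self, key: Tuple[int, Any, Any]) -> Tuple[Hashable, Hashable]:
        project_id, interval, condition = key
        return project_id, (interval, condition)  # (section, sub_key)

    def set(self, key: Tuple[int, Any, Any], value: Any) -> None:
        section, sub_key = self._split(key)
        self._cache.setdefault(section, {})[sub_key] = value

    def get(self, key: Tuple[int, Any, Any]) -> Any:
        section, sub_key = self._split(key)
        return self._cache.get(section, {}).get(sub_key)

    def invalidate(self, section: Hashable) -> None:
        # One pop drops every entry for the section; no key scan needed.
        self._cache.pop(section, None)


cache = TwoTierDict()
cache.set((1, "last-hour", "llm spans"), "summary-A")
cache.set((2, "last-hour", "llm spans"), "summary-B")
cache.invalidate(1)  # e.g. a span was just inserted into project 1
assert cache.get((1, "last-hour", "llm spans")) is None
assert cache.get((2, "last-hour", "llm spans")) == "summary-B"
```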
phoenix/server/api/dataloaders/document_evaluation_summaries.py
ADDED
```diff
@@ -0,0 +1,152 @@
+from collections import defaultdict
+from datetime import datetime
+from typing import (
+    Any,
+    AsyncContextManager,
+    Callable,
+    DefaultDict,
+    List,
+    Optional,
+    Tuple,
+)
+
+import numpy as np
+from aioitertools.itertools import groupby
+from cachetools import LFUCache, TTLCache
+from sqlalchemy import Select, select
+from sqlalchemy.ext.asyncio import AsyncSession
+from strawberry.dataloader import AbstractCache, DataLoader
+from typing_extensions import TypeAlias
+
+from phoenix.db import models
+from phoenix.db.helpers import SupportedSQLDialect, num_docs_col
+from phoenix.metrics.retrieval_metrics import RetrievalMetrics
+from phoenix.server.api.dataloaders.cache import TwoTierCache
+from phoenix.server.api.input_types.TimeRange import TimeRange
+from phoenix.server.api.types.DocumentEvaluationSummary import DocumentEvaluationSummary
+from phoenix.trace.dsl import SpanFilter
+
+ProjectRowId: TypeAlias = int
+TimeInterval: TypeAlias = Tuple[Optional[datetime], Optional[datetime]]
+FilterCondition: TypeAlias = Optional[str]
+EvalName: TypeAlias = str
+
+Segment: TypeAlias = Tuple[ProjectRowId, TimeInterval, FilterCondition]
+Param: TypeAlias = EvalName
+
+Key: TypeAlias = Tuple[ProjectRowId, Optional[TimeRange], FilterCondition, EvalName]
+Result: TypeAlias = Optional[DocumentEvaluationSummary]
+ResultPosition: TypeAlias = int
+DEFAULT_VALUE: Result = None
+
+
+def _cache_key_fn(key: Key) -> Tuple[Segment, Param]:
+    project_rowid, time_range, filter_condition, eval_name = key
+    interval = (
+        (time_range.start, time_range.end) if isinstance(time_range, TimeRange) else (None, None)
+    )
+    return (project_rowid, interval, filter_condition), eval_name
+
+
+_Section: TypeAlias = Tuple[ProjectRowId, EvalName]
+_SubKey: TypeAlias = Tuple[TimeInterval, FilterCondition]
+
+
+class DocumentEvaluationSummaryCache(
+    TwoTierCache[Key, Result, _Section, _SubKey],
+):
+    def __init__(self) -> None:
+        super().__init__(
+            # TTL=3600 (1-hour) because time intervals are always moving forward, but
+            # interval endpoints are rounded down to the hour by the UI, so anything
+            # older than an hour most likely won't be a cache-hit anyway.
+            main_cache=TTLCache(maxsize=64 * 32, ttl=3600),
+            sub_cache_factory=lambda: LFUCache(maxsize=2 * 2),
+        )
+
+    def invalidate_project(self, project_rowid: ProjectRowId) -> None:
+        for section in self._cache.keys():
+            if section[0] == project_rowid:
+                del self._cache[section]
+
+    def _cache_key(self, key: Key) -> Tuple[_Section, _SubKey]:
+        (project_rowid, interval, filter_condition), eval_name = _cache_key_fn(key)
+        return (project_rowid, eval_name), (interval, filter_condition)
+
+
+class DocumentEvaluationSummaryDataLoader(DataLoader[Key, Result]):
+    def __init__(
+        self,
+        db: Callable[[], AsyncContextManager[AsyncSession]],
+        cache_map: Optional[AbstractCache[Key, Result]] = None,
+    ) -> None:
+        super().__init__(
+            load_fn=self._load_fn,
+            cache_key_fn=_cache_key_fn,
+            cache_map=cache_map,
+        )
+        self._db = db
+
+    async def _load_fn(self, keys: List[Key]) -> List[Result]:
+        results: List[Result] = [DEFAULT_VALUE] * len(keys)
+        arguments: DefaultDict[
+            Segment,
+            DefaultDict[Param, List[ResultPosition]],
+        ] = defaultdict(lambda: defaultdict(list))
+        for position, key in enumerate(keys):
+            segment, param = _cache_key_fn(key)
+            arguments[segment][param].append(position)
+        for segment, params in arguments.items():
+            async with self._db() as session:
+                dialect = SupportedSQLDialect(session.bind.dialect.name)
+                stmt = _get_stmt(dialect, segment, *params.keys())
+                data = await session.stream(stmt)
+                async for eval_name, group in groupby(data, lambda d: d.name):
+                    metrics_collection = []
+                    async for (_, num_docs), subgroup in groupby(
+                        group, lambda g: (g.id, g.num_docs)
+                    ):
+                        scores = [np.nan] * num_docs
+                        for row in subgroup:
+                            scores[row.document_position] = row.score
+                        metrics_collection.append(RetrievalMetrics(scores))
+                    summary = DocumentEvaluationSummary(
+                        evaluation_name=eval_name,
+                        metrics_collection=metrics_collection,
+                    )
+                    for position in params[eval_name]:
+                        results[position] = summary
+        return results
+
+
+def _get_stmt(
+    dialect: SupportedSQLDialect,
+    segment: Segment,
+    *eval_names: Param,
+) -> Select[Any]:
+    project_rowid, (start_time, end_time), filter_condition = segment
+    mda = models.DocumentAnnotation
+    stmt = (
+        select(
+            mda.name,
+            models.Span.id,
+            num_docs_col(dialect),
+            mda.score,
+            mda.document_position,
+        )
+        .join(models.Trace)
+        .where(models.Trace.project_rowid == project_rowid)
+        .join(mda)
+        .where(mda.name.in_(eval_names))
+        .where(mda.annotator_kind == "LLM")
+        .where(mda.score.is_not(None))
+        .order_by(mda.name, models.Span.id)
+    )
+    if start_time:
+        stmt = stmt.where(start_time <= models.Span.start_time)
+    if end_time:
+        stmt = stmt.where(models.Span.start_time < end_time)
+    if filter_condition:
+        span_filter = SpanFilter(condition=filter_condition)
+        stmt = span_filter(stmt)
+    return stmt
```
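The loader batches by segment: keys that share `(project, time interval, filter)` are answered by one streamed statement covering all of their eval names, and the per-key positions are remembered so results land back in request order. A small sketch of that grouping step with made-up keys, mirroring `_cache_key_fn` without importing Phoenix:

```python
# Made-up keys: (project_rowid, time_range, filter_condition, eval_name); time_range is None here.
from collections import defaultdict

keys = [
    (1, None, None, "relevance"),
    (1, None, None, "hallucination"),
    (2, None, None, "relevance"),
]


def cache_key(key):
    project_rowid, time_range, filter_condition, eval_name = key
    interval = (None, None) if time_range is None else (time_range.start, time_range.end)
    return (project_rowid, interval, filter_condition), eval_name


arguments = defaultdict(lambda: defaultdict(list))
for position, key in enumerate(keys):
    segment, param = cache_key(key)
    arguments[segment][param].append(position)

# Two segments -> two SQL statements; the first covers both eval names for project 1.
assert len(arguments) == 2
assert sorted(arguments[(1, (None, None), None)]) == ["hallucination", "relevance"]
```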
phoenix/server/api/dataloaders/document_evaluations.py
ADDED
```diff
@@ -0,0 +1,37 @@
+from collections import defaultdict
+from typing import (
+    AsyncContextManager,
+    Callable,
+    DefaultDict,
+    List,
+)
+
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+from strawberry.dataloader import DataLoader
+from typing_extensions import TypeAlias
+
+from phoenix.db import models
+from phoenix.server.api.types.Evaluation import DocumentEvaluation
+
+Key: TypeAlias = int
+Result: TypeAlias = List[DocumentEvaluation]
+
+
+class DocumentEvaluationsDataLoader(DataLoader[Key, Result]):
+    def __init__(self, db: Callable[[], AsyncContextManager[AsyncSession]]) -> None:
+        super().__init__(load_fn=self._load_fn)
+        self._db = db
+
+    async def _load_fn(self, keys: List[Key]) -> List[Result]:
+        document_evaluations_by_id: DefaultDict[Key, Result] = defaultdict(list)
+        mda = models.DocumentAnnotation
+        async with self._db() as session:
+            data = await session.stream_scalars(
+                select(mda).where(mda.span_rowid.in_(keys)).where(mda.annotator_kind == "LLM")
+            )
+            async for document_evaluation in data:
+                document_evaluations_by_id[document_evaluation.span_rowid].append(
+                    DocumentEvaluation.from_sql_document_annotation(document_evaluation)
+                )
+        return [document_evaluations_by_id[key] for key in keys]
```
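The contract of `_load_fn` is one result per key, in request order, which is why it ends with a per-key lookup into the defaultdict. A self-contained demo of how strawberry's `DataLoader` coalesces concurrent `load()` calls into a single batched `load_fn` call, using made-up data rather than the Phoenix models:

```python
# Demo of DataLoader batching (fake data, not phoenix.db.models).
import asyncio
from typing import Dict, List

from strawberry.dataloader import DataLoader

ROWS: Dict[int, List[str]] = {1: ["relevance"], 2: ["relevance", "toxicity"]}


async def load_fn(keys: List[int]) -> List[List[str]]:
    print("one batched call for", keys)         # -> one batched call for [1, 2, 3]
    return [ROWS.get(key, []) for key in keys]  # one result per key, in order


async def main() -> None:
    loader = DataLoader(load_fn=load_fn)
    results = await asyncio.gather(loader.load(1), loader.load(2), loader.load(3))
    print(results)  # [['relevance'], ['relevance', 'toxicity'], []]


asyncio.run(main())
```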
phoenix/server/api/dataloaders/document_retrieval_metrics.py
ADDED
```diff
@@ -0,0 +1,98 @@
+from collections import defaultdict
+from typing import (
+    AsyncContextManager,
+    Callable,
+    DefaultDict,
+    Dict,
+    List,
+    Optional,
+    Set,
+    Tuple,
+)
+
+import numpy as np
+from aioitertools.itertools import groupby
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+from strawberry.dataloader import DataLoader
+from typing_extensions import TypeAlias
+
+from phoenix.db import models
+from phoenix.metrics.retrieval_metrics import RetrievalMetrics
+from phoenix.server.api.types.DocumentRetrievalMetrics import DocumentRetrievalMetrics
+
+RowId: TypeAlias = int
+NumDocs: TypeAlias = int
+EvalName: TypeAlias = Optional[str]
+
+Key: TypeAlias = Tuple[RowId, EvalName, NumDocs]
+Result: TypeAlias = List[DocumentRetrievalMetrics]
+
+
+class DocumentRetrievalMetricsDataLoader(DataLoader[Key, Result]):
+    def __init__(self, db: Callable[[], AsyncContextManager[AsyncSession]]) -> None:
+        super().__init__(load_fn=self._load_fn)
+        self._db = db
+
+    async def _load_fn(self, keys: List[Key]) -> List[Result]:
+        mda = models.DocumentAnnotation
+        stmt = (
+            select(
+                mda.span_rowid,
+                mda.name,
+                mda.score,
+                mda.document_position,
+            )
+            .where(mda.score != None)  # noqa: E711
+            .where(mda.annotator_kind == "LLM")
+            .where(mda.document_position >= 0)
+            .order_by(mda.span_rowid, mda.name)
+        )
+        # Using CTE with VALUES clause is possible in SQLite, but not in
+        # SQLAlchemy v2.0.29, hence the workaround below with over-fetching.
+        # We could use CTE with VALUES for postgresql, but for now we'll keep
+        # it simple and just use one approach for all backends.
+        all_row_ids = {row_id for row_id, _, _ in keys}
+        stmt = stmt.where(mda.span_rowid.in_(all_row_ids))
+        all_eval_names = {eval_name for _, eval_name, _ in keys}
+        if None not in all_eval_names:
+            stmt = stmt.where(mda.name.in_(all_eval_names))
+        max_position = max(num_docs for _, _, num_docs in keys)
+        stmt = stmt.where(mda.document_position < max_position)
+        results: Dict[Key, Result] = {key: [] for key in keys}
+        requested_num_docs: DefaultDict[Tuple[RowId, EvalName], Set[NumDocs]] = defaultdict(set)
+        for row_id, eval_name, num_docs in results.keys():
+            requested_num_docs[(row_id, eval_name)].add(num_docs)
+        async with self._db() as session:
+            data = await session.stream(stmt)
+            async for (span_rowid, name), group in groupby(data, lambda r: (r.span_rowid, r.name)):
+                # We need to fulfill two types of potential requests: 1. when it
+                # specifies an evaluation name, and 2. when it doesn't care about
+                # the evaluation name by specifying None.
+                max_requested_num_docs = max(
+                    (
+                        num_docs
+                        for eval_name in (name, None)
+                        for num_docs in (requested_num_docs.get((span_rowid, eval_name)) or ())
+                    ),
+                    default=0,
+                )
+                if max_requested_num_docs <= 0:
+                    # We have over-fetched. Skip this group.
+                    continue
+                scores = [np.nan] * max_requested_num_docs
+                for row in group:
+                    # Length check is necessary due to over-fetching.
+                    if row.document_position < len(scores):
+                        scores[row.document_position] = row.score
+                for eval_name in (name, None):
+                    for num_docs in requested_num_docs.get((span_rowid, eval_name)) or ():
+                        metrics = RetrievalMetrics(scores[:num_docs])
+                        doc_metrics = DocumentRetrievalMetrics(
+                            evaluation_name=name, metrics=metrics
+                        )
+                        key = (span_rowid, eval_name, num_docs)
+                        results[key].append(doc_metrics)
+        # Make sure to copy the result, so we don't return the same list
+        # object to two different requesters.
+        return [results[key].copy() for key in keys]
```
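Because one over-fetched score list can serve several requests for the same span that differ only in `num_docs`, the loader fills a list sized to the largest request, leaves unannotated positions as NaN, and slices `scores[:num_docs]` per request. A tiny arithmetic sketch of that step with made-up positions (plain `math.nan` standing in for `np.nan`):

```python
# Made-up annotations: document_position -> score (position 1 has no annotation).
import math

fetched = {0: 0.9, 2: 0.4}
max_requested_num_docs = 4
scores = [math.nan] * max_requested_num_docs
for position, score in fetched.items():
    scores[position] = score

for num_docs in (2, 4):  # two callers asked about the same span with different num_docs
    print(num_docs, scores[:num_docs])
# 2 [0.9, nan]
# 4 [0.9, nan, 0.4, nan]
```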
phoenix/server/api/dataloaders/evaluation_summaries.py
ADDED
```diff
@@ -0,0 +1,151 @@
+from collections import defaultdict
+from datetime import datetime
+from typing import (
+    Any,
+    AsyncContextManager,
+    Callable,
+    DefaultDict,
+    List,
+    Literal,
+    Optional,
+    Tuple,
+)
+
+import pandas as pd
+from aioitertools.itertools import groupby
+from cachetools import LFUCache, TTLCache
+from sqlalchemy import Select, func, or_, select
+from sqlalchemy.ext.asyncio import AsyncSession
+from strawberry.dataloader import AbstractCache, DataLoader
+from typing_extensions import TypeAlias, assert_never
+
+from phoenix.db import models
+from phoenix.server.api.dataloaders.cache import TwoTierCache
+from phoenix.server.api.input_types.TimeRange import TimeRange
+from phoenix.server.api.types.EvaluationSummary import EvaluationSummary
+from phoenix.trace.dsl import SpanFilter
+
+Kind: TypeAlias = Literal["span", "trace"]
+ProjectRowId: TypeAlias = int
+TimeInterval: TypeAlias = Tuple[Optional[datetime], Optional[datetime]]
+FilterCondition: TypeAlias = Optional[str]
+EvalName: TypeAlias = str
+
+Segment: TypeAlias = Tuple[Kind, ProjectRowId, TimeInterval, FilterCondition]
+Param: TypeAlias = EvalName
+
+Key: TypeAlias = Tuple[Kind, ProjectRowId, Optional[TimeRange], FilterCondition, EvalName]
+Result: TypeAlias = Optional[EvaluationSummary]
+ResultPosition: TypeAlias = int
+DEFAULT_VALUE: Result = None
+
+
+def _cache_key_fn(key: Key) -> Tuple[Segment, Param]:
+    kind, project_rowid, time_range, filter_condition, eval_name = key
+    interval = (
+        (time_range.start, time_range.end) if isinstance(time_range, TimeRange) else (None, None)
+    )
+    return (kind, project_rowid, interval, filter_condition), eval_name
+
+
+_Section: TypeAlias = Tuple[ProjectRowId, EvalName, Kind]
+_SubKey: TypeAlias = Tuple[TimeInterval, FilterCondition]
+
+
+class EvaluationSummaryCache(
+    TwoTierCache[Key, Result, _Section, _SubKey],
+):
+    def __init__(self) -> None:
+        super().__init__(
+            # TTL=3600 (1-hour) because time intervals are always moving forward, but
+            # interval endpoints are rounded down to the hour by the UI, so anything
+            # older than an hour most likely won't be a cache-hit anyway.
+            main_cache=TTLCache(maxsize=64 * 32 * 2, ttl=3600),
+            sub_cache_factory=lambda: LFUCache(maxsize=2 * 2),
+        )
+
+    def invalidate_project(self, project_rowid: ProjectRowId) -> None:
+        for section in self._cache.keys():
+            if section[0] == project_rowid:
+                del self._cache[section]
+
+    def _cache_key(self, key: Key) -> Tuple[_Section, _SubKey]:
+        (kind, project_rowid, interval, filter_condition), eval_name = _cache_key_fn(key)
+        return (project_rowid, eval_name, kind), (interval, filter_condition)
+
+
+class EvaluationSummaryDataLoader(DataLoader[Key, Result]):
+    def __init__(
+        self,
+        db: Callable[[], AsyncContextManager[AsyncSession]],
+        cache_map: Optional[AbstractCache[Key, Result]] = None,
+    ) -> None:
+        super().__init__(
+            load_fn=self._load_fn,
+            cache_key_fn=_cache_key_fn,
+            cache_map=cache_map,
+        )
+        self._db = db
+
+    async def _load_fn(self, keys: List[Key]) -> List[Result]:
+        results: List[Result] = [DEFAULT_VALUE] * len(keys)
+        arguments: DefaultDict[
+            Segment,
+            DefaultDict[Param, List[ResultPosition]],
+        ] = defaultdict(lambda: defaultdict(list))
+        for position, key in enumerate(keys):
+            segment, param = _cache_key_fn(key)
+            arguments[segment][param].append(position)
+        for segment, params in arguments.items():
+            stmt = _get_stmt(segment, *params.keys())
+            async with self._db() as session:
+                data = await session.stream(stmt)
+                async for eval_name, group in groupby(data, lambda row: row.name):
+                    summary = EvaluationSummary(pd.DataFrame(group))
+                    for position in params[eval_name]:
+                        results[position] = summary
+        return results
+
+
+def _get_stmt(
+    segment: Segment,
+    *eval_names: Param,
+) -> Select[Any]:
+    kind, project_rowid, (start_time, end_time), filter_condition = segment
+    stmt = select()
+    if kind == "span":
+        msa = models.SpanAnnotation
+        name_column, label_column, score_column = msa.name, msa.label, msa.score
+        annotator_kind_column = msa.annotator_kind
+        time_column = models.Span.start_time
+        stmt = stmt.join(models.Span).join_from(models.Span, models.Trace)
+        if filter_condition:
+            sf = SpanFilter(filter_condition)
+            stmt = sf(stmt)
+    elif kind == "trace":
+        mta = models.TraceAnnotation
+        name_column, label_column, score_column = mta.name, mta.label, mta.score
+        annotator_kind_column = mta.annotator_kind
+        time_column = models.Trace.start_time
+        stmt = stmt.join(models.Trace)
+    else:
+        assert_never(kind)
+    stmt = stmt.add_columns(
+        name_column,
+        label_column,
+        func.count().label("record_count"),
+        func.count(label_column).label("label_count"),
+        func.count(score_column).label("score_count"),
+        func.sum(score_column).label("score_sum"),
+    )
+    stmt = stmt.group_by(name_column, label_column)
+    stmt = stmt.order_by(name_column, label_column)
+    stmt = stmt.where(models.Trace.project_rowid == project_rowid)
+    stmt = stmt.where(annotator_kind_column == "LLM")
+    stmt = stmt.where(or_(score_column.is_not(None), label_column.is_not(None)))
+    stmt = stmt.where(name_column.in_(eval_names))
+    if start_time:
+        stmt = stmt.where(start_time <= time_column)
+    if end_time:
+        stmt = stmt.where(time_column < end_time)
+    return stmt
```