arize-phoenix 3.25.0__py3-none-any.whl → 4.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of arize-phoenix might be problematic.

Files changed (113)
  1. {arize_phoenix-3.25.0.dist-info → arize_phoenix-4.0.1.dist-info}/METADATA +26 -4
  2. {arize_phoenix-3.25.0.dist-info → arize_phoenix-4.0.1.dist-info}/RECORD +80 -75
  3. phoenix/__init__.py +9 -5
  4. phoenix/config.py +109 -53
  5. phoenix/datetime_utils.py +18 -1
  6. phoenix/db/README.md +25 -0
  7. phoenix/db/__init__.py +4 -0
  8. phoenix/db/alembic.ini +119 -0
  9. phoenix/db/bulk_inserter.py +206 -0
  10. phoenix/db/engines.py +152 -0
  11. phoenix/db/helpers.py +47 -0
  12. phoenix/db/insertion/evaluation.py +209 -0
  13. phoenix/db/insertion/helpers.py +51 -0
  14. phoenix/db/insertion/span.py +142 -0
  15. phoenix/db/migrate.py +71 -0
  16. phoenix/db/migrations/env.py +121 -0
  17. phoenix/db/migrations/script.py.mako +26 -0
  18. phoenix/db/migrations/versions/cf03bd6bae1d_init.py +280 -0
  19. phoenix/db/models.py +371 -0
  20. phoenix/exceptions.py +5 -1
  21. phoenix/server/api/context.py +40 -3
  22. phoenix/server/api/dataloaders/__init__.py +97 -0
  23. phoenix/server/api/dataloaders/cache/__init__.py +3 -0
  24. phoenix/server/api/dataloaders/cache/two_tier_cache.py +67 -0
  25. phoenix/server/api/dataloaders/document_evaluation_summaries.py +152 -0
  26. phoenix/server/api/dataloaders/document_evaluations.py +37 -0
  27. phoenix/server/api/dataloaders/document_retrieval_metrics.py +98 -0
  28. phoenix/server/api/dataloaders/evaluation_summaries.py +151 -0
  29. phoenix/server/api/dataloaders/latency_ms_quantile.py +198 -0
  30. phoenix/server/api/dataloaders/min_start_or_max_end_times.py +93 -0
  31. phoenix/server/api/dataloaders/record_counts.py +125 -0
  32. phoenix/server/api/dataloaders/span_descendants.py +64 -0
  33. phoenix/server/api/dataloaders/span_evaluations.py +37 -0
  34. phoenix/server/api/dataloaders/token_counts.py +138 -0
  35. phoenix/server/api/dataloaders/trace_evaluations.py +37 -0
  36. phoenix/server/api/input_types/SpanSort.py +138 -68
  37. phoenix/server/api/routers/v1/__init__.py +11 -0
  38. phoenix/server/api/routers/v1/evaluations.py +275 -0
  39. phoenix/server/api/routers/v1/spans.py +126 -0
  40. phoenix/server/api/routers/v1/traces.py +82 -0
  41. phoenix/server/api/schema.py +112 -48
  42. phoenix/server/api/types/DocumentEvaluationSummary.py +1 -1
  43. phoenix/server/api/types/Evaluation.py +29 -12
  44. phoenix/server/api/types/EvaluationSummary.py +29 -44
  45. phoenix/server/api/types/MimeType.py +2 -2
  46. phoenix/server/api/types/Model.py +9 -9
  47. phoenix/server/api/types/Project.py +240 -171
  48. phoenix/server/api/types/Span.py +87 -131
  49. phoenix/server/api/types/Trace.py +29 -20
  50. phoenix/server/api/types/pagination.py +151 -10
  51. phoenix/server/app.py +263 -35
  52. phoenix/server/grpc_server.py +93 -0
  53. phoenix/server/main.py +75 -60
  54. phoenix/server/openapi/docs.py +218 -0
  55. phoenix/server/prometheus.py +23 -7
  56. phoenix/server/static/index.js +662 -643
  57. phoenix/server/telemetry.py +68 -0
  58. phoenix/services.py +4 -0
  59. phoenix/session/client.py +34 -30
  60. phoenix/session/data_extractor.py +8 -3
  61. phoenix/session/session.py +176 -155
  62. phoenix/settings.py +13 -0
  63. phoenix/trace/attributes.py +349 -0
  64. phoenix/trace/dsl/README.md +116 -0
  65. phoenix/trace/dsl/filter.py +660 -192
  66. phoenix/trace/dsl/helpers.py +24 -5
  67. phoenix/trace/dsl/query.py +562 -185
  68. phoenix/trace/fixtures.py +69 -7
  69. phoenix/trace/otel.py +44 -200
  70. phoenix/trace/schemas.py +14 -8
  71. phoenix/trace/span_evaluations.py +5 -2
  72. phoenix/utilities/__init__.py +0 -26
  73. phoenix/utilities/span_store.py +0 -23
  74. phoenix/version.py +1 -1
  75. phoenix/core/project.py +0 -773
  76. phoenix/core/traces.py +0 -96
  77. phoenix/datasets/dataset.py +0 -214
  78. phoenix/datasets/fixtures.py +0 -24
  79. phoenix/datasets/schema.py +0 -31
  80. phoenix/experimental/evals/__init__.py +0 -73
  81. phoenix/experimental/evals/evaluators.py +0 -413
  82. phoenix/experimental/evals/functions/__init__.py +0 -4
  83. phoenix/experimental/evals/functions/classify.py +0 -453
  84. phoenix/experimental/evals/functions/executor.py +0 -353
  85. phoenix/experimental/evals/functions/generate.py +0 -138
  86. phoenix/experimental/evals/functions/processing.py +0 -76
  87. phoenix/experimental/evals/models/__init__.py +0 -14
  88. phoenix/experimental/evals/models/anthropic.py +0 -175
  89. phoenix/experimental/evals/models/base.py +0 -170
  90. phoenix/experimental/evals/models/bedrock.py +0 -221
  91. phoenix/experimental/evals/models/litellm.py +0 -134
  92. phoenix/experimental/evals/models/openai.py +0 -453
  93. phoenix/experimental/evals/models/rate_limiters.py +0 -246
  94. phoenix/experimental/evals/models/vertex.py +0 -173
  95. phoenix/experimental/evals/models/vertexai.py +0 -186
  96. phoenix/experimental/evals/retrievals.py +0 -96
  97. phoenix/experimental/evals/templates/__init__.py +0 -50
  98. phoenix/experimental/evals/templates/default_templates.py +0 -472
  99. phoenix/experimental/evals/templates/template.py +0 -195
  100. phoenix/experimental/evals/utils/__init__.py +0 -172
  101. phoenix/experimental/evals/utils/threads.py +0 -27
  102. phoenix/server/api/routers/evaluation_handler.py +0 -110
  103. phoenix/server/api/routers/span_handler.py +0 -70
  104. phoenix/server/api/routers/trace_handler.py +0 -60
  105. phoenix/storage/span_store/__init__.py +0 -23
  106. phoenix/storage/span_store/text_file.py +0 -85
  107. phoenix/trace/dsl/missing.py +0 -60
  108. {arize_phoenix-3.25.0.dist-info → arize_phoenix-4.0.1.dist-info}/WHEEL +0 -0
  109. {arize_phoenix-3.25.0.dist-info → arize_phoenix-4.0.1.dist-info}/licenses/IP_NOTICE +0 -0
  110. {arize_phoenix-3.25.0.dist-info → arize_phoenix-4.0.1.dist-info}/licenses/LICENSE +0 -0
  111. /phoenix/{datasets → db/insertion}/__init__.py +0 -0
  112. /phoenix/{experimental → db/migrations}/__init__.py +0 -0
  113. /phoenix/{storage → server/openapi}/__init__.py +0 -0
@@ -1,31 +1,46 @@
- import json
+ import warnings
  from collections import defaultdict
- from dataclasses import dataclass, field, fields, replace
- from functools import cached_property, partial
+ from dataclasses import dataclass, field, replace
+ from datetime import datetime
+ from functools import cached_property
+ from itertools import chain
+ from random import randint, random
  from types import MappingProxyType
  from typing import (
  Any,
- Callable,
- ClassVar,
+ DefaultDict,
  Dict,
  Iterable,
- Iterator,
  List,
  Mapping,
  Optional,
  Sequence,
- Sized,
- Tuple,
  cast,
  )

  import pandas as pd
  from openinference.semconv.trace import SpanAttributes
-
+ from sqlalchemy import JSON, Column, Label, Select, SQLColumnExpression, and_, func, select
+ from sqlalchemy.dialects.postgresql import aggregate_order_by
+ from sqlalchemy.orm import Session, aliased
+ from typing_extensions import assert_never
+
+ from phoenix.config import DEFAULT_PROJECT_NAME
+ from phoenix.db import models
+ from phoenix.db.helpers import SupportedSQLDialect
+ from phoenix.trace.attributes import (
+ JSON_STRING_ATTRIBUTES,
+ SEMANTIC_CONVENTIONS,
+ flatten,
+ get_attribute_value,
+ load_json_strings,
+ unflatten,
+ )
  from phoenix.trace.dsl import SpanFilter
- from phoenix.trace.dsl.filter import SupportsGetSpanEvaluation
- from phoenix.trace.schemas import ATTRIBUTE_PREFIX, CONTEXT_PREFIX, Span
- from phoenix.trace.span_json_encoder import span_to_json
+ from phoenix.trace.dsl.filter import Projector
+ from phoenix.trace.schemas import ATTRIBUTE_PREFIX
+
+ DEFAULT_SPAN_LIMIT = 1000

  RETRIEVAL_DOCUMENTS = SpanAttributes.RETRIEVAL_DOCUMENTS

@@ -39,127 +54,208 @@ _ALIASES = {
  "trace_id": "context.trace_id",
  }

- # Because span_kind is an enum, it needs to be converted to string,
- # so it's serializable by pyarrow.
- _CONVERT_TO_STRING = ("span_kind",)
-

  def _unalias(key: str) -> str:
  return _ALIASES.get(key, key)


  @dataclass(frozen=True)
- class Projection:
- key: str = ""
- value: Callable[[Span], Any] = field(init=False, repr=False)
- span_fields: ClassVar[Tuple[str, ...]] = tuple(f.name for f in fields(Span))
-
- def __bool__(self) -> bool:
- return bool(self.key)
+ class _Base:
+ """The sole purpose of this class is for `super().__post_init__()` to work"""

  def __post_init__(self) -> None:
- key = _unalias(self.key)
- object.__setattr__(self, "key", key)
- if key.startswith(CONTEXT_PREFIX):
- key = key[len(CONTEXT_PREFIX) :]
- value = partial(self._from_context, key=key)
- elif key.startswith(ATTRIBUTE_PREFIX):
- key = self.key[len(ATTRIBUTE_PREFIX) :]
- value = partial(self._from_attributes, key=key)
- elif key in self.span_fields:
- value = partial(self._from_span, key=key)
- else:
- value = partial(self._from_attributes, key=key)
- if self.key in _CONVERT_TO_STRING:
- object.__setattr__(
- self,
- "value",
- lambda span: None if (v := value(span)) is None else str(v),
- )
- else:
- object.__setattr__(self, "value", value)
+ pass

- def __call__(self, span: Span) -> Any:
- return self.value(span)

- @staticmethod
- def _from_attributes(span: Span, key: str) -> Any:
- return span.attributes.get(key)
+ @dataclass(frozen=True)
+ class Projection(_Base):
+ key: str = ""
+ _projector: Projector = field(init=False, repr=False)

- @staticmethod
- def _from_context(span: Span, key: str) -> Any:
- return getattr(span.context, key, None)
+ def __post_init__(self) -> None:
+ super().__post_init__()
+ object.__setattr__(self, "key", _unalias(self.key))
+ object.__setattr__(self, "_projector", Projector(self.key))

- @staticmethod
- def _from_span(span: Span, key: str) -> Any:
- return getattr(span, key, None)
+ def __bool__(self) -> bool:
+ return bool(self.key)
+
+ def __call__(self) -> SQLColumnExpression[Any]:
+ return self._projector()

  def to_dict(self) -> Dict[str, Any]:
  return {"key": self.key}

  @classmethod
  def from_dict(cls, obj: Mapping[str, Any]) -> "Projection":
- return cls(
- **({"key": cast(str, key)} if (key := obj.get("key")) else {}),
- )
+ return cls(**({"key": cast(str, key)} if (key := obj.get("key")) else {}))
+
+
+ @dataclass(frozen=True)
+ class _HasTmpSuffix(_Base):
+ _tmp_suffix: str = field(init=False, repr=False)
+ """Ideally every column label should get a temporary random suffix that will
+ be removed at the end. This is necessary during query construction because
+ sqlalchemy is not always foolproof, e.g. we have seen `group_by` clauses that
+ were incorrect or ambiguous. We should actively avoid name collisions, which
+ is increasingly likely as queries get more complex.
+ """
+
+ def __post_init__(self) -> None:
+ super().__post_init__()
+ object.__setattr__(self, "_tmp_suffix", f"{randint(0, 10**6):06d}")
+
+ def _remove_tmp_suffix(self, name: str) -> str:
+ if name.endswith(self._tmp_suffix):
+ return name[: -len(self._tmp_suffix)]
+ return name
+
+ def _add_tmp_suffix(self, name: str) -> str:
+ if name.endswith(self._tmp_suffix):
+ return name
+ return name + self._tmp_suffix


  @dataclass(frozen=True)
- class Explosion(Projection):
+ class Explosion(_HasTmpSuffix, Projection):
  kwargs: Mapping[str, str] = field(default_factory=lambda: MappingProxyType({}))
  primary_index_key: str = "context.span_id"

- position_prefix: str = field(init=False, repr=False)
- primary_index: Projection = field(init=False, repr=False)
+ _position_prefix: str = field(init=False, repr=False)
+ _primary_index: Projection = field(init=False, repr=False)
+ _array_tmp_col_label: str = field(init=False, repr=False)
+ """For sqlite we need to store the array in a temporary column to be able
+ to explode it later in pandas. `_array_tmp_col_label` is the name of this
+ temporary column. The temporary column will have a unique name
+ per instance.
+ """

  def __post_init__(self) -> None:
  super().__post_init__()
  position_prefix = _PRESCRIBED_POSITION_PREFIXES.get(self.key, "")
- object.__setattr__(self, "position_prefix", position_prefix)
- object.__setattr__(self, "primary_index", Projection(self.primary_index_key))
+ object.__setattr__(self, "_position_prefix", position_prefix)
+ object.__setattr__(self, "_primary_index", Projection(self.primary_index_key))
+ object.__setattr__(self, "_array_tmp_col_label", f"__array_tmp_col_{random()}")

  @cached_property
- def index_keys(self) -> Tuple[str, str]:
- return (self.primary_index.key, f"{self.position_prefix}position")
-
- def with_primary_index_key(self, primary_index_key: str) -> "Explosion":
- return replace(self, primary_index_key=primary_index_key)
-
- def __call__(self, span: Span) -> Iterator[Dict[str, Any]]:
- if not isinstance(seq := self.value(span), Iterable):
- return
- has_mapping = False
- for item in seq:
- if isinstance(item, Mapping):
- has_mapping = True
- break
- if not has_mapping:
- for i, item in enumerate(seq):
- if item is not None:
- yield {
- self.key: item,
- self.primary_index.key: self.primary_index(span),
- f"{self.position_prefix}position": i,
- }
- return
- for i, item in enumerate(seq):
- if not isinstance(item, Mapping):
- continue
- record = (
- {name: item.get(key) for name, key in self.kwargs.items()}
- if self.kwargs
- else dict(item)
+ def index_keys(self) -> List[str]:
+ return [self._primary_index.key, f"{self._position_prefix}position"]
+
+ def with_primary_index_key(self, _: str) -> "Explosion":
+ print("`.with_primary_index_key(...)` is deprecated and will be removed in the future.")
+ return self
+
+ def update_sql(
+ self,
+ stmt: Select[Any],
+ dialect: SupportedSQLDialect,
+ ) -> Select[Any]:
+ array = self()
+ if dialect is SupportedSQLDialect.SQLITE:
+ # Because sqlite doesn't support `WITH ORDINALITY`, the order of
+ # the returned (table) values is not guaranteed. So we resort to
+ # post hoc processing using pandas.
+ stmt = stmt.where(
+ func.json_type(array) == "array",
+ ).add_columns(
+ array.label(self._array_tmp_col_label),
  )
- for v in record.values():
- if v is not None:
- break
+ return stmt
+ elif dialect is SupportedSQLDialect.POSTGRESQL:
+ element = (
+ func.jsonb_array_elements(array)
+ .table_valued(
+ Column("obj", JSON),
+ with_ordinality="position",
+ joins_implicitly=True,
+ )
+ .render_derived()
+ )
+ obj, position = element.c.obj, element.c.position
+ # Use zero-based indexing for backward-compatibility.
+ position_label = (position - 1).label(f"{self._position_prefix}position")
+ if self.kwargs:
+ columns: Iterable[Label[Any]] = (
+ obj[key.split(".")].label(self._add_tmp_suffix(name))
+ for name, key in self.kwargs.items()
+ )
  else:
- record = {}
- if not record:
- continue
- record[self.primary_index.key] = self.primary_index(span)
- record[f"{self.position_prefix}position"] = i
- yield record
+ columns = (obj.label(self._array_tmp_col_label),)
+ stmt = (
+ stmt.where(func.jsonb_typeof(array) == "array")
+ .where(func.jsonb_typeof(obj) == "object")
+ .add_columns(position_label, *columns)
+ )
+ return stmt
+ else:
+ assert_never(dialect)
+
+ def update_df(
+ self,
+ df: pd.DataFrame,
+ dialect: SupportedSQLDialect,
+ ) -> pd.DataFrame:
+ df = df.rename(self._remove_tmp_suffix, axis=1)
+ if df.empty:
+ columns = list(
+ set(
+ chain(
+ self.index_keys,
+ df.drop(self._array_tmp_col_label, axis=1, errors="ignore").columns,
+ self.kwargs.keys(),
+ )
+ )
+ )
+ df = pd.DataFrame(columns=columns).set_index(self.index_keys)
+ return df
+ if dialect != SupportedSQLDialect.SQLITE and self.kwargs:
+ df = df.set_index(self.index_keys)
+ return df
+ if dialect is SupportedSQLDialect.SQLITE:
+ # Because sqlite doesn't support `WITH ORDINALITY`, the order of
+ # the returned (table) values is not guaranteed. So we resort to
+ # post hoc processing using pandas.
+ def _extract_values(array: List[Any]) -> List[Dict[str, Any]]:
+ if not isinstance(array, Iterable):
+ return []
+ if not self.kwargs:
+ return [
+ {
+ **dict(flatten(obj)),
+ f"{self._position_prefix}position": i,
+ }
+ for i, obj in enumerate(array)
+ if isinstance(obj, Mapping)
+ ]
+ res: List[Dict[str, Any]] = []
+ for i, obj in enumerate(array):
+ if not isinstance(obj, Mapping):
+ continue
+ values: Dict[str, Any] = {f"{self._position_prefix}position": i}
+ for name, key in self.kwargs.items():
+ if (value := get_attribute_value(obj, key)) is not None:
+ values[name] = value
+ res.append(values)
+ return res
+
+ records = df.loc[:, self._array_tmp_col_label].dropna().map(_extract_values).explode()
+ elif dialect is SupportedSQLDialect.POSTGRESQL:
+ records = df.loc[:, self._array_tmp_col_label].dropna().map(flatten).map(dict)
+ else:
+ assert_never(dialect)
+ df = df.drop(self._array_tmp_col_label, axis=1)
+ if records.empty:
+ df = df.set_index(self.index_keys[0])
+ return df
+ df_explode = pd.DataFrame.from_records(records.to_list(), index=records.index)
+ if dialect is SupportedSQLDialect.SQLITE:
+ df = _outer_join(df, df_explode)
+ elif dialect is SupportedSQLDialect.POSTGRESQL:
+ df = pd.concat([df, df_explode], axis=1)
+ else:
+ assert_never(dialect)
+ df = df.set_index(self.index_keys)
+ return df

  def to_dict(self) -> Dict[str, Any]:
  return {
@@ -186,27 +282,126 @@ class Explosion(Projection):


  @dataclass(frozen=True)
- class Concatenation(Projection):
+ class Concatenation(_HasTmpSuffix, Projection):
  kwargs: Mapping[str, str] = field(default_factory=lambda: MappingProxyType({}))
  separator: str = "\n\n"

+ _array_tmp_col_label: str = field(init=False, repr=False)
+ """For SQLite we need to store the array in a temporary column to be able
+ to concatenate it later in pandas. `_array_tmp_col_label` is the name of
+ this temporary column. The temporary column will have a unique name
+ per instance.
+ """
+
+ def __post_init__(self) -> None:
+ super().__post_init__()
+ object.__setattr__(self, "_array_tmp_col_label", f"__array_tmp_col_{random()}")
+
  def with_separator(self, separator: str = "\n\n") -> "Concatenation":
  return replace(self, separator=separator)

- def __call__(self, span: Span) -> Iterator[Tuple[str, str]]:
- if not isinstance(seq := self.value(span), Iterable):
- return
- if not self.kwargs:
- yield self.key, self.separator.join(map(str, seq))
- record = defaultdict(list)
- for item in seq:
- if not isinstance(item, Mapping):
- continue
- for k, v in self.kwargs.items():
- if value := item.get(v):
- record[k].append(value)
- for name, values in record.items():
- yield name, self.separator.join(map(str, values))
+ def update_sql(
+ self,
+ stmt: Select[Any],
+ dialect: SupportedSQLDialect,
+ ) -> Select[Any]:
+ array = self()
+ if dialect is SupportedSQLDialect.SQLITE:
+ # Because SQLite doesn't support `WITH ORDINALITY`, the order of
+ # the returned table-values is not guaranteed. So we resort to
+ # post hoc processing using pandas.
+ stmt = stmt.where(
+ func.json_type(array) == "array",
+ ).add_columns(
+ array.label(self._array_tmp_col_label),
+ )
+ return stmt
+ elif dialect is SupportedSQLDialect.POSTGRESQL:
+ element = (
+ (
+ func.jsonb_array_elements(array)
+ if self.kwargs
+ else func.jsonb_array_elements_text(array)
+ )
+ .table_valued(
+ Column("obj", JSON),
+ with_ordinality="position",
+ joins_implicitly=True,
+ )
+ .render_derived()
+ )
+ obj, position = element.c.obj, element.c.position
+ if self.kwargs:
+ columns: Iterable[Label[Any]] = (
+ func.string_agg(
+ obj[key.split(".")].as_string(),
+ aggregate_order_by(self.separator, position), # type: ignore
+ ).label(self._add_tmp_suffix(label))
+ for label, key in self.kwargs.items()
+ )
+ else:
+ columns = (
+ func.string_agg(
+ obj,
+ aggregate_order_by(self.separator, position), # type: ignore
+ ).label(self.key),
+ )
+ stmt = (
+ stmt.where(
+ and_(
+ func.jsonb_typeof(array) == "array",
+ *((func.jsonb_typeof(obj) == "object",) if self.kwargs else ()),
+ )
+ )
+ .add_columns(*columns)
+ .group_by(*stmt.columns.keys())
+ )
+ return stmt
+ else:
+ assert_never(dialect)
+
+ def update_df(
+ self,
+ df: pd.DataFrame,
+ dialect: SupportedSQLDialect,
+ ) -> pd.DataFrame:
+ df = df.rename(self._remove_tmp_suffix, axis=1)
+ if df.empty:
+ columns = list(
+ set(
+ chain(
+ df.drop(self._array_tmp_col_label, axis=1, errors="ignore").columns,
+ self.kwargs.keys(),
+ )
+ )
+ )
+ return pd.DataFrame(columns=columns, index=df.index)
+ if dialect is SupportedSQLDialect.SQLITE:
+ # Because SQLite doesn't support `WITH ORDINALITY`, the order of
+ # the returned table-values is not guaranteed. So we resort to
+ # post hoc processing using pandas.
+ def _concat_values(array: List[Any]) -> Dict[str, Any]:
+ if not isinstance(array, Iterable):
+ return {}
+ if not self.kwargs:
+ return {self.key: self.separator.join(str(obj) for obj in array)}
+ values: DefaultDict[str, List[str]] = defaultdict(list)
+ for i, obj in enumerate(array):
+ if not isinstance(obj, Mapping):
+ continue
+ for label, key in self.kwargs.items():
+ if (value := get_attribute_value(obj, key)) is not None:
+ values[label].append(str(value))
+ return {label: self.separator.join(vs) for label, vs in values.items()}
+
+ records = df.loc[:, self._array_tmp_col_label].map(_concat_values)
+ df_concat = pd.DataFrame.from_records(records.to_list(), index=records.index)
+ return df.drop(self._array_tmp_col_label, axis=1).join(df_concat, how="outer")
+ elif dialect is SupportedSQLDialect.POSTGRESQL:
+ pass
+ else:
+ assert_never(dialect)
+ return df

  def to_dict(self) -> Dict[str, Any]:
  return {
@@ -233,13 +428,24 @@ class Concatenation(Projection):


  @dataclass(frozen=True)
- class SpanQuery:
+ class SpanQuery(_HasTmpSuffix):
  _select: Mapping[str, Projection] = field(default_factory=lambda: MappingProxyType({}))
- _concat: Concatenation = field(default_factory=Concatenation)
- _explode: Explosion = field(default_factory=Explosion)
- _filter: SpanFilter = field(default_factory=SpanFilter)
+ _concat: Optional[Concatenation] = field(default=None)
+ _explode: Optional[Explosion] = field(default=None)
+ _filter: Optional[SpanFilter] = field(default=None)
  _rename: Mapping[str, str] = field(default_factory=lambda: MappingProxyType({}))
  _index: Projection = field(default_factory=lambda: Projection("context.span_id"))
+ _concat_separator: str = field(default="\n\n", repr=False)
+ _pk_tmp_col_label: str = field(init=False, repr=False)
+ """We use `_pk_tmp_col_label` as a temporary column for storing
+ the row id, i.e. the primary key, of the spans table. This will help
+ us with joins without the risk of naming conflicts. The temporary
+ column will have a unique name per instance.
+ """
+
+ def __post_init__(self) -> None:
+ super().__post_init__()
+ object.__setattr__(self, "_pk_tmp_col_label", f"__pk_tmp_col_{random()}")

  def __bool__(self) -> bool:
  return bool(self._select) or bool(self._filter) or bool(self._explode) or bool(self._concat)
@@ -255,11 +461,21 @@ class SpanQuery:
  return replace(self, _filter=_filter)

  def explode(self, key: str, **kwargs: str) -> "SpanQuery":
+ assert (
+ isinstance(key, str) and key
+ ), "The field name for explosion must be a non-empty string."
  _explode = Explosion(key=key, kwargs=kwargs, primary_index_key=self._index.key)
  return replace(self, _explode=_explode)

  def concat(self, key: str, **kwargs: str) -> "SpanQuery":
- _concat = Concatenation(key=key, kwargs=kwargs)
+ assert (
+ isinstance(key, str) and key
+ ), "The field name for concatenation must be a non-empty string."
+ _concat = (
+ Concatenation(key=key, kwargs=kwargs, separator=self._concat.separator)
+ if self._concat
+ else Concatenation(key=key, kwargs=kwargs, separator=self._concat_separator)
+ )
  return replace(self, _concat=_concat)

  def rename(self, **kwargs: str) -> "SpanQuery":
@@ -268,75 +484,136 @@ class SpanQuery:

  def with_index(self, key: str = "context.span_id") -> "SpanQuery":
  _index = Projection(key=key)
- return replace(self, _index=_index)
+ return (
+ replace(self, _index=_index, _explode=replace(self._explode, primary_index_key=key))
+ if self._explode
+ else replace(self, _index=_index)
+ )

  def with_concat_separator(self, separator: str = "\n\n") -> "SpanQuery":
+ if not self._concat:
+ return replace(self, _concat_separator=separator)
  _concat = self._concat.with_separator(separator)
  return replace(self, _concat=_concat)

- def with_explode_primary_index_key(self, primary_index_key: str) -> "SpanQuery":
- _explode = self._explode.with_primary_index_key(primary_index_key)
- return replace(self, _explode=_explode)
-
- def __call__(self, spans: Iterable[Span]) -> pd.DataFrame:
- if self._filter:
- spans = filter(self._filter, spans)
- if self._explode:
- spans = filter(
- lambda span: (isinstance(seq := self._explode.value(span), Sized) and len(seq)),
- spans,
- )
- if self._concat:
- spans = filter(
- lambda span: (isinstance(seq := self._concat.value(span), Sized) and len(seq)),
- spans,
+ def with_explode_primary_index_key(self, _: str) -> "SpanQuery":
+ print(
+ "`.with_explode_primary_index_key(...)` is deprecated and will be "
+ "removed in the future. Use `.with_index(...)` instead."
+ )
+ return self
+
+ def __call__(
+ self,
+ session: Session,
+ project_name: Optional[str] = None,
+ start_time: Optional[datetime] = None,
+ end_time: Optional[datetime] = None,
+ limit: Optional[int] = DEFAULT_SPAN_LIMIT,
+ root_spans_only: Optional[bool] = None,
+ # Deprecated
+ stop_time: Optional[datetime] = None,
+ ) -> pd.DataFrame:
+ if not project_name:
+ project_name = DEFAULT_PROJECT_NAME
+ if stop_time:
+ # Deprecated. Raise a warning
+ warnings.warn(
+ "stop_time is deprecated. Use end_time instead.",
+ DeprecationWarning,
  )
+ end_time = end_time or stop_time
  if not (self._select or self._explode or self._concat):
- if not (data := [json.loads(span_to_json(span)) for span in spans]):
- return pd.DataFrame()
- return (
- pd.json_normalize(data, max_level=1)
- .rename(self._rename, axis=1, errors="ignore")
- .set_index("context.span_id", drop=False)
+ return _get_spans_dataframe(
+ session,
+ project_name,
+ span_filter=self._filter,
+ start_time=start_time,
+ end_time=end_time,
+ limit=limit,
+ root_spans_only=root_spans_only,
  )
- _selected: List[Dict[str, Any]] = []
- _exploded: List[Dict[str, Any]] = []
- for span in spans:
- if self._select:
- record = {name: proj(span) for name, proj in self._select.items()}
- for v in record.values():
- if v is not None:
- break
- else:
- record = {}
- if self._concat:
- record.update(self._concat(span))
- if record:
- if self._index.key not in record:
- record[self._index.key] = self._index(span)
- _selected.append(record)
- elif self._concat:
- record = {self._index.key: self._index(span)}
- record.update(self._concat(span))
- if record:
- _selected.append(record)
- if self._explode:
- _exploded.extend(self._explode(span))
- if _selected:
- select_df = pd.DataFrame(_selected)
- else:
- select_df = pd.DataFrame(columns=[self._index.key])
- select_df = select_df.set_index(self._index.key)
+ assert session.bind is not None
+ dialect = SupportedSQLDialect(session.bind.dialect.name)
+ row_id = models.Span.id.label(self._pk_tmp_col_label)
+ stmt: Select[Any] = (
+ # We do not allow `group_by` anything other than `row_id` because otherwise
+ # it's too complex for the post hoc processing step in pandas.
+ select(row_id)
+ .join(models.Trace)
+ .join(models.Project)
+ .where(models.Project.name == project_name)
+ )
+ if start_time:
+ stmt = stmt.where(start_time <= models.Span.start_time)
+ if end_time:
+ stmt = stmt.where(models.Span.start_time < end_time)
+ if limit is not None:
+ stmt = stmt.limit(limit)
+ if root_spans_only:
+ parent = aliased(models.Span)
+ stmt = stmt.outerjoin(
+ parent,
+ models.Span.parent_id == parent.span_id,
+ ).where(parent.span_id == None) # noqa E711
+ stmt0_orig: Select[Any] = stmt
+ stmt1_filter: Optional[Select[Any]] = None
+ if self._filter:
+ stmt = stmt1_filter = self._filter(stmt)
+ stmt2_select: Optional[Select[Any]] = None
+ if self._select:
+ columns: Iterable[Label[Any]] = (
+ proj().label(self._add_tmp_suffix(label)) for label, proj in self._select.items()
+ )
+ stmt = stmt2_select = stmt.add_columns(*columns)
+ stmt3_explode: Optional[Select[Any]] = None
  if self._explode:
- if _exploded:
- explode_df = pd.DataFrame(_exploded)
+ stmt = stmt3_explode = self._explode.update_sql(stmt, dialect)
+ index: Label[Any] = self._index().label(self._add_tmp_suffix(self._index.key))
+ df: Optional[pd.DataFrame] = None
+ # `concat` is done separately because it has `group_by` but we can't
+ # always join to it as a subquery because it may require post hoc
+ # processing in pandas. It's kept separate for simplicity.
+ df_concat: Optional[pd.DataFrame] = None
+ conn = session.connection()
+ if self._explode or not self._concat:
+ if index.name not in stmt.selected_columns.keys():
+ stmt = stmt.add_columns(index)
+ df = pd.read_sql_query(stmt, conn, self._pk_tmp_col_label)
+ if self._concat:
+ if df is not None:
+ assert stmt3_explode is not None
+ # We can't include stmt3_explode because it may be trying to
+ # explode the same column that we're trying to concatenate,
+ # resulting in duplicated joins.
+ stmt_no_explode = (
+ stmt2_select
+ if stmt2_select is not None
+ else (stmt1_filter if stmt1_filter is not None else stmt0_orig)
+ )
+ stmt4_concat = stmt_no_explode.with_only_columns(row_id)
  else:
- explode_df = pd.DataFrame(columns=self._explode.index_keys)
- explode_df = explode_df.set_index(list(self._explode.index_keys))
- if not self._select:
- return explode_df.rename(self._rename, axis=1, errors="ignore")
- select_df = select_df.join(explode_df, how="outer")
- return select_df.rename(self._rename, axis=1, errors="ignore")
+ assert stmt3_explode is None
+ stmt4_concat = stmt
+ if (df is None or df.empty) and index.name not in stmt4_concat.selected_columns.keys():
+ stmt4_concat = stmt4_concat.add_columns(index)
+ stmt4_concat = self._concat.update_sql(stmt4_concat, dialect)
+ df_concat = pd.read_sql_query(stmt4_concat, conn, self._pk_tmp_col_label)
+ df_concat = self._concat.update_df(df_concat, dialect)
+ assert df is not None or df_concat is not None
+ if df is None:
+ df = df_concat
+ elif df_concat is not None:
+ df = _outer_join(df, df_concat)
+ assert df is not None and self._pk_tmp_col_label not in df.columns
+ df = df.rename(self._remove_tmp_suffix, axis=1)
+ if self._explode:
+ df = self._explode.update_df(df, dialect)
+ else:
+ df = df.set_index(self._index.key)
+ df = df.rename(_ALIASES, axis=1, errors="ignore")
+ df = df.rename(self._rename, axis=1, errors="ignore")
+ return df

  def to_dict(self) -> Dict[str, Any]:
  return {
@@ -345,9 +622,9 @@ class SpanQuery:
  if self._select
  else {}
  ),
- "filter": self._filter.to_dict(),
- "explode": self._explode.to_dict(),
- "concat": self._concat.to_dict(),
+ **({"filter": self._filter.to_dict()} if self._filter else {}),
+ **({"explode": self._explode.to_dict()} if self._explode else {}),
+ **({"concat": self._concat.to_dict()} if self._concat else {}),
  **({"rename": dict(self._rename)} if self._rename else {}),
  "index": self._index.to_dict(),
  }
@@ -356,7 +633,6 @@ class SpanQuery:
  def from_dict(
  cls,
  obj: Mapping[str, Any],
- evals: Optional[SupportsGetSpanEvaluation] = None,
  valid_eval_names: Optional[Sequence[str]] = None,
  ) -> "SpanQuery":
  return cls(
@@ -376,7 +652,6 @@ class SpanQuery:
  {
  "_filter": SpanFilter.from_dict(
  cast(Mapping[str, Any], filter),
- evals=evals,
  valid_eval_names=valid_eval_names,
  )
  } # type: ignore
@@ -386,11 +661,13 @@ class SpanQuery:
  **(
  {"_explode": Explosion.from_dict(cast(Mapping[str, Any], explode))} # type: ignore
  if (explode := obj.get("explode"))
+ and explode.get("key") # check `key` for backward-compatible truthiness
  else {}
  ),
  **(
  {"_concat": Concatenation.from_dict(cast(Mapping[str, Any], concat))} # type: ignore
  if (concat := obj.get("concat"))
+ and concat.get("key") # check `key` for backward-compatible truthiness
  else {}
  ),
  **(
@@ -404,3 +681,103 @@ class SpanQuery:
  else {}
  ),
  )
+
+
+ def _get_spans_dataframe(
+ session: Session,
+ project_name: str,
+ /,
+ *,
+ span_filter: Optional[SpanFilter] = None,
+ start_time: Optional[datetime] = None,
+ end_time: Optional[datetime] = None,
+ limit: Optional[int] = DEFAULT_SPAN_LIMIT,
+ root_spans_only: Optional[bool] = None,
+ # Deprecated
+ stop_time: Optional[datetime] = None,
+ ) -> pd.DataFrame:
+ # use legacy labels for backward-compatibility
+ span_id_label = "context.span_id"
+ trace_id_label = "context.trace_id"
+ if stop_time:
+ # Deprecated. Raise a warning
+ warnings.warn(
+ "stop_time is deprecated. Use end_time instead.",
+ DeprecationWarning,
+ )
+ end_time = end_time or stop_time
+ stmt: Select[Any] = (
+ select(
+ models.Span.name,
+ models.Span.span_kind,
+ models.Span.parent_id,
+ models.Span.start_time,
+ models.Span.end_time,
+ models.Span.status_code,
+ models.Span.status_message,
+ models.Span.events,
+ models.Span.span_id.label(span_id_label),
+ models.Trace.trace_id.label(trace_id_label),
+ models.Span.attributes,
+ )
+ .join(models.Trace)
+ .join(models.Project)
+ .where(models.Project.name == project_name)
+ )
+ if span_filter:
+ stmt = span_filter(stmt)
+ if start_time:
+ stmt = stmt.where(start_time <= models.Span.start_time)
+ if end_time:
+ stmt = stmt.where(models.Span.start_time < end_time)
+ if limit is not None:
+ stmt = stmt.limit(limit)
+ if root_spans_only:
+ parent = aliased(models.Span)
+ stmt = stmt.outerjoin(
+ parent,
+ models.Span.parent_id == parent.span_id,
+ ).where(parent.span_id == None) # noqa E711
+ conn = session.connection()
+ # set `drop=False` for backward-compatibility
+ df = pd.read_sql_query(stmt, conn).set_index(span_id_label, drop=False)
+ if df.empty:
+ return df.drop("attributes", axis=1)
+ df_attributes = pd.DataFrame.from_records(
+ df.attributes.map(_flatten_semantic_conventions),
+ ).set_axis(df.index, axis=0)
+ df = pd.concat(
+ [
+ df.drop("attributes", axis=1),
+ df_attributes.add_prefix("attributes" + "."),
+ ],
+ axis=1,
+ )
+ return df
+
+
+ def _outer_join(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame:
+ if (columns_intersection := left.columns.intersection(right.columns)).empty:
+ df = left.join(right, how="outer")
+ else:
+ df = left.join(right, how="outer", lsuffix="_L", rsuffix="_R")
+ for col in columns_intersection:
+ df.loc[:, col] = df.loc[:, f"{col}_L"].fillna(df.loc[:, f"{col}_R"])
+ df = df.drop([f"{col}_L", f"{col}_R"], axis=1)
+ return df
+
+
+ def _flatten_semantic_conventions(attributes: Mapping[str, Any]) -> Dict[str, Any]:
+ # This may be inefficient, but is needed to preserve backward-compatibility.
+ # For example, custom attributes do not get flattened.
+ ans = unflatten(
+ load_json_strings(
+ flatten(
+ attributes,
+ recurse_on_sequence=True,
+ json_string_attributes=JSON_STRING_ATTRIBUTES,
+ ),
+ ),
+ prefix_exclusions=SEMANTIC_CONVENTIONS,
+ )
+ return ans
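
Usage sketch (not part of the diff above): a minimal illustration of how the reworked SpanQuery.__call__ might be driven against a SQLAlchemy session, based on the signature introduced in this release. The import path, database URL, project name, and the "retrieval.documents"/"document.content"/"document.score" attribute keys are assumptions for illustration; an existing, already-migrated Phoenix database is assumed.

    from sqlalchemy import create_engine
    from sqlalchemy.orm import Session

    from phoenix.trace.dsl.query import SpanQuery  # assumed import path

    engine = create_engine("sqlite:///phoenix.db")  # assumed Phoenix SQLite database
    # Explode a list-valued attribute into one row per item, per the new
    # Explosion.update_sql/update_df path; kwargs map output columns to item keys.
    query = SpanQuery().explode(
        "retrieval.documents",
        content="document.content",  # assumed OpenInference document keys
        score="document.score",
    )
    with Session(engine) as session:
        # project_name, limit, etc. mirror the parameters added in 4.0.1
        df = query(session, project_name="default", limit=100)
    print(df.head())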