arize-phoenix 3.25.0__py3-none-any.whl → 4.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of arize-phoenix might be problematic.

Files changed (113)
  1. {arize_phoenix-3.25.0.dist-info → arize_phoenix-4.0.1.dist-info}/METADATA +26 -4
  2. {arize_phoenix-3.25.0.dist-info → arize_phoenix-4.0.1.dist-info}/RECORD +80 -75
  3. phoenix/__init__.py +9 -5
  4. phoenix/config.py +109 -53
  5. phoenix/datetime_utils.py +18 -1
  6. phoenix/db/README.md +25 -0
  7. phoenix/db/__init__.py +4 -0
  8. phoenix/db/alembic.ini +119 -0
  9. phoenix/db/bulk_inserter.py +206 -0
  10. phoenix/db/engines.py +152 -0
  11. phoenix/db/helpers.py +47 -0
  12. phoenix/db/insertion/evaluation.py +209 -0
  13. phoenix/db/insertion/helpers.py +51 -0
  14. phoenix/db/insertion/span.py +142 -0
  15. phoenix/db/migrate.py +71 -0
  16. phoenix/db/migrations/env.py +121 -0
  17. phoenix/db/migrations/script.py.mako +26 -0
  18. phoenix/db/migrations/versions/cf03bd6bae1d_init.py +280 -0
  19. phoenix/db/models.py +371 -0
  20. phoenix/exceptions.py +5 -1
  21. phoenix/server/api/context.py +40 -3
  22. phoenix/server/api/dataloaders/__init__.py +97 -0
  23. phoenix/server/api/dataloaders/cache/__init__.py +3 -0
  24. phoenix/server/api/dataloaders/cache/two_tier_cache.py +67 -0
  25. phoenix/server/api/dataloaders/document_evaluation_summaries.py +152 -0
  26. phoenix/server/api/dataloaders/document_evaluations.py +37 -0
  27. phoenix/server/api/dataloaders/document_retrieval_metrics.py +98 -0
  28. phoenix/server/api/dataloaders/evaluation_summaries.py +151 -0
  29. phoenix/server/api/dataloaders/latency_ms_quantile.py +198 -0
  30. phoenix/server/api/dataloaders/min_start_or_max_end_times.py +93 -0
  31. phoenix/server/api/dataloaders/record_counts.py +125 -0
  32. phoenix/server/api/dataloaders/span_descendants.py +64 -0
  33. phoenix/server/api/dataloaders/span_evaluations.py +37 -0
  34. phoenix/server/api/dataloaders/token_counts.py +138 -0
  35. phoenix/server/api/dataloaders/trace_evaluations.py +37 -0
  36. phoenix/server/api/input_types/SpanSort.py +138 -68
  37. phoenix/server/api/routers/v1/__init__.py +11 -0
  38. phoenix/server/api/routers/v1/evaluations.py +275 -0
  39. phoenix/server/api/routers/v1/spans.py +126 -0
  40. phoenix/server/api/routers/v1/traces.py +82 -0
  41. phoenix/server/api/schema.py +112 -48
  42. phoenix/server/api/types/DocumentEvaluationSummary.py +1 -1
  43. phoenix/server/api/types/Evaluation.py +29 -12
  44. phoenix/server/api/types/EvaluationSummary.py +29 -44
  45. phoenix/server/api/types/MimeType.py +2 -2
  46. phoenix/server/api/types/Model.py +9 -9
  47. phoenix/server/api/types/Project.py +240 -171
  48. phoenix/server/api/types/Span.py +87 -131
  49. phoenix/server/api/types/Trace.py +29 -20
  50. phoenix/server/api/types/pagination.py +151 -10
  51. phoenix/server/app.py +263 -35
  52. phoenix/server/grpc_server.py +93 -0
  53. phoenix/server/main.py +75 -60
  54. phoenix/server/openapi/docs.py +218 -0
  55. phoenix/server/prometheus.py +23 -7
  56. phoenix/server/static/index.js +662 -643
  57. phoenix/server/telemetry.py +68 -0
  58. phoenix/services.py +4 -0
  59. phoenix/session/client.py +34 -30
  60. phoenix/session/data_extractor.py +8 -3
  61. phoenix/session/session.py +176 -155
  62. phoenix/settings.py +13 -0
  63. phoenix/trace/attributes.py +349 -0
  64. phoenix/trace/dsl/README.md +116 -0
  65. phoenix/trace/dsl/filter.py +660 -192
  66. phoenix/trace/dsl/helpers.py +24 -5
  67. phoenix/trace/dsl/query.py +562 -185
  68. phoenix/trace/fixtures.py +69 -7
  69. phoenix/trace/otel.py +44 -200
  70. phoenix/trace/schemas.py +14 -8
  71. phoenix/trace/span_evaluations.py +5 -2
  72. phoenix/utilities/__init__.py +0 -26
  73. phoenix/utilities/span_store.py +0 -23
  74. phoenix/version.py +1 -1
  75. phoenix/core/project.py +0 -773
  76. phoenix/core/traces.py +0 -96
  77. phoenix/datasets/dataset.py +0 -214
  78. phoenix/datasets/fixtures.py +0 -24
  79. phoenix/datasets/schema.py +0 -31
  80. phoenix/experimental/evals/__init__.py +0 -73
  81. phoenix/experimental/evals/evaluators.py +0 -413
  82. phoenix/experimental/evals/functions/__init__.py +0 -4
  83. phoenix/experimental/evals/functions/classify.py +0 -453
  84. phoenix/experimental/evals/functions/executor.py +0 -353
  85. phoenix/experimental/evals/functions/generate.py +0 -138
  86. phoenix/experimental/evals/functions/processing.py +0 -76
  87. phoenix/experimental/evals/models/__init__.py +0 -14
  88. phoenix/experimental/evals/models/anthropic.py +0 -175
  89. phoenix/experimental/evals/models/base.py +0 -170
  90. phoenix/experimental/evals/models/bedrock.py +0 -221
  91. phoenix/experimental/evals/models/litellm.py +0 -134
  92. phoenix/experimental/evals/models/openai.py +0 -453
  93. phoenix/experimental/evals/models/rate_limiters.py +0 -246
  94. phoenix/experimental/evals/models/vertex.py +0 -173
  95. phoenix/experimental/evals/models/vertexai.py +0 -186
  96. phoenix/experimental/evals/retrievals.py +0 -96
  97. phoenix/experimental/evals/templates/__init__.py +0 -50
  98. phoenix/experimental/evals/templates/default_templates.py +0 -472
  99. phoenix/experimental/evals/templates/template.py +0 -195
  100. phoenix/experimental/evals/utils/__init__.py +0 -172
  101. phoenix/experimental/evals/utils/threads.py +0 -27
  102. phoenix/server/api/routers/evaluation_handler.py +0 -110
  103. phoenix/server/api/routers/span_handler.py +0 -70
  104. phoenix/server/api/routers/trace_handler.py +0 -60
  105. phoenix/storage/span_store/__init__.py +0 -23
  106. phoenix/storage/span_store/text_file.py +0 -85
  107. phoenix/trace/dsl/missing.py +0 -60
  108. {arize_phoenix-3.25.0.dist-info → arize_phoenix-4.0.1.dist-info}/WHEEL +0 -0
  109. {arize_phoenix-3.25.0.dist-info → arize_phoenix-4.0.1.dist-info}/licenses/IP_NOTICE +0 -0
  110. {arize_phoenix-3.25.0.dist-info → arize_phoenix-4.0.1.dist-info}/licenses/LICENSE +0 -0
  111. /phoenix/{datasets → db/insertion}/__init__.py +0 -0
  112. /phoenix/{experimental → db/migrations}/__init__.py +0 -0
  113. /phoenix/{storage → server/openapi}/__init__.py +0 -0
@@ -1,31 +1,46 @@
- import json
+ import warnings
  from collections import defaultdict
- from dataclasses import dataclass, field, fields, replace
- from functools import cached_property, partial
+ from dataclasses import dataclass, field, replace
+ from datetime import datetime
+ from functools import cached_property
+ from itertools import chain
+ from random import randint, random
  from types import MappingProxyType
  from typing import (
  Any,
- Callable,
- ClassVar,
+ DefaultDict,
  Dict,
  Iterable,
- Iterator,
  List,
  Mapping,
  Optional,
  Sequence,
- Sized,
- Tuple,
  cast,
  )

  import pandas as pd
  from openinference.semconv.trace import SpanAttributes
-
+ from sqlalchemy import JSON, Column, Label, Select, SQLColumnExpression, and_, func, select
+ from sqlalchemy.dialects.postgresql import aggregate_order_by
+ from sqlalchemy.orm import Session, aliased
+ from typing_extensions import assert_never
+
+ from phoenix.config import DEFAULT_PROJECT_NAME
+ from phoenix.db import models
+ from phoenix.db.helpers import SupportedSQLDialect
+ from phoenix.trace.attributes import (
+ JSON_STRING_ATTRIBUTES,
+ SEMANTIC_CONVENTIONS,
+ flatten,
+ get_attribute_value,
+ load_json_strings,
+ unflatten,
+ )
  from phoenix.trace.dsl import SpanFilter
- from phoenix.trace.dsl.filter import SupportsGetSpanEvaluation
- from phoenix.trace.schemas import ATTRIBUTE_PREFIX, CONTEXT_PREFIX, Span
- from phoenix.trace.span_json_encoder import span_to_json
+ from phoenix.trace.dsl.filter import Projector
+ from phoenix.trace.schemas import ATTRIBUTE_PREFIX
+
+ DEFAULT_SPAN_LIMIT = 1000

  RETRIEVAL_DOCUMENTS = SpanAttributes.RETRIEVAL_DOCUMENTS

@@ -39,127 +54,208 @@ _ALIASES = {
  "trace_id": "context.trace_id",
  }

- # Because span_kind is an enum, it needs to be converted to string,
- # so it's serializable by pyarrow.
- _CONVERT_TO_STRING = ("span_kind",)
-

  def _unalias(key: str) -> str:
  return _ALIASES.get(key, key)


  @dataclass(frozen=True)
- class Projection:
- key: str = ""
- value: Callable[[Span], Any] = field(init=False, repr=False)
- span_fields: ClassVar[Tuple[str, ...]] = tuple(f.name for f in fields(Span))
-
- def __bool__(self) -> bool:
- return bool(self.key)
+ class _Base:
+ """The sole purpose of this class is for `super().__post_init__()` to work"""

  def __post_init__(self) -> None:
- key = _unalias(self.key)
- object.__setattr__(self, "key", key)
- if key.startswith(CONTEXT_PREFIX):
- key = key[len(CONTEXT_PREFIX) :]
- value = partial(self._from_context, key=key)
- elif key.startswith(ATTRIBUTE_PREFIX):
- key = self.key[len(ATTRIBUTE_PREFIX) :]
- value = partial(self._from_attributes, key=key)
- elif key in self.span_fields:
- value = partial(self._from_span, key=key)
- else:
- value = partial(self._from_attributes, key=key)
- if self.key in _CONVERT_TO_STRING:
- object.__setattr__(
- self,
- "value",
- lambda span: None if (v := value(span)) is None else str(v),
- )
- else:
- object.__setattr__(self, "value", value)
+ pass

- def __call__(self, span: Span) -> Any:
- return self.value(span)

- @staticmethod
- def _from_attributes(span: Span, key: str) -> Any:
- return span.attributes.get(key)
+ @dataclass(frozen=True)
+ class Projection(_Base):
+ key: str = ""
+ _projector: Projector = field(init=False, repr=False)

- @staticmethod
- def _from_context(span: Span, key: str) -> Any:
- return getattr(span.context, key, None)
+ def __post_init__(self) -> None:
+ super().__post_init__()
+ object.__setattr__(self, "key", _unalias(self.key))
+ object.__setattr__(self, "_projector", Projector(self.key))

- @staticmethod
- def _from_span(span: Span, key: str) -> Any:
- return getattr(span, key, None)
+ def __bool__(self) -> bool:
+ return bool(self.key)
+
+ def __call__(self) -> SQLColumnExpression[Any]:
+ return self._projector()

  def to_dict(self) -> Dict[str, Any]:
  return {"key": self.key}

  @classmethod
  def from_dict(cls, obj: Mapping[str, Any]) -> "Projection":
- return cls(
- **({"key": cast(str, key)} if (key := obj.get("key")) else {}),
- )
+ return cls(**({"key": cast(str, key)} if (key := obj.get("key")) else {}))
+
+
+ @dataclass(frozen=True)
+ class _HasTmpSuffix(_Base):
+ _tmp_suffix: str = field(init=False, repr=False)
+ """Ideally every column label should get a temporary random suffix that will
+ be removed at the end. This is necessary during query construction because
+ sqlalchemy is not always foolproof, e.g. we have seen `group_by` clauses that
+ were incorrect or ambiguous. We should actively avoid name collisions, which
+ is increasingly likely as queries get more complex.
+ """
+
+ def __post_init__(self) -> None:
+ super().__post_init__()
+ object.__setattr__(self, "_tmp_suffix", f"{randint(0, 10**6):06d}")
+
+ def _remove_tmp_suffix(self, name: str) -> str:
+ if name.endswith(self._tmp_suffix):
+ return name[: -len(self._tmp_suffix)]
+ return name
+
+ def _add_tmp_suffix(self, name: str) -> str:
+ if name.endswith(self._tmp_suffix):
+ return name
+ return name + self._tmp_suffix


  @dataclass(frozen=True)
- class Explosion(Projection):
+ class Explosion(_HasTmpSuffix, Projection):
  kwargs: Mapping[str, str] = field(default_factory=lambda: MappingProxyType({}))
  primary_index_key: str = "context.span_id"

- position_prefix: str = field(init=False, repr=False)
- primary_index: Projection = field(init=False, repr=False)
+ _position_prefix: str = field(init=False, repr=False)
+ _primary_index: Projection = field(init=False, repr=False)
+ _array_tmp_col_label: str = field(init=False, repr=False)
+ """For sqlite we need to store the array in a temporary column to be able
+ to explode it later in pandas. `_array_tmp_col_label` is the name of this
+ temporary column. The temporary column will have a unique name
+ per instance.
+ """

  def __post_init__(self) -> None:
  super().__post_init__()
  position_prefix = _PRESCRIBED_POSITION_PREFIXES.get(self.key, "")
- object.__setattr__(self, "position_prefix", position_prefix)
- object.__setattr__(self, "primary_index", Projection(self.primary_index_key))
+ object.__setattr__(self, "_position_prefix", position_prefix)
+ object.__setattr__(self, "_primary_index", Projection(self.primary_index_key))
+ object.__setattr__(self, "_array_tmp_col_label", f"__array_tmp_col_{random()}")

  @cached_property
- def index_keys(self) -> Tuple[str, str]:
- return (self.primary_index.key, f"{self.position_prefix}position")
-
- def with_primary_index_key(self, primary_index_key: str) -> "Explosion":
- return replace(self, primary_index_key=primary_index_key)
-
- def __call__(self, span: Span) -> Iterator[Dict[str, Any]]:
- if not isinstance(seq := self.value(span), Iterable):
- return
- has_mapping = False
- for item in seq:
- if isinstance(item, Mapping):
- has_mapping = True
- break
- if not has_mapping:
- for i, item in enumerate(seq):
- if item is not None:
- yield {
- self.key: item,
- self.primary_index.key: self.primary_index(span),
- f"{self.position_prefix}position": i,
- }
- return
- for i, item in enumerate(seq):
- if not isinstance(item, Mapping):
- continue
- record = (
- {name: item.get(key) for name, key in self.kwargs.items()}
- if self.kwargs
- else dict(item)
+ def index_keys(self) -> List[str]:
+ return [self._primary_index.key, f"{self._position_prefix}position"]
+
+ def with_primary_index_key(self, _: str) -> "Explosion":
+ print("`.with_primary_index_key(...)` is deprecated and will be removed in the future.")
+ return self
+
+ def update_sql(
+ self,
+ stmt: Select[Any],
+ dialect: SupportedSQLDialect,
+ ) -> Select[Any]:
+ array = self()
+ if dialect is SupportedSQLDialect.SQLITE:
+ # Because sqlite doesn't support `WITH ORDINALITY`, the order of
+ # the returned (table) values is not guaranteed. So we resort to
+ # post hoc processing using pandas.
+ stmt = stmt.where(
+ func.json_type(array) == "array",
+ ).add_columns(
+ array.label(self._array_tmp_col_label),
  )
- for v in record.values():
- if v is not None:
- break
+ return stmt
+ elif dialect is SupportedSQLDialect.POSTGRESQL:
+ element = (
+ func.jsonb_array_elements(array)
+ .table_valued(
+ Column("obj", JSON),
+ with_ordinality="position",
+ joins_implicitly=True,
+ )
+ .render_derived()
+ )
+ obj, position = element.c.obj, element.c.position
+ # Use zero-based indexing for backward-compatibility.
+ position_label = (position - 1).label(f"{self._position_prefix}position")
+ if self.kwargs:
+ columns: Iterable[Label[Any]] = (
+ obj[key.split(".")].label(self._add_tmp_suffix(name))
+ for name, key in self.kwargs.items()
+ )
  else:
- record = {}
- if not record:
- continue
- record[self.primary_index.key] = self.primary_index(span)
- record[f"{self.position_prefix}position"] = i
- yield record
+ columns = (obj.label(self._array_tmp_col_label),)
+ stmt = (
+ stmt.where(func.jsonb_typeof(array) == "array")
+ .where(func.jsonb_typeof(obj) == "object")
+ .add_columns(position_label, *columns)
+ )
+ return stmt
+ else:
+ assert_never(dialect)
+
+ def update_df(
+ self,
+ df: pd.DataFrame,
+ dialect: SupportedSQLDialect,
+ ) -> pd.DataFrame:
+ df = df.rename(self._remove_tmp_suffix, axis=1)
+ if df.empty:
+ columns = list(
+ set(
+ chain(
+ self.index_keys,
+ df.drop(self._array_tmp_col_label, axis=1, errors="ignore").columns,
+ self.kwargs.keys(),
+ )
+ )
+ )
+ df = pd.DataFrame(columns=columns).set_index(self.index_keys)
+ return df
+ if dialect != SupportedSQLDialect.SQLITE and self.kwargs:
+ df = df.set_index(self.index_keys)
+ return df
+ if dialect is SupportedSQLDialect.SQLITE:
+ # Because sqlite doesn't support `WITH ORDINALITY`, the order of
+ # the returned (table) values is not guaranteed. So we resort to
+ # post hoc processing using pandas.
+ def _extract_values(array: List[Any]) -> List[Dict[str, Any]]:
+ if not isinstance(array, Iterable):
+ return []
+ if not self.kwargs:
+ return [
+ {
+ **dict(flatten(obj)),
+ f"{self._position_prefix}position": i,
+ }
+ for i, obj in enumerate(array)
+ if isinstance(obj, Mapping)
+ ]
+ res: List[Dict[str, Any]] = []
+ for i, obj in enumerate(array):
+ if not isinstance(obj, Mapping):
+ continue
+ values: Dict[str, Any] = {f"{self._position_prefix}position": i}
+ for name, key in self.kwargs.items():
+ if (value := get_attribute_value(obj, key)) is not None:
+ values[name] = value
+ res.append(values)
+ return res
+
+ records = df.loc[:, self._array_tmp_col_label].dropna().map(_extract_values).explode()
+ elif dialect is SupportedSQLDialect.POSTGRESQL:
+ records = df.loc[:, self._array_tmp_col_label].dropna().map(flatten).map(dict)
+ else:
+ assert_never(dialect)
+ df = df.drop(self._array_tmp_col_label, axis=1)
+ if records.empty:
+ df = df.set_index(self.index_keys[0])
+ return df
+ df_explode = pd.DataFrame.from_records(records.to_list(), index=records.index)
+ if dialect is SupportedSQLDialect.SQLITE:
+ df = _outer_join(df, df_explode)
+ elif dialect is SupportedSQLDialect.POSTGRESQL:
+ df = pd.concat([df, df_explode], axis=1)
+ else:
+ assert_never(dialect)
+ df = df.set_index(self.index_keys)
+ return df

  def to_dict(self) -> Dict[str, Any]:
  return {
@@ -186,27 +282,126 @@ class Explosion(Projection):


  @dataclass(frozen=True)
- class Concatenation(Projection):
+ class Concatenation(_HasTmpSuffix, Projection):
  kwargs: Mapping[str, str] = field(default_factory=lambda: MappingProxyType({}))
  separator: str = "\n\n"

+ _array_tmp_col_label: str = field(init=False, repr=False)
+ """For SQLite we need to store the array in a temporary column to be able
+ to concatenate it later in pandas. `_array_tmp_col_label` is the name of
+ this temporary column. The temporary column will have a unique name
+ per instance.
+ """
+
+ def __post_init__(self) -> None:
+ super().__post_init__()
+ object.__setattr__(self, "_array_tmp_col_label", f"__array_tmp_col_{random()}")
+
  def with_separator(self, separator: str = "\n\n") -> "Concatenation":
  return replace(self, separator=separator)

- def __call__(self, span: Span) -> Iterator[Tuple[str, str]]:
- if not isinstance(seq := self.value(span), Iterable):
- return
- if not self.kwargs:
- yield self.key, self.separator.join(map(str, seq))
- record = defaultdict(list)
- for item in seq:
- if not isinstance(item, Mapping):
- continue
- for k, v in self.kwargs.items():
- if value := item.get(v):
- record[k].append(value)
- for name, values in record.items():
- yield name, self.separator.join(map(str, values))
+ def update_sql(
+ self,
+ stmt: Select[Any],
+ dialect: SupportedSQLDialect,
+ ) -> Select[Any]:
+ array = self()
+ if dialect is SupportedSQLDialect.SQLITE:
+ # Because SQLite doesn't support `WITH ORDINALITY`, the order of
+ # the returned table-values is not guaranteed. So we resort to
+ # post hoc processing using pandas.
+ stmt = stmt.where(
+ func.json_type(array) == "array",
+ ).add_columns(
+ array.label(self._array_tmp_col_label),
+ )
+ return stmt
+ elif dialect is SupportedSQLDialect.POSTGRESQL:
+ element = (
+ (
+ func.jsonb_array_elements(array)
+ if self.kwargs
+ else func.jsonb_array_elements_text(array)
+ )
+ .table_valued(
+ Column("obj", JSON),
+ with_ordinality="position",
+ joins_implicitly=True,
+ )
+ .render_derived()
+ )
+ obj, position = element.c.obj, element.c.position
+ if self.kwargs:
+ columns: Iterable[Label[Any]] = (
+ func.string_agg(
+ obj[key.split(".")].as_string(),
+ aggregate_order_by(self.separator, position), # type: ignore
+ ).label(self._add_tmp_suffix(label))
+ for label, key in self.kwargs.items()
+ )
+ else:
+ columns = (
+ func.string_agg(
+ obj,
+ aggregate_order_by(self.separator, position), # type: ignore
+ ).label(self.key),
+ )
+ stmt = (
+ stmt.where(
+ and_(
+ func.jsonb_typeof(array) == "array",
+ *((func.jsonb_typeof(obj) == "object",) if self.kwargs else ()),
+ )
+ )
+ .add_columns(*columns)
+ .group_by(*stmt.columns.keys())
+ )
+ return stmt
+ else:
+ assert_never(dialect)
+
+ def update_df(
+ self,
+ df: pd.DataFrame,
+ dialect: SupportedSQLDialect,
+ ) -> pd.DataFrame:
+ df = df.rename(self._remove_tmp_suffix, axis=1)
+ if df.empty:
+ columns = list(
+ set(
+ chain(
+ df.drop(self._array_tmp_col_label, axis=1, errors="ignore").columns,
+ self.kwargs.keys(),
+ )
+ )
+ )
+ return pd.DataFrame(columns=columns, index=df.index)
+ if dialect is SupportedSQLDialect.SQLITE:
+ # Because SQLite doesn't support `WITH ORDINALITY`, the order of
+ # the returned table-values is not guaranteed. So we resort to
+ # post hoc processing using pandas.
+ def _concat_values(array: List[Any]) -> Dict[str, Any]:
+ if not isinstance(array, Iterable):
+ return {}
+ if not self.kwargs:
+ return {self.key: self.separator.join(str(obj) for obj in array)}
+ values: DefaultDict[str, List[str]] = defaultdict(list)
+ for i, obj in enumerate(array):
+ if not isinstance(obj, Mapping):
+ continue
+ for label, key in self.kwargs.items():
+ if (value := get_attribute_value(obj, key)) is not None:
+ values[label].append(str(value))
+ return {label: self.separator.join(vs) for label, vs in values.items()}
+
+ records = df.loc[:, self._array_tmp_col_label].map(_concat_values)
+ df_concat = pd.DataFrame.from_records(records.to_list(), index=records.index)
+ return df.drop(self._array_tmp_col_label, axis=1).join(df_concat, how="outer")
+ elif dialect is SupportedSQLDialect.POSTGRESQL:
+ pass
+ else:
+ assert_never(dialect)
+ return df

  def to_dict(self) -> Dict[str, Any]:
  return {
@@ -233,13 +428,24 @@ class Concatenation(Projection):


  @dataclass(frozen=True)
- class SpanQuery:
+ class SpanQuery(_HasTmpSuffix):
  _select: Mapping[str, Projection] = field(default_factory=lambda: MappingProxyType({}))
- _concat: Concatenation = field(default_factory=Concatenation)
- _explode: Explosion = field(default_factory=Explosion)
- _filter: SpanFilter = field(default_factory=SpanFilter)
+ _concat: Optional[Concatenation] = field(default=None)
+ _explode: Optional[Explosion] = field(default=None)
+ _filter: Optional[SpanFilter] = field(default=None)
  _rename: Mapping[str, str] = field(default_factory=lambda: MappingProxyType({}))
  _index: Projection = field(default_factory=lambda: Projection("context.span_id"))
+ _concat_separator: str = field(default="\n\n", repr=False)
+ _pk_tmp_col_label: str = field(init=False, repr=False)
+ """We use `_pk_tmp_col_label` as a temporary column for storing
+ the row id, i.e. the primary key, of the spans table. This will help
+ us with joins without the risk of naming conflicts. The temporary
+ column will have a unique name per instance.
+ """
+
+ def __post_init__(self) -> None:
+ super().__post_init__()
+ object.__setattr__(self, "_pk_tmp_col_label", f"__pk_tmp_col_{random()}")

  def __bool__(self) -> bool:
  return bool(self._select) or bool(self._filter) or bool(self._explode) or bool(self._concat)
@@ -255,11 +461,21 @@ class SpanQuery:
  return replace(self, _filter=_filter)

  def explode(self, key: str, **kwargs: str) -> "SpanQuery":
+ assert (
+ isinstance(key, str) and key
+ ), "The field name for explosion must be a non-empty string."
  _explode = Explosion(key=key, kwargs=kwargs, primary_index_key=self._index.key)
  return replace(self, _explode=_explode)

  def concat(self, key: str, **kwargs: str) -> "SpanQuery":
- _concat = Concatenation(key=key, kwargs=kwargs)
+ assert (
+ isinstance(key, str) and key
+ ), "The field name for concatenation must be a non-empty string."
+ _concat = (
+ Concatenation(key=key, kwargs=kwargs, separator=self._concat.separator)
+ if self._concat
+ else Concatenation(key=key, kwargs=kwargs, separator=self._concat_separator)
+ )
  return replace(self, _concat=_concat)

  def rename(self, **kwargs: str) -> "SpanQuery":
@@ -268,75 +484,136 @@ class SpanQuery:

  def with_index(self, key: str = "context.span_id") -> "SpanQuery":
  _index = Projection(key=key)
- return replace(self, _index=_index)
+ return (
+ replace(self, _index=_index, _explode=replace(self._explode, primary_index_key=key))
+ if self._explode
+ else replace(self, _index=_index)
+ )

  def with_concat_separator(self, separator: str = "\n\n") -> "SpanQuery":
+ if not self._concat:
+ return replace(self, _concat_separator=separator)
  _concat = self._concat.with_separator(separator)
  return replace(self, _concat=_concat)

- def with_explode_primary_index_key(self, primary_index_key: str) -> "SpanQuery":
- _explode = self._explode.with_primary_index_key(primary_index_key)
- return replace(self, _explode=_explode)
-
- def __call__(self, spans: Iterable[Span]) -> pd.DataFrame:
- if self._filter:
- spans = filter(self._filter, spans)
- if self._explode:
- spans = filter(
- lambda span: (isinstance(seq := self._explode.value(span), Sized) and len(seq)),
- spans,
- )
- if self._concat:
- spans = filter(
- lambda span: (isinstance(seq := self._concat.value(span), Sized) and len(seq)),
- spans,
+ def with_explode_primary_index_key(self, _: str) -> "SpanQuery":
+ print(
+ "`.with_explode_primary_index_key(...)` is deprecated and will be "
+ "removed in the future. Use `.with_index(...)` instead."
+ )
+ return self
+
+ def __call__(
+ self,
+ session: Session,
+ project_name: Optional[str] = None,
+ start_time: Optional[datetime] = None,
+ end_time: Optional[datetime] = None,
+ limit: Optional[int] = DEFAULT_SPAN_LIMIT,
+ root_spans_only: Optional[bool] = None,
+ # Deprecated
+ stop_time: Optional[datetime] = None,
+ ) -> pd.DataFrame:
+ if not project_name:
+ project_name = DEFAULT_PROJECT_NAME
+ if stop_time:
+ # Deprecated. Raise a warning
+ warnings.warn(
+ "stop_time is deprecated. Use end_time instead.",
+ DeprecationWarning,
  )
+ end_time = end_time or stop_time
  if not (self._select or self._explode or self._concat):
- if not (data := [json.loads(span_to_json(span)) for span in spans]):
- return pd.DataFrame()
- return (
- pd.json_normalize(data, max_level=1)
- .rename(self._rename, axis=1, errors="ignore")
- .set_index("context.span_id", drop=False)
+ return _get_spans_dataframe(
+ session,
+ project_name,
+ span_filter=self._filter,
+ start_time=start_time,
+ end_time=end_time,
+ limit=limit,
+ root_spans_only=root_spans_only,
  )
- _selected: List[Dict[str, Any]] = []
- _exploded: List[Dict[str, Any]] = []
- for span in spans:
- if self._select:
- record = {name: proj(span) for name, proj in self._select.items()}
- for v in record.values():
- if v is not None:
- break
- else:
- record = {}
- if self._concat:
- record.update(self._concat(span))
- if record:
- if self._index.key not in record:
- record[self._index.key] = self._index(span)
- _selected.append(record)
- elif self._concat:
- record = {self._index.key: self._index(span)}
- record.update(self._concat(span))
- if record:
- _selected.append(record)
- if self._explode:
- _exploded.extend(self._explode(span))
- if _selected:
- select_df = pd.DataFrame(_selected)
- else:
- select_df = pd.DataFrame(columns=[self._index.key])
- select_df = select_df.set_index(self._index.key)
+ assert session.bind is not None
+ dialect = SupportedSQLDialect(session.bind.dialect.name)
+ row_id = models.Span.id.label(self._pk_tmp_col_label)
+ stmt: Select[Any] = (
+ # We do not allow `group_by` anything other than `row_id` because otherwise
+ # it's too complex for the post hoc processing step in pandas.
+ select(row_id)
+ .join(models.Trace)
+ .join(models.Project)
+ .where(models.Project.name == project_name)
+ )
+ if start_time:
+ stmt = stmt.where(start_time <= models.Span.start_time)
+ if end_time:
+ stmt = stmt.where(models.Span.start_time < end_time)
+ if limit is not None:
+ stmt = stmt.limit(limit)
+ if root_spans_only:
+ parent = aliased(models.Span)
+ stmt = stmt.outerjoin(
+ parent,
+ models.Span.parent_id == parent.span_id,
+ ).where(parent.span_id == None) # noqa E711
+ stmt0_orig: Select[Any] = stmt
+ stmt1_filter: Optional[Select[Any]] = None
+ if self._filter:
+ stmt = stmt1_filter = self._filter(stmt)
+ stmt2_select: Optional[Select[Any]] = None
+ if self._select:
+ columns: Iterable[Label[Any]] = (
+ proj().label(self._add_tmp_suffix(label)) for label, proj in self._select.items()
+ )
+ stmt = stmt2_select = stmt.add_columns(*columns)
+ stmt3_explode: Optional[Select[Any]] = None
  if self._explode:
- if _exploded:
- explode_df = pd.DataFrame(_exploded)
+ stmt = stmt3_explode = self._explode.update_sql(stmt, dialect)
+ index: Label[Any] = self._index().label(self._add_tmp_suffix(self._index.key))
+ df: Optional[pd.DataFrame] = None
+ # `concat` is done separately because it has `group_by` but we can't
+ # always join to it as a subquery because it may require post hoc
+ # processing in pandas. It's kept separate for simplicity.
+ df_concat: Optional[pd.DataFrame] = None
+ conn = session.connection()
+ if self._explode or not self._concat:
+ if index.name not in stmt.selected_columns.keys():
+ stmt = stmt.add_columns(index)
+ df = pd.read_sql_query(stmt, conn, self._pk_tmp_col_label)
+ if self._concat:
+ if df is not None:
+ assert stmt3_explode is not None
+ # We can't include stmt3_explode because it may be trying to
+ # explode the same column that we're trying to concatenate,
+ # resulting in duplicated joins.
+ stmt_no_explode = (
+ stmt2_select
+ if stmt2_select is not None
+ else (stmt1_filter if stmt1_filter is not None else stmt0_orig)
+ )
+ stmt4_concat = stmt_no_explode.with_only_columns(row_id)
  else:
- explode_df = pd.DataFrame(columns=self._explode.index_keys)
- explode_df = explode_df.set_index(list(self._explode.index_keys))
- if not self._select:
- return explode_df.rename(self._rename, axis=1, errors="ignore")
- select_df = select_df.join(explode_df, how="outer")
- return select_df.rename(self._rename, axis=1, errors="ignore")
+ assert stmt3_explode is None
+ stmt4_concat = stmt
+ if (df is None or df.empty) and index.name not in stmt4_concat.selected_columns.keys():
+ stmt4_concat = stmt4_concat.add_columns(index)
+ stmt4_concat = self._concat.update_sql(stmt4_concat, dialect)
+ df_concat = pd.read_sql_query(stmt4_concat, conn, self._pk_tmp_col_label)
+ df_concat = self._concat.update_df(df_concat, dialect)
+ assert df is not None or df_concat is not None
+ if df is None:
+ df = df_concat
+ elif df_concat is not None:
+ df = _outer_join(df, df_concat)
+ assert df is not None and self._pk_tmp_col_label not in df.columns
+ df = df.rename(self._remove_tmp_suffix, axis=1)
+ if self._explode:
+ df = self._explode.update_df(df, dialect)
+ else:
+ df = df.set_index(self._index.key)
+ df = df.rename(_ALIASES, axis=1, errors="ignore")
+ df = df.rename(self._rename, axis=1, errors="ignore")
+ return df

  def to_dict(self) -> Dict[str, Any]:
  return {
@@ -345,9 +622,9 @@ class SpanQuery:
  if self._select
  else {}
  ),
- "filter": self._filter.to_dict(),
- "explode": self._explode.to_dict(),
- "concat": self._concat.to_dict(),
+ **({"filter": self._filter.to_dict()} if self._filter else {}),
+ **({"explode": self._explode.to_dict()} if self._explode else {}),
+ **({"concat": self._concat.to_dict()} if self._concat else {}),
  **({"rename": dict(self._rename)} if self._rename else {}),
  "index": self._index.to_dict(),
  }
@@ -356,7 +633,6 @@ class SpanQuery:
  def from_dict(
  cls,
  obj: Mapping[str, Any],
- evals: Optional[SupportsGetSpanEvaluation] = None,
  valid_eval_names: Optional[Sequence[str]] = None,
  ) -> "SpanQuery":
  return cls(
@@ -376,7 +652,6 @@ class SpanQuery:
  {
  "_filter": SpanFilter.from_dict(
  cast(Mapping[str, Any], filter),
- evals=evals,
  valid_eval_names=valid_eval_names,
  )
  } # type: ignore
@@ -386,11 +661,13 @@ class SpanQuery:
  **(
  {"_explode": Explosion.from_dict(cast(Mapping[str, Any], explode))} # type: ignore
  if (explode := obj.get("explode"))
+ and explode.get("key") # check `key` for backward-compatible truthiness
  else {}
  ),
  **(
  {"_concat": Concatenation.from_dict(cast(Mapping[str, Any], concat))} # type: ignore
  if (concat := obj.get("concat"))
+ and concat.get("key") # check `key` for backward-compatible truthiness
  else {}
  ),
  **(
@@ -404,3 +681,103 @@ class SpanQuery:
  else {}
  ),
  )
+
+
+ def _get_spans_dataframe(
+ session: Session,
+ project_name: str,
+ /,
+ *,
+ span_filter: Optional[SpanFilter] = None,
+ start_time: Optional[datetime] = None,
+ end_time: Optional[datetime] = None,
+ limit: Optional[int] = DEFAULT_SPAN_LIMIT,
+ root_spans_only: Optional[bool] = None,
+ # Deprecated
+ stop_time: Optional[datetime] = None,
+ ) -> pd.DataFrame:
+ # use legacy labels for backward-compatibility
+ span_id_label = "context.span_id"
+ trace_id_label = "context.trace_id"
+ if stop_time:
+ # Deprecated. Raise a warning
+ warnings.warn(
+ "stop_time is deprecated. Use end_time instead.",
+ DeprecationWarning,
+ )
+ end_time = end_time or stop_time
+ stmt: Select[Any] = (
+ select(
+ models.Span.name,
+ models.Span.span_kind,
+ models.Span.parent_id,
+ models.Span.start_time,
+ models.Span.end_time,
+ models.Span.status_code,
+ models.Span.status_message,
+ models.Span.events,
+ models.Span.span_id.label(span_id_label),
+ models.Trace.trace_id.label(trace_id_label),
+ models.Span.attributes,
+ )
+ .join(models.Trace)
+ .join(models.Project)
+ .where(models.Project.name == project_name)
+ )
+ if span_filter:
+ stmt = span_filter(stmt)
+ if start_time:
+ stmt = stmt.where(start_time <= models.Span.start_time)
+ if end_time:
+ stmt = stmt.where(models.Span.start_time < end_time)
+ if limit is not None:
+ stmt = stmt.limit(limit)
+ if root_spans_only:
+ parent = aliased(models.Span)
+ stmt = stmt.outerjoin(
+ parent,
+ models.Span.parent_id == parent.span_id,
+ ).where(parent.span_id == None) # noqa E711
+ conn = session.connection()
+ # set `drop=False` for backward-compatibility
+ df = pd.read_sql_query(stmt, conn).set_index(span_id_label, drop=False)
+ if df.empty:
+ return df.drop("attributes", axis=1)
+ df_attributes = pd.DataFrame.from_records(
+ df.attributes.map(_flatten_semantic_conventions),
+ ).set_axis(df.index, axis=0)
+ df = pd.concat(
+ [
+ df.drop("attributes", axis=1),
+ df_attributes.add_prefix("attributes" + "."),
+ ],
+ axis=1,
+ )
+ return df
+
+
+ def _outer_join(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame:
+ if (columns_intersection := left.columns.intersection(right.columns)).empty:
+ df = left.join(right, how="outer")
+ else:
+ df = left.join(right, how="outer", lsuffix="_L", rsuffix="_R")
+ for col in columns_intersection:
+ df.loc[:, col] = df.loc[:, f"{col}_L"].fillna(df.loc[:, f"{col}_R"])
+ df = df.drop([f"{col}_L", f"{col}_R"], axis=1)
+ return df
+
+
+ def _flatten_semantic_conventions(attributes: Mapping[str, Any]) -> Dict[str, Any]:
+ # This may be inefficient, but is needed to preserve backward-compatibility.
+ # For example, custom attributes do not get flattened.
+ ans = unflatten(
+ load_json_strings(
+ flatten(
+ attributes,
+ recurse_on_sequence=True,
+ json_string_attributes=JSON_STRING_ATTRIBUTES,
+ ),
+ ),
+ prefix_exclusions=SEMANTIC_CONVENTIONS,
+ )
+ return ans
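
Usage sketch (not part of the diff above): a minimal illustration of how the reworked SpanQuery.__call__ might be driven against a SQLAlchemy session, based on the signature introduced in this release. The import path, database URL, project name, and the "retrieval.documents"/"document.content"/"document.score" attribute keys are assumptions for illustration; an existing, already-migrated Phoenix database is assumed.

    from sqlalchemy import create_engine
    from sqlalchemy.orm import Session

    from phoenix.trace.dsl.query import SpanQuery  # assumed import path

    engine = create_engine("sqlite:///phoenix.db")  # assumed Phoenix SQLite database
    # Explode a list-valued attribute into one row per item, per the new
    # Explosion.update_sql/update_df path; kwargs map output columns to item keys.
    query = SpanQuery().explode(
        "retrieval.documents",
        content="document.content",  # assumed OpenInference document keys
        score="document.score",
    )
    with Session(engine) as session:
        # project_name, limit, etc. mirror the parameters added in 4.0.1
        df = query(session, project_name="default", limit=100)
    print(df.head())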