arize 8.0.0a22__py3-none-any.whl → 8.0.0a23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arize/__init__.py +17 -9
- arize/_exporter/client.py +55 -36
- arize/_exporter/parsers/tracing_data_parser.py +41 -30
- arize/_exporter/validation.py +3 -3
- arize/_flight/client.py +207 -76
- arize/_generated/api_client/__init__.py +30 -6
- arize/_generated/api_client/api/__init__.py +1 -0
- arize/_generated/api_client/api/datasets_api.py +864 -190
- arize/_generated/api_client/api/experiments_api.py +167 -131
- arize/_generated/api_client/api/projects_api.py +1197 -0
- arize/_generated/api_client/api_client.py +2 -2
- arize/_generated/api_client/configuration.py +42 -34
- arize/_generated/api_client/exceptions.py +2 -2
- arize/_generated/api_client/models/__init__.py +15 -4
- arize/_generated/api_client/models/dataset.py +10 -10
- arize/_generated/api_client/models/dataset_example.py +111 -0
- arize/_generated/api_client/models/dataset_example_update.py +100 -0
- arize/_generated/api_client/models/dataset_version.py +13 -13
- arize/_generated/api_client/models/datasets_create_request.py +16 -8
- arize/_generated/api_client/models/datasets_examples_insert_request.py +100 -0
- arize/_generated/api_client/models/datasets_examples_list200_response.py +106 -0
- arize/_generated/api_client/models/datasets_examples_update_request.py +102 -0
- arize/_generated/api_client/models/datasets_list200_response.py +10 -4
- arize/_generated/api_client/models/experiment.py +14 -16
- arize/_generated/api_client/models/experiment_run.py +108 -0
- arize/_generated/api_client/models/experiment_run_create.py +102 -0
- arize/_generated/api_client/models/experiments_create_request.py +16 -10
- arize/_generated/api_client/models/experiments_list200_response.py +10 -4
- arize/_generated/api_client/models/experiments_runs_list200_response.py +19 -5
- arize/_generated/api_client/models/{error.py → pagination_metadata.py} +13 -11
- arize/_generated/api_client/models/primitive_value.py +172 -0
- arize/_generated/api_client/models/problem.py +100 -0
- arize/_generated/api_client/models/project.py +99 -0
- arize/_generated/api_client/models/{datasets_list_examples200_response.py → projects_create_request.py} +13 -11
- arize/_generated/api_client/models/projects_list200_response.py +106 -0
- arize/_generated/api_client/rest.py +2 -2
- arize/_generated/api_client/test/test_dataset.py +4 -2
- arize/_generated/api_client/test/test_dataset_example.py +56 -0
- arize/_generated/api_client/test/test_dataset_example_update.py +52 -0
- arize/_generated/api_client/test/test_dataset_version.py +7 -2
- arize/_generated/api_client/test/test_datasets_api.py +27 -13
- arize/_generated/api_client/test/test_datasets_create_request.py +8 -4
- arize/_generated/api_client/test/{test_datasets_list_examples200_response.py → test_datasets_examples_insert_request.py} +19 -15
- arize/_generated/api_client/test/test_datasets_examples_list200_response.py +66 -0
- arize/_generated/api_client/test/test_datasets_examples_update_request.py +61 -0
- arize/_generated/api_client/test/test_datasets_list200_response.py +9 -3
- arize/_generated/api_client/test/test_experiment.py +2 -4
- arize/_generated/api_client/test/test_experiment_run.py +56 -0
- arize/_generated/api_client/test/test_experiment_run_create.py +54 -0
- arize/_generated/api_client/test/test_experiments_api.py +6 -6
- arize/_generated/api_client/test/test_experiments_create_request.py +9 -6
- arize/_generated/api_client/test/test_experiments_list200_response.py +9 -5
- arize/_generated/api_client/test/test_experiments_runs_list200_response.py +15 -5
- arize/_generated/api_client/test/test_pagination_metadata.py +53 -0
- arize/_generated/api_client/test/{test_error.py → test_primitive_value.py} +13 -14
- arize/_generated/api_client/test/test_problem.py +57 -0
- arize/_generated/api_client/test/test_project.py +58 -0
- arize/_generated/api_client/test/test_projects_api.py +59 -0
- arize/_generated/api_client/test/test_projects_create_request.py +54 -0
- arize/_generated/api_client/test/test_projects_list200_response.py +70 -0
- arize/_generated/api_client_README.md +43 -29
- arize/_generated/protocol/flight/flight_pb2.py +400 -0
- arize/_lazy.py +27 -19
- arize/client.py +268 -55
- arize/config.py +365 -116
- arize/constants/__init__.py +1 -0
- arize/constants/config.py +11 -4
- arize/constants/ml.py +6 -4
- arize/constants/openinference.py +2 -0
- arize/constants/pyarrow.py +2 -0
- arize/constants/spans.py +3 -1
- arize/datasets/__init__.py +1 -0
- arize/datasets/client.py +299 -84
- arize/datasets/errors.py +32 -2
- arize/datasets/validation.py +18 -8
- arize/embeddings/__init__.py +2 -0
- arize/embeddings/auto_generator.py +23 -19
- arize/embeddings/base_generators.py +89 -36
- arize/embeddings/constants.py +2 -0
- arize/embeddings/cv_generators.py +26 -4
- arize/embeddings/errors.py +27 -5
- arize/embeddings/nlp_generators.py +31 -12
- arize/embeddings/tabular_generators.py +32 -20
- arize/embeddings/usecases.py +12 -2
- arize/exceptions/__init__.py +1 -0
- arize/exceptions/auth.py +11 -1
- arize/exceptions/base.py +29 -4
- arize/exceptions/models.py +21 -2
- arize/exceptions/parameters.py +31 -0
- arize/exceptions/spaces.py +12 -1
- arize/exceptions/types.py +86 -7
- arize/exceptions/values.py +220 -20
- arize/experiments/__init__.py +1 -0
- arize/experiments/client.py +389 -285
- arize/experiments/evaluators/__init__.py +1 -0
- arize/experiments/evaluators/base.py +74 -41
- arize/experiments/evaluators/exceptions.py +6 -3
- arize/experiments/evaluators/executors.py +121 -73
- arize/experiments/evaluators/rate_limiters.py +106 -57
- arize/experiments/evaluators/types.py +34 -7
- arize/experiments/evaluators/utils.py +65 -27
- arize/experiments/functions.py +103 -101
- arize/experiments/tracing.py +52 -44
- arize/experiments/types.py +56 -31
- arize/logging.py +54 -22
- arize/models/__init__.py +1 -0
- arize/models/batch_validation/__init__.py +1 -0
- arize/models/batch_validation/errors.py +543 -65
- arize/models/batch_validation/validator.py +339 -300
- arize/models/bounded_executor.py +20 -7
- arize/models/casting.py +75 -29
- arize/models/client.py +326 -107
- arize/models/proto.py +95 -40
- arize/models/stream_validation.py +42 -14
- arize/models/surrogate_explainer/__init__.py +1 -0
- arize/models/surrogate_explainer/mimic.py +24 -13
- arize/pre_releases.py +43 -0
- arize/projects/__init__.py +1 -0
- arize/projects/client.py +129 -0
- arize/regions.py +40 -0
- arize/spans/__init__.py +1 -0
- arize/spans/client.py +130 -106
- arize/spans/columns.py +13 -0
- arize/spans/conversion.py +54 -38
- arize/spans/validation/__init__.py +1 -0
- arize/spans/validation/annotations/__init__.py +1 -0
- arize/spans/validation/annotations/annotations_validation.py +6 -4
- arize/spans/validation/annotations/dataframe_form_validation.py +13 -11
- arize/spans/validation/annotations/value_validation.py +35 -11
- arize/spans/validation/common/__init__.py +1 -0
- arize/spans/validation/common/argument_validation.py +33 -8
- arize/spans/validation/common/dataframe_form_validation.py +35 -9
- arize/spans/validation/common/errors.py +211 -11
- arize/spans/validation/common/value_validation.py +80 -13
- arize/spans/validation/evals/__init__.py +1 -0
- arize/spans/validation/evals/dataframe_form_validation.py +28 -8
- arize/spans/validation/evals/evals_validation.py +34 -4
- arize/spans/validation/evals/value_validation.py +26 -3
- arize/spans/validation/metadata/__init__.py +1 -1
- arize/spans/validation/metadata/argument_validation.py +14 -5
- arize/spans/validation/metadata/dataframe_form_validation.py +26 -10
- arize/spans/validation/metadata/value_validation.py +24 -10
- arize/spans/validation/spans/__init__.py +1 -0
- arize/spans/validation/spans/dataframe_form_validation.py +34 -13
- arize/spans/validation/spans/spans_validation.py +35 -4
- arize/spans/validation/spans/value_validation.py +76 -7
- arize/types.py +293 -157
- arize/utils/__init__.py +1 -0
- arize/utils/arrow.py +31 -15
- arize/utils/cache.py +34 -6
- arize/utils/dataframe.py +19 -2
- arize/utils/online_tasks/__init__.py +2 -0
- arize/utils/online_tasks/dataframe_preprocessor.py +53 -41
- arize/utils/openinference_conversion.py +44 -5
- arize/utils/proto.py +10 -0
- arize/utils/size.py +5 -3
- arize/version.py +3 -1
- {arize-8.0.0a22.dist-info → arize-8.0.0a23.dist-info}/METADATA +4 -3
- arize-8.0.0a23.dist-info/RECORD +174 -0
- {arize-8.0.0a22.dist-info → arize-8.0.0a23.dist-info}/WHEEL +1 -1
- arize-8.0.0a23.dist-info/licenses/LICENSE +176 -0
- arize-8.0.0a23.dist-info/licenses/NOTICE +13 -0
- arize/_generated/protocol/flight/export_pb2.py +0 -61
- arize/_generated/protocol/flight/ingest_pb2.py +0 -365
- arize-8.0.0a22.dist-info/RECORD +0 -146
- arize-8.0.0a22.dist-info/licenses/LICENSE.md +0 -12
arize/regions.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""Region definitions and configuration for Arize deployment zones."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from enum import StrEnum
|
|
5
|
+
|
|
6
|
+
from arize.constants.config import DEFAULT_FLIGHT_PORT
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class Region(StrEnum):
|
|
10
|
+
"""Enum representing available Arize deployment regions."""
|
|
11
|
+
|
|
12
|
+
US_CENTRAL_1 = "us-central-1a"
|
|
13
|
+
EU_WEST_1 = "eu-west-1a"
|
|
14
|
+
CA_CENTRAL_1 = "ca-central-1a"
|
|
15
|
+
US_EAST_1 = "us-east-1b"
|
|
16
|
+
UNSPECIFIED = ""
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass(frozen=True)
|
|
20
|
+
class RegionEndpoints:
|
|
21
|
+
"""Container for region-specific API endpoint hostnames and ports."""
|
|
22
|
+
|
|
23
|
+
api_host: str
|
|
24
|
+
otlp_host: str
|
|
25
|
+
flight_host: str
|
|
26
|
+
flight_port: int
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _get_region_endpoints(region: Region) -> RegionEndpoints:
|
|
30
|
+
return RegionEndpoints(
|
|
31
|
+
api_host=f"api.{region}.arize.com",
|
|
32
|
+
otlp_host=f"otlp.{region}.arize.com",
|
|
33
|
+
flight_host=f"flight.{region}.arize.com",
|
|
34
|
+
flight_port=DEFAULT_FLIGHT_PORT,
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
REGION_ENDPOINTS: dict[Region, RegionEndpoints] = {
|
|
39
|
+
r: _get_region_endpoints(r) for r in Region if r != Region.UNSPECIFIED
|
|
40
|
+
}
|
arize/spans/__init__.py
CHANGED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""LLM tracing spans functionality for the Arize SDK."""
|
arize/spans/client.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
"""Client implementation for managing spans and traces in the Arize platform."""
|
|
2
|
+
|
|
1
3
|
# type: ignore[pb2]
|
|
2
4
|
from __future__ import annotations
|
|
3
5
|
|
|
@@ -6,7 +8,7 @@ import logging
|
|
|
6
8
|
import re
|
|
7
9
|
from datetime import datetime, timezone
|
|
8
10
|
from functools import partial
|
|
9
|
-
from typing import TYPE_CHECKING, Any
|
|
11
|
+
from typing import TYPE_CHECKING, Any
|
|
10
12
|
|
|
11
13
|
import numpy as np
|
|
12
14
|
import pandas as pd
|
|
@@ -16,10 +18,6 @@ from google.protobuf import json_format, message
|
|
|
16
18
|
from arize._exporter.client import ArizeExportClient
|
|
17
19
|
from arize._flight.client import ArizeFlightClient, FlightPostArrowFileResponse
|
|
18
20
|
from arize._flight.types import FlightRequestType
|
|
19
|
-
from arize._generated.protocol.flight.ingest_pb2 import (
|
|
20
|
-
WriteSpanAnnotationResponse,
|
|
21
|
-
WriteSpanEvaluationResponse,
|
|
22
|
-
)
|
|
23
21
|
from arize.constants.spans import DEFAULT_DATETIME_FMT
|
|
24
22
|
from arize.exceptions.base import (
|
|
25
23
|
INVALID_ARROW_CONVERSION_MSG,
|
|
@@ -29,9 +27,7 @@ from arize.exceptions.models import MissingProjectNameError
|
|
|
29
27
|
from arize.exceptions.spaces import MissingSpaceIDError
|
|
30
28
|
from arize.logging import CtxAdapter
|
|
31
29
|
from arize.types import Environments, SimilaritySearchParams
|
|
32
|
-
from arize.utils.arrow import
|
|
33
|
-
post_arrow_table,
|
|
34
|
-
)
|
|
30
|
+
from arize.utils.arrow import post_arrow_table
|
|
35
31
|
from arize.utils.dataframe import (
|
|
36
32
|
remove_extraneous_columns,
|
|
37
33
|
reset_dataframe_index,
|
|
@@ -41,13 +37,21 @@ from arize.utils.proto import get_pb_schema_tracing
|
|
|
41
37
|
if TYPE_CHECKING:
|
|
42
38
|
import requests
|
|
43
39
|
|
|
40
|
+
from arize._generated.protocol.flight import flight_pb2
|
|
44
41
|
from arize.config import SDKConfiguration
|
|
45
42
|
|
|
46
43
|
logger = logging.getLogger(__name__)
|
|
47
44
|
|
|
48
45
|
|
|
49
46
|
class SpansClient:
|
|
50
|
-
|
|
47
|
+
"""Client for logging LLM tracing spans and evaluations to Arize."""
|
|
48
|
+
|
|
49
|
+
def __init__(self, *, sdk_config: SDKConfiguration) -> None:
|
|
50
|
+
"""Initialize the spans client with SDK configuration.
|
|
51
|
+
|
|
52
|
+
Args:
|
|
53
|
+
sdk_config: SDK configuration containing API endpoints and credentials.
|
|
54
|
+
"""
|
|
51
55
|
self._sdk_config = sdk_config
|
|
52
56
|
|
|
53
57
|
def log(
|
|
@@ -62,12 +66,14 @@ class SpansClient:
|
|
|
62
66
|
timeout: float | None = None,
|
|
63
67
|
tmp_dir: str = "",
|
|
64
68
|
) -> requests.Response:
|
|
65
|
-
"""
|
|
66
|
-
|
|
67
|
-
:class:`Response` object from the Requests HTTP library to ensure
|
|
68
|
-
records.
|
|
69
|
+
"""Logs a pandas dataframe containing LLM tracing data to Arize via a POST request.
|
|
70
|
+
|
|
71
|
+
Returns a :class:`Response` object from the Requests HTTP library to ensure
|
|
72
|
+
successful delivery of records.
|
|
69
73
|
|
|
70
74
|
Args:
|
|
75
|
+
space_id (str): The space ID where the project resides.
|
|
76
|
+
project_name (str): A unique name to identify your project in the Arize platform.
|
|
71
77
|
dataframe (pd.DataFrame): The dataframe containing the LLM traces.
|
|
72
78
|
evals_dataframe (pd.DataFrame, optional): A dataframe containing LLM evaluations data.
|
|
73
79
|
The evaluations are joined to their corresponding spans via a left outer join, i.e.,
|
|
@@ -76,11 +82,10 @@ class SpansClient:
|
|
|
76
82
|
Defaults to "%Y-%m-%dT%H:%M:%S.%f+00:00".
|
|
77
83
|
validate (bool, optional): When set to True, validation is run before sending data.
|
|
78
84
|
Defaults to True.
|
|
79
|
-
tmp_dir (str, optional): Temporary directory/file to store the serialized data in binary
|
|
80
|
-
before sending to Arize.
|
|
81
85
|
timeout (float, optional): You can stop waiting for a response after a given number
|
|
82
86
|
of seconds with the timeout parameter. Defaults to None.
|
|
83
|
-
|
|
87
|
+
tmp_dir (str, optional): Temporary directory/file to store the serialized data in binary
|
|
88
|
+
before sending to Arize.
|
|
84
89
|
|
|
85
90
|
Returns:
|
|
86
91
|
`Response` object
|
|
@@ -220,12 +225,12 @@ class SpansClient:
|
|
|
220
225
|
log.debug("Converting data to Arrow format")
|
|
221
226
|
pa_table = pa.Table.from_pandas(df, preserve_index=False)
|
|
222
227
|
except pa.ArrowInvalid as e:
|
|
223
|
-
log.
|
|
228
|
+
log.exception(INVALID_ARROW_CONVERSION_MSG)
|
|
224
229
|
raise pa.ArrowInvalid(
|
|
225
|
-
f"Error converting to Arrow format: {
|
|
230
|
+
f"Error converting to Arrow format: {e!s}"
|
|
226
231
|
) from e
|
|
227
|
-
except Exception
|
|
228
|
-
log.
|
|
232
|
+
except Exception:
|
|
233
|
+
log.exception("Unexpected error creating Arrow table")
|
|
229
234
|
raise
|
|
230
235
|
|
|
231
236
|
proto_schema = get_pb_schema_tracing(project_name=project_name)
|
|
@@ -262,27 +267,23 @@ class SpansClient:
|
|
|
262
267
|
force_http: bool = False,
|
|
263
268
|
timeout: float | None = None,
|
|
264
269
|
tmp_dir: str = "",
|
|
265
|
-
) -> WriteSpanEvaluationResponse:
|
|
266
|
-
"""
|
|
267
|
-
|
|
268
|
-
The dataframe must contain a column `context.span_id`
|
|
269
|
-
|
|
270
|
+
) -> flight_pb2.WriteSpanEvaluationResponse:
|
|
271
|
+
"""Logs a pandas dataframe containing LLM evaluations data to Arize via a Flight gRPC request.
|
|
272
|
+
|
|
273
|
+
The dataframe must contain a column `context.span_id` such that Arize can assign
|
|
274
|
+
each evaluation to its respective span.
|
|
270
275
|
|
|
271
276
|
Args:
|
|
277
|
+
space_id (str): The space ID where the project resides.
|
|
278
|
+
project_name (str): A unique name to identify your project in the Arize platform.
|
|
272
279
|
dataframe (pd.DataFrame): A dataframe containing LLM evaluations data.
|
|
273
|
-
model_id (str): A unique name to identify your model in the Arize platform.
|
|
274
|
-
(Deprecated: Use `project_name` instead.)
|
|
275
|
-
model_version (str, optional): Used to group a subset of traces a given
|
|
276
|
-
model_id to compare and track changes. It should match the model_id of the spans
|
|
277
|
-
sent previously, to which evaluations will be assigned. Defaults to None.
|
|
278
280
|
validate (bool, optional): When set to True, validation is run before sending data.
|
|
279
281
|
Defaults to True.
|
|
280
|
-
|
|
281
|
-
before sending to Arize.
|
|
282
|
+
force_http (bool, optional): Force the use of HTTP for data upload. Defaults to False.
|
|
282
283
|
timeout (float, optional): You can stop waiting for a response after a given number
|
|
283
284
|
of seconds with the timeout parameter. Defaults to None.
|
|
284
|
-
|
|
285
|
-
|
|
285
|
+
tmp_dir (str, optional): Temporary directory/file to store the serialized data in binary
|
|
286
|
+
before sending to Arize.
|
|
286
287
|
"""
|
|
287
288
|
from arize.spans.columns import EVAL_COLUMN_PATTERN, SPAN_SPAN_ID_COL
|
|
288
289
|
from arize.spans.validation.evals import evals_validation
|
|
@@ -358,12 +359,12 @@ class SpansClient:
|
|
|
358
359
|
log.debug("Converting data to Arrow format")
|
|
359
360
|
pa_table = pa.Table.from_pandas(evals_df, preserve_index=False)
|
|
360
361
|
except pa.ArrowInvalid as e:
|
|
361
|
-
log.
|
|
362
|
+
log.exception(INVALID_ARROW_CONVERSION_MSG)
|
|
362
363
|
raise pa.ArrowInvalid(
|
|
363
|
-
f"Error converting to Arrow format: {
|
|
364
|
+
f"Error converting to Arrow format: {e!s}"
|
|
364
365
|
) from e
|
|
365
|
-
except Exception
|
|
366
|
-
log.
|
|
366
|
+
except Exception:
|
|
367
|
+
log.exception("Unexpected error creating Arrow table")
|
|
367
368
|
raise
|
|
368
369
|
|
|
369
370
|
if force_http:
|
|
@@ -395,8 +396,8 @@ class SpansClient:
|
|
|
395
396
|
response = None
|
|
396
397
|
with ArizeFlightClient(
|
|
397
398
|
api_key=self._sdk_config.api_key,
|
|
398
|
-
host=self._sdk_config.
|
|
399
|
-
port=self._sdk_config.
|
|
399
|
+
host=self._sdk_config.flight_host,
|
|
400
|
+
port=self._sdk_config.flight_port,
|
|
400
401
|
scheme=self._sdk_config.flight_scheme,
|
|
401
402
|
request_verify=self._sdk_config.request_verify,
|
|
402
403
|
max_chunksize=self._sdk_config.pyarrow_max_chunksize,
|
|
@@ -409,8 +410,8 @@ class SpansClient:
|
|
|
409
410
|
request_type=request_type,
|
|
410
411
|
)
|
|
411
412
|
except Exception as e:
|
|
412
|
-
msg = f"Error during update request: {
|
|
413
|
-
log.
|
|
413
|
+
msg = f"Error during update request: {e!s}"
|
|
414
|
+
log.exception(msg)
|
|
414
415
|
raise RuntimeError(msg) from e
|
|
415
416
|
|
|
416
417
|
if response is None:
|
|
@@ -437,18 +438,18 @@ class SpansClient:
|
|
|
437
438
|
project_name: str,
|
|
438
439
|
dataframe: pd.DataFrame,
|
|
439
440
|
validate: bool = True,
|
|
440
|
-
) -> WriteSpanAnnotationResponse:
|
|
441
|
-
"""
|
|
442
|
-
|
|
443
|
-
The dataframe must contain a column `context.span_id`
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
free-form text notes.
|
|
441
|
+
) -> flight_pb2.WriteSpanAnnotationResponse:
|
|
442
|
+
"""Logs a pandas dataframe containing LLM span annotations to Arize via a Flight gRPC request.
|
|
443
|
+
|
|
444
|
+
The dataframe must contain a column `context.span_id` such that Arize can assign
|
|
445
|
+
each annotation to its respective span. Annotation columns should follow the pattern
|
|
446
|
+
`annotation.<name>.<suffix>` where suffix is either `label` or `score`. An optional
|
|
447
|
+
`annotation.notes` column can be included for free-form text notes.
|
|
448
448
|
|
|
449
449
|
Args:
|
|
450
|
-
|
|
450
|
+
space_id (str): The space ID where the project resides.
|
|
451
451
|
project_name (str): A unique name to identify your project in the Arize platform.
|
|
452
|
+
dataframe (pd.DataFrame): A dataframe containing LLM annotation data.
|
|
452
453
|
validate (bool, optional): When set to True, validation is run before sending data.
|
|
453
454
|
Defaults to True.
|
|
454
455
|
"""
|
|
@@ -588,12 +589,12 @@ class SpansClient:
|
|
|
588
589
|
log.debug("Converting data to Arrow format")
|
|
589
590
|
pa_table = pa.Table.from_pandas(anno_df, preserve_index=False)
|
|
590
591
|
except pa.ArrowInvalid as e:
|
|
591
|
-
log.
|
|
592
|
+
log.exception(INVALID_ARROW_CONVERSION_MSG)
|
|
592
593
|
raise pa.ArrowInvalid(
|
|
593
|
-
f"Error converting to Arrow format: {
|
|
594
|
+
f"Error converting to Arrow format: {e!s}"
|
|
594
595
|
) from e
|
|
595
|
-
except Exception
|
|
596
|
-
log.
|
|
596
|
+
except Exception:
|
|
597
|
+
log.exception("Unexpected error creating Arrow table")
|
|
597
598
|
raise
|
|
598
599
|
|
|
599
600
|
if ANNOTATION_NOTES_COLUMN_NAME in anno_df.columns:
|
|
@@ -611,8 +612,8 @@ class SpansClient:
|
|
|
611
612
|
response = None
|
|
612
613
|
with ArizeFlightClient(
|
|
613
614
|
api_key=self._sdk_config.api_key,
|
|
614
|
-
host=self._sdk_config.
|
|
615
|
-
port=self._sdk_config.
|
|
615
|
+
host=self._sdk_config.flight_host,
|
|
616
|
+
port=self._sdk_config.flight_port,
|
|
616
617
|
scheme=self._sdk_config.flight_scheme,
|
|
617
618
|
request_verify=self._sdk_config.request_verify,
|
|
618
619
|
max_chunksize=self._sdk_config.pyarrow_max_chunksize,
|
|
@@ -625,8 +626,8 @@ class SpansClient:
|
|
|
625
626
|
request_type=request_type,
|
|
626
627
|
)
|
|
627
628
|
except Exception as e:
|
|
628
|
-
msg = f"Error during update request: {
|
|
629
|
-
log.
|
|
629
|
+
msg = f"Error during update request: {e!s}"
|
|
630
|
+
log.exception(msg)
|
|
630
631
|
raise RuntimeError(msg) from e
|
|
631
632
|
|
|
632
633
|
if response is None:
|
|
@@ -654,9 +655,10 @@ class SpansClient:
|
|
|
654
655
|
dataframe: pd.DataFrame,
|
|
655
656
|
patch_document_column_name: str = "patch_document",
|
|
656
657
|
validate: bool = True,
|
|
657
|
-
) ->
|
|
658
|
-
"""
|
|
659
|
-
|
|
658
|
+
) -> dict[str, Any]:
|
|
659
|
+
"""Log metadata updates using JSON Merge Patch format.
|
|
660
|
+
|
|
661
|
+
This method is only supported for LLM model types.
|
|
660
662
|
|
|
661
663
|
The dataframe must contain a column `context.span_id` to identify spans and either:
|
|
662
664
|
1. A column with JSON patch documents (specified by patch_document_column_name), or
|
|
@@ -674,8 +676,9 @@ class SpansClient:
|
|
|
674
676
|
Note: This differs from standard JSON Merge Patch where null values remove fields.
|
|
675
677
|
|
|
676
678
|
Args:
|
|
677
|
-
|
|
679
|
+
space_id: The space ID where the project resides.
|
|
678
680
|
project_name: A unique name to identify your project in the Arize platform.
|
|
681
|
+
dataframe: DataFrame with span_ids and either patch documents or metadata field columns.
|
|
679
682
|
patch_document_column_name: Name of the column containing JSON patch documents.
|
|
680
683
|
Defaults to "patch_document".
|
|
681
684
|
validate: When set to True, validation is run before sending data.
|
|
@@ -813,11 +816,10 @@ class SpansClient:
|
|
|
813
816
|
)
|
|
814
817
|
|
|
815
818
|
# Create a new column for patch documents if we're going to use it
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
metadata_df[final_patch_column] = None
|
|
819
|
+
# Use 'patch_document' as the standardized column name for downstream processing
|
|
820
|
+
final_patch_column = "patch_document"
|
|
821
|
+
if final_patch_column not in metadata_df.columns:
|
|
822
|
+
metadata_df[final_patch_column] = None
|
|
821
823
|
|
|
822
824
|
# Process metadata field columns if they exist
|
|
823
825
|
if has_metadata_fields:
|
|
@@ -865,7 +867,7 @@ class SpansClient:
|
|
|
865
867
|
if patch:
|
|
866
868
|
processed_patches.append(patch)
|
|
867
869
|
if errors:
|
|
868
|
-
validation_errors.
|
|
870
|
+
validation_errors.extend(errors)
|
|
869
871
|
|
|
870
872
|
# If validation is enabled and errors found, raise ValidationFailure
|
|
871
873
|
if validate and validation_errors:
|
|
@@ -922,9 +924,11 @@ class SpansClient:
|
|
|
922
924
|
metadata_df[final_patch_column] = metadata_df[
|
|
923
925
|
final_patch_column
|
|
924
926
|
].apply(
|
|
925
|
-
lambda p:
|
|
926
|
-
|
|
927
|
-
|
|
927
|
+
lambda p: (
|
|
928
|
+
json.dumps(p)
|
|
929
|
+
if not isinstance(p, float) or not np.isnan(p)
|
|
930
|
+
else json.dumps({})
|
|
931
|
+
)
|
|
928
932
|
)
|
|
929
933
|
|
|
930
934
|
# Convert to Arrow table
|
|
@@ -932,20 +936,20 @@ class SpansClient:
|
|
|
932
936
|
log.debug("Converting data to Arrow format")
|
|
933
937
|
pa_table = pa.Table.from_pandas(metadata_df, preserve_index=False)
|
|
934
938
|
except pa.ArrowInvalid as e:
|
|
935
|
-
log.
|
|
939
|
+
log.exception(INVALID_ARROW_CONVERSION_MSG)
|
|
936
940
|
raise pa.ArrowInvalid(
|
|
937
|
-
f"Error converting to Arrow format: {
|
|
941
|
+
f"Error converting to Arrow format: {e!s}"
|
|
938
942
|
) from e
|
|
939
|
-
except Exception
|
|
940
|
-
log.
|
|
943
|
+
except Exception:
|
|
944
|
+
log.exception("Unexpected error creating Arrow table")
|
|
941
945
|
raise
|
|
942
946
|
|
|
943
947
|
request_type = FlightRequestType.METADATA
|
|
944
948
|
response = None
|
|
945
949
|
with ArizeFlightClient(
|
|
946
950
|
api_key=self._sdk_config.api_key,
|
|
947
|
-
host=self._sdk_config.
|
|
948
|
-
port=self._sdk_config.
|
|
951
|
+
host=self._sdk_config.flight_host,
|
|
952
|
+
port=self._sdk_config.flight_port,
|
|
949
953
|
scheme=self._sdk_config.flight_scheme,
|
|
950
954
|
request_verify=self._sdk_config.request_verify,
|
|
951
955
|
max_chunksize=self._sdk_config.pyarrow_max_chunksize,
|
|
@@ -958,8 +962,8 @@ class SpansClient:
|
|
|
958
962
|
request_type=request_type,
|
|
959
963
|
)
|
|
960
964
|
except Exception as e:
|
|
961
|
-
msg = f"Error during update request: {
|
|
962
|
-
log.
|
|
965
|
+
msg = f"Error during update request: {e!s}"
|
|
966
|
+
log.exception(msg)
|
|
963
967
|
raise RuntimeError(msg) from e
|
|
964
968
|
|
|
965
969
|
if response is None:
|
|
@@ -987,14 +991,25 @@ class SpansClient:
|
|
|
987
991
|
start_time: datetime,
|
|
988
992
|
end_time: datetime,
|
|
989
993
|
where: str = "",
|
|
990
|
-
columns:
|
|
994
|
+
columns: list | None = None,
|
|
991
995
|
similarity_search_params: SimilaritySearchParams | None = None,
|
|
992
996
|
stream_chunk_size: int | None = None,
|
|
993
997
|
) -> pd.DataFrame:
|
|
998
|
+
"""Export span data from Arize to a pandas DataFrame.
|
|
999
|
+
|
|
1000
|
+
Retrieves trace/span data from the specified project within a time range
|
|
1001
|
+
and returns it as a pandas DataFrame. Supports filtering with SQL-like
|
|
1002
|
+
WHERE clauses and similarity search for semantic retrieval.
|
|
1003
|
+
|
|
1004
|
+
Returns:
|
|
1005
|
+
-------
|
|
1006
|
+
pd.DataFrame: DataFrame containing the requested span data with columns
|
|
1007
|
+
for span metadata, attributes, events, and any custom fields.
|
|
1008
|
+
"""
|
|
994
1009
|
with ArizeFlightClient(
|
|
995
1010
|
api_key=self._sdk_config.api_key,
|
|
996
|
-
host=self._sdk_config.
|
|
997
|
-
port=self._sdk_config.
|
|
1011
|
+
host=self._sdk_config.flight_host,
|
|
1012
|
+
port=self._sdk_config.flight_port,
|
|
998
1013
|
scheme=self._sdk_config.flight_scheme,
|
|
999
1014
|
request_verify=self._sdk_config.request_verify,
|
|
1000
1015
|
max_chunksize=self._sdk_config.pyarrow_max_chunksize,
|
|
@@ -1017,19 +1032,27 @@ class SpansClient:
|
|
|
1017
1032
|
def export_to_parquet(
|
|
1018
1033
|
self,
|
|
1019
1034
|
*,
|
|
1035
|
+
path: str,
|
|
1020
1036
|
space_id: str,
|
|
1021
1037
|
project_name: str,
|
|
1022
1038
|
start_time: datetime,
|
|
1023
1039
|
end_time: datetime,
|
|
1024
1040
|
where: str = "",
|
|
1025
|
-
columns:
|
|
1041
|
+
columns: list | None = None,
|
|
1026
1042
|
similarity_search_params: SimilaritySearchParams | None = None,
|
|
1027
1043
|
stream_chunk_size: int | None = None,
|
|
1028
|
-
) ->
|
|
1044
|
+
) -> None:
|
|
1045
|
+
"""Export span data from Arize to a Parquet file.
|
|
1046
|
+
|
|
1047
|
+
Retrieves trace/span data from the specified project within a time range
|
|
1048
|
+
and writes it directly to a Parquet file at the specified path. Supports
|
|
1049
|
+
filtering with SQL-like WHERE clauses and similarity search for semantic
|
|
1050
|
+
retrieval. Efficient for large datasets and long-term storage.
|
|
1051
|
+
"""
|
|
1029
1052
|
with ArizeFlightClient(
|
|
1030
1053
|
api_key=self._sdk_config.api_key,
|
|
1031
|
-
host=self._sdk_config.
|
|
1032
|
-
port=self._sdk_config.
|
|
1054
|
+
host=self._sdk_config.flight_host,
|
|
1055
|
+
port=self._sdk_config.flight_port,
|
|
1033
1056
|
scheme=self._sdk_config.flight_scheme,
|
|
1034
1057
|
request_verify=self._sdk_config.request_verify,
|
|
1035
1058
|
max_chunksize=self._sdk_config.pyarrow_max_chunksize,
|
|
@@ -1038,6 +1061,7 @@ class SpansClient:
|
|
|
1038
1061
|
flight_client=flight_client,
|
|
1039
1062
|
)
|
|
1040
1063
|
return exporter.export_to_parquet(
|
|
1064
|
+
path=path,
|
|
1041
1065
|
space_id=space_id,
|
|
1042
1066
|
model_id=project_name,
|
|
1043
1067
|
environment=Environments.TRACING,
|
|
@@ -1050,7 +1074,7 @@ class SpansClient:
|
|
|
1050
1074
|
)
|
|
1051
1075
|
|
|
1052
1076
|
|
|
1053
|
-
def _build_patch_document(row):
|
|
1077
|
+
def _build_patch_document(row: pd.Series) -> dict[str, object]:
|
|
1054
1078
|
# Extract and preserve metadata values with proper types
|
|
1055
1079
|
patch = {}
|
|
1056
1080
|
for key in row.index:
|
|
@@ -1070,8 +1094,11 @@ def _build_patch_document(row):
|
|
|
1070
1094
|
|
|
1071
1095
|
|
|
1072
1096
|
def _process_patch_document(
|
|
1073
|
-
metadata_df
|
|
1074
|
-
|
|
1097
|
+
metadata_df: pd.DataFrame,
|
|
1098
|
+
patch_document_column_name: str,
|
|
1099
|
+
field_patches: pd.DataFrame,
|
|
1100
|
+
row_idx: int,
|
|
1101
|
+
) -> dict[str, object]:
|
|
1075
1102
|
# Get the field patch for this row
|
|
1076
1103
|
field_patch = field_patches.iloc[row_idx]
|
|
1077
1104
|
|
|
@@ -1111,15 +1138,14 @@ def _process_patch_document(
|
|
|
1111
1138
|
explicit_patch = {}
|
|
1112
1139
|
|
|
1113
1140
|
# Merge patches - explicit patch takes precedence
|
|
1114
|
-
|
|
1115
|
-
return merged_patch
|
|
1141
|
+
return {**field_patch, **explicit_patch}
|
|
1116
1142
|
|
|
1117
1143
|
|
|
1118
1144
|
def _ensure_dict_patch(
|
|
1119
1145
|
metadata_df: pd.DataFrame,
|
|
1120
1146
|
final_patch_column: str,
|
|
1121
1147
|
row_idx: int,
|
|
1122
|
-
):
|
|
1148
|
+
) -> tuple[dict[str, object], list[str]]:
|
|
1123
1149
|
patch = metadata_df.loc[row_idx, final_patch_column]
|
|
1124
1150
|
validation_errors = []
|
|
1125
1151
|
|
|
@@ -1141,19 +1167,19 @@ def _ensure_dict_patch(
|
|
|
1141
1167
|
parsed = json.loads(patch)
|
|
1142
1168
|
if isinstance(parsed, dict):
|
|
1143
1169
|
return parsed
|
|
1144
|
-
else:
|
|
1145
|
-
error_msg = (
|
|
1146
|
-
f"Row {row_idx}: JSON must be an object/dictionary, "
|
|
1147
|
-
f"got {type(parsed).__name__}"
|
|
1148
|
-
)
|
|
1149
|
-
logger.warning(error_msg)
|
|
1150
|
-
validation_errors.append(error_msg)
|
|
1151
|
-
return {}, validation_errors # if not validate else None
|
|
1152
1170
|
except json.JSONDecodeError as e:
|
|
1153
1171
|
error_msg = f"Row {row_idx}: Invalid JSON in patch document: {e}"
|
|
1154
1172
|
logger.warning(error_msg)
|
|
1155
1173
|
validation_errors.append(error_msg)
|
|
1156
1174
|
return {}, validation_errors # if not validate else None
|
|
1175
|
+
else:
|
|
1176
|
+
error_msg = (
|
|
1177
|
+
f"Row {row_idx}: JSON must be an object/dictionary, "
|
|
1178
|
+
f"got {type(parsed).__name__}"
|
|
1179
|
+
)
|
|
1180
|
+
logger.warning(error_msg)
|
|
1181
|
+
validation_errors.append(error_msg)
|
|
1182
|
+
return {}, validation_errors # if not validate else None
|
|
1157
1183
|
|
|
1158
1184
|
# For other types, log warning
|
|
1159
1185
|
error_msg = f"Row {row_idx}: Unsupported patch type: {type(patch).__name__}"
|
|
@@ -1165,7 +1191,7 @@ def _ensure_dict_patch(
|
|
|
1165
1191
|
def _format_note_for_storage(
|
|
1166
1192
|
note_text: str,
|
|
1167
1193
|
current_time_ms: int,
|
|
1168
|
-
):
|
|
1194
|
+
) -> list[str] | None:
|
|
1169
1195
|
if pd.isna(note_text):
|
|
1170
1196
|
return None
|
|
1171
1197
|
note_obj = {
|
|
@@ -1225,9 +1251,7 @@ def _log_flight_update_summary(
|
|
|
1225
1251
|
logger.warning("Flight update response missing counts", extra=metrics)
|
|
1226
1252
|
else:
|
|
1227
1253
|
all_processed = int(spans_processed) == int(total_spans)
|
|
1228
|
-
msg =
|
|
1229
|
-
"✅ All spans processed" if all_processed else "Partial processing"
|
|
1230
|
-
)
|
|
1254
|
+
msg = "All spans processed" if all_processed else "Partial processing"
|
|
1231
1255
|
logger.info(msg, extra=metrics)
|
|
1232
1256
|
|
|
1233
1257
|
# Emit individual error lines (structured per-error, easy to aggregate)
|
|
@@ -1246,7 +1270,7 @@ def _message_to_dict(
|
|
|
1246
1270
|
msg: message.Message,
|
|
1247
1271
|
preserve_names: bool = True,
|
|
1248
1272
|
use_int_enums: bool = False,
|
|
1249
|
-
):
|
|
1273
|
+
) -> dict[str, object]:
|
|
1250
1274
|
return json_format.MessageToDict(
|
|
1251
1275
|
msg,
|
|
1252
1276
|
preserving_proto_field_name=preserve_names,
|
arize/spans/columns.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
"""Span column definitions and OpenInference semantic conventions."""
|
|
2
|
+
|
|
1
3
|
from enum import Enum
|
|
2
4
|
|
|
3
5
|
import openinference.semconv.trace as oinf
|
|
@@ -5,6 +7,8 @@ import opentelemetry.semconv.trace as otel
|
|
|
5
7
|
|
|
6
8
|
|
|
7
9
|
class SpanColumnDataType(Enum):
|
|
10
|
+
"""Enum representing supported data types for span columns."""
|
|
11
|
+
|
|
8
12
|
BOOL = 1
|
|
9
13
|
NUMERIC = 2
|
|
10
14
|
STRING = 3
|
|
@@ -15,12 +19,21 @@ class SpanColumnDataType(Enum):
|
|
|
15
19
|
|
|
16
20
|
|
|
17
21
|
class SpanColumn:
|
|
22
|
+
"""Configuration for a custom span column with name, data type, and annotation settings."""
|
|
23
|
+
|
|
18
24
|
def __init__(
|
|
19
25
|
self,
|
|
20
26
|
name: str,
|
|
21
27
|
data_type: SpanColumnDataType,
|
|
22
28
|
required: bool = False,
|
|
23
29
|
) -> None:
|
|
30
|
+
"""Initialize a span column configuration.
|
|
31
|
+
|
|
32
|
+
Args:
|
|
33
|
+
name: Name of the span column.
|
|
34
|
+
data_type: Data type of the column values.
|
|
35
|
+
required: Whether the column is required.
|
|
36
|
+
"""
|
|
24
37
|
self.name = name
|
|
25
38
|
self.required = required
|
|
26
39
|
self.data_type = data_type
|