arize 8.0.0a22__py3-none-any.whl → 8.0.0b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arize/__init__.py +28 -19
- arize/_exporter/client.py +56 -37
- arize/_exporter/parsers/tracing_data_parser.py +41 -30
- arize/_exporter/validation.py +3 -3
- arize/_flight/client.py +207 -76
- arize/_generated/api_client/__init__.py +30 -6
- arize/_generated/api_client/api/__init__.py +1 -0
- arize/_generated/api_client/api/datasets_api.py +864 -190
- arize/_generated/api_client/api/experiments_api.py +167 -131
- arize/_generated/api_client/api/projects_api.py +1197 -0
- arize/_generated/api_client/api_client.py +2 -2
- arize/_generated/api_client/configuration.py +42 -34
- arize/_generated/api_client/exceptions.py +2 -2
- arize/_generated/api_client/models/__init__.py +15 -4
- arize/_generated/api_client/models/dataset.py +10 -10
- arize/_generated/api_client/models/dataset_example.py +111 -0
- arize/_generated/api_client/models/dataset_example_update.py +100 -0
- arize/_generated/api_client/models/dataset_version.py +13 -13
- arize/_generated/api_client/models/datasets_create_request.py +16 -8
- arize/_generated/api_client/models/datasets_examples_insert_request.py +100 -0
- arize/_generated/api_client/models/datasets_examples_list200_response.py +106 -0
- arize/_generated/api_client/models/datasets_examples_update_request.py +102 -0
- arize/_generated/api_client/models/datasets_list200_response.py +10 -4
- arize/_generated/api_client/models/experiment.py +14 -16
- arize/_generated/api_client/models/experiment_run.py +108 -0
- arize/_generated/api_client/models/experiment_run_create.py +102 -0
- arize/_generated/api_client/models/experiments_create_request.py +16 -10
- arize/_generated/api_client/models/experiments_list200_response.py +10 -4
- arize/_generated/api_client/models/experiments_runs_list200_response.py +19 -5
- arize/_generated/api_client/models/{error.py → pagination_metadata.py} +13 -11
- arize/_generated/api_client/models/primitive_value.py +172 -0
- arize/_generated/api_client/models/problem.py +100 -0
- arize/_generated/api_client/models/project.py +99 -0
- arize/_generated/api_client/models/{datasets_list_examples200_response.py → projects_create_request.py} +13 -11
- arize/_generated/api_client/models/projects_list200_response.py +106 -0
- arize/_generated/api_client/rest.py +2 -2
- arize/_generated/api_client/test/test_dataset.py +4 -2
- arize/_generated/api_client/test/test_dataset_example.py +56 -0
- arize/_generated/api_client/test/test_dataset_example_update.py +52 -0
- arize/_generated/api_client/test/test_dataset_version.py +7 -2
- arize/_generated/api_client/test/test_datasets_api.py +27 -13
- arize/_generated/api_client/test/test_datasets_create_request.py +8 -4
- arize/_generated/api_client/test/{test_datasets_list_examples200_response.py → test_datasets_examples_insert_request.py} +19 -15
- arize/_generated/api_client/test/test_datasets_examples_list200_response.py +66 -0
- arize/_generated/api_client/test/test_datasets_examples_update_request.py +61 -0
- arize/_generated/api_client/test/test_datasets_list200_response.py +9 -3
- arize/_generated/api_client/test/test_experiment.py +2 -4
- arize/_generated/api_client/test/test_experiment_run.py +56 -0
- arize/_generated/api_client/test/test_experiment_run_create.py +54 -0
- arize/_generated/api_client/test/test_experiments_api.py +6 -6
- arize/_generated/api_client/test/test_experiments_create_request.py +9 -6
- arize/_generated/api_client/test/test_experiments_list200_response.py +9 -5
- arize/_generated/api_client/test/test_experiments_runs_list200_response.py +15 -5
- arize/_generated/api_client/test/test_pagination_metadata.py +53 -0
- arize/_generated/api_client/test/{test_error.py → test_primitive_value.py} +13 -14
- arize/_generated/api_client/test/test_problem.py +57 -0
- arize/_generated/api_client/test/test_project.py +58 -0
- arize/_generated/api_client/test/test_projects_api.py +59 -0
- arize/_generated/api_client/test/test_projects_create_request.py +54 -0
- arize/_generated/api_client/test/test_projects_list200_response.py +70 -0
- arize/_generated/api_client_README.md +43 -29
- arize/_generated/protocol/flight/flight_pb2.py +400 -0
- arize/_lazy.py +27 -19
- arize/client.py +181 -58
- arize/config.py +324 -116
- arize/constants/__init__.py +1 -0
- arize/constants/config.py +11 -4
- arize/constants/ml.py +6 -4
- arize/constants/openinference.py +2 -0
- arize/constants/pyarrow.py +2 -0
- arize/constants/spans.py +3 -1
- arize/datasets/__init__.py +1 -0
- arize/datasets/client.py +304 -84
- arize/datasets/errors.py +32 -2
- arize/datasets/validation.py +18 -8
- arize/embeddings/__init__.py +2 -0
- arize/embeddings/auto_generator.py +23 -19
- arize/embeddings/base_generators.py +89 -36
- arize/embeddings/constants.py +2 -0
- arize/embeddings/cv_generators.py +26 -4
- arize/embeddings/errors.py +27 -5
- arize/embeddings/nlp_generators.py +43 -18
- arize/embeddings/tabular_generators.py +46 -31
- arize/embeddings/usecases.py +12 -2
- arize/exceptions/__init__.py +1 -0
- arize/exceptions/auth.py +11 -1
- arize/exceptions/base.py +29 -4
- arize/exceptions/models.py +21 -2
- arize/exceptions/parameters.py +31 -0
- arize/exceptions/spaces.py +12 -1
- arize/exceptions/types.py +86 -7
- arize/exceptions/values.py +220 -20
- arize/experiments/__init__.py +13 -0
- arize/experiments/client.py +394 -285
- arize/experiments/evaluators/__init__.py +1 -0
- arize/experiments/evaluators/base.py +74 -41
- arize/experiments/evaluators/exceptions.py +6 -3
- arize/experiments/evaluators/executors.py +121 -73
- arize/experiments/evaluators/rate_limiters.py +106 -57
- arize/experiments/evaluators/types.py +34 -7
- arize/experiments/evaluators/utils.py +65 -27
- arize/experiments/functions.py +103 -101
- arize/experiments/tracing.py +52 -44
- arize/experiments/types.py +56 -31
- arize/logging.py +54 -22
- arize/ml/__init__.py +1 -0
- arize/ml/batch_validation/__init__.py +1 -0
- arize/{models → ml}/batch_validation/errors.py +545 -67
- arize/{models → ml}/batch_validation/validator.py +344 -303
- arize/ml/bounded_executor.py +47 -0
- arize/{models → ml}/casting.py +118 -108
- arize/{models → ml}/client.py +339 -118
- arize/{models → ml}/proto.py +97 -42
- arize/{models → ml}/stream_validation.py +43 -15
- arize/ml/surrogate_explainer/__init__.py +1 -0
- arize/{models → ml}/surrogate_explainer/mimic.py +25 -10
- arize/{types.py → ml/types.py} +355 -354
- arize/pre_releases.py +44 -0
- arize/projects/__init__.py +1 -0
- arize/projects/client.py +134 -0
- arize/regions.py +40 -0
- arize/spans/__init__.py +1 -0
- arize/spans/client.py +204 -175
- arize/spans/columns.py +13 -0
- arize/spans/conversion.py +60 -37
- arize/spans/validation/__init__.py +1 -0
- arize/spans/validation/annotations/__init__.py +1 -0
- arize/spans/validation/annotations/annotations_validation.py +6 -4
- arize/spans/validation/annotations/dataframe_form_validation.py +13 -11
- arize/spans/validation/annotations/value_validation.py +35 -11
- arize/spans/validation/common/__init__.py +1 -0
- arize/spans/validation/common/argument_validation.py +33 -8
- arize/spans/validation/common/dataframe_form_validation.py +35 -9
- arize/spans/validation/common/errors.py +211 -11
- arize/spans/validation/common/value_validation.py +81 -14
- arize/spans/validation/evals/__init__.py +1 -0
- arize/spans/validation/evals/dataframe_form_validation.py +28 -8
- arize/spans/validation/evals/evals_validation.py +34 -4
- arize/spans/validation/evals/value_validation.py +26 -3
- arize/spans/validation/metadata/__init__.py +1 -1
- arize/spans/validation/metadata/argument_validation.py +14 -5
- arize/spans/validation/metadata/dataframe_form_validation.py +26 -10
- arize/spans/validation/metadata/value_validation.py +24 -10
- arize/spans/validation/spans/__init__.py +1 -0
- arize/spans/validation/spans/dataframe_form_validation.py +35 -14
- arize/spans/validation/spans/spans_validation.py +35 -4
- arize/spans/validation/spans/value_validation.py +78 -8
- arize/utils/__init__.py +1 -0
- arize/utils/arrow.py +31 -15
- arize/utils/cache.py +34 -6
- arize/utils/dataframe.py +20 -3
- arize/utils/online_tasks/__init__.py +2 -0
- arize/utils/online_tasks/dataframe_preprocessor.py +58 -47
- arize/utils/openinference_conversion.py +44 -5
- arize/utils/proto.py +10 -0
- arize/utils/size.py +5 -3
- arize/utils/types.py +105 -0
- arize/version.py +3 -1
- {arize-8.0.0a22.dist-info → arize-8.0.0b0.dist-info}/METADATA +13 -6
- arize-8.0.0b0.dist-info/RECORD +175 -0
- {arize-8.0.0a22.dist-info → arize-8.0.0b0.dist-info}/WHEEL +1 -1
- arize-8.0.0b0.dist-info/licenses/LICENSE +176 -0
- arize-8.0.0b0.dist-info/licenses/NOTICE +13 -0
- arize/_generated/protocol/flight/export_pb2.py +0 -61
- arize/_generated/protocol/flight/ingest_pb2.py +0 -365
- arize/models/__init__.py +0 -0
- arize/models/batch_validation/__init__.py +0 -0
- arize/models/bounded_executor.py +0 -34
- arize/models/surrogate_explainer/__init__.py +0 -0
- arize-8.0.0a22.dist-info/RECORD +0 -146
- arize-8.0.0a22.dist-info/licenses/LICENSE.md +0 -12
arize/spans/client.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
"""Client implementation for managing spans and traces in the Arize platform."""
|
|
2
|
+
|
|
1
3
|
# type: ignore[pb2]
|
|
2
4
|
from __future__ import annotations
|
|
3
5
|
|
|
@@ -6,7 +8,7 @@ import logging
|
|
|
6
8
|
import re
|
|
7
9
|
from datetime import datetime, timezone
|
|
8
10
|
from functools import partial
|
|
9
|
-
from typing import TYPE_CHECKING, Any
|
|
11
|
+
from typing import TYPE_CHECKING, Any
|
|
10
12
|
|
|
11
13
|
import numpy as np
|
|
12
14
|
import pandas as pd
|
|
@@ -16,10 +18,6 @@ from google.protobuf import json_format, message
|
|
|
16
18
|
from arize._exporter.client import ArizeExportClient
|
|
17
19
|
from arize._flight.client import ArizeFlightClient, FlightPostArrowFileResponse
|
|
18
20
|
from arize._flight.types import FlightRequestType
|
|
19
|
-
from arize._generated.protocol.flight.ingest_pb2 import (
|
|
20
|
-
WriteSpanAnnotationResponse,
|
|
21
|
-
WriteSpanEvaluationResponse,
|
|
22
|
-
)
|
|
23
21
|
from arize.constants.spans import DEFAULT_DATETIME_FMT
|
|
24
22
|
from arize.exceptions.base import (
|
|
25
23
|
INVALID_ARROW_CONVERSION_MSG,
|
|
@@ -28,10 +26,8 @@ from arize.exceptions.base import (
|
|
|
28
26
|
from arize.exceptions.models import MissingProjectNameError
|
|
29
27
|
from arize.exceptions.spaces import MissingSpaceIDError
|
|
30
28
|
from arize.logging import CtxAdapter
|
|
31
|
-
from arize.types import Environments
|
|
32
|
-
from arize.utils.arrow import
|
|
33
|
-
post_arrow_table,
|
|
34
|
-
)
|
|
29
|
+
from arize.ml.types import Environments
|
|
30
|
+
from arize.utils.arrow import post_arrow_table
|
|
35
31
|
from arize.utils.dataframe import (
|
|
36
32
|
remove_extraneous_columns,
|
|
37
33
|
reset_dataframe_index,
|
|
@@ -41,13 +37,25 @@ from arize.utils.proto import get_pb_schema_tracing
|
|
|
41
37
|
if TYPE_CHECKING:
|
|
42
38
|
import requests
|
|
43
39
|
|
|
40
|
+
from arize._generated.protocol.flight import flight_pb2
|
|
44
41
|
from arize.config import SDKConfiguration
|
|
45
42
|
|
|
46
43
|
logger = logging.getLogger(__name__)
|
|
47
44
|
|
|
48
45
|
|
|
49
46
|
class SpansClient:
|
|
50
|
-
|
|
47
|
+
"""Client for logging LLM tracing spans and evaluations to Arize.
|
|
48
|
+
|
|
49
|
+
This class is primarily intended for internal use within the SDK. Users are
|
|
50
|
+
highly encouraged to access resource-specific functionality via
|
|
51
|
+
:class:`arize.ArizeClient`.
|
|
52
|
+
"""
|
|
53
|
+
|
|
54
|
+
def __init__(self, *, sdk_config: SDKConfiguration) -> None:
|
|
55
|
+
"""
|
|
56
|
+
Args:
|
|
57
|
+
sdk_config: Resolved SDK configuration.
|
|
58
|
+
""" # noqa: D205, D212
|
|
51
59
|
self._sdk_config = sdk_config
|
|
52
60
|
|
|
53
61
|
def log(
|
|
@@ -62,25 +70,26 @@ class SpansClient:
|
|
|
62
70
|
timeout: float | None = None,
|
|
63
71
|
tmp_dir: str = "",
|
|
64
72
|
) -> requests.Response:
|
|
65
|
-
"""
|
|
66
|
-
|
|
67
|
-
:class:`Response` object from the Requests HTTP library to ensure
|
|
68
|
-
records.
|
|
73
|
+
"""Logs a pandas dataframe containing LLM tracing data to Arize via a POST request.
|
|
74
|
+
|
|
75
|
+
Returns a :class:`Response` object from the Requests HTTP library to ensure
|
|
76
|
+
successful delivery of records.
|
|
69
77
|
|
|
70
78
|
Args:
|
|
71
|
-
|
|
72
|
-
|
|
79
|
+
space_id: The space ID where the project resides.
|
|
80
|
+
project_name: A unique name to identify your project in the Arize platform.
|
|
81
|
+
dataframe: The dataframe containing the LLM traces.
|
|
82
|
+
evals_dataframe: A dataframe containing LLM evaluations data.
|
|
73
83
|
The evaluations are joined to their corresponding spans via a left outer join, i.e.,
|
|
74
84
|
using only `context.span_id` from the spans dataframe. Defaults to None.
|
|
75
|
-
datetime_format
|
|
85
|
+
datetime_format: format for the timestamp captured in the LLM traces.
|
|
76
86
|
Defaults to "%Y-%m-%dT%H:%M:%S.%f+00:00".
|
|
77
|
-
validate
|
|
87
|
+
validate: When set to True, validation is run before sending data.
|
|
78
88
|
Defaults to True.
|
|
79
|
-
|
|
80
|
-
before sending to Arize.
|
|
81
|
-
timeout (float, optional): You can stop waiting for a response after a given number
|
|
89
|
+
timeout: You can stop waiting for a response after a given number
|
|
82
90
|
of seconds with the timeout parameter. Defaults to None.
|
|
83
|
-
|
|
91
|
+
tmp_dir: Temporary directory/file to store the serialized data in binary
|
|
92
|
+
before sending to Arize.
|
|
84
93
|
|
|
85
94
|
Returns:
|
|
86
95
|
`Response` object
|
|
@@ -220,12 +229,12 @@ class SpansClient:
|
|
|
220
229
|
log.debug("Converting data to Arrow format")
|
|
221
230
|
pa_table = pa.Table.from_pandas(df, preserve_index=False)
|
|
222
231
|
except pa.ArrowInvalid as e:
|
|
223
|
-
log.
|
|
232
|
+
log.exception(INVALID_ARROW_CONVERSION_MSG)
|
|
224
233
|
raise pa.ArrowInvalid(
|
|
225
|
-
f"Error converting to Arrow format: {
|
|
234
|
+
f"Error converting to Arrow format: {e!s}"
|
|
226
235
|
) from e
|
|
227
|
-
except Exception
|
|
228
|
-
log.
|
|
236
|
+
except Exception:
|
|
237
|
+
log.exception("Unexpected error creating Arrow table")
|
|
229
238
|
raise
|
|
230
239
|
|
|
231
240
|
proto_schema = get_pb_schema_tracing(project_name=project_name)
|
|
@@ -262,27 +271,23 @@ class SpansClient:
|
|
|
262
271
|
force_http: bool = False,
|
|
263
272
|
timeout: float | None = None,
|
|
264
273
|
tmp_dir: str = "",
|
|
265
|
-
) -> WriteSpanEvaluationResponse:
|
|
266
|
-
"""
|
|
267
|
-
|
|
268
|
-
The dataframe must contain a column `context.span_id`
|
|
269
|
-
|
|
274
|
+
) -> flight_pb2.WriteSpanEvaluationResponse:
|
|
275
|
+
"""Logs a pandas dataframe containing LLM evaluations data to Arize via a Flight gRPC request.
|
|
276
|
+
|
|
277
|
+
The dataframe must contain a column `context.span_id` such that Arize can assign
|
|
278
|
+
each evaluation to its respective span.
|
|
270
279
|
|
|
271
280
|
Args:
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
model_id to compare and track changes. It should match the model_id of the spans
|
|
277
|
-
sent previously, to which evaluations will be assigned. Defaults to None.
|
|
278
|
-
validate (bool, optional): When set to True, validation is run before sending data.
|
|
281
|
+
space_id: The space ID where the project resides.
|
|
282
|
+
project_name: A unique name to identify your project in the Arize platform.
|
|
283
|
+
dataframe: A dataframe containing LLM evaluations data.
|
|
284
|
+
validate: When set to True, validation is run before sending data.
|
|
279
285
|
Defaults to True.
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
timeout (float, optional): You can stop waiting for a response after a given number
|
|
286
|
+
force_http: Force the use of HTTP for data upload. Defaults to False.
|
|
287
|
+
timeout: You can stop waiting for a response after a given number
|
|
283
288
|
of seconds with the timeout parameter. Defaults to None.
|
|
284
|
-
|
|
285
|
-
|
|
289
|
+
tmp_dir: Temporary directory/file to store the serialized data in binary
|
|
290
|
+
before sending to Arize.
|
|
286
291
|
"""
|
|
287
292
|
from arize.spans.columns import EVAL_COLUMN_PATTERN, SPAN_SPAN_ID_COL
|
|
288
293
|
from arize.spans.validation.evals import evals_validation
|
|
@@ -358,12 +363,12 @@ class SpansClient:
|
|
|
358
363
|
log.debug("Converting data to Arrow format")
|
|
359
364
|
pa_table = pa.Table.from_pandas(evals_df, preserve_index=False)
|
|
360
365
|
except pa.ArrowInvalid as e:
|
|
361
|
-
log.
|
|
366
|
+
log.exception(INVALID_ARROW_CONVERSION_MSG)
|
|
362
367
|
raise pa.ArrowInvalid(
|
|
363
|
-
f"Error converting to Arrow format: {
|
|
368
|
+
f"Error converting to Arrow format: {e!s}"
|
|
364
369
|
) from e
|
|
365
|
-
except Exception
|
|
366
|
-
log.
|
|
370
|
+
except Exception:
|
|
371
|
+
log.exception("Unexpected error creating Arrow table")
|
|
367
372
|
raise
|
|
368
373
|
|
|
369
374
|
if force_http:
|
|
@@ -395,8 +400,8 @@ class SpansClient:
|
|
|
395
400
|
response = None
|
|
396
401
|
with ArizeFlightClient(
|
|
397
402
|
api_key=self._sdk_config.api_key,
|
|
398
|
-
host=self._sdk_config.
|
|
399
|
-
port=self._sdk_config.
|
|
403
|
+
host=self._sdk_config.flight_host,
|
|
404
|
+
port=self._sdk_config.flight_port,
|
|
400
405
|
scheme=self._sdk_config.flight_scheme,
|
|
401
406
|
request_verify=self._sdk_config.request_verify,
|
|
402
407
|
max_chunksize=self._sdk_config.pyarrow_max_chunksize,
|
|
@@ -409,8 +414,8 @@ class SpansClient:
|
|
|
409
414
|
request_type=request_type,
|
|
410
415
|
)
|
|
411
416
|
except Exception as e:
|
|
412
|
-
msg = f"Error during update request: {
|
|
413
|
-
log.
|
|
417
|
+
msg = f"Error during update request: {e!s}"
|
|
418
|
+
log.exception(msg)
|
|
414
419
|
raise RuntimeError(msg) from e
|
|
415
420
|
|
|
416
421
|
if response is None:
|
|
@@ -437,19 +442,19 @@ class SpansClient:
|
|
|
437
442
|
project_name: str,
|
|
438
443
|
dataframe: pd.DataFrame,
|
|
439
444
|
validate: bool = True,
|
|
440
|
-
) -> WriteSpanAnnotationResponse:
|
|
441
|
-
"""
|
|
442
|
-
|
|
443
|
-
The dataframe must contain a column `context.span_id`
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
free-form text notes.
|
|
445
|
+
) -> flight_pb2.WriteSpanAnnotationResponse:
|
|
446
|
+
"""Logs a pandas dataframe containing LLM span annotations to Arize via a Flight gRPC request.
|
|
447
|
+
|
|
448
|
+
The dataframe must contain a column `context.span_id` such that Arize can assign
|
|
449
|
+
each annotation to its respective span. Annotation columns should follow the pattern
|
|
450
|
+
`annotation.<name>.<suffix>` where suffix is either `label` or `score`. An optional
|
|
451
|
+
`annotation.notes` column can be included for free-form text notes.
|
|
448
452
|
|
|
449
453
|
Args:
|
|
450
|
-
|
|
451
|
-
project_name
|
|
452
|
-
|
|
454
|
+
space_id: The space ID where the project resides.
|
|
455
|
+
project_name: A unique name to identify your project in the Arize platform.
|
|
456
|
+
dataframe: A dataframe containing LLM annotation data.
|
|
457
|
+
validate: When set to True, validation is run before sending data.
|
|
453
458
|
Defaults to True.
|
|
454
459
|
"""
|
|
455
460
|
from arize.spans.columns import (
|
|
@@ -588,12 +593,12 @@ class SpansClient:
|
|
|
588
593
|
log.debug("Converting data to Arrow format")
|
|
589
594
|
pa_table = pa.Table.from_pandas(anno_df, preserve_index=False)
|
|
590
595
|
except pa.ArrowInvalid as e:
|
|
591
|
-
log.
|
|
596
|
+
log.exception(INVALID_ARROW_CONVERSION_MSG)
|
|
592
597
|
raise pa.ArrowInvalid(
|
|
593
|
-
f"Error converting to Arrow format: {
|
|
598
|
+
f"Error converting to Arrow format: {e!s}"
|
|
594
599
|
) from e
|
|
595
|
-
except Exception
|
|
596
|
-
log.
|
|
600
|
+
except Exception:
|
|
601
|
+
log.exception("Unexpected error creating Arrow table")
|
|
597
602
|
raise
|
|
598
603
|
|
|
599
604
|
if ANNOTATION_NOTES_COLUMN_NAME in anno_df.columns:
|
|
@@ -611,8 +616,8 @@ class SpansClient:
|
|
|
611
616
|
response = None
|
|
612
617
|
with ArizeFlightClient(
|
|
613
618
|
api_key=self._sdk_config.api_key,
|
|
614
|
-
host=self._sdk_config.
|
|
615
|
-
port=self._sdk_config.
|
|
619
|
+
host=self._sdk_config.flight_host,
|
|
620
|
+
port=self._sdk_config.flight_port,
|
|
616
621
|
scheme=self._sdk_config.flight_scheme,
|
|
617
622
|
request_verify=self._sdk_config.request_verify,
|
|
618
623
|
max_chunksize=self._sdk_config.pyarrow_max_chunksize,
|
|
@@ -625,8 +630,8 @@ class SpansClient:
|
|
|
625
630
|
request_type=request_type,
|
|
626
631
|
)
|
|
627
632
|
except Exception as e:
|
|
628
|
-
msg = f"Error during update request: {
|
|
629
|
-
log.
|
|
633
|
+
msg = f"Error during update request: {e!s}"
|
|
634
|
+
log.exception(msg)
|
|
630
635
|
raise RuntimeError(msg) from e
|
|
631
636
|
|
|
632
637
|
if response is None:
|
|
@@ -654,11 +659,13 @@ class SpansClient:
|
|
|
654
659
|
dataframe: pd.DataFrame,
|
|
655
660
|
patch_document_column_name: str = "patch_document",
|
|
656
661
|
validate: bool = True,
|
|
657
|
-
) ->
|
|
658
|
-
"""
|
|
659
|
-
|
|
662
|
+
) -> dict[str, Any]:
|
|
663
|
+
"""Log metadata updates using JSON Merge Patch format.
|
|
664
|
+
|
|
665
|
+
This method is only supported for LLM model types.
|
|
660
666
|
|
|
661
667
|
The dataframe must contain a column `context.span_id` to identify spans and either:
|
|
668
|
+
|
|
662
669
|
1. A column with JSON patch documents (specified by patch_document_column_name), or
|
|
663
670
|
2. One or more columns with prefix `attributes.metadata.` that will be automatically
|
|
664
671
|
converted to a patch document (e.g., `attributes.metadata.tag` → `{"tag": value}`).
|
|
@@ -666,7 +673,8 @@ class SpansClient:
|
|
|
666
673
|
If both methods are used, the explicit patch document is applied after the individual field updates.
|
|
667
674
|
The patches will be applied to the `attributes.metadata` field of each span.
|
|
668
675
|
|
|
669
|
-
|
|
676
|
+
Type Handling:
|
|
677
|
+
|
|
670
678
|
- The client primarily supports string, integer, and float data types.
|
|
671
679
|
- Boolean values are converted to string representations.
|
|
672
680
|
- Nested JSON objects and arrays are serialized to JSON strings during transmission.
|
|
@@ -674,20 +682,23 @@ class SpansClient:
|
|
|
674
682
|
Note: This differs from standard JSON Merge Patch where null values remove fields.
|
|
675
683
|
|
|
676
684
|
Args:
|
|
677
|
-
|
|
685
|
+
space_id: The space ID where the project resides.
|
|
678
686
|
project_name: A unique name to identify your project in the Arize platform.
|
|
687
|
+
dataframe: DataFrame with span_ids and either patch documents or metadata field columns.
|
|
679
688
|
patch_document_column_name: Name of the column containing JSON patch documents.
|
|
680
689
|
Defaults to "patch_document".
|
|
681
690
|
validate: When set to True, validation is run before sending data.
|
|
682
691
|
|
|
683
692
|
Returns:
|
|
684
693
|
Dictionary containing update results with the following keys:
|
|
694
|
+
|
|
685
695
|
- spans_processed: Total number of spans in the input dataframe
|
|
686
696
|
- spans_updated: Count of successfully updated span metadata records
|
|
687
697
|
- spans_failed: Count of spans that failed to update
|
|
688
698
|
- errors: List of dictionaries with 'span_id' and 'error_message' keys for each failed span
|
|
689
699
|
|
|
690
|
-
|
|
700
|
+
Error types from the server include:
|
|
701
|
+
|
|
691
702
|
- parse_failure: Failed to parse JSON metadata
|
|
692
703
|
- patch_failure: Failed to apply JSON patch
|
|
693
704
|
- type_conflict: Type conflict in metadata
|
|
@@ -696,58 +707,60 @@ class SpansClient:
|
|
|
696
707
|
- druid_rejection: Backend rejected the update
|
|
697
708
|
|
|
698
709
|
Raises:
|
|
699
|
-
AuthError: When API key or space ID is missing
|
|
700
|
-
ValidationFailure: When validation of the dataframe or values fails
|
|
701
|
-
ImportError: When required tracing dependencies are missing
|
|
702
|
-
ArrowInvalid: When the dataframe cannot be converted to Arrow format
|
|
703
|
-
RuntimeError: If the request fails or no response is received
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
df = pd.DataFrame(
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
)
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
710
|
+
AuthError: When API key or space ID is missing.
|
|
711
|
+
ValidationFailure: When validation of the dataframe or values fails.
|
|
712
|
+
ImportError: When required tracing dependencies are missing.
|
|
713
|
+
ArrowInvalid: When the dataframe cannot be converted to Arrow format.
|
|
714
|
+
RuntimeError: If the request fails or no response is received.
|
|
715
|
+
|
|
716
|
+
Examples:
|
|
717
|
+
Method 1: Using a patch document
|
|
718
|
+
|
|
719
|
+
>>> df = pd.DataFrame(
|
|
720
|
+
... {
|
|
721
|
+
... "context.span_id": ["span1", "span2"],
|
|
722
|
+
... "patch_document": [
|
|
723
|
+
... {"tag": "important"},
|
|
724
|
+
... {"priority": "high"},
|
|
725
|
+
... ],
|
|
726
|
+
... }
|
|
727
|
+
... )
|
|
728
|
+
|
|
729
|
+
Method 2: Using direct field columns
|
|
730
|
+
|
|
731
|
+
>>> df = pd.DataFrame(
|
|
732
|
+
... {
|
|
733
|
+
... "context.span_id": ["span1", "span2"],
|
|
734
|
+
... "attributes.metadata.tag": ["important", "standard"],
|
|
735
|
+
... "attributes.metadata.priority": ["high", "medium"],
|
|
736
|
+
... }
|
|
737
|
+
... )
|
|
738
|
+
|
|
739
|
+
Method 3: Combining both approaches
|
|
740
|
+
|
|
741
|
+
>>> df = pd.DataFrame(
|
|
742
|
+
... {
|
|
743
|
+
... "context.span_id": ["span1"],
|
|
744
|
+
... "attributes.metadata.tag": ["important"],
|
|
745
|
+
... "patch_document": [
|
|
746
|
+
... {"priority": "high"}
|
|
747
|
+
... ], # Overrides conflicting fields
|
|
748
|
+
... }
|
|
749
|
+
... )
|
|
750
|
+
|
|
751
|
+
Method 4: Setting fields to null
|
|
752
|
+
|
|
753
|
+
>>> df = pd.DataFrame(
|
|
754
|
+
... {
|
|
755
|
+
... "context.span_id": ["span1"],
|
|
756
|
+
... "attributes.metadata.old_field": [
|
|
757
|
+
... None
|
|
758
|
+
... ], # Sets field to JSON null
|
|
759
|
+
... "patch_document": [
|
|
760
|
+
... {"other_field": None}
|
|
761
|
+
... ], # Also sets field to JSON null
|
|
762
|
+
... }
|
|
763
|
+
... )
|
|
751
764
|
"""
|
|
752
765
|
# Import validation modules
|
|
753
766
|
from arize.spans.columns import SPAN_SPAN_ID_COL
|
|
@@ -813,11 +826,10 @@ class SpansClient:
|
|
|
813
826
|
)
|
|
814
827
|
|
|
815
828
|
# Create a new column for patch documents if we're going to use it
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
metadata_df[final_patch_column] = None
|
|
829
|
+
# Use 'patch_document' as the standardized column name for downstream processing
|
|
830
|
+
final_patch_column = "patch_document"
|
|
831
|
+
if final_patch_column not in metadata_df.columns:
|
|
832
|
+
metadata_df[final_patch_column] = None
|
|
821
833
|
|
|
822
834
|
# Process metadata field columns if they exist
|
|
823
835
|
if has_metadata_fields:
|
|
@@ -865,7 +877,7 @@ class SpansClient:
|
|
|
865
877
|
if patch:
|
|
866
878
|
processed_patches.append(patch)
|
|
867
879
|
if errors:
|
|
868
|
-
validation_errors.
|
|
880
|
+
validation_errors.extend(errors)
|
|
869
881
|
|
|
870
882
|
# If validation is enabled and errors found, raise ValidationFailure
|
|
871
883
|
if validate and validation_errors:
|
|
@@ -922,9 +934,11 @@ class SpansClient:
|
|
|
922
934
|
metadata_df[final_patch_column] = metadata_df[
|
|
923
935
|
final_patch_column
|
|
924
936
|
].apply(
|
|
925
|
-
lambda p:
|
|
926
|
-
|
|
927
|
-
|
|
937
|
+
lambda p: (
|
|
938
|
+
json.dumps(p)
|
|
939
|
+
if not isinstance(p, float) or not np.isnan(p)
|
|
940
|
+
else json.dumps({})
|
|
941
|
+
)
|
|
928
942
|
)
|
|
929
943
|
|
|
930
944
|
# Convert to Arrow table
|
|
@@ -932,20 +946,20 @@ class SpansClient:
|
|
|
932
946
|
log.debug("Converting data to Arrow format")
|
|
933
947
|
pa_table = pa.Table.from_pandas(metadata_df, preserve_index=False)
|
|
934
948
|
except pa.ArrowInvalid as e:
|
|
935
|
-
log.
|
|
949
|
+
log.exception(INVALID_ARROW_CONVERSION_MSG)
|
|
936
950
|
raise pa.ArrowInvalid(
|
|
937
|
-
f"Error converting to Arrow format: {
|
|
951
|
+
f"Error converting to Arrow format: {e!s}"
|
|
938
952
|
) from e
|
|
939
|
-
except Exception
|
|
940
|
-
log.
|
|
953
|
+
except Exception:
|
|
954
|
+
log.exception("Unexpected error creating Arrow table")
|
|
941
955
|
raise
|
|
942
956
|
|
|
943
957
|
request_type = FlightRequestType.METADATA
|
|
944
958
|
response = None
|
|
945
959
|
with ArizeFlightClient(
|
|
946
960
|
api_key=self._sdk_config.api_key,
|
|
947
|
-
host=self._sdk_config.
|
|
948
|
-
port=self._sdk_config.
|
|
961
|
+
host=self._sdk_config.flight_host,
|
|
962
|
+
port=self._sdk_config.flight_port,
|
|
949
963
|
scheme=self._sdk_config.flight_scheme,
|
|
950
964
|
request_verify=self._sdk_config.request_verify,
|
|
951
965
|
max_chunksize=self._sdk_config.pyarrow_max_chunksize,
|
|
@@ -958,8 +972,8 @@ class SpansClient:
|
|
|
958
972
|
request_type=request_type,
|
|
959
973
|
)
|
|
960
974
|
except Exception as e:
|
|
961
|
-
msg = f"Error during update request: {
|
|
962
|
-
log.
|
|
975
|
+
msg = f"Error during update request: {e!s}"
|
|
976
|
+
log.exception(msg)
|
|
963
977
|
raise RuntimeError(msg) from e
|
|
964
978
|
|
|
965
979
|
if response is None:
|
|
@@ -987,14 +1001,23 @@ class SpansClient:
|
|
|
987
1001
|
start_time: datetime,
|
|
988
1002
|
end_time: datetime,
|
|
989
1003
|
where: str = "",
|
|
990
|
-
columns:
|
|
991
|
-
similarity_search_params: SimilaritySearchParams | None = None,
|
|
1004
|
+
columns: list | None = None,
|
|
992
1005
|
stream_chunk_size: int | None = None,
|
|
993
1006
|
) -> pd.DataFrame:
|
|
1007
|
+
"""Export span data from Arize to a pandas DataFrame.
|
|
1008
|
+
|
|
1009
|
+
Retrieves trace/span data from the specified project within a time range
|
|
1010
|
+
and returns it as a pandas DataFrame. Supports filtering with SQL-like
|
|
1011
|
+
WHERE clauses and similarity search for semantic retrieval.
|
|
1012
|
+
|
|
1013
|
+
Returns:
|
|
1014
|
+
pd.DataFrame: DataFrame containing the requested span data with columns
|
|
1015
|
+
for span metadata, attributes, events, and any custom fields.
|
|
1016
|
+
"""
|
|
994
1017
|
with ArizeFlightClient(
|
|
995
1018
|
api_key=self._sdk_config.api_key,
|
|
996
|
-
host=self._sdk_config.
|
|
997
|
-
port=self._sdk_config.
|
|
1019
|
+
host=self._sdk_config.flight_host,
|
|
1020
|
+
port=self._sdk_config.flight_port,
|
|
998
1021
|
scheme=self._sdk_config.flight_scheme,
|
|
999
1022
|
request_verify=self._sdk_config.request_verify,
|
|
1000
1023
|
max_chunksize=self._sdk_config.pyarrow_max_chunksize,
|
|
@@ -1010,26 +1033,32 @@ class SpansClient:
|
|
|
1010
1033
|
end_time=end_time,
|
|
1011
1034
|
where=where,
|
|
1012
1035
|
columns=columns,
|
|
1013
|
-
similarity_search_params=similarity_search_params,
|
|
1014
1036
|
stream_chunk_size=stream_chunk_size,
|
|
1015
1037
|
)
|
|
1016
1038
|
|
|
1017
1039
|
def export_to_parquet(
|
|
1018
1040
|
self,
|
|
1019
1041
|
*,
|
|
1042
|
+
path: str,
|
|
1020
1043
|
space_id: str,
|
|
1021
1044
|
project_name: str,
|
|
1022
1045
|
start_time: datetime,
|
|
1023
1046
|
end_time: datetime,
|
|
1024
1047
|
where: str = "",
|
|
1025
|
-
columns:
|
|
1026
|
-
similarity_search_params: SimilaritySearchParams | None = None,
|
|
1048
|
+
columns: list | None = None,
|
|
1027
1049
|
stream_chunk_size: int | None = None,
|
|
1028
|
-
) ->
|
|
1050
|
+
) -> None:
|
|
1051
|
+
"""Export span data from Arize to a Parquet file.
|
|
1052
|
+
|
|
1053
|
+
Retrieves trace/span data from the specified project within a time range
|
|
1054
|
+
and writes it directly to a Parquet file at the specified path. Supports
|
|
1055
|
+
filtering with SQL-like WHERE clauses and similarity search for semantic
|
|
1056
|
+
retrieval. Efficient for large datasets and long-term storage.
|
|
1057
|
+
"""
|
|
1029
1058
|
with ArizeFlightClient(
|
|
1030
1059
|
api_key=self._sdk_config.api_key,
|
|
1031
|
-
host=self._sdk_config.
|
|
1032
|
-
port=self._sdk_config.
|
|
1060
|
+
host=self._sdk_config.flight_host,
|
|
1061
|
+
port=self._sdk_config.flight_port,
|
|
1033
1062
|
scheme=self._sdk_config.flight_scheme,
|
|
1034
1063
|
request_verify=self._sdk_config.request_verify,
|
|
1035
1064
|
max_chunksize=self._sdk_config.pyarrow_max_chunksize,
|
|
@@ -1038,6 +1067,7 @@ class SpansClient:
|
|
|
1038
1067
|
flight_client=flight_client,
|
|
1039
1068
|
)
|
|
1040
1069
|
return exporter.export_to_parquet(
|
|
1070
|
+
path=path,
|
|
1041
1071
|
space_id=space_id,
|
|
1042
1072
|
model_id=project_name,
|
|
1043
1073
|
environment=Environments.TRACING,
|
|
@@ -1045,12 +1075,11 @@ class SpansClient:
|
|
|
1045
1075
|
end_time=end_time,
|
|
1046
1076
|
where=where,
|
|
1047
1077
|
columns=columns,
|
|
1048
|
-
similarity_search_params=similarity_search_params,
|
|
1049
1078
|
stream_chunk_size=stream_chunk_size,
|
|
1050
1079
|
)
|
|
1051
1080
|
|
|
1052
1081
|
|
|
1053
|
-
def _build_patch_document(row):
|
|
1082
|
+
def _build_patch_document(row: pd.Series) -> dict[str, object]:
|
|
1054
1083
|
# Extract and preserve metadata values with proper types
|
|
1055
1084
|
patch = {}
|
|
1056
1085
|
for key in row.index:
|
|
@@ -1070,8 +1099,11 @@ def _build_patch_document(row):
|
|
|
1070
1099
|
|
|
1071
1100
|
|
|
1072
1101
|
def _process_patch_document(
|
|
1073
|
-
metadata_df
|
|
1074
|
-
|
|
1102
|
+
metadata_df: pd.DataFrame,
|
|
1103
|
+
patch_document_column_name: str,
|
|
1104
|
+
field_patches: pd.DataFrame,
|
|
1105
|
+
row_idx: int,
|
|
1106
|
+
) -> dict[str, object]:
|
|
1075
1107
|
# Get the field patch for this row
|
|
1076
1108
|
field_patch = field_patches.iloc[row_idx]
|
|
1077
1109
|
|
|
@@ -1111,15 +1143,14 @@ def _process_patch_document(
|
|
|
1111
1143
|
explicit_patch = {}
|
|
1112
1144
|
|
|
1113
1145
|
# Merge patches - explicit patch takes precedence
|
|
1114
|
-
|
|
1115
|
-
return merged_patch
|
|
1146
|
+
return {**field_patch, **explicit_patch}
|
|
1116
1147
|
|
|
1117
1148
|
|
|
1118
1149
|
def _ensure_dict_patch(
|
|
1119
1150
|
metadata_df: pd.DataFrame,
|
|
1120
1151
|
final_patch_column: str,
|
|
1121
1152
|
row_idx: int,
|
|
1122
|
-
):
|
|
1153
|
+
) -> tuple[dict[str, object], list[str]]:
|
|
1123
1154
|
patch = metadata_df.loc[row_idx, final_patch_column]
|
|
1124
1155
|
validation_errors = []
|
|
1125
1156
|
|
|
@@ -1141,19 +1172,19 @@ def _ensure_dict_patch(
|
|
|
1141
1172
|
parsed = json.loads(patch)
|
|
1142
1173
|
if isinstance(parsed, dict):
|
|
1143
1174
|
return parsed
|
|
1144
|
-
else:
|
|
1145
|
-
error_msg = (
|
|
1146
|
-
f"Row {row_idx}: JSON must be an object/dictionary, "
|
|
1147
|
-
f"got {type(parsed).__name__}"
|
|
1148
|
-
)
|
|
1149
|
-
logger.warning(error_msg)
|
|
1150
|
-
validation_errors.append(error_msg)
|
|
1151
|
-
return {}, validation_errors # if not validate else None
|
|
1152
1175
|
except json.JSONDecodeError as e:
|
|
1153
1176
|
error_msg = f"Row {row_idx}: Invalid JSON in patch document: {e}"
|
|
1154
1177
|
logger.warning(error_msg)
|
|
1155
1178
|
validation_errors.append(error_msg)
|
|
1156
1179
|
return {}, validation_errors # if not validate else None
|
|
1180
|
+
else:
|
|
1181
|
+
error_msg = (
|
|
1182
|
+
f"Row {row_idx}: JSON must be an object/dictionary, "
|
|
1183
|
+
f"got {type(parsed).__name__}"
|
|
1184
|
+
)
|
|
1185
|
+
logger.warning(error_msg)
|
|
1186
|
+
validation_errors.append(error_msg)
|
|
1187
|
+
return {}, validation_errors # if not validate else None
|
|
1157
1188
|
|
|
1158
1189
|
# For other types, log warning
|
|
1159
1190
|
error_msg = f"Row {row_idx}: Unsupported patch type: {type(patch).__name__}"
|
|
@@ -1165,7 +1196,7 @@ def _ensure_dict_patch(
|
|
|
1165
1196
|
def _format_note_for_storage(
|
|
1166
1197
|
note_text: str,
|
|
1167
1198
|
current_time_ms: int,
|
|
1168
|
-
):
|
|
1199
|
+
) -> list[str] | None:
|
|
1169
1200
|
if pd.isna(note_text):
|
|
1170
1201
|
return None
|
|
1171
1202
|
note_obj = {
|
|
@@ -1225,9 +1256,7 @@ def _log_flight_update_summary(
|
|
|
1225
1256
|
logger.warning("Flight update response missing counts", extra=metrics)
|
|
1226
1257
|
else:
|
|
1227
1258
|
all_processed = int(spans_processed) == int(total_spans)
|
|
1228
|
-
msg =
|
|
1229
|
-
"✅ All spans processed" if all_processed else "Partial processing"
|
|
1230
|
-
)
|
|
1259
|
+
msg = "All spans processed" if all_processed else "Partial processing"
|
|
1231
1260
|
logger.info(msg, extra=metrics)
|
|
1232
1261
|
|
|
1233
1262
|
# Emit individual error lines (structured per-error, easy to aggregate)
|
|
@@ -1246,7 +1275,7 @@ def _message_to_dict(
|
|
|
1246
1275
|
msg: message.Message,
|
|
1247
1276
|
preserve_names: bool = True,
|
|
1248
1277
|
use_int_enums: bool = False,
|
|
1249
|
-
):
|
|
1278
|
+
) -> dict[str, object]:
|
|
1250
1279
|
return json_format.MessageToDict(
|
|
1251
1280
|
msg,
|
|
1252
1281
|
preserving_proto_field_name=preserve_names,
|