arize 8.0.0a13__py3-none-any.whl → 8.0.0a14__py3-none-any.whl
- arize/_exporter/client.py +18 -3
- arize/datasets/client.py +88 -83
- arize/{utils → models}/casting.py +12 -12
- arize/models/client.py +330 -5
- arize/{utils → models}/proto.py +1 -369
- arize/spans/client.py +30 -6
- arize/utils/arrow.py +4 -4
- arize/version.py +1 -1
- {arize-8.0.0a13.dist-info → arize-8.0.0a14.dist-info}/METADATA +11 -3
- {arize-8.0.0a13.dist-info → arize-8.0.0a14.dist-info}/RECORD +12 -12
- {arize-8.0.0a13.dist-info → arize-8.0.0a14.dist-info}/WHEEL +0 -0
- {arize-8.0.0a13.dist-info → arize-8.0.0a14.dist-info}/licenses/LICENSE.md +0 -0
arize/_exporter/client.py
CHANGED

@@ -20,7 +20,6 @@ from arize._generated.protocol.flight import export_pb2
 from arize.logging import CtxAdapter
 from arize.types import Environments, SimilaritySearchParams
 from arize.utils.dataframe import reset_dataframe_index
-from arize.utils.proto import get_pb_flight_doput_request

 logger = logging.getLogger(__name__)

@@ -131,7 +130,7 @@ class ArizeExportClient:
         reset_dataframe_index(df)
         return df

-    def …
+    def export_to_parquet(
         self,
         path: str,
         space_id: str,

@@ -285,7 +284,7 @@ class ArizeExportClient:
             end_time=Timestamp(seconds=int(end_time.timestamp())),
             filter_expression=where,
             similarity_search_params=(
-                …
+                _get_pb_similarity_search_params(similarity_search_params)
                 if similarity_search_params
                 else None
             ),

@@ -326,3 +325,19 @@ class ArizeExportClient:
             colour="#008000",
             unit=" row",
         )
+
+
+def _get_pb_similarity_search_params(
+    similarity_params: SimilaritySearchParams,
+) -> export_pb2.SimilaritySearchParams:
+    proto_params = export_pb2.SimilaritySearchParams()
+    proto_params.search_column_name = similarity_params.search_column_name
+    proto_params.threshold = similarity_params.threshold
+    for ref in similarity_params.references:
+        new_ref = proto_params.references.add()
+        new_ref.prediction_id = ref.prediction_id
+        new_ref.reference_column_name = ref.reference_column_name
+        if ref.prediction_timestamp:
+            new_ref.prediction_timestamp.FromDatetime(ref.prediction_timestamp)
+
+    return proto_params
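For orientation, the new module-private `_get_pb_similarity_search_params` helper is a plain attribute-to-protobuf mapping. A minimal sketch of the shape of its input, using stand-in objects rather than the real `arize.types` classes (whose constructors are not shown in this diff; all values below are invented):

from datetime import datetime, timezone
from types import SimpleNamespace

# Stand-ins with the attributes the helper reads; the real classes are
# SimilaritySearchParams and its reference entries from arize.types.
ref = SimpleNamespace(
    prediction_id="pred-123",                  # hypothetical ID
    reference_column_name="embedding_vector",  # hypothetical column
    prediction_timestamp=datetime(2025, 1, 1, tzinfo=timezone.utc),
)
params = SimpleNamespace(
    search_column_name="embedding_vector",
    threshold=0.85,
    references=[ref],
)

# proto = _get_pb_similarity_search_params(params)
# Repeated proto fields are filled via .add(), and datetimes are converted
# with google.protobuf's Timestamp.FromDatetime(), as in the diff above.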
arize/datasets/client.py
CHANGED

@@ -39,99 +39,104 @@ class DatasetsClient:
         name: str,
         space_id: str,
         examples: List[Dict[str, Any]] | pd.DataFrame,
+        force_http: bool = False,
     ):
-        if (
-            …
-            …
-        ):
-            logger.info(
-                f"Uploading {len(examples)} examples via REST may be slow. "
-                "Trying to convert to DataFrame for more efficient upload."
+        if not isinstance(examples, (list, pd.DataFrame)):
+            raise TypeError(
+                "Examples must be a list of dicts or a pandas DataFrame"
             )
-
-            try:
-                data = pd.DataFrame(examples)
-            except Exception as e:
-                logger.warning(
-                    f"Could not convert examples to DataFrame: {e}. "
-                    "Falling back to list upload, which may be less performant."
-                )
-                # If we can’t convert to a dataframe, just use the list
-                data = examples
-        else:
-            # If we have a dataframe or a small list, just use it directly
-            data = examples
-
-        if isinstance(data, list):
-            # If the data is in list format, use the REST endpoint
+        if len(examples) <= REST_LIMIT_DATASET_EXAMPLES or force_http:
             from arize._generated import api_client as gen

+            data = (
+                examples.to_dict(orient="records")
+                if isinstance(examples, pd.DataFrame)
+                else examples
+            )
+
             body = gen.DatasetsCreateRequest(
                 name=name,
                 spaceId=space_id,
                 examples=data,
             )
             return self._api.datasets_create(datasets_create_request=body)
-        … [old lines 75–92 removed; their text is not preserved in this view]
+
+        # If we have too many examples, try to convert to a dataframe
+        # and log via gRPC + flight
+        logger.info(
+            f"Uploading {len(examples)} examples via REST may be slow. "
+            "Trying to convert to DataFrame for more efficient upload via "
+            "gRPC + Flight."
+        )
+        data = (
+            pd.DataFrame(examples) if isinstance(examples, list) else examples
+        )
+        return self._create_dataset_via_flight(
+            name=name,
+            space_id=space_id,
+            examples=data,
+        )
+
+    def _create_dataset_via_flight(
+        self,
+        name: str,
+        space_id: str,
+        examples: pd.DataFrame,
+    ):
+        # Convert datetime columns to int64 (ms since epoch)
+        # TODO(Kiko): Missing validation block
+        # data = _convert_datetime_columns_to_int(data)
+        # df = self._set_default_columns_for_dataset(data)
+        # if convert_dict_to_json:
+        #     df = _convert_default_columns_to_json_str(df)
+        # df = _convert_boolean_columns_to_str(df)
+        # validation_errors = Validator.validate(df)
+        # validation_errors.extend(
+        #     Validator.validate_max_chunk_size(max_chunk_size)
+        # )
+        # if validation_errors:
+        #     raise RuntimeError(
+        #         [e.error_message() for e in validation_errors]
+        #     )
+
+        # Convert to Arrow table
+        try:
+            logger.debug("Converting data to Arrow format")
+            pa_table = pa.Table.from_pandas(examples)
+        except pa.ArrowInvalid as e:
+            logger.error(f"{INVALID_ARROW_CONVERSION_MSG}: {str(e)}")
+            raise pa.ArrowInvalid(
+                f"Error converting to Arrow format: {str(e)}"
+            ) from e
+        except Exception as e:
+            logger.error(f"Unexpected error creating Arrow table: {str(e)}")
+            raise
+
+        response = None
+        with ArizeFlightClient(
+            api_key=self._sdk_config.api_key,
+            host=self._sdk_config.flight_server_host,
+            port=self._sdk_config.flight_server_port,
+            scheme=self._sdk_config.flight_scheme,
+            request_verify=self._sdk_config.request_verify,
+        ) as flight_client:
             try:
-                … [old lines 94–98 removed; their text is not preserved in this view]
-                    f"Error converting to Arrow format: {str(e)}"
-                ) from e
+                response = flight_client.create_dataset(
+                    space_id=space_id,
+                    dataset_name=name,
+                    pa_table=pa_table,
+                )
             except Exception as e:
-                …
-                raise
-
-        response = None
-        with ArizeFlightClient(
-            api_key=self._sdk_config.api_key,
-            host=self._sdk_config.flight_server_host,
-            port=self._sdk_config.flight_server_port,
-            scheme=self._sdk_config.flight_scheme,
-            request_verify=self._sdk_config.request_verify,
-        ) as flight_client:
-            try:
-                response = flight_client.create_dataset(
-                    space_id=space_id,
-                    dataset_name=name,
-                    pa_table=pa_table,
-                )
-            except Exception as e:
-                msg = f"Error during update request: {str(e)}"
-                logger.error(msg)
-                raise RuntimeError(msg) from e
-            if response is None:
-                # This should not happen with proper Flight client implementation,
-                # but we handle it defensively
-                msg = "No response received from flight server during update"
+                msg = f"Error during update request: {str(e)}"
                 logger.error(msg)
-                raise RuntimeError(msg)
-
-                # …
-        … [old lines 131–137 removed; their text is not preserved in this view]
+                raise RuntimeError(msg) from e
+        if response is None:
+            # This should not happen with proper Flight client implementation,
+            # but we handle it defensively
+            msg = "No response received from flight server during update"
+            logger.error(msg)
+            raise RuntimeError(msg)
+        # The response from flightserver is the dataset ID. To return the dataset
+        # object we make a GET query
+        dataset = self.get(dataset_id=response)
+        return dataset
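The net effect of this rewrite is that transport is chosen by payload size rather than input type: small uploads go over REST, large ones over gRPC + Flight, and `force_http=True` pins the REST path. A hedged usage sketch, assuming the public method shown here is named `create` (its name sits above the visible hunk) and that `REST_LIMIT_DATASET_EXAMPLES` is the module-level threshold the branch compares against (its value is not shown):

import pandas as pd

client = ...  # an already-configured DatasetsClient; setup is not shown in this diff

small = [{"input": "hi", "output": "hello"}]        # a few dict examples
big = pd.DataFrame(
    {"input": ["hi"] * 50_000, "output": ["hello"] * 50_000}
)                                                   # many examples

client.create(name="ds-small", space_id="SPACE_ID", examples=small)  # REST
client.create(name="ds-big", space_id="SPACE_ID", examples=big)      # gRPC + Flight
client.create(
    name="ds-forced", space_id="SPACE_ID", examples=big, force_http=True
)  # force_http=True keeps even large uploads on the REST endpoint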
arize/{utils → models}/casting.py
RENAMED

@@ -131,28 +131,28 @@ def cast_typed_columns(
         f = getattr(schema, field_name)
         if f:
             try:
-                validate_typed_columns(field_name, f)
+                _validate_typed_columns(field_name, f)
             except InvalidTypedColumnsError:
                 raise
-            dataframe = cast_columns(dataframe, f)
+            dataframe = _cast_columns(dataframe, f)

     # Now that the dataframe values have been cast to the specified types:
     # for downstream validation to work as expected,
     # feature & tag schema field types should be List[string] of column names.
     # Since Schema is a frozen class, we must construct a new instance.
-    return dataframe, convert_schema_field_types(schema)
+    return dataframe, _convert_schema_field_types(schema)


 def cast_dictionary(d: dict) -> dict:
     cast_dict = {}
     for k, v in d.items():
         if isinstance(v, TypedValue):
-            v = cast_value(v)
+            v = _cast_value(v)
         cast_dict[k] = v
     return cast_dict


-def cast_value(
+def _cast_value(
     typed_value: TypedValue,
 ) -> Union[str, int, float, List[str], None]:
     """

@@ -224,7 +224,7 @@ def _cast_to_str(typed_value: TypedValue) -> Union[str, None]:
         raise CastingError(str(e), typed_value) from e


-def validate_typed_columns(
+def _validate_typed_columns(
     field_name: str, typed_columns: TypedColumns
 ) -> None:
     """

@@ -253,7 +253,7 @@ def validate_typed_columns(
         )


-def cast_columns(
+def _cast_columns(
     dataframe: pd.DataFrame, columns: TypedColumns
 ) -> pd.DataFrame:
     """

@@ -288,7 +288,7 @@ def cast_columns(
         # uses pd.NA for missing values (when storage arg is not configured)
         # In the future, try out pd.convert_dtypes (new in pandas 2.0):
         # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.convert_dtypes.html
-        dataframe = cast_df(dataframe, columns.to_str, "string")
+        dataframe = _cast_df(dataframe, columns.to_str, "string")
     except Exception as e:
         raise ColumnCastingError(
             error_msg=str(e),

@@ -300,7 +300,7 @@ def cast_columns(
     # see https://pandas.pydata.org/docs/reference/api/pandas.Int64Dtype.html
     # uses pd.NA for missing values
     try:
-        dataframe = cast_df(dataframe, columns.to_int, "Int64")
+        dataframe = _cast_df(dataframe, columns.to_int, "Int64")
     except Exception as e:
         raise ColumnCastingError(
             error_msg=str(e),

@@ -312,7 +312,7 @@ def cast_columns(
     # see https://pandas.pydata.org/docs/reference/api/pandas.Float64Dtype.html
     # uses pd.NA for missing values
     try:
-        dataframe = cast_df(dataframe, columns.to_float, "Float64")
+        dataframe = _cast_df(dataframe, columns.to_float, "Float64")
     except Exception as e:
         raise ColumnCastingError(
             error_msg=str(e),

@@ -323,7 +323,7 @@ def cast_columns(
     return dataframe


-def cast_df(
+def _cast_df(
     df: pd.DataFrame, cols: List[str], target_type_str: str
 ) -> pd.DataFrame:
     """

@@ -354,7 +354,7 @@ def cast_df(
     return df.astype({col: target_type_str for col in cols})


-def convert_schema_field_types(
+def _convert_schema_field_types(
     schema: Schema,
 ) -> Schema:
     """