arthur-common 1.0.1__py3-none-any.whl → 2.1.48__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only and reflects the package contents as they appear in that registry.
Potentially problematic release: this version of arthur-common might be problematic.
- arthur_common/aggregations/aggregator.py +10 -1
- arthur_common/aggregations/functions/categorical_count.py +51 -11
- arthur_common/aggregations/functions/confusion_matrix.py +122 -28
- arthur_common/aggregations/functions/inference_count.py +46 -9
- arthur_common/aggregations/functions/inference_count_by_class.py +101 -24
- arthur_common/aggregations/functions/inference_null_count.py +50 -10
- arthur_common/aggregations/functions/mean_absolute_error.py +55 -15
- arthur_common/aggregations/functions/mean_squared_error.py +55 -15
- arthur_common/aggregations/functions/multiclass_confusion_matrix.py +78 -24
- arthur_common/aggregations/functions/multiclass_inference_count_by_class.py +19 -1
- arthur_common/aggregations/functions/numeric_stats.py +46 -9
- arthur_common/aggregations/functions/numeric_sum.py +52 -12
- arthur_common/models/connectors.py +6 -1
- arthur_common/models/metrics.py +5 -9
- arthur_common/models/schema_definitions.py +2 -0
- arthur_common/tools/aggregation_analyzer.py +31 -1
- arthur_common/tools/duckdb_data_loader.py +1 -1
- {arthur_common-1.0.1.dist-info → arthur_common-2.1.48.dist-info}/METADATA +1 -4
- {arthur_common-1.0.1.dist-info → arthur_common-2.1.48.dist-info}/RECORD +20 -21
- arthur_common/__version__.py +0 -1
- {arthur_common-1.0.1.dist-info → arthur_common-2.1.48.dist-info}/WHEEL +0 -0

arthur_common/aggregations/aggregator.py

@@ -73,6 +73,15 @@ class NumericAggregationFunction(AggregationFunction, ABC):
         From there, iterate over the group turning each data point to a *Point. At the end, this single instance of the group metrics
         and the list of points (values) are merged to one *TimeSeries
         """
+        if not dim_columns:
+            return [
+                NumericAggregationFunction._dimensionless_query_results_to_numeric_metrics(
+                    data,
+                    value_col,
+                    timestamp_col,
+                ),
+            ]
+
         calculated_metrics: list[NumericTimeSeries] = []
         # make sure dropna is False or rows with "null" as a dimension value will be dropped
         groups = data.groupby(dim_columns, dropna=False)
@@ -99,7 +108,7 @@ class NumericAggregationFunction(AggregationFunction, ABC):
         return calculated_metrics

     @staticmethod
-    def
+    def _dimensionless_query_results_to_numeric_metrics(
         data: pd.DataFrame,
         value_col: str,
         timestamp_col: str,
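The net effect of the aggregator change: when no dimension columns are passed, query results are now routed through the new `_dimensionless_query_results_to_numeric_metrics` helper and collapsed into a single series instead of going through `groupby`. A minimal sketch of that routing in plain pandas, with invented return shapes rather than the library's `NumericTimeSeries` types:

```python
# Illustrative only: mirrors the routing logic, not arthur_common's real code.
import pandas as pd

def rows_to_series(data: pd.DataFrame, value_col: str, dim_columns: list[str], timestamp_col: str):
    if not dim_columns:
        # dimensionless: one series holding every (timestamp, value) point
        return [list(zip(data[timestamp_col], data[value_col]))]
    series = []
    # dropna=False keeps groups whose dimension value is null (as the diff's comment notes)
    for dims, group in data.groupby(dim_columns, dropna=False):
        series.append((dims, list(zip(group[timestamp_col], group[value_col]))))
    return series

df = pd.DataFrame({"ts": ["00:00", "00:05", "00:00"], "count": [3, 5, 2], "category": ["a", "a", "b"]})
print(rows_to_series(df, "count", [], "ts"))            # single dimensionless series
print(rows_to_series(df, "count", ["category"], "ts"))  # one series per category value
```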
arthur_common/aggregations/functions/categorical_count.py

@@ -1,4 +1,4 @@
-from typing import Annotated
+from typing import Annotated, Optional
 from uuid import UUID

 from arthur_common.aggregations.aggregator import NumericAggregationFunction
@@ -7,6 +7,7 @@ from arthur_common.models.schema_definitions import (
     DType,
     MetricColumnParameterAnnotation,
     MetricDatasetParameterAnnotation,
+    MetricMultipleColumnParameterAnnotation,
     ScalarType,
     ScopeSchemaTag,
 )
@@ -64,25 +65,64 @@ class CategoricalCountAggregationFunction(NumericAggregationFunction):
                 description="A column containing categorical values to count.",
             ),
         ],
+        segmentation_cols: Annotated[
+            Optional[list[str]],
+            MetricMultipleColumnParameterAnnotation(
+                source_dataset_parameter_key="dataset",
+                allowed_column_types=[
+                    ScalarType(dtype=DType.INT),
+                    ScalarType(dtype=DType.BOOL),
+                    ScalarType(dtype=DType.STRING),
+                    ScalarType(dtype=DType.UUID),
+                ],
+                tag_hints=[],
+                friendly_name="Segmentation Columns",
+                description="All columns to include as dimensions for segmentation.",
+                optional=True,
+            ),
+        ] = None,
     ) -> list[NumericMetric]:
+        """Executed SQL with no segmentation columns:
+        select time_bucket(INTERVAL '5 minutes', {timestamp_col_escaped}) as ts, \
+            count(*) as count, \
+            {categorical_col_escaped} as category, \
+            {categorical_col_name_escaped} as column_name \
+        from {dataset.dataset_table_name} \
+        where ts is not null \
+        group by ts, category
+        """
+        segmentation_cols = [] if not segmentation_cols else segmentation_cols
         timestamp_col_escaped = escape_identifier(timestamp_col)
         categorical_col_escaped = escape_identifier(categorical_col)
         categorical_col_name_escaped = escape_str_literal(categorical_col)
-
-
-
-
-
-
-
-
-
+
+        # build query components with segmentation columns
+        escaped_segmentation_cols = [
+            escape_identifier(col) for col in segmentation_cols
+        ]
+        all_select_clause_cols = [
+            f"time_bucket(INTERVAL '5 minutes', {timestamp_col_escaped}) as ts",
+            f"count(*) as count",
+            f"{categorical_col_escaped} as category",
+            f"{categorical_col_name_escaped} as column_name",
+        ] + escaped_segmentation_cols
+        all_group_by_cols = ["ts", "category"] + escaped_segmentation_cols
+        extra_dims = ["column_name", "category"]
+
+        # build query
+        count_query = f"""
+            select {", ".join(all_select_clause_cols)}
+            from {dataset.dataset_table_name}
+            where ts is not null
+            group by {", ".join(all_group_by_cols)}
+        """
+
         results = ddb_conn.sql(count_query).df()

         series = self.group_query_results_to_numeric_metrics(
             results,
             "count",
-
+            segmentation_cols + extra_dims,
             timestamp_col="ts",
         )
         metric = self.series_to_metric(self.METRIC_NAME, series)
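This file shows the full segmentation pattern that the rest of the release repeats: escape each segmentation column, append the escaped names to both the SELECT and GROUP BY lists, then pass the raw names plus the built-in dimensions (`column_name`, `category`) as the dimension columns. A sketch of the query assembly with stand-in quoting helpers (the real ones are `escape_identifier` and `escape_str_literal` from `duckdb_data_loader`):

```python
# Stand-in quoting helpers; assumptions, not the library's implementations.
def esc_ident(col: str) -> str:          # double-quote an identifier
    return '"' + col.replace('"', '""') + '"'

def esc_literal(s: str) -> str:          # single-quote a string literal
    return "'" + s.replace("'", "''") + "'"

def build_categorical_count_query(timestamp_col, categorical_col, segmentation_cols, table):
    seg = [esc_ident(c) for c in segmentation_cols or []]
    select_cols = [
        f"time_bucket(INTERVAL '5 minutes', {esc_ident(timestamp_col)}) as ts",
        "count(*) as count",
        f"{esc_ident(categorical_col)} as category",
        f"{esc_literal(categorical_col)} as column_name",
    ] + seg
    group_by = ["ts", "category"] + seg
    return (
        f"select {', '.join(select_cols)} from {table} "
        f"where ts is not null group by {', '.join(group_by)}"
    )

print(build_categorical_count_query("ts_col", "label", ["region"], "inferences"))
```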
arthur_common/aggregations/functions/confusion_matrix.py

@@ -1,4 +1,4 @@
-from typing import Annotated
+from typing import Annotated, Optional
 from uuid import UUID

 from arthur_common.aggregations.aggregator import NumericAggregationFunction
@@ -9,10 +9,11 @@ from arthur_common.models.schema_definitions import (
     MetricColumnParameterAnnotation,
     MetricDatasetParameterAnnotation,
     MetricLiteralParameterAnnotation,
+    MetricMultipleColumnParameterAnnotation,
     ScalarType,
     ScopeSchemaTag,
 )
-from arthur_common.tools.duckdb_data_loader import escape_identifier
+from arthur_common.tools.duckdb_data_loader import escape_identifier, escape_str_literal
 from duckdb import DuckDBPyConnection


@@ -26,6 +27,7 @@ class ConfusionMatrixAggregationFunction(NumericAggregationFunction):
         prediction_normalization_case: str,
         gt_normalization_case: str,
         dataset: DatasetReference,
+        segmentation_cols: list[str],
     ) -> list[NumericMetric]:
         """
         Generate a SQL query to compute confusion matrix metrics over time.
@@ -37,59 +39,98 @@ class ConfusionMatrixAggregationFunction(NumericAggregationFunction):
             prediction_normalization_case: SQL CASE statement for normalizing predictions to 0 / 1 / null using 'value' as the target column name
             gt_normalization_case: SQL CASE statement for normalizing ground truth values to 0 / 1 / null using 'value' as the target column name
             dataset: DatasetReference containing dataset metadata
+            segmentation_cols: list of columns to segment by

         Returns:
             str: SQL query that computes confusion matrix metrics
+        Without segmentation, this is the query:
+            WITH normalized_data AS (
+                SELECT
+                    {escaped_timestamp_col} AS timestamp,
+                    {prediction_normalization_case.replace('value', escaped_prediction_col)} AS prediction,
+                    {gt_normalization_case.replace('value', escaped_gt_values_col)} AS actual_value
+                FROM {dataset.dataset_table_name}
+                WHERE {escaped_timestamp_col} IS NOT NULL
+            )
+            SELECT
+                time_bucket(INTERVAL '5 minutes', timestamp) AS ts,
+                SUM(CASE WHEN prediction = actual_value AND actual_value = 1 THEN 1 ELSE 0 END) AS true_positive_count,
+                SUM(CASE WHEN prediction != actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS false_positive_count,
+                SUM(CASE WHEN prediction != actual_value AND actual_value = 1 THEN 1 ELSE 0 END) AS false_negative_count,
+                SUM(CASE WHEN prediction = actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS true_negative_count,
+                {escaped_prediction_col_name} as prediction_column_name
+            FROM normalized_data
+            GROUP BY ts
+            ORDER BY ts
         """
+        segmentation_cols = [] if not segmentation_cols else segmentation_cols
         escaped_timestamp_col = escape_identifier(timestamp_col)
         escaped_prediction_col = escape_identifier(prediction_col)
+        escaped_prediction_col_name = escape_str_literal(prediction_col)
         escaped_gt_values_col = escape_identifier(gt_values_col)
+        # build query components with segmentation columns
+        escaped_segmentation_cols = [
+            escape_identifier(col) for col in segmentation_cols
+        ]
+        first_subquery_select_cols = [
+            f"{escaped_timestamp_col} AS timestamp",
+            f"{prediction_normalization_case.replace('value', escaped_prediction_col)} AS prediction",
+            f"{gt_normalization_case.replace('value', escaped_gt_values_col)} AS actual_value",
+        ] + escaped_segmentation_cols
+        second_subquery_select_cols = [
+            "time_bucket(INTERVAL '5 minutes', timestamp) AS ts",
+            "SUM(CASE WHEN prediction = actual_value AND actual_value = 1 THEN 1 ELSE 0 END) AS true_positive_count",
+            "SUM(CASE WHEN prediction != actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS false_positive_count",
+            "SUM(CASE WHEN prediction != actual_value AND actual_value = 1 THEN 1 ELSE 0 END) AS false_negative_count",
+            "SUM(CASE WHEN prediction = actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS true_negative_count",
+            f"{escaped_prediction_col_name} as prediction_column_name",
+        ] + escaped_segmentation_cols
+        second_subquery_group_by_cols = ["ts"] + escaped_segmentation_cols
+        extra_dims = ["prediction_column_name"]
+
+        # build query
         confusion_matrix_query = f"""
-
-
-            {
-            {
-
-
-
-
-
-
-            SUM(CASE WHEN prediction = actual_value AND actual_value = 1 THEN 1 ELSE 0 END) AS true_positive_count,
-            SUM(CASE WHEN prediction != actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS false_positive_count,
-            SUM(CASE WHEN prediction != actual_value AND actual_value = 1 THEN 1 ELSE 0 END) AS false_negative_count,
-            SUM(CASE WHEN prediction = actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS true_negative_count
-            FROM normalized_data
-            GROUP BY ts
-            ORDER BY ts
-        """
+            WITH normalized_data AS (
+                SELECT {", ".join(first_subquery_select_cols)}
+                FROM {dataset.dataset_table_name}
+                WHERE {escaped_timestamp_col} IS NOT NULL
+            )
+            SELECT {", ".join(second_subquery_select_cols)}
+            FROM normalized_data
+            GROUP BY {", ".join(second_subquery_group_by_cols)}
+            ORDER BY ts
+        """

         results = ddb_conn.sql(confusion_matrix_query).df()

-        tp = self.
+        tp = self.group_query_results_to_numeric_metrics(
             results,
             "true_positive_count",
+            dim_columns=segmentation_cols + extra_dims,
             timestamp_col="ts",
         )
-        fp = self.
+        fp = self.group_query_results_to_numeric_metrics(
             results,
             "false_positive_count",
+            dim_columns=segmentation_cols + extra_dims,
             timestamp_col="ts",
         )
-        fn = self.
+        fn = self.group_query_results_to_numeric_metrics(
             results,
             "false_negative_count",
+            dim_columns=segmentation_cols + extra_dims,
             timestamp_col="ts",
         )
-        tn = self.
+        tn = self.group_query_results_to_numeric_metrics(
             results,
             "true_negative_count",
+            dim_columns=segmentation_cols + extra_dims,
             timestamp_col="ts",
         )
-        tp_metric = self.series_to_metric("confusion_matrix_true_positive_count",
-        fp_metric = self.series_to_metric("confusion_matrix_false_positive_count",
-        fn_metric = self.series_to_metric("confusion_matrix_false_negative_count",
-        tn_metric = self.series_to_metric("confusion_matrix_true_negative_count",
+        tp_metric = self.series_to_metric("confusion_matrix_true_positive_count", tp)
+        fp_metric = self.series_to_metric("confusion_matrix_false_positive_count", fp)
+        fn_metric = self.series_to_metric("confusion_matrix_false_negative_count", fn)
+        tn_metric = self.series_to_metric("confusion_matrix_true_negative_count", tn)
         return [tp_metric, fp_metric, fn_metric, tn_metric]

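The rebuilt confusion-matrix query keeps its two-stage shape: a CTE first normalizes predictions and ground truth to 0/1 through caller-supplied CASE expressions (with the literal word `value` as the placeholder column name), then an outer query buckets by time and sums the four cells. A runnable toy version against an in-memory DuckDB table; the table, columns, and labels here are invented:

```python
import duckdb

con = duckdb.connect()
con.sql("""
    CREATE TABLE inferences AS SELECT * FROM (VALUES
        (TIMESTAMP '2024-01-01 00:01:00', 'spam', 'spam'),
        (TIMESTAMP '2024-01-01 00:02:00', 'spam', 'ham'),
        (TIMESTAMP '2024-01-01 00:03:00', 'ham', 'spam')
    ) AS t(ts, pred, gt)
""")
# normalization CASE with 'value' as the placeholder, as in the diff
case = "CASE WHEN value = 'spam' THEN 1 WHEN value = 'ham' THEN 0 ELSE NULL END"
query = f"""
    WITH normalized_data AS (
        SELECT ts AS timestamp,
               {case.replace('value', 'pred')} AS prediction,
               {case.replace('value', 'gt')} AS actual_value
        FROM inferences
        WHERE ts IS NOT NULL
    )
    SELECT time_bucket(INTERVAL '5 minutes', timestamp) AS ts,
           SUM(CASE WHEN prediction = actual_value AND actual_value = 1 THEN 1 ELSE 0 END) AS true_positive_count,
           SUM(CASE WHEN prediction != actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS false_positive_count
    FROM normalized_data
    GROUP BY ts
    ORDER BY ts
"""
print(con.sql(query).df())  # one row per 5-minute bucket with TP/FP counts
```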
@@ -157,7 +198,24 @@ class BinaryClassifierIntBoolConfusionMatrixAggregationFunction(
                 description="A column containing boolean or integer ground truth values.",
             ),
         ],
+        segmentation_cols: Annotated[
+            Optional[list[str]],
+            MetricMultipleColumnParameterAnnotation(
+                source_dataset_parameter_key="dataset",
+                allowed_column_types=[
+                    ScalarType(dtype=DType.INT),
+                    ScalarType(dtype=DType.BOOL),
+                    ScalarType(dtype=DType.STRING),
+                    ScalarType(dtype=DType.UUID),
+                ],
+                tag_hints=[],
+                friendly_name="Segmentation Columns",
+                description="All columns to include as dimensions for segmentation.",
+                optional=True,
+            ),
+        ] = None,
     ) -> list[NumericMetric]:
+        segmentation_cols = [] if not segmentation_cols else segmentation_cols
         escaped_prediction_col = escape_identifier(prediction_col)
         # Get the type of prediction column
         type_query = f"SELECT typeof({escaped_prediction_col}) as col_type FROM {dataset.dataset_table_name} LIMIT 1"
@@ -194,6 +252,7 @@ class BinaryClassifierIntBoolConfusionMatrixAggregationFunction(
             normalization_case,
             normalization_case,
             dataset,
+            segmentation_cols,
         )


@@ -275,7 +334,24 @@ class BinaryClassifierStringLabelConfusionMatrixAggregationFunction(
                 description="The label indicating a negative classification to normalize to 0.",
             ),
         ],
+        segmentation_cols: Annotated[
+            Optional[list[str]],
+            MetricMultipleColumnParameterAnnotation(
+                source_dataset_parameter_key="dataset",
+                allowed_column_types=[
+                    ScalarType(dtype=DType.INT),
+                    ScalarType(dtype=DType.BOOL),
+                    ScalarType(dtype=DType.STRING),
+                    ScalarType(dtype=DType.UUID),
+                ],
+                tag_hints=[],
+                friendly_name="Segmentation Columns",
+                description="All columns to include as dimensions for segmentation.",
+                optional=True,
+            ),
+        ] = None,
     ) -> list[NumericMetric]:
+        segmentation_cols = [] if not segmentation_cols else segmentation_cols
         normalization_case = f"""
             CASE
                 WHEN value = '{true_label}' THEN 1
@@ -291,6 +367,7 @@ class BinaryClassifierStringLabelConfusionMatrixAggregationFunction(
             normalization_case,
             normalization_case,
             dataset,
+            segmentation_cols,
         )


@@ -365,6 +442,22 @@ class BinaryClassifierProbabilityThresholdConfusionMatrixAggregationFunction(
                 description="The threshold to classify predictions to 0 or 1.",
             ),
         ],
+        segmentation_cols: Annotated[
+            Optional[list[str]],
+            MetricMultipleColumnParameterAnnotation(
+                source_dataset_parameter_key="dataset",
+                allowed_column_types=[
+                    ScalarType(dtype=DType.INT),
+                    ScalarType(dtype=DType.BOOL),
+                    ScalarType(dtype=DType.STRING),
+                    ScalarType(dtype=DType.UUID),
+                ],
+                tag_hints=[],
+                friendly_name="Segmentation Columns",
+                description="All columns to include as dimensions for segmentation.",
+                optional=True,
+            ),
+        ] = None,
     ) -> list[NumericMetric]:
         escaped_gt_values_col = escape_identifier(gt_values_col)
         prediction_normalization_case = f"""
@@ -409,4 +502,5 @@ class BinaryClassifierProbabilityThresholdConfusionMatrixAggregationFunction(
             prediction_normalization_case,
             gt_normalization_case,
             dataset,
+            segmentation_cols,
         )
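Each of the three subclasses gains an identical 16-line `segmentation_cols` parameter: an `Optional[list[str]]` wrapped in `typing.Annotated` with a `MetricMultipleColumnParameterAnnotation` carrying the allowed column types and UI metadata, defaulting to `None`. Presumably the updated `aggregation_analyzer.py` (also touched in this release) discovers parameters by introspecting these annotations. A sketch of how such introspection can work, using a stand-in dataclass for the real annotation type:

```python
from dataclasses import dataclass
from typing import Annotated, Optional, get_args, get_origin, get_type_hints

@dataclass
class MultipleColumnParam:  # stand-in for MetricMultipleColumnParameterAnnotation
    friendly_name: str
    optional: bool = False

def aggregate(
    timestamp_col: str,
    segmentation_cols: Annotated[
        Optional[list[str]],
        MultipleColumnParam(friendly_name="Segmentation Columns", optional=True),
    ] = None,
) -> None: ...

# include_extras=True preserves the Annotated metadata for introspection
for name, hint in get_type_hints(aggregate, include_extras=True).items():
    if get_origin(hint) is Annotated:
        inner_type, *metadata = get_args(hint)
        print(name, "->", metadata[0])
```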
arthur_common/aggregations/functions/inference_count.py

@@ -1,4 +1,4 @@
-from typing import Annotated
+from typing import Annotated, Optional
 from uuid import UUID

 from arthur_common.aggregations.aggregator import NumericAggregationFunction
@@ -7,6 +7,7 @@ from arthur_common.models.schema_definitions import (
     DType,
     MetricColumnParameterAnnotation,
     MetricDatasetParameterAnnotation,
+    MetricMultipleColumnParameterAnnotation,
     ScalarType,
     ScopeSchemaTag,
 )
@@ -51,19 +52,55 @@ class InferenceCountAggregationFunction(NumericAggregationFunction):
                 description="A column containing timestamp values to bucket by.",
             ),
         ],
+        segmentation_cols: Annotated[
+            Optional[list[str]],
+            MetricMultipleColumnParameterAnnotation(
+                source_dataset_parameter_key="dataset",
+                allowed_column_types=[
+                    ScalarType(dtype=DType.INT),
+                    ScalarType(dtype=DType.BOOL),
+                    ScalarType(dtype=DType.STRING),
+                    ScalarType(dtype=DType.UUID),
+                ],
+                tag_hints=[],
+                friendly_name="Segmentation Columns",
+                description="All columns to include as dimensions for segmentation.",
+                optional=True,
+            ),
+        ] = None,
     ) -> list[NumericMetric]:
-
-        count_query = f" \
+        """Executed SQL with no segmentation columns:
         select time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts, \
-
-
-
-            "
+            count(*) as count \
+        from {dataset.dataset_table_name} \
+        group by ts \
+        """
+        segmentation_cols = [] if not segmentation_cols else segmentation_cols
+        escaped_timestamp_col = escape_identifier(timestamp_col)
+
+        # build query components with segmentation columns
+        escaped_segmentation_cols = [
+            escape_identifier(col) for col in segmentation_cols
+        ]
+        all_select_clause_cols = [
+            f"time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts",
+            f"count(*) as count",
+        ] + escaped_segmentation_cols
+        all_group_by_cols = ["ts"] + escaped_segmentation_cols
+
+        # build query
+        count_query = f"""
+            select {", ".join(all_select_clause_cols)}
+            from {dataset.dataset_table_name}
+            group by {", ".join(all_group_by_cols)}
+        """
+
         results = ddb_conn.sql(count_query).df()
-        series = self.
+        series = self.group_query_results_to_numeric_metrics(
             results,
             "count",
+            segmentation_cols,
             "ts",
         )
-        metric = self.series_to_metric(self.METRIC_NAME,
+        metric = self.series_to_metric(self.METRIC_NAME, series)
         return [metric]
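The inference-count function is the simplest instance of the recipe: bucket timestamps, count rows, and group by whatever segmentation columns were chosen. A runnable toy version in DuckDB with an invented table and column names:

```python
import duckdb

con = duckdb.connect()
con.sql("""
    CREATE TABLE inferences AS SELECT * FROM (VALUES
        (TIMESTAMP '2024-01-01 00:01:00', 'us'),
        (TIMESTAMP '2024-01-01 00:02:00', 'us'),
        (TIMESTAMP '2024-01-01 00:03:00', 'eu')
    ) AS t(created_at, region)
""")
segmentation_cols = ["region"]
select_cols = [
    "time_bucket(INTERVAL '5 minutes', created_at) as ts",
    "count(*) as count",
] + segmentation_cols
group_by = ["ts"] + segmentation_cols
query = f"select {', '.join(select_cols)} from inferences group by {', '.join(group_by)}"
print(con.sql(query).df())  # one count per (5-minute bucket, region)
```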
arthur_common/aggregations/functions/inference_count_by_class.py

@@ -1,4 +1,4 @@
-from typing import Annotated
+from typing import Annotated, Optional
 from uuid import UUID

 from arthur_common.aggregations.aggregator import NumericAggregationFunction
@@ -9,6 +9,7 @@ from arthur_common.models.schema_definitions import (
     MetricColumnParameterAnnotation,
     MetricDatasetParameterAnnotation,
     MetricLiteralParameterAnnotation,
+    MetricMultipleColumnParameterAnnotation,
     ScalarType,
     ScopeSchemaTag,
 )
@@ -70,29 +71,66 @@ class BinaryClassifierCountByClassAggregationFunction(NumericAggregationFunction
                 description="A column containing boolean, integer, or string labelled prediction values.",
             ),
         ],
+        segmentation_cols: Annotated[
+            Optional[list[str]],
+            MetricMultipleColumnParameterAnnotation(
+                source_dataset_parameter_key="dataset",
+                allowed_column_types=[
+                    ScalarType(dtype=DType.INT),
+                    ScalarType(dtype=DType.BOOL),
+                    ScalarType(dtype=DType.STRING),
+                    ScalarType(dtype=DType.UUID),
+                ],
+                tag_hints=[],
+                friendly_name="Segmentation Columns",
+                description="All columns to include as dimensions for segmentation.",
+                optional=True,
+            ),
+        ] = None,
     ) -> list[NumericMetric]:
+        """Executed SQL with no segmentation columns:
+        SELECT
+            time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts,
+            {escaped_pred_col} as prediction,
+            COUNT(*) as count
+        FROM {dataset.dataset_table_name}
+        GROUP BY
+            ts,
+            -- group by raw column name instead of alias in select
+            -- in case table has a column called 'prediction'
+            {escaped_pred_col}
+        ORDER BY ts
+        """
+        segmentation_cols = [] if not segmentation_cols else segmentation_cols
         escaped_timestamp_col = escape_identifier(timestamp_col)
         escaped_pred_col = escape_identifier(prediction_col)
+
+        # build query components with segmentation columns
+        escaped_segmentation_cols = [
+            escape_identifier(col) for col in segmentation_cols
+        ]
+        all_select_clause_cols = [
+            f"time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts",
+            f"{escaped_pred_col} as prediction",
+            f"COUNT(*) as count",
+        ] + escaped_segmentation_cols
+        all_group_by_cols = ["ts", f"{escaped_pred_col}"] + escaped_segmentation_cols
+        extra_dims = ["prediction"]
+
+        # build query
         query = f"""
-
-
-
-
-
-            GROUP BY
-            ts,
-            -- group by raw column name instead of alias in select
-            -- in case table has a column called 'prediction'
-            {escaped_pred_col}
-            ORDER BY ts
-        """
+            SELECT {", ".join(all_select_clause_cols)}
+            FROM {dataset.dataset_table_name}
+            GROUP BY {", ".join(all_group_by_cols)}
+            ORDER BY ts
+        """

         result = ddb_conn.sql(query).df()

         series = self.group_query_results_to_numeric_metrics(
             result,
             "count",
-
+            segmentation_cols + extra_dims,
             "ts",
         )
         metric = self.series_to_metric(self._metric_name(), series)
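Note the comment preserved in the rebuilt query: GROUP BY lists the raw (escaped) prediction column rather than its `prediction` alias, because a table column literally named `prediction` would shadow the alias during GROUP BY resolution. A small DuckDB illustration of the safe form, on invented data:

```python
import duckdb

con = duckdb.connect()
# The table deliberately owns a column named 'prediction', the collision the
# diff's comment guards against.
con.sql("""
    CREATE TABLE preds AS SELECT * FROM (VALUES
        ('a', true), ('b', true), ('b', false)
    ) AS t(prediction, label)
""")
print(con.sql("""
    SELECT label AS prediction, COUNT(*) AS count
    FROM preds
    GROUP BY label  -- raw column, never the shadowable alias
""").df())
```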
@@ -177,20 +215,59 @@ class BinaryClassifierCountThresholdClassAggregationFunction(
                 description="The label denoting a negative classification.",
             ),
         ],
+        segmentation_cols: Annotated[
+            Optional[list[str]],
+            MetricMultipleColumnParameterAnnotation(
+                source_dataset_parameter_key="dataset",
+                allowed_column_types=[
+                    ScalarType(dtype=DType.INT),
+                    ScalarType(dtype=DType.BOOL),
+                    ScalarType(dtype=DType.STRING),
+                    ScalarType(dtype=DType.UUID),
+                ],
+                tag_hints=[],
+                friendly_name="Segmentation Columns",
+                description="All columns to include as dimensions for segmentation.",
+                optional=True,
+            ),
+        ] = None,
     ) -> list[NumericMetric]:
+        """Executed SQL with no segmentation columns:
+        SELECT
+            time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts,
+            CASE WHEN {escaped_prediction_col} >= {threshold} THEN '{true_label}' ELSE '{false_label}' END as prediction,
+            COUNT(*) as count
+        FROM {dataset.dataset_table_name}
+        GROUP BY
+            ts,
+            -- group by raw column name instead of alias in select
+            -- in case table has a column called 'prediction'
+            {escaped_prediction_col}
+        ORDER BY ts
+        """
+        segmentation_cols = [] if not segmentation_cols else segmentation_cols
         escaped_timestamp_col = escape_identifier(timestamp_col)
         escaped_prediction_col = escape_identifier(prediction_col)
+
+        # build query components with segmentation columns
+        escaped_segmentation_cols = [
+            escape_identifier(col) for col in segmentation_cols
+        ]
+        all_select_clause_cols = [
+            f"time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts",
+            f"CASE WHEN {escaped_prediction_col} >= {threshold} THEN '{true_label}' ELSE '{false_label}' END as prediction",
+            f"COUNT(*) as count",
+        ] + escaped_segmentation_cols
+        all_group_by_cols = [
+            "ts",
+            f"{escaped_prediction_col}",
+        ] + escaped_segmentation_cols
+        extra_dims = ["prediction"]
+
         query = f"""
-        SELECT
-            time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts,
-            CASE WHEN {escaped_prediction_col} >= {threshold} THEN '{true_label}' ELSE '{false_label}' END as prediction,
-            COUNT(*) as count
+            SELECT {", ".join(all_select_clause_cols)}
         FROM {dataset.dataset_table_name}
-        GROUP BY
-            ts,
-            -- group by raw column name instead of alias in select
-            -- in case table has a column called 'prediction'
-            {escaped_prediction_col}
+            GROUP BY {", ".join(all_group_by_cols)}
         ORDER BY ts
         """

@@ -199,7 +276,7 @@ class BinaryClassifierCountThresholdClassAggregationFunction(
         series = self.group_query_results_to_numeric_metrics(
             result,
             "count",
-
+            segmentation_cols + extra_dims,
             "ts",
         )
         metric = self.series_to_metric(self._metric_name(), series)
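The threshold variant folds score-to-label normalization into the SELECT list as a CASE expression before counting. A hedged sketch on made-up data; unlike the diff, it groups by the `prediction` alias (safe here because no table column shadows it):

```python
import duckdb

con = duckdb.connect()
con.sql("""
    CREATE TABLE scores AS SELECT * FROM (VALUES
        (TIMESTAMP '2024-01-01 00:00:30', 0.91, 'eu'),
        (TIMESTAMP '2024-01-01 00:01:30', 0.12, 'us'),
        (TIMESTAMP '2024-01-01 00:02:30', 0.55, 'us')
    ) AS t(created_at, score, region)
""")
threshold, true_label, false_label = 0.5, "positive", "negative"
seg = ["region"]
select_cols = [
    "time_bucket(INTERVAL '5 minutes', created_at) as ts",
    f"CASE WHEN score >= {threshold} THEN '{true_label}' ELSE '{false_label}' END as prediction",
    "count(*) as count",
] + seg
query = f"""
    SELECT {', '.join(select_cols)}
    FROM scores
    GROUP BY {', '.join(['ts', 'prediction'] + seg)}
    ORDER BY ts
"""
print(con.sql(query).df())  # counts per (bucket, class label, region)
```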