arthur-common 2.1.68__tar.gz → 2.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of arthur-common might be problematic.
- {arthur_common-2.1.68 → arthur_common-2.3.0}/PKG-INFO +1 -1
- {arthur_common-2.1.68 → arthur_common-2.3.0}/pyproject.toml +1 -1
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/functions/categorical_count.py +13 -16
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/functions/confusion_matrix.py +24 -29
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/functions/inference_count.py +7 -10
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/functions/inference_count_by_class.py +20 -28
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/functions/inference_null_count.py +10 -14
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/functions/mean_absolute_error.py +14 -19
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/functions/mean_squared_error.py +14 -19
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/functions/multiclass_confusion_matrix.py +18 -23
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/functions/numeric_stats.py +13 -16
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/functions/numeric_sum.py +12 -16
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/models/enums.py +11 -3
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/models/request_schemas.py +263 -8
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/tools/duckdb_data_loader.py +30 -0
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/tools/duckdb_utils.py +3 -5
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/tools/schema_inferer.py +1 -1
- {arthur_common-2.1.68 → arthur_common-2.3.0}/README.md +0 -0
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/__init__.py +0 -0
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/__init__.py +0 -0
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/aggregator.py +0 -0
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/functions/README.md +0 -0
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/functions/__init__.py +0 -0
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/functions/agentic_aggregations.py +0 -0
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/functions/multiclass_inference_count_by_class.py +0 -0
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/functions/py.typed +0 -0
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/functions/shield_aggregations.py +0 -0
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/py.typed +0 -0
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/config/__init__.py +0 -0
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/config/config.py +0 -0
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/config/settings.yaml +0 -0
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/models/__init__.py +0 -0
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/models/common_schemas.py +0 -0
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/models/connectors.py +0 -0
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/models/constants.py +0 -0
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/models/datasets.py +0 -0
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/models/metric_schemas.py +0 -0
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/models/metrics.py +0 -0
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/models/py.typed +0 -0
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/models/response_schemas.py +0 -0
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/models/schema_definitions.py +0 -0
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/models/task_job_specs.py +0 -0
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/py.typed +0 -0
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/tools/__init__.py +0 -0
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/tools/aggregation_analyzer.py +0 -0
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/tools/aggregation_loader.py +0 -0
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/tools/functions.py +0 -0
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/tools/py.typed +0 -0
- {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/tools/time_utils.py +0 -0
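The substantive change repeated across every aggregation function below is the same: the functions no longer escape the column identifiers they receive (callers are now expected to pass identifiers that are already escaped), and they instead call unescape_identifier when turning column names back into human-readable metric dimensions, plus escape_str_literal where a column name is embedded in SQL as a string literal. The helper bodies live in src/arthur_common/tools/duckdb_data_loader.py, whose 30 added lines are not expanded in this view; the following is only a sketch of conventional DuckDB-style implementations matching the imported names, not the package's actual code.

    # Hypothetical implementations of the duckdb_data_loader helpers, assuming
    # DuckDB's standard double-quote identifier convention. The real bodies are
    # not shown in this diff.

    def escape_identifier(name: str) -> str:
        # Wrap a raw column name in double quotes, doubling embedded quotes.
        return '"' + name.replace('"', '""') + '"'

    def unescape_identifier(identifier: str) -> str:
        # Inverse of escape_identifier; pass through names that are not quoted.
        if identifier.startswith('"') and identifier.endswith('"'):
            return identifier[1:-1].replace('""', '"')
        return identifier

    def escape_str_literal(value: str) -> str:
        # Render a Python string as a single-quoted SQL string literal.
        return "'" + value.replace("'", "''") + "'"

    # Round trip under the assumed convention:
    assert unescape_identifier(escape_identifier('my "weird" col')) == 'my "weird" col'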
src/arthur_common/aggregations/functions/categorical_count.py

@@ -18,7 +18,8 @@ from arthur_common.models.schema_definitions import (
     ScalarType,
     ScopeSchemaTag,
 )
-
+
+from arthur_common.tools.duckdb_data_loader import unescape_identifier, escape_str_literal
 
 
 class CategoricalCountAggregationFunction(NumericAggregationFunction):
@@ -93,30 +94,25 @@ class CategoricalCountAggregationFunction(NumericAggregationFunction):
         ] = None,
     ) -> list[NumericMetric]:
         """Executed SQL with no segmentation columns:
-        select time_bucket(INTERVAL '5 minutes', {
+        select time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts, \
             count(*) as count, \
-            {
-            {
+            {categorical_col} as category, \
+            {categorical_col_name_unescaped} as column_name \
         from {dataset.dataset_table_name} \
         where ts is not null \
         group by ts, category
         """
         segmentation_cols = [] if not segmentation_cols else segmentation_cols
-
-        categorical_col_escaped = escape_identifier(categorical_col)
-        categorical_col_name_escaped = escape_str_literal(categorical_col)
+        categorical_col_name_unescaped = escape_str_literal(unescape_identifier(categorical_col))
 
         # build query components with segmentation columns
-        escaped_segmentation_cols = [
-            escape_identifier(col) for col in segmentation_cols
-        ]
         all_select_clause_cols = [
-            f"time_bucket(INTERVAL '5 minutes', {
+            f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
             f"count(*) as count",
-            f"{
-            f"{
-        ] +
-        all_group_by_cols = ["ts", "category"] +
+            f"{categorical_col} as category",
+            f"{categorical_col_name_unescaped} as column_name",
+        ] + segmentation_cols
+        all_group_by_cols = ["ts", "category"] + segmentation_cols
         extra_dims = ["column_name", "category"]
 
         # build query
@@ -129,10 +125,11 @@ class CategoricalCountAggregationFunction(NumericAggregationFunction):
 
         results = ddb_conn.sql(count_query).df()
 
+        unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
         series = self.group_query_results_to_numeric_metrics(
             results,
             "count",
-
+            unescaped_segmentation_cols + extra_dims,
             timestamp_col="ts",
         )
         metric = self.series_to_metric(self.METRIC_NAME, series)
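To see the query shape this file now builds, here is a self-contained toy run. The table and column names are invented for the example; only the time_bucket bucketing and the group-by shape come from the diff above. Note that DuckDB permits referencing the select alias ts in the where and group by clauses, which is what the docstring's query relies on.

    import duckdb

    conn = duckdb.connect()
    conn.sql("""
        CREATE TABLE inferences AS SELECT * FROM (VALUES
            (TIMESTAMP '2024-01-01 00:01:00', 'cat'),
            (TIMESTAMP '2024-01-01 00:02:00', 'dog'),
            (TIMESTAMP '2024-01-01 00:07:00', 'cat')
        ) AS t(event_ts, label)
    """)
    # Same shape as the categorical count query: 5-minute buckets, one row
    # per (bucket, category), plus the column name carried as a dimension.
    print(conn.sql("""
        select time_bucket(INTERVAL '5 minutes', event_ts) as ts,
               count(*) as count,
               label as category,
               'label' as column_name
        from inferences
        where ts is not null
        group by ts, category
    """).df())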
src/arthur_common/aggregations/functions/confusion_matrix.py

@@ -20,7 +20,8 @@ from arthur_common.models.schema_definitions import (
     ScalarType,
     ScopeSchemaTag,
 )
-
+
+from arthur_common.tools.duckdb_data_loader import unescape_identifier, escape_str_literal
 
 
 class ConfusionMatrixAggregationFunction(NumericAggregationFunction):
@@ -78,11 +79,11 @@ class ConfusionMatrixAggregationFunction(NumericAggregationFunction):
         Without segmentation, this is the query:
         WITH normalized_data AS (
             SELECT
-                {
-                {prediction_normalization_case.replace('value',
-                {gt_normalization_case.replace('value',
+                {timestamp_col} AS timestamp,
+                {prediction_normalization_case.replace('value', prediction_col)} AS prediction,
+                {gt_normalization_case.replace('value', gt_values_col)} AS actual_value
             FROM {dataset.dataset_table_name}
-            WHERE {
+            WHERE {timestamp_col} IS NOT NULL
         )
         SELECT
             time_bucket(INTERVAL '5 minutes', timestamp) AS ts,
@@ -90,34 +91,29 @@ class ConfusionMatrixAggregationFunction(NumericAggregationFunction):
             SUM(CASE WHEN prediction != actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS false_positive_count,
             SUM(CASE WHEN prediction != actual_value AND actual_value = 1 THEN 1 ELSE 0 END) AS false_negative_count,
             SUM(CASE WHEN prediction = actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS true_negative_count,
-            {
+            {unescaped_prediction_col_name} as prediction_column_name
         FROM normalized_data
         GROUP BY ts
         ORDER BY ts
         """
         segmentation_cols = [] if not segmentation_cols else segmentation_cols
-
-
-        escaped_prediction_col_name = escape_str_literal(prediction_col)
-        escaped_gt_values_col = escape_identifier(gt_values_col)
+        unescaped_prediction_col_name = escape_str_literal(unescape_identifier(prediction_col))
+
         # build query components with segmentation columns
-        escaped_segmentation_cols = [
-            escape_identifier(col) for col in segmentation_cols
-        ]
         first_subquery_select_cols = [
-            f"{
-            f"{prediction_normalization_case.replace('value',
-            f"{gt_normalization_case.replace('value',
-        ] +
+            f"{timestamp_col} AS timestamp",
+            f"{prediction_normalization_case.replace('value', prediction_col)} AS prediction",
+            f"{gt_normalization_case.replace('value', gt_values_col)} AS actual_value",
+        ] + segmentation_cols
         second_subquery_select_cols = [
             "time_bucket(INTERVAL '5 minutes', timestamp) AS ts",
             "SUM(CASE WHEN prediction = actual_value AND actual_value = 1 THEN 1 ELSE 0 END) AS true_positive_count",
             "SUM(CASE WHEN prediction != actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS false_positive_count",
             "SUM(CASE WHEN prediction != actual_value AND actual_value = 1 THEN 1 ELSE 0 END) AS false_negative_count",
             "SUM(CASE WHEN prediction = actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS true_negative_count",
-            f"{
-        ] +
-        second_subquery_group_by_cols = ["ts"] +
+            f"{unescaped_prediction_col_name} as prediction_column_name",
+        ] + segmentation_cols
+        second_subquery_group_by_cols = ["ts"] + segmentation_cols
         extra_dims = ["prediction_column_name"]
 
         # build query
@@ -125,7 +121,7 @@ class ConfusionMatrixAggregationFunction(NumericAggregationFunction):
         WITH normalized_data AS (
             SELECT {", ".join(first_subquery_select_cols)}
             FROM {dataset.dataset_table_name}
-            WHERE {
+            WHERE {timestamp_col} IS NOT NULL
         )
         SELECT {", ".join(second_subquery_select_cols)}
         FROM normalized_data
@@ -135,28 +131,29 @@ class ConfusionMatrixAggregationFunction(NumericAggregationFunction):
 
         results = ddb_conn.sql(confusion_matrix_query).df()
 
+        unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
         tp = self.group_query_results_to_numeric_metrics(
             results,
             "true_positive_count",
-            dim_columns=
+            dim_columns=unescaped_segmentation_cols + extra_dims,
             timestamp_col="ts",
         )
         fp = self.group_query_results_to_numeric_metrics(
             results,
             "false_positive_count",
-            dim_columns=
+            dim_columns=unescaped_segmentation_cols + extra_dims,
             timestamp_col="ts",
         )
         fn = self.group_query_results_to_numeric_metrics(
             results,
             "false_negative_count",
-            dim_columns=
+            dim_columns=unescaped_segmentation_cols + extra_dims,
             timestamp_col="ts",
         )
         tn = self.group_query_results_to_numeric_metrics(
             results,
             "true_negative_count",
-            dim_columns=
+            dim_columns=unescaped_segmentation_cols + extra_dims,
             timestamp_col="ts",
         )
         tp_metric = self.series_to_metric(self.TRUE_POSITIVE_METRIC_NAME, tp)
@@ -243,9 +240,8 @@ class BinaryClassifierIntBoolConfusionMatrixAggregationFunction(
         ] = None,
     ) -> list[NumericMetric]:
         segmentation_cols = [] if not segmentation_cols else segmentation_cols
-        escaped_prediction_col = escape_identifier(prediction_col)
         # Get the type of prediction column
-        type_query = f"SELECT typeof({
+        type_query = f"SELECT typeof({prediction_col}) as col_type FROM {dataset.dataset_table_name} LIMIT 1"
         res = ddb_conn.sql(type_query).fetchone()
         # As long as this column exists, we should be able to get the type. This is here to make mypy happy.
         if not res:
@@ -476,7 +472,6 @@ class BinaryClassifierProbabilityThresholdConfusionMatrixAggregationFunction(
             ),
         ] = None,
     ) -> list[NumericMetric]:
-        escaped_gt_values_col = escape_identifier(gt_values_col)
         prediction_normalization_case = f"""
         CASE
             WHEN value >= {threshold} THEN 1
@@ -485,7 +480,7 @@ class BinaryClassifierProbabilityThresholdConfusionMatrixAggregationFunction(
             END
         """
 
-        type_query = f"SELECT typeof({
+        type_query = f"SELECT typeof({gt_values_col}) as col_type FROM {dataset.dataset_table_name} LIMIT 1"
         res = ddb_conn.sql(type_query).fetchone()
         # As long as this column exists, we should be able to get the type. This is here to make mypy happy.
         if not res:
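The two-stage shape used here (normalize the prediction and ground truth, then SUM over CASE comparisons) is easy to sanity-check in isolation. Below is a self-contained sketch with invented data, where a fixed 0.5 threshold stands in for the normalization cases the class builds dynamically:

    import duckdb

    conn = duckdb.connect()
    conn.sql("""
        CREATE TABLE preds AS SELECT * FROM (VALUES
            (TIMESTAMP '2024-01-01 00:00:30', 0.9, 1),
            (TIMESTAMP '2024-01-01 00:01:30', 0.2, 0),
            (TIMESTAMP '2024-01-01 00:02:30', 0.7, 0),
            (TIMESTAMP '2024-01-01 00:03:30', 0.1, 1)
        ) AS t(event_ts, score, label)
    """)
    print(conn.sql("""
        WITH normalized_data AS (
            SELECT event_ts AS timestamp,
                   CASE WHEN score >= 0.5 THEN 1 ELSE 0 END AS prediction,
                   label AS actual_value
            FROM preds
            WHERE event_ts IS NOT NULL
        )
        SELECT time_bucket(INTERVAL '5 minutes', timestamp) AS ts,
               SUM(CASE WHEN prediction = actual_value AND actual_value = 1 THEN 1 ELSE 0 END) AS true_positive_count,
               SUM(CASE WHEN prediction != actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS false_positive_count,
               SUM(CASE WHEN prediction != actual_value AND actual_value = 1 THEN 1 ELSE 0 END) AS false_negative_count,
               SUM(CASE WHEN prediction = actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS true_negative_count
        FROM normalized_data
        GROUP BY ts ORDER BY ts
    """).df())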
src/arthur_common/aggregations/functions/inference_count.py

@@ -18,7 +18,7 @@ from arthur_common.models.schema_definitions import (
     ScalarType,
     ScopeSchemaTag,
 )
-from arthur_common.tools.duckdb_data_loader import
+from arthur_common.tools.duckdb_data_loader import unescape_identifier
 
 
 class InferenceCountAggregationFunction(NumericAggregationFunction):
@@ -80,23 +80,19 @@ class InferenceCountAggregationFunction(NumericAggregationFunction):
         ] = None,
     ) -> list[NumericMetric]:
         """Executed SQL with no segmentation columns:
-        select time_bucket(INTERVAL '5 minutes', {
+        select time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts, \
             count(*) as count \
         from {dataset.dataset_table_name} \
         group by ts \
         """
         segmentation_cols = [] if not segmentation_cols else segmentation_cols
-        escaped_timestamp_col = escape_identifier(timestamp_col)
 
         # build query components with segmentation columns
-        escaped_segmentation_cols = [
-            escape_identifier(col) for col in segmentation_cols
-        ]
         all_select_clause_cols = [
-            f"time_bucket(INTERVAL '5 minutes', {
+            f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
             f"count(*) as count",
-        ] +
-        all_group_by_cols = ["ts"] +
+        ] + segmentation_cols
+        all_group_by_cols = ["ts"] + segmentation_cols
 
         # build query
         count_query = f"""
@@ -106,10 +102,11 @@ class InferenceCountAggregationFunction(NumericAggregationFunction):
         """
 
         results = ddb_conn.sql(count_query).df()
+        unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
         series = self.group_query_results_to_numeric_metrics(
             results,
             "count",
-
+            unescaped_segmentation_cols,
             "ts",
         )
         metric = self.series_to_metric(self.METRIC_NAME, series)
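One detail worth spelling out (an inference from the code shape, not stated in the diff): the escaped identifiers go into the SQL text, but DuckDB returns result columns under their raw, unquoted names, so the DataFrame lookup behind group_query_results_to_numeric_metrics has to use the unescaped names. A quick demonstration:

    import duckdb

    conn = duckdb.connect()
    conn.sql('CREATE TABLE t AS SELECT 1 AS "deployment region"')
    df = conn.sql(
        'SELECT "deployment region", count(*) AS count FROM t GROUP BY "deployment region"'
    ).df()
    print(list(df.columns))  # ['deployment region', 'count'] -- the quotes are gone

This is why each aggregation now computes unescaped_segmentation_cols before grouping results into metrics.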
src/arthur_common/aggregations/functions/inference_count_by_class.py

@@ -20,7 +20,7 @@ from arthur_common.models.schema_definitions import (
     ScalarType,
     ScopeSchemaTag,
 )
-from arthur_common.tools.duckdb_data_loader import
+from arthur_common.tools.duckdb_data_loader import unescape_identifier
 
 
 class BinaryClassifierCountByClassAggregationFunction(NumericAggregationFunction):
@@ -100,31 +100,26 @@ class BinaryClassifierCountByClassAggregationFunction
     ) -> list[NumericMetric]:
         """Executed SQL with no segmentation columns:
         SELECT
-            time_bucket(INTERVAL '5 minutes', {
-            {
+            time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts,
+            {prediction_col} as prediction,
             COUNT(*) as count
         FROM {dataset.dataset_table_name}
         GROUP BY
             ts,
             -- group by raw column name instead of alias in select
             -- in case table has a column called 'prediction'
-            {
+            {prediction_col}
         ORDER BY ts
         """
         segmentation_cols = [] if not segmentation_cols else segmentation_cols
-        escaped_timestamp_col = escape_identifier(timestamp_col)
-        escaped_pred_col = escape_identifier(prediction_col)
 
         # build query components with segmentation columns
-        escaped_segmentation_cols = [
-            escape_identifier(col) for col in segmentation_cols
-        ]
         all_select_clause_cols = [
-            f"time_bucket(INTERVAL '5 minutes', {
-            f"{
+            f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
+            f"{prediction_col} as prediction",
             f"COUNT(*) as count",
-        ] +
-        all_group_by_cols = ["ts", f"{
+        ] + segmentation_cols
+        all_group_by_cols = ["ts", f"{prediction_col}"] + segmentation_cols
         extra_dims = ["prediction"]
 
         # build query
@@ -137,10 +132,11 @@ class BinaryClassifierCountByClassAggregationFunction
 
         result = ddb_conn.sql(query).df()
 
+        unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
         series = self.group_query_results_to_numeric_metrics(
             result,
             "count",
-
+            unescaped_segmentation_cols + extra_dims,
             "ts",
         )
         metric = self.series_to_metric(self._metric_name(), series)
@@ -248,34 +244,29 @@ class BinaryClassifierCountThresholdClassAggregationFunction(
     ) -> list[NumericMetric]:
         """Executed SQL with no segmentation columns:
         SELECT
-            time_bucket(INTERVAL '5 minutes', {
-            CASE WHEN {
+            time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts,
+            CASE WHEN {prediction_col} >= {threshold} THEN '{true_label}' ELSE '{false_label}' END as prediction,
             COUNT(*) as count
         FROM {dataset.dataset_table_name}
         GROUP BY
             ts,
             -- group by raw column name instead of alias in select
             -- in case table has a column called 'prediction'
-            {
+            {prediction_col}
         ORDER BY ts
         """
         segmentation_cols = [] if not segmentation_cols else segmentation_cols
-        escaped_timestamp_col = escape_identifier(timestamp_col)
-        escaped_prediction_col = escape_identifier(prediction_col)
 
         # build query components with segmentation columns
-        escaped_segmentation_cols = [
-            escape_identifier(col) for col in segmentation_cols
-        ]
         all_select_clause_cols = [
-            f"time_bucket(INTERVAL '5 minutes', {
-            f"CASE WHEN {
+            f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
+            f"CASE WHEN {prediction_col} >= {threshold} THEN '{true_label}' ELSE '{false_label}' END as prediction",
             f"COUNT(*) as count",
-        ] +
+        ] + segmentation_cols
         all_group_by_cols = [
             "ts",
-            f"{
-        ] +
+            f"{prediction_col}",
+        ] + segmentation_cols
         extra_dims = ["prediction"]
 
         query = f"""
@@ -287,10 +278,11 @@ class BinaryClassifierCountThresholdClassAggregationFunction(
 
         result = ddb_conn.sql(query).df()
 
+        unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
         series = self.group_query_results_to_numeric_metrics(
             result,
             "count",
-
+            unescaped_segmentation_cols + extra_dims,
             "ts",
         )
         metric = self.series_to_metric(self._metric_name(), series)
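The thresholded variant maps a numeric score onto caller-supplied labels inside SQL. A toy version of that CASE expression follows; the table, column names, and threshold are invented, and the query groups by the raw column rather than the alias, as the production query does, to avoid colliding with a real column named 'prediction':

    import duckdb

    conn = duckdb.connect()
    conn.sql("""
        CREATE TABLE scores AS SELECT * FROM (VALUES
            (TIMESTAMP '2024-01-01 00:00:00', 0.91),
            (TIMESTAMP '2024-01-01 00:01:00', 0.12),
            (TIMESTAMP '2024-01-01 00:03:00', 0.55)
        ) AS t(event_ts, score)
    """)
    threshold, true_label, false_label = 0.5, "positive", "negative"
    print(conn.sql(f"""
        SELECT time_bucket(INTERVAL '5 minutes', event_ts) as ts,
               CASE WHEN score >= {threshold} THEN '{true_label}' ELSE '{false_label}' END as prediction,
               COUNT(*) as count
        FROM scores
        GROUP BY ts, score  -- raw column, so distinct scores stay in separate rows
        ORDER BY ts
    """).df())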
src/arthur_common/aggregations/functions/inference_null_count.py

@@ -19,7 +19,7 @@ from arthur_common.models.schema_definitions import (
     ScalarType,
     ScopeSchemaTag,
 )
-from arthur_common.tools.duckdb_data_loader import
+from arthur_common.tools.duckdb_data_loader import unescape_identifier
 
 
 class InferenceNullCountAggregationFunction(NumericAggregationFunction):
@@ -90,44 +90,40 @@ class InferenceNullCountAggregationFunction(NumericAggregationFunction):
         ] = None,
     ) -> list[NumericMetric]:
         """Executed SQL with no segmentation columns:
-        select time_bucket(INTERVAL '5 minutes', {
+        select time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts, \
             count(*) as count \
-        from {dataset.dataset_table_name} where {
+        from {dataset.dataset_table_name} where {nullable_col} is null \
         group by ts \
         """
         segmentation_cols = [] if not segmentation_cols else segmentation_cols
-        escaped_timestamp_col = escape_identifier(timestamp_col)
-        escaped_nullable_col = escape_identifier(nullable_col)
 
         # build query components with segmentation columns
-        escaped_segmentation_cols = [
-            escape_identifier(col) for col in segmentation_cols
-        ]
         all_select_clause_cols = [
-            f"time_bucket(INTERVAL '5 minutes', {
+            f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
             f"count(*) as count",
-        ] +
-        all_group_by_cols = ["ts"] +
+        ] + segmentation_cols
+        all_group_by_cols = ["ts"] + segmentation_cols
 
         # build query
         count_query = f"""
             select {", ".join(all_select_clause_cols)}
             from {dataset.dataset_table_name}
-            where {
+            where {nullable_col} is null
             group by {", ".join(all_group_by_cols)}
         """
 
         results = ddb_conn.sql(count_query).df()
 
+        unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
         series = self.group_query_results_to_numeric_metrics(
             results,
             "count",
-
+            unescaped_segmentation_cols,
             "ts",
         )
         # preserve dimension that identifies the name of the nullable column used for the aggregation
         for point in series:
-            point.dimensions.append(Dimension(name="column_name", value=nullable_col))
+            point.dimensions.append(Dimension(name="column_name", value=unescape_identifier(nullable_col)))
 
         metric = self.series_to_metric(self.METRIC_NAME, series)
         return [metric]
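The null-count query is small enough to run standalone. A toy version with invented table and column names:

    import duckdb

    conn = duckdb.connect()
    conn.sql("""
        CREATE TABLE t AS SELECT * FROM (VALUES
            (TIMESTAMP '2024-01-01 00:00:00', NULL),
            (TIMESTAMP '2024-01-01 00:01:00', 'ok'),
            (TIMESTAMP '2024-01-01 00:02:00', NULL)
        ) AS v(event_ts, feature)
    """)
    # Count NULLs in the watched column per 5-minute bucket.
    print(conn.sql("""
        select time_bucket(INTERVAL '5 minutes', event_ts) as ts,
               count(*) as count
        from t
        where feature is null
        group by ts
    """).df())

Note the change at the end of the diff: the column_name dimension is now the unescaped column name, so downstream consumers see the raw name rather than a quoted identifier.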
src/arthur_common/aggregations/functions/mean_absolute_error.py

@@ -19,7 +19,7 @@ from arthur_common.models.schema_definitions import (
     ScalarType,
     ScopeSchemaTag,
 )
-from arthur_common.tools.duckdb_data_loader import
+from arthur_common.tools.duckdb_data_loader import unescape_identifier
 
 
 class MeanAbsoluteErrorAggregationFunction(NumericAggregationFunction):
@@ -111,50 +111,45 @@ class MeanAbsoluteErrorAggregationFunction(NumericAggregationFunction):
         ] = None,
     ) -> list[NumericMetric]:
         """Executed SQL with no segmentation columns:
-        SELECT time_bucket(INTERVAL '5 minutes', {
-            SUM(ABS({
+        SELECT time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts, \
+            SUM(ABS({prediction_col} - {ground_truth_col})) as ae, \
             COUNT(*) as count \
         FROM {dataset.dataset_table_name} \
-        WHERE {
-            AND {
+        WHERE {prediction_col} IS NOT NULL \
+            AND {ground_truth_col} IS NOT NULL \
         GROUP BY ts order by ts desc \
         """
         segmentation_cols = [] if not segmentation_cols else segmentation_cols
-        escaped_timestamp_col = escape_identifier(timestamp_col)
-        escaped_prediction_col = escape_identifier(prediction_col)
-        escaped_ground_truth_col = escape_identifier(ground_truth_col)
 
         # build query components with segmentation columns
-        escaped_segmentation_cols = [
-            escape_identifier(col) for col in segmentation_cols
-        ]
         all_select_clause_cols = [
-            f"time_bucket(INTERVAL '5 minutes', {
-            f"SUM(ABS({
+            f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
+            f"SUM(ABS({prediction_col} - {ground_truth_col})) as ae",
             f"COUNT(*) as count",
-        ] +
-        all_group_by_cols = ["ts"] +
+        ] + segmentation_cols
+        all_group_by_cols = ["ts"] + segmentation_cols
 
         # build query
         mae_query = f"""
             SELECT {", ".join(all_select_clause_cols)}
             FROM {dataset.dataset_table_name}
-            WHERE {
-            AND {
+            WHERE {prediction_col} IS NOT NULL
+            AND {ground_truth_col} IS NOT NULL
             GROUP BY {", ".join(all_group_by_cols)} order by ts desc
         """
 
         results = ddb_conn.sql(mae_query).df()
+        unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
         count_series = self.group_query_results_to_numeric_metrics(
             results,
             "count",
-
+            unescaped_segmentation_cols,
             "ts",
         )
         absolute_error_series = self.group_query_results_to_numeric_metrics(
             results,
             "ae",
-
+            unescaped_segmentation_cols,
             "ts",
         )
 
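The query emits the per-bucket absolute-error sum and row count rather than the mean itself, so MAE for a bucket is ae / count. A toy run with invented names:

    import duckdb

    conn = duckdb.connect()
    conn.sql("""
        CREATE TABLE reg AS SELECT * FROM (VALUES
            (TIMESTAMP '2024-01-01 00:00:00', 2.0, 1.0),
            (TIMESTAMP '2024-01-01 00:01:00', 3.0, 5.0),
            (TIMESTAMP '2024-01-01 00:06:00', 1.0, 1.5)
        ) AS t(event_ts, pred, actual)
    """)
    df = conn.sql("""
        SELECT time_bucket(INTERVAL '5 minutes', event_ts) as ts,
               SUM(ABS(pred - actual)) as ae,
               COUNT(*) as count
        FROM reg
        WHERE pred IS NOT NULL AND actual IS NOT NULL
        GROUP BY ts ORDER BY ts DESC
    """).df()
    df["mae"] = df["ae"] / df["count"]  # derive the mean downstream
    print(df)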
src/arthur_common/aggregations/functions/mean_squared_error.py

@@ -19,7 +19,7 @@ from arthur_common.models.schema_definitions import (
     ScalarType,
     ScopeSchemaTag,
 )
-from arthur_common.tools.duckdb_data_loader import
+from arthur_common.tools.duckdb_data_loader import unescape_identifier
 
 
 class MeanSquaredErrorAggregationFunction(NumericAggregationFunction):
@@ -111,50 +111,45 @@ class MeanSquaredErrorAggregationFunction(NumericAggregationFunction):
         ] = None,
     ) -> list[NumericMetric]:
         """Executed SQL with no segmentation columns:
-        SELECT time_bucket(INTERVAL '5 minutes', {
-            SUM(POW({
+        SELECT time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts, \
+            SUM(POW({prediction_col} - {ground_truth_col}, 2)) as squared_error, \
             COUNT(*) as count \
         FROM {dataset.dataset_table_name} \
-        WHERE {
-            AND {
+        WHERE {prediction_col} IS NOT NULL \
+            AND {ground_truth_col} IS NOT NULL \
         GROUP BY ts order by ts desc \
         """
         segmentation_cols = [] if not segmentation_cols else segmentation_cols
-        escaped_timestamp_col = escape_identifier(timestamp_col)
-        escaped_prediction_col = escape_identifier(prediction_col)
-        escaped_ground_truth_col = escape_identifier(ground_truth_col)
 
         # build query components with segmentation columns
-        escaped_segmentation_cols = [
-            escape_identifier(col) for col in segmentation_cols
-        ]
         all_select_clause_cols = [
-            f"time_bucket(INTERVAL '5 minutes', {
-            f"SUM(POW({
+            f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
+            f"SUM(POW({prediction_col} - {ground_truth_col}, 2)) as squared_error",
             f"COUNT(*) as count",
-        ] +
-        all_group_by_cols = ["ts"] +
+        ] + segmentation_cols
+        all_group_by_cols = ["ts"] + segmentation_cols
 
         # build query
         mse_query = f"""
             SELECT {", ".join(all_select_clause_cols)}
             FROM {dataset.dataset_table_name}
-            WHERE {
-            AND {
+            WHERE {prediction_col} IS NOT NULL
+            AND {ground_truth_col} IS NOT NULL
             GROUP BY {", ".join(all_group_by_cols)} order by ts desc
         """
 
         results = ddb_conn.sql(mse_query).df()
+        unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
         count_series = self.group_query_results_to_numeric_metrics(
             results,
             "count",
-
+            unescaped_segmentation_cols,
             "ts",
        )
         squared_error_series = self.group_query_results_to_numeric_metrics(
             results,
             "squared_error",
-
+            unescaped_segmentation_cols,
             "ts",
         )
 
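A likely reason both MAE and MSE emit sums and counts as separate series (an inference from the query shape, not a statement of the library's internals): per-bucket sums re-aggregate exactly, so the mean over any coarser window is simply total error divided by total count, with no loss from averaging averages.

    # Re-aggregating two 5-minute buckets into one window, with invented numbers.
    import pandas as pd

    buckets = pd.DataFrame({"squared_error": [4.0, 1.0], "count": [2, 3]})
    mse_overall = buckets["squared_error"].sum() / buckets["count"].sum()
    print(mse_overall)  # 1.0, exact; averaging the per-bucket MSEs would give 1.1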