arthur-common 2.1.68__tar.gz → 2.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49)
  1. {arthur_common-2.1.68 → arthur_common-2.3.0}/PKG-INFO +1 -1
  2. {arthur_common-2.1.68 → arthur_common-2.3.0}/pyproject.toml +1 -1
  3. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/functions/categorical_count.py +13 -16
  4. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/functions/confusion_matrix.py +24 -29
  5. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/functions/inference_count.py +7 -10
  6. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/functions/inference_count_by_class.py +20 -28
  7. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/functions/inference_null_count.py +10 -14
  8. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/functions/mean_absolute_error.py +14 -19
  9. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/functions/mean_squared_error.py +14 -19
  10. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/functions/multiclass_confusion_matrix.py +18 -23
  11. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/functions/numeric_stats.py +13 -16
  12. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/functions/numeric_sum.py +12 -16
  13. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/models/enums.py +11 -3
  14. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/models/request_schemas.py +263 -8
  15. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/tools/duckdb_data_loader.py +30 -0
  16. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/tools/duckdb_utils.py +3 -5
  17. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/tools/schema_inferer.py +1 -1
  18. {arthur_common-2.1.68 → arthur_common-2.3.0}/README.md +0 -0
  19. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/__init__.py +0 -0
  20. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/__init__.py +0 -0
  21. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/aggregator.py +0 -0
  22. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/functions/README.md +0 -0
  23. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/functions/__init__.py +0 -0
  24. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/functions/agentic_aggregations.py +0 -0
  25. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/functions/multiclass_inference_count_by_class.py +0 -0
  26. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/functions/py.typed +0 -0
  27. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/functions/shield_aggregations.py +0 -0
  28. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/py.typed +0 -0
  29. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/config/__init__.py +0 -0
  30. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/config/config.py +0 -0
  31. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/config/settings.yaml +0 -0
  32. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/models/__init__.py +0 -0
  33. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/models/common_schemas.py +0 -0
  34. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/models/connectors.py +0 -0
  35. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/models/constants.py +0 -0
  36. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/models/datasets.py +0 -0
  37. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/models/metric_schemas.py +0 -0
  38. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/models/metrics.py +0 -0
  39. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/models/py.typed +0 -0
  40. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/models/response_schemas.py +0 -0
  41. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/models/schema_definitions.py +0 -0
  42. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/models/task_job_specs.py +0 -0
  43. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/py.typed +0 -0
  44. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/tools/__init__.py +0 -0
  45. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/tools/aggregation_analyzer.py +0 -0
  46. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/tools/aggregation_loader.py +0 -0
  47. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/tools/functions.py +0 -0
  48. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/tools/py.typed +0 -0
  49. {arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/tools/time_utils.py +0 -0
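
The recurring change across the aggregation modules below: the functions no longer call escape_identifier on their column parameters (callers are now expected to pass identifiers that are already escaped for DuckDB), and a new unescape_identifier helper from duckdb_data_loader is used to recover the raw column name wherever it is emitted as a metric dimension or string literal. The helpers themselves live in src/arthur_common/tools/duckdb_data_loader.py (+30 -0), whose hunk is not included in this diff, so the following is only a minimal sketch of their assumed behavior under standard DuckDB quoting rules, not the package's actual implementation.

def escape_identifier(name: str) -> str:
    # Assumed behavior: quote a raw column name as a DuckDB identifier by
    # wrapping it in double quotes and doubling embedded double quotes.
    return '"' + name.replace('"', '""') + '"'


def unescape_identifier(identifier: str) -> str:
    # Assumed behavior: reverse escape_identifier so the raw column name can
    # be used as a human-readable dimension value (e.g. column_name).
    if identifier.startswith('"') and identifier.endswith('"'):
        return identifier[1:-1].replace('""', '"')
    return identifier


def escape_str_literal(value: str) -> str:
    # Assumed behavior: quote a value as a SQL string literal by wrapping it
    # in single quotes and doubling embedded single quotes.
    return "'" + value.replace("'", "''") + "'"

Under that assumption, escape_str_literal(unescape_identifier(categorical_col)) in categorical_count.py would turn an already-quoted identifier such as "my col" back into the plain name my col and re-quote it as the string literal 'my col' for the column_name output.
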
{arthur_common-2.1.68 → arthur_common-2.3.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: arthur-common
-Version: 2.1.68
+Version: 2.3.0
 Summary: Utility code common to Arthur platform components.
 License: MIT
 Author: Arthur

{arthur_common-2.1.68 → arthur_common-2.3.0}/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "arthur-common"
-version = "2.1.68"
+version = "2.3.0"
 description = "Utility code common to Arthur platform components."
 authors = ["Arthur <engineering@arthur.ai>"]
 license = "MIT"

{arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/functions/categorical_count.py
@@ -18,7 +18,8 @@ from arthur_common.models.schema_definitions import (
     ScalarType,
     ScopeSchemaTag,
 )
-from arthur_common.tools.duckdb_data_loader import escape_identifier, escape_str_literal
+
+from arthur_common.tools.duckdb_data_loader import unescape_identifier, escape_str_literal
 
 
 class CategoricalCountAggregationFunction(NumericAggregationFunction):
@@ -93,30 +94,25 @@ class CategoricalCountAggregationFunction(NumericAggregationFunction):
         ] = None,
     ) -> list[NumericMetric]:
         """Executed SQL with no segmentation columns:
-        select time_bucket(INTERVAL '5 minutes', {timestamp_col_escaped}) as ts, \
+        select time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts, \
             count(*) as count, \
-            {categorical_col_escaped} as category, \
-            {categorical_col_name_escaped} as column_name \
+            {categorical_col} as category, \
+            {categorical_col_name_unescaped} as column_name \
         from {dataset.dataset_table_name} \
         where ts is not null \
         group by ts, category
         """
         segmentation_cols = [] if not segmentation_cols else segmentation_cols
-        timestamp_col_escaped = escape_identifier(timestamp_col)
-        categorical_col_escaped = escape_identifier(categorical_col)
-        categorical_col_name_escaped = escape_str_literal(categorical_col)
+        categorical_col_name_unescaped = escape_str_literal(unescape_identifier(categorical_col))
 
         # build query components with segmentation columns
-        escaped_segmentation_cols = [
-            escape_identifier(col) for col in segmentation_cols
-        ]
         all_select_clause_cols = [
-            f"time_bucket(INTERVAL '5 minutes', {timestamp_col_escaped}) as ts",
+            f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
             f"count(*) as count",
-            f"{categorical_col_escaped} as category",
-            f"{categorical_col_name_escaped} as column_name",
-        ] + escaped_segmentation_cols
-        all_group_by_cols = ["ts", "category"] + escaped_segmentation_cols
+            f"{categorical_col} as category",
+            f"{categorical_col_name_unescaped} as column_name",
+        ] + segmentation_cols
+        all_group_by_cols = ["ts", "category"] + segmentation_cols
         extra_dims = ["column_name", "category"]
 
         # build query
@@ -129,10 +125,11 @@ class CategoricalCountAggregationFunction(NumericAggregationFunction):
 
         results = ddb_conn.sql(count_query).df()
 
+        unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
         series = self.group_query_results_to_numeric_metrics(
             results,
             "count",
-            segmentation_cols + extra_dims,
+            unescaped_segmentation_cols + extra_dims,
             timestamp_col="ts",
         )
         metric = self.series_to_metric(self.METRIC_NAME, series)

{arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/functions/confusion_matrix.py
@@ -20,7 +20,8 @@ from arthur_common.models.schema_definitions import (
     ScalarType,
     ScopeSchemaTag,
 )
-from arthur_common.tools.duckdb_data_loader import escape_identifier, escape_str_literal
+
+from arthur_common.tools.duckdb_data_loader import unescape_identifier, escape_str_literal
 
 
 class ConfusionMatrixAggregationFunction(NumericAggregationFunction):
@@ -78,11 +79,11 @@ class ConfusionMatrixAggregationFunction(NumericAggregationFunction):
         Without segmentation, this is the query:
         WITH normalized_data AS (
             SELECT
-                {escaped_timestamp_col} AS timestamp,
-                {prediction_normalization_case.replace('value', escaped_prediction_col)} AS prediction,
-                {gt_normalization_case.replace('value', escaped_gt_values_col)} AS actual_value
+                {timestamp_col} AS timestamp,
+                {prediction_normalization_case.replace('value', prediction_col)} AS prediction,
+                {gt_normalization_case.replace('value', gt_values_col)} AS actual_value
             FROM {dataset.dataset_table_name}
-            WHERE {escaped_timestamp_col} IS NOT NULL
+            WHERE {timestamp_col} IS NOT NULL
         )
         SELECT
             time_bucket(INTERVAL '5 minutes', timestamp) AS ts,
@@ -90,34 +91,29 @@ class ConfusionMatrixAggregationFunction(NumericAggregationFunction):
             SUM(CASE WHEN prediction != actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS false_positive_count,
             SUM(CASE WHEN prediction != actual_value AND actual_value = 1 THEN 1 ELSE 0 END) AS false_negative_count,
             SUM(CASE WHEN prediction = actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS true_negative_count,
-            {escaped_prediction_col_name} as prediction_column_name
+            {unescaped_prediction_col_name} as prediction_column_name
         FROM normalized_data
         GROUP BY ts
         ORDER BY ts
         """
         segmentation_cols = [] if not segmentation_cols else segmentation_cols
-        escaped_timestamp_col = escape_identifier(timestamp_col)
-        escaped_prediction_col = escape_identifier(prediction_col)
-        escaped_prediction_col_name = escape_str_literal(prediction_col)
-        escaped_gt_values_col = escape_identifier(gt_values_col)
+        unescaped_prediction_col_name = escape_str_literal(unescape_identifier(prediction_col))
+
         # build query components with segmentation columns
-        escaped_segmentation_cols = [
-            escape_identifier(col) for col in segmentation_cols
-        ]
         first_subquery_select_cols = [
-            f"{escaped_timestamp_col} AS timestamp",
-            f"{prediction_normalization_case.replace('value', escaped_prediction_col)} AS prediction",
-            f"{gt_normalization_case.replace('value', escaped_gt_values_col)} AS actual_value",
-        ] + escaped_segmentation_cols
+            f"{timestamp_col} AS timestamp",
+            f"{prediction_normalization_case.replace('value', prediction_col)} AS prediction",
+            f"{gt_normalization_case.replace('value', gt_values_col)} AS actual_value",
+        ] + segmentation_cols
         second_subquery_select_cols = [
             "time_bucket(INTERVAL '5 minutes', timestamp) AS ts",
             "SUM(CASE WHEN prediction = actual_value AND actual_value = 1 THEN 1 ELSE 0 END) AS true_positive_count",
             "SUM(CASE WHEN prediction != actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS false_positive_count",
             "SUM(CASE WHEN prediction != actual_value AND actual_value = 1 THEN 1 ELSE 0 END) AS false_negative_count",
             "SUM(CASE WHEN prediction = actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS true_negative_count",
-            f"{escaped_prediction_col_name} as prediction_column_name",
-        ] + escaped_segmentation_cols
-        second_subquery_group_by_cols = ["ts"] + escaped_segmentation_cols
+            f"{unescaped_prediction_col_name} as prediction_column_name",
+        ] + segmentation_cols
+        second_subquery_group_by_cols = ["ts"] + segmentation_cols
         extra_dims = ["prediction_column_name"]
 
         # build query
@@ -125,7 +121,7 @@ class ConfusionMatrixAggregationFunction(NumericAggregationFunction):
         WITH normalized_data AS (
             SELECT {", ".join(first_subquery_select_cols)}
             FROM {dataset.dataset_table_name}
-            WHERE {escaped_timestamp_col} IS NOT NULL
+            WHERE {timestamp_col} IS NOT NULL
         )
         SELECT {", ".join(second_subquery_select_cols)}
         FROM normalized_data
@@ -135,28 +131,29 @@ class ConfusionMatrixAggregationFunction(NumericAggregationFunction):
 
         results = ddb_conn.sql(confusion_matrix_query).df()
 
+        unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
         tp = self.group_query_results_to_numeric_metrics(
             results,
             "true_positive_count",
-            dim_columns=segmentation_cols + extra_dims,
+            dim_columns=unescaped_segmentation_cols + extra_dims,
             timestamp_col="ts",
         )
         fp = self.group_query_results_to_numeric_metrics(
             results,
             "false_positive_count",
-            dim_columns=segmentation_cols + extra_dims,
+            dim_columns=unescaped_segmentation_cols + extra_dims,
             timestamp_col="ts",
         )
         fn = self.group_query_results_to_numeric_metrics(
             results,
             "false_negative_count",
-            dim_columns=segmentation_cols + extra_dims,
+            dim_columns=unescaped_segmentation_cols + extra_dims,
             timestamp_col="ts",
         )
         tn = self.group_query_results_to_numeric_metrics(
             results,
             "true_negative_count",
-            dim_columns=segmentation_cols + extra_dims,
+            dim_columns=unescaped_segmentation_cols + extra_dims,
             timestamp_col="ts",
         )
         tp_metric = self.series_to_metric(self.TRUE_POSITIVE_METRIC_NAME, tp)
@@ -243,9 +240,8 @@ class BinaryClassifierIntBoolConfusionMatrixAggregationFunction(
         ] = None,
     ) -> list[NumericMetric]:
         segmentation_cols = [] if not segmentation_cols else segmentation_cols
-        escaped_prediction_col = escape_identifier(prediction_col)
         # Get the type of prediction column
-        type_query = f"SELECT typeof({escaped_prediction_col}) as col_type FROM {dataset.dataset_table_name} LIMIT 1"
+        type_query = f"SELECT typeof({prediction_col}) as col_type FROM {dataset.dataset_table_name} LIMIT 1"
         res = ddb_conn.sql(type_query).fetchone()
         # As long as this column exists, we should be able to get the type. This is here to make mypy happy.
         if not res:
@@ -476,7 +472,6 @@ class BinaryClassifierProbabilityThresholdConfusionMatrixAggregationFunction(
             ),
         ] = None,
     ) -> list[NumericMetric]:
-        escaped_gt_values_col = escape_identifier(gt_values_col)
         prediction_normalization_case = f"""
             CASE
                 WHEN value >= {threshold} THEN 1
@@ -485,7 +480,7 @@
             END
         """
 
-        type_query = f"SELECT typeof({escaped_gt_values_col}) as col_type FROM {dataset.dataset_table_name} LIMIT 1"
+        type_query = f"SELECT typeof({gt_values_col}) as col_type FROM {dataset.dataset_table_name} LIMIT 1"
         res = ddb_conn.sql(type_query).fetchone()
         # As long as this column exists, we should be able to get the type. This is here to make mypy happy.
         if not res:

{arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/functions/inference_count.py
@@ -18,7 +18,7 @@ from arthur_common.models.schema_definitions import (
     ScalarType,
     ScopeSchemaTag,
 )
-from arthur_common.tools.duckdb_data_loader import escape_identifier
+from arthur_common.tools.duckdb_data_loader import unescape_identifier
 
 
 class InferenceCountAggregationFunction(NumericAggregationFunction):
@@ -80,23 +80,19 @@ class InferenceCountAggregationFunction(NumericAggregationFunction):
         ] = None,
     ) -> list[NumericMetric]:
         """Executed SQL with no segmentation columns:
-        select time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts, \
+        select time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts, \
             count(*) as count \
         from {dataset.dataset_table_name} \
         group by ts \
         """
         segmentation_cols = [] if not segmentation_cols else segmentation_cols
-        escaped_timestamp_col = escape_identifier(timestamp_col)
 
         # build query components with segmentation columns
-        escaped_segmentation_cols = [
-            escape_identifier(col) for col in segmentation_cols
-        ]
         all_select_clause_cols = [
-            f"time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts",
+            f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
             f"count(*) as count",
-        ] + escaped_segmentation_cols
-        all_group_by_cols = ["ts"] + escaped_segmentation_cols
+        ] + segmentation_cols
+        all_group_by_cols = ["ts"] + segmentation_cols
 
         # build query
         count_query = f"""
@@ -106,10 +102,11 @@ class InferenceCountAggregationFunction(NumericAggregationFunction):
         """
 
         results = ddb_conn.sql(count_query).df()
+        unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
         series = self.group_query_results_to_numeric_metrics(
             results,
             "count",
-            segmentation_cols,
+            unescaped_segmentation_cols,
             "ts",
         )
         metric = self.series_to_metric(self.METRIC_NAME, series)

{arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/functions/inference_count_by_class.py
@@ -20,7 +20,7 @@ from arthur_common.models.schema_definitions import (
     ScalarType,
     ScopeSchemaTag,
 )
-from arthur_common.tools.duckdb_data_loader import escape_identifier
+from arthur_common.tools.duckdb_data_loader import unescape_identifier
 
 
 class BinaryClassifierCountByClassAggregationFunction(NumericAggregationFunction):
@@ -100,31 +100,26 @@ class BinaryClassifierCountByClassAggregationFunction(NumericAggregationFunction
     ) -> list[NumericMetric]:
         """Executed SQL with no segmentation columns:
         SELECT
-            time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts,
-            {escaped_pred_col} as prediction,
+            time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts,
+            {prediction_col} as prediction,
             COUNT(*) as count
         FROM {dataset.dataset_table_name}
         GROUP BY
             ts,
             -- group by raw column name instead of alias in select
             -- in case table has a column called 'prediction'
-            {escaped_pred_col}
+            {prediction_col}
         ORDER BY ts
         """
         segmentation_cols = [] if not segmentation_cols else segmentation_cols
-        escaped_timestamp_col = escape_identifier(timestamp_col)
-        escaped_pred_col = escape_identifier(prediction_col)
 
         # build query components with segmentation columns
-        escaped_segmentation_cols = [
-            escape_identifier(col) for col in segmentation_cols
-        ]
         all_select_clause_cols = [
-            f"time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts",
-            f"{escaped_pred_col} as prediction",
+            f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
+            f"{prediction_col} as prediction",
             f"COUNT(*) as count",
-        ] + escaped_segmentation_cols
-        all_group_by_cols = ["ts", f"{escaped_pred_col}"] + escaped_segmentation_cols
+        ] + segmentation_cols
+        all_group_by_cols = ["ts", f"{prediction_col}"] + segmentation_cols
         extra_dims = ["prediction"]
 
         # build query
@@ -137,10 +132,11 @@ class BinaryClassifierCountByClassAggregationFunction(NumericAggregationFunction
 
         result = ddb_conn.sql(query).df()
 
+        unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
         series = self.group_query_results_to_numeric_metrics(
             result,
             "count",
-            segmentation_cols + extra_dims,
+            unescaped_segmentation_cols + extra_dims,
             "ts",
         )
         metric = self.series_to_metric(self._metric_name(), series)
@@ -248,34 +244,29 @@ class BinaryClassifierCountThresholdClassAggregationFunction(
     ) -> list[NumericMetric]:
         """Executed SQL with no segmentation columns:
         SELECT
-            time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts,
-            CASE WHEN {escaped_prediction_col} >= {threshold} THEN '{true_label}' ELSE '{false_label}' END as prediction,
+            time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts,
+            CASE WHEN {prediction_col} >= {threshold} THEN '{true_label}' ELSE '{false_label}' END as prediction,
             COUNT(*) as count
         FROM {dataset.dataset_table_name}
         GROUP BY
             ts,
             -- group by raw column name instead of alias in select
            -- in case table has a column called 'prediction'
-            {escaped_prediction_col}
+            {prediction_col}
         ORDER BY ts
         """
         segmentation_cols = [] if not segmentation_cols else segmentation_cols
-        escaped_timestamp_col = escape_identifier(timestamp_col)
-        escaped_prediction_col = escape_identifier(prediction_col)
 
         # build query components with segmentation columns
-        escaped_segmentation_cols = [
-            escape_identifier(col) for col in segmentation_cols
-        ]
         all_select_clause_cols = [
-            f"time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts",
-            f"CASE WHEN {escaped_prediction_col} >= {threshold} THEN '{true_label}' ELSE '{false_label}' END as prediction",
+            f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
+            f"CASE WHEN {prediction_col} >= {threshold} THEN '{true_label}' ELSE '{false_label}' END as prediction",
             f"COUNT(*) as count",
-        ] + escaped_segmentation_cols
+        ] + segmentation_cols
         all_group_by_cols = [
             "ts",
-            f"{escaped_prediction_col}",
-        ] + escaped_segmentation_cols
+            f"{prediction_col}",
+        ] + segmentation_cols
         extra_dims = ["prediction"]
 
         query = f"""
@@ -287,10 +278,11 @@ class BinaryClassifierCountThresholdClassAggregationFunction(
 
         result = ddb_conn.sql(query).df()
 
+        unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
         series = self.group_query_results_to_numeric_metrics(
             result,
             "count",
-            segmentation_cols + extra_dims,
+            unescaped_segmentation_cols + extra_dims,
             "ts",
         )
         metric = self.series_to_metric(self._metric_name(), series)

{arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/functions/inference_null_count.py
@@ -19,7 +19,7 @@ from arthur_common.models.schema_definitions import (
     ScalarType,
     ScopeSchemaTag,
 )
-from arthur_common.tools.duckdb_data_loader import escape_identifier
+from arthur_common.tools.duckdb_data_loader import unescape_identifier
 
 
 class InferenceNullCountAggregationFunction(NumericAggregationFunction):
@@ -90,44 +90,40 @@ class InferenceNullCountAggregationFunction(NumericAggregationFunction):
         ] = None,
     ) -> list[NumericMetric]:
         """Executed SQL with no segmentation columns:
-        select time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts, \
+        select time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts, \
             count(*) as count \
-        from {dataset.dataset_table_name} where {escaped_nullable_col} is null \
+        from {dataset.dataset_table_name} where {nullable_col} is null \
         group by ts \
         """
         segmentation_cols = [] if not segmentation_cols else segmentation_cols
-        escaped_timestamp_col = escape_identifier(timestamp_col)
-        escaped_nullable_col = escape_identifier(nullable_col)
 
         # build query components with segmentation columns
-        escaped_segmentation_cols = [
-            escape_identifier(col) for col in segmentation_cols
-        ]
        all_select_clause_cols = [
-            f"time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts",
+            f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
             f"count(*) as count",
-        ] + escaped_segmentation_cols
-        all_group_by_cols = ["ts"] + escaped_segmentation_cols
+        ] + segmentation_cols
+        all_group_by_cols = ["ts"] + segmentation_cols
 
         # build query
         count_query = f"""
             select {", ".join(all_select_clause_cols)}
             from {dataset.dataset_table_name}
-            where {escaped_nullable_col} is null
+            where {nullable_col} is null
             group by {", ".join(all_group_by_cols)}
         """
 
         results = ddb_conn.sql(count_query).df()
 
+        unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
         series = self.group_query_results_to_numeric_metrics(
             results,
             "count",
-            segmentation_cols,
+            unescaped_segmentation_cols,
             "ts",
         )
         # preserve dimension that identifies the name of the nullable column used for the aggregation
         for point in series:
-            point.dimensions.append(Dimension(name="column_name", value=nullable_col))
+            point.dimensions.append(Dimension(name="column_name", value=unescape_identifier(nullable_col)))
 
         metric = self.series_to_metric(self.METRIC_NAME, series)
         return [metric]

{arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/functions/mean_absolute_error.py
@@ -19,7 +19,7 @@ from arthur_common.models.schema_definitions import (
     ScalarType,
     ScopeSchemaTag,
 )
-from arthur_common.tools.duckdb_data_loader import escape_identifier
+from arthur_common.tools.duckdb_data_loader import unescape_identifier
 
 
 class MeanAbsoluteErrorAggregationFunction(NumericAggregationFunction):
@@ -111,50 +111,45 @@ class MeanAbsoluteErrorAggregationFunction(NumericAggregationFunction):
         ] = None,
     ) -> list[NumericMetric]:
         """Executed SQL with no segmentation columns:
-        SELECT time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts, \
-            SUM(ABS({escaped_prediction_col} - {escaped_ground_truth_col})) as ae, \
+        SELECT time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts, \
+            SUM(ABS({prediction_col} - {ground_truth_col})) as ae, \
             COUNT(*) as count \
         FROM {dataset.dataset_table_name} \
-        WHERE {escaped_prediction_col} IS NOT NULL \
-            AND {escaped_ground_truth_col} IS NOT NULL \
+        WHERE {prediction_col} IS NOT NULL \
+            AND {ground_truth_col} IS NOT NULL \
         GROUP BY ts order by ts desc \
         """
         segmentation_cols = [] if not segmentation_cols else segmentation_cols
-        escaped_timestamp_col = escape_identifier(timestamp_col)
-        escaped_prediction_col = escape_identifier(prediction_col)
-        escaped_ground_truth_col = escape_identifier(ground_truth_col)
 
         # build query components with segmentation columns
-        escaped_segmentation_cols = [
-            escape_identifier(col) for col in segmentation_cols
-        ]
         all_select_clause_cols = [
-            f"time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts",
-            f"SUM(ABS({escaped_prediction_col} - {escaped_ground_truth_col})) as ae",
+            f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
+            f"SUM(ABS({prediction_col} - {ground_truth_col})) as ae",
             f"COUNT(*) as count",
-        ] + escaped_segmentation_cols
-        all_group_by_cols = ["ts"] + escaped_segmentation_cols
+        ] + segmentation_cols
+        all_group_by_cols = ["ts"] + segmentation_cols
 
         # build query
         mae_query = f"""
             SELECT {", ".join(all_select_clause_cols)}
             FROM {dataset.dataset_table_name}
-            WHERE {escaped_prediction_col} IS NOT NULL
-            AND {escaped_ground_truth_col} IS NOT NULL
+            WHERE {prediction_col} IS NOT NULL
+            AND {ground_truth_col} IS NOT NULL
             GROUP BY {", ".join(all_group_by_cols)} order by ts desc
         """
 
         results = ddb_conn.sql(mae_query).df()
+        unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
         count_series = self.group_query_results_to_numeric_metrics(
             results,
             "count",
-            segmentation_cols,
+            unescaped_segmentation_cols,
             "ts",
         )
         absolute_error_series = self.group_query_results_to_numeric_metrics(
             results,
             "ae",
-            segmentation_cols,
+            unescaped_segmentation_cols,
             "ts",
         )
 

{arthur_common-2.1.68 → arthur_common-2.3.0}/src/arthur_common/aggregations/functions/mean_squared_error.py
@@ -19,7 +19,7 @@ from arthur_common.models.schema_definitions import (
     ScalarType,
     ScopeSchemaTag,
 )
-from arthur_common.tools.duckdb_data_loader import escape_identifier
+from arthur_common.tools.duckdb_data_loader import unescape_identifier
 
 
 class MeanSquaredErrorAggregationFunction(NumericAggregationFunction):
@@ -111,50 +111,45 @@ class MeanSquaredErrorAggregationFunction(NumericAggregationFunction):
         ] = None,
     ) -> list[NumericMetric]:
         """Executed SQL with no segmentation columns:
-        SELECT time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts, \
-            SUM(POW({escaped_prediction_col} - {escaped_ground_truth_col}, 2)) as squared_error, \
+        SELECT time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts, \
+            SUM(POW({prediction_col} - {ground_truth_col}, 2)) as squared_error, \
             COUNT(*) as count \
         FROM {dataset.dataset_table_name} \
-        WHERE {escaped_prediction_col} IS NOT NULL \
-            AND {escaped_ground_truth_col} IS NOT NULL \
+        WHERE {prediction_col} IS NOT NULL \
+            AND {ground_truth_col} IS NOT NULL \
         GROUP BY ts order by ts desc \
         """
         segmentation_cols = [] if not segmentation_cols else segmentation_cols
-        escaped_timestamp_col = escape_identifier(timestamp_col)
-        escaped_prediction_col = escape_identifier(prediction_col)
-        escaped_ground_truth_col = escape_identifier(ground_truth_col)
 
         # build query components with segmentation columns
-        escaped_segmentation_cols = [
-            escape_identifier(col) for col in segmentation_cols
-        ]
         all_select_clause_cols = [
-            f"time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts",
-            f"SUM(POW({escaped_prediction_col} - {escaped_ground_truth_col}, 2)) as squared_error",
+            f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
+            f"SUM(POW({prediction_col} - {ground_truth_col}, 2)) as squared_error",
             f"COUNT(*) as count",
-        ] + escaped_segmentation_cols
-        all_group_by_cols = ["ts"] + escaped_segmentation_cols
+        ] + segmentation_cols
+        all_group_by_cols = ["ts"] + segmentation_cols
 
         # build query
         mse_query = f"""
             SELECT {", ".join(all_select_clause_cols)}
             FROM {dataset.dataset_table_name}
-            WHERE {escaped_prediction_col} IS NOT NULL
-            AND {escaped_ground_truth_col} IS NOT NULL
+            WHERE {prediction_col} IS NOT NULL
+            AND {ground_truth_col} IS NOT NULL
             GROUP BY {", ".join(all_group_by_cols)} order by ts desc
         """
 
         results = ddb_conn.sql(mse_query).df()
+        unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
         count_series = self.group_query_results_to_numeric_metrics(
             results,
             "count",
-            segmentation_cols,
+            unescaped_segmentation_cols,
             "ts",
         )
         squared_error_series = self.group_query_results_to_numeric_metrics(
             results,
             "squared_error",
-            segmentation_cols,
+            unescaped_segmentation_cols,
             "ts",
         )
 