arthur-common 2.2.0__tar.gz → 2.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of arthur-common might be problematic.

Files changed (49)
  1. {arthur_common-2.2.0 → arthur_common-2.4.0}/PKG-INFO +1 -1
  2. {arthur_common-2.2.0 → arthur_common-2.4.0}/pyproject.toml +1 -1
  3. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/categorical_count.py +13 -16
  4. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/confusion_matrix.py +24 -29
  5. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/inference_count.py +7 -10
  6. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/inference_count_by_class.py +20 -28
  7. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/inference_null_count.py +10 -14
  8. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/mean_absolute_error.py +14 -19
  9. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/mean_squared_error.py +14 -19
  10. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/multiclass_confusion_matrix.py +18 -23
  11. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/numeric_stats.py +13 -16
  12. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/numeric_sum.py +12 -16
  13. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/models/metrics.py +0 -21
  14. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/tools/duckdb_data_loader.py +30 -0
  15. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/tools/duckdb_utils.py +3 -5
  16. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/tools/schema_inferer.py +1 -1
  17. {arthur_common-2.2.0 → arthur_common-2.4.0}/README.md +0 -0
  18. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/__init__.py +0 -0
  19. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/__init__.py +0 -0
  20. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/aggregator.py +0 -0
  21. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/README.md +0 -0
  22. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/__init__.py +0 -0
  23. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/agentic_aggregations.py +0 -0
  24. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/multiclass_inference_count_by_class.py +0 -0
  25. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/py.typed +0 -0
  26. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/shield_aggregations.py +0 -0
  27. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/py.typed +0 -0
  28. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/config/__init__.py +0 -0
  29. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/config/config.py +0 -0
  30. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/config/settings.yaml +0 -0
  31. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/models/__init__.py +0 -0
  32. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/models/common_schemas.py +0 -0
  33. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/models/connectors.py +0 -0
  34. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/models/constants.py +0 -0
  35. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/models/datasets.py +0 -0
  36. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/models/enums.py +0 -0
  37. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/models/metric_schemas.py +0 -0
  38. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/models/py.typed +0 -0
  39. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/models/request_schemas.py +0 -0
  40. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/models/response_schemas.py +0 -0
  41. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/models/schema_definitions.py +0 -0
  42. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/models/task_job_specs.py +0 -0
  43. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/py.typed +0 -0
  44. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/tools/__init__.py +0 -0
  45. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/tools/aggregation_analyzer.py +0 -0
  46. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/tools/aggregation_loader.py +0 -0
  47. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/tools/functions.py +0 -0
  48. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/tools/py.typed +0 -0
  49. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/tools/time_utils.py +0 -0

{arthur_common-2.2.0 → arthur_common-2.4.0}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: arthur-common
- Version: 2.2.0
+ Version: 2.4.0
  Summary: Utility code common to Arthur platform components.
  License: MIT
  Author: Arthur

{arthur_common-2.2.0 → arthur_common-2.4.0}/pyproject.toml
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "arthur-common"
- version = "2.2.0"
+ version = "2.4.0"
  description = "Utility code common to Arthur platform components."
  authors = ["Arthur <engineering@arthur.ai>"]
  license = "MIT"

{arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/categorical_count.py
@@ -18,7 +18,8 @@ from arthur_common.models.schema_definitions import (
  ScalarType,
  ScopeSchemaTag,
  )
- from arthur_common.tools.duckdb_data_loader import escape_identifier, escape_str_literal
+
+ from arthur_common.tools.duckdb_data_loader import unescape_identifier, escape_str_literal


  class CategoricalCountAggregationFunction(NumericAggregationFunction):
@@ -93,30 +94,25 @@ class CategoricalCountAggregationFunction(NumericAggregationFunction):
  ] = None,
  ) -> list[NumericMetric]:
  """Executed SQL with no segmentation columns:
- select time_bucket(INTERVAL '5 minutes', {timestamp_col_escaped}) as ts, \
+ select time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts, \
  count(*) as count, \
- {categorical_col_escaped} as category, \
- {categorical_col_name_escaped} as column_name \
+ {categorical_col} as category, \
+ {categorical_col_name_unescaped} as column_name \
  from {dataset.dataset_table_name} \
  where ts is not null \
  group by ts, category
  """
  segmentation_cols = [] if not segmentation_cols else segmentation_cols
- timestamp_col_escaped = escape_identifier(timestamp_col)
- categorical_col_escaped = escape_identifier(categorical_col)
- categorical_col_name_escaped = escape_str_literal(categorical_col)
+ categorical_col_name_unescaped = escape_str_literal(unescape_identifier(categorical_col))

  # build query components with segmentation columns
- escaped_segmentation_cols = [
- escape_identifier(col) for col in segmentation_cols
- ]
  all_select_clause_cols = [
- f"time_bucket(INTERVAL '5 minutes', {timestamp_col_escaped}) as ts",
+ f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
  f"count(*) as count",
- f"{categorical_col_escaped} as category",
- f"{categorical_col_name_escaped} as column_name",
- ] + escaped_segmentation_cols
- all_group_by_cols = ["ts", "category"] + escaped_segmentation_cols
+ f"{categorical_col} as category",
+ f"{categorical_col_name_unescaped} as column_name",
+ ] + segmentation_cols
+ all_group_by_cols = ["ts", "category"] + segmentation_cols
  extra_dims = ["column_name", "category"]

  # build query
@@ -129,10 +125,11 @@ class CategoricalCountAggregationFunction(NumericAggregationFunction):

  results = ddb_conn.sql(count_query).df()

+ unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
  series = self.group_query_results_to_numeric_metrics(
  results,
  "count",
- segmentation_cols + extra_dims,
+ unescaped_segmentation_cols + extra_dims,
  timestamp_col="ts",
  )
  metric = self.series_to_metric(self.METRIC_NAME, series)

{arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/confusion_matrix.py
@@ -20,7 +20,8 @@ from arthur_common.models.schema_definitions import (
  ScalarType,
  ScopeSchemaTag,
  )
- from arthur_common.tools.duckdb_data_loader import escape_identifier, escape_str_literal
+
+ from arthur_common.tools.duckdb_data_loader import unescape_identifier, escape_str_literal


  class ConfusionMatrixAggregationFunction(NumericAggregationFunction):
@@ -78,11 +79,11 @@ class ConfusionMatrixAggregationFunction(NumericAggregationFunction):
  Without segmentation, this is the query:
  WITH normalized_data AS (
  SELECT
- {escaped_timestamp_col} AS timestamp,
- {prediction_normalization_case.replace('value', escaped_prediction_col)} AS prediction,
- {gt_normalization_case.replace('value', escaped_gt_values_col)} AS actual_value
+ {timestamp_col} AS timestamp,
+ {prediction_normalization_case.replace('value', prediction_col)} AS prediction,
+ {gt_normalization_case.replace('value', gt_values_col)} AS actual_value
  FROM {dataset.dataset_table_name}
- WHERE {escaped_timestamp_col} IS NOT NULL
+ WHERE {timestamp_col} IS NOT NULL
  )
  SELECT
  time_bucket(INTERVAL '5 minutes', timestamp) AS ts,
@@ -90,34 +91,29 @@ class ConfusionMatrixAggregationFunction(NumericAggregationFunction):
  SUM(CASE WHEN prediction != actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS false_positive_count,
  SUM(CASE WHEN prediction != actual_value AND actual_value = 1 THEN 1 ELSE 0 END) AS false_negative_count,
  SUM(CASE WHEN prediction = actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS true_negative_count,
- {escaped_prediction_col_name} as prediction_column_name
+ {unescaped_prediction_col_name} as prediction_column_name
  FROM normalized_data
  GROUP BY ts
  ORDER BY ts
  """
  segmentation_cols = [] if not segmentation_cols else segmentation_cols
- escaped_timestamp_col = escape_identifier(timestamp_col)
- escaped_prediction_col = escape_identifier(prediction_col)
- escaped_prediction_col_name = escape_str_literal(prediction_col)
- escaped_gt_values_col = escape_identifier(gt_values_col)
+ unescaped_prediction_col_name = escape_str_literal(unescape_identifier(prediction_col))
+
  # build query components with segmentation columns
- escaped_segmentation_cols = [
- escape_identifier(col) for col in segmentation_cols
- ]
  first_subquery_select_cols = [
- f"{escaped_timestamp_col} AS timestamp",
- f"{prediction_normalization_case.replace('value', escaped_prediction_col)} AS prediction",
- f"{gt_normalization_case.replace('value', escaped_gt_values_col)} AS actual_value",
- ] + escaped_segmentation_cols
+ f"{timestamp_col} AS timestamp",
+ f"{prediction_normalization_case.replace('value', prediction_col)} AS prediction",
+ f"{gt_normalization_case.replace('value', gt_values_col)} AS actual_value",
+ ] + segmentation_cols
  second_subquery_select_cols = [
  "time_bucket(INTERVAL '5 minutes', timestamp) AS ts",
  "SUM(CASE WHEN prediction = actual_value AND actual_value = 1 THEN 1 ELSE 0 END) AS true_positive_count",
  "SUM(CASE WHEN prediction != actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS false_positive_count",
  "SUM(CASE WHEN prediction != actual_value AND actual_value = 1 THEN 1 ELSE 0 END) AS false_negative_count",
  "SUM(CASE WHEN prediction = actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS true_negative_count",
- f"{escaped_prediction_col_name} as prediction_column_name",
- ] + escaped_segmentation_cols
- second_subquery_group_by_cols = ["ts"] + escaped_segmentation_cols
+ f"{unescaped_prediction_col_name} as prediction_column_name",
+ ] + segmentation_cols
+ second_subquery_group_by_cols = ["ts"] + segmentation_cols
  extra_dims = ["prediction_column_name"]

  # build query
@@ -125,7 +121,7 @@ class ConfusionMatrixAggregationFunction(NumericAggregationFunction):
  WITH normalized_data AS (
  SELECT {", ".join(first_subquery_select_cols)}
  FROM {dataset.dataset_table_name}
- WHERE {escaped_timestamp_col} IS NOT NULL
+ WHERE {timestamp_col} IS NOT NULL
  )
  SELECT {", ".join(second_subquery_select_cols)}
  FROM normalized_data
@@ -135,28 +131,29 @@ class ConfusionMatrixAggregationFunction(NumericAggregationFunction):

  results = ddb_conn.sql(confusion_matrix_query).df()

+ unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
  tp = self.group_query_results_to_numeric_metrics(
  results,
  "true_positive_count",
- dim_columns=segmentation_cols + extra_dims,
+ dim_columns=unescaped_segmentation_cols + extra_dims,
  timestamp_col="ts",
  )
  fp = self.group_query_results_to_numeric_metrics(
  results,
  "false_positive_count",
- dim_columns=segmentation_cols + extra_dims,
+ dim_columns=unescaped_segmentation_cols + extra_dims,
  timestamp_col="ts",
  )
  fn = self.group_query_results_to_numeric_metrics(
  results,
  "false_negative_count",
- dim_columns=segmentation_cols + extra_dims,
+ dim_columns=unescaped_segmentation_cols + extra_dims,
  timestamp_col="ts",
  )
  tn = self.group_query_results_to_numeric_metrics(
  results,
  "true_negative_count",
- dim_columns=segmentation_cols + extra_dims,
+ dim_columns=unescaped_segmentation_cols + extra_dims,
  timestamp_col="ts",
  )
  tp_metric = self.series_to_metric(self.TRUE_POSITIVE_METRIC_NAME, tp)
@@ -243,9 +240,8 @@ class BinaryClassifierIntBoolConfusionMatrixAggregationFunction(
  ] = None,
  ) -> list[NumericMetric]:
  segmentation_cols = [] if not segmentation_cols else segmentation_cols
- escaped_prediction_col = escape_identifier(prediction_col)
  # Get the type of prediction column
- type_query = f"SELECT typeof({escaped_prediction_col}) as col_type FROM {dataset.dataset_table_name} LIMIT 1"
+ type_query = f"SELECT typeof({prediction_col}) as col_type FROM {dataset.dataset_table_name} LIMIT 1"
  res = ddb_conn.sql(type_query).fetchone()
  # As long as this column exists, we should be able to get the type. This is here to make mypy happy.
  if not res:
@@ -476,7 +472,6 @@ class BinaryClassifierProbabilityThresholdConfusionMatrixAggregationFunction(
  ),
  ] = None,
  ) -> list[NumericMetric]:
- escaped_gt_values_col = escape_identifier(gt_values_col)
  prediction_normalization_case = f"""
  CASE
  WHEN value >= {threshold} THEN 1
@@ -485,7 +480,7 @@ class BinaryClassifierProbabilityThresholdConfusionMatrixAggregationFunction(
  END
  """

- type_query = f"SELECT typeof({escaped_gt_values_col}) as col_type FROM {dataset.dataset_table_name} LIMIT 1"
+ type_query = f"SELECT typeof({gt_values_col}) as col_type FROM {dataset.dataset_table_name} LIMIT 1"
  res = ddb_conn.sql(type_query).fetchone()
  # As long as this column exists, we should be able to get the type. This is here to make mypy happy.
  if not res:

{arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/inference_count.py
@@ -18,7 +18,7 @@ from arthur_common.models.schema_definitions import (
  ScalarType,
  ScopeSchemaTag,
  )
- from arthur_common.tools.duckdb_data_loader import escape_identifier
+ from arthur_common.tools.duckdb_data_loader import unescape_identifier


  class InferenceCountAggregationFunction(NumericAggregationFunction):
@@ -80,23 +80,19 @@ class InferenceCountAggregationFunction(NumericAggregationFunction):
  ] = None,
  ) -> list[NumericMetric]:
  """Executed SQL with no segmentation columns:
- select time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts, \
+ select time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts, \
  count(*) as count \
  from {dataset.dataset_table_name} \
  group by ts \
  """
  segmentation_cols = [] if not segmentation_cols else segmentation_cols
- escaped_timestamp_col = escape_identifier(timestamp_col)

  # build query components with segmentation columns
- escaped_segmentation_cols = [
- escape_identifier(col) for col in segmentation_cols
- ]
  all_select_clause_cols = [
- f"time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts",
+ f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
  f"count(*) as count",
- ] + escaped_segmentation_cols
- all_group_by_cols = ["ts"] + escaped_segmentation_cols
+ ] + segmentation_cols
+ all_group_by_cols = ["ts"] + segmentation_cols

  # build query
  count_query = f"""
@@ -106,10 +102,11 @@ class InferenceCountAggregationFunction(NumericAggregationFunction):
  """

  results = ddb_conn.sql(count_query).df()
+ unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
  series = self.group_query_results_to_numeric_metrics(
  results,
  "count",
- segmentation_cols,
+ unescaped_segmentation_cols,
  "ts",
  )
  metric = self.series_to_metric(self.METRIC_NAME, series)

{arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/inference_count_by_class.py
@@ -20,7 +20,7 @@ from arthur_common.models.schema_definitions import (
  ScalarType,
  ScopeSchemaTag,
  )
- from arthur_common.tools.duckdb_data_loader import escape_identifier
+ from arthur_common.tools.duckdb_data_loader import unescape_identifier


  class BinaryClassifierCountByClassAggregationFunction(NumericAggregationFunction):
@@ -100,31 +100,26 @@ class BinaryClassifierCountByClassAggregationFunction(NumericAggregationFunction
  ) -> list[NumericMetric]:
  """Executed SQL with no segmentation columns:
  SELECT
- time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts,
- {escaped_pred_col} as prediction,
+ time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts,
+ {prediction_col} as prediction,
  COUNT(*) as count
  FROM {dataset.dataset_table_name}
  GROUP BY
  ts,
  -- group by raw column name instead of alias in select
  -- in case table has a column called 'prediction'
- {escaped_pred_col}
+ {prediction_col}
  ORDER BY ts
  """
  segmentation_cols = [] if not segmentation_cols else segmentation_cols
- escaped_timestamp_col = escape_identifier(timestamp_col)
- escaped_pred_col = escape_identifier(prediction_col)

  # build query components with segmentation columns
- escaped_segmentation_cols = [
- escape_identifier(col) for col in segmentation_cols
- ]
  all_select_clause_cols = [
- f"time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts",
- f"{escaped_pred_col} as prediction",
+ f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
+ f"{prediction_col} as prediction",
  f"COUNT(*) as count",
- ] + escaped_segmentation_cols
- all_group_by_cols = ["ts", f"{escaped_pred_col}"] + escaped_segmentation_cols
+ ] + segmentation_cols
+ all_group_by_cols = ["ts", f"{prediction_col}"] + segmentation_cols
  extra_dims = ["prediction"]

  # build query
@@ -137,10 +132,11 @@ class BinaryClassifierCountByClassAggregationFunction(NumericAggregationFunction

  result = ddb_conn.sql(query).df()

+ unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
  series = self.group_query_results_to_numeric_metrics(
  result,
  "count",
- segmentation_cols + extra_dims,
+ unescaped_segmentation_cols + extra_dims,
  "ts",
  )
  metric = self.series_to_metric(self._metric_name(), series)
@@ -248,34 +244,29 @@ class BinaryClassifierCountThresholdClassAggregationFunction(
  ) -> list[NumericMetric]:
  """Executed SQL with no segmentation columns:
  SELECT
- time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts,
- CASE WHEN {escaped_prediction_col} >= {threshold} THEN '{true_label}' ELSE '{false_label}' END as prediction,
+ time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts,
+ CASE WHEN {prediction_col} >= {threshold} THEN '{true_label}' ELSE '{false_label}' END as prediction,
  COUNT(*) as count
  FROM {dataset.dataset_table_name}
  GROUP BY
  ts,
  -- group by raw column name instead of alias in select
  -- in case table has a column called 'prediction'
- {escaped_prediction_col}
+ {prediction_col}
  ORDER BY ts
  """
  segmentation_cols = [] if not segmentation_cols else segmentation_cols
- escaped_timestamp_col = escape_identifier(timestamp_col)
- escaped_prediction_col = escape_identifier(prediction_col)

  # build query components with segmentation columns
- escaped_segmentation_cols = [
- escape_identifier(col) for col in segmentation_cols
- ]
  all_select_clause_cols = [
- f"time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts",
- f"CASE WHEN {escaped_prediction_col} >= {threshold} THEN '{true_label}' ELSE '{false_label}' END as prediction",
+ f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
+ f"CASE WHEN {prediction_col} >= {threshold} THEN '{true_label}' ELSE '{false_label}' END as prediction",
  f"COUNT(*) as count",
- ] + escaped_segmentation_cols
+ ] + segmentation_cols
  all_group_by_cols = [
  "ts",
- f"{escaped_prediction_col}",
- ] + escaped_segmentation_cols
+ f"{prediction_col}",
+ ] + segmentation_cols
  extra_dims = ["prediction"]

  query = f"""
@@ -287,10 +278,11 @@ class BinaryClassifierCountThresholdClassAggregationFunction(

  result = ddb_conn.sql(query).df()

+ unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
  series = self.group_query_results_to_numeric_metrics(
  result,
  "count",
- segmentation_cols + extra_dims,
+ unescaped_segmentation_cols + extra_dims,
  "ts",
  )
  metric = self.series_to_metric(self._metric_name(), series)

{arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/inference_null_count.py
@@ -19,7 +19,7 @@ from arthur_common.models.schema_definitions import (
  ScalarType,
  ScopeSchemaTag,
  )
- from arthur_common.tools.duckdb_data_loader import escape_identifier
+ from arthur_common.tools.duckdb_data_loader import unescape_identifier


  class InferenceNullCountAggregationFunction(NumericAggregationFunction):
@@ -90,44 +90,40 @@ class InferenceNullCountAggregationFunction(NumericAggregationFunction):
  ] = None,
  ) -> list[NumericMetric]:
  """Executed SQL with no segmentation columns:
- select time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts, \
+ select time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts, \
  count(*) as count \
- from {dataset.dataset_table_name} where {escaped_nullable_col} is null \
+ from {dataset.dataset_table_name} where {nullable_col} is null \
  group by ts \
  """
  segmentation_cols = [] if not segmentation_cols else segmentation_cols
- escaped_timestamp_col = escape_identifier(timestamp_col)
- escaped_nullable_col = escape_identifier(nullable_col)

  # build query components with segmentation columns
- escaped_segmentation_cols = [
- escape_identifier(col) for col in segmentation_cols
- ]
  all_select_clause_cols = [
- f"time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts",
+ f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
  f"count(*) as count",
- ] + escaped_segmentation_cols
- all_group_by_cols = ["ts"] + escaped_segmentation_cols
+ ] + segmentation_cols
+ all_group_by_cols = ["ts"] + segmentation_cols

  # build query
  count_query = f"""
  select {", ".join(all_select_clause_cols)}
  from {dataset.dataset_table_name}
- where {escaped_nullable_col} is null
+ where {nullable_col} is null
  group by {", ".join(all_group_by_cols)}
  """

  results = ddb_conn.sql(count_query).df()

+ unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
  series = self.group_query_results_to_numeric_metrics(
  results,
  "count",
- segmentation_cols,
+ unescaped_segmentation_cols,
  "ts",
  )
  # preserve dimension that identifies the name of the nullable column used for the aggregation
  for point in series:
- point.dimensions.append(Dimension(name="column_name", value=nullable_col))
+ point.dimensions.append(Dimension(name="column_name", value=unescape_identifier(nullable_col)))

  metric = self.series_to_metric(self.METRIC_NAME, series)
  return [metric]

{arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/mean_absolute_error.py
@@ -19,7 +19,7 @@ from arthur_common.models.schema_definitions import (
  ScalarType,
  ScopeSchemaTag,
  )
- from arthur_common.tools.duckdb_data_loader import escape_identifier
+ from arthur_common.tools.duckdb_data_loader import unescape_identifier


  class MeanAbsoluteErrorAggregationFunction(NumericAggregationFunction):
@@ -111,50 +111,45 @@ class MeanAbsoluteErrorAggregationFunction(NumericAggregationFunction):
  ] = None,
  ) -> list[NumericMetric]:
  """Executed SQL with no segmentation columns:
- SELECT time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts, \
- SUM(ABS({escaped_prediction_col} - {escaped_ground_truth_col})) as ae, \
+ SELECT time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts, \
+ SUM(ABS({prediction_col} - {ground_truth_col})) as ae, \
  COUNT(*) as count \
  FROM {dataset.dataset_table_name} \
- WHERE {escaped_prediction_col} IS NOT NULL \
- AND {escaped_ground_truth_col} IS NOT NULL \
+ WHERE {prediction_col} IS NOT NULL \
+ AND {ground_truth_col} IS NOT NULL \
  GROUP BY ts order by ts desc \
  """
  segmentation_cols = [] if not segmentation_cols else segmentation_cols
- escaped_timestamp_col = escape_identifier(timestamp_col)
- escaped_prediction_col = escape_identifier(prediction_col)
- escaped_ground_truth_col = escape_identifier(ground_truth_col)

  # build query components with segmentation columns
- escaped_segmentation_cols = [
- escape_identifier(col) for col in segmentation_cols
- ]
  all_select_clause_cols = [
- f"time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts",
- f"SUM(ABS({escaped_prediction_col} - {escaped_ground_truth_col})) as ae",
+ f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
+ f"SUM(ABS({prediction_col} - {ground_truth_col})) as ae",
  f"COUNT(*) as count",
- ] + escaped_segmentation_cols
- all_group_by_cols = ["ts"] + escaped_segmentation_cols
+ ] + segmentation_cols
+ all_group_by_cols = ["ts"] + segmentation_cols

  # build query
  mae_query = f"""
  SELECT {", ".join(all_select_clause_cols)}
  FROM {dataset.dataset_table_name}
- WHERE {escaped_prediction_col} IS NOT NULL
- AND {escaped_ground_truth_col} IS NOT NULL
+ WHERE {prediction_col} IS NOT NULL
+ AND {ground_truth_col} IS NOT NULL
  GROUP BY {", ".join(all_group_by_cols)} order by ts desc
  """

  results = ddb_conn.sql(mae_query).df()
+ unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
  count_series = self.group_query_results_to_numeric_metrics(
  results,
  "count",
- segmentation_cols,
+ unescaped_segmentation_cols,
  "ts",
  )
  absolute_error_series = self.group_query_results_to_numeric_metrics(
  results,
  "ae",
- segmentation_cols,
+ unescaped_segmentation_cols,
  "ts",
  )


{arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/mean_squared_error.py
@@ -19,7 +19,7 @@ from arthur_common.models.schema_definitions import (
  ScalarType,
  ScopeSchemaTag,
  )
- from arthur_common.tools.duckdb_data_loader import escape_identifier
+ from arthur_common.tools.duckdb_data_loader import unescape_identifier


  class MeanSquaredErrorAggregationFunction(NumericAggregationFunction):
@@ -111,50 +111,45 @@ class MeanSquaredErrorAggregationFunction(NumericAggregationFunction):
  ] = None,
  ) -> list[NumericMetric]:
  """Executed SQL with no segmentation columns:
- SELECT time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts, \
- SUM(POW({escaped_prediction_col} - {escaped_ground_truth_col}, 2)) as squared_error, \
+ SELECT time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts, \
+ SUM(POW({prediction_col} - {ground_truth_col}, 2)) as squared_error, \
  COUNT(*) as count \
  FROM {dataset.dataset_table_name} \
- WHERE {escaped_prediction_col} IS NOT NULL \
- AND {escaped_ground_truth_col} IS NOT NULL \
+ WHERE {prediction_col} IS NOT NULL \
+ AND {ground_truth_col} IS NOT NULL \
  GROUP BY ts order by ts desc \
  """
  segmentation_cols = [] if not segmentation_cols else segmentation_cols
- escaped_timestamp_col = escape_identifier(timestamp_col)
- escaped_prediction_col = escape_identifier(prediction_col)
- escaped_ground_truth_col = escape_identifier(ground_truth_col)

  # build query components with segmentation columns
- escaped_segmentation_cols = [
- escape_identifier(col) for col in segmentation_cols
- ]
  all_select_clause_cols = [
- f"time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts",
- f"SUM(POW({escaped_prediction_col} - {escaped_ground_truth_col}, 2)) as squared_error",
+ f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
+ f"SUM(POW({prediction_col} - {ground_truth_col}, 2)) as squared_error",
  f"COUNT(*) as count",
- ] + escaped_segmentation_cols
- all_group_by_cols = ["ts"] + escaped_segmentation_cols
+ ] + segmentation_cols
+ all_group_by_cols = ["ts"] + segmentation_cols

  # build query
  mse_query = f"""
  SELECT {", ".join(all_select_clause_cols)}
  FROM {dataset.dataset_table_name}
- WHERE {escaped_prediction_col} IS NOT NULL
- AND {escaped_ground_truth_col} IS NOT NULL
+ WHERE {prediction_col} IS NOT NULL
+ AND {ground_truth_col} IS NOT NULL
  GROUP BY {", ".join(all_group_by_cols)} order by ts desc
  """

  results = ddb_conn.sql(mse_query).df()
+ unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
  count_series = self.group_query_results_to_numeric_metrics(
  results,
  "count",
- segmentation_cols,
+ unescaped_segmentation_cols,
  "ts",
  )
  squared_error_series = self.group_query_results_to_numeric_metrics(
  results,
  "squared_error",
- segmentation_cols,
+ unescaped_segmentation_cols,
  "ts",
  )


{arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/multiclass_confusion_matrix.py
@@ -20,7 +20,8 @@ from arthur_common.models.schema_definitions import (
  ScalarType,
  ScopeSchemaTag,
  )
- from arthur_common.tools.duckdb_data_loader import escape_identifier, escape_str_literal
+
+ from arthur_common.tools.duckdb_data_loader import escape_str_literal, unescape_identifier


  class MulticlassClassifierStringLabelSingleClassConfusionMatrixAggregationFunction(
@@ -194,11 +195,11 @@ class MulticlassClassifierStringLabelSingleClassConfusionMatrixAggregationFuncti
  Returns the following SQL with no segmentation:
  WITH normalized_data AS (
  SELECT
- {escaped_timestamp_col} AS timestamp,
- {prediction_normalization_case.replace('value', escaped_prediction_col)} AS prediction,
- {gt_normalization_case.replace('value', escaped_gt_values_col)} AS actual_value
+ {timestamp_col} AS timestamp,
+ {prediction_normalization_case.replace('value', prediction_col)} AS prediction,
+ {gt_normalization_case.replace('value', gt_values_col)} AS actual_value
  FROM {dataset.dataset_table_name}
- WHERE {escaped_timestamp_col} IS NOT NULL
+ WHERE {timestamp_col} IS NOT NULL
  )
  SELECT
  time_bucket(INTERVAL '5 minutes', timestamp) AS ts,
@@ -212,19 +213,12 @@ class MulticlassClassifierStringLabelSingleClassConfusionMatrixAggregationFuncti
  ORDER BY ts

  """
- escaped_timestamp_col = escape_identifier(timestamp_col)
- escaped_prediction_col = escape_identifier(prediction_col)
- escaped_gt_values_col = escape_identifier(gt_values_col)
-
  # build query components with segmentation columns
- escaped_segmentation_cols = [
- escape_identifier(col) for col in segmentation_cols
- ]
  first_subquery_select_cols = [
- f"{escaped_timestamp_col} AS timestamp",
- f"{prediction_normalization_case.replace('value', escaped_prediction_col)} AS prediction",
- f"{gt_normalization_case.replace('value', escaped_gt_values_col)} AS actual_value",
- ] + escaped_segmentation_cols
+ f"{timestamp_col} AS timestamp",
+ f"{prediction_normalization_case.replace('value', prediction_col)} AS prediction",
+ f"{gt_normalization_case.replace('value', gt_values_col)} AS actual_value",
+ ] + segmentation_cols
  second_subquery_select_cols = [
  "time_bucket(INTERVAL '5 minutes', timestamp) AS ts",
  "SUM(CASE WHEN prediction = 1 AND actual_value = 1 THEN 1 ELSE 0 END) AS true_positive_count",
@@ -232,8 +226,8 @@ class MulticlassClassifierStringLabelSingleClassConfusionMatrixAggregationFuncti
  "SUM(CASE WHEN prediction = 0 AND actual_value = 1 THEN 1 ELSE 0 END) AS false_negative_count",
  "SUM(CASE WHEN prediction = 0 AND actual_value = 0 THEN 1 ELSE 0 END) AS true_negative_count",
  f"any_value({escaped_positive_class_label}) as class_label",
- ] + escaped_segmentation_cols
- second_subquery_group_by_cols = ["ts"] + escaped_segmentation_cols
+ ] + segmentation_cols
+ second_subquery_group_by_cols = ["ts"] + segmentation_cols
  extra_dims = ["class_label"]

  # build query
@@ -241,7 +235,7 @@ class MulticlassClassifierStringLabelSingleClassConfusionMatrixAggregationFuncti
  WITH normalized_data AS (
  SELECT {", ".join(first_subquery_select_cols)}
  FROM {dataset.dataset_table_name}
- WHERE {escaped_timestamp_col} IS NOT NULL
+ WHERE {timestamp_col} IS NOT NULL
  )
  SELECT {", ".join(second_subquery_select_cols)}
  FROM normalized_data
@@ -250,29 +244,30 @@ class MulticlassClassifierStringLabelSingleClassConfusionMatrixAggregationFuncti
  """

  results = ddb_conn.sql(confusion_matrix_query).df()
+ unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]

  tp = self.group_query_results_to_numeric_metrics(
  results,
  "true_positive_count",
- dim_columns=segmentation_cols + extra_dims,
+ dim_columns=unescaped_segmentation_cols + extra_dims,
  timestamp_col="ts",
  )
  fp = self.group_query_results_to_numeric_metrics(
  results,
  "false_positive_count",
- dim_columns=segmentation_cols + extra_dims,
+ dim_columns=unescaped_segmentation_cols + extra_dims,
  timestamp_col="ts",
  )
  fn = self.group_query_results_to_numeric_metrics(
  results,
  "false_negative_count",
- dim_columns=segmentation_cols + extra_dims,
+ dim_columns=unescaped_segmentation_cols + extra_dims,
  timestamp_col="ts",
  )
  tn = self.group_query_results_to_numeric_metrics(
  results,
  "true_negative_count",
- dim_columns=segmentation_cols + extra_dims,
+ dim_columns=unescaped_segmentation_cols + extra_dims,
  timestamp_col="ts",
  )
  tp_metric = self.series_to_metric(

{arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/numeric_stats.py
@@ -18,7 +18,8 @@ from arthur_common.models.schema_definitions import (
  ScalarType,
  ScopeSchemaTag,
  )
- from arthur_common.tools.duckdb_data_loader import escape_identifier, escape_str_literal
+
+ from arthur_common.tools.duckdb_data_loader import unescape_identifier, escape_str_literal


  class NumericSketchAggregationFunction(SketchAggregationFunction):
@@ -95,41 +96,37 @@ class NumericSketchAggregationFunction(SketchAggregationFunction):
  ] = None,
  ) -> list[SketchMetric]:
  """Executed SQL with no segmentation columns:
- select {escaped_timestamp_col_id} as ts, \
- {escaped_numeric_col_id}, \
+ select {timestamp_col} as ts, \
+ {numeric_col}, \
  {numeric_col_name_str} as column_name \
  from {dataset.dataset_table_name} \
- where {escaped_numeric_col_id} is not null \
+ where {numeric_col} is not null \
  """
  segmentation_cols = [] if not segmentation_cols else segmentation_cols
- escaped_timestamp_col_id = escape_identifier(timestamp_col)
- escaped_numeric_col_id = escape_identifier(numeric_col)
- numeric_col_name_str = escape_str_literal(numeric_col)
+ numeric_col_name_str = escape_str_literal(unescape_identifier(numeric_col))

  # build query components with segmentation columns
- escaped_segmentation_cols = [
- escape_identifier(col) for col in segmentation_cols
- ]
  all_select_clause_cols = [
- f"{escaped_timestamp_col_id} as ts",
- f"{escaped_numeric_col_id}",
+ f"{timestamp_col} as ts",
+ f"{numeric_col}",
  f"{numeric_col_name_str} as column_name",
- ] + escaped_segmentation_cols
+ ] + segmentation_cols
  extra_dims = ["column_name"]

  # build query
  data_query = f"""
  select {", ".join(all_select_clause_cols)}
  from {dataset.dataset_table_name}
- where {escaped_numeric_col_id} is not null
+ where {numeric_col} is not null
  """

  results = ddb_conn.sql(data_query).df()
+ unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]

  series = self.group_query_results_to_sketch_metrics(
  results,
- numeric_col,
- segmentation_cols + extra_dims,
+ unescape_identifier(numeric_col),
+ unescaped_segmentation_cols + extra_dims,
  "ts",
  )


{arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/numeric_sum.py
@@ -19,7 +19,7 @@ from arthur_common.models.schema_definitions import (
  ScalarType,
  ScopeSchemaTag,
  )
- from arthur_common.tools.duckdb_data_loader import escape_identifier
+ from arthur_common.tools.duckdb_data_loader import unescape_identifier


  class NumericSumAggregationFunction(NumericAggregationFunction):
@@ -94,45 +94,41 @@ class NumericSumAggregationFunction(NumericAggregationFunction):
  ] = None,
  ) -> list[NumericMetric]:
  """Executed SQL with no segmentation columns:
- select time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts, \
- sum({escaped_numeric_col}) as sum \
+ select time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts, \
+ sum({numeric_col}) as sum \
  from {dataset.dataset_table_name} \
- where {escaped_numeric_col} is not null \
+ where {numeric_col} is not null \
  group by ts \
  """
  segmentation_cols = [] if not segmentation_cols else segmentation_cols
- escaped_timestamp_col = escape_identifier(timestamp_col)
- escaped_numeric_col = escape_identifier(numeric_col)

  # build query components with segmentation columns
- escaped_segmentation_cols = [
- escape_identifier(col) for col in segmentation_cols
- ]
  all_select_clause_cols = [
- f"time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts",
- f"sum({escaped_numeric_col}) as sum",
- ] + escaped_segmentation_cols
- all_group_by_cols = ["ts"] + escaped_segmentation_cols
+ f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
+ f"sum({numeric_col}) as sum",
+ ] + segmentation_cols
+ all_group_by_cols = ["ts"] + segmentation_cols

  # build query
  query = f"""
  select {", ".join(all_select_clause_cols)}
  from {dataset.dataset_table_name}
- where {escaped_numeric_col} is not null
+ where {numeric_col} is not null
  group by {", ".join(all_group_by_cols)}
  """

  results = ddb_conn.sql(query).df()
+ unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]

  series = self.group_query_results_to_numeric_metrics(
  results,
  "sum",
- segmentation_cols,
+ unescaped_segmentation_cols,
  "ts",
  )
  # preserve dimension that identifies the name of the numeric column used for the aggregation
  for point in series:
- point.dimensions.append(Dimension(name="column_name", value=numeric_col))
+ point.dimensions.append(Dimension(name="column_name", value=unescape_identifier(numeric_col)))

  metric = self.series_to_metric(self.METRIC_NAME, series)
  return [metric]

{arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/models/metrics.py
@@ -122,20 +122,6 @@ class BaseAggregationParameterSchema(BaseModel):
  description="Description of the parameter.",
  )

- @field_validator("parameter_key")
- @classmethod
- def validate_parameter_key_allowed_characters(cls, v: str) -> str:
- if not v.replace("_", "").isalpha():
- raise ValueError("Parameter key can only contain letters and underscores.")
- return v
-
- @field_validator("friendly_name")
- @classmethod
- def validate_friendly_name_allowed_characters(cls, v: str) -> str:
- if not v.replace("_", "").replace(" ", "").isalpha():
- raise ValueError("Friendly name can only contain letters and underscores.")
- return v
-

  class MetricsParameterSchema(BaseAggregationParameterSchema):
  # specific to default metrics/Python metrics—not available to custom aggregations
@@ -309,10 +295,3 @@ class ReportedCustomAggregation(BaseReportedAggregation):
  dimension_columns: list[str] = Field(
  description="Name of any dimension columns returned from the SQL query. Max length is 1.",
  )
-
- @field_validator("dimension_columns")
- @classmethod
- def validate_dimension_columns_length(cls, v: list[str]) -> list[str]:
- if len(v) > 1:
- raise ValueError("Only one dimension column can be specified.")
- return v

{arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/tools/duckdb_data_loader.py
@@ -1,4 +1,5 @@
  import json
+ import re
  from typing import Any

  import duckdb
@@ -314,6 +315,9 @@ def escape_identifier(identifier: str) -> str:
  """
  Escape an identifier (e.g., column name) for use in a SQL query.
  This method handles special characters and ensures proper quoting.
+
+ For struct fields, the identifiers must be escaped as following:
+ "struct_column_name"."struct_field"
  """
  # Replace any double quotes with two double quotes
  escaped = identifier.replace('"', '""')
@@ -321,6 +325,32 @@ def escape_identifier(identifier: str) -> str:
  return f'"{escaped}"'


+ def unescape_identifier(identifier: str) -> str:
+ """
+ Unescape an identifier (e.g., column name).
+
+ This removes the double quotes and properly handles struct fields, which may be escaped as follows:
+ "struct_column_name"."struct_field"
+
+ Here's a hard case for help understanding this function: "struct "" column name with quotes"."struct.field.name.with.dots"
+ """
+ unescaped_identifiers = []
+ # strip top-level quotes
+ identifier = identifier[1:-1]
+ # split identifier into struct fields based on delimiter pattern "."
+ # at this point there are no external double quotes left; any remaining are escaped double quotes belonging to
+ # the column name
+ identifier_split_in_struct_fields = re.split(r'"\."', identifier)
+
+ for identifier in identifier_split_in_struct_fields:
+ # replace any escaped double quotes in the column
+ unescaped_identifier = identifier.replace('""', '"')
+ unescaped_identifiers.append(unescaped_identifier)
+
+ # join back any struct fields via dot syntax without the escape identifiers
+ return ".".join(unescaped_identifiers)
+
+
  def escape_str_literal(literal: str) -> str:
  """
  Escape a duckDB string literal for use in a SQL query.
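
The new unescape_identifier is the inverse of escape_identifier, including the struct-field form called out in the docstrings above. Below is a minimal, self-contained sketch of that round trip; the helper bodies are re-derived from the diff for illustration rather than imported from the package, and the example column names are hypothetical (the "hard case" string is the one from the docstring):

import re

def escape_identifier(identifier: str) -> str:
    # double any embedded double quotes, then wrap the whole identifier in double quotes
    return '"' + identifier.replace('"', '""') + '"'

def unescape_identifier(identifier: str) -> str:
    # strip the outer quotes, split escaped struct fields on the quote-dot-quote delimiter,
    # then undo the quote doubling in each part and rejoin the parts with dots
    inner = identifier[1:-1]
    parts = re.split(r'"\."', inner)
    return ".".join(part.replace('""', '"') for part in parts)

# plain column name: escaping then unescaping returns the original name
assert unescape_identifier(escape_identifier('my "quoted" col')) == 'my "quoted" col'

# struct field: each part is escaped separately and joined with a dot
escaped = escape_identifier('struct " column name with quotes') + "." + escape_identifier("struct.field.name.with.dots")
print(escaped)                       # "struct "" column name with quotes"."struct.field.name.with.dots"
print(unescape_identifier(escaped))  # struct " column name with quotes.struct.field.name.with.dots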

{arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/tools/duckdb_utils.py
@@ -16,17 +16,15 @@ def is_column_possible_segmentation(
  2. Has an allowed DType.

  PreReq: Table with column should already be loaded in DuckDB
+ column_name already has DuckDB escape identifier for the query syntax
  """
  segmentation_col_unique_val_limit = Config.segmentation_col_unique_values_limit()
  if column_dtype not in SEGMENTATION_ALLOWED_DTYPES:
  return False

- # check column for unique value count
- escaped_column = escape_identifier(column_name)
-
- # count distinct values in this column
+ # check column for unique value count - count distinct values in this column
  distinct_count_query = f"""
- SELECT COUNT(DISTINCT {escaped_column}) as distinct_count
+ SELECT COUNT(DISTINCT {column_name}) as distinct_count
  FROM {table}
  """
  result = conn.sql(distinct_count_query).fetchone()

{arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/tools/schema_inferer.py
@@ -110,7 +110,7 @@ class SchemaInferer:
  if not is_nested_col and is_column_possible_segmentation(
  self.conn,
  table,
- col_name,
+ escape_identifier(col_name),
  scalar_schema.dtype,
  ):
  scalar_schema.tag_hints.append(ScopeSchemaTag.POSSIBLE_SEGMENTATION)
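
Taken together, the last two changes move escaping to the boundary: SchemaInferer escapes the column name once before calling is_column_possible_segmentation, the aggregation functions above receive identifiers that are already escaped and splice them straight into SQL, and unescape_identifier is applied only when a name is emitted as a human-readable dimension value. A minimal sketch of that convention, assuming arthur-common 2.4.0 is installed; the column and table names are hypothetical:

from arthur_common.tools.duckdb_data_loader import escape_identifier, unescape_identifier

raw_col = 'user "tier"'                    # hypothetical column name with awkward characters
escaped_col = escape_identifier(raw_col)   # '"user ""tier"""', safe to splice into SQL
query = f"SELECT COUNT(DISTINCT {escaped_col}) AS distinct_count FROM my_table"

# when reporting the column back as a metric dimension, recover the readable name
dimension_value = unescape_identifier(escaped_col)   # 'user "tier"'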

The remaining files (17-49 in the list above) are unchanged between 2.2.0 and 2.4.0.