arthur-common 2.2.0__tar.gz → 2.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of arthur-common might be problematic.

Files changed (49)
  1. {arthur_common-2.2.0 → arthur_common-2.4.0}/PKG-INFO +1 -1
  2. {arthur_common-2.2.0 → arthur_common-2.4.0}/pyproject.toml +1 -1
  3. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/categorical_count.py +13 -16
  4. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/confusion_matrix.py +24 -29
  5. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/inference_count.py +7 -10
  6. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/inference_count_by_class.py +20 -28
  7. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/inference_null_count.py +10 -14
  8. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/mean_absolute_error.py +14 -19
  9. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/mean_squared_error.py +14 -19
  10. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/multiclass_confusion_matrix.py +18 -23
  11. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/numeric_stats.py +13 -16
  12. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/numeric_sum.py +12 -16
  13. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/models/metrics.py +0 -21
  14. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/tools/duckdb_data_loader.py +30 -0
  15. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/tools/duckdb_utils.py +3 -5
  16. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/tools/schema_inferer.py +1 -1
  17. {arthur_common-2.2.0 → arthur_common-2.4.0}/README.md +0 -0
  18. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/__init__.py +0 -0
  19. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/__init__.py +0 -0
  20. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/aggregator.py +0 -0
  21. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/README.md +0 -0
  22. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/__init__.py +0 -0
  23. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/agentic_aggregations.py +0 -0
  24. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/multiclass_inference_count_by_class.py +0 -0
  25. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/py.typed +0 -0
  26. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/shield_aggregations.py +0 -0
  27. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/py.typed +0 -0
  28. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/config/__init__.py +0 -0
  29. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/config/config.py +0 -0
  30. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/config/settings.yaml +0 -0
  31. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/models/__init__.py +0 -0
  32. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/models/common_schemas.py +0 -0
  33. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/models/connectors.py +0 -0
  34. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/models/constants.py +0 -0
  35. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/models/datasets.py +0 -0
  36. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/models/enums.py +0 -0
  37. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/models/metric_schemas.py +0 -0
  38. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/models/py.typed +0 -0
  39. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/models/request_schemas.py +0 -0
  40. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/models/response_schemas.py +0 -0
  41. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/models/schema_definitions.py +0 -0
  42. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/models/task_job_specs.py +0 -0
  43. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/py.typed +0 -0
  44. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/tools/__init__.py +0 -0
  45. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/tools/aggregation_analyzer.py +0 -0
  46. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/tools/aggregation_loader.py +0 -0
  47. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/tools/functions.py +0 -0
  48. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/tools/py.typed +0 -0
  49. {arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/tools/time_utils.py +0 -0

{arthur_common-2.2.0 → arthur_common-2.4.0}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: arthur-common
- Version: 2.2.0
+ Version: 2.4.0
  Summary: Utility code common to Arthur platform components.
  License: MIT
  Author: Arthur

{arthur_common-2.2.0 → arthur_common-2.4.0}/pyproject.toml
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "arthur-common"
- version = "2.2.0"
+ version = "2.4.0"
  description = "Utility code common to Arthur platform components."
  authors = ["Arthur <engineering@arthur.ai>"]
  license = "MIT"

{arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/categorical_count.py
@@ -18,7 +18,8 @@ from arthur_common.models.schema_definitions import (
  ScalarType,
  ScopeSchemaTag,
  )
- from arthur_common.tools.duckdb_data_loader import escape_identifier, escape_str_literal
+
+ from arthur_common.tools.duckdb_data_loader import unescape_identifier, escape_str_literal


  class CategoricalCountAggregationFunction(NumericAggregationFunction):
@@ -93,30 +94,25 @@ class CategoricalCountAggregationFunction(NumericAggregationFunction):
  ] = None,
  ) -> list[NumericMetric]:
  """Executed SQL with no segmentation columns:
- select time_bucket(INTERVAL '5 minutes', {timestamp_col_escaped}) as ts, \
+ select time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts, \
  count(*) as count, \
- {categorical_col_escaped} as category, \
- {categorical_col_name_escaped} as column_name \
+ {categorical_col} as category, \
+ {categorical_col_name_unescaped} as column_name \
  from {dataset.dataset_table_name} \
  where ts is not null \
  group by ts, category
  """
  segmentation_cols = [] if not segmentation_cols else segmentation_cols
- timestamp_col_escaped = escape_identifier(timestamp_col)
- categorical_col_escaped = escape_identifier(categorical_col)
- categorical_col_name_escaped = escape_str_literal(categorical_col)
+ categorical_col_name_unescaped = escape_str_literal(unescape_identifier(categorical_col))

  # build query components with segmentation columns
- escaped_segmentation_cols = [
- escape_identifier(col) for col in segmentation_cols
- ]
  all_select_clause_cols = [
- f"time_bucket(INTERVAL '5 minutes', {timestamp_col_escaped}) as ts",
+ f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
  f"count(*) as count",
- f"{categorical_col_escaped} as category",
- f"{categorical_col_name_escaped} as column_name",
- ] + escaped_segmentation_cols
- all_group_by_cols = ["ts", "category"] + escaped_segmentation_cols
+ f"{categorical_col} as category",
+ f"{categorical_col_name_unescaped} as column_name",
+ ] + segmentation_cols
+ all_group_by_cols = ["ts", "category"] + segmentation_cols
  extra_dims = ["column_name", "category"]

  # build query
@@ -129,10 +125,11 @@ class CategoricalCountAggregationFunction(NumericAggregationFunction):

  results = ddb_conn.sql(count_query).df()

+ unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
  series = self.group_query_results_to_numeric_metrics(
  results,
  "count",
- segmentation_cols + extra_dims,
+ unescaped_segmentation_cols + extra_dims,
  timestamp_col="ts",
  )
  metric = self.series_to_metric(self.METRIC_NAME, series)

{arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/confusion_matrix.py
@@ -20,7 +20,8 @@ from arthur_common.models.schema_definitions import (
  ScalarType,
  ScopeSchemaTag,
  )
- from arthur_common.tools.duckdb_data_loader import escape_identifier, escape_str_literal
+
+ from arthur_common.tools.duckdb_data_loader import unescape_identifier, escape_str_literal


  class ConfusionMatrixAggregationFunction(NumericAggregationFunction):
@@ -78,11 +79,11 @@ class ConfusionMatrixAggregationFunction(NumericAggregationFunction):
  Without segmentation, this is the query:
  WITH normalized_data AS (
  SELECT
- {escaped_timestamp_col} AS timestamp,
- {prediction_normalization_case.replace('value', escaped_prediction_col)} AS prediction,
- {gt_normalization_case.replace('value', escaped_gt_values_col)} AS actual_value
+ {timestamp_col} AS timestamp,
+ {prediction_normalization_case.replace('value', prediction_col)} AS prediction,
+ {gt_normalization_case.replace('value', gt_values_col)} AS actual_value
  FROM {dataset.dataset_table_name}
- WHERE {escaped_timestamp_col} IS NOT NULL
+ WHERE {timestamp_col} IS NOT NULL
  )
  SELECT
  time_bucket(INTERVAL '5 minutes', timestamp) AS ts,
@@ -90,34 +91,29 @@ class ConfusionMatrixAggregationFunction(NumericAggregationFunction):
  SUM(CASE WHEN prediction != actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS false_positive_count,
  SUM(CASE WHEN prediction != actual_value AND actual_value = 1 THEN 1 ELSE 0 END) AS false_negative_count,
  SUM(CASE WHEN prediction = actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS true_negative_count,
- {escaped_prediction_col_name} as prediction_column_name
+ {unescaped_prediction_col_name} as prediction_column_name
  FROM normalized_data
  GROUP BY ts
  ORDER BY ts
  """
  segmentation_cols = [] if not segmentation_cols else segmentation_cols
- escaped_timestamp_col = escape_identifier(timestamp_col)
- escaped_prediction_col = escape_identifier(prediction_col)
- escaped_prediction_col_name = escape_str_literal(prediction_col)
- escaped_gt_values_col = escape_identifier(gt_values_col)
+ unescaped_prediction_col_name = escape_str_literal(unescape_identifier(prediction_col))
+
  # build query components with segmentation columns
- escaped_segmentation_cols = [
- escape_identifier(col) for col in segmentation_cols
- ]
  first_subquery_select_cols = [
- f"{escaped_timestamp_col} AS timestamp",
- f"{prediction_normalization_case.replace('value', escaped_prediction_col)} AS prediction",
- f"{gt_normalization_case.replace('value', escaped_gt_values_col)} AS actual_value",
- ] + escaped_segmentation_cols
+ f"{timestamp_col} AS timestamp",
+ f"{prediction_normalization_case.replace('value', prediction_col)} AS prediction",
+ f"{gt_normalization_case.replace('value', gt_values_col)} AS actual_value",
+ ] + segmentation_cols
  second_subquery_select_cols = [
  "time_bucket(INTERVAL '5 minutes', timestamp) AS ts",
  "SUM(CASE WHEN prediction = actual_value AND actual_value = 1 THEN 1 ELSE 0 END) AS true_positive_count",
  "SUM(CASE WHEN prediction != actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS false_positive_count",
  "SUM(CASE WHEN prediction != actual_value AND actual_value = 1 THEN 1 ELSE 0 END) AS false_negative_count",
  "SUM(CASE WHEN prediction = actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS true_negative_count",
- f"{escaped_prediction_col_name} as prediction_column_name",
- ] + escaped_segmentation_cols
- second_subquery_group_by_cols = ["ts"] + escaped_segmentation_cols
+ f"{unescaped_prediction_col_name} as prediction_column_name",
+ ] + segmentation_cols
+ second_subquery_group_by_cols = ["ts"] + segmentation_cols
  extra_dims = ["prediction_column_name"]

  # build query
@@ -125,7 +121,7 @@ class ConfusionMatrixAggregationFunction(NumericAggregationFunction):
  WITH normalized_data AS (
  SELECT {", ".join(first_subquery_select_cols)}
  FROM {dataset.dataset_table_name}
- WHERE {escaped_timestamp_col} IS NOT NULL
+ WHERE {timestamp_col} IS NOT NULL
  )
  SELECT {", ".join(second_subquery_select_cols)}
  FROM normalized_data
@@ -135,28 +131,29 @@ class ConfusionMatrixAggregationFunction(NumericAggregationFunction):

  results = ddb_conn.sql(confusion_matrix_query).df()

+ unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
  tp = self.group_query_results_to_numeric_metrics(
  results,
  "true_positive_count",
- dim_columns=segmentation_cols + extra_dims,
+ dim_columns=unescaped_segmentation_cols + extra_dims,
  timestamp_col="ts",
  )
  fp = self.group_query_results_to_numeric_metrics(
  results,
  "false_positive_count",
- dim_columns=segmentation_cols + extra_dims,
+ dim_columns=unescaped_segmentation_cols + extra_dims,
  timestamp_col="ts",
  )
  fn = self.group_query_results_to_numeric_metrics(
  results,
  "false_negative_count",
- dim_columns=segmentation_cols + extra_dims,
+ dim_columns=unescaped_segmentation_cols + extra_dims,
  timestamp_col="ts",
  )
  tn = self.group_query_results_to_numeric_metrics(
  results,
  "true_negative_count",
- dim_columns=segmentation_cols + extra_dims,
+ dim_columns=unescaped_segmentation_cols + extra_dims,
  timestamp_col="ts",
  )
  tp_metric = self.series_to_metric(self.TRUE_POSITIVE_METRIC_NAME, tp)
@@ -243,9 +240,8 @@ class BinaryClassifierIntBoolConfusionMatrixAggregationFunction(
  ] = None,
  ) -> list[NumericMetric]:
  segmentation_cols = [] if not segmentation_cols else segmentation_cols
- escaped_prediction_col = escape_identifier(prediction_col)
  # Get the type of prediction column
- type_query = f"SELECT typeof({escaped_prediction_col}) as col_type FROM {dataset.dataset_table_name} LIMIT 1"
+ type_query = f"SELECT typeof({prediction_col}) as col_type FROM {dataset.dataset_table_name} LIMIT 1"
  res = ddb_conn.sql(type_query).fetchone()
  # As long as this column exists, we should be able to get the type. This is here to make mypy happy.
  if not res:
@@ -476,7 +472,6 @@ class BinaryClassifierProbabilityThresholdConfusionMatrixAggregationFunction(
  ),
  ] = None,
  ) -> list[NumericMetric]:
- escaped_gt_values_col = escape_identifier(gt_values_col)
  prediction_normalization_case = f"""
  CASE
  WHEN value >= {threshold} THEN 1
@@ -485,7 +480,7 @@ class BinaryClassifierProbabilityThresholdConfusionMatrixAggregationFunction(
  END
  """

- type_query = f"SELECT typeof({escaped_gt_values_col}) as col_type FROM {dataset.dataset_table_name} LIMIT 1"
+ type_query = f"SELECT typeof({gt_values_col}) as col_type FROM {dataset.dataset_table_name} LIMIT 1"
  res = ddb_conn.sql(type_query).fetchone()
  # As long as this column exists, we should be able to get the type. This is here to make mypy happy.
  if not res:

{arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/inference_count.py
@@ -18,7 +18,7 @@ from arthur_common.models.schema_definitions import (
  ScalarType,
  ScopeSchemaTag,
  )
- from arthur_common.tools.duckdb_data_loader import escape_identifier
+ from arthur_common.tools.duckdb_data_loader import unescape_identifier


  class InferenceCountAggregationFunction(NumericAggregationFunction):
@@ -80,23 +80,19 @@ class InferenceCountAggregationFunction(NumericAggregationFunction):
  ] = None,
  ) -> list[NumericMetric]:
  """Executed SQL with no segmentation columns:
- select time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts, \
+ select time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts, \
  count(*) as count \
  from {dataset.dataset_table_name} \
  group by ts \
  """
  segmentation_cols = [] if not segmentation_cols else segmentation_cols
- escaped_timestamp_col = escape_identifier(timestamp_col)

  # build query components with segmentation columns
- escaped_segmentation_cols = [
- escape_identifier(col) for col in segmentation_cols
- ]
  all_select_clause_cols = [
- f"time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts",
+ f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
  f"count(*) as count",
- ] + escaped_segmentation_cols
- all_group_by_cols = ["ts"] + escaped_segmentation_cols
+ ] + segmentation_cols
+ all_group_by_cols = ["ts"] + segmentation_cols

  # build query
  count_query = f"""
@@ -106,10 +102,11 @@ class InferenceCountAggregationFunction(NumericAggregationFunction):
  """

  results = ddb_conn.sql(count_query).df()
+ unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
  series = self.group_query_results_to_numeric_metrics(
  results,
  "count",
- segmentation_cols,
+ unescaped_segmentation_cols,
  "ts",
  )
  metric = self.series_to_metric(self.METRIC_NAME, series)

{arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/inference_count_by_class.py
@@ -20,7 +20,7 @@ from arthur_common.models.schema_definitions import (
  ScalarType,
  ScopeSchemaTag,
  )
- from arthur_common.tools.duckdb_data_loader import escape_identifier
+ from arthur_common.tools.duckdb_data_loader import unescape_identifier


  class BinaryClassifierCountByClassAggregationFunction(NumericAggregationFunction):
@@ -100,31 +100,26 @@ class BinaryClassifierCountByClassAggregationFunction(NumericAggregationFunction
  ) -> list[NumericMetric]:
  """Executed SQL with no segmentation columns:
  SELECT
- time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts,
- {escaped_pred_col} as prediction,
+ time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts,
+ {prediction_col} as prediction,
  COUNT(*) as count
  FROM {dataset.dataset_table_name}
  GROUP BY
  ts,
  -- group by raw column name instead of alias in select
  -- in case table has a column called 'prediction'
- {escaped_pred_col}
+ {prediction_col}
  ORDER BY ts
  """
  segmentation_cols = [] if not segmentation_cols else segmentation_cols
- escaped_timestamp_col = escape_identifier(timestamp_col)
- escaped_pred_col = escape_identifier(prediction_col)

  # build query components with segmentation columns
- escaped_segmentation_cols = [
- escape_identifier(col) for col in segmentation_cols
- ]
  all_select_clause_cols = [
- f"time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts",
- f"{escaped_pred_col} as prediction",
+ f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
+ f"{prediction_col} as prediction",
  f"COUNT(*) as count",
- ] + escaped_segmentation_cols
- all_group_by_cols = ["ts", f"{escaped_pred_col}"] + escaped_segmentation_cols
+ ] + segmentation_cols
+ all_group_by_cols = ["ts", f"{prediction_col}"] + segmentation_cols
  extra_dims = ["prediction"]

  # build query
@@ -137,10 +132,11 @@ class BinaryClassifierCountByClassAggregationFunction(NumericAggregationFunction

  result = ddb_conn.sql(query).df()

+ unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
  series = self.group_query_results_to_numeric_metrics(
  result,
  "count",
- segmentation_cols + extra_dims,
+ unescaped_segmentation_cols + extra_dims,
  "ts",
  )
  metric = self.series_to_metric(self._metric_name(), series)
@@ -248,34 +244,29 @@ class BinaryClassifierCountThresholdClassAggregationFunction(
  ) -> list[NumericMetric]:
  """Executed SQL with no segmentation columns:
  SELECT
- time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts,
- CASE WHEN {escaped_prediction_col} >= {threshold} THEN '{true_label}' ELSE '{false_label}' END as prediction,
+ time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts,
+ CASE WHEN {prediction_col} >= {threshold} THEN '{true_label}' ELSE '{false_label}' END as prediction,
  COUNT(*) as count
  FROM {dataset.dataset_table_name}
  GROUP BY
  ts,
  -- group by raw column name instead of alias in select
  -- in case table has a column called 'prediction'
- {escaped_prediction_col}
+ {prediction_col}
  ORDER BY ts
  """
  segmentation_cols = [] if not segmentation_cols else segmentation_cols
- escaped_timestamp_col = escape_identifier(timestamp_col)
- escaped_prediction_col = escape_identifier(prediction_col)

  # build query components with segmentation columns
- escaped_segmentation_cols = [
- escape_identifier(col) for col in segmentation_cols
- ]
  all_select_clause_cols = [
- f"time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts",
- f"CASE WHEN {escaped_prediction_col} >= {threshold} THEN '{true_label}' ELSE '{false_label}' END as prediction",
+ f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
+ f"CASE WHEN {prediction_col} >= {threshold} THEN '{true_label}' ELSE '{false_label}' END as prediction",
  f"COUNT(*) as count",
- ] + escaped_segmentation_cols
+ ] + segmentation_cols
  all_group_by_cols = [
  "ts",
- f"{escaped_prediction_col}",
- ] + escaped_segmentation_cols
+ f"{prediction_col}",
+ ] + segmentation_cols
  extra_dims = ["prediction"]

  query = f"""
@@ -287,10 +278,11 @@ class BinaryClassifierCountThresholdClassAggregationFunction(

  result = ddb_conn.sql(query).df()

+ unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
  series = self.group_query_results_to_numeric_metrics(
  result,
  "count",
- segmentation_cols + extra_dims,
+ unescaped_segmentation_cols + extra_dims,
  "ts",
  )
  metric = self.series_to_metric(self._metric_name(), series)

{arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/inference_null_count.py
@@ -19,7 +19,7 @@ from arthur_common.models.schema_definitions import (
  ScalarType,
  ScopeSchemaTag,
  )
- from arthur_common.tools.duckdb_data_loader import escape_identifier
+ from arthur_common.tools.duckdb_data_loader import unescape_identifier


  class InferenceNullCountAggregationFunction(NumericAggregationFunction):
@@ -90,44 +90,40 @@ class InferenceNullCountAggregationFunction(NumericAggregationFunction):
  ] = None,
  ) -> list[NumericMetric]:
  """Executed SQL with no segmentation columns:
- select time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts, \
+ select time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts, \
  count(*) as count \
- from {dataset.dataset_table_name} where {escaped_nullable_col} is null \
+ from {dataset.dataset_table_name} where {nullable_col} is null \
  group by ts \
  """
  segmentation_cols = [] if not segmentation_cols else segmentation_cols
- escaped_timestamp_col = escape_identifier(timestamp_col)
- escaped_nullable_col = escape_identifier(nullable_col)

  # build query components with segmentation columns
- escaped_segmentation_cols = [
- escape_identifier(col) for col in segmentation_cols
- ]
  all_select_clause_cols = [
- f"time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts",
+ f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
  f"count(*) as count",
- ] + escaped_segmentation_cols
- all_group_by_cols = ["ts"] + escaped_segmentation_cols
+ ] + segmentation_cols
+ all_group_by_cols = ["ts"] + segmentation_cols

  # build query
  count_query = f"""
  select {", ".join(all_select_clause_cols)}
  from {dataset.dataset_table_name}
- where {escaped_nullable_col} is null
+ where {nullable_col} is null
  group by {", ".join(all_group_by_cols)}
  """

  results = ddb_conn.sql(count_query).df()

+ unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
  series = self.group_query_results_to_numeric_metrics(
  results,
  "count",
- segmentation_cols,
+ unescaped_segmentation_cols,
  "ts",
  )
  # preserve dimension that identifies the name of the nullable column used for the aggregation
  for point in series:
- point.dimensions.append(Dimension(name="column_name", value=nullable_col))
+ point.dimensions.append(Dimension(name="column_name", value=unescape_identifier(nullable_col)))

  metric = self.series_to_metric(self.METRIC_NAME, series)
  return [metric]

{arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/mean_absolute_error.py
@@ -19,7 +19,7 @@ from arthur_common.models.schema_definitions import (
  ScalarType,
  ScopeSchemaTag,
  )
- from arthur_common.tools.duckdb_data_loader import escape_identifier
+ from arthur_common.tools.duckdb_data_loader import unescape_identifier


  class MeanAbsoluteErrorAggregationFunction(NumericAggregationFunction):
@@ -111,50 +111,45 @@ class MeanAbsoluteErrorAggregationFunction(NumericAggregationFunction):
  ] = None,
  ) -> list[NumericMetric]:
  """Executed SQL with no segmentation columns:
- SELECT time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts, \
- SUM(ABS({escaped_prediction_col} - {escaped_ground_truth_col})) as ae, \
+ SELECT time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts, \
+ SUM(ABS({prediction_col} - {ground_truth_col})) as ae, \
  COUNT(*) as count \
  FROM {dataset.dataset_table_name} \
- WHERE {escaped_prediction_col} IS NOT NULL \
- AND {escaped_ground_truth_col} IS NOT NULL \
+ WHERE {prediction_col} IS NOT NULL \
+ AND {ground_truth_col} IS NOT NULL \
  GROUP BY ts order by ts desc \
  """
  segmentation_cols = [] if not segmentation_cols else segmentation_cols
- escaped_timestamp_col = escape_identifier(timestamp_col)
- escaped_prediction_col = escape_identifier(prediction_col)
- escaped_ground_truth_col = escape_identifier(ground_truth_col)

  # build query components with segmentation columns
- escaped_segmentation_cols = [
- escape_identifier(col) for col in segmentation_cols
- ]
  all_select_clause_cols = [
- f"time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts",
- f"SUM(ABS({escaped_prediction_col} - {escaped_ground_truth_col})) as ae",
+ f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
+ f"SUM(ABS({prediction_col} - {ground_truth_col})) as ae",
  f"COUNT(*) as count",
- ] + escaped_segmentation_cols
- all_group_by_cols = ["ts"] + escaped_segmentation_cols
+ ] + segmentation_cols
+ all_group_by_cols = ["ts"] + segmentation_cols

  # build query
  mae_query = f"""
  SELECT {", ".join(all_select_clause_cols)}
  FROM {dataset.dataset_table_name}
- WHERE {escaped_prediction_col} IS NOT NULL
- AND {escaped_ground_truth_col} IS NOT NULL
+ WHERE {prediction_col} IS NOT NULL
+ AND {ground_truth_col} IS NOT NULL
  GROUP BY {", ".join(all_group_by_cols)} order by ts desc
  """

  results = ddb_conn.sql(mae_query).df()
+ unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
  count_series = self.group_query_results_to_numeric_metrics(
  results,
  "count",
- segmentation_cols,
+ unescaped_segmentation_cols,
  "ts",
  )
  absolute_error_series = self.group_query_results_to_numeric_metrics(
  results,
  "ae",
- segmentation_cols,
+ unescaped_segmentation_cols,
  "ts",
  )


{arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/mean_squared_error.py
@@ -19,7 +19,7 @@ from arthur_common.models.schema_definitions import (
  ScalarType,
  ScopeSchemaTag,
  )
- from arthur_common.tools.duckdb_data_loader import escape_identifier
+ from arthur_common.tools.duckdb_data_loader import unescape_identifier


  class MeanSquaredErrorAggregationFunction(NumericAggregationFunction):
@@ -111,50 +111,45 @@ class MeanSquaredErrorAggregationFunction(NumericAggregationFunction):
  ] = None,
  ) -> list[NumericMetric]:
  """Executed SQL with no segmentation columns:
- SELECT time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts, \
- SUM(POW({escaped_prediction_col} - {escaped_ground_truth_col}, 2)) as squared_error, \
+ SELECT time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts, \
+ SUM(POW({prediction_col} - {ground_truth_col}, 2)) as squared_error, \
  COUNT(*) as count \
  FROM {dataset.dataset_table_name} \
- WHERE {escaped_prediction_col} IS NOT NULL \
- AND {escaped_ground_truth_col} IS NOT NULL \
+ WHERE {prediction_col} IS NOT NULL \
+ AND {ground_truth_col} IS NOT NULL \
  GROUP BY ts order by ts desc \
  """
  segmentation_cols = [] if not segmentation_cols else segmentation_cols
- escaped_timestamp_col = escape_identifier(timestamp_col)
- escaped_prediction_col = escape_identifier(prediction_col)
- escaped_ground_truth_col = escape_identifier(ground_truth_col)

  # build query components with segmentation columns
- escaped_segmentation_cols = [
- escape_identifier(col) for col in segmentation_cols
- ]
  all_select_clause_cols = [
- f"time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts",
- f"SUM(POW({escaped_prediction_col} - {escaped_ground_truth_col}, 2)) as squared_error",
+ f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
+ f"SUM(POW({prediction_col} - {ground_truth_col}, 2)) as squared_error",
  f"COUNT(*) as count",
- ] + escaped_segmentation_cols
- all_group_by_cols = ["ts"] + escaped_segmentation_cols
+ ] + segmentation_cols
+ all_group_by_cols = ["ts"] + segmentation_cols

  # build query
  mse_query = f"""
  SELECT {", ".join(all_select_clause_cols)}
  FROM {dataset.dataset_table_name}
- WHERE {escaped_prediction_col} IS NOT NULL
- AND {escaped_ground_truth_col} IS NOT NULL
+ WHERE {prediction_col} IS NOT NULL
+ AND {ground_truth_col} IS NOT NULL
  GROUP BY {", ".join(all_group_by_cols)} order by ts desc
  """

  results = ddb_conn.sql(mse_query).df()
+ unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
  count_series = self.group_query_results_to_numeric_metrics(
  results,
  "count",
- segmentation_cols,
+ unescaped_segmentation_cols,
  "ts",
  )
  squared_error_series = self.group_query_results_to_numeric_metrics(
  results,
  "squared_error",
- segmentation_cols,
+ unescaped_segmentation_cols,
  "ts",
  )


{arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/multiclass_confusion_matrix.py
@@ -20,7 +20,8 @@ from arthur_common.models.schema_definitions import (
  ScalarType,
  ScopeSchemaTag,
  )
- from arthur_common.tools.duckdb_data_loader import escape_identifier, escape_str_literal
+
+ from arthur_common.tools.duckdb_data_loader import escape_str_literal, unescape_identifier


  class MulticlassClassifierStringLabelSingleClassConfusionMatrixAggregationFunction(
@@ -194,11 +195,11 @@ class MulticlassClassifierStringLabelSingleClassConfusionMatrixAggregationFuncti
  Returns the following SQL with no segmentation:
  WITH normalized_data AS (
  SELECT
- {escaped_timestamp_col} AS timestamp,
- {prediction_normalization_case.replace('value', escaped_prediction_col)} AS prediction,
- {gt_normalization_case.replace('value', escaped_gt_values_col)} AS actual_value
+ {timestamp_col} AS timestamp,
+ {prediction_normalization_case.replace('value', prediction_col)} AS prediction,
+ {gt_normalization_case.replace('value', gt_values_col)} AS actual_value
  FROM {dataset.dataset_table_name}
- WHERE {escaped_timestamp_col} IS NOT NULL
+ WHERE {timestamp_col} IS NOT NULL
  )
  SELECT
  time_bucket(INTERVAL '5 minutes', timestamp) AS ts,
@@ -212,19 +213,12 @@ class MulticlassClassifierStringLabelSingleClassConfusionMatrixAggregationFuncti
  ORDER BY ts

  """
- escaped_timestamp_col = escape_identifier(timestamp_col)
- escaped_prediction_col = escape_identifier(prediction_col)
- escaped_gt_values_col = escape_identifier(gt_values_col)
-
  # build query components with segmentation columns
- escaped_segmentation_cols = [
- escape_identifier(col) for col in segmentation_cols
- ]
  first_subquery_select_cols = [
- f"{escaped_timestamp_col} AS timestamp",
- f"{prediction_normalization_case.replace('value', escaped_prediction_col)} AS prediction",
- f"{gt_normalization_case.replace('value', escaped_gt_values_col)} AS actual_value",
- ] + escaped_segmentation_cols
+ f"{timestamp_col} AS timestamp",
+ f"{prediction_normalization_case.replace('value', prediction_col)} AS prediction",
+ f"{gt_normalization_case.replace('value', gt_values_col)} AS actual_value",
+ ] + segmentation_cols
  second_subquery_select_cols = [
  "time_bucket(INTERVAL '5 minutes', timestamp) AS ts",
  "SUM(CASE WHEN prediction = 1 AND actual_value = 1 THEN 1 ELSE 0 END) AS true_positive_count",
@@ -232,8 +226,8 @@ class MulticlassClassifierStringLabelSingleClassConfusionMatrixAggregationFuncti
  "SUM(CASE WHEN prediction = 0 AND actual_value = 1 THEN 1 ELSE 0 END) AS false_negative_count",
  "SUM(CASE WHEN prediction = 0 AND actual_value = 0 THEN 1 ELSE 0 END) AS true_negative_count",
  f"any_value({escaped_positive_class_label}) as class_label",
- ] + escaped_segmentation_cols
- second_subquery_group_by_cols = ["ts"] + escaped_segmentation_cols
+ ] + segmentation_cols
+ second_subquery_group_by_cols = ["ts"] + segmentation_cols
  extra_dims = ["class_label"]

  # build query
@@ -241,7 +235,7 @@ class MulticlassClassifierStringLabelSingleClassConfusionMatrixAggregationFuncti
  WITH normalized_data AS (
  SELECT {", ".join(first_subquery_select_cols)}
  FROM {dataset.dataset_table_name}
- WHERE {escaped_timestamp_col} IS NOT NULL
+ WHERE {timestamp_col} IS NOT NULL
  )
  SELECT {", ".join(second_subquery_select_cols)}
  FROM normalized_data
@@ -250,29 +244,30 @@ class MulticlassClassifierStringLabelSingleClassConfusionMatrixAggregationFuncti
  """

  results = ddb_conn.sql(confusion_matrix_query).df()
+ unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]

  tp = self.group_query_results_to_numeric_metrics(
  results,
  "true_positive_count",
- dim_columns=segmentation_cols + extra_dims,
+ dim_columns=unescaped_segmentation_cols + extra_dims,
  timestamp_col="ts",
  )
  fp = self.group_query_results_to_numeric_metrics(
  results,
  "false_positive_count",
- dim_columns=segmentation_cols + extra_dims,
+ dim_columns=unescaped_segmentation_cols + extra_dims,
  timestamp_col="ts",
  )
  fn = self.group_query_results_to_numeric_metrics(
  results,
  "false_negative_count",
- dim_columns=segmentation_cols + extra_dims,
+ dim_columns=unescaped_segmentation_cols + extra_dims,
  timestamp_col="ts",
  )
  tn = self.group_query_results_to_numeric_metrics(
  results,
  "true_negative_count",
- dim_columns=segmentation_cols + extra_dims,
+ dim_columns=unescaped_segmentation_cols + extra_dims,
  timestamp_col="ts",
  )
  tp_metric = self.series_to_metric(

{arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/numeric_stats.py
@@ -18,7 +18,8 @@ from arthur_common.models.schema_definitions import (
  ScalarType,
  ScopeSchemaTag,
  )
- from arthur_common.tools.duckdb_data_loader import escape_identifier, escape_str_literal
+
+ from arthur_common.tools.duckdb_data_loader import unescape_identifier, escape_str_literal


  class NumericSketchAggregationFunction(SketchAggregationFunction):
@@ -95,41 +96,37 @@ class NumericSketchAggregationFunction(SketchAggregationFunction):
  ] = None,
  ) -> list[SketchMetric]:
  """Executed SQL with no segmentation columns:
- select {escaped_timestamp_col_id} as ts, \
- {escaped_numeric_col_id}, \
+ select {timestamp_col} as ts, \
+ {numeric_col}, \
  {numeric_col_name_str} as column_name \
  from {dataset.dataset_table_name} \
- where {escaped_numeric_col_id} is not null \
+ where {numeric_col} is not null \
  """
  segmentation_cols = [] if not segmentation_cols else segmentation_cols
- escaped_timestamp_col_id = escape_identifier(timestamp_col)
- escaped_numeric_col_id = escape_identifier(numeric_col)
- numeric_col_name_str = escape_str_literal(numeric_col)
+ numeric_col_name_str = escape_str_literal(unescape_identifier(numeric_col))

  # build query components with segmentation columns
- escaped_segmentation_cols = [
- escape_identifier(col) for col in segmentation_cols
- ]
  all_select_clause_cols = [
- f"{escaped_timestamp_col_id} as ts",
- f"{escaped_numeric_col_id}",
+ f"{timestamp_col} as ts",
+ f"{numeric_col}",
  f"{numeric_col_name_str} as column_name",
- ] + escaped_segmentation_cols
+ ] + segmentation_cols
  extra_dims = ["column_name"]

  # build query
  data_query = f"""
  select {", ".join(all_select_clause_cols)}
  from {dataset.dataset_table_name}
- where {escaped_numeric_col_id} is not null
+ where {numeric_col} is not null
  """

  results = ddb_conn.sql(data_query).df()
+ unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]

  series = self.group_query_results_to_sketch_metrics(
  results,
- numeric_col,
- segmentation_cols + extra_dims,
+ unescape_identifier(numeric_col),
+ unescaped_segmentation_cols + extra_dims,
  "ts",
  )


{arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/aggregations/functions/numeric_sum.py
@@ -19,7 +19,7 @@ from arthur_common.models.schema_definitions import (
  ScalarType,
  ScopeSchemaTag,
  )
- from arthur_common.tools.duckdb_data_loader import escape_identifier
+ from arthur_common.tools.duckdb_data_loader import unescape_identifier


  class NumericSumAggregationFunction(NumericAggregationFunction):
@@ -94,45 +94,41 @@ class NumericSumAggregationFunction(NumericAggregationFunction):
  ] = None,
  ) -> list[NumericMetric]:
  """Executed SQL with no segmentation columns:
- select time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts, \
- sum({escaped_numeric_col}) as sum \
+ select time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts, \
+ sum({numeric_col}) as sum \
  from {dataset.dataset_table_name} \
- where {escaped_numeric_col} is not null \
+ where {numeric_col} is not null \
  group by ts \
  """
  segmentation_cols = [] if not segmentation_cols else segmentation_cols
- escaped_timestamp_col = escape_identifier(timestamp_col)
- escaped_numeric_col = escape_identifier(numeric_col)

  # build query components with segmentation columns
- escaped_segmentation_cols = [
- escape_identifier(col) for col in segmentation_cols
- ]
  all_select_clause_cols = [
- f"time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts",
- f"sum({escaped_numeric_col}) as sum",
- ] + escaped_segmentation_cols
- all_group_by_cols = ["ts"] + escaped_segmentation_cols
+ f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
+ f"sum({numeric_col}) as sum",
+ ] + segmentation_cols
+ all_group_by_cols = ["ts"] + segmentation_cols

  # build query
  query = f"""
  select {", ".join(all_select_clause_cols)}
  from {dataset.dataset_table_name}
- where {escaped_numeric_col} is not null
+ where {numeric_col} is not null
  group by {", ".join(all_group_by_cols)}
  """

  results = ddb_conn.sql(query).df()
+ unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]

  series = self.group_query_results_to_numeric_metrics(
  results,
  "sum",
- segmentation_cols,
+ unescaped_segmentation_cols,
  "ts",
  )
  # preserve dimension that identifies the name of the numeric column used for the aggregation
  for point in series:
- point.dimensions.append(Dimension(name="column_name", value=numeric_col))
+ point.dimensions.append(Dimension(name="column_name", value=unescape_identifier(numeric_col)))

  metric = self.series_to_metric(self.METRIC_NAME, series)
  return [metric]

{arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/models/metrics.py
@@ -122,20 +122,6 @@ class BaseAggregationParameterSchema(BaseModel):
  description="Description of the parameter.",
  )

- @field_validator("parameter_key")
- @classmethod
- def validate_parameter_key_allowed_characters(cls, v: str) -> str:
- if not v.replace("_", "").isalpha():
- raise ValueError("Parameter key can only contain letters and underscores.")
- return v
-
- @field_validator("friendly_name")
- @classmethod
- def validate_friendly_name_allowed_characters(cls, v: str) -> str:
- if not v.replace("_", "").replace(" ", "").isalpha():
- raise ValueError("Friendly name can only contain letters and underscores.")
- return v
-

  class MetricsParameterSchema(BaseAggregationParameterSchema):
  # specific to default metrics/Python metrics—not available to custom aggregations
@@ -309,10 +295,3 @@ class ReportedCustomAggregation(BaseReportedAggregation):
  dimension_columns: list[str] = Field(
  description="Name of any dimension columns returned from the SQL query. Max length is 1.",
  )
-
- @field_validator("dimension_columns")
- @classmethod
- def validate_dimension_columns_length(cls, v: list[str]) -> list[str]:
- if len(v) > 1:
- raise ValueError("Only one dimension column can be specified.")
- return v

{arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/tools/duckdb_data_loader.py
@@ -1,4 +1,5 @@
  import json
+ import re
  from typing import Any

  import duckdb
@@ -314,6 +315,9 @@ def escape_identifier(identifier: str) -> str:
  """
  Escape an identifier (e.g., column name) for use in a SQL query.
  This method handles special characters and ensures proper quoting.
+
+ For struct fields, the identifiers must be escaped as following:
+ "struct_column_name"."struct_field"
  """
  # Replace any double quotes with two double quotes
  escaped = identifier.replace('"', '""')
@@ -321,6 +325,32 @@ def escape_identifier(identifier: str) -> str:
  return f'"{escaped}"'


+ def unescape_identifier(identifier: str) -> str:
+ """
+ Unescape an identifier (e.g., column name).
+
+ This removes the double quotes and properly handles struct fields, which may be escaped as follows:
+ "struct_column_name"."struct_field"
+
+ Here's a hard case for help understanding this function: "struct "" column name with quotes"."struct.field.name.with.dots"
+ """
+ unescaped_identifiers = []
+ # strip top-level quotes
+ identifier = identifier[1:-1]
+ # split identifier into struct fields based on delimiter pattern "."
+ # at this point there are no external double quotes left; any remaining are escaped double quotes belonging to
+ # the column name
+ identifier_split_in_struct_fields = re.split(r'"\."', identifier)
+
+ for identifier in identifier_split_in_struct_fields:
+ # replace any escaped double quotes in the column
+ unescaped_identifier = identifier.replace('""', '"')
+ unescaped_identifiers.append(unescaped_identifier)
+
+ # join back any struct fields via dot syntax without the escape identifiers
+ return ".".join(unescaped_identifiers)
+
+
  def escape_str_literal(literal: str) -> str:
  """
  Escape a duckDB string literal for use in a SQL query.
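
The new unescape_identifier is the inverse of escape_identifier, including the struct-field form called out in the docstrings above. Below is a minimal, self-contained sketch of that round trip; the helper bodies are re-derived from the diff for illustration rather than imported from the package, and the example column names are hypothetical (the "hard case" string is the one from the docstring):

import re

def escape_identifier(identifier: str) -> str:
    # double any embedded double quotes, then wrap the whole identifier in double quotes
    return '"' + identifier.replace('"', '""') + '"'

def unescape_identifier(identifier: str) -> str:
    # strip the outer quotes, split escaped struct fields on the quote-dot-quote delimiter,
    # then undo the quote doubling in each part and rejoin the parts with dots
    inner = identifier[1:-1]
    parts = re.split(r'"\."', inner)
    return ".".join(part.replace('""', '"') for part in parts)

# plain column name: escaping then unescaping returns the original name
assert unescape_identifier(escape_identifier('my "quoted" col')) == 'my "quoted" col'

# struct field: each part is escaped separately and joined with a dot
escaped = escape_identifier('struct " column name with quotes') + "." + escape_identifier("struct.field.name.with.dots")
print(escaped)                       # "struct "" column name with quotes"."struct.field.name.with.dots"
print(unescape_identifier(escaped))  # struct " column name with quotes.struct.field.name.with.dots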

{arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/tools/duckdb_utils.py
@@ -16,17 +16,15 @@ def is_column_possible_segmentation(
  2. Has an allowed DType.

  PreReq: Table with column should already be loaded in DuckDB
+ column_name already has DuckDB escape identifier for the query syntax
  """
  segmentation_col_unique_val_limit = Config.segmentation_col_unique_values_limit()
  if column_dtype not in SEGMENTATION_ALLOWED_DTYPES:
  return False

- # check column for unique value count
- escaped_column = escape_identifier(column_name)
-
- # count distinct values in this column
+ # check column for unique value count - count distinct values in this column
  distinct_count_query = f"""
- SELECT COUNT(DISTINCT {escaped_column}) as distinct_count
+ SELECT COUNT(DISTINCT {column_name}) as distinct_count
  FROM {table}
  """
  result = conn.sql(distinct_count_query).fetchone()

{arthur_common-2.2.0 → arthur_common-2.4.0}/src/arthur_common/tools/schema_inferer.py
@@ -110,7 +110,7 @@ class SchemaInferer:
  if not is_nested_col and is_column_possible_segmentation(
  self.conn,
  table,
- col_name,
+ escape_identifier(col_name),
  scalar_schema.dtype,
  ):
  scalar_schema.tag_hints.append(ScopeSchemaTag.POSSIBLE_SEGMENTATION)
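
Taken together, the last two changes move escaping to the boundary: SchemaInferer escapes the column name once before calling is_column_possible_segmentation, the aggregation functions above receive identifiers that are already escaped and splice them straight into SQL, and unescape_identifier is applied only when a name is emitted as a human-readable dimension value. A minimal sketch of that convention, assuming arthur-common 2.4.0 is installed; the column and table names are hypothetical:

from arthur_common.tools.duckdb_data_loader import escape_identifier, unescape_identifier

raw_col = 'user "tier"'                    # hypothetical column name with awkward characters
escaped_col = escape_identifier(raw_col)   # '"user ""tier"""', safe to splice into SQL
query = f"SELECT COUNT(DISTINCT {escaped_col}) AS distinct_count FROM my_table"

# when reporting the column back as a metric dimension, recover the readable name
dimension_value = unescape_identifier(escaped_col)   # 'user "tier"'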

The remaining files (17-49 in the list above) are unchanged between 2.2.0 and 2.4.0.