arthur-common 2.1.68__py3-none-any.whl → 2.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -18,7 +18,8 @@ from arthur_common.models.schema_definitions import (
  ScalarType,
  ScopeSchemaTag,
  )
- from arthur_common.tools.duckdb_data_loader import escape_identifier, escape_str_literal
+
+ from arthur_common.tools.duckdb_data_loader import unescape_identifier, escape_str_literal


  class CategoricalCountAggregationFunction(NumericAggregationFunction):
@@ -93,30 +94,25 @@ class CategoricalCountAggregationFunction(NumericAggregationFunction):
  ] = None,
  ) -> list[NumericMetric]:
  """Executed SQL with no segmentation columns:
- select time_bucket(INTERVAL '5 minutes', {timestamp_col_escaped}) as ts, \
+ select time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts, \
  count(*) as count, \
- {categorical_col_escaped} as category, \
- {categorical_col_name_escaped} as column_name \
+ {categorical_col} as category, \
+ {categorical_col_name_unescaped} as column_name \
  from {dataset.dataset_table_name} \
  where ts is not null \
  group by ts, category
  """
  segmentation_cols = [] if not segmentation_cols else segmentation_cols
- timestamp_col_escaped = escape_identifier(timestamp_col)
- categorical_col_escaped = escape_identifier(categorical_col)
- categorical_col_name_escaped = escape_str_literal(categorical_col)
+ categorical_col_name_unescaped = escape_str_literal(unescape_identifier(categorical_col))

  # build query components with segmentation columns
- escaped_segmentation_cols = [
- escape_identifier(col) for col in segmentation_cols
- ]
  all_select_clause_cols = [
- f"time_bucket(INTERVAL '5 minutes', {timestamp_col_escaped}) as ts",
+ f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
  f"count(*) as count",
- f"{categorical_col_escaped} as category",
- f"{categorical_col_name_escaped} as column_name",
- ] + escaped_segmentation_cols
- all_group_by_cols = ["ts", "category"] + escaped_segmentation_cols
+ f"{categorical_col} as category",
+ f"{categorical_col_name_unescaped} as column_name",
+ ] + segmentation_cols
+ all_group_by_cols = ["ts", "category"] + segmentation_cols
  extra_dims = ["column_name", "category"]

  # build query
@@ -129,10 +125,11 @@ class CategoricalCountAggregationFunction(NumericAggregationFunction):

  results = ddb_conn.sql(count_query).df()

+ unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
  series = self.group_query_results_to_numeric_metrics(
  results,
  "count",
- segmentation_cols + extra_dims,
+ unescaped_segmentation_cols + extra_dims,
  timestamp_col="ts",
  )
  metric = self.series_to_metric(self.METRIC_NAME, series)
@@ -20,7 +20,8 @@ from arthur_common.models.schema_definitions import (
  ScalarType,
  ScopeSchemaTag,
  )
- from arthur_common.tools.duckdb_data_loader import escape_identifier, escape_str_literal
+
+ from arthur_common.tools.duckdb_data_loader import unescape_identifier, escape_str_literal


  class ConfusionMatrixAggregationFunction(NumericAggregationFunction):
@@ -78,11 +79,11 @@ class ConfusionMatrixAggregationFunction(NumericAggregationFunction):
  Without segmentation, this is the query:
  WITH normalized_data AS (
  SELECT
- {escaped_timestamp_col} AS timestamp,
- {prediction_normalization_case.replace('value', escaped_prediction_col)} AS prediction,
- {gt_normalization_case.replace('value', escaped_gt_values_col)} AS actual_value
+ {timestamp_col} AS timestamp,
+ {prediction_normalization_case.replace('value', prediction_col)} AS prediction,
+ {gt_normalization_case.replace('value', gt_values_col)} AS actual_value
  FROM {dataset.dataset_table_name}
- WHERE {escaped_timestamp_col} IS NOT NULL
+ WHERE {timestamp_col} IS NOT NULL
  )
  SELECT
  time_bucket(INTERVAL '5 minutes', timestamp) AS ts,
@@ -90,34 +91,29 @@ class ConfusionMatrixAggregationFunction(NumericAggregationFunction):
  SUM(CASE WHEN prediction != actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS false_positive_count,
  SUM(CASE WHEN prediction != actual_value AND actual_value = 1 THEN 1 ELSE 0 END) AS false_negative_count,
  SUM(CASE WHEN prediction = actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS true_negative_count,
- {escaped_prediction_col_name} as prediction_column_name
+ {unescaped_prediction_col_name} as prediction_column_name
  FROM normalized_data
  GROUP BY ts
  ORDER BY ts
  """
  segmentation_cols = [] if not segmentation_cols else segmentation_cols
- escaped_timestamp_col = escape_identifier(timestamp_col)
- escaped_prediction_col = escape_identifier(prediction_col)
- escaped_prediction_col_name = escape_str_literal(prediction_col)
- escaped_gt_values_col = escape_identifier(gt_values_col)
+ unescaped_prediction_col_name = escape_str_literal(unescape_identifier(prediction_col))
+
  # build query components with segmentation columns
- escaped_segmentation_cols = [
- escape_identifier(col) for col in segmentation_cols
- ]
  first_subquery_select_cols = [
- f"{escaped_timestamp_col} AS timestamp",
- f"{prediction_normalization_case.replace('value', escaped_prediction_col)} AS prediction",
- f"{gt_normalization_case.replace('value', escaped_gt_values_col)} AS actual_value",
- ] + escaped_segmentation_cols
+ f"{timestamp_col} AS timestamp",
+ f"{prediction_normalization_case.replace('value', prediction_col)} AS prediction",
+ f"{gt_normalization_case.replace('value', gt_values_col)} AS actual_value",
+ ] + segmentation_cols
  second_subquery_select_cols = [
  "time_bucket(INTERVAL '5 minutes', timestamp) AS ts",
  "SUM(CASE WHEN prediction = actual_value AND actual_value = 1 THEN 1 ELSE 0 END) AS true_positive_count",
  "SUM(CASE WHEN prediction != actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS false_positive_count",
  "SUM(CASE WHEN prediction != actual_value AND actual_value = 1 THEN 1 ELSE 0 END) AS false_negative_count",
  "SUM(CASE WHEN prediction = actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS true_negative_count",
- f"{escaped_prediction_col_name} as prediction_column_name",
- ] + escaped_segmentation_cols
- second_subquery_group_by_cols = ["ts"] + escaped_segmentation_cols
+ f"{unescaped_prediction_col_name} as prediction_column_name",
+ ] + segmentation_cols
+ second_subquery_group_by_cols = ["ts"] + segmentation_cols
  extra_dims = ["prediction_column_name"]

  # build query
@@ -125,7 +121,7 @@ class ConfusionMatrixAggregationFunction(NumericAggregationFunction):
  WITH normalized_data AS (
  SELECT {", ".join(first_subquery_select_cols)}
  FROM {dataset.dataset_table_name}
- WHERE {escaped_timestamp_col} IS NOT NULL
+ WHERE {timestamp_col} IS NOT NULL
  )
  SELECT {", ".join(second_subquery_select_cols)}
  FROM normalized_data
@@ -135,28 +131,29 @@ class ConfusionMatrixAggregationFunction(NumericAggregationFunction):

  results = ddb_conn.sql(confusion_matrix_query).df()

+ unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
  tp = self.group_query_results_to_numeric_metrics(
  results,
  "true_positive_count",
- dim_columns=segmentation_cols + extra_dims,
+ dim_columns=unescaped_segmentation_cols + extra_dims,
  timestamp_col="ts",
  )
  fp = self.group_query_results_to_numeric_metrics(
  results,
  "false_positive_count",
- dim_columns=segmentation_cols + extra_dims,
+ dim_columns=unescaped_segmentation_cols + extra_dims,
  timestamp_col="ts",
  )
  fn = self.group_query_results_to_numeric_metrics(
  results,
  "false_negative_count",
- dim_columns=segmentation_cols + extra_dims,
+ dim_columns=unescaped_segmentation_cols + extra_dims,
  timestamp_col="ts",
  )
  tn = self.group_query_results_to_numeric_metrics(
  results,
  "true_negative_count",
- dim_columns=segmentation_cols + extra_dims,
+ dim_columns=unescaped_segmentation_cols + extra_dims,
  timestamp_col="ts",
  )
  tp_metric = self.series_to_metric(self.TRUE_POSITIVE_METRIC_NAME, tp)
@@ -243,9 +240,8 @@ class BinaryClassifierIntBoolConfusionMatrixAggregationFunction(
  ] = None,
  ) -> list[NumericMetric]:
  segmentation_cols = [] if not segmentation_cols else segmentation_cols
- escaped_prediction_col = escape_identifier(prediction_col)
  # Get the type of prediction column
- type_query = f"SELECT typeof({escaped_prediction_col}) as col_type FROM {dataset.dataset_table_name} LIMIT 1"
+ type_query = f"SELECT typeof({prediction_col}) as col_type FROM {dataset.dataset_table_name} LIMIT 1"
  res = ddb_conn.sql(type_query).fetchone()
  # As long as this column exists, we should be able to get the type. This is here to make mypy happy.
  if not res:
@@ -476,7 +472,6 @@ class BinaryClassifierProbabilityThresholdConfusionMatrixAggregationFunction(
  ),
  ] = None,
  ) -> list[NumericMetric]:
- escaped_gt_values_col = escape_identifier(gt_values_col)
  prediction_normalization_case = f"""
  CASE
  WHEN value >= {threshold} THEN 1
@@ -485,7 +480,7 @@ class BinaryClassifierProbabilityThresholdConfusionMatrixAggregationFunction(
  END
  """

- type_query = f"SELECT typeof({escaped_gt_values_col}) as col_type FROM {dataset.dataset_table_name} LIMIT 1"
+ type_query = f"SELECT typeof({gt_values_col}) as col_type FROM {dataset.dataset_table_name} LIMIT 1"
  res = ddb_conn.sql(type_query).fetchone()
  # As long as this column exists, we should be able to get the type. This is here to make mypy happy.
  if not res:
@@ -18,7 +18,7 @@ from arthur_common.models.schema_definitions import (
  ScalarType,
  ScopeSchemaTag,
  )
- from arthur_common.tools.duckdb_data_loader import escape_identifier
+ from arthur_common.tools.duckdb_data_loader import unescape_identifier


  class InferenceCountAggregationFunction(NumericAggregationFunction):
@@ -80,23 +80,19 @@ class InferenceCountAggregationFunction(NumericAggregationFunction):
  ] = None,
  ) -> list[NumericMetric]:
  """Executed SQL with no segmentation columns:
- select time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts, \
+ select time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts, \
  count(*) as count \
  from {dataset.dataset_table_name} \
  group by ts \
  """
  segmentation_cols = [] if not segmentation_cols else segmentation_cols
- escaped_timestamp_col = escape_identifier(timestamp_col)

  # build query components with segmentation columns
- escaped_segmentation_cols = [
- escape_identifier(col) for col in segmentation_cols
- ]
  all_select_clause_cols = [
- f"time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts",
+ f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
  f"count(*) as count",
- ] + escaped_segmentation_cols
- all_group_by_cols = ["ts"] + escaped_segmentation_cols
+ ] + segmentation_cols
+ all_group_by_cols = ["ts"] + segmentation_cols

  # build query
  count_query = f"""
@@ -106,10 +102,11 @@ class InferenceCountAggregationFunction(NumericAggregationFunction):
  """

  results = ddb_conn.sql(count_query).df()
+ unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
  series = self.group_query_results_to_numeric_metrics(
  results,
  "count",
- segmentation_cols,
+ unescaped_segmentation_cols,
  "ts",
  )
  metric = self.series_to_metric(self.METRIC_NAME, series)
@@ -20,7 +20,7 @@ from arthur_common.models.schema_definitions import (
  ScalarType,
  ScopeSchemaTag,
  )
- from arthur_common.tools.duckdb_data_loader import escape_identifier
+ from arthur_common.tools.duckdb_data_loader import unescape_identifier


  class BinaryClassifierCountByClassAggregationFunction(NumericAggregationFunction):
@@ -100,31 +100,26 @@ class BinaryClassifierCountByClassAggregationFunction(NumericAggregationFunction
  ) -> list[NumericMetric]:
  """Executed SQL with no segmentation columns:
  SELECT
- time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts,
- {escaped_pred_col} as prediction,
+ time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts,
+ {prediction_col} as prediction,
  COUNT(*) as count
  FROM {dataset.dataset_table_name}
  GROUP BY
  ts,
  -- group by raw column name instead of alias in select
  -- in case table has a column called 'prediction'
- {escaped_pred_col}
+ {prediction_col}
  ORDER BY ts
  """
  segmentation_cols = [] if not segmentation_cols else segmentation_cols
- escaped_timestamp_col = escape_identifier(timestamp_col)
- escaped_pred_col = escape_identifier(prediction_col)

  # build query components with segmentation columns
- escaped_segmentation_cols = [
- escape_identifier(col) for col in segmentation_cols
- ]
  all_select_clause_cols = [
- f"time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts",
- f"{escaped_pred_col} as prediction",
+ f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
+ f"{prediction_col} as prediction",
  f"COUNT(*) as count",
- ] + escaped_segmentation_cols
- all_group_by_cols = ["ts", f"{escaped_pred_col}"] + escaped_segmentation_cols
+ ] + segmentation_cols
+ all_group_by_cols = ["ts", f"{prediction_col}"] + segmentation_cols
  extra_dims = ["prediction"]

  # build query
@@ -137,10 +132,11 @@ class BinaryClassifierCountByClassAggregationFunction(NumericAggregationFunction

  result = ddb_conn.sql(query).df()

+ unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
  series = self.group_query_results_to_numeric_metrics(
  result,
  "count",
- segmentation_cols + extra_dims,
+ unescaped_segmentation_cols + extra_dims,
  "ts",
  )
  metric = self.series_to_metric(self._metric_name(), series)
@@ -248,34 +244,29 @@ class BinaryClassifierCountThresholdClassAggregationFunction(
  ) -> list[NumericMetric]:
  """Executed SQL with no segmentation columns:
  SELECT
- time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts,
- CASE WHEN {escaped_prediction_col} >= {threshold} THEN '{true_label}' ELSE '{false_label}' END as prediction,
+ time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts,
+ CASE WHEN {prediction_col} >= {threshold} THEN '{true_label}' ELSE '{false_label}' END as prediction,
  COUNT(*) as count
  FROM {dataset.dataset_table_name}
  GROUP BY
  ts,
  -- group by raw column name instead of alias in select
  -- in case table has a column called 'prediction'
- {escaped_prediction_col}
+ {prediction_col}
  ORDER BY ts
  """
  segmentation_cols = [] if not segmentation_cols else segmentation_cols
- escaped_timestamp_col = escape_identifier(timestamp_col)
- escaped_prediction_col = escape_identifier(prediction_col)

  # build query components with segmentation columns
- escaped_segmentation_cols = [
- escape_identifier(col) for col in segmentation_cols
- ]
  all_select_clause_cols = [
- f"time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts",
- f"CASE WHEN {escaped_prediction_col} >= {threshold} THEN '{true_label}' ELSE '{false_label}' END as prediction",
+ f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
+ f"CASE WHEN {prediction_col} >= {threshold} THEN '{true_label}' ELSE '{false_label}' END as prediction",
  f"COUNT(*) as count",
- ] + escaped_segmentation_cols
+ ] + segmentation_cols
  all_group_by_cols = [
  "ts",
- f"{escaped_prediction_col}",
- ] + escaped_segmentation_cols
+ f"{prediction_col}",
+ ] + segmentation_cols
  extra_dims = ["prediction"]

  query = f"""
@@ -287,10 +278,11 @@ class BinaryClassifierCountThresholdClassAggregationFunction(

  result = ddb_conn.sql(query).df()

+ unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
  series = self.group_query_results_to_numeric_metrics(
  result,
  "count",
- segmentation_cols + extra_dims,
+ unescaped_segmentation_cols + extra_dims,
  "ts",
  )
  metric = self.series_to_metric(self._metric_name(), series)
@@ -19,7 +19,7 @@ from arthur_common.models.schema_definitions import (
  ScalarType,
  ScopeSchemaTag,
  )
- from arthur_common.tools.duckdb_data_loader import escape_identifier
+ from arthur_common.tools.duckdb_data_loader import unescape_identifier


  class InferenceNullCountAggregationFunction(NumericAggregationFunction):
@@ -90,44 +90,40 @@ class InferenceNullCountAggregationFunction(NumericAggregationFunction):
  ] = None,
  ) -> list[NumericMetric]:
  """Executed SQL with no segmentation columns:
- select time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts, \
+ select time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts, \
  count(*) as count \
- from {dataset.dataset_table_name} where {escaped_nullable_col} is null \
+ from {dataset.dataset_table_name} where {nullable_col} is null \
  group by ts \
  """
  segmentation_cols = [] if not segmentation_cols else segmentation_cols
- escaped_timestamp_col = escape_identifier(timestamp_col)
- escaped_nullable_col = escape_identifier(nullable_col)

  # build query components with segmentation columns
- escaped_segmentation_cols = [
- escape_identifier(col) for col in segmentation_cols
- ]
  all_select_clause_cols = [
- f"time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts",
+ f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
  f"count(*) as count",
- ] + escaped_segmentation_cols
- all_group_by_cols = ["ts"] + escaped_segmentation_cols
+ ] + segmentation_cols
+ all_group_by_cols = ["ts"] + segmentation_cols

  # build query
  count_query = f"""
  select {", ".join(all_select_clause_cols)}
  from {dataset.dataset_table_name}
- where {escaped_nullable_col} is null
+ where {nullable_col} is null
  group by {", ".join(all_group_by_cols)}
  """

  results = ddb_conn.sql(count_query).df()

+ unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
  series = self.group_query_results_to_numeric_metrics(
  results,
  "count",
- segmentation_cols,
+ unescaped_segmentation_cols,
  "ts",
  )
  # preserve dimension that identifies the name of the nullable column used for the aggregation
  for point in series:
- point.dimensions.append(Dimension(name="column_name", value=nullable_col))
+ point.dimensions.append(Dimension(name="column_name", value=unescape_identifier(nullable_col)))

  metric = self.series_to_metric(self.METRIC_NAME, series)
  return [metric]
@@ -19,7 +19,7 @@ from arthur_common.models.schema_definitions import (
  ScalarType,
  ScopeSchemaTag,
  )
- from arthur_common.tools.duckdb_data_loader import escape_identifier
+ from arthur_common.tools.duckdb_data_loader import unescape_identifier


  class MeanAbsoluteErrorAggregationFunction(NumericAggregationFunction):
@@ -111,50 +111,45 @@ class MeanAbsoluteErrorAggregationFunction(NumericAggregationFunction):
  ] = None,
  ) -> list[NumericMetric]:
  """Executed SQL with no segmentation columns:
- SELECT time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts, \
- SUM(ABS({escaped_prediction_col} - {escaped_ground_truth_col})) as ae, \
+ SELECT time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts, \
+ SUM(ABS({prediction_col} - {ground_truth_col})) as ae, \
  COUNT(*) as count \
  FROM {dataset.dataset_table_name} \
- WHERE {escaped_prediction_col} IS NOT NULL \
- AND {escaped_ground_truth_col} IS NOT NULL \
+ WHERE {prediction_col} IS NOT NULL \
+ AND {ground_truth_col} IS NOT NULL \
  GROUP BY ts order by ts desc \
  """
  segmentation_cols = [] if not segmentation_cols else segmentation_cols
- escaped_timestamp_col = escape_identifier(timestamp_col)
- escaped_prediction_col = escape_identifier(prediction_col)
- escaped_ground_truth_col = escape_identifier(ground_truth_col)

  # build query components with segmentation columns
- escaped_segmentation_cols = [
- escape_identifier(col) for col in segmentation_cols
- ]
  all_select_clause_cols = [
- f"time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts",
- f"SUM(ABS({escaped_prediction_col} - {escaped_ground_truth_col})) as ae",
+ f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
+ f"SUM(ABS({prediction_col} - {ground_truth_col})) as ae",
  f"COUNT(*) as count",
- ] + escaped_segmentation_cols
- all_group_by_cols = ["ts"] + escaped_segmentation_cols
+ ] + segmentation_cols
+ all_group_by_cols = ["ts"] + segmentation_cols

  # build query
  mae_query = f"""
  SELECT {", ".join(all_select_clause_cols)}
  FROM {dataset.dataset_table_name}
- WHERE {escaped_prediction_col} IS NOT NULL
- AND {escaped_ground_truth_col} IS NOT NULL
+ WHERE {prediction_col} IS NOT NULL
+ AND {ground_truth_col} IS NOT NULL
  GROUP BY {", ".join(all_group_by_cols)} order by ts desc
  """

  results = ddb_conn.sql(mae_query).df()
+ unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
  count_series = self.group_query_results_to_numeric_metrics(
  results,
  "count",
- segmentation_cols,
+ unescaped_segmentation_cols,
  "ts",
  )
  absolute_error_series = self.group_query_results_to_numeric_metrics(
  results,
  "ae",
- segmentation_cols,
+ unescaped_segmentation_cols,
  "ts",
  )

@@ -19,7 +19,7 @@ from arthur_common.models.schema_definitions import (
  ScalarType,
  ScopeSchemaTag,
  )
- from arthur_common.tools.duckdb_data_loader import escape_identifier
+ from arthur_common.tools.duckdb_data_loader import unescape_identifier


  class MeanSquaredErrorAggregationFunction(NumericAggregationFunction):
@@ -111,50 +111,45 @@ class MeanSquaredErrorAggregationFunction(NumericAggregationFunction):
  ] = None,
  ) -> list[NumericMetric]:
  """Executed SQL with no segmentation columns:
- SELECT time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts, \
- SUM(POW({escaped_prediction_col} - {escaped_ground_truth_col}, 2)) as squared_error, \
+ SELECT time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts, \
+ SUM(POW({prediction_col} - {ground_truth_col}, 2)) as squared_error, \
  COUNT(*) as count \
  FROM {dataset.dataset_table_name} \
- WHERE {escaped_prediction_col} IS NOT NULL \
- AND {escaped_ground_truth_col} IS NOT NULL \
+ WHERE {prediction_col} IS NOT NULL \
+ AND {ground_truth_col} IS NOT NULL \
  GROUP BY ts order by ts desc \
  """
  segmentation_cols = [] if not segmentation_cols else segmentation_cols
- escaped_timestamp_col = escape_identifier(timestamp_col)
- escaped_prediction_col = escape_identifier(prediction_col)
- escaped_ground_truth_col = escape_identifier(ground_truth_col)

  # build query components with segmentation columns
- escaped_segmentation_cols = [
- escape_identifier(col) for col in segmentation_cols
- ]
  all_select_clause_cols = [
- f"time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts",
- f"SUM(POW({escaped_prediction_col} - {escaped_ground_truth_col}, 2)) as squared_error",
+ f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
+ f"SUM(POW({prediction_col} - {ground_truth_col}, 2)) as squared_error",
  f"COUNT(*) as count",
- ] + escaped_segmentation_cols
- all_group_by_cols = ["ts"] + escaped_segmentation_cols
+ ] + segmentation_cols
+ all_group_by_cols = ["ts"] + segmentation_cols

  # build query
  mse_query = f"""
  SELECT {", ".join(all_select_clause_cols)}
  FROM {dataset.dataset_table_name}
- WHERE {escaped_prediction_col} IS NOT NULL
- AND {escaped_ground_truth_col} IS NOT NULL
+ WHERE {prediction_col} IS NOT NULL
+ AND {ground_truth_col} IS NOT NULL
  GROUP BY {", ".join(all_group_by_cols)} order by ts desc
  """

  results = ddb_conn.sql(mse_query).df()
+ unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]
  count_series = self.group_query_results_to_numeric_metrics(
  results,
  "count",
- segmentation_cols,
+ unescaped_segmentation_cols,
  "ts",
  )
  squared_error_series = self.group_query_results_to_numeric_metrics(
  results,
  "squared_error",
- segmentation_cols,
+ unescaped_segmentation_cols,
  "ts",
  )

@@ -20,7 +20,8 @@ from arthur_common.models.schema_definitions import (
  ScalarType,
  ScopeSchemaTag,
  )
- from arthur_common.tools.duckdb_data_loader import escape_identifier, escape_str_literal
+
+ from arthur_common.tools.duckdb_data_loader import escape_str_literal, unescape_identifier


  class MulticlassClassifierStringLabelSingleClassConfusionMatrixAggregationFunction(
@@ -194,11 +195,11 @@ class MulticlassClassifierStringLabelSingleClassConfusionMatrixAggregationFuncti
  Returns the following SQL with no segmentation:
  WITH normalized_data AS (
  SELECT
- {escaped_timestamp_col} AS timestamp,
- {prediction_normalization_case.replace('value', escaped_prediction_col)} AS prediction,
- {gt_normalization_case.replace('value', escaped_gt_values_col)} AS actual_value
+ {timestamp_col} AS timestamp,
+ {prediction_normalization_case.replace('value', prediction_col)} AS prediction,
+ {gt_normalization_case.replace('value', gt_values_col)} AS actual_value
  FROM {dataset.dataset_table_name}
- WHERE {escaped_timestamp_col} IS NOT NULL
+ WHERE {timestamp_col} IS NOT NULL
  )
  SELECT
  time_bucket(INTERVAL '5 minutes', timestamp) AS ts,
@@ -212,19 +213,12 @@ class MulticlassClassifierStringLabelSingleClassConfusionMatrixAggregationFuncti
  ORDER BY ts

  """
- escaped_timestamp_col = escape_identifier(timestamp_col)
- escaped_prediction_col = escape_identifier(prediction_col)
- escaped_gt_values_col = escape_identifier(gt_values_col)
-
  # build query components with segmentation columns
- escaped_segmentation_cols = [
- escape_identifier(col) for col in segmentation_cols
- ]
  first_subquery_select_cols = [
- f"{escaped_timestamp_col} AS timestamp",
- f"{prediction_normalization_case.replace('value', escaped_prediction_col)} AS prediction",
- f"{gt_normalization_case.replace('value', escaped_gt_values_col)} AS actual_value",
- ] + escaped_segmentation_cols
+ f"{timestamp_col} AS timestamp",
+ f"{prediction_normalization_case.replace('value', prediction_col)} AS prediction",
+ f"{gt_normalization_case.replace('value', gt_values_col)} AS actual_value",
+ ] + segmentation_cols
  second_subquery_select_cols = [
  "time_bucket(INTERVAL '5 minutes', timestamp) AS ts",
  "SUM(CASE WHEN prediction = 1 AND actual_value = 1 THEN 1 ELSE 0 END) AS true_positive_count",
@@ -232,8 +226,8 @@ class MulticlassClassifierStringLabelSingleClassConfusionMatrixAggregationFuncti
  "SUM(CASE WHEN prediction = 0 AND actual_value = 1 THEN 1 ELSE 0 END) AS false_negative_count",
  "SUM(CASE WHEN prediction = 0 AND actual_value = 0 THEN 1 ELSE 0 END) AS true_negative_count",
  f"any_value({escaped_positive_class_label}) as class_label",
- ] + escaped_segmentation_cols
- second_subquery_group_by_cols = ["ts"] + escaped_segmentation_cols
+ ] + segmentation_cols
+ second_subquery_group_by_cols = ["ts"] + segmentation_cols
  extra_dims = ["class_label"]

  # build query
@@ -241,7 +235,7 @@ class MulticlassClassifierStringLabelSingleClassConfusionMatrixAggregationFuncti
  WITH normalized_data AS (
  SELECT {", ".join(first_subquery_select_cols)}
  FROM {dataset.dataset_table_name}
- WHERE {escaped_timestamp_col} IS NOT NULL
+ WHERE {timestamp_col} IS NOT NULL
  )
  SELECT {", ".join(second_subquery_select_cols)}
  FROM normalized_data
@@ -250,29 +244,30 @@ class MulticlassClassifierStringLabelSingleClassConfusionMatrixAggregationFuncti
  """

  results = ddb_conn.sql(confusion_matrix_query).df()
+ unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]

  tp = self.group_query_results_to_numeric_metrics(
  results,
  "true_positive_count",
- dim_columns=segmentation_cols + extra_dims,
+ dim_columns=unescaped_segmentation_cols + extra_dims,
  timestamp_col="ts",
  )
  fp = self.group_query_results_to_numeric_metrics(
  results,
  "false_positive_count",
- dim_columns=segmentation_cols + extra_dims,
+ dim_columns=unescaped_segmentation_cols + extra_dims,
  timestamp_col="ts",
  )
  fn = self.group_query_results_to_numeric_metrics(
  results,
  "false_negative_count",
- dim_columns=segmentation_cols + extra_dims,
+ dim_columns=unescaped_segmentation_cols + extra_dims,
  timestamp_col="ts",
  )
  tn = self.group_query_results_to_numeric_metrics(
  results,
  "true_negative_count",
- dim_columns=segmentation_cols + extra_dims,
+ dim_columns=unescaped_segmentation_cols + extra_dims,
  timestamp_col="ts",
  )
  tp_metric = self.series_to_metric(
@@ -18,7 +18,8 @@ from arthur_common.models.schema_definitions import (
  ScalarType,
  ScopeSchemaTag,
  )
- from arthur_common.tools.duckdb_data_loader import escape_identifier, escape_str_literal
+
+ from arthur_common.tools.duckdb_data_loader import unescape_identifier, escape_str_literal


  class NumericSketchAggregationFunction(SketchAggregationFunction):
@@ -95,41 +96,37 @@ class NumericSketchAggregationFunction(SketchAggregationFunction):
  ] = None,
  ) -> list[SketchMetric]:
  """Executed SQL with no segmentation columns:
- select {escaped_timestamp_col_id} as ts, \
- {escaped_numeric_col_id}, \
+ select {timestamp_col} as ts, \
+ {numeric_col}, \
  {numeric_col_name_str} as column_name \
  from {dataset.dataset_table_name} \
- where {escaped_numeric_col_id} is not null \
+ where {numeric_col} is not null \
  """
  segmentation_cols = [] if not segmentation_cols else segmentation_cols
- escaped_timestamp_col_id = escape_identifier(timestamp_col)
- escaped_numeric_col_id = escape_identifier(numeric_col)
- numeric_col_name_str = escape_str_literal(numeric_col)
+ numeric_col_name_str = escape_str_literal(unescape_identifier(numeric_col))

  # build query components with segmentation columns
- escaped_segmentation_cols = [
- escape_identifier(col) for col in segmentation_cols
- ]
  all_select_clause_cols = [
- f"{escaped_timestamp_col_id} as ts",
- f"{escaped_numeric_col_id}",
+ f"{timestamp_col} as ts",
+ f"{numeric_col}",
  f"{numeric_col_name_str} as column_name",
- ] + escaped_segmentation_cols
+ ] + segmentation_cols
  extra_dims = ["column_name"]

  # build query
  data_query = f"""
  select {", ".join(all_select_clause_cols)}
  from {dataset.dataset_table_name}
- where {escaped_numeric_col_id} is not null
+ where {numeric_col} is not null
  """

  results = ddb_conn.sql(data_query).df()
+ unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]

  series = self.group_query_results_to_sketch_metrics(
  results,
- numeric_col,
- segmentation_cols + extra_dims,
+ unescape_identifier(numeric_col),
+ unescaped_segmentation_cols + extra_dims,
  "ts",
  )

@@ -19,7 +19,7 @@ from arthur_common.models.schema_definitions import (
  ScalarType,
  ScopeSchemaTag,
  )
- from arthur_common.tools.duckdb_data_loader import escape_identifier
+ from arthur_common.tools.duckdb_data_loader import unescape_identifier


  class NumericSumAggregationFunction(NumericAggregationFunction):
@@ -94,45 +94,41 @@ class NumericSumAggregationFunction(NumericAggregationFunction):
  ] = None,
  ) -> list[NumericMetric]:
  """Executed SQL with no segmentation columns:
- select time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts, \
- sum({escaped_numeric_col}) as sum \
+ select time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts, \
+ sum({numeric_col}) as sum \
  from {dataset.dataset_table_name} \
- where {escaped_numeric_col} is not null \
+ where {numeric_col} is not null \
  group by ts \
  """
  segmentation_cols = [] if not segmentation_cols else segmentation_cols
- escaped_timestamp_col = escape_identifier(timestamp_col)
- escaped_numeric_col = escape_identifier(numeric_col)

  # build query components with segmentation columns
- escaped_segmentation_cols = [
- escape_identifier(col) for col in segmentation_cols
- ]
  all_select_clause_cols = [
- f"time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts",
- f"sum({escaped_numeric_col}) as sum",
- ] + escaped_segmentation_cols
- all_group_by_cols = ["ts"] + escaped_segmentation_cols
+ f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
+ f"sum({numeric_col}) as sum",
+ ] + segmentation_cols
+ all_group_by_cols = ["ts"] + segmentation_cols

  # build query
  query = f"""
  select {", ".join(all_select_clause_cols)}
  from {dataset.dataset_table_name}
- where {escaped_numeric_col} is not null
+ where {numeric_col} is not null
  group by {", ".join(all_group_by_cols)}
  """

  results = ddb_conn.sql(query).df()
+ unescaped_segmentation_cols = [unescape_identifier(seg_col) for seg_col in segmentation_cols]

  series = self.group_query_results_to_numeric_metrics(
  results,
  "sum",
- segmentation_cols,
+ unescaped_segmentation_cols,
  "ts",
  )
  # preserve dimension that identifies the name of the numeric column used for the aggregation
  for point in series:
- point.dimensions.append(Dimension(name="column_name", value=numeric_col))
+ point.dimensions.append(Dimension(name="column_name", value=unescape_identifier(numeric_col)))

  metric = self.series_to_metric(self.METRIC_NAME, series)
  return [metric]
@@ -119,9 +119,9 @@ class TokenUsageScope(BaseEnum):


  class ToolClassEnum(IntEnum):
- WRONG_TOOL_SELECTED = 0
- CORRECT_TOOL_SELECTED = 1
- NO_TOOL_SELECTED = 2
+ INCORRECT = 0
+ CORRECT = 1
+ NA = 2

  def __str__(self) -> str:
  return str(self.value)
@@ -147,3 +147,11 @@ class UserPermissionResource(BaseEnum):
  RESPONSES = "responses"
  RULES = "rules"
  TASKS = "tasks"
+
+
+ class ComparisonOperatorEnum(BaseEnum):
+ EQUAL = "eq"
+ GREATER_THAN = "gt"
+ GREATER_THAN_OR_EQUAL = "gte"
+ LESS_THAN = "lt"
+ LESS_THAN_OR_EQUAL = "lte"
@@ -1,9 +1,16 @@
  from datetime import datetime
- from typing import Any, Dict, List, Optional, Self, Type, Union
+ from typing import Any, Dict, List, Optional, Self, Type

  from fastapi import HTTPException
  from openinference.semconv.trace import OpenInferenceSpanKindValues
- from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
+ from pydantic import (
+ BaseModel,
+ ConfigDict,
+ Field,
+ ValidationInfo,
+ field_validator,
+ model_validator,
+ )

  from arthur_common.models.common_schemas import (
  ExamplesConfig,
@@ -25,6 +32,7 @@ from arthur_common.models.enums import (
  PIIEntityTypes,
  RuleScope,
  RuleType,
+ ToolClassEnum,
  )
  from arthur_common.models.metric_schemas import RelevanceMetricConfig

@@ -50,12 +58,12 @@ class NewRuleRequest(BaseModel):
  examples=[False],
  )
  config: (
- KeywordsConfig
- | RegexConfig
- | ExamplesConfig
- | ToxicityConfig
- | PIIConfig
- | None
+ KeywordsConfig
+ | RegexConfig
+ | ExamplesConfig
+ | ToxicityConfig
+ | PIIConfig
+ | None
  ) = Field(description="Config of the rule", default=None)

  model_config = ConfigDict(
@@ -554,3 +562,250 @@ class SpanQueryRequest(BaseModel):
  f"Valid values: {', '.join(sorted(valid_span_kinds))}",
  )
  return value
+
+
+ class TraceQueryRequest(BaseModel):
+ """Request schema for querying traces with comprehensive filtering."""
+
+ # Required
+ task_ids: list[str] = Field(
+ ...,
+ description="Task IDs to filter on. At least one is required.",
+ min_length=1,
+ )
+
+ # Common optional filters
+ trace_ids: Optional[list[str]] = Field(
+ None,
+ description="Trace IDs to filter on. Optional.",
+ )
+ start_time: Optional[datetime] = Field(
+ None,
+ description="Inclusive start date in ISO8601 string format. Use local time (not UTC).",
+ )
+ end_time: Optional[datetime] = Field(
+ None,
+ description="Exclusive end date in ISO8601 string format. Use local time (not UTC).",
+ )
+
+ # New trace-level filters
+ tool_name: Optional[str] = Field(
+ None,
+ description="Return only results with this tool name.",
+ )
+ span_types: Optional[list[str]] = Field(
+ None,
+ description="Span types to filter on. Optional.",
+ )
+
+ # Query relevance filters
+ query_relevance_eq: Optional[float] = Field(
+ None,
+ ge=0,
+ le=1,
+ description="Equal to this value.",
+ )
+ query_relevance_gt: Optional[float] = Field(
+ None,
+ ge=0,
+ le=1,
+ description="Greater than this value.",
+ )
+ query_relevance_gte: Optional[float] = Field(
+ None,
+ ge=0,
+ le=1,
+ description="Greater than or equal to this value.",
+ )
+ query_relevance_lt: Optional[float] = Field(
+ None,
+ ge=0,
+ le=1,
+ description="Less than this value.",
+ )
+ query_relevance_lte: Optional[float] = Field(
+ None,
+ ge=0,
+ le=1,
+ description="Less than or equal to this value.",
+ )
+
+ # Response relevance filters
+ response_relevance_eq: Optional[float] = Field(
+ None,
+ ge=0,
+ le=1,
+ description="Equal to this value.",
+ )
+ response_relevance_gt: Optional[float] = Field(
+ None,
+ ge=0,
+ le=1,
+ description="Greater than this value.",
+ )
+ response_relevance_gte: Optional[float] = Field(
+ None,
+ ge=0,
+ le=1,
+ description="Greater than or equal to this value.",
+ )
+ response_relevance_lt: Optional[float] = Field(
+ None,
+ ge=0,
+ le=1,
+ description="Less than this value.",
+ )
+ response_relevance_lte: Optional[float] = Field(
+ None,
+ ge=0,
+ le=1,
+ description="Less than or equal to this value.",
+ )
+
+ # Tool classification filters
+ tool_selection: Optional[ToolClassEnum] = Field(
+ None,
+ description="Tool selection evaluation result.",
+ )
+ tool_usage: Optional[ToolClassEnum] = Field(
+ None,
+ description="Tool usage evaluation result.",
+ )
+
+ # Trace duration filters
+ trace_duration_eq: Optional[float] = Field(
+ None,
+ ge=0,
+ description="Duration exactly equal to this value (seconds).",
+ )
+ trace_duration_gt: Optional[float] = Field(
+ None,
+ ge=0,
+ description="Duration greater than this value (seconds).",
+ )
+ trace_duration_gte: Optional[float] = Field(
+ None,
+ ge=0,
+ description="Duration greater than or equal to this value (seconds).",
+ )
+ trace_duration_lt: Optional[float] = Field(
+ None,
+ ge=0,
+ description="Duration less than this value (seconds).",
+ )
+ trace_duration_lte: Optional[float] = Field(
+ None,
+ ge=0,
+ description="Duration less than or equal to this value (seconds).",
+ )
+
+ @field_validator(
+ "query_relevance_eq",
+ "query_relevance_gt",
+ "query_relevance_gte",
+ "query_relevance_lt",
+ "query_relevance_lte",
+ "response_relevance_eq",
+ "response_relevance_gt",
+ "response_relevance_gte",
+ "response_relevance_lt",
+ "response_relevance_lte",
+ mode="before",
+ )
+ @classmethod
+ def validate_relevance_scores(
+ cls,
+ value: Optional[float],
+ info: ValidationInfo,
+ ) -> Optional[float]:
+ """Validate that relevance scores are between 0 and 1 (inclusive)."""
+ if value is not None:
+ if not (0.0 <= value <= 1.0):
+ raise ValueError(
+ f"{info.field_name} value must be between 0 and 1 (inclusive)",
+ )
+ return value
+
+ @field_validator(
+ "trace_duration_eq",
+ "trace_duration_gt",
+ "trace_duration_gte",
+ "trace_duration_lt",
+ "trace_duration_lte",
+ mode="before",
+ )
+ @classmethod
+ def validate_trace_duration(
+ cls,
+ value: Optional[float],
+ info: ValidationInfo,
+ ) -> Optional[float]:
+ """Validate that trace duration values are non-negative."""
+ if value is not None:
+ if value < 0:
+ raise ValueError(
+ f"{info.field_name} value must be non-negative (greater than or equal to 0)",
+ )
+ return value
+
+ @field_validator("tool_selection", "tool_usage", mode="before")
+ @classmethod
+ def validate_tool_classification(cls, value: Any) -> Optional[ToolClassEnum]:
+ """Validate tool classification enum values."""
+ if value is not None:
+ # Handle both integer and enum inputs
+ if isinstance(value, int):
+ if value not in [0, 1, 2]:
+ raise ValueError(
+ "Tool classification must be 0 (INCORRECT), "
+ "1 (CORRECT), or 2 (NA)",
+ )
+ return ToolClassEnum(value)
+ elif isinstance(value, ToolClassEnum):
+ return value
+ else:
+ raise ValueError(
+ "Tool classification must be an integer (0, 1, 2) or ToolClassEnum instance",
+ )
+ return value
+
+ @field_validator("span_types")
+ @classmethod
+ def validate_span_types(cls, value: Optional[list[str]]) -> Optional[list[str]]:
+ """Validate that all span_types are valid OpenInference span kinds."""
+ if not value:
+ return value
+
+ # Get all valid span kind values
+ valid_span_kinds = [kind.value for kind in OpenInferenceSpanKindValues]
+ invalid_types = [st for st in value if st not in valid_span_kinds]
+
+ if invalid_types:
+ raise ValueError(
+ f"Invalid span_types received: {invalid_types}. "
+ f"Valid values: {', '.join(sorted(valid_span_kinds))}",
+ )
+ return value
+
+ @model_validator(mode="after")
+ def validate_filter_combinations(self) -> Self:
+ """Validate that filter combinations are logically valid."""
+ # Check mutually exclusive filters for each metric type
+ for prefix in ["query_relevance", "response_relevance", "trace_duration"]:
+ eq_field = f"{prefix}_eq"
+ comparison_fields = [f"{prefix}_{op}" for op in ["gt", "gte", "lt", "lte"]]
+
+ if getattr(self, eq_field) and any(
+ getattr(self, field) for field in comparison_fields
+ ):
+ raise ValueError(
+ f"{eq_field} cannot be combined with other {prefix} comparison operators",
+ )
+
+ # Check for incompatible operator combinations
+ if getattr(self, f"{prefix}_gt") and getattr(self, f"{prefix}_gte"):
+ raise ValueError(f"Cannot combine {prefix}_gt with {prefix}_gte")
+ if getattr(self, f"{prefix}_lt") and getattr(self, f"{prefix}_lte"):
+ raise ValueError(f"Cannot combine {prefix}_lt with {prefix}_lte")
+
+ return self
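Illustrative usage (not part of the diff above): a minimal sketch showing how the new TraceQueryRequest validators behave, assuming the class is importable from arthur_common.models.request_schemas as the RECORD section below suggests.

# Sketch only: exercises the TraceQueryRequest filters added in 2.3.0.
# The import path is an assumption based on the RECORD entry for request_schemas.py.
from arthur_common.models.request_schemas import TraceQueryRequest

# Valid: range filters on different metrics can be combined freely.
req = TraceQueryRequest(
    task_ids=["task-123"],
    query_relevance_gte=0.5,
    trace_duration_lt=30.0,
)

# Invalid: the model_validator rejects mixing an _eq filter with a range filter
# on the same metric, so pydantic raises a ValidationError here.
try:
    TraceQueryRequest(
        task_ids=["task-123"],
        trace_duration_eq=5.0,
        trace_duration_gt=1.0,
    )
except Exception as exc:
    print(exc)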
@@ -1,4 +1,5 @@
  import json
+ import re
  from typing import Any

  import duckdb
@@ -314,6 +315,9 @@ def escape_identifier(identifier: str) -> str:
  """
  Escape an identifier (e.g., column name) for use in a SQL query.
  This method handles special characters and ensures proper quoting.
+
+ For struct fields, the identifiers must be escaped as following:
+ "struct_column_name"."struct_field"
  """
  # Replace any double quotes with two double quotes
  escaped = identifier.replace('"', '""')
@@ -321,6 +325,32 @@ def escape_identifier(identifier: str) -> str:
  return f'"{escaped}"'


+ def unescape_identifier(identifier: str) -> str:
+ """
+ Unescape an identifier (e.g., column name).
+
+ This removes the double quotes and properly handles struct fields, which may be escaped as follows:
+ "struct_column_name"."struct_field"
+
+ Here's a hard case for help understanding this function: "struct "" column name with quotes"."struct.field.name.with.dots"
+ """
+ unescaped_identifiers = []
+ # strip top-level quotes
+ identifier = identifier[1:-1]
+ # split identifier into struct fields based on delimiter pattern "."
+ # at this point there are no external double quotes left; any remaining are escaped double quotes belonging to
+ # the column name
+ identifier_split_in_struct_fields = re.split(r'"\."', identifier)
+
+ for identifier in identifier_split_in_struct_fields:
+ # replace any escaped double quotes in the column
+ unescaped_identifier = identifier.replace('""', '"')
+ unescaped_identifiers.append(unescaped_identifier)
+
+ # join back any struct fields via dot syntax without the escape identifiers
+ return ".".join(unescaped_identifiers)
+
+
  def escape_str_literal(literal: str) -> str:
  """
  Escape a duckDB string literal for use in a SQL query.
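Illustrative round trip (not part of the diff above): a minimal sketch of how the existing escape_identifier and the new unescape_identifier relate, including the "hard case" quoted in the docstring; the printed result is derived from the function body shown, not from running the released package.

# Sketch only: pairing escape_identifier with the unescape_identifier added in 2.3.0.
from arthur_common.tools.duckdb_data_loader import escape_identifier, unescape_identifier

col = 'weird "quoted" column'
escaped = escape_identifier(col)          # '"weird ""quoted"" column"'
assert unescape_identifier(escaped) == col  # doubled quotes are collapsed back

# The docstring's hard case: a quoted struct column plus a field name containing dots
# collapses to plain dot syntax with the inner quotes restored.
hard = '"struct "" column name with quotes"."struct.field.name.with.dots"'
print(unescape_identifier(hard))
# struct " column name with quotes.struct.field.name.with.dots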
@@ -16,17 +16,15 @@ def is_column_possible_segmentation(
  2. Has an allowed DType.

  PreReq: Table with column should already be loaded in DuckDB
+ column_name already has DuckDB escape identifier for the query syntax
  """
  segmentation_col_unique_val_limit = Config.segmentation_col_unique_values_limit()
  if column_dtype not in SEGMENTATION_ALLOWED_DTYPES:
  return False

- # check column for unique value count
- escaped_column = escape_identifier(column_name)
-
- # count distinct values in this column
+ # check column for unique value count - count distinct values in this column
  distinct_count_query = f"""
- SELECT COUNT(DISTINCT {escaped_column}) as distinct_count
+ SELECT COUNT(DISTINCT {column_name}) as distinct_count
  FROM {table}
  """
  result = conn.sql(distinct_count_query).fetchone()
@@ -110,7 +110,7 @@ class SchemaInferer:
  if not is_nested_col and is_column_possible_segmentation(
  self.conn,
  table,
- col_name,
+ escape_identifier(col_name),
  scalar_schema.dtype,
  ):
  scalar_schema.tag_hints.append(ScopeSchemaTag.POSSIBLE_SEGMENTATION)
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: arthur-common
- Version: 2.1.68
+ Version: 2.3.0
  Summary: Utility code common to Arthur platform components.
  License: MIT
  Author: Arthur
@@ -4,17 +4,17 @@ arthur_common/aggregations/aggregator.py,sha256=AhyNqBDEbKtS3ZrnSIT9iZ1SK_TAuiUN
  arthur_common/aggregations/functions/README.md,sha256=MkZoTAJ94My96R5Z8GAxud7S6vyR0vgVi9gqdt9a4XY,5460
  arthur_common/aggregations/functions/__init__.py,sha256=HqC3UNRURX7ZQHgamTrQvfA8u_FiZGZ4I4eQW7Ooe5o,1299
  arthur_common/aggregations/functions/agentic_aggregations.py,sha256=09th4RPRf-ogtVWpbcqqmITN2UFtfqXhQ7Rr6IBqQHo,33995
- arthur_common/aggregations/functions/categorical_count.py,sha256=wc1ovL8JoiSeoSTk9h1fgrLj1QuQeYYZmEqgffGc2cw,5328
- arthur_common/aggregations/functions/confusion_matrix.py,sha256=aPL8DaXpflt0z1u1KIeFw9geZLJ6qTuTosCNFV54y8M,22105
- arthur_common/aggregations/functions/inference_count.py,sha256=SrRfxQVnX-wRTZ1zbqUKupPdACvfKeUpZDidZs45ZUY,4079
- arthur_common/aggregations/functions/inference_count_by_class.py,sha256=H64-pZIU1bJ2BPNJl64_H97BASAjGact10AjW_gkvaY,11551
- arthur_common/aggregations/functions/inference_null_count.py,sha256=w9sfu1QDlVBJwMW5EEkgda65nyMAABzd-FBKtj8amw4,4825
- arthur_common/aggregations/functions/mean_absolute_error.py,sha256=mOqE7XO2h7JtTLEKG5gTXu-pQJJIMYKWbUyqWA2dcxk,6831
- arthur_common/aggregations/functions/mean_squared_error.py,sha256=9WFBIhmAg1FZ7tdQYFWsS3yp3kyCYMJVAk-uLSb41Ck,6852
- arthur_common/aggregations/functions/multiclass_confusion_matrix.py,sha256=rXXvXCIb30j_ofsMfp2yjLEdf8LmfKTqOLM3NQowzaU,12612
+ arthur_common/aggregations/functions/categorical_count.py,sha256=_TD0s0JAtqC5RmT6ZNWLEBZm-dU4akm-Aor7EDVazzA,5176
+ arthur_common/aggregations/functions/confusion_matrix.py,sha256=n33kyyZuxo8k6jUYnBUsc1fLotTmcw0H8rsX_x_oeJ0,21733
+ arthur_common/aggregations/functions/inference_count.py,sha256=D49SpwFywipMqeC93gc3_ZGwBoGL89yKuA9_55dBWBw,3984
+ arthur_common/aggregations/functions/inference_count_by_class.py,sha256=mYL6xMTb-_VO6mKGWHOtFAvWzTt-C_4vKf8KgioJGDg,11191
+ arthur_common/aggregations/functions/inference_null_count.py,sha256=UlE5EZa3k2nKIv6Yzrnjq1MsZEzrau7Olumny8hsHtg,4672
+ arthur_common/aggregations/functions/mean_absolute_error.py,sha256=YzrNHox_4HEGWn33E12d6eiQ8A9Rwct7AW3hOWrTW7I,6544
+ arthur_common/aggregations/functions/mean_squared_error.py,sha256=b_is7FKRSninYs1ilAXeLPJFfmyCaiKvCC9Ev_OERio,6565
+ arthur_common/aggregations/functions/multiclass_confusion_matrix.py,sha256=e1KEyxIZocWMkDbnW0zfJHd5PUi_kyzwNUVFOD0l5Nk,12359
  arthur_common/aggregations/functions/multiclass_inference_count_by_class.py,sha256=yiMpdz4VuX1ELprXYupFu4B9aDLIhgfEi3ma8jZsT_M,4261
- arthur_common/aggregations/functions/numeric_stats.py,sha256=uHTyOAHW6xF6D-TeFLtY16iVR-Ju_6lmXSSY77mH0Qs,4921
- arthur_common/aggregations/functions/numeric_sum.py,sha256=kGE6Jjnjwf2E4TKE3NwPyrlEKgygfCxv1z_YGDCOcCQ,5028
+ arthur_common/aggregations/functions/numeric_stats.py,sha256=mMpVH1PvElGaz5mIQWy8sIkKPZ5kyeNOAM2iM2IlBvY,4760
+ arthur_common/aggregations/functions/numeric_sum.py,sha256=Vq-dQonKTdLt8pYFwT5tCXyyL_FvVQxb6b3nFNRSqus,4861
  arthur_common/aggregations/functions/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  arthur_common/aggregations/functions/shield_aggregations.py,sha256=BzPkpbhZRy16iFOobuusGKHfov5DxnXS2v_WThpw2fk,35659
  arthur_common/aggregations/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -26,11 +26,11 @@ arthur_common/models/common_schemas.py,sha256=31Br7DbIgrwHwzgiyMXrgPYrANhqSqle7k
  arthur_common/models/connectors.py,sha256=RwjY74cs0KTKw7Opywehg46SZ4vwN3xm6ujHRsRIQ8Y,2292
  arthur_common/models/constants.py,sha256=munkU0LrLsDs9BtAfozzw30FCguIowmAUKg_9vqwX24,1049
  arthur_common/models/datasets.py,sha256=7p1tyJEPwXjBs2ZRoai8hTzNl6MK9jU1DluzASApE_4,254
- arthur_common/models/enums.py,sha256=f--GnBHo7_PEISrIS18lCxOhZUZ-BcaBvTlq0kX4tsU,3739
+ arthur_common/models/enums.py,sha256=J2beHEMjLfOGgc-vh1aDpE7KmBGKzLoOUGYLtuciJro,3870
  arthur_common/models/metric_schemas.py,sha256=Xf-1RTzg7iYtnBMLkUUUuMPzAujzzNvQx_pe-CksEdU,2484
  arthur_common/models/metrics.py,sha256=87LUU7-8duoKCzaffw9GHMyjsKMNoxKa5n5Hyg_ZK1s,11931
  arthur_common/models/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- arthur_common/models/request_schemas.py,sha256=l6GvEtUcIJW5GGy9L3jR2djNqg4f1bBFwl2wBpVfL10,21467
+ arthur_common/models/request_schemas.py,sha256=ihrWK0SRVXsRmNaiLibbAEWi_RHl440JJvm09WRdNxQ,29329
  arthur_common/models/response_schemas.py,sha256=eZCgxnfOht8isUunAA4rosLFA-tgXRZIcj2CYa5XqOE,24362
  arthur_common/models/schema_definitions.py,sha256=dcUSLjBmvyloStcBFmT_rHdXbKdvA8Yxi_avYUbps3E,16876
  arthur_common/models/task_job_specs.py,sha256=p7jsSb97ylHYNkwoHXNOJvx2zcnh2kxLeh3m0pddo4M,3442
@@ -38,12 +38,12 @@ arthur_common/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  arthur_common/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  arthur_common/tools/aggregation_analyzer.py,sha256=UfMtvFWXV2Dqly8S6nneGgomuvEGN-1tBz81tfkMcAE,11206
  arthur_common/tools/aggregation_loader.py,sha256=3CF46bNi-GdJBNOXkjYfCQ1Aung8lf65L532sdWmR_s,2351
- arthur_common/tools/duckdb_data_loader.py,sha256=OwuvppwcBB9qQxyWr86mH7Gz2FBIuyDl0UpQ7TulhlU,11220
- arthur_common/tools/duckdb_utils.py,sha256=1i-kRXu95gh4Sf9Osl2LFUpdb0yZifOjLDtIgSfSmfs,1197
+ arthur_common/tools/duckdb_data_loader.py,sha256=A80wpATSc4VJLghoHwxpBEuUsxY93OZS0Qo4cFX7cRw,12462
+ arthur_common/tools/duckdb_utils.py,sha256=8l8bUmjqJyj84DXyEOzO_DsD8VsO25DWYK_IYF--Zek,1211
  arthur_common/tools/functions.py,sha256=FWL4eWO5-vLp86WudT-MGUKvf2B8f02IdoXQFKd6d8k,1093
  arthur_common/tools/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- arthur_common/tools/schema_inferer.py,sha256=Ur4CXGAkd6ZMSU0nMNrkOEElsBopHXq0lctTV8X92W8,5188
+ arthur_common/tools/schema_inferer.py,sha256=9teI67umlGn0izp6pZ5UBuWxJthaWEmw3wRj2KPIbf4,5207
  arthur_common/tools/time_utils.py,sha256=4gfiu9NXfvPZltiVNLSIQGylX6h2W0viNi9Kv4bKyfw,1410
- arthur_common-2.1.68.dist-info/METADATA,sha256=Dmyvy60ivlka8sQwvlFN2-XNAybMTRbHsNu9RUd5FkU,2147
- arthur_common-2.1.68.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
- arthur_common-2.1.68.dist-info/RECORD,,
+ arthur_common-2.3.0.dist-info/METADATA,sha256=AfXaXNFya5qwUZcaI_QBG7b1gTLCgLSoza5kzgyGb0E,2146
+ arthur_common-2.3.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+ arthur_common-2.3.0.dist-info/RECORD,,