arthur-common 2.1.58__py3-none-any.whl → 2.4.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arthur_common/aggregations/aggregator.py +73 -9
- arthur_common/aggregations/functions/agentic_aggregations.py +260 -85
- arthur_common/aggregations/functions/categorical_count.py +15 -15
- arthur_common/aggregations/functions/confusion_matrix.py +24 -26
- arthur_common/aggregations/functions/inference_count.py +5 -9
- arthur_common/aggregations/functions/inference_count_by_class.py +16 -27
- arthur_common/aggregations/functions/inference_null_count.py +10 -13
- arthur_common/aggregations/functions/mean_absolute_error.py +12 -18
- arthur_common/aggregations/functions/mean_squared_error.py +12 -18
- arthur_common/aggregations/functions/multiclass_confusion_matrix.py +13 -20
- arthur_common/aggregations/functions/multiclass_inference_count_by_class.py +1 -1
- arthur_common/aggregations/functions/numeric_stats.py +13 -15
- arthur_common/aggregations/functions/numeric_sum.py +12 -15
- arthur_common/aggregations/functions/shield_aggregations.py +457 -215
- arthur_common/models/common_schemas.py +214 -0
- arthur_common/models/connectors.py +10 -2
- arthur_common/models/constants.py +24 -0
- arthur_common/models/datasets.py +0 -9
- arthur_common/models/enums.py +177 -0
- arthur_common/models/metric_schemas.py +63 -0
- arthur_common/models/metrics.py +2 -9
- arthur_common/models/request_schemas.py +870 -0
- arthur_common/models/response_schemas.py +785 -0
- arthur_common/models/schema_definitions.py +6 -1
- arthur_common/models/task_job_specs.py +3 -12
- arthur_common/tools/duckdb_data_loader.py +34 -2
- arthur_common/tools/duckdb_utils.py +3 -6
- arthur_common/tools/schema_inferer.py +3 -6
- {arthur_common-2.1.58.dist-info → arthur_common-2.4.13.dist-info}/METADATA +12 -4
- arthur_common-2.4.13.dist-info/RECORD +49 -0
- arthur_common/models/shield.py +0 -642
- arthur_common-2.1.58.dist-info/RECORD +0 -44
- {arthur_common-2.1.58.dist-info → arthur_common-2.4.13.dist-info}/WHEEL +0 -0
arthur_common/aggregations/functions/confusion_matrix.py:

@@ -4,7 +4,7 @@ from uuid import UUID
 from duckdb import DuckDBPyConnection
 
 from arthur_common.aggregations.aggregator import NumericAggregationFunction
-from arthur_common.models.
+from arthur_common.models.enums import ModelProblemType
 from arthur_common.models.metrics import (
     BaseReportedAggregation,
     DatasetReference,
@@ -20,7 +20,10 @@ from arthur_common.models.schema_definitions import (
     ScalarType,
     ScopeSchemaTag,
 )
-from arthur_common.tools.duckdb_data_loader import
+from arthur_common.tools.duckdb_data_loader import (
+    escape_str_literal,
+    unescape_identifier,
+)
 
 
 class ConfusionMatrixAggregationFunction(NumericAggregationFunction):
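The escaping helpers imported here are defined in arthur_common/tools/duckdb_data_loader.py, which is not part of this diff. As a rough, hypothetical sketch of the convention the changed call sites appear to rely on (column names now arrive as already-quoted DuckDB identifiers, and human-readable names are recovered from them when needed), the helpers could look something like this:

```python
# Hypothetical sketches only -- the real implementations live in
# arthur_common/tools/duckdb_data_loader.py and may differ.

def escape_identifier(name: str) -> str:
    # Quote a raw column name as a DuckDB identifier: my col -> "my col"
    return '"' + name.replace('"', '""') + '"'


def unescape_identifier(identifier: str) -> str:
    # Reverse of escape_identifier: "my col" -> my col
    if identifier.startswith('"') and identifier.endswith('"'):
        return identifier[1:-1].replace('""', '"')
    return identifier


def escape_str_literal(value: str) -> str:
    # Quote a Python string as a SQL string literal: my col -> 'my col'
    return "'" + value.replace("'", "''") + "'"
```

Under that assumption, the removal of per-function escape_identifier calls throughout this release means the aggregation functions now expect their column arguments to be escaped by the caller, and only unescape them when a display name is needed.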
@@ -78,11 +81,11 @@ class ConfusionMatrixAggregationFunction(NumericAggregationFunction):
         Without segmentation, this is the query:
         WITH normalized_data AS (
             SELECT
-                {
-                {prediction_normalization_case.replace('value',
-                {gt_normalization_case.replace('value',
+                {timestamp_col} AS timestamp,
+                {prediction_normalization_case.replace('value', prediction_col)} AS prediction,
+                {gt_normalization_case.replace('value', gt_values_col)} AS actual_value
             FROM {dataset.dataset_table_name}
-            WHERE {
+            WHERE {timestamp_col} IS NOT NULL
         )
         SELECT
             time_bucket(INTERVAL '5 minutes', timestamp) AS ts,
@@ -90,34 +93,31 @@ class ConfusionMatrixAggregationFunction(NumericAggregationFunction):
             SUM(CASE WHEN prediction != actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS false_positive_count,
             SUM(CASE WHEN prediction != actual_value AND actual_value = 1 THEN 1 ELSE 0 END) AS false_negative_count,
             SUM(CASE WHEN prediction = actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS true_negative_count,
-            {
+            {unescaped_prediction_col_name} as prediction_column_name
         FROM normalized_data
         GROUP BY ts
         ORDER BY ts
         """
         segmentation_cols = [] if not segmentation_cols else segmentation_cols
-
-
-
-
+        unescaped_prediction_col_name = escape_str_literal(
+            unescape_identifier(prediction_col),
+        )
+
         # build query components with segmentation columns
-        escaped_segmentation_cols = [
-            escape_identifier(col) for col in segmentation_cols
-        ]
         first_subquery_select_cols = [
-            f"{
-            f"{prediction_normalization_case.replace('value',
-            f"{gt_normalization_case.replace('value',
-        ] +
+            f"{timestamp_col} AS timestamp",
+            f"{prediction_normalization_case.replace('value', prediction_col)} AS prediction",
+            f"{gt_normalization_case.replace('value', gt_values_col)} AS actual_value",
+        ] + segmentation_cols
         second_subquery_select_cols = [
             "time_bucket(INTERVAL '5 minutes', timestamp) AS ts",
             "SUM(CASE WHEN prediction = actual_value AND actual_value = 1 THEN 1 ELSE 0 END) AS true_positive_count",
             "SUM(CASE WHEN prediction != actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS false_positive_count",
             "SUM(CASE WHEN prediction != actual_value AND actual_value = 1 THEN 1 ELSE 0 END) AS false_negative_count",
             "SUM(CASE WHEN prediction = actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS true_negative_count",
-            f"{
-        ] +
-        second_subquery_group_by_cols = ["ts"] +
+            f"{unescaped_prediction_col_name} as prediction_column_name",
+        ] + segmentation_cols
+        second_subquery_group_by_cols = ["ts"] + segmentation_cols
         extra_dims = ["prediction_column_name"]
 
         # build query
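For context on the new unescaped_prediction_col_name value introduced above: assuming the helper behavior sketched earlier, it turns an already-escaped prediction column into a SQL string literal so the column's plain name can be selected as a constant and surfaced as the prediction_column_name dimension. A minimal illustration with a made-up column name:

```python
# Minimal illustration; mirrors the pattern above under the assumed helper behavior.
def unescape_identifier(identifier: str) -> str:
    return identifier[1:-1].replace('""', '"') if identifier.startswith('"') else identifier


def escape_str_literal(value: str) -> str:
    return "'" + value.replace("'", "''") + "'"


prediction_col = '"pred"'  # made-up, already-escaped identifier
unescaped_prediction_col_name = escape_str_literal(unescape_identifier(prediction_col))
print(f"{unescaped_prediction_col_name} as prediction_column_name")
# 'pred' as prediction_column_name
```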
@@ -125,7 +125,7 @@ class ConfusionMatrixAggregationFunction(NumericAggregationFunction):
         WITH normalized_data AS (
             SELECT {", ".join(first_subquery_select_cols)}
             FROM {dataset.dataset_table_name}
-            WHERE {
+            WHERE {timestamp_col} IS NOT NULL
         )
         SELECT {", ".join(second_subquery_select_cols)}
         FROM normalized_data
@@ -243,9 +243,8 @@ class BinaryClassifierIntBoolConfusionMatrixAggregationFunction(
         ] = None,
     ) -> list[NumericMetric]:
         segmentation_cols = [] if not segmentation_cols else segmentation_cols
-        escaped_prediction_col = escape_identifier(prediction_col)
         # Get the type of prediction column
-        type_query = f"SELECT typeof({
+        type_query = f"SELECT typeof({prediction_col}) as col_type FROM {dataset.dataset_table_name} LIMIT 1"
         res = ddb_conn.sql(type_query).fetchone()
         # As long as this column exists, we should be able to get the type. This is here to make mypy happy.
         if not res:
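The typeof() probe above inspects the DuckDB type of the prediction column before the query is built. A tiny, self-contained illustration against an in-memory DuckDB table with a made-up schema:

```python
import duckdb

# Made-up table purely to illustrate the typeof() probe.
conn = duckdb.connect()
conn.sql("CREATE TABLE t (pred BOOLEAN)")
conn.sql("INSERT INTO t VALUES (true)")

type_query = "SELECT typeof(pred) as col_type FROM t LIMIT 1"
print(conn.sql(type_query).fetchone())  # ('BOOLEAN',)
```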
@@ -476,7 +475,6 @@ class BinaryClassifierProbabilityThresholdConfusionMatrixAggregationFunction(
         ),
     ] = None,
     ) -> list[NumericMetric]:
-        escaped_gt_values_col = escape_identifier(gt_values_col)
         prediction_normalization_case = f"""
         CASE
             WHEN value >= {threshold} THEN 1
@@ -485,7 +483,7 @@ class BinaryClassifierProbabilityThresholdConfusionMatrixAggregationFunction(
         END
         """
 
-        type_query = f"SELECT typeof({
+        type_query = f"SELECT typeof({gt_values_col}) as col_type FROM {dataset.dataset_table_name} LIMIT 1"
         res = ddb_conn.sql(type_query).fetchone()
         # As long as this column exists, we should be able to get the type. This is here to make mypy happy.
         if not res:
arthur_common/aggregations/functions/inference_count.py:

@@ -18,7 +18,6 @@ from arthur_common.models.schema_definitions import (
     ScalarType,
     ScopeSchemaTag,
 )
-from arthur_common.tools.duckdb_data_loader import escape_identifier
 
 
 class InferenceCountAggregationFunction(NumericAggregationFunction):
@@ -80,23 +79,19 @@ class InferenceCountAggregationFunction(NumericAggregationFunction):
         ] = None,
     ) -> list[NumericMetric]:
         """Executed SQL with no segmentation columns:
-        select time_bucket(INTERVAL '5 minutes', {
+        select time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts, \
             count(*) as count \
         from {dataset.dataset_table_name} \
         group by ts \
         """
         segmentation_cols = [] if not segmentation_cols else segmentation_cols
-        escaped_timestamp_col = escape_identifier(timestamp_col)
 
         # build query components with segmentation columns
-        escaped_segmentation_cols = [
-            escape_identifier(col) for col in segmentation_cols
-        ]
         all_select_clause_cols = [
-            f"time_bucket(INTERVAL '5 minutes', {
+            f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
             f"count(*) as count",
-        ] +
-        all_group_by_cols = ["ts"] +
+        ] + segmentation_cols
+        all_group_by_cols = ["ts"] + segmentation_cols
 
         # build query
         count_query = f"""
@@ -106,6 +101,7 @@ class InferenceCountAggregationFunction(NumericAggregationFunction):
         """
 
         results = ddb_conn.sql(count_query).df()
+
         series = self.group_query_results_to_numeric_metrics(
             results,
             "count",
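To make the segmentation behavior concrete: the select and group-by lists above are plain Python lists to which the (already-escaped) segmentation columns are appended, so counts are reported per segment value as well as per time bucket. A rough rendering with made-up column and table names:

```python
# Made-up columns and table; mirrors the list-building pattern in the hunks above.
timestamp_col = '"inference_timestamp"'
segmentation_cols = ['"region"']

all_select_clause_cols = [
    f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
    "count(*) as count",
] + segmentation_cols
all_group_by_cols = ["ts"] + segmentation_cols

count_query = f"""
    select {", ".join(all_select_clause_cols)}
    from inferences
    group by {", ".join(all_group_by_cols)}
"""
print(count_query)
# (whitespace trimmed)
# select time_bucket(INTERVAL '5 minutes', "inference_timestamp") as ts, count(*) as count, "region"
# from inferences
# group by ts, "region"
```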
arthur_common/aggregations/functions/inference_count_by_class.py:

@@ -4,7 +4,7 @@ from uuid import UUID
 from duckdb import DuckDBPyConnection
 
 from arthur_common.aggregations.aggregator import NumericAggregationFunction
-from arthur_common.models.
+from arthur_common.models.enums import ModelProblemType
 from arthur_common.models.metrics import (
     BaseReportedAggregation,
     DatasetReference,
@@ -20,7 +20,6 @@ from arthur_common.models.schema_definitions import (
     ScalarType,
     ScopeSchemaTag,
 )
-from arthur_common.tools.duckdb_data_loader import escape_identifier
 
 
 class BinaryClassifierCountByClassAggregationFunction(NumericAggregationFunction):
@@ -100,31 +99,26 @@ class BinaryClassifierCountByClassAggregationFunction
     ) -> list[NumericMetric]:
         """Executed SQL with no segmentation columns:
         SELECT
-            time_bucket(INTERVAL '5 minutes', {
-            {
+            time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts,
+            {prediction_col} as prediction,
             COUNT(*) as count
         FROM {dataset.dataset_table_name}
         GROUP BY
             ts,
             -- group by raw column name instead of alias in select
             -- in case table has a column called 'prediction'
-            {
+            {prediction_col}
         ORDER BY ts
         """
         segmentation_cols = [] if not segmentation_cols else segmentation_cols
-        escaped_timestamp_col = escape_identifier(timestamp_col)
-        escaped_pred_col = escape_identifier(prediction_col)
 
         # build query components with segmentation columns
-        escaped_segmentation_cols = [
-            escape_identifier(col) for col in segmentation_cols
-        ]
         all_select_clause_cols = [
-            f"time_bucket(INTERVAL '5 minutes', {
-            f"{
+            f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
+            f"{prediction_col} as prediction",
             f"COUNT(*) as count",
-        ] +
-        all_group_by_cols = ["ts", f"{
+        ] + segmentation_cols
+        all_group_by_cols = ["ts", f"{prediction_col}"] + segmentation_cols
         extra_dims = ["prediction"]
 
         # build query
@@ -248,34 +242,29 @@ class BinaryClassifierCountThresholdClassAggregationFunction(
     ) -> list[NumericMetric]:
         """Executed SQL with no segmentation columns:
         SELECT
-            time_bucket(INTERVAL '5 minutes', {
-            CASE WHEN {
+            time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts,
+            CASE WHEN {prediction_col} >= {threshold} THEN '{true_label}' ELSE '{false_label}' END as prediction,
             COUNT(*) as count
         FROM {dataset.dataset_table_name}
         GROUP BY
             ts,
             -- group by raw column name instead of alias in select
             -- in case table has a column called 'prediction'
-            {
+            {prediction_col}
         ORDER BY ts
         """
         segmentation_cols = [] if not segmentation_cols else segmentation_cols
-        escaped_timestamp_col = escape_identifier(timestamp_col)
-        escaped_prediction_col = escape_identifier(prediction_col)
 
         # build query components with segmentation columns
-        escaped_segmentation_cols = [
-            escape_identifier(col) for col in segmentation_cols
-        ]
         all_select_clause_cols = [
-            f"time_bucket(INTERVAL '5 minutes', {
-            f"CASE WHEN {
+            f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
+            f"CASE WHEN {prediction_col} >= {threshold} THEN '{true_label}' ELSE '{false_label}' END as prediction",
             f"COUNT(*) as count",
-        ] +
+        ] + segmentation_cols
         all_group_by_cols = [
             "ts",
-            f"{
-        ] +
+            f"{prediction_col}",
+        ] + segmentation_cols
         extra_dims = ["prediction"]
 
         query = f"""
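For the threshold variant above, the CASE expression maps raw scores onto the configured labels before counting. A quick rendering with made-up values:

```python
# Made-up threshold and labels; follows the f-string pattern in the hunk above.
prediction_col, threshold = '"score"', 0.5
true_label, false_label = "fraud", "not_fraud"

case_expr = (
    f"CASE WHEN {prediction_col} >= {threshold} "
    f"THEN '{true_label}' ELSE '{false_label}' END as prediction"
)
print(case_expr)
# CASE WHEN "score" >= 0.5 THEN 'fraud' ELSE 'not_fraud' END as prediction
```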
arthur_common/aggregations/functions/inference_null_count.py:

@@ -19,7 +19,7 @@ from arthur_common.models.schema_definitions import (
     ScalarType,
     ScopeSchemaTag,
 )
-from arthur_common.tools.duckdb_data_loader import
+from arthur_common.tools.duckdb_data_loader import unescape_identifier
 
 
 class InferenceNullCountAggregationFunction(NumericAggregationFunction):
@@ -90,30 +90,25 @@ class InferenceNullCountAggregationFunction(NumericAggregationFunction):
         ] = None,
     ) -> list[NumericMetric]:
         """Executed SQL with no segmentation columns:
-        select time_bucket(INTERVAL '5 minutes', {
+        select time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts, \
             count(*) as count \
-        from {dataset.dataset_table_name} where {
+        from {dataset.dataset_table_name} where {nullable_col} is null \
         group by ts \
         """
         segmentation_cols = [] if not segmentation_cols else segmentation_cols
-        escaped_timestamp_col = escape_identifier(timestamp_col)
-        escaped_nullable_col = escape_identifier(nullable_col)
 
         # build query components with segmentation columns
-        escaped_segmentation_cols = [
-            escape_identifier(col) for col in segmentation_cols
-        ]
         all_select_clause_cols = [
-            f"time_bucket(INTERVAL '5 minutes', {
+            f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
             f"count(*) as count",
-        ] +
-        all_group_by_cols = ["ts"] +
+        ] + segmentation_cols
+        all_group_by_cols = ["ts"] + segmentation_cols
 
         # build query
         count_query = f"""
             select {", ".join(all_select_clause_cols)}
             from {dataset.dataset_table_name}
-            where {
+            where {nullable_col} is null
             group by {", ".join(all_group_by_cols)}
         """
 
@@ -127,7 +122,9 @@ class InferenceNullCountAggregationFunction(NumericAggregationFunction):
         )
         # preserve dimension that identifies the name of the nullable column used for the aggregation
         for point in series:
-            point.dimensions.append(
+            point.dimensions.append(
+                Dimension(name="column_name", value=unescape_identifier(nullable_col)),
+            )
 
         metric = self.series_to_metric(self.METRIC_NAME, series)
         return [metric]
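The change above attaches the nullable column's plain (unescaped) name to every point as a dimension. A stand-in illustration, since the Dimension model's import path is not shown in this diff:

```python
from dataclasses import dataclass


@dataclass
class Dimension:  # hypothetical stand-in for the arthur_common model
    name: str
    value: str


# For a made-up, already-escaped nullable_col of '"age"', unescape_identifier
# (as assumed earlier) would yield the plain name "age".
dim = Dimension(name="column_name", value="age")
print(dim)  # Dimension(name='column_name', value='age')
```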
arthur_common/aggregations/functions/mean_absolute_error.py:

@@ -4,7 +4,7 @@ from uuid import UUID
 from duckdb import DuckDBPyConnection
 
 from arthur_common.aggregations.aggregator import NumericAggregationFunction
-from arthur_common.models.
+from arthur_common.models.enums import ModelProblemType
 from arthur_common.models.metrics import (
     BaseReportedAggregation,
     DatasetReference,
@@ -19,7 +19,6 @@ from arthur_common.models.schema_definitions import (
     ScalarType,
     ScopeSchemaTag,
 )
-from arthur_common.tools.duckdb_data_loader import escape_identifier
 
 
 class MeanAbsoluteErrorAggregationFunction(NumericAggregationFunction):
@@ -111,40 +110,35 @@ class MeanAbsoluteErrorAggregationFunction(NumericAggregationFunction):
         ] = None,
     ) -> list[NumericMetric]:
         """Executed SQL with no segmentation columns:
-        SELECT time_bucket(INTERVAL '5 minutes', {
-            SUM(ABS({
+        SELECT time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts, \
+            SUM(ABS({prediction_col} - {ground_truth_col})) as ae, \
             COUNT(*) as count \
         FROM {dataset.dataset_table_name} \
-        WHERE {
-            AND {
+        WHERE {prediction_col} IS NOT NULL \
+            AND {ground_truth_col} IS NOT NULL \
         GROUP BY ts order by ts desc \
         """
         segmentation_cols = [] if not segmentation_cols else segmentation_cols
-        escaped_timestamp_col = escape_identifier(timestamp_col)
-        escaped_prediction_col = escape_identifier(prediction_col)
-        escaped_ground_truth_col = escape_identifier(ground_truth_col)
 
         # build query components with segmentation columns
-        escaped_segmentation_cols = [
-            escape_identifier(col) for col in segmentation_cols
-        ]
         all_select_clause_cols = [
-            f"time_bucket(INTERVAL '5 minutes', {
-            f"SUM(ABS({
+            f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
+            f"SUM(ABS({prediction_col} - {ground_truth_col})) as ae",
             f"COUNT(*) as count",
-        ] +
-        all_group_by_cols = ["ts"] +
+        ] + segmentation_cols
+        all_group_by_cols = ["ts"] + segmentation_cols
 
         # build query
         mae_query = f"""
             SELECT {", ".join(all_select_clause_cols)}
             FROM {dataset.dataset_table_name}
-            WHERE {
-            AND {
+            WHERE {prediction_col} IS NOT NULL
+            AND {ground_truth_col} IS NOT NULL
             GROUP BY {", ".join(all_group_by_cols)} order by ts desc
         """
 
         results = ddb_conn.sql(mae_query).df()
+
         count_series = self.group_query_results_to_numeric_metrics(
             results,
             "count",
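Note that the query above reports per-bucket sums (ae) and row counts rather than the mean itself; the mean absolute error for a bucket is simply ae / count, which downstream consumers (not shown in this diff) can derive. A trivial worked example with made-up numbers:

```python
# Made-up bucket values purely to show the arithmetic.
bucket = {"ts": "2024-01-01 00:00:00", "ae": 12.5, "count": 50}
mae = bucket["ae"] / bucket["count"]
print(mae)  # 0.25
```

The mean squared error aggregation in the next file follows the same pattern, reporting SUM(POW(prediction - ground_truth, 2)) as squared_error alongside the count.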
arthur_common/aggregations/functions/mean_squared_error.py:

@@ -4,7 +4,7 @@ from uuid import UUID
 from duckdb import DuckDBPyConnection
 
 from arthur_common.aggregations.aggregator import NumericAggregationFunction
-from arthur_common.models.
+from arthur_common.models.enums import ModelProblemType
 from arthur_common.models.metrics import (
     BaseReportedAggregation,
     DatasetReference,
@@ -19,7 +19,6 @@ from arthur_common.models.schema_definitions import (
     ScalarType,
     ScopeSchemaTag,
 )
-from arthur_common.tools.duckdb_data_loader import escape_identifier
 
 
 class MeanSquaredErrorAggregationFunction(NumericAggregationFunction):
@@ -111,40 +110,35 @@ class MeanSquaredErrorAggregationFunction(NumericAggregationFunction):
         ] = None,
     ) -> list[NumericMetric]:
         """Executed SQL with no segmentation columns:
-        SELECT time_bucket(INTERVAL '5 minutes', {
-            SUM(POW({
+        SELECT time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts, \
+            SUM(POW({prediction_col} - {ground_truth_col}, 2)) as squared_error, \
             COUNT(*) as count \
         FROM {dataset.dataset_table_name} \
-        WHERE {
-            AND {
+        WHERE {prediction_col} IS NOT NULL \
+            AND {ground_truth_col} IS NOT NULL \
         GROUP BY ts order by ts desc \
         """
         segmentation_cols = [] if not segmentation_cols else segmentation_cols
-        escaped_timestamp_col = escape_identifier(timestamp_col)
-        escaped_prediction_col = escape_identifier(prediction_col)
-        escaped_ground_truth_col = escape_identifier(ground_truth_col)
 
         # build query components with segmentation columns
-        escaped_segmentation_cols = [
-            escape_identifier(col) for col in segmentation_cols
-        ]
         all_select_clause_cols = [
-            f"time_bucket(INTERVAL '5 minutes', {
-            f"SUM(POW({
+            f"time_bucket(INTERVAL '5 minutes', {timestamp_col}) as ts",
+            f"SUM(POW({prediction_col} - {ground_truth_col}, 2)) as squared_error",
             f"COUNT(*) as count",
-        ] +
-        all_group_by_cols = ["ts"] +
+        ] + segmentation_cols
+        all_group_by_cols = ["ts"] + segmentation_cols
 
         # build query
         mse_query = f"""
             SELECT {", ".join(all_select_clause_cols)}
             FROM {dataset.dataset_table_name}
-            WHERE {
-            AND {
+            WHERE {prediction_col} IS NOT NULL
+            AND {ground_truth_col} IS NOT NULL
             GROUP BY {", ".join(all_group_by_cols)} order by ts desc
         """
 
         results = ddb_conn.sql(mse_query).df()
+
         count_series = self.group_query_results_to_numeric_metrics(
             results,
             "count",
arthur_common/aggregations/functions/multiclass_confusion_matrix.py:

@@ -4,7 +4,7 @@ from uuid import UUID
 from duckdb import DuckDBPyConnection
 
 from arthur_common.aggregations.aggregator import NumericAggregationFunction
-from arthur_common.models.
+from arthur_common.models.enums import ModelProblemType
 from arthur_common.models.metrics import (
     BaseReportedAggregation,
     DatasetReference,
@@ -20,7 +20,7 @@ from arthur_common.models.schema_definitions import (
     ScalarType,
     ScopeSchemaTag,
 )
-from arthur_common.tools.duckdb_data_loader import
+from arthur_common.tools.duckdb_data_loader import escape_str_literal
 
 
 class MulticlassClassifierStringLabelSingleClassConfusionMatrixAggregationFunction(
@@ -194,11 +194,11 @@ class MulticlassClassifierStringLabelSingleClassConfusionMatrixAggregationFunction
         Returns the following SQL with no segmentation:
         WITH normalized_data AS (
             SELECT
-                {
-                {prediction_normalization_case.replace('value',
-                {gt_normalization_case.replace('value',
+                {timestamp_col} AS timestamp,
+                {prediction_normalization_case.replace('value', prediction_col)} AS prediction,
+                {gt_normalization_case.replace('value', gt_values_col)} AS actual_value
             FROM {dataset.dataset_table_name}
-            WHERE {
+            WHERE {timestamp_col} IS NOT NULL
         )
         SELECT
             time_bucket(INTERVAL '5 minutes', timestamp) AS ts,
@@ -212,19 +212,12 @@ class MulticlassClassifierStringLabelSingleClassConfusionMatrixAggregationFunction
         ORDER BY ts
 
         """
-        escaped_timestamp_col = escape_identifier(timestamp_col)
-        escaped_prediction_col = escape_identifier(prediction_col)
-        escaped_gt_values_col = escape_identifier(gt_values_col)
-
         # build query components with segmentation columns
-        escaped_segmentation_cols = [
-            escape_identifier(col) for col in segmentation_cols
-        ]
         first_subquery_select_cols = [
-            f"{
-            f"{prediction_normalization_case.replace('value',
-            f"{gt_normalization_case.replace('value',
-        ] +
+            f"{timestamp_col} AS timestamp",
+            f"{prediction_normalization_case.replace('value', prediction_col)} AS prediction",
+            f"{gt_normalization_case.replace('value', gt_values_col)} AS actual_value",
+        ] + segmentation_cols
         second_subquery_select_cols = [
             "time_bucket(INTERVAL '5 minutes', timestamp) AS ts",
             "SUM(CASE WHEN prediction = 1 AND actual_value = 1 THEN 1 ELSE 0 END) AS true_positive_count",
@@ -232,8 +225,8 @@ class MulticlassClassifierStringLabelSingleClassConfusionMatrixAggregationFunction
             "SUM(CASE WHEN prediction = 0 AND actual_value = 1 THEN 1 ELSE 0 END) AS false_negative_count",
             "SUM(CASE WHEN prediction = 0 AND actual_value = 0 THEN 1 ELSE 0 END) AS true_negative_count",
             f"any_value({escaped_positive_class_label}) as class_label",
-        ] +
-        second_subquery_group_by_cols = ["ts"] +
+        ] + segmentation_cols
+        second_subquery_group_by_cols = ["ts"] + segmentation_cols
         extra_dims = ["class_label"]
 
         # build query
@@ -241,7 +234,7 @@ class MulticlassClassifierStringLabelSingleClassConfusionMatrixAggregationFunction
         WITH normalized_data AS (
             SELECT {", ".join(first_subquery_select_cols)}
             FROM {dataset.dataset_table_name}
-            WHERE {
+            WHERE {timestamp_col} IS NOT NULL
         )
         SELECT {", ".join(second_subquery_select_cols)}
         FROM normalized_data
arthur_common/aggregations/functions/numeric_stats.py:

@@ -6,7 +6,7 @@ from duckdb import DuckDBPyConnection
 from arthur_common.aggregations.functions.inference_count_by_class import (
     BinaryClassifierCountByClassAggregationFunction,
 )
-from arthur_common.models.
+from arthur_common.models.enums import ModelProblemType
 from arthur_common.models.metrics import (
     BaseReportedAggregation,
     DatasetReference,
@@ -18,7 +18,10 @@ from arthur_common.models.schema_definitions import (
     ScalarType,
     ScopeSchemaTag,
 )
-from arthur_common.tools.duckdb_data_loader import
+from arthur_common.tools.duckdb_data_loader import (
+    escape_str_literal,
+    unescape_identifier,
+)
 
 
 class NumericSketchAggregationFunction(SketchAggregationFunction):
class NumericSketchAggregationFunction(SketchAggregationFunction):
|
|
@@ -95,40 +98,35 @@ class NumericSketchAggregationFunction(SketchAggregationFunction):
|
|
|
95
98
|
] = None,
|
|
96
99
|
) -> list[SketchMetric]:
|
|
97
100
|
"""Executed SQL with no segmentation columns:
|
|
98
|
-
select {
|
|
99
|
-
{
|
|
101
|
+
select {timestamp_col} as ts, \
|
|
102
|
+
{numeric_col}, \
|
|
100
103
|
{numeric_col_name_str} as column_name \
|
|
101
104
|
from {dataset.dataset_table_name} \
|
|
102
|
-
where {
|
|
105
|
+
where {numeric_col} is not null \
|
|
103
106
|
"""
|
|
104
107
|
segmentation_cols = [] if not segmentation_cols else segmentation_cols
|
|
105
|
-
|
|
106
|
-
escaped_numeric_col_id = escape_identifier(numeric_col)
|
|
107
|
-
numeric_col_name_str = escape_str_literal(numeric_col)
|
|
108
|
+
numeric_col_name_str = escape_str_literal(unescape_identifier(numeric_col))
|
|
108
109
|
|
|
109
110
|
# build query components with segmentation columns
|
|
110
|
-
escaped_segmentation_cols = [
|
|
111
|
-
escape_identifier(col) for col in segmentation_cols
|
|
112
|
-
]
|
|
113
111
|
all_select_clause_cols = [
|
|
114
|
-
f"{
|
|
115
|
-
f"{
|
|
112
|
+
f"{timestamp_col} as ts",
|
|
113
|
+
f"{numeric_col}",
|
|
116
114
|
f"{numeric_col_name_str} as column_name",
|
|
117
|
-
] +
|
|
115
|
+
] + segmentation_cols
|
|
118
116
|
extra_dims = ["column_name"]
|
|
119
117
|
|
|
120
118
|
# build query
|
|
121
119
|
data_query = f"""
|
|
122
120
|
select {", ".join(all_select_clause_cols)}
|
|
123
121
|
from {dataset.dataset_table_name}
|
|
124
|
-
where {
|
|
122
|
+
where {numeric_col} is not null
|
|
125
123
|
"""
|
|
126
124
|
|
|
127
125
|
results = ddb_conn.sql(data_query).df()
|
|
128
126
|
|
|
129
127
|
series = self.group_query_results_to_sketch_metrics(
|
|
130
128
|
results,
|
|
131
|
-
numeric_col,
|
|
129
|
+
unescape_identifier(numeric_col),
|
|
132
130
|
segmentation_cols + extra_dims,
|
|
133
131
|
"ts",
|
|
134
132
|
)
|