arthur-common 1.0.1__py3-none-any.whl → 2.1.48__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only and reflects the package contents as they appear in that registry.
Potentially problematic release: this version of arthur-common might be problematic.
- arthur_common/aggregations/aggregator.py +10 -1
- arthur_common/aggregations/functions/categorical_count.py +51 -11
- arthur_common/aggregations/functions/confusion_matrix.py +122 -28
- arthur_common/aggregations/functions/inference_count.py +46 -9
- arthur_common/aggregations/functions/inference_count_by_class.py +101 -24
- arthur_common/aggregations/functions/inference_null_count.py +50 -10
- arthur_common/aggregations/functions/mean_absolute_error.py +55 -15
- arthur_common/aggregations/functions/mean_squared_error.py +55 -15
- arthur_common/aggregations/functions/multiclass_confusion_matrix.py +78 -24
- arthur_common/aggregations/functions/multiclass_inference_count_by_class.py +19 -1
- arthur_common/aggregations/functions/numeric_stats.py +46 -9
- arthur_common/aggregations/functions/numeric_sum.py +52 -12
- arthur_common/models/connectors.py +6 -1
- arthur_common/models/metrics.py +5 -9
- arthur_common/models/schema_definitions.py +2 -0
- arthur_common/tools/aggregation_analyzer.py +31 -1
- arthur_common/tools/duckdb_data_loader.py +1 -1
- {arthur_common-1.0.1.dist-info → arthur_common-2.1.48.dist-info}/METADATA +1 -4
- {arthur_common-1.0.1.dist-info → arthur_common-2.1.48.dist-info}/RECORD +20 -21
- arthur_common/__version__.py +0 -1
- {arthur_common-1.0.1.dist-info → arthur_common-2.1.48.dist-info}/WHEEL +0 -0

arthur_common/aggregations/aggregator.py

@@ -73,6 +73,15 @@ class NumericAggregationFunction(AggregationFunction, ABC):
         From there, iterate over the group turning each data point to a *Point. At the end, this single instance of the group metrics
         and the list of points (values) are merged to one *TimeSeries
         """
+        if not dim_columns:
+            return [
+                NumericAggregationFunction._dimensionless_query_results_to_numeric_metrics(
+                    data,
+                    value_col,
+                    timestamp_col,
+                ),
+            ]
+
         calculated_metrics: list[NumericTimeSeries] = []
         # make sure dropna is False or rows with "null" as a dimension value will be dropped
         groups = data.groupby(dim_columns, dropna=False)
@@ -99,7 +108,7 @@ class NumericAggregationFunction(AggregationFunction, ABC):
         return calculated_metrics

     @staticmethod
-    def
+    def _dimensionless_query_results_to_numeric_metrics(
         data: pd.DataFrame,
         value_col: str,
         timestamp_col: str,
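The net effect of the aggregator change: when no dimension columns are passed, query results are now routed through the new `_dimensionless_query_results_to_numeric_metrics` helper and collapsed into a single series instead of going through `groupby`. A minimal sketch of that routing in plain pandas, with invented return shapes rather than the library's `NumericTimeSeries` types:

```python
# Illustrative only: mirrors the routing logic, not arthur_common's real code.
import pandas as pd

def rows_to_series(data: pd.DataFrame, value_col: str, dim_columns: list[str], timestamp_col: str):
    if not dim_columns:
        # dimensionless: one series holding every (timestamp, value) point
        return [list(zip(data[timestamp_col], data[value_col]))]
    series = []
    # dropna=False keeps groups whose dimension value is null (as the diff's comment notes)
    for dims, group in data.groupby(dim_columns, dropna=False):
        series.append((dims, list(zip(group[timestamp_col], group[value_col]))))
    return series

df = pd.DataFrame({"ts": ["00:00", "00:05", "00:00"], "count": [3, 5, 2], "category": ["a", "a", "b"]})
print(rows_to_series(df, "count", [], "ts"))            # single dimensionless series
print(rows_to_series(df, "count", ["category"], "ts"))  # one series per category value
```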
arthur_common/aggregations/functions/categorical_count.py

@@ -1,4 +1,4 @@
-from typing import Annotated
+from typing import Annotated, Optional
 from uuid import UUID

 from arthur_common.aggregations.aggregator import NumericAggregationFunction
@@ -7,6 +7,7 @@ from arthur_common.models.schema_definitions import (
     DType,
     MetricColumnParameterAnnotation,
     MetricDatasetParameterAnnotation,
+    MetricMultipleColumnParameterAnnotation,
     ScalarType,
     ScopeSchemaTag,
 )
@@ -64,25 +65,64 @@ class CategoricalCountAggregationFunction(NumericAggregationFunction):
                 description="A column containing categorical values to count.",
             ),
         ],
+        segmentation_cols: Annotated[
+            Optional[list[str]],
+            MetricMultipleColumnParameterAnnotation(
+                source_dataset_parameter_key="dataset",
+                allowed_column_types=[
+                    ScalarType(dtype=DType.INT),
+                    ScalarType(dtype=DType.BOOL),
+                    ScalarType(dtype=DType.STRING),
+                    ScalarType(dtype=DType.UUID),
+                ],
+                tag_hints=[],
+                friendly_name="Segmentation Columns",
+                description="All columns to include as dimensions for segmentation.",
+                optional=True,
+            ),
+        ] = None,
     ) -> list[NumericMetric]:
+        """Executed SQL with no segmentation columns:
+        select time_bucket(INTERVAL '5 minutes', {timestamp_col_escaped}) as ts, \
+            count(*) as count, \
+            {categorical_col_escaped} as category, \
+            {categorical_col_name_escaped} as column_name \
+        from {dataset.dataset_table_name} \
+        where ts is not null \
+        group by ts, category
+        """
+        segmentation_cols = [] if not segmentation_cols else segmentation_cols
         timestamp_col_escaped = escape_identifier(timestamp_col)
         categorical_col_escaped = escape_identifier(categorical_col)
         categorical_col_name_escaped = escape_str_literal(categorical_col)
-
-
-
-
-
-
-
-
-
+
+        # build query components with segmentation columns
+        escaped_segmentation_cols = [
+            escape_identifier(col) for col in segmentation_cols
+        ]
+        all_select_clause_cols = [
+            f"time_bucket(INTERVAL '5 minutes', {timestamp_col_escaped}) as ts",
+            f"count(*) as count",
+            f"{categorical_col_escaped} as category",
+            f"{categorical_col_name_escaped} as column_name",
+        ] + escaped_segmentation_cols
+        all_group_by_cols = ["ts", "category"] + escaped_segmentation_cols
+        extra_dims = ["column_name", "category"]
+
+        # build query
+        count_query = f"""
+            select {", ".join(all_select_clause_cols)}
+            from {dataset.dataset_table_name}
+            where ts is not null
+            group by {", ".join(all_group_by_cols)}
+        """
+
         results = ddb_conn.sql(count_query).df()

         series = self.group_query_results_to_numeric_metrics(
             results,
             "count",
-
+            segmentation_cols + extra_dims,
             timestamp_col="ts",
         )
         metric = self.series_to_metric(self.METRIC_NAME, series)
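This file shows the full segmentation pattern that the rest of the release repeats: escape each segmentation column, append the escaped names to both the SELECT and GROUP BY lists, then pass the raw names plus the built-in dimensions (`column_name`, `category`) as the dimension columns. A sketch of the query assembly with stand-in quoting helpers (the real ones are `escape_identifier` and `escape_str_literal` from `duckdb_data_loader`):

```python
# Stand-in quoting helpers; assumptions, not the library's implementations.
def esc_ident(col: str) -> str:          # double-quote an identifier
    return '"' + col.replace('"', '""') + '"'

def esc_literal(s: str) -> str:          # single-quote a string literal
    return "'" + s.replace("'", "''") + "'"

def build_categorical_count_query(timestamp_col, categorical_col, segmentation_cols, table):
    seg = [esc_ident(c) for c in segmentation_cols or []]
    select_cols = [
        f"time_bucket(INTERVAL '5 minutes', {esc_ident(timestamp_col)}) as ts",
        "count(*) as count",
        f"{esc_ident(categorical_col)} as category",
        f"{esc_literal(categorical_col)} as column_name",
    ] + seg
    group_by = ["ts", "category"] + seg
    return (
        f"select {', '.join(select_cols)} from {table} "
        f"where ts is not null group by {', '.join(group_by)}"
    )

print(build_categorical_count_query("ts_col", "label", ["region"], "inferences"))
```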
arthur_common/aggregations/functions/confusion_matrix.py

@@ -1,4 +1,4 @@
-from typing import Annotated
+from typing import Annotated, Optional
 from uuid import UUID

 from arthur_common.aggregations.aggregator import NumericAggregationFunction
@@ -9,10 +9,11 @@ from arthur_common.models.schema_definitions import (
     MetricColumnParameterAnnotation,
     MetricDatasetParameterAnnotation,
     MetricLiteralParameterAnnotation,
+    MetricMultipleColumnParameterAnnotation,
     ScalarType,
     ScopeSchemaTag,
 )
-from arthur_common.tools.duckdb_data_loader import escape_identifier
+from arthur_common.tools.duckdb_data_loader import escape_identifier, escape_str_literal
 from duckdb import DuckDBPyConnection


@@ -26,6 +27,7 @@ class ConfusionMatrixAggregationFunction(NumericAggregationFunction):
         prediction_normalization_case: str,
         gt_normalization_case: str,
         dataset: DatasetReference,
+        segmentation_cols: list[str],
     ) -> list[NumericMetric]:
         """
         Generate a SQL query to compute confusion matrix metrics over time.
@@ -37,59 +39,98 @@ class ConfusionMatrixAggregationFunction(NumericAggregationFunction):
             prediction_normalization_case: SQL CASE statement for normalizing predictions to 0 / 1 / null using 'value' as the target column name
             gt_normalization_case: SQL CASE statement for normalizing ground truth values to 0 / 1 / null using 'value' as the target column name
             dataset: DatasetReference containing dataset metadata
+            segmentation_cols: list of columns to segment by

         Returns:
             str: SQL query that computes confusion matrix metrics
+        Without segmentation, this is the query:
+            WITH normalized_data AS (
+                SELECT
+                    {escaped_timestamp_col} AS timestamp,
+                    {prediction_normalization_case.replace('value', escaped_prediction_col)} AS prediction,
+                    {gt_normalization_case.replace('value', escaped_gt_values_col)} AS actual_value
+                FROM {dataset.dataset_table_name}
+                WHERE {escaped_timestamp_col} IS NOT NULL
+            )
+            SELECT
+                time_bucket(INTERVAL '5 minutes', timestamp) AS ts,
+                SUM(CASE WHEN prediction = actual_value AND actual_value = 1 THEN 1 ELSE 0 END) AS true_positive_count,
+                SUM(CASE WHEN prediction != actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS false_positive_count,
+                SUM(CASE WHEN prediction != actual_value AND actual_value = 1 THEN 1 ELSE 0 END) AS false_negative_count,
+                SUM(CASE WHEN prediction = actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS true_negative_count,
+                {escaped_prediction_col_name} as prediction_column_name
+            FROM normalized_data
+            GROUP BY ts
+            ORDER BY ts
         """
+        segmentation_cols = [] if not segmentation_cols else segmentation_cols
         escaped_timestamp_col = escape_identifier(timestamp_col)
         escaped_prediction_col = escape_identifier(prediction_col)
+        escaped_prediction_col_name = escape_str_literal(prediction_col)
         escaped_gt_values_col = escape_identifier(gt_values_col)
+        # build query components with segmentation columns
+        escaped_segmentation_cols = [
+            escape_identifier(col) for col in segmentation_cols
+        ]
+        first_subquery_select_cols = [
+            f"{escaped_timestamp_col} AS timestamp",
+            f"{prediction_normalization_case.replace('value', escaped_prediction_col)} AS prediction",
+            f"{gt_normalization_case.replace('value', escaped_gt_values_col)} AS actual_value",
+        ] + escaped_segmentation_cols
+        second_subquery_select_cols = [
+            "time_bucket(INTERVAL '5 minutes', timestamp) AS ts",
+            "SUM(CASE WHEN prediction = actual_value AND actual_value = 1 THEN 1 ELSE 0 END) AS true_positive_count",
+            "SUM(CASE WHEN prediction != actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS false_positive_count",
+            "SUM(CASE WHEN prediction != actual_value AND actual_value = 1 THEN 1 ELSE 0 END) AS false_negative_count",
+            "SUM(CASE WHEN prediction = actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS true_negative_count",
+            f"{escaped_prediction_col_name} as prediction_column_name",
+        ] + escaped_segmentation_cols
+        second_subquery_group_by_cols = ["ts"] + escaped_segmentation_cols
+        extra_dims = ["prediction_column_name"]
+
+        # build query
         confusion_matrix_query = f"""
-
-
-            {
-            {
-
-
-
-
-
-
-            SUM(CASE WHEN prediction = actual_value AND actual_value = 1 THEN 1 ELSE 0 END) AS true_positive_count,
-            SUM(CASE WHEN prediction != actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS false_positive_count,
-            SUM(CASE WHEN prediction != actual_value AND actual_value = 1 THEN 1 ELSE 0 END) AS false_negative_count,
-            SUM(CASE WHEN prediction = actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS true_negative_count
-            FROM normalized_data
-            GROUP BY ts
-            ORDER BY ts
-        """
+            WITH normalized_data AS (
+                SELECT {", ".join(first_subquery_select_cols)}
+                FROM {dataset.dataset_table_name}
+                WHERE {escaped_timestamp_col} IS NOT NULL
+            )
+            SELECT {", ".join(second_subquery_select_cols)}
+            FROM normalized_data
+            GROUP BY {", ".join(second_subquery_group_by_cols)}
+            ORDER BY ts
+        """

         results = ddb_conn.sql(confusion_matrix_query).df()

-        tp = self.
+        tp = self.group_query_results_to_numeric_metrics(
             results,
             "true_positive_count",
+            dim_columns=segmentation_cols + extra_dims,
             timestamp_col="ts",
         )
-        fp = self.
+        fp = self.group_query_results_to_numeric_metrics(
             results,
             "false_positive_count",
+            dim_columns=segmentation_cols + extra_dims,
             timestamp_col="ts",
         )
-        fn = self.
+        fn = self.group_query_results_to_numeric_metrics(
             results,
             "false_negative_count",
+            dim_columns=segmentation_cols + extra_dims,
             timestamp_col="ts",
         )
-        tn = self.
+        tn = self.group_query_results_to_numeric_metrics(
             results,
             "true_negative_count",
+            dim_columns=segmentation_cols + extra_dims,
             timestamp_col="ts",
         )
-        tp_metric = self.series_to_metric("confusion_matrix_true_positive_count",
-        fp_metric = self.series_to_metric("confusion_matrix_false_positive_count",
-        fn_metric = self.series_to_metric("confusion_matrix_false_negative_count",
-        tn_metric = self.series_to_metric("confusion_matrix_true_negative_count",
+        tp_metric = self.series_to_metric("confusion_matrix_true_positive_count", tp)
+        fp_metric = self.series_to_metric("confusion_matrix_false_positive_count", fp)
+        fn_metric = self.series_to_metric("confusion_matrix_false_negative_count", fn)
+        tn_metric = self.series_to_metric("confusion_matrix_true_negative_count", tn)
         return [tp_metric, fp_metric, fn_metric, tn_metric]

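The rebuilt confusion-matrix query keeps its two-stage shape: a CTE first normalizes predictions and ground truth to 0/1 through caller-supplied CASE expressions (with the literal word `value` as the placeholder column name), then an outer query buckets by time and sums the four cells. A runnable toy version against an in-memory DuckDB table; the table, columns, and labels here are invented:

```python
import duckdb

con = duckdb.connect()
con.sql("""
    CREATE TABLE inferences AS SELECT * FROM (VALUES
        (TIMESTAMP '2024-01-01 00:01:00', 'spam', 'spam'),
        (TIMESTAMP '2024-01-01 00:02:00', 'spam', 'ham'),
        (TIMESTAMP '2024-01-01 00:03:00', 'ham', 'spam')
    ) AS t(ts, pred, gt)
""")
# normalization CASE with 'value' as the placeholder, as in the diff
case = "CASE WHEN value = 'spam' THEN 1 WHEN value = 'ham' THEN 0 ELSE NULL END"
query = f"""
    WITH normalized_data AS (
        SELECT ts AS timestamp,
               {case.replace('value', 'pred')} AS prediction,
               {case.replace('value', 'gt')} AS actual_value
        FROM inferences
        WHERE ts IS NOT NULL
    )
    SELECT time_bucket(INTERVAL '5 minutes', timestamp) AS ts,
           SUM(CASE WHEN prediction = actual_value AND actual_value = 1 THEN 1 ELSE 0 END) AS true_positive_count,
           SUM(CASE WHEN prediction != actual_value AND actual_value = 0 THEN 1 ELSE 0 END) AS false_positive_count
    FROM normalized_data
    GROUP BY ts
    ORDER BY ts
"""
print(con.sql(query).df())  # one row per 5-minute bucket with TP/FP counts
```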
@@ -157,7 +198,24 @@ class BinaryClassifierIntBoolConfusionMatrixAggregationFunction(
                 description="A column containing boolean or integer ground truth values.",
             ),
         ],
+        segmentation_cols: Annotated[
+            Optional[list[str]],
+            MetricMultipleColumnParameterAnnotation(
+                source_dataset_parameter_key="dataset",
+                allowed_column_types=[
+                    ScalarType(dtype=DType.INT),
+                    ScalarType(dtype=DType.BOOL),
+                    ScalarType(dtype=DType.STRING),
+                    ScalarType(dtype=DType.UUID),
+                ],
+                tag_hints=[],
+                friendly_name="Segmentation Columns",
+                description="All columns to include as dimensions for segmentation.",
+                optional=True,
+            ),
+        ] = None,
     ) -> list[NumericMetric]:
+        segmentation_cols = [] if not segmentation_cols else segmentation_cols
         escaped_prediction_col = escape_identifier(prediction_col)
         # Get the type of prediction column
         type_query = f"SELECT typeof({escaped_prediction_col}) as col_type FROM {dataset.dataset_table_name} LIMIT 1"
@@ -194,6 +252,7 @@ class BinaryClassifierIntBoolConfusionMatrixAggregationFunction(
             normalization_case,
             normalization_case,
             dataset,
+            segmentation_cols,
         )


@@ -275,7 +334,24 @@ class BinaryClassifierStringLabelConfusionMatrixAggregationFunction(
                 description="The label indicating a negative classification to normalize to 0.",
             ),
         ],
+        segmentation_cols: Annotated[
+            Optional[list[str]],
+            MetricMultipleColumnParameterAnnotation(
+                source_dataset_parameter_key="dataset",
+                allowed_column_types=[
+                    ScalarType(dtype=DType.INT),
+                    ScalarType(dtype=DType.BOOL),
+                    ScalarType(dtype=DType.STRING),
+                    ScalarType(dtype=DType.UUID),
+                ],
+                tag_hints=[],
+                friendly_name="Segmentation Columns",
+                description="All columns to include as dimensions for segmentation.",
+                optional=True,
+            ),
+        ] = None,
     ) -> list[NumericMetric]:
+        segmentation_cols = [] if not segmentation_cols else segmentation_cols
         normalization_case = f"""
             CASE
                 WHEN value = '{true_label}' THEN 1
@@ -291,6 +367,7 @@ class BinaryClassifierStringLabelConfusionMatrixAggregationFunction(
             normalization_case,
             normalization_case,
             dataset,
+            segmentation_cols,
         )


@@ -365,6 +442,22 @@ class BinaryClassifierProbabilityThresholdConfusionMatrixAggregationFunction(
                 description="The threshold to classify predictions to 0 or 1.",
             ),
         ],
+        segmentation_cols: Annotated[
+            Optional[list[str]],
+            MetricMultipleColumnParameterAnnotation(
+                source_dataset_parameter_key="dataset",
+                allowed_column_types=[
+                    ScalarType(dtype=DType.INT),
+                    ScalarType(dtype=DType.BOOL),
+                    ScalarType(dtype=DType.STRING),
+                    ScalarType(dtype=DType.UUID),
+                ],
+                tag_hints=[],
+                friendly_name="Segmentation Columns",
+                description="All columns to include as dimensions for segmentation.",
+                optional=True,
+            ),
+        ] = None,
     ) -> list[NumericMetric]:
         escaped_gt_values_col = escape_identifier(gt_values_col)
         prediction_normalization_case = f"""
@@ -409,4 +502,5 @@ class BinaryClassifierProbabilityThresholdConfusionMatrixAggregationFunction(
             prediction_normalization_case,
             gt_normalization_case,
             dataset,
+            segmentation_cols,
         )
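Each of the three subclasses gains an identical 16-line `segmentation_cols` parameter: an `Optional[list[str]]` wrapped in `typing.Annotated` with a `MetricMultipleColumnParameterAnnotation` carrying the allowed column types and UI metadata, defaulting to `None`. Presumably the updated `aggregation_analyzer.py` (also touched in this release) discovers parameters by introspecting these annotations. A sketch of how such introspection can work, using a stand-in dataclass for the real annotation type:

```python
from dataclasses import dataclass
from typing import Annotated, Optional, get_args, get_origin, get_type_hints

@dataclass
class MultipleColumnParam:  # stand-in for MetricMultipleColumnParameterAnnotation
    friendly_name: str
    optional: bool = False

def aggregate(
    timestamp_col: str,
    segmentation_cols: Annotated[
        Optional[list[str]],
        MultipleColumnParam(friendly_name="Segmentation Columns", optional=True),
    ] = None,
) -> None: ...

# include_extras=True preserves the Annotated metadata for introspection
for name, hint in get_type_hints(aggregate, include_extras=True).items():
    if get_origin(hint) is Annotated:
        inner_type, *metadata = get_args(hint)
        print(name, "->", metadata[0])
```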
arthur_common/aggregations/functions/inference_count.py

@@ -1,4 +1,4 @@
-from typing import Annotated
+from typing import Annotated, Optional
 from uuid import UUID

 from arthur_common.aggregations.aggregator import NumericAggregationFunction
@@ -7,6 +7,7 @@ from arthur_common.models.schema_definitions import (
     DType,
     MetricColumnParameterAnnotation,
     MetricDatasetParameterAnnotation,
+    MetricMultipleColumnParameterAnnotation,
     ScalarType,
     ScopeSchemaTag,
 )
@@ -51,19 +52,55 @@ class InferenceCountAggregationFunction(NumericAggregationFunction):
                 description="A column containing timestamp values to bucket by.",
             ),
         ],
+        segmentation_cols: Annotated[
+            Optional[list[str]],
+            MetricMultipleColumnParameterAnnotation(
+                source_dataset_parameter_key="dataset",
+                allowed_column_types=[
+                    ScalarType(dtype=DType.INT),
+                    ScalarType(dtype=DType.BOOL),
+                    ScalarType(dtype=DType.STRING),
+                    ScalarType(dtype=DType.UUID),
+                ],
+                tag_hints=[],
+                friendly_name="Segmentation Columns",
+                description="All columns to include as dimensions for segmentation.",
+                optional=True,
+            ),
+        ] = None,
     ) -> list[NumericMetric]:
-
-        count_query = f" \
+        """Executed SQL with no segmentation columns:
         select time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts, \
-
-
-
-            "
+            count(*) as count \
+        from {dataset.dataset_table_name} \
+        group by ts \
+        """
+        segmentation_cols = [] if not segmentation_cols else segmentation_cols
+        escaped_timestamp_col = escape_identifier(timestamp_col)
+
+        # build query components with segmentation columns
+        escaped_segmentation_cols = [
+            escape_identifier(col) for col in segmentation_cols
+        ]
+        all_select_clause_cols = [
+            f"time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts",
+            f"count(*) as count",
+        ] + escaped_segmentation_cols
+        all_group_by_cols = ["ts"] + escaped_segmentation_cols
+
+        # build query
+        count_query = f"""
+            select {", ".join(all_select_clause_cols)}
+            from {dataset.dataset_table_name}
+            group by {", ".join(all_group_by_cols)}
+        """
+
         results = ddb_conn.sql(count_query).df()
-        series = self.
+        series = self.group_query_results_to_numeric_metrics(
             results,
             "count",
+            segmentation_cols,
             "ts",
         )
-        metric = self.series_to_metric(self.METRIC_NAME,
+        metric = self.series_to_metric(self.METRIC_NAME, series)
         return [metric]
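The inference-count function is the simplest instance of the recipe: bucket timestamps, count rows, and group by whatever segmentation columns were chosen. A runnable toy version in DuckDB with an invented table and column names:

```python
import duckdb

con = duckdb.connect()
con.sql("""
    CREATE TABLE inferences AS SELECT * FROM (VALUES
        (TIMESTAMP '2024-01-01 00:01:00', 'us'),
        (TIMESTAMP '2024-01-01 00:02:00', 'us'),
        (TIMESTAMP '2024-01-01 00:03:00', 'eu')
    ) AS t(created_at, region)
""")
segmentation_cols = ["region"]
select_cols = [
    "time_bucket(INTERVAL '5 minutes', created_at) as ts",
    "count(*) as count",
] + segmentation_cols
group_by = ["ts"] + segmentation_cols
query = f"select {', '.join(select_cols)} from inferences group by {', '.join(group_by)}"
print(con.sql(query).df())  # one count per (5-minute bucket, region)
```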
arthur_common/aggregations/functions/inference_count_by_class.py

@@ -1,4 +1,4 @@
-from typing import Annotated
+from typing import Annotated, Optional
 from uuid import UUID

 from arthur_common.aggregations.aggregator import NumericAggregationFunction
@@ -9,6 +9,7 @@ from arthur_common.models.schema_definitions import (
     MetricColumnParameterAnnotation,
     MetricDatasetParameterAnnotation,
     MetricLiteralParameterAnnotation,
+    MetricMultipleColumnParameterAnnotation,
     ScalarType,
     ScopeSchemaTag,
 )
@@ -70,29 +71,66 @@ class BinaryClassifierCountByClassAggregationFunction(NumericAggregationFunction
                 description="A column containing boolean, integer, or string labelled prediction values.",
             ),
         ],
+        segmentation_cols: Annotated[
+            Optional[list[str]],
+            MetricMultipleColumnParameterAnnotation(
+                source_dataset_parameter_key="dataset",
+                allowed_column_types=[
+                    ScalarType(dtype=DType.INT),
+                    ScalarType(dtype=DType.BOOL),
+                    ScalarType(dtype=DType.STRING),
+                    ScalarType(dtype=DType.UUID),
+                ],
+                tag_hints=[],
+                friendly_name="Segmentation Columns",
+                description="All columns to include as dimensions for segmentation.",
+                optional=True,
+            ),
+        ] = None,
     ) -> list[NumericMetric]:
+        """Executed SQL with no segmentation columns:
+        SELECT
+            time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts,
+            {escaped_pred_col} as prediction,
+            COUNT(*) as count
+        FROM {dataset.dataset_table_name}
+        GROUP BY
+            ts,
+            -- group by raw column name instead of alias in select
+            -- in case table has a column called 'prediction'
+            {escaped_pred_col}
+        ORDER BY ts
+        """
+        segmentation_cols = [] if not segmentation_cols else segmentation_cols
         escaped_timestamp_col = escape_identifier(timestamp_col)
         escaped_pred_col = escape_identifier(prediction_col)
+
+        # build query components with segmentation columns
+        escaped_segmentation_cols = [
+            escape_identifier(col) for col in segmentation_cols
+        ]
+        all_select_clause_cols = [
+            f"time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts",
+            f"{escaped_pred_col} as prediction",
+            f"COUNT(*) as count",
+        ] + escaped_segmentation_cols
+        all_group_by_cols = ["ts", f"{escaped_pred_col}"] + escaped_segmentation_cols
+        extra_dims = ["prediction"]
+
+        # build query
         query = f"""
-
-
-
-
-
-            GROUP BY
-            ts,
-            -- group by raw column name instead of alias in select
-            -- in case table has a column called 'prediction'
-            {escaped_pred_col}
-            ORDER BY ts
-        """
+            SELECT {", ".join(all_select_clause_cols)}
+            FROM {dataset.dataset_table_name}
+            GROUP BY {", ".join(all_group_by_cols)}
+            ORDER BY ts
+        """

         result = ddb_conn.sql(query).df()

         series = self.group_query_results_to_numeric_metrics(
             result,
             "count",
-
+            segmentation_cols + extra_dims,
             "ts",
         )
         metric = self.series_to_metric(self._metric_name(), series)
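Note the comment preserved in the rebuilt query: GROUP BY lists the raw (escaped) prediction column rather than its `prediction` alias, because a table column literally named `prediction` would shadow the alias during GROUP BY resolution. A small DuckDB illustration of the safe form, on invented data:

```python
import duckdb

con = duckdb.connect()
# The table deliberately owns a column named 'prediction', the collision the
# diff's comment guards against.
con.sql("""
    CREATE TABLE preds AS SELECT * FROM (VALUES
        ('a', true), ('b', true), ('b', false)
    ) AS t(prediction, label)
""")
print(con.sql("""
    SELECT label AS prediction, COUNT(*) AS count
    FROM preds
    GROUP BY label  -- raw column, never the shadowable alias
""").df())
```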
@@ -177,20 +215,59 @@ class BinaryClassifierCountThresholdClassAggregationFunction(
                 description="The label denoting a negative classification.",
             ),
         ],
+        segmentation_cols: Annotated[
+            Optional[list[str]],
+            MetricMultipleColumnParameterAnnotation(
+                source_dataset_parameter_key="dataset",
+                allowed_column_types=[
+                    ScalarType(dtype=DType.INT),
+                    ScalarType(dtype=DType.BOOL),
+                    ScalarType(dtype=DType.STRING),
+                    ScalarType(dtype=DType.UUID),
+                ],
+                tag_hints=[],
+                friendly_name="Segmentation Columns",
+                description="All columns to include as dimensions for segmentation.",
+                optional=True,
+            ),
+        ] = None,
     ) -> list[NumericMetric]:
+        """Executed SQL with no segmentation columns:
+        SELECT
+            time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts,
+            CASE WHEN {escaped_prediction_col} >= {threshold} THEN '{true_label}' ELSE '{false_label}' END as prediction,
+            COUNT(*) as count
+        FROM {dataset.dataset_table_name}
+        GROUP BY
+            ts,
+            -- group by raw column name instead of alias in select
+            -- in case table has a column called 'prediction'
+            {escaped_prediction_col}
+        ORDER BY ts
+        """
+        segmentation_cols = [] if not segmentation_cols else segmentation_cols
         escaped_timestamp_col = escape_identifier(timestamp_col)
         escaped_prediction_col = escape_identifier(prediction_col)
+
+        # build query components with segmentation columns
+        escaped_segmentation_cols = [
+            escape_identifier(col) for col in segmentation_cols
+        ]
+        all_select_clause_cols = [
+            f"time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts",
+            f"CASE WHEN {escaped_prediction_col} >= {threshold} THEN '{true_label}' ELSE '{false_label}' END as prediction",
+            f"COUNT(*) as count",
+        ] + escaped_segmentation_cols
+        all_group_by_cols = [
+            "ts",
+            f"{escaped_prediction_col}",
+        ] + escaped_segmentation_cols
+        extra_dims = ["prediction"]
+
         query = f"""
-        SELECT
-            time_bucket(INTERVAL '5 minutes', {escaped_timestamp_col}) as ts,
-            CASE WHEN {escaped_prediction_col} >= {threshold} THEN '{true_label}' ELSE '{false_label}' END as prediction,
-            COUNT(*) as count
+            SELECT {", ".join(all_select_clause_cols)}
         FROM {dataset.dataset_table_name}
-        GROUP BY
-            ts,
-            -- group by raw column name instead of alias in select
-            -- in case table has a column called 'prediction'
-            {escaped_prediction_col}
+            GROUP BY {", ".join(all_group_by_cols)}
         ORDER BY ts
         """

@@ -199,7 +276,7 @@ class BinaryClassifierCountThresholdClassAggregationFunction(
         series = self.group_query_results_to_numeric_metrics(
             result,
             "count",
-
+            segmentation_cols + extra_dims,
             "ts",
         )
         metric = self.series_to_metric(self._metric_name(), series)
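The threshold variant folds score-to-label normalization into the SELECT list as a CASE expression before counting. A hedged sketch on made-up data; unlike the diff, it groups by the `prediction` alias (safe here because no table column shadows it):

```python
import duckdb

con = duckdb.connect()
con.sql("""
    CREATE TABLE scores AS SELECT * FROM (VALUES
        (TIMESTAMP '2024-01-01 00:00:30', 0.91, 'eu'),
        (TIMESTAMP '2024-01-01 00:01:30', 0.12, 'us'),
        (TIMESTAMP '2024-01-01 00:02:30', 0.55, 'us')
    ) AS t(created_at, score, region)
""")
threshold, true_label, false_label = 0.5, "positive", "negative"
seg = ["region"]
select_cols = [
    "time_bucket(INTERVAL '5 minutes', created_at) as ts",
    f"CASE WHEN score >= {threshold} THEN '{true_label}' ELSE '{false_label}' END as prediction",
    "count(*) as count",
] + seg
query = f"""
    SELECT {', '.join(select_cols)}
    FROM scores
    GROUP BY {', '.join(['ts', 'prediction'] + seg)}
    ORDER BY ts
"""
print(con.sql(query).df())  # counts per (bucket, class label, region)
```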