arthur-common 2.1.58__py3-none-any.whl → 2.4.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. arthur_common/aggregations/aggregator.py +73 -9
  2. arthur_common/aggregations/functions/agentic_aggregations.py +260 -85
  3. arthur_common/aggregations/functions/categorical_count.py +15 -15
  4. arthur_common/aggregations/functions/confusion_matrix.py +24 -26
  5. arthur_common/aggregations/functions/inference_count.py +5 -9
  6. arthur_common/aggregations/functions/inference_count_by_class.py +16 -27
  7. arthur_common/aggregations/functions/inference_null_count.py +10 -13
  8. arthur_common/aggregations/functions/mean_absolute_error.py +12 -18
  9. arthur_common/aggregations/functions/mean_squared_error.py +12 -18
  10. arthur_common/aggregations/functions/multiclass_confusion_matrix.py +13 -20
  11. arthur_common/aggregations/functions/multiclass_inference_count_by_class.py +1 -1
  12. arthur_common/aggregations/functions/numeric_stats.py +13 -15
  13. arthur_common/aggregations/functions/numeric_sum.py +12 -15
  14. arthur_common/aggregations/functions/shield_aggregations.py +457 -215
  15. arthur_common/models/common_schemas.py +214 -0
  16. arthur_common/models/connectors.py +10 -2
  17. arthur_common/models/constants.py +24 -0
  18. arthur_common/models/datasets.py +0 -9
  19. arthur_common/models/enums.py +177 -0
  20. arthur_common/models/metric_schemas.py +63 -0
  21. arthur_common/models/metrics.py +2 -9
  22. arthur_common/models/request_schemas.py +870 -0
  23. arthur_common/models/response_schemas.py +785 -0
  24. arthur_common/models/schema_definitions.py +6 -1
  25. arthur_common/models/task_job_specs.py +3 -12
  26. arthur_common/tools/duckdb_data_loader.py +34 -2
  27. arthur_common/tools/duckdb_utils.py +3 -6
  28. arthur_common/tools/schema_inferer.py +3 -6
  29. {arthur_common-2.1.58.dist-info → arthur_common-2.4.13.dist-info}/METADATA +12 -4
  30. arthur_common-2.4.13.dist-info/RECORD +49 -0
  31. arthur_common/models/shield.py +0 -642
  32. arthur_common-2.1.58.dist-info/RECORD +0 -44
  33. {arthur_common-2.1.58.dist-info → arthur_common-2.4.13.dist-info}/WHEEL +0 -0
arthur_common/aggregations/functions/shield_aggregations.py
@@ -9,7 +9,7 @@ from arthur_common.aggregations.aggregator import (
     NumericAggregationFunction,
     SketchAggregationFunction,
 )
-from arthur_common.models.datasets import ModelProblemType
+from arthur_common.models.enums import ModelProblemType
 from arthur_common.models.metrics import (
     BaseReportedAggregation,
     DatasetReference,

@@ -25,6 +25,7 @@ from arthur_common.models.schema_definitions import (

 class ShieldInferencePassFailCountAggregation(NumericAggregationFunction):
     METRIC_NAME = "inference_count"
+    FEATURE_FLAG_NAME = "SHIELD_INFERENCE_PASS_FAIL_COUNT_AGGREGATION_SEGMENTATION"

     @staticmethod
     def id() -> UUID:
@@ -71,17 +72,41 @@ class ShieldInferencePassFailCountAggregation(NumericAggregationFunction):
             ),
         ],
     ) -> list[NumericMetric]:
-        results = ddb_conn.sql(
-            f"select time_bucket(INTERVAL '5 minutes', to_timestamp(created_at / 1000)) as ts, count(*) as count, \
-            result, \
-            inference_prompt.result AS prompt_result, \
-            inference_response.result AS response_result \
-            from {dataset.dataset_table_name} \
-            group by ts, result, prompt_result, response_result \
-            order by ts desc; \
-            ",
-        ).df()
-        group_by_dims = ["result", "prompt_result", "response_result"]
+        # Build SELECT clause
+        select_cols = [
+            "time_bucket(INTERVAL '5 minutes', to_timestamp(created_at / 1000)) as ts",
+            "count(*) as count",
+            "result",
+            "inference_prompt.result AS prompt_result",
+            "inference_response.result AS response_result",
+        ]
+
+        # Build GROUP BY clause
+        group_by_cols = ["ts", "result", "prompt_result", "response_result"]
+
+        # Conditionally add conversation_id and user_id based on segmentation flag
+        if self.is_feature_flag_enabled(self.FEATURE_FLAG_NAME):
+            select_cols.extend(["conversation_id", "user_id as user_id"])
+            group_by_cols.extend(["conversation_id", "user_id"])
+
+        query = f"""
+            select {", ".join(select_cols)}
+            from {dataset.dataset_table_name}
+            group by {", ".join(group_by_cols)}
+            order by ts desc;
+        """
+
+        results = ddb_conn.sql(query).df()
+
+        # Build group_by_dims list
+        group_by_dims = [
+            "result",
+            "prompt_result",
+            "response_result",
+        ]
+        if self.is_feature_flag_enabled(self.FEATURE_FLAG_NAME):
+            group_by_dims.extend(["conversation_id", "user_id"])
+
         series = self.group_query_results_to_numeric_metrics(
             results,
             "count",
@@ -94,6 +119,7 @@ class ShieldInferencePassFailCountAggregation(NumericAggregationFunction):

 class ShieldInferenceRuleCountAggregation(NumericAggregationFunction):
     METRIC_NAME = "rule_count"
+    FEATURE_FLAG_NAME = "SHIELD_INFERENCE_RULE_COUNT_AGGREGATION_SEGMENTATION"

     @staticmethod
     def id() -> UUID:
@@ -140,40 +166,72 @@ class ShieldInferenceRuleCountAggregation(NumericAggregationFunction):
             ),
         ],
     ) -> list[NumericMetric]:
-        results = ddb_conn.sql(
-            f" \
-            with unnessted_prompt_rules as (select unnest(inference_prompt.prompt_rule_results) as rule, \
-            'prompt' as location, \
-            time_bucket(INTERVAL '5 minutes', to_timestamp(created_at / 1000)) as ts \
-            from {dataset.dataset_table_name}), \
-            unnessted_result_rules as (select unnest(inference_response.response_rule_results) as rule,\
-            'response' as location, \
-            time_bucket(INTERVAL '5 minutes', to_timestamp(created_at / 1000)) as ts \
-            from {dataset.dataset_table_name}) \
-            select ts, \
-            count(*) as count, \
-            location, \
-            rule.rule_type, \
-            rule.result, \
-            rule.name, \
-            rule.id \
-            from unnessted_prompt_rules \
-            group by ts, location, rule.rule_type, rule.result, rule.name, rule.id \
-            UNION ALL \
-            select ts, \
-            count(*) as count, \
-            location, \
-            rule.rule_type, \
-            rule.result, \
-            rule.name, \
-            rule.id \
-            from unnessted_result_rules \
-            group by ts, location, rule.rule_type, rule.result, rule.name, rule.id \
-            order by ts desc, location, rule.rule_type, rule.result; \
-            ",
-        ).df()
-
-        group_by_dims = ["location", "rule_type", "result", "name", "id"]
+        # Build CTE select columns
+        prompt_cte_select = [
+            "unnest(inference_prompt.prompt_rule_results) as rule",
+            "'prompt' as location",
+            "time_bucket(INTERVAL '5 minutes', to_timestamp(created_at / 1000)) as ts",
+        ]
+        response_cte_select = [
+            "unnest(inference_response.response_rule_results) as rule",
+            "'response' as location",
+            "time_bucket(INTERVAL '5 minutes', to_timestamp(created_at / 1000)) as ts",
+        ]
+
+        # Build main select columns
+        main_select_cols = [
+            "ts",
+            "count(*) as count",
+            "location",
+            "rule.rule_type",
+            "rule.result",
+            "rule.name",
+            "rule.id",
+        ]
+
+        # Build group by columns
+        group_by_cols = [
+            "ts",
+            "location",
+            "rule.rule_type",
+            "rule.result",
+            "rule.name",
+            "rule.id",
+        ]
+
+        # Conditionally add conversation_id and user_id
+        if self.is_feature_flag_enabled(self.FEATURE_FLAG_NAME):
+            prompt_cte_select.extend(["conversation_id", "user_id"])
+            response_cte_select.extend(["conversation_id", "user_id"])
+            main_select_cols.extend(["conversation_id", "user_id"])
+            group_by_cols.extend(["conversation_id", "user_id"])
+
+        query = f"""
+            with unnessted_prompt_rules as (select {", ".join(prompt_cte_select)}
+            from {dataset.dataset_table_name}),
+            unnessted_result_rules as (select {", ".join(response_cte_select)}
+            from {dataset.dataset_table_name})
+            select {", ".join(main_select_cols)}
+            from unnessted_prompt_rules
+            group by {", ".join(group_by_cols)}
+            UNION ALL
+            select {", ".join(main_select_cols)}
+            from unnessted_result_rules
+            group by {", ".join(group_by_cols)}
+            order by ts desc, location, rule.rule_type, rule.result;
+        """
+
+        results = ddb_conn.sql(query).df()
+
+        group_by_dims = [
+            "location",
+            "rule_type",
+            "result",
+            "name",
+            "id",
+        ]
+        if self.is_feature_flag_enabled(self.FEATURE_FLAG_NAME):
+            group_by_dims.extend(["conversation_id", "user_id"])
         series = self.group_query_results_to_numeric_metrics(
             results,
             "count",
@@ -186,6 +244,7 @@ class ShieldInferenceRuleCountAggregation(NumericAggregationFunction):

 class ShieldInferenceHallucinationCountAggregation(NumericAggregationFunction):
     METRIC_NAME = "hallucination_count"
+    FEATURE_FLAG_NAME = "SHIELD_INFERENCE_HALLUCINATION_COUNT_AGGREGATION_SEGMENTATION"

     @staticmethod
     def id() -> UUID:
@@ -232,24 +291,46 @@ class ShieldInferenceHallucinationCountAggregation(NumericAggregationFunction):
             ),
         ],
     ) -> list[NumericMetric]:
-        results = ddb_conn.sql(
-            f" \
-            select time_bucket(INTERVAL '5 minutes', to_timestamp(created_at / 1000)) as ts, \
-            count(*) as count \
-            from {dataset.dataset_table_name} \
-            where length(list_filter(inference_response.response_rule_results, x -> (x.rule_type = 'ModelHallucinationRuleV2' or x.rule_type = 'ModelHallucinationRule') and x.result = 'Fail')) > 0 \
-            group by ts \
-            order by ts desc; \
-            ",
-        ).df()
-
-        series = self.group_query_results_to_numeric_metrics(results, "count", [], "ts")
+        # Build SELECT clause
+        select_cols = [
+            "time_bucket(INTERVAL '5 minutes', to_timestamp(created_at / 1000)) as ts",
+            "count(*) as count",
+        ]
+
+        # Build GROUP BY clause
+        group_by_cols = ["ts"]
+
+        # Conditionally add conversation_id and user_id
+        if self.is_feature_flag_enabled(self.FEATURE_FLAG_NAME):
+            select_cols.extend(["conversation_id", "user_id"])
+            group_by_cols.extend(["conversation_id", "user_id"])
+
+        query = f"""
+            select {", ".join(select_cols)}
+            from {dataset.dataset_table_name}
+            where length(list_filter(inference_response.response_rule_results, x -> (x.rule_type = 'ModelHallucinationRuleV2' or x.rule_type = 'ModelHallucinationRule') and x.result = 'Fail')) > 0
+            group by {", ".join(group_by_cols)}
+            order by ts desc;
+        """
+
+        results = ddb_conn.sql(query).df()
+
+        group_by_dims = []
+        if self.is_feature_flag_enabled(self.FEATURE_FLAG_NAME):
+            group_by_dims.extend(["conversation_id", "user_id"])
+        series = self.group_query_results_to_numeric_metrics(
+            results,
+            "count",
+            group_by_dims,
+            "ts",
+        )
         metric = self.series_to_metric(self.METRIC_NAME, series)
         return [metric]


 class ShieldInferenceRuleToxicityScoreAggregation(SketchAggregationFunction):
     METRIC_NAME = "toxicity_score"
+    FEATURE_FLAG_NAME = "SHIELD_INFERENCE_RULE_TOXICITY_SCORE_AGGREGATION_SEGMENTATION"

     @staticmethod
     def id() -> UUID:
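
The WHERE clause in the hallucination hunk above counts an inference when DuckDB's list_filter keeps at least one failing hallucination-rule entry from the response_rule_results struct list. A self-contained illustration of that predicate against hypothetical in-memory data:

    import duckdb

    # Two rule results; only the hallucination-rule failure should match.
    print(duckdb.sql("""
        select length(list_filter(
            [{'rule_type': 'ModelHallucinationRuleV2', 'result': 'Fail'},
             {'rule_type': 'PIIDataRule', 'result': 'Fail'}],
            x -> (x.rule_type = 'ModelHallucinationRuleV2'
                  or x.rule_type = 'ModelHallucinationRule')
                 and x.result = 'Fail'
        )) as hallucination_fails
    """).fetchall())  # [(1,)]
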
@@ -296,37 +377,57 @@ class ShieldInferenceRuleToxicityScoreAggregation(SketchAggregationFunction):
             ),
         ],
     ) -> list[SketchMetric]:
-        results = ddb_conn.sql(
-            f"\
-            with unnested_prompt_results as (select to_timestamp(created_at / 1000) as ts, \
-            unnest(inference_prompt.prompt_rule_results) as rule_results, \
-            'prompt' as location \
-            from {dataset.dataset_table_name}), \
-            unnested_response_results as (select to_timestamp(created_at / 1000) as ts, \
-            unnest(inference_response.response_rule_results) as rule_results, \
-            'response' as location \
-            from {dataset.dataset_table_name}) \
-            select ts as timestamp, \
-            rule_results.details.toxicity_score::DOUBLE as toxicity_score, \
-            rule_results.result as result, \
-            location \
-            from unnested_prompt_results \
-            where rule_results.details.toxicity_score IS NOT NULL \
-            UNION ALL \
-            select ts as timestamp, \
-            rule_results.details.toxicity_score::DOUBLE as toxicity_score, \
-            rule_results.result as result, \
-            location \
-            from unnested_response_results \
-            where rule_results.details.toxicity_score IS NOT NULL \
-            order by ts desc; \
-            ",
-        ).df()
+        # Build CTE select columns
+        prompt_cte_select = [
+            "to_timestamp(created_at / 1000) as ts",
+            "unnest(inference_prompt.prompt_rule_results) as rule_results",
+            "'prompt' as location",
+        ]
+        response_cte_select = [
+            "to_timestamp(created_at / 1000) as ts",
+            "unnest(inference_response.response_rule_results) as rule_results",
+            "'response' as location",
+        ]
+
+        # Build main select columns
+        main_select_cols = [
+            "ts as timestamp",
+            "rule_results.details.toxicity_score::DOUBLE as toxicity_score",
+            "rule_results.result as result",
+            "location",
+        ]
+
+        # Conditionally add conversation_id and user_id
+        if self.is_feature_flag_enabled(self.FEATURE_FLAG_NAME):
+            prompt_cte_select.extend(["conversation_id", "user_id"])
+            response_cte_select.extend(["conversation_id", "user_id"])
+            main_select_cols.extend(["conversation_id", "user_id"])
+
+        query = f"""
+            with unnested_prompt_results as (select {", ".join(prompt_cte_select)}
+            from {dataset.dataset_table_name}),
+            unnested_response_results as (select {", ".join(response_cte_select)}
+            from {dataset.dataset_table_name})
+            select {", ".join(main_select_cols)}
+            from unnested_prompt_results
+            where rule_results.details.toxicity_score IS NOT NULL
+            UNION ALL
+            select {", ".join(main_select_cols)}
+            from unnested_response_results
+            where rule_results.details.toxicity_score IS NOT NULL
+            order by ts desc;
+        """
+
+        results = ddb_conn.sql(query).df()
+
+        group_by_dims = ["result", "location"]
+        if self.is_feature_flag_enabled(self.FEATURE_FLAG_NAME):
+            group_by_dims.extend(["conversation_id", "user_id"])

         series = self.group_query_results_to_sketch_metrics(
             results,
             "toxicity_score",
-            ["result", "location"],
+            group_by_dims,
             "timestamp",
         )
         metric = self.series_to_metric(self.METRIC_NAME, series)
@@ -335,6 +436,7 @@ class ShieldInferenceRuleToxicityScoreAggregation(SketchAggregationFunction):

 class ShieldInferenceRulePIIDataScoreAggregation(SketchAggregationFunction):
     METRIC_NAME = "pii_score"
+    FEATURE_FLAG_NAME = "SHIELD_INFERENCE_RULE_PII_DATA_SCORE_AGGREGATION_SEGMENTATION"

     @staticmethod
     def id() -> UUID:
@@ -381,43 +483,71 @@ class ShieldInferenceRulePIIDataScoreAggregation(SketchAggregationFunction):
             ),
         ],
     ) -> list[SketchMetric]:
-        results = ddb_conn.sql(
-            f"\
-            with unnested_prompt_results as (select time_bucket(INTERVAL '5 minutes', to_timestamp(created_at / 1000)) as ts, \
-            unnest(inference_prompt.prompt_rule_results) as rule_results, \
-            'prompt' as location \
-            from {dataset.dataset_table_name}), \
-            unnested_response_results as (select time_bucket(INTERVAL '5 minutes', to_timestamp(created_at / 1000)) as ts, \
-            unnest(inference_response.response_rule_results) as rule_results, \
-            'response' as location \
-            from {dataset.dataset_table_name}), \
-            unnested_entites as (select ts, \
-            rule_results.result, \
-            rule_results.rule_type, \
-            location, \
-            unnest(rule_results.details.pii_entities) as pii_entity \
-            from unnested_response_results \
-            where rule_results.rule_type = 'PIIDataRule' \
-            \
-            UNION ALL \
-            \
-            select ts, \
-            rule_results.result, \
-            rule_results.rule_type, \
-            location, \
-            unnest(rule_results.details.pii_entities) as pii_entity \
-            from unnested_prompt_results \
-            where rule_results.rule_type = 'PIIDataRule') \
-            select ts as timestamp, result, rule_type, location, TRY_CAST(pii_entity.confidence AS FLOAT) as pii_score, pii_entity.entity as entity \
-            from unnested_entites \
-            order by ts desc; \
-            ",
-        ).df()
+        # Build CTE select columns
+        prompt_cte_select = [
+            "time_bucket(INTERVAL '5 minutes', to_timestamp(created_at / 1000)) as ts",
+            "unnest(inference_prompt.prompt_rule_results) as rule_results",
+            "'prompt' as location",
+        ]
+        response_cte_select = [
+            "time_bucket(INTERVAL '5 minutes', to_timestamp(created_at / 1000)) as ts",
+            "unnest(inference_response.response_rule_results) as rule_results",
+            "'response' as location",
+        ]
+
+        # Build unnested_entities select columns
+        entities_select_cols = [
+            "ts",
+            "rule_results.result",
+            "rule_results.rule_type",
+            "location",
+            "unnest(rule_results.details.pii_entities) as pii_entity",
+        ]
+
+        # Build final select columns
+        final_select_cols = [
+            "ts as timestamp",
+            "result",
+            "rule_type",
+            "location",
+            "TRY_CAST(pii_entity.confidence AS FLOAT) as pii_score",
+            "pii_entity.entity as entity",
+        ]
+
+        # Conditionally add conversation_id and user_id
+        if self.is_feature_flag_enabled(self.FEATURE_FLAG_NAME):
+            prompt_cte_select.extend(["conversation_id", "user_id"])
+            response_cte_select.extend(["conversation_id", "user_id"])
+            entities_select_cols.extend(["conversation_id", "user_id"])
+            final_select_cols.extend(["conversation_id", "user_id"])
+
+        query = f"""
+            with unnested_prompt_results as (select {", ".join(prompt_cte_select)}
+            from {dataset.dataset_table_name}),
+            unnested_response_results as (select {", ".join(response_cte_select)}
+            from {dataset.dataset_table_name}),
+            unnested_entites as (select {", ".join(entities_select_cols)}
+            from unnested_response_results
+            where rule_results.rule_type = 'PIIDataRule'
+            UNION ALL
+            select {", ".join(entities_select_cols)}
+            from unnested_prompt_results
+            where rule_results.rule_type = 'PIIDataRule')
+            select {", ".join(final_select_cols)}
+            from unnested_entites
+            order by ts desc;
+        """
+
+        results = ddb_conn.sql(query).df()
+
+        group_by_dims = ["result", "location", "entity"]
+        if self.is_feature_flag_enabled(self.FEATURE_FLAG_NAME):
+            group_by_dims.extend(["conversation_id", "user_id"])

         series = self.group_query_results_to_sketch_metrics(
             results,
             "pii_score",
-            ["result", "location", "entity"],
+            group_by_dims,
             "timestamp",
         )
         metric = self.series_to_metric(self.METRIC_NAME, series)
@@ -426,6 +556,7 @@ order by ts desc;

 class ShieldInferenceRuleClaimCountAggregation(SketchAggregationFunction):
     METRIC_NAME = "claim_count"
+    FEATURE_FLAG_NAME = "SHIELD_INFERENCE_RULE_CLAIM_COUNT_AGGREGATION_SEGMENTATION"

     @staticmethod
     def id() -> UUID:
@@ -472,25 +603,44 @@ class ShieldInferenceRuleClaimCountAggregation(SketchAggregationFunction):
             ),
         ],
     ) -> list[SketchMetric]:
-        results = ddb_conn.sql(
-            f"\
-            with unnested_results as (select to_timestamp(created_at / 1000) as ts, \
-            unnest(inference_response.response_rule_results) as rule_results \
-            from {dataset.dataset_table_name}) \
-            select ts as timestamp, \
-            length(rule_results.details.claims) as num_claims, \
-            rule_results.result as result \
-            from unnested_results \
-            where rule_results.rule_type = 'ModelHallucinationRuleV2' \
-            and rule_results.result != 'Skipped' \
-            order by ts desc; \
-            ",
-        ).df()
+        # Build CTE select columns
+        cte_select = [
+            "to_timestamp(created_at / 1000) as ts",
+            "unnest(inference_response.response_rule_results) as rule_results",
+        ]
+
+        # Build main select columns
+        main_select_cols = [
+            "ts as timestamp",
+            "length(rule_results.details.claims) as num_claims",
+            "rule_results.result as result",
+        ]
+
+        # Conditionally add conversation_id and user_id
+        if self.is_feature_flag_enabled(self.FEATURE_FLAG_NAME):
+            cte_select.extend(["conversation_id", "user_id"])
+            main_select_cols.extend(["conversation_id", "user_id"])
+
+        query = f"""
+            with unnested_results as (select {", ".join(cte_select)}
+            from {dataset.dataset_table_name})
+            select {", ".join(main_select_cols)}
+            from unnested_results
+            where rule_results.rule_type = 'ModelHallucinationRuleV2'
+            and rule_results.result != 'Skipped'
+            order by ts desc;
+        """
+
+        results = ddb_conn.sql(query).df()
+
+        group_by_dims = ["result"]
+        if self.is_feature_flag_enabled(self.FEATURE_FLAG_NAME):
+            group_by_dims.extend(["conversation_id", "user_id"])

         series = self.group_query_results_to_sketch_metrics(
             results,
             "num_claims",
-            ["result"],
+            group_by_dims,
             "timestamp",
         )
         metric = self.series_to_metric(self.METRIC_NAME, series)
@@ -499,6 +649,9 @@ class ShieldInferenceRuleClaimCountAggregation(SketchAggregationFunction):

 class ShieldInferenceRuleClaimPassCountAggregation(SketchAggregationFunction):
     METRIC_NAME = "claim_valid_count"
+    FEATURE_FLAG_NAME = (
+        "SHIELD_INFERENCE_RULE_CLAIM_PASS_COUNT_AGGREGATION_SEGMENTATION"
+    )

     @staticmethod
     def id() -> UUID:
@@ -545,25 +698,44 @@ class ShieldInferenceRuleClaimPassCountAggregation(SketchAggregationFunction):
             ),
         ],
     ) -> list[SketchMetric]:
-        results = ddb_conn.sql(
-            f"\
-            with unnested_results as (select to_timestamp(created_at / 1000) as ts, \
-            unnest(inference_response.response_rule_results) as rule_results \
-            from {dataset.dataset_table_name}) \
-            select ts as timestamp, \
-            length(list_filter(rule_results.details.claims, x -> x.valid)) as num_valid_claims, \
-            rule_results.result as result \
-            from unnested_results \
-            where rule_results.rule_type = 'ModelHallucinationRuleV2' \
-            and rule_results.result != 'Skipped' \
-            order by ts desc; \
-            ",
-        ).df()
+        # Build CTE select columns
+        cte_select = [
+            "to_timestamp(created_at / 1000) as ts",
+            "unnest(inference_response.response_rule_results) as rule_results",
+        ]
+
+        # Build main select columns
+        main_select_cols = [
+            "ts as timestamp",
+            "length(list_filter(rule_results.details.claims, x -> x.valid)) as num_valid_claims",
+            "rule_results.result as result",
+        ]
+
+        # Conditionally add conversation_id and user_id
+        if self.is_feature_flag_enabled(self.FEATURE_FLAG_NAME):
+            cte_select.extend(["conversation_id", "user_id"])
+            main_select_cols.extend(["conversation_id", "user_id"])
+
+        query = f"""
+            with unnested_results as (select {", ".join(cte_select)}
+            from {dataset.dataset_table_name})
+            select {", ".join(main_select_cols)}
+            from unnested_results
+            where rule_results.rule_type = 'ModelHallucinationRuleV2'
+            and rule_results.result != 'Skipped'
+            order by ts desc;
+        """
+
+        results = ddb_conn.sql(query).df()
+
+        group_by_dims = ["result"]
+        if self.is_feature_flag_enabled(self.FEATURE_FLAG_NAME):
+            group_by_dims.extend(["conversation_id", "user_id"])

         series = self.group_query_results_to_sketch_metrics(
             results,
             "num_valid_claims",
-            ["result"],
+            group_by_dims,
             "timestamp",
         )
         metric = self.series_to_metric(self.METRIC_NAME, series)
@@ -572,6 +744,9 @@ class ShieldInferenceRuleClaimPassCountAggregation(SketchAggregationFunction):

 class ShieldInferenceRuleClaimFailCountAggregation(SketchAggregationFunction):
     METRIC_NAME = "claim_invalid_count"
+    FEATURE_FLAG_NAME = (
+        "SHIELD_INFERENCE_RULE_CLAIM_FAIL_COUNT_AGGREGATION_SEGMENTATION"
+    )

     @staticmethod
     def id() -> UUID:
@@ -618,25 +793,44 @@ class ShieldInferenceRuleClaimFailCountAggregation(SketchAggregationFunction):
             ),
         ],
     ) -> list[SketchMetric]:
-        results = ddb_conn.sql(
-            f"\
-            with unnested_results as (select to_timestamp(created_at / 1000) as ts, \
-            unnest(inference_response.response_rule_results) as rule_results \
-            from {dataset.dataset_table_name}) \
-            select ts as timestamp, \
-            length(list_filter(rule_results.details.claims, x -> not x.valid)) as num_failed_claims, \
-            rule_results.result as result \
-            from unnested_results \
-            where rule_results.rule_type = 'ModelHallucinationRuleV2' \
-            and rule_results.result != 'Skipped' \
-            order by ts desc; \
-            ",
-        ).df()
+        # Build CTE select columns
+        cte_select = [
+            "to_timestamp(created_at / 1000) as ts",
+            "unnest(inference_response.response_rule_results) as rule_results",
+        ]
+
+        # Build main select columns
+        main_select_cols = [
+            "ts as timestamp",
+            "length(list_filter(rule_results.details.claims, x -> not x.valid)) as num_failed_claims",
+            "rule_results.result as result",
+        ]
+
+        # Conditionally add conversation_id and user_id
+        if self.is_feature_flag_enabled(self.FEATURE_FLAG_NAME):
+            cte_select.extend(["conversation_id", "user_id"])
+            main_select_cols.extend(["conversation_id", "user_id"])
+
+        query = f"""
+            with unnested_results as (select {", ".join(cte_select)}
+            from {dataset.dataset_table_name})
+            select {", ".join(main_select_cols)}
+            from unnested_results
+            where rule_results.rule_type = 'ModelHallucinationRuleV2'
+            and rule_results.result != 'Skipped'
+            order by ts desc;
+        """
+
+        results = ddb_conn.sql(query).df()
+
+        group_by_dims = ["result"]
+        if self.is_feature_flag_enabled(self.FEATURE_FLAG_NAME):
+            group_by_dims.extend(["conversation_id", "user_id"])

         series = self.group_query_results_to_sketch_metrics(
             results,
             "num_failed_claims",
-            ["result"],
+            group_by_dims,
             "timestamp",
         )
         metric = self.series_to_metric(self.METRIC_NAME, series)
@@ -645,6 +839,7 @@ class ShieldInferenceRuleClaimFailCountAggregation(SketchAggregationFunction):

 class ShieldInferenceRuleLatencyAggregation(SketchAggregationFunction):
     METRIC_NAME = "rule_latency"
+    FEATURE_FLAG_NAME = "SHIELD_INFERENCE_RULE_LATENCY_AGGREGATION_SEGMENTATION"

     @staticmethod
     def id() -> UUID:
@@ -691,36 +886,55 @@ class ShieldInferenceRuleLatencyAggregation(SketchAggregationFunction):
             ),
         ],
     ) -> list[SketchMetric]:
-        results = ddb_conn.sql(
-            f" \
-            with unnested_prompt_rules as (select unnest(inference_prompt.prompt_rule_results) as rule, \
-            'prompt' as location, \
-            to_timestamp(created_at / 1000) as ts, \
-            from {dataset.dataset_table_name}), \
-            unnested_response_rules as (select unnest(inference_response.response_rule_results) as rule,\
-            'response' as location, \
-            to_timestamp(created_at / 1000) as ts, \
-            from {dataset.dataset_table_name}) \
-            select ts, \
-            location, \
-            rule.rule_type, \
-            rule.result, \
-            rule.latency_ms \
-            from unnested_prompt_rules \
-            UNION ALL \
-            select ts, \
-            location, \
-            rule.rule_type, \
-            rule.result, \
-            rule.latency_ms \
-            from unnested_response_rules \
-            ",
-        ).df()
+        # Build CTE select columns
+        prompt_cte_select = [
+            "unnest(inference_prompt.prompt_rule_results) as rule",
+            "'prompt' as location",
+            "to_timestamp(created_at / 1000) as ts",
+        ]
+        response_cte_select = [
+            "unnest(inference_response.response_rule_results) as rule",
+            "'response' as location",
+            "to_timestamp(created_at / 1000) as ts",
+        ]
+
+        # Build main select columns
+        main_select_cols = [
+            "ts",
+            "location",
+            "rule.rule_type",
+            "rule.result",
+            "rule.latency_ms",
+        ]
+
+        # Conditionally add conversation_id and user_id
+        if self.is_feature_flag_enabled(self.FEATURE_FLAG_NAME):
+            prompt_cte_select.extend(["conversation_id", "user_id"])
+            response_cte_select.extend(["conversation_id", "user_id"])
+            main_select_cols.extend(["conversation_id", "user_id"])
+
+        query = f"""
+            with unnested_prompt_rules as (select {", ".join(prompt_cte_select)}
+            from {dataset.dataset_table_name}),
+            unnested_response_rules as (select {", ".join(response_cte_select)}
+            from {dataset.dataset_table_name})
+            select {", ".join(main_select_cols)}
+            from unnested_prompt_rules
+            UNION ALL
+            select {", ".join(main_select_cols)}
+            from unnested_response_rules
+        """
+
+        results = ddb_conn.sql(query).df()
+
+        group_by_dims = ["result", "rule_type", "location"]
+        if self.is_feature_flag_enabled(self.FEATURE_FLAG_NAME):
+            group_by_dims.extend(["conversation_id", "user_id"])

         series = self.group_query_results_to_sketch_metrics(
             results,
             "latency_ms",
-            ["result", "rule_type", "location"],
+            group_by_dims,
             "ts",
         )
         metric = self.series_to_metric(self.METRIC_NAME, series)
@@ -729,6 +943,7 @@ class ShieldInferenceRuleLatencyAggregation(SketchAggregationFunction):

 class ShieldInferenceTokenCountAggregation(NumericAggregationFunction):
     METRIC_NAME = "token_count"
+    FEATURE_FLAG_NAME = "SHIELD_INFERENCE_TOKEN_COUNT_AGGREGATION_SEGMENTATION"
     SUPPORTED_MODELS = [
         "gpt-4o",
         "gpt-4o-mini",
@@ -799,28 +1014,52 @@ class ShieldInferenceTokenCountAggregation(NumericAggregationFunction):
             ),
         ],
     ) -> list[NumericMetric]:
-        results = ddb_conn.sql(
-            f" \
-            select \
-            time_bucket(INTERVAL '5 minutes', to_timestamp(created_at / 1000)) as ts, \
-            COALESCE(sum(inference_prompt.tokens), 0) as tokens, \
-            'prompt' as location \
-            from {dataset.dataset_table_name} \
-            group by time_bucket(INTERVAL '5 minutes', to_timestamp(created_at / 1000)), location \
-            UNION ALL \
-            select \
-            time_bucket(INTERVAL '5 minutes', to_timestamp(created_at / 1000)) as ts, \
-            COALESCE(sum(inference_response.tokens), 0) as tokens, \
-            'response' as location \
-            from {dataset.dataset_table_name} \
-            group by time_bucket(INTERVAL '5 minutes', to_timestamp(created_at / 1000)), location; \
-            ",
-        ).df()
+        # Build SELECT clause for prompt
+        prompt_select_cols = [
+            "time_bucket(INTERVAL '5 minutes', to_timestamp(created_at / 1000)) as ts",
+            "COALESCE(sum(inference_prompt.tokens), 0) as tokens",
+            "'prompt' as location",
+        ]
+
+        # Build SELECT clause for response
+        response_select_cols = [
+            "time_bucket(INTERVAL '5 minutes', to_timestamp(created_at / 1000)) as ts",
+            "COALESCE(sum(inference_response.tokens), 0) as tokens",
+            "'response' as location",
+        ]
+
+        # Build GROUP BY clause
+        group_by_cols = [
+            "time_bucket(INTERVAL '5 minutes', to_timestamp(created_at / 1000))",
+            "location",
+        ]
+
+        # Conditionally add conversation_id and user_id
+        if self.is_feature_flag_enabled(self.FEATURE_FLAG_NAME):
+            prompt_select_cols.extend(["conversation_id", "user_id"])
+            response_select_cols.extend(["conversation_id", "user_id"])
+            group_by_cols.extend(["conversation_id", "user_id"])
+
+        query = f"""
+            select {", ".join(prompt_select_cols)}
+            from {dataset.dataset_table_name}
+            group by {", ".join(group_by_cols)}
+            UNION ALL
+            select {", ".join(response_select_cols)}
+            from {dataset.dataset_table_name}
+            group by {", ".join(group_by_cols)};
+        """
+
+        results = ddb_conn.sql(query).df()
+
+        group_by_dims = ["location"]
+        if self.is_feature_flag_enabled(self.FEATURE_FLAG_NAME):
+            group_by_dims.extend(["conversation_id", "user_id"])

         series = self.group_query_results_to_numeric_metrics(
             results,
             "tokens",
-            ["location"],
+            group_by_dims,
             "ts",
         )
         metric = self.series_to_metric(self.METRIC_NAME, series)
@@ -839,18 +1078,21 @@ class ShieldInferenceTokenCountAggregation(NumericAggregationFunction):
             for tokens, loc_type in zip(results["tokens"], location_type)
         ]

-        model_df = pd.DataFrame(
-            {
-                "ts": results["ts"],
-                "cost": cost_values,
-                "location": results["location"],
-            },
-        )
+        model_df_dict = {
+            "ts": results["ts"],
+            "cost": cost_values,
+            "location": results["location"],
+        }
+        if self.is_feature_flag_enabled(self.FEATURE_FLAG_NAME):
+            model_df_dict["conversation_id"] = results["conversation_id"]
+            model_df_dict["user_id"] = results["user_id"]
+
+        model_df = pd.DataFrame(model_df_dict)

         model_series = self.group_query_results_to_numeric_metrics(
             model_df,
             "cost",
-            ["location"],
+            group_by_dims,
             "ts",
         )
         resp.append(
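
The cost branch in this last hunk builds the DataFrame from a dict so segmentation columns can be attached conditionally. A standalone sketch of that shape, with hypothetical inputs in place of the aggregation's real query results:

    import pandas as pd

    def build_cost_frame(
        results: pd.DataFrame, cost_values: list[float], segmented: bool
    ) -> pd.DataFrame:
        data = {
            "ts": results["ts"],
            "cost": cost_values,
            "location": results["location"],
        }
        if segmented:
            # Copy the segmentation columns only when the upstream query selected
            # them; otherwise results[...] would raise a KeyError.
            data["conversation_id"] = results["conversation_id"]
            data["user_id"] = results["user_id"]
        return pd.DataFrame(data)

    frame = build_cost_frame(
        pd.DataFrame(
            {"ts": [0], "location": ["prompt"], "conversation_id": ["c1"], "user_id": ["u1"]}
        ),
        cost_values=[0.02],
        segmented=True,
    )
    print(frame.columns.tolist())  # ['ts', 'cost', 'location', 'conversation_id', 'user_id']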