arthur-common 2.1.50__tar.gz → 2.1.52__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of arthur-common might be problematic.

Files changed (43)
  1. {arthur_common-2.1.50 → arthur_common-2.1.52}/PKG-INFO +2 -1
  2. {arthur_common-2.1.50 → arthur_common-2.1.52}/pyproject.toml +3 -1
  3. {arthur_common-2.1.50 → arthur_common-2.1.52}/src/arthur_common/aggregations/aggregator.py +2 -1
  4. {arthur_common-2.1.50 → arthur_common-2.1.52}/src/arthur_common/aggregations/functions/categorical_count.py +5 -8
  5. {arthur_common-2.1.50 → arthur_common-2.1.52}/src/arthur_common/aggregations/functions/confusion_matrix.py +18 -31
  6. {arthur_common-2.1.50 → arthur_common-2.1.52}/src/arthur_common/aggregations/functions/inference_count.py +5 -8
  7. {arthur_common-2.1.50 → arthur_common-2.1.52}/src/arthur_common/aggregations/functions/inference_count_by_class.py +7 -15
  8. {arthur_common-2.1.50 → arthur_common-2.1.52}/src/arthur_common/aggregations/functions/inference_null_count.py +5 -8
  9. {arthur_common-2.1.50 → arthur_common-2.1.52}/src/arthur_common/aggregations/functions/mean_absolute_error.py +5 -8
  10. {arthur_common-2.1.50 → arthur_common-2.1.52}/src/arthur_common/aggregations/functions/mean_squared_error.py +5 -8
  11. {arthur_common-2.1.50 → arthur_common-2.1.52}/src/arthur_common/aggregations/functions/multiclass_confusion_matrix.py +5 -8
  12. {arthur_common-2.1.50 → arthur_common-2.1.52}/src/arthur_common/aggregations/functions/multiclass_inference_count_by_class.py +5 -8
  13. {arthur_common-2.1.50 → arthur_common-2.1.52}/src/arthur_common/aggregations/functions/numeric_stats.py +5 -8
  14. {arthur_common-2.1.50 → arthur_common-2.1.52}/src/arthur_common/aggregations/functions/numeric_sum.py +5 -8
  15. {arthur_common-2.1.50 → arthur_common-2.1.52}/src/arthur_common/aggregations/functions/shield_aggregations.py +3 -2
  16. arthur_common-2.1.52/src/arthur_common/config/config.py +42 -0
  17. arthur_common-2.1.52/src/arthur_common/config/settings.yaml +4 -0
  18. {arthur_common-2.1.50 → arthur_common-2.1.52}/src/arthur_common/models/connectors.py +8 -7
  19. {arthur_common-2.1.50 → arthur_common-2.1.52}/src/arthur_common/models/metrics.py +77 -13
  20. {arthur_common-2.1.50 → arthur_common-2.1.52}/src/arthur_common/models/schema_definitions.py +8 -1
  21. {arthur_common-2.1.50 → arthur_common-2.1.52}/src/arthur_common/models/shield.py +0 -18
  22. {arthur_common-2.1.50 → arthur_common-2.1.52}/src/arthur_common/models/task_job_specs.py +2 -1
  23. {arthur_common-2.1.50 → arthur_common-2.1.52}/src/arthur_common/tools/aggregation_analyzer.py +1 -1
  24. {arthur_common-2.1.50 → arthur_common-2.1.52}/src/arthur_common/tools/duckdb_data_loader.py +4 -3
  25. arthur_common-2.1.52/src/arthur_common/tools/duckdb_utils.py +35 -0
  26. arthur_common-2.1.52/src/arthur_common/tools/py.typed +0 -0
  27. {arthur_common-2.1.50 → arthur_common-2.1.52}/src/arthur_common/tools/schema_inferer.py +23 -2
  28. {arthur_common-2.1.50 → arthur_common-2.1.52}/README.md +0 -0
  29. {arthur_common-2.1.50 → arthur_common-2.1.52}/src/arthur_common/__init__.py +0 -0
  30. {arthur_common-2.1.50 → arthur_common-2.1.52}/src/arthur_common/aggregations/__init__.py +0 -0
  31. {arthur_common-2.1.50 → arthur_common-2.1.52}/src/arthur_common/aggregations/functions/README.md +0 -0
  32. {arthur_common-2.1.50 → arthur_common-2.1.52}/src/arthur_common/aggregations/functions/__init__.py +0 -0
  33. {arthur_common-2.1.50 → arthur_common-2.1.52}/src/arthur_common/aggregations/functions/py.typed +0 -0
  34. {arthur_common-2.1.50 → arthur_common-2.1.52}/src/arthur_common/aggregations/py.typed +0 -0
  35. {arthur_common-2.1.50/src/arthur_common/models → arthur_common-2.1.52/src/arthur_common/config}/__init__.py +0 -0
  36. {arthur_common-2.1.50/src/arthur_common/tools → arthur_common-2.1.52/src/arthur_common/models}/__init__.py +0 -0
  37. {arthur_common-2.1.50 → arthur_common-2.1.52}/src/arthur_common/models/datasets.py +0 -0
  38. {arthur_common-2.1.50 → arthur_common-2.1.52}/src/arthur_common/models/py.typed +0 -0
  39. {arthur_common-2.1.50 → arthur_common-2.1.52}/src/arthur_common/py.typed +0 -0
  40. /arthur_common-2.1.50/src/arthur_common/tools/py.typed → /arthur_common-2.1.52/src/arthur_common/tools/__init__.py +0 -0
  41. {arthur_common-2.1.50 → arthur_common-2.1.52}/src/arthur_common/tools/aggregation_loader.py +0 -0
  42. {arthur_common-2.1.50 → arthur_common-2.1.52}/src/arthur_common/tools/functions.py +0 -0
  43. {arthur_common-2.1.50 → arthur_common-2.1.52}/src/arthur_common/tools/time_utils.py +0 -0

PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: arthur-common
- Version: 2.1.50
+ Version: 2.1.52
  Summary: Utility code common to Arthur platform components.
  License: MIT
  Author: Arthur
@@ -16,6 +16,7 @@ Requires-Dist: fastapi (>=0.115.8)
  Requires-Dist: fsspec (>=2024.10.0)
  Requires-Dist: pandas (>=2.2.2)
  Requires-Dist: pydantic (>=2)
+ Requires-Dist: simple-settings (>=1.2.0)
  Requires-Dist: tokencost (==0.1.24)
  Requires-Dist: types-python-dateutil (>=2.9.0)
  Requires-Dist: types-requests (>=2.32.0.20241016)

pyproject.toml
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "arthur-common"
- version = "2.1.50"
+ version = "2.1.52"
  description = "Utility code common to Arthur platform components."
  authors = ["Arthur <engineering@arthur.ai>"]
  license = "MIT"
@@ -18,6 +18,7 @@ types-python-dateutil = ">=2.9.0"
  fsspec = ">=2024.10.0"
  tokencost = "0.1.24"
  fastapi = ">=0.115.8"
+ simple-settings = ">=1.2.0"


  [tool.poetry.group.dev.dependencies]
@@ -26,6 +27,7 @@ responses = "0.25.7"
  pytest-xdist = "3.6.1"
  pytest-cov = "^6.1.1"
  pre-commit = "^4.2.0"
+ mypy = "^1.16.1"

  [tool.pytest.ini_options]
  pythonpath = ["src"]

src/arthur_common/aggregations/aggregator.py
@@ -3,10 +3,11 @@ from base64 import b64encode
  from typing import Any, Type, Union

  import pandas as pd
- from arthur_common.models.metrics import *
  from datasketches import kll_floats_sketch
  from duckdb import DuckDBPyConnection

+ from arthur_common.models.metrics import *
+

  class AggregationFunction(ABC):
  @staticmethod

src/arthur_common/aggregations/functions/categorical_count.py
@@ -1,9 +1,12 @@
  from typing import Annotated, Optional
  from uuid import UUID

+ from duckdb import DuckDBPyConnection
+
  from arthur_common.aggregations.aggregator import NumericAggregationFunction
  from arthur_common.models.metrics import DatasetReference, NumericMetric
  from arthur_common.models.schema_definitions import (
+ SEGMENTATION_ALLOWED_COLUMN_TYPES,
  DType,
  MetricColumnParameterAnnotation,
  MetricDatasetParameterAnnotation,
@@ -12,7 +15,6 @@ from arthur_common.models.schema_definitions import (
  ScopeSchemaTag,
  )
  from arthur_common.tools.duckdb_data_loader import escape_identifier, escape_str_literal
- from duckdb import DuckDBPyConnection


  class CategoricalCountAggregationFunction(NumericAggregationFunction):
@@ -69,13 +71,8 @@ class CategoricalCountAggregationFunction(NumericAggregationFunction):
  Optional[list[str]],
  MetricMultipleColumnParameterAnnotation(
  source_dataset_parameter_key="dataset",
- allowed_column_types=[
- ScalarType(dtype=DType.INT),
- ScalarType(dtype=DType.BOOL),
- ScalarType(dtype=DType.STRING),
- ScalarType(dtype=DType.UUID),
- ],
- tag_hints=[],
+ allowed_column_types=SEGMENTATION_ALLOWED_COLUMN_TYPES,
+ tag_hints=[ScopeSchemaTag.POSSIBLE_SEGMENTATION],
  friendly_name="Segmentation Columns",
  description="All columns to include as dimensions for segmentation.",
  optional=True,

src/arthur_common/aggregations/functions/confusion_matrix.py
@@ -1,10 +1,13 @@
  from typing import Annotated, Optional
  from uuid import UUID

+ from duckdb import DuckDBPyConnection
+
  from arthur_common.aggregations.aggregator import NumericAggregationFunction
  from arthur_common.models.datasets import ModelProblemType
  from arthur_common.models.metrics import DatasetReference, NumericMetric
  from arthur_common.models.schema_definitions import (
+ SEGMENTATION_ALLOWED_COLUMN_TYPES,
  DType,
  MetricColumnParameterAnnotation,
  MetricDatasetParameterAnnotation,
@@ -14,7 +17,6 @@ from arthur_common.models.schema_definitions import (
  ScopeSchemaTag,
  )
  from arthur_common.tools.duckdb_data_loader import escape_identifier, escape_str_literal
- from duckdb import DuckDBPyConnection


  class ConfusionMatrixAggregationFunction(NumericAggregationFunction):
@@ -27,7 +29,7 @@ class ConfusionMatrixAggregationFunction(NumericAggregationFunction):
  prediction_normalization_case: str,
  gt_normalization_case: str,
  dataset: DatasetReference,
- segmentation_cols: list[str],
+ segmentation_cols: Optional[list[str]] = None,
  ) -> list[NumericMetric]:
  """
  Generate a SQL query to compute confusion matrix metrics over time.
@@ -202,13 +204,8 @@ class BinaryClassifierIntBoolConfusionMatrixAggregationFunction(
  Optional[list[str]],
  MetricMultipleColumnParameterAnnotation(
  source_dataset_parameter_key="dataset",
- allowed_column_types=[
- ScalarType(dtype=DType.INT),
- ScalarType(dtype=DType.BOOL),
- ScalarType(dtype=DType.STRING),
- ScalarType(dtype=DType.UUID),
- ],
- tag_hints=[],
+ allowed_column_types=SEGMENTATION_ALLOWED_COLUMN_TYPES,
+ tag_hints=[ScopeSchemaTag.POSSIBLE_SEGMENTATION],
  friendly_name="Segmentation Columns",
  description="All columns to include as dimensions for segmentation.",
  optional=True,
@@ -338,13 +335,8 @@ class BinaryClassifierStringLabelConfusionMatrixAggregationFunction(
  Optional[list[str]],
  MetricMultipleColumnParameterAnnotation(
  source_dataset_parameter_key="dataset",
- allowed_column_types=[
- ScalarType(dtype=DType.INT),
- ScalarType(dtype=DType.BOOL),
- ScalarType(dtype=DType.STRING),
- ScalarType(dtype=DType.UUID),
- ],
- tag_hints=[],
+ allowed_column_types=SEGMENTATION_ALLOWED_COLUMN_TYPES,
+ tag_hints=[ScopeSchemaTag.POSSIBLE_SEGMENTATION],
  friendly_name="Segmentation Columns",
  description="All columns to include as dimensions for segmentation.",
  optional=True,
@@ -446,13 +438,8 @@ class BinaryClassifierProbabilityThresholdConfusionMatrixAggregationFunction(
  Optional[list[str]],
  MetricMultipleColumnParameterAnnotation(
  source_dataset_parameter_key="dataset",
- allowed_column_types=[
- ScalarType(dtype=DType.INT),
- ScalarType(dtype=DType.BOOL),
- ScalarType(dtype=DType.STRING),
- ScalarType(dtype=DType.UUID),
- ],
- tag_hints=[],
+ allowed_column_types=SEGMENTATION_ALLOWED_COLUMN_TYPES,
+ tag_hints=[ScopeSchemaTag.POSSIBLE_SEGMENTATION],
  friendly_name="Segmentation Columns",
  description="All columns to include as dimensions for segmentation.",
  optional=True,
@@ -495,12 +482,12 @@ class BinaryClassifierProbabilityThresholdConfusionMatrixAggregationFunction(
  raise ValueError(f"Unsupported column type: {col_type}")

  return self.generate_confusion_matrix_metrics(
- ddb_conn,
- timestamp_col,
- prediction_col,
- gt_values_col,
- prediction_normalization_case,
- gt_normalization_case,
- dataset,
- segmentation_cols,
+ ddb_conn=ddb_conn,
+ timestamp_col=timestamp_col,
+ prediction_col=prediction_col,
+ gt_values_col=gt_values_col,
+ prediction_normalization_case=prediction_normalization_case,
+ gt_normalization_case=gt_normalization_case,
+ dataset=dataset,
+ segmentation_cols=segmentation_cols,
  )

src/arthur_common/aggregations/functions/inference_count.py
@@ -1,9 +1,12 @@
  from typing import Annotated, Optional
  from uuid import UUID

+ from duckdb import DuckDBPyConnection
+
  from arthur_common.aggregations.aggregator import NumericAggregationFunction
  from arthur_common.models.metrics import DatasetReference, NumericMetric
  from arthur_common.models.schema_definitions import (
+ SEGMENTATION_ALLOWED_COLUMN_TYPES,
  DType,
  MetricColumnParameterAnnotation,
  MetricDatasetParameterAnnotation,
@@ -12,7 +15,6 @@ from arthur_common.models.schema_definitions import (
  ScopeSchemaTag,
  )
  from arthur_common.tools.duckdb_data_loader import escape_identifier
- from duckdb import DuckDBPyConnection


  class InferenceCountAggregationFunction(NumericAggregationFunction):
@@ -56,13 +58,8 @@ class InferenceCountAggregationFunction(NumericAggregationFunction):
  Optional[list[str]],
  MetricMultipleColumnParameterAnnotation(
  source_dataset_parameter_key="dataset",
- allowed_column_types=[
- ScalarType(dtype=DType.INT),
- ScalarType(dtype=DType.BOOL),
- ScalarType(dtype=DType.STRING),
- ScalarType(dtype=DType.UUID),
- ],
- tag_hints=[],
+ allowed_column_types=SEGMENTATION_ALLOWED_COLUMN_TYPES,
+ tag_hints=[ScopeSchemaTag.POSSIBLE_SEGMENTATION],
  friendly_name="Segmentation Columns",
  description="All columns to include as dimensions for segmentation.",
  optional=True,

src/arthur_common/aggregations/functions/inference_count_by_class.py
@@ -1,10 +1,13 @@
  from typing import Annotated, Optional
  from uuid import UUID

+ from duckdb import DuckDBPyConnection
+
  from arthur_common.aggregations.aggregator import NumericAggregationFunction
  from arthur_common.models.datasets import ModelProblemType
  from arthur_common.models.metrics import DatasetReference, NumericMetric
  from arthur_common.models.schema_definitions import (
+ SEGMENTATION_ALLOWED_COLUMN_TYPES,
  DType,
  MetricColumnParameterAnnotation,
  MetricDatasetParameterAnnotation,
@@ -14,7 +17,6 @@ from arthur_common.models.schema_definitions import (
  ScopeSchemaTag,
  )
  from arthur_common.tools.duckdb_data_loader import escape_identifier
- from duckdb import DuckDBPyConnection


  class BinaryClassifierCountByClassAggregationFunction(NumericAggregationFunction):
@@ -75,13 +77,8 @@ class BinaryClassifierCountByClassAggregationFunction(NumericAggregationFunction
  Optional[list[str]],
  MetricMultipleColumnParameterAnnotation(
  source_dataset_parameter_key="dataset",
- allowed_column_types=[
- ScalarType(dtype=DType.INT),
- ScalarType(dtype=DType.BOOL),
- ScalarType(dtype=DType.STRING),
- ScalarType(dtype=DType.UUID),
- ],
- tag_hints=[],
+ allowed_column_types=SEGMENTATION_ALLOWED_COLUMN_TYPES,
+ tag_hints=[ScopeSchemaTag.POSSIBLE_SEGMENTATION],
  friendly_name="Segmentation Columns",
  description="All columns to include as dimensions for segmentation.",
  optional=True,
@@ -219,13 +216,8 @@ class BinaryClassifierCountThresholdClassAggregationFunction(
  Optional[list[str]],
  MetricMultipleColumnParameterAnnotation(
  source_dataset_parameter_key="dataset",
- allowed_column_types=[
- ScalarType(dtype=DType.INT),
- ScalarType(dtype=DType.BOOL),
- ScalarType(dtype=DType.STRING),
- ScalarType(dtype=DType.UUID),
- ],
- tag_hints=[],
+ allowed_column_types=SEGMENTATION_ALLOWED_COLUMN_TYPES,
+ tag_hints=[ScopeSchemaTag.POSSIBLE_SEGMENTATION],
  friendly_name="Segmentation Columns",
  description="All columns to include as dimensions for segmentation.",
  optional=True,

src/arthur_common/aggregations/functions/inference_null_count.py
@@ -1,9 +1,12 @@
  from typing import Annotated, Optional
  from uuid import UUID

+ from duckdb import DuckDBPyConnection
+
  from arthur_common.aggregations.aggregator import NumericAggregationFunction
  from arthur_common.models.metrics import DatasetReference, Dimension, NumericMetric
  from arthur_common.models.schema_definitions import (
+ SEGMENTATION_ALLOWED_COLUMN_TYPES,
  DType,
  MetricColumnParameterAnnotation,
  MetricDatasetParameterAnnotation,
@@ -12,7 +15,6 @@ from arthur_common.models.schema_definitions import (
  ScopeSchemaTag,
  )
  from arthur_common.tools.duckdb_data_loader import escape_identifier
- from duckdb import DuckDBPyConnection


  class InferenceNullCountAggregationFunction(NumericAggregationFunction):
@@ -65,13 +67,8 @@ class InferenceNullCountAggregationFunction(NumericAggregationFunction):
  Optional[list[str]],
  MetricMultipleColumnParameterAnnotation(
  source_dataset_parameter_key="dataset",
- allowed_column_types=[
- ScalarType(dtype=DType.INT),
- ScalarType(dtype=DType.BOOL),
- ScalarType(dtype=DType.STRING),
- ScalarType(dtype=DType.UUID),
- ],
- tag_hints=[],
+ allowed_column_types=SEGMENTATION_ALLOWED_COLUMN_TYPES,
+ tag_hints=[ScopeSchemaTag.POSSIBLE_SEGMENTATION],
  friendly_name="Segmentation Columns",
  description="All columns to include as dimensions for segmentation.",
  optional=True,

src/arthur_common/aggregations/functions/mean_absolute_error.py
@@ -1,10 +1,13 @@
  from typing import Annotated, Optional
  from uuid import UUID

+ from duckdb import DuckDBPyConnection
+
  from arthur_common.aggregations.aggregator import NumericAggregationFunction
  from arthur_common.models.datasets import ModelProblemType
  from arthur_common.models.metrics import DatasetReference, NumericMetric
  from arthur_common.models.schema_definitions import (
+ SEGMENTATION_ALLOWED_COLUMN_TYPES,
  DType,
  MetricColumnParameterAnnotation,
  MetricDatasetParameterAnnotation,
@@ -13,7 +16,6 @@ from arthur_common.models.schema_definitions import (
  ScopeSchemaTag,
  )
  from arthur_common.tools.duckdb_data_loader import escape_identifier
- from duckdb import DuckDBPyConnection


  class MeanAbsoluteErrorAggregationFunction(NumericAggregationFunction):
@@ -80,13 +82,8 @@ class MeanAbsoluteErrorAggregationFunction(NumericAggregationFunction):
  Optional[list[str]],
  MetricMultipleColumnParameterAnnotation(
  source_dataset_parameter_key="dataset",
- allowed_column_types=[
- ScalarType(dtype=DType.INT),
- ScalarType(dtype=DType.BOOL),
- ScalarType(dtype=DType.STRING),
- ScalarType(dtype=DType.UUID),
- ],
- tag_hints=[],
+ allowed_column_types=SEGMENTATION_ALLOWED_COLUMN_TYPES,
+ tag_hints=[ScopeSchemaTag.POSSIBLE_SEGMENTATION],
  friendly_name="Segmentation Columns",
  description="All columns to include as dimensions for segmentation.",
  optional=True,

src/arthur_common/aggregations/functions/mean_squared_error.py
@@ -1,10 +1,13 @@
  from typing import Annotated, Optional
  from uuid import UUID

+ from duckdb import DuckDBPyConnection
+
  from arthur_common.aggregations.aggregator import NumericAggregationFunction
  from arthur_common.models.datasets import ModelProblemType
  from arthur_common.models.metrics import DatasetReference, NumericMetric
  from arthur_common.models.schema_definitions import (
+ SEGMENTATION_ALLOWED_COLUMN_TYPES,
  DType,
  MetricColumnParameterAnnotation,
  MetricDatasetParameterAnnotation,
@@ -13,7 +16,6 @@ from arthur_common.models.schema_definitions import (
  ScopeSchemaTag,
  )
  from arthur_common.tools.duckdb_data_loader import escape_identifier
- from duckdb import DuckDBPyConnection


  class MeanSquaredErrorAggregationFunction(NumericAggregationFunction):
@@ -80,13 +82,8 @@ class MeanSquaredErrorAggregationFunction(NumericAggregationFunction):
  Optional[list[str]],
  MetricMultipleColumnParameterAnnotation(
  source_dataset_parameter_key="dataset",
- allowed_column_types=[
- ScalarType(dtype=DType.INT),
- ScalarType(dtype=DType.BOOL),
- ScalarType(dtype=DType.STRING),
- ScalarType(dtype=DType.UUID),
- ],
- tag_hints=[],
+ allowed_column_types=SEGMENTATION_ALLOWED_COLUMN_TYPES,
+ tag_hints=[ScopeSchemaTag.POSSIBLE_SEGMENTATION],
  friendly_name="Segmentation Columns",
  description="All columns to include as dimensions for segmentation.",
  optional=True,

src/arthur_common/aggregations/functions/multiclass_confusion_matrix.py
@@ -1,10 +1,13 @@
  from typing import Annotated, Optional
  from uuid import UUID

+ from duckdb import DuckDBPyConnection
+
  from arthur_common.aggregations.aggregator import NumericAggregationFunction
  from arthur_common.models.datasets import ModelProblemType
  from arthur_common.models.metrics import DatasetReference, NumericMetric
  from arthur_common.models.schema_definitions import (
+ SEGMENTATION_ALLOWED_COLUMN_TYPES,
  DType,
  MetricColumnParameterAnnotation,
  MetricDatasetParameterAnnotation,
@@ -14,7 +17,6 @@ from arthur_common.models.schema_definitions import (
  ScopeSchemaTag,
  )
  from arthur_common.tools.duckdb_data_loader import escape_identifier, escape_str_literal
- from duckdb import DuckDBPyConnection


  class MulticlassClassifierStringLabelSingleClassConfusionMatrixAggregationFunction(
@@ -95,13 +97,8 @@ class MulticlassClassifierStringLabelSingleClassConfusionMatrixAggregationFuncti
  Optional[list[str]],
  MetricMultipleColumnParameterAnnotation(
  source_dataset_parameter_key="dataset",
- allowed_column_types=[
- ScalarType(dtype=DType.INT),
- ScalarType(dtype=DType.BOOL),
- ScalarType(dtype=DType.STRING),
- ScalarType(dtype=DType.UUID),
- ],
- tag_hints=[],
+ allowed_column_types=SEGMENTATION_ALLOWED_COLUMN_TYPES,
+ tag_hints=[ScopeSchemaTag.POSSIBLE_SEGMENTATION],
  friendly_name="Segmentation Columns",
  description="All columns to include as dimensions for segmentation.",
  optional=True,

src/arthur_common/aggregations/functions/multiclass_inference_count_by_class.py
@@ -1,12 +1,15 @@
  from typing import Annotated, Optional
  from uuid import UUID

+ from duckdb import DuckDBPyConnection
+
  from arthur_common.aggregations.functions.inference_count_by_class import (
  BinaryClassifierCountByClassAggregationFunction,
  )
  from arthur_common.models.datasets import ModelProblemType
  from arthur_common.models.metrics import DatasetReference, NumericMetric
  from arthur_common.models.schema_definitions import (
+ SEGMENTATION_ALLOWED_COLUMN_TYPES,
  DType,
  MetricColumnParameterAnnotation,
  MetricDatasetParameterAnnotation,
@@ -14,7 +17,6 @@ from arthur_common.models.schema_definitions import (
  ScalarType,
  ScopeSchemaTag,
  )
- from duckdb import DuckDBPyConnection


  class MulticlassClassifierCountByClassAggregationFunction(
@@ -86,13 +88,8 @@ class MulticlassClassifierCountByClassAggregationFunction(
  Optional[list[str]],
  MetricMultipleColumnParameterAnnotation(
  source_dataset_parameter_key="dataset",
- allowed_column_types=[
- ScalarType(dtype=DType.INT),
- ScalarType(dtype=DType.BOOL),
- ScalarType(dtype=DType.STRING),
- ScalarType(dtype=DType.UUID),
- ],
- tag_hints=[],
+ allowed_column_types=SEGMENTATION_ALLOWED_COLUMN_TYPES,
+ tag_hints=[ScopeSchemaTag.POSSIBLE_SEGMENTATION],
  friendly_name="Segmentation Columns",
  description="All columns to include as dimensions for segmentation.",
  optional=True,

src/arthur_common/aggregations/functions/numeric_stats.py
@@ -1,9 +1,12 @@
  from typing import Annotated, Optional
  from uuid import UUID

+ from duckdb import DuckDBPyConnection
+
  from arthur_common.aggregations.aggregator import SketchAggregationFunction
  from arthur_common.models.metrics import DatasetReference, SketchMetric
  from arthur_common.models.schema_definitions import (
+ SEGMENTATION_ALLOWED_COLUMN_TYPES,
  DType,
  MetricColumnParameterAnnotation,
  MetricDatasetParameterAnnotation,
@@ -12,7 +15,6 @@ from arthur_common.models.schema_definitions import (
  ScopeSchemaTag,
  )
  from arthur_common.tools.duckdb_data_loader import escape_identifier, escape_str_literal
- from duckdb import DuckDBPyConnection


  class NumericSketchAggregationFunction(SketchAggregationFunction):
@@ -71,13 +73,8 @@ class NumericSketchAggregationFunction(SketchAggregationFunction):
  Optional[list[str]],
  MetricMultipleColumnParameterAnnotation(
  source_dataset_parameter_key="dataset",
- allowed_column_types=[
- ScalarType(dtype=DType.INT),
- ScalarType(dtype=DType.BOOL),
- ScalarType(dtype=DType.STRING),
- ScalarType(dtype=DType.UUID),
- ],
- tag_hints=[],
+ allowed_column_types=SEGMENTATION_ALLOWED_COLUMN_TYPES,
+ tag_hints=[ScopeSchemaTag.POSSIBLE_SEGMENTATION],
  friendly_name="Segmentation Columns",
  description="All columns to include as dimensions for segmentation.",
  optional=True,

src/arthur_common/aggregations/functions/numeric_sum.py
@@ -1,9 +1,12 @@
  from typing import Annotated, Optional
  from uuid import UUID

+ from duckdb import DuckDBPyConnection
+
  from arthur_common.aggregations.aggregator import NumericAggregationFunction
  from arthur_common.models.metrics import DatasetReference, Dimension, NumericMetric
  from arthur_common.models.schema_definitions import (
+ SEGMENTATION_ALLOWED_COLUMN_TYPES,
  DType,
  MetricColumnParameterAnnotation,
  MetricDatasetParameterAnnotation,
@@ -12,7 +15,6 @@ from arthur_common.models.schema_definitions import (
  ScopeSchemaTag,
  )
  from arthur_common.tools.duckdb_data_loader import escape_identifier
- from duckdb import DuckDBPyConnection


  class NumericSumAggregationFunction(NumericAggregationFunction):
@@ -69,13 +71,8 @@ class NumericSumAggregationFunction(NumericAggregationFunction):
  Optional[list[str]],
  MetricMultipleColumnParameterAnnotation(
  source_dataset_parameter_key="dataset",
- allowed_column_types=[
- ScalarType(dtype=DType.INT),
- ScalarType(dtype=DType.BOOL),
- ScalarType(dtype=DType.STRING),
- ScalarType(dtype=DType.UUID),
- ],
- tag_hints=[],
+ allowed_column_types=SEGMENTATION_ALLOWED_COLUMN_TYPES,
+ tag_hints=[ScopeSchemaTag.POSSIBLE_SEGMENTATION],
  friendly_name="Segmentation Columns",
  description="All columns to include as dimensions for segmentation.",
  optional=True,

src/arthur_common/aggregations/functions/shield_aggregations.py
@@ -2,6 +2,9 @@ from typing import Annotated
  from uuid import UUID

  import pandas as pd
+ from duckdb import DuckDBPyConnection
+ from tokencost import calculate_cost_by_tokens
+
  from arthur_common.aggregations.aggregator import (
  NumericAggregationFunction,
  SketchAggregationFunction,
@@ -13,8 +16,6 @@ from arthur_common.models.schema_definitions import (
  MetricColumnParameterAnnotation,
  MetricDatasetParameterAnnotation,
  )
- from duckdb import DuckDBPyConnection
- from tokencost import calculate_cost_by_tokens


  class ShieldInferencePassFailCountAggregation(NumericAggregationFunction):

src/arthur_common/config/config.py (new file)
@@ -0,0 +1,42 @@
+ # get the current directory of this file
+ import logging
+ import pathlib
+
+ directory = pathlib.Path(__file__).parent.resolve()
+
+ # create settings object that reads from settings.yaml and takes overrides from env
+ # can also be overwritten via the CLI
+ # https://github.com/drgarcia1986/simple-settings
+ from simple_settings import LazySettings
+
+ settings = LazySettings(f"{directory}/settings.yaml", ".environ")
+
+ logger = logging.getLogger()
+
+
+ class Config:
+ settings = settings
+
+ @staticmethod
+ def convert_to_int(value: str | int, setting_name: str) -> int:
+ if isinstance(value, int):
+ return value
+ elif value == "":
+ raise ValueError(
+ f"Config setting {setting_name} could not be cast to an int.",
+ )
+
+ # attempt to convert setting to int
+ try:
+ return int(value.strip())
+ except TypeError:
+ raise ValueError(
+ f"Config setting {setting_name} could not be cast to an int.",
+ )
+
+ @staticmethod
+ def segmentation_col_unique_values_limit() -> int:
+ return Config.convert_to_int(
+ settings.SEGMENTATION_COL_UNIQUE_VALUE_LIMIT,
+ "SEGMENTATION_COL_UNIQUE_VALUE_LIMIT",
+ )

src/arthur_common/config/settings.yaml (new file)
@@ -0,0 +1,4 @@
+ # add arthur-common default settings here
+ ################################################
+ # Aggregation Configurations
+ SEGMENTATION_COL_UNIQUE_VALUE_LIMIT: 100
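
A minimal sketch of how these two new files fit together, assuming the released package is installed; the override value below is made up for illustration, and the environment override relies on the ".environ" source passed to LazySettings above:

    import os

    # set before the lazy settings object is first read (illustrative override; optional)
    os.environ["SEGMENTATION_COL_UNIQUE_VALUE_LIMIT"] = "250"

    from arthur_common.config.config import Config

    # returns 250 with the override in place; otherwise the 100 default from settings.yaml
    limit = Config.segmentation_col_unique_values_limit()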

src/arthur_common/models/connectors.py
@@ -29,13 +29,14 @@ GOOGLE_CONNECTOR_PROJECT_ID_FIELD = "project_id"
  GOOGLE_CONNECTOR_LOCATION_FIELD = "location"
  SHIELD_CONNECTOR_API_KEY_FIELD = "api_key"
  SHIELD_CONNECTOR_ENDPOINT_FIELD = "endpoint"
- MSSQL_CONNECTOR_HOST_FIELD = "host"
- MSSQL_CONNECTOR_PORT_FIELD = "port"
- MSSQL_CONNECTOR_DATABASE_FIELD = "database"
- MSSQL_CONNECTOR_USERNAME_FIELD = "username"
- MSSQL_CONNECTOR_PASSWORD_FIELD = "password"
- MSSQL_CONNECTOR_DRIVER_FIELD = "driver"
- MSSQL_CONNECTOR_TABLE_NAME_FIELD = "table_name"
+ ODBC_CONNECTOR_HOST_FIELD = "host"
+ ODBC_CONNECTOR_PORT_FIELD = "port"
+ ODBC_CONNECTOR_DATABASE_FIELD = "database"
+ ODBC_CONNECTOR_USERNAME_FIELD = "username"
+ ODBC_CONNECTOR_PASSWORD_FIELD = "password"
+ ODBC_CONNECTOR_DRIVER_FIELD = "driver"
+ ODBC_CONNECTOR_TABLE_NAME_FIELD = "table_name"
+ ODBC_CONNECTOR_DIALECT_FIELD = "dialect"


  # dataset (connector type dependent) constants

src/arthur_common/models/metrics.py
@@ -4,14 +4,15 @@ from enum import Enum
  from typing import Literal, Optional
  from uuid import UUID

+ from pydantic import BaseModel, Field, field_validator, model_validator
+ from typing_extensions import Self
+
  from arthur_common.models.datasets import ModelProblemType
  from arthur_common.models.schema_definitions import (
  DType,
  SchemaTypeUnion,
  ScopeSchemaTag,
  )
- from pydantic import BaseModel, Field, model_validator
- from typing_extensions import Self


  # Temporary limited list, expand this as we grow and make it more in line with custom transformations later on
@@ -111,12 +112,9 @@ class AggregationMetricType(Enum):
  NUMERIC = "numeric"


- class MetricsParameterSchema(BaseModel):
+ class BaseAggregationParameterSchema(BaseModel):
+ # fields for aggregation parameters shared across all parameter types and between default and custom metrics
  parameter_key: str = Field(description="Name of the parameter.")
- optional: bool = Field(
- False,
- description="Boolean denoting if the parameter is optional.",
- )
  friendly_name: str = Field(
  description="User facing name of the parameter.",
  )
@@ -125,7 +123,16 @@ class MetricsParameterSchema(BaseModel):
  )


- class MetricsDatasetParameterSchema(MetricsParameterSchema):
+ class MetricsParameterSchema(BaseAggregationParameterSchema):
+ # specific to default metrics/Python metrics—not available to custom aggregations
+ optional: bool = Field(
+ False,
+ description="Boolean denoting if the parameter is optional.",
+ )
+
+
+ class BaseDatasetParameterSchema(BaseAggregationParameterSchema):
+ # fields specific to dataset parameters shared across default and custom metrics
  parameter_type: Literal["dataset"] = "dataset"
  model_problem_type: Optional[ModelProblemType] = Field(
  default=None,
@@ -133,13 +140,24 @@ class MetricsDatasetParameterSchema(MetricsParameterSchema):
  )


- class MetricsLiteralParameterSchema(MetricsParameterSchema):
+ class MetricsDatasetParameterSchema(MetricsParameterSchema, BaseDatasetParameterSchema):
+ # dataset parameter schema including fields specific to default metrics
+ pass
+
+
+ class BaseLiteralParameterSchema(BaseAggregationParameterSchema):
+ # fields specific to literal parameters shared across default and custom metrics
  parameter_type: Literal["literal"] = "literal"
  parameter_dtype: DType = Field(description="Data type of the parameter.")


- class MetricsColumnParameterSchema(MetricsParameterSchema):
- parameter_type: Literal["column"] = "column"
+ class MetricsLiteralParameterSchema(MetricsParameterSchema, BaseLiteralParameterSchema):
+ # literal parameter schema including fields specific to default metrics
+ pass
+
+
+ class BaseColumnBaseParameterSchema(BaseAggregationParameterSchema):
+ # fields specific to all single or multiple column parameters shared across default and custom metrics
  tag_hints: list[ScopeSchemaTag] = Field(
  [],
  description="List of tags that are applicable to this parameter. Datasets with columns that have matching tags can be inferred this way.",
@@ -165,8 +183,18 @@ class MetricsColumnParameterSchema(MetricsParameterSchema):
  return self


- # Not used /implemented yet. Might turn into group by column list
- class MetricsColumnListParameterSchema(MetricsColumnParameterSchema):
+ class BaseColumnParameterSchema(BaseColumnBaseParameterSchema):
+ # single column parameter schema common across default and custom metrics
+ parameter_type: Literal["column"] = "column"
+
+
+ class MetricsColumnParameterSchema(MetricsParameterSchema, BaseColumnParameterSchema):
+ # single column parameter schema specific to default metrics
+ parameter_type: Literal["column"] = "column"
+
+
+ class MetricsColumnListParameterSchema(MetricsParameterSchema, BaseColumnParameterSchema):
+ # list column parameter schema specific to default metrics
  parameter_type: Literal["column_list"] = "column_list"


@@ -177,6 +205,17 @@ MetricsParameterSchemaUnion = (
  | MetricsColumnListParameterSchema
  )

+ MetricsColumnSchemaUnion = (
+ MetricsColumnParameterSchema | MetricsColumnListParameterSchema
+ )
+
+
+ CustomAggregationParametersSchemaUnion = (
+ BaseDatasetParameterSchema
+ | BaseLiteralParameterSchema
+ | BaseColumnParameterSchema
+ )
+

  @dataclass
  class DatasetReference:
@@ -221,3 +260,28 @@ class AggregationSpecSchema(BaseModel):
  f"Column parameter '{param.parameter_key}' references dataset parameter '{param.source_dataset_parameter_key}' which does not exist.",
  )
  return self
+
+
+ class BaseReportedAggregation(BaseModel):
+ # in future will be used by default metrics
+ metric_name: str = Field(description="Name of the reported aggregation metric.")
+ description: str = Field(
+ description="Description of the reported aggregation metric and what it aggregates.",
+ )
+
+
+ class ReportedCustomAggregation(BaseReportedAggregation):
+ value_column: str = Field(description="Name of the column returned from the SQL query holding the metric value.")
+ timestamp_column: str = Field(description="Name of the column returned from the SQL query holding the timestamp buckets.")
+ metric_kind: AggregationMetricType = Field(
+ description="Return type of the reported aggregation metric value.",
+ )
+ dimension_columns: list[str] = Field(description="Name of any dimension columns returned from the SQL query. Max length is 1.")
+
+ @field_validator('dimension_columns')
+ @classmethod
+ def validate_dimension_columns_length(cls, v: list[str]) -> str:
+ if len(v) > 1:
+ raise ValueError('Only one dimension column can be specified.')
+ return v
+
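
To make the new reporting models concrete, here is a minimal sketch of constructing the ReportedCustomAggregation added above; the class, field names, and AggregationMetricType come from this diff, while the field values are illustrative:

    from arthur_common.models.metrics import (
        AggregationMetricType,
        ReportedCustomAggregation,
    )

    reported = ReportedCustomAggregation(
        metric_name="null_count",
        description="Null counts per time bucket from a custom SQL aggregation.",
        value_column="metric_value",
        timestamp_column="ts_bucket",
        metric_kind=AggregationMetricType.NUMERIC,
        dimension_columns=["segment"],  # a second entry would fail the field_validator
    )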

src/arthur_common/models/schema_definitions.py
@@ -4,9 +4,10 @@ from enum import Enum
  from typing import Optional, Self, Union
  from uuid import UUID, uuid4

- from arthur_common.models.datasets import ModelProblemType
  from pydantic import BaseModel, ConfigDict, Field, computed_field, model_validator

+ from arthur_common.models.datasets import ModelProblemType
+

  class ScopeSchemaTag(str, Enum):
  LLM_CONTEXT = "llm_context"
@@ -18,6 +19,7 @@ class ScopeSchemaTag(str, Enum):
  PREDICTION = "prediction"
  GROUND_TRUTH = "ground_truth"
  PIN_IN_DEEP_DIVE = "pin_in_deep_dive"
+ POSSIBLE_SEGMENTATION = "possible_segmentation"


  class DType(str, Enum):
@@ -420,3 +422,8 @@ def SHIELD_SCHEMA() -> DatasetSchema:

  SHIELD_RESPONSE_SCHEMA = create_shield_response_schema().to_base_type()
  SHIELD_PROMPT_SCHEMA = create_shield_prompt_schema().to_base_type()
+
+ SEGMENTATION_ALLOWED_DTYPES = [DType.INT, DType.BOOL, DType.STRING, DType.UUID]
+ SEGMENTATION_ALLOWED_COLUMN_TYPES = [
+ ScalarType(dtype=d_type) for d_type in SEGMENTATION_ALLOWED_DTYPES
+ ]
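
As a quick illustration of what the new constants expand to (a sketch that assumes ScalarType exposes the dtype field it is constructed with):

    from arthur_common.models.schema_definitions import (
        SEGMENTATION_ALLOWED_COLUMN_TYPES,
        SEGMENTATION_ALLOWED_DTYPES,
    )

    # four ScalarType entries, one each for INT, BOOL, STRING, and UUID
    assert [t.dtype for t in SEGMENTATION_ALLOWED_COLUMN_TYPES] == SEGMENTATION_ALLOWED_DTYPES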

src/arthur_common/models/shield.py
@@ -10,9 +10,7 @@ DEFAULT_PII_RULE_CONFIDENCE_SCORE_THRESHOLD = 0

  class RuleType(str, Enum):
  KEYWORD = "KeywordRule"
- MODEL_HALLUCINATION = "ModelHallucinationRule"
  MODEL_HALLUCINATION_V2 = "ModelHallucinationRuleV2"
- MODEL_HALLUCINATION_V3 = "ModelHallucinationRuleV3"
  MODEL_SENSITIVE_DATA = "ModelSensitiveDataRule"
  PII_DATA = "PIIDataRule"
  PROMPT_INJECTION = "PromptInjectionRule"
@@ -456,14 +454,6 @@ class NewRuleRequest(BaseModel):
  detail="PromptInjectionRule can only be enabled for prompt. Please set the 'apply_to_response' field "
  "to false.",
  )
- if (self.type == RuleType.MODEL_HALLUCINATION) and (
- self.apply_to_prompt is True
- ):
- raise HTTPException(
- status_code=400,
- detail="ModelHallucinationRule can only be enabled for response. Please set the 'apply_to_prompt' "
- "field to false.",
- )
  if (self.type == RuleType.MODEL_HALLUCINATION_V2) and (
  self.apply_to_prompt is True
  ):
@@ -472,14 +462,6 @@ class NewRuleRequest(BaseModel):
  detail="ModelHallucinationRuleV2 can only be enabled for response. Please set the 'apply_to_prompt' "
  "field to false.",
  )
- if (self.type == RuleType.MODEL_HALLUCINATION_V3) and (
- self.apply_to_prompt is True
- ):
- raise HTTPException(
- status_code=400,
- detail="ModelHallucinationRuleV3 can only be enabled for response. Please set the "
- "'apply_to_prompt' field to false.",
- )
  if (self.apply_to_prompt is False) and (self.apply_to_response is False):
  raise HTTPException(
  status_code=400,

src/arthur_common/models/task_job_specs.py
@@ -1,9 +1,10 @@
  from typing import Literal, Optional
  from uuid import UUID

- from arthur_common.models.shield import NewRuleRequest
  from pydantic import BaseModel, Field

+ from arthur_common.models.shield import NewRuleRequest
+
  onboarding_id_desc = "An identifier to assign to the created model to make it easy to retrieve. Used by the UI during the GenAI model creation flow."



src/arthur_common/tools/aggregation_analyzer.py
@@ -84,7 +84,7 @@ class FunctionAnalyzer:
  @staticmethod
  def _get_scope_metric_parameter_from_annotation(
  param_name: str,
- param_dtype: typing.Optional[DType],
+ param_dtype: DType,
  optional: bool,
  annotation: typing.Annotated, # type: ignore
  ) -> MetricsParameterSchemaUnion:

src/arthur_common/tools/duckdb_data_loader.py
@@ -3,6 +3,10 @@ from typing import Any

  import duckdb
  import pandas as pd
+ from dateutil.parser import parse
+ from fsspec import filesystem
+ from pydantic import BaseModel
+
  from arthur_common.models.datasets import DatasetJoinKind
  from arthur_common.models.schema_definitions import (
  DatasetListType,
@@ -11,9 +15,6 @@ from arthur_common.models.schema_definitions import (
  DatasetSchema,
  DType,
  )
- from dateutil.parser import parse
- from fsspec import filesystem
- from pydantic import BaseModel


  class ColumnFormat(BaseModel):

src/arthur_common/tools/duckdb_utils.py (new file)
@@ -0,0 +1,35 @@
+ import duckdb
+
+ from arthur_common.config.config import Config
+ from arthur_common.models.schema_definitions import SEGMENTATION_ALLOWED_DTYPES, DType
+ from arthur_common.tools.duckdb_data_loader import escape_identifier
+
+
+ def is_column_possible_segmentation(
+ conn: duckdb.DuckDBPyConnection,
+ table: str,
+ column_name: str,
+ column_dtype: DType,
+ ) -> bool:
+ """Returns whether column fits segmentation criteria:
+ 1. Has fewer than SEGMENTATION_COL_UNIQUE_VALUE_LIMIT unique values.
+ 2. Has an allowed DType.
+
+ PreReq: Table with column should already be loaded in DuckDB
+ """
+ segmentation_col_unique_val_limit = Config.segmentation_col_unique_values_limit()
+ if column_dtype not in SEGMENTATION_ALLOWED_DTYPES:
+ return False
+
+ # check column for unique value count
+ escaped_column = escape_identifier(column_name)
+
+ # count distinct values in this column
+ distinct_count_query = f"""
+ SELECT COUNT(DISTINCT {escaped_column}) as distinct_count
+ FROM {table}
+ """
+ result = conn.sql(distinct_count_query).fetchone()
+ distinct_count = result[0] if result else 0
+
+ return distinct_count < segmentation_col_unique_val_limit
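
A small usage sketch of the new helper, assuming the released package is installed; the connection, table, and column below are made up for illustration:

    import duckdb

    from arthur_common.models.schema_definitions import DType
    from arthur_common.tools.duckdb_utils import is_column_possible_segmentation

    conn = duckdb.connect()
    conn.sql("CREATE TABLE inferences AS SELECT 'us-east' AS region UNION ALL SELECT 'eu-west'")

    # True: STRING is an allowed dtype and the column has far fewer than the configured 100 distinct values
    print(is_column_possible_segmentation(conn, "inferences", "region", DType.STRING))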
File without changes

src/arthur_common/tools/schema_inferer.py
@@ -2,6 +2,7 @@ from typing import Any
  from uuid import uuid4

  import pandas as pd
+
  from arthur_common.models.schema_definitions import (
  DatasetColumn,
  DatasetListType,
@@ -12,6 +13,7 @@ from arthur_common.models.schema_definitions import (
  ScopeSchemaTag,
  )
  from arthur_common.tools.duckdb_data_loader import DuckDBOperator, escape_identifier
+ from arthur_common.tools.duckdb_utils import is_column_possible_segmentation


  class SchemaInferer:
@@ -38,14 +40,21 @@ class SchemaInferer:
  self.conn.sql(
  f"CREATE OR REPLACE TEMP TABLE {escaped_col} AS SELECT UNNEST({escaped_col}) as {escaped_col} FROM {table}",
  )
- return self._infer_schema(escaped_col)
+ return self._infer_schema(escaped_col, is_nested_col=True)

- def _infer_schema(self, table: str = "root") -> DatasetObjectType:
+ def _infer_schema(
+ self,
+ table: str = "root",
+ is_nested_col: bool = False,
+ ) -> DatasetObjectType:
+ """is_nested_col indicates whether the function is being called on an unnested/flattened table that represents
+ a struct column or list column in the root table."""
  ddb_schema: list[tuple[Any, Any, Any]] = self.conn.sql(
  f"DESCRIBE {table}",
  ).fetchall()

  obj = DatasetObjectType(id=uuid4(), object={}, nullable=False)
+ # object has a dict of each column
  timestamp_cols = []

  for column in ddb_schema:
@@ -94,6 +103,18 @@ class SchemaInferer:
  timestamp_cols.append(scalar_schema)
  case _:
  raise NotImplementedError(f"Type {col_type} not mappable.")
+
+ # tag column as a possible segmentation column if it meets criteria
+ # we only support top-level column aggregations right now (ie you can't aggregate on a nested column)
+ # so we don't want to tag nested columns as possible segmentation columns
+ if not is_nested_col and is_column_possible_segmentation(
+ self.conn,
+ table,
+ col_name,
+ scalar_schema.dtype,
+ ):
+ scalar_schema.tag_hints.append(ScopeSchemaTag.POSSIBLE_SEGMENTATION)
+
  obj.object[col_name] = scalar_schema

  # auto assign primary timestamp tag if there's only one timestamp column
File without changes