arthur_common-1.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of arthur-common might be problematic.

Files changed (40)
  1. arthur_common/__init__.py +0 -0
  2. arthur_common/__version__.py +1 -0
  3. arthur_common/aggregations/__init__.py +2 -0
  4. arthur_common/aggregations/aggregator.py +214 -0
  5. arthur_common/aggregations/functions/README.md +26 -0
  6. arthur_common/aggregations/functions/__init__.py +25 -0
  7. arthur_common/aggregations/functions/categorical_count.py +89 -0
  8. arthur_common/aggregations/functions/confusion_matrix.py +412 -0
  9. arthur_common/aggregations/functions/inference_count.py +69 -0
  10. arthur_common/aggregations/functions/inference_count_by_class.py +206 -0
  11. arthur_common/aggregations/functions/inference_null_count.py +82 -0
  12. arthur_common/aggregations/functions/mean_absolute_error.py +110 -0
  13. arthur_common/aggregations/functions/mean_squared_error.py +110 -0
  14. arthur_common/aggregations/functions/multiclass_confusion_matrix.py +205 -0
  15. arthur_common/aggregations/functions/multiclass_inference_count_by_class.py +90 -0
  16. arthur_common/aggregations/functions/numeric_stats.py +90 -0
  17. arthur_common/aggregations/functions/numeric_sum.py +87 -0
  18. arthur_common/aggregations/functions/py.typed +0 -0
  19. arthur_common/aggregations/functions/shield_aggregations.py +752 -0
  20. arthur_common/aggregations/py.typed +0 -0
  21. arthur_common/models/__init__.py +0 -0
  22. arthur_common/models/connectors.py +41 -0
  23. arthur_common/models/datasets.py +22 -0
  24. arthur_common/models/metrics.py +227 -0
  25. arthur_common/models/py.typed +0 -0
  26. arthur_common/models/schema_definitions.py +420 -0
  27. arthur_common/models/shield.py +504 -0
  28. arthur_common/models/task_job_specs.py +78 -0
  29. arthur_common/py.typed +0 -0
  30. arthur_common/tools/__init__.py +0 -0
  31. arthur_common/tools/aggregation_analyzer.py +243 -0
  32. arthur_common/tools/aggregation_loader.py +59 -0
  33. arthur_common/tools/duckdb_data_loader.py +329 -0
  34. arthur_common/tools/functions.py +46 -0
  35. arthur_common/tools/py.typed +0 -0
  36. arthur_common/tools/schema_inferer.py +104 -0
  37. arthur_common/tools/time_utils.py +33 -0
  38. arthur_common-1.0.1.dist-info/METADATA +74 -0
  39. arthur_common-1.0.1.dist-info/RECORD +40 -0
  40. arthur_common-1.0.1.dist-info/WHEEL +4 -0
arthur_common/__init__.py (file without changes)
arthur_common/__version__.py
@@ -0,0 +1 @@
+ __version__ = "1.0.1"
arthur_common/aggregations/__init__.py
@@ -0,0 +1,2 @@
+ from .aggregator import * # noqa
+ from .functions import * # noqa
arthur_common/aggregations/aggregator.py
@@ -0,0 +1,214 @@
+ from abc import ABC, abstractmethod
+ from base64 import b64encode
+ from typing import Any, Type, Union
+
+ import pandas as pd
+ from arthur_common.models.metrics import *
+ from datasketches import kll_floats_sketch
+ from duckdb import DuckDBPyConnection
+
+
+ class AggregationFunction(ABC):
+     @staticmethod
+     @abstractmethod
+     def id() -> UUID:
+         raise NotImplementedError
+
+     @staticmethod
+     @abstractmethod
+     def display_name() -> str:
+         raise NotImplementedError
+
+     @staticmethod
+     @abstractmethod
+     def description() -> str:
+         raise NotImplementedError
+
+     @abstractmethod
+     def aggregation_type(self) -> Type[SketchMetric] | Type[NumericMetric]:
+         raise NotImplementedError
+
+     @abstractmethod
+     def aggregate(
+         self,
+         ddb_conn: DuckDBPyConnection,
+         *args: Any,
+         **kwargs: Any,
+     ) -> Union[list[SketchMetric], list[NumericMetric]]:
+         raise NotImplementedError
+
+     @staticmethod
+     def string_to_dimension(name: str, value: str | None) -> Dimension:
+         if value is None:
+             value = "null"
+         return Dimension(name=name, value=str(value))
+
+
+ class NumericAggregationFunction(AggregationFunction, ABC):
+     def aggregation_type(self) -> Type[NumericMetric]:
+         return NumericMetric
+
+     @abstractmethod
+     def aggregate(
+         self,
+         ddb_conn: DuckDBPyConnection,
+         *args: Any,
+         **kwargs: Any,
+     ) -> list[NumericMetric]:
+         raise NotImplementedError
+
+     @staticmethod
+     def group_query_results_to_numeric_metrics(
+         data: pd.DataFrame,
+         value_col: str,
+         dim_columns: list[str],
+         timestamp_col: str,
+     ) -> list[NumericTimeSeries]:
+         """
+         Convert a grouped dataframe with repeated dimensions to internal numeric metric definition.
+
+         At a high level, the query results are already grouped, however,
+         the order isn't guaranteed that groups are sequential (this requires an explicit ORDER BY on the source query.)
+         What this function does is group by the indicated dimensions list, and from each group extract the dimension values once.
+         From there, iterate over the group turning each data point to a *Point. At the end, this single instance of the group metrics
+         and the list of points (values) are merged to one *TimeSeries
+         """
+         calculated_metrics: list[NumericTimeSeries] = []
+         # make sure dropna is False or rows with "null" as a dimension value will be dropped
+         groups = data.groupby(dim_columns, dropna=False)
+         for _, group in groups:
+             dimensions: list[Dimension] = []
+             # Get the first row of the group to determine the group level dimensions
+             dims_row = group.iloc[0]
+             for dim in dim_columns:
+                 d = AggregationFunction.string_to_dimension(
+                     name=dim,
+                     value=dims_row[dim],
+                 )
+                 dimensions.append(d)
+
+             values: list[NumericPoint] = []
+             for _, row in group.iterrows():
+                 values.append(
+                     NumericPoint(timestamp=row[timestamp_col], value=row[value_col]),
+                 )
+             calculated_metrics.append(
+                 NumericTimeSeries(values=values, dimensions=dimensions),
+             )
+
+         return calculated_metrics
+
+     @staticmethod
+     def dimensionless_query_results_to_numeric_metrics(
+         data: pd.DataFrame,
+         value_col: str,
+         timestamp_col: str,
+     ) -> NumericTimeSeries:
+         """
+         Convert a dimensionless time / value series to internal numeric metric definition.
+         """
+         values: list[NumericPoint] = []
+         for _, row in data.iterrows():
+             values.append(
+                 NumericPoint(timestamp=row[timestamp_col], value=row[value_col]),
+             )
+         return NumericTimeSeries(values=values, dimensions=[])
+
+     def series_to_metric(
+         self,
+         metric_name: str,
+         series: list[NumericTimeSeries],
+     ) -> NumericMetric:
+         return NumericMetric(name=metric_name, numeric_series=series)
+
+
+ class SketchAggregationFunction(AggregationFunction, ABC):
+     def aggregation_type(self) -> Type[SketchMetric]:
+         return SketchMetric
+
+     @abstractmethod
+     def aggregate(
+         self,
+         ddb_conn: DuckDBPyConnection,
+         *args: Any,
+         **kwargs: Any,
+     ) -> list[SketchMetric]:
+         raise NotImplementedError
+
+     def group_query_results_to_sketch_metrics(
+         self,
+         data: pd.DataFrame,
+         value_col: str,
+         dim_columns: list[str],
+         timestamp_col: str,
+     ) -> list[SketchTimeSeries]:
+         """
+         Convert a grouped dataframe with repeated dimensions to internal sketch metric definition.
+
+         For sketch data, what we're doing is grouping the raw row data into the dimensions we care about.
+         Within each group, we extract the dimensions once. Within this single dimension group,
+         we group the data into 5min intervals. Within each interval, the data point we care to sketch is added to the sketch.
+
+         """
+
+         calculated_metrics: list[SketchTimeSeries] = []
+         # make sure dropna is False or rows with "null" as a dimension value will be dropped
+         groups = data.groupby(dim_columns, dropna=False)
+         for _, group in groups:
+             calculated_metrics.append(
+                 self._group_to_series(group, timestamp_col, dim_columns, value_col),
+             )
+
+         return calculated_metrics
+
+     @staticmethod
+     def _group_to_series(
+         group: pd.DataFrame,
+         timestamp_col: str,
+         dim_columns: list[str],
+         value_col: str,
+     ) -> SketchTimeSeries:
+         def to_sketch(col: pd.Series) -> Optional[kll_floats_sketch]:
+             if not len(col):
+                 return None
+             s = kll_floats_sketch()
+             for v in col.values:
+                 s.update(v)
+             return s
+
+         dimensions: list[Dimension] = []
+         # Get the first row of the group to determine the group level dimensions
+         dims_row = group.iloc[0]
+         for dim in dim_columns:
+             d = AggregationFunction.string_to_dimension(name=dim, value=dims_row[dim])
+             dimensions.append(d)
+
+         values: list[SketchPoint] = []
+
+         # Group query results into 5min buckets
+         group[timestamp_col] = pd.to_datetime(group[timestamp_col])
+         group.set_index(timestamp_col, inplace=True)
+         # make sure dropna is False or rows with "null" as a dimension value will be dropped
+         time_bucketed_groups = group.groupby(pd.Grouper(freq="5min"), dropna=False)
+
+         for group_timestamp, time_bucket_group in time_bucketed_groups:
+             # Don't generate metrics on empty buckets
+             if time_bucket_group.empty:
+                 continue
+             sketch = to_sketch(time_bucket_group[value_col])
+             if sketch is not None:
+                 values.append(
+                     SketchPoint(
+                         timestamp=group_timestamp,
+                         value=b64encode(sketch.serialize()).decode(),
+                     ),
+                 )
+
+         return SketchTimeSeries(values=values, dimensions=dimensions)
+
+     def series_to_metric(
+         self,
+         metric_name: str,
+         series: list[SketchTimeSeries],
+     ) -> SketchMetric:
+         return SketchMetric(name=metric_name, sketch_series=series)
arthur_common/aggregations/functions/README.md
@@ -0,0 +1,26 @@
+ | Class Name | UUID | Name |
+ |------------|------|------|
+ | BinaryClassifierCountThresholdClassAggregationFunction | 00000000-0000-0000-0000-000000000020 | Binary Classification Count by Class - Probability Threshold |
+ | BinaryClassifierCountByClassAggregationFunction | 00000000-0000-0000-0000-00000000001f | Binary Classification Count by Class - Class Label |
+ | BinaryClassifierProbabilityThresholdConfusionMatrixAggregationFunction | 00000000-0000-0000-0000-00000000001e | Binary Classification Confusion Matrix - Probability Threshold |
+ | BinaryClassifierStringLabelConfusionMatrixAggregationFunction | 00000000-0000-0000-0000-00000000001d | Binary Classification Confusion Matrix - String Types |
+ | BinaryClassifierIntBoolConfusionMatrixAggregationFunction | 00000000-0000-0000-0000-00000000001c | Binary Classification Confusion Matrix - Int/Bool Types |
+ | NumericSumAggregationFunction | 00000000-0000-0000-0000-00000000000f | Numeric Sum |
+ | MeanAbsoluteErrorAggregationFunction | 00000000-0000-0000-0000-00000000000e | Mean Absolute Error |
+ | MeanSquaredErrorAggregationFunction | 00000000-0000-0000-0000-000000000010 | Mean Squared Error |
+ | NumericSketchAggregationFunction | 00000000-0000-0000-0000-00000000000d | Numeric Distribution |
+ | CategoricalCountAggregationFunction | 00000000-0000-0000-0000-00000000000c | Category Count |
+ | InferenceNullCountAggregationFunction | 00000000-0000-0000-0000-00000000000b | Null Value Count |
+ | InferenceCountAggregationFunction | 00000000-0000-0000-0000-00000000000a | Inference Count |
+ | ShieldInferenceRuleLatencyAggregation | 00000000-0000-0000-0000-000000000009 | Rule Latency Distribution |
+ | ShieldInferenceRuleClaimFailCountAggregation | 00000000-0000-0000-0000-000000000008 | Claim Count Distribution - Invalid Claims |
+ | ShieldInferenceRuleClaimPassCountAggregation | 00000000-0000-0000-0000-000000000007 | Claim Count Distribution - Valid Claims |
+ | ShieldInferenceRuleClaimCountAggregation | 00000000-0000-0000-0000-000000000006 | Claim Count Distribution |
+ | ShieldInferenceRulePIIDataScoreAggregation | 00000000-0000-0000-0000-000000000005 | PII Score Distribution |
+ | ShieldInferenceRuleToxicityScoreAggregation | 00000000-0000-0000-0000-000000000004 | Toxicity Distribution |
+ | ShieldInferenceHallucinationCountAggregation | 00000000-0000-0000-0000-000000000003 | Hallucination Count |
+ | ShieldInferenceRuleCountAggregation | 00000000-0000-0000-0000-000000000002 | Rule Result Count |
+ | ShieldInferencePassFailCountAggregation | 00000000-0000-0000-0000-000000000001 | Inference Count |
+ | ShieldInferenceTokenCountAggregation | 00000000-0000-0000-0000-000000000021 | Token Count |
+ | MulticlassClassifierCountByClassAggregationFunction | 64a338fb-6c99-4c40-ba39-81ab8baa8687 | Multiclass Classification Count by Class - Class Label |
+ | MulticlassClassifierStringLabelSingleClassConfusionMatrixAggregationFunction | dc728927-6928-4a3b-b174-8c1ec8b58d62 | Multiclass Classification Confusion Matrix Single Class - String Class Label Prediction |
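
Each UUID in this table is the value returned by the corresponding class's static id() method, so a caller can resolve an aggregation from its UUID by walking the AggregationFunction class hierarchy. The sketch below is illustrative only; the package ships its own loader in arthur_common/tools/aggregation_loader.py, which is not reproduced in this diff.

# Illustrative lookup sketch, not the package's own aggregation_loader.
from typing import Optional, Type
from uuid import UUID

import arthur_common.aggregations.functions  # noqa: F401  # triggers loading of the concrete classes
from arthur_common.aggregations.aggregator import AggregationFunction


def _all_subclasses(cls: type) -> set[type]:
    # Recurse so indirect subclasses (e.g. of NumericAggregationFunction) are included.
    subs = set(cls.__subclasses__())
    for sub in cls.__subclasses__():
        subs |= _all_subclasses(sub)
    return subs


def find_aggregation(agg_id: UUID) -> Optional[Type[AggregationFunction]]:
    for cls in _all_subclasses(AggregationFunction):
        try:
            if cls.id() == agg_id:
                return cls
        except NotImplementedError:
            continue  # abstract intermediates don't implement id()
    return None


# find_aggregation(UUID("00000000-0000-0000-0000-00000000000c")) should resolve
# to CategoricalCountAggregationFunction per the table above.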
arthur_common/aggregations/functions/__init__.py
@@ -0,0 +1,25 @@
+ import importlib.util
+ import inspect
+ import os
+
+ package_dir = os.path.dirname(__file__)
+
+ # Peter 05/08/2024: This is some code I swiped from stackoverflow that iterated through the package directory here looking at .py files
+ # It reads each file and imports the classes to add them to the "globals" which we can think of as importing into this namespace
+ # By doing that, everything is exported and ready to be read as members of this `functions` package.
+ # TLDR: this does what you would think `from . import *` does
+ # Benefit here is any file with any class is added to the "exports", so nothing needs to be done after dropping a file in here
+ for filename in os.listdir(package_dir):
+     if filename.endswith(".py") and filename != "__init__.py":
+         module_name = filename[:-3] # Remove the .py extension to get the module name
+         module_path = os.path.join(package_dir, filename)
+
+         spec = importlib.util.spec_from_file_location(module_name, module_path)
+         if not spec:
+             continue
+         module = importlib.util.module_from_spec(spec)
+         if spec.loader:
+             spec.loader.exec_module(module)
+         for name, value in module.__dict__.items():
+             if inspect.isclass(value) and not name.startswith("_"):
+                 globals()[name] = value
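
Because of this loader, a new aggregation file dropped into functions/ is importable with no extra wiring. A quick hypothetical usage, relying on the categorical_count.py module shown below:

from arthur_common.aggregations import functions

# CategoricalCountAggregationFunction is defined in categorical_count.py but is
# reachable directly off the functions package thanks to the dynamic loader.
agg = functions.CategoricalCountAggregationFunction()
print(agg.display_name())  # "Category Count"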
arthur_common/aggregations/functions/categorical_count.py
@@ -0,0 +1,89 @@
+ from typing import Annotated
+ from uuid import UUID
+
+ from arthur_common.aggregations.aggregator import NumericAggregationFunction
+ from arthur_common.models.metrics import DatasetReference, NumericMetric
+ from arthur_common.models.schema_definitions import (
+     DType,
+     MetricColumnParameterAnnotation,
+     MetricDatasetParameterAnnotation,
+     ScalarType,
+     ScopeSchemaTag,
+ )
+ from arthur_common.tools.duckdb_data_loader import escape_identifier, escape_str_literal
+ from duckdb import DuckDBPyConnection
+
+
+ class CategoricalCountAggregationFunction(NumericAggregationFunction):
+     METRIC_NAME = "categorical_count"
+
+     @staticmethod
+     def id() -> UUID:
+         return UUID("00000000-0000-0000-0000-00000000000c")
+
+     @staticmethod
+     def display_name() -> str:
+         return "Category Count"
+
+     @staticmethod
+     def description() -> str:
+         return "Metric that counts the number of discrete values of each category in a string column. Creates a separate dimension for each category and the values are the count of occurrences of that category in the time window."
+
+     def aggregate(
+         self,
+         ddb_conn: DuckDBPyConnection,
+         dataset: Annotated[
+             DatasetReference,
+             MetricDatasetParameterAnnotation(
+                 friendly_name="Dataset",
+                 description="The dataset containing some categorical data.",
+             ),
+         ],
+         timestamp_col: Annotated[
+             str,
+             MetricColumnParameterAnnotation(
+                 source_dataset_parameter_key="dataset",
+                 allowed_column_types=[
+                     ScalarType(dtype=DType.TIMESTAMP),
+                 ],
+                 tag_hints=[ScopeSchemaTag.PRIMARY_TIMESTAMP],
+                 friendly_name="Timestamp Column",
+                 description="A column containing timestamp values to bucket by.",
+             ),
+         ],
+         categorical_col: Annotated[
+             str,
+             MetricColumnParameterAnnotation(
+                 source_dataset_parameter_key="dataset",
+                 allowed_column_types=[
+                     ScalarType(dtype=DType.STRING),
+                     ScalarType(dtype=DType.INT),
+                 ],
+                 tag_hints=[ScopeSchemaTag.CATEGORICAL],
+                 friendly_name="Categorical Column",
+                 description="A column containing categorical values to count.",
+             ),
+         ],
+     ) -> list[NumericMetric]:
+         timestamp_col_escaped = escape_identifier(timestamp_col)
+         categorical_col_escaped = escape_identifier(categorical_col)
+         categorical_col_name_escaped = escape_str_literal(categorical_col)
+         count_query = f" \
+             select time_bucket(INTERVAL '5 minutes', {timestamp_col_escaped}) as ts, \
+             count(*) as count, \
+             {categorical_col_escaped} as category, \
+             {categorical_col_name_escaped} as column_name \
+             from {dataset.dataset_table_name} \
+             where ts is not null \
+             group by ts, category \
+         "
+         results = ddb_conn.sql(count_query).df()
+
+         series = self.group_query_results_to_numeric_metrics(
+             results,
+             "count",
+             ["column_name", "category"],
+             timestamp_col="ts",
+         )
+         metric = self.series_to_metric(self.METRIC_NAME, series)
+         return [metric]
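
The query in this aggregation buckets rows into 5-minute windows with DuckDB's time_bucket function and counts rows per (window, category) pair; the category value and the column name (injected as a string literal) become the metric's dimensions. Below is a standalone sketch of the same query pattern run against a throwaway in-memory table; the table and column names are illustrative, not taken from the package.

# Minimal, self-contained illustration of the query pattern used above.
import duckdb
import pandas as pd

conn = duckdb.connect()
conn.register(
    "inferences",
    pd.DataFrame(
        {
            "event_ts": pd.to_datetime(["2024-05-08 10:01", "2024-05-08 10:03", "2024-05-08 10:07"]),
            "label": ["cat", "dog", "cat"],
        },
    ),
)

df = conn.sql(
    """
    select time_bucket(INTERVAL '5 minutes', event_ts) as ts,
           count(*) as count,
           label as category,
           'label' as column_name
    from inferences
    where ts is not null
    group by ts, category
    """
).df()
print(df)  # one row per (5-minute bucket, category) with its count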