arthur_common-1.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of arthur-common might be problematic.

Files changed (40)
  1. arthur_common/__init__.py +0 -0
  2. arthur_common/__version__.py +1 -0
  3. arthur_common/aggregations/__init__.py +2 -0
  4. arthur_common/aggregations/aggregator.py +214 -0
  5. arthur_common/aggregations/functions/README.md +26 -0
  6. arthur_common/aggregations/functions/__init__.py +25 -0
  7. arthur_common/aggregations/functions/categorical_count.py +89 -0
  8. arthur_common/aggregations/functions/confusion_matrix.py +412 -0
  9. arthur_common/aggregations/functions/inference_count.py +69 -0
  10. arthur_common/aggregations/functions/inference_count_by_class.py +206 -0
  11. arthur_common/aggregations/functions/inference_null_count.py +82 -0
  12. arthur_common/aggregations/functions/mean_absolute_error.py +110 -0
  13. arthur_common/aggregations/functions/mean_squared_error.py +110 -0
  14. arthur_common/aggregations/functions/multiclass_confusion_matrix.py +205 -0
  15. arthur_common/aggregations/functions/multiclass_inference_count_by_class.py +90 -0
  16. arthur_common/aggregations/functions/numeric_stats.py +90 -0
  17. arthur_common/aggregations/functions/numeric_sum.py +87 -0
  18. arthur_common/aggregations/functions/py.typed +0 -0
  19. arthur_common/aggregations/functions/shield_aggregations.py +752 -0
  20. arthur_common/aggregations/py.typed +0 -0
  21. arthur_common/models/__init__.py +0 -0
  22. arthur_common/models/connectors.py +41 -0
  23. arthur_common/models/datasets.py +22 -0
  24. arthur_common/models/metrics.py +227 -0
  25. arthur_common/models/py.typed +0 -0
  26. arthur_common/models/schema_definitions.py +420 -0
  27. arthur_common/models/shield.py +504 -0
  28. arthur_common/models/task_job_specs.py +78 -0
  29. arthur_common/py.typed +0 -0
  30. arthur_common/tools/__init__.py +0 -0
  31. arthur_common/tools/aggregation_analyzer.py +243 -0
  32. arthur_common/tools/aggregation_loader.py +59 -0
  33. arthur_common/tools/duckdb_data_loader.py +329 -0
  34. arthur_common/tools/functions.py +46 -0
  35. arthur_common/tools/py.typed +0 -0
  36. arthur_common/tools/schema_inferer.py +104 -0
  37. arthur_common/tools/time_utils.py +33 -0
  38. arthur_common-1.0.1.dist-info/METADATA +74 -0
  39. arthur_common-1.0.1.dist-info/RECORD +40 -0
  40. arthur_common-1.0.1.dist-info/WHEEL +4 -0
arthur_common/__init__.py (file without changes)
arthur_common/__version__.py
@@ -0,0 +1 @@
+ __version__ = "1.0.1"
arthur_common/aggregations/__init__.py
@@ -0,0 +1,2 @@
+ from .aggregator import * # noqa
+ from .functions import * # noqa
arthur_common/aggregations/aggregator.py
@@ -0,0 +1,214 @@
+ from abc import ABC, abstractmethod
+ from base64 import b64encode
+ from typing import Any, Type, Union
+
+ import pandas as pd
+ from arthur_common.models.metrics import *
+ from datasketches import kll_floats_sketch
+ from duckdb import DuckDBPyConnection
+
+
+ class AggregationFunction(ABC):
+     @staticmethod
+     @abstractmethod
+     def id() -> UUID:
+         raise NotImplementedError
+
+     @staticmethod
+     @abstractmethod
+     def display_name() -> str:
+         raise NotImplementedError
+
+     @staticmethod
+     @abstractmethod
+     def description() -> str:
+         raise NotImplementedError
+
+     @abstractmethod
+     def aggregation_type(self) -> Type[SketchMetric] | Type[NumericMetric]:
+         raise NotImplementedError
+
+     @abstractmethod
+     def aggregate(
+         self,
+         ddb_conn: DuckDBPyConnection,
+         *args: Any,
+         **kwargs: Any,
+     ) -> Union[list[SketchMetric], list[NumericMetric]]:
+         raise NotImplementedError
+
+     @staticmethod
+     def string_to_dimension(name: str, value: str | None) -> Dimension:
+         if value is None:
+             value = "null"
+         return Dimension(name=name, value=str(value))
+
+
+ class NumericAggregationFunction(AggregationFunction, ABC):
+     def aggregation_type(self) -> Type[NumericMetric]:
+         return NumericMetric
+
+     @abstractmethod
+     def aggregate(
+         self,
+         ddb_conn: DuckDBPyConnection,
+         *args: Any,
+         **kwargs: Any,
+     ) -> list[NumericMetric]:
+         raise NotImplementedError
+
+     @staticmethod
+     def group_query_results_to_numeric_metrics(
+         data: pd.DataFrame,
+         value_col: str,
+         dim_columns: list[str],
+         timestamp_col: str,
+     ) -> list[NumericTimeSeries]:
+         """
+         Convert a grouped dataframe with repeated dimensions to internal numeric metric definition.
+
+         At a high level, the query results are already grouped, however,
+         the order isn't guaranteed that groups are sequential (this requires an explicit ORDER BY on the source query.)
+         What this function does is group by the indicated dimensions list, and from each group extract the dimension values once.
+         From there, iterate over the group turning each data point to a *Point. At the end, this single instance of the group metrics
+         and the list of points (values) are merged to one *TimeSeries
+         """
+         calculated_metrics: list[NumericTimeSeries] = []
+         # make sure dropna is False or rows with "null" as a dimension value will be dropped
+         groups = data.groupby(dim_columns, dropna=False)
+         for _, group in groups:
+             dimensions: list[Dimension] = []
+             # Get the first row of the group to determine the group level dimensions
+             dims_row = group.iloc[0]
+             for dim in dim_columns:
+                 d = AggregationFunction.string_to_dimension(
+                     name=dim,
+                     value=dims_row[dim],
+                 )
+                 dimensions.append(d)
+
+             values: list[NumericPoint] = []
+             for _, row in group.iterrows():
+                 values.append(
+                     NumericPoint(timestamp=row[timestamp_col], value=row[value_col]),
+                 )
+             calculated_metrics.append(
+                 NumericTimeSeries(values=values, dimensions=dimensions),
+             )
+
+         return calculated_metrics
+
+     @staticmethod
+     def dimensionless_query_results_to_numeric_metrics(
+         data: pd.DataFrame,
+         value_col: str,
+         timestamp_col: str,
+     ) -> NumericTimeSeries:
+         """
+         Convert a dimensionless time / value series to internal numeric metric definition.
+         """
+         values: list[NumericPoint] = []
+         for _, row in data.iterrows():
+             values.append(
+                 NumericPoint(timestamp=row[timestamp_col], value=row[value_col]),
+             )
+         return NumericTimeSeries(values=values, dimensions=[])
+
+     def series_to_metric(
+         self,
+         metric_name: str,
+         series: list[NumericTimeSeries],
+     ) -> NumericMetric:
+         return NumericMetric(name=metric_name, numeric_series=series)
+
+
+ class SketchAggregationFunction(AggregationFunction, ABC):
+     def aggregation_type(self) -> Type[SketchMetric]:
+         return SketchMetric
+
+     @abstractmethod
+     def aggregate(
+         self,
+         ddb_conn: DuckDBPyConnection,
+         *args: Any,
+         **kwargs: Any,
+     ) -> list[SketchMetric]:
+         raise NotImplementedError
+
+     def group_query_results_to_sketch_metrics(
+         self,
+         data: pd.DataFrame,
+         value_col: str,
+         dim_columns: list[str],
+         timestamp_col: str,
+     ) -> list[SketchTimeSeries]:
+         """
+         Convert a grouped dataframe with repeated dimensions to internal sketch metric definition.
+
+         For sketch data, what we're doing is grouping the raw row data into the dimensions we care about.
+         Within each group, we extract the dimensions once. Within this single dimension group,
+         we group the data into 5min intervals. Within each interval, the data point we care to sketch is added to the sketch.
+
+         """
+
+         calculated_metrics: list[SketchTimeSeries] = []
+         # make sure dropna is False or rows with "null" as a dimension value will be dropped
+         groups = data.groupby(dim_columns, dropna=False)
+         for _, group in groups:
+             calculated_metrics.append(
+                 self._group_to_series(group, timestamp_col, dim_columns, value_col),
+             )
+
+         return calculated_metrics
+
+     @staticmethod
+     def _group_to_series(
+         group: pd.DataFrame,
+         timestamp_col: str,
+         dim_columns: list[str],
+         value_col: str,
+     ) -> SketchTimeSeries:
+         def to_sketch(col: pd.Series) -> Optional[kll_floats_sketch]:
+             if not len(col):
+                 return None
+             s = kll_floats_sketch()
+             for v in col.values:
+                 s.update(v)
+             return s
+
+         dimensions: list[Dimension] = []
+         # Get the first row of the group to determine the group level dimensions
+         dims_row = group.iloc[0]
+         for dim in dim_columns:
+             d = AggregationFunction.string_to_dimension(name=dim, value=dims_row[dim])
+             dimensions.append(d)
+
+         values: list[SketchPoint] = []
+
+         # Group query results into 5min buckets
+         group[timestamp_col] = pd.to_datetime(group[timestamp_col])
+         group.set_index(timestamp_col, inplace=True)
+         # make sure dropna is False or rows with "null" as a dimension value will be dropped
+         time_bucketed_groups = group.groupby(pd.Grouper(freq="5min"), dropna=False)
+
+         for group_timestamp, time_bucket_group in time_bucketed_groups:
+             # Don't generate metrics on empty buckets
+             if time_bucket_group.empty:
+                 continue
+             sketch = to_sketch(time_bucket_group[value_col])
+             if sketch is not None:
+                 values.append(
+                     SketchPoint(
+                         timestamp=group_timestamp,
+                         value=b64encode(sketch.serialize()).decode(),
+                     ),
+                 )
+
+         return SketchTimeSeries(values=values, dimensions=dimensions)
+
+     def series_to_metric(
+         self,
+         metric_name: str,
+         series: list[SketchTimeSeries],
+     ) -> SketchMetric:
+         return SketchMetric(name=metric_name, sketch_series=series)
arthur_common/aggregations/functions/README.md
@@ -0,0 +1,26 @@
+ | Class Name | UUID | Name |
+ |------------|------|------|
+ | BinaryClassifierCountThresholdClassAggregationFunction | 00000000-0000-0000-0000-000000000020 | Binary Classification Count by Class - Probability Threshold |
+ | BinaryClassifierCountByClassAggregationFunction | 00000000-0000-0000-0000-00000000001f | Binary Classification Count by Class - Class Label |
+ | BinaryClassifierProbabilityThresholdConfusionMatrixAggregationFunction | 00000000-0000-0000-0000-00000000001e | Binary Classification Confusion Matrix - Probability Threshold |
+ | BinaryClassifierStringLabelConfusionMatrixAggregationFunction | 00000000-0000-0000-0000-00000000001d | Binary Classification Confusion Matrix - String Types |
+ | BinaryClassifierIntBoolConfusionMatrixAggregationFunction | 00000000-0000-0000-0000-00000000001c | Binary Classification Confusion Matrix - Int/Bool Types |
+ | NumericSumAggregationFunction | 00000000-0000-0000-0000-00000000000f | Numeric Sum |
+ | MeanAbsoluteErrorAggregationFunction | 00000000-0000-0000-0000-00000000000e | Mean Absolute Error |
+ | MeanSquaredErrorAggregationFunction | 00000000-0000-0000-0000-000000000010 | Mean Squared Error |
+ | NumericSketchAggregationFunction | 00000000-0000-0000-0000-00000000000d | Numeric Distribution |
+ | CategoricalCountAggregationFunction | 00000000-0000-0000-0000-00000000000c | Category Count |
+ | InferenceNullCountAggregationFunction | 00000000-0000-0000-0000-00000000000b | Null Value Count |
+ | InferenceCountAggregationFunction | 00000000-0000-0000-0000-00000000000a | Inference Count |
+ | ShieldInferenceRuleLatencyAggregation | 00000000-0000-0000-0000-000000000009 | Rule Latency Distribution |
+ | ShieldInferenceRuleClaimFailCountAggregation | 00000000-0000-0000-0000-000000000008 | Claim Count Distribution - Invalid Claims |
+ | ShieldInferenceRuleClaimPassCountAggregation | 00000000-0000-0000-0000-000000000007 | Claim Count Distribution - Valid Claims |
+ | ShieldInferenceRuleClaimCountAggregation | 00000000-0000-0000-0000-000000000006 | Claim Count Distribution |
+ | ShieldInferenceRulePIIDataScoreAggregation | 00000000-0000-0000-0000-000000000005 | PII Score Distribution |
+ | ShieldInferenceRuleToxicityScoreAggregation | 00000000-0000-0000-0000-000000000004 | Toxicity Distribution |
+ | ShieldInferenceHallucinationCountAggregation | 00000000-0000-0000-0000-000000000003 | Hallucination Count |
+ | ShieldInferenceRuleCountAggregation | 00000000-0000-0000-0000-000000000002 | Rule Result Count |
+ | ShieldInferencePassFailCountAggregation | 00000000-0000-0000-0000-000000000001 | Inference Count |
+ | ShieldInferenceTokenCountAggregation | 00000000-0000-0000-0000-000000000021 | Token Count |
+ | MulticlassClassifierCountByClassAggregationFunction | 64a338fb-6c99-4c40-ba39-81ab8baa8687 | Multiclass Classification Count by Class - Class Label |
+ | MulticlassClassifierStringLabelSingleClassConfusionMatrixAggregationFunction | dc728927-6928-4a3b-b174-8c1ec8b58d62 | Multiclass Classification Confusion Matrix Single Class - String Class Label Prediction |
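
Each UUID in this table is the value returned by the corresponding class's static id() method, so a caller can resolve an aggregation from its UUID by walking the AggregationFunction class hierarchy. The sketch below is illustrative only; the package ships its own loader in arthur_common/tools/aggregation_loader.py, which is not reproduced in this diff.

# Illustrative lookup sketch, not the package's own aggregation_loader.
from typing import Optional, Type
from uuid import UUID

import arthur_common.aggregations.functions  # noqa: F401  # triggers loading of the concrete classes
from arthur_common.aggregations.aggregator import AggregationFunction


def _all_subclasses(cls: type) -> set[type]:
    # Recurse so indirect subclasses (e.g. of NumericAggregationFunction) are included.
    subs = set(cls.__subclasses__())
    for sub in cls.__subclasses__():
        subs |= _all_subclasses(sub)
    return subs


def find_aggregation(agg_id: UUID) -> Optional[Type[AggregationFunction]]:
    for cls in _all_subclasses(AggregationFunction):
        try:
            if cls.id() == agg_id:
                return cls
        except NotImplementedError:
            continue  # abstract intermediates don't implement id()
    return None


# find_aggregation(UUID("00000000-0000-0000-0000-00000000000c")) should resolve
# to CategoricalCountAggregationFunction per the table above.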
arthur_common/aggregations/functions/__init__.py
@@ -0,0 +1,25 @@
+ import importlib.util
+ import inspect
+ import os
+
+ package_dir = os.path.dirname(__file__)
+
+ # Peter 05/08/2024: This is some code I swiped from stackoverflow that iterated through the package directory here looking at .py files
+ # It reads each file and imports the classes to add them to the "globals" which we can think of as importing into this namespace
+ # By doing that, everything is exported and ready to be read as members of this `functions` package.
+ # TLDR: this does what you would think `from . import *` does
+ # Benefit here is any file with any class is added to the "exports", so nothing needs to be done after dropping a file in here
+ for filename in os.listdir(package_dir):
+     if filename.endswith(".py") and filename != "__init__.py":
+         module_name = filename[:-3] # Remove the .py extension to get the module name
+         module_path = os.path.join(package_dir, filename)
+
+         spec = importlib.util.spec_from_file_location(module_name, module_path)
+         if not spec:
+             continue
+         module = importlib.util.module_from_spec(spec)
+         if spec.loader:
+             spec.loader.exec_module(module)
+         for name, value in module.__dict__.items():
+             if inspect.isclass(value) and not name.startswith("_"):
+                 globals()[name] = value
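
Because of this loader, a new aggregation file dropped into functions/ is importable with no extra wiring. A quick hypothetical usage, relying on the categorical_count.py module shown below:

from arthur_common.aggregations import functions

# CategoricalCountAggregationFunction is defined in categorical_count.py but is
# reachable directly off the functions package thanks to the dynamic loader.
agg = functions.CategoricalCountAggregationFunction()
print(agg.display_name())  # "Category Count"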
arthur_common/aggregations/functions/categorical_count.py
@@ -0,0 +1,89 @@
+ from typing import Annotated
+ from uuid import UUID
+
+ from arthur_common.aggregations.aggregator import NumericAggregationFunction
+ from arthur_common.models.metrics import DatasetReference, NumericMetric
+ from arthur_common.models.schema_definitions import (
+     DType,
+     MetricColumnParameterAnnotation,
+     MetricDatasetParameterAnnotation,
+     ScalarType,
+     ScopeSchemaTag,
+ )
+ from arthur_common.tools.duckdb_data_loader import escape_identifier, escape_str_literal
+ from duckdb import DuckDBPyConnection
+
+
+ class CategoricalCountAggregationFunction(NumericAggregationFunction):
+     METRIC_NAME = "categorical_count"
+
+     @staticmethod
+     def id() -> UUID:
+         return UUID("00000000-0000-0000-0000-00000000000c")
+
+     @staticmethod
+     def display_name() -> str:
+         return "Category Count"
+
+     @staticmethod
+     def description() -> str:
+         return "Metric that counts the number of discrete values of each category in a string column. Creates a separate dimension for each category and the values are the count of occurrences of that category in the time window."
+
+     def aggregate(
+         self,
+         ddb_conn: DuckDBPyConnection,
+         dataset: Annotated[
+             DatasetReference,
+             MetricDatasetParameterAnnotation(
+                 friendly_name="Dataset",
+                 description="The dataset containing some categorical data.",
+             ),
+         ],
+         timestamp_col: Annotated[
+             str,
+             MetricColumnParameterAnnotation(
+                 source_dataset_parameter_key="dataset",
+                 allowed_column_types=[
+                     ScalarType(dtype=DType.TIMESTAMP),
+                 ],
+                 tag_hints=[ScopeSchemaTag.PRIMARY_TIMESTAMP],
+                 friendly_name="Timestamp Column",
+                 description="A column containing timestamp values to bucket by.",
+             ),
+         ],
+         categorical_col: Annotated[
+             str,
+             MetricColumnParameterAnnotation(
+                 source_dataset_parameter_key="dataset",
+                 allowed_column_types=[
+                     ScalarType(dtype=DType.STRING),
+                     ScalarType(dtype=DType.INT),
+                 ],
+                 tag_hints=[ScopeSchemaTag.CATEGORICAL],
+                 friendly_name="Categorical Column",
+                 description="A column containing categorical values to count.",
+             ),
+         ],
+     ) -> list[NumericMetric]:
+         timestamp_col_escaped = escape_identifier(timestamp_col)
+         categorical_col_escaped = escape_identifier(categorical_col)
+         categorical_col_name_escaped = escape_str_literal(categorical_col)
+         count_query = f" \
+             select time_bucket(INTERVAL '5 minutes', {timestamp_col_escaped}) as ts, \
+             count(*) as count, \
+             {categorical_col_escaped} as category, \
+             {categorical_col_name_escaped} as column_name \
+             from {dataset.dataset_table_name} \
+             where ts is not null \
+             group by ts, category \
+         "
+         results = ddb_conn.sql(count_query).df()
+
+         series = self.group_query_results_to_numeric_metrics(
+             results,
+             "count",
+             ["column_name", "category"],
+             timestamp_col="ts",
+         )
+         metric = self.series_to_metric(self.METRIC_NAME, series)
+         return [metric]
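
The query in this aggregation buckets rows into 5-minute windows with DuckDB's time_bucket function and counts rows per (window, category) pair; the category value and the column name (injected as a string literal) become the metric's dimensions. Below is a standalone sketch of the same query pattern run against a throwaway in-memory table; the table and column names are illustrative, not taken from the package.

# Minimal, self-contained illustration of the query pattern used above.
import duckdb
import pandas as pd

conn = duckdb.connect()
conn.register(
    "inferences",
    pd.DataFrame(
        {
            "event_ts": pd.to_datetime(["2024-05-08 10:01", "2024-05-08 10:03", "2024-05-08 10:07"]),
            "label": ["cat", "dog", "cat"],
        },
    ),
)

df = conn.sql(
    """
    select time_bucket(INTERVAL '5 minutes', event_ts) as ts,
           count(*) as count,
           label as category,
           'label' as column_name
    from inferences
    where ts is not null
    group by ts, category
    """
).df()
print(df)  # one row per (5-minute bucket, category) with its count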