arthur-common 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of arthur-common might be problematic. Click here for more details.
- arthur_common/__init__.py +0 -0
- arthur_common/__version__.py +1 -0
- arthur_common/aggregations/__init__.py +2 -0
- arthur_common/aggregations/aggregator.py +214 -0
- arthur_common/aggregations/functions/README.md +26 -0
- arthur_common/aggregations/functions/__init__.py +25 -0
- arthur_common/aggregations/functions/categorical_count.py +89 -0
- arthur_common/aggregations/functions/confusion_matrix.py +412 -0
- arthur_common/aggregations/functions/inference_count.py +69 -0
- arthur_common/aggregations/functions/inference_count_by_class.py +206 -0
- arthur_common/aggregations/functions/inference_null_count.py +82 -0
- arthur_common/aggregations/functions/mean_absolute_error.py +110 -0
- arthur_common/aggregations/functions/mean_squared_error.py +110 -0
- arthur_common/aggregations/functions/multiclass_confusion_matrix.py +205 -0
- arthur_common/aggregations/functions/multiclass_inference_count_by_class.py +90 -0
- arthur_common/aggregations/functions/numeric_stats.py +90 -0
- arthur_common/aggregations/functions/numeric_sum.py +87 -0
- arthur_common/aggregations/functions/py.typed +0 -0
- arthur_common/aggregations/functions/shield_aggregations.py +752 -0
- arthur_common/aggregations/py.typed +0 -0
- arthur_common/models/__init__.py +0 -0
- arthur_common/models/connectors.py +41 -0
- arthur_common/models/datasets.py +22 -0
- arthur_common/models/metrics.py +227 -0
- arthur_common/models/py.typed +0 -0
- arthur_common/models/schema_definitions.py +420 -0
- arthur_common/models/shield.py +504 -0
- arthur_common/models/task_job_specs.py +78 -0
- arthur_common/py.typed +0 -0
- arthur_common/tools/__init__.py +0 -0
- arthur_common/tools/aggregation_analyzer.py +243 -0
- arthur_common/tools/aggregation_loader.py +59 -0
- arthur_common/tools/duckdb_data_loader.py +329 -0
- arthur_common/tools/functions.py +46 -0
- arthur_common/tools/py.typed +0 -0
- arthur_common/tools/schema_inferer.py +104 -0
- arthur_common/tools/time_utils.py +33 -0
- arthur_common-1.0.1.dist-info/METADATA +74 -0
- arthur_common-1.0.1.dist-info/RECORD +40 -0
- arthur_common-1.0.1.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
from uuid import uuid4
|
|
3
|
+
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from arthur_common.models.schema_definitions import (
|
|
6
|
+
DatasetColumn,
|
|
7
|
+
DatasetListType,
|
|
8
|
+
DatasetObjectType,
|
|
9
|
+
DatasetScalarType,
|
|
10
|
+
DatasetSchema,
|
|
11
|
+
DType,
|
|
12
|
+
ScopeSchemaTag,
|
|
13
|
+
)
|
|
14
|
+
from arthur_common.tools.duckdb_data_loader import DuckDBOperator, escape_identifier
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class SchemaInferer:
|
|
18
|
+
def __init__(self, data: list[dict[str, Any]] | pd.DataFrame):
|
|
19
|
+
self.conn = DuckDBOperator.load_data_to_duckdb(
|
|
20
|
+
data,
|
|
21
|
+
preprocess_schema=True,
|
|
22
|
+
table_name="root",
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
def infer_schema(self) -> DatasetSchema:
|
|
26
|
+
columns = self._infer_schema()
|
|
27
|
+
schema = DatasetSchema(columns=[], alias_mask={})
|
|
28
|
+
for key, definition in columns.object.items():
|
|
29
|
+
schema.columns.append(
|
|
30
|
+
DatasetColumn(id=uuid4(), source_name=key, definition=definition),
|
|
31
|
+
)
|
|
32
|
+
# Close connection to destroy all temp tables and free up memory
|
|
33
|
+
self.conn.close()
|
|
34
|
+
return schema
|
|
35
|
+
|
|
36
|
+
def _infer_nested_schema(self, col_name: str, table: str) -> DatasetObjectType:
|
|
37
|
+
escaped_col = escape_identifier(col_name)
|
|
38
|
+
self.conn.sql(
|
|
39
|
+
f"CREATE OR REPLACE TEMP TABLE {escaped_col} AS SELECT UNNEST({escaped_col}) as {escaped_col} FROM {table}",
|
|
40
|
+
)
|
|
41
|
+
return self._infer_schema(escaped_col)
|
|
42
|
+
|
|
43
|
+
def _infer_schema(self, table: str = "root") -> DatasetObjectType:
|
|
44
|
+
ddb_schema: list[tuple[Any, Any, Any]] = self.conn.sql(
|
|
45
|
+
f"DESCRIBE {table}",
|
|
46
|
+
).fetchall()
|
|
47
|
+
|
|
48
|
+
obj = DatasetObjectType(id=uuid4(), object={}, nullable=False)
|
|
49
|
+
timestamp_cols = []
|
|
50
|
+
|
|
51
|
+
for column in ddb_schema:
|
|
52
|
+
col_type, col_name, col_nullable = (
|
|
53
|
+
str(column[1]),
|
|
54
|
+
str(column[0]),
|
|
55
|
+
str(column[2]) == "YES",
|
|
56
|
+
)
|
|
57
|
+
col_is_list = col_type[-2:] == "[]"
|
|
58
|
+
col_type = col_type.replace("[]", "")
|
|
59
|
+
|
|
60
|
+
# Handle structs / lists recursively
|
|
61
|
+
if col_is_list:
|
|
62
|
+
schema = self._infer_nested_schema(col_name, table)
|
|
63
|
+
obj.object[col_name] = DatasetListType(
|
|
64
|
+
id=uuid4(),
|
|
65
|
+
items=schema[col_name],
|
|
66
|
+
nullable=col_nullable,
|
|
67
|
+
)
|
|
68
|
+
elif "STRUCT" in col_type:
|
|
69
|
+
schema = self._infer_nested_schema(col_name, table)
|
|
70
|
+
schema.nullable = col_nullable
|
|
71
|
+
obj.object[col_name] = schema
|
|
72
|
+
else:
|
|
73
|
+
scalar_schema = DatasetScalarType(id=uuid4(), dtype=DType.UNDEFINED)
|
|
74
|
+
match col_type:
|
|
75
|
+
case "UUID":
|
|
76
|
+
scalar_schema.dtype = DType.UUID
|
|
77
|
+
case "VARCHAR":
|
|
78
|
+
scalar_schema.dtype = DType.STRING
|
|
79
|
+
case "BIGINT" | "INTEGER":
|
|
80
|
+
scalar_schema.dtype = DType.INT
|
|
81
|
+
case "DOUBLE" | "FLOAT":
|
|
82
|
+
scalar_schema.dtype = DType.FLOAT
|
|
83
|
+
case "BOOLEAN":
|
|
84
|
+
scalar_schema.dtype = DType.BOOL
|
|
85
|
+
case "JSON":
|
|
86
|
+
# keep duckDB's json type in case the customer's data doesn't fit in well-structured types
|
|
87
|
+
# an example is a JSON list like ["str", 0.234], because arrays can only have a single type
|
|
88
|
+
# in duckDB
|
|
89
|
+
scalar_schema.dtype = DType.JSON
|
|
90
|
+
case "DATE":
|
|
91
|
+
scalar_schema.dtype = DType.DATE
|
|
92
|
+
case "TIMESTAMP_NS" | "TIMESTAMP WITH TIME ZONE" | "TIMESTAMP":
|
|
93
|
+
scalar_schema.dtype = DType.TIMESTAMP
|
|
94
|
+
timestamp_cols.append(scalar_schema)
|
|
95
|
+
case _:
|
|
96
|
+
raise NotImplementedError(f"Type {col_type} not mappable.")
|
|
97
|
+
obj.object[col_name] = scalar_schema
|
|
98
|
+
|
|
99
|
+
# auto assign primary timestamp tag if there's only one timestamp column
|
|
100
|
+
if len(timestamp_cols) == 1:
|
|
101
|
+
timestamp_col = timestamp_cols[0]
|
|
102
|
+
timestamp_col.tag_hints.append(ScopeSchemaTag.PRIMARY_TIMESTAMP)
|
|
103
|
+
|
|
104
|
+
return obj
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
from datetime import datetime, timedelta
|
|
2
|
+
|
|
3
|
+
STRFTIME_CODES_TO_TIMEDELTA = {
|
|
4
|
+
"%H": timedelta(hours=1), # Hour (24-hour clock) as zero-padded decimal number
|
|
5
|
+
"%-H": timedelta(hours=1), # Hour (24-hour clock) as decimal number
|
|
6
|
+
"%I": timedelta(hours=1), # Hour (12-hour clock)
|
|
7
|
+
"%d": timedelta(days=1), # Day of month as zero-padded decimal
|
|
8
|
+
"%-d": timedelta(days=1), # Day of month as decimal number
|
|
9
|
+
"%j": timedelta(
|
|
10
|
+
days=1,
|
|
11
|
+
), # Day of year as zero-padded decimal number (001, ..., 366)
|
|
12
|
+
"%-j": timedelta(days=1), # Day of year as decimal number (1, ..., 366)
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def find_smallest_timedelta(format_string: str) -> timedelta | None:
|
|
17
|
+
"""Scans a linux strftime format string to find the smallest time unit present.
|
|
18
|
+
Only considers units larger than or equal to an hour or less than or equal to a day.
|
|
19
|
+
Example: '%Y-%m-%d_%H' -> timedelta(hours=1), '%Y-%m-%d' -> timedelta(days=1)
|
|
20
|
+
"""
|
|
21
|
+
smallest_delta = None
|
|
22
|
+
for code, delta in STRFTIME_CODES_TO_TIMEDELTA.items():
|
|
23
|
+
if code in format_string:
|
|
24
|
+
if smallest_delta is None or delta < smallest_delta:
|
|
25
|
+
smallest_delta = delta
|
|
26
|
+
return smallest_delta
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def check_datetime_tz_aware(dt: datetime) -> bool:
    """Return True if dt is timezone-aware, False if naive.

    A datetime is treated as aware only when it carries a ``tzinfo`` and that
    ``tzinfo`` reports a non-None UTC offset for it.

    Args:
        dt: The datetime to inspect.

    Returns:
        ``True`` for an aware datetime, ``False`` for a naive one.
    """
    return dt.tzinfo is not None and dt.tzinfo.utcoffset(dt) is not None
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: arthur-common
|
|
3
|
+
Version: 1.0.1
|
|
4
|
+
Summary: Utility code common to Arthur platform components.
|
|
5
|
+
License: MIT
|
|
6
|
+
Author: Arthur
|
|
7
|
+
Author-email: engineering@arthur.ai
|
|
8
|
+
Requires-Python: >=3.12,<4.0
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
13
|
+
Requires-Dist: datasketches (>=5.1.0)
|
|
14
|
+
Requires-Dist: duckdb (>=1.1.3)
|
|
15
|
+
Requires-Dist: fastapi (>=0.115.8)
|
|
16
|
+
Requires-Dist: fsspec (>=2024.10.0)
|
|
17
|
+
Requires-Dist: pandas (>=2.2.2)
|
|
18
|
+
Requires-Dist: pydantic (>=2)
|
|
19
|
+
Requires-Dist: tokencost (==0.1.21)
|
|
20
|
+
Requires-Dist: types-python-dateutil (>=2.9.0)
|
|
21
|
+
Requires-Dist: types-requests (>=2.32.0.20241016)
|
|
22
|
+
Requires-Dist: typing-extensions (>=4.7.1)
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
|
|
25
|
+
# Arthur Common
|
|
26
|
+
|
|
27
|
+
Arthur Common is a library of operations shared across Arthur platform services.
|
|
28
|
+
|
|
29
|
+
## Installation
|
|
30
|
+
|
|
31
|
+
To install the package, use [Poetry](https://python-poetry.org/):
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
poetry add arthur-common
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
or pip
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
pip install arthur-common
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## Requirements
|
|
44
|
+
|
|
45
|
+
- Python 3.12 or newer (below 4.0)
|
|
46
|
+
|
|
47
|
+
## Development
|
|
48
|
+
|
|
49
|
+
To set up the development environment, ensure you have [Poetry](https://python-poetry.org/) installed, then run:
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
poetry env use 3.12
|
|
53
|
+
poetry install
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### Running Tests
|
|
57
|
+
|
|
58
|
+
This project uses [pytest](https://pytest.org/) for testing. To run the tests, execute:
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
poetry run pytest
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## License
|
|
65
|
+
|
|
66
|
+
This project is licensed under the MIT License.
|
|
67
|
+
|
|
68
|
+
## Authors
|
|
69
|
+
|
|
70
|
+
- Arthur <engineering@arthur.ai>
|
|
71
|
+
|
|
72
|
+
# ALEX
|
|
73
|
+
- Change for testing
|
|
74
|
+
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
arthur_common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
arthur_common/__version__.py,sha256=d4QHYmS_30j0hPN8NmNPnQ_Z0TphDRbu4MtQj9cT9e8,22
|
|
3
|
+
arthur_common/aggregations/__init__.py,sha256=vISWyciQAtksa71OKeHNP-QyFGd1NzBKq_LBsG0QSG8,67
|
|
4
|
+
arthur_common/aggregations/aggregator.py,sha256=cw6mr1Dl0sH7Rn_EggCRvduC7GvC8XTDP2L84FODKY0,7445
|
|
5
|
+
arthur_common/aggregations/functions/README.md,sha256=MkZoTAJ94My96R5Z8GAxud7S6vyR0vgVi9gqdt9a4XY,5460
|
|
6
|
+
arthur_common/aggregations/functions/__init__.py,sha256=HqC3UNRURX7ZQHgamTrQvfA8u_FiZGZ4I4eQW7Ooe5o,1299
|
|
7
|
+
arthur_common/aggregations/functions/categorical_count.py,sha256=_hk1TYkrnCAe-0Hflt4W3Nvrp-c-vWHSgVxaqcFNWAI,3368
|
|
8
|
+
arthur_common/aggregations/functions/confusion_matrix.py,sha256=xqJkwcFZck_F1tsvMeJdmU-EOSOfWJS_N7-PXYJiSmo,16351
|
|
9
|
+
arthur_common/aggregations/functions/inference_count.py,sha256=BGWa262UxpkqY5Y_Pm22URSRvwXzdRbGUcd4DSVJbN0,2312
|
|
10
|
+
arthur_common/aggregations/functions/inference_count_by_class.py,sha256=BwZw8wCmFHRoau5oDgcDAkcwNhXjktFBCG1Sfsj6iGY,7830
|
|
11
|
+
arthur_common/aggregations/functions/inference_null_count.py,sha256=wZDz89_23bGjB_Tb3ob_69_VhugFxzbSRkuENfyJ-ic,2867
|
|
12
|
+
arthur_common/aggregations/functions/mean_absolute_error.py,sha256=rDEQlKEDyy_zRewtgthB_BK2oKrW6ymWmdtkxUFzfW8,4233
|
|
13
|
+
arthur_common/aggregations/functions/mean_squared_error.py,sha256=TMUyPPPEHuG9QFmD2gZxmxy-f_CDU4ds_2R5-DFr42c,4251
|
|
14
|
+
arthur_common/aggregations/functions/multiclass_confusion_matrix.py,sha256=QM27MhZMvF5Q5yxENSsHx_MMV_5h6eiRPP554jsdBqY,8204
|
|
15
|
+
arthur_common/aggregations/functions/multiclass_inference_count_by_class.py,sha256=dmf4OI4KcFVvE61S2TnPZvBlNOhlFNFKrV7x2_ys5ZU,3201
|
|
16
|
+
arthur_common/aggregations/functions/numeric_stats.py,sha256=eN1q10KquF8GeBOOktBuatV9Zxvd_--ZzklqzDpT9qw,3124
|
|
17
|
+
arthur_common/aggregations/functions/numeric_sum.py,sha256=DO7jbmUEnyyJliEtYLHb58SDCVBTsFKvywFnADXFpw8,3036
|
|
18
|
+
arthur_common/aggregations/functions/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
19
|
+
arthur_common/aggregations/functions/shield_aggregations.py,sha256=nkMlj9V7NIKeRP46jpsrlfB741RHk2CgwimD7BYp9To,31540
|
|
20
|
+
arthur_common/aggregations/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
21
|
+
arthur_common/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
22
|
+
arthur_common/models/connectors.py,sha256=Ng2hwXQumfLE9SYepYZs2L5Y6aAIXAQfkJIa0rKCvWQ,1565
|
|
23
|
+
arthur_common/models/datasets.py,sha256=giG_8mv_3ilBf7cIvRV0_TDCDdb4qxRbYZvl7hRb6l8,491
|
|
24
|
+
arthur_common/models/metrics.py,sha256=gFEGuM4kuac2CqpPN69BM2cWG-SUPE4-1jZVHv_M3M0,8380
|
|
25
|
+
arthur_common/models/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
26
|
+
arthur_common/models/schema_definitions.py,sha256=MiM-oynqh71zZvVyM-XF5nN55dgOeqEajVyL8ZE3Wuo,14571
|
|
27
|
+
arthur_common/models/shield.py,sha256=1ZblfULKCf5BEvYURO5WScyfmijGwjAmcj0XADlF-XY,19110
|
|
28
|
+
arthur_common/models/task_job_specs.py,sha256=GLJ7qmrb5eXnl5PiV27nnx_yG4S4sc4NDJ8-6xmNDLM,2796
|
|
29
|
+
arthur_common/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
30
|
+
arthur_common/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
31
|
+
arthur_common/tools/aggregation_analyzer.py,sha256=e4F8vsYDRRTzUmNVIl1vFrn9_nEeYDYcP3ygk7i1964,9534
|
|
32
|
+
arthur_common/tools/aggregation_loader.py,sha256=3CF46bNi-GdJBNOXkjYfCQ1Aung8lf65L532sdWmR_s,2351
|
|
33
|
+
arthur_common/tools/duckdb_data_loader.py,sha256=XrdXRFkgiGtYulOGsC4khVf12sNiSFx5hB5vD7vQzFE,11066
|
|
34
|
+
arthur_common/tools/functions.py,sha256=FWL4eWO5-vLp86WudT-MGUKvf2B8f02IdoXQFKd6d8k,1093
|
|
35
|
+
arthur_common/tools/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
36
|
+
arthur_common/tools/schema_inferer.py,sha256=PkAOHZRk_rZ1OZSigYrfzH-jERb9B_Gu7pOMl9WJQA8,4202
|
|
37
|
+
arthur_common/tools/time_utils.py,sha256=4gfiu9NXfvPZltiVNLSIQGylX6h2W0viNi9Kv4bKyfw,1410
|
|
38
|
+
arthur_common-1.0.1.dist-info/METADATA,sha256=WrMAk42ZfTrZhAz12r127fJhuglMx-zHnDXPMp7tzsk,1596
|
|
39
|
+
arthur_common-1.0.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
|
40
|
+
arthur_common-1.0.1.dist-info/RECORD,,
|