arthur_common-1.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of arthur-common might be problematic.

Files changed (40)
  1. arthur_common/__init__.py +0 -0
  2. arthur_common/__version__.py +1 -0
  3. arthur_common/aggregations/__init__.py +2 -0
  4. arthur_common/aggregations/aggregator.py +214 -0
  5. arthur_common/aggregations/functions/README.md +26 -0
  6. arthur_common/aggregations/functions/__init__.py +25 -0
  7. arthur_common/aggregations/functions/categorical_count.py +89 -0
  8. arthur_common/aggregations/functions/confusion_matrix.py +412 -0
  9. arthur_common/aggregations/functions/inference_count.py +69 -0
  10. arthur_common/aggregations/functions/inference_count_by_class.py +206 -0
  11. arthur_common/aggregations/functions/inference_null_count.py +82 -0
  12. arthur_common/aggregations/functions/mean_absolute_error.py +110 -0
  13. arthur_common/aggregations/functions/mean_squared_error.py +110 -0
  14. arthur_common/aggregations/functions/multiclass_confusion_matrix.py +205 -0
  15. arthur_common/aggregations/functions/multiclass_inference_count_by_class.py +90 -0
  16. arthur_common/aggregations/functions/numeric_stats.py +90 -0
  17. arthur_common/aggregations/functions/numeric_sum.py +87 -0
  18. arthur_common/aggregations/functions/py.typed +0 -0
  19. arthur_common/aggregations/functions/shield_aggregations.py +752 -0
  20. arthur_common/aggregations/py.typed +0 -0
  21. arthur_common/models/__init__.py +0 -0
  22. arthur_common/models/connectors.py +41 -0
  23. arthur_common/models/datasets.py +22 -0
  24. arthur_common/models/metrics.py +227 -0
  25. arthur_common/models/py.typed +0 -0
  26. arthur_common/models/schema_definitions.py +420 -0
  27. arthur_common/models/shield.py +504 -0
  28. arthur_common/models/task_job_specs.py +78 -0
  29. arthur_common/py.typed +0 -0
  30. arthur_common/tools/__init__.py +0 -0
  31. arthur_common/tools/aggregation_analyzer.py +243 -0
  32. arthur_common/tools/aggregation_loader.py +59 -0
  33. arthur_common/tools/duckdb_data_loader.py +329 -0
  34. arthur_common/tools/functions.py +46 -0
  35. arthur_common/tools/py.typed +0 -0
  36. arthur_common/tools/schema_inferer.py +104 -0
  37. arthur_common/tools/time_utils.py +33 -0
  38. arthur_common-1.0.1.dist-info/METADATA +74 -0
  39. arthur_common-1.0.1.dist-info/RECORD +40 -0
  40. arthur_common-1.0.1.dist-info/WHEEL +4 -0
arthur_common/tools/schema_inferer.py
@@ -0,0 +1,104 @@
+ from typing import Any
+ from uuid import uuid4
+
+ import pandas as pd
+ from arthur_common.models.schema_definitions import (
+     DatasetColumn,
+     DatasetListType,
+     DatasetObjectType,
+     DatasetScalarType,
+     DatasetSchema,
+     DType,
+     ScopeSchemaTag,
+ )
+ from arthur_common.tools.duckdb_data_loader import DuckDBOperator, escape_identifier
+
+
+ class SchemaInferer:
+     def __init__(self, data: list[dict[str, Any]] | pd.DataFrame):
+         self.conn = DuckDBOperator.load_data_to_duckdb(
+             data,
+             preprocess_schema=True,
+             table_name="root",
+         )
+
+     def infer_schema(self) -> DatasetSchema:
+         columns = self._infer_schema()
+         schema = DatasetSchema(columns=[], alias_mask={})
+         for key, definition in columns.object.items():
+             schema.columns.append(
+                 DatasetColumn(id=uuid4(), source_name=key, definition=definition),
+             )
+         # Close connection to destroy all temp tables and free up memory
+         self.conn.close()
+         return schema
+
+     def _infer_nested_schema(self, col_name: str, table: str) -> DatasetObjectType:
+         escaped_col = escape_identifier(col_name)
+         self.conn.sql(
+             f"CREATE OR REPLACE TEMP TABLE {escaped_col} AS SELECT UNNEST({escaped_col}) as {escaped_col} FROM {table}",
+         )
+         return self._infer_schema(escaped_col)
+
+     def _infer_schema(self, table: str = "root") -> DatasetObjectType:
+         ddb_schema: list[tuple[Any, Any, Any]] = self.conn.sql(
+             f"DESCRIBE {table}",
+         ).fetchall()
+
+         obj = DatasetObjectType(id=uuid4(), object={}, nullable=False)
+         timestamp_cols = []
+
+         for column in ddb_schema:
+             col_type, col_name, col_nullable = (
+                 str(column[1]),
+                 str(column[0]),
+                 str(column[2]) == "YES",
+             )
+             col_is_list = col_type[-2:] == "[]"
+             col_type = col_type.replace("[]", "")
+
+             # Handle structs / lists recursively
+             if col_is_list:
+                 schema = self._infer_nested_schema(col_name, table)
+                 obj.object[col_name] = DatasetListType(
+                     id=uuid4(),
+                     items=schema[col_name],
+                     nullable=col_nullable,
+                 )
+             elif "STRUCT" in col_type:
+                 schema = self._infer_nested_schema(col_name, table)
+                 schema.nullable = col_nullable
+                 obj.object[col_name] = schema
+             else:
+                 scalar_schema = DatasetScalarType(id=uuid4(), dtype=DType.UNDEFINED)
+                 match col_type:
+                     case "UUID":
+                         scalar_schema.dtype = DType.UUID
+                     case "VARCHAR":
+                         scalar_schema.dtype = DType.STRING
+                     case "BIGINT" | "INTEGER":
+                         scalar_schema.dtype = DType.INT
+                     case "DOUBLE" | "FLOAT":
+                         scalar_schema.dtype = DType.FLOAT
+                     case "BOOLEAN":
+                         scalar_schema.dtype = DType.BOOL
+                     case "JSON":
+                         # keep duckDB's json type in case the customer's data doesn't fit in well-structured types
+                         # an example is a JSON list like ["str", 0.234], because arrays can only have a single type
+                         # in duckDB
+                         scalar_schema.dtype = DType.JSON
+                     case "DATE":
+                         scalar_schema.dtype = DType.DATE
+                     case "TIMESTAMP_NS" | "TIMESTAMP WITH TIME ZONE" | "TIMESTAMP":
+                         scalar_schema.dtype = DType.TIMESTAMP
+                         timestamp_cols.append(scalar_schema)
+                     case _:
+                         raise NotImplementedError(f"Type {col_type} not mappable.")
+                 obj.object[col_name] = scalar_schema
+
+         # auto assign primary timestamp tag if there's only one timestamp column
+         if len(timestamp_cols) == 1:
+             timestamp_col = timestamp_cols[0]
+             timestamp_col.tag_hints.append(ScopeSchemaTag.PRIMARY_TIMESTAMP)
+
+         return obj
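
For orientation (not part of the package diff): a minimal, hypothetical usage sketch of the `SchemaInferer` added above, assuming `arthur-common` is installed and that `DuckDBOperator.load_data_to_duckdb` accepts a plain list of dicts as its constructor signature suggests. The record field names below are made up.

```python
# Hypothetical usage sketch; field names are illustrative, not from the package.
from arthur_common.tools.schema_inferer import SchemaInferer

records = [
    {"id": 1, "score": 0.91, "label": "spam"},
    {"id": 2, "score": 0.12, "label": "ham"},
]

# SchemaInferer loads the records into DuckDB, DESCRIBEs the table, and maps
# DuckDB column types onto the package's DType enum.
schema = SchemaInferer(records).infer_schema()
for column in schema.columns:
    print(column.source_name, column.definition)
```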
arthur_common/tools/time_utils.py
@@ -0,0 +1,33 @@
+ from datetime import datetime, timedelta
+
+ STRFTIME_CODES_TO_TIMEDELTA = {
+     "%H": timedelta(hours=1),  # Hour (24-hour clock) as zero-padded decimal number
+     "%-H": timedelta(hours=1),  # Hour (24-hour clock) as decimal number
+     "%I": timedelta(hours=1),  # Hour (12-hour clock)
+     "%d": timedelta(days=1),  # Day of month as zero-padded decimal
+     "%-d": timedelta(days=1),  # Day of month as decimal number
+     "%j": timedelta(
+         days=1,
+     ),  # Day of year as zero-padded decimal number (001, ..., 366)
+     "%-j": timedelta(days=1),  # Day of year as decimal number (1, ..., 366)
+ }
+
+
+ def find_smallest_timedelta(format_string: str) -> timedelta | None:
+     """Scans a linux strftime format string to find the smallest time unit present.
+     Only considers units larger than or equal to an hour or less than or equal to a day.
+     Example: '%Y-%m-%d_%H' -> timedelta(hours=1), '%Y-%m-%d' -> timedelta(days=1)
+     """
+     smallest_delta = None
+     for code, delta in STRFTIME_CODES_TO_TIMEDELTA.items():
+         if code in format_string:
+             if smallest_delta is None or delta < smallest_delta:
+                 smallest_delta = delta
+     return smallest_delta
+
+
+ def check_datetime_tz_aware(dt: datetime) -> bool:
+     """Returns true if dt is timezone-aware, false if naive."""
+     return (
+         True if dt.tzinfo is not None and dt.tzinfo.utcoffset(dt) is not None else False
+     )
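
A short, hedged example of how the two helpers above behave, assuming `arthur-common` is installed:

```python
# Usage sketch for the time_utils helpers shown in the diff above.
from datetime import datetime, timezone

from arthur_common.tools.time_utils import (
    check_datetime_tz_aware,
    find_smallest_timedelta,
)

# '%H' is the finest-grained supported code present -> one hour.
print(find_smallest_timedelta("%Y-%m-%d_%H"))  # 1:00:00
# No hour- or day-level codes present -> None.
print(find_smallest_timedelta("%Y-%m"))        # None

print(check_datetime_tz_aware(datetime(2024, 1, 1)))                       # False (naive)
print(check_datetime_tz_aware(datetime(2024, 1, 1, tzinfo=timezone.utc)))  # True (aware)
```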
arthur_common-1.0.1.dist-info/METADATA
@@ -0,0 +1,74 @@
+ Metadata-Version: 2.3
+ Name: arthur-common
+ Version: 1.0.1
+ Summary: Utility code common to Arthur platform components.
+ License: MIT
+ Author: Arthur
+ Author-email: engineering@arthur.ai
+ Requires-Python: >=3.12,<4.0
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Requires-Dist: datasketches (>=5.1.0)
+ Requires-Dist: duckdb (>=1.1.3)
+ Requires-Dist: fastapi (>=0.115.8)
+ Requires-Dist: fsspec (>=2024.10.0)
+ Requires-Dist: pandas (>=2.2.2)
+ Requires-Dist: pydantic (>=2)
+ Requires-Dist: tokencost (==0.1.21)
+ Requires-Dist: types-python-dateutil (>=2.9.0)
+ Requires-Dist: types-requests (>=2.32.0.20241016)
+ Requires-Dist: typing-extensions (>=4.7.1)
+ Description-Content-Type: text/markdown
+
+ # Arthur Common
+
+ Arthur Common is a library that contains common operations between Arthur platform services.
+
+ ## Installation
+
+ To install the package, use [Poetry](https://python-poetry.org/):
+
+ ```bash
+ poetry add arthur-common
+ ```
+
+ or pip
+
+ ```bash
+ pip install arthur-common
+ ```
+
+ ## Requirements
+
+ - Python 3.12
+
+ ## Development
+
+ To set up the development environment, ensure you have [Poetry](https://python-poetry.org/) installed, then run:
+
+ ```bash
+ poetry env use 3.12
+ poetry install
+ ```
+
+ ### Running Tests
+
+ This project uses [pytest](https://pytest.org/) for testing. To run the tests, execute:
+
+ ```bash
+ poetry run pytest
+ ```
+
+ ## License
+
+ This project is licensed under the MIT License.
+
+ ## Authors
+
+ - Arthur <engineering@arthur.ai>
+
+ # ALEX
+ - Change for testing
+
arthur_common-1.0.1.dist-info/RECORD
@@ -0,0 +1,40 @@
+ arthur_common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ arthur_common/__version__.py,sha256=d4QHYmS_30j0hPN8NmNPnQ_Z0TphDRbu4MtQj9cT9e8,22
+ arthur_common/aggregations/__init__.py,sha256=vISWyciQAtksa71OKeHNP-QyFGd1NzBKq_LBsG0QSG8,67
+ arthur_common/aggregations/aggregator.py,sha256=cw6mr1Dl0sH7Rn_EggCRvduC7GvC8XTDP2L84FODKY0,7445
+ arthur_common/aggregations/functions/README.md,sha256=MkZoTAJ94My96R5Z8GAxud7S6vyR0vgVi9gqdt9a4XY,5460
+ arthur_common/aggregations/functions/__init__.py,sha256=HqC3UNRURX7ZQHgamTrQvfA8u_FiZGZ4I4eQW7Ooe5o,1299
+ arthur_common/aggregations/functions/categorical_count.py,sha256=_hk1TYkrnCAe-0Hflt4W3Nvrp-c-vWHSgVxaqcFNWAI,3368
+ arthur_common/aggregations/functions/confusion_matrix.py,sha256=xqJkwcFZck_F1tsvMeJdmU-EOSOfWJS_N7-PXYJiSmo,16351
+ arthur_common/aggregations/functions/inference_count.py,sha256=BGWa262UxpkqY5Y_Pm22URSRvwXzdRbGUcd4DSVJbN0,2312
+ arthur_common/aggregations/functions/inference_count_by_class.py,sha256=BwZw8wCmFHRoau5oDgcDAkcwNhXjktFBCG1Sfsj6iGY,7830
+ arthur_common/aggregations/functions/inference_null_count.py,sha256=wZDz89_23bGjB_Tb3ob_69_VhugFxzbSRkuENfyJ-ic,2867
+ arthur_common/aggregations/functions/mean_absolute_error.py,sha256=rDEQlKEDyy_zRewtgthB_BK2oKrW6ymWmdtkxUFzfW8,4233
+ arthur_common/aggregations/functions/mean_squared_error.py,sha256=TMUyPPPEHuG9QFmD2gZxmxy-f_CDU4ds_2R5-DFr42c,4251
+ arthur_common/aggregations/functions/multiclass_confusion_matrix.py,sha256=QM27MhZMvF5Q5yxENSsHx_MMV_5h6eiRPP554jsdBqY,8204
+ arthur_common/aggregations/functions/multiclass_inference_count_by_class.py,sha256=dmf4OI4KcFVvE61S2TnPZvBlNOhlFNFKrV7x2_ys5ZU,3201
+ arthur_common/aggregations/functions/numeric_stats.py,sha256=eN1q10KquF8GeBOOktBuatV9Zxvd_--ZzklqzDpT9qw,3124
+ arthur_common/aggregations/functions/numeric_sum.py,sha256=DO7jbmUEnyyJliEtYLHb58SDCVBTsFKvywFnADXFpw8,3036
+ arthur_common/aggregations/functions/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ arthur_common/aggregations/functions/shield_aggregations.py,sha256=nkMlj9V7NIKeRP46jpsrlfB741RHk2CgwimD7BYp9To,31540
+ arthur_common/aggregations/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ arthur_common/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ arthur_common/models/connectors.py,sha256=Ng2hwXQumfLE9SYepYZs2L5Y6aAIXAQfkJIa0rKCvWQ,1565
+ arthur_common/models/datasets.py,sha256=giG_8mv_3ilBf7cIvRV0_TDCDdb4qxRbYZvl7hRb6l8,491
+ arthur_common/models/metrics.py,sha256=gFEGuM4kuac2CqpPN69BM2cWG-SUPE4-1jZVHv_M3M0,8380
+ arthur_common/models/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ arthur_common/models/schema_definitions.py,sha256=MiM-oynqh71zZvVyM-XF5nN55dgOeqEajVyL8ZE3Wuo,14571
+ arthur_common/models/shield.py,sha256=1ZblfULKCf5BEvYURO5WScyfmijGwjAmcj0XADlF-XY,19110
+ arthur_common/models/task_job_specs.py,sha256=GLJ7qmrb5eXnl5PiV27nnx_yG4S4sc4NDJ8-6xmNDLM,2796
+ arthur_common/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ arthur_common/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ arthur_common/tools/aggregation_analyzer.py,sha256=e4F8vsYDRRTzUmNVIl1vFrn9_nEeYDYcP3ygk7i1964,9534
+ arthur_common/tools/aggregation_loader.py,sha256=3CF46bNi-GdJBNOXkjYfCQ1Aung8lf65L532sdWmR_s,2351
+ arthur_common/tools/duckdb_data_loader.py,sha256=XrdXRFkgiGtYulOGsC4khVf12sNiSFx5hB5vD7vQzFE,11066
+ arthur_common/tools/functions.py,sha256=FWL4eWO5-vLp86WudT-MGUKvf2B8f02IdoXQFKd6d8k,1093
+ arthur_common/tools/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ arthur_common/tools/schema_inferer.py,sha256=PkAOHZRk_rZ1OZSigYrfzH-jERb9B_Gu7pOMl9WJQA8,4202
+ arthur_common/tools/time_utils.py,sha256=4gfiu9NXfvPZltiVNLSIQGylX6h2W0viNi9Kv4bKyfw,1410
+ arthur_common-1.0.1.dist-info/METADATA,sha256=WrMAk42ZfTrZhAz12r127fJhuglMx-zHnDXPMp7tzsk,1596
+ arthur_common-1.0.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+ arthur_common-1.0.1.dist-info/RECORD,,
arthur_common-1.0.1.dist-info/WHEEL
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: poetry-core 2.1.3
+ Root-Is-Purelib: true
+ Tag: py3-none-any