arthur-common 2.1.58__py3-none-any.whl → 2.4.13__py3-none-any.whl
This diff shows the contents of publicly available package versions as they were released to their respective public registries. It is provided for informational purposes only and reflects the changes between the two package versions.
- arthur_common/aggregations/aggregator.py +73 -9
- arthur_common/aggregations/functions/agentic_aggregations.py +260 -85
- arthur_common/aggregations/functions/categorical_count.py +15 -15
- arthur_common/aggregations/functions/confusion_matrix.py +24 -26
- arthur_common/aggregations/functions/inference_count.py +5 -9
- arthur_common/aggregations/functions/inference_count_by_class.py +16 -27
- arthur_common/aggregations/functions/inference_null_count.py +10 -13
- arthur_common/aggregations/functions/mean_absolute_error.py +12 -18
- arthur_common/aggregations/functions/mean_squared_error.py +12 -18
- arthur_common/aggregations/functions/multiclass_confusion_matrix.py +13 -20
- arthur_common/aggregations/functions/multiclass_inference_count_by_class.py +1 -1
- arthur_common/aggregations/functions/numeric_stats.py +13 -15
- arthur_common/aggregations/functions/numeric_sum.py +12 -15
- arthur_common/aggregations/functions/shield_aggregations.py +457 -215
- arthur_common/models/common_schemas.py +214 -0
- arthur_common/models/connectors.py +10 -2
- arthur_common/models/constants.py +24 -0
- arthur_common/models/datasets.py +0 -9
- arthur_common/models/enums.py +177 -0
- arthur_common/models/metric_schemas.py +63 -0
- arthur_common/models/metrics.py +2 -9
- arthur_common/models/request_schemas.py +870 -0
- arthur_common/models/response_schemas.py +785 -0
- arthur_common/models/schema_definitions.py +6 -1
- arthur_common/models/task_job_specs.py +3 -12
- arthur_common/tools/duckdb_data_loader.py +34 -2
- arthur_common/tools/duckdb_utils.py +3 -6
- arthur_common/tools/schema_inferer.py +3 -6
- {arthur_common-2.1.58.dist-info → arthur_common-2.4.13.dist-info}/METADATA +12 -4
- arthur_common-2.4.13.dist-info/RECORD +49 -0
- arthur_common/models/shield.py +0 -642
- arthur_common-2.1.58.dist-info/RECORD +0 -44
- {arthur_common-2.1.58.dist-info → arthur_common-2.4.13.dist-info}/WHEEL +0 -0
arthur_common/models/schema_definitions.py

@@ -6,7 +6,7 @@ from uuid import UUID, uuid4
 
 from pydantic import BaseModel, ConfigDict, Field, computed_field, model_validator
 
-from arthur_common.models.
+from arthur_common.models.enums import ModelProblemType
 
 
 class ScopeSchemaTag(str, Enum):

@@ -433,6 +433,11 @@ def SHIELD_SCHEMA() -> DatasetSchema:
             source_name="conversation_id",
             definition=create_dataset_scalar_type(DType.STRING),
         ),
+        DatasetColumn(
+            id=uuid4(),
+            source_name="user_id",
+            definition=create_dataset_scalar_type(DType.STRING),
+        ),
         DatasetColumn(
             id=uuid4(),
             source_name="inference_prompt",
arthur_common/models/task_job_specs.py

@@ -1,23 +1,14 @@
-from enum import Enum
 from typing import Literal, Optional, Self
 from uuid import UUID
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, model_validator
 
-from arthur_common.models.
-
-    NewRuleRequest,
-    model_validator,
-)
+from arthur_common.models.enums import TaskType
+from arthur_common.models.request_schemas import NewMetricRequest, NewRuleRequest
 
 onboarding_id_desc = "An identifier to assign to the created model to make it easy to retrieve. Used by the UI during the GenAI model creation flow."
 
 
-class TaskType(str, Enum):
-    TRADITIONAL = "traditional"
-    AGENTIC = "agentic"
-
-
 class CreateModelTaskJobSpec(BaseModel):
     job_type: Literal["create_model_task"] = "create_model_task"
     connector_id: UUID = Field(
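For downstream consumers, the relocated definitions are importable from their new modules. A minimal sketch; the enum values are taken from the `TaskType` class removed above:

```python
# TaskType now lives in arthur_common.models.enums, and the request models in
# arthur_common.models.request_schemas, instead of task_job_specs.py.
from arthur_common.models.enums import TaskType
from arthur_common.models.request_schemas import NewMetricRequest, NewRuleRequest

assert TaskType.TRADITIONAL.value == "traditional"
assert TaskType.AGENTIC.value == "agentic"
```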
arthur_common/tools/duckdb_data_loader.py

@@ -1,4 +1,5 @@
 import json
+import re
 from typing import Any
 
 import duckdb

@@ -16,6 +17,8 @@ from arthur_common.models.schema_definitions import (
     DType,
 )
 
+MAX_JSON_OBJECT_SIZE = 1024 * 1024 * 1024 # 1GB
+
 
 class ColumnFormat(BaseModel):
     source_name: str

@@ -104,9 +107,9 @@ class DuckDBOperator:
             stringified_schema = ", ".join([f"{kv}" for kv in key_value_pairs])
             stringified_schema = f"{{ {stringified_schema} }}"
 
-            read_stmt = f"read_json('memory://inferences.json', format='array', columns={stringified_schema})"
+            read_stmt = f"read_json('memory://inferences.json', format='array', columns={stringified_schema}, maximum_object_size={MAX_JSON_OBJECT_SIZE})"
         else:
-            read_stmt = "read_json_auto('memory://inferences.json')"
+            read_stmt = f"read_json_auto('memory://inferences.json', maximum_object_size={MAX_JSON_OBJECT_SIZE})"
 
         conn.sql(
             f"CREATE OR REPLACE TEMP TABLE {table_name} AS SELECT * FROM {read_stmt}",
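The new `maximum_object_size` argument raises DuckDB's per-object JSON size limit (16 MB by default) so large inference payloads still parse. A self-contained sketch of the same pattern, assuming a local `inferences.json` file rather than the loader's in-memory `memory://` handle:

```python
import duckdb

# 1GB cap, matching the MAX_JSON_OBJECT_SIZE constant added above.
MAX_JSON_OBJECT_SIZE = 1024 * 1024 * 1024

conn = duckdb.connect()
# Assumed local file path; the package itself reads from an fsspec-backed
# 'memory://inferences.json' location instead.
read_stmt = f"read_json_auto('inferences.json', maximum_object_size={MAX_JSON_OBJECT_SIZE})"
conn.sql(f"CREATE OR REPLACE TEMP TABLE inferences AS SELECT * FROM {read_stmt}")
print(conn.sql("SELECT COUNT(*) FROM inferences").fetchone())
```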
@@ -312,6 +315,9 @@ def escape_identifier(identifier: str) -> str:
     """
     Escape an identifier (e.g., column name) for use in a SQL query.
     This method handles special characters and ensures proper quoting.
+
+    For struct fields, the identifiers must be escaped as following:
+    "struct_column_name"."struct_field"
     """
     # Replace any double quotes with two double quotes
     escaped = identifier.replace('"', '""')

@@ -319,6 +325,32 @@ def escape_identifier(identifier: str) -> str:
     return f'"{escaped}"'
 
 
+def unescape_identifier(identifier: str) -> str:
+    """
+    Unescape an identifier (e.g., column name).
+
+    This removes the double quotes and properly handles struct fields, which may be escaped as follows:
+    "struct_column_name"."struct_field"
+
+    Here's a hard case for help understanding this function: "struct "" column name with quotes"."struct.field.name.with.dots"
+    """
+    unescaped_identifiers = []
+    # strip top-level quotes
+    identifier = identifier[1:-1]
+    # split identifier into struct fields based on delimiter pattern "."
+    # at this point there are no external double quotes left; any remaining are escaped double quotes belonging to
+    # the column name
+    identifier_split_in_struct_fields = re.split(r'"\."', identifier)
+
+    for identifier in identifier_split_in_struct_fields:
+        # replace any escaped double quotes in the column
+        unescaped_identifier = identifier.replace('""', '"')
+        unescaped_identifiers.append(unescaped_identifier)
+
+    # join back any struct fields via dot syntax without the escape identifiers
+    return ".".join(unescaped_identifiers)
+
+
 def escape_str_literal(literal: str) -> str:
     """
     Escape a duckDB string literal for use in a SQL query.
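A quick usage sketch of the round trip on the "hard case" called out in the new docstring, assuming the functions are imported from the module shown above:

```python
from arthur_common.tools.duckdb_data_loader import (
    escape_identifier,
    unescape_identifier,
)

# The "hard case": a column name containing a double quote and a struct field
# name containing dots.
column = 'struct " column name with quotes'
field = "struct.field.name.with.dots"

escaped = f"{escape_identifier(column)}.{escape_identifier(field)}"
# -> '"struct "" column name with quotes"."struct.field.name.with.dots"'

# unescape_identifier strips the quoting and rejoins the struct parts with dots.
assert unescape_identifier(escaped) == f"{column}.{field}"
```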
arthur_common/tools/duckdb_utils.py

@@ -2,7 +2,6 @@ import duckdb
 
 from arthur_common.config.config import Config
 from arthur_common.models.schema_definitions import SEGMENTATION_ALLOWED_DTYPES, DType
-from arthur_common.tools.duckdb_data_loader import escape_identifier
 
 
 def is_column_possible_segmentation(

@@ -16,17 +15,15 @@ def is_column_possible_segmentation(
     2. Has an allowed DType.
 
     PreReq: Table with column should already be loaded in DuckDB
+    column_name already has DuckDB escape identifier for the query syntax
     """
     segmentation_col_unique_val_limit = Config.segmentation_col_unique_values_limit()
     if column_dtype not in SEGMENTATION_ALLOWED_DTYPES:
         return False
 
-    # check column for unique value count
-    escaped_column = escape_identifier(column_name)
-
-    # count distinct values in this column
+    # check column for unique value count - count distinct values in this column
     distinct_count_query = f"""
-        SELECT COUNT(DISTINCT {
+        SELECT COUNT(DISTINCT {column_name}) as distinct_count
         FROM {table}
     """
     result = conn.sql(distinct_count_query).fetchone()
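Under the new contract the caller escapes the column name before passing it in. A hedged sketch of a caller, assuming the packaged default config is loadable and that the column's dtype is one of `SEGMENTATION_ALLOWED_DTYPES`:

```python
import duckdb

from arthur_common.models.schema_definitions import DType
from arthur_common.tools.duckdb_data_loader import escape_identifier
from arthur_common.tools.duckdb_utils import is_column_possible_segmentation

conn = duckdb.connect()
# Tiny illustrative table with a column name that needs quoting.
conn.sql("""CREATE TEMP TABLE inferences AS SELECT * FROM (VALUES ('us'), ('eu')) t("user region")""")

# Caller now passes the pre-escaped identifier; the helper no longer escapes it.
if is_column_possible_segmentation(
    conn,
    "inferences",
    escape_identifier("user region"),
    DType.STRING,  # assumed to be an allowed segmentation dtype
):
    print("column can be used for segmentation")
```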
arthur_common/tools/schema_inferer.py

@@ -40,12 +40,11 @@ class SchemaInferer:
         self.conn.sql(
             f"CREATE OR REPLACE TEMP TABLE {escaped_col} AS SELECT UNNEST({escaped_col}) as {escaped_col} FROM {table}",
         )
-        return self._infer_schema(escaped_col
+        return self._infer_schema(escaped_col)
 
     def _infer_schema(
         self,
         table: str = "root",
-        is_nested_col: bool = False,
     ) -> DatasetObjectType:
         """is_nested_col indicates whether the function is being called on an unnested/flattened table that represents
         a struct column or list column in the root table."""

@@ -105,12 +104,10 @@ class SchemaInferer:
             raise NotImplementedError(f"Type {col_type} not mappable.")
 
         # tag column as a possible segmentation column if it meets criteria
-
-        # so we don't want to tag nested columns as possible segmentation columns
-        if not is_nested_col and is_column_possible_segmentation(
+        if is_column_possible_segmentation(
             self.conn,
             table,
-            col_name,
+            escape_identifier(col_name),
             scalar_schema.dtype,
         ):
             scalar_schema.tag_hints.append(ScopeSchemaTag.POSSIBLE_SEGMENTATION)
{arthur_common-2.1.58.dist-info → arthur_common-2.4.13.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: arthur-common
-Version: 2.1.58
+Version: 2.4.13
 Summary: Utility code common to Arthur platform components.
 License: MIT
 Author: Arthur

@@ -14,10 +14,11 @@ Requires-Dist: datasketches (>=5.1.0)
 Requires-Dist: duckdb (>=1.1.3)
 Requires-Dist: fastapi (>=0.115.8)
 Requires-Dist: fsspec (>=2024.10.0)
+Requires-Dist: openinference-semantic-conventions (>=0.1.12,<0.2.0)
 Requires-Dist: pandas (>=2.2.2)
 Requires-Dist: pydantic (>=2)
 Requires-Dist: simple-settings (>=1.2.0)
-Requires-Dist: tokencost (
+Requires-Dist: tokencost (>=0.1.0,<0.2.0)
 Requires-Dist: types-python-dateutil (>=2.9.0)
 Requires-Dist: types-requests (>=2.32.0.20241016)
 Requires-Dist: typing-extensions (>=4.7.1)

@@ -43,14 +44,14 @@ pip install arthur-common
 
 ## Requirements
 
-- Python 3.
+- Python 3.13
 
 ## Development
 
 To set up the development environment, ensure you have [Poetry](https://python-poetry.org/) installed, then run:
 
 ```bash
-poetry env use 3.
+poetry env use 3.13
 poetry install
 ```
 

@@ -62,6 +63,13 @@ This project uses [pytest](https://pytest.org/) for testing. To run the tests, e
 poetry run pytest
 ```
 
+## Release process
+1. Merge changes into `main` branch
+2. Go to **Actions** -> **Arthur Common Version Bump**
+3. Click **Run workflow**. The workflow will create a new commit with the version bump, push it back to the same branch it is triggered on (default `main`), and start the release process
+4. Watch in [GitHub Actions](https://github.com/arthur-ai/arthur-common/actions) for Arthur Common Release to run
+5. Update package version in your project (arthur-engine)
+
 ## License
 
 This project is licensed under the MIT License.
arthur_common-2.4.13.dist-info/RECORD

@@ -0,0 +1,49 @@
+arthur_common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+arthur_common/aggregations/__init__.py,sha256=vISWyciQAtksa71OKeHNP-QyFGd1NzBKq_LBsG0QSG8,67
+arthur_common/aggregations/aggregator.py,sha256=3qWeWKcv5iXhs_LIpyLgicoEds5X1airhmzHj6uBpzo,10469
+arthur_common/aggregations/functions/README.md,sha256=MkZoTAJ94My96R5Z8GAxud7S6vyR0vgVi9gqdt9a4XY,5460
+arthur_common/aggregations/functions/__init__.py,sha256=HqC3UNRURX7ZQHgamTrQvfA8u_FiZGZ4I4eQW7Ooe5o,1299
+arthur_common/aggregations/functions/agentic_aggregations.py,sha256=82OJ174uGcDqf7OLXY7dwnnv1g4kubkjazpc7Yj0xw0,39531
+arthur_common/aggregations/functions/categorical_count.py,sha256=jxV2w2Itmoh02VuazWN5z94PmQ-bRZjZpSoODGeBulQ,5099
+arthur_common/aggregations/functions/confusion_matrix.py,sha256=2fIqo50TcbUlGgPXxGtfFr6ehyZn69R8sphigGuMDgo,21626
+arthur_common/aggregations/functions/inference_count.py,sha256=Pxe5WT_Zgnn_wSDcm48l-flh-M5Zr72SbR4tQyNBk-o,3802
+arthur_common/aggregations/functions/inference_count_by_class.py,sha256=fmzrbRxiWgmutJYrBs7JY1iIRF7F6kozBzcsMypatlE,10896
+arthur_common/aggregations/functions/inference_null_count.py,sha256=X8mfeKb46VxUQFrjukSlVpM9AZCNvStsBHU3LsUbcEM,4591
+arthur_common/aggregations/functions/mean_absolute_error.py,sha256=P9H0rRvpObnWQiu4p7-yW6y6R7_-Ju23y2YlZQgxvHA,6352
+arthur_common/aggregations/functions/mean_squared_error.py,sha256=hZrHzfCscNnGKp_SqOeHEebzjMych1EXtnI1K70EYZE,6373
+arthur_common/aggregations/functions/multiclass_confusion_matrix.py,sha256=eA4y0xJikErkRww5OudUAMG9Y6cYztkO4w561nWVh5w,12195
+arthur_common/aggregations/functions/multiclass_inference_count_by_class.py,sha256=yiMpdz4VuX1ELprXYupFu4B9aDLIhgfEi3ma8jZsT_M,4261
+arthur_common/aggregations/functions/numeric_stats.py,sha256=28y0Zdhk3kLFiJYVWq_uev1C1yBZDn1aTUEdvLkqo3k,4660
+arthur_common/aggregations/functions/numeric_sum.py,sha256=TAeVVd5NqF7X9_hnMzbNVOVxdExcra4EZDkubtWHyAs,4780
+arthur_common/aggregations/functions/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+arthur_common/aggregations/functions/shield_aggregations.py,sha256=1Nc9kAePoaY9zoybDe5zKwCSu2lhpLV3fahoKC4ErjE,41083
+arthur_common/aggregations/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+arthur_common/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+arthur_common/config/config.py,sha256=fcpjOYjPKu4Duk63CuTHrOWKQKAlAhVUR60kF_2_Xog,1247
+arthur_common/config/settings.yaml,sha256=0CrygUwJzC5mGcO5Xnvv2ttp-P7LIsx682jllYA96NQ,161
+arthur_common/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+arthur_common/models/common_schemas.py,sha256=uGKU2139zyPSX2o7tmOn0ICKPvJ8SIDRYbIWDc2AyYc,6957
+arthur_common/models/connectors.py,sha256=gRdX4lNz0ObU64FqMmoffHVBwEgO3JfOf3wjn3tKv0Q,2264
+arthur_common/models/constants.py,sha256=munkU0LrLsDs9BtAfozzw30FCguIowmAUKg_9vqwX24,1049
+arthur_common/models/datasets.py,sha256=7p1tyJEPwXjBs2ZRoai8hTzNl6MK9jU1DluzASApE_4,254
+arthur_common/models/enums.py,sha256=zv8MpDq9whpxqUDPw_jygHnX53meFyiOB1ERS11AK_o,4237
+arthur_common/models/metric_schemas.py,sha256=Xf-1RTzg7iYtnBMLkUUUuMPzAujzzNvQx_pe-CksEdU,2484
+arthur_common/models/metrics.py,sha256=mCa0aN-nuNHYcqGfkyKFeriI0krz0-ScgmXWXHlKoEI,11109
+arthur_common/models/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+arthur_common/models/request_schemas.py,sha256=-5h6BEmoOFfiafeAMNBYxKEoGBmK3vomSNvg8bCZwuc,31327
+arthur_common/models/response_schemas.py,sha256=Hazx_cexqsJ6ehXvIn2EhvbODkYkJCg-ALDO9a6a8Qo,29641
+arthur_common/models/schema_definitions.py,sha256=FJptBo7WRyryJMysC9rPb1KKnEzk6Yy1TY0KoF8hC8Y,17054
+arthur_common/models/task_job_specs.py,sha256=p7jsSb97ylHYNkwoHXNOJvx2zcnh2kxLeh3m0pddo4M,3442
+arthur_common/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+arthur_common/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+arthur_common/tools/aggregation_analyzer.py,sha256=UfMtvFWXV2Dqly8S6nneGgomuvEGN-1tBz81tfkMcAE,11206
+arthur_common/tools/aggregation_loader.py,sha256=3CF46bNi-GdJBNOXkjYfCQ1Aung8lf65L532sdWmR_s,2351
+arthur_common/tools/duckdb_data_loader.py,sha256=A80wpATSc4VJLghoHwxpBEuUsxY93OZS0Qo4cFX7cRw,12462
+arthur_common/tools/duckdb_utils.py,sha256=PZ3AKoBUaU6papqNiNQ4Sm2ugg5bGyXfaC_1I-E2q3s,1142
+arthur_common/tools/functions.py,sha256=FWL4eWO5-vLp86WudT-MGUKvf2B8f02IdoXQFKd6d8k,1093
+arthur_common/tools/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+arthur_common/tools/schema_inferer.py,sha256=8ehIqAxuGlgM08RtwPB43a7TfenZyEIf1R0p1RYrkng,4920
+arthur_common/tools/time_utils.py,sha256=4gfiu9NXfvPZltiVNLSIQGylX6h2W0viNi9Kv4bKyfw,1410
+arthur_common-2.4.13.dist-info/METADATA,sha256=5fglOTXbNEtutVuyHLR0Kv5qCXNk3oJRls_-yTcfIv0,2153
+arthur_common-2.4.13.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+arthur_common-2.4.13.dist-info/RECORD,,