arthur-common 2.1.58__py3-none-any.whl → 2.4.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. arthur_common/aggregations/aggregator.py +73 -9
  2. arthur_common/aggregations/functions/agentic_aggregations.py +260 -85
  3. arthur_common/aggregations/functions/categorical_count.py +15 -15
  4. arthur_common/aggregations/functions/confusion_matrix.py +24 -26
  5. arthur_common/aggregations/functions/inference_count.py +5 -9
  6. arthur_common/aggregations/functions/inference_count_by_class.py +16 -27
  7. arthur_common/aggregations/functions/inference_null_count.py +10 -13
  8. arthur_common/aggregations/functions/mean_absolute_error.py +12 -18
  9. arthur_common/aggregations/functions/mean_squared_error.py +12 -18
  10. arthur_common/aggregations/functions/multiclass_confusion_matrix.py +13 -20
  11. arthur_common/aggregations/functions/multiclass_inference_count_by_class.py +1 -1
  12. arthur_common/aggregations/functions/numeric_stats.py +13 -15
  13. arthur_common/aggregations/functions/numeric_sum.py +12 -15
  14. arthur_common/aggregations/functions/shield_aggregations.py +457 -215
  15. arthur_common/models/common_schemas.py +214 -0
  16. arthur_common/models/connectors.py +10 -2
  17. arthur_common/models/constants.py +24 -0
  18. arthur_common/models/datasets.py +0 -9
  19. arthur_common/models/enums.py +177 -0
  20. arthur_common/models/metric_schemas.py +63 -0
  21. arthur_common/models/metrics.py +2 -9
  22. arthur_common/models/request_schemas.py +870 -0
  23. arthur_common/models/response_schemas.py +785 -0
  24. arthur_common/models/schema_definitions.py +6 -1
  25. arthur_common/models/task_job_specs.py +3 -12
  26. arthur_common/tools/duckdb_data_loader.py +34 -2
  27. arthur_common/tools/duckdb_utils.py +3 -6
  28. arthur_common/tools/schema_inferer.py +3 -6
  29. {arthur_common-2.1.58.dist-info → arthur_common-2.4.13.dist-info}/METADATA +12 -4
  30. arthur_common-2.4.13.dist-info/RECORD +49 -0
  31. arthur_common/models/shield.py +0 -642
  32. arthur_common-2.1.58.dist-info/RECORD +0 -44
  33. {arthur_common-2.1.58.dist-info → arthur_common-2.4.13.dist-info}/WHEEL +0 -0
arthur_common/models/schema_definitions.py

@@ -6,7 +6,7 @@ from uuid import UUID, uuid4
 
 from pydantic import BaseModel, ConfigDict, Field, computed_field, model_validator
 
-from arthur_common.models.datasets import ModelProblemType
+from arthur_common.models.enums import ModelProblemType
 
 
 class ScopeSchemaTag(str, Enum):
@@ -433,6 +433,11 @@ def SHIELD_SCHEMA() -> DatasetSchema:
             source_name="conversation_id",
             definition=create_dataset_scalar_type(DType.STRING),
         ),
+        DatasetColumn(
+            id=uuid4(),
+            source_name="user_id",
+            definition=create_dataset_scalar_type(DType.STRING),
+        ),
         DatasetColumn(
             id=uuid4(),
             source_name="inference_prompt",
arthur_common/models/task_job_specs.py

@@ -1,23 +1,14 @@
-from enum import Enum
 from typing import Literal, Optional, Self
 from uuid import UUID
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, model_validator
 
-from arthur_common.models.shield import (
-    NewMetricRequest,
-    NewRuleRequest,
-    model_validator,
-)
+from arthur_common.models.enums import TaskType
+from arthur_common.models.request_schemas import NewMetricRequest, NewRuleRequest
 
 onboarding_id_desc = "An identifier to assign to the created model to make it easy to retrieve. Used by the UI during the GenAI model creation flow."
 
 
-class TaskType(str, Enum):
-    TRADITIONAL = "traditional"
-    AGENTIC = "agentic"
-
-
 class CreateModelTaskJobSpec(BaseModel):
     job_type: Literal["create_model_task"] = "create_model_task"
     connector_id: UUID = Field(
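For downstream code, this hunk (together with the `ModelProblemType` hunk above) amounts to an import move rather than a behavior change. A minimal migration sketch, assuming the caller previously imported these names from their old locations:

```python
# Before (arthur-common 2.1.x) -- locations removed in this release:
# from arthur_common.models.task_job_specs import TaskType
# from arthur_common.models.shield import NewMetricRequest, NewRuleRequest
# from arthur_common.models.datasets import ModelProblemType

# After (arthur-common 2.4.x) -- consolidated locations shown in this diff:
from arthur_common.models.enums import ModelProblemType, TaskType
from arthur_common.models.request_schemas import NewMetricRequest, NewRuleRequest
```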
arthur_common/tools/duckdb_data_loader.py

@@ -1,4 +1,5 @@
 import json
+import re
 from typing import Any
 
 import duckdb
@@ -16,6 +17,8 @@ from arthur_common.models.schema_definitions import (
     DType,
 )
 
+MAX_JSON_OBJECT_SIZE = 1024 * 1024 * 1024  # 1GB
+
 
 class ColumnFormat(BaseModel):
     source_name: str
@@ -104,9 +107,9 @@ class DuckDBOperator:
             stringified_schema = ", ".join([f"{kv}" for kv in key_value_pairs])
             stringified_schema = f"{{ {stringified_schema} }}"
 
-            read_stmt = f"read_json('memory://inferences.json', format='array', columns={stringified_schema})"
+            read_stmt = f"read_json('memory://inferences.json', format='array', columns={stringified_schema}, maximum_object_size={MAX_JSON_OBJECT_SIZE})"
         else:
-            read_stmt = "read_json_auto('memory://inferences.json')"
+            read_stmt = f"read_json_auto('memory://inferences.json', maximum_object_size={MAX_JSON_OBJECT_SIZE})"
 
         conn.sql(
             f"CREATE OR REPLACE TEMP TABLE {table_name} AS SELECT * FROM {read_stmt}",
@@ -312,6 +315,9 @@ def escape_identifier(identifier: str) -> str:
     """
     Escape an identifier (e.g., column name) for use in a SQL query.
     This method handles special characters and ensures proper quoting.
+
+    For struct fields, the identifiers must be escaped as following:
+    "struct_column_name"."struct_field"
     """
     # Replace any double quotes with two double quotes
     escaped = identifier.replace('"', '""')
@@ -319,6 +325,32 @@ def escape_identifier(identifier: str) -> str:
     return f'"{escaped}"'
 
 
+def unescape_identifier(identifier: str) -> str:
+    """
+    Unescape an identifier (e.g., column name).
+
+    This removes the double quotes and properly handles struct fields, which may be escaped as follows:
+    "struct_column_name"."struct_field"
+
+    Here's a hard case for help understanding this function: "struct "" column name with quotes"."struct.field.name.with.dots"
+    """
+    unescaped_identifiers = []
+    # strip top-level quotes
+    identifier = identifier[1:-1]
+    # split identifier into struct fields based on delimiter pattern "."
+    # at this point there are no external double quotes left; any remaining are escaped double quotes belonging to
+    # the column name
+    identifier_split_in_struct_fields = re.split(r'"\."', identifier)
+
+    for identifier in identifier_split_in_struct_fields:
+        # replace any escaped double quotes in the column
+        unescaped_identifier = identifier.replace('""', '"')
+        unescaped_identifiers.append(unescaped_identifier)
+
+    # join back any struct fields via dot syntax without the escape identifiers
+    return ".".join(unescaped_identifiers)
+
+
 def escape_str_literal(literal: str) -> str:
     """
     Escape a duckDB string literal for use in a SQL query.
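The new `unescape_identifier` is meant to invert `escape_identifier`, including for quoted struct paths. A small round-trip sketch, assuming both helpers are imported from this module and using illustrative values only:

```python
from arthur_common.tools.duckdb_data_loader import escape_identifier, unescape_identifier

# Plain column name containing a double quote:
escaped = escape_identifier('answer "raw"')       # -> '"answer ""raw"""'
assert unescape_identifier(escaped) == 'answer "raw"'

# Struct access escaped field-by-field and joined with a quoted dot, matching the
# '"struct_column_name"."struct_field"' convention described in the docstrings:
nested = f'{escape_identifier("metadata")}.{escape_identifier("user.id")}'
assert nested == '"metadata"."user.id"'
assert unescape_identifier(nested) == "metadata.user.id"
```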
arthur_common/tools/duckdb_utils.py

@@ -2,7 +2,6 @@ import duckdb
 
 from arthur_common.config.config import Config
 from arthur_common.models.schema_definitions import SEGMENTATION_ALLOWED_DTYPES, DType
-from arthur_common.tools.duckdb_data_loader import escape_identifier
 
 
 def is_column_possible_segmentation(
@@ -16,17 +15,15 @@
     2. Has an allowed DType.
 
     PreReq: Table with column should already be loaded in DuckDB
+    column_name already has DuckDB escape identifier for the query syntax
     """
     segmentation_col_unique_val_limit = Config.segmentation_col_unique_values_limit()
     if column_dtype not in SEGMENTATION_ALLOWED_DTYPES:
         return False
 
-    # check column for unique value count
-    escaped_column = escape_identifier(column_name)
-
-    # count distinct values in this column
+    # check column for unique value count - count distinct values in this column
     distinct_count_query = f"""
-        SELECT COUNT(DISTINCT {escaped_column}) as distinct_count
+        SELECT COUNT(DISTINCT {column_name}) as distinct_count
         FROM {table}
     """
     result = conn.sql(distinct_count_query).fetchone()
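Note the contract change: the helper no longer escapes the column itself, so callers must pass an identifier that is already escaped for DuckDB (the `SchemaInferer` hunks below do exactly this). A minimal caller-side sketch under that assumption:

```python
from arthur_common.tools.duckdb_data_loader import escape_identifier

# column_name holds the raw source column; escape it before handing it to the check.
if is_column_possible_segmentation(conn, table, escape_identifier(column_name), column_dtype):
    ...  # treat the column as a segmentation candidate
```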
arthur_common/tools/schema_inferer.py

@@ -40,12 +40,11 @@ class SchemaInferer:
         self.conn.sql(
             f"CREATE OR REPLACE TEMP TABLE {escaped_col} AS SELECT UNNEST({escaped_col}) as {escaped_col} FROM {table}",
         )
-        return self._infer_schema(escaped_col, is_nested_col=True)
+        return self._infer_schema(escaped_col)
 
     def _infer_schema(
         self,
         table: str = "root",
-        is_nested_col: bool = False,
     ) -> DatasetObjectType:
         """is_nested_col indicates whether the function is being called on an unnested/flattened table that represents
         a struct column or list column in the root table."""
@@ -105,12 +104,10 @@ class SchemaInferer:
             raise NotImplementedError(f"Type {col_type} not mappable.")
 
         # tag column as a possible segmentation column if it meets criteria
-        # we only support top-level column aggregations right now (ie you can't aggregate on a nested column)
-        # so we don't want to tag nested columns as possible segmentation columns
-        if not is_nested_col and is_column_possible_segmentation(
+        if is_column_possible_segmentation(
             self.conn,
             table,
-            col_name,
+            escape_identifier(col_name),
             scalar_schema.dtype,
         ):
             scalar_schema.tag_hints.append(ScopeSchemaTag.POSSIBLE_SEGMENTATION)
{arthur_common-2.1.58.dist-info → arthur_common-2.4.13.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: arthur-common
-Version: 2.1.58
+Version: 2.4.13
 Summary: Utility code common to Arthur platform components.
 License: MIT
 Author: Arthur
@@ -14,10 +14,11 @@ Requires-Dist: datasketches (>=5.1.0)
 Requires-Dist: duckdb (>=1.1.3)
 Requires-Dist: fastapi (>=0.115.8)
 Requires-Dist: fsspec (>=2024.10.0)
+Requires-Dist: openinference-semantic-conventions (>=0.1.12,<0.2.0)
 Requires-Dist: pandas (>=2.2.2)
 Requires-Dist: pydantic (>=2)
 Requires-Dist: simple-settings (>=1.2.0)
-Requires-Dist: tokencost (==0.1.24)
+Requires-Dist: tokencost (>=0.1.0,<0.2.0)
 Requires-Dist: types-python-dateutil (>=2.9.0)
 Requires-Dist: types-requests (>=2.32.0.20241016)
 Requires-Dist: typing-extensions (>=4.7.1)
@@ -43,14 +44,14 @@ pip install arthur-common
 
 ## Requirements
 
-- Python 3.12
+- Python 3.13
 
 ## Development
 
 To set up the development environment, ensure you have [Poetry](https://python-poetry.org/) installed, then run:
 
 ```bash
-poetry env use 3.12
+poetry env use 3.13
 poetry install
 ```
 
@@ -62,6 +63,13 @@ This project uses [pytest](https://pytest.org/) for testing. To run the tests, e
 poetry run pytest
 ```
 
+## Release process
+1. Merge changes into `main` branch
+2. Go to **Actions** -> **Arthur Common Version Bump**
+3. Click **Run workflow**. The workflow will create a new commit with the version bump, push it back to the same branch it is triggered on (default `main`), and start the release process
+4. Watch in [GitHub Actions](https://github.com/arthur-ai/arthur-common/actions) for Arthur Common Release to run
+5. Update package version in your project (arthur-engine)
+
 ## License
 
 This project is licensed under the MIT License.
arthur_common-2.4.13.dist-info/RECORD

@@ -0,0 +1,49 @@
+arthur_common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+arthur_common/aggregations/__init__.py,sha256=vISWyciQAtksa71OKeHNP-QyFGd1NzBKq_LBsG0QSG8,67
+arthur_common/aggregations/aggregator.py,sha256=3qWeWKcv5iXhs_LIpyLgicoEds5X1airhmzHj6uBpzo,10469
+arthur_common/aggregations/functions/README.md,sha256=MkZoTAJ94My96R5Z8GAxud7S6vyR0vgVi9gqdt9a4XY,5460
+arthur_common/aggregations/functions/__init__.py,sha256=HqC3UNRURX7ZQHgamTrQvfA8u_FiZGZ4I4eQW7Ooe5o,1299
+arthur_common/aggregations/functions/agentic_aggregations.py,sha256=82OJ174uGcDqf7OLXY7dwnnv1g4kubkjazpc7Yj0xw0,39531
+arthur_common/aggregations/functions/categorical_count.py,sha256=jxV2w2Itmoh02VuazWN5z94PmQ-bRZjZpSoODGeBulQ,5099
+arthur_common/aggregations/functions/confusion_matrix.py,sha256=2fIqo50TcbUlGgPXxGtfFr6ehyZn69R8sphigGuMDgo,21626
+arthur_common/aggregations/functions/inference_count.py,sha256=Pxe5WT_Zgnn_wSDcm48l-flh-M5Zr72SbR4tQyNBk-o,3802
+arthur_common/aggregations/functions/inference_count_by_class.py,sha256=fmzrbRxiWgmutJYrBs7JY1iIRF7F6kozBzcsMypatlE,10896
+arthur_common/aggregations/functions/inference_null_count.py,sha256=X8mfeKb46VxUQFrjukSlVpM9AZCNvStsBHU3LsUbcEM,4591
+arthur_common/aggregations/functions/mean_absolute_error.py,sha256=P9H0rRvpObnWQiu4p7-yW6y6R7_-Ju23y2YlZQgxvHA,6352
+arthur_common/aggregations/functions/mean_squared_error.py,sha256=hZrHzfCscNnGKp_SqOeHEebzjMych1EXtnI1K70EYZE,6373
+arthur_common/aggregations/functions/multiclass_confusion_matrix.py,sha256=eA4y0xJikErkRww5OudUAMG9Y6cYztkO4w561nWVh5w,12195
+arthur_common/aggregations/functions/multiclass_inference_count_by_class.py,sha256=yiMpdz4VuX1ELprXYupFu4B9aDLIhgfEi3ma8jZsT_M,4261
+arthur_common/aggregations/functions/numeric_stats.py,sha256=28y0Zdhk3kLFiJYVWq_uev1C1yBZDn1aTUEdvLkqo3k,4660
+arthur_common/aggregations/functions/numeric_sum.py,sha256=TAeVVd5NqF7X9_hnMzbNVOVxdExcra4EZDkubtWHyAs,4780
+arthur_common/aggregations/functions/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+arthur_common/aggregations/functions/shield_aggregations.py,sha256=1Nc9kAePoaY9zoybDe5zKwCSu2lhpLV3fahoKC4ErjE,41083
+arthur_common/aggregations/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+arthur_common/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+arthur_common/config/config.py,sha256=fcpjOYjPKu4Duk63CuTHrOWKQKAlAhVUR60kF_2_Xog,1247
+arthur_common/config/settings.yaml,sha256=0CrygUwJzC5mGcO5Xnvv2ttp-P7LIsx682jllYA96NQ,161
+arthur_common/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+arthur_common/models/common_schemas.py,sha256=uGKU2139zyPSX2o7tmOn0ICKPvJ8SIDRYbIWDc2AyYc,6957
+arthur_common/models/connectors.py,sha256=gRdX4lNz0ObU64FqMmoffHVBwEgO3JfOf3wjn3tKv0Q,2264
+arthur_common/models/constants.py,sha256=munkU0LrLsDs9BtAfozzw30FCguIowmAUKg_9vqwX24,1049
+arthur_common/models/datasets.py,sha256=7p1tyJEPwXjBs2ZRoai8hTzNl6MK9jU1DluzASApE_4,254
+arthur_common/models/enums.py,sha256=zv8MpDq9whpxqUDPw_jygHnX53meFyiOB1ERS11AK_o,4237
+arthur_common/models/metric_schemas.py,sha256=Xf-1RTzg7iYtnBMLkUUUuMPzAujzzNvQx_pe-CksEdU,2484
+arthur_common/models/metrics.py,sha256=mCa0aN-nuNHYcqGfkyKFeriI0krz0-ScgmXWXHlKoEI,11109
+arthur_common/models/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+arthur_common/models/request_schemas.py,sha256=-5h6BEmoOFfiafeAMNBYxKEoGBmK3vomSNvg8bCZwuc,31327
+arthur_common/models/response_schemas.py,sha256=Hazx_cexqsJ6ehXvIn2EhvbODkYkJCg-ALDO9a6a8Qo,29641
+arthur_common/models/schema_definitions.py,sha256=FJptBo7WRyryJMysC9rPb1KKnEzk6Yy1TY0KoF8hC8Y,17054
+arthur_common/models/task_job_specs.py,sha256=p7jsSb97ylHYNkwoHXNOJvx2zcnh2kxLeh3m0pddo4M,3442
+arthur_common/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+arthur_common/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+arthur_common/tools/aggregation_analyzer.py,sha256=UfMtvFWXV2Dqly8S6nneGgomuvEGN-1tBz81tfkMcAE,11206
+arthur_common/tools/aggregation_loader.py,sha256=3CF46bNi-GdJBNOXkjYfCQ1Aung8lf65L532sdWmR_s,2351
+arthur_common/tools/duckdb_data_loader.py,sha256=A80wpATSc4VJLghoHwxpBEuUsxY93OZS0Qo4cFX7cRw,12462
+arthur_common/tools/duckdb_utils.py,sha256=PZ3AKoBUaU6papqNiNQ4Sm2ugg5bGyXfaC_1I-E2q3s,1142
+arthur_common/tools/functions.py,sha256=FWL4eWO5-vLp86WudT-MGUKvf2B8f02IdoXQFKd6d8k,1093
+arthur_common/tools/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+arthur_common/tools/schema_inferer.py,sha256=8ehIqAxuGlgM08RtwPB43a7TfenZyEIf1R0p1RYrkng,4920
+arthur_common/tools/time_utils.py,sha256=4gfiu9NXfvPZltiVNLSIQGylX6h2W0viNi9Kv4bKyfw,1410
+arthur_common-2.4.13.dist-info/METADATA,sha256=5fglOTXbNEtutVuyHLR0Kv5qCXNk3oJRls_-yTcfIv0,2153
+arthur_common-2.4.13.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+arthur_common-2.4.13.dist-info/RECORD,,