arthur-common 2.1.58__py3-none-any.whl → 2.4.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. arthur_common/aggregations/aggregator.py +73 -9
  2. arthur_common/aggregations/functions/agentic_aggregations.py +260 -85
  3. arthur_common/aggregations/functions/categorical_count.py +15 -15
  4. arthur_common/aggregations/functions/confusion_matrix.py +24 -26
  5. arthur_common/aggregations/functions/inference_count.py +5 -9
  6. arthur_common/aggregations/functions/inference_count_by_class.py +16 -27
  7. arthur_common/aggregations/functions/inference_null_count.py +10 -13
  8. arthur_common/aggregations/functions/mean_absolute_error.py +12 -18
  9. arthur_common/aggregations/functions/mean_squared_error.py +12 -18
  10. arthur_common/aggregations/functions/multiclass_confusion_matrix.py +13 -20
  11. arthur_common/aggregations/functions/multiclass_inference_count_by_class.py +1 -1
  12. arthur_common/aggregations/functions/numeric_stats.py +13 -15
  13. arthur_common/aggregations/functions/numeric_sum.py +12 -15
  14. arthur_common/aggregations/functions/shield_aggregations.py +457 -215
  15. arthur_common/models/common_schemas.py +214 -0
  16. arthur_common/models/connectors.py +10 -2
  17. arthur_common/models/constants.py +24 -0
  18. arthur_common/models/datasets.py +0 -9
  19. arthur_common/models/enums.py +177 -0
  20. arthur_common/models/metric_schemas.py +63 -0
  21. arthur_common/models/metrics.py +2 -9
  22. arthur_common/models/request_schemas.py +870 -0
  23. arthur_common/models/response_schemas.py +785 -0
  24. arthur_common/models/schema_definitions.py +6 -1
  25. arthur_common/models/task_job_specs.py +3 -12
  26. arthur_common/tools/duckdb_data_loader.py +34 -2
  27. arthur_common/tools/duckdb_utils.py +3 -6
  28. arthur_common/tools/schema_inferer.py +3 -6
  29. {arthur_common-2.1.58.dist-info → arthur_common-2.4.13.dist-info}/METADATA +12 -4
  30. arthur_common-2.4.13.dist-info/RECORD +49 -0
  31. arthur_common/models/shield.py +0 -642
  32. arthur_common-2.1.58.dist-info/RECORD +0 -44
  33. {arthur_common-2.1.58.dist-info → arthur_common-2.4.13.dist-info}/WHEEL +0 -0
arthur_common/models/schema_definitions.py

@@ -6,7 +6,7 @@ from uuid import UUID, uuid4
 
 from pydantic import BaseModel, ConfigDict, Field, computed_field, model_validator
 
-from arthur_common.models.datasets import ModelProblemType
+from arthur_common.models.enums import ModelProblemType
 
 
 class ScopeSchemaTag(str, Enum):
@@ -433,6 +433,11 @@ def SHIELD_SCHEMA() -> DatasetSchema:
             source_name="conversation_id",
             definition=create_dataset_scalar_type(DType.STRING),
         ),
+        DatasetColumn(
+            id=uuid4(),
+            source_name="user_id",
+            definition=create_dataset_scalar_type(DType.STRING),
+        ),
         DatasetColumn(
             id=uuid4(),
             source_name="inference_prompt",
arthur_common/models/task_job_specs.py

@@ -1,23 +1,14 @@
-from enum import Enum
 from typing import Literal, Optional, Self
 from uuid import UUID
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, model_validator
 
-from arthur_common.models.shield import (
-    NewMetricRequest,
-    NewRuleRequest,
-    model_validator,
-)
+from arthur_common.models.enums import TaskType
+from arthur_common.models.request_schemas import NewMetricRequest, NewRuleRequest
 
 onboarding_id_desc = "An identifier to assign to the created model to make it easy to retrieve. Used by the UI during the GenAI model creation flow."
 
 
-class TaskType(str, Enum):
-    TRADITIONAL = "traditional"
-    AGENTIC = "agentic"
-
-
 class CreateModelTaskJobSpec(BaseModel):
     job_type: Literal["create_model_task"] = "create_model_task"
     connector_id: UUID = Field(
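For downstream code, this hunk (together with the `ModelProblemType` hunk above) amounts to an import move rather than a behavior change. A minimal migration sketch, assuming the caller previously imported these names from their old locations:

```python
# Before (arthur-common 2.1.x) -- locations removed in this release:
# from arthur_common.models.task_job_specs import TaskType
# from arthur_common.models.shield import NewMetricRequest, NewRuleRequest
# from arthur_common.models.datasets import ModelProblemType

# After (arthur-common 2.4.x) -- consolidated locations shown in this diff:
from arthur_common.models.enums import ModelProblemType, TaskType
from arthur_common.models.request_schemas import NewMetricRequest, NewRuleRequest
```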
arthur_common/tools/duckdb_data_loader.py

@@ -1,4 +1,5 @@
 import json
+import re
 from typing import Any
 
 import duckdb
@@ -16,6 +17,8 @@ from arthur_common.models.schema_definitions import (
     DType,
 )
 
+MAX_JSON_OBJECT_SIZE = 1024 * 1024 * 1024  # 1GB
+
 
 class ColumnFormat(BaseModel):
     source_name: str
@@ -104,9 +107,9 @@ class DuckDBOperator:
             stringified_schema = ", ".join([f"{kv}" for kv in key_value_pairs])
             stringified_schema = f"{{ {stringified_schema} }}"
 
-            read_stmt = f"read_json('memory://inferences.json', format='array', columns={stringified_schema})"
+            read_stmt = f"read_json('memory://inferences.json', format='array', columns={stringified_schema}, maximum_object_size={MAX_JSON_OBJECT_SIZE})"
         else:
-            read_stmt = "read_json_auto('memory://inferences.json')"
+            read_stmt = f"read_json_auto('memory://inferences.json', maximum_object_size={MAX_JSON_OBJECT_SIZE})"
 
         conn.sql(
             f"CREATE OR REPLACE TEMP TABLE {table_name} AS SELECT * FROM {read_stmt}",
@@ -312,6 +315,9 @@ def escape_identifier(identifier: str) -> str:
     """
     Escape an identifier (e.g., column name) for use in a SQL query.
     This method handles special characters and ensures proper quoting.
+
+    For struct fields, the identifiers must be escaped as following:
+    "struct_column_name"."struct_field"
     """
     # Replace any double quotes with two double quotes
     escaped = identifier.replace('"', '""')
@@ -319,6 +325,32 @@ def escape_identifier(identifier: str) -> str:
     return f'"{escaped}"'
 
 
+def unescape_identifier(identifier: str) -> str:
+    """
+    Unescape an identifier (e.g., column name).
+
+    This removes the double quotes and properly handles struct fields, which may be escaped as follows:
+    "struct_column_name"."struct_field"
+
+    Here's a hard case for help understanding this function: "struct "" column name with quotes"."struct.field.name.with.dots"
+    """
+    unescaped_identifiers = []
+    # strip top-level quotes
+    identifier = identifier[1:-1]
+    # split identifier into struct fields based on delimiter pattern "."
+    # at this point there are no external double quotes left; any remaining are escaped double quotes belonging to
+    # the column name
+    identifier_split_in_struct_fields = re.split(r'"\."', identifier)
+
+    for identifier in identifier_split_in_struct_fields:
+        # replace any escaped double quotes in the column
+        unescaped_identifier = identifier.replace('""', '"')
+        unescaped_identifiers.append(unescaped_identifier)
+
+    # join back any struct fields via dot syntax without the escape identifiers
+    return ".".join(unescaped_identifiers)
+
+
 def escape_str_literal(literal: str) -> str:
     """
     Escape a duckDB string literal for use in a SQL query.
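The new `unescape_identifier` is meant to invert `escape_identifier`, including for quoted struct paths. A small round-trip sketch, assuming both helpers are imported from this module and using illustrative values only:

```python
from arthur_common.tools.duckdb_data_loader import escape_identifier, unescape_identifier

# Plain column name containing a double quote:
escaped = escape_identifier('answer "raw"')       # -> '"answer ""raw"""'
assert unescape_identifier(escaped) == 'answer "raw"'

# Struct access escaped field-by-field and joined with a quoted dot, matching the
# '"struct_column_name"."struct_field"' convention described in the docstrings:
nested = f'{escape_identifier("metadata")}.{escape_identifier("user.id")}'
assert nested == '"metadata"."user.id"'
assert unescape_identifier(nested) == "metadata.user.id"
```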
arthur_common/tools/duckdb_utils.py

@@ -2,7 +2,6 @@ import duckdb
 
 from arthur_common.config.config import Config
 from arthur_common.models.schema_definitions import SEGMENTATION_ALLOWED_DTYPES, DType
-from arthur_common.tools.duckdb_data_loader import escape_identifier
 
 
 def is_column_possible_segmentation(
@@ -16,17 +15,15 @@
     2. Has an allowed DType.
 
     PreReq: Table with column should already be loaded in DuckDB
+    column_name already has DuckDB escape identifier for the query syntax
     """
     segmentation_col_unique_val_limit = Config.segmentation_col_unique_values_limit()
     if column_dtype not in SEGMENTATION_ALLOWED_DTYPES:
         return False
 
-    # check column for unique value count
-    escaped_column = escape_identifier(column_name)
-
-    # count distinct values in this column
+    # check column for unique value count - count distinct values in this column
     distinct_count_query = f"""
-        SELECT COUNT(DISTINCT {escaped_column}) as distinct_count
+        SELECT COUNT(DISTINCT {column_name}) as distinct_count
         FROM {table}
     """
     result = conn.sql(distinct_count_query).fetchone()
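Note the contract change: the helper no longer escapes the column itself, so callers must pass an identifier that is already escaped for DuckDB (the `SchemaInferer` hunks below do exactly this). A minimal caller-side sketch under that assumption:

```python
from arthur_common.tools.duckdb_data_loader import escape_identifier

# column_name holds the raw source column; escape it before handing it to the check.
if is_column_possible_segmentation(conn, table, escape_identifier(column_name), column_dtype):
    ...  # treat the column as a segmentation candidate
```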
arthur_common/tools/schema_inferer.py

@@ -40,12 +40,11 @@ class SchemaInferer:
         self.conn.sql(
             f"CREATE OR REPLACE TEMP TABLE {escaped_col} AS SELECT UNNEST({escaped_col}) as {escaped_col} FROM {table}",
         )
-        return self._infer_schema(escaped_col, is_nested_col=True)
+        return self._infer_schema(escaped_col)
 
     def _infer_schema(
         self,
         table: str = "root",
-        is_nested_col: bool = False,
     ) -> DatasetObjectType:
         """is_nested_col indicates whether the function is being called on an unnested/flattened table that represents
         a struct column or list column in the root table."""
@@ -105,12 +104,10 @@ class SchemaInferer:
             raise NotImplementedError(f"Type {col_type} not mappable.")
 
         # tag column as a possible segmentation column if it meets criteria
-        # we only support top-level column aggregations right now (ie you can't aggregate on a nested column)
-        # so we don't want to tag nested columns as possible segmentation columns
-        if not is_nested_col and is_column_possible_segmentation(
+        if is_column_possible_segmentation(
             self.conn,
             table,
-            col_name,
+            escape_identifier(col_name),
             scalar_schema.dtype,
         ):
             scalar_schema.tag_hints.append(ScopeSchemaTag.POSSIBLE_SEGMENTATION)
{arthur_common-2.1.58.dist-info → arthur_common-2.4.13.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: arthur-common
-Version: 2.1.58
+Version: 2.4.13
 Summary: Utility code common to Arthur platform components.
 License: MIT
 Author: Arthur
@@ -14,10 +14,11 @@ Requires-Dist: datasketches (>=5.1.0)
 Requires-Dist: duckdb (>=1.1.3)
 Requires-Dist: fastapi (>=0.115.8)
 Requires-Dist: fsspec (>=2024.10.0)
+Requires-Dist: openinference-semantic-conventions (>=0.1.12,<0.2.0)
 Requires-Dist: pandas (>=2.2.2)
 Requires-Dist: pydantic (>=2)
 Requires-Dist: simple-settings (>=1.2.0)
-Requires-Dist: tokencost (==0.1.24)
+Requires-Dist: tokencost (>=0.1.0,<0.2.0)
 Requires-Dist: types-python-dateutil (>=2.9.0)
 Requires-Dist: types-requests (>=2.32.0.20241016)
 Requires-Dist: typing-extensions (>=4.7.1)
@@ -43,14 +44,14 @@ pip install arthur-common
 
 ## Requirements
 
-- Python 3.12
+- Python 3.13
 
 ## Development
 
 To set up the development environment, ensure you have [Poetry](https://python-poetry.org/) installed, then run:
 
 ```bash
-poetry env use 3.12
+poetry env use 3.13
 poetry install
 ```
 
@@ -62,6 +63,13 @@ This project uses [pytest](https://pytest.org/) for testing. To run the tests, e
 poetry run pytest
 ```
 
+## Release process
+1. Merge changes into `main` branch
+2. Go to **Actions** -> **Arthur Common Version Bump**
+3. Click **Run workflow**. The workflow will create a new commit with the version bump, push it back to the same branch it is triggered on (default `main`), and start the release process
+4. Watch in [GitHub Actions](https://github.com/arthur-ai/arthur-common/actions) for Arthur Common Release to run
+5. Update package version in your project (arthur-engine)
+
 ## License
 
 This project is licensed under the MIT License.
arthur_common-2.4.13.dist-info/RECORD

@@ -0,0 +1,49 @@
+arthur_common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+arthur_common/aggregations/__init__.py,sha256=vISWyciQAtksa71OKeHNP-QyFGd1NzBKq_LBsG0QSG8,67
+arthur_common/aggregations/aggregator.py,sha256=3qWeWKcv5iXhs_LIpyLgicoEds5X1airhmzHj6uBpzo,10469
+arthur_common/aggregations/functions/README.md,sha256=MkZoTAJ94My96R5Z8GAxud7S6vyR0vgVi9gqdt9a4XY,5460
+arthur_common/aggregations/functions/__init__.py,sha256=HqC3UNRURX7ZQHgamTrQvfA8u_FiZGZ4I4eQW7Ooe5o,1299
+arthur_common/aggregations/functions/agentic_aggregations.py,sha256=82OJ174uGcDqf7OLXY7dwnnv1g4kubkjazpc7Yj0xw0,39531
+arthur_common/aggregations/functions/categorical_count.py,sha256=jxV2w2Itmoh02VuazWN5z94PmQ-bRZjZpSoODGeBulQ,5099
+arthur_common/aggregations/functions/confusion_matrix.py,sha256=2fIqo50TcbUlGgPXxGtfFr6ehyZn69R8sphigGuMDgo,21626
+arthur_common/aggregations/functions/inference_count.py,sha256=Pxe5WT_Zgnn_wSDcm48l-flh-M5Zr72SbR4tQyNBk-o,3802
+arthur_common/aggregations/functions/inference_count_by_class.py,sha256=fmzrbRxiWgmutJYrBs7JY1iIRF7F6kozBzcsMypatlE,10896
+arthur_common/aggregations/functions/inference_null_count.py,sha256=X8mfeKb46VxUQFrjukSlVpM9AZCNvStsBHU3LsUbcEM,4591
+arthur_common/aggregations/functions/mean_absolute_error.py,sha256=P9H0rRvpObnWQiu4p7-yW6y6R7_-Ju23y2YlZQgxvHA,6352
+arthur_common/aggregations/functions/mean_squared_error.py,sha256=hZrHzfCscNnGKp_SqOeHEebzjMych1EXtnI1K70EYZE,6373
+arthur_common/aggregations/functions/multiclass_confusion_matrix.py,sha256=eA4y0xJikErkRww5OudUAMG9Y6cYztkO4w561nWVh5w,12195
+arthur_common/aggregations/functions/multiclass_inference_count_by_class.py,sha256=yiMpdz4VuX1ELprXYupFu4B9aDLIhgfEi3ma8jZsT_M,4261
+arthur_common/aggregations/functions/numeric_stats.py,sha256=28y0Zdhk3kLFiJYVWq_uev1C1yBZDn1aTUEdvLkqo3k,4660
+arthur_common/aggregations/functions/numeric_sum.py,sha256=TAeVVd5NqF7X9_hnMzbNVOVxdExcra4EZDkubtWHyAs,4780
+arthur_common/aggregations/functions/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+arthur_common/aggregations/functions/shield_aggregations.py,sha256=1Nc9kAePoaY9zoybDe5zKwCSu2lhpLV3fahoKC4ErjE,41083
+arthur_common/aggregations/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+arthur_common/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+arthur_common/config/config.py,sha256=fcpjOYjPKu4Duk63CuTHrOWKQKAlAhVUR60kF_2_Xog,1247
+arthur_common/config/settings.yaml,sha256=0CrygUwJzC5mGcO5Xnvv2ttp-P7LIsx682jllYA96NQ,161
+arthur_common/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+arthur_common/models/common_schemas.py,sha256=uGKU2139zyPSX2o7tmOn0ICKPvJ8SIDRYbIWDc2AyYc,6957
+arthur_common/models/connectors.py,sha256=gRdX4lNz0ObU64FqMmoffHVBwEgO3JfOf3wjn3tKv0Q,2264
+arthur_common/models/constants.py,sha256=munkU0LrLsDs9BtAfozzw30FCguIowmAUKg_9vqwX24,1049
+arthur_common/models/datasets.py,sha256=7p1tyJEPwXjBs2ZRoai8hTzNl6MK9jU1DluzASApE_4,254
+arthur_common/models/enums.py,sha256=zv8MpDq9whpxqUDPw_jygHnX53meFyiOB1ERS11AK_o,4237
+arthur_common/models/metric_schemas.py,sha256=Xf-1RTzg7iYtnBMLkUUUuMPzAujzzNvQx_pe-CksEdU,2484
+arthur_common/models/metrics.py,sha256=mCa0aN-nuNHYcqGfkyKFeriI0krz0-ScgmXWXHlKoEI,11109
+arthur_common/models/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+arthur_common/models/request_schemas.py,sha256=-5h6BEmoOFfiafeAMNBYxKEoGBmK3vomSNvg8bCZwuc,31327
+arthur_common/models/response_schemas.py,sha256=Hazx_cexqsJ6ehXvIn2EhvbODkYkJCg-ALDO9a6a8Qo,29641
+arthur_common/models/schema_definitions.py,sha256=FJptBo7WRyryJMysC9rPb1KKnEzk6Yy1TY0KoF8hC8Y,17054
+arthur_common/models/task_job_specs.py,sha256=p7jsSb97ylHYNkwoHXNOJvx2zcnh2kxLeh3m0pddo4M,3442
+arthur_common/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+arthur_common/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+arthur_common/tools/aggregation_analyzer.py,sha256=UfMtvFWXV2Dqly8S6nneGgomuvEGN-1tBz81tfkMcAE,11206
+arthur_common/tools/aggregation_loader.py,sha256=3CF46bNi-GdJBNOXkjYfCQ1Aung8lf65L532sdWmR_s,2351
+arthur_common/tools/duckdb_data_loader.py,sha256=A80wpATSc4VJLghoHwxpBEuUsxY93OZS0Qo4cFX7cRw,12462
+arthur_common/tools/duckdb_utils.py,sha256=PZ3AKoBUaU6papqNiNQ4Sm2ugg5bGyXfaC_1I-E2q3s,1142
+arthur_common/tools/functions.py,sha256=FWL4eWO5-vLp86WudT-MGUKvf2B8f02IdoXQFKd6d8k,1093
+arthur_common/tools/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+arthur_common/tools/schema_inferer.py,sha256=8ehIqAxuGlgM08RtwPB43a7TfenZyEIf1R0p1RYrkng,4920
+arthur_common/tools/time_utils.py,sha256=4gfiu9NXfvPZltiVNLSIQGylX6h2W0viNi9Kv4bKyfw,1410
+arthur_common-2.4.13.dist-info/METADATA,sha256=5fglOTXbNEtutVuyHLR0Kv5qCXNk3oJRls_-yTcfIv0,2153
+arthur_common-2.4.13.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+arthur_common-2.4.13.dist-info/RECORD,,