kumoai 2.10.0.dev202509231831__cp313-cp313-macosx_11_0_arm64.whl → 2.14.0.dev202512161731__cp313-cp313-macosx_11_0_arm64.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.

Potentially problematic release: this version of kumoai has been flagged as potentially problematic.

Files changed (53)
  1. kumoai/__init__.py +22 -11
  2. kumoai/_version.py +1 -1
  3. kumoai/client/client.py +17 -16
  4. kumoai/client/endpoints.py +1 -0
  5. kumoai/client/pquery.py +6 -2
  6. kumoai/client/rfm.py +37 -8
  7. kumoai/connector/utils.py +23 -2
  8. kumoai/experimental/rfm/__init__.py +164 -46
  9. kumoai/experimental/rfm/backend/__init__.py +0 -0
  10. kumoai/experimental/rfm/backend/local/__init__.py +42 -0
  11. kumoai/experimental/rfm/{local_graph_store.py → backend/local/graph_store.py} +49 -86
  12. kumoai/experimental/rfm/backend/local/sampler.py +315 -0
  13. kumoai/experimental/rfm/backend/local/table.py +119 -0
  14. kumoai/experimental/rfm/backend/snow/__init__.py +37 -0
  15. kumoai/experimental/rfm/backend/snow/sampler.py +274 -0
  16. kumoai/experimental/rfm/backend/snow/table.py +135 -0
  17. kumoai/experimental/rfm/backend/sqlite/__init__.py +32 -0
  18. kumoai/experimental/rfm/backend/sqlite/sampler.py +353 -0
  19. kumoai/experimental/rfm/backend/sqlite/table.py +126 -0
  20. kumoai/experimental/rfm/base/__init__.py +25 -0
  21. kumoai/experimental/rfm/base/column.py +66 -0
  22. kumoai/experimental/rfm/base/sampler.py +773 -0
  23. kumoai/experimental/rfm/base/source.py +19 -0
  24. kumoai/experimental/rfm/base/sql_sampler.py +60 -0
  25. kumoai/experimental/rfm/{local_table.py → base/table.py} +245 -156
  26. kumoai/experimental/rfm/{local_graph.py → graph.py} +425 -137
  27. kumoai/experimental/rfm/infer/__init__.py +6 -0
  28. kumoai/experimental/rfm/infer/dtype.py +79 -0
  29. kumoai/experimental/rfm/infer/pkey.py +126 -0
  30. kumoai/experimental/rfm/infer/time_col.py +62 -0
  31. kumoai/experimental/rfm/infer/timestamp.py +7 -4
  32. kumoai/experimental/rfm/pquery/__init__.py +4 -4
  33. kumoai/experimental/rfm/pquery/{backend.py → executor.py} +24 -58
  34. kumoai/experimental/rfm/pquery/{pandas_backend.py → pandas_executor.py} +278 -224
  35. kumoai/experimental/rfm/rfm.py +669 -246
  36. kumoai/experimental/rfm/sagemaker.py +138 -0
  37. kumoai/jobs.py +1 -0
  38. kumoai/pquery/predictive_query.py +10 -6
  39. kumoai/spcs.py +1 -3
  40. kumoai/testing/decorators.py +1 -1
  41. kumoai/testing/snow.py +50 -0
  42. kumoai/trainer/trainer.py +12 -10
  43. kumoai/utils/__init__.py +3 -2
  44. kumoai/utils/progress_logger.py +239 -4
  45. kumoai/utils/sql.py +3 -0
  46. {kumoai-2.10.0.dev202509231831.dist-info → kumoai-2.14.0.dev202512161731.dist-info}/METADATA +15 -5
  47. {kumoai-2.10.0.dev202509231831.dist-info → kumoai-2.14.0.dev202512161731.dist-info}/RECORD +50 -32
  48. kumoai/experimental/rfm/local_graph_sampler.py +0 -176
  49. kumoai/experimental/rfm/local_pquery_driver.py +0 -404
  50. kumoai/experimental/rfm/utils.py +0 -344
  51. {kumoai-2.10.0.dev202509231831.dist-info → kumoai-2.14.0.dev202512161731.dist-info}/WHEEL +0 -0
  52. {kumoai-2.10.0.dev202509231831.dist-info → kumoai-2.14.0.dev202512161731.dist-info}/licenses/LICENSE +0 -0
  53. {kumoai-2.10.0.dev202509231831.dist-info → kumoai-2.14.0.dev202512161731.dist-info}/top_level.txt +0 -0
kumoai/experimental/rfm/infer/__init__.py
@@ -1,9 +1,15 @@
+from .dtype import infer_dtype
+from .pkey import infer_primary_key
+from .time_col import infer_time_column
 from .id import contains_id
 from .timestamp import contains_timestamp
 from .categorical import contains_categorical
 from .multicategorical import contains_multicategorical
 
 __all__ = [
+    'infer_dtype',
+    'infer_primary_key',
+    'infer_time_column',
     'contains_id',
     'contains_timestamp',
     'contains_categorical',
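
With the re-exports above, the three new helpers sit next to the existing `contains_*` predicates; a minimal import sketch:

    from kumoai.experimental.rfm.infer import (
        infer_dtype,
        infer_primary_key,
        infer_time_column,
    )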
kumoai/experimental/rfm/infer/dtype.py
@@ -0,0 +1,79 @@
+from typing import Dict
+
+import numpy as np
+import pandas as pd
+import pyarrow as pa
+from kumoapi.typing import Dtype
+
+PANDAS_TO_DTYPE: Dict[str, Dtype] = {
+    'bool': Dtype.bool,
+    'boolean': Dtype.bool,
+    'int8': Dtype.int,
+    'int16': Dtype.int,
+    'int32': Dtype.int,
+    'int64': Dtype.int,
+    'float16': Dtype.float,
+    'float32': Dtype.float,
+    'float64': Dtype.float,
+    'object': Dtype.string,
+    'string': Dtype.string,
+    'string[python]': Dtype.string,
+    'string[pyarrow]': Dtype.string,
+    'binary': Dtype.binary,
+}
+
+
+def infer_dtype(ser: pd.Series) -> Dtype:
+    """Extracts the :class:`Dtype` from a :class:`pandas.Series`.
+
+    Args:
+        ser: A :class:`pandas.Series` to analyze.
+
+    Returns:
+        The data type.
+    """
+    if pd.api.types.is_datetime64_any_dtype(ser.dtype):
+        return Dtype.date
+    if pd.api.types.is_timedelta64_dtype(ser.dtype):
+        return Dtype.timedelta
+    if isinstance(ser.dtype, pd.CategoricalDtype):
+        return Dtype.string
+
+    if (pd.api.types.is_object_dtype(ser.dtype)
+            and not isinstance(ser.dtype, pd.ArrowDtype)):
+        index = ser.iloc[:1000].first_valid_index()
+        if index is not None and pd.api.types.is_list_like(ser[index]):
+            pos = ser.index.get_loc(index)
+            assert isinstance(pos, int)
+            ser = ser.iloc[pos:pos + 1000].dropna()
+            arr = pa.array(ser.tolist())
+            ser = pd.Series(arr, dtype=pd.ArrowDtype(arr.type))
+
+    if isinstance(ser.dtype, pd.ArrowDtype):
+        if pa.types.is_list(ser.dtype.pyarrow_dtype):
+            elem_dtype = ser.dtype.pyarrow_dtype.value_type
+            if pa.types.is_integer(elem_dtype):
+                return Dtype.intlist
+            if pa.types.is_floating(elem_dtype):
+                return Dtype.floatlist
+            if pa.types.is_decimal(elem_dtype):
+                return Dtype.floatlist
+            if pa.types.is_string(elem_dtype):
+                return Dtype.stringlist
+            if pa.types.is_null(elem_dtype):
+                return Dtype.floatlist
+
+    if isinstance(ser.dtype, np.dtype):
+        dtype_str = str(ser.dtype).lower()
+    elif isinstance(ser.dtype, pd.api.extensions.ExtensionDtype):
+        dtype_str = ser.dtype.name.lower()
+        dtype_str = dtype_str.split('[')[0]  # Remove backend metadata
+    elif isinstance(ser.dtype, pa.DataType):
+        dtype_str = str(ser.dtype).lower()
+    else:
+        dtype_str = 'object'
+
+    if dtype_str not in PANDAS_TO_DTYPE:
+        raise ValueError(f"Unsupported data type '{ser.dtype}'")
+
+    return PANDAS_TO_DTYPE[dtype_str]
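
A short usage sketch of the new `infer_dtype` helper on hand-built series (illustrative only; the expected results follow the mapping and branches above):

    import pandas as pd

    from kumoai.experimental.rfm.infer import infer_dtype

    infer_dtype(pd.Series([1, 2, 3]))                       # int64  -> Dtype.int
    infer_dtype(pd.Series(['a', 'b', None]))                # object -> Dtype.string
    infer_dtype(pd.to_datetime(pd.Series(['2024-01-01'])))  # -> Dtype.date

    # Object columns holding lists are converted to Arrow first so that the
    # element type can be inspected:
    infer_dtype(pd.Series([[1.0, 2.0], [3.0]]))             # -> Dtype.floatlist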
kumoai/experimental/rfm/infer/pkey.py
@@ -0,0 +1,126 @@
+import re
+import warnings
+from typing import Optional
+
+import pandas as pd
+
+
+def infer_primary_key(
+    table_name: str,
+    df: pd.DataFrame,
+    candidates: list[str],
+) -> Optional[str]:
+    r"""Auto-detect potential primary key column.
+
+    Args:
+        table_name: The table name.
+        df: The pandas DataFrame to analyze.
+        candidates: A list of potential candidates.
+
+    Returns:
+        The name of the detected primary key, or ``None`` if not found.
+    """
+    # A list of (potentially modified) table names that are eligible to match
+    # with a primary key, i.e.:
+    # - UserInfo -> User
+    # - snakecase <-> camelcase
+    # - camelcase <-> snakecase
+    # - plural <-> singular (users -> user, eligibilities -> eligibility)
+    # - verb -> noun (qualifying -> qualify)
+    _table_names = {table_name}
+    if table_name.lower().endswith('_info'):
+        _table_names.add(table_name[:-5])
+    elif table_name.lower().endswith('info'):
+        _table_names.add(table_name[:-4])
+
+    table_names = set()
+    for _table_name in _table_names:
+        table_names.add(_table_name.lower())
+        snakecase = re.sub(r'(.)([A-Z][a-z]+)', r'\1_\2', _table_name)
+        snakecase = re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', snakecase)
+        table_names.add(snakecase.lower())
+        camelcase = _table_name.replace('_', '')
+        table_names.add(camelcase.lower())
+        if _table_name.lower().endswith('s'):
+            table_names.add(_table_name.lower()[:-1])
+            table_names.add(snakecase.lower()[:-1])
+            table_names.add(camelcase.lower()[:-1])
+        else:
+            table_names.add(_table_name.lower() + 's')
+            table_names.add(snakecase.lower() + 's')
+            table_names.add(camelcase.lower() + 's')
+        if _table_name.lower().endswith('ies'):
+            table_names.add(_table_name.lower()[:-3] + 'y')
+            table_names.add(snakecase.lower()[:-3] + 'y')
+            table_names.add(camelcase.lower()[:-3] + 'y')
+        elif _table_name.lower().endswith('y'):
+            table_names.add(_table_name.lower()[:-1] + 'ies')
+            table_names.add(snakecase.lower()[:-1] + 'ies')
+            table_names.add(camelcase.lower()[:-1] + 'ies')
+        if _table_name.lower().endswith('ing'):
+            table_names.add(_table_name.lower()[:-3])
+            table_names.add(snakecase.lower()[:-3])
+            table_names.add(camelcase.lower()[:-3])
+
+    scores: list[tuple[str, float]] = []
+    for col_name in candidates:
+        col_name_lower = col_name.lower()
+
+        score = 0
+
+        if col_name_lower == 'id':
+            score += 4
+
+        for table_name_lower in table_names:
+
+            if col_name_lower == table_name_lower:
+                score += 4  # USER -> USER
+                break
+
+            for suffix in ['id', 'hash', 'key', 'code', 'uuid']:
+                if not col_name_lower.endswith(suffix):
+                    continue
+
+                if col_name_lower == f'{table_name_lower}_{suffix}':
+                    score += 5  # USER -> USER_ID
+                    break
+
+                if col_name_lower == f'{table_name_lower}{suffix}':
+                    score += 5  # User -> UserId
+                    break
+
+                if col_name_lower.endswith(f'{table_name_lower}_{suffix}'):
+                    score += 2
+
+                if col_name_lower.endswith(f'{table_name_lower}{suffix}'):
+                    score += 2
+
+        # `rel-bench` hard-coding :(
+        if table_name == 'studies' and col_name == 'nct_id':
+            score += 1
+
+        ser = df[col_name].iloc[:1_000_000]
+        score += 3 * (ser.nunique() / len(ser))
+
+        scores.append((col_name, score))
+
+    scores = [x for x in scores if x[-1] >= 4]
+    scores.sort(key=lambda x: x[-1], reverse=True)
+
+    if len(scores) == 0:
+        return None
+
+    if len(scores) == 1:
+        return scores[0][0]
+
+    # In case of multiple candidates, only return one if its score is unique:
+    if scores[0][1] != scores[1][1]:
+        return scores[0][0]
+
+    max_score = max(scores, key=lambda x: x[1])[1]
+    candidates = [col_name for col_name, score in scores if score == max_score]
+    warnings.warn(f"Found multiple potential primary keys in table "
+                  f"'{table_name}': {candidates}. Please specify the primary "
+                  f"key for this table manually.")
+
+    return None
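
A toy example of the scoring against a hypothetical table, derivable from the rules above:

    import pandas as pd

    from kumoai.experimental.rfm.infer import infer_primary_key

    df = pd.DataFrame({
        'user_id': [1, 2, 3],
        'country': ['US', 'DE', 'US'],
    })
    # 'user_id' matches the singular table name plus an 'id' suffix (+5) and
    # is fully unique (+3), clearing the threshold of 4; 'country' does not.
    infer_primary_key('users', df, candidates=['user_id', 'country'])
    # -> 'user_id'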
kumoai/experimental/rfm/infer/time_col.py
@@ -0,0 +1,62 @@
+import re
+import warnings
+from typing import Optional
+
+import pandas as pd
+
+
+def infer_time_column(
+    df: pd.DataFrame,
+    candidates: list[str],
+) -> Optional[str]:
+    r"""Auto-detect potential time column.
+
+    Args:
+        df: The pandas DataFrame to analyze.
+        candidates: A list of potential candidates.
+
+    Returns:
+        The name of the detected time column, or ``None`` if not found.
+    """
+    candidates = [  # Exclude all candidates with `*last*` in column names:
+        col_name for col_name in candidates
+        if not re.search(r'(^|_)last(_|$)', col_name, re.IGNORECASE)
+    ]
+
+    if len(candidates) == 0:
+        return None
+
+    if len(candidates) == 1:
+        return candidates[0]
+
+    # If there exists a dedicated `create*` column, use it as time column:
+    create_candidates = [
+        candidate for candidate in candidates
+        if candidate.lower().startswith('create')
+    ]
+    if len(create_candidates) == 1:
+        return create_candidates[0]
+    if len(create_candidates) > 1:
+        candidates = create_candidates
+
+    # Find the most optimal time column. Usually, it is the one pointing to
+    # the oldest timestamps:
+    with warnings.catch_warnings():
+        warnings.filterwarnings('ignore', message='Could not infer format')
+        min_timestamp_dict = {
+            key: pd.to_datetime(df[key].iloc[:10_000], errors='coerce')
+            for key in candidates
+        }
+    min_timestamp_dict = {
+        key: value.min().tz_localize(None)
+        for key, value in min_timestamp_dict.items()
+    }
+    min_timestamp_dict = {
+        key: value
+        for key, value in min_timestamp_dict.items() if not pd.isna(value)
+    }
+
+    if len(min_timestamp_dict) == 0:
+        return None
+
+    return min(min_timestamp_dict, key=min_timestamp_dict.get)  # type: ignore
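
A sketch of the selection order on a hypothetical table:

    import pandas as pd

    from kumoai.experimental.rfm.infer import infer_time_column

    df = pd.DataFrame({
        'created_at': ['2020-01-01', '2024-05-02'],
        'updated_at': ['2023-03-01', '2024-05-02'],
        'last_login': ['2024-05-01', '2024-05-02'],
    })
    # 'last_login' is excluded by the `*last*` filter, and 'created_at' is
    # the only remaining `create*` column, so it wins outright:
    infer_time_column(df, candidates=list(df.columns))
    # -> 'created_at'

With multiple (or no) `create*` candidates, the fallback compares sampled minimum timestamps and returns the column holding the oldest one.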
kumoai/experimental/rfm/infer/timestamp.py
@@ -2,6 +2,7 @@ import re
 import warnings
 
 import pandas as pd
+from dateutil.parser import UnknownTimezoneWarning
 from kumoapi.typing import Dtype, Stype
 
 
@@ -20,9 +21,7 @@ def contains_timestamp(ser: pd.Series, column_name: str, dtype: Dtype) -> bool:
         column_name,
         re.IGNORECASE,
     )
-
-    if match is not None:
-        return True
+    score = 0.3 if match is not None else 0.0
 
     ser = ser.iloc[:100]
     ser = ser.dropna()
@@ -34,5 +33,9 @@
     ser = ser.astype(str)  # Avoid parsing numbers as unix timestamps.
 
     with warnings.catch_warnings():
+        warnings.simplefilter('ignore', UnknownTimezoneWarning)
         warnings.filterwarnings('ignore', message='Could not infer format')
-        return pd.to_datetime(ser, errors='coerce').notna().all()
+        mask = pd.to_datetime(ser, errors='coerce').notna()
+        score += int(mask.sum()) / len(mask)
+
+    return score >= 1.0
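
In effect, a name match is now a soft signal worth 0.3 rather than an immediate `True`: a column whose name matches the timestamp pattern passes if at least 70% of its sampled values parse as datetimes (0.3 + 0.7 >= 1.0), while a column without a matching name must parse completely (0.0 + 1.0 >= 1.0).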
kumoai/experimental/rfm/pquery/__init__.py
@@ -1,7 +1,7 @@
-from .backend import PQueryBackend
-from .pandas_backend import PQueryPandasBackend
+from .executor import PQueryExecutor
+from .pandas_executor import PQueryPandasExecutor
 
 __all__ = [
-    'PQueryBackend',
-    'PQueryPandasBackend',
+    'PQueryExecutor',
+    'PQueryPandasExecutor',
 ]
kumoai/experimental/rfm/pquery/{backend.py → executor.py}
@@ -1,23 +1,14 @@
 from abc import ABC, abstractmethod
-from typing import Dict, Generic, Optional, Tuple, TypeVar, Union
+from typing import Dict, Generic, Tuple, TypeVar
 
-from kumoapi.rfm import PQueryDefinition
-from kumoapi.rfm.pquery import (
+from kumoapi.pquery import ValidatedPredictiveQuery
+from kumoapi.pquery.AST import (
     Aggregation,
-    AggregationType,
-    BoolOp,
     Column,
     Condition,
     Filter,
-    Float,
-    FloatList,
-    Int,
-    IntList,
+    Join,
     LogicalOperation,
-    MemberOp,
-    RelOp,
-    Str,
-    StrList,
 )
 
 TableData = TypeVar('TableData')
@@ -25,58 +16,33 @@ ColumnData = TypeVar('ColumnData')
 IndexData = TypeVar('IndexData')
 
 
-class PQueryBackend(Generic[TableData, ColumnData, IndexData], ABC):
+class PQueryExecutor(Generic[TableData, ColumnData, IndexData], ABC):
     @abstractmethod
-    def eval_aggregation_type(
+    def execute_column(
         self,
-        op: AggregationType,
-        feat: Optional[ColumnData],
-        batch: IndexData,
-        batch_size: int,
+        column: Column,
+        feat_dict: Dict[str, TableData],
         filter_na: bool = True,
     ) -> Tuple[ColumnData, IndexData]:
        pass
 
     @abstractmethod
-    def eval_rel_op(
-        self,
-        left: ColumnData,
-        op: RelOp,
-        right: Union[Int, Float, Str, None],
-    ) -> ColumnData:
-        pass
-
-    @abstractmethod
-    def eval_member_op(
-        self,
-        left: ColumnData,
-        op: MemberOp,
-        right: Union[IntList, FloatList, StrList],
-    ) -> ColumnData:
-        pass
-
-    @abstractmethod
-    def eval_bool_op(
-        self,
-        left: ColumnData,
-        op: BoolOp,
-        right: Optional[ColumnData],
-    ) -> ColumnData:
-        pass
-
-    @abstractmethod
-    def eval_column(
+    def execute_aggregation(
         self,
-        column: Column,
+        aggr: Aggregation,
         feat_dict: Dict[str, TableData],
+        time_dict: Dict[str, ColumnData],
+        batch_dict: Dict[str, IndexData],
+        anchor_time: ColumnData,
         filter_na: bool = True,
+        num_forecasts: int = 1,
    ) -> Tuple[ColumnData, IndexData]:
         pass
 
     @abstractmethod
-    def eval_aggregation(
+    def execute_condition(
         self,
-        aggr: Aggregation,
+        condition: Condition,
         feat_dict: Dict[str, TableData],
         time_dict: Dict[str, ColumnData],
         batch_dict: Dict[str, IndexData],
@@ -87,9 +53,9 @@ class PQueryBackend(Generic[TableData, ColumnData, IndexData], ABC):
         pass
 
     @abstractmethod
-    def eval_condition(
+    def execute_logical_operation(
         self,
-        condition: Condition,
+        logical_operation: LogicalOperation,
         feat_dict: Dict[str, TableData],
         time_dict: Dict[str, ColumnData],
         batch_dict: Dict[str, IndexData],
@@ -100,9 +66,9 @@ class PQueryBackend(Generic[TableData, ColumnData, IndexData], ABC):
         pass
 
     @abstractmethod
-    def eval_logical_operation(
+    def execute_join(
         self,
-        logical_operation: LogicalOperation,
+        join: Join,
         feat_dict: Dict[str, TableData],
         time_dict: Dict[str, ColumnData],
         batch_dict: Dict[str, IndexData],
@@ -113,20 +79,20 @@ class PQueryBackend(Generic[TableData, ColumnData, IndexData], ABC):
         pass
 
     @abstractmethod
-    def eval_filter(
+    def execute_filter(
         self,
         filter: Filter,
         feat_dict: Dict[str, TableData],
         time_dict: Dict[str, ColumnData],
         batch_dict: Dict[str, IndexData],
         anchor_time: ColumnData,
-    ) -> IndexData:
+    ) -> Tuple[ColumnData, IndexData]:
         pass
 
     @abstractmethod
-    def eval_pquery(
+    def execute(
         self,
-        query: PQueryDefinition,
+        query: ValidatedPredictiveQuery,
         feat_dict: Dict[str, TableData],
         time_dict: Dict[str, ColumnData],
         batch_dict: Dict[str, IndexData],
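
The interface rename in sketch form: a concrete executor subclasses `PQueryExecutor` and implements the `execute_*` family (previously `eval_*`; the per-operator hooks `eval_rel_op`, `eval_member_op`, and `eval_bool_op` are gone, and `execute_join` is new). A minimal stub, assuming pandas containers for the three type parameters; the full signatures are truncated at the hunk boundaries above, so `*args, **kwargs` stands in for them:

    import pandas as pd

    from kumoai.experimental.rfm.pquery import PQueryExecutor

    class StubExecutor(PQueryExecutor[pd.DataFrame, pd.Series, pd.Index]):
        """Illustrative skeleton only: satisfies the abstract interface
        but performs no work."""
        def execute_column(self, *args, **kwargs):
            raise NotImplementedError

        def execute_aggregation(self, *args, **kwargs):
            raise NotImplementedError

        def execute_condition(self, *args, **kwargs):
            raise NotImplementedError

        def execute_logical_operation(self, *args, **kwargs):
            raise NotImplementedError

        def execute_join(self, *args, **kwargs):
            raise NotImplementedError

        def execute_filter(self, *args, **kwargs):
            raise NotImplementedError

        def execute(self, *args, **kwargs):
            raise NotImplementedError

`PQueryPandasExecutor` in `pandas_executor.py` is the shipped implementation of this interface.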