kumoai 2.13.0.dev202512040649__cp313-cp313-win_amd64.whl → 2.14.0.dev202601081732__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. kumoai/__init__.py +35 -26
  2. kumoai/_version.py +1 -1
  3. kumoai/client/client.py +6 -0
  4. kumoai/client/jobs.py +26 -0
  5. kumoai/client/pquery.py +6 -2
  6. kumoai/connector/utils.py +21 -7
  7. kumoai/experimental/rfm/__init__.py +51 -24
  8. kumoai/experimental/rfm/authenticate.py +3 -4
  9. kumoai/experimental/rfm/backend/local/__init__.py +4 -0
  10. kumoai/experimental/rfm/{local_graph_store.py → backend/local/graph_store.py} +62 -110
  11. kumoai/experimental/rfm/backend/local/sampler.py +312 -0
  12. kumoai/experimental/rfm/backend/local/table.py +35 -31
  13. kumoai/experimental/rfm/backend/snow/__init__.py +2 -0
  14. kumoai/experimental/rfm/backend/snow/sampler.py +366 -0
  15. kumoai/experimental/rfm/backend/snow/table.py +177 -50
  16. kumoai/experimental/rfm/backend/sqlite/__init__.py +4 -2
  17. kumoai/experimental/rfm/backend/sqlite/sampler.py +454 -0
  18. kumoai/experimental/rfm/backend/sqlite/table.py +131 -48
  19. kumoai/experimental/rfm/base/__init__.py +23 -3
  20. kumoai/experimental/rfm/base/column.py +96 -10
  21. kumoai/experimental/rfm/base/expression.py +44 -0
  22. kumoai/experimental/rfm/base/sampler.py +782 -0
  23. kumoai/experimental/rfm/base/source.py +2 -1
  24. kumoai/experimental/rfm/base/sql_sampler.py +247 -0
  25. kumoai/experimental/rfm/base/table.py +404 -203
  26. kumoai/experimental/rfm/graph.py +374 -172
  27. kumoai/experimental/rfm/infer/__init__.py +6 -4
  28. kumoai/experimental/rfm/infer/dtype.py +7 -4
  29. kumoai/experimental/rfm/infer/multicategorical.py +1 -1
  30. kumoai/experimental/rfm/infer/pkey.py +4 -2
  31. kumoai/experimental/rfm/infer/stype.py +35 -0
  32. kumoai/experimental/rfm/infer/time_col.py +1 -2
  33. kumoai/experimental/rfm/pquery/executor.py +27 -27
  34. kumoai/experimental/rfm/pquery/pandas_executor.py +30 -32
  35. kumoai/experimental/rfm/relbench.py +76 -0
  36. kumoai/experimental/rfm/rfm.py +762 -467
  37. kumoai/experimental/rfm/sagemaker.py +4 -4
  38. kumoai/experimental/rfm/task_table.py +292 -0
  39. kumoai/kumolib.cp313-win_amd64.pyd +0 -0
  40. kumoai/pquery/predictive_query.py +10 -6
  41. kumoai/pquery/training_table.py +16 -2
  42. kumoai/testing/snow.py +50 -0
  43. kumoai/trainer/distilled_trainer.py +175 -0
  44. kumoai/utils/__init__.py +3 -2
  45. kumoai/utils/display.py +87 -0
  46. kumoai/utils/progress_logger.py +190 -12
  47. kumoai/utils/sql.py +3 -0
  48. {kumoai-2.13.0.dev202512040649.dist-info → kumoai-2.14.0.dev202601081732.dist-info}/METADATA +3 -2
  49. {kumoai-2.13.0.dev202512040649.dist-info → kumoai-2.14.0.dev202601081732.dist-info}/RECORD +52 -41
  50. kumoai/experimental/rfm/local_graph_sampler.py +0 -223
  51. kumoai/experimental/rfm/local_pquery_driver.py +0 -689
  52. {kumoai-2.13.0.dev202512040649.dist-info → kumoai-2.14.0.dev202601081732.dist-info}/WHEEL +0 -0
  53. {kumoai-2.13.0.dev202512040649.dist-info → kumoai-2.14.0.dev202601081732.dist-info}/licenses/LICENSE +0 -0
  54. {kumoai-2.13.0.dev202512040649.dist-info → kumoai-2.14.0.dev202601081732.dist-info}/top_level.txt +0 -0
@@ -1,13 +1,22 @@
1
1
  import re
2
- import warnings
3
- from typing import List, Optional, Sequence
2
+ from collections import Counter
3
+ from collections.abc import Sequence
4
+ from typing import cast
4
5
 
5
6
  import pandas as pd
7
+ from kumoapi.model_plan import MissingType
6
8
  from kumoapi.typing import Dtype
7
9
 
8
10
  from kumoai.experimental.rfm.backend.sqlite import Connection
9
- from kumoai.experimental.rfm.base import SourceColumn, SourceForeignKey, Table
10
- from kumoai.experimental.rfm.infer import infer_dtype
11
+ from kumoai.experimental.rfm.base import (
12
+ ColumnSpec,
13
+ ColumnSpecType,
14
+ DataBackend,
15
+ SourceColumn,
16
+ SourceForeignKey,
17
+ Table,
18
+ )
19
+ from kumoai.utils import quote_ident
11
20
 
12
21
 
13
22
  class SQLiteTable(Table):
@@ -16,6 +25,8 @@ class SQLiteTable(Table):
16
25
  Args:
17
26
  connection: The connection to a :class:`sqlite` database.
18
27
  name: The name of this table.
28
+ source_name: The source name of this table. If set to ``None``,
29
+ ``name`` is used.
19
30
  columns: The selected columns of this table.
20
31
  primary_key: The name of the primary key of this table, if it exists.
21
32
  time_column: The name of the time column of this table, if it exists.
@@ -26,76 +37,148 @@ class SQLiteTable(Table):
26
37
  self,
27
38
  connection: Connection,
28
39
  name: str,
29
- columns: Optional[Sequence[str]] = None,
30
- primary_key: Optional[str] = None,
31
- time_column: Optional[str] = None,
32
- end_time_column: Optional[str] = None,
40
+ source_name: str | None = None,
41
+ columns: Sequence[ColumnSpecType] | None = None,
42
+ primary_key: MissingType | str | None = MissingType.VALUE,
43
+ time_column: str | None = None,
44
+ end_time_column: str | None = None,
33
45
  ) -> None:
34
46
 
35
47
  self._connection = connection
36
48
 
37
49
  super().__init__(
38
50
  name=name,
51
+ source_name=source_name,
39
52
  columns=columns,
40
53
  primary_key=primary_key,
41
54
  time_column=time_column,
42
55
  end_time_column=end_time_column,
43
56
  )
44
57
 
45
- def _get_source_columns(self) -> List[SourceColumn]:
46
- source_columns: List[SourceColumn] = []
58
+ @property
59
+ def backend(self) -> DataBackend:
60
+ return cast(DataBackend, DataBackend.SQLITE)
61
+
62
+ def _get_source_columns(self) -> list[SourceColumn]:
63
+ source_columns: list[SourceColumn] = []
47
64
  with self._connection.cursor() as cursor:
48
- cursor.execute(f"PRAGMA table_info({self.name})")
49
- rows = cursor.fetchall()
65
+ sql = f"PRAGMA table_info({self._quoted_source_name})"
66
+ cursor.execute(sql)
67
+ columns = cursor.fetchall()
68
+
69
+ if len(columns) == 0:
70
+ raise ValueError(f"Table '{self.source_name}' does not exist "
71
+ f"in the SQLite database")
72
+
73
+ unique_keys: set[str] = set()
74
+ sql = f"PRAGMA index_list({self._quoted_source_name})"
75
+ cursor.execute(sql)
76
+ for _, index_name, is_unique, *_ in cursor.fetchall():
77
+ if bool(is_unique):
78
+ sql = f"PRAGMA index_info({quote_ident(index_name)})"
79
+ cursor.execute(sql)
80
+ index = cursor.fetchall()
81
+ if len(index) == 1:
82
+ unique_keys.add(index[0][2])
50
83
 
51
- if len(rows) == 0:
52
- raise ValueError(f"Table '{self.name}' does not exist")
53
-
54
- for _, column, type, _, _, is_pkey in rows:
55
- # Determine column affinity:
56
- type = type.strip().upper()
57
- if re.search('INT', type):
58
- dtype = Dtype.int
59
- elif re.search('TEXT|CHAR|CLOB', type):
60
- dtype = Dtype.string
61
- elif re.search('REAL|FLOA|DOUB', type):
62
- dtype = Dtype.float
63
- else: # NUMERIC affinity.
64
- ser = self._sample_df[column]
65
- try:
66
- dtype = infer_dtype(ser)
67
- except Exception:
68
- warnings.warn(
69
- f"Data type inference for column '{column}' in "
70
- f"table '{self.name}' failed. Consider changing "
71
- f"the data type of the column to use it within "
72
- f"this table.")
73
- continue
84
+ # Special SQLite case that creates a rowid alias for
85
+ # `INTEGER PRIMARY KEY` annotated columns:
86
+ rowid_candidates = [
87
+ column for _, column, dtype, _, _, is_pkey in columns
88
+ if bool(is_pkey) and dtype.strip().upper() == 'INTEGER'
89
+ ]
90
+ if len(rowid_candidates) == 1:
91
+ unique_keys.add(rowid_candidates[0])
74
92
 
93
+ for _, column, dtype, notnull, _, is_pkey in columns:
75
94
  source_column = SourceColumn(
76
95
  name=column,
77
- dtype=dtype,
96
+ dtype=self._to_dtype(dtype),
78
97
  is_primary_key=bool(is_pkey),
79
- is_unique_key=False,
98
+ is_unique_key=column in unique_keys,
99
+ is_nullable=not bool(is_pkey) and not bool(notnull),
80
100
  )
81
101
  source_columns.append(source_column)
82
102
 
83
103
  return source_columns
84
104
 
85
- def _get_source_foreign_keys(self) -> List[SourceForeignKey]:
86
- source_fkeys: List[SourceForeignKey] = []
105
+ def _get_source_foreign_keys(self) -> list[SourceForeignKey]:
106
+ source_foreign_keys: list[SourceForeignKey] = []
87
107
  with self._connection.cursor() as cursor:
88
- cursor.execute(f"PRAGMA foreign_key_list({self.name})")
89
- for _, _, dst_table, fkey, pkey, _, _, _ in cursor.fetchall():
90
- source_fkeys.append(SourceForeignKey(fkey, dst_table, pkey))
91
- return source_fkeys
108
+ sql = f"PRAGMA foreign_key_list({self._quoted_source_name})"
109
+ cursor.execute(sql)
110
+ rows = cursor.fetchall()
111
+ counts = Counter(row[0] for row in rows)
112
+ for idx, _, dst_table, foreign_key, primary_key, *_ in rows:
113
+ if counts[idx] == 1:
114
+ source_foreign_key = SourceForeignKey(
115
+ name=foreign_key,
116
+ dst_table=dst_table,
117
+ primary_key=primary_key,
118
+ )
119
+ source_foreign_keys.append(source_foreign_key)
120
+ return source_foreign_keys
92
121
 
93
- def _get_sample_df(self) -> pd.DataFrame:
122
+ def _get_source_sample_df(self) -> pd.DataFrame:
94
123
  with self._connection.cursor() as cursor:
95
- cursor.execute(f"SELECT * FROM {self.name} "
96
- f"ORDER BY rowid LIMIT 1000")
124
+ columns = [quote_ident(col) for col in self._source_column_dict]
125
+ sql = (f"SELECT {', '.join(columns)} "
126
+ f"FROM {self._quoted_source_name} "
127
+ f"ORDER BY rowid "
128
+ f"LIMIT {self._NUM_SAMPLE_ROWS}")
129
+ cursor.execute(sql)
97
130
  table = cursor.fetch_arrow_table()
98
- return table.to_pandas(types_mapper=pd.ArrowDtype)
99
131
 
100
- def _get_num_rows(self) -> Optional[int]:
132
+ if len(table) == 0:
133
+ raise RuntimeError(f"Table '{self.source_name}' is empty")
134
+
135
+ return self._sanitize(
136
+ df=table.to_pandas(types_mapper=pd.ArrowDtype),
137
+ dtype_dict={
138
+ column.name: column.dtype
139
+ for column in self._source_column_dict.values()
140
+ },
141
+ stype_dict=None,
142
+ )
143
+
144
+ def _get_num_rows(self) -> int | None:
101
145
  return None
146
+
147
+ def _get_expr_sample_df(
148
+ self,
149
+ columns: Sequence[ColumnSpec],
150
+ ) -> pd.DataFrame:
151
+ with self._connection.cursor() as cursor:
152
+ projections = [
153
+ f"{column.expr} AS {quote_ident(column.name)}"
154
+ for column in columns
155
+ ]
156
+ sql = (f"SELECT {', '.join(projections)} "
157
+ f"FROM {self._quoted_source_name} "
158
+ f"ORDER BY rowid "
159
+ f"LIMIT {self._NUM_SAMPLE_ROWS}")
160
+ cursor.execute(sql)
161
+ table = cursor.fetch_arrow_table()
162
+
163
+ if len(table) == 0:
164
+ raise RuntimeError(f"Table '{self.source_name}' is empty")
165
+
166
+ return self._sanitize(
167
+ df=table.to_pandas(types_mapper=pd.ArrowDtype),
168
+ dtype_dict={column.name: column.dtype
169
+ for column in columns},
170
+ stype_dict=None,
171
+ )
172
+
173
+ @staticmethod
174
+ def _to_dtype(dtype: str | None) -> Dtype | None:
175
+ if dtype is None:
176
+ return None
177
+ dtype = dtype.strip().upper()
178
+ if re.search('INT', dtype):
179
+ return Dtype.int
180
+ if re.search('TEXT|CHAR|CLOB', dtype):
181
+ return Dtype.string
182
+ if re.search('REAL|FLOA|DOUB', dtype):
183
+ return Dtype.float
184
+ return None # NUMERIC affinity.
@@ -1,10 +1,30 @@
1
- from .source import SourceColumn, SourceForeignKey
2
- from .column import Column
3
- from .table import Table
1
+ from kumoapi.common import StrEnum
2
+
3
+
4
+ class DataBackend(StrEnum):
5
+ LOCAL = 'local'
6
+ SQLITE = 'sqlite'
7
+ SNOWFLAKE = 'snowflake'
8
+
9
+
10
+ from .source import SourceColumn, SourceForeignKey # noqa: E402
11
+ from .expression import Expression, LocalExpression # noqa: E402
12
+ from .column import ColumnSpec, ColumnSpecType, Column # noqa: E402
13
+ from .table import Table # noqa: E402
14
+ from .sampler import SamplerOutput, Sampler # noqa: E402
15
+ from .sql_sampler import SQLSampler # noqa: E402
4
16
 
5
17
  __all__ = [
18
+ 'DataBackend',
6
19
  'SourceColumn',
7
20
  'SourceForeignKey',
21
+ 'Expression',
22
+ 'LocalExpression',
23
+ 'ColumnSpec',
24
+ 'ColumnSpecType',
8
25
  'Column',
9
26
  'Table',
27
+ 'SamplerOutput',
28
+ 'Sampler',
29
+ 'SQLSampler',
10
30
  ]
@@ -1,37 +1,119 @@
1
+ from __future__ import annotations
2
+
1
3
  from dataclasses import dataclass
2
- from typing import Any
4
+ from typing import Any, Mapping, TypeAlias
3
5
 
4
6
  from kumoapi.typing import Dtype, Stype
7
+ from typing_extensions import Self
8
+
9
+ from kumoai.experimental.rfm.base import Expression
10
+ from kumoai.mixin import CastMixin
11
+
12
+
13
+ @dataclass(init=False)
14
+ class ColumnSpec(CastMixin):
15
+ r"""A column specification for adding a column to a table.
16
+
17
+ A column specification can either refer to a physical column present in
18
+ the data source, or be defined logically via an expression.
19
+
20
+ Args:
21
+ name: The name of the column.
22
+ expr: A column expression to define logical columns.
23
+ dtype: The data type of the column.
24
+ """
25
+ def __init__(
26
+ self,
27
+ name: str,
28
+ expr: Expression | Mapping[str, str] | str | None = None,
29
+ dtype: Dtype | str | None = None,
30
+ stype: Stype | str | None = None,
31
+ ) -> None:
32
+
33
+ self.name = name
34
+ self.expr = Expression.coerce(expr)
35
+ self.dtype = Dtype(dtype) if dtype is not None else None
36
+ self.stype = Stype(dtype) if stype is not None else None
37
+
38
+ @classmethod
39
+ def coerce(cls, spec: ColumnSpec | Mapping[str, Any] | str) -> Self:
40
+ r"""Coerces a column specification into a :class:`ColumnSpec`."""
41
+ if isinstance(spec, cls):
42
+ return spec
43
+ if isinstance(spec, str):
44
+ return cls(name=spec)
45
+ if isinstance(spec, Mapping):
46
+ try:
47
+ return cls(**spec)
48
+ except TypeError:
49
+ pass
50
+ raise TypeError(f"Unable to coerce 'ColumnSpec' from '{spec}'")
51
+
52
+ @property
53
+ def is_source(self) -> bool:
54
+ r"""Whether the column specification refers to a physical column
55
+ present in the data source.
56
+ """
57
+ return self.expr is None
58
+
59
+
60
+ ColumnSpecType: TypeAlias = ColumnSpec | Mapping[str, Any] | str
5
61
 
6
62
 
7
63
  @dataclass(init=False, repr=False, eq=False)
8
64
  class Column:
65
+ r"""Column-level metadata information.
66
+
67
+ A column can either refer to a physical column present in the data source,
68
+ or be defined logically via an expression.
69
+
70
+ Args:
71
+ name: The name of the column.
72
+ expr: A column expression to define logical columns.
73
+ dtype: The data type of the column.
74
+ stype: The semantic type of the column.
75
+ """
9
76
  stype: Stype
10
77
 
11
78
  def __init__(
12
79
  self,
13
80
  name: str,
81
+ expr: Expression | None,
14
82
  dtype: Dtype,
15
83
  stype: Stype,
16
- is_primary_key: bool = False,
17
- is_time_column: bool = False,
18
- is_end_time_column: bool = False,
19
84
  ) -> None:
20
85
  self._name = name
86
+ self._expr = expr
21
87
  self._dtype = Dtype(dtype)
22
- self._is_primary_key = is_primary_key
23
- self._is_time_column = is_time_column
24
- self._is_end_time_column = is_end_time_column
88
+
89
+ self._is_primary_key = False
90
+ self._is_time_column = False
91
+ self._is_end_time_column = False
92
+
25
93
  self.stype = Stype(stype)
26
94
 
27
95
  @property
28
96
  def name(self) -> str:
97
+ r"""The name of the column."""
29
98
  return self._name
30
99
 
100
+ @property
101
+ def expr(self) -> Expression | None:
102
+ r"""The expression of the column (if logically defined)."""
103
+ return self._expr
104
+
31
105
  @property
32
106
  def dtype(self) -> Dtype:
107
+ r"""The data type of the column."""
33
108
  return self._dtype
34
109
 
110
+ @property
111
+ def is_source(self) -> bool:
112
+ r"""Whether the column refers to a physical column present in the data
113
+ source.
114
+ """
115
+ return self.expr is None
116
+
35
117
  def __setattr__(self, key: str, val: Any) -> None:
36
118
  if key == 'stype':
37
119
  if isinstance(val, str):
@@ -54,7 +136,7 @@ class Column:
54
136
  super().__setattr__(key, val)
55
137
 
56
138
  def __hash__(self) -> int:
57
- return hash((self.name, self.stype, self.dtype))
139
+ return hash((self.name, self.expr, self.dtype, self.stype))
58
140
 
59
141
  def __eq__(self, other: Any) -> bool:
60
142
  if not isinstance(other, Column):
@@ -62,5 +144,9 @@ class Column:
62
144
  return hash(self) == hash(other)
63
145
 
64
146
  def __repr__(self) -> str:
65
- return (f'{self.__class__.__name__}(name={self.name}, '
66
- f'stype={self.stype}, dtype={self.dtype})')
147
+ parts = [f'name={self.name}']
148
+ if self.expr is not None:
149
+ parts.append(f'expr={self.expr}')
150
+ parts.append(f'dtype={self.dtype}')
151
+ parts.append(f'stype={self.stype}')
152
+ return f"{self.__class__.__name__}({', '.join(parts)})"
@@ -0,0 +1,44 @@
1
+ from __future__ import annotations
2
+
3
+ from abc import ABC
4
+ from dataclasses import dataclass
5
+ from typing import Mapping
6
+
7
+
8
+ class Expression(ABC):
9
+ """A base expression to define logical columns."""
10
+ @classmethod
11
+ def coerce(
12
+ cls,
13
+ spec: Expression | Mapping[str, str] | str | None,
14
+ ) -> Expression | None:
15
+ r"""Coerces an expression specification into an :class:`Expression`, if
16
+ possible.
17
+ """
18
+ if spec is None:
19
+ return None
20
+ if isinstance(spec, Expression):
21
+ return spec
22
+ if isinstance(spec, str):
23
+ return LocalExpression(spec)
24
+ if isinstance(spec, Mapping):
25
+ for sub_cls in (LocalExpression, ):
26
+ try:
27
+ return sub_cls(**spec)
28
+ except TypeError:
29
+ pass
30
+ raise TypeError(f"Unable to coerce 'Expression' from '{spec}'")
31
+
32
+
33
+ @dataclass(frozen=True, repr=False)
34
+ class LocalExpression(Expression):
35
+ r"""A local expression to define a row-level logical attribute based on
36
+ physical columns of the data source in the same row.
37
+
38
+ Args:
39
+ value: The value of the expression.
40
+ """
41
+ value: str
42
+
43
+ def __repr__(self) -> str:
44
+ return self.value