kumoai 2.14.0.dev202512211732__cp313-cp313-macosx_11_0_arm64.whl → 2.15.0.dev202601181732__cp313-cp313-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42):
  1. kumoai/__init__.py +38 -30
  2. kumoai/_version.py +1 -1
  3. kumoai/client/client.py +6 -0
  4. kumoai/client/jobs.py +26 -0
  5. kumoai/connector/utils.py +21 -7
  6. kumoai/experimental/rfm/__init__.py +24 -22
  7. kumoai/experimental/rfm/backend/local/graph_store.py +12 -21
  8. kumoai/experimental/rfm/backend/local/sampler.py +0 -3
  9. kumoai/experimental/rfm/backend/local/table.py +24 -25
  10. kumoai/experimental/rfm/backend/snow/sampler.py +235 -80
  11. kumoai/experimental/rfm/backend/snow/table.py +146 -70
  12. kumoai/experimental/rfm/backend/sqlite/sampler.py +196 -89
  13. kumoai/experimental/rfm/backend/sqlite/table.py +85 -55
  14. kumoai/experimental/rfm/base/__init__.py +6 -9
  15. kumoai/experimental/rfm/base/column.py +95 -11
  16. kumoai/experimental/rfm/base/expression.py +44 -0
  17. kumoai/experimental/rfm/base/mapper.py +69 -0
  18. kumoai/experimental/rfm/base/sampler.py +28 -18
  19. kumoai/experimental/rfm/base/source.py +1 -1
  20. kumoai/experimental/rfm/base/sql_sampler.py +320 -19
  21. kumoai/experimental/rfm/base/table.py +256 -109
  22. kumoai/experimental/rfm/base/utils.py +36 -0
  23. kumoai/experimental/rfm/graph.py +134 -114
  24. kumoai/experimental/rfm/infer/dtype.py +7 -2
  25. kumoai/experimental/rfm/infer/multicategorical.py +1 -1
  26. kumoai/experimental/rfm/infer/time_col.py +4 -2
  27. kumoai/experimental/rfm/relbench.py +76 -0
  28. kumoai/experimental/rfm/rfm.py +541 -307
  29. kumoai/experimental/rfm/task_table.py +292 -0
  30. kumoai/pquery/training_table.py +16 -2
  31. kumoai/testing/snow.py +3 -3
  32. kumoai/trainer/distilled_trainer.py +175 -0
  33. kumoai/utils/display.py +95 -0
  34. kumoai/utils/progress_logger.py +205 -117
  35. kumoai/utils/sql.py +2 -2
  36. {kumoai-2.14.0.dev202512211732.dist-info → kumoai-2.15.0.dev202601181732.dist-info}/METADATA +2 -2
  37. {kumoai-2.14.0.dev202512211732.dist-info → kumoai-2.15.0.dev202601181732.dist-info}/RECORD +40 -35
  38. kumoai/experimental/rfm/base/column_expression.py +0 -50
  39. kumoai/experimental/rfm/base/sql_table.py +0 -229
  40. {kumoai-2.14.0.dev202512211732.dist-info → kumoai-2.15.0.dev202601181732.dist-info}/WHEEL +0 -0
  41. {kumoai-2.14.0.dev202512211732.dist-info → kumoai-2.15.0.dev202601181732.dist-info}/licenses/LICENSE +0 -0
  42. {kumoai-2.14.0.dev202512211732.dist-info → kumoai-2.15.0.dev202601181732.dist-info}/top_level.txt +0 -0
@@ -1,229 +0,0 @@
1
- import warnings
2
- from abc import abstractmethod
3
- from collections import defaultdict
4
- from collections.abc import Sequence
5
- from functools import cached_property
6
- from typing import Any
7
-
8
- import pandas as pd
9
- from kumoapi.model_plan import MissingType
10
-
11
- from kumoai.experimental.rfm.base import (
12
- ColumnExpression,
13
- ColumnExpressionSpec,
14
- ColumnExpressionType,
15
- SourceForeignKey,
16
- Table,
17
- )
18
- from kumoai.experimental.rfm.infer import infer_dtype, infer_stype
19
- from kumoai.utils import quote_ident
20
-
21
-
22
- class SQLTable(Table):
23
- r"""A :class:`SQLTable` specifies a :class:`Table` backed by a SQL
24
- database.
25
-
26
- Args:
27
- name: The logical name of this table.
28
- source_name: The physical name of this table in the database. If set to
29
- ``None``, ``name`` is being used.
30
- columns: The selected physical columns of this table.
31
- column_expressions: The logical columns of this table.
32
- primary_key: The name of the primary key of this table, if it exists.
33
- time_column: The name of the time column of this table, if it exists.
34
- end_time_column: The name of the end time column of this table, if it
35
- exists.
36
- """
37
- def __init__(
38
- self,
39
- name: str,
40
- source_name: str | None = None,
41
- columns: Sequence[str] | None = None,
42
- column_expressions: Sequence[ColumnExpressionType] | None = None,
43
- primary_key: MissingType | str | None = MissingType.VALUE,
44
- time_column: str | None = None,
45
- end_time_column: str | None = None,
46
- ) -> None:
47
-
48
- self._connection: Any
49
- self._source_name = source_name or name
50
- self._expression_sample_df = pd.DataFrame()
51
-
52
- super().__init__(
53
- name=name,
54
- columns=[],
55
- primary_key=None,
56
- time_column=None,
57
- end_time_column=None,
58
- )
59
-
60
- # Add column expressions with highest priority:
61
- self.add_column_expressions(column_expressions or [])
62
-
63
- if columns is None:
64
- for column_name in self._source_column_dict.keys():
65
- if column_name not in self:
66
- self.add_column(column_name)
67
- else:
68
- for column_name in columns:
69
- self.add_column(column_name)
70
-
71
- if isinstance(primary_key, MissingType):
72
- # Inference from source column metadata:
73
- if '_source_column_dict' in self.__dict__:
74
- primary_key = self._source_primary_key
75
- if (primary_key is not None and primary_key in self
76
- and self[primary_key].is_physical):
77
- self.primary_key = primary_key
78
- elif primary_key is not None:
79
- if primary_key not in self:
80
- self.add_column(primary_key)
81
- self.primary_key = primary_key
82
-
83
- if time_column is not None:
84
- if time_column not in self:
85
- self.add_column(time_column)
86
- self.time_column = time_column
87
-
88
- if end_time_column is not None:
89
- if end_time_column not in self:
90
- self.add_column(end_time_column)
91
- self.end_time_column = end_time_column
92
-
93
- @property
94
- def fqn(self) -> str:
95
- r"""The fully-qualified quoted source table name."""
96
- return quote_ident(self._source_name)
97
-
98
- @cached_property
99
- def _source_foreign_key_dict(self) -> dict[str, SourceForeignKey]:
100
- fkeys = self._get_source_foreign_keys()
101
- # NOTE Drop all keys that link to multiple keys in the same table since
102
- # we don't support composite keys yet:
103
- table_pkeys: dict[str, set[str]] = defaultdict(set)
104
- for fkey in fkeys:
105
- table_pkeys[fkey.dst_table].add(fkey.primary_key)
106
- return {
107
- fkey.name: fkey
108
- for fkey in fkeys if len(table_pkeys[fkey.dst_table]) == 1
109
- }
110
-
111
- def _sample_current_df(self, columns: Sequence[str]) -> pd.DataFrame:
112
- expr_columns: list[str] = []
113
- source_columns: list[str] = []
114
- for column_name in columns:
115
- column = self[column_name]
116
- if isinstance(column, ColumnExpression):
117
- expr_columns.append(column_name)
118
- else:
119
- source_columns.append(column_name)
120
-
121
- dfs: list[pd.DataFrame] = []
122
- if len(expr_columns) > 0:
123
- dfs.append(self._expression_sample_df[expr_columns])
124
- if len(source_columns) > 0:
125
- dfs.append(self._source_sample_df[source_columns])
126
-
127
- if len(dfs) == 0:
128
- return pd.DataFrame(index=range(1000))
129
- if len(dfs) == 1:
130
- return dfs[0]
131
- return pd.concat(dfs, axis=1, ignore_index=True)
132
-
133
- # Column ##################################################################
134
-
135
- def add_column_expressions(
136
- self,
137
- columns: Sequence[ColumnExpressionType],
138
- ) -> None:
139
- r"""Adds a set of column expressions to this table.
140
-
141
- Args:
142
- columns: The set of column expressions.
143
-
144
- Raises:
145
- KeyError: If a column with the same name already exists in the
146
- table.
147
- """
148
- if len(columns) == 0:
149
- return
150
-
151
- column_expression_specs = [
152
- spec for column in columns
153
- if (spec := ColumnExpressionSpec._cast(column))
154
- ]
155
- df = self._get_expression_sample_df(column_expression_specs)
156
-
157
- for spec in column_expression_specs:
158
- if spec.name in self:
159
- raise KeyError(f"Column '{spec.name}' already exists in table "
160
- f"'{self.name}'")
161
-
162
- dtype = spec.dtype
163
- if dtype is None:
164
- ser = df[spec.name]
165
- try:
166
- dtype = infer_dtype(ser)
167
- except Exception:
168
- warnings.warn(f"Encountered unsupported data type "
169
- f"'{ser.dtype}' for column expression "
170
- f"'{spec.name}' in table '{self.name}'."
171
- f"Please manually specify the data type for "
172
- f"this column expression to use it within "
173
- f"this table, or remove it to suppress "
174
- f"this warning.")
175
- continue
176
-
177
- ser = df[spec.name]
178
- try:
179
- stype = infer_stype(ser, spec.name, dtype)
180
- except Exception as e:
181
- raise RuntimeError(f"Could not obtain semantic type for "
182
- f"column expression '{spec.name}' with "
183
- f"data type '{dtype}' in table "
184
- f"'{self.name}'. Change the data type of "
185
- f"the column expression or remove it from "
186
- f"this table.") from e
187
-
188
- self._columns[spec.name] = ColumnExpression(
189
- name=spec.name,
190
- expr=spec.expr,
191
- stype=stype,
192
- dtype=dtype,
193
- )
194
- with warnings.catch_warnings():
195
- warnings.simplefilter('ignore', pd.errors.PerformanceWarning)
196
- self._expression_sample_df[spec.name] = ser
197
-
198
- def add_column_expression(
199
- self,
200
- column: ColumnExpressionType,
201
- ) -> ColumnExpression:
202
- r"""Adds a column expression to this table.
203
-
204
- Args:
205
- column: The column expression.
206
-
207
- Raises:
208
- KeyError: If a column with the same name already exists in the
209
- table.
210
- """
211
- spec = ColumnExpressionSpec._cast(column)
212
- assert spec is not None
213
- self.add_column_expressions([spec])
214
- column_expression = self.column(spec.name)
215
- assert isinstance(column_expression, ColumnExpression)
216
- return column_expression
217
-
218
- # Abstract Methods ########################################################
219
-
220
- @abstractmethod
221
- def _get_source_foreign_keys(self) -> list[SourceForeignKey]:
222
- pass
223
-
224
- @abstractmethod
225
- def _get_expression_sample_df(
226
- self,
227
- specs: Sequence[ColumnExpressionSpec],
228
- ) -> pd.DataFrame:
229
- pass