kumoai 2.13.0.dev202512011731__cp312-cp312-macosx_11_0_arm64.whl → 2.14.0.dev202512181731__cp312-cp312-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. kumoai/__init__.py +12 -0
  2. kumoai/_version.py +1 -1
  3. kumoai/client/pquery.py +6 -2
  4. kumoai/experimental/rfm/__init__.py +33 -8
  5. kumoai/experimental/rfm/authenticate.py +3 -4
  6. kumoai/experimental/rfm/backend/local/__init__.py +4 -0
  7. kumoai/experimental/rfm/{local_graph_store.py → backend/local/graph_store.py} +53 -107
  8. kumoai/experimental/rfm/backend/local/sampler.py +315 -0
  9. kumoai/experimental/rfm/backend/local/table.py +41 -80
  10. kumoai/experimental/rfm/backend/snow/__init__.py +37 -0
  11. kumoai/experimental/rfm/backend/snow/sampler.py +252 -0
  12. kumoai/experimental/rfm/backend/snow/table.py +147 -0
  13. kumoai/experimental/rfm/backend/sqlite/__init__.py +11 -2
  14. kumoai/experimental/rfm/backend/sqlite/sampler.py +349 -0
  15. kumoai/experimental/rfm/backend/sqlite/table.py +108 -88
  16. kumoai/experimental/rfm/base/__init__.py +26 -2
  17. kumoai/experimental/rfm/base/column.py +6 -12
  18. kumoai/experimental/rfm/base/column_expression.py +16 -0
  19. kumoai/experimental/rfm/base/sampler.py +773 -0
  20. kumoai/experimental/rfm/base/source.py +19 -0
  21. kumoai/experimental/rfm/base/sql_sampler.py +84 -0
  22. kumoai/experimental/rfm/base/sql_table.py +113 -0
  23. kumoai/experimental/rfm/base/table.py +174 -76
  24. kumoai/experimental/rfm/graph.py +444 -84
  25. kumoai/experimental/rfm/infer/__init__.py +6 -0
  26. kumoai/experimental/rfm/infer/dtype.py +77 -0
  27. kumoai/experimental/rfm/infer/pkey.py +128 -0
  28. kumoai/experimental/rfm/infer/time_col.py +61 -0
  29. kumoai/experimental/rfm/pquery/executor.py +27 -27
  30. kumoai/experimental/rfm/pquery/pandas_executor.py +30 -32
  31. kumoai/experimental/rfm/rfm.py +299 -240
  32. kumoai/experimental/rfm/sagemaker.py +4 -4
  33. kumoai/pquery/predictive_query.py +10 -6
  34. kumoai/testing/snow.py +50 -0
  35. kumoai/utils/__init__.py +3 -2
  36. kumoai/utils/progress_logger.py +178 -12
  37. kumoai/utils/sql.py +3 -0
  38. {kumoai-2.13.0.dev202512011731.dist-info → kumoai-2.14.0.dev202512181731.dist-info}/METADATA +6 -2
  39. {kumoai-2.13.0.dev202512011731.dist-info → kumoai-2.14.0.dev202512181731.dist-info}/RECORD +42 -30
  40. kumoai/experimental/rfm/local_graph_sampler.py +0 -182
  41. kumoai/experimental/rfm/local_pquery_driver.py +0 -689
  42. kumoai/experimental/rfm/utils.py +0 -344
  43. {kumoai-2.13.0.dev202512011731.dist-info → kumoai-2.14.0.dev202512181731.dist-info}/WHEEL +0 -0
  44. {kumoai-2.13.0.dev202512011731.dist-info → kumoai-2.14.0.dev202512181731.dist-info}/licenses/LICENSE +0 -0
  45. {kumoai-2.13.0.dev202512011731.dist-info → kumoai-2.14.0.dev202512181731.dist-info}/top_level.txt +0 -0
kumoai/experimental/rfm/backend/local/sampler.py (new file)
@@ -0,0 +1,315 @@
+ from typing import TYPE_CHECKING, Literal
+
+ import numpy as np
+ import pandas as pd
+ from kumoapi.pquery import ValidatedPredictiveQuery
+
+ from kumoai.experimental.rfm.backend.local import LocalGraphStore
+ from kumoai.experimental.rfm.base import Sampler, SamplerOutput
+ from kumoai.experimental.rfm.pquery import PQueryPandasExecutor
+ from kumoai.utils import ProgressLogger
+
+ if TYPE_CHECKING:
+     from kumoai.experimental.rfm import Graph
+
+
+ class LocalSampler(Sampler):
+     def __init__(
+         self,
+         graph: 'Graph',
+         verbose: bool | ProgressLogger = True,
+     ) -> None:
+         super().__init__(graph=graph, verbose=verbose)
+
+         import kumoai.kumolib as kumolib
+
+         self._graph_store = LocalGraphStore(graph, verbose)
+         self._graph_sampler = kumolib.NeighborSampler(
+             list(self.table_stype_dict.keys()),
+             self.edge_types,
+             {
+                 '__'.join(edge_type): colptr
+                 for edge_type, colptr in self._graph_store.colptr_dict.items()
+             },
+             {
+                 '__'.join(edge_type): row
+                 for edge_type, row in self._graph_store.row_dict.items()
+             },
+             self._graph_store.time_dict,
+         )
+
+     def _get_min_max_time_dict(
+         self,
+         table_names: list[str],
+     ) -> dict[str, tuple[pd.Timestamp, pd.Timestamp]]:
+         return {
+             key: value
+             for key, value in self._graph_store.min_max_time_dict.items()
+             if key in table_names
+         }
+
+     def _sample_subgraph(
+         self,
+         entity_table_name: str,
+         entity_pkey: pd.Series,
+         anchor_time: pd.Series | Literal['entity'],
+         columns_dict: dict[str, set[str]],
+         num_neighbors: list[int],
+     ) -> SamplerOutput:
+
+         index = self._graph_store.get_node_id(entity_table_name, entity_pkey)
+
+         if isinstance(anchor_time, pd.Series):
+             time = anchor_time.astype(int).to_numpy() // 1000**3  # to seconds
+         else:
+             assert anchor_time == 'entity'
+             time = self._graph_store.time_dict[entity_table_name][index]
+
+         (
+             row_dict,
+             col_dict,
+             node_dict,
+             batch_dict,
+             num_sampled_nodes_dict,
+             num_sampled_edges_dict,
+         ) = self._graph_sampler.sample(
+             {
+                 '__'.join(edge_type): num_neighbors
+                 for edge_type in self.edge_types
+             },
+             {},
+             entity_table_name,
+             index,
+             time,
+         )
+
+         df_dict: dict[str, pd.DataFrame] = {}
+         inverse_dict: dict[str, np.ndarray] = {}
+         for table_name, node in node_dict.items():
+             df = self._graph_store.df_dict[table_name]
+             columns = columns_dict[table_name]
+             if self.end_time_column_dict.get(table_name, None) in columns:
+                 df = df.iloc[node]
+             elif len(columns) == 0:
+                 df = df.iloc[node]
+             else:
+                 # Only store unique rows in `df` above a certain threshold:
+                 unique_node, inverse = np.unique(node, return_inverse=True)
+                 if len(node) > 1.05 * len(unique_node):
+                     df = df.iloc[unique_node]
+                     inverse_dict[table_name] = inverse
+                 else:
+                     df = df.iloc[node]
+             df = df.reset_index(drop=True)
+             df = df[list(columns)]
+             df_dict[table_name] = df
+
+         num_sampled_nodes_dict = {
+             table_name: num_sampled_nodes.tolist()
+             for table_name, num_sampled_nodes in
+             num_sampled_nodes_dict.items()
+         }
+
+         row_dict = {
+             edge_type: row_dict['__'.join(edge_type)]
+             for edge_type in self.edge_types
+         }
+         col_dict = {
+             edge_type: col_dict['__'.join(edge_type)]
+             for edge_type in self.edge_types
+         }
+         num_sampled_edges_dict = {
+             edge_type: num_sampled_edges_dict['__'.join(edge_type)].tolist()
+             for edge_type in self.edge_types
+         }
+
+         return SamplerOutput(
+             anchor_time=time * 1000**3,  # to nanoseconds
+             df_dict=df_dict,
+             inverse_dict=inverse_dict,
+             batch_dict=batch_dict,
+             num_sampled_nodes_dict=num_sampled_nodes_dict,
+             row_dict=row_dict,
+             col_dict=col_dict,
+             num_sampled_edges_dict=num_sampled_edges_dict,
+         )
+
+     def _sample_entity_table(
+         self,
+         table_name: str,
+         columns: set[str],
+         num_rows: int,
+         random_seed: int | None = None,
+     ) -> pd.DataFrame:
+         pkey_map = self._graph_store.pkey_map_dict[table_name]
+         if len(pkey_map) > num_rows:
+             pkey_map = pkey_map.sample(
+                 n=num_rows,
+                 random_state=random_seed,
+                 ignore_index=True,
+             )
+         df = self._graph_store.df_dict[table_name]
+         df = df.iloc[pkey_map['arange']][list(columns)]
+         return df
+
+     def _sample_target(
+         self,
+         query: ValidatedPredictiveQuery,
+         entity_df: pd.DataFrame,
+         train_index: np.ndarray,
+         train_time: pd.Series,
+         num_train_examples: int,
+         test_index: np.ndarray,
+         test_time: pd.Series,
+         num_test_examples: int,
+         columns_dict: dict[str, set[str]],
+         time_offset_dict: dict[
+             tuple[str, str, str],
+             tuple[pd.DateOffset | None, pd.DateOffset],
+         ],
+     ) -> tuple[pd.Series, np.ndarray, pd.Series, np.ndarray]:
+
+         train_y, train_mask = self._sample_target_set(
+             query=query,
+             pkey=entity_df[self.primary_key_dict[query.entity_table]],
+             index=train_index,
+             anchor_time=train_time,
+             num_examples=num_train_examples,
+             columns_dict=columns_dict,
+             time_offset_dict=time_offset_dict,
+         )
+
+         test_y, test_mask = self._sample_target_set(
+             query=query,
+             pkey=entity_df[self.primary_key_dict[query.entity_table]],
+             index=test_index,
+             anchor_time=test_time,
+             num_examples=num_test_examples,
+             columns_dict=columns_dict,
+             time_offset_dict=time_offset_dict,
+         )
+
+         return train_y, train_mask, test_y, test_mask
+
+     # Helper Methods ##########################################################
+
+     def _sample_target_set(
+         self,
+         query: ValidatedPredictiveQuery,
+         pkey: pd.Series,
+         index: np.ndarray,
+         anchor_time: pd.Series,
+         num_examples: int,
+         columns_dict: dict[str, set[str]],
+         time_offset_dict: dict[
+             tuple[str, str, str],
+             tuple[pd.DateOffset | None, pd.DateOffset],
+         ],
+         batch_size: int = 10_000,
+     ) -> tuple[pd.Series, np.ndarray]:
+
+         num_hops = 1 if len(time_offset_dict) > 0 else 0
+         num_neighbors_dict: dict[str, list[int]] = {}
+         unix_time_offset_dict: dict[str, list[list[int | None]]] = {}
+         for edge_type, (start, end) in time_offset_dict.items():
+             unix_time_offset_dict['__'.join(edge_type)] = [[
+                 date_offset_to_seconds(start) if start is not None else None,
+                 date_offset_to_seconds(end),
+             ]]
+         for edge_type in set(self.edge_types) - set(time_offset_dict.keys()):
+             num_neighbors_dict['__'.join(edge_type)] = [0] * num_hops
+
+         if anchor_time.dtype != 'datetime64[ns]':
+             anchor_time = anchor_time.astype('datetime64')
+
+         count = 0
+         ys: list[pd.Series] = []
+         mask = np.full(len(index), False, dtype=bool)
+         for start in range(0, len(index), batch_size):
+             subset = pkey.iloc[index[start:start + batch_size]]
+             time = anchor_time.iloc[start:start + batch_size]
+
+             _, _, node_dict, batch_dict, _, _ = self._graph_sampler.sample(
+                 num_neighbors_dict,
+                 unix_time_offset_dict,
+                 query.entity_table,
+                 self._graph_store.get_node_id(query.entity_table, subset),
+                 time.astype(int).to_numpy() // 1000**3,  # to seconds
+             )
+
+             feat_dict: dict[str, pd.DataFrame] = {}
+             time_dict: dict[str, pd.Series] = {}
+             for table_name, columns in columns_dict.items():
+                 df = self._graph_store.df_dict[table_name]
+                 df = df.iloc[node_dict[table_name]].reset_index(drop=True)
+                 df = df[list(columns)]
+                 feat_dict[table_name] = df
+
+                 time_column = self.time_column_dict.get(table_name)
+                 if time_column in columns:
+                     time_dict[table_name] = df[time_column]
+
+             y, _mask = PQueryPandasExecutor().execute(
+                 query=query,
+                 feat_dict=feat_dict,
+                 time_dict=time_dict,
+                 batch_dict=batch_dict,
+                 anchor_time=time,
+                 num_forecasts=query.num_forecasts,
+             )
+             ys.append(y)
+             mask[start:start + batch_size] = _mask
+
+             count += len(y)
+             if count >= num_examples:
+                 break
+
+         if len(ys) == 0:
+             y = pd.Series([], dtype=float)
+         elif len(ys) == 1:
+             y = ys[0]
+         else:
+             y = pd.concat(ys, axis=0, ignore_index=True)
+
+         return y, mask
+
+
+ # Helper Functions ############################################################
+
+
+ def date_offset_to_seconds(offset: pd.DateOffset) -> int:
+     r"""Convert a :class:`pandas.DateOffset` into a number of seconds.
+
+     .. note::
+         We are conservative and take months and years as their maximum value.
+         Additional values are then dropped in label computation where we know
+         the actual dates.
+     """
+     MAX_DAYS_IN_MONTH = 31
+     MAX_DAYS_IN_YEAR = 366
+
+     SECONDS_IN_MINUTE = 60
+     SECONDS_IN_HOUR = 60 * SECONDS_IN_MINUTE
+     SECONDS_IN_DAY = 24 * SECONDS_IN_HOUR
+
+     total_sec = 0
+     multiplier = getattr(offset, 'n', 1)  # The multiplier (if present).
+
+     for attr, value in offset.__dict__.items():
+         if value is None or value == 0:
+             continue
+         scaled_value = value * multiplier
+         if attr == 'years':
+             total_sec += scaled_value * MAX_DAYS_IN_YEAR * SECONDS_IN_DAY
+         elif attr == 'months':
+             total_sec += scaled_value * MAX_DAYS_IN_MONTH * SECONDS_IN_DAY
+         elif attr == 'days':
+             total_sec += scaled_value * SECONDS_IN_DAY
+         elif attr == 'hours':
+             total_sec += scaled_value * SECONDS_IN_HOUR
+         elif attr == 'minutes':
+             total_sec += scaled_value * SECONDS_IN_MINUTE
+         elif attr == 'seconds':
+             total_sec += scaled_value
+
+     return total_sec
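For reference, `date_offset_to_seconds` deliberately over-approximates calendar units (months count as 31 days, years as 366 days), and the `n` multiplier scales every field. A minimal sketch of the expected behavior, assuming pandas keeps the keyword fields (`months`, `days`, ...) on the offset's `__dict__` as the implementation above relies on; the values follow directly from the constants in the function:

    import pandas as pd
    from kumoai.experimental.rfm.backend.local.sampler import date_offset_to_seconds

    # 2 months are upper-bounded to 2 * 31 days; 3 days add 3 * 86,400 seconds:
    assert date_offset_to_seconds(pd.DateOffset(months=2, days=3)) == (2 * 31 + 3) * 86_400

    # The multiplier `n` scales each field: DateOffset(2, hours=1) counts as 2 hours:
    assert date_offset_to_seconds(pd.DateOffset(2, hours=1)) == 2 * 60 * 60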
kumoai/experimental/rfm/backend/local/table.py
@@ -1,11 +1,11 @@
- from typing import List, Optional
+ import warnings
+ from typing import cast

  import pandas as pd
- from kumoapi.typing import Dtype, Stype
- from typing_extensions import Self
+ from kumoapi.model_plan import MissingType

- from kumoai.experimental.rfm import utils
- from kumoai.experimental.rfm.base import Column, Table
+ from kumoai.experimental.rfm.base import DataBackend, SourceColumn, Table
+ from kumoai.experimental.rfm.infer import infer_dtype


  class LocalTable(Table):
@@ -53,13 +53,13 @@ class LocalTable(Table):
          self,
          df: pd.DataFrame,
          name: str,
-         primary_key: Optional[str] = None,
-         time_column: Optional[str] = None,
-         end_time_column: Optional[str] = None,
+         primary_key: MissingType | str | None = MissingType.VALUE,
+         time_column: str | None = None,
+         end_time_column: str | None = None,
      ) -> None:

          if df.empty:
-             raise ValueError("Data frame must have at least one row")
+             raise ValueError("Data frame is empty")
          if isinstance(df.columns, pd.MultiIndex):
              raise ValueError("Data frame must not have a multi-index")
          if not df.columns.is_unique:
@@ -77,75 +77,36 @@ class LocalTable(Table):
              end_time_column=end_time_column,
          )

-     def infer_metadata(self, verbose: bool = True) -> Self:
-         r"""Infers metadata, *i.e.*, primary keys and time columns, in the
-         table.
-
-         Args:
-             verbose: Whether to print verbose output.
-         """
-         logs = []
-
-         # Try to detect primary key if not set:
-         if not self.has_primary_key():
-
-             def is_candidate(column: Column) -> bool:
-                 if column.stype == Stype.ID:
-                     return True
-                 if all(column.stype != Stype.ID for column in self.columns):
-                     if self.name == column.name:
-                         return True
-                     if (self.name.endswith('s')
-                             and self.name[:-1] == column.name):
-                         return True
-                 return False
-
-             candidates = [
-                 column.name for column in self.columns if is_candidate(column)
-             ]
-
-             if primary_key := utils.detect_primary_key(
-                     table_name=self.name,
-                     df=self._data,
-                     candidates=candidates,
-             ):
-                 self.primary_key = primary_key
-                 logs.append(f"primary key '{primary_key}'")
-
-         # Try to detect time column if not set:
-         if not self.has_time_column():
-             candidates = [
-                 column.name for column in self.columns
-                 if column.stype == Stype.timestamp
-                 and column.name != self._end_time_column
-             ]
-             if time_column := utils.detect_time_column(self._data, candidates):
-                 self.time_column = time_column
-                 logs.append(f"time column '{time_column}'")
-
-         if verbose and len(logs) > 0:
-             print(f"Detected {' and '.join(logs)} in table '{self.name}'")
-
-         return self
-
-     def _has_source_column(self, name: str) -> bool:
-         return name in self._data.columns
-
-     def _get_source_dtype(self, name: str) -> Dtype:
-         return utils.to_dtype(self._data[name])
-
-     def _get_source_stype(self, name: str, dtype: Dtype) -> Stype:
-         return utils.infer_stype(self._data[name], name, dtype)
-
-     def _infer_primary_key(self, candidates: List[str]) -> Optional[str]:
-         return utils.detect_primary_key(
-             table_name=self.name,
-             df=self._data,
-             candidates=candidates,
-         )
-
-     def _infer_time_column(self, candidates: List[str]) -> Optional[str]:
-         return utils.detect_time_column(df=self._data, candidates=candidates)
-
-     def _num_rows(self) -> Optional[int]:
+     @property
+     def backend(self) -> DataBackend:
+         return cast(DataBackend, DataBackend.LOCAL)
+
+     def _get_source_columns(self) -> list[SourceColumn]:
+         source_columns: list[SourceColumn] = []
+         for column in self._data.columns:
+             ser = self._data[column]
+             try:
+                 dtype = infer_dtype(ser)
+             except Exception:
+                 warnings.warn(f"Data type inference for column '{column}' in "
+                               f"table '{self.name}' failed. Consider changing "
+                               f"the data type of the column to use it within "
+                               f"this table.")
+                 continue
+
+             source_column = SourceColumn(
+                 name=column,
+                 dtype=dtype,
+                 is_primary_key=False,
+                 is_unique_key=False,
+                 is_nullable=True,
+             )
+             source_columns.append(source_column)
+
+         return source_columns
+
+     def _get_sample_df(self) -> pd.DataFrame:
+         return self._data
+
+     def _get_num_rows(self) -> int | None:
          return len(self._data)
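With this change, `LocalTable` no longer exposes its own `infer_metadata` path; primary-key and time-column detection appear to move into the shared `infer` helpers, and the sentinel default `MissingType.VALUE` seems intended to distinguish "not specified" from an explicit `None`. A minimal, hypothetical construction sketch (the DataFrame and column names are made up, and the `kumoai.experimental.rfm` re-export of `LocalTable` is assumed, as in earlier releases):

    import pandas as pd
    from kumoai.experimental.rfm import LocalTable  # re-export path assumed

    df = pd.DataFrame({
        'user_id': [1, 2, 3],  # hypothetical columns
        'signup_at': pd.to_datetime(['2024-01-01', '2024-02-01', '2024-03-01']),
    })

    # Leaving `primary_key` at its default sentinel defers to inference;
    # the time column is pinned explicitly here.
    users = LocalTable(df=df, name='users', time_column='signup_at')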
kumoai/experimental/rfm/backend/snow/__init__.py (new file)
@@ -0,0 +1,37 @@
+ from typing import Any, TypeAlias
+
+ try:
+     import snowflake.connector
+ except ImportError:
+     raise ImportError("No module named 'snowflake'. Please install Kumo SDK "
+                       "with the 'snowflake' extension via "
+                       "`pip install kumoai[snowflake]`.")
+
+ Connection: TypeAlias = snowflake.connector.SnowflakeConnection
+
+
+ def connect(**kwargs: Any) -> Connection:
+     r"""Opens a connection to a :class:`snowflake` database.
+
+     If available, will return a connection to the active session.
+
+     kwargs: Connection arguments, following the :class:`snowflake` protocol.
+     """
+     try:
+         from snowflake.snowpark.context import get_active_session
+         return get_active_session().connection
+     except Exception:
+         pass
+
+     return snowflake.connector.connect(**kwargs)
+
+
+ from .table import SnowTable  # noqa: E402
+ from .sampler import SnowSampler  # noqa: E402
+
+ __all__ = [
+     'connect',
+     'Connection',
+     'SnowTable',
+     'SnowSampler',
+ ]
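A minimal usage sketch for the new Snowflake backend entry point. Inside an active Snowpark session, `connect()` reuses that session's connection and needs no arguments; elsewhere, keyword arguments are forwarded verbatim to `snowflake.connector.connect`. The credential values below are placeholders:

    from kumoai.experimental.rfm.backend.snow import connect

    conn = connect(
        account='<account_identifier>',  # placeholder
        user='<user>',                   # placeholder
        password='<password>',           # placeholder
    )

    # Standard snowflake-connector usage on the returned connection:
    cur = conn.cursor()
    cur.execute('SELECT CURRENT_VERSION()')
    print(cur.fetchone())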