kumoai 2.13.0.dev202512040649__cp313-cp313-win_amd64.whl → 2.14.0.dev202512211732__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. kumoai/__init__.py +12 -0
  2. kumoai/_version.py +1 -1
  3. kumoai/client/pquery.py +6 -2
  4. kumoai/experimental/rfm/__init__.py +33 -8
  5. kumoai/experimental/rfm/authenticate.py +3 -4
  6. kumoai/experimental/rfm/backend/local/__init__.py +4 -0
  7. kumoai/experimental/rfm/{local_graph_store.py → backend/local/graph_store.py} +52 -91
  8. kumoai/experimental/rfm/backend/local/sampler.py +315 -0
  9. kumoai/experimental/rfm/backend/local/table.py +21 -16
  10. kumoai/experimental/rfm/backend/snow/__init__.py +2 -0
  11. kumoai/experimental/rfm/backend/snow/sampler.py +252 -0
  12. kumoai/experimental/rfm/backend/snow/table.py +102 -48
  13. kumoai/experimental/rfm/backend/sqlite/__init__.py +4 -2
  14. kumoai/experimental/rfm/backend/sqlite/sampler.py +349 -0
  15. kumoai/experimental/rfm/backend/sqlite/table.py +84 -31
  16. kumoai/experimental/rfm/base/__init__.py +26 -3
  17. kumoai/experimental/rfm/base/column.py +14 -12
  18. kumoai/experimental/rfm/base/column_expression.py +50 -0
  19. kumoai/experimental/rfm/base/sampler.py +773 -0
  20. kumoai/experimental/rfm/base/source.py +1 -0
  21. kumoai/experimental/rfm/base/sql_sampler.py +84 -0
  22. kumoai/experimental/rfm/base/sql_table.py +229 -0
  23. kumoai/experimental/rfm/base/table.py +173 -138
  24. kumoai/experimental/rfm/graph.py +302 -108
  25. kumoai/experimental/rfm/infer/__init__.py +6 -4
  26. kumoai/experimental/rfm/infer/dtype.py +3 -3
  27. kumoai/experimental/rfm/infer/pkey.py +4 -2
  28. kumoai/experimental/rfm/infer/stype.py +35 -0
  29. kumoai/experimental/rfm/infer/time_col.py +1 -2
  30. kumoai/experimental/rfm/pquery/executor.py +27 -27
  31. kumoai/experimental/rfm/pquery/pandas_executor.py +30 -32
  32. kumoai/experimental/rfm/rfm.py +299 -230
  33. kumoai/experimental/rfm/sagemaker.py +4 -4
  34. kumoai/kumolib.cp313-win_amd64.pyd +0 -0
  35. kumoai/pquery/predictive_query.py +10 -6
  36. kumoai/testing/snow.py +50 -0
  37. kumoai/utils/__init__.py +3 -2
  38. kumoai/utils/progress_logger.py +178 -12
  39. kumoai/utils/sql.py +3 -0
  40. {kumoai-2.13.0.dev202512040649.dist-info → kumoai-2.14.0.dev202512211732.dist-info}/METADATA +3 -2
  41. {kumoai-2.13.0.dev202512040649.dist-info → kumoai-2.14.0.dev202512211732.dist-info}/RECORD +44 -36
  42. kumoai/experimental/rfm/local_graph_sampler.py +0 -223
  43. kumoai/experimental/rfm/local_pquery_driver.py +0 -689
  44. {kumoai-2.13.0.dev202512040649.dist-info → kumoai-2.14.0.dev202512211732.dist-info}/WHEEL +0 -0
  45. {kumoai-2.13.0.dev202512040649.dist-info → kumoai-2.14.0.dev202512211732.dist-info}/licenses/LICENSE +0 -0
  46. {kumoai-2.13.0.dev202512040649.dist-info → kumoai-2.14.0.dev202512211732.dist-info}/top_level.txt +0 -0
kumoai/experimental/rfm/base/table.py
@@ -1,23 +1,20 @@
 from abc import ABC, abstractmethod
-from collections import defaultdict
+from collections.abc import Sequence
 from functools import cached_property
-from typing import Dict, List, Optional, Sequence, Set
 
 import pandas as pd
+from kumoapi.model_plan import MissingType
 from kumoapi.source_table import UnavailableSourceTable
 from kumoapi.table import Column as ColumnDefinition
 from kumoapi.table import TableDefinition
 from kumoapi.typing import Stype
 from typing_extensions import Self
 
-from kumoai import in_notebook
-from kumoai.experimental.rfm.base import Column, SourceColumn, SourceForeignKey
+from kumoai import in_notebook, in_snowflake_notebook
+from kumoai.experimental.rfm.base import Column, DataBackend, SourceColumn
 from kumoai.experimental.rfm.infer import (
-    contains_categorical,
-    contains_id,
-    contains_multicategorical,
-    contains_timestamp,
     infer_primary_key,
+    infer_stype,
     infer_time_column,
 )
 
@@ -38,44 +35,32 @@ class Table(ABC):
     def __init__(
         self,
         name: str,
-        columns: Optional[Sequence[str]] = None,
-        primary_key: Optional[str] = None,
-        time_column: Optional[str] = None,
-        end_time_column: Optional[str] = None,
+        columns: Sequence[str] | None = None,
+        primary_key: MissingType | str | None = MissingType.VALUE,
+        time_column: str | None = None,
+        end_time_column: str | None = None,
     ) -> None:
 
         self._name = name
-        self._primary_key: Optional[str] = None
-        self._time_column: Optional[str] = None
-        self._end_time_column: Optional[str] = None
+        self._primary_key: str | None = None
+        self._time_column: str | None = None
+        self._end_time_column: str | None = None
 
-        if len(self._source_column_dict) == 0:
-            raise ValueError(f"Table '{name}' does not hold any column with "
-                             f"a supported data type")
+        if columns is None:
+            columns = list(self._source_column_dict.keys())
 
-        primary_keys = [
-            column.name for column in self._source_column_dict.values()
-            if column.is_primary_key
-        ]
-        if len(primary_keys) == 1:  # NOTE No composite keys yet.
-            if primary_key is not None and primary_key != primary_keys[0]:
-                raise ValueError(f"Found duplicate primary key "
-                                 f"definition '{primary_key}' and "
-                                 f"'{primary_keys[0]}' in table '{name}'")
-            primary_key = primary_keys[0]
-
-        unique_keys = [
-            column.name for column in self._source_column_dict.values()
-            if column.is_unique_key
-        ]
-        if primary_key is None and len(unique_keys) == 1:
-            primary_key = unique_keys[0]
-
-        self._columns: Dict[str, Column] = {}
-        for column_name in columns or list(self._source_column_dict.keys()):
+        self._columns: dict[str, Column] = {}
+        for column_name in columns:
             self.add_column(column_name)
 
-        if primary_key is not None:
+        if isinstance(primary_key, MissingType):
+            # Inference from source column metadata:
+            if '_source_column_dict' in self.__dict__:
+                primary_key = self._source_primary_key
+                if (primary_key is not None and primary_key in self
+                        and self[primary_key].is_physical):
+                    self.primary_key = primary_key
+        elif primary_key is not None:
             if primary_key not in self:
                 self.add_column(primary_key)
             self.primary_key = primary_key
@@ -95,7 +80,7 @@ class Table(ABC):
         r"""The name of this table."""
         return self._name
 
-    # Data column #############################################################
+    # Column ##################################################################
 
     def has_column(self, name: str) -> bool:
         r"""Returns ``True`` if this table holds a column with name ``name``;
@@ -117,7 +102,7 @@ class Table(ABC):
         return self._columns[name]
 
     @property
-    def columns(self) -> List[Column]:
+    def columns(self) -> list[Column]:
         r"""Returns a list of :class:`Column` objects that represent the
         columns in this table.
         """
@@ -140,36 +125,22 @@ class Table(ABC):
             raise KeyError(f"Column '{name}' does not exist in the underlying "
                            f"source table")
 
-        try:
-            dtype = self._source_column_dict[name].dtype
-        except Exception as e:
-            raise RuntimeError(f"Could not obtain data type for column "
-                               f"'{name}' in table '{self.name}'. Change "
-                               f"the data type of the column in the source "
-                               f"table or remove it from the table.") from e
+        dtype = self._source_column_dict[name].dtype
 
+        ser = self._source_sample_df[name]
         try:
-            ser = self._sample_df[name]
-            if contains_id(ser, name, dtype):
-                stype = Stype.ID
-            elif contains_timestamp(ser, name, dtype):
-                stype = Stype.timestamp
-            elif contains_multicategorical(ser, name, dtype):
-                stype = Stype.multicategorical
-            elif contains_categorical(ser, name, dtype):
-                stype = Stype.categorical
-            else:
-                stype = dtype.default_stype
+            stype = infer_stype(ser, name, dtype)
         except Exception as e:
             raise RuntimeError(f"Could not obtain semantic type for column "
-                               f"'{name}' in table '{self.name}'. Change "
-                               f"the data type of the column in the source "
-                               f"table or remove it from the table.") from e
+                               f"'{name}' with data type '{dtype}' in table "
+                               f"'{self.name}'. Change the data type of the "
+                               f"column in the source table or remove it from "
+                               f"this table.") from e
 
         self._columns[name] = Column(
             name=name,
-            dtype=dtype,
             stype=stype,
+            dtype=dtype,
         )
 
         return self._columns[name]
@@ -205,7 +176,7 @@ class Table(ABC):
         return self._primary_key is not None
 
     @property
-    def primary_key(self) -> Optional[Column]:
+    def primary_key(self) -> Column | None:
        r"""The primary key column of this table.
 
        The getter returns the primary key column of this table, or ``None`` if
@@ -220,7 +191,7 @@ class Table(ABC):
         return self[self._primary_key]
 
     @primary_key.setter
-    def primary_key(self, name: Optional[str]) -> None:
+    def primary_key(self, name: str | None) -> None:
         if name is not None and name == self._time_column:
             raise ValueError(f"Cannot specify column '{name}' as a primary "
                              f"key since it is already defined to be a time "
@@ -250,7 +221,7 @@ class Table(ABC):
         return self._time_column is not None
 
     @property
-    def time_column(self) -> Optional[Column]:
+    def time_column(self) -> Column | None:
         r"""The time column of this table.
 
         The getter returns the time column of this table, or ``None`` if no
@@ -265,7 +236,7 @@ class Table(ABC):
         return self[self._time_column]
 
     @time_column.setter
-    def time_column(self, name: Optional[str]) -> None:
+    def time_column(self, name: str | None) -> None:
         if name is not None and name == self._primary_key:
             raise ValueError(f"Cannot specify column '{name}' as a time "
                              f"column since it is already defined to be a "
@@ -295,7 +266,7 @@ class Table(ABC):
         return self._end_time_column is not None
 
     @property
-    def end_time_column(self) -> Optional[Column]:
+    def end_time_column(self) -> Column | None:
         r"""The end time column of this table.
 
         The getter returns the end time column of this table, or ``None`` if no
@@ -311,7 +282,7 @@ class Table(ABC):
         return self[self._end_time_column]
 
     @end_time_column.setter
-    def end_time_column(self, name: Optional[str]) -> None:
+    def end_time_column(self, name: str | None) -> None:
         if name is not None and name == self._primary_key:
             raise ValueError(f"Cannot specify column '{name}' as an end time "
                              f"column since it is already defined to be a "
@@ -384,7 +355,12 @@ class Table(ABC):
         if self._num_rows is not None:
             num_rows_repr = ' ({self._num_rows:,} rows)'
 
-        if in_notebook():
+        if in_snowflake_notebook():
+            import streamlit as st
+            md_repr = f"### 🏷️ Metadata of Table `{self.name}`{num_rows_repr}"
+            st.markdown(md_repr)
+            st.dataframe(self.metadata, hide_index=True)
+        elif in_notebook():
             from IPython.display import Markdown, display
             md_repr = f"### 🏷️ Metadata of Table `{self.name}`{num_rows_repr}"
             display(Markdown(md_repr))
@@ -400,8 +376,91 @@ class Table(ABC):
             print(f"🏷️ Metadata of Table '{self.name}'{num_rows_repr}")
             print(self.metadata.to_string(index=False))
 
+    def infer_primary_key(self, verbose: bool = True) -> Self:
+        r"""Infers the primary key in this table.
+
+        Args:
+            verbose: Whether to print verbose output.
+        """
+        if self.has_primary_key():
+            return self
+
+        def _set_primary_key(primary_key: str) -> None:
+            self.primary_key = primary_key
+            if verbose:
+                print(f"Detected primary key '{primary_key}' in table "
+                      f"'{self.name}'")
+
+        # Inference from source column metadata:
+        if '_source_column_dict' in self.__dict__:
+            primary_key = self._source_primary_key
+            if (primary_key is not None and primary_key in self
+                    and self[primary_key].is_physical):
+                _set_primary_key(primary_key)
+                return self
+
+            unique_keys = [
+                column.name for column in self._source_column_dict.values()
+                if column.is_unique_key
+            ]
+            if (len(unique_keys) == 1  # NOTE No composite keys yet.
+                    and unique_keys[0] in self
+                    and self[unique_keys[0]].is_physical):
+                _set_primary_key(unique_keys[0])
+                return self
+
+        # Heuristic-based inference:
+        candidates = [
+            column.name for column in self.columns if column.stype == Stype.ID
+        ]
+        if len(candidates) == 0:
+            for column in self.columns:
+                if self.name.lower() == column.name.lower():
+                    candidates.append(column.name)
+                elif (self.name.lower().endswith('s')
+                      and self.name.lower()[:-1] == column.name.lower()):
+                    candidates.append(column.name)
+
+        if primary_key := infer_primary_key(
+                table_name=self.name,
+                df=self._sample_current_df(columns=candidates),
+                candidates=candidates,
+        ):
+            _set_primary_key(primary_key)
+            return self
+
+        return self
+
+    def infer_time_column(self, verbose: bool = True) -> Self:
+        r"""Infers the time column in this table.
+
+        Args:
+            verbose: Whether to print verbose output.
+        """
+        if self.has_time_column():
+            return self
+
+        # Heuristic-based inference:
+        candidates = [
+            column.name for column in self.columns
+            if column.stype == Stype.timestamp
+            and column.name != self._end_time_column
+        ]
+
+        if time_column := infer_time_column(
+                df=self._sample_current_df(columns=candidates),
+                candidates=candidates,
+        ):
+            self.time_column = time_column
+
+            if verbose:
+                print(f"Detected time column '{time_column}' in table "
+                      f"'{self.name}'")
+
+        return self
+
     def infer_metadata(self, verbose: bool = True) -> Self:
-        r"""Infers metadata, *i.e.*, primary keys and time columns, in the
+        r"""Infers metadata, *i.e.*, primary keys and time columns, in this
         table.
 
         Args:
@@ -409,45 +468,15 @@ class Table(ABC):
         """
         logs = []
 
-        # Try to detect primary key if not set:
         if not self.has_primary_key():
+            self.infer_primary_key(verbose=False)
+            if self.has_primary_key():
+                logs.append(f"primary key '{self._primary_key}'")
 
-            def is_candidate(column: Column) -> bool:
-                if column.stype == Stype.ID:
-                    return True
-                if all(column.stype != Stype.ID for column in self.columns):
-                    if self.name == column.name:
-                        return True
-                    if (self.name.endswith('s')
-                            and self.name[:-1] == column.name):
-                        return True
-                return False
-
-            candidates = [
-                column.name for column in self.columns if is_candidate(column)
-            ]
-
-            if primary_key := infer_primary_key(
-                    table_name=self.name,
-                    df=self._sample_df,
-                    candidates=candidates,
-            ):
-                self.primary_key = primary_key
-                logs.append(f"primary key '{primary_key}'")
-
-        # Try to detect time column if not set:
         if not self.has_time_column():
-            candidates = [
-                column.name for column in self.columns
-                if column.stype == Stype.timestamp
-                and column.name != self._end_time_column
-            ]
-            if time_column := infer_time_column(
-                    df=self._sample_df,
-                    candidates=candidates,
-            ):
-                self.time_column = time_column
-                logs.append(f"time column '{time_column}'")
+            self.infer_time_column(verbose=False)
+            if self.has_time_column():
+                logs.append(f"time column '{self._time_column}'")
 
         if verbose and len(logs) > 0:
             print(f"Detected {' and '.join(logs)} in table '{self.name}'")
@@ -468,6 +497,36 @@ class Table(ABC):
             end_time_col=self._end_time_column,
         )
 
+    @cached_property
+    def _source_column_dict(self) -> dict[str, SourceColumn]:
+        source_columns = self._get_source_columns()
+        if len(source_columns) == 0:
+            raise ValueError(f"Table '{self.name}' does not hold any column "
+                             f"with a supported data type")
+        return {column.name: column for column in source_columns}
+
+    @cached_property
+    def _source_sample_df(self) -> pd.DataFrame:
+        return self._get_source_sample_df()
+
+    @property
+    def _source_primary_key(self) -> str | None:
+        primary_keys = [
+            column.name for column in self._source_column_dict.values()
+            if column.is_primary_key
+        ]
+        if len(primary_keys) == 1:  # NOTE No composite keys yet.
+            return primary_keys[0]
+
+        return None
+
+    @cached_property
+    def _num_rows(self) -> int | None:
+        return self._get_num_rows()
+
+    def _sample_current_df(self, columns: Sequence[str]) -> pd.DataFrame:
+        return self._source_sample_df[columns]
+
     # Python builtins ########################################################
 
     def __hash__(self) -> int:
@@ -496,45 +555,21 @@ class Table(ABC):
                 f' end_time_column={self._end_time_column},\n'
                 f')')
 
-    # Abstract method ########################################################
-
-    @cached_property
-    def _source_column_dict(self) -> Dict[str, SourceColumn]:
-        return {col.name: col for col in self._get_source_columns()}
+    # Abstract Methods #######################################################
 
+    @property
     @abstractmethod
-    def _get_source_columns(self) -> List[SourceColumn]:
-        pass
-
-    @cached_property
-    def _source_foreign_key_dict(self) -> Dict[str, SourceForeignKey]:
-        fkeys = self._get_source_foreign_keys()
-        # NOTE Drop all keys that link to different primary keys in the same
-        # table since we don't support composite keys yet:
-        table_pkeys: Dict[str, Set[str]] = defaultdict(set)
-        for fkey in fkeys:
-            table_pkeys[fkey.dst_table].add(fkey.primary_key)
-        return {
-            fkey.name: fkey
-            for fkey in fkeys if len(table_pkeys[fkey.dst_table]) == 1
-        }
+    def backend(self) -> DataBackend:
+        r"""The data backend of this table."""
 
     @abstractmethod
-    def _get_source_foreign_keys(self) -> List[SourceForeignKey]:
+    def _get_source_columns(self) -> list[SourceColumn]:
         pass
 
-    @cached_property
-    def _sample_df(self) -> pd.DataFrame:
-        return self._get_sample_df()
-
     @abstractmethod
-    def _get_sample_df(self) -> pd.DataFrame:
+    def _get_source_sample_df(self) -> pd.DataFrame:
         pass
 
-    @cached_property
-    def _num_rows(self) -> Optional[int]:
-        return self._get_num_rows()
-
     @abstractmethod
-    def _get_num_rows(self) -> Optional[int]:
+    def _get_num_rows(self) -> int | None:
         pass
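Taken together, the hunks above rework the abstract surface of Table in kumoai/experimental/rfm/base/table.py: the foreign-key hooks and the _sample_df/_get_sample_df pair are removed, a backend property and _get_source_sample_df/_get_num_rows hooks take their place, primary_key now defaults to the MissingType.VALUE sentinel, and primary-key/time-column detection is exposed as the chainable infer_primary_key()/infer_time_column() helpers. The sketch below is illustrative only: the InMemoryTable subclass, its DataFrame field, and the stubbed bodies are invented here to show the shape a concrete subclass takes after this change, not code shipped in the package.

import pandas as pd

from kumoai.experimental.rfm.base import DataBackend, SourceColumn
from kumoai.experimental.rfm.base.table import Table


class InMemoryTable(Table):  # hypothetical subclass, for illustration only
    def __init__(self, name: str, df: pd.DataFrame) -> None:
        self._df = df
        # `primary_key` now defaults to `MissingType.VALUE`, so omitting it
        # triggers inference from source column metadata where available:
        super().__init__(name=name)

    @property
    def backend(self) -> DataBackend:  # new abstract property in this diff
        ...  # return the appropriate `DataBackend` member (not shown here)

    def _get_source_columns(self) -> list[SourceColumn]:
        ...  # build `SourceColumn` entries from `self._df` (constructor not shown here)

    def _get_source_sample_df(self) -> pd.DataFrame:  # renamed from `_get_sample_df`
        return self._df.head(1_000)

    def _get_num_rows(self) -> int | None:
        return len(self._df)


# The new helpers return `Self`, so they chain, and `infer_metadata()` now
# simply delegates to them:
#
#     table = InMemoryTable('users', users_df)
#     table.infer_primary_key().infer_time_column()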