kumoai 2.13.0.dev202511211730__py3-none-any.whl → 2.14.0.dev202512141732__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. kumoai/__init__.py +12 -0
  2. kumoai/_version.py +1 -1
  3. kumoai/client/pquery.py +6 -2
  4. kumoai/connector/utils.py +23 -2
  5. kumoai/experimental/rfm/__init__.py +20 -45
  6. kumoai/experimental/rfm/backend/__init__.py +0 -0
  7. kumoai/experimental/rfm/backend/local/__init__.py +42 -0
  8. kumoai/experimental/rfm/{local_graph_store.py → backend/local/graph_store.py} +37 -90
  9. kumoai/experimental/rfm/backend/local/sampler.py +313 -0
  10. kumoai/experimental/rfm/backend/local/table.py +119 -0
  11. kumoai/experimental/rfm/backend/snow/__init__.py +37 -0
  12. kumoai/experimental/rfm/backend/snow/sampler.py +119 -0
  13. kumoai/experimental/rfm/backend/snow/table.py +135 -0
  14. kumoai/experimental/rfm/backend/sqlite/__init__.py +32 -0
  15. kumoai/experimental/rfm/backend/sqlite/sampler.py +112 -0
  16. kumoai/experimental/rfm/backend/sqlite/table.py +115 -0
  17. kumoai/experimental/rfm/base/__init__.py +23 -0
  18. kumoai/experimental/rfm/base/column.py +66 -0
  19. kumoai/experimental/rfm/base/sampler.py +773 -0
  20. kumoai/experimental/rfm/base/source.py +19 -0
  21. kumoai/experimental/rfm/{local_table.py → base/table.py} +152 -141
  22. kumoai/experimental/rfm/{local_graph.py → graph.py} +352 -80
  23. kumoai/experimental/rfm/infer/__init__.py +6 -0
  24. kumoai/experimental/rfm/infer/dtype.py +79 -0
  25. kumoai/experimental/rfm/infer/pkey.py +126 -0
  26. kumoai/experimental/rfm/infer/time_col.py +62 -0
  27. kumoai/experimental/rfm/pquery/pandas_executor.py +1 -1
  28. kumoai/experimental/rfm/rfm.py +224 -167
  29. kumoai/experimental/rfm/sagemaker.py +11 -3
  30. kumoai/pquery/predictive_query.py +10 -6
  31. kumoai/testing/decorators.py +1 -1
  32. kumoai/testing/snow.py +50 -0
  33. kumoai/utils/__init__.py +2 -0
  34. kumoai/utils/sql.py +3 -0
  35. {kumoai-2.13.0.dev202511211730.dist-info → kumoai-2.14.0.dev202512141732.dist-info}/METADATA +9 -8
  36. {kumoai-2.13.0.dev202511211730.dist-info → kumoai-2.14.0.dev202512141732.dist-info}/RECORD +39 -23
  37. kumoai/experimental/rfm/local_graph_sampler.py +0 -182
  38. kumoai/experimental/rfm/local_pquery_driver.py +0 -689
  39. kumoai/experimental/rfm/utils.py +0 -344
  40. {kumoai-2.13.0.dev202511211730.dist-info → kumoai-2.14.0.dev202512141732.dist-info}/WHEEL +0 -0
  41. {kumoai-2.13.0.dev202511211730.dist-info → kumoai-2.14.0.dev202512141732.dist-info}/licenses/LICENSE +0 -0
  42. {kumoai-2.13.0.dev202511211730.dist-info → kumoai-2.14.0.dev202512141732.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,19 @@
1
from dataclasses import dataclass

from kumoapi.typing import Dtype


@dataclass
class SourceColumn:
    r"""Schema metadata for a single column of a backend source table."""
    name: str  # Column name as reported by the source table.
    dtype: Dtype  # Declared data type of the column.
    is_primary_key: bool  # Whether the column is (part of) the primary key.
    is_unique_key: bool  # Whether the column carries a uniqueness constraint.
    is_nullable: bool  # Whether the column may contain NULL values.


@dataclass
class SourceForeignKey:
    r"""A foreign key reference from a source table to another table."""
    name: str  # Name of the foreign key column in the referencing table.
    dst_table: str  # Name of the referenced (destination) table.
    primary_key: str  # Name of the referenced primary key column.
@@ -1,115 +1,40 @@
1
- from dataclasses import dataclass
2
- from typing import Any, Dict, List, Optional
1
+ from abc import ABC, abstractmethod
2
+ from collections import defaultdict
3
+ from functools import cached_property
4
+ from typing import Dict, List, Optional, Sequence, Set
3
5
 
4
6
  import pandas as pd
5
7
  from kumoapi.source_table import UnavailableSourceTable
6
8
  from kumoapi.table import Column as ColumnDefinition
7
9
  from kumoapi.table import TableDefinition
8
- from kumoapi.typing import Dtype, Stype
10
+ from kumoapi.typing import Stype
9
11
  from typing_extensions import Self
10
12
 
11
- from kumoai import in_notebook
12
- from kumoai.experimental.rfm import utils
13
-
14
-
15
- @dataclass(init=False, repr=False, eq=False)
16
- class Column:
17
- stype: Stype
18
-
19
- def __init__(
20
- self,
21
- name: str,
22
- dtype: Dtype,
23
- stype: Stype,
24
- is_primary_key: bool = False,
25
- is_time_column: bool = False,
26
- is_end_time_column: bool = False,
27
- ) -> None:
28
- self._name = name
29
- self._dtype = Dtype(dtype)
30
- self._is_primary_key = is_primary_key
31
- self._is_time_column = is_time_column
32
- self._is_end_time_column = is_end_time_column
33
- self.stype = Stype(stype)
34
-
35
- @property
36
- def name(self) -> str:
37
- return self._name
38
-
39
- @property
40
- def dtype(self) -> Dtype:
41
- return self._dtype
42
-
43
- def __setattr__(self, key: str, val: Any) -> None:
44
- if key == 'stype':
45
- if isinstance(val, str):
46
- val = Stype(val)
47
- assert isinstance(val, Stype)
48
- if not val.supports_dtype(self.dtype):
49
- raise ValueError(f"Column '{self.name}' received an "
50
- f"incompatible semantic type (got "
51
- f"dtype='{self.dtype}' and stype='{val}')")
52
- if self._is_primary_key and val != Stype.ID:
53
- raise ValueError(f"Primary key '{self.name}' must have 'ID' "
54
- f"semantic type (got '{val}')")
55
- if self._is_time_column and val != Stype.timestamp:
56
- raise ValueError(f"Time column '{self.name}' must have "
57
- f"'timestamp' semantic type (got '{val}')")
58
- if self._is_end_time_column and val != Stype.timestamp:
59
- raise ValueError(f"End time column '{self.name}' must have "
60
- f"'timestamp' semantic type (got '{val}')")
61
-
62
- super().__setattr__(key, val)
63
-
64
- def __hash__(self) -> int:
65
- return hash((self.name, self.stype, self.dtype))
66
-
67
- def __eq__(self, other: Any) -> bool:
68
- if not isinstance(other, Column):
69
- return False
70
- return hash(self) == hash(other)
71
-
72
- def __repr__(self) -> str:
73
- return (f'{self.__class__.__name__}(name={self.name}, '
74
- f'stype={self.stype}, dtype={self.dtype})')
75
-
76
-
77
- class LocalTable:
78
- r"""A table backed by a :class:`pandas.DataFrame`.
79
-
80
- A :class:`LocalTable` fully specifies the relevant metadata, *i.e.*
81
- selected columns, column semantic types, primary keys and time columns.
82
- :class:`LocalTable` is used to create a :class:`LocalGraph`.
83
-
84
- .. code-block:: python
85
-
86
- import pandas as pd
87
- import kumoai.experimental.rfm as rfm
88
-
89
- # Load data from a CSV file:
90
- df = pd.read_csv("data.csv")
91
-
92
- # Create a table from a `pandas.DataFrame` and infer its metadata ...
93
- table = rfm.LocalTable(df, name="my_table").infer_metadata()
94
-
95
- # ... or create a table explicitly:
96
- table = rfm.LocalTable(
97
- df=df,
98
- name="my_table",
99
- primary_key="id",
100
- time_column="time",
101
- end_time_column=None,
102
- )
103
-
104
- # Verify metadata:
105
- table.print_metadata()
106
-
107
- # Change the semantic type of a column:
108
- table[column].stype = "text"
13
+ from kumoai import in_notebook, in_snowflake_notebook
14
+ from kumoai.experimental.rfm.base import (
15
+ Column,
16
+ DataBackend,
17
+ SourceColumn,
18
+ SourceForeignKey,
19
+ )
20
+ from kumoai.experimental.rfm.infer import (
21
+ contains_categorical,
22
+ contains_id,
23
+ contains_multicategorical,
24
+ contains_timestamp,
25
+ infer_primary_key,
26
+ infer_time_column,
27
+ )
28
+
29
+
30
+ class Table(ABC):
31
+ r"""A :class:`Table` fully specifies the relevant metadata of a single
32
+ table, *i.e.* its selected columns, data types, semantic types, primary
33
+ keys and time columns.
109
34
 
110
35
  Args:
111
- df: The data frame to create the table from.
112
- name: The name of the table.
36
+ name: The name of this table.
37
+ columns: The selected columns of this table.
113
38
  primary_key: The name of the primary key of this table, if it exists.
114
39
  time_column: The name of the time column of this table, if it exists.
115
40
  end_time_column: The name of the end time column of this table, if it
@@ -117,46 +42,62 @@ class LocalTable:
117
42
  """
118
43
    def __init__(
        self,
        name: str,
        columns: Optional[Sequence[str]] = None,
        primary_key: Optional[str] = None,
        time_column: Optional[str] = None,
        end_time_column: Optional[str] = None,
    ) -> None:

        self._name = name
        self._primary_key: Optional[str] = None
        self._time_column: Optional[str] = None
        self._end_time_column: Optional[str] = None

        # Fail early if the backend reports no column with a supported dtype:
        if len(self._source_column_dict) == 0:
            raise ValueError(f"Table '{name}' does not hold any column with "
                             f"a supported data type")

        # Prefer a primary key declared by the source schema; a conflicting
        # user-supplied key is an error rather than a silent override:
        primary_keys = [
            column.name for column in self._source_column_dict.values()
            if column.is_primary_key
        ]
        if len(primary_keys) == 1:  # NOTE No composite keys yet.
            if primary_key is not None and primary_key != primary_keys[0]:
                raise ValueError(f"Found duplicate primary key "
                                 f"definition '{primary_key}' and "
                                 f"'{primary_keys[0]}' in table '{name}'")
            primary_key = primary_keys[0]

        # Fall back to a unique key only when it is unambiguous (exactly one):
        unique_keys = [
            column.name for column in self._source_column_dict.values()
            if column.is_unique_key
        ]
        if primary_key is None and len(unique_keys) == 1:
            primary_key = unique_keys[0]

        # Register the selected columns (all source columns by default):
        self._columns: Dict[str, Column] = {}
        for column_name in columns or list(self._source_column_dict.keys()):
            self.add_column(column_name)

        # Key/time columns are added on demand if not already selected;
        # assigning the properties triggers their validation logic.
        if primary_key is not None:
            if primary_key not in self:
                self.add_column(primary_key)
            self.primary_key = primary_key

        if time_column is not None:
            if time_column not in self:
                self.add_column(time_column)
            self.time_column = time_column

        if end_time_column is not None:
            if end_time_column not in self:
                self.add_column(end_time_column)
            self.end_time_column = end_time_column
97
 
157
98
  @property
158
99
  def name(self) -> str:
159
- r"""The name of the table."""
100
+ r"""The name of this table."""
160
101
  return self._name
161
102
 
162
103
  # Data column #############################################################
@@ -200,24 +141,35 @@ class LocalTable:
200
141
  raise KeyError(f"Column '{name}' already exists in table "
201
142
  f"'{self.name}'")
202
143
 
203
- if name not in self._data.columns:
204
- raise KeyError(f"Column '{name}' does not exist in the underyling "
205
- f"data frame")
144
+ if name not in self._source_column_dict:
145
+ raise KeyError(f"Column '{name}' does not exist in the underlying "
146
+ f"source table")
206
147
 
207
148
  try:
208
- dtype = utils.to_dtype(self._data[name])
149
+ dtype = self._source_column_dict[name].dtype
209
150
  except Exception as e:
210
- raise RuntimeError(f"Data type inference for column '{name}' in "
211
- f"table '{self.name}' failed. Consider "
212
- f"changing the data type of the column or "
213
- f"removing it from the table.") from e
151
+ raise RuntimeError(f"Could not obtain data type for column "
152
+ f"'{name}' in table '{self.name}'. Change "
153
+ f"the data type of the column in the source "
154
+ f"table or remove it from the table.") from e
155
+
214
156
  try:
215
- stype = utils.infer_stype(self._data[name], name, dtype)
157
+ ser = self._sample_df[name]
158
+ if contains_id(ser, name, dtype):
159
+ stype = Stype.ID
160
+ elif contains_timestamp(ser, name, dtype):
161
+ stype = Stype.timestamp
162
+ elif contains_multicategorical(ser, name, dtype):
163
+ stype = Stype.multicategorical
164
+ elif contains_categorical(ser, name, dtype):
165
+ stype = Stype.categorical
166
+ else:
167
+ stype = dtype.default_stype
216
168
  except Exception as e:
217
- raise RuntimeError(f"Semantic type inference for column '{name}' "
218
- f"in table '{self.name}' failed. Consider "
219
- f"changing the data type of the column or "
220
- f"removing it from the table.") from e
169
+ raise RuntimeError(f"Could not obtain semantic type for column "
170
+ f"'{name}' in table '{self.name}'. Change "
171
+ f"the data type of the column in the source "
172
+ f"table or remove it from the table.") from e
221
173
 
222
174
  self._columns[name] = Column(
223
175
  name=name,
@@ -432,12 +384,20 @@ class LocalTable:
432
384
  })
433
385
 
434
386
  def print_metadata(self) -> None:
435
- r"""Prints the :meth:`~LocalTable.metadata` of the table."""
436
- if in_notebook():
387
+ r"""Prints the :meth:`~metadata` of this table."""
388
+ num_rows_repr = ''
389
+ if self._num_rows is not None:
390
+ num_rows_repr = f' ({self._num_rows:,} rows)'
391
+
392
+ if in_snowflake_notebook():
393
+ import streamlit as st
394
+ md_repr = f"### 🏷️ Metadata of Table `{self.name}`{num_rows_repr}"
395
+ st.markdown(md_repr)
396
+ st.dataframe(self.metadata, hide_index=True)
397
+ elif in_notebook():
437
398
  from IPython.display import Markdown, display
438
- display(
439
- Markdown(f"### 🏷️ Metadata of Table `{self.name}` "
440
- f"({len(self._data):,} rows)"))
399
+ md_repr = f"### 🏷️ Metadata of Table `{self.name}`{num_rows_repr}"
400
+ display(Markdown(md_repr))
441
401
  df = self.metadata
442
402
  try:
443
403
  if hasattr(df.style, 'hide'):
@@ -447,8 +407,7 @@ class LocalTable:
447
407
  except ImportError:
448
408
  print(df.to_string(index=False)) # missing jinja2
449
409
  else:
450
- print(f"🏷️ Metadata of Table '{self.name}' "
451
- f"({len(self._data):,} rows):")
410
+ print(f"🏷️ Metadata of Table '{self.name}'{num_rows_repr}")
452
411
  print(self.metadata.to_string(index=False))
453
412
 
454
413
  def infer_metadata(self, verbose: bool = True) -> Self:
@@ -478,9 +437,9 @@ class LocalTable:
478
437
  column.name for column in self.columns if is_candidate(column)
479
438
  ]
480
439
 
481
- if primary_key := utils.detect_primary_key(
440
+ if primary_key := infer_primary_key(
482
441
  table_name=self.name,
483
- df=self._data,
442
+ df=self._sample_df,
484
443
  candidates=candidates,
485
444
  ):
486
445
  self.primary_key = primary_key
@@ -493,7 +452,10 @@ class LocalTable:
493
452
  if column.stype == Stype.timestamp
494
453
  and column.name != self._end_time_column
495
454
  ]
496
- if time_column := utils.detect_time_column(self._data, candidates):
455
+ if time_column := infer_time_column(
456
+ df=self._sample_df,
457
+ candidates=candidates,
458
+ ):
497
459
  self.time_column = time_column
498
460
  logs.append(f"time column '{time_column}'")
499
461
 
@@ -543,3 +505,52 @@ class LocalTable:
543
505
  f' time_column={self._time_column},\n'
544
506
  f' end_time_column={self._end_time_column},\n'
545
507
  f')')
508
+
509
    # Abstract Methods ########################################################

    @property
    @abstractmethod
    def backend(self) -> DataBackend:
        r"""The data backend of this table."""
        pass

    @cached_property
    def _source_column_dict(self) -> Dict[str, SourceColumn]:
        # Cached name -> SourceColumn mapping, built once from the backend.
        return {col.name: col for col in self._get_source_columns()}

    @abstractmethod
    def _get_source_columns(self) -> List[SourceColumn]:
        # Backend hook: return the schema of all supported source columns.
        pass
524
+
525
+ @cached_property
526
+ def _source_foreign_key_dict(self) -> Dict[str, SourceForeignKey]:
527
+ fkeys = self._get_source_foreign_keys()
528
+ # NOTE Drop all keys that link to different primary keys in the same
529
+ # table since we don't support composite keys yet:
530
+ table_pkeys: Dict[str, Set[str]] = defaultdict(set)
531
+ for fkey in fkeys:
532
+ table_pkeys[fkey.dst_table].add(fkey.primary_key)
533
+ return {
534
+ fkey.name: fkey
535
+ for fkey in fkeys if len(table_pkeys[fkey.dst_table]) == 1
536
+ }
537
+
538
    @abstractmethod
    def _get_source_foreign_keys(self) -> List[SourceForeignKey]:
        # Backend hook: return all declared foreign keys of this table.
        pass

    @cached_property
    def _sample_df(self) -> pd.DataFrame:
        # Cached row sample; used for semantic-type and metadata inference.
        return self._get_sample_df()

    @abstractmethod
    def _get_sample_df(self) -> pd.DataFrame:
        # Backend hook: fetch a sample of rows as a pandas DataFrame.
        pass

    @cached_property
    def _num_rows(self) -> Optional[int]:
        # Cached total row count; ``None`` when the backend cannot provide it.
        return self._get_num_rows()

    @abstractmethod
    def _get_num_rows(self) -> Optional[int]:
        # Backend hook: return the total number of rows, if known.
        pass