kumoai 2.13.0.dev202511261731__cp310-cp310-win_amd64.whl → 2.13.0.dev202512021731__cp310-cp310-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kumoai/_version.py +1 -1
- kumoai/connector/utils.py +23 -2
- kumoai/experimental/rfm/__init__.py +20 -45
- kumoai/experimental/rfm/backend/__init__.py +0 -0
- kumoai/experimental/rfm/backend/local/__init__.py +38 -0
- kumoai/experimental/rfm/backend/local/table.py +244 -0
- kumoai/experimental/rfm/backend/snow/__init__.py +32 -0
- kumoai/experimental/rfm/backend/sqlite/__init__.py +30 -0
- kumoai/experimental/rfm/backend/sqlite/table.py +124 -0
- kumoai/experimental/rfm/base/__init__.py +7 -0
- kumoai/experimental/rfm/base/column.py +66 -0
- kumoai/experimental/rfm/{local_table.py → base/table.py} +71 -139
- kumoai/experimental/rfm/{local_graph.py → graph.py} +144 -57
- kumoai/experimental/rfm/infer/__init__.py +2 -0
- kumoai/experimental/rfm/infer/stype.py +35 -0
- kumoai/experimental/rfm/local_graph_store.py +12 -11
- kumoai/experimental/rfm/rfm.py +5 -5
- kumoai/experimental/rfm/sagemaker.py +11 -3
- kumoai/experimental/rfm/utils.py +1 -120
- kumoai/kumolib.cp310-win_amd64.pyd +0 -0
- kumoai/testing/decorators.py +1 -1
- {kumoai-2.13.0.dev202511261731.dist-info → kumoai-2.13.0.dev202512021731.dist-info}/METADATA +8 -8
- {kumoai-2.13.0.dev202511261731.dist-info → kumoai-2.13.0.dev202512021731.dist-info}/RECORD +26 -17
- {kumoai-2.13.0.dev202511261731.dist-info → kumoai-2.13.0.dev202512021731.dist-info}/WHEEL +0 -0
- {kumoai-2.13.0.dev202511261731.dist-info → kumoai-2.13.0.dev202512021731.dist-info}/licenses/LICENSE +0 -0
- {kumoai-2.13.0.dev202511261731.dist-info → kumoai-2.13.0.dev202512021731.dist-info}/top_level.txt +0 -0
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
from
|
|
2
|
-
from typing import
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import Dict, List, Optional, Sequence, Tuple
|
|
3
3
|
|
|
4
4
|
import pandas as pd
|
|
5
5
|
from kumoapi.source_table import UnavailableSourceTable
|
|
@@ -9,107 +9,17 @@ from kumoapi.typing import Dtype, Stype
|
|
|
9
9
|
from typing_extensions import Self
|
|
10
10
|
|
|
11
11
|
from kumoai import in_notebook
|
|
12
|
-
from kumoai.experimental.rfm import
|
|
12
|
+
from kumoai.experimental.rfm.base import Column
|
|
13
13
|
|
|
14
14
|
|
|
15
|
-
|
|
16
|
-
class
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
def __init__(
|
|
20
|
-
self,
|
|
21
|
-
name: str,
|
|
22
|
-
dtype: Dtype,
|
|
23
|
-
stype: Stype,
|
|
24
|
-
is_primary_key: bool = False,
|
|
25
|
-
is_time_column: bool = False,
|
|
26
|
-
is_end_time_column: bool = False,
|
|
27
|
-
) -> None:
|
|
28
|
-
self._name = name
|
|
29
|
-
self._dtype = Dtype(dtype)
|
|
30
|
-
self._is_primary_key = is_primary_key
|
|
31
|
-
self._is_time_column = is_time_column
|
|
32
|
-
self._is_end_time_column = is_end_time_column
|
|
33
|
-
self.stype = Stype(stype)
|
|
34
|
-
|
|
35
|
-
@property
|
|
36
|
-
def name(self) -> str:
|
|
37
|
-
return self._name
|
|
38
|
-
|
|
39
|
-
@property
|
|
40
|
-
def dtype(self) -> Dtype:
|
|
41
|
-
return self._dtype
|
|
42
|
-
|
|
43
|
-
def __setattr__(self, key: str, val: Any) -> None:
|
|
44
|
-
if key == 'stype':
|
|
45
|
-
if isinstance(val, str):
|
|
46
|
-
val = Stype(val)
|
|
47
|
-
assert isinstance(val, Stype)
|
|
48
|
-
if not val.supports_dtype(self.dtype):
|
|
49
|
-
raise ValueError(f"Column '{self.name}' received an "
|
|
50
|
-
f"incompatible semantic type (got "
|
|
51
|
-
f"dtype='{self.dtype}' and stype='{val}')")
|
|
52
|
-
if self._is_primary_key and val != Stype.ID:
|
|
53
|
-
raise ValueError(f"Primary key '{self.name}' must have 'ID' "
|
|
54
|
-
f"semantic type (got '{val}')")
|
|
55
|
-
if self._is_time_column and val != Stype.timestamp:
|
|
56
|
-
raise ValueError(f"Time column '{self.name}' must have "
|
|
57
|
-
f"'timestamp' semantic type (got '{val}')")
|
|
58
|
-
if self._is_end_time_column and val != Stype.timestamp:
|
|
59
|
-
raise ValueError(f"End time column '{self.name}' must have "
|
|
60
|
-
f"'timestamp' semantic type (got '{val}')")
|
|
61
|
-
|
|
62
|
-
super().__setattr__(key, val)
|
|
63
|
-
|
|
64
|
-
def __hash__(self) -> int:
|
|
65
|
-
return hash((self.name, self.stype, self.dtype))
|
|
66
|
-
|
|
67
|
-
def __eq__(self, other: Any) -> bool:
|
|
68
|
-
if not isinstance(other, Column):
|
|
69
|
-
return False
|
|
70
|
-
return hash(self) == hash(other)
|
|
71
|
-
|
|
72
|
-
def __repr__(self) -> str:
|
|
73
|
-
return (f'{self.__class__.__name__}(name={self.name}, '
|
|
74
|
-
f'stype={self.stype}, dtype={self.dtype})')
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
class LocalTable:
|
|
78
|
-
r"""A table backed by a :class:`pandas.DataFrame`.
|
|
79
|
-
|
|
80
|
-
A :class:`LocalTable` fully specifies the relevant metadata, *i.e.*
|
|
81
|
-
selected columns, column semantic types, primary keys and time columns.
|
|
82
|
-
:class:`LocalTable` is used to create a :class:`LocalGraph`.
|
|
83
|
-
|
|
84
|
-
.. code-block:: python
|
|
85
|
-
|
|
86
|
-
import pandas as pd
|
|
87
|
-
import kumoai.experimental.rfm as rfm
|
|
88
|
-
|
|
89
|
-
# Load data from a CSV file:
|
|
90
|
-
df = pd.read_csv("data.csv")
|
|
91
|
-
|
|
92
|
-
# Create a table from a `pandas.DataFrame` and infer its metadata ...
|
|
93
|
-
table = rfm.LocalTable(df, name="my_table").infer_metadata()
|
|
94
|
-
|
|
95
|
-
# ... or create a table explicitly:
|
|
96
|
-
table = rfm.LocalTable(
|
|
97
|
-
df=df,
|
|
98
|
-
name="my_table",
|
|
99
|
-
primary_key="id",
|
|
100
|
-
time_column="time",
|
|
101
|
-
end_time_column=None,
|
|
102
|
-
)
|
|
103
|
-
|
|
104
|
-
# Verify metadata:
|
|
105
|
-
table.print_metadata()
|
|
106
|
-
|
|
107
|
-
# Change the semantic type of a column:
|
|
108
|
-
table[column].stype = "text"
|
|
15
|
+
class Table(ABC):
|
|
16
|
+
r"""A :class:`Table` fully specifies the relevant metadata of a single
|
|
17
|
+
table, *i.e.* its selected columns, data types, semantic types, primary
|
|
18
|
+
keys and time columns.
|
|
109
19
|
|
|
110
20
|
Args:
|
|
111
|
-
|
|
112
|
-
|
|
21
|
+
name: The name of this table.
|
|
22
|
+
columns: The selected columns of this table.
|
|
113
23
|
primary_key: The name of the primary key of this table, if it exists.
|
|
114
24
|
time_column: The name of the time column of this table, if it exists.
|
|
115
25
|
end_time_column: The name of the end time column of this table, if it
|
|
@@ -117,46 +27,40 @@ class LocalTable:
|
|
|
117
27
|
"""
|
|
118
28
|
def __init__(
|
|
119
29
|
self,
|
|
120
|
-
df: pd.DataFrame,
|
|
121
30
|
name: str,
|
|
31
|
+
columns: Optional[Sequence[str]] = None,
|
|
122
32
|
primary_key: Optional[str] = None,
|
|
123
33
|
time_column: Optional[str] = None,
|
|
124
34
|
end_time_column: Optional[str] = None,
|
|
125
35
|
) -> None:
|
|
126
36
|
|
|
127
|
-
if df.empty:
|
|
128
|
-
raise ValueError("Data frame must have at least one row")
|
|
129
|
-
if isinstance(df.columns, pd.MultiIndex):
|
|
130
|
-
raise ValueError("Data frame must not have a multi-index")
|
|
131
|
-
if not df.columns.is_unique:
|
|
132
|
-
raise ValueError("Data frame must have unique column names")
|
|
133
|
-
if any(col == '' for col in df.columns):
|
|
134
|
-
raise ValueError("Data frame must have non-empty column names")
|
|
135
|
-
|
|
136
|
-
df = df.copy(deep=False)
|
|
137
|
-
|
|
138
|
-
self._data = df
|
|
139
37
|
self._name = name
|
|
140
38
|
self._primary_key: Optional[str] = None
|
|
141
39
|
self._time_column: Optional[str] = None
|
|
142
40
|
self._end_time_column: Optional[str] = None
|
|
143
41
|
|
|
144
42
|
self._columns: Dict[str, Column] = {}
|
|
145
|
-
for column_name in
|
|
43
|
+
for column_name in columns or []:
|
|
146
44
|
self.add_column(column_name)
|
|
147
45
|
|
|
148
46
|
if primary_key is not None:
|
|
47
|
+
if primary_key not in self:
|
|
48
|
+
self.add_column(primary_key)
|
|
149
49
|
self.primary_key = primary_key
|
|
150
50
|
|
|
151
51
|
if time_column is not None:
|
|
52
|
+
if time_column not in self:
|
|
53
|
+
self.add_column(time_column)
|
|
152
54
|
self.time_column = time_column
|
|
153
55
|
|
|
154
56
|
if end_time_column is not None:
|
|
57
|
+
if end_time_column not in self:
|
|
58
|
+
self.add_column(end_time_column)
|
|
155
59
|
self.end_time_column = end_time_column
|
|
156
60
|
|
|
157
61
|
@property
|
|
158
62
|
def name(self) -> str:
|
|
159
|
-
r"""The name of
|
|
63
|
+
r"""The name of this table."""
|
|
160
64
|
return self._name
|
|
161
65
|
|
|
162
66
|
# Data column #############################################################
|
|
@@ -200,24 +104,25 @@ class LocalTable:
|
|
|
200
104
|
raise KeyError(f"Column '{name}' already exists in table "
|
|
201
105
|
f"'{self.name}'")
|
|
202
106
|
|
|
203
|
-
if
|
|
204
|
-
raise KeyError(f"Column '{name}' does not exist in the
|
|
205
|
-
f"
|
|
107
|
+
if not self._has_source_column(name):
|
|
108
|
+
raise KeyError(f"Column '{name}' does not exist in the underlying "
|
|
109
|
+
f"source table")
|
|
206
110
|
|
|
207
111
|
try:
|
|
208
|
-
dtype =
|
|
112
|
+
dtype = self._get_source_dtype(name)
|
|
209
113
|
except Exception as e:
|
|
210
|
-
raise RuntimeError(f"
|
|
211
|
-
f"table '{self.name}'
|
|
212
|
-
f"
|
|
213
|
-
f"
|
|
114
|
+
raise RuntimeError(f"Could not obtain data type for column "
|
|
115
|
+
f"'{name}' in table '{self.name}'. Change "
|
|
116
|
+
f"the data type of the column in the source "
|
|
117
|
+
f"table or remove it from the table.") from e
|
|
118
|
+
|
|
214
119
|
try:
|
|
215
|
-
stype =
|
|
120
|
+
stype = self._get_source_stype(name, dtype)
|
|
216
121
|
except Exception as e:
|
|
217
|
-
raise RuntimeError(f"
|
|
218
|
-
f"in table '{self.name}'
|
|
219
|
-
f"
|
|
220
|
-
f"
|
|
122
|
+
raise RuntimeError(f"Could not obtain semantic type for column "
|
|
123
|
+
f"'{name}' in table '{self.name}'. Change "
|
|
124
|
+
f"the data type of the column in the source "
|
|
125
|
+
f"table or remove it from the table.") from e
|
|
221
126
|
|
|
222
127
|
self._columns[name] = Column(
|
|
223
128
|
name=name,
|
|
@@ -432,12 +337,14 @@ class LocalTable:
|
|
|
432
337
|
})
|
|
433
338
|
|
|
434
339
|
def print_metadata(self) -> None:
|
|
435
|
-
r"""Prints the :meth:`~
|
|
340
|
+
r"""Prints the :meth:`~metadata` of this table."""
|
|
341
|
+
num_rows = self._num_rows()
|
|
342
|
+
num_rows_repr = ' ({num_rows:,} rows)' if num_rows is not None else ''
|
|
343
|
+
|
|
436
344
|
if in_notebook():
|
|
437
345
|
from IPython.display import Markdown, display
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
f"({len(self._data):,} rows)"))
|
|
346
|
+
md_repr = f"### 🏷️ Metadata of Table `{self.name}`{num_rows_repr}"
|
|
347
|
+
display(Markdown(md_repr))
|
|
441
348
|
df = self.metadata
|
|
442
349
|
try:
|
|
443
350
|
if hasattr(df.style, 'hide'):
|
|
@@ -447,8 +354,7 @@ class LocalTable:
|
|
|
447
354
|
except ImportError:
|
|
448
355
|
print(df.to_string(index=False)) # missing jinja2
|
|
449
356
|
else:
|
|
450
|
-
print(f"🏷️ Metadata of Table '{self.name}'
|
|
451
|
-
f"({len(self._data):,} rows):")
|
|
357
|
+
print(f"🏷️ Metadata of Table '{self.name}'{num_rows_repr}")
|
|
452
358
|
print(self.metadata.to_string(index=False))
|
|
453
359
|
|
|
454
360
|
def infer_metadata(self, verbose: bool = True) -> Self:
|
|
@@ -478,11 +384,7 @@ class LocalTable:
|
|
|
478
384
|
column.name for column in self.columns if is_candidate(column)
|
|
479
385
|
]
|
|
480
386
|
|
|
481
|
-
if primary_key :=
|
|
482
|
-
table_name=self.name,
|
|
483
|
-
df=self._data,
|
|
484
|
-
candidates=candidates,
|
|
485
|
-
):
|
|
387
|
+
if primary_key := self._infer_primary_key(candidates):
|
|
486
388
|
self.primary_key = primary_key
|
|
487
389
|
logs.append(f"primary key '{primary_key}'")
|
|
488
390
|
|
|
@@ -493,7 +395,7 @@ class LocalTable:
|
|
|
493
395
|
if column.stype == Stype.timestamp
|
|
494
396
|
and column.name != self._end_time_column
|
|
495
397
|
]
|
|
496
|
-
if time_column :=
|
|
398
|
+
if time_column := self._infer_time_column(candidates):
|
|
497
399
|
self.time_column = time_column
|
|
498
400
|
logs.append(f"time column '{time_column}'")
|
|
499
401
|
|
|
@@ -543,3 +445,33 @@ class LocalTable:
|
|
|
543
445
|
f' time_column={self._time_column},\n'
|
|
544
446
|
f' end_time_column={self._end_time_column},\n'
|
|
545
447
|
f')')
|
|
448
|
+
|
|
449
|
+
# Abstract method #########################################################
|
|
450
|
+
|
|
451
|
+
@abstractmethod
|
|
452
|
+
def _has_source_column(self, name: str) -> bool:
|
|
453
|
+
pass
|
|
454
|
+
|
|
455
|
+
@abstractmethod
|
|
456
|
+
def _get_source_dtype(self, name: str) -> Dtype:
|
|
457
|
+
pass
|
|
458
|
+
|
|
459
|
+
@abstractmethod
|
|
460
|
+
def _get_source_stype(self, name: str, dtype: Dtype) -> Stype:
|
|
461
|
+
pass
|
|
462
|
+
|
|
463
|
+
@abstractmethod
|
|
464
|
+
def _get_source_foreign_keys(self) -> List[Tuple[str, str, str]]:
|
|
465
|
+
pass
|
|
466
|
+
|
|
467
|
+
@abstractmethod
|
|
468
|
+
def _infer_primary_key(self, candidates: List[str]) -> Optional[str]:
|
|
469
|
+
pass
|
|
470
|
+
|
|
471
|
+
@abstractmethod
|
|
472
|
+
def _infer_time_column(self, candidates: List[str]) -> Optional[str]:
|
|
473
|
+
pass
|
|
474
|
+
|
|
475
|
+
@abstractmethod
|
|
476
|
+
def _num_rows(self) -> Optional[int]:
|
|
477
|
+
pass
|