kumoai 2.13.0.dev202512011731__cp312-cp312-macosx_11_0_arm64.whl → 2.13.0.dev202512031731__cp312-cp312-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kumoai/_version.py +1 -1
- kumoai/experimental/rfm/backend/local/table.py +18 -74
- kumoai/experimental/rfm/backend/snow/__init__.py +35 -0
- kumoai/experimental/rfm/backend/snow/table.py +95 -0
- kumoai/experimental/rfm/backend/sqlite/__init__.py +7 -0
- kumoai/experimental/rfm/backend/sqlite/table.py +56 -79
- kumoai/experimental/rfm/base/__init__.py +3 -0
- kumoai/experimental/rfm/base/source.py +18 -0
- kumoai/experimental/rfm/base/table.py +88 -21
- kumoai/experimental/rfm/graph.py +192 -39
- kumoai/experimental/rfm/infer/__init__.py +6 -0
- kumoai/experimental/rfm/infer/dtype.py +90 -0
- kumoai/experimental/rfm/infer/pkey.py +126 -0
- kumoai/experimental/rfm/infer/time_col.py +62 -0
- kumoai/experimental/rfm/local_graph_sampler.py +42 -1
- kumoai/experimental/rfm/local_graph_store.py +1 -16
- kumoai/experimental/rfm/rfm.py +1 -11
- {kumoai-2.13.0.dev202512011731.dist-info → kumoai-2.13.0.dev202512031731.dist-info}/METADATA +3 -1
- {kumoai-2.13.0.dev202512011731.dist-info → kumoai-2.13.0.dev202512031731.dist-info}/RECORD +22 -17
- kumoai/experimental/rfm/utils.py +0 -344
- {kumoai-2.13.0.dev202512011731.dist-info → kumoai-2.13.0.dev202512031731.dist-info}/WHEEL +0 -0
- {kumoai-2.13.0.dev202512011731.dist-info → kumoai-2.13.0.dev202512031731.dist-info}/licenses/LICENSE +0 -0
- {kumoai-2.13.0.dev202512011731.dist-info → kumoai-2.13.0.dev202512031731.dist-info}/top_level.txt +0 -0
kumoai/_version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = '2.13.0.
|
|
1
|
+
__version__ = '2.13.0.dev202512031731'
|
|
@@ -1,11 +1,9 @@
|
|
|
1
1
|
from typing import List, Optional
|
|
2
2
|
|
|
3
3
|
import pandas as pd
|
|
4
|
-
from kumoapi.typing import Dtype, Stype
|
|
5
|
-
from typing_extensions import Self
|
|
6
4
|
|
|
7
|
-
from kumoai.experimental.rfm import
|
|
8
|
-
from kumoai.experimental.rfm.
|
|
5
|
+
from kumoai.experimental.rfm.base import SourceColumn, SourceForeignKey, Table
|
|
6
|
+
from kumoai.experimental.rfm.infer import infer_dtype
|
|
9
7
|
|
|
10
8
|
|
|
11
9
|
class LocalTable(Table):
|
|
@@ -59,7 +57,7 @@ class LocalTable(Table):
|
|
|
59
57
|
) -> None:
|
|
60
58
|
|
|
61
59
|
if df.empty:
|
|
62
|
-
raise ValueError("Data frame
|
|
60
|
+
raise ValueError("Data frame is empty")
|
|
63
61
|
if isinstance(df.columns, pd.MultiIndex):
|
|
64
62
|
raise ValueError("Data frame must not have a multi-index")
|
|
65
63
|
if not df.columns.is_unique:
|
|
@@ -77,75 +75,21 @@ class LocalTable(Table):
|
|
|
77
75
|
end_time_column=end_time_column,
|
|
78
76
|
)
|
|
79
77
|
|
|
80
|
-
def
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
def is_candidate(column: Column) -> bool:
|
|
93
|
-
if column.stype == Stype.ID:
|
|
94
|
-
return True
|
|
95
|
-
if all(column.stype != Stype.ID for column in self.columns):
|
|
96
|
-
if self.name == column.name:
|
|
97
|
-
return True
|
|
98
|
-
if (self.name.endswith('s')
|
|
99
|
-
and self.name[:-1] == column.name):
|
|
100
|
-
return True
|
|
101
|
-
return False
|
|
102
|
-
|
|
103
|
-
candidates = [
|
|
104
|
-
column.name for column in self.columns if is_candidate(column)
|
|
105
|
-
]
|
|
106
|
-
|
|
107
|
-
if primary_key := utils.detect_primary_key(
|
|
108
|
-
table_name=self.name,
|
|
109
|
-
df=self._data,
|
|
110
|
-
candidates=candidates,
|
|
111
|
-
):
|
|
112
|
-
self.primary_key = primary_key
|
|
113
|
-
logs.append(f"primary key '{primary_key}'")
|
|
114
|
-
|
|
115
|
-
# Try to detect time column if not set:
|
|
116
|
-
if not self.has_time_column():
|
|
117
|
-
candidates = [
|
|
118
|
-
column.name for column in self.columns
|
|
119
|
-
if column.stype == Stype.timestamp
|
|
120
|
-
and column.name != self._end_time_column
|
|
121
|
-
]
|
|
122
|
-
if time_column := utils.detect_time_column(self._data, candidates):
|
|
123
|
-
self.time_column = time_column
|
|
124
|
-
logs.append(f"time column '{time_column}'")
|
|
125
|
-
|
|
126
|
-
if verbose and len(logs) > 0:
|
|
127
|
-
print(f"Detected {' and '.join(logs)} in table '{self.name}'")
|
|
128
|
-
|
|
129
|
-
return self
|
|
130
|
-
|
|
131
|
-
def _has_source_column(self, name: str) -> bool:
|
|
132
|
-
return name in self._data.columns
|
|
133
|
-
|
|
134
|
-
def _get_source_dtype(self, name: str) -> Dtype:
|
|
135
|
-
return utils.to_dtype(self._data[name])
|
|
136
|
-
|
|
137
|
-
def _get_source_stype(self, name: str, dtype: Dtype) -> Stype:
|
|
138
|
-
return utils.infer_stype(self._data[name], name, dtype)
|
|
139
|
-
|
|
140
|
-
def _infer_primary_key(self, candidates: List[str]) -> Optional[str]:
|
|
141
|
-
return utils.detect_primary_key(
|
|
142
|
-
table_name=self.name,
|
|
143
|
-
df=self._data,
|
|
144
|
-
candidates=candidates,
|
|
145
|
-
)
|
|
78
|
+
def _get_source_columns(self) -> List[SourceColumn]:
    r"""Describes every column of the wrapped data frame.

    The data type of each column is obtained by passing the column to
    :func:`infer_dtype`. No column is reported as a primary or unique key.
    """
    frame = self._data
    return [
        SourceColumn(
            name=column_name,
            dtype=infer_dtype(frame[column_name]),
            is_primary_key=False,
            is_unique_key=False,
        ) for column_name in frame.columns
    ]
|
|
87
|
+
|
|
88
|
+
def _get_source_foreign_keys(self) -> List[SourceForeignKey]:
    r"""Returns an empty list: this backend reports no foreign keys."""
    no_foreign_keys: List[SourceForeignKey] = []
    return no_foreign_keys
|
|
146
90
|
|
|
147
|
-
def
|
|
148
|
-
return
|
|
91
|
+
def _get_sample_df(self) -> pd.DataFrame:
    r"""Returns the wrapped data frame itself (not a copy) as the sample."""
    sample_df = self._data
    return sample_df
|
|
149
93
|
|
|
150
|
-
def
|
|
94
|
+
def _get_num_rows(self) -> Optional[int]:
    r"""Returns the number of rows of the wrapped data frame."""
    num_rows = len(self._data)
    return num_rows
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
from typing import Any, TypeAlias
|
|
2
|
+
|
|
3
|
+
try:
|
|
4
|
+
import snowflake.connector
|
|
5
|
+
except ImportError:
|
|
6
|
+
raise ImportError("No module named 'snowflake'. Please install Kumo SDK "
|
|
7
|
+
"with the 'snowflake' extension via "
|
|
8
|
+
"`pip install kumoai[snowflake]`.")
|
|
9
|
+
|
|
10
|
+
Connection: TypeAlias = snowflake.connector.SnowflakeConnection
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def connect(**kwargs: Any) -> Connection:
    r"""Opens a connection to a :class:`snowflake` database.

    If an active Snowpark session can be obtained, its connection is returned
    and ``kwargs`` are not used. Otherwise, a new connection is opened from
    ``kwargs``.

    Args:
        kwargs: Connection arguments, following the :class:`snowflake`
            protocol.
    """
    try:
        from snowflake.snowpark.context import get_active_session
        active_session = get_active_session()
        return active_session.connection
    except Exception:
        # Best effort only: any failure to obtain an active session (including
        # a missing `snowpark` package) falls through to an explicit connect.
        pass

    return snowflake.connector.connect(**kwargs)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
from .table import SnowTable # noqa: E402
|
|
30
|
+
|
|
31
|
+
__all__ = [
|
|
32
|
+
'connect',
|
|
33
|
+
'Connection',
|
|
34
|
+
'SnowTable',
|
|
35
|
+
]
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from typing import List, Optional, Sequence
|
|
3
|
+
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from kumoapi.typing import Dtype
|
|
6
|
+
|
|
7
|
+
from kumoai.experimental.rfm.backend.sqlite import Connection
|
|
8
|
+
from kumoai.experimental.rfm.base import SourceColumn, SourceForeignKey, Table
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class SnowTable(Table):
    r"""A table backed by a :class:`snowflake` database.

    Args:
        connection: The connection to a :class:`snowflake` database.
        name: The name of this table.
        columns: The selected columns of this table.
        primary_key: The name of the primary key of this table, if it exists.
        time_column: The name of the time column of this table, if it exists.
        end_time_column: The name of the end time column of this table, if it
            exists.
    """
    def __init__(
        self,
        connection: Connection,
        name: str,
        columns: Optional[Sequence[str]] = None,
        primary_key: Optional[str] = None,
        time_column: Optional[str] = None,
        end_time_column: Optional[str] = None,
    ) -> None:

        # Stored first since the metadata queries below depend on it.
        # NOTE(review): presumably the base initializer already reads the
        # source columns -- confirm against `Table.__init__`.
        self._connection = connection

        super().__init__(
            name=name,
            columns=columns,
            primary_key=primary_key,
            time_column=time_column,
            end_time_column=end_time_column,
        )

    def _get_source_columns(self) -> List[SourceColumn]:
        r"""Reads the column metadata via ``DESCRIBE TABLE``.

        Columns whose type is not recognized are skipped.

        Raises:
            ValueError: If the ``DESCRIBE TABLE`` statement fails.
        """
        source_columns: List[SourceColumn] = []
        with self._connection.cursor() as cursor:
            try:
                cursor.execute(f"DESCRIBE TABLE {self.name}")
            except Exception as e:
                raise ValueError(f"Table '{self.name}' does not exist") from e

            for row in cursor.fetchall():
                column, sql_type, _, _, _, is_pkey, is_unique = row[:7]

                sql_type = sql_type.strip().upper()
                if sql_type.startswith('NUMBER'):
                    # `NUMBER(precision, scale)` only holds integers if its
                    # scale is zero (or omitted); a positive scale denotes a
                    # fixed-point decimal, which is treated as a float:
                    match = re.search(r'\(\s*\d+\s*,\s*(\d+)\s*\)', sql_type)
                    if match is not None and int(match.group(1)) > 0:
                        dtype = Dtype.float
                    else:
                        dtype = Dtype.int
                elif sql_type.startswith('VARCHAR'):
                    dtype = Dtype.string
                elif sql_type == 'FLOAT':
                    dtype = Dtype.float
                elif sql_type == 'BOOLEAN':
                    dtype = Dtype.bool
                elif re.search('DATE|TIMESTAMP', sql_type):
                    dtype = Dtype.date
                else:  # Unrecognized type => do not expose the column.
                    continue

                source_column = SourceColumn(
                    name=column,
                    dtype=dtype,
                    is_primary_key=is_pkey.strip().upper() == 'Y',
                    is_unique_key=is_unique.strip().upper() == 'Y',
                )
                source_columns.append(source_column)

        return source_columns

    def _get_source_foreign_keys(self) -> List[SourceForeignKey]:
        r"""Reads the foreign keys via ``SHOW IMPORTED KEYS``.

        Each entry holds the foreign key column of this table, the table it
        points to, and the referenced column in that table.
        """
        source_fkeys: List[SourceForeignKey] = []
        with self._connection.cursor() as cursor:
            cursor.execute(f"SHOW IMPORTED KEYS IN TABLE {self.name}")
            for row in cursor.fetchall():
                _, _, _, dst_table, pkey, _, _, _, fkey = row[:9]
                source_fkeys.append(SourceForeignKey(fkey, dst_table, pkey))
        return source_fkeys

    def _get_sample_df(self) -> pd.DataFrame:
        r"""Fetches up to 1000 rows of all supported source columns."""
        with self._connection.cursor() as cursor:
            columns = ', '.join(self._source_column_dict.keys())
            cursor.execute(f"SELECT {columns} FROM {self.name} LIMIT 1000")
            table = cursor.fetch_arrow_all()

            if table is None:
                # The connector may hand back `None` rather than an empty
                # table if the query yields no rows; return an empty frame
                # with the expected columns instead of failing on `None`:
                return pd.DataFrame(columns=list(self._source_column_dict))

            return table.to_pandas()

    def _get_num_rows(self) -> Optional[int]:
        r"""Returns :obj:`None`: the row count is not queried."""
        return None
|
|
@@ -12,12 +12,19 @@ Connection: TypeAlias = adbc.AdbcSqliteConnection
|
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
def connect(uri: Union[str, Path, None] = None, **kwargs: Any) -> Connection:
    r"""Opens a connection to a :class:`sqlite` database.

    Args:
        uri: The path to the database file to be opened.
        kwargs: Additional connection arguments, following the
            :class:`adbc_driver_sqlite` protocol.
    """
    if isinstance(uri, Path):
        # The signature accepts `pathlib.Path`, but the driver documents its
        # `uri` argument as a string, so hand it a plain string:
        uri = str(uri)

    return adbc.connect(uri, **kwargs)
|
|
16
22
|
|
|
17
23
|
|
|
18
24
|
from .table import SQLiteTable # noqa: E402
|
|
19
25
|
|
|
20
26
|
__all__ = [
|
|
27
|
+
'connect',
|
|
21
28
|
'Connection',
|
|
22
29
|
'SQLiteTable',
|
|
23
30
|
]
|
|
@@ -1,13 +1,12 @@
|
|
|
1
1
|
import re
|
|
2
|
-
from typing import
|
|
2
|
+
from typing import List, Optional, Sequence
|
|
3
3
|
|
|
4
|
-
import
|
|
5
|
-
from kumoapi.typing import Dtype
|
|
6
|
-
from typing_extensions import Self
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from kumoapi.typing import Dtype
|
|
7
6
|
|
|
8
|
-
from kumoai.experimental.rfm import utils
|
|
9
7
|
from kumoai.experimental.rfm.backend.sqlite import Connection
|
|
10
|
-
from kumoai.experimental.rfm.base import Table
|
|
8
|
+
from kumoai.experimental.rfm.base import SourceColumn, SourceForeignKey, Table
|
|
9
|
+
from kumoai.experimental.rfm.infer import infer_dtype
|
|
11
10
|
|
|
12
11
|
|
|
13
12
|
class SQLiteTable(Table):
|
|
@@ -33,85 +32,63 @@ class SQLiteTable(Table):
|
|
|
33
32
|
) -> None:
|
|
34
33
|
|
|
35
34
|
self._connection = connection
|
|
36
|
-
self._dtype_dict: Dict[str, Dtype] = {}
|
|
37
|
-
|
|
38
|
-
with connection.cursor() as cursor:
|
|
39
|
-
cursor.execute(f"PRAGMA table_info({name})")
|
|
40
|
-
for _, column, dtype, _, _, is_pkey in cursor.fetchall():
|
|
41
|
-
if bool(is_pkey):
|
|
42
|
-
if primary_key is not None and primary_key != column:
|
|
43
|
-
raise ValueError(f"Found duplicate primary key "
|
|
44
|
-
f"definition '{primary_key}' and "
|
|
45
|
-
f"'{column}' in table '{name}'")
|
|
46
|
-
primary_key = column
|
|
47
|
-
|
|
48
|
-
# Determine colun affinity:
|
|
49
|
-
dtype = dtype.strip().upper()
|
|
50
|
-
if re.search('INT', dtype):
|
|
51
|
-
self._dtype_dict[column] = Dtype.int
|
|
52
|
-
elif re.search('TEXT|CHAR|CLOB', dtype):
|
|
53
|
-
self._dtype_dict[column] = Dtype.string
|
|
54
|
-
elif re.search('REAL|FLOA|DOUB', dtype):
|
|
55
|
-
self._dtype_dict[column] = Dtype.float
|
|
56
|
-
else: # NUMERIC affinity.
|
|
57
|
-
self._dtype_dict[column] = Dtype.unsupported
|
|
58
|
-
|
|
59
|
-
if len(self._dtype_dict) > 0:
|
|
60
|
-
column_names = ', '.join(self._dtype_dict.keys())
|
|
61
|
-
cursor.execute(f"SELECT {column_names} FROM {name} "
|
|
62
|
-
f"ORDER BY rowid LIMIT 1000")
|
|
63
|
-
self._sample = cursor.fetch_arrow_table()
|
|
64
|
-
|
|
65
|
-
for column_name in list(self._dtype_dict.keys()):
|
|
66
|
-
if self._dtype_dict[column_name] == Dtype.unsupported:
|
|
67
|
-
dtype = self._sample[column_name].type
|
|
68
|
-
if pa.types.is_integer(dtype):
|
|
69
|
-
self._dtype_dict[column_name] = Dtype.int
|
|
70
|
-
elif pa.types.is_floating(dtype):
|
|
71
|
-
self._dtype_dict[column_name] = Dtype.float
|
|
72
|
-
elif pa.types.is_decimal(dtype):
|
|
73
|
-
self._dtype_dict[column_name] = Dtype.float
|
|
74
|
-
elif pa.types.is_string(dtype):
|
|
75
|
-
self._dtype_dict[column_name] = Dtype.string
|
|
76
|
-
else:
|
|
77
|
-
del self._dtype_dict[column_name]
|
|
78
|
-
|
|
79
|
-
if len(self._dtype_dict) == 0:
|
|
80
|
-
raise RuntimeError(f"Table '{name}' does not exist or does not "
|
|
81
|
-
f"hold any column with a supported data type")
|
|
82
35
|
|
|
83
36
|
super().__init__(
|
|
84
37
|
name=name,
|
|
85
|
-
columns=columns
|
|
38
|
+
columns=columns,
|
|
86
39
|
primary_key=primary_key,
|
|
87
40
|
time_column=time_column,
|
|
88
41
|
end_time_column=end_time_column,
|
|
89
42
|
)
|
|
90
43
|
|
|
91
|
-
def
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
44
|
+
def _get_source_columns(self) -> List[SourceColumn]:
    r"""Reads the column metadata via ``PRAGMA table_info``.

    The data type follows from the declared column type (its affinity). If
    the declared type matches none of the integer, text or real patterns,
    the data type is inferred from the sampled rows instead, and any error
    raised by that inference propagates to the caller.

    Raises:
        ValueError: If ``PRAGMA table_info`` returns no rows.
    """
    with self._connection.cursor() as cursor:
        cursor.execute(f"PRAGMA table_info({self.name})")
        rows = cursor.fetchall()

    if not rows:
        raise ValueError(f"Table '{self.name}' does not exist")

    source_columns: List[SourceColumn] = []
    for _, column, declared_type, _, _, is_pkey in rows:
        # Determine column affinity:
        declared_type = declared_type.strip().upper()
        if re.search('INT', declared_type):
            dtype = Dtype.int
        elif re.search('TEXT|CHAR|CLOB', declared_type):
            dtype = Dtype.string
        elif re.search('REAL|FLOA|DOUB', declared_type):
            dtype = Dtype.float
        else:  # NUMERIC affinity.
            dtype = infer_dtype(self._sample_df[column])

        source_column = SourceColumn(
            name=column,
            dtype=dtype,
            is_primary_key=bool(is_pkey),
            is_unique_key=False,
        )
        source_columns.append(source_column)

    return source_columns
|
|
77
|
+
|
|
78
|
+
def _get_source_foreign_keys(self) -> List[SourceForeignKey]:
    r"""Reads the foreign keys via ``PRAGMA foreign_key_list``.

    Each entry holds the foreign key column of this table, the table it
    points to, and the referenced column in that table.
    """
    with self._connection.cursor() as cursor:
        cursor.execute(f"PRAGMA foreign_key_list({self.name})")
        return [
            SourceForeignKey(fkey, dst_table, pkey)
            for _, _, dst_table, fkey, pkey, _, _, _ in cursor.fetchall()
        ]
|
|
85
|
+
|
|
86
|
+
def _get_sample_df(self) -> pd.DataFrame:
    r"""Fetches up to 1000 rows of the table, ordered by ``rowid``."""
    query = f"SELECT * FROM {self.name} ORDER BY rowid LIMIT 1000"
    with self._connection.cursor() as cursor:
        cursor.execute(query)
        arrow_table = cursor.fetch_arrow_table()
        return arrow_table.to_pandas()
|
|
92
|
+
|
|
93
|
+
def _get_num_rows(self) -> Optional[int]:
    r"""Returns :obj:`None`: the row count is not queried."""
    num_rows: Optional[int] = None
    return num_rows
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
|
|
3
|
+
from kumoapi.typing import Dtype
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
class SourceColumn:
    r"""Metadata of a single column as reported by the source table."""
    # The column name:
    name: str
    # The data type of the column:
    dtype: Dtype
    # Whether the source marks this column as a primary key:
    is_primary_key: bool
    # Whether the source marks this column as a unique key:
    is_unique_key: bool
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class SourceForeignKey:
    r"""A foreign key relation as reported by the source table."""
    # The name of the foreign key column:
    name: str
    # The name of the table the foreign key points to:
    dst_table: str
    # The name of the referenced column in `dst_table`:
    primary_key: str
|
|
@@ -1,15 +1,25 @@
|
|
|
1
1
|
from abc import ABC, abstractmethod
|
|
2
|
-
from
|
|
2
|
+
from collections import defaultdict
|
|
3
|
+
from functools import cached_property
|
|
4
|
+
from typing import Dict, List, Optional, Sequence, Set
|
|
3
5
|
|
|
4
6
|
import pandas as pd
|
|
5
7
|
from kumoapi.source_table import UnavailableSourceTable
|
|
6
8
|
from kumoapi.table import Column as ColumnDefinition
|
|
7
9
|
from kumoapi.table import TableDefinition
|
|
8
|
-
from kumoapi.typing import
|
|
10
|
+
from kumoapi.typing import Stype
|
|
9
11
|
from typing_extensions import Self
|
|
10
12
|
|
|
11
13
|
from kumoai import in_notebook
|
|
12
|
-
from kumoai.experimental.rfm.base import Column
|
|
14
|
+
from kumoai.experimental.rfm.base import Column, SourceColumn, SourceForeignKey
|
|
15
|
+
from kumoai.experimental.rfm.infer import (
|
|
16
|
+
contains_categorical,
|
|
17
|
+
contains_id,
|
|
18
|
+
contains_multicategorical,
|
|
19
|
+
contains_timestamp,
|
|
20
|
+
infer_primary_key,
|
|
21
|
+
infer_time_column,
|
|
22
|
+
)
|
|
13
23
|
|
|
14
24
|
|
|
15
25
|
class Table(ABC):
|
|
@@ -39,8 +49,30 @@ class Table(ABC):
|
|
|
39
49
|
self._time_column: Optional[str] = None
|
|
40
50
|
self._end_time_column: Optional[str] = None
|
|
41
51
|
|
|
52
|
+
if len(self._source_column_dict) == 0:
|
|
53
|
+
raise ValueError(f"Table '{name}' does not hold any column with "
|
|
54
|
+
f"a supported data type")
|
|
55
|
+
|
|
56
|
+
primary_keys = [
|
|
57
|
+
column.name for column in self._source_column_dict.values()
|
|
58
|
+
if column.is_primary_key
|
|
59
|
+
]
|
|
60
|
+
if len(primary_keys) == 1: # NOTE No composite keys yet.
|
|
61
|
+
if primary_key is not None and primary_key != primary_keys[0]:
|
|
62
|
+
raise ValueError(f"Found duplicate primary key "
|
|
63
|
+
f"definition '{primary_key}' and "
|
|
64
|
+
f"'{primary_keys[0]}' in table '{name}'")
|
|
65
|
+
primary_key = primary_keys[0]
|
|
66
|
+
|
|
67
|
+
unique_keys = [
|
|
68
|
+
column.name for column in self._source_column_dict.values()
|
|
69
|
+
if column.is_unique_key
|
|
70
|
+
]
|
|
71
|
+
if primary_key is None and len(unique_keys) == 1:
|
|
72
|
+
primary_key = unique_keys[0]
|
|
73
|
+
|
|
42
74
|
self._columns: Dict[str, Column] = {}
|
|
43
|
-
for column_name in columns or
|
|
75
|
+
for column_name in columns or list(self._source_column_dict.keys()):
|
|
44
76
|
self.add_column(column_name)
|
|
45
77
|
|
|
46
78
|
if primary_key is not None:
|
|
@@ -104,12 +136,12 @@ class Table(ABC):
|
|
|
104
136
|
raise KeyError(f"Column '{name}' already exists in table "
|
|
105
137
|
f"'{self.name}'")
|
|
106
138
|
|
|
107
|
-
if not self.
|
|
139
|
+
if name not in self._source_column_dict:
|
|
108
140
|
raise KeyError(f"Column '{name}' does not exist in the underlying "
|
|
109
141
|
f"source table")
|
|
110
142
|
|
|
111
143
|
try:
|
|
112
|
-
dtype = self.
|
|
144
|
+
dtype = self._source_column_dict[name].dtype
|
|
113
145
|
except Exception as e:
|
|
114
146
|
raise RuntimeError(f"Could not obtain data type for column "
|
|
115
147
|
f"'{name}' in table '{self.name}'. Change "
|
|
@@ -117,7 +149,17 @@ class Table(ABC):
|
|
|
117
149
|
f"table or remove it from the table.") from e
|
|
118
150
|
|
|
119
151
|
try:
|
|
120
|
-
|
|
152
|
+
ser = self._sample_df[name]
|
|
153
|
+
if contains_id(ser, name, dtype):
|
|
154
|
+
stype = Stype.ID
|
|
155
|
+
elif contains_timestamp(ser, name, dtype):
|
|
156
|
+
stype = Stype.timestamp
|
|
157
|
+
elif contains_multicategorical(ser, name, dtype):
|
|
158
|
+
stype = Stype.multicategorical
|
|
159
|
+
elif contains_categorical(ser, name, dtype):
|
|
160
|
+
stype = Stype.categorical
|
|
161
|
+
else:
|
|
162
|
+
stype = dtype.default_stype
|
|
121
163
|
except Exception as e:
|
|
122
164
|
raise RuntimeError(f"Could not obtain semantic type for column "
|
|
123
165
|
f"'{name}' in table '{self.name}'. Change "
|
|
@@ -338,8 +380,9 @@ class Table(ABC):
|
|
|
338
380
|
|
|
339
381
|
def print_metadata(self) -> None:
|
|
340
382
|
r"""Prints the :meth:`~metadata` of this table."""
|
|
341
|
-
|
|
342
|
-
|
|
383
|
+
num_rows_repr = ''
if self._num_rows is not None:
    # Needs to be an f-string; otherwise, the placeholder itself is emitted
    # verbatim instead of the thousands-separated row count:
    num_rows_repr = f' ({self._num_rows:,} rows)'
|
|
343
386
|
|
|
344
387
|
if in_notebook():
|
|
345
388
|
from IPython.display import Markdown, display
|
|
@@ -384,7 +427,11 @@ class Table(ABC):
|
|
|
384
427
|
column.name for column in self.columns if is_candidate(column)
|
|
385
428
|
]
|
|
386
429
|
|
|
387
|
-
if primary_key :=
|
|
430
|
+
if primary_key := infer_primary_key(
|
|
431
|
+
table_name=self.name,
|
|
432
|
+
df=self._sample_df,
|
|
433
|
+
candidates=candidates,
|
|
434
|
+
):
|
|
388
435
|
self.primary_key = primary_key
|
|
389
436
|
logs.append(f"primary key '{primary_key}'")
|
|
390
437
|
|
|
@@ -395,7 +442,10 @@ class Table(ABC):
|
|
|
395
442
|
if column.stype == Stype.timestamp
|
|
396
443
|
and column.name != self._end_time_column
|
|
397
444
|
]
|
|
398
|
-
if time_column :=
|
|
445
|
+
if time_column := infer_time_column(
|
|
446
|
+
df=self._sample_df,
|
|
447
|
+
candidates=candidates,
|
|
448
|
+
):
|
|
399
449
|
self.time_column = time_column
|
|
400
450
|
logs.append(f"time column '{time_column}'")
|
|
401
451
|
|
|
@@ -448,26 +498,43 @@ class Table(ABC):
|
|
|
448
498
|
|
|
449
499
|
# Abstract method #########################################################
|
|
450
500
|
|
|
451
|
-
@
|
|
452
|
-
def
|
|
453
|
-
|
|
501
|
+
@cached_property
|
|
502
|
+
def _source_column_dict(self) -> Dict[str, SourceColumn]:
    r"""Maps the name of each source column to its :class:`SourceColumn`."""
    by_name: Dict[str, SourceColumn] = {}
    for source_column in self._get_source_columns():
        by_name[source_column.name] = source_column
    return by_name
|
|
454
504
|
|
|
455
505
|
@abstractmethod
|
|
456
|
-
def
|
|
506
|
+
def _get_source_columns(self) -> List[SourceColumn]:
    r"""Returns the columns of the underlying source table.

    Needs to be implemented by each backend.
    """
|
|
458
508
|
|
|
459
|
-
@
|
|
460
|
-
def
|
|
461
|
-
|
|
509
|
+
@cached_property
|
|
510
|
+
def _source_foreign_key_dict(self) -> Dict[str, SourceForeignKey]:
    r"""Maps the name of each usable foreign key to its description.

    Foreign keys are dropped if the keys of this table reference more than
    one distinct column of the same destination table.
    """
    source_fkeys = self._get_source_foreign_keys()

    # NOTE Composite keys are not supported yet, so collect the set of
    # referenced columns per destination table first:
    pkeys_per_table: Dict[str, Set[str]] = defaultdict(set)
    for source_fkey in source_fkeys:
        pkeys_per_table[source_fkey.dst_table].add(source_fkey.primary_key)

    # Only keep keys whose destination table is referenced unambiguously:
    fkey_dict: Dict[str, SourceForeignKey] = {}
    for source_fkey in source_fkeys:
        if len(pkeys_per_table[source_fkey.dst_table]) == 1:
            fkey_dict[source_fkey.name] = source_fkey
    return fkey_dict
|
|
462
521
|
|
|
463
522
|
@abstractmethod
|
|
464
|
-
def
|
|
523
|
+
def _get_source_foreign_keys(self) -> List[SourceForeignKey]:
    r"""Returns the foreign keys of the underlying source table.

    Needs to be implemented by each backend.
    """
|
|
466
525
|
|
|
526
|
+
@cached_property
|
|
527
|
+
def _sample_df(self) -> pd.DataFrame:
    r"""Returns the sample data frame provided by the backend."""
    sample_df = self._get_sample_df()
    return sample_df
|
|
529
|
+
|
|
467
530
|
@abstractmethod
|
|
468
|
-
def
|
|
531
|
+
def _get_sample_df(self) -> pd.DataFrame:
    r"""Returns a data frame holding sample rows of the source table.

    Needs to be implemented by each backend.
    """
|
|
470
533
|
|
|
471
|
-
@
|
|
534
|
+
@cached_property
|
|
472
535
|
def _num_rows(self) -> Optional[int]:
    r"""Returns the number of rows reported by the backend, if any."""
    num_rows = self._get_num_rows()
    return num_rows
|
|
537
|
+
|
|
538
|
+
@abstractmethod
|
|
539
|
+
def _get_num_rows(self) -> Optional[int]:
    r"""Returns the number of rows of the source table, or :obj:`None`.

    Needs to be implemented by each backend.
    """
|