kumoai 2.13.0.dev202512040649__cp313-cp313-win_amd64.whl → 2.14.0.dev202601081732__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kumoai/__init__.py +35 -26
- kumoai/_version.py +1 -1
- kumoai/client/client.py +6 -0
- kumoai/client/jobs.py +26 -0
- kumoai/client/pquery.py +6 -2
- kumoai/connector/utils.py +21 -7
- kumoai/experimental/rfm/__init__.py +51 -24
- kumoai/experimental/rfm/authenticate.py +3 -4
- kumoai/experimental/rfm/backend/local/__init__.py +4 -0
- kumoai/experimental/rfm/{local_graph_store.py → backend/local/graph_store.py} +62 -110
- kumoai/experimental/rfm/backend/local/sampler.py +312 -0
- kumoai/experimental/rfm/backend/local/table.py +35 -31
- kumoai/experimental/rfm/backend/snow/__init__.py +2 -0
- kumoai/experimental/rfm/backend/snow/sampler.py +366 -0
- kumoai/experimental/rfm/backend/snow/table.py +177 -50
- kumoai/experimental/rfm/backend/sqlite/__init__.py +4 -2
- kumoai/experimental/rfm/backend/sqlite/sampler.py +454 -0
- kumoai/experimental/rfm/backend/sqlite/table.py +131 -48
- kumoai/experimental/rfm/base/__init__.py +23 -3
- kumoai/experimental/rfm/base/column.py +96 -10
- kumoai/experimental/rfm/base/expression.py +44 -0
- kumoai/experimental/rfm/base/sampler.py +782 -0
- kumoai/experimental/rfm/base/source.py +2 -1
- kumoai/experimental/rfm/base/sql_sampler.py +247 -0
- kumoai/experimental/rfm/base/table.py +404 -203
- kumoai/experimental/rfm/graph.py +374 -172
- kumoai/experimental/rfm/infer/__init__.py +6 -4
- kumoai/experimental/rfm/infer/dtype.py +7 -4
- kumoai/experimental/rfm/infer/multicategorical.py +1 -1
- kumoai/experimental/rfm/infer/pkey.py +4 -2
- kumoai/experimental/rfm/infer/stype.py +35 -0
- kumoai/experimental/rfm/infer/time_col.py +1 -2
- kumoai/experimental/rfm/pquery/executor.py +27 -27
- kumoai/experimental/rfm/pquery/pandas_executor.py +30 -32
- kumoai/experimental/rfm/relbench.py +76 -0
- kumoai/experimental/rfm/rfm.py +762 -467
- kumoai/experimental/rfm/sagemaker.py +4 -4
- kumoai/experimental/rfm/task_table.py +292 -0
- kumoai/kumolib.cp313-win_amd64.pyd +0 -0
- kumoai/pquery/predictive_query.py +10 -6
- kumoai/pquery/training_table.py +16 -2
- kumoai/testing/snow.py +50 -0
- kumoai/trainer/distilled_trainer.py +175 -0
- kumoai/utils/__init__.py +3 -2
- kumoai/utils/display.py +87 -0
- kumoai/utils/progress_logger.py +190 -12
- kumoai/utils/sql.py +3 -0
- {kumoai-2.13.0.dev202512040649.dist-info → kumoai-2.14.0.dev202601081732.dist-info}/METADATA +3 -2
- {kumoai-2.13.0.dev202512040649.dist-info → kumoai-2.14.0.dev202601081732.dist-info}/RECORD +52 -41
- kumoai/experimental/rfm/local_graph_sampler.py +0 -223
- kumoai/experimental/rfm/local_pquery_driver.py +0 -689
- {kumoai-2.13.0.dev202512040649.dist-info → kumoai-2.14.0.dev202601081732.dist-info}/WHEEL +0 -0
- {kumoai-2.13.0.dev202512040649.dist-info → kumoai-2.14.0.dev202601081732.dist-info}/licenses/LICENSE +0 -0
- {kumoai-2.13.0.dev202512040649.dist-info → kumoai-2.14.0.dev202601081732.dist-info}/top_level.txt +0 -0
|
@@ -1,25 +1,33 @@
|
|
|
1
|
+
import warnings
|
|
1
2
|
from abc import ABC, abstractmethod
|
|
2
|
-
from collections import
|
|
3
|
+
from collections.abc import Sequence
|
|
3
4
|
from functools import cached_property
|
|
4
|
-
from typing import Dict, List, Optional, Sequence, Set
|
|
5
5
|
|
|
6
|
+
import numpy as np
|
|
6
7
|
import pandas as pd
|
|
8
|
+
import pyarrow as pa
|
|
9
|
+
from kumoapi.model_plan import MissingType
|
|
7
10
|
from kumoapi.source_table import UnavailableSourceTable
|
|
8
11
|
from kumoapi.table import Column as ColumnDefinition
|
|
9
12
|
from kumoapi.table import TableDefinition
|
|
10
|
-
from kumoapi.typing import Stype
|
|
13
|
+
from kumoapi.typing import Dtype, Stype
|
|
11
14
|
from typing_extensions import Self
|
|
12
15
|
|
|
13
|
-
from kumoai import
|
|
14
|
-
|
|
16
|
+
from kumoai.experimental.rfm.base import (
|
|
17
|
+
Column,
|
|
18
|
+
ColumnSpec,
|
|
19
|
+
ColumnSpecType,
|
|
20
|
+
DataBackend,
|
|
21
|
+
SourceColumn,
|
|
22
|
+
SourceForeignKey,
|
|
23
|
+
)
|
|
15
24
|
from kumoai.experimental.rfm.infer import (
|
|
16
|
-
|
|
17
|
-
contains_id,
|
|
18
|
-
contains_multicategorical,
|
|
19
|
-
contains_timestamp,
|
|
25
|
+
infer_dtype,
|
|
20
26
|
infer_primary_key,
|
|
27
|
+
infer_stype,
|
|
21
28
|
infer_time_column,
|
|
22
29
|
)
|
|
30
|
+
from kumoai.utils import display, quote_ident
|
|
23
31
|
|
|
24
32
|
|
|
25
33
|
class Table(ABC):
|
|
@@ -29,53 +37,48 @@ class Table(ABC):
|
|
|
29
37
|
|
|
30
38
|
Args:
|
|
31
39
|
name: The name of this table.
|
|
40
|
+
source_name: The source name of this table. If set to ``None``,
|
|
41
|
+
``name`` is being used.
|
|
32
42
|
columns: The selected columns of this table.
|
|
33
43
|
primary_key: The name of the primary key of this table, if it exists.
|
|
34
44
|
time_column: The name of the time column of this table, if it exists.
|
|
35
45
|
end_time_column: The name of the end time column of this table, if it
|
|
36
46
|
exists.
|
|
37
47
|
"""
|
|
48
|
+
_NUM_SAMPLE_ROWS = 1_000
|
|
49
|
+
|
|
38
50
|
def __init__(
|
|
39
51
|
self,
|
|
40
52
|
name: str,
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
53
|
+
source_name: str | None = None,
|
|
54
|
+
columns: Sequence[ColumnSpecType] | None = None,
|
|
55
|
+
primary_key: MissingType | str | None = MissingType.VALUE,
|
|
56
|
+
time_column: str | None = None,
|
|
57
|
+
end_time_column: str | None = None,
|
|
45
58
|
) -> None:
|
|
46
59
|
|
|
47
60
|
self._name = name
|
|
48
|
-
self.
|
|
49
|
-
self.
|
|
50
|
-
self.
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
if
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
if column.is_unique_key
|
|
70
|
-
]
|
|
71
|
-
if primary_key is None and len(unique_keys) == 1:
|
|
72
|
-
primary_key = unique_keys[0]
|
|
73
|
-
|
|
74
|
-
self._columns: Dict[str, Column] = {}
|
|
75
|
-
for column_name in columns or list(self._source_column_dict.keys()):
|
|
76
|
-
self.add_column(column_name)
|
|
77
|
-
|
|
78
|
-
if primary_key is not None:
|
|
61
|
+
self._source_name = source_name or name
|
|
62
|
+
self._column_dict: dict[str, Column] = {}
|
|
63
|
+
self._primary_key: str | None = None
|
|
64
|
+
self._time_column: str | None = None
|
|
65
|
+
self._end_time_column: str | None = None
|
|
66
|
+
self._expr_sample_df = pd.DataFrame(index=range(self._NUM_SAMPLE_ROWS))
|
|
67
|
+
|
|
68
|
+
if columns is None:
|
|
69
|
+
columns = list(self._source_column_dict.keys())
|
|
70
|
+
|
|
71
|
+
self.add_columns(columns)
|
|
72
|
+
|
|
73
|
+
if isinstance(primary_key, MissingType):
|
|
74
|
+
# Infer primary key from source metadata, but only set it in case
|
|
75
|
+
# it is already part of the column set (don't magically add it):
|
|
76
|
+
if any(column.is_source for column in self.columns):
|
|
77
|
+
primary_key = self._source_primary_key
|
|
78
|
+
if (primary_key is not None and primary_key in self
|
|
79
|
+
and self[primary_key].is_source):
|
|
80
|
+
self.primary_key = primary_key
|
|
81
|
+
elif primary_key is not None:
|
|
79
82
|
if primary_key not in self:
|
|
80
83
|
self.add_column(primary_key)
|
|
81
84
|
self.primary_key = primary_key
|
|
@@ -95,13 +98,22 @@ class Table(ABC):
|
|
|
95
98
|
r"""The name of this table."""
|
|
96
99
|
return self._name
|
|
97
100
|
|
|
98
|
-
|
|
101
|
+
@property
|
|
102
|
+
def source_name(self) -> str:
|
|
103
|
+
r"""The source name of this table."""
|
|
104
|
+
return self._source_name
|
|
105
|
+
|
|
106
|
+
@property
|
|
107
|
+
def _quoted_source_name(self) -> str:
|
|
108
|
+
return quote_ident(self._source_name)
|
|
109
|
+
|
|
110
|
+
# Column ##################################################################
|
|
99
111
|
|
|
100
112
|
def has_column(self, name: str) -> bool:
|
|
101
113
|
r"""Returns ``True`` if this table holds a column with name ``name``;
|
|
102
114
|
``False`` otherwise.
|
|
103
115
|
"""
|
|
104
|
-
return name in self.
|
|
116
|
+
return name in self._column_dict
|
|
105
117
|
|
|
106
118
|
def column(self, name: str) -> Column:
|
|
107
119
|
r"""Returns the data column named with name ``name`` in this table.
|
|
@@ -114,65 +126,113 @@ class Table(ABC):
|
|
|
114
126
|
"""
|
|
115
127
|
if not self.has_column(name):
|
|
116
128
|
raise KeyError(f"Column '{name}' not found in table '{self.name}'")
|
|
117
|
-
return self.
|
|
129
|
+
return self._column_dict[name]
|
|
118
130
|
|
|
119
131
|
@property
|
|
120
|
-
def columns(self) ->
|
|
132
|
+
def columns(self) -> list[Column]:
|
|
121
133
|
r"""Returns a list of :class:`Column` objects that represent the
|
|
122
134
|
columns in this table.
|
|
123
135
|
"""
|
|
124
|
-
return list(self.
|
|
136
|
+
return list(self._column_dict.values())
|
|
125
137
|
|
|
126
|
-
def
|
|
127
|
-
r"""Adds a
|
|
138
|
+
def add_columns(self, columns: Sequence[ColumnSpecType]) -> None:
|
|
139
|
+
r"""Adds a set of columns to this table.
|
|
128
140
|
|
|
129
141
|
Args:
|
|
130
|
-
|
|
142
|
+
columns: The columns to add.
|
|
131
143
|
|
|
132
144
|
Raises:
|
|
133
|
-
KeyError: If
|
|
145
|
+
KeyError: If any of the column names already exist in this table.
|
|
134
146
|
"""
|
|
135
|
-
if
|
|
136
|
-
|
|
137
|
-
f"'{self.name}'")
|
|
138
|
-
|
|
139
|
-
if name not in self._source_column_dict:
|
|
140
|
-
raise KeyError(f"Column '{name}' does not exist in the underlying "
|
|
141
|
-
f"source table")
|
|
142
|
-
|
|
143
|
-
try:
|
|
144
|
-
dtype = self._source_column_dict[name].dtype
|
|
145
|
-
except Exception as e:
|
|
146
|
-
raise RuntimeError(f"Could not obtain data type for column "
|
|
147
|
-
f"'{name}' in table '{self.name}'. Change "
|
|
148
|
-
f"the data type of the column in the source "
|
|
149
|
-
f"table or remove it from the table.") from e
|
|
150
|
-
|
|
151
|
-
try:
|
|
152
|
-
ser = self._sample_df[name]
|
|
153
|
-
if contains_id(ser, name, dtype):
|
|
154
|
-
stype = Stype.ID
|
|
155
|
-
elif contains_timestamp(ser, name, dtype):
|
|
156
|
-
stype = Stype.timestamp
|
|
157
|
-
elif contains_multicategorical(ser, name, dtype):
|
|
158
|
-
stype = Stype.multicategorical
|
|
159
|
-
elif contains_categorical(ser, name, dtype):
|
|
160
|
-
stype = Stype.categorical
|
|
161
|
-
else:
|
|
162
|
-
stype = dtype.default_stype
|
|
163
|
-
except Exception as e:
|
|
164
|
-
raise RuntimeError(f"Could not obtain semantic type for column "
|
|
165
|
-
f"'{name}' in table '{self.name}'. Change "
|
|
166
|
-
f"the data type of the column in the source "
|
|
167
|
-
f"table or remove it from the table.") from e
|
|
168
|
-
|
|
169
|
-
self._columns[name] = Column(
|
|
170
|
-
name=name,
|
|
171
|
-
dtype=dtype,
|
|
172
|
-
stype=stype,
|
|
173
|
-
)
|
|
147
|
+
if len(columns) == 0:
|
|
148
|
+
return
|
|
174
149
|
|
|
175
|
-
|
|
150
|
+
column_specs = [ColumnSpec.coerce(column) for column in columns]
|
|
151
|
+
|
|
152
|
+
# Obtain a batch-wise sample for all column expressions:
|
|
153
|
+
expr_specs = [spec for spec in column_specs if not spec.is_source]
|
|
154
|
+
if len(expr_specs) > 0:
|
|
155
|
+
dfs = [
|
|
156
|
+
self._expr_sample_df,
|
|
157
|
+
self._get_expr_sample_df(expr_specs).reset_index(drop=True),
|
|
158
|
+
]
|
|
159
|
+
size = min(map(len, dfs))
|
|
160
|
+
df = pd.concat([dfs[0].iloc[:size], dfs[1].iloc[:size]], axis=1)
|
|
161
|
+
df = df.loc[:, ~df.columns.duplicated(keep='last')]
|
|
162
|
+
self._expr_sample_df = df
|
|
163
|
+
|
|
164
|
+
for column_spec in column_specs:
|
|
165
|
+
if column_spec.name in self:
|
|
166
|
+
raise KeyError(f"Column '{column_spec.name}' already exists "
|
|
167
|
+
f"in table '{self.name}'")
|
|
168
|
+
|
|
169
|
+
dtype = column_spec.dtype
|
|
170
|
+
stype = column_spec.stype
|
|
171
|
+
|
|
172
|
+
if column_spec.is_source:
|
|
173
|
+
if column_spec.name not in self._source_column_dict:
|
|
174
|
+
raise ValueError(
|
|
175
|
+
f"Column '{column_spec.name}' does not exist in the "
|
|
176
|
+
f"underlying source table")
|
|
177
|
+
|
|
178
|
+
if dtype is None:
|
|
179
|
+
dtype = self._source_column_dict[column_spec.name].dtype
|
|
180
|
+
|
|
181
|
+
if dtype == Dtype.unsupported:
|
|
182
|
+
raise ValueError(
|
|
183
|
+
f"Encountered unsupported data type for column "
|
|
184
|
+
f"'{column_spec.name}' in table '{self.name}'. Please "
|
|
185
|
+
f"either change the column's data type or remove the "
|
|
186
|
+
f"column from this table.")
|
|
187
|
+
|
|
188
|
+
if dtype is None:
|
|
189
|
+
if column_spec.is_source:
|
|
190
|
+
ser = self._source_sample_df[column_spec.name]
|
|
191
|
+
else:
|
|
192
|
+
ser = self._expr_sample_df[column_spec.name]
|
|
193
|
+
try:
|
|
194
|
+
dtype = infer_dtype(ser)
|
|
195
|
+
except Exception as e:
|
|
196
|
+
raise RuntimeError(
|
|
197
|
+
f"Encountered unsupported data type '{ser.dtype}' for "
|
|
198
|
+
f"column '{column_spec.name}' in table '{self.name}'. "
|
|
199
|
+
f"Please either manually override the columns's data "
|
|
200
|
+
f"type or remove the column from this table.") from e
|
|
201
|
+
|
|
202
|
+
if stype is None:
|
|
203
|
+
if column_spec.is_source:
|
|
204
|
+
ser = self._source_sample_df[column_spec.name]
|
|
205
|
+
else:
|
|
206
|
+
ser = self._expr_sample_df[column_spec.name]
|
|
207
|
+
try:
|
|
208
|
+
stype = infer_stype(ser, column_spec.name, dtype)
|
|
209
|
+
except Exception as e:
|
|
210
|
+
raise RuntimeError(
|
|
211
|
+
f"Could not determine semantic type for column "
|
|
212
|
+
f"'{column_spec.name}' with data type '{dtype}' in "
|
|
213
|
+
f"table '{self.name}'. Please either change the "
|
|
214
|
+
f"column's data type or remove the column from this "
|
|
215
|
+
f"table.") from e
|
|
216
|
+
|
|
217
|
+
self._column_dict[column_spec.name] = Column(
|
|
218
|
+
name=column_spec.name,
|
|
219
|
+
expr=column_spec.expr,
|
|
220
|
+
dtype=dtype,
|
|
221
|
+
stype=stype,
|
|
222
|
+
)
|
|
223
|
+
|
|
224
|
+
def add_column(self, column: ColumnSpecType) -> Column:
|
|
225
|
+
r"""Adds a column to this table.
|
|
226
|
+
|
|
227
|
+
Args:
|
|
228
|
+
column: The column to add.
|
|
229
|
+
|
|
230
|
+
Raises:
|
|
231
|
+
KeyError: If the column name already exists in this table.
|
|
232
|
+
"""
|
|
233
|
+
column_spec = ColumnSpec.coerce(column)
|
|
234
|
+
self.add_columns([column_spec])
|
|
235
|
+
return self[column_spec.name]
|
|
176
236
|
|
|
177
237
|
def remove_column(self, name: str) -> Self:
|
|
178
238
|
r"""Removes a column from this table.
|
|
@@ -192,7 +252,7 @@ class Table(ABC):
|
|
|
192
252
|
self.time_column = None
|
|
193
253
|
if self._end_time_column == name:
|
|
194
254
|
self.end_time_column = None
|
|
195
|
-
del self.
|
|
255
|
+
del self._column_dict[name]
|
|
196
256
|
|
|
197
257
|
return self
|
|
198
258
|
|
|
@@ -205,22 +265,22 @@ class Table(ABC):
|
|
|
205
265
|
return self._primary_key is not None
|
|
206
266
|
|
|
207
267
|
@property
|
|
208
|
-
def primary_key(self) ->
|
|
268
|
+
def primary_key(self) -> Column | None:
|
|
209
269
|
r"""The primary key column of this table.
|
|
210
270
|
|
|
211
271
|
The getter returns the primary key column of this table, or ``None`` if
|
|
212
272
|
no such primary key is present.
|
|
213
273
|
|
|
214
274
|
The setter sets a column as a primary key on this table, and raises a
|
|
215
|
-
:class:`ValueError` if the primary key has a non-ID
|
|
216
|
-
if the column name does not match a column in the data frame.
|
|
275
|
+
:class:`ValueError` if the primary key has a non-ID compatible data
|
|
276
|
+
type or if the column name does not match a column in the data frame.
|
|
217
277
|
"""
|
|
218
278
|
if self._primary_key is None:
|
|
219
279
|
return None
|
|
220
280
|
return self[self._primary_key]
|
|
221
281
|
|
|
222
282
|
@primary_key.setter
|
|
223
|
-
def primary_key(self, name:
|
|
283
|
+
def primary_key(self, name: str | None) -> None:
|
|
224
284
|
if name is not None and name == self._time_column:
|
|
225
285
|
raise ValueError(f"Cannot specify column '{name}' as a primary "
|
|
226
286
|
f"key since it is already defined to be a time "
|
|
@@ -250,22 +310,23 @@ class Table(ABC):
|
|
|
250
310
|
return self._time_column is not None
|
|
251
311
|
|
|
252
312
|
@property
|
|
253
|
-
def time_column(self) ->
|
|
313
|
+
def time_column(self) -> Column | None:
|
|
254
314
|
r"""The time column of this table.
|
|
255
315
|
|
|
256
316
|
The getter returns the time column of this table, or ``None`` if no
|
|
257
317
|
such time column is present.
|
|
258
318
|
|
|
259
319
|
The setter sets a column as a time column on this table, and raises a
|
|
260
|
-
:class:`ValueError` if the time column has a non-timestamp
|
|
261
|
-
type or if the column name does not match a column in the data
|
|
320
|
+
:class:`ValueError` if the time column has a non-timestamp compatible
|
|
321
|
+
data type or if the column name does not match a column in the data
|
|
322
|
+
frame.
|
|
262
323
|
"""
|
|
263
324
|
if self._time_column is None:
|
|
264
325
|
return None
|
|
265
326
|
return self[self._time_column]
|
|
266
327
|
|
|
267
328
|
@time_column.setter
|
|
268
|
-
def time_column(self, name:
|
|
329
|
+
def time_column(self, name: str | None) -> None:
|
|
269
330
|
if name is not None and name == self._primary_key:
|
|
270
331
|
raise ValueError(f"Cannot specify column '{name}' as a time "
|
|
271
332
|
f"column since it is already defined to be a "
|
|
@@ -295,7 +356,7 @@ class Table(ABC):
|
|
|
295
356
|
return self._end_time_column is not None
|
|
296
357
|
|
|
297
358
|
@property
|
|
298
|
-
def end_time_column(self) ->
|
|
359
|
+
def end_time_column(self) -> Column | None:
|
|
299
360
|
r"""The end time column of this table.
|
|
300
361
|
|
|
301
362
|
The getter returns the end time column of this table, or ``None`` if no
|
|
@@ -303,15 +364,15 @@ class Table(ABC):
|
|
|
303
364
|
|
|
304
365
|
The setter sets a column as an end time column on this table, and
|
|
305
366
|
raises a :class:`ValueError` if the end time column has a non-timestamp
|
|
306
|
-
|
|
307
|
-
frame.
|
|
367
|
+
compatible data type or if the column name does not match a column in
|
|
368
|
+
the data frame.
|
|
308
369
|
"""
|
|
309
370
|
if self._end_time_column is None:
|
|
310
371
|
return None
|
|
311
372
|
return self[self._end_time_column]
|
|
312
373
|
|
|
313
374
|
@end_time_column.setter
|
|
314
|
-
def end_time_column(self, name:
|
|
375
|
+
def end_time_column(self, name: str | None) -> None:
|
|
315
376
|
if name is not None and name == self._primary_key:
|
|
316
377
|
raise ValueError(f"Cannot specify column '{name}' as an end time "
|
|
317
378
|
f"column since it is already defined to be a "
|
|
@@ -339,39 +400,39 @@ class Table(ABC):
|
|
|
339
400
|
r"""Returns a :class:`pandas.DataFrame` object containing metadata
|
|
340
401
|
information about the columns in this table.
|
|
341
402
|
|
|
342
|
-
The returned dataframe has columns ``
|
|
343
|
-
``
|
|
344
|
-
which provide an
|
|
345
|
-
this table.
|
|
403
|
+
The returned dataframe has columns ``"Name"``, ``"Data Type"``,
|
|
404
|
+
``"Semantic Type"``, ``"Primary Key"``, ``"Time Column"`` and
|
|
405
|
+
``"End Time Column"``, which provide an aggregated view of the
|
|
406
|
+
properties of the columns of this table.
|
|
346
407
|
|
|
347
408
|
Example:
|
|
348
409
|
>>> # doctest: +SKIP
|
|
349
410
|
>>> import kumoai.experimental.rfm as rfm
|
|
350
411
|
>>> table = rfm.LocalTable(df=..., name=...).infer_metadata()
|
|
351
412
|
>>> table.metadata
|
|
352
|
-
|
|
353
|
-
0 CustomerID float64
|
|
413
|
+
Name Data Type Semantic Type Primary Key Time Column End Time Column
|
|
414
|
+
0 CustomerID float64 ID True False False
|
|
354
415
|
""" # noqa: E501
|
|
355
416
|
cols = self.columns
|
|
356
417
|
|
|
357
418
|
return pd.DataFrame({
|
|
358
|
-
'
|
|
419
|
+
'Name':
|
|
359
420
|
pd.Series(dtype=str, data=[c.name for c in cols]),
|
|
360
|
-
'
|
|
421
|
+
'Data Type':
|
|
361
422
|
pd.Series(dtype=str, data=[c.dtype for c in cols]),
|
|
362
|
-
'
|
|
423
|
+
'Semantic Type':
|
|
363
424
|
pd.Series(dtype=str, data=[c.stype for c in cols]),
|
|
364
|
-
'
|
|
425
|
+
'Primary Key':
|
|
365
426
|
pd.Series(
|
|
366
427
|
dtype=bool,
|
|
367
428
|
data=[self._primary_key == c.name for c in cols],
|
|
368
429
|
),
|
|
369
|
-
'
|
|
430
|
+
'Time Column':
|
|
370
431
|
pd.Series(
|
|
371
432
|
dtype=bool,
|
|
372
433
|
data=[self._time_column == c.name for c in cols],
|
|
373
434
|
),
|
|
374
|
-
'
|
|
435
|
+
'End Time Column':
|
|
375
436
|
pd.Series(
|
|
376
437
|
dtype=bool,
|
|
377
438
|
data=[self._end_time_column == c.name for c in cols],
|
|
@@ -380,28 +441,98 @@ class Table(ABC):
|
|
|
380
441
|
|
|
381
442
|
def print_metadata(self) -> None:
|
|
382
443
|
r"""Prints the :meth:`~metadata` of this table."""
|
|
383
|
-
|
|
384
|
-
if self._num_rows
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
444
|
+
msg = f"🏷️ Metadata of Table `{self.name}`"
|
|
445
|
+
if num := self._num_rows:
|
|
446
|
+
msg += " (1 row)" if num == 1 else f" ({num:,} rows)"
|
|
447
|
+
|
|
448
|
+
display.title(msg)
|
|
449
|
+
display.dataframe(self.metadata)
|
|
450
|
+
|
|
451
|
+
def infer_primary_key(self, verbose: bool = True) -> Self:
|
|
452
|
+
r"""Infers the primary key in this table.
|
|
453
|
+
|
|
454
|
+
Args:
|
|
455
|
+
verbose: Whether to print verbose output.
|
|
456
|
+
"""
|
|
457
|
+
if self.has_primary_key():
|
|
458
|
+
return self
|
|
459
|
+
|
|
460
|
+
def _set_primary_key(primary_key: str) -> None:
|
|
461
|
+
self.primary_key = primary_key
|
|
462
|
+
if verbose:
|
|
463
|
+
display.message(f"Inferred primary key `{primary_key}` for "
|
|
464
|
+
f"table `{self.name}`")
|
|
465
|
+
|
|
466
|
+
# Inference from source column metadata:
|
|
467
|
+
if any(column.is_source for column in self.columns):
|
|
468
|
+
primary_key = self._source_primary_key
|
|
469
|
+
if (primary_key is not None and primary_key in self
|
|
470
|
+
and self[primary_key].is_source):
|
|
471
|
+
_set_primary_key(primary_key)
|
|
472
|
+
return self
|
|
473
|
+
|
|
474
|
+
unique_keys = [
|
|
475
|
+
column.name for column in self._source_column_dict.values()
|
|
476
|
+
if column.is_unique_key
|
|
477
|
+
]
|
|
478
|
+
if (len(unique_keys) == 1 # NOTE No composite keys yet.
|
|
479
|
+
and unique_keys[0] in self
|
|
480
|
+
and self[unique_keys[0]].is_source):
|
|
481
|
+
_set_primary_key(unique_keys[0])
|
|
482
|
+
return self
|
|
483
|
+
|
|
484
|
+
# Heuristic-based inference:
|
|
485
|
+
candidates = [
|
|
486
|
+
column.name for column in self.columns if column.stype == Stype.ID
|
|
487
|
+
]
|
|
488
|
+
if len(candidates) == 0:
|
|
489
|
+
for column in self.columns:
|
|
490
|
+
if self.name.lower() == column.name.lower():
|
|
491
|
+
candidates.append(column.name)
|
|
492
|
+
elif (self.name.lower().endswith('s')
|
|
493
|
+
and self.name.lower()[:-1] == column.name.lower()):
|
|
494
|
+
candidates.append(column.name)
|
|
495
|
+
|
|
496
|
+
if primary_key := infer_primary_key(
|
|
497
|
+
table_name=self.name,
|
|
498
|
+
df=self._get_sample_df(),
|
|
499
|
+
candidates=candidates,
|
|
500
|
+
):
|
|
501
|
+
_set_primary_key(primary_key)
|
|
502
|
+
return self
|
|
503
|
+
|
|
504
|
+
return self
|
|
505
|
+
|
|
506
|
+
def infer_time_column(self, verbose: bool = True) -> Self:
|
|
507
|
+
r"""Infers the time column in this table.
|
|
508
|
+
|
|
509
|
+
Args:
|
|
510
|
+
verbose: Whether to print verbose output.
|
|
511
|
+
"""
|
|
512
|
+
if self.has_time_column():
|
|
513
|
+
return self
|
|
514
|
+
|
|
515
|
+
# Heuristic-based inference:
|
|
516
|
+
candidates = [
|
|
517
|
+
column.name for column in self.columns
|
|
518
|
+
if column.stype == Stype.timestamp
|
|
519
|
+
and column.name != self._end_time_column
|
|
520
|
+
]
|
|
521
|
+
|
|
522
|
+
if time_column := infer_time_column(
|
|
523
|
+
df=self._get_sample_df(),
|
|
524
|
+
candidates=candidates,
|
|
525
|
+
):
|
|
526
|
+
self.time_column = time_column
|
|
527
|
+
|
|
528
|
+
if verbose:
|
|
529
|
+
display.message(f"Inferred time column `{time_column}` for "
|
|
530
|
+
f"table `{self.name}`")
|
|
531
|
+
|
|
532
|
+
return self
|
|
402
533
|
|
|
403
534
|
def infer_metadata(self, verbose: bool = True) -> Self:
|
|
404
|
-
r"""Infers metadata, *i.e.*, primary keys and time columns, in
|
|
535
|
+
r"""Infers metadata, *i.e.*, primary keys and time columns, in this
|
|
405
536
|
table.
|
|
406
537
|
|
|
407
538
|
Args:
|
|
@@ -409,48 +540,19 @@ class Table(ABC):
|
|
|
409
540
|
"""
|
|
410
541
|
logs = []
|
|
411
542
|
|
|
412
|
-
# Try to detect primary key if not set:
|
|
413
543
|
if not self.has_primary_key():
|
|
544
|
+
self.infer_primary_key(verbose=False)
|
|
545
|
+
if self.has_primary_key():
|
|
546
|
+
logs.append(f"primary key `{self._primary_key}`")
|
|
414
547
|
|
|
415
|
-
def is_candidate(column: Column) -> bool:
|
|
416
|
-
if column.stype == Stype.ID:
|
|
417
|
-
return True
|
|
418
|
-
if all(column.stype != Stype.ID for column in self.columns):
|
|
419
|
-
if self.name == column.name:
|
|
420
|
-
return True
|
|
421
|
-
if (self.name.endswith('s')
|
|
422
|
-
and self.name[:-1] == column.name):
|
|
423
|
-
return True
|
|
424
|
-
return False
|
|
425
|
-
|
|
426
|
-
candidates = [
|
|
427
|
-
column.name for column in self.columns if is_candidate(column)
|
|
428
|
-
]
|
|
429
|
-
|
|
430
|
-
if primary_key := infer_primary_key(
|
|
431
|
-
table_name=self.name,
|
|
432
|
-
df=self._sample_df,
|
|
433
|
-
candidates=candidates,
|
|
434
|
-
):
|
|
435
|
-
self.primary_key = primary_key
|
|
436
|
-
logs.append(f"primary key '{primary_key}'")
|
|
437
|
-
|
|
438
|
-
# Try to detect time column if not set:
|
|
439
548
|
if not self.has_time_column():
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
and column.name != self._end_time_column
|
|
444
|
-
]
|
|
445
|
-
if time_column := infer_time_column(
|
|
446
|
-
df=self._sample_df,
|
|
447
|
-
candidates=candidates,
|
|
448
|
-
):
|
|
449
|
-
self.time_column = time_column
|
|
450
|
-
logs.append(f"time column '{time_column}'")
|
|
549
|
+
self.infer_time_column(verbose=False)
|
|
550
|
+
if self.has_time_column():
|
|
551
|
+
logs.append(f"time column `{self._time_column}`")
|
|
451
552
|
|
|
452
553
|
if verbose and len(logs) > 0:
|
|
453
|
-
|
|
554
|
+
display.message(f"Inferred {' and '.join(logs)} for table "
|
|
555
|
+
f"`{self.name}`")
|
|
454
556
|
|
|
455
557
|
return self
|
|
456
558
|
|
|
@@ -468,6 +570,118 @@ class Table(ABC):
|
|
|
468
570
|
end_time_col=self._end_time_column,
|
|
469
571
|
)
|
|
470
572
|
|
|
573
|
+
@cached_property
|
|
574
|
+
def _source_column_dict(self) -> dict[str, SourceColumn]:
|
|
575
|
+
source_columns = self._get_source_columns()
|
|
576
|
+
if len(source_columns) == 0:
|
|
577
|
+
raise ValueError(f"Table '{self.name}' has no columns")
|
|
578
|
+
return {column.name: column for column in source_columns}
|
|
579
|
+
|
|
580
|
+
@cached_property
|
|
581
|
+
def _source_primary_key(self) -> str | None:
|
|
582
|
+
primary_keys = [
|
|
583
|
+
column.name for column in self._source_column_dict.values()
|
|
584
|
+
if column.is_primary_key
|
|
585
|
+
]
|
|
586
|
+
# NOTE No composite keys yet.
|
|
587
|
+
return primary_keys[0] if len(primary_keys) == 1 else None
|
|
588
|
+
|
|
589
|
+
@cached_property
|
|
590
|
+
def _source_foreign_key_dict(self) -> dict[str, SourceForeignKey]:
|
|
591
|
+
return {key.name: key for key in self._get_source_foreign_keys()}
|
|
592
|
+
|
|
593
|
+
@cached_property
|
|
594
|
+
def _source_sample_df(self) -> pd.DataFrame:
|
|
595
|
+
return self._get_source_sample_df().reset_index(drop=True)
|
|
596
|
+
|
|
597
|
+
@cached_property
|
|
598
|
+
def _num_rows(self) -> int | None:
|
|
599
|
+
return self._get_num_rows()
|
|
600
|
+
|
|
601
|
+
def _get_sample_df(self) -> pd.DataFrame:
|
|
602
|
+
dfs: list[pd.DataFrame] = []
|
|
603
|
+
if any(column.is_source for column in self.columns):
|
|
604
|
+
dfs.append(self._source_sample_df)
|
|
605
|
+
if any(not column.is_source for column in self.columns):
|
|
606
|
+
dfs.append(self._expr_sample_df)
|
|
607
|
+
|
|
608
|
+
if len(dfs) == 0:
|
|
609
|
+
return pd.DataFrame(index=range(1000))
|
|
610
|
+
if len(dfs) == 1:
|
|
611
|
+
return dfs[0]
|
|
612
|
+
|
|
613
|
+
size = min(map(len, dfs))
|
|
614
|
+
df = pd.concat([dfs[0].iloc[:size], dfs[1].iloc[:size]], axis=1)
|
|
615
|
+
df = df.loc[:, ~df.columns.duplicated(keep='last')]
|
|
616
|
+
return df
|
|
617
|
+
|
|
618
|
+
@staticmethod
|
|
619
|
+
def _sanitize(
|
|
620
|
+
df: pd.DataFrame,
|
|
621
|
+
dtype_dict: dict[str, Dtype | None] | None = None,
|
|
622
|
+
stype_dict: dict[str, Stype | None] | None = None,
|
|
623
|
+
) -> pd.DataFrame:
|
|
624
|
+
r"""Sanitzes a :class:`pandas.DataFrame` in-place such that its data
|
|
625
|
+
types match table data and semantic type specification.
|
|
626
|
+
"""
|
|
627
|
+
def _to_datetime(ser: pd.Series) -> pd.Series:
|
|
628
|
+
if (not pd.api.types.is_datetime64_any_dtype(ser)
|
|
629
|
+
and not (isinstance(ser.dtype, pd.ArrowDtype) and
|
|
630
|
+
pa.types.is_timestamp(ser.dtype.pyarrow_dtype))):
|
|
631
|
+
with warnings.catch_warnings():
|
|
632
|
+
warnings.filterwarnings(
|
|
633
|
+
'ignore',
|
|
634
|
+
message='Could not infer format',
|
|
635
|
+
)
|
|
636
|
+
ser = pd.to_datetime(ser, errors='coerce')
|
|
637
|
+
if (isinstance(ser.dtype, pd.DatetimeTZDtype)
|
|
638
|
+
or (isinstance(ser.dtype, pd.ArrowDtype)
|
|
639
|
+
and ser.dtype.pyarrow_dtype.tz is not None)):
|
|
640
|
+
ser = ser.dt.tz_localize(None)
|
|
641
|
+
if ser.dtype != 'datetime64[ns]':
|
|
642
|
+
ser = ser.astype('datetime64[ns]')
|
|
643
|
+
return ser
|
|
644
|
+
|
|
645
|
+
def _to_list(ser: pd.Series, dtype: Dtype | None) -> pd.Series:
|
|
646
|
+
if (pd.api.types.is_string_dtype(ser)
|
|
647
|
+
and dtype in {Dtype.intlist, Dtype.floatlist}):
|
|
648
|
+
try:
|
|
649
|
+
ser = ser.map(lambda row: np.fromstring(
|
|
650
|
+
row.strip('[]'),
|
|
651
|
+
sep=',',
|
|
652
|
+
dtype=int if dtype == Dtype.intlist else np.float32,
|
|
653
|
+
) if row is not None else None)
|
|
654
|
+
except Exception:
|
|
655
|
+
pass
|
|
656
|
+
|
|
657
|
+
if pd.api.types.is_string_dtype(ser):
|
|
658
|
+
try:
|
|
659
|
+
import orjson as json
|
|
660
|
+
except ImportError:
|
|
661
|
+
import json
|
|
662
|
+
try:
|
|
663
|
+
ser = ser.map(lambda row: json.loads(row)
|
|
664
|
+
if row is not None else None)
|
|
665
|
+
except Exception:
|
|
666
|
+
pass
|
|
667
|
+
|
|
668
|
+
return ser
|
|
669
|
+
|
|
670
|
+
for column_name in df.columns:
|
|
671
|
+
dtype = (dtype_dict or {}).get(column_name)
|
|
672
|
+
stype = (stype_dict or {}).get(column_name)
|
|
673
|
+
|
|
674
|
+
if dtype == Dtype.time:
|
|
675
|
+
df[column_name] = _to_datetime(df[column_name])
|
|
676
|
+
elif stype == Stype.timestamp:
|
|
677
|
+
df[column_name] = _to_datetime(df[column_name])
|
|
678
|
+
elif dtype is not None and dtype.is_list():
|
|
679
|
+
df[column_name] = _to_list(df[column_name], dtype)
|
|
680
|
+
elif stype == Stype.sequence:
|
|
681
|
+
df[column_name] = _to_list(df[column_name], Dtype.floatlist)
|
|
682
|
+
|
|
683
|
+
return df
|
|
684
|
+
|
|
471
685
|
# Python builtins #########################################################
|
|
472
686
|
|
|
473
687
|
def __hash__(self) -> int:
|
|
@@ -496,45 +710,32 @@ class Table(ABC):
|
|
|
496
710
|
f' end_time_column={self._end_time_column},\n'
|
|
497
711
|
f')')
|
|
498
712
|
|
|
499
|
-
# Abstract
|
|
713
|
+
# Abstract Methods ########################################################
|
|
500
714
|
|
|
501
|
-
@
|
|
502
|
-
|
|
503
|
-
|
|
715
|
+
@property
|
|
716
|
+
@abstractmethod
|
|
717
|
+
def backend(self) -> DataBackend:
|
|
718
|
+
r"""The data backend of this table."""
|
|
504
719
|
|
|
505
720
|
@abstractmethod
|
|
506
|
-
def _get_source_columns(self) ->
|
|
721
|
+
def _get_source_columns(self) -> list[SourceColumn]:
|
|
507
722
|
pass
|
|
508
723
|
|
|
509
|
-
@cached_property
|
|
510
|
-
def _source_foreign_key_dict(self) -> Dict[str, SourceForeignKey]:
|
|
511
|
-
fkeys = self._get_source_foreign_keys()
|
|
512
|
-
# NOTE Drop all keys that link to different primary keys in the same
|
|
513
|
-
# table since we don't support composite keys yet:
|
|
514
|
-
table_pkeys: Dict[str, Set[str]] = defaultdict(set)
|
|
515
|
-
for fkey in fkeys:
|
|
516
|
-
table_pkeys[fkey.dst_table].add(fkey.primary_key)
|
|
517
|
-
return {
|
|
518
|
-
fkey.name: fkey
|
|
519
|
-
for fkey in fkeys if len(table_pkeys[fkey.dst_table]) == 1
|
|
520
|
-
}
|
|
521
|
-
|
|
522
724
|
@abstractmethod
|
|
523
|
-
def _get_source_foreign_keys(self) ->
|
|
725
|
+
def _get_source_foreign_keys(self) -> list[SourceForeignKey]:
|
|
524
726
|
pass
|
|
525
727
|
|
|
526
|
-
@cached_property
|
|
527
|
-
def _sample_df(self) -> pd.DataFrame:
|
|
528
|
-
return self._get_sample_df()
|
|
529
|
-
|
|
530
728
|
@abstractmethod
|
|
531
|
-
def
|
|
729
|
+
def _get_source_sample_df(self) -> pd.DataFrame:
|
|
532
730
|
pass
|
|
533
731
|
|
|
534
|
-
@
|
|
535
|
-
def
|
|
536
|
-
|
|
732
|
+
@abstractmethod
|
|
733
|
+
def _get_expr_sample_df(
|
|
734
|
+
self,
|
|
735
|
+
columns: Sequence[ColumnSpec],
|
|
736
|
+
) -> pd.DataFrame:
|
|
737
|
+
pass
|
|
537
738
|
|
|
538
739
|
@abstractmethod
|
|
539
|
-
def _get_num_rows(self) ->
|
|
740
|
+
def _get_num_rows(self) -> int | None:
|
|
540
741
|
pass
|