kumoai 2.12.0.dev202510231830__cp311-cp311-win_amd64.whl → 2.14.0.dev202512311733__cp311-cp311-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kumoai/__init__.py +41 -35
- kumoai/_version.py +1 -1
- kumoai/client/client.py +15 -13
- kumoai/client/endpoints.py +1 -0
- kumoai/client/jobs.py +24 -0
- kumoai/client/pquery.py +6 -2
- kumoai/client/rfm.py +35 -7
- kumoai/connector/utils.py +23 -2
- kumoai/experimental/rfm/__init__.py +191 -48
- kumoai/experimental/rfm/authenticate.py +3 -4
- kumoai/experimental/rfm/backend/__init__.py +0 -0
- kumoai/experimental/rfm/backend/local/__init__.py +42 -0
- kumoai/experimental/rfm/{local_graph_store.py → backend/local/graph_store.py} +65 -127
- kumoai/experimental/rfm/backend/local/sampler.py +312 -0
- kumoai/experimental/rfm/backend/local/table.py +113 -0
- kumoai/experimental/rfm/backend/snow/__init__.py +37 -0
- kumoai/experimental/rfm/backend/snow/sampler.py +297 -0
- kumoai/experimental/rfm/backend/snow/table.py +242 -0
- kumoai/experimental/rfm/backend/sqlite/__init__.py +32 -0
- kumoai/experimental/rfm/backend/sqlite/sampler.py +398 -0
- kumoai/experimental/rfm/backend/sqlite/table.py +184 -0
- kumoai/experimental/rfm/base/__init__.py +30 -0
- kumoai/experimental/rfm/base/column.py +152 -0
- kumoai/experimental/rfm/base/expression.py +44 -0
- kumoai/experimental/rfm/base/sampler.py +761 -0
- kumoai/experimental/rfm/base/source.py +19 -0
- kumoai/experimental/rfm/base/sql_sampler.py +143 -0
- kumoai/experimental/rfm/base/table.py +735 -0
- kumoai/experimental/rfm/graph.py +1237 -0
- kumoai/experimental/rfm/infer/__init__.py +8 -0
- kumoai/experimental/rfm/infer/dtype.py +82 -0
- kumoai/experimental/rfm/infer/multicategorical.py +1 -1
- kumoai/experimental/rfm/infer/pkey.py +128 -0
- kumoai/experimental/rfm/infer/stype.py +35 -0
- kumoai/experimental/rfm/infer/time_col.py +61 -0
- kumoai/experimental/rfm/pquery/__init__.py +0 -4
- kumoai/experimental/rfm/pquery/executor.py +27 -27
- kumoai/experimental/rfm/pquery/pandas_executor.py +64 -40
- kumoai/experimental/rfm/relbench.py +76 -0
- kumoai/experimental/rfm/rfm.py +386 -276
- kumoai/experimental/rfm/sagemaker.py +138 -0
- kumoai/kumolib.cp311-win_amd64.pyd +0 -0
- kumoai/pquery/predictive_query.py +10 -6
- kumoai/spcs.py +1 -3
- kumoai/testing/decorators.py +1 -1
- kumoai/testing/snow.py +50 -0
- kumoai/trainer/distilled_trainer.py +175 -0
- kumoai/trainer/trainer.py +9 -10
- kumoai/utils/__init__.py +3 -2
- kumoai/utils/display.py +51 -0
- kumoai/utils/progress_logger.py +188 -16
- kumoai/utils/sql.py +3 -0
- {kumoai-2.12.0.dev202510231830.dist-info → kumoai-2.14.0.dev202512311733.dist-info}/METADATA +13 -2
- {kumoai-2.12.0.dev202510231830.dist-info → kumoai-2.14.0.dev202512311733.dist-info}/RECORD +57 -36
- kumoai/experimental/rfm/local_graph.py +0 -810
- kumoai/experimental/rfm/local_graph_sampler.py +0 -184
- kumoai/experimental/rfm/local_pquery_driver.py +0 -494
- kumoai/experimental/rfm/local_table.py +0 -545
- kumoai/experimental/rfm/pquery/backend.py +0 -136
- kumoai/experimental/rfm/pquery/pandas_backend.py +0 -478
- kumoai/experimental/rfm/utils.py +0 -344
- {kumoai-2.12.0.dev202510231830.dist-info → kumoai-2.14.0.dev202512311733.dist-info}/WHEEL +0 -0
- {kumoai-2.12.0.dev202510231830.dist-info → kumoai-2.14.0.dev202512311733.dist-info}/licenses/LICENSE +0 -0
- {kumoai-2.12.0.dev202510231830.dist-info → kumoai-2.14.0.dev202512311733.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,735 @@
|
|
|
1
|
+
import warnings
|
|
2
|
+
from abc import ABC, abstractmethod
|
|
3
|
+
from collections.abc import Sequence
|
|
4
|
+
from functools import cached_property
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
import pandas as pd
|
|
8
|
+
from kumoapi.model_plan import MissingType
|
|
9
|
+
from kumoapi.source_table import UnavailableSourceTable
|
|
10
|
+
from kumoapi.table import Column as ColumnDefinition
|
|
11
|
+
from kumoapi.table import TableDefinition
|
|
12
|
+
from kumoapi.typing import Dtype, Stype
|
|
13
|
+
from typing_extensions import Self
|
|
14
|
+
|
|
15
|
+
from kumoai.experimental.rfm.base import (
|
|
16
|
+
Column,
|
|
17
|
+
ColumnSpec,
|
|
18
|
+
ColumnSpecType,
|
|
19
|
+
DataBackend,
|
|
20
|
+
SourceColumn,
|
|
21
|
+
SourceForeignKey,
|
|
22
|
+
)
|
|
23
|
+
from kumoai.experimental.rfm.infer import (
|
|
24
|
+
infer_dtype,
|
|
25
|
+
infer_primary_key,
|
|
26
|
+
infer_stype,
|
|
27
|
+
infer_time_column,
|
|
28
|
+
)
|
|
29
|
+
from kumoai.utils import display, quote_ident
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class Table(ABC):
    r"""A :class:`Table` fully specifies the relevant metadata of a single
    table, *i.e.* its selected columns, data types, semantic types, primary
    keys and time columns.

    Args:
        name: The name of this table.
        source_name: The source name of this table. If set to ``None``,
            ``name`` is being used.
        columns: The selected columns of this table.
        primary_key: The name of the primary key of this table, if it exists.
        time_column: The name of the time column of this table, if it exists.
        end_time_column: The name of the end time column of this table, if it
            exists.
    """
    # Number of rows used for sampling-based dtype/stype inference:
    _NUM_SAMPLE_ROWS = 1_000

    def __init__(
        self,
        name: str,
        source_name: str | None = None,
        columns: Sequence[ColumnSpecType] | None = None,
        primary_key: MissingType | str | None = MissingType.VALUE,
        time_column: str | None = None,
        end_time_column: str | None = None,
    ) -> None:

        self._name = name
        self._source_name = source_name or name
        self._column_dict: dict[str, Column] = {}
        self._primary_key: str | None = None
        self._time_column: str | None = None
        self._end_time_column: str | None = None
        # Rolling sample of evaluated expression columns; extended lazily by
        # `add_columns` whenever non-source (expression) columns are added:
        self._expr_sample_df = pd.DataFrame(index=range(self._NUM_SAMPLE_ROWS))

        # Select all source columns by default:
        if columns is None:
            columns = list(self._source_column_dict.keys())

        self.add_columns(columns)

        if isinstance(primary_key, MissingType):
            # `MissingType.VALUE` acts as a "not specified" sentinel.
            # Infer primary key from source metadata, but only set it in case
            # it is already part of the column set (don't magically add it):
            if any(column.is_source for column in self.columns):
                primary_key = self._source_primary_key
                if (primary_key is not None and primary_key in self
                        and self[primary_key].is_source):
                    self.primary_key = primary_key
        elif primary_key is not None:
            # Explicitly specified primary keys are added on demand:
            if primary_key not in self:
                self.add_column(primary_key)
            self.primary_key = primary_key

        if time_column is not None:
            if time_column not in self:
                self.add_column(time_column)
            self.time_column = time_column

        if end_time_column is not None:
            if end_time_column not in self:
                self.add_column(end_time_column)
            self.end_time_column = end_time_column
|
|
94
|
+
|
|
95
|
+
@property
def name(self) -> str:
    r"""The name of this table."""
    return self._name

@property
def source_name(self) -> str:
    r"""The name of this table in the underlying data source."""
    return self._source_name

@property
def _quoted_source_name(self) -> str:
    # Source name escaped via :func:`quote_ident` for identifier usage:
    return quote_ident(self._source_name)
|
|
108
|
+
|
|
109
|
+
# Column ##################################################################
|
|
110
|
+
|
|
111
|
+
def has_column(self, name: str) -> bool:
    r"""Returns ``True`` if this table holds a column with name ``name``;
    ``False`` otherwise.
    """
    return name in self._column_dict

def column(self, name: str) -> Column:
    r"""Returns the data column named with name ``name`` in this table.

    Args:
        name: The name of the column.

    Raises:
        KeyError: If ``name`` is not present in this table.
    """
    if name not in self._column_dict:
        raise KeyError(f"Column '{name}' not found in table '{self.name}'")
    return self._column_dict[name]

@property
def columns(self) -> list[Column]:
    r"""Returns a list of :class:`Column` objects that represent the
    columns in this table (in insertion order).
    """
    return [*self._column_dict.values()]
|
|
136
|
+
|
|
137
|
+
def add_columns(self, columns: Sequence[ColumnSpecType]) -> None:
    r"""Adds a set of columns to this table.

    Args:
        columns: The columns to add.

    Raises:
        KeyError: If any of the column names already exist in this table
            (or appear more than once in ``columns``).
        ValueError: If a source column does not exist in the underlying
            source table or holds an unsupported data type.
        RuntimeError: If the data type or semantic type of a column cannot
            be inferred from sampled data.
    """
    if len(columns) == 0:
        return

    column_specs = [ColumnSpec.coerce(column) for column in columns]

    # Fail fast on duplicate column names (both against existing columns
    # and within the given batch) *before* running any expensive
    # expression sampling, so a failing call leaves the table unchanged:
    seen: set[str] = set()
    for column_spec in column_specs:
        if column_spec.name in self or column_spec.name in seen:
            raise KeyError(f"Column '{column_spec.name}' already exists "
                           f"in table '{self.name}'")
        seen.add(column_spec.name)

    # Obtain a batch-wise sample for all column expressions:
    expr_specs = [spec for spec in column_specs if not spec.is_source]
    if len(expr_specs) > 0:
        dfs = [
            self._expr_sample_df,
            self._get_expr_sample_df(expr_specs).reset_index(drop=True),
        ]
        size = min(map(len, dfs))
        df = pd.concat([dfs[0].iloc[:size], dfs[1].iloc[:size]], axis=1)
        # On name clashes, keep the freshly sampled expression column:
        df = df.loc[:, ~df.columns.duplicated(keep='last')]
        self._expr_sample_df = df

    for column_spec in column_specs:
        dtype = column_spec.dtype
        stype = column_spec.stype

        if column_spec.is_source:
            if column_spec.name not in self._source_column_dict:
                raise ValueError(
                    f"Column '{column_spec.name}' does not exist in the "
                    f"underlying source table")

            if dtype is None:
                dtype = self._source_column_dict[column_spec.name].dtype

            if dtype == Dtype.unsupported:
                raise ValueError(
                    f"Encountered unsupported data type for column "
                    f"'{column_spec.name}' in table '{self.name}'. Please "
                    f"either change the column's data type or remove the "
                    f"column from this table.")

        if dtype is None or stype is None:
            # The same sampled series drives both dtype and stype
            # inference (previously fetched twice):
            if column_spec.is_source:
                ser = self._source_sample_df[column_spec.name]
            else:
                ser = self._expr_sample_df[column_spec.name]

            if dtype is None:
                try:
                    dtype = infer_dtype(ser)
                except Exception as e:
                    raise RuntimeError(
                        f"Encountered unsupported data type '{ser.dtype}' "
                        f"for column '{column_spec.name}' in table "
                        f"'{self.name}'. Please either manually override "
                        f"the column's data type or remove the column from "
                        f"this table.") from e

            if stype is None:
                try:
                    stype = infer_stype(ser, column_spec.name, dtype)
                except Exception as e:
                    raise RuntimeError(
                        f"Could not determine semantic type for column "
                        f"'{column_spec.name}' with data type '{dtype}' in "
                        f"table '{self.name}'. Please either change the "
                        f"column's data type or remove the column from this "
                        f"table.") from e

        self._column_dict[column_spec.name] = Column(
            name=column_spec.name,
            expr=column_spec.expr,
            dtype=dtype,
            stype=stype,
        )
|
|
222
|
+
|
|
223
|
+
def add_column(self, column: ColumnSpecType) -> Column:
    r"""Adds a column to this table.

    Args:
        column: The column to add.

    Returns:
        The newly added :class:`Column`.

    Raises:
        KeyError: If the column name already exists in this table.
    """
    spec = ColumnSpec.coerce(column)
    self.add_columns([spec])
    return self[spec.name]

def remove_column(self, name: str) -> Self:
    r"""Removes a column from this table.

    Args:
        name: The name of the column.

    Raises:
        KeyError: If ``name`` is not present in this table.
    """
    if not self.has_column(name):
        raise KeyError(f"Column '{name}' not found in table '{self.name}'")

    # Detach any special role the column currently plays:
    if name == self._primary_key:
        self.primary_key = None
    if name == self._time_column:
        self.time_column = None
    if name == self._end_time_column:
        self.end_time_column = None

    del self._column_dict[name]
    return self
|
|
257
|
+
|
|
258
|
+
# Primary key #############################################################
|
|
259
|
+
|
|
260
|
+
def has_primary_key(self) -> bool:
    r"""Returns ``True``` if this table has a primary key; ``False``
    otherwise.
    """
    return self._primary_key is not None

@property
def primary_key(self) -> Column | None:
    r"""The primary key column of this table.

    The getter returns the primary key column of this table, or ``None`` if
    no such primary key is present.

    The setter sets a column as a primary key on this table, and raises a
    :class:`ValueError` if the primary key has a non-ID semantic type or
    if the column name does not match a column in the data frame.
    """
    name = self._primary_key
    return self[name] if name is not None else None

@primary_key.setter
def primary_key(self, name: str | None) -> None:
    if name is not None:
        if name == self._time_column:
            raise ValueError(f"Cannot specify column '{name}' as a primary "
                             f"key since it is already defined to be a time "
                             f"column")
        if name == self._end_time_column:
            raise ValueError(f"Cannot specify column '{name}' as a primary "
                             f"key since it is already defined to be an end "
                             f"time column")

    # Demote the current primary key (if any) before assigning a new one:
    previous = self.primary_key
    if previous is not None:
        previous._is_primary_key = False

    if name is None:
        self._primary_key = None
        return

    # Assigning a primary key forces the ID semantic type:
    column = self[name]
    column.stype = Stype.ID
    column._is_primary_key = True
    self._primary_key = name
|
|
302
|
+
|
|
303
|
+
# Time column #############################################################
|
|
304
|
+
|
|
305
|
+
def has_time_column(self) -> bool:
    r"""Returns ``True`` if this table has a time column; ``False``
    otherwise.
    """
    return self._time_column is not None

@property
def time_column(self) -> Column | None:
    r"""The time column of this table.

    The getter returns the time column of this table, or ``None`` if no
    such time column is present.

    The setter sets a column as a time column on this table, and raises a
    :class:`ValueError` if the time column has a non-timestamp semantic
    type or if the column name does not match a column in the data frame.
    """
    name = self._time_column
    return self[name] if name is not None else None

@time_column.setter
def time_column(self, name: str | None) -> None:
    if name is not None:
        if name == self._primary_key:
            raise ValueError(f"Cannot specify column '{name}' as a time "
                             f"column since it is already defined to be a "
                             f"primary key")
        if name == self._end_time_column:
            raise ValueError(f"Cannot specify column '{name}' as a time "
                             f"column since it is already defined to be an "
                             f"end time column")

    # Demote the current time column (if any) before assigning a new one:
    previous = self.time_column
    if previous is not None:
        previous._is_time_column = False

    if name is None:
        self._time_column = None
        return

    # Assigning a time column forces the timestamp semantic type:
    column = self[name]
    column.stype = Stype.timestamp
    column._is_time_column = True
    self._time_column = name
|
|
347
|
+
|
|
348
|
+
# End Time column #########################################################
|
|
349
|
+
|
|
350
|
+
def has_end_time_column(self) -> bool:
    r"""Returns ``True`` if this table has an end time column; ``False``
    otherwise.
    """
    return self._end_time_column is not None

@property
def end_time_column(self) -> Column | None:
    r"""The end time column of this table.

    The getter returns the end time column of this table, or ``None`` if no
    such end time column is present.

    The setter sets a column as an end time column on this table, and
    raises a :class:`ValueError` if the end time column has a non-timestamp
    semantic type or if the column name does not match a column in the data
    frame.
    """
    name = self._end_time_column
    return self[name] if name is not None else None

@end_time_column.setter
def end_time_column(self, name: str | None) -> None:
    if name is not None:
        if name == self._primary_key:
            raise ValueError(f"Cannot specify column '{name}' as an end time "
                             f"column since it is already defined to be a "
                             f"primary key")
        if name == self._time_column:
            raise ValueError(f"Cannot specify column '{name}' as an end time "
                             f"column since it is already defined to be a "
                             f"time column")

    # Demote the current end time column (if any) before assigning anew:
    previous = self.end_time_column
    if previous is not None:
        previous._is_end_time_column = False

    if name is None:
        self._end_time_column = None
        return

    # Assigning an end time column forces the timestamp semantic type:
    column = self[name]
    column.stype = Stype.timestamp
    column._is_end_time_column = True
    self._end_time_column = name
|
|
393
|
+
|
|
394
|
+
# Metadata ################################################################
|
|
395
|
+
|
|
396
|
+
@property
def metadata(self) -> pd.DataFrame:
    r"""Returns a :class:`pandas.DataFrame` object containing metadata
    information about the columns in this table.

    The returned dataframe has columns ``name``, ``dtype``, ``stype``,
    ``is_primary_key``, ``is_time_column`` and ``is_end_time_column``,
    which provide an aggregate view of the properties of the columns of
    this table.

    Example:
        >>> # doctest: +SKIP
        >>> import kumoai.experimental.rfm as rfm
        >>> table = rfm.LocalTable(df=..., name=...).infer_metadata()
        >>> table.metadata
                 name    dtype stype  is_primary_key  is_time_column  is_end_time_column
        0  CustomerID  float64    ID            True           False               False
    """  # noqa: E501
    cols = self.columns
    names = [col.name for col in cols]

    def _flags(target: str | None) -> list[bool]:
        # Mark the column (if any) that plays the given special role:
        return [name == target for name in names]

    return pd.DataFrame({
        'name':
        pd.Series(data=names, dtype=str),
        'dtype':
        pd.Series(data=[col.dtype for col in cols], dtype=str),
        'stype':
        pd.Series(data=[col.stype for col in cols], dtype=str),
        'is_primary_key':
        pd.Series(data=_flags(self._primary_key), dtype=bool),
        'is_time_column':
        pd.Series(data=_flags(self._time_column), dtype=bool),
        'is_end_time_column':
        pd.Series(data=_flags(self._end_time_column), dtype=bool),
    })
|
|
439
|
+
|
|
440
|
+
def print_metadata(self) -> None:
    r"""Prints the :meth:`~metadata` of this table."""
    title = f"🏷️ Metadata of Table `{self.name}`"
    num = self._num_rows
    if num:  # Row count may be `None` (unknown) or zero; skip both.
        title += " (1 row)" if num == 1 else f" ({num:,} rows)"
    display.title(title)
    display.dataframe(self.metadata)
|
|
448
|
+
|
|
449
|
+
def infer_primary_key(self, verbose: bool = True) -> Self:
    r"""Infers the primary key in this table.

    Inference proceeds in priority order: (1) a primary key declared in
    source metadata, (2) a single unique-key source column, (3) a
    heuristic over ID-typed or table-name-matching columns. The first
    match wins; if nothing matches, the table is left unchanged.

    Args:
        verbose: Whether to print verbose output.
    """
    # Never override an already-assigned primary key:
    if self.has_primary_key():
        return self

    def _set_primary_key(primary_key: str) -> None:
        self.primary_key = primary_key
        if verbose:
            display.message(f"Inferred primary key `{primary_key}` for "
                            f"table `{self.name}`")

    # Inference from source column metadata:
    if any(column.is_source for column in self.columns):
        primary_key = self._source_primary_key
        if (primary_key is not None and primary_key in self
                and self[primary_key].is_source):
            _set_primary_key(primary_key)
            return self

        unique_keys = [
            column.name for column in self._source_column_dict.values()
            if column.is_unique_key
        ]
        if (len(unique_keys) == 1  # NOTE No composite keys yet.
                and unique_keys[0] in self
                and self[unique_keys[0]].is_source):
            _set_primary_key(unique_keys[0])
            return self

    # Heuristic-based inference:
    candidates = [
        column.name for column in self.columns if column.stype == Stype.ID
    ]
    if len(candidates) == 0:
        # Fall back to columns matching the table name, also accepting a
        # singular column name for a plural table name (e.g. `users.user`):
        for column in self.columns:
            if self.name.lower() == column.name.lower():
                candidates.append(column.name)
            elif (self.name.lower().endswith('s')
                    and self.name.lower()[:-1] == column.name.lower()):
                candidates.append(column.name)

    if primary_key := infer_primary_key(
            table_name=self.name,
            df=self._get_sample_df(),
            candidates=candidates,
    ):
        _set_primary_key(primary_key)
        return self

    return self
|
|
503
|
+
|
|
504
|
+
def infer_time_column(self, verbose: bool = True) -> Self:
    r"""Infers the time column in this table.

    Args:
        verbose: Whether to print verbose output.
    """
    # Never override an already-assigned time column:
    if self.has_time_column():
        return self

    # Heuristic-based inference over timestamp columns that do not already
    # serve as the end time column:
    candidates = []
    for column in self.columns:
        if (column.stype == Stype.timestamp
                and column.name != self._end_time_column):
            candidates.append(column.name)

    time_column = infer_time_column(
        df=self._get_sample_df(),
        candidates=candidates,
    )
    if time_column:
        self.time_column = time_column
        if verbose:
            display.message(f"Inferred time column `{time_column}` for "
                            f"table `{self.name}`")

    return self
|
|
531
|
+
|
|
532
|
+
def infer_metadata(self, verbose: bool = True) -> Self:
    r"""Infers metadata, *i.e.*, primary keys and time columns, in this
    table.

    Args:
        verbose: Whether to print verbose output.
    """
    inferred: list[str] = []

    if not self.has_primary_key():
        self.infer_primary_key(verbose=False)
        if self.has_primary_key():
            inferred.append(f"primary key `{self._primary_key}`")

    if not self.has_time_column():
        self.infer_time_column(verbose=False)
        if self.has_time_column():
            inferred.append(f"time column `{self._time_column}`")

    # Emit a single combined message for everything that was inferred:
    if verbose and inferred:
        display.message(f"Inferred {' and '.join(inferred)} for table "
                        f"`{self.name}`")

    return self
|
|
556
|
+
|
|
557
|
+
# Helpers #################################################################
|
|
558
|
+
|
|
559
|
+
def _to_api_table_definition(self) -> TableDefinition:
    # Converts this table's metadata into its public API representation.
    # Only metadata is transferred; the raw data itself is marked as an
    # unavailable source table.
    column_defs = [
        ColumnDefinition(column.name, column.stype, column.dtype)
        for column in self.columns
    ]
    return TableDefinition(
        cols=column_defs,
        source_table=UnavailableSourceTable(table=self.name),
        pkey=self._primary_key,
        time_col=self._time_column,
        end_time_col=self._end_time_column,
    )
|
|
570
|
+
|
|
571
|
+
@cached_property
def _source_column_dict(self) -> dict[str, SourceColumn]:
    # Source column metadata, keyed by column name (computed once):
    source_columns = self._get_source_columns()
    if not source_columns:
        raise ValueError(f"Table '{self.name}' has no columns")
    return {column.name: column for column in source_columns}

@cached_property
def _source_primary_key(self) -> str | None:
    # NOTE No composite keys yet.
    names = [
        column.name for column in self._source_column_dict.values()
        if column.is_primary_key
    ]
    return names[0] if len(names) == 1 else None

@cached_property
def _source_foreign_key_dict(self) -> dict[str, SourceForeignKey]:
    # Source foreign key metadata, keyed by column name:
    return {key.name: key for key in self._get_source_foreign_keys()}

@cached_property
def _source_sample_df(self) -> pd.DataFrame:
    # Sample of raw source data with a clean positional index:
    return self._get_source_sample_df().reset_index(drop=True)

@cached_property
def _num_rows(self) -> int | None:
    # Caches the (potentially expensive) row count lookup:
    return self._get_num_rows()
|
|
598
|
+
|
|
599
|
+
def _get_sample_df(self) -> pd.DataFrame:
    r"""Returns a combined sample holding both source and expression
    columns of this table (expression columns win on name clashes).
    """
    dfs: list[pd.DataFrame] = []
    if any(column.is_source for column in self.columns):
        dfs.append(self._source_sample_df)
    if any(not column.is_source for column in self.columns):
        dfs.append(self._expr_sample_df)

    if len(dfs) == 0:
        # No columns at all: return an empty frame of the default sample
        # size (previously hard-coded to `1000`; use the class constant to
        # stay in sync with `_NUM_SAMPLE_ROWS`):
        return pd.DataFrame(index=range(self._NUM_SAMPLE_ROWS))
    if len(dfs) == 1:
        return dfs[0]

    # Align both samples to the shorter one and concatenate side-by-side:
    size = min(map(len, dfs))
    df = pd.concat([dfs[0].iloc[:size], dfs[1].iloc[:size]], axis=1)
    # On duplicate names, keep the expression column (appended last):
    df = df.loc[:, ~df.columns.duplicated(keep='last')]
    return df
|
|
615
|
+
|
|
616
|
+
@staticmethod
def _sanitize(
    df: pd.DataFrame,
    dtype_dict: dict[str, Dtype | None] | None = None,
    stype_dict: dict[str, Stype | None] | None = None,
) -> pd.DataFrame:
    r"""Sanitizes a :class:`pandas.DataFrame` in-place such that its data
    types match table data and semantic type specification.

    Args:
        df: The data frame to sanitize (modified in-place).
        dtype_dict: An optional mapping from column name to its data type.
        stype_dict: An optional mapping from column name to its semantic
            type.

    Returns:
        The sanitized data frame.
    """
    def _to_datetime(ser: pd.Series) -> pd.Series:
        # Normalize to timezone-naive 'datetime64[ns]'; values that cannot
        # be parsed become `NaT`:
        if not pd.api.types.is_datetime64_any_dtype(ser):
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    'ignore',
                    message='Could not infer format',
                )
                ser = pd.to_datetime(ser, errors='coerce')
        if isinstance(ser.dtype, pd.DatetimeTZDtype):
            ser = ser.dt.tz_localize(None)
        if ser.dtype != 'datetime64[ns]':
            ser = ser.astype('datetime64[ns]')
        return ser

    def _to_list(ser: pd.Series, dtype: Dtype | None) -> pd.Series:
        # Best-effort parsing of string-encoded lists; the series is left
        # untouched whenever parsing fails:
        if (pd.api.types.is_string_dtype(ser)
                and dtype in {Dtype.intlist, Dtype.floatlist}):
            try:
                ser = ser.map(lambda row: np.fromstring(
                    row.strip('[]'),
                    sep=',',
                    dtype=int if dtype == Dtype.intlist else np.float32,
                ) if row is not None else None)
            except Exception:
                pass

        if pd.api.types.is_string_dtype(ser):
            try:
                import orjson as json  # Faster, optional JSON parser.
            except ImportError:
                import json
            try:
                ser = ser.map(lambda row: json.loads(row)
                              if row is not None else None)
            except Exception:
                pass

        return ser

    for column_name in df.columns:
        dtype = (dtype_dict or {}).get(column_name)
        stype = (stype_dict or {}).get(column_name)

        # `Dtype.time` and `Stype.timestamp` both imply datetime data
        # (previously two separate but identical branches):
        if dtype == Dtype.time or stype == Stype.timestamp:
            df[column_name] = _to_datetime(df[column_name])
        elif dtype is not None and dtype.is_list():
            df[column_name] = _to_list(df[column_name], dtype)
        elif stype == Stype.sequence:
            df[column_name] = _to_list(df[column_name], Dtype.floatlist)

    return df
|
|
678
|
+
|
|
679
|
+
# Python builtins #########################################################
|
|
680
|
+
|
|
681
|
+
def __hash__(self) -> int:
    # Identity covers all columns plus the special-role assignments:
    return hash((
        *self.columns,
        self.primary_key,
        self.time_column,
        self.end_time_column,
    ))

def __contains__(self, name: str) -> bool:
    # Enables `name in table` syntax:
    return self.has_column(name)

def __getitem__(self, name: str) -> Column:
    # Enables `table[name]` syntax:
    return self.column(name)

def __delitem__(self, name: str) -> None:
    # Enables `del table[name]` syntax:
    self.remove_column(name)
|
|
697
|
+
|
|
698
|
+
def __repr__(self) -> str:
    fields = [
        f'name={self.name}',
        f'num_columns={len(self.columns)}',
        f'primary_key={self._primary_key}',
        f'time_column={self._time_column}',
        f'end_time_column={self._end_time_column}',
    ]
    body = ''.join(f'  {field},\n' for field in fields)
    return f'{self.__class__.__name__}(\n{body})'
|
|
706
|
+
|
|
707
|
+
# Abstract Methods ########################################################
|
|
708
|
+
|
|
709
|
+
@property
@abstractmethod
def backend(self) -> DataBackend:
    r"""The data backend of this table."""

@abstractmethod
def _get_source_columns(self) -> list[SourceColumn]:
    r"""Returns the column metadata of the underlying source table."""
    pass

@abstractmethod
def _get_source_foreign_keys(self) -> list[SourceForeignKey]:
    r"""Returns the foreign key metadata of the underlying source table."""
    pass

@abstractmethod
def _get_source_sample_df(self) -> pd.DataFrame:
    r"""Returns a sample of rows from the underlying source table."""
    pass

@abstractmethod
def _get_expr_sample_df(
    self,
    columns: Sequence[ColumnSpec],
) -> pd.DataFrame:
    r"""Evaluates the given expression columns on sampled rows and returns
    the result, one frame column per given column spec.
    """
    pass

@abstractmethod
def _get_num_rows(self) -> int | None:
    r"""Returns the number of rows of this table, or ``None`` if unknown."""
    pass
|