kumoai 2.13.0.dev202511181731__cp311-cp311-macosx_11_0_arm64.whl → 2.14.0.dev202512191731__cp311-cp311-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kumoai/__init__.py +12 -0
- kumoai/_version.py +1 -1
- kumoai/client/pquery.py +6 -2
- kumoai/connector/utils.py +23 -2
- kumoai/experimental/rfm/__init__.py +52 -52
- kumoai/experimental/rfm/authenticate.py +3 -4
- kumoai/experimental/rfm/backend/__init__.py +0 -0
- kumoai/experimental/rfm/backend/local/__init__.py +42 -0
- kumoai/experimental/rfm/{local_graph_store.py → backend/local/graph_store.py} +57 -110
- kumoai/experimental/rfm/backend/local/sampler.py +315 -0
- kumoai/experimental/rfm/backend/local/table.py +114 -0
- kumoai/experimental/rfm/backend/snow/__init__.py +37 -0
- kumoai/experimental/rfm/backend/snow/sampler.py +252 -0
- kumoai/experimental/rfm/backend/snow/table.py +169 -0
- kumoai/experimental/rfm/backend/sqlite/__init__.py +32 -0
- kumoai/experimental/rfm/backend/sqlite/sampler.py +349 -0
- kumoai/experimental/rfm/backend/sqlite/table.py +154 -0
- kumoai/experimental/rfm/base/__init__.py +33 -0
- kumoai/experimental/rfm/base/column.py +68 -0
- kumoai/experimental/rfm/base/column_expression.py +50 -0
- kumoai/experimental/rfm/base/sampler.py +773 -0
- kumoai/experimental/rfm/base/source.py +19 -0
- kumoai/experimental/rfm/base/sql_sampler.py +84 -0
- kumoai/experimental/rfm/base/sql_table.py +229 -0
- kumoai/experimental/rfm/{local_table.py → base/table.py} +219 -189
- kumoai/experimental/rfm/{local_graph.py → graph.py} +510 -91
- kumoai/experimental/rfm/infer/__init__.py +8 -0
- kumoai/experimental/rfm/infer/dtype.py +79 -0
- kumoai/experimental/rfm/infer/pkey.py +128 -0
- kumoai/experimental/rfm/infer/stype.py +35 -0
- kumoai/experimental/rfm/infer/time_col.py +61 -0
- kumoai/experimental/rfm/pquery/executor.py +27 -27
- kumoai/experimental/rfm/pquery/pandas_executor.py +30 -32
- kumoai/experimental/rfm/rfm.py +313 -246
- kumoai/experimental/rfm/sagemaker.py +15 -7
- kumoai/pquery/predictive_query.py +10 -6
- kumoai/testing/decorators.py +1 -1
- kumoai/testing/snow.py +50 -0
- kumoai/utils/__init__.py +3 -2
- kumoai/utils/progress_logger.py +178 -12
- kumoai/utils/sql.py +3 -0
- {kumoai-2.13.0.dev202511181731.dist-info → kumoai-2.14.0.dev202512191731.dist-info}/METADATA +10 -8
- {kumoai-2.13.0.dev202511181731.dist-info → kumoai-2.14.0.dev202512191731.dist-info}/RECORD +46 -26
- kumoai/experimental/rfm/local_graph_sampler.py +0 -184
- kumoai/experimental/rfm/local_pquery_driver.py +0 -689
- kumoai/experimental/rfm/utils.py +0 -344
- {kumoai-2.13.0.dev202511181731.dist-info → kumoai-2.14.0.dev202512191731.dist-info}/WHEEL +0 -0
- {kumoai-2.13.0.dev202511181731.dist-info → kumoai-2.14.0.dev202512191731.dist-info}/licenses/LICENSE +0 -0
- {kumoai-2.13.0.dev202511181731.dist-info → kumoai-2.14.0.dev202512191731.dist-info}/top_level.txt +0 -0
|
@@ -1,115 +1,32 @@
|
|
|
1
|
-
from
|
|
2
|
-
from
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from collections.abc import Sequence
|
|
3
|
+
from functools import cached_property
|
|
3
4
|
|
|
4
5
|
import pandas as pd
|
|
6
|
+
from kumoapi.model_plan import MissingType
|
|
5
7
|
from kumoapi.source_table import UnavailableSourceTable
|
|
6
8
|
from kumoapi.table import Column as ColumnDefinition
|
|
7
9
|
from kumoapi.table import TableDefinition
|
|
8
|
-
from kumoapi.typing import
|
|
10
|
+
from kumoapi.typing import Stype
|
|
9
11
|
from typing_extensions import Self
|
|
10
12
|
|
|
11
|
-
from kumoai import in_notebook
|
|
12
|
-
from kumoai.experimental.rfm import
|
|
13
|
+
from kumoai import in_notebook, in_snowflake_notebook
|
|
14
|
+
from kumoai.experimental.rfm.base import Column, DataBackend, SourceColumn
|
|
15
|
+
from kumoai.experimental.rfm.infer import (
|
|
16
|
+
infer_primary_key,
|
|
17
|
+
infer_stype,
|
|
18
|
+
infer_time_column,
|
|
19
|
+
)
|
|
13
20
|
|
|
14
21
|
|
|
15
|
-
|
|
16
|
-
class
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
def __init__(
|
|
20
|
-
self,
|
|
21
|
-
name: str,
|
|
22
|
-
dtype: Dtype,
|
|
23
|
-
stype: Stype,
|
|
24
|
-
is_primary_key: bool = False,
|
|
25
|
-
is_time_column: bool = False,
|
|
26
|
-
is_end_time_column: bool = False,
|
|
27
|
-
) -> None:
|
|
28
|
-
self._name = name
|
|
29
|
-
self._dtype = Dtype(dtype)
|
|
30
|
-
self._is_primary_key = is_primary_key
|
|
31
|
-
self._is_time_column = is_time_column
|
|
32
|
-
self._is_end_time_column = is_end_time_column
|
|
33
|
-
self.stype = Stype(stype)
|
|
34
|
-
|
|
35
|
-
@property
|
|
36
|
-
def name(self) -> str:
|
|
37
|
-
return self._name
|
|
38
|
-
|
|
39
|
-
@property
|
|
40
|
-
def dtype(self) -> Dtype:
|
|
41
|
-
return self._dtype
|
|
42
|
-
|
|
43
|
-
def __setattr__(self, key: str, val: Any) -> None:
|
|
44
|
-
if key == 'stype':
|
|
45
|
-
if isinstance(val, str):
|
|
46
|
-
val = Stype(val)
|
|
47
|
-
assert isinstance(val, Stype)
|
|
48
|
-
if not val.supports_dtype(self.dtype):
|
|
49
|
-
raise ValueError(f"Column '{self.name}' received an "
|
|
50
|
-
f"incompatible semantic type (got "
|
|
51
|
-
f"dtype='{self.dtype}' and stype='{val}')")
|
|
52
|
-
if self._is_primary_key and val != Stype.ID:
|
|
53
|
-
raise ValueError(f"Primary key '{self.name}' must have 'ID' "
|
|
54
|
-
f"semantic type (got '{val}')")
|
|
55
|
-
if self._is_time_column and val != Stype.timestamp:
|
|
56
|
-
raise ValueError(f"Time column '{self.name}' must have "
|
|
57
|
-
f"'timestamp' semantic type (got '{val}')")
|
|
58
|
-
if self._is_end_time_column and val != Stype.timestamp:
|
|
59
|
-
raise ValueError(f"End time column '{self.name}' must have "
|
|
60
|
-
f"'timestamp' semantic type (got '{val}')")
|
|
61
|
-
|
|
62
|
-
super().__setattr__(key, val)
|
|
63
|
-
|
|
64
|
-
def __hash__(self) -> int:
|
|
65
|
-
return hash((self.name, self.stype, self.dtype))
|
|
66
|
-
|
|
67
|
-
def __eq__(self, other: Any) -> bool:
|
|
68
|
-
if not isinstance(other, Column):
|
|
69
|
-
return False
|
|
70
|
-
return hash(self) == hash(other)
|
|
71
|
-
|
|
72
|
-
def __repr__(self) -> str:
|
|
73
|
-
return (f'{self.__class__.__name__}(name={self.name}, '
|
|
74
|
-
f'stype={self.stype}, dtype={self.dtype})')
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
class LocalTable:
|
|
78
|
-
r"""A table backed by a :class:`pandas.DataFrame`.
|
|
79
|
-
|
|
80
|
-
A :class:`LocalTable` fully specifies the relevant metadata, *i.e.*
|
|
81
|
-
selected columns, column semantic types, primary keys and time columns.
|
|
82
|
-
:class:`LocalTable` is used to create a :class:`LocalGraph`.
|
|
83
|
-
|
|
84
|
-
.. code-block:: python
|
|
85
|
-
|
|
86
|
-
import pandas as pd
|
|
87
|
-
import kumoai.experimental.rfm as rfm
|
|
88
|
-
|
|
89
|
-
# Load data from a CSV file:
|
|
90
|
-
df = pd.read_csv("data.csv")
|
|
91
|
-
|
|
92
|
-
# Create a table from a `pandas.DataFrame` and infer its metadata ...
|
|
93
|
-
table = rfm.LocalTable(df, name="my_table").infer_metadata()
|
|
94
|
-
|
|
95
|
-
# ... or create a table explicitly:
|
|
96
|
-
table = rfm.LocalTable(
|
|
97
|
-
df=df,
|
|
98
|
-
name="my_table",
|
|
99
|
-
primary_key="id",
|
|
100
|
-
time_column="time",
|
|
101
|
-
end_time_column=None,
|
|
102
|
-
)
|
|
103
|
-
|
|
104
|
-
# Verify metadata:
|
|
105
|
-
table.print_metadata()
|
|
106
|
-
|
|
107
|
-
# Change the semantic type of a column:
|
|
108
|
-
table[column].stype = "text"
|
|
22
|
+
class Table(ABC):
|
|
23
|
+
r"""A :class:`Table` fully specifies the relevant metadata of a single
|
|
24
|
+
table, *i.e.* its selected columns, data types, semantic types, primary
|
|
25
|
+
keys and time columns.
|
|
109
26
|
|
|
110
27
|
Args:
|
|
111
|
-
|
|
112
|
-
|
|
28
|
+
name: The name of this table.
|
|
29
|
+
columns: The selected columns of this table.
|
|
113
30
|
primary_key: The name of the primary key of this table, if it exists.
|
|
114
31
|
time_column: The name of the time column of this table, if it exists.
|
|
115
32
|
end_time_column: The name of the end time column of this table, if it
|
|
@@ -117,49 +34,53 @@ class LocalTable:
|
|
|
117
34
|
"""
|
|
118
35
|
def __init__(
|
|
119
36
|
self,
|
|
120
|
-
df: pd.DataFrame,
|
|
121
37
|
name: str,
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
38
|
+
columns: Sequence[str] | None = None,
|
|
39
|
+
primary_key: MissingType | str | None = MissingType.VALUE,
|
|
40
|
+
time_column: str | None = None,
|
|
41
|
+
end_time_column: str | None = None,
|
|
125
42
|
) -> None:
|
|
126
43
|
|
|
127
|
-
if df.empty:
|
|
128
|
-
raise ValueError("Data frame must have at least one row")
|
|
129
|
-
if isinstance(df.columns, pd.MultiIndex):
|
|
130
|
-
raise ValueError("Data frame must not have a multi-index")
|
|
131
|
-
if not df.columns.is_unique:
|
|
132
|
-
raise ValueError("Data frame must have unique column names")
|
|
133
|
-
if any(col == '' for col in df.columns):
|
|
134
|
-
raise ValueError("Data frame must have non-empty column names")
|
|
135
|
-
|
|
136
|
-
df = df.copy(deep=False)
|
|
137
|
-
|
|
138
|
-
self._data = df
|
|
139
44
|
self._name = name
|
|
140
|
-
self._primary_key:
|
|
141
|
-
self._time_column:
|
|
142
|
-
self._end_time_column:
|
|
45
|
+
self._primary_key: str | None = None
|
|
46
|
+
self._time_column: str | None = None
|
|
47
|
+
self._end_time_column: str | None = None
|
|
48
|
+
|
|
49
|
+
if columns is None:
|
|
50
|
+
columns = list(self._source_column_dict.keys())
|
|
143
51
|
|
|
144
|
-
self._columns:
|
|
145
|
-
for column_name in
|
|
52
|
+
self._columns: dict[str, Column] = {}
|
|
53
|
+
for column_name in columns:
|
|
146
54
|
self.add_column(column_name)
|
|
147
55
|
|
|
148
|
-
if primary_key
|
|
56
|
+
if isinstance(primary_key, MissingType):
|
|
57
|
+
# Inference from source column metadata:
|
|
58
|
+
if '_source_column_dict' in self.__dict__:
|
|
59
|
+
primary_key = self._source_primary_key
|
|
60
|
+
if (primary_key is not None and primary_key in self
|
|
61
|
+
and self[primary_key].is_physical):
|
|
62
|
+
self.primary_key = primary_key
|
|
63
|
+
elif primary_key is not None:
|
|
64
|
+
if primary_key not in self:
|
|
65
|
+
self.add_column(primary_key)
|
|
149
66
|
self.primary_key = primary_key
|
|
150
67
|
|
|
151
68
|
if time_column is not None:
|
|
69
|
+
if time_column not in self:
|
|
70
|
+
self.add_column(time_column)
|
|
152
71
|
self.time_column = time_column
|
|
153
72
|
|
|
154
73
|
if end_time_column is not None:
|
|
74
|
+
if end_time_column not in self:
|
|
75
|
+
self.add_column(end_time_column)
|
|
155
76
|
self.end_time_column = end_time_column
|
|
156
77
|
|
|
157
78
|
@property
|
|
158
79
|
def name(self) -> str:
|
|
159
|
-
r"""The name of
|
|
80
|
+
r"""The name of this table."""
|
|
160
81
|
return self._name
|
|
161
82
|
|
|
162
|
-
#
|
|
83
|
+
# Column ##################################################################
|
|
163
84
|
|
|
164
85
|
def has_column(self, name: str) -> bool:
|
|
165
86
|
r"""Returns ``True`` if this table holds a column with name ``name``;
|
|
@@ -181,7 +102,7 @@ class LocalTable:
|
|
|
181
102
|
return self._columns[name]
|
|
182
103
|
|
|
183
104
|
@property
|
|
184
|
-
def columns(self) ->
|
|
105
|
+
def columns(self) -> list[Column]:
|
|
185
106
|
r"""Returns a list of :class:`Column` objects that represent the
|
|
186
107
|
columns in this table.
|
|
187
108
|
"""
|
|
@@ -200,29 +121,26 @@ class LocalTable:
|
|
|
200
121
|
raise KeyError(f"Column '{name}' already exists in table "
|
|
201
122
|
f"'{self.name}'")
|
|
202
123
|
|
|
203
|
-
if name not in self.
|
|
204
|
-
raise KeyError(f"Column '{name}' does not exist in the
|
|
205
|
-
f"
|
|
124
|
+
if name not in self._source_column_dict:
|
|
125
|
+
raise KeyError(f"Column '{name}' does not exist in the underlying "
|
|
126
|
+
f"source table")
|
|
206
127
|
|
|
128
|
+
dtype = self._source_column_dict[name].dtype
|
|
129
|
+
|
|
130
|
+
ser = self._source_sample_df[name]
|
|
207
131
|
try:
|
|
208
|
-
|
|
209
|
-
except Exception as e:
|
|
210
|
-
raise RuntimeError(f"Data type inference for column '{name}' in "
|
|
211
|
-
f"table '{self.name}' failed. Consider "
|
|
212
|
-
f"changing the data type of the column or "
|
|
213
|
-
f"removing it from the table.") from e
|
|
214
|
-
try:
|
|
215
|
-
stype = utils.infer_stype(self._data[name], name, dtype)
|
|
132
|
+
stype = infer_stype(ser, name, dtype)
|
|
216
133
|
except Exception as e:
|
|
217
|
-
raise RuntimeError(f"
|
|
218
|
-
f"
|
|
219
|
-
f"
|
|
220
|
-
f"
|
|
134
|
+
raise RuntimeError(f"Could not obtain semantic type for column "
|
|
135
|
+
f"'{name}' with data type '{dtype}' in table "
|
|
136
|
+
f"'{self.name}'. Change the data type of the "
|
|
137
|
+
f"column in the source table or remove it from "
|
|
138
|
+
f"this table.") from e
|
|
221
139
|
|
|
222
140
|
self._columns[name] = Column(
|
|
223
141
|
name=name,
|
|
224
|
-
dtype=dtype,
|
|
225
142
|
stype=stype,
|
|
143
|
+
dtype=dtype,
|
|
226
144
|
)
|
|
227
145
|
|
|
228
146
|
return self._columns[name]
|
|
@@ -258,7 +176,7 @@ class LocalTable:
|
|
|
258
176
|
return self._primary_key is not None
|
|
259
177
|
|
|
260
178
|
@property
|
|
261
|
-
def primary_key(self) ->
|
|
179
|
+
def primary_key(self) -> Column | None:
|
|
262
180
|
r"""The primary key column of this table.
|
|
263
181
|
|
|
264
182
|
The getter returns the primary key column of this table, or ``None`` if
|
|
@@ -273,7 +191,7 @@ class LocalTable:
|
|
|
273
191
|
return self[self._primary_key]
|
|
274
192
|
|
|
275
193
|
@primary_key.setter
|
|
276
|
-
def primary_key(self, name:
|
|
194
|
+
def primary_key(self, name: str | None) -> None:
|
|
277
195
|
if name is not None and name == self._time_column:
|
|
278
196
|
raise ValueError(f"Cannot specify column '{name}' as a primary "
|
|
279
197
|
f"key since it is already defined to be a time "
|
|
@@ -303,7 +221,7 @@ class LocalTable:
|
|
|
303
221
|
return self._time_column is not None
|
|
304
222
|
|
|
305
223
|
@property
|
|
306
|
-
def time_column(self) ->
|
|
224
|
+
def time_column(self) -> Column | None:
|
|
307
225
|
r"""The time column of this table.
|
|
308
226
|
|
|
309
227
|
The getter returns the time column of this table, or ``None`` if no
|
|
@@ -318,7 +236,7 @@ class LocalTable:
|
|
|
318
236
|
return self[self._time_column]
|
|
319
237
|
|
|
320
238
|
@time_column.setter
|
|
321
|
-
def time_column(self, name:
|
|
239
|
+
def time_column(self, name: str | None) -> None:
|
|
322
240
|
if name is not None and name == self._primary_key:
|
|
323
241
|
raise ValueError(f"Cannot specify column '{name}' as a time "
|
|
324
242
|
f"column since it is already defined to be a "
|
|
@@ -348,7 +266,7 @@ class LocalTable:
|
|
|
348
266
|
return self._end_time_column is not None
|
|
349
267
|
|
|
350
268
|
@property
|
|
351
|
-
def end_time_column(self) ->
|
|
269
|
+
def end_time_column(self) -> Column | None:
|
|
352
270
|
r"""The end time column of this table.
|
|
353
271
|
|
|
354
272
|
The getter returns the end time column of this table, or ``None`` if no
|
|
@@ -364,7 +282,7 @@ class LocalTable:
|
|
|
364
282
|
return self[self._end_time_column]
|
|
365
283
|
|
|
366
284
|
@end_time_column.setter
|
|
367
|
-
def end_time_column(self, name:
|
|
285
|
+
def end_time_column(self, name: str | None) -> None:
|
|
368
286
|
if name is not None and name == self._primary_key:
|
|
369
287
|
raise ValueError(f"Cannot specify column '{name}' as an end time "
|
|
370
288
|
f"column since it is already defined to be a "
|
|
@@ -432,12 +350,20 @@ class LocalTable:
|
|
|
432
350
|
})
|
|
433
351
|
|
|
434
352
|
def print_metadata(self) -> None:
|
|
435
|
-
r"""Prints the :meth:`~
|
|
436
|
-
|
|
353
|
+
r"""Prints the :meth:`~metadata` of this table."""
|
|
354
|
+
num_rows_repr = ''
|
|
355
|
+
if self._num_rows is not None:
|
|
356
|
+
num_rows_repr = f' ({self._num_rows:,} rows)'  # f-string: interpolate the thousands-separated row count into the header
|
|
357
|
+
|
|
358
|
+
if in_snowflake_notebook():
|
|
359
|
+
import streamlit as st
|
|
360
|
+
md_repr = f"### 🏷️ Metadata of Table `{self.name}`{num_rows_repr}"
|
|
361
|
+
st.markdown(md_repr)
|
|
362
|
+
st.dataframe(self.metadata, hide_index=True)
|
|
363
|
+
elif in_notebook():
|
|
437
364
|
from IPython.display import Markdown, display
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
f"({len(self._data):,} rows)"))
|
|
365
|
+
md_repr = f"### 🏷️ Metadata of Table `{self.name}`{num_rows_repr}"
|
|
366
|
+
display(Markdown(md_repr))
|
|
441
367
|
df = self.metadata
|
|
442
368
|
try:
|
|
443
369
|
if hasattr(df.style, 'hide'):
|
|
@@ -447,12 +373,94 @@ class LocalTable:
|
|
|
447
373
|
except ImportError:
|
|
448
374
|
print(df.to_string(index=False)) # missing jinja2
|
|
449
375
|
else:
|
|
450
|
-
print(f"🏷️ Metadata of Table '{self.name}'
|
|
451
|
-
f"({len(self._data):,} rows):")
|
|
376
|
+
print(f"🏷️ Metadata of Table '{self.name}'{num_rows_repr}")
|
|
452
377
|
print(self.metadata.to_string(index=False))
|
|
453
378
|
|
|
379
|
+
def infer_primary_key(self, verbose: bool = True) -> Self:
|
|
380
|
+
r"""Infers the primary key in this table.
|
|
381
|
+
|
|
382
|
+
Args:
|
|
383
|
+
verbose: Whether to print verbose output.
|
|
384
|
+
"""
|
|
385
|
+
if self.has_primary_key():
|
|
386
|
+
return self
|
|
387
|
+
|
|
388
|
+
def _set_primary_key(primary_key: str) -> None:
|
|
389
|
+
self.primary_key = primary_key
|
|
390
|
+
if verbose:
|
|
391
|
+
print(f"Detected primary key '{primary_key}' in table "
|
|
392
|
+
f"'{self.name}'")
|
|
393
|
+
|
|
394
|
+
# Inference from source column metadata:
|
|
395
|
+
if '_source_column_dict' in self.__dict__:
|
|
396
|
+
primary_key = self._source_primary_key
|
|
397
|
+
if (primary_key is not None and primary_key in self
|
|
398
|
+
and self[primary_key].is_physical):
|
|
399
|
+
_set_primary_key(primary_key)
|
|
400
|
+
return self
|
|
401
|
+
|
|
402
|
+
unique_keys = [
|
|
403
|
+
column.name for column in self._source_column_dict.values()
|
|
404
|
+
if column.is_unique_key
|
|
405
|
+
]
|
|
406
|
+
if (len(unique_keys) == 1 # NOTE No composite keys yet.
|
|
407
|
+
and unique_keys[0] in self
|
|
408
|
+
and self[unique_keys[0]].is_physical):
|
|
409
|
+
_set_primary_key(unique_keys[0])
|
|
410
|
+
return self
|
|
411
|
+
|
|
412
|
+
# Heuristic-based inference:
|
|
413
|
+
candidates = [
|
|
414
|
+
column.name for column in self.columns if column.stype == Stype.ID
|
|
415
|
+
]
|
|
416
|
+
if len(candidates) == 0:
|
|
417
|
+
for column in self.columns:
|
|
418
|
+
if self.name.lower() == column.name.lower():
|
|
419
|
+
candidates.append(column.name)
|
|
420
|
+
elif (self.name.lower().endswith('s')
|
|
421
|
+
and self.name.lower()[:-1] == column.name.lower()):
|
|
422
|
+
candidates.append(column.name)
|
|
423
|
+
|
|
424
|
+
if primary_key := infer_primary_key(
|
|
425
|
+
table_name=self.name,
|
|
426
|
+
df=self._sample_current_df(columns=candidates),
|
|
427
|
+
candidates=candidates,
|
|
428
|
+
):
|
|
429
|
+
_set_primary_key(primary_key)
|
|
430
|
+
return self
|
|
431
|
+
|
|
432
|
+
return self
|
|
433
|
+
|
|
434
|
+
def infer_time_column(self, verbose: bool = True) -> Self:
|
|
435
|
+
r"""Infers the time column in this table.
|
|
436
|
+
|
|
437
|
+
Args:
|
|
438
|
+
verbose: Whether to print verbose output.
|
|
439
|
+
"""
|
|
440
|
+
if self.has_time_column():
|
|
441
|
+
return self
|
|
442
|
+
|
|
443
|
+
# Heuristic-based inference:
|
|
444
|
+
candidates = [
|
|
445
|
+
column.name for column in self.columns
|
|
446
|
+
if column.stype == Stype.timestamp
|
|
447
|
+
and column.name != self._end_time_column
|
|
448
|
+
]
|
|
449
|
+
|
|
450
|
+
if time_column := infer_time_column(
|
|
451
|
+
df=self._sample_current_df(columns=candidates),
|
|
452
|
+
candidates=candidates,
|
|
453
|
+
):
|
|
454
|
+
self.time_column = time_column
|
|
455
|
+
|
|
456
|
+
if verbose:
|
|
457
|
+
print(f"Detected time column '{time_column}' in table "
|
|
458
|
+
f"'{self.name}'")
|
|
459
|
+
|
|
460
|
+
return self
|
|
461
|
+
|
|
454
462
|
def infer_metadata(self, verbose: bool = True) -> Self:
|
|
455
|
-
r"""Infers metadata, *i.e.*, primary keys and time columns, in
|
|
463
|
+
r"""Infers metadata, *i.e.*, primary keys and time columns, in this
|
|
456
464
|
table.
|
|
457
465
|
|
|
458
466
|
Args:
|
|
@@ -460,42 +468,15 @@ class LocalTable:
|
|
|
460
468
|
"""
|
|
461
469
|
logs = []
|
|
462
470
|
|
|
463
|
-
# Try to detect primary key if not set:
|
|
464
471
|
if not self.has_primary_key():
|
|
472
|
+
self.infer_primary_key(verbose=False)
|
|
473
|
+
if self.has_primary_key():
|
|
474
|
+
logs.append(f"primary key '{self._primary_key}'")
|
|
465
475
|
|
|
466
|
-
def is_candidate(column: Column) -> bool:
|
|
467
|
-
if column.stype == Stype.ID:
|
|
468
|
-
return True
|
|
469
|
-
if all(column.stype != Stype.ID for column in self.columns):
|
|
470
|
-
if self.name == column.name:
|
|
471
|
-
return True
|
|
472
|
-
if (self.name.endswith('s')
|
|
473
|
-
and self.name[:-1] == column.name):
|
|
474
|
-
return True
|
|
475
|
-
return False
|
|
476
|
-
|
|
477
|
-
candidates = [
|
|
478
|
-
column.name for column in self.columns if is_candidate(column)
|
|
479
|
-
]
|
|
480
|
-
|
|
481
|
-
if primary_key := utils.detect_primary_key(
|
|
482
|
-
table_name=self.name,
|
|
483
|
-
df=self._data,
|
|
484
|
-
candidates=candidates,
|
|
485
|
-
):
|
|
486
|
-
self.primary_key = primary_key
|
|
487
|
-
logs.append(f"primary key '{primary_key}'")
|
|
488
|
-
|
|
489
|
-
# Try to detect time column if not set:
|
|
490
476
|
if not self.has_time_column():
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
and column.name != self._end_time_column
|
|
495
|
-
]
|
|
496
|
-
if time_column := utils.detect_time_column(self._data, candidates):
|
|
497
|
-
self.time_column = time_column
|
|
498
|
-
logs.append(f"time column '{time_column}'")
|
|
477
|
+
self.infer_time_column(verbose=False)
|
|
478
|
+
if self.has_time_column():
|
|
479
|
+
logs.append(f"time column '{self._time_column}'")
|
|
499
480
|
|
|
500
481
|
if verbose and len(logs) > 0:
|
|
501
482
|
print(f"Detected {' and '.join(logs)} in table '{self.name}'")
|
|
@@ -516,6 +497,36 @@ class LocalTable:
|
|
|
516
497
|
end_time_col=self._end_time_column,
|
|
517
498
|
)
|
|
518
499
|
|
|
500
|
+
@cached_property
|
|
501
|
+
def _source_column_dict(self) -> dict[str, SourceColumn]:
|
|
502
|
+
source_columns = self._get_source_columns()
|
|
503
|
+
if len(source_columns) == 0:
|
|
504
|
+
raise ValueError(f"Table '{self.name}' does not hold any column "
|
|
505
|
+
f"with a supported data type")
|
|
506
|
+
return {column.name: column for column in source_columns}
|
|
507
|
+
|
|
508
|
+
@cached_property
|
|
509
|
+
def _source_sample_df(self) -> pd.DataFrame:
|
|
510
|
+
return self._get_source_sample_df()
|
|
511
|
+
|
|
512
|
+
@property
|
|
513
|
+
def _source_primary_key(self) -> str | None:
|
|
514
|
+
primary_keys = [
|
|
515
|
+
column.name for column in self._source_column_dict.values()
|
|
516
|
+
if column.is_primary_key
|
|
517
|
+
]
|
|
518
|
+
if len(primary_keys) == 1: # NOTE No composite keys yet.
|
|
519
|
+
return primary_keys[0]
|
|
520
|
+
|
|
521
|
+
return None
|
|
522
|
+
|
|
523
|
+
@cached_property
|
|
524
|
+
def _num_rows(self) -> int | None:
|
|
525
|
+
return self._get_num_rows()
|
|
526
|
+
|
|
527
|
+
def _sample_current_df(self, columns: Sequence[str]) -> pd.DataFrame:
|
|
528
|
+
return self._source_sample_df[columns]
|
|
529
|
+
|
|
519
530
|
# Python builtins #########################################################
|
|
520
531
|
|
|
521
532
|
def __hash__(self) -> int:
|
|
@@ -543,3 +554,22 @@ class LocalTable:
|
|
|
543
554
|
f' time_column={self._time_column},\n'
|
|
544
555
|
f' end_time_column={self._end_time_column},\n'
|
|
545
556
|
f')')
|
|
557
|
+
|
|
558
|
+
# Abstract Methods ########################################################
|
|
559
|
+
|
|
560
|
+
@property
|
|
561
|
+
@abstractmethod
|
|
562
|
+
def backend(self) -> DataBackend:
|
|
563
|
+
r"""The data backend of this table."""
|
|
564
|
+
|
|
565
|
+
@abstractmethod
|
|
566
|
+
def _get_source_columns(self) -> list[SourceColumn]:
|
|
567
|
+
pass
|
|
568
|
+
|
|
569
|
+
@abstractmethod
|
|
570
|
+
def _get_source_sample_df(self) -> pd.DataFrame:
|
|
571
|
+
pass
|
|
572
|
+
|
|
573
|
+
@abstractmethod
|
|
574
|
+
def _get_num_rows(self) -> int | None:
|
|
575
|
+
pass
|