kumoai 2.13.0.dev202511191731__cp310-cp310-macosx_11_0_arm64.whl → 2.14.0rc2__cp310-cp310-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kumoai/__init__.py +35 -26
- kumoai/_version.py +1 -1
- kumoai/client/client.py +6 -0
- kumoai/client/jobs.py +26 -0
- kumoai/client/pquery.py +6 -2
- kumoai/connector/utils.py +44 -9
- kumoai/experimental/rfm/__init__.py +70 -68
- kumoai/experimental/rfm/authenticate.py +3 -4
- kumoai/experimental/rfm/backend/__init__.py +0 -0
- kumoai/experimental/rfm/backend/local/__init__.py +42 -0
- kumoai/experimental/rfm/{local_graph_store.py → backend/local/graph_store.py} +65 -127
- kumoai/experimental/rfm/backend/local/sampler.py +312 -0
- kumoai/experimental/rfm/backend/local/table.py +113 -0
- kumoai/experimental/rfm/backend/snow/__init__.py +37 -0
- kumoai/experimental/rfm/backend/snow/sampler.py +366 -0
- kumoai/experimental/rfm/backend/snow/table.py +242 -0
- kumoai/experimental/rfm/backend/sqlite/__init__.py +32 -0
- kumoai/experimental/rfm/backend/sqlite/sampler.py +454 -0
- kumoai/experimental/rfm/backend/sqlite/table.py +184 -0
- kumoai/experimental/rfm/base/__init__.py +30 -0
- kumoai/experimental/rfm/base/column.py +152 -0
- kumoai/experimental/rfm/base/expression.py +44 -0
- kumoai/experimental/rfm/base/mapper.py +67 -0
- kumoai/experimental/rfm/base/sampler.py +782 -0
- kumoai/experimental/rfm/base/source.py +19 -0
- kumoai/experimental/rfm/base/sql_sampler.py +366 -0
- kumoai/experimental/rfm/base/table.py +741 -0
- kumoai/experimental/rfm/{local_graph.py → graph.py} +581 -154
- kumoai/experimental/rfm/infer/__init__.py +8 -0
- kumoai/experimental/rfm/infer/dtype.py +82 -0
- kumoai/experimental/rfm/infer/multicategorical.py +1 -1
- kumoai/experimental/rfm/infer/pkey.py +128 -0
- kumoai/experimental/rfm/infer/stype.py +35 -0
- kumoai/experimental/rfm/infer/time_col.py +61 -0
- kumoai/experimental/rfm/pquery/executor.py +27 -27
- kumoai/experimental/rfm/pquery/pandas_executor.py +30 -32
- kumoai/experimental/rfm/relbench.py +76 -0
- kumoai/experimental/rfm/rfm.py +775 -481
- kumoai/experimental/rfm/sagemaker.py +15 -7
- kumoai/experimental/rfm/task_table.py +292 -0
- kumoai/pquery/predictive_query.py +10 -6
- kumoai/pquery/training_table.py +16 -2
- kumoai/testing/decorators.py +1 -1
- kumoai/testing/snow.py +50 -0
- kumoai/trainer/distilled_trainer.py +175 -0
- kumoai/utils/__init__.py +3 -2
- kumoai/utils/display.py +87 -0
- kumoai/utils/progress_logger.py +190 -12
- kumoai/utils/sql.py +3 -0
- {kumoai-2.13.0.dev202511191731.dist-info → kumoai-2.14.0rc2.dist-info}/METADATA +10 -8
- {kumoai-2.13.0.dev202511191731.dist-info → kumoai-2.14.0rc2.dist-info}/RECORD +54 -30
- kumoai/experimental/rfm/local_graph_sampler.py +0 -182
- kumoai/experimental/rfm/local_pquery_driver.py +0 -689
- kumoai/experimental/rfm/local_table.py +0 -545
- kumoai/experimental/rfm/utils.py +0 -344
- {kumoai-2.13.0.dev202511191731.dist-info → kumoai-2.14.0rc2.dist-info}/WHEEL +0 -0
- {kumoai-2.13.0.dev202511191731.dist-info → kumoai-2.14.0rc2.dist-info}/licenses/LICENSE +0 -0
- {kumoai-2.13.0.dev202511191731.dist-info → kumoai-2.14.0rc2.dist-info}/top_level.txt +0 -0
|
@@ -1,545 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
from typing import Any, Dict, List, Optional
|
|
3
|
-
|
|
4
|
-
import pandas as pd
|
|
5
|
-
from kumoapi.source_table import UnavailableSourceTable
|
|
6
|
-
from kumoapi.table import Column as ColumnDefinition
|
|
7
|
-
from kumoapi.table import TableDefinition
|
|
8
|
-
from kumoapi.typing import Dtype, Stype
|
|
9
|
-
from typing_extensions import Self
|
|
10
|
-
|
|
11
|
-
from kumoai import in_notebook
|
|
12
|
-
from kumoai.experimental.rfm import utils
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
@dataclass(init=False, repr=False, eq=False)
class Column:
    r"""Metadata of a single column: its name, data type and semantic type.

    The name and data type are fixed at construction time and exposed as
    read-only properties. The semantic type ``stype`` may be re-assigned at
    any time; every assignment is validated in :meth:`__setattr__`.

    Args:
        name: The name of the column.
        dtype: The data type of the column (coerced via ``Dtype(dtype)``).
        stype: The semantic type of the column (coerced via
            ``Stype(stype)``).
        is_primary_key: Whether this column is flagged as a primary key.
        is_time_column: Whether this column is flagged as a time column.
        is_end_time_column: Whether this column is flagged as an end time
            column.

    Raises:
        ValueError: If ``stype`` is incompatible with ``dtype`` or with one
            of the role flags (see :meth:`__setattr__`).
    """
    stype: Stype

    def __init__(
        self,
        name: str,
        dtype: Dtype,
        stype: Stype,
        is_primary_key: bool = False,
        is_time_column: bool = False,
        is_end_time_column: bool = False,
    ) -> None:
        self._name = name
        self._dtype = Dtype(dtype)
        self._is_primary_key = is_primary_key
        self._is_time_column = is_time_column
        self._is_end_time_column = is_end_time_column
        # Must be assigned last: validating `stype` in `__setattr__` reads
        # the data type and the role flags set above.
        self.stype = Stype(stype)

    @property
    def name(self) -> str:
        r"""The name of the column."""
        return self._name

    @property
    def dtype(self) -> Dtype:
        r"""The data type of the column."""
        return self._dtype

    def __setattr__(self, key: str, val: Any) -> None:
        r"""Sets an attribute, validating assignments to ``stype``.

        Strings assigned to ``stype`` are converted via ``Stype(val)``. All
        other attributes are stored without any checks.

        Raises:
            ValueError: If the new semantic type does not support the
                column's data type, or if the column is flagged as a primary
                key (requires ``Stype.ID``) or as a (end) time column
                (requires ``Stype.timestamp``) and the new semantic type
                differs.
        """
        if key == 'stype':
            if isinstance(val, str):
                val = Stype(val)
            assert isinstance(val, Stype)
            if not val.supports_dtype(self.dtype):
                raise ValueError(f"Column '{self.name}' received an "
                                 f"incompatible semantic type (got "
                                 f"dtype='{self.dtype}' and stype='{val}')")
            if self._is_primary_key and val != Stype.ID:
                raise ValueError(f"Primary key '{self.name}' must have 'ID' "
                                 f"semantic type (got '{val}')")
            if self._is_time_column and val != Stype.timestamp:
                raise ValueError(f"Time column '{self.name}' must have "
                                 f"'timestamp' semantic type (got '{val}')")
            if self._is_end_time_column and val != Stype.timestamp:
                raise ValueError(f"End time column '{self.name}' must have "
                                 f"'timestamp' semantic type (got '{val}')")

        super().__setattr__(key, val)

    def _key(self) -> tuple:
        r"""Returns the fields that define the identity of this column."""
        return (self.name, self.stype, self.dtype)

    def __hash__(self) -> int:
        return hash(self._key())

    def __eq__(self, other: Any) -> bool:
        if not isinstance(other, Column):
            return False
        # Compare the fields themselves rather than their hashes, so that a
        # hash collision between two different columns cannot make them
        # compare equal.
        return self._key() == other._key()

    def __repr__(self) -> str:
        return (f'{self.__class__.__name__}(name={self.name}, '
                f'stype={self.stype}, dtype={self.dtype})')
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
class LocalTable:
    r"""A table backed by a :class:`pandas.DataFrame`.

    A :class:`LocalTable` fully specifies the relevant metadata, *i.e.*
    selected columns, column semantic types, primary keys and time columns.
    :class:`LocalTable` is used to create a :class:`LocalGraph`.

    .. code-block:: python

        import pandas as pd
        import kumoai.experimental.rfm as rfm

        # Load data from a CSV file:
        df = pd.read_csv("data.csv")

        # Create a table from a `pandas.DataFrame` and infer its metadata ...
        table = rfm.LocalTable(df, name="my_table").infer_metadata()

        # ... or create a table explicitly:
        table = rfm.LocalTable(
            df=df,
            name="my_table",
            primary_key="id",
            time_column="time",
            end_time_column=None,
        )

        # Verify metadata:
        table.print_metadata()

        # Change the semantic type of a column:
        table[column].stype = "text"

    Args:
        df: The data frame to create the table from.
        name: The name of the table.
        primary_key: The name of the primary key of this table, if it exists.
        time_column: The name of the time column of this table, if it exists.
        end_time_column: The name of the end time column of this table, if it
            exists.

    Raises:
        ValueError: If the data frame is empty, has multi-index columns,
            duplicated column names or an empty column name.
    """
    def __init__(
        self,
        df: pd.DataFrame,
        name: str,
        primary_key: Optional[str] = None,
        time_column: Optional[str] = None,
        end_time_column: Optional[str] = None,
    ) -> None:

        if df.empty:
            raise ValueError("Data frame must have at least one row")
        if isinstance(df.columns, pd.MultiIndex):
            raise ValueError("Data frame must not have a multi-index")
        if not df.columns.is_unique:
            raise ValueError("Data frame must have unique column names")
        if any(col == '' for col in df.columns):
            raise ValueError("Data frame must have non-empty column names")

        # Shallow copy: the table holds its own frame object without
        # duplicating the underlying data.
        df = df.copy(deep=False)

        self._data = df
        self._name = name
        self._primary_key: Optional[str] = None
        self._time_column: Optional[str] = None
        self._end_time_column: Optional[str] = None

        self._columns: Dict[str, Column] = {}
        for column_name in df.columns:
            self.add_column(column_name)

        if primary_key is not None:
            self.primary_key = primary_key

        if time_column is not None:
            self.time_column = time_column

        if end_time_column is not None:
            self.end_time_column = end_time_column

    @property
    def name(self) -> str:
        r"""The name of the table."""
        return self._name

    # Data column #############################################################

    def has_column(self, name: str) -> bool:
        r"""Returns ``True`` if this table holds a column with name ``name``;
        ``False`` otherwise.
        """
        return name in self._columns

    def column(self, name: str) -> Column:
        r"""Returns the data column named with name ``name`` in this table.

        Args:
            name: The name of the column.

        Raises:
            KeyError: If ``name`` is not present in this table.
        """
        if not self.has_column(name):
            raise KeyError(f"Column '{name}' not found in table '{self.name}'")
        return self._columns[name]

    @property
    def columns(self) -> List[Column]:
        r"""Returns a list of :class:`Column` objects that represent the
        columns in this table.
        """
        return list(self._columns.values())

    def add_column(self, name: str) -> Column:
        r"""Adds a column to this table.

        The data type and semantic type of the column are inferred from the
        underlying data frame.

        Args:
            name: The name of the column.

        Raises:
            KeyError: If ``name`` is already present in this table, or if it
                does not exist in the underlying data frame.
            RuntimeError: If data type or semantic type inference fails.
        """
        if name in self:
            raise KeyError(f"Column '{name}' already exists in table "
                           f"'{self.name}'")

        if name not in self._data.columns:
            raise KeyError(f"Column '{name}' does not exist in the underlying "
                           f"data frame")

        try:
            dtype = utils.to_dtype(self._data[name])
        except Exception as e:
            raise RuntimeError(f"Data type inference for column '{name}' in "
                               f"table '{self.name}' failed. Consider "
                               f"changing the data type of the column or "
                               f"removing it from the table.") from e
        try:
            stype = utils.infer_stype(self._data[name], name, dtype)
        except Exception as e:
            raise RuntimeError(f"Semantic type inference for column '{name}' "
                               f"in table '{self.name}' failed. Consider "
                               f"changing the data type of the column or "
                               f"removing it from the table.") from e

        self._columns[name] = Column(
            name=name,
            dtype=dtype,
            stype=stype,
        )

        return self._columns[name]

    def remove_column(self, name: str) -> Self:
        r"""Removes a column from this table.

        If the column is currently the primary key, time column or end time
        column, that role is unset first.

        Args:
            name: The name of the column.

        Raises:
            KeyError: If ``name`` is not present in this table.
        """
        if name not in self:
            raise KeyError(f"Column '{name}' not found in table '{self.name}'")

        if self._primary_key == name:
            self.primary_key = None
        if self._time_column == name:
            self.time_column = None
        if self._end_time_column == name:
            self.end_time_column = None
        del self._columns[name]

        return self

    # Primary key #############################################################

    def has_primary_key(self) -> bool:
        r"""Returns ``True`` if this table has a primary key; ``False``
        otherwise.
        """
        return self._primary_key is not None

    @property
    def primary_key(self) -> Optional[Column]:
        r"""The primary key column of this table.

        The getter returns the primary key column of this table, or ``None`` if
        no such primary key is present.

        The setter sets a column as a primary key on this table and assigns
        it the ``ID`` semantic type. It raises a :class:`ValueError` if the
        column is already the (end) time column or if its data type does not
        support the ``ID`` semantic type, and a :class:`KeyError` if the
        column name does not match a column in this table. Passing ``None``
        unsets the primary key.
        """
        if self._primary_key is None:
            return None
        return self[self._primary_key]

    @primary_key.setter
    def primary_key(self, name: Optional[str]) -> None:
        if name is not None and name == self._time_column:
            raise ValueError(f"Cannot specify column '{name}' as a primary "
                             f"key since it is already defined to be a time "
                             f"column")
        if name is not None and name == self._end_time_column:
            raise ValueError(f"Cannot specify column '{name}' as a primary "
                             f"key since it is already defined to be an end "
                             f"time column")

        if name is not None:
            # Validate the new column *before* touching the current primary
            # key, so that a failure leaves the table state untouched.
            self[name].stype = Stype.ID

        if self.primary_key is not None:
            self.primary_key._is_primary_key = False

        if name is None:
            self._primary_key = None
            return

        self[name]._is_primary_key = True
        self._primary_key = name

    # Time column #############################################################

    def has_time_column(self) -> bool:
        r"""Returns ``True`` if this table has a time column; ``False``
        otherwise.
        """
        return self._time_column is not None

    @property
    def time_column(self) -> Optional[Column]:
        r"""The time column of this table.

        The getter returns the time column of this table, or ``None`` if no
        such time column is present.

        The setter sets a column as a time column on this table and assigns
        it the ``timestamp`` semantic type. It raises a :class:`ValueError`
        if the column is already the primary key or end time column or if
        its data type does not support the ``timestamp`` semantic type, and
        a :class:`KeyError` if the column name does not match a column in
        this table. Passing ``None`` unsets the time column.
        """
        if self._time_column is None:
            return None
        return self[self._time_column]

    @time_column.setter
    def time_column(self, name: Optional[str]) -> None:
        if name is not None and name == self._primary_key:
            raise ValueError(f"Cannot specify column '{name}' as a time "
                             f"column since it is already defined to be a "
                             f"primary key")
        if name is not None and name == self._end_time_column:
            raise ValueError(f"Cannot specify column '{name}' as a time "
                             f"column since it is already defined to be an "
                             f"end time column")

        if name is not None:
            # Validate the new column *before* touching the current time
            # column, so that a failure leaves the table state untouched.
            self[name].stype = Stype.timestamp

        if self.time_column is not None:
            self.time_column._is_time_column = False

        if name is None:
            self._time_column = None
            return

        self[name]._is_time_column = True
        self._time_column = name

    # End Time column #########################################################

    def has_end_time_column(self) -> bool:
        r"""Returns ``True`` if this table has an end time column; ``False``
        otherwise.
        """
        return self._end_time_column is not None

    @property
    def end_time_column(self) -> Optional[Column]:
        r"""The end time column of this table.

        The getter returns the end time column of this table, or ``None`` if no
        such end time column is present.

        The setter sets a column as an end time column on this table and
        assigns it the ``timestamp`` semantic type. It raises a
        :class:`ValueError` if the column is already the primary key or time
        column or if its data type does not support the ``timestamp``
        semantic type, and a :class:`KeyError` if the column name does not
        match a column in this table. Passing ``None`` unsets the end time
        column.
        """
        if self._end_time_column is None:
            return None
        return self[self._end_time_column]

    @end_time_column.setter
    def end_time_column(self, name: Optional[str]) -> None:
        if name is not None and name == self._primary_key:
            raise ValueError(f"Cannot specify column '{name}' as an end time "
                             f"column since it is already defined to be a "
                             f"primary key")
        if name is not None and name == self._time_column:
            raise ValueError(f"Cannot specify column '{name}' as an end time "
                             f"column since it is already defined to be a "
                             f"time column")

        if name is not None:
            # Validate the new column *before* touching the current end time
            # column, so that a failure leaves the table state untouched.
            self[name].stype = Stype.timestamp

        if self.end_time_column is not None:
            self.end_time_column._is_end_time_column = False

        if name is None:
            self._end_time_column = None
            return

        self[name]._is_end_time_column = True
        self._end_time_column = name

    # Metadata ################################################################

    @property
    def metadata(self) -> pd.DataFrame:
        r"""Returns a :class:`pandas.DataFrame` object containing metadata
        information about the columns in this table.

        The returned dataframe has columns ``name``, ``dtype``, ``stype``,
        ``is_primary_key``, ``is_time_column`` and ``is_end_time_column``,
        which provide an aggregate view of the properties of the columns of
        this table.

        Example:
            >>> # doctest: +SKIP
            >>> import kumoai.experimental.rfm as rfm
            >>> table = rfm.LocalTable(df=..., name=...).infer_metadata()
            >>> table.metadata
                     name    dtype stype  is_primary_key  is_time_column  is_end_time_column
            0  CustomerID  float64    ID            True           False               False
        """  # noqa: E501
        cols = self.columns

        return pd.DataFrame({
            'name':
            pd.Series(dtype=str, data=[c.name for c in cols]),
            'dtype':
            pd.Series(dtype=str, data=[c.dtype for c in cols]),
            'stype':
            pd.Series(dtype=str, data=[c.stype for c in cols]),
            'is_primary_key':
            pd.Series(
                dtype=bool,
                data=[self._primary_key == c.name for c in cols],
            ),
            'is_time_column':
            pd.Series(
                dtype=bool,
                data=[self._time_column == c.name for c in cols],
            ),
            'is_end_time_column':
            pd.Series(
                dtype=bool,
                data=[self._end_time_column == c.name for c in cols],
            ),
        })

    def print_metadata(self) -> None:
        r"""Prints the :meth:`~LocalTable.metadata` of the table."""
        if in_notebook():
            from IPython.display import Markdown, display
            display(
                Markdown(f"### 🏷️ Metadata of Table `{self.name}` "
                         f"({len(self._data):,} rows)"))
            df = self.metadata
            try:
                if hasattr(df.style, 'hide'):
                    display(df.style.hide(axis='index'))  # pandas=2
                else:
                    display(df.style.hide_index())  # pandas<1.3
            except ImportError:
                print(df.to_string(index=False))  # missing jinja2
        else:
            print(f"🏷️ Metadata of Table '{self.name}' "
                  f"({len(self._data):,} rows):")
            print(self.metadata.to_string(index=False))

    def infer_metadata(self, verbose: bool = True) -> Self:
        r"""Infers metadata, *i.e.*, primary keys and time columns, in the
        table.

        Roles that are already set are left untouched.

        Args:
            verbose: Whether to print verbose output.
        """
        logs = []

        # Try to detect primary key if not set:
        if not self.has_primary_key():
            # The name-based fallback below only applies when no column has
            # the ID semantic type; compute that once instead of per column.
            has_id_column = any(col.stype == Stype.ID for col in self.columns)

            def is_candidate(column: Column) -> bool:
                if column.stype == Stype.ID:
                    return True
                if has_id_column:
                    return False
                if self.name == column.name:
                    return True
                # Column named like the singular form of the table name:
                return (self.name.endswith('s')
                        and self.name[:-1] == column.name)

            candidates = [
                column.name for column in self.columns if is_candidate(column)
            ]

            if primary_key := utils.detect_primary_key(
                    table_name=self.name,
                    df=self._data,
                    candidates=candidates,
            ):
                self.primary_key = primary_key
                logs.append(f"primary key '{primary_key}'")

        # Try to detect time column if not set:
        if not self.has_time_column():
            candidates = [
                column.name for column in self.columns
                if column.stype == Stype.timestamp
                and column.name != self._end_time_column
            ]
            if time_column := utils.detect_time_column(self._data, candidates):
                self.time_column = time_column
                logs.append(f"time column '{time_column}'")

        if verbose and len(logs) > 0:
            print(f"Detected {' and '.join(logs)} in table '{self.name}'")

        return self

    # Helpers #################################################################

    def _to_api_table_definition(self) -> TableDefinition:
        r"""Builds a :class:`TableDefinition` from the columns and the
        primary key, time column and end time column names of this table.
        """
        return TableDefinition(
            cols=[
                ColumnDefinition(col.name, col.stype, col.dtype)
                for col in self.columns
            ],
            source_table=UnavailableSourceTable(table=self.name),
            pkey=self._primary_key,
            time_col=self._time_column,
            end_time_col=self._end_time_column,
        )

    # Python builtins #########################################################

    def __hash__(self) -> int:
        # Hash over all columns plus the (possibly ``None``) role columns.
        special_columns = [
            self.primary_key,
            self.time_column,
            self.end_time_column,
        ]
        return hash(tuple(self.columns + special_columns))

    def __contains__(self, name: str) -> bool:
        return self.has_column(name)

    def __getitem__(self, name: str) -> Column:
        return self.column(name)

    def __delitem__(self, name: str) -> None:
        self.remove_column(name)

    def __repr__(self) -> str:
        return (f'{self.__class__.__name__}(\n'
                f'  name={self.name},\n'
                f'  num_columns={len(self.columns)},\n'
                f'  primary_key={self._primary_key},\n'
                f'  time_column={self._time_column},\n'
                f'  end_time_column={self._end_time_column},\n'
                f')')
|