kumoai 2.14.0.dev202512181731__cp312-cp312-macosx_11_0_arm64.whl → 2.14.0.dev202512301731__cp312-cp312-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kumoai/__init__.py +23 -26
- kumoai/_version.py +1 -1
- kumoai/client/client.py +6 -0
- kumoai/client/jobs.py +24 -0
- kumoai/experimental/rfm/__init__.py +22 -22
- kumoai/experimental/rfm/backend/local/graph_store.py +12 -21
- kumoai/experimental/rfm/backend/local/sampler.py +0 -3
- kumoai/experimental/rfm/backend/local/table.py +25 -24
- kumoai/experimental/rfm/backend/snow/sampler.py +106 -61
- kumoai/experimental/rfm/backend/snow/table.py +146 -51
- kumoai/experimental/rfm/backend/sqlite/sampler.py +127 -78
- kumoai/experimental/rfm/backend/sqlite/table.py +94 -47
- kumoai/experimental/rfm/base/__init__.py +6 -7
- kumoai/experimental/rfm/base/column.py +97 -5
- kumoai/experimental/rfm/base/expression.py +44 -0
- kumoai/experimental/rfm/base/sampler.py +5 -17
- kumoai/experimental/rfm/base/source.py +1 -1
- kumoai/experimental/rfm/base/sql_sampler.py +68 -9
- kumoai/experimental/rfm/base/table.py +284 -120
- kumoai/experimental/rfm/graph.py +139 -86
- kumoai/experimental/rfm/infer/__init__.py +6 -4
- kumoai/experimental/rfm/infer/dtype.py +6 -1
- kumoai/experimental/rfm/infer/multicategorical.py +1 -1
- kumoai/experimental/rfm/infer/stype.py +35 -0
- kumoai/experimental/rfm/relbench.py +76 -0
- kumoai/experimental/rfm/rfm.py +4 -20
- kumoai/trainer/distilled_trainer.py +175 -0
- kumoai/utils/display.py +51 -0
- {kumoai-2.14.0.dev202512181731.dist-info → kumoai-2.14.0.dev202512301731.dist-info}/METADATA +1 -1
- {kumoai-2.14.0.dev202512181731.dist-info → kumoai-2.14.0.dev202512301731.dist-info}/RECORD +33 -30
- kumoai/experimental/rfm/base/column_expression.py +0 -16
- kumoai/experimental/rfm/base/sql_table.py +0 -113
- {kumoai-2.14.0.dev202512181731.dist-info → kumoai-2.14.0.dev202512301731.dist-info}/WHEEL +0 -0
- {kumoai-2.14.0.dev202512181731.dist-info → kumoai-2.14.0.dev202512301731.dist-info}/licenses/LICENSE +0 -0
- {kumoai-2.14.0.dev202512181731.dist-info → kumoai-2.14.0.dev202512301731.dist-info}/top_level.txt +0 -0
kumoai/experimental/rfm/base/table.py

```diff
@@ -1,25 +1,32 @@
+import warnings
 from abc import ABC, abstractmethod
 from collections.abc import Sequence
 from functools import cached_property
 
+import numpy as np
 import pandas as pd
 from kumoapi.model_plan import MissingType
 from kumoapi.source_table import UnavailableSourceTable
 from kumoapi.table import Column as ColumnDefinition
 from kumoapi.table import TableDefinition
-from kumoapi.typing import Stype
+from kumoapi.typing import Dtype, Stype
 from typing_extensions import Self
 
-from kumoai import
-
+from kumoai.experimental.rfm.base import (
+    Column,
+    ColumnSpec,
+    ColumnSpecType,
+    DataBackend,
+    SourceColumn,
+    SourceForeignKey,
+)
 from kumoai.experimental.rfm.infer import (
-
-    contains_id,
-    contains_multicategorical,
-    contains_timestamp,
+    infer_dtype,
     infer_primary_key,
+    infer_stype,
     infer_time_column,
 )
+from kumoai.utils import display, quote_ident
 
 
 class Table(ABC):
@@ -29,41 +36,48 @@ class Table(ABC):
 
     Args:
         name: The name of this table.
+        source_name: The source name of this table. If set to ``None``,
+            ``name`` is being used.
         columns: The selected columns of this table.
        primary_key: The name of the primary key of this table, if it exists.
        time_column: The name of the time column of this table, if it exists.
        end_time_column: The name of the end time column of this table, if it
            exists.
     """
+    _NUM_SAMPLE_ROWS = 1_000
+
     def __init__(
         self,
         name: str,
-
+        source_name: str | None = None,
+        columns: Sequence[ColumnSpecType] | None = None,
         primary_key: MissingType | str | None = MissingType.VALUE,
         time_column: str | None = None,
         end_time_column: str | None = None,
     ) -> None:
 
         self._name = name
+        self._source_name = source_name or name
+        self._column_dict: dict[str, Column] = {}
         self._primary_key: str | None = None
         self._time_column: str | None = None
         self._end_time_column: str | None = None
+        self._expr_sample_df = pd.DataFrame(index=range(self._NUM_SAMPLE_ROWS))
 
         if columns is None:
             columns = list(self._source_column_dict.keys())
 
-
-            raise ValueError(f"Table '{name}' does not hold any column with "
-                             f"a supported data type")
+        self.add_columns(columns)
 
         if isinstance(primary_key, MissingType):
-
-
-
-
-
-
-
+            # Infer primary key from source metadata, but only set it in case
+            # it is already part of the column set (don't magically add it):
+            if any(column.is_source for column in self.columns):
+                primary_key = self._source_primary_key
+                if (primary_key is not None and primary_key in self
+                        and self[primary_key].is_source):
+                    self.primary_key = primary_key
+        elif primary_key is not None:
            if primary_key not in self:
                 self.add_column(primary_key)
             self.primary_key = primary_key
```
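The `primary_key: MissingType | str | None = MissingType.VALUE` default above lets the constructor tell "not specified" apart from an explicit `None`. Below is a minimal, self-contained sketch of that sentinel pattern, using a stand-in enum rather than the real `kumoapi.model_plan.MissingType` (whose definition is not part of this diff):

```python
from enum import Enum


# Stand-in for `kumoapi.model_plan.MissingType`; it only illustrates the
# sentinel-default pattern used by `primary_key` above: "not specified"
# triggers inference, while an explicit `None` means the table deliberately
# has no primary key.
class MissingType(Enum):
    VALUE = 'missing'


def resolve_primary_key(primary_key=MissingType.VALUE) -> str:
    if isinstance(primary_key, MissingType):
        return 'not specified: infer from source metadata'
    if primary_key is None:
        return 'explicitly disabled: table has no primary key'
    return f"explicitly set: use column '{primary_key}'"


print(resolve_primary_key())           # not specified: infer from source metadata
print(resolve_primary_key(None))       # explicitly disabled: table has no primary key
print(resolve_primary_key('user_id'))  # explicitly set: use column 'user_id'
```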
```diff
@@ -83,13 +97,22 @@ class Table(ABC):
         r"""The name of this table."""
         return self._name
 
+    @property
+    def source_name(self) -> str:
+        r"""The source name of this table."""
+        return self._source_name
+
+    @property
+    def _quoted_source_name(self) -> str:
+        return quote_ident(self._source_name)
+
     # Column ##################################################################
 
     def has_column(self, name: str) -> bool:
         r"""Returns ``True`` if this table holds a column with name ``name``;
         ``False`` otherwise.
         """
-        return name in self.
+        return name in self._column_dict
 
     def column(self, name: str) -> Column:
         r"""Returns the data column named with name ``name`` in this table.
@@ -102,59 +125,113 @@ class Table(ABC):
         """
         if not self.has_column(name):
             raise KeyError(f"Column '{name}' not found in table '{self.name}'")
-        return self.
+        return self._column_dict[name]
 
     @property
     def columns(self) -> list[Column]:
         r"""Returns a list of :class:`Column` objects that represent the
         columns in this table.
         """
-        return list(self.
+        return list(self._column_dict.values())
 
-    def
-        r"""Adds a
+    def add_columns(self, columns: Sequence[ColumnSpecType]) -> None:
+        r"""Adds a set of columns to this table.
 
         Args:
-
+            columns: The columns to add.
 
         Raises:
-            KeyError: If
+            KeyError: If any of the column names already exist in this table.
         """
-        if
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if len(columns) == 0:
+            return
+
+        column_specs = [ColumnSpec.coerce(column) for column in columns]
+
+        # Obtain a batch-wise sample for all column expressions:
+        expr_specs = [spec for spec in column_specs if not spec.is_source]
+        if len(expr_specs) > 0:
+            dfs = [
+                self._expr_sample_df,
+                self._get_expr_sample_df(expr_specs).reset_index(drop=True),
+            ]
+            size = min(map(len, dfs))
+            df = pd.concat([dfs[0].iloc[:size], dfs[1].iloc[:size]], axis=1)
+            df = df.loc[:, ~df.columns.duplicated(keep='last')]
+            self._expr_sample_df = df
+
+        for column_spec in column_specs:
+            if column_spec.name in self:
+                raise KeyError(f"Column '{column_spec.name}' already exists "
+                               f"in table '{self.name}'")
+
+            dtype = column_spec.dtype
+            stype = column_spec.stype
+
+            if column_spec.is_source:
+                if column_spec.name not in self._source_column_dict:
+                    raise ValueError(
+                        f"Column '{column_spec.name}' does not exist in the "
+                        f"underlying source table")
+
+                if dtype is None:
+                    dtype = self._source_column_dict[column_spec.name].dtype
+
+                if dtype == Dtype.unsupported:
+                    raise ValueError(
+                        f"Encountered unsupported data type for column "
+                        f"'{column_spec.name}' in table '{self.name}'. Please "
+                        f"either change the column's data type or remove the "
+                        f"column from this table.")
+
+            if dtype is None:
+                if column_spec.is_source:
+                    ser = self._source_sample_df[column_spec.name]
+                else:
+                    ser = self._expr_sample_df[column_spec.name]
+                try:
+                    dtype = infer_dtype(ser)
+                except Exception as e:
+                    raise RuntimeError(
+                        f"Encountered unsupported data type '{ser.dtype}' for "
+                        f"column '{column_spec.name}' in table '{self.name}'. "
+                        f"Please either manually override the columns's data "
+                        f"type or remove the column from this table.") from e
+
+            if stype is None:
+                if column_spec.is_source:
+                    ser = self._source_sample_df[column_spec.name]
+                else:
+                    ser = self._expr_sample_df[column_spec.name]
+                try:
+                    stype = infer_stype(ser, column_spec.name, dtype)
+                except Exception as e:
+                    raise RuntimeError(
+                        f"Could not determine semantic type for column "
+                        f"'{column_spec.name}' with data type '{dtype}' in "
+                        f"table '{self.name}'. Please either change the "
+                        f"column's data type or remove the column from this "
+                        f"table.") from e
+
+            self._column_dict[column_spec.name] = Column(
+                name=column_spec.name,
+                expr=column_spec.expr,
+                dtype=dtype,
+                stype=stype,
+            )
+
+    def add_column(self, column: ColumnSpecType) -> Column:
+        r"""Adds a column to this table.
+
+        Args:
+            column: The column to add.
 
-
+        Raises:
+            KeyError: If the column name already exists in this table.
+        """
+        column_spec = ColumnSpec.coerce(column)
+        self.add_columns([column_spec])
+        return self[column_spec.name]
 
     def remove_column(self, name: str) -> Self:
         r"""Removes a column from this table.
```
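Both `add_columns` above and `_get_sample_df` further down in this diff merge the source-column sample with the expression-column sample through the same pandas idiom: truncate both frames to a common length, concatenate them column-wise, and keep only the last occurrence of any duplicated column name so that expression columns shadow source columns. A standalone illustration with toy data:

```python
import pandas as pd

# Source sample and expression sample share column 'b'; after the merge the
# expression version of 'b' wins because `keep='last'` drops earlier
# duplicates.
source_df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
expr_df = pd.DataFrame({'b': [40, 50], 'c': [7, 8]})

dfs = [source_df, expr_df]
size = min(map(len, dfs))  # align both samples to the shorter one
df = pd.concat([dfs[0].iloc[:size], dfs[1].iloc[:size]], axis=1)
df = df.loc[:, ~df.columns.duplicated(keep='last')]

print(df)
#    a   b  c
# 0  1  40  7
# 1  2  50  8
```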
```diff
@@ -174,7 +251,7 @@ class Table(ABC):
             self.time_column = None
         if self._end_time_column == name:
             self.end_time_column = None
-        del self.
+        del self._column_dict[name]
 
         return self
 
@@ -362,30 +439,12 @@ class Table(ABC):
 
     def print_metadata(self) -> None:
         r"""Prints the :meth:`~metadata` of this table."""
-
-        if self._num_rows
-
-
-
-
-            md_repr = f"### 🏷️ Metadata of Table `{self.name}`{num_rows_repr}"
-            st.markdown(md_repr)
-            st.dataframe(self.metadata, hide_index=True)
-        elif in_notebook():
-            from IPython.display import Markdown, display
-            md_repr = f"### 🏷️ Metadata of Table `{self.name}`{num_rows_repr}"
-            display(Markdown(md_repr))
-            df = self.metadata
-            try:
-                if hasattr(df.style, 'hide'):
-                    display(df.style.hide(axis='index'))  # pandas=2
-                else:
-                    display(df.style.hide_index())  # pandas<1.3
-            except ImportError:
-                print(df.to_string(index=False))  # missing jinja2
-        else:
-            print(f"🏷️ Metadata of Table '{self.name}'{num_rows_repr}")
-            print(self.metadata.to_string(index=False))
+        msg = f"🏷️ Metadata of Table `{self.name}`"
+        if num := self._num_rows:
+            msg += " (1 row)" if num == 1 else f" ({num:,} rows)"
+
+        display.title(msg)
+        display.dataframe(self.metadata)
 
     def infer_primary_key(self, verbose: bool = True) -> Self:
         r"""Infers the primary key in this table.
```
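`print_metadata` now delegates to `kumoai.utils.display`, a module added in this release (`kumoai/utils/display.py`, +51 lines, not included in this section). The sketch below is only a guess at what `title`, `message`, and `dataframe` could look like; it reuses the notebook-versus-terminal fallback that the removed code handled inline, and its helper names beyond those three entry points are assumptions.

```python
import pandas as pd


def _in_notebook() -> bool:
    # Common heuristic: a Jupyter kernel exposes ZMQInteractiveShell.
    try:
        from IPython import get_ipython
        shell = get_ipython()
        return shell is not None and shell.__class__.__name__ == 'ZMQInteractiveShell'
    except ImportError:
        return False


def title(msg: str) -> None:
    if _in_notebook():
        from IPython.display import Markdown, display
        display(Markdown(f"### {msg}"))
    else:
        print(msg)


def message(msg: str) -> None:
    print(msg)


def dataframe(df: pd.DataFrame) -> None:
    if _in_notebook():
        from IPython.display import display
        try:
            display(df.style.hide(axis='index'))
        except ImportError:  # Styler rendering needs jinja2
            print(df.to_string(index=False))
    else:
        print(df.to_string(index=False))
```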
```diff
@@ -399,21 +458,28 @@ class Table(ABC):
         def _set_primary_key(primary_key: str) -> None:
             self.primary_key = primary_key
             if verbose:
-
-
-
-        if primary_key := self._source_primary_key:
-            _set_primary_key(primary_key)
-            return self
-
-        unique_keys = [
-            column.name for column in self._source_column_dict.values()
-            if column.is_unique_key
-        ]
-        if len(unique_keys) == 1:  # NOTE No composite keys yet.
-            _set_primary_key(unique_keys[0])
-            return self
+                display.message(f"Inferred primary key `{primary_key}` for "
+                                f"table `{self.name}`")
 
+        # Inference from source column metadata:
+        if any(column.is_source for column in self.columns):
+            primary_key = self._source_primary_key
+            if (primary_key is not None and primary_key in self
+                    and self[primary_key].is_source):
+                _set_primary_key(primary_key)
+                return self
+
+            unique_keys = [
+                column.name for column in self._source_column_dict.values()
+                if column.is_unique_key
+            ]
+            if (len(unique_keys) == 1  # NOTE No composite keys yet.
+                    and unique_keys[0] in self
+                    and self[unique_keys[0]].is_source):
+                _set_primary_key(unique_keys[0])
+                return self
+
+        # Heuristic-based inference:
         candidates = [
             column.name for column in self.columns if column.stype == Stype.ID
         ]
@@ -427,7 +493,7 @@ class Table(ABC):
 
         if primary_key := infer_primary_key(
                 table_name=self.name,
-                df=self.
+                df=self._get_sample_df(),
                 candidates=candidates,
         ):
             _set_primary_key(primary_key)
@@ -444,6 +510,7 @@ class Table(ABC):
         if self.has_time_column():
             return self
 
+        # Heuristic-based inference:
         candidates = [
             column.name for column in self.columns
             if column.stype == Stype.timestamp
@@ -451,14 +518,14 @@ class Table(ABC):
         ]
 
         if time_column := infer_time_column(
-                df=self.
+                df=self._get_sample_df(),
                 candidates=candidates,
         ):
             self.time_column = time_column
 
             if verbose:
-
-
+                display.message(f"Inferred time column `{time_column}` for "
+                                f"table `{self.name}`")
 
         return self
 
@@ -474,15 +541,16 @@ class Table(ABC):
         if not self.has_primary_key():
             self.infer_primary_key(verbose=False)
         if self.has_primary_key():
-            logs.append(f"primary key
+            logs.append(f"primary key `{self._primary_key}`")
 
         if not self.has_time_column():
             self.infer_time_column(verbose=False)
         if self.has_time_column():
-            logs.append(f"time column
+            logs.append(f"time column `{self._time_column}`")
 
         if verbose and len(logs) > 0:
-
+            display.message(f"Inferred {' and '.join(logs)} for table "
+                            f"`{self.name}`")
 
         return self
 
```
```diff
@@ -500,16 +568,113 @@ class Table(ABC):
             end_time_col=self._end_time_column,
         )
 
-    @
+    @cached_property
+    def _source_column_dict(self) -> dict[str, SourceColumn]:
+        source_columns = self._get_source_columns()
+        if len(source_columns) == 0:
+            raise ValueError(f"Table '{self.name}' has no columns")
+        return {column.name: column for column in source_columns}
+
+    @cached_property
     def _source_primary_key(self) -> str | None:
         primary_keys = [
             column.name for column in self._source_column_dict.values()
             if column.is_primary_key
         ]
-
-
+        # NOTE No composite keys yet.
+        return primary_keys[0] if len(primary_keys) == 1 else None
 
-
+    @cached_property
+    def _source_foreign_key_dict(self) -> dict[str, SourceForeignKey]:
+        return {key.name: key for key in self._get_source_foreign_keys()}
+
+    @cached_property
+    def _source_sample_df(self) -> pd.DataFrame:
+        return self._get_source_sample_df().reset_index(drop=True)
+
+    @cached_property
+    def _num_rows(self) -> int | None:
+        return self._get_num_rows()
+
+    def _get_sample_df(self) -> pd.DataFrame:
+        dfs: list[pd.DataFrame] = []
+        if any(column.is_source for column in self.columns):
+            dfs.append(self._source_sample_df)
+        if any(not column.is_source for column in self.columns):
+            dfs.append(self._expr_sample_df)
+
+        if len(dfs) == 0:
+            return pd.DataFrame(index=range(1000))
+        if len(dfs) == 1:
+            return dfs[0]
+
+        size = min(map(len, dfs))
+        df = pd.concat([dfs[0].iloc[:size], dfs[1].iloc[:size]], axis=1)
+        df = df.loc[:, ~df.columns.duplicated(keep='last')]
+        return df
+
+    @staticmethod
+    def _sanitize(
+        df: pd.DataFrame,
+        dtype_dict: dict[str, Dtype | None] | None = None,
+        stype_dict: dict[str, Stype | None] | None = None,
+    ) -> pd.DataFrame:
+        r"""Sanitzes a :class:`pandas.DataFrame` in-place such that its data
+        types match table data and semantic type specification.
+        """
+        def _to_datetime(ser: pd.Series) -> pd.Series:
+            if not pd.api.types.is_datetime64_any_dtype(ser):
+                with warnings.catch_warnings():
+                    warnings.filterwarnings(
+                        'ignore',
+                        message='Could not infer format',
+                    )
+                    ser = pd.to_datetime(ser, errors='coerce')
+            if isinstance(ser.dtype, pd.DatetimeTZDtype):
+                ser = ser.dt.tz_localize(None)
+            if ser.dtype != 'datetime64[ns]':
+                ser = ser.astype('datetime64[ns]')
+            return ser
+
+        def _to_list(ser: pd.Series, dtype: Dtype | None) -> pd.Series:
+            if (pd.api.types.is_string_dtype(ser)
+                    and dtype in {Dtype.intlist, Dtype.floatlist}):
+                try:
+                    ser = ser.map(lambda row: np.fromstring(
+                        row.strip('[]'),
+                        sep=',',
+                        dtype=int if dtype == Dtype.intlist else np.float32,
+                    ) if row is not None else None)
+                except Exception:
+                    pass
+
+            if pd.api.types.is_string_dtype(ser):
+                try:
+                    import orjson as json
+                except ImportError:
+                    import json
+                try:
+                    ser = ser.map(lambda row: json.loads(row)
+                                  if row is not None else None)
+                except Exception:
+                    pass
+
+            return ser
+
+        for column_name in df.columns:
+            dtype = (dtype_dict or {}).get(column_name)
+            stype = (stype_dict or {}).get(column_name)
+
+            if dtype == Dtype.time:
+                df[column_name] = _to_datetime(df[column_name])
+            elif stype == Stype.timestamp:
+                df[column_name] = _to_datetime(df[column_name])
+            elif dtype is not None and dtype.is_list():
+                df[column_name] = _to_list(df[column_name], dtype)
+            elif stype == Stype.sequence:
+                df[column_name] = _to_list(df[column_name], Dtype.floatlist)
+
+        return df
 
     # Python builtins #########################################################
 
```
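The coercions in `_sanitize` can be exercised without any Kumo imports. The snippet below replays the same two conversions on a toy frame: timestamp-like strings become timezone-naive `datetime64[ns]`, and string-encoded numeric lists become `float32` arrays (the JSON fallback and the `Dtype`/`Stype` dispatch are omitted for brevity).

```python
import numpy as np
import pandas as pd

# Mirrors the two conversions in `Table._sanitize` shown above, outside the
# class: datetime coercion and parsing of string-encoded numeric lists.
df = pd.DataFrame({
    'signup': ['2024-01-01T10:00:00+00:00', '2024-02-03T08:30:00+00:00', None],
    'scores': ['[1.0, 2.5, 3.0]', '[0.5]', None],
})

ser = pd.to_datetime(df['signup'], errors='coerce')
if isinstance(ser.dtype, pd.DatetimeTZDtype):
    ser = ser.dt.tz_localize(None)  # drop timezone info, keep wall-clock time
df['signup'] = ser.astype('datetime64[ns]')

df['scores'] = df['scores'].map(
    lambda row: np.fromstring(row.strip('[]'), sep=',', dtype=np.float32)
    if row is not None else None)

print(df.dtypes)             # signup: datetime64[ns], scores: object
print(df['scores'].iloc[0])  # [1.  2.5 3. ]
```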
```diff
@@ -546,25 +711,24 @@ class Table(ABC):
     def backend(self) -> DataBackend:
         r"""The data backend of this table."""
 
-    @cached_property
-    def _source_column_dict(self) -> dict[str, SourceColumn]:
-        return {col.name: col for col in self._get_source_columns()}
-
     @abstractmethod
     def _get_source_columns(self) -> list[SourceColumn]:
         pass
 
-    @
-    def
-
+    @abstractmethod
+    def _get_source_foreign_keys(self) -> list[SourceForeignKey]:
+        pass
 
     @abstractmethod
-    def
+    def _get_source_sample_df(self) -> pd.DataFrame:
         pass
 
-    @
-    def
-
+    @abstractmethod
+    def _get_expr_sample_df(
+        self,
+        columns: Sequence[ColumnSpec],
+    ) -> pd.DataFrame:
+        pass
 
     @abstractmethod
     def _get_num_rows(self) -> int | None:
```
|