kumoai 2.13.0.dev202512011731__cp312-cp312-macosx_11_0_arm64.whl → 2.14.0.dev202512181731__cp312-cp312-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kumoai/__init__.py +12 -0
- kumoai/_version.py +1 -1
- kumoai/client/pquery.py +6 -2
- kumoai/experimental/rfm/__init__.py +33 -8
- kumoai/experimental/rfm/authenticate.py +3 -4
- kumoai/experimental/rfm/backend/local/__init__.py +4 -0
- kumoai/experimental/rfm/{local_graph_store.py → backend/local/graph_store.py} +53 -107
- kumoai/experimental/rfm/backend/local/sampler.py +315 -0
- kumoai/experimental/rfm/backend/local/table.py +41 -80
- kumoai/experimental/rfm/backend/snow/__init__.py +37 -0
- kumoai/experimental/rfm/backend/snow/sampler.py +252 -0
- kumoai/experimental/rfm/backend/snow/table.py +147 -0
- kumoai/experimental/rfm/backend/sqlite/__init__.py +11 -2
- kumoai/experimental/rfm/backend/sqlite/sampler.py +349 -0
- kumoai/experimental/rfm/backend/sqlite/table.py +108 -88
- kumoai/experimental/rfm/base/__init__.py +26 -2
- kumoai/experimental/rfm/base/column.py +6 -12
- kumoai/experimental/rfm/base/column_expression.py +16 -0
- kumoai/experimental/rfm/base/sampler.py +773 -0
- kumoai/experimental/rfm/base/source.py +19 -0
- kumoai/experimental/rfm/base/sql_sampler.py +84 -0
- kumoai/experimental/rfm/base/sql_table.py +113 -0
- kumoai/experimental/rfm/base/table.py +174 -76
- kumoai/experimental/rfm/graph.py +444 -84
- kumoai/experimental/rfm/infer/__init__.py +6 -0
- kumoai/experimental/rfm/infer/dtype.py +77 -0
- kumoai/experimental/rfm/infer/pkey.py +128 -0
- kumoai/experimental/rfm/infer/time_col.py +61 -0
- kumoai/experimental/rfm/pquery/executor.py +27 -27
- kumoai/experimental/rfm/pquery/pandas_executor.py +30 -32
- kumoai/experimental/rfm/rfm.py +299 -240
- kumoai/experimental/rfm/sagemaker.py +4 -4
- kumoai/pquery/predictive_query.py +10 -6
- kumoai/testing/snow.py +50 -0
- kumoai/utils/__init__.py +3 -2
- kumoai/utils/progress_logger.py +178 -12
- kumoai/utils/sql.py +3 -0
- {kumoai-2.13.0.dev202512011731.dist-info → kumoai-2.14.0.dev202512181731.dist-info}/METADATA +6 -2
- {kumoai-2.13.0.dev202512011731.dist-info → kumoai-2.14.0.dev202512181731.dist-info}/RECORD +42 -30
- kumoai/experimental/rfm/local_graph_sampler.py +0 -182
- kumoai/experimental/rfm/local_pquery_driver.py +0 -689
- kumoai/experimental/rfm/utils.py +0 -344
- {kumoai-2.13.0.dev202512011731.dist-info → kumoai-2.14.0.dev202512181731.dist-info}/WHEEL +0 -0
- {kumoai-2.13.0.dev202512011731.dist-info → kumoai-2.14.0.dev202512181731.dist-info}/licenses/LICENSE +0 -0
- {kumoai-2.13.0.dev202512011731.dist-info → kumoai-2.14.0.dev202512181731.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
|
|
3
|
+
from kumoapi.typing import Dtype
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
class SourceColumn:
    r"""Metadata record describing a single column of a source table.

    Args:
        name: The name of the column.
        dtype: The data type of the column.
        is_primary_key: Whether the source marks this column as a primary
            key.
        is_unique_key: Whether the source marks this column as a unique key.
        is_nullable: Presumably whether the column may hold missing values
            (not read anywhere in the visible code; confirm with the
            backends that populate it).
    """
    # Identification:
    name: str
    dtype: Dtype

    # Key and constraint flags:
    is_primary_key: bool
    is_unique_key: bool
    is_nullable: bool
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class SourceForeignKey:
    r"""Metadata record describing a foreign key link of a source table.

    Args:
        name: The name under which the foreign key is registered
            (presumably the referencing column; confirm with the backends
            that populate it).
        dst_table: The name of the table the foreign key points to.
        primary_key: The name of the key in ``dst_table`` that is being
            referenced.
    """
    # Referencing side:
    name: str

    # Referenced side:
    dst_table: str
    primary_key: str
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
from abc import abstractmethod
|
|
2
|
+
from typing import TYPE_CHECKING, Literal
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
from kumoai.experimental.rfm.base import Sampler, SamplerOutput, SQLTable
|
|
8
|
+
from kumoai.utils import ProgressLogger
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from kumoai.experimental.rfm import Graph
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class SQLSampler(Sampler):
    r"""A :class:`Sampler` for graphs in which every table is a
    :class:`SQLTable`.

    Concrete subclasses provide the primary key lookup via
    :meth:`_by_pkey`.

    Args:
        graph: The graph to sample from. Every table in it is asserted to be
            a :class:`SQLTable`.
        verbose: Passed through unchanged to the parent :class:`Sampler`.
    """
    def __init__(
        self,
        graph: 'Graph',
        verbose: bool | ProgressLogger = True,
    ) -> None:
        super().__init__(graph=graph, verbose=verbose)

        self._fqn_dict: dict[str, str] = {}
        for sql_table in graph.tables.values():
            assert isinstance(sql_table, SQLTable)
            # Re-assigned on every iteration, i.e. the connection of the
            # last table wins (presumably all tables share one connection):
            self._connection = sql_table._connection
            self._fqn_dict[sql_table.name] = sql_table.fqn

    @property
    def fqn_dict(self) -> dict[str, str]:
        r"""A mapping from each table name in the graph to its
        fully-qualified quoted source name.
        """
        return self._fqn_dict

    def _sample_subgraph(
        self,
        entity_table_name: str,
        entity_pkey: pd.Series,
        anchor_time: pd.Series | Literal['entity'],
        columns_dict: dict[str, set[str]],
        num_neighbors: list[int],
    ) -> SamplerOutput:
        r"""Fetches the entity rows for the given primary keys.

        Only rows of the entity table are returned: ``num_neighbors`` is not
        read by this implementation and all edge-related outputs are empty.

        Args:
            entity_table_name: The name of the entity table.
            entity_pkey: The primary keys of the entities to fetch.
            anchor_time: The anchor time per entity. Any value that is not a
                :class:`pandas.Series` makes the time column of the entity
                table act as anchor time.
            columns_dict: The columns to fetch, per table name.
            num_neighbors: Unused.

        Raises:
            KeyError: If at least one primary key was not found in the
                entity table.
        """
        df, batch = self._by_pkey(
            table_name=entity_table_name,
            pkey=entity_pkey,
            columns=columns_dict[entity_table_name],
        )

        num_requested = len(entity_pkey)
        if len(batch) != num_requested:
            # Every position that `batch` does not point to is a miss:
            mask = np.ones(num_requested, dtype=bool)
            mask[batch] = False
            raise KeyError(f"The primary keys "
                           f"{entity_pkey.iloc[mask].tolist()} do not exist "
                           f"in the '{entity_table_name}' table")

        # Bring the rows into the order of the requested primary keys:
        order = batch.argsort()
        batch = batch[order]
        df = df.iloc[order].reset_index(drop=True)

        if isinstance(anchor_time, pd.Series):
            anchor = anchor_time
        else:
            anchor = df[self.time_column_dict[entity_table_name]]

        return SamplerOutput(
            anchor_time=anchor.astype(int).to_numpy(),
            df_dict={entity_table_name: df},
            inverse_dict={},
            batch_dict={entity_table_name: batch},
            num_sampled_nodes_dict={entity_table_name: [len(batch)]},
            row_dict={},
            col_dict={},
            num_sampled_edges_dict={},
        )

    # Abstract Methods ########################################################

    @abstractmethod
    def _by_pkey(
        self,
        table_name: str,
        pkey: pd.Series,
        columns: set[str],
    ) -> tuple[pd.DataFrame, np.ndarray]:
        r"""Looks up rows of a table by primary key.

        Args:
            table_name: The name of the table to read from.
            pkey: The primary keys to look up.
            columns: The columns to fetch.

        Returns:
            The fetched rows and an index array. The caller uses the array as
            one position into ``pkey`` per fetched row, aligned with the row
            order of the data frame.
        """
        pass
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
from abc import abstractmethod
|
|
2
|
+
from collections import defaultdict
|
|
3
|
+
from collections.abc import Sequence
|
|
4
|
+
from functools import cached_property
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from kumoapi.model_plan import MissingType
|
|
8
|
+
|
|
9
|
+
from kumoai.experimental.rfm.base import (
|
|
10
|
+
ColumnExpressionType,
|
|
11
|
+
SourceForeignKey,
|
|
12
|
+
Table,
|
|
13
|
+
)
|
|
14
|
+
from kumoai.utils import quote_ident
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class SQLTable(Table):
    r"""A :class:`Table` whose data lives in a SQL database.

    Args:
        name: The logical name of this table.
        source_name: The physical name of this table in the database. Falls
            back to ``name`` if not given.
        columns: The physical columns to select. If set to ``None``, every
            source column that is not already present is added.
        column_expressions: The logical columns of this table. They are
            registered before any physical column.
        primary_key: The name of the primary key of this table, if it exists.
            If left unspecified, the primary key reported by the source is
            used.
        time_column: The name of the time column of this table, if it exists.
        end_time_column: The name of the end time column of this table, if it
            exists.
    """
    def __init__(
        self,
        name: str,
        source_name: str | None = None,
        columns: Sequence[str] | None = None,
        column_expressions: Sequence[ColumnExpressionType] | None = None,
        primary_key: MissingType | str | None = MissingType.VALUE,
        time_column: str | None = None,
        end_time_column: str | None = None,
    ) -> None:

        self._connection: Any
        self._source_name = source_name or name

        # Start from an empty table; columns and special columns are
        # registered step by step below.
        super().__init__(
            name=name,
            columns=[],
            primary_key=None,
            time_column=None,
            end_time_column=None,
        )

        if isinstance(primary_key, MissingType):
            primary_key = self._source_primary_key

        # Add column expressions with highest priority:
        self._add_column_expressions(column_expressions or [])

        if columns is not None:
            for column_name in columns:
                self.add_column(column_name)
        else:
            for column_name in self._source_column_dict:
                if column_name in self:
                    continue
                self.add_column(column_name)

        # Register the special columns in a fixed order (primary key, time
        # column, end time column), adding each column on demand:
        special_columns = (
            ('primary_key', primary_key),
            ('time_column', time_column),
            ('end_time_column', end_time_column),
        )
        for attr_name, column_name in special_columns:
            if column_name is None:
                continue
            if column_name not in self:
                self.add_column(column_name)
            setattr(self, attr_name, column_name)

    @property
    def fqn(self) -> str:
        r"""The fully-qualified quoted source table name."""
        return quote_ident(self._source_name)

    # Column ##################################################################

    def _add_column_expressions(
        self,
        columns: Sequence[ColumnExpressionType],
    ) -> None:
        r"""Hook for registering column expressions; a no-op in this class.

        Args:
            columns: The column expressions to register.
        """
        pass

    # Abstract Methods ########################################################

    @cached_property
    def _source_foreign_key_dict(self) -> dict[str, SourceForeignKey]:
        r"""The supported foreign keys of the source table, by name."""
        fkeys = self._get_source_foreign_keys()

        # Collect the distinct keys referenced in each destination table:
        referenced_keys: dict[str, set[str]] = defaultdict(set)
        for fkey in fkeys:
            referenced_keys[fkey.dst_table].add(fkey.primary_key)

        # NOTE Drop all keys that link to multiple keys in the same table since
        # we don't support composite keys yet:
        fkey_dict: dict[str, SourceForeignKey] = {}
        for fkey in fkeys:
            if len(referenced_keys[fkey.dst_table]) != 1:
                continue
            fkey_dict[fkey.name] = fkey
        return fkey_dict

    @abstractmethod
    def _get_source_foreign_keys(self) -> list[SourceForeignKey]:
        r"""Returns all foreign keys of the source table."""
        pass
|
|
@@ -1,15 +1,25 @@
|
|
|
1
1
|
from abc import ABC, abstractmethod
|
|
2
|
-
from
|
|
2
|
+
from collections.abc import Sequence
|
|
3
|
+
from functools import cached_property
|
|
3
4
|
|
|
4
5
|
import pandas as pd
|
|
6
|
+
from kumoapi.model_plan import MissingType
|
|
5
7
|
from kumoapi.source_table import UnavailableSourceTable
|
|
6
8
|
from kumoapi.table import Column as ColumnDefinition
|
|
7
9
|
from kumoapi.table import TableDefinition
|
|
8
|
-
from kumoapi.typing import
|
|
10
|
+
from kumoapi.typing import Stype
|
|
9
11
|
from typing_extensions import Self
|
|
10
12
|
|
|
11
|
-
from kumoai import in_notebook
|
|
12
|
-
from kumoai.experimental.rfm.base import Column
|
|
13
|
+
from kumoai import in_notebook, in_snowflake_notebook
|
|
14
|
+
from kumoai.experimental.rfm.base import Column, DataBackend, SourceColumn
|
|
15
|
+
from kumoai.experimental.rfm.infer import (
|
|
16
|
+
contains_categorical,
|
|
17
|
+
contains_id,
|
|
18
|
+
contains_multicategorical,
|
|
19
|
+
contains_timestamp,
|
|
20
|
+
infer_primary_key,
|
|
21
|
+
infer_time_column,
|
|
22
|
+
)
|
|
13
23
|
|
|
14
24
|
|
|
15
25
|
class Table(ABC):
|
|
@@ -28,19 +38,29 @@ class Table(ABC):
|
|
|
28
38
|
def __init__(
|
|
29
39
|
self,
|
|
30
40
|
name: str,
|
|
31
|
-
columns:
|
|
32
|
-
primary_key:
|
|
33
|
-
time_column:
|
|
34
|
-
end_time_column:
|
|
41
|
+
columns: Sequence[str] | None = None,
|
|
42
|
+
primary_key: MissingType | str | None = MissingType.VALUE,
|
|
43
|
+
time_column: str | None = None,
|
|
44
|
+
end_time_column: str | None = None,
|
|
35
45
|
) -> None:
|
|
36
46
|
|
|
37
47
|
self._name = name
|
|
38
|
-
self._primary_key:
|
|
39
|
-
self._time_column:
|
|
40
|
-
self._end_time_column:
|
|
48
|
+
self._primary_key: str | None = None
|
|
49
|
+
self._time_column: str | None = None
|
|
50
|
+
self._end_time_column: str | None = None
|
|
41
51
|
|
|
42
|
-
|
|
43
|
-
|
|
52
|
+
if columns is None:
|
|
53
|
+
columns = list(self._source_column_dict.keys())
|
|
54
|
+
|
|
55
|
+
if len(self._source_column_dict) == 0:
|
|
56
|
+
raise ValueError(f"Table '{name}' does not hold any column with "
|
|
57
|
+
f"a supported data type")
|
|
58
|
+
|
|
59
|
+
if isinstance(primary_key, MissingType):
|
|
60
|
+
primary_key = self._source_primary_key
|
|
61
|
+
|
|
62
|
+
self._columns: dict[str, Column] = {}
|
|
63
|
+
for column_name in columns:
|
|
44
64
|
self.add_column(column_name)
|
|
45
65
|
|
|
46
66
|
if primary_key is not None:
|
|
@@ -63,7 +83,7 @@ class Table(ABC):
|
|
|
63
83
|
r"""The name of this table."""
|
|
64
84
|
return self._name
|
|
65
85
|
|
|
66
|
-
#
|
|
86
|
+
# Column ##################################################################
|
|
67
87
|
|
|
68
88
|
def has_column(self, name: str) -> bool:
|
|
69
89
|
r"""Returns ``True`` if this table holds a column with name ``name``;
|
|
@@ -85,7 +105,7 @@ class Table(ABC):
|
|
|
85
105
|
return self._columns[name]
|
|
86
106
|
|
|
87
107
|
@property
|
|
88
|
-
def columns(self) ->
|
|
108
|
+
def columns(self) -> list[Column]:
|
|
89
109
|
r"""Returns a list of :class:`Column` objects that represent the
|
|
90
110
|
columns in this table.
|
|
91
111
|
"""
|
|
@@ -104,20 +124,24 @@ class Table(ABC):
|
|
|
104
124
|
raise KeyError(f"Column '{name}' already exists in table "
|
|
105
125
|
f"'{self.name}'")
|
|
106
126
|
|
|
107
|
-
if not self.
|
|
127
|
+
if name not in self._source_column_dict:
|
|
108
128
|
raise KeyError(f"Column '{name}' does not exist in the underlying "
|
|
109
129
|
f"source table")
|
|
110
130
|
|
|
111
|
-
|
|
112
|
-
dtype = self._get_source_dtype(name)
|
|
113
|
-
except Exception as e:
|
|
114
|
-
raise RuntimeError(f"Could not obtain data type for column "
|
|
115
|
-
f"'{name}' in table '{self.name}'. Change "
|
|
116
|
-
f"the data type of the column in the source "
|
|
117
|
-
f"table or remove it from the table.") from e
|
|
131
|
+
dtype = self._source_column_dict[name].dtype
|
|
118
132
|
|
|
119
133
|
try:
|
|
120
|
-
|
|
134
|
+
ser = self._sample_df[name]
|
|
135
|
+
if contains_id(ser, name, dtype):
|
|
136
|
+
stype = Stype.ID
|
|
137
|
+
elif contains_timestamp(ser, name, dtype):
|
|
138
|
+
stype = Stype.timestamp
|
|
139
|
+
elif contains_multicategorical(ser, name, dtype):
|
|
140
|
+
stype = Stype.multicategorical
|
|
141
|
+
elif contains_categorical(ser, name, dtype):
|
|
142
|
+
stype = Stype.categorical
|
|
143
|
+
else:
|
|
144
|
+
stype = dtype.default_stype
|
|
121
145
|
except Exception as e:
|
|
122
146
|
raise RuntimeError(f"Could not obtain semantic type for column "
|
|
123
147
|
f"'{name}' in table '{self.name}'. Change "
|
|
@@ -126,8 +150,8 @@ class Table(ABC):
|
|
|
126
150
|
|
|
127
151
|
self._columns[name] = Column(
|
|
128
152
|
name=name,
|
|
129
|
-
dtype=dtype,
|
|
130
153
|
stype=stype,
|
|
154
|
+
dtype=dtype,
|
|
131
155
|
)
|
|
132
156
|
|
|
133
157
|
return self._columns[name]
|
|
@@ -163,7 +187,7 @@ class Table(ABC):
|
|
|
163
187
|
return self._primary_key is not None
|
|
164
188
|
|
|
165
189
|
@property
|
|
166
|
-
def primary_key(self) ->
|
|
190
|
+
def primary_key(self) -> Column | None:
|
|
167
191
|
r"""The primary key column of this table.
|
|
168
192
|
|
|
169
193
|
The getter returns the primary key column of this table, or ``None`` if
|
|
@@ -178,7 +202,7 @@ class Table(ABC):
|
|
|
178
202
|
return self[self._primary_key]
|
|
179
203
|
|
|
180
204
|
@primary_key.setter
|
|
181
|
-
def primary_key(self, name:
|
|
205
|
+
def primary_key(self, name: str | None) -> None:
|
|
182
206
|
if name is not None and name == self._time_column:
|
|
183
207
|
raise ValueError(f"Cannot specify column '{name}' as a primary "
|
|
184
208
|
f"key since it is already defined to be a time "
|
|
@@ -208,7 +232,7 @@ class Table(ABC):
|
|
|
208
232
|
return self._time_column is not None
|
|
209
233
|
|
|
210
234
|
@property
|
|
211
|
-
def time_column(self) ->
|
|
235
|
+
def time_column(self) -> Column | None:
|
|
212
236
|
r"""The time column of this table.
|
|
213
237
|
|
|
214
238
|
The getter returns the time column of this table, or ``None`` if no
|
|
@@ -223,7 +247,7 @@ class Table(ABC):
|
|
|
223
247
|
return self[self._time_column]
|
|
224
248
|
|
|
225
249
|
@time_column.setter
|
|
226
|
-
def time_column(self, name:
|
|
250
|
+
def time_column(self, name: str | None) -> None:
|
|
227
251
|
if name is not None and name == self._primary_key:
|
|
228
252
|
raise ValueError(f"Cannot specify column '{name}' as a time "
|
|
229
253
|
f"column since it is already defined to be a "
|
|
@@ -253,7 +277,7 @@ class Table(ABC):
|
|
|
253
277
|
return self._end_time_column is not None
|
|
254
278
|
|
|
255
279
|
@property
|
|
256
|
-
def end_time_column(self) ->
|
|
280
|
+
def end_time_column(self) -> Column | None:
|
|
257
281
|
r"""The end time column of this table.
|
|
258
282
|
|
|
259
283
|
The getter returns the end time column of this table, or ``None`` if no
|
|
@@ -269,7 +293,7 @@ class Table(ABC):
|
|
|
269
293
|
return self[self._end_time_column]
|
|
270
294
|
|
|
271
295
|
@end_time_column.setter
|
|
272
|
-
def end_time_column(self, name:
|
|
296
|
+
def end_time_column(self, name: str | None) -> None:
|
|
273
297
|
if name is not None and name == self._primary_key:
|
|
274
298
|
raise ValueError(f"Cannot specify column '{name}' as an end time "
|
|
275
299
|
f"column since it is already defined to be a "
|
|
@@ -338,10 +362,16 @@ class Table(ABC):
|
|
|
338
362
|
|
|
339
363
|
def print_metadata(self) -> None:
|
|
340
364
|
r"""Prints the :meth:`~metadata` of this table."""
|
|
341
|
-
|
|
342
|
-
|
|
365
|
+
# Row-count suffix that is appended to the printed heading; it stays empty
# when the number of rows is unknown.
num_rows_repr = ''
if self._num_rows is not None:
    # NOTE Needs to be an f-string, otherwise the placeholder text itself
    # is printed instead of the thousands-separated row count.
    num_rows_repr = f' ({self._num_rows:,} rows)'
|
|
343
368
|
|
|
344
|
-
if
|
|
369
|
+
if in_snowflake_notebook():
|
|
370
|
+
import streamlit as st
|
|
371
|
+
md_repr = f"### 🏷️ Metadata of Table `{self.name}`{num_rows_repr}"
|
|
372
|
+
st.markdown(md_repr)
|
|
373
|
+
st.dataframe(self.metadata, hide_index=True)
|
|
374
|
+
elif in_notebook():
|
|
345
375
|
from IPython.display import Markdown, display
|
|
346
376
|
md_repr = f"### 🏷️ Metadata of Table `{self.name}`{num_rows_repr}"
|
|
347
377
|
display(Markdown(md_repr))
|
|
@@ -357,8 +387,83 @@ class Table(ABC):
|
|
|
357
387
|
print(f"🏷️ Metadata of Table '{self.name}'{num_rows_repr}")
|
|
358
388
|
print(self.metadata.to_string(index=False))
|
|
359
389
|
|
|
390
|
+
def infer_primary_key(self, verbose: bool = True) -> Self:
|
|
391
|
+
r"""Infers the primary key in this table.
|
|
392
|
+
|
|
393
|
+
Args:
|
|
394
|
+
verbose: Whether to print verbose output.
|
|
395
|
+
"""
|
|
396
|
+
if self.has_primary_key():
|
|
397
|
+
return self
|
|
398
|
+
|
|
399
|
+
def _set_primary_key(primary_key: str) -> None:
|
|
400
|
+
self.primary_key = primary_key
|
|
401
|
+
if verbose:
|
|
402
|
+
print(f"Detected primary key '{primary_key}' in table "
|
|
403
|
+
f"'{self.name}'")
|
|
404
|
+
|
|
405
|
+
if primary_key := self._source_primary_key:
|
|
406
|
+
_set_primary_key(primary_key)
|
|
407
|
+
return self
|
|
408
|
+
|
|
409
|
+
unique_keys = [
|
|
410
|
+
column.name for column in self._source_column_dict.values()
|
|
411
|
+
if column.is_unique_key
|
|
412
|
+
]
|
|
413
|
+
if len(unique_keys) == 1: # NOTE No composite keys yet.
|
|
414
|
+
_set_primary_key(unique_keys[0])
|
|
415
|
+
return self
|
|
416
|
+
|
|
417
|
+
candidates = [
|
|
418
|
+
column.name for column in self.columns if column.stype == Stype.ID
|
|
419
|
+
]
|
|
420
|
+
if len(candidates) == 0:
|
|
421
|
+
for column in self.columns:
|
|
422
|
+
if self.name.lower() == column.name.lower():
|
|
423
|
+
candidates.append(column.name)
|
|
424
|
+
elif (self.name.lower().endswith('s')
|
|
425
|
+
and self.name.lower()[:-1] == column.name.lower()):
|
|
426
|
+
candidates.append(column.name)
|
|
427
|
+
|
|
428
|
+
if primary_key := infer_primary_key(
|
|
429
|
+
table_name=self.name,
|
|
430
|
+
df=self._sample_df,
|
|
431
|
+
candidates=candidates,
|
|
432
|
+
):
|
|
433
|
+
_set_primary_key(primary_key)
|
|
434
|
+
return self
|
|
435
|
+
|
|
436
|
+
return self
|
|
437
|
+
|
|
438
|
+
def infer_time_column(self, verbose: bool = True) -> Self:
|
|
439
|
+
r"""Infers the time column in this table.
|
|
440
|
+
|
|
441
|
+
Args:
|
|
442
|
+
verbose: Whether to print verbose output.
|
|
443
|
+
"""
|
|
444
|
+
if self.has_time_column():
|
|
445
|
+
return self
|
|
446
|
+
|
|
447
|
+
candidates = [
|
|
448
|
+
column.name for column in self.columns
|
|
449
|
+
if column.stype == Stype.timestamp
|
|
450
|
+
and column.name != self._end_time_column
|
|
451
|
+
]
|
|
452
|
+
|
|
453
|
+
if time_column := infer_time_column(
|
|
454
|
+
df=self._sample_df,
|
|
455
|
+
candidates=candidates,
|
|
456
|
+
):
|
|
457
|
+
self.time_column = time_column
|
|
458
|
+
|
|
459
|
+
if verbose:
|
|
460
|
+
print(f"Detected time column '{time_column}' in table "
|
|
461
|
+
f"'{self.name}'")
|
|
462
|
+
|
|
463
|
+
return self
|
|
464
|
+
|
|
360
465
|
def infer_metadata(self, verbose: bool = True) -> Self:
|
|
361
|
-
r"""Infers metadata, *i.e.*, primary keys and time columns, in
|
|
466
|
+
r"""Infers metadata, *i.e.*, primary keys and time columns, in this
|
|
362
467
|
table.
|
|
363
468
|
|
|
364
469
|
Args:
|
|
@@ -366,38 +471,15 @@ class Table(ABC):
|
|
|
366
471
|
"""
|
|
367
472
|
logs = []
|
|
368
473
|
|
|
369
|
-
# Try to detect primary key if not set:
|
|
370
474
|
if not self.has_primary_key():
|
|
475
|
+
self.infer_primary_key(verbose=False)
|
|
476
|
+
if self.has_primary_key():
|
|
477
|
+
logs.append(f"primary key '{self._primary_key}'")
|
|
371
478
|
|
|
372
|
-
def is_candidate(column: Column) -> bool:
|
|
373
|
-
if column.stype == Stype.ID:
|
|
374
|
-
return True
|
|
375
|
-
if all(column.stype != Stype.ID for column in self.columns):
|
|
376
|
-
if self.name == column.name:
|
|
377
|
-
return True
|
|
378
|
-
if (self.name.endswith('s')
|
|
379
|
-
and self.name[:-1] == column.name):
|
|
380
|
-
return True
|
|
381
|
-
return False
|
|
382
|
-
|
|
383
|
-
candidates = [
|
|
384
|
-
column.name for column in self.columns if is_candidate(column)
|
|
385
|
-
]
|
|
386
|
-
|
|
387
|
-
if primary_key := self._infer_primary_key(candidates):
|
|
388
|
-
self.primary_key = primary_key
|
|
389
|
-
logs.append(f"primary key '{primary_key}'")
|
|
390
|
-
|
|
391
|
-
# Try to detect time column if not set:
|
|
392
479
|
if not self.has_time_column():
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
and column.name != self._end_time_column
|
|
397
|
-
]
|
|
398
|
-
if time_column := self._infer_time_column(candidates):
|
|
399
|
-
self.time_column = time_column
|
|
400
|
-
logs.append(f"time column '{time_column}'")
|
|
480
|
+
self.infer_time_column(verbose=False)
|
|
481
|
+
if self.has_time_column():
|
|
482
|
+
logs.append(f"time column '{self._time_column}'")
|
|
401
483
|
|
|
402
484
|
if verbose and len(logs) > 0:
|
|
403
485
|
print(f"Detected {' and '.join(logs)} in table '{self.name}'")
|
|
@@ -418,6 +500,17 @@ class Table(ABC):
|
|
|
418
500
|
end_time_col=self._end_time_column,
|
|
419
501
|
)
|
|
420
502
|
|
|
503
|
+
@property
|
|
504
|
+
def _source_primary_key(self) -> str | None:
|
|
505
|
+
primary_keys = [
|
|
506
|
+
column.name for column in self._source_column_dict.values()
|
|
507
|
+
if column.is_primary_key
|
|
508
|
+
]
|
|
509
|
+
if len(primary_keys) == 1: # NOTE No composite keys yet.
|
|
510
|
+
return primary_keys[0]
|
|
511
|
+
|
|
512
|
+
return None
|
|
513
|
+
|
|
421
514
|
# Python builtins #########################################################
|
|
422
515
|
|
|
423
516
|
def __hash__(self) -> int:
|
|
@@ -446,28 +539,33 @@ class Table(ABC):
|
|
|
446
539
|
f' end_time_column={self._end_time_column},\n'
|
|
447
540
|
f')')
|
|
448
541
|
|
|
449
|
-
# Abstract
|
|
542
|
+
# Abstract Methods ########################################################
|
|
450
543
|
|
|
544
|
+
@property
|
|
451
545
|
@abstractmethod
|
|
452
|
-
def
|
|
453
|
-
|
|
546
|
+
def backend(self) -> DataBackend:
|
|
547
|
+
r"""The data backend of this table."""
|
|
454
548
|
|
|
455
|
-
@
|
|
456
|
-
def
|
|
457
|
-
|
|
549
|
+
@cached_property
def _source_column_dict(self) -> dict[str, SourceColumn]:
    r"""The columns of the source table, keyed by column name. The result of
    :meth:`_get_source_columns` is computed once and cached.
    """
    source_columns = self._get_source_columns()
    return {column.name: column for column in source_columns}
|
|
458
552
|
|
|
459
553
|
@abstractmethod
|
|
460
|
-
def
|
|
554
|
+
def _get_source_columns(self) -> list[SourceColumn]:
|
|
461
555
|
pass
|
|
462
556
|
|
|
463
|
-
@
|
|
464
|
-
def
|
|
465
|
-
|
|
557
|
+
@cached_property
def _sample_df(self) -> pd.DataFrame:
    r"""The data frame returned by :meth:`_get_sample_df`, computed once and
    cached.
    """
    sample_df = self._get_sample_df()
    return sample_df
|
|
466
560
|
|
|
467
561
|
@abstractmethod
|
|
468
|
-
def
|
|
562
|
+
def _get_sample_df(self) -> pd.DataFrame:
|
|
469
563
|
pass
|
|
470
564
|
|
|
565
|
+
@cached_property
|
|
566
|
+
def _num_rows(self) -> int | None:
|
|
567
|
+
return self._get_num_rows()
|
|
568
|
+
|
|
471
569
|
@abstractmethod
|
|
472
|
-
def
|
|
570
|
+
def _get_num_rows(self) -> int | None:
|
|
473
571
|
pass
|