kumoai-2.13.0.dev202512031731-cp312-cp312-macosx_11_0_arm64.whl → kumoai-2.14.0.dev202512181731-cp312-cp312-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kumoai/__init__.py +12 -0
- kumoai/_version.py +1 -1
- kumoai/client/pquery.py +6 -2
- kumoai/experimental/rfm/__init__.py +33 -8
- kumoai/experimental/rfm/authenticate.py +3 -4
- kumoai/experimental/rfm/backend/local/__init__.py +4 -0
- kumoai/experimental/rfm/{local_graph_store.py → backend/local/graph_store.py} +52 -91
- kumoai/experimental/rfm/backend/local/sampler.py +315 -0
- kumoai/experimental/rfm/backend/local/table.py +31 -14
- kumoai/experimental/rfm/backend/snow/__init__.py +2 -0
- kumoai/experimental/rfm/backend/snow/sampler.py +252 -0
- kumoai/experimental/rfm/backend/snow/table.py +75 -23
- kumoai/experimental/rfm/backend/sqlite/__init__.py +4 -2
- kumoai/experimental/rfm/backend/sqlite/sampler.py +349 -0
- kumoai/experimental/rfm/backend/sqlite/table.py +71 -28
- kumoai/experimental/rfm/base/__init__.py +24 -3
- kumoai/experimental/rfm/base/column.py +6 -12
- kumoai/experimental/rfm/base/column_expression.py +16 -0
- kumoai/experimental/rfm/base/sampler.py +773 -0
- kumoai/experimental/rfm/base/source.py +1 -0
- kumoai/experimental/rfm/base/sql_sampler.py +84 -0
- kumoai/experimental/rfm/base/sql_table.py +113 -0
- kumoai/experimental/rfm/base/table.py +136 -105
- kumoai/experimental/rfm/graph.py +296 -89
- kumoai/experimental/rfm/infer/dtype.py +46 -59
- kumoai/experimental/rfm/infer/pkey.py +4 -2
- kumoai/experimental/rfm/infer/time_col.py +1 -2
- kumoai/experimental/rfm/pquery/executor.py +27 -27
- kumoai/experimental/rfm/pquery/pandas_executor.py +30 -32
- kumoai/experimental/rfm/rfm.py +299 -230
- kumoai/experimental/rfm/sagemaker.py +4 -4
- kumoai/pquery/predictive_query.py +10 -6
- kumoai/testing/snow.py +50 -0
- kumoai/utils/__init__.py +3 -2
- kumoai/utils/progress_logger.py +178 -12
- kumoai/utils/sql.py +3 -0
- {kumoai-2.13.0.dev202512031731.dist-info → kumoai-2.14.0.dev202512181731.dist-info}/METADATA +4 -2
- {kumoai-2.13.0.dev202512031731.dist-info → kumoai-2.14.0.dev202512181731.dist-info}/RECORD +41 -34
- kumoai/experimental/rfm/local_graph_sampler.py +0 -223
- kumoai/experimental/rfm/local_pquery_driver.py +0 -689
- {kumoai-2.13.0.dev202512031731.dist-info → kumoai-2.14.0.dev202512181731.dist-info}/WHEEL +0 -0
- {kumoai-2.13.0.dev202512031731.dist-info → kumoai-2.14.0.dev202512181731.dist-info}/licenses/LICENSE +0 -0
- {kumoai-2.13.0.dev202512031731.dist-info → kumoai-2.14.0.dev202512181731.dist-info}/top_level.txt +0 -0
kumoai/experimental/rfm/base/sql_sampler.py (new file, +84 lines):

```diff
@@ -0,0 +1,84 @@
+from abc import abstractmethod
+from typing import TYPE_CHECKING, Literal
+
+import numpy as np
+import pandas as pd
+
+from kumoai.experimental.rfm.base import Sampler, SamplerOutput, SQLTable
+from kumoai.utils import ProgressLogger
+
+if TYPE_CHECKING:
+    from kumoai.experimental.rfm import Graph
+
+
+class SQLSampler(Sampler):
+    def __init__(
+        self,
+        graph: 'Graph',
+        verbose: bool | ProgressLogger = True,
+    ) -> None:
+        super().__init__(graph=graph, verbose=verbose)
+
+        self._fqn_dict: dict[str, str] = {}
+        for table in graph.tables.values():
+            assert isinstance(table, SQLTable)
+            self._connection = table._connection
+            self._fqn_dict[table.name] = table.fqn
+
+    @property
+    def fqn_dict(self) -> dict[str, str]:
+        r"""The fully-qualified quoted source name for all table names in the
+        graph.
+        """
+        return self._fqn_dict
+
+    def _sample_subgraph(
+        self,
+        entity_table_name: str,
+        entity_pkey: pd.Series,
+        anchor_time: pd.Series | Literal['entity'],
+        columns_dict: dict[str, set[str]],
+        num_neighbors: list[int],
+    ) -> SamplerOutput:
+
+        df, batch = self._by_pkey(
+            table_name=entity_table_name,
+            pkey=entity_pkey,
+            columns=columns_dict[entity_table_name],
+        )
+        if len(batch) != len(entity_pkey):
+            mask = np.ones(len(entity_pkey), dtype=bool)
+            mask[batch] = False
+            raise KeyError(f"The primary keys "
+                           f"{entity_pkey.iloc[mask].tolist()} do not exist "
+                           f"in the '{entity_table_name}' table")
+
+        perm = batch.argsort()
+        batch = batch[perm]
+        df = df.iloc[perm].reset_index(drop=True)
+
+        if not isinstance(anchor_time, pd.Series):
+            time_column = self.time_column_dict[entity_table_name]
+            anchor_time = df[time_column]
+
+        return SamplerOutput(
+            anchor_time=anchor_time.astype(int).to_numpy(),
+            df_dict={entity_table_name: df},
+            inverse_dict={},
+            batch_dict={entity_table_name: batch},
+            num_sampled_nodes_dict={entity_table_name: [len(batch)]},
+            row_dict={},
+            col_dict={},
+            num_sampled_edges_dict={},
+        )
+
+    # Abstract Methods ########################################################
+
+    @abstractmethod
+    def _by_pkey(
+        self,
+        table_name: str,
+        pkey: pd.Series,
+        columns: set[str],
+    ) -> tuple[pd.DataFrame, np.ndarray]:
+        pass
```
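`SQLSampler` leaves the actual key lookup to its subclasses: `_by_pkey` must return the fetched rows together with the positions of the requested keys that were found, which `_sample_subgraph` then validates and re-orders. The concrete SQL backends presumably live in `backend/sqlite/sampler.py` and `backend/snow/sampler.py` from the file list above; the following is only a rough, hypothetical sketch of that contract against a plain `sqlite3` connection. The class, table, and column names are invented and the shipped backends are more involved.

```python
import sqlite3

import numpy as np
import pandas as pd


class ToySQLiteSampler:
    """Illustrative stand-in only; not the shipped SQLite backend."""

    def __init__(self, connection: sqlite3.Connection, pkey_column: str = 'id'):
        self._connection = connection
        self._pkey_column = pkey_column

    def _by_pkey(
        self,
        table_name: str,
        pkey: pd.Series,
        columns: set[str],
    ) -> tuple[pd.DataFrame, np.ndarray]:
        # Always select the primary key so results can be mapped back to the
        # positions of the requested keys (no SQL quoting/escaping here):
        cols = ', '.join(sorted(columns | {self._pkey_column}))
        placeholders = ', '.join('?' for _ in range(len(pkey)))
        query = (f'SELECT {cols} FROM {table_name} '
                 f'WHERE {self._pkey_column} IN ({placeholders})')
        df = pd.read_sql_query(query, self._connection, params=pkey.tolist())

        # `batch[i]` is the position in `pkey` of the i-th returned row, the
        # same contract SQLSampler._sample_subgraph expects before it sorts
        # and validates the result:
        pos = pd.Series(np.arange(len(pkey)), index=pkey.to_numpy())
        batch = pos.loc[df[self._pkey_column].to_numpy()].to_numpy()
        return df, batch
```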
kumoai/experimental/rfm/base/sql_table.py (new file, +113 lines):

```diff
@@ -0,0 +1,113 @@
+from abc import abstractmethod
+from collections import defaultdict
+from collections.abc import Sequence
+from functools import cached_property
+from typing import Any
+
+from kumoapi.model_plan import MissingType
+
+from kumoai.experimental.rfm.base import (
+    ColumnExpressionType,
+    SourceForeignKey,
+    Table,
+)
+from kumoai.utils import quote_ident
+
+
+class SQLTable(Table):
+    r"""A :class:`SQLTable` specifies a :class:`Table` backed by a SQL
+    database.
+
+    Args:
+        name: The logical name of this table.
+        source_name: The physical name of this table in the database. If set to
+            ``None``, ``name`` is being used.
+        columns: The selected physical columns of this table.
+        column_expressions: The logical columns of this table.
+        primary_key: The name of the primary key of this table, if it exists.
+        time_column: The name of the time column of this table, if it exists.
+        end_time_column: The name of the end time column of this table, if it
+            exists.
+    """
+    def __init__(
+        self,
+        name: str,
+        source_name: str | None = None,
+        columns: Sequence[str] | None = None,
+        column_expressions: Sequence[ColumnExpressionType] | None = None,
+        primary_key: MissingType | str | None = MissingType.VALUE,
+        time_column: str | None = None,
+        end_time_column: str | None = None,
+    ) -> None:
+
+        self._connection: Any
+        self._source_name = source_name or name
+
+        super().__init__(
+            name=name,
+            columns=[],
+            primary_key=None,
+            time_column=None,
+            end_time_column=None,
+        )
+
+        if isinstance(primary_key, MissingType):
+            primary_key = self._source_primary_key
+
+        # Add column expressions with highest priority:
+        self._add_column_expressions(column_expressions or [])
+
+        if columns is None:
+            for column_name in self._source_column_dict.keys():
+                if column_name not in self:
+                    self.add_column(column_name)
+        else:
+            for column_name in columns:
+                self.add_column(column_name)
+
+        if primary_key is not None:
+            if primary_key not in self:
+                self.add_column(primary_key)
+            self.primary_key = primary_key
+
+        if time_column is not None:
+            if time_column not in self:
+                self.add_column(time_column)
+            self.time_column = time_column
+
+        if end_time_column is not None:
+            if end_time_column not in self:
+                self.add_column(end_time_column)
+            self.end_time_column = end_time_column
+
+    @property
+    def fqn(self) -> str:
+        r"""The fully-qualified quoted source table name."""
+        return quote_ident(self._source_name)
+
+    # Column ##################################################################
+
+    def _add_column_expressions(
+        self,
+        columns: Sequence[ColumnExpressionType],
+    ) -> None:
+        pass
+
+    # Abstract Methods ########################################################
+
+    @cached_property
+    def _source_foreign_key_dict(self) -> dict[str, SourceForeignKey]:
+        fkeys = self._get_source_foreign_keys()
+        # NOTE Drop all keys that link to multiple keys in the same table since
+        # we don't support composite keys yet:
+        table_pkeys: dict[str, set[str]] = defaultdict(set)
+        for fkey in fkeys:
+            table_pkeys[fkey.dst_table].add(fkey.primary_key)
+        return {
+            fkey.name: fkey
+            for fkey in fkeys if len(table_pkeys[fkey.dst_table]) == 1
+        }
+
+    @abstractmethod
+    def _get_source_foreign_keys(self) -> list[SourceForeignKey]:
+        pass
```
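The only non-trivial logic `SQLTable` implements itself is `_source_foreign_key_dict`, which drops any foreign key whose destination table is referenced through more than one primary key (composite keys are not supported yet). Below is a self-contained sketch of that filtering step, using a stand-in dataclass with the same three fields the code above relies on (`name`, `dst_table`, `primary_key`); the real `SourceForeignKey` class lives in `kumoai/experimental/rfm/base/source.py`.

```python
from collections import defaultdict
from dataclasses import dataclass


@dataclass
class FKey:  # stand-in for kumoai.experimental.rfm.base.SourceForeignKey
    name: str         # foreign-key column in this table
    dst_table: str    # referenced table
    primary_key: str  # referenced column in `dst_table`


def filter_foreign_keys(fkeys: list[FKey]) -> dict[str, FKey]:
    """Keep only keys whose destination table is referenced via one column."""
    table_pkeys: dict[str, set[str]] = defaultdict(set)
    for fkey in fkeys:
        table_pkeys[fkey.dst_table].add(fkey.primary_key)
    return {
        fkey.name: fkey
        for fkey in fkeys if len(table_pkeys[fkey.dst_table]) == 1
    }


fkeys = [
    FKey('user_id', 'users', 'id'),
    FKey('order_id', 'orders', 'id'),
    FKey('order_key', 'orders', 'key'),  # second key into 'orders'
]
print(sorted(filter_foreign_keys(fkeys)))  # -> ['user_id']
```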
kumoai/experimental/rfm/base/table.py (+136 -105 lines; some removed lines are truncated in the diff viewer and shown as-is):

```diff
@@ -1,17 +1,17 @@
 from abc import ABC, abstractmethod
-from collections import
+from collections.abc import Sequence
 from functools import cached_property
-from typing import Dict, List, Optional, Sequence, Set
 
 import pandas as pd
+from kumoapi.model_plan import MissingType
 from kumoapi.source_table import UnavailableSourceTable
 from kumoapi.table import Column as ColumnDefinition
 from kumoapi.table import TableDefinition
 from kumoapi.typing import Stype
 from typing_extensions import Self
 
-from kumoai import in_notebook
-from kumoai.experimental.rfm.base import Column,
+from kumoai import in_notebook, in_snowflake_notebook
+from kumoai.experimental.rfm.base import Column, DataBackend, SourceColumn
 from kumoai.experimental.rfm.infer import (
     contains_categorical,
     contains_id,
@@ -38,41 +38,29 @@ class Table(ABC):
     def __init__(
         self,
         name: str,
-        columns:
-        primary_key:
-        time_column:
-        end_time_column:
+        columns: Sequence[str] | None = None,
+        primary_key: MissingType | str | None = MissingType.VALUE,
+        time_column: str | None = None,
+        end_time_column: str | None = None,
     ) -> None:
 
         self._name = name
-        self._primary_key:
-        self._time_column:
-        self._end_time_column:
+        self._primary_key: str | None = None
+        self._time_column: str | None = None
+        self._end_time_column: str | None = None
+
+        if columns is None:
+            columns = list(self._source_column_dict.keys())
 
         if len(self._source_column_dict) == 0:
            raise ValueError(f"Table '{name}' does not hold any column with "
                             f"a supported data type")
 
-
-
-            if column.is_primary_key
-        ]
-        if len(primary_keys) == 1:  # NOTE No composite keys yet.
-            if primary_key is not None and primary_key != primary_keys[0]:
-                raise ValueError(f"Found duplicate primary key "
-                                 f"definition '{primary_key}' and "
-                                 f"'{primary_keys[0]}' in table '{name}'")
-            primary_key = primary_keys[0]
-
-        unique_keys = [
-            column.name for column in self._source_column_dict.values()
-            if column.is_unique_key
-        ]
-        if primary_key is None and len(unique_keys) == 1:
-            primary_key = unique_keys[0]
+        if isinstance(primary_key, MissingType):
+            primary_key = self._source_primary_key
 
-        self._columns:
-        for column_name in columns
+        self._columns: dict[str, Column] = {}
+        for column_name in columns:
            self.add_column(column_name)
 
        if primary_key is not None:
@@ -95,7 +83,7 @@ class Table(ABC):
         r"""The name of this table."""
         return self._name
 
-    #
+    # Column ##################################################################
 
     def has_column(self, name: str) -> bool:
         r"""Returns ``True`` if this table holds a column with name ``name``;
@@ -117,7 +105,7 @@ class Table(ABC):
         return self._columns[name]
 
     @property
-    def columns(self) ->
+    def columns(self) -> list[Column]:
         r"""Returns a list of :class:`Column` objects that represent the
         columns in this table.
         """
@@ -140,13 +128,7 @@ class Table(ABC):
            raise KeyError(f"Column '{name}' does not exist in the underlying "
                           f"source table")
 
-
-            dtype = self._source_column_dict[name].dtype
-        except Exception as e:
-            raise RuntimeError(f"Could not obtain data type for column "
-                               f"'{name}' in table '{self.name}'. Change "
-                               f"the data type of the column in the source "
-                               f"table or remove it from the table.") from e
+        dtype = self._source_column_dict[name].dtype
 
         try:
             ser = self._sample_df[name]
@@ -168,8 +150,8 @@ class Table(ABC):
 
         self._columns[name] = Column(
             name=name,
-            dtype=dtype,
             stype=stype,
+            dtype=dtype,
         )
 
         return self._columns[name]
@@ -205,7 +187,7 @@ class Table(ABC):
         return self._primary_key is not None
 
     @property
-    def primary_key(self) ->
+    def primary_key(self) -> Column | None:
         r"""The primary key column of this table.
 
         The getter returns the primary key column of this table, or ``None`` if
@@ -220,7 +202,7 @@ class Table(ABC):
         return self[self._primary_key]
 
     @primary_key.setter
-    def primary_key(self, name:
+    def primary_key(self, name: str | None) -> None:
         if name is not None and name == self._time_column:
             raise ValueError(f"Cannot specify column '{name}' as a primary "
                              f"key since it is already defined to be a time "
@@ -250,7 +232,7 @@ class Table(ABC):
         return self._time_column is not None
 
     @property
-    def time_column(self) ->
+    def time_column(self) -> Column | None:
         r"""The time column of this table.
 
         The getter returns the time column of this table, or ``None`` if no
@@ -265,7 +247,7 @@ class Table(ABC):
         return self[self._time_column]
 
     @time_column.setter
-    def time_column(self, name:
+    def time_column(self, name: str | None) -> None:
         if name is not None and name == self._primary_key:
             raise ValueError(f"Cannot specify column '{name}' as a time "
                              f"column since it is already defined to be a "
@@ -295,7 +277,7 @@ class Table(ABC):
         return self._end_time_column is not None
 
     @property
-    def end_time_column(self) ->
+    def end_time_column(self) -> Column | None:
         r"""The end time column of this table.
 
         The getter returns the end time column of this table, or ``None`` if no
@@ -311,7 +293,7 @@ class Table(ABC):
         return self[self._end_time_column]
 
     @end_time_column.setter
-    def end_time_column(self, name:
+    def end_time_column(self, name: str | None) -> None:
         if name is not None and name == self._primary_key:
             raise ValueError(f"Cannot specify column '{name}' as an end time "
                              f"column since it is already defined to be a "
@@ -384,7 +366,12 @@ class Table(ABC):
         if self._num_rows is not None:
             num_rows_repr = ' ({self._num_rows:,} rows)'
 
-        if
+        if in_snowflake_notebook():
+            import streamlit as st
+            md_repr = f"### 🏷️ Metadata of Table `{self.name}`{num_rows_repr}"
+            st.markdown(md_repr)
+            st.dataframe(self.metadata, hide_index=True)
+        elif in_notebook():
             from IPython.display import Markdown, display
             md_repr = f"### 🏷️ Metadata of Table `{self.name}`{num_rows_repr}"
             display(Markdown(md_repr))
@@ -400,8 +387,83 @@ class Table(ABC):
             print(f"🏷️ Metadata of Table '{self.name}'{num_rows_repr}")
             print(self.metadata.to_string(index=False))
 
+    def infer_primary_key(self, verbose: bool = True) -> Self:
+        r"""Infers the primary key in this table.
+
+        Args:
+            verbose: Whether to print verbose output.
+        """
+        if self.has_primary_key():
+            return self
+
+        def _set_primary_key(primary_key: str) -> None:
+            self.primary_key = primary_key
+            if verbose:
+                print(f"Detected primary key '{primary_key}' in table "
+                      f"'{self.name}'")
+
+        if primary_key := self._source_primary_key:
+            _set_primary_key(primary_key)
+            return self
+
+        unique_keys = [
+            column.name for column in self._source_column_dict.values()
+            if column.is_unique_key
+        ]
+        if len(unique_keys) == 1:  # NOTE No composite keys yet.
+            _set_primary_key(unique_keys[0])
+            return self
+
+        candidates = [
+            column.name for column in self.columns if column.stype == Stype.ID
+        ]
+        if len(candidates) == 0:
+            for column in self.columns:
+                if self.name.lower() == column.name.lower():
+                    candidates.append(column.name)
+                elif (self.name.lower().endswith('s')
+                      and self.name.lower()[:-1] == column.name.lower()):
+                    candidates.append(column.name)
+
+        if primary_key := infer_primary_key(
+                table_name=self.name,
+                df=self._sample_df,
+                candidates=candidates,
+        ):
+            _set_primary_key(primary_key)
+            return self
+
+        return self
+
+    def infer_time_column(self, verbose: bool = True) -> Self:
+        r"""Infers the time column in this table.
+
+        Args:
+            verbose: Whether to print verbose output.
+        """
+        if self.has_time_column():
+            return self
+
+        candidates = [
+            column.name for column in self.columns
+            if column.stype == Stype.timestamp
+            and column.name != self._end_time_column
+        ]
+
+        if time_column := infer_time_column(
+                df=self._sample_df,
+                candidates=candidates,
+        ):
+            self.time_column = time_column
+
+            if verbose:
+                print(f"Detected time column '{time_column}' in table "
+                      f"'{self.name}'")
+
+        return self
+
     def infer_metadata(self, verbose: bool = True) -> Self:
-        r"""Infers metadata, *i.e.*, primary keys and time columns, in
+        r"""Infers metadata, *i.e.*, primary keys and time columns, in this
         table.
 
         Args:
@@ -409,45 +471,15 @@ class Table(ABC):
         """
         logs = []
 
-        # Try to detect primary key if not set:
         if not self.has_primary_key():
+            self.infer_primary_key(verbose=False)
+            if self.has_primary_key():
+                logs.append(f"primary key '{self._primary_key}'")
 
-            def is_candidate(column: Column) -> bool:
-                if column.stype == Stype.ID:
-                    return True
-                if all(column.stype != Stype.ID for column in self.columns):
-                    if self.name == column.name:
-                        return True
-                    if (self.name.endswith('s')
-                            and self.name[:-1] == column.name):
-                        return True
-                return False
-
-            candidates = [
-                column.name for column in self.columns if is_candidate(column)
-            ]
-
-            if primary_key := infer_primary_key(
-                    table_name=self.name,
-                    df=self._sample_df,
-                    candidates=candidates,
-            ):
-                self.primary_key = primary_key
-                logs.append(f"primary key '{primary_key}'")
-
-        # Try to detect time column if not set:
         if not self.has_time_column():
-
-
-
-                and column.name != self._end_time_column
-            ]
-            if time_column := infer_time_column(
-                    df=self._sample_df,
-                    candidates=candidates,
-            ):
-                self.time_column = time_column
-                logs.append(f"time column '{time_column}'")
+            self.infer_time_column(verbose=False)
+            if self.has_time_column():
+                logs.append(f"time column '{self._time_column}'")
 
         if verbose and len(logs) > 0:
             print(f"Detected {' and '.join(logs)} in table '{self.name}'")
@@ -468,6 +500,17 @@ class Table(ABC):
             end_time_col=self._end_time_column,
         )
 
+    @property
+    def _source_primary_key(self) -> str | None:
+        primary_keys = [
+            column.name for column in self._source_column_dict.values()
+            if column.is_primary_key
+        ]
+        if len(primary_keys) == 1:  # NOTE No composite keys yet.
+            return primary_keys[0]
+
+        return None
+
     # Python builtins #########################################################
 
     def __hash__(self) -> int:
@@ -496,31 +539,19 @@ class Table(ABC):
                f' end_time_column={self._end_time_column},\n'
                f')')
 
-    # Abstract
-
-    @cached_property
-    def _source_column_dict(self) -> Dict[str, SourceColumn]:
-        return {col.name: col for col in self._get_source_columns()}
+    # Abstract Methods ########################################################
 
+    @property
     @abstractmethod
-    def
-
+    def backend(self) -> DataBackend:
+        r"""The data backend of this table."""
 
     @cached_property
-    def
-
-        # NOTE Drop all keys that link to different primary keys in the same
-        # table since we don't support composite keys yet:
-        table_pkeys: Dict[str, Set[str]] = defaultdict(set)
-        for fkey in fkeys:
-            table_pkeys[fkey.dst_table].add(fkey.primary_key)
-        return {
-            fkey.name: fkey
-            for fkey in fkeys if len(table_pkeys[fkey.dst_table]) == 1
-        }
+    def _source_column_dict(self) -> dict[str, SourceColumn]:
+        return {col.name: col for col in self._get_source_columns()}
 
     @abstractmethod
-    def
+    def _get_source_columns(self) -> list[SourceColumn]:
         pass
 
     @cached_property
@@ -532,9 +563,9 @@ class Table(ABC):
         pass
 
     @cached_property
-    def _num_rows(self) ->
+    def _num_rows(self) -> int | None:
         return self._get_num_rows()
 
     @abstractmethod
-    def _get_num_rows(self) ->
+    def _get_num_rows(self) -> int | None:
         pass
```