kumoai 2.14.0.dev202512151351__cp313-cp313-macosx_11_0_arm64.whl → 2.14.0.dev202512211732__cp313-cp313-macosx_11_0_arm64.whl

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. kumoai/_version.py +1 -1
  2. kumoai/experimental/rfm/__init__.py +33 -8
  3. kumoai/experimental/rfm/authenticate.py +3 -4
  4. kumoai/experimental/rfm/backend/local/graph_store.py +25 -25
  5. kumoai/experimental/rfm/backend/local/table.py +16 -21
  6. kumoai/experimental/rfm/backend/snow/sampler.py +22 -34
  7. kumoai/experimental/rfm/backend/snow/table.py +67 -33
  8. kumoai/experimental/rfm/backend/sqlite/__init__.py +2 -2
  9. kumoai/experimental/rfm/backend/sqlite/sampler.py +21 -26
  10. kumoai/experimental/rfm/backend/sqlite/table.py +54 -26
  11. kumoai/experimental/rfm/base/__init__.py +8 -0
  12. kumoai/experimental/rfm/base/column.py +14 -12
  13. kumoai/experimental/rfm/base/column_expression.py +50 -0
  14. kumoai/experimental/rfm/base/sql_sampler.py +31 -3
  15. kumoai/experimental/rfm/base/sql_table.py +229 -0
  16. kumoai/experimental/rfm/base/table.py +162 -143
  17. kumoai/experimental/rfm/graph.py +242 -95
  18. kumoai/experimental/rfm/infer/__init__.py +6 -4
  19. kumoai/experimental/rfm/infer/dtype.py +3 -3
  20. kumoai/experimental/rfm/infer/pkey.py +4 -2
  21. kumoai/experimental/rfm/infer/stype.py +35 -0
  22. kumoai/experimental/rfm/infer/time_col.py +1 -2
  23. kumoai/experimental/rfm/pquery/executor.py +27 -27
  24. kumoai/experimental/rfm/pquery/pandas_executor.py +29 -31
  25. kumoai/experimental/rfm/rfm.py +86 -80
  26. kumoai/experimental/rfm/sagemaker.py +4 -4
  27. kumoai/utils/__init__.py +1 -2
  28. kumoai/utils/progress_logger.py +178 -12
  29. {kumoai-2.14.0.dev202512151351.dist-info → kumoai-2.14.0.dev202512211732.dist-info}/METADATA +2 -1
  30. {kumoai-2.14.0.dev202512151351.dist-info → kumoai-2.14.0.dev202512211732.dist-info}/RECORD +33 -30
  31. {kumoai-2.14.0.dev202512151351.dist-info → kumoai-2.14.0.dev202512211732.dist-info}/WHEEL +0 -0
  32. {kumoai-2.14.0.dev202512151351.dist-info → kumoai-2.14.0.dev202512211732.dist-info}/licenses/LICENSE +0 -0
  33. {kumoai-2.14.0.dev202512151351.dist-info → kumoai-2.14.0.dev202512211732.dist-info}/top_level.txt +0 -0
@@ -8,10 +8,9 @@ import pyarrow as pa
8
8
  from kumoapi.pquery import ValidatedPredictiveQuery
9
9
  from kumoapi.typing import Stype
10
10
 
11
- from kumoai.experimental.rfm.backend.sqlite import SQLiteTable
12
11
  from kumoai.experimental.rfm.base import SQLSampler
13
12
  from kumoai.experimental.rfm.pquery import PQueryPandasExecutor
14
- from kumoai.utils import InteractiveProgressLogger, ProgressLogger, quote_ident
13
+ from kumoai.utils import ProgressLogger, quote_ident
15
14
 
16
15
  if TYPE_CHECKING:
17
16
  from kumoai.experimental.rfm import Graph
@@ -26,10 +25,6 @@ class SQLiteSampler(SQLSampler):
26
25
  ) -> None:
27
26
  super().__init__(graph=graph, verbose=verbose)
28
27
 
29
- for table in graph.tables.values():
30
- assert isinstance(table, SQLiteTable)
31
- self._connection = table._connection
32
-
33
28
  if optimize:
34
29
  with self._connection.cursor() as cursor:
35
30
  cursor.execute("PRAGMA temp_store = MEMORY")
@@ -54,7 +49,7 @@ class SQLiteSampler(SQLSampler):
54
49
  with self._connection.cursor() as cursor:
55
50
  for table_name in list(index_dict.keys()):
56
51
  indices = index_dict[table_name]
57
- sql = f"PRAGMA index_list({quote_ident(table_name)})"
52
+ sql = f"PRAGMA index_list({self.fqn_dict[table_name]})"
58
53
  cursor.execute(sql)
59
54
  for _, index_name, *_ in cursor.fetchall():
60
55
  sql = f"PRAGMA index_info({quote_ident(index_name)})"
@@ -72,22 +67,22 @@ class SQLiteSampler(SQLSampler):
72
67
 
73
68
  if optimize and len(index_dict) > 0:
74
69
  if not isinstance(verbose, ProgressLogger):
75
- verbose = InteractiveProgressLogger(
76
- "Optimizing SQLite database",
70
+ verbose = ProgressLogger.default(
71
+ msg="Optimizing SQLite database",
77
72
  verbose=verbose,
78
73
  )
79
74
 
80
- with verbose as logger:
81
- with self._connection.cursor() as cursor:
82
- for table_name, indices in index_dict.items():
83
- for index in indices:
84
- name = f"kumo_index_{table_name}_{'_'.join(index)}"
85
- columns = ', '.join(quote_ident(v) for v in index)
86
- columns += ' DESC' if len(index) > 1 else ''
87
- sql = (f"CREATE INDEX IF NOT EXISTS {name}\n"
88
- f"ON {quote_ident(table_name)}({columns})")
89
- cursor.execute(sql)
90
- self._connection.commit()
75
+ with verbose as logger, self._connection.cursor() as cursor:
76
+ for table_name, indices in index_dict.items():
77
+ for index in indices:
78
+ name = f"kumo_index_{table_name}_{'_'.join(index)}"
79
+ name = quote_ident(name)
80
+ columns = ', '.join(quote_ident(v) for v in index)
81
+ columns += ' DESC' if len(index) > 1 else ''
82
+ sql = (f"CREATE INDEX IF NOT EXISTS {name}\n"
83
+ f"ON {self.fqn_dict[table_name]}({columns})")
84
+ cursor.execute(sql)
85
+ self._connection.commit()
91
86
  logger.log(f"Created {index_repr} in {table_repr}")
92
87
 
93
88
  elif len(index_dict) > 0:
@@ -108,7 +103,7 @@ class SQLiteSampler(SQLSampler):
108
103
  f" ? as table_name,\n"
109
104
  f" MIN({quote_ident(time_column)}) as min_date,\n"
110
105
  f" MAX({quote_ident(time_column)}) as max_date\n"
111
- f"FROM {quote_ident(table_name)}")
106
+ f"FROM {self.fqn_dict[table_name]}")
112
107
  selects.append(select)
113
108
  sql = "\nUNION ALL\n".join(selects)
114
109
 
@@ -142,7 +137,7 @@ class SQLiteSampler(SQLSampler):
142
137
 
143
138
  # TODO Make this query more efficient - it does full table scan.
144
139
  sql = (f"SELECT {', '.join(quote_ident(col) for col in columns)}\n"
145
- f"FROM {quote_ident(table_name)}")
140
+ f"FROM {self.fqn_dict[table_name]}")
146
141
  if len(filters) > 0:
147
142
  sql += f"\nWHERE{' AND'.join(filters)}"
148
143
  sql += f"\nORDER BY RANDOM() LIMIT {num_rows}"
@@ -207,15 +202,15 @@ class SQLiteSampler(SQLSampler):
207
202
  sql = (f"SELECT tmp.rowid - 1 as __batch__, "
208
203
  f"{', '.join('ent.' + quote_ident(c) for c in columns)}\n"
209
204
  f"FROM {quote_ident(tmp_name)} tmp\n"
210
- f"JOIN {quote_ident(table_name)} ent\n"
205
+ f"JOIN {self.fqn_dict[table_name]} ent\n"
211
206
  f" ON ent.{quote_ident(pkey_name)} = tmp.id")
212
207
  else:
213
208
  sql = (f"SELECT tmp.rowid - 1 as __batch__, "
214
209
  f"{', '.join('ent.' + quote_ident(c) for c in columns)}\n"
215
210
  f"FROM {quote_ident(tmp_name)} tmp\n"
216
- f"JOIN {quote_ident(table_name)} ent\n"
211
+ f"JOIN {self.fqn_dict[table_name]} ent\n"
217
212
  f" ON ent.rowid = (\n"
218
- f" SELECT rowid FROM {quote_ident(table_name)}\n"
213
+ f" SELECT rowid FROM {self.fqn_dict[table_name]}\n"
219
214
  f" WHERE {quote_ident(pkey_name)} == tmp.id\n"
220
215
  f" LIMIT 1\n"
221
216
  f")")
@@ -258,7 +253,7 @@ class SQLiteSampler(SQLSampler):
258
253
  sql = (f"SELECT tmp.rowid - 1 as __batch__, "
259
254
  f"{', '.join('fact.' + quote_ident(col) for col in columns)}\n"
260
255
  f"FROM {quote_ident(tmp_name)} tmp\n"
261
- f"JOIN {quote_ident(table_name)} fact\n"
256
+ f"JOIN {self.fqn_dict[table_name]} fact\n"
262
257
  f" ON fact.{quote_ident(fkey)} = tmp.id\n"
263
258
  f" AND fact.{quote_ident(time_column)} <= tmp.end")
264
259
  if min_offset is not None:
@@ -1,28 +1,35 @@
1
1
  import re
2
2
  import warnings
3
- from typing import List, Optional, Sequence, cast
3
+ from collections.abc import Sequence
4
+ from typing import cast
4
5
 
5
6
  import pandas as pd
7
+ from kumoapi.model_plan import MissingType
6
8
  from kumoapi.typing import Dtype
7
9
 
8
10
  from kumoai.experimental.rfm.backend.sqlite import Connection
9
11
  from kumoai.experimental.rfm.base import (
12
+ ColumnExpressionSpec,
13
+ ColumnExpressionType,
10
14
  DataBackend,
11
15
  SourceColumn,
12
16
  SourceForeignKey,
13
- Table,
17
+ SQLTable,
14
18
  )
15
19
  from kumoai.experimental.rfm.infer import infer_dtype
16
20
  from kumoai.utils import quote_ident
17
21
 
18
22
 
19
- class SQLiteTable(Table):
23
+ class SQLiteTable(SQLTable):
20
24
  r"""A table backed by a :class:`sqlite` database.
21
25
 
22
26
  Args:
23
27
  connection: The connection to a :class:`sqlite` database.
24
- name: The name of this table.
25
- columns: The selected columns of this table.
28
+ name: The logical name of this table.
29
+ source_name: The physical name of this table in the database. If set to
30
+ ``None``, ``name`` is being used.
31
+ columns: The selected physical columns of this table.
32
+ column_expressions: The logical columns of this table.
26
33
  primary_key: The name of the primary key of this table, if it exists.
27
34
  time_column: The name of the time column of this table, if it exists.
28
35
  end_time_column: The name of the end time column of this table, if it
@@ -32,17 +39,21 @@ class SQLiteTable(Table):
32
39
  self,
33
40
  connection: Connection,
34
41
  name: str,
35
- columns: Optional[Sequence[str]] = None,
36
- primary_key: Optional[str] = None,
37
- time_column: Optional[str] = None,
38
- end_time_column: Optional[str] = None,
42
+ source_name: str | None = None,
43
+ columns: Sequence[str] | None = None,
44
+ column_expressions: Sequence[ColumnExpressionType] | None = None,
45
+ primary_key: MissingType | str | None = MissingType.VALUE,
46
+ time_column: str | None = None,
47
+ end_time_column: str | None = None,
39
48
  ) -> None:
40
49
 
41
50
  self._connection = connection
42
51
 
43
52
  super().__init__(
44
53
  name=name,
54
+ source_name=source_name,
45
55
  columns=columns,
56
+ column_expressions=column_expressions,
46
57
  primary_key=primary_key,
47
58
  time_column=time_column,
48
59
  end_time_column=end_time_column,
@@ -52,18 +63,19 @@ class SQLiteTable(Table):
52
63
  def backend(self) -> DataBackend:
53
64
  return cast(DataBackend, DataBackend.SQLITE)
54
65
 
55
- def _get_source_columns(self) -> List[SourceColumn]:
56
- source_columns: List[SourceColumn] = []
66
+ def _get_source_columns(self) -> list[SourceColumn]:
67
+ source_columns: list[SourceColumn] = []
57
68
  with self._connection.cursor() as cursor:
58
- sql = f"PRAGMA table_info({quote_ident(self.name)})"
69
+ sql = f"PRAGMA table_info({self.fqn})"
59
70
  cursor.execute(sql)
60
71
  columns = cursor.fetchall()
61
72
 
62
73
  if len(columns) == 0:
63
- raise ValueError(f"Table '{self.name}' does not exist")
74
+ raise ValueError(f"Table '{self._source_name}' does not exist "
75
+ f"in the SQLite database")
64
76
 
65
77
  unique_keys: set[str] = set()
66
- sql = f"PRAGMA index_list({quote_ident(self.name)})"
78
+ sql = f"PRAGMA index_list({self.fqn})"
67
79
  cursor.execute(sql)
68
80
  for _, index_name, is_unique, *_ in cursor.fetchall():
69
81
  if bool(is_unique):
@@ -83,15 +95,17 @@ class SQLiteTable(Table):
83
95
  elif re.search('REAL|FLOA|DOUB', type):
84
96
  dtype = Dtype.float
85
97
  else: # NUMERIC affinity.
86
- ser = self._sample_df[column]
98
+ ser = self._source_sample_df[column]
87
99
  try:
88
100
  dtype = infer_dtype(ser)
89
101
  except Exception:
90
- warnings.warn(
91
- f"Data type inference for column '{column}' in "
92
- f"table '{self.name}' failed. Consider changing "
93
- f"the data type of the column to use it within "
94
- f"this table.")
102
+ warnings.warn(f"Encountered unsupported data type "
103
+ f"'{ser.dtype}' with source data type "
104
+ f"'{type}' for column '{column}' in "
105
+ f"table '{self.name}'. If possible, "
106
+ f"change the data type of the column in "
107
+ f"your SQLite database to use it within "
108
+ f"this table.")
95
109
  continue
96
110
 
97
111
  source_column = SourceColumn(
@@ -105,22 +119,36 @@ class SQLiteTable(Table):
105
119
 
106
120
  return source_columns
107
121
 
108
- def _get_source_foreign_keys(self) -> List[SourceForeignKey]:
109
- source_fkeys: List[SourceForeignKey] = []
122
+ def _get_source_foreign_keys(self) -> list[SourceForeignKey]:
123
+ source_fkeys: list[SourceForeignKey] = []
110
124
  with self._connection.cursor() as cursor:
111
- sql = f"PRAGMA foreign_key_list({quote_ident(self.name)})"
125
+ sql = f"PRAGMA foreign_key_list({self.fqn})"
112
126
  cursor.execute(sql)
113
127
  for _, _, dst_table, fkey, pkey, *_ in cursor.fetchall():
114
128
  source_fkeys.append(SourceForeignKey(fkey, dst_table, pkey))
115
129
  return source_fkeys
116
130
 
117
- def _get_sample_df(self) -> pd.DataFrame:
131
+ def _get_source_sample_df(self) -> pd.DataFrame:
118
132
  with self._connection.cursor() as cursor:
119
- sql = (f"SELECT * FROM {quote_ident(self.name)} "
133
+ sql = (f"SELECT * FROM {self.fqn} "
120
134
  f"ORDER BY rowid LIMIT 1000")
121
135
  cursor.execute(sql)
122
136
  table = cursor.fetch_arrow_table()
123
137
  return table.to_pandas(types_mapper=pd.ArrowDtype)
124
138
 
125
- def _get_num_rows(self) -> Optional[int]:
139
+ def _get_num_rows(self) -> int | None:
126
140
  return None
141
+
142
+ def _get_expression_sample_df(
143
+ self,
144
+ specs: Sequence[ColumnExpressionSpec],
145
+ ) -> pd.DataFrame:
146
+ with self._connection.cursor() as cursor:
147
+ columns = [
148
+ f"{spec.expr} AS {quote_ident(spec.name)}" for spec in specs
149
+ ]
150
+ sql = (f"SELECT {', '.join(columns)} FROM {self.fqn} "
151
+ f"ORDER BY rowid LIMIT 1000")
152
+ cursor.execute(sql)
153
+ table = cursor.fetch_arrow_table()
154
+ return table.to_pandas(types_mapper=pd.ArrowDtype)
@@ -9,7 +9,11 @@ class DataBackend(StrEnum):
9
9
 
10
10
  from .source import SourceColumn, SourceForeignKey # noqa: E402
11
11
  from .column import Column # noqa: E402
12
+ from .column_expression import ColumnExpressionSpec # noqa: E402
13
+ from .column_expression import ColumnExpressionType # noqa: E402
14
+ from .column_expression import ColumnExpression # noqa: E402
12
15
  from .table import Table # noqa: E402
16
+ from .sql_table import SQLTable # noqa: E402
13
17
  from .sampler import SamplerOutput, Sampler # noqa: E402
14
18
  from .sql_sampler import SQLSampler # noqa: E402
15
19
 
@@ -18,7 +22,11 @@ __all__ = [
18
22
  'SourceColumn',
19
23
  'SourceForeignKey',
20
24
  'Column',
25
+ 'ColumnExpressionSpec',
26
+ 'ColumnExpressionType',
27
+ 'ColumnExpression',
21
28
  'Table',
29
+ 'SQLTable',
22
30
  'SamplerOutput',
23
31
  'Sampler',
24
32
  'SQLSampler',
@@ -8,20 +8,14 @@ from kumoapi.typing import Dtype, Stype
8
8
  class Column:
9
9
  stype: Stype
10
10
 
11
- def __init__(
12
- self,
13
- name: str,
14
- dtype: Dtype,
15
- stype: Stype,
16
- is_primary_key: bool = False,
17
- is_time_column: bool = False,
18
- is_end_time_column: bool = False,
19
- ) -> None:
11
+ def __init__(self, name: str, stype: Stype, dtype: Dtype) -> None:
20
12
  self._name = name
21
13
  self._dtype = Dtype(dtype)
22
- self._is_primary_key = is_primary_key
23
- self._is_time_column = is_time_column
24
- self._is_end_time_column = is_end_time_column
14
+
15
+ self._is_primary_key = False
16
+ self._is_time_column = False
17
+ self._is_end_time_column = False
18
+
25
19
  self.stype = Stype(stype)
26
20
 
27
21
  @property
@@ -32,6 +26,14 @@ class Column:
32
26
  def dtype(self) -> Dtype:
33
27
  return self._dtype
34
28
 
29
+ @property
30
+ def is_physical(self) -> bool:
31
+ return True
32
+
33
+ @property
34
+ def is_logical(self) -> bool:
35
+ return not self.is_physical
36
+
35
37
  def __setattr__(self, key: str, val: Any) -> None:
36
38
  if key == 'stype':
37
39
  if isinstance(val, str):
@@ -0,0 +1,50 @@
1
+ from dataclasses import dataclass
2
+ from typing import Any, TypeAlias
3
+
4
+ from kumoapi.typing import Dtype, Stype
5
+
6
+ from kumoai.experimental.rfm.base import Column
7
+ from kumoai.mixin import CastMixin
8
+
9
+
10
+ @dataclass(frozen=True)
11
+ class ColumnExpressionSpec(CastMixin):
12
+ name: str
13
+ expr: str
14
+ dtype: Dtype | None = None
15
+
16
+
17
+ ColumnExpressionType: TypeAlias = ColumnExpressionSpec | dict[str, Any]
18
+
19
+
20
+ @dataclass(init=False, repr=False, eq=False)
21
+ class ColumnExpression(Column):
22
+ def __init__(
23
+ self,
24
+ name: str,
25
+ expr: str,
26
+ stype: Stype,
27
+ dtype: Dtype,
28
+ ) -> None:
29
+ super().__init__(name=name, stype=stype, dtype=dtype)
30
+ self._expr = expr
31
+
32
+ @property
33
+ def expr(self) -> str:
34
+ return self._expr
35
+
36
+ @property
37
+ def is_physical(self) -> bool:
38
+ return False
39
+
40
+ def __hash__(self) -> int:
41
+ return hash((self.name, self.expr, self.stype, self.dtype))
42
+
43
+ def __eq__(self, other: Any) -> bool:
44
+ if not isinstance(other, ColumnExpression):
45
+ return False
46
+ return hash(self) == hash(other)
47
+
48
+ def __repr__(self) -> str:
49
+ return (f'{self.__class__.__name__}(name={self.name}, '
50
+ f'expr={self.expr}, stype={self.stype}, dtype={self.dtype})')
@@ -1,13 +1,37 @@
1
1
  from abc import abstractmethod
2
- from typing import Literal
2
+ from typing import TYPE_CHECKING, Literal
3
3
 
4
4
  import numpy as np
5
5
  import pandas as pd
6
6
 
7
- from kumoai.experimental.rfm.base import Sampler, SamplerOutput
7
+ from kumoai.experimental.rfm.base import Sampler, SamplerOutput, SQLTable
8
+ from kumoai.utils import ProgressLogger
9
+
10
+ if TYPE_CHECKING:
11
+ from kumoai.experimental.rfm import Graph
8
12
 
9
13
 
10
14
  class SQLSampler(Sampler):
15
+ def __init__(
16
+ self,
17
+ graph: 'Graph',
18
+ verbose: bool | ProgressLogger = True,
19
+ ) -> None:
20
+ super().__init__(graph=graph, verbose=verbose)
21
+
22
+ self._fqn_dict: dict[str, str] = {}
23
+ for table in graph.tables.values():
24
+ assert isinstance(table, SQLTable)
25
+ self._connection = table._connection
26
+ self._fqn_dict[table.name] = table.fqn
27
+
28
+ @property
29
+ def fqn_dict(self) -> dict[str, str]:
30
+ r"""The fully-qualified quoted source name for all table names in the
31
+ graph.
32
+ """
33
+ return self._fqn_dict
34
+
11
35
  def _sample_subgraph(
12
36
  self,
13
37
  entity_table_name: str,
@@ -23,7 +47,11 @@ class SQLSampler(Sampler):
23
47
  columns=columns_dict[entity_table_name],
24
48
  )
25
49
  if len(batch) != len(entity_pkey):
26
- raise KeyError("Invalid primary keys") # TODO
50
+ mask = np.ones(len(entity_pkey), dtype=bool)
51
+ mask[batch] = False
52
+ raise KeyError(f"The primary keys "
53
+ f"{entity_pkey.iloc[mask].tolist()} do not exist "
54
+ f"in the '{entity_table_name}' table")
27
55
 
28
56
  perm = batch.argsort()
29
57
  batch = batch[perm]
@@ -0,0 +1,229 @@
1
+ import warnings
2
+ from abc import abstractmethod
3
+ from collections import defaultdict
4
+ from collections.abc import Sequence
5
+ from functools import cached_property
6
+ from typing import Any
7
+
8
+ import pandas as pd
9
+ from kumoapi.model_plan import MissingType
10
+
11
+ from kumoai.experimental.rfm.base import (
12
+ ColumnExpression,
13
+ ColumnExpressionSpec,
14
+ ColumnExpressionType,
15
+ SourceForeignKey,
16
+ Table,
17
+ )
18
+ from kumoai.experimental.rfm.infer import infer_dtype, infer_stype
19
+ from kumoai.utils import quote_ident
20
+
21
+
22
+ class SQLTable(Table):
23
+ r"""A :class:`SQLTable` specifies a :class:`Table` backed by a SQL
24
+ database.
25
+
26
+ Args:
27
+ name: The logical name of this table.
28
+ source_name: The physical name of this table in the database. If set to
29
+ ``None``, ``name`` is being used.
30
+ columns: The selected physical columns of this table.
31
+ column_expressions: The logical columns of this table.
32
+ primary_key: The name of the primary key of this table, if it exists.
33
+ time_column: The name of the time column of this table, if it exists.
34
+ end_time_column: The name of the end time column of this table, if it
35
+ exists.
36
+ """
37
+ def __init__(
38
+ self,
39
+ name: str,
40
+ source_name: str | None = None,
41
+ columns: Sequence[str] | None = None,
42
+ column_expressions: Sequence[ColumnExpressionType] | None = None,
43
+ primary_key: MissingType | str | None = MissingType.VALUE,
44
+ time_column: str | None = None,
45
+ end_time_column: str | None = None,
46
+ ) -> None:
47
+
48
+ self._connection: Any
49
+ self._source_name = source_name or name
50
+ self._expression_sample_df = pd.DataFrame()
51
+
52
+ super().__init__(
53
+ name=name,
54
+ columns=[],
55
+ primary_key=None,
56
+ time_column=None,
57
+ end_time_column=None,
58
+ )
59
+
60
+ # Add column expressions with highest priority:
61
+ self.add_column_expressions(column_expressions or [])
62
+
63
+ if columns is None:
64
+ for column_name in self._source_column_dict.keys():
65
+ if column_name not in self:
66
+ self.add_column(column_name)
67
+ else:
68
+ for column_name in columns:
69
+ self.add_column(column_name)
70
+
71
+ if isinstance(primary_key, MissingType):
72
+ # Inference from source column metadata:
73
+ if '_source_column_dict' in self.__dict__:
74
+ primary_key = self._source_primary_key
75
+ if (primary_key is not None and primary_key in self
76
+ and self[primary_key].is_physical):
77
+ self.primary_key = primary_key
78
+ elif primary_key is not None:
79
+ if primary_key not in self:
80
+ self.add_column(primary_key)
81
+ self.primary_key = primary_key
82
+
83
+ if time_column is not None:
84
+ if time_column not in self:
85
+ self.add_column(time_column)
86
+ self.time_column = time_column
87
+
88
+ if end_time_column is not None:
89
+ if end_time_column not in self:
90
+ self.add_column(end_time_column)
91
+ self.end_time_column = end_time_column
92
+
93
+ @property
94
+ def fqn(self) -> str:
95
+ r"""The fully-qualified quoted source table name."""
96
+ return quote_ident(self._source_name)
97
+
98
+ @cached_property
99
+ def _source_foreign_key_dict(self) -> dict[str, SourceForeignKey]:
100
+ fkeys = self._get_source_foreign_keys()
101
+ # NOTE Drop all keys that link to multiple keys in the same table since
102
+ # we don't support composite keys yet:
103
+ table_pkeys: dict[str, set[str]] = defaultdict(set)
104
+ for fkey in fkeys:
105
+ table_pkeys[fkey.dst_table].add(fkey.primary_key)
106
+ return {
107
+ fkey.name: fkey
108
+ for fkey in fkeys if len(table_pkeys[fkey.dst_table]) == 1
109
+ }
110
+
111
+ def _sample_current_df(self, columns: Sequence[str]) -> pd.DataFrame:
112
+ expr_columns: list[str] = []
113
+ source_columns: list[str] = []
114
+ for column_name in columns:
115
+ column = self[column_name]
116
+ if isinstance(column, ColumnExpression):
117
+ expr_columns.append(column_name)
118
+ else:
119
+ source_columns.append(column_name)
120
+
121
+ dfs: list[pd.DataFrame] = []
122
+ if len(expr_columns) > 0:
123
+ dfs.append(self._expression_sample_df[expr_columns])
124
+ if len(source_columns) > 0:
125
+ dfs.append(self._source_sample_df[source_columns])
126
+
127
+ if len(dfs) == 0:
128
+ return pd.DataFrame(index=range(1000))
129
+ if len(dfs) == 1:
130
+ return dfs[0]
131
+ return pd.concat(dfs, axis=1, ignore_index=True)
132
+
133
+ # Column ##################################################################
134
+
135
+ def add_column_expressions(
136
+ self,
137
+ columns: Sequence[ColumnExpressionType],
138
+ ) -> None:
139
+ r"""Adds a set of column expressions to this table.
140
+
141
+ Args:
142
+ columns: The set of column expressions.
143
+
144
+ Raises:
145
+ KeyError: If a column with the same name already exists in the
146
+ table.
147
+ """
148
+ if len(columns) == 0:
149
+ return
150
+
151
+ column_expression_specs = [
152
+ spec for column in columns
153
+ if (spec := ColumnExpressionSpec._cast(column))
154
+ ]
155
+ df = self._get_expression_sample_df(column_expression_specs)
156
+
157
+ for spec in column_expression_specs:
158
+ if spec.name in self:
159
+ raise KeyError(f"Column '{spec.name}' already exists in table "
160
+ f"'{self.name}'")
161
+
162
+ dtype = spec.dtype
163
+ if dtype is None:
164
+ ser = df[spec.name]
165
+ try:
166
+ dtype = infer_dtype(ser)
167
+ except Exception:
168
+ warnings.warn(f"Encountered unsupported data type "
169
+ f"'{ser.dtype}' for column expression "
170
+ f"'{spec.name}' in table '{self.name}'."
171
+ f"Please manually specify the data type for "
172
+ f"this column expression to use it within "
173
+ f"this table, or remove it to suppress "
174
+ f"this warning.")
175
+ continue
176
+
177
+ ser = df[spec.name]
178
+ try:
179
+ stype = infer_stype(ser, spec.name, dtype)
180
+ except Exception as e:
181
+ raise RuntimeError(f"Could not obtain semantic type for "
182
+ f"column expression '{spec.name}' with "
183
+ f"data type '{dtype}' in table "
184
+ f"'{self.name}'. Change the data type of "
185
+ f"the column expression or remove it from "
186
+ f"this table.") from e
187
+
188
+ self._columns[spec.name] = ColumnExpression(
189
+ name=spec.name,
190
+ expr=spec.expr,
191
+ stype=stype,
192
+ dtype=dtype,
193
+ )
194
+ with warnings.catch_warnings():
195
+ warnings.simplefilter('ignore', pd.errors.PerformanceWarning)
196
+ self._expression_sample_df[spec.name] = ser
197
+
198
+ def add_column_expression(
199
+ self,
200
+ column: ColumnExpressionType,
201
+ ) -> ColumnExpression:
202
+ r"""Adds a column expression to this table.
203
+
204
+ Args:
205
+ column: The column expression.
206
+
207
+ Raises:
208
+ KeyError: If a column with the same name already exists in the
209
+ table.
210
+ """
211
+ spec = ColumnExpressionSpec._cast(column)
212
+ assert spec is not None
213
+ self.add_column_expressions([spec])
214
+ column_expression = self.column(spec.name)
215
+ assert isinstance(column_expression, ColumnExpression)
216
+ return column_expression
217
+
218
+ # Abstract Methods ########################################################
219
+
220
+ @abstractmethod
221
+ def _get_source_foreign_keys(self) -> list[SourceForeignKey]:
222
+ pass
223
+
224
+ @abstractmethod
225
+ def _get_expression_sample_df(
226
+ self,
227
+ specs: Sequence[ColumnExpressionSpec],
228
+ ) -> pd.DataFrame:
229
+ pass