kumoai 2.13.0.dev202512041731__cp310-cp310-win_amd64.whl → 2.15.0.dev202601141731__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. kumoai/__init__.py +23 -26
  2. kumoai/_version.py +1 -1
  3. kumoai/client/client.py +6 -0
  4. kumoai/client/jobs.py +26 -0
  5. kumoai/client/pquery.py +6 -2
  6. kumoai/connector/utils.py +21 -7
  7. kumoai/experimental/rfm/__init__.py +51 -24
  8. kumoai/experimental/rfm/authenticate.py +3 -4
  9. kumoai/experimental/rfm/backend/local/__init__.py +4 -0
  10. kumoai/experimental/rfm/{local_graph_store.py → backend/local/graph_store.py} +62 -110
  11. kumoai/experimental/rfm/backend/local/sampler.py +312 -0
  12. kumoai/experimental/rfm/backend/local/table.py +35 -31
  13. kumoai/experimental/rfm/backend/snow/__init__.py +2 -0
  14. kumoai/experimental/rfm/backend/snow/sampler.py +407 -0
  15. kumoai/experimental/rfm/backend/snow/table.py +178 -50
  16. kumoai/experimental/rfm/backend/sqlite/__init__.py +4 -2
  17. kumoai/experimental/rfm/backend/sqlite/sampler.py +456 -0
  18. kumoai/experimental/rfm/backend/sqlite/table.py +131 -48
  19. kumoai/experimental/rfm/base/__init__.py +22 -4
  20. kumoai/experimental/rfm/base/column.py +96 -10
  21. kumoai/experimental/rfm/base/expression.py +44 -0
  22. kumoai/experimental/rfm/base/mapper.py +69 -0
  23. kumoai/experimental/rfm/base/sampler.py +696 -47
  24. kumoai/experimental/rfm/base/source.py +2 -1
  25. kumoai/experimental/rfm/base/sql_sampler.py +385 -0
  26. kumoai/experimental/rfm/base/table.py +384 -207
  27. kumoai/experimental/rfm/base/utils.py +36 -0
  28. kumoai/experimental/rfm/graph.py +359 -187
  29. kumoai/experimental/rfm/infer/__init__.py +6 -4
  30. kumoai/experimental/rfm/infer/dtype.py +10 -5
  31. kumoai/experimental/rfm/infer/multicategorical.py +1 -1
  32. kumoai/experimental/rfm/infer/pkey.py +4 -2
  33. kumoai/experimental/rfm/infer/stype.py +35 -0
  34. kumoai/experimental/rfm/infer/time_col.py +5 -4
  35. kumoai/experimental/rfm/pquery/executor.py +27 -27
  36. kumoai/experimental/rfm/pquery/pandas_executor.py +30 -32
  37. kumoai/experimental/rfm/relbench.py +76 -0
  38. kumoai/experimental/rfm/rfm.py +770 -467
  39. kumoai/experimental/rfm/sagemaker.py +4 -4
  40. kumoai/experimental/rfm/task_table.py +292 -0
  41. kumoai/kumolib.cp310-win_amd64.pyd +0 -0
  42. kumoai/pquery/predictive_query.py +10 -6
  43. kumoai/pquery/training_table.py +16 -2
  44. kumoai/testing/snow.py +50 -0
  45. kumoai/trainer/distilled_trainer.py +175 -0
  46. kumoai/utils/__init__.py +3 -2
  47. kumoai/utils/display.py +87 -0
  48. kumoai/utils/progress_logger.py +192 -13
  49. kumoai/utils/sql.py +3 -0
  50. {kumoai-2.13.0.dev202512041731.dist-info → kumoai-2.15.0.dev202601141731.dist-info}/METADATA +3 -2
  51. {kumoai-2.13.0.dev202512041731.dist-info → kumoai-2.15.0.dev202601141731.dist-info}/RECORD +54 -42
  52. kumoai/experimental/rfm/local_graph_sampler.py +0 -223
  53. kumoai/experimental/rfm/local_pquery_driver.py +0 -689
  54. {kumoai-2.13.0.dev202512041731.dist-info → kumoai-2.15.0.dev202601141731.dist-info}/WHEEL +0 -0
  55. {kumoai-2.13.0.dev202512041731.dist-info → kumoai-2.15.0.dev202601141731.dist-info}/licenses/LICENSE +0 -0
  56. {kumoai-2.13.0.dev202512041731.dist-info → kumoai-2.15.0.dev202601141731.dist-info}/top_level.txt +0 -0
@@ -1,25 +1,32 @@
1
1
  from abc import ABC, abstractmethod
2
- from collections import defaultdict
2
+ from collections.abc import Sequence
3
3
  from functools import cached_property
4
- from typing import Dict, List, Optional, Sequence, Set
5
4
 
5
+ import numpy as np
6
6
  import pandas as pd
7
+ from kumoapi.model_plan import MissingType
7
8
  from kumoapi.source_table import UnavailableSourceTable
8
9
  from kumoapi.table import Column as ColumnDefinition
9
10
  from kumoapi.table import TableDefinition
10
- from kumoapi.typing import Stype
11
+ from kumoapi.typing import Dtype, Stype
11
12
  from typing_extensions import Self
12
13
 
13
- from kumoai import in_notebook, in_snowflake_notebook
14
- from kumoai.experimental.rfm.base import Column, SourceColumn, SourceForeignKey
14
+ from kumoai.experimental.rfm.base import (
15
+ Column,
16
+ ColumnSpec,
17
+ ColumnSpecType,
18
+ DataBackend,
19
+ SourceColumn,
20
+ SourceForeignKey,
21
+ )
22
+ from kumoai.experimental.rfm.base.utils import to_datetime
15
23
  from kumoai.experimental.rfm.infer import (
16
- contains_categorical,
17
- contains_id,
18
- contains_multicategorical,
19
- contains_timestamp,
24
+ infer_dtype,
20
25
  infer_primary_key,
26
+ infer_stype,
21
27
  infer_time_column,
22
28
  )
29
+ from kumoai.utils import display, quote_ident
23
30
 
24
31
 
25
32
  class Table(ABC):
@@ -29,53 +36,48 @@ class Table(ABC):
29
36
 
30
37
  Args:
31
38
  name: The name of this table.
39
+ source_name: The source name of this table. If set to ``None``,
40
+ ``name`` is being used.
32
41
  columns: The selected columns of this table.
33
42
  primary_key: The name of the primary key of this table, if it exists.
34
43
  time_column: The name of the time column of this table, if it exists.
35
44
  end_time_column: The name of the end time column of this table, if it
36
45
  exists.
37
46
  """
47
+ _NUM_SAMPLE_ROWS = 1_000
48
+
38
49
  def __init__(
39
50
  self,
40
51
  name: str,
41
- columns: Optional[Sequence[str]] = None,
42
- primary_key: Optional[str] = None,
43
- time_column: Optional[str] = None,
44
- end_time_column: Optional[str] = None,
52
+ source_name: str | None = None,
53
+ columns: Sequence[ColumnSpecType] | None = None,
54
+ primary_key: MissingType | str | None = MissingType.VALUE,
55
+ time_column: str | None = None,
56
+ end_time_column: str | None = None,
45
57
  ) -> None:
46
58
 
47
59
  self._name = name
48
- self._primary_key: Optional[str] = None
49
- self._time_column: Optional[str] = None
50
- self._end_time_column: Optional[str] = None
51
-
52
- if len(self._source_column_dict) == 0:
53
- raise ValueError(f"Table '{name}' does not hold any column with "
54
- f"a supported data type")
55
-
56
- primary_keys = [
57
- column.name for column in self._source_column_dict.values()
58
- if column.is_primary_key
59
- ]
60
- if len(primary_keys) == 1: # NOTE No composite keys yet.
61
- if primary_key is not None and primary_key != primary_keys[0]:
62
- raise ValueError(f"Found duplicate primary key "
63
- f"definition '{primary_key}' and "
64
- f"'{primary_keys[0]}' in table '{name}'")
65
- primary_key = primary_keys[0]
66
-
67
- unique_keys = [
68
- column.name for column in self._source_column_dict.values()
69
- if column.is_unique_key
70
- ]
71
- if primary_key is None and len(unique_keys) == 1:
72
- primary_key = unique_keys[0]
73
-
74
- self._columns: Dict[str, Column] = {}
75
- for column_name in columns or list(self._source_column_dict.keys()):
76
- self.add_column(column_name)
77
-
78
- if primary_key is not None:
60
+ self._source_name = source_name or name
61
+ self._column_dict: dict[str, Column] = {}
62
+ self._primary_key: str | None = None
63
+ self._time_column: str | None = None
64
+ self._end_time_column: str | None = None
65
+ self._expr_sample_df = pd.DataFrame(index=range(self._NUM_SAMPLE_ROWS))
66
+
67
+ if columns is None:
68
+ columns = list(self._source_column_dict.keys())
69
+
70
+ self.add_columns(columns)
71
+
72
+ if isinstance(primary_key, MissingType):
73
+ # Infer primary key from source metadata, but only set it in case
74
+ # it is already part of the column set (don't magically add it):
75
+ if any(column.is_source for column in self.columns):
76
+ primary_key = self._source_primary_key
77
+ if (primary_key is not None and primary_key in self
78
+ and self[primary_key].is_source):
79
+ self.primary_key = primary_key
80
+ elif primary_key is not None:
79
81
  if primary_key not in self:
80
82
  self.add_column(primary_key)
81
83
  self.primary_key = primary_key
@@ -95,13 +97,22 @@ class Table(ABC):
95
97
  r"""The name of this table."""
96
98
  return self._name
97
99
 
98
- # Data column #############################################################
100
+ @property
101
+ def source_name(self) -> str:
102
+ r"""The source name of this table."""
103
+ return self._source_name
104
+
105
+ @property
106
+ def _quoted_source_name(self) -> str:
107
+ return quote_ident(self._source_name)
108
+
109
+ # Column ##################################################################
99
110
 
100
111
  def has_column(self, name: str) -> bool:
101
112
  r"""Returns ``True`` if this table holds a column with name ``name``;
102
113
  ``False`` otherwise.
103
114
  """
104
- return name in self._columns
115
+ return name in self._column_dict
105
116
 
106
117
  def column(self, name: str) -> Column:
107
118
  r"""Returns the data column named with name ``name`` in this table.
@@ -114,65 +125,113 @@ class Table(ABC):
114
125
  """
115
126
  if not self.has_column(name):
116
127
  raise KeyError(f"Column '{name}' not found in table '{self.name}'")
117
- return self._columns[name]
128
+ return self._column_dict[name]
118
129
 
119
130
  @property
120
- def columns(self) -> List[Column]:
131
+ def columns(self) -> list[Column]:
121
132
  r"""Returns a list of :class:`Column` objects that represent the
122
133
  columns in this table.
123
134
  """
124
- return list(self._columns.values())
135
+ return list(self._column_dict.values())
125
136
 
126
- def add_column(self, name: str) -> Column:
127
- r"""Adds a column to this table.
137
+ def add_columns(self, columns: Sequence[ColumnSpecType]) -> None:
138
+ r"""Adds a set of columns to this table.
128
139
 
129
140
  Args:
130
- name: The name of the column.
141
+ columns: The columns to add.
131
142
 
132
143
  Raises:
133
- KeyError: If ``name`` is already present in this table.
144
+ KeyError: If any of the column names already exist in this table.
134
145
  """
135
- if name in self:
136
- raise KeyError(f"Column '{name}' already exists in table "
137
- f"'{self.name}'")
138
-
139
- if name not in self._source_column_dict:
140
- raise KeyError(f"Column '{name}' does not exist in the underlying "
141
- f"source table")
142
-
143
- try:
144
- dtype = self._source_column_dict[name].dtype
145
- except Exception as e:
146
- raise RuntimeError(f"Could not obtain data type for column "
147
- f"'{name}' in table '{self.name}'. Change "
148
- f"the data type of the column in the source "
149
- f"table or remove it from the table.") from e
150
-
151
- try:
152
- ser = self._sample_df[name]
153
- if contains_id(ser, name, dtype):
154
- stype = Stype.ID
155
- elif contains_timestamp(ser, name, dtype):
156
- stype = Stype.timestamp
157
- elif contains_multicategorical(ser, name, dtype):
158
- stype = Stype.multicategorical
159
- elif contains_categorical(ser, name, dtype):
160
- stype = Stype.categorical
161
- else:
162
- stype = dtype.default_stype
163
- except Exception as e:
164
- raise RuntimeError(f"Could not obtain semantic type for column "
165
- f"'{name}' in table '{self.name}'. Change "
166
- f"the data type of the column in the source "
167
- f"table or remove it from the table.") from e
168
-
169
- self._columns[name] = Column(
170
- name=name,
171
- dtype=dtype,
172
- stype=stype,
173
- )
146
+ if len(columns) == 0:
147
+ return
174
148
 
175
- return self._columns[name]
149
+ column_specs = [ColumnSpec.coerce(column) for column in columns]
150
+
151
+ # Obtain a batch-wise sample for all column expressions:
152
+ expr_specs = [spec for spec in column_specs if not spec.is_source]
153
+ if len(expr_specs) > 0:
154
+ dfs = [
155
+ self._expr_sample_df,
156
+ self._get_expr_sample_df(expr_specs).reset_index(drop=True),
157
+ ]
158
+ size = min(map(len, dfs))
159
+ df = pd.concat([dfs[0].iloc[:size], dfs[1].iloc[:size]], axis=1)
160
+ df = df.loc[:, ~df.columns.duplicated(keep='last')]
161
+ self._expr_sample_df = df
162
+
163
+ for column_spec in column_specs:
164
+ if column_spec.name in self:
165
+ raise KeyError(f"Column '{column_spec.name}' already exists "
166
+ f"in table '{self.name}'")
167
+
168
+ dtype = column_spec.dtype
169
+ stype = column_spec.stype
170
+
171
+ if column_spec.is_source:
172
+ if column_spec.name not in self._source_column_dict:
173
+ raise ValueError(
174
+ f"Column '{column_spec.name}' does not exist in the "
175
+ f"underlying source table")
176
+
177
+ if dtype is None:
178
+ dtype = self._source_column_dict[column_spec.name].dtype
179
+
180
+ if dtype == Dtype.unsupported:
181
+ raise ValueError(
182
+ f"Encountered unsupported data type for column "
183
+ f"'{column_spec.name}' in table '{self.name}'. Please "
184
+ f"either change the column's data type or remove the "
185
+ f"column from this table.")
186
+
187
+ if dtype is None:
188
+ if column_spec.is_source:
189
+ ser = self._source_sample_df[column_spec.name]
190
+ else:
191
+ ser = self._expr_sample_df[column_spec.name]
192
+ try:
193
+ dtype = infer_dtype(ser)
194
+ except Exception as e:
195
+ raise RuntimeError(
196
+ f"Encountered unsupported data type '{ser.dtype}' for "
197
+ f"column '{column_spec.name}' in table '{self.name}'. "
198
+ f"Please either manually override the columns's data "
199
+ f"type or remove the column from this table.") from e
200
+
201
+ if stype is None:
202
+ if column_spec.is_source:
203
+ ser = self._source_sample_df[column_spec.name]
204
+ else:
205
+ ser = self._expr_sample_df[column_spec.name]
206
+ try:
207
+ stype = infer_stype(ser, column_spec.name, dtype)
208
+ except Exception as e:
209
+ raise RuntimeError(
210
+ f"Could not determine semantic type for column "
211
+ f"'{column_spec.name}' with data type '{dtype}' in "
212
+ f"table '{self.name}'. Please either change the "
213
+ f"column's data type or remove the column from this "
214
+ f"table.") from e
215
+
216
+ self._column_dict[column_spec.name] = Column(
217
+ name=column_spec.name,
218
+ expr=column_spec.expr,
219
+ dtype=dtype,
220
+ stype=stype,
221
+ )
222
+
223
+ def add_column(self, column: ColumnSpecType) -> Column:
224
+ r"""Adds a column to this table.
225
+
226
+ Args:
227
+ column: The column to add.
228
+
229
+ Raises:
230
+ KeyError: If the column name already exists in this table.
231
+ """
232
+ column_spec = ColumnSpec.coerce(column)
233
+ self.add_columns([column_spec])
234
+ return self[column_spec.name]
176
235
 
177
236
  def remove_column(self, name: str) -> Self:
178
237
  r"""Removes a column from this table.
@@ -192,7 +251,7 @@ class Table(ABC):
192
251
  self.time_column = None
193
252
  if self._end_time_column == name:
194
253
  self.end_time_column = None
195
- del self._columns[name]
254
+ del self._column_dict[name]
196
255
 
197
256
  return self
198
257
 
@@ -205,22 +264,22 @@ class Table(ABC):
205
264
  return self._primary_key is not None
206
265
 
207
266
  @property
208
- def primary_key(self) -> Optional[Column]:
267
+ def primary_key(self) -> Column | None:
209
268
  r"""The primary key column of this table.
210
269
 
211
270
  The getter returns the primary key column of this table, or ``None`` if
212
271
  no such primary key is present.
213
272
 
214
273
  The setter sets a column as a primary key on this table, and raises a
215
- :class:`ValueError` if the primary key has a non-ID semantic type or
216
- if the column name does not match a column in the data frame.
274
+ :class:`ValueError` if the primary key has a non-ID compatible data
275
+ type or if the column name does not match a column in the data frame.
217
276
  """
218
277
  if self._primary_key is None:
219
278
  return None
220
279
  return self[self._primary_key]
221
280
 
222
281
  @primary_key.setter
223
- def primary_key(self, name: Optional[str]) -> None:
282
+ def primary_key(self, name: str | None) -> None:
224
283
  if name is not None and name == self._time_column:
225
284
  raise ValueError(f"Cannot specify column '{name}' as a primary "
226
285
  f"key since it is already defined to be a time "
@@ -250,22 +309,23 @@ class Table(ABC):
250
309
  return self._time_column is not None
251
310
 
252
311
  @property
253
- def time_column(self) -> Optional[Column]:
312
+ def time_column(self) -> Column | None:
254
313
  r"""The time column of this table.
255
314
 
256
315
  The getter returns the time column of this table, or ``None`` if no
257
316
  such time column is present.
258
317
 
259
318
  The setter sets a column as a time column on this table, and raises a
260
- :class:`ValueError` if the time column has a non-timestamp semantic
261
- type or if the column name does not match a column in the data frame.
319
+ :class:`ValueError` if the time column has a non-timestamp compatible
320
+ data type or if the column name does not match a column in the data
321
+ frame.
262
322
  """
263
323
  if self._time_column is None:
264
324
  return None
265
325
  return self[self._time_column]
266
326
 
267
327
  @time_column.setter
268
- def time_column(self, name: Optional[str]) -> None:
328
+ def time_column(self, name: str | None) -> None:
269
329
  if name is not None and name == self._primary_key:
270
330
  raise ValueError(f"Cannot specify column '{name}' as a time "
271
331
  f"column since it is already defined to be a "
@@ -295,7 +355,7 @@ class Table(ABC):
295
355
  return self._end_time_column is not None
296
356
 
297
357
  @property
298
- def end_time_column(self) -> Optional[Column]:
358
+ def end_time_column(self) -> Column | None:
299
359
  r"""The end time column of this table.
300
360
 
301
361
  The getter returns the end time column of this table, or ``None`` if no
@@ -303,15 +363,15 @@ class Table(ABC):
303
363
 
304
364
  The setter sets a column as an end time column on this table, and
305
365
  raises a :class:`ValueError` if the end time column has a non-timestamp
306
- semantic type or if the column name does not match a column in the data
307
- frame.
366
+ compatible data type or if the column name does not match a column in
367
+ the data frame.
308
368
  """
309
369
  if self._end_time_column is None:
310
370
  return None
311
371
  return self[self._end_time_column]
312
372
 
313
373
  @end_time_column.setter
314
- def end_time_column(self, name: Optional[str]) -> None:
374
+ def end_time_column(self, name: str | None) -> None:
315
375
  if name is not None and name == self._primary_key:
316
376
  raise ValueError(f"Cannot specify column '{name}' as an end time "
317
377
  f"column since it is already defined to be a "
@@ -339,39 +399,39 @@ class Table(ABC):
339
399
  r"""Returns a :class:`pandas.DataFrame` object containing metadata
340
400
  information about the columns in this table.
341
401
 
342
- The returned dataframe has columns ``name``, ``dtype``, ``stype``,
343
- ``is_primary_key``, ``is_time_column`` and ``is_end_time_column``,
344
- which provide an aggregate view of the properties of the columns of
345
- this table.
402
+ The returned dataframe has columns ``"Name"``, ``"Data Type"``,
403
+ ``"Semantic Type"``, ``"Primary Key"``, ``"Time Column"`` and
404
+ ``"End Time Column"``, which provide an aggregated view of the
405
+ properties of the columns of this table.
346
406
 
347
407
  Example:
348
408
  >>> # doctest: +SKIP
349
409
  >>> import kumoai.experimental.rfm as rfm
350
410
  >>> table = rfm.LocalTable(df=..., name=...).infer_metadata()
351
411
  >>> table.metadata
352
- name dtype stype is_primary_key is_time_column is_end_time_column
353
- 0 CustomerID float64 ID True False False
412
+ Name Data Type Semantic Type Primary Key Time Column End Time Column
413
+ 0 CustomerID float64 ID True False False
354
414
  """ # noqa: E501
355
415
  cols = self.columns
356
416
 
357
417
  return pd.DataFrame({
358
- 'name':
418
+ 'Name':
359
419
  pd.Series(dtype=str, data=[c.name for c in cols]),
360
- 'dtype':
420
+ 'Data Type':
361
421
  pd.Series(dtype=str, data=[c.dtype for c in cols]),
362
- 'stype':
422
+ 'Semantic Type':
363
423
  pd.Series(dtype=str, data=[c.stype for c in cols]),
364
- 'is_primary_key':
424
+ 'Primary Key':
365
425
  pd.Series(
366
426
  dtype=bool,
367
427
  data=[self._primary_key == c.name for c in cols],
368
428
  ),
369
- 'is_time_column':
429
+ 'Time Column':
370
430
  pd.Series(
371
431
  dtype=bool,
372
432
  data=[self._time_column == c.name for c in cols],
373
433
  ),
374
- 'is_end_time_column':
434
+ 'End Time Column':
375
435
  pd.Series(
376
436
  dtype=bool,
377
437
  data=[self._end_time_column == c.name for c in cols],
@@ -380,33 +440,98 @@ class Table(ABC):
380
440
 
381
441
  def print_metadata(self) -> None:
382
442
  r"""Prints the :meth:`~metadata` of this table."""
383
- num_rows_repr = ''
384
- if self._num_rows is not None:
385
- num_rows_repr = ' ({self._num_rows:,} rows)'
386
-
387
- if in_snowflake_notebook():
388
- import streamlit as st
389
- md_repr = f"### 🏷️ Metadata of Table `{self.name}`{num_rows_repr}"
390
- st.markdown(md_repr)
391
- st.dataframe(self.metadata, hide_index=True)
392
- elif in_notebook():
393
- from IPython.display import Markdown, display
394
- md_repr = f"### 🏷️ Metadata of Table `{self.name}`{num_rows_repr}"
395
- display(Markdown(md_repr))
396
- df = self.metadata
397
- try:
398
- if hasattr(df.style, 'hide'):
399
- display(df.style.hide(axis='index')) # pandas=2
400
- else:
401
- display(df.style.hide_index()) # pandas<1.3
402
- except ImportError:
403
- print(df.to_string(index=False)) # missing jinja2
404
- else:
405
- print(f"🏷️ Metadata of Table '{self.name}'{num_rows_repr}")
406
- print(self.metadata.to_string(index=False))
443
+ msg = f"🏷️ Metadata of Table `{self.name}`"
444
+ if num := self._num_rows:
445
+ msg += " (1 row)" if num == 1 else f" ({num:,} rows)"
446
+
447
+ display.title(msg)
448
+ display.dataframe(self.metadata)
449
+
450
+ def infer_primary_key(self, verbose: bool = True) -> Self:
451
+ r"""Infers the primary key in this table.
452
+
453
+ Args:
454
+ verbose: Whether to print verbose output.
455
+ """
456
+ if self.has_primary_key():
457
+ return self
458
+
459
+ def _set_primary_key(primary_key: str) -> None:
460
+ self.primary_key = primary_key
461
+ if verbose:
462
+ display.message(f"Inferred primary key `{primary_key}` for "
463
+ f"table `{self.name}`")
464
+
465
+ # Inference from source column metadata:
466
+ if any(column.is_source for column in self.columns):
467
+ primary_key = self._source_primary_key
468
+ if (primary_key is not None and primary_key in self
469
+ and self[primary_key].is_source):
470
+ _set_primary_key(primary_key)
471
+ return self
472
+
473
+ unique_keys = [
474
+ column.name for column in self._source_column_dict.values()
475
+ if column.is_unique_key
476
+ ]
477
+ if (len(unique_keys) == 1 # NOTE No composite keys yet.
478
+ and unique_keys[0] in self
479
+ and self[unique_keys[0]].is_source):
480
+ _set_primary_key(unique_keys[0])
481
+ return self
482
+
483
+ # Heuristic-based inference:
484
+ candidates = [
485
+ column.name for column in self.columns if column.stype == Stype.ID
486
+ ]
487
+ if len(candidates) == 0:
488
+ for column in self.columns:
489
+ if self.name.lower() == column.name.lower():
490
+ candidates.append(column.name)
491
+ elif (self.name.lower().endswith('s')
492
+ and self.name.lower()[:-1] == column.name.lower()):
493
+ candidates.append(column.name)
494
+
495
+ if primary_key := infer_primary_key(
496
+ table_name=self.name,
497
+ df=self._get_sample_df(),
498
+ candidates=candidates,
499
+ ):
500
+ _set_primary_key(primary_key)
501
+ return self
502
+
503
+ return self
504
+
505
+ def infer_time_column(self, verbose: bool = True) -> Self:
506
+ r"""Infers the time column in this table.
507
+
508
+ Args:
509
+ verbose: Whether to print verbose output.
510
+ """
511
+ if self.has_time_column():
512
+ return self
513
+
514
+ # Heuristic-based inference:
515
+ candidates = [
516
+ column.name for column in self.columns
517
+ if column.stype == Stype.timestamp
518
+ and column.name != self._end_time_column
519
+ ]
520
+
521
+ if time_column := infer_time_column(
522
+ df=self._get_sample_df(),
523
+ candidates=candidates,
524
+ ):
525
+ self.time_column = time_column
526
+
527
+ if verbose:
528
+ display.message(f"Inferred time column `{time_column}` for "
529
+ f"table `{self.name}`")
530
+
531
+ return self
407
532
 
408
533
  def infer_metadata(self, verbose: bool = True) -> Self:
409
- r"""Infers metadata, *i.e.*, primary keys and time columns, in the
534
+ r"""Infers metadata, *i.e.*, primary keys and time columns, in this
410
535
  table.
411
536
 
412
537
  Args:
@@ -414,48 +539,19 @@ class Table(ABC):
414
539
  """
415
540
  logs = []
416
541
 
417
- # Try to detect primary key if not set:
418
542
  if not self.has_primary_key():
543
+ self.infer_primary_key(verbose=False)
544
+ if self.has_primary_key():
545
+ logs.append(f"primary key `{self._primary_key}`")
419
546
 
420
- def is_candidate(column: Column) -> bool:
421
- if column.stype == Stype.ID:
422
- return True
423
- if all(column.stype != Stype.ID for column in self.columns):
424
- if self.name == column.name:
425
- return True
426
- if (self.name.endswith('s')
427
- and self.name[:-1] == column.name):
428
- return True
429
- return False
430
-
431
- candidates = [
432
- column.name for column in self.columns if is_candidate(column)
433
- ]
434
-
435
- if primary_key := infer_primary_key(
436
- table_name=self.name,
437
- df=self._sample_df,
438
- candidates=candidates,
439
- ):
440
- self.primary_key = primary_key
441
- logs.append(f"primary key '{primary_key}'")
442
-
443
- # Try to detect time column if not set:
444
547
  if not self.has_time_column():
445
- candidates = [
446
- column.name for column in self.columns
447
- if column.stype == Stype.timestamp
448
- and column.name != self._end_time_column
449
- ]
450
- if time_column := infer_time_column(
451
- df=self._sample_df,
452
- candidates=candidates,
453
- ):
454
- self.time_column = time_column
455
- logs.append(f"time column '{time_column}'")
548
+ self.infer_time_column(verbose=False)
549
+ if self.has_time_column():
550
+ logs.append(f"time column `{self._time_column}`")
456
551
 
457
552
  if verbose and len(logs) > 0:
458
- print(f"Detected {' and '.join(logs)} in table '{self.name}'")
553
+ display.message(f"Inferred {' and '.join(logs)} for table "
554
+ f"`{self.name}`")
459
555
 
460
556
  return self
461
557
 
@@ -473,6 +569,100 @@ class Table(ABC):
473
569
  end_time_col=self._end_time_column,
474
570
  )
475
571
 
572
+ @cached_property
573
+ def _source_column_dict(self) -> dict[str, SourceColumn]:
574
+ source_columns = self._get_source_columns()
575
+ if len(source_columns) == 0:
576
+ raise ValueError(f"Table '{self.name}' has no columns")
577
+ return {column.name: column for column in source_columns}
578
+
579
+ @cached_property
580
+ def _source_primary_key(self) -> str | None:
581
+ primary_keys = [
582
+ column.name for column in self._source_column_dict.values()
583
+ if column.is_primary_key
584
+ ]
585
+ # NOTE No composite keys yet.
586
+ return primary_keys[0] if len(primary_keys) == 1 else None
587
+
588
+ @cached_property
589
+ def _source_foreign_key_dict(self) -> dict[str, SourceForeignKey]:
590
+ return {key.name: key for key in self._get_source_foreign_keys()}
591
+
592
+ @cached_property
593
+ def _source_sample_df(self) -> pd.DataFrame:
594
+ return self._get_source_sample_df().reset_index(drop=True)
595
+
596
+ @cached_property
597
+ def _num_rows(self) -> int | None:
598
+ return self._get_num_rows()
599
+
600
+ def _get_sample_df(self) -> pd.DataFrame:
601
+ dfs: list[pd.DataFrame] = []
602
+ if any(column.is_source for column in self.columns):
603
+ dfs.append(self._source_sample_df)
604
+ if any(not column.is_source for column in self.columns):
605
+ dfs.append(self._expr_sample_df)
606
+
607
+ if len(dfs) == 0:
608
+ return pd.DataFrame(index=range(1000))
609
+ if len(dfs) == 1:
610
+ return dfs[0]
611
+
612
+ size = min(map(len, dfs))
613
+ df = pd.concat([dfs[0].iloc[:size], dfs[1].iloc[:size]], axis=1)
614
+ df = df.loc[:, ~df.columns.duplicated(keep='last')]
615
+ return df
616
+
617
+ @staticmethod
618
+ def _sanitize(
619
+ df: pd.DataFrame,
620
+ dtype_dict: dict[str, Dtype | None] | None = None,
621
+ stype_dict: dict[str, Stype | None] | None = None,
622
+ ) -> pd.DataFrame:
623
+ r"""Sanitzes a :class:`pandas.DataFrame` in-place such that its data
624
+ types match table data and semantic type specification.
625
+ """
626
+ def _to_list(ser: pd.Series, dtype: Dtype | None) -> pd.Series:
627
+ if (pd.api.types.is_string_dtype(ser)
628
+ and dtype in {Dtype.intlist, Dtype.floatlist}):
629
+ try:
630
+ ser = ser.map(lambda row: np.fromstring(
631
+ row.strip('[]'),
632
+ sep=',',
633
+ dtype=int if dtype == Dtype.intlist else np.float32,
634
+ ) if row is not None else None)
635
+ except Exception:
636
+ pass
637
+
638
+ if pd.api.types.is_string_dtype(ser):
639
+ try:
640
+ import orjson as json
641
+ except ImportError:
642
+ import json
643
+ try:
644
+ ser = ser.map(lambda row: json.loads(row)
645
+ if row is not None else None)
646
+ except Exception:
647
+ pass
648
+
649
+ return ser
650
+
651
+ for column_name in df.columns:
652
+ dtype = (dtype_dict or {}).get(column_name)
653
+ stype = (stype_dict or {}).get(column_name)
654
+
655
+ if dtype == Dtype.time:
656
+ df[column_name] = to_datetime(df[column_name])
657
+ elif stype == Stype.timestamp:
658
+ df[column_name] = to_datetime(df[column_name])
659
+ elif dtype is not None and dtype.is_list():
660
+ df[column_name] = _to_list(df[column_name], dtype)
661
+ elif stype == Stype.sequence:
662
+ df[column_name] = _to_list(df[column_name], Dtype.floatlist)
663
+
664
+ return df
665
+
476
666
  # Python builtins #########################################################
477
667
 
478
668
  def __hash__(self) -> int:
@@ -503,43 +693,30 @@ class Table(ABC):
503
693
 
504
694
  # Abstract Methods ########################################################
505
695
 
506
- @cached_property
507
- def _source_column_dict(self) -> Dict[str, SourceColumn]:
508
- return {col.name: col for col in self._get_source_columns()}
696
+ @property
697
+ @abstractmethod
698
+ def backend(self) -> DataBackend:
699
+ r"""The data backend of this table."""
509
700
 
510
701
  @abstractmethod
511
- def _get_source_columns(self) -> List[SourceColumn]:
702
+ def _get_source_columns(self) -> list[SourceColumn]:
512
703
  pass
513
704
 
514
- @cached_property
515
- def _source_foreign_key_dict(self) -> Dict[str, SourceForeignKey]:
516
- fkeys = self._get_source_foreign_keys()
517
- # NOTE Drop all keys that link to different primary keys in the same
518
- # table since we don't support composite keys yet:
519
- table_pkeys: Dict[str, Set[str]] = defaultdict(set)
520
- for fkey in fkeys:
521
- table_pkeys[fkey.dst_table].add(fkey.primary_key)
522
- return {
523
- fkey.name: fkey
524
- for fkey in fkeys if len(table_pkeys[fkey.dst_table]) == 1
525
- }
526
-
527
705
  @abstractmethod
528
- def _get_source_foreign_keys(self) -> List[SourceForeignKey]:
706
+ def _get_source_foreign_keys(self) -> list[SourceForeignKey]:
529
707
  pass
530
708
 
531
- @cached_property
532
- def _sample_df(self) -> pd.DataFrame:
533
- return self._get_sample_df()
534
-
535
709
  @abstractmethod
536
- def _get_sample_df(self) -> pd.DataFrame:
710
+ def _get_source_sample_df(self) -> pd.DataFrame:
537
711
  pass
538
712
 
539
- @cached_property
540
- def _num_rows(self) -> Optional[int]:
541
- return self._get_num_rows()
713
+ @abstractmethod
714
+ def _get_expr_sample_df(
715
+ self,
716
+ columns: Sequence[ColumnSpec],
717
+ ) -> pd.DataFrame:
718
+ pass
542
719
 
543
720
  @abstractmethod
544
- def _get_num_rows(self) -> Optional[int]:
721
+ def _get_num_rows(self) -> int | None:
545
722
  pass