kumoai 2.13.0.dev202512040649__cp313-cp313-win_amd64.whl → 2.14.0.dev202601081732__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. kumoai/__init__.py +35 -26
  2. kumoai/_version.py +1 -1
  3. kumoai/client/client.py +6 -0
  4. kumoai/client/jobs.py +26 -0
  5. kumoai/client/pquery.py +6 -2
  6. kumoai/connector/utils.py +21 -7
  7. kumoai/experimental/rfm/__init__.py +51 -24
  8. kumoai/experimental/rfm/authenticate.py +3 -4
  9. kumoai/experimental/rfm/backend/local/__init__.py +4 -0
  10. kumoai/experimental/rfm/{local_graph_store.py → backend/local/graph_store.py} +62 -110
  11. kumoai/experimental/rfm/backend/local/sampler.py +312 -0
  12. kumoai/experimental/rfm/backend/local/table.py +35 -31
  13. kumoai/experimental/rfm/backend/snow/__init__.py +2 -0
  14. kumoai/experimental/rfm/backend/snow/sampler.py +366 -0
  15. kumoai/experimental/rfm/backend/snow/table.py +177 -50
  16. kumoai/experimental/rfm/backend/sqlite/__init__.py +4 -2
  17. kumoai/experimental/rfm/backend/sqlite/sampler.py +454 -0
  18. kumoai/experimental/rfm/backend/sqlite/table.py +131 -48
  19. kumoai/experimental/rfm/base/__init__.py +23 -3
  20. kumoai/experimental/rfm/base/column.py +96 -10
  21. kumoai/experimental/rfm/base/expression.py +44 -0
  22. kumoai/experimental/rfm/base/sampler.py +782 -0
  23. kumoai/experimental/rfm/base/source.py +2 -1
  24. kumoai/experimental/rfm/base/sql_sampler.py +247 -0
  25. kumoai/experimental/rfm/base/table.py +404 -203
  26. kumoai/experimental/rfm/graph.py +374 -172
  27. kumoai/experimental/rfm/infer/__init__.py +6 -4
  28. kumoai/experimental/rfm/infer/dtype.py +7 -4
  29. kumoai/experimental/rfm/infer/multicategorical.py +1 -1
  30. kumoai/experimental/rfm/infer/pkey.py +4 -2
  31. kumoai/experimental/rfm/infer/stype.py +35 -0
  32. kumoai/experimental/rfm/infer/time_col.py +1 -2
  33. kumoai/experimental/rfm/pquery/executor.py +27 -27
  34. kumoai/experimental/rfm/pquery/pandas_executor.py +30 -32
  35. kumoai/experimental/rfm/relbench.py +76 -0
  36. kumoai/experimental/rfm/rfm.py +762 -467
  37. kumoai/experimental/rfm/sagemaker.py +4 -4
  38. kumoai/experimental/rfm/task_table.py +292 -0
  39. kumoai/kumolib.cp313-win_amd64.pyd +0 -0
  40. kumoai/pquery/predictive_query.py +10 -6
  41. kumoai/pquery/training_table.py +16 -2
  42. kumoai/testing/snow.py +50 -0
  43. kumoai/trainer/distilled_trainer.py +175 -0
  44. kumoai/utils/__init__.py +3 -2
  45. kumoai/utils/display.py +87 -0
  46. kumoai/utils/progress_logger.py +190 -12
  47. kumoai/utils/sql.py +3 -0
  48. {kumoai-2.13.0.dev202512040649.dist-info → kumoai-2.14.0.dev202601081732.dist-info}/METADATA +3 -2
  49. {kumoai-2.13.0.dev202512040649.dist-info → kumoai-2.14.0.dev202601081732.dist-info}/RECORD +52 -41
  50. kumoai/experimental/rfm/local_graph_sampler.py +0 -223
  51. kumoai/experimental/rfm/local_pquery_driver.py +0 -689
  52. {kumoai-2.13.0.dev202512040649.dist-info → kumoai-2.14.0.dev202601081732.dist-info}/WHEEL +0 -0
  53. {kumoai-2.13.0.dev202512040649.dist-info → kumoai-2.14.0.dev202601081732.dist-info}/licenses/LICENSE +0 -0
  54. {kumoai-2.13.0.dev202512040649.dist-info → kumoai-2.14.0.dev202601081732.dist-info}/top_level.txt +0 -0
@@ -1,25 +1,33 @@
1
+ import warnings
1
2
  from abc import ABC, abstractmethod
2
- from collections import defaultdict
3
+ from collections.abc import Sequence
3
4
  from functools import cached_property
4
- from typing import Dict, List, Optional, Sequence, Set
5
5
 
6
+ import numpy as np
6
7
  import pandas as pd
8
+ import pyarrow as pa
9
+ from kumoapi.model_plan import MissingType
7
10
  from kumoapi.source_table import UnavailableSourceTable
8
11
  from kumoapi.table import Column as ColumnDefinition
9
12
  from kumoapi.table import TableDefinition
10
- from kumoapi.typing import Stype
13
+ from kumoapi.typing import Dtype, Stype
11
14
  from typing_extensions import Self
12
15
 
13
- from kumoai import in_notebook
14
- from kumoai.experimental.rfm.base import Column, SourceColumn, SourceForeignKey
16
+ from kumoai.experimental.rfm.base import (
17
+ Column,
18
+ ColumnSpec,
19
+ ColumnSpecType,
20
+ DataBackend,
21
+ SourceColumn,
22
+ SourceForeignKey,
23
+ )
15
24
  from kumoai.experimental.rfm.infer import (
16
- contains_categorical,
17
- contains_id,
18
- contains_multicategorical,
19
- contains_timestamp,
25
+ infer_dtype,
20
26
  infer_primary_key,
27
+ infer_stype,
21
28
  infer_time_column,
22
29
  )
30
+ from kumoai.utils import display, quote_ident
23
31
 
24
32
 
25
33
  class Table(ABC):
@@ -29,53 +37,48 @@ class Table(ABC):
29
37
 
30
38
  Args:
31
39
  name: The name of this table.
40
+ source_name: The source name of this table. If set to ``None``,
41
+ ``name`` is used instead.
32
42
  columns: The selected columns of this table.
33
43
  primary_key: The name of the primary key of this table, if it exists.
34
44
  time_column: The name of the time column of this table, if it exists.
35
45
  end_time_column: The name of the end time column of this table, if it
36
46
  exists.
37
47
  """
48
+ _NUM_SAMPLE_ROWS = 1_000
49
+
38
50
  def __init__(
39
51
  self,
40
52
  name: str,
41
- columns: Optional[Sequence[str]] = None,
42
- primary_key: Optional[str] = None,
43
- time_column: Optional[str] = None,
44
- end_time_column: Optional[str] = None,
53
+ source_name: str | None = None,
54
+ columns: Sequence[ColumnSpecType] | None = None,
55
+ primary_key: MissingType | str | None = MissingType.VALUE,
56
+ time_column: str | None = None,
57
+ end_time_column: str | None = None,
45
58
  ) -> None:
46
59
 
47
60
  self._name = name
48
- self._primary_key: Optional[str] = None
49
- self._time_column: Optional[str] = None
50
- self._end_time_column: Optional[str] = None
51
-
52
- if len(self._source_column_dict) == 0:
53
- raise ValueError(f"Table '{name}' does not hold any column with "
54
- f"a supported data type")
55
-
56
- primary_keys = [
57
- column.name for column in self._source_column_dict.values()
58
- if column.is_primary_key
59
- ]
60
- if len(primary_keys) == 1: # NOTE No composite keys yet.
61
- if primary_key is not None and primary_key != primary_keys[0]:
62
- raise ValueError(f"Found duplicate primary key "
63
- f"definition '{primary_key}' and "
64
- f"'{primary_keys[0]}' in table '{name}'")
65
- primary_key = primary_keys[0]
66
-
67
- unique_keys = [
68
- column.name for column in self._source_column_dict.values()
69
- if column.is_unique_key
70
- ]
71
- if primary_key is None and len(unique_keys) == 1:
72
- primary_key = unique_keys[0]
73
-
74
- self._columns: Dict[str, Column] = {}
75
- for column_name in columns or list(self._source_column_dict.keys()):
76
- self.add_column(column_name)
77
-
78
- if primary_key is not None:
61
+ self._source_name = source_name or name
62
+ self._column_dict: dict[str, Column] = {}
63
+ self._primary_key: str | None = None
64
+ self._time_column: str | None = None
65
+ self._end_time_column: str | None = None
66
+ self._expr_sample_df = pd.DataFrame(index=range(self._NUM_SAMPLE_ROWS))
67
+
68
+ if columns is None:
69
+ columns = list(self._source_column_dict.keys())
70
+
71
+ self.add_columns(columns)
72
+
73
+ if isinstance(primary_key, MissingType):
74
+ # Infer primary key from source metadata, but only set it in case
75
+ # it is already part of the column set (don't magically add it):
76
+ if any(column.is_source for column in self.columns):
77
+ primary_key = self._source_primary_key
78
+ if (primary_key is not None and primary_key in self
79
+ and self[primary_key].is_source):
80
+ self.primary_key = primary_key
81
+ elif primary_key is not None:
79
82
  if primary_key not in self:
80
83
  self.add_column(primary_key)
81
84
  self.primary_key = primary_key
@@ -95,13 +98,22 @@ class Table(ABC):
95
98
  r"""The name of this table."""
96
99
  return self._name
97
100
 
98
- # Data column #############################################################
101
+ @property
102
+ def source_name(self) -> str:
103
+ r"""The source name of this table."""
104
+ return self._source_name
105
+
106
+ @property
107
+ def _quoted_source_name(self) -> str:
108
+ return quote_ident(self._source_name)
109
+
110
+ # Column ##################################################################
99
111
 
100
112
  def has_column(self, name: str) -> bool:
101
113
  r"""Returns ``True`` if this table holds a column with name ``name``;
102
114
  ``False`` otherwise.
103
115
  """
104
- return name in self._columns
116
+ return name in self._column_dict
105
117
 
106
118
  def column(self, name: str) -> Column:
107
119
  r"""Returns the data column named with name ``name`` in this table.
@@ -114,65 +126,113 @@ class Table(ABC):
114
126
  """
115
127
  if not self.has_column(name):
116
128
  raise KeyError(f"Column '{name}' not found in table '{self.name}'")
117
- return self._columns[name]
129
+ return self._column_dict[name]
118
130
 
119
131
  @property
120
- def columns(self) -> List[Column]:
132
+ def columns(self) -> list[Column]:
121
133
  r"""Returns a list of :class:`Column` objects that represent the
122
134
  columns in this table.
123
135
  """
124
- return list(self._columns.values())
136
+ return list(self._column_dict.values())
125
137
 
126
- def add_column(self, name: str) -> Column:
127
- r"""Adds a column to this table.
138
+ def add_columns(self, columns: Sequence[ColumnSpecType]) -> None:
139
+ r"""Adds a set of columns to this table.
128
140
 
129
141
  Args:
130
- name: The name of the column.
142
+ columns: The columns to add.
131
143
 
132
144
  Raises:
133
- KeyError: If ``name`` is already present in this table.
145
+ KeyError: If any of the column names already exist in this table.
134
146
  """
135
- if name in self:
136
- raise KeyError(f"Column '{name}' already exists in table "
137
- f"'{self.name}'")
138
-
139
- if name not in self._source_column_dict:
140
- raise KeyError(f"Column '{name}' does not exist in the underlying "
141
- f"source table")
142
-
143
- try:
144
- dtype = self._source_column_dict[name].dtype
145
- except Exception as e:
146
- raise RuntimeError(f"Could not obtain data type for column "
147
- f"'{name}' in table '{self.name}'. Change "
148
- f"the data type of the column in the source "
149
- f"table or remove it from the table.") from e
150
-
151
- try:
152
- ser = self._sample_df[name]
153
- if contains_id(ser, name, dtype):
154
- stype = Stype.ID
155
- elif contains_timestamp(ser, name, dtype):
156
- stype = Stype.timestamp
157
- elif contains_multicategorical(ser, name, dtype):
158
- stype = Stype.multicategorical
159
- elif contains_categorical(ser, name, dtype):
160
- stype = Stype.categorical
161
- else:
162
- stype = dtype.default_stype
163
- except Exception as e:
164
- raise RuntimeError(f"Could not obtain semantic type for column "
165
- f"'{name}' in table '{self.name}'. Change "
166
- f"the data type of the column in the source "
167
- f"table or remove it from the table.") from e
168
-
169
- self._columns[name] = Column(
170
- name=name,
171
- dtype=dtype,
172
- stype=stype,
173
- )
147
+ if len(columns) == 0:
148
+ return
174
149
 
175
- return self._columns[name]
150
+ column_specs = [ColumnSpec.coerce(column) for column in columns]
151
+
152
+ # Obtain a batch-wise sample for all column expressions:
153
+ expr_specs = [spec for spec in column_specs if not spec.is_source]
154
+ if len(expr_specs) > 0:
155
+ dfs = [
156
+ self._expr_sample_df,
157
+ self._get_expr_sample_df(expr_specs).reset_index(drop=True),
158
+ ]
159
+ size = min(map(len, dfs))
160
+ df = pd.concat([dfs[0].iloc[:size], dfs[1].iloc[:size]], axis=1)
161
+ df = df.loc[:, ~df.columns.duplicated(keep='last')]
162
+ self._expr_sample_df = df
163
+
164
+ for column_spec in column_specs:
165
+ if column_spec.name in self:
166
+ raise KeyError(f"Column '{column_spec.name}' already exists "
167
+ f"in table '{self.name}'")
168
+
169
+ dtype = column_spec.dtype
170
+ stype = column_spec.stype
171
+
172
+ if column_spec.is_source:
173
+ if column_spec.name not in self._source_column_dict:
174
+ raise ValueError(
175
+ f"Column '{column_spec.name}' does not exist in the "
176
+ f"underlying source table")
177
+
178
+ if dtype is None:
179
+ dtype = self._source_column_dict[column_spec.name].dtype
180
+
181
+ if dtype == Dtype.unsupported:
182
+ raise ValueError(
183
+ f"Encountered unsupported data type for column "
184
+ f"'{column_spec.name}' in table '{self.name}'. Please "
185
+ f"either change the column's data type or remove the "
186
+ f"column from this table.")
187
+
188
+ if dtype is None:
189
+ if column_spec.is_source:
190
+ ser = self._source_sample_df[column_spec.name]
191
+ else:
192
+ ser = self._expr_sample_df[column_spec.name]
193
+ try:
194
+ dtype = infer_dtype(ser)
195
+ except Exception as e:
196
+ raise RuntimeError(
197
+ f"Encountered unsupported data type '{ser.dtype}' for "
198
+ f"column '{column_spec.name}' in table '{self.name}'. "
199
+ f"Please either manually override the columns's data "
200
+ f"type or remove the column from this table.") from e
201
+
202
+ if stype is None:
203
+ if column_spec.is_source:
204
+ ser = self._source_sample_df[column_spec.name]
205
+ else:
206
+ ser = self._expr_sample_df[column_spec.name]
207
+ try:
208
+ stype = infer_stype(ser, column_spec.name, dtype)
209
+ except Exception as e:
210
+ raise RuntimeError(
211
+ f"Could not determine semantic type for column "
212
+ f"'{column_spec.name}' with data type '{dtype}' in "
213
+ f"table '{self.name}'. Please either change the "
214
+ f"column's data type or remove the column from this "
215
+ f"table.") from e
216
+
217
+ self._column_dict[column_spec.name] = Column(
218
+ name=column_spec.name,
219
+ expr=column_spec.expr,
220
+ dtype=dtype,
221
+ stype=stype,
222
+ )
223
+
224
+ def add_column(self, column: ColumnSpecType) -> Column:
225
+ r"""Adds a column to this table.
226
+
227
+ Args:
228
+ column: The column to add.
229
+
230
+ Raises:
231
+ KeyError: If the column name already exists in this table.
232
+ """
233
+ column_spec = ColumnSpec.coerce(column)
234
+ self.add_columns([column_spec])
235
+ return self[column_spec.name]
176
236
 
177
237
  def remove_column(self, name: str) -> Self:
178
238
  r"""Removes a column from this table.
@@ -192,7 +252,7 @@ class Table(ABC):
192
252
  self.time_column = None
193
253
  if self._end_time_column == name:
194
254
  self.end_time_column = None
195
- del self._columns[name]
255
+ del self._column_dict[name]
196
256
 
197
257
  return self
198
258
 
@@ -205,22 +265,22 @@ class Table(ABC):
205
265
  return self._primary_key is not None
206
266
 
207
267
  @property
208
- def primary_key(self) -> Optional[Column]:
268
+ def primary_key(self) -> Column | None:
209
269
  r"""The primary key column of this table.
210
270
 
211
271
  The getter returns the primary key column of this table, or ``None`` if
212
272
  no such primary key is present.
213
273
 
214
274
  The setter sets a column as a primary key on this table, and raises a
215
- :class:`ValueError` if the primary key has a non-ID semantic type or
216
- if the column name does not match a column in the data frame.
275
+ :class:`ValueError` if the primary key has a non-ID compatible data
276
+ type or if the column name does not match a column in the data frame.
217
277
  """
218
278
  if self._primary_key is None:
219
279
  return None
220
280
  return self[self._primary_key]
221
281
 
222
282
  @primary_key.setter
223
- def primary_key(self, name: Optional[str]) -> None:
283
+ def primary_key(self, name: str | None) -> None:
224
284
  if name is not None and name == self._time_column:
225
285
  raise ValueError(f"Cannot specify column '{name}' as a primary "
226
286
  f"key since it is already defined to be a time "
@@ -250,22 +310,23 @@ class Table(ABC):
250
310
  return self._time_column is not None
251
311
 
252
312
  @property
253
- def time_column(self) -> Optional[Column]:
313
+ def time_column(self) -> Column | None:
254
314
  r"""The time column of this table.
255
315
 
256
316
  The getter returns the time column of this table, or ``None`` if no
257
317
  such time column is present.
258
318
 
259
319
  The setter sets a column as a time column on this table, and raises a
260
- :class:`ValueError` if the time column has a non-timestamp semantic
261
- type or if the column name does not match a column in the data frame.
320
+ :class:`ValueError` if the time column has a non-timestamp compatible
321
+ data type or if the column name does not match a column in the data
322
+ frame.
262
323
  """
263
324
  if self._time_column is None:
264
325
  return None
265
326
  return self[self._time_column]
266
327
 
267
328
  @time_column.setter
268
- def time_column(self, name: Optional[str]) -> None:
329
+ def time_column(self, name: str | None) -> None:
269
330
  if name is not None and name == self._primary_key:
270
331
  raise ValueError(f"Cannot specify column '{name}' as a time "
271
332
  f"column since it is already defined to be a "
@@ -295,7 +356,7 @@ class Table(ABC):
295
356
  return self._end_time_column is not None
296
357
 
297
358
  @property
298
- def end_time_column(self) -> Optional[Column]:
359
+ def end_time_column(self) -> Column | None:
299
360
  r"""The end time column of this table.
300
361
 
301
362
  The getter returns the end time column of this table, or ``None`` if no
@@ -303,15 +364,15 @@ class Table(ABC):
303
364
 
304
365
  The setter sets a column as an end time column on this table, and
305
366
  raises a :class:`ValueError` if the end time column has a non-timestamp
306
- semantic type or if the column name does not match a column in the data
307
- frame.
367
+ compatible data type or if the column name does not match a column in
368
+ the data frame.
308
369
  """
309
370
  if self._end_time_column is None:
310
371
  return None
311
372
  return self[self._end_time_column]
312
373
 
313
374
  @end_time_column.setter
314
- def end_time_column(self, name: Optional[str]) -> None:
375
+ def end_time_column(self, name: str | None) -> None:
315
376
  if name is not None and name == self._primary_key:
316
377
  raise ValueError(f"Cannot specify column '{name}' as an end time "
317
378
  f"column since it is already defined to be a "
@@ -339,39 +400,39 @@ class Table(ABC):
339
400
  r"""Returns a :class:`pandas.DataFrame` object containing metadata
340
401
  information about the columns in this table.
341
402
 
342
- The returned dataframe has columns ``name``, ``dtype``, ``stype``,
343
- ``is_primary_key``, ``is_time_column`` and ``is_end_time_column``,
344
- which provide an aggregate view of the properties of the columns of
345
- this table.
403
+ The returned dataframe has columns ``"Name"``, ``"Data Type"``,
404
+ ``"Semantic Type"``, ``"Primary Key"``, ``"Time Column"`` and
405
+ ``"End Time Column"``, which provide an aggregated view of the
406
+ properties of the columns of this table.
346
407
 
347
408
  Example:
348
409
  >>> # doctest: +SKIP
349
410
  >>> import kumoai.experimental.rfm as rfm
350
411
  >>> table = rfm.LocalTable(df=..., name=...).infer_metadata()
351
412
  >>> table.metadata
352
- name dtype stype is_primary_key is_time_column is_end_time_column
353
- 0 CustomerID float64 ID True False False
413
+ Name Data Type Semantic Type Primary Key Time Column End Time Column
414
+ 0 CustomerID float64 ID True False False
354
415
  """ # noqa: E501
355
416
  cols = self.columns
356
417
 
357
418
  return pd.DataFrame({
358
- 'name':
419
+ 'Name':
359
420
  pd.Series(dtype=str, data=[c.name for c in cols]),
360
- 'dtype':
421
+ 'Data Type':
361
422
  pd.Series(dtype=str, data=[c.dtype for c in cols]),
362
- 'stype':
423
+ 'Semantic Type':
363
424
  pd.Series(dtype=str, data=[c.stype for c in cols]),
364
- 'is_primary_key':
425
+ 'Primary Key':
365
426
  pd.Series(
366
427
  dtype=bool,
367
428
  data=[self._primary_key == c.name for c in cols],
368
429
  ),
369
- 'is_time_column':
430
+ 'Time Column':
370
431
  pd.Series(
371
432
  dtype=bool,
372
433
  data=[self._time_column == c.name for c in cols],
373
434
  ),
374
- 'is_end_time_column':
435
+ 'End Time Column':
375
436
  pd.Series(
376
437
  dtype=bool,
377
438
  data=[self._end_time_column == c.name for c in cols],
@@ -380,28 +441,98 @@ class Table(ABC):
380
441
 
381
442
  def print_metadata(self) -> None:
382
443
  r"""Prints the :meth:`~metadata` of this table."""
383
- num_rows_repr = ''
384
- if self._num_rows is not None:
385
- num_rows_repr = ' ({self._num_rows:,} rows)'
386
-
387
- if in_notebook():
388
- from IPython.display import Markdown, display
389
- md_repr = f"### 🏷️ Metadata of Table `{self.name}`{num_rows_repr}"
390
- display(Markdown(md_repr))
391
- df = self.metadata
392
- try:
393
- if hasattr(df.style, 'hide'):
394
- display(df.style.hide(axis='index')) # pandas=2
395
- else:
396
- display(df.style.hide_index()) # pandas<1.3
397
- except ImportError:
398
- print(df.to_string(index=False)) # missing jinja2
399
- else:
400
- print(f"🏷️ Metadata of Table '{self.name}'{num_rows_repr}")
401
- print(self.metadata.to_string(index=False))
444
+ msg = f"🏷️ Metadata of Table `{self.name}`"
445
+ if num := self._num_rows:
446
+ msg += " (1 row)" if num == 1 else f" ({num:,} rows)"
447
+
448
+ display.title(msg)
449
+ display.dataframe(self.metadata)
450
+
451
+ def infer_primary_key(self, verbose: bool = True) -> Self:
452
+ r"""Infers the primary key in this table.
453
+
454
+ Args:
455
+ verbose: Whether to print verbose output.
456
+ """
457
+ if self.has_primary_key():
458
+ return self
459
+
460
+ def _set_primary_key(primary_key: str) -> None:
461
+ self.primary_key = primary_key
462
+ if verbose:
463
+ display.message(f"Inferred primary key `{primary_key}` for "
464
+ f"table `{self.name}`")
465
+
466
+ # Inference from source column metadata:
467
+ if any(column.is_source for column in self.columns):
468
+ primary_key = self._source_primary_key
469
+ if (primary_key is not None and primary_key in self
470
+ and self[primary_key].is_source):
471
+ _set_primary_key(primary_key)
472
+ return self
473
+
474
+ unique_keys = [
475
+ column.name for column in self._source_column_dict.values()
476
+ if column.is_unique_key
477
+ ]
478
+ if (len(unique_keys) == 1 # NOTE No composite keys yet.
479
+ and unique_keys[0] in self
480
+ and self[unique_keys[0]].is_source):
481
+ _set_primary_key(unique_keys[0])
482
+ return self
483
+
484
+ # Heuristic-based inference:
485
+ candidates = [
486
+ column.name for column in self.columns if column.stype == Stype.ID
487
+ ]
488
+ if len(candidates) == 0:
489
+ for column in self.columns:
490
+ if self.name.lower() == column.name.lower():
491
+ candidates.append(column.name)
492
+ elif (self.name.lower().endswith('s')
493
+ and self.name.lower()[:-1] == column.name.lower()):
494
+ candidates.append(column.name)
495
+
496
+ if primary_key := infer_primary_key(
497
+ table_name=self.name,
498
+ df=self._get_sample_df(),
499
+ candidates=candidates,
500
+ ):
501
+ _set_primary_key(primary_key)
502
+ return self
503
+
504
+ return self
505
+
506
+ def infer_time_column(self, verbose: bool = True) -> Self:
507
+ r"""Infers the time column in this table.
508
+
509
+ Args:
510
+ verbose: Whether to print verbose output.
511
+ """
512
+ if self.has_time_column():
513
+ return self
514
+
515
+ # Heuristic-based inference:
516
+ candidates = [
517
+ column.name for column in self.columns
518
+ if column.stype == Stype.timestamp
519
+ and column.name != self._end_time_column
520
+ ]
521
+
522
+ if time_column := infer_time_column(
523
+ df=self._get_sample_df(),
524
+ candidates=candidates,
525
+ ):
526
+ self.time_column = time_column
527
+
528
+ if verbose:
529
+ display.message(f"Inferred time column `{time_column}` for "
530
+ f"table `{self.name}`")
531
+
532
+ return self
402
533
 
403
534
  def infer_metadata(self, verbose: bool = True) -> Self:
404
- r"""Infers metadata, *i.e.*, primary keys and time columns, in the
535
+ r"""Infers metadata, *i.e.*, primary keys and time columns, in this
405
536
  table.
406
537
 
407
538
  Args:
@@ -409,48 +540,19 @@ class Table(ABC):
409
540
  """
410
541
  logs = []
411
542
 
412
- # Try to detect primary key if not set:
413
543
  if not self.has_primary_key():
544
+ self.infer_primary_key(verbose=False)
545
+ if self.has_primary_key():
546
+ logs.append(f"primary key `{self._primary_key}`")
414
547
 
415
- def is_candidate(column: Column) -> bool:
416
- if column.stype == Stype.ID:
417
- return True
418
- if all(column.stype != Stype.ID for column in self.columns):
419
- if self.name == column.name:
420
- return True
421
- if (self.name.endswith('s')
422
- and self.name[:-1] == column.name):
423
- return True
424
- return False
425
-
426
- candidates = [
427
- column.name for column in self.columns if is_candidate(column)
428
- ]
429
-
430
- if primary_key := infer_primary_key(
431
- table_name=self.name,
432
- df=self._sample_df,
433
- candidates=candidates,
434
- ):
435
- self.primary_key = primary_key
436
- logs.append(f"primary key '{primary_key}'")
437
-
438
- # Try to detect time column if not set:
439
548
  if not self.has_time_column():
440
- candidates = [
441
- column.name for column in self.columns
442
- if column.stype == Stype.timestamp
443
- and column.name != self._end_time_column
444
- ]
445
- if time_column := infer_time_column(
446
- df=self._sample_df,
447
- candidates=candidates,
448
- ):
449
- self.time_column = time_column
450
- logs.append(f"time column '{time_column}'")
549
+ self.infer_time_column(verbose=False)
550
+ if self.has_time_column():
551
+ logs.append(f"time column `{self._time_column}`")
451
552
 
452
553
  if verbose and len(logs) > 0:
453
- print(f"Detected {' and '.join(logs)} in table '{self.name}'")
554
+ display.message(f"Inferred {' and '.join(logs)} for table "
555
+ f"`{self.name}`")
454
556
 
455
557
  return self
456
558
 
@@ -468,6 +570,118 @@ class Table(ABC):
468
570
  end_time_col=self._end_time_column,
469
571
  )
470
572
 
573
+ @cached_property
574
+ def _source_column_dict(self) -> dict[str, SourceColumn]:
575
+ source_columns = self._get_source_columns()
576
+ if len(source_columns) == 0:
577
+ raise ValueError(f"Table '{self.name}' has no columns")
578
+ return {column.name: column for column in source_columns}
579
+
580
+ @cached_property
581
+ def _source_primary_key(self) -> str | None:
582
+ primary_keys = [
583
+ column.name for column in self._source_column_dict.values()
584
+ if column.is_primary_key
585
+ ]
586
+ # NOTE No composite keys yet.
587
+ return primary_keys[0] if len(primary_keys) == 1 else None
588
+
589
+ @cached_property
590
+ def _source_foreign_key_dict(self) -> dict[str, SourceForeignKey]:
591
+ return {key.name: key for key in self._get_source_foreign_keys()}
592
+
593
+ @cached_property
594
+ def _source_sample_df(self) -> pd.DataFrame:
595
+ return self._get_source_sample_df().reset_index(drop=True)
596
+
597
+ @cached_property
598
+ def _num_rows(self) -> int | None:
599
+ return self._get_num_rows()
600
+
601
+ def _get_sample_df(self) -> pd.DataFrame:
602
+ dfs: list[pd.DataFrame] = []
603
+ if any(column.is_source for column in self.columns):
604
+ dfs.append(self._source_sample_df)
605
+ if any(not column.is_source for column in self.columns):
606
+ dfs.append(self._expr_sample_df)
607
+
608
+ if len(dfs) == 0:
609
+ return pd.DataFrame(index=range(1000))
610
+ if len(dfs) == 1:
611
+ return dfs[0]
612
+
613
+ size = min(map(len, dfs))
614
+ df = pd.concat([dfs[0].iloc[:size], dfs[1].iloc[:size]], axis=1)
615
+ df = df.loc[:, ~df.columns.duplicated(keep='last')]
616
+ return df
617
+
618
+ @staticmethod
619
+ def _sanitize(
620
+ df: pd.DataFrame,
621
+ dtype_dict: dict[str, Dtype | None] | None = None,
622
+ stype_dict: dict[str, Stype | None] | None = None,
623
+ ) -> pd.DataFrame:
624
+ r"""Sanitizes a :class:`pandas.DataFrame` in-place such that its data
625
+ types match the table's data type and semantic type specification.
626
+ """
627
+ def _to_datetime(ser: pd.Series) -> pd.Series:
628
+ if (not pd.api.types.is_datetime64_any_dtype(ser)
629
+ and not (isinstance(ser.dtype, pd.ArrowDtype) and
630
+ pa.types.is_timestamp(ser.dtype.pyarrow_dtype))):
631
+ with warnings.catch_warnings():
632
+ warnings.filterwarnings(
633
+ 'ignore',
634
+ message='Could not infer format',
635
+ )
636
+ ser = pd.to_datetime(ser, errors='coerce')
637
+ if (isinstance(ser.dtype, pd.DatetimeTZDtype)
638
+ or (isinstance(ser.dtype, pd.ArrowDtype)
639
+ and ser.dtype.pyarrow_dtype.tz is not None)):
640
+ ser = ser.dt.tz_localize(None)
641
+ if ser.dtype != 'datetime64[ns]':
642
+ ser = ser.astype('datetime64[ns]')
643
+ return ser
644
+
645
+ def _to_list(ser: pd.Series, dtype: Dtype | None) -> pd.Series:
646
+ if (pd.api.types.is_string_dtype(ser)
647
+ and dtype in {Dtype.intlist, Dtype.floatlist}):
648
+ try:
649
+ ser = ser.map(lambda row: np.fromstring(
650
+ row.strip('[]'),
651
+ sep=',',
652
+ dtype=int if dtype == Dtype.intlist else np.float32,
653
+ ) if row is not None else None)
654
+ except Exception:
655
+ pass
656
+
657
+ if pd.api.types.is_string_dtype(ser):
658
+ try:
659
+ import orjson as json
660
+ except ImportError:
661
+ import json
662
+ try:
663
+ ser = ser.map(lambda row: json.loads(row)
664
+ if row is not None else None)
665
+ except Exception:
666
+ pass
667
+
668
+ return ser
669
+
670
+ for column_name in df.columns:
671
+ dtype = (dtype_dict or {}).get(column_name)
672
+ stype = (stype_dict or {}).get(column_name)
673
+
674
+ if dtype == Dtype.time:
675
+ df[column_name] = _to_datetime(df[column_name])
676
+ elif stype == Stype.timestamp:
677
+ df[column_name] = _to_datetime(df[column_name])
678
+ elif dtype is not None and dtype.is_list():
679
+ df[column_name] = _to_list(df[column_name], dtype)
680
+ elif stype == Stype.sequence:
681
+ df[column_name] = _to_list(df[column_name], Dtype.floatlist)
682
+
683
+ return df
684
+
471
685
  # Python builtins #########################################################
472
686
 
473
687
  def __hash__(self) -> int:
@@ -496,45 +710,32 @@ class Table(ABC):
496
710
  f' end_time_column={self._end_time_column},\n'
497
711
  f')')
498
712
 
499
- # Abstract method #########################################################
713
+ # Abstract Methods ########################################################
500
714
 
501
- @cached_property
502
- def _source_column_dict(self) -> Dict[str, SourceColumn]:
503
- return {col.name: col for col in self._get_source_columns()}
715
+ @property
716
+ @abstractmethod
717
+ def backend(self) -> DataBackend:
718
+ r"""The data backend of this table."""
504
719
 
505
720
  @abstractmethod
506
- def _get_source_columns(self) -> List[SourceColumn]:
721
+ def _get_source_columns(self) -> list[SourceColumn]:
507
722
  pass
508
723
 
509
- @cached_property
510
- def _source_foreign_key_dict(self) -> Dict[str, SourceForeignKey]:
511
- fkeys = self._get_source_foreign_keys()
512
- # NOTE Drop all keys that link to different primary keys in the same
513
- # table since we don't support composite keys yet:
514
- table_pkeys: Dict[str, Set[str]] = defaultdict(set)
515
- for fkey in fkeys:
516
- table_pkeys[fkey.dst_table].add(fkey.primary_key)
517
- return {
518
- fkey.name: fkey
519
- for fkey in fkeys if len(table_pkeys[fkey.dst_table]) == 1
520
- }
521
-
522
724
  @abstractmethod
523
- def _get_source_foreign_keys(self) -> List[SourceForeignKey]:
725
+ def _get_source_foreign_keys(self) -> list[SourceForeignKey]:
524
726
  pass
525
727
 
526
- @cached_property
527
- def _sample_df(self) -> pd.DataFrame:
528
- return self._get_sample_df()
529
-
530
728
  @abstractmethod
531
- def _get_sample_df(self) -> pd.DataFrame:
729
+ def _get_source_sample_df(self) -> pd.DataFrame:
532
730
  pass
533
731
 
534
- @cached_property
535
- def _num_rows(self) -> Optional[int]:
536
- return self._get_num_rows()
732
+ @abstractmethod
733
+ def _get_expr_sample_df(
734
+ self,
735
+ columns: Sequence[ColumnSpec],
736
+ ) -> pd.DataFrame:
737
+ pass
537
738
 
538
739
  @abstractmethod
539
- def _get_num_rows(self) -> Optional[int]:
740
+ def _get_num_rows(self) -> int | None:
540
741
  pass