kumoai 2.13.0.dev202512031731__cp312-cp312-macosx_11_0_arm64.whl → 2.14.0.dev202512301731__cp312-cp312-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. kumoai/__init__.py +35 -26
  2. kumoai/_version.py +1 -1
  3. kumoai/client/client.py +6 -0
  4. kumoai/client/jobs.py +24 -0
  5. kumoai/client/pquery.py +6 -2
  6. kumoai/experimental/rfm/__init__.py +49 -24
  7. kumoai/experimental/rfm/authenticate.py +3 -4
  8. kumoai/experimental/rfm/backend/local/__init__.py +4 -0
  9. kumoai/experimental/rfm/{local_graph_store.py → backend/local/graph_store.py} +62 -110
  10. kumoai/experimental/rfm/backend/local/sampler.py +312 -0
  11. kumoai/experimental/rfm/backend/local/table.py +32 -14
  12. kumoai/experimental/rfm/backend/snow/__init__.py +2 -0
  13. kumoai/experimental/rfm/backend/snow/sampler.py +297 -0
  14. kumoai/experimental/rfm/backend/snow/table.py +186 -39
  15. kumoai/experimental/rfm/backend/sqlite/__init__.py +4 -2
  16. kumoai/experimental/rfm/backend/sqlite/sampler.py +398 -0
  17. kumoai/experimental/rfm/backend/sqlite/table.py +131 -41
  18. kumoai/experimental/rfm/base/__init__.py +23 -3
  19. kumoai/experimental/rfm/base/column.py +96 -10
  20. kumoai/experimental/rfm/base/expression.py +44 -0
  21. kumoai/experimental/rfm/base/sampler.py +761 -0
  22. kumoai/experimental/rfm/base/source.py +2 -1
  23. kumoai/experimental/rfm/base/sql_sampler.py +143 -0
  24. kumoai/experimental/rfm/base/table.py +380 -185
  25. kumoai/experimental/rfm/graph.py +404 -144
  26. kumoai/experimental/rfm/infer/__init__.py +6 -4
  27. kumoai/experimental/rfm/infer/dtype.py +52 -60
  28. kumoai/experimental/rfm/infer/multicategorical.py +1 -1
  29. kumoai/experimental/rfm/infer/pkey.py +4 -2
  30. kumoai/experimental/rfm/infer/stype.py +35 -0
  31. kumoai/experimental/rfm/infer/time_col.py +1 -2
  32. kumoai/experimental/rfm/pquery/executor.py +27 -27
  33. kumoai/experimental/rfm/pquery/pandas_executor.py +30 -32
  34. kumoai/experimental/rfm/relbench.py +76 -0
  35. kumoai/experimental/rfm/rfm.py +283 -230
  36. kumoai/experimental/rfm/sagemaker.py +4 -4
  37. kumoai/pquery/predictive_query.py +10 -6
  38. kumoai/testing/snow.py +50 -0
  39. kumoai/trainer/distilled_trainer.py +175 -0
  40. kumoai/utils/__init__.py +3 -2
  41. kumoai/utils/display.py +51 -0
  42. kumoai/utils/progress_logger.py +178 -12
  43. kumoai/utils/sql.py +3 -0
  44. {kumoai-2.13.0.dev202512031731.dist-info → kumoai-2.14.0.dev202512301731.dist-info}/METADATA +4 -2
  45. {kumoai-2.13.0.dev202512031731.dist-info → kumoai-2.14.0.dev202512301731.dist-info}/RECORD +48 -38
  46. kumoai/experimental/rfm/local_graph_sampler.py +0 -223
  47. kumoai/experimental/rfm/local_pquery_driver.py +0 -689
  48. {kumoai-2.13.0.dev202512031731.dist-info → kumoai-2.14.0.dev202512301731.dist-info}/WHEEL +0 -0
  49. {kumoai-2.13.0.dev202512031731.dist-info → kumoai-2.14.0.dev202512301731.dist-info}/licenses/LICENSE +0 -0
  50. {kumoai-2.13.0.dev202512031731.dist-info → kumoai-2.14.0.dev202512301731.dist-info}/top_level.txt +0 -0
@@ -1,25 +1,32 @@
1
+ import warnings
1
2
  from abc import ABC, abstractmethod
2
- from collections import defaultdict
3
+ from collections.abc import Sequence
3
4
  from functools import cached_property
4
- from typing import Dict, List, Optional, Sequence, Set
5
5
 
6
+ import numpy as np
6
7
  import pandas as pd
8
+ from kumoapi.model_plan import MissingType
7
9
  from kumoapi.source_table import UnavailableSourceTable
8
10
  from kumoapi.table import Column as ColumnDefinition
9
11
  from kumoapi.table import TableDefinition
10
- from kumoapi.typing import Stype
12
+ from kumoapi.typing import Dtype, Stype
11
13
  from typing_extensions import Self
12
14
 
13
- from kumoai import in_notebook
14
- from kumoai.experimental.rfm.base import Column, SourceColumn, SourceForeignKey
15
+ from kumoai.experimental.rfm.base import (
16
+ Column,
17
+ ColumnSpec,
18
+ ColumnSpecType,
19
+ DataBackend,
20
+ SourceColumn,
21
+ SourceForeignKey,
22
+ )
15
23
  from kumoai.experimental.rfm.infer import (
16
- contains_categorical,
17
- contains_id,
18
- contains_multicategorical,
19
- contains_timestamp,
24
+ infer_dtype,
20
25
  infer_primary_key,
26
+ infer_stype,
21
27
  infer_time_column,
22
28
  )
29
+ from kumoai.utils import display, quote_ident
23
30
 
24
31
 
25
32
  class Table(ABC):
@@ -29,53 +36,48 @@ class Table(ABC):
29
36
 
30
37
  Args:
31
38
  name: The name of this table.
39
+ source_name: The source name of this table. If set to ``None``,
40
+ ``name`` is used.
32
41
  columns: The selected columns of this table.
33
42
  primary_key: The name of the primary key of this table, if it exists.
34
43
  time_column: The name of the time column of this table, if it exists.
35
44
  end_time_column: The name of the end time column of this table, if it
36
45
  exists.
37
46
  """
47
+ _NUM_SAMPLE_ROWS = 1_000
48
+
38
49
  def __init__(
39
50
  self,
40
51
  name: str,
41
- columns: Optional[Sequence[str]] = None,
42
- primary_key: Optional[str] = None,
43
- time_column: Optional[str] = None,
44
- end_time_column: Optional[str] = None,
52
+ source_name: str | None = None,
53
+ columns: Sequence[ColumnSpecType] | None = None,
54
+ primary_key: MissingType | str | None = MissingType.VALUE,
55
+ time_column: str | None = None,
56
+ end_time_column: str | None = None,
45
57
  ) -> None:
46
58
 
47
59
  self._name = name
48
- self._primary_key: Optional[str] = None
49
- self._time_column: Optional[str] = None
50
- self._end_time_column: Optional[str] = None
51
-
52
- if len(self._source_column_dict) == 0:
53
- raise ValueError(f"Table '{name}' does not hold any column with "
54
- f"a supported data type")
55
-
56
- primary_keys = [
57
- column.name for column in self._source_column_dict.values()
58
- if column.is_primary_key
59
- ]
60
- if len(primary_keys) == 1: # NOTE No composite keys yet.
61
- if primary_key is not None and primary_key != primary_keys[0]:
62
- raise ValueError(f"Found duplicate primary key "
63
- f"definition '{primary_key}' and "
64
- f"'{primary_keys[0]}' in table '{name}'")
65
- primary_key = primary_keys[0]
66
-
67
- unique_keys = [
68
- column.name for column in self._source_column_dict.values()
69
- if column.is_unique_key
70
- ]
71
- if primary_key is None and len(unique_keys) == 1:
72
- primary_key = unique_keys[0]
73
-
74
- self._columns: Dict[str, Column] = {}
75
- for column_name in columns or list(self._source_column_dict.keys()):
76
- self.add_column(column_name)
77
-
78
- if primary_key is not None:
60
+ self._source_name = source_name or name
61
+ self._column_dict: dict[str, Column] = {}
62
+ self._primary_key: str | None = None
63
+ self._time_column: str | None = None
64
+ self._end_time_column: str | None = None
65
+ self._expr_sample_df = pd.DataFrame(index=range(self._NUM_SAMPLE_ROWS))
66
+
67
+ if columns is None:
68
+ columns = list(self._source_column_dict.keys())
69
+
70
+ self.add_columns(columns)
71
+
72
+ if isinstance(primary_key, MissingType):
73
+ # Infer primary key from source metadata, but only set it in case
74
+ # it is already part of the column set (don't magically add it):
75
+ if any(column.is_source for column in self.columns):
76
+ primary_key = self._source_primary_key
77
+ if (primary_key is not None and primary_key in self
78
+ and self[primary_key].is_source):
79
+ self.primary_key = primary_key
80
+ elif primary_key is not None:
79
81
  if primary_key not in self:
80
82
  self.add_column(primary_key)
81
83
  self.primary_key = primary_key
@@ -95,13 +97,22 @@ class Table(ABC):
95
97
  r"""The name of this table."""
96
98
  return self._name
97
99
 
98
- # Data column #############################################################
100
+ @property
101
+ def source_name(self) -> str:
102
+ r"""The source name of this table."""
103
+ return self._source_name
104
+
105
+ @property
106
+ def _quoted_source_name(self) -> str:
107
+ return quote_ident(self._source_name)
108
+
109
+ # Column ##################################################################
99
110
 
100
111
  def has_column(self, name: str) -> bool:
101
112
  r"""Returns ``True`` if this table holds a column with name ``name``;
102
113
  ``False`` otherwise.
103
114
  """
104
- return name in self._columns
115
+ return name in self._column_dict
105
116
 
106
117
  def column(self, name: str) -> Column:
107
118
  r"""Returns the data column named with name ``name`` in this table.
@@ -114,65 +125,113 @@ class Table(ABC):
114
125
  """
115
126
  if not self.has_column(name):
116
127
  raise KeyError(f"Column '{name}' not found in table '{self.name}'")
117
- return self._columns[name]
128
+ return self._column_dict[name]
118
129
 
119
130
  @property
120
- def columns(self) -> List[Column]:
131
+ def columns(self) -> list[Column]:
121
132
  r"""Returns a list of :class:`Column` objects that represent the
122
133
  columns in this table.
123
134
  """
124
- return list(self._columns.values())
135
+ return list(self._column_dict.values())
125
136
 
126
- def add_column(self, name: str) -> Column:
127
- r"""Adds a column to this table.
137
+ def add_columns(self, columns: Sequence[ColumnSpecType]) -> None:
138
+ r"""Adds a set of columns to this table.
128
139
 
129
140
  Args:
130
- name: The name of the column.
141
+ columns: The columns to add.
131
142
 
132
143
  Raises:
133
- KeyError: If ``name`` is already present in this table.
144
+ KeyError: If any of the column names already exist in this table.
134
145
  """
135
- if name in self:
136
- raise KeyError(f"Column '{name}' already exists in table "
137
- f"'{self.name}'")
138
-
139
- if name not in self._source_column_dict:
140
- raise KeyError(f"Column '{name}' does not exist in the underlying "
141
- f"source table")
142
-
143
- try:
144
- dtype = self._source_column_dict[name].dtype
145
- except Exception as e:
146
- raise RuntimeError(f"Could not obtain data type for column "
147
- f"'{name}' in table '{self.name}'. Change "
148
- f"the data type of the column in the source "
149
- f"table or remove it from the table.") from e
150
-
151
- try:
152
- ser = self._sample_df[name]
153
- if contains_id(ser, name, dtype):
154
- stype = Stype.ID
155
- elif contains_timestamp(ser, name, dtype):
156
- stype = Stype.timestamp
157
- elif contains_multicategorical(ser, name, dtype):
158
- stype = Stype.multicategorical
159
- elif contains_categorical(ser, name, dtype):
160
- stype = Stype.categorical
161
- else:
162
- stype = dtype.default_stype
163
- except Exception as e:
164
- raise RuntimeError(f"Could not obtain semantic type for column "
165
- f"'{name}' in table '{self.name}'. Change "
166
- f"the data type of the column in the source "
167
- f"table or remove it from the table.") from e
168
-
169
- self._columns[name] = Column(
170
- name=name,
171
- dtype=dtype,
172
- stype=stype,
173
- )
146
+ if len(columns) == 0:
147
+ return
174
148
 
175
- return self._columns[name]
149
+ column_specs = [ColumnSpec.coerce(column) for column in columns]
150
+
151
+ # Obtain a batch-wise sample for all column expressions:
152
+ expr_specs = [spec for spec in column_specs if not spec.is_source]
153
+ if len(expr_specs) > 0:
154
+ dfs = [
155
+ self._expr_sample_df,
156
+ self._get_expr_sample_df(expr_specs).reset_index(drop=True),
157
+ ]
158
+ size = min(map(len, dfs))
159
+ df = pd.concat([dfs[0].iloc[:size], dfs[1].iloc[:size]], axis=1)
160
+ df = df.loc[:, ~df.columns.duplicated(keep='last')]
161
+ self._expr_sample_df = df
162
+
163
+ for column_spec in column_specs:
164
+ if column_spec.name in self:
165
+ raise KeyError(f"Column '{column_spec.name}' already exists "
166
+ f"in table '{self.name}'")
167
+
168
+ dtype = column_spec.dtype
169
+ stype = column_spec.stype
170
+
171
+ if column_spec.is_source:
172
+ if column_spec.name not in self._source_column_dict:
173
+ raise ValueError(
174
+ f"Column '{column_spec.name}' does not exist in the "
175
+ f"underlying source table")
176
+
177
+ if dtype is None:
178
+ dtype = self._source_column_dict[column_spec.name].dtype
179
+
180
+ if dtype == Dtype.unsupported:
181
+ raise ValueError(
182
+ f"Encountered unsupported data type for column "
183
+ f"'{column_spec.name}' in table '{self.name}'. Please "
184
+ f"either change the column's data type or remove the "
185
+ f"column from this table.")
186
+
187
+ if dtype is None:
188
+ if column_spec.is_source:
189
+ ser = self._source_sample_df[column_spec.name]
190
+ else:
191
+ ser = self._expr_sample_df[column_spec.name]
192
+ try:
193
+ dtype = infer_dtype(ser)
194
+ except Exception as e:
195
+ raise RuntimeError(
196
+ f"Encountered unsupported data type '{ser.dtype}' for "
197
+ f"column '{column_spec.name}' in table '{self.name}'. "
198
+ f"Please either manually override the column's data "
199
+ f"type or remove the column from this table.") from e
200
+
201
+ if stype is None:
202
+ if column_spec.is_source:
203
+ ser = self._source_sample_df[column_spec.name]
204
+ else:
205
+ ser = self._expr_sample_df[column_spec.name]
206
+ try:
207
+ stype = infer_stype(ser, column_spec.name, dtype)
208
+ except Exception as e:
209
+ raise RuntimeError(
210
+ f"Could not determine semantic type for column "
211
+ f"'{column_spec.name}' with data type '{dtype}' in "
212
+ f"table '{self.name}'. Please either change the "
213
+ f"column's data type or remove the column from this "
214
+ f"table.") from e
215
+
216
+ self._column_dict[column_spec.name] = Column(
217
+ name=column_spec.name,
218
+ expr=column_spec.expr,
219
+ dtype=dtype,
220
+ stype=stype,
221
+ )
222
+
223
+ def add_column(self, column: ColumnSpecType) -> Column:
224
+ r"""Adds a column to this table.
225
+
226
+ Args:
227
+ column: The column to add.
228
+
229
+ Raises:
230
+ KeyError: If the column name already exists in this table.
231
+ """
232
+ column_spec = ColumnSpec.coerce(column)
233
+ self.add_columns([column_spec])
234
+ return self[column_spec.name]
176
235
 
177
236
  def remove_column(self, name: str) -> Self:
178
237
  r"""Removes a column from this table.
@@ -192,7 +251,7 @@ class Table(ABC):
192
251
  self.time_column = None
193
252
  if self._end_time_column == name:
194
253
  self.end_time_column = None
195
- del self._columns[name]
254
+ del self._column_dict[name]
196
255
 
197
256
  return self
198
257
 
@@ -205,7 +264,7 @@ class Table(ABC):
205
264
  return self._primary_key is not None
206
265
 
207
266
  @property
208
- def primary_key(self) -> Optional[Column]:
267
+ def primary_key(self) -> Column | None:
209
268
  r"""The primary key column of this table.
210
269
 
211
270
  The getter returns the primary key column of this table, or ``None`` if
@@ -220,7 +279,7 @@ class Table(ABC):
220
279
  return self[self._primary_key]
221
280
 
222
281
  @primary_key.setter
223
- def primary_key(self, name: Optional[str]) -> None:
282
+ def primary_key(self, name: str | None) -> None:
224
283
  if name is not None and name == self._time_column:
225
284
  raise ValueError(f"Cannot specify column '{name}' as a primary "
226
285
  f"key since it is already defined to be a time "
@@ -250,7 +309,7 @@ class Table(ABC):
250
309
  return self._time_column is not None
251
310
 
252
311
  @property
253
- def time_column(self) -> Optional[Column]:
312
+ def time_column(self) -> Column | None:
254
313
  r"""The time column of this table.
255
314
 
256
315
  The getter returns the time column of this table, or ``None`` if no
@@ -265,7 +324,7 @@ class Table(ABC):
265
324
  return self[self._time_column]
266
325
 
267
326
  @time_column.setter
268
- def time_column(self, name: Optional[str]) -> None:
327
+ def time_column(self, name: str | None) -> None:
269
328
  if name is not None and name == self._primary_key:
270
329
  raise ValueError(f"Cannot specify column '{name}' as a time "
271
330
  f"column since it is already defined to be a "
@@ -295,7 +354,7 @@ class Table(ABC):
295
354
  return self._end_time_column is not None
296
355
 
297
356
  @property
298
- def end_time_column(self) -> Optional[Column]:
357
+ def end_time_column(self) -> Column | None:
299
358
  r"""The end time column of this table.
300
359
 
301
360
  The getter returns the end time column of this table, or ``None`` if no
@@ -311,7 +370,7 @@ class Table(ABC):
311
370
  return self[self._end_time_column]
312
371
 
313
372
  @end_time_column.setter
314
- def end_time_column(self, name: Optional[str]) -> None:
373
+ def end_time_column(self, name: str | None) -> None:
315
374
  if name is not None and name == self._primary_key:
316
375
  raise ValueError(f"Cannot specify column '{name}' as an end time "
317
376
  f"column since it is already defined to be a "
@@ -380,28 +439,98 @@ class Table(ABC):
380
439
 
381
440
  def print_metadata(self) -> None:
382
441
  r"""Prints the :meth:`~metadata` of this table."""
383
- num_rows_repr = ''
384
- if self._num_rows is not None:
385
- num_rows_repr = ' ({self._num_rows:,} rows)'
386
-
387
- if in_notebook():
388
- from IPython.display import Markdown, display
389
- md_repr = f"### 🏷️ Metadata of Table `{self.name}`{num_rows_repr}"
390
- display(Markdown(md_repr))
391
- df = self.metadata
392
- try:
393
- if hasattr(df.style, 'hide'):
394
- display(df.style.hide(axis='index')) # pandas=2
395
- else:
396
- display(df.style.hide_index()) # pandas<1.3
397
- except ImportError:
398
- print(df.to_string(index=False)) # missing jinja2
399
- else:
400
- print(f"🏷️ Metadata of Table '{self.name}'{num_rows_repr}")
401
- print(self.metadata.to_string(index=False))
442
+ msg = f"🏷️ Metadata of Table `{self.name}`"
443
+ if num := self._num_rows:
444
+ msg += " (1 row)" if num == 1 else f" ({num:,} rows)"
445
+
446
+ display.title(msg)
447
+ display.dataframe(self.metadata)
448
+
449
+ def infer_primary_key(self, verbose: bool = True) -> Self:
450
+ r"""Infers the primary key in this table.
451
+
452
+ Args:
453
+ verbose: Whether to print verbose output.
454
+ """
455
+ if self.has_primary_key():
456
+ return self
457
+
458
+ def _set_primary_key(primary_key: str) -> None:
459
+ self.primary_key = primary_key
460
+ if verbose:
461
+ display.message(f"Inferred primary key `{primary_key}` for "
462
+ f"table `{self.name}`")
463
+
464
+ # Inference from source column metadata:
465
+ if any(column.is_source for column in self.columns):
466
+ primary_key = self._source_primary_key
467
+ if (primary_key is not None and primary_key in self
468
+ and self[primary_key].is_source):
469
+ _set_primary_key(primary_key)
470
+ return self
471
+
472
+ unique_keys = [
473
+ column.name for column in self._source_column_dict.values()
474
+ if column.is_unique_key
475
+ ]
476
+ if (len(unique_keys) == 1 # NOTE No composite keys yet.
477
+ and unique_keys[0] in self
478
+ and self[unique_keys[0]].is_source):
479
+ _set_primary_key(unique_keys[0])
480
+ return self
481
+
482
+ # Heuristic-based inference:
483
+ candidates = [
484
+ column.name for column in self.columns if column.stype == Stype.ID
485
+ ]
486
+ if len(candidates) == 0:
487
+ for column in self.columns:
488
+ if self.name.lower() == column.name.lower():
489
+ candidates.append(column.name)
490
+ elif (self.name.lower().endswith('s')
491
+ and self.name.lower()[:-1] == column.name.lower()):
492
+ candidates.append(column.name)
493
+
494
+ if primary_key := infer_primary_key(
495
+ table_name=self.name,
496
+ df=self._get_sample_df(),
497
+ candidates=candidates,
498
+ ):
499
+ _set_primary_key(primary_key)
500
+ return self
501
+
502
+ return self
503
+
504
+ def infer_time_column(self, verbose: bool = True) -> Self:
505
+ r"""Infers the time column in this table.
506
+
507
+ Args:
508
+ verbose: Whether to print verbose output.
509
+ """
510
+ if self.has_time_column():
511
+ return self
512
+
513
+ # Heuristic-based inference:
514
+ candidates = [
515
+ column.name for column in self.columns
516
+ if column.stype == Stype.timestamp
517
+ and column.name != self._end_time_column
518
+ ]
519
+
520
+ if time_column := infer_time_column(
521
+ df=self._get_sample_df(),
522
+ candidates=candidates,
523
+ ):
524
+ self.time_column = time_column
525
+
526
+ if verbose:
527
+ display.message(f"Inferred time column `{time_column}` for "
528
+ f"table `{self.name}`")
529
+
530
+ return self
402
531
 
403
532
  def infer_metadata(self, verbose: bool = True) -> Self:
404
- r"""Infers metadata, *i.e.*, primary keys and time columns, in the
533
+ r"""Infers metadata, *i.e.*, primary keys and time columns, in this
405
534
  table.
406
535
 
407
536
  Args:
@@ -409,48 +538,19 @@ class Table(ABC):
409
538
  """
410
539
  logs = []
411
540
 
412
- # Try to detect primary key if not set:
413
541
  if not self.has_primary_key():
542
+ self.infer_primary_key(verbose=False)
543
+ if self.has_primary_key():
544
+ logs.append(f"primary key `{self._primary_key}`")
414
545
 
415
- def is_candidate(column: Column) -> bool:
416
- if column.stype == Stype.ID:
417
- return True
418
- if all(column.stype != Stype.ID for column in self.columns):
419
- if self.name == column.name:
420
- return True
421
- if (self.name.endswith('s')
422
- and self.name[:-1] == column.name):
423
- return True
424
- return False
425
-
426
- candidates = [
427
- column.name for column in self.columns if is_candidate(column)
428
- ]
429
-
430
- if primary_key := infer_primary_key(
431
- table_name=self.name,
432
- df=self._sample_df,
433
- candidates=candidates,
434
- ):
435
- self.primary_key = primary_key
436
- logs.append(f"primary key '{primary_key}'")
437
-
438
- # Try to detect time column if not set:
439
546
  if not self.has_time_column():
440
- candidates = [
441
- column.name for column in self.columns
442
- if column.stype == Stype.timestamp
443
- and column.name != self._end_time_column
444
- ]
445
- if time_column := infer_time_column(
446
- df=self._sample_df,
447
- candidates=candidates,
448
- ):
449
- self.time_column = time_column
450
- logs.append(f"time column '{time_column}'")
547
+ self.infer_time_column(verbose=False)
548
+ if self.has_time_column():
549
+ logs.append(f"time column `{self._time_column}`")
451
550
 
452
551
  if verbose and len(logs) > 0:
453
- print(f"Detected {' and '.join(logs)} in table '{self.name}'")
552
+ display.message(f"Inferred {' and '.join(logs)} for table "
553
+ f"`{self.name}`")
454
554
 
455
555
  return self
456
556
 
@@ -468,6 +568,114 @@ class Table(ABC):
468
568
  end_time_col=self._end_time_column,
469
569
  )
470
570
 
571
+ @cached_property
572
+ def _source_column_dict(self) -> dict[str, SourceColumn]:
573
+ source_columns = self._get_source_columns()
574
+ if len(source_columns) == 0:
575
+ raise ValueError(f"Table '{self.name}' has no columns")
576
+ return {column.name: column for column in source_columns}
577
+
578
+ @cached_property
579
+ def _source_primary_key(self) -> str | None:
580
+ primary_keys = [
581
+ column.name for column in self._source_column_dict.values()
582
+ if column.is_primary_key
583
+ ]
584
+ # NOTE No composite keys yet.
585
+ return primary_keys[0] if len(primary_keys) == 1 else None
586
+
587
+ @cached_property
588
+ def _source_foreign_key_dict(self) -> dict[str, SourceForeignKey]:
589
+ return {key.name: key for key in self._get_source_foreign_keys()}
590
+
591
+ @cached_property
592
+ def _source_sample_df(self) -> pd.DataFrame:
593
+ return self._get_source_sample_df().reset_index(drop=True)
594
+
595
+ @cached_property
596
+ def _num_rows(self) -> int | None:
597
+ return self._get_num_rows()
598
+
599
+ def _get_sample_df(self) -> pd.DataFrame:
600
+ dfs: list[pd.DataFrame] = []
601
+ if any(column.is_source for column in self.columns):
602
+ dfs.append(self._source_sample_df)
603
+ if any(not column.is_source for column in self.columns):
604
+ dfs.append(self._expr_sample_df)
605
+
606
+ if len(dfs) == 0:
607
+ return pd.DataFrame(index=range(1000))
608
+ if len(dfs) == 1:
609
+ return dfs[0]
610
+
611
+ size = min(map(len, dfs))
612
+ df = pd.concat([dfs[0].iloc[:size], dfs[1].iloc[:size]], axis=1)
613
+ df = df.loc[:, ~df.columns.duplicated(keep='last')]
614
+ return df
615
+
616
+ @staticmethod
617
+ def _sanitize(
618
+ df: pd.DataFrame,
619
+ dtype_dict: dict[str, Dtype | None] | None = None,
620
+ stype_dict: dict[str, Stype | None] | None = None,
621
+ ) -> pd.DataFrame:
622
+ r"""Sanitizes a :class:`pandas.DataFrame` in-place such that its data
623
+ types match table data and semantic type specification.
624
+ """
625
+ def _to_datetime(ser: pd.Series) -> pd.Series:
626
+ if not pd.api.types.is_datetime64_any_dtype(ser):
627
+ with warnings.catch_warnings():
628
+ warnings.filterwarnings(
629
+ 'ignore',
630
+ message='Could not infer format',
631
+ )
632
+ ser = pd.to_datetime(ser, errors='coerce')
633
+ if isinstance(ser.dtype, pd.DatetimeTZDtype):
634
+ ser = ser.dt.tz_localize(None)
635
+ if ser.dtype != 'datetime64[ns]':
636
+ ser = ser.astype('datetime64[ns]')
637
+ return ser
638
+
639
+ def _to_list(ser: pd.Series, dtype: Dtype | None) -> pd.Series:
640
+ if (pd.api.types.is_string_dtype(ser)
641
+ and dtype in {Dtype.intlist, Dtype.floatlist}):
642
+ try:
643
+ ser = ser.map(lambda row: np.fromstring(
644
+ row.strip('[]'),
645
+ sep=',',
646
+ dtype=int if dtype == Dtype.intlist else np.float32,
647
+ ) if row is not None else None)
648
+ except Exception:
649
+ pass
650
+
651
+ if pd.api.types.is_string_dtype(ser):
652
+ try:
653
+ import orjson as json
654
+ except ImportError:
655
+ import json
656
+ try:
657
+ ser = ser.map(lambda row: json.loads(row)
658
+ if row is not None else None)
659
+ except Exception:
660
+ pass
661
+
662
+ return ser
663
+
664
+ for column_name in df.columns:
665
+ dtype = (dtype_dict or {}).get(column_name)
666
+ stype = (stype_dict or {}).get(column_name)
667
+
668
+ if dtype == Dtype.time:
669
+ df[column_name] = _to_datetime(df[column_name])
670
+ elif stype == Stype.timestamp:
671
+ df[column_name] = _to_datetime(df[column_name])
672
+ elif dtype is not None and dtype.is_list():
673
+ df[column_name] = _to_list(df[column_name], dtype)
674
+ elif stype == Stype.sequence:
675
+ df[column_name] = _to_list(df[column_name], Dtype.floatlist)
676
+
677
+ return df
678
+
471
679
  # Python builtins #########################################################
472
680
 
473
681
  def __hash__(self) -> int:
@@ -496,45 +704,32 @@ class Table(ABC):
496
704
  f' end_time_column={self._end_time_column},\n'
497
705
  f')')
498
706
 
499
- # Abstract method #########################################################
707
+ # Abstract Methods ########################################################
500
708
 
501
- @cached_property
502
- def _source_column_dict(self) -> Dict[str, SourceColumn]:
503
- return {col.name: col for col in self._get_source_columns()}
709
+ @property
710
+ @abstractmethod
711
+ def backend(self) -> DataBackend:
712
+ r"""The data backend of this table."""
504
713
 
505
714
  @abstractmethod
506
- def _get_source_columns(self) -> List[SourceColumn]:
715
+ def _get_source_columns(self) -> list[SourceColumn]:
507
716
  pass
508
717
 
509
- @cached_property
510
- def _source_foreign_key_dict(self) -> Dict[str, SourceForeignKey]:
511
- fkeys = self._get_source_foreign_keys()
512
- # NOTE Drop all keys that link to different primary keys in the same
513
- # table since we don't support composite keys yet:
514
- table_pkeys: Dict[str, Set[str]] = defaultdict(set)
515
- for fkey in fkeys:
516
- table_pkeys[fkey.dst_table].add(fkey.primary_key)
517
- return {
518
- fkey.name: fkey
519
- for fkey in fkeys if len(table_pkeys[fkey.dst_table]) == 1
520
- }
521
-
522
718
  @abstractmethod
523
- def _get_source_foreign_keys(self) -> List[SourceForeignKey]:
719
+ def _get_source_foreign_keys(self) -> list[SourceForeignKey]:
524
720
  pass
525
721
 
526
- @cached_property
527
- def _sample_df(self) -> pd.DataFrame:
528
- return self._get_sample_df()
529
-
530
722
  @abstractmethod
531
- def _get_sample_df(self) -> pd.DataFrame:
723
+ def _get_source_sample_df(self) -> pd.DataFrame:
532
724
  pass
533
725
 
534
- @cached_property
535
- def _num_rows(self) -> Optional[int]:
536
- return self._get_num_rows()
726
+ @abstractmethod
727
+ def _get_expr_sample_df(
728
+ self,
729
+ columns: Sequence[ColumnSpec],
730
+ ) -> pd.DataFrame:
731
+ pass
537
732
 
538
733
  @abstractmethod
539
- def _get_num_rows(self) -> Optional[int]:
734
+ def _get_num_rows(self) -> int | None:
540
735
  pass