kumoai 2.14.0.dev202512181731__cp312-cp312-macosx_11_0_arm64.whl → 2.14.0.dev202512301731__cp312-cp312-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. kumoai/__init__.py +23 -26
  2. kumoai/_version.py +1 -1
  3. kumoai/client/client.py +6 -0
  4. kumoai/client/jobs.py +24 -0
  5. kumoai/experimental/rfm/__init__.py +22 -22
  6. kumoai/experimental/rfm/backend/local/graph_store.py +12 -21
  7. kumoai/experimental/rfm/backend/local/sampler.py +0 -3
  8. kumoai/experimental/rfm/backend/local/table.py +25 -24
  9. kumoai/experimental/rfm/backend/snow/sampler.py +106 -61
  10. kumoai/experimental/rfm/backend/snow/table.py +146 -51
  11. kumoai/experimental/rfm/backend/sqlite/sampler.py +127 -78
  12. kumoai/experimental/rfm/backend/sqlite/table.py +94 -47
  13. kumoai/experimental/rfm/base/__init__.py +6 -7
  14. kumoai/experimental/rfm/base/column.py +97 -5
  15. kumoai/experimental/rfm/base/expression.py +44 -0
  16. kumoai/experimental/rfm/base/sampler.py +5 -17
  17. kumoai/experimental/rfm/base/source.py +1 -1
  18. kumoai/experimental/rfm/base/sql_sampler.py +68 -9
  19. kumoai/experimental/rfm/base/table.py +284 -120
  20. kumoai/experimental/rfm/graph.py +139 -86
  21. kumoai/experimental/rfm/infer/__init__.py +6 -4
  22. kumoai/experimental/rfm/infer/dtype.py +6 -1
  23. kumoai/experimental/rfm/infer/multicategorical.py +1 -1
  24. kumoai/experimental/rfm/infer/stype.py +35 -0
  25. kumoai/experimental/rfm/relbench.py +76 -0
  26. kumoai/experimental/rfm/rfm.py +4 -20
  27. kumoai/trainer/distilled_trainer.py +175 -0
  28. kumoai/utils/display.py +51 -0
  29. {kumoai-2.14.0.dev202512181731.dist-info → kumoai-2.14.0.dev202512301731.dist-info}/METADATA +1 -1
  30. {kumoai-2.14.0.dev202512181731.dist-info → kumoai-2.14.0.dev202512301731.dist-info}/RECORD +33 -30
  31. kumoai/experimental/rfm/base/column_expression.py +0 -16
  32. kumoai/experimental/rfm/base/sql_table.py +0 -113
  33. {kumoai-2.14.0.dev202512181731.dist-info → kumoai-2.14.0.dev202512301731.dist-info}/WHEEL +0 -0
  34. {kumoai-2.14.0.dev202512181731.dist-info → kumoai-2.14.0.dev202512301731.dist-info}/licenses/LICENSE +0 -0
  35. {kumoai-2.14.0.dev202512181731.dist-info → kumoai-2.14.0.dev202512301731.dist-info}/top_level.txt +0 -0
@@ -1,25 +1,32 @@
1
+ import warnings
1
2
  from abc import ABC, abstractmethod
2
3
  from collections.abc import Sequence
3
4
  from functools import cached_property
4
5
 
6
+ import numpy as np
5
7
  import pandas as pd
6
8
  from kumoapi.model_plan import MissingType
7
9
  from kumoapi.source_table import UnavailableSourceTable
8
10
  from kumoapi.table import Column as ColumnDefinition
9
11
  from kumoapi.table import TableDefinition
10
- from kumoapi.typing import Stype
12
+ from kumoapi.typing import Dtype, Stype
11
13
  from typing_extensions import Self
12
14
 
13
- from kumoai import in_notebook, in_snowflake_notebook
14
- from kumoai.experimental.rfm.base import Column, DataBackend, SourceColumn
15
+ from kumoai.experimental.rfm.base import (
16
+ Column,
17
+ ColumnSpec,
18
+ ColumnSpecType,
19
+ DataBackend,
20
+ SourceColumn,
21
+ SourceForeignKey,
22
+ )
15
23
  from kumoai.experimental.rfm.infer import (
16
- contains_categorical,
17
- contains_id,
18
- contains_multicategorical,
19
- contains_timestamp,
24
+ infer_dtype,
20
25
  infer_primary_key,
26
+ infer_stype,
21
27
  infer_time_column,
22
28
  )
29
+ from kumoai.utils import display, quote_ident
23
30
 
24
31
 
25
32
  class Table(ABC):
@@ -29,41 +36,48 @@ class Table(ABC):
29
36
 
30
37
  Args:
31
38
  name: The name of this table.
39
+ source_name: The source name of this table. If set to ``None``,
40
+ ``name`` is being used.
32
41
  columns: The selected columns of this table.
33
42
  primary_key: The name of the primary key of this table, if it exists.
34
43
  time_column: The name of the time column of this table, if it exists.
35
44
  end_time_column: The name of the end time column of this table, if it
36
45
  exists.
37
46
  """
47
+ _NUM_SAMPLE_ROWS = 1_000
48
+
38
49
  def __init__(
39
50
  self,
40
51
  name: str,
41
- columns: Sequence[str] | None = None,
52
+ source_name: str | None = None,
53
+ columns: Sequence[ColumnSpecType] | None = None,
42
54
  primary_key: MissingType | str | None = MissingType.VALUE,
43
55
  time_column: str | None = None,
44
56
  end_time_column: str | None = None,
45
57
  ) -> None:
46
58
 
47
59
  self._name = name
60
+ self._source_name = source_name or name
61
+ self._column_dict: dict[str, Column] = {}
48
62
  self._primary_key: str | None = None
49
63
  self._time_column: str | None = None
50
64
  self._end_time_column: str | None = None
65
+ self._expr_sample_df = pd.DataFrame(index=range(self._NUM_SAMPLE_ROWS))
51
66
 
52
67
  if columns is None:
53
68
  columns = list(self._source_column_dict.keys())
54
69
 
55
- if len(self._source_column_dict) == 0:
56
- raise ValueError(f"Table '{name}' does not hold any column with "
57
- f"a supported data type")
70
+ self.add_columns(columns)
58
71
 
59
72
  if isinstance(primary_key, MissingType):
60
- primary_key = self._source_primary_key
61
-
62
- self._columns: dict[str, Column] = {}
63
- for column_name in columns:
64
- self.add_column(column_name)
65
-
66
- if primary_key is not None:
73
+ # Infer primary key from source metadata, but only set it in case
74
+ # it is already part of the column set (don't magically add it):
75
+ if any(column.is_source for column in self.columns):
76
+ primary_key = self._source_primary_key
77
+ if (primary_key is not None and primary_key in self
78
+ and self[primary_key].is_source):
79
+ self.primary_key = primary_key
80
+ elif primary_key is not None:
67
81
  if primary_key not in self:
68
82
  self.add_column(primary_key)
69
83
  self.primary_key = primary_key
@@ -83,13 +97,22 @@ class Table(ABC):
83
97
  r"""The name of this table."""
84
98
  return self._name
85
99
 
100
+ @property
101
+ def source_name(self) -> str:
102
+ r"""The source name of this table."""
103
+ return self._source_name
104
+
105
+ @property
106
+ def _quoted_source_name(self) -> str:
107
+ return quote_ident(self._source_name)
108
+
86
109
  # Column ##################################################################
87
110
 
88
111
  def has_column(self, name: str) -> bool:
89
112
  r"""Returns ``True`` if this table holds a column with name ``name``;
90
113
  ``False`` otherwise.
91
114
  """
92
- return name in self._columns
115
+ return name in self._column_dict
93
116
 
94
117
  def column(self, name: str) -> Column:
95
118
  r"""Returns the data column named with name ``name`` in this table.
@@ -102,59 +125,113 @@ class Table(ABC):
102
125
  """
103
126
  if not self.has_column(name):
104
127
  raise KeyError(f"Column '{name}' not found in table '{self.name}'")
105
- return self._columns[name]
128
+ return self._column_dict[name]
106
129
 
107
130
  @property
108
131
  def columns(self) -> list[Column]:
109
132
  r"""Returns a list of :class:`Column` objects that represent the
110
133
  columns in this table.
111
134
  """
112
- return list(self._columns.values())
135
+ return list(self._column_dict.values())
113
136
 
114
- def add_column(self, name: str) -> Column:
115
- r"""Adds a column to this table.
137
+ def add_columns(self, columns: Sequence[ColumnSpecType]) -> None:
138
+ r"""Adds a set of columns to this table.
116
139
 
117
140
  Args:
118
- name: The name of the column.
141
+ columns: The columns to add.
119
142
 
120
143
  Raises:
121
- KeyError: If ``name`` is already present in this table.
144
+ KeyError: If any of the column names already exist in this table.
122
145
  """
123
- if name in self:
124
- raise KeyError(f"Column '{name}' already exists in table "
125
- f"'{self.name}'")
126
-
127
- if name not in self._source_column_dict:
128
- raise KeyError(f"Column '{name}' does not exist in the underlying "
129
- f"source table")
130
-
131
- dtype = self._source_column_dict[name].dtype
132
-
133
- try:
134
- ser = self._sample_df[name]
135
- if contains_id(ser, name, dtype):
136
- stype = Stype.ID
137
- elif contains_timestamp(ser, name, dtype):
138
- stype = Stype.timestamp
139
- elif contains_multicategorical(ser, name, dtype):
140
- stype = Stype.multicategorical
141
- elif contains_categorical(ser, name, dtype):
142
- stype = Stype.categorical
143
- else:
144
- stype = dtype.default_stype
145
- except Exception as e:
146
- raise RuntimeError(f"Could not obtain semantic type for column "
147
- f"'{name}' in table '{self.name}'. Change "
148
- f"the data type of the column in the source "
149
- f"table or remove it from the table.") from e
150
-
151
- self._columns[name] = Column(
152
- name=name,
153
- stype=stype,
154
- dtype=dtype,
155
- )
146
+ if len(columns) == 0:
147
+ return
148
+
149
+ column_specs = [ColumnSpec.coerce(column) for column in columns]
150
+
151
+ # Obtain a batch-wise sample for all column expressions:
152
+ expr_specs = [spec for spec in column_specs if not spec.is_source]
153
+ if len(expr_specs) > 0:
154
+ dfs = [
155
+ self._expr_sample_df,
156
+ self._get_expr_sample_df(expr_specs).reset_index(drop=True),
157
+ ]
158
+ size = min(map(len, dfs))
159
+ df = pd.concat([dfs[0].iloc[:size], dfs[1].iloc[:size]], axis=1)
160
+ df = df.loc[:, ~df.columns.duplicated(keep='last')]
161
+ self._expr_sample_df = df
162
+
163
+ for column_spec in column_specs:
164
+ if column_spec.name in self:
165
+ raise KeyError(f"Column '{column_spec.name}' already exists "
166
+ f"in table '{self.name}'")
167
+
168
+ dtype = column_spec.dtype
169
+ stype = column_spec.stype
170
+
171
+ if column_spec.is_source:
172
+ if column_spec.name not in self._source_column_dict:
173
+ raise ValueError(
174
+ f"Column '{column_spec.name}' does not exist in the "
175
+ f"underlying source table")
176
+
177
+ if dtype is None:
178
+ dtype = self._source_column_dict[column_spec.name].dtype
179
+
180
+ if dtype == Dtype.unsupported:
181
+ raise ValueError(
182
+ f"Encountered unsupported data type for column "
183
+ f"'{column_spec.name}' in table '{self.name}'. Please "
184
+ f"either change the column's data type or remove the "
185
+ f"column from this table.")
186
+
187
+ if dtype is None:
188
+ if column_spec.is_source:
189
+ ser = self._source_sample_df[column_spec.name]
190
+ else:
191
+ ser = self._expr_sample_df[column_spec.name]
192
+ try:
193
+ dtype = infer_dtype(ser)
194
+ except Exception as e:
195
+ raise RuntimeError(
196
+ f"Encountered unsupported data type '{ser.dtype}' for "
197
+ f"column '{column_spec.name}' in table '{self.name}'. "
198
+ f"Please either manually override the columns's data "
199
+ f"type or remove the column from this table.") from e
200
+
201
+ if stype is None:
202
+ if column_spec.is_source:
203
+ ser = self._source_sample_df[column_spec.name]
204
+ else:
205
+ ser = self._expr_sample_df[column_spec.name]
206
+ try:
207
+ stype = infer_stype(ser, column_spec.name, dtype)
208
+ except Exception as e:
209
+ raise RuntimeError(
210
+ f"Could not determine semantic type for column "
211
+ f"'{column_spec.name}' with data type '{dtype}' in "
212
+ f"table '{self.name}'. Please either change the "
213
+ f"column's data type or remove the column from this "
214
+ f"table.") from e
215
+
216
+ self._column_dict[column_spec.name] = Column(
217
+ name=column_spec.name,
218
+ expr=column_spec.expr,
219
+ dtype=dtype,
220
+ stype=stype,
221
+ )
222
+
223
+ def add_column(self, column: ColumnSpecType) -> Column:
224
+ r"""Adds a column to this table.
225
+
226
+ Args:
227
+ column: The column to add.
156
228
 
157
- return self._columns[name]
229
+ Raises:
230
+ KeyError: If the column name already exists in this table.
231
+ """
232
+ column_spec = ColumnSpec.coerce(column)
233
+ self.add_columns([column_spec])
234
+ return self[column_spec.name]
158
235
 
159
236
  def remove_column(self, name: str) -> Self:
160
237
  r"""Removes a column from this table.
@@ -174,7 +251,7 @@ class Table(ABC):
174
251
  self.time_column = None
175
252
  if self._end_time_column == name:
176
253
  self.end_time_column = None
177
- del self._columns[name]
254
+ del self._column_dict[name]
178
255
 
179
256
  return self
180
257
 
@@ -362,30 +439,12 @@ class Table(ABC):
362
439
 
363
440
  def print_metadata(self) -> None:
364
441
  r"""Prints the :meth:`~metadata` of this table."""
365
- num_rows_repr = ''
366
- if self._num_rows is not None:
367
- num_rows_repr = ' ({self._num_rows:,} rows)'
368
-
369
- if in_snowflake_notebook():
370
- import streamlit as st
371
- md_repr = f"### 🏷️ Metadata of Table `{self.name}`{num_rows_repr}"
372
- st.markdown(md_repr)
373
- st.dataframe(self.metadata, hide_index=True)
374
- elif in_notebook():
375
- from IPython.display import Markdown, display
376
- md_repr = f"### 🏷️ Metadata of Table `{self.name}`{num_rows_repr}"
377
- display(Markdown(md_repr))
378
- df = self.metadata
379
- try:
380
- if hasattr(df.style, 'hide'):
381
- display(df.style.hide(axis='index')) # pandas=2
382
- else:
383
- display(df.style.hide_index()) # pandas<1.3
384
- except ImportError:
385
- print(df.to_string(index=False)) # missing jinja2
386
- else:
387
- print(f"🏷️ Metadata of Table '{self.name}'{num_rows_repr}")
388
- print(self.metadata.to_string(index=False))
442
+ msg = f"🏷️ Metadata of Table `{self.name}`"
443
+ if num := self._num_rows:
444
+ msg += " (1 row)" if num == 1 else f" ({num:,} rows)"
445
+
446
+ display.title(msg)
447
+ display.dataframe(self.metadata)
389
448
 
390
449
  def infer_primary_key(self, verbose: bool = True) -> Self:
391
450
  r"""Infers the primary key in this table.
@@ -399,21 +458,28 @@ class Table(ABC):
399
458
  def _set_primary_key(primary_key: str) -> None:
400
459
  self.primary_key = primary_key
401
460
  if verbose:
402
- print(f"Detected primary key '{primary_key}' in table "
403
- f"'{self.name}'")
404
-
405
- if primary_key := self._source_primary_key:
406
- _set_primary_key(primary_key)
407
- return self
408
-
409
- unique_keys = [
410
- column.name for column in self._source_column_dict.values()
411
- if column.is_unique_key
412
- ]
413
- if len(unique_keys) == 1: # NOTE No composite keys yet.
414
- _set_primary_key(unique_keys[0])
415
- return self
461
+ display.message(f"Inferred primary key `{primary_key}` for "
462
+ f"table `{self.name}`")
416
463
 
464
+ # Inference from source column metadata:
465
+ if any(column.is_source for column in self.columns):
466
+ primary_key = self._source_primary_key
467
+ if (primary_key is not None and primary_key in self
468
+ and self[primary_key].is_source):
469
+ _set_primary_key(primary_key)
470
+ return self
471
+
472
+ unique_keys = [
473
+ column.name for column in self._source_column_dict.values()
474
+ if column.is_unique_key
475
+ ]
476
+ if (len(unique_keys) == 1 # NOTE No composite keys yet.
477
+ and unique_keys[0] in self
478
+ and self[unique_keys[0]].is_source):
479
+ _set_primary_key(unique_keys[0])
480
+ return self
481
+
482
+ # Heuristic-based inference:
417
483
  candidates = [
418
484
  column.name for column in self.columns if column.stype == Stype.ID
419
485
  ]
@@ -427,7 +493,7 @@ class Table(ABC):
427
493
 
428
494
  if primary_key := infer_primary_key(
429
495
  table_name=self.name,
430
- df=self._sample_df,
496
+ df=self._get_sample_df(),
431
497
  candidates=candidates,
432
498
  ):
433
499
  _set_primary_key(primary_key)
@@ -444,6 +510,7 @@ class Table(ABC):
444
510
  if self.has_time_column():
445
511
  return self
446
512
 
513
+ # Heuristic-based inference:
447
514
  candidates = [
448
515
  column.name for column in self.columns
449
516
  if column.stype == Stype.timestamp
@@ -451,14 +518,14 @@ class Table(ABC):
451
518
  ]
452
519
 
453
520
  if time_column := infer_time_column(
454
- df=self._sample_df,
521
+ df=self._get_sample_df(),
455
522
  candidates=candidates,
456
523
  ):
457
524
  self.time_column = time_column
458
525
 
459
526
  if verbose:
460
- print(f"Detected time column '{time_column}' in table "
461
- f"'{self.name}'")
527
+ display.message(f"Inferred time column `{time_column}` for "
528
+ f"table `{self.name}`")
462
529
 
463
530
  return self
464
531
 
@@ -474,15 +541,16 @@ class Table(ABC):
474
541
  if not self.has_primary_key():
475
542
  self.infer_primary_key(verbose=False)
476
543
  if self.has_primary_key():
477
- logs.append(f"primary key '{self._primary_key}'")
544
+ logs.append(f"primary key `{self._primary_key}`")
478
545
 
479
546
  if not self.has_time_column():
480
547
  self.infer_time_column(verbose=False)
481
548
  if self.has_time_column():
482
- logs.append(f"time column '{self._time_column}'")
549
+ logs.append(f"time column `{self._time_column}`")
483
550
 
484
551
  if verbose and len(logs) > 0:
485
- print(f"Detected {' and '.join(logs)} in table '{self.name}'")
552
+ display.message(f"Inferred {' and '.join(logs)} for table "
553
+ f"`{self.name}`")
486
554
 
487
555
  return self
488
556
 
@@ -500,16 +568,113 @@ class Table(ABC):
500
568
  end_time_col=self._end_time_column,
501
569
  )
502
570
 
503
- @property
571
+ @cached_property
572
+ def _source_column_dict(self) -> dict[str, SourceColumn]:
573
+ source_columns = self._get_source_columns()
574
+ if len(source_columns) == 0:
575
+ raise ValueError(f"Table '{self.name}' has no columns")
576
+ return {column.name: column for column in source_columns}
577
+
578
+ @cached_property
504
579
  def _source_primary_key(self) -> str | None:
505
580
  primary_keys = [
506
581
  column.name for column in self._source_column_dict.values()
507
582
  if column.is_primary_key
508
583
  ]
509
- if len(primary_keys) == 1: # NOTE No composite keys yet.
510
- return primary_keys[0]
584
+ # NOTE No composite keys yet.
585
+ return primary_keys[0] if len(primary_keys) == 1 else None
511
586
 
512
- return None
587
+ @cached_property
588
+ def _source_foreign_key_dict(self) -> dict[str, SourceForeignKey]:
589
+ return {key.name: key for key in self._get_source_foreign_keys()}
590
+
591
+ @cached_property
592
+ def _source_sample_df(self) -> pd.DataFrame:
593
+ return self._get_source_sample_df().reset_index(drop=True)
594
+
595
+ @cached_property
596
+ def _num_rows(self) -> int | None:
597
+ return self._get_num_rows()
598
+
599
+ def _get_sample_df(self) -> pd.DataFrame:
600
+ dfs: list[pd.DataFrame] = []
601
+ if any(column.is_source for column in self.columns):
602
+ dfs.append(self._source_sample_df)
603
+ if any(not column.is_source for column in self.columns):
604
+ dfs.append(self._expr_sample_df)
605
+
606
+ if len(dfs) == 0:
607
+ return pd.DataFrame(index=range(1000))
608
+ if len(dfs) == 1:
609
+ return dfs[0]
610
+
611
+ size = min(map(len, dfs))
612
+ df = pd.concat([dfs[0].iloc[:size], dfs[1].iloc[:size]], axis=1)
613
+ df = df.loc[:, ~df.columns.duplicated(keep='last')]
614
+ return df
615
+
616
+ @staticmethod
617
+ def _sanitize(
618
+ df: pd.DataFrame,
619
+ dtype_dict: dict[str, Dtype | None] | None = None,
620
+ stype_dict: dict[str, Stype | None] | None = None,
621
+ ) -> pd.DataFrame:
622
+ r"""Sanitzes a :class:`pandas.DataFrame` in-place such that its data
623
+ types match table data and semantic type specification.
624
+ """
625
+ def _to_datetime(ser: pd.Series) -> pd.Series:
626
+ if not pd.api.types.is_datetime64_any_dtype(ser):
627
+ with warnings.catch_warnings():
628
+ warnings.filterwarnings(
629
+ 'ignore',
630
+ message='Could not infer format',
631
+ )
632
+ ser = pd.to_datetime(ser, errors='coerce')
633
+ if isinstance(ser.dtype, pd.DatetimeTZDtype):
634
+ ser = ser.dt.tz_localize(None)
635
+ if ser.dtype != 'datetime64[ns]':
636
+ ser = ser.astype('datetime64[ns]')
637
+ return ser
638
+
639
+ def _to_list(ser: pd.Series, dtype: Dtype | None) -> pd.Series:
640
+ if (pd.api.types.is_string_dtype(ser)
641
+ and dtype in {Dtype.intlist, Dtype.floatlist}):
642
+ try:
643
+ ser = ser.map(lambda row: np.fromstring(
644
+ row.strip('[]'),
645
+ sep=',',
646
+ dtype=int if dtype == Dtype.intlist else np.float32,
647
+ ) if row is not None else None)
648
+ except Exception:
649
+ pass
650
+
651
+ if pd.api.types.is_string_dtype(ser):
652
+ try:
653
+ import orjson as json
654
+ except ImportError:
655
+ import json
656
+ try:
657
+ ser = ser.map(lambda row: json.loads(row)
658
+ if row is not None else None)
659
+ except Exception:
660
+ pass
661
+
662
+ return ser
663
+
664
+ for column_name in df.columns:
665
+ dtype = (dtype_dict or {}).get(column_name)
666
+ stype = (stype_dict or {}).get(column_name)
667
+
668
+ if dtype == Dtype.time:
669
+ df[column_name] = _to_datetime(df[column_name])
670
+ elif stype == Stype.timestamp:
671
+ df[column_name] = _to_datetime(df[column_name])
672
+ elif dtype is not None and dtype.is_list():
673
+ df[column_name] = _to_list(df[column_name], dtype)
674
+ elif stype == Stype.sequence:
675
+ df[column_name] = _to_list(df[column_name], Dtype.floatlist)
676
+
677
+ return df
513
678
 
514
679
  # Python builtins #########################################################
515
680
 
@@ -546,25 +711,24 @@ class Table(ABC):
546
711
  def backend(self) -> DataBackend:
547
712
  r"""The data backend of this table."""
548
713
 
549
- @cached_property
550
- def _source_column_dict(self) -> dict[str, SourceColumn]:
551
- return {col.name: col for col in self._get_source_columns()}
552
-
553
714
  @abstractmethod
554
715
  def _get_source_columns(self) -> list[SourceColumn]:
555
716
  pass
556
717
 
557
- @cached_property
558
- def _sample_df(self) -> pd.DataFrame:
559
- return self._get_sample_df()
718
+ @abstractmethod
719
+ def _get_source_foreign_keys(self) -> list[SourceForeignKey]:
720
+ pass
560
721
 
561
722
  @abstractmethod
562
- def _get_sample_df(self) -> pd.DataFrame:
723
+ def _get_source_sample_df(self) -> pd.DataFrame:
563
724
  pass
564
725
 
565
- @cached_property
566
- def _num_rows(self) -> int | None:
567
- return self._get_num_rows()
726
+ @abstractmethod
727
+ def _get_expr_sample_df(
728
+ self,
729
+ columns: Sequence[ColumnSpec],
730
+ ) -> pd.DataFrame:
731
+ pass
568
732
 
569
733
  @abstractmethod
570
734
  def _get_num_rows(self) -> int | None: