kumoai-2.14.0.dev202512211732-cp313-cp313-win_amd64.whl → kumoai-2.14.0.dev202601081732-cp313-cp313-win_amd64.whl

This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
Files changed (38)
  1. kumoai/__init__.py +23 -26
  2. kumoai/_version.py +1 -1
  3. kumoai/client/client.py +6 -0
  4. kumoai/client/jobs.py +26 -0
  5. kumoai/connector/utils.py +21 -7
  6. kumoai/experimental/rfm/__init__.py +24 -22
  7. kumoai/experimental/rfm/backend/local/graph_store.py +12 -21
  8. kumoai/experimental/rfm/backend/local/sampler.py +0 -3
  9. kumoai/experimental/rfm/backend/local/table.py +24 -25
  10. kumoai/experimental/rfm/backend/snow/sampler.py +184 -70
  11. kumoai/experimental/rfm/backend/snow/table.py +137 -64
  12. kumoai/experimental/rfm/backend/sqlite/sampler.py +191 -86
  13. kumoai/experimental/rfm/backend/sqlite/table.py +85 -55
  14. kumoai/experimental/rfm/base/__init__.py +6 -9
  15. kumoai/experimental/rfm/base/column.py +95 -11
  16. kumoai/experimental/rfm/base/expression.py +44 -0
  17. kumoai/experimental/rfm/base/sampler.py +26 -17
  18. kumoai/experimental/rfm/base/source.py +1 -1
  19. kumoai/experimental/rfm/base/sql_sampler.py +182 -19
  20. kumoai/experimental/rfm/base/table.py +275 -109
  21. kumoai/experimental/rfm/graph.py +115 -107
  22. kumoai/experimental/rfm/infer/dtype.py +4 -1
  23. kumoai/experimental/rfm/infer/multicategorical.py +1 -1
  24. kumoai/experimental/rfm/relbench.py +76 -0
  25. kumoai/experimental/rfm/rfm.py +530 -304
  26. kumoai/experimental/rfm/task_table.py +292 -0
  27. kumoai/kumolib.cp313-win_amd64.pyd +0 -0
  28. kumoai/pquery/training_table.py +16 -2
  29. kumoai/trainer/distilled_trainer.py +175 -0
  30. kumoai/utils/display.py +87 -0
  31. kumoai/utils/progress_logger.py +13 -1
  32. {kumoai-2.14.0.dev202512211732.dist-info → kumoai-2.14.0.dev202601081732.dist-info}/METADATA +1 -1
  33. {kumoai-2.14.0.dev202512211732.dist-info → kumoai-2.14.0.dev202601081732.dist-info}/RECORD +36 -33
  34. kumoai/experimental/rfm/base/column_expression.py +0 -50
  35. kumoai/experimental/rfm/base/sql_table.py +0 -229
  36. {kumoai-2.14.0.dev202512211732.dist-info → kumoai-2.14.0.dev202601081732.dist-info}/WHEEL +0 -0
  37. {kumoai-2.14.0.dev202512211732.dist-info → kumoai-2.14.0.dev202601081732.dist-info}/licenses/LICENSE +0 -0
  38. {kumoai-2.14.0.dev202512211732.dist-info → kumoai-2.14.0.dev202601081732.dist-info}/top_level.txt +0 -0
kumoai/experimental/rfm/base/table.py
@@ -1,22 +1,33 @@
+import warnings
 from abc import ABC, abstractmethod
 from collections.abc import Sequence
 from functools import cached_property
 
+import numpy as np
 import pandas as pd
+import pyarrow as pa
 from kumoapi.model_plan import MissingType
 from kumoapi.source_table import UnavailableSourceTable
 from kumoapi.table import Column as ColumnDefinition
 from kumoapi.table import TableDefinition
-from kumoapi.typing import Stype
+from kumoapi.typing import Dtype, Stype
 from typing_extensions import Self
 
-from kumoai import in_notebook, in_snowflake_notebook
-from kumoai.experimental.rfm.base import Column, DataBackend, SourceColumn
+from kumoai.experimental.rfm.base import (
+    Column,
+    ColumnSpec,
+    ColumnSpecType,
+    DataBackend,
+    SourceColumn,
+    SourceForeignKey,
+)
 from kumoai.experimental.rfm.infer import (
+    infer_dtype,
     infer_primary_key,
     infer_stype,
     infer_time_column,
 )
+from kumoai.utils import display, quote_ident
 
 
 class Table(ABC):
@@ -26,39 +37,46 @@ class Table(ABC):
 
     Args:
         name: The name of this table.
+        source_name: The source name of this table. If set to ``None``,
+            ``name`` is being used.
         columns: The selected columns of this table.
         primary_key: The name of the primary key of this table, if it exists.
         time_column: The name of the time column of this table, if it exists.
         end_time_column: The name of the end time column of this table, if it
             exists.
     """
+    _NUM_SAMPLE_ROWS = 1_000
+
     def __init__(
         self,
         name: str,
-        columns: Sequence[str] | None = None,
+        source_name: str | None = None,
+        columns: Sequence[ColumnSpecType] | None = None,
        primary_key: MissingType | str | None = MissingType.VALUE,
        time_column: str | None = None,
        end_time_column: str | None = None,
    ) -> None:
 
         self._name = name
+        self._source_name = source_name or name
+        self._column_dict: dict[str, Column] = {}
         self._primary_key: str | None = None
         self._time_column: str | None = None
         self._end_time_column: str | None = None
+        self._expr_sample_df = pd.DataFrame(index=range(self._NUM_SAMPLE_ROWS))
 
         if columns is None:
             columns = list(self._source_column_dict.keys())
 
-        self._columns: dict[str, Column] = {}
-        for column_name in columns:
-            self.add_column(column_name)
+        self.add_columns(columns)
 
         if isinstance(primary_key, MissingType):
-            # Inference from source column metadata:
-            if '_source_column_dict' in self.__dict__:
+            # Infer primary key from source metadata, but only set it in case
+            # it is already part of the column set (don't magically add it):
+            if any(column.is_source for column in self.columns):
                 primary_key = self._source_primary_key
                 if (primary_key is not None and primary_key in self
-                        and self[primary_key].is_physical):
+                        and self[primary_key].is_source):
                     self.primary_key = primary_key
        elif primary_key is not None:
            if primary_key not in self:
@@ -80,13 +98,22 @@ class Table(ABC):
         r"""The name of this table."""
         return self._name
 
+    @property
+    def source_name(self) -> str:
+        r"""The source name of this table."""
+        return self._source_name
+
+    @property
+    def _quoted_source_name(self) -> str:
+        return quote_ident(self._source_name)
+
     # Column ##################################################################
 
     def has_column(self, name: str) -> bool:
         r"""Returns ``True`` if this table holds a column with name ``name``;
         ``False`` otherwise.
         """
-        return name in self._columns
+        return name in self._column_dict
 
     def column(self, name: str) -> Column:
         r"""Returns the data column named with name ``name`` in this table.
@@ -99,51 +126,113 @@ class Table(ABC):
         """
         if not self.has_column(name):
             raise KeyError(f"Column '{name}' not found in table '{self.name}'")
-        return self._columns[name]
+        return self._column_dict[name]
 
     @property
     def columns(self) -> list[Column]:
         r"""Returns a list of :class:`Column` objects that represent the
         columns in this table.
         """
-        return list(self._columns.values())
+        return list(self._column_dict.values())
 
-    def add_column(self, name: str) -> Column:
-        r"""Adds a column to this table.
+    def add_columns(self, columns: Sequence[ColumnSpecType]) -> None:
+        r"""Adds a set of columns to this table.
 
         Args:
-            name: The name of the column.
+            columns: The columns to add.
 
         Raises:
-            KeyError: If ``name`` is already present in this table.
+            KeyError: If any of the column names already exist in this table.
         """
-        if name in self:
-            raise KeyError(f"Column '{name}' already exists in table "
-                           f"'{self.name}'")
-
-        if name not in self._source_column_dict:
-            raise KeyError(f"Column '{name}' does not exist in the underlying "
-                           f"source table")
-
-        dtype = self._source_column_dict[name].dtype
-
-        ser = self._source_sample_df[name]
-        try:
-            stype = infer_stype(ser, name, dtype)
-        except Exception as e:
-            raise RuntimeError(f"Could not obtain semantic type for column "
                               f"'{name}' with data type '{dtype}' in table "
                               f"'{self.name}'. Change the data type of the "
                               f"column in the source table or remove it from "
                               f"this table.") from e
-
-        self._columns[name] = Column(
-            name=name,
-            stype=stype,
-            dtype=dtype,
-        )
+        if len(columns) == 0:
+            return
 
-        return self._columns[name]
+        column_specs = [ColumnSpec.coerce(column) for column in columns]
+
+        # Obtain a batch-wise sample for all column expressions:
+        expr_specs = [spec for spec in column_specs if not spec.is_source]
+        if len(expr_specs) > 0:
+            dfs = [
+                self._expr_sample_df,
+                self._get_expr_sample_df(expr_specs).reset_index(drop=True),
+            ]
+            size = min(map(len, dfs))
+            df = pd.concat([dfs[0].iloc[:size], dfs[1].iloc[:size]], axis=1)
+            df = df.loc[:, ~df.columns.duplicated(keep='last')]
+            self._expr_sample_df = df
+
+        for column_spec in column_specs:
+            if column_spec.name in self:
+                raise KeyError(f"Column '{column_spec.name}' already exists "
+                               f"in table '{self.name}'")
+
+            dtype = column_spec.dtype
+            stype = column_spec.stype
+
+            if column_spec.is_source:
+                if column_spec.name not in self._source_column_dict:
+                    raise ValueError(
+                        f"Column '{column_spec.name}' does not exist in the "
+                        f"underlying source table")
+
+                if dtype is None:
+                    dtype = self._source_column_dict[column_spec.name].dtype
+
+                if dtype == Dtype.unsupported:
+                    raise ValueError(
+                        f"Encountered unsupported data type for column "
+                        f"'{column_spec.name}' in table '{self.name}'. Please "
+                        f"either change the column's data type or remove the "
+                        f"column from this table.")
+
+            if dtype is None:
+                if column_spec.is_source:
+                    ser = self._source_sample_df[column_spec.name]
+                else:
+                    ser = self._expr_sample_df[column_spec.name]
+                try:
+                    dtype = infer_dtype(ser)
+                except Exception as e:
+                    raise RuntimeError(
+                        f"Encountered unsupported data type '{ser.dtype}' for "
+                        f"column '{column_spec.name}' in table '{self.name}'. "
+                        f"Please either manually override the columns's data "
+                        f"type or remove the column from this table.") from e
+
+            if stype is None:
+                if column_spec.is_source:
+                    ser = self._source_sample_df[column_spec.name]
+                else:
+                    ser = self._expr_sample_df[column_spec.name]
+                try:
+                    stype = infer_stype(ser, column_spec.name, dtype)
+                except Exception as e:
+                    raise RuntimeError(
+                        f"Could not determine semantic type for column "
+                        f"'{column_spec.name}' with data type '{dtype}' in "
+                        f"table '{self.name}'. Please either change the "
+                        f"column's data type or remove the column from this "
+                        f"table.") from e
+
+            self._column_dict[column_spec.name] = Column(
+                name=column_spec.name,
+                expr=column_spec.expr,
+                dtype=dtype,
+                stype=stype,
+            )
+
+    def add_column(self, column: ColumnSpecType) -> Column:
+        r"""Adds a column to this table.
+
+        Args:
+            column: The column to add.
+
+        Raises:
+            KeyError: If the column name already exists in this table.
+        """
+        column_spec = ColumnSpec.coerce(column)
+        self.add_columns([column_spec])
+        return self[column_spec.name]
 
     def remove_column(self, name: str) -> Self:
         r"""Removes a column from this table.
@@ -163,7 +252,7 @@ class Table(ABC):
             self.time_column = None
         if self._end_time_column == name:
             self.end_time_column = None
-        del self._columns[name]
+        del self._column_dict[name]
 
         return self
 
@@ -183,8 +272,8 @@ class Table(ABC):
         no such primary key is present.
 
         The setter sets a column as a primary key on this table, and raises a
-        :class:`ValueError` if the primary key has a non-ID semantic type or
-        if the column name does not match a column in the data frame.
+        :class:`ValueError` if the primary key has a non-ID compatible data
+        type or if the column name does not match a column in the data frame.
         """
         if self._primary_key is None:
             return None
@@ -228,8 +317,9 @@ class Table(ABC):
         such time column is present.
 
         The setter sets a column as a time column on this table, and raises a
-        :class:`ValueError` if the time column has a non-timestamp semantic
-        type or if the column name does not match a column in the data frame.
+        :class:`ValueError` if the time column has a non-timestamp compatible
+        data type or if the column name does not match a column in the data
+        frame.
         """
         if self._time_column is None:
             return None
@@ -274,8 +364,8 @@ class Table(ABC):
 
         The setter sets a column as an end time column on this table, and
         raises a :class:`ValueError` if the end time column has a non-timestamp
-        semantic type or if the column name does not match a column in the data
-        frame.
+        compatible data type or if the column name does not match a column in
+        the data frame.
         """
         if self._end_time_column is None:
             return None
@@ -310,39 +400,39 @@ class Table(ABC):
         r"""Returns a :class:`pandas.DataFrame` object containing metadata
         information about the columns in this table.
 
-        The returned dataframe has columns ``name``, ``dtype``, ``stype``,
-        ``is_primary_key``, ``is_time_column`` and ``is_end_time_column``,
-        which provide an aggregate view of the properties of the columns of
-        this table.
+        The returned dataframe has columns ``"Name"``, ``"Data Type"``,
+        ``"Semantic Type"``, ``"Primary Key"``, ``"Time Column"`` and
+        ``"End Time Column"``, which provide an aggregated view of the
+        properties of the columns of this table.
 
         Example:
            >>> # doctest: +SKIP
            >>> import kumoai.experimental.rfm as rfm
            >>> table = rfm.LocalTable(df=..., name=...).infer_metadata()
            >>> table.metadata
-            name dtype stype is_primary_key is_time_column is_end_time_column
-            0 CustomerID float64 ID True False False
+            Name Data Type Semantic Type Primary Key Time Column End Time Column
+            0 CustomerID float64 ID True False False
        """  # noqa: E501
        cols = self.columns
 
        return pd.DataFrame({
-            'name':
+            'Name':
                pd.Series(dtype=str, data=[c.name for c in cols]),
-            'dtype':
+            'Data Type':
                pd.Series(dtype=str, data=[c.dtype for c in cols]),
-            'stype':
+            'Semantic Type':
                pd.Series(dtype=str, data=[c.stype for c in cols]),
-            'is_primary_key':
+            'Primary Key':
                pd.Series(
                    dtype=bool,
                    data=[self._primary_key == c.name for c in cols],
                ),
-            'is_time_column':
+            'Time Column':
                pd.Series(
                    dtype=bool,
                    data=[self._time_column == c.name for c in cols],
                ),
-            'is_end_time_column':
+            'End Time Column':
                pd.Series(
                    dtype=bool,
                    data=[self._end_time_column == c.name for c in cols],
@@ -351,30 +441,12 @@ class Table(ABC):
 
     def print_metadata(self) -> None:
         r"""Prints the :meth:`~metadata` of this table."""
-        num_rows_repr = ''
-        if self._num_rows is not None:
-            num_rows_repr = ' ({self._num_rows:,} rows)'
-
-        if in_snowflake_notebook():
-            import streamlit as st
-            md_repr = f"### 🏷️ Metadata of Table `{self.name}`{num_rows_repr}"
-            st.markdown(md_repr)
-            st.dataframe(self.metadata, hide_index=True)
-        elif in_notebook():
-            from IPython.display import Markdown, display
-            md_repr = f"### 🏷️ Metadata of Table `{self.name}`{num_rows_repr}"
-            display(Markdown(md_repr))
-            df = self.metadata
-            try:
-                if hasattr(df.style, 'hide'):
-                    display(df.style.hide(axis='index'))  # pandas=2
-                else:
-                    display(df.style.hide_index())  # pandas<1.3
-            except ImportError:
-                print(df.to_string(index=False))  # missing jinja2
-        else:
-            print(f"🏷️ Metadata of Table '{self.name}'{num_rows_repr}")
-            print(self.metadata.to_string(index=False))
+        msg = f"🏷️ Metadata of Table `{self.name}`"
+        if num := self._num_rows:
+            msg += " (1 row)" if num == 1 else f" ({num:,} rows)"
+
+        display.title(msg)
+        display.dataframe(self.metadata)
 
     def infer_primary_key(self, verbose: bool = True) -> Self:
         r"""Infers the primary key in this table.
@@ -388,14 +460,14 @@ class Table(ABC):
         def _set_primary_key(primary_key: str) -> None:
             self.primary_key = primary_key
             if verbose:
-                print(f"Detected primary key '{primary_key}' in table "
                      f"'{self.name}'")
+                display.message(f"Inferred primary key `{primary_key}` for "
                                f"table `{self.name}`")
 
         # Inference from source column metadata:
-        if '_source_column_dict' in self.__dict__:
+        if any(column.is_source for column in self.columns):
             primary_key = self._source_primary_key
             if (primary_key is not None and primary_key in self
-                    and self[primary_key].is_physical):
+                    and self[primary_key].is_source):
                _set_primary_key(primary_key)
                return self
 
@@ -405,7 +477,7 @@ class Table(ABC):
        ]
        if (len(unique_keys) == 1  # NOTE No composite keys yet.
                and unique_keys[0] in self
-                and self[unique_keys[0]].is_physical):
+                and self[unique_keys[0]].is_source):
            _set_primary_key(unique_keys[0])
            return self
 
@@ -423,7 +495,7 @@ class Table(ABC):
 
        if primary_key := infer_primary_key(
                table_name=self.name,
-                df=self._sample_current_df(columns=candidates),
+                df=self._get_sample_df(),
                candidates=candidates,
        ):
            _set_primary_key(primary_key)
@@ -448,14 +520,14 @@ class Table(ABC):
        ]
 
        if time_column := infer_time_column(
-                df=self._sample_current_df(columns=candidates),
+                df=self._get_sample_df(),
                candidates=candidates,
        ):
            self.time_column = time_column
 
            if verbose:
-                print(f"Detected time column '{time_column}' in table "
                      f"'{self.name}'")
+                display.message(f"Inferred time column `{time_column}` for "
                                f"table `{self.name}`")
 
        return self
 
@@ -471,15 +543,16 @@ class Table(ABC):
        if not self.has_primary_key():
            self.infer_primary_key(verbose=False)
        if self.has_primary_key():
-            logs.append(f"primary key '{self._primary_key}'")
+            logs.append(f"primary key `{self._primary_key}`")
 
        if not self.has_time_column():
            self.infer_time_column(verbose=False)
        if self.has_time_column():
-            logs.append(f"time column '{self._time_column}'")
+            logs.append(f"time column `{self._time_column}`")
 
        if verbose and len(logs) > 0:
-            print(f"Detected {' and '.join(logs)} in table '{self.name}'")
+            display.message(f"Inferred {' and '.join(logs)} for table "
                            f"`{self.name}`")
 
        return self
 
@@ -501,31 +574,113 @@ class Table(ABC):
    def _source_column_dict(self) -> dict[str, SourceColumn]:
        source_columns = self._get_source_columns()
        if len(source_columns) == 0:
-            raise ValueError(f"Table '{self.name}' does not hold any column "
                             f"with a supported data type")
+            raise ValueError(f"Table '{self.name}' has no columns")
        return {column.name: column for column in source_columns}
 
    @cached_property
-    def _source_sample_df(self) -> pd.DataFrame:
-        return self._get_source_sample_df()
-
-    @property
    def _source_primary_key(self) -> str | None:
        primary_keys = [
            column.name for column in self._source_column_dict.values()
            if column.is_primary_key
        ]
-        if len(primary_keys) == 1:  # NOTE No composite keys yet.
-            return primary_keys[0]
+        # NOTE No composite keys yet.
+        return primary_keys[0] if len(primary_keys) == 1 else None
+
+    @cached_property
+    def _source_foreign_key_dict(self) -> dict[str, SourceForeignKey]:
+        return {key.name: key for key in self._get_source_foreign_keys()}
 
-        return None
+    @cached_property
+    def _source_sample_df(self) -> pd.DataFrame:
+        return self._get_source_sample_df().reset_index(drop=True)
 
    @cached_property
    def _num_rows(self) -> int | None:
        return self._get_num_rows()
 
-    def _sample_current_df(self, columns: Sequence[str]) -> pd.DataFrame:
-        return self._source_sample_df[columns]
+    def _get_sample_df(self) -> pd.DataFrame:
+        dfs: list[pd.DataFrame] = []
+        if any(column.is_source for column in self.columns):
+            dfs.append(self._source_sample_df)
+        if any(not column.is_source for column in self.columns):
+            dfs.append(self._expr_sample_df)
+
+        if len(dfs) == 0:
+            return pd.DataFrame(index=range(1000))
+        if len(dfs) == 1:
+            return dfs[0]
+
+        size = min(map(len, dfs))
+        df = pd.concat([dfs[0].iloc[:size], dfs[1].iloc[:size]], axis=1)
+        df = df.loc[:, ~df.columns.duplicated(keep='last')]
+        return df
+
+    @staticmethod
+    def _sanitize(
+        df: pd.DataFrame,
+        dtype_dict: dict[str, Dtype | None] | None = None,
+        stype_dict: dict[str, Stype | None] | None = None,
+    ) -> pd.DataFrame:
+        r"""Sanitzes a :class:`pandas.DataFrame` in-place such that its data
+        types match table data and semantic type specification.
+        """
+        def _to_datetime(ser: pd.Series) -> pd.Series:
+            if (not pd.api.types.is_datetime64_any_dtype(ser)
+                    and not (isinstance(ser.dtype, pd.ArrowDtype) and
+                             pa.types.is_timestamp(ser.dtype.pyarrow_dtype))):
+                with warnings.catch_warnings():
+                    warnings.filterwarnings(
+                        'ignore',
+                        message='Could not infer format',
+                    )
+                    ser = pd.to_datetime(ser, errors='coerce')
+            if (isinstance(ser.dtype, pd.DatetimeTZDtype)
+                    or (isinstance(ser.dtype, pd.ArrowDtype)
+                        and ser.dtype.pyarrow_dtype.tz is not None)):
+                ser = ser.dt.tz_localize(None)
+            if ser.dtype != 'datetime64[ns]':
+                ser = ser.astype('datetime64[ns]')
+            return ser
+
+        def _to_list(ser: pd.Series, dtype: Dtype | None) -> pd.Series:
+            if (pd.api.types.is_string_dtype(ser)
+                    and dtype in {Dtype.intlist, Dtype.floatlist}):
+                try:
+                    ser = ser.map(lambda row: np.fromstring(
+                        row.strip('[]'),
+                        sep=',',
+                        dtype=int if dtype == Dtype.intlist else np.float32,
+                    ) if row is not None else None)
+                except Exception:
+                    pass
+
+            if pd.api.types.is_string_dtype(ser):
+                try:
+                    import orjson as json
+                except ImportError:
+                    import json
+                try:
+                    ser = ser.map(lambda row: json.loads(row)
                                  if row is not None else None)
+                except Exception:
+                    pass
+
+            return ser
+
+        for column_name in df.columns:
+            dtype = (dtype_dict or {}).get(column_name)
+            stype = (stype_dict or {}).get(column_name)
+
+            if dtype == Dtype.time:
+                df[column_name] = _to_datetime(df[column_name])
+            elif stype == Stype.timestamp:
+                df[column_name] = _to_datetime(df[column_name])
+            elif dtype is not None and dtype.is_list():
+                df[column_name] = _to_list(df[column_name], dtype)
+            elif stype == Stype.sequence:
+                df[column_name] = _to_list(df[column_name], Dtype.floatlist)
+
+        return df
 
    # Python builtins #########################################################
 
@@ -566,10 +721,21 @@ class Table(ABC):
    def _get_source_columns(self) -> list[SourceColumn]:
        pass
 
+    @abstractmethod
+    def _get_source_foreign_keys(self) -> list[SourceForeignKey]:
+        pass
+
    @abstractmethod
    def _get_source_sample_df(self) -> pd.DataFrame:
        pass
 
+    @abstractmethod
+    def _get_expr_sample_df(
+        self,
+        columns: Sequence[ColumnSpec],
+    ) -> pd.DataFrame:
+        pass
+
    @abstractmethod
    def _get_num_rows(self) -> int | None:
        pass
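
For orientation, the doctest kept in the new `metadata` docstring above implies the following usage of the reworked table API. This is a minimal sketch based only on what is visible in this diff: the DataFrame contents are invented, and passing a plain column name to `add_column()` assumes that `ColumnSpec.coerce()` accepts strings for source columns, which the diff suggests but does not show.

    import pandas as pd

    import kumoai.experimental.rfm as rfm

    # Invented sample data, for illustration only.
    df = pd.DataFrame({
        'CustomerID': [1, 2, 3],
        'SignupDate': pd.to_datetime(['2024-01-02', '2024-02-03', '2024-03-04']),
    })

    # LocalTable and infer_metadata() appear in the doctest above; `metadata`
    # now reports the human-readable headers "Name", "Data Type",
    # "Semantic Type", "Primary Key", "Time Column" and "End Time Column".
    table = rfm.LocalTable(df=df, name='customers').infer_metadata()
    table.print_metadata()

    # add_column()/add_columns() now route through ColumnSpec.coerce();
    # passing a plain column name is assumed to select the matching source
    # column (hypothetical call, not verified against the package docs).
    # table.add_column('SignupDate')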