kumoai 2.10.0.dev202509231831__cp313-cp313-macosx_11_0_arm64.whl → 2.14.0.dev202512161731__cp313-cp313-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kumoai might be problematic. Click here for more details.

Files changed (53)
  1. kumoai/__init__.py +22 -11
  2. kumoai/_version.py +1 -1
  3. kumoai/client/client.py +17 -16
  4. kumoai/client/endpoints.py +1 -0
  5. kumoai/client/pquery.py +6 -2
  6. kumoai/client/rfm.py +37 -8
  7. kumoai/connector/utils.py +23 -2
  8. kumoai/experimental/rfm/__init__.py +164 -46
  9. kumoai/experimental/rfm/backend/__init__.py +0 -0
  10. kumoai/experimental/rfm/backend/local/__init__.py +42 -0
  11. kumoai/experimental/rfm/{local_graph_store.py → backend/local/graph_store.py} +49 -86
  12. kumoai/experimental/rfm/backend/local/sampler.py +315 -0
  13. kumoai/experimental/rfm/backend/local/table.py +119 -0
  14. kumoai/experimental/rfm/backend/snow/__init__.py +37 -0
  15. kumoai/experimental/rfm/backend/snow/sampler.py +274 -0
  16. kumoai/experimental/rfm/backend/snow/table.py +135 -0
  17. kumoai/experimental/rfm/backend/sqlite/__init__.py +32 -0
  18. kumoai/experimental/rfm/backend/sqlite/sampler.py +353 -0
  19. kumoai/experimental/rfm/backend/sqlite/table.py +126 -0
  20. kumoai/experimental/rfm/base/__init__.py +25 -0
  21. kumoai/experimental/rfm/base/column.py +66 -0
  22. kumoai/experimental/rfm/base/sampler.py +773 -0
  23. kumoai/experimental/rfm/base/source.py +19 -0
  24. kumoai/experimental/rfm/base/sql_sampler.py +60 -0
  25. kumoai/experimental/rfm/{local_table.py → base/table.py} +245 -156
  26. kumoai/experimental/rfm/{local_graph.py → graph.py} +425 -137
  27. kumoai/experimental/rfm/infer/__init__.py +6 -0
  28. kumoai/experimental/rfm/infer/dtype.py +79 -0
  29. kumoai/experimental/rfm/infer/pkey.py +126 -0
  30. kumoai/experimental/rfm/infer/time_col.py +62 -0
  31. kumoai/experimental/rfm/infer/timestamp.py +7 -4
  32. kumoai/experimental/rfm/pquery/__init__.py +4 -4
  33. kumoai/experimental/rfm/pquery/{backend.py → executor.py} +24 -58
  34. kumoai/experimental/rfm/pquery/{pandas_backend.py → pandas_executor.py} +278 -224
  35. kumoai/experimental/rfm/rfm.py +669 -246
  36. kumoai/experimental/rfm/sagemaker.py +138 -0
  37. kumoai/jobs.py +1 -0
  38. kumoai/pquery/predictive_query.py +10 -6
  39. kumoai/spcs.py +1 -3
  40. kumoai/testing/decorators.py +1 -1
  41. kumoai/testing/snow.py +50 -0
  42. kumoai/trainer/trainer.py +12 -10
  43. kumoai/utils/__init__.py +3 -2
  44. kumoai/utils/progress_logger.py +239 -4
  45. kumoai/utils/sql.py +3 -0
  46. {kumoai-2.10.0.dev202509231831.dist-info → kumoai-2.14.0.dev202512161731.dist-info}/METADATA +15 -5
  47. {kumoai-2.10.0.dev202509231831.dist-info → kumoai-2.14.0.dev202512161731.dist-info}/RECORD +50 -32
  48. kumoai/experimental/rfm/local_graph_sampler.py +0 -176
  49. kumoai/experimental/rfm/local_pquery_driver.py +0 -404
  50. kumoai/experimental/rfm/utils.py +0 -344
  51. {kumoai-2.10.0.dev202509231831.dist-info → kumoai-2.14.0.dev202512161731.dist-info}/WHEEL +0 -0
  52. {kumoai-2.10.0.dev202509231831.dist-info → kumoai-2.14.0.dev202512161731.dist-info}/licenses/LICENSE +0 -0
  53. {kumoai-2.10.0.dev202509231831.dist-info → kumoai-2.14.0.dev202512161731.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,19 @@
1
+ from dataclasses import dataclass
2
+
3
+ from kumoapi.typing import Dtype
4
+
5
+
6
@dataclass
class SourceColumn:
    r"""Description of a single column as reported by a source table.

    ``Table`` indexes these records by :attr:`name` and reads
    :attr:`dtype`, :attr:`is_primary_key` and :attr:`is_unique_key` from
    them when it builds its own column metadata.
    """
    # Name of the column; used as the lookup key on the table side.
    name: str
    # Data type of the column, taken over as-is when a column is added.
    dtype: Dtype
    # Marks the column as a primary key of the source table. A table with
    # exactly one such column adopts it as its primary key.
    is_primary_key: bool
    # Marks the column as a unique key. A single unique key serves as the
    # primary key when no primary key is otherwise known.
    is_unique_key: bool
    # Nullability flag of the column.
    # NOTE(review): not consulted by any code visible here - confirm usage.
    is_nullable: bool
13
+
14
+
15
@dataclass
class SourceForeignKey:
    r"""Description of a foreign key as reported by a source table.

    ``Table`` indexes these records by :attr:`name` and groups the
    referenced :attr:`primary_key` values per :attr:`dst_table`.
    """
    # Identifier of the foreign key; used as the lookup key on the table side.
    # NOTE(review): presumably the referencing column name - confirm.
    name: str
    # Name of the table this key points to.
    dst_table: str
    # Name of the referenced primary key in `dst_table`.
    primary_key: str
@@ -0,0 +1,60 @@
1
+ from abc import abstractmethod
2
+ from typing import Literal
3
+
4
+ import numpy as np
5
+ import pandas as pd
6
+
7
+ from kumoai.experimental.rfm.base import Sampler, SamplerOutput
8
+
9
+
10
class SQLSampler(Sampler):
    r"""A :class:`Sampler` whose seed rows are fetched by primary key.

    Sub-classes implement :meth:`_by_pkey`; :meth:`_sample_subgraph` builds
    on it to assemble the sampler output for the entity table.
    """
    def _sample_subgraph(
        self,
        entity_table_name: str,
        entity_pkey: pd.Series,
        anchor_time: pd.Series | Literal['entity'],
        columns_dict: dict[str, set[str]],
        num_neighbors: list[int],
    ) -> SamplerOutput:
        r"""Fetches the entity rows and wraps them in a
        :class:`SamplerOutput`.

        Args:
            entity_table_name: The name of the entity table.
            entity_pkey: The primary keys to look up in the entity table.
            anchor_time: The anchor time per entity. If it is not a
                :class:`pandas.Series`, the time column of the entity table
                is used instead.
            columns_dict: The columns to fetch for each table. Only the
                entry of the entity table is read here.
            num_neighbors: Not read by this implementation; the returned
                output only holds the entity table.

        Raises:
            KeyError: If the number of returned rows differs from the number
                of requested primary keys.
        """
        entity_df, batch = self._by_pkey(
            table_name=entity_table_name,
            pkey=entity_pkey,
            columns=columns_dict[entity_table_name],
        )

        num_requested = len(entity_pkey)
        if len(batch) != num_requested:
            # `batch` is used as positions into `entity_pkey`; every position
            # that was not returned belongs to a missing primary key:
            is_missing = np.ones(num_requested, dtype=bool)
            is_missing[batch] = False
            missing_pkeys = entity_pkey.iloc[is_missing].tolist()
            raise KeyError(f"The primary keys "
                           f"{missing_pkeys} do not exist "
                           f"in the '{entity_table_name}' table")

        # Bring the rows into the order given by ascending `batch`:
        order = batch.argsort()
        entity_df = entity_df.iloc[order].reset_index(drop=True)
        batch = batch[order]

        if isinstance(anchor_time, pd.Series):
            anchor_series = anchor_time
        else:
            time_column = self.time_column_dict[entity_table_name]
            anchor_series = entity_df[time_column]

        return SamplerOutput(
            anchor_time=anchor_series.astype(int).to_numpy(),
            df_dict={entity_table_name: entity_df},
            inverse_dict={},
            batch_dict={entity_table_name: batch},
            num_sampled_nodes_dict={entity_table_name: [len(batch)]},
            row_dict={},
            col_dict={},
            num_sampled_edges_dict={},
        )

    # Abstract Methods ########################################################

    @abstractmethod
    def _by_pkey(
        self,
        table_name: str,
        pkey: pd.Series,
        columns: set[str],
    ) -> tuple[pd.DataFrame, np.ndarray]:
        r"""Looks up rows of a table by primary key.

        Args:
            table_name: The name of the table to query.
            pkey: The primary keys to look up.
            columns: The columns to return.

        Returns:
            The fetched rows and an array that the caller treats as the
            positions in :obj:`pkey` each returned row belongs to.
        """
@@ -1,149 +1,103 @@
1
- from dataclasses import dataclass
2
- from typing import Any, Dict, List, Optional
1
+ from abc import ABC, abstractmethod
2
+ from collections import defaultdict
3
+ from functools import cached_property
4
+ from typing import Dict, List, Optional, Sequence, Set
3
5
 
4
6
  import pandas as pd
5
7
  from kumoapi.source_table import UnavailableSourceTable
6
8
  from kumoapi.table import Column as ColumnDefinition
7
9
  from kumoapi.table import TableDefinition
8
- from kumoapi.typing import Dtype, Stype
10
+ from kumoapi.typing import Stype
9
11
  from typing_extensions import Self
10
12
 
11
- from kumoai import in_notebook
12
- from kumoai.experimental.rfm import utils
13
-
14
-
15
- @dataclass(init=False, repr=False, eq=False)
16
- class Column:
17
- stype: Stype
18
-
19
- def __init__(
20
- self,
21
- name: str,
22
- dtype: Dtype,
23
- stype: Stype,
24
- is_primary_key: bool = False,
25
- is_time_column: bool = False,
26
- ) -> None:
27
- self._name = name
28
- self._dtype = Dtype(dtype)
29
- self._is_primary_key = is_primary_key
30
- self._is_time_column = is_time_column
31
- self.stype = Stype(stype)
32
-
33
- @property
34
- def name(self) -> str:
35
- return self._name
36
-
37
- @property
38
- def dtype(self) -> Dtype:
39
- return self._dtype
40
-
41
- def __setattr__(self, key: str, val: Any) -> None:
42
- if key == 'stype':
43
- if isinstance(val, str):
44
- val = Stype(val)
45
- assert isinstance(val, Stype)
46
- if not val.supports_dtype(self.dtype):
47
- raise ValueError(f"Column '{self.name}' received an "
48
- f"incompatible semantic type (got "
49
- f"dtype='{self.dtype}' and stype='{val}')")
50
- if self._is_primary_key and val != Stype.ID:
51
- raise ValueError(f"Primary key '{self.name}' must have 'ID' "
52
- f"semantic type (got '{val}')")
53
- if self.name == self._is_time_column and val != Stype.timestamp:
54
- raise ValueError(f"Time column '{self.name}' must have "
55
- f"'timestamp' semantic type (got '{val}')")
56
-
57
- super().__setattr__(key, val)
58
-
59
- def __hash__(self) -> int:
60
- return hash((self.name, self.stype, self.dtype))
61
-
62
- def __eq__(self, other: Any) -> bool:
63
- if not isinstance(other, Column):
64
- return False
65
- return hash(self) == hash(other)
66
-
67
- def __repr__(self) -> str:
68
- return (f'{self.__class__.__name__}(name={self.name}, '
69
- f'stype={self.stype}, dtype={self.dtype})')
70
-
71
-
72
- class LocalTable:
73
- r"""A table backed by a :class:`pandas.DataFrame`.
74
-
75
- A :class:`LocalTable` fully specifies the relevant metadata, *i.e.*
76
- selected columns, column semantic types, primary keys and time columns.
77
- :class:`LocalTable` is used to create a :class:`LocalGraph`.
78
-
79
- .. code-block:: python
80
-
81
- import pandas as pd
82
- import kumoai.experimental.rfm as rfm
83
-
84
- # Load data from a CSV file:
85
- df = pd.read_csv("data.csv")
86
-
87
- # Create a table from a `pandas.DataFrame` and infer its metadata ...
88
- table = rfm.LocalTable(df, name="my_table").infer_metadata()
89
-
90
- # ... or create a table explicitly:
91
- table = rfm.LocalTable(
92
- df=df,
93
- name="my_table",
94
- primary_key="id",
95
- time_column="time",
96
- )
97
-
98
- # Verify metadata:
99
- table.print_metadata()
100
-
101
- # Change the semantic type of a column:
102
- table[column].stype = "text"
13
+ from kumoai import in_notebook, in_snowflake_notebook
14
+ from kumoai.experimental.rfm.base import (
15
+ Column,
16
+ DataBackend,
17
+ SourceColumn,
18
+ SourceForeignKey,
19
+ )
20
+ from kumoai.experimental.rfm.infer import (
21
+ contains_categorical,
22
+ contains_id,
23
+ contains_multicategorical,
24
+ contains_timestamp,
25
+ infer_primary_key,
26
+ infer_time_column,
27
+ )
28
+
29
+
30
+ class Table(ABC):
31
+ r"""A :class:`Table` fully specifies the relevant metadata of a single
32
+ table, *i.e.* its selected columns, data types, semantic types, primary
33
+ keys and time columns.
103
34
 
104
35
  Args:
105
- df: The data frame to create the table from.
106
- name: The name of the table.
36
+ name: The name of this table.
37
+ columns: The selected columns of this table.
107
38
  primary_key: The name of the primary key of this table, if it exists.
108
39
  time_column: The name of the time column of this table, if it exists.
40
+ end_time_column: The name of the end time column of this table, if it
41
+ exists.
109
42
  """
110
43
  def __init__(
111
44
  self,
112
- df: pd.DataFrame,
113
45
  name: str,
46
+ columns: Optional[Sequence[str]] = None,
114
47
  primary_key: Optional[str] = None,
115
48
  time_column: Optional[str] = None,
49
+ end_time_column: Optional[str] = None,
116
50
  ) -> None:
117
51
 
118
- if df.empty:
119
- raise ValueError("Data frame must have at least one row")
120
- if isinstance(df.columns, pd.MultiIndex):
121
- raise ValueError("Data frame must not have a multi-index")
122
- if not df.columns.is_unique:
123
- raise ValueError("Data frame must have unique column names")
124
- if any(col == '' for col in df.columns):
125
- raise ValueError("Data frame must have non-empty column names")
126
-
127
- df = df.copy(deep=False)
128
-
129
- self._data = df
130
52
  self._name = name
131
53
  self._primary_key: Optional[str] = None
132
54
  self._time_column: Optional[str] = None
55
+ self._end_time_column: Optional[str] = None
56
+
57
+ if len(self._source_column_dict) == 0:
58
+ raise ValueError(f"Table '{name}' does not hold any column with "
59
+ f"a supported data type")
60
+
61
+ primary_keys = [
62
+ column.name for column in self._source_column_dict.values()
63
+ if column.is_primary_key
64
+ ]
65
+ if len(primary_keys) == 1: # NOTE No composite keys yet.
66
+ if primary_key is not None and primary_key != primary_keys[0]:
67
+ raise ValueError(f"Found duplicate primary key "
68
+ f"definition '{primary_key}' and "
69
+ f"'{primary_keys[0]}' in table '{name}'")
70
+ primary_key = primary_keys[0]
71
+
72
+ unique_keys = [
73
+ column.name for column in self._source_column_dict.values()
74
+ if column.is_unique_key
75
+ ]
76
+ if primary_key is None and len(unique_keys) == 1:
77
+ primary_key = unique_keys[0]
133
78
 
134
79
  self._columns: Dict[str, Column] = {}
135
- for column_name in df.columns:
80
+ for column_name in columns or list(self._source_column_dict.keys()):
136
81
  self.add_column(column_name)
137
82
 
138
83
  if primary_key is not None:
84
+ if primary_key not in self:
85
+ self.add_column(primary_key)
139
86
  self.primary_key = primary_key
140
87
 
141
88
  if time_column is not None:
89
+ if time_column not in self:
90
+ self.add_column(time_column)
142
91
  self.time_column = time_column
143
92
 
93
+ if end_time_column is not None:
94
+ if end_time_column not in self:
95
+ self.add_column(end_time_column)
96
+ self.end_time_column = end_time_column
97
+
144
98
  @property
145
99
  def name(self) -> str:
146
- r"""The name of the table."""
100
+ r"""The name of this table."""
147
101
  return self._name
148
102
 
149
103
  # Data column #############################################################
@@ -187,24 +141,35 @@ class LocalTable:
187
141
  raise KeyError(f"Column '{name}' already exists in table "
188
142
  f"'{self.name}'")
189
143
 
190
- if name not in self._data.columns:
191
- raise KeyError(f"Column '{name}' does not exist in the underyling "
192
- f"data frame")
144
+ if name not in self._source_column_dict:
145
+ raise KeyError(f"Column '{name}' does not exist in the underlying "
146
+ f"source table")
193
147
 
194
148
  try:
195
- dtype = utils.to_dtype(self._data[name])
149
+ dtype = self._source_column_dict[name].dtype
196
150
  except Exception as e:
197
- raise RuntimeError(f"Data type inference for column '{name}' in "
198
- f"table '{self.name}' failed. Consider "
199
- f"changing the data type of the column or "
200
- f"removing it from the table.") from e
151
+ raise RuntimeError(f"Could not obtain data type for column "
152
+ f"'{name}' in table '{self.name}'. Change "
153
+ f"the data type of the column in the source "
154
+ f"table or remove it from the table.") from e
155
+
201
156
  try:
202
- stype = utils.infer_stype(self._data[name], name, dtype)
157
+ ser = self._sample_df[name]
158
+ if contains_id(ser, name, dtype):
159
+ stype = Stype.ID
160
+ elif contains_timestamp(ser, name, dtype):
161
+ stype = Stype.timestamp
162
+ elif contains_multicategorical(ser, name, dtype):
163
+ stype = Stype.multicategorical
164
+ elif contains_categorical(ser, name, dtype):
165
+ stype = Stype.categorical
166
+ else:
167
+ stype = dtype.default_stype
203
168
  except Exception as e:
204
- raise RuntimeError(f"Semantic type inference for column '{name}' "
205
- f"in table '{self.name}' failed. Consider "
206
- f"changing the data type of the column or "
207
- f"removing it from the table.") from e
169
+ raise RuntimeError(f"Could not obtain semantic type for column "
170
+ f"'{name}' in table '{self.name}'. Change "
171
+ f"the data type of the column in the source "
172
+ f"table or remove it from the table.") from e
208
173
 
209
174
  self._columns[name] = Column(
210
175
  name=name,
@@ -230,6 +195,8 @@ class LocalTable:
230
195
  self.primary_key = None
231
196
  if self._time_column == name:
232
197
  self.time_column = None
198
+ if self._end_time_column == name:
199
+ self.end_time_column = None
233
200
  del self._columns[name]
234
201
 
235
202
  return self
@@ -253,9 +220,8 @@ class LocalTable:
253
220
  :class:`ValueError` if the primary key has a non-ID semantic type or
254
221
  if the column name does not match a column in the data frame.
255
222
  """
256
- if not self.has_primary_key():
223
+ if self._primary_key is None:
257
224
  return None
258
- assert self._primary_key is not None
259
225
  return self[self._primary_key]
260
226
 
261
227
  @primary_key.setter
@@ -264,6 +230,10 @@ class LocalTable:
264
230
  raise ValueError(f"Cannot specify column '{name}' as a primary "
265
231
  f"key since it is already defined to be a time "
266
232
  f"column")
233
+ if name is not None and name == self._end_time_column:
234
+ raise ValueError(f"Cannot specify column '{name}' as a primary "
235
+ f"key since it is already defined to be an end "
236
+ f"time column")
267
237
 
268
238
  if self.primary_key is not None:
269
239
  self.primary_key._is_primary_key = False
@@ -295,9 +265,8 @@ class LocalTable:
295
265
  :class:`ValueError` if the time column has a non-timestamp semantic
296
266
  type or if the column name does not match a column in the data frame.
297
267
  """
298
- if not self.has_time_column():
268
+ if self._time_column is None:
299
269
  return None
300
- assert self._time_column is not None
301
270
  return self[self._time_column]
302
271
 
303
272
  @time_column.setter
@@ -306,6 +275,10 @@ class LocalTable:
306
275
  raise ValueError(f"Cannot specify column '{name}' as a time "
307
276
  f"column since it is already defined to be a "
308
277
  f"primary key")
278
+ if name is not None and name == self._end_time_column:
279
+ raise ValueError(f"Cannot specify column '{name}' as a time "
280
+ f"column since it is already defined to be an "
281
+ f"end time column")
309
282
 
310
283
  if self.time_column is not None:
311
284
  self.time_column._is_time_column = False
@@ -318,6 +291,52 @@ class LocalTable:
318
291
  self[name]._is_time_column = True
319
292
  self._time_column = name
320
293
 
294
+ # End Time column #########################################################
295
+
296
+ def has_end_time_column(self) -> bool:
297
+ r"""Returns ``True`` if this table has an end time column; ``False``
298
+ otherwise.
299
+ """
300
+ return self._end_time_column is not None
301
+
302
+ @property
303
+ def end_time_column(self) -> Optional[Column]:
304
+ r"""The end time column of this table.
305
+
306
+ The getter returns the end time column of this table, or ``None`` if no
307
+ such end time column is present.
308
+
309
+ The setter sets a column as an end time column on this table, and
310
+ raises a :class:`ValueError` if the end time column has a non-timestamp
311
+ semantic type or if the column name does not match a column in the data
312
+ frame.
313
+ """
314
+ if self._end_time_column is None:
315
+ return None
316
+ return self[self._end_time_column]
317
+
318
+ @end_time_column.setter
319
+ def end_time_column(self, name: Optional[str]) -> None:
320
+ if name is not None and name == self._primary_key:
321
+ raise ValueError(f"Cannot specify column '{name}' as an end time "
322
+ f"column since it is already defined to be a "
323
+ f"primary key")
324
+ if name is not None and name == self._time_column:
325
+ raise ValueError(f"Cannot specify column '{name}' as an end time "
326
+ f"column since it is already defined to be a "
327
+ f"time column")
328
+
329
+ if self.end_time_column is not None:
330
+ self.end_time_column._is_end_time_column = False
331
+
332
+ if name is None:
333
+ self._end_time_column = None
334
+ return
335
+
336
+ self[name].stype = Stype.timestamp
337
+ self[name]._is_end_time_column = True
338
+ self._end_time_column = name
339
+
321
340
  # Metadata ################################################################
322
341
 
323
342
  @property
@@ -326,16 +345,18 @@ class LocalTable:
326
345
  information about the columns in this table.
327
346
 
328
347
  The returned dataframe has columns ``name``, ``dtype``, ``stype``,
329
- ``is_primary_key``, and ``is_time_column``, which provide an aggregate
330
- view of the properties of the columns of this table.
348
+ ``is_primary_key``, ``is_time_column`` and ``is_end_time_column``,
349
+ which provide an aggregate view of the properties of the columns of
350
+ this table.
331
351
 
332
352
  Example:
353
+ >>> # doctest: +SKIP
333
354
  >>> import kumoai.experimental.rfm as rfm
334
355
  >>> table = rfm.LocalTable(df=..., name=...).infer_metadata()
335
356
  >>> table.metadata
336
- name dtype stype is_primary_key is_time_column
337
- 0 CustomerID float64 ID True False
338
- """
357
+ name dtype stype is_primary_key is_time_column is_end_time_column
358
+ 0 CustomerID float64 ID True False False
359
+ """ # noqa: E501
339
360
  cols = self.columns
340
361
 
341
362
  return pd.DataFrame({
@@ -355,15 +376,28 @@ class LocalTable:
355
376
  dtype=bool,
356
377
  data=[self._time_column == c.name for c in cols],
357
378
  ),
379
+ 'is_end_time_column':
380
+ pd.Series(
381
+ dtype=bool,
382
+ data=[self._end_time_column == c.name for c in cols],
383
+ ),
358
384
  })
359
385
 
360
386
  def print_metadata(self) -> None:
361
- r"""Prints the :meth:`~LocalTable.metadata` of the table."""
362
- if in_notebook():
387
+ r"""Prints the :meth:`~metadata` of this table."""
388
+ num_rows_repr = ''
389
+ if self._num_rows is not None:
390
+ num_rows_repr = f' ({self._num_rows:,} rows)'
391
+
392
+ if in_snowflake_notebook():
393
+ import streamlit as st
394
+ md_repr = f"### 🏷️ Metadata of Table `{self.name}`{num_rows_repr}"
395
+ st.markdown(md_repr)
396
+ st.dataframe(self.metadata, hide_index=True)
397
+ elif in_notebook():
363
398
  from IPython.display import Markdown, display
364
- display(
365
- Markdown(f"### 🏷️ Metadata of Table `{self.name}` "
366
- f"({len(self._data):,} rows)"))
399
+ md_repr = f"### 🏷️ Metadata of Table `{self.name}`{num_rows_repr}"
400
+ display(Markdown(md_repr))
367
401
  df = self.metadata
368
402
  try:
369
403
  if hasattr(df.style, 'hide'):
@@ -373,8 +407,7 @@ class LocalTable:
373
407
  except ImportError:
374
408
  print(df.to_string(index=False)) # missing jinja2
375
409
  else:
376
- print(f"🏷️ Metadata of Table '{self.name}' "
377
- f"({len(self._data):,} rows):")
410
+ print(f"🏷️ Metadata of Table '{self.name}'{num_rows_repr}")
378
411
  print(self.metadata.to_string(index=False))
379
412
 
380
413
  def infer_metadata(self, verbose: bool = True) -> Self:
@@ -404,9 +437,9 @@ class LocalTable:
404
437
  column.name for column in self.columns if is_candidate(column)
405
438
  ]
406
439
 
407
- if primary_key := utils.detect_primary_key(
440
+ if primary_key := infer_primary_key(
408
441
  table_name=self.name,
409
- df=self._data,
442
+ df=self._sample_df,
410
443
  candidates=candidates,
411
444
  ):
412
445
  self.primary_key = primary_key
@@ -417,8 +450,12 @@ class LocalTable:
417
450
  candidates = [
418
451
  column.name for column in self.columns
419
452
  if column.stype == Stype.timestamp
453
+ and column.name != self._end_time_column
420
454
  ]
421
- if time_column := utils.detect_time_column(self._data, candidates):
455
+ if time_column := infer_time_column(
456
+ df=self._sample_df,
457
+ candidates=candidates,
458
+ ):
422
459
  self.time_column = time_column
423
460
  logs.append(f"time column '{time_column}'")
424
461
 
@@ -430,24 +467,26 @@ class LocalTable:
430
467
  # Helpers #################################################################
431
468
 
432
469
  def _to_api_table_definition(self) -> TableDefinition:
433
- cols: List[ColumnDefinition] = []
434
- for col in self.columns:
435
- cols.append(ColumnDefinition(col.name, col.stype, col.dtype))
436
- pkey = self._primary_key
437
- time_col = self._time_column
438
- source_table = UnavailableSourceTable(table=self.name)
439
-
440
470
  return TableDefinition(
441
- cols=cols,
442
- source_table=source_table,
443
- pkey=pkey,
444
- time_col=time_col,
471
+ cols=[
472
+ ColumnDefinition(col.name, col.stype, col.dtype)
473
+ for col in self.columns
474
+ ],
475
+ source_table=UnavailableSourceTable(table=self.name),
476
+ pkey=self._primary_key,
477
+ time_col=self._time_column,
478
+ end_time_col=self._end_time_column,
445
479
  )
446
480
 
447
481
  # Python builtins #########################################################
448
482
 
449
483
  def __hash__(self) -> int:
450
- return hash(tuple(self.columns + [self.primary_key, self.time_column]))
484
+ special_columns = [
485
+ self.primary_key,
486
+ self.time_column,
487
+ self.end_time_column,
488
+ ]
489
+ return hash(tuple(self.columns + special_columns))
451
490
 
452
491
  def __contains__(self, name: str) -> bool:
453
492
  return self.has_column(name)
@@ -464,4 +503,54 @@ class LocalTable:
464
503
  f' num_columns={len(self.columns)},\n'
465
504
  f' primary_key={self._primary_key},\n'
466
505
  f' time_column={self._time_column},\n'
506
+ f' end_time_column={self._end_time_column},\n'
467
507
  f')')
508
+
509
+ # Abstract Methods ########################################################
510
+
511
+ @property
512
+ @abstractmethod
513
+ def backend(self) -> DataBackend:
514
+ r"""The data backend of this table."""
515
+ pass
516
+
517
+ @cached_property
518
+ def _source_column_dict(self) -> Dict[str, SourceColumn]:
519
+ return {col.name: col for col in self._get_source_columns()}
520
+
521
+ @abstractmethod
522
+ def _get_source_columns(self) -> List[SourceColumn]:
523
+ pass
524
+
525
+ @cached_property
526
+ def _source_foreign_key_dict(self) -> Dict[str, SourceForeignKey]:
527
+ fkeys = self._get_source_foreign_keys()
528
+ # NOTE Drop all keys that link to different primary keys in the same
529
+ # table since we don't support composite keys yet:
530
+ table_pkeys: Dict[str, Set[str]] = defaultdict(set)
531
+ for fkey in fkeys:
532
+ table_pkeys[fkey.dst_table].add(fkey.primary_key)
533
+ return {
534
+ fkey.name: fkey
535
+ for fkey in fkeys if len(table_pkeys[fkey.dst_table]) == 1
536
+ }
537
+
538
+ @abstractmethod
539
+ def _get_source_foreign_keys(self) -> List[SourceForeignKey]:
540
+ pass
541
+
542
+ @cached_property
543
+ def _sample_df(self) -> pd.DataFrame:
544
+ return self._get_sample_df()
545
+
546
+ @abstractmethod
547
+ def _get_sample_df(self) -> pd.DataFrame:
548
+ pass
549
+
550
+ @cached_property
551
+ def _num_rows(self) -> Optional[int]:
552
+ return self._get_num_rows()
553
+
554
+ @abstractmethod
555
+ def _get_num_rows(self) -> Optional[int]:
556
+ pass