kumoai 2.14.0.dev202512151351__cp313-cp313-macosx_11_0_arm64.whl → 2.14.0.dev202512211732__cp313-cp313-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. kumoai/_version.py +1 -1
  2. kumoai/experimental/rfm/__init__.py +33 -8
  3. kumoai/experimental/rfm/authenticate.py +3 -4
  4. kumoai/experimental/rfm/backend/local/graph_store.py +25 -25
  5. kumoai/experimental/rfm/backend/local/table.py +16 -21
  6. kumoai/experimental/rfm/backend/snow/sampler.py +22 -34
  7. kumoai/experimental/rfm/backend/snow/table.py +67 -33
  8. kumoai/experimental/rfm/backend/sqlite/__init__.py +2 -2
  9. kumoai/experimental/rfm/backend/sqlite/sampler.py +21 -26
  10. kumoai/experimental/rfm/backend/sqlite/table.py +54 -26
  11. kumoai/experimental/rfm/base/__init__.py +8 -0
  12. kumoai/experimental/rfm/base/column.py +14 -12
  13. kumoai/experimental/rfm/base/column_expression.py +50 -0
  14. kumoai/experimental/rfm/base/sql_sampler.py +31 -3
  15. kumoai/experimental/rfm/base/sql_table.py +229 -0
  16. kumoai/experimental/rfm/base/table.py +162 -143
  17. kumoai/experimental/rfm/graph.py +242 -95
  18. kumoai/experimental/rfm/infer/__init__.py +6 -4
  19. kumoai/experimental/rfm/infer/dtype.py +3 -3
  20. kumoai/experimental/rfm/infer/pkey.py +4 -2
  21. kumoai/experimental/rfm/infer/stype.py +35 -0
  22. kumoai/experimental/rfm/infer/time_col.py +1 -2
  23. kumoai/experimental/rfm/pquery/executor.py +27 -27
  24. kumoai/experimental/rfm/pquery/pandas_executor.py +29 -31
  25. kumoai/experimental/rfm/rfm.py +86 -80
  26. kumoai/experimental/rfm/sagemaker.py +4 -4
  27. kumoai/utils/__init__.py +1 -2
  28. kumoai/utils/progress_logger.py +178 -12
  29. {kumoai-2.14.0.dev202512151351.dist-info → kumoai-2.14.0.dev202512211732.dist-info}/METADATA +2 -1
  30. {kumoai-2.14.0.dev202512151351.dist-info → kumoai-2.14.0.dev202512211732.dist-info}/RECORD +33 -30
  31. {kumoai-2.14.0.dev202512151351.dist-info → kumoai-2.14.0.dev202512211732.dist-info}/WHEEL +0 -0
  32. {kumoai-2.14.0.dev202512151351.dist-info → kumoai-2.14.0.dev202512211732.dist-info}/licenses/LICENSE +0 -0
  33. {kumoai-2.14.0.dev202512151351.dist-info → kumoai-2.14.0.dev202512211732.dist-info}/top_level.txt +0 -0
@@ -1,9 +1,9 @@
1
1
  from abc import ABC, abstractmethod
2
- from collections import defaultdict
2
+ from collections.abc import Sequence
3
3
  from functools import cached_property
4
- from typing import Dict, List, Optional, Sequence, Set
5
4
 
6
5
  import pandas as pd
6
+ from kumoapi.model_plan import MissingType
7
7
  from kumoapi.source_table import UnavailableSourceTable
8
8
  from kumoapi.table import Column as ColumnDefinition
9
9
  from kumoapi.table import TableDefinition
@@ -11,18 +11,10 @@ from kumoapi.typing import Stype
11
11
  from typing_extensions import Self
12
12
 
13
13
  from kumoai import in_notebook, in_snowflake_notebook
14
- from kumoai.experimental.rfm.base import (
15
- Column,
16
- DataBackend,
17
- SourceColumn,
18
- SourceForeignKey,
19
- )
14
+ from kumoai.experimental.rfm.base import Column, DataBackend, SourceColumn
20
15
  from kumoai.experimental.rfm.infer import (
21
- contains_categorical,
22
- contains_id,
23
- contains_multicategorical,
24
- contains_timestamp,
25
16
  infer_primary_key,
17
+ infer_stype,
26
18
  infer_time_column,
27
19
  )
28
20
 
@@ -43,44 +35,32 @@ class Table(ABC):
43
35
  def __init__(
44
36
  self,
45
37
  name: str,
46
- columns: Optional[Sequence[str]] = None,
47
- primary_key: Optional[str] = None,
48
- time_column: Optional[str] = None,
49
- end_time_column: Optional[str] = None,
38
+ columns: Sequence[str] | None = None,
39
+ primary_key: MissingType | str | None = MissingType.VALUE,
40
+ time_column: str | None = None,
41
+ end_time_column: str | None = None,
50
42
  ) -> None:
51
43
 
52
44
  self._name = name
53
- self._primary_key: Optional[str] = None
54
- self._time_column: Optional[str] = None
55
- self._end_time_column: Optional[str] = None
56
-
57
- if len(self._source_column_dict) == 0:
58
- raise ValueError(f"Table '{name}' does not hold any column with "
59
- f"a supported data type")
45
+ self._primary_key: str | None = None
46
+ self._time_column: str | None = None
47
+ self._end_time_column: str | None = None
60
48
 
61
- primary_keys = [
62
- column.name for column in self._source_column_dict.values()
63
- if column.is_primary_key
64
- ]
65
- if len(primary_keys) == 1: # NOTE No composite keys yet.
66
- if primary_key is not None and primary_key != primary_keys[0]:
67
- raise ValueError(f"Found duplicate primary key "
68
- f"definition '{primary_key}' and "
69
- f"'{primary_keys[0]}' in table '{name}'")
70
- primary_key = primary_keys[0]
71
-
72
- unique_keys = [
73
- column.name for column in self._source_column_dict.values()
74
- if column.is_unique_key
75
- ]
76
- if primary_key is None and len(unique_keys) == 1:
77
- primary_key = unique_keys[0]
49
+ if columns is None:
50
+ columns = list(self._source_column_dict.keys())
78
51
 
79
- self._columns: Dict[str, Column] = {}
80
- for column_name in columns or list(self._source_column_dict.keys()):
52
+ self._columns: dict[str, Column] = {}
53
+ for column_name in columns:
81
54
  self.add_column(column_name)
82
55
 
83
- if primary_key is not None:
56
+ if isinstance(primary_key, MissingType):
57
+ # Inference from source column metadata:
58
+ if '_source_column_dict' in self.__dict__:
59
+ primary_key = self._source_primary_key
60
+ if (primary_key is not None and primary_key in self
61
+ and self[primary_key].is_physical):
62
+ self.primary_key = primary_key
63
+ elif primary_key is not None:
84
64
  if primary_key not in self:
85
65
  self.add_column(primary_key)
86
66
  self.primary_key = primary_key
@@ -100,7 +80,7 @@ class Table(ABC):
100
80
  r"""The name of this table."""
101
81
  return self._name
102
82
 
103
- # Data column #############################################################
83
+ # Column ##################################################################
104
84
 
105
85
  def has_column(self, name: str) -> bool:
106
86
  r"""Returns ``True`` if this table holds a column with name ``name``;
@@ -122,7 +102,7 @@ class Table(ABC):
122
102
  return self._columns[name]
123
103
 
124
104
  @property
125
- def columns(self) -> List[Column]:
105
+ def columns(self) -> list[Column]:
126
106
  r"""Returns a list of :class:`Column` objects that represent the
127
107
  columns in this table.
128
108
  """
@@ -145,36 +125,22 @@ class Table(ABC):
145
125
  raise KeyError(f"Column '{name}' does not exist in the underlying "
146
126
  f"source table")
147
127
 
148
- try:
149
- dtype = self._source_column_dict[name].dtype
150
- except Exception as e:
151
- raise RuntimeError(f"Could not obtain data type for column "
152
- f"'{name}' in table '{self.name}'. Change "
153
- f"the data type of the column in the source "
154
- f"table or remove it from the table.") from e
128
+ dtype = self._source_column_dict[name].dtype
155
129
 
130
+ ser = self._source_sample_df[name]
156
131
  try:
157
- ser = self._sample_df[name]
158
- if contains_id(ser, name, dtype):
159
- stype = Stype.ID
160
- elif contains_timestamp(ser, name, dtype):
161
- stype = Stype.timestamp
162
- elif contains_multicategorical(ser, name, dtype):
163
- stype = Stype.multicategorical
164
- elif contains_categorical(ser, name, dtype):
165
- stype = Stype.categorical
166
- else:
167
- stype = dtype.default_stype
132
+ stype = infer_stype(ser, name, dtype)
168
133
  except Exception as e:
169
134
  raise RuntimeError(f"Could not obtain semantic type for column "
170
- f"'{name}' in table '{self.name}'. Change "
171
- f"the data type of the column in the source "
172
- f"table or remove it from the table.") from e
135
+ f"'{name}' with data type '{dtype}' in table "
136
+ f"'{self.name}'. Change the data type of the "
137
+ f"column in the source table or remove it from "
138
+ f"this table.") from e
173
139
 
174
140
  self._columns[name] = Column(
175
141
  name=name,
176
- dtype=dtype,
177
142
  stype=stype,
143
+ dtype=dtype,
178
144
  )
179
145
 
180
146
  return self._columns[name]
@@ -210,7 +176,7 @@ class Table(ABC):
210
176
  return self._primary_key is not None
211
177
 
212
178
  @property
213
- def primary_key(self) -> Optional[Column]:
179
+ def primary_key(self) -> Column | None:
214
180
  r"""The primary key column of this table.
215
181
 
216
182
  The getter returns the primary key column of this table, or ``None`` if
@@ -225,7 +191,7 @@ class Table(ABC):
225
191
  return self[self._primary_key]
226
192
 
227
193
  @primary_key.setter
228
- def primary_key(self, name: Optional[str]) -> None:
194
+ def primary_key(self, name: str | None) -> None:
229
195
  if name is not None and name == self._time_column:
230
196
  raise ValueError(f"Cannot specify column '{name}' as a primary "
231
197
  f"key since it is already defined to be a time "
@@ -255,7 +221,7 @@ class Table(ABC):
255
221
  return self._time_column is not None
256
222
 
257
223
  @property
258
- def time_column(self) -> Optional[Column]:
224
+ def time_column(self) -> Column | None:
259
225
  r"""The time column of this table.
260
226
 
261
227
  The getter returns the time column of this table, or ``None`` if no
@@ -270,7 +236,7 @@ class Table(ABC):
270
236
  return self[self._time_column]
271
237
 
272
238
  @time_column.setter
273
- def time_column(self, name: Optional[str]) -> None:
239
+ def time_column(self, name: str | None) -> None:
274
240
  if name is not None and name == self._primary_key:
275
241
  raise ValueError(f"Cannot specify column '{name}' as a time "
276
242
  f"column since it is already defined to be a "
@@ -300,7 +266,7 @@ class Table(ABC):
300
266
  return self._end_time_column is not None
301
267
 
302
268
  @property
303
- def end_time_column(self) -> Optional[Column]:
269
+ def end_time_column(self) -> Column | None:
304
270
  r"""The end time column of this table.
305
271
 
306
272
  The getter returns the end time column of this table, or ``None`` if no
@@ -316,7 +282,7 @@ class Table(ABC):
316
282
  return self[self._end_time_column]
317
283
 
318
284
  @end_time_column.setter
319
- def end_time_column(self, name: Optional[str]) -> None:
285
+ def end_time_column(self, name: str | None) -> None:
320
286
  if name is not None and name == self._primary_key:
321
287
  raise ValueError(f"Cannot specify column '{name}' as an end time "
322
288
  f"column since it is already defined to be a "
@@ -410,8 +376,91 @@ class Table(ABC):
410
376
  print(f"🏷️ Metadata of Table '{self.name}'{num_rows_repr}")
411
377
  print(self.metadata.to_string(index=False))
412
378
 
379
+ def infer_primary_key(self, verbose: bool = True) -> Self:
380
+ r"""Infers the primary key in this table.
381
+
382
+ Args:
383
+ verbose: Whether to print verbose output.
384
+ """
385
+ if self.has_primary_key():
386
+ return self
387
+
388
+ def _set_primary_key(primary_key: str) -> None:
389
+ self.primary_key = primary_key
390
+ if verbose:
391
+ print(f"Detected primary key '{primary_key}' in table "
392
+ f"'{self.name}'")
393
+
394
+ # Inference from source column metadata:
395
+ if '_source_column_dict' in self.__dict__:
396
+ primary_key = self._source_primary_key
397
+ if (primary_key is not None and primary_key in self
398
+ and self[primary_key].is_physical):
399
+ _set_primary_key(primary_key)
400
+ return self
401
+
402
+ unique_keys = [
403
+ column.name for column in self._source_column_dict.values()
404
+ if column.is_unique_key
405
+ ]
406
+ if (len(unique_keys) == 1 # NOTE No composite keys yet.
407
+ and unique_keys[0] in self
408
+ and self[unique_keys[0]].is_physical):
409
+ _set_primary_key(unique_keys[0])
410
+ return self
411
+
412
+ # Heuristic-based inference:
413
+ candidates = [
414
+ column.name for column in self.columns if column.stype == Stype.ID
415
+ ]
416
+ if len(candidates) == 0:
417
+ for column in self.columns:
418
+ if self.name.lower() == column.name.lower():
419
+ candidates.append(column.name)
420
+ elif (self.name.lower().endswith('s')
421
+ and self.name.lower()[:-1] == column.name.lower()):
422
+ candidates.append(column.name)
423
+
424
+ if primary_key := infer_primary_key(
425
+ table_name=self.name,
426
+ df=self._sample_current_df(columns=candidates),
427
+ candidates=candidates,
428
+ ):
429
+ _set_primary_key(primary_key)
430
+ return self
431
+
432
+ return self
433
+
434
+ def infer_time_column(self, verbose: bool = True) -> Self:
435
+ r"""Infers the time column in this table.
436
+
437
+ Args:
438
+ verbose: Whether to print verbose output.
439
+ """
440
+ if self.has_time_column():
441
+ return self
442
+
443
+ # Heuristic-based inference:
444
+ candidates = [
445
+ column.name for column in self.columns
446
+ if column.stype == Stype.timestamp
447
+ and column.name != self._end_time_column
448
+ ]
449
+
450
+ if time_column := infer_time_column(
451
+ df=self._sample_current_df(columns=candidates),
452
+ candidates=candidates,
453
+ ):
454
+ self.time_column = time_column
455
+
456
+ if verbose:
457
+ print(f"Detected time column '{time_column}' in table "
458
+ f"'{self.name}'")
459
+
460
+ return self
461
+
413
462
  def infer_metadata(self, verbose: bool = True) -> Self:
414
- r"""Infers metadata, *i.e.*, primary keys and time columns, in the
463
+ r"""Infers metadata, *i.e.*, primary keys and time columns, in this
415
464
  table.
416
465
 
417
466
  Args:
@@ -419,45 +468,15 @@ class Table(ABC):
419
468
  """
420
469
  logs = []
421
470
 
422
- # Try to detect primary key if not set:
423
471
  if not self.has_primary_key():
472
+ self.infer_primary_key(verbose=False)
473
+ if self.has_primary_key():
474
+ logs.append(f"primary key '{self._primary_key}'")
424
475
 
425
- def is_candidate(column: Column) -> bool:
426
- if column.stype == Stype.ID:
427
- return True
428
- if all(column.stype != Stype.ID for column in self.columns):
429
- if self.name == column.name:
430
- return True
431
- if (self.name.endswith('s')
432
- and self.name[:-1] == column.name):
433
- return True
434
- return False
435
-
436
- candidates = [
437
- column.name for column in self.columns if is_candidate(column)
438
- ]
439
-
440
- if primary_key := infer_primary_key(
441
- table_name=self.name,
442
- df=self._sample_df,
443
- candidates=candidates,
444
- ):
445
- self.primary_key = primary_key
446
- logs.append(f"primary key '{primary_key}'")
447
-
448
- # Try to detect time column if not set:
449
476
  if not self.has_time_column():
450
- candidates = [
451
- column.name for column in self.columns
452
- if column.stype == Stype.timestamp
453
- and column.name != self._end_time_column
454
- ]
455
- if time_column := infer_time_column(
456
- df=self._sample_df,
457
- candidates=candidates,
458
- ):
459
- self.time_column = time_column
460
- logs.append(f"time column '{time_column}'")
477
+ self.infer_time_column(verbose=False)
478
+ if self.has_time_column():
479
+ logs.append(f"time column '{self._time_column}'")
461
480
 
462
481
  if verbose and len(logs) > 0:
463
482
  print(f"Detected {' and '.join(logs)} in table '{self.name}'")
@@ -478,6 +497,36 @@ class Table(ABC):
478
497
  end_time_col=self._end_time_column,
479
498
  )
480
499
 
500
+ @cached_property
501
+ def _source_column_dict(self) -> dict[str, SourceColumn]:
502
+ source_columns = self._get_source_columns()
503
+ if len(source_columns) == 0:
504
+ raise ValueError(f"Table '{self.name}' does not hold any column "
505
+ f"with a supported data type")
506
+ return {column.name: column for column in source_columns}
507
+
508
+ @cached_property
509
+ def _source_sample_df(self) -> pd.DataFrame:
510
+ return self._get_source_sample_df()
511
+
512
+ @property
513
+ def _source_primary_key(self) -> str | None:
514
+ primary_keys = [
515
+ column.name for column in self._source_column_dict.values()
516
+ if column.is_primary_key
517
+ ]
518
+ if len(primary_keys) == 1: # NOTE No composite keys yet.
519
+ return primary_keys[0]
520
+
521
+ return None
522
+
523
+ @cached_property
524
+ def _num_rows(self) -> int | None:
525
+ return self._get_num_rows()
526
+
527
+ def _sample_current_df(self, columns: Sequence[str]) -> pd.DataFrame:
528
+ return self._source_sample_df[columns]
529
+
481
530
  # Python builtins #########################################################
482
531
 
483
532
  def __hash__(self) -> int:
@@ -512,45 +561,15 @@ class Table(ABC):
512
561
  @abstractmethod
513
562
  def backend(self) -> DataBackend:
514
563
  r"""The data backend of this table."""
515
- pass
516
-
517
- @cached_property
518
- def _source_column_dict(self) -> Dict[str, SourceColumn]:
519
- return {col.name: col for col in self._get_source_columns()}
520
564
 
521
565
  @abstractmethod
522
- def _get_source_columns(self) -> List[SourceColumn]:
566
+ def _get_source_columns(self) -> list[SourceColumn]:
523
567
  pass
524
568
 
525
- @cached_property
526
- def _source_foreign_key_dict(self) -> Dict[str, SourceForeignKey]:
527
- fkeys = self._get_source_foreign_keys()
528
- # NOTE Drop all keys that link to different primary keys in the same
529
- # table since we don't support composite keys yet:
530
- table_pkeys: Dict[str, Set[str]] = defaultdict(set)
531
- for fkey in fkeys:
532
- table_pkeys[fkey.dst_table].add(fkey.primary_key)
533
- return {
534
- fkey.name: fkey
535
- for fkey in fkeys if len(table_pkeys[fkey.dst_table]) == 1
536
- }
537
-
538
569
  @abstractmethod
539
- def _get_source_foreign_keys(self) -> List[SourceForeignKey]:
570
+ def _get_source_sample_df(self) -> pd.DataFrame:
540
571
  pass
541
572
 
542
- @cached_property
543
- def _sample_df(self) -> pd.DataFrame:
544
- return self._get_sample_df()
545
-
546
- @abstractmethod
547
- def _get_sample_df(self) -> pd.DataFrame:
548
- pass
549
-
550
- @cached_property
551
- def _num_rows(self) -> Optional[int]:
552
- return self._get_num_rows()
553
-
554
573
  @abstractmethod
555
- def _get_num_rows(self) -> Optional[int]:
574
+ def _get_num_rows(self) -> int | None:
556
575
  pass