kumoai 2.13.0.dev202512081731__cp313-cp313-macosx_11_0_arm64.whl → 2.14.0.dev202512211732__cp313-cp313-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. kumoai/_version.py +1 -1
  2. kumoai/client/pquery.py +6 -2
  3. kumoai/experimental/rfm/__init__.py +33 -8
  4. kumoai/experimental/rfm/authenticate.py +3 -4
  5. kumoai/experimental/rfm/backend/local/graph_store.py +40 -83
  6. kumoai/experimental/rfm/backend/local/sampler.py +213 -14
  7. kumoai/experimental/rfm/backend/local/table.py +21 -16
  8. kumoai/experimental/rfm/backend/snow/__init__.py +2 -0
  9. kumoai/experimental/rfm/backend/snow/sampler.py +252 -0
  10. kumoai/experimental/rfm/backend/snow/table.py +101 -49
  11. kumoai/experimental/rfm/backend/sqlite/__init__.py +4 -2
  12. kumoai/experimental/rfm/backend/sqlite/sampler.py +349 -0
  13. kumoai/experimental/rfm/backend/sqlite/table.py +84 -31
  14. kumoai/experimental/rfm/base/__init__.py +25 -6
  15. kumoai/experimental/rfm/base/column.py +14 -12
  16. kumoai/experimental/rfm/base/column_expression.py +50 -0
  17. kumoai/experimental/rfm/base/sampler.py +438 -38
  18. kumoai/experimental/rfm/base/source.py +1 -0
  19. kumoai/experimental/rfm/base/sql_sampler.py +84 -0
  20. kumoai/experimental/rfm/base/sql_table.py +229 -0
  21. kumoai/experimental/rfm/base/table.py +165 -135
  22. kumoai/experimental/rfm/graph.py +266 -102
  23. kumoai/experimental/rfm/infer/__init__.py +6 -4
  24. kumoai/experimental/rfm/infer/dtype.py +3 -3
  25. kumoai/experimental/rfm/infer/pkey.py +4 -2
  26. kumoai/experimental/rfm/infer/stype.py +35 -0
  27. kumoai/experimental/rfm/infer/time_col.py +1 -2
  28. kumoai/experimental/rfm/pquery/executor.py +27 -27
  29. kumoai/experimental/rfm/pquery/pandas_executor.py +30 -32
  30. kumoai/experimental/rfm/rfm.py +299 -230
  31. kumoai/experimental/rfm/sagemaker.py +4 -4
  32. kumoai/pquery/predictive_query.py +10 -6
  33. kumoai/testing/snow.py +50 -0
  34. kumoai/utils/__init__.py +3 -2
  35. kumoai/utils/progress_logger.py +178 -12
  36. kumoai/utils/sql.py +3 -0
  37. {kumoai-2.13.0.dev202512081731.dist-info → kumoai-2.14.0.dev202512211732.dist-info}/METADATA +3 -2
  38. {kumoai-2.13.0.dev202512081731.dist-info → kumoai-2.14.0.dev202512211732.dist-info}/RECORD +41 -35
  39. kumoai/experimental/rfm/local_graph_sampler.py +0 -223
  40. kumoai/experimental/rfm/local_pquery_driver.py +0 -689
  41. {kumoai-2.13.0.dev202512081731.dist-info → kumoai-2.14.0.dev202512211732.dist-info}/WHEEL +0 -0
  42. {kumoai-2.13.0.dev202512081731.dist-info → kumoai-2.14.0.dev202512211732.dist-info}/licenses/LICENSE +0 -0
  43. {kumoai-2.13.0.dev202512081731.dist-info → kumoai-2.14.0.dev202512211732.dist-info}/top_level.txt +0 -0
@@ -1,9 +1,9 @@
1
1
  from abc import ABC, abstractmethod
2
- from collections import defaultdict
2
+ from collections.abc import Sequence
3
3
  from functools import cached_property
4
- from typing import Dict, List, Optional, Sequence, Set
5
4
 
6
5
  import pandas as pd
6
+ from kumoapi.model_plan import MissingType
7
7
  from kumoapi.source_table import UnavailableSourceTable
8
8
  from kumoapi.table import Column as ColumnDefinition
9
9
  from kumoapi.table import TableDefinition
@@ -11,13 +11,10 @@ from kumoapi.typing import Stype
11
11
  from typing_extensions import Self
12
12
 
13
13
  from kumoai import in_notebook, in_snowflake_notebook
14
- from kumoai.experimental.rfm.base import Column, SourceColumn, SourceForeignKey
14
+ from kumoai.experimental.rfm.base import Column, DataBackend, SourceColumn
15
15
  from kumoai.experimental.rfm.infer import (
16
- contains_categorical,
17
- contains_id,
18
- contains_multicategorical,
19
- contains_timestamp,
20
16
  infer_primary_key,
17
+ infer_stype,
21
18
  infer_time_column,
22
19
  )
23
20
 
@@ -38,44 +35,32 @@ class Table(ABC):
38
35
  def __init__(
39
36
  self,
40
37
  name: str,
41
- columns: Optional[Sequence[str]] = None,
42
- primary_key: Optional[str] = None,
43
- time_column: Optional[str] = None,
44
- end_time_column: Optional[str] = None,
38
+ columns: Sequence[str] | None = None,
39
+ primary_key: MissingType | str | None = MissingType.VALUE,
40
+ time_column: str | None = None,
41
+ end_time_column: str | None = None,
45
42
  ) -> None:
46
43
 
47
44
  self._name = name
48
- self._primary_key: Optional[str] = None
49
- self._time_column: Optional[str] = None
50
- self._end_time_column: Optional[str] = None
45
+ self._primary_key: str | None = None
46
+ self._time_column: str | None = None
47
+ self._end_time_column: str | None = None
51
48
 
52
- if len(self._source_column_dict) == 0:
53
- raise ValueError(f"Table '{name}' does not hold any column with "
54
- f"a supported data type")
49
+ if columns is None:
50
+ columns = list(self._source_column_dict.keys())
55
51
 
56
- primary_keys = [
57
- column.name for column in self._source_column_dict.values()
58
- if column.is_primary_key
59
- ]
60
- if len(primary_keys) == 1: # NOTE No composite keys yet.
61
- if primary_key is not None and primary_key != primary_keys[0]:
62
- raise ValueError(f"Found duplicate primary key "
63
- f"definition '{primary_key}' and "
64
- f"'{primary_keys[0]}' in table '{name}'")
65
- primary_key = primary_keys[0]
66
-
67
- unique_keys = [
68
- column.name for column in self._source_column_dict.values()
69
- if column.is_unique_key
70
- ]
71
- if primary_key is None and len(unique_keys) == 1:
72
- primary_key = unique_keys[0]
73
-
74
- self._columns: Dict[str, Column] = {}
75
- for column_name in columns or list(self._source_column_dict.keys()):
52
+ self._columns: dict[str, Column] = {}
53
+ for column_name in columns:
76
54
  self.add_column(column_name)
77
55
 
78
- if primary_key is not None:
56
+ if isinstance(primary_key, MissingType):
57
+ # Inference from source column metadata:
58
+ if '_source_column_dict' in self.__dict__:
59
+ primary_key = self._source_primary_key
60
+ if (primary_key is not None and primary_key in self
61
+ and self[primary_key].is_physical):
62
+ self.primary_key = primary_key
63
+ elif primary_key is not None:
79
64
  if primary_key not in self:
80
65
  self.add_column(primary_key)
81
66
  self.primary_key = primary_key
@@ -95,7 +80,7 @@ class Table(ABC):
95
80
  r"""The name of this table."""
96
81
  return self._name
97
82
 
98
- # Data column #############################################################
83
+ # Column ##################################################################
99
84
 
100
85
  def has_column(self, name: str) -> bool:
101
86
  r"""Returns ``True`` if this table holds a column with name ``name``;
@@ -117,7 +102,7 @@ class Table(ABC):
117
102
  return self._columns[name]
118
103
 
119
104
  @property
120
- def columns(self) -> List[Column]:
105
+ def columns(self) -> list[Column]:
121
106
  r"""Returns a list of :class:`Column` objects that represent the
122
107
  columns in this table.
123
108
  """
@@ -140,36 +125,22 @@ class Table(ABC):
140
125
  raise KeyError(f"Column '{name}' does not exist in the underlying "
141
126
  f"source table")
142
127
 
143
- try:
144
- dtype = self._source_column_dict[name].dtype
145
- except Exception as e:
146
- raise RuntimeError(f"Could not obtain data type for column "
147
- f"'{name}' in table '{self.name}'. Change "
148
- f"the data type of the column in the source "
149
- f"table or remove it from the table.") from e
128
+ dtype = self._source_column_dict[name].dtype
150
129
 
130
+ ser = self._source_sample_df[name]
151
131
  try:
152
- ser = self._sample_df[name]
153
- if contains_id(ser, name, dtype):
154
- stype = Stype.ID
155
- elif contains_timestamp(ser, name, dtype):
156
- stype = Stype.timestamp
157
- elif contains_multicategorical(ser, name, dtype):
158
- stype = Stype.multicategorical
159
- elif contains_categorical(ser, name, dtype):
160
- stype = Stype.categorical
161
- else:
162
- stype = dtype.default_stype
132
+ stype = infer_stype(ser, name, dtype)
163
133
  except Exception as e:
164
134
  raise RuntimeError(f"Could not obtain semantic type for column "
165
- f"'{name}' in table '{self.name}'. Change "
166
- f"the data type of the column in the source "
167
- f"table or remove it from the table.") from e
135
+ f"'{name}' with data type '{dtype}' in table "
136
+ f"'{self.name}'. Change the data type of the "
137
+ f"column in the source table or remove it from "
138
+ f"this table.") from e
168
139
 
169
140
  self._columns[name] = Column(
170
141
  name=name,
171
- dtype=dtype,
172
142
  stype=stype,
143
+ dtype=dtype,
173
144
  )
174
145
 
175
146
  return self._columns[name]
@@ -205,7 +176,7 @@ class Table(ABC):
205
176
  return self._primary_key is not None
206
177
 
207
178
  @property
208
- def primary_key(self) -> Optional[Column]:
179
+ def primary_key(self) -> Column | None:
209
180
  r"""The primary key column of this table.
210
181
 
211
182
  The getter returns the primary key column of this table, or ``None`` if
@@ -220,7 +191,7 @@ class Table(ABC):
220
191
  return self[self._primary_key]
221
192
 
222
193
  @primary_key.setter
223
- def primary_key(self, name: Optional[str]) -> None:
194
+ def primary_key(self, name: str | None) -> None:
224
195
  if name is not None and name == self._time_column:
225
196
  raise ValueError(f"Cannot specify column '{name}' as a primary "
226
197
  f"key since it is already defined to be a time "
@@ -250,7 +221,7 @@ class Table(ABC):
250
221
  return self._time_column is not None
251
222
 
252
223
  @property
253
- def time_column(self) -> Optional[Column]:
224
+ def time_column(self) -> Column | None:
254
225
  r"""The time column of this table.
255
226
 
256
227
  The getter returns the time column of this table, or ``None`` if no
@@ -265,7 +236,7 @@ class Table(ABC):
265
236
  return self[self._time_column]
266
237
 
267
238
  @time_column.setter
268
- def time_column(self, name: Optional[str]) -> None:
239
+ def time_column(self, name: str | None) -> None:
269
240
  if name is not None and name == self._primary_key:
270
241
  raise ValueError(f"Cannot specify column '{name}' as a time "
271
242
  f"column since it is already defined to be a "
@@ -295,7 +266,7 @@ class Table(ABC):
295
266
  return self._end_time_column is not None
296
267
 
297
268
  @property
298
- def end_time_column(self) -> Optional[Column]:
269
+ def end_time_column(self) -> Column | None:
299
270
  r"""The end time column of this table.
300
271
 
301
272
  The getter returns the end time column of this table, or ``None`` if no
@@ -311,7 +282,7 @@ class Table(ABC):
311
282
  return self[self._end_time_column]
312
283
 
313
284
  @end_time_column.setter
314
- def end_time_column(self, name: Optional[str]) -> None:
285
+ def end_time_column(self, name: str | None) -> None:
315
286
  if name is not None and name == self._primary_key:
316
287
  raise ValueError(f"Cannot specify column '{name}' as an end time "
317
288
  f"column since it is already defined to be a "
@@ -405,8 +376,91 @@ class Table(ABC):
405
376
  print(f"🏷️ Metadata of Table '{self.name}'{num_rows_repr}")
406
377
  print(self.metadata.to_string(index=False))
407
378
 
379
+ def infer_primary_key(self, verbose: bool = True) -> Self:
380
+ r"""Infers the primary key in this table.
381
+
382
+ Args:
383
+ verbose: Whether to print verbose output.
384
+ """
385
+ if self.has_primary_key():
386
+ return self
387
+
388
+ def _set_primary_key(primary_key: str) -> None:
389
+ self.primary_key = primary_key
390
+ if verbose:
391
+ print(f"Detected primary key '{primary_key}' in table "
392
+ f"'{self.name}'")
393
+
394
+ # Inference from source column metadata:
395
+ if '_source_column_dict' in self.__dict__:
396
+ primary_key = self._source_primary_key
397
+ if (primary_key is not None and primary_key in self
398
+ and self[primary_key].is_physical):
399
+ _set_primary_key(primary_key)
400
+ return self
401
+
402
+ unique_keys = [
403
+ column.name for column in self._source_column_dict.values()
404
+ if column.is_unique_key
405
+ ]
406
+ if (len(unique_keys) == 1 # NOTE No composite keys yet.
407
+ and unique_keys[0] in self
408
+ and self[unique_keys[0]].is_physical):
409
+ _set_primary_key(unique_keys[0])
410
+ return self
411
+
412
+ # Heuristic-based inference:
413
+ candidates = [
414
+ column.name for column in self.columns if column.stype == Stype.ID
415
+ ]
416
+ if len(candidates) == 0:
417
+ for column in self.columns:
418
+ if self.name.lower() == column.name.lower():
419
+ candidates.append(column.name)
420
+ elif (self.name.lower().endswith('s')
421
+ and self.name.lower()[:-1] == column.name.lower()):
422
+ candidates.append(column.name)
423
+
424
+ if primary_key := infer_primary_key(
425
+ table_name=self.name,
426
+ df=self._sample_current_df(columns=candidates),
427
+ candidates=candidates,
428
+ ):
429
+ _set_primary_key(primary_key)
430
+ return self
431
+
432
+ return self
433
+
434
+ def infer_time_column(self, verbose: bool = True) -> Self:
435
+ r"""Infers the time column in this table.
436
+
437
+ Args:
438
+ verbose: Whether to print verbose output.
439
+ """
440
+ if self.has_time_column():
441
+ return self
442
+
443
+ # Heuristic-based inference:
444
+ candidates = [
445
+ column.name for column in self.columns
446
+ if column.stype == Stype.timestamp
447
+ and column.name != self._end_time_column
448
+ ]
449
+
450
+ if time_column := infer_time_column(
451
+ df=self._sample_current_df(columns=candidates),
452
+ candidates=candidates,
453
+ ):
454
+ self.time_column = time_column
455
+
456
+ if verbose:
457
+ print(f"Detected time column '{time_column}' in table "
458
+ f"'{self.name}'")
459
+
460
+ return self
461
+
408
462
  def infer_metadata(self, verbose: bool = True) -> Self:
409
- r"""Infers metadata, *i.e.*, primary keys and time columns, in the
463
+ r"""Infers metadata, *i.e.*, primary keys and time columns, in this
410
464
  table.
411
465
 
412
466
  Args:
@@ -414,45 +468,15 @@ class Table(ABC):
414
468
  """
415
469
  logs = []
416
470
 
417
- # Try to detect primary key if not set:
418
471
  if not self.has_primary_key():
472
+ self.infer_primary_key(verbose=False)
473
+ if self.has_primary_key():
474
+ logs.append(f"primary key '{self._primary_key}'")
419
475
 
420
- def is_candidate(column: Column) -> bool:
421
- if column.stype == Stype.ID:
422
- return True
423
- if all(column.stype != Stype.ID for column in self.columns):
424
- if self.name == column.name:
425
- return True
426
- if (self.name.endswith('s')
427
- and self.name[:-1] == column.name):
428
- return True
429
- return False
430
-
431
- candidates = [
432
- column.name for column in self.columns if is_candidate(column)
433
- ]
434
-
435
- if primary_key := infer_primary_key(
436
- table_name=self.name,
437
- df=self._sample_df,
438
- candidates=candidates,
439
- ):
440
- self.primary_key = primary_key
441
- logs.append(f"primary key '{primary_key}'")
442
-
443
- # Try to detect time column if not set:
444
476
  if not self.has_time_column():
445
- candidates = [
446
- column.name for column in self.columns
447
- if column.stype == Stype.timestamp
448
- and column.name != self._end_time_column
449
- ]
450
- if time_column := infer_time_column(
451
- df=self._sample_df,
452
- candidates=candidates,
453
- ):
454
- self.time_column = time_column
455
- logs.append(f"time column '{time_column}'")
477
+ self.infer_time_column(verbose=False)
478
+ if self.has_time_column():
479
+ logs.append(f"time column '{self._time_column}'")
456
480
 
457
481
  if verbose and len(logs) > 0:
458
482
  print(f"Detected {' and '.join(logs)} in table '{self.name}'")
@@ -473,6 +497,36 @@ class Table(ABC):
473
497
  end_time_col=self._end_time_column,
474
498
  )
475
499
 
500
+ @cached_property
501
+ def _source_column_dict(self) -> dict[str, SourceColumn]:
502
+ source_columns = self._get_source_columns()
503
+ if len(source_columns) == 0:
504
+ raise ValueError(f"Table '{self.name}' does not hold any column "
505
+ f"with a supported data type")
506
+ return {column.name: column for column in source_columns}
507
+
508
+ @cached_property
509
+ def _source_sample_df(self) -> pd.DataFrame:
510
+ return self._get_source_sample_df()
511
+
512
+ @property
513
+ def _source_primary_key(self) -> str | None:
514
+ primary_keys = [
515
+ column.name for column in self._source_column_dict.values()
516
+ if column.is_primary_key
517
+ ]
518
+ if len(primary_keys) == 1: # NOTE No composite keys yet.
519
+ return primary_keys[0]
520
+
521
+ return None
522
+
523
+ @cached_property
524
+ def _num_rows(self) -> int | None:
525
+ return self._get_num_rows()
526
+
527
+ def _sample_current_df(self, columns: Sequence[str]) -> pd.DataFrame:
528
+ return self._source_sample_df[columns]
529
+
476
530
  # Python builtins #########################################################
477
531
 
478
532
  def __hash__(self) -> int:
@@ -503,43 +557,19 @@ class Table(ABC):
503
557
 
504
558
  # Abstract Methods ########################################################
505
559
 
506
- @cached_property
507
- def _source_column_dict(self) -> Dict[str, SourceColumn]:
508
- return {col.name: col for col in self._get_source_columns()}
509
-
560
+ @property
510
561
  @abstractmethod
511
- def _get_source_columns(self) -> List[SourceColumn]:
512
- pass
513
-
514
- @cached_property
515
- def _source_foreign_key_dict(self) -> Dict[str, SourceForeignKey]:
516
- fkeys = self._get_source_foreign_keys()
517
- # NOTE Drop all keys that link to different primary keys in the same
518
- # table since we don't support composite keys yet:
519
- table_pkeys: Dict[str, Set[str]] = defaultdict(set)
520
- for fkey in fkeys:
521
- table_pkeys[fkey.dst_table].add(fkey.primary_key)
522
- return {
523
- fkey.name: fkey
524
- for fkey in fkeys if len(table_pkeys[fkey.dst_table]) == 1
525
- }
562
+ def backend(self) -> DataBackend:
563
+ r"""The data backend of this table."""
526
564
 
527
565
  @abstractmethod
528
- def _get_source_foreign_keys(self) -> List[SourceForeignKey]:
566
+ def _get_source_columns(self) -> list[SourceColumn]:
529
567
  pass
530
568
 
531
- @cached_property
532
- def _sample_df(self) -> pd.DataFrame:
533
- return self._get_sample_df()
534
-
535
569
  @abstractmethod
536
- def _get_sample_df(self) -> pd.DataFrame:
570
+ def _get_source_sample_df(self) -> pd.DataFrame:
537
571
  pass
538
572
 
539
- @cached_property
540
- def _num_rows(self) -> Optional[int]:
541
- return self._get_num_rows()
542
-
543
573
  @abstractmethod
544
- def _get_num_rows(self) -> Optional[int]:
574
+ def _get_num_rows(self) -> int | None:
545
575
  pass