kumoai 2.7.0.dev202508201830__cp312-cp312-win_amd64.whl → 2.12.0.dev202511111731__cp312-cp312-win_amd64.whl

This diff shows the differences between two publicly available package versions, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (35)
  1. kumoai/__init__.py +4 -2
  2. kumoai/_version.py +1 -1
  3. kumoai/client/client.py +10 -5
  4. kumoai/client/endpoints.py +1 -0
  5. kumoai/client/rfm.py +37 -8
  6. kumoai/connector/file_upload_connector.py +94 -85
  7. kumoai/connector/snowflake_connector.py +9 -0
  8. kumoai/connector/utils.py +1377 -209
  9. kumoai/experimental/rfm/__init__.py +5 -3
  10. kumoai/experimental/rfm/authenticate.py +8 -5
  11. kumoai/experimental/rfm/infer/timestamp.py +7 -4
  12. kumoai/experimental/rfm/local_graph.py +96 -82
  13. kumoai/experimental/rfm/local_graph_sampler.py +16 -8
  14. kumoai/experimental/rfm/local_graph_store.py +32 -10
  15. kumoai/experimental/rfm/local_pquery_driver.py +342 -46
  16. kumoai/experimental/rfm/local_table.py +142 -45
  17. kumoai/experimental/rfm/pquery/__init__.py +4 -4
  18. kumoai/experimental/rfm/pquery/{backend.py → executor.py} +28 -58
  19. kumoai/experimental/rfm/pquery/pandas_executor.py +532 -0
  20. kumoai/experimental/rfm/rfm.py +535 -125
  21. kumoai/experimental/rfm/utils.py +0 -3
  22. kumoai/jobs.py +27 -1
  23. kumoai/kumolib.cp312-win_amd64.pyd +0 -0
  24. kumoai/pquery/prediction_table.py +5 -3
  25. kumoai/pquery/training_table.py +5 -3
  26. kumoai/trainer/job.py +9 -30
  27. kumoai/trainer/trainer.py +19 -10
  28. kumoai/utils/__init__.py +2 -1
  29. kumoai/utils/progress_logger.py +96 -16
  30. {kumoai-2.7.0.dev202508201830.dist-info → kumoai-2.12.0.dev202511111731.dist-info}/METADATA +4 -5
  31. {kumoai-2.7.0.dev202508201830.dist-info → kumoai-2.12.0.dev202511111731.dist-info}/RECORD +34 -34
  32. kumoai/experimental/rfm/pquery/pandas_backend.py +0 -437
  33. {kumoai-2.7.0.dev202508201830.dist-info → kumoai-2.12.0.dev202511111731.dist-info}/WHEEL +0 -0
  34. {kumoai-2.7.0.dev202508201830.dist-info → kumoai-2.12.0.dev202511111731.dist-info}/licenses/LICENSE +0 -0
  35. {kumoai-2.7.0.dev202508201830.dist-info → kumoai-2.12.0.dev202511111731.dist-info}/top_level.txt +0 -0
@@ -23,11 +23,13 @@ class Column:
23
23
  stype: Stype,
24
24
  is_primary_key: bool = False,
25
25
  is_time_column: bool = False,
26
+ is_end_time_column: bool = False,
26
27
  ) -> None:
27
28
  self._name = name
28
29
  self._dtype = Dtype(dtype)
29
30
  self._is_primary_key = is_primary_key
30
31
  self._is_time_column = is_time_column
32
+ self._is_end_time_column = is_end_time_column
31
33
  self.stype = Stype(stype)
32
34
 
33
35
  @property
@@ -50,9 +52,12 @@ class Column:
50
52
  if self._is_primary_key and val != Stype.ID:
51
53
  raise ValueError(f"Primary key '{self.name}' must have 'ID' "
52
54
  f"semantic type (got '{val}')")
53
- if self.name == self._is_time_column and val != Stype.timestamp:
55
+ if self._is_time_column and val != Stype.timestamp:
54
56
  raise ValueError(f"Time column '{self.name}' must have "
55
57
  f"'timestamp' semantic type (got '{val}')")
58
+ if self._is_end_time_column and val != Stype.timestamp:
59
+ raise ValueError(f"End time column '{self.name}' must have "
60
+ f"'timestamp' semantic type (got '{val}')")
56
61
 
57
62
  super().__setattr__(key, val)
58
63
 
@@ -93,6 +98,7 @@ class LocalTable:
93
98
  name="my_table",
94
99
  primary_key="id",
95
100
  time_column="time",
101
+ end_time_column=None,
96
102
  )
97
103
 
98
104
  # Verify metadata:
@@ -106,6 +112,8 @@ class LocalTable:
106
112
  name: The name of the table.
107
113
  primary_key: The name of the primary key of this table, if it exists.
108
114
  time_column: The name of the time column of this table, if it exists.
115
+ end_time_column: The name of the end time column of this table, if it
116
+ exists.
109
117
  """
110
118
  def __init__(
111
119
  self,
@@ -113,6 +121,7 @@ class LocalTable:
113
121
  name: str,
114
122
  primary_key: Optional[str] = None,
115
123
  time_column: Optional[str] = None,
124
+ end_time_column: Optional[str] = None,
116
125
  ) -> None:
117
126
 
118
127
  if df.empty:
@@ -125,36 +134,16 @@ class LocalTable:
125
134
  raise ValueError("Data frame must have non-empty column names")
126
135
 
127
136
  df = df.copy(deep=False)
128
- df.columns = df.columns.str.replace(r'\s+', '_', regex=True)
129
137
 
130
138
  self._data = df
131
139
  self._name = name
132
140
  self._primary_key: Optional[str] = None
133
141
  self._time_column: Optional[str] = None
142
+ self._end_time_column: Optional[str] = None
134
143
 
135
144
  self._columns: Dict[str, Column] = {}
136
145
  for column_name in df.columns:
137
- try:
138
- dtype = utils.to_dtype(df[column_name])
139
- except Exception as e:
140
- raise RuntimeError(f"Data type inference for column "
141
- f"'{column_name}' in table '{name}' "
142
- f"failed. Consider changing the data type "
143
- f"of the column or removing it from the "
144
- f"table.") from e
145
- try:
146
- stype = utils.infer_stype(df[column_name], column_name, dtype)
147
- except Exception as e:
148
- raise RuntimeError(f"Semantic type inference for column "
149
- f"'{column_name}' in table '{name}' "
150
- f"failed. Consider changing the data type "
151
- f"of the column or removing it from the "
152
- f"table.") from e
153
- self._columns[column_name] = Column(
154
- name=column_name,
155
- dtype=dtype,
156
- stype=stype,
157
- )
146
+ self.add_column(column_name)
158
147
 
159
148
  if primary_key is not None:
160
149
  self.primary_key = primary_key
@@ -162,6 +151,9 @@ class LocalTable:
162
151
  if time_column is not None:
163
152
  self.time_column = time_column
164
153
 
154
+ if end_time_column is not None:
155
+ self.end_time_column = end_time_column
156
+
165
157
  @property
166
158
  def name(self) -> str:
167
159
  r"""The name of the table."""
@@ -195,6 +187,46 @@ class LocalTable:
195
187
  """
196
188
  return list(self._columns.values())
197
189
 
190
+ def add_column(self, name: str) -> Column:
191
+ r"""Adds a column to this table.
192
+
193
+ Args:
194
+ name: The name of the column.
195
+
196
+ Raises:
197
+ KeyError: If ``name`` is already present in this table.
198
+ """
199
+ if name in self:
200
+ raise KeyError(f"Column '{name}' already exists in table "
201
+ f"'{self.name}'")
202
+
203
+ if name not in self._data.columns:
204
+ raise KeyError(f"Column '{name}' does not exist in the underyling "
205
+ f"data frame")
206
+
207
+ try:
208
+ dtype = utils.to_dtype(self._data[name])
209
+ except Exception as e:
210
+ raise RuntimeError(f"Data type inference for column '{name}' in "
211
+ f"table '{self.name}' failed. Consider "
212
+ f"changing the data type of the column or "
213
+ f"removing it from the table.") from e
214
+ try:
215
+ stype = utils.infer_stype(self._data[name], name, dtype)
216
+ except Exception as e:
217
+ raise RuntimeError(f"Semantic type inference for column '{name}' "
218
+ f"in table '{self.name}' failed. Consider "
219
+ f"changing the data type of the column or "
220
+ f"removing it from the table.") from e
221
+
222
+ self._columns[name] = Column(
223
+ name=name,
224
+ dtype=dtype,
225
+ stype=stype,
226
+ )
227
+
228
+ return self._columns[name]
229
+
198
230
  def remove_column(self, name: str) -> Self:
199
231
  r"""Removes a column from this table.
200
232
 
@@ -204,13 +236,15 @@ class LocalTable:
204
236
  Raises:
205
237
  KeyError: If ``name`` is not present in this table.
206
238
  """
207
- if not self.has_column(name):
239
+ if name not in self:
208
240
  raise KeyError(f"Column '{name}' not found in table '{self.name}'")
209
241
 
210
242
  if self._primary_key == name:
211
243
  self.primary_key = None
212
244
  if self._time_column == name:
213
245
  self.time_column = None
246
+ if self._end_time_column == name:
247
+ self.end_time_column = None
214
248
  del self._columns[name]
215
249
 
216
250
  return self
@@ -234,9 +268,8 @@ class LocalTable:
234
268
  :class:`ValueError` if the primary key has a non-ID semantic type or
235
269
  if the column name does not match a column in the data frame.
236
270
  """
237
- if not self.has_primary_key():
271
+ if self._primary_key is None:
238
272
  return None
239
- assert self._primary_key is not None
240
273
  return self[self._primary_key]
241
274
 
242
275
  @primary_key.setter
@@ -245,6 +278,10 @@ class LocalTable:
245
278
  raise ValueError(f"Cannot specify column '{name}' as a primary "
246
279
  f"key since it is already defined to be a time "
247
280
  f"column")
281
+ if name is not None and name == self._end_time_column:
282
+ raise ValueError(f"Cannot specify column '{name}' as a primary "
283
+ f"key since it is already defined to be an end "
284
+ f"time column")
248
285
 
249
286
  if self.primary_key is not None:
250
287
  self.primary_key._is_primary_key = False
@@ -276,9 +313,8 @@ class LocalTable:
276
313
  :class:`ValueError` if the time column has a non-timestamp semantic
277
314
  type or if the column name does not match a column in the data frame.
278
315
  """
279
- if not self.has_time_column():
316
+ if self._time_column is None:
280
317
  return None
281
- assert self._time_column is not None
282
318
  return self[self._time_column]
283
319
 
284
320
  @time_column.setter
@@ -287,6 +323,10 @@ class LocalTable:
287
323
  raise ValueError(f"Cannot specify column '{name}' as a time "
288
324
  f"column since it is already defined to be a "
289
325
  f"primary key")
326
+ if name is not None and name == self._end_time_column:
327
+ raise ValueError(f"Cannot specify column '{name}' as a time "
328
+ f"column since it is already defined to be an "
329
+ f"end time column")
290
330
 
291
331
  if self.time_column is not None:
292
332
  self.time_column._is_time_column = False
@@ -299,6 +339,52 @@ class LocalTable:
299
339
  self[name]._is_time_column = True
300
340
  self._time_column = name
301
341
 
342
+ # End Time column #########################################################
343
+
344
+ def has_end_time_column(self) -> bool:
345
+ r"""Returns ``True`` if this table has an end time column; ``False``
346
+ otherwise.
347
+ """
348
+ return self._end_time_column is not None
349
+
350
+ @property
351
+ def end_time_column(self) -> Optional[Column]:
352
+ r"""The end time column of this table.
353
+
354
+ The getter returns the end time column of this table, or ``None`` if no
355
+ such end time column is present.
356
+
357
+ The setter sets a column as an end time column on this table, and
358
+ raises a :class:`ValueError` if the end time column has a non-timestamp
359
+ semantic type or if the column name does not match a column in the data
360
+ frame.
361
+ """
362
+ if self._end_time_column is None:
363
+ return None
364
+ return self[self._end_time_column]
365
+
366
+ @end_time_column.setter
367
+ def end_time_column(self, name: Optional[str]) -> None:
368
+ if name is not None and name == self._primary_key:
369
+ raise ValueError(f"Cannot specify column '{name}' as an end time "
370
+ f"column since it is already defined to be a "
371
+ f"primary key")
372
+ if name is not None and name == self._time_column:
373
+ raise ValueError(f"Cannot specify column '{name}' as an end time "
374
+ f"column since it is already defined to be a "
375
+ f"time column")
376
+
377
+ if self.end_time_column is not None:
378
+ self.end_time_column._is_end_time_column = False
379
+
380
+ if name is None:
381
+ self._end_time_column = None
382
+ return
383
+
384
+ self[name].stype = Stype.timestamp
385
+ self[name]._is_end_time_column = True
386
+ self._end_time_column = name
387
+
302
388
  # Metadata ################################################################
303
389
 
304
390
  @property
@@ -307,16 +393,18 @@ class LocalTable:
307
393
  information about the columns in this table.
308
394
 
309
395
  The returned dataframe has columns ``name``, ``dtype``, ``stype``,
310
- ``is_primary_key``, and ``is_time_column``, which provide an aggregate
311
- view of the properties of the columns of this table.
396
+ ``is_primary_key``, ``is_time_column`` and ``is_end_time_column``,
397
+ which provide an aggregate view of the properties of the columns of
398
+ this table.
312
399
 
313
400
  Example:
401
+ >>> # doctest: +SKIP
314
402
  >>> import kumoai.experimental.rfm as rfm
315
403
  >>> table = rfm.LocalTable(df=..., name=...).infer_metadata()
316
404
  >>> table.metadata
317
- name dtype stype is_primary_key is_time_column
318
- 0 CustomerID float64 ID True False
319
- """
405
+ name dtype stype is_primary_key is_time_column is_end_time_column
406
+ 0 CustomerID float64 ID True False False
407
+ """ # noqa: E501
320
408
  cols = self.columns
321
409
 
322
410
  return pd.DataFrame({
@@ -336,6 +424,11 @@ class LocalTable:
336
424
  dtype=bool,
337
425
  data=[self._time_column == c.name for c in cols],
338
426
  ),
427
+ 'is_end_time_column':
428
+ pd.Series(
429
+ dtype=bool,
430
+ data=[self._end_time_column == c.name for c in cols],
431
+ ),
339
432
  })
340
433
 
341
434
  def print_metadata(self) -> None:
@@ -398,6 +491,7 @@ class LocalTable:
398
491
  candidates = [
399
492
  column.name for column in self.columns
400
493
  if column.stype == Stype.timestamp
494
+ and column.name != self._end_time_column
401
495
  ]
402
496
  if time_column := utils.detect_time_column(self._data, candidates):
403
497
  self.time_column = time_column
@@ -411,24 +505,26 @@ class LocalTable:
411
505
  # Helpers #################################################################
412
506
 
413
507
  def _to_api_table_definition(self) -> TableDefinition:
414
- cols: List[ColumnDefinition] = []
415
- for col in self.columns:
416
- cols.append(ColumnDefinition(col.name, col.stype, col.dtype))
417
- pkey = self._primary_key
418
- time_col = self._time_column
419
- source_table = UnavailableSourceTable(table=self.name)
420
-
421
508
  return TableDefinition(
422
- cols=cols,
423
- source_table=source_table,
424
- pkey=pkey,
425
- time_col=time_col,
509
+ cols=[
510
+ ColumnDefinition(col.name, col.stype, col.dtype)
511
+ for col in self.columns
512
+ ],
513
+ source_table=UnavailableSourceTable(table=self.name),
514
+ pkey=self._primary_key,
515
+ time_col=self._time_column,
516
+ end_time_col=self._end_time_column,
426
517
  )
427
518
 
428
519
  # Python builtins #########################################################
429
520
 
430
521
  def __hash__(self) -> int:
431
- return hash(tuple(self.columns + [self.primary_key, self.time_column]))
522
+ special_columns = [
523
+ self.primary_key,
524
+ self.time_column,
525
+ self.end_time_column,
526
+ ]
527
+ return hash(tuple(self.columns + special_columns))
432
528
 
433
529
  def __contains__(self, name: str) -> bool:
434
530
  return self.has_column(name)
@@ -445,4 +541,5 @@ class LocalTable:
445
541
  f' num_columns={len(self.columns)},\n'
446
542
  f' primary_key={self._primary_key},\n'
447
543
  f' time_column={self._time_column},\n'
544
+ f' end_time_column={self._end_time_column},\n'
448
545
  f')')
@@ -1,7 +1,7 @@
1
- from .backend import PQueryBackend
2
- from .pandas_backend import PQueryPandasBackend
1
+ from .executor import PQueryExecutor
2
+ from .pandas_executor import PQueryPandasExecutor
3
3
 
4
4
  __all__ = [
5
- 'PQueryBackend',
6
- 'PQueryPandasBackend',
5
+ 'PQueryExecutor',
6
+ 'PQueryPandasExecutor',
7
7
  ]
@@ -1,23 +1,14 @@
1
1
  from abc import ABC, abstractmethod
2
- from typing import Dict, Generic, Optional, Tuple, TypeVar, Union
2
+ from typing import Dict, Generic, Tuple, TypeVar
3
3
 
4
- from kumoapi.rfm import PQueryDefinition
5
- from kumoapi.rfm.pquery import (
4
+ from kumoapi.pquery import ValidatedPredictiveQuery
5
+ from kumoapi.pquery.AST import (
6
6
  Aggregation,
7
- AggregationType,
8
- BoolOp,
9
7
  Column,
10
8
  Condition,
11
9
  Filter,
12
- Float,
13
- FloatList,
14
- Int,
15
- IntList,
10
+ Join,
16
11
  LogicalOperation,
17
- MemberOp,
18
- RelOp,
19
- Str,
20
- StrList,
21
12
  )
22
13
 
23
14
  TableData = TypeVar('TableData')
@@ -25,108 +16,87 @@ ColumnData = TypeVar('ColumnData')
25
16
  IndexData = TypeVar('IndexData')
26
17
 
27
18
 
28
- class PQueryBackend(Generic[TableData, ColumnData, IndexData], ABC):
19
+ class PQueryExecutor(Generic[TableData, ColumnData, IndexData], ABC):
29
20
  @abstractmethod
30
- def eval_aggregation_type(
21
+ def execute_column(
31
22
  self,
32
- op: AggregationType,
33
- feat: Optional[ColumnData],
34
- batch: IndexData,
35
- batch_size: int,
23
+ column: Column,
24
+ feat_dict: Dict[str, TableData],
36
25
  filter_na: bool = True,
37
26
  ) -> Tuple[ColumnData, IndexData]:
38
27
  pass
39
28
 
40
29
  @abstractmethod
41
- def eval_rel_op(
30
+ def execute_aggregation(
42
31
  self,
43
- left: ColumnData,
44
- op: RelOp,
45
- right: Union[Int, Float, Str, None],
46
- ) -> ColumnData:
47
- pass
48
-
49
- @abstractmethod
50
- def eval_member_op(
51
- self,
52
- left: ColumnData,
53
- op: MemberOp,
54
- right: Union[IntList, FloatList, StrList],
55
- ) -> ColumnData:
56
- pass
57
-
58
- @abstractmethod
59
- def eval_bool_op(
60
- self,
61
- left: ColumnData,
62
- op: BoolOp,
63
- right: Optional[ColumnData],
64
- ) -> ColumnData:
65
- pass
66
-
67
- @abstractmethod
68
- def eval_column(
69
- self,
70
- column: Column,
32
+ aggr: Aggregation,
71
33
  feat_dict: Dict[str, TableData],
34
+ time_dict: Dict[str, ColumnData],
35
+ batch_dict: Dict[str, IndexData],
36
+ anchor_time: ColumnData,
72
37
  filter_na: bool = True,
38
+ num_forecasts: int = 1,
73
39
  ) -> Tuple[ColumnData, IndexData]:
74
40
  pass
75
41
 
76
42
  @abstractmethod
77
- def eval_aggregation(
43
+ def execute_condition(
78
44
  self,
79
- aggr: Aggregation,
45
+ condition: Condition,
80
46
  feat_dict: Dict[str, TableData],
81
47
  time_dict: Dict[str, ColumnData],
82
48
  batch_dict: Dict[str, IndexData],
83
49
  anchor_time: ColumnData,
84
50
  filter_na: bool = True,
51
+ num_forecasts: int = 1,
85
52
  ) -> Tuple[ColumnData, IndexData]:
86
53
  pass
87
54
 
88
55
  @abstractmethod
89
- def eval_condition(
56
+ def execute_logical_operation(
90
57
  self,
91
- condition: Condition,
58
+ logical_operation: LogicalOperation,
92
59
  feat_dict: Dict[str, TableData],
93
60
  time_dict: Dict[str, ColumnData],
94
61
  batch_dict: Dict[str, IndexData],
95
62
  anchor_time: ColumnData,
96
63
  filter_na: bool = True,
64
+ num_forecasts: int = 1,
97
65
  ) -> Tuple[ColumnData, IndexData]:
98
66
  pass
99
67
 
100
68
  @abstractmethod
101
- def eval_logical_operation(
69
+ def execute_join(
102
70
  self,
103
- logical_operation: LogicalOperation,
71
+ join: Join,
104
72
  feat_dict: Dict[str, TableData],
105
73
  time_dict: Dict[str, ColumnData],
106
74
  batch_dict: Dict[str, IndexData],
107
75
  anchor_time: ColumnData,
108
76
  filter_na: bool = True,
77
+ num_forecasts: int = 1,
109
78
  ) -> Tuple[ColumnData, IndexData]:
110
79
  pass
111
80
 
112
81
  @abstractmethod
113
- def eval_filter(
82
+ def execute_filter(
114
83
  self,
115
84
  filter: Filter,
116
85
  feat_dict: Dict[str, TableData],
117
86
  time_dict: Dict[str, ColumnData],
118
87
  batch_dict: Dict[str, IndexData],
119
88
  anchor_time: ColumnData,
120
- ) -> IndexData:
89
+ ) -> Tuple[ColumnData, IndexData]:
121
90
  pass
122
91
 
123
92
  @abstractmethod
124
- def eval_pquery(
93
+ def execute(
125
94
  self,
126
- query: PQueryDefinition,
95
+ query: ValidatedPredictiveQuery,
127
96
  feat_dict: Dict[str, TableData],
128
97
  time_dict: Dict[str, ColumnData],
129
98
  batch_dict: Dict[str, IndexData],
130
99
  anchor_time: ColumnData,
100
+ num_forecasts: int = 1,
131
101
  ) -> Tuple[ColumnData, IndexData]:
132
102
  pass