kumoai 2.10.0.dev202510021830__py3-none-any.whl → 2.12.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kumoai/__init__.py +4 -2
- kumoai/_version.py +1 -1
- kumoai/client/client.py +10 -5
- kumoai/client/endpoints.py +1 -0
- kumoai/client/rfm.py +35 -7
- kumoai/experimental/rfm/__init__.py +5 -3
- kumoai/experimental/rfm/infer/timestamp.py +5 -4
- kumoai/experimental/rfm/local_graph.py +90 -74
- kumoai/experimental/rfm/local_graph_sampler.py +16 -8
- kumoai/experimental/rfm/local_graph_store.py +13 -1
- kumoai/experimental/rfm/local_pquery_driver.py +323 -38
- kumoai/experimental/rfm/local_table.py +100 -22
- kumoai/experimental/rfm/pquery/__init__.py +4 -4
- kumoai/experimental/rfm/pquery/{backend.py → executor.py} +24 -58
- kumoai/experimental/rfm/pquery/{pandas_backend.py → pandas_executor.py} +277 -223
- kumoai/experimental/rfm/rfm.py +220 -79
- kumoai/jobs.py +1 -0
- kumoai/pquery/predictive_query.py +10 -6
- kumoai/trainer/trainer.py +9 -10
- kumoai/utils/progress_logger.py +13 -0
- {kumoai-2.10.0.dev202510021830.dist-info → kumoai-2.12.1.dist-info}/METADATA +4 -5
- {kumoai-2.10.0.dev202510021830.dist-info → kumoai-2.12.1.dist-info}/RECORD +25 -25
- {kumoai-2.10.0.dev202510021830.dist-info → kumoai-2.12.1.dist-info}/WHEEL +0 -0
- {kumoai-2.10.0.dev202510021830.dist-info → kumoai-2.12.1.dist-info}/licenses/LICENSE +0 -0
- {kumoai-2.10.0.dev202510021830.dist-info → kumoai-2.12.1.dist-info}/top_level.txt +0 -0
|
@@ -23,11 +23,13 @@ class Column:
|
|
|
23
23
|
stype: Stype,
|
|
24
24
|
is_primary_key: bool = False,
|
|
25
25
|
is_time_column: bool = False,
|
|
26
|
+
is_end_time_column: bool = False,
|
|
26
27
|
) -> None:
|
|
27
28
|
self._name = name
|
|
28
29
|
self._dtype = Dtype(dtype)
|
|
29
30
|
self._is_primary_key = is_primary_key
|
|
30
31
|
self._is_time_column = is_time_column
|
|
32
|
+
self._is_end_time_column = is_end_time_column
|
|
31
33
|
self.stype = Stype(stype)
|
|
32
34
|
|
|
33
35
|
@property
|
|
@@ -50,9 +52,12 @@ class Column:
|
|
|
50
52
|
if self._is_primary_key and val != Stype.ID:
|
|
51
53
|
raise ValueError(f"Primary key '{self.name}' must have 'ID' "
|
|
52
54
|
f"semantic type (got '{val}')")
|
|
53
|
-
if self.
|
|
55
|
+
if self._is_time_column and val != Stype.timestamp:
|
|
54
56
|
raise ValueError(f"Time column '{self.name}' must have "
|
|
55
57
|
f"'timestamp' semantic type (got '{val}')")
|
|
58
|
+
if self._is_end_time_column and val != Stype.timestamp:
|
|
59
|
+
raise ValueError(f"End time column '{self.name}' must have "
|
|
60
|
+
f"'timestamp' semantic type (got '{val}')")
|
|
56
61
|
|
|
57
62
|
super().__setattr__(key, val)
|
|
58
63
|
|
|
@@ -93,6 +98,7 @@ class LocalTable:
|
|
|
93
98
|
name="my_table",
|
|
94
99
|
primary_key="id",
|
|
95
100
|
time_column="time",
|
|
101
|
+
end_time_column=None,
|
|
96
102
|
)
|
|
97
103
|
|
|
98
104
|
# Verify metadata:
|
|
@@ -106,6 +112,8 @@ class LocalTable:
|
|
|
106
112
|
name: The name of the table.
|
|
107
113
|
primary_key: The name of the primary key of this table, if it exists.
|
|
108
114
|
time_column: The name of the time column of this table, if it exists.
|
|
115
|
+
end_time_column: The name of the end time column of this table, if it
|
|
116
|
+
exists.
|
|
109
117
|
"""
|
|
110
118
|
def __init__(
|
|
111
119
|
self,
|
|
@@ -113,6 +121,7 @@ class LocalTable:
|
|
|
113
121
|
name: str,
|
|
114
122
|
primary_key: Optional[str] = None,
|
|
115
123
|
time_column: Optional[str] = None,
|
|
124
|
+
end_time_column: Optional[str] = None,
|
|
116
125
|
) -> None:
|
|
117
126
|
|
|
118
127
|
if df.empty:
|
|
@@ -130,6 +139,7 @@ class LocalTable:
|
|
|
130
139
|
self._name = name
|
|
131
140
|
self._primary_key: Optional[str] = None
|
|
132
141
|
self._time_column: Optional[str] = None
|
|
142
|
+
self._end_time_column: Optional[str] = None
|
|
133
143
|
|
|
134
144
|
self._columns: Dict[str, Column] = {}
|
|
135
145
|
for column_name in df.columns:
|
|
@@ -141,6 +151,9 @@ class LocalTable:
|
|
|
141
151
|
if time_column is not None:
|
|
142
152
|
self.time_column = time_column
|
|
143
153
|
|
|
154
|
+
if end_time_column is not None:
|
|
155
|
+
self.end_time_column = end_time_column
|
|
156
|
+
|
|
144
157
|
@property
|
|
145
158
|
def name(self) -> str:
|
|
146
159
|
r"""The name of the table."""
|
|
@@ -230,6 +243,8 @@ class LocalTable:
|
|
|
230
243
|
self.primary_key = None
|
|
231
244
|
if self._time_column == name:
|
|
232
245
|
self.time_column = None
|
|
246
|
+
if self._end_time_column == name:
|
|
247
|
+
self.end_time_column = None
|
|
233
248
|
del self._columns[name]
|
|
234
249
|
|
|
235
250
|
return self
|
|
@@ -253,9 +268,8 @@ class LocalTable:
|
|
|
253
268
|
:class:`ValueError` if the primary key has a non-ID semantic type or
|
|
254
269
|
if the column name does not match a column in the data frame.
|
|
255
270
|
"""
|
|
256
|
-
if
|
|
271
|
+
if self._primary_key is None:
|
|
257
272
|
return None
|
|
258
|
-
assert self._primary_key is not None
|
|
259
273
|
return self[self._primary_key]
|
|
260
274
|
|
|
261
275
|
@primary_key.setter
|
|
@@ -264,6 +278,10 @@ class LocalTable:
|
|
|
264
278
|
raise ValueError(f"Cannot specify column '{name}' as a primary "
|
|
265
279
|
f"key since it is already defined to be a time "
|
|
266
280
|
f"column")
|
|
281
|
+
if name is not None and name == self._end_time_column:
|
|
282
|
+
raise ValueError(f"Cannot specify column '{name}' as a primary "
|
|
283
|
+
f"key since it is already defined to be an end "
|
|
284
|
+
f"time column")
|
|
267
285
|
|
|
268
286
|
if self.primary_key is not None:
|
|
269
287
|
self.primary_key._is_primary_key = False
|
|
@@ -295,9 +313,8 @@ class LocalTable:
|
|
|
295
313
|
:class:`ValueError` if the time column has a non-timestamp semantic
|
|
296
314
|
type or if the column name does not match a column in the data frame.
|
|
297
315
|
"""
|
|
298
|
-
if
|
|
316
|
+
if self._time_column is None:
|
|
299
317
|
return None
|
|
300
|
-
assert self._time_column is not None
|
|
301
318
|
return self[self._time_column]
|
|
302
319
|
|
|
303
320
|
@time_column.setter
|
|
@@ -306,6 +323,10 @@ class LocalTable:
|
|
|
306
323
|
raise ValueError(f"Cannot specify column '{name}' as a time "
|
|
307
324
|
f"column since it is already defined to be a "
|
|
308
325
|
f"primary key")
|
|
326
|
+
if name is not None and name == self._end_time_column:
|
|
327
|
+
raise ValueError(f"Cannot specify column '{name}' as a time "
|
|
328
|
+
f"column since it is already defined to be an "
|
|
329
|
+
f"end time column")
|
|
309
330
|
|
|
310
331
|
if self.time_column is not None:
|
|
311
332
|
self.time_column._is_time_column = False
|
|
@@ -318,6 +339,52 @@ class LocalTable:
|
|
|
318
339
|
self[name]._is_time_column = True
|
|
319
340
|
self._time_column = name
|
|
320
341
|
|
|
342
|
+
# End Time column #########################################################
|
|
343
|
+
|
|
344
|
+
def has_end_time_column(self) -> bool:
|
|
345
|
+
r"""Returns ``True`` if this table has an end time column; ``False``
|
|
346
|
+
otherwise.
|
|
347
|
+
"""
|
|
348
|
+
return self._end_time_column is not None
|
|
349
|
+
|
|
350
|
+
@property
|
|
351
|
+
def end_time_column(self) -> Optional[Column]:
|
|
352
|
+
r"""The end time column of this table.
|
|
353
|
+
|
|
354
|
+
The getter returns the end time column of this table, or ``None`` if no
|
|
355
|
+
such end time column is present.
|
|
356
|
+
|
|
357
|
+
The setter sets a column as an end time column on this table, and
|
|
358
|
+
raises a :class:`ValueError` if the end time column has a non-timestamp
|
|
359
|
+
semantic type or if the column name does not match a column in the data
|
|
360
|
+
frame.
|
|
361
|
+
"""
|
|
362
|
+
if self._end_time_column is None:
|
|
363
|
+
return None
|
|
364
|
+
return self[self._end_time_column]
|
|
365
|
+
|
|
366
|
+
@end_time_column.setter
|
|
367
|
+
def end_time_column(self, name: Optional[str]) -> None:
|
|
368
|
+
if name is not None and name == self._primary_key:
|
|
369
|
+
raise ValueError(f"Cannot specify column '{name}' as an end time "
|
|
370
|
+
f"column since it is already defined to be a "
|
|
371
|
+
f"primary key")
|
|
372
|
+
if name is not None and name == self._time_column:
|
|
373
|
+
raise ValueError(f"Cannot specify column '{name}' as an end time "
|
|
374
|
+
f"column since it is already defined to be a "
|
|
375
|
+
f"time column")
|
|
376
|
+
|
|
377
|
+
if self.end_time_column is not None:
|
|
378
|
+
self.end_time_column._is_end_time_column = False
|
|
379
|
+
|
|
380
|
+
if name is None:
|
|
381
|
+
self._end_time_column = None
|
|
382
|
+
return
|
|
383
|
+
|
|
384
|
+
self[name].stype = Stype.timestamp
|
|
385
|
+
self[name]._is_end_time_column = True
|
|
386
|
+
self._end_time_column = name
|
|
387
|
+
|
|
321
388
|
# Metadata ################################################################
|
|
322
389
|
|
|
323
390
|
@property
|
|
@@ -326,16 +393,18 @@ class LocalTable:
|
|
|
326
393
|
information about the columns in this table.
|
|
327
394
|
|
|
328
395
|
The returned dataframe has columns ``name``, ``dtype``, ``stype``,
|
|
329
|
-
``is_primary_key``, and ``
|
|
330
|
-
view of the properties of the columns of
|
|
396
|
+
``is_primary_key``, ``is_time_column`` and ``is_end_time_column``,
|
|
397
|
+
which provide an aggregate view of the properties of the columns of
|
|
398
|
+
this table.
|
|
331
399
|
|
|
332
400
|
Example:
|
|
401
|
+
>>> # doctest: +SKIP
|
|
333
402
|
>>> import kumoai.experimental.rfm as rfm
|
|
334
403
|
>>> table = rfm.LocalTable(df=..., name=...).infer_metadata()
|
|
335
404
|
>>> table.metadata
|
|
336
|
-
name dtype
|
|
337
|
-
0 CustomerID float64
|
|
338
|
-
"""
|
|
405
|
+
name dtype stype is_primary_key is_time_column is_end_time_column
|
|
406
|
+
0 CustomerID float64 ID True False False
|
|
407
|
+
""" # noqa: E501
|
|
339
408
|
cols = self.columns
|
|
340
409
|
|
|
341
410
|
return pd.DataFrame({
|
|
@@ -355,6 +424,11 @@ class LocalTable:
|
|
|
355
424
|
dtype=bool,
|
|
356
425
|
data=[self._time_column == c.name for c in cols],
|
|
357
426
|
),
|
|
427
|
+
'is_end_time_column':
|
|
428
|
+
pd.Series(
|
|
429
|
+
dtype=bool,
|
|
430
|
+
data=[self._end_time_column == c.name for c in cols],
|
|
431
|
+
),
|
|
358
432
|
})
|
|
359
433
|
|
|
360
434
|
def print_metadata(self) -> None:
|
|
@@ -417,6 +491,7 @@ class LocalTable:
|
|
|
417
491
|
candidates = [
|
|
418
492
|
column.name for column in self.columns
|
|
419
493
|
if column.stype == Stype.timestamp
|
|
494
|
+
and column.name != self._end_time_column
|
|
420
495
|
]
|
|
421
496
|
if time_column := utils.detect_time_column(self._data, candidates):
|
|
422
497
|
self.time_column = time_column
|
|
@@ -430,24 +505,26 @@ class LocalTable:
|
|
|
430
505
|
# Helpers #################################################################
|
|
431
506
|
|
|
432
507
|
def _to_api_table_definition(self) -> TableDefinition:
|
|
433
|
-
cols: List[ColumnDefinition] = []
|
|
434
|
-
for col in self.columns:
|
|
435
|
-
cols.append(ColumnDefinition(col.name, col.stype, col.dtype))
|
|
436
|
-
pkey = self._primary_key
|
|
437
|
-
time_col = self._time_column
|
|
438
|
-
source_table = UnavailableSourceTable(table=self.name)
|
|
439
|
-
|
|
440
508
|
return TableDefinition(
|
|
441
|
-
cols=
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
509
|
+
cols=[
|
|
510
|
+
ColumnDefinition(col.name, col.stype, col.dtype)
|
|
511
|
+
for col in self.columns
|
|
512
|
+
],
|
|
513
|
+
source_table=UnavailableSourceTable(table=self.name),
|
|
514
|
+
pkey=self._primary_key,
|
|
515
|
+
time_col=self._time_column,
|
|
516
|
+
end_time_col=self._end_time_column,
|
|
445
517
|
)
|
|
446
518
|
|
|
447
519
|
# Python builtins #########################################################
|
|
448
520
|
|
|
449
521
|
def __hash__(self) -> int:
|
|
450
|
-
|
|
522
|
+
special_columns = [
|
|
523
|
+
self.primary_key,
|
|
524
|
+
self.time_column,
|
|
525
|
+
self.end_time_column,
|
|
526
|
+
]
|
|
527
|
+
return hash(tuple(self.columns + special_columns))
|
|
451
528
|
|
|
452
529
|
def __contains__(self, name: str) -> bool:
|
|
453
530
|
return self.has_column(name)
|
|
@@ -464,4 +541,5 @@ class LocalTable:
|
|
|
464
541
|
f' num_columns={len(self.columns)},\n'
|
|
465
542
|
f' primary_key={self._primary_key},\n'
|
|
466
543
|
f' time_column={self._time_column},\n'
|
|
544
|
+
f' end_time_column={self._end_time_column},\n'
|
|
467
545
|
f')')
|
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
from .
|
|
2
|
-
from .
|
|
1
|
+
from .executor import PQueryExecutor
|
|
2
|
+
from .pandas_executor import PQueryPandasExecutor
|
|
3
3
|
|
|
4
4
|
__all__ = [
|
|
5
|
-
'
|
|
6
|
-
'
|
|
5
|
+
'PQueryExecutor',
|
|
6
|
+
'PQueryPandasExecutor',
|
|
7
7
|
]
|
|
@@ -1,23 +1,14 @@
|
|
|
1
1
|
from abc import ABC, abstractmethod
|
|
2
|
-
from typing import Dict, Generic,
|
|
2
|
+
from typing import Dict, Generic, Tuple, TypeVar
|
|
3
3
|
|
|
4
|
-
from kumoapi.
|
|
5
|
-
from kumoapi.
|
|
4
|
+
from kumoapi.pquery import ValidatedPredictiveQuery
|
|
5
|
+
from kumoapi.pquery.AST import (
|
|
6
6
|
Aggregation,
|
|
7
|
-
AggregationType,
|
|
8
|
-
BoolOp,
|
|
9
7
|
Column,
|
|
10
8
|
Condition,
|
|
11
9
|
Filter,
|
|
12
|
-
|
|
13
|
-
FloatList,
|
|
14
|
-
Int,
|
|
15
|
-
IntList,
|
|
10
|
+
Join,
|
|
16
11
|
LogicalOperation,
|
|
17
|
-
MemberOp,
|
|
18
|
-
RelOp,
|
|
19
|
-
Str,
|
|
20
|
-
StrList,
|
|
21
12
|
)
|
|
22
13
|
|
|
23
14
|
TableData = TypeVar('TableData')
|
|
@@ -25,58 +16,33 @@ ColumnData = TypeVar('ColumnData')
|
|
|
25
16
|
IndexData = TypeVar('IndexData')
|
|
26
17
|
|
|
27
18
|
|
|
28
|
-
class
|
|
19
|
+
class PQueryExecutor(Generic[TableData, ColumnData, IndexData], ABC):
|
|
29
20
|
@abstractmethod
|
|
30
|
-
def
|
|
21
|
+
def execute_column(
|
|
31
22
|
self,
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
batch: IndexData,
|
|
35
|
-
batch_size: int,
|
|
23
|
+
column: Column,
|
|
24
|
+
feat_dict: Dict[str, TableData],
|
|
36
25
|
filter_na: bool = True,
|
|
37
26
|
) -> Tuple[ColumnData, IndexData]:
|
|
38
27
|
pass
|
|
39
28
|
|
|
40
29
|
@abstractmethod
|
|
41
|
-
def
|
|
42
|
-
self,
|
|
43
|
-
left: ColumnData,
|
|
44
|
-
op: RelOp,
|
|
45
|
-
right: Union[Int, Float, Str, None],
|
|
46
|
-
) -> ColumnData:
|
|
47
|
-
pass
|
|
48
|
-
|
|
49
|
-
@abstractmethod
|
|
50
|
-
def eval_member_op(
|
|
51
|
-
self,
|
|
52
|
-
left: ColumnData,
|
|
53
|
-
op: MemberOp,
|
|
54
|
-
right: Union[IntList, FloatList, StrList],
|
|
55
|
-
) -> ColumnData:
|
|
56
|
-
pass
|
|
57
|
-
|
|
58
|
-
@abstractmethod
|
|
59
|
-
def eval_bool_op(
|
|
60
|
-
self,
|
|
61
|
-
left: ColumnData,
|
|
62
|
-
op: BoolOp,
|
|
63
|
-
right: Optional[ColumnData],
|
|
64
|
-
) -> ColumnData:
|
|
65
|
-
pass
|
|
66
|
-
|
|
67
|
-
@abstractmethod
|
|
68
|
-
def eval_column(
|
|
30
|
+
def execute_aggregation(
|
|
69
31
|
self,
|
|
70
|
-
|
|
32
|
+
aggr: Aggregation,
|
|
71
33
|
feat_dict: Dict[str, TableData],
|
|
34
|
+
time_dict: Dict[str, ColumnData],
|
|
35
|
+
batch_dict: Dict[str, IndexData],
|
|
36
|
+
anchor_time: ColumnData,
|
|
72
37
|
filter_na: bool = True,
|
|
38
|
+
num_forecasts: int = 1,
|
|
73
39
|
) -> Tuple[ColumnData, IndexData]:
|
|
74
40
|
pass
|
|
75
41
|
|
|
76
42
|
@abstractmethod
|
|
77
|
-
def
|
|
43
|
+
def execute_condition(
|
|
78
44
|
self,
|
|
79
|
-
|
|
45
|
+
condition: Condition,
|
|
80
46
|
feat_dict: Dict[str, TableData],
|
|
81
47
|
time_dict: Dict[str, ColumnData],
|
|
82
48
|
batch_dict: Dict[str, IndexData],
|
|
@@ -87,9 +53,9 @@ class PQueryBackend(Generic[TableData, ColumnData, IndexData], ABC):
|
|
|
87
53
|
pass
|
|
88
54
|
|
|
89
55
|
@abstractmethod
|
|
90
|
-
def
|
|
56
|
+
def execute_logical_operation(
|
|
91
57
|
self,
|
|
92
|
-
|
|
58
|
+
logical_operation: LogicalOperation,
|
|
93
59
|
feat_dict: Dict[str, TableData],
|
|
94
60
|
time_dict: Dict[str, ColumnData],
|
|
95
61
|
batch_dict: Dict[str, IndexData],
|
|
@@ -100,9 +66,9 @@ class PQueryBackend(Generic[TableData, ColumnData, IndexData], ABC):
|
|
|
100
66
|
pass
|
|
101
67
|
|
|
102
68
|
@abstractmethod
|
|
103
|
-
def
|
|
69
|
+
def execute_join(
|
|
104
70
|
self,
|
|
105
|
-
|
|
71
|
+
join: Join,
|
|
106
72
|
feat_dict: Dict[str, TableData],
|
|
107
73
|
time_dict: Dict[str, ColumnData],
|
|
108
74
|
batch_dict: Dict[str, IndexData],
|
|
@@ -113,20 +79,20 @@ class PQueryBackend(Generic[TableData, ColumnData, IndexData], ABC):
|
|
|
113
79
|
pass
|
|
114
80
|
|
|
115
81
|
@abstractmethod
|
|
116
|
-
def
|
|
82
|
+
def execute_filter(
|
|
117
83
|
self,
|
|
118
84
|
filter: Filter,
|
|
119
85
|
feat_dict: Dict[str, TableData],
|
|
120
86
|
time_dict: Dict[str, ColumnData],
|
|
121
87
|
batch_dict: Dict[str, IndexData],
|
|
122
88
|
anchor_time: ColumnData,
|
|
123
|
-
) -> IndexData:
|
|
89
|
+
) -> Tuple[ColumnData, IndexData]:
|
|
124
90
|
pass
|
|
125
91
|
|
|
126
92
|
@abstractmethod
|
|
127
|
-
def
|
|
93
|
+
def execute(
|
|
128
94
|
self,
|
|
129
|
-
query:
|
|
95
|
+
query: ValidatedPredictiveQuery,
|
|
130
96
|
feat_dict: Dict[str, TableData],
|
|
131
97
|
time_dict: Dict[str, ColumnData],
|
|
132
98
|
batch_dict: Dict[str, IndexData],
|