kumoai 2.13.0.dev202512040649__cp313-cp313-win_amd64.whl → 2.14.0.dev202512211732__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kumoai/__init__.py +12 -0
- kumoai/_version.py +1 -1
- kumoai/client/pquery.py +6 -2
- kumoai/experimental/rfm/__init__.py +33 -8
- kumoai/experimental/rfm/authenticate.py +3 -4
- kumoai/experimental/rfm/backend/local/__init__.py +4 -0
- kumoai/experimental/rfm/{local_graph_store.py → backend/local/graph_store.py} +52 -91
- kumoai/experimental/rfm/backend/local/sampler.py +315 -0
- kumoai/experimental/rfm/backend/local/table.py +21 -16
- kumoai/experimental/rfm/backend/snow/__init__.py +2 -0
- kumoai/experimental/rfm/backend/snow/sampler.py +252 -0
- kumoai/experimental/rfm/backend/snow/table.py +102 -48
- kumoai/experimental/rfm/backend/sqlite/__init__.py +4 -2
- kumoai/experimental/rfm/backend/sqlite/sampler.py +349 -0
- kumoai/experimental/rfm/backend/sqlite/table.py +84 -31
- kumoai/experimental/rfm/base/__init__.py +26 -3
- kumoai/experimental/rfm/base/column.py +14 -12
- kumoai/experimental/rfm/base/column_expression.py +50 -0
- kumoai/experimental/rfm/base/sampler.py +773 -0
- kumoai/experimental/rfm/base/source.py +1 -0
- kumoai/experimental/rfm/base/sql_sampler.py +84 -0
- kumoai/experimental/rfm/base/sql_table.py +229 -0
- kumoai/experimental/rfm/base/table.py +173 -138
- kumoai/experimental/rfm/graph.py +302 -108
- kumoai/experimental/rfm/infer/__init__.py +6 -4
- kumoai/experimental/rfm/infer/dtype.py +3 -3
- kumoai/experimental/rfm/infer/pkey.py +4 -2
- kumoai/experimental/rfm/infer/stype.py +35 -0
- kumoai/experimental/rfm/infer/time_col.py +1 -2
- kumoai/experimental/rfm/pquery/executor.py +27 -27
- kumoai/experimental/rfm/pquery/pandas_executor.py +30 -32
- kumoai/experimental/rfm/rfm.py +299 -230
- kumoai/experimental/rfm/sagemaker.py +4 -4
- kumoai/kumolib.cp313-win_amd64.pyd +0 -0
- kumoai/pquery/predictive_query.py +10 -6
- kumoai/testing/snow.py +50 -0
- kumoai/utils/__init__.py +3 -2
- kumoai/utils/progress_logger.py +178 -12
- kumoai/utils/sql.py +3 -0
- {kumoai-2.13.0.dev202512040649.dist-info → kumoai-2.14.0.dev202512211732.dist-info}/METADATA +3 -2
- {kumoai-2.13.0.dev202512040649.dist-info → kumoai-2.14.0.dev202512211732.dist-info}/RECORD +44 -36
- kumoai/experimental/rfm/local_graph_sampler.py +0 -223
- kumoai/experimental/rfm/local_pquery_driver.py +0 -689
- {kumoai-2.13.0.dev202512040649.dist-info → kumoai-2.14.0.dev202512211732.dist-info}/WHEEL +0 -0
- {kumoai-2.13.0.dev202512040649.dist-info → kumoai-2.14.0.dev202512211732.dist-info}/licenses/LICENSE +0 -0
- {kumoai-2.13.0.dev202512040649.dist-info → kumoai-2.14.0.dev202512211732.dist-info}/top_level.txt +0 -0
```diff
--- a/kumoai/experimental/rfm/infer/__init__.py
+++ b/kumoai/experimental/rfm/infer/__init__.py
@@ -1,17 +1,19 @@
 from .dtype import infer_dtype
-from .pkey import infer_primary_key
-from .time_col import infer_time_column
 from .id import contains_id
 from .timestamp import contains_timestamp
 from .categorical import contains_categorical
 from .multicategorical import contains_multicategorical
+from .stype import infer_stype
+from .pkey import infer_primary_key
+from .time_col import infer_time_column
 
 __all__ = [
     'infer_dtype',
-    'infer_primary_key',
-    'infer_time_column',
     'contains_id',
     'contains_timestamp',
     'contains_categorical',
     'contains_multicategorical',
+    'infer_stype',
+    'infer_primary_key',
+    'infer_time_column',
 ]
```
```diff
--- a/kumoai/experimental/rfm/infer/dtype.py
+++ b/kumoai/experimental/rfm/infer/dtype.py
@@ -1,17 +1,17 @@
-from typing import Dict
-
 import numpy as np
 import pandas as pd
 import pyarrow as pa
 from kumoapi.typing import Dtype
 
-PANDAS_TO_DTYPE: Dict[str, Dtype] = {
+PANDAS_TO_DTYPE: dict[str, Dtype] = {
     'bool': Dtype.bool,
     'boolean': Dtype.bool,
     'int8': Dtype.int,
     'int16': Dtype.int,
     'int32': Dtype.int,
     'int64': Dtype.int,
+    'float': Dtype.float,
+    'double': Dtype.float,
     'float16': Dtype.float,
     'float32': Dtype.float,
     'float64': Dtype.float,
```
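The two added keys are not NumPy names: PyArrow spells 32-bit floats `float` and 64-bit floats `double`, so the new entries presumably let Arrow-backed columns resolve through the same mapping. A quick check of that naming (illustrative only):

```python
# PyArrow's floating-point type names differ from the NumPy-style
# 'float16'/'float32'/'float64' strings already in the mapping.
import pyarrow as pa

print(str(pa.float32()))  # -> 'float'
print(str(pa.float64()))  # -> 'double'
```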
```diff
--- a/kumoai/experimental/rfm/infer/pkey.py
+++ b/kumoai/experimental/rfm/infer/pkey.py
@@ -1,6 +1,5 @@
 import re
 import warnings
-from typing import Optional
 
 import pandas as pd
 
@@ -9,7 +8,7 @@ def infer_primary_key(
     table_name: str,
     df: pd.DataFrame,
     candidates: list[str],
-) -> Optional[str]:
+) -> str | None:
     r"""Auto-detect potential primary key column.
 
     Args:
@@ -20,6 +19,9 @@ def infer_primary_key(
     Returns:
         The name of the detected primary key, or ``None`` if not found.
     """
+    if len(candidates) == 0:
+        return None
+
     # A list of (potentially modified) table names that are eligible to match
     # with a primary key, i.e.:
     #   - UserInfo -> User
```
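Besides the `Optional[str]` to `str | None` modernization, the hunk adds a guard for an empty candidate list. A hypothetical call showing the new early return (table name and data are made up):

```python
import pandas as pd

from kumoai.experimental.rfm.infer import infer_primary_key

# No candidate columns: the new guard returns None up front instead of
# falling through to the table-name matching heuristics.
df = pd.DataFrame({'value': [1.0, 2.0, 3.0]})
assert infer_primary_key('User', df, candidates=[]) is None
```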
```diff
--- /dev/null
+++ b/kumoai/experimental/rfm/infer/stype.py
@@ -0,0 +1,35 @@
+import pandas as pd
+from kumoapi.typing import Dtype, Stype
+
+from kumoai.experimental.rfm.infer import (
+    contains_categorical,
+    contains_id,
+    contains_multicategorical,
+    contains_timestamp,
+)
+
+
+def infer_stype(ser: pd.Series, column_name: str, dtype: Dtype) -> Stype:
+    """Infers the :class:`Stype` from a :class:`pandas.Series`.
+
+    Args:
+        ser: A :class:`pandas.Series` to analyze.
+        column_name: The column name.
+        dtype: The data type.
+
+    Returns:
+        The semantic type.
+    """
+    if contains_id(ser, column_name, dtype):
+        return Stype.ID
+
+    if contains_timestamp(ser, column_name, dtype):
+        return Stype.timestamp
+
+    if contains_multicategorical(ser, column_name, dtype):
+        return Stype.multicategorical
+
+    if contains_categorical(ser, column_name, dtype):
+        return Stype.categorical
+
+    return dtype.default_stype
```
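The new module chains the existing `contains_*` detectors from most specific (ID) to least specific (categorical) and falls back to the dtype's default semantic type. A usage sketch, assuming `Dtype.string` is a valid member and that a low-cardinality string column trips the categorical detector:

```python
import pandas as pd
from kumoapi.typing import Dtype

from kumoai.experimental.rfm.infer import infer_stype

# Low-cardinality strings: contains_id() and contains_timestamp() should
# decline, so the categorical branch (or the dtype default) is expected.
ser = pd.Series(['red', 'blue', 'red', 'green', 'blue'])
print(infer_stype(ser, column_name='color', dtype=Dtype.string))
```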
```diff
--- a/kumoai/experimental/rfm/infer/time_col.py
+++ b/kumoai/experimental/rfm/infer/time_col.py
@@ -1,6 +1,5 @@
 import re
 import warnings
-from typing import Optional
 
 import pandas as pd
 
@@ -8,7 +7,7 @@ import pandas as pd
 def infer_time_column(
     df: pd.DataFrame,
     candidates: list[str],
-) -> Optional[str]:
+) -> str | None:
     r"""Auto-detect potential time column.
 
     Args:
```
```diff
--- a/kumoai/experimental/rfm/pquery/executor.py
+++ b/kumoai/experimental/rfm/pquery/executor.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Dict, Generic, Tuple, TypeVar
+from typing import Generic, TypeVar
 
 from kumoapi.pquery import ValidatedPredictiveQuery
 from kumoapi.pquery.AST import (
@@ -21,82 +21,82 @@ class PQueryExecutor(Generic[TableData, ColumnData, IndexData], ABC):
     def execute_column(
         self,
         column: Column,
-        feat_dict: Dict[str, TableData],
+        feat_dict: dict[str, TableData],
         filter_na: bool = True,
-    ) -> Tuple[ColumnData, IndexData]:
+    ) -> tuple[ColumnData, IndexData]:
         pass
 
     @abstractmethod
     def execute_aggregation(
         self,
         aggr: Aggregation,
-        feat_dict: Dict[str, TableData],
-        time_dict: Dict[str, ColumnData],
-        batch_dict: Dict[str, IndexData],
+        feat_dict: dict[str, TableData],
+        time_dict: dict[str, ColumnData],
+        batch_dict: dict[str, IndexData],
         anchor_time: ColumnData,
         filter_na: bool = True,
         num_forecasts: int = 1,
-    ) -> Tuple[ColumnData, IndexData]:
+    ) -> tuple[ColumnData, IndexData]:
         pass
 
     @abstractmethod
     def execute_condition(
         self,
         condition: Condition,
-        feat_dict: Dict[str, TableData],
-        time_dict: Dict[str, ColumnData],
-        batch_dict: Dict[str, IndexData],
+        feat_dict: dict[str, TableData],
+        time_dict: dict[str, ColumnData],
+        batch_dict: dict[str, IndexData],
         anchor_time: ColumnData,
         filter_na: bool = True,
         num_forecasts: int = 1,
-    ) -> Tuple[ColumnData, IndexData]:
+    ) -> tuple[ColumnData, IndexData]:
         pass
 
     @abstractmethod
     def execute_logical_operation(
         self,
         logical_operation: LogicalOperation,
-        feat_dict: Dict[str, TableData],
-        time_dict: Dict[str, ColumnData],
-        batch_dict: Dict[str, IndexData],
+        feat_dict: dict[str, TableData],
+        time_dict: dict[str, ColumnData],
+        batch_dict: dict[str, IndexData],
         anchor_time: ColumnData,
         filter_na: bool = True,
         num_forecasts: int = 1,
-    ) -> Tuple[ColumnData, IndexData]:
+    ) -> tuple[ColumnData, IndexData]:
         pass
 
     @abstractmethod
     def execute_join(
         self,
         join: Join,
-        feat_dict: Dict[str, TableData],
-        time_dict: Dict[str, ColumnData],
-        batch_dict: Dict[str, IndexData],
+        feat_dict: dict[str, TableData],
+        time_dict: dict[str, ColumnData],
+        batch_dict: dict[str, IndexData],
         anchor_time: ColumnData,
         filter_na: bool = True,
         num_forecasts: int = 1,
-    ) -> Tuple[ColumnData, IndexData]:
+    ) -> tuple[ColumnData, IndexData]:
         pass
 
     @abstractmethod
     def execute_filter(
         self,
         filter: Filter,
-        feat_dict: Dict[str, TableData],
-        time_dict: Dict[str, ColumnData],
-        batch_dict: Dict[str, IndexData],
+        feat_dict: dict[str, TableData],
+        time_dict: dict[str, ColumnData],
+        batch_dict: dict[str, IndexData],
         anchor_time: ColumnData,
-    ) -> Tuple[ColumnData, IndexData]:
+    ) -> tuple[ColumnData, IndexData]:
         pass
 
     @abstractmethod
     def execute(
         self,
         query: ValidatedPredictiveQuery,
-        feat_dict: Dict[str, TableData],
-        time_dict: Dict[str, ColumnData],
-        batch_dict: Dict[str, IndexData],
+        feat_dict: dict[str, TableData],
+        time_dict: dict[str, ColumnData],
+        batch_dict: dict[str, IndexData],
         anchor_time: ColumnData,
         num_forecasts: int = 1,
-    ) -> Tuple[ColumnData, IndexData]:
+    ) -> tuple[ColumnData, IndexData]:
         pass
```
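Only the annotation style changed here: `PQueryExecutor` remains a `Generic` ABC over three container types, which a backend pins to concrete types when subclassing. A skeletal (non-instantiable) sketch, with import paths inferred from the file layout:

```python
import numpy as np
import pandas as pd
from kumoapi.pquery.AST import Column

from kumoai.experimental.rfm.pquery.executor import PQueryExecutor

# The pandas backend binds TableData=pd.DataFrame, ColumnData=pd.Series,
# IndexData=np.ndarray; a custom backend would choose its own containers.
class SketchExecutor(PQueryExecutor[pd.DataFrame, pd.Series, np.ndarray]):
    def execute_column(
        self,
        column: Column,
        feat_dict: dict[str, pd.DataFrame],
        filter_na: bool = True,
    ) -> tuple[pd.Series, np.ndarray]:
        ...  # remaining abstract methods omitted for brevity
```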
```diff
--- a/kumoai/experimental/rfm/pquery/pandas_executor.py
+++ b/kumoai/experimental/rfm/pquery/pandas_executor.py
@@ -1,5 +1,3 @@
-from typing import Dict, List, Tuple
-
 import numpy as np
 import pandas as pd
 from kumoapi.pquery import ValidatedPredictiveQuery
@@ -22,9 +20,9 @@ class PQueryPandasExecutor(PQueryExecutor[pd.DataFrame, pd.Series,
     def execute_column(
         self,
         column: Column,
-        feat_dict: Dict[str, pd.DataFrame],
+        feat_dict: dict[str, pd.DataFrame],
         filter_na: bool = True,
-    ) -> Tuple[pd.Series, np.ndarray]:
+    ) -> tuple[pd.Series, np.ndarray]:
         table_name, column_name = column.fqn.split(".")
         if column_name == '*':
             out = pd.Series(np.ones(len(feat_dict[table_name]), dtype='int64'))
@@ -60,7 +58,7 @@ class PQueryPandasExecutor(PQueryExecutor[pd.DataFrame, pd.Series,
         batch: np.ndarray,
         batch_size: int,
         filter_na: bool = True,
-    ) -> Tuple[pd.Series, np.ndarray]:
+    ) -> tuple[pd.Series, np.ndarray]:
 
         mask = feat.notna()
         feat, batch = feat[mask], batch[mask]
@@ -104,13 +102,13 @@ class PQueryPandasExecutor(PQueryExecutor[pd.DataFrame, pd.Series,
     def execute_aggregation(
         self,
         aggr: Aggregation,
-        feat_dict: Dict[str, pd.DataFrame],
-        time_dict: Dict[str, pd.Series],
-        batch_dict: Dict[str, np.ndarray],
+        feat_dict: dict[str, pd.DataFrame],
+        time_dict: dict[str, pd.Series],
+        batch_dict: dict[str, np.ndarray],
         anchor_time: pd.Series,
         filter_na: bool = True,
         num_forecasts: int = 1,
-    ) -> Tuple[pd.Series, np.ndarray]:
+    ) -> tuple[pd.Series, np.ndarray]:
         target_table = aggr._get_target_column_name().split('.')[0]
         target_batch = batch_dict[target_table]
         target_time = time_dict[target_table]
@@ -131,10 +129,10 @@ class PQueryPandasExecutor(PQueryExecutor[pd.DataFrame, pd.Series,
             filter_na=True,
         )
 
-        outs: List[pd.Series] = []
-        masks: List[np.ndarray] = []
+        outs: list[pd.Series] = []
+        masks: list[np.ndarray] = []
         for _ in range(num_forecasts):
-            anchor_target_time = anchor_time[target_batch]
+            anchor_target_time = anchor_time.iloc[target_batch]
             anchor_target_time = anchor_target_time.reset_index(drop=True)
 
             time_filter_mask = (target_time <= anchor_target_time +
@@ -226,13 +224,13 @@ class PQueryPandasExecutor(PQueryExecutor[pd.DataFrame, pd.Series,
     def execute_condition(
         self,
         condition: Condition,
-        feat_dict: Dict[str, pd.DataFrame],
-        time_dict: Dict[str, pd.Series],
-        batch_dict: Dict[str, np.ndarray],
+        feat_dict: dict[str, pd.DataFrame],
+        time_dict: dict[str, pd.Series],
+        batch_dict: dict[str, np.ndarray],
         anchor_time: pd.Series,
         filter_na: bool = True,
         num_forecasts: int = 1,
-    ) -> Tuple[pd.Series, np.ndarray]:
+    ) -> tuple[pd.Series, np.ndarray]:
         if num_forecasts > 1:
             raise NotImplementedError("Forecasting not yet implemented for "
                                       "non-regression tasks")
@@ -306,13 +304,13 @@ class PQueryPandasExecutor(PQueryExecutor[pd.DataFrame, pd.Series,
     def execute_logical_operation(
         self,
         logical_operation: LogicalOperation,
-        feat_dict: Dict[str, pd.DataFrame],
-        time_dict: Dict[str, pd.Series],
-        batch_dict: Dict[str, np.ndarray],
+        feat_dict: dict[str, pd.DataFrame],
+        time_dict: dict[str, pd.Series],
+        batch_dict: dict[str, np.ndarray],
         anchor_time: pd.Series,
         filter_na: bool = True,
         num_forecasts: int = 1,
-    ) -> Tuple[pd.Series, np.ndarray]:
+    ) -> tuple[pd.Series, np.ndarray]:
         if num_forecasts > 1:
             raise NotImplementedError("Forecasting not yet implemented for "
                                       "non-regression tasks")
@@ -370,13 +368,13 @@ class PQueryPandasExecutor(PQueryExecutor[pd.DataFrame, pd.Series,
     def execute_join(
         self,
         join: Join,
-        feat_dict: Dict[str, pd.DataFrame],
-        time_dict: Dict[str, pd.Series],
-        batch_dict: Dict[str, np.ndarray],
+        feat_dict: dict[str, pd.DataFrame],
+        time_dict: dict[str, pd.Series],
+        batch_dict: dict[str, np.ndarray],
         anchor_time: pd.Series,
         filter_na: bool = True,
         num_forecasts: int = 1,
-    ) -> Tuple[pd.Series, np.ndarray]:
+    ) -> tuple[pd.Series, np.ndarray]:
         if isinstance(join.rhs_target, Aggregation):
             return self.execute_aggregation(
                 aggr=join.rhs_target,
@@ -393,12 +391,12 @@ class PQueryPandasExecutor(PQueryExecutor[pd.DataFrame, pd.Series,
     def execute_filter(
         self,
         filter: Filter,
-        feat_dict: Dict[str, pd.DataFrame],
-        time_dict: Dict[str, pd.Series],
-        batch_dict: Dict[str, np.ndarray],
+        feat_dict: dict[str, pd.DataFrame],
+        time_dict: dict[str, pd.Series],
+        batch_dict: dict[str, np.ndarray],
         anchor_time: pd.Series,
         filter_na: bool = True,
-    ) -> Tuple[pd.Series, np.ndarray]:
+    ) -> tuple[pd.Series, np.ndarray]:
         out, mask = self.execute_column(
             column=filter.target,
             feat_dict=feat_dict,
@@ -431,12 +429,12 @@ class PQueryPandasExecutor(PQueryExecutor[pd.DataFrame, pd.Series,
     def execute(
         self,
         query: ValidatedPredictiveQuery,
-        feat_dict: Dict[str, pd.DataFrame],
-        time_dict: Dict[str, pd.Series],
-        batch_dict: Dict[str, np.ndarray],
+        feat_dict: dict[str, pd.DataFrame],
+        time_dict: dict[str, pd.Series],
+        batch_dict: dict[str, np.ndarray],
         anchor_time: pd.Series,
         num_forecasts: int = 1,
-    ) -> Tuple[pd.Series, np.ndarray]:
+    ) -> tuple[pd.Series, np.ndarray]:
         if isinstance(query.entity_ast, Column):
             out, mask = self.execute_column(
                 column=query.entity_ast,
```
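The one behavioral fix in this file is `anchor_time[target_batch]` to `anchor_time.iloc[target_batch]`: square-bracket indexing on a `Series` is label-based, so integer batch positions can silently pick the wrong rows (or raise) once the index is no longer the default `RangeIndex`. A standalone illustration with made-up data:

```python
import numpy as np
import pandas as pd

# A Series whose index is not a default RangeIndex, e.g. after upstream
# filtering or sampling.
anchor_time = pd.Series(
    pd.to_datetime(['2024-01-01', '2024-01-02', '2024-01-03']),
    index=[10, 20, 30],
)
target_batch = np.array([2, 0])  # integer *positions* into the Series

print(anchor_time.iloc[target_batch])  # positional: third row, then first
# anchor_time[target_batch] would look up labels 2 and 0 instead and
# raise a KeyError here.
```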