kumoai 2.13.0.dev202512040649__cp313-cp313-win_amd64.whl → 2.14.0.dev202512211732__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kumoai/__init__.py +12 -0
- kumoai/_version.py +1 -1
- kumoai/client/pquery.py +6 -2
- kumoai/experimental/rfm/__init__.py +33 -8
- kumoai/experimental/rfm/authenticate.py +3 -4
- kumoai/experimental/rfm/backend/local/__init__.py +4 -0
- kumoai/experimental/rfm/{local_graph_store.py → backend/local/graph_store.py} +52 -91
- kumoai/experimental/rfm/backend/local/sampler.py +315 -0
- kumoai/experimental/rfm/backend/local/table.py +21 -16
- kumoai/experimental/rfm/backend/snow/__init__.py +2 -0
- kumoai/experimental/rfm/backend/snow/sampler.py +252 -0
- kumoai/experimental/rfm/backend/snow/table.py +102 -48
- kumoai/experimental/rfm/backend/sqlite/__init__.py +4 -2
- kumoai/experimental/rfm/backend/sqlite/sampler.py +349 -0
- kumoai/experimental/rfm/backend/sqlite/table.py +84 -31
- kumoai/experimental/rfm/base/__init__.py +26 -3
- kumoai/experimental/rfm/base/column.py +14 -12
- kumoai/experimental/rfm/base/column_expression.py +50 -0
- kumoai/experimental/rfm/base/sampler.py +773 -0
- kumoai/experimental/rfm/base/source.py +1 -0
- kumoai/experimental/rfm/base/sql_sampler.py +84 -0
- kumoai/experimental/rfm/base/sql_table.py +229 -0
- kumoai/experimental/rfm/base/table.py +173 -138
- kumoai/experimental/rfm/graph.py +302 -108
- kumoai/experimental/rfm/infer/__init__.py +6 -4
- kumoai/experimental/rfm/infer/dtype.py +3 -3
- kumoai/experimental/rfm/infer/pkey.py +4 -2
- kumoai/experimental/rfm/infer/stype.py +35 -0
- kumoai/experimental/rfm/infer/time_col.py +1 -2
- kumoai/experimental/rfm/pquery/executor.py +27 -27
- kumoai/experimental/rfm/pquery/pandas_executor.py +30 -32
- kumoai/experimental/rfm/rfm.py +299 -230
- kumoai/experimental/rfm/sagemaker.py +4 -4
- kumoai/kumolib.cp313-win_amd64.pyd +0 -0
- kumoai/pquery/predictive_query.py +10 -6
- kumoai/testing/snow.py +50 -0
- kumoai/utils/__init__.py +3 -2
- kumoai/utils/progress_logger.py +178 -12
- kumoai/utils/sql.py +3 -0
- {kumoai-2.13.0.dev202512040649.dist-info → kumoai-2.14.0.dev202512211732.dist-info}/METADATA +3 -2
- {kumoai-2.13.0.dev202512040649.dist-info → kumoai-2.14.0.dev202512211732.dist-info}/RECORD +44 -36
- kumoai/experimental/rfm/local_graph_sampler.py +0 -223
- kumoai/experimental/rfm/local_pquery_driver.py +0 -689
- {kumoai-2.13.0.dev202512040649.dist-info → kumoai-2.14.0.dev202512211732.dist-info}/WHEEL +0 -0
- {kumoai-2.13.0.dev202512040649.dist-info → kumoai-2.14.0.dev202512211732.dist-info}/licenses/LICENSE +0 -0
- {kumoai-2.13.0.dev202512040649.dist-info → kumoai-2.14.0.dev202512211732.dist-info}/top_level.txt +0 -0
|
@@ -1,22 +1,35 @@
|
|
|
1
1
|
import re
|
|
2
2
|
import warnings
|
|
3
|
-
from
|
|
3
|
+
from collections.abc import Sequence
|
|
4
|
+
from typing import cast
|
|
4
5
|
|
|
5
6
|
import pandas as pd
|
|
7
|
+
from kumoapi.model_plan import MissingType
|
|
6
8
|
from kumoapi.typing import Dtype
|
|
7
9
|
|
|
8
10
|
from kumoai.experimental.rfm.backend.sqlite import Connection
|
|
9
|
-
from kumoai.experimental.rfm.base import
|
|
11
|
+
from kumoai.experimental.rfm.base import (
|
|
12
|
+
ColumnExpressionSpec,
|
|
13
|
+
ColumnExpressionType,
|
|
14
|
+
DataBackend,
|
|
15
|
+
SourceColumn,
|
|
16
|
+
SourceForeignKey,
|
|
17
|
+
SQLTable,
|
|
18
|
+
)
|
|
10
19
|
from kumoai.experimental.rfm.infer import infer_dtype
|
|
20
|
+
from kumoai.utils import quote_ident
|
|
11
21
|
|
|
12
22
|
|
|
13
|
-
class SQLiteTable(
|
|
23
|
+
class SQLiteTable(SQLTable):
|
|
14
24
|
r"""A table backed by a :class:`sqlite` database.
|
|
15
25
|
|
|
16
26
|
Args:
|
|
17
27
|
connection: The connection to a :class:`sqlite` database.
|
|
18
|
-
name: The name of this table.
|
|
19
|
-
|
|
28
|
+
name: The logical name of this table.
|
|
29
|
+
source_name: The physical name of this table in the database. If set to
|
|
30
|
+
``None``, ``name`` is being used.
|
|
31
|
+
columns: The selected physical columns of this table.
|
|
32
|
+
column_expressions: The logical columns of this table.
|
|
20
33
|
primary_key: The name of the primary key of this table, if it exists.
|
|
21
34
|
time_column: The name of the time column of this table, if it exists.
|
|
22
35
|
end_time_column: The name of the end time column of this table, if it
|
|
@@ -26,32 +39,53 @@ class SQLiteTable(Table):
|
|
|
26
39
|
self,
|
|
27
40
|
connection: Connection,
|
|
28
41
|
name: str,
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
42
|
+
source_name: str | None = None,
|
|
43
|
+
columns: Sequence[str] | None = None,
|
|
44
|
+
column_expressions: Sequence[ColumnExpressionType] | None = None,
|
|
45
|
+
primary_key: MissingType | str | None = MissingType.VALUE,
|
|
46
|
+
time_column: str | None = None,
|
|
47
|
+
end_time_column: str | None = None,
|
|
33
48
|
) -> None:
|
|
34
49
|
|
|
35
50
|
self._connection = connection
|
|
36
51
|
|
|
37
52
|
super().__init__(
|
|
38
53
|
name=name,
|
|
54
|
+
source_name=source_name,
|
|
39
55
|
columns=columns,
|
|
56
|
+
column_expressions=column_expressions,
|
|
40
57
|
primary_key=primary_key,
|
|
41
58
|
time_column=time_column,
|
|
42
59
|
end_time_column=end_time_column,
|
|
43
60
|
)
|
|
44
61
|
|
|
45
|
-
|
|
46
|
-
|
|
62
|
+
@property
|
|
63
|
+
def backend(self) -> DataBackend:
|
|
64
|
+
return cast(DataBackend, DataBackend.SQLITE)
|
|
65
|
+
|
|
66
|
+
def _get_source_columns(self) -> list[SourceColumn]:
|
|
67
|
+
source_columns: list[SourceColumn] = []
|
|
47
68
|
with self._connection.cursor() as cursor:
|
|
48
|
-
|
|
49
|
-
|
|
69
|
+
sql = f"PRAGMA table_info({self.fqn})"
|
|
70
|
+
cursor.execute(sql)
|
|
71
|
+
columns = cursor.fetchall()
|
|
72
|
+
|
|
73
|
+
if len(columns) == 0:
|
|
74
|
+
raise ValueError(f"Table '{self._source_name}' does not exist "
|
|
75
|
+
f"in the SQLite database")
|
|
50
76
|
|
|
51
|
-
|
|
52
|
-
|
|
77
|
+
unique_keys: set[str] = set()
|
|
78
|
+
sql = f"PRAGMA index_list({self.fqn})"
|
|
79
|
+
cursor.execute(sql)
|
|
80
|
+
for _, index_name, is_unique, *_ in cursor.fetchall():
|
|
81
|
+
if bool(is_unique):
|
|
82
|
+
sql = f"PRAGMA index_info({quote_ident(index_name)})"
|
|
83
|
+
cursor.execute(sql)
|
|
84
|
+
index = cursor.fetchall()
|
|
85
|
+
if len(index) == 1:
|
|
86
|
+
unique_keys.add(index[0][2])
|
|
53
87
|
|
|
54
|
-
for _, column, type,
|
|
88
|
+
for _, column, type, notnull, _, is_pkey in columns:
|
|
55
89
|
# Determine column affinity:
|
|
56
90
|
type = type.strip().upper()
|
|
57
91
|
if re.search('INT', type):
|
|
@@ -61,41 +95,60 @@ class SQLiteTable(Table):
|
|
|
61
95
|
elif re.search('REAL|FLOA|DOUB', type):
|
|
62
96
|
dtype = Dtype.float
|
|
63
97
|
else: # NUMERIC affinity.
|
|
64
|
-
ser = self.
|
|
98
|
+
ser = self._source_sample_df[column]
|
|
65
99
|
try:
|
|
66
100
|
dtype = infer_dtype(ser)
|
|
67
101
|
except Exception:
|
|
68
|
-
warnings.warn(
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
102
|
+
warnings.warn(f"Encountered unsupported data type "
|
|
103
|
+
f"'{ser.dtype}' with source data type "
|
|
104
|
+
f"'{type}' for column '{column}' in "
|
|
105
|
+
f"table '{self.name}'. If possible, "
|
|
106
|
+
f"change the data type of the column in "
|
|
107
|
+
f"your SQLite database to use it within "
|
|
108
|
+
f"this table.")
|
|
73
109
|
continue
|
|
74
110
|
|
|
75
111
|
source_column = SourceColumn(
|
|
76
112
|
name=column,
|
|
77
113
|
dtype=dtype,
|
|
78
114
|
is_primary_key=bool(is_pkey),
|
|
79
|
-
is_unique_key=
|
|
115
|
+
is_unique_key=column in unique_keys,
|
|
116
|
+
is_nullable=not bool(is_pkey) and not bool(notnull),
|
|
80
117
|
)
|
|
81
118
|
source_columns.append(source_column)
|
|
82
119
|
|
|
83
120
|
return source_columns
|
|
84
121
|
|
|
85
|
-
def _get_source_foreign_keys(self) ->
|
|
86
|
-
source_fkeys:
|
|
122
|
+
def _get_source_foreign_keys(self) -> list[SourceForeignKey]:
|
|
123
|
+
source_fkeys: list[SourceForeignKey] = []
|
|
87
124
|
with self._connection.cursor() as cursor:
|
|
88
|
-
|
|
89
|
-
|
|
125
|
+
sql = f"PRAGMA foreign_key_list({self.fqn})"
|
|
126
|
+
cursor.execute(sql)
|
|
127
|
+
for _, _, dst_table, fkey, pkey, *_ in cursor.fetchall():
|
|
90
128
|
source_fkeys.append(SourceForeignKey(fkey, dst_table, pkey))
|
|
91
129
|
return source_fkeys
|
|
92
130
|
|
|
93
|
-
def
|
|
131
|
+
def _get_source_sample_df(self) -> pd.DataFrame:
|
|
94
132
|
with self._connection.cursor() as cursor:
|
|
95
|
-
|
|
96
|
-
|
|
133
|
+
sql = (f"SELECT * FROM {self.fqn} "
|
|
134
|
+
f"ORDER BY rowid LIMIT 1000")
|
|
135
|
+
cursor.execute(sql)
|
|
97
136
|
table = cursor.fetch_arrow_table()
|
|
98
137
|
return table.to_pandas(types_mapper=pd.ArrowDtype)
|
|
99
138
|
|
|
100
|
-
def _get_num_rows(self) ->
|
|
139
|
+
def _get_num_rows(self) -> int | None:
|
|
101
140
|
return None
|
|
141
|
+
|
|
142
|
+
def _get_expression_sample_df(
|
|
143
|
+
self,
|
|
144
|
+
specs: Sequence[ColumnExpressionSpec],
|
|
145
|
+
) -> pd.DataFrame:
|
|
146
|
+
with self._connection.cursor() as cursor:
|
|
147
|
+
columns = [
|
|
148
|
+
f"{spec.expr} AS {quote_ident(spec.name)}" for spec in specs
|
|
149
|
+
]
|
|
150
|
+
sql = (f"SELECT {', '.join(columns)} FROM {self.fqn} "
|
|
151
|
+
f"ORDER BY rowid LIMIT 1000")
|
|
152
|
+
cursor.execute(sql)
|
|
153
|
+
table = cursor.fetch_arrow_table()
|
|
154
|
+
return table.to_pandas(types_mapper=pd.ArrowDtype)
|
|
@@ -1,10 +1,33 @@
|
|
|
1
|
-
from .
|
|
2
|
-
|
|
3
|
-
|
|
1
|
+
from kumoapi.common import StrEnum
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class DataBackend(StrEnum):
|
|
5
|
+
LOCAL = 'local'
|
|
6
|
+
SQLITE = 'sqlite'
|
|
7
|
+
SNOWFLAKE = 'snowflake'
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
from .source import SourceColumn, SourceForeignKey # noqa: E402
|
|
11
|
+
from .column import Column # noqa: E402
|
|
12
|
+
from .column_expression import ColumnExpressionSpec # noqa: E402
|
|
13
|
+
from .column_expression import ColumnExpressionType # noqa: E402
|
|
14
|
+
from .column_expression import ColumnExpression # noqa: E402
|
|
15
|
+
from .table import Table # noqa: E402
|
|
16
|
+
from .sql_table import SQLTable # noqa: E402
|
|
17
|
+
from .sampler import SamplerOutput, Sampler # noqa: E402
|
|
18
|
+
from .sql_sampler import SQLSampler # noqa: E402
|
|
4
19
|
|
|
5
20
|
__all__ = [
|
|
21
|
+
'DataBackend',
|
|
6
22
|
'SourceColumn',
|
|
7
23
|
'SourceForeignKey',
|
|
8
24
|
'Column',
|
|
25
|
+
'ColumnExpressionSpec',
|
|
26
|
+
'ColumnExpressionType',
|
|
27
|
+
'ColumnExpression',
|
|
9
28
|
'Table',
|
|
29
|
+
'SQLTable',
|
|
30
|
+
'SamplerOutput',
|
|
31
|
+
'Sampler',
|
|
32
|
+
'SQLSampler',
|
|
10
33
|
]
|
|
@@ -8,20 +8,14 @@ from kumoapi.typing import Dtype, Stype
|
|
|
8
8
|
class Column:
|
|
9
9
|
stype: Stype
|
|
10
10
|
|
|
11
|
-
def __init__(
|
|
12
|
-
self,
|
|
13
|
-
name: str,
|
|
14
|
-
dtype: Dtype,
|
|
15
|
-
stype: Stype,
|
|
16
|
-
is_primary_key: bool = False,
|
|
17
|
-
is_time_column: bool = False,
|
|
18
|
-
is_end_time_column: bool = False,
|
|
19
|
-
) -> None:
|
|
11
|
+
def __init__(self, name: str, stype: Stype, dtype: Dtype) -> None:
|
|
20
12
|
self._name = name
|
|
21
13
|
self._dtype = Dtype(dtype)
|
|
22
|
-
|
|
23
|
-
self.
|
|
24
|
-
self.
|
|
14
|
+
|
|
15
|
+
self._is_primary_key = False
|
|
16
|
+
self._is_time_column = False
|
|
17
|
+
self._is_end_time_column = False
|
|
18
|
+
|
|
25
19
|
self.stype = Stype(stype)
|
|
26
20
|
|
|
27
21
|
@property
|
|
@@ -32,6 +26,14 @@ class Column:
|
|
|
32
26
|
def dtype(self) -> Dtype:
|
|
33
27
|
return self._dtype
|
|
34
28
|
|
|
29
|
+
@property
|
|
30
|
+
def is_physical(self) -> bool:
|
|
31
|
+
return True
|
|
32
|
+
|
|
33
|
+
@property
|
|
34
|
+
def is_logical(self) -> bool:
|
|
35
|
+
return not self.is_physical
|
|
36
|
+
|
|
35
37
|
def __setattr__(self, key: str, val: Any) -> None:
|
|
36
38
|
if key == 'stype':
|
|
37
39
|
if isinstance(val, str):
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import Any, TypeAlias
|
|
3
|
+
|
|
4
|
+
from kumoapi.typing import Dtype, Stype
|
|
5
|
+
|
|
6
|
+
from kumoai.experimental.rfm.base import Column
|
|
7
|
+
from kumoai.mixin import CastMixin
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass(frozen=True)
|
|
11
|
+
class ColumnExpressionSpec(CastMixin):
|
|
12
|
+
name: str
|
|
13
|
+
expr: str
|
|
14
|
+
dtype: Dtype | None = None
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
ColumnExpressionType: TypeAlias = ColumnExpressionSpec | dict[str, Any]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass(init=False, repr=False, eq=False)
|
|
21
|
+
class ColumnExpression(Column):
|
|
22
|
+
def __init__(
|
|
23
|
+
self,
|
|
24
|
+
name: str,
|
|
25
|
+
expr: str,
|
|
26
|
+
stype: Stype,
|
|
27
|
+
dtype: Dtype,
|
|
28
|
+
) -> None:
|
|
29
|
+
super().__init__(name=name, stype=stype, dtype=dtype)
|
|
30
|
+
self._expr = expr
|
|
31
|
+
|
|
32
|
+
@property
|
|
33
|
+
def expr(self) -> str:
|
|
34
|
+
return self._expr
|
|
35
|
+
|
|
36
|
+
@property
|
|
37
|
+
def is_physical(self) -> bool:
|
|
38
|
+
return False
|
|
39
|
+
|
|
40
|
+
def __hash__(self) -> int:
|
|
41
|
+
return hash((self.name, self.expr, self.stype, self.dtype))
|
|
42
|
+
|
|
43
|
+
def __eq__(self, other: Any) -> bool:
|
|
44
|
+
if not isinstance(other, ColumnExpression):
|
|
45
|
+
return False
|
|
46
|
+
return hash(self) == hash(other)
|
|
47
|
+
|
|
48
|
+
def __repr__(self) -> str:
|
|
49
|
+
return (f'{self.__class__.__name__}(name={self.name}, '
|
|
50
|
+
f'expr={self.expr}, stype={self.stype}, dtype={self.dtype})')
|