kumoai 2.13.0.dev202512031731__cp312-cp312-macosx_11_0_arm64.whl → 2.14.0.dev202512181731__cp312-cp312-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kumoai/__init__.py +12 -0
- kumoai/_version.py +1 -1
- kumoai/client/pquery.py +6 -2
- kumoai/experimental/rfm/__init__.py +33 -8
- kumoai/experimental/rfm/authenticate.py +3 -4
- kumoai/experimental/rfm/backend/local/__init__.py +4 -0
- kumoai/experimental/rfm/{local_graph_store.py → backend/local/graph_store.py} +52 -91
- kumoai/experimental/rfm/backend/local/sampler.py +315 -0
- kumoai/experimental/rfm/backend/local/table.py +31 -14
- kumoai/experimental/rfm/backend/snow/__init__.py +2 -0
- kumoai/experimental/rfm/backend/snow/sampler.py +252 -0
- kumoai/experimental/rfm/backend/snow/table.py +75 -23
- kumoai/experimental/rfm/backend/sqlite/__init__.py +4 -2
- kumoai/experimental/rfm/backend/sqlite/sampler.py +349 -0
- kumoai/experimental/rfm/backend/sqlite/table.py +71 -28
- kumoai/experimental/rfm/base/__init__.py +24 -3
- kumoai/experimental/rfm/base/column.py +6 -12
- kumoai/experimental/rfm/base/column_expression.py +16 -0
- kumoai/experimental/rfm/base/sampler.py +773 -0
- kumoai/experimental/rfm/base/source.py +1 -0
- kumoai/experimental/rfm/base/sql_sampler.py +84 -0
- kumoai/experimental/rfm/base/sql_table.py +113 -0
- kumoai/experimental/rfm/base/table.py +136 -105
- kumoai/experimental/rfm/graph.py +296 -89
- kumoai/experimental/rfm/infer/dtype.py +46 -59
- kumoai/experimental/rfm/infer/pkey.py +4 -2
- kumoai/experimental/rfm/infer/time_col.py +1 -2
- kumoai/experimental/rfm/pquery/executor.py +27 -27
- kumoai/experimental/rfm/pquery/pandas_executor.py +30 -32
- kumoai/experimental/rfm/rfm.py +299 -230
- kumoai/experimental/rfm/sagemaker.py +4 -4
- kumoai/pquery/predictive_query.py +10 -6
- kumoai/testing/snow.py +50 -0
- kumoai/utils/__init__.py +3 -2
- kumoai/utils/progress_logger.py +178 -12
- kumoai/utils/sql.py +3 -0
- {kumoai-2.13.0.dev202512031731.dist-info → kumoai-2.14.0.dev202512181731.dist-info}/METADATA +4 -2
- {kumoai-2.13.0.dev202512031731.dist-info → kumoai-2.14.0.dev202512181731.dist-info}/RECORD +41 -34
- kumoai/experimental/rfm/local_graph_sampler.py +0 -223
- kumoai/experimental/rfm/local_pquery_driver.py +0 -689
- {kumoai-2.13.0.dev202512031731.dist-info → kumoai-2.14.0.dev202512181731.dist-info}/WHEEL +0 -0
- {kumoai-2.13.0.dev202512031731.dist-info → kumoai-2.14.0.dev202512181731.dist-info}/licenses/LICENSE +0 -0
- {kumoai-2.13.0.dev202512031731.dist-info → kumoai-2.14.0.dev202512181731.dist-info}/top_level.txt +0 -0
|
@@ -1,21 +1,34 @@
|
|
|
1
1
|
import re
|
|
2
|
-
|
|
2
|
+
import warnings
|
|
3
|
+
from collections.abc import Sequence
|
|
4
|
+
from typing import cast
|
|
3
5
|
|
|
4
6
|
import pandas as pd
|
|
7
|
+
from kumoapi.model_plan import MissingType
|
|
5
8
|
from kumoapi.typing import Dtype
|
|
6
9
|
|
|
7
10
|
from kumoai.experimental.rfm.backend.sqlite import Connection
|
|
8
|
-
from kumoai.experimental.rfm.base import
|
|
11
|
+
from kumoai.experimental.rfm.base import (
|
|
12
|
+
ColumnExpressionType,
|
|
13
|
+
DataBackend,
|
|
14
|
+
SourceColumn,
|
|
15
|
+
SourceForeignKey,
|
|
16
|
+
SQLTable,
|
|
17
|
+
)
|
|
9
18
|
from kumoai.experimental.rfm.infer import infer_dtype
|
|
19
|
+
from kumoai.utils import quote_ident
|
|
10
20
|
|
|
11
21
|
|
|
12
|
-
class SQLiteTable(
|
|
22
|
+
class SQLiteTable(SQLTable):
|
|
13
23
|
r"""A table backed by a :class:`sqlite` database.
|
|
14
24
|
|
|
15
25
|
Args:
|
|
16
26
|
connection: The connection to a :class:`sqlite` database.
|
|
17
|
-
name: The name of this table.
|
|
18
|
-
|
|
27
|
+
name: The logical name of this table.
|
|
28
|
+
source_name: The physical name of this table in the database. If set to
|
|
29
|
+
``None``, ``name`` is being used.
|
|
30
|
+
columns: The selected physical columns of this table.
|
|
31
|
+
column_expressions: The logical columns of this table.
|
|
19
32
|
primary_key: The name of the primary key of this table, if it exists.
|
|
20
33
|
time_column: The name of the time column of this table, if it exists.
|
|
21
34
|
end_time_column: The name of the end time column of this table, if it
|
|
@@ -25,32 +38,53 @@ class SQLiteTable(Table):
|
|
|
25
38
|
self,
|
|
26
39
|
connection: Connection,
|
|
27
40
|
name: str,
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
41
|
+
source_name: str | None = None,
|
|
42
|
+
columns: Sequence[str] | None = None,
|
|
43
|
+
column_expressions: Sequence[ColumnExpressionType] | None = None,
|
|
44
|
+
primary_key: MissingType | str | None = MissingType.VALUE,
|
|
45
|
+
time_column: str | None = None,
|
|
46
|
+
end_time_column: str | None = None,
|
|
32
47
|
) -> None:
|
|
33
48
|
|
|
34
49
|
self._connection = connection
|
|
35
50
|
|
|
36
51
|
super().__init__(
|
|
37
52
|
name=name,
|
|
53
|
+
source_name=source_name,
|
|
38
54
|
columns=columns,
|
|
55
|
+
column_expressions=column_expressions,
|
|
39
56
|
primary_key=primary_key,
|
|
40
57
|
time_column=time_column,
|
|
41
58
|
end_time_column=end_time_column,
|
|
42
59
|
)
|
|
43
60
|
|
|
44
|
-
|
|
45
|
-
|
|
61
|
+
@property
|
|
62
|
+
def backend(self) -> DataBackend:
|
|
63
|
+
return cast(DataBackend, DataBackend.SQLITE)
|
|
64
|
+
|
|
65
|
+
def _get_source_columns(self) -> list[SourceColumn]:
|
|
66
|
+
source_columns: list[SourceColumn] = []
|
|
46
67
|
with self._connection.cursor() as cursor:
|
|
47
|
-
|
|
48
|
-
|
|
68
|
+
sql = f"PRAGMA table_info({self.fqn})"
|
|
69
|
+
cursor.execute(sql)
|
|
70
|
+
columns = cursor.fetchall()
|
|
71
|
+
|
|
72
|
+
if len(columns) == 0:
|
|
73
|
+
raise ValueError(f"Table '{self._source_name}' does not exist "
|
|
74
|
+
f"in the SQLite database")
|
|
49
75
|
|
|
50
|
-
|
|
51
|
-
|
|
76
|
+
unique_keys: set[str] = set()
|
|
77
|
+
sql = f"PRAGMA index_list({self.fqn})"
|
|
78
|
+
cursor.execute(sql)
|
|
79
|
+
for _, index_name, is_unique, *_ in cursor.fetchall():
|
|
80
|
+
if bool(is_unique):
|
|
81
|
+
sql = f"PRAGMA index_info({quote_ident(index_name)})"
|
|
82
|
+
cursor.execute(sql)
|
|
83
|
+
index = cursor.fetchall()
|
|
84
|
+
if len(index) == 1:
|
|
85
|
+
unique_keys.add(index[0][2])
|
|
52
86
|
|
|
53
|
-
for _, column, type,
|
|
87
|
+
for _, column, type, notnull, _, is_pkey in columns:
|
|
54
88
|
# Determine column affinity:
|
|
55
89
|
type = type.strip().upper()
|
|
56
90
|
if re.search('INT', type):
|
|
@@ -60,35 +94,44 @@ class SQLiteTable(Table):
|
|
|
60
94
|
elif re.search('REAL|FLOA|DOUB', type):
|
|
61
95
|
dtype = Dtype.float
|
|
62
96
|
else: # NUMERIC affinity.
|
|
97
|
+
ser = self._sample_df[column]
|
|
63
98
|
try:
|
|
64
|
-
dtype = infer_dtype(
|
|
65
|
-
except Exception
|
|
66
|
-
|
|
99
|
+
dtype = infer_dtype(ser)
|
|
100
|
+
except Exception:
|
|
101
|
+
warnings.warn(
|
|
102
|
+
f"Data type inference for column '{column}' in "
|
|
103
|
+
f"table '{self.name}' failed. Consider changing "
|
|
104
|
+
f"the data type of the column in the database or "
|
|
105
|
+
f"remove this column from this table.")
|
|
106
|
+
continue
|
|
67
107
|
|
|
68
108
|
source_column = SourceColumn(
|
|
69
109
|
name=column,
|
|
70
110
|
dtype=dtype,
|
|
71
111
|
is_primary_key=bool(is_pkey),
|
|
72
|
-
is_unique_key=
|
|
112
|
+
is_unique_key=column in unique_keys,
|
|
113
|
+
is_nullable=not bool(is_pkey) and not bool(notnull),
|
|
73
114
|
)
|
|
74
115
|
source_columns.append(source_column)
|
|
75
116
|
|
|
76
117
|
return source_columns
|
|
77
118
|
|
|
78
|
-
def _get_source_foreign_keys(self) ->
|
|
79
|
-
source_fkeys:
|
|
119
|
+
def _get_source_foreign_keys(self) -> list[SourceForeignKey]:
|
|
120
|
+
source_fkeys: list[SourceForeignKey] = []
|
|
80
121
|
with self._connection.cursor() as cursor:
|
|
81
|
-
|
|
82
|
-
|
|
122
|
+
sql = f"PRAGMA foreign_key_list({self.fqn})"
|
|
123
|
+
cursor.execute(sql)
|
|
124
|
+
for _, _, dst_table, fkey, pkey, *_ in cursor.fetchall():
|
|
83
125
|
source_fkeys.append(SourceForeignKey(fkey, dst_table, pkey))
|
|
84
126
|
return source_fkeys
|
|
85
127
|
|
|
86
128
|
def _get_sample_df(self) -> pd.DataFrame:
|
|
87
129
|
with self._connection.cursor() as cursor:
|
|
88
|
-
|
|
89
|
-
|
|
130
|
+
sql = (f"SELECT * FROM {self.fqn} "
|
|
131
|
+
f"ORDER BY rowid LIMIT 1000")
|
|
132
|
+
cursor.execute(sql)
|
|
90
133
|
table = cursor.fetch_arrow_table()
|
|
91
|
-
return table.to_pandas()
|
|
134
|
+
return table.to_pandas(types_mapper=pd.ArrowDtype)
|
|
92
135
|
|
|
93
|
-
def _get_num_rows(self) ->
|
|
136
|
+
def _get_num_rows(self) -> int | None:
|
|
94
137
|
return None
|
|
@@ -1,10 +1,31 @@
|
|
|
1
|
-
from .
|
|
2
|
-
|
|
3
|
-
|
|
1
|
+
from kumoapi.common import StrEnum
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class DataBackend(StrEnum):
|
|
5
|
+
LOCAL = 'local'
|
|
6
|
+
SQLITE = 'sqlite'
|
|
7
|
+
SNOWFLAKE = 'snowflake'
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
from .source import SourceColumn, SourceForeignKey # noqa: E402
|
|
11
|
+
from .column import Column # noqa: E402
|
|
12
|
+
from .column_expression import ColumnExpressionSpec # noqa: E402
|
|
13
|
+
from .column_expression import ColumnExpressionType # noqa: E402
|
|
14
|
+
from .table import Table # noqa: E402
|
|
15
|
+
from .sql_table import SQLTable # noqa: E402
|
|
16
|
+
from .sampler import SamplerOutput, Sampler # noqa: E402
|
|
17
|
+
from .sql_sampler import SQLSampler # noqa: E402
|
|
4
18
|
|
|
5
19
|
__all__ = [
|
|
20
|
+
'DataBackend',
|
|
6
21
|
'SourceColumn',
|
|
7
22
|
'SourceForeignKey',
|
|
8
23
|
'Column',
|
|
24
|
+
'ColumnExpressionSpec',
|
|
25
|
+
'ColumnExpressionType',
|
|
9
26
|
'Table',
|
|
27
|
+
'SQLTable',
|
|
28
|
+
'SamplerOutput',
|
|
29
|
+
'Sampler',
|
|
30
|
+
'SQLSampler',
|
|
10
31
|
]
|
|
@@ -8,20 +8,14 @@ from kumoapi.typing import Dtype, Stype
|
|
|
8
8
|
class Column:
|
|
9
9
|
stype: Stype
|
|
10
10
|
|
|
11
|
-
def __init__(
|
|
12
|
-
self,
|
|
13
|
-
name: str,
|
|
14
|
-
dtype: Dtype,
|
|
15
|
-
stype: Stype,
|
|
16
|
-
is_primary_key: bool = False,
|
|
17
|
-
is_time_column: bool = False,
|
|
18
|
-
is_end_time_column: bool = False,
|
|
19
|
-
) -> None:
|
|
11
|
+
def __init__(self, name: str, stype: Stype, dtype: Dtype) -> None:
|
|
20
12
|
self._name = name
|
|
21
13
|
self._dtype = Dtype(dtype)
|
|
22
|
-
|
|
23
|
-
self.
|
|
24
|
-
self.
|
|
14
|
+
|
|
15
|
+
self._is_primary_key = False
|
|
16
|
+
self._is_time_column = False
|
|
17
|
+
self._is_end_time_column = False
|
|
18
|
+
|
|
25
19
|
self.stype = Stype(stype)
|
|
26
20
|
|
|
27
21
|
@property
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import Any, TypeAlias
|
|
3
|
+
|
|
4
|
+
from kumoapi.typing import Dtype
|
|
5
|
+
|
|
6
|
+
from kumoai.mixin import CastMixin
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass(frozen=True)
|
|
10
|
+
class ColumnExpressionSpec(CastMixin):
|
|
11
|
+
name: str
|
|
12
|
+
expr: str
|
|
13
|
+
dtype: Dtype | None = None
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
ColumnExpressionType: TypeAlias = ColumnExpressionSpec | dict[str, Any]
|