kumoai 2.13.0.dev202512031731__cp312-cp312-macosx_11_0_arm64.whl → 2.14.0.dev202512181731__cp312-cp312-macosx_11_0_arm64.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (43)
  1. kumoai/__init__.py +12 -0
  2. kumoai/_version.py +1 -1
  3. kumoai/client/pquery.py +6 -2
  4. kumoai/experimental/rfm/__init__.py +33 -8
  5. kumoai/experimental/rfm/authenticate.py +3 -4
  6. kumoai/experimental/rfm/backend/local/__init__.py +4 -0
  7. kumoai/experimental/rfm/{local_graph_store.py → backend/local/graph_store.py} +52 -91
  8. kumoai/experimental/rfm/backend/local/sampler.py +315 -0
  9. kumoai/experimental/rfm/backend/local/table.py +31 -14
  10. kumoai/experimental/rfm/backend/snow/__init__.py +2 -0
  11. kumoai/experimental/rfm/backend/snow/sampler.py +252 -0
  12. kumoai/experimental/rfm/backend/snow/table.py +75 -23
  13. kumoai/experimental/rfm/backend/sqlite/__init__.py +4 -2
  14. kumoai/experimental/rfm/backend/sqlite/sampler.py +349 -0
  15. kumoai/experimental/rfm/backend/sqlite/table.py +71 -28
  16. kumoai/experimental/rfm/base/__init__.py +24 -3
  17. kumoai/experimental/rfm/base/column.py +6 -12
  18. kumoai/experimental/rfm/base/column_expression.py +16 -0
  19. kumoai/experimental/rfm/base/sampler.py +773 -0
  20. kumoai/experimental/rfm/base/source.py +1 -0
  21. kumoai/experimental/rfm/base/sql_sampler.py +84 -0
  22. kumoai/experimental/rfm/base/sql_table.py +113 -0
  23. kumoai/experimental/rfm/base/table.py +136 -105
  24. kumoai/experimental/rfm/graph.py +296 -89
  25. kumoai/experimental/rfm/infer/dtype.py +46 -59
  26. kumoai/experimental/rfm/infer/pkey.py +4 -2
  27. kumoai/experimental/rfm/infer/time_col.py +1 -2
  28. kumoai/experimental/rfm/pquery/executor.py +27 -27
  29. kumoai/experimental/rfm/pquery/pandas_executor.py +30 -32
  30. kumoai/experimental/rfm/rfm.py +299 -230
  31. kumoai/experimental/rfm/sagemaker.py +4 -4
  32. kumoai/pquery/predictive_query.py +10 -6
  33. kumoai/testing/snow.py +50 -0
  34. kumoai/utils/__init__.py +3 -2
  35. kumoai/utils/progress_logger.py +178 -12
  36. kumoai/utils/sql.py +3 -0
  37. {kumoai-2.13.0.dev202512031731.dist-info → kumoai-2.14.0.dev202512181731.dist-info}/METADATA +4 -2
  38. {kumoai-2.13.0.dev202512031731.dist-info → kumoai-2.14.0.dev202512181731.dist-info}/RECORD +41 -34
  39. kumoai/experimental/rfm/local_graph_sampler.py +0 -223
  40. kumoai/experimental/rfm/local_pquery_driver.py +0 -689
  41. {kumoai-2.13.0.dev202512031731.dist-info → kumoai-2.14.0.dev202512181731.dist-info}/WHEEL +0 -0
  42. {kumoai-2.13.0.dev202512031731.dist-info → kumoai-2.14.0.dev202512181731.dist-info}/licenses/LICENSE +0 -0
  43. {kumoai-2.13.0.dev202512031731.dist-info → kumoai-2.14.0.dev202512181731.dist-info}/top_level.txt +0 -0
@@ -1,21 +1,34 @@
1
1
  import re
2
- from typing import List, Optional, Sequence
2
+ import warnings
3
+ from collections.abc import Sequence
4
+ from typing import cast
3
5
 
4
6
  import pandas as pd
7
+ from kumoapi.model_plan import MissingType
5
8
  from kumoapi.typing import Dtype
6
9
 
7
10
  from kumoai.experimental.rfm.backend.sqlite import Connection
8
- from kumoai.experimental.rfm.base import SourceColumn, SourceForeignKey, Table
11
+ from kumoai.experimental.rfm.base import (
12
+ ColumnExpressionType,
13
+ DataBackend,
14
+ SourceColumn,
15
+ SourceForeignKey,
16
+ SQLTable,
17
+ )
9
18
  from kumoai.experimental.rfm.infer import infer_dtype
19
+ from kumoai.utils import quote_ident
10
20
 
11
21
 
12
- class SQLiteTable(Table):
22
+ class SQLiteTable(SQLTable):
13
23
  r"""A table backed by a :class:`sqlite` database.
14
24
 
15
25
  Args:
16
26
  connection: The connection to a :class:`sqlite` database.
17
- name: The name of this table.
18
- columns: The selected columns of this table.
27
+ name: The logical name of this table.
28
+ source_name: The physical name of this table in the database. If set to
29
+ ``None``, ``name`` is being used.
30
+ columns: The selected physical columns of this table.
31
+ column_expressions: The logical columns of this table.
19
32
  primary_key: The name of the primary key of this table, if it exists.
20
33
  time_column: The name of the time column of this table, if it exists.
21
34
  end_time_column: The name of the end time column of this table, if it
@@ -25,32 +38,53 @@ class SQLiteTable(Table):
25
38
  self,
26
39
  connection: Connection,
27
40
  name: str,
28
- columns: Optional[Sequence[str]] = None,
29
- primary_key: Optional[str] = None,
30
- time_column: Optional[str] = None,
31
- end_time_column: Optional[str] = None,
41
+ source_name: str | None = None,
42
+ columns: Sequence[str] | None = None,
43
+ column_expressions: Sequence[ColumnExpressionType] | None = None,
44
+ primary_key: MissingType | str | None = MissingType.VALUE,
45
+ time_column: str | None = None,
46
+ end_time_column: str | None = None,
32
47
  ) -> None:
33
48
 
34
49
  self._connection = connection
35
50
 
36
51
  super().__init__(
37
52
  name=name,
53
+ source_name=source_name,
38
54
  columns=columns,
55
+ column_expressions=column_expressions,
39
56
  primary_key=primary_key,
40
57
  time_column=time_column,
41
58
  end_time_column=end_time_column,
42
59
  )
43
60
 
44
- def _get_source_columns(self) -> List[SourceColumn]:
45
- source_columns: List[SourceColumn] = []
61
+ @property
62
+ def backend(self) -> DataBackend:
63
+ return cast(DataBackend, DataBackend.SQLITE)
64
+
65
+ def _get_source_columns(self) -> list[SourceColumn]:
66
+ source_columns: list[SourceColumn] = []
46
67
  with self._connection.cursor() as cursor:
47
- cursor.execute(f"PRAGMA table_info({self.name})")
48
- rows = cursor.fetchall()
68
+ sql = f"PRAGMA table_info({self.fqn})"
69
+ cursor.execute(sql)
70
+ columns = cursor.fetchall()
71
+
72
+ if len(columns) == 0:
73
+ raise ValueError(f"Table '{self._source_name}' does not exist "
74
+ f"in the SQLite database")
49
75
 
50
- if len(rows) == 0:
51
- raise ValueError(f"Table '{self.name}' does not exist")
76
+ unique_keys: set[str] = set()
77
+ sql = f"PRAGMA index_list({self.fqn})"
78
+ cursor.execute(sql)
79
+ for _, index_name, is_unique, *_ in cursor.fetchall():
80
+ if bool(is_unique):
81
+ sql = f"PRAGMA index_info({quote_ident(index_name)})"
82
+ cursor.execute(sql)
83
+ index = cursor.fetchall()
84
+ if len(index) == 1:
85
+ unique_keys.add(index[0][2])
52
86
 
53
- for _, column, type, _, _, is_pkey in rows:
87
+ for _, column, type, notnull, _, is_pkey in columns:
54
88
  # Determine column affinity:
55
89
  type = type.strip().upper()
56
90
  if re.search('INT', type):
@@ -60,35 +94,44 @@ class SQLiteTable(Table):
60
94
  elif re.search('REAL|FLOA|DOUB', type):
61
95
  dtype = Dtype.float
62
96
  else: # NUMERIC affinity.
97
+ ser = self._sample_df[column]
63
98
  try:
64
- dtype = infer_dtype(self._sample_df[column])
65
- except Exception as e:
66
- raise e
99
+ dtype = infer_dtype(ser)
100
+ except Exception:
101
+ warnings.warn(
102
+ f"Data type inference for column '{column}' in "
103
+ f"table '{self.name}' failed. Consider changing "
104
+ f"the data type of the column in the database or "
105
+ f"remove this column from this table.")
106
+ continue
67
107
 
68
108
  source_column = SourceColumn(
69
109
  name=column,
70
110
  dtype=dtype,
71
111
  is_primary_key=bool(is_pkey),
72
- is_unique_key=False,
112
+ is_unique_key=column in unique_keys,
113
+ is_nullable=not bool(is_pkey) and not bool(notnull),
73
114
  )
74
115
  source_columns.append(source_column)
75
116
 
76
117
  return source_columns
77
118
 
78
- def _get_source_foreign_keys(self) -> List[SourceForeignKey]:
79
- source_fkeys: List[SourceForeignKey] = []
119
+ def _get_source_foreign_keys(self) -> list[SourceForeignKey]:
120
+ source_fkeys: list[SourceForeignKey] = []
80
121
  with self._connection.cursor() as cursor:
81
- cursor.execute(f"PRAGMA foreign_key_list({self.name})")
82
- for _, _, dst_table, fkey, pkey, _, _, _ in cursor.fetchall():
122
+ sql = f"PRAGMA foreign_key_list({self.fqn})"
123
+ cursor.execute(sql)
124
+ for _, _, dst_table, fkey, pkey, *_ in cursor.fetchall():
83
125
  source_fkeys.append(SourceForeignKey(fkey, dst_table, pkey))
84
126
  return source_fkeys
85
127
 
86
128
  def _get_sample_df(self) -> pd.DataFrame:
87
129
  with self._connection.cursor() as cursor:
88
- cursor.execute(f"SELECT * FROM {self.name} "
89
- f"ORDER BY rowid LIMIT 1000")
130
+ sql = (f"SELECT * FROM {self.fqn} "
131
+ f"ORDER BY rowid LIMIT 1000")
132
+ cursor.execute(sql)
90
133
  table = cursor.fetch_arrow_table()
91
- return table.to_pandas()
134
+ return table.to_pandas(types_mapper=pd.ArrowDtype)
92
135
 
93
- def _get_num_rows(self) -> Optional[int]:
136
+ def _get_num_rows(self) -> int | None:
94
137
  return None
@@ -1,10 +1,31 @@
1
- from .source import SourceColumn, SourceForeignKey
2
- from .column import Column
3
- from .table import Table
1
+ from kumoapi.common import StrEnum
2
+
3
+
4
+ class DataBackend(StrEnum):
5
+ LOCAL = 'local'
6
+ SQLITE = 'sqlite'
7
+ SNOWFLAKE = 'snowflake'
8
+
9
+
10
+ from .source import SourceColumn, SourceForeignKey # noqa: E402
11
+ from .column import Column # noqa: E402
12
+ from .column_expression import ColumnExpressionSpec # noqa: E402
13
+ from .column_expression import ColumnExpressionType # noqa: E402
14
+ from .table import Table # noqa: E402
15
+ from .sql_table import SQLTable # noqa: E402
16
+ from .sampler import SamplerOutput, Sampler # noqa: E402
17
+ from .sql_sampler import SQLSampler # noqa: E402
4
18
 
5
19
  __all__ = [
20
+ 'DataBackend',
6
21
  'SourceColumn',
7
22
  'SourceForeignKey',
8
23
  'Column',
24
+ 'ColumnExpressionSpec',
25
+ 'ColumnExpressionType',
9
26
  'Table',
27
+ 'SQLTable',
28
+ 'SamplerOutput',
29
+ 'Sampler',
30
+ 'SQLSampler',
10
31
  ]
@@ -8,20 +8,14 @@ from kumoapi.typing import Dtype, Stype
8
8
  class Column:
9
9
  stype: Stype
10
10
 
11
- def __init__(
12
- self,
13
- name: str,
14
- dtype: Dtype,
15
- stype: Stype,
16
- is_primary_key: bool = False,
17
- is_time_column: bool = False,
18
- is_end_time_column: bool = False,
19
- ) -> None:
11
+ def __init__(self, name: str, stype: Stype, dtype: Dtype) -> None:
20
12
  self._name = name
21
13
  self._dtype = Dtype(dtype)
22
- self._is_primary_key = is_primary_key
23
- self._is_time_column = is_time_column
24
- self._is_end_time_column = is_end_time_column
14
+
15
+ self._is_primary_key = False
16
+ self._is_time_column = False
17
+ self._is_end_time_column = False
18
+
25
19
  self.stype = Stype(stype)
26
20
 
27
21
  @property
@@ -0,0 +1,16 @@
1
+ from dataclasses import dataclass
2
+ from typing import Any, TypeAlias
3
+
4
+ from kumoapi.typing import Dtype
5
+
6
+ from kumoai.mixin import CastMixin
7
+
8
+
9
+ @dataclass(frozen=True)
10
+ class ColumnExpressionSpec(CastMixin):
11
+ name: str
12
+ expr: str
13
+ dtype: Dtype | None = None
14
+
15
+
16
+ ColumnExpressionType: TypeAlias = ColumnExpressionSpec | dict[str, Any]