kumoai 2.13.0.dev202512021731__cp310-cp310-win_amd64.whl → 2.13.0.dev202512040252__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kumoai/_version.py CHANGED
@@ -1 +1 @@
1
- __version__ = '2.13.0.dev202512021731'
1
+ __version__ = '2.13.0.dev202512040252'
@@ -1,14 +1,10 @@
1
- from typing import Any, Dict, List, Optional, Tuple
1
+ import warnings
2
+ from typing import List, Optional
2
3
 
3
- import numpy as np
4
4
  import pandas as pd
5
- import pyarrow as pa
6
- from kumoapi.typing import Dtype, Stype
7
- from typing_extensions import Self
8
5
 
9
- from kumoai.experimental.rfm import utils
10
- from kumoai.experimental.rfm.base import Column, Table
11
- from kumoai.experimental.rfm.infer import infer_stype
6
+ from kumoai.experimental.rfm.base import SourceColumn, SourceForeignKey, Table
7
+ from kumoai.experimental.rfm.infer import infer_dtype
12
8
 
13
9
 
14
10
  class LocalTable(Table):
@@ -62,7 +58,7 @@ class LocalTable(Table):
62
58
  ) -> None:
63
59
 
64
60
  if df.empty:
65
- raise ValueError("Data frame must have at least one row")
61
+ raise ValueError("Data frame is empty")
66
62
  if isinstance(df.columns, pd.MultiIndex):
67
63
  raise ValueError("Data frame must not have a multi-index")
68
64
  if not df.columns.is_unique:
@@ -80,165 +76,34 @@ class LocalTable(Table):
80
76
  end_time_column=end_time_column,
81
77
  )
82
78
 
83
- def infer_metadata(self, verbose: bool = True) -> Self:
84
- r"""Infers metadata, *i.e.*, primary keys and time columns, in the
85
- table.
86
-
87
- Args:
88
- verbose: Whether to print verbose output.
89
- """
90
- logs = []
91
-
92
- # Try to detect primary key if not set:
93
- if not self.has_primary_key():
94
-
95
- def is_candidate(column: Column) -> bool:
96
- if column.stype == Stype.ID:
97
- return True
98
- if all(column.stype != Stype.ID for column in self.columns):
99
- if self.name == column.name:
100
- return True
101
- if (self.name.endswith('s')
102
- and self.name[:-1] == column.name):
103
- return True
104
- return False
105
-
106
- candidates = [
107
- column.name for column in self.columns if is_candidate(column)
108
- ]
109
-
110
- if primary_key := utils.detect_primary_key(
111
- table_name=self.name,
112
- df=self._data,
113
- candidates=candidates,
114
- ):
115
- self.primary_key = primary_key
116
- logs.append(f"primary key '{primary_key}'")
117
-
118
- # Try to detect time column if not set:
119
- if not self.has_time_column():
120
- candidates = [
121
- column.name for column in self.columns
122
- if column.stype == Stype.timestamp
123
- and column.name != self._end_time_column
124
- ]
125
- if time_column := utils.detect_time_column(self._data, candidates):
126
- self.time_column = time_column
127
- logs.append(f"time column '{time_column}'")
128
-
129
- if verbose and len(logs) > 0:
130
- print(f"Detected {' and '.join(logs)} in table '{self.name}'")
131
-
132
- return self
133
-
134
- def _has_source_column(self, name: str) -> bool:
135
- return name in self._data.columns
136
-
137
- def _get_source_dtype(self, name: str) -> Dtype:
138
- return to_dtype(self._data[name])
139
-
140
- def _get_source_stype(self, name: str, dtype: Dtype) -> Stype:
141
- return infer_stype(self._data[name], name, dtype)
142
-
143
- def _get_source_foreign_keys(self) -> List[Tuple[str, str, str]]:
79
def _get_source_columns(self) -> List[SourceColumn]:
    r"""Collects a :class:`SourceColumn` for every column of the underlying
    data frame whose data type can be inferred.

    Columns for which :func:`infer_dtype` raises are reported via a warning
    and left out of the result. No column is flagged as primary key or
    unique key.

    Returns:
        The source columns, in data frame column order.
    """
    collected: List[SourceColumn] = []
    for column_name in self._data.columns:
        series = self._data[column_name]
        try:
            inferred = infer_dtype(series)
        except Exception:
            # Best-effort: an un-inferable column is skipped, not fatal.
            message = (f"Data type inference for column '{column_name}' in "
                       f"table '{self.name}' failed. Consider changing "
                       f"the data type of the column to use it within "
                       f"this table.")
            warnings.warn(message)
        else:
            collected.append(
                SourceColumn(
                    name=column_name,
                    dtype=inferred,
                    is_primary_key=False,
                    is_unique_key=False,
                ))

    return collected
101
+
102
def _get_source_foreign_keys(self) -> List[SourceForeignKey]:
    r"""Returns the foreign keys known to the source, which is always an
    empty list for this table.
    """
    no_foreign_keys: List[SourceForeignKey] = []
    return no_foreign_keys
145
104
 
146
- def _infer_primary_key(self, candidates: List[str]) -> Optional[str]:
147
- return utils.detect_primary_key(
148
- table_name=self.name,
149
- df=self._data,
150
- candidates=candidates,
151
- )
152
-
153
- def _infer_time_column(self, candidates: List[str]) -> Optional[str]:
154
- return utils.detect_time_column(df=self._data, candidates=candidates)
105
def _get_sample_df(self) -> pd.DataFrame:
    r"""Returns the data frame stored in ``self._data`` as is (the same
    object, not a copy).
    """
    sample_df = self._data
    return sample_df
155
107
 
156
- def _num_rows(self) -> Optional[int]:
108
def _get_num_rows(self) -> Optional[int]:
    r"""Returns the number of rows of the data frame in ``self._data``."""
    num_rows = len(self._data)
    return num_rows
158
-
159
-
160
- # Data Type ###################################################################
161
-
162
- PANDAS_TO_DTYPE: Dict[Any, Dtype] = {
163
- np.dtype('bool'): Dtype.bool,
164
- pd.BooleanDtype(): Dtype.bool,
165
- pa.bool_(): Dtype.bool,
166
- np.dtype('byte'): Dtype.int,
167
- pd.UInt8Dtype(): Dtype.int,
168
- np.dtype('int16'): Dtype.int,
169
- pd.Int16Dtype(): Dtype.int,
170
- np.dtype('int32'): Dtype.int,
171
- pd.Int32Dtype(): Dtype.int,
172
- np.dtype('int64'): Dtype.int,
173
- pd.Int64Dtype(): Dtype.int,
174
- np.dtype('float32'): Dtype.float,
175
- pd.Float32Dtype(): Dtype.float,
176
- np.dtype('float64'): Dtype.float,
177
- pd.Float64Dtype(): Dtype.float,
178
- np.dtype('object'): Dtype.string,
179
- pd.StringDtype(storage='python'): Dtype.string,
180
- pd.StringDtype(storage='pyarrow'): Dtype.string,
181
- pa.string(): Dtype.string,
182
- pa.binary(): Dtype.binary,
183
- np.dtype('datetime64[ns]'): Dtype.date,
184
- np.dtype('timedelta64[ns]'): Dtype.timedelta,
185
- pa.list_(pa.float32()): Dtype.floatlist,
186
- pa.list_(pa.int64()): Dtype.intlist,
187
- pa.list_(pa.string()): Dtype.stringlist,
188
- }
189
-
190
-
191
- def to_dtype(ser: pd.Series) -> Dtype:
192
- """Extracts the :class:`Dtype` from a :class:`pandas.Series`.
193
-
194
- Args:
195
- ser: A :class:`pandas.Series` to analyze.
196
-
197
- Returns:
198
- The data type.
199
- """
200
- if pd.api.types.is_datetime64_any_dtype(ser.dtype):
201
- return Dtype.date
202
-
203
- if isinstance(ser.dtype, pd.CategoricalDtype):
204
- return Dtype.string
205
-
206
- if pd.api.types.is_object_dtype(ser.dtype):
207
- index = ser.iloc[:1000].first_valid_index()
208
- if index is not None and pd.api.types.is_list_like(ser[index]):
209
- pos = ser.index.get_loc(index)
210
- assert isinstance(pos, int)
211
- ser = ser.iloc[pos:pos + 1000].dropna()
212
-
213
- if not ser.map(pd.api.types.is_list_like).all():
214
- raise ValueError("Data contains a mix of list-like and "
215
- "non-list-like values")
216
-
217
- # Remove all empty Python lists without known data type:
218
- ser = ser[ser.map(lambda x: not isinstance(x, list) or len(x) > 0)]
219
-
220
- # Infer unique data types in this series:
221
- dtypes = ser.apply(lambda x: PANDAS_TO_DTYPE.get(
222
- np.array(x).dtype, Dtype.string)).unique().tolist()
223
-
224
- invalid_dtypes = set(dtypes) - {
225
- Dtype.string,
226
- Dtype.int,
227
- Dtype.float,
228
- }
229
- if len(invalid_dtypes) > 0:
230
- raise ValueError(f"Data contains unsupported list data types: "
231
- f"{list(invalid_dtypes)}")
232
-
233
- if Dtype.string in dtypes:
234
- return Dtype.stringlist
235
-
236
- if dtypes == [Dtype.int]:
237
- return Dtype.intlist
238
-
239
- return Dtype.floatlist
240
-
241
- if ser.dtype not in PANDAS_TO_DTYPE:
242
- raise ValueError(f"Unsupported data type '{ser.dtype}'")
243
-
244
- return PANDAS_TO_DTYPE[ser.dtype]
@@ -26,7 +26,10 @@ def connect(**kwargs: Any) -> Connection:
26
26
  return snowflake.connector.connect(**kwargs)
27
27
 
28
28
 
29
+ from .table import SnowTable # noqa: E402
30
+
29
31
  __all__ = [
30
32
  'connect',
31
33
  'Connection',
34
+ 'SnowTable',
32
35
  ]
@@ -0,0 +1,115 @@
1
+ import re
2
+ from typing import List, Optional, Sequence
3
+
4
+ import pandas as pd
5
+ from kumoapi.typing import Dtype
6
+
7
+ from kumoai.experimental.rfm.backend.sqlite import Connection
8
+ from kumoai.experimental.rfm.base import SourceColumn, SourceForeignKey, Table
9
+
10
+
11
class SnowTable(Table):
    r"""A table backed by a :class:`snowflake` database.

    Args:
        connection: The connection to a :class:`snowflake` database.
        name: The name of this table.
        database: The name of the database that holds this table. If given,
            ``schema`` needs to be given as well.
        schema: The name of the schema that holds this table.
        columns: The selected columns of this table.
        primary_key: The name of the primary key of this table, if it exists.
        time_column: The name of the time column of this table, if it exists.
        end_time_column: The name of the end time column of this table, if it
            exists.

    Raises:
        ValueError: If ``database`` is given without ``schema``.
    """
    def __init__(
        self,
        connection: Connection,
        name: str,
        database: Optional[str] = None,
        schema: Optional[str] = None,
        columns: Optional[Sequence[str]] = None,
        primary_key: Optional[str] = None,
        time_column: Optional[str] = None,
        end_time_column: Optional[str] = None,
    ) -> None:

        if database is not None and schema is None:
            raise ValueError(f"Missing 'schema' for table '{name}' in "
                             f"database '{database}'")

        # NOTE(review): These are stored before `super().__init__()` runs,
        # presumably because the base class may already query the source
        # while being constructed -- confirm against `Table.__init__`.
        self._connection = connection
        self._database = database
        self._schema = schema

        super().__init__(
            name=name,
            columns=columns,
            primary_key=primary_key,
            time_column=time_column,
            end_time_column=end_time_column,
        )

    @property
    def fqn_name(self) -> str:
        r"""The name of this table, prefixed by its schema (and database) if
        given, with all parts joined by ``'.'``.
        """
        names: List[str] = []
        if self._database is not None:
            # Guaranteed by the check in `__init__`.
            assert self._schema is not None
            names.extend([self._database, self._schema])
        elif self._schema is not None:
            names.append(self._schema)
        names.append(self._name)
        return '.'.join(names)

    @staticmethod
    def _to_dtype(sql_type: str) -> Optional[Dtype]:
        r"""Maps a column type string, as reported by ``DESCRIBE TABLE``, to
        a :class:`Dtype`.

        Args:
            sql_type: The type string, *e.g.*, ``'NUMBER(38,0)'``.

        Returns:
            The data type, or :obj:`None` if the type has no mapping.
        """
        sql_type = sql_type.strip().upper()
        if sql_type.startswith('NUMBER'):
            # `NUMBER(precision,scale)` with a non-zero scale holds
            # fixed-point decimals, which do not fit an integer column:
            match = re.match(r'NUMBER\(\s*\d+\s*,\s*(\d+)\s*\)', sql_type)
            if match is not None and int(match.group(1)) > 0:
                return Dtype.float
            return Dtype.int
        if sql_type.startswith('VARCHAR'):
            return Dtype.string
        if sql_type == 'FLOAT':
            return Dtype.float
        if sql_type == 'BOOLEAN':
            return Dtype.bool
        if re.search('DATE|TIMESTAMP', sql_type):
            return Dtype.date
        return None

    def _get_source_columns(self) -> List[SourceColumn]:
        r"""Reads the columns of this table via ``DESCRIBE TABLE``.

        Columns whose type has no :class:`Dtype` mapping are left out.

        Returns:
            The source columns, including their primary key and unique key
            flags.

        Raises:
            ValueError: If the ``DESCRIBE TABLE`` statement fails.
        """
        source_columns: List[SourceColumn] = []
        with self._connection.cursor() as cursor:
            try:
                cursor.execute(f"DESCRIBE TABLE {self.fqn_name}")
            except Exception as e:
                raise ValueError(
                    f"Table '{self.fqn_name}' does not exist") from e

            for row in cursor.fetchall():
                # Only the first seven fields of each row are used:
                column, sql_type, _, _, _, is_pkey, is_unique = row[:7]

                dtype = self._to_dtype(sql_type)
                if dtype is None:
                    continue

                source_column = SourceColumn(
                    name=column,
                    dtype=dtype,
                    is_primary_key=is_pkey.strip().upper() == 'Y',
                    is_unique_key=is_unique.strip().upper() == 'Y',
                )
                source_columns.append(source_column)

        return source_columns

    def _get_source_foreign_keys(self) -> List[SourceForeignKey]:
        r"""Reads the foreign keys of this table via ``SHOW IMPORTED KEYS``.

        Returns:
            One :class:`SourceForeignKey` per returned row.
        """
        source_fkeys: List[SourceForeignKey] = []
        with self._connection.cursor() as cursor:
            cursor.execute(f"SHOW IMPORTED KEYS IN TABLE {self.fqn_name}")
            for row in cursor.fetchall():
                # Fields 3, 4 and 8 hold the referenced table, the referenced
                # column and the referencing column, respectively:
                _, _, _, dst_table, pkey, _, _, _, fkey = row[:9]
                source_fkeys.append(SourceForeignKey(fkey, dst_table, pkey))
        return source_fkeys

    def _get_sample_df(self) -> pd.DataFrame:
        r"""Fetches up to 1,000 rows of all known source columns as a
        :class:`pandas.DataFrame` with Arrow-backed data types.
        """
        with self._connection.cursor() as cursor:
            columns = ', '.join(self._source_column_dict.keys())
            cursor.execute(f"SELECT {columns} FROM {self.fqn_name} LIMIT 1000")
            # NOTE(review): `fetch_arrow_all()` may return `None` for an
            # empty result set -- confirm against the connector version.
            table = cursor.fetch_arrow_all()
            return table.to_pandas(types_mapper=pd.ArrowDtype)

    def _get_num_rows(self) -> Optional[int]:
        r"""Returns :obj:`None`; this table does not report a row count."""
        return None
@@ -1,13 +1,13 @@
1
1
  import re
2
- from typing import Dict, List, Optional, Sequence, Tuple
2
+ import warnings
3
+ from typing import List, Optional, Sequence
3
4
 
4
- import pyarrow as pa
5
- from kumoapi.typing import Dtype, Stype
6
- from typing_extensions import Self
5
+ import pandas as pd
6
+ from kumoapi.typing import Dtype
7
7
 
8
8
  from kumoai.experimental.rfm.backend.sqlite import Connection
9
- from kumoai.experimental.rfm.base import Table
10
- from kumoai.experimental.rfm.infer import infer_stype
9
+ from kumoai.experimental.rfm.base import SourceColumn, SourceForeignKey, Table
10
+ from kumoai.experimental.rfm.infer import infer_dtype
11
11
 
12
12
 
13
13
  class SQLiteTable(Table):
@@ -33,92 +33,69 @@ class SQLiteTable(Table):
33
33
  ) -> None:
34
34
 
35
35
  self._connection = connection
36
- self._dtype_dict: Dict[str, Dtype] = {}
37
-
38
- with connection.cursor() as cursor:
39
- cursor.execute(f"PRAGMA table_info({name})")
40
- for _, column, dtype, _, _, is_pkey in cursor.fetchall():
41
- if bool(is_pkey):
42
- if primary_key is not None and primary_key != column:
43
- raise ValueError(f"Found duplicate primary key "
44
- f"definition '{primary_key}' and "
45
- f"'{column}' in table '{name}'")
46
- primary_key = column
47
-
48
- # Determine colun affinity:
49
- dtype = dtype.strip().upper()
50
- if re.search('INT', dtype):
51
- self._dtype_dict[column] = Dtype.int
52
- elif re.search('TEXT|CHAR|CLOB', dtype):
53
- self._dtype_dict[column] = Dtype.string
54
- elif re.search('REAL|FLOA|DOUB', dtype):
55
- self._dtype_dict[column] = Dtype.float
56
- else: # NUMERIC affinity.
57
- self._dtype_dict[column] = Dtype.unsupported
58
-
59
- if len(self._dtype_dict) > 0:
60
- column_names = ', '.join(self._dtype_dict.keys())
61
- cursor.execute(f"SELECT {column_names} FROM {name} "
62
- f"ORDER BY rowid LIMIT 1000")
63
- self._sample = cursor.fetch_arrow_table()
64
-
65
- for column_name in list(self._dtype_dict.keys()):
66
- if self._dtype_dict[column_name] == Dtype.unsupported:
67
- dtype = self._sample[column_name].type
68
- if pa.types.is_integer(dtype):
69
- self._dtype_dict[column_name] = Dtype.int
70
- elif pa.types.is_floating(dtype):
71
- self._dtype_dict[column_name] = Dtype.float
72
- elif pa.types.is_decimal(dtype):
73
- self._dtype_dict[column_name] = Dtype.float
74
- elif pa.types.is_string(dtype):
75
- self._dtype_dict[column_name] = Dtype.string
76
- else:
77
- del self._dtype_dict[column_name]
78
-
79
- if len(self._dtype_dict) == 0:
80
- raise RuntimeError(f"Table '{name}' does not exist or does not "
81
- f"hold any column with a supported data type")
82
36
 
83
37
  super().__init__(
84
38
  name=name,
85
- columns=columns or list(self._dtype_dict.keys()),
39
+ columns=columns,
86
40
  primary_key=primary_key,
87
41
  time_column=time_column,
88
42
  end_time_column=end_time_column,
89
43
  )
90
44
 
91
- def infer_metadata(self, verbose: bool = True) -> Self:
92
- r"""Infers metadata, *i.e.*, primary keys and time columns, in the
93
- table.
94
-
95
- Args:
96
- verbose: Whether to print verbose output.
97
- """
98
- return self
99
-
100
- def _has_source_column(self, name: str) -> bool:
101
- return name in self._dtype_dict
102
-
103
- def _get_source_dtype(self, name: str) -> Dtype:
104
- return self._dtype_dict[name]
105
-
106
- def _get_source_stype(self, name: str, dtype: Dtype) -> Stype:
107
- return infer_stype(self._sample[name].to_pandas(), name, dtype)
108
-
109
- def _get_source_foreign_keys(self) -> List[Tuple[str, str, str]]:
110
- edges: List[Tuple[str, str, str]] = []
45
def _get_source_columns(self) -> List[SourceColumn]:
    r"""Reads the columns of this table via ``PRAGMA table_info``.

    The data type is derived from the declared column type. Columns whose
    declared type matches none of the integer, text or real patterns fall
    back to :func:`infer_dtype` on the sample data frame, and are skipped
    with a warning if that inference raises.

    Returns:
        The source columns, including their primary key flag. The unique
        key flag is always :obj:`False`.

    Raises:
        ValueError: If ``PRAGMA table_info`` returns no rows.
    """
    with self._connection.cursor() as cursor:
        cursor.execute(f"PRAGMA table_info({self.name})")
        rows = cursor.fetchall()

    # All rows are fetched, so the cursor is released before the sample data
    # frame is read below.
    if not rows:
        raise ValueError(f"Table '{self.name}' does not exist")

    source_columns: List[SourceColumn] = []
    for _, column, sql_type, _, _, is_pkey in rows:
        # Determine column affinity:
        sql_type = sql_type.strip().upper()
        if re.search('INT', sql_type):
            dtype = Dtype.int
        elif re.search('TEXT|CHAR|CLOB', sql_type):
            dtype = Dtype.string
        elif re.search('REAL|FLOA|DOUB', sql_type):
            dtype = Dtype.float
        else:  # NUMERIC affinity.
            ser = self._sample_df[column]
            try:
                dtype = infer_dtype(ser)
            except Exception:
                # Best-effort: an un-inferable column is skipped, not fatal.
                warnings.warn(
                    f"Data type inference for column '{column}' in "
                    f"table '{self.name}' failed. Consider changing "
                    f"the data type of the column to use it within "
                    f"this table.")
                continue

        source_column = SourceColumn(
            name=column,
            dtype=dtype,
            is_primary_key=bool(is_pkey),
            is_unique_key=False,
        )
        source_columns.append(source_column)

    return source_columns
84
+
85
+ def _get_source_foreign_keys(self) -> List[SourceForeignKey]:
86
+ source_fkeys: List[SourceForeignKey] = []
111
87
  with self._connection.cursor() as cursor:
112
88
  cursor.execute(f"PRAGMA foreign_key_list({self.name})")
113
89
  for _, _, dst_table, fkey, pkey, _, _, _ in cursor.fetchall():
114
- edges.append((fkey, dst_table, pkey))
115
- return edges
116
-
117
- def _infer_primary_key(self, candidates: List[str]) -> Optional[str]:
118
- return None # TODO
90
+ source_fkeys.append(SourceForeignKey(fkey, dst_table, pkey))
91
+ return source_fkeys
119
92
 
120
- def _infer_time_column(self, candidates: List[str]) -> Optional[str]:
121
- return None # TODO
93
def _get_sample_df(self) -> pd.DataFrame:
    r"""Fetches the first 1,000 rows of this table, ordered by ``rowid``, as
    a :class:`pandas.DataFrame` with Arrow-backed data types.
    """
    query = (f"SELECT * FROM {self.name} "
             f"ORDER BY rowid LIMIT 1000")
    with self._connection.cursor() as cursor:
        cursor.execute(query)
        arrow_table = cursor.fetch_arrow_table()
        sample_df = arrow_table.to_pandas(types_mapper=pd.ArrowDtype)
        return sample_df
122
99
 
123
- def _num_rows(self) -> Optional[int]:
100
def _get_num_rows(self) -> Optional[int]:
    r"""Returns :obj:`None`; this table does not report a row count."""
    num_rows: Optional[int] = None
    return num_rows
@@ -1,7 +1,10 @@
1
+ from .source import SourceColumn, SourceForeignKey
1
2
  from .column import Column
2
3
  from .table import Table
3
4
 
4
5
  __all__ = [
6
+ 'SourceColumn',
7
+ 'SourceForeignKey',
5
8
  'Column',
6
9
  'Table',
7
10
  ]
@@ -0,0 +1,18 @@
1
+ from dataclasses import dataclass
2
+
3
+ from kumoapi.typing import Dtype
4
+
5
+
6
@dataclass
class SourceColumn:
    r"""Describes a single column as reported by a table's data source."""
    # The name of the column:
    name: str
    # The data type of the column:
    dtype: Dtype
    # Whether the source flags the column as a primary key:
    is_primary_key: bool
    # Whether the source flags the column as a unique key:
    is_unique_key: bool
12
+
13
+
14
@dataclass
class SourceForeignKey:
    r"""Describes a single foreign key as reported by a table's data
    source.
    """
    # The name of the foreign key column in the referencing table:
    name: str
    # The name of the table the foreign key points to:
    dst_table: str
    # The name of the referenced column in `dst_table`:
    primary_key: str