kumoai 2.13.0.dev202511131731__cp310-cp310-macosx_11_0_arm64.whl → 2.14.0.dev202512271732__cp310-cp310-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. kumoai/__init__.py +18 -9
  2. kumoai/_version.py +1 -1
  3. kumoai/client/client.py +15 -13
  4. kumoai/client/jobs.py +24 -0
  5. kumoai/client/pquery.py +6 -2
  6. kumoai/connector/utils.py +23 -2
  7. kumoai/experimental/rfm/__init__.py +191 -50
  8. kumoai/experimental/rfm/authenticate.py +3 -4
  9. kumoai/experimental/rfm/backend/__init__.py +0 -0
  10. kumoai/experimental/rfm/backend/local/__init__.py +42 -0
  11. kumoai/experimental/rfm/{local_graph_store.py → backend/local/graph_store.py} +65 -127
  12. kumoai/experimental/rfm/backend/local/sampler.py +312 -0
  13. kumoai/experimental/rfm/backend/local/table.py +113 -0
  14. kumoai/experimental/rfm/backend/snow/__init__.py +37 -0
  15. kumoai/experimental/rfm/backend/snow/sampler.py +297 -0
  16. kumoai/experimental/rfm/backend/snow/table.py +242 -0
  17. kumoai/experimental/rfm/backend/sqlite/__init__.py +32 -0
  18. kumoai/experimental/rfm/backend/sqlite/sampler.py +398 -0
  19. kumoai/experimental/rfm/backend/sqlite/table.py +184 -0
  20. kumoai/experimental/rfm/base/__init__.py +30 -0
  21. kumoai/experimental/rfm/base/column.py +152 -0
  22. kumoai/experimental/rfm/base/expression.py +44 -0
  23. kumoai/experimental/rfm/base/sampler.py +761 -0
  24. kumoai/experimental/rfm/base/source.py +19 -0
  25. kumoai/experimental/rfm/base/sql_sampler.py +143 -0
  26. kumoai/experimental/rfm/base/table.py +753 -0
  27. kumoai/experimental/rfm/{local_graph.py → graph.py} +546 -116
  28. kumoai/experimental/rfm/infer/__init__.py +8 -0
  29. kumoai/experimental/rfm/infer/dtype.py +81 -0
  30. kumoai/experimental/rfm/infer/multicategorical.py +1 -1
  31. kumoai/experimental/rfm/infer/pkey.py +128 -0
  32. kumoai/experimental/rfm/infer/stype.py +35 -0
  33. kumoai/experimental/rfm/infer/time_col.py +61 -0
  34. kumoai/experimental/rfm/pquery/executor.py +27 -27
  35. kumoai/experimental/rfm/pquery/pandas_executor.py +30 -32
  36. kumoai/experimental/rfm/rfm.py +322 -252
  37. kumoai/experimental/rfm/sagemaker.py +138 -0
  38. kumoai/pquery/predictive_query.py +10 -6
  39. kumoai/spcs.py +1 -3
  40. kumoai/testing/decorators.py +1 -1
  41. kumoai/testing/snow.py +50 -0
  42. kumoai/trainer/distilled_trainer.py +175 -0
  43. kumoai/utils/__init__.py +3 -2
  44. kumoai/utils/progress_logger.py +178 -12
  45. kumoai/utils/sql.py +3 -0
  46. {kumoai-2.13.0.dev202511131731.dist-info → kumoai-2.14.0.dev202512271732.dist-info}/METADATA +13 -2
  47. {kumoai-2.13.0.dev202511131731.dist-info → kumoai-2.14.0.dev202512271732.dist-info}/RECORD +50 -29
  48. kumoai/experimental/rfm/local_graph_sampler.py +0 -184
  49. kumoai/experimental/rfm/local_pquery_driver.py +0 -689
  50. kumoai/experimental/rfm/local_table.py +0 -545
  51. kumoai/experimental/rfm/utils.py +0 -344
  52. {kumoai-2.13.0.dev202511131731.dist-info → kumoai-2.14.0.dev202512271732.dist-info}/WHEEL +0 -0
  53. {kumoai-2.13.0.dev202511131731.dist-info → kumoai-2.14.0.dev202512271732.dist-info}/licenses/LICENSE +0 -0
  54. {kumoai-2.13.0.dev202511131731.dist-info → kumoai-2.14.0.dev202512271732.dist-info}/top_level.txt +0 -0
@@ -1,545 +0,0 @@
1
- from dataclasses import dataclass
2
- from typing import Any, Dict, List, Optional
3
-
4
- import pandas as pd
5
- from kumoapi.source_table import UnavailableSourceTable
6
- from kumoapi.table import Column as ColumnDefinition
7
- from kumoapi.table import TableDefinition
8
- from kumoapi.typing import Dtype, Stype
9
- from typing_extensions import Self
10
-
11
- from kumoai import in_notebook
12
- from kumoai.experimental.rfm import utils
13
-
14
-
15
@dataclass(init=False, repr=False, eq=False)
class Column:
    r"""A single column of a table, described by its name, data type and
    semantic type.

    ``name`` and ``dtype`` are exposed as read-only properties, while
    ``stype`` may be re-assigned after construction. Every assignment to
    ``stype`` is validated in :meth:`__setattr__`.

    Args:
        name: The name of the column.
        dtype: The data type of the column.
        stype: The semantic type of the column.
        is_primary_key: Whether the column acts as a primary key.
        is_time_column: Whether the column acts as a time column.
        is_end_time_column: Whether the column acts as an end time column.

    Raises:
        TypeError: If ``stype`` is neither a string nor a :class:`Stype`.
        ValueError: If ``stype`` is not supported by ``dtype``, or if it
            conflicts with the role of the column (primary keys require
            ``ID``, (end) time columns require ``timestamp``).
    """
    stype: Stype

    def __init__(
        self,
        name: str,
        dtype: Dtype,
        stype: Stype,
        is_primary_key: bool = False,
        is_time_column: bool = False,
        is_end_time_column: bool = False,
    ) -> None:
        self._name = name
        self._dtype = Dtype(dtype)
        self._is_primary_key = is_primary_key
        self._is_time_column = is_time_column
        self._is_end_time_column = is_end_time_column
        # Needs to be assigned last: the validation in `__setattr__` reads
        # the name, the data type and the role flags set above.
        self.stype = Stype(stype)

    @property
    def name(self) -> str:
        r"""The name of the column."""
        return self._name

    @property
    def dtype(self) -> Dtype:
        r"""The data type of the column."""
        return self._dtype

    def __setattr__(self, key: str, val: Any) -> None:
        r"""Sets an attribute and validates assignments to ``stype``.

        Raises:
            TypeError: If ``stype`` is neither a string nor a
                :class:`Stype`.
            ValueError: If the semantic type is not supported by the data
                type of the column, or if it conflicts with the role of
                the column.
        """
        if key == 'stype':
            if isinstance(val, str):
                val = Stype(val)
            # Explicit check instead of `assert`, which is stripped when
            # running with `python -O`:
            if not isinstance(val, Stype):
                raise TypeError(f"Column '{self.name}' expects a semantic "
                                f"type of type 'str' or 'Stype' (got "
                                f"'{type(val).__name__}')")
            if not val.supports_dtype(self.dtype):
                raise ValueError(f"Column '{self.name}' received an "
                                 f"incompatible semantic type (got "
                                 f"dtype='{self.dtype}' and stype='{val}')")
            if self._is_primary_key and val != Stype.ID:
                raise ValueError(f"Primary key '{self.name}' must have 'ID' "
                                 f"semantic type (got '{val}')")
            if self._is_time_column and val != Stype.timestamp:
                raise ValueError(f"Time column '{self.name}' must have "
                                 f"'timestamp' semantic type (got '{val}')")
            if self._is_end_time_column and val != Stype.timestamp:
                raise ValueError(f"End time column '{self.name}' must have "
                                 f"'timestamp' semantic type (got '{val}')")

        super().__setattr__(key, val)

    def __hash__(self) -> int:
        # NOTE: `stype` is mutable, so the hash of a column changes whenever
        # its semantic type is re-assigned.
        return hash((self.name, self.stype, self.dtype))

    def __eq__(self, other: Any) -> bool:
        if not isinstance(other, Column):
            return False
        # Compare the identifying fields directly. Comparing hash values
        # would report distinct columns as equal on a hash collision.
        return ((self.name, self.stype, self.dtype)
                == (other.name, other.stype, other.dtype))

    def __repr__(self) -> str:
        return (f'{self.__class__.__name__}(name={self.name}, '
                f'stype={self.stype}, dtype={self.dtype})')
75
-
76
-
77
class LocalTable:
    r"""A table backed by a :class:`pandas.DataFrame`.

    A :class:`LocalTable` fully specifies the relevant metadata, *i.e.*
    selected columns, column semantic types, primary keys and time columns.
    :class:`LocalTable` is used to create a :class:`LocalGraph`.

    .. code-block:: python

        import pandas as pd
        import kumoai.experimental.rfm as rfm

        # Load data from a CSV file:
        df = pd.read_csv("data.csv")

        # Create a table from a `pandas.DataFrame` and infer its metadata ...
        table = rfm.LocalTable(df, name="my_table").infer_metadata()

        # ... or create a table explicitly:
        table = rfm.LocalTable(
            df=df,
            name="my_table",
            primary_key="id",
            time_column="time",
            end_time_column=None,
        )

        # Verify metadata:
        table.print_metadata()

        # Change the semantic type of a column:
        table[column].stype = "text"

    Args:
        df: The data frame to create the table from.
        name: The name of the table.
        primary_key: The name of the primary key of this table, if it exists.
        time_column: The name of the time column of this table, if it exists.
        end_time_column: The name of the end time column of this table, if it
            exists.

    Raises:
        ValueError: If ``df`` is empty, has multi-index columns, or has
            duplicated or empty column names.
    """
    def __init__(
        self,
        df: pd.DataFrame,
        name: str,
        primary_key: Optional[str] = None,
        time_column: Optional[str] = None,
        end_time_column: Optional[str] = None,
    ) -> None:

        if df.empty:
            raise ValueError("Data frame must have at least one row")
        if isinstance(df.columns, pd.MultiIndex):
            raise ValueError("Data frame must not have a multi-index")
        if not df.columns.is_unique:
            raise ValueError("Data frame must have unique column names")
        if any(col == '' for col in df.columns):
            raise ValueError("Data frame must have non-empty column names")

        # Hold a shallow copy rather than the caller's data frame object:
        df = df.copy(deep=False)

        self._data = df
        self._name = name
        self._primary_key: Optional[str] = None
        self._time_column: Optional[str] = None
        self._end_time_column: Optional[str] = None

        # Register every column of the data frame (infers dtype and stype):
        self._columns: Dict[str, Column] = {}
        for column_name in df.columns:
            self.add_column(column_name)

        # Route special columns through the property setters so that they
        # are validated:
        if primary_key is not None:
            self.primary_key = primary_key

        if time_column is not None:
            self.time_column = time_column

        if end_time_column is not None:
            self.end_time_column = end_time_column

    @property
    def name(self) -> str:
        r"""The name of the table."""
        return self._name

    # Data column #############################################################

    def has_column(self, name: str) -> bool:
        r"""Returns ``True`` if this table holds a column with name ``name``;
        ``False`` otherwise.
        """
        return name in self._columns

    def column(self, name: str) -> Column:
        r"""Returns the data column named with name ``name`` in this table.

        Args:
            name: The name of the column.

        Raises:
            KeyError: If ``name`` is not present in this table.
        """
        if not self.has_column(name):
            raise KeyError(f"Column '{name}' not found in table '{self.name}'")
        return self._columns[name]

    @property
    def columns(self) -> List[Column]:
        r"""Returns a list of :class:`Column` objects that represent the
        columns in this table.
        """
        return list(self._columns.values())

    def add_column(self, name: str) -> Column:
        r"""Adds a column to this table.

        The data type and the semantic type of the column are inferred from
        the underlying data frame.

        Args:
            name: The name of the column.

        Raises:
            KeyError: If ``name`` is already present in this table, or if
                ``name`` does not exist in the underlying data frame.
            RuntimeError: If data type or semantic type inference fails.
        """
        if name in self:
            raise KeyError(f"Column '{name}' already exists in table "
                           f"'{self.name}'")

        if name not in self._data.columns:
            raise KeyError(f"Column '{name}' does not exist in the underlying "
                           f"data frame")

        try:
            dtype = utils.to_dtype(self._data[name])
        except Exception as e:
            raise RuntimeError(f"Data type inference for column '{name}' in "
                               f"table '{self.name}' failed. Consider "
                               f"changing the data type of the column or "
                               f"removing it from the table.") from e
        try:
            stype = utils.infer_stype(self._data[name], name, dtype)
        except Exception as e:
            raise RuntimeError(f"Semantic type inference for column '{name}' "
                               f"in table '{self.name}' failed. Consider "
                               f"changing the data type of the column or "
                               f"removing it from the table.") from e

        self._columns[name] = Column(
            name=name,
            dtype=dtype,
            stype=stype,
        )

        return self._columns[name]

    def remove_column(self, name: str) -> Self:
        r"""Removes a column from this table.

        If the column acts as primary key, time column or end time column,
        the corresponding role is unset as well.

        Args:
            name: The name of the column.

        Raises:
            KeyError: If ``name`` is not present in this table.
        """
        if name not in self:
            raise KeyError(f"Column '{name}' not found in table '{self.name}'")

        if self._primary_key == name:
            self.primary_key = None
        if self._time_column == name:
            self.time_column = None
        if self._end_time_column == name:
            self.end_time_column = None
        del self._columns[name]

        return self

    # Primary key #############################################################

    def has_primary_key(self) -> bool:
        r"""Returns ``True`` if this table has a primary key; ``False``
        otherwise.
        """
        return self._primary_key is not None

    @property
    def primary_key(self) -> Optional[Column]:
        r"""The primary key column of this table.

        The getter returns the primary key column of this table, or ``None`` if
        no such primary key is present.

        The setter sets a column as a primary key on this table. It raises a
        :class:`KeyError` if the column name does not match a column in this
        table, and a :class:`ValueError` if the column cannot take the ID
        semantic type or is already used as (end) time column. Passing
        ``None`` unsets the primary key.
        """
        if self._primary_key is None:
            return None
        return self[self._primary_key]

    @primary_key.setter
    def primary_key(self, name: Optional[str]) -> None:
        if name is not None and name == self._time_column:
            raise ValueError(f"Cannot specify column '{name}' as a primary "
                             f"key since it is already defined to be a time "
                             f"column")
        if name is not None and name == self._end_time_column:
            raise ValueError(f"Cannot specify column '{name}' as a primary "
                             f"key since it is already defined to be an end "
                             f"time column")

        previous = self.primary_key

        if name is None:
            if previous is not None:
                previous._is_primary_key = False
            self._primary_key = None
            return

        # Look up and validate the new column *before* touching the current
        # primary key, so that a failure leaves the table unchanged:
        column = self[name]  # Raises `KeyError` for unknown columns.
        column.stype = Stype.ID  # Raises `ValueError` if incompatible.

        if previous is not None:
            previous._is_primary_key = False
        column._is_primary_key = True
        self._primary_key = name

    # Time column #############################################################

    def has_time_column(self) -> bool:
        r"""Returns ``True`` if this table has a time column; ``False``
        otherwise.
        """
        return self._time_column is not None

    @property
    def time_column(self) -> Optional[Column]:
        r"""The time column of this table.

        The getter returns the time column of this table, or ``None`` if no
        such time column is present.

        The setter sets a column as a time column on this table. It raises a
        :class:`KeyError` if the column name does not match a column in this
        table, and a :class:`ValueError` if the column cannot take the
        timestamp semantic type or is already used as primary key or end time
        column. Passing ``None`` unsets the time column.
        """
        if self._time_column is None:
            return None
        return self[self._time_column]

    @time_column.setter
    def time_column(self, name: Optional[str]) -> None:
        if name is not None and name == self._primary_key:
            raise ValueError(f"Cannot specify column '{name}' as a time "
                             f"column since it is already defined to be a "
                             f"primary key")
        if name is not None and name == self._end_time_column:
            raise ValueError(f"Cannot specify column '{name}' as a time "
                             f"column since it is already defined to be an "
                             f"end time column")

        previous = self.time_column

        if name is None:
            if previous is not None:
                previous._is_time_column = False
            self._time_column = None
            return

        # Look up and validate the new column *before* touching the current
        # time column, so that a failure leaves the table unchanged:
        column = self[name]  # Raises `KeyError` for unknown columns.
        column.stype = Stype.timestamp  # Raises `ValueError` if incompatible.

        if previous is not None:
            previous._is_time_column = False
        column._is_time_column = True
        self._time_column = name

    # End Time column #########################################################

    def has_end_time_column(self) -> bool:
        r"""Returns ``True`` if this table has an end time column; ``False``
        otherwise.
        """
        return self._end_time_column is not None

    @property
    def end_time_column(self) -> Optional[Column]:
        r"""The end time column of this table.

        The getter returns the end time column of this table, or ``None`` if no
        such end time column is present.

        The setter sets a column as an end time column on this table. It
        raises a :class:`KeyError` if the column name does not match a column
        in this table, and a :class:`ValueError` if the column cannot take the
        timestamp semantic type or is already used as primary key or time
        column. Passing ``None`` unsets the end time column.
        """
        if self._end_time_column is None:
            return None
        return self[self._end_time_column]

    @end_time_column.setter
    def end_time_column(self, name: Optional[str]) -> None:
        if name is not None and name == self._primary_key:
            raise ValueError(f"Cannot specify column '{name}' as an end time "
                             f"column since it is already defined to be a "
                             f"primary key")
        if name is not None and name == self._time_column:
            raise ValueError(f"Cannot specify column '{name}' as an end time "
                             f"column since it is already defined to be a "
                             f"time column")

        previous = self.end_time_column

        if name is None:
            if previous is not None:
                previous._is_end_time_column = False
            self._end_time_column = None
            return

        # Look up and validate the new column *before* touching the current
        # end time column, so that a failure leaves the table unchanged:
        column = self[name]  # Raises `KeyError` for unknown columns.
        column.stype = Stype.timestamp  # Raises `ValueError` if incompatible.

        if previous is not None:
            previous._is_end_time_column = False
        column._is_end_time_column = True
        self._end_time_column = name

    # Metadata ################################################################

    @property
    def metadata(self) -> pd.DataFrame:
        r"""Returns a :class:`pandas.DataFrame` object containing metadata
        information about the columns in this table.

        The returned dataframe has columns ``name``, ``dtype``, ``stype``,
        ``is_primary_key``, ``is_time_column`` and ``is_end_time_column``,
        which provide an aggregate view of the properties of the columns of
        this table.

        Example:
            >>> # doctest: +SKIP
            >>> import kumoai.experimental.rfm as rfm
            >>> table = rfm.LocalTable(df=..., name=...).infer_metadata()
            >>> table.metadata
                     name    dtype stype  is_primary_key  is_time_column  is_end_time_column
            0  CustomerID  float64    ID            True           False               False
        """  # noqa: E501
        cols = self.columns

        return pd.DataFrame({
            'name':
            pd.Series(dtype=str, data=[c.name for c in cols]),
            'dtype':
            pd.Series(dtype=str, data=[c.dtype for c in cols]),
            'stype':
            pd.Series(dtype=str, data=[c.stype for c in cols]),
            'is_primary_key':
            pd.Series(
                dtype=bool,
                data=[self._primary_key == c.name for c in cols],
            ),
            'is_time_column':
            pd.Series(
                dtype=bool,
                data=[self._time_column == c.name for c in cols],
            ),
            'is_end_time_column':
            pd.Series(
                dtype=bool,
                data=[self._end_time_column == c.name for c in cols],
            ),
        })

    def print_metadata(self) -> None:
        r"""Prints the :meth:`~LocalTable.metadata` of the table."""
        if in_notebook():
            from IPython.display import Markdown, display
            display(
                Markdown(f"### 🏷️ Metadata of Table `{self.name}` "
                         f"({len(self._data):,} rows)"))
            df = self.metadata
            try:
                if hasattr(df.style, 'hide'):
                    display(df.style.hide(axis='index'))  # pandas=2
                else:
                    display(df.style.hide_index())  # pandas<1.3
            except ImportError:
                print(df.to_string(index=False))  # missing jinja2
        else:
            print(f"🏷️ Metadata of Table '{self.name}' "
                  f"({len(self._data):,} rows):")
            print(self.metadata.to_string(index=False))

    def infer_metadata(self, verbose: bool = True) -> Self:
        r"""Infers metadata, *i.e.*, primary keys and time columns, in the
        table.

        Roles that are already set are left untouched.

        Args:
            verbose: Whether to print verbose output.
        """
        logs = []

        # Try to detect primary key if not set:
        if not self.has_primary_key():
            # Computed once instead of once per column:
            has_id_column = any(column.stype == Stype.ID
                                for column in self.columns)

            def is_candidate(column: Column) -> bool:
                if column.stype == Stype.ID:
                    return True
                if has_id_column:
                    return False
                # Without any ID column, fall back to columns named like
                # the table itself (or its singular form):
                if self.name == column.name:
                    return True
                return (self.name.endswith('s')
                        and self.name[:-1] == column.name)

            candidates = [
                column.name for column in self.columns if is_candidate(column)
            ]

            if primary_key := utils.detect_primary_key(
                    table_name=self.name,
                    df=self._data,
                    candidates=candidates,
            ):
                self.primary_key = primary_key
                logs.append(f"primary key '{primary_key}'")

        # Try to detect time column if not set:
        if not self.has_time_column():
            candidates = [
                column.name for column in self.columns
                if column.stype == Stype.timestamp
                and column.name != self._end_time_column
            ]
            if time_column := utils.detect_time_column(self._data, candidates):
                self.time_column = time_column
                logs.append(f"time column '{time_column}'")

        if verbose and len(logs) > 0:
            print(f"Detected {' and '.join(logs)} in table '{self.name}'")

        return self

    # Helpers #################################################################

    def _to_api_table_definition(self) -> TableDefinition:
        r"""Converts this table into a :class:`TableDefinition` object."""
        return TableDefinition(
            cols=[
                ColumnDefinition(col.name, col.stype, col.dtype)
                for col in self.columns
            ],
            source_table=UnavailableSourceTable(table=self.name),
            pkey=self._primary_key,
            time_col=self._time_column,
            end_time_col=self._end_time_column,
        )

    # Python builtins #########################################################

    def __hash__(self) -> int:
        # The hash covers all columns plus the special columns (which may be
        # `None` if unset):
        special_columns = [
            self.primary_key,
            self.time_column,
            self.end_time_column,
        ]
        return hash(tuple(self.columns + special_columns))

    def __contains__(self, name: str) -> bool:
        return self.has_column(name)

    def __getitem__(self, name: str) -> Column:
        return self.column(name)

    def __delitem__(self, name: str) -> None:
        self.remove_column(name)

    def __repr__(self) -> str:
        return (f'{self.__class__.__name__}(\n'
                f'  name={self.name},\n'
                f'  num_columns={len(self.columns)},\n'
                f'  primary_key={self._primary_key},\n'
                f'  time_column={self._time_column},\n'
                f'  end_time_column={self._end_time_column},\n'
                f')')