kumoai 2.7.0.dev202508201830__cp312-cp312-win_amd64.whl → 2.12.0.dev202511111731__cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. kumoai/__init__.py +4 -2
  2. kumoai/_version.py +1 -1
  3. kumoai/client/client.py +10 -5
  4. kumoai/client/endpoints.py +1 -0
  5. kumoai/client/rfm.py +37 -8
  6. kumoai/connector/file_upload_connector.py +94 -85
  7. kumoai/connector/snowflake_connector.py +9 -0
  8. kumoai/connector/utils.py +1377 -209
  9. kumoai/experimental/rfm/__init__.py +5 -3
  10. kumoai/experimental/rfm/authenticate.py +8 -5
  11. kumoai/experimental/rfm/infer/timestamp.py +7 -4
  12. kumoai/experimental/rfm/local_graph.py +96 -82
  13. kumoai/experimental/rfm/local_graph_sampler.py +16 -8
  14. kumoai/experimental/rfm/local_graph_store.py +32 -10
  15. kumoai/experimental/rfm/local_pquery_driver.py +342 -46
  16. kumoai/experimental/rfm/local_table.py +142 -45
  17. kumoai/experimental/rfm/pquery/__init__.py +4 -4
  18. kumoai/experimental/rfm/pquery/{backend.py → executor.py} +28 -58
  19. kumoai/experimental/rfm/pquery/pandas_executor.py +532 -0
  20. kumoai/experimental/rfm/rfm.py +535 -125
  21. kumoai/experimental/rfm/utils.py +0 -3
  22. kumoai/jobs.py +27 -1
  23. kumoai/kumolib.cp312-win_amd64.pyd +0 -0
  24. kumoai/pquery/prediction_table.py +5 -3
  25. kumoai/pquery/training_table.py +5 -3
  26. kumoai/trainer/job.py +9 -30
  27. kumoai/trainer/trainer.py +19 -10
  28. kumoai/utils/__init__.py +2 -1
  29. kumoai/utils/progress_logger.py +96 -16
  30. {kumoai-2.7.0.dev202508201830.dist-info → kumoai-2.12.0.dev202511111731.dist-info}/METADATA +4 -5
  31. {kumoai-2.7.0.dev202508201830.dist-info → kumoai-2.12.0.dev202511111731.dist-info}/RECORD +34 -34
  32. kumoai/experimental/rfm/pquery/pandas_backend.py +0 -437
  33. {kumoai-2.7.0.dev202508201830.dist-info → kumoai-2.12.0.dev202511111731.dist-info}/WHEEL +0 -0
  34. {kumoai-2.7.0.dev202508201830.dist-info → kumoai-2.12.0.dev202511111731.dist-info}/licenses/LICENSE +0 -0
  35. {kumoai-2.7.0.dev202508201830.dist-info → kumoai-2.12.0.dev202511111731.dist-info}/top_level.txt +0 -0
@@ -12,7 +12,7 @@ CPU architecture: {platform.machine()}
12
12
  glibc version: {platform.libc_ver()[1]}
13
13
 
14
14
  ✅ Supported Environments:
15
- * Python versions: 3.9, 3.10, 3.11, 3.12, 3.13
15
+ * Python versions: 3.10, 3.11, 3.12, 3.13
16
16
  * Operating systems and CPU architectures:
17
17
  * Linux (x86_64)
18
18
  * macOS (arm64)
@@ -20,7 +20,7 @@ glibc version: {platform.libc_ver()[1]}
20
20
  * glibc versions: >=2.28
21
21
 
22
22
  ❌ Unsupported Environments:
23
- * Python versions: 3.8, 3.14
23
+ * Python versions: 3.8, 3.9, 3.14
24
24
  * Operating systems and CPU architectures:
25
25
  * Linux (arm64)
26
26
  * macOS (x86_64)
@@ -36,7 +36,7 @@ import os
36
36
  import kumoai
37
37
  from .local_table import LocalTable
38
38
  from .local_graph import LocalGraph
39
- from .rfm import KumoRFM
39
+ from .rfm import ExplainConfig, Explanation, KumoRFM
40
40
  from .authenticate import authenticate
41
41
 
42
42
 
@@ -60,6 +60,8 @@ __all__ = [
60
60
  'LocalTable',
61
61
  'LocalGraph',
62
62
  'KumoRFM',
63
+ 'ExplainConfig',
64
+ 'Explanation',
63
65
  'authenticate',
64
66
  'init',
65
67
  ]
@@ -264,14 +264,17 @@ def _authenticate_local(api_url: str, redirect_port: int = 8765) -> None:
264
264
  f"?callback_url={urllib.parse.quote(callback_url)}" +
265
265
  f"&token_name={urllib.parse.quote(token_name)}")
266
266
 
267
- print("Opening browser page to automatically generate an API key...")
267
+ print(
268
+ "Opening browser page to automatically generate an API key...\n" +
269
+ "If the page does not open, manually create a new API key at " +
270
+ f"{api_url}/api-keys and set it using os.environ[\"KUMO_API_KEY\"] " +
271
+ "= \"YOUR_API_KEY\"")
272
+
268
273
  webbrowser.open(login_url)
269
274
 
270
275
  def get_user_input() -> None:
271
- print("If the page does not open, manually create a new API key at " +
272
- f"{api_url}/api-keys and paste it below:")
273
-
274
- token_entered = getpass("API Key (type then press enter): ").strip()
276
+ token_entered = getpass(
277
+ "or paste the API key here and press enter: ").strip()
275
278
 
276
279
  while (len(token_entered) == 0):
277
280
  token_entered = getpass(
@@ -2,6 +2,7 @@ import re
2
2
  import warnings
3
3
 
4
4
  import pandas as pd
5
+ from dateutil.parser import UnknownTimezoneWarning
5
6
  from kumoapi.typing import Dtype, Stype
6
7
 
7
8
 
@@ -20,9 +21,7 @@ def contains_timestamp(ser: pd.Series, column_name: str, dtype: Dtype) -> bool:
20
21
  column_name,
21
22
  re.IGNORECASE,
22
23
  )
23
-
24
- if match is not None:
25
- return True
24
+ score = 0.3 if match is not None else 0.0
26
25
 
27
26
  ser = ser.iloc[:100]
28
27
  ser = ser.dropna()
@@ -34,5 +33,9 @@ def contains_timestamp(ser: pd.Series, column_name: str, dtype: Dtype) -> bool:
34
33
  ser = ser.astype(str) # Avoid parsing numbers as unix timestamps.
35
34
 
36
35
  with warnings.catch_warnings():
36
+ warnings.simplefilter('ignore', UnknownTimezoneWarning)
37
37
  warnings.filterwarnings('ignore', message='Could not infer format')
38
- return pd.to_datetime(ser, errors='coerce').notna().all()
38
+ mask = pd.to_datetime(ser, errors='coerce').notna()
39
+ score += int(mask.sum()) / len(mask)
40
+
41
+ return score >= 1.0
@@ -29,45 +29,46 @@ class LocalGraph:
29
29
 
30
30
  .. code-block:: python
31
31
 
32
- import pandas as pd
33
- import kumoai.experimental.rfm as rfm
34
-
35
- # Load data frames into memory:
36
- df1 = pd.DataFrame(...)
37
- df2 = pd.DataFrame(...)
38
- df3 = pd.DataFrame(...)
39
-
40
- # Define tables from data frames:
41
- table1 = rfm.LocalTable(name="table1", data=df1)
42
- table2 = rfm.LocalTable(name="table2", data=df2)
43
- table3 = rfm.LocalTable(name="table3", data=df3)
44
-
45
- # Create a graph from a dictionary of tables:
46
- graph = rfm.LocalGraph({
47
- "table1": table1,
48
- "table2": table2,
49
- "table3": table3,
50
- })
51
-
52
- # Infer table metadata:
53
- graph.infer_metadata()
54
-
55
- # Infer links/edges:
56
- graph.infer_links()
57
-
58
- # Inspect table metadata:
59
- for table in graph.tables.values():
60
- table.print_metadata()
61
-
62
- # Visualize graph (if graphviz is installed):
63
- graph.visualize()
64
-
65
- # Add/Remove edges between tables:
66
- graph.link(src_table="table1", fkey="id1", dst_table="table2")
67
- graph.unlink(src_table="table1", fkey="id1", dst_table="table2")
68
-
69
- # Validate graph:
70
- graph.validate()
32
+ >>> # doctest: +SKIP
33
+ >>> import pandas as pd
34
+ >>> import kumoai.experimental.rfm as rfm
35
+
36
+ >>> # Load data frames into memory:
37
+ >>> df1 = pd.DataFrame(...)
38
+ >>> df2 = pd.DataFrame(...)
39
+ >>> df3 = pd.DataFrame(...)
40
+
41
+ >>> # Define tables from data frames:
42
+ >>> table1 = rfm.LocalTable(name="table1", data=df1)
43
+ >>> table2 = rfm.LocalTable(name="table2", data=df2)
44
+ >>> table3 = rfm.LocalTable(name="table3", data=df3)
45
+
46
+ >>> # Create a graph from a dictionary of tables:
47
+ >>> graph = rfm.LocalGraph({
48
+ ... "table1": table1,
49
+ ... "table2": table2,
50
+ ... "table3": table3,
51
+ ... })
52
+
53
+ >>> # Infer table metadata:
54
+ >>> graph.infer_metadata()
55
+
56
+ >>> # Infer links/edges:
57
+ >>> graph.infer_links()
58
+
59
+ >>> # Inspect table metadata:
60
+ >>> for table in graph.tables.values():
61
+ ... table.print_metadata()
62
+
63
+ >>> # Visualize graph (if graphviz is installed):
64
+ >>> graph.visualize()
65
+
66
+ >>> # Add/Remove edges between tables:
67
+ >>> graph.link(src_table="table1", fkey="id1", dst_table="table2")
68
+ >>> graph.unlink(src_table="table1", fkey="id1", dst_table="table2")
69
+
70
+ >>> # Validate graph:
71
+ >>> graph.validate()
71
72
  """
72
73
 
73
74
  # Constructors ############################################################
@@ -104,27 +105,28 @@ class LocalGraph:
104
105
 
105
106
  .. code-block:: python
106
107
 
107
- import pandas as pd
108
- import kumoai.experimental.rfm as rfm
108
+ >>> # doctest: +SKIP
109
+ >>> import pandas as pd
110
+ >>> import kumoai.experimental.rfm as rfm
109
111
 
110
- # Load data frames into memory:
111
- df1 = pd.DataFrame(...)
112
- df2 = pd.DataFrame(...)
113
- df3 = pd.DataFrame(...)
112
+ >>> # Load data frames into memory:
113
+ >>> df1 = pd.DataFrame(...)
114
+ >>> df2 = pd.DataFrame(...)
115
+ >>> df3 = pd.DataFrame(...)
114
116
 
115
- # Create a graph from a dictionary of data frames:
116
- graph = rfm.LocalGraph.from_data({
117
- "table1": df1,
118
- "table2": df2,
119
- "table3": df3,
120
- })
117
+ >>> # Create a graph from a dictionary of data frames:
118
+ >>> graph = rfm.LocalGraph.from_data({
119
+ ... "table1": df1,
120
+ ... "table2": df2,
121
+ ... "table3": df3,
122
+ ... })
121
123
 
122
- # Inspect table metadata:
123
- for table in graph.tables.values():
124
- table.print_metadata()
124
+ >>> # Inspect table metadata:
125
+ >>> for table in graph.tables.values():
126
+ ... table.print_metadata()
125
127
 
126
- # Visualize graph (if graphviz is installed):
127
- graph.visualize()
128
+ >>> # Visualize graph (if graphviz is installed):
129
+ >>> graph.visualize()
128
130
 
129
131
  Args:
130
132
  df_dict: A dictionary of data frames, where the keys are the names
@@ -141,6 +143,7 @@ class LocalGraph:
141
143
  graph.
142
144
 
143
145
  Example:
146
+ >>> # doctest: +SKIP
144
147
  >>> import kumoai.experimental.rfm as rfm
145
148
  >>> df1 = pd.DataFrame(...)
146
149
  >>> df2 = pd.DataFrame(...)
@@ -150,7 +153,7 @@ class LocalGraph:
150
153
  ... "table2": df2,
151
154
  ... "table3": df3,
152
155
  ... })
153
- ... graph.validate()
156
+ >>> graph.validate()
154
157
  """
155
158
  tables = [LocalTable(df, name) for name, df in df_dict.items()]
156
159
 
@@ -197,12 +200,6 @@ class LocalGraph:
197
200
  KeyError: If a table with the same name already exists in the
198
201
  graph.
199
202
  """
200
- if len(self.tables) >= 15:
201
- raise ValueError("Cannot create a graph with more than 15 "
202
- "tables. Please create a feature request at "
203
- "'https://github.com/kumo-ai/kumo-rfm' if you "
204
- "must go beyond this for your use-case.")
205
-
206
203
  if table.name in self._tables:
207
204
  raise KeyError(f"Cannot add table with name '{table.name}' to "
208
205
  f"this graph; table names must be globally unique.")
@@ -237,16 +234,17 @@ class LocalGraph:
237
234
  r"""Returns a :class:`pandas.DataFrame` object containing metadata
238
235
  information about the tables in this graph.
239
236
 
240
- The returned dataframe has columns ``name``, ``primary_key``, and
241
- ``time_column``, which provide an aggregate view of the properties of
242
- the tables of this graph.
237
+ The returned dataframe has columns ``name``, ``primary_key``,
238
+ ``time_column``, and ``end_time_column``, which provide an aggregate
239
+ view of the properties of the tables of this graph.
243
240
 
244
241
  Example:
242
+ >>> # doctest: +SKIP
245
243
  >>> import kumoai.experimental.rfm as rfm
246
244
  >>> graph = rfm.LocalGraph(tables=...).infer_metadata()
247
- >>> graph.metadata
248
- name primary_key time_column
249
- 0 users user_id -
245
+ >>> graph.metadata # doctest: +SKIP
246
+ name primary_key time_column end_time_column
247
+ 0 users user_id - -
250
248
  """
251
249
  tables = list(self.tables.values())
252
250
 
@@ -257,6 +255,11 @@ class LocalGraph:
257
255
  pd.Series(dtype=str, data=[t._primary_key or '-' for t in tables]),
258
256
  'time_column':
259
257
  pd.Series(dtype=str, data=[t._time_column or '-' for t in tables]),
258
+ 'end_time_column':
259
+ pd.Series(
260
+ dtype=str,
261
+ data=[t._end_time_column or '-' for t in tables],
262
+ ),
260
263
  })
261
264
 
262
265
  def print_metadata(self) -> None:
@@ -580,13 +583,17 @@ class LocalGraph:
580
583
  # Check that the destination table defines a primary key:
581
584
  if dst_key is None:
582
585
  raise ValueError(f"Edge {edge} is invalid since table "
583
- f"'{dst_table}' does not have a primary key")
586
+ f"'{dst_table}' does not have a primary key. "
587
+ f"Add either a primary key or remove the "
588
+ f"link before proceeding.")
584
589
 
585
590
  # Ensure that foreign key is not a primary key:
586
591
  src_pkey = self[src_table].primary_key
587
592
  if src_pkey is not None and src_pkey.name == fkey:
588
593
  raise ValueError(f"Cannot treat the primary key of table "
589
- f"'{src_table}' as a foreign key")
594
+ f"'{src_table}' as a foreign key. Remove "
595
+ f"either the primary key or the link before "
596
+ f"before proceeding.")
590
597
 
591
598
  # Check that fkey/pkey have valid and consistent data types:
592
599
  assert src_key.dtype is not None
@@ -604,8 +611,8 @@ class LocalGraph:
604
611
  raise ValueError(f"{edge} is invalid as foreign key "
605
612
  f"'{fkey}' and primary key '{dst_key.name}' "
606
613
  f"have incompatible data types (got "
607
- f"fkey.dtype '{dst_key.dtype}' and "
608
- f"pkey.dtype '{src_key.dtype}')")
614
+ f"fkey.dtype '{src_key.dtype}' and "
615
+ f"pkey.dtype '{dst_key.dtype}')")
609
616
 
610
617
  return self
611
618
 
@@ -678,6 +685,11 @@ class LocalGraph:
678
685
  ]
679
686
  if time_column := table.time_column:
680
687
  keys += [f'{time_column.name}: Time ({time_column.dtype})']
688
+ if end_time_column := table.end_time_column:
689
+ keys += [
690
+ f'{end_time_column.name}: '
691
+ f'End Time ({end_time_column.dtype})'
692
+ ]
681
693
  key_repr = left_align(keys)
682
694
 
683
695
  columns = []
@@ -685,9 +697,9 @@ class LocalGraph:
685
697
  columns += [
686
698
  f'{column.name}: {column.stype} ({column.dtype})'
687
699
  for column in table.columns
688
- if column.name not in fkeys_dict[table_name]
689
- and column.name != table._primary_key
690
- and column.name != table._time_column
700
+ if column.name not in fkeys_dict[table_name] and
701
+ column.name != table._primary_key and column.name != table.
702
+ _time_column and column.name != table._end_time_column
691
703
  ]
692
704
  column_repr = left_align(columns)
693
705
 
@@ -754,16 +766,18 @@ class LocalGraph:
754
766
  def _to_api_graph_definition(self) -> GraphDefinition:
755
767
  tables: Dict[str, TableDefinition] = {}
756
768
  col_groups: List[ColumnKeyGroup] = []
757
- for t_name, table in self.tables.items():
758
- tables[t_name] = table._to_api_table_definition()
769
+ for table_name, table in self.tables.items():
770
+ tables[table_name] = table._to_api_table_definition()
759
771
  if table.primary_key is None:
760
772
  continue
761
- keys = [ColumnKey(t_name, table.primary_key.name)]
773
+ keys = [ColumnKey(table_name, table.primary_key.name)]
762
774
  for edge in self.edges:
763
- if edge.dst_table == t_name:
775
+ if edge.dst_table == table_name:
764
776
  keys.append(ColumnKey(edge.src_table, edge.fkey))
765
- keys = sorted(list(set(keys)),
766
- key=lambda x: f'{x.table_name}.{x.col_name}')
777
+ keys = sorted(
778
+ list(set(keys)),
779
+ key=lambda x: f'{x.table_name}.{x.col_name}',
780
+ )
767
781
  if len(keys) > 1:
768
782
  col_groups.append(ColumnKeyGroup(keys))
769
783
  return GraphDefinition(tables, col_groups)
@@ -92,15 +92,23 @@ class LocalGraphSampler:
92
92
  )
93
93
  continue
94
94
 
95
- # Only store unique rows in `df` above a certain threshold:
96
- unique_node, inverse_node = np.unique(node, return_inverse=True)
97
- if len(node) > 1.05 * len(unique_node):
98
- df = df.iloc[unique_node]
99
- row = inverse_node
95
+ row: Optional[np.ndarray] = None
96
+ if table_name in self._graph_store.end_time_column_dict:
97
+ # Set end time to NaT for all values greater than anchor time:
98
+ df = df.iloc[node].reset_index(drop=True)
99
+ col_name = self._graph_store.end_time_column_dict[table_name]
100
+ ser = df[col_name]
101
+ value = ser.astype('datetime64[ns]').astype(int).to_numpy()
102
+ mask = value > time[batch]
103
+ df.loc[mask, col_name] = pd.NaT
100
104
  else:
101
- df = df.iloc[node]
102
- row = None
103
- df = df.reset_index(drop=True)
105
+ # Only store unique rows in `df` above a certain threshold:
106
+ unique_node, inverse = np.unique(node, return_inverse=True)
107
+ if len(node) > 1.05 * len(unique_node):
108
+ df = df.iloc[unique_node].reset_index(drop=True)
109
+ row = inverse
110
+ else:
111
+ df = df.iloc[node].reset_index(drop=True)
104
112
 
105
113
  # Filter data frame to minimal set of columns:
106
114
  df = df[columns]
@@ -1,5 +1,5 @@
1
1
  import warnings
2
- from typing import Dict, List, Optional, Tuple
2
+ from typing import Dict, List, Optional, Tuple, Union
3
3
 
4
4
  import numpy as np
5
5
  import pandas as pd
@@ -8,7 +8,7 @@ from kumoapi.typing import Stype
8
8
 
9
9
  from kumoai.experimental.rfm import LocalGraph
10
10
  from kumoai.experimental.rfm.utils import normalize_text
11
- from kumoai.utils import ProgressLogger
11
+ from kumoai.utils import InteractiveProgressLogger, ProgressLogger
12
12
 
13
13
  try:
14
14
  import torch
@@ -22,10 +22,16 @@ class LocalGraphStore:
22
22
  self,
23
23
  graph: LocalGraph,
24
24
  preprocess: bool = False,
25
- verbose: bool = True,
25
+ verbose: Union[bool, ProgressLogger] = True,
26
26
  ) -> None:
27
27
 
28
- with ProgressLogger("Materializing graph", verbose=verbose) as logger:
28
+ if not isinstance(verbose, ProgressLogger):
29
+ verbose = InteractiveProgressLogger(
30
+ "Materializing graph",
31
+ verbose=verbose,
32
+ )
33
+
34
+ with verbose as logger:
29
35
  self.df_dict, self.mask_dict = self.sanitize(graph, preprocess)
30
36
  self.stype_dict = self.get_stype_dict(graph)
31
37
  logger.log("Sanitized input data")
@@ -39,6 +45,7 @@ class LocalGraphStore:
39
45
 
40
46
  (
41
47
  self.time_column_dict,
48
+ self.end_time_column_dict,
42
49
  self.time_dict,
43
50
  self.min_time,
44
51
  self.max_time,
@@ -195,11 +202,15 @@ class LocalGraphStore:
195
202
  pkey_map = pkey_map[self.mask_dict[table.name]]
196
203
 
197
204
  if len(pkey_map) == 0:
198
- raise ValueError(
199
- f"Found no valid rows in table '{table.name}' since there "
200
- f"exists not a single row with a non-N/A primary key."
201
- f"Consider fixing your underlying data or removing this "
202
- f"table from the graph.")
205
+ error_msg = f"Found no valid rows in table '{table.name}'. "
206
+ if table.has_time_column():
207
+ error_msg += ("Please make sure that there exists valid "
208
+ "non-N/A primary key and time column pairs "
209
+ "in this table.")
210
+ else:
211
+ error_msg += ("Please make sure that there exists valid "
212
+ "non-N/A primary keys in this table.")
213
+ raise ValueError(error_msg)
203
214
 
204
215
  pkey_map_dict[table.name] = pkey_map
205
216
 
@@ -209,16 +220,21 @@ class LocalGraphStore:
209
220
  self,
210
221
  graph: LocalGraph,
211
222
  ) -> Tuple[
223
+ Dict[str, str],
212
224
  Dict[str, str],
213
225
  Dict[str, np.ndarray],
214
226
  pd.Timestamp,
215
227
  pd.Timestamp,
216
228
  ]:
217
229
  time_column_dict: Dict[str, str] = {}
230
+ end_time_column_dict: Dict[str, str] = {}
218
231
  time_dict: Dict[str, np.ndarray] = {}
219
232
  min_time = pd.Timestamp.max
220
233
  max_time = pd.Timestamp.min
221
234
  for table in graph.tables.values():
235
+ if table._end_time_column is not None:
236
+ end_time_column_dict[table.name] = table._end_time_column
237
+
222
238
  if table._time_column is None:
223
239
  continue
224
240
 
@@ -233,7 +249,13 @@ class LocalGraphStore:
233
249
  min_time = min(min_time, time.min())
234
250
  max_time = max(max_time, time.max())
235
251
 
236
- return time_column_dict, time_dict, min_time, max_time
252
+ return (
253
+ time_column_dict,
254
+ end_time_column_dict,
255
+ time_dict,
256
+ min_time,
257
+ max_time,
258
+ )
237
259
 
238
260
  def get_csc(
239
261
  self,