kumoai 2.10.0.dev202509281831__cp313-cp313-win_amd64.whl → 2.13.0.dev202511211730__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -29,45 +29,46 @@ class LocalGraph:
29
29
 
30
30
  .. code-block:: python
31
31
 
32
- import pandas as pd
33
- import kumoai.experimental.rfm as rfm
34
-
35
- # Load data frames into memory:
36
- df1 = pd.DataFrame(...)
37
- df2 = pd.DataFrame(...)
38
- df3 = pd.DataFrame(...)
39
-
40
- # Define tables from data frames:
41
- table1 = rfm.LocalTable(name="table1", data=df1)
42
- table2 = rfm.LocalTable(name="table2", data=df2)
43
- table3 = rfm.LocalTable(name="table3", data=df3)
44
-
45
- # Create a graph from a dictionary of tables:
46
- graph = rfm.LocalGraph({
47
- "table1": table1,
48
- "table2": table2,
49
- "table3": table3,
50
- })
51
-
52
- # Infer table metadata:
53
- graph.infer_metadata()
54
-
55
- # Infer links/edges:
56
- graph.infer_links()
57
-
58
- # Inspect table metadata:
59
- for table in graph.tables.values():
60
- table.print_metadata()
61
-
62
- # Visualize graph (if graphviz is installed):
63
- graph.visualize()
64
-
65
- # Add/Remove edges between tables:
66
- graph.link(src_table="table1", fkey="id1", dst_table="table2")
67
- graph.unlink(src_table="table1", fkey="id1", dst_table="table2")
68
-
69
- # Validate graph:
70
- graph.validate()
32
+ >>> # doctest: +SKIP
33
+ >>> import pandas as pd
34
+ >>> import kumoai.experimental.rfm as rfm
35
+
36
+ >>> # Load data frames into memory:
37
+ >>> df1 = pd.DataFrame(...)
38
+ >>> df2 = pd.DataFrame(...)
39
+ >>> df3 = pd.DataFrame(...)
40
+
41
+ >>> # Define tables from data frames:
42
+ >>> table1 = rfm.LocalTable(name="table1", data=df1)
43
+ >>> table2 = rfm.LocalTable(name="table2", data=df2)
44
+ >>> table3 = rfm.LocalTable(name="table3", data=df3)
45
+
46
+ >>> # Create a graph from a dictionary of tables:
47
+ >>> graph = rfm.LocalGraph({
48
+ ... "table1": table1,
49
+ ... "table2": table2,
50
+ ... "table3": table3,
51
+ ... })
52
+
53
+ >>> # Infer table metadata:
54
+ >>> graph.infer_metadata()
55
+
56
+ >>> # Infer links/edges:
57
+ >>> graph.infer_links()
58
+
59
+ >>> # Inspect table metadata:
60
+ >>> for table in graph.tables.values():
61
+ ... table.print_metadata()
62
+
63
+ >>> # Visualize graph (if graphviz is installed):
64
+ >>> graph.visualize()
65
+
66
+ >>> # Add/Remove edges between tables:
67
+ >>> graph.link(src_table="table1", fkey="id1", dst_table="table2")
68
+ >>> graph.unlink(src_table="table1", fkey="id1", dst_table="table2")
69
+
70
+ >>> # Validate graph:
71
+ >>> graph.validate()
71
72
  """
72
73
 
73
74
  # Constructors ############################################################
@@ -104,27 +105,28 @@ class LocalGraph:
104
105
 
105
106
  .. code-block:: python
106
107
 
107
- import pandas as pd
108
- import kumoai.experimental.rfm as rfm
108
+ >>> # doctest: +SKIP
109
+ >>> import pandas as pd
110
+ >>> import kumoai.experimental.rfm as rfm
109
111
 
110
- # Load data frames into memory:
111
- df1 = pd.DataFrame(...)
112
- df2 = pd.DataFrame(...)
113
- df3 = pd.DataFrame(...)
112
+ >>> # Load data frames into memory:
113
+ >>> df1 = pd.DataFrame(...)
114
+ >>> df2 = pd.DataFrame(...)
115
+ >>> df3 = pd.DataFrame(...)
114
116
 
115
- # Create a graph from a dictionary of data frames:
116
- graph = rfm.LocalGraph.from_data({
117
- "table1": df1,
118
- "table2": df2,
119
- "table3": df3,
120
- })
117
+ >>> # Create a graph from a dictionary of data frames:
118
+ >>> graph = rfm.LocalGraph.from_data({
119
+ ... "table1": df1,
120
+ ... "table2": df2,
121
+ ... "table3": df3,
122
+ ... })
121
123
 
122
- # Inspect table metadata:
123
- for table in graph.tables.values():
124
- table.print_metadata()
124
+ >>> # Inspect table metadata:
125
+ >>> for table in graph.tables.values():
126
+ ... table.print_metadata()
125
127
 
126
- # Visualize graph (if graphviz is installed):
127
- graph.visualize()
128
+ >>> # Visualize graph (if graphviz is installed):
129
+ >>> graph.visualize()
128
130
 
129
131
  Args:
130
132
  df_dict: A dictionary of data frames, where the keys are the names
@@ -141,6 +143,7 @@ class LocalGraph:
141
143
  graph.
142
144
 
143
145
  Example:
146
+ >>> # doctest: +SKIP
144
147
  >>> import kumoai.experimental.rfm as rfm
145
148
  >>> df1 = pd.DataFrame(...)
146
149
  >>> df2 = pd.DataFrame(...)
@@ -150,7 +153,7 @@ class LocalGraph:
150
153
  ... "table2": df2,
151
154
  ... "table3": df3,
152
155
  ... })
153
- ... graph.validate()
156
+ >>> graph.validate()
154
157
  """
155
158
  tables = [LocalTable(df, name) for name, df in df_dict.items()]
156
159
 
@@ -231,16 +234,17 @@ class LocalGraph:
231
234
  r"""Returns a :class:`pandas.DataFrame` object containing metadata
232
235
  information about the tables in this graph.
233
236
 
234
- The returned dataframe has columns ``name``, ``primary_key``, and
235
- ``time_column``, which provide an aggregate view of the properties of
236
- the tables of this graph.
237
+ The returned dataframe has columns ``name``, ``primary_key``,
238
+ ``time_column``, and ``end_time_column``, which provide an aggregate
239
+ view of the properties of the tables of this graph.
237
240
 
238
241
  Example:
242
+ >>> # doctest: +SKIP
239
243
  >>> import kumoai.experimental.rfm as rfm
240
244
  >>> graph = rfm.LocalGraph(tables=...).infer_metadata()
241
- >>> graph.metadata
242
- name primary_key time_column
243
- 0 users user_id -
245
+ >>> graph.metadata # doctest: +SKIP
246
+ name primary_key time_column end_time_column
247
+ 0 users user_id - -
244
248
  """
245
249
  tables = list(self.tables.values())
246
250
 
@@ -251,6 +255,11 @@ class LocalGraph:
251
255
  pd.Series(dtype=str, data=[t._primary_key or '-' for t in tables]),
252
256
  'time_column':
253
257
  pd.Series(dtype=str, data=[t._time_column or '-' for t in tables]),
258
+ 'end_time_column':
259
+ pd.Series(
260
+ dtype=str,
261
+ data=[t._end_time_column or '-' for t in tables],
262
+ ),
254
263
  })
255
264
 
256
265
  def print_metadata(self) -> None:
@@ -602,8 +611,8 @@ class LocalGraph:
602
611
  raise ValueError(f"{edge} is invalid as foreign key "
603
612
  f"'{fkey}' and primary key '{dst_key.name}' "
604
613
  f"have incompatible data types (got "
605
- f"fkey.dtype '{dst_key.dtype}' and "
606
- f"pkey.dtype '{src_key.dtype}')")
614
+ f"fkey.dtype '{src_key.dtype}' and "
615
+ f"pkey.dtype '{dst_key.dtype}')")
607
616
 
608
617
  return self
609
618
 
@@ -676,6 +685,11 @@ class LocalGraph:
676
685
  ]
677
686
  if time_column := table.time_column:
678
687
  keys += [f'{time_column.name}: Time ({time_column.dtype})']
688
+ if end_time_column := table.end_time_column:
689
+ keys += [
690
+ f'{end_time_column.name}: '
691
+ f'End Time ({end_time_column.dtype})'
692
+ ]
679
693
  key_repr = left_align(keys)
680
694
 
681
695
  columns = []
@@ -683,9 +697,9 @@ class LocalGraph:
683
697
  columns += [
684
698
  f'{column.name}: {column.stype} ({column.dtype})'
685
699
  for column in table.columns
686
- if column.name not in fkeys_dict[table_name]
687
- and column.name != table._primary_key
688
- and column.name != table._time_column
700
+ if column.name not in fkeys_dict[table_name] and
701
+ column.name != table._primary_key and column.name != table.
702
+ _time_column and column.name != table._end_time_column
689
703
  ]
690
704
  column_repr = left_align(columns)
691
705
 
@@ -752,16 +766,18 @@ class LocalGraph:
752
766
  def _to_api_graph_definition(self) -> GraphDefinition:
753
767
  tables: Dict[str, TableDefinition] = {}
754
768
  col_groups: List[ColumnKeyGroup] = []
755
- for t_name, table in self.tables.items():
756
- tables[t_name] = table._to_api_table_definition()
769
+ for table_name, table in self.tables.items():
770
+ tables[table_name] = table._to_api_table_definition()
757
771
  if table.primary_key is None:
758
772
  continue
759
- keys = [ColumnKey(t_name, table.primary_key.name)]
773
+ keys = [ColumnKey(table_name, table.primary_key.name)]
760
774
  for edge in self.edges:
761
- if edge.dst_table == t_name:
775
+ if edge.dst_table == table_name:
762
776
  keys.append(ColumnKey(edge.src_table, edge.fkey))
763
- keys = sorted(list(set(keys)),
764
- key=lambda x: f'{x.table_name}.{x.col_name}')
777
+ keys = sorted(
778
+ list(set(keys)),
779
+ key=lambda x: f'{x.table_name}.{x.col_name}',
780
+ )
765
781
  if len(keys) > 1:
766
782
  col_groups.append(ColumnKeyGroup(keys))
767
783
  return GraphDefinition(tables, col_groups)
@@ -2,7 +2,6 @@ from typing import Dict, List, Optional, Tuple
2
2
 
3
3
  import numpy as np
4
4
  import pandas as pd
5
- from kumoapi.model_plan import RunMode
6
5
  from kumoapi.rfm.context import EdgeLayout, Link, Subgraph, Table
7
6
  from kumoapi.typing import Stype
8
7
 
@@ -33,7 +32,6 @@ class LocalGraphSampler:
33
32
  entity_table_names: Tuple[str, ...],
34
33
  node: np.ndarray,
35
34
  time: np.ndarray,
36
- run_mode: RunMode,
37
35
  num_neighbors: List[int],
38
36
  exclude_cols_dict: Dict[str, List[str]],
39
37
  ) -> Subgraph:
@@ -92,15 +90,23 @@ class LocalGraphSampler:
92
90
  )
93
91
  continue
94
92
 
95
- # Only store unique rows in `df` above a certain threshold:
96
- unique_node, inverse_node = np.unique(node, return_inverse=True)
97
- if len(node) > 1.05 * len(unique_node):
98
- df = df.iloc[unique_node]
99
- row = inverse_node
93
+ row: Optional[np.ndarray] = None
94
+ if table_name in self._graph_store.end_time_column_dict:
95
+ # Set end time to NaT for all values greater than anchor time:
96
+ df = df.iloc[node].reset_index(drop=True)
97
+ col_name = self._graph_store.end_time_column_dict[table_name]
98
+ ser = df[col_name]
99
+ value = ser.astype('datetime64[ns]').astype(int).to_numpy()
100
+ mask = value > time[batch]
101
+ df.loc[mask, col_name] = pd.NaT
100
102
  else:
101
- df = df.iloc[node]
102
- row = None
103
- df = df.reset_index(drop=True)
103
+ # Only store unique rows in `df` above a certain threshold:
104
+ unique_node, inverse = np.unique(node, return_inverse=True)
105
+ if len(node) > 1.05 * len(unique_node):
106
+ df = df.iloc[unique_node].reset_index(drop=True)
107
+ row = inverse
108
+ else:
109
+ df = df.iloc[node].reset_index(drop=True)
104
110
 
105
111
  # Filter data frame to minimal set of columns:
106
112
  df = df[columns]
@@ -45,6 +45,7 @@ class LocalGraphStore:
45
45
 
46
46
  (
47
47
  self.time_column_dict,
48
+ self.end_time_column_dict,
48
49
  self.time_dict,
49
50
  self.min_time,
50
51
  self.max_time,
@@ -219,16 +220,21 @@ class LocalGraphStore:
219
220
  self,
220
221
  graph: LocalGraph,
221
222
  ) -> Tuple[
223
+ Dict[str, str],
222
224
  Dict[str, str],
223
225
  Dict[str, np.ndarray],
224
226
  pd.Timestamp,
225
227
  pd.Timestamp,
226
228
  ]:
227
229
  time_column_dict: Dict[str, str] = {}
230
+ end_time_column_dict: Dict[str, str] = {}
228
231
  time_dict: Dict[str, np.ndarray] = {}
229
232
  min_time = pd.Timestamp.max
230
233
  max_time = pd.Timestamp.min
231
234
  for table in graph.tables.values():
235
+ if table._end_time_column is not None:
236
+ end_time_column_dict[table.name] = table._end_time_column
237
+
232
238
  if table._time_column is None:
233
239
  continue
234
240
 
@@ -243,7 +249,13 @@ class LocalGraphStore:
243
249
  min_time = min(min_time, time.min())
244
250
  max_time = max(max_time, time.max())
245
251
 
246
- return time_column_dict, time_dict, min_time, max_time
252
+ return (
253
+ time_column_dict,
254
+ end_time_column_dict,
255
+ time_dict,
256
+ min_time,
257
+ max_time,
258
+ )
247
259
 
248
260
  def get_csc(
249
261
  self,