kumoai 2.13.0.dev202511181731__cp311-cp311-macosx_11_0_arm64.whl → 2.14.0.dev202512191731__cp311-cp311-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. kumoai/__init__.py +12 -0
  2. kumoai/_version.py +1 -1
  3. kumoai/client/pquery.py +6 -2
  4. kumoai/connector/utils.py +23 -2
  5. kumoai/experimental/rfm/__init__.py +52 -52
  6. kumoai/experimental/rfm/authenticate.py +3 -4
  7. kumoai/experimental/rfm/backend/__init__.py +0 -0
  8. kumoai/experimental/rfm/backend/local/__init__.py +42 -0
  9. kumoai/experimental/rfm/{local_graph_store.py → backend/local/graph_store.py} +57 -110
  10. kumoai/experimental/rfm/backend/local/sampler.py +315 -0
  11. kumoai/experimental/rfm/backend/local/table.py +114 -0
  12. kumoai/experimental/rfm/backend/snow/__init__.py +37 -0
  13. kumoai/experimental/rfm/backend/snow/sampler.py +252 -0
  14. kumoai/experimental/rfm/backend/snow/table.py +169 -0
  15. kumoai/experimental/rfm/backend/sqlite/__init__.py +32 -0
  16. kumoai/experimental/rfm/backend/sqlite/sampler.py +349 -0
  17. kumoai/experimental/rfm/backend/sqlite/table.py +154 -0
  18. kumoai/experimental/rfm/base/__init__.py +33 -0
  19. kumoai/experimental/rfm/base/column.py +68 -0
  20. kumoai/experimental/rfm/base/column_expression.py +50 -0
  21. kumoai/experimental/rfm/base/sampler.py +773 -0
  22. kumoai/experimental/rfm/base/source.py +19 -0
  23. kumoai/experimental/rfm/base/sql_sampler.py +84 -0
  24. kumoai/experimental/rfm/base/sql_table.py +229 -0
  25. kumoai/experimental/rfm/{local_table.py → base/table.py} +219 -189
  26. kumoai/experimental/rfm/{local_graph.py → graph.py} +510 -91
  27. kumoai/experimental/rfm/infer/__init__.py +8 -0
  28. kumoai/experimental/rfm/infer/dtype.py +79 -0
  29. kumoai/experimental/rfm/infer/pkey.py +128 -0
  30. kumoai/experimental/rfm/infer/stype.py +35 -0
  31. kumoai/experimental/rfm/infer/time_col.py +61 -0
  32. kumoai/experimental/rfm/pquery/executor.py +27 -27
  33. kumoai/experimental/rfm/pquery/pandas_executor.py +30 -32
  34. kumoai/experimental/rfm/rfm.py +313 -246
  35. kumoai/experimental/rfm/sagemaker.py +15 -7
  36. kumoai/pquery/predictive_query.py +10 -6
  37. kumoai/testing/decorators.py +1 -1
  38. kumoai/testing/snow.py +50 -0
  39. kumoai/utils/__init__.py +3 -2
  40. kumoai/utils/progress_logger.py +178 -12
  41. kumoai/utils/sql.py +3 -0
  42. {kumoai-2.13.0.dev202511181731.dist-info → kumoai-2.14.0.dev202512191731.dist-info}/METADATA +10 -8
  43. {kumoai-2.13.0.dev202511181731.dist-info → kumoai-2.14.0.dev202512191731.dist-info}/RECORD +46 -26
  44. kumoai/experimental/rfm/local_graph_sampler.py +0 -184
  45. kumoai/experimental/rfm/local_pquery_driver.py +0 -689
  46. kumoai/experimental/rfm/utils.py +0 -344
  47. {kumoai-2.13.0.dev202511181731.dist-info → kumoai-2.14.0.dev202512191731.dist-info}/WHEEL +0 -0
  48. {kumoai-2.13.0.dev202511181731.dist-info → kumoai-2.14.0.dev202512191731.dist-info}/licenses/LICENSE +0 -0
  49. {kumoai-2.13.0.dev202511181731.dist-info → kumoai-2.14.0.dev202512191731.dist-info}/top_level.txt +0 -0
@@ -1,184 +0,0 @@
1
- from typing import Dict, List, Optional, Tuple
2
-
3
- import numpy as np
4
- import pandas as pd
5
- from kumoapi.model_plan import RunMode
6
- from kumoapi.rfm.context import EdgeLayout, Link, Subgraph, Table
7
- from kumoapi.typing import Stype
8
-
9
- import kumoai.kumolib as kumolib
10
- from kumoai.experimental.rfm.local_graph_store import LocalGraphStore
11
- from kumoai.experimental.rfm.utils import normalize_text
12
-
13
-
14
class LocalGraphSampler:
    """Sample subgraphs around seed nodes of a :class:`LocalGraphStore`.

    The neighbor sampling itself is delegated to
    ``kumolib.NeighborSampler``. This class converts the sampler output into
    a ``Subgraph`` made of per-table ``Table`` objects and per-edge-type
    ``Link`` objects.
    """
    def __init__(self, graph_store: LocalGraphStore) -> None:
        """Create the underlying neighbor sampler for ``graph_store``.

        Args:
            graph_store: The graph store providing node/edge types, the
                ``colptr``/``row`` arrays per edge type, and ``time_dict``.
        """
        self._graph_store = graph_store
        # Edge types are handed to the sampler as '__'-joined strings:
        self._sampler = kumolib.NeighborSampler(
            self._graph_store.node_types,
            self._graph_store.edge_types,
            {
                '__'.join(edge_type): colptr
                for edge_type, colptr in self._graph_store.colptr_dict.items()
            },
            {
                '__'.join(edge_type): row
                for edge_type, row in self._graph_store.row_dict.items()
            },
            self._graph_store.time_dict,
        )

    def __call__(
        self,
        entity_table_names: Tuple[str, ...],
        node: np.ndarray,
        time: np.ndarray,
        run_mode: RunMode,
        num_neighbors: List[int],
        exclude_cols_dict: Dict[str, List[str]],
    ) -> Subgraph:
        """Sample a subgraph around the given seed nodes.

        Args:
            entity_table_names: Names of the entity tables. Sampling starts
                from the first one; the primary key column is retained for
                all of them.
            node: Seed node indices passed to the sampler.
            time: Anchor times. They are integer-divided by ``1000**3``
                (nanoseconds to seconds) before being passed to the sampler
                and are compared as-is against end time columns.
            run_mode: Not used by this implementation.
            num_neighbors: Number of neighbors to sample, applied to every
                edge type.
            exclude_cols_dict: Column names per table to leave out of the
                sampled tables (e.g., target columns).

        Returns:
            The sampled ``Subgraph`` holding ``time`` as anchor time, the
            sampled tables and the sampled links.
        """
        (
            row_dict,
            col_dict,
            node_dict,
            batch_dict,
            num_sampled_nodes_dict,
            num_sampled_edges_dict,
        ) = self._sampler.sample(
            {
                '__'.join(edge_type): num_neighbors
                for edge_type in self._graph_store.edge_types
            },
            {},  # time interval based sampling
            entity_table_names[0],
            node,
            time // 1000**3,  # nanoseconds to seconds
        )

        table_dict = self._build_table_dict(
            entity_table_names,
            time,
            node_dict,
            batch_dict,
            num_sampled_nodes_dict,
            exclude_cols_dict,
        )
        link_dict = self._build_link_dict(
            row_dict,
            col_dict,
            num_sampled_edges_dict,
            table_dict,
        )

        return Subgraph(
            anchor_time=time,
            table_dict=table_dict,
            link_dict=link_dict,
        )

    def _build_table_dict(
        self,
        entity_table_names: Tuple[str, ...],
        time: np.ndarray,
        node_dict: Dict[str, np.ndarray],
        batch_dict: Dict[str, np.ndarray],
        num_sampled_nodes_dict: Dict[str, np.ndarray],
        exclude_cols_dict: Dict[str, List[str]],
    ) -> Dict[str, Table]:
        """Materialize one ``Table`` per table with at least one sampled node.
        """
        store = self._graph_store

        table_dict: Dict[str, Table] = {}
        for table_name, sampled_node in node_dict.items():
            batch = batch_dict[table_name]

            if len(sampled_node) == 0:
                continue

            df = store.df_dict[table_name]
            num_sampled_nodes = num_sampled_nodes_dict[table_name].tolist()

            # Exclude target columns. The exclusion set is built once per
            # table rather than once per column:
            excluded = set(exclude_cols_dict.get(table_name, ()))
            stype_dict = {
                column_name: stype
                for column_name, stype in store.stype_dict[table_name].items()
                if column_name not in excluded
            }

            # The primary key is only kept for entity tables:
            primary_key: Optional[str] = None
            columns: List[str] = []
            if table_name in entity_table_names:
                primary_key = store.pkey_name_dict.get(table_name)
                columns.append(store.pkey_name_dict[table_name])
            columns.extend(stype_dict)

            if len(columns) == 0:
                # No columns to transfer; only the number of rows is kept:
                table_dict[table_name] = Table(
                    df=pd.DataFrame(index=range(len(sampled_node))),
                    row=None,
                    batch=batch,
                    num_sampled_nodes=num_sampled_nodes,
                    stype_dict=stype_dict,
                    primary_key=primary_key,
                )
                continue

            row: Optional[np.ndarray] = None
            if table_name in store.end_time_column_dict:
                # Set end time to NaT for all values greater than anchor time:
                df = df.iloc[sampled_node].reset_index(drop=True)
                col_name = store.end_time_column_dict[table_name]
                # Request 'int64' explicitly: the builtin `int` resolves to a
                # platform-dependent width, and a 32-bit integer cannot hold
                # nanosecond timestamps.
                end_time = (df[col_name].astype('datetime64[ns]').astype(
                    'int64').to_numpy())
                df.loc[end_time > time[batch], col_name] = pd.NaT
            else:
                # Only store unique rows in `df` above a certain threshold:
                unique_node, inverse = np.unique(
                    sampled_node,
                    return_inverse=True,
                )
                if len(sampled_node) > 1.05 * len(unique_node):
                    df = df.iloc[unique_node].reset_index(drop=True)
                    row = inverse
                else:
                    df = df.iloc[sampled_node].reset_index(drop=True)

            # Filter data frame to minimal set of columns:
            df = df[columns]

            # Normalize text (if not already pre-processed):
            for column_name, stype in stype_dict.items():
                if stype == Stype.text:
                    df[column_name] = normalize_text(df[column_name])

            table_dict[table_name] = Table(
                df=df,
                row=row,
                batch=batch,
                num_sampled_nodes=num_sampled_nodes,
                stype_dict=stype_dict,
                primary_key=primary_key,
            )

        return table_dict

    def _build_link_dict(
        self,
        row_dict: Dict[str, np.ndarray],
        col_dict: Dict[str, np.ndarray],
        num_sampled_edges_dict: Dict[str, np.ndarray],
        table_dict: Dict[str, Table],
    ) -> Dict[Tuple[str, str, str], Link]:
        """Materialize one ``Link`` per edge type with at least one edge."""
        link_dict: Dict[Tuple[str, str, str], Link] = {}
        for edge_type in self._graph_store.edge_types:
            edge_type_str = '__'.join(edge_type)

            row = row_dict[edge_type_str]
            col = col_dict[edge_type_str]

            if len(row) == 0:
                continue

            num_sampled_edges = num_sampled_edges_dict[edge_type_str].tolist()

            # Do not store reverse edge type if it is a replica:
            rev_edge_type = Subgraph.rev_edge_type(edge_type)
            rev_edge_type_str = '__'.join(rev_edge_type)
            if (rev_edge_type in link_dict
                    and np.array_equal(row, col_dict[rev_edge_type_str])
                    and np.array_equal(col, row_dict[rev_edge_type_str])):
                link_dict[edge_type] = Link(
                    layout=EdgeLayout.REV,
                    row=None,
                    col=None,
                    num_sampled_edges=num_sampled_edges,
                )
                continue

            # Indices equal to `0, 1, ..., N - 1` are stored as `None`:
            layout = EdgeLayout.COO
            if np.array_equal(row, np.arange(len(row))):
                row = None
            if np.array_equal(col, np.arange(len(col))):
                col = None

            # Store in compressed representation if more efficient:
            num_cols = table_dict[edge_type[2]].num_rows
            if col is not None and len(col) > num_cols + 1:
                layout = EdgeLayout.CSC
                colcount = np.bincount(col, minlength=num_cols)
                col = np.empty(num_cols + 1, dtype=col.dtype)
                col[0] = 0
                np.cumsum(colcount, out=col[1:])

            link_dict[edge_type] = Link(
                layout=layout,
                row=row,
                col=col,
                num_sampled_edges=num_sampled_edges,
            )

        return link_dict