kumoai 2.13.0.dev202512041731__cp310-cp310-win_amd64.whl → 2.15.0.dev202601141731__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. kumoai/__init__.py +23 -26
  2. kumoai/_version.py +1 -1
  3. kumoai/client/client.py +6 -0
  4. kumoai/client/jobs.py +26 -0
  5. kumoai/client/pquery.py +6 -2
  6. kumoai/connector/utils.py +21 -7
  7. kumoai/experimental/rfm/__init__.py +51 -24
  8. kumoai/experimental/rfm/authenticate.py +3 -4
  9. kumoai/experimental/rfm/backend/local/__init__.py +4 -0
  10. kumoai/experimental/rfm/{local_graph_store.py → backend/local/graph_store.py} +62 -110
  11. kumoai/experimental/rfm/backend/local/sampler.py +312 -0
  12. kumoai/experimental/rfm/backend/local/table.py +35 -31
  13. kumoai/experimental/rfm/backend/snow/__init__.py +2 -0
  14. kumoai/experimental/rfm/backend/snow/sampler.py +407 -0
  15. kumoai/experimental/rfm/backend/snow/table.py +178 -50
  16. kumoai/experimental/rfm/backend/sqlite/__init__.py +4 -2
  17. kumoai/experimental/rfm/backend/sqlite/sampler.py +456 -0
  18. kumoai/experimental/rfm/backend/sqlite/table.py +131 -48
  19. kumoai/experimental/rfm/base/__init__.py +22 -4
  20. kumoai/experimental/rfm/base/column.py +96 -10
  21. kumoai/experimental/rfm/base/expression.py +44 -0
  22. kumoai/experimental/rfm/base/mapper.py +69 -0
  23. kumoai/experimental/rfm/base/sampler.py +696 -47
  24. kumoai/experimental/rfm/base/source.py +2 -1
  25. kumoai/experimental/rfm/base/sql_sampler.py +385 -0
  26. kumoai/experimental/rfm/base/table.py +384 -207
  27. kumoai/experimental/rfm/base/utils.py +36 -0
  28. kumoai/experimental/rfm/graph.py +359 -187
  29. kumoai/experimental/rfm/infer/__init__.py +6 -4
  30. kumoai/experimental/rfm/infer/dtype.py +10 -5
  31. kumoai/experimental/rfm/infer/multicategorical.py +1 -1
  32. kumoai/experimental/rfm/infer/pkey.py +4 -2
  33. kumoai/experimental/rfm/infer/stype.py +35 -0
  34. kumoai/experimental/rfm/infer/time_col.py +5 -4
  35. kumoai/experimental/rfm/pquery/executor.py +27 -27
  36. kumoai/experimental/rfm/pquery/pandas_executor.py +30 -32
  37. kumoai/experimental/rfm/relbench.py +76 -0
  38. kumoai/experimental/rfm/rfm.py +770 -467
  39. kumoai/experimental/rfm/sagemaker.py +4 -4
  40. kumoai/experimental/rfm/task_table.py +292 -0
  41. kumoai/kumolib.cp310-win_amd64.pyd +0 -0
  42. kumoai/pquery/predictive_query.py +10 -6
  43. kumoai/pquery/training_table.py +16 -2
  44. kumoai/testing/snow.py +50 -0
  45. kumoai/trainer/distilled_trainer.py +175 -0
  46. kumoai/utils/__init__.py +3 -2
  47. kumoai/utils/display.py +87 -0
  48. kumoai/utils/progress_logger.py +192 -13
  49. kumoai/utils/sql.py +3 -0
  50. {kumoai-2.13.0.dev202512041731.dist-info → kumoai-2.15.0.dev202601141731.dist-info}/METADATA +3 -2
  51. {kumoai-2.13.0.dev202512041731.dist-info → kumoai-2.15.0.dev202601141731.dist-info}/RECORD +54 -42
  52. kumoai/experimental/rfm/local_graph_sampler.py +0 -223
  53. kumoai/experimental/rfm/local_pquery_driver.py +0 -689
  54. {kumoai-2.13.0.dev202512041731.dist-info → kumoai-2.15.0.dev202601141731.dist-info}/WHEEL +0 -0
  55. {kumoai-2.13.0.dev202512041731.dist-info → kumoai-2.15.0.dev202601141731.dist-info}/licenses/LICENSE +0 -0
  56. {kumoai-2.13.0.dev202512041731.dist-info → kumoai-2.15.0.dev202601141731.dist-info}/top_level.txt +0 -0
@@ -1,223 +0,0 @@
1
- import re
2
- from typing import Dict, List, Optional, Tuple
3
-
4
- import numpy as np
5
- import pandas as pd
6
- from kumoapi.rfm.context import EdgeLayout, Link, Subgraph, Table
7
- from kumoapi.typing import Stype
8
-
9
- import kumoai.kumolib as kumolib
10
- from kumoai.experimental.rfm.local_graph_store import LocalGraphStore
11
-
12
- PUNCTUATION = re.compile(r"[\'\"\.,\(\)\!\?\;\:]")
13
- MULTISPACE = re.compile(r"\s+")
14
-
15
-
16
- def normalize_text(
17
- ser: pd.Series,
18
- max_words: Optional[int] = 50,
19
- ) -> pd.Series:
20
- r"""Normalizes text into a list of lower-case words.
21
-
22
- Args:
23
- ser: The :class:`pandas.Series` to normalize.
24
- max_words: The maximum number of words to return.
25
- This will auto-shrink any large text column to avoid blowing up
26
- context size.
27
- """
28
- if len(ser) == 0 or pd.api.types.is_list_like(ser.iloc[0]):
29
- return ser
30
-
31
- def normalize_fn(line: str) -> list[str]:
32
- line = PUNCTUATION.sub(" ", line)
33
- line = re.sub(r"<br\s*/?>", " ", line) # Handle <br /> or <br>
34
- line = MULTISPACE.sub(" ", line)
35
- words = line.split()
36
- if max_words is not None:
37
- words = words[:max_words]
38
- return words
39
-
40
- ser = ser.fillna('').astype(str)
41
-
42
- if max_words is not None:
43
- # We estimate the number of words as 5 characters + 1 space in an
44
- # English text on average. We need this pre-filter here, as word
45
- # splitting on a giant text can be very expensive:
46
- ser = ser.str[:6 * max_words]
47
-
48
- ser = ser.str.lower()
49
- ser = ser.map(normalize_fn)
50
-
51
- return ser
52
-
53
-
54
- class LocalGraphSampler:
55
- def __init__(self, graph_store: LocalGraphStore) -> None:
56
- self._graph_store = graph_store
57
- self._sampler = kumolib.NeighborSampler(
58
- self._graph_store.node_types,
59
- self._graph_store.edge_types,
60
- {
61
- '__'.join(edge_type): colptr
62
- for edge_type, colptr in self._graph_store.colptr_dict.items()
63
- },
64
- {
65
- '__'.join(edge_type): row
66
- for edge_type, row in self._graph_store.row_dict.items()
67
- },
68
- self._graph_store.time_dict,
69
- )
70
-
71
- def __call__(
72
- self,
73
- entity_table_names: Tuple[str, ...],
74
- node: np.ndarray,
75
- time: np.ndarray,
76
- num_neighbors: List[int],
77
- exclude_cols_dict: Dict[str, List[str]],
78
- ) -> Subgraph:
79
-
80
- (
81
- row_dict,
82
- col_dict,
83
- node_dict,
84
- batch_dict,
85
- num_sampled_nodes_dict,
86
- num_sampled_edges_dict,
87
- ) = self._sampler.sample(
88
- {
89
- '__'.join(edge_type): num_neighbors
90
- for edge_type in self._graph_store.edge_types
91
- },
92
- {}, # time interval based sampling
93
- entity_table_names[0],
94
- node,
95
- time // 1000**3, # nanoseconds to seconds
96
- )
97
-
98
- table_dict: Dict[str, Table] = {}
99
- for table_name, node in node_dict.items():
100
- batch = batch_dict[table_name]
101
-
102
- if len(node) == 0:
103
- continue
104
-
105
- df = self._graph_store.df_dict[table_name]
106
-
107
- num_sampled_nodes = num_sampled_nodes_dict[table_name].tolist()
108
- stype_dict = { # Exclude target columns:
109
- column_name: stype
110
- for column_name, stype in
111
- self._graph_store.stype_dict[table_name].items()
112
- if column_name not in exclude_cols_dict.get(table_name, [])
113
- }
114
- primary_key: Optional[str] = None
115
- if table_name in entity_table_names:
116
- primary_key = self._graph_store.pkey_name_dict.get(table_name)
117
-
118
- columns: List[str] = []
119
- if table_name in entity_table_names:
120
- columns += [self._graph_store.pkey_name_dict[table_name]]
121
- columns += list(stype_dict.keys())
122
-
123
- if len(columns) == 0:
124
- table_dict[table_name] = Table(
125
- df=pd.DataFrame(index=range(len(node))),
126
- row=None,
127
- batch=batch,
128
- num_sampled_nodes=num_sampled_nodes,
129
- stype_dict=stype_dict,
130
- primary_key=primary_key,
131
- )
132
- continue
133
-
134
- row: Optional[np.ndarray] = None
135
- if table_name in self._graph_store.end_time_column_dict:
136
- # Set end time to NaT for all values greater than anchor time:
137
- df = df.iloc[node].reset_index(drop=True)
138
- col_name = self._graph_store.end_time_column_dict[table_name]
139
- ser = df[col_name]
140
- value = ser.astype('datetime64[ns]').astype(int).to_numpy()
141
- mask = value > time[batch]
142
- df.loc[mask, col_name] = pd.NaT
143
- else:
144
- # Only store unique rows in `df` above a certain threshold:
145
- unique_node, inverse = np.unique(node, return_inverse=True)
146
- if len(node) > 1.05 * len(unique_node):
147
- df = df.iloc[unique_node].reset_index(drop=True)
148
- row = inverse
149
- else:
150
- df = df.iloc[node].reset_index(drop=True)
151
-
152
- # Filter data frame to minimal set of columns:
153
- df = df[columns]
154
-
155
- # Normalize text (if not already pre-processed):
156
- for column_name, stype in stype_dict.items():
157
- if stype == Stype.text:
158
- df[column_name] = normalize_text(df[column_name])
159
-
160
- table_dict[table_name] = Table(
161
- df=df,
162
- row=row,
163
- batch=batch,
164
- num_sampled_nodes=num_sampled_nodes,
165
- stype_dict=stype_dict,
166
- primary_key=primary_key,
167
- )
168
-
169
- link_dict: Dict[Tuple[str, str, str], Link] = {}
170
- for edge_type in self._graph_store.edge_types:
171
- edge_type_str = '__'.join(edge_type)
172
-
173
- row = row_dict[edge_type_str]
174
- col = col_dict[edge_type_str]
175
-
176
- if len(row) == 0:
177
- continue
178
-
179
- # Do not store reverse edge type if it is a replica:
180
- rev_edge_type = Subgraph.rev_edge_type(edge_type)
181
- rev_edge_type_str = '__'.join(rev_edge_type)
182
- if (rev_edge_type in link_dict
183
- and np.array_equal(row, col_dict[rev_edge_type_str])
184
- and np.array_equal(col, row_dict[rev_edge_type_str])):
185
- link = Link(
186
- layout=EdgeLayout.REV,
187
- row=None,
188
- col=None,
189
- num_sampled_edges=(
190
- num_sampled_edges_dict[edge_type_str].tolist()),
191
- )
192
- link_dict[edge_type] = link
193
- continue
194
-
195
- layout = EdgeLayout.COO
196
- if np.array_equal(row, np.arange(len(row))):
197
- row = None
198
- if np.array_equal(col, np.arange(len(col))):
199
- col = None
200
-
201
- # Store in compressed representation if more efficient:
202
- num_cols = table_dict[edge_type[2]].num_rows
203
- if col is not None and len(col) > num_cols + 1:
204
- layout = EdgeLayout.CSC
205
- colcount = np.bincount(col, minlength=num_cols)
206
- col = np.empty(num_cols + 1, dtype=col.dtype)
207
- col[0] = 0
208
- np.cumsum(colcount, out=col[1:])
209
-
210
- link = Link(
211
- layout=layout,
212
- row=row,
213
- col=col,
214
- num_sampled_edges=(
215
- num_sampled_edges_dict[edge_type_str].tolist()),
216
- )
217
- link_dict[edge_type] = link
218
-
219
- return Subgraph(
220
- anchor_time=time,
221
- table_dict=table_dict,
222
- link_dict=link_dict,
223
- )