kumoai 2.13.0.dev202512091732__cp311-cp311-macosx_11_0_arm64.whl → 2.14.0.dev202512191731__cp311-cp311-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kumoai/_version.py +1 -1
- kumoai/client/pquery.py +6 -2
- kumoai/experimental/rfm/__init__.py +33 -8
- kumoai/experimental/rfm/authenticate.py +3 -4
- kumoai/experimental/rfm/backend/local/graph_store.py +40 -83
- kumoai/experimental/rfm/backend/local/sampler.py +128 -55
- kumoai/experimental/rfm/backend/local/table.py +21 -16
- kumoai/experimental/rfm/backend/snow/__init__.py +2 -0
- kumoai/experimental/rfm/backend/snow/sampler.py +252 -0
- kumoai/experimental/rfm/backend/snow/table.py +101 -49
- kumoai/experimental/rfm/backend/sqlite/__init__.py +4 -2
- kumoai/experimental/rfm/backend/sqlite/sampler.py +349 -0
- kumoai/experimental/rfm/backend/sqlite/table.py +84 -31
- kumoai/experimental/rfm/base/__init__.py +24 -5
- kumoai/experimental/rfm/base/column.py +14 -12
- kumoai/experimental/rfm/base/column_expression.py +50 -0
- kumoai/experimental/rfm/base/sampler.py +429 -30
- kumoai/experimental/rfm/base/source.py +1 -0
- kumoai/experimental/rfm/base/sql_sampler.py +84 -0
- kumoai/experimental/rfm/base/sql_table.py +229 -0
- kumoai/experimental/rfm/base/table.py +165 -135
- kumoai/experimental/rfm/graph.py +266 -102
- kumoai/experimental/rfm/infer/__init__.py +6 -4
- kumoai/experimental/rfm/infer/dtype.py +3 -3
- kumoai/experimental/rfm/infer/pkey.py +4 -2
- kumoai/experimental/rfm/infer/stype.py +35 -0
- kumoai/experimental/rfm/infer/time_col.py +1 -2
- kumoai/experimental/rfm/pquery/executor.py +27 -27
- kumoai/experimental/rfm/pquery/pandas_executor.py +29 -31
- kumoai/experimental/rfm/rfm.py +299 -230
- kumoai/experimental/rfm/sagemaker.py +4 -4
- kumoai/pquery/predictive_query.py +10 -6
- kumoai/testing/snow.py +50 -0
- kumoai/utils/__init__.py +3 -2
- kumoai/utils/progress_logger.py +178 -12
- kumoai/utils/sql.py +3 -0
- {kumoai-2.13.0.dev202512091732.dist-info → kumoai-2.14.0.dev202512191731.dist-info}/METADATA +3 -2
- {kumoai-2.13.0.dev202512091732.dist-info → kumoai-2.14.0.dev202512191731.dist-info}/RECORD +41 -35
- kumoai/experimental/rfm/local_graph_sampler.py +0 -223
- kumoai/experimental/rfm/local_pquery_driver.py +0 -689
- {kumoai-2.13.0.dev202512091732.dist-info → kumoai-2.14.0.dev202512191731.dist-info}/WHEEL +0 -0
- {kumoai-2.13.0.dev202512091732.dist-info → kumoai-2.14.0.dev202512191731.dist-info}/licenses/LICENSE +0 -0
- {kumoai-2.13.0.dev202512091732.dist-info → kumoai-2.14.0.dev202512191731.dist-info}/top_level.txt +0 -0
|
@@ -1,223 +0,0 @@
|
|
|
1
|
-
import re
|
|
2
|
-
from typing import Dict, List, Optional, Tuple
|
|
3
|
-
|
|
4
|
-
import numpy as np
|
|
5
|
-
import pandas as pd
|
|
6
|
-
from kumoapi.rfm.context import EdgeLayout, Link, Subgraph, Table
|
|
7
|
-
from kumoapi.typing import Stype
|
|
8
|
-
|
|
9
|
-
import kumoai.kumolib as kumolib
|
|
10
|
-
from kumoai.experimental.rfm.backend.local import LocalGraphStore
|
|
11
|
-
|
|
12
|
-
PUNCTUATION = re.compile(r"[\'\"\.,\(\)\!\?\;\:]")
# NOTE: No longer used by `normalize_text` (`str.split()` already collapses
# whitespace runs); kept so that existing imports of this name keep working.
MULTISPACE = re.compile(r"\s+")
# HTML line breaks (`<br>`, `<br/>`, `<br />`), compiled once at import time:
_LINE_BREAK = re.compile(r"<br\s*/?>")


def normalize_text(
    ser: pd.Series,
    max_words: Optional[int] = 50,
) -> pd.Series:
    r"""Normalizes text into a list of lower-case words.

    Missing values are treated as empty text and therefore map to an empty
    list. Punctuation and HTML line breaks are replaced by spaces before
    splitting on whitespace.

    Args:
        ser: The :class:`pandas.Series` to normalize.
        max_words: The maximum number of words to return.
            This will auto-shrink any large text column to avoid blowing up
            context size. If :obj:`None`, no limit is applied.

    Returns:
        A series holding one list of words per row. If :obj:`ser` is empty or
        already holds list-like values, it is returned unchanged.
    """
    if len(ser) == 0:
        return ser

    # Decide whether the column is already tokenized based on the first
    # *non-missing* value. Looking only at the very first row would misjudge
    # a pre-processed column that happens to start with a missing value and
    # would then stringify and re-split its token lists.
    probe = ser.iloc[0]
    if not pd.api.types.is_list_like(probe) and pd.isna(probe):
        is_valid = ser.notna().to_numpy()
        if is_valid.any():
            probe = ser.iloc[int(is_valid.argmax())]
    if pd.api.types.is_list_like(probe):
        return ser

    def normalize_fn(line: str) -> list[str]:
        line = PUNCTUATION.sub(" ", line)
        line = _LINE_BREAK.sub(" ", line)
        # `split()` without arguments splits on arbitrary whitespace runs, so
        # no separate whitespace-collapsing pass is needed:
        words = line.split()
        if max_words is not None:
            words = words[:max_words]
        return words

    ser = ser.fillna('').astype(str)

    if max_words is not None:
        # We estimate the number of words as 5 characters + 1 space in an
        # English text on average. We need this pre-filter here, as word
        # splitting on a giant text can be very expensive:
        ser = ser.str[:6 * max_words]

    return ser.str.lower().map(normalize_fn)
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
class LocalGraphSampler:
    r"""Samples subgraphs from a :class:`LocalGraphStore`.

    Wraps a ``kumolib.NeighborSampler`` built from the graph structure held by
    the store, and converts the raw sampler output into a
    :class:`kumoapi.rfm.context.Subgraph`.
    """
    def __init__(self, graph_store: LocalGraphStore) -> None:
        r"""Builds the underlying neighbor sampler.

        Args:
            graph_store: The store providing node/edge types, the per-edge
                type ``colptr``/``row`` arrays and the per-table time data.
        """
        self._graph_store = graph_store
        # The sampler is keyed by strings, so edge type tuples are flattened
        # by joining their parts with "__":
        self._sampler = kumolib.NeighborSampler(
            graph_store.node_types,
            graph_store.edge_types,
            {
                '__'.join(edge_type): colptr
                for edge_type, colptr in graph_store.colptr_dict.items()
            },
            {
                '__'.join(edge_type): row
                for edge_type, row in graph_store.row_dict.items()
            },
            graph_store.time_dict,
        )

    def __call__(
        self,
        entity_table_names: Tuple[str, ...],
        node: np.ndarray,
        time: np.ndarray,
        num_neighbors: List[int],
        exclude_cols_dict: Dict[str, List[str]],
    ) -> Subgraph:
        r"""Samples a subgraph around the given seed nodes.

        Args:
            entity_table_names: The names of the entity tables. Sampling
                starts from the first entry; the primary key column is kept
                for every listed table.
            node: The seed nodes handed to the sampler (presumably row
                indices into the first entity table -- confirm with caller).
            time: The anchor time of each seed in nanoseconds.
            num_neighbors: The neighbor specification, applied identically to
                every edge type (presumably one fan-out per hop -- confirm
                against ``kumolib.NeighborSampler.sample``).
            exclude_cols_dict: Per table, the columns to drop from the output
                (e.g., target columns).

        Returns:
            The sampled :class:`Subgraph` holding one table per non-empty
            sampled node type and one link per non-empty sampled edge type.
        """
        (
            row_dict,
            col_dict,
            node_dict,
            batch_dict,
            num_sampled_nodes_dict,
            num_sampled_edges_dict,
        ) = self._sampler.sample(
            {
                '__'.join(edge_type): num_neighbors
                for edge_type in self._graph_store.edge_types
            },
            {},  # time interval based sampling
            entity_table_names[0],
            node,
            time // 1000**3,  # nanoseconds to seconds
        )

        table_dict: Dict[str, Table] = {}
        for table_name, sampled_node in node_dict.items():
            batch = batch_dict[table_name]

            if len(sampled_node) == 0:  # Skip tables without sampled rows.
                continue

            table_dict[table_name] = self._build_table(
                table_name=table_name,
                sampled_node=sampled_node,
                batch=batch,
                num_sampled_nodes=num_sampled_nodes_dict[table_name].tolist(),
                time=time,
                entity_table_names=entity_table_names,
                exclude_cols_dict=exclude_cols_dict,
            )

        link_dict = self._build_links(
            row_dict=row_dict,
            col_dict=col_dict,
            num_sampled_edges_dict=num_sampled_edges_dict,
            table_dict=table_dict,
        )

        return Subgraph(
            anchor_time=time,
            table_dict=table_dict,
            link_dict=link_dict,
        )

    def _build_table(
        self,
        table_name: str,
        sampled_node: np.ndarray,
        batch: np.ndarray,
        num_sampled_nodes: List[int],
        time: np.ndarray,
        entity_table_names: Tuple[str, ...],
        exclude_cols_dict: Dict[str, List[str]],
    ) -> Table:
        r"""Materializes the sampled rows of a single table."""
        store = self._graph_store
        df = store.df_dict[table_name]

        # Exclude target columns (resolve the exclusion list once per table):
        excluded = set(exclude_cols_dict.get(table_name, []))
        stype_dict = {
            column_name: stype
            for column_name, stype in store.stype_dict[table_name].items()
            if column_name not in excluded
        }

        # Entity tables additionally expose their primary key column:
        primary_key: Optional[str] = None
        columns: List[str] = []
        if table_name in entity_table_names:
            primary_key = store.pkey_name_dict[table_name]
            columns.append(primary_key)
        columns.extend(stype_dict)

        if not columns:
            # Nothing to expose; only keep the number of sampled rows:
            return Table(
                df=pd.DataFrame(index=range(len(sampled_node))),
                row=None,
                batch=batch,
                num_sampled_nodes=num_sampled_nodes,
                stype_dict=stype_dict,
                primary_key=primary_key,
            )

        row: Optional[np.ndarray] = None
        if table_name in store.end_time_column_dict:
            # Set end time to NaT for all values greater than anchor time.
            # The mask depends on the anchor time of each sampled row
            # (`time[batch]`), so rows are not de-duplicated in this branch:
            df = df.iloc[sampled_node].reset_index(drop=True)
            col_name = store.end_time_column_dict[table_name]
            value = df[col_name].astype('datetime64[ns]').astype(int)
            mask = value.to_numpy() > time[batch]
            df.loc[mask, col_name] = pd.NaT
        else:
            # Only store unique rows in `df` above a certain threshold, in
            # which case `row` maps each sampled node to its row in `df`:
            unique_node, inverse = np.unique(sampled_node,
                                             return_inverse=True)
            if len(sampled_node) > 1.05 * len(unique_node):
                df = df.iloc[unique_node].reset_index(drop=True)
                row = inverse
            else:
                df = df.iloc[sampled_node].reset_index(drop=True)

        # Filter data frame to minimal set of columns:
        df = df[columns]

        # Normalize text (if not already pre-processed):
        for column_name, stype in stype_dict.items():
            if stype == Stype.text:
                df[column_name] = normalize_text(df[column_name])

        return Table(
            df=df,
            row=row,
            batch=batch,
            num_sampled_nodes=num_sampled_nodes,
            stype_dict=stype_dict,
            primary_key=primary_key,
        )

    def _build_links(
        self,
        row_dict: Dict[str, np.ndarray],
        col_dict: Dict[str, np.ndarray],
        num_sampled_edges_dict: Dict[str, np.ndarray],
        table_dict: Dict[str, Table],
    ) -> Dict[Tuple[str, str, str], Link]:
        r"""Converts sampled edges into their most compact link layout."""
        link_dict: Dict[Tuple[str, str, str], Link] = {}
        for edge_type in self._graph_store.edge_types:
            edge_type_str = '__'.join(edge_type)

            row = row_dict[edge_type_str]
            col = col_dict[edge_type_str]

            if len(row) == 0:  # Skip edge types without sampled edges.
                continue

            num_sampled_edges = num_sampled_edges_dict[edge_type_str].tolist()

            # Do not store reverse edge type if it is a replica, i.e. if its
            # rows/columns are exactly the swapped ones of an edge type that
            # has already been stored:
            rev_edge_type = Subgraph.rev_edge_type(edge_type)
            rev_edge_type_str = '__'.join(rev_edge_type)
            if (rev_edge_type in link_dict
                    and np.array_equal(row, col_dict[rev_edge_type_str])
                    and np.array_equal(col, row_dict[rev_edge_type_str])):
                link_dict[edge_type] = Link(
                    layout=EdgeLayout.REV,
                    row=None,
                    col=None,
                    num_sampled_edges=num_sampled_edges,
                )
                continue

            # An index vector equal to `0, 1, ..., n - 1` is implicit and
            # does not need to be stored:
            layout = EdgeLayout.COO
            if np.array_equal(row, np.arange(len(row))):
                row = None
            if np.array_equal(col, np.arange(len(col))):
                col = None

            # Store in compressed representation if more efficient.
            # NOTE(review): `row` is not re-ordered here, so this presumably
            # relies on the sampler emitting edges grouped by `col` --
            # confirm against `kumolib.NeighborSampler`.
            num_cols = table_dict[edge_type[2]].num_rows
            if col is not None and len(col) > num_cols + 1:
                layout = EdgeLayout.CSC
                colcount = np.bincount(col, minlength=num_cols)
                col = np.empty(num_cols + 1, dtype=col.dtype)
                col[0] = 0
                np.cumsum(colcount, out=col[1:])

            link_dict[edge_type] = Link(
                layout=layout,
                row=row,
                col=col,
                num_sampled_edges=num_sampled_edges,
            )

        return link_dict
|