kumoai 2.13.0.dev202512040651__cp312-cp312-win_amd64.whl → 2.14.0.dev202512111731__cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,13 +1,13 @@
1
- kumoai/__init__.py,sha256=qu-qohU2cQlManX1aZIlzA3ivKl52m-cSQBPSW8urUU,10837
1
+ kumoai/__init__.py,sha256=aDhb7KGetDnOz54u1Fd45zfM2N8oAha6XT2CvJqOvgc,11146
2
2
  kumoai/_logging.py,sha256=qL4JbMQwKXri2f-SEJoFB8TY5ALG12S-nobGTNWxW-A,915
3
3
  kumoai/_singleton.py,sha256=i2BHWKpccNh5SJGDyU0IXsnYzJAYr8Xb0wz4c6LRbpo,861
4
- kumoai/_version.py,sha256=ubLFhwaEt9iOGqPjskQWNn61rU80l153XB_V_RDASDU,39
4
+ kumoai/_version.py,sha256=1P4-kFh0Np-63qTmspyV3KEawGdIIMPl6ZARL87NM3s,39
5
5
  kumoai/databricks.py,sha256=ahwJz6DWLXMkndT0XwEDBxF-hoqhidFR8wBUQ4TLZ68,490
6
6
  kumoai/exceptions.py,sha256=7TMs0SC8xrU009_Pgd4QXtSF9lxJq8MtRbeX9pcQUy4,859
7
7
  kumoai/formatting.py,sha256=o3uCnLwXPhe1KI5WV9sBgRrcU7ed4rgu_pf89GL9Nc0,983
8
8
  kumoai/futures.py,sha256=J8rtZMEYFzdn5xF_x-LAiKJz3KGL6PT02f6rq_2bOJk,3836
9
9
  kumoai/jobs.py,sha256=dCi7BAdfm2tCnonYlGU4WJokJWbh3RzFfaOX2EYCIHU,2576
10
- kumoai/kumolib.cp312-win_amd64.pyd,sha256=3QbQYqFqTG9ug5pa5KodhP122ajN9gnn0g5ckd8-gWc,198144
10
+ kumoai/kumolib.cp312-win_amd64.pyd,sha256=tgu5bJkhFo3Yc524MI6sVvovE0yUuQ-dEnGPbKaFBIE,198144
11
11
  kumoai/mixin.py,sha256=IaiB8SAI0VqOoMVzzIaUlqMt53-QPUK6OB0HikG-V9E,840
12
12
  kumoai/spcs.py,sha256=KWfENrwSLruprlD-QPh63uU0N6npiNrwkeKfBk3EUyQ,4260
13
13
  kumoai/artifact_export/__init__.py,sha256=UXAQI5q92ChBzWAk8o3J6pElzYHudAzFZssQXd4o7i8,247
@@ -55,23 +55,23 @@ kumoai/encoder/__init__.py,sha256=8FeP6mUyCeXxr1b8kUIi5dxe5vEXQRft9tPoaV1CBqg,18
55
55
  kumoai/experimental/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
56
56
  kumoai/experimental/rfm/__init__.py,sha256=EFZz6IvvskmeO85Vig6p1m_6jdimS_BkeREOndHuRsc,6247
57
57
  kumoai/experimental/rfm/authenticate.py,sha256=G89_4TMeUpr5fG_0VTzMF5sdNhaciitA1oc2loTlTmo,19321
58
- kumoai/experimental/rfm/graph.py,sha256=ymPw8nNYeXeDnuyL2ioRL6wDHLmp3_gm9PQadLBCkL8,40096
59
- kumoai/experimental/rfm/local_graph_sampler.py,sha256=dQ3JnuozTNeZyUFRu2h8OTMNmV1RAoaCA0gvkpgOstg,8110
60
- kumoai/experimental/rfm/local_graph_store.py,sha256=6jY1ciVIlnBBhZCxWwBTl7SKX1fxRIDLszwrftD0Cdk,13485
61
- kumoai/experimental/rfm/local_pquery_driver.py,sha256=Yd_yHIrvuDj16IC1pvsqiQvZS41vvOOCRMiuDGtN6Fk,26851
62
- kumoai/experimental/rfm/rfm.py,sha256=vOnL8ecHTo1TX2B8_T8xaWGou8qYYz8DyVENu1H93mM,48834
58
+ kumoai/experimental/rfm/graph.py,sha256=SL3-WinoLnkZC6VVjebYGLuQJJyEVFJdCm6h3FNE0e4,40816
59
+ kumoai/experimental/rfm/rfm.py,sha256=jauaV5MuuSPsWDI2VwzMxxnsXpQ6Jp3nhgPTmxUMeh8,50304
63
60
  kumoai/experimental/rfm/sagemaker.py,sha256=sEJSyfEFBA3-7wKinBEzSooKHEn0BgPjrgRnPhYo79g,5120
64
61
  kumoai/experimental/rfm/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
65
- kumoai/experimental/rfm/backend/local/__init__.py,sha256=usMh0fuDxKK-aOVT1sU30BQWFS0eSkfUrhUVILisQQI,934
62
+ kumoai/experimental/rfm/backend/local/__init__.py,sha256=8JbLaai0yhtldFcDkddphIJKMiKc0XnodvYBWkrGPXI,1056
63
+ kumoai/experimental/rfm/backend/local/graph_store.py,sha256=m1eHot4hRSl6OY1trPRDEH0U89hVoRqMfWhA9f-SyQU,12234
64
+ kumoai/experimental/rfm/backend/local/sampler.py,sha256=GRMaBtfrSTJtX_9_uwi4MXV0V8SHYAsJJ68d5qrvJJg,11051
66
65
  kumoai/experimental/rfm/backend/local/table.py,sha256=1PqNOROzlnK3SaZHNcU2hyzeifs0N4wssQAS3-Z0Myc,3674
67
66
  kumoai/experimental/rfm/backend/snow/__init__.py,sha256=viMeR9VWpB1kjRdSWCTNFMdM7a8Mj_Dtck1twJW8dV8,962
68
- kumoai/experimental/rfm/backend/snow/table.py,sha256=f6of849A7n-Om1ExRFPfy42Ba_2we7y__ECi6V2KgYg,4374
67
+ kumoai/experimental/rfm/backend/snow/table.py,sha256=Rf4hUPOUtsjpaIc9vBKWPZ3yz20OOg6DZqCGeih4KC8,4372
69
68
  kumoai/experimental/rfm/backend/sqlite/__init__.py,sha256=xw5NNLrWSvUvRkD49X_9hZYjas5EuP1XDANPy0EEjOg,874
70
69
  kumoai/experimental/rfm/backend/sqlite/table.py,sha256=mBiZC21gQwfR4demFrP37GmawMHfIm-G82mLQeBqIZo,3901
71
- kumoai/experimental/rfm/base/__init__.py,sha256=oXPkeBemtuDxRUK61-0sOT84GZB_oQ6HvaZNU1KFNaw,199
70
+ kumoai/experimental/rfm/base/__init__.py,sha256=FX1DonqdQLD-q0SBqhNIelr6HUN2LNiutviCmdikDok,282
72
71
  kumoai/experimental/rfm/base/column.py,sha256=OE-PRQ8HO4uTq0e3_3eHJFfhp5nzw79zd-43g3iMh4g,2385
72
+ kumoai/experimental/rfm/base/sampler.py,sha256=277THFQGvQK_OkBBKHQ7166oGpnL_zVPFble7Ehlu3c,31805
73
73
  kumoai/experimental/rfm/base/source.py,sha256=H5yN9xAwK3i_69EdqOV_x58muPGKQiI8ev5BhHQDZEo,290
74
- kumoai/experimental/rfm/base/table.py,sha256=glyAg4LCQdddM3lIRClJSA7qMyfoHUVAGBf1rEs6B8Y,20113
74
+ kumoai/experimental/rfm/base/table.py,sha256=6GlWUz3tAu2g1QqTA5idWGmfo2KEJoNApDFZRn8e0pg,20388
75
75
  kumoai/experimental/rfm/infer/__init__.py,sha256=qKg8or-SpgTApD6ePw1PJ4aUZPrOLTHLRCmBIJ92hrk,486
76
76
  kumoai/experimental/rfm/infer/categorical.py,sha256=bqmfrE5ZCBTcb35lA4SyAkCu3MgttAn29VBJYMBNhVg,893
77
77
  kumoai/experimental/rfm/infer/dtype.py,sha256=Hf_drluYNuN59lTSe-8GuXalg20Pv93kCktB6Hb9f74,2686
@@ -82,14 +82,14 @@ kumoai/experimental/rfm/infer/time_col.py,sha256=G98Cgz1m9G9VA-ApnCmGYnJxEFwp1jf
82
82
  kumoai/experimental/rfm/infer/timestamp.py,sha256=L2VxjtYTSyUBYAo4M-L08xSQlPpqnHMAVF5_vxjh3Y0,1135
83
83
  kumoai/experimental/rfm/pquery/__init__.py,sha256=RkTn0I74uXOUuOiBpa6S-_QEYctMutkUnBEfF9ztQzI,159
84
84
  kumoai/experimental/rfm/pquery/executor.py,sha256=S8wwXbAkH-YSnmEVYB8d6wyJF4JJ003mH_0zFTvOp_I,2843
85
- kumoai/experimental/rfm/pquery/pandas_executor.py,sha256=QQpOZ_ArH3eSAkenaY3J-gW1Wn5A7f85RiqZxaO5u1Q,19019
85
+ kumoai/experimental/rfm/pquery/pandas_executor.py,sha256=h5F2hrnyqAPqaqH4RwTdgkeedfCcDTpph9sF0lwIrx4,19024
86
86
  kumoai/graph/__init__.py,sha256=QGk3OMwRzQJSGESdcc7hcQH6UDmNVJYTdqnRren4c7Q,240
87
87
  kumoai/graph/column.py,sha256=cQhioibTbIKIBZ-bf8-Bt4F4Iblhidps-CYWrkxRPnE,4295
88
88
  kumoai/graph/graph.py,sha256=Pq-dxi4MwoDtrrwm3xeyUB9Hl7ryNfHq4rMHuvyNB3c,39239
89
89
  kumoai/graph/table.py,sha256=BB-4ezyd7hrrj6QZwRBa80ySH0trwYb4fmhRn3xoK-k,34726
90
90
  kumoai/pquery/__init__.py,sha256=FF6QUTG_xrz2ic1I8NcIa8O993Ae98eZ9gkvQ4rapgo,558
91
91
  kumoai/pquery/prediction_table.py,sha256=hWG4L_ze4PLgUoxCXNKk8_nkYxVXELQs8_X8KGOE9yk,11063
92
- kumoai/pquery/predictive_query.py,sha256=GWhQpQxf6apyyu-bvE3z63mX6NLd8lKbyu_jzj7rNms,25608
92
+ kumoai/pquery/predictive_query.py,sha256=I5Ntc7YO1qEGxKrLuhAzZO3SySr8Wnjhde8eDbbB7zk,25542
93
93
  kumoai/pquery/training_table.py,sha256=L1QjaVlY4SAPD8OUmTaH6YjZzBbPOnS9mnAT69znWv0,16233
94
94
  kumoai/testing/__init__.py,sha256=XBQ_Sa3WnOYlpXZ3gUn8w6nVfZt-nfPhytfIBeiPt4w,178
95
95
  kumoai/testing/decorators.py,sha256=p79ZCQqPY_MHWy0_l7-xQ6wUIqFTn4AbrGWTHLvpbQY,1664
@@ -104,8 +104,8 @@ kumoai/utils/__init__.py,sha256=wAKgmwtMIGuiauW9D_GGKH95K-24Kgwmld27mm4nsro,278
104
104
  kumoai/utils/datasets.py,sha256=UyAII-oAn7x3ombuvpbSQ41aVF9SYKBjQthTD-vcT2A,3011
105
105
  kumoai/utils/forecasting.py,sha256=ZgKeUCbWLOot0giAkoigwU5du8LkrwAicFOi5hVn6wg,7624
106
106
  kumoai/utils/progress_logger.py,sha256=MZsWgHd4UZQKCXiJZgQeW-Emi_BmzlCKPLPXOL_HqBo,5239
107
- kumoai-2.13.0.dev202512040651.dist-info/licenses/LICENSE,sha256=ZUilBDp--4vbhsEr6f_Upw9rnIx09zQ3K9fXQ0rfd6w,1111
108
- kumoai-2.13.0.dev202512040651.dist-info/METADATA,sha256=7uIT7UzASJE2vNRDcrBRyZ0i0YkGVX4wTq8ClqiKCWM,2580
109
- kumoai-2.13.0.dev202512040651.dist-info/WHEEL,sha256=8UP9x9puWI0P1V_d7K2oMTBqfeLNm21CTzZ_Ptr0NXU,101
110
- kumoai-2.13.0.dev202512040651.dist-info/top_level.txt,sha256=YjU6UcmomoDx30vEXLsOU784ED7VztQOsFApk1SFwvs,7
111
- kumoai-2.13.0.dev202512040651.dist-info/RECORD,,
107
+ kumoai-2.14.0.dev202512111731.dist-info/licenses/LICENSE,sha256=ZUilBDp--4vbhsEr6f_Upw9rnIx09zQ3K9fXQ0rfd6w,1111
108
+ kumoai-2.14.0.dev202512111731.dist-info/METADATA,sha256=BTYN5IeO5MXS1lF5aA-O_p7x32QpVK0kuMMp6Wy2S-A,2580
109
+ kumoai-2.14.0.dev202512111731.dist-info/WHEEL,sha256=8UP9x9puWI0P1V_d7K2oMTBqfeLNm21CTzZ_Ptr0NXU,101
110
+ kumoai-2.14.0.dev202512111731.dist-info/top_level.txt,sha256=YjU6UcmomoDx30vEXLsOU784ED7VztQOsFApk1SFwvs,7
111
+ kumoai-2.14.0.dev202512111731.dist-info/RECORD,,
@@ -1,223 +0,0 @@
1
- import re
2
- from typing import Dict, List, Optional, Tuple
3
-
4
- import numpy as np
5
- import pandas as pd
6
- from kumoapi.rfm.context import EdgeLayout, Link, Subgraph, Table
7
- from kumoapi.typing import Stype
8
-
9
- import kumoai.kumolib as kumolib
10
- from kumoai.experimental.rfm.local_graph_store import LocalGraphStore
11
-
12
- PUNCTUATION = re.compile(r"[\'\"\.,\(\)\!\?\;\:]")
13
- MULTISPACE = re.compile(r"\s+")
14
-
15
-
16
- def normalize_text(
17
- ser: pd.Series,
18
- max_words: Optional[int] = 50,
19
- ) -> pd.Series:
20
- r"""Normalizes text into a list of lower-case words.
21
-
22
- Args:
23
- ser: The :class:`pandas.Series` to normalize.
24
- max_words: The maximum number of words to return.
25
- This will auto-shrink any large text column to avoid blowing up
26
- context size.
27
- """
28
- if len(ser) == 0 or pd.api.types.is_list_like(ser.iloc[0]):
29
- return ser
30
-
31
- def normalize_fn(line: str) -> list[str]:
32
- line = PUNCTUATION.sub(" ", line)
33
- line = re.sub(r"<br\s*/?>", " ", line) # Handle <br /> or <br>
34
- line = MULTISPACE.sub(" ", line)
35
- words = line.split()
36
- if max_words is not None:
37
- words = words[:max_words]
38
- return words
39
-
40
- ser = ser.fillna('').astype(str)
41
-
42
- if max_words is not None:
43
- # We estimate the number of words as 5 characters + 1 space in an
44
- # English text on average. We need this pre-filter here, as word
45
- # splitting on a giant text can be very expensive:
46
- ser = ser.str[:6 * max_words]
47
-
48
- ser = ser.str.lower()
49
- ser = ser.map(normalize_fn)
50
-
51
- return ser
52
-
53
-
54
- class LocalGraphSampler:
55
- def __init__(self, graph_store: LocalGraphStore) -> None:
56
- self._graph_store = graph_store
57
- self._sampler = kumolib.NeighborSampler(
58
- self._graph_store.node_types,
59
- self._graph_store.edge_types,
60
- {
61
- '__'.join(edge_type): colptr
62
- for edge_type, colptr in self._graph_store.colptr_dict.items()
63
- },
64
- {
65
- '__'.join(edge_type): row
66
- for edge_type, row in self._graph_store.row_dict.items()
67
- },
68
- self._graph_store.time_dict,
69
- )
70
-
71
- def __call__(
72
- self,
73
- entity_table_names: Tuple[str, ...],
74
- node: np.ndarray,
75
- time: np.ndarray,
76
- num_neighbors: List[int],
77
- exclude_cols_dict: Dict[str, List[str]],
78
- ) -> Subgraph:
79
-
80
- (
81
- row_dict,
82
- col_dict,
83
- node_dict,
84
- batch_dict,
85
- num_sampled_nodes_dict,
86
- num_sampled_edges_dict,
87
- ) = self._sampler.sample(
88
- {
89
- '__'.join(edge_type): num_neighbors
90
- for edge_type in self._graph_store.edge_types
91
- },
92
- {}, # time interval based sampling
93
- entity_table_names[0],
94
- node,
95
- time // 1000**3, # nanoseconds to seconds
96
- )
97
-
98
- table_dict: Dict[str, Table] = {}
99
- for table_name, node in node_dict.items():
100
- batch = batch_dict[table_name]
101
-
102
- if len(node) == 0:
103
- continue
104
-
105
- df = self._graph_store.df_dict[table_name]
106
-
107
- num_sampled_nodes = num_sampled_nodes_dict[table_name].tolist()
108
- stype_dict = { # Exclude target columns:
109
- column_name: stype
110
- for column_name, stype in
111
- self._graph_store.stype_dict[table_name].items()
112
- if column_name not in exclude_cols_dict.get(table_name, [])
113
- }
114
- primary_key: Optional[str] = None
115
- if table_name in entity_table_names:
116
- primary_key = self._graph_store.pkey_name_dict.get(table_name)
117
-
118
- columns: List[str] = []
119
- if table_name in entity_table_names:
120
- columns += [self._graph_store.pkey_name_dict[table_name]]
121
- columns += list(stype_dict.keys())
122
-
123
- if len(columns) == 0:
124
- table_dict[table_name] = Table(
125
- df=pd.DataFrame(index=range(len(node))),
126
- row=None,
127
- batch=batch,
128
- num_sampled_nodes=num_sampled_nodes,
129
- stype_dict=stype_dict,
130
- primary_key=primary_key,
131
- )
132
- continue
133
-
134
- row: Optional[np.ndarray] = None
135
- if table_name in self._graph_store.end_time_column_dict:
136
- # Set end time to NaT for all values greater than anchor time:
137
- df = df.iloc[node].reset_index(drop=True)
138
- col_name = self._graph_store.end_time_column_dict[table_name]
139
- ser = df[col_name]
140
- value = ser.astype('datetime64[ns]').astype(int).to_numpy()
141
- mask = value > time[batch]
142
- df.loc[mask, col_name] = pd.NaT
143
- else:
144
- # Only store unique rows in `df` above a certain threshold:
145
- unique_node, inverse = np.unique(node, return_inverse=True)
146
- if len(node) > 1.05 * len(unique_node):
147
- df = df.iloc[unique_node].reset_index(drop=True)
148
- row = inverse
149
- else:
150
- df = df.iloc[node].reset_index(drop=True)
151
-
152
- # Filter data frame to minimal set of columns:
153
- df = df[columns]
154
-
155
- # Normalize text (if not already pre-processed):
156
- for column_name, stype in stype_dict.items():
157
- if stype == Stype.text:
158
- df[column_name] = normalize_text(df[column_name])
159
-
160
- table_dict[table_name] = Table(
161
- df=df,
162
- row=row,
163
- batch=batch,
164
- num_sampled_nodes=num_sampled_nodes,
165
- stype_dict=stype_dict,
166
- primary_key=primary_key,
167
- )
168
-
169
- link_dict: Dict[Tuple[str, str, str], Link] = {}
170
- for edge_type in self._graph_store.edge_types:
171
- edge_type_str = '__'.join(edge_type)
172
-
173
- row = row_dict[edge_type_str]
174
- col = col_dict[edge_type_str]
175
-
176
- if len(row) == 0:
177
- continue
178
-
179
- # Do not store reverse edge type if it is a replica:
180
- rev_edge_type = Subgraph.rev_edge_type(edge_type)
181
- rev_edge_type_str = '__'.join(rev_edge_type)
182
- if (rev_edge_type in link_dict
183
- and np.array_equal(row, col_dict[rev_edge_type_str])
184
- and np.array_equal(col, row_dict[rev_edge_type_str])):
185
- link = Link(
186
- layout=EdgeLayout.REV,
187
- row=None,
188
- col=None,
189
- num_sampled_edges=(
190
- num_sampled_edges_dict[edge_type_str].tolist()),
191
- )
192
- link_dict[edge_type] = link
193
- continue
194
-
195
- layout = EdgeLayout.COO
196
- if np.array_equal(row, np.arange(len(row))):
197
- row = None
198
- if np.array_equal(col, np.arange(len(col))):
199
- col = None
200
-
201
- # Store in compressed representation if more efficient:
202
- num_cols = table_dict[edge_type[2]].num_rows
203
- if col is not None and len(col) > num_cols + 1:
204
- layout = EdgeLayout.CSC
205
- colcount = np.bincount(col, minlength=num_cols)
206
- col = np.empty(num_cols + 1, dtype=col.dtype)
207
- col[0] = 0
208
- np.cumsum(colcount, out=col[1:])
209
-
210
- link = Link(
211
- layout=layout,
212
- row=row,
213
- col=col,
214
- num_sampled_edges=(
215
- num_sampled_edges_dict[edge_type_str].tolist()),
216
- )
217
- link_dict[edge_type] = link
218
-
219
- return Subgraph(
220
- anchor_time=time,
221
- table_dict=table_dict,
222
- link_dict=link_dict,
223
- )