kumoai 2.7.0.dev202508201830__cp312-cp312-win_amd64.whl → 2.12.0.dev202511111731__cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. kumoai/__init__.py +4 -2
  2. kumoai/_version.py +1 -1
  3. kumoai/client/client.py +10 -5
  4. kumoai/client/endpoints.py +1 -0
  5. kumoai/client/rfm.py +37 -8
  6. kumoai/connector/file_upload_connector.py +94 -85
  7. kumoai/connector/snowflake_connector.py +9 -0
  8. kumoai/connector/utils.py +1377 -209
  9. kumoai/experimental/rfm/__init__.py +5 -3
  10. kumoai/experimental/rfm/authenticate.py +8 -5
  11. kumoai/experimental/rfm/infer/timestamp.py +7 -4
  12. kumoai/experimental/rfm/local_graph.py +96 -82
  13. kumoai/experimental/rfm/local_graph_sampler.py +16 -8
  14. kumoai/experimental/rfm/local_graph_store.py +32 -10
  15. kumoai/experimental/rfm/local_pquery_driver.py +342 -46
  16. kumoai/experimental/rfm/local_table.py +142 -45
  17. kumoai/experimental/rfm/pquery/__init__.py +4 -4
  18. kumoai/experimental/rfm/pquery/{backend.py → executor.py} +28 -58
  19. kumoai/experimental/rfm/pquery/pandas_executor.py +532 -0
  20. kumoai/experimental/rfm/rfm.py +535 -125
  21. kumoai/experimental/rfm/utils.py +0 -3
  22. kumoai/jobs.py +27 -1
  23. kumoai/kumolib.cp312-win_amd64.pyd +0 -0
  24. kumoai/pquery/prediction_table.py +5 -3
  25. kumoai/pquery/training_table.py +5 -3
  26. kumoai/trainer/job.py +9 -30
  27. kumoai/trainer/trainer.py +19 -10
  28. kumoai/utils/__init__.py +2 -1
  29. kumoai/utils/progress_logger.py +96 -16
  30. {kumoai-2.7.0.dev202508201830.dist-info → kumoai-2.12.0.dev202511111731.dist-info}/METADATA +4 -5
  31. {kumoai-2.7.0.dev202508201830.dist-info → kumoai-2.12.0.dev202511111731.dist-info}/RECORD +34 -34
  32. kumoai/experimental/rfm/pquery/pandas_backend.py +0 -437
  33. {kumoai-2.7.0.dev202508201830.dist-info → kumoai-2.12.0.dev202511111731.dist-info}/WHEEL +0 -0
  34. {kumoai-2.7.0.dev202508201830.dist-info → kumoai-2.12.0.dev202511111731.dist-info}/licenses/LICENSE +0 -0
  35. {kumoai-2.7.0.dev202508201830.dist-info → kumoai-2.12.0.dev202511111731.dist-info}/top_level.txt +0 -0
kumoai/experimental/rfm/pquery/pandas_backend.py (deleted)
@@ -1,437 +0,0 @@
- from typing import Dict, Optional, Tuple, Union
-
- import numpy as np
- import pandas as pd
- from kumoapi.rfm import PQueryDefinition
- from kumoapi.rfm.pquery import (
-     Aggregation,
-     AggregationType,
-     BoolOp,
-     Column,
-     Condition,
-     Filter,
-     Float,
-     FloatList,
-     Int,
-     IntList,
-     LogicalOperation,
-     MemberOp,
-     RelOp,
-     Str,
-     StrList,
- )
-
- from kumoai.experimental.rfm.pquery import PQueryBackend
-
-
- class PQueryPandasBackend(PQueryBackend[pd.DataFrame, pd.Series, np.ndarray]):
-     def eval_aggregation_type(
-         self,
-         op: AggregationType,
-         feat: Optional[pd.Series],
-         batch: np.ndarray,
-         batch_size: int,
-         filter_na: bool = True,
-     ) -> Tuple[pd.Series, np.ndarray]:
-
-         if op != AggregationType.COUNT:
-             assert feat is not None
-
-         if feat is not None:
-             mask = feat.notna()
-             feat, batch = feat[mask], batch[mask]
-
-         if op == AggregationType.LIST_DISTINCT:
-             df = pd.DataFrame(dict(feat=feat, batch=batch))
-             df = df.drop_duplicates()
-             out = df.groupby('batch')['feat'].agg(list)
-
-         else:
-             df = pd.DataFrame(dict(feat=feat, batch=batch))
-             if op == AggregationType.AVG:
-                 agg = 'mean'
-             elif op == AggregationType.COUNT:
-                 agg = 'size'
-             else:
-                 agg = op.lower()
-             out = df.groupby('batch')['feat'].agg(agg)
-
-             if not pd.api.types.is_datetime64_any_dtype(out):
-                 out = out.astype('float32')
-
-         out.name = None
-         out.index.name = None
-
-         if op in {AggregationType.SUM, AggregationType.COUNT}:
-             out = out.reindex(range(batch_size), fill_value=0)
-             mask = np.ones(batch_size, dtype=bool)
-             return out, mask
-
-         mask = np.zeros(batch_size, dtype=bool)
-         mask[batch] = True
-
-         if filter_na:
-             return out.reset_index(drop=True), mask
-
-         out = out.reindex(range(batch_size), fill_value=pd.NA)
-
-         return out, mask
-
-     def eval_rel_op(
-         self,
-         left: pd.Series,
-         op: RelOp,
-         right: Union[Int, Float, Str, None],
-     ) -> pd.Series:
-
-         if right is None:
-             if op == RelOp.EQ:
-                 return left.isna()
-             assert op == RelOp.NEQ
-             return left.notna()
-
-         value = pd.Series([right.value], dtype=left.dtype).iloc[0]
-
-         if op == RelOp.EQ:
-             return (left == value).fillna(False).astype(bool)
-         if op == RelOp.NEQ:
-             out = (left != value).fillna(False).astype(bool)
-             out[left.isna()] = False  # N/A != right should always be `False`.
-             return out
-         if op == RelOp.LEQ:
-             return (left <= value).fillna(False).astype(bool)
-         if op == RelOp.GEQ:
-             return (left >= value).fillna(False).astype(bool)
-         if op == RelOp.LT:
-             return (left < value).fillna(False).astype(bool)
-         if op == RelOp.GT:
-             return (left > value).fillna(False).astype(bool)
-
-         raise NotImplementedError(f"Operator '{op}' not implemented")
-
-     def eval_member_op(
-         self,
-         left: pd.Series,
-         op: MemberOp,
-         right: Union[IntList, FloatList, StrList],
-     ) -> pd.Series:
-
-         if op == MemberOp.IN:
-             ser = pd.Series(right.value, dtype=left.dtype)
-             return left.isin(ser).astype(bool)
-
-         raise NotImplementedError(f"Operator '{op}' not implemented")
-
-     def eval_bool_op(
-         self,
-         left: pd.Series,
-         op: BoolOp,
-         right: Optional[pd.Series],
-     ) -> pd.Series:
-
-         # TODO Implement Kleene-Priest three-value logic.
-         if op == BoolOp.AND:
-             assert right is not None
-             return left & right
-         if op == BoolOp.OR:
-             assert right is not None
-             return left | right
-         if op == BoolOp.NOT:
-             return ~left
-
-         raise NotImplementedError(f"Operator '{op}' not implemented")
-
-     def eval_column(
-         self,
-         column: Column,
-         feat_dict: Dict[str, pd.DataFrame],
-         filter_na: bool = True,
-     ) -> Tuple[pd.Series, np.ndarray]:
-
-         out = feat_dict[column.table_name][column.column_name]
-         out = out.reset_index(drop=True)
-
-         if pd.api.types.is_float_dtype(out):
-             out = out.astype('float32')
-
-         out.name = None
-         out.index.name = None
-
-         mask = out.notna().to_numpy()
-
-         if not filter_na:
-             return out, mask
-
-         out = out[mask].reset_index(drop=True)
-
-         # Cast to primitive dtype:
-         if pd.api.types.is_integer_dtype(out):
-             out = out.astype('int64')
-         elif pd.api.types.is_bool_dtype(out):
-             out = out.astype('bool')
-
-         return out, mask
-
-     def eval_aggregation(
-         self,
-         aggr: Aggregation,
-         feat_dict: Dict[str, pd.DataFrame],
-         time_dict: Dict[str, pd.Series],
-         batch_dict: Dict[str, np.ndarray],
-         anchor_time: pd.Series,
-         filter_na: bool = True,
-     ) -> Tuple[pd.Series, np.ndarray]:
-
-         target_table = aggr.column.table_name
-         target_batch = batch_dict[target_table]
-         target_time = time_dict[target_table]
-         anchor_target_time = anchor_time[target_batch].reset_index(drop=True)
-
-         target_mask = target_time <= anchor_target_time + aggr.end_offset
-
-         if aggr.start is not None:
-             start_offset = aggr.start * aggr.time_unit.to_offset()
-             target_mask &= target_time > anchor_target_time + start_offset
-
-         if aggr.filter is not None:
-             target_mask &= self.eval_filter(
-                 filter=aggr.filter,
-                 feat_dict=feat_dict,
-                 time_dict=time_dict,
-                 batch_dict=batch_dict,
-                 anchor_time=anchor_time,
-             )
-
-         if (aggr.type == AggregationType.COUNT
-                 and aggr.column.column_name == '*'):
-             target_feat = None
-         else:
-             target_feat, _ = self.eval_column(
-                 aggr.column,
-                 feat_dict,
-                 filter_na=False,
-             )
-             target_feat = target_feat[target_mask]
-
-         return self.eval_aggregation_type(
-             aggr.type,
-             feat=target_feat,
-             batch=target_batch[target_mask],
-             batch_size=len(anchor_time),
-             filter_na=filter_na,
-         )
-
-     def eval_condition(
-         self,
-         condition: Condition,
-         feat_dict: Dict[str, pd.DataFrame],
-         time_dict: Dict[str, pd.Series],
-         batch_dict: Dict[str, np.ndarray],
-         anchor_time: pd.Series,
-         filter_na: bool = True,
-     ) -> Tuple[pd.Series, np.ndarray]:
-
-         if isinstance(condition.left, Column):
-             left, mask = self.eval_column(
-                 column=condition.left,
-                 feat_dict=feat_dict,
-                 filter_na=filter_na if condition.right is not None else False,
-             )
-         else:
-             assert isinstance(condition.left, Aggregation)
-             left, mask = self.eval_aggregation(
-                 aggr=condition.left,
-                 feat_dict=feat_dict,
-                 time_dict=time_dict,
-                 batch_dict=batch_dict,
-                 anchor_time=anchor_time,
-                 filter_na=filter_na if condition.right is not None else False,
-             )
-
-         if filter_na and condition.right is None:
-             mask = np.ones(len(left), dtype=bool)
-
-         if isinstance(condition.op, RelOp):
-             out = self.eval_rel_op(
-                 left=left,
-                 op=condition.op,
-                 right=condition.right,
-             )
-         else:
-             assert isinstance(condition.op, MemberOp)
-             out = self.eval_member_op(
-                 left=left,
-                 op=condition.op,
-                 right=condition.right,
-             )
-
-         return out, mask
-
-     def eval_logical_operation(
-         self,
-         logical_operation: LogicalOperation,
-         feat_dict: Dict[str, pd.DataFrame],
-         time_dict: Dict[str, pd.Series],
-         batch_dict: Dict[str, np.ndarray],
-         anchor_time: pd.Series,
-         filter_na: bool = True,
-     ) -> Tuple[pd.Series, np.ndarray]:
-
-         if isinstance(logical_operation.left, Condition):
-             left, mask = self.eval_condition(
-                 condition=logical_operation.left,
-                 feat_dict=feat_dict,
-                 time_dict=time_dict,
-                 batch_dict=batch_dict,
-                 anchor_time=anchor_time,
-                 filter_na=False,
-             )
-         else:
-             assert isinstance(logical_operation.left, LogicalOperation)
-             left, mask = self.eval_logical_operation(
-                 logical_operation=logical_operation.left,
-                 feat_dict=feat_dict,
-                 time_dict=time_dict,
-                 batch_dict=batch_dict,
-                 anchor_time=anchor_time,
-                 filter_na=False,
-             )
-
-         right = right_mask = None
-         if isinstance(logical_operation.right, Condition):
-             right, right_mask = self.eval_condition(
-                 condition=logical_operation.right,
-                 feat_dict=feat_dict,
-                 time_dict=time_dict,
-                 batch_dict=batch_dict,
-                 anchor_time=anchor_time,
-                 filter_na=False,
-             )
-         elif isinstance(logical_operation.right, LogicalOperation):
-             right, right_mask = self.eval_logical_operation(
-                 logical_operation=logical_operation.right,
-                 feat_dict=feat_dict,
-                 time_dict=time_dict,
-                 batch_dict=batch_dict,
-                 anchor_time=anchor_time,
-                 filter_na=False,
-             )
-
-         out = self.eval_bool_op(left, logical_operation.op, right)
-
-         if right_mask is not None:
-             mask &= right_mask
-
-         if filter_na:
-             out = out[mask].reset_index(drop=True)
-
-         return out, mask
-
-     def eval_filter(
-         self,
-         filter: Filter,
-         feat_dict: Dict[str, pd.DataFrame],
-         time_dict: Dict[str, pd.Series],
-         batch_dict: Dict[str, np.ndarray],
-         anchor_time: pd.Series,
-     ) -> np.ndarray:
-         if isinstance(filter.condition, Condition):
-             return self.eval_condition(
-                 condition=filter.condition,
-                 feat_dict=feat_dict,
-                 time_dict=time_dict,
-                 batch_dict=batch_dict,
-                 anchor_time=anchor_time,
-                 filter_na=False,
-             )[0].to_numpy()
-         else:
-             assert isinstance(filter.condition, LogicalOperation)
-             return self.eval_logical_operation(
-                 logical_operation=filter.condition,
-                 feat_dict=feat_dict,
-                 time_dict=time_dict,
-                 batch_dict=batch_dict,
-                 anchor_time=anchor_time,
-                 filter_na=False,
-             )[0].to_numpy()
-
-     def eval_pquery(
-         self,
-         query: PQueryDefinition,
-         feat_dict: Dict[str, pd.DataFrame],
-         time_dict: Dict[str, pd.Series],
-         batch_dict: Dict[str, np.ndarray],
-         anchor_time: pd.Series,
-     ) -> Tuple[pd.Series, np.ndarray]:
-
-         mask = np.ones(len(anchor_time), dtype=bool)
-
-         if query.entity.filter is not None:
-             mask &= self.eval_filter(
-                 filter=query.entity.filter,
-                 feat_dict=feat_dict,
-                 time_dict=time_dict,
-                 batch_dict=batch_dict,
-                 anchor_time=anchor_time,
-             )
-
-         if getattr(query, 'assuming', None) is not None:
-             if isinstance(query.assuming, Condition):
-                 mask &= self.eval_condition(
-                     condition=query.assuming,
-                     feat_dict=feat_dict,
-                     time_dict=time_dict,
-                     batch_dict=batch_dict,
-                     anchor_time=anchor_time,
-                     filter_na=False,
-                 )[0].to_numpy()
-             else:
-                 assert isinstance(query.assuming, LogicalOperation)
-                 mask &= self.eval_logical_operation(
-                     logical_operation=query.assuming,
-                     feat_dict=feat_dict,
-                     time_dict=time_dict,
-                     batch_dict=batch_dict,
-                     anchor_time=anchor_time,
-                     filter_na=False,
-                 )[0].to_numpy()
-
-         if isinstance(query.target, Column):
-             out, _mask = self.eval_column(
-                 column=query.target,
-                 feat_dict=feat_dict,
-                 filter_na=True,
-             )
-         elif isinstance(query.target, Aggregation):
-             out, _mask = self.eval_aggregation(
-                 aggr=query.target,
-                 feat_dict=feat_dict,
-                 time_dict=time_dict,
-                 batch_dict=batch_dict,
-                 anchor_time=anchor_time,
-                 filter_na=True,
-             )
-         elif isinstance(query.target, Condition):
-             out, _mask = self.eval_condition(
-                 condition=query.target,
-                 feat_dict=feat_dict,
-                 time_dict=time_dict,
-                 batch_dict=batch_dict,
-                 anchor_time=anchor_time,
-                 filter_na=True,
-             )
-         else:
-             assert isinstance(query.target, LogicalOperation)
-             out, _mask = self.eval_logical_operation(
-                 logical_operation=query.target,
-                 feat_dict=feat_dict,
-                 time_dict=time_dict,
-                 batch_dict=batch_dict,
-                 anchor_time=anchor_time,
-                 filter_na=True,
-             )
-
-         out = out[mask[_mask]]
-         mask &= _mask
-
-         return out, mask
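
For readers skimming this removal: the deleted PQueryPandasBackend grouped target rows by their anchor ("batch") index and aggregated them with pandas groupby, and it coerced NA results of relational comparisons to False. The following is a minimal, self-contained sketch of those two patterns outside the package; all variable names below are illustrative and not part of kumoai.

import numpy as np
import pandas as pd

# Batch-wise aggregation, as in the removed eval_aggregation_type:
# each target value belongs to one anchor row ('batch'); NA values are
# dropped, and SUM/COUNT results are re-indexed so anchors with no rows get 0.
feat = pd.Series([1.0, 2.0, None, 4.0])  # per-target-row feature values
batch = np.array([0, 0, 1, 2])           # anchor row each value belongs to
batch_size = 4                           # total number of anchor rows

valid = feat.notna()
feat, batch = feat[valid], batch[valid]
df = pd.DataFrame(dict(feat=feat, batch=batch))
out = df.groupby('batch')['feat'].agg('sum').astype('float32')
out = out.reindex(range(batch_size), fill_value=0)
print(out.tolist())  # [3.0, 0.0, 4.0, 0.0]

# NA handling in the removed eval_rel_op: comparisons involving NA become False.
left = pd.Series([1.0, None, 3.0], dtype='Float64')
print((left > 2.0).fillna(False).astype(bool).tolist())  # [False, False, True]

Judging from the file list above, the corresponding logic presumably now lives in the new kumoai/experimental/rfm/pquery/pandas_executor.py, alongside the backend.py → executor.py rename.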