kumoai 2.12.0.dev202511031731__cp313-cp313-macosx_11_0_arm64.whl → 2.13.0.dev202512061731__cp313-cp313-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43):
  1. kumoai/__init__.py +18 -9
  2. kumoai/_version.py +1 -1
  3. kumoai/client/client.py +9 -13
  4. kumoai/client/endpoints.py +1 -0
  5. kumoai/client/rfm.py +35 -7
  6. kumoai/connector/utils.py +23 -2
  7. kumoai/experimental/rfm/__init__.py +164 -46
  8. kumoai/experimental/rfm/backend/__init__.py +0 -0
  9. kumoai/experimental/rfm/backend/local/__init__.py +42 -0
  10. kumoai/experimental/rfm/{local_graph_store.py → backend/local/graph_store.py} +20 -30
  11. kumoai/experimental/rfm/backend/local/sampler.py +131 -0
  12. kumoai/experimental/rfm/backend/local/table.py +109 -0
  13. kumoai/experimental/rfm/backend/snow/__init__.py +35 -0
  14. kumoai/experimental/rfm/backend/snow/table.py +117 -0
  15. kumoai/experimental/rfm/backend/sqlite/__init__.py +30 -0
  16. kumoai/experimental/rfm/backend/sqlite/table.py +101 -0
  17. kumoai/experimental/rfm/base/__init__.py +14 -0
  18. kumoai/experimental/rfm/base/column.py +66 -0
  19. kumoai/experimental/rfm/base/sampler.py +287 -0
  20. kumoai/experimental/rfm/base/source.py +18 -0
  21. kumoai/experimental/rfm/{local_table.py → base/table.py} +139 -139
  22. kumoai/experimental/rfm/{local_graph.py → graph.py} +334 -79
  23. kumoai/experimental/rfm/infer/__init__.py +6 -0
  24. kumoai/experimental/rfm/infer/dtype.py +79 -0
  25. kumoai/experimental/rfm/infer/pkey.py +126 -0
  26. kumoai/experimental/rfm/infer/time_col.py +62 -0
  27. kumoai/experimental/rfm/local_graph_sampler.py +43 -4
  28. kumoai/experimental/rfm/local_pquery_driver.py +222 -27
  29. kumoai/experimental/rfm/pquery/__init__.py +0 -4
  30. kumoai/experimental/rfm/pquery/pandas_executor.py +34 -8
  31. kumoai/experimental/rfm/rfm.py +153 -96
  32. kumoai/experimental/rfm/sagemaker.py +138 -0
  33. kumoai/spcs.py +1 -3
  34. kumoai/testing/decorators.py +1 -1
  35. kumoai/utils/progress_logger.py +10 -4
  36. {kumoai-2.12.0.dev202511031731.dist-info → kumoai-2.13.0.dev202512061731.dist-info}/METADATA +12 -2
  37. {kumoai-2.12.0.dev202511031731.dist-info → kumoai-2.13.0.dev202512061731.dist-info}/RECORD +40 -27
  38. kumoai/experimental/rfm/pquery/backend.py +0 -136
  39. kumoai/experimental/rfm/pquery/pandas_backend.py +0 -478
  40. kumoai/experimental/rfm/utils.py +0 -344
  41. {kumoai-2.12.0.dev202511031731.dist-info → kumoai-2.13.0.dev202512061731.dist-info}/WHEEL +0 -0
  42. {kumoai-2.12.0.dev202511031731.dist-info → kumoai-2.13.0.dev202512061731.dist-info}/licenses/LICENSE +0 -0
  43. {kumoai-2.12.0.dev202511031731.dist-info → kumoai-2.13.0.dev202512061731.dist-info}/top_level.txt +0 -0
@@ -1,478 +0,0 @@
1
- from typing import Dict, List, Optional, Tuple, Union
2
-
3
- import numpy as np
4
- import pandas as pd
5
- from kumoapi.rfm import PQueryDefinition
6
- from kumoapi.rfm.pquery import (
7
- Aggregation,
8
- AggregationType,
9
- BoolOp,
10
- Column,
11
- Condition,
12
- Filter,
13
- Float,
14
- FloatList,
15
- Int,
16
- IntList,
17
- LogicalOperation,
18
- MemberOp,
19
- RelOp,
20
- Str,
21
- StrList,
22
- )
23
-
24
- from kumoai.experimental.rfm.pquery import PQueryBackend
25
-
26
-
27
- class PQueryPandasBackend(PQueryBackend[pd.DataFrame, pd.Series, np.ndarray]):
28
def eval_aggregation_type(
    self,
    op: AggregationType,
    feat: Optional[pd.Series],
    batch: np.ndarray,
    batch_size: int,
    filter_na: bool = True,
) -> Tuple[pd.Series, np.ndarray]:
    """Aggregate ``feat`` per batch index according to ``op``.

    Rows whose feature value is missing are dropped (together with the
    matching ``batch`` entries) before grouping.

    Args:
        op: The aggregation to apply.
        feat: The values to aggregate. May only be ``None`` for
            ``AggregationType.COUNT``.
        batch: The group key of every row; used both as the ``groupby``
            key and to index the returned mask.
        batch_size: The number of output slots.
        filter_na: If ``True``, only groups that received at least one
            row are returned (re-indexed from zero). Ignored for ``SUM``
            and ``COUNT``, which always return ``batch_size`` entries.

    Returns:
        A tuple ``(out, mask)``. For ``SUM`` and ``COUNT``, ``out`` is
        re-indexed to ``range(batch_size)`` with zeros for empty groups
        and ``mask`` is all ``True``. Otherwise, ``mask`` marks the batch
        indices that received at least one non-missing row, and ``out``
        is either compacted (``filter_na=True``) or re-indexed to
        ``range(batch_size)`` with ``pd.NA`` for empty groups.
    """
    if op != AggregationType.COUNT:
        assert feat is not None

    if feat is not None:
        valid = feat.notna()
        feat = feat[valid]
        batch = batch[valid]

    frame = pd.DataFrame({'feat': feat, 'batch': batch})

    if op == AggregationType.LIST_DISTINCT:
        # De-duplicate (feat, batch) pairs first so each list is distinct.
        out = frame.drop_duplicates().groupby('batch')['feat'].agg(list)
    else:
        if op == AggregationType.AVG:
            how = 'mean'
        elif op == AggregationType.COUNT:
            how = 'size'
        else:
            how = op.lower()
        out = frame.groupby('batch')['feat'].agg(how)

        # Keep datetime results untouched; everything else becomes float32.
        if not pd.api.types.is_datetime64_any_dtype(out):
            out = out.astype('float32')

    out.name = None
    out.index.name = None

    if op in (AggregationType.SUM, AggregationType.COUNT):
        # Empty groups have a well-defined value (zero) for these two.
        dense = out.reindex(range(batch_size), fill_value=0)
        return dense, np.ones(batch_size, dtype=bool)

    present = np.zeros(batch_size, dtype=bool)
    present[batch] = True

    if not filter_na:
        return out.reindex(range(batch_size), fill_value=pd.NA), present

    return out.reset_index(drop=True), present
79
-
80
def eval_rel_op(
    self,
    left: pd.Series,
    op: RelOp,
    right: Union[Int, Float, Str, None],
) -> pd.Series:
    """Compare every entry of ``left`` against the literal ``right``.

    Args:
        left: The values on the left-hand side of the comparison.
        op: The relational operator to apply.
        right: The literal to compare against. ``None`` denotes a
            missing value, in which case only ``RelOp.EQ`` ("is N/A")
            and ``RelOp.NEQ`` ("is not N/A") are supported.

    Returns:
        A boolean series. When ``right`` is a literal, a missing entry of
        ``left`` always compares as ``False``.

    Raises:
        NotImplementedError: If ``op`` is not supported for the given
            right-hand side.
    """
    if right is None:
        if op == RelOp.EQ:
            return left.isna()
        if op == RelOp.NEQ:
            return left.notna()
        # Raise explicitly instead of asserting: an `assert` is stripped
        # under `python -O`, which would silently turn every other operator
        # into a "not N/A" check.
        raise NotImplementedError(f"Operator '{op}' not implemented")

    # Coerce the literal to the dtype of `left` before comparing.
    value = pd.Series([right.value], dtype=left.dtype).iloc[0]

    if op == RelOp.EQ:
        out = left == value
    elif op == RelOp.NEQ:
        out = left != value
    elif op == RelOp.LEQ:
        out = left <= value
    elif op == RelOp.GEQ:
        out = left >= value
    elif op == RelOp.LT:
        out = left < value
    elif op == RelOp.GT:
        out = left > value
    else:
        raise NotImplementedError(f"Operator '{op}' not implemented")

    out = out.fillna(False).astype(bool)
    if op == RelOp.NEQ:
        out[left.isna()] = False  # N/A != right should always be `False`.
    return out
111
-
112
def eval_member_op(
    self,
    left: pd.Series,
    op: MemberOp,
    right: Union[IntList, FloatList, StrList],
) -> pd.Series:
    """Test every entry of ``left`` for membership in the list ``right``.

    Args:
        left: The values to test.
        op: The membership operator; only ``MemberOp.IN`` is handled.
        right: The list literal whose ``value`` holds the candidates.

    Returns:
        A boolean series with one entry per entry of ``left``.

    Raises:
        NotImplementedError: If ``op`` is not ``MemberOp.IN``.
    """
    if op != MemberOp.IN:
        raise NotImplementedError(f"Operator '{op}' not implemented")

    # Build the candidates with the dtype of `left` before matching.
    candidates = pd.Series(right.value, dtype=left.dtype)
    return left.isin(candidates).astype(bool)
124
-
125
def eval_bool_op(
    self,
    left: pd.Series,
    op: BoolOp,
    right: Optional[pd.Series],
) -> pd.Series:
    """Combine boolean series with a logical operator.

    Args:
        left: The left-hand (or only) operand.
        op: One of ``BoolOp.AND``, ``BoolOp.OR`` or ``BoolOp.NOT``.
        right: The right-hand operand; required for ``AND`` and ``OR``
            and unused for ``NOT``.

    Returns:
        The element-wise result of applying ``op``.

    Raises:
        NotImplementedError: If ``op`` is none of the operators above.
    """
    # TODO Implement Kleene-Priest three-value logic.
    if op == BoolOp.NOT:
        return ~left

    if op == BoolOp.AND:
        assert right is not None
        return left & right

    if op == BoolOp.OR:
        assert right is not None
        return left | right

    raise NotImplementedError(f"Operator '{op}' not implemented")
143
-
144
def eval_column(
    self,
    column: Column,
    feat_dict: Dict[str, pd.DataFrame],
    filter_na: bool = True,
) -> Tuple[pd.Series, np.ndarray]:
    """Look up a column and report which of its rows are present.

    Args:
        column: Identifies the column through its ``table_name`` and
            ``column_name``.
        feat_dict: The data frame of every table, keyed by table name.
        filter_na: If ``True``, missing rows are removed from the
            returned series.

    Returns:
        A tuple ``(out, mask)``. ``mask`` has one entry per row of the
        table and is ``True`` where the value is not missing. ``out`` is
        unnamed and indexed from zero; floating-point data is cast to
        ``float32``. With ``filter_na=True``, ``out`` only holds the
        non-missing rows, and integer/boolean data is cast to ``int64``
        / ``bool``.
    """
    table = feat_dict[column.table_name]
    series = table[column.column_name].reset_index(drop=True)

    if pd.api.types.is_float_dtype(series):
        series = series.astype('float32')

    series.name = None
    series.index.name = None

    mask = series.notna().to_numpy()

    if filter_na:
        series = series[mask].reset_index(drop=True)

        # Cast to primitive dtype:
        if pd.api.types.is_integer_dtype(series):
            series = series.astype('int64')
        elif pd.api.types.is_bool_dtype(series):
            series = series.astype('bool')

    return series, mask
174
-
175
def eval_aggregation(
    self,
    aggr: Aggregation,
    feat_dict: Dict[str, pd.DataFrame],
    time_dict: Dict[str, pd.Series],
    batch_dict: Dict[str, np.ndarray],
    anchor_time: pd.Series,
    filter_na: bool = True,
    num_forecasts: int = 1,
) -> Tuple[pd.Series, np.ndarray]:
    """Evaluate a time-windowed aggregation for every anchor time.

    For each forecast step, the rows of the aggregated table are
    restricted to the window that ends at ``anchor + aggr.end_offset``
    and (if ``aggr.start`` is set) starts after
    ``anchor + aggr.start * aggr.time_unit``, optionally narrowed by
    ``aggr.filter``, and then aggregated per batch index. With more than
    one forecast, the anchor is advanced by ``aggr.end_offset`` after
    every step.

    Args:
        aggr: The aggregation to evaluate.
        feat_dict: The data frame of every table, keyed by table name.
        time_dict: The timestamp of every row, keyed by table name.
        batch_dict: The batch index of every row, keyed by table name.
        anchor_time: One anchor timestamp per batch entry.
        filter_na: If ``True``, entries without a result are removed
            from the returned series.
        num_forecasts: The number of consecutive windows to evaluate.

    Returns:
        A tuple ``(out, mask)``. With a single forecast this is the
        result of :meth:`eval_aggregation_type`. With several forecasts,
        ``out`` holds one list of per-step values per entry and ``mask``
        is ``True`` where any step produced a value.
    """
    table_name = aggr.column.table_name
    row_batch = batch_dict[table_name]
    row_time = time_dict[table_name]

    counts_all_rows = (aggr.type == AggregationType.COUNT
                       and aggr.column.column_name == '*')
    # Per-step results must stay aligned across steps, so individual steps
    # are never compacted when more than one forecast is requested.
    step_filter_na = False if num_forecasts > 1 else filter_na

    outs: List[pd.Series] = []
    masks: List[np.ndarray] = []
    for _ in range(num_forecasts):
        row_anchor = anchor_time[row_batch].reset_index(drop=True)

        in_window = row_time <= row_anchor + aggr.end_offset

        if aggr.start is None:
            assert num_forecasts == 1
        else:
            start_offset = aggr.start * aggr.time_unit.to_offset()
            in_window &= row_time > row_anchor + start_offset

        if aggr.filter is not None:
            in_window &= self.eval_filter(
                filter=aggr.filter,
                feat_dict=feat_dict,
                time_dict=time_dict,
                batch_dict=batch_dict,
                anchor_time=anchor_time,
            )

        if counts_all_rows:
            values = None
        else:
            values, _ = self.eval_column(
                aggr.column,
                feat_dict,
                filter_na=False,
            )
            values = values[in_window]

        step_out, step_mask = self.eval_aggregation_type(
            aggr.type,
            feat=values,
            batch=row_batch[in_window],
            batch_size=len(anchor_time),
            filter_na=step_filter_na,
        )
        outs.append(step_out)
        masks.append(step_mask)

        if num_forecasts > 1:
            anchor_time = anchor_time + aggr.end_offset

    if len(outs) == 1:
        assert len(masks) == 1
        return outs[0], masks[0]

    # Transpose the per-step series into one list of values per entry.
    out = pd.Series([list(values) for values in zip(*outs)])
    mask = np.stack(masks, axis=-1).any(axis=-1)  # type: ignore

    if filter_na:
        out = out[mask].reset_index(drop=True)

    return out, mask
248
-
249
def eval_condition(
    self,
    condition: Condition,
    feat_dict: Dict[str, pd.DataFrame],
    time_dict: Dict[str, pd.Series],
    batch_dict: Dict[str, np.ndarray],
    anchor_time: pd.Series,
    filter_na: bool = True,
    num_forecasts: int = 1,
) -> Tuple[pd.Series, np.ndarray]:
    """Evaluate a condition whose left side is a column or aggregation.

    Args:
        condition: The condition to evaluate; its operator is either a
            ``RelOp`` or a ``MemberOp``.
        feat_dict: The data frame of every table, keyed by table name.
        time_dict: The timestamp of every row, keyed by table name.
        batch_dict: The batch index of every row, keyed by table name.
        anchor_time: One anchor timestamp per batch entry.
        filter_na: If ``True``, entries whose left side is missing are
            removed, unless the condition compares against a missing
            value (``condition.right is None``), in which case every
            entry is kept and reported as valid.
        num_forecasts: Must be ``1``.

    Returns:
        A tuple ``(out, mask)`` holding the boolean result and the
        validity mask of the left-hand side.

    Raises:
        NotImplementedError: If ``num_forecasts`` is larger than one.
    """
    if num_forecasts > 1:
        raise NotImplementedError("Forecasting not yet implemented for "
                                  "non-regression tasks")

    compares_to_missing = condition.right is None
    # A comparison against N/A needs to see the missing entries itself.
    left_filter_na = False if compares_to_missing else filter_na

    if isinstance(condition.left, Column):
        left, mask = self.eval_column(
            column=condition.left,
            feat_dict=feat_dict,
            filter_na=left_filter_na,
        )
    else:
        assert isinstance(condition.left, Aggregation)
        left, mask = self.eval_aggregation(
            aggr=condition.left,
            feat_dict=feat_dict,
            time_dict=time_dict,
            batch_dict=batch_dict,
            anchor_time=anchor_time,
            filter_na=left_filter_na,
        )

    if filter_na and compares_to_missing:
        mask = np.ones(len(left), dtype=bool)

    if isinstance(condition.op, RelOp):
        result = self.eval_rel_op(
            left=left,
            op=condition.op,
            right=condition.right,
        )
    else:
        assert isinstance(condition.op, MemberOp)
        result = self.eval_member_op(
            left=left,
            op=condition.op,
            right=condition.right,
        )

    return result, mask
299
-
300
def eval_logical_operation(
    self,
    logical_operation: LogicalOperation,
    feat_dict: Dict[str, pd.DataFrame],
    time_dict: Dict[str, pd.Series],
    batch_dict: Dict[str, np.ndarray],
    anchor_time: pd.Series,
    filter_na: bool = True,
    num_forecasts: int = 1,
) -> Tuple[pd.Series, np.ndarray]:
    """Evaluate a (possibly nested) logical combination of conditions.

    Args:
        logical_operation: The operation to evaluate. Each operand is
            either a ``Condition`` or another ``LogicalOperation``; any
            other right operand (e.g., for a unary operator) is treated
            as absent.
        feat_dict: The data frame of every table, keyed by table name.
        time_dict: The timestamp of every row, keyed by table name.
        batch_dict: The batch index of every row, keyed by table name.
        anchor_time: One anchor timestamp per batch entry.
        filter_na: If ``True``, entries that are invalid in either
            operand are removed from the returned series.
        num_forecasts: Must be ``1``.

    Returns:
        A tuple ``(out, mask)`` holding the boolean result and the
        conjunction of the operands' validity masks.

    Raises:
        NotImplementedError: If ``num_forecasts`` is larger than one.
    """
    if num_forecasts > 1:
        raise NotImplementedError("Forecasting not yet implemented for "
                                  "non-regression tasks")

    def evaluate(operand: object) -> Tuple[pd.Series, np.ndarray]:
        # Operands are evaluated with `filter_na=False`; filtering is
        # applied once below, on the combined result.
        if isinstance(operand, Condition):
            return self.eval_condition(
                condition=operand,
                feat_dict=feat_dict,
                time_dict=time_dict,
                batch_dict=batch_dict,
                anchor_time=anchor_time,
                filter_na=False,
            )
        assert isinstance(operand, LogicalOperation)
        return self.eval_logical_operation(
            logical_operation=operand,
            feat_dict=feat_dict,
            time_dict=time_dict,
            batch_dict=batch_dict,
            anchor_time=anchor_time,
            filter_na=False,
        )

    left, mask = evaluate(logical_operation.left)

    right = right_mask = None
    if isinstance(logical_operation.right, (Condition, LogicalOperation)):
        right, right_mask = evaluate(logical_operation.right)

    out = self.eval_bool_op(left, logical_operation.op, right)

    if right_mask is not None:
        # Rebind instead of `mask &= right_mask`: `mask` is owned by the
        # operand evaluation and may stem from `Series.to_numpy()`, which
        # yields a read-only array under pandas Copy-on-Write, so an
        # in-place update could raise or corrupt the callee's data.
        mask = mask & right_mask

    if filter_na:
        out = out[mask].reset_index(drop=True)

    return out, mask
364
-
365
def eval_filter(
    self,
    filter: Filter,
    feat_dict: Dict[str, pd.DataFrame],
    time_dict: Dict[str, pd.Series],
    batch_dict: Dict[str, np.ndarray],
    anchor_time: pd.Series,
) -> np.ndarray:
    """Evaluate a filter into a boolean array.

    Args:
        filter: The filter whose ``condition`` is either a ``Condition``
            or a ``LogicalOperation``.
        feat_dict: The data frame of every table, keyed by table name.
        time_dict: The timestamp of every row, keyed by table name.
        batch_dict: The batch index of every row, keyed by table name.
        anchor_time: One anchor timestamp per batch entry.

    Returns:
        The unfiltered (``filter_na=False``) result of the condition as
        a NumPy array; the validity mask of the evaluation is discarded.
    """
    predicate = filter.condition

    if isinstance(predicate, Condition):
        result = self.eval_condition(
            condition=predicate,
            feat_dict=feat_dict,
            time_dict=time_dict,
            batch_dict=batch_dict,
            anchor_time=anchor_time,
            filter_na=False,
        )
    else:
        assert isinstance(predicate, LogicalOperation)
        result = self.eval_logical_operation(
            logical_operation=predicate,
            feat_dict=feat_dict,
            time_dict=time_dict,
            batch_dict=batch_dict,
            anchor_time=anchor_time,
            filter_na=False,
        )

    return result[0].to_numpy()
392
-
393
def eval_pquery(
    self,
    query: PQueryDefinition,
    feat_dict: Dict[str, pd.DataFrame],
    time_dict: Dict[str, pd.Series],
    batch_dict: Dict[str, np.ndarray],
    anchor_time: pd.Series,
    num_forecasts: int = 1,
) -> Tuple[pd.Series, np.ndarray]:
    """Evaluate a full predictive query.

    The entity filter and the optional ``assuming`` clause restrict which
    entries are kept; the target (column, aggregation, condition or
    logical operation) provides the values.

    Args:
        query: The query definition to evaluate.
        feat_dict: The data frame of every table, keyed by table name.
        time_dict: The timestamp of every row, keyed by table name.
        batch_dict: The batch index of every row, keyed by table name.
        anchor_time: One anchor timestamp per batch entry.
        num_forecasts: The number of forecast steps, forwarded to the
            target evaluation (except for plain column targets).

    Returns:
        A tuple ``(out, mask)``. ``mask`` has one entry per anchor time
        and is ``True`` where the entry passes all filters and has a
        valid target; ``out`` holds the target values of exactly those
        entries, indexed from zero.
    """
    context = dict(
        feat_dict=feat_dict,
        time_dict=time_dict,
        batch_dict=batch_dict,
        anchor_time=anchor_time,
    )

    mask = np.ones(len(anchor_time), dtype=bool)

    if query.entity.filter is not None:
        mask &= self.eval_filter(filter=query.entity.filter, **context)

    # `assuming` is looked up defensively since it may be absent.
    assuming = getattr(query, 'assuming', None)
    if assuming is not None:
        if isinstance(assuming, Condition):
            assumed = self.eval_condition(
                condition=assuming,
                filter_na=False,
                **context,
            )
        else:
            assert isinstance(assuming, LogicalOperation)
            assumed = self.eval_logical_operation(
                logical_operation=assuming,
                filter_na=False,
                **context,
            )
        mask &= assumed[0].to_numpy()

    target = query.target
    if isinstance(target, Column):
        out, _mask = self.eval_column(
            column=target,
            feat_dict=feat_dict,
            filter_na=True,
        )
    elif isinstance(target, Aggregation):
        out, _mask = self.eval_aggregation(
            aggr=target,
            filter_na=True,
            num_forecasts=num_forecasts,
            **context,
        )
    elif isinstance(target, Condition):
        out, _mask = self.eval_condition(
            condition=target,
            filter_na=True,
            num_forecasts=num_forecasts,
            **context,
        )
    else:
        assert isinstance(target, LogicalOperation)
        out, _mask = self.eval_logical_operation(
            logical_operation=target,
            filter_na=True,
            num_forecasts=num_forecasts,
            **context,
        )

    # `out` only holds entries with a valid target, so the filter mask is
    # first restricted to those entries before it is applied.
    out = out[mask[_mask]]
    mask &= _mask

    return out.reset_index(drop=True), mask