kumoai 2.13.0.dev202512011731__cp312-cp312-macosx_11_0_arm64.whl → 2.14.0.dev202512181731__cp312-cp312-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. kumoai/__init__.py +12 -0
  2. kumoai/_version.py +1 -1
  3. kumoai/client/pquery.py +6 -2
  4. kumoai/experimental/rfm/__init__.py +33 -8
  5. kumoai/experimental/rfm/authenticate.py +3 -4
  6. kumoai/experimental/rfm/backend/local/__init__.py +4 -0
  7. kumoai/experimental/rfm/{local_graph_store.py → backend/local/graph_store.py} +53 -107
  8. kumoai/experimental/rfm/backend/local/sampler.py +315 -0
  9. kumoai/experimental/rfm/backend/local/table.py +41 -80
  10. kumoai/experimental/rfm/backend/snow/__init__.py +37 -0
  11. kumoai/experimental/rfm/backend/snow/sampler.py +252 -0
  12. kumoai/experimental/rfm/backend/snow/table.py +147 -0
  13. kumoai/experimental/rfm/backend/sqlite/__init__.py +11 -2
  14. kumoai/experimental/rfm/backend/sqlite/sampler.py +349 -0
  15. kumoai/experimental/rfm/backend/sqlite/table.py +108 -88
  16. kumoai/experimental/rfm/base/__init__.py +26 -2
  17. kumoai/experimental/rfm/base/column.py +6 -12
  18. kumoai/experimental/rfm/base/column_expression.py +16 -0
  19. kumoai/experimental/rfm/base/sampler.py +773 -0
  20. kumoai/experimental/rfm/base/source.py +19 -0
  21. kumoai/experimental/rfm/base/sql_sampler.py +84 -0
  22. kumoai/experimental/rfm/base/sql_table.py +113 -0
  23. kumoai/experimental/rfm/base/table.py +174 -76
  24. kumoai/experimental/rfm/graph.py +444 -84
  25. kumoai/experimental/rfm/infer/__init__.py +6 -0
  26. kumoai/experimental/rfm/infer/dtype.py +77 -0
  27. kumoai/experimental/rfm/infer/pkey.py +128 -0
  28. kumoai/experimental/rfm/infer/time_col.py +61 -0
  29. kumoai/experimental/rfm/pquery/executor.py +27 -27
  30. kumoai/experimental/rfm/pquery/pandas_executor.py +30 -32
  31. kumoai/experimental/rfm/rfm.py +299 -240
  32. kumoai/experimental/rfm/sagemaker.py +4 -4
  33. kumoai/pquery/predictive_query.py +10 -6
  34. kumoai/testing/snow.py +50 -0
  35. kumoai/utils/__init__.py +3 -2
  36. kumoai/utils/progress_logger.py +178 -12
  37. kumoai/utils/sql.py +3 -0
  38. {kumoai-2.13.0.dev202512011731.dist-info → kumoai-2.14.0.dev202512181731.dist-info}/METADATA +6 -2
  39. {kumoai-2.13.0.dev202512011731.dist-info → kumoai-2.14.0.dev202512181731.dist-info}/RECORD +42 -30
  40. kumoai/experimental/rfm/local_graph_sampler.py +0 -182
  41. kumoai/experimental/rfm/local_pquery_driver.py +0 -689
  42. kumoai/experimental/rfm/utils.py +0 -344
  43. {kumoai-2.13.0.dev202512011731.dist-info → kumoai-2.14.0.dev202512181731.dist-info}/WHEEL +0 -0
  44. {kumoai-2.13.0.dev202512011731.dist-info → kumoai-2.14.0.dev202512181731.dist-info}/licenses/LICENSE +0 -0
  45. {kumoai-2.13.0.dev202512011731.dist-info → kumoai-2.14.0.dev202512181731.dist-info}/top_level.txt +0 -0
@@ -1,182 +0,0 @@
1
- from typing import Dict, List, Optional, Tuple
2
-
3
- import numpy as np
4
- import pandas as pd
5
- from kumoapi.rfm.context import EdgeLayout, Link, Subgraph, Table
6
- from kumoapi.typing import Stype
7
-
8
- import kumoai.kumolib as kumolib
9
- from kumoai.experimental.rfm.local_graph_store import LocalGraphStore
10
- from kumoai.experimental.rfm.utils import normalize_text
11
-
12
-
13
- class LocalGraphSampler:
14
- def __init__(self, graph_store: LocalGraphStore) -> None:
15
- self._graph_store = graph_store
16
- self._sampler = kumolib.NeighborSampler(
17
- self._graph_store.node_types,
18
- self._graph_store.edge_types,
19
- {
20
- '__'.join(edge_type): colptr
21
- for edge_type, colptr in self._graph_store.colptr_dict.items()
22
- },
23
- {
24
- '__'.join(edge_type): row
25
- for edge_type, row in self._graph_store.row_dict.items()
26
- },
27
- self._graph_store.time_dict,
28
- )
29
-
30
- def __call__(
31
- self,
32
- entity_table_names: Tuple[str, ...],
33
- node: np.ndarray,
34
- time: np.ndarray,
35
- num_neighbors: List[int],
36
- exclude_cols_dict: Dict[str, List[str]],
37
- ) -> Subgraph:
38
-
39
- (
40
- row_dict,
41
- col_dict,
42
- node_dict,
43
- batch_dict,
44
- num_sampled_nodes_dict,
45
- num_sampled_edges_dict,
46
- ) = self._sampler.sample(
47
- {
48
- '__'.join(edge_type): num_neighbors
49
- for edge_type in self._graph_store.edge_types
50
- },
51
- {}, # time interval based sampling
52
- entity_table_names[0],
53
- node,
54
- time // 1000**3, # nanoseconds to seconds
55
- )
56
-
57
- table_dict: Dict[str, Table] = {}
58
- for table_name, node in node_dict.items():
59
- batch = batch_dict[table_name]
60
-
61
- if len(node) == 0:
62
- continue
63
-
64
- df = self._graph_store.df_dict[table_name]
65
-
66
- num_sampled_nodes = num_sampled_nodes_dict[table_name].tolist()
67
- stype_dict = { # Exclude target columns:
68
- column_name: stype
69
- for column_name, stype in
70
- self._graph_store.stype_dict[table_name].items()
71
- if column_name not in exclude_cols_dict.get(table_name, [])
72
- }
73
- primary_key: Optional[str] = None
74
- if table_name in entity_table_names:
75
- primary_key = self._graph_store.pkey_name_dict.get(table_name)
76
-
77
- columns: List[str] = []
78
- if table_name in entity_table_names:
79
- columns += [self._graph_store.pkey_name_dict[table_name]]
80
- columns += list(stype_dict.keys())
81
-
82
- if len(columns) == 0:
83
- table_dict[table_name] = Table(
84
- df=pd.DataFrame(index=range(len(node))),
85
- row=None,
86
- batch=batch,
87
- num_sampled_nodes=num_sampled_nodes,
88
- stype_dict=stype_dict,
89
- primary_key=primary_key,
90
- )
91
- continue
92
-
93
- row: Optional[np.ndarray] = None
94
- if table_name in self._graph_store.end_time_column_dict:
95
- # Set end time to NaT for all values greater than anchor time:
96
- df = df.iloc[node].reset_index(drop=True)
97
- col_name = self._graph_store.end_time_column_dict[table_name]
98
- ser = df[col_name]
99
- value = ser.astype('datetime64[ns]').astype(int).to_numpy()
100
- mask = value > time[batch]
101
- df.loc[mask, col_name] = pd.NaT
102
- else:
103
- # Only store unique rows in `df` above a certain threshold:
104
- unique_node, inverse = np.unique(node, return_inverse=True)
105
- if len(node) > 1.05 * len(unique_node):
106
- df = df.iloc[unique_node].reset_index(drop=True)
107
- row = inverse
108
- else:
109
- df = df.iloc[node].reset_index(drop=True)
110
-
111
- # Filter data frame to minimal set of columns:
112
- df = df[columns]
113
-
114
- # Normalize text (if not already pre-processed):
115
- for column_name, stype in stype_dict.items():
116
- if stype == Stype.text:
117
- df[column_name] = normalize_text(df[column_name])
118
-
119
- table_dict[table_name] = Table(
120
- df=df,
121
- row=row,
122
- batch=batch,
123
- num_sampled_nodes=num_sampled_nodes,
124
- stype_dict=stype_dict,
125
- primary_key=primary_key,
126
- )
127
-
128
- link_dict: Dict[Tuple[str, str, str], Link] = {}
129
- for edge_type in self._graph_store.edge_types:
130
- edge_type_str = '__'.join(edge_type)
131
-
132
- row = row_dict[edge_type_str]
133
- col = col_dict[edge_type_str]
134
-
135
- if len(row) == 0:
136
- continue
137
-
138
- # Do not store reverse edge type if it is a replica:
139
- rev_edge_type = Subgraph.rev_edge_type(edge_type)
140
- rev_edge_type_str = '__'.join(rev_edge_type)
141
- if (rev_edge_type in link_dict
142
- and np.array_equal(row, col_dict[rev_edge_type_str])
143
- and np.array_equal(col, row_dict[rev_edge_type_str])):
144
- link = Link(
145
- layout=EdgeLayout.REV,
146
- row=None,
147
- col=None,
148
- num_sampled_edges=(
149
- num_sampled_edges_dict[edge_type_str].tolist()),
150
- )
151
- link_dict[edge_type] = link
152
- continue
153
-
154
- layout = EdgeLayout.COO
155
- if np.array_equal(row, np.arange(len(row))):
156
- row = None
157
- if np.array_equal(col, np.arange(len(col))):
158
- col = None
159
-
160
- # Store in compressed representation if more efficient:
161
- num_cols = table_dict[edge_type[2]].num_rows
162
- if col is not None and len(col) > num_cols + 1:
163
- layout = EdgeLayout.CSC
164
- colcount = np.bincount(col, minlength=num_cols)
165
- col = np.empty(num_cols + 1, dtype=col.dtype)
166
- col[0] = 0
167
- np.cumsum(colcount, out=col[1:])
168
-
169
- link = Link(
170
- layout=layout,
171
- row=row,
172
- col=col,
173
- num_sampled_edges=(
174
- num_sampled_edges_dict[edge_type_str].tolist()),
175
- )
176
- link_dict[edge_type] = link
177
-
178
- return Subgraph(
179
- anchor_time=time,
180
- table_dict=table_dict,
181
- link_dict=link_dict,
182
- )