kumoai 2.8.0.dev202508221830__cp312-cp312-win_amd64.whl → 2.13.0.dev202512041141__cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kumoai might be problematic. Click here for more details.

Files changed (52) hide show
  1. kumoai/__init__.py +22 -11
  2. kumoai/_version.py +1 -1
  3. kumoai/client/client.py +17 -16
  4. kumoai/client/endpoints.py +1 -0
  5. kumoai/client/rfm.py +37 -8
  6. kumoai/connector/file_upload_connector.py +94 -85
  7. kumoai/connector/utils.py +1399 -210
  8. kumoai/experimental/rfm/__init__.py +164 -46
  9. kumoai/experimental/rfm/authenticate.py +8 -5
  10. kumoai/experimental/rfm/backend/__init__.py +0 -0
  11. kumoai/experimental/rfm/backend/local/__init__.py +38 -0
  12. kumoai/experimental/rfm/backend/local/table.py +109 -0
  13. kumoai/experimental/rfm/backend/snow/__init__.py +35 -0
  14. kumoai/experimental/rfm/backend/snow/table.py +117 -0
  15. kumoai/experimental/rfm/backend/sqlite/__init__.py +30 -0
  16. kumoai/experimental/rfm/backend/sqlite/table.py +101 -0
  17. kumoai/experimental/rfm/base/__init__.py +10 -0
  18. kumoai/experimental/rfm/base/column.py +66 -0
  19. kumoai/experimental/rfm/base/source.py +18 -0
  20. kumoai/experimental/rfm/base/table.py +545 -0
  21. kumoai/experimental/rfm/{local_graph.py → graph.py} +413 -144
  22. kumoai/experimental/rfm/infer/__init__.py +6 -0
  23. kumoai/experimental/rfm/infer/dtype.py +79 -0
  24. kumoai/experimental/rfm/infer/pkey.py +126 -0
  25. kumoai/experimental/rfm/infer/time_col.py +62 -0
  26. kumoai/experimental/rfm/infer/timestamp.py +7 -4
  27. kumoai/experimental/rfm/local_graph_sampler.py +58 -11
  28. kumoai/experimental/rfm/local_graph_store.py +45 -37
  29. kumoai/experimental/rfm/local_pquery_driver.py +342 -46
  30. kumoai/experimental/rfm/pquery/__init__.py +4 -4
  31. kumoai/experimental/rfm/pquery/{backend.py → executor.py} +28 -58
  32. kumoai/experimental/rfm/pquery/pandas_executor.py +532 -0
  33. kumoai/experimental/rfm/rfm.py +559 -148
  34. kumoai/experimental/rfm/sagemaker.py +138 -0
  35. kumoai/jobs.py +27 -1
  36. kumoai/kumolib.cp312-win_amd64.pyd +0 -0
  37. kumoai/pquery/prediction_table.py +5 -3
  38. kumoai/pquery/training_table.py +5 -3
  39. kumoai/spcs.py +1 -3
  40. kumoai/testing/decorators.py +1 -1
  41. kumoai/trainer/job.py +9 -30
  42. kumoai/trainer/trainer.py +19 -10
  43. kumoai/utils/__init__.py +2 -1
  44. kumoai/utils/progress_logger.py +96 -16
  45. {kumoai-2.8.0.dev202508221830.dist-info → kumoai-2.13.0.dev202512041141.dist-info}/METADATA +14 -5
  46. {kumoai-2.8.0.dev202508221830.dist-info → kumoai-2.13.0.dev202512041141.dist-info}/RECORD +49 -36
  47. kumoai/experimental/rfm/local_table.py +0 -448
  48. kumoai/experimental/rfm/pquery/pandas_backend.py +0 -437
  49. kumoai/experimental/rfm/utils.py +0 -347
  50. {kumoai-2.8.0.dev202508221830.dist-info → kumoai-2.13.0.dev202512041141.dist-info}/WHEEL +0 -0
  51. {kumoai-2.8.0.dev202508221830.dist-info → kumoai-2.13.0.dev202512041141.dist-info}/licenses/LICENSE +0 -0
  52. {kumoai-2.8.0.dev202508221830.dist-info → kumoai-2.13.0.dev202512041141.dist-info}/top_level.txt +0 -0
@@ -1,347 +0,0 @@
1
- import re
2
- import warnings
3
- from typing import Any, Dict, Optional
4
-
5
- import numpy as np
6
- import pandas as pd
7
- import pyarrow as pa
8
- from kumoapi.typing import Dtype, Stype
9
-
10
- from kumoai.experimental.rfm.infer import (
11
- contains_categorical,
12
- contains_id,
13
- contains_multicategorical,
14
- contains_timestamp,
15
- )
16
-
17
# Maximum number of rows to check for dtype inference in object columns
# NOTE(review): unreferenced within this module — `to_dtype` below uses a
# hard-coded 1000-row window instead; confirm before removing.
_MAX_NUM_ROWS_FOR_DTYPE_INFERENCE = 100

# Mapping from pandas/numpy dtypes to Kumo Dtypes.
# Covers native numpy dtypes, their pandas nullable (extension-array)
# counterparts, and pyarrow-backed scalar/list types that may back a Series.
PANDAS_TO_DTYPE: Dict[Any, Dtype] = {
    # Booleans:
    np.dtype('bool'): Dtype.bool,
    pd.BooleanDtype(): Dtype.bool,
    pa.bool_(): Dtype.bool,
    # Integers (every width collapses into the single Kumo integer type):
    np.dtype('byte'): Dtype.int,
    pd.UInt8Dtype(): Dtype.int,
    np.dtype('int16'): Dtype.int,
    pd.Int16Dtype(): Dtype.int,
    np.dtype('int32'): Dtype.int,
    pd.Int32Dtype(): Dtype.int,
    np.dtype('int64'): Dtype.int,
    pd.Int64Dtype(): Dtype.int,
    # Floats (both widths collapse into the single Kumo float type):
    np.dtype('float32'): Dtype.float,
    pd.Float32Dtype(): Dtype.float,
    np.dtype('float64'): Dtype.float,
    pd.Float64Dtype(): Dtype.float,
    # Strings and binary (plain `object` columns default to string):
    np.dtype('object'): Dtype.string,
    pd.StringDtype(storage='python'): Dtype.string,
    pd.StringDtype(storage='pyarrow'): Dtype.string,
    pa.string(): Dtype.string,
    pa.binary(): Dtype.binary,
    # Temporal types:
    np.dtype('datetime64[ns]'): Dtype.date,
    np.dtype('timedelta64[ns]'): Dtype.timedelta,
    # Homogeneous list types (pyarrow-backed):
    pa.list_(pa.float32()): Dtype.floatlist,
    pa.list_(pa.int64()): Dtype.intlist,
    pa.list_(pa.string()): Dtype.stringlist,
}
48
-
49
-
50
def to_dtype(ser: pd.Series) -> Dtype:
    """Extracts the :class:`Dtype` from a :class:`pandas.Series`.

    Args:
        ser: A :class:`pandas.Series` to analyze.

    Returns:
        The data type.

    Raises:
        ValueError: If the series mixes list-like and non-list-like values,
            holds lists of unsupported element types, or has a dtype with no
            :obj:`PANDAS_TO_DTYPE` mapping.
    """
    dtype = ser.dtype

    # Any datetime flavor (naive or timezone-aware) maps to a date type:
    if pd.api.types.is_datetime64_any_dtype(dtype):
        return Dtype.date

    # Categoricals are handled as plain strings:
    if isinstance(dtype, pd.CategoricalDtype):
        return Dtype.string

    if pd.api.types.is_object_dtype(dtype):
        # Peek at the first non-null entry within the leading 1000 rows to
        # decide whether this object column holds list-like values:
        label = ser.iloc[:1000].first_valid_index()
        if label is not None and pd.api.types.is_list_like(ser[label]):
            pos = ser.index.get_loc(label)
            assert isinstance(pos, int)
            window = ser.iloc[pos:pos + 1000].dropna()

            if not window.map(pd.api.types.is_list_like).all():
                raise ValueError("Data contains a mix of list-like and "
                                 "non-list-like values")

            # Empty lists carry no element-type information - drop them:
            window = window[window.map(
                lambda v: not isinstance(v, list) or len(v) > 0)]

            # Infer the element dtype of each list via a numpy round-trip,
            # falling back to string for anything unmapped:
            elem_dtypes = window.apply(lambda v: PANDAS_TO_DTYPE.get(
                np.array(v).dtype, Dtype.string)).unique().tolist()

            unsupported = set(elem_dtypes) - {
                Dtype.string,
                Dtype.int,
                Dtype.float,
            }
            if len(unsupported) > 0:
                raise ValueError(f"Data contains unsupported list data types: "
                                 f"{list(unsupported)}")

            # Any string element promotes the whole column to string lists;
            # pure ints stay int lists; everything else is a float list:
            if Dtype.string in elem_dtypes:
                return Dtype.stringlist
            if elem_dtypes == [Dtype.int]:
                return Dtype.intlist
            return Dtype.floatlist

    if dtype in PANDAS_TO_DTYPE:
        return PANDAS_TO_DTYPE[dtype]

    raise ValueError(f"Unsupported data type '{dtype}'")
102
-
103
-
104
def infer_stype(ser: pd.Series, column_name: str, dtype: Dtype) -> Stype:
    r"""Infers the semantic type of a column.

    Args:
        ser: A :class:`pandas.Series` to analyze.
        column_name: The name of the column (used for pattern matching).
        dtype: The data type.

    Returns:
        The semantic type.
    """
    # Probe the specialized detectors in priority order (most specific
    # first); fall back to the dtype's default semantic type when none match:
    detectors = (
        (contains_id, Stype.ID),
        (contains_timestamp, Stype.timestamp),
        (contains_multicategorical, Stype.multicategorical),
        (contains_categorical, Stype.categorical),
    )
    for detect, stype in detectors:
        if detect(ser, column_name, dtype):
            return stype

    return dtype.default_stype
128
-
129
-
130
def _table_name_variants(table_name: str) -> set[str]:
    """Generates lower-case table-name variants eligible to match a primary
    key column, i.e.:

    - UserInfo -> User
    - snakecase <-> camelcase
    - camelcase <-> snakecase
    - plural <-> singular (users -> user, eligibilities -> eligibility)
    - verb -> noun (qualifying -> qualify)
    """
    base_names = {table_name}
    if table_name.lower().endswith('_info'):
        base_names.add(table_name[:-5])
    elif table_name.lower().endswith('info'):
        base_names.add(table_name[:-4])

    variants: set[str] = set()
    for name in base_names:
        snakecase = re.sub(r'(.)([A-Z][a-z]+)', r'\1_\2', name)
        snakecase = re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', snakecase)
        camelcase = name.replace('_', '')
        forms = (name.lower(), snakecase.lower(), camelcase.lower())
        variants.update(forms)
        if name.lower().endswith('s'):  # Plural -> singular:
            variants.update(form[:-1] for form in forms)
        else:  # Singular -> plural:
            variants.update(form + 's' for form in forms)
        if name.lower().endswith('ies'):  # eligibilities -> eligibility:
            variants.update(form[:-3] + 'y' for form in forms)
        elif name.lower().endswith('y'):  # eligibility -> eligibilities:
            variants.update(form[:-1] + 'ies' for form in forms)
        if name.lower().endswith('ing'):  # qualifying -> qualify:
            variants.update(form[:-3] for form in forms)
    return variants


def detect_primary_key(
    table_name: str,
    df: pd.DataFrame,
    candidates: list[str],
) -> Optional[str]:
    r"""Auto-detect potential primary key column.

    Args:
        table_name: The table name.
        df: The pandas DataFrame to analyze
        candidates: A list of potential candidates.

    Returns:
        The name of the detected primary key, or ``None`` if not found
        (including the ambiguous case of multiple equally-scored candidates,
        which emits a warning).
    """
    table_names = _table_name_variants(table_name)

    # Scores are floats: pattern matches add integer points, uniqueness adds
    # a fractional bonus. (Fixed: annotation previously claimed `int`.)
    scores: list[tuple[str, float]] = []
    for col_name in candidates:
        col_name_lower = col_name.lower()

        score: float = 0.0

        if col_name_lower == 'id':
            score += 4

        for table_name_lower in table_names:

            if col_name_lower == table_name_lower:
                score += 4  # USER -> USER
                break

            for suffix in ['id', 'hash', 'key', 'code', 'uuid']:
                if not col_name_lower.endswith(suffix):
                    continue

                if col_name_lower == f'{table_name_lower}_{suffix}':
                    score += 5  # USER -> USER_ID
                    break

                if col_name_lower == f'{table_name_lower}{suffix}':
                    score += 5  # User -> UserId
                    break

                if col_name_lower.endswith(f'{table_name_lower}_{suffix}'):
                    score += 2

                if col_name_lower.endswith(f'{table_name_lower}{suffix}'):
                    score += 2

        # `rel-bench` hard-coding :(
        if table_name == 'studies' and col_name == 'nct_id':
            score += 1

        # Reward uniqueness on a sample of up to 1M rows. Guard against
        # empty tables (fixed: previously raised ZeroDivisionError):
        ser = df[col_name].iloc[:1_000_000]
        if len(ser) > 0:
            score += 3 * (ser.nunique() / len(ser))

        scores.append((col_name, score))

    # Keep only candidates passing the minimum confidence threshold:
    scores = [x for x in scores if x[-1] >= 4]
    scores.sort(key=lambda x: x[-1], reverse=True)

    if len(scores) == 0:
        return None

    if len(scores) == 1:
        return scores[0][0]

    # In case of multiple candidates, only return one if its score is unique:
    if scores[0][1] != scores[1][1]:
        return scores[0][0]

    # Fixed: `max(scores, key=...)` returned a (name, score) TUPLE, so the
    # `score == max_score` comparison below never matched and the warning
    # always reported an empty candidate list. After sorting in descending
    # order, the best score is simply the first entry's score:
    max_score = scores[0][1]
    candidates = [col_name for col_name, score in scores if score == max_score]
    warnings.warn(f"Found multiple potential primary keys in table "
                  f"'{table_name}': {candidates}. Please specify the primary "
                  f"key for this table manually.")

    return None
249
-
250
-
251
def detect_time_column(
    df: pd.DataFrame,
    candidates: list[str],
) -> Optional[str]:
    r"""Auto-detect potential time column.

    Args:
        df: The pandas DataFrame to analyze
        candidates: A list of potential candidates.

    Returns:
        The name of the detected time column, or ``None`` if not found.
    """
    # Exclude all candidates with `*last*` in column names, since such
    # columns typically track the latest update rather than creation time:
    last_pattern = re.compile(r'(^|_)last(_|$)', re.IGNORECASE)
    candidates = [
        col_name for col_name in candidates
        if not last_pattern.search(col_name)
    ]

    if not candidates:
        return None

    if len(candidates) == 1:
        return candidates[0]

    # If there exists a dedicated `create*` column, use it as time column:
    create_candidates = [
        candidate for candidate in candidates
        if candidate.lower().startswith('create')
    ]
    if len(create_candidates) == 1:
        return create_candidates[0]
    if len(create_candidates) > 1:
        candidates = create_candidates

    # Find the most optimal time column. Usually, it is the one pointing to
    # the oldest timestamps (sampled over the first 10k rows):
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', message='Could not infer format')
        oldest: dict = {}
        for candidate in candidates:
            parsed = pd.to_datetime(df[candidate].iloc[:10_000], 'coerce')
            minimum = parsed.min().tz_localize(None)
            if not pd.isna(minimum):
                oldest[candidate] = minimum

    if not oldest:
        return None

    return min(oldest, key=oldest.get)  # type: ignore
306
-
307
-
308
PUNCTUATION = re.compile(r"[\'\"\.,\(\)\!\?\;\:]")
MULTISPACE = re.compile(r"\s+")


def normalize_text(
    ser: pd.Series,
    max_words: Optional[int] = 50,
) -> pd.Series:
    r"""Normalizes text into a list of lower-case words.

    Args:
        ser: The :class:`pandas.Series` to normalize.
        max_words: The maximum number of words to return.
            This will auto-shrink any large text column to avoid blowing up
            context size.
    """
    # Empty input or already-tokenized (list-like) data is returned as-is:
    if len(ser) == 0 or pd.api.types.is_list_like(ser.iloc[0]):
        return ser

    def tokenize(line: str) -> list[str]:
        line = PUNCTUATION.sub(" ", line)
        line = re.sub(r"<br\s*/?>", " ", line)  # Handle <br /> or <br>
        line = MULTISPACE.sub(" ", line)
        tokens = line.split()
        return tokens if max_words is None else tokens[:max_words]

    out = ser.fillna('').astype(str)

    if max_words is not None:
        # We estimate the number of words as 5 characters + 1 space in an
        # English text on average. We need this pre-filter here, as word
        # splitting on a giant text can be very expensive:
        out = out.str[:6 * max_words]

    return out.str.lower().map(tokenize)