kumoai 2.13.0.dev202512021731__cp310-cp310-win_amd64.whl → 2.13.0.dev202512040252__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,62 @@
1
+ import re
2
+ import warnings
3
+ from typing import Optional
4
+
5
+ import pandas as pd
6
+
7
+
8
+ def infer_time_column(
9
+ df: pd.DataFrame,
10
+ candidates: list[str],
11
+ ) -> Optional[str]:
12
+ r"""Auto-detect potential time column.
13
+
14
+ Args:
15
+ df: The pandas DataFrame to analyze.
16
+ candidates: A list of potential candidates.
17
+
18
+ Returns:
19
+ The name of the detected time column, or ``None`` if not found.
20
+ """
21
+ candidates = [ # Exclude all candidates with `*last*` in column names:
22
+ col_name for col_name in candidates
23
+ if not re.search(r'(^|_)last(_|$)', col_name, re.IGNORECASE)
24
+ ]
25
+
26
+ if len(candidates) == 0:
27
+ return None
28
+
29
+ if len(candidates) == 1:
30
+ return candidates[0]
31
+
32
+ # If there exists a dedicated `create*` column, use it as time column:
33
+ create_candidates = [
34
+ candidate for candidate in candidates
35
+ if candidate.lower().startswith('create')
36
+ ]
37
+ if len(create_candidates) == 1:
38
+ return create_candidates[0]
39
+ if len(create_candidates) > 1:
40
+ candidates = create_candidates
41
+
42
+ # Find the most optimal time column. Usually, it is the one pointing to
43
+ # the oldest timestamps:
44
+ with warnings.catch_warnings():
45
+ warnings.filterwarnings('ignore', message='Could not infer format')
46
+ min_timestamp_dict = {
47
+ key: pd.to_datetime(df[key].iloc[:10_000], 'coerce')
48
+ for key in candidates
49
+ }
50
+ min_timestamp_dict = {
51
+ key: value.min().tz_localize(None)
52
+ for key, value in min_timestamp_dict.items()
53
+ }
54
+ min_timestamp_dict = {
55
+ key: value
56
+ for key, value in min_timestamp_dict.items() if not pd.isna(value)
57
+ }
58
+
59
+ if len(min_timestamp_dict) == 0:
60
+ return None
61
+
62
+ return min(min_timestamp_dict, key=min_timestamp_dict.get) # type: ignore
@@ -1,3 +1,4 @@
1
+ import re
1
2
  from typing import Dict, List, Optional, Tuple
2
3
 
3
4
  import numpy as np
@@ -7,7 +8,47 @@ from kumoapi.typing import Stype
7
8
 
8
9
  import kumoai.kumolib as kumolib
9
10
  from kumoai.experimental.rfm.local_graph_store import LocalGraphStore
10
- from kumoai.experimental.rfm.utils import normalize_text
11
+
12
+ PUNCTUATION = re.compile(r"[\'\"\.,\(\)\!\?\;\:]")
13
+ MULTISPACE = re.compile(r"\s+")
14
+
15
+
16
+ def normalize_text(
17
+ ser: pd.Series,
18
+ max_words: Optional[int] = 50,
19
+ ) -> pd.Series:
20
+ r"""Normalizes text into a list of lower-case words.
21
+
22
+ Args:
23
+ ser: The :class:`pandas.Series` to normalize.
24
+ max_words: The maximum number of words to return.
25
+ This will auto-shrink any large text column to avoid blowing up
26
+ context size.
27
+ """
28
+ if len(ser) == 0 or pd.api.types.is_list_like(ser.iloc[0]):
29
+ return ser
30
+
31
+ def normalize_fn(line: str) -> list[str]:
32
+ line = PUNCTUATION.sub(" ", line)
33
+ line = re.sub(r"<br\s*/?>", " ", line) # Handle <br /> or <br>
34
+ line = MULTISPACE.sub(" ", line)
35
+ words = line.split()
36
+ if max_words is not None:
37
+ words = words[:max_words]
38
+ return words
39
+
40
+ ser = ser.fillna('').astype(str)
41
+
42
+ if max_words is not None:
43
+ # We estimate the number of words as 5 characters + 1 space in an
44
+ # English text on average. We need this pre-filter here, as word
45
+ # splitting on a giant text can be very expensive:
46
+ ser = ser.str[:6 * max_words]
47
+
48
+ ser = ser.str.lower()
49
+ ser = ser.map(normalize_fn)
50
+
51
+ return ser
11
52
 
12
53
 
13
54
  class LocalGraphSampler:
@@ -7,7 +7,6 @@ from kumoapi.rfm.context import Subgraph
7
7
  from kumoapi.typing import Stype
8
8
 
9
9
  from kumoai.experimental.rfm import Graph, LocalTable
10
- from kumoai.experimental.rfm.utils import normalize_text
11
10
  from kumoai.utils import InteractiveProgressLogger, ProgressLogger
12
11
 
13
12
  try:
@@ -21,7 +20,6 @@ class LocalGraphStore:
21
20
  def __init__(
22
21
  self,
23
22
  graph: Graph,
24
- preprocess: bool = False,
25
23
  verbose: Union[bool, ProgressLogger] = True,
26
24
  ) -> None:
27
25
 
@@ -32,7 +30,7 @@ class LocalGraphStore:
32
30
  )
33
31
 
34
32
  with verbose as logger:
35
- self.df_dict, self.mask_dict = self.sanitize(graph, preprocess)
33
+ self.df_dict, self.mask_dict = self.sanitize(graph)
36
34
  self.stype_dict = self.get_stype_dict(graph)
37
35
  logger.log("Sanitized input data")
38
36
 
@@ -106,7 +104,6 @@ class LocalGraphStore:
106
104
  def sanitize(
107
105
  self,
108
106
  graph: Graph,
109
- preprocess: bool = False,
110
107
  ) -> Tuple[Dict[str, pd.DataFrame], Dict[str, np.ndarray]]:
111
108
  r"""Sanitizes raw data according to table schema definition:
112
109
 
@@ -115,10 +112,6 @@ class LocalGraphStore:
115
112
  * drops timezone information from timestamps
116
113
  * drops duplicate primary keys
117
114
  * removes rows with missing primary keys or time values
118
-
119
- If ``preprocess`` is set to ``True``, it will additionally pre-process
120
- data for faster model processing. In particular, it:
121
- * tokenizes any text column that is not a foreign key
122
115
  """
123
116
  df_dict: Dict[str, pd.DataFrame] = {}
124
117
  for table_name, table in graph.tables.items():
@@ -126,8 +119,6 @@ class LocalGraphStore:
126
119
  df = table._data
127
120
  df_dict[table_name] = df.copy(deep=False).reset_index(drop=True)
128
121
 
129
- foreign_keys = {(edge.src_table, edge.fkey) for edge in graph.edges}
130
-
131
122
  mask_dict: Dict[str, np.ndarray] = {}
132
123
  for table in graph.tables.values():
133
124
  for col in table.columns:
@@ -145,12 +136,6 @@ class LocalGraphStore:
145
136
  ser = ser.dt.tz_localize(None)
146
137
  df_dict[table.name][col.name] = ser
147
138
 
148
- # Normalize text in advance (but exclude foreign keys):
149
- if (preprocess and col.stype == Stype.text
150
- and (table.name, col.name) not in foreign_keys):
151
- ser = df_dict[table.name][col.name]
152
- df_dict[table.name][col.name] = normalize_text(ser)
153
-
154
139
  mask: Optional[np.ndarray] = None
155
140
  if table._time_column is not None:
156
141
  ser = df_dict[table.name][table._time_column]
@@ -150,26 +150,16 @@ class KumoRFM:
150
150
 
151
151
  Args:
152
152
  graph: The graph.
153
- preprocess: Whether to pre-process the data in advance during graph
154
- materialization.
155
- This is a runtime trade-off between graph materialization and model
156
- processing speed.
157
- It can be benefical to preprocess your data once and then run many
158
- queries on top to achieve maximum model speed.
159
- However, if activiated, graph materialization can take potentially
160
- much longer, especially on graphs with many large text columns.
161
- Best to tune this option manually.
162
153
  verbose: Whether to print verbose output.
163
154
  """
164
155
  def __init__(
165
156
  self,
166
157
  graph: Graph,
167
- preprocess: bool = False,
168
158
  verbose: Union[bool, ProgressLogger] = True,
169
159
  ) -> None:
170
160
  graph = graph.validate()
171
161
  self._graph_def = graph._to_api_graph_definition()
172
- self._graph_store = LocalGraphStore(graph, preprocess, verbose)
162
+ self._graph_store = LocalGraphStore(graph, verbose)
173
163
  self._graph_sampler = LocalGraphSampler(self._graph_store)
174
164
 
175
165
  self._client: Optional[RFMAPI] = None
Binary file
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kumoai
3
- Version: 2.13.0.dev202512021731
3
+ Version: 2.13.0.dev202512040252
4
4
  Summary: AI on the Modern Data Stack
5
5
  Author-email: "Kumo.AI" <hello@kumo.ai>
6
6
  License-Expression: MIT
@@ -42,6 +42,7 @@ Provides-Extra: sqlite
42
42
  Requires-Dist: adbc_driver_sqlite; extra == "sqlite"
43
43
  Provides-Extra: snowflake
44
44
  Requires-Dist: snowflake-connector-python; extra == "snowflake"
45
+ Requires-Dist: pyyaml; extra == "snowflake"
45
46
  Provides-Extra: sagemaker
46
47
  Requires-Dist: boto3<2.0,>=1.30.0; extra == "sagemaker"
47
48
  Requires-Dist: mypy-boto3-sagemaker-runtime<2.0,>=1.34.0; extra == "sagemaker"
@@ -1,13 +1,13 @@
1
1
  kumoai/__init__.py,sha256=qu-qohU2cQlManX1aZIlzA3ivKl52m-cSQBPSW8urUU,10837
2
2
  kumoai/_logging.py,sha256=qL4JbMQwKXri2f-SEJoFB8TY5ALG12S-nobGTNWxW-A,915
3
3
  kumoai/_singleton.py,sha256=i2BHWKpccNh5SJGDyU0IXsnYzJAYr8Xb0wz4c6LRbpo,861
4
- kumoai/_version.py,sha256=J8ZxoMuLZm4ZbBIvjU8zNKn-zWD-Pu5i4RPV-vurJZU,39
4
+ kumoai/_version.py,sha256=16u1rVm-N2IEE7QbyS9U5nn_hjp7P_wxBIQzzAKSnDA,39
5
5
  kumoai/databricks.py,sha256=ahwJz6DWLXMkndT0XwEDBxF-hoqhidFR8wBUQ4TLZ68,490
6
6
  kumoai/exceptions.py,sha256=7TMs0SC8xrU009_Pgd4QXtSF9lxJq8MtRbeX9pcQUy4,859
7
7
  kumoai/formatting.py,sha256=o3uCnLwXPhe1KI5WV9sBgRrcU7ed4rgu_pf89GL9Nc0,983
8
8
  kumoai/futures.py,sha256=J8rtZMEYFzdn5xF_x-LAiKJz3KGL6PT02f6rq_2bOJk,3836
9
9
  kumoai/jobs.py,sha256=dCi7BAdfm2tCnonYlGU4WJokJWbh3RzFfaOX2EYCIHU,2576
10
- kumoai/kumolib.cp310-win_amd64.pyd,sha256=diW7mZyXCDEBISV6p-9XYxIXiwyZA3p5m-DxL6if4i4,194048
10
+ kumoai/kumolib.cp310-win_amd64.pyd,sha256=3iE0thfrVDx0Yhh0I0li-BwZcIpQfRpaYxYMsSpYofc,194048
11
11
  kumoai/mixin.py,sha256=IaiB8SAI0VqOoMVzzIaUlqMt53-QPUK6OB0HikG-V9E,840
12
12
  kumoai/spcs.py,sha256=KWfENrwSLruprlD-QPh63uU0N6npiNrwkeKfBk3EUyQ,4260
13
13
  kumoai/artifact_export/__init__.py,sha256=UXAQI5q92ChBzWAk8o3J6pElzYHudAzFZssQXd4o7i8,247
@@ -55,27 +55,30 @@ kumoai/encoder/__init__.py,sha256=8FeP6mUyCeXxr1b8kUIi5dxe5vEXQRft9tPoaV1CBqg,18
55
55
  kumoai/experimental/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
56
56
  kumoai/experimental/rfm/__init__.py,sha256=EFZz6IvvskmeO85Vig6p1m_6jdimS_BkeREOndHuRsc,6247
57
57
  kumoai/experimental/rfm/authenticate.py,sha256=G89_4TMeUpr5fG_0VTzMF5sdNhaciitA1oc2loTlTmo,19321
58
- kumoai/experimental/rfm/graph.py,sha256=agUNrKA6gkH7Aqetn7f0Y4r1a8QJ9G3uA52WA1hKBpM,34476
59
- kumoai/experimental/rfm/local_graph_sampler.py,sha256=3JNpktW__nwxVKZxP4cQBgsIin7J_LNXYS7YlV36xbU,6854
60
- kumoai/experimental/rfm/local_graph_store.py,sha256=PoUwyMpoJhE5VjtgWYU3pV8__TcZ8oAv-zES5cIB3RQ,14251
58
+ kumoai/experimental/rfm/graph.py,sha256=kSWve-Fn_9qERFjEpCDO5zDnngtd9T4MOhR_o46PI7s,39602
59
+ kumoai/experimental/rfm/local_graph_sampler.py,sha256=dQ3JnuozTNeZyUFRu2h8OTMNmV1RAoaCA0gvkpgOstg,8110
60
+ kumoai/experimental/rfm/local_graph_store.py,sha256=6jY1ciVIlnBBhZCxWwBTl7SKX1fxRIDLszwrftD0Cdk,13485
61
61
  kumoai/experimental/rfm/local_pquery_driver.py,sha256=Yd_yHIrvuDj16IC1pvsqiQvZS41vvOOCRMiuDGtN6Fk,26851
62
- kumoai/experimental/rfm/rfm.py,sha256=mS6LPNgl_kVHeLjmJoAW-hQhBZFKtEH4acGnjbFgee0,49445
62
+ kumoai/experimental/rfm/rfm.py,sha256=vOnL8ecHTo1TX2B8_T8xaWGou8qYYz8DyVENu1H93mM,48834
63
63
  kumoai/experimental/rfm/sagemaker.py,sha256=sEJSyfEFBA3-7wKinBEzSooKHEn0BgPjrgRnPhYo79g,5120
64
- kumoai/experimental/rfm/utils.py,sha256=TN7HT_dy26Vhvd8YE90vB0hGZpTFsRMmpSZ3LcQoEIc,7626
65
64
  kumoai/experimental/rfm/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
66
65
  kumoai/experimental/rfm/backend/local/__init__.py,sha256=usMh0fuDxKK-aOVT1sU30BQWFS0eSkfUrhUVILisQQI,934
67
- kumoai/experimental/rfm/backend/local/table.py,sha256=oFf5cYu35Hcr950lvOGYjyXCmX6ICaoOBhzghCtVS1Q,8508
68
- kumoai/experimental/rfm/backend/snow/__init__.py,sha256=kn784OzM8tTnOnoieflOuWZZlab586v8yVVjyL9aXlc,898
66
+ kumoai/experimental/rfm/backend/local/table.py,sha256=1PqNOROzlnK3SaZHNcU2hyzeifs0N4wssQAS3-Z0Myc,3674
67
+ kumoai/experimental/rfm/backend/snow/__init__.py,sha256=viMeR9VWpB1kjRdSWCTNFMdM7a8Mj_Dtck1twJW8dV8,962
68
+ kumoai/experimental/rfm/backend/snow/table.py,sha256=HVrPtCVvfsisFmq9jMovowsE5Wl5oti3O-kru7ruXlc,4312
69
69
  kumoai/experimental/rfm/backend/sqlite/__init__.py,sha256=xw5NNLrWSvUvRkD49X_9hZYjas5EuP1XDANPy0EEjOg,874
70
- kumoai/experimental/rfm/backend/sqlite/table.py,sha256=A-37etb2jNUV-I2tz_-7nvoHq-qdn1K7mYjsro85A1Y,5132
71
- kumoai/experimental/rfm/base/__init__.py,sha256=XQ_jx-dFWg95KWqHORThl9UIOcbfl-JzRy47SRwjqmc,101
70
+ kumoai/experimental/rfm/backend/sqlite/table.py,sha256=mBiZC21gQwfR4demFrP37GmawMHfIm-G82mLQeBqIZo,3901
71
+ kumoai/experimental/rfm/base/__init__.py,sha256=oXPkeBemtuDxRUK61-0sOT84GZB_oQ6HvaZNU1KFNaw,199
72
72
  kumoai/experimental/rfm/base/column.py,sha256=OE-PRQ8HO4uTq0e3_3eHJFfhp5nzw79zd-43g3iMh4g,2385
73
- kumoai/experimental/rfm/base/table.py,sha256=HAYFqvbmJ5rLicNE3j-ykxyXFvAnK_E6xmJ18HJir-Y,17534
74
- kumoai/experimental/rfm/infer/__init__.py,sha256=PF654hCdPSf3tFAxNegDA9aOJLBG4IT1eIjRzI0Rvwo,356
73
+ kumoai/experimental/rfm/base/source.py,sha256=H5yN9xAwK3i_69EdqOV_x58muPGKQiI8ev5BhHQDZEo,290
74
+ kumoai/experimental/rfm/base/table.py,sha256=glyAg4LCQdddM3lIRClJSA7qMyfoHUVAGBf1rEs6B8Y,20113
75
+ kumoai/experimental/rfm/infer/__init__.py,sha256=qKg8or-SpgTApD6ePw1PJ4aUZPrOLTHLRCmBIJ92hrk,486
75
76
  kumoai/experimental/rfm/infer/categorical.py,sha256=bqmfrE5ZCBTcb35lA4SyAkCu3MgttAn29VBJYMBNhVg,893
77
+ kumoai/experimental/rfm/infer/dtype.py,sha256=Hf_drluYNuN59lTSe-8GuXalg20Pv93kCktB6Hb9f74,2686
76
78
  kumoai/experimental/rfm/infer/id.py,sha256=xaJBETLZa8ttzZCsDwFSwfyCi3VYsLc_kDWT_t_6Ih4,954
77
79
  kumoai/experimental/rfm/infer/multicategorical.py,sha256=D-1KwYRkOSkBrOJr4Xa3eTCoAF9O9hPGa7Vg67V5_HU,1150
78
- kumoai/experimental/rfm/infer/stype.py,sha256=a5w_3WwQTctnAkFw0J9_EjFvOeLAEtnrtDbImqLeCXA,946
80
+ kumoai/experimental/rfm/infer/pkey.py,sha256=Hvztcircd4iGdsnFU9Xi1kq_A5ONMnkAdnrpQT5svSs,4519
81
+ kumoai/experimental/rfm/infer/time_col.py,sha256=G98Cgz1m9G9VA-ApnCmGYnJxEFwp1jfaPf3nCMOz_N0,1882
79
82
  kumoai/experimental/rfm/infer/timestamp.py,sha256=L2VxjtYTSyUBYAo4M-L08xSQlPpqnHMAVF5_vxjh3Y0,1135
80
83
  kumoai/experimental/rfm/pquery/__init__.py,sha256=RkTn0I74uXOUuOiBpa6S-_QEYctMutkUnBEfF9ztQzI,159
81
84
  kumoai/experimental/rfm/pquery/executor.py,sha256=S8wwXbAkH-YSnmEVYB8d6wyJF4JJ003mH_0zFTvOp_I,2843
@@ -101,8 +104,8 @@ kumoai/utils/__init__.py,sha256=wAKgmwtMIGuiauW9D_GGKH95K-24Kgwmld27mm4nsro,278
101
104
  kumoai/utils/datasets.py,sha256=UyAII-oAn7x3ombuvpbSQ41aVF9SYKBjQthTD-vcT2A,3011
102
105
  kumoai/utils/forecasting.py,sha256=ZgKeUCbWLOot0giAkoigwU5du8LkrwAicFOi5hVn6wg,7624
103
106
  kumoai/utils/progress_logger.py,sha256=MZsWgHd4UZQKCXiJZgQeW-Emi_BmzlCKPLPXOL_HqBo,5239
104
- kumoai-2.13.0.dev202512021731.dist-info/licenses/LICENSE,sha256=ZUilBDp--4vbhsEr6f_Upw9rnIx09zQ3K9fXQ0rfd6w,1111
105
- kumoai-2.13.0.dev202512021731.dist-info/METADATA,sha256=BYNYYVarUHxSCq5IctkyMu_VHIeu12aQ-AX34ZC9X3c,2535
106
- kumoai-2.13.0.dev202512021731.dist-info/WHEEL,sha256=KUuBC6lxAbHCKilKua8R9W_TM71_-9Sg5uEP3uDWcoU,101
107
- kumoai-2.13.0.dev202512021731.dist-info/top_level.txt,sha256=YjU6UcmomoDx30vEXLsOU784ED7VztQOsFApk1SFwvs,7
108
- kumoai-2.13.0.dev202512021731.dist-info/RECORD,,
107
+ kumoai-2.13.0.dev202512040252.dist-info/licenses/LICENSE,sha256=ZUilBDp--4vbhsEr6f_Upw9rnIx09zQ3K9fXQ0rfd6w,1111
108
+ kumoai-2.13.0.dev202512040252.dist-info/METADATA,sha256=T-O--qEm_2QPzB-dDkwR6Ei7r79H7v6qQBbR1e1J8gg,2580
109
+ kumoai-2.13.0.dev202512040252.dist-info/WHEEL,sha256=KUuBC6lxAbHCKilKua8R9W_TM71_-9Sg5uEP3uDWcoU,101
110
+ kumoai-2.13.0.dev202512040252.dist-info/top_level.txt,sha256=YjU6UcmomoDx30vEXLsOU784ED7VztQOsFApk1SFwvs,7
111
+ kumoai-2.13.0.dev202512040252.dist-info/RECORD,,
@@ -1,35 +0,0 @@
1
- import pandas as pd
2
- from kumoapi.typing import Dtype, Stype
3
-
4
- from kumoai.experimental.rfm.infer import (
5
- contains_categorical,
6
- contains_id,
7
- contains_multicategorical,
8
- contains_timestamp,
9
- )
10
-
11
-
12
- def infer_stype(ser: pd.Series, column_name: str, dtype: Dtype) -> Stype:
13
- r"""Infers the semantic type of a column.
14
-
15
- Args:
16
- ser: A :class:`pandas.Series` to analyze.
17
- column_name: The name of the column (used for pattern matching).
18
- dtype: The data type.
19
-
20
- Returns:
21
- The semantic type.
22
- """
23
- if contains_id(ser, column_name, dtype):
24
- return Stype.ID
25
-
26
- if contains_timestamp(ser, column_name, dtype):
27
- return Stype.timestamp
28
-
29
- if contains_multicategorical(ser, column_name, dtype):
30
- return Stype.multicategorical
31
-
32
- if contains_categorical(ser, column_name, dtype):
33
- return Stype.categorical
34
-
35
- return dtype.default_stype