kumoai 2.13.0.dev202511131731__cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of kumoai might be problematic.
- kumoai/__init__.py +294 -0
- kumoai/_logging.py +29 -0
- kumoai/_singleton.py +25 -0
- kumoai/_version.py +1 -0
- kumoai/artifact_export/__init__.py +9 -0
- kumoai/artifact_export/config.py +209 -0
- kumoai/artifact_export/job.py +108 -0
- kumoai/client/__init__.py +5 -0
- kumoai/client/client.py +221 -0
- kumoai/client/connector.py +110 -0
- kumoai/client/endpoints.py +150 -0
- kumoai/client/graph.py +120 -0
- kumoai/client/jobs.py +447 -0
- kumoai/client/online.py +78 -0
- kumoai/client/pquery.py +203 -0
- kumoai/client/rfm.py +112 -0
- kumoai/client/source_table.py +53 -0
- kumoai/client/table.py +101 -0
- kumoai/client/utils.py +130 -0
- kumoai/codegen/__init__.py +19 -0
- kumoai/codegen/cli.py +100 -0
- kumoai/codegen/context.py +16 -0
- kumoai/codegen/edits.py +473 -0
- kumoai/codegen/exceptions.py +10 -0
- kumoai/codegen/generate.py +222 -0
- kumoai/codegen/handlers/__init__.py +4 -0
- kumoai/codegen/handlers/connector.py +118 -0
- kumoai/codegen/handlers/graph.py +71 -0
- kumoai/codegen/handlers/pquery.py +62 -0
- kumoai/codegen/handlers/table.py +109 -0
- kumoai/codegen/handlers/utils.py +42 -0
- kumoai/codegen/identity.py +114 -0
- kumoai/codegen/loader.py +93 -0
- kumoai/codegen/naming.py +94 -0
- kumoai/codegen/registry.py +121 -0
- kumoai/connector/__init__.py +31 -0
- kumoai/connector/base.py +153 -0
- kumoai/connector/bigquery_connector.py +200 -0
- kumoai/connector/databricks_connector.py +213 -0
- kumoai/connector/file_upload_connector.py +189 -0
- kumoai/connector/glue_connector.py +150 -0
- kumoai/connector/s3_connector.py +278 -0
- kumoai/connector/snowflake_connector.py +252 -0
- kumoai/connector/source_table.py +471 -0
- kumoai/connector/utils.py +1775 -0
- kumoai/databricks.py +14 -0
- kumoai/encoder/__init__.py +4 -0
- kumoai/exceptions.py +26 -0
- kumoai/experimental/__init__.py +0 -0
- kumoai/experimental/rfm/__init__.py +67 -0
- kumoai/experimental/rfm/authenticate.py +433 -0
- kumoai/experimental/rfm/infer/__init__.py +11 -0
- kumoai/experimental/rfm/infer/categorical.py +40 -0
- kumoai/experimental/rfm/infer/id.py +46 -0
- kumoai/experimental/rfm/infer/multicategorical.py +48 -0
- kumoai/experimental/rfm/infer/timestamp.py +41 -0
- kumoai/experimental/rfm/local_graph.py +810 -0
- kumoai/experimental/rfm/local_graph_sampler.py +184 -0
- kumoai/experimental/rfm/local_graph_store.py +359 -0
- kumoai/experimental/rfm/local_pquery_driver.py +689 -0
- kumoai/experimental/rfm/local_table.py +545 -0
- kumoai/experimental/rfm/pquery/__init__.py +7 -0
- kumoai/experimental/rfm/pquery/executor.py +102 -0
- kumoai/experimental/rfm/pquery/pandas_executor.py +532 -0
- kumoai/experimental/rfm/rfm.py +1130 -0
- kumoai/experimental/rfm/utils.py +344 -0
- kumoai/formatting.py +30 -0
- kumoai/futures.py +99 -0
- kumoai/graph/__init__.py +12 -0
- kumoai/graph/column.py +106 -0
- kumoai/graph/graph.py +948 -0
- kumoai/graph/table.py +838 -0
- kumoai/jobs.py +80 -0
- kumoai/kumolib.cpython-313-x86_64-linux-gnu.so +0 -0
- kumoai/mixin.py +28 -0
- kumoai/pquery/__init__.py +25 -0
- kumoai/pquery/prediction_table.py +287 -0
- kumoai/pquery/predictive_query.py +637 -0
- kumoai/pquery/training_table.py +424 -0
- kumoai/spcs.py +123 -0
- kumoai/testing/__init__.py +8 -0
- kumoai/testing/decorators.py +57 -0
- kumoai/trainer/__init__.py +42 -0
- kumoai/trainer/baseline_trainer.py +93 -0
- kumoai/trainer/config.py +2 -0
- kumoai/trainer/job.py +1192 -0
- kumoai/trainer/online_serving.py +258 -0
- kumoai/trainer/trainer.py +475 -0
- kumoai/trainer/util.py +103 -0
- kumoai/utils/__init__.py +10 -0
- kumoai/utils/datasets.py +83 -0
- kumoai/utils/forecasting.py +209 -0
- kumoai/utils/progress_logger.py +177 -0
- kumoai-2.13.0.dev202511131731.dist-info/METADATA +60 -0
- kumoai-2.13.0.dev202511131731.dist-info/RECORD +98 -0
- kumoai-2.13.0.dev202511131731.dist-info/WHEEL +6 -0
- kumoai-2.13.0.dev202511131731.dist-info/licenses/LICENSE +9 -0
- kumoai-2.13.0.dev202511131731.dist-info/top_level.txt +1 -0
kumoai/experimental/rfm/utils.py
ADDED
@@ -0,0 +1,344 @@
import re
import warnings
from typing import Any, Dict, Optional

import numpy as np
import pandas as pd
import pyarrow as pa
from kumoapi.typing import Dtype, Stype

from kumoai.experimental.rfm.infer import (
    contains_categorical,
    contains_id,
    contains_multicategorical,
    contains_timestamp,
)

# Mapping from pandas/numpy dtypes to Kumo Dtypes
PANDAS_TO_DTYPE: Dict[Any, Dtype] = {
    np.dtype('bool'): Dtype.bool,
    pd.BooleanDtype(): Dtype.bool,
    pa.bool_(): Dtype.bool,
    np.dtype('byte'): Dtype.int,
    pd.UInt8Dtype(): Dtype.int,
    np.dtype('int16'): Dtype.int,
    pd.Int16Dtype(): Dtype.int,
    np.dtype('int32'): Dtype.int,
    pd.Int32Dtype(): Dtype.int,
    np.dtype('int64'): Dtype.int,
    pd.Int64Dtype(): Dtype.int,
    np.dtype('float32'): Dtype.float,
    pd.Float32Dtype(): Dtype.float,
    np.dtype('float64'): Dtype.float,
    pd.Float64Dtype(): Dtype.float,
    np.dtype('object'): Dtype.string,
    pd.StringDtype(storage='python'): Dtype.string,
    pd.StringDtype(storage='pyarrow'): Dtype.string,
    pa.string(): Dtype.string,
    pa.binary(): Dtype.binary,
    np.dtype('datetime64[ns]'): Dtype.date,
    np.dtype('timedelta64[ns]'): Dtype.timedelta,
    pa.list_(pa.float32()): Dtype.floatlist,
    pa.list_(pa.int64()): Dtype.intlist,
    pa.list_(pa.string()): Dtype.stringlist,
}


def to_dtype(ser: pd.Series) -> Dtype:
    """Extracts the :class:`Dtype` from a :class:`pandas.Series`.

    Args:
        ser: A :class:`pandas.Series` to analyze.

    Returns:
        The data type.
    """
    if pd.api.types.is_datetime64_any_dtype(ser.dtype):
        return Dtype.date

    if isinstance(ser.dtype, pd.CategoricalDtype):
        return Dtype.string

    if pd.api.types.is_object_dtype(ser.dtype):
        index = ser.iloc[:1000].first_valid_index()
        if index is not None and pd.api.types.is_list_like(ser[index]):
            pos = ser.index.get_loc(index)
            assert isinstance(pos, int)
            ser = ser.iloc[pos:pos + 1000].dropna()

            if not ser.map(pd.api.types.is_list_like).all():
                raise ValueError("Data contains a mix of list-like and "
                                 "non-list-like values")

            ser = ser[ser.map(lambda x: not isinstance(x, list) or len(x) > 0)]

            dtypes = ser.apply(lambda x: PANDAS_TO_DTYPE.get(
                np.array(x).dtype, Dtype.string)).unique().tolist()

            invalid_dtypes = set(dtypes) - {
                Dtype.string,
                Dtype.int,
                Dtype.float,
            }
            if len(invalid_dtypes) > 0:
                raise ValueError(f"Data contains unsupported list data types: "
                                 f"{list(invalid_dtypes)}")

            if Dtype.string in dtypes:
                return Dtype.stringlist

            if dtypes == [Dtype.int]:
                return Dtype.intlist

            return Dtype.floatlist

    if ser.dtype not in PANDAS_TO_DTYPE:
        raise ValueError(f"Unsupported data type '{ser.dtype}'")

    return PANDAS_TO_DTYPE[ser.dtype]
def infer_stype(ser: pd.Series, column_name: str, dtype: Dtype) -> Stype:
    r"""Infers the semantic type of a column.

    Args:
        ser: A :class:`pandas.Series` to analyze.
        column_name: The name of the column (used for pattern matching).
        dtype: The data type.

    Returns:
        The semantic type.
    """
    if contains_id(ser, column_name, dtype):
        return Stype.ID

    if contains_timestamp(ser, column_name, dtype):
        return Stype.timestamp

    if contains_multicategorical(ser, column_name, dtype):
        return Stype.multicategorical

    if contains_categorical(ser, column_name, dtype):
        return Stype.categorical

    return dtype.default_stype
def detect_primary_key(
    table_name: str,
    df: pd.DataFrame,
    candidates: list[str],
) -> Optional[str]:
    r"""Auto-detects a potential primary key column.

    Args:
        table_name: The table name.
        df: The pandas DataFrame to analyze.
        candidates: A list of potential candidates.

    Returns:
        The name of the detected primary key, or ``None`` if not found.
    """
    # A set of (potentially modified) table names that are eligible to match
    # with a primary key, i.e.:
    # - UserInfo -> User
    # - snakecase <-> camelcase
    # - plural <-> singular (users -> user, eligibilities -> eligibility)
    # - verb -> noun (qualifying -> qualify)
    _table_names = {table_name}
    if table_name.lower().endswith('_info'):
        _table_names.add(table_name[:-5])
    elif table_name.lower().endswith('info'):
        _table_names.add(table_name[:-4])

    table_names = set()
    for _table_name in _table_names:
        table_names.add(_table_name.lower())
        snakecase = re.sub(r'(.)([A-Z][a-z]+)', r'\1_\2', _table_name)
        snakecase = re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', snakecase)
        table_names.add(snakecase.lower())
        camelcase = _table_name.replace('_', '')
        table_names.add(camelcase.lower())
        if _table_name.lower().endswith('s'):
            table_names.add(_table_name.lower()[:-1])
            table_names.add(snakecase.lower()[:-1])
            table_names.add(camelcase.lower()[:-1])
        else:
            table_names.add(_table_name.lower() + 's')
            table_names.add(snakecase.lower() + 's')
            table_names.add(camelcase.lower() + 's')
        if _table_name.lower().endswith('ies'):
            table_names.add(_table_name.lower()[:-3] + 'y')
            table_names.add(snakecase.lower()[:-3] + 'y')
            table_names.add(camelcase.lower()[:-3] + 'y')
        elif _table_name.lower().endswith('y'):
            table_names.add(_table_name.lower()[:-1] + 'ies')
            table_names.add(snakecase.lower()[:-1] + 'ies')
            table_names.add(camelcase.lower()[:-1] + 'ies')
        if _table_name.lower().endswith('ing'):
            table_names.add(_table_name.lower()[:-3])
            table_names.add(snakecase.lower()[:-3])
            table_names.add(camelcase.lower()[:-3])

    scores: list[tuple[str, float]] = []
    for col_name in candidates:
        col_name_lower = col_name.lower()

        score = 0.0

        if col_name_lower == 'id':
            score += 4

        for table_name_lower in table_names:

            if col_name_lower == table_name_lower:
                score += 4  # USER -> USER
                break

            for suffix in ['id', 'hash', 'key', 'code', 'uuid']:
                if not col_name_lower.endswith(suffix):
                    continue

                if col_name_lower == f'{table_name_lower}_{suffix}':
                    score += 5  # USER -> USER_ID
                    break

                if col_name_lower == f'{table_name_lower}{suffix}':
                    score += 5  # User -> UserId
                    break

                if col_name_lower.endswith(f'{table_name_lower}_{suffix}'):
                    score += 2

                if col_name_lower.endswith(f'{table_name_lower}{suffix}'):
                    score += 2

        # `rel-bench` hard-coding :(
        if table_name == 'studies' and col_name == 'nct_id':
            score += 1

        ser = df[col_name].iloc[:1_000_000]
        score += 3 * (ser.nunique() / len(ser))

        scores.append((col_name, score))

    scores = [x for x in scores if x[-1] >= 4]
    scores.sort(key=lambda x: x[-1], reverse=True)

    if len(scores) == 0:
        return None

    if len(scores) == 1:
        return scores[0][0]

    # In case of multiple candidates, only return one if its score is unique:
    if scores[0][1] != scores[1][1]:
        return scores[0][0]

    max_score = scores[0][1]  # `scores` is sorted in descending order.
    candidates = [col_name for col_name, score in scores if score == max_score]
    warnings.warn(f"Found multiple potential primary keys in table "
                  f"'{table_name}': {candidates}. Please specify the primary "
                  f"key for this table manually.")

    return None
def detect_time_column(
    df: pd.DataFrame,
    candidates: list[str],
) -> Optional[str]:
    r"""Auto-detects a potential time column.

    Args:
        df: The pandas DataFrame to analyze.
        candidates: A list of potential candidates.

    Returns:
        The name of the detected time column, or ``None`` if not found.
    """
    candidates = [  # Exclude all candidates with `*last*` in column names:
        col_name for col_name in candidates
        if not re.search(r'(^|_)last(_|$)', col_name, re.IGNORECASE)
    ]

    if len(candidates) == 0:
        return None

    if len(candidates) == 1:
        return candidates[0]

    # If there exists a dedicated `create*` column, use it as time column:
    create_candidates = [
        candidate for candidate in candidates
        if candidate.lower().startswith('create')
    ]
    if len(create_candidates) == 1:
        return create_candidates[0]
    if len(create_candidates) > 1:
        candidates = create_candidates

    # Find the best time column: usually, it is the one pointing to the
    # oldest timestamps:
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', message='Could not infer format')
        min_timestamp_dict = {
            key: pd.to_datetime(df[key].iloc[:10_000], errors='coerce')
            for key in candidates
        }
    min_timestamp_dict = {
        key: value.min().tz_localize(None)
        for key, value in min_timestamp_dict.items()
    }
    min_timestamp_dict = {
        key: value
        for key, value in min_timestamp_dict.items() if not pd.isna(value)
    }

    if len(min_timestamp_dict) == 0:
        return None

    return min(min_timestamp_dict, key=min_timestamp_dict.get)  # type: ignore
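For example (hypothetical table; the comments follow the rules above):

    import pandas as pd

    from kumoai.experimental.rfm.utils import detect_time_column

    df = pd.DataFrame({
        'created_at': pd.to_datetime(['2020-01-01', '2020-06-01']),
        'updated_at': pd.to_datetime(['2021-01-01', '2021-06-01']),
        'last_login': pd.to_datetime(['2022-01-01', '2022-06-01']),
    })
    # 'last_login' is excluded up front; 'created_at' is the single dedicated
    # `create*` column and wins:
    detect_time_column(df, ['created_at', 'updated_at', 'last_login'])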
PUNCTUATION = re.compile(r"[\'\"\.,\(\)\!\?\;\:]")
MULTISPACE = re.compile(r"\s+")


def normalize_text(
    ser: pd.Series,
    max_words: Optional[int] = 50,
) -> pd.Series:
    r"""Normalizes text into a list of lower-case words.

    Args:
        ser: The :class:`pandas.Series` to normalize.
        max_words: The maximum number of words to return.
            This will auto-shrink any large text column to avoid blowing up
            context size.
    """
    if len(ser) == 0 or pd.api.types.is_list_like(ser.iloc[0]):
        return ser

    def normalize_fn(line: str) -> list[str]:
        line = PUNCTUATION.sub(" ", line)
        line = re.sub(r"<br\s*/?>", " ", line)  # Handle <br /> or <br>
        line = MULTISPACE.sub(" ", line)
        words = line.split()
        if max_words is not None:
            words = words[:max_words]
        return words

    ser = ser.fillna('').astype(str)

    if max_words is not None:
        # We estimate the number of words as 5 characters + 1 space in an
        # English text on average. We need this pre-filter here, as word
        # splitting on a giant text can be very expensive:
        ser = ser.str[:6 * max_words]

    ser = ser.str.lower()
    ser = ser.map(normalize_fn)

    return ser
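For example (hypothetical data):

    import pandas as pd

    from kumoai.experimental.rfm.utils import normalize_text

    ser = pd.Series(["Great movie!<br />Would watch again.", None])
    normalize_text(ser).tolist()
    # [['great', 'movie', 'would', 'watch', 'again'], []]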
kumoai/formatting.py
ADDED
@@ -0,0 +1,30 @@
from kumoapi.jobs import ErrorDetails


def pretty_print_error_details(error_details: ErrorDetails) -> str:
    """Pretty-prints the ``ErrorDetails``, combining all the individual items.

    If there are CTAs, they are also displayed after creating corresponding
    hyperlinks.

    Arguments:
        error_details (ErrorDetails): Standard ErrorDetails response from
            get_errors APIs.
    """
    out = ""
    ctr = None
    if len(error_details.items) != 1:
        out += "Encountered multiple errors:\n"
        ctr = 1
    for error_detail in error_details.items:
        if ctr is not None:
            out += f'{ctr}. '
            ctr += 1
        if error_detail.title is not None:
            out += f'{error_detail.title}: '
        out += error_detail.description
        if error_detail.cta is not None:
            out += (' Follow the link for potential resolution:'
                    f' {error_detail.cta.url}')
        out += '\n'

    return out
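A hypothetical call site (assuming `details` was obtained from one of the `get_errors` APIs; the `ErrorDetails` schema itself lives in `kumoapi.jobs`):

    from kumoai.formatting import pretty_print_error_details

    # `details: ErrorDetails`, e.g. as returned by a job's `get_errors` API:
    print(pretty_print_error_details(details))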
kumoai/futures.py
ADDED
@@ -0,0 +1,99 @@
import asyncio
import concurrent.futures
import logging
import threading
from abc import ABC, abstractmethod
from asyncio.events import AbstractEventLoop
from typing import Any, Awaitable, Coroutine, Generic, TypeVar

logger = logging.getLogger(__name__)

CoroFuncType = Awaitable[Any]

# Kumo global event loop (our implementation of green threads for pollers and
# other interactions with the Kumo backend that require long-running tasks).
# Since the caller may have their own event loop that we do not want to mess
# with, _do not_ ever call `set_event_loop` here! Instead, be extra cautious
# to pass this loop everywhere.
_KUMO_EVENT_LOOP: AbstractEventLoop = asyncio.new_event_loop()


def initialize_event_loop() -> None:
    def _run_background_loop(loop: AbstractEventLoop) -> None:
        asyncio.set_event_loop(loop)
        loop.run_forever()

    t = threading.Thread(target=_run_background_loop,
                         args=(_KUMO_EVENT_LOOP, ), daemon=True)
    t.start()


def create_future(coro: Coroutine[Any, Any, Any]) -> concurrent.futures.Future:
    r"""Creates a future to execute in the Kumo event loop."""
    # NOTE: This function creates a future, chains it to the output of the
    # execution of `coro` in the Kumo event loop, and handles exceptions
    # before scheduling to run in the loop:
    return asyncio.run_coroutine_threadsafe(coro, _KUMO_EVENT_LOOP)


T = TypeVar("T")


class KumoFuture(ABC, Generic[T]):
    r"""Abstract base class for a Kumo future object."""

    # We cannot use Python future implementations (`asyncio.Future` or
    # `concurrent.futures.Future`) as they are native to the Python
    # implementation of asyncio and threading, and thus not easily extensible.
    # Python additionally recommends not exposing low-level Future objects in
    # user-facing APIs.
    @abstractmethod
    def result(self) -> T:
        r"""Returns the resolved state of the future.

        Raises:
            Exception:
                If the future is complete but in a failed state due to an
                exception being raised, this method will raise the same
                exception.
        """
        raise NotImplementedError

    @abstractmethod
    def future(self) -> 'concurrent.futures.Future[T]':
        r"""Returns the :obj:`concurrent.futures.Future` object wrapped by
        this future. It is not recommended to access this object directly.
        """
        raise NotImplementedError

    def done(self) -> bool:
        r"""Returns :obj:`True` if this future has been resolved with
        ``result()``, or :obj:`False` if this future is still in progress.
        Note that this method returns :obj:`True` if the future is complete
        but in a failed state, and returns :obj:`False` if the job is
        complete but the future has not been awaited.
        """
        return self.future().done()


class KumoProgressFuture(KumoFuture[T]):
    @abstractmethod
    def _attach_internal(self, interval_s: float = 4.0) -> T:
        raise NotImplementedError

    def attach(self, interval_s: float = 4.0) -> T:
        r"""Allows a user to attach to a running job and view its progress.

        Args:
            interval_s (float): Time interval (in seconds) between polls; the
                minimum allowed value is 4 seconds.
        """
        try:
            return self._attach_internal(interval_s=interval_s)
        except Exception:
            logger.warning(
                "Detailed job tracking has become temporarily unavailable. "
                "The job is continuing to proceed on the Kumo server, "
                "and this call will complete when the job has finished.")
            return self.result()
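A minimal sketch of the intended flow (assuming the background loop has not yet been started; in practice `initialize_event_loop` is presumably called once during `kumoai` initialization):

    import asyncio

    from kumoai.futures import create_future, initialize_event_loop

    initialize_event_loop()  # Start the daemon thread driving the Kumo loop.

    async def poll() -> str:
        await asyncio.sleep(0.1)  # Stand-in for a long-running backend poll.
        return 'done'

    fut = create_future(poll())  # A `concurrent.futures.Future`.
    fut.result()                 # Blocks the calling thread until resolved.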
kumoai/graph/__init__.py
ADDED
kumoai/graph/column.py
ADDED
@@ -0,0 +1,106 @@
from dataclasses import dataclass
from typing import Any, Optional, Union

from kumoapi.table import TimestampUnit
from kumoapi.typing import Dtype, Stype

from kumoai.mixin import CastMixin


@dataclass(init=False)
class Column(CastMixin):
    r"""A column represents metadata information for a column in a Kumo
    :class:`~kumoai.graph.Table`. Columns can be created independently of
    a table, or can be fetched from a table with the
    :meth:`~kumoai.graph.Table.column` method.

    .. code-block:: python

        import kumoai

        # Fetch a column from a `kumoai.Table`:
        table = kumoai.Table(...)

        column = table.column("col_name")
        column = table["col_name"]  # Equivalent to the above.

        # Edit a column's data type:
        print("Existing dtype: ", column.dtype)
        column.dtype = "int"

        # Edit a column's semantic type:
        print("Existing stype: ", column.stype)
        column.stype = "ID"

    Args:
        name: The name of this column.
        stype: The semantic type of this column. Semantic types can be
            specified as strings: the list of possible semantic types is
            located at :class:`~kumoai.Stype`.
        dtype: The data type of this column. Data types can be specified as
            strings: the list of possible data types is located at
            :class:`~kumoai.Dtype`.
        timestamp_format: If this column represents a timestamp, the format
            that the timestamp should be parsed in. The format can either be
            a :class:`~kumoapi.table.TimestampUnit` for integer columns or a
            string with a format identifier described
            `here <https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html>`__
            for a SaaS Kumo deployment and
            `here <https://docs.snowflake.com/en/sql-reference/date-time-input-output#about-the-elements-used-in-input-and-output-formats>`__
            for a Snowpark Container Services Kumo deployment. If left empty,
            it will be intelligently inferred by Kumo.
    """  # noqa: E501
    name: str
    stype: Optional[Stype] = None
    dtype: Optional[Dtype] = None
    timestamp_format: Optional[Union[str, TimestampUnit]] = None

    def __init__(
        self,
        name: str,
        stype: Optional[Union[Stype, str]] = None,
        dtype: Optional[Union[Dtype, str]] = None,
        timestamp_format: Optional[Union[str, TimestampUnit]] = None,
    ) -> None:
        self.name = name
        self.stype = Stype(stype) if stype is not None else None
        self.dtype = Dtype(dtype) if dtype is not None else None
        try:
            self.timestamp_format = TimestampUnit(timestamp_format)
        except ValueError:
            self.timestamp_format = timestamp_format

    def __hash__(self) -> int:
        return hash((self.name, self.stype, self.dtype,
                     self.timestamp_format))

    def __setattr__(self, key: Any, value: Any) -> None:
        if key == 'name' and value != getattr(self, key, value):
            raise AttributeError("Attribute 'name' is read-only")
        elif key == 'stype' and isinstance(value, str):
            value = Stype(value)
        elif key == 'dtype' and isinstance(value, str):
            value = Dtype(value)
        elif key == 'timestamp_format' and isinstance(value, str):
            try:
                value = TimestampUnit(value)
            except ValueError:
                pass
        super().__setattr__(key, value)

    def update(self, obj: 'Column', override: bool = True) -> 'Column':
        for key in self.__dict__:
            if key[0] == '_':  # Skip private attributes:
                continue
            value = getattr(obj, key, None)
            if value is not None:
                if override or getattr(self, key, None) is None:
                    setattr(self, key, value)
        return self

    def __repr__(self) -> str:
        out = (f"Column(name=\"{self.name}\", stype=\"{self.stype}\", "
               f"dtype=\"{self.dtype}\"")
        if self.timestamp_format is not None:
            out += f", timestamp_format=\"{self.timestamp_format}\""
        out += ")"
        return out
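A short usage sketch of the coercion and immutability rules above (hypothetical column):

    from kumoai.graph.column import Column

    col = Column(name='user_id', stype='ID', dtype='int')
    col.dtype = 'float'  # Plain strings are coerced to `Dtype` on assignment.
    # col.name = 'uid'   # Raises AttributeError: 'name' is read-only.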