kumoai 2.13.0.dev202511131731__cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kumoai might be problematic. Click here for more details.

Files changed (98) hide show
  1. kumoai/__init__.py +294 -0
  2. kumoai/_logging.py +29 -0
  3. kumoai/_singleton.py +25 -0
  4. kumoai/_version.py +1 -0
  5. kumoai/artifact_export/__init__.py +9 -0
  6. kumoai/artifact_export/config.py +209 -0
  7. kumoai/artifact_export/job.py +108 -0
  8. kumoai/client/__init__.py +5 -0
  9. kumoai/client/client.py +221 -0
  10. kumoai/client/connector.py +110 -0
  11. kumoai/client/endpoints.py +150 -0
  12. kumoai/client/graph.py +120 -0
  13. kumoai/client/jobs.py +447 -0
  14. kumoai/client/online.py +78 -0
  15. kumoai/client/pquery.py +203 -0
  16. kumoai/client/rfm.py +112 -0
  17. kumoai/client/source_table.py +53 -0
  18. kumoai/client/table.py +101 -0
  19. kumoai/client/utils.py +130 -0
  20. kumoai/codegen/__init__.py +19 -0
  21. kumoai/codegen/cli.py +100 -0
  22. kumoai/codegen/context.py +16 -0
  23. kumoai/codegen/edits.py +473 -0
  24. kumoai/codegen/exceptions.py +10 -0
  25. kumoai/codegen/generate.py +222 -0
  26. kumoai/codegen/handlers/__init__.py +4 -0
  27. kumoai/codegen/handlers/connector.py +118 -0
  28. kumoai/codegen/handlers/graph.py +71 -0
  29. kumoai/codegen/handlers/pquery.py +62 -0
  30. kumoai/codegen/handlers/table.py +109 -0
  31. kumoai/codegen/handlers/utils.py +42 -0
  32. kumoai/codegen/identity.py +114 -0
  33. kumoai/codegen/loader.py +93 -0
  34. kumoai/codegen/naming.py +94 -0
  35. kumoai/codegen/registry.py +121 -0
  36. kumoai/connector/__init__.py +31 -0
  37. kumoai/connector/base.py +153 -0
  38. kumoai/connector/bigquery_connector.py +200 -0
  39. kumoai/connector/databricks_connector.py +213 -0
  40. kumoai/connector/file_upload_connector.py +189 -0
  41. kumoai/connector/glue_connector.py +150 -0
  42. kumoai/connector/s3_connector.py +278 -0
  43. kumoai/connector/snowflake_connector.py +252 -0
  44. kumoai/connector/source_table.py +471 -0
  45. kumoai/connector/utils.py +1775 -0
  46. kumoai/databricks.py +14 -0
  47. kumoai/encoder/__init__.py +4 -0
  48. kumoai/exceptions.py +26 -0
  49. kumoai/experimental/__init__.py +0 -0
  50. kumoai/experimental/rfm/__init__.py +67 -0
  51. kumoai/experimental/rfm/authenticate.py +433 -0
  52. kumoai/experimental/rfm/infer/__init__.py +11 -0
  53. kumoai/experimental/rfm/infer/categorical.py +40 -0
  54. kumoai/experimental/rfm/infer/id.py +46 -0
  55. kumoai/experimental/rfm/infer/multicategorical.py +48 -0
  56. kumoai/experimental/rfm/infer/timestamp.py +41 -0
  57. kumoai/experimental/rfm/local_graph.py +810 -0
  58. kumoai/experimental/rfm/local_graph_sampler.py +184 -0
  59. kumoai/experimental/rfm/local_graph_store.py +359 -0
  60. kumoai/experimental/rfm/local_pquery_driver.py +689 -0
  61. kumoai/experimental/rfm/local_table.py +545 -0
  62. kumoai/experimental/rfm/pquery/__init__.py +7 -0
  63. kumoai/experimental/rfm/pquery/executor.py +102 -0
  64. kumoai/experimental/rfm/pquery/pandas_executor.py +532 -0
  65. kumoai/experimental/rfm/rfm.py +1130 -0
  66. kumoai/experimental/rfm/utils.py +344 -0
  67. kumoai/formatting.py +30 -0
  68. kumoai/futures.py +99 -0
  69. kumoai/graph/__init__.py +12 -0
  70. kumoai/graph/column.py +106 -0
  71. kumoai/graph/graph.py +948 -0
  72. kumoai/graph/table.py +838 -0
  73. kumoai/jobs.py +80 -0
  74. kumoai/kumolib.cpython-313-x86_64-linux-gnu.so +0 -0
  75. kumoai/mixin.py +28 -0
  76. kumoai/pquery/__init__.py +25 -0
  77. kumoai/pquery/prediction_table.py +287 -0
  78. kumoai/pquery/predictive_query.py +637 -0
  79. kumoai/pquery/training_table.py +424 -0
  80. kumoai/spcs.py +123 -0
  81. kumoai/testing/__init__.py +8 -0
  82. kumoai/testing/decorators.py +57 -0
  83. kumoai/trainer/__init__.py +42 -0
  84. kumoai/trainer/baseline_trainer.py +93 -0
  85. kumoai/trainer/config.py +2 -0
  86. kumoai/trainer/job.py +1192 -0
  87. kumoai/trainer/online_serving.py +258 -0
  88. kumoai/trainer/trainer.py +475 -0
  89. kumoai/trainer/util.py +103 -0
  90. kumoai/utils/__init__.py +10 -0
  91. kumoai/utils/datasets.py +83 -0
  92. kumoai/utils/forecasting.py +209 -0
  93. kumoai/utils/progress_logger.py +177 -0
  94. kumoai-2.13.0.dev202511131731.dist-info/METADATA +60 -0
  95. kumoai-2.13.0.dev202511131731.dist-info/RECORD +98 -0
  96. kumoai-2.13.0.dev202511131731.dist-info/WHEEL +6 -0
  97. kumoai-2.13.0.dev202511131731.dist-info/licenses/LICENSE +9 -0
  98. kumoai-2.13.0.dev202511131731.dist-info/top_level.txt +1 -0
@@ -0,0 +1,344 @@
1
+ import re
2
+ import warnings
3
+ from typing import Any, Dict, Optional
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ import pyarrow as pa
8
+ from kumoapi.typing import Dtype, Stype
9
+
10
+ from kumoai.experimental.rfm.infer import (
11
+ contains_categorical,
12
+ contains_id,
13
+ contains_multicategorical,
14
+ contains_timestamp,
15
+ )
16
+
17
# Mapping from pandas/numpy/pyarrow dtypes to Kumo Dtypes.
# Keys are dtype *instances*; `to_dtype` looks up `ser.dtype` in this mapping
# and raises `ValueError` for any dtype that is not a key.
# NOTE(review): integer coverage is uneven -- `np.dtype('byte')` (signed
# 8-bit) and `pd.UInt8Dtype()` are present, but e.g. `np.dtype('uint8')`,
# `pd.Int8Dtype()` and the wider unsigned integer dtypes are not. Confirm
# whether this is intentional.
PANDAS_TO_DTYPE: Dict[Any, Dtype] = {
    np.dtype('bool'): Dtype.bool,
    pd.BooleanDtype(): Dtype.bool,
    pa.bool_(): Dtype.bool,
    np.dtype('byte'): Dtype.int,
    pd.UInt8Dtype(): Dtype.int,
    np.dtype('int16'): Dtype.int,
    pd.Int16Dtype(): Dtype.int,
    np.dtype('int32'): Dtype.int,
    pd.Int32Dtype(): Dtype.int,
    np.dtype('int64'): Dtype.int,
    pd.Int64Dtype(): Dtype.int,
    np.dtype('float32'): Dtype.float,
    pd.Float32Dtype(): Dtype.float,
    np.dtype('float64'): Dtype.float,
    pd.Float64Dtype(): Dtype.float,
    np.dtype('object'): Dtype.string,
    pd.StringDtype(storage='python'): Dtype.string,
    pd.StringDtype(storage='pyarrow'): Dtype.string,
    pa.string(): Dtype.string,
    pa.binary(): Dtype.binary,
    np.dtype('datetime64[ns]'): Dtype.date,
    np.dtype('timedelta64[ns]'): Dtype.timedelta,
    # List-valued columns (pyarrow list types):
    pa.list_(pa.float32()): Dtype.floatlist,
    pa.list_(pa.int64()): Dtype.intlist,
    pa.list_(pa.string()): Dtype.stringlist,
}
45
+
46
+
47
def to_dtype(ser: pd.Series) -> Dtype:
    """Extracts the :class:`Dtype` from a :class:`pandas.Series`.

    Datetime-like columns map to :obj:`Dtype.date` and categorical columns
    map to :obj:`Dtype.string`. An object column whose first non-null value
    (searched within the first 1,000 rows) is list-like is treated as a list
    column: its element type is inferred from up to 1,000 rows starting at
    that value. Every other column is resolved via :obj:`PANDAS_TO_DTYPE`.

    Args:
        ser: A :class:`pandas.Series` to analyze.

    Returns:
        The data type.

    Raises:
        ValueError: If a list column mixes list-like and non-list-like
            values, if its elements have an unsupported data type, or if the
            data type of :obj:`ser` is not part of :obj:`PANDAS_TO_DTYPE`.
    """
    if pd.api.types.is_datetime64_any_dtype(ser.dtype):
        return Dtype.date

    if isinstance(ser.dtype, pd.CategoricalDtype):
        return Dtype.string

    if pd.api.types.is_object_dtype(ser.dtype):
        # Locate the first non-null value by *position*. Label-based access
        # (`ser[label]`, `Index.get_loc(label)`) yields a Series/slice/mask
        # instead of a scalar/int whenever index labels are duplicated.
        is_valid = ser.iloc[:1000].notna().to_numpy()
        pos = int(is_valid.argmax()) if is_valid.any() else None

        if pos is not None and pd.api.types.is_list_like(ser.iloc[pos]):
            ser = ser.iloc[pos:pos + 1000].dropna()

            if not ser.map(pd.api.types.is_list_like).all():
                raise ValueError("Data contains a mix of list-like and "
                                 "non-list-like values")

            # Empty lists carry no element type information, so skip them:
            ser = ser[ser.map(lambda x: not isinstance(x, list) or len(x) > 0)]

            # Infer the element type of every remaining value via NumPy;
            # anything NumPy cannot map to a known dtype counts as string:
            dtypes = ser.apply(lambda x: PANDAS_TO_DTYPE.get(
                np.array(x).dtype, Dtype.string)).unique().tolist()

            invalid_dtypes = set(dtypes) - {
                Dtype.string,
                Dtype.int,
                Dtype.float,
            }
            if len(invalid_dtypes) > 0:
                raise ValueError(f"Data contains unsupported list data types: "
                                 f"{list(invalid_dtypes)}")

            # A single string element type turns the column into strings:
            if Dtype.string in dtypes:
                return Dtype.stringlist

            if dtypes == [Dtype.int]:
                return Dtype.intlist

            # Pure floats or an int/float mix:
            return Dtype.floatlist

    if ser.dtype not in PANDAS_TO_DTYPE:
        raise ValueError(f"Unsupported data type '{ser.dtype}'")

    return PANDAS_TO_DTYPE[ser.dtype]
99
+
100
+
101
def infer_stype(ser: pd.Series, column_name: str, dtype: Dtype) -> Stype:
    r"""Infers the semantic type of a column.

    Args:
        ser: A :class:`pandas.Series` to analyze.
        column_name: The name of the column (used for pattern matching).
        dtype: The data type.

    Returns:
        The semantic type of the first matching detector, or the default
        semantic type of :obj:`dtype` if no detector matches.
    """
    # Detectors are tried in exactly this order; the first match wins:
    detectors = (
        (contains_id, Stype.ID),
        (contains_timestamp, Stype.timestamp),
        (contains_multicategorical, Stype.multicategorical),
        (contains_categorical, Stype.categorical),
    )
    for matches, stype in detectors:
        if matches(ser, column_name, dtype):
            return stype

    return dtype.default_stype
125
+
126
+
127
def detect_primary_key(
    table_name: str,
    df: pd.DataFrame,
    candidates: list[str],
) -> Optional[str]:
    r"""Auto-detect potential primary key column.

    Every candidate column receives a score that combines how well its name
    matches the table name (plus common key suffixes) with the uniqueness of
    its values in the first 1,000,000 rows. Candidates scoring below ``4``
    are discarded.

    Args:
        table_name: The table name.
        df: The pandas DataFrame to analyze
        candidates: A list of potential candidates.

    Returns:
        The name of the detected primary key, or ``None`` if not found. If
        several candidates share the highest score, a warning listing them
        is emitted and ``None`` is returned.
    """
    # A list of (potentially modified) table names that are eligible to match
    # with a primary key, i.e.:
    # - UserInfo -> User
    # - snakecase <-> camelcase
    # - camelcase <-> snakecase
    # - plural <-> singular (users -> user, eligibilities -> eligibility)
    # - verb -> noun (qualifying -> qualify)
    _table_names = {table_name}
    if table_name.lower().endswith('_info'):
        _table_names.add(table_name[:-5])
    elif table_name.lower().endswith('info'):
        _table_names.add(table_name[:-4])

    table_names: set[str] = set()
    for _table_name in _table_names:
        snakecase = re.sub(r'(.)([A-Z][a-z]+)', r'\1_\2', _table_name)
        snakecase = re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', snakecase)
        camelcase = _table_name.replace('_', '')

        lowered = _table_name.lower()
        forms = (lowered, snakecase.lower(), camelcase.lower())
        table_names.update(forms)

        if lowered.endswith('s'):
            table_names.update(form[:-1] for form in forms)
        else:
            table_names.update(form + 's' for form in forms)

        if lowered.endswith('ies'):
            table_names.update(form[:-3] + 'y' for form in forms)
        elif lowered.endswith('y'):
            table_names.update(form[:-1] + 'ies' for form in forms)

        if lowered.endswith('ing'):
            table_names.update(form[:-3] for form in forms)

    scores: list[tuple[str, float]] = []
    for col_name in candidates:
        col_name_lower = col_name.lower()

        score: float = 0.0

        if col_name_lower == 'id':
            score += 4

        for table_name_lower in table_names:

            if col_name_lower == table_name_lower:
                score += 4  # USER -> USER
                break

            for suffix in ['id', 'hash', 'key', 'code', 'uuid']:
                if not col_name_lower.endswith(suffix):
                    continue

                if col_name_lower == f'{table_name_lower}_{suffix}':
                    score += 5  # USER -> USER_ID
                    break

                if col_name_lower == f'{table_name_lower}{suffix}':
                    score += 5  # User -> UserId
                    break

                if col_name_lower.endswith(f'{table_name_lower}_{suffix}'):
                    score += 2

                if col_name_lower.endswith(f'{table_name_lower}{suffix}'):
                    score += 2

        # `rel-bench` hard-coding :(
        if table_name == 'studies' and col_name == 'nct_id':
            score += 1

        # Reward uniqueness. An empty column has no uniqueness evidence, so
        # it contributes nothing (and must not divide by zero):
        ser = df[col_name].iloc[:1_000_000]
        num_rows = len(ser)
        if num_rows > 0:
            score += 3 * (ser.nunique() / num_rows)

        scores.append((col_name, score))

    scores = [x for x in scores if x[-1] >= 4]
    scores.sort(key=lambda x: x[-1], reverse=True)

    if len(scores) == 0:
        return None

    # In case of multiple candidates, only return one if its score is unique:
    max_score = scores[0][1]
    best_candidates = [
        col_name for col_name, score in scores if score == max_score
    ]
    if len(best_candidates) == 1:
        return best_candidates[0]

    warnings.warn(f"Found multiple potential primary keys in table "
                  f"'{table_name}': {best_candidates}. Please specify the "
                  f"primary key for this table manually.")

    return None
246
+
247
+
248
def detect_time_column(
    df: pd.DataFrame,
    candidates: list[str],
) -> Optional[str]:
    r"""Auto-detect potential time column.

    Columns whose name contains ``last`` as an underscore-delimited token are
    never considered. A single remaining candidate is returned directly, as
    is a single candidate whose name starts with ``create``. Otherwise, the
    candidate with the earliest minimum timestamp (within its first 10,000
    rows) is chosen.

    Args:
        df: The pandas DataFrame to analyze
        candidates: A list of potential candidates.

    Returns:
        The name of the detected time column, or ``None`` if not found.
    """
    # Exclude all candidates with `*last*` in column names:
    last_token = re.compile(r'(^|_)last(_|$)', re.IGNORECASE)
    eligible = [name for name in candidates if not last_token.search(name)]

    if len(eligible) <= 1:
        return eligible[0] if eligible else None

    # If there exists a dedicated `create*` column, use it as time column:
    created = [name for name in eligible if name.lower().startswith('create')]
    if len(created) == 1:
        return created[0]
    if created:
        eligible = created

    # Find the most optimal time column. Usually, it is the one pointing to
    # the oldest timestamps:
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', message='Could not infer format')
        parsed = [
            pd.to_datetime(df[name].iloc[:10_000], errors='coerce')
            for name in eligible
        ]

    earliest = {}
    for name, timestamps in zip(eligible, parsed):
        # Drop time zone information to make minima comparable:
        minimum = timestamps.min().tz_localize(None)
        if not pd.isna(minimum):
            earliest[name] = minimum

    if not earliest:
        return None

    return min(earliest, key=earliest.get)  # type: ignore
303
+
304
+
305
PUNCTUATION = re.compile(r"[\'\"\.,\(\)\!\?\;\:]")
MULTISPACE = re.compile(r"\s+")


def normalize_text(
    ser: pd.Series,
    max_words: Optional[int] = 50,
) -> pd.Series:
    r"""Normalizes text into a list of lower-case words.

    Args:
        ser: The :class:`pandas.Series` to normalize.
        max_words: The maximum number of words to return.
            This will auto-shrink any large text column to avoid blowing up
            context size.

    Returns:
        :obj:`ser` itself if it is empty or if its first value is already
        list-like. Otherwise, a series holding one list of words per row,
        where missing values become empty lists.
    """
    if len(ser) == 0 or pd.api.types.is_list_like(ser.iloc[0]):
        return ser

    def to_words(text: str) -> list[str]:
        text = PUNCTUATION.sub(" ", text)
        text = re.sub(r"<br\s*/?>", " ", text)  # Handle <br /> or <br>
        tokens = MULTISPACE.sub(" ", text).split()
        return tokens if max_words is None else tokens[:max_words]

    texts = ser.fillna('').astype(str)

    if max_words is not None:
        # We estimate the number of words as 5 characters + 1 space in an
        # English text on average. We need this pre-filter here, as word
        # splitting on a giant text can be very expensive:
        texts = texts.str[:6 * max_words]

    return texts.str.lower().map(to_words)
kumoai/formatting.py ADDED
@@ -0,0 +1,30 @@
1
+ from kumoapi.jobs import ErrorDetails
2
+
3
+
4
def pretty_print_error_details(error_details: 'ErrorDetails') -> str:
    """Pretty prints the ErrorDetails combining all the individual items.
    If there are CTAs, their links are appended to the corresponding item.

    Each item is rendered on its own line as ``"<title>: <description>"``
    (the title is omitted if unset). If there is more than one item, a
    header line is emitted first and the items are numbered starting at 1.

    Arguments:
        error_details (ErrorDetails): Standard ErrorDetails response from
            get_errors APIs.

    Returns:
        The formatted, newline-terminated message, or an empty string if
        there are no items.
    """
    items = error_details.items
    numbered = len(items) > 1

    lines = []
    if numbered:
        lines.append("Encountered multiple errors:")

    for position, error_detail in enumerate(items, start=1):
        line = f'{position}. ' if numbered else ''
        if error_detail.title is not None:
            line += f'{error_detail.title}: '
        line += error_detail.description
        if error_detail.cta is not None:
            # Parenthesized so that the URL is part of the same string:
            line += (' Follow the link for potential resolution: '
                     f'{error_detail.cta.url}')
        lines.append(line)

    return ''.join(f'{line}\n' for line in lines)
kumoai/futures.py ADDED
@@ -0,0 +1,99 @@
1
+ import asyncio
2
+ import concurrent
3
+ import logging
4
+ import threading
5
+ from abc import ABC, abstractmethod
6
+ from asyncio.events import AbstractEventLoop
7
+ from typing import Any, Awaitable, Coroutine, Generic, TypeVar
8
+
9
# Module-level logger, used to report degraded job tracking:
logger = logging.getLogger(__name__)

# NOTE(review): this alias is not referenced anywhere else in this module --
# confirm whether other modules import it before removing.
CoroFuncType = Awaitable[Any]

# Kumo global event loop (our implementation of green threads for pollers and
# other interactions with the Kumo backend that require long-running tasks).
# Since the caller may have their own event loop that we do not want to
# mess with, _do not_ ever call `set_event_loop` here!! Instead, be extra
# cautious to pass this loop everywhere.
# The loop is only created here; it is started by `initialize_event_loop()`.
_KUMO_EVENT_LOOP: AbstractEventLoop = asyncio.new_event_loop()
19
+
20
+
21
def initialize_event_loop() -> None:
    r"""Starts a daemon thread that runs the Kumo event loop forever."""
    def _run_background_loop(loop: AbstractEventLoop) -> None:
        # Executed inside the worker thread, so only that thread gets `loop`
        # installed as its current event loop:
        asyncio.set_event_loop(loop)
        loop.run_forever()

    worker = threading.Thread(
        target=_run_background_loop,
        args=(_KUMO_EVENT_LOOP, ),
        daemon=True,
    )
    worker.start()
29
+
30
+
31
def create_future(coro: Coroutine[Any, Any, Any]) -> concurrent.futures.Future:
    r"""Submits :obj:`coro` for execution in the Kumo event loop.

    Args:
        coro: The coroutine to execute.

    Returns:
        The future returned by :func:`asyncio.run_coroutine_threadsafe` for
        :obj:`coro`.
    """
    # NOTE `run_coroutine_threadsafe` creates the future and chains it to
    # the outcome of running `coro` in the Kumo event loop:
    scheduled = asyncio.run_coroutine_threadsafe(coro, _KUMO_EVENT_LOOP)
    return scheduled
37
+
38
+
39
T = TypeVar("T")


class KumoFuture(ABC, Generic[T]):
    r"""Abstract base class for a Kumo future object.

    Sub-classes provide :meth:`result` and :meth:`future`; :meth:`done` is
    derived from the wrapped future.
    """

    # Python's own future implementations (`asyncio.Future` and
    # `concurrent.futures.Future`) are tied to Python's asyncio and threading
    # machinery and are therefore hard to extend. Python additionally
    # recommends against exposing low-level Future objects in user facing
    # APIs, hence this dedicated abstraction.
    @abstractmethod
    def result(self) -> T:
        r"""Returns the resolved state of the future.

        Raises:
            Exception:
                If the future completed in a failed state because an
                exception was raised, this method raises that same exception.
        """
        raise NotImplementedError

    @abstractmethod
    def future(self) -> 'concurrent.futures.Future[T]':
        r"""Returns the :obj:`concurrent.futures.Future` object wrapped by
        this future. Accessing this object directly is not recommended.
        """
        raise NotImplementedError

    def done(self) -> bool:
        r"""Returns :obj:`True` if this future has been resolved with
        ``result()``, or :obj:`False` if this future is still
        in-progress. Note that :obj:`True` is also returned for a future
        that completed in a failed state, and that :obj:`False` is returned
        if the job is complete, but the future has not been awaited.
        """
        wrapped = self.future()
        return wrapped.done()
78
+
79
+
80
class KumoProgressFuture(KumoFuture[T]):
    r"""A :class:`KumoFuture` whose progress can be followed interactively
    via :meth:`attach`.
    """
    @abstractmethod
    def _attach_internal(self, interval_s: float = 4.0) -> T:
        r"""Tracks the job until completion and returns its result.

        Args:
            interval_s (float): Time interval (seconds) between polls.
        """
        raise NotImplementedError

    def attach(self, interval_s: float = 4.0) -> T:
        r"""Allows a user to attach to a running job and view its progress.

        If progress tracking fails for any reason, a warning is logged and
        this method falls back to waiting on :meth:`result`.

        Args:
            interval_s (float): Time interval (seconds) between polls, minimum
                value allowed is 4 seconds.

        Returns:
            The result of the job.
        """
        # NOTE(review): the documented 4 second minimum is not enforced in
        # this method -- presumably `_attach_internal` takes care of it.
        try:
            return self._attach_internal(interval_s=interval_s)
        except Exception:
            logger.warning(
                "Detailed job tracking has become temporarily unavailable. "
                "The job is continuing to proceed on the Kumo server, "
                "and this call will complete when the job has finished.")
            # Keep the root cause available for diagnosis without adding a
            # traceback to the user-facing warning:
            logger.debug("Job tracking failed with:", exc_info=True)
            return self.result()
@@ -0,0 +1,12 @@
1
+ from .column import Column, TimestampUnit
2
+ from .table import Table
3
+ from .graph import Graph, Edge, GraphHealthStats
4
+
5
# Public API of this package: the names re-exported from the `column`,
# `table` and `graph` submodules.
__all__ = [
    'TimestampUnit',
    'Column',
    'Table',
    'Graph',
    'Edge',
    'GraphHealthStats',
]
kumoai/graph/column.py ADDED
@@ -0,0 +1,106 @@
1
+ from dataclasses import dataclass
2
+ from typing import Any, Optional, Union
3
+
4
+ from kumoapi.table import TimestampUnit
5
+ from kumoapi.typing import Dtype, Stype
6
+
7
+ from kumoai.mixin import CastMixin
8
+
9
+
10
@dataclass(init=False)
class Column(CastMixin):
    r"""Metadata describing a single column of a Kumo
    :class:`~kumoai.graph.Table`. A column can be created on its own, or it
    can be obtained from an existing table via the
    :meth:`~kumoai.graph.Table.column` method.

    .. code-block:: python

        import kumoai

        # Fetch a column from a `kumoai.Table`:
        table = kumoai.Table(...)

        column = table.column("col_name")
        column = table["col_name"]  # equivalent to the above.

        # Edit a column's data type:
        print("Existing dtype: ", column.dtype)
        column.dtype = "int"

        # Edit a column's semantic type:
        print("Existing stype: ", column.stype)
        column.stype = "ID"

    Args:
        name: The name of this column. It cannot be changed once set.
        stype: The semantic type of this column. Semantic types can be
            given as strings: the list of possible semantic types
            is located at :class:`~kumoai.Stype`.
        dtype: The data type of this column. Data types can be given
            as strings: the list of possible data types is located at
            :class:`~kumoai.Dtype`.
        timestamp_format: If this column represents a timestamp, the format
            that the timestamp should be parsed in. The format can either be
            a :class:`~kumoapi.table.TimestampUnit` for integer columns or a
            string with a format identifier described
            `here <https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html>`__
            for a SaaS Kumo deployment and
            `here <https://docs.snowflake.com/en/sql-reference/date-time-input-output#about-the-elements-used-in-input-and-output-formats>`__
            for a Snowpark Container Services Kumo deployment. If left empty,
            will be intelligently inferred by Kumo.
    """  # noqa: E501
    name: str
    stype: Optional[Stype] = None
    dtype: Optional[Dtype] = None
    timestamp_format: Optional[Union[str, TimestampUnit]] = None

    def __init__(
        self,
        name: str,
        stype: Optional[Union[Stype, str]] = None,
        dtype: Optional[Union[Dtype, str]] = None,
        timestamp_format: Optional[Union[str, TimestampUnit]] = None,
    ) -> None:
        self.name = name
        self.stype = None if stype is None else Stype(stype)
        self.dtype = None if dtype is None else Dtype(dtype)
        # Prefer a `TimestampUnit`; anything that is not a valid unit is
        # stored as given:
        try:
            unit = TimestampUnit(timestamp_format)
        except ValueError:
            self.timestamp_format = timestamp_format
        else:
            self.timestamp_format = unit

    def __hash__(self) -> int:
        fields = (self.name, self.stype, self.dtype, self.timestamp_format)
        return hash(fields)

    def __setattr__(self, key: Any, value: Any) -> None:
        if key == 'name':
            # The name may be set once; re-assigning the same value is fine:
            if value != getattr(self, key, value):
                raise AttributeError("Attribute 'name' is read-only")
        elif isinstance(value, str):
            # Coerce strings into their enum representation where possible:
            if key == 'stype':
                value = Stype(value)
            elif key == 'dtype':
                value = Dtype(value)
            elif key == 'timestamp_format':
                try:
                    value = TimestampUnit(value)
                except ValueError:
                    pass  # Keep the format string as given.
        super().__setattr__(key, value)

    def update(self, obj: 'Column', override: bool = True) -> 'Column':
        r"""Copies all public attributes of :obj:`obj` that are not
        :obj:`None` into this column and returns this column.

        Args:
            obj: The column to read attribute values from.
            override: If set to :obj:`False`, only attributes of this column
                that are currently :obj:`None` are filled in.
        """
        for attr in self.__dict__:
            if attr.startswith('_'):  # Skip private attributes:
                continue
            incoming = getattr(obj, attr, None)
            if incoming is None:
                continue
            if override or getattr(self, attr, None) is None:
                setattr(self, attr, incoming)
        return self

    def __repr__(self) -> str:
        parts = [
            f"name=\"{self.name}\"",
            f"stype=\"{self.stype}\"",
            f"dtype=\"{self.dtype}\"",
        ]
        if self.timestamp_format is not None:
            parts.append(f"timestamp_format=\"{self.timestamp_format}\"")
        return "Column(" + ", ".join(parts) + ")"