kumoai 2.14.0.dev202601011731__cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kumoai might be problematic. Click here for more details.

Files changed (122) hide show
  1. kumoai/__init__.py +300 -0
  2. kumoai/_logging.py +29 -0
  3. kumoai/_singleton.py +25 -0
  4. kumoai/_version.py +1 -0
  5. kumoai/artifact_export/__init__.py +9 -0
  6. kumoai/artifact_export/config.py +209 -0
  7. kumoai/artifact_export/job.py +108 -0
  8. kumoai/client/__init__.py +5 -0
  9. kumoai/client/client.py +223 -0
  10. kumoai/client/connector.py +110 -0
  11. kumoai/client/endpoints.py +150 -0
  12. kumoai/client/graph.py +120 -0
  13. kumoai/client/jobs.py +471 -0
  14. kumoai/client/online.py +78 -0
  15. kumoai/client/pquery.py +207 -0
  16. kumoai/client/rfm.py +112 -0
  17. kumoai/client/source_table.py +53 -0
  18. kumoai/client/table.py +101 -0
  19. kumoai/client/utils.py +130 -0
  20. kumoai/codegen/__init__.py +19 -0
  21. kumoai/codegen/cli.py +100 -0
  22. kumoai/codegen/context.py +16 -0
  23. kumoai/codegen/edits.py +473 -0
  24. kumoai/codegen/exceptions.py +10 -0
  25. kumoai/codegen/generate.py +222 -0
  26. kumoai/codegen/handlers/__init__.py +4 -0
  27. kumoai/codegen/handlers/connector.py +118 -0
  28. kumoai/codegen/handlers/graph.py +71 -0
  29. kumoai/codegen/handlers/pquery.py +62 -0
  30. kumoai/codegen/handlers/table.py +109 -0
  31. kumoai/codegen/handlers/utils.py +42 -0
  32. kumoai/codegen/identity.py +114 -0
  33. kumoai/codegen/loader.py +93 -0
  34. kumoai/codegen/naming.py +94 -0
  35. kumoai/codegen/registry.py +121 -0
  36. kumoai/connector/__init__.py +31 -0
  37. kumoai/connector/base.py +153 -0
  38. kumoai/connector/bigquery_connector.py +200 -0
  39. kumoai/connector/databricks_connector.py +213 -0
  40. kumoai/connector/file_upload_connector.py +189 -0
  41. kumoai/connector/glue_connector.py +150 -0
  42. kumoai/connector/s3_connector.py +278 -0
  43. kumoai/connector/snowflake_connector.py +252 -0
  44. kumoai/connector/source_table.py +471 -0
  45. kumoai/connector/utils.py +1796 -0
  46. kumoai/databricks.py +14 -0
  47. kumoai/encoder/__init__.py +4 -0
  48. kumoai/exceptions.py +26 -0
  49. kumoai/experimental/__init__.py +0 -0
  50. kumoai/experimental/rfm/__init__.py +210 -0
  51. kumoai/experimental/rfm/authenticate.py +432 -0
  52. kumoai/experimental/rfm/backend/__init__.py +0 -0
  53. kumoai/experimental/rfm/backend/local/__init__.py +42 -0
  54. kumoai/experimental/rfm/backend/local/graph_store.py +297 -0
  55. kumoai/experimental/rfm/backend/local/sampler.py +312 -0
  56. kumoai/experimental/rfm/backend/local/table.py +113 -0
  57. kumoai/experimental/rfm/backend/snow/__init__.py +37 -0
  58. kumoai/experimental/rfm/backend/snow/sampler.py +297 -0
  59. kumoai/experimental/rfm/backend/snow/table.py +242 -0
  60. kumoai/experimental/rfm/backend/sqlite/__init__.py +32 -0
  61. kumoai/experimental/rfm/backend/sqlite/sampler.py +398 -0
  62. kumoai/experimental/rfm/backend/sqlite/table.py +184 -0
  63. kumoai/experimental/rfm/base/__init__.py +30 -0
  64. kumoai/experimental/rfm/base/column.py +152 -0
  65. kumoai/experimental/rfm/base/expression.py +44 -0
  66. kumoai/experimental/rfm/base/sampler.py +761 -0
  67. kumoai/experimental/rfm/base/source.py +19 -0
  68. kumoai/experimental/rfm/base/sql_sampler.py +143 -0
  69. kumoai/experimental/rfm/base/table.py +736 -0
  70. kumoai/experimental/rfm/graph.py +1237 -0
  71. kumoai/experimental/rfm/infer/__init__.py +19 -0
  72. kumoai/experimental/rfm/infer/categorical.py +40 -0
  73. kumoai/experimental/rfm/infer/dtype.py +82 -0
  74. kumoai/experimental/rfm/infer/id.py +46 -0
  75. kumoai/experimental/rfm/infer/multicategorical.py +48 -0
  76. kumoai/experimental/rfm/infer/pkey.py +128 -0
  77. kumoai/experimental/rfm/infer/stype.py +35 -0
  78. kumoai/experimental/rfm/infer/time_col.py +61 -0
  79. kumoai/experimental/rfm/infer/timestamp.py +41 -0
  80. kumoai/experimental/rfm/pquery/__init__.py +7 -0
  81. kumoai/experimental/rfm/pquery/executor.py +102 -0
  82. kumoai/experimental/rfm/pquery/pandas_executor.py +530 -0
  83. kumoai/experimental/rfm/relbench.py +76 -0
  84. kumoai/experimental/rfm/rfm.py +1184 -0
  85. kumoai/experimental/rfm/sagemaker.py +138 -0
  86. kumoai/experimental/rfm/task_table.py +231 -0
  87. kumoai/formatting.py +30 -0
  88. kumoai/futures.py +99 -0
  89. kumoai/graph/__init__.py +12 -0
  90. kumoai/graph/column.py +106 -0
  91. kumoai/graph/graph.py +948 -0
  92. kumoai/graph/table.py +838 -0
  93. kumoai/jobs.py +80 -0
  94. kumoai/kumolib.cpython-310-x86_64-linux-gnu.so +0 -0
  95. kumoai/mixin.py +28 -0
  96. kumoai/pquery/__init__.py +25 -0
  97. kumoai/pquery/prediction_table.py +287 -0
  98. kumoai/pquery/predictive_query.py +641 -0
  99. kumoai/pquery/training_table.py +424 -0
  100. kumoai/spcs.py +121 -0
  101. kumoai/testing/__init__.py +8 -0
  102. kumoai/testing/decorators.py +57 -0
  103. kumoai/testing/snow.py +50 -0
  104. kumoai/trainer/__init__.py +42 -0
  105. kumoai/trainer/baseline_trainer.py +93 -0
  106. kumoai/trainer/config.py +2 -0
  107. kumoai/trainer/distilled_trainer.py +175 -0
  108. kumoai/trainer/job.py +1192 -0
  109. kumoai/trainer/online_serving.py +258 -0
  110. kumoai/trainer/trainer.py +475 -0
  111. kumoai/trainer/util.py +103 -0
  112. kumoai/utils/__init__.py +11 -0
  113. kumoai/utils/datasets.py +83 -0
  114. kumoai/utils/display.py +51 -0
  115. kumoai/utils/forecasting.py +209 -0
  116. kumoai/utils/progress_logger.py +343 -0
  117. kumoai/utils/sql.py +3 -0
  118. kumoai-2.14.0.dev202601011731.dist-info/METADATA +71 -0
  119. kumoai-2.14.0.dev202601011731.dist-info/RECORD +122 -0
  120. kumoai-2.14.0.dev202601011731.dist-info/WHEEL +6 -0
  121. kumoai-2.14.0.dev202601011731.dist-info/licenses/LICENSE +9 -0
  122. kumoai-2.14.0.dev202601011731.dist-info/top_level.txt +1 -0
kumoai/graph/table.py ADDED
@@ -0,0 +1,838 @@
1
+ import copy
2
+ import logging
3
+ import time
4
+ from typing import Any, Dict, List, Optional, Union
5
+
6
+ import kumoapi.data_snapshot as snapshot_api
7
+ import kumoapi.table as api
8
+ import pandas as pd
9
+ from kumoapi.common import JobStatus
10
+ from kumoapi.data_snapshot import TableSnapshotID
11
+ from kumoapi.typing import Stype
12
+ from typing_extensions import Self
13
+
14
+ from kumoai import global_state
15
+ from kumoai.client.table import TableID
16
+ from kumoai.connector import SourceColumn, SourceTable
17
+ from kumoai.graph.column import Column
18
+
19
# Module-level logger, named after this module:
logger = logging.getLogger(__name__)

# Number of seconds to sleep between two consecutive polls of a table
# snapshot while waiting for one of its stages to finish:
_DEFAULT_INTERVAL_S = 20
22
+
23
+
24
+ class Table:
25
+ r"""A Table represents metadata information for a table in a Kumo
26
+ :class:`~kumoai.graph.Graph`.
27
+
28
+ Whereas a :class:`~kumoai.connector.SourceTable` is simply a reference to a
29
+ table behind a backing :class:`~kumoai.connector.Connector`, a table fully
30
+ specifies the relevant metadata (including selected source columns, column
31
+ data type and semantic type, and relational constraint information)
32
+ necessary to train a :class:`~kumoai.pquery.PredictiveQuery` on a graph of
33
+ tables. A table can either be constructed explicitly, or with the
34
+ convenience method :meth:`~kumoai.graph.Table.from_source_table`.
35
+
36
+ .. code-block:: python
37
+
38
+ import kumoai
39
+
40
+ # Define connector to source data:
41
+ connector = kumoai.S3Connector('s3://...')
42
+
43
+ # Create table using `from_source_table`:
44
+ customer = kumoai.Table.from_source_table(
45
+ source_table=connector['customer'],
46
+ primary_key='CustomerID',
47
+ )
48
+
49
+ # Create a table by constructing it directly:
50
+ customer = kumoai.Table(
51
+ source_table=connector['customer'],
52
+ columns=[kumoai.Column(name='CustomerID', dtype='string', stype='ID')],
53
+ primary_key='CustomerID',
54
+ )
55
+
56
+ # Infer any missing metadata in the table, from source table
57
+ # properties:
58
+ print("Current metadata: ", customer.metadata)
59
+ customer.infer_metadata()
60
+
61
+ # Validate the table configuration, for use in Kumo downstream models:
62
+ customer.validate(verbose=True)
63
+
64
+ # Fetch statistics from a snapshot of this table (this method will
65
+ # take a table snapshot, and as a result may have high latency):
66
+ customer.get_stats(wait_for="minimal")
67
+
68
+ Args:
69
+ source_table: The source table this Kumo table is created from.
70
+ columns: The selected columns of the source table that are part of this
71
+ Kumo table. Note that each column must specify its data type and
72
+ semantic type; see the :class:`~kumoai.graph.Column` documentation
73
+ for more information. If `None` all columns from the
74
+ source table are included by default.
75
+ primary_key: The primary key of the table, if present. The primary key
76
+ must exist in the :obj:`columns` argument.
77
+ time_column: The time column of the table, if present. The time column
78
+ must exist in the :obj:`columns` argument.
79
+ end_time_column: The end time column of the table, if present. The end
80
+ time column must exist in the :obj:`columns` argument.
81
+ """ # noqa: E501
82
+
83
def __init__(
    self,
    source_table: SourceTable,
    columns: Optional[List[Union[SourceColumn, Column]]] = None,
    primary_key: Optional[str] = None,
    time_column: Optional[str] = None,
    end_time_column: Optional[str] = None,
) -> None:
    # Keep a handle on the raw (source) table and remember its name:
    self.source_table = source_table
    self.source_name = source_table.name

    # Column registry, keyed by column name. Features and constraint-only
    # columns are not told apart here; that decision is deferred to the
    # model plan stage (e.g. by encoding as `Null()`):
    self._columns: Dict[str, Column] = {}

    # Table-level schema, stored as column names:
    self._primary_key: Optional[str] = None
    self._time_column: Optional[str] = None
    self._end_time_column: Optional[str] = None

    # Fall back to every source column when no selection was given:
    selected = (list(source_table.column_dict.values())
                if columns is None else columns)
    for entry in (selected or []):
        if isinstance(entry, SourceColumn):
            # Promote a source column to a Kumo column:
            entry = Column(name=entry.name, stype=entry.stype,
                           dtype=entry.dtype)
        self.add_column(Column._cast(entry))

    # Route the special columns through their property setters:
    self.primary_key = Column._cast(primary_key)
    self.time_column = Column._cast(time_column)
    self.end_time_column = Column._cast(end_time_column)

    # Snapshot identifier cached from the backend. Tables themselves are
    # only persisted as part of a graph, but table snapshots are standalone
    # resources since tables can be ingested and have data fetched:
    self._table_snapshot_id: Optional[TableSnapshotID] = None
122
+
123
@staticmethod
def from_source_table(
    source_table: SourceTable,
    column_names: Optional[List[str]] = None,
    primary_key: Optional[str] = None,
    time_column: Optional[str] = None,
    end_time_column: Optional[str] = None,
) -> 'Table':
    r"""Builds a Kumo Table on top of a source table.

    Every source column is included unless :obj:`column_names` restricts
    the selection.

    Args:
        source_table: The :class:`~kumoai.connector.SourceTable` object
            that this table is constructed on.
        column_names: The names of the source columns to include; all
            columns are included if this is :obj:`None`.
        primary_key: The name of the primary key of this table, if any.
        time_column: The name of the time column of this table, if any.
        end_time_column: The name of the end time column of this table,
            if any.
    """
    selected: List[Column] = []
    for col_name, source_col in source_table.column_dict.items():
        # Skip source columns outside of an explicit selection:
        if column_names is not None and col_name not in column_names:
            continue
        selected.append(
            Column(col_name, source_col.stype, source_col.dtype))

    table = Table(source_table, selected)
    table.primary_key = Column._cast(primary_key)
    table.time_column = Column._cast(time_column)
    table.end_time_column = Column._cast(end_time_column)
    return table
156
+
157
def print_definition(self) -> None:
    r"""Prints the complete definition of this table, in a form that is
    meant to be copied and pasted to re-create the table.
    """
    def _quoted_name(col):  # type: ignore
        # Render a column as its double-quoted name, or as `None`:
        return "None" if col is None else f"\"{col.name}\""

    pkey_name = _quoted_name(self.primary_key)
    t_name = _quoted_name(self.time_column)
    et_name = _quoted_name(self.end_time_column)
    col_dict = "\n".join(f' {c},' for c in self.columns)
    source_repr = f"{self.source_table.connector}[\"{self.source_name}\"]"

    lines = [
        f'{self.__class__.__name__}(',
        f' source_table={source_repr},',
        f' primary_key={pkey_name},',
        f' time_column={t_name},',
        f' end_time_column={et_name},',
        f' columns=[\n{col_dict}',
        ' ],',
        ')',
    ]
    print("\n".join(lines))
177
+
178
+ # Data column #############################################################
179
+
180
def has_column(self, name: str) -> bool:
    r"""Tells whether a column called :obj:`name` is registered on this
    table.
    """
    present = name in self._columns
    return present
185
+
186
def column(self, name: str) -> Column:
    r"""Looks up the column called :obj:`name` on this table.

    Raises:
        :class:`KeyError`
            if :obj:`name` is not present in this table.
    """
    if self.has_column(name):
        return self._columns[name]
    raise KeyError(
        f"Column '{name}' not found in table '{self.source_name}'")
198
+
199
@property
def columns(self) -> List[Column]:
    r"""The :class:`~kumoai.Column` objects of this table, as a new list
    in registration order.
    """
    return [col for col in self._columns.values()]
205
+
206
def add_column(self, *args: Any, **kwargs: Any) -> None:
    r"""Registers a :obj:`~kumoai.graph.Column` on this table.

    The column is given either as a ready-made Column object, or through
    the arguments that describe it. If a column of the same name already
    exists, it is updated with the new column instead of being replaced.

    Example:
        >>> import kumoai
        >>> table = kumoai.Table(source_table=...)  # doctest: +SKIP
        >>> table.add_column(name='col1', dtype='string')  # doctest: +SKIP
        >>> table.add_column(kumoai.Column('col2', 'int'))  # doctest: +SKIP

    Raises:
        :class:`ValueError`
            if the arguments resolve to no column at all.

    .. # noqa: E501
    """
    new_col = Column._cast(*args, **kwargs)
    if new_col is None:
        raise ValueError("Cannot add a 'None' column to a table.")

    if not self.has_column(new_col.name):
        self._columns[new_col.name] = new_col
        return

    # Same name already registered: merge into the existing column.
    self._columns[new_col.name].update(new_col)
226
+
227
def remove_column(self, name: str) -> Self:
    r"""Drops the :obj:`~kumoai.graph.Column` called :obj:`name` from this
    table, clearing the primary key, time column or end time column first
    if the dropped column held one of these roles.

    Raises:
        :class:`KeyError`
            if :obj:`name` is not present in this table.
    """
    if not self.has_column(name):
        raise KeyError(
            f"Column '{name}' not found in table '{self.source_name}'")

    # Unset every table-level role that points at the removed column:
    is_pkey = self.has_primary_key() and self._primary_key == name
    if is_pkey:
        self.primary_key = None
    is_time = self.has_time_column() and self._time_column == name
    if is_time:
        self.time_column = None
    is_end_time = (self.has_end_time_column()
                   and self._end_time_column == name)
    if is_end_time:
        self.end_time_column = None

    self._columns.pop(name)
    return self
246
+
247
+ # Primary key #############################################################
248
+
249
def has_primary_key(self) -> bool:
    r"""Tells whether a primary key is set on this table."""
    return not (self._primary_key is None)
254
+
255
@property
def primary_key(self) -> Optional[Column]:
    r"""The primary key column of this table.

    Reading this property yields the primary key column, or :obj:`None`
    if the table has no primary key.

    Assigning to this property marks a column as the primary key, and
    raises a :class:`ValueError` if that column carries a non-ID semantic
    type.
    """
    if self.has_primary_key():
        assert self._primary_key is not None
        return self._columns[self._primary_key]
    return None
269
+
270
@primary_key.setter
def primary_key(self, *args: Any, **kwargs: Any) -> Self:
    candidate = Column._cast(*args, **kwargs)
    if candidate is None:
        # Assigning `None` clears the primary key:
        self._primary_key = None
        return self

    # An explicit semantic type is only acceptable if it is `ID`:
    if not (candidate.stype is None or candidate.stype == Stype.ID):
        raise ValueError(
            f"The semantic type of a primary key must be 'ID' (got "
            f"{candidate.stype}).")

    candidate.stype = Stype.ID
    self.add_column(candidate)
    self._primary_key = candidate.name
    return self
286
+
287
+ # Time column #############################################################
288
+
289
def has_time_column(self) -> bool:
    r"""Tells whether a time column is set on this table."""
    return not (self._time_column is None)
294
+
295
@property
def time_column(self) -> Optional[Column]:
    r"""The time column of this table.

    Reading this property yields the time column, or :obj:`None` if the
    table has no time column.

    Assigning to this property marks a column as the time column, and
    raises a :class:`ValueError` if that column is already the end time
    column, or carries a non-timestamp semantic type.
    """
    if self.has_time_column():
        assert self._time_column is not None
        return self._columns[self._time_column]
    return None
310
+
311
@time_column.setter
def time_column(self, *args: Any, **kwargs: Any) -> Self:
    candidate = Column._cast(*args, **kwargs)
    if candidate is None:
        # Assigning `None` clears the time column:
        self._time_column = None
        return self

    # The time column and the end time column must be distinct:
    clashes = (self.has_end_time_column()
               and self._end_time_column == candidate.name)
    if clashes:
        raise ValueError(f"Cannot set the time column ('{candidate.name}') "
                         f"to be the same as the end time column "
                         f"('{self._end_time_column}')")

    # An explicit semantic type is only acceptable if it is `timestamp`:
    if not (candidate.stype is None
            or candidate.stype == Stype.timestamp):
        raise ValueError(
            f"The semantic type of a time column must be 'timestamp' (got "
            f"{candidate.stype}).")

    candidate.stype = Stype.timestamp
    self.add_column(candidate)
    self._time_column = candidate.name
    return self
332
+
333
+ # End time column #########################################################
334
+
335
def has_end_time_column(self) -> bool:
    r"""Tells whether an end time column is set on this table."""
    return not (self._end_time_column is None)
340
+
341
@property
def end_time_column(self) -> Optional[Column]:
    r"""The end time column of this table.

    Reading this property yields the end time column, or :obj:`None` if
    the table has no end time column.

    Assigning to this property marks a column as the end time column, and
    raises a :class:`ValueError` if that column is already the time
    column, or carries a non-timestamp semantic type.
    """
    if self.has_end_time_column():
        assert self._end_time_column is not None
        return self._columns[self._end_time_column]
    return None
356
+
357
@end_time_column.setter
def end_time_column(self, *args: Any, **kwargs: Any) -> Self:
    candidate = Column._cast(*args, **kwargs)
    if candidate is None:
        # Assigning `None` clears the end time column:
        self._end_time_column = None
        return self

    # The end time column and the time column must be distinct:
    clashes = (self.has_time_column()
               and self._time_column == candidate.name)
    if clashes:
        raise ValueError(
            f"Cannot set the end time column ('{candidate.name}') "
            f"to be the same as the time column "
            f"('{self._time_column}')")

    # An explicit semantic type is only acceptable if it is `timestamp`:
    if not (candidate.stype is None
            or candidate.stype == Stype.timestamp):
        raise ValueError(
            f"The semantic type of an end time column must be 'timestamp' "
            f"(got {candidate.stype}).")

    candidate.stype = Stype.timestamp
    self.add_column(candidate)
    self._end_time_column = candidate.name
    return self
378
+
379
+ # Metadata ################################################################
380
+
381
@property
def metadata(self) -> pd.DataFrame:
    r"""A :class:`~pandas.DataFrame` summarizing the Kumo metadata of the
    columns in this table, with one row per column.

    The dataframe has the columns ``name``, ``dtype``, ``stype``,
    ``is_primary_key``, ``is_time_column``, and ``is_end_time_column``.
    A data type or semantic type that is not set is reported as
    :obj:`None`.

    Example:
        >>> import kumoai
        >>> table = kumoai.Table(source_table=...)  # doctest: +SKIP
        >>> table.add_column(name='CustomerID', dtype='float64', stype='ID')  # doctest: +SKIP
        >>> table.metadata  # doctest: +SKIP
                 name    dtype stype  is_primary_key  is_time_column  is_end_time_column
        0  CustomerID  float64    ID           False           False               False
    """  # noqa: E501
    col_names: List[str] = list(self._columns.keys())
    cols: List[Column] = list(self._columns.values())

    def _value_or_none(member):  # type: ignore
        # Unwrap an enum-like member to its value, keeping `None` as is:
        return member.value if member is not None else None

    def _role_flags(role_col):  # type: ignore
        # One boolean per column: does the column hold the given role?
        return pd.Series(dtype=bool, data=[role_col == c for c in cols])

    return pd.DataFrame({
        'name':
        pd.Series(dtype=str, data=col_names),
        'dtype':
        pd.Series(dtype=str, data=[_value_or_none(c.dtype) for c in cols]),
        'stype':
        pd.Series(dtype=str, data=[_value_or_none(c.stype) for c in cols]),
        'is_primary_key':
        _role_flags(self.primary_key),
        'is_time_column':
        _role_flags(self.time_column),
        'is_end_time_column':
        _role_flags(self.end_time_column),
    })
426
+
427
def infer_metadata(self, inplace: bool = True) -> Self:
    r"""Asks the Kumo backend to infer the metadata of this table's
    columns: data types, semantic types and timestamp formats, plus the
    primary key and time column if they are not set yet.

    Args:
        inplace: Whether the method should modify the table columns in
            place or return a new :class:`~kumoai.graph.Table` object.

    .. note::
        With ``inplace = True`` this table is modified and returned; with
        ``inplace = False`` a deep copy is modified and returned instead.
    """
    # stype and dtype are None to support inferral:
    col_requests: List[api.ColumnMetadataRequest] = [
        api.ColumnMetadataRequest(
            name=col.name,
            stype=None,
            dtype=None,
            timestamp_format=col.timestamp_format,
        ) for col in self.columns
    ]

    pk_name: Optional[str] = (
        self.primary_key.name  # type: ignore
        if self.has_primary_key() else None)
    tc_name: Optional[str] = (
        self.time_column.name  # type: ignore
        if self.has_time_column() else None)

    request = api.TableMetadataRequest(
        cols=col_requests,
        source_table=self.source_table._to_api_source_table(),
        pkey=pk_name,
        time_col=tc_name,
    )
    response = global_state.client.table_api.infer_metadata(request)

    # Index the inferred columns by name for the write-back below:
    inferred_cols: Dict[str, api.Column] = {}
    for inferred in response.cols:
        inferred_cols[inferred.name] = inferred

    # Handle inplace:
    out = self if inplace else copy.deepcopy(self)

    # TODO(manan): respect user overrides
    # TODO(manan): what happens when the ts format is set based on an
    # override?
    for col in out.columns:
        match = inferred_cols[col.name]
        col.dtype = match.dtype
        col.stype = match.stype
        # A timestamp format that is already set wins over the inferred one:
        col.timestamp_format = (col.timestamp_format
                                or match.timestamp_format)

    # TODO(manan): support end-time column
    if response.pkey is not None and not out.has_primary_key():
        out.primary_key = response.pkey
    if response.time_col is not None and not out.has_time_column():
        out.time_column = response.time_col

    # Override for Kumo backend, always:
    if out.has_primary_key():
        out.primary_key.stype = Stype.ID  # type: ignore
    if out.has_time_column():
        out.time_column.stype = Stype.timestamp  # type: ignore
    if out.has_end_time_column():
        out.end_time_column.stype = Stype.timestamp  # type: ignore

    return out
505
+
506
def _validate_definition(self) -> None:
    # Local (backend-free) check: every column needs both a data type and
    # a semantic type. Raises `ValueError` on the first column lacking one.
    for col in self.columns:
        if col.dtype is not None and col.stype is not None:
            continue
        raise ValueError(
            f"Column {col.name} is not fully specified. Please "
            f"specify this column's data type and semantic type "
            f"before proceeding. {col.name} currently has a "
            f"data type of {col.dtype} and semantic type of "
            f"{col.stype}.")
515
+
516
def validate(self, verbose: bool = True) -> Self:
    r"""Checks that this table carries all the metadata required to use
    it in a downstream :class:`~kumoai.graph.Graph` and
    :class:`~kumoai.pquery.PredictiveQuery`.

    Concretely, validation ensures that all columns have valid data and
    semantic types, with respect to the table's source data. For example,
    if a text column is assigned a ``dtype`` of ``"int"``, this method
    will raise an exception detailing the mismatch. Similarly, if a column
    cannot be cast from its source data type to the specified data type
    (*e.g.* ``"int"`` to ``"binary"``), this method will raise an
    exception.

    .. warning::
        Data type validation is performed on a sample of table data. A
        valid response may not indicate your entire data source is
        configured correctly.

    Args:
        verbose: Whether to log non-error output of this validation.

    Example:
        >>> import kumoai
        >>> table = kumoai.Table(...)  # doctest: +SKIP
        >>> table.validate()  # doctest: +SKIP

    Raises:
        ValueError:
            if validation fails.
    """
    # Cheap local check first: dtype and stype must be set everywhere.
    self._validate_definition()

    # Actual heavy lifting, performed by the backend:
    validate_table = global_state.client.table_api.validate_table
    resp = validate_table(
        api.TableValidationRequest(self._to_api_table_definition()))
    if not resp.ok:
        raise ValueError(resp.error_message())

    if not verbose:
        return self

    if resp.empty():
        logger.info("Table %s is configured correctly.", self.source_name)
    else:
        logger.warning(resp.message())
    return self
560
+
561
+ # Snapshot ################################################################
562
+
563
@property
def snapshot_id(self) -> Optional[snapshot_api.TableSnapshotID]:
    r"""The identifier of the snapshot taken of this table, or :obj:`None`
    if no snapshot has been taken.

    .. warning::
        This property currently only returns a snapshot ID if a snapshot
        has been taken *in this session.*
    """
    cached_id = self._table_snapshot_id
    return cached_id
573
+
574
def snapshot(
    self,
    *,
    force_refresh: bool = False,
    non_blocking: bool = False,
) -> snapshot_api.TableSnapshotID:
    r"""Takes a *snapshot* of this table's underlying data, and returns a
    unique identifier for this snapshot.

    The *snapshot* functionality allows one to freeze a table in time, so
    that underlying data changes do not require Kumo to re-process the
    data. This allows for fast iterative machine learning model
    development, on a consistent set of input data.

    .. warning::
        Please note that snapshots are intended to freeze tables in
        time, and not to allow for "time-traveling" to an earlier version
        of data with a prior snapshot. In particular, this means that a
        table can only have one version of a snapshot, which represents
        the latest snapshot taken for that table.

    .. note::
        If you are using Kumo as a Snowpark Container Services native
        application, please note that *snapshot* is a no-op for all
        non-view tables.

    Args:
        force_refresh: Indicates whether a snapshot should be taken, if one
            already exists in Kumo. If :obj:`False`, a previously existing
            snapshot may be re-used. If :obj:`True`, a new snapshot is
            always taken.
        non_blocking: Whether this operation should return immediately
            after creating the snapshot, or await completion of the
            snapshot. If :obj:`True`, the snapshot will proceed in the
            background, and will be used for any downstream job.

    Raises:
        RuntimeError:
            if the call is blocking and the ingestion stage of the
            snapshot ends in a failed state.
    """
    if self._table_snapshot_id is None or force_refresh:
        self._table_snapshot_id = (
            global_state.client.table_api.create_snapshot(
                table_definition=self._to_api_table_definition(),
                refresh_source=True,
            ))

    stage = snapshot_api.TableSnapshotStage.INGEST
    resource: snapshot_api.TableSnapshotResource = (
        global_state.client.table_api.get_snapshot(
            snapshot_id=self._table_snapshot_id))

    if not non_blocking:
        status = resource.stages[stage].status
        while not status.is_terminal:
            # TODO(manan, siyang): fix start and end time
            logger.info(
                "Awaiting snapshot completion: current status is %s ",
                status)
            # Sleep first and fetch afterwards, so that the loop condition
            # is always evaluated on a fresh status and the loop exits as
            # soon as a terminal status has been observed:
            time.sleep(_DEFAULT_INTERVAL_S)
            resource = (global_state.client.table_api.get_snapshot(
                snapshot_id=self._table_snapshot_id))
            status = resource.stages[stage].status

        state = resource.stages[stage]
        status = state.status
        # Number the warnings starting from 1 for human consumption:
        warnings = "\n".join([
            f"{i}. {message}"
            for i, message in enumerate(state.warnings, start=1)
        ])
        error = state.error
        if status == JobStatus.FAILED:
            raise RuntimeError(
                f"Table snapshot with identifier "
                f"{self._table_snapshot_id} failed, with error "
                f"{error} and warnings {warnings}")
        if len(state.warnings) > 0:
            logger.warning(
                "Table snapshot completed with the following "
                "warnings: %s", warnings)

    # <prefix>@<data_version>
    assert self._table_snapshot_id is not None
    return self._table_snapshot_id
654
+
655
def get_stats(
    self,
    wait_for: Optional[str] = None,
) -> Dict[str, Dict[str, Any]]:
    r"""Returns all currently computed statistics on the latest snapshot of
    this table. If a snapshot on this table has not been taken, this method
    will take a snapshot.

    .. note::
        Table statistics are computed in multiple stages after ingestion is
        complete. These stages are called *minimal* and *full*; minimal
        statistics are always computed before full statistics.

    Args:
        wait_for: Whether this operation should block on the availability
            of statistics. This argument can take one of three values:
            :obj:`None`, which indicates that the method should return
            immediately with whatever statistics are present,
            :obj:`"minimal"`, which indicates that the method should return
            when the minimum, maximum, and fraction of NA values
            statistics are present, or :obj:`"full"`, which indicates that
            the method should return when all computed statistics are
            present.

    Returns:
        A dictionary mapping each column name to the statistics reported
        for that column; empty if no statistics are available.

    Raises:
        ValueError:
            if :obj:`wait_for` is not one of the supported values.
    """
    # Validate eagerly with a real exception; an `assert` would be
    # stripped when Python runs with optimizations enabled:
    if wait_for is not None and wait_for not in {"minimal", "full"}:
        raise ValueError(
            f"'wait_for' must be one of None, 'minimal' or 'full' "
            f"(got {wait_for!r}).")

    # Attempt to snapshot, use cached snapshot if possible:
    if not self._table_snapshot_id:
        self.snapshot(force_refresh=False, non_blocking=False)
    assert self._table_snapshot_id is not None

    # Fetch resource:
    resource: snapshot_api.TableSnapshotResource = (
        global_state.client.table_api.get_snapshot(
            snapshot_id=self._table_snapshot_id))

    # Wait for a stage, if we need to:
    if wait_for:
        if wait_for == "minimal":
            stage = snapshot_api.TableSnapshotStage.MIN_COL_STATS
        else:
            stage = snapshot_api.TableSnapshotStage.FULL_COL_STATS

        status = resource.stages[stage].status
        while not status.is_terminal:
            logger.info(
                "Awaiting %s column statistics: current status is %s ",
                wait_for, status)
            # Sleep first and fetch afterwards, so that the loop condition
            # is always evaluated on a fresh status and the loop exits as
            # soon as a terminal status has been observed:
            time.sleep(_DEFAULT_INTERVAL_S)
            resource = (global_state.client.table_api.get_snapshot(
                snapshot_id=self._table_snapshot_id))
            status = resource.stages[stage].status

    # Write out statistics:
    out: Dict[str, Dict[str, Any]] = {}
    for stat in (resource.column_stats or []):
        out[stat.column_name] = stat.stats
    return out
714
+
715
+ # Persistence #############################################################
716
+
717
def _to_api_table_definition(self) -> api.TableDefinition:
    r"""Converts this table into an :obj:`api.TableDefinition`, carrying
    over its columns, its source table, and the names of its primary key,
    time column and end time column (:obj:`None` where the table does not
    have one).
    """
    # TODO(manan): type narrowing?
    pk_name: Optional[str] = (
        self.primary_key.name  # type: ignore
        if self.has_primary_key() else None)
    tc_name: Optional[str] = (
        self.time_column.name  # type: ignore
        if self.has_time_column() else None)
    etc_name: Optional[str] = (
        self.end_time_column.name  # type: ignore
        if self.has_end_time_column() else None)

    api_columns = []
    for column in self.columns:
        api_columns.append(
            api.Column(column.name, column.stype, column.dtype,
                       column.timestamp_format))
    api_source_table = self.source_table._to_api_source_table()

    return api.TableDefinition(
        cols=api_columns,
        source_table=api_source_table,
        pkey=pk_name,
        time_col=tc_name,
        end_time_col=etc_name,
    )
741
+
742
@staticmethod
def _from_api_table_definition(
        table_definition: api.TableDefinition) -> 'Table':
    r"""Builds a :class:`Table` from an :obj:`api.TableDefinition`, using
    the definition's columns, primary key, time column and end time column,
    and a source table derived from the same definition.
    """
    source_table = SourceTable._from_api_table_definition(table_definition)

    columns = []
    for api_col in table_definition.cols:
        columns.append(
            Column(api_col.name, api_col.stype, api_col.dtype,
                   api_col.timestamp_format))

    return Table(
        source_table=source_table,
        columns=columns,
        primary_key=table_definition.pkey,
        time_column=table_definition.time_col,
        end_time_column=table_definition.end_time_col,
    )
756
+
757
def save(self, name: Optional[str] = None) -> Union[TableID, str]:
    r"""Associates this table with a unique name, that can later be
    used to fetch the table either in the Kumo UI or in the Kumo SDK
    with method :meth:`~kumoai.Table.load`.

    The table is validated before it is saved. If ``name`` already refers
    to an existing table template, a warning is logged before the template
    is overridden.

    Args:
        name: The name to associate with this table definition. If the
            name is already associated with another table, that table will
            be overridden.

    Returns:
        The identifier of the saved table, of the form ``table-<suffix>``.

    Example:
        >>> import kumoai
        >>> table = kumoai.Table(...)  # doctest: +SKIP
        >>> unique_id = table.save()  # doctest: +SKIP
        >>> loaded = kumoai.Table.load(unique_id)  # doctest: +SKIP
        >>> name = table.save("name")  # doctest: +SKIP
        >>> loaded = kumoai.Table.load("name")  # doctest: +SKIP
    """
    self.validate(verbose=False)

    # Only look up an existing template when a name was supplied:
    existing = None
    if name:
        existing = global_state.client.table_api.get_table_if_exists(
            table_id_or_name=name)

    if existing is not None:
        previous = self._from_api_table_definition(existing.table)
        logger.warning(
            ("Table template %s already exists, with configuration %s. "
             "This template will be overridden with configuration %s."),
            name, str(previous), str(self))

    # TODO(manan): fix
    created_id = global_state.client.table_api.create_table(
        table_def=self._to_api_table_definition(),
        name_alias=name,
        force_rename=bool(name),
    )
    # Keep everything after the first dash and re-prefix with ``table-``:
    suffix = created_id.split('-', maxsplit=1)[1]
    return f"table-{suffix}"
793
+
794
@classmethod
def load(cls, table_id_or_template: str) -> 'Table':
    r"""Loads a table from either a table ID or a named template. Returns a
    :class:`Table` object that contains the loaded table along with its
    columns, etc.

    Raises:
        ValueError: If no table is found for ``table_id_or_template``.
    """
    found = global_state.client.table_api.get_table_if_exists(
        table_id_or_template)
    if found:
        return cls._from_api_table_definition(found.table)
    raise ValueError(f"Table {table_id_or_template} was not found.")
806
+
807
+ # Class properties ########################################################
808
+
809
def __hash__(self) -> int:
    r"""Hashes the table's columns together with its primary key, time
    column and end time column.
    """
    special_columns = [
        self.primary_key,
        self.time_column,
        self.end_time_column,
    ]
    members = self.columns + special_columns
    return hash(tuple(members))
813
+
814
def __contains__(self, name: str) -> bool:
    r"""Supports ``name in table`` by delegating to :meth:`has_column`."""
    return self.has_column(name)
816
+
817
def __getitem__(self, name: str) -> Column:
    r"""Supports ``table[name]`` by delegating to :meth:`column`."""
    return self.column(name)
819
+
820
def __delitem__(self, name: str) -> None:
    r"""Supports ``del table[name]`` by delegating to
    :meth:`remove_column`.
    """
    self.remove_column(name)
822
+
823
def __repr__(self) -> str:
    r"""Returns a multi-line summary of this table: its source name, the
    name of its connector, its column names, and the names of its primary
    key, time column and end time column (``None`` where unset).
    """
    # Render the column names as a list without quotes around each name:
    rendered_columns = str(list(self._columns.keys()))
    col_names = rendered_columns.replace("'", "")

    if self.primary_key is not None:
        pkey_name = self.primary_key.name
    else:
        pkey_name = "None"

    if self.time_column is not None:
        t_name = self.time_column.name
    else:
        t_name = "None"

    if self.end_time_column is not None:
        et_name = self.end_time_column.name
    else:
        et_name = "None"

    header = f'{self.__class__.__name__}(\n'
    body = (f' source_name={self.source_name},\n'
            f' data_source={self.source_table.connector.name},\n'
            f' columns={col_names},\n'
            f' primary_key={pkey_name},\n'
            f' time_column={t_name},\n'
            f' end_time_column={et_name},\n')
    return header + body + ')'