kumoai 2.14.0.dev202601011731__cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of kumoai might be problematic.

Files changed (122)
  1. kumoai/__init__.py +300 -0
  2. kumoai/_logging.py +29 -0
  3. kumoai/_singleton.py +25 -0
  4. kumoai/_version.py +1 -0
  5. kumoai/artifact_export/__init__.py +9 -0
  6. kumoai/artifact_export/config.py +209 -0
  7. kumoai/artifact_export/job.py +108 -0
  8. kumoai/client/__init__.py +5 -0
  9. kumoai/client/client.py +223 -0
  10. kumoai/client/connector.py +110 -0
  11. kumoai/client/endpoints.py +150 -0
  12. kumoai/client/graph.py +120 -0
  13. kumoai/client/jobs.py +471 -0
  14. kumoai/client/online.py +78 -0
  15. kumoai/client/pquery.py +207 -0
  16. kumoai/client/rfm.py +112 -0
  17. kumoai/client/source_table.py +53 -0
  18. kumoai/client/table.py +101 -0
  19. kumoai/client/utils.py +130 -0
  20. kumoai/codegen/__init__.py +19 -0
  21. kumoai/codegen/cli.py +100 -0
  22. kumoai/codegen/context.py +16 -0
  23. kumoai/codegen/edits.py +473 -0
  24. kumoai/codegen/exceptions.py +10 -0
  25. kumoai/codegen/generate.py +222 -0
  26. kumoai/codegen/handlers/__init__.py +4 -0
  27. kumoai/codegen/handlers/connector.py +118 -0
  28. kumoai/codegen/handlers/graph.py +71 -0
  29. kumoai/codegen/handlers/pquery.py +62 -0
  30. kumoai/codegen/handlers/table.py +109 -0
  31. kumoai/codegen/handlers/utils.py +42 -0
  32. kumoai/codegen/identity.py +114 -0
  33. kumoai/codegen/loader.py +93 -0
  34. kumoai/codegen/naming.py +94 -0
  35. kumoai/codegen/registry.py +121 -0
  36. kumoai/connector/__init__.py +31 -0
  37. kumoai/connector/base.py +153 -0
  38. kumoai/connector/bigquery_connector.py +200 -0
  39. kumoai/connector/databricks_connector.py +213 -0
  40. kumoai/connector/file_upload_connector.py +189 -0
  41. kumoai/connector/glue_connector.py +150 -0
  42. kumoai/connector/s3_connector.py +278 -0
  43. kumoai/connector/snowflake_connector.py +252 -0
  44. kumoai/connector/source_table.py +471 -0
  45. kumoai/connector/utils.py +1796 -0
  46. kumoai/databricks.py +14 -0
  47. kumoai/encoder/__init__.py +4 -0
  48. kumoai/exceptions.py +26 -0
  49. kumoai/experimental/__init__.py +0 -0
  50. kumoai/experimental/rfm/__init__.py +210 -0
  51. kumoai/experimental/rfm/authenticate.py +432 -0
  52. kumoai/experimental/rfm/backend/__init__.py +0 -0
  53. kumoai/experimental/rfm/backend/local/__init__.py +42 -0
  54. kumoai/experimental/rfm/backend/local/graph_store.py +297 -0
  55. kumoai/experimental/rfm/backend/local/sampler.py +312 -0
  56. kumoai/experimental/rfm/backend/local/table.py +113 -0
  57. kumoai/experimental/rfm/backend/snow/__init__.py +37 -0
  58. kumoai/experimental/rfm/backend/snow/sampler.py +297 -0
  59. kumoai/experimental/rfm/backend/snow/table.py +242 -0
  60. kumoai/experimental/rfm/backend/sqlite/__init__.py +32 -0
  61. kumoai/experimental/rfm/backend/sqlite/sampler.py +398 -0
  62. kumoai/experimental/rfm/backend/sqlite/table.py +184 -0
  63. kumoai/experimental/rfm/base/__init__.py +30 -0
  64. kumoai/experimental/rfm/base/column.py +152 -0
  65. kumoai/experimental/rfm/base/expression.py +44 -0
  66. kumoai/experimental/rfm/base/sampler.py +761 -0
  67. kumoai/experimental/rfm/base/source.py +19 -0
  68. kumoai/experimental/rfm/base/sql_sampler.py +143 -0
  69. kumoai/experimental/rfm/base/table.py +736 -0
  70. kumoai/experimental/rfm/graph.py +1237 -0
  71. kumoai/experimental/rfm/infer/__init__.py +19 -0
  72. kumoai/experimental/rfm/infer/categorical.py +40 -0
  73. kumoai/experimental/rfm/infer/dtype.py +82 -0
  74. kumoai/experimental/rfm/infer/id.py +46 -0
  75. kumoai/experimental/rfm/infer/multicategorical.py +48 -0
  76. kumoai/experimental/rfm/infer/pkey.py +128 -0
  77. kumoai/experimental/rfm/infer/stype.py +35 -0
  78. kumoai/experimental/rfm/infer/time_col.py +61 -0
  79. kumoai/experimental/rfm/infer/timestamp.py +41 -0
  80. kumoai/experimental/rfm/pquery/__init__.py +7 -0
  81. kumoai/experimental/rfm/pquery/executor.py +102 -0
  82. kumoai/experimental/rfm/pquery/pandas_executor.py +530 -0
  83. kumoai/experimental/rfm/relbench.py +76 -0
  84. kumoai/experimental/rfm/rfm.py +1184 -0
  85. kumoai/experimental/rfm/sagemaker.py +138 -0
  86. kumoai/experimental/rfm/task_table.py +231 -0
  87. kumoai/formatting.py +30 -0
  88. kumoai/futures.py +99 -0
  89. kumoai/graph/__init__.py +12 -0
  90. kumoai/graph/column.py +106 -0
  91. kumoai/graph/graph.py +948 -0
  92. kumoai/graph/table.py +838 -0
  93. kumoai/jobs.py +80 -0
  94. kumoai/kumolib.cpython-310-x86_64-linux-gnu.so +0 -0
  95. kumoai/mixin.py +28 -0
  96. kumoai/pquery/__init__.py +25 -0
  97. kumoai/pquery/prediction_table.py +287 -0
  98. kumoai/pquery/predictive_query.py +641 -0
  99. kumoai/pquery/training_table.py +424 -0
  100. kumoai/spcs.py +121 -0
  101. kumoai/testing/__init__.py +8 -0
  102. kumoai/testing/decorators.py +57 -0
  103. kumoai/testing/snow.py +50 -0
  104. kumoai/trainer/__init__.py +42 -0
  105. kumoai/trainer/baseline_trainer.py +93 -0
  106. kumoai/trainer/config.py +2 -0
  107. kumoai/trainer/distilled_trainer.py +175 -0
  108. kumoai/trainer/job.py +1192 -0
  109. kumoai/trainer/online_serving.py +258 -0
  110. kumoai/trainer/trainer.py +475 -0
  111. kumoai/trainer/util.py +103 -0
  112. kumoai/utils/__init__.py +11 -0
  113. kumoai/utils/datasets.py +83 -0
  114. kumoai/utils/display.py +51 -0
  115. kumoai/utils/forecasting.py +209 -0
  116. kumoai/utils/progress_logger.py +343 -0
  117. kumoai/utils/sql.py +3 -0
  118. kumoai-2.14.0.dev202601011731.dist-info/METADATA +71 -0
  119. kumoai-2.14.0.dev202601011731.dist-info/RECORD +122 -0
  120. kumoai-2.14.0.dev202601011731.dist-info/WHEEL +6 -0
  121. kumoai-2.14.0.dev202601011731.dist-info/licenses/LICENSE +9 -0
  122. kumoai-2.14.0.dev202601011731.dist-info/top_level.txt +1 -0
kumoai/jobs.py ADDED
@@ -0,0 +1,80 @@
+from abc import ABC, abstractmethod
+from typing import Generic, Mapping, Optional, TypeVar
+
+from kumoapi.jobs import JobStatusReport
+from typing_extensions import Self
+
+from kumoai.client.jobs import CommonJobAPI, JobRequestType, JobResourceType
+
+IDType = TypeVar('IDType', bound=str)
+
+
+class JobInterface(ABC, Generic[IDType, JobRequestType, JobResourceType]):
+    r"""Defines a standard interface for job objects."""
+    @staticmethod
+    @abstractmethod
+    def _api() -> CommonJobAPI[JobRequestType, JobResourceType]:
+        pass
+
+    @classmethod
+    def search_by_tags(cls, tags: Mapping[str, str],
+                       limit: int = 10) -> list[Self]:
+        r"""Returns a list of job instances from a set of job tags.
+
+        Args:
+            tags (Mapping[str, str]): Tags by which to search.
+            limit (int): Max number of jobs to list, default 10.
+
+        Example:
+            >>> # doctest: +SKIP
+            >>> tags = {'pquery_name': 'my_pquery_name'}
+            >>> jobs = BatchPredictionJob.search_by_tags(tags)
+            Search limited to 10 results based on the `limit` parameter.
+            Found 2 jobs.
+        """
+        print(f"Search limited to {limit} results based on the `limit` "
+              "parameter.")
+
+        jobs = cls._api().list(limit=limit, additional_tags=tags)
+
+        print(f"Found {len(jobs)} jobs.")
+
+        return [cls(j.job_id) for j in jobs]  # type: ignore
+
+    @property
+    @abstractmethod
+    def id(self) -> IDType:
+        pass
+
+    @abstractmethod
+    def status(self) -> JobStatusReport:
+        pass
+
+    def get_tags(self) -> dict[str, str]:
+        r"""Returns the tags of the job."""
+        return self._api().get(self.id).tags
+
+    def delete_tags(self, tags: list[str]) -> bool:
+        r"""Removes the tags from the job.
+
+        Args:
+            tags (list[str]): The tags to remove.
+        """
+        return self._api().delete_tags(self.id, tags)
+
+    def update_tags(self, tags: Mapping[str, Optional[str]]) -> bool:
+        r"""Updates the tags of the job.
+
+        Args:
+            tags (Mapping[str, Optional[str]]): The tags to update.
+                Note that the value ``None`` will remove the tag. If the
+                tag is not present, it will be added.
+        """
+        return self._api().update_tags(self.id, tags)
+
+    @abstractmethod
+    def load_config(self) -> JobRequestType:
+        pass
+
+    def __repr__(self) -> str:
+        return f'{self.__class__.__name__}(job_id={self.id})'
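
Taken together, `JobInterface` gives every concrete job class in the SDK the same tag and status surface. The following usage sketch is illustrative only: it assumes an already-initialized kumoai session and at least one existing job tagged with `pquery_name`, and it uses `PredictionTableJob` from this wheel, which subclasses `JobInterface` as shown further below.

    from kumoai.pquery import PredictionTableJob

    # Any JobInterface subclass exposes the same helpers:
    jobs = PredictionTableJob.search_by_tags({'pquery_name': 'my_pquery_name'})

    for job in jobs:
        print(job.id, job.status())   # job ID and its JobStatusReport
        print(job.get_tags())         # current tag mapping

    # Add or overwrite a tag, then remove it again:
    job = jobs[0]
    job.update_tags({'owner': 'data-team'})  # a value of None removes a tag
    job.delete_tags(['owner'])
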
kumoai/mixin.py ADDED
@@ -0,0 +1,28 @@
+import dataclasses
+from typing import Any, Optional, Type, TypeVar
+
+T = TypeVar('T')
+
+
+class CastMixin:
+    @classmethod
+    def _cast(
+        cls: Type[T],
+        *args: Any,
+        **kwargs: Any,
+    ) -> Optional[T]:
+        # TODO clean up type hints
+        # TODO can we apply this recursively?
+        if len(args) == 1 and len(kwargs) == 0:
+            elem = args[0]
+            if elem is None:
+                return None
+            if isinstance(elem, cls):
+                return elem
+            if isinstance(elem, (tuple, list)):
+                return cls(*elem)
+            if isinstance(elem, dict):
+                return cls(**elem)
+            if dataclasses.is_dataclass(elem):
+                return cls(**dataclasses.asdict(elem))  # type: ignore
+        return cls(*args, **kwargs)
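
`CastMixin._cast` is a small coercion helper: given a single positional argument it passes `None` through, returns instances of the class unchanged, unpacks tuples and lists as positional arguments, dicts as keyword arguments, and other dataclasses via `dataclasses.asdict`; anything else falls back to the class constructor. A minimal sketch, using a made-up `Window` dataclass that is not part of the package:

    from dataclasses import dataclass

    from kumoai.mixin import CastMixin

    @dataclass
    class Window(CastMixin):
        start: int
        end: int

    Window._cast(None)                    # -> None
    Window._cast(Window(0, 7))            # returns the instance unchanged
    Window._cast((0, 7))                  # tuple -> Window(0, 7)
    Window._cast({'start': 0, 'end': 7})  # dict -> keyword arguments
    Window._cast(0, 7)                    # fallback: Window(*args, **kwargs)
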
kumoai/pquery/__init__.py ADDED
@@ -0,0 +1,25 @@
+from .predictive_query import PredictiveQuery
+from .training_table import (
+    TrainingTable,
+    TrainingTableJob,
+)
+from .prediction_table import (
+    PredictionTable,
+    PredictionTableJob,
+)
+from kumoapi.model_plan import (
+    TrainingTableGenerationPlan,
+    PredictionTableGenerationPlan,
+    RunMode,
+)
+
+__all__ = [
+    'RunMode',
+    'PredictiveQuery',
+    'TrainingTableGenerationPlan',
+    'PredictionTableGenerationPlan',
+    'TrainingTable',
+    'TrainingTableJob',
+    'PredictionTable',
+    'PredictionTableJob',
+]
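
These re-exports make `kumoai.pquery` the intended import path for the query and table types, e.g.:

    from kumoai.pquery import (
        PredictionTable,
        PredictionTableJob,
        PredictiveQuery,
        RunMode,
    )
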
kumoai/pquery/prediction_table.py ADDED
@@ -0,0 +1,287 @@
+from __future__ import annotations
+
+import asyncio
+import logging
+from concurrent.futures import Future
+from datetime import datetime
+from functools import cached_property
+from typing import List, Optional, Union
+
+import pandas as pd
+from kumoapi.common import JobStatus
+from kumoapi.jobs import (
+    GeneratePredictionTableJobResource,
+    GeneratePredictionTableRequest,
+    JobStatusReport,
+)
+from typing_extensions import override
+
+from kumoai import global_state
+from kumoai.client.jobs import (
+    GeneratePredictionTableJobAPI,
+    GeneratePredictionTableJobID,
+)
+from kumoai.connector.s3_connector import S3URI
+from kumoai.formatting import pretty_print_error_details
+from kumoai.futures import KumoFuture, create_future
+from kumoai.jobs import JobInterface
+
+logger = logging.getLogger(__name__)
+
+_DEFAULT_INTERVAL_S = 20
+
+
+class PredictionTable:
+    r"""A prediction table in the Kumo platform. A prediction table can
+    either be initialized from a job ID of a completed prediction table
+    generation job, or a path on a supported object store (S3 for a SaaS or
+    Databricks deployment, and Snowflake session storage for Snowflake).
+
+    .. warning::
+        Custom prediction table is an experimental feature; please work
+        with your Kumo POC to ensure you are using it correctly!
+
+
+    .. code-block:: python
+
+        import kumoai
+
+        # Create a Prediction Table from a prediction table generation job.
+        # Note that the job ID passed here must be in a completed state:
+        prediction_table = kumoai.PredictionTable("gen-predtable-job-...")
+
+        # Read the prediction table as a Pandas DataFrame:
+        prediction_df = prediction_table.data_df()
+
+        # Get URLs to download the prediction table:
+        prediction_download_urls = prediction_table.data_urls()
+
+    Args:
+        job_id: ID of the prediction table generation job which
+            generated this prediction table. If a custom table data path is
+            specified, this parameter should be left as ``None``.
+        table_data_path: S3 path of the table data location, for which Kumo
+            must at least have read access. If a job ID is specified, this
+            parameter should be left as ``None``.
+    """
+    def __init__(
+        self,
+        job_id: Optional[GeneratePredictionTableJobID] = None,
+        table_data_path: Optional[str] = None,
+    ) -> None:
+        # Validation:
+        if not (job_id or table_data_path):
+            raise ValueError(
+                "A PredictionTable must either be initialized with a table "
+                "data path, or a job ID of a completed prediction table "
+                "generation job.")
+        if job_id and table_data_path:
+            raise ValueError(
+                "Please either pass a table data path, or a job ID of a "
+                "completed prediction table generation job; passing both "
+                "is not allowed.")
+
+        # Custom path:
+        self.table_data_uri: Optional[Union[str, S3URI]] = None
+        if table_data_path is not None:
+            if table_data_path.startswith('dbfs:/'):
+                raise ValueError(
+                    "Files from Databricks UC Volumes are not supported")
+            if global_state.is_spcs:
+                if table_data_path.startswith('s3://'):
+                    raise ValueError(
+                        "SPCS does not support S3 paths for prediction tables."
+                    )
+                # TODO(zeyuan): support custom stage path on SPCS:
+                self.table_data_uri = table_data_path
+            else:
+                self.table_data_uri = S3URI(table_data_path).validate()
+
+        # Job ID:
+        self.job_id = job_id
+        if job_id:
+            status = _get_status(job_id).status
+            if status != JobStatus.DONE:
+                raise ValueError(
+                    f"Job {job_id} is not yet complete (status: {status}). If "
+                    f"you would like to create a future (waiting for "
+                    f"prediction table generation success), please use "
+                    f"`PredictionTableJob`.")
+
+    def data_urls(self) -> List[str]:
+        r"""Returns a list of URLs that can be used to view generated
+        prediction table data; if a custom data path was passed, this path is
+        simply returned.
+
+        The list will contain more than one element if the table is
+        partitioned; paths will be relative to the location of the Kumo data
+        plane.
+        """
+        api = global_state.client.generate_prediction_table_job_api
+        if not self.job_id:
+            # Custom prediction table:
+            if global_state.is_spcs:
+                assert isinstance(self.table_data_uri, str)
+                return [self.table_data_uri]
+            else:
+                assert isinstance(self.table_data_uri, S3URI)
+                return [self.table_data_uri.uri]
+        return api.get_table_data(self.job_id, presigned=True)
+
+    def data_df(self) -> pd.DataFrame:
+        r"""Returns a Pandas DataFrame object representing the generated
+        or custom-specified prediction table data.
+
+        .. warning::
+
+            This method will load the full prediction table into memory as a
+            :class:`~pandas.DataFrame` object. If you are working on a machine
+            with limited resources, please use
+            :meth:`~kumoai.pquery.PredictionTable.data_urls` instead to
+            download the data and perform analysis per-partition.
+        """
+        if global_state.is_spcs:
+            from kumoai.spcs import _parquet_dataset_to_df
+
+            # TODO(dm): return type hint is wrong
+            return _parquet_dataset_to_df(self.data_urls())
+        else:
+            urls = self.data_urls()
+            try:
+                return pd.concat([pd.read_parquet(x) for x in urls])
+            except Exception as e:
+                raise ValueError(
+                    f"Could not create a Pandas DataFrame object from data "
+                    f"paths {urls}. Please construct the DataFrame manually."
+                ) from e
+
+    @property
+    def anchor_time(self) -> Optional[datetime]:
+        r"""Returns the anchor time corresponding to the generated prediction
+        table data, if the data was not custom-specified.
+        """
+        if self.job_id is None:
+            logger.warning(
+                "Fetching the anchor time is not supported for a custom "
+                "prediction table (path: %s)", self.table_data_uri)
+            return None
+        api = global_state.client.generate_prediction_table_job_api
+        return api.get_anchor_time(self.job_id)
+
+
+# Prediction Table Future #####################################################
+
+
+class PredictionTableJob(JobInterface[GeneratePredictionTableJobID,
+                                      GeneratePredictionTableRequest,
+                                      GeneratePredictionTableJobResource],
+                         KumoFuture[PredictionTable]):
+    r"""A representation of an ongoing prediction table generation job in the
+    Kumo platform.
+
+    .. code-block:: python
+
+        import kumoai
+
+        # See `PredictiveQuery` documentation:
+        pquery = kumoai.PredictiveQuery(...)
+
+        # If a prediction table is generated in nonblocking mode, the response
+        # will be of type `PredictionTableJob`:
+        prediction_table_job = pquery.generate_prediction_table(non_blocking=True)
+
+        # You can also construct a `PredictionTableJob` from a job ID, e.g.
+        # one that is present in the Kumo Jobs page:
+        prediction_table_job = kumoai.PredictionTableJob("gen-predtable-job-...")
+
+        # Get the status of the job:
+        print(prediction_table_job.status())
+
+        # Cancel the job:
+        prediction_table_job.cancel()
+
+        # Wait for the job to complete, and return a `PredictionTable`:
+        prediction_table_job.result()
+
+    Args:
+        job_id: ID of the prediction table generation job.
+    """  # noqa
+
+    @override
+    @staticmethod
+    def _api() -> GeneratePredictionTableJobAPI:
+        return global_state.client.generate_prediction_table_job_api
+
+    def __init__(
+        self,
+        job_id: GeneratePredictionTableJobID,
+    ) -> None:
+        self.job_id = job_id
+        self.job: Optional[GeneratePredictionTableJobResource] = None
+
+    @cached_property
+    def _fut(self) -> Future:
+        return create_future(self._poll())
+
+    @override
+    @property
+    def id(self) -> GeneratePredictionTableJobID:
+        r"""The unique ID of this prediction table generation process."""
+        return self.job_id
+
+    @override
+    def result(self) -> PredictionTable:
+        return self._fut.result()
+
+    @override
+    def future(self) -> Future[PredictionTable]:
+        return self._fut
+
+    @override
+    def status(self) -> JobStatusReport:
+        r"""Returns the status of a running prediction table generation job."""
+        return self._poll_job().job_status_report
+
+    def cancel(self) -> None:
+        r"""Cancels a running prediction table generation job, and raises an
+        error if cancellation failed.
+        """
+        return self._api().cancel(self.job_id)
+
+    # TODO(manan): make asynchronous natively with aiohttp:
+    def _poll_job(self) -> GeneratePredictionTableJobResource:
+        # Skip polling if job is already in terminal state.
+        if not self.job or not self.job.job_status_report.status.is_terminal:
+            self.job = self._api().get(self.job_id)
+        return self.job
+
+    async def _poll(self) -> PredictionTable:
+        while not self.status().status.is_terminal:
+            await asyncio.sleep(_DEFAULT_INTERVAL_S)
+        status = self.status().status
+        if status != JobStatus.DONE:
+            error_details = self._api().get_job_error(self.job_id)
+            error_str = pretty_print_error_details(error_details)
+            raise RuntimeError(
+                f"Prediction table generation for job {self.job_id} failed "
+                f"with job status {status}. Encountered below"
+                f" errors: {error_str}")
+        return PredictionTable(self.job_id)
+
+    @override
+    def load_config(self) -> GeneratePredictionTableRequest:
+        r"""Load the full configuration for this
+        prediction table generation job.
+
+        Returns:
+            GeneratePredictionTableRequest:
+                Complete configuration including plan,
+                pquery_id, graph_snapshot_id, etc.
+        """
+        return self._api().get_config(self.job_id)
+
+
+def _get_status(job_id: str) -> JobStatusReport:
+    api = global_state.client.generate_prediction_table_job_api
+    resource: GeneratePredictionTableJobResource = api.get(job_id)
+    return resource.job_status_report
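
End to end, `PredictionTableJob` is the future-like handle and `PredictionTable` the materialized result. A usage sketch, assuming an initialized kumoai session; the job ID below is a placeholder for a real prediction table generation job:

    from kumoai.pquery import PredictionTable, PredictionTableJob

    job = PredictionTableJob("gen-predtable-job-...")
    print(job.status())        # JobStatusReport, refreshed via _poll_job()

    table = job.result()       # blocks until the poller sees a terminal state
    print(table.anchor_time)   # anchor time, or None for custom tables

    urls = table.data_urls()   # partition download URLs (or the custom path)
    df = table.data_df()       # full prediction table as a pandas DataFrame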