kumoai 2.14.0.dev202601011731__cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kumoai might be problematic. Click here for more details.
- kumoai/__init__.py +300 -0
- kumoai/_logging.py +29 -0
- kumoai/_singleton.py +25 -0
- kumoai/_version.py +1 -0
- kumoai/artifact_export/__init__.py +9 -0
- kumoai/artifact_export/config.py +209 -0
- kumoai/artifact_export/job.py +108 -0
- kumoai/client/__init__.py +5 -0
- kumoai/client/client.py +223 -0
- kumoai/client/connector.py +110 -0
- kumoai/client/endpoints.py +150 -0
- kumoai/client/graph.py +120 -0
- kumoai/client/jobs.py +471 -0
- kumoai/client/online.py +78 -0
- kumoai/client/pquery.py +207 -0
- kumoai/client/rfm.py +112 -0
- kumoai/client/source_table.py +53 -0
- kumoai/client/table.py +101 -0
- kumoai/client/utils.py +130 -0
- kumoai/codegen/__init__.py +19 -0
- kumoai/codegen/cli.py +100 -0
- kumoai/codegen/context.py +16 -0
- kumoai/codegen/edits.py +473 -0
- kumoai/codegen/exceptions.py +10 -0
- kumoai/codegen/generate.py +222 -0
- kumoai/codegen/handlers/__init__.py +4 -0
- kumoai/codegen/handlers/connector.py +118 -0
- kumoai/codegen/handlers/graph.py +71 -0
- kumoai/codegen/handlers/pquery.py +62 -0
- kumoai/codegen/handlers/table.py +109 -0
- kumoai/codegen/handlers/utils.py +42 -0
- kumoai/codegen/identity.py +114 -0
- kumoai/codegen/loader.py +93 -0
- kumoai/codegen/naming.py +94 -0
- kumoai/codegen/registry.py +121 -0
- kumoai/connector/__init__.py +31 -0
- kumoai/connector/base.py +153 -0
- kumoai/connector/bigquery_connector.py +200 -0
- kumoai/connector/databricks_connector.py +213 -0
- kumoai/connector/file_upload_connector.py +189 -0
- kumoai/connector/glue_connector.py +150 -0
- kumoai/connector/s3_connector.py +278 -0
- kumoai/connector/snowflake_connector.py +252 -0
- kumoai/connector/source_table.py +471 -0
- kumoai/connector/utils.py +1796 -0
- kumoai/databricks.py +14 -0
- kumoai/encoder/__init__.py +4 -0
- kumoai/exceptions.py +26 -0
- kumoai/experimental/__init__.py +0 -0
- kumoai/experimental/rfm/__init__.py +210 -0
- kumoai/experimental/rfm/authenticate.py +432 -0
- kumoai/experimental/rfm/backend/__init__.py +0 -0
- kumoai/experimental/rfm/backend/local/__init__.py +42 -0
- kumoai/experimental/rfm/backend/local/graph_store.py +297 -0
- kumoai/experimental/rfm/backend/local/sampler.py +312 -0
- kumoai/experimental/rfm/backend/local/table.py +113 -0
- kumoai/experimental/rfm/backend/snow/__init__.py +37 -0
- kumoai/experimental/rfm/backend/snow/sampler.py +297 -0
- kumoai/experimental/rfm/backend/snow/table.py +242 -0
- kumoai/experimental/rfm/backend/sqlite/__init__.py +32 -0
- kumoai/experimental/rfm/backend/sqlite/sampler.py +398 -0
- kumoai/experimental/rfm/backend/sqlite/table.py +184 -0
- kumoai/experimental/rfm/base/__init__.py +30 -0
- kumoai/experimental/rfm/base/column.py +152 -0
- kumoai/experimental/rfm/base/expression.py +44 -0
- kumoai/experimental/rfm/base/sampler.py +761 -0
- kumoai/experimental/rfm/base/source.py +19 -0
- kumoai/experimental/rfm/base/sql_sampler.py +143 -0
- kumoai/experimental/rfm/base/table.py +736 -0
- kumoai/experimental/rfm/graph.py +1237 -0
- kumoai/experimental/rfm/infer/__init__.py +19 -0
- kumoai/experimental/rfm/infer/categorical.py +40 -0
- kumoai/experimental/rfm/infer/dtype.py +82 -0
- kumoai/experimental/rfm/infer/id.py +46 -0
- kumoai/experimental/rfm/infer/multicategorical.py +48 -0
- kumoai/experimental/rfm/infer/pkey.py +128 -0
- kumoai/experimental/rfm/infer/stype.py +35 -0
- kumoai/experimental/rfm/infer/time_col.py +61 -0
- kumoai/experimental/rfm/infer/timestamp.py +41 -0
- kumoai/experimental/rfm/pquery/__init__.py +7 -0
- kumoai/experimental/rfm/pquery/executor.py +102 -0
- kumoai/experimental/rfm/pquery/pandas_executor.py +530 -0
- kumoai/experimental/rfm/relbench.py +76 -0
- kumoai/experimental/rfm/rfm.py +1184 -0
- kumoai/experimental/rfm/sagemaker.py +138 -0
- kumoai/experimental/rfm/task_table.py +231 -0
- kumoai/formatting.py +30 -0
- kumoai/futures.py +99 -0
- kumoai/graph/__init__.py +12 -0
- kumoai/graph/column.py +106 -0
- kumoai/graph/graph.py +948 -0
- kumoai/graph/table.py +838 -0
- kumoai/jobs.py +80 -0
- kumoai/kumolib.cpython-310-x86_64-linux-gnu.so +0 -0
- kumoai/mixin.py +28 -0
- kumoai/pquery/__init__.py +25 -0
- kumoai/pquery/prediction_table.py +287 -0
- kumoai/pquery/predictive_query.py +641 -0
- kumoai/pquery/training_table.py +424 -0
- kumoai/spcs.py +121 -0
- kumoai/testing/__init__.py +8 -0
- kumoai/testing/decorators.py +57 -0
- kumoai/testing/snow.py +50 -0
- kumoai/trainer/__init__.py +42 -0
- kumoai/trainer/baseline_trainer.py +93 -0
- kumoai/trainer/config.py +2 -0
- kumoai/trainer/distilled_trainer.py +175 -0
- kumoai/trainer/job.py +1192 -0
- kumoai/trainer/online_serving.py +258 -0
- kumoai/trainer/trainer.py +475 -0
- kumoai/trainer/util.py +103 -0
- kumoai/utils/__init__.py +11 -0
- kumoai/utils/datasets.py +83 -0
- kumoai/utils/display.py +51 -0
- kumoai/utils/forecasting.py +209 -0
- kumoai/utils/progress_logger.py +343 -0
- kumoai/utils/sql.py +3 -0
- kumoai-2.14.0.dev202601011731.dist-info/METADATA +71 -0
- kumoai-2.14.0.dev202601011731.dist-info/RECORD +122 -0
- kumoai-2.14.0.dev202601011731.dist-info/WHEEL +6 -0
- kumoai-2.14.0.dev202601011731.dist-info/licenses/LICENSE +9 -0
- kumoai-2.14.0.dev202601011731.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
from .trainer import Trainer
|
|
2
|
+
from kumoapi.model_plan import (
|
|
3
|
+
TrainingJobPlan,
|
|
4
|
+
ColumnProcessingPlan,
|
|
5
|
+
NeighborSamplingPlan,
|
|
6
|
+
OptimizationPlan,
|
|
7
|
+
ModelArchitecturePlan,
|
|
8
|
+
ModelPlan,
|
|
9
|
+
GNNModelPlan,
|
|
10
|
+
GraphTransformerModelPlan,
|
|
11
|
+
)
|
|
12
|
+
# For backwards compatibility
|
|
13
|
+
from kumoai.artifact_export import (
|
|
14
|
+
ArtifactExportJob,
|
|
15
|
+
ArtifactExportResult,
|
|
16
|
+
)
|
|
17
|
+
from .job import (
|
|
18
|
+
TrainingJobResult,
|
|
19
|
+
TrainingJob,
|
|
20
|
+
BatchPredictionJobResult,
|
|
21
|
+
BatchPredictionJob,
|
|
22
|
+
)
|
|
23
|
+
from .baseline_trainer import BaselineTrainer
|
|
24
|
+
|
|
25
|
+
# Public names exported by this package (what ``from ... import *`` exposes).
__all__ = [
    # Model plan types imported from ``kumoapi.model_plan``:
    'TrainingJobPlan',
    'ColumnProcessingPlan',
    'NeighborSamplingPlan',
    'OptimizationPlan',
    'ModelArchitecturePlan',
    'ModelPlan',
    'GNNModelPlan',
    'GraphTransformerModelPlan',
    # Trainer and job classes defined in this package:
    'Trainer',
    'TrainingJobResult',
    'TrainingJob',
    'BatchPredictionJobResult',
    'BatchPredictionJob',
    'BaselineTrainer',
    # Imported from ``kumoai.artifact_export`` for backwards compatibility:
    'ArtifactExportJob',
    'ArtifactExportResult',
]
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
from typing import List, Mapping, Optional, Union
|
|
2
|
+
|
|
3
|
+
from kumoapi.jobs import BaselineJobRequest
|
|
4
|
+
|
|
5
|
+
from kumoai import global_state
|
|
6
|
+
from kumoai.client.jobs import BaselineJobID
|
|
7
|
+
from kumoai.graph import Graph
|
|
8
|
+
from kumoai.pquery.training_table import TrainingTable, TrainingTableJob
|
|
9
|
+
from kumoai.trainer.job import BaselineJob, BaselineJobResult
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class BaselineTrainer:
    r"""Launches a Kumo baseline job for a
    :class:`~kumoai.pquery.PredictiveQuery`.

    The entry point is :meth:`~kumoai.trainer.BaselineTrainer.run`, which
    accepts a :class:`~kumoai.graph.Graph` and a
    :class:`~kumoai.pquery.TrainingTable` and produces a
    :class:`~kumoai.trainer.BaselineJobResult`.

    Args:
        metrics (List[str]): The list of metrics that the baseline model
            will be evaluated on.

    Example:
        >>> import kumoai  # doctest: +SKIP
        >>> pquery = kumoai.PredictiveQuery(...)  # doctest: +SKIP
        >>> trainer = kumoai.BaselineTrainer(metrics=metrics)  # doctest: +SKIP

    .. # noqa: E501
    """
    def __init__(self, metrics: List[str]) -> None:
        # ID of the most recently launched baseline job; cached from the
        # backend once :meth:`run` has been called.
        self._baseline_job_id: Optional[BaselineJobID] = None

        self._metrics: List[str] = metrics

    def run(
        self,
        graph: Graph,
        train_table: Union[TrainingTable, TrainingTableJob],
        *,
        non_blocking: bool = False,
        custom_tags: Mapping[str, str] = {},
    ) -> Union[BaselineJob, BaselineJobResult]:
        """Launches a baseline job for the given graph and training table.

        Args:
            graph (Graph): The :class:`~kumoai.graph.Graph` object holding
                the tables and relationships the baseline model runs
                against.
            train_table (Union[TrainingTable, TrainingTableJob]): The
                :class:`~kumoai.pquery.TrainingTable`, or in-progress
                :class:`~kumoai.pquery.TrainingTableJob`, representing the
                training data produced by a
                :class:`~kumoai.pquery.PredictiveQuery` on :obj:`graph`.
            non_blocking (bool): If ``True``, return right after the
                baseline job is launched; if ``False``, attach to the job
                before returning. Defaults to ``False``.
            custom_tags (Mapping[str, str], optional): Customer defined
                key-value tags to associate with the launched job. Job
                tags are useful for grouping and searching jobs. Defaults
                to ``{}``.

        Returns:
            Union[BaselineJob, BaselineJobResult]:
                With ``non_blocking=True``, the :class:`BaselineJob`
                handle of the launched job. With ``non_blocking=False``,
                whatever attaching to that job returns.
        """
        train_table_job_id = train_table.job_id
        assert train_table_job_id is not None

        # Look up the predictive query that generated the training table:
        table_job_api = global_state.client.generate_train_table_job_api
        table_job = table_job_api.get(train_table_job_id)
        pquery_id = table_job.config.pquery_id
        assert pquery_id is not None

        # NOTE the backend implementation currently handles sequentialization
        # between a training table future and a baseline job; that is, if the
        # training table future is still executing, the backend will wait on
        # the job ID completion before executing a baseline job. This
        # preserves semantics for both futures, ensures that Kumo works as
        # expected if used only via REST API, and allows us to avoid chaining
        # callbacks in an ugly way here:
        baseline_api = global_state.client.baseline_job_api
        request = BaselineJobRequest(
            job_tags=dict(custom_tags),  # Copy; never alias the caller's map.
            pquery_id=pquery_id,
            metrics=self._metrics,
            graph_snapshot_id=graph.snapshot(non_blocking=non_blocking),
            train_table_job_id=train_table_job_id,
        )
        self._baseline_job_id = baseline_api.create(request)

        job = BaselineJob(job_id=self._baseline_job_id)
        return job if non_blocking else job.attach()
|
kumoai/trainer/config.py
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Literal, Mapping, Optional, Union, overload
|
|
3
|
+
|
|
4
|
+
from kumoapi.distilled_model_plan import DistilledModelPlan
|
|
5
|
+
from kumoapi.jobs import DistillationJobRequest, DistillationJobResource
|
|
6
|
+
|
|
7
|
+
from kumoai import global_state
|
|
8
|
+
from kumoai.client.jobs import TrainingJobID
|
|
9
|
+
from kumoai.graph import Graph
|
|
10
|
+
from kumoai.pquery.training_table import TrainingTable, TrainingTableJob
|
|
11
|
+
from kumoai.trainer.job import TrainingJob, TrainingJobResult
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class DistillationTrainer:
    r"""A trainer that supports creating a Kumo machine learning model for
    use in an online serving endpoint. The distillation process involves
    training a shallow model on a :class:`~kumoai.pquery.PredictiveQuery`
    using the embeddings generated by a base model, identified by
    :obj:`base_training_job_id`.

    Args:
        model_plan: The distilled model plan to use for the distillation
            process.
        base_training_job_id: The ID of the base training job to use for
            the distillation process.
    """

    def __init__(
        self,
        model_plan: DistilledModelPlan,
        base_training_job_id: TrainingJobID,
    ) -> None:
        self.model_plan: DistilledModelPlan = model_plan
        self.base_training_job_id: TrainingJobID = base_training_job_id

        # Cached from backend (set once :meth:`fit` launches a job):
        self._training_job_id: Optional[TrainingJobID] = None

    # Metadata ################################################################

    @property
    def is_trained(self) -> bool:
        r"""Returns ``True`` if this trainer instance has successfully been
        trained (and is therefore ready for prediction); ``False`` otherwise.

        Raises:
            NotImplementedError: Always; this check is not implemented yet.
        """
        raise NotImplementedError(
            "Checking if a distilled trainer is trained is not "
            "implemented yet.")

    # NOTE every overload accepts ``custom_tags``: callers are type-checked
    # against the overload variants only, not against the implementation
    # signature, so omitting it would reject valid ``custom_tags=`` calls.

    @overload
    def fit(
        self,
        graph: Graph,
        train_table: Union[TrainingTable, TrainingTableJob],
        *,
        custom_tags: Mapping[str, str] = ...,
    ) -> TrainingJobResult:
        ...

    @overload
    def fit(
        self,
        graph: Graph,
        train_table: Union[TrainingTable, TrainingTableJob],
        *,
        non_blocking: Literal[False],
        custom_tags: Mapping[str, str] = ...,
    ) -> TrainingJobResult:
        ...

    @overload
    def fit(
        self,
        graph: Graph,
        train_table: Union[TrainingTable, TrainingTableJob],
        *,
        non_blocking: Literal[True],
        custom_tags: Mapping[str, str] = ...,
    ) -> TrainingJob:
        ...

    @overload
    def fit(
        self,
        graph: Graph,
        train_table: Union[TrainingTable, TrainingTableJob],
        *,
        non_blocking: bool,
        custom_tags: Mapping[str, str] = ...,
    ) -> Union[TrainingJob, TrainingJobResult]:
        ...

    def fit(
        self,
        graph: Graph,
        train_table: Union[TrainingTable, TrainingTableJob],
        *,
        non_blocking: bool = False,
        custom_tags: Mapping[str, str] = {},
    ) -> Union[TrainingJob, TrainingJobResult]:
        r"""Fits a model to the specified graph and training table, with the
        strategy defined by :class:`DistillationTrainer`'s :obj:`model_plan`.

        Args:
            graph: The :class:`~kumoai.graph.Graph` object that represents the
                tables and relationships that Kumo will learn from.
            train_table: The :class:`~kumoai.pquery.TrainingTable`, or
                in-progress :class:`~kumoai.pquery.TrainingTableJob`, that
                represents the training data produced by a
                :class:`~kumoai.pquery.PredictiveQuery` on :obj:`graph`.
            non_blocking: Whether this operation should return immediately
                after launching the training job, or await completion of the
                training job.
            custom_tags: Additional, customer defined k-v tags to be associated
                with the job to be launched. Job tags are useful for grouping
                and searching jobs.

        Returns:
            Union[TrainingJobResult, TrainingJob]:
                If ``non_blocking=True``, the :class:`TrainingJob` handle of
                the launched job. If ``non_blocking=False``, the result of
                attaching to that job.
        """
        # TODO(manan, siyang): remove soon:
        job_id = train_table.job_id
        assert job_id is not None

        train_table_job_api = global_state.client.generate_train_table_job_api
        pq_id = train_table_job_api.get(job_id).config.pquery_id
        assert pq_id is not None

        # Only a materialized `TrainingTable` is inspected for a custom
        # training table; an in-progress `TrainingTableJob` contributes none.
        custom_table = None
        if isinstance(train_table, TrainingTable):
            custom_table = train_table._custom_train_table

        # NOTE the backend implementation currently handles sequentialization
        # between a training table future and a training job; that is, if the
        # training table future is still executing, the backend will wait on
        # the job ID completion before executing a training job. This preserves
        # semantics for both futures, ensures that Kumo works as expected if
        # used only via REST API, and allows us to avoid chaining callbacks
        # in an ugly way here:
        api = global_state.client.distillation_job_api
        self._training_job_id = api.create(
            DistillationJobRequest(
                # Copy of the tags, passed positionally as the request's
                # first field (presumably the job tags -- TODO confirm
                # against `kumoapi.jobs.DistillationJobRequest`).
                dict(custom_tags),
                pquery_id=pq_id,
                base_training_job_id=self.base_training_job_id,
                distilled_model_plan=self.model_plan,
                graph_snapshot_id=graph.snapshot(non_blocking=non_blocking),
                train_table_job_id=job_id,
                custom_train_table=custom_table,
            ))

        out = TrainingJob(job_id=self._training_job_id)
        if non_blocking:
            return out
        return out.attach()

    @classmethod
    def _load_from_job(
        cls,
        job: DistillationJobResource,
    ) -> 'DistillationTrainer':
        r"""Builds a trainer from an existing distillation job resource,
        reusing the job's distilled model plan and base training job ID and
        recording the job's ID as the trainer's cached training job ID.
        """
        trainer = cls(job.config.distilled_model_plan,
                      job.config.base_training_job_id)
        trainer._training_job_id = job.job_id
        return trainer

    @classmethod
    def load(cls, job_id: TrainingJobID) -> 'DistillationTrainer':
        r"""Creates a :class:`DistillationTrainer` instance from a training
        job ID.

        Raises:
            NotImplementedError: Always; loading is not implemented yet.
        """
        raise NotImplementedError(
            "Loading a distilled trainer from a job ID is not implemented yet."
        )

    @classmethod
    def load_from_tags(cls, tags: Mapping[str, str]) -> 'DistillationTrainer':
        r"""Creates a :class:`DistillationTrainer` instance from job tags.

        Raises:
            NotImplementedError: Always; loading is not implemented yet.
        """
        raise NotImplementedError(
            "Loading a distilled trainer from tags is not implemented yet.")
|