kumoai 2.14.0.dev202601011731__cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kumoai might be problematic.
- kumoai/__init__.py +300 -0
- kumoai/_logging.py +29 -0
- kumoai/_singleton.py +25 -0
- kumoai/_version.py +1 -0
- kumoai/artifact_export/__init__.py +9 -0
- kumoai/artifact_export/config.py +209 -0
- kumoai/artifact_export/job.py +108 -0
- kumoai/client/__init__.py +5 -0
- kumoai/client/client.py +223 -0
- kumoai/client/connector.py +110 -0
- kumoai/client/endpoints.py +150 -0
- kumoai/client/graph.py +120 -0
- kumoai/client/jobs.py +471 -0
- kumoai/client/online.py +78 -0
- kumoai/client/pquery.py +207 -0
- kumoai/client/rfm.py +112 -0
- kumoai/client/source_table.py +53 -0
- kumoai/client/table.py +101 -0
- kumoai/client/utils.py +130 -0
- kumoai/codegen/__init__.py +19 -0
- kumoai/codegen/cli.py +100 -0
- kumoai/codegen/context.py +16 -0
- kumoai/codegen/edits.py +473 -0
- kumoai/codegen/exceptions.py +10 -0
- kumoai/codegen/generate.py +222 -0
- kumoai/codegen/handlers/__init__.py +4 -0
- kumoai/codegen/handlers/connector.py +118 -0
- kumoai/codegen/handlers/graph.py +71 -0
- kumoai/codegen/handlers/pquery.py +62 -0
- kumoai/codegen/handlers/table.py +109 -0
- kumoai/codegen/handlers/utils.py +42 -0
- kumoai/codegen/identity.py +114 -0
- kumoai/codegen/loader.py +93 -0
- kumoai/codegen/naming.py +94 -0
- kumoai/codegen/registry.py +121 -0
- kumoai/connector/__init__.py +31 -0
- kumoai/connector/base.py +153 -0
- kumoai/connector/bigquery_connector.py +200 -0
- kumoai/connector/databricks_connector.py +213 -0
- kumoai/connector/file_upload_connector.py +189 -0
- kumoai/connector/glue_connector.py +150 -0
- kumoai/connector/s3_connector.py +278 -0
- kumoai/connector/snowflake_connector.py +252 -0
- kumoai/connector/source_table.py +471 -0
- kumoai/connector/utils.py +1796 -0
- kumoai/databricks.py +14 -0
- kumoai/encoder/__init__.py +4 -0
- kumoai/exceptions.py +26 -0
- kumoai/experimental/__init__.py +0 -0
- kumoai/experimental/rfm/__init__.py +210 -0
- kumoai/experimental/rfm/authenticate.py +432 -0
- kumoai/experimental/rfm/backend/__init__.py +0 -0
- kumoai/experimental/rfm/backend/local/__init__.py +42 -0
- kumoai/experimental/rfm/backend/local/graph_store.py +297 -0
- kumoai/experimental/rfm/backend/local/sampler.py +312 -0
- kumoai/experimental/rfm/backend/local/table.py +113 -0
- kumoai/experimental/rfm/backend/snow/__init__.py +37 -0
- kumoai/experimental/rfm/backend/snow/sampler.py +297 -0
- kumoai/experimental/rfm/backend/snow/table.py +242 -0
- kumoai/experimental/rfm/backend/sqlite/__init__.py +32 -0
- kumoai/experimental/rfm/backend/sqlite/sampler.py +398 -0
- kumoai/experimental/rfm/backend/sqlite/table.py +184 -0
- kumoai/experimental/rfm/base/__init__.py +30 -0
- kumoai/experimental/rfm/base/column.py +152 -0
- kumoai/experimental/rfm/base/expression.py +44 -0
- kumoai/experimental/rfm/base/sampler.py +761 -0
- kumoai/experimental/rfm/base/source.py +19 -0
- kumoai/experimental/rfm/base/sql_sampler.py +143 -0
- kumoai/experimental/rfm/base/table.py +736 -0
- kumoai/experimental/rfm/graph.py +1237 -0
- kumoai/experimental/rfm/infer/__init__.py +19 -0
- kumoai/experimental/rfm/infer/categorical.py +40 -0
- kumoai/experimental/rfm/infer/dtype.py +82 -0
- kumoai/experimental/rfm/infer/id.py +46 -0
- kumoai/experimental/rfm/infer/multicategorical.py +48 -0
- kumoai/experimental/rfm/infer/pkey.py +128 -0
- kumoai/experimental/rfm/infer/stype.py +35 -0
- kumoai/experimental/rfm/infer/time_col.py +61 -0
- kumoai/experimental/rfm/infer/timestamp.py +41 -0
- kumoai/experimental/rfm/pquery/__init__.py +7 -0
- kumoai/experimental/rfm/pquery/executor.py +102 -0
- kumoai/experimental/rfm/pquery/pandas_executor.py +530 -0
- kumoai/experimental/rfm/relbench.py +76 -0
- kumoai/experimental/rfm/rfm.py +1184 -0
- kumoai/experimental/rfm/sagemaker.py +138 -0
- kumoai/experimental/rfm/task_table.py +231 -0
- kumoai/formatting.py +30 -0
- kumoai/futures.py +99 -0
- kumoai/graph/__init__.py +12 -0
- kumoai/graph/column.py +106 -0
- kumoai/graph/graph.py +948 -0
- kumoai/graph/table.py +838 -0
- kumoai/jobs.py +80 -0
- kumoai/kumolib.cpython-310-x86_64-linux-gnu.so +0 -0
- kumoai/mixin.py +28 -0
- kumoai/pquery/__init__.py +25 -0
- kumoai/pquery/prediction_table.py +287 -0
- kumoai/pquery/predictive_query.py +641 -0
- kumoai/pquery/training_table.py +424 -0
- kumoai/spcs.py +121 -0
- kumoai/testing/__init__.py +8 -0
- kumoai/testing/decorators.py +57 -0
- kumoai/testing/snow.py +50 -0
- kumoai/trainer/__init__.py +42 -0
- kumoai/trainer/baseline_trainer.py +93 -0
- kumoai/trainer/config.py +2 -0
- kumoai/trainer/distilled_trainer.py +175 -0
- kumoai/trainer/job.py +1192 -0
- kumoai/trainer/online_serving.py +258 -0
- kumoai/trainer/trainer.py +475 -0
- kumoai/trainer/util.py +103 -0
- kumoai/utils/__init__.py +11 -0
- kumoai/utils/datasets.py +83 -0
- kumoai/utils/display.py +51 -0
- kumoai/utils/forecasting.py +209 -0
- kumoai/utils/progress_logger.py +343 -0
- kumoai/utils/sql.py +3 -0
- kumoai-2.14.0.dev202601011731.dist-info/METADATA +71 -0
- kumoai-2.14.0.dev202601011731.dist-info/RECORD +122 -0
- kumoai-2.14.0.dev202601011731.dist-info/WHEEL +6 -0
- kumoai-2.14.0.dev202601011731.dist-info/licenses/LICENSE +9 -0
- kumoai-2.14.0.dev202601011731.dist-info/top_level.txt +1 -0
kumoai/connector/bigquery_connector.py
@@ -0,0 +1,200 @@
import os
from typing import Dict, List, Optional

from kumoapi.data_source import (
    BigQueryConnectorResourceConfig,
    BigQueryCredentials,
    CreateConnectorArgs,
    DataSourceType,
)
from kumoapi.source_table import BigQuerySourceTableRequest
from typing_extensions import Self, override

from kumoai import global_state
from kumoai.connector import Connector

_ENV_BIGQUERY_PRIVATE_KEY_ID = 'BIGQUERY_PRIVATE_KEY_ID'
_ENV_BIGQUERY_PRIVATE_KEY = 'BIGQUERY_PRIVATE_KEY'
_ENV_BIGQUERY_CLIENT_ID = 'BIGQUERY_CLIENT_ID'
_ENV_BIGQUERY_CLIENT_EMAIL = 'BIGQUERY_CLIENT_EMAIL'
_ENV_BIGQUERY_TOKEN_URI = 'BIGQUERY_TOKEN_URI'
_ENV_BIGQUERY_AUTH_URI = 'BIGQUERY_AUTH_URI'


class BigQueryConnector(Connector):
    r"""Establishes a connection to a
    `BigQuery <https://cloud.google.com/bigquery>`_ database.

    Authentication requires passing a private key ID, private key string,
    client ID, client email, token URI, and authentication URI to the
    connector, either via environment variables
    (``BIGQUERY_PRIVATE_KEY_ID``, ``BIGQUERY_PRIVATE_KEY``,
    ``BIGQUERY_CLIENT_ID``, ``BIGQUERY_CLIENT_EMAIL``, ``BIGQUERY_TOKEN_URI``,
    ``BIGQUERY_AUTH_URI``), or via keys in the credentials dictionary
    (:obj:`private_key_id`, :obj:`private_key`, :obj:`client_id`,
    :obj:`client_email`, :obj:`token_uri`, :obj:`auth_uri`).

    .. code-block:: python

        import kumoai

        # Either pass `credentials=dict(private_key_id=..., private_key=...,
        # client_id=..., client_email=..., token_uri=..., auth_uri=...)` or
        # set the aforementioned environment variables:
        connector = kumoai.BigQueryConnector(
            name="<connector_name>",
            project_id="<bigquery_project_id>",
            dataset_id="<bigquery_dataset_id>",
            credentials=credentials,
        )

        # List all tables:
        print(connector.table_names())

        # Check whether a table is present:
        assert "articles" in connector

        # Fetch a source table (both approaches are equivalent):
        source_table = connector["articles"]
        source_table = connector.table("articles")

    Args:
        name: The name of the connector.
        project_id: The project ID to connect to.
        dataset_id: The dataset ID within the connected project.
        credentials: The private key ID, private key, client ID, client email,
            token URI, and auth URI that correspond to this BigQuery account.
    """
    def __init__(
        self,
        name: str,
        project_id: str,
        dataset_id: str,
        credentials: Optional[Dict[str, str]] = None,
        _bypass_creation: bool = False,  # INTERNAL ONLY.
    ):
        super().__init__()

        self._name = name
        self.project_id = project_id
        self.dataset_id = dataset_id

        if _bypass_creation:
            # TODO(manan, siyang): validate that this connector actually
            # exists in the REST DB:
            return

        # Fully specify credentials, create Kumo connector:
        credentials = credentials or {}
        credentials_args = {
            "private_key_id":
            credentials.get("private_key_id",
                            os.getenv(_ENV_BIGQUERY_PRIVATE_KEY_ID)),
            "private_key":
            credentials.get("private_key",
                            os.getenv(_ENV_BIGQUERY_PRIVATE_KEY)),
            "client_id":
            credentials.get("client_id", os.getenv(_ENV_BIGQUERY_CLIENT_ID)),
            "client_email":
            credentials.get("client_email",
                            os.getenv(_ENV_BIGQUERY_CLIENT_EMAIL)),
            "token_uri":
            credentials.get("token_uri", os.getenv(_ENV_BIGQUERY_TOKEN_URI)),
            "auth_uri":
            credentials.get("auth_uri", os.getenv(_ENV_BIGQUERY_AUTH_URI)),
        }

        self._create_connector(credentials_args)  # type: ignore

    @classmethod
    def get_by_name(cls, name: str) -> Self:
        r"""Returns an instance of a named BigQuery Connector, including
        those created in the Kumo UI.

        Args:
            name: The name of the existing connector.

        Example:
            >>> import kumoai
            >>> connector = kumoai.BigQueryConnector.get_by_name("name")  # doctest: +SKIP # noqa: E501
        """
        api = global_state.client.connector_api
        resp = api.get(name)
        if resp is None:
            raise ValueError(
                f"A stored connector with name {name} does not exist.")
        config = resp.config
        assert isinstance(config, BigQueryConnectorResourceConfig)
        return cls(
            name=config.name,
            project_id=config.project_id,
            dataset_id=config.dataset_id,
            credentials=None,
            _bypass_creation=True,
        )

    @override
    @property
    def name(self) -> str:
        r"""Returns the name of this connector."""
        return self._name

    @override
    @property
    def source_type(self) -> DataSourceType:
        return DataSourceType.BIGQUERY

    @override
    def _source_table_request(
        self,
        table_names: List[str],
    ) -> BigQuerySourceTableRequest:
        return BigQuerySourceTableRequest(
            connector_id=self.name,
            table_names=table_names,
        )

    def _create_connector(self, credentials: Dict[str, str]) -> None:
        r"""Creates and persists a BigQuery connector in the REST DB.
        Currently only intended for internal use.

        Args:
            credentials: Fully-specified service account credentials for the
                BigQuery connector.

        Raises:
            RuntimeError: if connector creation failed
        """
        credentials = BigQueryCredentials(
            private_key_id=credentials["private_key_id"] or '',
            private_key=credentials["private_key"] or '',
            client_id=credentials["client_id"] or '',
            client_email=credentials["client_email"] or '',
            token_uri=credentials["token_uri"] or '',
            auth_uri=credentials["auth_uri"] or '',
        )
        args = CreateConnectorArgs(
            config=BigQueryConnectorResourceConfig(
                name=self.name,
                project_id=self.project_id,
                dataset_id=self.dataset_id,
            ),
            credentials=credentials,
        )
        global_state.client.connector_api.create_if_not_exist(args)

    def _delete_connector(self) -> None:
        r"""Deletes a connector in the REST DB. Only intended for internal
        use.
        """
        global_state.client.connector_api.delete_if_exists(self.name)

    # Class properties ########################################################

    @override
    def __repr__(self) -> str:
        return (f'{self.__class__.__name__}'
                f'(name=\"{self.name}\", '
                f'project_id=\"{self.project_id}\", '
                f'dataset_id=\"{self.dataset_id}\")')
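The per-key fallback above means explicit `credentials` entries and `BIGQUERY_*` environment variables can be mixed freely. A minimal sketch of that behavior, assuming an initialized `kumoai` session; the connector, project, and dataset names are hypothetical placeholders:

    import os
    import kumoai

    # Each key resolves independently: an explicit `credentials` entry wins,
    # otherwise the matching BIGQUERY_* environment variable is read.
    os.environ["BIGQUERY_CLIENT_EMAIL"] = "svc@my-project.iam.gserviceaccount.com"

    connector = kumoai.BigQueryConnector(
        name="bq_connector",      # hypothetical connector name
        project_id="my-project",  # hypothetical project ID
        dataset_id="analytics",   # hypothetical dataset ID
        # `private_key` is taken from this dict; `client_email` and all other
        # unspecified keys fall back to the environment variables above:
        credentials={"private_key": "<private-key-pem>"},
    )

    # Reattach later by name (also works for connectors made in the Kumo UI):
    connector = kumoai.BigQueryConnector.get_by_name("bq_connector")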
kumoai/connector/databricks_connector.py
@@ -0,0 +1,213 @@
import os
from typing import Dict, List, Optional

from kumoapi.data_source import (
    CreateConnectorArgs,
    DatabricksConnectorResourceConfig,
    DatabricksCredentials,
    DataSourceType,
)
from kumoapi.source_table import DatabricksSourceTableRequest
from typing_extensions import Self, override

from kumoai import global_state
from kumoai.connector import Connector

_ENV_DATABRICKS_CLIENT_ID = 'DATABRICKS_CLIENT_ID'
_ENV_DATABRICKS_CLIENT_SECRET = 'DATABRICKS_CLIENT_SECRET'
_ENV_DATABRICKS_TOKEN = 'DATABRICKS_TOKEN'


class DatabricksConnector(Connector):
    r"""Establishes a connection to a
    `Databricks <https://www.databricks.com/>`_ database.

    Authentication requires passing either a client ID and client secret, or a
    personal access token, to the connector, either via environment variables
    (``DATABRICKS_CLIENT_ID`` and ``DATABRICKS_CLIENT_SECRET``, or
    ``DATABRICKS_TOKEN``), or via keys in the credentials dictionary
    (``client_id`` and ``client_secret``, or ``token``).

    .. code-block:: python

        import kumoai

        # Either pass `credentials=dict(client_id=..., client_secret=...)` or
        # `credentials=dict(token=...)`, or set the 'DATABRICKS_CLIENT_ID' and
        # 'DATABRICKS_CLIENT_SECRET' (or 'DATABRICKS_TOKEN') environment
        # variables:
        connector = kumoai.connector.DatabricksConnector(
            name="<connector_name>",
            host="<databricks_host_name>",
            cluster_id="<databricks_cluster_id>",
            warehouse_id="<databricks_warehouse_id>",
            catalog="<databricks_catalog_name>",
            credentials=credentials,
        )

        # List all tables:
        print(connector.table_names())

        # Check whether a table is present:
        assert "articles" in connector

        # Fetch a source table (both approaches are equivalent):
        source_table = connector["articles"]
        source_table = connector.table("articles")

    Args:
        name: The name of the connector.
        host: The host name.
        cluster_id: The ID of the Databricks cluster.
        warehouse_id: The ID of the Databricks warehouse.
        catalog: The name of the Databricks catalog.
        credentials: The client ID, client secret, and personal access token
            that correspond to this Databricks account.
    """
    def __init__(
        self,
        name: str,
        host: str,
        cluster_id: str,
        warehouse_id: str,
        catalog: str,
        credentials: Optional[Dict[str, str]] = None,
        _bypass_creation: bool = False,  # INTERNAL ONLY.
    ):
        super().__init__()

        self._name = name
        self.host = host
        self.cluster_id = cluster_id
        self.warehouse_id = warehouse_id
        self.catalog = catalog

        if _bypass_creation:
            # TODO(manan, siyang): validate that this connector actually
            # exists in the REST DB:
            return

        # Fully specify credentials, create Kumo connector:
        credentials = credentials or {}
        credentials_args = {
            "client_id":
            credentials.get("client_id",
                            os.getenv(_ENV_DATABRICKS_CLIENT_ID)),
            "client_secret":
            credentials.get("client_secret",
                            os.getenv(_ENV_DATABRICKS_CLIENT_SECRET)),
            "token":
            credentials.get("token", os.getenv(_ENV_DATABRICKS_TOKEN))
        }

        has_pat = credentials_args["token"] is not None
        has_client_id_secret = (credentials_args["client_id"] is not None and
                                credentials_args["client_secret"] is not None)

        if has_pat and has_client_id_secret:
            raise ValueError(
                "Please pass only one of a (Databricks client ID and client "
                "secret) pair or a Databricks personal access token.")
        elif not (has_pat or has_client_id_secret):
            raise ValueError(
                f"Please pass valid credentials to create a Databricks "
                f"connector. You can do so either via the 'credentials' "
                f"argument or the {_ENV_DATABRICKS_CLIENT_ID} and "
                f"{_ENV_DATABRICKS_CLIENT_SECRET}, or "
                f"{_ENV_DATABRICKS_TOKEN} environment variables.")

        self._create_connector(credentials_args)  # type: ignore

    @classmethod
    def get_by_name(cls, name: str) -> Self:
        r"""Returns an instance of a named Databricks Connector, including
        those created in the Kumo UI.

        Args:
            name: The name of the existing connector.

        Example:
            >>> import kumoai
            >>> connector = kumoai.DatabricksConnector.get_by_name("name")  # doctest: +SKIP # noqa: E501
        """
        api = global_state.client.connector_api
        resp = api.get(name)
        if resp is None:
            raise ValueError(
                f"A stored connector with name {name} does not exist.")
        config = resp.config
        assert isinstance(config, DatabricksConnectorResourceConfig)
        return cls(
            name=config.name,
            host=config.host,
            cluster_id=config.cluster_id,
            warehouse_id=config.warehouse_id,
            catalog=config.catalog,
            credentials=None,
            _bypass_creation=True,
        )

    @override
    @property
    def name(self) -> str:
        r"""Returns the name of this connector."""
        return self._name

    @override
    @property
    def source_type(self) -> DataSourceType:
        return DataSourceType.DATABRICKS

    @override
    def _source_table_request(
        self,
        table_names: List[str],
    ) -> DatabricksSourceTableRequest:
        return DatabricksSourceTableRequest(
            connector_id=self.name,
            table_names=table_names,
        )

    def _create_connector(self, credentials: Dict[str, str]) -> None:
        r"""Creates and persists a Databricks connector in the REST DB.
        Currently only intended for internal use.

        Args:
            credentials: Fully-specified credentials (client ID/secret or
                personal access token) for the Databricks connector.

        Raises:
            RuntimeError: if connector creation failed
        """
        credentials = DatabricksCredentials(
            client_id=credentials["client_id"] or '',
            client_secret=credentials["client_secret"] or '',
            pat=credentials["token"] or '',
        )
        args = CreateConnectorArgs(
            config=DatabricksConnectorResourceConfig(
                name=self.name,
                host=self.host,
                cluster_id=self.cluster_id,
                warehouse_id=self.warehouse_id,
                catalog=self.catalog,
            ),
            credentials=credentials,
        )
        global_state.client.connector_api.create_if_not_exist(args)

    def _delete_connector(self) -> None:
        r"""Deletes a connector in the REST DB. Only intended for internal
        use.
        """
        global_state.client.connector_api.delete_if_exists(self.name)

    # Class properties ########################################################

    @override
    def __repr__(self) -> str:
        return (f'{self.__class__.__name__}'
                f'(name=\"{self.name}\", host=\"{self.host}\", '
                f'cluster_id=\"{self.cluster_id}\", '
                f'warehouse_id=\"{self.warehouse_id}\", '
                f'catalog=\"{self.catalog}\")')
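The constructor enforces mutually exclusive authentication: a client ID/secret pair or a personal access token, never both and never neither. A minimal sketch, with placeholder host and resource IDs:

    import kumoai

    # Token-based auth; also supplying client_id/client_secret would raise a
    # ValueError, as would supplying no credentials (and no env variables):
    connector = kumoai.connector.DatabricksConnector(
        name="dbx_connector",  # hypothetical connector name
        host="<databricks_host_name>",
        cluster_id="<databricks_cluster_id>",
        warehouse_id="<databricks_warehouse_id>",
        catalog="<databricks_catalog_name>",
        credentials={"token": "<personal_access_token>"},
    )

    # Alternatively: credentials={"client_id": "...", "client_secret": "..."}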
kumoai/connector/file_upload_connector.py
@@ -0,0 +1,189 @@
from typing import List

from kumoapi.source_table import (
    DataSourceType,
    FileType,
    S3SourceTableRequest,
    SourceTableConfigRequest,
    SourceTableConfigResponse,
)
from typing_extensions import override

from kumoai import global_state
from kumoai.connector.base import Connector
from kumoai.connector.utils import delete_uploaded_table, upload_table


class FileUploadConnector(Connector):
    r"""Defines a connector to files directly uploaded to Kumo, either as
    'parquet' or 'csv' (non-partitioned) data.

    To get started with file upload, please first upload a table with the
    :meth:`upload` method in the :class:`FileUploadConnector` class. You can
    then access this table behind the file upload connector as follows:

    .. code-block:: python

        import kumoai

        # Create the file upload connector:
        connector = kumoai.FileUploadConnector(file_type="parquet")

        # Upload the table; assume it is stored at `/data/users.parquet`
        connector.upload(name="users", path="/data/users.parquet")

        # Check that the file upload connector has a `users` table:
        assert connector.has_table("users")

    Args:
        file_type: The file type of uploaded data. Can be either ``"csv"``
            or ``"parquet"``.
    """
    def __init__(self, file_type: str) -> None:
        r"""Creates the connector to uploaded files of type
        :obj:`file_type`.
        """
        assert file_type.lower() in {'parquet', 'csv'}
        self._file_type = file_type.lower()

    @property
    def name(self) -> str:
        return f'{self._file_type}_upload_connector'

    @property
    def source_type(self) -> DataSourceType:
        return DataSourceType.S3

    @property
    def file_type(self) -> FileType:
        return (FileType.PARQUET
                if self._file_type == 'parquet' else FileType.CSV)

    def _get_table_config(self, table_name: str) -> SourceTableConfigResponse:
        req = SourceTableConfigRequest(connector_id=self.name,
                                       table_name=table_name,
                                       source_type=self.source_type,
                                       file_type=None)
        return global_state.client.source_table_api.get_table_config(req)

    @override
    def _source_table_request(self,
                              table_names: List[str]) -> S3SourceTableRequest:
        return S3SourceTableRequest(s3_root_dir="", connector_id=self.name,
                                    table_names=table_names, file_type=None)

    def upload(
        self,
        name: str,
        path: str,
        auto_partition: bool = True,
        partition_size_mb: int = 250,
    ) -> None:
        r"""Upload a table to Kumo from a local or remote path.

        Supports ``s3://``, ``gs://``, ``abfs://``, ``abfss://``, and
        ``az://`` URLs for remote paths.

        Tables uploaded this way can be accessed from this
        ``FileUploadConnector`` using the provided name, e.g.,
        ``connector_obj["my_table"]``.

        Local files
        -----------
        - Accepts one ``.parquet`` or ``.csv`` file (must match this
          connector's ``file_type``).
        - If the file is > 1 GiB and ``auto_partition=True``, it is split
          into ~``partition_size_mb`` MiB parts and uploaded under a common
          prefix so the connector can read them as one table.

        Remote paths
        ------------
        - **Single file** (``.parquet``/``.csv``): validated and uploaded via
          multipart PUT. Files > 1 GiB are rejected; re-shard to ~200 MiB
          and upload the directory instead.
        - **Directory**: must contain only one format (all Parquet or all CSV)
          matching this connector's ``file_type``. Files are validated
          (consistent schema; CSV headers sanitized) and uploaded in parallel
          with memory-safe budgeting.

        .. warning::
            For local uploads, input must be a single CSV or Parquet file
            (matching the connector type). For remote uploads, mixed
            CSV/Parquet directories are not supported. Remote single files
            larger than 1 GiB are not supported.

        Examples:
        ---------
        .. code-block:: python

            import kumoai
            conn = kumoai.FileUploadConnector(file_type="parquet")

            # Local: small file
            conn.upload(name="users", path="/data/users.parquet")

            # Local: large file (auto-partitions)
            conn.upload(
                name="txns",
                path="/data/large_txns.parquet",
            )

            # Local: disable auto-partitioning (raises if > 1 GiB)
            conn.upload(
                name="users",
                path="/data/users.parquet",
                auto_partition=False,
            )

            # CSV connector
            csv_conn = kumoai.FileUploadConnector(file_type="csv")
            csv_conn.upload(name="sales", path="/data/sales.csv")

            # Remote: single file (<= 1 GiB)
            conn.upload(name="logs", path="s3://bkt/path/logs.parquet")

            # Remote: directory of shards (uniform format)
            csv_conn.upload(name="events", path="gs://mybkt/events_csv/")

        Args:
            name:
                Table name to create in Kumo; access later via this connector.
            path:
                Local path or remote URL to a ``.parquet``/``.csv`` file or a
                directory (uniform format). The format must match this
                connector's ``file_type``.
            auto_partition:
                Local-only. If ``True`` and the local file is > 1 GiB, split
                into ~``partition_size_mb`` MiB parts.
            partition_size_mb:
                Local-only. Target partition size (100–1000 MiB) when
                ``auto_partition`` is ``True``.
        """
        upload_table(name=name, path=path, auto_partition=auto_partition,
                     partition_size_mb=partition_size_mb,
                     file_type=self._file_type)

    def delete(
        self,
        name: str,
    ) -> None:
        r"""Synchronously deletes a previously uploaded table from the Kumo
        data plane.

        .. code-block:: python

            # Assume a `FileUploadConnector` called `connector` has uploaded
            # a `.parquet` table named `users`, which we now want to delete
            # from Kumo:
            connector.delete(name="users")

        Args:
            name: The name of the table to be deleted. This table must have
                previously been uploaded with a call to
                :meth:`~kumoai.connector.FileUploadConnector.upload`.
        """
        if not self.has_table(name):
            raise ValueError(f"The table '{name}' does not exist in {self}. "
                             f"Please check the existence of the source "
                             f"data.")

        delete_uploaded_table(name, self._file_type)
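To tie `upload` and `delete` together, a short round-trip sketch; the table name and local path are placeholders:

    import kumoai

    connector = kumoai.FileUploadConnector(file_type="csv")

    # Upload a local CSV; local files over 1 GiB are auto-partitioned into
    # ~250 MiB parts by default (auto_partition=True):
    connector.upload(name="sales", path="/data/sales.csv")

    # The table is now addressable through the connector:
    source_table = connector["sales"]

    # Remove it from the Kumo data plane; deleting a table that was never
    # uploaded raises a ValueError:
    connector.delete(name="sales")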