kumoai 2.14.0.dev202601011731__cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of kumoai might be problematic.

Files changed (122)
  1. kumoai/__init__.py +300 -0
  2. kumoai/_logging.py +29 -0
  3. kumoai/_singleton.py +25 -0
  4. kumoai/_version.py +1 -0
  5. kumoai/artifact_export/__init__.py +9 -0
  6. kumoai/artifact_export/config.py +209 -0
  7. kumoai/artifact_export/job.py +108 -0
  8. kumoai/client/__init__.py +5 -0
  9. kumoai/client/client.py +223 -0
  10. kumoai/client/connector.py +110 -0
  11. kumoai/client/endpoints.py +150 -0
  12. kumoai/client/graph.py +120 -0
  13. kumoai/client/jobs.py +471 -0
  14. kumoai/client/online.py +78 -0
  15. kumoai/client/pquery.py +207 -0
  16. kumoai/client/rfm.py +112 -0
  17. kumoai/client/source_table.py +53 -0
  18. kumoai/client/table.py +101 -0
  19. kumoai/client/utils.py +130 -0
  20. kumoai/codegen/__init__.py +19 -0
  21. kumoai/codegen/cli.py +100 -0
  22. kumoai/codegen/context.py +16 -0
  23. kumoai/codegen/edits.py +473 -0
  24. kumoai/codegen/exceptions.py +10 -0
  25. kumoai/codegen/generate.py +222 -0
  26. kumoai/codegen/handlers/__init__.py +4 -0
  27. kumoai/codegen/handlers/connector.py +118 -0
  28. kumoai/codegen/handlers/graph.py +71 -0
  29. kumoai/codegen/handlers/pquery.py +62 -0
  30. kumoai/codegen/handlers/table.py +109 -0
  31. kumoai/codegen/handlers/utils.py +42 -0
  32. kumoai/codegen/identity.py +114 -0
  33. kumoai/codegen/loader.py +93 -0
  34. kumoai/codegen/naming.py +94 -0
  35. kumoai/codegen/registry.py +121 -0
  36. kumoai/connector/__init__.py +31 -0
  37. kumoai/connector/base.py +153 -0
  38. kumoai/connector/bigquery_connector.py +200 -0
  39. kumoai/connector/databricks_connector.py +213 -0
  40. kumoai/connector/file_upload_connector.py +189 -0
  41. kumoai/connector/glue_connector.py +150 -0
  42. kumoai/connector/s3_connector.py +278 -0
  43. kumoai/connector/snowflake_connector.py +252 -0
  44. kumoai/connector/source_table.py +471 -0
  45. kumoai/connector/utils.py +1796 -0
  46. kumoai/databricks.py +14 -0
  47. kumoai/encoder/__init__.py +4 -0
  48. kumoai/exceptions.py +26 -0
  49. kumoai/experimental/__init__.py +0 -0
  50. kumoai/experimental/rfm/__init__.py +210 -0
  51. kumoai/experimental/rfm/authenticate.py +432 -0
  52. kumoai/experimental/rfm/backend/__init__.py +0 -0
  53. kumoai/experimental/rfm/backend/local/__init__.py +42 -0
  54. kumoai/experimental/rfm/backend/local/graph_store.py +297 -0
  55. kumoai/experimental/rfm/backend/local/sampler.py +312 -0
  56. kumoai/experimental/rfm/backend/local/table.py +113 -0
  57. kumoai/experimental/rfm/backend/snow/__init__.py +37 -0
  58. kumoai/experimental/rfm/backend/snow/sampler.py +297 -0
  59. kumoai/experimental/rfm/backend/snow/table.py +242 -0
  60. kumoai/experimental/rfm/backend/sqlite/__init__.py +32 -0
  61. kumoai/experimental/rfm/backend/sqlite/sampler.py +398 -0
  62. kumoai/experimental/rfm/backend/sqlite/table.py +184 -0
  63. kumoai/experimental/rfm/base/__init__.py +30 -0
  64. kumoai/experimental/rfm/base/column.py +152 -0
  65. kumoai/experimental/rfm/base/expression.py +44 -0
  66. kumoai/experimental/rfm/base/sampler.py +761 -0
  67. kumoai/experimental/rfm/base/source.py +19 -0
  68. kumoai/experimental/rfm/base/sql_sampler.py +143 -0
  69. kumoai/experimental/rfm/base/table.py +736 -0
  70. kumoai/experimental/rfm/graph.py +1237 -0
  71. kumoai/experimental/rfm/infer/__init__.py +19 -0
  72. kumoai/experimental/rfm/infer/categorical.py +40 -0
  73. kumoai/experimental/rfm/infer/dtype.py +82 -0
  74. kumoai/experimental/rfm/infer/id.py +46 -0
  75. kumoai/experimental/rfm/infer/multicategorical.py +48 -0
  76. kumoai/experimental/rfm/infer/pkey.py +128 -0
  77. kumoai/experimental/rfm/infer/stype.py +35 -0
  78. kumoai/experimental/rfm/infer/time_col.py +61 -0
  79. kumoai/experimental/rfm/infer/timestamp.py +41 -0
  80. kumoai/experimental/rfm/pquery/__init__.py +7 -0
  81. kumoai/experimental/rfm/pquery/executor.py +102 -0
  82. kumoai/experimental/rfm/pquery/pandas_executor.py +530 -0
  83. kumoai/experimental/rfm/relbench.py +76 -0
  84. kumoai/experimental/rfm/rfm.py +1184 -0
  85. kumoai/experimental/rfm/sagemaker.py +138 -0
  86. kumoai/experimental/rfm/task_table.py +231 -0
  87. kumoai/formatting.py +30 -0
  88. kumoai/futures.py +99 -0
  89. kumoai/graph/__init__.py +12 -0
  90. kumoai/graph/column.py +106 -0
  91. kumoai/graph/graph.py +948 -0
  92. kumoai/graph/table.py +838 -0
  93. kumoai/jobs.py +80 -0
  94. kumoai/kumolib.cpython-310-x86_64-linux-gnu.so +0 -0
  95. kumoai/mixin.py +28 -0
  96. kumoai/pquery/__init__.py +25 -0
  97. kumoai/pquery/prediction_table.py +287 -0
  98. kumoai/pquery/predictive_query.py +641 -0
  99. kumoai/pquery/training_table.py +424 -0
  100. kumoai/spcs.py +121 -0
  101. kumoai/testing/__init__.py +8 -0
  102. kumoai/testing/decorators.py +57 -0
  103. kumoai/testing/snow.py +50 -0
  104. kumoai/trainer/__init__.py +42 -0
  105. kumoai/trainer/baseline_trainer.py +93 -0
  106. kumoai/trainer/config.py +2 -0
  107. kumoai/trainer/distilled_trainer.py +175 -0
  108. kumoai/trainer/job.py +1192 -0
  109. kumoai/trainer/online_serving.py +258 -0
  110. kumoai/trainer/trainer.py +475 -0
  111. kumoai/trainer/util.py +103 -0
  112. kumoai/utils/__init__.py +11 -0
  113. kumoai/utils/datasets.py +83 -0
  114. kumoai/utils/display.py +51 -0
  115. kumoai/utils/forecasting.py +209 -0
  116. kumoai/utils/progress_logger.py +343 -0
  117. kumoai/utils/sql.py +3 -0
  118. kumoai-2.14.0.dev202601011731.dist-info/METADATA +71 -0
  119. kumoai-2.14.0.dev202601011731.dist-info/RECORD +122 -0
  120. kumoai-2.14.0.dev202601011731.dist-info/WHEEL +6 -0
  121. kumoai-2.14.0.dev202601011731.dist-info/licenses/LICENSE +9 -0
  122. kumoai-2.14.0.dev202601011731.dist-info/top_level.txt +1 -0
@@ -0,0 +1,200 @@
+ import os
+ from typing import Dict, List, Optional
+
+ from kumoapi.data_source import (
+     BigQueryConnectorResourceConfig,
+     BigQueryCredentials,
+     CreateConnectorArgs,
+     DataSourceType,
+ )
+ from kumoapi.source_table import BigQuerySourceTableRequest
+ from typing_extensions import Self, override
+
+ from kumoai import global_state
+ from kumoai.connector import Connector
+
+ _ENV_BIGQUERY_PRIVATE_KEY_ID = 'BIGQUERY_PRIVATE_KEY_ID'
+ _ENV_BIGQUERY_PRIVATE_KEY = 'BIGQUERY_PRIVATE_KEY'
+ _ENV_BIGQUERY_CLIENT_ID = 'BIGQUERY_CLIENT_ID'
+ _ENV_BIGQUERY_CLIENT_EMAIL = 'BIGQUERY_CLIENT_EMAIL'
+ _ENV_BIGQUERY_TOKEN_URI = 'BIGQUERY_TOKEN_URI'
+ _ENV_BIGQUERY_AUTH_URI = 'BIGQUERY_AUTH_URI'
+
+
+ class BigQueryConnector(Connector):
+     r"""Establishes a connection to a
+     `BigQuery <https://cloud.google.com/bigquery>`_ database.
+
+     Authentication requires passing a private key ID, private key string,
+     client ID, client email, token URI, and authentication URI to the
+     connector, either via environment variables
+     (``BIGQUERY_PRIVATE_KEY_ID``, ``BIGQUERY_PRIVATE_KEY``,
+     ``BIGQUERY_CLIENT_ID``, ``BIGQUERY_CLIENT_EMAIL``, ``BIGQUERY_TOKEN_URI``,
+     ``BIGQUERY_AUTH_URI``), or via keys in the credentials dictionary
+     (:obj:`private_key_id`, :obj:`private_key`, :obj:`client_id`,
+     :obj:`client_email`, :obj:`token_uri`, :obj:`auth_uri`).
+
+     .. code-block:: python
+
+         import kumoai
+
+         # Either pass `credentials=dict(private_key_id=..., private_key=...,
+         # client_id=..., client_email=..., token_uri=..., auth_uri=...)` or
+         # set the aforementioned environment variables:
+         connector = kumoai.BigQueryConnector(
+             name="<connector_name>",
+             project_id="<bigquery_project_id>",
+             dataset_id="<bigquery_dataset_id>",
+             credentials=credentials,
+         )
+
+         # List all tables:
+         print(connector.table_names())
+
+         # Check whether a table is present:
+         assert "articles" in connector
+
+         # Fetch a source table (both approaches are equivalent):
+         source_table = connector["articles"]
+         source_table = connector.table("articles")
+
+     Args:
+         name: The name of the connector.
+         project_id: The project ID to connect to.
+         dataset_id: The dataset ID within the connected project.
+         credentials: The private key ID, private key, client ID, client
+             email, token URI, and auth URI that correspond to this BigQuery
+             account.
+     """
+     def __init__(
+         self,
+         name: str,
+         project_id: str,
+         dataset_id: str,
+         credentials: Optional[Dict[str, str]] = None,
+         _bypass_creation: bool = False,  # INTERNAL ONLY.
+     ):
+         super().__init__()
+
+         self._name = name
+         self.project_id = project_id
+         self.dataset_id = dataset_id
+
+         if _bypass_creation:
+             # TODO(manan, siyang): validate that this connector actually
+             # exists in the REST DB:
+             return
+
+         # Fully specify credentials, create Kumo connector:
+         credentials = credentials or {}
+         credentials_args = {
+             "private_key_id":
+             credentials.get("private_key_id",
+                             os.getenv(_ENV_BIGQUERY_PRIVATE_KEY_ID)),
+             "private_key":
+             credentials.get("private_key",
+                             os.getenv(_ENV_BIGQUERY_PRIVATE_KEY)),
+             "client_id":
+             credentials.get("client_id", os.getenv(_ENV_BIGQUERY_CLIENT_ID)),
+             "client_email":
+             credentials.get("client_email",
+                             os.getenv(_ENV_BIGQUERY_CLIENT_EMAIL)),
+             "token_uri":
+             credentials.get("token_uri", os.getenv(_ENV_BIGQUERY_TOKEN_URI)),
+             "auth_uri":
+             credentials.get("auth_uri", os.getenv(_ENV_BIGQUERY_AUTH_URI)),
+         }
+
+         self._create_connector(credentials_args)  # type: ignore
+
+     @classmethod
+     def get_by_name(cls, name: str) -> Self:
+         r"""Returns an instance of a named BigQuery Connector, including
+         those created in the Kumo UI.
+
+         Args:
+             name: The name of the existing connector.
+
+         Example:
+             >>> import kumoai
+             >>> connector = kumoai.BigQueryConnector.get_by_name("name") # doctest: +SKIP # noqa: E501
+         """
+         api = global_state.client.connector_api
+         resp = api.get(name)
+         if resp is None:
+             raise ValueError(
+                 f"A stored connector with name '{name}' does not exist.")
+         config = resp.config
+         assert isinstance(config, BigQueryConnectorResourceConfig)
+         return cls(
+             name=config.name,
+             project_id=config.project_id,
+             dataset_id=config.dataset_id,
+             credentials=None,
+             _bypass_creation=True,
+         )
+
+     @override
+     @property
+     def name(self) -> str:
+         r"""Returns the name of this connector."""
+         return self._name
+
+     @override
+     @property
+     def source_type(self) -> DataSourceType:
+         return DataSourceType.BIGQUERY
+
+     @override
+     def _source_table_request(
+         self,
+         table_names: List[str],
+     ) -> BigQuerySourceTableRequest:
+         return BigQuerySourceTableRequest(
+             connector_id=self.name,
+             table_names=table_names,
+         )
+
+     def _create_connector(self, credentials: Dict[str, str]) -> None:
+         r"""Creates and persists a BigQuery connector in the REST DB.
+         Currently only intended for internal use.
+
+         Args:
+             credentials: Fully-specified service account credentials for
+                 the BigQuery connector.
+
+         Raises:
+             RuntimeError: if connector creation failed.
+         """
+         credentials = BigQueryCredentials(
+             private_key_id=credentials["private_key_id"] or '',
+             private_key=credentials["private_key"] or '',
+             client_id=credentials["client_id"] or '',
+             client_email=credentials["client_email"] or '',
+             token_uri=credentials["token_uri"] or '',
+             auth_uri=credentials["auth_uri"] or '',
+         )
+         args = CreateConnectorArgs(
+             config=BigQueryConnectorResourceConfig(
+                 name=self.name,
+                 project_id=self.project_id,
+                 dataset_id=self.dataset_id,
+             ),
+             credentials=credentials,
+         )
+         global_state.client.connector_api.create_if_not_exist(args)
+
+     def _delete_connector(self) -> None:
+         r"""Deletes a connector in the REST DB. Only intended for internal
+         use.
+         """
+         global_state.client.connector_api.delete_if_exists(self.name)
+
+     # Class properties ########################################################
+
+     @override
+     def __repr__(self) -> str:
+         return (f'{self.__class__.__name__}'
+                 f'(name="{self.name}", '
+                 f'project_id="{self.project_id}", '
+                 f'dataset_id="{self.dataset_id}")')
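For orientation, here is a minimal usage sketch based only on the code above. It assumes the kumoai client has already been initialized against a Kumo backend and that the six BIGQUERY_* environment variables are set; the connector name and IDs are hypothetical.

    import kumoai

    # Credentials are omitted, so __init__ falls back to the BIGQUERY_*
    # environment variables when assembling `credentials_args`:
    connector = kumoai.BigQueryConnector(
        name="bq_demo",               # hypothetical connector name
        project_id="my-gcp-project",  # hypothetical project ID
        dataset_id="analytics",       # hypothetical dataset ID
    )

    # Re-attach to the same stored connector later (or to one created in
    # the Kumo UI); this path skips credential creation via `_bypass_creation`:
    connector = kumoai.BigQueryConnector.get_by_name("bq_demo")
    print(connector)  # BigQueryConnector(name="bq_demo", ...)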
@@ -0,0 +1,213 @@
+ import os
+ from typing import Dict, List, Optional
+
+ from kumoapi.data_source import (
+     CreateConnectorArgs,
+     DatabricksConnectorResourceConfig,
+     DatabricksCredentials,
+     DataSourceType,
+ )
+ from kumoapi.source_table import DatabricksSourceTableRequest
+ from typing_extensions import Self, override
+
+ from kumoai import global_state
+ from kumoai.connector import Connector
+
+ _ENV_DATABRICKS_CLIENT_ID = 'DATABRICKS_CLIENT_ID'
+ _ENV_DATABRICKS_CLIENT_SECRET = 'DATABRICKS_CLIENT_SECRET'
+ _ENV_DATABRICKS_TOKEN = 'DATABRICKS_TOKEN'
+
+
+ class DatabricksConnector(Connector):
+     r"""Establishes a connection to a
+     `Databricks <https://www.databricks.com/>`_ database.
+
+     Authentication requires passing either a client ID and client secret,
+     or a personal access token, to the connector, either via environment
+     variables (``DATABRICKS_CLIENT_ID`` and ``DATABRICKS_CLIENT_SECRET``, or
+     ``DATABRICKS_TOKEN``), or via keys in the credentials dictionary
+     (``client_id`` and ``client_secret``, or ``token``).
+
+     .. code-block:: python
+
+         import kumoai
+
+         # Either pass `credentials=dict(client_id=..., client_secret=...)`
+         # or `credentials=dict(token=...)`, or set the 'DATABRICKS_CLIENT_ID'
+         # and 'DATABRICKS_CLIENT_SECRET' (or 'DATABRICKS_TOKEN') environment
+         # variables:
+         connector = kumoai.connector.DatabricksConnector(
+             name="<connector_name>",
+             host="<databricks_host_name>",
+             cluster_id="<databricks_cluster_id>",
+             warehouse_id="<databricks_warehouse_id>",
+             catalog="<databricks_catalog_name>",
+             credentials=credentials,
+         )
+
+         # List all tables:
+         print(connector.table_names())
+
+         # Check whether a table is present:
+         assert "articles" in connector
+
+         # Fetch a source table (both approaches are equivalent):
+         source_table = connector["articles"]
+         source_table = connector.table("articles")
+
+     Args:
+         name: The name of the connector.
+         host: The host name.
+         cluster_id: The ID of the Databricks cluster.
+         warehouse_id: The ID of the Databricks SQL warehouse.
+         catalog: The name of the Databricks catalog.
+         credentials: Either the client ID and client secret, or the personal
+             access token, that corresponds to this Databricks account.
+     """
+     def __init__(
+         self,
+         name: str,
+         host: str,
+         cluster_id: str,
+         warehouse_id: str,
+         catalog: str,
+         credentials: Optional[Dict[str, str]] = None,
+         _bypass_creation: bool = False,  # INTERNAL ONLY.
+     ):
+         super().__init__()
+
+         self._name = name
+         self.host = host
+         self.cluster_id = cluster_id
+         self.warehouse_id = warehouse_id
+         self.catalog = catalog
+
+         if _bypass_creation:
+             # TODO(manan, siyang): validate that this connector actually
+             # exists in the REST DB:
+             return
+
+         # Fully specify credentials, create Kumo connector:
+         credentials = credentials or {}
+         credentials_args = {
+             "client_id":
+             credentials.get("client_id", os.getenv(_ENV_DATABRICKS_CLIENT_ID)),
+             "client_secret":
+             credentials.get("client_secret",
+                             os.getenv(_ENV_DATABRICKS_CLIENT_SECRET)),
+             "token":
+             credentials.get("token", os.getenv(_ENV_DATABRICKS_TOKEN)),
+         }
+
+         has_pat = credentials_args["token"] is not None
+         has_client_id_secret = (credentials_args["client_id"] is not None and
+                                 credentials_args["client_secret"] is not None)
+
+         if has_pat and has_client_id_secret:
+             raise ValueError(
+                 "Please pass only one of a Databricks client ID and client "
+                 "secret, or a Databricks personal access token (PAT).")
+         elif not (has_pat or has_client_id_secret):
+             raise ValueError(
+                 f"Please pass valid credentials to create a Databricks "
+                 f"connector. You can do so either via the 'credentials' "
+                 f"argument, or via the {_ENV_DATABRICKS_CLIENT_ID} and "
+                 f"{_ENV_DATABRICKS_CLIENT_SECRET} (or "
+                 f"{_ENV_DATABRICKS_TOKEN}) environment variables.")
+
+         self._create_connector(credentials_args)  # type: ignore
+
+     @classmethod
+     def get_by_name(cls, name: str) -> Self:
+         r"""Returns an instance of a named Databricks Connector, including
+         those created in the Kumo UI.
+
+         Args:
+             name: The name of the existing connector.
+
+         Example:
+             >>> import kumoai
+             >>> connector = kumoai.DatabricksConnector.get_by_name("name") # doctest: +SKIP # noqa: E501
+         """
+         api = global_state.client.connector_api
+         resp = api.get(name)
+         if resp is None:
+             raise ValueError(
+                 f"A stored connector with name '{name}' does not exist.")
+         config = resp.config
+         assert isinstance(config, DatabricksConnectorResourceConfig)
+         return cls(
+             name=config.name,
+             host=config.host,
+             cluster_id=config.cluster_id,
+             warehouse_id=config.warehouse_id,
+             catalog=config.catalog,
+             credentials=None,
+             _bypass_creation=True,
+         )
+
+     @override
+     @property
+     def name(self) -> str:
+         r"""Returns the name of this connector."""
+         return self._name
+
+     @override
+     @property
+     def source_type(self) -> DataSourceType:
+         return DataSourceType.DATABRICKS
+
+     @override
+     def _source_table_request(
+         self,
+         table_names: List[str],
+     ) -> DatabricksSourceTableRequest:
+         return DatabricksSourceTableRequest(
+             connector_id=self.name,
+             table_names=table_names,
+         )
+
+     def _create_connector(self, credentials: Dict[str, str]) -> None:
+         r"""Creates and persists a Databricks connector in the REST DB.
+         Currently only intended for internal use.
+
+         Args:
+             credentials: Fully-specified credentials containing either the
+                 client ID and client secret, or the personal access token,
+                 for the Databricks connector.
+
+         Raises:
+             RuntimeError: if connector creation failed.
+         """
+         credentials = DatabricksCredentials(
+             client_id=credentials["client_id"] or '',
+             client_secret=credentials["client_secret"] or '',
+             pat=credentials["token"] or '',
+         )
+         args = CreateConnectorArgs(
+             config=DatabricksConnectorResourceConfig(
+                 name=self.name,
+                 host=self.host,
+                 cluster_id=self.cluster_id,
+                 warehouse_id=self.warehouse_id,
+                 catalog=self.catalog,
+             ),
+             credentials=credentials,
+         )
+         global_state.client.connector_api.create_if_not_exist(args)
+
+     def _delete_connector(self) -> None:
+         r"""Deletes a connector in the REST DB. Only intended for internal
+         use.
+         """
+         global_state.client.connector_api.delete_if_exists(self.name)
+
+     # Class properties ########################################################
+
+     @override
+     def __repr__(self) -> str:
+         return (f'{self.__class__.__name__}'
+                 f'(name="{self.name}", host="{self.host}", '
+                 f'cluster_id="{self.cluster_id}", '
+                 f'warehouse_id="{self.warehouse_id}", '
+                 f'catalog="{self.catalog}")')
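A short sketch of the credential rule enforced in ``__init__`` above: exactly one of a (client ID, client secret) pair or a personal access token must resolve, whether from the credentials dictionary or the environment. All values below are hypothetical, and constructing the connector contacts the Kumo backend to persist it.

    import kumoai

    # Authenticate with a personal access token only:
    connector = kumoai.connector.DatabricksConnector(
        name="dbx_demo",                        # hypothetical
        host="adb-123456789.0.databricks.net",  # hypothetical
        cluster_id="0123-456789-abcde",         # hypothetical
        warehouse_id="abcdef0123456789",        # hypothetical
        catalog="main",
        credentials={"token": "dapi-example"},  # hypothetical PAT
    )

    # Supplying both a PAT and a client ID/secret pair raises ValueError:
    try:
        kumoai.connector.DatabricksConnector(
            name="bad", host="h", cluster_id="c", warehouse_id="w",
            catalog="m", credentials={
                "token": "t", "client_id": "i", "client_secret": "s",
            })
    except ValueError as err:
        print(err)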
@@ -0,0 +1,189 @@
+ from typing import List
+
+ from kumoapi.source_table import (
+     DataSourceType,
+     FileType,
+     S3SourceTableRequest,
+     SourceTableConfigRequest,
+     SourceTableConfigResponse,
+ )
+ from typing_extensions import override
+
+ from kumoai import global_state
+ from kumoai.connector.base import Connector
+ from kumoai.connector.utils import delete_uploaded_table, upload_table
+
+
+ class FileUploadConnector(Connector):
+     r"""Defines a connector to files directly uploaded to Kumo, either as
+     'parquet' or 'csv' (non-partitioned) data.
+
+     To get started with file upload, please first upload a table with the
+     :meth:`upload` method in the :class:`FileUploadConnector` class. You can
+     then access this table behind the file upload connector as follows:
+
+     .. code-block:: python
+
+         import kumoai
+
+         # Create the file upload connector:
+         connector = kumoai.FileUploadConnector(file_type="parquet")
+
+         # Upload the table; assume it is stored at `/data/users.parquet`:
+         connector.upload(name="users", path="/data/users.parquet")
+
+         # Check that the file upload connector has a `users` table:
+         assert connector.has_table("users")
+
+     Args:
+         file_type: The file type of uploaded data. Can be either ``"csv"``
+             or ``"parquet"``.
+     """
+     def __init__(self, file_type: str) -> None:
+         r"""Creates the connector to uploaded files of type
+         :obj:`file_type`.
+         """
+         assert file_type.lower() in {'parquet', 'csv'}
+         self._file_type = file_type.lower()
+
+     @property
+     def name(self) -> str:
+         return f'{self._file_type}_upload_connector'
+
+     @property
+     def source_type(self) -> DataSourceType:
+         return DataSourceType.S3
+
+     @property
+     def file_type(self) -> FileType:
+         return (FileType.PARQUET
+                 if self._file_type == 'parquet' else FileType.CSV)
+
+     def _get_table_config(self, table_name: str) -> SourceTableConfigResponse:
+         req = SourceTableConfigRequest(connector_id=self.name,
+                                        table_name=table_name,
+                                        source_type=self.source_type,
+                                        file_type=None)
+         return global_state.client.source_table_api.get_table_config(req)
+
+     @override
+     def _source_table_request(self,
+                               table_names: List[str]) -> S3SourceTableRequest:
+         return S3SourceTableRequest(s3_root_dir="", connector_id=self.name,
+                                     table_names=table_names, file_type=None)
+
+     def upload(
+         self,
+         name: str,
+         path: str,
+         auto_partition: bool = True,
+         partition_size_mb: int = 250,
+     ) -> None:
+         r"""Uploads a table to Kumo from a local or remote path.
+
+         Remote paths support the ``s3://``, ``gs://``, ``abfs://``,
+         ``abfss://``, and ``az://`` schemes.
+
+         Tables uploaded this way can be accessed from this
+         ``FileUploadConnector`` using the provided name, e.g.,
+         ``connector_obj["my_table"]``.
+
+         Local files
+         -----------
+         - Accepts one ``.parquet`` or ``.csv`` file (must match this
+           connector's ``file_type``).
+         - If the file is > 1 GiB and ``auto_partition=True``, it is split
+           into ~``partition_size_mb`` MiB parts and uploaded under a common
+           prefix so the connector can read them as one table.
+
+         Remote paths
+         ------------
+         - **Single file** (``.parquet``/``.csv``): validated and uploaded via
+           multipart PUT. Files > 1 GiB are rejected; re-shard to ~200 MiB
+           and upload the directory instead.
+         - **Directory**: must contain only one format (all Parquet or all
+           CSV) matching this connector's ``file_type``. Files are validated
+           (consistent schema; CSV headers sanitized) and uploaded in parallel
+           with memory-safe budgeting.
+
+         .. warning::
+             For local uploads, the input must be a single CSV or Parquet
+             file (matching the connector type). For remote uploads, mixed
+             CSV/Parquet directories are not supported, and remote single
+             files larger than 1 GiB are not supported.
+
+         Examples:
+         ---------
+         .. code-block:: python
+
+             import kumoai
+             conn = kumoai.FileUploadConnector(file_type="parquet")
+
+             # Local: small file
+             conn.upload(name="users", path="/data/users.parquet")
+
+             # Local: large file (auto-partitions)
+             conn.upload(
+                 name="txns",
+                 path="/data/large_txns.parquet",
+             )
+
+             # Local: disable auto-partitioning (raises if > 1 GiB)
+             conn.upload(
+                 name="users",
+                 path="/data/users.parquet",
+                 auto_partition=False,
+             )
+
+             # CSV connector
+             csv_conn = kumoai.FileUploadConnector(file_type="csv")
+             csv_conn.upload(name="sales", path="/data/sales.csv")
+
+             # Remote: single file (<= 1 GiB)
+             conn.upload(name="logs", path="s3://bkt/path/logs.parquet")
+
+             # Remote: directory of shards (uniform format)
+             csv_conn.upload(name="events", path="gs://mybkt/events_csv/")
+
+         Args:
+             name:
+                 Table name to create in Kumo; access it later via this
+                 connector.
+             path:
+                 Local path or remote URL to a ``.parquet``/``.csv`` file or
+                 a directory (uniform format). The format must match this
+                 connector's ``file_type``.
+             auto_partition:
+                 Local-only. If ``True`` and the local file is > 1 GiB, split
+                 it into ~``partition_size_mb`` MiB parts.
+             partition_size_mb:
+                 Local-only. Target partition size (100-1000 MiB) when
+                 ``auto_partition`` is ``True``.
+         """
+         upload_table(name=name, path=path, auto_partition=auto_partition,
+                      partition_size_mb=partition_size_mb,
+                      file_type=self._file_type)
+
+     def delete(
+         self,
+         name: str,
+     ) -> None:
+         r"""Synchronously deletes a previously uploaded table from the Kumo
+         data plane.
+
+         .. code-block:: python
+
+             # Assume we have uploaded a `.parquet` table named `users`, a
+             # `FileUploadConnector` called `connector` has been created, and
+             # we want to delete this table from Kumo:
+             connector.delete(name="users")
+
+         Args:
+             name: The name of the table to be deleted. This table must have
+                 previously been uploaded with a call to
+                 :meth:`~kumoai.connector.FileUploadConnector.upload`.
+         """
+         if not self.has_table(name):
+             raise ValueError(f"The table '{name}' does not exist in {self}. "
+                              f"Please check the existence of the source "
+                              f"data.")
+
+         delete_uploaded_table(name, self._file_type)
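To tie the pieces together, here is a minimal upload/access/delete lifecycle sketch, assuming an initialized kumoai client and a local Parquet file at a hypothetical path:

    import kumoai

    conn = kumoai.FileUploadConnector(file_type="parquet")

    # Upload; a file over 1 GiB would be auto-partitioned into ~250 MiB
    # parts, since `auto_partition` defaults to True:
    conn.upload(name="users", path="/data/users.parquet")  # hypothetical path

    # The uploaded table is now visible through the connector:
    assert conn.has_table("users")
    users = conn["users"]  # equivalent to conn.table("users")

    # Remove it when no longer needed (raises ValueError if absent):
    conn.delete(name="users")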