sqlframe-1.1.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sqlframe/__init__.py +0 -0
- sqlframe/_version.py +16 -0
- sqlframe/base/__init__.py +0 -0
- sqlframe/base/_typing.py +39 -0
- sqlframe/base/catalog.py +1163 -0
- sqlframe/base/column.py +388 -0
- sqlframe/base/dataframe.py +1519 -0
- sqlframe/base/decorators.py +51 -0
- sqlframe/base/exceptions.py +14 -0
- sqlframe/base/function_alternatives.py +1055 -0
- sqlframe/base/functions.py +1678 -0
- sqlframe/base/group.py +102 -0
- sqlframe/base/mixins/__init__.py +0 -0
- sqlframe/base/mixins/catalog_mixins.py +419 -0
- sqlframe/base/mixins/readwriter_mixins.py +118 -0
- sqlframe/base/normalize.py +84 -0
- sqlframe/base/operations.py +87 -0
- sqlframe/base/readerwriter.py +679 -0
- sqlframe/base/session.py +585 -0
- sqlframe/base/transforms.py +13 -0
- sqlframe/base/types.py +418 -0
- sqlframe/base/util.py +242 -0
- sqlframe/base/window.py +139 -0
- sqlframe/bigquery/__init__.py +23 -0
- sqlframe/bigquery/catalog.py +255 -0
- sqlframe/bigquery/column.py +1 -0
- sqlframe/bigquery/dataframe.py +54 -0
- sqlframe/bigquery/functions.py +378 -0
- sqlframe/bigquery/group.py +14 -0
- sqlframe/bigquery/readwriter.py +29 -0
- sqlframe/bigquery/session.py +89 -0
- sqlframe/bigquery/types.py +1 -0
- sqlframe/bigquery/window.py +1 -0
- sqlframe/duckdb/__init__.py +20 -0
- sqlframe/duckdb/catalog.py +108 -0
- sqlframe/duckdb/column.py +1 -0
- sqlframe/duckdb/dataframe.py +55 -0
- sqlframe/duckdb/functions.py +47 -0
- sqlframe/duckdb/group.py +14 -0
- sqlframe/duckdb/readwriter.py +111 -0
- sqlframe/duckdb/session.py +65 -0
- sqlframe/duckdb/types.py +1 -0
- sqlframe/duckdb/window.py +1 -0
- sqlframe/postgres/__init__.py +23 -0
- sqlframe/postgres/catalog.py +106 -0
- sqlframe/postgres/column.py +1 -0
- sqlframe/postgres/dataframe.py +54 -0
- sqlframe/postgres/functions.py +61 -0
- sqlframe/postgres/group.py +14 -0
- sqlframe/postgres/readwriter.py +29 -0
- sqlframe/postgres/session.py +68 -0
- sqlframe/postgres/types.py +1 -0
- sqlframe/postgres/window.py +1 -0
- sqlframe/redshift/__init__.py +23 -0
- sqlframe/redshift/catalog.py +127 -0
- sqlframe/redshift/column.py +1 -0
- sqlframe/redshift/dataframe.py +54 -0
- sqlframe/redshift/functions.py +18 -0
- sqlframe/redshift/group.py +14 -0
- sqlframe/redshift/readwriter.py +29 -0
- sqlframe/redshift/session.py +53 -0
- sqlframe/redshift/types.py +1 -0
- sqlframe/redshift/window.py +1 -0
- sqlframe/snowflake/__init__.py +26 -0
- sqlframe/snowflake/catalog.py +134 -0
- sqlframe/snowflake/column.py +1 -0
- sqlframe/snowflake/dataframe.py +54 -0
- sqlframe/snowflake/functions.py +18 -0
- sqlframe/snowflake/group.py +14 -0
- sqlframe/snowflake/readwriter.py +29 -0
- sqlframe/snowflake/session.py +53 -0
- sqlframe/snowflake/types.py +1 -0
- sqlframe/snowflake/window.py +1 -0
- sqlframe/spark/__init__.py +23 -0
- sqlframe/spark/catalog.py +1028 -0
- sqlframe/spark/column.py +1 -0
- sqlframe/spark/dataframe.py +54 -0
- sqlframe/spark/functions.py +22 -0
- sqlframe/spark/group.py +14 -0
- sqlframe/spark/readwriter.py +29 -0
- sqlframe/spark/session.py +90 -0
- sqlframe/spark/types.py +1 -0
- sqlframe/spark/window.py +1 -0
- sqlframe/standalone/__init__.py +26 -0
- sqlframe/standalone/catalog.py +13 -0
- sqlframe/standalone/column.py +1 -0
- sqlframe/standalone/dataframe.py +36 -0
- sqlframe/standalone/functions.py +1 -0
- sqlframe/standalone/group.py +14 -0
- sqlframe/standalone/readwriter.py +19 -0
- sqlframe/standalone/session.py +40 -0
- sqlframe/standalone/types.py +1 -0
- sqlframe/standalone/window.py +1 -0
- sqlframe-1.1.3.dist-info/LICENSE +21 -0
- sqlframe-1.1.3.dist-info/METADATA +172 -0
- sqlframe-1.1.3.dist-info/RECORD +98 -0
- sqlframe-1.1.3.dist-info/WHEEL +5 -0
- sqlframe-1.1.3.dist-info/top_level.txt +1 -0
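The listing shows the package layout: a dialect-agnostic `sqlframe/base` package plus one subpackage per backend (bigquery, duckdb, postgres, redshift, snowflake, spark, standalone), each mirroring the same module set (catalog, column, dataframe, functions, group, readwriter, session, types, window). In practice, switching engines is therefore mostly an import change. Below is a minimal usage sketch under stated assumptions: `StandaloneSession` as the class exported by `sqlframe.standalone`, a PySpark-style `createDataFrame`, and a `DataFrame.sql()` method that renders SQL; none of these names are confirmed by this diff, so verify them against the subpackage's `__init__.py`.

```python
# Hypothetical sketch: build a PySpark-style DataFrame lineage with no engine
# attached, then render it as SQL. Class/function names are assumptions
# inferred from the module layout above, not confirmed by this diff.
from sqlframe.standalone import StandaloneSession
from sqlframe.standalone import functions as F

session = StandaloneSession()
df = (
    session.createDataFrame([(1, "a"), (2, "b")], ["id", "val"])
    .groupBy("val")
    .agg(F.count("id").alias("n"))
)
print(df.sql())  # emits a SQL string rather than executing on a cluster
```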
sqlframe/base/catalog.py
ADDED
@@ -0,0 +1,1163 @@
# This code is based on code from Apache Spark under the license found in the LICENSE file located in the 'sqlframe' folder.

from __future__ import annotations

import typing as t

from sqlglot import MappingSchema, exp

from sqlframe.base.decorators import normalize
from sqlframe.base.exceptions import TableSchemaError
from sqlframe.base.util import ensure_column_mapping, to_schema

if t.TYPE_CHECKING:
    from sqlglot.schema import ColumnMapping

    from sqlframe.base._typing import StorageLevel, UserDefinedFunctionLike
    from sqlframe.base.session import DF, _BaseSession
    from sqlframe.base.types import DataType, StructType

    SESSION = t.TypeVar("SESSION", bound=_BaseSession)
else:
    DF = t.TypeVar("DF")
    SESSION = t.TypeVar("SESSION")


class _BaseCatalog(t.Generic[SESSION, DF]):
    """User-facing catalog API, accessible through `SparkSession.catalog`."""

    def __init__(self, sparkSession: SESSION, schema: t.Optional[MappingSchema] = None) -> None:
        """Create a new Catalog that wraps the underlying JVM object."""
        self.session = sparkSession
        self._schema = schema or MappingSchema()

    @property
    def spark(self) -> SESSION:
        return self.session

    def ensure_table(self, table_name: exp.Table | str) -> exp.Table:
        return (
            (
                exp.to_table(table_name, dialect=self.session.input_dialect)
                .transform(self.session.input_dialect.normalize_identifier)
                .assert_is(exp.Table)
            )
            if isinstance(table_name, str)
            else table_name
        )

    def get_columns_from_schema(self, table: exp.Table | str) -> t.Dict[str, exp.DataType]:
        table = self.ensure_table(table)
        return {
            exp.column(name, quoted=True).sql(
                dialect=self.session.input_dialect
            ): exp.DataType.build(dtype, dialect=self.session.input_dialect)
            for name, dtype in self._schema.find(table, raise_on_missing=True).items()  # type: ignore
        }

    def get_columns(self, table: exp.Table | str) -> t.Dict[str, exp.DataType]:
        table = self.ensure_table(table)
        columns = self.listColumns(table.sql(dialect=self.session.input_dialect))
        if not columns:
            return {}
        return {
            exp.column(c.name, quoted=True).sql(
                dialect=self.session.input_dialect
            ): exp.DataType.build(c.dataType, dialect=self.session.input_dialect)
            for c in columns
        }

    def add_table(
        self, table: exp.Table | str, column_mapping: t.Optional[ColumnMapping] = None
    ) -> None:
        # TODO: Make this an update-or-add operation
        table = self.ensure_table(table)
        if self._schema.find(table):
            return
        if not column_mapping:
            try:
                column_mapping = self.get_columns(table)
            except NotImplementedError:
                # TODO: Add doc link
                raise TableSchemaError(
                    "This session does not have access to a catalog that can lookup column information. See docs for explicitly defining columns or using a session that can automatically determine this."
                )
        column_mapping = ensure_column_mapping(column_mapping)  # type: ignore
        self._schema.add_table(table, column_mapping, dialect=self.session.input_dialect)

    @normalize(["dbName"])
    def getDatabase(self, dbName: str) -> Database:
        """Get the database with the specified name.
        This throws an :class:`AnalysisException` when the database cannot be found.

        .. versionadded:: 3.4.0

        Parameters
        ----------
        dbName : str
            name of the database to get.

        Returns
        -------
        :class:`Database`
            The database found by the name.

        Examples
        --------
        >>> spark.catalog.getDatabase("default")
        Database(name='default', catalog='spark_catalog', description='default database', ...

        Using the fully qualified name with the catalog name.

        >>> spark.catalog.getDatabase("spark_catalog.default")
        Database(name='default', catalog='spark_catalog', description='default database', ...
        """
        schema = to_schema(dbName, dialect=self.session.input_dialect)
        database_name = schema.db
        databases = self.listDatabases(pattern=database_name)
        if len(databases) == 0:
            raise ValueError(f"Database '{dbName}' not found")
        if len(databases) > 1:
            if schema.catalog is not None:
                filtered_databases = [db for db in databases if db.catalog == schema.catalog]
                if filtered_databases:
                    return filtered_databases[0]
        return databases[0]

    @normalize(["dbName"])
    def databaseExists(self, dbName: str) -> bool:
        """Check if the database with the specified name exists.

        .. versionadded:: 3.3.0

        Parameters
        ----------
        dbName : str
            name of the database to check existence

            .. versionchanged:: 3.4.0
                Allow ``dbName`` to be qualified with catalog name.

        Returns
        -------
        bool
            Indicating whether the database exists

        Examples
        --------
        Check if 'test_new_database' database exists

        >>> spark.catalog.databaseExists("test_new_database")
        False
        >>> _ = spark.sql("CREATE DATABASE test_new_database")
        >>> spark.catalog.databaseExists("test_new_database")
        True

        Using the fully qualified name with the catalog name.

        >>> spark.catalog.databaseExists("spark_catalog.test_new_database")
        True
        >>> _ = spark.sql("DROP DATABASE test_new_database")
        """
        try:
            self.getDatabase(dbName)
            return True
        except ValueError:
            return False

    @normalize(["tableName"])
    def getTable(self, tableName: str) -> Table:
        """Get the table or view with the specified name. This table can be a temporary view or a
        table/view. This throws an :class:`AnalysisException` when no Table can be found.

        .. versionadded:: 3.4.0

        Parameters
        ----------
        tableName : str
            name of the table to get.

            .. versionchanged:: 3.4.0
                Allow `tableName` to be qualified with catalog name.

        Returns
        -------
        :class:`Table`
            The table found by the name.

        Examples
        --------
        >>> _ = spark.sql("DROP TABLE IF EXISTS tbl1")
        >>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING parquet")
        >>> spark.catalog.getTable("tbl1")
        Table(name='tbl1', catalog='spark_catalog', namespace=['default'], ...

        Using the fully qualified name with the catalog name.

        >>> spark.catalog.getTable("default.tbl1")
        Table(name='tbl1', catalog='spark_catalog', namespace=['default'], ...
        >>> spark.catalog.getTable("spark_catalog.default.tbl1")
        Table(name='tbl1', catalog='spark_catalog', namespace=['default'], ...
        >>> _ = spark.sql("DROP TABLE tbl1")

        Throw an analysis exception when the table does not exist.

        >>> spark.catalog.getTable("tbl1")
        Traceback (most recent call last):
            ...
        AnalysisException: ...
        """
        table = exp.to_table(tableName, dialect=self.session.input_dialect)
        schema = table.copy()
        schema.set("this", None)
        tables = self.listTables(
            schema.sql(dialect=self.session.input_dialect) if schema.db else None
        )
        matching_tables = [t for t in tables if t.name == table.name]
        if not matching_tables:
            raise ValueError(f"Table '{tableName}' not found")
        return matching_tables[0]

    def functionExists(self, functionName: str, dbName: t.Optional[str] = None) -> bool:
        """Check if the function with the specified name exists.
        This can either be a temporary function or a function.

        .. versionadded:: 3.3.0

        Parameters
        ----------
        functionName : str
            name of the function to check existence

            .. versionchanged:: 3.4.0
                Allow ``functionName`` to be qualified with catalog name.

        dbName : str, optional
            name of the database to check function existence in.

        Returns
        -------
        bool
            Indicating whether the function exists

        Notes
        -----
        If no database is specified, the current database and catalog
        are used. This API includes all temporary functions.

        Examples
        --------
        >>> spark.catalog.functionExists("count")
        True

        Using the fully qualified name for function name.

        >>> spark.catalog.functionExists("default.unexisting_function")
        False
        >>> spark.catalog.functionExists("spark_catalog.default.unexisting_function")
        False
        """
        functions = self.listFunctions(dbName)
        return any(f.name == functionName for f in functions)

    def getFunction(self, functionName: str) -> Function:
        """Get the function with the specified name. This function can be a temporary function or a
        function. This throws an :class:`AnalysisException` when the function cannot be found.

        .. versionadded:: 3.4.0

        Parameters
        ----------
        functionName : str
            name of the function to check existence.

        Returns
        -------
        :class:`Function`
            The function found by the name.

        Examples
        --------
        >>> _ = spark.sql(
        ...     "CREATE FUNCTION my_func1 AS 'test.org.apache.spark.sql.MyDoubleAvg'")
        >>> spark.catalog.getFunction("my_func1")
        Function(name='my_func1', catalog='spark_catalog', namespace=['default'], ...

        Using the fully qualified name for function name.

        >>> spark.catalog.getFunction("default.my_func1")
        Function(name='my_func1', catalog='spark_catalog', namespace=['default'], ...
        >>> spark.catalog.getFunction("spark_catalog.default.my_func1")
        Function(name='my_func1', catalog='spark_catalog', namespace=['default'], ...

        Throw an analysis exception when the function does not exist.

        >>> spark.catalog.getFunction("my_func2")
        Traceback (most recent call last):
            ...
        AnalysisException: ...
        """
        table = exp.to_table(functionName, dialect=self.session.input_dialect)
        if table.catalog or table.db:
            schema = table.copy()
            schema.set("this", None)
            db_name = schema.sql(dialect=self.session.input_dialect)
            function_name = table.name
        else:
            db_name = None
            function_name = functionName
        functions = self.listFunctions(dbName=db_name, pattern=function_name)
        matching_functions = [f for f in functions if f.name == function_name]
        if not matching_functions:
            raise ValueError(f"Function '{functionName}' not found")
        return matching_functions[0]

    @normalize(["tableName", "dbName"])
    def tableExists(self, tableName: str, dbName: t.Optional[str] = None) -> bool:
        """Check if the table or view with the specified name exists.
        This can either be a temporary view or a table/view.

        .. versionadded:: 3.3.0

        Parameters
        ----------
        tableName : str
            name of the table to check existence.
            If no database is specified, first try to treat ``tableName`` as a
            multi-layer-namespace identifier, then try ``tableName`` as a normal table
            name in the current database if necessary.

            .. versionchanged:: 3.4.0
                Allow ``tableName`` to be qualified with catalog name when ``dbName`` is None.

        dbName : str, optional
            name of the database to check table existence in.

        Returns
        -------
        bool
            Indicating whether the table/view exists

        Examples
        --------
        This function can check if a table is defined or not:

        >>> spark.catalog.tableExists("unexisting_table")
        False
        >>> _ = spark.sql("DROP TABLE IF EXISTS tbl1")
        >>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING parquet")
        >>> spark.catalog.tableExists("tbl1")
        True

        Using the fully qualified names for tables.

        >>> spark.catalog.tableExists("default.tbl1")
        True
        >>> spark.catalog.tableExists("spark_catalog.default.tbl1")
        True
        >>> spark.catalog.tableExists("tbl1", "default")
        True
        >>> _ = spark.sql("DROP TABLE tbl1")

        Check if views exist:

        >>> spark.catalog.tableExists("view1")
        False
        >>> _ = spark.sql("CREATE VIEW view1 AS SELECT 1")
        >>> spark.catalog.tableExists("view1")
        True

        Using the fully qualified names for views.

        >>> spark.catalog.tableExists("default.view1")
        True
        >>> spark.catalog.tableExists("spark_catalog.default.view1")
        True
        >>> spark.catalog.tableExists("view1", "default")
        True
        >>> _ = spark.sql("DROP VIEW view1")

        Check if temporary views exist:

        >>> _ = spark.sql("CREATE TEMPORARY VIEW view1 AS SELECT 1")
        >>> spark.catalog.tableExists("view1")
        True
        >>> df = spark.sql("DROP VIEW view1")
        >>> spark.catalog.tableExists("view1")
        False
        """
        table = exp.to_table(tableName, dialect=self.session.input_dialect)
        schema_arg = to_schema(dbName, dialect=self.session.input_dialect) if dbName else None
        if not table.db:
            if schema_arg and schema_arg.db:
                table.set("db", schema_arg.args["db"])
            else:
                table.set("db", exp.parse_identifier(self.currentDatabase(), dialect="duckdb"))
        if not table.catalog:
            if schema_arg and schema_arg.catalog:
                table.set("catalog", schema_arg.args["catalog"])
            else:
                table.set("catalog", exp.parse_identifier(self.currentCatalog(), dialect="duckdb"))
        table_name = table.name
        schema = table.copy()
        schema.set("this", None)
        tables = self.listTables(schema.sql(dialect=self.session.input_dialect))
        return any(x.name == table_name for x in tables)

    def currentCatalog(self) -> str:
        """Returns the current default catalog in this session.

        .. versionadded:: 3.4.0

        Examples
        --------
        >>> spark.catalog.currentCatalog()
        'spark_catalog'
        """
        raise NotImplementedError

    def setCurrentCatalog(self, catalogName: str) -> None:
        """Sets the current default catalog in this session.

        .. versionadded:: 3.4.0

        Parameters
        ----------
        catalogName : str
            name of the catalog to set

        Examples
        --------
        >>> spark.catalog.setCurrentCatalog("spark_catalog")
        """
        raise NotImplementedError

    def currentDatabase(self) -> str:
        """Returns the current default schema in this session.

        .. versionadded:: 3.4.0

        Examples
        --------
        >>> spark.catalog.currentDatabase()
        'default'
        """
        raise NotImplementedError

    def listDatabases(self, pattern: t.Optional[str] = None) -> t.List[Database]:
        """
        Returns a list of databases available across all sessions.

        .. versionadded:: 2.0.0

        Parameters
        ----------
        pattern : str
            The pattern that the database name needs to match.

            .. versionchanged:: 3.5.0
                Adds ``pattern`` argument.

        Returns
        -------
        list
            A list of :class:`Database`.

        Examples
        --------
        >>> spark.catalog.listDatabases()
        [Database(name='default', catalog='spark_catalog', description='default database', ...

        >>> spark.catalog.listDatabases("def*")
        [Database(name='default', catalog='spark_catalog', description='default database', ...

        >>> spark.catalog.listDatabases("def2*")
        []
        """
        raise NotImplementedError

    def listCatalogs(self, pattern: t.Optional[str] = None) -> t.List[CatalogMetadata]:
        """
        Returns a list of catalogs available across all sessions.

        .. versionadded:: 3.4.0

        Parameters
        ----------
        pattern : str
            The pattern that the catalog name needs to match.

            .. versionchanged:: 3.5.0
                Adds ``pattern`` argument.

        Returns
        -------
        list
            A list of :class:`CatalogMetadata`.

        Examples
        --------
        >>> spark.catalog.listCatalogs()
        [CatalogMetadata(name='spark_catalog', description=None)]

        >>> spark.catalog.listCatalogs("spark*")
        [CatalogMetadata(name='spark_catalog', description=None)]

        >>> spark.catalog.listCatalogs("hive*")
        []
        """
        raise NotImplementedError

    def setCurrentDatabase(self, dbName: str) -> None:
        """
        Sets the current default database in this session.

        .. versionadded:: 2.0.0

        Examples
        --------
        >>> spark.catalog.setCurrentDatabase("default")
        """
        raise NotImplementedError

    def listTables(
        self, dbName: t.Optional[str] = None, pattern: t.Optional[str] = None
    ) -> t.List[Table]:
        """Returns a list of tables/views in the specified database.

        .. versionadded:: 2.0.0

        Parameters
        ----------
        dbName : str
            name of the database to list the tables.

            .. versionchanged:: 3.4.0
                Allow ``dbName`` to be qualified with catalog name.

        pattern : str
            The pattern that the table name needs to match.

            .. versionchanged:: 3.5.0
                Adds ``pattern`` argument.

        Returns
        -------
        list
            A list of :class:`Table`.

        Notes
        -----
        If no database is specified, the current database and catalog
        are used. This API includes all temporary views.

        Examples
        --------
        >>> spark.range(1).createTempView("test_view")
        >>> spark.catalog.listTables()
        [Table(name='test_view', catalog=None, namespace=[], description=None, ...

        >>> spark.catalog.listTables(pattern="test*")
        [Table(name='test_view', catalog=None, namespace=[], description=None, ...

        >>> spark.catalog.listTables(pattern="table*")
        []

        >>> _ = spark.catalog.dropTempView("test_view")
        >>> spark.catalog.listTables()
        []
        """
        raise NotImplementedError

    def listColumns(self, tableName: str, dbName: t.Optional[str] = None) -> t.List[Column]:
        """Returns a list of columns for the given table/view in the specified database.

        .. versionadded:: 2.0.0

        Parameters
        ----------
        tableName : str
            name of the table to list columns.

            .. versionchanged:: 3.4.0
                Allow ``tableName`` to be qualified with catalog name when ``dbName`` is None.

        dbName : str, optional
            name of the database to find the table to list columns.

        Returns
        -------
        list
            A list of :class:`Column`.

        Notes
        -----
        The order of arguments here is different from that of its JVM counterpart
        because Python does not support method overloading.

        If no database is specified, the current database and catalog
        are used. This API includes all temporary views.

        Examples
        --------
        >>> _ = spark.sql("DROP TABLE IF EXISTS tblA")
        >>> _ = spark.sql("CREATE TABLE tblA (name STRING, age INT) USING parquet")
        >>> spark.catalog.listColumns("tblA")
        [Column(name='name', description=None, dataType='string', nullable=True, ...
        >>> _ = spark.sql("DROP TABLE tblA")
        """
        raise NotImplementedError

    def listFunctions(
        self, dbName: t.Optional[str] = None, pattern: t.Optional[str] = None
    ) -> t.List[Function]:
        """
        Returns a list of functions registered in the specified database.

        .. versionadded:: 3.4.0

        Parameters
        ----------
        dbName : str
            name of the database to list the functions.
            ``dbName`` can be qualified with catalog name.
        pattern : str
            The pattern that the function name needs to match.

            .. versionchanged:: 3.5.0
                Adds ``pattern`` argument.

        Returns
        -------
        list
            A list of :class:`Function`.

        Notes
        -----
        If no database is specified, the current database and catalog
        are used. This API includes all temporary functions.

        Examples
        --------
        >>> spark.catalog.listFunctions()
        [Function(name=...

        >>> spark.catalog.listFunctions(pattern="to_*")
        [Function(name=...

        >>> spark.catalog.listFunctions(pattern="*not_existing_func*")
        []
        """
        raise NotImplementedError

    def createExternalTable(
        self,
        tableName: str,
        path: t.Optional[str] = None,
        source: t.Optional[str] = None,
        schema: t.Optional[StructType] = None,
        **options: str,
    ) -> DF:
        """Creates a table based on the dataset in a data source.

        It returns the DataFrame associated with the external table.

        The data source is specified by the ``source`` and a set of ``options``.
        If ``source`` is not specified, the default data source configured by
        ``spark.sql.sources.default`` will be used.

        Optionally, a schema can be provided as the schema of the returned :class:`DataFrame` and
        created external table.

        .. versionadded:: 2.0.0

        Returns
        -------
        :class:`DataFrame`
        """
        raise NotImplementedError()

    def createTable(
        self,
        tableName: str,
        path: t.Optional[str] = None,
        source: t.Optional[str] = None,
        schema: t.Optional[StructType] = None,
        description: t.Optional[str] = None,
        **options: str,
    ) -> DF:
        """Creates a table based on the dataset in a data source.

        .. versionadded:: 2.2.0

        Parameters
        ----------
        tableName : str
            name of the table to create.

            .. versionchanged:: 3.4.0
                Allow ``tableName`` to be qualified with catalog name.

        path : str, optional
            the path in which the data for this table exists.
            When ``path`` is specified, an external table is
            created from the data at the given path. Otherwise a managed table is created.
        source : str, optional
            the source of this table such as 'parquet', 'orc', etc.
            If ``source`` is not specified, the default data source configured by
            ``spark.sql.sources.default`` will be used.
        schema : :class:`StructType`, optional
            the schema for this table.
        description : str, optional
            the description of this table.

            .. versionchanged:: 3.1.0
                Added the ``description`` parameter.

        **options : dict, optional
            extra options to specify in the table.

        Returns
        -------
        :class:`DataFrame`
            The DataFrame associated with the table.

        Examples
        --------
        Creating a managed table.

        >>> _ = spark.catalog.createTable("tbl1", schema=spark.range(1).schema, source='parquet')
        >>> _ = spark.sql("DROP TABLE tbl1")

        Creating an external table.

        >>> import tempfile
        >>> with tempfile.TemporaryDirectory() as d:
        ...     _ = spark.catalog.createTable(
        ...         "tbl2", schema=spark.range(1).schema, path=d, source='parquet')
        >>> _ = spark.sql("DROP TABLE tbl2")
        """
        raise NotImplementedError()

    def dropTempView(self, viewName: str) -> bool:
        """Drops the local temporary view with the given view name in the catalog.
        If the view has been cached before, then it will also be uncached.
        Returns true if this view is dropped successfully, false otherwise.

        .. versionadded:: 2.0.0

        Parameters
        ----------
        viewName : str
            name of the temporary view to drop.

        Returns
        -------
        bool
            If the temporary view was successfully dropped or not.

            .. versionadded:: 2.1.0
                The return type of this method was ``None`` in Spark 2.0, but changed to ``bool``
                in Spark 2.1.

        Examples
        --------
        >>> spark.createDataFrame([(1, 1)]).createTempView("my_table")

        Dropping the temporary view.

        >>> spark.catalog.dropTempView("my_table")
        True

        Throw an exception if the temporary view does not exist.

        >>> spark.table("my_table")
        Traceback (most recent call last):
            ...
        AnalysisException: ...
        """
        raise NotImplementedError()

    def dropGlobalTempView(self, viewName: str) -> bool:
        """Drops the global temporary view with the given view name in the catalog.

        .. versionadded:: 2.1.0

        Parameters
        ----------
        viewName : str
            name of the global view to drop.

        Returns
        -------
        bool
            If the global view was successfully dropped or not.

        Notes
        -----
        If the view has been cached before, then it will also be uncached.

        Examples
        --------
        >>> spark.createDataFrame([(1, 1)]).createGlobalTempView("my_table")

        Dropping the global view.

        >>> spark.catalog.dropGlobalTempView("my_table")
        True

        Throw an exception if the global view does not exist.

        >>> spark.table("global_temp.my_table")
        Traceback (most recent call last):
            ...
        AnalysisException: ...
        """
        raise NotImplementedError()

    def registerFunction(
        self, name: str, f: t.Callable[..., t.Any], returnType: t.Optional[DataType] = None
    ) -> UserDefinedFunctionLike:
        """An alias for :func:`spark.udf.register`.
        See :meth:`pyspark.sql.UDFRegistration.register`.

        .. versionadded:: 2.0.0

        .. deprecated:: 2.3.0
            Use :func:`spark.udf.register` instead.

        .. versionchanged:: 3.4.0
            Supports Spark Connect.
        """
        raise NotImplementedError()

    def isCached(self, tableName: str) -> bool:
        """
        Returns true if the table is currently cached in-memory.

        .. versionadded:: 2.0.0

        Parameters
        ----------
        tableName : str
            name of the table to get.

            .. versionchanged:: 3.4.0
                Allow ``tableName`` to be qualified with catalog name.

        Returns
        -------
        bool

        Examples
        --------
        >>> _ = spark.sql("DROP TABLE IF EXISTS tbl1")
        >>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING parquet")
        >>> spark.catalog.cacheTable("tbl1")
        >>> spark.catalog.isCached("tbl1")
        True

        Throw an analysis exception when the table does not exist.

        >>> spark.catalog.isCached("not_existing_table")
        Traceback (most recent call last):
            ...
        AnalysisException: ...

        Using the fully qualified name for the table.

        >>> spark.catalog.isCached("spark_catalog.default.tbl1")
        True
        >>> spark.catalog.uncacheTable("tbl1")
        >>> _ = spark.sql("DROP TABLE tbl1")
        """
        raise NotImplementedError()

    def cacheTable(self, tableName: str, storageLevel: t.Optional[StorageLevel] = None) -> None:
        """Caches the specified table in-memory or with given storage level.
        Default MEMORY_AND_DISK.

        .. versionadded:: 2.0.0

        Parameters
        ----------
        tableName : str
            name of the table to get.

            .. versionchanged:: 3.4.0
                Allow ``tableName`` to be qualified with catalog name.

        storageLevel : :class:`StorageLevel`
            storage level to set for persistence.

            .. versionchanged:: 3.5.0
                Allow to specify storage level.

        Examples
        --------
        >>> _ = spark.sql("DROP TABLE IF EXISTS tbl1")
        >>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING parquet")
        >>> spark.catalog.cacheTable("tbl1")

        or

        >>> spark.catalog.cacheTable("tbl1", StorageLevel.OFF_HEAP)

        Throw an analysis exception when the table does not exist.

        >>> spark.catalog.cacheTable("not_existing_table")
        Traceback (most recent call last):
            ...
        AnalysisException: ...

        Using the fully qualified name for the table.

        >>> spark.catalog.cacheTable("spark_catalog.default.tbl1")
        >>> spark.catalog.uncacheTable("tbl1")
        >>> _ = spark.sql("DROP TABLE tbl1")
        """
        raise NotImplementedError()

    def uncacheTable(self, tableName: str) -> None:
        """Removes the specified table from the in-memory cache.

        .. versionadded:: 2.0.0

        Parameters
        ----------
        tableName : str
            name of the table to get.

            .. versionchanged:: 3.4.0
                Allow ``tableName`` to be qualified with catalog name.

        Examples
        --------
        >>> _ = spark.sql("DROP TABLE IF EXISTS tbl1")
        >>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING parquet")
        >>> spark.catalog.cacheTable("tbl1")
        >>> spark.catalog.uncacheTable("tbl1")
        >>> spark.catalog.isCached("tbl1")
        False

        Throw an analysis exception when the table does not exist.

        >>> spark.catalog.uncacheTable("not_existing_table")
        Traceback (most recent call last):
            ...
        AnalysisException: ...

        Using the fully qualified name for the table.

        >>> spark.catalog.uncacheTable("spark_catalog.default.tbl1")
        >>> spark.catalog.isCached("tbl1")
        False
        >>> _ = spark.sql("DROP TABLE tbl1")
        """
        raise NotImplementedError()

    def clearCache(self) -> None:
        """Removes all cached tables from the in-memory cache.

        .. versionadded:: 2.0.0

        Examples
        --------
        >>> _ = spark.sql("DROP TABLE IF EXISTS tbl1")
        >>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING parquet")
        >>> spark.catalog.clearCache()
        >>> spark.catalog.isCached("tbl1")
        False
        >>> _ = spark.sql("DROP TABLE tbl1")
        """
        raise NotImplementedError()

    def refreshTable(self, tableName: str) -> None:
        """Invalidates and refreshes all the cached data and metadata of the given table.

        .. versionadded:: 2.0.0

        Parameters
        ----------
        tableName : str
            name of the table to get.

            .. versionchanged:: 3.4.0
                Allow ``tableName`` to be qualified with catalog name.

        Examples
        --------
        The example below caches a table, and then removes the data.

        >>> import tempfile
        >>> with tempfile.TemporaryDirectory() as d:
        ...     _ = spark.sql("DROP TABLE IF EXISTS tbl1")
        ...     _ = spark.sql(
        ...         "CREATE TABLE tbl1 (col STRING) USING TEXT LOCATION '{}'".format(d))
        ...     _ = spark.sql("INSERT INTO tbl1 SELECT 'abc'")
        ...     spark.catalog.cacheTable("tbl1")
        ...     spark.table("tbl1").show()
        +---+
        |col|
        +---+
        |abc|
        +---+

        Because the table is cached, it computes from the cached data as below.

        >>> spark.table("tbl1").count()
        1

        After refreshing the table, it shows 0 because the data does not exist anymore.

        >>> spark.catalog.refreshTable("tbl1")
        >>> spark.table("tbl1").count()
        0

        Using the fully qualified name for the table.

        >>> spark.catalog.refreshTable("spark_catalog.default.tbl1")
        >>> _ = spark.sql("DROP TABLE tbl1")
        """
        raise NotImplementedError()

    def recoverPartitions(self, tableName: str) -> None:
        """Recovers all the partitions of the given table and updates the catalog.

        .. versionadded:: 2.1.1

        Parameters
        ----------
        tableName : str
            name of the table to get.

        Notes
        -----
        Only works with a partitioned table, and not a view.

        Examples
        --------
        The example below creates a partitioned table against the existing directory of
        the partitioned table. After that, it recovers the partitions.

        >>> import tempfile
        >>> with tempfile.TemporaryDirectory() as d:
        ...     _ = spark.sql("DROP TABLE IF EXISTS tbl1")
        ...     spark.range(1).selectExpr(
        ...         "id as key", "id as value").write.partitionBy("key").mode("overwrite").save(d)
        ...     _ = spark.sql(
        ...         "CREATE TABLE tbl1 (key LONG, value LONG)"
        ...         "USING parquet OPTIONS (path '{}') PARTITIONED BY (key)".format(d))
        ...     spark.table("tbl1").show()
        ...     spark.catalog.recoverPartitions("tbl1")
        ...     spark.table("tbl1").show()
        +-----+---+
        |value|key|
        +-----+---+
        +-----+---+
        +-----+---+
        |value|key|
        +-----+---+
        |    0|  0|
        +-----+---+
        >>> _ = spark.sql("DROP TABLE tbl1")
        """
        raise NotImplementedError()

    def refreshByPath(self, path: str) -> None:
        """Invalidates and refreshes all the cached data (and the associated metadata) for any
        DataFrame that contains the given data source path.

        .. versionadded:: 2.2.0

        Parameters
        ----------
        path : str
            the path to refresh the cache.

        Examples
        --------
        The example below caches a table, and then removes the data.

        >>> import tempfile
        >>> with tempfile.TemporaryDirectory() as d:
        ...     _ = spark.sql("DROP TABLE IF EXISTS tbl1")
        ...     _ = spark.sql(
        ...         "CREATE TABLE tbl1 (col STRING) USING TEXT LOCATION '{}'".format(d))
        ...     _ = spark.sql("INSERT INTO tbl1 SELECT 'abc'")
        ...     spark.catalog.cacheTable("tbl1")
        ...     spark.table("tbl1").show()
        +---+
        |col|
        +---+
        |abc|
        +---+

        Because the table is cached, it computes from the cached data as below.

        >>> spark.table("tbl1").count()
        1

        After refreshing the table by path, it shows 0 because the data does not exist anymore.

        >>> spark.catalog.refreshByPath(d)
        >>> spark.table("tbl1").count()
        0

        >>> _ = spark.sql("DROP TABLE tbl1")
        """
        raise NotImplementedError()

    def _reset(self) -> None:
        """(Internal use only) Drop all existing databases (except "default"), tables,
        partitions and functions, and set the current database to "default".

        This is mainly used for tests.
        """
        raise NotImplementedError()


class CatalogMetadata(t.NamedTuple):
    name: str
    description: t.Optional[str]


class Database(t.NamedTuple):
    name: str
    catalog: t.Optional[str]
    description: t.Optional[str]
    locationUri: str


class Table(t.NamedTuple):
    name: str
    catalog: t.Optional[str]
    namespace: t.Optional[t.List[str]]
    description: t.Optional[str]
    tableType: str
    isTemporary: bool

    @property
    def database(self) -> t.Optional[str]:
        if self.namespace is not None and len(self.namespace) == 1:
            return self.namespace[0]
        else:
            return None


class Column(t.NamedTuple):
    name: str
    description: t.Optional[str]
    dataType: str
    nullable: bool
    isPartition: bool
    isBucket: bool


class Function(t.NamedTuple):
    name: str
    catalog: t.Optional[str]
    namespace: t.Optional[t.List[str]]
    description: t.Optional[str]
    className: str
    isTemporary: bool