sqlframe-1.1.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. sqlframe/__init__.py +0 -0
  2. sqlframe/_version.py +16 -0
  3. sqlframe/base/__init__.py +0 -0
  4. sqlframe/base/_typing.py +39 -0
  5. sqlframe/base/catalog.py +1163 -0
  6. sqlframe/base/column.py +388 -0
  7. sqlframe/base/dataframe.py +1519 -0
  8. sqlframe/base/decorators.py +51 -0
  9. sqlframe/base/exceptions.py +14 -0
  10. sqlframe/base/function_alternatives.py +1055 -0
  11. sqlframe/base/functions.py +1678 -0
  12. sqlframe/base/group.py +102 -0
  13. sqlframe/base/mixins/__init__.py +0 -0
  14. sqlframe/base/mixins/catalog_mixins.py +419 -0
  15. sqlframe/base/mixins/readwriter_mixins.py +118 -0
  16. sqlframe/base/normalize.py +84 -0
  17. sqlframe/base/operations.py +87 -0
  18. sqlframe/base/readerwriter.py +679 -0
  19. sqlframe/base/session.py +585 -0
  20. sqlframe/base/transforms.py +13 -0
  21. sqlframe/base/types.py +418 -0
  22. sqlframe/base/util.py +242 -0
  23. sqlframe/base/window.py +139 -0
  24. sqlframe/bigquery/__init__.py +23 -0
  25. sqlframe/bigquery/catalog.py +255 -0
  26. sqlframe/bigquery/column.py +1 -0
  27. sqlframe/bigquery/dataframe.py +54 -0
  28. sqlframe/bigquery/functions.py +378 -0
  29. sqlframe/bigquery/group.py +14 -0
  30. sqlframe/bigquery/readwriter.py +29 -0
  31. sqlframe/bigquery/session.py +89 -0
  32. sqlframe/bigquery/types.py +1 -0
  33. sqlframe/bigquery/window.py +1 -0
  34. sqlframe/duckdb/__init__.py +20 -0
  35. sqlframe/duckdb/catalog.py +108 -0
  36. sqlframe/duckdb/column.py +1 -0
  37. sqlframe/duckdb/dataframe.py +55 -0
  38. sqlframe/duckdb/functions.py +47 -0
  39. sqlframe/duckdb/group.py +14 -0
  40. sqlframe/duckdb/readwriter.py +111 -0
  41. sqlframe/duckdb/session.py +65 -0
  42. sqlframe/duckdb/types.py +1 -0
  43. sqlframe/duckdb/window.py +1 -0
  44. sqlframe/postgres/__init__.py +23 -0
  45. sqlframe/postgres/catalog.py +106 -0
  46. sqlframe/postgres/column.py +1 -0
  47. sqlframe/postgres/dataframe.py +54 -0
  48. sqlframe/postgres/functions.py +61 -0
  49. sqlframe/postgres/group.py +14 -0
  50. sqlframe/postgres/readwriter.py +29 -0
  51. sqlframe/postgres/session.py +68 -0
  52. sqlframe/postgres/types.py +1 -0
  53. sqlframe/postgres/window.py +1 -0
  54. sqlframe/redshift/__init__.py +23 -0
  55. sqlframe/redshift/catalog.py +127 -0
  56. sqlframe/redshift/column.py +1 -0
  57. sqlframe/redshift/dataframe.py +54 -0
  58. sqlframe/redshift/functions.py +18 -0
  59. sqlframe/redshift/group.py +14 -0
  60. sqlframe/redshift/readwriter.py +29 -0
  61. sqlframe/redshift/session.py +53 -0
  62. sqlframe/redshift/types.py +1 -0
  63. sqlframe/redshift/window.py +1 -0
  64. sqlframe/snowflake/__init__.py +26 -0
  65. sqlframe/snowflake/catalog.py +134 -0
  66. sqlframe/snowflake/column.py +1 -0
  67. sqlframe/snowflake/dataframe.py +54 -0
  68. sqlframe/snowflake/functions.py +18 -0
  69. sqlframe/snowflake/group.py +14 -0
  70. sqlframe/snowflake/readwriter.py +29 -0
  71. sqlframe/snowflake/session.py +53 -0
  72. sqlframe/snowflake/types.py +1 -0
  73. sqlframe/snowflake/window.py +1 -0
  74. sqlframe/spark/__init__.py +23 -0
  75. sqlframe/spark/catalog.py +1028 -0
  76. sqlframe/spark/column.py +1 -0
  77. sqlframe/spark/dataframe.py +54 -0
  78. sqlframe/spark/functions.py +22 -0
  79. sqlframe/spark/group.py +14 -0
  80. sqlframe/spark/readwriter.py +29 -0
  81. sqlframe/spark/session.py +90 -0
  82. sqlframe/spark/types.py +1 -0
  83. sqlframe/spark/window.py +1 -0
  84. sqlframe/standalone/__init__.py +26 -0
  85. sqlframe/standalone/catalog.py +13 -0
  86. sqlframe/standalone/column.py +1 -0
  87. sqlframe/standalone/dataframe.py +36 -0
  88. sqlframe/standalone/functions.py +1 -0
  89. sqlframe/standalone/group.py +14 -0
  90. sqlframe/standalone/readwriter.py +19 -0
  91. sqlframe/standalone/session.py +40 -0
  92. sqlframe/standalone/types.py +1 -0
  93. sqlframe/standalone/window.py +1 -0
  94. sqlframe-1.1.3.dist-info/LICENSE +21 -0
  95. sqlframe-1.1.3.dist-info/METADATA +172 -0
  96. sqlframe-1.1.3.dist-info/RECORD +98 -0
  97. sqlframe-1.1.3.dist-info/WHEEL +5 -0
  98. sqlframe-1.1.3.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1163 @@
1
+ # This code is based on code from Apache Spark under the license found in the LICENSE file located in the 'sqlframe' folder.
2
+
3
+ from __future__ import annotations
4
+
5
+ import typing as t
6
+
7
+ from sqlglot import MappingSchema, exp
8
+
9
+ from sqlframe.base.decorators import normalize
10
+ from sqlframe.base.exceptions import TableSchemaError
11
+ from sqlframe.base.util import ensure_column_mapping, to_schema
12
+
13
+ if t.TYPE_CHECKING:
14
+ from sqlglot.schema import ColumnMapping
15
+
16
+ from sqlframe.base._typing import StorageLevel, UserDefinedFunctionLike
17
+ from sqlframe.base.session import DF, _BaseSession
18
+ from sqlframe.base.types import DataType, StructType
19
+
20
+ SESSION = t.TypeVar("SESSION", bound=_BaseSession)
21
+ else:
22
+ DF = t.TypeVar("DF")
23
+ SESSION = t.TypeVar("SESSION")
24
+
25
+
26
+ class _BaseCatalog(t.Generic[SESSION, DF]):
27
+ """User-facing catalog API, accessible through `SparkSession.catalog`."""
28
+
29
+ def __init__(self, sparkSession: SESSION, schema: t.Optional[MappingSchema] = None) -> None:
30
+ """Create a new Catalog that wraps the underlying JVM object."""
31
+ self.session = sparkSession
32
+ self._schema = schema or MappingSchema()
33
+
34
+ @property
35
+ def spark(self) -> SESSION:
36
+ return self.session
37
+
38
+ def ensure_table(self, table_name: exp.Table | str) -> exp.Table:
39
+ return (
40
+ (
41
+ exp.to_table(table_name, dialect=self.session.input_dialect)
42
+ .transform(self.session.input_dialect.normalize_identifier)
43
+ .assert_is(exp.Table)
44
+ )
45
+ if isinstance(table_name, str)
46
+ else table_name
47
+ )
48
+
49
+ def get_columns_from_schema(self, table: exp.Table | str) -> t.Dict[str, exp.DataType]:
50
+ table = self.ensure_table(table)
51
+ return {
52
+ exp.column(name, quoted=True).sql(
53
+ dialect=self.session.input_dialect
54
+ ): exp.DataType.build(dtype, dialect=self.session.input_dialect)
55
+ for name, dtype in self._schema.find(table, raise_on_missing=True).items() # type: ignore
56
+ }
57
+
58
+ def get_columns(self, table: exp.Table | str) -> t.Dict[str, exp.DataType]:
59
+ table = self.ensure_table(table)
60
+ columns = self.listColumns(table.sql(dialect=self.session.input_dialect))
61
+ if not columns:
62
+ return {}
63
+ return {
64
+ exp.column(c.name, quoted=True).sql(
65
+ dialect=self.session.input_dialect
66
+ ): exp.DataType.build(c.dataType, dialect=self.session.input_dialect)
67
+ for c in columns
68
+ }
69
+
70
+ def add_table(
71
+ self, table: exp.Table | str, column_mapping: t.Optional[ColumnMapping] = None
72
+ ) -> None:
73
+ # TODO: Making this an update or add
74
+ table = self.ensure_table(table)
75
+ if self._schema.find(table):
76
+ return
77
+ if not column_mapping:
78
+ try:
79
+ column_mapping = self.get_columns(table)
80
+ except NotImplementedError:
81
+ # TODO: Add doc link
82
+ raise TableSchemaError(
83
+ "This session does not have access to a catalog that can lookup column information. See docs for explicitly defining columns or using a session that can automatically determine this."
84
+ )
85
+ column_mapping = ensure_column_mapping(column_mapping) # type: ignore
86
+ self._schema.add_table(table, column_mapping, dialect=self.session.input_dialect)
87
+
88
+ @normalize(["dbName"])
89
+ def getDatabase(self, dbName: str) -> Database:
90
+ """Get the database with the specified name.
91
+ This throws an :class:`AnalysisException` when the database cannot be found.
92
+
93
+ .. versionadded:: 3.4.0
94
+
95
+ Parameters
96
+ ----------
97
+ dbName : str
98
+ name of the database to get.
99
+
100
+ Returns
101
+ -------
102
+ :class:`Database`
103
+ The database found by the name.
104
+
105
+ Examples
106
+ --------
107
+ >>> spark.catalog.getDatabase("default")
108
+ Database(name='default', catalog='spark_catalog', description='default database', ...
109
+
110
+ Using the fully qualified name with the catalog name.
111
+
112
+ >>> spark.catalog.getDatabase("spark_catalog.default")
113
+ Database(name='default', catalog='spark_catalog', description='default database', ...
114
+ """
115
+ schema = to_schema(dbName, dialect=self.session.input_dialect)
116
+ database_name = schema.db
117
+ databases = self.listDatabases(pattern=database_name)
118
+ if len(databases) == 0:
119
+ raise ValueError(f"Database '{dbName}' not found")
120
+ if len(databases) > 1:
121
+ if schema.catalog is not None:
122
+ filtered_databases = [db for db in databases if db.catalog == schema.catalog]
123
+ if filtered_databases:
124
+ return filtered_databases[0]
125
+ return databases[0]
126
+
127
+ @normalize(["dbName"])
128
+ def databaseExists(self, dbName: str) -> bool:
129
+ """Check if the database with the specified name exists.
130
+
131
+ .. versionadded:: 3.3.0
132
+
133
+ Parameters
134
+ ----------
135
+ dbName : str
136
+ name of the database to check existence
137
+
138
+ .. versionchanged:: 3.4.0
139
+ Allow ``dbName`` to be qualified with catalog name.
140
+
141
+ Returns
142
+ -------
143
+ bool
144
+ Indicating whether the database exists
145
+
146
+ Examples
147
+ --------
148
+ Check if 'test_new_database' database exists
149
+
150
+ >>> spark.catalog.databaseExists("test_new_database")
151
+ False
152
+ >>> _ = spark.sql("CREATE DATABASE test_new_database")
153
+ >>> spark.catalog.databaseExists("test_new_database")
154
+ True
155
+
156
+ Using the fully qualified name with the catalog name.
157
+
158
+ >>> spark.catalog.databaseExists("spark_catalog.test_new_database")
159
+ True
160
+ >>> _ = spark.sql("DROP DATABASE test_new_database")
161
+ """
162
+ try:
163
+ self.getDatabase(dbName)
164
+ return True
165
+ except ValueError:
166
+ return False
167
+
168
+ @normalize(["tableName"])
169
+ def getTable(self, tableName: str) -> Table:
170
+ """Get the table or view with the specified name. This table can be a temporary view or a
171
+ table/view. This throws an :class:`AnalysisException` when no Table can be found.
172
+
173
+ .. versionadded:: 3.4.0
174
+
175
+ Parameters
176
+ ----------
177
+ tableName : str
178
+ name of the table to get.
179
+
180
+ .. versionchanged:: 3.4.0
181
+ Allow `tableName` to be qualified with catalog name.
182
+
183
+ Returns
184
+ -------
185
+ :class:`Table`
186
+ The table found by the name.
187
+
188
+ Examples
189
+ --------
190
+ >>> _ = spark.sql("DROP TABLE IF EXISTS tbl1")
191
+ >>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING parquet")
192
+ >>> spark.catalog.getTable("tbl1")
193
+ Table(name='tbl1', catalog='spark_catalog', namespace=['default'], ...
194
+
195
+ Using the fully qualified name with the catalog name.
196
+
197
+ >>> spark.catalog.getTable("default.tbl1")
198
+ Table(name='tbl1', catalog='spark_catalog', namespace=['default'], ...
199
+ >>> spark.catalog.getTable("spark_catalog.default.tbl1")
200
+ Table(name='tbl1', catalog='spark_catalog', namespace=['default'], ...
201
+ >>> _ = spark.sql("DROP TABLE tbl1")
202
+
203
+ Throw an analysis exception when the table does not exist.
204
+
205
+ >>> spark.catalog.getTable("tbl1")
206
+ Traceback (most recent call last):
207
+ ...
208
+ AnalysisException: ...
209
+ """
210
+ table = exp.to_table(tableName, dialect=self.session.input_dialect)
211
+ schema = table.copy()
212
+ schema.set("this", None)
213
+ tables = self.listTables(
214
+ schema.sql(dialect=self.session.input_dialect) if schema.db else None
215
+ )
216
+ matching_tables = [t for t in tables if t.name == table.name]
217
+ if not matching_tables:
218
+ raise ValueError(f"Table '{tableName}' not found")
219
+ return matching_tables[0]
220
+
221
+ def functionExists(self, functionName: str, dbName: t.Optional[str] = None) -> bool:
222
+ """Check if the function with the specified name exists.
223
+ This can either be a temporary function or a function.
224
+
225
+ .. versionadded:: 3.3.0
226
+
227
+ Parameters
228
+ ----------
229
+ functionName : str
230
+ name of the function to check existence
231
+
232
+ .. versionchanged:: 3.4.0
233
+ Allow ``functionName`` to be qualified with catalog name
234
+
235
+ dbName : str, optional
236
+ name of the database to check function existence in.
237
+
238
+ Returns
239
+ -------
240
+ bool
241
+ Indicating whether the function exists
242
+
243
+ Notes
244
+ -----
245
+ If no database is specified, the current database and catalog
246
+ are used. This API includes all temporary functions.
247
+
248
+ Examples
249
+ --------
250
+ >>> spark.catalog.functionExists("count")
251
+ True
252
+
253
+ Using the fully qualified name for function name.
254
+
255
+ >>> spark.catalog.functionExists("default.unexisting_function")
256
+ False
257
+ >>> spark.catalog.functionExists("spark_catalog.default.unexisting_function")
258
+ False
259
+ """
260
+ functions = self.listFunctions(dbName)
261
+ return any(f.name == functionName for f in functions)
262
+
263
+ def getFunction(self, functionName: str) -> Function:
264
+ """Get the function with the specified name. This function can be a temporary function or a
265
+ function. This throws an :class:`AnalysisException` when the function cannot be found.
266
+
267
+ .. versionadded:: 3.4.0
268
+
269
+ Parameters
270
+ ----------
271
+ functionName : str
272
+ name of the function to check existence.
273
+
274
+ Returns
275
+ -------
276
+ :class:`Function`
277
+ The function found by the name.
278
+
279
+ Examples
280
+ --------
281
+ >>> _ = spark.sql(
282
+ ... "CREATE FUNCTION my_func1 AS 'test.org.apache.spark.sql.MyDoubleAvg'")
283
+ >>> spark.catalog.getFunction("my_func1")
284
+ Function(name='my_func1', catalog='spark_catalog', namespace=['default'], ...
285
+
286
+ Using the fully qualified name for function name.
287
+
288
+ >>> spark.catalog.getFunction("default.my_func1")
289
+ Function(name='my_func1', catalog='spark_catalog', namespace=['default'], ...
290
+ >>> spark.catalog.getFunction("spark_catalog.default.my_func1")
291
+ Function(name='my_func1', catalog='spark_catalog', namespace=['default'], ...
292
+
293
+ Throw an analysis exception when the function does not exist.
294
+
295
+ >>> spark.catalog.getFunction("my_func2")
296
+ Traceback (most recent call last):
297
+ ...
298
+ AnalysisException: ...
299
+ """
300
+ table = exp.to_table(functionName, dialect=self.session.input_dialect)
301
+ if table.catalog or table.db:
302
+ schema = table.copy()
303
+ schema.set("this", None)
304
+ db_name = schema.sql(dialect=self.session.input_dialect)
305
+ function_name = table.name
306
+ else:
307
+ db_name = None
308
+ function_name = functionName
309
+ functions = self.listFunctions(dbName=db_name, pattern=function_name)
310
+ matching_functions = [f for f in functions if f.name == function_name]
311
+ if not matching_functions:
312
+ raise ValueError(f"Function '{functionName}' not found")
313
+ return matching_functions[0]
314
+
315
+ @normalize(["tableName", "dbName"])
316
+ def tableExists(self, tableName: str, dbName: t.Optional[str] = None) -> bool:
317
+ """Check if the table or view with the specified name exists.
318
+ This can either be a temporary view or a table/view.
319
+
320
+ .. versionadded:: 3.3.0
321
+
322
+ Parameters
323
+ ----------
324
+ tableName : str
325
+ name of the table to check existence.
326
+ If no database is specified, first try to treat ``tableName`` as a
327
+ multi-layer-namespace identifier, then try ``tableName`` as a normal table
328
+ name in the current database if necessary.
329
+
330
+ .. versionchanged:: 3.4.0
331
+ Allow ``tableName`` to be qualified with catalog name when ``dbName`` is None.
332
+
333
+ dbName : str, optional
334
+ name of the database to check table existence in.
335
+
336
+ Returns
337
+ -------
338
+ bool
339
+ Indicating whether the table/view exists
340
+
341
+ Examples
342
+ --------
343
+ This function can check if a table is defined or not:
344
+
345
+ >>> spark.catalog.tableExists("unexisting_table")
346
+ False
347
+ >>> _ = spark.sql("DROP TABLE IF EXISTS tbl1")
348
+ >>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING parquet")
349
+ >>> spark.catalog.tableExists("tbl1")
350
+ True
351
+
352
+ Using the fully qualified names for tables.
353
+
354
+ >>> spark.catalog.tableExists("default.tbl1")
355
+ True
356
+ >>> spark.catalog.tableExists("spark_catalog.default.tbl1")
357
+ True
358
+ >>> spark.catalog.tableExists("tbl1", "default")
359
+ True
360
+ >>> _ = spark.sql("DROP TABLE tbl1")
361
+
362
+ Check if views exist:
363
+
364
+ >>> spark.catalog.tableExists("view1")
365
+ False
366
+ >>> _ = spark.sql("CREATE VIEW view1 AS SELECT 1")
367
+ >>> spark.catalog.tableExists("view1")
368
+ True
369
+
370
+ Using the fully qualified names for views.
371
+
372
+ >>> spark.catalog.tableExists("default.view1")
373
+ True
374
+ >>> spark.catalog.tableExists("spark_catalog.default.view1")
375
+ True
376
+ >>> spark.catalog.tableExists("view1", "default")
377
+ True
378
+ >>> _ = spark.sql("DROP VIEW view1")
379
+
380
+ Check if temporary views exist:
381
+
382
+ >>> _ = spark.sql("CREATE TEMPORARY VIEW view1 AS SELECT 1")
383
+ >>> spark.catalog.tableExists("view1")
384
+ True
385
+ >>> df = spark.sql("DROP VIEW view1")
386
+ >>> spark.catalog.tableExists("view1")
387
+ False
388
+ """
389
+ table = exp.to_table(tableName, dialect=self.session.input_dialect)
390
+ schema_arg = to_schema(dbName, dialect=self.session.input_dialect) if dbName else None
391
+ if not table.db:
392
+ if schema_arg and schema_arg.db:
393
+ table.set("db", schema_arg.args["db"])
394
+ else:
395
+ table.set("db", exp.parse_identifier(self.currentDatabase(), dialect="duckdb"))
396
+ if not table.catalog:
397
+ if schema_arg and schema_arg.catalog:
398
+ table.set("catalog", schema_arg.args["catalog"])
399
+ else:
400
+ table.set("catalog", exp.parse_identifier(self.currentCatalog(), dialect="duckdb"))
401
+ table_name = table.name
402
+ schema = table.copy()
403
+ schema.set("this", None)
404
+ tables = self.listTables(schema.sql(dialect=self.session.input_dialect))
405
+ return any(x.name == table_name for x in tables)
406
+
407
+ def currentCatalog(self) -> str:
408
+ """Returns the current default catalog in this session.
409
+
410
+ .. versionadded:: 3.4.0
411
+
412
+ Examples
413
+ --------
414
+ >>> spark.catalog.currentCatalog()
415
+ 'spark_catalog'
416
+ """
417
+ raise NotImplementedError
418
+
419
+ def setCurrentCatalog(self, catalogName: str) -> None:
420
+ """Sets the current default catalog in this session.
421
+
422
+ .. versionadded:: 3.4.0
423
+
424
+ Parameters
425
+ ----------
426
+ catalogName : str
427
+ name of the catalog to set
428
+
429
+ Examples
430
+ --------
431
+ >>> spark.catalog.setCurrentCatalog("spark_catalog")
432
+ """
433
+ raise NotImplementedError
434
+
435
+ def currentDatabase(self) -> str:
436
+ """Returns the current default schema in this session.
437
+
438
+ .. versionadded:: 3.4.0
439
+
440
+ Examples
441
+ --------
442
+ >>> spark.catalog.currentDatabase()
443
+ 'default'
444
+ """
445
+ raise NotImplementedError
446
+
447
+ def listDatabases(self, pattern: t.Optional[str] = None) -> t.List[Database]:
448
+ """
449
+ Returns a list of databases available across all sessions.
450
+
451
+ .. versionadded:: 2.0.0
452
+
453
+ Parameters
454
+ ----------
455
+ pattern : str
456
+ The pattern that the database name needs to match.
457
+
458
+ .. versionchanged:: 3.5.0
459
+ Adds ``pattern`` argument.
460
+
461
+ Returns
462
+ -------
463
+ list
464
+ A list of :class:`Database`.
465
+
466
+ Examples
467
+ --------
468
+ >>> spark.catalog.listDatabases()
469
+ [Database(name='default', catalog='spark_catalog', description='default database', ...
470
+
471
+ >>> spark.catalog.t.listDatabases("def*")
472
+ [Database(name='default', catalog='spark_catalog', description='default database', ...
473
+
474
+ >>> spark.catalog.t.listDatabases("def2*")
475
+ []
476
+ """
477
+ raise NotImplementedError
478
+
479
+ def listCatalogs(self, pattern: t.Optional[str] = None) -> t.List[CatalogMetadata]:
480
+ """
481
+ Returns a list of catalogs available across all sessions.
482
+
483
+ .. versionadded:: 2.0.0
484
+
485
+ Parameters
486
+ ----------
487
+ pattern : str
488
+ The pattern that the catalog name needs to match.
489
+
490
+ .. versionchanged:: 3.5.0
491
+ Adds ``pattern`` argument.
492
+
493
+ Returns
494
+ -------
495
+ list
496
+ A list of :class:`CatalogMetadata`.
497
+
498
+ Examples
499
+ --------
500
+ >>> spark.catalog.listCatalogs()
501
+ [CatalogMetadata(name='spark_catalog', description=None)]
502
+
503
+ >>> spark.catalog.t.listDatabases("def*")
504
+ [CatalogMetadata(name='spark_catalog', description=None)]
505
+
506
+ >>> spark.catalog.t.listDatabases("def2*")
507
+ []
508
+ """
509
+ raise NotImplementedError
510
+
511
+ def setCurrentDatabase(self, dbName: str) -> None:
512
+ """
513
+ Sets the current default database in this session.
514
+
515
+ .. versionadded:: 2.0.0
516
+
517
+ Examples
518
+ --------
519
+ >>> spark.catalog.setCurrentDatabase("default")
520
+ """
521
+ raise NotImplementedError
522
+
523
+ def listTables(
524
+ self, dbName: t.Optional[str] = None, pattern: t.Optional[str] = None
525
+ ) -> t.List[Table]:
526
+ """Returns a t.List of tables/views in the specified database.
527
+
528
+ .. versionadded:: 2.0.0
529
+
530
+ Parameters
531
+ ----------
532
+ dbName : str
533
+ name of the database to list the tables.
534
+
535
+ .. versionchanged:: 3.4.0
536
+ Allow ``dbName`` to be qualified with catalog name.
537
+
538
+ pattern : str
539
+ The pattern that the database name needs to match.
540
+
541
+ .. versionchanged:: 3.5.0
542
+ Adds ``pattern`` argument.
543
+
544
+ Returns
545
+ -------
546
+ list
547
+ A list of :class:`Table`.
548
+
549
+ Notes
550
+ -----
551
+ If no database is specified, the current database and catalog
552
+ are used. This API includes all temporary views.
553
+
554
+ Examples
555
+ --------
556
+ >>> spark.range(1).createTempView("test_view")
557
+ >>> spark.catalog.listTables()
558
+ [Table(name='test_view', catalog=None, namespace=[], description=None, ...
559
+
560
+ >>> spark.catalog.t.listTables(pattern="test*")
561
+ [Table(name='test_view', catalog=None, namespace=[], description=None, ...
562
+
563
+ >>> spark.catalog.t.listTables(pattern="table*")
564
+ []
565
+
566
+ >>> _ = spark.catalog.dropTempView("test_view")
567
+ >>> spark.catalog.listTables()
568
+ []
569
+ """
570
+ raise NotImplementedError
571
+
572
+ def listColumns(self, tableName: str, dbName: t.Optional[str] = None) -> t.List[Column]:
573
+ """Returns a t.List of columns for the given table/view in the specified database.
574
+
575
+ .. versionadded:: 2.0.0
576
+
577
+ Parameters
578
+ ----------
579
+ tableName : str
580
+ name of the table to list columns.
581
+
582
+ .. versionchanged:: 3.4.0
583
+ Allow ``tableName`` to be qualified with catalog name when ``dbName`` is None.
584
+
585
+ dbName : str, optional
586
+ name of the database to find the table to list columns.
587
+
588
+ Returns
589
+ -------
590
+ list
591
+ A list of :class:`Column`.
592
+
593
+ Notes
594
+ -----
595
+ The order of arguments here is different from that of its JVM counterpart
596
+ because Python does not support method overloading.
597
+
598
+ If no database is specified, the current database and catalog
599
+ are used. This API includes all temporary views.
600
+
601
+ Examples
602
+ --------
603
+ >>> _ = spark.sql("DROP TABLE IF EXISTS tbl1")
604
+ >>> _ = spark.sql("CREATE TABLE tblA (name STRING, age INT) USING parquet")
605
+ >>> spark.catalog.t.listColumns("tblA")
606
+ [Column(name='name', description=None, dataType='string', nullable=True, ...
607
+ >>> _ = spark.sql("DROP TABLE tblA")
608
+ """
609
+ raise NotImplementedError
610
+
611
+ def listFunctions(
612
+ self, dbName: t.Optional[str] = None, pattern: t.Optional[str] = None
613
+ ) -> t.List[Function]:
614
+ """
615
+ Returns a list of functions registered in the specified database.
616
+
617
+ .. versionadded:: 3.4.0
618
+
619
+ Parameters
620
+ ----------
621
+ dbName : str
622
+ name of the database to list the functions.
623
+ ``dbName`` can be qualified with catalog name.
624
+ pattern : str
625
+ The pattern that the function name needs to match.
626
+
627
+ .. versionchanged:: 3.5.0
628
+ Adds ``pattern`` argument.
629
+
630
+ Returns
631
+ -------
632
+ list
633
+ A list of :class:`Function`.
634
+
635
+ Notes
636
+ -----
637
+ If no database is specified, the current database and catalog
638
+ are used. This API includes all temporary functions.
639
+
640
+ Examples
641
+ --------
642
+ >>> spark.catalog.listFunctions()
643
+ [Function(name=...
644
+
645
+ >>> spark.catalog.t.listFunctions(pattern="to_*")
646
+ [Function(name=...
647
+
648
+ >>> spark.catalog.t.listFunctions(pattern="*not_existing_func*")
649
+ []
650
+ """
651
+ raise NotImplementedError
652
+
653
+ def createExternalTable(
654
+ self,
655
+ tableName: str,
656
+ path: t.Optional[str] = None,
657
+ source: t.Optional[str] = None,
658
+ schema: t.Optional[StructType] = None,
659
+ **options: str,
660
+ ) -> DF:
661
+ """Creates a table based on the dataset in a data source.
662
+
663
+ It returns the DataFrame associated with the external table.
664
+
665
+ The data source is specified by the ``source`` and a set of ``options``.
666
+ If ``source`` is not specified, the default data source configured by
667
+ ``spark.sql.sources.default`` will be used.
668
+
669
+ Optionally, a schema can be provided as the schema of the returned :class:`DataFrame` and
670
+ created external table.
671
+
672
+ .. versionadded:: 2.0.0
673
+
674
+ Returns
675
+ -------
676
+ :class:`DataFrame`
677
+ """
678
+ raise NotImplementedError()
679
+
680
+ def createTable(
681
+ self,
682
+ tableName: str,
683
+ path: t.Optional[str] = None,
684
+ source: t.Optional[str] = None,
685
+ schema: t.Optional[StructType] = None,
686
+ description: t.Optional[str] = None,
687
+ **options: str,
688
+ ) -> DF:
689
+ """Creates a table based on the dataset in a data source.
690
+
691
+ .. versionadded:: 2.2.0
692
+
693
+ Parameters
694
+ ----------
695
+ tableName : str
696
+ name of the table to create.
697
+
698
+ .. versionchanged:: 3.4.0
699
+ Allow ``tableName`` to be qualified with catalog name.
700
+
701
+ path : str, optional
702
+ the path in which the data for this table exists.
703
+ When ``path`` is specified, an external table is
704
+ created from the data at the given path. Otherwise a managed table is created.
705
+ source : str, optional
706
+ the source of this table such as 'parquet', 'orc', etc.
707
+ If ``source`` is not specified, the default data source configured by
708
+ ``spark.sql.sources.default`` will be used.
709
+ schema : :class:`StructType`, optional
710
+ the schema for this table.
711
+ description : str, optional
712
+ the description of this table.
713
+
714
+ .. versionchanged:: 3.1.0
715
+ Added the ``description`` parameter.
716
+
717
+ **options : dict, optional
718
+ extra options to specify in the table.
719
+
720
+ Returns
721
+ -------
722
+ :class:`DataFrame`
723
+ The DataFrame associated with the table.
724
+
725
+ Examples
726
+ --------
727
+ Creating a managed table.
728
+
729
+ >>> _ = spark.catalog.createTable("tbl1", schema=spark.range(1).schema, source='parquet')
730
+ >>> _ = spark.sql("DROP TABLE tbl1")
731
+
732
+ Creating an external table
733
+
734
+ >>> import tempfile
735
+ >>> with tempfile.TemporaryDirectory() as d:
736
+ ... _ = spark.catalog.createTable(
737
+ ... "tbl2", schema=spark.range(1).schema, path=d, source='parquet')
738
+ >>> _ = spark.sql("DROP TABLE tbl2")
739
+ """
740
+ raise NotImplementedError()
741
+
742
+ def dropTempView(self, viewName: str) -> bool:
743
+ """Drops the local temporary view with the given view name in the catalog.
744
+ If the view has been cached before, then it will also be uncached.
745
+ Returns true if this view is dropped successfully, false otherwise.
746
+
747
+ .. versionadded:: 2.0.0
748
+
749
+ Parameters
750
+ ----------
751
+ viewName : str
752
+ name of the temporary view to drop.
753
+
754
+ Returns
755
+ -------
756
+ bool
757
+ If the temporary view was successfully dropped or not.
758
+
759
+ .. versionadded:: 2.1.0
760
+ The return type of this method was ``None`` in Spark 2.0, but changed to ``bool``
761
+ in Spark 2.1.
762
+
763
+ Examples
764
+ --------
765
+ >>> spark.createDataFrame([(1, 1)]).createTempView("my_table")
766
+
767
+ Dropping the temporary view.
768
+
769
+ >>> spark.catalog.dropTempView("my_table")
770
+ True
771
+
772
+ Throw an exception if the temporary view does not exist.
773
+
774
+ >>> spark.table("my_table")
775
+ Traceback (most recent call last):
776
+ ...
777
+ AnalysisException: ...
778
+ """
779
+ raise NotImplementedError()
780
+
781
+ def dropGlobalTempView(self, viewName: str) -> bool:
782
+ """Drops the global temporary view with the given view name in the catalog.
783
+
784
+ .. versionadded:: 2.1.0
785
+
786
+ Parameters
787
+ ----------
788
+ viewName : str
789
+ name of the global view to drop.
790
+
791
+ Returns
792
+ -------
793
+ bool
794
+ If the global view was successfully dropped or not.
795
+
796
+ Notes
797
+ -----
798
+ If the view has been cached before, then it will also be uncached.
799
+
800
+ Examples
801
+ --------
802
+ >>> spark.createDataFrame([(1, 1)]).createGlobalTempView("my_table")
803
+
804
+ Dropping the global view.
805
+
806
+ >>> spark.catalog.dropGlobalTempView("my_table")
807
+ True
808
+
809
+ Throw an exception if the global view does not exist.
810
+
811
+ >>> spark.table("global_temp.my_table")
812
+ Traceback (most recent call last):
813
+ ...
814
+ AnalysisException: ...
815
+ """
816
+ raise NotImplementedError()
817
+
818
+ def registerFunction(
819
+ self, name: str, f: t.Callable[..., t.Any], returnType: t.Optional[DataType] = None
820
+ ) -> UserDefinedFunctionLike:
821
+ """An alias for :func:`spark.udf.register`.
822
+ See :meth:`pyspark.sql.UDFRegistration.register`.
823
+
824
+ .. versionadded:: 2.0.0
825
+
826
+ .. deprecated:: 2.3.0
827
+ Use :func:`spark.udf.register` instead.
828
+
829
+ .. versionchanged:: 3.4.0
830
+ Supports Spark Connect.
831
+ """
832
+ raise NotImplementedError()
833
+
834
+ def isCached(self, tableName: str) -> bool:
835
+ """
836
+ Returns true if the table is currently cached in-memory.
837
+
838
+ .. versionadded:: 2.0.0
839
+
840
+ Parameters
841
+ ----------
842
+ tableName : str
843
+ name of the table to get.
844
+
845
+ .. versionchanged:: 3.4.0
846
+ Allow ``tableName`` to be qualified with catalog name.
847
+
848
+ Returns
849
+ -------
850
+ bool
851
+
852
+ Examples
853
+ --------
854
+ >>> _ = spark.sql("DROP TABLE IF EXISTS tbl1")
855
+ >>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING parquet")
856
+ >>> spark.catalog.cacheTable("tbl1")
857
+ >>> spark.catalog.isCached("tbl1")
858
+ True
859
+
860
+ Throw an analysis exception when the table does not exist.
861
+
862
+ >>> spark.catalog.isCached("not_existing_table")
863
+ Traceback (most recent call last):
864
+ ...
865
+ AnalysisException: ...
866
+
867
+ Using the fully qualified name for the table.
868
+
869
+ >>> spark.catalog.isCached("spark_catalog.default.tbl1")
870
+ True
871
+ >>> spark.catalog.uncacheTable("tbl1")
872
+ >>> _ = spark.sql("DROP TABLE tbl1")
873
+ """
874
+ raise NotImplementedError()
875
+
876
+ def cacheTable(self, tableName: str, storageLevel: t.Optional[StorageLevel] = None) -> None:
877
+ """Caches the specified table in-memory or with given storage level.
878
+ Default MEMORY_AND_DISK.
879
+
880
+ .. versionadded:: 2.0.0
881
+
882
+ Parameters
883
+ ----------
884
+ tableName : str
885
+ name of the table to get.
886
+
887
+ .. versionchanged:: 3.4.0
888
+ Allow ``tableName`` to be qualified with catalog name.
889
+
890
+ storageLevel : :class:`StorageLevel`
891
+ storage level to set for persistence.
892
+
893
+ .. versionchanged:: 3.5.0
894
+ Allow to specify storage level.
895
+
896
+ Examples
897
+ --------
898
+ >>> _ = spark.sql("DROP TABLE IF EXISTS tbl1")
899
+ >>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING parquet")
900
+ >>> spark.catalog.cacheTable("tbl1")
901
+
902
+ or
903
+
904
+ >>> spark.catalog.cacheTable("tbl1", StorageLevel.OFF_HEAP)
905
+
906
+ Throw an analysis exception when the table does not exist.
907
+
908
+ >>> spark.catalog.cacheTable("not_existing_table")
909
+ Traceback (most recent call last):
910
+ ...
911
+ AnalysisException: ...
912
+
913
+ Using the fully qualified name for the table.
914
+
915
+ >>> spark.catalog.cacheTable("spark_catalog.default.tbl1")
916
+ >>> spark.catalog.uncacheTable("tbl1")
917
+ >>> _ = spark.sql("DROP TABLE tbl1")
918
+ """
919
+ raise NotImplementedError()
920
+
921
+ def uncacheTable(self, tableName: str) -> None:
922
+ """Removes the specified table from the in-memory cache.
923
+
924
+ .. versionadded:: 2.0.0
925
+
926
+ Parameters
927
+ ----------
928
+ tableName : str
929
+ name of the table to get.
930
+
931
+ .. versionchanged:: 3.4.0
932
+ Allow ``tableName`` to be qualified with catalog name.
933
+
934
+ Examples
935
+ --------
936
+ >>> _ = spark.sql("DROP TABLE IF EXISTS tbl1")
937
+ >>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING parquet")
938
+ >>> spark.catalog.cacheTable("tbl1")
939
+ >>> spark.catalog.uncacheTable("tbl1")
940
+ >>> spark.catalog.isCached("tbl1")
941
+ False
942
+
943
+ Throw an analysis exception when the table does not exist.
944
+
945
+ >>> spark.catalog.uncacheTable("not_existing_table")
946
+ Traceback (most recent call last):
947
+ ...
948
+ AnalysisException: ...
949
+
950
+ Using the fully qualified name for the table.
951
+
952
+ >>> spark.catalog.uncacheTable("spark_catalog.default.tbl1")
953
+ >>> spark.catalog.isCached("tbl1")
954
+ False
955
+ >>> _ = spark.sql("DROP TABLE tbl1")
956
+ """
957
+ raise NotImplementedError()
958
+
959
+ def clearCache(self) -> None:
960
+ """Removes all cached tables from the in-memory cache.
961
+
962
+ .. versionadded:: 2.0.0
963
+
964
+ Examples
965
+ --------
966
+ >>> _ = spark.sql("DROP TABLE IF EXISTS tbl1")
967
+ >>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING parquet")
968
+ >>> spark.catalog.clearCache()
969
+ >>> spark.catalog.isCached("tbl1")
970
+ False
971
+ >>> _ = spark.sql("DROP TABLE tbl1")
972
+ """
973
+ raise NotImplementedError()
974
+
975
+ def refreshTable(self, tableName: str) -> None:
976
+ """Invalidates and refreshes all the cached data and metadata of the given table.
977
+
978
+ .. versionadded:: 2.0.0
979
+
980
+ Parameters
981
+ ----------
982
+ tableName : str
983
+ name of the table to get.
984
+
985
+ .. versionchanged:: 3.4.0
986
+ Allow ``tableName`` to be qualified with catalog name.
987
+
988
+ Examples
989
+ --------
990
+ The example below caches a table, and then removes the data.
991
+
992
+ >>> import tempfile
993
+ >>> with tempfile.TemporaryDirectory() as d:
994
+ ... _ = spark.sql("DROP TABLE IF EXISTS tbl1")
995
+ ... _ = spark.sql(
996
+ ... "CREATE TABLE tbl1 (col STRING) USING TEXT LOCATION '{}'".format(d))
997
+ ... _ = spark.sql("INSERT INTO tbl1 SELECT 'abc'")
998
+ ... spark.catalog.cacheTable("tbl1")
999
+ ... spark.table("tbl1").show()
1000
+ +---+
1001
+ |col|
1002
+ +---+
1003
+ |abc|
1004
+ +---+
1005
+
1006
+ Because the table is cached, it computes from the cached data as below.
1007
+
1008
+ >>> spark.table("tbl1").count()
1009
+ 1
1010
+
1011
+ After refreshing the table, it shows 0 because the data does not exist anymore.
1012
+
1013
+ >>> spark.catalog.refreshTable("tbl1")
1014
+ >>> spark.table("tbl1").count()
1015
+ 0
1016
+
1017
+ Using the fully qualified name for the table.
1018
+
1019
+ >>> spark.catalog.refreshTable("spark_catalog.default.tbl1")
1020
+ >>> _ = spark.sql("DROP TABLE tbl1")
1021
+ """
1022
+ raise NotImplementedError()
1023
+
1024
+ def recoverPartitions(self, tableName: str) -> None:
1025
+ """Recovers all the partitions of the given table and updates the catalog.
1026
+
1027
+ .. versionadded:: 2.1.1
1028
+
1029
+ Parameters
1030
+ ----------
1031
+ tableName : str
1032
+ name of the table to get.
1033
+
1034
+ Notes
1035
+ -----
1036
+ Only works with a partitioned table, and not a view.
1037
+
1038
+ Examples
1039
+ --------
1040
+ The example below creates a partitioned table against the existing directory of
1041
+ the partitioned table. After that, it recovers the partitions.
1042
+
1043
+ >>> import tempfile
1044
+ >>> with tempfile.TemporaryDirectory() as d:
1045
+ ... _ = spark.sql("DROP TABLE IF EXISTS tbl1")
1046
+ ... spark.range(1).selectExpr(
1047
+ ... "id as key", "id as value").write.partitionBy("key").mode("overwrite").save(d)
1048
+ ... _ = spark.sql(
1049
+ ... "CREATE TABLE tbl1 (key LONG, value LONG)"
1050
+ ... "USING parquet OPTIONS (path '{}') PARTITIONED BY (key)".format(d))
1051
+ ... spark.table("tbl1").show()
1052
+ ... spark.catalog.recoverPartitions("tbl1")
1053
+ ... spark.table("tbl1").show()
1054
+ +-----+---+
1055
+ |value|key|
1056
+ +-----+---+
1057
+ +-----+---+
1058
+ +-----+---+
1059
+ |value|key|
1060
+ +-----+---+
1061
+ | 0| 0|
1062
+ +-----+---+
1063
+ >>> _ = spark.sql("DROP TABLE tbl1")
1064
+ """
1065
+ raise NotImplementedError()
1066
+
1067
+ def refreshByPath(self, path: str) -> None:
1068
+ """Invalidates and refreshes all the cached data (and the associated metadata) for t.Any
1069
+ DataFrame that contains the given data source path.
1070
+
1071
+ .. versionadded:: 2.2.0
1072
+
1073
+ Parameters
1074
+ ----------
1075
+ path : str
1076
+ the path to refresh the cache.
1077
+
1078
+ Examples
1079
+ --------
1080
+ The example below caches a table, and then removes the data.
1081
+
1082
+ >>> import tempfile
1083
+ >>> with tempfile.TemporaryDirectory() as d:
1084
+ ... _ = spark.sql("DROP TABLE IF EXISTS tbl1")
1085
+ ... _ = spark.sql(
1086
+ ... "CREATE TABLE tbl1 (col STRING) USING TEXT LOCATION '{}'".format(d))
1087
+ ... _ = spark.sql("INSERT INTO tbl1 SELECT 'abc'")
1088
+ ... spark.catalog.cacheTable("tbl1")
1089
+ ... spark.table("tbl1").show()
1090
+ +---+
1091
+ |col|
1092
+ +---+
1093
+ |abc|
1094
+ +---+
1095
+
1096
+ Because the table is cached, it computes from the cached data as below.
1097
+
1098
+ >>> spark.table("tbl1").count()
1099
+ 1
1100
+
1101
+ After refreshing the table by path, it shows 0 because the data does not exist anymore.
1102
+
1103
+ >>> spark.catalog.refreshByPath(d)
1104
+ >>> spark.table("tbl1").count()
1105
+ 0
1106
+
1107
+ >>> _ = spark.sql("DROP TABLE tbl1")
1108
+ """
1109
+ raise NotImplementedError()
1110
+
1111
+ def _reset(self) -> None:
1112
+ """(Internal use only) Drop all existing databases (except "default"), tables,
1113
+ partitions and functions, and set the current database to "default".
1114
+
1115
+ This is mainly used for tests.
1116
+ """
1117
+ raise NotImplementedError()
1118
+
1119
+
1120
+ class CatalogMetadata(t.NamedTuple):
1121
+ name: str
1122
+ description: t.Optional[str]
1123
+
1124
+
1125
+ class Database(t.NamedTuple):
1126
+ name: str
1127
+ catalog: t.Optional[str]
1128
+ description: t.Optional[str]
1129
+ locationUri: str
1130
+
1131
+
1132
+ class Table(t.NamedTuple):
1133
+ name: str
1134
+ catalog: t.Optional[str]
1135
+ namespace: t.Optional[t.List[str]]
1136
+ description: t.Optional[str]
1137
+ tableType: str
1138
+ isTemporary: bool
1139
+
1140
+ @property
1141
+ def database(self) -> t.Optional[str]:
1142
+ if self.namespace is not None and len(self.namespace) == 1:
1143
+ return self.namespace[0]
1144
+ else:
1145
+ return None
1146
+
1147
+
1148
+ class Column(t.NamedTuple):
1149
+ name: str
1150
+ description: t.Optional[str]
1151
+ dataType: str
1152
+ nullable: bool
1153
+ isPartition: bool
1154
+ isBucket: bool
1155
+
1156
+
1157
+ class Function(t.NamedTuple):
1158
+ name: str
1159
+ catalog: t.Optional[str]
1160
+ namespace: t.Optional[t.List[str]]
1161
+ description: t.Optional[str]
1162
+ className: str
1163
+ isTemporary: bool