sqlframe-1.1.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. sqlframe/__init__.py +0 -0
  2. sqlframe/_version.py +16 -0
  3. sqlframe/base/__init__.py +0 -0
  4. sqlframe/base/_typing.py +39 -0
  5. sqlframe/base/catalog.py +1163 -0
  6. sqlframe/base/column.py +388 -0
  7. sqlframe/base/dataframe.py +1519 -0
  8. sqlframe/base/decorators.py +51 -0
  9. sqlframe/base/exceptions.py +14 -0
  10. sqlframe/base/function_alternatives.py +1055 -0
  11. sqlframe/base/functions.py +1678 -0
  12. sqlframe/base/group.py +102 -0
  13. sqlframe/base/mixins/__init__.py +0 -0
  14. sqlframe/base/mixins/catalog_mixins.py +419 -0
  15. sqlframe/base/mixins/readwriter_mixins.py +118 -0
  16. sqlframe/base/normalize.py +84 -0
  17. sqlframe/base/operations.py +87 -0
  18. sqlframe/base/readerwriter.py +679 -0
  19. sqlframe/base/session.py +585 -0
  20. sqlframe/base/transforms.py +13 -0
  21. sqlframe/base/types.py +418 -0
  22. sqlframe/base/util.py +242 -0
  23. sqlframe/base/window.py +139 -0
  24. sqlframe/bigquery/__init__.py +23 -0
  25. sqlframe/bigquery/catalog.py +255 -0
  26. sqlframe/bigquery/column.py +1 -0
  27. sqlframe/bigquery/dataframe.py +54 -0
  28. sqlframe/bigquery/functions.py +378 -0
  29. sqlframe/bigquery/group.py +14 -0
  30. sqlframe/bigquery/readwriter.py +29 -0
  31. sqlframe/bigquery/session.py +89 -0
  32. sqlframe/bigquery/types.py +1 -0
  33. sqlframe/bigquery/window.py +1 -0
  34. sqlframe/duckdb/__init__.py +20 -0
  35. sqlframe/duckdb/catalog.py +108 -0
  36. sqlframe/duckdb/column.py +1 -0
  37. sqlframe/duckdb/dataframe.py +55 -0
  38. sqlframe/duckdb/functions.py +47 -0
  39. sqlframe/duckdb/group.py +14 -0
  40. sqlframe/duckdb/readwriter.py +111 -0
  41. sqlframe/duckdb/session.py +65 -0
  42. sqlframe/duckdb/types.py +1 -0
  43. sqlframe/duckdb/window.py +1 -0
  44. sqlframe/postgres/__init__.py +23 -0
  45. sqlframe/postgres/catalog.py +106 -0
  46. sqlframe/postgres/column.py +1 -0
  47. sqlframe/postgres/dataframe.py +54 -0
  48. sqlframe/postgres/functions.py +61 -0
  49. sqlframe/postgres/group.py +14 -0
  50. sqlframe/postgres/readwriter.py +29 -0
  51. sqlframe/postgres/session.py +68 -0
  52. sqlframe/postgres/types.py +1 -0
  53. sqlframe/postgres/window.py +1 -0
  54. sqlframe/redshift/__init__.py +23 -0
  55. sqlframe/redshift/catalog.py +127 -0
  56. sqlframe/redshift/column.py +1 -0
  57. sqlframe/redshift/dataframe.py +54 -0
  58. sqlframe/redshift/functions.py +18 -0
  59. sqlframe/redshift/group.py +14 -0
  60. sqlframe/redshift/readwriter.py +29 -0
  61. sqlframe/redshift/session.py +53 -0
  62. sqlframe/redshift/types.py +1 -0
  63. sqlframe/redshift/window.py +1 -0
  64. sqlframe/snowflake/__init__.py +26 -0
  65. sqlframe/snowflake/catalog.py +134 -0
  66. sqlframe/snowflake/column.py +1 -0
  67. sqlframe/snowflake/dataframe.py +54 -0
  68. sqlframe/snowflake/functions.py +18 -0
  69. sqlframe/snowflake/group.py +14 -0
  70. sqlframe/snowflake/readwriter.py +29 -0
  71. sqlframe/snowflake/session.py +53 -0
  72. sqlframe/snowflake/types.py +1 -0
  73. sqlframe/snowflake/window.py +1 -0
  74. sqlframe/spark/__init__.py +23 -0
  75. sqlframe/spark/catalog.py +1028 -0
  76. sqlframe/spark/column.py +1 -0
  77. sqlframe/spark/dataframe.py +54 -0
  78. sqlframe/spark/functions.py +22 -0
  79. sqlframe/spark/group.py +14 -0
  80. sqlframe/spark/readwriter.py +29 -0
  81. sqlframe/spark/session.py +90 -0
  82. sqlframe/spark/types.py +1 -0
  83. sqlframe/spark/window.py +1 -0
  84. sqlframe/standalone/__init__.py +26 -0
  85. sqlframe/standalone/catalog.py +13 -0
  86. sqlframe/standalone/column.py +1 -0
  87. sqlframe/standalone/dataframe.py +36 -0
  88. sqlframe/standalone/functions.py +1 -0
  89. sqlframe/standalone/group.py +14 -0
  90. sqlframe/standalone/readwriter.py +19 -0
  91. sqlframe/standalone/session.py +40 -0
  92. sqlframe/standalone/types.py +1 -0
  93. sqlframe/standalone/window.py +1 -0
  94. sqlframe-1.1.3.dist-info/LICENSE +21 -0
  95. sqlframe-1.1.3.dist-info/METADATA +172 -0
  96. sqlframe-1.1.3.dist-info/RECORD +98 -0
  97. sqlframe-1.1.3.dist-info/WHEEL +5 -0
  98. sqlframe-1.1.3.dist-info/top_level.txt +1 -0
sqlframe/spark/catalog.py
@@ -0,0 +1,1028 @@
+# This code is based on code from Apache Spark under the license found in the LICENSE file located in the 'sqlframe' folder.
+
+from __future__ import annotations
+
+import fnmatch
+import typing as t
+
+from sqlframe.base.catalog import (
+    CatalogMetadata,
+    Column,
+    Database,
+    Function,
+    Table,
+    _BaseCatalog,
+)
+from sqlframe.base.types import DataType, StructType
+
+if t.TYPE_CHECKING:
+    from sqlframe.base._typing import StorageLevel, UserDefinedFunctionLike
+    from sqlframe.spark.dataframe import SparkDataFrame
+    from sqlframe.spark.session import SparkSession  # noqa
+
+
+class SparkCatalog(
+    _BaseCatalog["SparkSession", "SparkDataFrame"],
+):
+    @property
+    def _spark_catalog(self):
+        return self.session.spark_session.catalog
+
+    def getDatabase(self, dbName: str) -> Database:
+        """Get the database with the specified name.
+        This throws an :class:`AnalysisException` when the database cannot be found.
+
+        .. versionadded:: 3.4.0
+
+        Parameters
+        ----------
+        dbName : str
+            name of the database to get.
+
+        Returns
+        -------
+        :class:`Database`
+            The database found by the name.
+
+        Examples
+        --------
+        >>> spark.catalog.getDatabase("default")
+        Database(name='default', catalog='spark_catalog', description='default database', ...
+
+        Using the fully qualified name with the catalog name.
+
+        >>> spark.catalog.getDatabase("spark_catalog.default")
+        Database(name='default', catalog='spark_catalog', description='default database', ...
+        """
+        return Database(*self._spark_catalog.getDatabase(dbName))
+
+    def databaseExists(self, dbName: str) -> bool:
+        """Check if the database with the specified name exists.
+
+        .. versionadded:: 3.3.0
+
+        Parameters
+        ----------
+        dbName : str
+            name of the database to check existence
+
+            .. versionchanged:: 3.4.0
+                Allow ``dbName`` to be qualified with catalog name.
+
+        Returns
+        -------
+        bool
+            Indicating whether the database exists
+
+        Examples
+        --------
+        Check if 'test_new_database' database exists
+
+        >>> spark.catalog.databaseExists("test_new_database")
+        False
+        >>> _ = spark.sql("CREATE DATABASE test_new_database")
+        >>> spark.catalog.databaseExists("test_new_database")
+        True
+
+        Using the fully qualified name with the catalog name.
+
+        >>> spark.catalog.databaseExists("spark_catalog.test_new_database")
+        True
+        >>> _ = spark.sql("DROP DATABASE test_new_database")
+        """
+        return self._spark_catalog.databaseExists(dbName)
+
+    def getTable(self, tableName: str) -> Table:
+        """Get the table or view with the specified name. This table can be a temporary view or a
+        table/view. This throws an :class:`AnalysisException` when no Table can be found.
+
+        .. versionadded:: 3.4.0
+
+        Parameters
+        ----------
+        tableName : str
+            name of the table to get.
+
+            .. versionchanged:: 3.4.0
+                Allow `tableName` to be qualified with catalog name.
+
+        Returns
+        -------
+        :class:`Table`
+            The table found by the name.
+
+        Examples
+        --------
+        >>> _ = spark.sql("DROP TABLE IF EXISTS tbl1")
+        >>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING parquet")
+        >>> spark.catalog.getTable("tbl1")
+        Table(name='tbl1', catalog='spark_catalog', namespace=['default'], ...
+
+        Using the fully qualified name with the catalog name.
+
+        >>> spark.catalog.getTable("default.tbl1")
+        Table(name='tbl1', catalog='spark_catalog', namespace=['default'], ...
+        >>> spark.catalog.getTable("spark_catalog.default.tbl1")
+        Table(name='tbl1', catalog='spark_catalog', namespace=['default'], ...
+        >>> _ = spark.sql("DROP TABLE tbl1")
+
+        Throw an analysis exception when the table does not exist.
+
+        >>> spark.catalog.getTable("tbl1")
+        Traceback (most recent call last):
+            ...
+        AnalysisException: ...
+        """
+        return Table(*self._spark_catalog.getTable(tableName))
+
+    def functionExists(self, functionName: str, dbName: t.Optional[str] = None) -> bool:
+        """Check if the function with the specified name exists.
+        This can either be a temporary function or a function.
+
+        .. versionadded:: 3.3.0
+
+        Parameters
+        ----------
+        functionName : str
+            name of the function to check existence
+
+            .. versionchanged:: 3.4.0
+                Allow ``functionName`` to be qualified with catalog name
+
+        dbName : str, optional
+            name of the database to check function existence in.
+
+        Returns
+        -------
+        bool
+            Indicating whether the function exists
+
+        Notes
+        -----
+        If no database is specified, the current database and catalog
+        are used. This API includes all temporary functions.
+
+        Examples
+        --------
+        >>> spark.catalog.functionExists("count")
+        True
+
+        Using the fully qualified name for function name.
+
+        >>> spark.catalog.functionExists("default.unexisting_function")
+        False
+        >>> spark.catalog.functionExists("spark_catalog.default.unexisting_function")
+        False
+        """
+        return self._spark_catalog.functionExists(functionName, dbName)
+
+    def getFunction(self, functionName: str) -> Function:
+        """Get the function with the specified name. This function can be a temporary function or a
+        function. This throws an :class:`AnalysisException` when the function cannot be found.
+
+        .. versionadded:: 3.4.0
+
+        Parameters
+        ----------
+        functionName : str
+            name of the function to check existence.
+
+        Returns
+        -------
+        :class:`Function`
+            The function found by the name.
+
+        Examples
+        --------
+        >>> _ = spark.sql(
+        ...     "CREATE FUNCTION my_func1 AS 'test.org.apache.spark.sql.MyDoubleAvg'")
+        >>> spark.catalog.getFunction("my_func1")
+        Function(name='my_func1', catalog='spark_catalog', namespace=['default'], ...
+
+        Using the fully qualified name for function name.
+
+        >>> spark.catalog.getFunction("default.my_func1")
+        Function(name='my_func1', catalog='spark_catalog', namespace=['default'], ...
+        >>> spark.catalog.getFunction("spark_catalog.default.my_func1")
+        Function(name='my_func1', catalog='spark_catalog', namespace=['default'], ...
+
+        Throw an analysis exception when the function does not exist.
+
+        >>> spark.catalog.getFunction("my_func2")
+        Traceback (most recent call last):
+            ...
+        AnalysisException: ...
+        """
+        return Function(*self._spark_catalog.getFunction(functionName))
+
+    def tableExists(self, tableName: str, dbName: t.Optional[str] = None) -> bool:
+        """Check if the table or view with the specified name exists.
+        This can either be a temporary view or a table/view.
+
+        .. versionadded:: 3.3.0
+
+        Parameters
+        ----------
+        tableName : str
+            name of the table to check existence.
+            If no database is specified, first try to treat ``tableName`` as a
+            multi-layer-namespace identifier, then try ``tableName`` as a normal table
+            name in the current database if necessary.
+
+            .. versionchanged:: 3.4.0
+                Allow ``tableName`` to be qualified with catalog name when ``dbName`` is None.
+
+        dbName : str, optional
+            name of the database to check table existence in.
+
+        Returns
+        -------
+        bool
+            Indicating whether the table/view exists
+
+        Examples
+        --------
+        This function can check if a table is defined or not:
+
+        >>> spark.catalog.tableExists("unexisting_table")
+        False
+        >>> _ = spark.sql("DROP TABLE IF EXISTS tbl1")
+        >>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING parquet")
+        >>> spark.catalog.tableExists("tbl1")
+        True
+
+        Using the fully qualified names for tables.
+
+        >>> spark.catalog.tableExists("default.tbl1")
+        True
+        >>> spark.catalog.tableExists("spark_catalog.default.tbl1")
+        True
+        >>> spark.catalog.tableExists("tbl1", "default")
+        True
+        >>> _ = spark.sql("DROP TABLE tbl1")
+
+        Check if views exist:
+
+        >>> spark.catalog.tableExists("view1")
+        False
+        >>> _ = spark.sql("CREATE VIEW view1 AS SELECT 1")
+        >>> spark.catalog.tableExists("view1")
+        True
+
+        Using the fully qualified names for views.
+
+        >>> spark.catalog.tableExists("default.view1")
+        True
+        >>> spark.catalog.tableExists("spark_catalog.default.view1")
+        True
+        >>> spark.catalog.tableExists("view1", "default")
+        True
+        >>> _ = spark.sql("DROP VIEW view1")
+
+        Check if temporary views exist:
+
+        >>> _ = spark.sql("CREATE TEMPORARY VIEW view1 AS SELECT 1")
+        >>> spark.catalog.tableExists("view1")
+        True
+        >>> df = spark.sql("DROP VIEW view1")
+        >>> spark.catalog.tableExists("view1")
+        False
+        """
+        return self._spark_catalog.tableExists(tableName, dbName)
+
+    def currentCatalog(self) -> str:
+        """Returns the current default catalog in this session.
+
+        .. versionadded:: 3.4.0
+
+        Examples
+        --------
+        >>> spark.catalog.currentCatalog()
+        'spark_catalog'
+        """
+        return self._spark_catalog.currentCatalog()
+
+    def setCurrentCatalog(self, catalogName: str) -> None:
+        """Sets the current default catalog in this session.
+
+        .. versionadded:: 3.4.0
+
+        Parameters
+        ----------
+        catalogName : str
+            name of the catalog to set
+
+        Examples
+        --------
+        >>> spark.catalog.setCurrentCatalog("spark_catalog")
+        """
+        return self._spark_catalog.setCurrentCatalog(catalogName)
+
+    def currentDatabase(self) -> str:
+        """Returns the current default schema in this session.
+
+        .. versionadded:: 3.4.0
+
+        Examples
+        --------
+        >>> spark.catalog.currentDatabase()
+        'default'
+        """
+        return self._spark_catalog.currentDatabase()
+
+    def listDatabases(self, pattern: t.Optional[str] = None) -> t.List[Database]:
+        """
+        Returns a list of databases available across all sessions.
+
+        .. versionadded:: 2.0.0
+
+        Parameters
+        ----------
+        pattern : str
+            The pattern that the database name needs to match.
+
+            .. versionchanged:: 3.5.0
+                Adds ``pattern`` argument.
+
+        Returns
+        -------
+        list
+            A list of :class:`Database`.
+
+        Examples
+        --------
+        >>> spark.catalog.listDatabases()
+        [Database(name='default', catalog='spark_catalog', description='default database', ...
+
+        >>> spark.catalog.listDatabases("def*")
+        [Database(name='default', catalog='spark_catalog', description='default database', ...
+
+        >>> spark.catalog.listDatabases("def2*")
+        []
+        """
+        return [Database(*x) for x in self._spark_catalog.listDatabases(pattern)]
+
+    def listCatalogs(self, pattern: t.Optional[str] = None) -> t.List[CatalogMetadata]:
+        """
+        Returns a list of catalogs available across all sessions.
+
+        .. versionadded:: 3.4.0
+
+        Parameters
+        ----------
+        pattern : str
+            The pattern that the catalog name needs to match.
+
+            .. versionchanged:: 3.5.0
+                Adds ``pattern`` argument.
+
+        Returns
+        -------
+        list
+            A list of :class:`CatalogMetadata`.
+
+        Examples
+        --------
+        >>> spark.catalog.listCatalogs()
+        [CatalogMetadata(name='spark_catalog', description=None)]
+        """
+        return [CatalogMetadata(*x) for x in self._spark_catalog.listCatalogs(pattern)]
+
+    def setCurrentDatabase(self, dbName: str) -> None:
+        """
+        Sets the current default database in this session.
+
+        .. versionadded:: 2.0.0
+
+        Examples
+        --------
+        >>> spark.catalog.setCurrentDatabase("default")
+        """
+        return self._spark_catalog.setCurrentDatabase(dbName)
+
+    def listTables(
+        self, dbName: t.Optional[str] = None, pattern: t.Optional[str] = None
+    ) -> t.List[Table]:
+        """Returns a list of tables/views in the specified database.
+
+        .. versionadded:: 2.0.0
+
+        Parameters
+        ----------
+        dbName : str
+            name of the database to list the tables.
+
+            .. versionchanged:: 3.4.0
+                Allow ``dbName`` to be qualified with catalog name.
+
+        pattern : str
+            The pattern that the table name needs to match.
+
+            .. versionchanged:: 3.5.0
+                Adds ``pattern`` argument.
+
+        Returns
+        -------
+        list
+            A list of :class:`Table`.
+
+        Notes
+        -----
+        If no database is specified, the current database and catalog
+        are used. This API includes all temporary views.
+
+        Examples
+        --------
+        >>> spark.range(1).createTempView("test_view")
+        >>> spark.catalog.listTables()
+        [Table(name='test_view', catalog=None, namespace=[], description=None, ...
+
+        >>> spark.catalog.listTables(pattern="test*")
+        [Table(name='test_view', catalog=None, namespace=[], description=None, ...
+
+        >>> spark.catalog.listTables(pattern="table*")
+        []
+
+        >>> _ = spark.catalog.dropTempView("test_view")
+        >>> spark.catalog.listTables()
+        []
+        """
+        # Convert the PySpark results first, then append any locally tracked
+        # temp views so they are not discarded from the returned list.
+        tables = [Table(*x) for x in self._spark_catalog.listTables(dbName, pattern)]
+        for table_name in self.spark.temp_views:
+            if not pattern or fnmatch.fnmatch(table_name, pattern):
+                tables.append(
+                    Table(
+                        name=table_name,
+                        catalog=None,
+                        namespace=[],
+                        description=None,
+                        tableType="VIEW",
+                        isTemporary=True,
+                    )
+                )
+        return tables
+
+    def listColumns(self, tableName: str, dbName: t.Optional[str] = None) -> t.List[Column]:
+        """Returns a list of columns for the given table/view in the specified database.
+
+        .. versionadded:: 2.0.0
+
+        Parameters
+        ----------
+        tableName : str
+            name of the table to list columns.
+
+            .. versionchanged:: 3.4.0
+                Allow ``tableName`` to be qualified with catalog name when ``dbName`` is None.
+
+        dbName : str, optional
+            name of the database to find the table to list columns.
+
+        Returns
+        -------
+        list
+            A list of :class:`Column`.
+
+        Notes
+        -----
+        The order of arguments here is different from that of its JVM counterpart
+        because Python does not support method overloading.
+
+        If no database is specified, the current database and catalog
+        are used. This API includes all temporary views.
+
+        Examples
+        --------
+        >>> _ = spark.sql("DROP TABLE IF EXISTS tblA")
+        >>> _ = spark.sql("CREATE TABLE tblA (name STRING, age INT) USING parquet")
+        >>> spark.catalog.listColumns("tblA")
+        [Column(name='name', description=None, dataType='string', nullable=True, ...
+        >>> _ = spark.sql("DROP TABLE tblA")
+        """
+        if df := self.spark.temp_views.get(tableName):
+            return [
+                Column(
+                    name=col,
+                    description=None,
+                    dataType="",
+                    nullable=True,
+                    isPartition=False,
+                    isBucket=False,
+                )
+                for col in df.columns
+            ]
+        return [Column(*x) for x in self._spark_catalog.listColumns(tableName, dbName)]
+
+    def listFunctions(
+        self, dbName: t.Optional[str] = None, pattern: t.Optional[str] = None
+    ) -> t.List[Function]:
+        """
+        Returns a list of functions registered in the specified database.
+
+        .. versionadded:: 3.4.0
+
+        Parameters
+        ----------
+        dbName : str
+            name of the database to list the functions.
+            ``dbName`` can be qualified with catalog name.
+        pattern : str
+            The pattern that the function name needs to match.
+
+            .. versionchanged:: 3.5.0
+                Adds ``pattern`` argument.
+
+        Returns
+        -------
+        list
+            A list of :class:`Function`.
+
+        Notes
+        -----
+        If no database is specified, the current database and catalog
+        are used. This API includes all temporary functions.
+
+        Examples
+        --------
+        >>> spark.catalog.listFunctions()
+        [Function(name=...
+
+        >>> spark.catalog.listFunctions(pattern="to_*")
+        [Function(name=...
+
+        >>> spark.catalog.listFunctions(pattern="*not_existing_func*")
+        []
+        """
+        return [Function(*x) for x in self._spark_catalog.listFunctions(dbName, pattern)]
+
+    def createExternalTable(
+        self,
+        tableName: str,
+        path: t.Optional[str] = None,
+        source: t.Optional[str] = None,
+        schema: t.Optional[StructType] = None,
+        **options: str,
+    ) -> SparkDataFrame:
+        """Creates a table based on the dataset in a data source.
+
+        It returns the DataFrame associated with the external table.
+
+        The data source is specified by the ``source`` and a set of ``options``.
+        If ``source`` is not specified, the default data source configured by
+        ``spark.sql.sources.default`` will be used.
+
+        Optionally, a schema can be provided as the schema of the returned :class:`DataFrame` and
+        created external table.
+
+        .. versionadded:: 2.0.0
+
+        Returns
+        -------
+        :class:`DataFrame`
+        """
+        raise NotImplementedError()
+
+    def createTable(
+        self,
+        tableName: str,
+        path: t.Optional[str] = None,
+        source: t.Optional[str] = None,
+        schema: t.Optional[StructType] = None,
+        description: t.Optional[str] = None,
+        **options: str,
+    ) -> SparkDataFrame:
+        """Creates a table based on the dataset in a data source.
+
+        .. versionadded:: 2.2.0
+
+        Parameters
+        ----------
+        tableName : str
+            name of the table to create.
+
+            .. versionchanged:: 3.4.0
+                Allow ``tableName`` to be qualified with catalog name.
+
+        path : str, optional
+            the path in which the data for this table exists.
+            When ``path`` is specified, an external table is
+            created from the data at the given path. Otherwise a managed table is created.
+        source : str, optional
+            the source of this table such as 'parquet', 'orc', etc.
+            If ``source`` is not specified, the default data source configured by
+            ``spark.sql.sources.default`` will be used.
+        schema : class:`StructType`, optional
+            the schema for this table.
+        description : str, optional
+            the description of this table.
+
+            .. versionchanged:: 3.1.0
+                Added the ``description`` parameter.
+
+        **options : dict, optional
+            extra options to specify in the table.
+
+        Returns
+        -------
+        :class:`DataFrame`
+            The DataFrame associated with the table.
+
+        Examples
+        --------
+        Creating a managed table.
+
+        >>> _ = spark.catalog.createTable("tbl1", schema=spark.range(1).schema, source='parquet')
+        >>> _ = spark.sql("DROP TABLE tbl1")
+
+        Creating an external table
+
+        >>> import tempfile
+        >>> with tempfile.TemporaryDirectory() as d:
+        ...     _ = spark.catalog.createTable(
+        ...         "tbl2", schema=spark.range(1).schema, path=d, source='parquet')
+        >>> _ = spark.sql("DROP TABLE tbl2")
+        """
+        raise NotImplementedError()
+
+    def dropTempView(self, viewName: str) -> bool:
+        """Drops the local temporary view with the given view name in the catalog.
+        If the view has been cached before, then it will also be uncached.
+        Returns true if this view is dropped successfully, false otherwise.
+
+        .. versionadded:: 2.0.0
+
+        Parameters
+        ----------
+        viewName : str
+            name of the temporary view to drop.
+
+        Returns
+        -------
+        bool
+            If the temporary view was successfully dropped or not.
+
+            .. versionadded:: 2.1.0
+                The return type of this method was ``None`` in Spark 2.0, but changed to ``bool``
+                in Spark 2.1.
+
+        Examples
+        --------
+        >>> spark.createDataFrame([(1, 1)]).createTempView("my_table")
+
+        Dropping the temporary view.
+
+        >>> spark.catalog.dropTempView("my_table")
+        True
+
+        Throw an exception if the temporary view does not exist.
+
+        >>> spark.table("my_table")
+        Traceback (most recent call last):
+            ...
+        AnalysisException: ...
+        """
+        return self._spark_catalog.dropTempView(viewName)
+
+    def dropGlobalTempView(self, viewName: str) -> bool:
+        """Drops the global temporary view with the given view name in the catalog.
+
+        .. versionadded:: 2.1.0
+
+        Parameters
+        ----------
+        viewName : str
+            name of the global view to drop.
+
+        Returns
+        -------
+        bool
+            If the global view was successfully dropped or not.
+
+        Notes
+        -----
+        If the view has been cached before, then it will also be uncached.
+
+        Examples
+        --------
+        >>> spark.createDataFrame([(1, 1)]).createGlobalTempView("my_table")
+
+        Dropping the global view.
+
+        >>> spark.catalog.dropGlobalTempView("my_table")
+        True
+
+        Throw an exception if the global view does not exist.
+
+        >>> spark.table("global_temp.my_table")
+        Traceback (most recent call last):
+            ...
+        AnalysisException: ...
+        """
+        return self._spark_catalog.dropGlobalTempView(viewName)
+
+    def registerFunction(
+        self, name: str, f: t.Callable[..., t.Any], returnType: t.Optional[DataType] = None
+    ) -> UserDefinedFunctionLike:
+        """An alias for :func:`spark.udf.register`.
+        See :meth:`pyspark.sql.UDFRegistration.register`.
+
+        .. versionadded:: 2.0.0
+
+        .. deprecated:: 2.3.0
+            Use :func:`spark.udf.register` instead.
+
+        .. versionchanged:: 3.4.0
+            Supports Spark Connect.
+        """
+        return self._spark_catalog.registerFunction(name, f, returnType)
+
+    def isCached(self, tableName: str) -> bool:
+        """
+        Returns true if the table is currently cached in-memory.
+
+        .. versionadded:: 2.0.0
+
+        Parameters
+        ----------
+        tableName : str
+            name of the table to get.
+
+            .. versionchanged:: 3.4.0
+                Allow ``tableName`` to be qualified with catalog name.
+
+        Returns
+        -------
+        bool
+
+        Examples
+        --------
+        >>> _ = spark.sql("DROP TABLE IF EXISTS tbl1")
+        >>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING parquet")
+        >>> spark.catalog.cacheTable("tbl1")
+        >>> spark.catalog.isCached("tbl1")
+        True
+
+        Throw an analysis exception when the table does not exist.
+
+        >>> spark.catalog.isCached("not_existing_table")
+        Traceback (most recent call last):
+            ...
+        AnalysisException: ...
+
+        Using the fully qualified name for the table.
+
+        >>> spark.catalog.isCached("spark_catalog.default.tbl1")
+        True
+        >>> spark.catalog.uncacheTable("tbl1")
+        >>> _ = spark.sql("DROP TABLE tbl1")
+        """
+        return self._spark_catalog.isCached(tableName)
+
+    def cacheTable(self, tableName: str, storageLevel: t.Optional[StorageLevel] = None) -> None:
+        """Caches the specified table in-memory or with given storage level.
+        Default MEMORY_AND_DISK.
+
+        .. versionadded:: 2.0.0
+
+        Parameters
+        ----------
+        tableName : str
+            name of the table to get.
+
+            .. versionchanged:: 3.4.0
+                Allow ``tableName`` to be qualified with catalog name.
+
+        storageLevel : :class:`StorageLevel`
+            storage level to set for persistence.
+
+            .. versionchanged:: 3.5.0
+                Allow to specify storage level.
+
+        Examples
+        --------
+        >>> _ = spark.sql("DROP TABLE IF EXISTS tbl1")
+        >>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING parquet")
+        >>> spark.catalog.cacheTable("tbl1")
+
+        or
+
+        >>> spark.catalog.cacheTable("tbl1", StorageLevel.OFF_HEAP)
+
+        Throw an analysis exception when the table does not exist.
+
+        >>> spark.catalog.cacheTable("not_existing_table")
+        Traceback (most recent call last):
+            ...
+        AnalysisException: ...
+
+        Using the fully qualified name for the table.
+
+        >>> spark.catalog.cacheTable("spark_catalog.default.tbl1")
+        >>> spark.catalog.uncacheTable("tbl1")
+        >>> _ = spark.sql("DROP TABLE tbl1")
+        """
+        return self._spark_catalog.cacheTable(tableName, storageLevel)
+
+    def uncacheTable(self, tableName: str) -> None:
+        """Removes the specified table from the in-memory cache.
+
+        .. versionadded:: 2.0.0
+
+        Parameters
+        ----------
+        tableName : str
+            name of the table to get.
+
+            .. versionchanged:: 3.4.0
+                Allow ``tableName`` to be qualified with catalog name.
+
+        Examples
+        --------
+        >>> _ = spark.sql("DROP TABLE IF EXISTS tbl1")
+        >>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING parquet")
+        >>> spark.catalog.cacheTable("tbl1")
+        >>> spark.catalog.uncacheTable("tbl1")
+        >>> spark.catalog.isCached("tbl1")
+        False
+
+        Throw an analysis exception when the table does not exist.
+
+        >>> spark.catalog.uncacheTable("not_existing_table")
+        Traceback (most recent call last):
+            ...
+        AnalysisException: ...
+
+        Using the fully qualified name for the table.
+
+        >>> spark.catalog.uncacheTable("spark_catalog.default.tbl1")
+        >>> spark.catalog.isCached("tbl1")
+        False
+        >>> _ = spark.sql("DROP TABLE tbl1")
+        """
+        return self._spark_catalog.uncacheTable(tableName)
+
+    def clearCache(self) -> None:
+        """Removes all cached tables from the in-memory cache.
+
+        .. versionadded:: 2.0.0
+
+        Examples
+        --------
+        >>> _ = spark.sql("DROP TABLE IF EXISTS tbl1")
+        >>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING parquet")
+        >>> spark.catalog.clearCache()
+        >>> spark.catalog.isCached("tbl1")
+        False
+        >>> _ = spark.sql("DROP TABLE tbl1")
+        """
+        return self._spark_catalog.clearCache()
+
+    def refreshTable(self, tableName: str) -> None:
+        """Invalidates and refreshes all the cached data and metadata of the given table.
+
+        .. versionadded:: 2.0.0
+
+        Parameters
+        ----------
+        tableName : str
+            name of the table to get.
+
+            .. versionchanged:: 3.4.0
+                Allow ``tableName`` to be qualified with catalog name.
+
+        Examples
+        --------
+        The example below caches a table, and then removes the data.
+
+        >>> import tempfile
+        >>> with tempfile.TemporaryDirectory() as d:
+        ...     _ = spark.sql("DROP TABLE IF EXISTS tbl1")
+        ...     _ = spark.sql(
+        ...         "CREATE TABLE tbl1 (col STRING) USING TEXT LOCATION '{}'".format(d))
+        ...     _ = spark.sql("INSERT INTO tbl1 SELECT 'abc'")
+        ...     spark.catalog.cacheTable("tbl1")
+        ...     spark.table("tbl1").show()
+        +---+
+        |col|
+        +---+
+        |abc|
+        +---+
+
+        Because the table is cached, it computes from the cached data as below.
+
+        >>> spark.table("tbl1").count()
+        1
+
+        After refreshing the table, it shows 0 because the data does not exist anymore.
+
+        >>> spark.catalog.refreshTable("tbl1")
+        >>> spark.table("tbl1").count()
+        0
+
+        Using the fully qualified name for the table.
+
+        >>> spark.catalog.refreshTable("spark_catalog.default.tbl1")
+        >>> _ = spark.sql("DROP TABLE tbl1")
+        """
+        return self._spark_catalog.refreshTable(tableName)
+
+    def recoverPartitions(self, tableName: str) -> None:
+        """Recovers all the partitions of the given table and updates the catalog.
+
+        .. versionadded:: 2.1.1
+
+        Parameters
+        ----------
+        tableName : str
+            name of the table to get.
+
+        Notes
+        -----
+        Only works with a partitioned table, and not a view.
+
+        Examples
+        --------
+        The example below creates a partitioned table against the existing directory of
+        the partitioned table. After that, it recovers the partitions.
+
+        >>> import tempfile
+        >>> with tempfile.TemporaryDirectory() as d:
+        ...     _ = spark.sql("DROP TABLE IF EXISTS tbl1")
+        ...     spark.range(1).selectExpr(
+        ...         "id as key", "id as value").write.partitionBy("key").mode("overwrite").save(d)
+        ...     _ = spark.sql(
+        ...         "CREATE TABLE tbl1 (key LONG, value LONG)"
+        ...         "USING parquet OPTIONS (path '{}') PARTITIONED BY (key)".format(d))
+        ...     spark.table("tbl1").show()
+        ...     spark.catalog.recoverPartitions("tbl1")
+        ...     spark.table("tbl1").show()
+        +-----+---+
+        |value|key|
+        +-----+---+
+        +-----+---+
+        +-----+---+
+        |value|key|
+        +-----+---+
+        |    0|  0|
+        +-----+---+
+        >>> _ = spark.sql("DROP TABLE tbl1")
+        """
+        return self._spark_catalog.recoverPartitions(tableName)
+
+    def refreshByPath(self, path: str) -> None:
+        """Invalidates and refreshes all the cached data (and the associated metadata) for any
+        DataFrame that contains the given data source path.
+
+        .. versionadded:: 2.2.0
+
+        Parameters
+        ----------
+        path : str
+            the path to refresh the cache.
+
+        Examples
+        --------
+        The example below caches a table, and then removes the data.
+
+        >>> import tempfile
+        >>> with tempfile.TemporaryDirectory() as d:
+        ...     _ = spark.sql("DROP TABLE IF EXISTS tbl1")
+        ...     _ = spark.sql(
+        ...         "CREATE TABLE tbl1 (col STRING) USING TEXT LOCATION '{}'".format(d))
+        ...     _ = spark.sql("INSERT INTO tbl1 SELECT 'abc'")
+        ...     spark.catalog.cacheTable("tbl1")
+        ...     spark.table("tbl1").show()
+        +---+
+        |col|
+        +---+
+        |abc|
+        +---+
+
+        Because the table is cached, it computes from the cached data as below.
+
+        >>> spark.table("tbl1").count()
+        1
+
+        After refreshing the table by path, it shows 0 because the data does not exist anymore.
+
+        >>> spark.catalog.refreshByPath(d)
+        >>> spark.table("tbl1").count()
+        0
+
+        >>> _ = spark.sql("DROP TABLE tbl1")
+        """
+        return self._spark_catalog.refreshByPath(path)
+
+    def _reset(self) -> None:
+        """(Internal use only) Drop all existing databases (except "default"), tables,
+        partitions and functions, and set the current database to "default".
+
+        This is mainly used for tests.
+        """
+        return self._spark_catalog._reset()
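
The class shown in this diff is a thin pass-through: nearly every method delegates to the PySpark Catalog reached via self.session.spark_session.catalog and re-wraps the results in sqlframe's own catalog records (Database, Table, Function, CatalogMetadata). A minimal usage sketch under stated assumptions (a working local PySpark installation; the zero-argument SparkSession() construction mirrors sqlframe's other engine sessions and is an assumption, not something this diff confirms):

# Hedged sketch, not part of the packaged file above.
# Assumes pyspark is installed; SparkSession() with no arguments is an
# assumed constructor modeled on sqlframe's other engine sessions.
from sqlframe.spark.session import SparkSession

spark = SparkSession()

# These calls pass straight through to pyspark's Catalog and come back
# as the record types defined in sqlframe.base.catalog.
print(spark.catalog.currentCatalog())             # e.g. 'spark_catalog'
print(spark.catalog.databaseExists("default"))    # True on a default setup
for db in spark.catalog.listDatabases():
    print(db.name, db.catalog)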