sqlframe 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sqlframe/_version.py +2 -2
- sqlframe/base/function_alternatives.py +96 -0
- sqlframe/base/functions.py +4013 -1
- sqlframe/base/session.py +2 -2
- sqlframe/base/types.py +1 -1
- sqlframe/base/util.py +5 -0
- sqlframe/bigquery/functions.py +4 -0
- sqlframe/bigquery/functions.pyi +37 -1
- sqlframe/duckdb/functions.py +3 -0
- sqlframe/duckdb/functions.pyi +29 -0
- sqlframe/postgres/functions.py +6 -0
- sqlframe/postgres/functions.pyi +28 -0
- sqlframe/snowflake/functions.py +3 -0
- sqlframe/snowflake/functions.pyi +27 -0
- sqlframe/spark/functions.pyi +161 -1
- {sqlframe-1.9.0.dist-info → sqlframe-1.10.0.dist-info}/METADATA +1 -1
- {sqlframe-1.9.0.dist-info → sqlframe-1.10.0.dist-info}/RECORD +20 -20
- {sqlframe-1.9.0.dist-info → sqlframe-1.10.0.dist-info}/LICENSE +0 -0
- {sqlframe-1.9.0.dist-info → sqlframe-1.10.0.dist-info}/WHEEL +0 -0
- {sqlframe-1.9.0.dist-info → sqlframe-1.10.0.dist-info}/top_level.txt +0 -0
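The bulk of this release is the batch of PySpark-compatible column functions added to sqlframe/base/functions.py (and surfaced through the engine-specific functions.pyi stubs listed above). As a quick orientation before the diff itself, here is a minimal, hypothetical usage sketch. The power and bool_and helpers are taken from the diff below; the DuckDBSession bootstrapping and the assumption that these helpers are importable from sqlframe.duckdb.functions reflect sqlframe's PySpark-style API rather than anything shown in this diff.

# Hypothetical usage sketch (not part of the package diff).
from sqlframe.duckdb import DuckDBSession          # assumed session entry point
from sqlframe.duckdb import functions as F         # this stub grows by 29 lines in 1.10.0

session = DuckDBSession()                          # assumed to default to an in-memory DuckDB connection
df = session.createDataFrame([(2.0, True), (3.0, False)], ["x", "flag"])

# power is the new alias for pow added in this release.
df.select(F.power("x", 2.0).alias("x_squared")).show()

# bool_and is decorated @meta() in the diff, i.e. mapped to expression.LogicalAnd for all engines.
df.agg(F.bool_and("flag").alias("all_true")).show()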
sqlframe/base/functions.py
CHANGED
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import decimal
 import logging
 import typing as t
 
@@ -13,6 +14,8 @@ from sqlframe.base.column import Column
 from sqlframe.base.decorators import func_metadata as meta
 
 if t.TYPE_CHECKING:
+    from pyspark.sql.session import SparkContext
+
     from sqlframe.base._typing import ColumnOrLiteral, ColumnOrName
     from sqlframe.base.session import DF
     from sqlframe.base.types import ArrayType, StructType
@@ -424,6 +427,9 @@ def pow(col1: t.Union[ColumnOrName, float], col2: t.Union[ColumnOrName, float])
     return Column.invoke_expression_over_column(col1_value, expression.Pow, expression=col2_value)
 
 
+power = pow
+
+
 @meta()
 def row_number() -> Column:
     return Column(expression.Anonymous(this="ROW_NUMBER"))
@@ -947,7 +953,7 @@ def timestamp_seconds(col: ColumnOrName) -> Column:
     return Column.invoke_expression_over_column(col, expression.UnixToTime)
 
 
-@meta(unsupported_engines=["
+@meta(unsupported_engines=["*", "spark"])
 def window(
     timeColumn: ColumnOrName,
     windowDuration: str,
@@ -1278,6 +1284,73 @@ def array(*cols: t.Union[ColumnOrName, t.Iterable[ColumnOrName]]) -> Column:
     return Column.invoke_expression_over_column(None, expression.Array, expressions=columns)
 
 
+@meta(unsupported_engines="*")
+def array_agg(col: ColumnOrName) -> Column:
+    return Column.invoke_expression_over_column(col, expression.ArrayAgg)
+
+
+@meta(unsupported_engines="*")
+def array_append(col: ColumnOrName, value: ColumnOrLiteral) -> Column:
+    value = value if isinstance(value, Column) else lit(value)
+    return Column.invoke_anonymous_function(col, "ARRAY_APPEND", value)
+
+
+@meta(unsupported_engines="*")
+def array_compact(col: ColumnOrName) -> Column:
+    return Column.invoke_anonymous_function(col, "ARRAY_COMPACT")
+
+
+@meta(unsupported_engines="*")
+def array_insert(
+    col: ColumnOrName, pos: t.Union[ColumnOrName, int], value: ColumnOrLiteral
+) -> Column:
+    value = value if isinstance(value, Column) else lit(value)
+    if isinstance(pos, int):
+        pos = lit(pos)
+    return Column.invoke_anonymous_function(col, "ARRAY_INSERT", pos, value) # type: ignore
+
+
+@meta(unsupported_engines="*")
+def array_prepend(col: ColumnOrName, value: ColumnOrLiteral) -> Column:
+    value = value if isinstance(value, Column) else lit(value)
+    return Column.invoke_anonymous_function(col, "ARRAY_PREPEND", value)
+
+
+@meta(unsupported_engines="*")
+def array_size(col: ColumnOrName) -> Column:
+    return Column.invoke_anonymous_function(col, "ARRAY_SIZE")
+
+
+@meta(unsupported_engines="*")
+def bit_and(col: ColumnOrName) -> Column:
+    return Column.invoke_anonymous_function(col, "BIT_AND")
+
+
+@meta(unsupported_engines="*")
+def bit_or(col: ColumnOrName) -> Column:
+    return Column.invoke_anonymous_function(col, "BIT_OR")
+
+
+@meta(unsupported_engines="*")
+def bit_xor(col: ColumnOrName) -> Column:
+    return Column.invoke_anonymous_function(col, "BIT_XOR")
+
+
+@meta(unsupported_engines="*")
+def bit_count(col: ColumnOrName) -> Column:
+    return Column.invoke_anonymous_function(col, "BIT_COUNT")
+
+
+@meta(unsupported_engines="*")
+def bit_get(col: ColumnOrName, pos: ColumnOrName) -> Column:
+    return Column.invoke_anonymous_function(col, "BIT_GET", pos)
+
+
+@meta(unsupported_engines="*")
+def getbit(col: ColumnOrName, pos: ColumnOrName) -> Column:
+    return Column.invoke_anonymous_function(col, "GETBIT", pos)
+
+
 @meta(unsupported_engines=["bigquery", "postgres"])
 def create_map(*cols: t.Union[ColumnOrName, t.Iterable[ColumnOrName]]) -> Column:
     cols = list(_flatten(cols)) if not isinstance(cols[0], (str, Column)) else cols # type: ignore
@@ -1767,6 +1840,31 @@ def aes_encrypt(
     return Column.invoke_anonymous_function(input, "AES_ENCRYPT", *columns)
 
 
+@meta(unsupported_engines="*")
+def bitmap_bit_position(col: ColumnOrName) -> Column:
+    return Column.invoke_anonymous_function(col, "BITMAP_BIT_POSITION")
+
+
+@meta(unsupported_engines="*")
+def bitmap_bucket_number(col: ColumnOrName) -> Column:
+    return Column.invoke_anonymous_function(col, "BITMAP_BUCKET_NUMBER")
+
+
+@meta(unsupported_engines="*")
+def bitmap_construct_agg(col: ColumnOrName) -> Column:
+    return Column.invoke_anonymous_function(col, "BITMAP_CONSTRUCT_AGG")
+
+
+@meta(unsupported_engines="*")
+def bitmap_count(col: ColumnOrName) -> Column:
+    return Column.invoke_anonymous_function(col, "BITMAP_COUNT")
+
+
+@meta(unsupported_engines="*")
+def bitmap_or_agg(col: ColumnOrName) -> Column:
+    return Column.invoke_anonymous_function(col, "BITMAP_OR_AGG")
+
+
 @meta(unsupported_engines="*")
 def to_binary(col: ColumnOrName, format: t.Optional[ColumnOrName] = None) -> Column:
     if format is not None:
@@ -1774,6 +1872,3920 @@ def to_binary(col: ColumnOrName, format: t.Optional[ColumnOrName] = None) -> Col
|
|
|
1774
1872
|
return Column.invoke_anonymous_function(col, "TO_BINARY")
|
|
1775
1873
|
|
|
1776
1874
|
|
|
1875
|
+
@meta()
|
|
1876
|
+
def any_value(col: ColumnOrName, ignoreNulls: t.Optional[t.Union[bool, Column]] = None) -> Column:
|
|
1877
|
+
column = Column.invoke_expression_over_column(col, expression.AnyValue)
|
|
1878
|
+
if ignoreNulls:
|
|
1879
|
+
return Column(expression.IgnoreNulls(this=column.expression))
|
|
1880
|
+
return column
|
|
1881
|
+
|
|
1882
|
+
|
|
1883
|
+
@meta(unsupported_engines="*")
|
|
1884
|
+
def approx_percentile(
|
|
1885
|
+
col: ColumnOrName,
|
|
1886
|
+
percentage: t.Union[Column, float, t.List[float], t.Tuple[float]],
|
|
1887
|
+
accuracy: t.Union[Column, float] = 10000,
|
|
1888
|
+
) -> Column:
|
|
1889
|
+
percentage = lit(percentage) if not isinstance(accuracy, Column) else percentage
|
|
1890
|
+
accuracy = lit(accuracy) if not isinstance(accuracy, Column) else accuracy
|
|
1891
|
+
|
|
1892
|
+
return Column.invoke_expression_over_column(
|
|
1893
|
+
col, expression.ApproxQuantile, quantile=percentage, accuracy=accuracy
|
|
1894
|
+
)
|
|
1895
|
+
|
|
1896
|
+
|
|
1897
|
+
@meta()
|
|
1898
|
+
def bool_and(col: ColumnOrName) -> Column:
|
|
1899
|
+
return Column.invoke_expression_over_column(col, expression.LogicalAnd)
|
|
1900
|
+
|
|
1901
|
+
|
|
1902
|
+
@meta()
|
|
1903
|
+
def bool_or(col: ColumnOrName) -> Column:
|
|
1904
|
+
return Column.invoke_expression_over_column(col, expression.LogicalOr)
|
|
1905
|
+
|
|
1906
|
+
|
|
1907
|
+
@meta(unsupported_engines="*")
|
|
1908
|
+
def btrim(str: ColumnOrName, trim: t.Optional[ColumnOrName] = None) -> Column:
|
|
1909
|
+
if trim is not None:
|
|
1910
|
+
return Column.invoke_anonymous_function(str, "btrim", trim)
|
|
1911
|
+
else:
|
|
1912
|
+
return Column.invoke_anonymous_function(str, "btrim")
|
|
1913
|
+
|
|
1914
|
+
|
|
1915
|
+
@meta(unsupported_engines="*")
|
|
1916
|
+
def bucket(numBuckets: t.Union[Column, int], col: ColumnOrName) -> Column:
|
|
1917
|
+
numBuckets = lit(numBuckets) if isinstance(numBuckets, int) else numBuckets
|
|
1918
|
+
return Column.invoke_anonymous_function(numBuckets, "bucket", col)
|
|
1919
|
+
|
|
1920
|
+
|
|
1921
|
+
@meta()
|
|
1922
|
+
def call_function(funcName: str, *cols: ColumnOrName) -> Column:
|
|
1923
|
+
cols = ensure_list(cols) # type: ignore
|
|
1924
|
+
if len(cols) > 1:
|
|
1925
|
+
return Column.invoke_anonymous_function(cols[0], funcName, *cols[1:])
|
|
1926
|
+
return Column.invoke_anonymous_function(cols[0], funcName)
|
|
1927
|
+
|
|
1928
|
+
|
|
1929
|
+
# @meta(unsupported_engines="*")
|
|
1930
|
+
# def call_udf(udfName: str, *cols: ColumnOrName) -> Column:
|
|
1931
|
+
# """
|
|
1932
|
+
# Call an user-defined function.
|
|
1933
|
+
#
|
|
1934
|
+
# .. versionadded:: 3.4.0
|
|
1935
|
+
#
|
|
1936
|
+
# Parameters
|
|
1937
|
+
# ----------
|
|
1938
|
+
# udfName : str
|
|
1939
|
+
# name of the user defined function (UDF)
|
|
1940
|
+
# cols : :class:`~pyspark.sql.Column` or str
|
|
1941
|
+
# column names or :class:`~pyspark.sql.Column`\\s to be used in the UDF
|
|
1942
|
+
#
|
|
1943
|
+
# Returns
|
|
1944
|
+
# -------
|
|
1945
|
+
# :class:`~pyspark.sql.Column`
|
|
1946
|
+
# result of executed udf.
|
|
1947
|
+
#
|
|
1948
|
+
# Examples
|
|
1949
|
+
# --------
|
|
1950
|
+
# >>> from pyspark.sql.functions import call_udf, col
|
|
1951
|
+
# >>> from pyspark.sql.types import IntegerType, StringType
|
|
1952
|
+
# >>> df = spark.createDataFrame([(1, "a"),(2, "b"), (3, "c")],["id", "name"])
|
|
1953
|
+
# >>> _ = spark.udf.register("intX2", lambda i: i * 2, IntegerType())
|
|
1954
|
+
# >>> df.select(call_udf("intX2", "id")).show()
|
|
1955
|
+
# +---------+
|
|
1956
|
+
# |intX2(id)|
|
|
1957
|
+
# +---------+
|
|
1958
|
+
# | 2|
|
|
1959
|
+
# | 4|
|
|
1960
|
+
# | 6|
|
|
1961
|
+
# +---------+
|
|
1962
|
+
# >>> _ = spark.udf.register("strX2", lambda s: s * 2, StringType())
|
|
1963
|
+
# >>> df.select(call_udf("strX2", col("name"))).show()
|
|
1964
|
+
# +-----------+
|
|
1965
|
+
# |strX2(name)|
|
|
1966
|
+
# +-----------+
|
|
1967
|
+
# | aa|
|
|
1968
|
+
# | bb|
|
|
1969
|
+
# | cc|
|
|
1970
|
+
# +-----------+
|
|
1971
|
+
# """
|
|
1972
|
+
# sc = get_active_spark_context()
|
|
1973
|
+
# return _invoke_function("call_udf", udfName, _to_seq(sc, cols, _to_java_column))
|
|
1974
|
+
#
|
|
1975
|
+
#
|
|
1976
|
+
# @pytest.mark.parametrize(
|
|
1977
|
+
# "expression, expected",
|
|
1978
|
+
# [
|
|
1979
|
+
# (SF.call_udf("cola"), "CALL_UDF(cola)"),
|
|
1980
|
+
# (SF.call_udf(SF.col("cola")), "CALL_UDF(cola)"),
|
|
1981
|
+
# ],
|
|
1982
|
+
# )
|
|
1983
|
+
# def test_call_udf(expression, expected):
|
|
1984
|
+
# assert expression.sql() == expected
|
|
1985
|
+
#
|
|
1986
|
+
# def test_call_udf(get_session_and_func, get_func):
|
|
1987
|
+
# session, call_udf = get_session_and_func("call_udf")
|
|
1988
|
+
# >>> from pyspark.sql.functions import call_udf, col
|
|
1989
|
+
# >>> from pyspark.sql.types import IntegerType, StringType
|
|
1990
|
+
# >>> df = spark.createDataFrame([(1, "a"),(2, "b"), (3, "c")],["id", "name"])
|
|
1991
|
+
# >>> _ = spark.udf.register("intX2", lambda i: i * 2, IntegerType())
|
|
1992
|
+
# >>> df.select(call_udf("intX2", "id")).show()
|
|
1993
|
+
# +---------+
|
|
1994
|
+
# |intX2(id)|
|
|
1995
|
+
# +---------+
|
|
1996
|
+
# | 2|
|
|
1997
|
+
# | 4|
|
|
1998
|
+
# | 6|
|
|
1999
|
+
# +---------+
|
|
2000
|
+
# >>> _ = spark.udf.register("strX2", lambda s: s * 2, StringType())
|
|
2001
|
+
# >>> df.select(call_udf("strX2", col("name"))).show()
|
|
2002
|
+
# +-----------+
|
|
2003
|
+
# |strX2(name)|
|
|
2004
|
+
# +-----------+
|
|
2005
|
+
# | aa|
|
|
2006
|
+
# | bb|
|
|
2007
|
+
# | cc|
|
|
2008
|
+
# +-----------+
|
|
2009
|
+
|
|
2010
|
+
|
|
2011
|
+
@meta(unsupported_engines="*")
|
|
2012
|
+
def cardinality(col: ColumnOrName) -> Column:
|
|
2013
|
+
return Column.invoke_anonymous_function(col, "cardinality")
|
|
2014
|
+
|
|
2015
|
+
|
|
2016
|
+
@meta()
|
|
2017
|
+
def char(col: ColumnOrName) -> Column:
|
|
2018
|
+
return Column.invoke_expression_over_column(col, expression.Chr)
|
|
2019
|
+
|
|
2020
|
+
|
|
2021
|
+
@meta(unsupported_engines="*")
|
|
2022
|
+
def char_length(str: ColumnOrName) -> Column:
|
|
2023
|
+
return Column.invoke_anonymous_function(str, "char_length")
|
|
2024
|
+
|
|
2025
|
+
|
|
2026
|
+
@meta(unsupported_engines="*")
|
|
2027
|
+
def character_length(str: ColumnOrName) -> Column:
|
|
2028
|
+
return Column.invoke_anonymous_function(str, "character_length")
|
|
2029
|
+
|
|
2030
|
+
|
|
2031
|
+
@meta(unsupported_engines="*")
|
|
2032
|
+
def contains(left: ColumnOrName, right: ColumnOrName) -> Column:
|
|
2033
|
+
return Column.invoke_anonymous_function(left, "contains", right)
|
|
2034
|
+
|
|
2035
|
+
|
|
2036
|
+
@meta(unsupported_engines="*")
|
|
2037
|
+
def convert_timezone(
|
|
2038
|
+
sourceTz: t.Optional[Column], targetTz: Column, sourceTs: ColumnOrName
|
|
2039
|
+
) -> Column:
|
|
2040
|
+
if sourceTz is None:
|
|
2041
|
+
return Column.invoke_anonymous_function(targetTz, "convert_timezone", sourceTs)
|
|
2042
|
+
else:
|
|
2043
|
+
return Column.invoke_anonymous_function(sourceTz, "convert_timezone", targetTz, sourceTs)
|
|
2044
|
+
|
|
2045
|
+
|
|
2046
|
+
@meta(unsupported_engines="postgres")
|
|
2047
|
+
def count_if(col: ColumnOrName) -> Column:
|
|
2048
|
+
return Column.invoke_expression_over_column(col, expression.CountIf)
|
|
2049
|
+
|
|
2050
|
+
|
|
2051
|
+
@meta(unsupported_engines="*")
|
|
2052
|
+
def count_min_sketch(
|
|
2053
|
+
col: ColumnOrName,
|
|
2054
|
+
eps: ColumnOrName,
|
|
2055
|
+
confidence: ColumnOrName,
|
|
2056
|
+
seed: ColumnOrName,
|
|
2057
|
+
) -> Column:
|
|
2058
|
+
eps = Column.ensure_col(eps).cast("double")
|
|
2059
|
+
confidence = Column.ensure_col(confidence).cast("double")
|
|
2060
|
+
return Column.invoke_anonymous_function(col, "count_min_sketch", eps, confidence, seed)
|
|
2061
|
+
|
|
2062
|
+
|
|
2063
|
+
@meta(unsupported_engines="*")
|
|
2064
|
+
def curdate() -> Column:
|
|
2065
|
+
"""
|
|
2066
|
+
Returns the current date at the start of query evaluation as a :class:`DateType` column.
|
|
2067
|
+
All calls of current_date within the same query return the same value.
|
|
2068
|
+
|
|
2069
|
+
.. versionadded:: 3.5.0
|
|
2070
|
+
|
|
2071
|
+
Returns
|
|
2072
|
+
-------
|
|
2073
|
+
:class:`~pyspark.sql.Column`
|
|
2074
|
+
current date.
|
|
2075
|
+
|
|
2076
|
+
Examples
|
|
2077
|
+
--------
|
|
2078
|
+
>>> import pyspark.sql.functions as sf
|
|
2079
|
+
>>> spark.range(1).select(sf.curdate()).show() # doctest: +SKIP
|
|
2080
|
+
+--------------+
|
|
2081
|
+
|current_date()|
|
|
2082
|
+
+--------------+
|
|
2083
|
+
| 2022-08-26|
|
|
2084
|
+
+--------------+
|
|
2085
|
+
"""
|
|
2086
|
+
return Column.invoke_anonymous_function(None, "curdate")
|
|
2087
|
+
|
|
2088
|
+
|
|
2089
|
+
@meta(unsupported_engines="*")
|
|
2090
|
+
def current_catalog() -> Column:
|
|
2091
|
+
"""Returns the current catalog.
|
|
2092
|
+
|
|
2093
|
+
.. versionadded:: 3.5.0
|
|
2094
|
+
|
|
2095
|
+
Examples
|
|
2096
|
+
--------
|
|
2097
|
+
>>> spark.range(1).select(current_catalog()).show()
|
|
2098
|
+
+-----------------+
|
|
2099
|
+
|current_catalog()|
|
|
2100
|
+
+-----------------+
|
|
2101
|
+
| spark_catalog|
|
|
2102
|
+
+-----------------+
|
|
2103
|
+
"""
|
|
2104
|
+
return Column.invoke_anonymous_function(None, "current_catalog")
|
|
2105
|
+
|
|
2106
|
+
|
|
2107
|
+
@meta(unsupported_engines="*")
|
|
2108
|
+
def current_database() -> Column:
|
|
2109
|
+
"""Returns the current database.
|
|
2110
|
+
|
|
2111
|
+
.. versionadded:: 3.5.0
|
|
2112
|
+
|
|
2113
|
+
Examples
|
|
2114
|
+
--------
|
|
2115
|
+
>>> spark.range(1).select(current_database()).show()
|
|
2116
|
+
+------------------+
|
|
2117
|
+
|current_database()|
|
|
2118
|
+
+------------------+
|
|
2119
|
+
| default|
|
|
2120
|
+
+------------------+
|
|
2121
|
+
"""
|
|
2122
|
+
return Column.invoke_anonymous_function(None, "current_database")
|
|
2123
|
+
|
|
2124
|
+
|
|
2125
|
+
current_schema = current_database
|
|
2126
|
+
|
|
2127
|
+
|
|
2128
|
+
@meta(unsupported_engines="*")
|
|
2129
|
+
def current_timezone() -> Column:
|
|
2130
|
+
return Column.invoke_anonymous_function(None, "current_timezone")
|
|
2131
|
+
|
|
2132
|
+
|
|
2133
|
+
@meta()
|
|
2134
|
+
def current_user() -> Column:
|
|
2135
|
+
return Column.invoke_expression_over_column(None, expression.CurrentUser)
|
|
2136
|
+
|
|
2137
|
+
|
|
2138
|
+
@meta(unsupported_engines="*")
|
|
2139
|
+
def date_from_unix_date(days: ColumnOrName) -> Column:
|
|
2140
|
+
return Column.invoke_anonymous_function(days, "date_from_unix_date")
|
|
2141
|
+
|
|
2142
|
+
|
|
2143
|
+
@meta(unsupported_engines="*")
|
|
2144
|
+
def date_part(field: ColumnOrName, source: ColumnOrName) -> Column:
|
|
2145
|
+
return Column.invoke_anonymous_function(field, "date_part", source)
|
|
2146
|
+
|
|
2147
|
+
|
|
2148
|
+
dateadd = date_add
|
|
2149
|
+
datediff = date_diff
|
|
2150
|
+
|
|
2151
|
+
|
|
2152
|
+
@meta(unsupported_engines="*")
|
|
2153
|
+
def datepart(field: ColumnOrName, source: ColumnOrName) -> Column:
|
|
2154
|
+
return Column.invoke_anonymous_function(field, "datepart", source)
|
|
2155
|
+
|
|
2156
|
+
|
|
2157
|
+
@meta(unsupported_engines="*")
|
|
2158
|
+
def day(col: ColumnOrName) -> Column:
|
|
2159
|
+
return Column.invoke_expression_over_column(col, expression.Day)
|
|
2160
|
+
|
|
2161
|
+
|
|
2162
|
+
@meta(unsupported_engines="*")
|
|
2163
|
+
def days(col: ColumnOrName) -> Column:
|
|
2164
|
+
return Column.invoke_anonymous_function(col, "days")
|
|
2165
|
+
|
|
2166
|
+
|
|
2167
|
+
@meta(unsupported_engines="*")
|
|
2168
|
+
def elt(*inputs: ColumnOrName) -> Column:
|
|
2169
|
+
inputs = ensure_list(inputs) # type: ignore
|
|
2170
|
+
if len(inputs) > 1:
|
|
2171
|
+
return Column.invoke_anonymous_function(inputs[0], "elt", *inputs[1:])
|
|
2172
|
+
return Column.invoke_anonymous_function(inputs[0], "elt")
|
|
2173
|
+
|
|
2174
|
+
|
|
2175
|
+
@meta(unsupported_engines="*")
|
|
2176
|
+
def endswith(str: ColumnOrName, suffix: ColumnOrName) -> Column:
|
|
2177
|
+
return Column.invoke_anonymous_function(str, "endswith", suffix)
|
|
2178
|
+
|
|
2179
|
+
|
|
2180
|
+
@meta(unsupported_engines="*")
|
|
2181
|
+
def equal_null(col1: ColumnOrName, col2: ColumnOrName) -> Column:
|
|
2182
|
+
return Column.invoke_anonymous_function(col1, "equal_null", col2)
|
|
2183
|
+
|
|
2184
|
+
|
|
2185
|
+
@meta(unsupported_engines="*")
|
|
2186
|
+
def every(col: ColumnOrName) -> Column:
|
|
2187
|
+
return Column.invoke_anonymous_function(col, "every")
|
|
2188
|
+
|
|
2189
|
+
|
|
2190
|
+
@meta()
|
|
2191
|
+
def extract(field: ColumnOrName, source: ColumnOrName) -> Column:
|
|
2192
|
+
return Column.invoke_expression_over_column(field, expression.Extract, expression=source)
|
|
2193
|
+
|
|
2194
|
+
|
|
2195
|
+
@meta(unsupported_engines="*")
|
|
2196
|
+
def find_in_set(str: ColumnOrName, str_array: ColumnOrName) -> Column:
|
|
2197
|
+
return Column.invoke_anonymous_function(str, "find_in_set", str_array)
|
|
2198
|
+
|
|
2199
|
+
|
|
2200
|
+
@meta(unsupported_engines="*")
|
|
2201
|
+
def first_value(col: ColumnOrName, ignoreNulls: t.Optional[t.Union[bool, Column]] = None) -> Column:
|
|
2202
|
+
column = Column.invoke_expression_over_column(col, expression.FirstValue)
|
|
2203
|
+
|
|
2204
|
+
if ignoreNulls:
|
|
2205
|
+
return Column(expression.IgnoreNulls(this=column.expression))
|
|
2206
|
+
return column
|
|
2207
|
+
|
|
2208
|
+
|
|
2209
|
+
@meta(unsupported_engines="*")
|
|
2210
|
+
def get(col: ColumnOrName, index: t.Union[ColumnOrName, int]) -> Column:
|
|
2211
|
+
index = lit(index) if isinstance(index, int) else index
|
|
2212
|
+
|
|
2213
|
+
return Column.invoke_anonymous_function(col, "get", index)
|
|
2214
|
+
|
|
2215
|
+
|
|
2216
|
+
@meta(unsupported_engines="*")
|
|
2217
|
+
def get_active_spark_context() -> SparkContext:
|
|
2218
|
+
"""Raise RuntimeError if SparkContext is not initialized,
|
|
2219
|
+
otherwise, returns the active SparkContext."""
|
|
2220
|
+
from sqlframe.base.session import _BaseSession
|
|
2221
|
+
from sqlframe.spark.session import SparkSession
|
|
2222
|
+
|
|
2223
|
+
session: _BaseSession = _BaseSession()
|
|
2224
|
+
if not isinstance(session, SparkSession):
|
|
2225
|
+
raise RuntimeError("This function is only available in SparkSession.")
|
|
2226
|
+
return session.spark_session.sparkContext
|
|
2227
|
+
|
|
2228
|
+
|
|
2229
|
+
@meta(unsupported_engines="*")
|
|
2230
|
+
def grouping(col: ColumnOrName) -> Column:
|
|
2231
|
+
"""
|
|
2232
|
+
Aggregate function: indicates whether a specified column in a GROUP BY list is aggregated
|
|
2233
|
+
or not, returns 1 for aggregated or 0 for not aggregated in the result set.
|
|
2234
|
+
|
|
2235
|
+
.. versionadded:: 2.0.0
|
|
2236
|
+
|
|
2237
|
+
.. versionchanged:: 3.4.0
|
|
2238
|
+
Supports Spark Connect.
|
|
2239
|
+
|
|
2240
|
+
Parameters
|
|
2241
|
+
----------
|
|
2242
|
+
col : :class:`~pyspark.sql.Column` or str
|
|
2243
|
+
column to check if it's aggregated.
|
|
2244
|
+
|
|
2245
|
+
Returns
|
|
2246
|
+
-------
|
|
2247
|
+
:class:`~pyspark.sql.Column`
|
|
2248
|
+
returns 1 for aggregated or 0 for not aggregated in the result set.
|
|
2249
|
+
|
|
2250
|
+
Examples
|
|
2251
|
+
--------
|
|
2252
|
+
>>> df = spark.createDataFrame([("Alice", 2), ("Bob", 5)], ("name", "age"))
|
|
2253
|
+
>>> df.cube("name").agg(grouping("name"), sum("age")).orderBy("name").show()
|
|
2254
|
+
+-----+--------------+--------+
|
|
2255
|
+
| name|grouping(name)|sum(age)|
|
|
2256
|
+
+-----+--------------+--------+
|
|
2257
|
+
| NULL| 1| 7|
|
|
2258
|
+
|Alice| 0| 2|
|
|
2259
|
+
| Bob| 0| 5|
|
|
2260
|
+
+-----+--------------+--------+
|
|
2261
|
+
"""
|
|
2262
|
+
return Column.invoke_anonymous_function(col, "grouping")
|
|
2263
|
+
|
|
2264
|
+
|
|
2265
|
+
@meta(unsupported_engines="*")
|
|
2266
|
+
def histogram_numeric(col: ColumnOrName, nBins: ColumnOrName) -> Column:
|
|
2267
|
+
"""Computes a histogram on numeric 'col' using nb bins.
|
|
2268
|
+
The return value is an array of (x,y) pairs representing the centers of the
|
|
2269
|
+
histogram's bins. As the value of 'nb' is increased, the histogram approximation
|
|
2270
|
+
gets finer-grained, but may yield artifacts around outliers. In practice, 20-40
|
|
2271
|
+
histogram bins appear to work well, with more bins being required for skewed or
|
|
2272
|
+
smaller datasets. Note that this function creates a histogram with non-uniform
|
|
2273
|
+
bin widths. It offers no guarantees in terms of the mean-squared-error of the
|
|
2274
|
+
histogram, but in practice is comparable to the histograms produced by the R/S-Plus
|
|
2275
|
+
statistical computing packages. Note: the output type of the 'x' field in the return value is
|
|
2276
|
+
propagated from the input value consumed in the aggregate function.
|
|
2277
|
+
|
|
2278
|
+
.. versionadded:: 3.5.0
|
|
2279
|
+
|
|
2280
|
+
Parameters
|
|
2281
|
+
----------
|
|
2282
|
+
col : :class:`~pyspark.sql.Column` or str
|
|
2283
|
+
target column to work on.
|
|
2284
|
+
nBins : :class:`~pyspark.sql.Column` or str
|
|
2285
|
+
number of Histogram columns.
|
|
2286
|
+
|
|
2287
|
+
Returns
|
|
2288
|
+
-------
|
|
2289
|
+
:class:`~pyspark.sql.Column`
|
|
2290
|
+
a histogram on numeric 'col' using nb bins.
|
|
2291
|
+
|
|
2292
|
+
Examples
|
|
2293
|
+
--------
|
|
2294
|
+
>>> df = spark.createDataFrame([("a", 1),
|
|
2295
|
+
... ("a", 2),
|
|
2296
|
+
... ("a", 3),
|
|
2297
|
+
... ("b", 8),
|
|
2298
|
+
... ("b", 2)], ["c1", "c2"])
|
|
2299
|
+
>>> df.select(histogram_numeric('c2', lit(5))).show()
|
|
2300
|
+
+------------------------+
|
|
2301
|
+
|histogram_numeric(c2, 5)|
|
|
2302
|
+
+------------------------+
|
|
2303
|
+
| [{1, 1.0}, {2, 1....|
|
|
2304
|
+
+------------------------+
|
|
2305
|
+
"""
|
|
2306
|
+
return Column.invoke_anonymous_function(col, "histogram_numeric", nBins)
|
|
2307
|
+
|
|
2308
|
+
|
|
2309
|
+
@meta(unsupported_engines="*")
|
|
2310
|
+
def hll_sketch_agg(col: ColumnOrName, lgConfigK: t.Optional[t.Union[int, Column]] = None) -> Column:
|
|
2311
|
+
"""
|
|
2312
|
+
Aggregate function: returns the updatable binary representation of the Datasketches
|
|
2313
|
+
HllSketch configured with lgConfigK arg.
|
|
2314
|
+
|
|
2315
|
+
.. versionadded:: 3.5.0
|
|
2316
|
+
|
|
2317
|
+
Parameters
|
|
2318
|
+
----------
|
|
2319
|
+
col : :class:`~pyspark.sql.Column` or str or int
|
|
2320
|
+
lgConfigK : int, optional
|
|
2321
|
+
The log-base-2 of K, where K is the number of buckets or slots for the HllSketch
|
|
2322
|
+
|
|
2323
|
+
Returns
|
|
2324
|
+
-------
|
|
2325
|
+
:class:`~pyspark.sql.Column`
|
|
2326
|
+
The binary representation of the HllSketch.
|
|
2327
|
+
|
|
2328
|
+
Examples
|
|
2329
|
+
--------
|
|
2330
|
+
>>> df = spark.createDataFrame([1,2,2,3], "INT")
|
|
2331
|
+
>>> df1 = df.agg(hll_sketch_estimate(hll_sketch_agg("value")).alias("distinct_cnt"))
|
|
2332
|
+
>>> df1.show()
|
|
2333
|
+
+------------+
|
|
2334
|
+
|distinct_cnt|
|
|
2335
|
+
+------------+
|
|
2336
|
+
| 3|
|
|
2337
|
+
+------------+
|
|
2338
|
+
>>> df2 = df.agg(hll_sketch_estimate(
|
|
2339
|
+
... hll_sketch_agg("value", lit(12))
|
|
2340
|
+
... ).alias("distinct_cnt"))
|
|
2341
|
+
>>> df2.show()
|
|
2342
|
+
+------------+
|
|
2343
|
+
|distinct_cnt|
|
|
2344
|
+
+------------+
|
|
2345
|
+
| 3|
|
|
2346
|
+
+------------+
|
|
2347
|
+
>>> df3 = df.agg(hll_sketch_estimate(
|
|
2348
|
+
... hll_sketch_agg(col("value"), lit(12))).alias("distinct_cnt"))
|
|
2349
|
+
>>> df3.show()
|
|
2350
|
+
+------------+
|
|
2351
|
+
|distinct_cnt|
|
|
2352
|
+
+------------+
|
|
2353
|
+
| 3|
|
|
2354
|
+
+------------+
|
|
2355
|
+
"""
|
|
2356
|
+
if lgConfigK is None:
|
|
2357
|
+
return Column.invoke_anonymous_function(col, "hll_sketch_agg")
|
|
2358
|
+
else:
|
|
2359
|
+
_lgConfigK = lit(lgConfigK) if isinstance(lgConfigK, int) else lgConfigK
|
|
2360
|
+
return Column.invoke_anonymous_function(col, "hll_sketch_agg", _lgConfigK)
|
|
2361
|
+
|
|
2362
|
+
|
|
2363
|
+
@meta(unsupported_engines="*")
|
|
2364
|
+
def hll_sketch_estimate(col: ColumnOrName) -> Column:
|
|
2365
|
+
"""
|
|
2366
|
+
Returns the estimated number of unique values given the binary representation
|
|
2367
|
+
of a Datasketches HllSketch.
|
|
2368
|
+
|
|
2369
|
+
.. versionadded:: 3.5.0
|
|
2370
|
+
|
|
2371
|
+
Parameters
|
|
2372
|
+
----------
|
|
2373
|
+
col : :class:`~pyspark.sql.Column` or str
|
|
2374
|
+
|
|
2375
|
+
Returns
|
|
2376
|
+
-------
|
|
2377
|
+
:class:`~pyspark.sql.Column`
|
|
2378
|
+
The estimated number of unique values for the HllSketch.
|
|
2379
|
+
|
|
2380
|
+
Examples
|
|
2381
|
+
--------
|
|
2382
|
+
>>> df = spark.createDataFrame([1,2,2,3], "INT")
|
|
2383
|
+
>>> df = df.agg(hll_sketch_estimate(hll_sketch_agg("value")).alias("distinct_cnt"))
|
|
2384
|
+
>>> df.show()
|
|
2385
|
+
+------------+
|
|
2386
|
+
|distinct_cnt|
|
|
2387
|
+
+------------+
|
|
2388
|
+
| 3|
|
|
2389
|
+
+------------+
|
|
2390
|
+
"""
|
|
2391
|
+
return Column.invoke_anonymous_function(col, "hll_sketch_estimate")
|
|
2392
|
+
|
|
2393
|
+
|
|
2394
|
+
@meta(unsupported_engines="*")
|
|
2395
|
+
def hll_union(
|
|
2396
|
+
col1: ColumnOrName, col2: ColumnOrName, allowDifferentLgConfigK: t.Optional[bool] = None
|
|
2397
|
+
) -> Column:
|
|
2398
|
+
"""
|
|
2399
|
+
Merges two binary representations of Datasketches HllSketch objects, using a
|
|
2400
|
+
Datasketches Union object. Throws an exception if sketches have different
|
|
2401
|
+
lgConfigK values and allowDifferentLgConfigK is unset or set to false.
|
|
2402
|
+
|
|
2403
|
+
.. versionadded:: 3.5.0
|
|
2404
|
+
|
|
2405
|
+
Parameters
|
|
2406
|
+
----------
|
|
2407
|
+
col1 : :class:`~pyspark.sql.Column` or str
|
|
2408
|
+
col2 : :class:`~pyspark.sql.Column` or str
|
|
2409
|
+
allowDifferentLgConfigK : bool, optional
|
|
2410
|
+
Allow sketches with different lgConfigK values to be merged (defaults to false).
|
|
2411
|
+
|
|
2412
|
+
Returns
|
|
2413
|
+
-------
|
|
2414
|
+
:class:`~pyspark.sql.Column`
|
|
2415
|
+
The binary representation of the merged HllSketch.
|
|
2416
|
+
|
|
2417
|
+
Examples
|
|
2418
|
+
--------
|
|
2419
|
+
>>> df = spark.createDataFrame([(1,4),(2,5),(2,5),(3,6)], "struct<v1:int,v2:int>")
|
|
2420
|
+
>>> df = df.agg(hll_sketch_agg("v1").alias("sketch1"), hll_sketch_agg("v2").alias("sketch2"))
|
|
2421
|
+
>>> df = df.withColumn("distinct_cnt", hll_sketch_estimate(hll_union("sketch1", "sketch2")))
|
|
2422
|
+
>>> df.drop("sketch1", "sketch2").show()
|
|
2423
|
+
+------------+
|
|
2424
|
+
|distinct_cnt|
|
|
2425
|
+
+------------+
|
|
2426
|
+
| 6|
|
|
2427
|
+
+------------+
|
|
2428
|
+
"""
|
|
2429
|
+
if allowDifferentLgConfigK is not None:
|
|
2430
|
+
allowDifferentLgConfigK = (
|
|
2431
|
+
lit(allowDifferentLgConfigK)
|
|
2432
|
+
if isinstance(allowDifferentLgConfigK, bool)
|
|
2433
|
+
else allowDifferentLgConfigK
|
|
2434
|
+
)
|
|
2435
|
+
return Column.invoke_anonymous_function(col1, "hll_union", col2, allowDifferentLgConfigK) # type: ignore
|
|
2436
|
+
else:
|
|
2437
|
+
return Column.invoke_anonymous_function(col1, "hll_union", col2)
|
|
2438
|
+
|
|
2439
|
+
|
|
2440
|
+
@meta(unsupported_engines="*")
|
|
2441
|
+
def hll_union_agg(
|
|
2442
|
+
col: ColumnOrName, allowDifferentLgConfigK: t.Optional[t.Union[bool, Column]] = None
|
|
2443
|
+
) -> Column:
|
|
2444
|
+
"""
|
|
2445
|
+
Aggregate function: returns the updatable binary representation of the Datasketches
|
|
2446
|
+
HllSketch, generated by merging previously created Datasketches HllSketch instances
|
|
2447
|
+
via a Datasketches Union instance. Throws an exception if sketches have different
|
|
2448
|
+
lgConfigK values and allowDifferentLgConfigK is unset or set to false.
|
|
2449
|
+
|
|
2450
|
+
.. versionadded:: 3.5.0
|
|
2451
|
+
|
|
2452
|
+
Parameters
|
|
2453
|
+
----------
|
|
2454
|
+
col : :class:`~pyspark.sql.Column` or str or bool
|
|
2455
|
+
allowDifferentLgConfigK : bool, optional
|
|
2456
|
+
Allow sketches with different lgConfigK values to be merged (defaults to false).
|
|
2457
|
+
|
|
2458
|
+
Returns
|
|
2459
|
+
-------
|
|
2460
|
+
:class:`~pyspark.sql.Column`
|
|
2461
|
+
The binary representation of the merged HllSketch.
|
|
2462
|
+
|
|
2463
|
+
Examples
|
|
2464
|
+
--------
|
|
2465
|
+
>>> df1 = spark.createDataFrame([1,2,2,3], "INT")
|
|
2466
|
+
>>> df1 = df1.agg(hll_sketch_agg("value").alias("sketch"))
|
|
2467
|
+
>>> df2 = spark.createDataFrame([4,5,5,6], "INT")
|
|
2468
|
+
>>> df2 = df2.agg(hll_sketch_agg("value").alias("sketch"))
|
|
2469
|
+
>>> df3 = df1.union(df2).agg(hll_sketch_estimate(
|
|
2470
|
+
... hll_union_agg("sketch")
|
|
2471
|
+
... ).alias("distinct_cnt"))
|
|
2472
|
+
>>> df3.drop("sketch").show()
|
|
2473
|
+
+------------+
|
|
2474
|
+
|distinct_cnt|
|
|
2475
|
+
+------------+
|
|
2476
|
+
| 6|
|
|
2477
|
+
+------------+
|
|
2478
|
+
>>> df4 = df1.union(df2).agg(hll_sketch_estimate(
|
|
2479
|
+
... hll_union_agg("sketch", lit(False))
|
|
2480
|
+
... ).alias("distinct_cnt"))
|
|
2481
|
+
>>> df4.drop("sketch").show()
|
|
2482
|
+
+------------+
|
|
2483
|
+
|distinct_cnt|
|
|
2484
|
+
+------------+
|
|
2485
|
+
| 6|
|
|
2486
|
+
+------------+
|
|
2487
|
+
>>> df5 = df1.union(df2).agg(hll_sketch_estimate(
|
|
2488
|
+
... hll_union_agg(col("sketch"), lit(False))
|
|
2489
|
+
... ).alias("distinct_cnt"))
|
|
2490
|
+
>>> df5.drop("sketch").show()
|
|
2491
|
+
+------------+
|
|
2492
|
+
|distinct_cnt|
|
|
2493
|
+
+------------+
|
|
2494
|
+
| 6|
|
|
2495
|
+
+------------+
|
|
2496
|
+
"""
|
|
2497
|
+
if allowDifferentLgConfigK is None:
|
|
2498
|
+
return Column.invoke_anonymous_function(col, "hll_union_agg")
|
|
2499
|
+
else:
|
|
2500
|
+
_allowDifferentLgConfigK = (
|
|
2501
|
+
lit(allowDifferentLgConfigK)
|
|
2502
|
+
if isinstance(allowDifferentLgConfigK, bool)
|
|
2503
|
+
else allowDifferentLgConfigK
|
|
2504
|
+
)
|
|
2505
|
+
return Column.invoke_anonymous_function(col, "hll_union_agg", _allowDifferentLgConfigK)
|
|
2506
|
+
|
|
2507
|
+
|
|
2508
|
+
@meta(unsupported_engines="*")
|
|
2509
|
+
def hours(col: ColumnOrName) -> Column:
|
|
2510
|
+
"""
|
|
2511
|
+
Partition transform function: A transform for timestamps
|
|
2512
|
+
to partition data into hours.
|
|
2513
|
+
|
|
2514
|
+
.. versionadded:: 3.1.0
|
|
2515
|
+
|
|
2516
|
+
.. versionchanged:: 3.4.0
|
|
2517
|
+
Supports Spark Connect.
|
|
2518
|
+
|
|
2519
|
+
Parameters
|
|
2520
|
+
----------
|
|
2521
|
+
col : :class:`~pyspark.sql.Column` or str
|
|
2522
|
+
target date or timestamp column to work on.
|
|
2523
|
+
|
|
2524
|
+
Returns
|
|
2525
|
+
-------
|
|
2526
|
+
:class:`~pyspark.sql.Column`
|
|
2527
|
+
data partitioned by hours.
|
|
2528
|
+
|
|
2529
|
+
Examples
|
|
2530
|
+
--------
|
|
2531
|
+
>>> df.writeTo("catalog.db.table").partitionedBy( # doctest: +SKIP
|
|
2532
|
+
... hours("ts")
|
|
2533
|
+
... ).createOrReplace()
|
|
2534
|
+
|
|
2535
|
+
Notes
|
|
2536
|
+
-----
|
|
2537
|
+
This function can be used only in combination with
|
|
2538
|
+
:py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy`
|
|
2539
|
+
method of the `DataFrameWriterV2`.
|
|
2540
|
+
|
|
2541
|
+
"""
|
|
2542
|
+
return Column.invoke_anonymous_function(col, "hours")
|
|
2543
|
+
|
|
2544
|
+
|
|
2545
|
+
@meta()
|
|
2546
|
+
def ifnull(col1: ColumnOrName, col2: ColumnOrName) -> Column:
|
|
2547
|
+
"""
|
|
2548
|
+
Returns `col2` if `col1` is null, or `col1` otherwise.
|
|
2549
|
+
|
|
2550
|
+
.. versionadded:: 3.5.0
|
|
2551
|
+
|
|
2552
|
+
Parameters
|
|
2553
|
+
----------
|
|
2554
|
+
col1 : :class:`~pyspark.sql.Column` or str
|
|
2555
|
+
col2 : :class:`~pyspark.sql.Column` or str
|
|
2556
|
+
|
|
2557
|
+
Examples
|
|
2558
|
+
--------
|
|
2559
|
+
>>> import pyspark.sql.functions as sf
|
|
2560
|
+
>>> df = spark.createDataFrame([(None,), (1,)], ["e"])
|
|
2561
|
+
>>> df.select(sf.ifnull(df.e, sf.lit(8))).show()
|
|
2562
|
+
+------------+
|
|
2563
|
+
|ifnull(e, 8)|
|
|
2564
|
+
+------------+
|
|
2565
|
+
| 8|
|
|
2566
|
+
| 1|
|
|
2567
|
+
+------------+
|
|
2568
|
+
"""
|
|
2569
|
+
return Column.invoke_expression_over_column(col1, expression.Coalesce, expressions=[col2])
|
|
2570
|
+
|
|
2571
|
+
|
|
2572
|
+
@meta(unsupported_engines="*")
|
|
2573
|
+
def ilike(
|
|
2574
|
+
str: ColumnOrName, pattern: ColumnOrName, escapeChar: t.Optional["Column"] = None
|
|
2575
|
+
) -> Column:
|
|
2576
|
+
"""
|
|
2577
|
+
Returns true if str matches `pattern` with `escape` case-insensitively,
|
|
2578
|
+
null if any arguments are null, false otherwise.
|
|
2579
|
+
The default escape character is the '\'.
|
|
2580
|
+
|
|
2581
|
+
.. versionadded:: 3.5.0
|
|
2582
|
+
|
|
2583
|
+
Parameters
|
|
2584
|
+
----------
|
|
2585
|
+
str : :class:`~pyspark.sql.Column` or str
|
|
2586
|
+
A string.
|
|
2587
|
+
pattern : :class:`~pyspark.sql.Column` or str
|
|
2588
|
+
A string. The pattern is a string which is matched literally, with
|
|
2589
|
+
exception to the following special symbols:
|
|
2590
|
+
_ matches any one character in the input (similar to . in posix regular expressions)
|
|
2591
|
+
% matches zero or more characters in the input (similar to .* in posix regular
|
|
2592
|
+
expressions)
|
|
2593
|
+
Since Spark 2.0, string literals are unescaped in our SQL parser. For example, in order
|
|
2594
|
+
to match "\abc", the pattern should be "\\abc".
|
|
2595
|
+
When SQL config 'spark.sql.parser.escapedStringLiterals' is enabled, it falls back
|
|
2596
|
+
to Spark 1.6 behavior regarding string literal parsing. For example, if the config is
|
|
2597
|
+
enabled, the pattern to match "\abc" should be "\abc".
|
|
2598
|
+
escape : :class:`~pyspark.sql.Column`
|
|
2599
|
+
An character added since Spark 3.0. The default escape character is the '\'.
|
|
2600
|
+
If an escape character precedes a special symbol or another escape character, the
|
|
2601
|
+
following character is matched literally. It is invalid to escape any other character.
|
|
2602
|
+
|
|
2603
|
+
Examples
|
|
2604
|
+
--------
|
|
2605
|
+
>>> df = spark.createDataFrame([("Spark", "_park")], ['a', 'b'])
|
|
2606
|
+
>>> df.select(ilike(df.a, df.b).alias('r')).collect()
|
|
2607
|
+
[Row(r=True)]
|
|
2608
|
+
|
|
2609
|
+
>>> df = spark.createDataFrame(
|
|
2610
|
+
... [("%SystemDrive%/Users/John", "/%SystemDrive/%//Users%")],
|
|
2611
|
+
... ['a', 'b']
|
|
2612
|
+
... )
|
|
2613
|
+
>>> df.select(ilike(df.a, df.b, lit('/')).alias('r')).collect()
|
|
2614
|
+
[Row(r=True)]
|
|
2615
|
+
"""
|
|
2616
|
+
column = Column.invoke_expression_over_column(str, expression.ILike, expression=pattern)
|
|
2617
|
+
if escapeChar is not None:
|
|
2618
|
+
return Column(
|
|
2619
|
+
expression.Escape(
|
|
2620
|
+
this=column.expression,
|
|
2621
|
+
expression=Column.ensure_col(escapeChar).expression,
|
|
2622
|
+
)
|
|
2623
|
+
)
|
|
2624
|
+
return column
|
|
2625
|
+
|
|
2626
|
+
|
|
2627
|
+
@meta(unsupported_engines="*")
|
|
2628
|
+
def inline(col: ColumnOrName) -> Column:
|
|
2629
|
+
"""
|
|
2630
|
+
Explodes an array of structs into a table.
|
|
2631
|
+
|
|
2632
|
+
.. versionadded:: 3.4.0
|
|
2633
|
+
|
|
2634
|
+
Parameters
|
|
2635
|
+
----------
|
|
2636
|
+
col : :class:`~pyspark.sql.Column` or str
|
|
2637
|
+
input column of values to explode.
|
|
2638
|
+
|
|
2639
|
+
Returns
|
|
2640
|
+
-------
|
|
2641
|
+
:class:`~pyspark.sql.Column`
|
|
2642
|
+
generator expression with the inline exploded result.
|
|
2643
|
+
|
|
2644
|
+
See Also
|
|
2645
|
+
--------
|
|
2646
|
+
:meth:`explode`
|
|
2647
|
+
|
|
2648
|
+
Notes
|
|
2649
|
+
-----
|
|
2650
|
+
Supports Spark Connect.
|
|
2651
|
+
|
|
2652
|
+
Examples
|
|
2653
|
+
--------
|
|
2654
|
+
>>> from pyspark.sql import Row
|
|
2655
|
+
>>> df = spark.createDataFrame([Row(structlist=[Row(a=1, b=2), Row(a=3, b=4)])])
|
|
2656
|
+
>>> df.select(inline(df.structlist)).show()
|
|
2657
|
+
+---+---+
|
|
2658
|
+
| a| b|
|
|
2659
|
+
+---+---+
|
|
2660
|
+
| 1| 2|
|
|
2661
|
+
| 3| 4|
|
|
2662
|
+
+---+---+
|
|
2663
|
+
"""
|
|
2664
|
+
return Column.invoke_anonymous_function(col, "inline")
|
|
2665
|
+
|
|
2666
|
+
|
|
2667
|
+
@meta(unsupported_engines="*")
|
|
2668
|
+
def inline_outer(col: ColumnOrName) -> Column:
|
|
2669
|
+
"""
|
|
2670
|
+
Explodes an array of structs into a table.
|
|
2671
|
+
Unlike inline, if the array is null or empty then null is produced for each nested column.
|
|
2672
|
+
|
|
2673
|
+
.. versionadded:: 3.4.0
|
|
2674
|
+
|
|
2675
|
+
Parameters
|
|
2676
|
+
----------
|
|
2677
|
+
col : :class:`~pyspark.sql.Column` or str
|
|
2678
|
+
input column of values to explode.
|
|
2679
|
+
|
|
2680
|
+
Returns
|
|
2681
|
+
-------
|
|
2682
|
+
:class:`~pyspark.sql.Column`
|
|
2683
|
+
generator expression with the inline exploded result.
|
|
2684
|
+
|
|
2685
|
+
See Also
|
|
2686
|
+
--------
|
|
2687
|
+
:meth:`explode_outer`
|
|
2688
|
+
:meth:`inline`
|
|
2689
|
+
|
|
2690
|
+
Notes
|
|
2691
|
+
-----
|
|
2692
|
+
Supports Spark Connect.
|
|
2693
|
+
|
|
2694
|
+
Examples
|
|
2695
|
+
--------
|
|
2696
|
+
>>> from pyspark.sql import Row
|
|
2697
|
+
>>> df = spark.createDataFrame([
|
|
2698
|
+
... Row(id=1, structlist=[Row(a=1, b=2), Row(a=3, b=4)]),
|
|
2699
|
+
... Row(id=2, structlist=[])
|
|
2700
|
+
... ])
|
|
2701
|
+
>>> df.select('id', inline_outer(df.structlist)).show()
|
|
2702
|
+
+---+----+----+
|
|
2703
|
+
| id| a| b|
|
|
2704
|
+
+---+----+----+
|
|
2705
|
+
| 1| 1| 2|
|
|
2706
|
+
| 1| 3| 4|
|
|
2707
|
+
| 2|NULL|NULL|
|
|
2708
|
+
+---+----+----+
|
|
2709
|
+
"""
|
|
2710
|
+
return Column.invoke_anonymous_function(col, "inline_outer")
|
|
2711
|
+
|
|
2712
|
+
|
|
2713
|
+
@meta(unsupported_engines="*")
|
|
2714
|
+
def isnotnull(col: ColumnOrName) -> Column:
|
|
2715
|
+
"""
|
|
2716
|
+
Returns true if `col` is not null, or false otherwise.
|
|
2717
|
+
|
|
2718
|
+
.. versionadded:: 3.5.0
|
|
2719
|
+
|
|
2720
|
+
Parameters
|
|
2721
|
+
----------
|
|
2722
|
+
col : :class:`~pyspark.sql.Column` or str
|
|
2723
|
+
|
|
2724
|
+
Examples
|
|
2725
|
+
--------
|
|
2726
|
+
>>> df = spark.createDataFrame([(None,), (1,)], ["e"])
|
|
2727
|
+
>>> df.select(isnotnull(df.e).alias('r')).collect()
|
|
2728
|
+
[Row(r=False), Row(r=True)]
|
|
2729
|
+
"""
|
|
2730
|
+
return Column.invoke_anonymous_function(col, "isnotnull")
|
|
2731
|
+
|
|
2732
|
+
|
|
2733
|
+
@meta(unsupported_engines="*")
|
|
2734
|
+
def java_method(*cols: ColumnOrName) -> Column:
|
|
2735
|
+
"""
|
|
2736
|
+
Calls a method with reflection.
|
|
2737
|
+
|
|
2738
|
+
.. versionadded:: 3.5.0
|
|
2739
|
+
|
|
2740
|
+
Parameters
|
|
2741
|
+
----------
|
|
2742
|
+
cols : :class:`~pyspark.sql.Column` or str
|
|
2743
|
+
the first element should be a literal string for the class name,
|
|
2744
|
+
and the second element should be a literal string for the method name,
|
|
2745
|
+
and the remaining are input arguments to the Java method.
|
|
2746
|
+
|
|
2747
|
+
Examples
|
|
2748
|
+
--------
|
|
2749
|
+
>>> import pyspark.sql.functions as sf
|
|
2750
|
+
>>> spark.range(1).select(
|
|
2751
|
+
... sf.java_method(
|
|
2752
|
+
... sf.lit("java.util.UUID"),
|
|
2753
|
+
... sf.lit("fromString"),
|
|
2754
|
+
... sf.lit("a5cf6c42-0c85-418f-af6c-3e4e5b1328f2")
|
|
2755
|
+
... )
|
|
2756
|
+
... ).show(truncate=False)
|
|
2757
|
+
+-----------------------------------------------------------------------------+
|
|
2758
|
+
|java_method(java.util.UUID, fromString, a5cf6c42-0c85-418f-af6c-3e4e5b1328f2)|
|
|
2759
|
+
+-----------------------------------------------------------------------------+
|
|
2760
|
+
|a5cf6c42-0c85-418f-af6c-3e4e5b1328f2 |
|
|
2761
|
+
+-----------------------------------------------------------------------------+
|
|
2762
|
+
"""
|
|
2763
|
+
cols = ensure_list(cols) # type: ignore
|
|
2764
|
+
if len(cols) > 1:
|
|
2765
|
+
return Column.invoke_anonymous_function(cols[0], "java_method", *cols[1:])
|
|
2766
|
+
return Column.invoke_anonymous_function(cols[0], "java_method")
|
|
2767
|
+
|
|
2768
|
+
|
|
2769
|
+
@meta(unsupported_engines="*")
|
|
2770
|
+
def json_array_length(col: ColumnOrName) -> Column:
|
|
2771
|
+
"""
|
|
2772
|
+
Returns the number of elements in the outermost JSON array. `NULL` is returned in case of
|
|
2773
|
+
any other valid JSON string, `NULL` or an invalid JSON.
|
|
2774
|
+
|
|
2775
|
+
.. versionadded:: 3.5.0
|
|
2776
|
+
|
|
2777
|
+
Parameters
|
|
2778
|
+
----------
|
|
2779
|
+
col: :class:`~pyspark.sql.Column` or str
|
|
2780
|
+
target column to compute on.
|
|
2781
|
+
|
|
2782
|
+
Returns
|
|
2783
|
+
-------
|
|
2784
|
+
:class:`~pyspark.sql.Column`
|
|
2785
|
+
length of json array.
|
|
2786
|
+
|
|
2787
|
+
Examples
|
|
2788
|
+
--------
|
|
2789
|
+
>>> df = spark.createDataFrame([(None,), ('[1, 2, 3]',), ('[]',)], ['data'])
|
|
2790
|
+
>>> df.select(json_array_length(df.data).alias('r')).collect()
|
|
2791
|
+
[Row(r=None), Row(r=3), Row(r=0)]
|
|
2792
|
+
"""
|
|
2793
|
+
return Column.invoke_anonymous_function(col, "json_array_length")
|
|
2794
|
+
|
|
2795
|
+
|
|
2796
|
+
@meta(unsupported_engines="*")
|
|
2797
|
+
def json_object_keys(col: ColumnOrName) -> Column:
|
|
2798
|
+
"""
|
|
2799
|
+
Returns all the keys of the outermost JSON object as an array. If a valid JSON object is
|
|
2800
|
+
given, all the keys of the outermost object will be returned as an array. If it is any
|
|
2801
|
+
other valid JSON string, an invalid JSON string or an empty string, the function returns null.
|
|
2802
|
+
|
|
2803
|
+
.. versionadded:: 3.5.0
|
|
2804
|
+
|
|
2805
|
+
Parameters
|
|
2806
|
+
----------
|
|
2807
|
+
col: :class:`~pyspark.sql.Column` or str
|
|
2808
|
+
target column to compute on.
|
|
2809
|
+
|
|
2810
|
+
Returns
|
|
2811
|
+
-------
|
|
2812
|
+
:class:`~pyspark.sql.Column`
|
|
2813
|
+
all the keys of the outermost JSON object.
|
|
2814
|
+
|
|
2815
|
+
Examples
|
|
2816
|
+
--------
|
|
2817
|
+
>>> df = spark.createDataFrame([(None,), ('{}',), ('{"key1":1, "key2":2}',)], ['data'])
|
|
2818
|
+
>>> df.select(json_object_keys(df.data).alias('r')).collect()
|
|
2819
|
+
[Row(r=None), Row(r=[]), Row(r=['key1', 'key2'])]
|
|
2820
|
+
"""
|
|
2821
|
+
return Column.invoke_anonymous_function(col, "json_object_keys")
|
|
2822
|
+
|
|
2823
|
+
|
|
2824
|
+
@meta(unsupported_engines="*")
|
|
2825
|
+
def last_value(col: ColumnOrName, ignoreNulls: t.Optional[t.Union[bool, Column]] = None) -> Column:
|
|
2826
|
+
"""Returns the last value of `col` for a group of rows. It will return the last non-null
|
|
2827
|
+
value it sees when `ignoreNulls` is set to true. If all values are null, then null is returned.
|
|
2828
|
+
|
|
2829
|
+
.. versionadded:: 3.5.0
|
|
2830
|
+
|
|
2831
|
+
Parameters
|
|
2832
|
+
----------
|
|
2833
|
+
col : :class:`~pyspark.sql.Column` or str
|
|
2834
|
+
target column to work on.
|
|
2835
|
+
ignorenulls : :class:`~pyspark.sql.Column` or bool
|
|
2836
|
+
if first value is null then look for first non-null value.
|
|
2837
|
+
|
|
2838
|
+
Returns
|
|
2839
|
+
-------
|
|
2840
|
+
:class:`~pyspark.sql.Column`
|
|
2841
|
+
some value of `col` for a group of rows.
|
|
2842
|
+
|
|
2843
|
+
Examples
|
|
2844
|
+
--------
|
|
2845
|
+
>>> import pyspark.sql.functions as sf
|
|
2846
|
+
>>> spark.createDataFrame(
|
|
2847
|
+
... [("a", 1), ("a", 2), ("a", 3), ("b", 8), (None, 2)], ["a", "b"]
|
|
2848
|
+
... ).select(sf.last_value('a'), sf.last_value('b')).show()
|
|
2849
|
+
+-------------+-------------+
|
|
2850
|
+
|last_value(a)|last_value(b)|
|
|
2851
|
+
+-------------+-------------+
|
|
2852
|
+
| NULL| 2|
|
|
2853
|
+
+-------------+-------------+
|
|
2854
|
+
|
|
2855
|
+
>>> import pyspark.sql.functions as sf
|
|
2856
|
+
>>> spark.createDataFrame(
|
|
2857
|
+
... [("a", 1), ("a", 2), ("a", 3), ("b", 8), (None, 2)], ["a", "b"]
|
|
2858
|
+
... ).select(sf.last_value('a', True), sf.last_value('b', True)).show()
|
|
2859
|
+
+-------------+-------------+
|
|
2860
|
+
|last_value(a)|last_value(b)|
|
|
2861
|
+
+-------------+-------------+
|
|
2862
|
+
| b| 2|
|
|
2863
|
+
+-------------+-------------+
|
|
2864
|
+
"""
|
|
2865
|
+
column = Column.invoke_expression_over_column(col, expression.LastValue)
|
|
2866
|
+
|
|
2867
|
+
if ignoreNulls:
|
|
2868
|
+
return Column(expression.IgnoreNulls(this=column.expression))
|
|
2869
|
+
return column
|
|
2870
|
+
|
|
2871
|
+
|
|
2872
|
+
@meta()
|
|
2873
|
+
def lcase(str: ColumnOrName) -> Column:
|
|
2874
|
+
"""
|
|
2875
|
+
Returns `str` with all characters changed to lowercase.
|
|
2876
|
+
|
|
2877
|
+
.. versionadded:: 3.5.0
|
|
2878
|
+
|
|
2879
|
+
Parameters
|
|
2880
|
+
----------
|
|
2881
|
+
str : :class:`~pyspark.sql.Column` or str
|
|
2882
|
+
Input column or strings.
|
|
2883
|
+
|
|
2884
|
+
Examples
|
|
2885
|
+
--------
|
|
2886
|
+
>>> import pyspark.sql.functions as sf
|
|
2887
|
+
>>> spark.range(1).select(sf.lcase(sf.lit("Spark"))).show()
|
|
2888
|
+
+------------+
|
|
2889
|
+
|lcase(Spark)|
|
|
2890
|
+
+------------+
|
|
2891
|
+
| spark|
|
|
2892
|
+
+------------+
|
|
2893
|
+
"""
|
|
2894
|
+
return Column.invoke_expression_over_column(str, expression.Lower)
|
|
2895
|
+
|
|
2896
|
+
|
|
2897
|
+
@meta()
|
|
2898
|
+
def left(str: ColumnOrName, len: ColumnOrName) -> Column:
|
|
2899
|
+
"""
|
|
2900
|
+
Returns the leftmost `len`(`len` can be string type) characters from the string `str`,
|
|
2901
|
+
if `len` is less or equal than 0 the result is an empty string.
|
|
2902
|
+
|
|
2903
|
+
.. versionadded:: 3.5.0
|
|
2904
|
+
|
|
2905
|
+
Parameters
|
|
2906
|
+
----------
|
|
2907
|
+
str : :class:`~pyspark.sql.Column` or str
|
|
2908
|
+
Input column or strings.
|
|
2909
|
+
len : :class:`~pyspark.sql.Column` or str
|
|
2910
|
+
Input column or strings, the leftmost `len`.
|
|
2911
|
+
|
|
2912
|
+
Examples
|
|
2913
|
+
--------
|
|
2914
|
+
>>> df = spark.createDataFrame([("Spark SQL", 3,)], ['a', 'b'])
|
|
2915
|
+
>>> df.select(left(df.a, df.b).alias('r')).collect()
|
|
2916
|
+
[Row(r='Spa')]
|
|
2917
|
+
"""
|
|
2918
|
+
return Column.invoke_expression_over_column(str, expression.Left, expression=len)
|
|
2919
|
+
|
|
2920
|
+
|
|
2921
|
+
@meta(unsupported_engines="*")
|
|
2922
|
+
def like(
|
|
2923
|
+
str: ColumnOrName, pattern: ColumnOrName, escapeChar: t.Optional["Column"] = None
|
|
2924
|
+
) -> Column:
|
|
2925
|
+
"""
|
|
2926
|
+
Returns true if str matches `pattern` with `escape`,
|
|
2927
|
+
null if any arguments are null, false otherwise.
|
|
2928
|
+
The default escape character is the '\'.
|
|
2929
|
+
|
|
2930
|
+
.. versionadded:: 3.5.0
|
|
2931
|
+
|
|
2932
|
+
Parameters
|
|
2933
|
+
----------
|
|
2934
|
+
str : :class:`~pyspark.sql.Column` or str
|
|
2935
|
+
A string.
|
|
2936
|
+
pattern : :class:`~pyspark.sql.Column` or str
|
|
2937
|
+
A string. The pattern is a string which is matched literally, with
|
|
2938
|
+
exception to the following special symbols:
|
|
2939
|
+
_ matches any one character in the input (similar to . in posix regular expressions)
|
|
2940
|
+
% matches zero or more characters in the input (similar to .* in posix regular
|
|
2941
|
+
expressions)
|
|
2942
|
+
Since Spark 2.0, string literals are unescaped in our SQL parser. For example, in order
|
|
2943
|
+
to match "\abc", the pattern should be "\\abc".
|
|
2944
|
+
When SQL config 'spark.sql.parser.escapedStringLiterals' is enabled, it falls back
|
|
2945
|
+
to Spark 1.6 behavior regarding string literal parsing. For example, if the config is
|
|
2946
|
+
enabled, the pattern to match "\abc" should be "\abc".
|
|
2947
|
+
escape : :class:`~pyspark.sql.Column`
|
|
2948
|
+
An character added since Spark 3.0. The default escape character is the '\'.
|
|
2949
|
+
If an escape character precedes a special symbol or another escape character, the
|
|
2950
|
+
following character is matched literally. It is invalid to escape any other character.
|
|
2951
|
+
|
|
2952
|
+
Examples
|
|
2953
|
+
--------
|
|
2954
|
+
>>> df = spark.createDataFrame([("Spark", "_park")], ['a', 'b'])
|
|
2955
|
+
>>> df.select(like(df.a, df.b).alias('r')).collect()
|
|
2956
|
+
[Row(r=True)]
|
|
2957
|
+
|
|
2958
|
+
>>> df = spark.createDataFrame(
|
|
2959
|
+
... [("%SystemDrive%/Users/John", "/%SystemDrive/%//Users%")],
|
|
2960
|
+
... ['a', 'b']
|
|
2961
|
+
... )
|
|
2962
|
+
>>> df.select(like(df.a, df.b, lit('/')).alias('r')).collect()
|
|
2963
|
+
[Row(r=True)]
|
|
2964
|
+
"""
|
|
2965
|
+
column = Column.invoke_expression_over_column(str, expression.Like, expression=pattern)
|
|
2966
|
+
if escapeChar is not None:
|
|
2967
|
+
return Column(
|
|
2968
|
+
expression.Escape(
|
|
2969
|
+
this=column.expression,
|
|
2970
|
+
expression=Column.ensure_col(escapeChar).expression,
|
|
2971
|
+
)
|
|
2972
|
+
)
|
|
2973
|
+
return column
|
|
2974
|
+
|
|
2975
|
+
|
|
2976
|
+
@meta()
|
|
2977
|
+
def ln(col: ColumnOrName) -> Column:
|
|
2978
|
+
"""Returns the natural logarithm of the argument.
|
|
2979
|
+
|
|
2980
|
+
.. versionadded:: 3.5.0
|
|
2981
|
+
|
|
2982
|
+
Parameters
|
|
2983
|
+
----------
|
|
2984
|
+
col : :class:`~pyspark.sql.Column` or str
|
|
2985
|
+
a column to calculate logariphm for.
|
|
2986
|
+
|
|
2987
|
+
Returns
|
|
2988
|
+
-------
|
|
2989
|
+
:class:`~pyspark.sql.Column`
|
|
2990
|
+
natural logarithm of given value.
|
|
2991
|
+
|
|
2992
|
+
Examples
|
|
2993
|
+
--------
|
|
2994
|
+
>>> df = spark.createDataFrame([(4,)], ['a'])
|
|
2995
|
+
>>> df.select(ln('a')).show()
|
|
2996
|
+
+------------------+
|
|
2997
|
+
| ln(a)|
|
|
2998
|
+
+------------------+
|
|
2999
|
+
|1.3862943611198906|
|
|
3000
|
+
+------------------+
|
|
3001
|
+
"""
|
|
3002
|
+
return Column.invoke_expression_over_column(col, expression.Ln)
|
|
3003
|
+
|
|
3004
|
+
|
|
3005
|
+
@meta(unsupported_engines="*")
|
|
3006
|
+
def localtimestamp() -> Column:
|
|
3007
|
+
"""
|
|
3008
|
+
Returns the current timestamp without time zone at the start of query evaluation
|
|
3009
|
+
as a timestamp without time zone column. All calls of localtimestamp within the
|
|
3010
|
+
same query return the same value.
|
|
3011
|
+
|
|
3012
|
+
.. versionadded:: 3.4.0
|
|
3013
|
+
|
|
3014
|
+
.. versionchanged:: 3.4.0
|
|
3015
|
+
Supports Spark Connect.
|
|
3016
|
+
|
|
3017
|
+
Returns
|
|
3018
|
+
-------
|
|
3019
|
+
:class:`~pyspark.sql.Column`
|
|
3020
|
+
current local date and time.
|
|
3021
|
+
|
|
3022
|
+
Examples
|
|
3023
|
+
--------
|
|
3024
|
+
>>> df = spark.range(1)
|
|
3025
|
+
>>> df.select(localtimestamp()).show(truncate=False) # doctest: +SKIP
|
|
3026
|
+
+-----------------------+
|
|
3027
|
+
|localtimestamp() |
|
|
3028
|
+
+-----------------------+
|
|
3029
|
+
|2022-08-26 21:28:34.639|
|
|
3030
|
+
+-----------------------+
|
|
3031
|
+
"""
|
|
3032
|
+
return Column.invoke_anonymous_function(None, "localtimestamp")
|
|
3033
|
+
|
|
3034
|
+
|
|
3035
|
+
@meta(unsupported_engines="*")
|
|
3036
|
+
def make_dt_interval(
|
|
3037
|
+
days: t.Optional[ColumnOrName] = None,
|
|
3038
|
+
hours: t.Optional[ColumnOrName] = None,
|
|
3039
|
+
mins: t.Optional[ColumnOrName] = None,
|
|
3040
|
+
secs: t.Optional[ColumnOrName] = None,
|
|
3041
|
+
) -> Column:
|
|
3042
|
+
"""
|
|
3043
|
+
Make DayTimeIntervalType duration from days, hours, mins and secs.
|
|
3044
|
+
|
|
3045
|
+
.. versionadded:: 3.5.0
|
|
3046
|
+
|
|
3047
|
+
Parameters
|
|
3048
|
+
----------
|
|
3049
|
+
days : :class:`~pyspark.sql.Column` or str
|
|
3050
|
+
the number of days, positive or negative
|
|
3051
|
+
hours : :class:`~pyspark.sql.Column` or str
|
|
3052
|
+
the number of hours, positive or negative
|
|
3053
|
+
mins : :class:`~pyspark.sql.Column` or str
|
|
3054
|
+
the number of minutes, positive or negative
|
|
3055
|
+
secs : :class:`~pyspark.sql.Column` or str
|
|
3056
|
+
the number of seconds with the fractional part in microsecond precision.
|
|
3057
|
+
|
|
3058
|
+
Examples
|
|
3059
|
+
--------
|
|
3060
|
+
>>> df = spark.createDataFrame([[1, 12, 30, 01.001001]],
|
|
3061
|
+
... ["day", "hour", "min", "sec"])
|
|
3062
|
+
>>> df.select(make_dt_interval(
|
|
3063
|
+
... df.day, df.hour, df.min, df.sec).alias('r')
|
|
3064
|
+
... ).show(truncate=False)
|
|
3065
|
+
+------------------------------------------+
|
|
3066
|
+
|r |
|
|
3067
|
+
+------------------------------------------+
|
|
3068
|
+
|INTERVAL '1 12:30:01.001001' DAY TO SECOND|
|
|
3069
|
+
+------------------------------------------+
|
|
3070
|
+
|
|
3071
|
+
>>> df.select(make_dt_interval(
|
|
3072
|
+
... df.day, df.hour, df.min).alias('r')
|
|
3073
|
+
... ).show(truncate=False)
|
|
3074
|
+
+-----------------------------------+
|
|
3075
|
+
|r |
|
|
3076
|
+
+-----------------------------------+
|
|
3077
|
+
|INTERVAL '1 12:30:00' DAY TO SECOND|
|
|
3078
|
+
+-----------------------------------+
|
|
3079
|
+
|
|
3080
|
+
>>> df.select(make_dt_interval(
|
|
3081
|
+
... df.day, df.hour).alias('r')
|
|
3082
|
+
... ).show(truncate=False)
|
|
3083
|
+
+-----------------------------------+
|
|
3084
|
+
|r |
|
|
3085
|
+
+-----------------------------------+
|
|
3086
|
+
|INTERVAL '1 12:00:00' DAY TO SECOND|
|
|
3087
|
+
+-----------------------------------+
|
|
3088
|
+
|
|
3089
|
+
>>> df.select(make_dt_interval(df.day).alias('r')).show(truncate=False)
|
|
3090
|
+
+-----------------------------------+
|
|
3091
|
+
|r |
|
|
3092
|
+
+-----------------------------------+
|
|
3093
|
+
|INTERVAL '1 00:00:00' DAY TO SECOND|
|
|
3094
|
+
+-----------------------------------+
|
|
3095
|
+
|
|
3096
|
+
>>> df.select(make_dt_interval().alias('r')).show(truncate=False)
|
|
3097
|
+
+-----------------------------------+
|
|
3098
|
+
|r |
|
|
3099
|
+
+-----------------------------------+
|
|
3100
|
+
|INTERVAL '0 00:00:00' DAY TO SECOND|
|
|
3101
|
+
+-----------------------------------+
|
|
3102
|
+
"""
|
|
3103
|
+
_days = lit(0) if days is None else days
|
|
3104
|
+
_hours = lit(0) if hours is None else hours
|
|
3105
|
+
_mins = lit(0) if mins is None else mins
|
|
3106
|
+
_secs = lit(decimal.Decimal(0)) if secs is None else secs
|
|
3107
|
+
return Column.invoke_anonymous_function(_days, "make_dt_interval", _hours, _mins, _secs)
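
A sketch of how the defaulted arguments behave, again assuming a Spark-backed SQLFrame session (construction shown schematically, since the "*" marker appears to exclude the other engines):

    from sqlframe.spark import SparkSession
    from sqlframe.spark import functions as F

    session = SparkSession()  # assumed default constructor
    df = session.createDataFrame([(1, 12, 30, 1.001001)], ["day", "hour", "min", "sec"])
    # Omitted fields fall back to lit(0) / lit(Decimal(0)), per the implementation above.
    df.select(F.make_dt_interval(df.day, df.hour).alias("r")).show(truncate=False)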
|
|
3108
|
+
|
|
3109
|
+
|
|
3110
|
+
@meta(unsupported_engines="*")
|
|
3111
|
+
def make_timestamp(
|
|
3112
|
+
years: ColumnOrName,
|
|
3113
|
+
months: ColumnOrName,
|
|
3114
|
+
days: ColumnOrName,
|
|
3115
|
+
hours: ColumnOrName,
|
|
3116
|
+
mins: ColumnOrName,
|
|
3117
|
+
secs: ColumnOrName,
|
|
3118
|
+
timezone: t.Optional[ColumnOrName] = None,
|
|
3119
|
+
) -> Column:
|
|
3120
|
+
"""
|
|
3121
|
+
Create timestamp from years, months, days, hours, mins, secs and timezone fields.
|
|
3122
|
+
The result data type is consistent with the value of configuration `spark.sql.timestampType`.
|
|
3123
|
+
If the configuration `spark.sql.ansi.enabled` is false, the function returns NULL
|
|
3124
|
+
on invalid inputs. Otherwise, it will throw an error instead.
|
|
3125
|
+
|
|
3126
|
+
.. versionadded:: 3.5.0
|
|
3127
|
+
|
|
3128
|
+
Parameters
|
|
3129
|
+
----------
|
|
3130
|
+
years : :class:`~pyspark.sql.Column` or str
|
|
3131
|
+
the year to represent, from 1 to 9999
|
|
3132
|
+
months : :class:`~pyspark.sql.Column` or str
|
|
3133
|
+
the month-of-year to represent, from 1 (January) to 12 (December)
|
|
3134
|
+
days : :class:`~pyspark.sql.Column` or str
|
|
3135
|
+
the day-of-month to represent, from 1 to 31
|
|
3136
|
+
hours : :class:`~pyspark.sql.Column` or str
|
|
3137
|
+
the hour-of-day to represent, from 0 to 23
|
|
3138
|
+
mins : :class:`~pyspark.sql.Column` or str
|
|
3139
|
+
the minute-of-hour to represent, from 0 to 59
|
|
3140
|
+
secs : :class:`~pyspark.sql.Column` or str
|
|
3141
|
+
the second-of-minute and its micro-fraction to represent, from 0 to 60.
|
|
3142
|
+
The value can be either an integer like 13 , or a fraction like 13.123.
|
|
3143
|
+
If the sec argument equals to 60, the seconds field is set
|
|
3144
|
+
to 0 and 1 minute is added to the final timestamp.
|
|
3145
|
+
timezone : :class:`~pyspark.sql.Column` or str
|
|
3146
|
+
the time zone identifier. For example, CET, UTC and etc.
|
|
3147
|
+
|
|
3148
|
+
Examples
|
|
3149
|
+
--------
|
|
3150
|
+
>>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
|
|
3151
|
+
>>> df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887, 'CET']],
|
|
3152
|
+
... ["year", "month", "day", "hour", "min", "sec", "timezone"])
|
|
3153
|
+
>>> df.select(make_timestamp(
|
|
3154
|
+
... df.year, df.month, df.day, df.hour, df.min, df.sec, df.timezone).alias('r')
|
|
3155
|
+
... ).show(truncate=False)
|
|
3156
|
+
+-----------------------+
|
|
3157
|
+
|r |
|
|
3158
|
+
+-----------------------+
|
|
3159
|
+
|2014-12-27 21:30:45.887|
|
|
3160
|
+
+-----------------------+
|
|
3161
|
+
|
|
3162
|
+
>>> df.select(make_timestamp(
|
|
3163
|
+
... df.year, df.month, df.day, df.hour, df.min, df.sec).alias('r')
|
|
3164
|
+
... ).show(truncate=False)
|
|
3165
|
+
+-----------------------+
|
|
3166
|
+
|r |
|
|
3167
|
+
+-----------------------+
|
|
3168
|
+
|2014-12-28 06:30:45.887|
|
|
3169
|
+
+-----------------------+
|
|
3170
|
+
>>> spark.conf.unset("spark.sql.session.timeZone")
|
|
3171
|
+
"""
|
|
3172
|
+
if timezone is not None:
|
|
3173
|
+
return Column.invoke_anonymous_function(
|
|
3174
|
+
years, "make_timestamp", months, days, hours, mins, secs, timezone
|
|
3175
|
+
)
|
|
3176
|
+
else:
|
|
3177
|
+
return Column.invoke_anonymous_function(
|
|
3178
|
+
years, "make_timestamp", months, days, hours, mins, secs
|
|
3179
|
+
)
|
|
3180
|
+
|
|
3181
|
+
|
|
3182
|
+
@meta(unsupported_engines="*")
|
|
3183
|
+
def make_timestamp_ltz(
|
|
3184
|
+
years: ColumnOrName,
|
|
3185
|
+
months: ColumnOrName,
|
|
3186
|
+
days: ColumnOrName,
|
|
3187
|
+
hours: ColumnOrName,
|
|
3188
|
+
mins: ColumnOrName,
|
|
3189
|
+
secs: ColumnOrName,
|
|
3190
|
+
timezone: t.Optional[ColumnOrName] = None,
|
|
3191
|
+
) -> Column:
|
|
3192
|
+
"""
|
|
3193
|
+
Create the current timestamp with local time zone from years, months, days, hours, mins,
|
|
3194
|
+
secs and timezone fields. If the configuration `spark.sql.ansi.enabled` is false,
|
|
3195
|
+
the function returns NULL on invalid inputs. Otherwise, it will throw an error instead.
|
|
3196
|
+
|
|
3197
|
+
.. versionadded:: 3.5.0
|
|
3198
|
+
|
|
3199
|
+
Parameters
|
|
3200
|
+
----------
|
|
3201
|
+
years : :class:`~pyspark.sql.Column` or str
|
|
3202
|
+
the year to represent, from 1 to 9999
|
|
3203
|
+
months : :class:`~pyspark.sql.Column` or str
|
|
3204
|
+
the month-of-year to represent, from 1 (January) to 12 (December)
|
|
3205
|
+
days : :class:`~pyspark.sql.Column` or str
|
|
3206
|
+
the day-of-month to represent, from 1 to 31
|
|
3207
|
+
hours : :class:`~pyspark.sql.Column` or str
|
|
3208
|
+
the hour-of-day to represent, from 0 to 23
|
|
3209
|
+
mins : :class:`~pyspark.sql.Column` or str
|
|
3210
|
+
the minute-of-hour to represent, from 0 to 59
|
|
3211
|
+
secs : :class:`~pyspark.sql.Column` or str
|
|
3212
|
+
the second-of-minute and its micro-fraction to represent, from 0 to 60.
|
|
3213
|
+
The value can be either an integer like 13 , or a fraction like 13.123.
|
|
3214
|
+
If the sec argument equals to 60, the seconds field is set
|
|
3215
|
+
to 0 and 1 minute is added to the final timestamp.
|
|
3216
|
+
timezone : :class:`~pyspark.sql.Column` or str
|
|
3217
|
+
the time zone identifier. For example, CET, UTC and etc.
|
|
3218
|
+
|
|
3219
|
+
Examples
|
|
3220
|
+
--------
|
|
3221
|
+
>>> import pyspark.sql.functions as sf
|
|
3222
|
+
>>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
|
|
3223
|
+
>>> df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887, 'CET']],
|
|
3224
|
+
... ["year", "month", "day", "hour", "min", "sec", "timezone"])
|
|
3225
|
+
>>> df.select(sf.make_timestamp_ltz(
|
|
3226
|
+
... df.year, df.month, df.day, df.hour, df.min, df.sec, df.timezone)
|
|
3227
|
+
... ).show(truncate=False)
|
|
3228
|
+
+--------------------------------------------------------------+
|
|
3229
|
+
|make_timestamp_ltz(year, month, day, hour, min, sec, timezone)|
|
|
3230
|
+
+--------------------------------------------------------------+
|
|
3231
|
+
|2014-12-27 21:30:45.887 |
|
|
3232
|
+
+--------------------------------------------------------------+
|
|
3233
|
+
|
|
3234
|
+
>>> df.select(sf.make_timestamp_ltz(
|
|
3235
|
+
... df.year, df.month, df.day, df.hour, df.min, df.sec)
|
|
3236
|
+
... ).show(truncate=False)
|
|
3237
|
+
+----------------------------------------------------+
|
|
3238
|
+
|make_timestamp_ltz(year, month, day, hour, min, sec)|
|
|
3239
|
+
+----------------------------------------------------+
|
|
3240
|
+
|2014-12-28 06:30:45.887 |
|
|
3241
|
+
+----------------------------------------------------+
|
|
3242
|
+
>>> spark.conf.unset("spark.sql.session.timeZone")
|
|
3243
|
+
"""
|
|
3244
|
+
if timezone is not None:
|
|
3245
|
+
return Column.invoke_anonymous_function(
|
|
3246
|
+
years, "make_timestamp_ltz", months, days, hours, mins, secs, timezone
|
|
3247
|
+
)
|
|
3248
|
+
else:
|
|
3249
|
+
return Column.invoke_anonymous_function(
|
|
3250
|
+
years, "make_timestamp_ltz", months, days, hours, mins, secs
|
|
3251
|
+
)
|
|
3252
|
+
|
|
3253
|
+
|
|
3254
|
+
@meta(unsupported_engines="*")
|
|
3255
|
+
def make_timestamp_ntz(
|
|
3256
|
+
years: ColumnOrName,
|
|
3257
|
+
months: ColumnOrName,
|
|
3258
|
+
days: ColumnOrName,
|
|
3259
|
+
hours: ColumnOrName,
|
|
3260
|
+
mins: ColumnOrName,
|
|
3261
|
+
secs: ColumnOrName,
|
|
3262
|
+
) -> Column:
|
|
3263
|
+
"""
|
|
3264
|
+
Create local date-time from years, months, days, hours, mins, secs fields.
|
|
3265
|
+
If the configuration `spark.sql.ansi.enabled` is false, the function returns NULL
|
|
3266
|
+
on invalid inputs. Otherwise, it will throw an error instead.
|
|
3267
|
+
|
|
3268
|
+
.. versionadded:: 3.5.0
|
|
3269
|
+
|
|
3270
|
+
Parameters
|
|
3271
|
+
----------
|
|
3272
|
+
years : :class:`~pyspark.sql.Column` or str
|
|
3273
|
+
the year to represent, from 1 to 9999
|
|
3274
|
+
months : :class:`~pyspark.sql.Column` or str
|
|
3275
|
+
the month-of-year to represent, from 1 (January) to 12 (December)
|
|
3276
|
+
days : :class:`~pyspark.sql.Column` or str
|
|
3277
|
+
the day-of-month to represent, from 1 to 31
|
|
3278
|
+
hours : :class:`~pyspark.sql.Column` or str
|
|
3279
|
+
the hour-of-day to represent, from 0 to 23
|
|
3280
|
+
mins : :class:`~pyspark.sql.Column` or str
|
|
3281
|
+
the minute-of-hour to represent, from 0 to 59
|
|
3282
|
+
secs : :class:`~pyspark.sql.Column` or str
|
|
3283
|
+
the second-of-minute and its micro-fraction to represent, from 0 to 60.
|
|
3284
|
+
The value can be either an integer like 13 , or a fraction like 13.123.
|
|
3285
|
+
If the sec argument equals to 60, the seconds field is set
|
|
3286
|
+
to 0 and 1 minute is added to the final timestamp.
|
|
3287
|
+
|
|
3288
|
+
Examples
|
|
3289
|
+
--------
|
|
3290
|
+
>>> import pyspark.sql.functions as sf
|
|
3291
|
+
>>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
|
|
3292
|
+
>>> df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887]],
|
|
3293
|
+
... ["year", "month", "day", "hour", "min", "sec"])
|
|
3294
|
+
>>> df.select(sf.make_timestamp_ntz(
|
|
3295
|
+
... df.year, df.month, df.day, df.hour, df.min, df.sec)
|
|
3296
|
+
... ).show(truncate=False)
|
|
3297
|
+
+----------------------------------------------------+
|
|
3298
|
+
|make_timestamp_ntz(year, month, day, hour, min, sec)|
|
|
3299
|
+
+----------------------------------------------------+
|
|
3300
|
+
|2014-12-28 06:30:45.887 |
|
|
3301
|
+
+----------------------------------------------------+
|
|
3302
|
+
>>> spark.conf.unset("spark.sql.session.timeZone")
|
|
3303
|
+
"""
|
|
3304
|
+
return Column.invoke_anonymous_function(
|
|
3305
|
+
years, "make_timestamp_ntz", months, days, hours, mins, secs
|
|
3306
|
+
)
|
|
3307
|
+
|
|
3308
|
+
|
|
3309
|
+
@meta(unsupported_engines="*")
|
|
3310
|
+
def make_ym_interval(
|
|
3311
|
+
years: t.Optional[ColumnOrName] = None,
|
|
3312
|
+
months: t.Optional[ColumnOrName] = None,
|
|
3313
|
+
) -> Column:
|
|
3314
|
+
"""
|
|
3315
|
+
Make year-month interval from years, months.
|
|
3316
|
+
|
|
3317
|
+
.. versionadded:: 3.5.0
|
|
3318
|
+
|
|
3319
|
+
Parameters
|
|
3320
|
+
----------
|
|
3321
|
+
years : :class:`~pyspark.sql.Column` or str
|
|
3322
|
+
the number of years, positive or negative
|
|
3323
|
+
months : :class:`~pyspark.sql.Column` or str
|
|
3324
|
+
the number of months, positive or negative
|
|
3325
|
+
|
|
3326
|
+
Examples
|
|
3327
|
+
--------
|
|
3328
|
+
>>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
|
|
3329
|
+
>>> df = spark.createDataFrame([[2014, 12]], ["year", "month"])
|
|
3330
|
+
>>> df.select(make_ym_interval(df.year, df.month).alias('r')).show(truncate=False)
|
|
3331
|
+
+-------------------------------+
|
|
3332
|
+
|r |
|
|
3333
|
+
+-------------------------------+
|
|
3334
|
+
|INTERVAL '2015-0' YEAR TO MONTH|
|
|
3335
|
+
+-------------------------------+
|
|
3336
|
+
>>> spark.conf.unset("spark.sql.session.timeZone")
|
|
3337
|
+
"""
|
|
3338
|
+
_years = lit(0) if years is None else years
|
|
3339
|
+
_months = lit(0) if months is None else months
|
|
3340
|
+
return Column.invoke_anonymous_function(_years, "make_ym_interval", _months)
|
|
3341
|
+
|
|
3342
|
+
|
|
3343
|
+
@meta(unsupported_engines="*")
|
|
3344
|
+
def map_contains_key(col: ColumnOrName, value: t.Any) -> Column:
|
|
3345
|
+
"""
|
|
3346
|
+
Returns true if the map contains the key.
|
|
3347
|
+
|
|
3348
|
+
.. versionadded:: 3.4.0
|
|
3349
|
+
|
|
3350
|
+
.. versionchanged:: 3.4.0
|
|
3351
|
+
Supports Spark Connect.
|
|
3352
|
+
|
|
3353
|
+
Parameters
|
|
3354
|
+
----------
|
|
3355
|
+
col : :class:`~pyspark.sql.Column` or str
|
|
3356
|
+
name of column or expression
|
|
3357
|
+
value :
|
|
3358
|
+
a literal value
|
|
3359
|
+
|
|
3360
|
+
Returns
|
|
3361
|
+
-------
|
|
3362
|
+
:class:`~pyspark.sql.Column`
|
|
3363
|
+
True if key is in the map and False otherwise.
|
|
3364
|
+
|
|
3365
|
+
Examples
|
|
3366
|
+
--------
|
|
3367
|
+
>>> from pyspark.sql.functions import map_contains_key
|
|
3368
|
+
>>> df = spark.sql("SELECT map(1, 'a', 2, 'b') as data")
|
|
3369
|
+
>>> df.select(map_contains_key("data", 1)).show()
|
|
3370
|
+
+---------------------------------+
|
|
3371
|
+
|array_contains(map_keys(data), 1)|
|
|
3372
|
+
+---------------------------------+
|
|
3373
|
+
| true|
|
|
3374
|
+
+---------------------------------+
|
|
3375
|
+
>>> df.select(map_contains_key("data", -1)).show()
|
|
3376
|
+
+----------------------------------+
|
|
3377
|
+
|array_contains(map_keys(data), -1)|
|
|
3378
|
+
+----------------------------------+
|
|
3379
|
+
| false|
|
|
3380
|
+
+----------------------------------+
|
|
3381
|
+
"""
|
|
3382
|
+
value = lit(value) if not isinstance(value, Column) else value
|
|
3383
|
+
return Column.invoke_anonymous_function(col, "map_contains_key", value)
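
A usage sketch, assuming a Spark-backed SQLFrame session (the decorator appears to leave only Spark supported); a plain Python value is wrapped with lit() by the implementation above:

    from sqlframe.spark import SparkSession
    from sqlframe.spark import functions as F

    session = SparkSession()  # assumed default constructor
    df = session.sql("SELECT map(1, 'a', 2, 'b') AS data")  # assumes session.sql is available
    df.select(F.map_contains_key("data", 1).alias("has_key")).show()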
|
|
3384
|
+
|
|
3385
|
+
|
|
3386
|
+
@meta(unsupported_engines="*")
|
|
3387
|
+
def mask(
|
|
3388
|
+
col: ColumnOrName,
|
|
3389
|
+
upperChar: t.Optional[ColumnOrName] = None,
|
|
3390
|
+
lowerChar: t.Optional[ColumnOrName] = None,
|
|
3391
|
+
digitChar: t.Optional[ColumnOrName] = None,
|
|
3392
|
+
otherChar: t.Optional[ColumnOrName] = None,
|
|
3393
|
+
) -> Column:
|
|
3394
|
+
"""
|
|
3395
|
+
Masks the given string value. This can be useful for creating copies of tables with sensitive
|
|
3396
|
+
information removed.
|
|
3397
|
+
|
|
3398
|
+
.. versionadded:: 3.5.0
|
|
3399
|
+
|
|
3400
|
+
Parameters
|
|
3401
|
+
----------
|
|
3402
|
+
col: :class:`~pyspark.sql.Column` or str
|
|
3403
|
+
target column to compute on.
|
|
3404
|
+
upperChar: :class:`~pyspark.sql.Column` or str
|
|
3405
|
+
character to replace upper-case characters with. Specify NULL to retain original character.
|
|
3406
|
+
lowerChar: :class:`~pyspark.sql.Column` or str
|
|
3407
|
+
character to replace lower-case characters with. Specify NULL to retain original character.
|
|
3408
|
+
digitChar: :class:`~pyspark.sql.Column` or str
|
|
3409
|
+
character to replace digit characters with. Specify NULL to retain original character.
|
|
3410
|
+
otherChar: :class:`~pyspark.sql.Column` or str
|
|
3411
|
+
character to replace all other characters with. Specify NULL to retain original character.
|
|
3412
|
+
|
|
3413
|
+
Returns
|
|
3414
|
+
-------
|
|
3415
|
+
:class:`~pyspark.sql.Column`
|
|
3416
|
+
|
|
3417
|
+
Examples
|
|
3418
|
+
--------
|
|
3419
|
+
>>> df = spark.createDataFrame([("AbCD123-@$#",), ("abcd-EFGH-8765-4321",)], ['data'])
|
|
3420
|
+
>>> df.select(mask(df.data).alias('r')).collect()
|
|
3421
|
+
[Row(r='XxXXnnn-@$#'), Row(r='xxxx-XXXX-nnnn-nnnn')]
|
|
3422
|
+
>>> df.select(mask(df.data, lit('Y')).alias('r')).collect()
|
|
3423
|
+
[Row(r='YxYYnnn-@$#'), Row(r='xxxx-YYYY-nnnn-nnnn')]
|
|
3424
|
+
>>> df.select(mask(df.data, lit('Y'), lit('y')).alias('r')).collect()
|
|
3425
|
+
[Row(r='YyYYnnn-@$#'), Row(r='yyyy-YYYY-nnnn-nnnn')]
|
|
3426
|
+
>>> df.select(mask(df.data, lit('Y'), lit('y'), lit('d')).alias('r')).collect()
|
|
3427
|
+
[Row(r='YyYYddd-@$#'), Row(r='yyyy-YYYY-dddd-dddd')]
|
|
3428
|
+
>>> df.select(mask(df.data, lit('Y'), lit('y'), lit('d'), lit('*')).alias('r')).collect()
|
|
3429
|
+
[Row(r='YyYYddd****'), Row(r='yyyy*YYYY*dddd*dddd')]
|
|
3430
|
+
"""
|
|
3431
|
+
|
|
3432
|
+
_upperChar = lit("X") if upperChar is None else upperChar
|
|
3433
|
+
_lowerChar = lit("x") if lowerChar is None else lowerChar
|
|
3434
|
+
_digitChar = lit("n") if digitChar is None else digitChar
|
|
3435
|
+
_otherChar = lit(None) if otherChar is None else otherChar
|
|
3436
|
+
return Column.invoke_anonymous_function(
|
|
3437
|
+
col, "mask", _upperChar, _lowerChar, _digitChar, _otherChar
|
|
3438
|
+
)
|
|
3439
|
+
|
|
3440
|
+
|
|
3441
|
+
@meta(unsupported_engines="*")
|
|
3442
|
+
def median(col: ColumnOrName) -> Column:
|
|
3443
|
+
"""
|
|
3444
|
+
Returns the median of the values in a group.
|
|
3445
|
+
|
|
3446
|
+
.. versionadded:: 3.4.0
|
|
3447
|
+
|
|
3448
|
+
Parameters
|
|
3449
|
+
----------
|
|
3450
|
+
col : :class:`~pyspark.sql.Column` or str
|
|
3451
|
+
target column to compute on.
|
|
3452
|
+
|
|
3453
|
+
Returns
|
|
3454
|
+
-------
|
|
3455
|
+
:class:`~pyspark.sql.Column`
|
|
3456
|
+
the median of the values in a group.
|
|
3457
|
+
|
|
3458
|
+
Notes
|
|
3459
|
+
-----
|
|
3460
|
+
Supports Spark Connect.
|
|
3461
|
+
|
|
3462
|
+
Examples
|
|
3463
|
+
--------
|
|
3464
|
+
>>> df = spark.createDataFrame([
|
|
3465
|
+
... ("Java", 2012, 20000), ("dotNET", 2012, 5000),
|
|
3466
|
+
... ("Java", 2012, 22000), ("dotNET", 2012, 10000),
|
|
3467
|
+
... ("dotNET", 2013, 48000), ("Java", 2013, 30000)],
|
|
3468
|
+
... schema=("course", "year", "earnings"))
|
|
3469
|
+
>>> df.groupby("course").agg(median("earnings")).show()
|
|
3470
|
+
+------+----------------+
|
|
3471
|
+
|course|median(earnings)|
|
|
3472
|
+
+------+----------------+
|
|
3473
|
+
| Java| 22000.0|
|
|
3474
|
+
|dotNET| 10000.0|
|
|
3475
|
+
+------+----------------+
|
|
3476
|
+
"""
|
|
3477
|
+
return Column.invoke_anonymous_function(col, "median")
|
|
3478
|
+
|
|
3479
|
+
|
|
3480
|
+
@meta(unsupported_engines="*")
|
|
3481
|
+
def mode(col: ColumnOrName) -> Column:
|
|
3482
|
+
"""
|
|
3483
|
+
Returns the most frequent value in a group.
|
|
3484
|
+
|
|
3485
|
+
.. versionadded:: 3.4.0
|
|
3486
|
+
|
|
3487
|
+
Parameters
|
|
3488
|
+
----------
|
|
3489
|
+
col : :class:`~pyspark.sql.Column` or str
|
|
3490
|
+
target column to compute on.
|
|
3491
|
+
|
|
3492
|
+
Returns
|
|
3493
|
+
-------
|
|
3494
|
+
:class:`~pyspark.sql.Column`
|
|
3495
|
+
the most frequent value in a group.
|
|
3496
|
+
|
|
3497
|
+
Notes
|
|
3498
|
+
-----
|
|
3499
|
+
Supports Spark Connect.
|
|
3500
|
+
|
|
3501
|
+
Examples
|
|
3502
|
+
--------
|
|
3503
|
+
>>> df = spark.createDataFrame([
|
|
3504
|
+
... ("Java", 2012, 20000), ("dotNET", 2012, 5000),
|
|
3505
|
+
... ("Java", 2012, 20000), ("dotNET", 2012, 5000),
|
|
3506
|
+
... ("dotNET", 2013, 48000), ("Java", 2013, 30000)],
|
|
3507
|
+
... schema=("course", "year", "earnings"))
|
|
3508
|
+
>>> df.groupby("course").agg(mode("year")).show()
|
|
3509
|
+
+------+----------+
|
|
3510
|
+
|course|mode(year)|
|
|
3511
|
+
+------+----------+
|
|
3512
|
+
| Java| 2012|
|
|
3513
|
+
|dotNET| 2012|
|
|
3514
|
+
+------+----------+
|
|
3515
|
+
"""
|
|
3516
|
+
return Column.invoke_anonymous_function(col, "mode")
|
|
3517
|
+
|
|
3518
|
+
|
|
3519
|
+
@meta(unsupported_engines="*")
|
|
3520
|
+
def months(col: ColumnOrName) -> Column:
|
|
3521
|
+
"""
|
|
3522
|
+
Partition transform function: A transform for timestamps and dates
|
|
3523
|
+
to partition data into months.
|
|
3524
|
+
|
|
3525
|
+
.. versionadded:: 3.1.0
|
|
3526
|
+
|
|
3527
|
+
.. versionchanged:: 3.4.0
|
|
3528
|
+
Supports Spark Connect.
|
|
3529
|
+
|
|
3530
|
+
Parameters
|
|
3531
|
+
----------
|
|
3532
|
+
col : :class:`~pyspark.sql.Column` or str
|
|
3533
|
+
target date or timestamp column to work on.
|
|
3534
|
+
|
|
3535
|
+
Returns
|
|
3536
|
+
-------
|
|
3537
|
+
:class:`~pyspark.sql.Column`
|
|
3538
|
+
data partitioned by months.
|
|
3539
|
+
|
|
3540
|
+
Examples
|
|
3541
|
+
--------
|
|
3542
|
+
>>> df.writeTo("catalog.db.table").partitionedBy(
|
|
3543
|
+
... months("ts")
|
|
3544
|
+
... ).createOrReplace() # doctest: +SKIP
|
|
3545
|
+
|
|
3546
|
+
Notes
|
|
3547
|
+
-----
|
|
3548
|
+
This function can be used only in combination with
|
|
3549
|
+
:py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy`
|
|
3550
|
+
method of the `DataFrameWriterV2`.
|
|
3551
|
+
|
|
3552
|
+
"""
|
|
3553
|
+
return Column.invoke_anonymous_function(col, "months")
|
|
3554
|
+
|
|
3555
|
+
|
|
3556
|
+
@meta(unsupported_engines="*")
|
|
3557
|
+
def named_struct(*cols: ColumnOrName) -> Column:
|
|
3558
|
+
"""
|
|
3559
|
+
Creates a struct with the given field names and values.
|
|
3560
|
+
|
|
3561
|
+
.. versionadded:: 3.5.0
|
|
3562
|
+
|
|
3563
|
+
Parameters
|
|
3564
|
+
----------
|
|
3565
|
+
cols : :class:`~pyspark.sql.Column` or str
|
|
3566
|
+
list of columns to work on.
|
|
3567
|
+
|
|
3568
|
+
Returns
|
|
3569
|
+
-------
|
|
3570
|
+
:class:`~pyspark.sql.Column`
|
|
3571
|
+
|
|
3572
|
+
Examples
|
|
3573
|
+
--------
|
|
3574
|
+
>>> df = spark.createDataFrame([(1, 2, 3)], ['a', 'b', 'c'])
|
|
3575
|
+
>>> df.select(named_struct(lit('x'), df.a, lit('y'), df.b).alias('r')).collect()
|
|
3576
|
+
[Row(r=Row(x=1, y=2))]
|
|
3577
|
+
"""
|
|
3578
|
+
cols = ensure_list(cols) # type: ignore
|
|
3579
|
+
if len(cols) > 1:
|
|
3580
|
+
return Column.invoke_anonymous_function(cols[0], "named_struct", *cols[1:])
|
|
3581
|
+
return Column.invoke_anonymous_function(cols[0], "named_struct")
|
|
3582
|
+
|
|
3583
|
+
|
|
3584
|
+
@meta(unsupported_engines="*")
|
|
3585
|
+
def negative(col: ColumnOrName) -> Column:
|
|
3586
|
+
"""
|
|
3587
|
+
Returns the negative value.
|
|
3588
|
+
|
|
3589
|
+
.. versionadded:: 3.5.0
|
|
3590
|
+
|
|
3591
|
+
Parameters
|
|
3592
|
+
----------
|
|
3593
|
+
col : :class:`~pyspark.sql.Column` or str
|
|
3594
|
+
column to calculate negative value for.
|
|
3595
|
+
|
|
3596
|
+
Returns
|
|
3597
|
+
-------
|
|
3598
|
+
:class:`~pyspark.sql.Column`
|
|
3599
|
+
negative value.
|
|
3600
|
+
|
|
3601
|
+
Examples
|
|
3602
|
+
--------
|
|
3603
|
+
>>> import pyspark.sql.functions as sf
|
|
3604
|
+
>>> spark.range(3).select(sf.negative("id")).show()
|
|
3605
|
+
+------------+
|
|
3606
|
+
|negative(id)|
|
|
3607
|
+
+------------+
|
|
3608
|
+
| 0|
|
|
3609
|
+
| -1|
|
|
3610
|
+
| -2|
|
|
3611
|
+
+------------+
|
|
3612
|
+
"""
|
|
3613
|
+
return Column.invoke_anonymous_function(col, "negative")
|
|
3614
|
+
|
|
3615
|
+
|
|
3616
|
+
negate = negative
|
|
3617
|
+
now = current_timestamp
|
|
3618
|
+
|
|
3619
|
+
|
|
3620
|
+
+@meta()
+def nvl(col1: ColumnOrName, col2: ColumnOrName) -> Column:
+    """
+    Returns `col2` if `col1` is null, or `col1` otherwise.
+
+    .. versionadded:: 3.5.0
+
+    Parameters
+    ----------
+    col1 : :class:`~pyspark.sql.Column` or str
+    col2 : :class:`~pyspark.sql.Column` or str
+
+    Examples
+    --------
+    >>> df = spark.createDataFrame([(None, 8,), (1, 9,)], ["a", "b"])
+    >>> df.select(nvl(df.a, df.b).alias('r')).collect()
+    [Row(r=8), Row(r=1)]
+    """
+    return Column.invoke_expression_over_column(col1, expression.Coalesce, expressions=[col2])
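
nvl carries no engine restriction, so a runnable sketch against the DuckDB engine is reasonable (DuckDBSession and the engine-level functions module are assumptions here):

    from sqlframe.duckdb import DuckDBSession
    from sqlframe.duckdb import functions as F

    session = DuckDBSession()  # in-memory DuckDB
    df = session.createDataFrame([(None, 8), (1, 9)], ["a", "b"])
    df.select(F.nvl(df.a, df.b).alias("r")).show()  # expected rows: 8, then 1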
|
|
3639
|
+
|
|
3640
|
+
|
|
3641
|
+
+@meta()
+def nvl2(col1: ColumnOrName, col2: ColumnOrName, col3: ColumnOrName) -> Column:
+    """
+    Returns `col2` if `col1` is not null, or `col3` otherwise.
+
+    .. versionadded:: 3.5.0
+
+    Parameters
+    ----------
+    col1 : :class:`~pyspark.sql.Column` or str
+    col2 : :class:`~pyspark.sql.Column` or str
+    col3 : :class:`~pyspark.sql.Column` or str
+
+    Examples
+    --------
+    >>> df = spark.createDataFrame([(None, 8, 6,), (1, 9, 9,)], ["a", "b", "c"])
+    >>> df.select(nvl2(df.a, df.b, df.c).alias('r')).collect()
+    [Row(r=6), Row(r=9)]
+    """
+    return Column.invoke_expression_over_column(col1, expression.Nvl2, true=col2, false=col3)
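
The same pattern extends to nvl2, which picks `col2` when `col1` is non-null and `col3` otherwise (DuckDB-backed session assumed as above):

    from sqlframe.duckdb import DuckDBSession
    from sqlframe.duckdb import functions as F

    session = DuckDBSession()
    df = session.createDataFrame([(None, 8, 6), (1, 9, 9)], ["a", "b", "c"])
    df.select(F.nvl2(df.a, df.b, df.c).alias("r")).show()  # expected rows: 6, then 9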
|
|
3661
|
+
|
|
3662
|
+
|
|
3663
|
+
@meta(unsupported_engines="*")
|
|
3664
|
+
def parse_url(
|
|
3665
|
+
url: ColumnOrName, partToExtract: ColumnOrName, key: t.Optional[ColumnOrName] = None
|
|
3666
|
+
) -> Column:
|
|
3667
|
+
"""
|
|
3668
|
+
Extracts a part from a URL.
|
|
3669
|
+
|
|
3670
|
+
.. versionadded:: 3.5.0
|
|
3671
|
+
|
|
3672
|
+
Parameters
|
|
3673
|
+
----------
|
|
3674
|
+
url : :class:`~pyspark.sql.Column` or str
|
|
3675
|
+
A column of string.
|
|
3676
|
+
partToExtract : :class:`~pyspark.sql.Column` or str
|
|
3677
|
+
A column of string, the path.
|
|
3678
|
+
key : :class:`~pyspark.sql.Column` or str, optional
|
|
3679
|
+
A column of string, the key.
|
|
3680
|
+
|
|
3681
|
+
Examples
|
|
3682
|
+
--------
|
|
3683
|
+
>>> df = spark.createDataFrame(
|
|
3684
|
+
... [("http://spark.apache.org/path?query=1", "QUERY", "query",)],
|
|
3685
|
+
... ["a", "b", "c"]
|
|
3686
|
+
... )
|
|
3687
|
+
>>> df.select(parse_url(df.a, df.b, df.c).alias('r')).collect()
|
|
3688
|
+
[Row(r='1')]
|
|
3689
|
+
|
|
3690
|
+
>>> df.select(parse_url(df.a, df.b).alias('r')).collect()
|
|
3691
|
+
[Row(r='query=1')]
|
|
3692
|
+
"""
|
|
3693
|
+
if key is not None:
|
|
3694
|
+
return Column.invoke_anonymous_function(url, "parse_url", partToExtract, key)
|
|
3695
|
+
else:
|
|
3696
|
+
return Column.invoke_anonymous_function(url, "parse_url", partToExtract)
|
|
3697
|
+
|
|
3698
|
+
|
|
3699
|
+
+@meta(unsupported_engines="*")
+def pi() -> Column:
+    """Returns Pi.
+
+    .. versionadded:: 3.5.0
+
+    Examples
+    --------
+    >>> spark.range(1).select(pi()).show()
+    +-----------------+
+    |             PI()|
+    +-----------------+
+    |3.141592653589793|
+    +-----------------+
+    """
+    return Column.invoke_anonymous_function(None, "pi")
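
A short sketch, assuming a Spark-backed SQLFrame session because of the "*" marker; PI() takes no input column, which is why the invocation above passes None:

    from sqlframe.spark import SparkSession
    from sqlframe.spark import functions as F

    session = SparkSession()  # assumed default constructor
    session.createDataFrame([(1,)], ["id"]).select(F.pi()).show()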
|
|
3715
|
+
|
|
3716
|
+
|
|
3717
|
+
@meta(unsupported_engines="*")
|
|
3718
|
+
def pmod(dividend: t.Union[ColumnOrName, float], divisor: t.Union[ColumnOrName, float]) -> Column:
|
|
3719
|
+
"""
|
|
3720
|
+
Returns the positive value of dividend mod divisor.
|
|
3721
|
+
|
|
3722
|
+
.. versionadded:: 3.4.0
|
|
3723
|
+
|
|
3724
|
+
Parameters
|
|
3725
|
+
----------
|
|
3726
|
+
dividend : str, :class:`~pyspark.sql.Column` or float
|
|
3727
|
+
the column that contains dividend, or the specified dividend value
|
|
3728
|
+
divisor : str, :class:`~pyspark.sql.Column` or float
|
|
3729
|
+
the column that contains divisor, or the specified divisor value
|
|
3730
|
+
|
|
3731
|
+
Returns
|
|
3732
|
+
-------
|
|
3733
|
+
:class:`~pyspark.sql.Column`
|
|
3734
|
+
positive value of dividend mod divisor.
|
|
3735
|
+
|
|
3736
|
+
Notes
|
|
3737
|
+
-----
|
|
3738
|
+
Supports Spark Connect.
|
|
3739
|
+
|
|
3740
|
+
Examples
|
|
3741
|
+
--------
|
|
3742
|
+
>>> from pyspark.sql.functions import pmod
|
|
3743
|
+
>>> df = spark.createDataFrame([
|
|
3744
|
+
... (1.0, float('nan')), (float('nan'), 2.0), (10.0, 3.0),
|
|
3745
|
+
... (float('nan'), float('nan')), (-3.0, 4.0), (-10.0, 3.0),
|
|
3746
|
+
... (-5.0, -6.0), (7.0, -8.0), (1.0, 2.0)],
|
|
3747
|
+
... ("a", "b"))
|
|
3748
|
+
>>> df.select(pmod("a", "b")).show()
|
|
3749
|
+
+----------+
|
|
3750
|
+
|pmod(a, b)|
|
|
3751
|
+
+----------+
|
|
3752
|
+
| NaN|
|
|
3753
|
+
| NaN|
|
|
3754
|
+
| 1.0|
|
|
3755
|
+
| NaN|
|
|
3756
|
+
| 1.0|
|
|
3757
|
+
| 2.0|
|
|
3758
|
+
| -5.0|
|
|
3759
|
+
| 7.0|
|
|
3760
|
+
| 1.0|
|
|
3761
|
+
+----------+
|
|
3762
|
+
"""
|
|
3763
|
+
dividend = lit(dividend) if isinstance(dividend, float) else dividend
|
|
3764
|
+
divisor = lit(divisor) if isinstance(divisor, float) else divisor
|
|
3765
|
+
return Column.invoke_anonymous_function(dividend, "pmod", divisor)
|
|
3766
|
+
|
|
3767
|
+
|
|
3768
|
+
+@meta()
+def position(
+    substr: ColumnOrName, str: ColumnOrName, start: t.Optional[ColumnOrName] = None
+) -> Column:
+    """
+    Returns the position of the first occurrence of `substr` in `str` after position `start`.
+    The given `start` and return value are 1-based.
+
+    .. versionadded:: 3.5.0
+
+    Parameters
+    ----------
+    substr : :class:`~pyspark.sql.Column` or str
+        A column of string, substring.
+    str : :class:`~pyspark.sql.Column` or str
+        A column of string.
+    start : :class:`~pyspark.sql.Column` or str, optional
+        A column of string, start position.
+
+    Examples
+    --------
+    >>> import pyspark.sql.functions as sf
+    >>> spark.createDataFrame(
+    ...     [("bar", "foobarbar", 5,)], ["a", "b", "c"]
+    ... ).select(sf.position("a", "b", "c")).show()
+    +-----------------+
+    |position(a, b, c)|
+    +-----------------+
+    |                7|
+    +-----------------+
+
+    >>> spark.createDataFrame(
+    ...     [("bar", "foobarbar", 5,)], ["a", "b", "c"]
+    ... ).select(sf.position("a", "b")).show()
+    +-----------------+
+    |position(a, b, 1)|
+    +-----------------+
+    |                4|
+    +-----------------+
+    """
+    if start is not None:
+        return Column.invoke_expression_over_column(
+            str, expression.StrPosition, substr=substr, position=start
+        )
+    else:
+        return Column.invoke_expression_over_column(str, expression.StrPosition, substr=substr)
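
position has no engine restriction, so a DuckDB-backed sketch is representative; `start` is 1-based and optional, matching the two branches above (session class assumed):

    from sqlframe.duckdb import DuckDBSession
    from sqlframe.duckdb import functions as F

    session = DuckDBSession()
    df = session.createDataFrame([("bar", "foobarbar", 5)], ["a", "b", "c"])
    df.select(
        F.position(df.a, df.b).alias("from_1"),        # expected 4
        F.position(df.a, df.b, df.c).alias("from_5"),  # expected 7
    ).show()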
|
|
3814
|
+
|
|
3815
|
+
|
|
3816
|
+
@meta(unsupported_engines="*")
|
|
3817
|
+
def positive(col: ColumnOrName) -> Column:
|
|
3818
|
+
"""
|
|
3819
|
+
Returns the value.
|
|
3820
|
+
|
|
3821
|
+
.. versionadded:: 3.5.0
|
|
3822
|
+
|
|
3823
|
+
Parameters
|
|
3824
|
+
----------
|
|
3825
|
+
col : :class:`~pyspark.sql.Column` or str
|
|
3826
|
+
input value column.
|
|
3827
|
+
|
|
3828
|
+
Returns
|
|
3829
|
+
-------
|
|
3830
|
+
:class:`~pyspark.sql.Column`
|
|
3831
|
+
value.
|
|
3832
|
+
|
|
3833
|
+
Examples
|
|
3834
|
+
--------
|
|
3835
|
+
>>> df = spark.createDataFrame([(-1,), (0,), (1,)], ['v'])
|
|
3836
|
+
>>> df.select(positive("v").alias("p")).show()
|
|
3837
|
+
+---+
|
|
3838
|
+
| p|
|
|
3839
|
+
+---+
|
|
3840
|
+
| -1|
|
|
3841
|
+
| 0|
|
|
3842
|
+
| 1|
|
|
3843
|
+
+---+
|
|
3844
|
+
"""
|
|
3845
|
+
return Column.invoke_anonymous_function(col, "positive")
|
|
3846
|
+
|
|
3847
|
+
|
|
3848
|
+
@meta(unsupported_engines="*")
|
|
3849
|
+
def printf(format: ColumnOrName, *cols: ColumnOrName) -> Column:
|
|
3850
|
+
"""
|
|
3851
|
+
Formats the arguments in printf-style and returns the result as a string column.
|
|
3852
|
+
|
|
3853
|
+
.. versionadded:: 3.5.0
|
|
3854
|
+
|
|
3855
|
+
Parameters
|
|
3856
|
+
----------
|
|
3857
|
+
format : :class:`~pyspark.sql.Column` or str
|
|
3858
|
+
string that can contain embedded format tags and used as result column's value
|
|
3859
|
+
cols : :class:`~pyspark.sql.Column` or str
|
|
3860
|
+
column names or :class:`~pyspark.sql.Column`\\s to be used in formatting
|
|
3861
|
+
|
|
3862
|
+
Examples
|
|
3863
|
+
--------
|
|
3864
|
+
>>> import pyspark.sql.functions as sf
|
|
3865
|
+
>>> spark.createDataFrame(
|
|
3866
|
+
... [("aa%d%s", 123, "cc",)], ["a", "b", "c"]
|
|
3867
|
+
... ).select(sf.printf("a", "b", "c")).show()
|
|
3868
|
+
+---------------+
|
|
3869
|
+
|printf(a, b, c)|
|
|
3870
|
+
+---------------+
|
|
3871
|
+
| aa123cc|
|
|
3872
|
+
+---------------+
|
|
3873
|
+
"""
|
|
3874
|
+
return Column.invoke_anonymous_function(format, "printf", *cols)
|
|
3875
|
+
|
|
3876
|
+
|
|
3877
|
+
@meta(unsupported_engines=["*", "spark"])
|
|
3878
|
+
def product(col: ColumnOrName) -> Column:
|
|
3879
|
+
"""
|
|
3880
|
+
Aggregate function: returns the product of the values in a group.
|
|
3881
|
+
|
|
3882
|
+
.. versionadded:: 3.2.0
|
|
3883
|
+
|
|
3884
|
+
.. versionchanged:: 3.4.0
|
|
3885
|
+
Supports Spark Connect.
|
|
3886
|
+
|
|
3887
|
+
Parameters
|
|
3888
|
+
----------
|
|
3889
|
+
col : str, :class:`Column`
|
|
3890
|
+
column containing values to be multiplied together
|
|
3891
|
+
|
|
3892
|
+
Returns
|
|
3893
|
+
-------
|
|
3894
|
+
:class:`~pyspark.sql.Column`
|
|
3895
|
+
the column for computed results.
|
|
3896
|
+
|
|
3897
|
+
Examples
|
|
3898
|
+
--------
|
|
3899
|
+
>>> df = spark.range(1, 10).toDF('x').withColumn('mod3', col('x') % 3)
|
|
3900
|
+
>>> prods = df.groupBy('mod3').agg(product('x').alias('product'))
|
|
3901
|
+
>>> prods.orderBy('mod3').show()
|
|
3902
|
+
+----+-------+
|
|
3903
|
+
|mod3|product|
|
|
3904
|
+
+----+-------+
|
|
3905
|
+
| 0| 162.0|
|
|
3906
|
+
| 1| 28.0|
|
|
3907
|
+
| 2| 80.0|
|
|
3908
|
+
+----+-------+
|
|
3909
|
+
"""
|
|
3910
|
+
return Column.invoke_anonymous_function(col, "product")
|
|
3911
|
+
|
|
3912
|
+
|
|
3913
|
+
reduce = aggregate
|
|
3914
|
+
|
|
3915
|
+
|
|
3916
|
+
@meta(unsupported_engines="*")
|
|
3917
|
+
def reflect(*cols: ColumnOrName) -> Column:
|
|
3918
|
+
"""
|
|
3919
|
+
Calls a method with reflection.
|
|
3920
|
+
|
|
3921
|
+
.. versionadded:: 3.5.0
|
|
3922
|
+
|
|
3923
|
+
Parameters
|
|
3924
|
+
----------
|
|
3925
|
+
cols : :class:`~pyspark.sql.Column` or str
|
|
3926
|
+
the first element should be a literal string for the class name,
|
|
3927
|
+
and the second element should be a literal string for the method name,
|
|
3928
|
+
and the remaining are input arguments to the Java method.
|
|
3929
|
+
|
|
3930
|
+
Examples
|
|
3931
|
+
--------
|
|
3932
|
+
>>> df = spark.createDataFrame([("a5cf6c42-0c85-418f-af6c-3e4e5b1328f2",)], ["a"])
|
|
3933
|
+
>>> df.select(
|
|
3934
|
+
... reflect(lit("java.util.UUID"), lit("fromString"), df.a).alias('r')
|
|
3935
|
+
... ).collect()
|
|
3936
|
+
[Row(r='a5cf6c42-0c85-418f-af6c-3e4e5b1328f2')]
|
|
3937
|
+
"""
|
|
3938
|
+
if len(cols) > 1:
|
|
3939
|
+
return Column.invoke_anonymous_function(cols[0], "reflect", *cols[1:])
|
|
3940
|
+
return Column.invoke_anonymous_function(cols[0], "reflect")
|
|
3941
|
+
|
|
3942
|
+
|
|
3943
|
+
@meta(unsupported_engines="*")
|
|
3944
|
+
def regexp(str: ColumnOrName, regexp: ColumnOrName) -> Column:
|
|
3945
|
+
r"""Returns true if `str` matches the Java regex `regexp`, or false otherwise.
|
|
3946
|
+
|
|
3947
|
+
.. versionadded:: 3.5.0
|
|
3948
|
+
|
|
3949
|
+
Parameters
|
|
3950
|
+
----------
|
|
3951
|
+
str : :class:`~pyspark.sql.Column` or str
|
|
3952
|
+
target column to work on.
|
|
3953
|
+
regexp : :class:`~pyspark.sql.Column` or str
|
|
3954
|
+
regex pattern to apply.
|
|
3955
|
+
|
|
3956
|
+
Returns
|
|
3957
|
+
-------
|
|
3958
|
+
:class:`~pyspark.sql.Column`
|
|
3959
|
+
true if `str` matches a Java regex, or false otherwise.
|
|
3960
|
+
|
|
3961
|
+
Examples
|
|
3962
|
+
--------
|
|
3963
|
+
>>> import pyspark.sql.functions as sf
|
|
3964
|
+
>>> spark.createDataFrame(
|
|
3965
|
+
... [("1a 2b 14m", r"(\d+)")], ["str", "regexp"]
|
|
3966
|
+
... ).select(sf.regexp('str', sf.lit(r'(\d+)'))).show()
|
|
3967
|
+
+------------------+
|
|
3968
|
+
|REGEXP(str, (\d+))|
|
|
3969
|
+
+------------------+
|
|
3970
|
+
| true|
|
|
3971
|
+
+------------------+
|
|
3972
|
+
|
|
3973
|
+
>>> import pyspark.sql.functions as sf
|
|
3974
|
+
>>> spark.createDataFrame(
|
|
3975
|
+
... [("1a 2b 14m", r"(\d+)")], ["str", "regexp"]
|
|
3976
|
+
... ).select(sf.regexp('str', sf.lit(r'\d{2}b'))).show()
|
|
3977
|
+
+-------------------+
|
|
3978
|
+
|REGEXP(str, \d{2}b)|
|
|
3979
|
+
+-------------------+
|
|
3980
|
+
| false|
|
|
3981
|
+
+-------------------+
|
|
3982
|
+
|
|
3983
|
+
>>> import pyspark.sql.functions as sf
|
|
3984
|
+
>>> spark.createDataFrame(
|
|
3985
|
+
... [("1a 2b 14m", r"(\d+)")], ["str", "regexp"]
|
|
3986
|
+
... ).select(sf.regexp('str', sf.col("regexp"))).show()
|
|
3987
|
+
+-------------------+
|
|
3988
|
+
|REGEXP(str, regexp)|
|
|
3989
|
+
+-------------------+
|
|
3990
|
+
| true|
|
|
3991
|
+
+-------------------+
|
|
3992
|
+
"""
|
|
3993
|
+
return Column.invoke_anonymous_function(str, "regexp", regexp)
|
|
3994
|
+
|
|
3995
|
+
|
|
3996
|
+
@meta(unsupported_engines="*")
|
|
3997
|
+
def regexp_count(str: ColumnOrName, regexp: ColumnOrName) -> Column:
|
|
3998
|
+
r"""Returns a count of the number of times that the Java regex pattern `regexp` is matched
|
|
3999
|
+
in the string `str`.
|
|
4000
|
+
|
|
4001
|
+
.. versionadded:: 3.5.0
|
|
4002
|
+
|
|
4003
|
+
Parameters
|
|
4004
|
+
----------
|
|
4005
|
+
str : :class:`~pyspark.sql.Column` or str
|
|
4006
|
+
target column to work on.
|
|
4007
|
+
regexp : :class:`~pyspark.sql.Column` or str
|
|
4008
|
+
regex pattern to apply.
|
|
4009
|
+
|
|
4010
|
+
Returns
|
|
4011
|
+
-------
|
|
4012
|
+
:class:`~pyspark.sql.Column`
|
|
4013
|
+
the number of times that a Java regex pattern is matched in the string.
|
|
4014
|
+
|
|
4015
|
+
Examples
|
|
4016
|
+
--------
|
|
4017
|
+
>>> df = spark.createDataFrame([("1a 2b 14m", r"\d+")], ["str", "regexp"])
|
|
4018
|
+
>>> df.select(regexp_count('str', lit(r'\d+')).alias('d')).collect()
|
|
4019
|
+
[Row(d=3)]
|
|
4020
|
+
>>> df.select(regexp_count('str', lit(r'mmm')).alias('d')).collect()
|
|
4021
|
+
[Row(d=0)]
|
|
4022
|
+
>>> df.select(regexp_count("str", col("regexp")).alias('d')).collect()
|
|
4023
|
+
[Row(d=3)]
|
|
4024
|
+
"""
|
|
4025
|
+
return Column.invoke_anonymous_function(str, "regexp_count", regexp)
|
|
4026
|
+
|
|
4027
|
+
|
|
4028
|
+
@meta(unsupported_engines="*")
|
|
4029
|
+
def regexp_extract_all(
|
|
4030
|
+
str: ColumnOrName, regexp: ColumnOrName, idx: t.Optional[t.Union[int, Column]] = None
|
|
4031
|
+
) -> Column:
|
|
4032
|
+
r"""Extract all strings in the `str` that match the Java regex `regexp`
|
|
4033
|
+
and corresponding to the regex group index.
|
|
4034
|
+
|
|
4035
|
+
.. versionadded:: 3.5.0
|
|
4036
|
+
|
|
4037
|
+
Parameters
|
|
4038
|
+
----------
|
|
4039
|
+
str : :class:`~pyspark.sql.Column` or str
|
|
4040
|
+
target column to work on.
|
|
4041
|
+
regexp : :class:`~pyspark.sql.Column` or str
|
|
4042
|
+
regex pattern to apply.
|
|
4043
|
+
idx : int
|
|
4044
|
+
matched group id.
|
|
4045
|
+
|
|
4046
|
+
Returns
|
|
4047
|
+
-------
|
|
4048
|
+
:class:`~pyspark.sql.Column`
|
|
4049
|
+
all strings in the `str` that match a Java regex and corresponding to the regex group index.
|
|
4050
|
+
|
|
4051
|
+
Examples
|
|
4052
|
+
--------
|
|
4053
|
+
>>> df = spark.createDataFrame([("100-200, 300-400", r"(\d+)-(\d+)")], ["str", "regexp"])
|
|
4054
|
+
>>> df.select(regexp_extract_all('str', lit(r'(\d+)-(\d+)')).alias('d')).collect()
|
|
4055
|
+
[Row(d=['100', '300'])]
|
|
4056
|
+
>>> df.select(regexp_extract_all('str', lit(r'(\d+)-(\d+)'), 1).alias('d')).collect()
|
|
4057
|
+
[Row(d=['100', '300'])]
|
|
4058
|
+
>>> df.select(regexp_extract_all('str', lit(r'(\d+)-(\d+)'), 2).alias('d')).collect()
|
|
4059
|
+
[Row(d=['200', '400'])]
|
|
4060
|
+
>>> df.select(regexp_extract_all('str', col("regexp")).alias('d')).collect()
|
|
4061
|
+
[Row(d=['100', '300'])]
|
|
4062
|
+
"""
|
|
4063
|
+
if idx is None:
|
|
4064
|
+
return Column.invoke_anonymous_function(str, "regexp_extract_all", regexp)
|
|
4065
|
+
else:
|
|
4066
|
+
idx = lit(idx) if isinstance(idx, int) else idx
|
|
4067
|
+
return Column.invoke_anonymous_function(str, "regexp_extract_all", regexp, idx)
|
|
4068
|
+
|
|
4069
|
+
|
|
4070
|
+
@meta(unsupported_engines="*")
|
|
4071
|
+
def regexp_instr(
|
|
4072
|
+
str: ColumnOrName, regexp: ColumnOrName, idx: t.Optional[t.Union[int, Column]] = None
|
|
4073
|
+
) -> Column:
|
|
4074
|
+
r"""Extract all strings in the `str` that match the Java regex `regexp`
|
|
4075
|
+
and corresponding to the regex group index.
|
|
4076
|
+
|
|
4077
|
+
.. versionadded:: 3.5.0
|
|
4078
|
+
|
|
4079
|
+
Parameters
|
|
4080
|
+
----------
|
|
4081
|
+
str : :class:`~pyspark.sql.Column` or str
|
|
4082
|
+
target column to work on.
|
|
4083
|
+
regexp : :class:`~pyspark.sql.Column` or str
|
|
4084
|
+
regex pattern to apply.
|
|
4085
|
+
idx : int
|
|
4086
|
+
matched group id.
|
|
4087
|
+
|
|
4088
|
+
Returns
|
|
4089
|
+
-------
|
|
4090
|
+
:class:`~pyspark.sql.Column`
|
|
4091
|
+
all strings in the `str` that match a Java regex and corresponding to the regex group index.
|
|
4092
|
+
|
|
4093
|
+
Examples
|
|
4094
|
+
--------
|
|
4095
|
+
>>> df = spark.createDataFrame([("1a 2b 14m", r"\d+(a|b|m)")], ["str", "regexp"])
|
|
4096
|
+
>>> df.select(regexp_instr('str', lit(r'\d+(a|b|m)')).alias('d')).collect()
|
|
4097
|
+
[Row(d=1)]
|
|
4098
|
+
>>> df.select(regexp_instr('str', lit(r'\d+(a|b|m)'), 1).alias('d')).collect()
|
|
4099
|
+
[Row(d=1)]
|
|
4100
|
+
>>> df.select(regexp_instr('str', lit(r'\d+(a|b|m)'), 2).alias('d')).collect()
|
|
4101
|
+
[Row(d=1)]
|
|
4102
|
+
>>> df.select(regexp_instr('str', col("regexp")).alias('d')).collect()
|
|
4103
|
+
[Row(d=1)]
|
|
4104
|
+
"""
|
|
4105
|
+
if idx is None:
|
|
4106
|
+
return Column.invoke_anonymous_function(str, "regexp_instr", regexp)
|
|
4107
|
+
else:
|
|
4108
|
+
idx = lit(idx) if isinstance(idx, int) else idx
|
|
4109
|
+
return Column.invoke_anonymous_function(str, "regexp_instr", regexp, idx)
|
|
4110
|
+
|
|
4111
|
+
|
|
4112
|
+
@meta(unsupported_engines="snowflake")
|
|
4113
|
+
def regexp_like(str: ColumnOrName, regexp: ColumnOrName) -> Column:
|
|
4114
|
+
r"""Returns true if `str` matches the Java regex `regexp`, or false otherwise.
|
|
4115
|
+
|
|
4116
|
+
.. versionadded:: 3.5.0
|
|
4117
|
+
|
|
4118
|
+
Parameters
|
|
4119
|
+
----------
|
|
4120
|
+
str : :class:`~pyspark.sql.Column` or str
|
|
4121
|
+
target column to work on.
|
|
4122
|
+
regexp : :class:`~pyspark.sql.Column` or str
|
|
4123
|
+
regex pattern to apply.
|
|
4124
|
+
|
|
4125
|
+
Returns
|
|
4126
|
+
-------
|
|
4127
|
+
:class:`~pyspark.sql.Column`
|
|
4128
|
+
true if `str` matches a Java regex, or false otherwise.
|
|
4129
|
+
|
|
4130
|
+
Examples
|
|
4131
|
+
--------
|
|
4132
|
+
>>> import pyspark.sql.functions as sf
|
|
4133
|
+
>>> spark.createDataFrame(
|
|
4134
|
+
... [("1a 2b 14m", r"(\d+)")], ["str", "regexp"]
|
|
4135
|
+
... ).select(sf.regexp_like('str', sf.lit(r'(\d+)'))).show()
|
|
4136
|
+
+-----------------------+
|
|
4137
|
+
|REGEXP_LIKE(str, (\d+))|
|
|
4138
|
+
+-----------------------+
|
|
4139
|
+
| true|
|
|
4140
|
+
+-----------------------+
|
|
4141
|
+
|
|
4142
|
+
>>> import pyspark.sql.functions as sf
|
|
4143
|
+
>>> spark.createDataFrame(
|
|
4144
|
+
... [("1a 2b 14m", r"(\d+)")], ["str", "regexp"]
|
|
4145
|
+
... ).select(sf.regexp_like('str', sf.lit(r'\d{2}b'))).show()
|
|
4146
|
+
+------------------------+
|
|
4147
|
+
|REGEXP_LIKE(str, \d{2}b)|
|
|
4148
|
+
+------------------------+
|
|
4149
|
+
| false|
|
|
4150
|
+
+------------------------+
|
|
4151
|
+
|
|
4152
|
+
>>> import pyspark.sql.functions as sf
|
|
4153
|
+
>>> spark.createDataFrame(
|
|
4154
|
+
... [("1a 2b 14m", r"(\d+)")], ["str", "regexp"]
|
|
4155
|
+
... ).select(sf.regexp_like('str', sf.col("regexp"))).show()
|
|
4156
|
+
+------------------------+
|
|
4157
|
+
|REGEXP_LIKE(str, regexp)|
|
|
4158
|
+
+------------------------+
|
|
4159
|
+
| true|
|
|
4160
|
+
+------------------------+
|
|
4161
|
+
"""
|
|
4162
|
+
return Column.invoke_expression_over_column(str, expression.RegexpLike, expression=regexp)
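
regexp_like is flagged as unsupported only for Snowflake, so a DuckDB-backed sketch should be representative, bearing in mind that regex dialects differ slightly between engines (session class assumed):

    from sqlframe.duckdb import DuckDBSession
    from sqlframe.duckdb import functions as F

    session = DuckDBSession()
    df = session.createDataFrame([("1a 2b 14m",)], ["s"])
    df.select(F.regexp_like(df.s, F.lit(r"\d+")).alias("has_digits")).show()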
|
|
4163
|
+
|
|
4164
|
+
|
|
4165
|
+
@meta(unsupported_engines="*")
|
|
4166
|
+
def regexp_substr(str: ColumnOrName, regexp: ColumnOrName) -> Column:
|
|
4167
|
+
r"""Returns the substring that matches the Java regex `regexp` within the string `str`.
|
|
4168
|
+
If the regular expression is not found, the result is null.
|
|
4169
|
+
|
|
4170
|
+
.. versionadded:: 3.5.0
|
|
4171
|
+
|
|
4172
|
+
Parameters
|
|
4173
|
+
----------
|
|
4174
|
+
str : :class:`~pyspark.sql.Column` or str
|
|
4175
|
+
target column to work on.
|
|
4176
|
+
regexp : :class:`~pyspark.sql.Column` or str
|
|
4177
|
+
regex pattern to apply.
|
|
4178
|
+
|
|
4179
|
+
Returns
|
|
4180
|
+
-------
|
|
4181
|
+
:class:`~pyspark.sql.Column`
|
|
4182
|
+
the substring that matches a Java regex within the string `str`.
|
|
4183
|
+
|
|
4184
|
+
Examples
|
|
4185
|
+
--------
|
|
4186
|
+
>>> df = spark.createDataFrame([("1a 2b 14m", r"\d+")], ["str", "regexp"])
|
|
4187
|
+
>>> df.select(regexp_substr('str', lit(r'\d+')).alias('d')).collect()
|
|
4188
|
+
[Row(d='1')]
|
|
4189
|
+
>>> df.select(regexp_substr('str', lit(r'mmm')).alias('d')).collect()
|
|
4190
|
+
[Row(d=None)]
|
|
4191
|
+
>>> df.select(regexp_substr("str", col("regexp")).alias('d')).collect()
|
|
4192
|
+
[Row(d='1')]
|
|
4193
|
+
"""
|
|
4194
|
+
return Column.invoke_anonymous_function(str, "regexp_substr", regexp)
|
|
4195
|
+
|
|
4196
|
+
|
|
4197
|
+
@meta(unsupported_engines="*")
|
|
4198
|
+
def regr_avgx(y: ColumnOrName, x: ColumnOrName) -> Column:
|
|
4199
|
+
"""
|
|
4200
|
+
Aggregate function: returns the average of the independent variable for non-null pairs
|
|
4201
|
+
in a group, where `y` is the dependent variable and `x` is the independent variable.
|
|
4202
|
+
|
|
4203
|
+
.. versionadded:: 3.5.0
|
|
4204
|
+
|
|
4205
|
+
Parameters
|
|
4206
|
+
----------
|
|
4207
|
+
y : :class:`~pyspark.sql.Column` or str
|
|
4208
|
+
the dependent variable.
|
|
4209
|
+
x : :class:`~pyspark.sql.Column` or str
|
|
4210
|
+
the independent variable.
|
|
4211
|
+
|
|
4212
|
+
Returns
|
|
4213
|
+
-------
|
|
4214
|
+
:class:`~pyspark.sql.Column`
|
|
4215
|
+
the average of the independent variable for non-null pairs in a group.
|
|
4216
|
+
|
|
4217
|
+
Examples
|
|
4218
|
+
--------
|
|
4219
|
+
>>> x = (col("id") % 3).alias("x")
|
|
4220
|
+
>>> y = (randn(42) + x * 10).alias("y")
|
|
4221
|
+
>>> df = spark.range(0, 1000, 1, 1).select(x, y)
|
|
4222
|
+
>>> df.select(regr_avgx("y", "x")).first()
|
|
4223
|
+
Row(regr_avgx(y, x)=0.999)
|
|
4224
|
+
"""
|
|
4225
|
+
return Column.invoke_anonymous_function(y, "regr_avgx", x)
|
|
4226
|
+
|
|
4227
|
+
|
|
4228
|
+
@meta(unsupported_engines="*")
|
|
4229
|
+
def regr_avgy(y: ColumnOrName, x: ColumnOrName) -> Column:
|
|
4230
|
+
"""
|
|
4231
|
+
Aggregate function: returns the average of the dependent variable for non-null pairs
|
|
4232
|
+
in a group, where `y` is the dependent variable and `x` is the independent variable.
|
|
4233
|
+
|
|
4234
|
+
.. versionadded:: 3.5.0
|
|
4235
|
+
|
|
4236
|
+
Parameters
|
|
4237
|
+
----------
|
|
4238
|
+
y : :class:`~pyspark.sql.Column` or str
|
|
4239
|
+
the dependent variable.
|
|
4240
|
+
x : :class:`~pyspark.sql.Column` or str
|
|
4241
|
+
the independent variable.
|
|
4242
|
+
|
|
4243
|
+
Returns
|
|
4244
|
+
-------
|
|
4245
|
+
:class:`~pyspark.sql.Column`
|
|
4246
|
+
the average of the dependent variable for non-null pairs in a group.
|
|
4247
|
+
|
|
4248
|
+
Examples
|
|
4249
|
+
--------
|
|
4250
|
+
>>> x = (col("id") % 3).alias("x")
|
|
4251
|
+
>>> y = (randn(42) + x * 10).alias("y")
|
|
4252
|
+
>>> df = spark.range(0, 1000, 1, 1).select(x, y)
|
|
4253
|
+
>>> df.select(regr_avgy("y", "x")).first()
|
|
4254
|
+
Row(regr_avgy(y, x)=9.980732994136464)
|
|
4255
|
+
"""
|
|
4256
|
+
return Column.invoke_anonymous_function(y, "regr_avgy", x)
|
|
4257
|
+
|
|
4258
|
+
|
|
4259
|
+
@meta(unsupported_engines="*")
|
|
4260
|
+
def regr_count(y: ColumnOrName, x: ColumnOrName) -> Column:
|
|
4261
|
+
"""
|
|
4262
|
+
Aggregate function: returns the number of non-null number pairs
|
|
4263
|
+
in a group, where `y` is the dependent variable and `x` is the independent variable.
|
|
4264
|
+
|
|
4265
|
+
.. versionadded:: 3.5.0
|
|
4266
|
+
|
|
4267
|
+
Parameters
|
|
4268
|
+
----------
|
|
4269
|
+
y : :class:`~pyspark.sql.Column` or str
|
|
4270
|
+
the dependent variable.
|
|
4271
|
+
x : :class:`~pyspark.sql.Column` or str
|
|
4272
|
+
the independent variable.
|
|
4273
|
+
|
|
4274
|
+
Returns
|
|
4275
|
+
-------
|
|
4276
|
+
:class:`~pyspark.sql.Column`
|
|
4277
|
+
the number of non-null number pairs in a group.
|
|
4278
|
+
|
|
4279
|
+
Examples
|
|
4280
|
+
--------
|
|
4281
|
+
>>> x = (col("id") % 3).alias("x")
|
|
4282
|
+
>>> y = (randn(42) + x * 10).alias("y")
|
|
4283
|
+
>>> df = spark.range(0, 1000, 1, 1).select(x, y)
|
|
4284
|
+
>>> df.select(regr_count("y", "x")).first()
|
|
4285
|
+
Row(regr_count(y, x)=1000)
|
|
4286
|
+
"""
|
|
4287
|
+
return Column.invoke_anonymous_function(y, "regr_count", x)
|
|
4288
|
+
|
|
4289
|
+
|
|
4290
|
+
@meta(unsupported_engines="*")
|
|
4291
|
+
def regr_intercept(y: ColumnOrName, x: ColumnOrName) -> Column:
|
|
4292
|
+
"""
|
|
4293
|
+
Aggregate function: returns the intercept of the univariate linear regression line
|
|
4294
|
+
for non-null pairs in a group, where `y` is the dependent variable and
|
|
4295
|
+
`x` is the independent variable.
|
|
4296
|
+
|
|
4297
|
+
.. versionadded:: 3.5.0
|
|
4298
|
+
|
|
4299
|
+
Parameters
|
|
4300
|
+
----------
|
|
4301
|
+
y : :class:`~pyspark.sql.Column` or str
|
|
4302
|
+
the dependent variable.
|
|
4303
|
+
x : :class:`~pyspark.sql.Column` or str
|
|
4304
|
+
the independent variable.
|
|
4305
|
+
|
|
4306
|
+
Returns
|
|
4307
|
+
-------
|
|
4308
|
+
:class:`~pyspark.sql.Column`
|
|
4309
|
+
the intercept of the univariate linear regression line for non-null pairs in a group.
|
|
4310
|
+
|
|
4311
|
+
Examples
|
|
4312
|
+
--------
|
|
4313
|
+
>>> x = (col("id") % 3).alias("x")
|
|
4314
|
+
>>> y = (randn(42) + x * 10).alias("y")
|
|
4315
|
+
>>> df = spark.range(0, 1000, 1, 1).select(x, y)
|
|
4316
|
+
>>> df.select(regr_intercept("y", "x")).first()
|
|
4317
|
+
Row(regr_intercept(y, x)=-0.04961745990969568)
|
|
4318
|
+
"""
|
|
4319
|
+
return Column.invoke_anonymous_function(y, "regr_intercept", x)
|
|
4320
|
+
|
|
4321
|
+
|
|
4322
|
+
@meta(unsupported_engines="*")
|
|
4323
|
+
def regr_r2(y: ColumnOrName, x: ColumnOrName) -> Column:
|
|
4324
|
+
"""
|
|
4325
|
+
Aggregate function: returns the coefficient of determination for non-null pairs
|
|
4326
|
+
in a group, where `y` is the dependent variable and `x` is the independent variable.
|
|
4327
|
+
|
|
4328
|
+
.. versionadded:: 3.5.0
|
|
4329
|
+
|
|
4330
|
+
Parameters
|
|
4331
|
+
----------
|
|
4332
|
+
y : :class:`~pyspark.sql.Column` or str
|
|
4333
|
+
the dependent variable.
|
|
4334
|
+
x : :class:`~pyspark.sql.Column` or str
|
|
4335
|
+
the independent variable.
|
|
4336
|
+
|
|
4337
|
+
Returns
|
|
4338
|
+
-------
|
|
4339
|
+
:class:`~pyspark.sql.Column`
|
|
4340
|
+
the coefficient of determination for non-null pairs in a group.
|
|
4341
|
+
|
|
4342
|
+
Examples
|
|
4343
|
+
--------
|
|
4344
|
+
>>> x = (col("id") % 3).alias("x")
|
|
4345
|
+
>>> y = (randn(42) + x * 10).alias("y")
|
|
4346
|
+
>>> df = spark.range(0, 1000, 1, 1).select(x, y)
|
|
4347
|
+
>>> df.select(regr_r2("y", "x")).first()
|
|
4348
|
+
Row(regr_r2(y, x)=0.9851908293645436)
|
|
4349
|
+
"""
|
|
4350
|
+
return Column.invoke_anonymous_function(y, "regr_r2", x)
|
|
4351
|
+
|
|
4352
|
+
|
|
4353
|
+
@meta(unsupported_engines="*")
|
|
4354
|
+
def regr_slope(y: ColumnOrName, x: ColumnOrName) -> Column:
|
|
4355
|
+
"""
|
|
4356
|
+
Aggregate function: returns the slope of the linear regression line for non-null pairs
|
|
4357
|
+
in a group, where `y` is the dependent variable and `x` is the independent variable.
|
|
4358
|
+
|
|
4359
|
+
.. versionadded:: 3.5.0
|
|
4360
|
+
|
|
4361
|
+
Parameters
|
|
4362
|
+
----------
|
|
4363
|
+
y : :class:`~pyspark.sql.Column` or str
|
|
4364
|
+
the dependent variable.
|
|
4365
|
+
x : :class:`~pyspark.sql.Column` or str
|
|
4366
|
+
the independent variable.
|
|
4367
|
+
|
|
4368
|
+
Returns
|
|
4369
|
+
-------
|
|
4370
|
+
:class:`~pyspark.sql.Column`
|
|
4371
|
+
the slope of the linear regression line for non-null pairs in a group.
|
|
4372
|
+
|
|
4373
|
+
Examples
|
|
4374
|
+
--------
|
|
4375
|
+
>>> x = (col("id") % 3).alias("x")
|
|
4376
|
+
>>> y = (randn(42) + x * 10).alias("y")
|
|
4377
|
+
>>> df = spark.range(0, 1000, 1, 1).select(x, y)
|
|
4378
|
+
>>> df.select(regr_slope("y", "x")).first()
|
|
4379
|
+
Row(regr_slope(y, x)=10.040390844891048)
|
|
4380
|
+
"""
|
|
4381
|
+
return Column.invoke_anonymous_function(y, "regr_slope", x)
|
|
4382
|
+
|
|
4383
|
+
|
|
4384
|
+
@meta(unsupported_engines="*")
|
|
4385
|
+
def regr_sxx(y: ColumnOrName, x: ColumnOrName) -> Column:
|
|
4386
|
+
"""
|
|
4387
|
+
Aggregate function: returns REGR_COUNT(y, x) * VAR_POP(x) for non-null pairs
|
|
4388
|
+
in a group, where `y` is the dependent variable and `x` is the independent variable.
|
|
4389
|
+
|
|
4390
|
+
.. versionadded:: 3.5.0
|
|
4391
|
+
|
|
4392
|
+
Parameters
|
|
4393
|
+
----------
|
|
4394
|
+
y : :class:`~pyspark.sql.Column` or str
|
|
4395
|
+
the dependent variable.
|
|
4396
|
+
x : :class:`~pyspark.sql.Column` or str
|
|
4397
|
+
the independent variable.
|
|
4398
|
+
|
|
4399
|
+
Returns
|
|
4400
|
+
-------
|
|
4401
|
+
:class:`~pyspark.sql.Column`
|
|
4402
|
+
REGR_COUNT(y, x) * VAR_POP(x) for non-null pairs in a group.
|
|
4403
|
+
|
|
4404
|
+
Examples
|
|
4405
|
+
--------
|
|
4406
|
+
>>> x = (col("id") % 3).alias("x")
|
|
4407
|
+
>>> y = (randn(42) + x * 10).alias("y")
|
|
4408
|
+
>>> df = spark.range(0, 1000, 1, 1).select(x, y)
|
|
4409
|
+
>>> df.select(regr_sxx("y", "x")).first()
|
|
4410
|
+
Row(regr_sxx(y, x)=666.9989999999996)
|
|
4411
|
+
"""
|
|
4412
|
+
return Column.invoke_anonymous_function(y, "regr_sxx", x)
|
|
4413
|
+
|
|
4414
|
+
|
|
4415
|
+
@meta(unsupported_engines="*")
|
|
4416
|
+
def regr_sxy(y: ColumnOrName, x: ColumnOrName) -> Column:
|
|
4417
|
+
"""
|
|
4418
|
+
Aggregate function: returns REGR_COUNT(y, x) * COVAR_POP(y, x) for non-null pairs
|
|
4419
|
+
in a group, where `y` is the dependent variable and `x` is the independent variable.
|
|
4420
|
+
|
|
4421
|
+
.. versionadded:: 3.5.0
|
|
4422
|
+
|
|
4423
|
+
Parameters
|
|
4424
|
+
----------
|
|
4425
|
+
y : :class:`~pyspark.sql.Column` or str
|
|
4426
|
+
the dependent variable.
|
|
4427
|
+
x : :class:`~pyspark.sql.Column` or str
|
|
4428
|
+
the independent variable.
|
|
4429
|
+
|
|
4430
|
+
Returns
|
|
4431
|
+
-------
|
|
4432
|
+
:class:`~pyspark.sql.Column`
|
|
4433
|
+
REGR_COUNT(y, x) * COVAR_POP(y, x) for non-null pairs in a group.
|
|
4434
|
+
|
|
4435
|
+
Examples
|
|
4436
|
+
--------
|
|
4437
|
+
>>> x = (col("id") % 3).alias("x")
|
|
4438
|
+
>>> y = (randn(42) + x * 10).alias("y")
|
|
4439
|
+
>>> df = spark.range(0, 1000, 1, 1).select(x, y)
|
|
4440
|
+
>>> df.select(regr_sxy("y", "x")).first()
|
|
4441
|
+
Row(regr_sxy(y, x)=6696.93065315148)
|
|
4442
|
+
"""
|
|
4443
|
+
return Column.invoke_anonymous_function(y, "regr_sxy", x)
|
|
4444
|
+
|
|
4445
|
+
|
|
4446
|
+
@meta(unsupported_engines="*")
|
|
4447
|
+
def regr_syy(y: ColumnOrName, x: ColumnOrName) -> Column:
|
|
4448
|
+
"""
|
|
4449
|
+
Aggregate function: returns REGR_COUNT(y, x) * VAR_POP(y) for non-null pairs
|
|
4450
|
+
in a group, where `y` is the dependent variable and `x` is the independent variable.
|
|
4451
|
+
|
|
4452
|
+
.. versionadded:: 3.5.0
|
|
4453
|
+
|
|
4454
|
+
Parameters
|
|
4455
|
+
----------
|
|
4456
|
+
y : :class:`~pyspark.sql.Column` or str
|
|
4457
|
+
the dependent variable.
|
|
4458
|
+
x : :class:`~pyspark.sql.Column` or str
|
|
4459
|
+
the independent variable.
|
|
4460
|
+
|
|
4461
|
+
Returns
|
|
4462
|
+
-------
|
|
4463
|
+
:class:`~pyspark.sql.Column`
|
|
4464
|
+
REGR_COUNT(y, x) * VAR_POP(y) for non-null pairs in a group.
|
|
4465
|
+
|
|
4466
|
+
Examples
|
|
4467
|
+
--------
|
|
4468
|
+
>>> x = (col("id") % 3).alias("x")
|
|
4469
|
+
>>> y = (randn(42) + x * 10).alias("y")
|
|
4470
|
+
>>> df = spark.range(0, 1000, 1, 1).select(x, y)
|
|
4471
|
+
>>> df.select(regr_syy("y", "x")).first()
|
|
4472
|
+
Row(regr_syy(y, x)=68250.53503811295)
|
|
4473
|
+
"""
|
|
4474
|
+
return Column.invoke_anonymous_function(y, "regr_syy", x)
|
|
4475
|
+
|
|
4476
|
+
|
|
4477
|
+
@meta(unsupported_engines="*")
|
|
4478
|
+
def replace(
|
|
4479
|
+
src: ColumnOrName, search: ColumnOrName, replace: t.Optional[ColumnOrName] = None
|
|
4480
|
+
) -> Column:
|
|
4481
|
+
"""
|
|
4482
|
+
Replaces all occurrences of `search` with `replace`.
|
|
4483
|
+
|
|
4484
|
+
.. versionadded:: 3.5.0
|
|
4485
|
+
|
|
4486
|
+
Parameters
|
|
4487
|
+
----------
|
|
4488
|
+
src : :class:`~pyspark.sql.Column` or str
|
|
4489
|
+
A column of string to be replaced.
|
|
4490
|
+
search : :class:`~pyspark.sql.Column` or str
|
|
4491
|
+
A column of string. If `search` is not found in `str`, `str` is returned unchanged.
|
|
4492
|
+
replace : :class:`~pyspark.sql.Column` or str, optional
|
|
4493
|
+
A column of string. If `replace` is not specified or is an empty string,
|
|
4494
|
+
nothing replaces the string that is removed from `str`.
|
|
4495
|
+
|
|
4496
|
+
Examples
|
|
4497
|
+
--------
|
|
4498
|
+
>>> df = spark.createDataFrame([("ABCabc", "abc", "DEF",)], ["a", "b", "c"])
|
|
4499
|
+
>>> df.select(replace(df.a, df.b, df.c).alias('r')).collect()
|
|
4500
|
+
[Row(r='ABCDEF')]
|
|
4501
|
+
|
|
4502
|
+
>>> df.select(replace(df.a, df.b).alias('r')).collect()
|
|
4503
|
+
[Row(r='ABC')]
|
|
4504
|
+
"""
|
|
4505
|
+
if replace is not None:
|
|
4506
|
+
return Column.invoke_anonymous_function(src, "replace", search, replace)
|
|
4507
|
+
else:
|
|
4508
|
+
return Column.invoke_anonymous_function(src, "replace", search)
|
|
4509
|
+
|
|
4510
|
+
|
|
4511
|
+
@meta()
|
|
4512
|
+
def right(str: ColumnOrName, len: ColumnOrName) -> Column:
|
|
4513
|
+
"""
|
|
4514
|
+
Returns the rightmost `len` (`len` can be string type) characters from the string `str`.
|
|
4515
|
+
If `len` is less than or equal to 0, the result is an empty string.
|
|
4516
|
+
|
|
4517
|
+
.. versionadded:: 3.5.0
|
|
4518
|
+
|
|
4519
|
+
Parameters
|
|
4520
|
+
----------
|
|
4521
|
+
str : :class:`~pyspark.sql.Column` or str
|
|
4522
|
+
Input column or strings.
|
|
4523
|
+
len : :class:`~pyspark.sql.Column` or str
|
|
4524
|
+
Input column or strings; the number of rightmost characters to return.
|
|
4525
|
+
|
|
4526
|
+
Examples
|
|
4527
|
+
--------
|
|
4528
|
+
>>> df = spark.createDataFrame([("Spark SQL", 3,)], ['a', 'b'])
|
|
4529
|
+
>>> df.select(right(df.a, df.b).alias('r')).collect()
|
|
4530
|
+
[Row(r='SQL')]
|
|
4531
|
+
"""
|
|
4532
|
+
return Column.invoke_expression_over_column(str, expression.Right, expression=len)
|
|
4533
|
+
|
|
4534
|
+
|
|
4535
|
+
rlike = regexp_like
|
|
4536
|
+
sha = sha1
|
|
4537
|
+
|
|
4538
|
+
|
|
4539
|
+
@meta()
|
|
4540
|
+
def sign(col: ColumnOrName) -> Column:
|
|
4541
|
+
"""
|
|
4542
|
+
Computes the signum of the given value.
|
|
4543
|
+
|
|
4544
|
+
.. versionadded:: 1.4.0
|
|
4545
|
+
|
|
4546
|
+
.. versionchanged:: 3.4.0
|
|
4547
|
+
Supports Spark Connect.
|
|
4548
|
+
|
|
4549
|
+
Parameters
|
|
4550
|
+
----------
|
|
4551
|
+
col : :class:`~pyspark.sql.Column` or str
|
|
4552
|
+
target column to compute on.
|
|
4553
|
+
|
|
4554
|
+
Returns
|
|
4555
|
+
-------
|
|
4556
|
+
:class:`~pyspark.sql.Column`
|
|
4557
|
+
the column for computed results.
|
|
4558
|
+
|
|
4559
|
+
Examples
|
|
4560
|
+
--------
|
|
4561
|
+
>>> import pyspark.sql.functions as sf
|
|
4562
|
+
>>> spark.range(1).select(
|
|
4563
|
+
... sf.sign(sf.lit(-5)),
|
|
4564
|
+
... sf.sign(sf.lit(6))
|
|
4565
|
+
... ).show()
|
|
4566
|
+
+--------+-------+
|
|
4567
|
+
|sign(-5)|sign(6)|
|
|
4568
|
+
+--------+-------+
|
|
4569
|
+
| -1.0| 1.0|
|
|
4570
|
+
+--------+-------+
|
|
4571
|
+
"""
|
|
4572
|
+
return Column.invoke_expression_over_column(col, expression.Sign)
|
|
4573
|
+
|
|
4574
|
+
|
|
4575
|
+
@meta(unsupported_engines="*")
|
|
4576
|
+
def some(col: ColumnOrName) -> Column:
|
|
4577
|
+
"""
|
|
4578
|
+
Aggregate function: returns true if at least one value of `col` is true.
|
|
4579
|
+
|
|
4580
|
+
.. versionadded:: 3.5.0
|
|
4581
|
+
|
|
4582
|
+
Parameters
|
|
4583
|
+
----------
|
|
4584
|
+
col : :class:`~pyspark.sql.Column` or str
|
|
4585
|
+
column to check if at least one value is true.
|
|
4586
|
+
|
|
4587
|
+
Returns
|
|
4588
|
+
-------
|
|
4589
|
+
:class:`~pyspark.sql.Column`
|
|
4590
|
+
true if at least one value of `col` is true, false otherwise.
|
|
4591
|
+
|
|
4592
|
+
Examples
|
|
4593
|
+
--------
|
|
4594
|
+
>>> import pyspark.sql.functions as sf
|
|
4595
|
+
>>> spark.createDataFrame(
|
|
4596
|
+
... [[True], [True], [True]], ["flag"]
|
|
4597
|
+
... ).select(sf.some("flag")).show()
|
|
4598
|
+
+----------+
|
|
4599
|
+
|some(flag)|
|
|
4600
|
+
+----------+
|
|
4601
|
+
| true|
|
|
4602
|
+
+----------+
|
|
4603
|
+
|
|
4604
|
+
>>> import pyspark.sql.functions as sf
|
|
4605
|
+
>>> spark.createDataFrame(
|
|
4606
|
+
... [[True], [False], [True]], ["flag"]
|
|
4607
|
+
... ).select(sf.some("flag")).show()
|
|
4608
|
+
+----------+
|
|
4609
|
+
|some(flag)|
|
|
4610
|
+
+----------+
|
|
4611
|
+
| true|
|
|
4612
|
+
+----------+
|
|
4613
|
+
|
|
4614
|
+
>>> import pyspark.sql.functions as sf
|
|
4615
|
+
>>> spark.createDataFrame(
|
|
4616
|
+
... [[False], [False], [False]], ["flag"]
|
|
4617
|
+
... ).select(sf.some("flag")).show()
|
|
4618
|
+
+----------+
|
|
4619
|
+
|some(flag)|
|
|
4620
|
+
+----------+
|
|
4621
|
+
| false|
|
|
4622
|
+
+----------+
|
|
4623
|
+
"""
|
|
4624
|
+
return Column.invoke_anonymous_function(col, "some")
|
|
4625
|
+
|
|
4626
|
+
|
|
4627
|
+
@meta(unsupported_engines="*")
|
|
4628
|
+
def spark_partition_id() -> Column:
|
|
4629
|
+
"""A column for partition ID.
|
|
4630
|
+
|
|
4631
|
+
.. versionadded:: 1.6.0
|
|
4632
|
+
|
|
4633
|
+
.. versionchanged:: 3.4.0
|
|
4634
|
+
Supports Spark Connect.
|
|
4635
|
+
|
|
4636
|
+
Notes
|
|
4637
|
+
-----
|
|
4638
|
+
This is non-deterministic because it depends on data partitioning and task scheduling.
|
|
4639
|
+
|
|
4640
|
+
Returns
|
|
4641
|
+
-------
|
|
4642
|
+
:class:`~pyspark.sql.Column`
|
|
4643
|
+
partition id the record belongs to.
|
|
4644
|
+
|
|
4645
|
+
Examples
|
|
4646
|
+
--------
|
|
4647
|
+
>>> df = spark.range(2)
|
|
4648
|
+
>>> df.repartition(1).select(spark_partition_id().alias("pid")).collect()
|
|
4649
|
+
[Row(pid=0), Row(pid=0)]
|
|
4650
|
+
"""
|
|
4651
|
+
return Column.invoke_anonymous_function(None, "spark_partition_id")
|
|
4652
|
+
|
|
4653
|
+
|
|
4654
|
+
@meta(unsupported_engines="*")
|
|
4655
|
+
def split_part(src: ColumnOrName, delimiter: ColumnOrName, partNum: ColumnOrName) -> Column:
|
|
4656
|
+
"""
|
|
4657
|
+
Splits `str` by delimiter and returns the requested part of the split (1-based).
|
|
4658
|
+
If any input is null, returns null. If `partNum` is out of range of split parts,
|
|
4659
|
+
returns an empty string. If `partNum` is 0, throws an error. If `partNum` is negative,
|
|
4660
|
+
the parts are counted backward from the end of the string.
|
|
4661
|
+
If the `delimiter` is an empty string, the `str` is not split.
|
|
4662
|
+
|
|
4663
|
+
.. versionadded:: 3.5.0
|
|
4664
|
+
|
|
4665
|
+
Parameters
|
|
4666
|
+
----------
|
|
4667
|
+
src : :class:`~pyspark.sql.Column` or str
|
|
4668
|
+
A column of string to be split.
|
|
4669
|
+
delimiter : :class:`~pyspark.sql.Column` or str
|
|
4670
|
+
A column of string, the delimiter used for split.
|
|
4671
|
+
partNum : :class:`~pyspark.sql.Column` or str
|
|
4672
|
+
A column of string, requested part of the split (1-based).
|
|
4673
|
+
|
|
4674
|
+
Examples
|
|
4675
|
+
--------
|
|
4676
|
+
>>> df = spark.createDataFrame([("11.12.13", ".", 3,)], ["a", "b", "c"])
|
|
4677
|
+
>>> df.select(split_part(df.a, df.b, df.c).alias('r')).collect()
|
|
4678
|
+
[Row(r='13')]
|
|
4679
|
+
"""
|
|
4680
|
+
return Column.invoke_anonymous_function(src, "split_part", delimiter, partNum)
|
|
4681
|
+
|
|
4682
|
+
|
|
4683
|
+
@meta()
|
|
4684
|
+
def startswith(str: ColumnOrName, prefix: ColumnOrName) -> Column:
|
|
4685
|
+
"""
|
|
4686
|
+
Returns a boolean. The value is True if str starts with prefix.
|
|
4687
|
+
Returns NULL if either input expression is NULL. Otherwise, returns False.
|
|
4688
|
+
Both str and prefix must be of STRING or BINARY type.
|
|
4689
|
+
|
|
4690
|
+
.. versionadded:: 3.5.0
|
|
4691
|
+
|
|
4692
|
+
Parameters
|
|
4693
|
+
----------
|
|
4694
|
+
str : :class:`~pyspark.sql.Column` or str
|
|
4695
|
+
A column of string.
|
|
4696
|
+
prefix : :class:`~pyspark.sql.Column` or str
|
|
4697
|
+
A column of string, the prefix.
|
|
4698
|
+
|
|
4699
|
+
Examples
|
|
4700
|
+
--------
|
|
4701
|
+
>>> df = spark.createDataFrame([("Spark SQL", "Spark",)], ["a", "b"])
|
|
4702
|
+
>>> df.select(startswith(df.a, df.b).alias('r')).collect()
|
|
4703
|
+
[Row(r=True)]
|
|
4704
|
+
|
|
4705
|
+
>>> df = spark.createDataFrame([("414243", "4142",)], ["e", "f"])
|
|
4706
|
+
>>> df = df.select(to_binary("e").alias("e"), to_binary("f").alias("f"))
|
|
4707
|
+
>>> df.printSchema()
|
|
4708
|
+
root
|
|
4709
|
+
|-- e: binary (nullable = true)
|
|
4710
|
+
|-- f: binary (nullable = true)
|
|
4711
|
+
>>> df.select(startswith("e", "f"), startswith("f", "e")).show()
|
|
4712
|
+
+----------------+----------------+
|
|
4713
|
+
|startswith(e, f)|startswith(f, e)|
|
|
4714
|
+
+----------------+----------------+
|
|
4715
|
+
| true| false|
|
|
4716
|
+
+----------------+----------------+
|
|
4717
|
+
"""
|
|
4718
|
+
return Column.invoke_expression_over_column(str, expression.StartsWith, expression=prefix)
|
|
4719
|
+
|
|
4720
|
+
|
|
4721
|
+
@meta(unsupported_engines="*")
|
|
4722
|
+
def std(col: ColumnOrName) -> Column:
|
|
4723
|
+
"""
|
|
4724
|
+
Aggregate function: alias for stddev_samp.
|
|
4725
|
+
|
|
4726
|
+
.. versionadded:: 3.5.0
|
|
4727
|
+
|
|
4728
|
+
Parameters
|
|
4729
|
+
----------
|
|
4730
|
+
col : :class:`~pyspark.sql.Column` or str
|
|
4731
|
+
target column to compute on.
|
|
4732
|
+
|
|
4733
|
+
Returns
|
|
4734
|
+
-------
|
|
4735
|
+
:class:`~pyspark.sql.Column`
|
|
4736
|
+
standard deviation of given column.
|
|
4737
|
+
|
|
4738
|
+
Examples
|
|
4739
|
+
--------
|
|
4740
|
+
>>> import pyspark.sql.functions as sf
|
|
4741
|
+
>>> spark.range(6).select(sf.std("id")).show()
|
|
4742
|
+
+------------------+
|
|
4743
|
+
| std(id)|
|
|
4744
|
+
+------------------+
|
|
4745
|
+
|1.8708286933869...|
|
|
4746
|
+
+------------------+
|
|
4747
|
+
"""
|
|
4748
|
+
return Column.invoke_anonymous_function(col, "std")
|
|
4749
|
+
|
|
4750
|
+
|
|
4751
|
+
@meta(unsupported_engines="*")
|
|
4752
|
+
def str_to_map(
|
|
4753
|
+
text: ColumnOrName,
|
|
4754
|
+
pairDelim: t.Optional[ColumnOrName] = None,
|
|
4755
|
+
keyValueDelim: t.Optional[ColumnOrName] = None,
|
|
4756
|
+
) -> Column:
|
|
4757
|
+
"""
|
|
4758
|
+
Creates a map after splitting the text into key/value pairs using delimiters.
|
|
4759
|
+
Both `pairDelim` and `keyValueDelim` are treated as regular expressions.
|
|
4760
|
+
|
|
4761
|
+
.. versionadded:: 3.5.0
|
|
4762
|
+
|
|
4763
|
+
Parameters
|
|
4764
|
+
----------
|
|
4765
|
+
text : :class:`~pyspark.sql.Column` or str
|
|
4766
|
+
Input column or strings.
|
|
4767
|
+
pairDelim : :class:`~pyspark.sql.Column` or str, optional
|
|
4768
|
+
delimiter to use to split pair.
|
|
4769
|
+
keyValueDelim : :class:`~pyspark.sql.Column` or str, optional
|
|
4770
|
+
delimiter to use to split key/value.
|
|
4771
|
+
|
|
4772
|
+
Examples
|
|
4773
|
+
--------
|
|
4774
|
+
>>> df = spark.createDataFrame([("a:1,b:2,c:3",)], ["e"])
|
|
4775
|
+
>>> df.select(str_to_map(df.e, lit(","), lit(":")).alias('r')).collect()
|
|
4776
|
+
[Row(r={'a': '1', 'b': '2', 'c': '3'})]
|
|
4777
|
+
|
|
4778
|
+
>>> df = spark.createDataFrame([("a:1,b:2,c:3",)], ["e"])
|
|
4779
|
+
>>> df.select(str_to_map(df.e, lit(",")).alias('r')).collect()
|
|
4780
|
+
[Row(r={'a': '1', 'b': '2', 'c': '3'})]
|
|
4781
|
+
|
|
4782
|
+
>>> df = spark.createDataFrame([("a:1,b:2,c:3",)], ["e"])
|
|
4783
|
+
>>> df.select(str_to_map(df.e).alias('r')).collect()
|
|
4784
|
+
[Row(r={'a': '1', 'b': '2', 'c': '3'})]
|
|
4785
|
+
"""
|
|
4786
|
+
if pairDelim is None:
|
|
4787
|
+
pairDelim = lit(",")
|
|
4788
|
+
if keyValueDelim is None:
|
|
4789
|
+
keyValueDelim = lit(":")
|
|
4790
|
+
return Column.invoke_expression_over_column(
|
|
4791
|
+
text, expression.StrToMap, pair_delim=pairDelim, key_value_delim=keyValueDelim
|
|
4792
|
+
)
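# A missing pairDelim defaults to lit(",") and a missing keyValueDelim to lit(":"),
# so str_to_map(col) is equivalent to str_to_map(col, lit(","), lit(":")), matching
# the defaults shown in the doctest above.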
|
|
4793
|
+
|
|
4794
|
+
|
|
4795
|
+
@meta(unsupported_engines="*")
|
|
4796
|
+
def substr(str: ColumnOrName, pos: ColumnOrName, len: t.Optional[ColumnOrName] = None) -> Column:
|
|
4797
|
+
"""
|
|
4798
|
+
Returns the substring of `str` that starts at `pos` and is of length `len`,
|
|
4799
|
+
or the slice of byte array that starts at `pos` and is of length `len`.
|
|
4800
|
+
|
|
4801
|
+
.. versionadded:: 3.5.0
|
|
4802
|
+
|
|
4803
|
+
Parameters
|
|
4804
|
+
----------
|
|
4805
|
+
str : :class:`~pyspark.sql.Column` or str
|
|
4806
|
+
A column of string.
|
|
4807
|
+
pos : :class:`~pyspark.sql.Column` or str
|
|
4808
|
+
The starting position (1-based) of the substring.
|
|
4809
|
+
len : :class:`~pyspark.sql.Column` or str, optional
|
|
4810
|
+
The length of the substring to return.
|
|
4811
|
+
|
|
4812
|
+
Examples
|
|
4813
|
+
--------
|
|
4814
|
+
>>> import pyspark.sql.functions as sf
|
|
4815
|
+
>>> spark.createDataFrame(
|
|
4816
|
+
... [("Spark SQL", 5, 1,)], ["a", "b", "c"]
|
|
4817
|
+
... ).select(sf.substr("a", "b", "c")).show()
|
|
4818
|
+
+---------------+
|
|
4819
|
+
|substr(a, b, c)|
|
|
4820
|
+
+---------------+
|
|
4821
|
+
| k|
|
|
4822
|
+
+---------------+
|
|
4823
|
+
|
|
4824
|
+
>>> import pyspark.sql.functions as sf
|
|
4825
|
+
>>> spark.createDataFrame(
|
|
4826
|
+
... [("Spark SQL", 5, 1,)], ["a", "b", "c"]
|
|
4827
|
+
... ).select(sf.substr("a", "b")).show()
|
|
4828
|
+
+------------------------+
|
|
4829
|
+
|substr(a, b, 2147483647)|
|
|
4830
|
+
+------------------------+
|
|
4831
|
+
| k SQL|
|
|
4832
|
+
+------------------------+
|
|
4833
|
+
"""
|
|
4834
|
+
if len is not None:
|
|
4835
|
+
return Column.invoke_anonymous_function(str, "substr", pos, len)
|
|
4836
|
+
else:
|
|
4837
|
+
return Column.invoke_anonymous_function(str, "substr", pos)
|
|
4838
|
+
|
|
4839
|
+
|
|
4840
|
+
@meta(unsupported_engines="*")
|
|
4841
|
+
def timestamp_micros(col: ColumnOrName) -> Column:
|
|
4842
|
+
"""
|
|
4843
|
+
Creates timestamp from the number of microseconds since UTC epoch.
|
|
4844
|
+
|
|
4845
|
+
.. versionadded:: 3.5.0
|
|
4846
|
+
|
|
4847
|
+
Parameters
|
|
4848
|
+
----------
|
|
4849
|
+
col : :class:`~pyspark.sql.Column` or str
|
|
4850
|
+
unix time values.
|
|
4851
|
+
|
|
4852
|
+
Returns
|
|
4853
|
+
-------
|
|
4854
|
+
:class:`~pyspark.sql.Column`
|
|
4855
|
+
converted timestamp value.
|
|
4856
|
+
|
|
4857
|
+
Examples
|
|
4858
|
+
--------
|
|
4859
|
+
>>> spark.conf.set("spark.sql.session.timeZone", "UTC")
|
|
4860
|
+
>>> time_df = spark.createDataFrame([(1230219000,)], ['unix_time'])
|
|
4861
|
+
>>> time_df.select(timestamp_micros(time_df.unix_time).alias('ts')).show()
|
|
4862
|
+
+--------------------+
|
|
4863
|
+
| ts|
|
|
4864
|
+
+--------------------+
|
|
4865
|
+
|1970-01-01 00:20:...|
|
|
4866
|
+
+--------------------+
|
|
4867
|
+
>>> time_df.select(timestamp_micros('unix_time').alias('ts')).printSchema()
|
|
4868
|
+
root
|
|
4869
|
+
|-- ts: timestamp (nullable = true)
|
|
4870
|
+
>>> spark.conf.unset("spark.sql.session.timeZone")
|
|
4871
|
+
"""
|
|
4872
|
+
return Column.invoke_anonymous_function(col, "timestamp_micros")
|
|
4873
|
+
|
|
4874
|
+
|
|
4875
|
+
@meta(unsupported_engines="*")
|
|
4876
|
+
def timestamp_millis(col: ColumnOrName) -> Column:
|
|
4877
|
+
"""
|
|
4878
|
+
Creates timestamp from the number of milliseconds since UTC epoch.
|
|
4879
|
+
|
|
4880
|
+
.. versionadded:: 3.5.0
|
|
4881
|
+
|
|
4882
|
+
Parameters
|
|
4883
|
+
----------
|
|
4884
|
+
col : :class:`~pyspark.sql.Column` or str
|
|
4885
|
+
unix time values.
|
|
4886
|
+
|
|
4887
|
+
Returns
|
|
4888
|
+
-------
|
|
4889
|
+
:class:`~pyspark.sql.Column`
|
|
4890
|
+
converted timestamp value.
|
|
4891
|
+
|
|
4892
|
+
Examples
|
|
4893
|
+
--------
|
|
4894
|
+
>>> spark.conf.set("spark.sql.session.timeZone", "UTC")
|
|
4895
|
+
>>> time_df = spark.createDataFrame([(1230219000,)], ['unix_time'])
|
|
4896
|
+
>>> time_df.select(timestamp_millis(time_df.unix_time).alias('ts')).show()
|
|
4897
|
+
+-------------------+
|
|
4898
|
+
| ts|
|
|
4899
|
+
+-------------------+
|
|
4900
|
+
|1970-01-15 05:43:39|
|
|
4901
|
+
+-------------------+
|
|
4902
|
+
>>> time_df.select(timestamp_millis('unix_time').alias('ts')).printSchema()
|
|
4903
|
+
root
|
|
4904
|
+
|-- ts: timestamp (nullable = true)
|
|
4905
|
+
>>> spark.conf.unset("spark.sql.session.timeZone")
|
|
4906
|
+
"""
|
|
4907
|
+
return Column.invoke_anonymous_function(col, "timestamp_millis")
|
|
4908
|
+
|
|
4909
|
+
|
|
4910
|
+
@meta(unsupported_engines="*")
|
|
4911
|
+
def to_char(col: ColumnOrName, format: ColumnOrName) -> Column:
|
|
4912
|
+
"""
|
|
4913
|
+
Convert `col` to a string based on the `format`.
|
|
4914
|
+
Throws an exception if the conversion fails. The format can consist of the following
|
|
4915
|
+
characters, case insensitive:
|
|
4916
|
+
'0' or '9': Specifies an expected digit between 0 and 9. A sequence of 0 or 9 in the
|
|
4917
|
+
format string matches a sequence of digits in the input value, generating a result
|
|
4918
|
+
string of the same length as the corresponding sequence in the format string.
|
|
4919
|
+
The result string is left-padded with zeros if the 0/9 sequence comprises more digits
|
|
4920
|
+
than the matching part of the decimal value, starts with 0, and is before the decimal
|
|
4921
|
+
point. Otherwise, it is padded with spaces.
|
|
4922
|
+
'.' or 'D': Specifies the position of the decimal point (optional, only allowed once).
|
|
4923
|
+
',' or 'G': Specifies the position of the grouping (thousands) separator (,).
|
|
4924
|
+
There must be a 0 or 9 to the left and right of each grouping separator.
|
|
4925
|
+
'$': Specifies the location of the $ currency sign. This character may only be specified once.
|
|
4926
|
+
'S' or 'MI': Specifies the position of a '-' or '+' sign (optional, only allowed once at
|
|
4927
|
+
the beginning or end of the format string). Note that 'S' prints '+' for positive
|
|
4928
|
+
values but 'MI' prints a space.
|
|
4929
|
+
'PR': Only allowed at the end of the format string; specifies that the result string
|
|
4930
|
+
will be wrapped by angle brackets if the input value is negative.
|
|
4931
|
+
|
|
4932
|
+
.. versionadded:: 3.5.0
|
|
4933
|
+
|
|
4934
|
+
Parameters
|
|
4935
|
+
----------
|
|
4936
|
+
col : :class:`~pyspark.sql.Column` or str
|
|
4937
|
+
Input column or strings.
|
|
4938
|
+
format : :class:`~pyspark.sql.Column` or str
|
|
4939
|
+
format to use to convert char values.
|
|
4940
|
+
|
|
4941
|
+
Examples
|
|
4942
|
+
--------
|
|
4943
|
+
>>> df = spark.createDataFrame([(78.12,)], ["e"])
|
|
4944
|
+
>>> df.select(to_char(df.e, lit("$99.99")).alias('r')).collect()
|
|
4945
|
+
[Row(r='$78.12')]
|
|
4946
|
+
"""
|
|
4947
|
+
return Column.invoke_anonymous_function(col, "to_char", format)
|
|
4948
|
+
|
|
4949
|
+
|
|
4950
|
+
@meta(unsupported_engines=["bigquery", "duckdb"])
|
|
4951
|
+
def to_number(col: ColumnOrName, format: ColumnOrName) -> Column:
|
|
4952
|
+
"""
|
|
4953
|
+
Convert string 'col' to a number based on the string format 'format'.
|
|
4954
|
+
Throws an exception if the conversion fails. The format can consist of the following
|
|
4955
|
+
characters, case insensitive:
|
|
4956
|
+
'0' or '9': Specifies an expected digit between 0 and 9. A sequence of 0 or 9 in the
|
|
4957
|
+
format string matches a sequence of digits in the input string. If the 0/9
|
|
4958
|
+
sequence starts with 0 and is before the decimal point, it can only match a digit
|
|
4959
|
+
sequence of the same size. Otherwise, if the sequence starts with 9 or is after
|
|
4960
|
+
the decimal point, it can match a digit sequence that has the same or smaller size.
|
|
4961
|
+
'.' or 'D': Specifies the position of the decimal point (optional, only allowed once).
|
|
4962
|
+
',' or 'G': Specifies the position of the grouping (thousands) separator (,).
|
|
4963
|
+
There must be a 0 or 9 to the left and right of each grouping separator.
|
|
4964
|
+
'col' must match the grouping separator relevant for the size of the number.
|
|
4965
|
+
'$': Specifies the location of the $ currency sign. This character may only be
|
|
4966
|
+
specified once.
|
|
4967
|
+
'S' or 'MI': Specifies the position of a '-' or '+' sign (optional, only allowed
|
|
4968
|
+
once at the beginning or end of the format string). Note that 'S' allows '-'
|
|
4969
|
+
but 'MI' does not.
|
|
4970
|
+
'PR': Only allowed at the end of the format string; specifies that 'col' indicates a
|
|
4971
|
+
negative number with wrapping angled brackets.
|
|
4972
|
+
|
|
4973
|
+
.. versionadded:: 3.5.0
|
|
4974
|
+
|
|
4975
|
+
Parameters
|
|
4976
|
+
----------
|
|
4977
|
+
col : :class:`~pyspark.sql.Column` or str
|
|
4978
|
+
Input column or strings.
|
|
4979
|
+
format : :class:`~pyspark.sql.Column` or str
|
|
4980
|
+
format to use to convert number values.
|
|
4981
|
+
|
|
4982
|
+
Examples
|
|
4983
|
+
--------
|
|
4984
|
+
>>> df = spark.createDataFrame([("$78.12",)], ["e"])
|
|
4985
|
+
>>> df.select(to_number(df.e, lit("$99.99")).alias('r')).collect()
|
|
4986
|
+
[Row(r=Decimal('78.12'))]
|
|
4987
|
+
"""
|
|
4988
|
+
return Column.invoke_expression_over_column(col, expression.ToNumber, format=format)
|
|
4989
|
+
|
|
4990
|
+
|
|
4991
|
+
def to_str(value: t.Any) -> t.Optional[str]:
|
|
4992
|
+
"""
|
|
4993
|
+
A wrapper over str(), but converts bool values to lower case strings.
|
|
4994
|
+
If None is given, returns None instead of converting it to the string "None".
|
|
4995
|
+
"""
|
|
4996
|
+
if isinstance(value, bool):
|
|
4997
|
+
return str(value).lower()
|
|
4998
|
+
elif value is None:
|
|
4999
|
+
return value
|
|
5000
|
+
else:
|
|
5001
|
+
return str(value)
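# Illustrative behaviour (examples added here for clarity, not doctests from the module):
#   to_str(True) -> "true"
#   to_str(None) -> None
#   to_str(10)   -> "10"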
|
|
5002
|
+
|
|
5003
|
+
|
|
5004
|
+
@meta(unsupported_engines="*")
|
|
5005
|
+
def to_timestamp_ltz(
|
|
5006
|
+
timestamp: ColumnOrName,
|
|
5007
|
+
format: t.Optional[ColumnOrName] = None,
|
|
5008
|
+
) -> Column:
|
|
5009
|
+
"""
|
|
5010
|
+
Parses the `timestamp` with the `format` to a timestamp with local time zone.
|
|
5011
|
+
Returns null with invalid input.
|
|
5012
|
+
|
|
5013
|
+
.. versionadded:: 3.5.0
|
|
5014
|
+
|
|
5015
|
+
Parameters
|
|
5016
|
+
----------
|
|
5017
|
+
timestamp : :class:`~pyspark.sql.Column` or str
|
|
5018
|
+
Input column or strings.
|
|
5019
|
+
format : :class:`~pyspark.sql.Column` or str, optional
|
|
5020
|
+
format to use to convert type `TimestampType` timestamp values.
|
|
5021
|
+
|
|
5022
|
+
Examples
|
|
5023
|
+
--------
|
|
5024
|
+
>>> df = spark.createDataFrame([("2016-12-31",)], ["e"])
|
|
5025
|
+
>>> df.select(to_timestamp_ltz(df.e, lit("yyyy-MM-dd")).alias('r')).collect()
|
|
5026
|
+
... # doctest: +SKIP
|
|
5027
|
+
[Row(r=datetime.datetime(2016, 12, 31, 0, 0))]
|
|
5028
|
+
|
|
5029
|
+
>>> df = spark.createDataFrame([("2016-12-31",)], ["e"])
|
|
5030
|
+
>>> df.select(to_timestamp_ltz(df.e).alias('r')).collect()
|
|
5031
|
+
... # doctest: +SKIP
|
|
5032
|
+
[Row(r=datetime.datetime(2016, 12, 31, 0, 0))]
|
|
5033
|
+
"""
|
|
5034
|
+
if format is not None:
|
|
5035
|
+
return Column.invoke_anonymous_function(timestamp, "to_timestamp_ltz", format)
|
|
5036
|
+
else:
|
|
5037
|
+
return Column.invoke_anonymous_function(timestamp, "to_timestamp_ltz")
|
|
5038
|
+
|
|
5039
|
+
|
|
5040
|
+
@meta(unsupported_engines="*")
|
|
5041
|
+
def to_timestamp_ntz(
|
|
5042
|
+
timestamp: ColumnOrName,
|
|
5043
|
+
format: t.Optional[ColumnOrName] = None,
|
|
5044
|
+
) -> Column:
|
|
5045
|
+
"""
|
|
5046
|
+
Parses the `timestamp` with the `format` to a timestamp without time zone.
|
|
5047
|
+
Returns null with invalid input.
|
|
5048
|
+
|
|
5049
|
+
.. versionadded:: 3.5.0
|
|
5050
|
+
|
|
5051
|
+
Parameters
|
|
5052
|
+
----------
|
|
5053
|
+
timestamp : :class:`~pyspark.sql.Column` or str
|
|
5054
|
+
Input column or strings.
|
|
5055
|
+
format : :class:`~pyspark.sql.Column` or str, optional
|
|
5056
|
+
format to use to convert type `TimestampNTZType` timestamp values.
|
|
5057
|
+
|
|
5058
|
+
Examples
|
|
5059
|
+
--------
|
|
5060
|
+
>>> df = spark.createDataFrame([("2016-04-08",)], ["e"])
|
|
5061
|
+
>>> df.select(to_timestamp_ntz(df.e, lit("yyyy-MM-dd")).alias('r')).collect()
|
|
5062
|
+
... # doctest: +SKIP
|
|
5063
|
+
[Row(r=datetime.datetime(2016, 4, 8, 0, 0))]
|
|
5064
|
+
|
|
5065
|
+
>>> df = spark.createDataFrame([("2016-04-08",)], ["e"])
|
|
5066
|
+
>>> df.select(to_timestamp_ntz(df.e).alias('r')).collect()
|
|
5067
|
+
... # doctest: +SKIP
|
|
5068
|
+
[Row(r=datetime.datetime(2016, 4, 8, 0, 0))]
|
|
5069
|
+
"""
|
|
5070
|
+
if format is not None:
|
|
5071
|
+
return Column.invoke_anonymous_function(timestamp, "to_timestamp_ntz", format)
|
|
5072
|
+
else:
|
|
5073
|
+
return Column.invoke_anonymous_function(timestamp, "to_timestamp_ntz")
|
|
5074
|
+
|
|
5075
|
+
|
|
5076
|
+
@meta(unsupported_engines=["bigquery", "postgres", "snowflake"])
|
|
5077
|
+
def to_unix_timestamp(
|
|
5078
|
+
timestamp: ColumnOrName,
|
|
5079
|
+
format: t.Optional[ColumnOrName] = None,
|
|
5080
|
+
) -> Column:
|
|
5081
|
+
"""
|
|
5082
|
+
Returns the UNIX timestamp of the given time.
|
|
5083
|
+
|
|
5084
|
+
.. versionadded:: 3.5.0
|
|
5085
|
+
|
|
5086
|
+
Parameters
|
|
5087
|
+
----------
|
|
5088
|
+
timestamp : :class:`~pyspark.sql.Column` or str
|
|
5089
|
+
Input column or strings.
|
|
5090
|
+
format : :class:`~pyspark.sql.Column` or str, optional
|
|
5091
|
+
format to use to convert UNIX timestamp values.
|
|
5092
|
+
|
|
5093
|
+
Examples
|
|
5094
|
+
--------
|
|
5095
|
+
>>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
|
|
5096
|
+
>>> df = spark.createDataFrame([("2016-04-08",)], ["e"])
|
|
5097
|
+
>>> df.select(to_unix_timestamp(df.e, lit("yyyy-MM-dd")).alias('r')).collect()
|
|
5098
|
+
[Row(r=1460098800)]
|
|
5099
|
+
>>> spark.conf.unset("spark.sql.session.timeZone")
|
|
5100
|
+
|
|
5101
|
+
>>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
|
|
5102
|
+
>>> df = spark.createDataFrame([("2016-04-08",)], ["e"])
|
|
5103
|
+
>>> df.select(to_unix_timestamp(df.e).alias('r')).collect()
|
|
5104
|
+
[Row(r=None)]
|
|
5105
|
+
>>> spark.conf.unset("spark.sql.session.timeZone")
|
|
5106
|
+
"""
|
|
5107
|
+
if format is not None:
|
|
5108
|
+
return Column.invoke_expression_over_column(timestamp, expression.StrToUnix, format=format)
|
|
5109
|
+
else:
|
|
5110
|
+
return Column.invoke_expression_over_column(timestamp, expression.StrToUnix)
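# Built on sqlglot's StrToUnix expression rather than an anonymous function call, so
# supported engines can render an engine-appropriate unix-timestamp conversion instead
# of a verbatim TO_UNIX_TIMESTAMP(...) call.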
|
|
5111
|
+
|
|
5112
|
+
|
|
5113
|
+
@meta(unsupported_engines="*")
|
|
5114
|
+
def to_varchar(col: ColumnOrName, format: ColumnOrName) -> Column:
|
|
5115
|
+
"""
|
|
5116
|
+
Convert `col` to a string based on the `format`.
|
|
5117
|
+
Throws an exception if the conversion fails. The format can consist of the following
|
|
5118
|
+
characters, case insensitive:
|
|
5119
|
+
'0' or '9': Specifies an expected digit between 0 and 9. A sequence of 0 or 9 in the
|
|
5120
|
+
format string matches a sequence of digits in the input value, generating a result
|
|
5121
|
+
string of the same length as the corresponding sequence in the format string.
|
|
5122
|
+
The result string is left-padded with zeros if the 0/9 sequence comprises more digits
|
|
5123
|
+
than the matching part of the decimal value, starts with 0, and is before the decimal
|
|
5124
|
+
point. Otherwise, it is padded with spaces.
|
|
5125
|
+
'.' or 'D': Specifies the position of the decimal point (optional, only allowed once).
|
|
5126
|
+
',' or 'G': Specifies the position of the grouping (thousands) separator (,).
|
|
5127
|
+
There must be a 0 or 9 to the left and right of each grouping separator.
|
|
5128
|
+
'$': Specifies the location of the $ currency sign. This character may only be specified once.
|
|
5129
|
+
'S' or 'MI': Specifies the position of a '-' or '+' sign (optional, only allowed once at
|
|
5130
|
+
the beginning or end of the format string). Note that 'S' prints '+' for positive
|
|
5131
|
+
values but 'MI' prints a space.
|
|
5132
|
+
'PR': Only allowed at the end of the format string; specifies that the result string
|
|
5133
|
+
will be wrapped by angle brackets if the input value is negative.
|
|
5134
|
+
|
|
5135
|
+
.. versionadded:: 3.5.0
|
|
5136
|
+
|
|
5137
|
+
Parameters
|
|
5138
|
+
----------
|
|
5139
|
+
col : :class:`~pyspark.sql.Column` or str
|
|
5140
|
+
Input column or strings.
|
|
5141
|
+
format : :class:`~pyspark.sql.Column` or str
|
|
5142
|
+
format to use to convert char values.
|
|
5143
|
+
|
|
5144
|
+
Examples
|
|
5145
|
+
--------
|
|
5146
|
+
>>> df = spark.createDataFrame([(78.12,)], ["e"])
|
|
5147
|
+
>>> df.select(to_varchar(df.e, lit("$99.99")).alias('r')).collect()
|
|
5148
|
+
[Row(r='$78.12')]
|
|
5149
|
+
"""
|
|
5150
|
+
return Column.invoke_anonymous_function(col, "to_varchar", format)
|
|
5151
|
+
|
|
5152
|
+
|
|
5153
|
+
@meta(unsupported_engines="*")
|
|
5154
|
+
def try_aes_decrypt(
|
|
5155
|
+
input: ColumnOrName,
|
|
5156
|
+
key: ColumnOrName,
|
|
5157
|
+
mode: t.Optional[ColumnOrName] = None,
|
|
5158
|
+
padding: t.Optional[ColumnOrName] = None,
|
|
5159
|
+
aad: t.Optional[ColumnOrName] = None,
|
|
5160
|
+
) -> Column:
|
|
5161
|
+
"""
|
|
5162
|
+
This is a special version of `aes_decrypt` that performs the same operation,
|
|
5163
|
+
but returns a NULL value instead of raising an error if the decryption cannot be performed.
|
|
5164
|
+
Returns a decrypted value of `input` using AES in `mode` with `padding`. Key lengths of 16,
|
|
5165
|
+
24 and 32 bytes are supported. Supported combinations of (`mode`, `padding`) are ('ECB',
|
|
5166
|
+
'PKCS'), ('GCM', 'NONE') and ('CBC', 'PKCS'). Optional additional authenticated data (AAD) is
|
|
5167
|
+
only supported for GCM. If provided for encryption, the identical AAD value must be provided
|
|
5168
|
+
for decryption. The default mode is GCM.
|
|
5169
|
+
|
|
5170
|
+
.. versionadded:: 3.5.0
|
|
5171
|
+
|
|
5172
|
+
Parameters
|
|
5173
|
+
----------
|
|
5174
|
+
input : :class:`~pyspark.sql.Column` or str
|
|
5175
|
+
The binary value to decrypt.
|
|
5176
|
+
key : :class:`~pyspark.sql.Column` or str
|
|
5177
|
+
The passphrase to use to decrypt the data.
|
|
5178
|
+
mode : :class:`~pyspark.sql.Column` or str, optional
|
|
5179
|
+
Specifies which block cipher mode should be used to decrypt messages. Valid modes: ECB,
|
|
5180
|
+
GCM, CBC.
|
|
5181
|
+
padding : :class:`~pyspark.sql.Column` or str, optional
|
|
5182
|
+
Specifies how to pad messages whose length is not a multiple of the block size. Valid
|
|
5183
|
+
values: PKCS, NONE, DEFAULT. The DEFAULT padding means PKCS for ECB, NONE for GCM and PKCS
|
|
5184
|
+
for CBC.
|
|
5185
|
+
aad : :class:`~pyspark.sql.Column` or str, optional
|
|
5186
|
+
Optional additional authenticated data. Only supported for GCM mode. This can be any
|
|
5187
|
+
free-form input and must be provided for both encryption and decryption.
|
|
5188
|
+
|
|
5189
|
+
Examples
|
|
5190
|
+
--------
|
|
5191
|
+
>>> df = spark.createDataFrame([(
|
|
5192
|
+
... "AAAAAAAAAAAAAAAAQiYi+sTLm7KD9UcZ2nlRdYDe/PX4",
|
|
5193
|
+
... "abcdefghijklmnop12345678ABCDEFGH", "GCM", "DEFAULT",
|
|
5194
|
+
... "This is an AAD mixed into the input",)],
|
|
5195
|
+
... ["input", "key", "mode", "padding", "aad"]
|
|
5196
|
+
... )
|
|
5197
|
+
>>> df.select(try_aes_decrypt(
|
|
5198
|
+
... unbase64(df.input), df.key, df.mode, df.padding, df.aad).alias('r')
|
|
5199
|
+
... ).collect()
|
|
5200
|
+
[Row(r=bytearray(b'Spark'))]
|
|
5201
|
+
|
|
5202
|
+
>>> df = spark.createDataFrame([(
|
|
5203
|
+
... "AAAAAAAAAAAAAAAAAAAAAPSd4mWyMZ5mhvjiAPQJnfg=",
|
|
5204
|
+
... "abcdefghijklmnop12345678ABCDEFGH", "CBC", "DEFAULT",)],
|
|
5205
|
+
... ["input", "key", "mode", "padding"]
|
|
5206
|
+
... )
|
|
5207
|
+
>>> df.select(try_aes_decrypt(
|
|
5208
|
+
... unbase64(df.input), df.key, df.mode, df.padding).alias('r')
|
|
5209
|
+
... ).collect()
|
|
5210
|
+
[Row(r=bytearray(b'Spark'))]
|
|
5211
|
+
|
|
5212
|
+
>>> df.select(try_aes_decrypt(unbase64(df.input), df.key, df.mode).alias('r')).collect()
|
|
5213
|
+
[Row(r=bytearray(b'Spark'))]
|
|
5214
|
+
|
|
5215
|
+
>>> df = spark.createDataFrame([(
|
|
5216
|
+
... "83F16B2AA704794132802D248E6BFD4E380078182D1544813898AC97E709B28A94",
|
|
5217
|
+
... "0000111122223333",)],
|
|
5218
|
+
... ["input", "key"]
|
|
5219
|
+
... )
|
|
5220
|
+
>>> df.select(try_aes_decrypt(unhex(df.input), df.key).alias('r')).collect()
|
|
5221
|
+
[Row(r=bytearray(b'Spark'))]
|
|
5222
|
+
"""
|
|
5223
|
+
_mode = lit("GCM") if mode is None else mode
|
|
5224
|
+
_padding = lit("DEFAULT") if padding is None else padding
|
|
5225
|
+
_aad = lit("") if aad is None else aad
|
|
5226
|
+
return Column.invoke_anonymous_function(input, "try_aes_decrypt", key, _mode, _padding, _aad)
|
|
5227
|
+
|
|
5228
|
+
|
|
5229
|
+
@meta(unsupported_engines=["bigquery", "snowflake"])
|
|
5230
|
+
def try_element_at(col: ColumnOrName, extraction: ColumnOrName) -> Column:
|
|
5231
|
+
"""
|
|
5232
|
+
(array, index) - Returns the element of the array at the given (1-based) index. If the index is 0, Spark will
|
|
5233
|
+
throw an error. If index < 0, accesses elements from the last to the first. The function
|
|
5234
|
+
always returns NULL if the index exceeds the length of the array.
|
|
5235
|
+
|
|
5236
|
+
(map, key) - Returns value for given key. The function always returns NULL if the key is not
|
|
5237
|
+
contained in the map.
|
|
5238
|
+
|
|
5239
|
+
.. versionadded:: 3.5.0
|
|
5240
|
+
|
|
5241
|
+
Parameters
|
|
5242
|
+
----------
|
|
5243
|
+
col : :class:`~pyspark.sql.Column` or str
|
|
5244
|
+
name of column containing array or map
|
|
5245
|
+
extraction :
|
|
5246
|
+
index to check for in array or key to check for in map
|
|
5247
|
+
|
|
5248
|
+
Examples
|
|
5249
|
+
--------
|
|
5250
|
+
>>> df = spark.createDataFrame([(["a", "b", "c"],)], ['data'])
|
|
5251
|
+
>>> df.select(try_element_at(df.data, lit(1)).alias('r')).collect()
|
|
5252
|
+
[Row(r='a')]
|
|
5253
|
+
>>> df.select(try_element_at(df.data, lit(-1)).alias('r')).collect()
|
|
5254
|
+
[Row(r='c')]
|
|
5255
|
+
|
|
5256
|
+
>>> df = spark.createDataFrame([({"a": 1.0, "b": 2.0},)], ['data'])
|
|
5257
|
+
>>> df.select(try_element_at(df.data, lit("a")).alias('r')).collect()
|
|
5258
|
+
[Row(r=1.0)]
|
|
5259
|
+
"""
|
|
5260
|
+
return Column(
|
|
5261
|
+
expression.Bracket(
|
|
5262
|
+
this=Column.ensure_col(col).expression,
|
|
5263
|
+
expressions=[Column.ensure_col(extraction).expression],
|
|
5264
|
+
safe=True,
|
|
5265
|
+
)
|
|
5266
|
+
)
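# Implemented as a sqlglot Bracket expression with safe=True instead of an anonymous
# function call; the safe flag requests the null-returning lookup described above
# (out-of-range indexes and missing map keys yield NULL) from the SQL generator.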
|
|
5267
|
+
|
|
5268
|
+
|
|
5269
|
+
@meta(unsupported_engines="*")
|
|
5270
|
+
def try_to_timestamp(col: ColumnOrName, format: t.Optional[ColumnOrName] = None) -> Column:
|
|
5271
|
+
"""
|
|
5272
|
+
Parses the `col` with the `format` to a timestamp. The function always
|
|
5273
|
+
returns null on an invalid input with/without ANSI SQL mode enabled. The result data type is
|
|
5274
|
+
consistent with the value of configuration `spark.sql.timestampType`.
|
|
5275
|
+
|
|
5276
|
+
.. versionadded:: 3.5.0
|
|
5277
|
+
|
|
5278
|
+
Parameters
|
|
5279
|
+
----------
|
|
5280
|
+
col : :class:`~pyspark.sql.Column` or str
|
|
5281
|
+
column values to convert.
|
|
5282
|
+
format: str, optional
|
|
5283
|
+
format to use to convert timestamp values.
|
|
5284
|
+
|
|
5285
|
+
Examples
|
|
5286
|
+
--------
|
|
5287
|
+
>>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
|
|
5288
|
+
>>> df.select(try_to_timestamp(df.t).alias('dt')).collect()
|
|
5289
|
+
[Row(dt=datetime.datetime(1997, 2, 28, 10, 30))]
|
|
5290
|
+
|
|
5291
|
+
>>> df.select(try_to_timestamp(df.t, lit('yyyy-MM-dd HH:mm:ss')).alias('dt')).collect()
|
|
5292
|
+
[Row(dt=datetime.datetime(1997, 2, 28, 10, 30))]
|
|
5293
|
+
"""
|
|
5294
|
+
if format is not None:
|
|
5295
|
+
return Column.invoke_anonymous_function(col, "try_to_timestamp", format)
|
|
5296
|
+
else:
|
|
5297
|
+
return Column.invoke_anonymous_function(col, "try_to_timestamp")
|
|
5298
|
+
|
|
5299
|
+
|
|
5300
|
+
@meta()
|
|
5301
|
+
def ucase(str: ColumnOrName) -> Column:
|
|
5302
|
+
"""
|
|
5303
|
+
Returns `str` with all characters changed to uppercase.
|
|
5304
|
+
|
|
5305
|
+
.. versionadded:: 3.5.0
|
|
5306
|
+
|
|
5307
|
+
Parameters
|
|
5308
|
+
----------
|
|
5309
|
+
str : :class:`~pyspark.sql.Column` or str
|
|
5310
|
+
Input column or strings.
|
|
5311
|
+
|
|
5312
|
+
Examples
|
|
5313
|
+
--------
|
|
5314
|
+
>>> import pyspark.sql.functions as sf
|
|
5315
|
+
>>> spark.range(1).select(sf.ucase(sf.lit("Spark"))).show()
|
|
5316
|
+
+------------+
|
|
5317
|
+
|ucase(Spark)|
|
|
5318
|
+
+------------+
|
|
5319
|
+
| SPARK|
|
|
5320
|
+
+------------+
|
|
5321
|
+
"""
|
|
5322
|
+
return Column.invoke_expression_over_column(str, expression.Upper)
|
|
5323
|
+
|
|
5324
|
+
|
|
5325
|
+
@meta()
|
|
5326
|
+
def unix_date(col: ColumnOrName) -> Column:
|
|
5327
|
+
"""Returns the number of days since 1970-01-01.
|
|
5328
|
+
|
|
5329
|
+
.. versionadded:: 3.5.0
|
|
5330
|
+
|
|
5331
|
+
Examples
|
|
5332
|
+
--------
|
|
5333
|
+
>>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
|
|
5334
|
+
>>> df = spark.createDataFrame([('1970-01-02',)], ['t'])
|
|
5335
|
+
>>> df.select(unix_date(to_date(df.t)).alias('n')).collect()
|
|
5336
|
+
[Row(n=1)]
|
|
5337
|
+
>>> spark.conf.unset("spark.sql.session.timeZone")
|
|
5338
|
+
"""
|
|
5339
|
+
return Column.invoke_expression_over_column(col, expression.UnixDate)
|
|
5340
|
+
|
|
5341
|
+
|
|
5342
|
+
@meta(unsupported_engines="*")
|
|
5343
|
+
def unix_micros(col: ColumnOrName) -> Column:
|
|
5344
|
+
"""Returns the number of microseconds since 1970-01-01 00:00:00 UTC.
|
|
5345
|
+
|
|
5346
|
+
.. versionadded:: 3.5.0
|
|
5347
|
+
|
|
5348
|
+
Examples
|
|
5349
|
+
--------
|
|
5350
|
+
>>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
|
|
5351
|
+
>>> df = spark.createDataFrame([('2015-07-22 10:00:00',)], ['t'])
|
|
5352
|
+
>>> df.select(unix_micros(to_timestamp(df.t)).alias('n')).collect()
|
|
5353
|
+
[Row(n=1437584400000000)]
|
|
5354
|
+
>>> spark.conf.unset("spark.sql.session.timeZone")
|
|
5355
|
+
"""
|
|
5356
|
+
return Column.invoke_anonymous_function(col, "unix_micros")
|
|
5357
|
+
|
|
5358
|
+
|
|
5359
|
+
@meta(unsupported_engines="*")
|
|
5360
|
+
def unix_millis(col: ColumnOrName) -> Column:
|
|
5361
|
+
"""Returns the number of milliseconds since 1970-01-01 00:00:00 UTC.
|
|
5362
|
+
Truncates higher levels of precision.
|
|
5363
|
+
|
|
5364
|
+
.. versionadded:: 3.5.0
|
|
5365
|
+
|
|
5366
|
+
Examples
|
|
5367
|
+
--------
|
|
5368
|
+
>>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
|
|
5369
|
+
>>> df = spark.createDataFrame([('2015-07-22 10:00:00',)], ['t'])
|
|
5370
|
+
>>> df.select(unix_millis(to_timestamp(df.t)).alias('n')).collect()
|
|
5371
|
+
[Row(n=1437584400000)]
|
|
5372
|
+
>>> spark.conf.unset("spark.sql.session.timeZone")
|
|
5373
|
+
"""
|
|
5374
|
+
return Column.invoke_anonymous_function(col, "unix_millis")
|
|
5375
|
+
|
|
5376
|
+
|
|
5377
|
+
@meta(unsupported_engines="*")
|
|
5378
|
+
def unix_seconds(col: ColumnOrName) -> Column:
|
|
5379
|
+
"""Returns the number of seconds since 1970-01-01 00:00:00 UTC.
|
|
5380
|
+
Truncates higher levels of precision.
|
|
5381
|
+
|
|
5382
|
+
.. versionadded:: 3.5.0
|
|
5383
|
+
|
|
5384
|
+
Examples
|
|
5385
|
+
--------
|
|
5386
|
+
>>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
|
|
5387
|
+
>>> df = spark.createDataFrame([('2015-07-22 10:00:00',)], ['t'])
|
|
5388
|
+
>>> df.select(unix_seconds(to_timestamp(df.t)).alias('n')).collect()
|
|
5389
|
+
[Row(n=1437584400)]
|
|
5390
|
+
>>> spark.conf.unset("spark.sql.session.timeZone")
|
|
5391
|
+
"""
|
|
5392
|
+
return Column.invoke_anonymous_function(col, "unix_seconds")
|
|
5393
|
+
|
|
5394
|
+
|
|
5395
|
+
@meta(unsupported_engines="*")
|
|
5396
|
+
def url_decode(str: ColumnOrName) -> Column:
|
|
5397
|
+
"""
|
|
5398
|
+
Decodes a `str` in 'application/x-www-form-urlencoded' format
|
|
5399
|
+
using a specific encoding scheme.
|
|
5400
|
+
|
|
5401
|
+
.. versionadded:: 3.5.0
|
|
5402
|
+
|
|
5403
|
+
Parameters
|
|
5404
|
+
----------
|
|
5405
|
+
str : :class:`~pyspark.sql.Column` or str
|
|
5406
|
+
A column of string to decode.
|
|
5407
|
+
|
|
5408
|
+
Examples
|
|
5409
|
+
--------
|
|
5410
|
+
>>> df = spark.createDataFrame([("https%3A%2F%2Fspark.apache.org",)], ["a"])
|
|
5411
|
+
>>> df.select(url_decode(df.a).alias('r')).collect()
|
|
5412
|
+
[Row(r='https://spark.apache.org')]
|
|
5413
|
+
"""
|
|
5414
|
+
return Column.invoke_anonymous_function(str, "url_decode")
|
|
5415
|
+
|
|
5416
|
+
|
|
5417
|
+
@meta(unsupported_engines="*")
|
|
5418
|
+
def url_encode(str: ColumnOrName) -> Column:
|
|
5419
|
+
"""
|
|
5420
|
+
Translates a string into 'application/x-www-form-urlencoded' format
|
|
5421
|
+
using a specific encoding scheme.
|
|
5422
|
+
|
|
5423
|
+
.. versionadded:: 3.5.0
|
|
5424
|
+
|
|
5425
|
+
Parameters
|
|
5426
|
+
----------
|
|
5427
|
+
str : :class:`~pyspark.sql.Column` or str
|
|
5428
|
+
A column of string to encode.
|
|
5429
|
+
|
|
5430
|
+
Examples
|
|
5431
|
+
--------
|
|
5432
|
+
>>> df = spark.createDataFrame([("https://spark.apache.org",)], ["a"])
|
|
5433
|
+
>>> df.select(url_encode(df.a).alias('r')).collect()
|
|
5434
|
+
[Row(r='https%3A%2F%2Fspark.apache.org')]
|
|
5435
|
+
"""
|
|
5436
|
+
return Column.invoke_anonymous_function(str, "url_encode")
|
|
5437
|
+
|
|
5438
|
+
|
|
5439
|
+
user = current_user
|
|
5440
|
+
|
|
5441
|
+
|
|
5442
|
+
@meta(unsupported_engines="*")
|
|
5443
|
+
def version() -> Column:
|
|
5444
|
+
"""
|
|
5445
|
+
Returns the Spark version. The string contains 2 fields, the first being a release version
|
|
5446
|
+
and the second being a git revision.
|
|
5447
|
+
|
|
5448
|
+
.. versionadded:: 3.5.0
|
|
5449
|
+
|
|
5450
|
+
Examples
|
|
5451
|
+
--------
|
|
5452
|
+
>>> df = spark.range(1)
|
|
5453
|
+
>>> df.select(version()).show(truncate=False) # doctest: +SKIP
|
|
5454
|
+
+----------------------------------------------+
|
|
5455
|
+
|version() |
|
|
5456
|
+
+----------------------------------------------+
|
|
5457
|
+
|3.5.0 cafbea5b13623276517a9d716f75745eff91f616|
|
|
5458
|
+
+----------------------------------------------+
|
|
5459
|
+
"""
|
|
5460
|
+
return Column.invoke_anonymous_function(None, "version")
|
|
5461
|
+
|
|
5462
|
+
|
|
5463
|
+
@meta(unsupported_engines="*")
|
|
5464
|
+
def weekday(col: ColumnOrName) -> Column:
|
|
5465
|
+
"""
|
|
5466
|
+
Returns the day of the week for date/timestamp (0 = Monday, 1 = Tuesday, ..., 6 = Sunday).
|
|
5467
|
+
|
|
5468
|
+
.. versionadded:: 3.5.0
|
|
5469
|
+
|
|
5470
|
+
Parameters
|
|
5471
|
+
----------
|
|
5472
|
+
col : :class:`~pyspark.sql.Column` or str
|
|
5473
|
+
target date/timestamp column to work on.
|
|
5474
|
+
|
|
5475
|
+
Returns
|
|
5476
|
+
-------
|
|
5477
|
+
:class:`~pyspark.sql.Column`
|
|
5478
|
+
the day of the week for date/timestamp (0 = Monday, 1 = Tuesday, ..., 6 = Sunday).
|
|
5479
|
+
|
|
5480
|
+
Examples
|
|
5481
|
+
--------
|
|
5482
|
+
>>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
|
|
5483
|
+
>>> df.select(weekday('dt').alias('day')).show()
|
|
5484
|
+
+---+
|
|
5485
|
+
|day|
|
|
5486
|
+
+---+
|
|
5487
|
+
| 2|
|
|
5488
|
+
+---+
|
|
5489
|
+
"""
|
|
5490
|
+
return Column.invoke_anonymous_function(col, "weekday")
|
|
5491
|
+
|
|
5492
|
+
|
|
5493
|
+
@meta(unsupported_engines="*")
|
|
5494
|
+
def width_bucket(
|
|
5495
|
+
v: ColumnOrName,
|
|
5496
|
+
min: ColumnOrName,
|
|
5497
|
+
max: ColumnOrName,
|
|
5498
|
+
numBucket: t.Union[ColumnOrName, int],
|
|
5499
|
+
) -> Column:
|
|
5500
|
+
"""
|
|
5501
|
+
Returns the bucket number into which the value of this expression would fall
|
|
5502
|
+
after being evaluated. Note that the input arguments must satisfy the function's preconditions;
|
|
5503
|
+
otherwise, the method will return null.
|
|
5504
|
+
|
|
5505
|
+
.. versionadded:: 3.5.0
|
|
5506
|
+
|
|
5507
|
+
Parameters
|
|
5508
|
+
----------
|
|
5509
|
+
v : str or :class:`~pyspark.sql.Column`
|
|
5510
|
+
value to compute a bucket number in the histogram
|
|
5511
|
+
min : str or :class:`~pyspark.sql.Column`
|
|
5512
|
+
minimum value of the histogram
|
|
5513
|
+
max : str or :class:`~pyspark.sql.Column`
|
|
5514
|
+
maximum value of the histogram
|
|
5515
|
+
numBucket : str, :class:`~pyspark.sql.Column` or int
|
|
5516
|
+
the number of buckets
|
|
5517
|
+
|
|
5518
|
+
Returns
|
|
5519
|
+
-------
|
|
5520
|
+
:class:`~pyspark.sql.Column`
|
|
5521
|
+
the bucket number into which the value would fall after being evaluated
|
|
5522
|
+
|
|
5523
|
+
Examples
|
|
5524
|
+
--------
|
|
5525
|
+
>>> df = spark.createDataFrame([
|
|
5526
|
+
... (5.3, 0.2, 10.6, 5),
|
|
5527
|
+
... (-2.1, 1.3, 3.4, 3),
|
|
5528
|
+
... (8.1, 0.0, 5.7, 4),
|
|
5529
|
+
... (-0.9, 5.2, 0.5, 2)],
|
|
5530
|
+
... ['v', 'min', 'max', 'n'])
|
|
5531
|
+
>>> df.select(width_bucket('v', 'min', 'max', 'n')).show()
|
|
5532
|
+
+----------------------------+
|
|
5533
|
+
|width_bucket(v, min, max, n)|
|
|
5534
|
+
+----------------------------+
|
|
5535
|
+
| 3|
|
|
5536
|
+
| 0|
|
|
5537
|
+
| 5|
|
|
5538
|
+
| 3|
|
|
5539
|
+
+----------------------------+
|
|
5540
|
+
"""
|
|
5541
|
+
numBucket = lit(numBucket) if isinstance(numBucket, int) else numBucket
|
|
5542
|
+
return Column.invoke_anonymous_function(v, "width_bucket", min, max, numBucket)
|
|
5543
|
+
|
|
5544
|
+
|
|
5545
|
+
@meta(unsupported_engines=["*", "spark"])
|
|
5546
|
+
def window_time(
|
|
5547
|
+
windowColumn: ColumnOrName,
|
|
5548
|
+
) -> Column:
|
|
5549
|
+
"""Computes the event time from a window column. The column window values are produced
|
|
5550
|
+
by window aggregating operators and are of type `STRUCT<start: TIMESTAMP, end: TIMESTAMP>`
|
|
5551
|
+
where start is inclusive and end is exclusive. The event time of records produced by window
|
|
5552
|
+
aggregating operators can be computed as ``window_time(window)``, which equals
|
|
5553
|
+
``window.end - lit(1).alias("microsecond")`` (as microsecond is the minimal supported event
|
|
5554
|
+
time precision). The window column must be one produced by a window aggregating operator.
|
|
5555
|
+
|
|
5556
|
+
.. versionadded:: 3.4.0
|
|
5557
|
+
|
|
5558
|
+
Parameters
|
|
5559
|
+
----------
|
|
5560
|
+
windowColumn : :class:`~pyspark.sql.Column`
|
|
5561
|
+
The window column of a window aggregate records.
|
|
5562
|
+
|
|
5563
|
+
Returns
|
|
5564
|
+
-------
|
|
5565
|
+
:class:`~pyspark.sql.Column`
|
|
5566
|
+
the column for computed results.
|
|
5567
|
+
|
|
5568
|
+
Notes
|
|
5569
|
+
-----
|
|
5570
|
+
Supports Spark Connect.
|
|
5571
|
+
|
|
5572
|
+
Examples
|
|
5573
|
+
--------
|
|
5574
|
+
>>> import datetime
|
|
5575
|
+
>>> df = spark.createDataFrame(
|
|
5576
|
+
... [(datetime.datetime(2016, 3, 11, 9, 0, 7), 1)],
|
|
5577
|
+
... ).toDF("date", "val")
|
|
5578
|
+
|
|
5579
|
+
Group the data into 5 second time windows and aggregate as sum.
|
|
5580
|
+
|
|
5581
|
+
>>> w = df.groupBy(window("date", "5 seconds")).agg(sum("val").alias("sum"))
|
|
5582
|
+
|
|
5583
|
+
Extract the window event time using the window_time function.
|
|
5584
|
+
|
|
5585
|
+
>>> w.select(
|
|
5586
|
+
... w.window.end.cast("string").alias("end"),
|
|
5587
|
+
... window_time(w.window).cast("string").alias("window_time"),
|
|
5588
|
+
... "sum"
|
|
5589
|
+
... ).collect()
|
|
5590
|
+
[Row(end='2016-03-11 09:00:10', window_time='2016-03-11 09:00:09.999999', sum=1)]
|
|
5591
|
+
"""
|
|
5592
|
+
return Column.invoke_anonymous_function(windowColumn, "window_time")
|
|
5593
|
+
|
|
5594
|
+
|
|
5595
|
+
@meta(unsupported_engines="*")
|
|
5596
|
+
def xpath(xml: ColumnOrName, path: ColumnOrName) -> Column:
|
|
5597
|
+
"""
|
|
5598
|
+
Returns a string array of values within the nodes of xml that match the XPath expression.
|
|
5599
|
+
|
|
5600
|
+
.. versionadded:: 3.5.0
|
|
5601
|
+
|
|
5602
|
+
Examples
|
|
5603
|
+
--------
|
|
5604
|
+
>>> df = spark.createDataFrame(
|
|
5605
|
+
... [('<a><b>b1</b><b>b2</b><b>b3</b><c>c1</c><c>c2</c></a>',)], ['x'])
|
|
5606
|
+
>>> df.select(xpath(df.x, lit('a/b/text()')).alias('r')).collect()
|
|
5607
|
+
[Row(r=['b1', 'b2', 'b3'])]
|
|
5608
|
+
"""
|
|
5609
|
+
return Column.invoke_anonymous_function(xml, "xpath", path)
|
|
5610
|
+
|
|
5611
|
+
|
|
5612
|
+
@meta(unsupported_engines="*")
|
|
5613
|
+
def xpath_boolean(xml: ColumnOrName, path: ColumnOrName) -> Column:
|
|
5614
|
+
"""
|
|
5615
|
+
Returns true if the XPath expression evaluates to true, or if a matching node is found.
|
|
5616
|
+
|
|
5617
|
+
.. versionadded:: 3.5.0
|
|
5618
|
+
|
|
5619
|
+
Examples
|
|
5620
|
+
--------
|
|
5621
|
+
>>> df = spark.createDataFrame([('<a><b>1</b></a>',)], ['x'])
|
|
5622
|
+
>>> df.select(xpath_boolean(df.x, lit('a/b')).alias('r')).collect()
|
|
5623
|
+
[Row(r=True)]
|
|
5624
|
+
"""
|
|
5625
|
+
return Column.invoke_anonymous_function(xml, "xpath_boolean", path)
|
|
5626
|
+
|
|
5627
|
+
|
|
5628
|
+
@meta(unsupported_engines="*")
|
|
5629
|
+
def xpath_double(xml: ColumnOrName, path: ColumnOrName) -> Column:
|
|
5630
|
+
"""
|
|
5631
|
+
Returns a double value, the value zero if no match is found,
|
|
5632
|
+
or NaN if a match is found but the value is non-numeric.
|
|
5633
|
+
|
|
5634
|
+
.. versionadded:: 3.5.0
|
|
5635
|
+
|
|
5636
|
+
Examples
|
|
5637
|
+
--------
|
|
5638
|
+
>>> df = spark.createDataFrame([('<a><b>1</b><b>2</b></a>',)], ['x'])
|
|
5639
|
+
>>> df.select(xpath_double(df.x, lit('sum(a/b)')).alias('r')).collect()
|
|
5640
|
+
[Row(r=3.0)]
|
|
5641
|
+
"""
|
|
5642
|
+
return Column.invoke_anonymous_function(xml, "xpath_double", path)
|
|
5643
|
+
|
|
5644
|
+
|
|
5645
|
+
@meta(unsupported_engines="*")
|
|
5646
|
+
def xpath_float(xml: ColumnOrName, path: ColumnOrName) -> Column:
|
|
5647
|
+
"""
|
|
5648
|
+
Returns a float value, the value zero if no match is found,
|
|
5649
|
+
or NaN if a match is found but the value is non-numeric.
|
|
5650
|
+
|
|
5651
|
+
.. versionadded:: 3.5.0
|
|
5652
|
+
|
|
5653
|
+
Examples
|
|
5654
|
+
--------
|
|
5655
|
+
>>> df = spark.createDataFrame([('<a><b>1</b><b>2</b></a>',)], ['x'])
|
|
5656
|
+
>>> df.select(xpath_float(df.x, lit('sum(a/b)')).alias('r')).collect()
|
|
5657
|
+
[Row(r=3.0)]
|
|
5658
|
+
"""
|
|
5659
|
+
return Column.invoke_anonymous_function(xml, "xpath_float", path)
|
|
5660
|
+
|
|
5661
|
+
|
|
5662
|
+
@meta(unsupported_engines="*")
|
|
5663
|
+
def xpath_int(xml: ColumnOrName, path: ColumnOrName) -> Column:
|
|
5664
|
+
"""
|
|
5665
|
+
Returns an integer value, or the value zero if no match is found,
|
|
5666
|
+
or if a match is found but the value is non-numeric.
|
|
5667
|
+
|
|
5668
|
+
.. versionadded:: 3.5.0
|
|
5669
|
+
|
|
5670
|
+
Examples
|
|
5671
|
+
--------
|
|
5672
|
+
>>> df = spark.createDataFrame([('<a><b>1</b><b>2</b></a>',)], ['x'])
|
|
5673
|
+
>>> df.select(xpath_int(df.x, lit('sum(a/b)')).alias('r')).collect()
|
|
5674
|
+
[Row(r=3)]
|
|
5675
|
+
"""
|
|
5676
|
+
return Column.invoke_anonymous_function(xml, "xpath_int", path)
|
|
5677
|
+
|
|
5678
|
+
|
|
5679
|
+
@meta(unsupported_engines="*")
|
|
5680
|
+
def xpath_long(xml: ColumnOrName, path: ColumnOrName) -> Column:
|
|
5681
|
+
"""
|
|
5682
|
+
Returns a long integer value, or the value zero if no match is found,
|
|
5683
|
+
or if a match is found but the value is non-numeric.
|
|
5684
|
+
|
|
5685
|
+
.. versionadded:: 3.5.0
|
|
5686
|
+
|
|
5687
|
+
Examples
|
|
5688
|
+
--------
|
|
5689
|
+
>>> df = spark.createDataFrame([('<a><b>1</b><b>2</b></a>',)], ['x'])
|
|
5690
|
+
>>> df.select(xpath_long(df.x, lit('sum(a/b)')).alias('r')).collect()
|
|
5691
|
+
[Row(r=3)]
|
|
5692
|
+
"""
|
|
5693
|
+
return Column.invoke_anonymous_function(xml, "xpath_long", path)
|
|
5694
|
+
|
|
5695
|
+
|
|
5696
|
+
@meta(unsupported_engines="*")
|
|
5697
|
+
def xpath_number(xml: ColumnOrName, path: ColumnOrName) -> Column:
|
|
5698
|
+
"""
|
|
5699
|
+
Returns a double value, the value zero if no match is found,
|
|
5700
|
+
or NaN if a match is found but the value is non-numeric.
|
|
5701
|
+
|
|
5702
|
+
.. versionadded:: 3.5.0
|
|
5703
|
+
|
|
5704
|
+
Examples
|
|
5705
|
+
--------
|
|
5706
|
+
>>> import pyspark.sql.functions as sf
|
|
5707
|
+
>>> spark.createDataFrame(
|
|
5708
|
+
... [('<a><b>1</b><b>2</b></a>',)], ['x']
|
|
5709
|
+
... ).select(sf.xpath_number('x', sf.lit('sum(a/b)'))).show()
|
|
5710
|
+
+-------------------------+
|
|
5711
|
+
|xpath_number(x, sum(a/b))|
|
|
5712
|
+
+-------------------------+
|
|
5713
|
+
| 3.0|
|
|
5714
|
+
+-------------------------+
|
|
5715
|
+
"""
|
|
5716
|
+
return Column.invoke_anonymous_function(xml, "xpath_number", path)
|
|
5717
|
+
|
|
5718
|
+
|
|
5719
|
+
@meta(unsupported_engines="*")
|
|
5720
|
+
def xpath_short(xml: ColumnOrName, path: ColumnOrName) -> Column:
|
|
5721
|
+
"""
|
|
5722
|
+
Returns a short integer value, or the value zero if no match is found,
|
|
5723
|
+
or if a match is found but the value is non-numeric.
|
|
5724
|
+
|
|
5725
|
+
.. versionadded:: 3.5.0
|
|
5726
|
+
|
|
5727
|
+
Examples
|
|
5728
|
+
--------
|
|
5729
|
+
>>> df = spark.createDataFrame([('<a><b>1</b><b>2</b></a>',)], ['x'])
|
|
5730
|
+
>>> df.select(xpath_short(df.x, lit('sum(a/b)')).alias('r')).collect()
|
|
5731
|
+
[Row(r=3)]
|
|
5732
|
+
"""
|
|
5733
|
+
return Column.invoke_anonymous_function(xml, "xpath_short", path)
|
|
5734
|
+
|
|
5735
|
+
|
|
5736
|
+
@meta(unsupported_engines="*")
|
|
5737
|
+
def xpath_string(xml: ColumnOrName, path: ColumnOrName) -> Column:
|
|
5738
|
+
"""
|
|
5739
|
+
Returns the text contents of the first xml node that matches the XPath expression.
|
|
5740
|
+
|
|
5741
|
+
.. versionadded:: 3.5.0
|
|
5742
|
+
|
|
5743
|
+
Examples
|
|
5744
|
+
--------
|
|
5745
|
+
>>> df = spark.createDataFrame([('<a><b>b</b><c>cc</c></a>',)], ['x'])
|
|
5746
|
+
>>> df.select(xpath_string(df.x, lit('a/c')).alias('r')).collect()
|
|
5747
|
+
[Row(r='cc')]
|
|
5748
|
+
"""
|
|
5749
|
+
return Column.invoke_anonymous_function(xml, "xpath_string", path)
|
|
5750
|
+
|
|
5751
|
+
|
|
5752
|
+
@meta(unsupported_engines="*")
|
|
5753
|
+
def years(col: ColumnOrName) -> Column:
|
|
5754
|
+
"""
|
|
5755
|
+
Partition transform function: A transform for timestamps and dates
|
|
5756
|
+
to partition data into years.
|
|
5757
|
+
|
|
5758
|
+
.. versionadded:: 3.1.0
|
|
5759
|
+
|
|
5760
|
+
.. versionchanged:: 3.4.0
|
|
5761
|
+
Supports Spark Connect.
|
|
5762
|
+
|
|
5763
|
+
Parameters
|
|
5764
|
+
----------
|
|
5765
|
+
col : :class:`~pyspark.sql.Column` or str
|
|
5766
|
+
target date or timestamp column to work on.
|
|
5767
|
+
|
|
5768
|
+
Returns
|
|
5769
|
+
-------
|
|
5770
|
+
:class:`~pyspark.sql.Column`
|
|
5771
|
+
data partitioned by years.
|
|
5772
|
+
|
|
5773
|
+
Examples
|
|
5774
|
+
--------
|
|
5775
|
+
>>> df.writeTo("catalog.db.table").partitionedBy( # doctest: +SKIP
|
|
5776
|
+
... years("ts")
|
|
5777
|
+
... ).createOrReplace()
|
|
5778
|
+
|
|
5779
|
+
Notes
|
|
5780
|
+
-----
|
|
5781
|
+
This function can be used only in combination with
|
|
5782
|
+
:py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy`
|
|
5783
|
+
method of the `DataFrameWriterV2`.
|
|
5784
|
+
|
|
5785
|
+
"""
|
|
5786
|
+
return Column.invoke_anonymous_function(col, "years")
|
|
5787
|
+
|
|
5788
|
+
|
|
1777
5789
|
@meta()
|
|
1778
5790
|
def _lambda_quoted(value: str) -> t.Optional[bool]:
|
|
1779
5791
|
return False if value == "_" else None
|