sqlframe 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
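For orientation, here is a minimal usage sketch (not part of the diff) showing how a couple of the functions this release adds, such as any_value and the new power alias for pow, might be called through sqlframe's PySpark-style API. The DuckDBSession / functions import paths follow sqlframe's documented layout and are assumptions, not something this diff records.

    # Hypothetical sketch: exercising two helpers added in 1.10.0 via the
    # PySpark-compatible API. Assumes the DuckDB backend; module paths and the
    # default in-memory session are assumptions, not taken from this diff.
    from sqlframe.duckdb import DuckDBSession
    from sqlframe.duckdb import functions as F

    session = DuckDBSession()
    df = session.createDataFrame([(1, 2.0), (2, 3.0)], ["id", "x"])

    # power() is introduced in 1.10.0 as an alias of pow()
    df.select(F.power("x", 2).alias("x_squared")).show()

    # any_value() picks an arbitrary value per group
    df.groupBy("id").agg(F.any_value("x").alias("an_x")).show()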
@@ -2,6 +2,7 @@

  from __future__ import annotations

+ import decimal
  import logging
  import typing as t

@@ -13,6 +14,8 @@ from sqlframe.base.column import Column
  from sqlframe.base.decorators import func_metadata as meta

  if t.TYPE_CHECKING:
+     from pyspark.sql.session import SparkContext
+
      from sqlframe.base._typing import ColumnOrLiteral, ColumnOrName
      from sqlframe.base.session import DF
      from sqlframe.base.types import ArrayType, StructType
@@ -424,6 +427,9 @@ def pow(col1: t.Union[ColumnOrName, float], col2: t.Union[ColumnOrName, float])
      return Column.invoke_expression_over_column(col1_value, expression.Pow, expression=col2_value)


+ power = pow
+
+
  @meta()
  def row_number() -> Column:
      return Column(expression.Anonymous(this="ROW_NUMBER"))
@@ -947,7 +953,7 @@ def timestamp_seconds(col: ColumnOrName) -> Column:
      return Column.invoke_expression_over_column(col, expression.UnixToTime)


- @meta(unsupported_engines=["duckdb", "postgres", "bigquery", "redshift", "snowflake", "spark"])
+ @meta(unsupported_engines=["*", "spark"])
  def window(
      timeColumn: ColumnOrName,
      windowDuration: str,
@@ -1278,6 +1284,73 @@ def array(*cols: t.Union[ColumnOrName, t.Iterable[ColumnOrName]]) -> Column:
      return Column.invoke_expression_over_column(None, expression.Array, expressions=columns)


+ @meta(unsupported_engines="*")
+ def array_agg(col: ColumnOrName) -> Column:
+     return Column.invoke_expression_over_column(col, expression.ArrayAgg)
+
+
+ @meta(unsupported_engines="*")
+ def array_append(col: ColumnOrName, value: ColumnOrLiteral) -> Column:
+     value = value if isinstance(value, Column) else lit(value)
+     return Column.invoke_anonymous_function(col, "ARRAY_APPEND", value)
+
+
+ @meta(unsupported_engines="*")
+ def array_compact(col: ColumnOrName) -> Column:
+     return Column.invoke_anonymous_function(col, "ARRAY_COMPACT")
+
+
+ @meta(unsupported_engines="*")
+ def array_insert(
+     col: ColumnOrName, pos: t.Union[ColumnOrName, int], value: ColumnOrLiteral
+ ) -> Column:
+     value = value if isinstance(value, Column) else lit(value)
+     if isinstance(pos, int):
+         pos = lit(pos)
+     return Column.invoke_anonymous_function(col, "ARRAY_INSERT", pos, value)  # type: ignore
+
+
+ @meta(unsupported_engines="*")
+ def array_prepend(col: ColumnOrName, value: ColumnOrLiteral) -> Column:
+     value = value if isinstance(value, Column) else lit(value)
+     return Column.invoke_anonymous_function(col, "ARRAY_PREPEND", value)
+
+
+ @meta(unsupported_engines="*")
+ def array_size(col: ColumnOrName) -> Column:
+     return Column.invoke_anonymous_function(col, "ARRAY_SIZE")
+
+
+ @meta(unsupported_engines="*")
+ def bit_and(col: ColumnOrName) -> Column:
+     return Column.invoke_anonymous_function(col, "BIT_AND")
+
+
+ @meta(unsupported_engines="*")
+ def bit_or(col: ColumnOrName) -> Column:
+     return Column.invoke_anonymous_function(col, "BIT_OR")
+
+
+ @meta(unsupported_engines="*")
+ def bit_xor(col: ColumnOrName) -> Column:
+     return Column.invoke_anonymous_function(col, "BIT_XOR")
+
+
+ @meta(unsupported_engines="*")
+ def bit_count(col: ColumnOrName) -> Column:
+     return Column.invoke_anonymous_function(col, "BIT_COUNT")
+
+
+ @meta(unsupported_engines="*")
+ def bit_get(col: ColumnOrName, pos: ColumnOrName) -> Column:
+     return Column.invoke_anonymous_function(col, "BIT_GET", pos)
+
+
+ @meta(unsupported_engines="*")
+ def getbit(col: ColumnOrName, pos: ColumnOrName) -> Column:
+     return Column.invoke_anonymous_function(col, "GETBIT", pos)
+
+
  @meta(unsupported_engines=["bigquery", "postgres"])
  def create_map(*cols: t.Union[ColumnOrName, t.Iterable[ColumnOrName]]) -> Column:
      cols = list(_flatten(cols)) if not isinstance(cols[0], (str, Column)) else cols  # type: ignore
@@ -1767,6 +1840,31 @@ def aes_encrypt(
      return Column.invoke_anonymous_function(input, "AES_ENCRYPT", *columns)


+ @meta(unsupported_engines="*")
+ def bitmap_bit_position(col: ColumnOrName) -> Column:
+     return Column.invoke_anonymous_function(col, "BITMAP_BIT_POSITION")
+
+
+ @meta(unsupported_engines="*")
+ def bitmap_bucket_number(col: ColumnOrName) -> Column:
+     return Column.invoke_anonymous_function(col, "BITMAP_BUCKET_NUMBER")
+
+
+ @meta(unsupported_engines="*")
+ def bitmap_construct_agg(col: ColumnOrName) -> Column:
+     return Column.invoke_anonymous_function(col, "BITMAP_CONSTRUCT_AGG")
+
+
+ @meta(unsupported_engines="*")
+ def bitmap_count(col: ColumnOrName) -> Column:
+     return Column.invoke_anonymous_function(col, "BITMAP_COUNT")
+
+
+ @meta(unsupported_engines="*")
+ def bitmap_or_agg(col: ColumnOrName) -> Column:
+     return Column.invoke_anonymous_function(col, "BITMAP_OR_AGG")
+
+
  @meta(unsupported_engines="*")
  def to_binary(col: ColumnOrName, format: t.Optional[ColumnOrName] = None) -> Column:
      if format is not None:
@@ -1774,6 +1872,3920 @@ def to_binary(col: ColumnOrName, format: t.Optional[ColumnOrName] = None) -> Col
      return Column.invoke_anonymous_function(col, "TO_BINARY")


+ @meta()
+ def any_value(col: ColumnOrName, ignoreNulls: t.Optional[t.Union[bool, Column]] = None) -> Column:
+     column = Column.invoke_expression_over_column(col, expression.AnyValue)
+     if ignoreNulls:
+         return Column(expression.IgnoreNulls(this=column.expression))
+     return column
+
+
+ @meta(unsupported_engines="*")
+ def approx_percentile(
+     col: ColumnOrName,
+     percentage: t.Union[Column, float, t.List[float], t.Tuple[float]],
+     accuracy: t.Union[Column, float] = 10000,
+ ) -> Column:
+     percentage = lit(percentage) if not isinstance(accuracy, Column) else percentage
+     accuracy = lit(accuracy) if not isinstance(accuracy, Column) else accuracy
+
+     return Column.invoke_expression_over_column(
+         col, expression.ApproxQuantile, quantile=percentage, accuracy=accuracy
+     )
+
+
+ @meta()
+ def bool_and(col: ColumnOrName) -> Column:
+     return Column.invoke_expression_over_column(col, expression.LogicalAnd)
+
+
+ @meta()
+ def bool_or(col: ColumnOrName) -> Column:
+     return Column.invoke_expression_over_column(col, expression.LogicalOr)
+
+
+ @meta(unsupported_engines="*")
+ def btrim(str: ColumnOrName, trim: t.Optional[ColumnOrName] = None) -> Column:
+     if trim is not None:
+         return Column.invoke_anonymous_function(str, "btrim", trim)
+     else:
+         return Column.invoke_anonymous_function(str, "btrim")
+
+
+ @meta(unsupported_engines="*")
+ def bucket(numBuckets: t.Union[Column, int], col: ColumnOrName) -> Column:
+     numBuckets = lit(numBuckets) if isinstance(numBuckets, int) else numBuckets
+     return Column.invoke_anonymous_function(numBuckets, "bucket", col)
+
+
+ @meta()
+ def call_function(funcName: str, *cols: ColumnOrName) -> Column:
+     cols = ensure_list(cols)  # type: ignore
+     if len(cols) > 1:
+         return Column.invoke_anonymous_function(cols[0], funcName, *cols[1:])
+     return Column.invoke_anonymous_function(cols[0], funcName)
+
+
+ # @meta(unsupported_engines="*")
+ # def call_udf(udfName: str, *cols: ColumnOrName) -> Column:
+ #     """
+ #     Call an user-defined function.
+ #
+ #     .. versionadded:: 3.4.0
+ #
+ #     Parameters
+ #     ----------
+ #     udfName : str
+ #         name of the user defined function (UDF)
+ #     cols : :class:`~pyspark.sql.Column` or str
+ #         column names or :class:`~pyspark.sql.Column`\\s to be used in the UDF
+ #
+ #     Returns
+ #     -------
+ #     :class:`~pyspark.sql.Column`
+ #         result of executed udf.
+ #
+ #     Examples
+ #     --------
+ #     >>> from pyspark.sql.functions import call_udf, col
+ #     >>> from pyspark.sql.types import IntegerType, StringType
+ #     >>> df = spark.createDataFrame([(1, "a"),(2, "b"), (3, "c")],["id", "name"])
+ #     >>> _ = spark.udf.register("intX2", lambda i: i * 2, IntegerType())
+ #     >>> df.select(call_udf("intX2", "id")).show()
+ #     +---------+
+ #     |intX2(id)|
+ #     +---------+
+ #     |        2|
+ #     |        4|
+ #     |        6|
+ #     +---------+
+ #     >>> _ = spark.udf.register("strX2", lambda s: s * 2, StringType())
+ #     >>> df.select(call_udf("strX2", col("name"))).show()
+ #     +-----------+
+ #     |strX2(name)|
+ #     +-----------+
+ #     |         aa|
+ #     |         bb|
+ #     |         cc|
+ #     +-----------+
+ #     """
+ #     sc = get_active_spark_context()
+ #     return _invoke_function("call_udf", udfName, _to_seq(sc, cols, _to_java_column))
+ #
+ #
+ # @pytest.mark.parametrize(
+ #     "expression, expected",
+ #     [
+ #         (SF.call_udf("cola"), "CALL_UDF(cola)"),
+ #         (SF.call_udf(SF.col("cola")), "CALL_UDF(cola)"),
+ #     ],
+ # )
+ # def test_call_udf(expression, expected):
+ #     assert expression.sql() == expected
+ #
+ # def test_call_udf(get_session_and_func, get_func):
+ #     session, call_udf = get_session_and_func("call_udf")
+ #     >>> from pyspark.sql.functions import call_udf, col
+ #     >>> from pyspark.sql.types import IntegerType, StringType
+ #     >>> df = spark.createDataFrame([(1, "a"),(2, "b"), (3, "c")],["id", "name"])
+ #     >>> _ = spark.udf.register("intX2", lambda i: i * 2, IntegerType())
+ #     >>> df.select(call_udf("intX2", "id")).show()
+ #     +---------+
+ #     |intX2(id)|
+ #     +---------+
+ #     |        2|
+ #     |        4|
+ #     |        6|
+ #     +---------+
+ #     >>> _ = spark.udf.register("strX2", lambda s: s * 2, StringType())
+ #     >>> df.select(call_udf("strX2", col("name"))).show()
+ #     +-----------+
+ #     |strX2(name)|
+ #     +-----------+
+ #     |         aa|
+ #     |         bb|
+ #     |         cc|
+ #     +-----------+
+
+
+ @meta(unsupported_engines="*")
+ def cardinality(col: ColumnOrName) -> Column:
+     return Column.invoke_anonymous_function(col, "cardinality")
+
+
+ @meta()
+ def char(col: ColumnOrName) -> Column:
+     return Column.invoke_expression_over_column(col, expression.Chr)
+
+
+ @meta(unsupported_engines="*")
+ def char_length(str: ColumnOrName) -> Column:
+     return Column.invoke_anonymous_function(str, "char_length")
+
+
+ @meta(unsupported_engines="*")
+ def character_length(str: ColumnOrName) -> Column:
+     return Column.invoke_anonymous_function(str, "character_length")
+
+
+ @meta(unsupported_engines="*")
+ def contains(left: ColumnOrName, right: ColumnOrName) -> Column:
+     return Column.invoke_anonymous_function(left, "contains", right)
+
+
+ @meta(unsupported_engines="*")
+ def convert_timezone(
+     sourceTz: t.Optional[Column], targetTz: Column, sourceTs: ColumnOrName
+ ) -> Column:
+     if sourceTz is None:
+         return Column.invoke_anonymous_function(targetTz, "convert_timezone", sourceTs)
+     else:
+         return Column.invoke_anonymous_function(sourceTz, "convert_timezone", targetTz, sourceTs)
+
+
+ @meta(unsupported_engines="postgres")
+ def count_if(col: ColumnOrName) -> Column:
+     return Column.invoke_expression_over_column(col, expression.CountIf)
+
+
+ @meta(unsupported_engines="*")
+ def count_min_sketch(
+     col: ColumnOrName,
+     eps: ColumnOrName,
+     confidence: ColumnOrName,
+     seed: ColumnOrName,
+ ) -> Column:
+     eps = Column.ensure_col(eps).cast("double")
+     confidence = Column.ensure_col(confidence).cast("double")
+     return Column.invoke_anonymous_function(col, "count_min_sketch", eps, confidence, seed)
+
+
+ @meta(unsupported_engines="*")
+ def curdate() -> Column:
+     """
+     Returns the current date at the start of query evaluation as a :class:`DateType` column.
+     All calls of current_date within the same query return the same value.
+
+     .. versionadded:: 3.5.0
+
+     Returns
+     -------
+     :class:`~pyspark.sql.Column`
+         current date.
+
+     Examples
+     --------
+     >>> import pyspark.sql.functions as sf
+     >>> spark.range(1).select(sf.curdate()).show()  # doctest: +SKIP
+     +--------------+
+     |current_date()|
+     +--------------+
+     |    2022-08-26|
+     +--------------+
+     """
+     return Column.invoke_anonymous_function(None, "curdate")
+
+
+ @meta(unsupported_engines="*")
+ def current_catalog() -> Column:
+     """Returns the current catalog.
+
+     .. versionadded:: 3.5.0
+
+     Examples
+     --------
+     >>> spark.range(1).select(current_catalog()).show()
+     +-----------------+
+     |current_catalog()|
+     +-----------------+
+     |    spark_catalog|
+     +-----------------+
+     """
+     return Column.invoke_anonymous_function(None, "current_catalog")
+
+
+ @meta(unsupported_engines="*")
+ def current_database() -> Column:
+     """Returns the current database.
+
+     .. versionadded:: 3.5.0
+
+     Examples
+     --------
+     >>> spark.range(1).select(current_database()).show()
+     +------------------+
+     |current_database()|
+     +------------------+
+     |           default|
+     +------------------+
+     """
+     return Column.invoke_anonymous_function(None, "current_database")
+
+
+ current_schema = current_database
+
+
+ @meta(unsupported_engines="*")
+ def current_timezone() -> Column:
+     return Column.invoke_anonymous_function(None, "current_timezone")
+
+
+ @meta()
+ def current_user() -> Column:
+     return Column.invoke_expression_over_column(None, expression.CurrentUser)
+
+
+ @meta(unsupported_engines="*")
+ def date_from_unix_date(days: ColumnOrName) -> Column:
+     return Column.invoke_anonymous_function(days, "date_from_unix_date")
+
+
+ @meta(unsupported_engines="*")
+ def date_part(field: ColumnOrName, source: ColumnOrName) -> Column:
+     return Column.invoke_anonymous_function(field, "date_part", source)
+
+
+ dateadd = date_add
+ datediff = date_diff
+
+
+ @meta(unsupported_engines="*")
+ def datepart(field: ColumnOrName, source: ColumnOrName) -> Column:
+     return Column.invoke_anonymous_function(field, "datepart", source)
+
+
+ @meta(unsupported_engines="*")
+ def day(col: ColumnOrName) -> Column:
+     return Column.invoke_expression_over_column(col, expression.Day)
+
+
+ @meta(unsupported_engines="*")
+ def days(col: ColumnOrName) -> Column:
+     return Column.invoke_anonymous_function(col, "days")
+
+
+ @meta(unsupported_engines="*")
+ def elt(*inputs: ColumnOrName) -> Column:
+     inputs = ensure_list(inputs)  # type: ignore
+     if len(inputs) > 1:
+         return Column.invoke_anonymous_function(inputs[0], "elt", *inputs[1:])
+     return Column.invoke_anonymous_function(inputs[0], "elt")
+
+
+ @meta(unsupported_engines="*")
+ def endswith(str: ColumnOrName, suffix: ColumnOrName) -> Column:
+     return Column.invoke_anonymous_function(str, "endswith", suffix)
+
+
+ @meta(unsupported_engines="*")
+ def equal_null(col1: ColumnOrName, col2: ColumnOrName) -> Column:
+     return Column.invoke_anonymous_function(col1, "equal_null", col2)
+
+
+ @meta(unsupported_engines="*")
+ def every(col: ColumnOrName) -> Column:
+     return Column.invoke_anonymous_function(col, "every")
+
+
+ @meta()
+ def extract(field: ColumnOrName, source: ColumnOrName) -> Column:
+     return Column.invoke_expression_over_column(field, expression.Extract, expression=source)
+
+
+ @meta(unsupported_engines="*")
+ def find_in_set(str: ColumnOrName, str_array: ColumnOrName) -> Column:
+     return Column.invoke_anonymous_function(str, "find_in_set", str_array)
+
+
+ @meta(unsupported_engines="*")
+ def first_value(col: ColumnOrName, ignoreNulls: t.Optional[t.Union[bool, Column]] = None) -> Column:
+     column = Column.invoke_expression_over_column(col, expression.FirstValue)
+
+     if ignoreNulls:
+         return Column(expression.IgnoreNulls(this=column.expression))
+     return column
+
+
+ @meta(unsupported_engines="*")
+ def get(col: ColumnOrName, index: t.Union[ColumnOrName, int]) -> Column:
+     index = lit(index) if isinstance(index, int) else index
+
+     return Column.invoke_anonymous_function(col, "get", index)
+
+
+ @meta(unsupported_engines="*")
+ def get_active_spark_context() -> SparkContext:
+     """Raise RuntimeError if SparkContext is not initialized,
+     otherwise, returns the active SparkContext."""
+     from sqlframe.base.session import _BaseSession
+     from sqlframe.spark.session import SparkSession
+
+     session: _BaseSession = _BaseSession()
+     if not isinstance(session, SparkSession):
+         raise RuntimeError("This function is only available in SparkSession.")
+     return session.spark_session.sparkContext
+
+
+ @meta(unsupported_engines="*")
+ def grouping(col: ColumnOrName) -> Column:
+     """
+     Aggregate function: indicates whether a specified column in a GROUP BY list is aggregated
+     or not, returns 1 for aggregated or 0 for not aggregated in the result set.
+
+     .. versionadded:: 2.0.0
+
+     .. versionchanged:: 3.4.0
+         Supports Spark Connect.
+
+     Parameters
+     ----------
+     col : :class:`~pyspark.sql.Column` or str
+         column to check if it's aggregated.
+
+     Returns
+     -------
+     :class:`~pyspark.sql.Column`
+         returns 1 for aggregated or 0 for not aggregated in the result set.
+
+     Examples
+     --------
+     >>> df = spark.createDataFrame([("Alice", 2), ("Bob", 5)], ("name", "age"))
+     >>> df.cube("name").agg(grouping("name"), sum("age")).orderBy("name").show()
+     +-----+--------------+--------+
+     | name|grouping(name)|sum(age)|
+     +-----+--------------+--------+
+     | NULL|             1|       7|
+     |Alice|             0|       2|
+     |  Bob|             0|       5|
+     +-----+--------------+--------+
+     """
+     return Column.invoke_anonymous_function(col, "grouping")
+
+
+ @meta(unsupported_engines="*")
+ def histogram_numeric(col: ColumnOrName, nBins: ColumnOrName) -> Column:
+     """Computes a histogram on numeric 'col' using nb bins.
+     The return value is an array of (x,y) pairs representing the centers of the
+     histogram's bins. As the value of 'nb' is increased, the histogram approximation
+     gets finer-grained, but may yield artifacts around outliers. In practice, 20-40
+     histogram bins appear to work well, with more bins being required for skewed or
+     smaller datasets. Note that this function creates a histogram with non-uniform
+     bin widths. It offers no guarantees in terms of the mean-squared-error of the
+     histogram, but in practice is comparable to the histograms produced by the R/S-Plus
+     statistical computing packages. Note: the output type of the 'x' field in the return value is
+     propagated from the input value consumed in the aggregate function.
+
+     .. versionadded:: 3.5.0
+
+     Parameters
+     ----------
+     col : :class:`~pyspark.sql.Column` or str
+         target column to work on.
+     nBins : :class:`~pyspark.sql.Column` or str
+         number of Histogram columns.
+
+     Returns
+     -------
+     :class:`~pyspark.sql.Column`
+         a histogram on numeric 'col' using nb bins.
+
+     Examples
+     --------
+     >>> df = spark.createDataFrame([("a", 1),
+     ...                             ("a", 2),
+     ...                             ("a", 3),
+     ...                             ("b", 8),
+     ...                             ("b", 2)], ["c1", "c2"])
+     >>> df.select(histogram_numeric('c2', lit(5))).show()
+     +------------------------+
+     |histogram_numeric(c2, 5)|
+     +------------------------+
+     |    [{1, 1.0}, {2, 1....|
+     +------------------------+
+     """
+     return Column.invoke_anonymous_function(col, "histogram_numeric", nBins)
+
+
+ @meta(unsupported_engines="*")
+ def hll_sketch_agg(col: ColumnOrName, lgConfigK: t.Optional[t.Union[int, Column]] = None) -> Column:
+     """
+     Aggregate function: returns the updatable binary representation of the Datasketches
+     HllSketch configured with lgConfigK arg.
+
+     .. versionadded:: 3.5.0
+
+     Parameters
+     ----------
+     col : :class:`~pyspark.sql.Column` or str or int
+     lgConfigK : int, optional
+         The log-base-2 of K, where K is the number of buckets or slots for the HllSketch
+
+     Returns
+     -------
+     :class:`~pyspark.sql.Column`
+         The binary representation of the HllSketch.
+
+     Examples
+     --------
+     >>> df = spark.createDataFrame([1,2,2,3], "INT")
+     >>> df1 = df.agg(hll_sketch_estimate(hll_sketch_agg("value")).alias("distinct_cnt"))
+     >>> df1.show()
+     +------------+
+     |distinct_cnt|
+     +------------+
+     |           3|
+     +------------+
+     >>> df2 = df.agg(hll_sketch_estimate(
+     ...     hll_sketch_agg("value", lit(12))
+     ... ).alias("distinct_cnt"))
+     >>> df2.show()
+     +------------+
+     |distinct_cnt|
+     +------------+
+     |           3|
+     +------------+
+     >>> df3 = df.agg(hll_sketch_estimate(
+     ...     hll_sketch_agg(col("value"), lit(12))).alias("distinct_cnt"))
+     >>> df3.show()
+     +------------+
+     |distinct_cnt|
+     +------------+
+     |           3|
+     +------------+
+     """
+     if lgConfigK is None:
+         return Column.invoke_anonymous_function(col, "hll_sketch_agg")
+     else:
+         _lgConfigK = lit(lgConfigK) if isinstance(lgConfigK, int) else lgConfigK
+         return Column.invoke_anonymous_function(col, "hll_sketch_agg", _lgConfigK)
+
+
+ @meta(unsupported_engines="*")
+ def hll_sketch_estimate(col: ColumnOrName) -> Column:
+     """
+     Returns the estimated number of unique values given the binary representation
+     of a Datasketches HllSketch.
+
+     .. versionadded:: 3.5.0
+
+     Parameters
+     ----------
+     col : :class:`~pyspark.sql.Column` or str
+
+     Returns
+     -------
+     :class:`~pyspark.sql.Column`
+         The estimated number of unique values for the HllSketch.
+
+     Examples
+     --------
+     >>> df = spark.createDataFrame([1,2,2,3], "INT")
+     >>> df = df.agg(hll_sketch_estimate(hll_sketch_agg("value")).alias("distinct_cnt"))
+     >>> df.show()
+     +------------+
+     |distinct_cnt|
+     +------------+
+     |           3|
+     +------------+
+     """
+     return Column.invoke_anonymous_function(col, "hll_sketch_estimate")
+
+
+ @meta(unsupported_engines="*")
+ def hll_union(
+     col1: ColumnOrName, col2: ColumnOrName, allowDifferentLgConfigK: t.Optional[bool] = None
+ ) -> Column:
+     """
+     Merges two binary representations of Datasketches HllSketch objects, using a
+     Datasketches Union object. Throws an exception if sketches have different
+     lgConfigK values and allowDifferentLgConfigK is unset or set to false.
+
+     .. versionadded:: 3.5.0
+
+     Parameters
+     ----------
+     col1 : :class:`~pyspark.sql.Column` or str
+     col2 : :class:`~pyspark.sql.Column` or str
+     allowDifferentLgConfigK : bool, optional
+         Allow sketches with different lgConfigK values to be merged (defaults to false).
+
+     Returns
+     -------
+     :class:`~pyspark.sql.Column`
+         The binary representation of the merged HllSketch.
+
+     Examples
+     --------
+     >>> df = spark.createDataFrame([(1,4),(2,5),(2,5),(3,6)], "struct<v1:int,v2:int>")
+     >>> df = df.agg(hll_sketch_agg("v1").alias("sketch1"), hll_sketch_agg("v2").alias("sketch2"))
+     >>> df = df.withColumn("distinct_cnt", hll_sketch_estimate(hll_union("sketch1", "sketch2")))
+     >>> df.drop("sketch1", "sketch2").show()
+     +------------+
+     |distinct_cnt|
+     +------------+
+     |           6|
+     +------------+
+     """
+     if allowDifferentLgConfigK is not None:
+         allowDifferentLgConfigK = (
+             lit(allowDifferentLgConfigK)
+             if isinstance(allowDifferentLgConfigK, bool)
+             else allowDifferentLgConfigK
+         )
+         return Column.invoke_anonymous_function(col1, "hll_union", col2, allowDifferentLgConfigK)  # type: ignore
+     else:
+         return Column.invoke_anonymous_function(col1, "hll_union", col2)
+
+
+ @meta(unsupported_engines="*")
+ def hll_union_agg(
+     col: ColumnOrName, allowDifferentLgConfigK: t.Optional[t.Union[bool, Column]] = None
+ ) -> Column:
+     """
+     Aggregate function: returns the updatable binary representation of the Datasketches
+     HllSketch, generated by merging previously created Datasketches HllSketch instances
+     via a Datasketches Union instance. Throws an exception if sketches have different
+     lgConfigK values and allowDifferentLgConfigK is unset or set to false.
+
+     .. versionadded:: 3.5.0
+
+     Parameters
+     ----------
+     col : :class:`~pyspark.sql.Column` or str or bool
+     allowDifferentLgConfigK : bool, optional
+         Allow sketches with different lgConfigK values to be merged (defaults to false).
+
+     Returns
+     -------
+     :class:`~pyspark.sql.Column`
+         The binary representation of the merged HllSketch.
+
+     Examples
+     --------
+     >>> df1 = spark.createDataFrame([1,2,2,3], "INT")
+     >>> df1 = df1.agg(hll_sketch_agg("value").alias("sketch"))
+     >>> df2 = spark.createDataFrame([4,5,5,6], "INT")
+     >>> df2 = df2.agg(hll_sketch_agg("value").alias("sketch"))
+     >>> df3 = df1.union(df2).agg(hll_sketch_estimate(
+     ...     hll_union_agg("sketch")
+     ... ).alias("distinct_cnt"))
+     >>> df3.drop("sketch").show()
+     +------------+
+     |distinct_cnt|
+     +------------+
+     |           6|
+     +------------+
+     >>> df4 = df1.union(df2).agg(hll_sketch_estimate(
+     ...     hll_union_agg("sketch", lit(False))
+     ... ).alias("distinct_cnt"))
+     >>> df4.drop("sketch").show()
+     +------------+
+     |distinct_cnt|
+     +------------+
+     |           6|
+     +------------+
+     >>> df5 = df1.union(df2).agg(hll_sketch_estimate(
+     ...     hll_union_agg(col("sketch"), lit(False))
+     ... ).alias("distinct_cnt"))
+     >>> df5.drop("sketch").show()
+     +------------+
+     |distinct_cnt|
+     +------------+
+     |           6|
+     +------------+
+     """
+     if allowDifferentLgConfigK is None:
+         return Column.invoke_anonymous_function(col, "hll_union_agg")
+     else:
+         _allowDifferentLgConfigK = (
+             lit(allowDifferentLgConfigK)
+             if isinstance(allowDifferentLgConfigK, bool)
+             else allowDifferentLgConfigK
+         )
+         return Column.invoke_anonymous_function(col, "hll_union_agg", _allowDifferentLgConfigK)
+
+
+ @meta(unsupported_engines="*")
+ def hours(col: ColumnOrName) -> Column:
+     """
+     Partition transform function: A transform for timestamps
+     to partition data into hours.
+
+     .. versionadded:: 3.1.0
+
+     .. versionchanged:: 3.4.0
+         Supports Spark Connect.
+
+     Parameters
+     ----------
+     col : :class:`~pyspark.sql.Column` or str
+         target date or timestamp column to work on.
+
+     Returns
+     -------
+     :class:`~pyspark.sql.Column`
+         data partitioned by hours.
+
+     Examples
+     --------
+     >>> df.writeTo("catalog.db.table").partitionedBy(  # doctest: +SKIP
+     ...     hours("ts")
+     ... ).createOrReplace()
+
+     Notes
+     -----
+     This function can be used only in combination with
+     :py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy`
+     method of the `DataFrameWriterV2`.
+
+     """
+     return Column.invoke_anonymous_function(col, "hours")
+
+
+ @meta()
+ def ifnull(col1: ColumnOrName, col2: ColumnOrName) -> Column:
+     """
+     Returns `col2` if `col1` is null, or `col1` otherwise.
+
+     .. versionadded:: 3.5.0
+
+     Parameters
+     ----------
+     col1 : :class:`~pyspark.sql.Column` or str
+     col2 : :class:`~pyspark.sql.Column` or str
+
+     Examples
+     --------
+     >>> import pyspark.sql.functions as sf
+     >>> df = spark.createDataFrame([(None,), (1,)], ["e"])
+     >>> df.select(sf.ifnull(df.e, sf.lit(8))).show()
+     +------------+
+     |ifnull(e, 8)|
+     +------------+
+     |           8|
+     |           1|
+     +------------+
+     """
+     return Column.invoke_expression_over_column(col1, expression.Coalesce, expressions=[col2])
+
+
+ @meta(unsupported_engines="*")
+ def ilike(
+     str: ColumnOrName, pattern: ColumnOrName, escapeChar: t.Optional["Column"] = None
+ ) -> Column:
+     """
+     Returns true if str matches `pattern` with `escape` case-insensitively,
+     null if any arguments are null, false otherwise.
+     The default escape character is the '\'.
+
+     .. versionadded:: 3.5.0
+
+     Parameters
+     ----------
+     str : :class:`~pyspark.sql.Column` or str
+         A string.
+     pattern : :class:`~pyspark.sql.Column` or str
+         A string. The pattern is a string which is matched literally, with
+         exception to the following special symbols:
+         _ matches any one character in the input (similar to . in posix regular expressions)
+         % matches zero or more characters in the input (similar to .* in posix regular
+         expressions)
+         Since Spark 2.0, string literals are unescaped in our SQL parser. For example, in order
+         to match "\abc", the pattern should be "\\abc".
+         When SQL config 'spark.sql.parser.escapedStringLiterals' is enabled, it falls back
+         to Spark 1.6 behavior regarding string literal parsing. For example, if the config is
+         enabled, the pattern to match "\abc" should be "\abc".
+     escape : :class:`~pyspark.sql.Column`
+         An character added since Spark 3.0. The default escape character is the '\'.
+         If an escape character precedes a special symbol or another escape character, the
+         following character is matched literally. It is invalid to escape any other character.
+
+     Examples
+     --------
+     >>> df = spark.createDataFrame([("Spark", "_park")], ['a', 'b'])
+     >>> df.select(ilike(df.a, df.b).alias('r')).collect()
+     [Row(r=True)]
+
+     >>> df = spark.createDataFrame(
+     ...     [("%SystemDrive%/Users/John", "/%SystemDrive/%//Users%")],
+     ...     ['a', 'b']
+     ... )
+     >>> df.select(ilike(df.a, df.b, lit('/')).alias('r')).collect()
+     [Row(r=True)]
+     """
+     column = Column.invoke_expression_over_column(str, expression.ILike, expression=pattern)
+     if escapeChar is not None:
+         return Column(
+             expression.Escape(
+                 this=column.expression,
+                 expression=Column.ensure_col(escapeChar).expression,
+             )
+         )
+     return column
+
+
+ @meta(unsupported_engines="*")
+ def inline(col: ColumnOrName) -> Column:
+     """
+     Explodes an array of structs into a table.
+
+     .. versionadded:: 3.4.0
+
+     Parameters
+     ----------
+     col : :class:`~pyspark.sql.Column` or str
+         input column of values to explode.
+
+     Returns
+     -------
+     :class:`~pyspark.sql.Column`
+         generator expression with the inline exploded result.
+
+     See Also
+     --------
+     :meth:`explode`
+
+     Notes
+     -----
+     Supports Spark Connect.
+
+     Examples
+     --------
+     >>> from pyspark.sql import Row
+     >>> df = spark.createDataFrame([Row(structlist=[Row(a=1, b=2), Row(a=3, b=4)])])
+     >>> df.select(inline(df.structlist)).show()
+     +---+---+
+     |  a|  b|
+     +---+---+
+     |  1|  2|
+     |  3|  4|
+     +---+---+
+     """
+     return Column.invoke_anonymous_function(col, "inline")
+
+
+ @meta(unsupported_engines="*")
+ def inline_outer(col: ColumnOrName) -> Column:
+     """
+     Explodes an array of structs into a table.
+     Unlike inline, if the array is null or empty then null is produced for each nested column.
+
+     .. versionadded:: 3.4.0
+
+     Parameters
+     ----------
+     col : :class:`~pyspark.sql.Column` or str
+         input column of values to explode.
+
+     Returns
+     -------
+     :class:`~pyspark.sql.Column`
+         generator expression with the inline exploded result.
+
+     See Also
+     --------
+     :meth:`explode_outer`
+     :meth:`inline`
+
+     Notes
+     -----
+     Supports Spark Connect.
+
+     Examples
+     --------
+     >>> from pyspark.sql import Row
+     >>> df = spark.createDataFrame([
+     ...     Row(id=1, structlist=[Row(a=1, b=2), Row(a=3, b=4)]),
+     ...     Row(id=2, structlist=[])
+     ... ])
+     >>> df.select('id', inline_outer(df.structlist)).show()
+     +---+----+----+
+     | id|   a|   b|
+     +---+----+----+
+     |  1|   1|   2|
+     |  1|   3|   4|
+     |  2|NULL|NULL|
+     +---+----+----+
+     """
+     return Column.invoke_anonymous_function(col, "inline_outer")
+
+
+ @meta(unsupported_engines="*")
+ def isnotnull(col: ColumnOrName) -> Column:
+     """
+     Returns true if `col` is not null, or false otherwise.
+
+     .. versionadded:: 3.5.0
+
+     Parameters
+     ----------
+     col : :class:`~pyspark.sql.Column` or str
+
+     Examples
+     --------
+     >>> df = spark.createDataFrame([(None,), (1,)], ["e"])
+     >>> df.select(isnotnull(df.e).alias('r')).collect()
+     [Row(r=False), Row(r=True)]
+     """
+     return Column.invoke_anonymous_function(col, "isnotnull")
+
+
+ @meta(unsupported_engines="*")
+ def java_method(*cols: ColumnOrName) -> Column:
+     """
+     Calls a method with reflection.
+
+     .. versionadded:: 3.5.0
+
+     Parameters
+     ----------
+     cols : :class:`~pyspark.sql.Column` or str
+         the first element should be a literal string for the class name,
+         and the second element should be a literal string for the method name,
+         and the remaining are input arguments to the Java method.
+
+     Examples
+     --------
+     >>> import pyspark.sql.functions as sf
+     >>> spark.range(1).select(
+     ...     sf.java_method(
+     ...         sf.lit("java.util.UUID"),
+     ...         sf.lit("fromString"),
+     ...         sf.lit("a5cf6c42-0c85-418f-af6c-3e4e5b1328f2")
+     ...     )
+     ... ).show(truncate=False)
+     +-----------------------------------------------------------------------------+
+     |java_method(java.util.UUID, fromString, a5cf6c42-0c85-418f-af6c-3e4e5b1328f2)|
+     +-----------------------------------------------------------------------------+
+     |a5cf6c42-0c85-418f-af6c-3e4e5b1328f2                                         |
+     +-----------------------------------------------------------------------------+
+     """
+     cols = ensure_list(cols)  # type: ignore
+     if len(cols) > 1:
+         return Column.invoke_anonymous_function(cols[0], "java_method", *cols[1:])
+     return Column.invoke_anonymous_function(cols[0], "java_method")
+
+
+ @meta(unsupported_engines="*")
+ def json_array_length(col: ColumnOrName) -> Column:
+     """
+     Returns the number of elements in the outermost JSON array. `NULL` is returned in case of
+     any other valid JSON string, `NULL` or an invalid JSON.
+
+     .. versionadded:: 3.5.0
+
+     Parameters
+     ----------
+     col: :class:`~pyspark.sql.Column` or str
+         target column to compute on.
+
+     Returns
+     -------
+     :class:`~pyspark.sql.Column`
+         length of json array.
+
+     Examples
+     --------
+     >>> df = spark.createDataFrame([(None,), ('[1, 2, 3]',), ('[]',)], ['data'])
+     >>> df.select(json_array_length(df.data).alias('r')).collect()
+     [Row(r=None), Row(r=3), Row(r=0)]
+     """
+     return Column.invoke_anonymous_function(col, "json_array_length")
+
+
+ @meta(unsupported_engines="*")
+ def json_object_keys(col: ColumnOrName) -> Column:
+     """
+     Returns all the keys of the outermost JSON object as an array. If a valid JSON object is
+     given, all the keys of the outermost object will be returned as an array. If it is any
+     other valid JSON string, an invalid JSON string or an empty string, the function returns null.
+
+     .. versionadded:: 3.5.0
+
+     Parameters
+     ----------
+     col: :class:`~pyspark.sql.Column` or str
+         target column to compute on.
+
+     Returns
+     -------
+     :class:`~pyspark.sql.Column`
+         all the keys of the outermost JSON object.
+
+     Examples
+     --------
+     >>> df = spark.createDataFrame([(None,), ('{}',), ('{"key1":1, "key2":2}',)], ['data'])
+     >>> df.select(json_object_keys(df.data).alias('r')).collect()
+     [Row(r=None), Row(r=[]), Row(r=['key1', 'key2'])]
+     """
+     return Column.invoke_anonymous_function(col, "json_object_keys")
+
+
+ @meta(unsupported_engines="*")
+ def last_value(col: ColumnOrName, ignoreNulls: t.Optional[t.Union[bool, Column]] = None) -> Column:
+     """Returns the last value of `col` for a group of rows. It will return the last non-null
+     value it sees when `ignoreNulls` is set to true. If all values are null, then null is returned.
+
+     .. versionadded:: 3.5.0
+
+     Parameters
+     ----------
+     col : :class:`~pyspark.sql.Column` or str
+         target column to work on.
+     ignorenulls : :class:`~pyspark.sql.Column` or bool
+         if first value is null then look for first non-null value.
+
+     Returns
+     -------
+     :class:`~pyspark.sql.Column`
+         some value of `col` for a group of rows.
+
+     Examples
+     --------
+     >>> import pyspark.sql.functions as sf
+     >>> spark.createDataFrame(
+     ...     [("a", 1), ("a", 2), ("a", 3), ("b", 8), (None, 2)], ["a", "b"]
+     ... ).select(sf.last_value('a'), sf.last_value('b')).show()
+     +-------------+-------------+
+     |last_value(a)|last_value(b)|
+     +-------------+-------------+
+     |         NULL|            2|
+     +-------------+-------------+
+
+     >>> import pyspark.sql.functions as sf
+     >>> spark.createDataFrame(
+     ...     [("a", 1), ("a", 2), ("a", 3), ("b", 8), (None, 2)], ["a", "b"]
+     ... ).select(sf.last_value('a', True), sf.last_value('b', True)).show()
+     +-------------+-------------+
+     |last_value(a)|last_value(b)|
+     +-------------+-------------+
+     |            b|            2|
+     +-------------+-------------+
+     """
+     column = Column.invoke_expression_over_column(col, expression.LastValue)
+
+     if ignoreNulls:
+         return Column(expression.IgnoreNulls(this=column.expression))
+     return column
+
+
+ @meta()
+ def lcase(str: ColumnOrName) -> Column:
+     """
+     Returns `str` with all characters changed to lowercase.
+
+     .. versionadded:: 3.5.0
+
+     Parameters
+     ----------
+     str : :class:`~pyspark.sql.Column` or str
+         Input column or strings.
+
+     Examples
+     --------
+     >>> import pyspark.sql.functions as sf
+     >>> spark.range(1).select(sf.lcase(sf.lit("Spark"))).show()
+     +------------+
+     |lcase(Spark)|
+     +------------+
+     |       spark|
+     +------------+
+     """
+     return Column.invoke_expression_over_column(str, expression.Lower)
+
+
+ @meta()
+ def left(str: ColumnOrName, len: ColumnOrName) -> Column:
+     """
+     Returns the leftmost `len`(`len` can be string type) characters from the string `str`,
+     if `len` is less or equal than 0 the result is an empty string.
+
+     .. versionadded:: 3.5.0
+
+     Parameters
+     ----------
+     str : :class:`~pyspark.sql.Column` or str
+         Input column or strings.
+     len : :class:`~pyspark.sql.Column` or str
+         Input column or strings, the leftmost `len`.
+
+     Examples
+     --------
+     >>> df = spark.createDataFrame([("Spark SQL", 3,)], ['a', 'b'])
+     >>> df.select(left(df.a, df.b).alias('r')).collect()
+     [Row(r='Spa')]
+     """
+     return Column.invoke_expression_over_column(str, expression.Left, expression=len)
+
+
+ @meta(unsupported_engines="*")
+ def like(
+     str: ColumnOrName, pattern: ColumnOrName, escapeChar: t.Optional["Column"] = None
+ ) -> Column:
+     """
+     Returns true if str matches `pattern` with `escape`,
+     null if any arguments are null, false otherwise.
+     The default escape character is the '\'.
+
+     .. versionadded:: 3.5.0
+
+     Parameters
+     ----------
+     str : :class:`~pyspark.sql.Column` or str
+         A string.
+     pattern : :class:`~pyspark.sql.Column` or str
+         A string. The pattern is a string which is matched literally, with
+         exception to the following special symbols:
+         _ matches any one character in the input (similar to . in posix regular expressions)
+         % matches zero or more characters in the input (similar to .* in posix regular
+         expressions)
+         Since Spark 2.0, string literals are unescaped in our SQL parser. For example, in order
+         to match "\abc", the pattern should be "\\abc".
+         When SQL config 'spark.sql.parser.escapedStringLiterals' is enabled, it falls back
+         to Spark 1.6 behavior regarding string literal parsing. For example, if the config is
+         enabled, the pattern to match "\abc" should be "\abc".
+     escape : :class:`~pyspark.sql.Column`
+         An character added since Spark 3.0. The default escape character is the '\'.
+         If an escape character precedes a special symbol or another escape character, the
+         following character is matched literally. It is invalid to escape any other character.
+
+     Examples
+     --------
+     >>> df = spark.createDataFrame([("Spark", "_park")], ['a', 'b'])
+     >>> df.select(like(df.a, df.b).alias('r')).collect()
+     [Row(r=True)]
+
+     >>> df = spark.createDataFrame(
+     ...     [("%SystemDrive%/Users/John", "/%SystemDrive/%//Users%")],
+     ...     ['a', 'b']
+     ... )
+     >>> df.select(like(df.a, df.b, lit('/')).alias('r')).collect()
+     [Row(r=True)]
+     """
+     column = Column.invoke_expression_over_column(str, expression.Like, expression=pattern)
+     if escapeChar is not None:
+         return Column(
+             expression.Escape(
+                 this=column.expression,
+                 expression=Column.ensure_col(escapeChar).expression,
+             )
+         )
+     return column
+
+
+ @meta()
+ def ln(col: ColumnOrName) -> Column:
+     """Returns the natural logarithm of the argument.
+
+     .. versionadded:: 3.5.0
+
+     Parameters
+     ----------
+     col : :class:`~pyspark.sql.Column` or str
+         a column to calculate logariphm for.
+
+     Returns
+     -------
+     :class:`~pyspark.sql.Column`
+         natural logarithm of given value.
+
+     Examples
+     --------
+     >>> df = spark.createDataFrame([(4,)], ['a'])
+     >>> df.select(ln('a')).show()
+     +------------------+
+     |             ln(a)|
+     +------------------+
+     |1.3862943611198906|
+     +------------------+
+     """
+     return Column.invoke_expression_over_column(col, expression.Ln)
+
+
+ @meta(unsupported_engines="*")
+ def localtimestamp() -> Column:
+     """
+     Returns the current timestamp without time zone at the start of query evaluation
+     as a timestamp without time zone column. All calls of localtimestamp within the
+     same query return the same value.
+
+     .. versionadded:: 3.4.0
+
+     .. versionchanged:: 3.4.0
+         Supports Spark Connect.
+
+     Returns
+     -------
+     :class:`~pyspark.sql.Column`
+         current local date and time.
+
+     Examples
+     --------
+     >>> df = spark.range(1)
+     >>> df.select(localtimestamp()).show(truncate=False)  # doctest: +SKIP
+     +-----------------------+
+     |localtimestamp()       |
+     +-----------------------+
+     |2022-08-26 21:28:34.639|
+     +-----------------------+
+     """
+     return Column.invoke_anonymous_function(None, "localtimestamp")
+
+
+ @meta(unsupported_engines="*")
+ def make_dt_interval(
+     days: t.Optional[ColumnOrName] = None,
+     hours: t.Optional[ColumnOrName] = None,
+     mins: t.Optional[ColumnOrName] = None,
+     secs: t.Optional[ColumnOrName] = None,
+ ) -> Column:
+     """
+     Make DayTimeIntervalType duration from days, hours, mins and secs.
+
+     .. versionadded:: 3.5.0
+
+     Parameters
+     ----------
+     days : :class:`~pyspark.sql.Column` or str
+         the number of days, positive or negative
+     hours : :class:`~pyspark.sql.Column` or str
+         the number of hours, positive or negative
+     mins : :class:`~pyspark.sql.Column` or str
+         the number of minutes, positive or negative
+     secs : :class:`~pyspark.sql.Column` or str
+         the number of seconds with the fractional part in microsecond precision.
+
+     Examples
+     --------
+     >>> df = spark.createDataFrame([[1, 12, 30, 01.001001]],
+     ...     ["day", "hour", "min", "sec"])
+     >>> df.select(make_dt_interval(
+     ...     df.day, df.hour, df.min, df.sec).alias('r')
+     ... ).show(truncate=False)
+     +------------------------------------------+
+     |r                                         |
+     +------------------------------------------+
+     |INTERVAL '1 12:30:01.001001' DAY TO SECOND|
+     +------------------------------------------+
+
+     >>> df.select(make_dt_interval(
+     ...     df.day, df.hour, df.min).alias('r')
+     ... ).show(truncate=False)
+     +-----------------------------------+
+     |r                                  |
+     +-----------------------------------+
+     |INTERVAL '1 12:30:00' DAY TO SECOND|
+     +-----------------------------------+
+
+     >>> df.select(make_dt_interval(
+     ...     df.day, df.hour).alias('r')
+     ... ).show(truncate=False)
+     +-----------------------------------+
+     |r                                  |
+     +-----------------------------------+
+     |INTERVAL '1 12:00:00' DAY TO SECOND|
+     +-----------------------------------+
+
+     >>> df.select(make_dt_interval(df.day).alias('r')).show(truncate=False)
+     +-----------------------------------+
+     |r                                  |
+     +-----------------------------------+
+     |INTERVAL '1 00:00:00' DAY TO SECOND|
+     +-----------------------------------+
+
+     >>> df.select(make_dt_interval().alias('r')).show(truncate=False)
+     +-----------------------------------+
+     |r                                  |
+     +-----------------------------------+
+     |INTERVAL '0 00:00:00' DAY TO SECOND|
+     +-----------------------------------+
+     """
+     _days = lit(0) if days is None else days
+     _hours = lit(0) if hours is None else hours
+     _mins = lit(0) if mins is None else mins
+     _secs = lit(decimal.Decimal(0)) if secs is None else secs
+     return Column.invoke_anonymous_function(_days, "make_dt_interval", _hours, _mins, _secs)
+
+
+ @meta(unsupported_engines="*")
+ def make_timestamp(
+     years: ColumnOrName,
+     months: ColumnOrName,
+     days: ColumnOrName,
+     hours: ColumnOrName,
+     mins: ColumnOrName,
+     secs: ColumnOrName,
+     timezone: t.Optional[ColumnOrName] = None,
+ ) -> Column:
+     """
+     Create timestamp from years, months, days, hours, mins, secs and timezone fields.
+     The result data type is consistent with the value of configuration `spark.sql.timestampType`.
+     If the configuration `spark.sql.ansi.enabled` is false, the function returns NULL
+     on invalid inputs. Otherwise, it will throw an error instead.
+
+     .. versionadded:: 3.5.0
+
+     Parameters
+     ----------
+     years : :class:`~pyspark.sql.Column` or str
+         the year to represent, from 1 to 9999
+     months : :class:`~pyspark.sql.Column` or str
+         the month-of-year to represent, from 1 (January) to 12 (December)
+     days : :class:`~pyspark.sql.Column` or str
+         the day-of-month to represent, from 1 to 31
+     hours : :class:`~pyspark.sql.Column` or str
+         the hour-of-day to represent, from 0 to 23
+     mins : :class:`~pyspark.sql.Column` or str
+         the minute-of-hour to represent, from 0 to 59
+     secs : :class:`~pyspark.sql.Column` or str
+         the second-of-minute and its micro-fraction to represent, from 0 to 60.
+         The value can be either an integer like 13 , or a fraction like 13.123.
+         If the sec argument equals to 60, the seconds field is set
+         to 0 and 1 minute is added to the final timestamp.
+     timezone : :class:`~pyspark.sql.Column` or str
+         the time zone identifier. For example, CET, UTC and etc.
+
+     Examples
+     --------
+     >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
+     >>> df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887, 'CET']],
+     ...     ["year", "month", "day", "hour", "min", "sec", "timezone"])
+     >>> df.select(make_timestamp(
+     ...     df.year, df.month, df.day, df.hour, df.min, df.sec, df.timezone).alias('r')
+     ... ).show(truncate=False)
+     +-----------------------+
+     |r                      |
+     +-----------------------+
+     |2014-12-27 21:30:45.887|
+     +-----------------------+
+
+     >>> df.select(make_timestamp(
+     ...     df.year, df.month, df.day, df.hour, df.min, df.sec).alias('r')
+     ... ).show(truncate=False)
+     +-----------------------+
+     |r                      |
+     +-----------------------+
+     |2014-12-28 06:30:45.887|
+     +-----------------------+
+     >>> spark.conf.unset("spark.sql.session.timeZone")
+     """
+     if timezone is not None:
+         return Column.invoke_anonymous_function(
+             years, "make_timestamp", months, days, hours, mins, secs, timezone
+         )
+     else:
+         return Column.invoke_anonymous_function(
+             years, "make_timestamp", months, days, hours, mins, secs
+         )
+
+
+ @meta(unsupported_engines="*")
+ def make_timestamp_ltz(
+     years: ColumnOrName,
+     months: ColumnOrName,
+     days: ColumnOrName,
+     hours: ColumnOrName,
+     mins: ColumnOrName,
+     secs: ColumnOrName,
+     timezone: t.Optional[ColumnOrName] = None,
+ ) -> Column:
+     """
+     Create the current timestamp with local time zone from years, months, days, hours, mins,
+     secs and timezone fields. If the configuration `spark.sql.ansi.enabled` is false,
+     the function returns NULL on invalid inputs. Otherwise, it will throw an error instead.
+
+     .. versionadded:: 3.5.0
+
+     Parameters
+     ----------
+     years : :class:`~pyspark.sql.Column` or str
+         the year to represent, from 1 to 9999
+     months : :class:`~pyspark.sql.Column` or str
+         the month-of-year to represent, from 1 (January) to 12 (December)
+     days : :class:`~pyspark.sql.Column` or str
+         the day-of-month to represent, from 1 to 31
+     hours : :class:`~pyspark.sql.Column` or str
+         the hour-of-day to represent, from 0 to 23
+     mins : :class:`~pyspark.sql.Column` or str
+         the minute-of-hour to represent, from 0 to 59
+     secs : :class:`~pyspark.sql.Column` or str
+         the second-of-minute and its micro-fraction to represent, from 0 to 60.
+         The value can be either an integer like 13 , or a fraction like 13.123.
+         If the sec argument equals to 60, the seconds field is set
+         to 0 and 1 minute is added to the final timestamp.
+     timezone : :class:`~pyspark.sql.Column` or str
+         the time zone identifier. For example, CET, UTC and etc.
+
+     Examples
+     --------
+     >>> import pyspark.sql.functions as sf
+     >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
+     >>> df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887, 'CET']],
+     ...     ["year", "month", "day", "hour", "min", "sec", "timezone"])
+     >>> df.select(sf.make_timestamp_ltz(
+     ...     df.year, df.month, df.day, df.hour, df.min, df.sec, df.timezone)
+     ... ).show(truncate=False)
+     +--------------------------------------------------------------+
+     |make_timestamp_ltz(year, month, day, hour, min, sec, timezone)|
+     +--------------------------------------------------------------+
+     |2014-12-27 21:30:45.887                                       |
+     +--------------------------------------------------------------+
+
+     >>> df.select(sf.make_timestamp_ltz(
+     ...     df.year, df.month, df.day, df.hour, df.min, df.sec)
+     ... ).show(truncate=False)
+     +----------------------------------------------------+
+     |make_timestamp_ltz(year, month, day, hour, min, sec)|
+     +----------------------------------------------------+
+     |2014-12-28 06:30:45.887                             |
+     +----------------------------------------------------+
+     >>> spark.conf.unset("spark.sql.session.timeZone")
+     """
+     if timezone is not None:
+         return Column.invoke_anonymous_function(
+             years, "make_timestamp_ltz", months, days, hours, mins, secs, timezone
+         )
+     else:
+         return Column.invoke_anonymous_function(
+             years, "make_timestamp_ltz", months, days, hours, mins, secs
+         )
+
+
+ @meta(unsupported_engines="*")
3255
+ def make_timestamp_ntz(
3256
+ years: ColumnOrName,
3257
+ months: ColumnOrName,
3258
+ days: ColumnOrName,
3259
+ hours: ColumnOrName,
3260
+ mins: ColumnOrName,
3261
+ secs: ColumnOrName,
3262
+ ) -> Column:
3263
+ """
3264
+ Create local date-time from years, months, days, hours, mins, secs fields.
3265
+ If the configuration `spark.sql.ansi.enabled` is false, the function returns NULL
3266
+ on invalid inputs. Otherwise, it will throw an error instead.
3267
+
3268
+ .. versionadded:: 3.5.0
3269
+
3270
+ Parameters
3271
+ ----------
3272
+ years : :class:`~pyspark.sql.Column` or str
3273
+ the year to represent, from 1 to 9999
3274
+ months : :class:`~pyspark.sql.Column` or str
3275
+ the month-of-year to represent, from 1 (January) to 12 (December)
3276
+ days : :class:`~pyspark.sql.Column` or str
3277
+ the day-of-month to represent, from 1 to 31
3278
+ hours : :class:`~pyspark.sql.Column` or str
3279
+ the hour-of-day to represent, from 0 to 23
3280
+ mins : :class:`~pyspark.sql.Column` or str
3281
+ the minute-of-hour to represent, from 0 to 59
3282
+ secs : :class:`~pyspark.sql.Column` or str
3283
+ the second-of-minute and its micro-fraction to represent, from 0 to 60.
3284
+ The value can be either an integer like 13 , or a fraction like 13.123.
3285
+ If the sec argument equals to 60, the seconds field is set
3286
+ to 0 and 1 minute is added to the final timestamp.
3287
+
3288
+ Examples
3289
+ --------
3290
+ >>> import pyspark.sql.functions as sf
3291
+ >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
3292
+ >>> df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887]],
3293
+ ... ["year", "month", "day", "hour", "min", "sec"])
3294
+ >>> df.select(sf.make_timestamp_ntz(
3295
+ ... df.year, df.month, df.day, df.hour, df.min, df.sec)
3296
+ ... ).show(truncate=False)
3297
+ +----------------------------------------------------+
3298
+ |make_timestamp_ntz(year, month, day, hour, min, sec)|
3299
+ +----------------------------------------------------+
3300
+ |2014-12-28 06:30:45.887 |
3301
+ +----------------------------------------------------+
3302
+ >>> spark.conf.unset("spark.sql.session.timeZone")
3303
+ """
3304
+ return Column.invoke_anonymous_function(
3305
+ years, "make_timestamp_ntz", months, days, hours, mins, secs
3306
+ )
3307
+
3308
+
3309
+ @meta(unsupported_engines="*")
3310
+ def make_ym_interval(
3311
+ years: t.Optional[ColumnOrName] = None,
3312
+ months: t.Optional[ColumnOrName] = None,
3313
+ ) -> Column:
3314
+ """
3315
+ Make year-month interval from years, months.
3316
+
3317
+ .. versionadded:: 3.5.0
3318
+
3319
+ Parameters
3320
+ ----------
3321
+ years : :class:`~pyspark.sql.Column` or str
3322
+ the number of years, positive or negative
3323
+ months : :class:`~pyspark.sql.Column` or str
3324
+ the number of months, positive or negative
3325
+
3326
+ Examples
3327
+ --------
3328
+ >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
3329
+ >>> df = spark.createDataFrame([[2014, 12]], ["year", "month"])
3330
+ >>> df.select(make_ym_interval(df.year, df.month).alias('r')).show(truncate=False)
3331
+ +-------------------------------+
3332
+ |r |
3333
+ +-------------------------------+
3334
+ |INTERVAL '2015-0' YEAR TO MONTH|
3335
+ +-------------------------------+
3336
+ >>> spark.conf.unset("spark.sql.session.timeZone")
3337
+ """
3338
+ _years = lit(0) if years is None else years
3339
+ _months = lit(0) if months is None else months
3340
+ return Column.invoke_anonymous_function(_years, "make_ym_interval", _months)
3341
+
3342
+
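As a plain-Python sanity check (not part of the library), the interval shown in the example above is just the total month count rendered as years and months; the helper below assumes a non-negative total:

def ym_interval_parts(years: int = 0, months: int = 0) -> tuple:
    # Total months, shown as (years, months); assumes a non-negative total.
    return divmod(years * 12 + months, 12)

assert ym_interval_parts(2014, 12) == (2015, 0)  # matches INTERVAL '2015-0' YEAR TO MONTH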
3343
+ @meta(unsupported_engines="*")
3344
+ def map_contains_key(col: ColumnOrName, value: t.Any) -> Column:
3345
+ """
3346
+ Returns true if the map contains the key.
3347
+
3348
+ .. versionadded:: 3.4.0
3349
+
3350
+ .. versionchanged:: 3.4.0
3351
+ Supports Spark Connect.
3352
+
3353
+ Parameters
3354
+ ----------
3355
+ col : :class:`~pyspark.sql.Column` or str
3356
+ name of column or expression
3357
+ value :
3358
+ a literal value
3359
+
3360
+ Returns
3361
+ -------
3362
+ :class:`~pyspark.sql.Column`
3363
+ True if key is in the map and False otherwise.
3364
+
3365
+ Examples
3366
+ --------
3367
+ >>> from pyspark.sql.functions import map_contains_key
3368
+ >>> df = spark.sql("SELECT map(1, 'a', 2, 'b') as data")
3369
+ >>> df.select(map_contains_key("data", 1)).show()
3370
+ +---------------------------------+
3371
+ |array_contains(map_keys(data), 1)|
3372
+ +---------------------------------+
3373
+ | true|
3374
+ +---------------------------------+
3375
+ >>> df.select(map_contains_key("data", -1)).show()
3376
+ +----------------------------------+
3377
+ |array_contains(map_keys(data), -1)|
3378
+ +----------------------------------+
3379
+ | false|
3380
+ +----------------------------------+
3381
+ """
3382
+ value = lit(value) if not isinstance(value, Column) else value
3383
+ return Column.invoke_anonymous_function(col, "map_contains_key", value)
3384
+
3385
+
3386
+ @meta(unsupported_engines="*")
3387
+ def mask(
3388
+ col: ColumnOrName,
3389
+ upperChar: t.Optional[ColumnOrName] = None,
3390
+ lowerChar: t.Optional[ColumnOrName] = None,
3391
+ digitChar: t.Optional[ColumnOrName] = None,
3392
+ otherChar: t.Optional[ColumnOrName] = None,
3393
+ ) -> Column:
3394
+ """
3395
+ Masks the given string value. This can be useful for creating copies of tables with sensitive
3396
+ information removed.
3397
+
3398
+ .. versionadded:: 3.5.0
3399
+
3400
+ Parameters
3401
+ ----------
3402
+ col: :class:`~pyspark.sql.Column` or str
3403
+ target column to compute on.
3404
+ upperChar: :class:`~pyspark.sql.Column` or str
3405
+ character to replace upper-case characters with. Specify NULL to retain original character.
3406
+ lowerChar: :class:`~pyspark.sql.Column` or str
3407
+ character to replace lower-case characters with. Specify NULL to retain original character.
3408
+ digitChar: :class:`~pyspark.sql.Column` or str
3409
+ character to replace digit characters with. Specify NULL to retain original character.
3410
+ otherChar: :class:`~pyspark.sql.Column` or str
3411
+ character to replace all other characters with. Specify NULL to retain original character.
3412
+
3413
+ Returns
3414
+ -------
3415
+ :class:`~pyspark.sql.Column`
3416
+
3417
+ Examples
3418
+ --------
3419
+ >>> df = spark.createDataFrame([("AbCD123-@$#",), ("abcd-EFGH-8765-4321",)], ['data'])
3420
+ >>> df.select(mask(df.data).alias('r')).collect()
3421
+ [Row(r='XxXXnnn-@$#'), Row(r='xxxx-XXXX-nnnn-nnnn')]
3422
+ >>> df.select(mask(df.data, lit('Y')).alias('r')).collect()
3423
+ [Row(r='YxYYnnn-@$#'), Row(r='xxxx-YYYY-nnnn-nnnn')]
3424
+ >>> df.select(mask(df.data, lit('Y'), lit('y')).alias('r')).collect()
3425
+ [Row(r='YyYYnnn-@$#'), Row(r='yyyy-YYYY-nnnn-nnnn')]
3426
+ >>> df.select(mask(df.data, lit('Y'), lit('y'), lit('d')).alias('r')).collect()
3427
+ [Row(r='YyYYddd-@$#'), Row(r='yyyy-YYYY-dddd-dddd')]
3428
+ >>> df.select(mask(df.data, lit('Y'), lit('y'), lit('d'), lit('*')).alias('r')).collect()
3429
+ [Row(r='YyYYddd****'), Row(r='yyyy*YYYY*dddd*dddd')]
3430
+ """
3431
+
3432
+ _upperChar = lit("X") if upperChar is None else upperChar
3433
+ _lowerChar = lit("x") if lowerChar is None else lowerChar
3434
+ _digitChar = lit("n") if digitChar is None else digitChar
3435
+ _otherChar = lit(None) if otherChar is None else otherChar
3436
+ return Column.invoke_anonymous_function(
3437
+ col, "mask", _upperChar, _lowerChar, _digitChar, _otherChar
3438
+ )
3439
+
3440
+
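For intuition only, here is a small plain-Python re-implementation of the masking rule described above (upper-case to 'X', lower-case to 'x', digits to 'n', other characters kept by default); it is a sketch of the semantics, not the engine implementation:

def mask_py(s, upper="X", lower="x", digit="n", other=None):
    def repl(ch):
        if ch.isupper():
            return upper if upper is not None else ch
        if ch.islower():
            return lower if lower is not None else ch
        if ch.isdigit():
            return digit if digit is not None else ch
        return other if other is not None else ch
    return "".join(repl(ch) for ch in s)

assert mask_py("AbCD123-@$#") == "XxXXnnn-@$#"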
3441
+ @meta(unsupported_engines="*")
3442
+ def median(col: ColumnOrName) -> Column:
3443
+ """
3444
+ Returns the median of the values in a group.
3445
+
3446
+ .. versionadded:: 3.4.0
3447
+
3448
+ Parameters
3449
+ ----------
3450
+ col : :class:`~pyspark.sql.Column` or str
3451
+ target column to compute on.
3452
+
3453
+ Returns
3454
+ -------
3455
+ :class:`~pyspark.sql.Column`
3456
+ the median of the values in a group.
3457
+
3458
+ Notes
3459
+ -----
3460
+ Supports Spark Connect.
3461
+
3462
+ Examples
3463
+ --------
3464
+ >>> df = spark.createDataFrame([
3465
+ ... ("Java", 2012, 20000), ("dotNET", 2012, 5000),
3466
+ ... ("Java", 2012, 22000), ("dotNET", 2012, 10000),
3467
+ ... ("dotNET", 2013, 48000), ("Java", 2013, 30000)],
3468
+ ... schema=("course", "year", "earnings"))
3469
+ >>> df.groupby("course").agg(median("earnings")).show()
3470
+ +------+----------------+
3471
+ |course|median(earnings)|
3472
+ +------+----------------+
3473
+ | Java| 22000.0|
3474
+ |dotNET| 10000.0|
3475
+ +------+----------------+
3476
+ """
3477
+ return Column.invoke_anonymous_function(col, "median")
3478
+
3479
+
3480
+ @meta(unsupported_engines="*")
3481
+ def mode(col: ColumnOrName) -> Column:
3482
+ """
3483
+ Returns the most frequent value in a group.
3484
+
3485
+ .. versionadded:: 3.4.0
3486
+
3487
+ Parameters
3488
+ ----------
3489
+ col : :class:`~pyspark.sql.Column` or str
3490
+ target column to compute on.
3491
+
3492
+ Returns
3493
+ -------
3494
+ :class:`~pyspark.sql.Column`
3495
+ the most frequent value in a group.
3496
+
3497
+ Notes
3498
+ -----
3499
+ Supports Spark Connect.
3500
+
3501
+ Examples
3502
+ --------
3503
+ >>> df = spark.createDataFrame([
3504
+ ... ("Java", 2012, 20000), ("dotNET", 2012, 5000),
3505
+ ... ("Java", 2012, 20000), ("dotNET", 2012, 5000),
3506
+ ... ("dotNET", 2013, 48000), ("Java", 2013, 30000)],
3507
+ ... schema=("course", "year", "earnings"))
3508
+ >>> df.groupby("course").agg(mode("year")).show()
3509
+ +------+----------+
3510
+ |course|mode(year)|
3511
+ +------+----------+
3512
+ | Java| 2012|
3513
+ |dotNET| 2012|
3514
+ +------+----------+
3515
+ """
3516
+ return Column.invoke_anonymous_function(col, "mode")
3517
+
3518
+
3519
+ @meta(unsupported_engines="*")
3520
+ def months(col: ColumnOrName) -> Column:
3521
+ """
3522
+ Partition transform function: A transform for timestamps and dates
3523
+ to partition data into months.
3524
+
3525
+ .. versionadded:: 3.1.0
3526
+
3527
+ .. versionchanged:: 3.4.0
3528
+ Supports Spark Connect.
3529
+
3530
+ Parameters
3531
+ ----------
3532
+ col : :class:`~pyspark.sql.Column` or str
3533
+ target date or timestamp column to work on.
3534
+
3535
+ Returns
3536
+ -------
3537
+ :class:`~pyspark.sql.Column`
3538
+ data partitioned by months.
3539
+
3540
+ Examples
3541
+ --------
3542
+ >>> df.writeTo("catalog.db.table").partitionedBy(
3543
+ ... months("ts")
3544
+ ... ).createOrReplace() # doctest: +SKIP
3545
+
3546
+ Notes
3547
+ -----
3548
+ This function can be used only in combination with
3549
+ :py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy`
3550
+ method of the `DataFrameWriterV2`.
3551
+
3552
+ """
3553
+ return Column.invoke_anonymous_function(col, "months")
3554
+
3555
+
3556
+ @meta(unsupported_engines="*")
3557
+ def named_struct(*cols: ColumnOrName) -> Column:
3558
+ """
3559
+ Creates a struct with the given field names and values.
3560
+
3561
+ .. versionadded:: 3.5.0
3562
+
3563
+ Parameters
3564
+ ----------
3565
+ cols : :class:`~pyspark.sql.Column` or str
3566
+ list of columns to work on.
3567
+
3568
+ Returns
3569
+ -------
3570
+ :class:`~pyspark.sql.Column`
3571
+
3572
+ Examples
3573
+ --------
3574
+ >>> df = spark.createDataFrame([(1, 2, 3)], ['a', 'b', 'c'])
3575
+ >>> df.select(named_struct(lit('x'), df.a, lit('y'), df.b).alias('r')).collect()
3576
+ [Row(r=Row(x=1, y=2))]
3577
+ """
3578
+ cols = ensure_list(cols) # type: ignore
3579
+ if len(cols) > 1:
3580
+ return Column.invoke_anonymous_function(cols[0], "named_struct", *cols[1:])
3581
+ return Column.invoke_anonymous_function(cols[0], "named_struct")
3582
+
3583
+
3584
+ @meta(unsupported_engines="*")
3585
+ def negative(col: ColumnOrName) -> Column:
3586
+ """
3587
+ Returns the negative value.
3588
+
3589
+ .. versionadded:: 3.5.0
3590
+
3591
+ Parameters
3592
+ ----------
3593
+ col : :class:`~pyspark.sql.Column` or str
3594
+ column to calculate negative value for.
3595
+
3596
+ Returns
3597
+ -------
3598
+ :class:`~pyspark.sql.Column`
3599
+ negative value.
3600
+
3601
+ Examples
3602
+ --------
3603
+ >>> import pyspark.sql.functions as sf
3604
+ >>> spark.range(3).select(sf.negative("id")).show()
3605
+ +------------+
3606
+ |negative(id)|
3607
+ +------------+
3608
+ | 0|
3609
+ | -1|
3610
+ | -2|
3611
+ +------------+
3612
+ """
3613
+ return Column.invoke_anonymous_function(col, "negative")
3614
+
3615
+
3616
+ negate = negative
3617
+ now = current_timestamp
3618
+
3619
+
3620
+ @meta()
3621
+ def nvl(col1: ColumnOrName, col2: ColumnOrName) -> Column:
3622
+ """
3623
+ Returns `col2` if `col1` is null, or `col1` otherwise.
3624
+
3625
+ .. versionadded:: 3.5.0
3626
+
3627
+ Parameters
3628
+ ----------
3629
+ col1 : :class:`~pyspark.sql.Column` or str
3630
+ col2 : :class:`~pyspark.sql.Column` or str
3631
+
3632
+ Examples
3633
+ --------
3634
+ >>> df = spark.createDataFrame([(None, 8,), (1, 9,)], ["a", "b"])
3635
+ >>> df.select(nvl(df.a, df.b).alias('r')).collect()
3636
+ [Row(r=8), Row(r=1)]
3637
+ """
3638
+ return Column.invoke_expression_over_column(col1, expression.Coalesce, expressions=[col2])
3639
+
3640
+
3641
+ @meta()
3642
+ def nvl2(col1: ColumnOrName, col2: ColumnOrName, col3: ColumnOrName) -> Column:
3643
+ """
3644
+ Returns `col2` if `col1` is not null, or `col3` otherwise.
3645
+
3646
+ .. versionadded:: 3.5.0
3647
+
3648
+ Parameters
3649
+ ----------
3650
+ col1 : :class:`~pyspark.sql.Column` or str
3651
+ col2 : :class:`~pyspark.sql.Column` or str
3652
+ col3 : :class:`~pyspark.sql.Column` or str
3653
+
3654
+ Examples
3655
+ --------
3656
+ >>> df = spark.createDataFrame([(None, 8, 6,), (1, 9, 9,)], ["a", "b", "c"])
3657
+ >>> df.select(nvl2(df.a, df.b, df.c).alias('r')).collect()
3658
+ [Row(r=6), Row(r=9)]
3659
+ """
3660
+ return Column.invoke_expression_over_column(col1, expression.Nvl2, true=col2, false=col3)
3661
+
3662
+
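The null-handling semantics of `nvl` and `nvl2` reduce to simple conditionals; a plain-Python sketch, illustrative only:

def nvl_py(a, b):
    return b if a is None else a

def nvl2_py(a, b, c):
    return b if a is not None else c

assert nvl_py(None, 8) == 8 and nvl_py(1, 9) == 1
assert nvl2_py(None, 8, 6) == 6 and nvl2_py(1, 9, 9) == 9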
3663
+ @meta(unsupported_engines="*")
3664
+ def parse_url(
3665
+ url: ColumnOrName, partToExtract: ColumnOrName, key: t.Optional[ColumnOrName] = None
3666
+ ) -> Column:
3667
+ """
3668
+ Extracts a part from a URL.
3669
+
3670
+ .. versionadded:: 3.5.0
3671
+
3672
+ Parameters
3673
+ ----------
3674
+ url : :class:`~pyspark.sql.Column` or str
3675
+ A column of string.
3676
+ partToExtract : :class:`~pyspark.sql.Column` or str
3677
+ A column of string, the path.
3678
+ key : :class:`~pyspark.sql.Column` or str, optional
3679
+ A column of string, the key.
3680
+
3681
+ Examples
3682
+ --------
3683
+ >>> df = spark.createDataFrame(
3684
+ ... [("http://spark.apache.org/path?query=1", "QUERY", "query",)],
3685
+ ... ["a", "b", "c"]
3686
+ ... )
3687
+ >>> df.select(parse_url(df.a, df.b, df.c).alias('r')).collect()
3688
+ [Row(r='1')]
3689
+
3690
+ >>> df.select(parse_url(df.a, df.b).alias('r')).collect()
3691
+ [Row(r='query=1')]
3692
+ """
3693
+ if key is not None:
3694
+ return Column.invoke_anonymous_function(url, "parse_url", partToExtract, key)
3695
+ else:
3696
+ return Column.invoke_anonymous_function(url, "parse_url", partToExtract)
3697
+
3698
+
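A hedged plain-Python analogue of the QUERY case in the examples above, using only the standard library (it does not cover the other `partToExtract` values):

import typing as t
from urllib.parse import urlparse, parse_qs

def parse_url_query(url: str, key: t.Optional[str] = None) -> t.Optional[str]:
    query = urlparse(url).query          # e.g. 'query=1'
    if key is None:
        return query
    values = parse_qs(query).get(key)
    return values[0] if values else None

assert parse_url_query("http://spark.apache.org/path?query=1") == "query=1"
assert parse_url_query("http://spark.apache.org/path?query=1", "query") == "1"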
3699
+ @meta(unsupported_engines="*")
3700
+ def pi() -> Column:
3701
+ """Returns Pi.
3702
+
3703
+ .. versionadded:: 3.5.0
3704
+
3705
+ Examples
3706
+ --------
3707
+ >>> spark.range(1).select(pi()).show()
3708
+ +-----------------+
3709
+ | PI()|
3710
+ +-----------------+
3711
+ |3.141592653589793|
3712
+ +-----------------+
3713
+ """
3714
+ return Column.invoke_anonymous_function(None, "pi")
3715
+
3716
+
3717
+ @meta(unsupported_engines="*")
3718
+ def pmod(dividend: t.Union[ColumnOrName, float], divisor: t.Union[ColumnOrName, float]) -> Column:
3719
+ """
3720
+ Returns the positive value of dividend mod divisor.
3721
+
3722
+ .. versionadded:: 3.4.0
3723
+
3724
+ Parameters
3725
+ ----------
3726
+ dividend : str, :class:`~pyspark.sql.Column` or float
3727
+ the column that contains dividend, or the specified dividend value
3728
+ divisor : str, :class:`~pyspark.sql.Column` or float
3729
+ the column that contains divisor, or the specified divisor value
3730
+
3731
+ Returns
3732
+ -------
3733
+ :class:`~pyspark.sql.Column`
3734
+ positive value of dividend mod divisor.
3735
+
3736
+ Notes
3737
+ -----
3738
+ Supports Spark Connect.
3739
+
3740
+ Examples
3741
+ --------
3742
+ >>> from pyspark.sql.functions import pmod
3743
+ >>> df = spark.createDataFrame([
3744
+ ... (1.0, float('nan')), (float('nan'), 2.0), (10.0, 3.0),
3745
+ ... (float('nan'), float('nan')), (-3.0, 4.0), (-10.0, 3.0),
3746
+ ... (-5.0, -6.0), (7.0, -8.0), (1.0, 2.0)],
3747
+ ... ("a", "b"))
3748
+ >>> df.select(pmod("a", "b")).show()
3749
+ +----------+
3750
+ |pmod(a, b)|
3751
+ +----------+
3752
+ | NaN|
3753
+ | NaN|
3754
+ | 1.0|
3755
+ | NaN|
3756
+ | 1.0|
3757
+ | 2.0|
3758
+ | -5.0|
3759
+ | 7.0|
3760
+ | 1.0|
3761
+ +----------+
3762
+ """
3763
+ dividend = lit(dividend) if isinstance(dividend, float) else dividend
3764
+ divisor = lit(divisor) if isinstance(divisor, float) else divisor
3765
+ return Column.invoke_anonymous_function(dividend, "pmod", divisor)
3766
+
3767
+
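The PMOD results in the table above can be reproduced with a small plain-Python reference (a sketch of the semantics, not the engine code):

import math

def pmod_py(dividend: float, divisor: float) -> float:
    # NaN in either argument yields NaN; otherwise take the remainder and
    # shift it by the divisor when it comes out negative.
    if math.isnan(dividend) or math.isnan(divisor):
        return float("nan")
    r = math.fmod(dividend, divisor)
    return math.fmod(r + divisor, divisor) if r < 0 else r

assert pmod_py(-3.0, 4.0) == 1.0 and pmod_py(-10.0, 3.0) == 2.0
assert pmod_py(-5.0, -6.0) == -5.0 and pmod_py(7.0, -8.0) == 7.0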
3768
+ @meta()
3769
+ def position(
3770
+ substr: ColumnOrName, str: ColumnOrName, start: t.Optional[ColumnOrName] = None
3771
+ ) -> Column:
3772
+ """
3773
+ Returns the position of the first occurrence of `substr` in `str` after position `start`.
3774
+ The given `start` and return value are 1-based.
3775
+
3776
+ .. versionadded:: 3.5.0
3777
+
3778
+ Parameters
3779
+ ----------
3780
+ substr : :class:`~pyspark.sql.Column` or str
3781
+ A column of string, substring.
3782
+ str : :class:`~pyspark.sql.Column` or str
3783
+ A column of string.
3784
+ start : :class:`~pyspark.sql.Column` or str, optional
3785
+ A column of string, start position.
3786
+
3787
+ Examples
3788
+ --------
3789
+ >>> import pyspark.sql.functions as sf
3790
+ >>> spark.createDataFrame(
3791
+ ... [("bar", "foobarbar", 5,)], ["a", "b", "c"]
3792
+ ... ).select(sf.position("a", "b", "c")).show()
3793
+ +-----------------+
3794
+ |position(a, b, c)|
3795
+ +-----------------+
3796
+ | 7|
3797
+ +-----------------+
3798
+
3799
+ >>> spark.createDataFrame(
3800
+ ... [("bar", "foobarbar", 5,)], ["a", "b", "c"]
3801
+ ... ).select(sf.position("a", "b")).show()
3802
+ +-----------------+
3803
+ |position(a, b, 1)|
3804
+ +-----------------+
3805
+ | 4|
3806
+ +-----------------+
3807
+ """
3808
+ if start is not None:
3809
+ return Column.invoke_expression_over_column(
3810
+ str, expression.StrPosition, substr=substr, position=start
3811
+ )
3812
+ else:
3813
+ return Column.invoke_expression_over_column(str, expression.StrPosition, substr=substr)
3814
+
3815
+
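For the string case, the 1-based behaviour in the examples above can be mimicked with `str.find` (illustrative sketch only; a result of 0 would mean "not found"):

def position_py(substr: str, s: str, start: int = 1) -> int:
    # Both `start` and the return value are 1-based.
    return s.find(substr, start - 1) + 1

assert position_py("bar", "foobarbar", 5) == 7
assert position_py("bar", "foobarbar") == 4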
3816
+ @meta(unsupported_engines="*")
3817
+ def positive(col: ColumnOrName) -> Column:
3818
+ """
3819
+ Returns the value.
3820
+
3821
+ .. versionadded:: 3.5.0
3822
+
3823
+ Parameters
3824
+ ----------
3825
+ col : :class:`~pyspark.sql.Column` or str
3826
+ input value column.
3827
+
3828
+ Returns
3829
+ -------
3830
+ :class:`~pyspark.sql.Column`
3831
+ value.
3832
+
3833
+ Examples
3834
+ --------
3835
+ >>> df = spark.createDataFrame([(-1,), (0,), (1,)], ['v'])
3836
+ >>> df.select(positive("v").alias("p")).show()
3837
+ +---+
3838
+ | p|
3839
+ +---+
3840
+ | -1|
3841
+ | 0|
3842
+ | 1|
3843
+ +---+
3844
+ """
3845
+ return Column.invoke_anonymous_function(col, "positive")
3846
+
3847
+
3848
+ @meta(unsupported_engines="*")
3849
+ def printf(format: ColumnOrName, *cols: ColumnOrName) -> Column:
3850
+ """
3851
+ Formats the arguments in printf-style and returns the result as a string column.
3852
+
3853
+ .. versionadded:: 3.5.0
3854
+
3855
+ Parameters
3856
+ ----------
3857
+ format : :class:`~pyspark.sql.Column` or str
3858
+ string that can contain embedded format tags and used as result column's value
3859
+ cols : :class:`~pyspark.sql.Column` or str
3860
+ column names or :class:`~pyspark.sql.Column`\\s to be used in formatting
3861
+
3862
+ Examples
3863
+ --------
3864
+ >>> import pyspark.sql.functions as sf
3865
+ >>> spark.createDataFrame(
3866
+ ... [("aa%d%s", 123, "cc",)], ["a", "b", "c"]
3867
+ ... ).select(sf.printf("a", "b", "c")).show()
3868
+ +---------------+
3869
+ |printf(a, b, c)|
3870
+ +---------------+
3871
+ | aa123cc|
3872
+ +---------------+
3873
+ """
3874
+ return Column.invoke_anonymous_function(format, "printf", *cols)
3875
+
3876
+
3877
+ @meta(unsupported_engines=["*", "spark"])
3878
+ def product(col: ColumnOrName) -> Column:
3879
+ """
3880
+ Aggregate function: returns the product of the values in a group.
3881
+
3882
+ .. versionadded:: 3.2.0
3883
+
3884
+ .. versionchanged:: 3.4.0
3885
+ Supports Spark Connect.
3886
+
3887
+ Parameters
3888
+ ----------
3889
+ col : str, :class:`Column`
3890
+ column containing values to be multiplied together
3891
+
3892
+ Returns
3893
+ -------
3894
+ :class:`~pyspark.sql.Column`
3895
+ the column for computed results.
3896
+
3897
+ Examples
3898
+ --------
3899
+ >>> df = spark.range(1, 10).toDF('x').withColumn('mod3', col('x') % 3)
3900
+ >>> prods = df.groupBy('mod3').agg(product('x').alias('product'))
3901
+ >>> prods.orderBy('mod3').show()
3902
+ +----+-------+
3903
+ |mod3|product|
3904
+ +----+-------+
3905
+ | 0| 162.0|
3906
+ | 1| 28.0|
3907
+ | 2| 80.0|
3908
+ +----+-------+
3909
+ """
3910
+ return Column.invoke_anonymous_function(col, "product")
3911
+
3912
+
3913
+ reduce = aggregate
3914
+
3915
+
3916
+ @meta(unsupported_engines="*")
3917
+ def reflect(*cols: ColumnOrName) -> Column:
3918
+ """
3919
+ Calls a method with reflection.
3920
+
3921
+ .. versionadded:: 3.5.0
3922
+
3923
+ Parameters
3924
+ ----------
3925
+ cols : :class:`~pyspark.sql.Column` or str
3926
+ the first element should be a literal string for the class name,
3927
+ and the second element should be a literal string for the method name,
3928
+ and the remaining are input arguments to the Java method.
3929
+
3930
+ Examples
3931
+ --------
3932
+ >>> df = spark.createDataFrame([("a5cf6c42-0c85-418f-af6c-3e4e5b1328f2",)], ["a"])
3933
+ >>> df.select(
3934
+ ... reflect(lit("java.util.UUID"), lit("fromString"), df.a).alias('r')
3935
+ ... ).collect()
3936
+ [Row(r='a5cf6c42-0c85-418f-af6c-3e4e5b1328f2')]
3937
+ """
3938
+ if len(cols) > 1:
3939
+ return Column.invoke_anonymous_function(cols[0], "reflect", *cols[1:])
3940
+ return Column.invoke_anonymous_function(cols[0], "reflect")
3941
+
3942
+
3943
+ @meta(unsupported_engines="*")
3944
+ def regexp(str: ColumnOrName, regexp: ColumnOrName) -> Column:
3945
+ r"""Returns true if `str` matches the Java regex `regexp`, or false otherwise.
3946
+
3947
+ .. versionadded:: 3.5.0
3948
+
3949
+ Parameters
3950
+ ----------
3951
+ str : :class:`~pyspark.sql.Column` or str
3952
+ target column to work on.
3953
+ regexp : :class:`~pyspark.sql.Column` or str
3954
+ regex pattern to apply.
3955
+
3956
+ Returns
3957
+ -------
3958
+ :class:`~pyspark.sql.Column`
3959
+ true if `str` matches a Java regex, or false otherwise.
3960
+
3961
+ Examples
3962
+ --------
3963
+ >>> import pyspark.sql.functions as sf
3964
+ >>> spark.createDataFrame(
3965
+ ... [("1a 2b 14m", r"(\d+)")], ["str", "regexp"]
3966
+ ... ).select(sf.regexp('str', sf.lit(r'(\d+)'))).show()
3967
+ +------------------+
3968
+ |REGEXP(str, (\d+))|
3969
+ +------------------+
3970
+ | true|
3971
+ +------------------+
3972
+
3973
+ >>> import pyspark.sql.functions as sf
3974
+ >>> spark.createDataFrame(
3975
+ ... [("1a 2b 14m", r"(\d+)")], ["str", "regexp"]
3976
+ ... ).select(sf.regexp('str', sf.lit(r'\d{2}b'))).show()
3977
+ +-------------------+
3978
+ |REGEXP(str, \d{2}b)|
3979
+ +-------------------+
3980
+ | false|
3981
+ +-------------------+
3982
+
3983
+ >>> import pyspark.sql.functions as sf
3984
+ >>> spark.createDataFrame(
3985
+ ... [("1a 2b 14m", r"(\d+)")], ["str", "regexp"]
3986
+ ... ).select(sf.regexp('str', sf.col("regexp"))).show()
3987
+ +-------------------+
3988
+ |REGEXP(str, regexp)|
3989
+ +-------------------+
3990
+ | true|
3991
+ +-------------------+
3992
+ """
3993
+ return Column.invoke_anonymous_function(str, "regexp", regexp)
3994
+
3995
+
3996
+ @meta(unsupported_engines="*")
3997
+ def regexp_count(str: ColumnOrName, regexp: ColumnOrName) -> Column:
3998
+ r"""Returns a count of the number of times that the Java regex pattern `regexp` is matched
3999
+ in the string `str`.
4000
+
4001
+ .. versionadded:: 3.5.0
4002
+
4003
+ Parameters
4004
+ ----------
4005
+ str : :class:`~pyspark.sql.Column` or str
4006
+ target column to work on.
4007
+ regexp : :class:`~pyspark.sql.Column` or str
4008
+ regex pattern to apply.
4009
+
4010
+ Returns
4011
+ -------
4012
+ :class:`~pyspark.sql.Column`
4013
+ the number of times that a Java regex pattern is matched in the string.
4014
+
4015
+ Examples
4016
+ --------
4017
+ >>> df = spark.createDataFrame([("1a 2b 14m", r"\d+")], ["str", "regexp"])
4018
+ >>> df.select(regexp_count('str', lit(r'\d+')).alias('d')).collect()
4019
+ [Row(d=3)]
4020
+ >>> df.select(regexp_count('str', lit(r'mmm')).alias('d')).collect()
4021
+ [Row(d=0)]
4022
+ >>> df.select(regexp_count("str", col("regexp")).alias('d')).collect()
4023
+ [Row(d=3)]
4024
+ """
4025
+ return Column.invoke_anonymous_function(str, "regexp_count", regexp)
4026
+
4027
+
4028
+ @meta(unsupported_engines="*")
4029
+ def regexp_extract_all(
4030
+ str: ColumnOrName, regexp: ColumnOrName, idx: t.Optional[t.Union[int, Column]] = None
4031
+ ) -> Column:
4032
+ r"""Extract all strings in the `str` that match the Java regex `regexp`
4033
+ and corresponding to the regex group index.
4034
+
4035
+ .. versionadded:: 3.5.0
4036
+
4037
+ Parameters
4038
+ ----------
4039
+ str : :class:`~pyspark.sql.Column` or str
4040
+ target column to work on.
4041
+ regexp : :class:`~pyspark.sql.Column` or str
4042
+ regex pattern to apply.
4043
+ idx : int
4044
+ matched group id.
4045
+
4046
+ Returns
4047
+ -------
4048
+ :class:`~pyspark.sql.Column`
4049
+ all strings in the `str` that match a Java regex and corresponding to the regex group index.
4050
+
4051
+ Examples
4052
+ --------
4053
+ >>> df = spark.createDataFrame([("100-200, 300-400", r"(\d+)-(\d+)")], ["str", "regexp"])
4054
+ >>> df.select(regexp_extract_all('str', lit(r'(\d+)-(\d+)')).alias('d')).collect()
4055
+ [Row(d=['100', '300'])]
4056
+ >>> df.select(regexp_extract_all('str', lit(r'(\d+)-(\d+)'), 1).alias('d')).collect()
4057
+ [Row(d=['100', '300'])]
4058
+ >>> df.select(regexp_extract_all('str', lit(r'(\d+)-(\d+)'), 2).alias('d')).collect()
4059
+ [Row(d=['200', '400'])]
4060
+ >>> df.select(regexp_extract_all('str', col("regexp")).alias('d')).collect()
4061
+ [Row(d=['100', '300'])]
4062
+ """
4063
+ if idx is None:
4064
+ return Column.invoke_anonymous_function(str, "regexp_extract_all", regexp)
4065
+ else:
4066
+ idx = lit(idx) if isinstance(idx, int) else idx
4067
+ return Column.invoke_anonymous_function(str, "regexp_extract_all", regexp, idx)
4068
+
4069
+
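A plain-Python approximation with the `re` module (Java and Python regex dialects differ slightly, so treat this as a sketch of the semantics only):

import re

def regexp_extract_all_py(s: str, pattern: str, idx: int = 1) -> list:
    # Group 1 is extracted by default, matching the examples above.
    return [m.group(idx) for m in re.finditer(pattern, s)]

assert regexp_extract_all_py("100-200, 300-400", r"(\d+)-(\d+)") == ["100", "300"]
assert regexp_extract_all_py("100-200, 300-400", r"(\d+)-(\d+)", 2) == ["200", "400"]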
4070
+ @meta(unsupported_engines="*")
4071
+ def regexp_instr(
4072
+ str: ColumnOrName, regexp: ColumnOrName, idx: t.Optional[t.Union[int, Column]] = None
4073
+ ) -> Column:
4074
+ r"""Extract all strings in the `str` that match the Java regex `regexp`
4075
+ and corresponding to the regex group index.
4076
+
4077
+ .. versionadded:: 3.5.0
4078
+
4079
+ Parameters
4080
+ ----------
4081
+ str : :class:`~pyspark.sql.Column` or str
4082
+ target column to work on.
4083
+ regexp : :class:`~pyspark.sql.Column` or str
4084
+ regex pattern to apply.
4085
+ idx : int
4086
+ matched group id.
4087
+
4088
+ Returns
4089
+ -------
4090
+ :class:`~pyspark.sql.Column`
4091
+ the 1-based position of the first match of the Java regex in `str`, or 0 if there is no match.
4092
+
4093
+ Examples
4094
+ --------
4095
+ >>> df = spark.createDataFrame([("1a 2b 14m", r"\d+(a|b|m)")], ["str", "regexp"])
4096
+ >>> df.select(regexp_instr('str', lit(r'\d+(a|b|m)')).alias('d')).collect()
4097
+ [Row(d=1)]
4098
+ >>> df.select(regexp_instr('str', lit(r'\d+(a|b|m)'), 1).alias('d')).collect()
4099
+ [Row(d=1)]
4100
+ >>> df.select(regexp_instr('str', lit(r'\d+(a|b|m)'), 2).alias('d')).collect()
4101
+ [Row(d=1)]
4102
+ >>> df.select(regexp_instr('str', col("regexp")).alias('d')).collect()
4103
+ [Row(d=1)]
4104
+ """
4105
+ if idx is None:
4106
+ return Column.invoke_anonymous_function(str, "regexp_instr", regexp)
4107
+ else:
4108
+ idx = lit(idx) if isinstance(idx, int) else idx
4109
+ return Column.invoke_anonymous_function(str, "regexp_instr", regexp, idx)
4110
+
4111
+
4112
+ @meta(unsupported_engines="snowflake")
4113
+ def regexp_like(str: ColumnOrName, regexp: ColumnOrName) -> Column:
4114
+ r"""Returns true if `str` matches the Java regex `regexp`, or false otherwise.
4115
+
4116
+ .. versionadded:: 3.5.0
4117
+
4118
+ Parameters
4119
+ ----------
4120
+ str : :class:`~pyspark.sql.Column` or str
4121
+ target column to work on.
4122
+ regexp : :class:`~pyspark.sql.Column` or str
4123
+ regex pattern to apply.
4124
+
4125
+ Returns
4126
+ -------
4127
+ :class:`~pyspark.sql.Column`
4128
+ true if `str` matches a Java regex, or false otherwise.
4129
+
4130
+ Examples
4131
+ --------
4132
+ >>> import pyspark.sql.functions as sf
4133
+ >>> spark.createDataFrame(
4134
+ ... [("1a 2b 14m", r"(\d+)")], ["str", "regexp"]
4135
+ ... ).select(sf.regexp_like('str', sf.lit(r'(\d+)'))).show()
4136
+ +-----------------------+
4137
+ |REGEXP_LIKE(str, (\d+))|
4138
+ +-----------------------+
4139
+ | true|
4140
+ +-----------------------+
4141
+
4142
+ >>> import pyspark.sql.functions as sf
4143
+ >>> spark.createDataFrame(
4144
+ ... [("1a 2b 14m", r"(\d+)")], ["str", "regexp"]
4145
+ ... ).select(sf.regexp_like('str', sf.lit(r'\d{2}b'))).show()
4146
+ +------------------------+
4147
+ |REGEXP_LIKE(str, \d{2}b)|
4148
+ +------------------------+
4149
+ | false|
4150
+ +------------------------+
4151
+
4152
+ >>> import pyspark.sql.functions as sf
4153
+ >>> spark.createDataFrame(
4154
+ ... [("1a 2b 14m", r"(\d+)")], ["str", "regexp"]
4155
+ ... ).select(sf.regexp_like('str', sf.col("regexp"))).show()
4156
+ +------------------------+
4157
+ |REGEXP_LIKE(str, regexp)|
4158
+ +------------------------+
4159
+ | true|
4160
+ +------------------------+
4161
+ """
4162
+ return Column.invoke_expression_over_column(str, expression.RegexpLike, expression=regexp)
4163
+
4164
+
4165
+ @meta(unsupported_engines="*")
4166
+ def regexp_substr(str: ColumnOrName, regexp: ColumnOrName) -> Column:
4167
+ r"""Returns the substring that matches the Java regex `regexp` within the string `str`.
4168
+ If the regular expression is not found, the result is null.
4169
+
4170
+ .. versionadded:: 3.5.0
4171
+
4172
+ Parameters
4173
+ ----------
4174
+ str : :class:`~pyspark.sql.Column` or str
4175
+ target column to work on.
4176
+ regexp : :class:`~pyspark.sql.Column` or str
4177
+ regex pattern to apply.
4178
+
4179
+ Returns
4180
+ -------
4181
+ :class:`~pyspark.sql.Column`
4182
+ the substring that matches a Java regex within the string `str`.
4183
+
4184
+ Examples
4185
+ --------
4186
+ >>> df = spark.createDataFrame([("1a 2b 14m", r"\d+")], ["str", "regexp"])
4187
+ >>> df.select(regexp_substr('str', lit(r'\d+')).alias('d')).collect()
4188
+ [Row(d='1')]
4189
+ >>> df.select(regexp_substr('str', lit(r'mmm')).alias('d')).collect()
4190
+ [Row(d=None)]
4191
+ >>> df.select(regexp_substr("str", col("regexp")).alias('d')).collect()
4192
+ [Row(d='1')]
4193
+ """
4194
+ return Column.invoke_anonymous_function(str, "regexp_substr", regexp)
4195
+
4196
+
4197
+ @meta(unsupported_engines="*")
4198
+ def regr_avgx(y: ColumnOrName, x: ColumnOrName) -> Column:
4199
+ """
4200
+ Aggregate function: returns the average of the independent variable for non-null pairs
4201
+ in a group, where `y` is the dependent variable and `x` is the independent variable.
4202
+
4203
+ .. versionadded:: 3.5.0
4204
+
4205
+ Parameters
4206
+ ----------
4207
+ y : :class:`~pyspark.sql.Column` or str
4208
+ the dependent variable.
4209
+ x : :class:`~pyspark.sql.Column` or str
4210
+ the independent variable.
4211
+
4212
+ Returns
4213
+ -------
4214
+ :class:`~pyspark.sql.Column`
4215
+ the average of the independent variable for non-null pairs in a group.
4216
+
4217
+ Examples
4218
+ --------
4219
+ >>> x = (col("id") % 3).alias("x")
4220
+ >>> y = (randn(42) + x * 10).alias("y")
4221
+ >>> df = spark.range(0, 1000, 1, 1).select(x, y)
4222
+ >>> df.select(regr_avgx("y", "x")).first()
4223
+ Row(regr_avgx(y, x)=0.999)
4224
+ """
4225
+ return Column.invoke_anonymous_function(y, "regr_avgx", x)
4226
+
4227
+
4228
+ @meta(unsupported_engines="*")
4229
+ def regr_avgy(y: ColumnOrName, x: ColumnOrName) -> Column:
4230
+ """
4231
+ Aggregate function: returns the average of the dependent variable for non-null pairs
4232
+ in a group, where `y` is the dependent variable and `x` is the independent variable.
4233
+
4234
+ .. versionadded:: 3.5.0
4235
+
4236
+ Parameters
4237
+ ----------
4238
+ y : :class:`~pyspark.sql.Column` or str
4239
+ the dependent variable.
4240
+ x : :class:`~pyspark.sql.Column` or str
4241
+ the independent variable.
4242
+
4243
+ Returns
4244
+ -------
4245
+ :class:`~pyspark.sql.Column`
4246
+ the average of the dependent variable for non-null pairs in a group.
4247
+
4248
+ Examples
4249
+ --------
4250
+ >>> x = (col("id") % 3).alias("x")
4251
+ >>> y = (randn(42) + x * 10).alias("y")
4252
+ >>> df = spark.range(0, 1000, 1, 1).select(x, y)
4253
+ >>> df.select(regr_avgy("y", "x")).first()
4254
+ Row(regr_avgy(y, x)=9.980732994136464)
4255
+ """
4256
+ return Column.invoke_anonymous_function(y, "regr_avgy", x)
4257
+
4258
+
4259
+ @meta(unsupported_engines="*")
4260
+ def regr_count(y: ColumnOrName, x: ColumnOrName) -> Column:
4261
+ """
4262
+ Aggregate function: returns the number of non-null number pairs
4263
+ in a group, where `y` is the dependent variable and `x` is the independent variable.
4264
+
4265
+ .. versionadded:: 3.5.0
4266
+
4267
+ Parameters
4268
+ ----------
4269
+ y : :class:`~pyspark.sql.Column` or str
4270
+ the dependent variable.
4271
+ x : :class:`~pyspark.sql.Column` or str
4272
+ the independent variable.
4273
+
4274
+ Returns
4275
+ -------
4276
+ :class:`~pyspark.sql.Column`
4277
+ the number of non-null number pairs in a group.
4278
+
4279
+ Examples
4280
+ --------
4281
+ >>> x = (col("id") % 3).alias("x")
4282
+ >>> y = (randn(42) + x * 10).alias("y")
4283
+ >>> df = spark.range(0, 1000, 1, 1).select(x, y)
4284
+ >>> df.select(regr_count("y", "x")).first()
4285
+ Row(regr_count(y, x)=1000)
4286
+ """
4287
+ return Column.invoke_anonymous_function(y, "regr_count", x)
4288
+
4289
+
4290
+ @meta(unsupported_engines="*")
4291
+ def regr_intercept(y: ColumnOrName, x: ColumnOrName) -> Column:
4292
+ """
4293
+ Aggregate function: returns the intercept of the univariate linear regression line
4294
+ for non-null pairs in a group, where `y` is the dependent variable and
4295
+ `x` is the independent variable.
4296
+
4297
+ .. versionadded:: 3.5.0
4298
+
4299
+ Parameters
4300
+ ----------
4301
+ y : :class:`~pyspark.sql.Column` or str
4302
+ the dependent variable.
4303
+ x : :class:`~pyspark.sql.Column` or str
4304
+ the independent variable.
4305
+
4306
+ Returns
4307
+ -------
4308
+ :class:`~pyspark.sql.Column`
4309
+ the intercept of the univariate linear regression line for non-null pairs in a group.
4310
+
4311
+ Examples
4312
+ --------
4313
+ >>> x = (col("id") % 3).alias("x")
4314
+ >>> y = (randn(42) + x * 10).alias("y")
4315
+ >>> df = spark.range(0, 1000, 1, 1).select(x, y)
4316
+ >>> df.select(regr_intercept("y", "x")).first()
4317
+ Row(regr_intercept(y, x)=-0.04961745990969568)
4318
+ """
4319
+ return Column.invoke_anonymous_function(y, "regr_intercept", x)
4320
+
4321
+
4322
+ @meta(unsupported_engines="*")
4323
+ def regr_r2(y: ColumnOrName, x: ColumnOrName) -> Column:
4324
+ """
4325
+ Aggregate function: returns the coefficient of determination for non-null pairs
4326
+ in a group, where `y` is the dependent variable and `x` is the independent variable.
4327
+
4328
+ .. versionadded:: 3.5.0
4329
+
4330
+ Parameters
4331
+ ----------
4332
+ y : :class:`~pyspark.sql.Column` or str
4333
+ the dependent variable.
4334
+ x : :class:`~pyspark.sql.Column` or str
4335
+ the independent variable.
4336
+
4337
+ Returns
4338
+ -------
4339
+ :class:`~pyspark.sql.Column`
4340
+ the coefficient of determination for non-null pairs in a group.
4341
+
4342
+ Examples
4343
+ --------
4344
+ >>> x = (col("id") % 3).alias("x")
4345
+ >>> y = (randn(42) + x * 10).alias("y")
4346
+ >>> df = spark.range(0, 1000, 1, 1).select(x, y)
4347
+ >>> df.select(regr_r2("y", "x")).first()
4348
+ Row(regr_r2(y, x)=0.9851908293645436)
4349
+ """
4350
+ return Column.invoke_anonymous_function(y, "regr_r2", x)
4351
+
4352
+
4353
+ @meta(unsupported_engines="*")
4354
+ def regr_slope(y: ColumnOrName, x: ColumnOrName) -> Column:
4355
+ """
4356
+ Aggregate function: returns the slope of the linear regression line for non-null pairs
4357
+ in a group, where `y` is the dependent variable and `x` is the independent variable.
4358
+
4359
+ .. versionadded:: 3.5.0
4360
+
4361
+ Parameters
4362
+ ----------
4363
+ y : :class:`~pyspark.sql.Column` or str
4364
+ the dependent variable.
4365
+ x : :class:`~pyspark.sql.Column` or str
4366
+ the independent variable.
4367
+
4368
+ Returns
4369
+ -------
4370
+ :class:`~pyspark.sql.Column`
4371
+ the slope of the linear regression line for non-null pairs in a group.
4372
+
4373
+ Examples
4374
+ --------
4375
+ >>> x = (col("id") % 3).alias("x")
4376
+ >>> y = (randn(42) + x * 10).alias("y")
4377
+ >>> df = spark.range(0, 1000, 1, 1).select(x, y)
4378
+ >>> df.select(regr_slope("y", "x")).first()
4379
+ Row(regr_slope(y, x)=10.040390844891048)
4380
+ """
4381
+ return Column.invoke_anonymous_function(y, "regr_slope", x)
4382
+
4383
+
4384
+ @meta(unsupported_engines="*")
4385
+ def regr_sxx(y: ColumnOrName, x: ColumnOrName) -> Column:
4386
+ """
4387
+ Aggregate function: returns REGR_COUNT(y, x) * VAR_POP(x) for non-null pairs
4388
+ in a group, where `y` is the dependent variable and `x` is the independent variable.
4389
+
4390
+ .. versionadded:: 3.5.0
4391
+
4392
+ Parameters
4393
+ ----------
4394
+ y : :class:`~pyspark.sql.Column` or str
4395
+ the dependent variable.
4396
+ x : :class:`~pyspark.sql.Column` or str
4397
+ the independent variable.
4398
+
4399
+ Returns
4400
+ -------
4401
+ :class:`~pyspark.sql.Column`
4402
+ REGR_COUNT(y, x) * VAR_POP(x) for non-null pairs in a group.
4403
+
4404
+ Examples
4405
+ --------
4406
+ >>> x = (col("id") % 3).alias("x")
4407
+ >>> y = (randn(42) + x * 10).alias("y")
4408
+ >>> df = spark.range(0, 1000, 1, 1).select(x, y)
4409
+ >>> df.select(regr_sxx("y", "x")).first()
4410
+ Row(regr_sxx(y, x)=666.9989999999996)
4411
+ """
4412
+ return Column.invoke_anonymous_function(y, "regr_sxx", x)
4413
+
4414
+
4415
+ @meta(unsupported_engines="*")
4416
+ def regr_sxy(y: ColumnOrName, x: ColumnOrName) -> Column:
4417
+ """
4418
+ Aggregate function: returns REGR_COUNT(y, x) * COVAR_POP(y, x) for non-null pairs
4419
+ in a group, where `y` is the dependent variable and `x` is the independent variable.
4420
+
4421
+ .. versionadded:: 3.5.0
4422
+
4423
+ Parameters
4424
+ ----------
4425
+ y : :class:`~pyspark.sql.Column` or str
4426
+ the dependent variable.
4427
+ x : :class:`~pyspark.sql.Column` or str
4428
+ the independent variable.
4429
+
4430
+ Returns
4431
+ -------
4432
+ :class:`~pyspark.sql.Column`
4433
+ REGR_COUNT(y, x) * COVAR_POP(y, x) for non-null pairs in a group.
4434
+
4435
+ Examples
4436
+ --------
4437
+ >>> x = (col("id") % 3).alias("x")
4438
+ >>> y = (randn(42) + x * 10).alias("y")
4439
+ >>> df = spark.range(0, 1000, 1, 1).select(x, y)
4440
+ >>> df.select(regr_sxy("y", "x")).first()
4441
+ Row(regr_sxy(y, x)=6696.93065315148)
4442
+ """
4443
+ return Column.invoke_anonymous_function(y, "regr_sxy", x)
4444
+
4445
+
4446
+ @meta(unsupported_engines="*")
4447
+ def regr_syy(y: ColumnOrName, x: ColumnOrName) -> Column:
4448
+ """
4449
+ Aggregate function: returns REGR_COUNT(y, x) * VAR_POP(y) for non-null pairs
4450
+ in a group, where `y` is the dependent variable and `x` is the independent variable.
4451
+
4452
+ .. versionadded:: 3.5.0
4453
+
4454
+ Parameters
4455
+ ----------
4456
+ y : :class:`~pyspark.sql.Column` or str
4457
+ the dependent variable.
4458
+ x : :class:`~pyspark.sql.Column` or str
4459
+ the independent variable.
4460
+
4461
+ Returns
4462
+ -------
4463
+ :class:`~pyspark.sql.Column`
4464
+ REGR_COUNT(y, x) * VAR_POP(y) for non-null pairs in a group.
4465
+
4466
+ Examples
4467
+ --------
4468
+ >>> x = (col("id") % 3).alias("x")
4469
+ >>> y = (randn(42) + x * 10).alias("y")
4470
+ >>> df = spark.range(0, 1000, 1, 1).select(x, y)
4471
+ >>> df.select(regr_syy("y", "x")).first()
4472
+ Row(regr_syy(y, x)=68250.53503811295)
4473
+ """
4474
+ return Column.invoke_anonymous_function(y, "regr_syy", x)
4475
+
4476
+
4477
+ @meta(unsupported_engines="*")
4478
+ def replace(
4479
+ src: ColumnOrName, search: ColumnOrName, replace: t.Optional[ColumnOrName] = None
4480
+ ) -> Column:
4481
+ """
4482
+ Replaces all occurrences of `search` with `replace`.
4483
+
4484
+ .. versionadded:: 3.5.0
4485
+
4486
+ Parameters
4487
+ ----------
4488
+ src : :class:`~pyspark.sql.Column` or str
4489
+ A column of string to be replaced.
4490
+ search : :class:`~pyspark.sql.Column` or str
4491
+ A column of string. If `search` is not found in `str`, `str` is returned unchanged.
4492
+ replace : :class:`~pyspark.sql.Column` or str, optional
4493
+ A column of string, optional. If `replace` is not specified or is an empty string,
4494
+ the matched `search` substrings are simply removed from `str`.
4495
+
4496
+ Examples
4497
+ --------
4498
+ >>> df = spark.createDataFrame([("ABCabc", "abc", "DEF",)], ["a", "b", "c"])
4499
+ >>> df.select(replace(df.a, df.b, df.c).alias('r')).collect()
4500
+ [Row(r='ABCDEF')]
4501
+
4502
+ >>> df.select(replace(df.a, df.b).alias('r')).collect()
4503
+ [Row(r='ABC')]
4504
+ """
4505
+ if replace is not None:
4506
+ return Column.invoke_anonymous_function(src, "replace", search, replace)
4507
+ else:
4508
+ return Column.invoke_anonymous_function(src, "replace", search)
4509
+
4510
+
4511
+ @meta()
4512
+ def right(str: ColumnOrName, len: ColumnOrName) -> Column:
4513
+ """
4514
+ Returns the rightmost `len` characters (`len` can be string type) from the string `str`;
4515
+ if `len` is less than or equal to 0, the result is an empty string.
4516
+
4517
+ .. versionadded:: 3.5.0
4518
+
4519
+ Parameters
4520
+ ----------
4521
+ str : :class:`~pyspark.sql.Column` or str
4522
+ Input column or strings.
4523
+ len : :class:`~pyspark.sql.Column` or str
4524
+ Input column or strings, the rightmost `len`.
4525
+
4526
+ Examples
4527
+ --------
4528
+ >>> df = spark.createDataFrame([("Spark SQL", 3,)], ['a', 'b'])
4529
+ >>> df.select(right(df.a, df.b).alias('r')).collect()
4530
+ [Row(r='SQL')]
4531
+ """
4532
+ return Column.invoke_expression_over_column(str, expression.Right, expression=len)
4533
+
4534
+
4535
+ rlike = regexp_like
4536
+ sha = sha1
4537
+
4538
+
4539
+ @meta()
4540
+ def sign(col: ColumnOrName) -> Column:
4541
+ """
4542
+ Computes the signum of the given value.
4543
+
4544
+ .. versionadded:: 1.4.0
4545
+
4546
+ .. versionchanged:: 3.4.0
4547
+ Supports Spark Connect.
4548
+
4549
+ Parameters
4550
+ ----------
4551
+ col : :class:`~pyspark.sql.Column` or str
4552
+ target column to compute on.
4553
+
4554
+ Returns
4555
+ -------
4556
+ :class:`~pyspark.sql.Column`
4557
+ the column for computed results.
4558
+
4559
+ Examples
4560
+ --------
4561
+ >>> import pyspark.sql.functions as sf
4562
+ >>> spark.range(1).select(
4563
+ ... sf.sign(sf.lit(-5)),
4564
+ ... sf.sign(sf.lit(6))
4565
+ ... ).show()
4566
+ +--------+-------+
4567
+ |sign(-5)|sign(6)|
4568
+ +--------+-------+
4569
+ | -1.0| 1.0|
4570
+ +--------+-------+
4571
+ """
4572
+ return Column.invoke_expression_over_column(col, expression.Sign)
4573
+
4574
+
4575
+ @meta(unsupported_engines="*")
4576
+ def some(col: ColumnOrName) -> Column:
4577
+ """
4578
+ Aggregate function: returns true if at least one value of `col` is true.
4579
+
4580
+ .. versionadded:: 3.5.0
4581
+
4582
+ Parameters
4583
+ ----------
4584
+ col : :class:`~pyspark.sql.Column` or str
4585
+ column to check if at least one value is true.
4586
+
4587
+ Returns
4588
+ -------
4589
+ :class:`~pyspark.sql.Column`
4590
+ true if at least one value of `col` is true, false otherwise.
4591
+
4592
+ Examples
4593
+ --------
4594
+ >>> import pyspark.sql.functions as sf
4595
+ >>> spark.createDataFrame(
4596
+ ... [[True], [True], [True]], ["flag"]
4597
+ ... ).select(sf.some("flag")).show()
4598
+ +----------+
4599
+ |some(flag)|
4600
+ +----------+
4601
+ | true|
4602
+ +----------+
4603
+
4604
+ >>> import pyspark.sql.functions as sf
4605
+ >>> spark.createDataFrame(
4606
+ ... [[True], [False], [True]], ["flag"]
4607
+ ... ).select(sf.some("flag")).show()
4608
+ +----------+
4609
+ |some(flag)|
4610
+ +----------+
4611
+ | true|
4612
+ +----------+
4613
+
4614
+ >>> import pyspark.sql.functions as sf
4615
+ >>> spark.createDataFrame(
4616
+ ... [[False], [False], [False]], ["flag"]
4617
+ ... ).select(sf.some("flag")).show()
4618
+ +----------+
4619
+ |some(flag)|
4620
+ +----------+
4621
+ | false|
4622
+ +----------+
4623
+ """
4624
+ return Column.invoke_anonymous_function(col, "some")
4625
+
4626
+
4627
+ @meta(unsupported_engines="*")
4628
+ def spark_partition_id() -> Column:
4629
+ """A column for partition ID.
4630
+
4631
+ .. versionadded:: 1.6.0
4632
+
4633
+ .. versionchanged:: 3.4.0
4634
+ Supports Spark Connect.
4635
+
4636
+ Notes
4637
+ -----
4638
+ This is non-deterministic because it depends on data partitioning and task scheduling.
4639
+
4640
+ Returns
4641
+ -------
4642
+ :class:`~pyspark.sql.Column`
4643
+ partition id the record belongs to.
4644
+
4645
+ Examples
4646
+ --------
4647
+ >>> df = spark.range(2)
4648
+ >>> df.repartition(1).select(spark_partition_id().alias("pid")).collect()
4649
+ [Row(pid=0), Row(pid=0)]
4650
+ """
4651
+ return Column.invoke_anonymous_function(None, "spark_partition_id")
4652
+
4653
+
4654
+ @meta(unsupported_engines="*")
4655
+ def split_part(src: ColumnOrName, delimiter: ColumnOrName, partNum: ColumnOrName) -> Column:
4656
+ """
4657
+ Splits `src` by `delimiter` and returns the requested part of the split (1-based).
4658
+ If any input is null, returns null. If `partNum` is out of range of the split parts,
4659
+ returns an empty string. If `partNum` is 0, throws an error. If `partNum` is negative,
4660
+ the parts are counted backward from the end of the string.
4661
+ If the `delimiter` is an empty string, `src` is not split.
4662
+
4663
+ .. versionadded:: 3.5.0
4664
+
4665
+ Parameters
4666
+ ----------
4667
+ src : :class:`~pyspark.sql.Column` or str
4668
+ A column of string to be split.
4669
+ delimiter : :class:`~pyspark.sql.Column` or str
4670
+ A column of string, the delimiter used for split.
4671
+ partNum : :class:`~pyspark.sql.Column` or str
4672
+ A column of string, requested part of the split (1-based).
4673
+
4674
+ Examples
4675
+ --------
4676
+ >>> df = spark.createDataFrame([("11.12.13", ".", 3,)], ["a", "b", "c"])
4677
+ >>> df.select(split_part(df.a, df.b, df.c).alias('r')).collect()
4678
+ [Row(r='13')]
4679
+ """
4680
+ return Column.invoke_anonymous_function(src, "split_part", delimiter, partNum)
4681
+
4682
+
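The 1-based / negative / out-of-range rules described above, sketched in plain Python (not the engine implementation, and null handling is omitted):

def split_part_py(src: str, delimiter: str, part_num: int) -> str:
    if part_num == 0:
        raise ValueError("part_num must not be 0")
    parts = [src] if delimiter == "" else src.split(delimiter)
    idx = part_num - 1 if part_num > 0 else len(parts) + part_num
    return parts[idx] if 0 <= idx < len(parts) else ""

assert split_part_py("11.12.13", ".", 3) == "13"
assert split_part_py("11.12.13", ".", -1) == "13"
assert split_part_py("11.12.13", ".", 9) == ""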
4683
+ @meta()
4684
+ def startswith(str: ColumnOrName, prefix: ColumnOrName) -> Column:
4685
+ """
4686
+ Returns a boolean. The value is True if str starts with prefix.
4687
+ Returns NULL if either input expression is NULL. Otherwise, returns False.
4688
+ Both str and prefix must be of STRING or BINARY type.
4689
+
4690
+ .. versionadded:: 3.5.0
4691
+
4692
+ Parameters
4693
+ ----------
4694
+ str : :class:`~pyspark.sql.Column` or str
4695
+ A column of string.
4696
+ prefix : :class:`~pyspark.sql.Column` or str
4697
+ A column of string, the prefix.
4698
+
4699
+ Examples
4700
+ --------
4701
+ >>> df = spark.createDataFrame([("Spark SQL", "Spark",)], ["a", "b"])
4702
+ >>> df.select(startswith(df.a, df.b).alias('r')).collect()
4703
+ [Row(r=True)]
4704
+
4705
+ >>> df = spark.createDataFrame([("414243", "4142",)], ["e", "f"])
4706
+ >>> df = df.select(to_binary("e").alias("e"), to_binary("f").alias("f"))
4707
+ >>> df.printSchema()
4708
+ root
4709
+ |-- e: binary (nullable = true)
4710
+ |-- f: binary (nullable = true)
4711
+ >>> df.select(startswith("e", "f"), startswith("f", "e")).show()
4712
+ +----------------+----------------+
4713
+ |startswith(e, f)|startswith(f, e)|
4714
+ +----------------+----------------+
4715
+ | true| false|
4716
+ +----------------+----------------+
4717
+ """
4718
+ return Column.invoke_expression_over_column(str, expression.StartsWith, expression=prefix)
4719
+
4720
+
4721
+ @meta(unsupported_engines="*")
4722
+ def std(col: ColumnOrName) -> Column:
4723
+ """
4724
+ Aggregate function: alias for stddev_samp.
4725
+
4726
+ .. versionadded:: 3.5.0
4727
+
4728
+ Parameters
4729
+ ----------
4730
+ col : :class:`~pyspark.sql.Column` or str
4731
+ target column to compute on.
4732
+
4733
+ Returns
4734
+ -------
4735
+ :class:`~pyspark.sql.Column`
4736
+ standard deviation of given column.
4737
+
4738
+ Examples
4739
+ --------
4740
+ >>> import pyspark.sql.functions as sf
4741
+ >>> spark.range(6).select(sf.std("id")).show()
4742
+ +------------------+
4743
+ | std(id)|
4744
+ +------------------+
4745
+ |1.8708286933869...|
4746
+ +------------------+
4747
+ """
4748
+ return Column.invoke_anonymous_function(col, "std")
4749
+
4750
+
4751
+ @meta(unsupported_engines="*")
4752
+ def str_to_map(
4753
+ text: ColumnOrName,
4754
+ pairDelim: t.Optional[ColumnOrName] = None,
4755
+ keyValueDelim: t.Optional[ColumnOrName] = None,
4756
+ ) -> Column:
4757
+ """
4758
+ Creates a map after splitting the text into key/value pairs using delimiters.
4759
+ Both `pairDelim` and `keyValueDelim` are treated as regular expressions.
4760
+
4761
+ .. versionadded:: 3.5.0
4762
+
4763
+ Parameters
4764
+ ----------
4765
+ text : :class:`~pyspark.sql.Column` or str
4766
+ Input column or strings.
4767
+ pairDelim : :class:`~pyspark.sql.Column` or str, optional
4768
+ delimiter to use to split pair.
4769
+ keyValueDelim : :class:`~pyspark.sql.Column` or str, optional
4770
+ delimiter to use to split key/value.
4771
+
4772
+ Examples
4773
+ --------
4774
+ >>> df = spark.createDataFrame([("a:1,b:2,c:3",)], ["e"])
4775
+ >>> df.select(str_to_map(df.e, lit(","), lit(":")).alias('r')).collect()
4776
+ [Row(r={'a': '1', 'b': '2', 'c': '3'})]
4777
+
4778
+ >>> df = spark.createDataFrame([("a:1,b:2,c:3",)], ["e"])
4779
+ >>> df.select(str_to_map(df.e, lit(",")).alias('r')).collect()
4780
+ [Row(r={'a': '1', 'b': '2', 'c': '3'})]
4781
+
4782
+ >>> df = spark.createDataFrame([("a:1,b:2,c:3",)], ["e"])
4783
+ >>> df.select(str_to_map(df.e).alias('r')).collect()
4784
+ [Row(r={'a': '1', 'b': '2', 'c': '3'})]
4785
+ """
4786
+ if pairDelim is None:
4787
+ pairDelim = lit(",")
4788
+ if keyValueDelim is None:
4789
+ keyValueDelim = lit(":")
4790
+ return Column.invoke_expression_over_column(
4791
+ text, expression.StrToMap, pair_delim=pairDelim, key_value_delim=keyValueDelim
4792
+ )
4793
+
4794
+
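Since both delimiters are treated as regular expressions, the example rows above can be reproduced in plain Python with `re.split` (illustrative sketch):

import re

def str_to_map_py(text: str, pair_delim: str = ",", kv_delim: str = ":") -> dict:
    return dict(
        re.split(kv_delim, pair, maxsplit=1)
        for pair in re.split(pair_delim, text)
    )

assert str_to_map_py("a:1,b:2,c:3") == {"a": "1", "b": "2", "c": "3"}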
4795
+ @meta(unsupported_engines="*")
4796
+ def substr(str: ColumnOrName, pos: ColumnOrName, len: t.Optional[ColumnOrName] = None) -> Column:
4797
+ """
4798
+ Returns the substring of `str` that starts at `pos` and is of length `len`,
4799
+ or the slice of byte array that starts at `pos` and is of length `len`.
4800
+
4801
+ .. versionadded:: 3.5.0
4802
+
4803
+ Parameters
4804
+ ----------
4805
+ str : :class:`~pyspark.sql.Column` or str
4806
+ A column of string.
4807
+ pos : :class:`~pyspark.sql.Column` or str
4808
+ A column of string, the substring of `str` that starts at `pos`.
4809
+ len : :class:`~pyspark.sql.Column` or str, optional
4810
+ A column of string, the substring of `str` is of length `len`.
4811
+
4812
+ Examples
4813
+ --------
4814
+ >>> import pyspark.sql.functions as sf
4815
+ >>> spark.createDataFrame(
4816
+ ... [("Spark SQL", 5, 1,)], ["a", "b", "c"]
4817
+ ... ).select(sf.substr("a", "b", "c")).show()
4818
+ +---------------+
4819
+ |substr(a, b, c)|
4820
+ +---------------+
4821
+ | k|
4822
+ +---------------+
4823
+
4824
+ >>> import pyspark.sql.functions as sf
4825
+ >>> spark.createDataFrame(
4826
+ ... [("Spark SQL", 5, 1,)], ["a", "b", "c"]
4827
+ ... ).select(sf.substr("a", "b")).show()
4828
+ +------------------------+
4829
+ |substr(a, b, 2147483647)|
4830
+ +------------------------+
4831
+ | k SQL|
4832
+ +------------------------+
4833
+ """
4834
+ if len is not None:
4835
+ return Column.invoke_anonymous_function(str, "substr", pos, len)
4836
+ else:
4837
+ return Column.invoke_anonymous_function(str, "substr", pos)
4838
+
4839
+
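The 1-based slicing in the examples above, as a plain-Python sketch:

import typing as t

def substr_py(s: str, pos: int, length: t.Optional[int] = None) -> str:
    start = pos - 1
    return s[start:] if length is None else s[start:start + length]

assert substr_py("Spark SQL", 5, 1) == "k"
assert substr_py("Spark SQL", 5) == "k SQL"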
4840
+ @meta(unsupported_engines="*")
4841
+ def timestamp_micros(col: ColumnOrName) -> Column:
4842
+ """
4843
+ Creates timestamp from the number of microseconds since UTC epoch.
4844
+
4845
+ .. versionadded:: 3.5.0
4846
+
4847
+ Parameters
4848
+ ----------
4849
+ col : :class:`~pyspark.sql.Column` or str
4850
+ unix time values.
4851
+
4852
+ Returns
4853
+ -------
4854
+ :class:`~pyspark.sql.Column`
4855
+ converted timestamp value.
4856
+
4857
+ Examples
4858
+ --------
4859
+ >>> spark.conf.set("spark.sql.session.timeZone", "UTC")
4860
+ >>> time_df = spark.createDataFrame([(1230219000,)], ['unix_time'])
4861
+ >>> time_df.select(timestamp_micros(time_df.unix_time).alias('ts')).show()
4862
+ +--------------------+
4863
+ | ts|
4864
+ +--------------------+
4865
+ |1970-01-01 00:20:...|
4866
+ +--------------------+
4867
+ >>> time_df.select(timestamp_micros('unix_time').alias('ts')).printSchema()
4868
+ root
4869
+ |-- ts: timestamp (nullable = true)
4870
+ >>> spark.conf.unset("spark.sql.session.timeZone")
4871
+ """
4872
+ return Column.invoke_anonymous_function(col, "timestamp_micros")
4873
+
4874
+
4875
+ @meta(unsupported_engines="*")
4876
+ def timestamp_millis(col: ColumnOrName) -> Column:
4877
+ """
4878
+ Creates timestamp from the number of milliseconds since UTC epoch.
4879
+
4880
+ .. versionadded:: 3.5.0
4881
+
4882
+ Parameters
4883
+ ----------
4884
+ col : :class:`~pyspark.sql.Column` or str
4885
+ unix time values.
4886
+
4887
+ Returns
4888
+ -------
4889
+ :class:`~pyspark.sql.Column`
4890
+ converted timestamp value.
4891
+
4892
+ Examples
4893
+ --------
4894
+ >>> spark.conf.set("spark.sql.session.timeZone", "UTC")
4895
+ >>> time_df = spark.createDataFrame([(1230219000,)], ['unix_time'])
4896
+ >>> time_df.select(timestamp_millis(time_df.unix_time).alias('ts')).show()
4897
+ +-------------------+
4898
+ | ts|
4899
+ +-------------------+
4900
+ |1970-01-15 05:43:39|
4901
+ +-------------------+
4902
+ >>> time_df.select(timestamp_millis('unix_time').alias('ts')).printSchema()
4903
+ root
4904
+ |-- ts: timestamp (nullable = true)
4905
+ >>> spark.conf.unset("spark.sql.session.timeZone")
4906
+ """
4907
+ return Column.invoke_anonymous_function(col, "timestamp_millis")
4908
+
4909
+
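The epoch arithmetic in the two examples above can be checked with the standard library (a sanity check of the values only, unrelated to the generated SQL):

from datetime import datetime, timezone

# 1230219000 microseconds and milliseconds since the UTC epoch:
micros = datetime.fromtimestamp(1230219000 / 1_000_000, tz=timezone.utc)
millis = datetime.fromtimestamp(1230219000 / 1_000, tz=timezone.utc)

assert micros.strftime("%Y-%m-%d %H:%M:%S") == "1970-01-01 00:20:30"
assert millis.strftime("%Y-%m-%d %H:%M:%S") == "1970-01-15 05:43:39"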
4910
+ @meta(unsupported_engines="*")
4911
+ def to_char(col: ColumnOrName, format: ColumnOrName) -> Column:
4912
+ """
4913
+ Convert `col` to a string based on the `format`.
4914
+ Throws an exception if the conversion fails. The format can consist of the following
4915
+ characters, case insensitive:
4916
+ '0' or '9': Specifies an expected digit between 0 and 9. A sequence of 0 or 9 in the
4917
+ format string matches a sequence of digits in the input value, generating a result
4918
+ string of the same length as the corresponding sequence in the format string.
4919
+ The result string is left-padded with zeros if the 0/9 sequence comprises more digits
4920
+ than the matching part of the decimal value, starts with 0, and is before the decimal
4921
+ point. Otherwise, it is padded with spaces.
4922
+ '.' or 'D': Specifies the position of the decimal point (optional, only allowed once).
4923
+ ',' or 'G': Specifies the position of the grouping (thousands) separator (,).
4924
+ There must be a 0 or 9 to the left and right of each grouping separator.
4925
+ '$': Specifies the location of the $ currency sign. This character may only be specified once.
4926
+ 'S' or 'MI': Specifies the position of a '-' or '+' sign (optional, only allowed once at
4927
+ the beginning or end of the format string). Note that 'S' prints '+' for positive
4928
+ values but 'MI' prints a space.
4929
+ 'PR': Only allowed at the end of the format string; specifies that the result string
4930
+ will be wrapped by angle brackets if the input value is negative.
4931
+
4932
+ .. versionadded:: 3.5.0
4933
+
4934
+ Parameters
4935
+ ----------
4936
+ col : :class:`~pyspark.sql.Column` or str
4937
+ Input column or strings.
4938
+ format : :class:`~pyspark.sql.Column` or str, optional
4939
+ format to use to convert char values.
4940
+
4941
+ Examples
4942
+ --------
4943
+ >>> df = spark.createDataFrame([(78.12,)], ["e"])
4944
+ >>> df.select(to_char(df.e, lit("$99.99")).alias('r')).collect()
4945
+ [Row(r='$78.12')]
4946
+ """
4947
+ return Column.invoke_anonymous_function(col, "to_char", format)
4948
+
4949
+
4950
+ @meta(unsupported_engines=["bigquery", "duckdb"])
4951
+ def to_number(col: ColumnOrName, format: ColumnOrName) -> Column:
4952
+ """
4953
+ Convert string 'col' to a number based on the string format 'format'.
4954
+ Throws an exception if the conversion fails. The format can consist of the following
4955
+ characters, case insensitive:
4956
+ '0' or '9': Specifies an expected digit between 0 and 9. A sequence of 0 or 9 in the
4957
+ format string matches a sequence of digits in the input string. If the 0/9
4958
+ sequence starts with 0 and is before the decimal point, it can only match a digit
4959
+ sequence of the same size. Otherwise, if the sequence starts with 9 or is after
4960
+ the decimal point, it can match a digit sequence that has the same or smaller size.
4961
+ '.' or 'D': Specifies the position of the decimal point (optional, only allowed once).
4962
+ ',' or 'G': Specifies the position of the grouping (thousands) separator (,).
4963
+ There must be a 0 or 9 to the left and right of each grouping separator.
4964
+ 'col' must match the grouping separator relevant for the size of the number.
4965
+ '$': Specifies the location of the $ currency sign. This character may only be
4966
+ specified once.
4967
+ 'S' or 'MI': Specifies the position of a '-' or '+' sign (optional, only allowed
4968
+ once at the beginning or end of the format string). Note that 'S' allows '-'
4969
+ but 'MI' does not.
4970
+ 'PR': Only allowed at the end of the format string; specifies that 'col' indicates a
4971
+ negative number with wrapping angled brackets.
4972
+
4973
+ .. versionadded:: 3.5.0
4974
+
4975
+ Parameters
4976
+ ----------
4977
+ col : :class:`~pyspark.sql.Column` or str
4978
+ Input column or strings.
4979
+ format : :class:`~pyspark.sql.Column` or str, optional
4980
+ format to use to convert number values.
4981
+
4982
+ Examples
4983
+ --------
4984
+ >>> df = spark.createDataFrame([("$78.12",)], ["e"])
4985
+ >>> df.select(to_number(df.e, lit("$99.99")).alias('r')).collect()
4986
+ [Row(r=Decimal('78.12'))]
4987
+ """
4988
+ return Column.invoke_expression_over_column(col, expression.ToNumber, format=format)
4989
+
4990
+
4991
+ def to_str(value: t.Any) -> t.Optional[str]:
4992
+ """
4993
+ A wrapper over str(), but converts bool values to lower case strings.
4994
+ If None is given, just returns None, instead of converting it to string "None".
4995
+ """
4996
+ if isinstance(value, bool):
4997
+ return str(value).lower()
4998
+ elif value is None:
4999
+ return value
5000
+ else:
5001
+ return str(value)
5002
+
5003
+
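`to_str` ships without a doctest, so here is a minimal sketch of its behavior, derived only from the definition above (plain Python, no Spark session needed):

    to_str(True)   # 'true'  -- bools are lower-cased
    to_str(None)   # None    -- passed through, not turned into the string 'None'
    to_str(3.14)   # '3.14'  -- everything else goes through str()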
5004
+ @meta(unsupported_engines="*")
5005
+ def to_timestamp_ltz(
5006
+ timestamp: ColumnOrName,
5007
+ format: t.Optional[ColumnOrName] = None,
5008
+ ) -> Column:
5009
+ """
5010
+ Parses the `timestamp` with the `format` to a timestamp with local time zone.
5011
+ Returns null with invalid input.
5012
+
5013
+ .. versionadded:: 3.5.0
5014
+
5015
+ Parameters
5016
+ ----------
5017
+ timestamp : :class:`~pyspark.sql.Column` or str
5018
+ Input column or strings.
5019
+ format : :class:`~pyspark.sql.Column` or str, optional
5020
+ format to use to convert type `TimestampType` timestamp values.
5021
+
5022
+ Examples
5023
+ --------
5024
+ >>> df = spark.createDataFrame([("2016-12-31",)], ["e"])
5025
+ >>> df.select(to_timestamp_ltz(df.e, lit("yyyy-MM-dd")).alias('r')).collect()
5026
+ ... # doctest: +SKIP
5027
+ [Row(r=datetime.datetime(2016, 12, 31, 0, 0))]
5028
+
5029
+ >>> df = spark.createDataFrame([("2016-12-31",)], ["e"])
5030
+ >>> df.select(to_timestamp_ltz(df.e).alias('r')).collect()
5031
+ ... # doctest: +SKIP
5032
+ [Row(r=datetime.datetime(2016, 12, 31, 0, 0))]
5033
+ """
5034
+ if format is not None:
5035
+ return Column.invoke_anonymous_function(timestamp, "to_timestamp_ltz", format)
5036
+ else:
5037
+ return Column.invoke_anonymous_function(timestamp, "to_timestamp_ltz")
5038
+
5039
+
5040
+ @meta(unsupported_engines="*")
5041
+ def to_timestamp_ntz(
5042
+ timestamp: ColumnOrName,
5043
+ format: t.Optional[ColumnOrName] = None,
5044
+ ) -> Column:
5045
+ """
5046
+ Parses the `timestamp` with the `format` to a timestamp without time zone.
5047
+ Returns null with invalid input.
5048
+
5049
+ .. versionadded:: 3.5.0
5050
+
5051
+ Parameters
5052
+ ----------
5053
+ timestamp : :class:`~pyspark.sql.Column` or str
5054
+ Input column or strings.
5055
+ format : :class:`~pyspark.sql.Column` or str, optional
5056
+ format to use to convert type `TimestampNTZType` timestamp values.
5057
+
5058
+ Examples
5059
+ --------
5060
+ >>> df = spark.createDataFrame([("2016-04-08",)], ["e"])
5061
+ >>> df.select(to_timestamp_ntz(df.e, lit("yyyy-MM-dd")).alias('r')).collect()
5062
+ ... # doctest: +SKIP
5063
+ [Row(r=datetime.datetime(2016, 4, 8, 0, 0))]
5064
+
5065
+ >>> df = spark.createDataFrame([("2016-04-08",)], ["e"])
5066
+ >>> df.select(to_timestamp_ntz(df.e).alias('r')).collect()
5067
+ ... # doctest: +SKIP
5068
+ [Row(r=datetime.datetime(2016, 4, 8, 0, 0))]
5069
+ """
5070
+ if format is not None:
5071
+ return Column.invoke_anonymous_function(timestamp, "to_timestamp_ntz", format)
5072
+ else:
5073
+ return Column.invoke_anonymous_function(timestamp, "to_timestamp_ntz")
5074
+
5075
+
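`to_timestamp_ltz` and `to_timestamp_ntz` above share the same shape: with a `format` the call becomes a two-argument anonymous function, without one it becomes a single-argument call and format handling is left to the engine. A minimal sketch (the column name `e` is illustrative, taken from the doctests above):

    to_timestamp_ntz(df.e, lit("yyyy-MM-dd"))  # anonymous two-argument call with an explicit format
    to_timestamp_ntz(df.e)                     # anonymous single-argument call; engine default format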
5076
+ @meta(unsupported_engines=["bigquery", "postgres", "snowflake"])
5077
+ def to_unix_timestamp(
5078
+ timestamp: ColumnOrName,
5079
+ format: t.Optional[ColumnOrName] = None,
5080
+ ) -> Column:
5081
+ """
5082
+ Returns the UNIX timestamp of the given time.
5083
+
5084
+ .. versionadded:: 3.5.0
5085
+
5086
+ Parameters
5087
+ ----------
5088
+ timestamp : :class:`~pyspark.sql.Column` or str
5089
+ Input column or strings.
5090
+ format : :class:`~pyspark.sql.Column` or str, optional
5091
+ format to use to convert UNIX timestamp values.
5092
+
5093
+ Examples
5094
+ --------
5095
+ >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
5096
+ >>> df = spark.createDataFrame([("2016-04-08",)], ["e"])
5097
+ >>> df.select(to_unix_timestamp(df.e, lit("yyyy-MM-dd")).alias('r')).collect()
5098
+ [Row(r=1460098800)]
5099
+ >>> spark.conf.unset("spark.sql.session.timeZone")
5100
+
5101
+ >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
5102
+ >>> df = spark.createDataFrame([("2016-04-08",)], ["e"])
5103
+ >>> df.select(to_unix_timestamp(df.e).alias('r')).collect()
5104
+ [Row(r=None)]
5105
+ >>> spark.conf.unset("spark.sql.session.timeZone")
5106
+ """
5107
+ if format is not None:
5108
+ return Column.invoke_expression_over_column(timestamp, expression.StrToUnix, format=format)
5109
+ else:
5110
+ return Column.invoke_expression_over_column(timestamp, expression.StrToUnix)
5111
+
5112
+
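Unlike most of the 3.5 additions in this file, `to_unix_timestamp` maps to sqlglot's `StrToUnix` expression rather than an anonymous function call, so dialects can render a native equivalent where one exists. A hedged sketch of both call forms, reusing the doctest column `e`:

    to_unix_timestamp(df.e, lit("yyyy-MM-dd"))  # StrToUnix with an explicit format
    to_unix_timestamp(df.e)                     # StrToUnix relying on the engine's default format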
5113
+ @meta(unsupported_engines="*")
5114
+ def to_varchar(col: ColumnOrName, format: ColumnOrName) -> Column:
5115
+ """
5116
+ Convert `col` to a string based on the `format`.
5117
+ Throws an exception if the conversion fails. The format can consist of the following
5118
+ characters, case insensitive:
5119
+ '0' or '9': Specifies an expected digit between 0 and 9. A sequence of 0 or 9 in the
5120
+ format string matches a sequence of digits in the input value, generating a result
5121
+ string of the same length as the corresponding sequence in the format string.
5122
+ The result string is left-padded with zeros if the 0/9 sequence comprises more digits
5123
+ than the matching part of the decimal value, starts with 0, and is before the decimal
5124
+ point. Otherwise, it is padded with spaces.
5125
+ '.' or 'D': Specifies the position of the decimal point (optional, only allowed once).
5126
+ ',' or 'G': Specifies the position of the grouping (thousands) separator (,).
5127
+ There must be a 0 or 9 to the left and right of each grouping separator.
5128
+ '$': Specifies the location of the $ currency sign. This character may only be specified once.
5129
+ 'S' or 'MI': Specifies the position of a '-' or '+' sign (optional, only allowed once at
5130
+ the beginning or end of the format string). Note that 'S' prints '+' for positive
5131
+ values but 'MI' prints a space.
5132
+ 'PR': Only allowed at the end of the format string; specifies that the result string
5133
+ will be wrapped by angle brackets if the input value is negative.
5134
+
5135
+ .. versionadded:: 3.5.0
5136
+
5137
+ Parameters
5138
+ ----------
5139
+ col : :class:`~pyspark.sql.Column` or str
5140
+ Input column or strings.
5141
+ format : :class:`~pyspark.sql.Column` or str, optional
5142
+ format to use to convert char values.
5143
+
5144
+ Examples
5145
+ --------
5146
+ >>> df = spark.createDataFrame([(78.12,)], ["e"])
5147
+ >>> df.select(to_varchar(df.e, lit("$99.99")).alias('r')).collect()
5148
+ [Row(r='$78.12')]
5149
+ """
5150
+ return Column.invoke_anonymous_function(col, "to_varchar", format)
5151
+
5152
+
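As defined above, `to_char` and `to_varchar` take the same arguments and accept the same format string; they differ only in the SQL function name passed to the anonymous dispatch. A small sketch (the `df.e` column is the one from the doctests above):

    to_char(df.e, lit("$99.99"))     # dispatched as an anonymous TO_CHAR call
    to_varchar(df.e, lit("$99.99"))  # dispatched as an anonymous TO_VARCHAR call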
5153
+ @meta(unsupported_engines="*")
5154
+ def try_aes_decrypt(
5155
+ input: ColumnOrName,
5156
+ key: ColumnOrName,
5157
+ mode: t.Optional[ColumnOrName] = None,
5158
+ padding: t.Optional[ColumnOrName] = None,
5159
+ aad: t.Optional[ColumnOrName] = None,
5160
+ ) -> Column:
5161
+ """
5162
+ This is a special version of `aes_decrypt` that performs the same operation,
5163
+ but returns a NULL value instead of raising an error if the decryption cannot be performed.
5164
+ Returns a decrypted value of `input` using AES in `mode` with `padding`. Key lengths of 16,
5165
+ 24 and 32 bytes are supported. Supported combinations of (`mode`, `padding`) are ('ECB',
5166
+ 'PKCS'), ('GCM', 'NONE') and ('CBC', 'PKCS'). Optional additional authenticated data (AAD) is
5167
+ only supported for GCM. If provided for encryption, the identical AAD value must be provided
5168
+ for decryption. The default mode is GCM.
5169
+
5170
+ .. versionadded:: 3.5.0
5171
+
5172
+ Parameters
5173
+ ----------
5174
+ input : :class:`~pyspark.sql.Column` or str
5175
+ The binary value to decrypt.
5176
+ key : :class:`~pyspark.sql.Column` or str
5177
+ The passphrase to use to decrypt the data.
5178
+ mode : :class:`~pyspark.sql.Column` or str, optional
5179
+ Specifies which block cipher mode should be used to decrypt messages. Valid modes: ECB,
5180
+ GCM, CBC.
5181
+ padding : :class:`~pyspark.sql.Column` or str, optional
5182
+ Specifies how to pad messages whose length is not a multiple of the block size. Valid
5183
+ values: PKCS, NONE, DEFAULT. The DEFAULT padding means PKCS for ECB, NONE for GCM and PKCS
5184
+ for CBC.
5185
+ aad : :class:`~pyspark.sql.Column` or str, optional
5186
+ Optional additional authenticated data. Only supported for GCM mode. This can be any
5187
+ free-form input and must be provided for both encryption and decryption.
5188
+
5189
+ Examples
5190
+ --------
5191
+ >>> df = spark.createDataFrame([(
5192
+ ... "AAAAAAAAAAAAAAAAQiYi+sTLm7KD9UcZ2nlRdYDe/PX4",
5193
+ ... "abcdefghijklmnop12345678ABCDEFGH", "GCM", "DEFAULT",
5194
+ ... "This is an AAD mixed into the input",)],
5195
+ ... ["input", "key", "mode", "padding", "aad"]
5196
+ ... )
5197
+ >>> df.select(try_aes_decrypt(
5198
+ ... unbase64(df.input), df.key, df.mode, df.padding, df.aad).alias('r')
5199
+ ... ).collect()
5200
+ [Row(r=bytearray(b'Spark'))]
5201
+
5202
+ >>> df = spark.createDataFrame([(
5203
+ ... "AAAAAAAAAAAAAAAAAAAAAPSd4mWyMZ5mhvjiAPQJnfg=",
5204
+ ... "abcdefghijklmnop12345678ABCDEFGH", "CBC", "DEFAULT",)],
5205
+ ... ["input", "key", "mode", "padding"]
5206
+ ... )
5207
+ >>> df.select(try_aes_decrypt(
5208
+ ... unbase64(df.input), df.key, df.mode, df.padding).alias('r')
5209
+ ... ).collect()
5210
+ [Row(r=bytearray(b'Spark'))]
5211
+
5212
+ >>> df.select(try_aes_decrypt(unbase64(df.input), df.key, df.mode).alias('r')).collect()
5213
+ [Row(r=bytearray(b'Spark'))]
5214
+
5215
+ >>> df = spark.createDataFrame([(
5216
+ ... "83F16B2AA704794132802D248E6BFD4E380078182D1544813898AC97E709B28A94",
5217
+ ... "0000111122223333",)],
5218
+ ... ["input", "key"]
5219
+ ... )
5220
+ >>> df.select(try_aes_decrypt(unhex(df.input), df.key).alias('r')).collect()
5221
+ [Row(r=bytearray(b'Spark'))]
5222
+ """
5223
+ _mode = lit("GCM") if mode is None else mode
5224
+ _padding = lit("DEFAULT") if padding is None else padding
5225
+ _aad = lit("") if aad is None else aad
5226
+ return Column.invoke_anonymous_function(input, "try_aes_decrypt", key, _mode, _padding, _aad)
5227
+
5228
+
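Per the implementation above, omitted arguments are filled in with literals before dispatch, so a two-argument call is equivalent to spelling the defaults out. A hedged sketch, reusing the hex-encoded input from the last doctest:

    try_aes_decrypt(unhex(df.input), df.key)
    try_aes_decrypt(unhex(df.input), df.key, lit("GCM"), lit("DEFAULT"), lit(""))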
5229
+ @meta(unsupported_engines=["bigquery", "snowflake"])
5230
+ def try_element_at(col: ColumnOrName, extraction: ColumnOrName) -> Column:
5231
+ """
5232
+ (array, index) - Returns element of array at given (1-based) index. If the index is 0, Spark will
5233
+ throw an error. If index < 0, accesses elements from the last to the first. The function
5234
+ always returns NULL if the index exceeds the length of the array.
5235
+
5236
+ (map, key) - Returns value for given key. The function always returns NULL if the key is not
5237
+ contained in the map.
5238
+
5239
+ .. versionadded:: 3.5.0
5240
+
5241
+ Parameters
5242
+ ----------
5243
+ col : :class:`~pyspark.sql.Column` or str
5244
+ name of column containing array or map
5245
+ extraction :
5246
+ index to check for in array or key to check for in map
5247
+
5248
+ Examples
5249
+ --------
5250
+ >>> df = spark.createDataFrame([(["a", "b", "c"],)], ['data'])
5251
+ >>> df.select(try_element_at(df.data, lit(1)).alias('r')).collect()
5252
+ [Row(r='a')]
5253
+ >>> df.select(try_element_at(df.data, lit(-1)).alias('r')).collect()
5254
+ [Row(r='c')]
5255
+
5256
+ >>> df = spark.createDataFrame([({"a": 1.0, "b": 2.0},)], ['data'])
5257
+ >>> df.select(try_element_at(df.data, lit("a")).alias('r')).collect()
5258
+ [Row(r=1.0)]
5259
+ """
5260
+ return Column(
5261
+ expression.Bracket(
5262
+ this=Column.ensure_col(col).expression,
5263
+ expressions=[Column.ensure_col(extraction).expression],
5264
+ safe=True,
5265
+ )
5266
+ )
5267
+
5268
+
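`try_element_at` is not dispatched as an anonymous function; it builds a subscript (`Bracket`) expression with `safe=True`, which is what makes out-of-range indexes and missing map keys come back as NULL instead of raising. A hedged sketch extending the doctest data above:

    df.select(try_element_at(df.data, lit(4)).alias('r')).collect()
    # expected: [Row(r=None)] for the 3-element array, since safe indexing suppresses the error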
5269
+ @meta(unsupported_engines="*")
5270
+ def try_to_timestamp(col: ColumnOrName, format: t.Optional[ColumnOrName] = None) -> Column:
5271
+ """
5272
+ Parses the `col` with the `format` to a timestamp. The function always
5273
+ returns null on invalid input, whether or not ANSI SQL mode is enabled. The result data type is
5274
+ consistent with the value of configuration `spark.sql.timestampType`.
5275
+
5276
+ .. versionadded:: 3.5.0
5277
+
5278
+ Parameters
5279
+ ----------
5280
+ col : :class:`~pyspark.sql.Column` or str
5281
+ column values to convert.
5282
+ format: str, optional
5283
+ format to use to convert timestamp values.
5284
+
5285
+ Examples
5286
+ --------
5287
+ >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
5288
+ >>> df.select(try_to_timestamp(df.t).alias('dt')).collect()
5289
+ [Row(dt=datetime.datetime(1997, 2, 28, 10, 30))]
5290
+
5291
+ >>> df.select(try_to_timestamp(df.t, lit('yyyy-MM-dd HH:mm:ss')).alias('dt')).collect()
5292
+ [Row(dt=datetime.datetime(1997, 2, 28, 10, 30))]
5293
+ """
5294
+ if format is not None:
5295
+ return Column.invoke_anonymous_function(col, "try_to_timestamp", format)
5296
+ else:
5297
+ return Column.invoke_anonymous_function(col, "try_to_timestamp")
5298
+
5299
+
5300
+ @meta()
5301
+ def ucase(str: ColumnOrName) -> Column:
5302
+ """
5303
+ Returns `str` with all characters changed to uppercase.
5304
+
5305
+ .. versionadded:: 3.5.0
5306
+
5307
+ Parameters
5308
+ ----------
5309
+ str : :class:`~pyspark.sql.Column` or str
5310
+ Input column or strings.
5311
+
5312
+ Examples
5313
+ --------
5314
+ >>> import pyspark.sql.functions as sf
5315
+ >>> spark.range(1).select(sf.ucase(sf.lit("Spark"))).show()
5316
+ +------------+
5317
+ |ucase(Spark)|
5318
+ +------------+
5319
+ | SPARK|
5320
+ +------------+
5321
+ """
5322
+ return Column.invoke_expression_over_column(str, expression.Upper)
5323
+
5324
+
5325
+ @meta()
5326
+ def unix_date(col: ColumnOrName) -> Column:
5327
+ """Returns the number of days since 1970-01-01.
5328
+
5329
+ .. versionadded:: 3.5.0
5330
+
5331
+ Examples
5332
+ --------
5333
+ >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
5334
+ >>> df = spark.createDataFrame([('1970-01-02',)], ['t'])
5335
+ >>> df.select(unix_date(to_date(df.t)).alias('n')).collect()
5336
+ [Row(n=1)]
5337
+ >>> spark.conf.unset("spark.sql.session.timeZone")
5338
+ """
5339
+ return Column.invoke_expression_over_column(col, expression.UnixDate)
5340
+
5341
+
5342
+ @meta(unsupported_engines="*")
5343
+ def unix_micros(col: ColumnOrName) -> Column:
5344
+ """Returns the number of microseconds since 1970-01-01 00:00:00 UTC.
5345
+
5346
+ .. versionadded:: 3.5.0
5347
+
5348
+ Examples
5349
+ --------
5350
+ >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
5351
+ >>> df = spark.createDataFrame([('2015-07-22 10:00:00',)], ['t'])
5352
+ >>> df.select(unix_micros(to_timestamp(df.t)).alias('n')).collect()
5353
+ [Row(n=1437584400000000)]
5354
+ >>> spark.conf.unset("spark.sql.session.timeZone")
5355
+ """
5356
+ return Column.invoke_anonymous_function(col, "unix_micros")
5357
+
5358
+
5359
+ @meta(unsupported_engines="*")
5360
+ def unix_millis(col: ColumnOrName) -> Column:
5361
+ """Returns the number of milliseconds since 1970-01-01 00:00:00 UTC.
5362
+ Truncates higher levels of precision.
5363
+
5364
+ .. versionadded:: 3.5.0
5365
+
5366
+ Examples
5367
+ --------
5368
+ >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
5369
+ >>> df = spark.createDataFrame([('2015-07-22 10:00:00',)], ['t'])
5370
+ >>> df.select(unix_millis(to_timestamp(df.t)).alias('n')).collect()
5371
+ [Row(n=1437584400000)]
5372
+ >>> spark.conf.unset("spark.sql.session.timeZone")
5373
+ """
5374
+ return Column.invoke_anonymous_function(col, "unix_millis")
5375
+
5376
+
5377
+ @meta(unsupported_engines="*")
5378
+ def unix_seconds(col: ColumnOrName) -> Column:
5379
+ """Returns the number of seconds since 1970-01-01 00:00:00 UTC.
5380
+ Truncates higher levels of precision.
5381
+
5382
+ .. versionadded:: 3.5.0
5383
+
5384
+ Examples
5385
+ --------
5386
+ >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
5387
+ >>> df = spark.createDataFrame([('2015-07-22 10:00:00',)], ['t'])
5388
+ >>> df.select(unix_seconds(to_timestamp(df.t)).alias('n')).collect()
5389
+ [Row(n=1437584400)]
5390
+ >>> spark.conf.unset("spark.sql.session.timeZone")
5391
+ """
5392
+ return Column.invoke_anonymous_function(col, "unix_seconds")
5393
+
5394
+
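The four epoch helpers above differ only in granularity; `unix_date` goes through the typed `expression.UnixDate`, while the sub-day variants are dispatched as anonymous calls. A short sketch using the doctest column `t`:

    unix_date(to_date(df.t))          # days since 1970-01-01
    unix_seconds(to_timestamp(df.t))  # whole seconds since the epoch
    unix_millis(to_timestamp(df.t))   # milliseconds, truncating finer precision
    unix_micros(to_timestamp(df.t))   # microseconds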
5395
+ @meta(unsupported_engines="*")
5396
+ def url_decode(str: ColumnOrName) -> Column:
5397
+ """
5398
+ Decodes a `str` in 'application/x-www-form-urlencoded' format
5399
+ using a specific encoding scheme.
5400
+
5401
+ .. versionadded:: 3.5.0
5402
+
5403
+ Parameters
5404
+ ----------
5405
+ str : :class:`~pyspark.sql.Column` or str
5406
+ A column of string to decode.
5407
+
5408
+ Examples
5409
+ --------
5410
+ >>> df = spark.createDataFrame([("https%3A%2F%2Fspark.apache.org",)], ["a"])
5411
+ >>> df.select(url_decode(df.a).alias('r')).collect()
5412
+ [Row(r='https://spark.apache.org')]
5413
+ """
5414
+ return Column.invoke_anonymous_function(str, "url_decode")
5415
+
5416
+
5417
+ @meta(unsupported_engines="*")
5418
+ def url_encode(str: ColumnOrName) -> Column:
5419
+ """
5420
+ Translates a string into 'application/x-www-form-urlencoded' format
5421
+ using a specific encoding scheme.
5422
+
5423
+ .. versionadded:: 3.5.0
5424
+
5425
+ Parameters
5426
+ ----------
5427
+ str : :class:`~pyspark.sql.Column` or str
5428
+ A column of string to encode.
5429
+
5430
+ Examples
5431
+ --------
5432
+ >>> df = spark.createDataFrame([("https://spark.apache.org",)], ["a"])
5433
+ >>> df.select(url_encode(df.a).alias('r')).collect()
5434
+ [Row(r='https%3A%2F%2Fspark.apache.org')]
5435
+ """
5436
+ return Column.invoke_anonymous_function(str, "url_encode")
5437
+
5438
+
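Since `url_encode` and `url_decode` above translate to and from the same 'application/x-www-form-urlencoded' representation, a round trip should leave well-formed input unchanged; a tiny, hedged sketch against the doctest column `a`:

    url_decode(url_encode(df.a))  # expected to return the original value of df.a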
5439
+ user = current_user
5440
+
5441
+
5442
+ @meta(unsupported_engines="*")
5443
+ def version() -> Column:
5444
+ """
5445
+ Returns the Spark version. The string contains 2 fields, the first being a release version
5446
+ and the second being a git revision.
5447
+
5448
+ .. versionadded:: 3.5.0
5449
+
5450
+ Examples
5451
+ --------
5452
+ >>> df = spark.range(1)
5453
+ >>> df.select(version()).show(truncate=False) # doctest: +SKIP
5454
+ +----------------------------------------------+
5455
+ |version() |
5456
+ +----------------------------------------------+
5457
+ |3.5.0 cafbea5b13623276517a9d716f75745eff91f616|
5458
+ +----------------------------------------------+
5459
+ """
5460
+ return Column.invoke_anonymous_function(None, "version")
5461
+
5462
+
5463
+ @meta(unsupported_engines="*")
5464
+ def weekday(col: ColumnOrName) -> Column:
5465
+ """
5466
+ Returns the day of the week for date/timestamp (0 = Monday, 1 = Tuesday, ..., 6 = Sunday).
5467
+
5468
+ .. versionadded:: 3.5.0
5469
+
5470
+ Parameters
5471
+ ----------
5472
+ col : :class:`~pyspark.sql.Column` or str
5473
+ target date/timestamp column to work on.
5474
+
5475
+ Returns
5476
+ -------
5477
+ :class:`~pyspark.sql.Column`
5478
+ the day of the week for date/timestamp (0 = Monday, 1 = Tuesday, ..., 6 = Sunday).
5479
+
5480
+ Examples
5481
+ --------
5482
+ >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
5483
+ >>> df.select(weekday('dt').alias('day')).show()
5484
+ +---+
5485
+ |day|
5486
+ +---+
5487
+ | 2|
5488
+ +---+
5489
+ """
5490
+ return Column.invoke_anonymous_function(col, "weekday")
5491
+
5492
+
5493
+ @meta(unsupported_engines="*")
5494
+ def width_bucket(
5495
+ v: ColumnOrName,
5496
+ min: ColumnOrName,
5497
+ max: ColumnOrName,
5498
+ numBucket: t.Union[ColumnOrName, int],
5499
+ ) -> Column:
5500
+ """
5501
+ Returns the bucket number into which the value of this expression would fall
5502
+ after being evaluated. Note that input arguments must follow conditions listed below;
5503
+ otherwise, the method will return null.
5504
+
5505
+ .. versionadded:: 3.5.0
5506
+
5507
+ Parameters
5508
+ ----------
5509
+ v : str or :class:`~pyspark.sql.Column`
5510
+ value to compute a bucket number in the histogram
5511
+ min : str or :class:`~pyspark.sql.Column`
5512
+ minimum value of the histogram
5513
+ max : str or :class:`~pyspark.sql.Column`
5514
+ maximum value of the histogram
5515
+ numBucket : str, :class:`~pyspark.sql.Column` or int
5516
+ the number of buckets
5517
+
5518
+ Returns
5519
+ -------
5520
+ :class:`~pyspark.sql.Column`
5521
+ the bucket number into which the value would fall after being evaluated
5522
+
5523
+ Examples
5524
+ --------
5525
+ >>> df = spark.createDataFrame([
5526
+ ... (5.3, 0.2, 10.6, 5),
5527
+ ... (-2.1, 1.3, 3.4, 3),
5528
+ ... (8.1, 0.0, 5.7, 4),
5529
+ ... (-0.9, 5.2, 0.5, 2)],
5530
+ ... ['v', 'min', 'max', 'n'])
5531
+ >>> df.select(width_bucket('v', 'min', 'max', 'n')).show()
5532
+ +----------------------------+
5533
+ |width_bucket(v, min, max, n)|
5534
+ +----------------------------+
5535
+ | 3|
5536
+ | 0|
5537
+ | 5|
5538
+ | 3|
5539
+ +----------------------------+
5540
+ """
5541
+ numBucket = lit(numBucket) if isinstance(numBucket, int) else numBucket
5542
+ return Column.invoke_anonymous_function(v, "width_bucket", min, max, numBucket)
5543
+
5544
+
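Note that `width_bucket` above wraps an integer bucket count in `lit()` itself, so the following two calls build the same expression:

    width_bucket('v', 'min', 'max', 5)
    width_bucket('v', 'min', 'max', lit(5))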
5545
+ @meta(unsupported_engines=["*", "spark"])
5546
+ def window_time(
5547
+ windowColumn: ColumnOrName,
5548
+ ) -> Column:
5549
+ """Computes the event time from a window column. The column window values are produced
5550
+ by window aggregating operators and are of type `STRUCT<start: TIMESTAMP, end: TIMESTAMP>`
5551
+ where start is inclusive and end is exclusive. The event time of records produced by window
5552
+ aggregating operators can be computed as ``window_time(window)``, which is equivalent to
5553
+ ``window.end - lit(1).alias("microsecond")`` (as microsecond is the minimal supported event
5554
+ time precision). The window column must be one produced by a window aggregating operator.
5555
+
5556
+ .. versionadded:: 3.4.0
5557
+
5558
+ Parameters
5559
+ ----------
5560
+ windowColumn : :class:`~pyspark.sql.Column`
5561
+ The window column of a window aggregate records.
5562
+
5563
+ Returns
5564
+ -------
5565
+ :class:`~pyspark.sql.Column`
5566
+ the column for computed results.
5567
+
5568
+ Notes
5569
+ -----
5570
+ Supports Spark Connect.
5571
+
5572
+ Examples
5573
+ --------
5574
+ >>> import datetime
5575
+ >>> df = spark.createDataFrame(
5576
+ ... [(datetime.datetime(2016, 3, 11, 9, 0, 7), 1)],
5577
+ ... ).toDF("date", "val")
5578
+
5579
+ Group the data into 5 second time windows and aggregate as sum.
5580
+
5581
+ >>> w = df.groupBy(window("date", "5 seconds")).agg(sum("val").alias("sum"))
5582
+
5583
+ Extract the window event time using the window_time function.
5584
+
5585
+ >>> w.select(
5586
+ ... w.window.end.cast("string").alias("end"),
5587
+ ... window_time(w.window).cast("string").alias("window_time"),
5588
+ ... "sum"
5589
+ ... ).collect()
5590
+ [Row(end='2016-03-11 09:00:10', window_time='2016-03-11 09:00:09.999999', sum=1)]
5591
+ """
5592
+ return Column.invoke_anonymous_function(windowColumn, "window_time")
5593
+
5594
+
5595
+ @meta(unsupported_engines="*")
5596
+ def xpath(xml: ColumnOrName, path: ColumnOrName) -> Column:
5597
+ """
5598
+ Returns a string array of values within the nodes of xml that match the XPath expression.
5599
+
5600
+ .. versionadded:: 3.5.0
5601
+
5602
+ Examples
5603
+ --------
5604
+ >>> df = spark.createDataFrame(
5605
+ ... [('<a><b>b1</b><b>b2</b><b>b3</b><c>c1</c><c>c2</c></a>',)], ['x'])
5606
+ >>> df.select(xpath(df.x, lit('a/b/text()')).alias('r')).collect()
5607
+ [Row(r=['b1', 'b2', 'b3'])]
5608
+ """
5609
+ return Column.invoke_anonymous_function(xml, "xpath", path)
5610
+
5611
+
5612
+ @meta(unsupported_engines="*")
5613
+ def xpath_boolean(xml: ColumnOrName, path: ColumnOrName) -> Column:
5614
+ """
5615
+ Returns true if the XPath expression evaluates to true, or if a matching node is found.
5616
+
5617
+ .. versionadded:: 3.5.0
5618
+
5619
+ Examples
5620
+ --------
5621
+ >>> df = spark.createDataFrame([('<a><b>1</b></a>',)], ['x'])
5622
+ >>> df.select(xpath_boolean(df.x, lit('a/b')).alias('r')).collect()
5623
+ [Row(r=True)]
5624
+ """
5625
+ return Column.invoke_anonymous_function(xml, "xpath_boolean", path)
5626
+
5627
+
5628
+ @meta(unsupported_engines="*")
5629
+ def xpath_double(xml: ColumnOrName, path: ColumnOrName) -> Column:
5630
+ """
5631
+ Returns a double value, the value zero if no match is found,
5632
+ or NaN if a match is found but the value is non-numeric.
5633
+
5634
+ .. versionadded:: 3.5.0
5635
+
5636
+ Examples
5637
+ --------
5638
+ >>> df = spark.createDataFrame([('<a><b>1</b><b>2</b></a>',)], ['x'])
5639
+ >>> df.select(xpath_double(df.x, lit('sum(a/b)')).alias('r')).collect()
5640
+ [Row(r=3.0)]
5641
+ """
5642
+ return Column.invoke_anonymous_function(xml, "xpath_double", path)
5643
+
5644
+
5645
+ @meta(unsupported_engines="*")
5646
+ def xpath_float(xml: ColumnOrName, path: ColumnOrName) -> Column:
5647
+ """
5648
+ Returns a float value, the value zero if no match is found,
5649
+ or NaN if a match is found but the value is non-numeric.
5650
+
5651
+ .. versionadded:: 3.5.0
5652
+
5653
+ Examples
5654
+ --------
5655
+ >>> df = spark.createDataFrame([('<a><b>1</b><b>2</b></a>',)], ['x'])
5656
+ >>> df.select(xpath_float(df.x, lit('sum(a/b)')).alias('r')).collect()
5657
+ [Row(r=3.0)]
5658
+ """
5659
+ return Column.invoke_anonymous_function(xml, "xpath_float", path)
5660
+
5661
+
5662
+ @meta(unsupported_engines="*")
5663
+ def xpath_int(xml: ColumnOrName, path: ColumnOrName) -> Column:
5664
+ """
5665
+ Returns an integer value, or the value zero if no match is found,
5666
+ or if a match is found but the value is non-numeric.
5667
+
5668
+ .. versionadded:: 3.5.0
5669
+
5670
+ Examples
5671
+ --------
5672
+ >>> df = spark.createDataFrame([('<a><b>1</b><b>2</b></a>',)], ['x'])
5673
+ >>> df.select(xpath_int(df.x, lit('sum(a/b)')).alias('r')).collect()
5674
+ [Row(r=3)]
5675
+ """
5676
+ return Column.invoke_anonymous_function(xml, "xpath_int", path)
5677
+
5678
+
5679
+ @meta(unsupported_engines="*")
5680
+ def xpath_long(xml: ColumnOrName, path: ColumnOrName) -> Column:
5681
+ """
5682
+ Returns a long integer value, or the value zero if no match is found,
5683
+ or if a match is found but the value is non-numeric.
5684
+
5685
+ .. versionadded:: 3.5.0
5686
+
5687
+ Examples
5688
+ --------
5689
+ >>> df = spark.createDataFrame([('<a><b>1</b><b>2</b></a>',)], ['x'])
5690
+ >>> df.select(xpath_long(df.x, lit('sum(a/b)')).alias('r')).collect()
5691
+ [Row(r=3)]
5692
+ """
5693
+ return Column.invoke_anonymous_function(xml, "xpath_long", path)
5694
+
5695
+
5696
+ @meta(unsupported_engines="*")
5697
+ def xpath_number(xml: ColumnOrName, path: ColumnOrName) -> Column:
5698
+ """
5699
+ Returns a double value, the value zero if no match is found,
5700
+ or NaN if a match is found but the value is non-numeric.
5701
+
5702
+ .. versionadded:: 3.5.0
5703
+
5704
+ Examples
5705
+ --------
5706
+ >>> import pyspark.sql.functions as sf
5707
+ >>> spark.createDataFrame(
5708
+ ... [('<a><b>1</b><b>2</b></a>',)], ['x']
5709
+ ... ).select(sf.xpath_number('x', sf.lit('sum(a/b)'))).show()
5710
+ +-------------------------+
5711
+ |xpath_number(x, sum(a/b))|
5712
+ +-------------------------+
5713
+ | 3.0|
5714
+ +-------------------------+
5715
+ """
5716
+ return Column.invoke_anonymous_function(xml, "xpath_number", path)
5717
+
5718
+
5719
+ @meta(unsupported_engines="*")
5720
+ def xpath_short(xml: ColumnOrName, path: ColumnOrName) -> Column:
5721
+ """
5722
+ Returns a short integer value, or the value zero if no match is found,
5723
+ or if a match is found but the value is non-numeric.
5724
+
5725
+ .. versionadded:: 3.5.0
5726
+
5727
+ Examples
5728
+ --------
5729
+ >>> df = spark.createDataFrame([('<a><b>1</b><b>2</b></a>',)], ['x'])
5730
+ >>> df.select(xpath_short(df.x, lit('sum(a/b)')).alias('r')).collect()
5731
+ [Row(r=3)]
5732
+ """
5733
+ return Column.invoke_anonymous_function(xml, "xpath_short", path)
5734
+
5735
+
5736
+ @meta(unsupported_engines="*")
5737
+ def xpath_string(xml: ColumnOrName, path: ColumnOrName) -> Column:
5738
+ """
5739
+ Returns the text contents of the first xml node that matches the XPath expression.
5740
+
5741
+ .. versionadded:: 3.5.0
5742
+
5743
+ Examples
5744
+ --------
5745
+ >>> df = spark.createDataFrame([('<a><b>b</b><c>cc</c></a>',)], ['x'])
5746
+ >>> df.select(xpath_string(df.x, lit('a/c')).alias('r')).collect()
5747
+ [Row(r='cc')]
5748
+ """
5749
+ return Column.invoke_anonymous_function(xml, "xpath_string", path)
5750
+
5751
+
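The `xpath_*` helpers above all take `(xml, path)` and are dispatched as anonymous calls; per their docstrings, they differ only in the return type of the underlying SQL function. A condensed sketch against the doctest document `df.x`:

    xpath(df.x, lit('a/b/text()'))       # array of strings
    xpath_string(df.x, lit('a/c'))       # text of the first matching node
    xpath_double(df.x, lit('sum(a/b)'))  # double
    xpath_long(df.x, lit('sum(a/b)'))    # long integer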
5752
+ @meta(unsupported_engines="*")
5753
+ def years(col: ColumnOrName) -> Column:
5754
+ """
5755
+ Partition transform function: A transform for timestamps and dates
5756
+ to partition data into years.
5757
+
5758
+ .. versionadded:: 3.1.0
5759
+
5760
+ .. versionchanged:: 3.4.0
5761
+ Supports Spark Connect.
5762
+
5763
+ Parameters
5764
+ ----------
5765
+ col : :class:`~pyspark.sql.Column` or str
5766
+ target date or timestamp column to work on.
5767
+
5768
+ Returns
5769
+ -------
5770
+ :class:`~pyspark.sql.Column`
5771
+ data partitioned by years.
5772
+
5773
+ Examples
5774
+ --------
5775
+ >>> df.writeTo("catalog.db.table").partitionedBy( # doctest: +SKIP
5776
+ ... years("ts")
5777
+ ... ).createOrReplace()
5778
+
5779
+ Notes
5780
+ -----
5781
+ This function can be used only in combination with
5782
+ :py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy`
5783
+ method of the `DataFrameWriterV2`.
5784
+
5785
+ """
5786
+ return Column.invoke_anonymous_function(col, "years")
5787
+
5788
+
1777
5789
  @meta()
1778
5790
  def _lambda_quoted(value: str) -> t.Optional[bool]:
1779
5791
  return False if value == "_" else None