sqlframe 3.20.0__py3-none-any.whl → 3.21.1__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
sqlframe/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '3.20.0'
16
- __version_tuple__ = version_tuple = (3, 20, 0)
15
+ __version__ = version = '3.21.1'
16
+ __version_tuple__ = version_tuple = (3, 21, 1)
sqlframe/base/dataframe.py CHANGED
@@ -296,6 +296,12 @@ class BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
296
296
 
297
297
  @property
298
298
  def columns(self) -> t.List[str]:
299
+ expression_display_names = self.expression.copy()
300
+ self._set_display_names(expression_display_names)
301
+ return expression_display_names.named_selects
302
+
303
+ @property
304
+ def _columns(self) -> t.List[str]:
299
305
  return self.expression.named_selects
300
306
 
301
307
  @property
@@ -611,6 +617,18 @@ class BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
611
617
  }
612
618
  self.display_name_mapping.update(zipped)
613
619
 
620
+ def _set_display_names(self, select_expression: exp.Select) -> None:
621
+ for index, column in enumerate(select_expression.expressions):
622
+ column_name = quote_preserving_alias_or_name(column)
623
+ if column_name in self.display_name_mapping:
624
+ display_name_identifier = exp.to_identifier(
625
+ self.display_name_mapping[column_name], quoted=True
626
+ )
627
+ display_name_identifier._meta = {"case_sensitive": True, **(column._meta or {})}
628
+ select_expression.expressions[index] = exp.alias_(
629
+ column.unalias(), display_name_identifier, quoted=True
630
+ )
631
+
614
632
  def _get_expressions(
615
633
  self,
616
634
  optimize: bool = True,
@@ -631,16 +649,7 @@ class BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
631
649
  select_expression = select_expression.transform(
632
650
  replace_id_value, replacement_mapping
633
651
  ).assert_is(exp.Select)
634
- for index, column in enumerate(select_expression.expressions):
635
- column_name = quote_preserving_alias_or_name(column)
636
- if column_name in self.display_name_mapping:
637
- display_name_identifier = exp.to_identifier(
638
- self.display_name_mapping[column_name], quoted=True
639
- )
640
- display_name_identifier._meta = {"case_sensitive": True, **(column._meta or {})}
641
- select_expression.expressions[index] = exp.alias_(
642
- column.unalias(), display_name_identifier, quoted=True
643
- )
652
+ self._set_display_names(select_expression)
644
653
  if optimize:
645
654
  select_expression = t.cast(
646
655
  exp.Select,
@@ -1158,8 +1167,8 @@ class BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
1158
1167
 
1159
1168
  @operation(Operation.FROM)
1160
1169
  def unionByName(self, other: Self, allowMissingColumns: bool = False) -> Self:
1161
- l_columns = self.columns
1162
- r_columns = other.columns
1170
+ l_columns = self._columns
1171
+ r_columns = other._columns
1163
1172
  if not allowMissingColumns:
1164
1173
  l_expressions = l_columns
1165
1174
  r_expressions = l_columns
@@ -1619,9 +1628,9 @@ class BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
1619
1628
  | 16| Bob|
1620
1629
  +---+-----+
1621
1630
  """
1622
- if len(cols) != len(self.columns):
1631
+ if len(cols) != len(self._columns):
1623
1632
  raise ValueError(
1624
- f"Number of column names does not match number of columns: {len(cols)} != {len(self.columns)}"
1633
+ f"Number of column names does not match number of columns: {len(cols)} != {len(self._columns)}"
1625
1634
  )
1626
1635
  expression = self.expression.copy()
1627
1636
  expression = expression.select(
@@ -1718,6 +1727,114 @@ class BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
1718
1727
  grouping_columns.extend([list(x) for x in itertools.combinations(columns, i)])
1719
1728
  return self._group_data(self, grouping_columns, self.last_op)
1720
1729
 
1730
+ @operation(Operation.SELECT)
1731
+ def unpivot(
1732
+ self,
1733
+ ids: t.Union[ColumnOrName, t.List[ColumnOrName], t.Tuple[ColumnOrName, ...]],
1734
+ values: t.Optional[t.Union[ColumnOrName, t.List[ColumnOrName], t.Tuple[ColumnOrName, ...]]],
1735
+ variableColumnName: str,
1736
+ valueColumnName: str,
1737
+ ) -> Self:
1738
+ """
1739
+ Unpivot a DataFrame from wide format to long format, optionally leaving
1740
+ identifier columns set. This is the reverse to `groupBy(...).pivot(...).agg(...)`,
1741
+ except for the aggregation, which cannot be reversed.
1742
+
1743
+ This function is useful to massage a DataFrame into a format where some
1744
+ columns are identifier columns ("ids"), while all other columns ("values")
1745
+ are "unpivoted" to the rows, leaving just two non-id columns, named as given
1746
+ by `variableColumnName` and `valueColumnName`.
1747
+
1748
+ When no "id" columns are given, the unpivoted DataFrame consists of only the
1749
+ "variable" and "value" columns.
1750
+
1751
+ The `values` columns must not be empty so at least one value must be given to be unpivoted.
1752
+ When `values` is `None`, all non-id columns will be unpivoted.
1753
+
1754
+ All "value" columns must share a least common data type. Unless they are the same data type,
1755
+ all "value" columns are cast to the nearest common data type. For instance, types
1756
+ `IntegerType` and `LongType` are cast to `LongType`, while `IntegerType` and `StringType`
1757
+ do not have a common data type and `unpivot` fails.
1758
+
1759
+ .. versionadded:: 3.4.0
1760
+
1761
+ Parameters
1762
+ ----------
1763
+ ids : str, Column, tuple, list
1764
+ Column(s) to use as identifiers. Can be a single column or column name,
1765
+ or a list or tuple for multiple columns.
1766
+ values : str, Column, tuple, list, optional
1767
+ Column(s) to unpivot. Can be a single column or column name, or a list or tuple
1768
+ for multiple columns. If specified, must not be empty. If not specified, uses all
1769
+ columns that are not set as `ids`.
1770
+ variableColumnName : str
1771
+ Name of the variable column.
1772
+ valueColumnName : str
1773
+ Name of the value column.
1774
+
1775
+ Returns
1776
+ -------
1777
+ :class:`DataFrame`
1778
+ Unpivoted DataFrame.
1779
+
1780
+ Notes
1781
+ -----
1782
+ Supports Spark Connect.
1783
+
1784
+ Examples
1785
+ --------
1786
+ >>> df = spark.createDataFrame(
1787
+ ... [(1, 11, 1.1), (2, 12, 1.2)],
1788
+ ... ["id", "int", "double"],
1789
+ ... )
1790
+ >>> df.show()
1791
+ +---+---+------+
1792
+ | id|int|double|
1793
+ +---+---+------+
1794
+ | 1| 11| 1.1|
1795
+ | 2| 12| 1.2|
1796
+ +---+---+------+
1797
+
1798
+ >>> df.unpivot("id", ["int", "double"], "var", "val").show()
1799
+ +---+------+----+
1800
+ | id| var| val|
1801
+ +---+------+----+
1802
+ | 1| int|11.0|
1803
+ | 1|double| 1.1|
1804
+ | 2| int|12.0|
1805
+ | 2|double| 1.2|
1806
+ +---+------+----+
1807
+
1808
+ See Also
1809
+ --------
1810
+ DataFrame.melt
1811
+ """
1812
+ from sqlframe.base import functions as F
1813
+
1814
+ id_columns = self._ensure_and_normalize_cols(ids)
1815
+ if not values:
1816
+ outer_selects = self._get_outer_select_columns(self.expression)
1817
+ values = [
1818
+ column
1819
+ for column in outer_selects
1820
+ if column.alias_or_name not in {x.alias_or_name for x in id_columns}
1821
+ ]
1822
+ value_columns = self._ensure_and_normalize_cols(values)
1823
+
1824
+ df = self._convert_leaf_to_cte()
1825
+ selects = []
1826
+ for value in value_columns:
1827
+ selects.append(
1828
+ exp.select(
1829
+ *[x.column_expression for x in id_columns],
1830
+ F.lit(value.alias_or_name).alias(variableColumnName).expression,
1831
+ value.alias(valueColumnName).expression,
1832
+ ).from_(df.expression.ctes[-1].alias_or_name)
1833
+ )
1834
+ unioned_expression = functools.reduce(lambda x, y: x.union(y, distinct=False), selects) # type: ignore
1835
+ final_expression = self._add_ctes_to_expression(unioned_expression, df.expression.ctes)
1836
+ return self.copy(expression=final_expression)._convert_leaf_to_cte()
1837
+
1721
1838
  def collect(self) -> t.List[Row]:
1722
1839
  return self._collect()
1723
1840
 
sqlframe/base/function_alternatives.py CHANGED
@@ -193,10 +193,6 @@ def factorial_ensure_int(col: ColumnOrName) -> Column:
193
193
  return Column.invoke_anonymous_function(col_func(col).cast("integer"), "FACTORIAL")
194
194
 
195
195
 
196
- def skewness_from_skew(col: ColumnOrName) -> Column:
197
- return Column.invoke_anonymous_function(col, "SKEW")
198
-
199
-
200
196
  def isnan_using_equal(col: ColumnOrName) -> Column:
201
197
  lit = get_func_from_session("lit")
202
198
  return Column(
sqlframe/base/functions.py CHANGED
@@ -486,14 +486,32 @@ def var_pop(col: ColumnOrName) -> Column:
486
486
 
487
487
  @meta(unsupported_engines=["bigquery", "postgres"])
488
488
  def skewness(col: ColumnOrName) -> Column:
489
- from sqlframe.base.function_alternatives import skewness_from_skew
490
-
491
489
  session = _get_session()
492
490
 
491
+ func_name = "SKEWNESS"
492
+
493
493
  if session._is_snowflake:
494
- return skewness_from_skew(col)
494
+ func_name = "SKEW"
495
+
496
+ if session._is_duckdb or session._is_snowflake:
497
+ when_func = get_func_from_session("when")
498
+ count_func = get_func_from_session("count")
499
+ count_star = count_func("*")
500
+ lit_func = get_func_from_session("lit")
501
+ sqrt_func = get_func_from_session("sqrt")
502
+ col = Column.ensure_col(col)
503
+ return (
504
+ when_func(count_star == lit_func(0), lit_func(None))
505
+ .when(count_star == lit_func(1), lit_func(float("nan")))
506
+ .when(count_star == lit_func(2), lit_func(0.0))
507
+ .otherwise(
508
+ Column.invoke_anonymous_function(col, func_name)
509
+ * (count_star - lit_func(2))
510
+ / (sqrt_func(count_star * (count_star - lit_func(1))))
511
+ )
512
+ )
495
513
 
496
- return Column.invoke_anonymous_function(col, "SKEWNESS")
514
+ return Column.invoke_anonymous_function(col, func_name)
497
515
 
498
516
 
499
517
  @meta(unsupported_engines=["bigquery", "postgres"])
sqlframe/base/util.py CHANGED
@@ -97,12 +97,8 @@ def get_column_mapping_from_schema_input(
97
97
  else:
98
98
  value = {x.strip(): None for x in schema}
99
99
  return {
100
- exp.to_column(k).sql(dialect=dialect): exp.DataType.build(v, dialect=dialect)
101
- if v is not None
102
- else v
103
- for k, v in value.items()
100
+ k: exp.DataType.build(v, dialect=dialect) if v is not None else v for k, v in value.items()
104
101
  }
105
- # return {x.strip(): None for x in schema} # type: ignore
106
102
 
107
103
 
108
104
  def get_tables_from_expression_with_join(expression: exp.Select) -> t.List[exp.Table]:
{sqlframe-3.20.0.dist-info → sqlframe-3.21.1.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sqlframe
3
- Version: 3.20.0
3
+ Version: 3.21.1
4
4
  Summary: Turning PySpark Into a Universal DataFrame API
5
5
  Home-page: https://github.com/eakmanrq/sqlframe
6
6
  Author: Ryan Eakman
{sqlframe-3.20.0.dist-info → sqlframe-3.21.1.dist-info}/RECORD CHANGED
@@ -1,14 +1,14 @@
1
1
  sqlframe/__init__.py,sha256=wfqm98eLoLid9oV_FzzpG5loKC6LxOhj2lXpfN7SARo,3138
2
- sqlframe/_version.py,sha256=nzt1OjXbH5tyyHQvLpmIr9I_E9sBcud1ZUXFSGz-12c,413
2
+ sqlframe/_version.py,sha256=fmhKf9XPZdwZdKpQ-ESJ_LGssm7Q8K_NJEGVKwXLGQM,413
3
3
  sqlframe/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  sqlframe/base/_typing.py,sha256=b2clI5HI1zEZKB_3Msx3FeAJQyft44ubUifJwQRVXyQ,1298
5
5
  sqlframe/base/catalog.py,sha256=SzFQalTWdhWzxUY-4ut1f9TfOECp_JmJEgNPfrRKCe0,38457
6
6
  sqlframe/base/column.py,sha256=oHVwkSWABO3ZlAbgBShsxSSlgbI06BOup5XJrRhgqJI,18097
7
- sqlframe/base/dataframe.py,sha256=mKXbIKYiKH5mh6qj0Dg7L_znmCL85q9kHlmHtCW4kJ4,79352
7
+ sqlframe/base/dataframe.py,sha256=FOgLdCpscLsBntkRvutcgSVqXqMgXo9DYa892mXu00E,83907
8
8
  sqlframe/base/decorators.py,sha256=ms-CvDOIW3T8IVB9VqDmLwAiaEsqXLYRXEqVQaxktiM,1890
9
9
  sqlframe/base/exceptions.py,sha256=9Uwvqn2eAkDpqm4BrRgbL61qM-GMCbJEMAW8otxO46s,370
10
- sqlframe/base/function_alternatives.py,sha256=NV31IaEhVYmfUSWetAEFISAvLzs2DxQ7bp-iMNgj0hQ,53786
11
- sqlframe/base/functions.py,sha256=nfDf2oKoBq2hrutTfuVHKmGvkm_X_ZvhfnFPv1rn0oU,222350
10
+ sqlframe/base/function_alternatives.py,sha256=KFkEm0aIHzajvQmiPZnzTLh-Ud9wjeg4lJ4Rk0vk-YU,53674
11
+ sqlframe/base/functions.py,sha256=jfLgboldiTB9CPkoZMtKUAwx6XSvFnEOIpCZQfoEJJU,223060
12
12
  sqlframe/base/group.py,sha256=fsyG5990_Pd7gFPjTFrH9IEoAquL_wEkVpIlBAIkZJU,4091
13
13
  sqlframe/base/normalize.py,sha256=nXAJ5CwxVf4DV0GsH-q1w0p8gmjSMlv96k_ez1eVul8,3880
14
14
  sqlframe/base/operations.py,sha256=xSPw74e59wYvNd6U1AlwziNCTG6Aftrbl4SybN9u9VE,3450
@@ -18,7 +18,7 @@ sqlframe/base/table.py,sha256=rCeh1W5SWbtEVfkLAUiexzrZwNgmZeptLEmLcM1ABkE,6961
18
18
  sqlframe/base/transforms.py,sha256=y0j3SGDz3XCmNGrvassk1S-owllUWfkHyMgZlY6SFO4,467
19
19
  sqlframe/base/types.py,sha256=iBNk9bpFtb2NBIogYS8i7OlQZMRvpR6XxqzBebsjQDU,12280
20
20
  sqlframe/base/udf.py,sha256=O6hMhBUy9NVv-mhJRtfFhXTIa_-Z8Y_FkmmuOHu0l90,1117
21
- sqlframe/base/util.py,sha256=rdnH3Kg6gZVT3DehU_ZHjfum79vc-I5W_Il6OiCtWF4,15284
21
+ sqlframe/base/util.py,sha256=_s2M-qHzTLgyGu1v8laRHJorUpUO6-fr3kk7CsvcuXw,15161
22
22
  sqlframe/base/window.py,sha256=8hOv-ignPPIsZA9FzvYzcLE9J_glalVaYjIAUdRUX3o,4943
23
23
  sqlframe/base/mixins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
24
24
  sqlframe/base/mixins/catalog_mixins.py,sha256=9tn0mK8oPoqIIjNItystD5tdBMdK9YpkxTG7G9KQl8k,18619
@@ -129,8 +129,8 @@ sqlframe/standalone/udf.py,sha256=azmgtUjHNIPs0WMVNId05SHwiYn41MKVBhKXsQJ5dmY,27
129
129
  sqlframe/standalone/window.py,sha256=6GKPzuxeSapJakBaKBeT9VpED1ACdjggDv9JRILDyV0,35
130
130
  sqlframe/testing/__init__.py,sha256=VVCosQhitU74A3NnE52O4mNtGZONapuEXcc20QmSlnQ,132
131
131
  sqlframe/testing/utils.py,sha256=PFsGZpwNUE_4-g_f43_vstTqsK0AQ2lBneb5Eb6NkFo,13008
132
- sqlframe-3.20.0.dist-info/LICENSE,sha256=VZu79YgW780qxaFJMr0t5ZgbOYEh04xWoxaWOaqIGWk,1068
133
- sqlframe-3.20.0.dist-info/METADATA,sha256=vEauG8vJY6ak5FN5oJpsaGRKgzD7uaodpdlFFu3uN04,8970
134
- sqlframe-3.20.0.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
135
- sqlframe-3.20.0.dist-info/top_level.txt,sha256=T0_RpoygaZSF6heeWwIDQgaP0varUdSK1pzjeJZRjM8,9
136
- sqlframe-3.20.0.dist-info/RECORD,,
132
+ sqlframe-3.21.1.dist-info/LICENSE,sha256=VZu79YgW780qxaFJMr0t5ZgbOYEh04xWoxaWOaqIGWk,1068
133
+ sqlframe-3.21.1.dist-info/METADATA,sha256=AauznGD-zSbh2cqT63w2MIrg_-0SlewyyRMNElL5O2I,8970
134
+ sqlframe-3.21.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
135
+ sqlframe-3.21.1.dist-info/top_level.txt,sha256=T0_RpoygaZSF6heeWwIDQgaP0varUdSK1pzjeJZRjM8,9
136
+ sqlframe-3.21.1.dist-info/RECORD,,