sqlframe 3.20.0__py3-none-any.whl → 3.21.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sqlframe/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '3.20.0'
16
- __version_tuple__ = version_tuple = (3, 20, 0)
15
+ __version__ = version = '3.21.1'
16
+ __version_tuple__ = version_tuple = (3, 21, 1)
@@ -296,6 +296,12 @@ class BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
296
296
 
297
297
  @property
298
298
  def columns(self) -> t.List[str]:
299
+ expression_display_names = self.expression.copy()
300
+ self._set_display_names(expression_display_names)
301
+ return expression_display_names.named_selects
302
+
303
+ @property
304
+ def _columns(self) -> t.List[str]:
299
305
  return self.expression.named_selects
300
306
 
301
307
  @property
@@ -611,6 +617,18 @@ class BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
611
617
  }
612
618
  self.display_name_mapping.update(zipped)
613
619
 
620
+ def _set_display_names(self, select_expression: exp.Select) -> None:
621
+ for index, column in enumerate(select_expression.expressions):
622
+ column_name = quote_preserving_alias_or_name(column)
623
+ if column_name in self.display_name_mapping:
624
+ display_name_identifier = exp.to_identifier(
625
+ self.display_name_mapping[column_name], quoted=True
626
+ )
627
+ display_name_identifier._meta = {"case_sensitive": True, **(column._meta or {})}
628
+ select_expression.expressions[index] = exp.alias_(
629
+ column.unalias(), display_name_identifier, quoted=True
630
+ )
631
+
614
632
  def _get_expressions(
615
633
  self,
616
634
  optimize: bool = True,
@@ -631,16 +649,7 @@ class BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
631
649
  select_expression = select_expression.transform(
632
650
  replace_id_value, replacement_mapping
633
651
  ).assert_is(exp.Select)
634
- for index, column in enumerate(select_expression.expressions):
635
- column_name = quote_preserving_alias_or_name(column)
636
- if column_name in self.display_name_mapping:
637
- display_name_identifier = exp.to_identifier(
638
- self.display_name_mapping[column_name], quoted=True
639
- )
640
- display_name_identifier._meta = {"case_sensitive": True, **(column._meta or {})}
641
- select_expression.expressions[index] = exp.alias_(
642
- column.unalias(), display_name_identifier, quoted=True
643
- )
652
+ self._set_display_names(select_expression)
644
653
  if optimize:
645
654
  select_expression = t.cast(
646
655
  exp.Select,
@@ -1158,8 +1167,8 @@ class BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
1158
1167
 
1159
1168
  @operation(Operation.FROM)
1160
1169
  def unionByName(self, other: Self, allowMissingColumns: bool = False) -> Self:
1161
- l_columns = self.columns
1162
- r_columns = other.columns
1170
+ l_columns = self._columns
1171
+ r_columns = other._columns
1163
1172
  if not allowMissingColumns:
1164
1173
  l_expressions = l_columns
1165
1174
  r_expressions = l_columns
@@ -1619,9 +1628,9 @@ class BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
1619
1628
  | 16| Bob|
1620
1629
  +---+-----+
1621
1630
  """
1622
- if len(cols) != len(self.columns):
1631
+ if len(cols) != len(self._columns):
1623
1632
  raise ValueError(
1624
- f"Number of column names does not match number of columns: {len(cols)} != {len(self.columns)}"
1633
+ f"Number of column names does not match number of columns: {len(cols)} != {len(self._columns)}"
1625
1634
  )
1626
1635
  expression = self.expression.copy()
1627
1636
  expression = expression.select(
@@ -1718,6 +1727,114 @@ class BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
1718
1727
  grouping_columns.extend([list(x) for x in itertools.combinations(columns, i)])
1719
1728
  return self._group_data(self, grouping_columns, self.last_op)
1720
1729
 
1730
+ @operation(Operation.SELECT)
1731
+ def unpivot(
1732
+ self,
1733
+ ids: t.Union[ColumnOrName, t.List[ColumnOrName], t.Tuple[ColumnOrName, ...]],
1734
+ values: t.Optional[t.Union[ColumnOrName, t.List[ColumnOrName], t.Tuple[ColumnOrName, ...]]],
1735
+ variableColumnName: str,
1736
+ valueColumnName: str,
1737
+ ) -> Self:
1738
+ """
1739
+ Unpivot a DataFrame from wide format to long format, optionally leaving
1740
+ identifier columns set. This is the reverse to `groupBy(...).pivot(...).agg(...)`,
1741
+ except for the aggregation, which cannot be reversed.
1742
+
1743
+ This function is useful to massage a DataFrame into a format where some
1744
+ columns are identifier columns ("ids"), while all other columns ("values")
1745
+ are "unpivoted" to the rows, leaving just two non-id columns, named as given
1746
+ by `variableColumnName` and `valueColumnName`.
1747
+
1748
+ When no "id" columns are given, the unpivoted DataFrame consists of only the
1749
+ "variable" and "value" columns.
1750
+
1751
+ The `values` columns must not be empty so at least one value must be given to be unpivoted.
1752
+ When `values` is `None`, all non-id columns will be unpivoted.
1753
+
1754
+ All "value" columns must share a least common data type. Unless they are the same data type,
1755
+ all "value" columns are cast to the nearest common data type. For instance, types
1756
+ `IntegerType` and `LongType` are cast to `LongType`, while `IntegerType` and `StringType`
1757
+ do not have a common data type and `unpivot` fails.
1758
+
1759
+ .. versionadded:: 3.4.0
1760
+
1761
+ Parameters
1762
+ ----------
1763
+ ids : str, Column, tuple, list
1764
+ Column(s) to use as identifiers. Can be a single column or column name,
1765
+ or a list or tuple for multiple columns.
1766
+ values : str, Column, tuple, list, optional
1767
+ Column(s) to unpivot. Can be a single column or column name, or a list or tuple
1768
+ for multiple columns. If specified, must not be empty. If not specified, uses all
1769
+ columns that are not set as `ids`.
1770
+ variableColumnName : str
1771
+ Name of the variable column.
1772
+ valueColumnName : str
1773
+ Name of the value column.
1774
+
1775
+ Returns
1776
+ -------
1777
+ :class:`DataFrame`
1778
+ Unpivoted DataFrame.
1779
+
1780
+ Notes
1781
+ -----
1782
+ Supports Spark Connect.
1783
+
1784
+ Examples
1785
+ --------
1786
+ >>> df = spark.createDataFrame(
1787
+ ... [(1, 11, 1.1), (2, 12, 1.2)],
1788
+ ... ["id", "int", "double"],
1789
+ ... )
1790
+ >>> df.show()
1791
+ +---+---+------+
1792
+ | id|int|double|
1793
+ +---+---+------+
1794
+ | 1| 11| 1.1|
1795
+ | 2| 12| 1.2|
1796
+ +---+---+------+
1797
+
1798
+ >>> df.unpivot("id", ["int", "double"], "var", "val").show()
1799
+ +---+------+----+
1800
+ | id| var| val|
1801
+ +---+------+----+
1802
+ | 1| int|11.0|
1803
+ | 1|double| 1.1|
1804
+ | 2| int|12.0|
1805
+ | 2|double| 1.2|
1806
+ +---+------+----+
1807
+
1808
+ See Also
1809
+ --------
1810
+ DataFrame.melt
1811
+ """
1812
+ from sqlframe.base import functions as F
1813
+
1814
+ id_columns = self._ensure_and_normalize_cols(ids)
1815
+ if not values:
1816
+ outer_selects = self._get_outer_select_columns(self.expression)
1817
+ values = [
1818
+ column
1819
+ for column in outer_selects
1820
+ if column.alias_or_name not in {x.alias_or_name for x in id_columns}
1821
+ ]
1822
+ value_columns = self._ensure_and_normalize_cols(values)
1823
+
1824
+ df = self._convert_leaf_to_cte()
1825
+ selects = []
1826
+ for value in value_columns:
1827
+ selects.append(
1828
+ exp.select(
1829
+ *[x.column_expression for x in id_columns],
1830
+ F.lit(value.alias_or_name).alias(variableColumnName).expression,
1831
+ value.alias(valueColumnName).expression,
1832
+ ).from_(df.expression.ctes[-1].alias_or_name)
1833
+ )
1834
+ unioned_expression = functools.reduce(lambda x, y: x.union(y, distinct=False), selects) # type: ignore
1835
+ final_expression = self._add_ctes_to_expression(unioned_expression, df.expression.ctes)
1836
+ return self.copy(expression=final_expression)._convert_leaf_to_cte()
1837
+
1721
1838
  def collect(self) -> t.List[Row]:
1722
1839
  return self._collect()
1723
1840
 
@@ -193,10 +193,6 @@ def factorial_ensure_int(col: ColumnOrName) -> Column:
193
193
  return Column.invoke_anonymous_function(col_func(col).cast("integer"), "FACTORIAL")
194
194
 
195
195
 
196
- def skewness_from_skew(col: ColumnOrName) -> Column:
197
- return Column.invoke_anonymous_function(col, "SKEW")
198
-
199
-
200
196
  def isnan_using_equal(col: ColumnOrName) -> Column:
201
197
  lit = get_func_from_session("lit")
202
198
  return Column(
@@ -486,14 +486,32 @@ def var_pop(col: ColumnOrName) -> Column:
486
486
 
487
487
  @meta(unsupported_engines=["bigquery", "postgres"])
488
488
  def skewness(col: ColumnOrName) -> Column:
489
- from sqlframe.base.function_alternatives import skewness_from_skew
490
-
491
489
  session = _get_session()
492
490
 
491
+ func_name = "SKEWNESS"
492
+
493
493
  if session._is_snowflake:
494
- return skewness_from_skew(col)
494
+ func_name = "SKEW"
495
+
496
+ if session._is_duckdb or session._is_snowflake:
497
+ when_func = get_func_from_session("when")
498
+ count_func = get_func_from_session("count")
499
+ count_star = count_func("*")
500
+ lit_func = get_func_from_session("lit")
501
+ sqrt_func = get_func_from_session("sqrt")
502
+ col = Column.ensure_col(col)
503
+ return (
504
+ when_func(count_star == lit_func(0), lit_func(None))
505
+ .when(count_star == lit_func(1), lit_func(float("nan")))
506
+ .when(count_star == lit_func(2), lit_func(0.0))
507
+ .otherwise(
508
+ Column.invoke_anonymous_function(col, func_name)
509
+ * (count_star - lit_func(2))
510
+ / (sqrt_func(count_star * (count_star - lit_func(1))))
511
+ )
512
+ )
495
513
 
496
- return Column.invoke_anonymous_function(col, "SKEWNESS")
514
+ return Column.invoke_anonymous_function(col, func_name)
497
515
 
498
516
 
499
517
  @meta(unsupported_engines=["bigquery", "postgres"])
sqlframe/base/util.py CHANGED
@@ -97,12 +97,8 @@ def get_column_mapping_from_schema_input(
97
97
  else:
98
98
  value = {x.strip(): None for x in schema}
99
99
  return {
100
- exp.to_column(k).sql(dialect=dialect): exp.DataType.build(v, dialect=dialect)
101
- if v is not None
102
- else v
103
- for k, v in value.items()
100
+ k: exp.DataType.build(v, dialect=dialect) if v is not None else v for k, v in value.items()
104
101
  }
105
- # return {x.strip(): None for x in schema} # type: ignore
106
102
 
107
103
 
108
104
  def get_tables_from_expression_with_join(expression: exp.Select) -> t.List[exp.Table]:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sqlframe
3
- Version: 3.20.0
3
+ Version: 3.21.1
4
4
  Summary: Turning PySpark Into a Universal DataFrame API
5
5
  Home-page: https://github.com/eakmanrq/sqlframe
6
6
  Author: Ryan Eakman
@@ -1,14 +1,14 @@
1
1
  sqlframe/__init__.py,sha256=wfqm98eLoLid9oV_FzzpG5loKC6LxOhj2lXpfN7SARo,3138
2
- sqlframe/_version.py,sha256=nzt1OjXbH5tyyHQvLpmIr9I_E9sBcud1ZUXFSGz-12c,413
2
+ sqlframe/_version.py,sha256=fmhKf9XPZdwZdKpQ-ESJ_LGssm7Q8K_NJEGVKwXLGQM,413
3
3
  sqlframe/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  sqlframe/base/_typing.py,sha256=b2clI5HI1zEZKB_3Msx3FeAJQyft44ubUifJwQRVXyQ,1298
5
5
  sqlframe/base/catalog.py,sha256=SzFQalTWdhWzxUY-4ut1f9TfOECp_JmJEgNPfrRKCe0,38457
6
6
  sqlframe/base/column.py,sha256=oHVwkSWABO3ZlAbgBShsxSSlgbI06BOup5XJrRhgqJI,18097
7
- sqlframe/base/dataframe.py,sha256=mKXbIKYiKH5mh6qj0Dg7L_znmCL85q9kHlmHtCW4kJ4,79352
7
+ sqlframe/base/dataframe.py,sha256=FOgLdCpscLsBntkRvutcgSVqXqMgXo9DYa892mXu00E,83907
8
8
  sqlframe/base/decorators.py,sha256=ms-CvDOIW3T8IVB9VqDmLwAiaEsqXLYRXEqVQaxktiM,1890
9
9
  sqlframe/base/exceptions.py,sha256=9Uwvqn2eAkDpqm4BrRgbL61qM-GMCbJEMAW8otxO46s,370
10
- sqlframe/base/function_alternatives.py,sha256=NV31IaEhVYmfUSWetAEFISAvLzs2DxQ7bp-iMNgj0hQ,53786
11
- sqlframe/base/functions.py,sha256=nfDf2oKoBq2hrutTfuVHKmGvkm_X_ZvhfnFPv1rn0oU,222350
10
+ sqlframe/base/function_alternatives.py,sha256=KFkEm0aIHzajvQmiPZnzTLh-Ud9wjeg4lJ4Rk0vk-YU,53674
11
+ sqlframe/base/functions.py,sha256=jfLgboldiTB9CPkoZMtKUAwx6XSvFnEOIpCZQfoEJJU,223060
12
12
  sqlframe/base/group.py,sha256=fsyG5990_Pd7gFPjTFrH9IEoAquL_wEkVpIlBAIkZJU,4091
13
13
  sqlframe/base/normalize.py,sha256=nXAJ5CwxVf4DV0GsH-q1w0p8gmjSMlv96k_ez1eVul8,3880
14
14
  sqlframe/base/operations.py,sha256=xSPw74e59wYvNd6U1AlwziNCTG6Aftrbl4SybN9u9VE,3450
@@ -18,7 +18,7 @@ sqlframe/base/table.py,sha256=rCeh1W5SWbtEVfkLAUiexzrZwNgmZeptLEmLcM1ABkE,6961
18
18
  sqlframe/base/transforms.py,sha256=y0j3SGDz3XCmNGrvassk1S-owllUWfkHyMgZlY6SFO4,467
19
19
  sqlframe/base/types.py,sha256=iBNk9bpFtb2NBIogYS8i7OlQZMRvpR6XxqzBebsjQDU,12280
20
20
  sqlframe/base/udf.py,sha256=O6hMhBUy9NVv-mhJRtfFhXTIa_-Z8Y_FkmmuOHu0l90,1117
21
- sqlframe/base/util.py,sha256=rdnH3Kg6gZVT3DehU_ZHjfum79vc-I5W_Il6OiCtWF4,15284
21
+ sqlframe/base/util.py,sha256=_s2M-qHzTLgyGu1v8laRHJorUpUO6-fr3kk7CsvcuXw,15161
22
22
  sqlframe/base/window.py,sha256=8hOv-ignPPIsZA9FzvYzcLE9J_glalVaYjIAUdRUX3o,4943
23
23
  sqlframe/base/mixins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
24
24
  sqlframe/base/mixins/catalog_mixins.py,sha256=9tn0mK8oPoqIIjNItystD5tdBMdK9YpkxTG7G9KQl8k,18619
@@ -129,8 +129,8 @@ sqlframe/standalone/udf.py,sha256=azmgtUjHNIPs0WMVNId05SHwiYn41MKVBhKXsQJ5dmY,27
129
129
  sqlframe/standalone/window.py,sha256=6GKPzuxeSapJakBaKBeT9VpED1ACdjggDv9JRILDyV0,35
130
130
  sqlframe/testing/__init__.py,sha256=VVCosQhitU74A3NnE52O4mNtGZONapuEXcc20QmSlnQ,132
131
131
  sqlframe/testing/utils.py,sha256=PFsGZpwNUE_4-g_f43_vstTqsK0AQ2lBneb5Eb6NkFo,13008
132
- sqlframe-3.20.0.dist-info/LICENSE,sha256=VZu79YgW780qxaFJMr0t5ZgbOYEh04xWoxaWOaqIGWk,1068
133
- sqlframe-3.20.0.dist-info/METADATA,sha256=vEauG8vJY6ak5FN5oJpsaGRKgzD7uaodpdlFFu3uN04,8970
134
- sqlframe-3.20.0.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
135
- sqlframe-3.20.0.dist-info/top_level.txt,sha256=T0_RpoygaZSF6heeWwIDQgaP0varUdSK1pzjeJZRjM8,9
136
- sqlframe-3.20.0.dist-info/RECORD,,
132
+ sqlframe-3.21.1.dist-info/LICENSE,sha256=VZu79YgW780qxaFJMr0t5ZgbOYEh04xWoxaWOaqIGWk,1068
133
+ sqlframe-3.21.1.dist-info/METADATA,sha256=AauznGD-zSbh2cqT63w2MIrg_-0SlewyyRMNElL5O2I,8970
134
+ sqlframe-3.21.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
135
+ sqlframe-3.21.1.dist-info/top_level.txt,sha256=T0_RpoygaZSF6heeWwIDQgaP0varUdSK1pzjeJZRjM8,9
136
+ sqlframe-3.21.1.dist-info/RECORD,,