sqlframe 3.20.0__py3-none-any.whl → 3.21.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sqlframe/_version.py +2 -2
- sqlframe/base/dataframe.py +131 -14
- sqlframe/base/function_alternatives.py +0 -4
- sqlframe/base/functions.py +22 -4
- sqlframe/base/util.py +1 -5
- {sqlframe-3.20.0.dist-info → sqlframe-3.21.1.dist-info}/METADATA +1 -1
- {sqlframe-3.20.0.dist-info → sqlframe-3.21.1.dist-info}/RECORD +10 -10
- {sqlframe-3.20.0.dist-info → sqlframe-3.21.1.dist-info}/LICENSE +0 -0
- {sqlframe-3.20.0.dist-info → sqlframe-3.21.1.dist-info}/WHEEL +0 -0
- {sqlframe-3.20.0.dist-info → sqlframe-3.21.1.dist-info}/top_level.txt +0 -0
sqlframe/_version.py
CHANGED
sqlframe/base/dataframe.py
CHANGED
@@ -296,6 +296,12 @@ class BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
|
|
296
296
|
|
297
297
|
@property
|
298
298
|
def columns(self) -> t.List[str]:
|
299
|
+
expression_display_names = self.expression.copy()
|
300
|
+
self._set_display_names(expression_display_names)
|
301
|
+
return expression_display_names.named_selects
|
302
|
+
|
303
|
+
@property
|
304
|
+
def _columns(self) -> t.List[str]:
|
299
305
|
return self.expression.named_selects
|
300
306
|
|
301
307
|
@property
|
@@ -611,6 +617,18 @@ class BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
|
|
611
617
|
}
|
612
618
|
self.display_name_mapping.update(zipped)
|
613
619
|
|
620
|
+
def _set_display_names(self, select_expression: exp.Select) -> None:
|
621
|
+
for index, column in enumerate(select_expression.expressions):
|
622
|
+
column_name = quote_preserving_alias_or_name(column)
|
623
|
+
if column_name in self.display_name_mapping:
|
624
|
+
display_name_identifier = exp.to_identifier(
|
625
|
+
self.display_name_mapping[column_name], quoted=True
|
626
|
+
)
|
627
|
+
display_name_identifier._meta = {"case_sensitive": True, **(column._meta or {})}
|
628
|
+
select_expression.expressions[index] = exp.alias_(
|
629
|
+
column.unalias(), display_name_identifier, quoted=True
|
630
|
+
)
|
631
|
+
|
614
632
|
def _get_expressions(
|
615
633
|
self,
|
616
634
|
optimize: bool = True,
|
@@ -631,16 +649,7 @@ class BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
|
|
631
649
|
select_expression = select_expression.transform(
|
632
650
|
replace_id_value, replacement_mapping
|
633
651
|
).assert_is(exp.Select)
|
634
|
-
|
635
|
-
column_name = quote_preserving_alias_or_name(column)
|
636
|
-
if column_name in self.display_name_mapping:
|
637
|
-
display_name_identifier = exp.to_identifier(
|
638
|
-
self.display_name_mapping[column_name], quoted=True
|
639
|
-
)
|
640
|
-
display_name_identifier._meta = {"case_sensitive": True, **(column._meta or {})}
|
641
|
-
select_expression.expressions[index] = exp.alias_(
|
642
|
-
column.unalias(), display_name_identifier, quoted=True
|
643
|
-
)
|
652
|
+
self._set_display_names(select_expression)
|
644
653
|
if optimize:
|
645
654
|
select_expression = t.cast(
|
646
655
|
exp.Select,
|
@@ -1158,8 +1167,8 @@ class BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
|
|
1158
1167
|
|
1159
1168
|
@operation(Operation.FROM)
|
1160
1169
|
def unionByName(self, other: Self, allowMissingColumns: bool = False) -> Self:
|
1161
|
-
l_columns = self.columns
|
1162
|
-
r_columns = other.columns
|
1170
|
+
l_columns = self._columns
|
1171
|
+
r_columns = other._columns
|
1163
1172
|
if not allowMissingColumns:
|
1164
1173
|
l_expressions = l_columns
|
1165
1174
|
r_expressions = l_columns
|
@@ -1619,9 +1628,9 @@ class BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
|
|
1619
1628
|
| 16| Bob|
|
1620
1629
|
+---+-----+
|
1621
1630
|
"""
|
1622
|
-
if len(cols) != len(self.columns):
|
1631
|
+
if len(cols) != len(self._columns):
|
1623
1632
|
raise ValueError(
|
1624
|
-
f"Number of column names does not match number of columns: {len(cols)} != {len(self.columns)}"
|
1633
|
+
f"Number of column names does not match number of columns: {len(cols)} != {len(self._columns)}"
|
1625
1634
|
)
|
1626
1635
|
expression = self.expression.copy()
|
1627
1636
|
expression = expression.select(
|
@@ -1718,6 +1727,114 @@ class BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
|
|
1718
1727
|
grouping_columns.extend([list(x) for x in itertools.combinations(columns, i)])
|
1719
1728
|
return self._group_data(self, grouping_columns, self.last_op)
|
1720
1729
|
|
1730
|
+
@operation(Operation.SELECT)
|
1731
|
+
def unpivot(
|
1732
|
+
self,
|
1733
|
+
ids: t.Union[ColumnOrName, t.List[ColumnOrName], t.Tuple[ColumnOrName, ...]],
|
1734
|
+
values: t.Optional[t.Union[ColumnOrName, t.List[ColumnOrName], t.Tuple[ColumnOrName, ...]]],
|
1735
|
+
variableColumnName: str,
|
1736
|
+
valueColumnName: str,
|
1737
|
+
) -> Self:
|
1738
|
+
"""
|
1739
|
+
Unpivot a DataFrame from wide format to long format, optionally leaving
|
1740
|
+
identifier columns set. This is the reverse to `groupBy(...).pivot(...).agg(...)`,
|
1741
|
+
except for the aggregation, which cannot be reversed.
|
1742
|
+
|
1743
|
+
This function is useful to massage a DataFrame into a format where some
|
1744
|
+
columns are identifier columns ("ids"), while all other columns ("values")
|
1745
|
+
are "unpivoted" to the rows, leaving just two non-id columns, named as given
|
1746
|
+
by `variableColumnName` and `valueColumnName`.
|
1747
|
+
|
1748
|
+
When no "id" columns are given, the unpivoted DataFrame consists of only the
|
1749
|
+
"variable" and "value" columns.
|
1750
|
+
|
1751
|
+
The `values` columns must not be empty so at least one value must be given to be unpivoted.
|
1752
|
+
When `values` is `None`, all non-id columns will be unpivoted.
|
1753
|
+
|
1754
|
+
All "value" columns must share a least common data type. Unless they are the same data type,
|
1755
|
+
all "value" columns are cast to the nearest common data type. For instance, types
|
1756
|
+
`IntegerType` and `LongType` are cast to `LongType`, while `IntegerType` and `StringType`
|
1757
|
+
do not have a common data type and `unpivot` fails.
|
1758
|
+
|
1759
|
+
.. versionadded:: 3.4.0
|
1760
|
+
|
1761
|
+
Parameters
|
1762
|
+
----------
|
1763
|
+
ids : str, Column, tuple, list
|
1764
|
+
Column(s) to use as identifiers. Can be a single column or column name,
|
1765
|
+
or a list or tuple for multiple columns.
|
1766
|
+
values : str, Column, tuple, list, optional
|
1767
|
+
Column(s) to unpivot. Can be a single column or column name, or a list or tuple
|
1768
|
+
for multiple columns. If specified, must not be empty. If not specified, uses all
|
1769
|
+
columns that are not set as `ids`.
|
1770
|
+
variableColumnName : str
|
1771
|
+
Name of the variable column.
|
1772
|
+
valueColumnName : str
|
1773
|
+
Name of the value column.
|
1774
|
+
|
1775
|
+
Returns
|
1776
|
+
-------
|
1777
|
+
:class:`DataFrame`
|
1778
|
+
Unpivoted DataFrame.
|
1779
|
+
|
1780
|
+
Notes
|
1781
|
+
-----
|
1782
|
+
Supports Spark Connect.
|
1783
|
+
|
1784
|
+
Examples
|
1785
|
+
--------
|
1786
|
+
>>> df = spark.createDataFrame(
|
1787
|
+
... [(1, 11, 1.1), (2, 12, 1.2)],
|
1788
|
+
... ["id", "int", "double"],
|
1789
|
+
... )
|
1790
|
+
>>> df.show()
|
1791
|
+
+---+---+------+
|
1792
|
+
| id|int|double|
|
1793
|
+
+---+---+------+
|
1794
|
+
| 1| 11| 1.1|
|
1795
|
+
| 2| 12| 1.2|
|
1796
|
+
+---+---+------+
|
1797
|
+
|
1798
|
+
>>> df.unpivot("id", ["int", "double"], "var", "val").show()
|
1799
|
+
+---+------+----+
|
1800
|
+
| id| var| val|
|
1801
|
+
+---+------+----+
|
1802
|
+
| 1| int|11.0|
|
1803
|
+
| 1|double| 1.1|
|
1804
|
+
| 2| int|12.0|
|
1805
|
+
| 2|double| 1.2|
|
1806
|
+
+---+------+----+
|
1807
|
+
|
1808
|
+
See Also
|
1809
|
+
--------
|
1810
|
+
DataFrame.melt
|
1811
|
+
"""
|
1812
|
+
from sqlframe.base import functions as F
|
1813
|
+
|
1814
|
+
id_columns = self._ensure_and_normalize_cols(ids)
|
1815
|
+
if not values:
|
1816
|
+
outer_selects = self._get_outer_select_columns(self.expression)
|
1817
|
+
values = [
|
1818
|
+
column
|
1819
|
+
for column in outer_selects
|
1820
|
+
if column.alias_or_name not in {x.alias_or_name for x in id_columns}
|
1821
|
+
]
|
1822
|
+
value_columns = self._ensure_and_normalize_cols(values)
|
1823
|
+
|
1824
|
+
df = self._convert_leaf_to_cte()
|
1825
|
+
selects = []
|
1826
|
+
for value in value_columns:
|
1827
|
+
selects.append(
|
1828
|
+
exp.select(
|
1829
|
+
*[x.column_expression for x in id_columns],
|
1830
|
+
F.lit(value.alias_or_name).alias(variableColumnName).expression,
|
1831
|
+
value.alias(valueColumnName).expression,
|
1832
|
+
).from_(df.expression.ctes[-1].alias_or_name)
|
1833
|
+
)
|
1834
|
+
unioned_expression = functools.reduce(lambda x, y: x.union(y, distinct=False), selects) # type: ignore
|
1835
|
+
final_expression = self._add_ctes_to_expression(unioned_expression, df.expression.ctes)
|
1836
|
+
return self.copy(expression=final_expression)._convert_leaf_to_cte()
|
1837
|
+
|
1721
1838
|
def collect(self) -> t.List[Row]:
|
1722
1839
|
return self._collect()
|
1723
1840
|
|
@@ -193,10 +193,6 @@ def factorial_ensure_int(col: ColumnOrName) -> Column:
|
|
193
193
|
return Column.invoke_anonymous_function(col_func(col).cast("integer"), "FACTORIAL")
|
194
194
|
|
195
195
|
|
196
|
-
def skewness_from_skew(col: ColumnOrName) -> Column:
|
197
|
-
return Column.invoke_anonymous_function(col, "SKEW")
|
198
|
-
|
199
|
-
|
200
196
|
def isnan_using_equal(col: ColumnOrName) -> Column:
|
201
197
|
lit = get_func_from_session("lit")
|
202
198
|
return Column(
|
sqlframe/base/functions.py
CHANGED
@@ -486,14 +486,32 @@ def var_pop(col: ColumnOrName) -> Column:
|
|
486
486
|
|
487
487
|
@meta(unsupported_engines=["bigquery", "postgres"])
|
488
488
|
def skewness(col: ColumnOrName) -> Column:
|
489
|
-
from sqlframe.base.function_alternatives import skewness_from_skew
|
490
|
-
|
491
489
|
session = _get_session()
|
492
490
|
|
491
|
+
func_name = "SKEWNESS"
|
492
|
+
|
493
493
|
if session._is_snowflake:
|
494
|
-
|
494
|
+
func_name = "SKEW"
|
495
|
+
|
496
|
+
if session._is_duckdb or session._is_snowflake:
|
497
|
+
when_func = get_func_from_session("when")
|
498
|
+
count_func = get_func_from_session("count")
|
499
|
+
count_star = count_func("*")
|
500
|
+
lit_func = get_func_from_session("lit")
|
501
|
+
sqrt_func = get_func_from_session("sqrt")
|
502
|
+
col = Column.ensure_col(col)
|
503
|
+
return (
|
504
|
+
when_func(count_star == lit_func(0), lit_func(None))
|
505
|
+
.when(count_star == lit_func(1), lit_func(float("nan")))
|
506
|
+
.when(count_star == lit_func(2), lit_func(0.0))
|
507
|
+
.otherwise(
|
508
|
+
Column.invoke_anonymous_function(col, func_name)
|
509
|
+
* (count_star - lit_func(2))
|
510
|
+
/ (sqrt_func(count_star * (count_star - lit_func(1))))
|
511
|
+
)
|
512
|
+
)
|
495
513
|
|
496
|
-
return Column.invoke_anonymous_function(col, "SKEWNESS")
|
514
|
+
return Column.invoke_anonymous_function(col, func_name)
|
497
515
|
|
498
516
|
|
499
517
|
@meta(unsupported_engines=["bigquery", "postgres"])
|
sqlframe/base/util.py
CHANGED
@@ -97,12 +97,8 @@ def get_column_mapping_from_schema_input(
|
|
97
97
|
else:
|
98
98
|
value = {x.strip(): None for x in schema}
|
99
99
|
return {
|
100
|
-
|
101
|
-
if v is not None
|
102
|
-
else v
|
103
|
-
for k, v in value.items()
|
100
|
+
k: exp.DataType.build(v, dialect=dialect) if v is not None else v for k, v in value.items()
|
104
101
|
}
|
105
|
-
# return {x.strip(): None for x in schema} # type: ignore
|
106
102
|
|
107
103
|
|
108
104
|
def get_tables_from_expression_with_join(expression: exp.Select) -> t.List[exp.Table]:
|
@@ -1,14 +1,14 @@
|
|
1
1
|
sqlframe/__init__.py,sha256=wfqm98eLoLid9oV_FzzpG5loKC6LxOhj2lXpfN7SARo,3138
|
2
|
-
sqlframe/_version.py,sha256=
|
2
|
+
sqlframe/_version.py,sha256=fmhKf9XPZdwZdKpQ-ESJ_LGssm7Q8K_NJEGVKwXLGQM,413
|
3
3
|
sqlframe/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
4
4
|
sqlframe/base/_typing.py,sha256=b2clI5HI1zEZKB_3Msx3FeAJQyft44ubUifJwQRVXyQ,1298
|
5
5
|
sqlframe/base/catalog.py,sha256=SzFQalTWdhWzxUY-4ut1f9TfOECp_JmJEgNPfrRKCe0,38457
|
6
6
|
sqlframe/base/column.py,sha256=oHVwkSWABO3ZlAbgBShsxSSlgbI06BOup5XJrRhgqJI,18097
|
7
|
-
sqlframe/base/dataframe.py,sha256=
|
7
|
+
sqlframe/base/dataframe.py,sha256=FOgLdCpscLsBntkRvutcgSVqXqMgXo9DYa892mXu00E,83907
|
8
8
|
sqlframe/base/decorators.py,sha256=ms-CvDOIW3T8IVB9VqDmLwAiaEsqXLYRXEqVQaxktiM,1890
|
9
9
|
sqlframe/base/exceptions.py,sha256=9Uwvqn2eAkDpqm4BrRgbL61qM-GMCbJEMAW8otxO46s,370
|
10
|
-
sqlframe/base/function_alternatives.py,sha256=
|
11
|
-
sqlframe/base/functions.py,sha256=
|
10
|
+
sqlframe/base/function_alternatives.py,sha256=KFkEm0aIHzajvQmiPZnzTLh-Ud9wjeg4lJ4Rk0vk-YU,53674
|
11
|
+
sqlframe/base/functions.py,sha256=jfLgboldiTB9CPkoZMtKUAwx6XSvFnEOIpCZQfoEJJU,223060
|
12
12
|
sqlframe/base/group.py,sha256=fsyG5990_Pd7gFPjTFrH9IEoAquL_wEkVpIlBAIkZJU,4091
|
13
13
|
sqlframe/base/normalize.py,sha256=nXAJ5CwxVf4DV0GsH-q1w0p8gmjSMlv96k_ez1eVul8,3880
|
14
14
|
sqlframe/base/operations.py,sha256=xSPw74e59wYvNd6U1AlwziNCTG6Aftrbl4SybN9u9VE,3450
|
@@ -18,7 +18,7 @@ sqlframe/base/table.py,sha256=rCeh1W5SWbtEVfkLAUiexzrZwNgmZeptLEmLcM1ABkE,6961
|
|
18
18
|
sqlframe/base/transforms.py,sha256=y0j3SGDz3XCmNGrvassk1S-owllUWfkHyMgZlY6SFO4,467
|
19
19
|
sqlframe/base/types.py,sha256=iBNk9bpFtb2NBIogYS8i7OlQZMRvpR6XxqzBebsjQDU,12280
|
20
20
|
sqlframe/base/udf.py,sha256=O6hMhBUy9NVv-mhJRtfFhXTIa_-Z8Y_FkmmuOHu0l90,1117
|
21
|
-
sqlframe/base/util.py,sha256=
|
21
|
+
sqlframe/base/util.py,sha256=_s2M-qHzTLgyGu1v8laRHJorUpUO6-fr3kk7CsvcuXw,15161
|
22
22
|
sqlframe/base/window.py,sha256=8hOv-ignPPIsZA9FzvYzcLE9J_glalVaYjIAUdRUX3o,4943
|
23
23
|
sqlframe/base/mixins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
24
24
|
sqlframe/base/mixins/catalog_mixins.py,sha256=9tn0mK8oPoqIIjNItystD5tdBMdK9YpkxTG7G9KQl8k,18619
|
@@ -129,8 +129,8 @@ sqlframe/standalone/udf.py,sha256=azmgtUjHNIPs0WMVNId05SHwiYn41MKVBhKXsQJ5dmY,27
|
|
129
129
|
sqlframe/standalone/window.py,sha256=6GKPzuxeSapJakBaKBeT9VpED1ACdjggDv9JRILDyV0,35
|
130
130
|
sqlframe/testing/__init__.py,sha256=VVCosQhitU74A3NnE52O4mNtGZONapuEXcc20QmSlnQ,132
|
131
131
|
sqlframe/testing/utils.py,sha256=PFsGZpwNUE_4-g_f43_vstTqsK0AQ2lBneb5Eb6NkFo,13008
|
132
|
-
sqlframe-3.
|
133
|
-
sqlframe-3.
|
134
|
-
sqlframe-3.
|
135
|
-
sqlframe-3.
|
136
|
-
sqlframe-3.
|
132
|
+
sqlframe-3.21.1.dist-info/LICENSE,sha256=VZu79YgW780qxaFJMr0t5ZgbOYEh04xWoxaWOaqIGWk,1068
|
133
|
+
sqlframe-3.21.1.dist-info/METADATA,sha256=AauznGD-zSbh2cqT63w2MIrg_-0SlewyyRMNElL5O2I,8970
|
134
|
+
sqlframe-3.21.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
135
|
+
sqlframe-3.21.1.dist-info/top_level.txt,sha256=T0_RpoygaZSF6heeWwIDQgaP0varUdSK1pzjeJZRjM8,9
|
136
|
+
sqlframe-3.21.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|