sqlframe 3.20.0__py3-none-any.whl → 3.21.1__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- sqlframe/_version.py +2 -2
- sqlframe/base/dataframe.py +131 -14
- sqlframe/base/function_alternatives.py +0 -4
- sqlframe/base/functions.py +22 -4
- sqlframe/base/util.py +1 -5
- {sqlframe-3.20.0.dist-info → sqlframe-3.21.1.dist-info}/METADATA +1 -1
- {sqlframe-3.20.0.dist-info → sqlframe-3.21.1.dist-info}/RECORD +10 -10
- {sqlframe-3.20.0.dist-info → sqlframe-3.21.1.dist-info}/LICENSE +0 -0
- {sqlframe-3.20.0.dist-info → sqlframe-3.21.1.dist-info}/WHEEL +0 -0
- {sqlframe-3.20.0.dist-info → sqlframe-3.21.1.dist-info}/top_level.txt +0 -0
sqlframe/_version.py
CHANGED
sqlframe/base/dataframe.py
CHANGED
@@ -296,6 +296,12 @@ class BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
|
|
296
296
|
|
297
297
|
@property
|
298
298
|
def columns(self) -> t.List[str]:
|
299
|
+
expression_display_names = self.expression.copy()
|
300
|
+
self._set_display_names(expression_display_names)
|
301
|
+
return expression_display_names.named_selects
|
302
|
+
|
303
|
+
@property
|
304
|
+
def _columns(self) -> t.List[str]:
|
299
305
|
return self.expression.named_selects
|
300
306
|
|
301
307
|
@property
|
@@ -611,6 +617,18 @@ class BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
|
|
611
617
|
}
|
612
618
|
self.display_name_mapping.update(zipped)
|
613
619
|
|
620
|
+
def _set_display_names(self, select_expression: exp.Select) -> None:
|
621
|
+
for index, column in enumerate(select_expression.expressions):
|
622
|
+
column_name = quote_preserving_alias_or_name(column)
|
623
|
+
if column_name in self.display_name_mapping:
|
624
|
+
display_name_identifier = exp.to_identifier(
|
625
|
+
self.display_name_mapping[column_name], quoted=True
|
626
|
+
)
|
627
|
+
display_name_identifier._meta = {"case_sensitive": True, **(column._meta or {})}
|
628
|
+
select_expression.expressions[index] = exp.alias_(
|
629
|
+
column.unalias(), display_name_identifier, quoted=True
|
630
|
+
)
|
631
|
+
|
614
632
|
def _get_expressions(
|
615
633
|
self,
|
616
634
|
optimize: bool = True,
|
@@ -631,16 +649,7 @@ class BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
|
|
631
649
|
select_expression = select_expression.transform(
|
632
650
|
replace_id_value, replacement_mapping
|
633
651
|
).assert_is(exp.Select)
|
634
|
-
for index, column in enumerate(select_expression.expressions):
|
635
|
-
column_name = quote_preserving_alias_or_name(column)
|
636
|
-
if column_name in self.display_name_mapping:
|
637
|
-
display_name_identifier = exp.to_identifier(
|
638
|
-
self.display_name_mapping[column_name], quoted=True
|
639
|
-
)
|
640
|
-
display_name_identifier._meta = {"case_sensitive": True, **(column._meta or {})}
|
641
|
-
select_expression.expressions[index] = exp.alias_(
|
642
|
-
column.unalias(), display_name_identifier, quoted=True
|
643
|
-
)
|
652
|
+
self._set_display_names(select_expression)
|
644
653
|
if optimize:
|
645
654
|
select_expression = t.cast(
|
646
655
|
exp.Select,
|
@@ -1158,8 +1167,8 @@ class BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
|
|
1158
1167
|
|
1159
1168
|
@operation(Operation.FROM)
|
1160
1169
|
def unionByName(self, other: Self, allowMissingColumns: bool = False) -> Self:
|
1161
|
-
l_columns = self.columns
|
1162
|
-
r_columns = other.columns
|
1170
|
+
l_columns = self._columns
|
1171
|
+
r_columns = other._columns
|
1163
1172
|
if not allowMissingColumns:
|
1164
1173
|
l_expressions = l_columns
|
1165
1174
|
r_expressions = l_columns
|
@@ -1619,9 +1628,9 @@ class BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
|
|
1619
1628
|
| 16| Bob|
|
1620
1629
|
+---+-----+
|
1621
1630
|
"""
|
1622
|
-
if len(cols) != len(self.columns):
|
1631
|
+
if len(cols) != len(self._columns):
|
1623
1632
|
raise ValueError(
|
1624
|
-
f"Number of column names does not match number of columns: {len(cols)} != {len(self.columns)}"
|
1633
|
+
f"Number of column names does not match number of columns: {len(cols)} != {len(self._columns)}"
|
1625
1634
|
)
|
1626
1635
|
expression = self.expression.copy()
|
1627
1636
|
expression = expression.select(
|
@@ -1718,6 +1727,114 @@ class BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
|
|
1718
1727
|
grouping_columns.extend([list(x) for x in itertools.combinations(columns, i)])
|
1719
1728
|
return self._group_data(self, grouping_columns, self.last_op)
|
1720
1729
|
|
1730
|
+
@operation(Operation.SELECT)
|
1731
|
+
def unpivot(
|
1732
|
+
self,
|
1733
|
+
ids: t.Union[ColumnOrName, t.List[ColumnOrName], t.Tuple[ColumnOrName, ...]],
|
1734
|
+
values: t.Optional[t.Union[ColumnOrName, t.List[ColumnOrName], t.Tuple[ColumnOrName, ...]]],
|
1735
|
+
variableColumnName: str,
|
1736
|
+
valueColumnName: str,
|
1737
|
+
) -> Self:
|
1738
|
+
"""
|
1739
|
+
Unpivot a DataFrame from wide format to long format, optionally leaving
|
1740
|
+
identifier columns set. This is the reverse to `groupBy(...).pivot(...).agg(...)`,
|
1741
|
+
except for the aggregation, which cannot be reversed.
|
1742
|
+
|
1743
|
+
This function is useful to massage a DataFrame into a format where some
|
1744
|
+
columns are identifier columns ("ids"), while all other columns ("values")
|
1745
|
+
are "unpivoted" to the rows, leaving just two non-id columns, named as given
|
1746
|
+
by `variableColumnName` and `valueColumnName`.
|
1747
|
+
|
1748
|
+
When no "id" columns are given, the unpivoted DataFrame consists of only the
|
1749
|
+
"variable" and "value" columns.
|
1750
|
+
|
1751
|
+
The `values` columns must not be empty so at least one value must be given to be unpivoted.
|
1752
|
+
When `values` is `None`, all non-id columns will be unpivoted.
|
1753
|
+
|
1754
|
+
All "value" columns must share a least common data type. Unless they are the same data type,
|
1755
|
+
all "value" columns are cast to the nearest common data type. For instance, types
|
1756
|
+
`IntegerType` and `LongType` are cast to `LongType`, while `IntegerType` and `StringType`
|
1757
|
+
do not have a common data type and `unpivot` fails.
|
1758
|
+
|
1759
|
+
.. versionadded:: 3.4.0
|
1760
|
+
|
1761
|
+
Parameters
|
1762
|
+
----------
|
1763
|
+
ids : str, Column, tuple, list
|
1764
|
+
Column(s) to use as identifiers. Can be a single column or column name,
|
1765
|
+
or a list or tuple for multiple columns.
|
1766
|
+
values : str, Column, tuple, list, optional
|
1767
|
+
Column(s) to unpivot. Can be a single column or column name, or a list or tuple
|
1768
|
+
for multiple columns. If specified, must not be empty. If not specified, uses all
|
1769
|
+
columns that are not set as `ids`.
|
1770
|
+
variableColumnName : str
|
1771
|
+
Name of the variable column.
|
1772
|
+
valueColumnName : str
|
1773
|
+
Name of the value column.
|
1774
|
+
|
1775
|
+
Returns
|
1776
|
+
-------
|
1777
|
+
:class:`DataFrame`
|
1778
|
+
Unpivoted DataFrame.
|
1779
|
+
|
1780
|
+
Notes
|
1781
|
+
-----
|
1782
|
+
Supports Spark Connect.
|
1783
|
+
|
1784
|
+
Examples
|
1785
|
+
--------
|
1786
|
+
>>> df = spark.createDataFrame(
|
1787
|
+
... [(1, 11, 1.1), (2, 12, 1.2)],
|
1788
|
+
... ["id", "int", "double"],
|
1789
|
+
... )
|
1790
|
+
>>> df.show()
|
1791
|
+
+---+---+------+
|
1792
|
+
| id|int|double|
|
1793
|
+
+---+---+------+
|
1794
|
+
| 1| 11| 1.1|
|
1795
|
+
| 2| 12| 1.2|
|
1796
|
+
+---+---+------+
|
1797
|
+
|
1798
|
+
>>> df.unpivot("id", ["int", "double"], "var", "val").show()
|
1799
|
+
+---+------+----+
|
1800
|
+
| id| var| val|
|
1801
|
+
+---+------+----+
|
1802
|
+
| 1| int|11.0|
|
1803
|
+
| 1|double| 1.1|
|
1804
|
+
| 2| int|12.0|
|
1805
|
+
| 2|double| 1.2|
|
1806
|
+
+---+------+----+
|
1807
|
+
|
1808
|
+
See Also
|
1809
|
+
--------
|
1810
|
+
DataFrame.melt
|
1811
|
+
"""
|
1812
|
+
from sqlframe.base import functions as F
|
1813
|
+
|
1814
|
+
id_columns = self._ensure_and_normalize_cols(ids)
|
1815
|
+
if not values:
|
1816
|
+
outer_selects = self._get_outer_select_columns(self.expression)
|
1817
|
+
values = [
|
1818
|
+
column
|
1819
|
+
for column in outer_selects
|
1820
|
+
if column.alias_or_name not in {x.alias_or_name for x in id_columns}
|
1821
|
+
]
|
1822
|
+
value_columns = self._ensure_and_normalize_cols(values)
|
1823
|
+
|
1824
|
+
df = self._convert_leaf_to_cte()
|
1825
|
+
selects = []
|
1826
|
+
for value in value_columns:
|
1827
|
+
selects.append(
|
1828
|
+
exp.select(
|
1829
|
+
*[x.column_expression for x in id_columns],
|
1830
|
+
F.lit(value.alias_or_name).alias(variableColumnName).expression,
|
1831
|
+
value.alias(valueColumnName).expression,
|
1832
|
+
).from_(df.expression.ctes[-1].alias_or_name)
|
1833
|
+
)
|
1834
|
+
unioned_expression = functools.reduce(lambda x, y: x.union(y, distinct=False), selects) # type: ignore
|
1835
|
+
final_expression = self._add_ctes_to_expression(unioned_expression, df.expression.ctes)
|
1836
|
+
return self.copy(expression=final_expression)._convert_leaf_to_cte()
|
1837
|
+
|
1721
1838
|
def collect(self) -> t.List[Row]:
|
1722
1839
|
return self._collect()
|
1723
1840
|
|
@@ -193,10 +193,6 @@ def factorial_ensure_int(col: ColumnOrName) -> Column:
|
|
193
193
|
return Column.invoke_anonymous_function(col_func(col).cast("integer"), "FACTORIAL")
|
194
194
|
|
195
195
|
|
196
|
-
def skewness_from_skew(col: ColumnOrName) -> Column:
|
197
|
-
return Column.invoke_anonymous_function(col, "SKEW")
|
198
|
-
|
199
|
-
|
200
196
|
def isnan_using_equal(col: ColumnOrName) -> Column:
|
201
197
|
lit = get_func_from_session("lit")
|
202
198
|
return Column(
|
sqlframe/base/functions.py
CHANGED
@@ -486,14 +486,32 @@ def var_pop(col: ColumnOrName) -> Column:
|
|
486
486
|
|
487
487
|
@meta(unsupported_engines=["bigquery", "postgres"])
|
488
488
|
def skewness(col: ColumnOrName) -> Column:
|
489
|
-
from sqlframe.base.function_alternatives import skewness_from_skew
|
490
|
-
|
491
489
|
session = _get_session()
|
492
490
|
|
491
|
+
func_name = "SKEWNESS"
|
492
|
+
|
493
493
|
if session._is_snowflake:
|
494
|
-
return skewness_from_skew(col)
|
494
|
+
func_name = "SKEW"
|
495
|
+
|
496
|
+
if session._is_duckdb or session._is_snowflake:
|
497
|
+
when_func = get_func_from_session("when")
|
498
|
+
count_func = get_func_from_session("count")
|
499
|
+
count_star = count_func("*")
|
500
|
+
lit_func = get_func_from_session("lit")
|
501
|
+
sqrt_func = get_func_from_session("sqrt")
|
502
|
+
col = Column.ensure_col(col)
|
503
|
+
return (
|
504
|
+
when_func(count_star == lit_func(0), lit_func(None))
|
505
|
+
.when(count_star == lit_func(1), lit_func(float("nan")))
|
506
|
+
.when(count_star == lit_func(2), lit_func(0.0))
|
507
|
+
.otherwise(
|
508
|
+
Column.invoke_anonymous_function(col, func_name)
|
509
|
+
* (count_star - lit_func(2))
|
510
|
+
/ (sqrt_func(count_star * (count_star - lit_func(1))))
|
511
|
+
)
|
512
|
+
)
|
495
513
|
|
496
|
-
return Column.invoke_anonymous_function(col, "SKEWNESS")
|
514
|
+
return Column.invoke_anonymous_function(col, func_name)
|
497
515
|
|
498
516
|
|
499
517
|
@meta(unsupported_engines=["bigquery", "postgres"])
|
sqlframe/base/util.py
CHANGED
@@ -97,12 +97,8 @@ def get_column_mapping_from_schema_input(
|
|
97
97
|
else:
|
98
98
|
value = {x.strip(): None for x in schema}
|
99
99
|
return {
|
100
|
-
k: exp.DataType.build(v, dialect=dialect)
|
101
|
-
if v is not None
|
102
|
-
else v
|
103
|
-
for k, v in value.items()
|
100
|
+
k: exp.DataType.build(v, dialect=dialect) if v is not None else v for k, v in value.items()
|
104
101
|
}
|
105
|
-
# return {x.strip(): None for x in schema} # type: ignore
|
106
102
|
|
107
103
|
|
108
104
|
def get_tables_from_expression_with_join(expression: exp.Select) -> t.List[exp.Table]:
|
@@ -1,14 +1,14 @@
|
|
1
1
|
sqlframe/__init__.py,sha256=wfqm98eLoLid9oV_FzzpG5loKC6LxOhj2lXpfN7SARo,3138
|
2
|
-
sqlframe/_version.py,sha256=
|
2
|
+
sqlframe/_version.py,sha256=fmhKf9XPZdwZdKpQ-ESJ_LGssm7Q8K_NJEGVKwXLGQM,413
|
3
3
|
sqlframe/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
4
4
|
sqlframe/base/_typing.py,sha256=b2clI5HI1zEZKB_3Msx3FeAJQyft44ubUifJwQRVXyQ,1298
|
5
5
|
sqlframe/base/catalog.py,sha256=SzFQalTWdhWzxUY-4ut1f9TfOECp_JmJEgNPfrRKCe0,38457
|
6
6
|
sqlframe/base/column.py,sha256=oHVwkSWABO3ZlAbgBShsxSSlgbI06BOup5XJrRhgqJI,18097
|
7
|
-
sqlframe/base/dataframe.py,sha256=
|
7
|
+
sqlframe/base/dataframe.py,sha256=FOgLdCpscLsBntkRvutcgSVqXqMgXo9DYa892mXu00E,83907
|
8
8
|
sqlframe/base/decorators.py,sha256=ms-CvDOIW3T8IVB9VqDmLwAiaEsqXLYRXEqVQaxktiM,1890
|
9
9
|
sqlframe/base/exceptions.py,sha256=9Uwvqn2eAkDpqm4BrRgbL61qM-GMCbJEMAW8otxO46s,370
|
10
|
-
sqlframe/base/function_alternatives.py,sha256=
|
11
|
-
sqlframe/base/functions.py,sha256=
|
10
|
+
sqlframe/base/function_alternatives.py,sha256=KFkEm0aIHzajvQmiPZnzTLh-Ud9wjeg4lJ4Rk0vk-YU,53674
|
11
|
+
sqlframe/base/functions.py,sha256=jfLgboldiTB9CPkoZMtKUAwx6XSvFnEOIpCZQfoEJJU,223060
|
12
12
|
sqlframe/base/group.py,sha256=fsyG5990_Pd7gFPjTFrH9IEoAquL_wEkVpIlBAIkZJU,4091
|
13
13
|
sqlframe/base/normalize.py,sha256=nXAJ5CwxVf4DV0GsH-q1w0p8gmjSMlv96k_ez1eVul8,3880
|
14
14
|
sqlframe/base/operations.py,sha256=xSPw74e59wYvNd6U1AlwziNCTG6Aftrbl4SybN9u9VE,3450
|
@@ -18,7 +18,7 @@ sqlframe/base/table.py,sha256=rCeh1W5SWbtEVfkLAUiexzrZwNgmZeptLEmLcM1ABkE,6961
|
|
18
18
|
sqlframe/base/transforms.py,sha256=y0j3SGDz3XCmNGrvassk1S-owllUWfkHyMgZlY6SFO4,467
|
19
19
|
sqlframe/base/types.py,sha256=iBNk9bpFtb2NBIogYS8i7OlQZMRvpR6XxqzBebsjQDU,12280
|
20
20
|
sqlframe/base/udf.py,sha256=O6hMhBUy9NVv-mhJRtfFhXTIa_-Z8Y_FkmmuOHu0l90,1117
|
21
|
-
sqlframe/base/util.py,sha256=
|
21
|
+
sqlframe/base/util.py,sha256=_s2M-qHzTLgyGu1v8laRHJorUpUO6-fr3kk7CsvcuXw,15161
|
22
22
|
sqlframe/base/window.py,sha256=8hOv-ignPPIsZA9FzvYzcLE9J_glalVaYjIAUdRUX3o,4943
|
23
23
|
sqlframe/base/mixins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
24
24
|
sqlframe/base/mixins/catalog_mixins.py,sha256=9tn0mK8oPoqIIjNItystD5tdBMdK9YpkxTG7G9KQl8k,18619
|
@@ -129,8 +129,8 @@ sqlframe/standalone/udf.py,sha256=azmgtUjHNIPs0WMVNId05SHwiYn41MKVBhKXsQJ5dmY,27
|
|
129
129
|
sqlframe/standalone/window.py,sha256=6GKPzuxeSapJakBaKBeT9VpED1ACdjggDv9JRILDyV0,35
|
130
130
|
sqlframe/testing/__init__.py,sha256=VVCosQhitU74A3NnE52O4mNtGZONapuEXcc20QmSlnQ,132
|
131
131
|
sqlframe/testing/utils.py,sha256=PFsGZpwNUE_4-g_f43_vstTqsK0AQ2lBneb5Eb6NkFo,13008
|
132
|
-
sqlframe-3.20.0.dist-info/LICENSE,sha256=VZu79YgW780qxaFJMr0t5ZgbOYEh04xWoxaWOaqIGWk,1068
|
133
|
-
sqlframe-3.
|
134
|
-
sqlframe-3.20.0.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
135
|
-
sqlframe-3.20.0.dist-info/top_level.txt,sha256=T0_RpoygaZSF6heeWwIDQgaP0varUdSK1pzjeJZRjM8,9
|
136
|
-
sqlframe-3.20.0.dist-info/RECORD,,
|
132
|
+
sqlframe-3.21.1.dist-info/LICENSE,sha256=VZu79YgW780qxaFJMr0t5ZgbOYEh04xWoxaWOaqIGWk,1068
|
133
|
+
sqlframe-3.21.1.dist-info/METADATA,sha256=AauznGD-zSbh2cqT63w2MIrg_-0SlewyyRMNElL5O2I,8970
|
134
|
+
sqlframe-3.21.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
135
|
+
sqlframe-3.21.1.dist-info/top_level.txt,sha256=T0_RpoygaZSF6heeWwIDQgaP0varUdSK1pzjeJZRjM8,9
|
136
|
+
sqlframe-3.21.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|