sqlframe 3.15.1__py3-none-any.whl → 3.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sqlframe/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
  __version_tuple__: VERSION_TUPLE
  version_tuple: VERSION_TUPLE
 
- __version__ = version = '3.15.1'
- __version_tuple__ = version_tuple = (3, 15, 1)
+ __version__ = version = '3.16.0'
+ __version_tuple__ = version_tuple = (3, 16, 0)
sqlframe/base/dataframe.py CHANGED
@@ -391,7 +391,9 @@ class BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
 
          cols = self._ensure_list_of_columns(cols)
          normalize(self.session, expression or self.expression, cols)
-         return list(flatten([self._expand_star(col) for col in cols]))
+         cols = list(flatten([self._expand_star(col) for col in cols]))
+         self._resolve_ambiguous_columns(cols)
+         return cols
 
      def _ensure_and_normalize_col(self, col):
          from sqlframe.base.column import Column
@@ -399,6 +401,7 @@ class BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
 
          col = Column.ensure_col(col)
          normalize(self.session, self.expression, col)
+         self._resolve_ambiguous_columns(col)
          return col
 
      def _convert_leaf_to_cte(
@@ -745,10 +748,55 @@ class BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
          kwargs["join_on_uuid"] = str(uuid4())
          return self.__class__(**object_to_dict(self, **kwargs))
 
+     def _resolve_ambiguous_columns(self, columns: t.Union[Column, t.List[Column]]) -> None:
+         if "joins" not in self.expression.args:
+             return
+
+         columns = ensure_list(columns)
+         ambiguous_cols: t.List[exp.Column] = list(
+             flatten(
+                 [
+                     sub_col
+                     for col in columns
+                     for sub_col in col.expression.find_all(exp.Column)
+                     if not sub_col.table
+                 ]
+             )
+         )
+         if ambiguous_cols:
+             join_table_identifiers = [
+                 x.this for x in get_tables_from_expression_with_join(self.expression)
+             ]
+             cte_names_in_join = [x.this for x in join_table_identifiers]
+             # If we have columns that resolve to multiple CTE expressions then we want to use each CTE left-to-right
+             # (or right to left if a right join) and therefore we allow multiple columns with the same
+             # name in the result. This matches the behavior of Spark.
+             resolved_column_position: t.Dict[exp.Column, int] = {
+                 col.copy(): -1 for col in ambiguous_cols
+             }
+             for ambiguous_col in ambiguous_cols:
+                 ctes = (
+                     list(reversed(self.expression.ctes))
+                     if self.expression.args["joins"][0].args.get("side", "") == "right"
+                     else self.expression.ctes
+                 )
+                 ctes_with_column = [
+                     cte
+                     for cte in ctes
+                     if cte.alias_or_name in cte_names_in_join
+                     and ambiguous_col.alias_or_name in cte.this.named_selects
+                 ]
+                 # Check if there is a CTE with this column that we haven't used before. If so, use it. Otherwise,
+                 # use the same CTE we used before
+                 cte = seq_get(ctes_with_column, resolved_column_position[ambiguous_col] + 1)
+                 if cte:
+                     resolved_column_position[ambiguous_col] += 1
+                 else:
+                     cte = ctes_with_column[resolved_column_position[ambiguous_col]]
+                 ambiguous_col.set("table", exp.to_identifier(cte.alias_or_name))
+
      @operation(Operation.SELECT)
      def select(self, *cols, **kwargs) -> Self:
-         from sqlframe.base.column import Column
-
          if not cols:
              return self
 
@@ -756,48 +804,6 @@ class BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
              cols = cols[0]  # type: ignore
          columns = self._ensure_and_normalize_cols(cols)
          kwargs["append"] = kwargs.get("append", False)
-         if self.expression.args.get("joins"):
-             ambiguous_cols: t.List[exp.Column] = list(
-                 flatten(
-                     [
-                         sub_col
-                         for col in columns
-                         for sub_col in col.expression.find_all(exp.Column)
-                         if not sub_col.table
-                     ]
-                 )
-             )
-             if ambiguous_cols:
-                 join_table_identifiers = [
-                     x.this for x in get_tables_from_expression_with_join(self.expression)
-                 ]
-                 cte_names_in_join = [x.this for x in join_table_identifiers]
-                 # If we have columns that resolve to multiple CTE expressions then we want to use each CTE left-to-right
-                 # (or right to left if a right join) and therefore we allow multiple columns with the same
-                 # name in the result. This matches the behavior of Spark.
-                 resolved_column_position: t.Dict[exp.Column, int] = {
-                     col.copy(): -1 for col in ambiguous_cols
-                 }
-                 for ambiguous_col in ambiguous_cols:
-                     ctes = (
-                         list(reversed(self.expression.ctes))
-                         if self.expression.args["joins"][0].args.get("side", "") == "right"
-                         else self.expression.ctes
-                     )
-                     ctes_with_column = [
-                         cte
-                         for cte in ctes
-                         if cte.alias_or_name in cte_names_in_join
-                         and ambiguous_col.alias_or_name in cte.this.named_selects
-                     ]
-                     # Check if there is a CTE with this column that we haven't used before. If so, use it. Otherwise,
-                     # use the same CTE we used before
-                     cte = seq_get(ctes_with_column, resolved_column_position[ambiguous_col] + 1)
-                     if cte:
-                         resolved_column_position[ambiguous_col] += 1
-                     else:
-                         cte = ctes_with_column[resolved_column_position[ambiguous_col]]
-                     ambiguous_col.set("table", exp.to_identifier(cte.alias_or_name))
          # If an expression is `CAST(x AS DATETYPE)` then we want to alias so that `x` is the result column name
          columns = [
              col.alias(col.expression.alias_or_name)
sqlframe/base/decorators.py CHANGED
@@ -43,7 +43,7 @@ def func_metadata(unsupported_engines: t.Optional[t.Union[str, t.List[str]]] = N
                      col_name = col_name.this
                  alias_name = f"{func.__name__}__{col_name or ''}__"
                  # BigQuery has restrictions on alias names so we constrain it to alphanumeric characters and underscores
-                 return result.alias(re.sub("\W", "_", alias_name))  # type: ignore
+                 return result.alias(re.sub(r"\W", "_", alias_name))  # type: ignore
              return result
 
          wrapper.unsupported_engines = (  # type: ignore
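The change above only makes the pattern a raw string: "\W" contains no recognized escape sequence, so the compiled regex is unchanged and the raw string simply avoids Python's invalid-escape-sequence warning on newer interpreters. For illustration (the alias value below is made up), the sanitization keeps word characters and replaces everything else with underscores so the alias stays BigQuery-safe:

import re

# Hypothetical alias produced by func_metadata-style naming; spaces and
# punctuation are replaced with underscores.
alias_name = "btrim__trim col!__"
print(re.sub(r"\W", "_", alias_name))  # btrim__trim_col___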
sqlframe/base/functions.py CHANGED
@@ -2851,12 +2851,14 @@ def bool_or(col: ColumnOrName) -> Column:
      return Column.invoke_expression_over_column(col, expression.LogicalOr)
 
 
- @meta(unsupported_engines="*")
+ @meta()
  def btrim(str: ColumnOrName, trim: t.Optional[ColumnOrName] = None) -> Column:
      if trim is not None:
-         return Column.invoke_anonymous_function(str, "btrim", trim)
+         return Column.invoke_expression_over_column(
+             str, expression.Trim, expression=Column.ensure_col(trim).column_expression
+         )
      else:
-         return Column.invoke_anonymous_function(str, "btrim")
+         return Column.invoke_expression_over_column(str, expression.Trim)
 
 
  @meta(unsupported_engines="*")
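Because `btrim` now builds a sqlglot `Trim` expression instead of an anonymous `btrim()` call, the generated SQL can use each dialect's native trim syntax, which is also why the `unsupported_engines="*"` marker is dropped. A hedged usage sketch, assuming the DuckDB session and that the engine's `functions` module exposes `btrim` after this change (names are illustrative):

from sqlframe.duckdb import DuckDBSession
from sqlframe.duckdb import functions as F

session = DuckDBSession()  # in-memory DuckDB connection
df = session.createDataFrame([{"s": "xxhellox"}])

# Trim the character "x" from both ends of `s`; the query should now render a
# dialect-appropriate TRIM expression rather than a bare btrim() function call.
trimmed = df.select(F.btrim(F.col("s"), F.lit("x")).alias("trimmed"))
print(trimmed.sql())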
sqlframe-3.15.1.dist-info/METADATA → sqlframe-3.16.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sqlframe
- Version: 3.15.1
+ Version: 3.16.0
  Summary: Turning PySpark Into a Universal DataFrame API
  Home-page: https://github.com/eakmanrq/sqlframe
  Author: Ryan Eakman
sqlframe-3.15.1.dist-info/RECORD → sqlframe-3.16.0.dist-info/RECORD CHANGED
@@ -1,14 +1,14 @@
  sqlframe/__init__.py,sha256=wfqm98eLoLid9oV_FzzpG5loKC6LxOhj2lXpfN7SARo,3138
- sqlframe/_version.py,sha256=rNfI2qI8EULJid-fGjytQ8KiqfMi0Ktaq6sNSFSM_1s,413
+ sqlframe/_version.py,sha256=CtTis8a_OeN0EsLFoVgtqX-ARqHjuin2ATomgRROY1Y,413
  sqlframe/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  sqlframe/base/_typing.py,sha256=b2clI5HI1zEZKB_3Msx3FeAJQyft44ubUifJwQRVXyQ,1298
  sqlframe/base/catalog.py,sha256=SzFQalTWdhWzxUY-4ut1f9TfOECp_JmJEgNPfrRKCe0,38457
  sqlframe/base/column.py,sha256=wRghgieYAA51aw4WuFQWOvl0TFOToZbBhBuIamEzxx4,18011
- sqlframe/base/dataframe.py,sha256=E1zWlB_a2FNOxjTcQ68MtL_A4c8fnLiHY3MeZttK4Xk,76570
- sqlframe/base/decorators.py,sha256=P56cgs8DANxGRIwVs5uOMnDy-BlXZZYMbf4fdnkpWPI,1889
+ sqlframe/base/dataframe.py,sha256=KKBwtn73xNGt2gRwUB8Vri7Ee6_ivP5a_qij4Eq96zE,76622
+ sqlframe/base/decorators.py,sha256=ms-CvDOIW3T8IVB9VqDmLwAiaEsqXLYRXEqVQaxktiM,1890
  sqlframe/base/exceptions.py,sha256=9Uwvqn2eAkDpqm4BrRgbL61qM-GMCbJEMAW8otxO46s,370
  sqlframe/base/function_alternatives.py,sha256=NV31IaEhVYmfUSWetAEFISAvLzs2DxQ7bp-iMNgj0hQ,53786
- sqlframe/base/functions.py,sha256=9mN54Nx6yqos1njfyW2-WRzfFUsA96P9z1ldJVtovSs,220543
+ sqlframe/base/functions.py,sha256=o8zwbS8zCsyNe5arcb6dbAGBL8a1tH99rGyRimwzzUk,220614
  sqlframe/base/group.py,sha256=fsyG5990_Pd7gFPjTFrH9IEoAquL_wEkVpIlBAIkZJU,4091
  sqlframe/base/normalize.py,sha256=nXAJ5CwxVf4DV0GsH-q1w0p8gmjSMlv96k_ez1eVul8,3880
  sqlframe/base/operations.py,sha256=xSPw74e59wYvNd6U1AlwziNCTG6Aftrbl4SybN9u9VE,3450
@@ -129,8 +129,8 @@ sqlframe/standalone/udf.py,sha256=azmgtUjHNIPs0WMVNId05SHwiYn41MKVBhKXsQJ5dmY,27
  sqlframe/standalone/window.py,sha256=6GKPzuxeSapJakBaKBeT9VpED1ACdjggDv9JRILDyV0,35
  sqlframe/testing/__init__.py,sha256=VVCosQhitU74A3NnE52O4mNtGZONapuEXcc20QmSlnQ,132
  sqlframe/testing/utils.py,sha256=PFsGZpwNUE_4-g_f43_vstTqsK0AQ2lBneb5Eb6NkFo,13008
- sqlframe-3.15.1.dist-info/LICENSE,sha256=VZu79YgW780qxaFJMr0t5ZgbOYEh04xWoxaWOaqIGWk,1068
- sqlframe-3.15.1.dist-info/METADATA,sha256=-MxovSCoyQnT-6Ujd4BDA_yVpf9KWra2v1CQGN2TmG4,8970
- sqlframe-3.15.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
- sqlframe-3.15.1.dist-info/top_level.txt,sha256=T0_RpoygaZSF6heeWwIDQgaP0varUdSK1pzjeJZRjM8,9
- sqlframe-3.15.1.dist-info/RECORD,,
+ sqlframe-3.16.0.dist-info/LICENSE,sha256=VZu79YgW780qxaFJMr0t5ZgbOYEh04xWoxaWOaqIGWk,1068
+ sqlframe-3.16.0.dist-info/METADATA,sha256=SMpgyXmxbVMqeeRuByF19qKm9iLDYubcniTCYBUmyNo,8970
+ sqlframe-3.16.0.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+ sqlframe-3.16.0.dist-info/top_level.txt,sha256=T0_RpoygaZSF6heeWwIDQgaP0varUdSK1pzjeJZRjM8,9
+ sqlframe-3.16.0.dist-info/RECORD,,