sqlframe-1.1.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. sqlframe/__init__.py +0 -0
  2. sqlframe/_version.py +16 -0
  3. sqlframe/base/__init__.py +0 -0
  4. sqlframe/base/_typing.py +39 -0
  5. sqlframe/base/catalog.py +1163 -0
  6. sqlframe/base/column.py +388 -0
  7. sqlframe/base/dataframe.py +1519 -0
  8. sqlframe/base/decorators.py +51 -0
  9. sqlframe/base/exceptions.py +14 -0
  10. sqlframe/base/function_alternatives.py +1055 -0
  11. sqlframe/base/functions.py +1678 -0
  12. sqlframe/base/group.py +102 -0
  13. sqlframe/base/mixins/__init__.py +0 -0
  14. sqlframe/base/mixins/catalog_mixins.py +419 -0
  15. sqlframe/base/mixins/readwriter_mixins.py +118 -0
  16. sqlframe/base/normalize.py +84 -0
  17. sqlframe/base/operations.py +87 -0
  18. sqlframe/base/readerwriter.py +679 -0
  19. sqlframe/base/session.py +585 -0
  20. sqlframe/base/transforms.py +13 -0
  21. sqlframe/base/types.py +418 -0
  22. sqlframe/base/util.py +242 -0
  23. sqlframe/base/window.py +139 -0
  24. sqlframe/bigquery/__init__.py +23 -0
  25. sqlframe/bigquery/catalog.py +255 -0
  26. sqlframe/bigquery/column.py +1 -0
  27. sqlframe/bigquery/dataframe.py +54 -0
  28. sqlframe/bigquery/functions.py +378 -0
  29. sqlframe/bigquery/group.py +14 -0
  30. sqlframe/bigquery/readwriter.py +29 -0
  31. sqlframe/bigquery/session.py +89 -0
  32. sqlframe/bigquery/types.py +1 -0
  33. sqlframe/bigquery/window.py +1 -0
  34. sqlframe/duckdb/__init__.py +20 -0
  35. sqlframe/duckdb/catalog.py +108 -0
  36. sqlframe/duckdb/column.py +1 -0
  37. sqlframe/duckdb/dataframe.py +55 -0
  38. sqlframe/duckdb/functions.py +47 -0
  39. sqlframe/duckdb/group.py +14 -0
  40. sqlframe/duckdb/readwriter.py +111 -0
  41. sqlframe/duckdb/session.py +65 -0
  42. sqlframe/duckdb/types.py +1 -0
  43. sqlframe/duckdb/window.py +1 -0
  44. sqlframe/postgres/__init__.py +23 -0
  45. sqlframe/postgres/catalog.py +106 -0
  46. sqlframe/postgres/column.py +1 -0
  47. sqlframe/postgres/dataframe.py +54 -0
  48. sqlframe/postgres/functions.py +61 -0
  49. sqlframe/postgres/group.py +14 -0
  50. sqlframe/postgres/readwriter.py +29 -0
  51. sqlframe/postgres/session.py +68 -0
  52. sqlframe/postgres/types.py +1 -0
  53. sqlframe/postgres/window.py +1 -0
  54. sqlframe/redshift/__init__.py +23 -0
  55. sqlframe/redshift/catalog.py +127 -0
  56. sqlframe/redshift/column.py +1 -0
  57. sqlframe/redshift/dataframe.py +54 -0
  58. sqlframe/redshift/functions.py +18 -0
  59. sqlframe/redshift/group.py +14 -0
  60. sqlframe/redshift/readwriter.py +29 -0
  61. sqlframe/redshift/session.py +53 -0
  62. sqlframe/redshift/types.py +1 -0
  63. sqlframe/redshift/window.py +1 -0
  64. sqlframe/snowflake/__init__.py +26 -0
  65. sqlframe/snowflake/catalog.py +134 -0
  66. sqlframe/snowflake/column.py +1 -0
  67. sqlframe/snowflake/dataframe.py +54 -0
  68. sqlframe/snowflake/functions.py +18 -0
  69. sqlframe/snowflake/group.py +14 -0
  70. sqlframe/snowflake/readwriter.py +29 -0
  71. sqlframe/snowflake/session.py +53 -0
  72. sqlframe/snowflake/types.py +1 -0
  73. sqlframe/snowflake/window.py +1 -0
  74. sqlframe/spark/__init__.py +23 -0
  75. sqlframe/spark/catalog.py +1028 -0
  76. sqlframe/spark/column.py +1 -0
  77. sqlframe/spark/dataframe.py +54 -0
  78. sqlframe/spark/functions.py +22 -0
  79. sqlframe/spark/group.py +14 -0
  80. sqlframe/spark/readwriter.py +29 -0
  81. sqlframe/spark/session.py +90 -0
  82. sqlframe/spark/types.py +1 -0
  83. sqlframe/spark/window.py +1 -0
  84. sqlframe/standalone/__init__.py +26 -0
  85. sqlframe/standalone/catalog.py +13 -0
  86. sqlframe/standalone/column.py +1 -0
  87. sqlframe/standalone/dataframe.py +36 -0
  88. sqlframe/standalone/functions.py +1 -0
  89. sqlframe/standalone/group.py +14 -0
  90. sqlframe/standalone/readwriter.py +19 -0
  91. sqlframe/standalone/session.py +40 -0
  92. sqlframe/standalone/types.py +1 -0
  93. sqlframe/standalone/window.py +1 -0
  94. sqlframe-1.1.3.dist-info/LICENSE +21 -0
  95. sqlframe-1.1.3.dist-info/METADATA +172 -0
  96. sqlframe-1.1.3.dist-info/RECORD +98 -0
  97. sqlframe-1.1.3.dist-info/WHEEL +5 -0
  98. sqlframe-1.1.3.dist-info/top_level.txt +1 -0
sqlframe/base/window.py
@@ -0,0 +1,139 @@
+ # This code is based on code from Apache Spark under the license found in the LICENSE file located in the 'sqlframe' folder.
+
+ from __future__ import annotations
+
+ import sys
+ import typing as t
+
+ from sqlglot import expressions as exp
+ from sqlglot.helper import flatten
+
+ from sqlframe.base import functions as F
+
+ if t.TYPE_CHECKING:
+     from sqlframe.base._typing import ColumnOrName
+
+
+ class Window:
+     _JAVA_MIN_LONG = -(1 << 63)  # -9223372036854775808
+     _JAVA_MAX_LONG = (1 << 63) - 1  # 9223372036854775807
+     _PRECEDING_THRESHOLD = max(-sys.maxsize, _JAVA_MIN_LONG)
+     _FOLLOWING_THRESHOLD = min(sys.maxsize, _JAVA_MAX_LONG)
+
+     unboundedPreceding: int = _JAVA_MIN_LONG
+
+     unboundedFollowing: int = _JAVA_MAX_LONG
+
+     currentRow: int = 0
+
+     @classmethod
+     def partitionBy(cls, *cols: t.Union[ColumnOrName, t.List[ColumnOrName]]) -> WindowSpec:
+         return WindowSpec().partitionBy(*cols)
+
+     @classmethod
+     def orderBy(cls, *cols: t.Union[ColumnOrName, t.List[ColumnOrName]]) -> WindowSpec:
+         return WindowSpec().orderBy(*cols)
+
+     @classmethod
+     def rowsBetween(cls, start: int, end: int) -> WindowSpec:
+         return WindowSpec().rowsBetween(start, end)
+
+     @classmethod
+     def rangeBetween(cls, start: int, end: int) -> WindowSpec:
+         return WindowSpec().rangeBetween(start, end)
+
+
+ class WindowSpec:
+     def __init__(self, expression: exp.Expression = exp.Window()):
+         self.expression = expression
+
+     def copy(self):
+         return WindowSpec(self.expression.copy())
+
+     def sql(self, **kwargs) -> str:
+         from sqlframe.base.session import _BaseSession
+
+         return self.expression.sql(dialect=_BaseSession().input_dialect, **kwargs)
+
+     def partitionBy(self, *cols: t.Union[ColumnOrName, t.List[ColumnOrName]]) -> WindowSpec:
+         from sqlframe.base.column import Column
+
+         cols = flatten(cols) if isinstance(cols[0], (list, set, tuple)) else cols  # type: ignore
+         expressions = [Column.ensure_col(x).expression for x in cols]  # type: ignore
+         window_spec = self.copy()
+         partition_by_expressions = window_spec.expression.args.get("partition_by", [])
+         partition_by_expressions.extend(expressions)
+         window_spec.expression.set("partition_by", partition_by_expressions)
+         return window_spec
+
+     def orderBy(self, *cols: t.Union[ColumnOrName, t.List[ColumnOrName]]) -> WindowSpec:
+         from sqlframe.base.column import Column
+
+         cols = flatten(cols) if isinstance(cols[0], (list, set, tuple)) else cols  # type: ignore
+         expressions = [Column.ensure_col(x).expression for x in cols]  # type: ignore
+         window_spec = self.copy()
+         if window_spec.expression.args.get("order") is None:
+             window_spec.expression.set("order", exp.Order(expressions=[]))
+         order_by = window_spec.expression.args["order"].expressions
+         order_by.extend(expressions)
+         window_spec.expression.args["order"].set("expressions", order_by)
+         return window_spec
+
+     def _calc_start_end(
+         self, start: int, end: int
+     ) -> t.Dict[str, t.Optional[t.Union[str, exp.Expression]]]:
+         kwargs: t.Dict[str, t.Optional[t.Union[str, exp.Expression]]] = {
+             "start_side": None,
+             "end_side": None,
+         }
+         if start == Window.currentRow:
+             kwargs["start"] = "CURRENT ROW"
+         else:
+             kwargs = {
+                 **kwargs,
+                 **{
+                     "start_side": "PRECEDING",
+                     "start": (
+                         "UNBOUNDED"
+                         if start <= Window.unboundedPreceding
+                         else F.lit(start).expression
+                     ),
+                 },
+             }
+         if end == Window.currentRow:
+             kwargs["end"] = "CURRENT ROW"
+         else:
+             kwargs = {
+                 **kwargs,
+                 **{
+                     "end_side": "FOLLOWING",
+                     "end": (
+                         "UNBOUNDED" if end >= Window.unboundedFollowing else F.lit(end).expression
+                     ),
+                 },
+             }
+         return kwargs
+
+     def rowsBetween(self, start: int, end: int) -> WindowSpec:
+         window_spec = self.copy()
+         spec = self._calc_start_end(start, end)
+         spec["kind"] = "ROWS"
+         window_spec.expression.set(
+             "spec",
+             exp.WindowSpec(
+                 **{**window_spec.expression.args.get("spec", exp.WindowSpec()).args, **spec}
+             ),
+         )
+         return window_spec
+
+     def rangeBetween(self, start: int, end: int) -> WindowSpec:
+         window_spec = self.copy()
+         spec = self._calc_start_end(start, end)
+         spec["kind"] = "RANGE"
+         window_spec.expression.set(
+             "spec",
+             exp.WindowSpec(
+                 **{**window_spec.expression.args.get("spec", exp.WindowSpec()).args, **spec}
+             ),
+         )
+         return window_spec
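
The Window/WindowSpec pair mirrors pyspark.sql.Window: the classmethods seed a fresh WindowSpec, and each builder method copies the spec before mutating the underlying sqlglot exp.Window, so specs compose without aliasing. A minimal usage sketch (not part of the diff; the column names are hypothetical, and the rendered SQL shown is illustrative since sql() depends on an initialized session's input dialect):

from sqlframe.base.window import Window

# Each builder call returns a new WindowSpec copy; the original is never
# mutated in place, so partial specs can be safely reused.
running_total = (
    Window.partitionBy("region")
    .orderBy("sale_date")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Window.unboundedPreceding is the Java min long, so _calc_start_end maps any
# start <= that sentinel to the UNBOUNDED keyword. Output is roughly:
#   PARTITION BY region ORDER BY sale_date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
print(running_total.sql())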
sqlframe/bigquery/__init__.py
@@ -0,0 +1,23 @@
+ from sqlframe.bigquery.catalog import BigQueryCatalog
+ from sqlframe.bigquery.column import Column
+ from sqlframe.bigquery.dataframe import BigQueryDataFrame, BigQueryDataFrameNaFunctions
+ from sqlframe.bigquery.group import BigQueryGroupedData
+ from sqlframe.bigquery.readwriter import (
+     BigQueryDataFrameReader,
+     BigQueryDataFrameWriter,
+ )
+ from sqlframe.bigquery.session import BigQuerySession
+ from sqlframe.bigquery.window import Window, WindowSpec
+
+ __all__ = [
+     "BigQueryCatalog",
+     "Column",
+     "BigQueryDataFrame",
+     "BigQueryDataFrameNaFunctions",
+     "BigQueryGroupedData",
+     "BigQueryDataFrameReader",
+     "BigQueryDataFrameWriter",
+     "BigQuerySession",
+     "Window",
+     "WindowSpec",
+ ]
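
Because __init__.py re-exports the engine-specific classes, callers import from the package root rather than from the submodules. A hedged sketch (the project/dataset value is hypothetical; `default_dataset` is the setting that BigQueryCatalog.currentDatabase() expects to have been defined when creating the session, per the error message in catalog.py):

from sqlframe.bigquery import BigQuerySession, Window

# Hypothetical project.dataset pair; real use requires BigQuery credentials.
session = BigQuerySession(default_dataset="my-project.my_dataset")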
sqlframe/bigquery/catalog.py
@@ -0,0 +1,255 @@
+ from __future__ import annotations
+
+ import fnmatch
+ import typing as t
+
+ from sqlglot import exp
+
+ from sqlframe.base.catalog import CatalogMetadata, Column, Function
+ from sqlframe.base.decorators import normalize
+ from sqlframe.base.mixins.catalog_mixins import (
+     ListDatabasesFromInfoSchemaMixin,
+     ListTablesFromInfoSchemaMixin,
+     _BaseInfoSchemaMixin,
+ )
+ from sqlframe.base.util import schema_, to_schema
+
+ if t.TYPE_CHECKING:
+     from google.cloud.bigquery import StandardSqlDataType
+
+     from sqlframe.bigquery.dataframe import BigQueryDataFrame  # noqa
+     from sqlframe.bigquery.session import BigQuerySession  # noqa
+
+
+ class BigQueryCatalog(
+     ListDatabasesFromInfoSchemaMixin["BigQuerySession", "BigQueryDataFrame"],
+     ListTablesFromInfoSchemaMixin["BigQuerySession", "BigQueryDataFrame"],
+     _BaseInfoSchemaMixin["BigQuerySession", "BigQueryDataFrame"],
+ ):
+     QUALIFY_INFO_SCHEMA_WITH_DATABASE = True
+     UPPERCASE_INFO_SCHEMA = True
+
+     def setCurrentCatalog(self, catalogName: str) -> None:
+         self.session.default_project = catalogName
+
+     def currentCatalog(self) -> str:
+         return self.session.default_project
+
+     def setCurrentDatabase(self, dbName: str) -> None:
+         self.session.default_dataset = dbName
+
+     def currentDatabase(self) -> str:
+         if not self.session.default_dataset:
+             raise ValueError(
+                 "No default dataset set. Define `default_dataset` when creating `BigQuerySession`."
+             )
+         return to_schema(self.session.default_dataset).db
+
+     @normalize(["tableName", "dbName"])
+     def listColumns(self, tableName: str, dbName: t.Optional[str] = None) -> t.List[Column]:
+         """Returns a list of columns for the given table/view in the specified database.
+
+         .. versionadded:: 2.0.0
+
+         Parameters
+         ----------
+         tableName : str
+             name of the table to list columns.
+
+             .. versionchanged:: 3.4.0
+                 Allow ``tableName`` to be qualified with catalog name when ``dbName`` is None.
+
+         dbName : str, optional
+             name of the database to find the table to list columns.
+
+         Returns
+         -------
+         list
+             A list of :class:`Column`.
+
+         Notes
+         -----
+         The order of arguments here is different from that of its JVM counterpart
+         because Python does not support method overloading.
+
+         If no database is specified, the current database and catalog
+         are used. This API includes all temporary views.
+
+         Examples
+         --------
+         >>> _ = spark.sql("DROP TABLE IF EXISTS tblA")
+         >>> _ = spark.sql("CREATE TABLE tblA (name STRING, age INT) USING parquet")
+         >>> spark.catalog.listColumns("tblA")
+         [Column(name='name', description=None, dataType='string', nullable=True, ...
+         >>> _ = spark.sql("DROP TABLE tblA")
+         """
+
+         # Source: https://github.com/TobikoData/sqlmesh/blob/4bf5e7aa9302e877273812842eba0b457e28af9e/sqlmesh/core/engine_adapter/bigquery.py#L186-L205
+         def dtype_to_sql(dtype: t.Optional[StandardSqlDataType]) -> str:
+             assert dtype
+
+             kind = dtype.type_kind
+             assert kind
+
+             # Not using the enum value to preserve compatibility with older versions
+             # of the BigQuery library.
+             if kind.name == "ARRAY":
+                 return f"ARRAY<{dtype_to_sql(dtype.array_element_type)}>"
+             if kind.name == "STRUCT":
+                 struct_type = dtype.struct_type
+                 assert struct_type
+                 fields = ", ".join(
+                     f"{field.name} {dtype_to_sql(field.type)}" for field in struct_type.fields
+                 )
+                 return f"STRUCT<{fields}>"
+             if kind.name == "TYPE_KIND_UNSPECIFIED":
+                 return "JSON"
+             return kind.name
+
+         if df := self.session.temp_views.get(tableName):
+             return [
+                 Column(
+                     name=x,
+                     description=None,
+                     dataType="",
+                     nullable=True,
+                     isPartition=False,
+                     isBucket=False,
+                 )
+                 for x in df.columns
+             ]
+
+         table = exp.to_table(tableName, dialect=self.session.input_dialect)
+         schema = to_schema(dbName, dialect=self.session.input_dialect) if dbName else None
+         if not table.db:
+             if schema and schema.db:
+                 table.set("db", schema.args["db"])
+             else:
+                 table.set(
+                     "db",
+                     exp.parse_identifier(
+                         self.currentDatabase(), dialect=self.session.input_dialect
+                     ),
+                 )
+         if not table.catalog:
+             if schema and schema.catalog:
+                 table.set("catalog", schema.args["catalog"])
+             else:
+                 table.set(
+                     "catalog",
+                     exp.parse_identifier(self.currentCatalog(), dialect=self.session.input_dialect),
+                 )
+         bq_table = self.session._client.get_table(table=".".join(part.name for part in table.parts))
+         columns = [
+             Column(
+                 name=field.name,
+                 description=field.description,
+                 dataType=exp.DataType.build(
+                     dtype_to_sql(field.to_standard_sql().type), dialect=self.session.input_dialect
+                 ).sql(dialect=self.session.input_dialect),
+                 nullable=field.is_nullable,
+                 isPartition=False,
+                 isBucket=False,
+             )
+             for field in bq_table.schema
+         ]
+         if bq_table.time_partitioning and not bq_table.time_partitioning.field:
+             columns.append(
+                 Column(
+                     name="_PARTITIONTIME",
+                     description=None,
+                     dataType=exp.DataType.build("TIMESTAMP").sql(
+                         dialect=self.session.input_dialect
+                     ),
+                     nullable=False,
+                     isPartition=True,
+                     isBucket=False,
+                 )
+             )
+             if bq_table.time_partitioning.type_ == "DAY":
+                 columns.append(
+                     Column(
+                         name="_PARTITIONDATE",
+                         description=None,
+                         dataType=exp.DataType.build("DATE").sql(dialect=self.session.input_dialect),
+                         nullable=False,
+                         isPartition=True,
+                         isBucket=False,
+                     )
+                 )
+         return columns
+
+     def listCatalogs(self, pattern: t.Optional[str] = None) -> t.List[CatalogMetadata]:
+         return [CatalogMetadata(name=self.session.default_project, description=None)]
+
+     def listFunctions(
+         self, dbName: t.Optional[str] = None, pattern: t.Optional[str] = None
+     ) -> t.List[Function]:
+         """
+         Returns a list of functions registered in the specified database.
+
+         .. versionadded:: 3.4.0
+
+         Parameters
+         ----------
+         dbName : str
+             name of the database to list the functions.
+             ``dbName`` can be qualified with catalog name.
+         pattern : str
+             The pattern that the function name needs to match.
+
+             .. versionchanged:: 3.5.0
+                 Adds ``pattern`` argument.
+
+         Returns
+         -------
+         list
+             A list of :class:`Function`.
+
+         Notes
+         -----
+         If no database is specified, the current database and catalog
+         are used. This API includes all temporary functions.
+
+         Examples
+         --------
+         >>> spark.catalog.listFunctions()
+         [Function(name=...
+
+         >>> spark.catalog.listFunctions(pattern="to_*")
+         [Function(name=...
+
+         >>> spark.catalog.listFunctions(pattern="*not_existing_func*")
+         []
+         """
+         if not dbName:
+             schema = schema_(
+                 db=exp.parse_identifier(
+                     self.currentDatabase(), dialect=self.session.input_dialect
+                 ),
+                 catalog=exp.parse_identifier(
+                     self.currentCatalog(), dialect=self.session.input_dialect
+                 ),
+             )
+         else:
+             schema = to_schema(dbName, dialect=self.session.input_dialect)
+         table = self._get_info_schema_table("routines", database=schema.db)
+         select = (
+             exp.select("routine_name", "specific_schema", "specific_catalog")
+             .from_(table)
+             .where(exp.column("specific_schema").eq(schema.db))
+         )
+         if schema.catalog:
+             select = select.where(exp.column("specific_catalog").eq(schema.catalog))
+         functions = self.session._fetch_rows(select)
+         if pattern:
+             functions = [x for x in functions if fnmatch.fnmatch(x["routine_name"], pattern)]
+         return [
+             Function(
+                 name=x["routine_name"],
+                 catalog=x["specific_catalog"],
+                 namespace=[x["specific_schema"]],
+                 description=None,
+                 className="",
+                 isTemporary=False,
+             )
+             for x in functions
+         ]
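
Note that listFunctions filters names with Python's fnmatch after fetching from INFORMATION_SCHEMA.ROUTINES, so patterns use shell-style wildcards rather than SQL LIKE syntax. A self-contained sketch of the matching semantics the method relies on (example names are hypothetical):

import fnmatch

names = ["to_date", "to_timestamp", "parse_json"]
print([n for n in names if fnmatch.fnmatch(n, "to_*")])    # ['to_date', 'to_timestamp']
print([n for n in names if fnmatch.fnmatch(n, "*json*")])  # ['parse_json']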
sqlframe/bigquery/column.py
@@ -0,0 +1 @@
+ from sqlframe.base.column import Column
sqlframe/bigquery/dataframe.py
@@ -0,0 +1,54 @@
+ from __future__ import annotations
+
+ import logging
+ import sys
+ import typing as t
+
+ from sqlframe.base.dataframe import (
+     _BaseDataFrame,
+     _BaseDataFrameNaFunctions,
+     _BaseDataFrameStatFunctions,
+ )
+ from sqlframe.bigquery.group import BigQueryGroupedData
+
+ if sys.version_info >= (3, 11):
+     from typing import Self
+ else:
+     from typing_extensions import Self
+
+ if t.TYPE_CHECKING:
+     from sqlframe.bigquery.readwriter import BigQueryDataFrameWriter
+     from sqlframe.bigquery.session import BigQuerySession
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class BigQueryDataFrameNaFunctions(_BaseDataFrameNaFunctions["BigQueryDataFrame"]):
+     pass
+
+
+ class BigQueryDataFrameStatFunctions(_BaseDataFrameStatFunctions["BigQueryDataFrame"]):
+     pass
+
+
+ class BigQueryDataFrame(
+     _BaseDataFrame[
+         "BigQuerySession",
+         "BigQueryDataFrameWriter",
+         "BigQueryDataFrameNaFunctions",
+         "BigQueryDataFrameStatFunctions",
+         "BigQueryGroupedData",
+     ]
+ ):
+     _na = BigQueryDataFrameNaFunctions
+     _stat = BigQueryDataFrameStatFunctions
+     _group_data = BigQueryGroupedData
+
+     def cache(self) -> Self:
+         logger.warning("BigQuery does not support caching. Ignoring cache() call.")
+         return self
+
+     def persist(self) -> Self:
+         logger.warning("BigQuery does not support persist. Ignoring persist() call.")
+         return self
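
cache() and persist() are deliberate no-ops that log a warning and return self, so PySpark-style method chains keep running unmodified on BigQuery. A hedged usage sketch (assumes an initialized BigQuerySession; the table name is hypothetical, and session.read.table is assumed from the readwriter mixins in this release):

# `session` is a BigQuerySession; the dataset/table below is made up.
df = session.read.table("my_dataset.sales")
df = df.cache().where("amount > 0")  # cache() logs a warning, then the chain proceeds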