fabricks-3.0.11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (176)
  1. fabricks/__init__.py +0 -0
  2. fabricks/api/__init__.py +11 -0
  3. fabricks/api/cdc/__init__.py +6 -0
  4. fabricks/api/cdc/nocdc.py +3 -0
  5. fabricks/api/cdc/scd1.py +3 -0
  6. fabricks/api/cdc/scd2.py +3 -0
  7. fabricks/api/context.py +27 -0
  8. fabricks/api/core.py +4 -0
  9. fabricks/api/deploy.py +3 -0
  10. fabricks/api/exceptions.py +19 -0
  11. fabricks/api/extenders.py +3 -0
  12. fabricks/api/job_schema.py +3 -0
  13. fabricks/api/log.py +3 -0
  14. fabricks/api/masks.py +3 -0
  15. fabricks/api/metastore/__init__.py +10 -0
  16. fabricks/api/metastore/database.py +3 -0
  17. fabricks/api/metastore/table.py +3 -0
  18. fabricks/api/metastore/view.py +6 -0
  19. fabricks/api/notebooks/__init__.py +0 -0
  20. fabricks/api/notebooks/cluster.py +6 -0
  21. fabricks/api/notebooks/initialize.py +42 -0
  22. fabricks/api/notebooks/process.py +54 -0
  23. fabricks/api/notebooks/run.py +59 -0
  24. fabricks/api/notebooks/schedule.py +75 -0
  25. fabricks/api/notebooks/terminate.py +31 -0
  26. fabricks/api/parsers.py +3 -0
  27. fabricks/api/schedules.py +3 -0
  28. fabricks/api/udfs.py +3 -0
  29. fabricks/api/utils.py +9 -0
  30. fabricks/api/version.py +3 -0
  31. fabricks/api/views.py +6 -0
  32. fabricks/cdc/__init__.py +14 -0
  33. fabricks/cdc/base/__init__.py +4 -0
  34. fabricks/cdc/base/_types.py +10 -0
  35. fabricks/cdc/base/cdc.py +5 -0
  36. fabricks/cdc/base/configurator.py +223 -0
  37. fabricks/cdc/base/generator.py +177 -0
  38. fabricks/cdc/base/merger.py +110 -0
  39. fabricks/cdc/base/processor.py +471 -0
  40. fabricks/cdc/cdc.py +5 -0
  41. fabricks/cdc/nocdc.py +20 -0
  42. fabricks/cdc/scd.py +22 -0
  43. fabricks/cdc/scd1.py +15 -0
  44. fabricks/cdc/scd2.py +15 -0
  45. fabricks/cdc/templates/__init__.py +0 -0
  46. fabricks/cdc/templates/ctes/base.sql.jinja +35 -0
  47. fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
  48. fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
  49. fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
  50. fabricks/cdc/templates/ctes/rectify.sql.jinja +113 -0
  51. fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
  52. fabricks/cdc/templates/filter.sql.jinja +4 -0
  53. fabricks/cdc/templates/filters/final.sql.jinja +4 -0
  54. fabricks/cdc/templates/filters/latest.sql.jinja +17 -0
  55. fabricks/cdc/templates/filters/update.sql.jinja +30 -0
  56. fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
  57. fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
  58. fabricks/cdc/templates/merge.sql.jinja +3 -0
  59. fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
  60. fabricks/cdc/templates/merges/scd1.sql.jinja +73 -0
  61. fabricks/cdc/templates/merges/scd2.sql.jinja +54 -0
  62. fabricks/cdc/templates/queries/__init__.py +0 -0
  63. fabricks/cdc/templates/queries/context.sql.jinja +186 -0
  64. fabricks/cdc/templates/queries/final.sql.jinja +1 -0
  65. fabricks/cdc/templates/queries/nocdc/complete.sql.jinja +10 -0
  66. fabricks/cdc/templates/queries/nocdc/update.sql.jinja +34 -0
  67. fabricks/cdc/templates/queries/scd1.sql.jinja +85 -0
  68. fabricks/cdc/templates/queries/scd2.sql.jinja +98 -0
  69. fabricks/cdc/templates/query.sql.jinja +15 -0
  70. fabricks/context/__init__.py +72 -0
  71. fabricks/context/_types.py +133 -0
  72. fabricks/context/config/__init__.py +92 -0
  73. fabricks/context/config/utils.py +53 -0
  74. fabricks/context/log.py +77 -0
  75. fabricks/context/runtime.py +117 -0
  76. fabricks/context/secret.py +103 -0
  77. fabricks/context/spark_session.py +82 -0
  78. fabricks/context/utils.py +80 -0
  79. fabricks/core/__init__.py +4 -0
  80. fabricks/core/dags/__init__.py +9 -0
  81. fabricks/core/dags/base.py +99 -0
  82. fabricks/core/dags/generator.py +157 -0
  83. fabricks/core/dags/log.py +12 -0
  84. fabricks/core/dags/processor.py +228 -0
  85. fabricks/core/dags/run.py +39 -0
  86. fabricks/core/dags/terminator.py +25 -0
  87. fabricks/core/dags/utils.py +54 -0
  88. fabricks/core/extenders.py +33 -0
  89. fabricks/core/job_schema.py +32 -0
  90. fabricks/core/jobs/__init__.py +21 -0
  91. fabricks/core/jobs/base/__init__.py +10 -0
  92. fabricks/core/jobs/base/_types.py +284 -0
  93. fabricks/core/jobs/base/checker.py +139 -0
  94. fabricks/core/jobs/base/configurator.py +306 -0
  95. fabricks/core/jobs/base/exception.py +85 -0
  96. fabricks/core/jobs/base/generator.py +447 -0
  97. fabricks/core/jobs/base/invoker.py +206 -0
  98. fabricks/core/jobs/base/job.py +5 -0
  99. fabricks/core/jobs/base/processor.py +249 -0
  100. fabricks/core/jobs/bronze.py +395 -0
  101. fabricks/core/jobs/get_job.py +127 -0
  102. fabricks/core/jobs/get_job_conf.py +152 -0
  103. fabricks/core/jobs/get_job_id.py +31 -0
  104. fabricks/core/jobs/get_jobs.py +107 -0
  105. fabricks/core/jobs/get_schedule.py +10 -0
  106. fabricks/core/jobs/get_schedules.py +32 -0
  107. fabricks/core/jobs/gold.py +415 -0
  108. fabricks/core/jobs/silver.py +373 -0
  109. fabricks/core/masks.py +52 -0
  110. fabricks/core/parsers/__init__.py +12 -0
  111. fabricks/core/parsers/_types.py +6 -0
  112. fabricks/core/parsers/base.py +95 -0
  113. fabricks/core/parsers/decorator.py +11 -0
  114. fabricks/core/parsers/get_parser.py +26 -0
  115. fabricks/core/parsers/utils.py +69 -0
  116. fabricks/core/schedules/__init__.py +14 -0
  117. fabricks/core/schedules/diagrams.py +21 -0
  118. fabricks/core/schedules/generate.py +20 -0
  119. fabricks/core/schedules/get_schedule.py +5 -0
  120. fabricks/core/schedules/get_schedules.py +9 -0
  121. fabricks/core/schedules/process.py +9 -0
  122. fabricks/core/schedules/run.py +3 -0
  123. fabricks/core/schedules/terminate.py +6 -0
  124. fabricks/core/schedules/views.py +61 -0
  125. fabricks/core/steps/__init__.py +4 -0
  126. fabricks/core/steps/_types.py +7 -0
  127. fabricks/core/steps/base.py +423 -0
  128. fabricks/core/steps/get_step.py +10 -0
  129. fabricks/core/steps/get_step_conf.py +26 -0
  130. fabricks/core/udfs.py +106 -0
  131. fabricks/core/views.py +41 -0
  132. fabricks/deploy/__init__.py +92 -0
  133. fabricks/deploy/masks.py +8 -0
  134. fabricks/deploy/notebooks.py +71 -0
  135. fabricks/deploy/schedules.py +10 -0
  136. fabricks/deploy/tables.py +82 -0
  137. fabricks/deploy/udfs.py +19 -0
  138. fabricks/deploy/utils.py +36 -0
  139. fabricks/deploy/views.py +509 -0
  140. fabricks/metastore/README.md +3 -0
  141. fabricks/metastore/__init__.py +5 -0
  142. fabricks/metastore/_types.py +65 -0
  143. fabricks/metastore/database.py +65 -0
  144. fabricks/metastore/dbobject.py +66 -0
  145. fabricks/metastore/pyproject.toml +20 -0
  146. fabricks/metastore/table.py +768 -0
  147. fabricks/metastore/utils.py +51 -0
  148. fabricks/metastore/view.py +53 -0
  149. fabricks/utils/__init__.py +0 -0
  150. fabricks/utils/_types.py +6 -0
  151. fabricks/utils/azure_queue.py +93 -0
  152. fabricks/utils/azure_table.py +154 -0
  153. fabricks/utils/console.py +51 -0
  154. fabricks/utils/fdict.py +240 -0
  155. fabricks/utils/helpers.py +228 -0
  156. fabricks/utils/log.py +236 -0
  157. fabricks/utils/mermaid.py +32 -0
  158. fabricks/utils/path.py +242 -0
  159. fabricks/utils/pip.py +61 -0
  160. fabricks/utils/pydantic.py +94 -0
  161. fabricks/utils/read/__init__.py +11 -0
  162. fabricks/utils/read/_types.py +3 -0
  163. fabricks/utils/read/read.py +305 -0
  164. fabricks/utils/read/read_excel.py +5 -0
  165. fabricks/utils/read/read_yaml.py +33 -0
  166. fabricks/utils/schema/__init__.py +7 -0
  167. fabricks/utils/schema/get_json_schema_for_type.py +161 -0
  168. fabricks/utils/schema/get_schema_for_type.py +99 -0
  169. fabricks/utils/spark.py +76 -0
  170. fabricks/utils/sqlglot.py +56 -0
  171. fabricks/utils/write/__init__.py +8 -0
  172. fabricks/utils/write/delta.py +46 -0
  173. fabricks/utils/write/stream.py +27 -0
  174. fabricks-3.0.11.dist-info/METADATA +23 -0
  175. fabricks-3.0.11.dist-info/RECORD +176 -0
  176. fabricks-3.0.11.dist-info/WHEEL +4 -0
fabricks/utils/read/read_excel.py
@@ -0,0 +1,5 @@
+from pyspark.sql import DataFrame
+
+
+def read_excel(path: str) -> DataFrame:
+    raise NotImplementedError()
fabricks/utils/read/read_yaml.py
@@ -0,0 +1,33 @@
+from typing import Iterable, Optional, cast
+
+import yaml
+
+from fabricks.utils.path import Path
+
+
+def read_yaml(
+    path: Path,
+    root: Optional[str] = None,
+    preferred_file_name: Optional[str] = None,
+) -> Iterable[dict]:
+    found = False
+
+    for file in path.walk():
+        if not file.endswith(".yml"):
+            continue
+
+        if preferred_file_name is not None and preferred_file_name not in file:
+            continue
+
+        found = True
+
+        with open(file, "r", encoding="utf-8") as f:
+            data = yaml.safe_load(f)
+            for job_config in data:
+                if root:
+                    yield cast(dict, job_config[root])
+                else:
+                    yield cast(dict, job_config)
+
+    if preferred_file_name is not None and not found:
+        yield from read_yaml(path=path, root=root, preferred_file_name=None)
fabricks/utils/schema/__init__.py
@@ -0,0 +1,7 @@
+from fabricks.utils.schema.get_json_schema_for_type import get_json_schema_for_type
+from fabricks.utils.schema.get_schema_for_type import get_schema_for_type
+
+__all__ = [
+    "get_json_schema_for_type",
+    "get_schema_for_type",
+]
fabricks/utils/schema/get_json_schema_for_type.py
@@ -0,0 +1,161 @@
+import dataclasses
+import datetime
+import logging
+import sys
+import types
+from typing import Any, ForwardRef, Literal, Type, Union, get_type_hints
+from uuid import UUID
+
+LOGGER = logging.getLogger(__name__)
+
+
+def get_json_schema_for_type(proptype: Type):
+    def_list: dict[str, dict] = {}
+    schema = _get_json_schema_for_type(proptype, def_list, is_root=True)
+    schema["$defs"] = def_list
+    schema["$schema"] = "https://json-schema.org/draft/2020-12/schema"
+    return schema
+
+
+def _get_json_schema_for_type(proptype: Type, def_list: dict[str, dict], is_root: bool, is_nullable=False) -> dict:
+    def _fixref(input: dict) -> dict:
+        if "type" in input:
+            if "$ref" in input["type"]:
+                return input["type"]
+        return input
+
+    def _may_null(input: dict, is_nullable: bool) -> dict:
+        if is_nullable:
+            return {"oneOf": [{"type": "null"}, input]}
+        return input
+
+    if hasattr(proptype, "__origin__") and proptype.__origin__ == Literal:
+        return {"enum": proptype.__args__}
+
+    if hasattr(proptype, "__origin__") and proptype.__origin__ == tuple:  # noqa E721
+        return {
+            "type": "array",
+            "minItems": len(proptype.__args__),
+            "maxItems": len(proptype.__args__),
+            "additionalItems": False,
+            "prefixItems": [_get_json_schema_for_type(t, def_list, is_root=False) for t in proptype.__args__],
+        }
+
+    if (sys.version_info >= (3, 10) and isinstance(proptype, types.UnionType)) or (
+        hasattr(proptype, "__origin__") and proptype.__origin__ == Union
+    ):
+        if len(proptype.__args__) == 2 and proptype.__args__[0] == type(None):  # noqa E721
+            t = _get_json_schema_for_type(proptype.__args__[1], def_list, is_root=False, is_nullable=True)
+            return t
+
+        if len(proptype.__args__) == 2 and proptype.__args__[1] == type(None):  # noqa E721
+            t = _get_json_schema_for_type(proptype.__args__[0], def_list, is_root=False, is_nullable=True)
+            return t
+
+        one_of_types = [
+            _get_json_schema_for_type(f, def_list, is_root=False, is_nullable=False) for f in proptype.__args__
+        ]
+
+        return {"oneOf": one_of_types}
+
+    if proptype == type(None):  # noqa E721
+        return {"type": "null"}
+
+    if proptype == str:  # noqa E721
+        return {"type": "string"} if not is_nullable else {"type": ["string", "null"]}
+
+    if proptype == Any:
+        return {}
+
+    if proptype == UUID:
+        return {
+            "type": "string" if not is_nullable else ["string", "null"],
+            "format": "uuid",
+        }
+
+    if proptype == int:  # noqa E721
+        return {"type": "integer" if not is_nullable else ["integer", "null"]}
+
+    if proptype == float:  # noqa E721
+        return {"type": "number" if not is_nullable else ["number", "null"]}
+
+    if proptype == bool:  # noqa E721
+        return {"type": "boolean" if not is_nullable else ["boolean", "null"]}
+
+    if hasattr(proptype, "__origin__") and proptype.__origin__ == list:  # noqa E721
+        return {
+            "type": "array",
+            "items": _get_json_schema_for_type(proptype.__args__[0], def_list, is_root=False),
+        }
+
+    if hasattr(proptype, "__bases__") and len(proptype.__bases__) == 1 and proptype.__bases__[0] == dict:  # noqa E721
+        typehints = get_type_hints(proptype)
+        props = {k: _get_json_schema_for_type(v, def_list, is_root=False) for (k, v) in typehints.items()}
+
+        if hasattr(proptype, "__name__") and not is_root:
+            def_list[proptype.__name__] = {"type": "object", "properties": props}
+            return _may_null({"$ref": "#/$defs/" + proptype.__name__}, is_nullable)
+        else:
+            return _may_null({"type": "object", "properties": props}, is_nullable)
+
+    if dataclasses.is_dataclass(proptype):
+        required = [
+            f.name
+            for f in dataclasses.fields(proptype)
+            if f.default == dataclasses.MISSING and f.default_factory == dataclasses.MISSING and f.init
+        ]
+        definition = {
+            "type": "object",
+            "required": required,
+            "additionalProperties": False,
+            "properties": {
+                f.name: _get_json_schema_for_type(f.type, def_list, is_root=False)  # type: ignore
+                for f in dataclasses.fields(proptype)
+            },
+        }
+
+        if is_root:
+            return definition
+        else:
+            def_list[proptype.__name__] = definition
+
+        return _may_null({"$ref": "#/$defs/" + proptype.__name__}, is_nullable)
+
+    if hasattr(proptype, "__origin__") and proptype.__origin__ == dict and len(proptype.__args__) == 2:  # noqa E721
+        keytype = proptype.__args__[0]
+        if keytype != str and keytype != UUID:  # noqa E721
+            raise NotImplementedError()
+        valuetype = proptype.__args__[1]
+        return _may_null(
+            {
+                "type": "object",
+                "additionalProperties": _fixref(
+                    {"type": _get_json_schema_for_type(valuetype, def_list, is_root=False)}
+                ),
+            },
+            is_nullable,
+        )
+
+    if isinstance(proptype, ForwardRef):
+        arg = proptype.__forward_arg__
+        return _may_null({"$ref": "#/$defs/" + arg}, is_nullable)
+
+    if proptype == datetime.datetime:
+        return {
+            "type": "string" if not is_nullable else ["string", "null"],
+            "format": "date-time",
+        }
+
+    if proptype == datetime.time:
+        return {
+            "type": "string" if not is_nullable else ["string", "null"],
+            "format": "time",
+        }
+
+    if proptype == datetime.date:
+        return {
+            "type": "string" if not is_nullable else ["string", "null"],
+            "format": "date",
+        }
+
+    return {}
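
A minimal usage sketch for the converter above (not part of the wheel); the `Job` dataclass and its fields are invented for illustration:

    import dataclasses
    from typing import Optional

    from fabricks.utils.schema import get_json_schema_for_type


    @dataclasses.dataclass
    class Job:  # hypothetical example type, not defined in fabricks
        name: str
        retries: int = 0
        comment: Optional[str] = None


    schema = get_json_schema_for_type(Job)
    # Fields without defaults end up in "required"; Optional fields become nullable,
    # and the top-level dict carries "$schema" and "$defs" as set in the wrapper above.
    assert schema["required"] == ["name"]
    assert schema["properties"]["comment"]["type"] == ["string", "null"]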
fabricks/utils/schema/get_schema_for_type.py
@@ -0,0 +1,99 @@
+import dataclasses
+from typing import List, Literal, Type, Union, cast, get_type_hints, overload
+
+from pyspark.sql.types import (
+    ArrayType,
+    BooleanType,
+    DataType,
+    DoubleType,
+    LongType,
+    MapType,
+    NullType,
+    StringType,
+    StructField,
+    StructType,
+)
+
+
+@overload
+def get_schema_for_type(proptype: Union[int, str, float, bool]) -> DataType: ...
+
+
+@overload
+def get_schema_for_type(proptype: Type) -> StructType: ...
+
+
+def _merge_struct_types(types: List[DataType]):
+    not_none_types = [t for t in types if type(t) != type(NullType())]  # noqa: E721
+
+    assert len([f for f in not_none_types if not isinstance(f, StructType)]) == 0
+    all_fields: List[StructField] = []
+
+    for subtype in not_none_types:
+        fields = cast(StructType, subtype).fields
+        for field in fields:
+            existing_field = next((f for f in all_fields if f.name == field.name), None)
+            if existing_field is not None and (
+                type(existing_field.dataType) != type(field.dataType)  # noqa: E721
+                or isinstance(existing_field.dataType, StructType)
+            ):
+                new_type = _merge_struct_types([existing_field.dataType, field.dataType])
+                all_fields.append(StructField(name=field.name, dataType=new_type))
+                all_fields.remove(existing_field)
+            else:
+                assert existing_field is None or type(existing_field.dataType) == type(field.dataType)  # noqa: E721
+                if existing_field is None:
+                    all_fields.append(field)
+
+    return StructType(fields=all_fields)
+
+
+def get_schema_for_type(proptype: Type) -> DataType:  # type: ignore
+    if hasattr(proptype, "__origin__") and proptype.__origin__ == Literal:
+        return get_schema_for_type(type(proptype.__args__[0]))  # For literal types we assume first type is correct
+
+    if hasattr(proptype, "__origin__") and proptype.__origin__ == Union:
+        if len(proptype.__args__) == 2 and proptype.__args__[0] == type(None):  # noqa E721
+            return get_schema_for_type(proptype.__args__[1])
+        if len(proptype.__args__) == 2 and proptype.__args__[1] == type(None):  # noqa E721
+            return get_schema_for_type(proptype.__args__[0])
+
+        return _merge_struct_types([get_schema_for_type(f) for f in proptype.__args__])
+
+    if proptype == type(None):  # noqa E721
+        return NullType()
+
+    if proptype == str:  # noqa E721
+        return StringType()
+
+    if proptype == int:  # noqa E721
+        return LongType()
+
+    if proptype == float:  # noqa E721
+        return DoubleType()
+
+    if proptype == bool:  # noqa E721
+        return BooleanType()
+
+    if hasattr(proptype, "__origin__") and proptype.__origin__ == list:  # noqa E721
+        return ArrayType(get_schema_for_type(proptype.__args__[0]))
+
+    if proptype == dict[str, str]:
+        return MapType(StringType(), StringType())
+
+    if hasattr(proptype, "__bases__") and len(proptype.__bases__) == 1 and proptype.__bases__[0] == dict:  # noqa E721
+        types = get_type_hints(proptype)
+        fields = [StructField(k, get_schema_for_type(v)) for k, v in types.items()]
+        return StructType(fields=fields)
+
+    if dataclasses.is_dataclass(proptype):
+        fields = [StructField(f.name, get_schema_for_type(f.type)) for f in dataclasses.fields(proptype)]
+        return StructType(fields=fields)
+
+    if hasattr(proptype, "__origin__") and proptype.__origin__ == dict:  # noqa E721
+        if len(proptype.__args__) == 2:
+            value_type = proptype.__args__[1]
+            value_schema = get_schema_for_type(value_type)
+            return MapType(StringType(), value_schema)
+
+    raise NotImplementedError()
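
Likewise, a small sketch for the Spark-schema variant (illustrative only; the `Options` TypedDict is invented):

    from typing import List, Optional, TypedDict

    from fabricks.utils.schema import get_schema_for_type


    class Options(TypedDict):  # hypothetical example type, not defined in fabricks
        mode: str
        retries: int
        tags: List[str]
        comment: Optional[str]


    schema = get_schema_for_type(Options)
    # TypedDicts hit the dict-based branch above: str -> StringType, int -> LongType,
    # List[str] -> ArrayType(StringType), Optional[str] -> StringType.
    print(schema.simpleString())  # struct<mode:string,retries:bigint,tags:array<string>,comment:string>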
fabricks/utils/spark.py
@@ -0,0 +1,76 @@
+import os
+from typing import Final, Optional
+
+from databricks.sdk.dbutils import RemoteDbUtils
+from pyspark.sql import DataFrame, SparkSession
+
+DATABRICKS_LOCALMODE: Final[bool] = os.getenv("DATABRICKS_LOCALMODE", "false").lower() in ("true", "1", "yes")
+
+
+def get_spark() -> SparkSession:
+    if DATABRICKS_LOCALMODE:
+        from databricks.connect.session import DatabricksSession
+        from databricks.sdk.core import Config
+
+        profile = os.getenv("DATABRICKS_PROFILE", "DEFAULT")
+
+        cluster_id = os.getenv("DATABRICKS_CLUSTER_ID")
+        assert cluster_id, "DATABRICKS_CLUSTER_ID environment variable is not set"
+
+        c = Config(profile=profile, cluster_id=cluster_id)
+
+        spark = DatabricksSession.builder.sdkConfig(c).getOrCreate()
+
+    else:
+        pass
+
+        spark = SparkSession.builder.getOrCreate()  # type: ignore
+
+    assert spark is not None
+    return spark  # type: ignore
+
+
+def display(df: DataFrame, limit: Optional[int] = None) -> None:
+    """
+    Display a Spark DataFrame in Databricks notebook or local environment.
+    If running in local mode, it converts the DataFrame to a Pandas DataFrame for display.
+    """
+    if DATABRICKS_LOCALMODE:
+        from IPython.display import display
+
+        if limit is not None:
+            df = df.limit(limit)
+
+        display(df.toPandas())
+
+    else:
+        from databricks.sdk.runtime import display
+
+        if limit is not None:
+            df = df.limit(limit)
+
+        display(df)
+
+
+def get_dbutils(spark: Optional[SparkSession] = None) -> Optional[RemoteDbUtils]:
+    try:
+        if DATABRICKS_LOCALMODE:
+            from databricks.sdk import WorkspaceClient
+
+            w = WorkspaceClient()
+            dbutils = w.dbutils
+
+        else:
+            from pyspark.dbutils import DBUtils
+
+            dbutils = DBUtils(spark)
+
+        assert dbutils is not None
+        return dbutils  # type: ignore
+
+    except Exception:
+        return None
+
+
+spark = get_spark()
+dbutils = get_dbutils(spark=spark)
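
Because `spark` and `dbutils` are built at module level, the environment switches have to be set before the first import. A hedged sketch (profile and cluster id are placeholders, and Databricks Connect is assumed to be installed for local mode):

    import os

    # Must be set before importing fabricks.utils.spark, since spark/dbutils are created on import.
    os.environ["DATABRICKS_LOCALMODE"] = "true"                    # route get_spark() through Databricks Connect
    os.environ["DATABRICKS_PROFILE"] = "DEFAULT"                   # profile in ~/.databrickscfg (placeholder)
    os.environ["DATABRICKS_CLUSTER_ID"] = "0123-456789-abcdefgh"   # placeholder cluster id

    from fabricks.utils.spark import display, spark

    display(spark.range(5), limit=3)  # in local mode this renders via pandas / IPython.display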
fabricks/utils/sqlglot.py
@@ -0,0 +1,56 @@
+from typing import List, Optional
+
+from sqlglot import exp, parse_one, transpile
+from sqlglot.dialects.databricks import Databricks
+
+
+class Fabricks(Databricks):
+    class Generator(Databricks.Generator):
+        EXPRESSIONS_WITHOUT_NESTED_CTES = {
+            exp.Insert,
+            exp.Union,
+        }
+
+
+def fix(sql: str, keep_comments: bool = True):
+    sql = transpile(
+        sql,
+        "fabricks",
+        identify=True,
+        pretty=True,
+        normalize=False,
+        normalize_functions="lower",
+        leading_comma=True,
+        max_text_width=119,
+        comments=keep_comments,
+    )[0]
+    return sql
+
+
+def is_global_temp_view(sql: str):
+    tables = parse_one(sql, dialect="fabricks").find_all(exp.Table)
+    for t in tables:
+        return "global_temp" in str(t)
+
+
+def get_global_temp_view(sql: str) -> Optional[str]:
+    tables = parse_one(sql, dialect="fabricks").find_all(exp.Table)
+    for t in tables:
+        if "global_temp" in str(t):
+            return str(t)
+
+
+def parse(sql: str) -> exp.Expression:
+    return parse_one(sql, dialect="fabricks")
+
+
+def get_tables(sql: str, allowed_databases: Optional[List[str]] = None) -> List[str]:
+    tables = set()
+    for table in parse(sql).find_all(exp.Table):
+        if len(table.db) > 0:  # exclude CTEs
+            if allowed_databases:
+                if table.db not in allowed_databases:
+                    continue
+            tables.add(f"{table.db}.{table.name}")
+    tables = list(tables)
+    return tables
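
These helpers operate on plain SQL strings; a short sketch (table names invented for illustration):

    from fabricks.utils.sqlglot import fix, get_tables

    sql = "select j.id, e.value from gold.jobs j join silver.events e on e.job_id = j.id"

    print(get_tables(sql))                              # e.g. ['gold.jobs', 'silver.events'] (set order not guaranteed)
    print(get_tables(sql, allowed_databases=["gold"]))  # ['gold.jobs']

    # fix() round-trips the statement through the Fabricks dialect: quoted identifiers,
    # pretty-printed output, leading commas, lower-cased function names.
    print(fix(sql))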
fabricks/utils/write/__init__.py
@@ -0,0 +1,8 @@
+from fabricks.utils.write.delta import append_delta, overwrite_delta
+from fabricks.utils.write.stream import write_stream
+
+__all__ = [
+    "append_delta",
+    "overwrite_delta",
+    "write_stream",
+]
fabricks/utils/write/delta.py
@@ -0,0 +1,46 @@
+from typing import List, Optional, Union, get_args
+
+from pyspark.sql import DataFrame
+
+from fabricks.utils.path import Path
+from fabricks.utils.read._types import AllowedIOModes
+
+
+def write_delta(
+    df: DataFrame,
+    path: Path,
+    mode: AllowedIOModes,
+    options: Optional[dict[str, str]] = None,
+    partition_by: Union[Optional[List[str]], str] = None,
+):
+    assert mode in list(get_args(AllowedIOModes))
+
+    if isinstance(partition_by, str):
+        partition_by = [partition_by]
+
+    writer = df.write.format("delta").mode(mode).option("mergeSchema", "True").option("overwriteSchema", "True")
+    if partition_by:
+        writer = writer.partitionBy(*partition_by)
+    if options:
+        for key, value in options.items():
+            writer = writer.option(key, value)
+
+    writer.save(path.string)
+
+
+def append_delta(
+    df: DataFrame,
+    path: Path,
+    options: Optional[dict[str, str]] = None,
+    partition_by: Union[Optional[List[str]], str] = None,
+):
+    write_delta(df, path, "append", options=options, partition_by=partition_by)
+
+
+def overwrite_delta(
+    df: DataFrame,
+    path: Path,
+    options: Optional[dict[str, str]] = None,
+    partition_by: Union[Optional[List[str]], str] = None,
+):
+    write_delta(df, path, "overwrite", options=options, partition_by=partition_by)
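
append_delta and overwrite_delta are thin wrappers over the standard Delta writer. Roughly the equivalent raw PySpark, shown without the fabricks Path type (the target path and sample data are placeholders, and a Delta-enabled Spark session is assumed):

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])

    (
        df.write.format("delta")
        .mode("append")                          # or "overwrite", as in overwrite_delta
        .option("mergeSchema", "True")
        .option("overwriteSchema", "True")
        .partitionBy("id")
        .save("/tmp/fabricks_demo/delta_table")  # write_delta passes path.string here
    )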
fabricks/utils/write/stream.py
@@ -0,0 +1,27 @@
+from typing import Callable, Optional
+
+from pyspark.sql import DataFrame
+from pyspark.sql.streaming.query import StreamingQuery
+
+from fabricks.utils.path import Path
+
+
+def write_stream(
+    df: DataFrame,
+    checkpoints_path: Path,
+    func: Callable,
+    timeout: Optional[int] = 18000,
+) -> StreamingQuery:
+    if timeout is None:
+        timeout = 18000
+
+    assert timeout is not None
+
+    query = (
+        df.writeStream.foreachBatch(func)
+        .option("checkpointLocation", checkpoints_path.string)
+        .trigger(once=True)
+        .start()
+    )
+    query.awaitTermination(timeout=timeout)
+    return query
fabricks-3.0.11.dist-info/METADATA
@@ -0,0 +1,23 @@
+Metadata-Version: 2.4
+Name: fabricks
+Version: 3.0.11
+Author-email: BMS DWH Team <bi_support@bmsuisse.ch>
+Requires-Python: <4,>=3.9
+Requires-Dist: azure-data-tables<13,>=12.5.0
+Requires-Dist: azure-identity>=1.10.0
+Requires-Dist: azure-storage-blob>=12.14.1
+Requires-Dist: azure-storage-queue<13,>=12.10.0
+Requires-Dist: databricks-cli>=0.18.0
+Requires-Dist: databricks-sdk>=0.20.0
+Requires-Dist: importlib-metadata>=8.6.1
+Requires-Dist: jinja2>=2.11.3
+Requires-Dist: mermaid-magic>=0.1.4
+Requires-Dist: pydantic-settings
+Requires-Dist: pydantic-yaml>=1.4.0
+Requires-Dist: pydantic>=1.10.21
+Requires-Dist: python-dotenv>=1.0.1
+Requires-Dist: pyyaml>=6.0.0
+Requires-Dist: sqlglot>=22.1.1
+Requires-Dist: tenacity>=9.1.2
+Requires-Dist: tomli>=2.2.1; python_version < '3.11'
+Requires-Dist: tqdm>=4.67.1