fabricks-2024.7.1.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (154)
  1. fabricks/__init__.py +0 -0
  2. fabricks/api/__init__.py +7 -0
  3. fabricks/api/cdc/__init__.py +6 -0
  4. fabricks/api/cdc/nocdc.py +3 -0
  5. fabricks/api/cdc/scd1.py +3 -0
  6. fabricks/api/cdc/scd2.py +3 -0
  7. fabricks/api/context.py +31 -0
  8. fabricks/api/core.py +4 -0
  9. fabricks/api/extenders.py +3 -0
  10. fabricks/api/log.py +3 -0
  11. fabricks/api/metastore/__init__.py +10 -0
  12. fabricks/api/metastore/database.py +3 -0
  13. fabricks/api/metastore/table.py +3 -0
  14. fabricks/api/metastore/view.py +6 -0
  15. fabricks/api/notebooks/__init__.py +0 -0
  16. fabricks/api/notebooks/cluster.py +6 -0
  17. fabricks/api/notebooks/deploy/__init__.py +0 -0
  18. fabricks/api/notebooks/deploy/fabricks.py +147 -0
  19. fabricks/api/notebooks/deploy/notebooks.py +86 -0
  20. fabricks/api/notebooks/initialize.py +38 -0
  21. fabricks/api/notebooks/optimize.py +25 -0
  22. fabricks/api/notebooks/process.py +50 -0
  23. fabricks/api/notebooks/run.py +87 -0
  24. fabricks/api/notebooks/terminate.py +27 -0
  25. fabricks/api/notebooks/vacuum.py +25 -0
  26. fabricks/api/parsers.py +3 -0
  27. fabricks/api/udfs.py +3 -0
  28. fabricks/api/utils.py +9 -0
  29. fabricks/cdc/__init__.py +14 -0
  30. fabricks/cdc/base/__init__.py +4 -0
  31. fabricks/cdc/base/cdc.py +5 -0
  32. fabricks/cdc/base/configurator.py +145 -0
  33. fabricks/cdc/base/generator.py +117 -0
  34. fabricks/cdc/base/merger.py +107 -0
  35. fabricks/cdc/base/processor.py +338 -0
  36. fabricks/cdc/base/types.py +3 -0
  37. fabricks/cdc/cdc.py +5 -0
  38. fabricks/cdc/nocdc.py +19 -0
  39. fabricks/cdc/scd.py +21 -0
  40. fabricks/cdc/scd1.py +15 -0
  41. fabricks/cdc/scd2.py +15 -0
  42. fabricks/cdc/templates/__init__.py +0 -0
  43. fabricks/cdc/templates/merge/scd1.sql.jinja +72 -0
  44. fabricks/cdc/templates/merge/scd2.sql.jinja +54 -0
  45. fabricks/cdc/templates/merge.sql.jinja +2 -0
  46. fabricks/cdc/templates/query/__init__.py +0 -0
  47. fabricks/cdc/templates/query/base.sql.jinja +34 -0
  48. fabricks/cdc/templates/query/context.sql.jinja +95 -0
  49. fabricks/cdc/templates/query/current.sql.jinja +32 -0
  50. fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +21 -0
  51. fabricks/cdc/templates/query/deduplicate_key.sql.jinja +14 -0
  52. fabricks/cdc/templates/query/filter.sql.jinja +71 -0
  53. fabricks/cdc/templates/query/final.sql.jinja +1 -0
  54. fabricks/cdc/templates/query/hash.sql.jinja +1 -0
  55. fabricks/cdc/templates/query/nocdc.sql.jinja +10 -0
  56. fabricks/cdc/templates/query/rectify.sql.jinja +120 -0
  57. fabricks/cdc/templates/query/scd1.sql.jinja +112 -0
  58. fabricks/cdc/templates/query/scd2.sql.jinja +114 -0
  59. fabricks/cdc/templates/query.sql.jinja +11 -0
  60. fabricks/context/__init__.py +51 -0
  61. fabricks/context/log.py +26 -0
  62. fabricks/context/runtime.py +143 -0
  63. fabricks/context/spark.py +43 -0
  64. fabricks/context/types.py +123 -0
  65. fabricks/core/__init__.py +4 -0
  66. fabricks/core/dags/__init__.py +9 -0
  67. fabricks/core/dags/base.py +72 -0
  68. fabricks/core/dags/generator.py +154 -0
  69. fabricks/core/dags/log.py +14 -0
  70. fabricks/core/dags/processor.py +163 -0
  71. fabricks/core/dags/terminator.py +26 -0
  72. fabricks/core/deploy/__init__.py +12 -0
  73. fabricks/core/deploy/tables.py +76 -0
  74. fabricks/core/deploy/views.py +417 -0
  75. fabricks/core/extenders.py +29 -0
  76. fabricks/core/jobs/__init__.py +20 -0
  77. fabricks/core/jobs/base/__init__.py +10 -0
  78. fabricks/core/jobs/base/checker.py +89 -0
  79. fabricks/core/jobs/base/configurator.py +323 -0
  80. fabricks/core/jobs/base/error.py +16 -0
  81. fabricks/core/jobs/base/generator.py +391 -0
  82. fabricks/core/jobs/base/invoker.py +119 -0
  83. fabricks/core/jobs/base/job.py +5 -0
  84. fabricks/core/jobs/base/processor.py +204 -0
  85. fabricks/core/jobs/base/types.py +191 -0
  86. fabricks/core/jobs/bronze.py +333 -0
  87. fabricks/core/jobs/get_job.py +126 -0
  88. fabricks/core/jobs/get_job_conf.py +115 -0
  89. fabricks/core/jobs/get_job_id.py +26 -0
  90. fabricks/core/jobs/get_jobs.py +89 -0
  91. fabricks/core/jobs/gold.py +218 -0
  92. fabricks/core/jobs/silver.py +354 -0
  93. fabricks/core/parsers/__init__.py +12 -0
  94. fabricks/core/parsers/base.py +91 -0
  95. fabricks/core/parsers/decorator.py +11 -0
  96. fabricks/core/parsers/get_parser.py +25 -0
  97. fabricks/core/parsers/types.py +6 -0
  98. fabricks/core/schedules.py +89 -0
  99. fabricks/core/scripts/__init__.py +13 -0
  100. fabricks/core/scripts/armageddon.py +82 -0
  101. fabricks/core/scripts/generate.py +20 -0
  102. fabricks/core/scripts/job_schema.py +28 -0
  103. fabricks/core/scripts/optimize.py +45 -0
  104. fabricks/core/scripts/process.py +9 -0
  105. fabricks/core/scripts/stats.py +48 -0
  106. fabricks/core/scripts/steps.py +27 -0
  107. fabricks/core/scripts/terminate.py +6 -0
  108. fabricks/core/scripts/vacuum.py +45 -0
  109. fabricks/core/site_packages.py +55 -0
  110. fabricks/core/steps/__init__.py +4 -0
  111. fabricks/core/steps/base.py +282 -0
  112. fabricks/core/steps/get_step.py +10 -0
  113. fabricks/core/steps/get_step_conf.py +33 -0
  114. fabricks/core/steps/types.py +7 -0
  115. fabricks/core/udfs.py +106 -0
  116. fabricks/core/utils.py +69 -0
  117. fabricks/core/views.py +36 -0
  118. fabricks/metastore/README.md +3 -0
  119. fabricks/metastore/__init__.py +5 -0
  120. fabricks/metastore/database.py +71 -0
  121. fabricks/metastore/pyproject.toml +20 -0
  122. fabricks/metastore/relational.py +61 -0
  123. fabricks/metastore/table.py +529 -0
  124. fabricks/metastore/utils.py +35 -0
  125. fabricks/metastore/view.py +40 -0
  126. fabricks/utils/README.md +3 -0
  127. fabricks/utils/__init__.py +0 -0
  128. fabricks/utils/azure_queue.py +63 -0
  129. fabricks/utils/azure_table.py +99 -0
  130. fabricks/utils/console.py +51 -0
  131. fabricks/utils/container.py +57 -0
  132. fabricks/utils/fdict.py +28 -0
  133. fabricks/utils/helpers.py +89 -0
  134. fabricks/utils/log.py +153 -0
  135. fabricks/utils/path.py +206 -0
  136. fabricks/utils/pip.py +61 -0
  137. fabricks/utils/pydantic.py +92 -0
  138. fabricks/utils/pyproject.toml +18 -0
  139. fabricks/utils/read/__init__.py +11 -0
  140. fabricks/utils/read/read.py +305 -0
  141. fabricks/utils/read/read_excel.py +5 -0
  142. fabricks/utils/read/read_yaml.py +43 -0
  143. fabricks/utils/read/types.py +3 -0
  144. fabricks/utils/schema/__init__.py +7 -0
  145. fabricks/utils/schema/get_json_schema_for_type.py +161 -0
  146. fabricks/utils/schema/get_schema_for_type.py +93 -0
  147. fabricks/utils/secret.py +78 -0
  148. fabricks/utils/sqlglot.py +48 -0
  149. fabricks/utils/write/__init__.py +8 -0
  150. fabricks/utils/write/delta.py +46 -0
  151. fabricks/utils/write/stream.py +27 -0
  152. fabricks-2024.7.1.5.dist-info/METADATA +212 -0
  153. fabricks-2024.7.1.5.dist-info/RECORD +154 -0
  154. fabricks-2024.7.1.5.dist-info/WHEEL +4 -0
fabricks/utils/read/read_yaml.py
@@ -0,0 +1,43 @@
+ from typing import Optional
+
+ import yaml
+ from databricks.sdk.runtime import spark
+ from pyspark.sql import DataFrame
+ from pyspark.sql.types import StructType
+
+ from fabricks.utils.helpers import concat_dfs
+ from fabricks.utils.path import Path
+
+
+ def read_yaml(
+     path: Path,
+     root: Optional[str] = None,
+     schema: Optional[StructType] = None,
+     file_name: Optional[str] = None,
+ ) -> Optional[DataFrame]:
+     files = [f for f in path.walk() if f.endswith(".yml")]
+     if file_name:
+         files = [f for f in files if file_name in f]
+
+     dfs = [spark.createDataFrame([], schema=schema)] if schema else []
+
+     for file in files:
+         with open(file) as f:
+             data = yaml.safe_load(f)
+
+         if schema:
+             dt = [d[root] for d in data] if root else data
+             df = spark.createDataFrame(dt, schema=schema)
+         else:
+             json = spark.sparkContext.parallelize(data)
+             df = spark.read.json(json)
+             if root:
+                 df = df.select(f"{root}.*")
+
+         dfs.append(df)
+
+     if dfs:
+         df = concat_dfs(dfs)
+         return df
+
+     return spark.createDataFrame([], schema=schema) if schema else None
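A minimal usage sketch (not part of the package): read_yaml loads every .yml file under a folder, optionally validates the rows against a Spark schema, and concatenates the per-file DataFrames into one. The jobs_path variable and the step/topic fields below are hypothetical placeholders; this hunk only shows that a fabricks Path exposes walk(), not how it is constructed, and it assumes each .yml file contains a list of mappings matching the schema.

from pyspark.sql.types import StringType, StructField, StructType

from fabricks.utils.read.read_yaml import read_yaml

# Hypothetical schema; the field names are illustrative only.
schema = StructType([StructField("step", StringType()), StructField("topic", StringType())])

# jobs_path is assumed to be a fabricks.utils.path.Path pointing at a folder of *.yml files.
df = read_yaml(jobs_path, schema=schema)
if df is not None:
    df.show()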
fabricks/utils/read/types.py
@@ -0,0 +1,3 @@
+ from typing import Literal
+
+ IOModes = Literal["overwrite", "append"]
fabricks/utils/schema/__init__.py
@@ -0,0 +1,7 @@
+ from fabricks.utils.schema.get_json_schema_for_type import get_json_schema_for_type
+ from fabricks.utils.schema.get_schema_for_type import get_schema_for_type
+
+ __all__ = [
+     "get_json_schema_for_type",
+     "get_schema_for_type",
+ ]
fabricks/utils/schema/get_json_schema_for_type.py
@@ -0,0 +1,161 @@
+ import dataclasses
+ import datetime
+ import logging
+ import sys
+ import types
+ from typing import Any, ForwardRef, Literal, Type, Union, get_type_hints
+ from uuid import UUID
+
+ LOGGER = logging.getLogger(__name__)
+
+
+ def get_json_schema_for_type(proptype: Type):
+     def_list: dict[str, dict] = {}
+     schema = _get_json_schema_for_type(proptype, def_list, is_root=True)
+     schema["$defs"] = def_list
+     schema["$schema"] = "https://json-schema.org/draft/2020-12/schema"
+     return schema
+
+
+ def _get_json_schema_for_type(proptype: Type, def_list: dict[str, dict], is_root: bool, is_nullable=False) -> dict:
+     def _fixref(input: dict) -> dict:
+         if "type" in input:
+             if "$ref" in input["type"]:
+                 return input["type"]
+         return input
+
+     def _may_null(input: dict, is_nullable: bool) -> dict:
+         if is_nullable:
+             return {"oneOf": [{"type": "null"}, input]}
+         return input
+
+     if hasattr(proptype, "__origin__") and proptype.__origin__ == Literal:
+         return {"enum": proptype.__args__}
+
+     if hasattr(proptype, "__origin__") and proptype.__origin__ == tuple:  # noqa E721
+         return {
+             "type": "array",
+             "minItems": len(proptype.__args__),
+             "maxItems": len(proptype.__args__),
+             "additionalItems": False,
+             "prefixItems": [_get_json_schema_for_type(t, def_list, is_root=False) for t in proptype.__args__],
+         }
+
+     if (sys.version_info >= (3, 10) and isinstance(proptype, types.UnionType)) or (
+         hasattr(proptype, "__origin__") and proptype.__origin__ == Union
+     ):
+         if len(proptype.__args__) == 2 and proptype.__args__[0] == type(None):
+             t = _get_json_schema_for_type(proptype.__args__[1], def_list, is_root=False, is_nullable=True)
+             return t
+
+         if len(proptype.__args__) == 2 and proptype.__args__[1] == type(None):
+             t = _get_json_schema_for_type(proptype.__args__[0], def_list, is_root=False, is_nullable=True)
+             return t
+
+         one_of_types = [
+             _get_json_schema_for_type(f, def_list, is_root=False, is_nullable=False) for f in proptype.__args__
+         ]
+
+         return {"oneOf": one_of_types}
+
+     if proptype == type(None):
+         return {"type": "null"}
+
+     if proptype == str:  # noqa E721
+         return {"type": "string"} if not is_nullable else {"type": ["string", "null"]}
+
+     if proptype == Any:
+         return {}
+
+     if proptype == UUID:
+         return {
+             "type": "string" if not is_nullable else ["string", "null"],
+             "format": "uuid",
+         }
+
+     if proptype == int:  # noqa E721
+         return {"type": "integer" if not is_nullable else ["integer", "null"]}
+
+     if proptype == float:  # noqa E721
+         return {"type": "number" if not is_nullable else ["number", "null"]}
+
+     if proptype == bool:  # noqa E721
+         return {"type": "boolean" if not is_nullable else ["boolean", "null"]}
+
+     if hasattr(proptype, "__origin__") and proptype.__origin__ == list:  # noqa E721
+         return {
+             "type": "array",
+             "items": _get_json_schema_for_type(proptype.__args__[0], def_list, is_root=False),
+         }
+
+     if hasattr(proptype, "__bases__") and len(proptype.__bases__) == 1 and proptype.__bases__[0] == dict:  # noqa E721
+         typehints = get_type_hints(proptype)
+         props = {k: _get_json_schema_for_type(v, def_list, is_root=False) for (k, v) in typehints.items()}
+
+         if hasattr(proptype, "__name__") and not is_root:
+             def_list[proptype.__name__] = {"type": "object", "properties": props}
+             return _may_null({"$ref": "#/$defs/" + proptype.__name__}, is_nullable)
+         else:
+             return _may_null({"type": "object", "properties": props}, is_nullable)
+
+     if dataclasses.is_dataclass(proptype):
+         required = [
+             f.name
+             for f in dataclasses.fields(proptype)
+             if f.default == dataclasses.MISSING and f.default_factory == dataclasses.MISSING and f.init
+         ]
+         definition = {
+             "type": "object",
+             "required": required,
+             "additionalProperties": False,
+             "properties": {
+                 f.name: _get_json_schema_for_type(f.type, def_list, is_root=False)  # type: ignore
+                 for f in dataclasses.fields(proptype)
+             },
+         }
+
+         if is_root:
+             return definition
+         else:
+             def_list[proptype.__name__] = definition
+
+         return _may_null({"$ref": "#/$defs/" + proptype.__name__}, is_nullable)
+
+     if hasattr(proptype, "__origin__") and proptype.__origin__ == dict and len(proptype.__args__) == 2:  # noqa E721
+         keytype = proptype.__args__[0]
+         if keytype != str and keytype != UUID:  # noqa E721
+             raise NotImplementedError()
+         valuetype = proptype.__args__[1]
+         return _may_null(
+             {
+                 "type": "object",
+                 "additionalProperties": _fixref(
+                     {"type": _get_json_schema_for_type(valuetype, def_list, is_root=False)}
+                 ),
+             },
+             is_nullable,
+         )
+
+     if isinstance(proptype, ForwardRef):
+         arg = proptype.__forward_arg__
+         return _may_null({"$ref": "#/$defs/" + arg}, is_nullable)
+
+     if proptype == datetime.datetime:
+         return {
+             "type": "string" if not is_nullable else ["string", "null"],
+             "format": "date-time",
+         }
+
+     if proptype == datetime.time:
+         return {
+             "type": "string" if not is_nullable else ["string", "null"],
+             "format": "time",
+         }
+
+     if proptype == datetime.date:
+         return {
+             "type": "string" if not is_nullable else ["string", "null"],
+             "format": "date",
+         }
+
+     return {}
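A small self-contained sketch of what this generator produces (the Step dataclass below is illustrative, not a fabricks type): fields without defaults are collected into "required", Optional fields become nullable, and the result is a draft 2020-12 document.

import json
from dataclasses import dataclass
from typing import Optional

from fabricks.utils.schema import get_json_schema_for_type


@dataclass
class Step:  # illustrative only, not a fabricks type
    name: str
    timeout: Optional[int] = None


schema = get_json_schema_for_type(Step)
# "name" is required, "timeout" allows null; "$schema" points at draft 2020-12.
print(json.dumps(schema, indent=2))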
fabricks/utils/schema/get_schema_for_type.py
@@ -0,0 +1,93 @@
+ import dataclasses
+ from typing import List, Literal, Type, Union, cast, get_type_hints, overload
+
+ from pyspark.sql.types import (
+     ArrayType,
+     BooleanType,
+     DataType,
+     DoubleType,
+     LongType,
+     MapType,
+     NullType,
+     StringType,
+     StructField,
+     StructType,
+ )
+
+
+ @overload
+ def get_schema_for_type(proptype: Union[int, str, float, bool]) -> DataType: ...
+
+
+ @overload
+ def get_schema_for_type(proptype: Type) -> StructType: ...
+
+
+ def _merge_struct_types(types: List[DataType]):
+     not_none_types = [t for t in types if type(t) != type(NullType())]  # noqa: E721
+
+     assert len([f for f in not_none_types if not isinstance(f, StructType)]) == 0
+     all_fields: List[StructField] = []
+
+     for subtype in not_none_types:
+         fields = cast(StructType, subtype).fields
+         for field in fields:
+             existing_field = next((f for f in all_fields if f.name == field.name), None)
+             if existing_field is not None and (
+                 type(existing_field.dataType) != type(field.dataType)  # noqa: E721
+                 or isinstance(existing_field.dataType, StructType)
+             ):
+                 new_type = _merge_struct_types([existing_field.dataType, field.dataType])
+                 all_fields.append(StructField(name=field.name, dataType=new_type))
+                 all_fields.remove(existing_field)
+             else:
+                 assert existing_field is None or type(existing_field.dataType) == type(field.dataType)  # noqa: E721
+                 if existing_field is None:
+                     all_fields.append(field)
+
+     return StructType(fields=all_fields)
+
+
+ def get_schema_for_type(proptype: Type) -> DataType:  # type: ignore
+     if hasattr(proptype, "__origin__") and proptype.__origin__ == Literal:
+         return get_schema_for_type(type(proptype.__args__[0]))  # For literal types we assume first type is correct
+
+     if hasattr(proptype, "__origin__") and proptype.__origin__ == Union:
+         if len(proptype.__args__) == 2 and proptype.__args__[0] == type(None):
+             return get_schema_for_type(proptype.__args__[1])
+         if len(proptype.__args__) == 2 and proptype.__args__[1] == type(None):
+             return get_schema_for_type(proptype.__args__[0])
+
+         return _merge_struct_types([get_schema_for_type(f) for f in proptype.__args__])
+
+     if proptype == type(None):
+         return NullType()
+
+     if proptype == str:  # noqa E721
+         return StringType()
+
+     if proptype == int:  # noqa E721
+         return LongType()
+
+     if proptype == float:  # noqa E721
+         return DoubleType()
+
+     if proptype == bool:  # noqa E721
+         return BooleanType()
+
+     if hasattr(proptype, "__origin__") and proptype.__origin__ == list:  # noqa E721
+         return ArrayType(get_schema_for_type(proptype.__args__[0]))
+
+     if proptype == dict[str, str]:
+         return MapType(StringType(), StringType())
+
+     if hasattr(proptype, "__bases__") and len(proptype.__bases__) == 1 and proptype.__bases__[0] == dict:  # noqa E721
+         types = get_type_hints(proptype)
+         fields = [StructField(k, get_schema_for_type(v)) for k, v in types.items()]
+         return StructType(fields=fields)
+
+     if dataclasses.is_dataclass(proptype):
+         fields = [StructField(f.name, get_schema_for_type(f.type)) for f in dataclasses.fields(proptype)]
+         return StructType(fields=fields)
+
+     raise NotImplementedError()
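For comparison, a self-contained sketch of the Spark-side mapping (again with an illustrative dataclass, not a fabricks type): str maps to StringType, int to LongType, Optional unwraps to the inner type, and lists become ArrayType. No SparkSession is needed to build the StructType.

from dataclasses import dataclass
from typing import List, Optional

from fabricks.utils.schema import get_schema_for_type


@dataclass
class Job:  # illustrative only, not a fabricks type
    step: str
    topic: str
    retries: Optional[int] = None
    tags: Optional[List[str]] = None


# step/topic -> StringType, retries -> LongType, tags -> ArrayType(StringType).
print(get_schema_for_type(Job))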
fabricks/utils/secret.py
@@ -0,0 +1,78 @@
+ from __future__ import annotations
+
+ import json
+ from dataclasses import dataclass
+
+ from databricks.sdk.runtime import dbutils, spark
+
+
+ @dataclass
+ class Secret:
+     pass
+
+
+ @dataclass
+ class ApplicationRegistration(Secret):
+     secret: str
+     application_id: str
+     directory_id: str
+
+
+ @dataclass
+ class AccessKey(Secret):
+     key: str
+
+
+ def _get_secret_from_secret_scope(secret_scope: str, name: str) -> str:
+     scopes = [s.name for s in dbutils.secrets.listScopes()]
+     assert secret_scope in scopes, "scope {secret_scope} not found"
+     return dbutils.secrets.get(scope=secret_scope, key=name)
+
+
+ def get_secret_from_secret_scope(secret_scope: str, name: str) -> Secret:
+     secret = _get_secret_from_secret_scope(secret_scope=secret_scope, name=name)
+     if name.endswith("application-registration"):
+         s = json.loads(secret)
+         assert s.get("secret"), f"no secret found in {name}"
+         assert s.get("application_id"), f"no application_id found in {name}"
+         assert s.get("directory_id"), f"no directory_id found in {name}"
+         return ApplicationRegistration(
+             secret=s.get("secret"),
+             application_id=s.get("application_id"),
+             directory_id=s.get("directory_id"),
+         )
+     elif name.endswith("access-key"):
+         return AccessKey(key=secret)
+     else:
+         raise ValueError(f"{name} is not valid")
+
+
+ def _add_secret_to_spark(key: str, value: str):
+     spark.conf.set(key, value)
+     # needed for check (invalid configuration value detected for fs.azure.account.key)
+     spark._jsc.hadoopConfiguration().set(key, value)  # type: ignore
+
+
+ def add_secret_to_spark(secret: Secret, uri: str):
+     if isinstance(secret, ApplicationRegistration):
+         _add_secret_to_spark(f"fs.azure.account.auth.type.{uri}", "OAuth")
+         _add_secret_to_spark(
+             f"fs.azure.account.oauth.provider.type.{uri}",
+             "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
+         )
+         _add_secret_to_spark(
+             f"fs.azure.account.oauth2.client.id.{uri}",
+             secret.application_id,
+         )
+         _add_secret_to_spark(
+             f"fs.azure.account.oauth2.client.secret.{uri}",
+             secret.secret,
+         )
+         _add_secret_to_spark(
+             f"fs.azure.account.oauth2.client.endpoint.{uri}",
+             f"https://login.microsoftonline.com/{secret.directory_id}/oauth2/token",
+         )
+     elif isinstance(secret, AccessKey):
+         _add_secret_to_spark(f"fs.azure.account.key.{uri}", secret.key)
+     else:
+         raise ValueError("secret is not valid")
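A hedged sketch of how these two helpers appear to be meant to work together on a Databricks cluster (the scope, secret, and storage-account names below are placeholders, not values confirmed by this diff): the suffix of the secret name selects the Secret subclass, and add_secret_to_spark then sets the matching fs.azure.* options for an ABFS endpoint.

from fabricks.utils.secret import add_secret_to_spark, get_secret_from_secret_scope

# Placeholder names; an "-application-registration" suffix selects OAuth,
# an "-access-key" suffix would select account-key auth instead.
secret = get_secret_from_secret_scope("my-scope", "my-app-application-registration")

# Placeholder storage account; wires fs.azure.account.* options for this URI.
add_secret_to_spark(secret, "mystorageaccount.dfs.core.windows.net")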
fabricks/utils/sqlglot.py
@@ -0,0 +1,48 @@
+ from typing import Optional
+
+ from sqlglot import exp, parse_one, transpile
+ from sqlglot.dialects.databricks import Databricks
+
+
+ class Fabricks(Databricks):
+     class Generator(Databricks.Generator):
+         EXPRESSIONS_WITHOUT_NESTED_CTES = {
+             exp.Insert,
+             exp.Union,
+         }
+
+
+ def fix(sql: str):
+     """
+     Fixes the given SQL query by parsing it using the 'fabricks' dialect,
+     transpiling it, and returning the fixed SQL query.
+
+     Args:
+         sql (str): The SQL query to be fixed.
+
+     Returns:
+         str: The fixed SQL query.
+     """
+     sql = parse_one(sql, dialect="fabricks").sql()
+     sql = transpile(
+         sql,
+         identify=True,
+         pretty=True,
+         normalize=False,
+         normalize_functions="lower",
+         write="fabricks",
+     )[0]
+     return sql
+
+
+ def is_global_temp_view(sql: str):
+     tables = parse_one(sql, dialect="fabricks").find_all(exp.Table)
+     for t in tables:
+         return "global_temp" in str(t)
+
+
+ def get_global_temp_view(sql: str) -> Optional[str]:
+     tables = parse_one(sql, dialect="fabricks").find_all(exp.Table)
+     for t in tables:
+         if "global_temp" in str(t):
+             return str(t)
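sqlglot registers Dialect subclasses under their lowercased class name, which is what lets this module pass dialect="fabricks" once it has been imported. A short self-contained sketch (the SQL text is illustrative):

from fabricks.utils.sqlglot import fix, get_global_temp_view

# Identifiers get quoted, the statement is pretty-printed, function names are lower-cased.
print(fix("select a, count(*) as cnt from gold.fact_sales group by a"))

# Returns the global_temp table reference if one is present, e.g. global_temp.some_view.
print(get_global_temp_view("select * from global_temp.some_view"))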
fabricks/utils/write/__init__.py
@@ -0,0 +1,8 @@
+ from fabricks.utils.write.delta import append_delta, overwrite_delta
+ from fabricks.utils.write.stream import write_stream
+
+ __all__ = [
+     "append_delta",
+     "overwrite_delta",
+     "write_stream",
+ ]
fabricks/utils/write/delta.py
@@ -0,0 +1,46 @@
+ from typing import List, Optional, Union, get_args
+
+ from pyspark.sql import DataFrame
+
+ from fabricks.utils.path import Path
+ from fabricks.utils.read.types import IOModes
+
+
+ def write_delta(
+     df: DataFrame,
+     path: Path,
+     mode: IOModes,
+     options: Optional[dict[str, str]] = None,
+     partition_by: Union[Optional[List[str]], str] = None,
+ ):
+     assert mode in list(get_args(IOModes))
+
+     if isinstance(partition_by, str):
+         partition_by = [partition_by]
+
+     writer = df.write.format("delta").mode(mode).option("mergeSchema", "True").option("overwriteSchema", "True")
+     if partition_by:
+         writer = writer.partitionBy(*partition_by)
+     if options:
+         for key, value in options.items():
+             writer = writer.option(key, value)
+
+     writer.save(path.string)
+
+
+ def append_delta(
+     df: DataFrame,
+     path: Path,
+     options: Optional[dict[str, str]] = None,
+     partition_by: Union[Optional[List[str]], str] = None,
+ ):
+     write_delta(df, path, "append", options=options)
+
+
+ def overwrite_delta(
+     df: DataFrame,
+     path: Path,
+     options: Optional[dict[str, str]] = None,
+     partition_by: Union[Optional[List[str]], str] = None,
+ ):
+     write_delta(df, path, "overwrite", options=options)
+
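A hedged sketch of the overwrite path (target_path is a placeholder fabricks Path; its construction is not part of this hunk): mergeSchema and overwriteSchema are always enabled, so schema drift is absorbed rather than rejected. Note also that in this version append_delta and overwrite_delta forward only options to write_delta; partition_by is accepted but not passed through.

from databricks.sdk.runtime import spark

from fabricks.utils.write import overwrite_delta

df = spark.range(10).withColumnRenamed("id", "key")

# target_path is assumed to be a fabricks.utils.path.Path; the writer saves to target_path.string.
overwrite_delta(df, target_path)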
fabricks/utils/write/stream.py
@@ -0,0 +1,27 @@
+ from typing import Callable, Optional
+
+ from pyspark.sql import DataFrame
+ from pyspark.sql.streaming.query import StreamingQuery
+
+ from fabricks.utils.path import Path
+
+
+ def write_stream(
+     df: DataFrame,
+     checkpoints_path: Path,
+     func: Callable,
+     timeout: Optional[int] = 18000,
+ ) -> StreamingQuery:
+     if timeout is None:
+         timeout = 18000
+
+     assert timeout is not None
+
+     query = (
+         df.writeStream.foreachBatch(func)
+         .option("checkpointLocation", checkpoints_path.string)
+         .trigger(once=True)
+         .start()
+     )
+     query.awaitTermination(timeout=timeout)
+     return query
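Finally, a hedged sketch of the foreachBatch contract write_stream expects (checkpoints_path is a placeholder Path and the batch handler is illustrative; the rate source is only a stand-in for a real stream): the query runs with a run-once trigger and the call blocks for up to timeout seconds.

from databricks.sdk.runtime import spark
from pyspark.sql import DataFrame

from fabricks.utils.write import write_stream


def handle_batch(batch_df: DataFrame, batch_id: int):
    # Illustrative handler; a real one would merge the micro-batch into a Delta table.
    print(batch_id, batch_df.count())


stream_df = spark.readStream.format("rate").load()

# checkpoints_path is assumed to be a fabricks.utils.path.Path used as the checkpointLocation.
write_stream(stream_df, checkpoints_path, handle_batch, timeout=600)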