fabricks-2024.7.1.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (154)
  1. fabricks/__init__.py +0 -0
  2. fabricks/api/__init__.py +7 -0
  3. fabricks/api/cdc/__init__.py +6 -0
  4. fabricks/api/cdc/nocdc.py +3 -0
  5. fabricks/api/cdc/scd1.py +3 -0
  6. fabricks/api/cdc/scd2.py +3 -0
  7. fabricks/api/context.py +31 -0
  8. fabricks/api/core.py +4 -0
  9. fabricks/api/extenders.py +3 -0
  10. fabricks/api/log.py +3 -0
  11. fabricks/api/metastore/__init__.py +10 -0
  12. fabricks/api/metastore/database.py +3 -0
  13. fabricks/api/metastore/table.py +3 -0
  14. fabricks/api/metastore/view.py +6 -0
  15. fabricks/api/notebooks/__init__.py +0 -0
  16. fabricks/api/notebooks/cluster.py +6 -0
  17. fabricks/api/notebooks/deploy/__init__.py +0 -0
  18. fabricks/api/notebooks/deploy/fabricks.py +147 -0
  19. fabricks/api/notebooks/deploy/notebooks.py +86 -0
  20. fabricks/api/notebooks/initialize.py +38 -0
  21. fabricks/api/notebooks/optimize.py +25 -0
  22. fabricks/api/notebooks/process.py +50 -0
  23. fabricks/api/notebooks/run.py +87 -0
  24. fabricks/api/notebooks/terminate.py +27 -0
  25. fabricks/api/notebooks/vacuum.py +25 -0
  26. fabricks/api/parsers.py +3 -0
  27. fabricks/api/udfs.py +3 -0
  28. fabricks/api/utils.py +9 -0
  29. fabricks/cdc/__init__.py +14 -0
  30. fabricks/cdc/base/__init__.py +4 -0
  31. fabricks/cdc/base/cdc.py +5 -0
  32. fabricks/cdc/base/configurator.py +145 -0
  33. fabricks/cdc/base/generator.py +117 -0
  34. fabricks/cdc/base/merger.py +107 -0
  35. fabricks/cdc/base/processor.py +338 -0
  36. fabricks/cdc/base/types.py +3 -0
  37. fabricks/cdc/cdc.py +5 -0
  38. fabricks/cdc/nocdc.py +19 -0
  39. fabricks/cdc/scd.py +21 -0
  40. fabricks/cdc/scd1.py +15 -0
  41. fabricks/cdc/scd2.py +15 -0
  42. fabricks/cdc/templates/__init__.py +0 -0
  43. fabricks/cdc/templates/merge/scd1.sql.jinja +72 -0
  44. fabricks/cdc/templates/merge/scd2.sql.jinja +54 -0
  45. fabricks/cdc/templates/merge.sql.jinja +2 -0
  46. fabricks/cdc/templates/query/__init__.py +0 -0
  47. fabricks/cdc/templates/query/base.sql.jinja +34 -0
  48. fabricks/cdc/templates/query/context.sql.jinja +95 -0
  49. fabricks/cdc/templates/query/current.sql.jinja +32 -0
  50. fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +21 -0
  51. fabricks/cdc/templates/query/deduplicate_key.sql.jinja +14 -0
  52. fabricks/cdc/templates/query/filter.sql.jinja +71 -0
  53. fabricks/cdc/templates/query/final.sql.jinja +1 -0
  54. fabricks/cdc/templates/query/hash.sql.jinja +1 -0
  55. fabricks/cdc/templates/query/nocdc.sql.jinja +10 -0
  56. fabricks/cdc/templates/query/rectify.sql.jinja +120 -0
  57. fabricks/cdc/templates/query/scd1.sql.jinja +112 -0
  58. fabricks/cdc/templates/query/scd2.sql.jinja +114 -0
  59. fabricks/cdc/templates/query.sql.jinja +11 -0
  60. fabricks/context/__init__.py +51 -0
  61. fabricks/context/log.py +26 -0
  62. fabricks/context/runtime.py +143 -0
  63. fabricks/context/spark.py +43 -0
  64. fabricks/context/types.py +123 -0
  65. fabricks/core/__init__.py +4 -0
  66. fabricks/core/dags/__init__.py +9 -0
  67. fabricks/core/dags/base.py +72 -0
  68. fabricks/core/dags/generator.py +154 -0
  69. fabricks/core/dags/log.py +14 -0
  70. fabricks/core/dags/processor.py +163 -0
  71. fabricks/core/dags/terminator.py +26 -0
  72. fabricks/core/deploy/__init__.py +12 -0
  73. fabricks/core/deploy/tables.py +76 -0
  74. fabricks/core/deploy/views.py +417 -0
  75. fabricks/core/extenders.py +29 -0
  76. fabricks/core/jobs/__init__.py +20 -0
  77. fabricks/core/jobs/base/__init__.py +10 -0
  78. fabricks/core/jobs/base/checker.py +89 -0
  79. fabricks/core/jobs/base/configurator.py +323 -0
  80. fabricks/core/jobs/base/error.py +16 -0
  81. fabricks/core/jobs/base/generator.py +391 -0
  82. fabricks/core/jobs/base/invoker.py +119 -0
  83. fabricks/core/jobs/base/job.py +5 -0
  84. fabricks/core/jobs/base/processor.py +204 -0
  85. fabricks/core/jobs/base/types.py +191 -0
  86. fabricks/core/jobs/bronze.py +333 -0
  87. fabricks/core/jobs/get_job.py +126 -0
  88. fabricks/core/jobs/get_job_conf.py +115 -0
  89. fabricks/core/jobs/get_job_id.py +26 -0
  90. fabricks/core/jobs/get_jobs.py +89 -0
  91. fabricks/core/jobs/gold.py +218 -0
  92. fabricks/core/jobs/silver.py +354 -0
  93. fabricks/core/parsers/__init__.py +12 -0
  94. fabricks/core/parsers/base.py +91 -0
  95. fabricks/core/parsers/decorator.py +11 -0
  96. fabricks/core/parsers/get_parser.py +25 -0
  97. fabricks/core/parsers/types.py +6 -0
  98. fabricks/core/schedules.py +89 -0
  99. fabricks/core/scripts/__init__.py +13 -0
  100. fabricks/core/scripts/armageddon.py +82 -0
  101. fabricks/core/scripts/generate.py +20 -0
  102. fabricks/core/scripts/job_schema.py +28 -0
  103. fabricks/core/scripts/optimize.py +45 -0
  104. fabricks/core/scripts/process.py +9 -0
  105. fabricks/core/scripts/stats.py +48 -0
  106. fabricks/core/scripts/steps.py +27 -0
  107. fabricks/core/scripts/terminate.py +6 -0
  108. fabricks/core/scripts/vacuum.py +45 -0
  109. fabricks/core/site_packages.py +55 -0
  110. fabricks/core/steps/__init__.py +4 -0
  111. fabricks/core/steps/base.py +282 -0
  112. fabricks/core/steps/get_step.py +10 -0
  113. fabricks/core/steps/get_step_conf.py +33 -0
  114. fabricks/core/steps/types.py +7 -0
  115. fabricks/core/udfs.py +106 -0
  116. fabricks/core/utils.py +69 -0
  117. fabricks/core/views.py +36 -0
  118. fabricks/metastore/README.md +3 -0
  119. fabricks/metastore/__init__.py +5 -0
  120. fabricks/metastore/database.py +71 -0
  121. fabricks/metastore/pyproject.toml +20 -0
  122. fabricks/metastore/relational.py +61 -0
  123. fabricks/metastore/table.py +529 -0
  124. fabricks/metastore/utils.py +35 -0
  125. fabricks/metastore/view.py +40 -0
  126. fabricks/utils/README.md +3 -0
  127. fabricks/utils/__init__.py +0 -0
  128. fabricks/utils/azure_queue.py +63 -0
  129. fabricks/utils/azure_table.py +99 -0
  130. fabricks/utils/console.py +51 -0
  131. fabricks/utils/container.py +57 -0
  132. fabricks/utils/fdict.py +28 -0
  133. fabricks/utils/helpers.py +89 -0
  134. fabricks/utils/log.py +153 -0
  135. fabricks/utils/path.py +206 -0
  136. fabricks/utils/pip.py +61 -0
  137. fabricks/utils/pydantic.py +92 -0
  138. fabricks/utils/pyproject.toml +18 -0
  139. fabricks/utils/read/__init__.py +11 -0
  140. fabricks/utils/read/read.py +305 -0
  141. fabricks/utils/read/read_excel.py +5 -0
  142. fabricks/utils/read/read_yaml.py +43 -0
  143. fabricks/utils/read/types.py +3 -0
  144. fabricks/utils/schema/__init__.py +7 -0
  145. fabricks/utils/schema/get_json_schema_for_type.py +161 -0
  146. fabricks/utils/schema/get_schema_for_type.py +93 -0
  147. fabricks/utils/secret.py +78 -0
  148. fabricks/utils/sqlglot.py +48 -0
  149. fabricks/utils/write/__init__.py +8 -0
  150. fabricks/utils/write/delta.py +46 -0
  151. fabricks/utils/write/stream.py +27 -0
  152. fabricks-2024.7.1.5.dist-info/METADATA +212 -0
  153. fabricks-2024.7.1.5.dist-info/RECORD +154 -0
  154. fabricks-2024.7.1.5.dist-info/WHEEL +4 -0
fabricks/core/parsers/get_parser.py
@@ -0,0 +1,25 @@
+ import sys
+ from importlib.util import spec_from_file_location
+ from typing import Optional
+
+ from fabricks.context import PATH_PARSERS
+ from fabricks.core.parsers.base import PARSERS, BaseParser
+ from fabricks.core.parsers.types import ParserOptions
+
+
+ def get_parser(name: str, parser_options: Optional[ParserOptions] = None) -> BaseParser:
+     if name not in ["json", "parquet", "avro", "csv", "tsv", "delta", "table"]:
+         sys.path.append(PATH_PARSERS.string)
+
+         path = PATH_PARSERS.join(name).append(".py")
+         assert path.exists(), f"parser not found ({path})"
+         spec = spec_from_file_location(name, path.string)
+         assert spec, f"parser not found ({path})"
+         spec.loader.load_module()  # type: ignore
+
+         parser = PARSERS[name](parser_options)
+     else:
+         parser = BaseParser(parser_options, name)
+
+     assert parser
+     return parser
fabricks/core/parsers/types.py
@@ -0,0 +1,6 @@
+ from typing import Optional, TypedDict
+
+
+ class ParserOptions(TypedDict):
+     file_format: Optional[str]
+     read_options: Optional[dict[str, str]]
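The two hunks above (fabricks/core/parsers/get_parser.py and fabricks/core/parsers/types.py) resolve either a built-in parser or a custom one dropped under PATH_PARSERS. A minimal usage sketch, assuming fabricks is installed on a Databricks runtime; the custom parser name my_parser is hypothetical and presumes a matching my_parser.py that registers itself in PARSERS:

    from fabricks.core.parsers.get_parser import get_parser
    from fabricks.core.parsers.types import ParserOptions

    # built-in file format: returns a plain BaseParser configured for csv
    csv_parser = get_parser("csv", ParserOptions(file_format="csv", read_options={"header": "true"}))

    # any other name is loaded from <PATH_PARSERS>/<name>.py and looked up in PARSERS
    custom_parser = get_parser("my_parser")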
fabricks/core/schedules.py
@@ -0,0 +1,89 @@
+ from typing import List, Optional, TypedDict
+
+ from databricks.sdk.runtime import spark
+ from pyspark.sql import DataFrame
+
+ from fabricks.context import PATH_SCHEDULES
+ from fabricks.context.log import Logger
+ from fabricks.core.jobs.base.types import TStep
+ from fabricks.utils.read.read_yaml import read_yaml
+ from fabricks.utils.schema import get_schema_for_type
+ from fabricks.utils.sqlglot import fix as fix_sql
+
+
+ class Options(TypedDict):
+     steps: Optional[List[TStep]]
+     tag: Optional[str]
+     view: Optional[str]
+     variables: Optional[dict[str, str]]
+
+
+ class Schedule(TypedDict):
+     name: str
+     options: Options
+
+
+ def get_schedules() -> DataFrame:
+     schema = get_schema_for_type(Schedule)
+     df = read_yaml(PATH_SCHEDULES, root="schedule", schema=schema)
+     assert df, "no schedules found"
+     return df
+
+
+ def get_schedule(name: str) -> DataFrame:
+     df = get_schedules()
+     df = df.where(f"name == '{name}'")
+     assert not df.isEmpty(), "schedule not found"
+     assert df.count() == 1, "schedule duplicated"
+     return df
+
+
+ def _create_or_replace_view(name: str, options: DataFrame):
+     step = "-- no step provided"
+     tag = "-- no tag provided"
+     view = "-- no view provided"
+
+     if options.steps is not None:
+         steps = [f"'{s}'" for s in options.steps]  # type: ignore
+         step = f"and j.step in ({', '.join(steps)})"
+     if options.tag is not None:
+         tag = f"and array_contains(j.tags, '{options.tag}')"
+     if options.view is not None:
+         view = f"inner join fabricks.{options.view} v on j.job_id = v.job_id"
+
+     sql = f"""
+     create or replace view fabricks.{name}_schedule
+     as
+     select
+       j.*
+     from
+       fabricks.jobs j
+       {view}
+     where
+       true
+       {step}
+       {tag}
+       and j.type not in ('manual')
+     """
+     sql = fix_sql(sql)
+     Logger.debug(f"schedule - %sql\n---\n{sql}\n---")
+
+     spark.sql(sql)
+
+
+ def create_or_replace_view(name: str):
+     df = get_schedule(name=name)
+     for row in df.collect():
+         try:
+             _create_or_replace_view(row.name, row.options)
+         except Exception:
+             Logger.exception(f"schedule - {row.name} not created nor replaced")
+
+
+ def create_or_replace_views():
+     df = get_schedules()
+     for row in df.collect():
+         try:
+             _create_or_replace_view(row.name, row.options)
+         except Exception:
+             Logger.exception(f"schedule - {row.name} not created nor replaced")
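fabricks/core/schedules.py above reads schedule definitions (name plus optional steps, tag, view and variables) from YAML under PATH_SCHEDULES and materializes each one as a fabricks.<name>_schedule view over fabricks.jobs. A usage sketch, assuming a schedule named nightly (hypothetical) is declared in that YAML:

    from fabricks.core.schedules import create_or_replace_view, get_schedule

    # keep the single schedule row named "nightly"; fails if it is missing or duplicated
    df = get_schedule("nightly")
    df.show()

    # builds and runs "create or replace view fabricks.nightly_schedule as select j.* from fabricks.jobs j ..."
    create_or_replace_view("nightly")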
fabricks/core/scripts/__init__.py
@@ -0,0 +1,13 @@
+ from fabricks.core.scripts.generate import generate
+ from fabricks.core.scripts.optimize import optimize
+ from fabricks.core.scripts.process import process
+ from fabricks.core.scripts.terminate import terminate
+ from fabricks.core.scripts.vacuum import vacuum
+
+ __all__ = [
+     "process",
+     "optimize",
+     "generate",
+     "terminate",
+     "vacuum",
+ ]
fabricks/core/scripts/armageddon.py
@@ -0,0 +1,82 @@
+ from typing import List, Optional, Union, cast
+
+ from fabricks.context import FABRICKS_STORAGE
+ from fabricks.context.log import Logger
+ from fabricks.core.deploy import deploy
+ from fabricks.core.jobs.base.types import Steps, TStep
+ from fabricks.core.schedules import create_or_replace_views as create_or_replace_schedules_views
+ from fabricks.core.steps.base import BaseStep
+ from fabricks.core.views import create_or_replace_views
+ from fabricks.metastore.database import Database
+
+
+ def armageddon(steps: Optional[Union[TStep, List[TStep], str, List[str]]]):
+     if steps is None:
+         steps = Steps
+     assert steps is not None
+
+     if isinstance(steps, str):
+         steps = [cast(TStep, steps)]
+     elif isinstance(steps, List):
+         steps = [cast(TStep, s) for s in steps]
+     elif isinstance(steps, TStep):
+         steps = [steps]
+
+     Logger.warning("armageddon")
+     print("")
+ print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣀⡤⠤⠴⠾⠋⠉⠛⢾⡏⠙⠿⠦⠤⢤⣀⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
+ print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢀⡤⢶⣿⠉⢀⣀⡠⠆⠀⠀⠀⠀⠀⠀⠀⢤⣀⣀⠈⢹⣦⢤⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
+ print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢀⣴⣿⠁⢋⡙⠁⠀⡝⠀⠀⠀⠀⣀⡸⠋⠁⠀⠀⠹⡀⠀⠈⠈⠆⢹⢦⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
+ print(" ⠀⠀⠀⠀⠀⠀⠀⢀⣠⣤⣿⣁⡡⣴⡏⠀⠀⠀⢀⠀⢧⣀⠄⠀⠀⠀⣀⣰⠆⢀⠁⠀⠀⢈⣶⡤⣀⢹⣦⣄⡀⠀⠀⠀⠀⠀⠀⠀⠀ ")
+ print(" ⠀⠀⠀⠀⠀⣠⢴⠟⢁⡝⠀⠁⠀⠃⠉⠀⠀⠘⣯⠀⡀⠾⣤⣄⣠⢤⠾⠄⠀⣸⠖⠀⠀⠈⠀⠃⠀⠀⠹⡄⠙⣶⢤⡀⠀⠀⠀⠀⠀ ")
+ print(" ⠀⠀⠀⣠⠾⡇⠈⣀⡞⠀⠀⠀⠀⡀⠀⢀⣠⣄⣇⠀⣳⠴⠃⠀⠀⠀⠣⢴⠉⣰⣇⣀⣀⠀⠀⡄⠀⠀⠀⢹⣄⡘⠈⡷⣦⠀⠀⠀⠀ ")
+ print(" ⢠⠞⠉⢻⡄⠀⠀⠈⠙⠀⠀⠀⠀⠙⣶⣏⣤⣤⠟⠉⠁⠀⠀⠀⠀⠀⠀⠀⠉⠙⢦⣱⣌⣷⠊⠀⠀⠀⠀⠈⠁⠀⠀⠀⡝⠉⠻⣄⠀ ")
+ print(" ⠛⢀⡠⢼⡇⠀⠀⢀⡄⠀⢀⣀⡽⠚⠁⠀⠀⠀⢠⡀⢠⣀⠠⣔⢁⡀⠀⣄⠀⡄⠀⠀⠀⠈⠑⠺⣄⡀⠀⠠⡀⠀⠀⢠⡧⠄⠀⠘⢧ ")
+ print(" ⡶⠋⠀⠀⠈⣠⣈⣩⠗⠒⠋⠀⠀⠀⠀⣀⣠⣆⡼⣷⣞⠛⠻⡉⠉⡟⠒⡛⣶⠧⣀⣀⣀⠀⠀⠀⠀⠈⠓⠺⢏⣉⣠⠋⠀⠀⠀⢢⣸ ")
+ print(" ⠇⠐⠤⠤⠖⠁⣿⣀⣀⠀⠀⠀⠀⠀⠉⠁⠈⠉⠙⠛⢿⣷⡄⢣⡼⠀⣾⣿⠧⠒⠓⠚⠛⠉⠀⠀⠀⠀⠀⢀⣀⣾⡉⠓⠤⡤⠄⠸⢿ ")
+ print(" ⣆⣤⠀⠀⠠⠀⠈⠓⠈⠓⠤⡀⠀⠀⠀⠀⠀⠀⠀⠀⠈⣿⣿⢸⠀⢸⣿⠇⠀⠀⠀⠀⠀⠀⠀⠀⢀⡤⠒⠁⠰⠃⠀⠠⠀⠀⢀⣀⠞ ")
+ print(" ⠀⠉⠓⢲⣄⡈⢀⣠⠀⠀⠀⡸⠶⠂⠀⠀⢀⠀⠀⠤⠞⢻⡇⠀⠀⢘⡟⠑⠤⠄⠀⢀⠀⠀⠐⠲⢿⡀⠀⠀⢤⣀⢈⣀⡴⠖⠋⠀⠀ ")
+ print(" ⠀⠀⠀⠀⠈⠉⠉⠙⠓⠒⣾⣁⣀⣴⠀⣀⠙⢧⠂⢀⣆⣀⣷⣤⣀⣾⣇⣀⡆⠀⢢⠛⢁⠀⢰⣀⣀⣹⠒⠒⠛⠉⠉⠉⠀⠀⠀⠀⠀ ")
+ print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠉⠁⠈⠉⠉⠛⠉⠙⠉⠀⠀⣿⡟⣿⣿⠀⠀⠈⠉⠉⠙⠋⠉⠉⠀⠉⠁⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
+ print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣿⡇⢻⣿⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
+ print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣀⣤⣶⣾⣿⣿⠁⠀⢹⡛⣟⡶⢤⣀⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
+ print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣴⠛⢯⣽⡟⢿⣿⠛⠿⠳⠞⠻⣿⠻⣆⢽⠟⣶⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
+ print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠛⠃⠲⠯⠴⣦⣼⣷⣤⣤⣶⣤⣩⡧⠽⠷⠐⠛⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
+ print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢀⣿⡇⠀⣿⡆⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
+ print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢀⣀⣄⡀⢀⣀⣠⡾⡿⢡⢐⠻⣿⣄⣀⡀⠀⣀⣄⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
+ print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢀⣤⢴⡏⠁⠀⠝⠉⣡⠟⣰⠃⢸⣿⠀⣷⠙⢧⡉⠻⡅⠀⠙⡷⢤⣀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
+ print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⢀⣴⡟⠀⠈⣿⢄⡴⠞⠻⣄⣰⣡⠤⣞⣸⡤⢬⣧⣀⡿⠛⠦⣤⣶⡃⠀⢹⣦⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
+ print(" ⠀⠀⠀⠀⠀⠀⢀⣴⣶⡿⠃⠉⢺⠁⠙⠒⠀⠀⣠⡉⠀⠉⠚⠉⠉⠑⠈⠀⠈⣧⠀⠀⠒⠋⠀⡹⠋⠀⢻⡶⠶⡄⠀⠀⠀⠀⠀⠀⠀ ")
+ print(" ⠀⠀⠀⠀⠀⣠⣾⣿⣇⠁⢈⡦⠀⡍⠋⠁⡀⠸⡋⠀⠀⠀⢘⠏⠉⡏⠀⠀⠀⢉⡷⠀⡌⠉⠋⡇⠠⣏⠈⢁⣦⣿⣦⠀⠀⠀⠀⠀⠀ ")
+ print(" ⠀⠀⠀⠀⠀⠉⣁⠀⠉⠉⠉⠙⠛⠛⠒⠚⠳⠤⢼⣤⣠⠤⣮⣠⣤⣼⠦⢤⣤⣿⠤⠾⠓⠒⠛⢓⠛⠉⠉⠉⠀⠈⠉⠀⠀⠀⠀⠀⠀ ")
+ print("")
+
+     fabricks = Database("fabricks")
+     fabricks.drop()
+     for s in steps:
+         step = BaseStep(s)
+         step.drop()
+
+     tmp = FABRICKS_STORAGE.join("tmp")
+     tmp.rm()
+
+     checkpoint = FABRICKS_STORAGE.join("checkpoints")
+     checkpoint.rm()
+
+     schema = FABRICKS_STORAGE.join("schemas")
+     schema.rm()
+
+     schedule = FABRICKS_STORAGE.join("schedules")
+     schedule.rm()
+
+     fabricks.create()
+
+     deploy.tables(drop=True)
+     for s in steps:
+         step = BaseStep(s)
+         step.create()
+
+     deploy.views()
+
+     create_or_replace_views()
+     create_or_replace_schedules_views()
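armageddon above is a destructive reset: it drops the fabricks database and the given step databases, clears the tmp, checkpoints, schemas and schedules folders under FABRICKS_STORAGE, then recreates the tables, steps and views. A hedged invocation sketch; the step names are hypothetical and must match the steps declared in the runtime configuration:

    from fabricks.core.scripts.armageddon import armageddon

    # reset only these steps (plus the shared fabricks database and storage folders)
    armageddon(["bronze", "silver", "gold"])

    # passing None resets every step declared in Steps
    armageddon(None)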
fabricks/core/scripts/generate.py
@@ -0,0 +1,20 @@
+ from typing import Tuple
+
+ from pyspark.sql import DataFrame
+
+ from fabricks.core.dags.generator import DagGenerator
+
+
+ def generate(schedule: str) -> Tuple[str, DataFrame, DataFrame]:
+     """
+     Generate a schedule, job dataframe, and dependency dataframe based on the given schedule.
+
+     Args:
+         schedule (str): The schedule to generate from.
+
+     Returns:
+         Tuple[str, DataFrame, DataFrame]: A tuple containing the schedule ID, job dataframe, and dependency dataframe.
+     """
+     g = DagGenerator(schedule)
+     schedule_id, job_df, dep_df = g.generate()
+     return schedule_id, job_df, dep_df
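generate above is a thin wrapper around DagGenerator. A usage sketch, assuming a schedule named nightly exists (see fabricks/core/schedules.py):

    from fabricks.core.scripts import generate

    schedule_id, job_df, dep_df = generate("nightly")
    print(schedule_id)  # run identifier later passed to process()/terminate()
    job_df.show()       # jobs selected by the schedule
    dep_df.show()       # dependencies between those jobs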
fabricks/core/scripts/job_schema.py
@@ -0,0 +1,28 @@
+ from dataclasses import dataclass
+ from typing import List
+
+ from fabricks.core.jobs.base.types import JobConf
+ from fabricks.utils.schema import get_json_schema_for_type
+
+
+ def get_job_schema() -> str:
+     import json
+
+     @dataclass
+     class JobWrapper:
+         job: JobConf
+
+     sc = get_json_schema_for_type(List[JobWrapper])
+     defs: dict[str, dict] = sc["$defs"]
+     removals = [("Job", "job_id"), ("Job", "table")]
+
+     for key, defi in defs.items():
+         for ent, prop in removals:
+             if key.startswith(ent) and prop in defi["properties"]:
+                 req: List[str] = defi["required"]
+                 req.remove(prop)  # not defined in yaml
+                 jobprops: dict = defi["properties"]
+                 jobprops.pop(prop)
+
+     j = json.dumps(sc, indent=4)
+     return j
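get_job_schema above derives a JSON Schema from the JobConf type, dropping job_id and table since they are not written in the job YAML. A sketch of dumping it to disk so YAML job files can be validated in an editor (the output path is arbitrary):

    from pathlib import Path

    from fabricks.core.scripts.job_schema import get_job_schema

    Path("/dbfs/tmp/job.schema.json").write_text(get_job_schema())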
fabricks/core/scripts/optimize.py
@@ -0,0 +1,45 @@
+ from typing import Optional
+
+ from databricks.sdk.runtime import spark
+ from pyspark.sql import Row
+
+ from fabricks.core.jobs.get_job import get_job
+ from fabricks.utils.helpers import run_in_parallel
+
+
+ def optimize(schedule_id: Optional[str] = None):
+     """
+     Cleans the Fabricks jobs by vacuuming and optimizing the tables.
+
+     Args:
+         schedule_id (Optional[str]): The schedule ID to filter the jobs. If None, all jobs will be cleaned.
+
+     Returns:
+         None
+     """
+     if schedule_id is not None:
+         df = spark.sql(
+             f"""
+             select
+               j.step,
+               j.job_id
+             from
+               fabricks.logs l
+               inner join fabricks.jobs j on l.job_id = j.job_id
+             where
+               true
+               and not j.mode = 'memory'
+               and l.schedule_id = '{schedule_id}'
+             group by
+               j.step,
+               j.job_id
+             """
+         )
+     else:
+         df = spark.sql("select * from fabricks.jobs where not mode = 'memory'")
+
+     def _optimize(row: Row):
+         job = get_job(step=row["step"], job_id=row["job_id"])
+         job.optimize()
+
+     run_in_parallel(_optimize, df, 16)
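optimize above (and vacuum further down, which has the same shape) selects the non-memory jobs, optionally only those logged under a given schedule_id, and runs the corresponding table maintenance across 16 workers. A usage sketch; the schedule_id value is a placeholder for the identifier returned by generate():

    from fabricks.core.scripts import optimize, vacuum

    schedule_id = "..."  # hypothetical run id returned by generate()

    # maintain only the tables touched by that run
    optimize(schedule_id=schedule_id)

    # or sweep every job whose mode is not 'memory'
    optimize()
    vacuum()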
fabricks/core/scripts/process.py
@@ -0,0 +1,9 @@
+ from typing import Union
+
+ from fabricks.core.dags.processor import DagProcessor
+ from fabricks.core.jobs.base.types import TStep
+
+
+ def process(schedule_id: str, schedule: str, step: Union[TStep, str]):
+     p = DagProcessor(schedule_id=schedule_id, schedule=schedule, step=step)
+     p.process()
fabricks/core/scripts/stats.py
@@ -0,0 +1,48 @@
+ from databricks.sdk.runtime import spark
+ from pyspark.sql import Row
+
+ from fabricks.cdc import NoCDC
+ from fabricks.core.jobs.base.types import Steps
+ from fabricks.utils.helpers import concat_dfs, run_in_parallel
+
+
+ def collect_stats():
+     def _collect_tables(s: str):
+         df_table = spark.sql(f"show tables in {s}")
+         df_view = spark.sql(f"show views in {s}")
+
+         cond = [df_table.tableName == df_view.viewName]
+         df_table = df_table.join(df_view, cond, how="left_anti")
+
+         return df_table
+
+     dfs = run_in_parallel(_collect_tables, Steps, workers=8)
+     df_table = concat_dfs(dfs)
+
+     def _collect_stats(row: Row):
+         table = row["tableName"]
+         database = row["database"]
+         job = f"{database}.{table}"
+
+         desc = spark.sql(f"describe detail {job}").collect()[0]
+         bytes = desc["sizeInBytes"]
+         files = desc["numFiles"]
+
+         df = spark.sql(
+             f"""
+             select
+               '{database}' as step,
+               md5('{job}') as job_id,
+               cast({bytes} as long) as bytes,
+               cast({files} as long) as `files`,
+               cast(count(*) as long) as `rows`
+             from
+               {job}
+             """
+         )
+
+         return df
+
+     dfs = run_in_parallel(_collect_stats, df_table, workers=64)
+     df = concat_dfs(dfs)
+     NoCDC("fabricks", "statistics").overwrite(df)
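collect_stats above lists the tables of every step, gathers sizeInBytes and numFiles from describe detail plus a row count, and overwrites the statistics table through the NoCDC writer. A sketch of refreshing and querying it, assuming the NoCDC writer materializes the result as fabricks.statistics:

    from databricks.sdk.runtime import spark

    from fabricks.core.scripts.stats import collect_stats

    collect_stats()
    spark.sql("select * from fabricks.statistics order by bytes desc").show()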
fabricks/core/scripts/steps.py
@@ -0,0 +1,27 @@
+ from typing import Iterable
+
+ from databricks.sdk.runtime import spark
+
+ from fabricks.cdc import NoCDC
+ from fabricks.context.runtime import BRONZE, GOLD, SILVER
+
+
+ def collect_steps():
+     steps = []
+
+     def _collect(extend: str, iterable: Iterable):
+         for i in iterable:
+             steps.append(
+                 {
+                     "extend": extend,
+                     "step": i.get("name"),
+                     "order": i.get("options", {}).get("order", 0),
+                 },
+             )
+
+     _collect("bronze", BRONZE)
+     _collect("silver", SILVER)
+     _collect("gold", GOLD)
+
+     df = spark.createDataFrame(steps)
+     NoCDC("fabricks", "steps").overwrite(df)
fabricks/core/scripts/terminate.py
@@ -0,0 +1,6 @@
+ from fabricks.core.dags.terminator import DagTerminator
+
+
+ def terminate(schedule_id: str):
+     t = DagTerminator(schedule_id=schedule_id)
+     t.terminate()
fabricks/core/scripts/vacuum.py
@@ -0,0 +1,45 @@
+ from typing import Optional
+
+ from databricks.sdk.runtime import spark
+ from pyspark.sql import Row
+
+ from fabricks.core.jobs.get_job import get_job
+ from fabricks.utils.helpers import run_in_parallel
+
+
+ def vacuum(schedule_id: Optional[str] = None):
+     """
+     Cleans the Fabricks jobs by vacuuming and optimizing the tables.
+
+     Args:
+         schedule_id (Optional[str]): The schedule ID to filter the jobs. If None, all jobs will be cleaned.
+
+     Returns:
+         None
+     """
+     if schedule_id is not None:
+         df = spark.sql(
+             f"""
+             select
+               j.step,
+               j.job_id
+             from
+               fabricks.logs l
+               inner join fabricks.jobs j on l.job_id = j.job_id
+             where
+               true
+               and not j.mode = 'memory'
+               and l.schedule_id = '{schedule_id}'
+             group by
+               j.step,
+               j.job_id
+             """
+         )
+     else:
+         df = spark.sql("select * from fabricks.jobs where not mode = 'memory'")
+
+     def _vacuum(row: Row):
+         job = get_job(step=row["step"], job_id=row["job_id"])
+         job.vacuum()
+
+     run_in_parallel(_vacuum, df, 16)
fabricks/core/site_packages.py
@@ -0,0 +1,55 @@
+ import sys
+
+ from databricks.sdk.runtime import dbutils, spark
+
+ from fabricks.context import FABRICKS_STORAGE, PATH_LIBRARIES, PATH_REQUIREMENTS
+ from fabricks.context.log import Logger
+ from fabricks.utils.pip import pip_requirements, pip_wheel
+
+
+ def collect_site_packages(nofail: bool = False):
+     Logger.info(f"collect libraries ({PATH_REQUIREMENTS})")
+
+     dbfs_wheel = "dbfs:/fabricks/wheels"
+     mnt_wheel = "dbfs:/mnt/fabricks/wheels"
+
+     dbutils.fs.mkdirs(dbfs_wheel)
+
+     try:
+         w = FABRICKS_STORAGE.join("wheels")
+         Logger.info(f"pip wheel ({w})")
+         pip_wheel(PATH_REQUIREMENTS, w)
+     except (Exception, ValueError) as e:
+         if nofail:
+             Logger.exception("oops (pip wheel)")
+         else:
+             raise e
+     try:
+         for f in dbutils.fs.ls(mnt_wheel):
+             to = f"{dbfs_wheel}/{f.name}"
+             try:
+                 dbutils.fs.ls(to)
+             except Exception:
+                 Logger.info(f"uploading {f.name} ({to})")
+                 dbutils.fs.cp(f.path, to)
+     except Exception as e:
+         if nofail:
+             Logger.exception("oops (uploading)")
+         else:
+             raise e
+
+     try:
+         p = FABRICKS_STORAGE.join("site-packages")
+         Logger.info(f"pip requirements ({p})")
+         pip_requirements(requirements_path=PATH_REQUIREMENTS, tgt_path=p)
+     except Exception as e:
+         if nofail:
+             Logger.exception("oops (pip requirements)")
+         else:
+             raise e
+
+
+ def add_site_packages_to_path():
+     if PATH_LIBRARIES not in sys.path:
+         spark._sc._python_includes.append(PATH_LIBRARIES)  # type: ignore
+         sys.path.append(PATH_LIBRARIES)
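collect_site_packages above builds wheels and a site-packages folder from PATH_REQUIREMENTS under FABRICKS_STORAGE and mirrors wheels from dbfs:/mnt/fabricks/wheels to dbfs:/fabricks/wheels; add_site_packages_to_path then makes PATH_LIBRARIES importable on the cluster. A usage sketch:

    from fabricks.core.site_packages import add_site_packages_to_path, collect_site_packages

    # with nofail=True, pip failures are logged instead of raised
    collect_site_packages(nofail=True)

    # append PATH_LIBRARIES to sys.path and to the Spark python includes
    add_site_packages_to_path()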
fabricks/core/steps/__init__.py
@@ -0,0 +1,4 @@
+ from fabricks.core.steps.base import BaseStep
+ from fabricks.core.steps.get_step import get_step
+
+ __all__ = ["BaseStep", "get_step"]