fabricks 3.0.5.2__py3-none-any.whl → 3.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. fabricks/api/__init__.py +2 -0
  2. fabricks/api/context.py +1 -2
  3. fabricks/api/deploy.py +3 -0
  4. fabricks/api/job_schema.py +2 -2
  5. fabricks/api/masks.py +3 -0
  6. fabricks/api/notebooks/initialize.py +2 -2
  7. fabricks/api/notebooks/process.py +2 -2
  8. fabricks/api/notebooks/run.py +2 -2
  9. fabricks/api/notebooks/schedule.py +75 -0
  10. fabricks/api/notebooks/terminate.py +2 -2
  11. fabricks/api/schedules.py +2 -16
  12. fabricks/cdc/__init__.py +2 -2
  13. fabricks/cdc/base/__init__.py +2 -2
  14. fabricks/cdc/base/_types.py +9 -2
  15. fabricks/cdc/base/configurator.py +86 -41
  16. fabricks/cdc/base/generator.py +44 -35
  17. fabricks/cdc/base/merger.py +16 -14
  18. fabricks/cdc/base/processor.py +232 -144
  19. fabricks/cdc/nocdc.py +8 -7
  20. fabricks/cdc/templates/{query → ctes}/base.sql.jinja +7 -6
  21. fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
  22. fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
  23. fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
  24. fabricks/cdc/templates/{query → ctes}/rectify.sql.jinja +4 -22
  25. fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
  26. fabricks/cdc/templates/filter.sql.jinja +4 -4
  27. fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
  28. fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
  29. fabricks/cdc/templates/merge.sql.jinja +3 -2
  30. fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
  31. fabricks/cdc/templates/queries/context.sql.jinja +186 -0
  32. fabricks/cdc/templates/{query/nocdc.sql.jinja → queries/nocdc/complete.sql.jinja} +1 -1
  33. fabricks/cdc/templates/queries/nocdc/update.sql.jinja +35 -0
  34. fabricks/cdc/templates/{query → queries}/scd1.sql.jinja +2 -28
  35. fabricks/cdc/templates/{query → queries}/scd2.sql.jinja +29 -48
  36. fabricks/cdc/templates/query.sql.jinja +15 -11
  37. fabricks/context/__init__.py +18 -4
  38. fabricks/context/_types.py +2 -0
  39. fabricks/context/config/__init__.py +92 -0
  40. fabricks/context/config/utils.py +53 -0
  41. fabricks/context/log.py +8 -2
  42. fabricks/context/runtime.py +87 -263
  43. fabricks/context/secret.py +1 -1
  44. fabricks/context/spark_session.py +1 -1
  45. fabricks/context/utils.py +76 -0
  46. fabricks/core/dags/generator.py +6 -7
  47. fabricks/core/dags/log.py +2 -15
  48. fabricks/core/dags/processor.py +11 -11
  49. fabricks/core/dags/utils.py +15 -1
  50. fabricks/core/{scripts/job_schema.py → job_schema.py} +4 -0
  51. fabricks/core/jobs/base/_types.py +64 -22
  52. fabricks/core/jobs/base/checker.py +13 -12
  53. fabricks/core/jobs/base/configurator.py +41 -67
  54. fabricks/core/jobs/base/generator.py +55 -24
  55. fabricks/core/jobs/base/invoker.py +54 -30
  56. fabricks/core/jobs/base/processor.py +43 -26
  57. fabricks/core/jobs/bronze.py +45 -38
  58. fabricks/core/jobs/get_jobs.py +2 -2
  59. fabricks/core/jobs/get_schedule.py +10 -0
  60. fabricks/core/jobs/get_schedules.py +32 -0
  61. fabricks/core/jobs/gold.py +61 -48
  62. fabricks/core/jobs/silver.py +39 -40
  63. fabricks/core/masks.py +52 -0
  64. fabricks/core/parsers/base.py +2 -2
  65. fabricks/core/schedules/__init__.py +14 -0
  66. fabricks/core/schedules/diagrams.py +46 -0
  67. fabricks/core/schedules/get_schedule.py +5 -0
  68. fabricks/core/schedules/get_schedules.py +9 -0
  69. fabricks/core/schedules/run.py +3 -0
  70. fabricks/core/schedules/views.py +61 -0
  71. fabricks/core/steps/base.py +110 -72
  72. fabricks/core/udfs.py +12 -23
  73. fabricks/core/views.py +20 -13
  74. fabricks/deploy/__init__.py +97 -0
  75. fabricks/deploy/masks.py +8 -0
  76. fabricks/deploy/notebooks.py +71 -0
  77. fabricks/deploy/schedules.py +8 -0
  78. fabricks/{core/deploy → deploy}/tables.py +16 -13
  79. fabricks/{core/deploy → deploy}/udfs.py +3 -1
  80. fabricks/deploy/utils.py +36 -0
  81. fabricks/{core/deploy → deploy}/views.py +5 -9
  82. fabricks/metastore/database.py +3 -3
  83. fabricks/metastore/dbobject.py +4 -4
  84. fabricks/metastore/table.py +157 -88
  85. fabricks/metastore/view.py +13 -6
  86. fabricks/utils/_types.py +6 -0
  87. fabricks/utils/azure_table.py +4 -3
  88. fabricks/utils/helpers.py +141 -11
  89. fabricks/utils/log.py +29 -18
  90. fabricks/utils/read/_types.py +1 -1
  91. fabricks/utils/schema/get_schema_for_type.py +6 -0
  92. fabricks/utils/write/delta.py +3 -3
  93. {fabricks-3.0.5.2.dist-info → fabricks-3.0.6.dist-info}/METADATA +2 -1
  94. fabricks-3.0.6.dist-info/RECORD +175 -0
  95. fabricks/api/notebooks/add_fabricks.py +0 -13
  96. fabricks/api/notebooks/optimize.py +0 -29
  97. fabricks/api/notebooks/vacuum.py +0 -29
  98. fabricks/cdc/templates/query/context.sql.jinja +0 -101
  99. fabricks/cdc/templates/query/current.sql.jinja +0 -32
  100. fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +0 -21
  101. fabricks/cdc/templates/query/deduplicate_key.sql.jinja +0 -14
  102. fabricks/cdc/templates/query/hash.sql.jinja +0 -1
  103. fabricks/cdc/templates/query/slice.sql.jinja +0 -14
  104. fabricks/config/__init__.py +0 -0
  105. fabricks/config/base.py +0 -8
  106. fabricks/config/fabricks/__init__.py +0 -26
  107. fabricks/config/fabricks/base.py +0 -90
  108. fabricks/config/fabricks/environment.py +0 -9
  109. fabricks/config/fabricks/pyproject.py +0 -47
  110. fabricks/config/jobs/__init__.py +0 -6
  111. fabricks/config/jobs/base.py +0 -101
  112. fabricks/config/jobs/bronze.py +0 -38
  113. fabricks/config/jobs/gold.py +0 -27
  114. fabricks/config/jobs/silver.py +0 -22
  115. fabricks/config/runtime.py +0 -67
  116. fabricks/config/steps/__init__.py +0 -6
  117. fabricks/config/steps/base.py +0 -50
  118. fabricks/config/steps/bronze.py +0 -7
  119. fabricks/config/steps/gold.py +0 -14
  120. fabricks/config/steps/silver.py +0 -15
  121. fabricks/core/deploy/__init__.py +0 -17
  122. fabricks/core/schedules.py +0 -142
  123. fabricks/core/scripts/__init__.py +0 -9
  124. fabricks/core/scripts/armageddon.py +0 -87
  125. fabricks/core/scripts/stats.py +0 -51
  126. fabricks/core/scripts/steps.py +0 -26
  127. fabricks-3.0.5.2.dist-info/RECORD +0 -177
  128. /fabricks/cdc/templates/{filter → filters}/final.sql.jinja +0 -0
  129. /fabricks/cdc/templates/{filter → filters}/latest.sql.jinja +0 -0
  130. /fabricks/cdc/templates/{filter → filters}/update.sql.jinja +0 -0
  131. /fabricks/cdc/templates/{merge → merges}/scd1.sql.jinja +0 -0
  132. /fabricks/cdc/templates/{merge → merges}/scd2.sql.jinja +0 -0
  133. /fabricks/cdc/templates/{query → queries}/__init__.py +0 -0
  134. /fabricks/cdc/templates/{query → queries}/final.sql.jinja +0 -0
  135. /fabricks/core/{utils.py → parsers/utils.py} +0 -0
  136. /fabricks/core/{scripts → schedules}/generate.py +0 -0
  137. /fabricks/core/{scripts → schedules}/process.py +0 -0
  138. /fabricks/core/{scripts → schedules}/terminate.py +0 -0
  139. {fabricks-3.0.5.2.dist-info → fabricks-3.0.6.dist-info}/WHEEL +0 -0
@@ -1,67 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from typing import Any, Dict, List, Optional
4
-
5
- from fabricks.config.steps.base import ExtenderOptions, ModelBase
6
- from fabricks.config.steps.bronze import BronzeStepConfig
7
- from fabricks.config.steps.gold import GoldStepConfig
8
- from fabricks.config.steps.silver import SilverStepConfig
9
-
10
-
11
- class RuntimePathOptions(ModelBase):
12
- storage: str
13
- udfs: str
14
- parsers: str
15
- schedules: str
16
- views: str
17
- requirements: str
18
-
19
-
20
- class RuntimeTimeoutOptions(ModelBase):
21
- step: int
22
- job: int
23
- pre_run: int
24
- post_run: int
25
-
26
-
27
- class RuntimeOptions(ModelBase):
28
- secret_scope: str
29
- unity_catalog: Optional[bool] = None
30
- type_widening: Optional[bool] = None
31
- catalog: Optional[str] = None
32
- workers: int
33
- timeouts: RuntimeTimeoutOptions
34
- retention_days: int
35
-
36
-
37
- class SparkOptions(ModelBase):
38
- sql: Dict[str, Any]
39
- conf: Dict[str, Any]
40
-
41
-
42
- class PowerBI(ModelBase):
43
- name: str
44
-
45
-
46
- class DatabasePathOptions(ModelBase):
47
- storage: str
48
-
49
-
50
- class Database(ModelBase):
51
- name: str
52
- path_options: DatabasePathOptions
53
-
54
-
55
- class RuntimeConfig(ModelBase):
56
- name: str
57
- options: RuntimeOptions
58
- path_options: RuntimePathOptions
59
- extender_options: Optional[ExtenderOptions] = None
60
- spark_options: SparkOptions
61
- bronze: Optional[List[BronzeStepConfig]] = None
62
- silver: Optional[List[SilverStepConfig]] = None
63
- gold: Optional[List[GoldStepConfig]] = None
64
- powerbi: Optional[List[PowerBI]] = None
65
- databases: Optional[List[Database]] = None
66
- variables: Optional[List[Dict[str, Any]]] = None
67
- credentials: Optional[List[Dict[str, Any]]] = None
@@ -1,6 +0,0 @@
1
- from fabricks.config.steps.base import BaseStepConfig
2
- from fabricks.config.steps.bronze import BronzeStepConfig
3
- from fabricks.config.steps.gold import GoldStepConfig
4
- from fabricks.config.steps.silver import SilverStepConfig
5
-
6
- __all__ = ["BaseStepConfig", "BronzeStepConfig", "SilverStepConfig", "GoldStepConfig"]
@@ -1,50 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from typing import Any, Dict, List, Optional
4
-
5
- from fabricks.config.base import ModelBase
6
-
7
-
8
- class PathOptions(ModelBase):
9
- runtime: str
10
- storage: str
11
-
12
-
13
- class StepTimeoutOptions(ModelBase):
14
- step: Optional[int] = None
15
- job: Optional[int] = None
16
- pre_run: Optional[int] = None
17
- post_run: Optional[int] = None
18
-
19
-
20
- class InvokeOptions(ModelBase):
21
- notebook: str
22
- arguments: Optional[Dict[str, Any]] = None
23
-
24
-
25
- class ExtenderOptions(ModelBase):
26
- extender: str
27
- arguments: Optional[Dict[str, Any]] = None
28
-
29
-
30
- class TableOptions(ModelBase):
31
- powerbi: Optional[bool] = None
32
- liquid_clustering: Optional[bool] = None
33
- properties: Optional[Dict[str, Any]] = None
34
- retention_days: Optional[int] = None
35
-
36
-
37
- class DefaultOptions(ModelBase):
38
- order: int
39
- workers: Optional[int] = None
40
- timeouts: StepTimeoutOptions
41
- extenders: Optional[List[str]] = None
42
- pre_run: Optional[InvokeOptions] = None
43
- post_run: Optional[InvokeOptions] = None
44
-
45
-
46
- class BaseStepConfig(ModelBase):
47
- name: str
48
- options: DefaultOptions
49
- path_options: PathOptions
50
- table_options: Optional[TableOptions] = None
@@ -1,7 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from fabricks.config.steps.base import BaseStepConfig
4
-
5
-
6
- class BronzeStepConfig(BaseStepConfig):
7
- pass
@@ -1,14 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from typing import Optional
4
-
5
- from fabricks.config.steps.base import BaseStepConfig, DefaultOptions
6
-
7
-
8
- class GoldOptions(DefaultOptions):
9
- schema_drift: Optional[bool] = None
10
- metadata: Optional[bool] = None
11
-
12
-
13
- class GoldStepConfig(BaseStepConfig):
14
- options: GoldOptions
@@ -1,15 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from typing import Optional
4
-
5
- from fabricks.config.steps.base import BaseStepConfig, DefaultOptions
6
-
7
-
8
- class SilverOptions(DefaultOptions):
9
- parent: str
10
- stream: Optional[bool] = None
11
- local_checkpoint: Optional[bool] = None
12
-
13
-
14
- class SilverStepConfig(BaseStepConfig):
15
- options: SilverOptions
@@ -1,17 +0,0 @@
1
- from fabricks.core.deploy.tables import deploy_tables
2
- from fabricks.core.deploy.udfs import deploy_udfs
3
- from fabricks.core.deploy.views import deploy_views
4
-
5
-
6
- class deploy:
7
- @staticmethod
8
- def tables(drop: bool = False):
9
- deploy_tables(drop=drop)
10
-
11
- @staticmethod
12
- def views():
13
- deploy_views()
14
-
15
- @staticmethod
16
- def udfs():
17
- deploy_udfs()
@@ -1,142 +0,0 @@
1
- from typing import List, Optional, TypedDict
2
-
3
- from pyspark.sql import DataFrame
4
- from pyspark.sql.types import Row
5
-
6
- from fabricks.context import PATH_SCHEDULES, SPARK
7
- from fabricks.context.log import DEFAULT_LOGGER
8
- from fabricks.core.jobs.base._types import TStep
9
- from fabricks.utils.read.read_yaml import read_yaml
10
- from fabricks.utils.schema import get_schema_for_type
11
- from fabricks.utils.sqlglot import fix as fix_sql
12
-
13
-
14
- class Options(TypedDict):
15
- steps: Optional[List[TStep]]
16
- tag: Optional[str]
17
- view: Optional[str]
18
- variables: Optional[dict[str, str]]
19
-
20
-
21
- class Schedule(TypedDict):
22
- name: str
23
- options: Options
24
-
25
-
26
- def get_schedules():
27
- return read_yaml(PATH_SCHEDULES, root="schedule")
28
-
29
-
30
- def get_schedules_df() -> DataFrame:
31
- schema = get_schema_for_type(Schedule)
32
- df = SPARK.createDataFrame(list(get_schedules()), schema=schema) # type: ignore
33
-
34
- assert df, "no schedules found"
35
- return df
36
-
37
-
38
- def get_schedule(name: str) -> Row:
39
- schedules = [s for s in get_schedules() if s["name"] == name]
40
-
41
- assert schedules, "schedule not found"
42
- assert len(schedules) == 1, "schedule duplicated"
43
- return Row(**schedules[0])
44
-
45
-
46
- def create_or_replace_view_internal(name: str, options: dict):
47
- step = "-- no step provided"
48
- tag = "-- no tag provided"
49
- view = "-- no view provided"
50
-
51
- assert isinstance(options, dict), "options must be a dict"
52
-
53
- if options.get("steps") is not None:
54
- steps = [f"'{s}'" for s in options.get("steps")] # type: ignore
55
- step = f"and j.step in ({', '.join(steps)})"
56
-
57
- if options.get("tag") is not None:
58
- tag = f"""and array_contains(j.tags, '{options.get("tag")}')"""
59
-
60
- if options.get("view") is not None:
61
- view = f"""inner join fabricks.{options.get("view")} v on j.job_id = v.job_id"""
62
-
63
- sql = f"""
64
- create or replace view fabricks.{name}_schedule
65
- as
66
- select
67
- j.*
68
- from
69
- fabricks.jobs j
70
- {view}
71
- where
72
- true
73
- {step}
74
- {tag}
75
- and j.type not in ('manual')
76
- """
77
- sql = fix_sql(sql)
78
- DEFAULT_LOGGER.debug(f"schedule - %sql\n---\n{sql}\n---")
79
-
80
- SPARK.sql(sql)
81
-
82
-
83
- def create_or_replace_view(name: str):
84
- row = get_schedule(name=name)
85
- try:
86
- create_or_replace_view_internal(row.name, row.options)
87
- except Exception:
88
- DEFAULT_LOGGER.exception(f"schedule - {row.name} not created nor replaced")
89
-
90
-
91
- def create_or_replace_views():
92
- df = get_schedules_df()
93
- for row in df.collect():
94
- try:
95
- create_or_replace_view_internal(row.name, row.options.asDict())
96
- except Exception:
97
- DEFAULT_LOGGER.exception(f"schedule - {row.name} not created nor replaced")
98
-
99
-
100
- def get_dependencies(name: str) -> DataFrame:
101
- from fabricks.core.dags import DagGenerator
102
-
103
- g = DagGenerator(schedule=name)
104
- return g.get_dependencies()
105
-
106
-
107
- def get_mermaid_diagram(name: str) -> str:
108
- df = get_dependencies(name)
109
-
110
- df = df.withColumnRenamed("ParentId", "parent_id")
111
- df = df.withColumnRenamed("Parent", "parent")
112
- df = df.withColumnRenamed("JobId", "job_id")
113
- df = df.withColumnRenamed("Job", "job")
114
-
115
- dependencies = df.select("parent_id", "parent", "job_id", "job").collect()
116
-
117
- out = "flowchart TD\n"
118
-
119
- unique_nodes = set()
120
-
121
- for row in dependencies:
122
- parent_id = str(row["parent_id"])
123
- parent_name = str(row["parent"])
124
- child_id = str(row["job_id"])
125
- child_name = str(row["job"])
126
-
127
- if parent_id != "0" and parent_id is not None:
128
- if parent_id not in unique_nodes:
129
- out += f" {parent_id}[{parent_name}]\n"
130
- unique_nodes.add(parent_id)
131
-
132
- if child_id not in unique_nodes:
133
- out += f" {child_id}[{child_name}]\n"
134
- unique_nodes.add(child_id)
135
-
136
- out += f" {parent_id} --> {child_id}\n"
137
- else:
138
- if child_id not in unique_nodes:
139
- out += f" {child_id}[{child_name}]\n"
140
- unique_nodes.add(child_id)
141
-
142
- return out
@@ -1,9 +0,0 @@
1
- from fabricks.core.scripts.generate import generate
2
- from fabricks.core.scripts.process import process
3
- from fabricks.core.scripts.terminate import terminate
4
-
5
- __all__ = [
6
- "process",
7
- "generate",
8
- "terminate",
9
- ]
@@ -1,87 +0,0 @@
1
- import logging
2
- from typing import List, Optional, Union, cast
3
-
4
- from fabricks.context import FABRICKS_STORAGE
5
- from fabricks.context.log import DEFAULT_LOGGER
6
- from fabricks.core.deploy import deploy
7
- from fabricks.core.jobs.base._types import Steps, TStep
8
- from fabricks.core.schedules import create_or_replace_views as create_or_replace_schedules_views
9
- from fabricks.core.steps.base import BaseStep
10
- from fabricks.core.views import create_or_replace_views
11
- from fabricks.metastore.database import Database
12
-
13
-
14
- def armageddon(steps: Optional[Union[TStep, List[TStep], str, List[str]]]):
15
- DEFAULT_LOGGER.setLevel(logging.INFO)
16
-
17
- if steps is None:
18
- steps = Steps
19
- assert steps is not None
20
-
21
- if isinstance(steps, str):
22
- steps = [cast(TStep, steps)]
23
- elif isinstance(steps, List):
24
- steps = [cast(TStep, s) for s in steps]
25
- elif isinstance(steps, TStep):
26
- steps = [steps]
27
-
28
- DEFAULT_LOGGER.warning("armageddon")
29
- print("")
30
- print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣀⡤⠤⠴⠾⠋⠉⠛⢾⡏⠙⠿⠦⠤⢤⣀⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
31
- print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢀⡤⢶⣿⠉⢀⣀⡠⠆⠀⠀⠀⠀⠀⠀⠀⢤⣀⣀⠈⢹⣦⢤⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
32
- print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢀⣴⣿⠁⢋⡙⠁⠀⡝⠀⠀⠀⠀⣀⡸⠋⠁⠀⠀⠹⡀⠀⠈⠈⠆⢹⢦⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
33
- print(" ⠀⠀⠀⠀⠀⠀⠀⢀⣠⣤⣿⣁⡡⣴⡏⠀⠀⠀⢀⠀⢧⣀⠄⠀⠀⠀⣀⣰⠆⢀⠁⠀⠀⢈⣶⡤⣀⢹⣦⣄⡀⠀⠀⠀⠀⠀⠀⠀⠀ ")
34
- print(" ⠀⠀⠀⠀⠀⣠⢴⠟⢁⡝⠀⠁⠀⠃⠉⠀⠀⠘⣯⠀⡀⠾⣤⣄⣠⢤⠾⠄⠀⣸⠖⠀⠀⠈⠀⠃⠀⠀⠹⡄⠙⣶⢤⡀⠀⠀⠀⠀⠀ ")
35
- print(" ⠀⠀⠀⣠⠾⡇⠈⣀⡞⠀⠀⠀⠀⡀⠀⢀⣠⣄⣇⠀⣳⠴⠃⠀⠀⠀⠣⢴⠉⣰⣇⣀⣀⠀⠀⡄⠀⠀⠀⢹⣄⡘⠈⡷⣦⠀⠀⠀⠀ ")
36
- print(" ⢠⠞⠉⢻⡄⠀⠀⠈⠙⠀⠀⠀⠀⠙⣶⣏⣤⣤⠟⠉⠁⠀⠀⠀⠀⠀⠀⠀⠉⠙⢦⣱⣌⣷⠊⠀⠀⠀⠀⠈⠁⠀⠀⠀⡝⠉⠻⣄⠀ ")
37
- print(" ⠛⢀⡠⢼⡇⠀⠀⢀⡄⠀⢀⣀⡽⠚⠁⠀⠀⠀⢠⡀⢠⣀⠠⣔⢁⡀⠀⣄⠀⡄⠀⠀⠀⠈⠑⠺⣄⡀⠀⠠⡀⠀⠀⢠⡧⠄⠀⠘⢧ ")
38
- print(" ⡶⠋⠀⠀⠈⣠⣈⣩⠗⠒⠋⠀⠀⠀⠀⣀⣠⣆⡼⣷⣞⠛⠻⡉⠉⡟⠒⡛⣶⠧⣀⣀⣀⠀⠀⠀⠀⠈⠓⠺⢏⣉⣠⠋⠀⠀⠀⢢⣸ ")
39
- print(" ⠇⠐⠤⠤⠖⠁⣿⣀⣀⠀⠀⠀⠀⠀⠉⠁⠈⠉⠙⠛⢿⣷⡄⢣⡼⠀⣾⣿⠧⠒⠓⠚⠛⠉⠀⠀⠀⠀⠀⢀⣀⣾⡉⠓⠤⡤⠄⠸⢿ ")
40
- print(" ⣆⣤⠀⠀⠠⠀⠈⠓⠈⠓⠤⡀⠀⠀⠀⠀⠀⠀⠀⠀⠈⣿⣿⢸⠀⢸⣿⠇⠀⠀⠀⠀⠀⠀⠀⠀⢀⡤⠒⠁⠰⠃⠀⠠⠀⠀⢀⣀⠞ ")
41
- print(" ⠀⠉⠓⢲⣄⡈⢀⣠⠀⠀⠀⡸⠶⠂⠀⠀⢀⠀⠀⠤⠞⢻⡇⠀⠀⢘⡟⠑⠤⠄⠀⢀⠀⠀⠐⠲⢿⡀⠀⠀⢤⣀⢈⣀⡴⠖⠋⠀⠀ ")
42
- print(" ⠀⠀⠀⠀⠈⠉⠉⠙⠓⠒⣾⣁⣀⣴⠀⣀⠙⢧⠂⢀⣆⣀⣷⣤⣀⣾⣇⣀⡆⠀⢢⠛⢁⠀⢰⣀⣀⣹⠒⠒⠛⠉⠉⠉⠀⠀⠀⠀⠀ ")
43
- print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠉⠁⠈⠉⠉⠛⠉⠙⠉⠀⠀⣿⡟⣿⣿⠀⠀⠈⠉⠉⠙⠋⠉⠉⠀⠉⠁⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
44
- print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣿⡇⢻⣿⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
45
- print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣀⣤⣶⣾⣿⣿⠁⠀⢹⡛⣟⡶⢤⣀⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
46
- print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣴⠛⢯⣽⡟⢿⣿⠛⠿⠳⠞⠻⣿⠻⣆⢽⠟⣶⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
47
- print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠛⠃⠲⠯⠴⣦⣼⣷⣤⣤⣶⣤⣩⡧⠽⠷⠐⠛⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
48
- print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢀⣿⡇⠀⣿⡆⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
49
- print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢀⣀⣄⡀⢀⣀⣠⡾⡿⢡⢐⠻⣿⣄⣀⡀⠀⣀⣄⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
50
- print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢀⣤⢴⡏⠁⠀⠝⠉⣡⠟⣰⠃⢸⣿⠀⣷⠙⢧⡉⠻⡅⠀⠙⡷⢤⣀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
51
- print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⢀⣴⡟⠀⠈⣿⢄⡴⠞⠻⣄⣰⣡⠤⣞⣸⡤⢬⣧⣀⡿⠛⠦⣤⣶⡃⠀⢹⣦⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
52
- print(" ⠀⠀⠀⠀⠀⠀⢀⣴⣶⡿⠃⠉⢺⠁⠙⠒⠀⠀⣠⡉⠀⠉⠚⠉⠉⠑⠈⠀⠈⣧⠀⠀⠒⠋⠀⡹⠋⠀⢻⡶⠶⡄⠀⠀⠀⠀⠀⠀⠀ ")
53
- print(" ⠀⠀⠀⠀⠀⣠⣾⣿⣇⠁⢈⡦⠀⡍⠋⠁⡀⠸⡋⠀⠀⠀⢘⠏⠉⡏⠀⠀⠀⢉⡷⠀⡌⠉⠋⡇⠠⣏⠈⢁⣦⣿⣦⠀⠀⠀⠀⠀⠀ ")
54
- print(" ⠀⠀⠀⠀⠀⠉⣁⠀⠉⠉⠉⠙⠛⠛⠒⠚⠳⠤⢼⣤⣠⠤⣮⣠⣤⣼⠦⢤⣤⣿⠤⠾⠓⠒⠛⢓⠛⠉⠉⠉⠀⠈⠉⠀⠀⠀⠀⠀⠀ ")
55
- print("")
56
-
57
- fabricks = Database("fabricks")
58
- fabricks.drop()
59
- for s in steps:
60
- step = BaseStep(s)
61
- step.drop()
62
-
63
- tmp = FABRICKS_STORAGE.joinpath("tmp")
64
- tmp.rm()
65
-
66
- checkpoint = FABRICKS_STORAGE.joinpath("checkpoints")
67
- checkpoint.rm()
68
-
69
- schema = FABRICKS_STORAGE.joinpath("schemas")
70
- schema.rm()
71
-
72
- schedule = FABRICKS_STORAGE.joinpath("schedules")
73
- schedule.rm()
74
-
75
- fabricks.create()
76
-
77
- deploy.tables(drop=True)
78
- deploy.udfs()
79
-
80
- for s in steps:
81
- step = BaseStep(s)
82
- step.create()
83
-
84
- deploy.views()
85
-
86
- create_or_replace_views()
87
- create_or_replace_schedules_views()
@@ -1,51 +0,0 @@
1
- from pyspark.sql.types import Row
2
-
3
- from fabricks.cdc import NoCDC
4
- from fabricks.context import SPARK
5
- from fabricks.core.jobs.base._types import Steps
6
- from fabricks.utils.helpers import concat_dfs, run_in_parallel
7
-
8
-
9
- def collect_stats():
10
- def _collect_tables(s: str):
11
- df_table = SPARK.sql(f"show tables in {s}")
12
- df_view = SPARK.sql(f"show views in {s}")
13
-
14
- cond = [df_table.tableName == df_view.viewName]
15
- df_table = df_table.join(df_view, cond, how="left_anti")
16
-
17
- return df_table
18
-
19
- dfs = run_in_parallel(_collect_tables, Steps, workers=8)
20
- df_table = concat_dfs(dfs)
21
- assert df_table is not None
22
-
23
- def _collect_stats(row: Row):
24
- table = row["tableName"]
25
- database = row["database"]
26
- job = f"{database}.{table}"
27
-
28
- desc = SPARK.sql(f"describe detail {job}").collect()[0]
29
- bytes = desc["sizeInBytes"]
30
- files = desc["numFiles"]
31
-
32
- df = SPARK.sql(
33
- f"""
34
- select
35
- '{database}' as step,
36
- md5('{job}') as job_id,
37
- cast({bytes} as long) as bytes,
38
- cast({files} as long) as `files`,
39
- cast(count(*) as long) as `rows`
40
- from
41
- {job}
42
- """
43
- )
44
-
45
- return df
46
-
47
- dfs = run_in_parallel(_collect_stats, df_table, workers=64)
48
- df = concat_dfs(dfs)
49
- assert df is not None
50
-
51
- NoCDC("fabricks", "statistics").overwrite(df)
@@ -1,26 +0,0 @@
1
- from typing import Iterable
2
-
3
- from fabricks.cdc import NoCDC
4
- from fabricks.context import SPARK
5
- from fabricks.context.runtime import BRONZE, GOLD, SILVER
6
-
7
-
8
- def collect_steps():
9
- steps = []
10
-
11
- def _collect(expand: str, iterable: Iterable):
12
- for i in iterable:
13
- steps.append(
14
- {
15
- "expand": expand,
16
- "step": i.get("name"),
17
- "order": i.get("options", {}).get("order", 0),
18
- },
19
- )
20
-
21
- _collect("bronze", BRONZE)
22
- _collect("silver", SILVER)
23
- _collect("gold", GOLD)
24
-
25
- df = SPARK.createDataFrame(steps)
26
- NoCDC("fabricks", "steps").overwrite(df)