fabricks 3.0.18-py3-none-any.whl → 4.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. fabricks/api/context.py +15 -3
  2. fabricks/api/notebooks/schedule.py +2 -3
  3. fabricks/api/parsers.py +2 -1
  4. fabricks/api/utils.py +3 -1
  5. fabricks/cdc/__init__.py +1 -2
  6. fabricks/cdc/base/__init__.py +1 -2
  7. fabricks/cdc/base/_types.py +5 -3
  8. fabricks/cdc/base/configurator.py +5 -0
  9. fabricks/cdc/base/generator.py +7 -3
  10. fabricks/cdc/base/merger.py +2 -0
  11. fabricks/cdc/base/processor.py +15 -0
  12. fabricks/cdc/templates/README.md +490 -0
  13. fabricks/cdc/templates/ctes/base.sql.jinja +1 -0
  14. fabricks/cdc/templates/ctes/current.sql.jinja +4 -0
  15. fabricks/cdc/templates/merges/scd1.sql.jinja +6 -0
  16. fabricks/cdc/templates/merges/scd2.sql.jinja +6 -0
  17. fabricks/cdc/templates/queries/context.sql.jinja +104 -96
  18. fabricks/cdc/templates/query.sql.jinja +1 -1
  19. fabricks/context/__init__.py +13 -1
  20. fabricks/context/config.py +13 -122
  21. fabricks/context/log.py +92 -1
  22. fabricks/context/runtime.py +35 -69
  23. fabricks/context/spark_session.py +8 -7
  24. fabricks/context/utils.py +26 -39
  25. fabricks/core/__init__.py +2 -2
  26. fabricks/core/dags/base.py +5 -5
  27. fabricks/core/dags/processor.py +2 -3
  28. fabricks/core/extenders.py +1 -1
  29. fabricks/core/job_schema.py +26 -16
  30. fabricks/core/jobs/__init__.py +1 -7
  31. fabricks/core/jobs/base/README.md +1545 -0
  32. fabricks/core/jobs/base/__init__.py +1 -8
  33. fabricks/core/jobs/base/checker.py +7 -7
  34. fabricks/core/jobs/base/configurator.py +142 -63
  35. fabricks/core/jobs/base/generator.py +38 -34
  36. fabricks/core/jobs/base/invoker.py +48 -63
  37. fabricks/core/jobs/base/processor.py +13 -28
  38. fabricks/core/jobs/bronze.py +88 -38
  39. fabricks/core/jobs/get_job.py +3 -6
  40. fabricks/core/jobs/get_job_conf.py +19 -68
  41. fabricks/core/jobs/get_jobs.py +10 -11
  42. fabricks/core/jobs/get_schedules.py +3 -17
  43. fabricks/core/jobs/gold.py +96 -43
  44. fabricks/core/jobs/silver.py +42 -22
  45. fabricks/core/masks.py +11 -8
  46. fabricks/core/parsers/__init__.py +0 -2
  47. fabricks/core/parsers/base.py +10 -10
  48. fabricks/core/parsers/decorator.py +1 -1
  49. fabricks/core/parsers/get_parser.py +4 -5
  50. fabricks/core/schedules/process.py +1 -4
  51. fabricks/core/steps/base.py +27 -17
  52. fabricks/core/steps/get_step.py +2 -4
  53. fabricks/core/steps/get_step_conf.py +3 -7
  54. fabricks/core/udfs.py +9 -8
  55. fabricks/core/views.py +2 -2
  56. fabricks/deploy/__init__.py +27 -16
  57. fabricks/deploy/masks.py +1 -1
  58. fabricks/deploy/notebooks.py +19 -16
  59. fabricks/deploy/schedules.py +1 -1
  60. fabricks/deploy/tables.py +66 -49
  61. fabricks/deploy/udfs.py +2 -2
  62. fabricks/deploy/views.py +15 -16
  63. fabricks/metastore/database.py +3 -3
  64. fabricks/metastore/table.py +103 -68
  65. fabricks/models/__init__.py +125 -0
  66. fabricks/models/common.py +79 -0
  67. fabricks/models/config.py +225 -0
  68. fabricks/models/dependency.py +50 -0
  69. fabricks/models/job.py +157 -0
  70. fabricks/models/path.py +17 -0
  71. fabricks/models/runtime.py +182 -0
  72. fabricks/models/schedule.py +21 -0
  73. fabricks/models/step.py +103 -0
  74. fabricks/models/table.py +77 -0
  75. fabricks/{core/jobs/get_job_id.py → models/utils.py} +2 -0
  76. fabricks/utils/helpers.py +6 -5
  77. fabricks/utils/log.py +25 -6
  78. fabricks/utils/path.py +269 -102
  79. fabricks/utils/pip.py +7 -7
  80. fabricks/utils/read/read.py +23 -22
  81. fabricks/utils/read/read_yaml.py +2 -2
  82. fabricks/utils/write/delta.py +4 -4
  83. fabricks/utils/write/stream.py +2 -2
  84. {fabricks-3.0.18.dist-info → fabricks-4.0.0.dist-info}/METADATA +9 -4
  85. {fabricks-3.0.18.dist-info → fabricks-4.0.0.dist-info}/RECORD +86 -83
  86. fabricks/context/_types.py +0 -137
  87. fabricks/context/helpers.py +0 -63
  88. fabricks/core/jobs/base/_types.py +0 -284
  89. fabricks/core/parsers/_types.py +0 -6
  90. fabricks/utils/fdict.py +0 -240
  91. fabricks/utils/pydantic.py +0 -94
  92. fabricks/utils/schema/__init__.py +0 -7
  93. fabricks/utils/schema/get_json_schema_for_type.py +0 -161
  94. fabricks/utils/schema/get_schema_for_type.py +0 -99
  95. {fabricks-3.0.18.dist-info → fabricks-4.0.0.dist-info}/WHEEL +0 -0
fabricks/core/parsers/decorator.py CHANGED
@@ -1,7 +1,7 @@
  from typing import Callable, Optional
 
- from fabricks.core.parsers._types import ParserOptions
  from fabricks.core.parsers.base import PARSERS, BaseParser
+ from fabricks.models import ParserOptions
 
 
  def parser(name: str):
fabricks/core/parsers/get_parser.py CHANGED
@@ -1,12 +1,12 @@
- from typing import Optional
+ from typing import Callable, Optional
 
  from fabricks.context import PATH_PARSERS
- from fabricks.core.parsers._types import ParserOptions
  from fabricks.core.parsers.base import PARSERS, BaseParser
+ from fabricks.models import ParserOptions
  from fabricks.utils.helpers import load_module_from_path
 
 
- def get_parser(name: str, parser_options: Optional[ParserOptions] = None) -> BaseParser:
+ def get_parser(name: str, parser_options: Optional[ParserOptions] = None) -> Callable:
      if name not in ["json", "parquet", "avro", "csv", "tsv", "delta", "table"]:
          path = PATH_PARSERS.joinpath(name).append(".py")
          assert path.exists(), f"parser not found ({path})"
@@ -17,5 +17,4 @@ def get_parser(name: str, parser_options: Optional[ParserOptions] = None) -> BaseParser:
      else:
          parser = BaseParser(parser_options, name)
 
-     assert parser
-     return parser
+     return parser.get_data
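Note on the get_parser change above: in 3.x the function returned the BaseParser instance, while 4.0 returns its get_data method. A minimal, hypothetical caller sketch; the arguments of get_data are not part of this diff and are left out:

    from fabricks.core.parsers.get_parser import get_parser

    # 3.x: parser = get_parser("csv"); parser.get_data(<args>)
    # 4.0: the callable is returned directly
    get_data = get_parser("csv")  # "csv" is one of the built-in parser names checked above
    # get_data is then invoked with whatever arguments BaseParser.get_data expects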
fabricks/core/schedules/process.py CHANGED
@@ -1,9 +1,6 @@
- from typing import Union
-
  from fabricks.core.dags.processor import DagProcessor
- from fabricks.core.jobs.base._types import TStep
 
 
- def process(schedule_id: str, schedule: str, step: Union[TStep, str]):
+ def process(schedule_id: str, schedule: str, step: str):
      with DagProcessor(schedule_id=schedule_id, schedule=schedule, step=step) as p:
          p.process()
fabricks/core/steps/base.py CHANGED
@@ -4,24 +4,34 @@ from typing import Dict, Iterable, List, Literal, Optional, Tuple, Union, cast
  from pyspark.sql import DataFrame
  from pyspark.sql.functions import expr, md5
  from pyspark.sql.types import Row
+ from sparkdantic import create_spark_schema
  from typing_extensions import deprecated
 
  from fabricks.cdc import NoCDC
- from fabricks.context import CONF_RUNTIME, LOGLEVEL, PATHS_RUNTIME, PATHS_STORAGE, SPARK, STEPS
+ from fabricks.context import (
+     CONF_RUNTIME,
+     LOGLEVEL,
+     PATHS_RUNTIME,
+     PATHS_STORAGE,
+     SPARK,
+     STEPS,
+     Bronzes,
+     Golds,
+     Silvers,
+ )
  from fabricks.context.log import DEFAULT_LOGGER
- from fabricks.core.jobs.base._types import Bronzes, Golds, SchemaDependencies, Silvers, TStep
  from fabricks.core.jobs.get_job import get_job
  from fabricks.core.steps._types import Timeouts
  from fabricks.core.steps.get_step_conf import get_step_conf
  from fabricks.metastore.database import Database
  from fabricks.metastore.table import Table
+ from fabricks.models import SchemaDependencies, StepBronzeOptions, StepGoldOptions, StepSilverOptions
  from fabricks.utils.helpers import run_in_parallel
  from fabricks.utils.read.read_yaml import read_yaml
- from fabricks.utils.schema import get_schema_for_type
 
 
  class BaseStep:
-     def __init__(self, step: Union[TStep, str]):
+     def __init__(self, step: str):
          self.name = cast(str, step)
 
          if self.name in Bronzes:
@@ -45,7 +55,7 @@ class BaseStep:
          self.database = Database(self.name)
 
      _conf: Optional[dict] = None
-     _options: Optional[dict] = None
+     _options: Optional[Union[StepBronzeOptions, StepSilverOptions, StepGoldOptions]] = None
 
      _workers: Optional[int] = None
      _timeouts: Optional[Timeouts] = None
@@ -53,18 +63,18 @@ class BaseStep:
      @property
      def workers(self):
          if not self._workers:
-             w = self.options.get("workers")
+             w = self.options.workers
              if w is None:
-                 w = CONF_RUNTIME.get("options", {}).get("workers")
+                 w = CONF_RUNTIME.options.workers
              assert w is not None
              self._workers = cast(int, w)
 
          return self._workers
 
      def _get_timeout(self, what: str) -> int:
-         t = self.options.get("timeouts", {}).get(what, None)
+         t = getattr(self.options.timeouts, what, None)
          if t is None:
-             t = CONF_RUNTIME.get("options", {}).get("timeouts", {}).get(what)
+             t = getattr(CONF_RUNTIME.options.timeouts, what)
          assert t is not None
 
          return int(t)
@@ -82,18 +92,18 @@ class BaseStep:
      @property
      def conf(self) -> dict:
          if not self._conf:
-             _conf = [s for s in STEPS if s.get("name") == self.name][0]
+             _conf = [s for s in STEPS if s.name == self.name][0]
              assert _conf is not None
-             self._conf = cast(dict[str, str], _conf)
+             self._conf = _conf.model_dump()
 
          return self._conf
 
      @property
-     def options(self) -> dict:
+     def options(self):
          if not self._options:
-             o = self.conf.get("options")
-             assert o is not None
-             self._options = cast(dict[str, str], o)
+             _step = [s for s in STEPS if s.name == self.name][0]
+             assert _step is not None
+             self._options = _step.options
 
          return self._options
 
@@ -209,7 +219,7 @@ class BaseStep:
 
          try:
              conf = get_step_conf(self.name)
-             schema = get_schema_for_type(conf)
+             schema = create_spark_schema(conf)
              jobs = self.get_jobs_iter(topic=topic)
 
              df = SPARK.createDataFrame(jobs, schema=schema)  # type: ignore
@@ -392,7 +402,7 @@ class BaseStep:
          DEFAULT_LOGGER.setLevel(LOGLEVEL)
 
      def update_steps_list(self):
-         order = self.options.get("order", 0)
+         order = self.options.order or 0
          df = SPARK.sql(f"select '{self.expand}' as expand, '{self.name}' as step, '{order}' :: int as `order`")
 
          NoCDC("fabricks", "steps").delete_missing(df, keys=["step"], update_where=f"step = '{self.name}'")
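The BaseStep changes above replace dict lookups with attribute access on typed models from fabricks.models. A rough sketch of the new lookup pattern, assuming STEPS holds model instances exposing .name and .options as the diff suggests (the step name "bronze" is a placeholder, not taken from this diff):

    from fabricks.context import CONF_RUNTIME, STEPS

    step_conf = [s for s in STEPS if s.name == "bronze"][0]  # placeholder step name
    workers = step_conf.options.workers
    if workers is None:
        workers = CONF_RUNTIME.options.workers  # runtime-level default, as in BaseStep.workers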
fabricks/core/steps/get_step.py CHANGED
@@ -1,10 +1,8 @@
- from typing import Union
-
- from fabricks.core.jobs.base._types import Steps, TStep
+ from fabricks.context import Steps
  from fabricks.core.steps.base import BaseStep
 
 
- def get_step(step: Union[TStep, str]) -> BaseStep:
+ def get_step(step: str) -> BaseStep:
      assert step in Steps, f"{step} not found"
      base_step = BaseStep(step=step)
      return base_step
fabricks/core/steps/get_step_conf.py CHANGED
@@ -1,12 +1,8 @@
- from typing import Union, cast
+ from fabricks.context import Bronzes, Golds, Silvers
+ from fabricks.models import JobConfBronze, JobConfGold, JobConfSilver
 
- from fabricks.core.jobs.base._types import Bronzes, Golds, JobConfBronze, JobConfGold, JobConfSilver, Silvers, TStep
-
-
- def get_step_conf(step: Union[TStep, str]):
-     if isinstance(step, str):
-         step = cast(TStep, step)
 
+ def get_step_conf(step: str):
      if step in Bronzes:
          expand = "bronze"
      elif step in Silvers:
fabricks/core/udfs.py CHANGED
@@ -5,26 +5,27 @@ from typing import Callable, List, Optional
 
  from pyspark.sql import SparkSession
 
- from fabricks.context import CATALOG, IS_UNITY_CATALOG, PATH_UDFS, SPARK, CONF_RUNTIME
+ from fabricks.context import CATALOG, CONF_RUNTIME, IS_UNITY_CATALOG, PATH_UDFS, SPARK
  from fabricks.context.log import DEFAULT_LOGGER
 
  UDFS: dict[str, Callable] = {}
 
- udf_schema = CONF_RUNTIME.get("udf_options", {}).get("schema", "default")
- udf_prefix = CONF_RUNTIME.get("udf_options", {}).get("prefix", "udf_")
+ UDF_SCHEMA = CONF_RUNTIME.udf_options.schema_name or "default" if CONF_RUNTIME.udf_options else "default"
+ UDF_PREFIX = CONF_RUNTIME.udf_options.prefix or "udf_" if CONF_RUNTIME.udf_options else "udf_"
+
 
 
  def register_all_udfs(extension: Optional[str] = None, override: bool = False):
      """
      Register all user-defined functions (UDFs).
      """
-     DEFAULT_LOGGER.info("register udfs")
+     DEFAULT_LOGGER.info("register udfs", extra={"label": "fabricks"})
 
      for udf in get_udfs(extension=extension):
          split = udf.split(".")
          try:
              register_udf(udf=split[0], extension=split[1], override=override)
          except Exception as e:
-             DEFAULT_LOGGER.exception(f"could not register udf {udf}", exc_info=e)
+             DEFAULT_LOGGER.exception(f"could not register udf {udf}", exc_info=e, extra={"label": "fabricks"})
 
 
  def get_udfs(extension: Optional[str] = None) -> List[str]:
@@ -49,12 +50,12 @@ def is_registered(udf: str, spark: Optional[SparkSession] = None) -> bool:
          spark = SPARK
      assert spark is not None
 
-     df = spark.sql(f"show user functions in {udf_schema}")
+     df = spark.sql(f"show user functions in {UDF_SCHEMA}")
 
      if CATALOG:
-         df = df.where(f"function == '{CATALOG}.{udf_schema}.{udf_prefix}{udf}'")
+         df = df.where(f"function == '{CATALOG}.{UDF_SCHEMA}.{UDF_PREFIX}{udf}'")
      else:
-         df = df.where(f"function == 'spark_catalog.{udf_schema}.{udf_prefix}{udf}'")
+         df = df.where(f"function == 'spark_catalog.{UDF_SCHEMA}.{UDF_PREFIX}{udf}'")
 
      return not df.isEmpty()
 
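In the UDF module, the schema and prefix now come from CONF_RUNTIME.udf_options, which may be None. The new one-liners rely on conditional-expression precedence (the `or` binds before the `if`/`else`); a spelled-out equivalent, shown only as a readability sketch:

    if CONF_RUNTIME.udf_options:
        UDF_SCHEMA = CONF_RUNTIME.udf_options.schema_name or "default"
    else:
        UDF_SCHEMA = "default"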
fabricks/core/views.py CHANGED
@@ -1,10 +1,10 @@
  from fabricks.context import PATH_VIEWS, SPARK
  from fabricks.context.log import DEFAULT_LOGGER
- from fabricks.utils.path import Path
+ from fabricks.utils.path import GitPath
  from fabricks.utils.sqlglot import fix as fix_sql
 
 
- def create_or_replace_view_internal(path: Path):
+ def create_or_replace_view_internal(path: GitPath):
      sql = path.get_sql()
      file_name = path.get_file_name().split(".")[0]
 
fabricks/deploy/__init__.py CHANGED
@@ -1,10 +1,9 @@
  import logging
- from typing import List, Optional, Union, cast
+ from typing import Optional, Union
 
- from fabricks.context import FABRICKS_STORAGE
+ from fabricks.context import FABRICKS_STORAGE, Steps
  from fabricks.context.log import DEFAULT_LOGGER
- from fabricks.core.jobs.base._types import Steps, TStep
- from fabricks.core.steps.base import BaseStep
+ from fabricks.core.steps import get_step
  from fabricks.deploy.masks import deploy_masks
  from fabricks.deploy.notebooks import deploy_notebooks
  from fabricks.deploy.schedules import deploy_schedules
@@ -17,8 +16,8 @@ from fabricks.metastore.database import Database
 
  class Deploy:
      @staticmethod
-     def tables(drop: bool = False):
-         deploy_tables(drop=drop)
+     def tables(drop: bool = False, update: bool = False):
+         deploy_tables(drop=drop, update=update)
 
      @staticmethod
      def views():
@@ -33,16 +32,30 @@ class Deploy:
          deploy_masks(override=override)
 
      @staticmethod
-     def notebooks():
-         deploy_notebooks()
+     def notebooks(override: bool = False):
+         deploy_notebooks(overwrite=override)
 
      @staticmethod
      def schedules():
          deploy_schedules()
 
      @staticmethod
-     def armageddon(steps: Optional[Union[TStep, List[TStep], str, List[str]]], nowait: bool = False):
-         DEFAULT_LOGGER.warning("!💥 armageddon 💥!")
+     def step(step: str):
+         Deploy.tables()
+         s = get_step(step)
+         s.create()
+
+         Deploy.views()
+         Deploy.schedules()
+
+     @staticmethod
+     def job(step: str):
+         s = get_step(step)
+         s.create()
+
+     @staticmethod
+     def armageddon(steps: Optional[Union[str, list[str]]] = None, nowait: bool = False):
+         DEFAULT_LOGGER.warning("!💥 armageddon 💥!", extra={"label": "fabricks"})
          print_atomic_bomb(nowait=nowait)
 
          DEFAULT_LOGGER.setLevel(logging.INFO)
@@ -52,17 +65,15 @@ class Deploy:
          assert steps is not None
 
          if isinstance(steps, str):
-             steps = [cast(TStep, steps)]
-         elif isinstance(steps, List):
-             steps = [cast(TStep, s) for s in steps]
-         elif isinstance(steps, TStep):
              steps = [steps]
+         elif isinstance(steps, list):
+             steps = [s for s in steps]
 
          fabricks = Database("fabricks")
          fabricks.drop()
 
          for s in steps:
-             step = BaseStep(s)
+             step = get_step(s)
              step.drop()
 
          tmp = FABRICKS_STORAGE.joinpath("tmp")
@@ -85,7 +96,7 @@ class Deploy:
          Deploy.notebooks()
 
          for s in steps:
-             step = BaseStep(s)
+             step = get_step(s)
              step.create()
 
          Deploy.views()
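Taken together, the Deploy changes above add an update flag to tables() plus two new helpers. A hedged usage sketch based only on the new signatures shown (the step name "bronze" is a placeholder):

    from fabricks.deploy import Deploy

    Deploy.tables(update=True)   # create missing fabricks tables, overwrite the schema of existing ones
    Deploy.step("bronze")        # tables, then step.create(), then views and schedules
    Deploy.job("bronze")         # only step.create() for the given step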
fabricks/deploy/masks.py CHANGED
@@ -3,6 +3,6 @@ from fabricks.core.masks import register_all_masks
 
 
  def deploy_masks(override: bool = True):
-     DEFAULT_LOGGER.info("create or replace masks")
+     DEFAULT_LOGGER.info("create or replace masks", extra={"label": "fabricks"})
 
      register_all_masks(override=override)
fabricks/deploy/notebooks.py CHANGED
@@ -13,7 +13,7 @@ from fabricks.context.log import DEFAULT_LOGGER
  def deploy_notebook(notebook: str):
      from fabricks.api import notebooks
 
-     DEFAULT_LOGGER.debug(f"overwrite {notebook}")
+     DEFAULT_LOGGER.debug(f"overwrite {notebook}", extra={"label": "fabricks"})
 
      w = WorkspaceClient()
 
@@ -34,21 +34,24 @@ def deploy_notebook(notebook: str):
      )
 
 
- def deploy_notebooks():
-     DEFAULT_LOGGER.info("overwrite notebooks")
-
-     _create_dir_if_not_exists()
-     _clean_dir()
-
-     for n in [
-         "cluster",
-         "initialize",
-         "process",
-         "schedule",
-         "run",
-         "terminate",
-     ]:
-         deploy_notebook(notebook=n)
+ def deploy_notebooks(overwrite: bool = False):
+     if overwrite:
+         DEFAULT_LOGGER.warning("overwrite notebooks", extra={"label": "fabricks"})
+
+         _create_dir_if_not_exists()
+         _clean_dir()
+
+         for n in [
+             "cluster",
+             "initialize",
+             "process",
+             "schedule",
+             "run",
+             "terminate",
+         ]:
+             deploy_notebook(notebook=n)
+     else:
+         DEFAULT_LOGGER.info("deploy notebooks skipped (overwrite=False)", extra={"label": "fabricks"})
 
 
  def _create_dir_if_not_exists():
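As the hunk above shows, deploy_notebooks is now a no-op unless overwrite=True. A minimal sketch of the two call paths:

    from fabricks.deploy.notebooks import deploy_notebooks

    deploy_notebooks()                # logs "deploy notebooks skipped (overwrite=False)" and returns
    deploy_notebooks(overwrite=True)  # recreates the target folder and overwrites the six built-in notebooks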
fabricks/deploy/schedules.py CHANGED
@@ -4,7 +4,7 @@ from fabricks.core.views import create_or_replace_views as create_or_replace_custom_views
 
 
  def deploy_schedules():
-     DEFAULT_LOGGER.info("create or replace schedules")
+     DEFAULT_LOGGER.info("create or replace schedules", extra={"label": "fabricks"})
 
      create_or_replace_custom_views()
      create_or_replace_views()
fabricks/deploy/tables.py CHANGED
@@ -1,4 +1,4 @@
- from pyspark.sql.types import LongType, StringType, StructField, StructType, TimestampType
+ from pyspark.sql.types import LongType, StringType, StructField, StructType, TimestampType, VariantType
 
  from fabricks.cdc import NoCDC
  from fabricks.context import SPARK
@@ -6,77 +6,94 @@ from fabricks.context.log import DEFAULT_LOGGER
  from fabricks.metastore.table import Table
 
 
- def deploy_tables(drop: bool = False):
-     DEFAULT_LOGGER.info("create or replace fabricks (default) tables")
+ def deploy_tables(drop: bool = False, update: bool = False):
+     DEFAULT_LOGGER.info("create or replace fabricks (default) tables", extra={"label": "fabricks"})
 
-     create_table_log(drop=drop)
-     create_table_dummy(drop=drop)
-     create_table_step(drop=drop)
+     create_table_log(drop=drop, update=update)
+     create_table_dummy(drop=drop, update=update)
+     create_table_step(drop=drop, update=update)
 
 
- def create_table_step(drop: bool = False):
+ def create_table_step(drop: bool = False, update: bool = False):
      table = Table("fabricks", "steps")
+     schema = StructType(
+         [
+             StructField("step", StringType(), True),
+             StructField("expand", StringType(), True),
+             StructField("order", LongType(), True),
+         ]
+     )
+
      if drop:
          table.drop()
 
      if not table.exists():
-         schema = StructType(
-             [
-                 StructField("step", StringType(), True),
-                 StructField("expand", StringType(), True),
-                 StructField("order", LongType(), True),
-             ]
+         table.create(
+             schema=schema,
+             partitioning=True,
+             partition_by=["expand"],
          )
-         table.create(schema=schema, partitioning=True, partition_by=["expand"])
+     elif update:
+         table.overwrite_schema(schema=schema)
 
 
- def create_table_log(drop: bool = False):
+ def create_table_log(drop: bool = False, update: bool = False):
      table = Table("fabricks", "logs")
+     schema = StructType(
+         [
+             StructField("schedule_id", StringType(), True),
+             StructField("schedule", StringType(), True),
+             StructField("step", StringType(), True),
+             StructField("job_id", StringType(), True),
+             StructField("job", StringType(), True),
+             StructField("notebook_id", StringType(), True),
+             StructField("level", StringType(), True),
+             StructField("status", StringType(), True),
+             StructField("timestamp", TimestampType(), True),
+             StructField(
+                 "exception",
+                 StructType(
+                     [
+                         StructField("type", StringType(), True),
+                         StructField("message", StringType(), True),
+                         StructField("traceback", StringType(), True),
+                     ]
+                 ),
+                 True,
+             ),
+             StructField("json", VariantType(), True),
+         ]
+     )
+
      if drop:
          table.drop()
 
      if not table.exists():
-         schema = StructType(
-             [
-                 StructField("schedule_id", StringType(), True),
-                 StructField("schedule", StringType(), True),
-                 StructField("step", StringType(), True),
-                 StructField("job_id", StringType(), True),
-                 StructField("job", StringType(), True),
-                 StructField("notebook_id", StringType(), True),
-                 StructField("level", StringType(), True),
-                 StructField("status", StringType(), True),
-                 StructField("timestamp", TimestampType(), True),
-                 StructField(
-                     "exception",
-                     StructType(
-                         [
-                             StructField("type", StringType(), True),
-                             StructField("message", StringType(), True),
-                             StructField("traceback", StringType(), True),
-                         ]
-                     ),
-                     True,
-                 ),
-             ]
+         table.create(
+             schema=schema,
+             partitioning=True,
+             partition_by=["schedule_id", "step"],
          )
-         table.create(schema=schema, partitioning=True, partition_by=["schedule_id", "step"])
+     elif update:
+         table.overwrite_schema(schema=schema)
 
 
- def create_table_dummy(drop: bool = False):
+ def create_table_dummy(drop: bool = False, update: bool = False):
      cdc = NoCDC("fabricks", "dummy")
+     df = SPARK.sql(
+         """
+         select
+             1 as __key,
+             md5('1') as __hash,
+             cast('1900-01-01' as timestamp) as __valid_from,
+             cast('9999-12-31' as timestamp) as __valid_to
+         """
+     )
 
      if drop:
          cdc.drop()
 
      if not cdc.table.exists():
-         df = SPARK.sql(
-             """
-             select
-                 1 as __key,
-                 md5('1') as __hash,
-                 cast('1900-01-01' as timestamp) as __valid_from,
-                 cast('9999-12-31' as timestamp) as __valid_to
-             """
-         )
          cdc.overwrite(df)
+     elif update:
+         cdc.overwrite_schema(df)
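The update flag threads through the three create_table_* helpers above. A hedged usage sketch of the resulting behaviour, using only names shown in this diff:

    from fabricks.deploy.tables import deploy_tables

    deploy_tables()             # unchanged: create fabricks.logs / fabricks.steps / fabricks.dummy if missing
    deploy_tables(update=True)  # existing tables are aligned via overwrite_schema, e.g. to pick up the new variant `json` column
    deploy_tables(drop=True)    # drop and recreate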
fabricks/deploy/udfs.py CHANGED
@@ -5,7 +5,7 @@ from fabricks.utils.sqlglot import fix as fix_sql
 
 
  def deploy_udfs(override: bool = True):
-     DEFAULT_LOGGER.info("create or replace udfs")
+     DEFAULT_LOGGER.info("create or replace udfs", extra={"label": "fabricks"})
 
      register_all_udfs(extension="sql", override=override)
      create_or_replace_udf_job_id()
@@ -15,5 +15,5 @@ def create_or_replace_udf_job_id():
      sql = "create or replace function fabricks.udf_job_id(job string) returns string return md5(job)"
      sql = fix_sql(sql)
 
-     DEFAULT_LOGGER.debug("create or replace fabricks.udf_job_id", extra={"sql": sql})
+     DEFAULT_LOGGER.debug("create or replace fabricks.udf_job_id", extra={"sql": sql, "label": "fabricks"})
      SPARK.sql(sql)
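Many log calls in this release pass extra={"label": "fabricks"}. A hedged sketch of how such an extra field can surface in output, assuming a formatter that references it (the real formatter lives in fabricks/context/log.py, which also changed in this release but is not shown here):

    import logging

    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter("%(label)s | %(levelname)s | %(message)s"))
    logger = logging.getLogger("fabricks-demo")  # hypothetical logger name, not from the diff
    logger.addHandler(handler)
    logger.warning("create or replace udfs", extra={"label": "fabricks"})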