fabricks 3.0.5.2__py3-none-any.whl → 3.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. fabricks/api/__init__.py +2 -0
  2. fabricks/api/context.py +1 -2
  3. fabricks/api/deploy.py +3 -0
  4. fabricks/api/job_schema.py +2 -2
  5. fabricks/api/masks.py +3 -0
  6. fabricks/api/notebooks/initialize.py +2 -2
  7. fabricks/api/notebooks/process.py +2 -2
  8. fabricks/api/notebooks/run.py +2 -2
  9. fabricks/api/notebooks/schedule.py +75 -0
  10. fabricks/api/notebooks/terminate.py +2 -2
  11. fabricks/api/schedules.py +2 -16
  12. fabricks/cdc/__init__.py +2 -2
  13. fabricks/cdc/base/__init__.py +2 -2
  14. fabricks/cdc/base/_types.py +9 -2
  15. fabricks/cdc/base/configurator.py +86 -41
  16. fabricks/cdc/base/generator.py +44 -35
  17. fabricks/cdc/base/merger.py +16 -14
  18. fabricks/cdc/base/processor.py +232 -144
  19. fabricks/cdc/nocdc.py +8 -7
  20. fabricks/cdc/templates/{query → ctes}/base.sql.jinja +7 -6
  21. fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
  22. fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
  23. fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
  24. fabricks/cdc/templates/{query → ctes}/rectify.sql.jinja +4 -22
  25. fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
  26. fabricks/cdc/templates/filter.sql.jinja +4 -4
  27. fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
  28. fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
  29. fabricks/cdc/templates/merge.sql.jinja +3 -2
  30. fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
  31. fabricks/cdc/templates/queries/context.sql.jinja +186 -0
  32. fabricks/cdc/templates/{query/nocdc.sql.jinja → queries/nocdc/complete.sql.jinja} +1 -1
  33. fabricks/cdc/templates/queries/nocdc/update.sql.jinja +35 -0
  34. fabricks/cdc/templates/{query → queries}/scd1.sql.jinja +2 -28
  35. fabricks/cdc/templates/{query → queries}/scd2.sql.jinja +29 -48
  36. fabricks/cdc/templates/query.sql.jinja +15 -11
  37. fabricks/context/__init__.py +18 -4
  38. fabricks/context/_types.py +2 -0
  39. fabricks/context/config/__init__.py +92 -0
  40. fabricks/context/config/utils.py +53 -0
  41. fabricks/context/log.py +8 -2
  42. fabricks/context/runtime.py +87 -263
  43. fabricks/context/secret.py +1 -1
  44. fabricks/context/spark_session.py +1 -1
  45. fabricks/context/utils.py +76 -0
  46. fabricks/core/dags/generator.py +6 -7
  47. fabricks/core/dags/log.py +2 -15
  48. fabricks/core/dags/processor.py +11 -11
  49. fabricks/core/dags/utils.py +15 -1
  50. fabricks/core/{scripts/job_schema.py → job_schema.py} +4 -0
  51. fabricks/core/jobs/base/_types.py +64 -22
  52. fabricks/core/jobs/base/checker.py +13 -12
  53. fabricks/core/jobs/base/configurator.py +41 -67
  54. fabricks/core/jobs/base/generator.py +55 -24
  55. fabricks/core/jobs/base/invoker.py +54 -30
  56. fabricks/core/jobs/base/processor.py +43 -26
  57. fabricks/core/jobs/bronze.py +45 -38
  58. fabricks/core/jobs/get_jobs.py +2 -2
  59. fabricks/core/jobs/get_schedule.py +10 -0
  60. fabricks/core/jobs/get_schedules.py +32 -0
  61. fabricks/core/jobs/gold.py +61 -48
  62. fabricks/core/jobs/silver.py +39 -40
  63. fabricks/core/masks.py +52 -0
  64. fabricks/core/parsers/base.py +2 -2
  65. fabricks/core/schedules/__init__.py +14 -0
  66. fabricks/core/schedules/diagrams.py +46 -0
  67. fabricks/core/schedules/get_schedule.py +5 -0
  68. fabricks/core/schedules/get_schedules.py +9 -0
  69. fabricks/core/schedules/run.py +3 -0
  70. fabricks/core/schedules/views.py +61 -0
  71. fabricks/core/steps/base.py +110 -72
  72. fabricks/core/udfs.py +12 -23
  73. fabricks/core/views.py +20 -13
  74. fabricks/deploy/__init__.py +97 -0
  75. fabricks/deploy/masks.py +8 -0
  76. fabricks/deploy/notebooks.py +71 -0
  77. fabricks/deploy/schedules.py +8 -0
  78. fabricks/{core/deploy → deploy}/tables.py +16 -13
  79. fabricks/{core/deploy → deploy}/udfs.py +3 -1
  80. fabricks/deploy/utils.py +36 -0
  81. fabricks/{core/deploy → deploy}/views.py +5 -9
  82. fabricks/metastore/database.py +3 -3
  83. fabricks/metastore/dbobject.py +4 -4
  84. fabricks/metastore/table.py +157 -88
  85. fabricks/metastore/view.py +13 -6
  86. fabricks/utils/_types.py +6 -0
  87. fabricks/utils/azure_table.py +4 -3
  88. fabricks/utils/helpers.py +141 -11
  89. fabricks/utils/log.py +29 -18
  90. fabricks/utils/read/_types.py +1 -1
  91. fabricks/utils/schema/get_schema_for_type.py +6 -0
  92. fabricks/utils/write/delta.py +3 -3
  93. {fabricks-3.0.5.2.dist-info → fabricks-3.0.6.dist-info}/METADATA +2 -1
  94. fabricks-3.0.6.dist-info/RECORD +175 -0
  95. fabricks/api/notebooks/add_fabricks.py +0 -13
  96. fabricks/api/notebooks/optimize.py +0 -29
  97. fabricks/api/notebooks/vacuum.py +0 -29
  98. fabricks/cdc/templates/query/context.sql.jinja +0 -101
  99. fabricks/cdc/templates/query/current.sql.jinja +0 -32
  100. fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +0 -21
  101. fabricks/cdc/templates/query/deduplicate_key.sql.jinja +0 -14
  102. fabricks/cdc/templates/query/hash.sql.jinja +0 -1
  103. fabricks/cdc/templates/query/slice.sql.jinja +0 -14
  104. fabricks/config/__init__.py +0 -0
  105. fabricks/config/base.py +0 -8
  106. fabricks/config/fabricks/__init__.py +0 -26
  107. fabricks/config/fabricks/base.py +0 -90
  108. fabricks/config/fabricks/environment.py +0 -9
  109. fabricks/config/fabricks/pyproject.py +0 -47
  110. fabricks/config/jobs/__init__.py +0 -6
  111. fabricks/config/jobs/base.py +0 -101
  112. fabricks/config/jobs/bronze.py +0 -38
  113. fabricks/config/jobs/gold.py +0 -27
  114. fabricks/config/jobs/silver.py +0 -22
  115. fabricks/config/runtime.py +0 -67
  116. fabricks/config/steps/__init__.py +0 -6
  117. fabricks/config/steps/base.py +0 -50
  118. fabricks/config/steps/bronze.py +0 -7
  119. fabricks/config/steps/gold.py +0 -14
  120. fabricks/config/steps/silver.py +0 -15
  121. fabricks/core/deploy/__init__.py +0 -17
  122. fabricks/core/schedules.py +0 -142
  123. fabricks/core/scripts/__init__.py +0 -9
  124. fabricks/core/scripts/armageddon.py +0 -87
  125. fabricks/core/scripts/stats.py +0 -51
  126. fabricks/core/scripts/steps.py +0 -26
  127. fabricks-3.0.5.2.dist-info/RECORD +0 -177
  128. /fabricks/cdc/templates/{filter → filters}/final.sql.jinja +0 -0
  129. /fabricks/cdc/templates/{filter → filters}/latest.sql.jinja +0 -0
  130. /fabricks/cdc/templates/{filter → filters}/update.sql.jinja +0 -0
  131. /fabricks/cdc/templates/{merge → merges}/scd1.sql.jinja +0 -0
  132. /fabricks/cdc/templates/{merge → merges}/scd2.sql.jinja +0 -0
  133. /fabricks/cdc/templates/{query → queries}/__init__.py +0 -0
  134. /fabricks/cdc/templates/{query → queries}/final.sql.jinja +0 -0
  135. /fabricks/core/{utils.py → parsers/utils.py} +0 -0
  136. /fabricks/core/{scripts → schedules}/generate.py +0 -0
  137. /fabricks/core/{scripts → schedules}/process.py +0 -0
  138. /fabricks/core/{scripts → schedules}/terminate.py +0 -0
  139. {fabricks-3.0.5.2.dist-info → fabricks-3.0.6.dist-info}/WHEEL +0 -0
fabricks/context/utils.py ADDED
@@ -0,0 +1,76 @@
+import logging
+
+import fabricks.context.config as c
+import fabricks.context.runtime as r
+
+
+def pprint_runtime():
+    print("=" * 60)
+    print("FABRICKS RUNTIME CONFIGURATION")
+    print("=" * 60)
+
+    # Core Paths Section
+    print("\n📁 CORE CONFIG:")
+    print(f" Runtime: {c.PATH_RUNTIME.string}")
+    print(f" Notebooks: {c.PATH_NOTEBOOKS.string}")
+    print(f" Config: {c.PATH_CONFIG.string}")
+    print(f" Log Level: {logging.getLevelName(c.LOGLEVEL)}")
+    print(f" Debug Mode: {'✓' if c.IS_DEBUGMODE else '✗'}")
+    print(f" Job Config from YAML: {'✓' if c.IS_JOB_CONFIG_FROM_YAML else '✗'}")
+
+    print("\n⚙️ RUNTIME SETTINGS:")
+    print("\n🔄 PIPELINE STEPS:")
+
+    def _print_steps(steps_list, layer_name, icon):
+        if steps_list and any(step for step in steps_list if step):
+            print(f" {icon} {layer_name}:")
+            for step in steps_list:
+                if step:
+                    step_name = step.get("name", "Unnamed")
+                    print(f" • {step_name}")
+        else:
+            print(f" {icon} {layer_name}: No steps")
+
+    _print_steps(r.BRONZE, "Bronze", "🥉")
+    _print_steps(r.SILVER, "Silver", "🥈")
+    _print_steps(r.GOLD, "Gold", "🥇")
+
+    # Storage Configuration Section
+    print("\n💾 STORAGE CONFIGURATION:")
+    print(f" Storage URI: {r.FABRICKS_STORAGE.string}")
+    print(f" Storage Credential: {r.FABRICKS_STORAGE_CREDENTIAL or 'Not configured'}")
+
+    # Unity Catalog Section
+    print("\n🏛️ UNITY CATALOG:")
+    print(f" Enabled: {'✓' if r.IS_UNITY_CATALOG else '✗'}")
+    if r.IS_UNITY_CATALOG and r.CATALOG:
+        print(f" Catalog: {r.CATALOG}")
+
+    # Security Section
+    print("\n🔐 SECURITY:")
+    print(f" Secret Scope: {r.SECRET_SCOPE}")
+
+    # Component Paths Section
+    print("\n🛠️ COMPONENT PATHS:")
+    components = [
+        ("UDFs", r.PATH_UDFS),
+        ("Parsers", r.PATH_PARSERS),
+        ("Extenders", r.PATH_EXTENDERS),
+        ("Views", r.PATH_VIEWS),
+        ("Schedules", r.PATH_SCHEDULES),
+    ]
+
+    for name, path in components:
+        print(f" {name}: {path.string}")
+
+    # Storage Paths Section
+    print("\n📦 STORAGE PATHS:")
+    for name, path in sorted(r.PATHS_STORAGE.items()):
+        icon = "🏭" if name == "fabricks" else "📊"
+        print(f" {icon} {name}: {path.string}")
+
+    # Runtime Paths Section
+    if r.PATHS_RUNTIME:
+        print("\n⚡ RUNTIME PATHS:")
+        for name, path in sorted(r.PATHS_RUNTIME.items()):
+            print(f" 📂 {name}: {path.string}")
fabricks/core/dags/generator.py CHANGED
@@ -3,6 +3,7 @@ from typing import Optional, Tuple
 from uuid import uuid4
 
 from pyspark.sql import DataFrame
+from pyspark.sql.functions import lit
 
 from fabricks.context import SPARK
 from fabricks.core.dags.base import BaseDags
@@ -55,13 +56,11 @@ class DagGenerator(BaseDags):
         if job_df is None:
             job_df = self.get_jobs()
 
-        return SPARK.sql(
+        df = SPARK.sql(
             """
             select
                 'dependencies' as PartitionKey,
-                d.dependency_id::string as RowKey,
-                {schedule_id} as ScheduleId,
-                {schedule} as Schedule,
+                d.dependency_id :: string as RowKey,
                 d.dependency_id as DependencyId,
                 j.Step as Step,
                 j.Job as Job,
@@ -90,9 +89,9 @@ class DagGenerator(BaseDags):
             group by all
             """,
             job=job_df,
-            schedule=self.schedule,
-            schedule_id=self.schedule_id,
         )
+        df = df.withColumn("ScheduleId", lit(self.schedule_id))
+        return df.withColumn("Schedule", lit(self.schedule))
 
     def get_steps(self, job_df: Optional[DataFrame] = None) -> DataFrame:
         if job_df is None:
@@ -136,7 +135,7 @@
             'INFO' as `Level`,
             `Status` as `Message`,
             from_json(null, 'type STRING, message STRING, traceback STRING') as Exception,
-            md5(array_join(array(ScheduleId, `Schedule`, Step, Job, JobId, Created, `Level`, `Message`, -1), "*")) as RowKey
+            md5(array_join(array(ScheduleId, `Schedule`, Step, Job, JobId, Created, `Level`, `Message`, '-1'), "*")) as RowKey
         from
             {df}
         """,
fabricks/core/dags/log.py CHANGED
@@ -1,23 +1,10 @@
 import logging
 from typing import Final
 
-from fabricks.context.runtime import FABRICKS_STORAGE
-from fabricks.core.dags.utils import get_connection_info
-from fabricks.utils.azure_table import AzureTable
+from fabricks.core.dags.utils import get_table
 from fabricks.utils.log import AzureTableLogHandler, get_logger
 
-
-def _get_table():
-    storage_account = FABRICKS_STORAGE.get_storage_account()
-
-    cx = get_connection_info(storage_account)
-
-    return AzureTable(
-        "dags", storage_account=storage_account, access_key=cx["access_key"], credential=cx["credential"]
-    )
-
-
-table = _get_table()
+table = get_table()
 Logger, TableLogHandler = get_logger("dags", logging.INFO, table=table, debugmode=False)
 
 LOGGER: Final[logging.Logger] = Logger
fabricks/core/dags/processor.py CHANGED
@@ -8,7 +8,7 @@ from azure.core.exceptions import AzureError
 from databricks.sdk.runtime import dbutils, spark
 from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
 
-from fabricks.context.runtime import PATH_NOTEBOOKS
+from fabricks.context import PATH_NOTEBOOKS
 from fabricks.core.dags.base import BaseDags
 from fabricks.core.dags.log import LOGGER
 from fabricks.core.dags.run import run
@@ -90,7 +90,7 @@ class DagProcessor(BaseDags):
             if len(scheduled) == 0:
                 for _ in range(self.step.workers):
                     self.queue.send_sentinel()
-                LOGGER.info("no more job to schedule")
+                LOGGER.info("no more job to schedule", extra={"label": str(self.step)})
                 break
 
             else:
@@ -100,7 +100,7 @@
 
                 if len(dependencies) == 0:
                     s["Status"] = "waiting"
-                    LOGGER.info("waiting", extra=self.extra(s))
+                    LOGGER.debug("waiting", extra=self.extra(s))
                     self.table.upsert(s)
                     self.queue.send(s)
 
@@ -110,7 +110,7 @@
         while True:
             response = self.queue.receive()
             if response == self.queue.sentinel:
-                LOGGER.info("no more job available")
+                LOGGER.info("no more job to process", extra={"label": str(self.step)})
                 break
 
             elif response:
@@ -118,7 +118,7 @@
 
                 j["Status"] = "starting"
                 self.table.upsert(j)
-                LOGGER.info("starting", extra=self.extra(j))
+                LOGGER.info("start", extra=self.extra(j))
 
                 try:
                     if self.notebook:
@@ -143,12 +143,12 @@
                     )
 
                 except Exception:
-                    LOGGER.warning("failed", extra={"step": str(self.step), "job": j.get("Job")})
+                    LOGGER.warning("fail", extra={"label": j.get("Job")})
 
                 finally:
                     j["Status"] = "ok"
                     self.table.upsert(j)
-                    LOGGER.info("ok", extra=self.extra(j))
+                    LOGGER.info("end", extra=self.extra(j))
 
                     dependencies = self.table.query(f"PartitionKey eq 'dependencies' and ParentId eq '{j.get('JobId')}'")
                     self.table.delete(dependencies)
@@ -191,7 +191,7 @@
         assert isinstance(scheduled, List)
 
         if len(scheduled) > 0:
-            LOGGER.info("start")
+            LOGGER.info("start", extra={"label": str(self.step)})
 
             p = Process(target=self._process())
             p.start()
@@ -201,17 +201,17 @@
             self.queue.delete()
 
             if p.exitcode is None:
-                LOGGER.critical("timeout")
+                LOGGER.critical("timeout", extra={"label": str(self.step)})
                 raise ValueError(f"{self.step} timed out")
 
             else:
                 df = self.get_logs(str(self.step))
                 self.write_logs(df)
 
-                LOGGER.info("end")
+                LOGGER.info("end", extra={"label": str(self.step)})
 
         else:
-            LOGGER.info("no job to schedule")
+            LOGGER.info("no job to schedule", extra={"label": str(self.step)})
 
     def __str__(self) -> str:
         return f"{str(self.step)} ({self.schedule_id})"
fabricks/core/dags/utils.py CHANGED
@@ -1,6 +1,7 @@
 from typing import Optional, cast
 
-from fabricks.context import DBUTILS, FABRICKS_STORAGE_CREDENTIAL, IS_UNITY_CATALOG, SECRET_SCOPE
+from fabricks.context import DBUTILS, FABRICKS_STORAGE, FABRICKS_STORAGE_CREDENTIAL, IS_UNITY_CATALOG, SECRET_SCOPE
+from fabricks.utils.azure_table import AzureTable
 
 
 def _get_access_key_from_secret_scope(storage_account: str) -> str:
@@ -38,3 +39,16 @@ def get_connection_info(storage_account: str) -> dict:
         "access_key": access_key,
         "credential": credential,
     }
+
+
+def get_table():
+    storage_account = FABRICKS_STORAGE.get_storage_account()
+
+    cx = get_connection_info(storage_account)
+
+    return AzureTable(
+        "dags",
+        storage_account=storage_account,
+        access_key=cx["access_key"],
+        credential=cx["credential"],
+    )
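get_table centralises the AzureTable construction that log.py previously did inline, so the DAG logger and processor share one factory. A usage sketch, assuming FABRICKS_STORAGE and the secret scope resolve in the running environment:

# Sketch only: needs a reachable storage account and credentials to resolve.
from fabricks.core.dags.utils import get_table

table = get_table()  # AzureTable bound to the "dags" table
table.upsert({"PartitionKey": "dependencies", "RowKey": "example"})  # hypothetical row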
fabricks/core/job_schema.py CHANGED
@@ -26,3 +26,7 @@ def get_job_schema() -> str:
 
     j = json.dumps(sc, indent=4)
     return j
+
+
+def print_job_schema():
+    print(get_job_schema())
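print_job_schema is a thin wrapper that writes the JSON produced by get_job_schema to stdout; assuming the module is importable as fabricks.core.job_schema after the rename in the file list, usage is simply:

from fabricks.core.job_schema import print_job_schema

print_job_schema()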
fabricks/core/jobs/base/_types.py CHANGED
@@ -4,7 +4,7 @@ from typing import List, Literal, Optional, TypedDict, Union
 from pydantic import BaseModel, ConfigDict, model_validator
 from pyspark.sql.types import StringType, StructField, StructType
 
-from fabricks.cdc.base._types import ChangeDataCaptures
+from fabricks.cdc.base._types import AllowedChangeDataCaptures
 from fabricks.context import BRONZE, GOLD, SILVER
 from fabricks.core.jobs.get_job_id import get_dependency_id, get_job_id
 from fabricks.core.parsers import ParserOptions
@@ -21,15 +21,18 @@ Silvers: List[TSilver] = [s.get("name") for s in SILVER]
 Golds: List[TGold] = [g.get("name") for g in GOLD]
 Steps: List[TStep] = Bronzes + Silvers + Golds
 
-BronzeModes = Literal["memory", "append", "register"]
-SilverModes = Literal["memory", "append", "latest", "update", "combine"]
-GoldModes = Literal["memory", "append", "complete", "update", "invoke"]
-Modes = Literal[BronzeModes, SilverModes, GoldModes]
+AllowedModesBronze = Literal["memory", "append", "register"]
+AllowedModesSilver = Literal["memory", "append", "latest", "update", "combine"]
+AllowedModesGold = Literal["memory", "append", "complete", "update", "invoke"]
+AllowedModes = Literal[AllowedModesBronze, AllowedModesSilver, AllowedModesGold]
 
-FileFormats = Literal["json_array", "json", "jsonl", "csv", "parquet", "delta"]
-Operations = Literal["upsert", "reload", "delete"]
-Types = Literal["manual", "default"]
-Origins = Literal["parser", "job"]
+AllowedFileFormats = Literal["json_array", "json", "jsonl", "csv", "parquet", "delta"]
+AllowedOperations = Literal["upsert", "reload", "delete"]
+AllowedTypes = Literal["manual", "default"]
+AllowedOrigins = Literal["parser", "job"]
+
+AllowedConstraintOptions = Literal["not enforced", "deferrable", "initially deferred", "norely", "rely"]
+AllowedForeignKeyOptions = Literal["match full", "on update no action", "on delete no action"]
 
 
 class SparkOptions(TypedDict):
@@ -37,6 +40,26 @@ class SparkOptions(TypedDict):
     conf: Optional[dict[str, str]]
 
 
+class ForeignKeyOptions(TypedDict):
+    foreign_key: Optional[AllowedForeignKeyOptions]
+    constraint: Optional[AllowedConstraintOptions]
+
+
+class PrimaryKeyOptions(TypedDict):
+    constraint: Optional[AllowedConstraintOptions]
+
+
+class ForeignKey(TypedDict):
+    keys: List[str]
+    reference: str
+    options: Optional[ForeignKeyOptions]
+
+
+class PrimaryKey(TypedDict):
+    keys: List[str]
+    options: Optional[PrimaryKeyOptions]
+
+
 class TableOptions(TypedDict):
     identity: Optional[bool]
     liquid_clustering: Optional[bool]
@@ -44,12 +67,17 @@ class TableOptions(TypedDict):
     zorder_by: Optional[List[str]]
     cluster_by: Optional[List[str]]
     powerbi: Optional[bool]
+    maximum_compatibility: Optional[bool]
     bloomfilter_by: Optional[List[str]]
     constraints: Optional[dict[str, str]]
     properties: Optional[dict[str, str]]
     comment: Optional[str]
     calculated_columns: Optional[dict[str, str]]
+    masks: Optional[dict[str, str]]
+    comments: Optional[dict[str, str]]
     retention_days: Optional[int]
+    primary_key: Optional[dict[str, PrimaryKey]]
+    foreign_keys: Optional[dict[str, ForeignKey]]
 
 
 class _InvokeOptions(TypedDict):
@@ -79,8 +107,8 @@ class CheckOptions(TypedDict):
 
 
 class BronzeOptions(TypedDict):
-    type: Optional[Types]
-    mode: BronzeModes
+    type: Optional[AllowedTypes]
+    mode: AllowedModesBronze
     uri: str
     parser: str
     source: str
@@ -88,20 +116,28 @@ class BronzeOptions(TypedDict):
     # default
     parents: Optional[List[str]]
     filter_where: Optional[str]
+    optimize: Optional[bool]
+    compute_statistics: Optional[bool]
+    vacuum: Optional[bool]
+    no_drop: Optional[bool]
     # extra
     encrypted_columns: Optional[List[str]]
     calculated_columns: Optional[dict[str, str]]
-    operation: Optional[Operations]
+    operation: Optional[AllowedOperations]
     timeout: Optional[int]
 
 
 class SilverOptions(TypedDict):
-    type: Optional[Types]
-    mode: SilverModes
-    change_data_capture: ChangeDataCaptures
+    type: Optional[AllowedTypes]
+    mode: AllowedModesSilver
+    change_data_capture: AllowedChangeDataCaptures
     # default
     parents: Optional[List[str]]
     filter_where: Optional[str]
+    optimize: Optional[bool]
+    compute_statistics: Optional[bool]
+    vacuum: Optional[bool]
+    no_drop: Optional[bool]
     # extra
     deduplicate: Optional[bool]
     stream: Optional[bool]
@@ -111,22 +147,28 @@ class SilverOptions(TypedDict):
 
 
 class GoldOptions(TypedDict):
-    type: Optional[Types]
-    mode: GoldModes
-    change_data_capture: ChangeDataCaptures
+    type: Optional[AllowedTypes]
+    mode: AllowedModesGold
+    change_data_capture: AllowedChangeDataCaptures
     update_where: Optional[str]
     # default
     parents: Optional[List[str]]
+    optimize: Optional[bool]
+    compute_statistics: Optional[bool]
+    vacuum: Optional[bool]
+    no_drop: Optional[bool]
     # extra
     deduplicate: Optional[bool]  # remove duplicates on the keys and on the hash
     rectify_as_upserts: Optional[bool]  # convert reloads into upserts and deletes
-    correct_valid_from: Optional[bool]
-    persist_last_timestamp: Optional[bool]
+    correct_valid_from: Optional[bool]  # update valid_from to '1900-01-01' for the first timestamp
+    persist_last_timestamp: Optional[bool]  # persist the last timestamp to be used as a watermark for the next run
+    # delete_missing: Optional[bool]  # delete missing records on update (to be implemented)
     # else
     table: Optional[str]
     notebook: Optional[bool]
     requirements: Optional[bool]
     timeout: Optional[int]
+    metadata: Optional[bool]
 
 
 StepOptions = Union[BronzeOptions, SilverOptions, GoldOptions]
@@ -204,7 +246,7 @@ class Options:
 
 class JobDependency(BaseModel):
     model_config = ConfigDict(extra="forbid", frozen=True)
-    origin: Origins
+    origin: AllowedOrigins
     job_id: str
     parent: str
     parent_id: str
@@ -220,7 +262,7 @@
         return self
 
     @staticmethod
-    def from_parts(job_id: str, parent: str, origin: Origins):
+    def from_parts(job_id: str, parent: str, origin: AllowedOrigins):
         parent = parent.removesuffix("__current")
         return JobDependency(
             job_id=job_id,
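With the new PrimaryKey/ForeignKey TypedDicts, TableOptions can carry key constraints, column comments and masks. A hedged sketch of a conforming options dict; the table, column and constraint names are invented for illustration, and only the literal option values come from the type definitions above:

# Illustrative only: names are invented; the TypedDicts above define the shape.
table_options = {
    "liquid_clustering": True,
    "comments": {"customer_id": "business key"},
    "masks": {"email": "mask_email"},  # column -> masking function name (hypothetical)
    "primary_key": {
        "pk_customer": {"keys": ["customer_id"], "options": {"constraint": "rely"}},
    },
    "foreign_keys": {
        "fk_country": {
            "keys": ["country_id"],
            "reference": "gold.dim_country",
            "options": {"foreign_key": "match full", "constraint": "not enforced"},
        },
    },
}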
fabricks/core/jobs/base/checker.py CHANGED
@@ -20,7 +20,7 @@ class Checker(Generator):
 
     def _check(self, position: Literal["pre_run", "post_run"]):
         if self.options.check.get(position):
-            DEFAULT_LOGGER.debug(f"{position.replace('_', ' ')} check", extra={"job": self})
+            DEFAULT_LOGGER.debug(f"check {position}", extra={"label": self})
 
             p = self.paths.runtime.append(f".{position}.sql")
             assert p.exists(), f"{position} check not found ({p})"
@@ -31,9 +31,9 @@
 
             if not fail_df.isEmpty():
                 for row in fail_df.collect():
-                    DEFAULT_LOGGER.error(
-                        f"{position.replace('_', ' ')} check failed due to {row['__message']}",
-                        extra={"job": self},
+                    DEFAULT_LOGGER.warning(
+                        f"check {position} failed due to {row['__message']}",
+                        extra={"label": self},
                     )
 
                     if position == "pre_run":
@@ -44,8 +44,8 @@
             elif not warning_df.isEmpty():
                 for row in warning_df.collect():
                     DEFAULT_LOGGER.warning(
-                        f"{position.replace('_', ' ')} check failed due to {row['__message']}",
-                        extra={"job": self},
+                        f"check {position} failed due to {row['__message']}",
+                        extra={"label": self},
                     )
 
                     if position == "pre_run":
@@ -59,19 +59,20 @@ class Checker(Generator):
         count_must_equal = self.options.check.get("count_must_equal")
 
         if min_rows or max_rows or count_must_equal:
-            DEFAULT_LOGGER.debug("extra post run check", extra={"job": self})
-
             df = self.spark.sql(f"select count(*) from {self}")
             rows = df.collect()[0][0]
             if min_rows:
+                DEFAULT_LOGGER.debug("check min rows", extra={"label": self})
                 if rows < min_rows:
                     raise PostRunCheckException(f"min rows check failed ({rows} < {min_rows})", dataframe=df)
 
             if max_rows:
+                DEFAULT_LOGGER.debug("check max rows", extra={"label": self})
                 if rows > max_rows:
                     raise PostRunCheckException(f"max rows check failed ({rows} > {max_rows})", dataframe=df)
 
             if count_must_equal:
+                DEFAULT_LOGGER.debug("check count must equal", extra={"label": self})
                 equals_rows = self.spark.read.table(count_must_equal).count()
                 if rows != equals_rows:
                     raise PostRunCheckException(
@@ -81,7 +82,7 @@ class Checker(Generator):
 
     def _check_duplicate_in_column(self, column: str):
         if column in self.table.columns:
-            DEFAULT_LOGGER.debug(f"duplicate {column} check", extra={"job": self})
+            DEFAULT_LOGGER.debug(f"check duplicate in {column}", extra={"label": self})
 
             cols = [column]
 
@@ -108,7 +109,7 @@
                 )
 
         else:
-            DEFAULT_LOGGER.debug(f"{column} not found", extra={"job": self})
+            DEFAULT_LOGGER.debug(f"could not find {column}", extra={"label": self})
 
     def check_duplicate_key(self):
         self._check_duplicate_in_column("__key")
@@ -121,7 +122,7 @@
 
     def check_skip_run(self):
         if self.options.check.get("skip"):
-            DEFAULT_LOGGER.debug("skip check", extra={"job": self})
+            DEFAULT_LOGGER.debug("check if run should be skipped", extra={"label": self})
 
             p = self.paths.runtime.append(".skip.sql")
             assert p.exists(), "skip check not found"
@@ -132,7 +133,7 @@
             for row in skip_df.collect():
                 DEFAULT_LOGGER.warning(
                     f"skip run due to {row['__message']}",
-                    extra={"job": self},
+                    extra={"label": self},
                 )
 
                 raise SkipRunCheckWarning(row["__message"], dataframe=df)