fabricks-3.0.18-py3-none-any.whl → fabricks-4.0.0-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (95)
  1. fabricks/api/context.py +15 -3
  2. fabricks/api/notebooks/schedule.py +2 -3
  3. fabricks/api/parsers.py +2 -1
  4. fabricks/api/utils.py +3 -1
  5. fabricks/cdc/__init__.py +1 -2
  6. fabricks/cdc/base/__init__.py +1 -2
  7. fabricks/cdc/base/_types.py +5 -3
  8. fabricks/cdc/base/configurator.py +5 -0
  9. fabricks/cdc/base/generator.py +7 -3
  10. fabricks/cdc/base/merger.py +2 -0
  11. fabricks/cdc/base/processor.py +15 -0
  12. fabricks/cdc/templates/README.md +490 -0
  13. fabricks/cdc/templates/ctes/base.sql.jinja +1 -0
  14. fabricks/cdc/templates/ctes/current.sql.jinja +4 -0
  15. fabricks/cdc/templates/merges/scd1.sql.jinja +6 -0
  16. fabricks/cdc/templates/merges/scd2.sql.jinja +6 -0
  17. fabricks/cdc/templates/queries/context.sql.jinja +104 -96
  18. fabricks/cdc/templates/query.sql.jinja +1 -1
  19. fabricks/context/__init__.py +13 -1
  20. fabricks/context/config.py +13 -122
  21. fabricks/context/log.py +92 -1
  22. fabricks/context/runtime.py +35 -69
  23. fabricks/context/spark_session.py +8 -7
  24. fabricks/context/utils.py +26 -39
  25. fabricks/core/__init__.py +2 -2
  26. fabricks/core/dags/base.py +5 -5
  27. fabricks/core/dags/processor.py +2 -3
  28. fabricks/core/extenders.py +1 -1
  29. fabricks/core/job_schema.py +26 -16
  30. fabricks/core/jobs/__init__.py +1 -7
  31. fabricks/core/jobs/base/README.md +1545 -0
  32. fabricks/core/jobs/base/__init__.py +1 -8
  33. fabricks/core/jobs/base/checker.py +7 -7
  34. fabricks/core/jobs/base/configurator.py +142 -63
  35. fabricks/core/jobs/base/generator.py +38 -34
  36. fabricks/core/jobs/base/invoker.py +48 -63
  37. fabricks/core/jobs/base/processor.py +13 -28
  38. fabricks/core/jobs/bronze.py +88 -38
  39. fabricks/core/jobs/get_job.py +3 -6
  40. fabricks/core/jobs/get_job_conf.py +19 -68
  41. fabricks/core/jobs/get_jobs.py +10 -11
  42. fabricks/core/jobs/get_schedules.py +3 -17
  43. fabricks/core/jobs/gold.py +96 -43
  44. fabricks/core/jobs/silver.py +42 -22
  45. fabricks/core/masks.py +11 -8
  46. fabricks/core/parsers/__init__.py +0 -2
  47. fabricks/core/parsers/base.py +10 -10
  48. fabricks/core/parsers/decorator.py +1 -1
  49. fabricks/core/parsers/get_parser.py +4 -5
  50. fabricks/core/schedules/process.py +1 -4
  51. fabricks/core/steps/base.py +27 -17
  52. fabricks/core/steps/get_step.py +2 -4
  53. fabricks/core/steps/get_step_conf.py +3 -7
  54. fabricks/core/udfs.py +9 -8
  55. fabricks/core/views.py +2 -2
  56. fabricks/deploy/__init__.py +27 -16
  57. fabricks/deploy/masks.py +1 -1
  58. fabricks/deploy/notebooks.py +19 -16
  59. fabricks/deploy/schedules.py +1 -1
  60. fabricks/deploy/tables.py +66 -49
  61. fabricks/deploy/udfs.py +2 -2
  62. fabricks/deploy/views.py +15 -16
  63. fabricks/metastore/database.py +3 -3
  64. fabricks/metastore/table.py +103 -68
  65. fabricks/models/__init__.py +125 -0
  66. fabricks/models/common.py +79 -0
  67. fabricks/models/config.py +225 -0
  68. fabricks/models/dependency.py +50 -0
  69. fabricks/models/job.py +157 -0
  70. fabricks/models/path.py +17 -0
  71. fabricks/models/runtime.py +182 -0
  72. fabricks/models/schedule.py +21 -0
  73. fabricks/models/step.py +103 -0
  74. fabricks/models/table.py +77 -0
  75. fabricks/{core/jobs/get_job_id.py → models/utils.py} +2 -0
  76. fabricks/utils/helpers.py +6 -5
  77. fabricks/utils/log.py +25 -6
  78. fabricks/utils/path.py +269 -102
  79. fabricks/utils/pip.py +7 -7
  80. fabricks/utils/read/read.py +23 -22
  81. fabricks/utils/read/read_yaml.py +2 -2
  82. fabricks/utils/write/delta.py +4 -4
  83. fabricks/utils/write/stream.py +2 -2
  84. {fabricks-3.0.18.dist-info → fabricks-4.0.0.dist-info}/METADATA +9 -4
  85. {fabricks-3.0.18.dist-info → fabricks-4.0.0.dist-info}/RECORD +86 -83
  86. fabricks/context/_types.py +0 -137
  87. fabricks/context/helpers.py +0 -63
  88. fabricks/core/jobs/base/_types.py +0 -284
  89. fabricks/core/parsers/_types.py +0 -6
  90. fabricks/utils/fdict.py +0 -240
  91. fabricks/utils/pydantic.py +0 -94
  92. fabricks/utils/schema/__init__.py +0 -7
  93. fabricks/utils/schema/get_json_schema_for_type.py +0 -161
  94. fabricks/utils/schema/get_schema_for_type.py +0 -99
  95. {fabricks-3.0.18.dist-info → fabricks-4.0.0.dist-info}/WHEEL +0 -0
fabricks/api/context.py CHANGED
@@ -1,5 +1,17 @@
- from fabricks.context import BRONZE, DBUTILS, GOLD, SECRET_SCOPE, SILVER, SPARK, init_spark_session, pprint_runtime
- from fabricks.core.jobs.base._types import Bronzes, Golds, Silvers, Steps
+ from fabricks.context import (
+     BRONZE,
+     CONF_RUNTIME,
+     DBUTILS,
+     GOLD,
+     SILVER,
+     SPARK,
+     Bronzes,
+     Golds,
+     Silvers,
+     Steps,
+     init_spark_session,
+     pprint_runtime,
+ )

  # step
  BRONZES = Bronzes
@@ -18,7 +30,7 @@ __all__ = [
      "GOLDS",
      "init_spark_session",
      "pprint_runtime",
-     "SECRET_SCOPE",
+     "CONF_RUNTIME",
      "SILVER",
      "Silvers",
      "SILVERS",
fabricks/api/notebooks/schedule.py CHANGED
@@ -4,14 +4,13 @@
  # COMMAND ----------

  from logging import DEBUG
- from typing import Any, cast
+ from typing import Any

  from databricks.sdk.runtime import dbutils, display, spark

  from fabricks.context import PATH_NOTEBOOKS
  from fabricks.context.log import DEFAULT_LOGGER
  from fabricks.core import get_step
- from fabricks.core.jobs.base._types import TStep
  from fabricks.core.schedules import generate, terminate
  from fabricks.utils.helpers import run_in_parallel, run_notebook

@@ -51,7 +50,7 @@ steps = [row.step for row in spark.sql("select step from {df} group by step", df


  def _schedule(task: Any):
-     step = get_step(step=cast(TStep, task))
+     step = get_step(step=task)
      run_notebook(
          PATH_NOTEBOOKS.joinpath("process"),
          timeout=step.timeouts.step,
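
Note: with the cast gone, get_step accepts the step value straight off the scheduling DataFrame. A sketch under that assumption; the step name "bronze" is a hypothetical placeholder, not taken from this diff:

from fabricks.core import get_step

step = get_step(step="bronze")  # 3.x needed get_step(step=cast(TStep, "bronze"))
print(step.timeouts.step)       # timeouts are still exposed, as run_notebook above relies on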
fabricks/api/parsers.py CHANGED
@@ -1,3 +1,4 @@
- from fabricks.core.parsers import BaseParser, ParserOptions, parser
+ from fabricks.core.parsers import BaseParser, parser
+ from fabricks.models import ParserOptions

  __all__ = ["BaseParser", "ParserOptions", "parser"]
fabricks/api/utils.py CHANGED
@@ -1,9 +1,11 @@
  from fabricks.utils.helpers import concat_dfs, concat_ws, run_in_parallel
- from fabricks.utils.path import Path
+ from fabricks.utils.path import FileSharePath, GitPath, Path

  __all__ = [
      "concat_dfs",
      "concat_ws",
+     "FileSharePath",
+     "GitPath",
      "Path",
      "run_in_parallel",
  ]
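
Note: fabricks.utils.path now also ships FileSharePath and GitPath next to Path (the module itself changed by +269 -102 in this release). Their constructor signatures are not part of this diff, so only the new exports are illustrated:

from fabricks.api.utils import FileSharePath, GitPath, Path  # all re-exported in 4.0.0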
fabricks/cdc/__init__.py CHANGED
@@ -1,4 +1,4 @@
- from fabricks.cdc.base import AllowedChangeDataCaptures, BaseCDC
+ from fabricks.cdc.base import BaseCDC
  from fabricks.cdc.cdc import CDC
  from fabricks.cdc.nocdc import NoCDC
  from fabricks.cdc.scd1 import SCD1
@@ -7,7 +7,6 @@ from fabricks.cdc.scd2 import SCD2
  __all__ = [
      "BaseCDC",
      "CDC",
-     "AllowedChangeDataCaptures",
      "NoCDC",
      "SCD1",
      "SCD2",
fabricks/cdc/base/__init__.py CHANGED
@@ -1,4 +1,3 @@
- from fabricks.cdc.base._types import AllowedChangeDataCaptures
  from fabricks.cdc.base.cdc import BaseCDC

- __all__ = ["BaseCDC", "AllowedChangeDataCaptures"]
+ __all__ = ["BaseCDC"]
fabricks/cdc/base/_types.py CHANGED
@@ -1,10 +1,12 @@
  from __future__ import annotations

- from typing import Literal, Union
+ from typing import Union

  from pyspark.sql import DataFrame
+ from pyspark.sql.types import StructType

  from fabricks.metastore.table import Table

- AllowedChangeDataCaptures = Literal["nocdc", "scd1", "scd2"]
- AllowedSources = Union[DataFrame, Table, str]
+ # Import from models for consistency
+
+ AllowedSources = Union[DataFrame, Table, str, StructType]
fabricks/cdc/base/configurator.py CHANGED
@@ -4,6 +4,7 @@ from abc import ABC, abstractmethod
  from typing import List, Optional, Union

  from pyspark.sql import DataFrame, SparkSession
+ from pyspark.sql.types import StructType

  from fabricks.cdc.base._types import AllowedSources
  from fabricks.context import SPARK
@@ -111,6 +112,7 @@ class Configurator(ABC):
      cols = [
          "__operation",
          "__metadata",
+         "__last_updated",
          "__rescued_data",
      ]

@@ -135,6 +137,7 @@ class Configurator(ABC):
          # Trailing
          "__operation",
          "__metadata",
+         "__last_updated",
          "__rescued_data",
      ]

@@ -149,6 +152,8 @@
          df = self.table.dataframe
      elif isinstance(src, str):
          df = self.spark.sql(src)
+     elif isinstance(src, StructType):
+         df = self.spark.createDataFrame([], schema=src)
      else:
          raise ValueError(f"{src} not allowed")

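Note: the new StructType branch lets a bare schema stand in as a source; it materialises as an empty DataFrame, which is exactly what PySpark's createDataFrame does when handed no rows. A standalone sketch of the same pattern, assuming an active SparkSession and illustrative column names:

from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, StructField, StructType, TimestampType

spark = SparkSession.builder.getOrCreate()

schema = StructType(
    [
        StructField("id", StringType()),
        StructField("__last_updated", TimestampType()),
    ]
)

df = spark.createDataFrame([], schema=schema)  # the same call the configurator makes
df.printSchema()   # the columns are all there
print(df.count())  # 0 -- schema only, no data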
fabricks/cdc/base/generator.py CHANGED
@@ -4,6 +4,7 @@ from typing import Any, List, Optional, Sequence, Union, cast

  from py4j.protocol import Py4JJavaError
  from pyspark.sql import DataFrame
+ from pyspark.sql.types import StructType

  from fabricks.cdc.base._types import AllowedSources
  from fabricks.cdc.base.configurator import Configurator
@@ -25,11 +26,11 @@ class Generator(Configurator):
          identity: Optional[bool] = False,
          liquid_clustering: Optional[bool] = False,
          cluster_by: Optional[Union[List[str], str]] = None,
-         properties: Optional[dict[str, str]] = None,
+         properties: Optional[dict[str, str | bool | int]] = None,
          masks: Optional[dict[str, str]] = None,
          primary_key: Optional[dict[str, Any]] = None,
          foreign_keys: Optional[dict[str, Any]] = None,
-         comments: Optional[dict[str, str]] = None,
+         comments: Optional[dict[str, Any]] = None,
          **kwargs,
      ):
          kwargs["mode"] = "complete"
@@ -145,6 +146,7 @@ class Generator(Configurator):
          d = self.get_schema_differences(src, **kwargs)
          if d is None:
              return None
+
          return len(d) > 0

      def _update_schema(
@@ -155,7 +157,9 @@
          **kwargs,
      ):
          if self.is_view:
-             assert not isinstance(src, DataFrameLike), "dataframe not allowed"
+             assert not isinstance(src, DataFrameLike) and not isinstance(src, StructType), (
+                 "dataframe and structtype not allowed"
+             )
              self.create_or_replace_view(src=src)

          else:
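
Note: properties now admits bool and int values alongside strings, matching how Delta table properties are usually spelled, and comments is relaxed to Any. A hypothetical mapping under that assumption (the keys are standard Delta properties, not taken from this diff):

properties = {
    "delta.enableChangeDataFeed": True,                # bool, allowed in 4.0.0
    "delta.checkpointInterval": 10,                    # int, allowed in 4.0.0
    "delta.logRetentionDuration": "interval 30 days",  # plain string still works
}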
fabricks/cdc/base/merger.py CHANGED
@@ -7,6 +7,7 @@ from pyspark.sql import DataFrame

  from fabricks.cdc.base._types import AllowedSources
  from fabricks.cdc.base.processor import Processor
+ from fabricks.context.config import IS_DEBUGMODE
  from fabricks.context.log import DEFAULT_LOGGER
  from fabricks.metastore.view import create_or_replace_global_temp_view
  from fabricks.utils._types import DataFrameLike
@@ -56,6 +57,7 @@ class Merger(Processor):
          assert "__key" or keys, f"{self} - __key or keys not found"

          return {
+             "debugmode": IS_DEBUGMODE,
              "src": src,
              "format": format,
              "tgt": self.table,
fabricks/cdc/base/processor.py CHANGED
@@ -7,6 +7,7 @@ from pyspark.sql import DataFrame

  from fabricks.cdc.base._types import AllowedSources
  from fabricks.cdc.base.generator import Generator
+ from fabricks.context.config import IS_DEBUGMODE
  from fabricks.context.log import DEFAULT_LOGGER
  from fabricks.metastore.table import Table
  from fabricks.metastore.view import create_or_replace_global_temp_view
@@ -65,6 +66,7 @@ class Processor(Generator):
          add_key = kwargs.get("add_key", None)
          add_hash = kwargs.get("add_hash", None)
          add_timestamp = kwargs.get("add_timestamp", None)
+         add_last_updated = kwargs.get("add_last_updated", None)
          add_metadata = kwargs.get("add_metadata", None)

          has_order_by = None if not order_duplicate_by else True
@@ -78,6 +80,7 @@
          has_hash = add_hash or "__hash" in inputs
          has_identity = "__identity" in inputs
          has_rescued_data = "__rescued_data" in inputs
+         has_last_updated = add_last_updated or "__last_updated" in inputs

          soft_delete = kwargs.get("soft_delete", None)
          delete_missing = kwargs.get("delete_missing", None)
@@ -152,6 +155,10 @@
          if add_hash and "__hash" in inputs:
              overwrite.append("__hash")

+         # override __last_updated if added and found in df
+         if add_last_updated and "__last_updated" in inputs:
+             overwrite.append("__last_updated")
+
          # override metadata if added and found in df
          if add_metadata and "__metadata" in inputs:
              overwrite.append("__metadata")
@@ -219,6 +226,11 @@
                  outputs.append("__metadata")
              if "__metadata" not in intermediates:
                  intermediates.append("__metadata")
+         if has_last_updated:
+             if "__last_updated" not in outputs:
+                 outputs.append("__last_updated")
+             if "__last_updated" not in intermediates:
+                 intermediates.append("__last_updated")
          if has_source:
              if "__source" not in outputs:
                  outputs.append("__source")
@@ -311,6 +323,7 @@
          parent_final = "__final"

          return {
+             "debugmode": IS_DEBUGMODE,
              "src": src,
              "format": format,
              "tgt": tgt,
@@ -337,6 +350,7 @@
              "has_rows": has_rows,
              "has_source": has_source,
              "has_metadata": has_metadata,
+             "has_last_updated": has_last_updated,
              "has_timestamp": has_timestamp,
              "has_operation": has_operation,
              "has_identity": has_identity,
@@ -347,6 +361,7 @@
              # default add
              "add_metadata": add_metadata,
              "add_timestamp": add_timestamp,
+             "add_last_updated": add_last_updated,
              "add_key": add_key,
              "add_hash": add_hash,
              # value add
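
Note: taken together, the processor threads the new __last_updated column through the same lifecycle as __metadata: add_last_updated requests it, has_last_updated records whether it was requested or already present in the input, and the incoming value is overwritten when both hold. A condensed restatement of the logic added above, not the fabricks source itself:

add_last_updated = kwargs.get("add_last_updated", None)
has_last_updated = add_last_updated or "__last_updated" in inputs

# regenerate rather than pass through when the column is both requested and present
if add_last_updated and "__last_updated" in inputs:
    overwrite.append("__last_updated")

# make sure the column survives into the intermediate and final projections
if has_last_updated:
    if "__last_updated" not in outputs:
        outputs.append("__last_updated")
    if "__last_updated" not in intermediates:
        intermediates.append("__last_updated")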