fabricks-2024.7.1.5.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (154)
  1. fabricks-2024.7.1.5/PKG-INFO +212 -0
  2. fabricks-2024.7.1.5/README.md +189 -0
  3. fabricks-2024.7.1.5/fabricks/__init__.py +0 -0
  4. fabricks-2024.7.1.5/fabricks/api/__init__.py +7 -0
  5. fabricks-2024.7.1.5/fabricks/api/cdc/__init__.py +6 -0
  6. fabricks-2024.7.1.5/fabricks/api/cdc/nocdc.py +3 -0
  7. fabricks-2024.7.1.5/fabricks/api/cdc/scd1.py +3 -0
  8. fabricks-2024.7.1.5/fabricks/api/cdc/scd2.py +3 -0
  9. fabricks-2024.7.1.5/fabricks/api/context.py +31 -0
  10. fabricks-2024.7.1.5/fabricks/api/core.py +4 -0
  11. fabricks-2024.7.1.5/fabricks/api/extenders.py +3 -0
  12. fabricks-2024.7.1.5/fabricks/api/log.py +3 -0
  13. fabricks-2024.7.1.5/fabricks/api/metastore/__init__.py +10 -0
  14. fabricks-2024.7.1.5/fabricks/api/metastore/database.py +3 -0
  15. fabricks-2024.7.1.5/fabricks/api/metastore/table.py +3 -0
  16. fabricks-2024.7.1.5/fabricks/api/metastore/view.py +6 -0
  17. fabricks-2024.7.1.5/fabricks/api/notebooks/__init__.py +0 -0
  18. fabricks-2024.7.1.5/fabricks/api/notebooks/cluster.py +6 -0
  19. fabricks-2024.7.1.5/fabricks/api/notebooks/deploy/__init__.py +0 -0
  20. fabricks-2024.7.1.5/fabricks/api/notebooks/deploy/fabricks.py +147 -0
  21. fabricks-2024.7.1.5/fabricks/api/notebooks/deploy/notebooks.py +86 -0
  22. fabricks-2024.7.1.5/fabricks/api/notebooks/initialize.py +38 -0
  23. fabricks-2024.7.1.5/fabricks/api/notebooks/optimize.py +25 -0
  24. fabricks-2024.7.1.5/fabricks/api/notebooks/process.py +50 -0
  25. fabricks-2024.7.1.5/fabricks/api/notebooks/run.py +87 -0
  26. fabricks-2024.7.1.5/fabricks/api/notebooks/terminate.py +27 -0
  27. fabricks-2024.7.1.5/fabricks/api/notebooks/vacuum.py +25 -0
  28. fabricks-2024.7.1.5/fabricks/api/parsers.py +3 -0
  29. fabricks-2024.7.1.5/fabricks/api/udfs.py +3 -0
  30. fabricks-2024.7.1.5/fabricks/api/utils.py +9 -0
  31. fabricks-2024.7.1.5/fabricks/cdc/__init__.py +14 -0
  32. fabricks-2024.7.1.5/fabricks/cdc/base/__init__.py +4 -0
  33. fabricks-2024.7.1.5/fabricks/cdc/base/cdc.py +5 -0
  34. fabricks-2024.7.1.5/fabricks/cdc/base/configurator.py +145 -0
  35. fabricks-2024.7.1.5/fabricks/cdc/base/generator.py +117 -0
  36. fabricks-2024.7.1.5/fabricks/cdc/base/merger.py +107 -0
  37. fabricks-2024.7.1.5/fabricks/cdc/base/processor.py +338 -0
  38. fabricks-2024.7.1.5/fabricks/cdc/base/types.py +3 -0
  39. fabricks-2024.7.1.5/fabricks/cdc/cdc.py +5 -0
  40. fabricks-2024.7.1.5/fabricks/cdc/nocdc.py +19 -0
  41. fabricks-2024.7.1.5/fabricks/cdc/scd.py +21 -0
  42. fabricks-2024.7.1.5/fabricks/cdc/scd1.py +15 -0
  43. fabricks-2024.7.1.5/fabricks/cdc/scd2.py +15 -0
  44. fabricks-2024.7.1.5/fabricks/cdc/templates/__init__.py +0 -0
  45. fabricks-2024.7.1.5/fabricks/cdc/templates/merge/scd1.sql.jinja +72 -0
  46. fabricks-2024.7.1.5/fabricks/cdc/templates/merge/scd2.sql.jinja +54 -0
  47. fabricks-2024.7.1.5/fabricks/cdc/templates/merge.sql.jinja +2 -0
  48. fabricks-2024.7.1.5/fabricks/cdc/templates/query/__init__.py +0 -0
  49. fabricks-2024.7.1.5/fabricks/cdc/templates/query/base.sql.jinja +34 -0
  50. fabricks-2024.7.1.5/fabricks/cdc/templates/query/context.sql.jinja +95 -0
  51. fabricks-2024.7.1.5/fabricks/cdc/templates/query/current.sql.jinja +32 -0
  52. fabricks-2024.7.1.5/fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +21 -0
  53. fabricks-2024.7.1.5/fabricks/cdc/templates/query/deduplicate_key.sql.jinja +14 -0
  54. fabricks-2024.7.1.5/fabricks/cdc/templates/query/filter.sql.jinja +71 -0
  55. fabricks-2024.7.1.5/fabricks/cdc/templates/query/final.sql.jinja +1 -0
  56. fabricks-2024.7.1.5/fabricks/cdc/templates/query/hash.sql.jinja +1 -0
  57. fabricks-2024.7.1.5/fabricks/cdc/templates/query/nocdc.sql.jinja +10 -0
  58. fabricks-2024.7.1.5/fabricks/cdc/templates/query/rectify.sql.jinja +120 -0
  59. fabricks-2024.7.1.5/fabricks/cdc/templates/query/scd1.sql.jinja +112 -0
  60. fabricks-2024.7.1.5/fabricks/cdc/templates/query/scd2.sql.jinja +114 -0
  61. fabricks-2024.7.1.5/fabricks/cdc/templates/query.sql.jinja +11 -0
  62. fabricks-2024.7.1.5/fabricks/context/__init__.py +51 -0
  63. fabricks-2024.7.1.5/fabricks/context/log.py +26 -0
  64. fabricks-2024.7.1.5/fabricks/context/runtime.py +143 -0
  65. fabricks-2024.7.1.5/fabricks/context/spark.py +43 -0
  66. fabricks-2024.7.1.5/fabricks/context/types.py +123 -0
  67. fabricks-2024.7.1.5/fabricks/core/__init__.py +4 -0
  68. fabricks-2024.7.1.5/fabricks/core/dags/__init__.py +9 -0
  69. fabricks-2024.7.1.5/fabricks/core/dags/base.py +72 -0
  70. fabricks-2024.7.1.5/fabricks/core/dags/generator.py +154 -0
  71. fabricks-2024.7.1.5/fabricks/core/dags/log.py +14 -0
  72. fabricks-2024.7.1.5/fabricks/core/dags/processor.py +163 -0
  73. fabricks-2024.7.1.5/fabricks/core/dags/terminator.py +26 -0
  74. fabricks-2024.7.1.5/fabricks/core/deploy/__init__.py +12 -0
  75. fabricks-2024.7.1.5/fabricks/core/deploy/tables.py +76 -0
  76. fabricks-2024.7.1.5/fabricks/core/deploy/views.py +417 -0
  77. fabricks-2024.7.1.5/fabricks/core/extenders.py +29 -0
  78. fabricks-2024.7.1.5/fabricks/core/jobs/__init__.py +20 -0
  79. fabricks-2024.7.1.5/fabricks/core/jobs/base/__init__.py +10 -0
  80. fabricks-2024.7.1.5/fabricks/core/jobs/base/checker.py +89 -0
  81. fabricks-2024.7.1.5/fabricks/core/jobs/base/configurator.py +323 -0
  82. fabricks-2024.7.1.5/fabricks/core/jobs/base/error.py +16 -0
  83. fabricks-2024.7.1.5/fabricks/core/jobs/base/generator.py +391 -0
  84. fabricks-2024.7.1.5/fabricks/core/jobs/base/invoker.py +119 -0
  85. fabricks-2024.7.1.5/fabricks/core/jobs/base/job.py +5 -0
  86. fabricks-2024.7.1.5/fabricks/core/jobs/base/processor.py +204 -0
  87. fabricks-2024.7.1.5/fabricks/core/jobs/base/types.py +191 -0
  88. fabricks-2024.7.1.5/fabricks/core/jobs/bronze.py +333 -0
  89. fabricks-2024.7.1.5/fabricks/core/jobs/get_job.py +126 -0
  90. fabricks-2024.7.1.5/fabricks/core/jobs/get_job_conf.py +115 -0
  91. fabricks-2024.7.1.5/fabricks/core/jobs/get_job_id.py +26 -0
  92. fabricks-2024.7.1.5/fabricks/core/jobs/get_jobs.py +89 -0
  93. fabricks-2024.7.1.5/fabricks/core/jobs/gold.py +218 -0
  94. fabricks-2024.7.1.5/fabricks/core/jobs/silver.py +354 -0
  95. fabricks-2024.7.1.5/fabricks/core/parsers/__init__.py +12 -0
  96. fabricks-2024.7.1.5/fabricks/core/parsers/base.py +91 -0
  97. fabricks-2024.7.1.5/fabricks/core/parsers/decorator.py +11 -0
  98. fabricks-2024.7.1.5/fabricks/core/parsers/get_parser.py +25 -0
  99. fabricks-2024.7.1.5/fabricks/core/parsers/types.py +6 -0
  100. fabricks-2024.7.1.5/fabricks/core/schedules.py +89 -0
  101. fabricks-2024.7.1.5/fabricks/core/scripts/__init__.py +13 -0
  102. fabricks-2024.7.1.5/fabricks/core/scripts/armageddon.py +82 -0
  103. fabricks-2024.7.1.5/fabricks/core/scripts/generate.py +20 -0
  104. fabricks-2024.7.1.5/fabricks/core/scripts/job_schema.py +28 -0
  105. fabricks-2024.7.1.5/fabricks/core/scripts/optimize.py +45 -0
  106. fabricks-2024.7.1.5/fabricks/core/scripts/process.py +9 -0
  107. fabricks-2024.7.1.5/fabricks/core/scripts/stats.py +48 -0
  108. fabricks-2024.7.1.5/fabricks/core/scripts/steps.py +27 -0
  109. fabricks-2024.7.1.5/fabricks/core/scripts/terminate.py +6 -0
  110. fabricks-2024.7.1.5/fabricks/core/scripts/vacuum.py +45 -0
  111. fabricks-2024.7.1.5/fabricks/core/site_packages.py +55 -0
  112. fabricks-2024.7.1.5/fabricks/core/steps/__init__.py +4 -0
  113. fabricks-2024.7.1.5/fabricks/core/steps/base.py +282 -0
  114. fabricks-2024.7.1.5/fabricks/core/steps/get_step.py +10 -0
  115. fabricks-2024.7.1.5/fabricks/core/steps/get_step_conf.py +33 -0
  116. fabricks-2024.7.1.5/fabricks/core/steps/types.py +7 -0
  117. fabricks-2024.7.1.5/fabricks/core/udfs.py +106 -0
  118. fabricks-2024.7.1.5/fabricks/core/utils.py +69 -0
  119. fabricks-2024.7.1.5/fabricks/core/views.py +36 -0
  120. fabricks-2024.7.1.5/fabricks/metastore/README.md +3 -0
  121. fabricks-2024.7.1.5/fabricks/metastore/__init__.py +5 -0
  122. fabricks-2024.7.1.5/fabricks/metastore/database.py +71 -0
  123. fabricks-2024.7.1.5/fabricks/metastore/pyproject.toml +20 -0
  124. fabricks-2024.7.1.5/fabricks/metastore/relational.py +61 -0
  125. fabricks-2024.7.1.5/fabricks/metastore/table.py +529 -0
  126. fabricks-2024.7.1.5/fabricks/metastore/utils.py +35 -0
  127. fabricks-2024.7.1.5/fabricks/metastore/view.py +40 -0
  128. fabricks-2024.7.1.5/fabricks/utils/README.md +3 -0
  129. fabricks-2024.7.1.5/fabricks/utils/__init__.py +0 -0
  130. fabricks-2024.7.1.5/fabricks/utils/azure_queue.py +63 -0
  131. fabricks-2024.7.1.5/fabricks/utils/azure_table.py +99 -0
  132. fabricks-2024.7.1.5/fabricks/utils/console.py +51 -0
  133. fabricks-2024.7.1.5/fabricks/utils/container.py +57 -0
  134. fabricks-2024.7.1.5/fabricks/utils/fdict.py +28 -0
  135. fabricks-2024.7.1.5/fabricks/utils/helpers.py +89 -0
  136. fabricks-2024.7.1.5/fabricks/utils/log.py +153 -0
  137. fabricks-2024.7.1.5/fabricks/utils/path.py +206 -0
  138. fabricks-2024.7.1.5/fabricks/utils/pip.py +61 -0
  139. fabricks-2024.7.1.5/fabricks/utils/pydantic.py +92 -0
  140. fabricks-2024.7.1.5/fabricks/utils/pyproject.toml +18 -0
  141. fabricks-2024.7.1.5/fabricks/utils/read/__init__.py +11 -0
  142. fabricks-2024.7.1.5/fabricks/utils/read/read.py +305 -0
  143. fabricks-2024.7.1.5/fabricks/utils/read/read_excel.py +5 -0
  144. fabricks-2024.7.1.5/fabricks/utils/read/read_yaml.py +43 -0
  145. fabricks-2024.7.1.5/fabricks/utils/read/types.py +3 -0
  146. fabricks-2024.7.1.5/fabricks/utils/schema/__init__.py +7 -0
  147. fabricks-2024.7.1.5/fabricks/utils/schema/get_json_schema_for_type.py +161 -0
  148. fabricks-2024.7.1.5/fabricks/utils/schema/get_schema_for_type.py +93 -0
  149. fabricks-2024.7.1.5/fabricks/utils/secret.py +78 -0
  150. fabricks-2024.7.1.5/fabricks/utils/sqlglot.py +48 -0
  151. fabricks-2024.7.1.5/fabricks/utils/write/__init__.py +8 -0
  152. fabricks-2024.7.1.5/fabricks/utils/write/delta.py +46 -0
  153. fabricks-2024.7.1.5/fabricks/utils/write/stream.py +27 -0
  154. fabricks-2024.7.1.5/pyproject.toml +76 -0
@@ -0,0 +1,212 @@
+ Metadata-Version: 2.1
+ Name: fabricks
+ Version: 2024.7.1.5
+ Summary:
+ Author: BMS DWH Team
+ Author-email: bi_support@bmsuisse.ch
+ Requires-Python: >=3.9,<4
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Requires-Dist: azure-data-tables (>=12.5.0,<13.0.0)
+ Requires-Dist: azure-identity (>=1.10.0)
+ Requires-Dist: azure-storage-blob (>=12.14.1)
+ Requires-Dist: azure-storage-queue (>=12.10.0,<13.0.0)
+ Requires-Dist: databricks-sdk (>=0.29.0)
+ Requires-Dist: jinja2 (>=2.11.3)
+ Requires-Dist: python-dotenv (>=1.0.1)
+ Requires-Dist: sqlglot (>=22.1.1)
+ Description-Content-Type: text/markdown
+
+ # Fabricks
+
+ Fabricks is a Python framework developed to help create a lakehouse in Databricks. It simplifies the process of building and maintaining data pipelines by providing a standardized approach to defining and managing data processing workflows.
+
+ Though Fabricks is currently meant to be run on Databricks, code written with Fabricks is highly portable: you'll almost exclusively write SQL SELECT statements, with no need to hand-write DDL/DML/MERGE queries. Later on, we might add support for other platforms as well, e.g. DuckDB or open-source Spark.
+
+ ## Features
+
+ - YAML configuration files: Fabricks uses YAML files for configuration, making it easy to define and modify workflows without requiring significant changes to the code.
+ - SQL files for business logic: Business logic is defined in SQL files, providing a familiar and powerful tool for data processing.
+ - Version control: Fabricks supports version control, ensuring that changes are tracked and can be rolled back if necessary.
+ - Seamless integration of new data sources: Fabricks can easily integrate new data sources into existing workflows.
+ - Change Data Capture: Fabricks supports Change Data Capture, allowing it to track and handle changes in the data over time.
+ - Drop and create: Fabricks can drop and create tables as needed, providing flexibility in managing the data schema.
+
+ ## Getting Started
+
+ To get started with Fabricks, you'll need to install it and set up your first project. Here's a basic guide on how to do that:
+
+ ### Installation
+
+ To use Fabricks, install the library on your Databricks cluster. Follow the steps below:
+
+ 1. Navigate to your Databricks workspace.
+ 2. Select the cluster where you want to install the library.
+ 3. Click on the `Libraries` tab.
+ 4. Click on `Install New`.
+ 5. Choose `PyPI` from the library source dropdown.
+ 6. Enter `fabricks` in the package text box.
+ 7. Click `Install`.
+
+ After the library is installed, you can import it in your notebooks or scripts using `import fabricks`.
+
+ ### Setting Up Your First Project
+
+ Setting up a project boils down to writing a runtime configuration and the job definitions for each step, both described below.
+
+ ## Fabricks Runtime Configuration
+
+ The Fabricks runtime configuration is defined in a YAML file. This file specifies the settings for the Fabricks runtime, including options for the runtime environment, path options, Spark options, and the configuration for the different stages of the data pipeline (bronze, silver, gold, etc.).
+
+ A sample can be found in the [tests](tests/runtime/fabricks/conf.5589296195699698.yml).
+
+ ### Configuration Options
+
+ - `name`: The name of the configuration.
+ - `options`: General options for the runtime. This includes:
+   - `secret_scope`: The name of the secret scope in Databricks.
+   - `timeout`: The timeout for the runtime in seconds.
+   - `workers`: The number of workers for the runtime.
+ - `path_options`: Options for the storage path. This includes:
+   - `storage`: The storage path for the data.
+ - `spark_options`: Options for Spark. This includes:
+   - `sql`: SQL options for Spark.
+
+ ### Data Pipeline Stages
+
+ The configuration file defines the settings for the different stages of the data pipeline:
+
+ - `bronze`: The initial stage of the data pipeline. This includes:
+   - `name`: The name of the stage.
+   - `path_options`: Options for the storage path.
+   - `options`: Options for the stage.
+
+   For samples, see the [tests/runtime/bronze folder](tests/runtime/bronze).
+ - `silver`: The intermediate stage of the data pipeline. This includes:
+   - `name`: The name of the stage.
+   - `path_options`: Options for the storage path.
+   - `options`: Options for the stage.
+
+   For samples, see the [tests/runtime/silver folder](tests/runtime/silver).
+ - `gold`: The final stage of the data pipeline. This includes:
+   - `name`: The name of the stage.
+   - `path_options`: Options for the storage path.
+   - `options`: Options for the stage.
+
+   For samples, see the [tests/runtime/gold folder](tests/runtime/gold).
+
+ The folder names and the stage names can be configured in the main Fabricks config; you don't have to stick with the defaults.
+
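+ Putting these options together, a minimal runtime configuration might look like the sketch below. All names and values are hypothetical, and the exact nesting is an assumption derived from the option lists above; refer to the linked sample for the authoritative layout.
+
+ ```yaml
+ name: production             # hypothetical configuration name
+ options:
+   secret_scope: fabricks     # hypothetical secret scope
+   timeout: 3600              # seconds
+   workers: 8
+ path_options:
+   storage: abfss://fabricks@mystorage.dfs.core.windows.net  # hypothetical storage path
+ spark_options:
+   sql:
+     spark.sql.session.timeZone: UTC  # hypothetical Spark SQL option
+ bronze:
+   - name: bronze
+     path_options:
+       storage: abfss://bronze@mystorage.dfs.core.windows.net
+ silver:
+   - name: silver
+ gold:
+   - name: gold
+ ```
+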
+ ### Other Configurations
+
+ - `powerbi`: Configuration for the Power BI integration.
+ - `databases`: Configuration for the databases.
+ - `credentials`: Credentials for accessing different resources.
+ - `variables`: Variables used in the configuration.
+
+ Please note that this is basic documentation based on the provided sample YAML file. The actual configuration options may vary depending on the specific requirements of your project.
+
+ ## Bronze Step Configuration
+
+ The "bronze" step in Fabricks is the initial stage of the data pipeline. It is responsible for ingesting raw data and storing it in a "bronze" table for further processing. The configuration for the "bronze" step is defined in a YAML file. Each job in the "bronze" step has the following configuration options:
+
+ - `step`: The step in the data pipeline. For these jobs, it is always "bronze".
+ - `topic`: The topic of the job. This is usually the name of the data source.
+ - `item`: The item of the job. This is usually the name of the specific data item being processed.
+ - `tags`: Tags for the job. These can be used to categorize or filter jobs.
+ - `options`: Options for the job. This includes:
+   - `mode`: The mode of the job. This can be "append", "memory", or "register".
+   - `uri`: The URI of the data source. This is usually an Azure Blob File System (ABFS) URI.
+   - `parser`: The parser to use for the data. This can be "monarch", "parquet", etc.
+   - `keys`: The keys for the data. These are the columns that uniquely identify each row in the data.
+   - `source`: The source of the data. This is usually the same as the topic.
+   - `extender`: The extender for the data. This is used to extend the data with additional columns or transformations.
+   - `encrypted_columns`: The columns in the data that are encrypted. These columns will be decrypted during the "bronze" step.
+   - `calculated_columns`: The columns in the data that are calculated. These columns will be calculated during the "bronze" step.
+
+ Here's an example of a "bronze" step job:
+
+ ```yaml
+ - job:
+     step: bronze
+     topic: king
+     item: scd1
+     tags: [test]
+     options:
+       mode: append
+       uri: abfss://fabricks@$datahub/raw/king
+       parser: monarch
+       keys: [id]
+       source: king
+ ```
+
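+ The example above covers the common options; the remaining ones slot into the same `options` block. The sketch below shows where they would go. The value shapes are assumptions (only the option names are documented here), and the extender and column names are hypothetical:
+
+ ```yaml
+ - job:
+     step: bronze
+     topic: queen
+     item: scd1
+     tags: [test]
+     options:
+       mode: append
+       uri: abfss://fabricks@$datahub/raw/queen
+       parser: parquet
+       keys: [id]
+       source: queen
+       extender: add_country          # hypothetical extender name
+       encrypted_columns: [email]     # assumed: list of column names to decrypt
+       calculated_columns:
+         - double_id: id * 2          # assumed: column name mapped to a SQL expression
+ ```
+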
+ ## Silver Step Configuration
+
+ The "silver" step in Fabricks is the intermediate stage of the data pipeline. It is responsible for processing the raw data ingested in the "bronze" step and storing it in a "silver" table for further processing. The configuration for the "silver" step is defined in a YAML file. Each job in the "silver" step has the following configuration options:
+
+ - `step`: The step in the data pipeline. For these jobs, it is always "silver".
+ - `topic`: The topic of the job. This is usually the name of the data source.
+ - `item`: The item of the job. This is usually the name of the specific data item being processed.
+ - `tags`: Tags for the job. These can be used to categorize or filter jobs.
+ - `options`: Options for the job. This includes:
+   - `mode`: The mode of the job. This can be "update", "memory", "latest", "append", "combine", etc.
+   - `change_data_capture`: The type of Change Data Capture (CDC) to use. This can be "scd1", "scd2", "nocdc", etc.
+   - `parents`: The parent jobs that this job depends on. These are usually "bronze" step jobs.
+   - `extender`: The extender for the data. This is used to extend the data with additional columns or transformations.
+   - `order_duplicate_by`: The order to use when removing duplicates. This can be "asc" or "desc".
+ - `check_options`: Options for checking the data. This includes:
+   - `max_rows`: The maximum number of rows to check.
+   - `stream`: Whether to stream the data. This can be "true" or "false".
+
+ Here's an example of a "silver" step job:
+
+ ```yaml
+ - job:
+     step: silver
+     topic: king_and_queen
+     item: scd1
+     tags: [test]
+     options:
+       mode: update
+       change_data_capture: scd1
+       parents: [bronze.queen_scd1, bronze.king_scd1]
+ ```
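+
+ A second, hedged sketch showing the checking and deduplication options; the value shapes are assumptions, since only the option names are documented above:
+
+ ```yaml
+ - job:
+     step: silver
+     topic: king_and_queen
+     item: scd2
+     tags: [test]
+     options:
+       mode: update
+       change_data_capture: scd2
+       parents: [bronze.queen_scd1, bronze.king_scd1]
+       order_duplicate_by: desc   # assumed scalar value ("asc" or "desc")
+     check_options:
+       max_rows: 1000000          # assumed integer threshold
+       stream: true               # assumed boolean
+ ```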
+
+ ## Gold Step Configuration
+
+ The "gold" step in Fabricks is the final stage of the data pipeline. It is responsible for processing the data from the "silver" step and storing it in a "gold" table for consumption. The configuration for the "gold" step is defined in a YAML file. Each job in the "gold" step has the following configuration options:
+
+ - `step`: The step in the data pipeline. For these jobs, it is always "gold".
+ - `topic`: The topic of the job. This is usually the name of the data source.
+ - `item`: The item of the job. This is usually the name of the specific data item being processed.
+ - `tags`: Tags for the job. These can be used to categorize or filter jobs.
+ - `options`: Options for the job. This includes:
+   - `mode`: The mode of the job. This can be "complete", "memory", etc.
+   - `change_data_capture`: The type of Change Data Capture (CDC) to use. This can be "scd1", "scd2", "nocdc", etc.
+
+ Here's an example of a "gold" step job:
+
+ ```yaml
+ - job:
+     step: gold
+     topic: scd2
+     item: complete
+     tags: [test]
+     options:
+       change_data_capture: scd2
+       mode: complete
+ ```
+
+ ## Usage
+
+ // Instructions on how to use the framework go here
+
+ ## License
+
+ This project is licensed under the terms of the MIT license.
+
@@ -0,0 +1,189 @@
(The README.md content is identical to the README embedded in the PKG-INFO description above.)
File without changes
@@ -0,0 +1,7 @@
+ from fabricks.api.core import get_job, get_jobs, get_step
+
+ __all__ = [
+     "get_job",
+     "get_jobs",
+     "get_step",
+ ]
@@ -0,0 +1,6 @@
+ from fabricks.api.cdc.nocdc import NoCDC
+ from fabricks.api.cdc.scd1 import SCD1
+ from fabricks.api.cdc.scd2 import SCD2
+ from fabricks.cdc.cdc import CDC
+
+ __all__ = ["CDC", "SCD1", "SCD2", "NoCDC"]
@@ -0,0 +1,3 @@
+ from fabricks.cdc.nocdc import NoCDC
+
+ __all__ = ["NoCDC"]
@@ -0,0 +1,3 @@
+ from fabricks.cdc.scd1 import SCD1
+
+ __all__ = ["SCD1"]
@@ -0,0 +1,3 @@
+ from fabricks.cdc.scd2 import SCD2
+
+ __all__ = ["SCD2"]
@@ -0,0 +1,31 @@
+ from databricks.sdk.runtime import dbutils, spark
+
+ from fabricks.context import BRONZE, GOLD, SECRET_SCOPE, SILVER
+ from fabricks.core.jobs.base.types import Bronzes, Golds, Silvers
+
+ # spark
+ SPARK = spark
+ DBUTILS = dbutils
+
+ # step
+ BRONZES = Bronzes
+ SILVERS = Silvers
+ GOLDS = Golds
+ STEPS = BRONZES + SILVERS + GOLDS
+
+
+ __all__ = [
+     "BRONZE",
+     "Bronzes",
+     "BRONZES",
+     "DBUTILS",
+     "GOLD",
+     "Golds",
+     "GOLDS",
+     "SECRET_SCOPE",
+     "SILVER",
+     "Silvers",
+     "SILVERS",
+     "SPARK",
+     "STEPS",
+ ]
@@ -0,0 +1,4 @@
+ from fabricks.core.jobs import get_job, get_jobs
+ from fabricks.core.steps import get_step
+
+ __all__ = ["get_job", "get_jobs", "get_step"]
@@ -0,0 +1,3 @@
+ from fabricks.core.extenders import extender
+
+ __all__ = ["extender"]
@@ -0,0 +1,3 @@
+ from fabricks.context.log import Logger, TableLogger, flush
+
+ __all__ = ["Logger", "TableLogger", "flush"]
@@ -0,0 +1,10 @@
+ from fabricks.api.metastore.database import Database
+ from fabricks.api.metastore.table import Table
+ from fabricks.api.metastore.view import View, create_or_replace_view
+
+ __all__ = [
+     "create_or_replace_view",
+     "Database",
+     "Table",
+     "View",
+ ]
@@ -0,0 +1,3 @@
+ from fabricks.metastore import Database
+
+ __all__ = ["Database"]
@@ -0,0 +1,3 @@
+ from fabricks.metastore import Table
+
+ __all__ = ["Table"]
@@ -0,0 +1,6 @@
+ from fabricks.metastore import View
+
+ create_or_replace_view = View.create_or_replace
+
+
+ __all__ = ["View", "create_or_replace_view"]
File without changes
@@ -0,0 +1,6 @@
+ # Databricks notebook source
+ from databricks.sdk.runtime import dbutils
+
+ # COMMAND ----------
+
+ dbutils.notebook.exit("exit (0)")
@@ -0,0 +1,147 @@
+ # Databricks notebook source
+ # MAGIC %pip install python-dotenv
+
+ # COMMAND ----------
+
+ import os
+ import subprocess
+ from pathlib import Path
+
+ from databricks.sdk.runtime import dbutils
+ from dotenv import load_dotenv
+
+ # COMMAND ----------
+
+ load_dotenv()
+
+ # COMMAND ----------
+
+ try:
+     dbutils.fs.ls("/mnt/fabricks")
+ except Exception:
+     print("fabricks container not mounted")
+
+ # COMMAND ----------
+
+ try:
+     dbutils.fs.ls("/mnt/fabricks/versions")
+ except Exception:
+     print("fabricks not found")
+
+ # COMMAND ----------
+
+ runtime = os.environ.get("path_runtime")
+ assert runtime
+ if not Path(runtime).exists():
+     print("runtime not found")
+
+ # COMMAND ----------
+
+ notebooks = os.environ.get("path_notebooks")
+ assert notebooks
+ if not Path(notebooks).exists():
+     print("notebooks not found")
+
+ # COMMAND ----------
+
+ scripts = os.environ.get("path_scripts")
+ assert scripts
+ if not Path(scripts).exists():
+     print("scripts not found")
+
+ # COMMAND ----------
+
+ abfss_wheels = "abfss://fabricks-wheels@bmsstaprdeuwsoftware.dfs.core.windows.net"
+
+ # COMMAND ----------
+
+ version = os.environ.get("fabricks_version")
+
+ # COMMAND ----------
+
+ mnt_version = f"dbfs:/mnt/fabricks/versions/{version}"
+ fuse_mnt_version = f"/dbfs/mnt/fabricks/versions/{version}"
+
+ # COMMAND ----------
+
+ try:
+     for f in dbutils.fs.ls(mnt_version):
+         dbutils.fs.rm(f.path, True)
+ except Exception:
+     pass
+
+ dbutils.fs.rm(mnt_version, True)
+ dbutils.fs.mkdirs(mnt_version)
+
+ # COMMAND ----------
+
+ dbutils.fs.rm(mnt_version, True)
+ dbutils.fs.mkdirs(mnt_version)
+
+ # COMMAND ----------
+
+ print("copying version to", mnt_version)
+
+ for f in dbutils.fs.ls(f"{abfss_wheels}/{version}"):
+     to = f"{mnt_version}/{f.name}"
+
+     try:
+         dbutils.fs.ls(to)
+     except Exception:
+         print("uploading", f.name)
+         dbutils.fs.cp(f.path, to, recurse=True)
+
+ # COMMAND ----------
+
+ print("pip install requirements.txt")
+
+ out = subprocess.run(
+     [
+         "pip",
+         "install",
+         "--no-index",
+         f"--find-links={fuse_mnt_version}/wheels",
+         "-r",
+         f"{fuse_mnt_version}/requirements.txt",
+     ],
+     capture_output=True,
+ )
+
+ if out.returncode != 0:
+     raise ValueError(out.stderr)
+
+ # COMMAND ----------
+
+ latest = os.environ.get("latest")
+ assert latest
+ latest = latest.lower() == "true"
+
+ # COMMAND ----------
+
+ versions = [version, "2023.12.*"] if latest else [version]
+
+ # COMMAND ----------
+
+ print("deploy init script")
+
+ for v in versions:
+     path = f"{scripts}/{v}.sh"
+
+     with open(path, "w") as sh:
+         sh.write(
+             f"""
+ sudo echo FABRICKS_RUNTIME={runtime} >> /etc/environment
+ sudo echo FABRICKS_NOTEBOOKS={notebooks}/{version} >> /etc/environment
+ sudo echo FABRICKS_VERSION={version} >> /etc/environment
+
+ /databricks/python/bin/pip install --no-index --find-links='{fuse_mnt_version}/wheels' -r '{fuse_mnt_version}/requirements.txt'
+ /databricks/python/bin/pip install sqlglot
+ /databricks/python/bin/pip install jinja2
+ """.strip()
+         )
+
+ # COMMAND ----------
+
+ dbutils.notebook.exit("exit (0)")
+
+ # COMMAND ----------