fabricks 2024.7.1.5.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fabricks-2024.7.1.5/PKG-INFO +212 -0
- fabricks-2024.7.1.5/README.md +189 -0
- fabricks-2024.7.1.5/fabricks/__init__.py +0 -0
- fabricks-2024.7.1.5/fabricks/api/__init__.py +7 -0
- fabricks-2024.7.1.5/fabricks/api/cdc/__init__.py +6 -0
- fabricks-2024.7.1.5/fabricks/api/cdc/nocdc.py +3 -0
- fabricks-2024.7.1.5/fabricks/api/cdc/scd1.py +3 -0
- fabricks-2024.7.1.5/fabricks/api/cdc/scd2.py +3 -0
- fabricks-2024.7.1.5/fabricks/api/context.py +31 -0
- fabricks-2024.7.1.5/fabricks/api/core.py +4 -0
- fabricks-2024.7.1.5/fabricks/api/extenders.py +3 -0
- fabricks-2024.7.1.5/fabricks/api/log.py +3 -0
- fabricks-2024.7.1.5/fabricks/api/metastore/__init__.py +10 -0
- fabricks-2024.7.1.5/fabricks/api/metastore/database.py +3 -0
- fabricks-2024.7.1.5/fabricks/api/metastore/table.py +3 -0
- fabricks-2024.7.1.5/fabricks/api/metastore/view.py +6 -0
- fabricks-2024.7.1.5/fabricks/api/notebooks/__init__.py +0 -0
- fabricks-2024.7.1.5/fabricks/api/notebooks/cluster.py +6 -0
- fabricks-2024.7.1.5/fabricks/api/notebooks/deploy/__init__.py +0 -0
- fabricks-2024.7.1.5/fabricks/api/notebooks/deploy/fabricks.py +147 -0
- fabricks-2024.7.1.5/fabricks/api/notebooks/deploy/notebooks.py +86 -0
- fabricks-2024.7.1.5/fabricks/api/notebooks/initialize.py +38 -0
- fabricks-2024.7.1.5/fabricks/api/notebooks/optimize.py +25 -0
- fabricks-2024.7.1.5/fabricks/api/notebooks/process.py +50 -0
- fabricks-2024.7.1.5/fabricks/api/notebooks/run.py +87 -0
- fabricks-2024.7.1.5/fabricks/api/notebooks/terminate.py +27 -0
- fabricks-2024.7.1.5/fabricks/api/notebooks/vacuum.py +25 -0
- fabricks-2024.7.1.5/fabricks/api/parsers.py +3 -0
- fabricks-2024.7.1.5/fabricks/api/udfs.py +3 -0
- fabricks-2024.7.1.5/fabricks/api/utils.py +9 -0
- fabricks-2024.7.1.5/fabricks/cdc/__init__.py +14 -0
- fabricks-2024.7.1.5/fabricks/cdc/base/__init__.py +4 -0
- fabricks-2024.7.1.5/fabricks/cdc/base/cdc.py +5 -0
- fabricks-2024.7.1.5/fabricks/cdc/base/configurator.py +145 -0
- fabricks-2024.7.1.5/fabricks/cdc/base/generator.py +117 -0
- fabricks-2024.7.1.5/fabricks/cdc/base/merger.py +107 -0
- fabricks-2024.7.1.5/fabricks/cdc/base/processor.py +338 -0
- fabricks-2024.7.1.5/fabricks/cdc/base/types.py +3 -0
- fabricks-2024.7.1.5/fabricks/cdc/cdc.py +5 -0
- fabricks-2024.7.1.5/fabricks/cdc/nocdc.py +19 -0
- fabricks-2024.7.1.5/fabricks/cdc/scd.py +21 -0
- fabricks-2024.7.1.5/fabricks/cdc/scd1.py +15 -0
- fabricks-2024.7.1.5/fabricks/cdc/scd2.py +15 -0
- fabricks-2024.7.1.5/fabricks/cdc/templates/__init__.py +0 -0
- fabricks-2024.7.1.5/fabricks/cdc/templates/merge/scd1.sql.jinja +72 -0
- fabricks-2024.7.1.5/fabricks/cdc/templates/merge/scd2.sql.jinja +54 -0
- fabricks-2024.7.1.5/fabricks/cdc/templates/merge.sql.jinja +2 -0
- fabricks-2024.7.1.5/fabricks/cdc/templates/query/__init__.py +0 -0
- fabricks-2024.7.1.5/fabricks/cdc/templates/query/base.sql.jinja +34 -0
- fabricks-2024.7.1.5/fabricks/cdc/templates/query/context.sql.jinja +95 -0
- fabricks-2024.7.1.5/fabricks/cdc/templates/query/current.sql.jinja +32 -0
- fabricks-2024.7.1.5/fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +21 -0
- fabricks-2024.7.1.5/fabricks/cdc/templates/query/deduplicate_key.sql.jinja +14 -0
- fabricks-2024.7.1.5/fabricks/cdc/templates/query/filter.sql.jinja +71 -0
- fabricks-2024.7.1.5/fabricks/cdc/templates/query/final.sql.jinja +1 -0
- fabricks-2024.7.1.5/fabricks/cdc/templates/query/hash.sql.jinja +1 -0
- fabricks-2024.7.1.5/fabricks/cdc/templates/query/nocdc.sql.jinja +10 -0
- fabricks-2024.7.1.5/fabricks/cdc/templates/query/rectify.sql.jinja +120 -0
- fabricks-2024.7.1.5/fabricks/cdc/templates/query/scd1.sql.jinja +112 -0
- fabricks-2024.7.1.5/fabricks/cdc/templates/query/scd2.sql.jinja +114 -0
- fabricks-2024.7.1.5/fabricks/cdc/templates/query.sql.jinja +11 -0
- fabricks-2024.7.1.5/fabricks/context/__init__.py +51 -0
- fabricks-2024.7.1.5/fabricks/context/log.py +26 -0
- fabricks-2024.7.1.5/fabricks/context/runtime.py +143 -0
- fabricks-2024.7.1.5/fabricks/context/spark.py +43 -0
- fabricks-2024.7.1.5/fabricks/context/types.py +123 -0
- fabricks-2024.7.1.5/fabricks/core/__init__.py +4 -0
- fabricks-2024.7.1.5/fabricks/core/dags/__init__.py +9 -0
- fabricks-2024.7.1.5/fabricks/core/dags/base.py +72 -0
- fabricks-2024.7.1.5/fabricks/core/dags/generator.py +154 -0
- fabricks-2024.7.1.5/fabricks/core/dags/log.py +14 -0
- fabricks-2024.7.1.5/fabricks/core/dags/processor.py +163 -0
- fabricks-2024.7.1.5/fabricks/core/dags/terminator.py +26 -0
- fabricks-2024.7.1.5/fabricks/core/deploy/__init__.py +12 -0
- fabricks-2024.7.1.5/fabricks/core/deploy/tables.py +76 -0
- fabricks-2024.7.1.5/fabricks/core/deploy/views.py +417 -0
- fabricks-2024.7.1.5/fabricks/core/extenders.py +29 -0
- fabricks-2024.7.1.5/fabricks/core/jobs/__init__.py +20 -0
- fabricks-2024.7.1.5/fabricks/core/jobs/base/__init__.py +10 -0
- fabricks-2024.7.1.5/fabricks/core/jobs/base/checker.py +89 -0
- fabricks-2024.7.1.5/fabricks/core/jobs/base/configurator.py +323 -0
- fabricks-2024.7.1.5/fabricks/core/jobs/base/error.py +16 -0
- fabricks-2024.7.1.5/fabricks/core/jobs/base/generator.py +391 -0
- fabricks-2024.7.1.5/fabricks/core/jobs/base/invoker.py +119 -0
- fabricks-2024.7.1.5/fabricks/core/jobs/base/job.py +5 -0
- fabricks-2024.7.1.5/fabricks/core/jobs/base/processor.py +204 -0
- fabricks-2024.7.1.5/fabricks/core/jobs/base/types.py +191 -0
- fabricks-2024.7.1.5/fabricks/core/jobs/bronze.py +333 -0
- fabricks-2024.7.1.5/fabricks/core/jobs/get_job.py +126 -0
- fabricks-2024.7.1.5/fabricks/core/jobs/get_job_conf.py +115 -0
- fabricks-2024.7.1.5/fabricks/core/jobs/get_job_id.py +26 -0
- fabricks-2024.7.1.5/fabricks/core/jobs/get_jobs.py +89 -0
- fabricks-2024.7.1.5/fabricks/core/jobs/gold.py +218 -0
- fabricks-2024.7.1.5/fabricks/core/jobs/silver.py +354 -0
- fabricks-2024.7.1.5/fabricks/core/parsers/__init__.py +12 -0
- fabricks-2024.7.1.5/fabricks/core/parsers/base.py +91 -0
- fabricks-2024.7.1.5/fabricks/core/parsers/decorator.py +11 -0
- fabricks-2024.7.1.5/fabricks/core/parsers/get_parser.py +25 -0
- fabricks-2024.7.1.5/fabricks/core/parsers/types.py +6 -0
- fabricks-2024.7.1.5/fabricks/core/schedules.py +89 -0
- fabricks-2024.7.1.5/fabricks/core/scripts/__init__.py +13 -0
- fabricks-2024.7.1.5/fabricks/core/scripts/armageddon.py +82 -0
- fabricks-2024.7.1.5/fabricks/core/scripts/generate.py +20 -0
- fabricks-2024.7.1.5/fabricks/core/scripts/job_schema.py +28 -0
- fabricks-2024.7.1.5/fabricks/core/scripts/optimize.py +45 -0
- fabricks-2024.7.1.5/fabricks/core/scripts/process.py +9 -0
- fabricks-2024.7.1.5/fabricks/core/scripts/stats.py +48 -0
- fabricks-2024.7.1.5/fabricks/core/scripts/steps.py +27 -0
- fabricks-2024.7.1.5/fabricks/core/scripts/terminate.py +6 -0
- fabricks-2024.7.1.5/fabricks/core/scripts/vacuum.py +45 -0
- fabricks-2024.7.1.5/fabricks/core/site_packages.py +55 -0
- fabricks-2024.7.1.5/fabricks/core/steps/__init__.py +4 -0
- fabricks-2024.7.1.5/fabricks/core/steps/base.py +282 -0
- fabricks-2024.7.1.5/fabricks/core/steps/get_step.py +10 -0
- fabricks-2024.7.1.5/fabricks/core/steps/get_step_conf.py +33 -0
- fabricks-2024.7.1.5/fabricks/core/steps/types.py +7 -0
- fabricks-2024.7.1.5/fabricks/core/udfs.py +106 -0
- fabricks-2024.7.1.5/fabricks/core/utils.py +69 -0
- fabricks-2024.7.1.5/fabricks/core/views.py +36 -0
- fabricks-2024.7.1.5/fabricks/metastore/README.md +3 -0
- fabricks-2024.7.1.5/fabricks/metastore/__init__.py +5 -0
- fabricks-2024.7.1.5/fabricks/metastore/database.py +71 -0
- fabricks-2024.7.1.5/fabricks/metastore/pyproject.toml +20 -0
- fabricks-2024.7.1.5/fabricks/metastore/relational.py +61 -0
- fabricks-2024.7.1.5/fabricks/metastore/table.py +529 -0
- fabricks-2024.7.1.5/fabricks/metastore/utils.py +35 -0
- fabricks-2024.7.1.5/fabricks/metastore/view.py +40 -0
- fabricks-2024.7.1.5/fabricks/utils/README.md +3 -0
- fabricks-2024.7.1.5/fabricks/utils/__init__.py +0 -0
- fabricks-2024.7.1.5/fabricks/utils/azure_queue.py +63 -0
- fabricks-2024.7.1.5/fabricks/utils/azure_table.py +99 -0
- fabricks-2024.7.1.5/fabricks/utils/console.py +51 -0
- fabricks-2024.7.1.5/fabricks/utils/container.py +57 -0
- fabricks-2024.7.1.5/fabricks/utils/fdict.py +28 -0
- fabricks-2024.7.1.5/fabricks/utils/helpers.py +89 -0
- fabricks-2024.7.1.5/fabricks/utils/log.py +153 -0
- fabricks-2024.7.1.5/fabricks/utils/path.py +206 -0
- fabricks-2024.7.1.5/fabricks/utils/pip.py +61 -0
- fabricks-2024.7.1.5/fabricks/utils/pydantic.py +92 -0
- fabricks-2024.7.1.5/fabricks/utils/pyproject.toml +18 -0
- fabricks-2024.7.1.5/fabricks/utils/read/__init__.py +11 -0
- fabricks-2024.7.1.5/fabricks/utils/read/read.py +305 -0
- fabricks-2024.7.1.5/fabricks/utils/read/read_excel.py +5 -0
- fabricks-2024.7.1.5/fabricks/utils/read/read_yaml.py +43 -0
- fabricks-2024.7.1.5/fabricks/utils/read/types.py +3 -0
- fabricks-2024.7.1.5/fabricks/utils/schema/__init__.py +7 -0
- fabricks-2024.7.1.5/fabricks/utils/schema/get_json_schema_for_type.py +161 -0
- fabricks-2024.7.1.5/fabricks/utils/schema/get_schema_for_type.py +93 -0
- fabricks-2024.7.1.5/fabricks/utils/secret.py +78 -0
- fabricks-2024.7.1.5/fabricks/utils/sqlglot.py +48 -0
- fabricks-2024.7.1.5/fabricks/utils/write/__init__.py +8 -0
- fabricks-2024.7.1.5/fabricks/utils/write/delta.py +46 -0
- fabricks-2024.7.1.5/fabricks/utils/write/stream.py +27 -0
- fabricks-2024.7.1.5/pyproject.toml +76 -0
@@ -0,0 +1,212 @@
Metadata-Version: 2.1
Name: fabricks
Version: 2024.7.1.5
Summary:
Author: BMS DWH Team
Author-email: bi_support@bmsuisse.ch
Requires-Python: >=3.9,<4
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Requires-Dist: azure-data-tables (>=12.5.0,<13.0.0)
Requires-Dist: azure-identity (>=1.10.0)
Requires-Dist: azure-storage-blob (>=12.14.1)
Requires-Dist: azure-storage-queue (>=12.10.0,<13.0.0)
Requires-Dist: databricks-sdk (>=0.29.0)
Requires-Dist: jinja2 (>=2.11.3)
Requires-Dist: python-dotenv (>=1.0.1)
Requires-Dist: sqlglot (>=22.1.1)
Description-Content-Type: text/markdown

# Fabricks

Fabricks is a Python framework developed to help create a lakehouse in Databricks. It simplifies the process of building and maintaining data pipelines by providing a standardized approach to defining and managing data processing workflows.

Though Fabricks is currently meant to be run on Databricks, code written against Fabricks is highly portable: you almost exclusively write SQL SELECT statements, with no need to hand-write DDL/DML/merge queries. Later on, we might add support for other platforms as well, e.g. DuckDB or open-source Spark.

## Features

- YAML configuration files: Fabricks uses YAML files for configuration, making it easy to define and modify workflows without requiring significant changes to the code.
- SQL files for business logic: business logic is defined in SQL files, providing a familiar and powerful tool for data processing.
- Version control: Fabricks supports version control, ensuring that changes are tracked and can be rolled back if necessary.
- Seamless integration of new data sources: Fabricks can easily integrate new data sources into existing workflows.
- Change Data Capture: Fabricks supports Change Data Capture, allowing it to track and handle changes in the data over time.
- Drop and create: Fabricks can drop and create tables as needed, providing flexibility in managing the data schema.

## Getting Started

To get started with Fabricks, you'll need to install it and set up your first project. Here's a basic guide on how to do that:

### Installation

To install Fabricks, you need to install the library on your Databricks cluster. Follow the steps below:

1. Navigate to your Databricks workspace.
2. Select the cluster where you want to install the library.
3. Click on the `Libraries` tab.
4. Click on `Install New`.
5. Choose `PyPI` from the library source dropdown.
6. Enter `fabricks` in the package text box.
7. Click `Install`.

After the library is installed, you can import it in your notebooks or scripts using `import fabricks`.
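If you prefer staying in a notebook, a notebook-scoped install should also work (a minimal sketch; `%pip` is the standard Databricks notebook mechanism and installs from PyPI like the cluster library above):

```python
# Notebook-scoped install (sketch): affects only the current notebook session
%pip install fabricks

import fabricks
```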

### Setting Up Your First Project

Setting up a project means writing a runtime configuration plus the job definitions for each step; both are described below.

# Fabricks Runtime Configuration

The Fabricks runtime configuration is defined in a YAML file. This file specifies the settings for the Fabricks runtime, including options for the runtime environment, path options, Spark options, and the configuration for different stages of the data pipeline (bronze, silver, gold, etc.).

A sample can be found in the [tests](tests/runtime/fabricks/conf.5589296195699698.yml).

## Configuration Options

- `name`: The name of the configuration.
- `options`: General options for the runtime. This includes:
  - `secret_scope`: The name of the secret scope in Databricks.
  - `timeout`: The timeout for the runtime in seconds.
  - `workers`: The number of workers for the runtime.
- `path_options`: Options for the storage path. This includes:
  - `storage`: The storage path for the data.
- `spark_options`: Options for Spark. This includes:
  - `sql`: SQL options for Spark.
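For illustration, the runtime-level options above could be laid out as follows (a sketch: the storage account and all values are hypothetical, not taken from the sample file):

```yaml
# Hypothetical runtime configuration (illustrative values only)
name: fabricks
options:
  secret_scope: fabricks-scope    # Databricks secret scope holding credentials
  timeout: 3600                   # runtime timeout in seconds
  workers: 8                      # number of workers
path_options:
  storage: abfss://fabricks@mystorageaccount.dfs.core.windows.net
spark_options:
  sql:
    spark.sql.shuffle.partitions: 200   # example Spark SQL option
```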

## Data Pipeline Stages

The configuration file defines the settings for the different stages of the data pipeline:

- `bronze`: The initial stage of the data pipeline. This includes:
  - `name`: The name of the stage.
  - `path_options`: Options for the storage path.
  - `options`: Options for the stage.

  For some samples, see the [tests/runtime/bronze folder](tests/runtime/bronze).

- `silver`: The intermediate stage of the data pipeline. This includes:
  - `name`: The name of the stage.
  - `path_options`: Options for the storage path.
  - `options`: Options for the stage.

  For some samples, see the [tests/runtime/silver folder](tests/runtime/silver).

- `gold`: The final stage of the data pipeline. This includes:
  - `name`: The name of the stage.
  - `path_options`: Options for the storage path.
  - `options`: Options for the stage.

  For some samples, see the [tests/runtime/gold folder](tests/runtime/gold).

The folder names and the stage names can be configured in the main Fabricks config; you don't have to stick with the defaults, as sketched below.
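As an illustration of such an override (a sketch only: the stage names are hypothetical and the exact shape of the stage sections may differ from the real schema):

```yaml
# Hypothetical stage section with non-default stage names (illustrative only)
bronze:
  - name: landing        # replaces the default "bronze" folder/stage name
    path_options:
      storage: abfss://landing@mystorageaccount.dfs.core.windows.net
silver:
  - name: staging
gold:
  - name: marts
```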

## Other Configurations

- `powerbi`: Configuration for PowerBI integration.
- `databases`: Configuration for the databases.
- `credentials`: Credentials for accessing different resources.
- `variables`: Variables used in the configuration.

Please note that this is basic documentation based on the provided YAML file. The actual configuration options may vary depending on the specific requirements of your project.

# Bronze Step Configuration

The "bronze" step in Fabricks is the initial stage of the data pipeline. It is responsible for ingesting raw data and storing it in a "bronze" table for further processing. The configuration for the "bronze" step is defined in a YAML file. Each job in the "bronze" step has the following configuration options:

- `step`: The step in the data pipeline. For these jobs, it is always "bronze".
- `topic`: The topic of the job. This is usually the name of the data source.
- `item`: The item of the job. This is usually the name of the specific data item being processed.
- `tags`: Tags for the job. These can be used to categorize or filter jobs.
- `options`: Options for the job. This includes:
  - `mode`: The mode of the job. This can be "append", "memory", or "register".
  - `uri`: The URI of the data source. This is usually an Azure Blob File System (ABFS) URI.
  - `parser`: The parser to use for the data. This can be "monarch", "parquet", etc.
  - `keys`: The keys for the data. These are the columns that uniquely identify each row in the data.
  - `source`: The source of the data. This is usually the same as the topic.
  - `extender`: The extender for the data. This is used to extend the data with additional columns or transformations.
  - `encrypted_columns`: The columns in the data that are encrypted. These columns will be decrypted during the "bronze" step.
  - `calculated_columns`: The columns in the data that are calculated. These columns will be calculated during the "bronze" step.

Here's an example of a "bronze" step job:

```yaml
- job:
    step: bronze
    topic: king
    item: scd1
    tags: [test]
    options:
      mode: append
      uri: abfss://fabricks@$datahub/raw/king
      parser: monarch
      keys: [id]
      source: king
```
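The options not exercised above (`extender`, `encrypted_columns`, `calculated_columns`) might be combined like this (a sketch: the column names, the extender name, and the exact value shapes are hypothetical, not taken from the package):

```yaml
# Hypothetical bronze job using the remaining options (shapes are assumptions)
- job:
    step: bronze
    topic: queen
    item: scd1
    tags: [test]
    options:
      mode: append
      uri: abfss://fabricks@$datahub/raw/queen
      parser: parquet
      keys: [id]
      source: queen
      extender: add_load_date          # hypothetical registered extender
      encrypted_columns: [ssn]         # decrypted during the bronze step
      calculated_columns:
        full_name: concat(first_name, ' ', last_name)
```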

# Silver Step Configuration

The "silver" step in Fabricks is the intermediate stage of the data pipeline. It is responsible for processing the raw data ingested in the "bronze" step and storing it in a "silver" table for further processing. The configuration for the "silver" step is defined in a YAML file. Each job in the "silver" step has the following configuration options:

- `step`: The step in the data pipeline. For these jobs, it is always "silver".
- `topic`: The topic of the job. This is usually the name of the data source.
- `item`: The item of the job. This is usually the name of the specific data item being processed.
- `tags`: Tags for the job. These can be used to categorize or filter jobs.
- `options`: Options for the job. This includes:
  - `mode`: The mode of the job. This can be "update", "memory", "latest", "append", "combine", etc.
  - `change_data_capture`: The type of Change Data Capture (CDC) to use. This can be "scd1", "scd2", "nocdc", etc.
  - `parents`: The parent jobs that this job depends on. These are usually "bronze" step jobs.
  - `extender`: The extender for the data. This is used to extend the data with additional columns or transformations.
  - `order_duplicate_by`: The order to use when removing duplicates. This can be "asc" or "desc".
  - `check_options`: Options for checking the data. This includes:
    - `max_rows`: The maximum number of rows to check.
  - `stream`: Whether to stream the data. This can be "true" or "false".

Here's an example of a "silver" step job:

```yaml
- job:
    step: silver
    topic: king_and_queen
    item: scd1
    tags: [test]
    options:
      mode: update
      change_data_capture: scd1
      parents: [bronze.queen_scd1, bronze.king_scd1]
```

# Gold Step Configuration

The "gold" step in Fabricks is the final stage of the data pipeline. It is responsible for processing the data from the "silver" step and storing it in a "gold" table for consumption. The configuration for the "gold" step is defined in a YAML file. Each job in the "gold" step has the following configuration options:

- `step`: The step in the data pipeline. For these jobs, it is always "gold".
- `topic`: The topic of the job. This is usually the name of the data source.
- `item`: The item of the job. This is usually the name of the specific data item being processed.
- `tags`: Tags for the job. These can be used to categorize or filter jobs.
- `options`: Options for the job. This includes:
  - `mode`: The mode of the job. This can be "complete", "memory", etc.
  - `change_data_capture`: The type of Change Data Capture (CDC) to use. This can be "scd1", "scd2", "nocdc", etc.

Here's an example of a "gold" step job:

```yaml
- job:
    step: gold
    topic: scd2
    item: complete
    tags: [test]
    options:
      change_data_capture: scd2
      mode: complete
```

## Usage

// Instructions on how to use the framework go here

## License

This project is licensed under the terms of the MIT license.

File without changes

@@ -0,0 +1,31 @@
from databricks.sdk.runtime import dbutils, spark

from fabricks.context import BRONZE, GOLD, SECRET_SCOPE, SILVER
from fabricks.core.jobs.base.types import Bronzes, Golds, Silvers

# spark
SPARK = spark
DBUTILS = dbutils

# step
BRONZES = Bronzes
SILVERS = Silvers
GOLDS = Golds
STEPS = BRONZES + SILVERS + GOLDS


__all__ = [
    "BRONZE",
    "Bronzes",
    "BRONZES",
    "DBUTILS",
    "GOLD",
    "Golds",
    "GOLDS",
    "SECRET_SCOPE",
    "SILVER",
    "Silvers",
    "SILVERS",
    "SPARK",
    "STEPS",
]
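The module above (`fabricks/api/context.py` in the file list) re-exports the Spark session, `dbutils`, and the configured step collections. A notebook running on a cluster with a deployed runtime could consume it like this (a sketch, assuming a configured Fabricks runtime):

```python
# Sketch: consuming the re-exported context (assumes a configured runtime)
from fabricks.api.context import SPARK, STEPS

print(STEPS)                        # bronze + silver + gold step collections
SPARK.sql("SHOW DATABASES").show()  # the shared Spark session
```

Since `STEPS = BRONZES + SILVERS + GOLDS`, the step collections are list-like and can be iterated or concatenated.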

File without changes

File without changes

@@ -0,0 +1,147 @@
# Databricks notebook source
# MAGIC %pip install python-dotenv

# COMMAND ----------

import os
import subprocess
from pathlib import Path

from databricks.sdk.runtime import dbutils
from dotenv import load_dotenv

# COMMAND ----------

load_dotenv()

# COMMAND ----------

# check that the fabricks container is mounted
try:
    dbutils.fs.ls("mnt/fabricks")
except Exception:
    print("fabricks container not mounted")

# COMMAND ----------

try:
    dbutils.fs.ls("mnt/fabricks/versions")
except Exception:
    print("fabricks not found")

# COMMAND ----------

runtime = os.environ.get("path_runtime")
assert runtime
if not Path(runtime).exists():
    print("runtime not found")

# COMMAND ----------

notebooks = os.environ.get("path_notebooks")
assert notebooks
if not Path(notebooks).exists():
    print("notebooks not found")

# COMMAND ----------

scripts = os.environ.get("path_scripts")
assert scripts
if not Path(scripts).exists():
    print("scripts not found")

# COMMAND ----------

abfss_wheels = "abfss://fabricks-wheels@bmsstaprdeuwsoftware.dfs.core.windows.net"

# COMMAND ----------

version = os.environ.get("fabricks_version")

# COMMAND ----------

mnt_version = f"dbfs:/mnt/fabricks/versions/{version}"
fuse_mnt_version = f"/dbfs/mnt/fabricks/versions/{version}"

# COMMAND ----------

# remove any previous artifacts of this version, then recreate the folder
try:
    for f in dbutils.fs.ls(mnt_version):
        dbutils.fs.rm(f.path, True)
except Exception:
    pass

dbutils.fs.rm(mnt_version, True)
dbutils.fs.mkdirs(mnt_version)

# COMMAND ----------

dbutils.fs.rm(mnt_version, True)
dbutils.fs.mkdirs(mnt_version)

# COMMAND ----------

print("copying version to", f"{mnt_version}/version")

# copy each artifact that is not already present on the mount
for f in dbutils.fs.ls(f"{abfss_wheels}/{version}"):
    to = f"{mnt_version}/{f.name}"

    try:
        dbutils.fs.ls(to)
    except Exception:
        print("uploading", f.name)
        dbutils.fs.cp(f.path, to, recurse=True)

# COMMAND ----------

print("pip install requirements.txt")

# install the copied wheels offline via the FUSE mount
out = subprocess.run(
    [
        "pip",
        "install",
        "--no-index",
        f"--find-links={fuse_mnt_version}/wheels",
        "-r",
        f"{fuse_mnt_version}/requirements.txt",
    ],
    capture_output=True,
)

if out.returncode != 0:  # any non-zero exit code is a failure
    raise ValueError(out.stderr)

# COMMAND ----------

latest = os.environ.get("latest")
assert latest
latest = latest.lower() == "true"

# COMMAND ----------

versions = [version, "2023.12.*"] if latest else [version]

# COMMAND ----------

print("deploy init script")

# write one cluster init script per version alias
for v in versions:
    path = f"{scripts}/{v}.sh"

    with open(path, "w") as sh:
        sh.write(
            f"""
            sudo echo FABRICKS_RUNTIME={runtime} >> /etc/environment
            sudo echo FABRICKS_NOTEBOOKS={notebooks}/{version} >> /etc/environment
            sudo echo FABRICKS_VERSION={version} >> /etc/environment

            /databricks/python/bin/pip install --no-index --find-links='{fuse_mnt_version}/wheels' -r '{fuse_mnt_version}/requirements.txt'
            /databricks/python/bin/pip install sqlglot
            /databricks/python/bin/pip install jinja2
            """.replace("    ", "").strip()  # dedent the heredoc before writing
        )

# COMMAND ----------

dbutils.notebook.exit("exit (0)")

# COMMAND ----------