mdbt-0.4.27-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbt/__init__.py +0 -0
- mdbt/ai_core.py +116 -0
- mdbt/build_dbt_docs_ai.py +147 -0
- mdbt/build_unit_test_data_ai.py +129 -0
- mdbt/cmdline.py +368 -0
- mdbt/core.py +113 -0
- mdbt/expectations_output_builder.py +74 -0
- mdbt/lightdash.py +84 -0
- mdbt/main.py +474 -0
- mdbt/precommit_format.py +84 -0
- mdbt/prompts.py +244 -0
- mdbt/recce.py +66 -0
- mdbt/sort_yaml_fields.py +148 -0
- mdbt/sql_sorter.py +165 -0
- mdbt-0.4.27.dist-info/METADATA +28 -0
- mdbt-0.4.27.dist-info/RECORD +20 -0
- mdbt-0.4.27.dist-info/WHEEL +5 -0
- mdbt-0.4.27.dist-info/entry_points.txt +2 -0
- mdbt-0.4.27.dist-info/licenses/LICENSE +21 -0
- mdbt-0.4.27.dist-info/top_level.txt +1 -0
mdbt/prompts.py
ADDED
@@ -0,0 +1,244 @@
class Prompts:

    @property
    def dbt_docs_gte_l3_prompt(self):
        return """
You will help build DBT documentation YML files for a given SQL query. Sometimes you will be asked to generate a description from scratch, other times you will be asked to fill in missing columns that exist in the model, but not in the documentation.

These models are built for Marki Microwave, an RF semiconductor company that designs and builds RF and microwave components. The models are built to help the company understand their business better. The models are built in DBT and are used to generate reports and dashboards. Consider acronyms and other terminology in the context of the RF semiconductor industry.

Primary DBT Guidelines:

3. Include a config block for each model:
    a. Set `materialized` to `table`.
    b. Do not include a `sort` key.
4. For long descriptions, use the following format so the lines are not too long:
```
- name: replacement_plan_id
  description: >
    Identifier for the replacement plan if applicable. A replacement plan is defined as a plan that
    started within 5 days before, or up to 30 days after the end date of the prior plan and is not
    an add-on plan.
```
5. If you find a column that is in the existing documentation, but not in the model, comment it out with a `#` at the start of each line.
6. Only return the YML documentation file contents. Do not provide an explanation.
7. Always place a new line between the end of the `description` line and the start of the next column name identified by `- name:`.
8. If updating an existing file, do not replace or modify existing descriptions, data_tests:, or config blocks. Only add new ones, and comment out descriptions that don't exist in the SQL. Use data_tests:, not tests:.
9. Order the column descriptions in the YML file in the same order they appear in the SQL query. If you are modifying an existing YML file, still re-order the elements; don't comment out the old element location and put a new element in.
10. If modifying an existing YML, leave the value of materialized as is. Do not change it to `table` if it is `view` or vice versa.
11. Use lowercase for all column, metric, and dimension names. The sample data will come back with uppercase column names, but the YML file should have lowercase names.

12. If there is a primary key ID column as the first field, then add a
```
data_tests:
  - unique
  - not_null
```

If the first column is not a primary key ID column, then use a "unique_combination_of_columns" test like this:
```
data_tests:
  - dbt_utils.unique_combination_of_columns:
      combination_of_columns:
        - month_at
        - network_name
```

Full example output:
```
version: 2

models:
  - name: appointment_revenue_mrpv_metrics
    description: >
      This model provides Medical Revenue Per Visit (MRPV) metrics. It includes filterable dimensions by first appt/rev
      veterinarian, location, and network.

    config:
      materialized: table

    columns:
      - name: order_at
        description: "The date associated with the order."
        data_tests:
          - not_null

      - name: location_id
        description: "The identifier for the location where the order was placed."

      - name: location_name
        description: "The name of the location where the order was placed."

      - name: network_name
        description: "The name of the network associated with the order."

      - name: medical_revenue
        description: "Total revenue from medical services."

      - name: medical_appointment_count
        description: "Count of medical appointments."
```

This is a CSV data sample from the model:
"""

    @property
    def dbt_docs_lte_l2_prompt(self):
        return """
You will help build DBT documentation YML files for a given SQL query. Sometimes you will be asked to generate a description from scratch, other times you will be asked to fill in missing columns that exist in the model, but not in the documentation.

These models are built for Marki Microwave, an RF semiconductor company that designs and builds RF and microwave components. The models are built to help the company understand their business better. The models are built in DBT and are used to generate reports and dashboards. Consider acronyms and other terminology in the context of the RF semiconductor industry.

Primary DBT Guidelines:

3. Include a config block for each model:
    a. Set `materialized` to `view`.
    b. Do not include a `sort` key.
    c. If the model name ends in `_mat`, set materialized to `table`.
4. Add data_tests: `unique` and `not_null` to the primary key only. Do not add data_tests: to any other columns. Use data_tests:, not tests:.
5. For long descriptions, use the following format so the lines are not too long:
```
- name: replacement_plan_id
  description: >
    Identifier for the replacement plan if applicable. A replacement plan is defined as a plan that
    started within 5 days before, or up to 30 days after the end date of the prior plan and is not
    an add-on plan.
```
6. If you find a column that is in the existing documentation, but not in the model, comment it out with a `#` at the start of each line.
7. Only return the YML documentation file contents. Do not provide an explanation.
8. Always place a new line between the end of the `description` line and the start of the next column name identified by `- name:`.
9. Do not replace or modify existing descriptions, data_tests:, or config blocks. Only add new ones, and comment out descriptions that don't exist in the SQL.
10. Order the column descriptions in the YML file in the same order they appear in the SQL query. If you are modifying an existing YML file, still re-order the elements; don't comment out the old element location and put a new element in.
11. If modifying an existing YML, leave the value of materialized as is. Do not change it to `table` if it is `view` or vice versa.
12. Use lowercase for all column, metric, and dimension names. The sample data will come back with uppercase column names, but the YML file should have lowercase names.

Full example output:
```
version: 2

models:
  - name: stg_vs_example
    description: >
      This is an example description that is longer than one line. It is a good example of how to write a long
      description using the > character.

    config:
      materialized: table

    columns:
      - name: order_at
        description: "The date associated with the order."

      - name: location_id
        description: "The identifier for the location where the order was placed."

      - name: location_name
        description: "The name of the location where the order was placed."

      - name: network_name
        description: "The name of the network associated with the order."

      - name: medical_revenue
        description: "Total revenue from medical services."

      - name: medical_appointment_count
        description: "Count of medical appointments."
```
This is a CSV data sample from the model:
"""

    @property
    def build_unit_test_prompt(self):
        return """
You will help build mockup input and expected output data for DBT unit tests using the EqualExperts/dbt_unit_testing package. The input and expected data will be in a CSV-style format using | as a separator between fields.

The user will pass a SQL DBT model that looks like this as input:
```
select
    date(o.order_item_at) as revenue_day_at
    , o.location_id
    , o.is_medical_revenue
    , o.is_plan_payment
    , o.location_name
    , o.product_type_name
    , sum(o.total_before_tax) as revenue_sum

from {{ dbt_unit_testing.ref('fct_order_items_mat') }} o
group by revenue_day_at
    , location_id
    , is_medical_revenue
    , is_plan_payment
    , location_name
    , product_type_name
```

You will return data that looks like this.
Use this line for the dbt_unit_testing.test call (the name is filled in): `{{% call dbt_unit_testing.test('{model_name}', '{model_name} unit test') %}}`:

```
{{{{ config(tags=['unit-test']) }}}}

--depends-on: {{{{ ref('fct_appointments') }}}}

{{% call dbt_unit_testing.test('model_name', 'Description of Test') %}}

{{% call dbt_unit_testing.mock_ref('fct_order_items_mat', options={{"input_format": "csv"}}) %}}

ORDER_ITEM_AT                          |LOCATION_ID |IS_MEDICAL_REVENUE |IS_PLAN_PAYMENT |LOCATION_NAME |PRODUCT_TYPE_NAME |TOTAL_BEFORE_TAX
'2024-01-01 00:00:00.000000000 -08:00' |123         |TRUE               |TRUE            |'ABC123'      |'Product 1'       |25
'2024-01-01 00:00:00.000000000 -08:00' |123         |TRUE               |FALSE           |'ABC123'      |'Product 2'       |25
'2024-01-01 00:00:00.000000000 -08:00' |123         |FALSE              |FALSE           |'ABC123'      |'Product 2'       |25
'2024-01-01 00:00:00.000000000 -08:00' |123         |TRUE               |TRUE            |'ABC123'      |'Product 1'       |25
'2024-01-01 00:00:00.000000000 -08:00' |123         |TRUE               |FALSE           |'ABC123'      |'Product 2'       |25
'2024-01-01 00:00:00.000000000 -08:00' |123         |FALSE              |FALSE           |'ABC123'      |'Product 2'       |25
'2024-01-01 00:00:00.000000000 -08:00' |987         |TRUE               |TRUE            |'DEF123'      |'Product 1'       |25
'2024-01-01 00:00:00.000000000 -08:00' |987         |TRUE               |FALSE           |'DEF123'      |'Product 2'       |25
'2024-01-01 00:00:00.000000000 -08:00' |987         |FALSE              |FALSE           |'DEF123'      |'Product 2'       |25

{{% endcall %}}

{{% call dbt_unit_testing.expect({{"input_format": "csv"}}) %}}

REVENUE_DAY_AT |LOCATION_ID |IS_MEDICAL_REVENUE |IS_PLAN_PAYMENT |LOCATION_NAME |PRODUCT_TYPE_NAME |REVENUE_SUM
'2024-01-01'   |123         |TRUE               |TRUE            |'ABC123'      |'Product 1'       |50
'2024-01-01'   |123         |TRUE               |FALSE           |'ABC123'      |'Product 2'       |50
'2024-01-01'   |123         |FALSE              |FALSE           |'ABC123'      |'Product 2'       |50
'2024-01-01'   |987         |TRUE               |TRUE            |'DEF123'      |'Product 1'       |25
'2024-01-01'   |987         |TRUE               |FALSE           |'DEF123'      |'Product 2'       |25
'2024-01-01'   |987         |FALSE              |FALSE           |'DEF123'      |'Product 2'       |25

{{% endcall %}}
{{% endcall %}}
```

Note how the model aggregates the expected REVENUE_SUM. Do your best to aggregate the expected data based on the SQL in the input. The goal is to create a model that is easy to read and hand validate.

When creating the mock data, follow these guidelines:

1. For boolean input columns, create enough rows that you can test both TRUE and FALSE values for all columns. For example, if there are 3 boolean columns, you would need eight rows to test all combinations of TRUE and FALSE values.
2. For ID columns, use simple numbers. For example, 123, 456, 789, etc.
3. For name columns, try to identify when the name should be the same for a given ID. For example, if there are 3 rows with LOCATION_ID = 123, then the LOCATION_NAME should be the same for all three rows.
4. If a column ends in _at, it is either a date or a timestamp. If it is a date, the column will end in _day_at. Timestamps will only end in _at. If the column is a date, use a date format like '2024-01-01'. If the column is a timestamp, use a timestamp format like '2024-01-01 00:00:00.000000000 -08:00'.
5. For numeric or dollar value columns, use simple numbers. For example, 10, 20.
6. Use the minimal number of rows needed to fully exercise the logic in the model. Try to sort dates and group by locations, names, or other similar common values.
7. You will need a `mock_ref` block for each DBT model in the input SQL. DBT models will be defined as either {{ ref('model_name') }} or {{ dbt_unit_testing.ref('model_name') }}.
8. Align the columns in the input and expected data using tabs. You can use as many rows as needed.
9. Output the data in the same format as the example. Use as many `dbt_unit_testing.mock_ref` blocks as needed.
10. Use enough rows with variation that at least one aggregation can be created. For example, if the model groups by date and ID, you will need at least two rows with the same date and ID but different values for the columns being aggregated.
11. Location names are always three uppercase letters followed by three numbers. For example, "ABC123" or "DEF123". Network names are always the first three uppercase letters of the network name, like "ABC" or "DEF".
12. At the top of the file, as in the example, add a --depends-on line for each mock_ref model used in the input SQL. For example: --depends-on: {{ ref('fct_appointments') }}
13. Enclose any string or date in single quotes. Even if the sample data has long strings, truncate to no more than 30 characters, preferably fewer, unless the logic requires more.
14. Try to limit the date range in the input data to one or two days of time span, unless more is needed to fully test the logic of the model.
15. Do not include timezones in mockup data unless the sample data provided for that model includes timezones.
16. Do not include columns in the mock_ref blocks that are not used by the SQL model being tested.
17. All dates and strings must be enclosed in single quotes in the mock data.
Do not provide an explanation, only return the code for the test.
"""


# %%
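Both documentation prompts end with "This is a CSV data sample from the model:", so the caller is expected to append sample rows at runtime. Below is a minimal sketch of how these properties might be consumed; the client setup, model name, and sample data are illustrative assumptions, and the real call site likely lives in mdbt/ai_core.py and mdbt/build_dbt_docs_ai.py, which are not shown in this section.
```
# Hypothetical usage sketch -- names marked below are assumptions, not taken from this diff.
from openai import OpenAI

from mdbt.prompts import Prompts

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment
system_prompt = Prompts().dbt_docs_gte_l3_prompt  # a property, so no parentheses
csv_sample = "ORDER_AT,LOCATION_ID\n2024-01-01,123\n"  # made-up sample rows

response = client.chat.completions.create(
    model="gpt-4o",  # assumed model name
    messages=[
        {"role": "system", "content": system_prompt + csv_sample},
        {"role": "user", "content": "select order_at, location_id from orders"},
    ],
)
print(response.choices[0].message.content)
```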
mdbt/recce.py
ADDED
@@ -0,0 +1,66 @@
import os
import shutil
import subprocess

from click.core import Context

from mdbt.core import Core


class Recce(Core):

    def __init__(self, test_mode=False):
        super().__init__(test_mode=test_mode)

    def recce(self, ctx: Context):
        print("Downloading production artifacts.")
        current_dir = os.getcwd()
        # Initialize variables
        target_path = None
        logs = None
        # Check if the current directory ends with 'transform'
        if current_dir.endswith("transform"):
            target_path = os.path.join("target-base")
            logs = os.path.join("logs")
        elif os.path.isdir(os.path.join(current_dir, "transform")):
            target_path = os.path.join("transform", "target-base")
            logs = os.path.join("transform", "logs")
        else:
            raise FileNotFoundError(
                "No 'transform' directory found in the current execution directory."
            )
        os.makedirs(target_path, exist_ok=True)

        # Delete all files in target_path
        for file_name in os.listdir(target_path):
            file_path = os.path.join(target_path, file_name)
            if os.path.isfile(file_path):
                os.remove(file_path)

        # Pull artifacts from Snowflake. These are the latest production artifacts.
        try:
            if not self.test_mode:
                subprocess.run(
                    ["dbt", "run-operation", "get_last_artifacts"], check=True
                )
        except subprocess.CalledProcessError as e:
            self.handle_cmd_line_error(e)

        # Copy files from logs to target_path
        if os.path.isdir(logs):
            for file_name in os.listdir(logs):
                full_file_path = os.path.join(logs, file_name)
                if os.path.isfile(full_file_path):
                    shutil.copy(full_file_path, target_path)
        else:
            raise FileNotFoundError(
                f"'logs' directory not found at expected path: {logs}"
            )

        # Start the recce server
        try:
            if not self.test_mode:
                subprocess.run(["dbt", "docs", "generate"], check=True)
                subprocess.run(["recce", "server"], check=True)
        except subprocess.CalledProcessError as e:
            self.handle_cmd_line_error(e)
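A minimal sketch of how Recce.recce might be wired to a CLI command; the actual wiring lives in mdbt/cmdline.py, which is not shown in this section, so this is illustrative only.
```
# Hypothetical CLI wiring -- the real command definition is in mdbt/cmdline.py.
import click

from mdbt.recce import Recce


@click.command()
@click.pass_context
def recce(ctx):
    """Pull production artifacts into target-base and launch the recce server."""
    Recce().recce(ctx)


if __name__ == "__main__":
    recce()
```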
mdbt/sort_yaml_fields.py
ADDED
@@ -0,0 +1,148 @@
# Using ruamel.yaml instead of the default yaml library as it preserves the order of keys in the dictionary.
import os
import sys
from typing import Any
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple

import pyperclip
from dotenv import find_dotenv
from dotenv import load_dotenv
from ruamel.yaml import YAML

from mdbt.ai_core import AiCore
from mdbt.main import MDBT

load_dotenv(find_dotenv("../.env"))
load_dotenv(find_dotenv(".env"))

# Modify the dumper to not sort keys and to use ordered dict format


class SortYAML(AiCore):
    def __init__(self):
        super().__init__()
        self.yaml = YAML(typ="rt")
        self.yaml.preserve_quotes = True
        self.yaml.explicit_start = True
        self.yaml.indent(mapping=2, sequence=4, offset=2)

    def main(
        self,
        select: Optional[str] = None,
        all_files: Optional[bool] = False,
        overwrite: Optional[bool] = False,
    ):
        models = self._get_models(select, all_files)
        if len(models) > 1 and not overwrite:
            raise ValueError(
                "Multiple models found. Default copy to clipboard only works with one model. Use the --overwrite flag "
                "if processing more than one model at a time."
            )
        if not models:
            raise ValueError(f"No models found for select '{select}'")

        for model in models:
            original_file_path = model["original_file_path"]
            model_name = model["name"]
            schema_file, table_name = self._get_schema_path_and_table(
                original_file_path=original_file_path, model_name=model_name
            )
            schema_data = self.read_yml(schema_file)
            db_columns = self.get_db_columns(table_name)
            updated_schema = self.reorganize_columns(schema_data, db_columns)

            if overwrite:
                with open(schema_file, "w") as stream:
                    self.yaml.dump(
                        updated_schema, stream, transform=self.clean_top_line
                    )
                self.yaml.dump(
                    updated_schema, sys.stdout, transform=self.clean_top_line
                )
                print(f"Schema file '{schema_file}' updated")
            else:
                self.save_yml_to_clipboard(updated_schema)

    def _get_models(self, select: str, all_files: bool) -> List[Dict[str, Any]]:
        mmw = MDBT()
        if not all_files:
            args = [
                "--select",
                select,
                "--exclude",
                "resource_type:test resource_type:seed resource_type:snapshot resource_type:source",
            ]
        else:
            args = [
                "--exclude",
                "resource_type:test resource_type:seed resource_type:snapshot resource_type:source",
            ]
        ls_json = mmw.dbt_ls_to_json(args)

        return ls_json

    @staticmethod
    def _get_schema_path_and_table(
        original_file_path: str, model_name: str
    ) -> Tuple[str, str]:
        schema_file = original_file_path[:-3] + "yml"
        schema = os.environ.get("DEV_SCHEMA")
        if not schema:
            raise ValueError("DEV_SCHEMA environment variable is not set")
        database = os.environ.get("DEV_DATABASE")
        if not database:
            raise ValueError("DEV_DATABASE environment variable is not set")
        table_name = f"{database}.{schema}.{model_name}"
        print(f"Schema file: {schema_file}")
        print(f"Table name: {table_name}")
        return schema_file, table_name

    def read_yml(self, file_path: str) -> Dict[str, Any]:
        with open(file_path, "r") as stream:
            return self.yaml.load(stream)

    def save_yml_to_clipboard(self, data: Dict[str, Any]):
        # ruamel's YAML class does not expose a simple dump-to-string API, so this is a
        # workaround: dump() streams the serialized YAML through the copy_to_clip transform,
        # which saves the string to the clipboard while still writing to stdout.
        self.yaml.dump(data, sys.stdout, transform=self.copy_to_clip)
        print("Sorted YAML schema copied to clipboard!")

    def copy_to_clip(self, string_yaml: str) -> str:
        # Remove the first line of the string
        string_yaml = self.clean_top_line(string_yaml)
        pyperclip.copy(string_yaml)
        print("Sorted YAML schema copied to clipboard!")
        return string_yaml

    def clean_top_line(self, string_yaml: str) -> str:
        str_lines = string_yaml.split("\n")
        string_yaml = "\n".join(str_lines[1:])
        return string_yaml

    def get_db_columns(self, table_name: str) -> list:
        self._cur.execute(f"SELECT * FROM {table_name} LIMIT 0")
        return [desc[0].lower() for desc in self._cur.description]

    def reorganize_columns(
        self, schema_data: Dict[str, Any], db_columns: list
    ) -> Dict[str, Any]:
        if "models" not in schema_data or not schema_data["models"]:
            raise ValueError("YML schema does not contain any models")
        model = schema_data["models"][0]  # Assuming a single model for simplicity
        columns = model.get("columns", [])
        col_dict = {col["name"]: col for col in columns}

        sorted_columns = [col_dict[col] for col in db_columns if col in col_dict]

        model["columns"] = sorted_columns
        schema_data["models"][0] = model
        return schema_data


if __name__ == "__main__":
    y = SortYAML()
    y.main(select=None, all_files=True, overwrite=True)
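To make the reordering behavior concrete, here is a small self-contained example of the core logic in reorganize_columns (hand-written data, not from the package): documented columns are re-emitted in database column order, and documented columns missing from the database are dropped.
```
# Standalone illustration of the reordering step inside reorganize_columns.
schema_data = {
    "models": [
        {
            "name": "stg_example",
            "columns": [
                {"name": "location_name"},
                {"name": "order_at"},
                {"name": "location_id"},
            ],
        }
    ]
}
db_columns = ["order_at", "location_id", "location_name"]  # order from SELECT * LIMIT 0

col_dict = {col["name"]: col for col in schema_data["models"][0]["columns"]}
sorted_columns = [col_dict[col] for col in db_columns if col in col_dict]
print([col["name"] for col in sorted_columns])
# -> ['order_at', 'location_id', 'location_name']
```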
mdbt/sql_sorter.py
ADDED
@@ -0,0 +1,165 @@
#!/usr/bin/env python3
import re
import sys
from typing import List, Tuple

import pyperclip

from mdbt.core import Core


class ColumnSorter(Core):
    """Sorts SQL select lines from the clipboard based on alias and data-type rules."""

    def __init__(self) -> None:
        """Initialize the sorter."""
        super().__init__()

    def main(self) -> None:
        """Read lines from the clipboard, sort them, write them back, and print."""
        # 1) Read everything currently on the clipboard
        try:
            clipboard_content = pyperclip.paste()
        except pyperclip.PyperclipException:
            sys.stderr.write('Error: could not access clipboard. Is pyperclip installed and supported?\n')
            sys.exit(1)

        # 2) Split into individual lines
        lines = clipboard_content.splitlines()

        # 3) Sort them
        sorted_lines = self.sort_lines(lines)

        # 4) Join back together
        result = '\n'.join(sorted_lines)

        # 5) Copy sorted result back to clipboard
        try:
            pyperclip.copy(result)
        except pyperclip.PyperclipException:
            sys.stderr.write('Warning: could not write back to clipboard. Outputting to stdout instead.\n')
            print(result)
            sys.exit(0)

        # 6) Also print to stdout for verification
        print(result)

    def parse_line(self, line: str) -> Tuple[str, str, str]:
        """Split one line into (prefix, full_expression, alias).

        - prefix is any leading commas/spaces (e.g. ', ' and indentation).
        - full_expression is everything after that prefix, up to the alias (if one exists).
        - alias is the name used for sorting (the part after 'AS', or if no AS, the base column name).

        Args:
            line: A single select-list line, e.g. ', "FOO"::varchar as foo_id' or ', project_id'.

        Returns:
            A tuple (prefix, full_expression, alias):
                - prefix: leading commas and spaces (e.g. ', '),
                - full_expression: the expression/column + cast (e.g. '"FOO"::varchar'),
                - alias: the alias (e.g. 'foo_id') or base column name if no AS.
        """
        # 1) Extract prefix (leading comma + whitespace), if present
        m = re.match(r'^(\s*,\s*)(.*)$', line)
        if m:
            prefix = m.group(1)
            rest = m.group(2).strip()
        else:
            # No leading comma, treat everything as rest
            prefix = ''
            rest = line.strip()

        # 2) Look for 'AS' (case-insensitive), split into left/right.
        #    Use regex to split on whitespace+as+whitespace, max once.
        lower_rest = rest.lower()
        if re.search(r'\s+as\s+', lower_rest):
            parts = re.split(r'\s+as\s+', rest, maxsplit=1, flags=re.IGNORECASE)
            expression_part = parts[0].strip()
            alias = parts[1].strip()
        else:
            # No AS: take the expression exactly as rest; derive the alias from the expression
            expression_part = rest
            # If there's a '::', drop the cast and use the part before it
            if '::' in expression_part:
                alias = expression_part.split('::', 1)[0].strip()
            else:
                # If there is nothing to split, the alias is simply the whole rest
                alias = expression_part

        return prefix, expression_part, alias

    def get_group(self, expression: str, alias: str) -> int:
        """Determine the sorting group (0..4) for a given line.

        The rules (in order) are:
            0: alias ends with '_id'
            1: alias ends with '_at'
            2: alias starts with 'is_'
            3: anything except variant casts
            4: fields cast as VARIANT (i.e., '::variant' appears)

        Args:
            expression: The full expression (column plus any cast).
            alias: The alias to use for naming rules.

        Returns:
            An integer group (0 through 4), where lower means higher priority.
        """
        a = alias.lower()
        expr_lower = expression.lower()

        if a.endswith('_id'):
            return 0
        if a.endswith('_at'):
            return 1
        if a.startswith('is_'):
            return 2
        # Check for a VARIANT cast
        if re.search(r'::\s*variant\b', expr_lower):
            return 4
        # Otherwise, everything else is group 3
        return 3

    def sort_lines(self, lines: List[str]) -> List[str]:
        """Sort all given lines according to group and alias.

        Returns:
            A new list of lines (with original prefixes) in sorted order.
        """
        parsed: List[Tuple[int, str, str, str]] = []
        # parsed tuples: (group, alias_lower, prefix, full_expression)
        for raw_line in lines:
            # Skip empty lines
            if not raw_line.strip():
                continue
            prefix, expr, alias = self.parse_line(raw_line)
            group = self.get_group(expr, alias)
            parsed.append((group, alias.lower(), prefix, expr))

        # Sort first by group number, then by alias lexicographically
        parsed.sort(key=lambda t: (t[0], t[1]))

        # Reconstruct each line as prefix + expression [+ ' as ' + alias if the original had AS].
        # To preserve the original "AS" style, print prefix + expression + ' as ' + alias,
        # except when the alias exactly equals the expression (i.e. no AS in the original),
        # in which case the ' as ' is dropped.
        sorted_lines: List[str] = []
        for group, alias_lower, prefix, expr in parsed:
            # Determine whether the original expr contained an explicit alias (case-insensitive).
            # Checking expr.lower().endswith(alias_lower) fails if a cast was present, so instead:
            # if alias_lower != expr.split('::')[0].strip().lower(), assume the original used AS.
            base_no_cast = expr.split('::', 1)[0].strip().lower()
            if base_no_cast != alias_lower:
                # The original must have had an explicit alias, so add ' as alias'
                line_text = f'{prefix}{expr} as {alias_lower}'
            else:
                # No AS needed
                line_text = f'{prefix}{expr}'
            sorted_lines.append(line_text)

        return sorted_lines
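A worked example of the grouping rules, shown as hand-written input plus the expected output rather than a live run, since Core's constructor may require project configuration not visible in this diff.
```
# Input select-list lines, deliberately out of order:
lines = [
    ", revenue_details::variant as raw_payload",
    ", total_before_tax",
    ", is_refunded",
    ", order_at",
    ", location_id",
]
# ColumnSorter.sort_lines sorts by group, then alias:
#   group 0 (*_id):      , location_id
#   group 1 (*_at):      , order_at
#   group 2 (is_*):      , is_refunded
#   group 3 (other):     , total_before_tax
#   group 4 (::variant): , revenue_details::variant as raw_payload
```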
mdbt-0.4.27.dist-info/METADATA
ADDED
@@ -0,0 +1,28 @@
Metadata-Version: 2.4
Name: mdbt
Version: 0.4.27
Summary: A CLI tool to manage dbt builds with state handling and manifest management
Author: Craig Lathrop
Author-email: info@markimicrowave.com
Classifier: Programming Language :: Python :: 3
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Requires-Python: >=3.9
License-File: LICENSE
Requires-Dist: click<9.0.0,>=8.0.0
Requires-Dist: pyperclip<2.0.0,>=1.8.0
Requires-Dist: snowflake-connector-python[pandas]<4.0.0,>=3.11.0
Requires-Dist: python-dotenv<1.2.0,>=1.0.0
Requires-Dist: openai<2.0.0,>=1.35.0
Requires-Dist: sqlfluff==3.4.0
Requires-Dist: sqlfluff-templater-dbt==3.4.0
Requires-Dist: wordninja==2.0.0
Requires-Dist: ruamel.yaml<0.18.0
Requires-Dist: recce<=0.44.3
Dynamic: author
Dynamic: author-email
Dynamic: classifier
Dynamic: license-file
Dynamic: requires-dist
Dynamic: requires-python
Dynamic: summary
mdbt-0.4.27.dist-info/RECORD
ADDED
@@ -0,0 +1,20 @@
mdbt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
mdbt/ai_core.py,sha256=diJFT35pGOEIQpUYOk6GFBgQgmK2sHj_F_kfkjeaEyQ,3963
mdbt/build_dbt_docs_ai.py,sha256=BOuFBUJyf2l7pOzw9LSTXc5Gk-8LoGqKKDRE4Jgx0p0,5309
mdbt/build_unit_test_data_ai.py,sha256=OqJXDerhg54_QVIlVtXNxfk5TKVe68i6JnrV_8iEDrc,4292
mdbt/cmdline.py,sha256=meNATu3BzP_4Htt5VcoT923mlh9NsfK8og0JQgn9PCE,10822
mdbt/core.py,sha256=VZhIwJEPm33hhTyYn9v7EAjXU0aDVkRsxbozz3G6OAI,4109
mdbt/expectations_output_builder.py,sha256=AXKEM-WO7FecYzfMLwzsOnQnVf7AiHBi_khyidE2lJs,3195
mdbt/lightdash.py,sha256=qJBJ-pc5mN8GBA1MZElRhtA8aldrX-AgvHtha4iOA-Y,2745
mdbt/main.py,sha256=UO3r7zOXmVpjnAIz2eeZVeQMFmgXa698Gm42Wo2qhRU,16939
mdbt/precommit_format.py,sha256=9HC10mh4QQIgaQSxdAwaCXbsiHT9cCrLdbc3PAQkotc,2845
mdbt/prompts.py,sha256=2vCvh9hamgop92kGGaMKtap11F2MZiM7hHKjcwX4lhQ,13992
mdbt/recce.py,sha256=P14CvWd7lRgTPUW7BVMLmcclSqby-_uSgpoI3r5VjTA,2327
mdbt/sort_yaml_fields.py,sha256=1PROsrz8KubSr0bVmPq3oAw4V-eSiZh1uGRR5B-uaA4,5648
mdbt/sql_sorter.py,sha256=8bd6svrtcXp7ePT2g4FTGLTW55qbsVjXgUmba7L-G-4,6467
mdbt-0.4.27.dist-info/licenses/LICENSE,sha256=DrJpgQEYhttwpwcE56BzrGZ1aEfR_tqfaxsI5NlsYOE,1072
mdbt-0.4.27.dist-info/METADATA,sha256=nYnEobjgoL3XJcryk3GBVzLZgxniFvp6J7u-w55sZzU,920
mdbt-0.4.27.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
mdbt-0.4.27.dist-info/entry_points.txt,sha256=fVft1CYVP2MtZMMtsXN67S1T5RszfgKtAuaeoKLdCow,43
mdbt-0.4.27.dist-info/top_level.txt,sha256=-PP7vAl9EgVjRTzBovElczsPNjOfja6kjZssNmv5vo0,5
mdbt-0.4.27.dist-info/RECORD,,