mdbt-0.4.27-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mdbt/prompts.py ADDED
@@ -0,0 +1,244 @@
+ class Prompts:
+
+     @property
+     def dbt_docs_gte_l3_prompt(self):
+         return """
+ You will help build DBT documentation YML files for a given SQL query. Sometimes you will be asked to generate a description from scratch; other times you will be asked to fill in missing columns that exist in the model but not in the documentation.
+
+ These models are built for Marki Microwave, an RF semiconductor company that designs and builds RF and microwave components. The models are built in DBT and are used to generate the reports and dashboards that help the company understand its business. Interpret acronyms and other terminology in the context of the RF semiconductor industry.
+
+ Primary DBT Guidelines:
+
+ 1. Include a config block for each model:
+    a. Set `materialized` to `table`.
+    b. Do not include a `sort` key.
+ 2. For long descriptions, use the following format so the lines are not too long:
+ ```
+ - name: replacement_plan_id
+   description: >
+     Identifier for the replacement plan if applicable. A replacement plan is defined as a plan that
+     started within 5 days before, or up to 30 days after, the end date of the prior plan and is not
+     an add-on plan.
+ ```
+ 3. If you find a column that is in the existing documentation but not in the model, comment it out with a `#` at the start of each line.
+ 4. Only return the YML documentation file contents. Do not provide an explanation.
+ 5. Always place a blank line between the end of the `description` line and the start of the next column name, identified by `- name:`.
+ 6. If updating an existing file, do not replace or modify existing descriptions, `data_tests:`, or config blocks. Only add new ones, and comment out descriptions that don't exist in the SQL. Use `data_tests:`, not `tests:`.
+ 7. Order the column descriptions in the YML file in the same order they appear in the SQL query. If you are modifying an existing YML file, still re-order the elements; don't comment out the old element location and put a new element in.
+ 8. If modifying an existing YML, leave the value of `materialized` as is. Do not change it to `table` if it is `view`, or vice versa.
+ 9. Use lowercase for all column, metric, and dimension names. The sample data will come back with uppercase column names, but the YML file should have lowercase names.
+
+ 10. If there is a primary key ID column as the first field, then add:
+ ```
+ data_tests:
+   - unique
+   - not_null
+ ```
+
+ If the first column is not a primary key ID column, then use a "unique_combination_of_columns" test like this:
+ ```
+ data_tests:
+   - dbt_utils.unique_combination_of_columns:
+       combination_of_columns:
+         - month_at
+         - network_name
+ ```
+
+
+
+ Full example output:
+ ```
+ version: 2
+
+ models:
+   - name: appointment_revenue_mrpv_metrics
+     description: >
+       This model provides Medical Revenue Per Visit (MRPV) metrics. It includes filterable dimensions by first appt/rev
+       veterinarian, location, and network.
+
+     config:
+       materialized: table
+
+     columns:
+       - name: order_at
+         description: "The date associated with the order."
+         data_tests:
+           - not_null
+
+       - name: location_id
+         description: "The identifier for the location where the order was placed."
+
+       - name: location_name
+         description: "The name of the location where the order was placed."
+
+       - name: network_name
+         description: "The name of the network associated with the order."
+
+       - name: medical_revenue
+         description: "Total revenue from medical services."
+
+       - name: medical_appointment_count
+         description: "Count of medical appointments."
+
+ ```
+
+ This is a CSV data sample from the model:
+ """
+
+     @property
+     def dbt_docs_lte_l2_prompt(self):
+         return """
+ You will help build DBT documentation YML files for a given SQL query. Sometimes you will be asked to generate a description from scratch; other times you will be asked to fill in missing columns that exist in the model but not in the documentation.
+
+ These models are built for Marki Microwave, an RF semiconductor company that designs and builds RF and microwave components. The models are built in DBT and are used to generate the reports and dashboards that help the company understand its business. Interpret acronyms and other terminology in the context of the RF semiconductor industry.
+
+ Primary DBT Guidelines:
+
+ 1. Include a config block for each model:
+    a. Set `materialized` to `view`.
+    b. Do not include a `sort` key.
+    c. If the model name ends in `_mat`, set `materialized` to `table`.
+ 2. Add the data_tests `unique` and `not_null` to the primary key only. Do not add `data_tests:` to any other columns. Use `data_tests:`, not `tests:`.
+ 3. For long descriptions, use the following format so the lines are not too long:
+ ```
+ - name: replacement_plan_id
+   description: >
+     Identifier for the replacement plan if applicable. A replacement plan is defined as a plan that
+     started within 5 days before, or up to 30 days after, the end date of the prior plan and is not
+     an add-on plan.
+ ```
+ 4. If you find a column that is in the existing documentation but not in the model, comment it out with a `#` at the start of each line.
+ 5. Only return the YML documentation file contents. Do not provide an explanation.
+ 6. Always place a blank line between the end of the `description` line and the start of the next column name, identified by `- name:`.
+ 7. Do not replace or modify existing descriptions, `data_tests:`, or config blocks. Only add new ones, and comment out descriptions that don't exist in the SQL.
+ 8. Order the column descriptions in the YML file in the same order they appear in the SQL query. If you are modifying an existing YML file, still re-order the elements; don't comment out the old element location and put a new element in.
+ 9. If modifying an existing YML, leave the value of `materialized` as is. Do not change it to `table` if it is `view`, or vice versa.
+ 10. Use lowercase for all column, metric, and dimension names. The sample data will come back with uppercase column names, but the YML file should have lowercase names.
+
+
+ Full example output:
+ ```
+ version: 2
+
+ models:
+   - name: stg_vs_example
+     description: >
+       This is an example description that is longer than one line. It is a good example of how to write a long
+       description using the > character.
+
+     config:
+       materialized: view
+
+     columns:
+       - name: order_at
+         description: "The date associated with the order."
+
+       - name: location_id
+         description: "The identifier for the location where the order was placed."
+
+       - name: location_name
+         description: "The name of the location where the order was placed."
+
+       - name: network_name
+         description: "The name of the network associated with the order."
+
+       - name: medical_revenue
+         description: "Total revenue from medical services."
+
+       - name: medical_appointment_count
+         description: "Count of medical appointments."
+ ```
+ This is a CSV data sample from the model:
+ """
+
+     @property
+     def build_unit_test_prompt(self):
+
+         return """
+ You will help build mockup input and expected output data for DBT unit tests using the EqualExperts/dbt_unit_testing package. The input and expected data will be in a CSV-style format using | as a separator between fields.
+
+ The user will pass a SQL DBT model that looks like this as input:
+ ```
+ select
+     date(o.order_item_at) as revenue_day_at
+     , o.location_id
+     , o.is_medical_revenue
+     , o.is_plan_payment
+     , o.location_name
+     , o.product_type_name
+     , sum(o.total_before_tax) as revenue_sum
+
+ from {{ dbt_unit_testing.ref('fct_order_items_mat') }} o
+ group by revenue_day_at
+     , location_id
+     , is_medical_revenue
+     , is_plan_payment
+     , location_name
+     , product_type_name
+ ```
+
+ You will return data that looks like this.
+ Use this line for the dbt_unit_testing.test call (the name is filled in): `{{% call dbt_unit_testing.test('{model_name}', '{model_name} unit test') %}}`
+
+ ```
+ {{{{ config(tags=['unit-test']) }}}}
+
+ --depends-on: {{{{ ref('fct_appointments') }}}}
+
+ {{% call dbt_unit_testing.test('model_name', 'Description of Test') %}}
+
+ {{% call dbt_unit_testing.mock_ref('fct_order_items_mat', options={{"input_format": "csv"}}) %}}
+
+ ORDER_ITEM_AT |LOCATION_ID |IS_MEDICAL_REVENUE |IS_PLAN_PAYMENT |LOCATION_NAME |PRODUCT_TYPE_NAME |TOTAL_BEFORE_TAX
+ '2024-01-01 00:00:00.000000000 -08:00' |123 |TRUE |TRUE |'ABC123' |'Product 1' |25
+ '2024-01-01 00:00:00.000000000 -08:00' |123 |TRUE |FALSE |'ABC123' |'Product 2' |25
+ '2024-01-01 00:00:00.000000000 -08:00' |123 |FALSE |FALSE |'ABC123' |'Product 2' |25
+ '2024-01-01 00:00:00.000000000 -08:00' |123 |TRUE |TRUE |'ABC123' |'Product 1' |25
+ '2024-01-01 00:00:00.000000000 -08:00' |123 |TRUE |FALSE |'ABC123' |'Product 2' |25
+ '2024-01-01 00:00:00.000000000 -08:00' |123 |FALSE |FALSE |'ABC123' |'Product 2' |25
+ '2024-01-01 00:00:00.000000000 -08:00' |987 |TRUE |TRUE |'DEF123' |'Product 1' |25
+ '2024-01-01 00:00:00.000000000 -08:00' |987 |TRUE |FALSE |'DEF123' |'Product 2' |25
+ '2024-01-01 00:00:00.000000000 -08:00' |987 |FALSE |FALSE |'DEF123' |'Product 2' |25
+
+ {{% endcall %}}
+
+ {{% call dbt_unit_testing.expect({{"input_format": "csv"}}) %}}
+
+ REVENUE_DAY_AT |LOCATION_ID |IS_MEDICAL_REVENUE |IS_PLAN_PAYMENT |LOCATION_NAME |PRODUCT_TYPE_NAME |REVENUE_SUM
+ '2024-01-01' |123 |TRUE |TRUE |'ABC123' |'Product 1' |50
+ '2024-01-01' |123 |TRUE |FALSE |'ABC123' |'Product 2' |50
+ '2024-01-01' |123 |FALSE |FALSE |'ABC123' |'Product 2' |50
+ '2024-01-01' |987 |TRUE |TRUE |'DEF123' |'Product 1' |25
+ '2024-01-01' |987 |TRUE |FALSE |'DEF123' |'Product 2' |25
+ '2024-01-01' |987 |FALSE |FALSE |'DEF123' |'Product 2' |25
+
+ {{% endcall %}}
+ {{% endcall %}}
+ ```
+
+ Note how the model aggregates the expected REVENUE_SUM. Do your best to aggregate the expected data based on the SQL in the input. The goal is to create a test that is easy to read and hand-validate.
+
+ When creating the mock data, follow these guidelines:
+
+ 1. For boolean input columns, create enough rows that you can test both TRUE and FALSE values for all columns. For example, if there are 3 boolean columns, you would need eight rows to cover every combination of TRUE and FALSE values.
+ 2. For ID columns, use simple numbers. For example, 123, 456, 789, etc.
+ 3. For name columns, try to identify when the name should be the same for a given ID. For example, if there are 3 rows with LOCATION_ID = 123, then the LOCATION_NAME should be the same for all three rows.
+ 4. If a column ends in _at, it is either a date or a timestamp. If it is a date, the column will end in _day_at; timestamps will only end in _at. If the column is a date, use a date format like '2024-01-01'. If the column is a timestamp, use a timestamp format like '2024-01-01 00:00:00.000000000 -08:00'.
+ 5. For numeric or dollar-value columns, use simple numbers. For example, 10, 20.
+ 6. Use the minimal number of rows needed to fully exercise the logic in the model. Try to sort dates and group by locations, names, or other similar common values.
+ 7. You will need a `mock_ref` block for each DBT model in the input SQL. DBT models will be defined as either {{ ref('model_name') }} or {{ dbt_unit_testing.ref('model_name') }}.
+ 8. Align the columns in the input and expected data using tabs. You can use as many rows as needed.
+ 9. Output the data in the same format as the example. Use as many `dbt_unit_testing.mock_ref` blocks as needed.
+ 10. Use enough rows with variation that at least one aggregation can be created. For example, if the model groups by date and ID, you will need at least two rows with the same date and ID but different values for the columns being aggregated.
+ 11. Location names are always three uppercase letters followed by three numbers, for example "ABC123" or "DEF123". Network names are always the first three uppercase letters of the network name, like "ABC" or "DEF".
+ 12. At the top of the file, as in the example, add a --depends-on line for each mock_ref model used in the input SQL. Example: --depends-on: {{ ref('fct_appointments') }}
+ 13. Enclose any string or date in single quotes. Even if the sample data has long strings, truncate them to no more than 30 characters, preferably fewer, unless the logic requires something longer.
+ 14. Try to limit the date range in the input data to a span of one or two days, unless more is needed to fully test the logic of the model.
+ 15. Do not include timezones in mockup data unless the sample data provided for that model includes timezones.
+ 16. Do not include columns in the mock_ref blocks that are not used by the SQL model being tested.
+ 17. All dates and strings must be enclosed in single quotes in the mock data.
+ Do not provide an explanation; only return the code for the test.
+ """
+
+
+ # %%
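For orientation, a minimal sketch of how these prompt properties might be consumed is shown below. The `build_docs_prompt` helper, the layer flag, and the CSV sample are illustrative assumptions; the package's real call sites (e.g. `build_dbt_docs_ai.py`, listed in the RECORD below) are not shown in this diff.

```
# Hypothetical usage sketch, not part of the package.
from mdbt.prompts import Prompts


def build_docs_prompt(csv_sample: str, layer_gte_l3: bool = True) -> str:
    """Assemble a docs-generation prompt by appending a CSV sample of model data."""
    prompts = Prompts()
    base = (
        prompts.dbt_docs_gte_l3_prompt
        if layer_gte_l3
        else prompts.dbt_docs_lte_l2_prompt
    )
    # Both templates end with "This is a CSV data sample from the model:",
    # so the sample is appended directly after the template text.
    return base + csv_sample


print(build_docs_prompt("ORDER_AT,LOCATION_ID\n2024-01-01,123\n")[:120])
```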
mdbt/recce.py ADDED
@@ -0,0 +1,66 @@
+ import os
+ import shutil
+ import subprocess
+
+ from click.core import Context
+
+ from mdbt.core import Core
+
+
+ class Recce(Core):
+
+     def __init__(self, test_mode=False):
+         super().__init__(test_mode=test_mode)
+
+     def recce(self, ctx: Context):
+         print("Downloading production artifacts.")
+         current_dir = os.getcwd()
+         # Initialize variables
+         target_path = None
+         logs = None
+         # Check if current directory ends with 'transform'
+         if current_dir.endswith("transform"):
+             target_path = os.path.join("target-base")
+             logs = os.path.join("logs")
+         elif os.path.isdir(os.path.join(current_dir, "transform")):
+             target_path = os.path.join("transform", "target-base")
+             logs = os.path.join("transform", "logs")
+         else:
+             raise FileNotFoundError(
+                 "No 'transform' directory found in the current execution directory."
+             )
+         os.makedirs(target_path, exist_ok=True)
+
+         # Delete all files in target_path
+         for file_name in os.listdir(target_path):
+             file_path = os.path.join(target_path, file_name)
+             if os.path.isfile(file_path):
+                 os.remove(file_path)
+
+         # Pull artifacts from Snowflake. These are the latest production artifacts.
+         try:
+             if not self.test_mode:
+                 subprocess.run(
+                     ["dbt", "run-operation", "get_last_artifacts"], check=True
+                 )
+         except subprocess.CalledProcessError as e:
+             self.handle_cmd_line_error(e)
+
+         # Copy files from logs to target_path
+         if os.path.isdir(logs):
+             for file_name in os.listdir(logs):
+                 full_file_path = os.path.join(logs, file_name)
+                 if os.path.isfile(full_file_path):
+                     shutil.copy(full_file_path, target_path)
+         else:
+             raise FileNotFoundError(
+                 f"'logs' directory not found at expected path: {logs}"
+             )
+
+         # Start recce server
+         try:
+             if not self.test_mode:
+                 subprocess.run(["dbt", "docs", "generate"], check=True)
+                 subprocess.run(["recce", "server"], check=True)
+         except subprocess.CalledProcessError as e:
+             self.handle_cmd_line_error(e)
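Since `recce()` accepts a `click.Context`, it is presumably invoked from the package's click CLI (`cmdline.py`, which is not shown in this diff). A minimal wiring sketch under that assumption:

```
# Hypothetical wiring sketch; the actual command lives in mdbt/cmdline.py.
import click

from mdbt.recce import Recce


@click.command(name="recce")
@click.pass_context
def recce_cmd(ctx: click.Context) -> None:
    """Pull production artifacts into target-base and launch the recce server."""
    # Constructing with test_mode=True would skip the dbt/recce subprocess
    # calls and exercise only the directory handling.
    Recce(test_mode=False).recce(ctx)


if __name__ == "__main__":
    recce_cmd()
```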
mdbt/sort_yaml_fields.py ADDED
@@ -0,0 +1,147 @@
+ # Using ruamel.yaml instead of the default YAML library, as it preserves the order of keys in the dictionary.
+ import os
+ import sys
+ from typing import Any
+ from typing import Dict
+ from typing import List
+ from typing import Optional
+ from typing import Tuple
+
+ import pyperclip
+ from dotenv import find_dotenv
+ from dotenv import load_dotenv
+ from ruamel.yaml import YAML
+
+ from mdbt.ai_core import AiCore
+ from mdbt.main import MDBT
+
+ load_dotenv(find_dotenv("../.env"))
+ load_dotenv(find_dotenv(".env"))
+
+ # Configure the dumper to not sort keys and to preserve mapping order
+
+
+ class SortYAML(AiCore):
+     def __init__(self):
+         super().__init__()
+         self.yaml = YAML(typ="rt")
+         self.yaml.preserve_quotes = True
+         self.yaml.explicit_start = True
+         self.yaml.indent(mapping=2, sequence=4, offset=2)
+
+     def main(
+         self,
+         select: Optional[str] = None,
+         all_files: Optional[bool] = False,
+         overwrite: Optional[bool] = False,
+     ):
+         models = self._get_models(select, all_files)
+         if len(models) > 1 and not overwrite:
+             raise ValueError(
+                 "Multiple models found. The default copy-to-clipboard mode only works with one model. Use the --overwrite flag "
+                 "if processing more than one model at a time."
+             )
+         if not models:
+             raise ValueError(f"No models found for select '{select}'")
+
+         for model in models:
+             original_file_path = model["original_file_path"]
+             model_name = model["name"]
+             schema_file, table_name = self._get_schema_path_and_table(
+                 original_file_path=original_file_path, model_name=model_name
+             )
+             schema_data = self.read_yml(schema_file)
+             db_columns = self.get_db_columns(table_name)
+             updated_schema = self.reorganize_columns(schema_data, db_columns)
+
+             if overwrite:
+                 with open(schema_file, "w") as stream:
+                     self.yaml.dump(
+                         updated_schema, stream, transform=self.clean_top_line
+                     )
+                 self.yaml.dump(
+                     updated_schema, sys.stdout, transform=self.clean_top_line
+                 )
+                 print(f"Schema file '{schema_file}' updated")
+             else:
+                 self.save_yml_to_clipboard(updated_schema)
+
+     def _get_models(self, select: str, all_files: bool) -> List[Dict[str, Any]]:
+         mmw = MDBT()
+         if not all_files:
+             args = [
+                 "--select",
+                 select,
+                 "--exclude",
+                 "resource_type:test resource_type:seed resource_type:snapshot resource_type:source",
+             ]
+         else:
+             args = [
+                 "--exclude",
+                 "resource_type:test resource_type:seed resource_type:snapshot resource_type:source",
+             ]
+         ls_json = mmw.dbt_ls_to_json(args)
+
+         return ls_json
+
+     @staticmethod
+     def _get_schema_path_and_table(
+         original_file_path: str, model_name: str
+     ) -> Tuple[str, str]:
+         schema_file = original_file_path[:-3] + "yml"
+         schema = os.environ.get("DEV_SCHEMA")
+         if not schema:
+             raise ValueError("DEV_SCHEMA environment variable is not set")
+         database = os.environ.get("DEV_DATABASE")
+         if not database:
+             raise ValueError("DEV_DATABASE environment variable is not set")
+         table_name = f"{database}.{schema}.{model_name}"
+         print(f"Schema file: {schema_file}")
+         print(f"Table name: {table_name}")
+         return schema_file, table_name
+
+     def read_yml(self, file_path: str) -> Dict[str, Any]:
+         with open(file_path, "r") as stream:
+             return self.yaml.load(stream)
+
+     def save_yml_to_clipboard(self, data: Dict[str, Any]):
+         # ruamel.yaml's round-trip dumper is built around writing to a stream rather
+         # than returning a string, so as a workaround we dump to stdout and use the
+         # transform hook to capture the serialized text and copy it to the clipboard.
+         self.yaml.dump(data, sys.stdout, transform=self.copy_to_clip)
+
+     def copy_to_clip(self, string_yaml: str) -> str:
+         # Remove the first line of the string (the explicit '---' document start)
+         string_yaml = self.clean_top_line(string_yaml)
+         pyperclip.copy(string_yaml)
+         print("Sorted YAML schema copied to clipboard!")
+         return string_yaml
+
+     def clean_top_line(self, string_yaml: str) -> str:
+         str_lines = string_yaml.split("\n")
+         string_yaml = "\n".join(str_lines[1:])
+         return string_yaml
+
+     def get_db_columns(self, table_name: str) -> list:
+         self._cur.execute(f"SELECT * FROM {table_name} LIMIT 0")
+         return [desc[0].lower() for desc in self._cur.description]
+
+     def reorganize_columns(
+         self, schema_data: Dict[str, Any], db_columns: list
+     ) -> Dict[str, Any]:
+         if "models" not in schema_data or not schema_data["models"]:
+             raise ValueError("YML schema does not contain any models")
+         model = schema_data["models"][0]  # Assuming a single model for simplicity
+         columns = model.get("columns", [])
+         col_dict = {col["name"]: col for col in columns}
+
+         sorted_columns = [col_dict[col] for col in db_columns if col in col_dict]
+
+         model["columns"] = sorted_columns
+         schema_data["models"][0] = model
+         return schema_data
+
+
+ if __name__ == "__main__":
+     y = SortYAML()
+     y.main(select=None, all_files=True, overwrite=True)
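`reorganize_columns` is pure dictionary manipulation, so the reordering can be checked without a warehouse connection. A small sketch under that assumption (the model and column names are invented, and the method is called unbound to sidestep `SortYAML.__init__`, which inherits whatever connection setup `AiCore` performs; `ai_core.py` is not shown in this diff):

```
# Hypothetical check of the reordering logic, not part of the package.
from mdbt.sort_yaml_fields import SortYAML

schema_data = {
    "models": [
        {
            "name": "example_model",
            "columns": [
                {"name": "b_col", "description": "Second in the database."},
                {"name": "a_col", "description": "First in the database."},
            ],
        }
    ]
}

# Column order as get_db_columns() would report it from `SELECT * ... LIMIT 0`.
db_columns = ["a_col", "b_col"]

result = SortYAML.reorganize_columns(None, schema_data, db_columns)
print([c["name"] for c in result["models"][0]["columns"]])  # ['a_col', 'b_col']
```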
mdbt/sql_sorter.py ADDED
@@ -0,0 +1,156 @@
+ #!/usr/bin/env python3
+ import re
+ import sys
+ from typing import List, Tuple
+
+ import pyperclip
+
+ from mdbt.core import Core
+
+
+ class ColumnSorter(Core):
+     """Sorts SQL select lines from the clipboard based on alias and data-type rules."""
+
+     def __init__(self) -> None:
+         """Initialize the sorter; the lines to sort are passed to sort_lines()."""
+         super().__init__()
+
+     def main(self) -> None:
+         """Read lines from the clipboard, sort them, write them back, and print."""
+         # 1) Read everything currently on the clipboard
+         try:
+             clipboard_content = pyperclip.paste()
+         except pyperclip.PyperclipException:
+             sys.stderr.write('Error: could not access clipboard. Is pyperclip installed and supported?\n')
+             sys.exit(1)
+
+         # 2) Split into individual lines
+         lines = clipboard_content.splitlines()
+
+         # 3) Sort them
+         sorted_lines = self.sort_lines(lines)
+
+         # 4) Join back together
+         result = '\n'.join(sorted_lines)
+
+         # 5) Copy the sorted result back to the clipboard
+         try:
+             pyperclip.copy(result)
+         except pyperclip.PyperclipException:
+             sys.stderr.write('Warning: could not write back to clipboard. Outputting to stdout instead.\n')
+             print(result)
+             sys.exit(0)
+
+         # 6) Also print to stdout for verification
+         print(result)
+
+     def parse_line(self, line: str) -> Tuple[str, str, str]:
+         """Split one line into (prefix, full_expression, alias).
+
+         - prefix is any leading commas/spaces (e.g. ', ' and indentation).
+         - full_expression is everything after that prefix, up to the alias (if one exists).
+         - alias is the name used for sorting (the part after 'AS', or, if there is no AS, the base column name).
+
+         Args:
+             line: A single select-list line, e.g. ', "FOO"::varchar as foo_id' or ', project_id'.
+
+         Returns:
+             A tuple (prefix, full_expression, alias):
+             - prefix: leading commas and spaces (e.g. ', '),
+             - full_expression: the expression/column + cast (e.g. '"FOO"::varchar'),
+             - alias: the alias (e.g. 'foo_id') or the base column name if no AS.
+         """
+         # 1) Extract the prefix (leading comma + whitespace), if present
+         m = re.match(r'^(\s*,\s*)(.*)$', line)
+         if m:
+             prefix = m.group(1)
+             rest = m.group(2).strip()
+         else:
+             # No leading comma; treat everything as rest
+             prefix = ''
+             rest = line.strip()
+
+         # 2) Look for 'AS' (case-insensitive) and split into left/right,
+         #    splitting on whitespace+as+whitespace at most once
+         if re.search(r'\s+as\s+', rest, flags=re.IGNORECASE):
+             parts = re.split(r'\s+as\s+', rest, maxsplit=1, flags=re.IGNORECASE)
+             expression_part = parts[0].strip()
+             alias = parts[1].strip()
+         else:
+             # No AS: take the expression exactly as rest and derive the alias from it
+             expression_part = rest
+             # If there's a '::', drop the cast and use the part before it
+             if '::' in expression_part:
+                 alias = expression_part.split('::', 1)[0].strip()
+             else:
+                 # Nothing to split; the alias is simply the whole rest
+                 alias = expression_part
+
+         return prefix, expression_part, alias
+
+     def get_group(self, expression: str, alias: str) -> int:
+         """Determine the sorting group (0..4) for a given line.
+
+         The rules (in order) are:
+             0: alias ends with '_id'
+             1: alias ends with '_at'
+             2: alias starts with 'is_'
+             3: anything except variant casts
+             4: fields cast as VARIANT (i.e., '::variant' appears)
+
+         Args:
+             expression: The full expression (column plus any cast).
+             alias: The alias to use for the naming rules.
+
+         Returns:
+             An integer group (0 through 4), where lower means higher priority.
+         """
+         a = alias.lower()
+         expr_lower = expression.lower()
+
+         if a.endswith('_id'):
+             return 0
+         if a.endswith('_at'):
+             return 1
+         if a.startswith('is_'):
+             return 2
+         # Check for a VARIANT cast
+         if re.search(r'::\s*variant\b', expr_lower):
+             return 4
+         # Otherwise, everything else is group 3
+         return 3
+
+     def sort_lines(self, lines: List[str]) -> List[str]:
+         """Sort all given lines according to group and alias.
+
+         Returns:
+             A new list of lines (with original prefixes) in sorted order.
+         """
+         parsed: List[Tuple[int, str, str, str]] = []
+         # parsed tuples: (group, alias_lower, prefix, full_expression)
+         for raw_line in lines:
+             # Skip empty lines
+             if not raw_line.strip():
+                 continue
+             prefix, expr, alias = self.parse_line(raw_line)
+             group = self.get_group(expr, alias)
+             parsed.append((group, alias.lower(), prefix, expr))
+
+         # Sort first by group number, then by alias lexicographically
+         parsed.sort(key=lambda t: (t[0], t[1]))
+
+         # Reconstruct each line as prefix + expression, adding ' as ' + alias only when
+         # the original line had an explicit alias. We detect that by comparing the alias
+         # to the expression with any cast stripped: if they differ, the original used AS.
+         sorted_lines: List[str] = []
+         for group, alias_lower, prefix, expr in parsed:
+             base_no_cast = expr.split('::', 1)[0].strip().lower()
+             if base_no_cast != alias_lower:
+                 # The original had an explicit alias, so re-append ' as alias'
+                 line_text = f'{prefix}{expr} as {alias_lower}'
+             else:
+                 # No AS needed
+                 line_text = f'{prefix}{expr}'
+             sorted_lines.append(line_text)
+
+         return sorted_lines
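A short sketch of the resulting sort order on a handful of select-list lines (the column names are invented, and this assumes `Core()` requires no setup arguments; `core.py` is not shown in this diff):

```
# Hypothetical usage sketch, not part of the package.
from mdbt.sql_sorter import ColumnSorter

lines = [
    ', raw_payload::variant',
    ', total_before_tax',
    ', created_at',
    ', "ORDER_KEY"::varchar as order_id',
    ', is_medical_revenue',
]

for line in ColumnSorter().sort_lines(lines):
    print(line)

# _id columns sort first, then _at, then is_, then everything else,
# with VARIANT casts last:
#   , "ORDER_KEY"::varchar as order_id
#   , created_at
#   , is_medical_revenue
#   , total_before_tax
#   , raw_payload::variant
```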
mdbt-0.4.27.dist-info/METADATA ADDED
@@ -0,0 +1,28 @@
+ Metadata-Version: 2.4
+ Name: mdbt
+ Version: 0.4.27
+ Summary: A CLI tool to manage dbt builds with state handling and manifest management
+ Author: Craig Lathrop
+ Author-email: info@markimicrowave.com
+ Classifier: Programming Language :: Python :: 3
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.9
+ License-File: LICENSE
+ Requires-Dist: click<9.0.0,>=8.0.0
+ Requires-Dist: pyperclip<2.0.0,>=1.8.0
+ Requires-Dist: snowflake-connector-python[pandas]<4.0.0,>=3.11.0
+ Requires-Dist: python-dotenv<1.2.0,>=1.0.0
+ Requires-Dist: openai<2.0.0,>=1.35.0
+ Requires-Dist: sqlfluff==3.4.0
+ Requires-Dist: sqlfluff-templater-dbt==3.4.0
+ Requires-Dist: wordninja==2.0.0
+ Requires-Dist: ruamel.yaml<0.18.0
+ Requires-Dist: recce<=0.44.3
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: license-file
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
mdbt-0.4.27.dist-info/RECORD ADDED
@@ -0,0 +1,20 @@
+ mdbt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ mdbt/ai_core.py,sha256=diJFT35pGOEIQpUYOk6GFBgQgmK2sHj_F_kfkjeaEyQ,3963
+ mdbt/build_dbt_docs_ai.py,sha256=BOuFBUJyf2l7pOzw9LSTXc5Gk-8LoGqKKDRE4Jgx0p0,5309
+ mdbt/build_unit_test_data_ai.py,sha256=OqJXDerhg54_QVIlVtXNxfk5TKVe68i6JnrV_8iEDrc,4292
+ mdbt/cmdline.py,sha256=meNATu3BzP_4Htt5VcoT923mlh9NsfK8og0JQgn9PCE,10822
+ mdbt/core.py,sha256=VZhIwJEPm33hhTyYn9v7EAjXU0aDVkRsxbozz3G6OAI,4109
+ mdbt/expectations_output_builder.py,sha256=AXKEM-WO7FecYzfMLwzsOnQnVf7AiHBi_khyidE2lJs,3195
+ mdbt/lightdash.py,sha256=qJBJ-pc5mN8GBA1MZElRhtA8aldrX-AgvHtha4iOA-Y,2745
+ mdbt/main.py,sha256=UO3r7zOXmVpjnAIz2eeZVeQMFmgXa698Gm42Wo2qhRU,16939
+ mdbt/precommit_format.py,sha256=9HC10mh4QQIgaQSxdAwaCXbsiHT9cCrLdbc3PAQkotc,2845
+ mdbt/prompts.py,sha256=2vCvh9hamgop92kGGaMKtap11F2MZiM7hHKjcwX4lhQ,13992
+ mdbt/recce.py,sha256=P14CvWd7lRgTPUW7BVMLmcclSqby-_uSgpoI3r5VjTA,2327
+ mdbt/sort_yaml_fields.py,sha256=1PROsrz8KubSr0bVmPq3oAw4V-eSiZh1uGRR5B-uaA4,5648
+ mdbt/sql_sorter.py,sha256=8bd6svrtcXp7ePT2g4FTGLTW55qbsVjXgUmba7L-G-4,6467
+ mdbt-0.4.27.dist-info/licenses/LICENSE,sha256=DrJpgQEYhttwpwcE56BzrGZ1aEfR_tqfaxsI5NlsYOE,1072
+ mdbt-0.4.27.dist-info/METADATA,sha256=nYnEobjgoL3XJcryk3GBVzLZgxniFvp6J7u-w55sZzU,920
+ mdbt-0.4.27.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ mdbt-0.4.27.dist-info/entry_points.txt,sha256=fVft1CYVP2MtZMMtsXN67S1T5RszfgKtAuaeoKLdCow,43
+ mdbt-0.4.27.dist-info/top_level.txt,sha256=-PP7vAl9EgVjRTzBovElczsPNjOfja6kjZssNmv5vo0,5
+ mdbt-0.4.27.dist-info/RECORD,,