altimate-datapilot-cli 0.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139) hide show
  1. altimate_datapilot_cli-0.0.8.dist-info/AUTHORS.rst +5 -0
  2. altimate_datapilot_cli-0.0.8.dist-info/LICENSE +9 -0
  3. altimate_datapilot_cli-0.0.8.dist-info/METADATA +102 -0
  4. altimate_datapilot_cli-0.0.8.dist-info/RECORD +139 -0
  5. altimate_datapilot_cli-0.0.8.dist-info/WHEEL +5 -0
  6. altimate_datapilot_cli-0.0.8.dist-info/entry_points.txt +4 -0
  7. altimate_datapilot_cli-0.0.8.dist-info/top_level.txt +1 -0
  8. datapilot/__init__.py +1 -0
  9. datapilot/__main__.py +14 -0
  10. datapilot/cli/__init__.py +0 -0
  11. datapilot/cli/main.py +11 -0
  12. datapilot/clients/__init__.py +0 -0
  13. datapilot/clients/altimate/__init__.py +0 -0
  14. datapilot/clients/altimate/client.py +85 -0
  15. datapilot/clients/altimate/utils.py +75 -0
  16. datapilot/config/__init__.py +0 -0
  17. datapilot/config/config.py +16 -0
  18. datapilot/config/utils.py +32 -0
  19. datapilot/core/__init__.py +0 -0
  20. datapilot/core/insights/__init__.py +2 -0
  21. datapilot/core/insights/base/__init__.py +0 -0
  22. datapilot/core/insights/base/insight.py +34 -0
  23. datapilot/core/insights/report.py +16 -0
  24. datapilot/core/insights/schema.py +24 -0
  25. datapilot/core/insights/sql/__init__.py +0 -0
  26. datapilot/core/insights/sql/base/__init__.py +0 -0
  27. datapilot/core/insights/sql/base/insight.py +18 -0
  28. datapilot/core/insights/sql/runtime/__init__.py +0 -0
  29. datapilot/core/insights/sql/static/__init__.py +0 -0
  30. datapilot/core/insights/utils.py +20 -0
  31. datapilot/core/platforms/__init__.py +0 -0
  32. datapilot/core/platforms/dbt/__init__.py +0 -0
  33. datapilot/core/platforms/dbt/cli/__init__.py +0 -0
  34. datapilot/core/platforms/dbt/cli/cli.py +112 -0
  35. datapilot/core/platforms/dbt/constants.py +34 -0
  36. datapilot/core/platforms/dbt/exceptions.py +6 -0
  37. datapilot/core/platforms/dbt/executor.py +157 -0
  38. datapilot/core/platforms/dbt/factory.py +22 -0
  39. datapilot/core/platforms/dbt/formatting.py +45 -0
  40. datapilot/core/platforms/dbt/hooks/__init__.py +0 -0
  41. datapilot/core/platforms/dbt/hooks/executor_hook.py +86 -0
  42. datapilot/core/platforms/dbt/insights/__init__.py +115 -0
  43. datapilot/core/platforms/dbt/insights/base.py +133 -0
  44. datapilot/core/platforms/dbt/insights/checks/__init__.py +0 -0
  45. datapilot/core/platforms/dbt/insights/checks/base.py +26 -0
  46. datapilot/core/platforms/dbt/insights/checks/check_column_desc_are_same.py +105 -0
  47. datapilot/core/platforms/dbt/insights/checks/check_column_name_contract.py +154 -0
  48. datapilot/core/platforms/dbt/insights/checks/check_macro_args_have_desc.py +75 -0
  49. datapilot/core/platforms/dbt/insights/checks/check_macro_has_desc.py +63 -0
  50. datapilot/core/platforms/dbt/insights/checks/check_model_has_all_columns.py +96 -0
  51. datapilot/core/platforms/dbt/insights/checks/check_model_has_labels_keys.py +112 -0
  52. datapilot/core/platforms/dbt/insights/checks/check_model_has_meta_keys.py +108 -0
  53. datapilot/core/platforms/dbt/insights/checks/check_model_has_properties_file.py +64 -0
  54. datapilot/core/platforms/dbt/insights/checks/check_model_has_tests_by_group.py +118 -0
  55. datapilot/core/platforms/dbt/insights/checks/check_model_has_tests_by_name.py +114 -0
  56. datapilot/core/platforms/dbt/insights/checks/check_model_has_tests_by_type.py +119 -0
  57. datapilot/core/platforms/dbt/insights/checks/check_model_materialization_by_childs.py +129 -0
  58. datapilot/core/platforms/dbt/insights/checks/check_model_name_contract.py +132 -0
  59. datapilot/core/platforms/dbt/insights/checks/check_model_parents_and_childs.py +135 -0
  60. datapilot/core/platforms/dbt/insights/checks/check_model_parents_database.py +109 -0
  61. datapilot/core/platforms/dbt/insights/checks/check_model_parents_schema.py +109 -0
  62. datapilot/core/platforms/dbt/insights/checks/check_model_tags.py +87 -0
  63. datapilot/core/platforms/dbt/insights/checks/check_source_childs.py +97 -0
  64. datapilot/core/platforms/dbt/insights/checks/check_source_columns_have_desc.py +96 -0
  65. datapilot/core/platforms/dbt/insights/checks/check_source_has_all_columns.py +103 -0
  66. datapilot/core/platforms/dbt/insights/checks/check_source_has_freshness.py +94 -0
  67. datapilot/core/platforms/dbt/insights/checks/check_source_has_labels_keys.py +110 -0
  68. datapilot/core/platforms/dbt/insights/checks/check_source_has_loader.py +62 -0
  69. datapilot/core/platforms/dbt/insights/checks/check_source_has_meta_keys.py +117 -0
  70. datapilot/core/platforms/dbt/insights/checks/check_source_has_tests.py +82 -0
  71. datapilot/core/platforms/dbt/insights/checks/check_source_has_tests_by_group.py +117 -0
  72. datapilot/core/platforms/dbt/insights/checks/check_source_has_tests_by_name.py +113 -0
  73. datapilot/core/platforms/dbt/insights/checks/check_source_has_tests_by_type.py +119 -0
  74. datapilot/core/platforms/dbt/insights/checks/check_source_table_has_description.py +62 -0
  75. datapilot/core/platforms/dbt/insights/checks/check_source_tags.py +76 -0
  76. datapilot/core/platforms/dbt/insights/dbt_test/__init__.py +0 -0
  77. datapilot/core/platforms/dbt/insights/dbt_test/base.py +23 -0
  78. datapilot/core/platforms/dbt/insights/dbt_test/missing_primary_key_tests.py +130 -0
  79. datapilot/core/platforms/dbt/insights/dbt_test/test_coverage.py +118 -0
  80. datapilot/core/platforms/dbt/insights/governance/__init__.py +0 -0
  81. datapilot/core/platforms/dbt/insights/governance/base.py +23 -0
  82. datapilot/core/platforms/dbt/insights/governance/documentation_on_stale_columns.py +130 -0
  83. datapilot/core/platforms/dbt/insights/governance/exposures_dependent_on_private_models.py +90 -0
  84. datapilot/core/platforms/dbt/insights/governance/public_models_without_contracts.py +89 -0
  85. datapilot/core/platforms/dbt/insights/governance/undocumented_columns.py +148 -0
  86. datapilot/core/platforms/dbt/insights/governance/undocumented_public_models.py +110 -0
  87. datapilot/core/platforms/dbt/insights/modelling/README.md +15 -0
  88. datapilot/core/platforms/dbt/insights/modelling/__init__.py +0 -0
  89. datapilot/core/platforms/dbt/insights/modelling/base.py +31 -0
  90. datapilot/core/platforms/dbt/insights/modelling/direct_join_to_source.py +125 -0
  91. datapilot/core/platforms/dbt/insights/modelling/downstream_models_dependent_on_source.py +113 -0
  92. datapilot/core/platforms/dbt/insights/modelling/duplicate_sources.py +85 -0
  93. datapilot/core/platforms/dbt/insights/modelling/hard_coded_references.py +80 -0
  94. datapilot/core/platforms/dbt/insights/modelling/joining_of_upstream_concepts.py +79 -0
  95. datapilot/core/platforms/dbt/insights/modelling/model_fanout.py +126 -0
  96. datapilot/core/platforms/dbt/insights/modelling/multiple_sources_joined.py +83 -0
  97. datapilot/core/platforms/dbt/insights/modelling/root_model.py +82 -0
  98. datapilot/core/platforms/dbt/insights/modelling/source_fanout.py +102 -0
  99. datapilot/core/platforms/dbt/insights/modelling/staging_model_dependent_on_downstream_models.py +103 -0
  100. datapilot/core/platforms/dbt/insights/modelling/staging_model_dependent_on_staging_models.py +89 -0
  101. datapilot/core/platforms/dbt/insights/modelling/unused_sources.py +59 -0
  102. datapilot/core/platforms/dbt/insights/performance/__init__.py +0 -0
  103. datapilot/core/platforms/dbt/insights/performance/base.py +26 -0
  104. datapilot/core/platforms/dbt/insights/performance/chain_view_linking.py +92 -0
  105. datapilot/core/platforms/dbt/insights/performance/exposure_parent_materializations.py +104 -0
  106. datapilot/core/platforms/dbt/insights/schema.py +72 -0
  107. datapilot/core/platforms/dbt/insights/structure/__init__.py +0 -0
  108. datapilot/core/platforms/dbt/insights/structure/base.py +33 -0
  109. datapilot/core/platforms/dbt/insights/structure/model_directories_structure.py +92 -0
  110. datapilot/core/platforms/dbt/insights/structure/model_naming_conventions.py +97 -0
  111. datapilot/core/platforms/dbt/insights/structure/source_directories_structure.py +80 -0
  112. datapilot/core/platforms/dbt/insights/structure/test_directory_structure.py +74 -0
  113. datapilot/core/platforms/dbt/insights/utils.py +9 -0
  114. datapilot/core/platforms/dbt/schemas/__init__.py +0 -0
  115. datapilot/core/platforms/dbt/schemas/catalog.py +73 -0
  116. datapilot/core/platforms/dbt/schemas/manifest.py +462 -0
  117. datapilot/core/platforms/dbt/utils.py +525 -0
  118. datapilot/core/platforms/dbt/wrappers/__init__.py +0 -0
  119. datapilot/core/platforms/dbt/wrappers/catalog/__init__.py +0 -0
  120. datapilot/core/platforms/dbt/wrappers/catalog/v1/__init__.py +0 -0
  121. datapilot/core/platforms/dbt/wrappers/catalog/v1/wrapper.py +18 -0
  122. datapilot/core/platforms/dbt/wrappers/catalog/wrapper.py +9 -0
  123. datapilot/core/platforms/dbt/wrappers/manifest/__init__.py +0 -0
  124. datapilot/core/platforms/dbt/wrappers/manifest/v11/__init__.py +0 -0
  125. datapilot/core/platforms/dbt/wrappers/manifest/v11/schemas.py +47 -0
  126. datapilot/core/platforms/dbt/wrappers/manifest/v11/wrapper.py +396 -0
  127. datapilot/core/platforms/dbt/wrappers/manifest/wrapper.py +35 -0
  128. datapilot/core/platforms/dbt/wrappers/run_results/__init__.py +0 -0
  129. datapilot/core/platforms/dbt/wrappers/run_results/run_results.py +39 -0
  130. datapilot/exceptions/__init__.py +0 -0
  131. datapilot/exceptions/exceptions.py +10 -0
  132. datapilot/schemas/__init__.py +0 -0
  133. datapilot/schemas/constants.py +5 -0
  134. datapilot/schemas/nodes.py +19 -0
  135. datapilot/schemas/sql.py +10 -0
  136. datapilot/utils/__init__.py +0 -0
  137. datapilot/utils/formatting/__init__.py +0 -0
  138. datapilot/utils/formatting/utils.py +59 -0
  139. datapilot/utils/utils.py +317 -0
@@ -0,0 +1,525 @@
1
+ import re
2
+ from enum import Enum
3
+ from typing import Dict
4
+ from typing import List
5
+ from typing import Optional
6
+ from typing import Tuple
7
+ from typing import Union
8
+
9
+ from dbt_artifacts_parser.parser import parse_catalog
10
+ from dbt_artifacts_parser.parser import parse_manifest
11
+
12
+ from datapilot.core.platforms.dbt.constants import BASE
13
+ from datapilot.core.platforms.dbt.constants import FOLDER
14
+ from datapilot.core.platforms.dbt.constants import INTERMEDIATE
15
+ from datapilot.core.platforms.dbt.constants import MART
16
+ from datapilot.core.platforms.dbt.constants import MODEL
17
+ from datapilot.core.platforms.dbt.constants import OTHER
18
+ from datapilot.core.platforms.dbt.constants import STAGING
19
+ from datapilot.core.platforms.dbt.exceptions import AltimateInvalidManifestError
20
+ from datapilot.core.platforms.dbt.factory import DBTFactory
21
+ from datapilot.core.platforms.dbt.schemas.manifest import AltimateManifestExposureNode
22
+ from datapilot.core.platforms.dbt.schemas.manifest import AltimateManifestNode
23
+ from datapilot.core.platforms.dbt.schemas.manifest import AltimateManifestSourceNode
24
+ from datapilot.core.platforms.dbt.schemas.manifest import AltimateManifestTestNode
25
+ from datapilot.core.platforms.dbt.schemas.manifest import Catalog
26
+ from datapilot.core.platforms.dbt.schemas.manifest import Manifest
27
+ from datapilot.exceptions.exceptions import AltimateFileNotFoundError
28
+ from datapilot.exceptions.exceptions import AltimateInvalidJSONError
29
+ from datapilot.utils.utils import extract_dir_name_from_file_path
30
+ from datapilot.utils.utils import extract_folders_in_path
31
+ from datapilot.utils.utils import is_superset_path
32
+ from datapilot.utils.utils import load_json
33
+
34
# Default regular expression for each model type. The patterns are applied
# with ``re.match`` against model names, so they match from the start.
MODEL_TYPE_PATTERNS = {
    STAGING: r"^stg_.*",  # names starting with 'stg_'
    MART: r"^(mrt_|mart_|fct_|dim_).*",  # names starting with 'mrt_', 'mart_', 'fct_' or 'dim_'
    INTERMEDIATE: r"^int_.*",  # names starting with 'int_'
    BASE: r"^base_.*",  # names starting with 'base_'
    # Add other model types with their regex patterns here
}

# Default folder pattern for each model type: by default every type maps to
# itself. Switch to an explicit literal if a folder name ever has to differ
# from its model type.
FOLDER_MAP = {model_type: model_type for model_type in (STAGING, MART, INTERMEDIATE, BASE)}
49
+
50
+
51
class SelectOption(Enum):
    """Kinds of selector that ``parse_argument`` can produce."""

    # The selector names a directory.
    DIRECTORY = "directory"
    # The selector is a bare model name.
    MODEL_NAME = "model_name"
    # The selector is the path of a single ``.sql`` file.
    MODEL_PATH = "model_path"
55
+
56
+
57
def combine_dict(dict1: Dict, dict2: Optional[Dict]) -> Dict:
    """Return a new dict holding ``dict1`` overlaid with ``dict2``.

    :param dict1: Base mapping; it is not modified.
    :param dict2: Overrides applied on top of ``dict1``; ``None`` or an empty
        mapping leaves the base values untouched.
    :return: A freshly created dictionary.
    """
    overrides = dict2 if dict2 else {}
    merged = {**dict1}
    merged.update(overrides)
    return merged
60
+
61
+
62
def load_manifest(manifest_path: str) -> Manifest:
    """Read the JSON file at ``manifest_path`` and parse it as a dbt manifest.

    :param manifest_path: Location of the manifest file.
    :return: The object produced by ``parse_manifest``.
    :raises AltimateFileNotFoundError: If loading raises ``FileNotFoundError``.
    :raises AltimateInvalidJSONError: If loading raises ``ValueError``.
    :raises AltimateInvalidManifestError: If loading fails for any other
        reason, or if ``parse_manifest`` raises ``ValueError``.
    """
    try:
        raw_manifest = load_json(manifest_path)
    except FileNotFoundError as exc:
        raise AltimateFileNotFoundError(f"Manifest file not found: {manifest_path}. Error: {exc}") from exc
    except ValueError as exc:
        raise AltimateInvalidJSONError(f"Invalid manifest file: {manifest_path}. Error: {exc}") from exc
    except Exception as exc:
        # Anything else is reported as an invalid manifest, with a hint for the user.
        raise AltimateInvalidManifestError(
            f"Invalid manifest file: {manifest_path}. Error: {exc}. Please ensure that you are providing the path to a manifest file"
        ) from exc

    try:
        return parse_manifest(raw_manifest)
    except ValueError as exc:
        raise AltimateInvalidManifestError(f"Invalid manifest file: {manifest_path}. Error: {exc}") from exc
80
+
81
+
82
def load_catalog(catalog_path: str) -> Catalog:
    """Read the JSON file at ``catalog_path`` and parse it as a dbt catalog.

    :param catalog_path: Location of the catalog file.
    :return: The object produced by ``parse_catalog``.
    :raises AltimateFileNotFoundError: If loading raises ``FileNotFoundError``.
    :raises AltimateInvalidJSONError: If loading raises ``ValueError``.
    :raises AltimateInvalidManifestError: If ``parse_catalog`` raises
        ``ValueError``. The exception type is kept for backward compatibility
        with existing callers, although the file is a catalog.
    """
    try:
        catalog_dict = load_json(catalog_path)
    except FileNotFoundError as e:
        # The messages name the catalog: they used to mention a manifest file.
        raise AltimateFileNotFoundError(f"Catalog file not found: {catalog_path}. Error: {e}") from e
    except ValueError as e:
        raise AltimateInvalidJSONError(f"Invalid JSON file: {catalog_path}. Error: {e}") from e

    try:
        catalog: Catalog = parse_catalog(catalog_dict)
    except ValueError as e:
        raise AltimateInvalidManifestError(f"Invalid catalog file: {catalog_path}. Error: {e}") from e

    return catalog
96
+
97
+
98
def load_run_results(run_results_path: str) -> Manifest:
    """Placeholder for loading run results; always raises.

    :param run_results_path: Currently unused.
    :raises NotImplementedError: Always.
    """
    raise NotImplementedError()
100
+
101
+
102
+ # TODO: Add tests!
103
def get_table_name_from_source(source: AltimateManifestSourceNode) -> str:
    """Build the dotted table name of a source node.

    :param source: Source node providing ``database``, ``schema_name`` and
        ``identifier``.
    :return: ``database.schema.identifier`` when the database is truthy,
        otherwise ``schema.identifier``.
    """
    schema_qualified = f"{source.schema_name}.{source.identifier}"
    database = source.database
    return f"{database}.{schema_qualified}" if database else schema_qualified
110
+
111
+
112
def classify_model_type_by_name(
    model_name: str,
    model_name_pattern: Optional[Dict[str, str]],
):
    """Return the first model type whose pattern matches ``model_name``.

    :param model_name: Name to classify.
    :param model_name_pattern: Optional overrides merged on top of
        ``MODEL_TYPE_PATTERNS``.
    :return: The matching model type, or ``None`` when nothing matches.
    """
    candidates = combine_dict(MODEL_TYPE_PATTERNS, model_name_pattern)
    # Patterns are tried in dictionary order; the first match wins.
    return next(
        (kind for kind, regex in candidates.items() if re.match(regex, model_name)),
        None,
    )
122
+
123
+
124
def classify_model_type_by_folder(model_path: str, model_folder_pattern: Optional[Dict[str, str]]) -> str:
    """Classify a model by the directory name extracted from its path.

    :param model_path: Path of the model file.
    :param model_folder_pattern: Optional overrides merged on top of
        ``FOLDER_MAP``.
    :return: The first model type whose folder pattern matches the directory
        name, or ``OTHER`` when none does.
    """
    containing_dir = extract_dir_name_from_file_path(model_path)
    candidates = combine_dict(FOLDER_MAP, model_folder_pattern)
    # Patterns are tried in dictionary order; the first match wins.
    return next(
        (kind for kind, regex in candidates.items() if re.match(regex, containing_dir)),
        OTHER,
    )
132
+
133
+
134
+ # TODO: Add tests!
135
def classify_model_type(
    model_name: str,
    folder_path: Optional[str] = None,
    patterns: Optional[Dict[str, Optional[Dict[str, str]]]] = None,
) -> Optional[str]:
    """
    Classify a model by its name, falling back to its folder.

    :param model_name: The name of the model.
    :param folder_path: Optional path of the model; only consulted when the
        name does not match any pattern.
    :param patterns: Optional overrides, keyed by ``MODEL`` (name patterns)
        and ``FOLDER`` (folder patterns). ``None`` means "defaults only".
    :return: The matched model type, or ``OTHER`` if no pattern matches.
    """
    # ``patterns`` defaults to None; normalise it so the lookups below cannot
    # fail with AttributeError.
    patterns = patterns or {}

    model_type = classify_model_type_by_name(model_name, patterns.get(MODEL))
    if model_type:
        return model_type

    if folder_path:
        folder_type = classify_model_type_by_folder(folder_path, patterns.get(FOLDER))
        if folder_type:
            return folder_type

    return OTHER  # if no pattern matches
159
+
160
+
161
def _check_model_naming_convention(
    model_name: str, expected_model_type: str, patterns: Optional[Dict[str, str]]
) -> Tuple[bool, Optional[str]]:
    """Check ``model_name`` against the pattern of its expected model type.

    :param model_name: Name to validate.
    :param expected_model_type: Model type whose pattern should match.
    :param patterns: Optional overrides merged on top of
        ``MODEL_TYPE_PATTERNS``.
    :return: ``(True, None)`` on a match; otherwise ``(False, pattern)``,
        where ``pattern`` is the expected pattern (``None`` if the type has
        no pattern at all).
    """
    expected = combine_dict(MODEL_TYPE_PATTERNS, patterns).get(expected_model_type)
    follows_convention = bool(expected) and re.match(expected, model_name) is not None
    if follows_convention:
        return True, None
    return False, expected
170
+
171
+
172
def get_node_source_name(
    node: AltimateManifestNode,
    sources: Dict[str, AltimateManifestSourceNode],
) -> Optional[str]:
    """Return the source name of the first parent of ``node`` that is a source.

    :param node: Node whose ``depends_on.nodes`` ids are inspected in order.
    :param sources: Known sources keyed by unique id.
    :return: ``source_name`` of the first parent found in ``sources``, or
        ``None`` when the node depends on no known source.
    """
    for node_id in node.depends_on.nodes:
        if node_id in sources:
            return sources[node_id].source_name
    # Made explicit: callers test the result for truthiness.
    return None
179
+
180
+
181
def _check_mart_convention(folder_patterns, directory_name, node_name):
    """Check that a mart model sits in a directory matching the mart pattern.

    :param folder_patterns: Mapping of model type to folder pattern.
    :param directory_name: Name of the directory containing the model.
    :param node_name: Model name, used in the suggested path.
    :return: ``(True, None)`` when the directory matches; otherwise
        ``(False, suggested_path)``.
    """
    mart_pattern = folder_patterns.get(MART, "")
    if re.match(mart_pattern, directory_name) is None:
        return False, f"*/{mart_pattern}/{node_name}.sql"
    return True, None
188
+
189
+
190
+ def _staging_error_message(source_name, node_name, staging_pattern):
191
+ return f"*/{staging_pattern}/{source_name}/{node_name}.sql"
192
+
193
+
194
def _check_staging_convention(folder_path, folder_patterns, directory_name, node, sources):
    """Check that a staging model lives under ``<staging>/<source_name>/``.

    :param folder_path: Path of the model; split into its folders.
    :param folder_patterns: Mapping of model type to folder pattern.
    :param directory_name: Name of the directory containing the model.
    :param node: The staging model node.
    :param sources: Known sources keyed by unique id.
    :return: ``(True, None)`` when the layout is accepted (including when the
        node has no source parent); otherwise ``(False, expected_path)``.
    """
    path_parts = extract_folders_in_path(folder_path)
    source_name = get_node_source_name(node, sources)

    # Without a source parent there is nothing to compare against.
    if not source_name:
        return True, None

    if directory_name != source_name:
        return False, _staging_error_message(source_name, node.name, folder_patterns.get(STAGING, ""))

    staging_pattern = folder_patterns.get(STAGING)
    # The parent folder is only checked when a pattern exists and the path has more than two folders.
    parent_ok = not staging_pattern or len(path_parts) <= 2 or re.match(staging_pattern, path_parts[-2]) is not None
    if parent_ok:
        return True, None
    return False, _staging_error_message(source_name, node.name, staging_pattern)
207
+
208
+
209
def _check_source_folder_convention(source_name, folder_path, patterns: Optional[Dict[str, Dict[str, str]]] = None):
    """Check that a source definition lives under ``<staging>/<source_name>/``.

    :param source_name: Name of the source.
    :param folder_path: Path of the file defining the source.
    :param patterns: Optional overrides; the ``FOLDER`` entry is merged on top
        of ``FOLDER_MAP``. ``None`` means "defaults only".
    :return: ``(True, None)`` when the layout is accepted; otherwise
        ``(False, expected_path)``.
    """
    # The type hint used to be written as the default value (``patterns=Optional[...]``),
    # so omitting the argument crashed on ``.get``. It is now a real optional parameter.
    folder_overrides = (patterns or {}).get(FOLDER)
    folder_patterns = combine_dict(FOLDER_MAP, folder_overrides)
    staging_pattern = folder_patterns.get(STAGING)
    expected_path = f"{staging_pattern}/{source_name}/source.yml"

    directories = extract_folders_in_path(folder_path)
    directory_name = extract_dir_name_from_file_path(folder_path)
    if directory_name != source_name:
        return False, expected_path

    # As in _check_staging_convention, skip the parent-folder check when no
    # staging pattern is configured, rather than passing None to re.match.
    if staging_pattern and len(directories) > 2 and not re.match(staging_pattern, directories[-2]):
        return False, expected_path

    return True, None
220
+
221
+
222
def _check_model_folder_convention(
    model_type: str,
    folder_path: str,
    patterns: Dict[str, Optional[Dict[str, str]]],
    node: AltimateManifestNode,
    sources: Dict[str, AltimateManifestSourceNode],
) -> Tuple[bool, Optional[str]]:
    """Dispatch the folder-convention check that applies to ``model_type``.

    :param model_type: Classified type of the model.
    :param folder_path: Path of the model file.
    :param patterns: Overrides; the ``FOLDER`` entry is merged on top of
        ``FOLDER_MAP``.
    :param node: The model node.
    :param sources: Known sources keyed by unique id.
    :return: Result of the mart or staging check; ``(True, None)`` for every
        other model type.
    """
    folder_patterns = combine_dict(FOLDER_MAP, patterns.get(FOLDER, {}))
    directory_name = extract_dir_name_from_file_path(folder_path)

    if model_type == MART:
        return _check_mart_convention(folder_patterns, directory_name, node.name)
    if model_type == STAGING:
        return _check_staging_convention(folder_path, folder_patterns, directory_name, node, sources)
    # No folder convention is enforced for the remaining model types.
    return True, None
239
+
240
+
241
+ # TODO: Add tests!
242
def get_children_map(nodes: Dict[str, AltimateManifestNode]) -> Dict[str, set]:
    """
    Invert the parent links of a manifest into a parent-to-children map.

    Each node lists its parents in ``depends_on.nodes``; this builds the
    reverse lookup.

    :param nodes: A dictionary of nodes in a manifest, keyed by node id.
    :return: A dictionary mapping each parent id to the set of ids of the
        nodes that depend on it. Ids with no children are absent.
    """
    children_map: Dict[str, set] = {}
    for node_id, node in nodes.items():
        for parent_id in node.depends_on.nodes:
            children_map.setdefault(parent_id, set()).add(node_id)
    return children_map
255
+
256
+
257
+ # TODO: Add tests!
258
# Patterns recognising hard-coded relations after FROM/JOIN. They are compiled
# once at import time. Every pattern is verbose and case-insensitive, and its
# first group is always the FROM/JOIN keyword, which is dropped from the result.
_HARD_CODED_REFERENCE_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        # from_var_1: a var() call with one argument inside double braces
        r"""(?ix)
        (from | join)\s+                 # 1: keyword followed by whitespace
        ({{\s*var\s*\(\s*[\'\"]?)        # 2: opening braces, var, open paren, optional quote
        ([^)\'\"]+)                      # 3: anything except a close paren or quote
        ([\'\"]?\s*)                     # 4: optional quote, optional whitespace
        (\)\s*}})                        # 5: close paren, optional whitespace, closing braces
        """,
        # from_var_2: a var() call with two arguments inside double braces
        r"""(?ix)
        (from | join)\s+                 # 1: keyword followed by whitespace
        ({{\s*var\s*\(\s*[\'\"]?)        # 2: opening braces, var, open paren, optional quote
        ([^)\'\"]+)                      # 3: first argument
        ([\'\"]?\s*)                     # 4: optional quote, optional whitespace
        (,)                              # 5: a comma
        (\s*[\'\"]?)                     # 6: optional whitespace, optional quote
        ([^)\'\"]+)                      # 7: second argument
        ([\'\"]?\s*)                     # 8: optional quote, optional whitespace
        (\)\s*}})                        # 9: close paren, optional whitespace, closing braces
        """,
        # from_table_1: two-part name, each part optionally quoted
        r"""(?ix)
        (from | join)\s+                 # 1: keyword followed by whitespace
        ([\[`\"\']?)                     # 2: optional opening bracket, backtick or quote
        (\w+)                            # 3: first name part
        ([\]`\"\']?)                     # 4: optional closing bracket, backtick or quote
        (\.)                             # 5: a period
        ([\[`\"\']?)                     # 6: optional opening bracket, backtick or quote
        (\w+)                            # 7: second name part
        ([\]`\"\']?)(?=\s|$)             # 8: optional closer, then whitespace or end of string
        """,
        # from_table_2: three-part name, each part optionally quoted
        r"""(?ix)
        (from | join)\s+                 # 1: keyword followed by whitespace
        ([\[`\"\']?)                     # 2: optional opening bracket, backtick or quote
        (\w+)                            # 3: first name part
        ([\]`\"\']?)                     # 4: optional closing bracket, backtick or quote
        (\.)                             # 5: a period
        ([\[`\"\']?)                     # 6: optional opening bracket, backtick or quote
        (\w+)                            # 7: second name part
        ([\]`\"\']?)                     # 8: optional closing bracket, backtick or quote
        (\.)                             # 9: a period
        ([\[`\"\']?)                     # 10: optional opening bracket, backtick or quote
        (\w+)                            # 11: third name part
        ([\]`\"\']?)(?=\s|$)             # 12: optional closer, then whitespace or end of string
        """,
        # from_table_3: a single name that must be quoted or bracketed
        r"""(?ix)
        (from | join)\s+                 # 1: keyword followed by whitespace
        ([\[`\"\'])                      # 2: mandatory opening bracket, backtick or quote
        ([\w]+)                          # 3: at least one word character (spaces are not matched)
        ([\]`\"\'])(?=\s|$)              # 4: mandatory closer, then whitespace or end of string
        """,
    )
)


def get_hard_coded_references(sql_code):
    """
    Find all hard-coded references in the given SQL code.

    A reference is whatever follows a FROM or JOIN keyword and matches one of
    ``_HARD_CODED_REFERENCE_PATTERNS``.

    :param sql_code: A string containing the SQL code to be analyzed. ``None``
        or an empty string yields an empty result.
    :return: A set of unique hard-coded references found in the SQL code, each
        without the leading FROM/JOIN keyword and stripped of surrounding
        whitespace.
    """
    # Guard against missing SQL: a None value would otherwise raise TypeError.
    if not sql_code:
        return set()

    hard_coded_references = set()
    for pattern in _HARD_CODED_REFERENCE_PATTERNS:
        for groups in pattern.findall(sql_code):
            # Drop the keyword group and glue the remaining groups back together.
            hard_coded_references.add("".join(groups[1:]).strip())
    return hard_coded_references
453
+
454
+
455
def parse_argument(argument: str) -> dict:
    """
    Classify a selector string as a model path, a directory, or a model name.

    Parameters:
    - argument (str): The selector to classify. A leading ``path:`` prefix is
      removed from the returned name.

    Returns:
    - dict: ``{"type": SelectOption, "name": str}`` describing the selector.
    """
    prefix = "path:"
    is_sql_file = argument.endswith(".sql")

    # Explicit "path:" selectors are a file when they end in .sql, else a directory.
    if argument.startswith(prefix):
        kind = SelectOption.MODEL_PATH if is_sql_file else SelectOption.DIRECTORY
        return {"type": kind, "name": argument[len(prefix) :]}

    if is_sql_file:
        kind = SelectOption.MODEL_PATH
    elif "/" in argument or "\\" in argument:
        # Any path separator marks the selector as a directory.
        kind = SelectOption.DIRECTORY
    else:
        # Everything else is taken to be a model name.
        kind = SelectOption.MODEL_NAME
    return {"type": kind, "name": argument}
481
+
482
+
483
def add_models_by_type(selected_category: dict, entities: dict, final_models: List[str]):
    """
    Append to ``final_models`` the unique id of every entity that matches the selector.

    Parameters:
    - selected_category (dict): Selector with ``type`` and ``name`` keys, as
      produced by ``parse_argument``.
    - entities (dict): Entities to test; only the values are inspected.
    - final_models (List[str]): Output list, extended in place.
    """
    name_like = (SelectOption.MODEL_NAME, SelectOption.MODEL_PATH)
    for entity in entities.values():
        selection_type = selected_category["type"]
        if selection_type in name_like:
            # Name and path selectors both match on either the name or the file path.
            wanted = selected_category.get("name")
            is_selected = entity.name == wanted or entity.original_file_path == wanted
        elif selection_type == SelectOption.DIRECTORY:
            is_selected = is_superset_path(selected_category["name"], entity.original_file_path)
        else:
            is_selected = False

        if is_selected:
            final_models.append(entity.unique_id)
499
+
500
+
501
def get_models(
    selected_model_list: Optional[List[str]],
    entities: Dict[str, Union[AltimateManifestNode, AltimateManifestExposureNode, AltimateManifestSourceNode, AltimateManifestTestNode]],
) -> List[str]:
    """
    Resolve a list of selector strings into unique entity ids.

    Parameters:
    - selected_model_list (Optional[List[str]]): Selectors to resolve; ``None``
      or an empty list selects nothing.
    - entities (Dict): Entity groups; each value is handed to
      ``add_models_by_type`` as a dictionary of entities.

    Returns:
    - List[str]: The matching unique ids without duplicates, in no particular order.
    """
    matched_ids = []
    for selector in selected_model_list or []:
        category = parse_argument(selector)
        for entity_group in entities.values():
            add_models_by_type(category, entity_group, matched_ids)
    # An entity may match several selectors; report it once.
    return list(set(matched_ids))
521
+
522
+
523
def get_manifest_wrapper(manifest_path: str):
    """Load the manifest at ``manifest_path`` and wrap it via ``DBTFactory``.

    :param manifest_path: Location of the manifest file.
    :return: Whatever ``DBTFactory.get_manifest_wrapper`` returns for the
        loaded manifest.
    """
    return DBTFactory.get_manifest_wrapper(load_manifest(manifest_path))
File without changes
@@ -0,0 +1,18 @@
1
+ from dbt_artifacts_parser.parsers.catalog.catalog_v1 import CatalogV1
2
+
3
+ from datapilot.core.platforms.dbt.wrappers.catalog.wrapper import BaseCatalogWrapper
4
+
5
+
6
class CatalogV1Wrapper(BaseCatalogWrapper):
    """Catalog wrapper around a ``CatalogV1`` object."""

    def __init__(self, catalog: CatalogV1):
        self.catalog = catalog

    def get_schema(self):
        """Map every catalog node and source id to ``{column_name: column_type}``.

        Nodes are processed before sources, so a source entry replaces a node
        entry that has the same id.
        """
        schemas = {}
        for tables in (self.catalog.nodes, self.catalog.sources):
            for unique_id, table in tables.items():
                schemas[unique_id] = {name: column.type for name, column in table.columns.items()}
        return schemas
@@ -0,0 +1,9 @@
1
+ from abc import ABC
2
+ from abc import abstractmethod
3
+ from typing import Dict
4
+
5
+
6
class BaseCatalogWrapper(ABC):
    """Abstract interface for catalog wrappers."""

    @abstractmethod
    def get_schema(self) -> Dict[str, Dict[str, str]]:
        """Return a two-level mapping of strings; subclasses must implement this."""
@@ -0,0 +1,47 @@
1
+ from typing import Dict
2
+ from typing import Type
3
+ from typing import Union
4
+
5
+ from dbt_artifacts_parser.parsers.manifest.manifest_v11 import AnalysisNode
6
+ from dbt_artifacts_parser.parsers.manifest.manifest_v11 import Exposure
7
+ from dbt_artifacts_parser.parsers.manifest.manifest_v11 import GenericTestNode
8
+ from dbt_artifacts_parser.parsers.manifest.manifest_v11 import HookNode
9
+ from dbt_artifacts_parser.parsers.manifest.manifest_v11 import Macro
10
+ from dbt_artifacts_parser.parsers.manifest.manifest_v11 import ModelNode
11
+ from dbt_artifacts_parser.parsers.manifest.manifest_v11 import RPCNode
12
+ from dbt_artifacts_parser.parsers.manifest.manifest_v11 import SeedNode
13
+ from dbt_artifacts_parser.parsers.manifest.manifest_v11 import SingularTestNode
14
+ from dbt_artifacts_parser.parsers.manifest.manifest_v11 import SnapshotNode
15
+ from dbt_artifacts_parser.parsers.manifest.manifest_v11 import SourceDefinition
16
+ from dbt_artifacts_parser.parsers.manifest.manifest_v11 import SqlNode
17
+
18
+ from datapilot.core.platforms.dbt.constants import GENERIC
19
+ from datapilot.core.platforms.dbt.constants import SINGULAR
20
+
21
# Union of the v11 node classes handled as manifest nodes.
ManifestNode = Union[
    AnalysisNode,
    SingularTestNode,
    HookNode,
    ModelNode,
    RPCNode,
    SqlNode,
    GenericTestNode,
    SnapshotNode,
    SeedNode,
]

# Aliases giving the v11 parser classes version-independent names.
SourceNode = SourceDefinition

ExposureNode = Exposure

TestNode = Union[GenericTestNode, SingularTestNode]

MacroNode = Macro

# Maps a test type constant to the list of node classes for that type.
# The values are lists of classes, not single classes, so the annotation
# is ``Dict[str, list]`` rather than ``Dict[str, Type]``.
TEST_TYPE_TO_NODE_MAP: Dict[str, list] = {
    GENERIC: [GenericTestNode],
    SINGULAR: [SingularTestNode],
}


SeedNodeMap = SeedNode