altimate-datapilot-cli 0.0.8 (altimate_datapilot_cli-0.0.8-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. altimate_datapilot_cli-0.0.8.dist-info/AUTHORS.rst +5 -0
  2. altimate_datapilot_cli-0.0.8.dist-info/LICENSE +9 -0
  3. altimate_datapilot_cli-0.0.8.dist-info/METADATA +102 -0
  4. altimate_datapilot_cli-0.0.8.dist-info/RECORD +139 -0
  5. altimate_datapilot_cli-0.0.8.dist-info/WHEEL +5 -0
  6. altimate_datapilot_cli-0.0.8.dist-info/entry_points.txt +4 -0
  7. altimate_datapilot_cli-0.0.8.dist-info/top_level.txt +1 -0
  8. datapilot/__init__.py +1 -0
  9. datapilot/__main__.py +14 -0
  10. datapilot/cli/__init__.py +0 -0
  11. datapilot/cli/main.py +11 -0
  12. datapilot/clients/__init__.py +0 -0
  13. datapilot/clients/altimate/__init__.py +0 -0
  14. datapilot/clients/altimate/client.py +85 -0
  15. datapilot/clients/altimate/utils.py +75 -0
  16. datapilot/config/__init__.py +0 -0
  17. datapilot/config/config.py +16 -0
  18. datapilot/config/utils.py +32 -0
  19. datapilot/core/__init__.py +0 -0
  20. datapilot/core/insights/__init__.py +2 -0
  21. datapilot/core/insights/base/__init__.py +0 -0
  22. datapilot/core/insights/base/insight.py +34 -0
  23. datapilot/core/insights/report.py +16 -0
  24. datapilot/core/insights/schema.py +24 -0
  25. datapilot/core/insights/sql/__init__.py +0 -0
  26. datapilot/core/insights/sql/base/__init__.py +0 -0
  27. datapilot/core/insights/sql/base/insight.py +18 -0
  28. datapilot/core/insights/sql/runtime/__init__.py +0 -0
  29. datapilot/core/insights/sql/static/__init__.py +0 -0
  30. datapilot/core/insights/utils.py +20 -0
  31. datapilot/core/platforms/__init__.py +0 -0
  32. datapilot/core/platforms/dbt/__init__.py +0 -0
  33. datapilot/core/platforms/dbt/cli/__init__.py +0 -0
  34. datapilot/core/platforms/dbt/cli/cli.py +112 -0
  35. datapilot/core/platforms/dbt/constants.py +34 -0
  36. datapilot/core/platforms/dbt/exceptions.py +6 -0
  37. datapilot/core/platforms/dbt/executor.py +157 -0
  38. datapilot/core/platforms/dbt/factory.py +22 -0
  39. datapilot/core/platforms/dbt/formatting.py +45 -0
  40. datapilot/core/platforms/dbt/hooks/__init__.py +0 -0
  41. datapilot/core/platforms/dbt/hooks/executor_hook.py +86 -0
  42. datapilot/core/platforms/dbt/insights/__init__.py +115 -0
  43. datapilot/core/platforms/dbt/insights/base.py +133 -0
  44. datapilot/core/platforms/dbt/insights/checks/__init__.py +0 -0
  45. datapilot/core/platforms/dbt/insights/checks/base.py +26 -0
  46. datapilot/core/platforms/dbt/insights/checks/check_column_desc_are_same.py +105 -0
  47. datapilot/core/platforms/dbt/insights/checks/check_column_name_contract.py +154 -0
  48. datapilot/core/platforms/dbt/insights/checks/check_macro_args_have_desc.py +75 -0
  49. datapilot/core/platforms/dbt/insights/checks/check_macro_has_desc.py +63 -0
  50. datapilot/core/platforms/dbt/insights/checks/check_model_has_all_columns.py +96 -0
  51. datapilot/core/platforms/dbt/insights/checks/check_model_has_labels_keys.py +112 -0
  52. datapilot/core/platforms/dbt/insights/checks/check_model_has_meta_keys.py +108 -0
  53. datapilot/core/platforms/dbt/insights/checks/check_model_has_properties_file.py +64 -0
  54. datapilot/core/platforms/dbt/insights/checks/check_model_has_tests_by_group.py +118 -0
  55. datapilot/core/platforms/dbt/insights/checks/check_model_has_tests_by_name.py +114 -0
  56. datapilot/core/platforms/dbt/insights/checks/check_model_has_tests_by_type.py +119 -0
  57. datapilot/core/platforms/dbt/insights/checks/check_model_materialization_by_childs.py +129 -0
  58. datapilot/core/platforms/dbt/insights/checks/check_model_name_contract.py +132 -0
  59. datapilot/core/platforms/dbt/insights/checks/check_model_parents_and_childs.py +135 -0
  60. datapilot/core/platforms/dbt/insights/checks/check_model_parents_database.py +109 -0
  61. datapilot/core/platforms/dbt/insights/checks/check_model_parents_schema.py +109 -0
  62. datapilot/core/platforms/dbt/insights/checks/check_model_tags.py +87 -0
  63. datapilot/core/platforms/dbt/insights/checks/check_source_childs.py +97 -0
  64. datapilot/core/platforms/dbt/insights/checks/check_source_columns_have_desc.py +96 -0
  65. datapilot/core/platforms/dbt/insights/checks/check_source_has_all_columns.py +103 -0
  66. datapilot/core/platforms/dbt/insights/checks/check_source_has_freshness.py +94 -0
  67. datapilot/core/platforms/dbt/insights/checks/check_source_has_labels_keys.py +110 -0
  68. datapilot/core/platforms/dbt/insights/checks/check_source_has_loader.py +62 -0
  69. datapilot/core/platforms/dbt/insights/checks/check_source_has_meta_keys.py +117 -0
  70. datapilot/core/platforms/dbt/insights/checks/check_source_has_tests.py +82 -0
  71. datapilot/core/platforms/dbt/insights/checks/check_source_has_tests_by_group.py +117 -0
  72. datapilot/core/platforms/dbt/insights/checks/check_source_has_tests_by_name.py +113 -0
  73. datapilot/core/platforms/dbt/insights/checks/check_source_has_tests_by_type.py +119 -0
  74. datapilot/core/platforms/dbt/insights/checks/check_source_table_has_description.py +62 -0
  75. datapilot/core/platforms/dbt/insights/checks/check_source_tags.py +76 -0
  76. datapilot/core/platforms/dbt/insights/dbt_test/__init__.py +0 -0
  77. datapilot/core/platforms/dbt/insights/dbt_test/base.py +23 -0
  78. datapilot/core/platforms/dbt/insights/dbt_test/missing_primary_key_tests.py +130 -0
  79. datapilot/core/platforms/dbt/insights/dbt_test/test_coverage.py +118 -0
  80. datapilot/core/platforms/dbt/insights/governance/__init__.py +0 -0
  81. datapilot/core/platforms/dbt/insights/governance/base.py +23 -0
  82. datapilot/core/platforms/dbt/insights/governance/documentation_on_stale_columns.py +130 -0
  83. datapilot/core/platforms/dbt/insights/governance/exposures_dependent_on_private_models.py +90 -0
  84. datapilot/core/platforms/dbt/insights/governance/public_models_without_contracts.py +89 -0
  85. datapilot/core/platforms/dbt/insights/governance/undocumented_columns.py +148 -0
  86. datapilot/core/platforms/dbt/insights/governance/undocumented_public_models.py +110 -0
  87. datapilot/core/platforms/dbt/insights/modelling/README.md +15 -0
  88. datapilot/core/platforms/dbt/insights/modelling/__init__.py +0 -0
  89. datapilot/core/platforms/dbt/insights/modelling/base.py +31 -0
  90. datapilot/core/platforms/dbt/insights/modelling/direct_join_to_source.py +125 -0
  91. datapilot/core/platforms/dbt/insights/modelling/downstream_models_dependent_on_source.py +113 -0
  92. datapilot/core/platforms/dbt/insights/modelling/duplicate_sources.py +85 -0
  93. datapilot/core/platforms/dbt/insights/modelling/hard_coded_references.py +80 -0
  94. datapilot/core/platforms/dbt/insights/modelling/joining_of_upstream_concepts.py +79 -0
  95. datapilot/core/platforms/dbt/insights/modelling/model_fanout.py +126 -0
  96. datapilot/core/platforms/dbt/insights/modelling/multiple_sources_joined.py +83 -0
  97. datapilot/core/platforms/dbt/insights/modelling/root_model.py +82 -0
  98. datapilot/core/platforms/dbt/insights/modelling/source_fanout.py +102 -0
  99. datapilot/core/platforms/dbt/insights/modelling/staging_model_dependent_on_downstream_models.py +103 -0
  100. datapilot/core/platforms/dbt/insights/modelling/staging_model_dependent_on_staging_models.py +89 -0
  101. datapilot/core/platforms/dbt/insights/modelling/unused_sources.py +59 -0
  102. datapilot/core/platforms/dbt/insights/performance/__init__.py +0 -0
  103. datapilot/core/platforms/dbt/insights/performance/base.py +26 -0
  104. datapilot/core/platforms/dbt/insights/performance/chain_view_linking.py +92 -0
  105. datapilot/core/platforms/dbt/insights/performance/exposure_parent_materializations.py +104 -0
  106. datapilot/core/platforms/dbt/insights/schema.py +72 -0
  107. datapilot/core/platforms/dbt/insights/structure/__init__.py +0 -0
  108. datapilot/core/platforms/dbt/insights/structure/base.py +33 -0
  109. datapilot/core/platforms/dbt/insights/structure/model_directories_structure.py +92 -0
  110. datapilot/core/platforms/dbt/insights/structure/model_naming_conventions.py +97 -0
  111. datapilot/core/platforms/dbt/insights/structure/source_directories_structure.py +80 -0
  112. datapilot/core/platforms/dbt/insights/structure/test_directory_structure.py +74 -0
  113. datapilot/core/platforms/dbt/insights/utils.py +9 -0
  114. datapilot/core/platforms/dbt/schemas/__init__.py +0 -0
  115. datapilot/core/platforms/dbt/schemas/catalog.py +73 -0
  116. datapilot/core/platforms/dbt/schemas/manifest.py +462 -0
  117. datapilot/core/platforms/dbt/utils.py +525 -0
  118. datapilot/core/platforms/dbt/wrappers/__init__.py +0 -0
  119. datapilot/core/platforms/dbt/wrappers/catalog/__init__.py +0 -0
  120. datapilot/core/platforms/dbt/wrappers/catalog/v1/__init__.py +0 -0
  121. datapilot/core/platforms/dbt/wrappers/catalog/v1/wrapper.py +18 -0
  122. datapilot/core/platforms/dbt/wrappers/catalog/wrapper.py +9 -0
  123. datapilot/core/platforms/dbt/wrappers/manifest/__init__.py +0 -0
  124. datapilot/core/platforms/dbt/wrappers/manifest/v11/__init__.py +0 -0
  125. datapilot/core/platforms/dbt/wrappers/manifest/v11/schemas.py +47 -0
  126. datapilot/core/platforms/dbt/wrappers/manifest/v11/wrapper.py +396 -0
  127. datapilot/core/platforms/dbt/wrappers/manifest/wrapper.py +35 -0
  128. datapilot/core/platforms/dbt/wrappers/run_results/__init__.py +0 -0
  129. datapilot/core/platforms/dbt/wrappers/run_results/run_results.py +39 -0
  130. datapilot/exceptions/__init__.py +0 -0
  131. datapilot/exceptions/exceptions.py +10 -0
  132. datapilot/schemas/__init__.py +0 -0
  133. datapilot/schemas/constants.py +5 -0
  134. datapilot/schemas/nodes.py +19 -0
  135. datapilot/schemas/sql.py +10 -0
  136. datapilot/utils/__init__.py +0 -0
  137. datapilot/utils/formatting/__init__.py +0 -0
  138. datapilot/utils/formatting/utils.py +59 -0
  139. datapilot/utils/utils.py +317 -0
datapilot/utils/utils.py (new file)
@@ -0,0 +1,317 @@
+ import json
+ import os
+ import re
+ import subprocess
+ import tempfile
+ import uuid
+ from pathlib import Path
+ from typing import Dict
+ from typing import List
+ from typing import Union
+
+ from dbt_artifacts_parser.parser import parse_catalog
+ from dbt_artifacts_parser.parser import parse_manifest
+
+ from datapilot.config.config import load_config
+ from datapilot.schemas.nodes import ModelNode
+ from datapilot.schemas.nodes import SourceNode
+
+
+ def load_json(file_path: str) -> Dict:
+     try:
+         with Path(file_path).open() as f:
+             return json.load(f)
+     except FileNotFoundError:
+         raise
+     except json.decoder.JSONDecodeError as e:
+         raise ValueError(f"Invalid JSON file: {file_path}") from e
+     except IsADirectoryError as e:
+         raise ValueError(f"Please provide a valid manifest file path. {file_path} is a directory") from e
+
+
+ def extract_dir_name_from_file_path(path: str) -> str:
+     # Return the name of the file's immediate parent directory
+     return Path(path).parent.name
+
+
+ def extract_folders_in_path(path: str) -> list:
+     # Split the path into parts
+     path_parts = path.split(os.path.sep)
+
+     # Exclude the last part if it's a file (has a file extension)
+     if "." in path_parts[-1]:
+         path_parts = path_parts[:-1]
+     path_parts = [part for part in path_parts if part != ""]
+     return path_parts
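A quick illustration of the two path helpers above (illustrative path, assuming a POSIX "/" separator and that the module is importable as datapilot.utils.utils):

    from datapilot.utils.utils import extract_dir_name_from_file_path, extract_folders_in_path

    extract_dir_name_from_file_path("models/staging/stg_orders.sql")  # -> "staging"
    extract_folders_in_path("models/staging/stg_orders.sql")          # -> ["models", "staging"]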
+
+
+ def get_dir_path(path: str) -> Path:
+     """
+     Get the directory path of a file path.
+     For example, if the path is /a/b/c/d.txt, the directory path is /a/b/c
+
+     :param path: the file path
+     :return: the parent directory as a Path object
+     """
+     return Path(path).parent
+
+
+ def is_superset_path(superset_path: str, path: str) -> bool:
+     """
+     Check if the path is a sub-path of the superset path.
+
+     :param superset_path: The superset path
+     :param path: The path to be checked
+     :return: True if the path is a sub-path of the superset path, False otherwise
+     """
+
+     try:
+         Path(path).relative_to(superset_path)
+         return True
+     except ValueError:
+         return False
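is_superset_path is pure path arithmetic, so it needs no filesystem access; for example:

    from datapilot.utils.utils import is_superset_path

    is_superset_path("models/staging", "models/staging/stg_orders.sql")  # True
    is_superset_path("models/marts", "models/staging/stg_orders.sql")    # False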
+
+
+ def get_changed_files(include_untracked=True):
+     command = ["git", "status", "--porcelain"]
+     if include_untracked:
+         command.append("-uall")
+     result = subprocess.run(command, capture_output=True, text=True)  # noqa
+     changed_files = []
+     for line in result.stdout.splitlines():
+         if line.startswith("??") and include_untracked:
+             changed_files.append(line.split()[1])
+         elif line.startswith(("M", "A", "D", "R", " M", " A", " D", " R")):
+             changed_files.append(line.split()[1])
+     return changed_files
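get_changed_files only looks at the two-character status prefix of git status --porcelain output and takes the second whitespace-separated token as the path, so rename entries ("R  old -> new") would yield the old path and paths containing spaces would be truncated. A rough sketch with illustrative entries:

    # porcelain line                       -> collected path
    #  M models/staging/stg_orders.sql     -> "models/staging/stg_orders.sql"   (modified)
    # A  models/marts/dim_customers.sql    -> "models/marts/dim_customers.sql"  (staged addition)
    # ?? models/schema.yml                 -> "models/schema.yml"               (untracked, kept because include_untracked=True)
    changed = get_changed_files(include_untracked=True)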
+
+
+ def get_tmp_dir_path():
+     tmp_dir = Path(tempfile.gettempdir()) / str(uuid.uuid4())
+     tmp_dir.mkdir(parents=True, exist_ok=True)
+     return tmp_dir
+
+
+ def get_column_type(dtype: str) -> str:
+     dtype = dtype.lower()
+     if re.match(r".*int.*", dtype):
+         return "INTEGER"
+     elif re.match(r".*float.*", dtype):
+         return "FLOAT"
+     elif re.match(r".*bool.*", dtype):
+         return "BOOLEAN"
+     elif re.match(r".*timestamp.*", dtype):
+         # checked before "date"/"time" so timestamp types are not swallowed by the broader patterns
+         return "TIMESTAMP"
+     elif re.match(r".*date.*", dtype):
+         return "DATE"
+     elif re.match(r".*time.*", dtype):
+         return "TIME"
+     elif re.match(r".*text.*", dtype):
+         return "TEXT"
+     elif re.match(r".*char.*", dtype):
+         return "TEXT"
+     elif re.match(r".*varchar.*", dtype):
+         return "TEXT"
+     elif re.match(r".*numeric.*", dtype):
+         return "NUMERIC"
+     elif re.match(r".*decimal.*", dtype):
+         return "DECIMAL"
+     elif re.match(r".*double.*", dtype):
+         return "DOUBLE"
+     elif re.match(r".*real.*", dtype):
+         return "REAL"
+     else:
+         return "TEXT"
+
+
+ def get_manifest_model_nodes(manifest: Dict, models: List) -> List[ModelNode]:
+     nodes = []
+     for node in manifest["nodes"].values():
+         if node["name"] in models:
+             if node["resource_type"] == "model" and node["config"]["materialized"] in ["table", "view"]:
+                 nodes.append(
+                     ModelNode(
+                         unique_id=node["unique_id"],
+                         name=node["name"],
+                         resource_type=node["resource_type"],
+                         database=node["database"],
+                         alias=node["alias"],
+                         table_schema=node["schema"],
+                     )
+                 )
+     return nodes
+
+
+ def get_manifest_source_nodes(manifest: Dict, sources: List) -> List[SourceNode]:
+     nodes = []
+     for node in manifest["sources"].values():
+         if node["source_name"] in sources:
+             nodes.append(
+                 SourceNode(
+                     unique_id=node["unique_id"],
+                     name=node["source_name"],
+                     resource_type=node["resource_type"],
+                     table=node["identifier"],
+                     database=node["database"],
+                     table_schema=node["schema"],
+                 )
+             )
+     return nodes
+
+
+ def get_model_tables(models: List[ModelNode]) -> List[str]:
+     tables = []
+     for model in models:
+         tables.append(f"{model.database}.{model.table_schema}.{model.alias}")
+     return tables
+
+
+ def get_source_tables(sources: List[SourceNode]) -> List[str]:
+     tables = []
+     for source in sources:
+         tables.append(f"{source.database}.{source.table_schema}.{source.name}")
+     return tables
+
+
+ def get_table_name(node: Union[ModelNode, SourceNode], node_type: str) -> str:
+     if node_type == "nodes":
+         return f"{node.database}.{node.table_schema}.{node.alias}"
+     return f"{node.database}.{node.table_schema}.{node.name}"
+
+
+ def fill_catalog(table_columns_map: Dict, manifest: Dict, catalog: Dict, nodes: List[Union[ModelNode, SourceNode]], node_type: str) -> Dict:
+     if not nodes:
+         catalog[node_type] = {}
+         return catalog
+
+     catalog[node_type] = {}
+     for node in nodes:
+         columns = {}
+         for column in table_columns_map[node.unique_id]:
+             column_type = get_column_type(column["dtype"])
+             columns[column["column"]] = {
+                 "type": column_type,
+                 "index": len(columns) + 1,
+                 "name": column["column"],
+                 "comment": None,
+             }
+
+         # add an entry per node instead of overwriting the section each iteration
+         catalog[node_type][node.unique_id] = {
+             "metadata": {
+                 "type": "BASE TABLE",
+                 "schema": manifest[node_type][node.unique_id]["schema"],
+                 "name": node.alias if node_type == "nodes" else node.name,
+                 "database": manifest[node_type][node.unique_id]["database"],
+                 "comment": None,
+                 "owner": None,
+             },
+             "columns": columns,
+             "stats": {},
+             "unique_id": node.unique_id,
+         }
+
+     return catalog
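fill_catalog converts the per-node column map produced further down into catalog entries keyed by unique_id; roughly, with made-up identifiers:

    table_columns_map = {"model.jaffle_shop.stg_orders": [{"column": "ORDER_ID", "dtype": "NUMBER"}]}
    # fill_catalog(table_columns_map, manifest, catalog, nodes, "nodes") would then add:
    # catalog["nodes"]["model.jaffle_shop.stg_orders"] == {
    #     "metadata": {"type": "BASE TABLE", "schema": <from manifest>, "name": "stg_orders",
    #                  "database": <from manifest>, "comment": None, "owner": None},
    #     "columns": {"ORDER_ID": {"type": "TEXT", "index": 1, "name": "ORDER_ID", "comment": None}},
    #     "stats": {},
    #     "unique_id": "model.jaffle_shop.stg_orders",
    # }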
+
+
+ def run_macro(macro: str, base_path: str) -> str:
+     dbt_compile = subprocess.run(
+         ["dbt", "compile", "--inline", macro],  # noqa
+         capture_output=True,
+         cwd=base_path,
+         text=True,
+     )
+     return dbt_compile.stdout
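run_macro shells out to dbt compile --inline (available in dbt-core 1.5+) and returns dbt's raw stdout, which still contains log lines around the compiled result; the caller below extracts the interesting part by splitting on dbt's "Compiled inline node is:" marker. A minimal sketch, assuming a configured dbt project at base_path:

    output = run_macro("select {{ 1 + 1 }}", base_path="/path/to/dbt/project")
    compiled = output.split("Compiled inline node is:")[1].strip()  # -> roughly "select 2"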
+
+
+ def generate_partial_manifest_catalog(changed_files, base_path: str = "./"):
+     try:
+         # print(f"Running generate_partial_manifest_catalog for {changed_files}")
+         yaml_files = [
+             f for f in changed_files if Path(f).suffix in [".yml", ".yaml"] and Path(f).name not in ["dbt_project.yml", "profiles.yml"]
+         ]
+         model_stem = [Path(f).stem for f in changed_files if Path(f).suffix in [".sql"]]
+         # print(f"yaml_files: {yaml_files}")
+         # print(f"model_stem: {model_stem}")
+         model_set = set()
+         source_set = set()
+
+         for file in yaml_files:
+             parsed_file = load_config(file)
+             if "models" in parsed_file:
+                 for model in parsed_file["models"]:
+                     model_set.add(model.get("name", ""))
+             if "sources" in parsed_file:
+                 for source in parsed_file["sources"]:
+                     source_set.add(source.get("name", ""))
+
+         for model in model_stem:
+             model_set.add(model)
+
+         models = list(model_set)
+         source_list = list(source_set)
+
+         # print(f"models: {models}")
+         # print(f"sources: {source_list}")
+         subprocess.run(["dbt", "parse"], cwd=base_path, stdout=subprocess.PIPE)  # noqa
+
+         manifest_file = Path(Path(base_path) / "target/manifest.json")
+         with manifest_file.open() as f:
+             manifest = json.load(f)
+
+         nodes = get_manifest_model_nodes(manifest, models)
+         sources = get_manifest_source_nodes(manifest, source_list)
+
+         nodes_data = [{"name": node.name, "resource_type": node.resource_type, "unique_id": node.unique_id, "table": ""} for node in nodes]
+
+         sources_data = [
+             {"name": source.name, "resource_type": source.resource_type, "unique_id": source.unique_id, "table": source.table}
+             for source in sources
+         ]
+
+         nodes_str = ",\n".join(json.dumps(data) for data in nodes_data + sources_data)
+
+         query = (
+             "{% set result = {} %}{% set nodes = ["
+             + nodes_str
+             + '] %}{% for n in nodes %}{% if n["resource_type"] == "source" %}{% set columns = adapter.get_columns_in_relation(source(n["name"], n["table"])) %}{% else %}{% set columns = adapter.get_columns_in_relation(ref(n["name"])) %}{% endif %}{% set new_columns = [] %}{% for column in columns %}{% do new_columns.append({"column": column.name, "dtype": column.dtype}) %}{% endfor %}{% do result.update({n["unique_id"]:new_columns}) %}{% endfor %}{{ tojson(result) }}'
+         )
+
+         dbt_compile_output = run_macro(query, base_path)
+
+         # print(dbt_compile_output)
+
+         compiled_inline_node = dbt_compile_output.split("Compiled inline node is:")[1].strip().replace("'", "").strip()
+
+         table_columns_map = json.loads(compiled_inline_node)
+
+         # table_columns_map maps each selected node's unique_id to a list of
+         # {"column": ..., "dtype": ...} dicts; build a catalog entry per table from it
+
+         catalog = {
+             "metadata": {
+                 "dbt_schema_version": "https://schemas.getdbt.com/dbt/catalog/v1.json",
+                 "dbt_version": "1.7.2",
+                 "generated_at": "2024-03-04T11:13:52.284167Z",
+                 "invocation_id": "e2970ef7-c397-404b-ac5d-63a71a45b628",
+                 "env": {},
+             },
+             "errors": None,
+         }
+
+         catalog = fill_catalog(table_columns_map, manifest, catalog, nodes, "nodes")
+         catalog = fill_catalog(table_columns_map, manifest, catalog, sources, "sources")
+
+         selected_models = [node.unique_id for node in nodes + sources]
+         return selected_models, parse_manifest(manifest), parse_catalog(catalog)
+     except Exception as e:
+         raise Exception("Unable to generate partial manifest and catalog") from e
+
+
+ if __name__ == "__main__":
+     print("Running main")
+     print(generate_partial_manifest_catalog([], "/Users/gaurp/Desktop/manifest.json"))
+     print("Done running main")