quollio-core 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- quollio_core/__init__.py +1 -1
- quollio_core/bricks.py +237 -0
- quollio_core/dbt_projects/databricks/.gitignore +4 -0
- quollio_core/dbt_projects/databricks/README.md +5 -0
- quollio_core/dbt_projects/databricks/analyses/.gitkeep +0 -0
- quollio_core/dbt_projects/databricks/dbt_project.yml +21 -0
- quollio_core/dbt_projects/databricks/macros/.gitkeep +0 -0
- quollio_core/dbt_projects/databricks/models/quollio_lineage_column_level.sql +73 -0
- quollio_core/dbt_projects/databricks/models/quollio_lineage_column_level.yml +14 -0
- quollio_core/dbt_projects/databricks/models/quollio_lineage_table_level.sql +63 -0
- quollio_core/dbt_projects/databricks/models/quollio_lineage_table_level.yml +11 -0
- quollio_core/dbt_projects/databricks/models/sources.yml +84 -0
- quollio_core/dbt_projects/databricks/package-lock.yml +14 -0
- quollio_core/dbt_projects/databricks/packages.yml +13 -0
- quollio_core/dbt_projects/databricks/profiles/profiles_template.yml +14 -0
- quollio_core/dbt_projects/databricks/seeds/.gitkeep +0 -0
- quollio_core/dbt_projects/databricks/snapshots/.gitkeep +0 -0
- quollio_core/dbt_projects/redshift/macros/materialization/divided_view.sql +54 -22
- quollio_core/dbt_projects/redshift/models/quollio_stats_columns.sql +1 -1
- quollio_core/dbt_projects/redshift/package-lock.yml +1 -1
- quollio_core/dbt_projects/seeds/.gitkeep +0 -0
- quollio_core/dbt_projects/snowflake/macros/materialization/divided_view.sql +4 -0
- quollio_core/dbt_projects/snowflake/models/quollio_stats_columns.sql +1 -1
- quollio_core/helper/env_default.py +4 -1
- quollio_core/profilers/databricks.py +196 -0
- quollio_core/profilers/lineage.py +12 -0
- quollio_core/profilers/stats.py +0 -1
- quollio_core/redshift.py +4 -5
- quollio_core/repository/databricks.py +62 -0
- quollio_core/snowflake.py +4 -5
- {quollio_core-0.4.3.dist-info → quollio_core-0.4.5.dist-info}/METADATA +5 -1
- {quollio_core-0.4.3.dist-info → quollio_core-0.4.5.dist-info}/RECORD +34 -15
- {quollio_core-0.4.3.dist-info → quollio_core-0.4.5.dist-info}/LICENSE +0 -0
- {quollio_core-0.4.3.dist-info → quollio_core-0.4.5.dist-info}/WHEEL +0 -0
quollio_core/__init__.py
CHANGED
quollio_core/bricks.py
ADDED
@@ -0,0 +1,237 @@
|
|
1
|
+
import argparse
|
2
|
+
import logging
|
3
|
+
import os
|
4
|
+
|
5
|
+
from quollio_core.helper.core import setup_dbt_profile
|
6
|
+
from quollio_core.helper.env_default import env_default
|
7
|
+
from quollio_core.profilers.databricks import (
|
8
|
+
databricks_column_level_lineage,
|
9
|
+
databricks_column_stats,
|
10
|
+
databricks_table_level_lineage,
|
11
|
+
)
|
12
|
+
from quollio_core.repository import databricks as db
|
13
|
+
from quollio_core.repository import dbt, qdc
|
14
|
+
|
15
|
+
logger = logging.getLogger(__name__)
|
16
|
+
|
17
|
+
|
18
|
+
def build_view(
    conn: db.DatabricksConnectionConfig,
    target_tables: str,
    log_level: str = "info",
) -> None:
    """Build the profiler views on Databricks by running the bundled dbt project.

    Args:
        conn: Databricks connection settings used to render the dbt profile.
        target_tables: dbt ``--select`` expression naming the models to build.
        log_level: Log level forwarded to dbt (default ``"info"``).
    """
    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")

    logger.info("Build profiler views using dbt")
    # set parameters: the dbt project ships inside this package, next to this module.
    dbt_client = dbt.DBTClient()
    current_dir = os.path.dirname(os.path.abspath(__file__))
    project_path = f"{current_dir}/dbt_projects/databricks"
    template_path = f"{current_dir}/dbt_projects/databricks/profiles"
    template_name = "profiles_template.yml"

    # build views using dbt: render profiles.yml from the template with the connection values.
    setup_dbt_profile(connections_json=conn.as_dict(), template_path=template_path, template_name=template_name)
    # FIXME: when executing some of the commands, directory changes due to the library bug.
    # https://github.com/dbt-labs/dbt-core/issues/8997
    dbt_client.invoke(
        cmd="deps",
        project_dir=project_path,
        profile_dir=template_path,
        options=["--no-use-colors", "--log-level", log_level],
    )

    run_options = ["--no-use-colors", "--log-level", log_level, "--select", target_tables]
    dbt_client.invoke(
        cmd="run",
        project_dir=project_path,
        profile_dir=template_path,
        options=run_options,
    )
    return
|
52
|
+
|
53
|
+
|
54
|
+
def load_lineage(
    conn: db.DatabricksConnectionConfig,
    qdc_client: qdc.QDCExternalAPIClient,
    tenant_id: str,
) -> None:
    """Load table- and column-level lineage from the dbt-built views into Quollio.

    Args:
        conn: Databricks connection settings.
        qdc_client: Authenticated Quollio External API client.
        tenant_id: Tenant (company) id the lineage is loaded into.
    """
    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")

    logger.info("Generate Databricks table to table lineage.")
    databricks_table_level_lineage(
        conn=conn, qdc_client=qdc_client, tenant_id=tenant_id, dbt_table_name="quollio_lineage_table_level"
    )

    logger.info("Generate Databricks column to column lineage.")
    databricks_column_level_lineage(
        conn=conn, qdc_client=qdc_client, tenant_id=tenant_id, dbt_table_name="quollio_lineage_column_level"
    )

    logger.info("Lineage data is successfully loaded.")
    return
|
73
|
+
|
74
|
+
|
75
|
+
def load_column_stats(
    conn: db.DatabricksConnectionConfig,
    qdc_client: qdc.QDCExternalAPIClient,
    tenant_id: str,
) -> None:
    """Load column statistics from Databricks monitoring tables into Quollio.

    Args:
        conn: Databricks connection settings.
        qdc_client: Authenticated Quollio External API client.
        tenant_id: Tenant (company) id the stats are loaded into.
    """
    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")

    logger.info("Generate Databricks column stats.")
    databricks_column_stats(
        conn=conn,
        qdc_client=qdc_client,
        tenant_id=tenant_id,
    )

    logger.info("Column stats are successfully loaded.")
    return
|
91
|
+
|
92
|
+
|
93
|
+
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog="Quollio Intelligence Agent for Databricks",
        description="Build views and load lineage and stats to Quollio from Databricks using dbt.",
        epilog="Copyright (c) 2024 Quollio Technologies, Inc.",
    )
    # NOTE: help text previously advertised 'load_sqllineage', which is not in
    # `choices` and would be rejected by argparse; the stale line is removed.
    parser.add_argument(
        "commands",
        choices=["build_view", "load_lineage", "load_stats"],
        type=str,
        nargs="+",
        help="""
        The command to execute.
        'build_view': Build views using dbt,
        'load_lineage': Load lineage data from created views to Quollio,
        'load_stats': Load stats from created views to Quollio,
        """,
    )
    parser.add_argument(
        "--host", type=str, action=env_default("DATABRICKS_HOST"), required=False, help="Host for Databricks workspace"
    )
    parser.add_argument(
        "--http_path",
        type=str,
        action=env_default("DATABRICKS_HTTP_PATH"),
        required=False,
        help="HTTP path for a Databricks compute resource (i.e warehouse)",
    )
    parser.add_argument(
        "--port",
        type=int,
        action=env_default("DATABRICKS_PORT"),
        required=False,
        help="Port for Databricks compute resource",
    )
    parser.add_argument(
        "--databricks_client_secret",
        type=str,
        action=env_default("DATABRICKS_CLIENT_SECRET"),
        required=False,
        help="Secret for the service principal",
    )
    parser.add_argument(
        "--databricks_client_id",
        type=str,
        action=env_default("DATABRICKS_CLIENT_ID"),
        required=False,
        help="Client id for the service principal",
    )
    parser.add_argument(
        "--catalog",
        type=str,
        required=False,
        action=env_default("DATABRICKS_TARGET_CATALOG"),
        help="Target database name where the views are built by dbt",
    )
    parser.add_argument(
        "--schema",
        type=str,
        action=env_default("DATABRICKS_TARGET_SCHEMA"),
        required=False,
        help="Target schema name where the views are built by dbt",
    )
    parser.add_argument(
        "--log_level",
        type=str,
        choices=["debug", "info", "warn", "error", "none"],
        action=env_default("LOG_LEVEL"),
        required=False,
        help="The log level for dbt commands. Default value is info",
    )
    parser.add_argument(
        "--api_url",
        type=str,
        action=env_default("QDC_API_URL"),
        required=False,
        help="The base URL of Quollio External API",
    )
    parser.add_argument(
        "--client_id",
        type=str,
        action=env_default("QDC_CLIENT_ID"),
        required=False,
        help="The client id that is created on Quollio console to let clients access Quollio External API",
    )
    parser.add_argument(
        "--client_secret",
        type=str,
        action=env_default("QDC_CLIENT_SECRET"),
        required=False,
        help="The client secrete that is created on Quollio console to let clients access Quollio External API",
    )
    parser.add_argument(
        "--tenant_id",
        type=str,
        action=env_default("TENANT_ID"),
        required=False,
        help="The tenant id (company id) where the lineage and stats are loaded",
    )
    # NOTE(review): choice "quollio_lineage_view_level" has no matching dbt model
    # in this package (models are table/column level) — confirm against the dbt project.
    parser.add_argument(
        "--target_tables",
        type=str,
        nargs="*",
        choices=["quollio_lineage_table_level", "quollio_lineage_view_level"],
        action=env_default("DATABRICKS_TARGET_TABLES"),
        required=False,
        help="Target tables you want to create with dbt module. \
             You need to specify this parameter if you want to specify tables, not all ones. \
             Please specify table name with blank delimiter like tableA tableB \
             if you want to create two or more tables",
    )

    args = parser.parse_args()

    conn = db.DatabricksConnectionConfig(
        host=args.host,
        http_path=args.http_path,
        client_id=args.databricks_client_id,
        client_secret=args.databricks_client_secret,
        catalog=args.catalog,
        schema=args.schema,
    )

    if len(args.commands) == 0:
        raise ValueError("No command is provided")

    if "build_view" in args.commands:
        build_view(
            conn=conn,
            target_tables=args.target_tables,
            log_level=args.log_level,
        )

    if "load_lineage" in args.commands:
        qdc_client = qdc.QDCExternalAPIClient(
            base_url=args.api_url, client_id=args.client_id, client_secret=args.client_secret
        )
        load_lineage(conn=conn, qdc_client=qdc_client, tenant_id=args.tenant_id)

    if "load_stats" in args.commands:
        qdc_client = qdc.QDCExternalAPIClient(
            base_url=args.api_url, client_id=args.client_id, client_secret=args.client_secret
        )
        # Call the load_column_stats wrapper (previously databricks_column_stats was
        # invoked directly, bypassing the wrapper's logging setup and leaving it dead code).
        load_column_stats(conn=conn, qdc_client=qdc_client, tenant_id=args.tenant_id)
|
@@ -0,0 +1,5 @@
|
|
1
|
+
### Quollio Intelligence Agent Support For Databricks
|
2
|
+
Notable Files:
|
3
|
+
1. [quollio_lineage_table_level.sql](models/quollio_lineage_table_level.sql) - Generates table lineage data from Databricks system tables.
|
4
|
+
2. [quollio_lineage_column_level.sql](models/quollio_lineage_column_level.sql) - Generates column lineage data from Databricks system tables.
|
5
|
+
3. [sources.yml](models/sources.yml) - References sources in the Databricks system catalog.
|
File without changes
|
@@ -0,0 +1,21 @@
|
|
1
|
+
name: 'quollio_intelligence_databricks'
|
2
|
+
version: '1.0.0'
|
3
|
+
config-version: 2
|
4
|
+
|
5
|
+
profile: 'quollio_intelligence_databricks'
|
6
|
+
|
7
|
+
model-paths: ["models"]
|
8
|
+
analysis-paths: ["analyses"]
|
9
|
+
test-paths: ["tests"]
|
10
|
+
seed-paths: ["seeds"]
|
11
|
+
macro-paths: ["macros"]
|
12
|
+
snapshot-paths: ["snapshots"]
|
13
|
+
|
14
|
+
clean-targets:
|
15
|
+
- "target"
|
16
|
+
- "dbt_packages"
|
17
|
+
|
18
|
+
models:
|
19
|
+
+dbt-osmosis: "{model}.yml"
|
20
|
+
# Databricks automatically enables grants on SQL endpoints
|
21
|
+
# https://docs.getdbt.com/reference/resource-configs/grants
|
File without changes
|
@@ -0,0 +1,73 @@
|
|
1
|
+
-- Gets full column-level lineage from Databricks system tables.
-- FIX: the `existing_columns` and `downstream_column_exists` CTEs were missing
-- the mandatory `AS` keyword, which is a syntax error in Databricks SQL.
WITH columns_lineage_history AS (
    SELECT
        -- The databricks columns table does not have a full table name, create with CONCAT()
        source_table_full_name AS upstream_table,
        target_table_full_name AS downstream_table,
        source_column_name AS upstream_column,
        target_column_name AS downstream_column,
        event_time,
        RANK() OVER (
            PARTITION BY target_table_full_name
            ORDER BY
                event_time DESC
        ) AS rank
    FROM
        {{ source('access','column_lineage') }}
    WHERE
        source_table_full_name IS NOT NULL
        AND target_table_full_name IS NOT NULL
        AND source_table_full_name NOT LIKE "%quollio%"
        AND target_table_full_name NOT LIKE "%quollio%"
),
-- Gets list of existing columns in catalogs
existing_columns AS (
    SELECT
        CONCAT(table_catalog, '.', table_schema, '.', table_name) AS table_full_name,
        column_name
    FROM
        {{ source('inf_sch','columns') }}
),

-- Checks if the downstream columns exist and keeps only the latest lineage event.
downstream_column_exists AS (
    SELECT
        upstream_table AS UPSTREAM_TABLE_NAME,
        upstream_column AS UPSTREAM_COLUMN_NAME,
        downstream_table AS DOWNSTREAM_TABLE_NAME,
        downstream_column AS DOWNSTREAM_COLUMN_NAME,
        event_time
    FROM
        columns_lineage_history clh
        INNER JOIN existing_columns ec ON clh.downstream_table = ec.table_full_name
        AND clh.downstream_column = ec.column_name
    WHERE
        rank = 1
    GROUP BY UPSTREAM_TABLE, UPSTREAM_COLUMN, DOWNSTREAM_TABLE, DOWNSTREAM_COLUMN, EVENT_TIME
),

-- Aggregates upstream columns per downstream column into a struct set.
aggregated_column_lineage AS (
    SELECT
        downstream_table_name,
        downstream_column_name,
        collect_set(
            named_struct(
                'upstream_table_name', upstream_table_name,
                'upstream_column_name', upstream_column_name
            )
        ) AS upstream_columns
    FROM
        downstream_column_exists
    GROUP BY
        downstream_table_name,
        downstream_column_name
)

SELECT
    downstream_table_name AS DOWNSTREAM_TABLE_NAME,
    downstream_column_name AS DOWNSTREAM_COLUMN_NAME,
    to_json(upstream_columns) AS UPSTREAM_COLUMNS
FROM
    aggregated_column_lineage
|
73
|
+
|
@@ -0,0 +1,14 @@
|
|
1
|
+
version: 2

# FIX: dbt property files use the `models:` key; the original `model:` key is
# not recognized by dbt, so these column descriptions were silently ignored.
models:
  - name: quollio_lineage_column_level
    columns:
      - name: UPSTREAM_COLUMNS
        description: 'String column with all upstream columns in JSON format'
        type: string
      - name: DOWNSTREAM_TABLE_NAME
        description: 'Full downstream table name in <catalog>.<schema>.<table> format'
        type: string
      - name: DOWNSTREAM_COLUMN_NAME
        description: 'Downstream column name'
        type: string
|
@@ -0,0 +1,63 @@
|
|
1
|
+
-- Gets full table lineage from Databricks system tables.
-- FIX: the `existing_tables` and `downstream_table_exists` CTEs were missing
-- the mandatory `AS` keyword, which is a syntax error in Databricks SQL.
WITH table_lineage_history AS (
    SELECT
        source_table_full_name AS upstream_table,
        target_table_full_name AS downstream_table,
        target_type,
        event_time,
        RANK() OVER (
            PARTITION BY target_table_full_name
            ORDER BY
                event_time DESC
        ) AS rank
    FROM
        {{ source('access','table_lineage') }}
    WHERE
        source_table_full_name IS NOT NULL
        AND target_table_full_name IS NOT NULL
        AND source_table_full_name NOT LIKE "%quollio%"
        AND target_table_full_name NOT LIKE "%quollio%"
),
-- Gets list of existing tables in catalogs
existing_tables AS (
    SELECT
        CONCAT(table_catalog, '.', table_schema, '.', table_name) AS table_full_name
    FROM
        {{ source('inf_sch','tables') }}
),

-- Checks if the downstream tables exist and keeps only the latest lineage event.
downstream_table_exists AS (
    SELECT
        upstream_table,
        downstream_table,
        target_type,
        event_time
    FROM
        table_lineage_history tlh
        INNER JOIN existing_tables et ON tlh.downstream_table = et.table_full_name
    WHERE
        rank = 1
    GROUP BY upstream_table, downstream_table, target_type, event_time
),

-- Aggregates upstream tables per downstream table into a struct set.
aggregated_table_lineage AS (
    SELECT
        downstream_table,
        collect_set(
            named_struct(
                'upstream_object_name', upstream_table
            )
        ) AS upstream_tables
    FROM
        downstream_table_exists
    GROUP BY
        downstream_table
)
SELECT
    downstream_table AS DOWNSTREAM_TABLE_NAME,
    to_json(upstream_tables) AS UPSTREAM_TABLES
FROM
    aggregated_table_lineage
|
63
|
+
|
@@ -0,0 +1,11 @@
|
|
1
|
+
version: 2

# FIX 1: dbt property files use the `models:` key; `model:` is unrecognized.
# FIX 2: this file documents the table-level lineage model (its columns are
# UPSTREAM_TABLES / DOWNSTREAM_TABLE_NAME), but the original named the
# column-level model here — an apparent copy-paste error.
models:
  - name: quollio_lineage_table_level
    columns:
      - name: UPSTREAM_TABLES
        description: 'String column with all upstream tables in JSON format'
        type: string
      - name: DOWNSTREAM_TABLE_NAME
        description: 'Full downstream table name in <catalog>.<schema>.<table> format'
        type: string
|
@@ -0,0 +1,84 @@
|
|
1
|
+
version: 2
|
2
|
+
|
3
|
+
sources:
|
4
|
+
- name: access
|
5
|
+
database: system
|
6
|
+
schema: access
|
7
|
+
tables:
|
8
|
+
- name: table_lineage
|
9
|
+
description: Describes table level lineage
|
10
|
+
columns:
|
11
|
+
- name: source_table_full_name
|
12
|
+
description: ''
|
13
|
+
type: string
|
14
|
+
- name: target_table_full_name
|
15
|
+
description: ''
|
16
|
+
type: string
|
17
|
+
- name: target_type
|
18
|
+
description: ''
|
19
|
+
type: string
|
20
|
+
- name: event_time
|
21
|
+
description: ''
|
22
|
+
type: timestamp
|
23
|
+
|
24
|
+
- name: column_lineage
|
25
|
+
description: Describes column level lineage
|
26
|
+
columns:
|
27
|
+
- name: source_table_full_name
|
28
|
+
description: ''
|
29
|
+
type: string
|
30
|
+
- name: target_table_full_name
|
31
|
+
description: ''
|
32
|
+
type: string
|
33
|
+
- name: event_time
|
34
|
+
description: ''
|
35
|
+
type: timestamp
|
36
|
+
- name: source_column_name
|
37
|
+
description: ''
|
38
|
+
type: string
|
39
|
+
- name: target_column_name
|
40
|
+
description: ''
|
41
|
+
type: string
|
42
|
+
|
43
|
+
- name: inf_sch
|
44
|
+
database: system
|
45
|
+
schema: information_schema
|
46
|
+
tables:
|
47
|
+
- name: tables
|
48
|
+
description: Lists existing tables (i.e., not deleted).
|
49
|
+
columns:
|
50
|
+
- name: table_catalog
|
51
|
+
description: ''
|
52
|
+
type: string
|
53
|
+
- name: table_schema
|
54
|
+
description: ''
|
55
|
+
type: string
|
56
|
+
- name: table_name
|
57
|
+
description: ''
|
58
|
+
type: string
|
59
|
+
|
60
|
+
- name: views
|
61
|
+
description: Lists existing views (i.e., not deleted). Views are treated as tables.
|
62
|
+
columns:
|
63
|
+
- name: table_catalog
|
64
|
+
description: ''
|
65
|
+
type: string
|
66
|
+
- name: table_schema
|
67
|
+
description: ''
|
68
|
+
type: string
|
69
|
+
- name: table_name
|
70
|
+
description: ''
|
71
|
+
type: string
|
72
|
+
|
73
|
+
- name: columns
|
74
|
+
description: ''
|
75
|
+
columns:
|
76
|
+
- name: table_catalog
|
77
|
+
description: ''
|
78
|
+
type: string
|
79
|
+
- name: table_schema
|
80
|
+
description: ''
|
81
|
+
type: string
|
82
|
+
- name: table_name
|
83
|
+
description: ''
|
84
|
+
type: string
|
@@ -0,0 +1,14 @@
|
|
1
|
+
packages:
|
2
|
+
- package: dbt-labs/dbt_utils
|
3
|
+
version: 1.1.1
|
4
|
+
- package: dbt-labs/spark_utils
|
5
|
+
version: 0.3.0
|
6
|
+
- package: dbt-labs/codegen
|
7
|
+
version: 0.12.1
|
8
|
+
- package: dbt-labs/dbt_external_tables
|
9
|
+
version: 0.8.7
|
10
|
+
- package: dbt-labs/dbt_project_evaluator
|
11
|
+
version: 0.8.1
|
12
|
+
- package: brooklyn-data/dbt_artifacts
|
13
|
+
version: 2.6.2
|
14
|
+
sha1_hash: cbb324267dbf6c6fb7de11b162e4fbafd1e32a9c
|
@@ -0,0 +1,13 @@
|
|
1
|
+
packages:
|
2
|
+
- package: dbt-labs/dbt_utils
|
3
|
+
version: [">=0.0.0", "<2.0.0"]
|
4
|
+
- package: dbt-labs/spark_utils
|
5
|
+
version: [">=0.0.0", "<1.0.0"]
|
6
|
+
- package: dbt-labs/codegen
|
7
|
+
version: [">=0.0.0", "<1.0.0"]
|
8
|
+
- package: dbt-labs/dbt_external_tables
|
9
|
+
version: [">=0.0.0", "<1.0.0"]
|
10
|
+
- package: dbt-labs/dbt_project_evaluator
|
11
|
+
version: [">=0.0.0", "<1.0.0"]
|
12
|
+
- package: brooklyn-data/dbt_artifacts
|
13
|
+
version: [">=2.0.0", "<3.0.0"]
|
@@ -0,0 +1,14 @@
|
|
1
|
+
quollio_intelligence_databricks:
|
2
|
+
target: project
|
3
|
+
outputs:
|
4
|
+
project:
|
5
|
+
type: databricks
|
6
|
+
host: {{ host }}
|
7
|
+
http_path: {{ http_path }}
|
8
|
+
catalog: {{ catalog }}
|
9
|
+
schema: {{ schema }}
|
10
|
+
auth_type: oauth
|
11
|
+
client_id: {{ client_id }}
|
12
|
+
client_secret: {{ client_secret }}
|
13
|
+
databricks_port: {{ databricks_port }}
|
14
|
+
|
File without changes
|
File without changes
|
@@ -2,6 +2,7 @@
|
|
2
2
|
{%- set identifier = model['alias'] %}
|
3
3
|
{%- set target_relations = [] %}
|
4
4
|
{%- set chunk = config.get('chunk') %}
|
5
|
+
{%- set grant_config = config.get('grants') %}
|
5
6
|
|
6
7
|
{{ run_hooks(pre_hooks, inside_transaction=False) }}
|
7
8
|
-- `BEGIN` happens here:
|
@@ -22,29 +23,57 @@ SELECT * FROM {{ ref('quollio_stats_profiling_columns') }} WHERE table_name no
|
|
22
23
|
{%- for i in range(0, records|length, chunk) -%}
|
23
24
|
{%- set build_sql %}
|
24
25
|
{%- for record in records[i: i+chunk] -%}
|
25
|
-
{%- if not loop.first
|
26
|
+
{%- if not loop.first -%}UNION{% endif %}
|
26
27
|
SELECT
|
27
|
-
|
28
|
-
|
29
|
-
,
|
30
|
-
,
|
31
|
-
,
|
32
|
-
,
|
33
|
-
,
|
34
|
-
|
35
|
-
,
|
36
|
-
,
|
37
|
-
|
38
|
-
,
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
{
|
46
|
-
|
47
|
-
|
28
|
+
main.db_name
|
29
|
+
, main.schema_name
|
30
|
+
, main.table_name
|
31
|
+
, main.column_name
|
32
|
+
, main.max_value
|
33
|
+
, main.min_value
|
34
|
+
, main.null_count
|
35
|
+
, main.cardinality
|
36
|
+
, main.avg_value
|
37
|
+
, main.median_value
|
38
|
+
, mode.mode_value
|
39
|
+
, main.stddev_value
|
40
|
+
FROM
|
41
|
+
(
|
42
|
+
SELECT
|
43
|
+
DISTINCT
|
44
|
+
'{{record[0]}}'::varchar as db_name
|
45
|
+
, '{{record[1]}}'::varchar as schema_name
|
46
|
+
, '{{record[2]}}'::varchar as table_name
|
47
|
+
, '{{record[3]}}'::varchar as column_name
|
48
|
+
, {% if var("skip_heavy") == false and record[5] == true %}cast(max("{{record[3]}}") as varchar){% else %}null::varchar{% endif %} AS max_value
|
49
|
+
, {% if var("skip_heavy") == false and record[5] == true %}cast(min("{{record[3]}}") as varchar){% else %}null::varchar{% endif %} AS min_value
|
50
|
+
-- requires full table scan
|
51
|
+
, {% if var("skip_heavy") == false %}cast(SUM(NVL2("{{record[3]}}", 0, 1)) as integer){% else %}null::integer{% endif %} AS null_count
|
52
|
+
, APPROXIMATE COUNT(DISTINCT "{{record[3]}}") AS cardinality
|
53
|
+
-- requires full table scan
|
54
|
+
, {% if var("skip_heavy") == false and record[5] == true %}cast(avg("{{record[3]}}")as varchar){% else %}null::varchar{% endif %} AS avg_value
|
55
|
+
, {% if var("skip_heavy") == false and record[5] == true %}cast(median("{{record[3]}}") as varchar){% else %}null::varchar{% endif %} AS median_value
|
56
|
+
-- requires full table scan
|
57
|
+
, {% if record[5] == true %}cast(STDDEV_SAMP("{{record[3]}}") as integer){% else %}null::integer{% endif %} AS stddev_value
|
58
|
+
FROM {{ record[0] }}.{{ record[1] }}.{{ record[2] }}
|
59
|
+
) main, (
|
60
|
+
{%- if var("skip_heavy") == false and record[4] == false %}
|
61
|
+
SELECT
|
62
|
+
cast("{{record[3]}}" as varchar) mode_value
|
63
|
+
FROM (
|
64
|
+
SELECT
|
65
|
+
DISTINCT
|
66
|
+
"{{record[3]}}"
|
67
|
+
, ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC) AS row_num
|
68
|
+
FROM {{ record[0] }}.{{ record[1] }}.{{ record[2] }}
|
69
|
+
GROUP BY
|
70
|
+
"{{record[3]}}"
|
71
|
+
)
|
72
|
+
WHERE
|
73
|
+
row_num = 1
|
74
|
+
{% else %}
|
75
|
+
SELECT null as mode_value {%- endif -%}
|
76
|
+
) mode
|
48
77
|
{% endfor -%}
|
49
78
|
{%- endset %}
|
50
79
|
-- create a view with a index as suffix
|
@@ -54,6 +83,9 @@ SELECT * FROM {{ ref('quollio_stats_profiling_columns') }} WHERE table_name no
|
|
54
83
|
{% call statement("main") %}
|
55
84
|
{{ get_replace_view_sql(target_relation, build_sql) }}
|
56
85
|
{% endcall %}
|
86
|
+
{%- set full_refresh_mode = (should_full_refresh()) -%}
|
87
|
+
{%- set should_revoke = should_revoke(target_relation, full_refresh_mode) %}
|
88
|
+
{%- do apply_grants(target_relation, grant_config, should_revoke) %}
|
57
89
|
{%- set target_relations = target_relations.append(target_relation) %}
|
58
90
|
{%- endfor -%}
|
59
91
|
|
File without changes
|
@@ -2,6 +2,7 @@
|
|
2
2
|
{%- set identifier = model['alias'] %}
|
3
3
|
{%- set target_relations = [] %}
|
4
4
|
{%- set chunk = config.get('chunk') %}
|
5
|
+
{%- set grant_config = config.get('grants') %}
|
5
6
|
|
6
7
|
{{ run_hooks(pre_hooks, inside_transaction=False) }}
|
7
8
|
-- `BEGIN` happens here:
|
@@ -46,6 +47,9 @@ SELECT * FROM {{ ref('quollio_stats_profiling_columns') }} WHERE NOT startswit
|
|
46
47
|
{% call statement("main") %}
|
47
48
|
{{ get_create_view_as_sql(target_relation, build_sql) }}
|
48
49
|
{% endcall %}
|
50
|
+
{%- set full_refresh_mode = (should_full_refresh()) -%}
|
51
|
+
{%- set should_revoke = should_revoke(target_relation, full_refresh_mode) %}
|
52
|
+
{%- do apply_grants(target_relation, grant_config, should_revoke) %}
|
49
53
|
{%- set target_relations = target_relations.append(target_relation) %}
|
50
54
|
{%- endfor -%}
|
51
55
|
|
@@ -16,7 +16,10 @@ class EnvDefault(argparse.Action):
|
|
16
16
|
def __init__(self, envvar, required=True, default=None, **kwargs):
|
17
17
|
# override values if envvar exists
|
18
18
|
if envvar in os.environ:
|
19
|
-
|
19
|
+
if kwargs.get("nargs", None) is None:
|
20
|
+
default = os.environ[envvar]
|
21
|
+
else:
|
22
|
+
default = os.environ[envvar].split(" ")
|
20
23
|
if required and default:
|
21
24
|
required = False
|
22
25
|
super(EnvDefault, self).__init__(default=default, required=required, **kwargs)
|
@@ -0,0 +1,196 @@
|
|
1
|
+
import logging
|
2
|
+
from typing import Dict, List
|
3
|
+
|
4
|
+
from quollio_core.profilers.lineage import (
|
5
|
+
gen_column_lineage_payload,
|
6
|
+
gen_table_lineage_payload,
|
7
|
+
parse_databricks_table_lineage,
|
8
|
+
)
|
9
|
+
from quollio_core.profilers.stats import gen_table_stats_payload
|
10
|
+
from quollio_core.repository import databricks, qdc
|
11
|
+
|
12
|
+
logger = logging.getLogger(__name__)
|
13
|
+
|
14
|
+
|
15
|
+
def databricks_table_level_lineage(
    conn: databricks.DatabricksConnectionConfig,
    qdc_client: qdc.QDCExternalAPIClient,
    tenant_id: str,
    dbt_table_name: str = "quollio_lineage_table_level",
) -> None:
    """Read table-level lineage from the dbt-built view and push it to Quollio.

    Args:
        conn: Databricks connection settings; `catalog`/`schema` locate the dbt view.
        qdc_client: Authenticated Quollio External API client.
        tenant_id: Tenant (company) id used to build lineage global ids.
        dbt_table_name: Name of the dbt-built table-level lineage view.
    """
    # FIX: was `level=logging.info` (the function object); logging's level check
    # accepts only int/str levels and raises TypeError otherwise. Use the constant.
    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
    with databricks.DatabricksQueryExecutor(config=conn) as databricks_executor:
        results = databricks_executor.get_query_results(
            query=f"""
            SELECT
                DOWNSTREAM_TABLE_NAME,
                UPSTREAM_TABLES
            FROM {conn.catalog}.{conn.schema}.{dbt_table_name}
            """
        )
    tables = parse_databricks_table_lineage(results)
    update_table_lineage_inputs = gen_table_lineage_payload(
        tenant_id=tenant_id,
        endpoint=conn.host,
        tables=tables,
    )

    req_count = 0
    for update_table_lineage_input in update_table_lineage_inputs:
        logger.info(
            "Generating table lineage. downstream: %s -> %s-> %s",
            update_table_lineage_input.downstream_database_name,
            update_table_lineage_input.downstream_schema_name,
            update_table_lineage_input.downstream_table_name,
        )
        status_code = qdc_client.update_lineage_by_id(
            global_id=update_table_lineage_input.downstream_global_id,
            payload=update_table_lineage_input.upstreams.as_dict(),
        )
        if status_code == 200:
            req_count += 1
    logger.info("Generating table lineage is finished. %s lineages are ingested.", req_count)
    return
|
54
|
+
|
55
|
+
|
56
|
+
def databricks_column_level_lineage(
    conn: databricks.DatabricksConnectionConfig,
    qdc_client: qdc.QDCExternalAPIClient,
    tenant_id: str,
    dbt_table_name: str = "quollio_lineage_column_level",
) -> None:
    """Read column-level lineage from the dbt-built view and push it to Quollio.

    Args:
        conn: Databricks connection settings; `catalog`/`schema` locate the dbt view.
        qdc_client: Authenticated Quollio External API client.
        tenant_id: Tenant (company) id used to build lineage global ids.
        dbt_table_name: Name of the dbt-built column-level lineage view.
    """
    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
    with databricks.DatabricksQueryExecutor(config=conn) as databricks_executor:
        results = databricks_executor.get_query_results(
            query=f"""
            SELECT
                *
            FROM
                {conn.catalog}.{conn.schema}.{dbt_table_name}
            """
        )

    update_column_lineage_inputs = gen_column_lineage_payload(
        tenant_id=tenant_id,
        endpoint=conn.host,
        columns=results,
    )

    # Count successful API ingestions only; failures are skipped silently here.
    req_count = 0
    for update_column_lineage_input in update_column_lineage_inputs:
        logger.info(
            "Generating column lineage. downstream: %s -> %s -> %s -> %s",
            update_column_lineage_input.downstream_database_name,
            update_column_lineage_input.downstream_schema_name,
            update_column_lineage_input.downstream_table_name,
            update_column_lineage_input.downstream_column_name,
        )
        status_code = qdc_client.update_lineage_by_id(
            global_id=update_column_lineage_input.downstream_global_id,
            payload=update_column_lineage_input.upstreams.as_dict(),
        )
        if status_code == 200:
            req_count += 1
    logger.info(
        "Generating column lineage is finished. %s lineages are ingested.",
        req_count,
    )
    return
|
99
|
+
|
100
|
+
|
101
|
+
def _get_monitoring_tables(
    conn: databricks.DatabricksConnectionConfig, monitoring_table_id: str = "_profile_metrics"
) -> List[Dict[str, str]]:
    """List monitoring profile-metric tables from system.information_schema.

    Args:
        conn: Databricks connection settings.
        monitoring_table_id: Table-name suffix identifying profile-metric tables.

    Returns:
        A list of dicts with keys table_catalog, table_schema, table_name and
        table_fqdn (fully-qualified name), or an empty list when none match.
    """
    tables = []
    # NOTE(review): monitoring_table_id is interpolated into a LIKE pattern;
    # assumed to be a trusted constant, not user input.
    query = f"""
    SELECT
        table_catalog,
        table_schema,
        table_name,
        CONCAT(table_catalog, '.', table_schema, '.', table_name) AS table_fqdn
    FROM
        system.information_schema.tables
    WHERE table_name LIKE "%{monitoring_table_id}"
    """
    with databricks.DatabricksQueryExecutor(config=conn) as databricks_executor:
        tables = databricks_executor.get_query_results(query)
    if len(tables) > 0:
        logger.info("Found %s monitoring tables.", len(tables))
        return tables
    else:
        logger.info("No monitoring tables found.")
        return []
|
123
|
+
|
124
|
+
|
125
|
+
def _get_column_stats(
    conn: databricks.DatabricksConnectionConfig, monitoring_table_id: str = "_profile_metrics"
) -> List[Dict[str, str]]:
    """Collect the latest column statistics from each Lakehouse Monitoring profile table.

    For every monitoring table, queries the most recent window and picks the most
    frequent item per column as the mode value.

    Args:
        conn: Databricks connection settings.
        monitoring_table_id: Suffix that identifies monitoring tables by name.

    Returns:
        One result batch (list of row dicts) per monitored table; empty list when
        no monitoring tables exist.

    Raises:
        ValueError: If a monitoring table FQDN does not split into exactly
            catalog.schema.table after removing the suffix.
    """
    tables = _get_monitoring_tables(conn, monitoring_table_id)
    if not tables:
        return []
    stats = []
    # Open a single connection for all per-table queries instead of one per table.
    with databricks.DatabricksQueryExecutor(config=conn) as databricks_executor:
        for table in tables:
            # BUGFIX: strip the *configured* suffix, not a hard-coded
            # "_profile_metrics", so a custom monitoring_table_id still yields
            # the monitored table's FQDN.
            monitored_table = table["table_fqdn"].removesuffix(monitoring_table_id).split(".")
            if len(monitored_table) != 3:
                raise ValueError(f"Invalid table name: {table['table_fqdn']}")
            query = """
            WITH MaxCounts AS (
                SELECT
                    t.COLUMN_NAME,
                    MAX(item.count) AS max_count,
                    MAX(t.window) AS latest
                FROM
                    {monitoring_table} t
                    LATERAL VIEW EXPLODE(t.frequent_items) AS item
                GROUP BY t.COLUMN_NAME
            )
            SELECT
                "{monitored_table_catalog}" as DB_NAME,
                "{monitored_table_schema}" as SCHEMA_NAME,
                "{monitored_table_name}" as TABLE_NAME,
                t.COLUMN_NAME,
                t.DATA_TYPE,
                t.distinct_count as CARDINALITY,
                t.MAX as MAX_VALUE,
                t.MIN as MIN_VALUE,
                t.AVG as AVG_VALUE,
                t.MEDIAN as MEDIAN_VALUE,
                t.STDDEV as STDDEV_VALUE,
                t.NUM_NULLS as NULL_COUNT,
                item.item AS MODE_VALUE
            FROM
                {monitoring_table} t
                JOIN MaxCounts mc ON t.COLUMN_NAME = mc.COLUMN_NAME
                LATERAL VIEW EXPLODE(t.frequent_items) AS item
            WHERE
                item.count = mc.max_count
                AND t.window = mc.latest
            """.format(
                monitoring_table=table["table_fqdn"],
                monitored_table_catalog=monitored_table[0],
                monitored_table_schema=monitored_table[1],
                monitored_table_name=monitored_table[2],
            )
            stats.append(databricks_executor.get_query_results(query))
    return stats
|
178
|
+
|
179
|
+
|
180
|
+
def databricks_column_stats(
    conn: databricks.DatabricksConnectionConfig,
    qdc_client: qdc.QDCExternalAPIClient,
    tenant_id: str,
    monitoring_table_id: str = "_profile_metrics",
) -> None:
    """Push Databricks column statistics to QDC, one stat payload at a time.

    Args:
        conn: Databricks connection settings.
        qdc_client: Authenticated QDC external API client.
        tenant_id: Tenant used to derive global ids for the payloads.
        monitoring_table_id: Suffix identifying Lakehouse Monitoring tables.
    """
    for table_stat in _get_column_stats(conn, monitoring_table_id):
        for stat in gen_table_stats_payload(tenant_id, conn.host, table_stat):
            response_status = qdc_client.update_stats_by_id(
                global_id=stat.global_id,
                payload=stat.body.as_dict(),
            )
            # Only log successful ingestions; failures are reported by the client.
            if response_status == 200:
                logger.info("Stats for %s is successfully ingested.", stat.global_id)
    return
|
@@ -141,3 +141,15 @@ def parse_snowflake_results(results: List[Dict[str, str]]):
|
|
141
141
|
payload["UPSTREAM_TABLES"] = json.loads(result["UPSTREAM_TABLES"])
|
142
142
|
payloads.append(payload)
|
143
143
|
return payloads
|
144
|
+
|
145
|
+
|
146
|
+
def parse_databricks_table_lineage(results: List) -> List[Dict[str, Dict]]:
    """Parse rows from the Quollio Databricks table-level lineage model.

    Args:
        results: Rows carrying a "DOWNSTREAM_TABLE_NAME" string and an
            "UPSTREAM_TABLES" JSON-encoded string.

    Returns:
        A list of dicts (NOT tuples — the previous comment was wrong), each with
        "DOWNSTREAM_TABLE_NAME" copied through and "UPSTREAM_TABLES" decoded
        from JSON.
    """
    return [
        {
            "DOWNSTREAM_TABLE_NAME": result["DOWNSTREAM_TABLE_NAME"],
            "UPSTREAM_TABLES": json.loads(result["UPSTREAM_TABLES"]),
        }
        for result in results
    ]
|
quollio_core/profilers/stats.py
CHANGED
@@ -77,7 +77,6 @@ def gen_table_stats_payload(tenant_id: str, endpoint: str, stats: List[Dict[str,
|
|
77
77
|
table_global_id = new_global_id(
|
78
78
|
tenant_id=tenant_id, cluster_id=endpoint, data_id=global_id_arg, data_type="column"
|
79
79
|
)
|
80
|
-
|
81
80
|
stats_request = StatsRequest(
|
82
81
|
global_id=table_global_id,
|
83
82
|
db=stat["DB_NAME"],
|
quollio_core/redshift.py
CHANGED
@@ -50,11 +50,10 @@ def build_view(
|
|
50
50
|
options=["--no-use-colors", "--log-level", log_level, "--vars", options],
|
51
51
|
)
|
52
52
|
run_options = ["--no-use-colors", "--log-level", log_level, "--vars", options]
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
target_tables_str = " ".join(target_tables_list)
|
53
|
+
if target_tables is not None:
|
54
|
+
if "quollio_stats_columns" in target_tables:
|
55
|
+
target_tables.append("quollio_stats_profiling_columns")
|
56
|
+
target_tables_str = " ".join(target_tables)
|
58
57
|
run_options.append("--select")
|
59
58
|
run_options.append(target_tables_str)
|
60
59
|
|
@@ -0,0 +1,62 @@
|
|
1
|
+
import logging
|
2
|
+
from dataclasses import asdict, dataclass
|
3
|
+
from typing import Dict, List, Optional
|
4
|
+
|
5
|
+
from databricks.sdk.core import Config, HeaderFactory, oauth_service_principal
|
6
|
+
from databricks.sql.client import Connection, connect
|
7
|
+
|
8
|
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
|
9
|
+
|
10
|
+
|
11
|
+
@dataclass
class DatabricksConnectionConfig:
    """Connection settings for querying a Databricks SQL warehouse.

    host/http_path feed databricks.sql.connect; client_id/client_secret feed the
    OAuth service-principal Config (see DatabricksQueryExecutor in this module).
    """

    host: str  # workspace hostname without scheme ("https://" is prepended where needed)
    http_path: str  # HTTP path of the SQL warehouse endpoint
    client_id: str  # OAuth service-principal client id
    client_secret: str  # OAuth service-principal client secret
    catalog: str  # target catalog; presumably the default for generated views — confirm with callers
    schema: str  # target schema; presumably the default for generated views — confirm with callers

    def as_dict(self) -> Dict[str, str]:
        """Return the configuration as a plain dict via dataclasses.asdict."""
        return asdict(self)
|
22
|
+
|
23
|
+
|
24
|
+
class DatabricksQueryExecutor:
    """Executes SQL against a Databricks warehouse over a context-managed connection.

    Authenticates with an OAuth service principal and closes the underlying
    connection when the `with` block exits.
    """

    def __init__(self, config: DatabricksConnectionConfig) -> None:
        self.config = config
        self.conn = self.__initialize()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Always release the warehouse connection on context exit.
        self.conn.close()

    def __initialize(self) -> Connection:
        # The connector calls credential_provider whenever it needs OAuth headers.
        return connect(
            server_hostname=self.config.host,
            http_path=self.config.http_path,
            credentials_provider=self.credential_provider,
        )

    def get_query_results(self, query: str) -> List[Dict[str, str]]:
        """Run `query` and return every fetched row converted to a plain dict."""
        with self.conn.cursor() as cur:
            try:
                cur.execute(query)
                rows: List[Dict[str, str]] = cur.fetchall()
            except Exception as err:
                # Log the failing statement with traceback, then propagate.
                logging.error(query, exc_info=True)
                logging.error("databricks get_query_results failed. %s", err)
                raise
            return [row.asDict() for row in rows]

    def credential_provider(self) -> Optional[HeaderFactory]:
        # Build a workspace-scoped SDK Config and return the service-principal
        # OAuth header factory for the SQL connector.
        sdk_config = Config(
            host=f"https://{self.config.host}", client_id=self.config.client_id, client_secret=self.config.client_secret
        )
        return oauth_service_principal(sdk_config)
|
quollio_core/snowflake.py
CHANGED
@@ -50,11 +50,10 @@ def build_view(
|
|
50
50
|
options=["--no-use-colors", "--log-level", log_level, "--vars", options],
|
51
51
|
)
|
52
52
|
run_options = ["--no-use-colors", "--log-level", log_level, "--vars", options]
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
target_tables_str = " ".join(target_tables_list)
|
53
|
+
if target_tables is not None:
|
54
|
+
if "quollio_stats_columns" in target_tables:
|
55
|
+
target_tables.append("quollio_stats_profiling_columns")
|
56
|
+
target_tables_str = " ".join(target_tables)
|
58
57
|
run_options.append("--select")
|
59
58
|
run_options.append(target_tables_str)
|
60
59
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: quollio-core
|
3
|
-
Version: 0.4.
|
3
|
+
Version: 0.4.5
|
4
4
|
Summary: Quollio Core
|
5
5
|
Author-email: quollio-dev <qt.dev@quollio.com>
|
6
6
|
Maintainer-email: RyoAriyama <ryo.arym@gmail.com>, tharuta <35373297+TakumiHaruta@users.noreply.github.com>
|
@@ -21,14 +21,18 @@ Requires-Dist: blake3==0.3.3
|
|
21
21
|
Requires-Dist: dbt-core==1.7.10
|
22
22
|
Requires-Dist: dbt-snowflake==1.7.0
|
23
23
|
Requires-Dist: dbt-redshift==1.7.1
|
24
|
+
Requires-Dist: dbt-databricks==1.7.1
|
24
25
|
Requires-Dist: jinja2==3.1.3
|
25
26
|
Requires-Dist: PyYAML==6.0.1
|
26
27
|
Requires-Dist: requests==2.31.0
|
27
28
|
Requires-Dist: pyjwt==2.8.0
|
28
29
|
Requires-Dist: redshift-connector==2.0.915
|
29
30
|
Requires-Dist: snowflake-connector-python==3.5.0
|
31
|
+
Requires-Dist: databricks-sdk==0.17.0
|
32
|
+
Requires-Dist: databricks-sql-connector==2.9.5
|
30
33
|
Requires-Dist: sqlglot==20.8.0
|
31
34
|
Requires-Dist: black>=22.3.0 ; extra == "test"
|
35
|
+
Requires-Dist: coverage>=7.3.2 ; extra == "test"
|
32
36
|
Requires-Dist: isort>=5.10.1 ; extra == "test"
|
33
37
|
Requires-Dist: pyproject-flake8>=0.0.1-alpha.2 ; extra == "test"
|
34
38
|
Requires-Dist: pytest>=5.2 ; extra == "test"
|
@@ -1,20 +1,36 @@
|
|
1
|
-
quollio_core/__init__.py,sha256=
|
2
|
-
quollio_core/
|
3
|
-
quollio_core/
|
1
|
+
quollio_core/__init__.py,sha256=BXXaDg79qecIHTSRqA3Yh4FvgDbWzUjgYYWbwaMTRJY,83
|
2
|
+
quollio_core/bricks.py,sha256=PCHyh_I6M4PBRpLDtc5DTr7rpharllu-vcSAhySM4xg,8001
|
3
|
+
quollio_core/redshift.py,sha256=wap7QmV-YuHZAomIrHXytGUuxhQ5MFEb38QDY3XrThQ,10167
|
4
|
+
quollio_core/snowflake.py,sha256=8IMbdTjCDBIiS_GF8APWRTVWNj6EM3ZT8MRN12T-1v0,10266
|
5
|
+
quollio_core/dbt_projects/databricks/.gitignore,sha256=1jJAyXSzJ3YUm0nx3i7wUSE4RjQMX3ad6F8O88UbtzI,29
|
6
|
+
quollio_core/dbt_projects/databricks/README.md,sha256=ZpRQyhFAODAiS8dc1Kb_ndkul4cu4o4udN_EMa49CU4,440
|
7
|
+
quollio_core/dbt_projects/databricks/dbt_project.yml,sha256=3sH98RNk7TnphvI3yEdXDstb92kW5BNxr-cT0tXhwzk,480
|
8
|
+
quollio_core/dbt_projects/databricks/package-lock.yml,sha256=0s8qBWevHon05NSj37p8NMa-rMVs_6McdKmddUVyGQ8,376
|
9
|
+
quollio_core/dbt_projects/databricks/packages.yml,sha256=JtwWg3XK-nJp4tPv2QEER0kc5F2sPeVYqTkE54zMsIM,443
|
10
|
+
quollio_core/dbt_projects/databricks/analyses/.gitkeep,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
11
|
+
quollio_core/dbt_projects/databricks/macros/.gitkeep,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
12
|
+
quollio_core/dbt_projects/databricks/models/quollio_lineage_column_level.sql,sha256=mZ4mDCEZTwiSgCUr-w2QGze2-NQapt45EyQNQkCOI5I,2171
|
13
|
+
quollio_core/dbt_projects/databricks/models/quollio_lineage_column_level.yml,sha256=tidAK_FMhYYuPTxFoactwcXYQPSMZwQTxWrGBly4-1o,450
|
14
|
+
quollio_core/dbt_projects/databricks/models/quollio_lineage_table_level.sql,sha256=K63J7n7NIM2Jc7c4IF21JcW8AYOm9HxBNDiveUE4kzU,1558
|
15
|
+
quollio_core/dbt_projects/databricks/models/quollio_lineage_table_level.yml,sha256=ZGjz6C2bguDJxJyA7LhCHbuyZSRPEaRMXln9rxcotuo,344
|
16
|
+
quollio_core/dbt_projects/databricks/models/sources.yml,sha256=JXU-8lNsKm8dxIjmWos1vbTsWiea-9-pXnntik63ZpA,2231
|
17
|
+
quollio_core/dbt_projects/databricks/profiles/profiles_template.yml,sha256=Dw1RuTrE04yvGIaPQL7uc6pgSWloKHhu0KrduzJ1Z6M,353
|
18
|
+
quollio_core/dbt_projects/databricks/seeds/.gitkeep,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
19
|
+
quollio_core/dbt_projects/databricks/snapshots/.gitkeep,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
4
20
|
quollio_core/dbt_projects/redshift/README.md,sha256=55nDkX5uQXWmawpQbgG1hbyn64j_CegDBQddQ2C85C8,571
|
5
21
|
quollio_core/dbt_projects/redshift/dbt_project.yml,sha256=WVCmT-2usdGSm6EBM6MCdzEeEFwv9ANsyknreoNXgBc,405
|
6
|
-
quollio_core/dbt_projects/redshift/package-lock.yml,sha256=
|
22
|
+
quollio_core/dbt_projects/redshift/package-lock.yml,sha256=Gef3zDCLF41j_FL-_h3sIZOUVj6j7nTTvxXrQPLcBP0,109
|
7
23
|
quollio_core/dbt_projects/redshift/packages.yml,sha256=p9Bl2C44gdC6iYTUkz_15yq3xahSJf2IA3WOXLF_ahA,61
|
8
24
|
quollio_core/dbt_projects/redshift/analyses/.gitkeep,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
9
25
|
quollio_core/dbt_projects/redshift/macros/.gitkeep,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
|
-
quollio_core/dbt_projects/redshift/macros/materialization/divided_view.sql,sha256=
|
26
|
+
quollio_core/dbt_projects/redshift/macros/materialization/divided_view.sql,sha256=3tRQeXXdjn3aSZ94DgMN6A6yMlTCE8aMKXeIdcZVDIM,3998
|
11
27
|
quollio_core/dbt_projects/redshift/models/quollio_lineage_table_level.sql,sha256=AVPcNXfVYHwyutJzg61QT_VF9umfoC4i8C2HecAU4d4,2042
|
12
28
|
quollio_core/dbt_projects/redshift/models/quollio_lineage_table_level.yml,sha256=UcrXpUTT3ihBHKPljvjw8xHz-ND60PfvMJaXqGKOEic,236
|
13
29
|
quollio_core/dbt_projects/redshift/models/quollio_lineage_view_level.sql,sha256=A0CTgQwlz8InabA0cHuygV2GMZGYuAa7Zd5DIUOYzQI,1289
|
14
30
|
quollio_core/dbt_projects/redshift/models/quollio_lineage_view_level.yml,sha256=7Npwo3svL9715HpNU2MKzRI014Da4tIStLzAHmd0UaU,235
|
15
31
|
quollio_core/dbt_projects/redshift/models/quollio_sqllineage_sources.sql,sha256=e0A_Wqv_OcC8gG_yzTbI59vT-4vCI3JiAzFlmkvLnMk,1049
|
16
32
|
quollio_core/dbt_projects/redshift/models/quollio_sqllineage_sources.yml,sha256=qgazupx3ca4P8R0loY5F9hyCz2fmAcWqZ6iOySo_NoY,377
|
17
|
-
quollio_core/dbt_projects/redshift/models/quollio_stats_columns.sql,sha256=
|
33
|
+
quollio_core/dbt_projects/redshift/models/quollio_stats_columns.sql,sha256=lH8xPmAzSW-6wi_g1y_LFVhtFgHzBvTweVX-MKeJzUQ,302
|
18
34
|
quollio_core/dbt_projects/redshift/models/quollio_stats_columns.yml,sha256=V_BESPk6IqE52ExT26-78As9l9AlWW86-Geb5PIhThU,67
|
19
35
|
quollio_core/dbt_projects/redshift/models/quollio_stats_profiling_columns.sql,sha256=IPmHf51Er2jE9cMQHybT4adRxwwi2CEmgrBSv1Oeduc,1592
|
20
36
|
quollio_core/dbt_projects/redshift/models/quollio_stats_profiling_columns.yml,sha256=s-p9F44TdwoFYlQN-b9gHzcFYOMqhqDGA9ORS_M4lhs,523
|
@@ -22,20 +38,21 @@ quollio_core/dbt_projects/redshift/models/sources.yml,sha256=NOSoR4ces2XivuenuG7
|
|
22
38
|
quollio_core/dbt_projects/redshift/profiles/profiles_template.yml,sha256=8nS-IE25cLo6uhHdtiZG9YX0cd2fhUG0pPrFUJWs2AY,291
|
23
39
|
quollio_core/dbt_projects/redshift/seeds/.gitkeep,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
24
40
|
quollio_core/dbt_projects/redshift/snapshots/.gitkeep,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
41
|
+
quollio_core/dbt_projects/seeds/.gitkeep,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
25
42
|
quollio_core/dbt_projects/snowflake/README.md,sha256=55nDkX5uQXWmawpQbgG1hbyn64j_CegDBQddQ2C85C8,571
|
26
43
|
quollio_core/dbt_projects/snowflake/dbt_project.yml,sha256=LN5NDOyakQjIK99IogQX4Whh_1zmqUfD2gqDU9JR3As,407
|
27
44
|
quollio_core/dbt_projects/snowflake/package-lock.yml,sha256=Gef3zDCLF41j_FL-_h3sIZOUVj6j7nTTvxXrQPLcBP0,109
|
28
45
|
quollio_core/dbt_projects/snowflake/packages.yml,sha256=p9Bl2C44gdC6iYTUkz_15yq3xahSJf2IA3WOXLF_ahA,61
|
29
46
|
quollio_core/dbt_projects/snowflake/analyses/.gitkeep,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
30
47
|
quollio_core/dbt_projects/snowflake/macros/.gitkeep,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
31
|
-
quollio_core/dbt_projects/snowflake/macros/materialization/divided_view.sql,sha256=
|
48
|
+
quollio_core/dbt_projects/snowflake/macros/materialization/divided_view.sql,sha256=T4nFL76AbuQHBiLSAvNchoJnRjb1IRj4nToOyTNvLvw,2782
|
32
49
|
quollio_core/dbt_projects/snowflake/models/quollio_lineage_column_level.sql,sha256=Cxt2U2aXNG_LUm63jwTyxUkapkrB7_uHmesx1PTcMJM,4721
|
33
50
|
quollio_core/dbt_projects/snowflake/models/quollio_lineage_column_level.yml,sha256=a2uNIAh-xw51eu-GmHVuAnGnTbwK7h8-DjDeQtK3KaQ,711
|
34
51
|
quollio_core/dbt_projects/snowflake/models/quollio_lineage_table_level.sql,sha256=Q_7vY1N1Hi1LFv5CxkkdR3gQw8fTDnoKECTLSK4gd3o,5112
|
35
52
|
quollio_core/dbt_projects/snowflake/models/quollio_lineage_table_level.yml,sha256=QXlMBIkHo1Y-ANveKVx1FwyoYTMRXKgE2Z-PNouhQTw,325
|
36
53
|
quollio_core/dbt_projects/snowflake/models/quollio_sqllineage_sources.sql,sha256=gd6JhQO13xBIvOoeXcce1I7amNGytwE8pwUApXehwqM,1520
|
37
54
|
quollio_core/dbt_projects/snowflake/models/quollio_sqllineage_sources.yml,sha256=qgazupx3ca4P8R0loY5F9hyCz2fmAcWqZ6iOySo_NoY,377
|
38
|
-
quollio_core/dbt_projects/snowflake/models/quollio_stats_columns.sql,sha256=
|
55
|
+
quollio_core/dbt_projects/snowflake/models/quollio_stats_columns.sql,sha256=lH8xPmAzSW-6wi_g1y_LFVhtFgHzBvTweVX-MKeJzUQ,302
|
39
56
|
quollio_core/dbt_projects/snowflake/models/quollio_stats_columns.yml,sha256=V_BESPk6IqE52ExT26-78As9l9AlWW86-Geb5PIhThU,67
|
40
57
|
quollio_core/dbt_projects/snowflake/models/quollio_stats_profiling_columns.sql,sha256=kt2aFimIPkgKI_UQTjvfRlAjrdSbO8z6C_749pnXrnE,1382
|
41
58
|
quollio_core/dbt_projects/snowflake/models/quollio_stats_profiling_columns.yml,sha256=W39VAmFnnX6RBoW7B_4CConC1lm0Jm9o50Jsz9bYZzY,538
|
@@ -45,19 +62,21 @@ quollio_core/dbt_projects/snowflake/seeds/.gitkeep,sha256=47DEQpj8HBSa-_TImW-5JC
|
|
45
62
|
quollio_core/dbt_projects/snowflake/snapshots/.gitkeep,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
46
63
|
quollio_core/helper/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
47
64
|
quollio_core/helper/core.py,sha256=-3vCDlKExWPHJmWuZQNpYnvPP55uoGwRpTtnFvsDxIo,1127
|
48
|
-
quollio_core/helper/env_default.py,sha256=
|
65
|
+
quollio_core/helper/env_default.py,sha256=YIL9hfrPs1ViL1AXohnbWEjVBUDXbVVakH0ZoSZWOlc,1202
|
49
66
|
quollio_core/profilers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
50
|
-
quollio_core/profilers/
|
67
|
+
quollio_core/profilers/databricks.py,sha256=skTTlqogJGauZkN7c9uVSYalAACIB43yblGc1jEIM1U,7501
|
68
|
+
quollio_core/profilers/lineage.py,sha256=HrTjXxrchETRmHEb5tSFzzHdb6z2KMw-DTnUSeKxmr0,6379
|
51
69
|
quollio_core/profilers/redshift.py,sha256=obdHVIsOM1bwHGdvYKalsJcTXwLK02kAKQMSBzSvsDo,7862
|
52
70
|
quollio_core/profilers/snowflake.py,sha256=C1LC19ZaUMwNoXjsbnez0xANydJYs8oNRt6tixWKDq8,9090
|
53
71
|
quollio_core/profilers/sqllineage.py,sha256=oCyl4tpXL5bkfguXAzTHSB9kZBL3tQK_rfcJ4XQMrLo,5177
|
54
|
-
quollio_core/profilers/stats.py,sha256=
|
72
|
+
quollio_core/profilers/stats.py,sha256=PG1NbbUSpc1JuEYvBzD66rd24tp0C13_Y5Y7vRjYG1c,4720
|
55
73
|
quollio_core/repository/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
74
|
+
quollio_core/repository/databricks.py,sha256=m68tja5N-QxH3VqEq-mOJKBeR2qldSgj_L9iIxvWwm0,1945
|
56
75
|
quollio_core/repository/dbt.py,sha256=HXqW_xa4xYPh9CnKkg4L1gwG3SGjj2BAYoWgzWMFU4U,770
|
57
76
|
quollio_core/repository/qdc.py,sha256=VCmzAUvjLemw1os5TaPtfBFkMCOMuPeftjZmUPhFj2Y,4702
|
58
77
|
quollio_core/repository/redshift.py,sha256=UVHIpYzDQ2AbBTAGa8DgmEenG0NZsHfYroR1MmEPQGA,2991
|
59
78
|
quollio_core/repository/snowflake.py,sha256=1YVMDfb9euJKvikv1pk_IxVF6SVsiemSvZ-WMTSbY7E,1874
|
60
|
-
quollio_core-0.4.
|
61
|
-
quollio_core-0.4.
|
62
|
-
quollio_core-0.4.
|
63
|
-
quollio_core-0.4.
|
79
|
+
quollio_core-0.4.5.dist-info/LICENSE,sha256=V8j_M8nAz8PvAOZQocyRDX7keai8UJ9skgmnwqETmdY,34520
|
80
|
+
quollio_core-0.4.5.dist-info/WHEEL,sha256=EZbGkh7Ie4PoZfRQ8I0ZuP9VklN_TvcZ6DSE5Uar4z4,81
|
81
|
+
quollio_core-0.4.5.dist-info/METADATA,sha256=fbpCG8MiXchuHMyatF_kiThLXWSP0gMoMV_ffVDU4MA,6571
|
82
|
+
quollio_core-0.4.5.dist-info/RECORD,,
|
File without changes
|
File without changes
|