quollio-core 0.4.4__py3-none-any.whl → 0.4.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- quollio_core/__init__.py +1 -1
- quollio_core/bigquery.py +123 -0
- quollio_core/bricks.py +288 -0
- quollio_core/dbt_projects/databricks/.gitignore +4 -0
- quollio_core/dbt_projects/databricks/README.md +5 -0
- quollio_core/dbt_projects/databricks/analyses/.gitkeep +0 -0
- quollio_core/dbt_projects/databricks/dbt_project.yml +21 -0
- quollio_core/dbt_projects/databricks/macros/.gitkeep +0 -0
- quollio_core/dbt_projects/databricks/models/quollio_lineage_column_level.sql +73 -0
- quollio_core/dbt_projects/databricks/models/quollio_lineage_column_level.yml +14 -0
- quollio_core/dbt_projects/databricks/models/quollio_lineage_table_level.sql +63 -0
- quollio_core/dbt_projects/databricks/models/quollio_lineage_table_level.yml +11 -0
- quollio_core/dbt_projects/databricks/models/sources.yml +84 -0
- quollio_core/dbt_projects/databricks/package-lock.yml +14 -0
- quollio_core/dbt_projects/databricks/packages.yml +13 -0
- quollio_core/dbt_projects/databricks/profiles/profiles_template.yml +14 -0
- quollio_core/dbt_projects/databricks/seeds/.gitkeep +0 -0
- quollio_core/dbt_projects/databricks/snapshots/.gitkeep +0 -0
- quollio_core/dbt_projects/redshift/dbt_project.yml +1 -1
- quollio_core/dbt_projects/redshift/macros/materialization/divided_view.sql +101 -34
- quollio_core/dbt_projects/redshift/models/quollio_stats_columns.sql +1 -2
- quollio_core/dbt_projects/redshift/package-lock.yml +1 -1
- quollio_core/dbt_projects/seeds/.gitkeep +0 -0
- quollio_core/dbt_projects/snowflake/macros/materialization/divided_view.sql +50 -27
- quollio_core/dbt_projects/snowflake/models/quollio_stats_columns.sql +1 -2
- quollio_core/dbt_projects/snowflake/models/quollio_stats_profiling_columns.sql +57 -20
- quollio_core/helper/core.py +4 -0
- quollio_core/helper/env_default.py +28 -2
- quollio_core/helper/log.py +17 -0
- quollio_core/profilers/bigquery.py +81 -0
- quollio_core/profilers/databricks.py +198 -0
- quollio_core/profilers/lineage.py +26 -0
- quollio_core/profilers/redshift.py +41 -74
- quollio_core/profilers/snowflake.py +138 -169
- quollio_core/profilers/sqllineage.py +0 -1
- quollio_core/profilers/stats.py +0 -1
- quollio_core/redshift.py +15 -18
- quollio_core/repository/bigquery.py +61 -0
- quollio_core/repository/databricks.py +62 -0
- quollio_core/repository/dbt.py +0 -1
- quollio_core/repository/qdc.py +0 -3
- quollio_core/repository/redshift.py +0 -1
- quollio_core/repository/snowflake.py +6 -1
- quollio_core/snowflake.py +29 -16
- {quollio_core-0.4.4.dist-info → quollio_core-0.4.10.dist-info}/METADATA +11 -2
- {quollio_core-0.4.4.dist-info → quollio_core-0.4.10.dist-info}/RECORD +48 -25
- {quollio_core-0.4.4.dist-info → quollio_core-0.4.10.dist-info}/LICENSE +0 -0
- {quollio_core-0.4.4.dist-info → quollio_core-0.4.10.dist-info}/WHEEL +0 -0
quollio_core/profilers/snowflake.py
CHANGED

```diff
@@ -17,42 +17,41 @@ def snowflake_table_to_table_lineage(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-
-
-
-
-
-
-
-
-
-
-            schema=conn.account_schema,
-        )
-    )
-    parsed_results = parse_snowflake_results(results=results)
-    update_table_lineage_inputs = gen_table_lineage_payload(
-        tenant_id=tenant_id,
-        endpoint=conn.account_id,
-        tables=parsed_results,
-    )
-
-    req_count = 0
-    for update_table_lineage_input in update_table_lineage_inputs:
-        logger.info(
-            "Generating table lineage. downstream: {db} -> {schema} -> {table}".format(
-                db=update_table_lineage_input.downstream_database_name,
-                schema=update_table_lineage_input.downstream_schema_name,
-                table=update_table_lineage_input.downstream_table_name,
+    with snowflake.SnowflakeQueryExecutor(conn) as sf_executor:
+        results = sf_executor.get_query_results(
+            query="""
+            SELECT
+                *
+            FROM
+                {db}.{schema}.QUOLLIO_LINEAGE_TABLE_LEVEL
+            """.format(
+                db=conn.account_database,
+                schema=conn.account_schema,
             )
         )
-
-
-
+        parsed_results = parse_snowflake_results(results=results)
+        update_table_lineage_inputs = gen_table_lineage_payload(
+            tenant_id=tenant_id,
+            endpoint=conn.account_id,
+            tables=parsed_results,
         )
-
-
-
+
+        req_count = 0
+        for update_table_lineage_input in update_table_lineage_inputs:
+            logger.info(
+                "Generating table lineage. downstream: {db} -> {schema} -> {table}".format(
+                    db=update_table_lineage_input.downstream_database_name,
+                    schema=update_table_lineage_input.downstream_schema_name,
+                    table=update_table_lineage_input.downstream_table_name,
+                )
+            )
+            status_code = qdc_client.update_lineage_by_id(
+                global_id=update_table_lineage_input.downstream_global_id,
+                payload=update_table_lineage_input.upstreams.as_dict(),
+            )
+            if status_code == 200:
+                req_count += 1
+        logger.info(f"Generating table lineage is finished. {req_count} lineages are ingested.")
     return
 
 
@@ -61,41 +60,41 @@ def snowflake_column_to_column_lineage(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-
-
-
-
-
-
-
-
-
-
-        )
-    )
-    update_column_lineage_inputs = gen_column_lineage_payload(
-        tenant_id=tenant_id,
-        endpoint=conn.account_id,
-        columns=results,
-    )
-
-    req_count = 0
-    for update_column_lineage_input in update_column_lineage_inputs:
-        logger.info(
-            "Generating column lineage. downstream: {db} -> {schema} -> {table} -> {column}".format(
-                db=update_column_lineage_input.downstream_database_name,
-                schema=update_column_lineage_input.downstream_schema_name,
-                table=update_column_lineage_input.downstream_table_name,
-                column=update_column_lineage_input.downstream_column_name,
+    with snowflake.SnowflakeQueryExecutor(conn) as sf_executor:
+        results = sf_executor.get_query_results(
+            query="""
+            SELECT
+                *
+            FROM
+                {db}.{schema}.QUOLLIO_LINEAGE_COLUMN_LEVEL
+            """.format(
+                db=conn.account_database,
+                schema=conn.account_schema,
             )
         )
-
-
-
+        update_column_lineage_inputs = gen_column_lineage_payload(
+            tenant_id=tenant_id,
+            endpoint=conn.account_id,
+            columns=results,
         )
-
-
-
+
+        req_count = 0
+        for update_column_lineage_input in update_column_lineage_inputs:
+            logger.info(
+                "Generating column lineage. downstream: {db} -> {schema} -> {table} -> {column}".format(
+                    db=update_column_lineage_input.downstream_database_name,
+                    schema=update_column_lineage_input.downstream_schema_name,
+                    table=update_column_lineage_input.downstream_table_name,
+                    column=update_column_lineage_input.downstream_column_name,
+                )
+            )
+            status_code = qdc_client.update_lineage_by_id(
+                global_id=update_column_lineage_input.downstream_global_id,
+                payload=update_column_lineage_input.upstreams.as_dict(),
+            )
+            if status_code == 200:
+                req_count += 1
+        logger.info(f"Generating column lineage is finished. {req_count} lineages are ingested.")
     return
 
 
@@ -104,110 +103,67 @@ def snowflake_table_level_sqllineage(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-
-
-
-
-
-
-
-
-
-
-
-
-            schema=conn.account_schema,
-        )
-    )
-    update_table_lineage_inputs_list = list()
-    sql_lineage = SQLLineage()
-    for result in results:
-        src_tables, dest_table = sql_lineage.get_table_level_lineage_source(
-            sql=result["QUERY_TEXT"],
-            dialect="snowflake",
-            dest_db=result["DATABASE_NAME"],
-            dest_schema=result["SCHEMA_NAME"],
-        )
-        update_table_lineage_inputs = sql_lineage.gen_lineage_input(
-            tenant_id=tenant_id, endpoint=conn.account_id, src_tables=src_tables, dest_table=dest_table
-        )
-        update_table_lineage_inputs_list.append(update_table_lineage_inputs)
-
-    req_count = 0
-    for update_table_lineage_input in update_table_lineage_inputs_list:
-        logger.info(
-            "Generating table lineage. downstream: {db} -> {schema} -> {table}".format(
-                db=update_table_lineage_input.downstream_database_name,
-                schema=update_table_lineage_input.downstream_schema_name,
-                table=update_table_lineage_input.downstream_table_name,
+    with snowflake.SnowflakeQueryExecutor(conn) as sf_executor:
+        results = sf_executor.get_query_results(
+            query="""
+            SELECT
+                database_name
+                , schema_name
+                , query_text
+            FROM
+                {db}.{schema}.QUOLLIO_SQLLINEAGE_SOURCES
+            """.format(
+                db=conn.account_database,
+                schema=conn.account_schema,
             )
         )
-
-
-
-
-
-
-
+        update_table_lineage_inputs_list = list()
+        sql_lineage = SQLLineage()
+        for result in results:
+            src_tables, dest_table = sql_lineage.get_table_level_lineage_source(
+                sql=result["QUERY_TEXT"],
+                dialect="snowflake",
+                dest_db=result["DATABASE_NAME"],
+                dest_schema=result["SCHEMA_NAME"],
+            )
+            update_table_lineage_inputs = sql_lineage.gen_lineage_input(
+                tenant_id=tenant_id, endpoint=conn.account_id, src_tables=src_tables, dest_table=dest_table
+            )
+            update_table_lineage_inputs_list.append(update_table_lineage_inputs)
+
+        req_count = 0
+        for update_table_lineage_input in update_table_lineage_inputs_list:
+            logger.info(
+                "Generating table lineage. downstream: {db} -> {schema} -> {table}".format(
+                    db=update_table_lineage_input.downstream_database_name,
+                    schema=update_table_lineage_input.downstream_schema_name,
+                    table=update_table_lineage_input.downstream_table_name,
+                )
+            )
+            status_code = qdc_client.update_lineage_by_id(
+                global_id=update_table_lineage_input.downstream_global_id,
+                payload=update_table_lineage_input.upstreams.as_dict(),
+            )
+            if status_code == 200:
+                req_count += 1
+        logger.info(f"Generating table lineage is finished. {req_count} lineages are ingested.")
     return
 
 
-def _get_target_tables_query(db: str, schema: str) -> str:
-    query = """
-    SELECT
-        DISTINCT
-        TABLE_CATALOG
-        , TABLE_SCHEMA
-        , TABLE_NAME
-    FROM
-        {db}.{schema}.QUOLLIO_STATS_PROFILING_COLUMNS
-    """.format(
-        db=db, schema=schema
-    )
-    return query
-
-
-def _get_stats_tables_query(db: str, schema: str) -> str:
-    query = """
-    SELECT
-        DISTINCT
-        TABLE_CATALOG
-        , TABLE_SCHEMA
-        , TABLE_NAME
-    FROM
-        {db}.INFORMATION_SCHEMA.TABLES
-    WHERE
-        startswith(TABLE_NAME, 'QUOLLIO_STATS_COLUMNS_')
-        AND TABLE_SCHEMA = UPPER('{schema}')
-    """.format(
-        db=db, schema=schema
-    )
-    return query
-
-
 def snowflake_table_stats(
     conn: snowflake.SnowflakeConnectionConfig,
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-
-
-
-
-
-
-    )
-    target_assets = sf_executor.get_query_results(query=target_query)
-
-    stats_query = _get_stats_tables_query(
-        db=conn.account_database,
-        schema=conn.account_schema,
-    )
-    stats_columns = sf_executor.get_query_results(query=stats_query)
+    with snowflake.SnowflakeQueryExecutor(conn) as sf_executor:
+        stats_query = _gen_get_stats_views_query(
+            db=conn.account_database,
+            schema=conn.account_schema,
+        )
+        stats_views = sf_executor.get_query_results(query=stats_query)
 
-
-
-    for stats_column in stats_columns:
+        req_count = 0
+        for stats_view in stats_views:
             stats_query = """
             SELECT
                 db_name
@@ -224,18 +180,12 @@ def snowflake_table_stats(
                 , stddev_value
             FROM
                 {db}.{schema}.{table}
-            WHERE
-                db_name = '{target_db}'
-                and schema_name = '{target_schema}'
-                and table_name = '{target_table}'
             """.format(
-                db=
-                schema=
-                table=
-                target_db=target_asset["TABLE_CATALOG"],
-                target_schema=target_asset["TABLE_SCHEMA"],
-                target_table=target_asset["TABLE_NAME"],
+                db=stats_view["TABLE_CATALOG"],
+                schema=stats_view["TABLE_SCHEMA"],
+                table=stats_view["TABLE_NAME"],
             )
+            logger.debug(f"The following sql will be fetched to retrieve stats values. {stats_query}")
             stats_result = sf_executor.get_query_results(query=stats_query)
             payloads = gen_table_stats_payload(tenant_id=tenant_id, endpoint=conn.account_id, stats=stats_result)
             for payload in payloads:
@@ -253,4 +203,23 @@ def snowflake_table_stats(
                 )
                 if status_code == 200:
                     req_count += 1
-
+        logger.info(f"Generating table stats is finished. {req_count} stats are ingested.")
+    return
+
+
+def _gen_get_stats_views_query(db: str, schema: str) -> str:
+    query = """
+    SELECT
+        DISTINCT
+        TABLE_CATALOG
+        , TABLE_SCHEMA
+        , TABLE_NAME
+    FROM
+        {db}.INFORMATION_SCHEMA.TABLES
+    WHERE
+        startswith(TABLE_NAME, 'QUOLLIO_STATS_COLUMNS_')
+        AND TABLE_SCHEMA = UPPER('{schema}')
+    """.format(
+        db=db, schema=schema
+    )
+    return query
```
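In the stats flow above, the removed `_get_target_tables_query` and `_get_stats_tables_query` helpers are consolidated into a single `_gen_get_stats_views_query`, and each `QUOLLIO_STATS_COLUMNS_*` view is now read in full instead of being filtered per target table. A minimal sketch of the new discovery step, with placeholder database and schema names:

```python
# Sketch only: "MY_DB" and "QUOLLIO" are illustrative placeholders,
# not values from this release.
from quollio_core.profilers.snowflake import _gen_get_stats_views_query

query = _gen_get_stats_views_query(db="MY_DB", schema="QUOLLIO")
# The generated SQL lists every QUOLLIO_STATS_COLUMNS_* view in the schema,
# roughly:
#   SELECT DISTINCT TABLE_CATALOG, TABLE_SCHEMA, TABLE_NAME
#   FROM MY_DB.INFORMATION_SCHEMA.TABLES
#   WHERE startswith(TABLE_NAME, 'QUOLLIO_STATS_COLUMNS_')
#     AND TABLE_SCHEMA = UPPER('QUOLLIO')
print(query)
```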
quollio_core/profilers/sqllineage.py
CHANGED

```diff
@@ -54,7 +54,6 @@ class SQLLineage:
         dest_db: str = None,
         dest_schema: str = None,
     ) -> Tuple[Set[Table], Table]:
-        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
         try:
             statement: sqlglot.Expression = sqlglot.parse_one(sql=sql, error_level=sqlglot.ErrorLevel.RAISE)
         except ParseError as e:
```
quollio_core/profilers/stats.py
CHANGED

```diff
@@ -77,7 +77,6 @@ def gen_table_stats_payload(tenant_id: str, endpoint: str, stats: List[Dict[str,
         table_global_id = new_global_id(
             tenant_id=tenant_id, cluster_id=endpoint, data_id=global_id_arg, data_type="column"
         )
-
         stats_request = StatsRequest(
             global_id=table_global_id,
             db=stat["DB_NAME"],
```
quollio_core/redshift.py
CHANGED

```diff
@@ -4,6 +4,7 @@ import os
 
 from quollio_core.helper.core import setup_dbt_profile
 from quollio_core.helper.env_default import env_default
+from quollio_core.helper.log import set_log_level
 from quollio_core.profilers.redshift import (
     redshift_table_level_lineage,
     redshift_table_level_sqllineage,
@@ -16,12 +17,10 @@ logger = logging.getLogger(__name__)
 
 def build_view(
     conn: redshift.RedshiftConnectionConfig,
-
+    aggregate_all: bool = False,
     target_tables: str = "",
     log_level: str = "info",
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
-
     logger.info("Build profiler views using dbt")
     # set parameters
     dbt_client = dbt.DBTClient()
@@ -29,9 +28,9 @@ def build_view(
     project_path = f"{current_dir}/dbt_projects/redshift"
     template_path = f"{current_dir}/dbt_projects/redshift/profiles"
     template_name = "profiles_template.yml"
-    options = '{{"query_user": {query_user}, "
+    options = '{{"query_user": {query_user}, "aggregate_all": {aggregate_all}, "target_database": {database}}}'.format(
         query_user=conn.query_user,
-
+        aggregate_all=aggregate_all,
         database=conn.database,
     )
 
@@ -50,11 +49,10 @@ def build_view(
         options=["--no-use-colors", "--log-level", log_level, "--vars", options],
     )
     run_options = ["--no-use-colors", "--log-level", log_level, "--vars", options]
-
-
-
-
-    target_tables_str = " ".join(target_tables_list)
+    if target_tables is not None:
+        if "quollio_stats_columns" in target_tables:
+            target_tables.append("quollio_stats_profiling_columns")
+        target_tables_str = " ".join(target_tables)
     run_options.append("--select")
     run_options.append(target_tables_str)
 
@@ -75,7 +73,6 @@ def load_lineage(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
     logger.info("Generate redshift table to table lineage.")
     redshift_table_level_lineage(
         conn=conn,
@@ -102,7 +99,6 @@ def load_stats(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
 
     logger.info("Generate redshift stats.")
     redshift_table_stats(
@@ -120,7 +116,6 @@ def load_sqllineage(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
 
     logger.info("Generate Redshift sqllineage.")
     redshift_table_level_sqllineage(
@@ -211,12 +206,12 @@ if __name__ == "__main__":
         help="Target schema name where the views are built by dbt",
     )
     parser.add_argument(
-        "--
+        "--aggregate_all",
         type=bool,
-        action=env_default("
-        default=
+        action=env_default("REDSHIFT_AGGREGATE_ALL", store_true=True),
+        default=False,
         required=False,
-        help="
+        help="Aggregate all stats values. False by default.",
     )
     parser.add_argument(
         "--target_tables",
@@ -267,6 +262,8 @@ if __name__ == "__main__":
         help="The client secrete that is created on Quollio console to let clients access Quollio External API",
     )
     args = parser.parse_args()
+    set_log_level(level=args.log_level)
+
     conn = redshift.RedshiftConnectionConfig(
         host=args.host,
         build_user=args.build_user,
@@ -284,7 +281,7 @@ if __name__ == "__main__":
     if "build_view" in args.commands:
         build_view(
             conn=conn,
-
+            aggregate_all=args.aggregate_all,
             target_tables=args.target_tables,
             log_level=args.log_level,
         )
```
quollio_core/repository/bigquery.py
ADDED

```diff
@@ -0,0 +1,61 @@
+import logging
+
+from google.cloud.bigquery import Client
+from google.cloud.datacatalog_lineage_v1 import EntityReference, LineageClient, SearchLinksRequest
+from google.oauth2.service_account import Credentials
+from googleapiclient.discovery import build
+
+logger = logging.getLogger(__name__)
+
+
+class BigQueryClient:
+    def __init__(self, credentials: Credentials) -> None:
+        self.client = self.__initialze(credentials=credentials)
+
+    def __initialze(self, credentials: Credentials) -> Client:
+        client = Client(credentials=credentials)
+        return client
+
+    def list_datasets(self, project_id) -> list:
+        datasets = list(self.client.list_datasets(project_id))
+        logger.debug("Found %s datasets in project %s", len(datasets), project_id)
+        return datasets
+
+    def list_tables(self, dataset_id) -> list:
+        tables = list(self.client.list_tables(dataset_id))
+        logger.debug("Found %s tables in dataset %s", len(tables), dataset_id)
+        return list(self.client.list_tables(dataset_id))
+
+
+class GCPLineageClient:
+    def __init__(self, credentials: Credentials) -> None:
+        self.client = self.__initialze(credentials=credentials)
+
+    def __initialze(self, credentials: Credentials) -> LineageClient:
+        client = LineageClient(credentials=credentials)
+        return client
+
+    def get_links(self, request: SearchLinksRequest) -> list:
+        response = self.client.search_links(request)
+        return response.links
+
+
+def get_entitiy_reference() -> EntityReference:
+    return EntityReference()
+
+
+def get_search_request(downstream_table: EntityReference, project_id: str, region: str) -> SearchLinksRequest:
+    return SearchLinksRequest(target=downstream_table, parent=f"projects/{project_id}/locations/{region.lower()}")
+
+
+def get_credentials(credentials_json: dict) -> Credentials:
+    return Credentials.from_service_account_info(credentials_json)
+
+
+def get_org_id(credentials_json: dict) -> str:
+    credentials = get_credentials(credentials_json)
+    crm_service = build("cloudresourcemanager", "v1", credentials=credentials)
+    project_id = credentials_json["project_id"]
+    project = crm_service.projects().get(projectId=project_id).execute()
+    org_id = project["parent"]["id"]
+    return org_id
```
quollio_core/repository/databricks.py
ADDED

```diff
@@ -0,0 +1,62 @@
+import logging
+from dataclasses import asdict, dataclass
+from typing import Dict, List, Optional
+
+from databricks.sdk.core import Config, HeaderFactory, oauth_service_principal
+from databricks.sql.client import Connection, connect
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class DatabricksConnectionConfig:
+    host: str
+    http_path: str
+    client_id: str
+    client_secret: str
+    catalog: str
+    schema: str
+
+    def as_dict(self) -> Dict[str, str]:
+        return asdict(self)
+
+
+class DatabricksQueryExecutor:
+    def __init__(self, config: DatabricksConnectionConfig) -> None:
+        self.config = config
+        self.conn = self.__initialize()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.conn.close()
+
+    def __initialize(self) -> Connection:
+        conn = connect(
+            server_hostname=self.config.host,
+            http_path=self.config.http_path,
+            credentials_provider=self.credential_provider,
+        )
+        return conn
+
+    def get_query_results(self, query: str) -> List[Dict[str, str]]:
+        results_asdict: List[Dict[str, str]] = []
+        with self.conn.cursor() as cur:
+            try:
+                cur.execute(query)
+                result: List[Dict[str, str]] = cur.fetchall()
+            except Exception as e:
+                logger.error(query, exc_info=True)
+                logger.error("databricks get_query_results failed. %s", e)
+                raise
+
+        for row in result:
+            results_asdict.append(row.asDict())
+        return results_asdict
+
+    def credential_provider(self) -> Optional[HeaderFactory]:
+        config = Config(
+            host=f"https://{self.config.host}", client_id=self.config.client_id, client_secret=self.config.client_secret
+        )
+        return oauth_service_principal(config)
```
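A usage sketch for the new Databricks executor; every connection value below is a placeholder. Since the class defines `__enter__`/`__exit__`, a `with` block closes the SQL warehouse connection automatically:

```python
# Sketch only: host, http_path, and the OAuth service-principal
# credentials below are placeholders.
from quollio_core.repository.databricks import (
    DatabricksConnectionConfig,
    DatabricksQueryExecutor,
)

config = DatabricksConnectionConfig(
    host="dbc-xxxxxxxx-xxxx.cloud.databricks.com",      # workspace hostname
    http_path="/sql/1.0/warehouses/0123456789abcdef",   # SQL warehouse path
    client_id="service-principal-client-id",
    client_secret="service-principal-secret",
    catalog="main",
    schema="quollio",
)

# __exit__ closes the underlying connection when the block ends.
with DatabricksQueryExecutor(config) as executor:
    rows = executor.get_query_results("SELECT 1 AS ok")  # list of dicts, one per row
```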
quollio_core/repository/dbt.py
CHANGED

```diff
@@ -11,7 +11,6 @@ class DBTClient:
         self.dbt = dbtRunner()
 
     def invoke(self, cmd: str, project_dir: str, profile_dir: str, options: List[str] = None) -> dbtRunnerResult:
-        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
         req = [cmd, "--project-dir", project_dir, "--profiles-dir", profile_dir]
         if options is not None:
             req.extend(options)
```
quollio_core/repository/qdc.py
CHANGED

```diff
@@ -25,7 +25,6 @@ class QDCExternalAPIClient:
         Tried to find a package for oauth0 client credentials flow,
         but any of them contains bugs or lacks of features to handle the token refresh when it's expired
         """
-        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
         url = f"{self.base_url}/oauth2/token"
         creds = f"{self.client_id}:{self.client_secret}"
         encoded_creds = base64.b64encode(creds.encode()).decode()
@@ -65,7 +64,6 @@ class QDCExternalAPIClient:
         return session
 
     def update_stats_by_id(self, global_id: str, payload: Dict[str, List[str]]) -> int:
-        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
         self._refresh_token_if_expired()
         headers = {"content-type": "application/json", "authorization": f"Bearer {self.auth_token}"}
         endpoint = f"{self.base_url}/v2/assets/{global_id}/stats"
@@ -85,7 +83,6 @@ class QDCExternalAPIClient:
         return res.status_code
 
     def update_lineage_by_id(self, global_id: str, payload: Dict[str, List[str]]) -> int:
-        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
         self._refresh_token_if_expired()
        headers = {"content-type": "application/json", "authorization": f"Bearer {self.auth_token}"}
         endpoint = f"{self.base_url}/v2/lineage/{global_id}"
```
quollio_core/repository/redshift.py
CHANGED

```diff
@@ -67,7 +67,6 @@ class RedshiftQueryExecutor:
         return conn
 
     def get_query_results(self, query: str) -> Tuple[List[str]]:
-        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
         with self.conn.cursor() as cur:
             try:
                 cur.execute(query)
```
quollio_core/repository/snowflake.py
CHANGED

```diff
@@ -28,6 +28,12 @@ class SnowflakeQueryExecutor:
     def __init__(self, config: SnowflakeConnectionConfig) -> None:
         self.conn = self.__initialize(config)
 
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.conn.close()
+
     def __initialize(self, config: SnowflakeConnectionConfig) -> SnowflakeConnection:
         conn: SnowflakeConnection = connect(
             user=config.account_user,
@@ -41,7 +47,6 @@ class SnowflakeQueryExecutor:
         return conn
 
     def get_query_results(self, query: str) -> List[Dict[str, str]]:
-        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
         with self.conn.cursor(DictCursor) as cur:
             try:
                 cur.execute(query)
```