quollio-core 0.4.7__py3-none-any.whl → 0.4.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- quollio_core/__init__.py +1 -1
- quollio_core/bigquery.py +10 -1
- quollio_core/bricks.py +43 -12
- quollio_core/dbt_projects/databricks/models/quollio_lineage_table_level.yml +1 -1
- quollio_core/dbt_projects/redshift/dbt_project.yml +1 -1
- quollio_core/dbt_projects/redshift/macros/materialization/divided_view.sql +68 -29
- quollio_core/dbt_projects/redshift/models/quollio_stats_columns.sql +1 -2
- quollio_core/dbt_projects/snowflake/macros/materialization/divided_view.sql +50 -28
- quollio_core/dbt_projects/snowflake/models/quollio_stats_columns.sql +1 -2
- quollio_core/helper/core.py +4 -0
- quollio_core/helper/env_default.py +24 -1
- quollio_core/helper/log.py +17 -0
- quollio_core/profilers/databricks.py +11 -6
- quollio_core/profilers/redshift.py +41 -74
- quollio_core/profilers/snowflake.py +138 -169
- quollio_core/profilers/sqllineage.py +0 -1
- quollio_core/redshift.py +11 -13
- quollio_core/repository/databricks.py +3 -3
- quollio_core/repository/dbt.py +0 -1
- quollio_core/repository/qdc.py +0 -3
- quollio_core/repository/redshift.py +0 -1
- quollio_core/repository/snowflake.py +6 -1
- quollio_core/snowflake.py +25 -11
- {quollio_core-0.4.7.dist-info → quollio_core-0.4.10.dist-info}/METADATA +2 -2
- {quollio_core-0.4.7.dist-info → quollio_core-0.4.10.dist-info}/RECORD +27 -26
- {quollio_core-0.4.7.dist-info → quollio_core-0.4.10.dist-info}/LICENSE +0 -0
- {quollio_core-0.4.7.dist-info → quollio_core-0.4.10.dist-info}/WHEEL +0 -0
quollio_core/profilers/redshift.py
CHANGED
```diff
@@ -14,7 +14,6 @@ def redshift_table_level_lineage(
     tenant_id: str,
     dbt_table_name: str,
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
     with redshift.RedshiftQueryExecutor(config=conn) as redshift_executor:
         results = redshift_executor.get_query_results(
             query="""
```
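The dominant theme of this release is logging cleanup: every per-function `logging.basicConfig(...)` call (as removed above) is dropped in favor of configuring logging once at the entrypoint through a new `set_log_level` helper in `quollio_core/helper/log.py` (+17 lines). The helper's body is not shown in this diff; a minimal sketch of what it might look like, reusing the format string from the removed calls (the implementation details are assumptions):

```python
# Hypothetical reconstruction of quollio_core/helper/log.py; only the name
# set_log_level and the call set_log_level(level=args.log_level) appear in
# this diff, so everything else here is an assumption.
import logging


def set_log_level(level: str = "info") -> None:
    # Map a CLI-style level string ("debug", "info", ...) to a logging constant.
    numeric_level = getattr(logging, level.upper(), logging.INFO)
    logging.basicConfig(
        level=numeric_level,
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    )
```

Configuring the root logger once in `__main__` (see the `quollio_core/redshift.py` hunks below) also avoids the repeated-`basicConfig` pattern, which only took effect on the first call anyway.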
```diff
@@ -55,22 +54,7 @@ def redshift_table_level_lineage(
     return
 
 
-def _get_target_tables_query(db: str, schema: str) -> str:
-    query = """
-        SELECT
-            DISTINCT
-            database_name
-            , schema_name
-            , table_name
-        FROM
-            {db}.{schema}.quollio_stats_profiling_columns
-        """.format(
-        db=db, schema=schema
-    )
-    return query
-
-
-def _get_stats_tables_query(db: str, schema: str) -> str:
+def _gen_get_stats_views_query(db: str, schema: str) -> str:
     query = """
         SELECT
             DISTINCT
```
```diff
@@ -93,70 +77,54 @@ def redshift_table_stats(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
 
     with redshift.RedshiftQueryExecutor(config=conn) as redshift_executor:
-
-        target_query = _get_target_tables_query(
+        stats_query = _gen_get_stats_views_query(
             db=conn.database,
             schema=conn.schema,
         )
-
+        stats_views = redshift_executor.get_query_results(query=stats_query)
 
⋮ (old lines 106-138 removed; content not rendered in the source view)
-                    target_table=target_asset[2],
+        req_count = 0
+        for stats_view in stats_views:
+            stats_query = """
+                SELECT
+                    db_name
+                    , schema_name
+                    , table_name
+                    , column_name
+                    , max_value
+                    , min_value
+                    , null_count
+                    , cardinality
+                    , avg_value
+                    , median_value
+                    , mode_value
+                    , stddev_value
+                FROM
+                    {db}.{schema}.{table}
+                """.format(
+                db=stats_view[0],
+                schema=stats_view[1],
+                table=stats_view[2],
+            )
+            stats_result = redshift_executor.get_query_results(query=stats_query)
+            payloads = gen_table_stats_payload_from_tuple(tenant_id=tenant_id, endpoint=conn.host, stats=stats_result)
+            for payload in payloads:
+                logger.info(
+                    "Generating table stats. asset: {db} -> {schema} -> {table} -> {column}".format(
+                        db=payload.db,
+                        schema=payload.schema,
+                        table=payload.table,
+                        column=payload.column,
+                    )
                 )
⋮ (old lines 141-143 removed; content not rendered in the source view)
+                status_code = qdc_client.update_stats_by_id(
+                    global_id=payload.global_id,
+                    payload=payload.body.get_column_stats(),
                 )
⋮ (old lines 145-146 removed; content not rendered in the source view)
-                "Generating table stats. asset: {db} -> {schema} -> {table} -> {column}".format(
-                    db=payload.db,
-                    schema=payload.schema,
-                    table=payload.table,
-                    column=payload.column,
-                )
-            )
-            status_code = qdc_client.update_stats_by_id(
-                global_id=payload.global_id,
-                payload=payload.body.get_column_stats(),
-            )
-            if status_code == 200:
-                req_count += 1
+                if status_code == 200:
+                    req_count += 1
     logger.info(f"Generating table stats is finished. {req_count} stats are ingested.")
     return
 
```
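Two things change in `redshift_table_stats`: the stats views are now discovered up front via `_gen_get_stats_views_query`, and each discovered view is read whole rather than filtered per target table. Rows from the Redshift driver arrive as positional tuples, hence `gen_table_stats_payload_from_tuple` and index access; a tiny illustration of that contract (the identifiers are made up):

```python
# Illustration only: a discovered stats view arrives as a positional tuple
# (database, schema, view name); the identifiers below are made up.
stats_view = ("analytics", "quollio", "quollio_stats_columns_orders")

query = "SELECT db_name, column_name, max_value FROM {db}.{schema}.{table}".format(
    db=stats_view[0],      # database name
    schema=stats_view[1],  # schema name
    table=stats_view[2],   # per-table stats view
)
print(query)
# SELECT db_name, column_name, max_value FROM analytics.quollio.quollio_stats_columns_orders
```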
```diff
@@ -166,7 +134,6 @@ def redshift_table_level_sqllineage(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
     redshift_connector = redshift.RedshiftQueryExecutor(conn)
     results = redshift_connector.get_query_results(
         query="""
```
quollio_core/profilers/snowflake.py
CHANGED
```diff
@@ -17,42 +17,41 @@ def snowflake_table_to_table_lineage(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
⋮ (old lines 20-29 removed; content not rendered in the source view)
-            schema=conn.account_schema,
-        )
-    )
-    parsed_results = parse_snowflake_results(results=results)
-    update_table_lineage_inputs = gen_table_lineage_payload(
-        tenant_id=tenant_id,
-        endpoint=conn.account_id,
-        tables=parsed_results,
-    )
-
-    req_count = 0
-    for update_table_lineage_input in update_table_lineage_inputs:
-        logger.info(
-            "Generating table lineage. downstream: {db} -> {schema} -> {table}".format(
-                db=update_table_lineage_input.downstream_database_name,
-                schema=update_table_lineage_input.downstream_schema_name,
-                table=update_table_lineage_input.downstream_table_name,
+    with snowflake.SnowflakeQueryExecutor(conn) as sf_executor:
+        results = sf_executor.get_query_results(
+            query="""
+            SELECT
+                *
+            FROM
+                {db}.{schema}.QUOLLIO_LINEAGE_TABLE_LEVEL
+            """.format(
+                db=conn.account_database,
+                schema=conn.account_schema,
             )
         )
⋮ (old lines 49-51 removed; content not rendered in the source view)
+        parsed_results = parse_snowflake_results(results=results)
+        update_table_lineage_inputs = gen_table_lineage_payload(
+            tenant_id=tenant_id,
+            endpoint=conn.account_id,
+            tables=parsed_results,
         )
⋮ (old lines 53-55 removed; content not rendered in the source view)
+
+        req_count = 0
+        for update_table_lineage_input in update_table_lineage_inputs:
+            logger.info(
+                "Generating table lineage. downstream: {db} -> {schema} -> {table}".format(
+                    db=update_table_lineage_input.downstream_database_name,
+                    schema=update_table_lineage_input.downstream_schema_name,
+                    table=update_table_lineage_input.downstream_table_name,
+                )
+            )
+            status_code = qdc_client.update_lineage_by_id(
+                global_id=update_table_lineage_input.downstream_global_id,
+                payload=update_table_lineage_input.upstreams.as_dict(),
+            )
+            if status_code == 200:
+                req_count += 1
+        logger.info(f"Generating table lineage is finished. {req_count} lineages are ingested.")
     return
 
 
```
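All three Snowflake profilers are rewritten around `with snowflake.SnowflakeQueryExecutor(conn) as sf_executor:`, so the connection is released even when a query or an API call raises; the matching change in `quollio_core/repository/snowflake.py` (+6 -1) is not shown here. The general shape of such a context manager (a sketch of the pattern, not quollio-core's actual code) is:

```python
# Generic sketch of the context-manager pattern adopted here; the real class
# lives in quollio_core/repository/snowflake.py, which this diff does not show.
from types import TracebackType
from typing import Optional, Type


class QueryExecutor:
    def __init__(self, conn) -> None:
        self.conn = conn  # an already-opened DB-API style connection

    def __enter__(self) -> "QueryExecutor":
        return self  # hand the executor to the with-block

    def __exit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc: Optional[BaseException],
        tb: Optional[TracebackType],
    ) -> None:
        # Runs on normal exit and on exceptions alike, so the
        # connection is always closed.
        self.conn.close()
```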
```diff
@@ -61,41 +60,41 @@ def snowflake_column_to_column_lineage(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
⋮ (old lines 64-73 removed; content not rendered in the source view)
-        )
-    )
-    update_column_lineage_inputs = gen_column_lineage_payload(
-        tenant_id=tenant_id,
-        endpoint=conn.account_id,
-        columns=results,
-    )
-
-    req_count = 0
-    for update_column_lineage_input in update_column_lineage_inputs:
-        logger.info(
-            "Generating column lineage. downstream: {db} -> {schema} -> {table} -> {column}".format(
-                db=update_column_lineage_input.downstream_database_name,
-                schema=update_column_lineage_input.downstream_schema_name,
-                table=update_column_lineage_input.downstream_table_name,
-                column=update_column_lineage_input.downstream_column_name,
+    with snowflake.SnowflakeQueryExecutor(conn) as sf_executor:
+        results = sf_executor.get_query_results(
+            query="""
+            SELECT
+                *
+            FROM
+                {db}.{schema}.QUOLLIO_LINEAGE_COLUMN_LEVEL
+            """.format(
+                db=conn.account_database,
+                schema=conn.account_schema,
             )
         )
⋮ (old lines 92-94 removed; content not rendered in the source view)
+        update_column_lineage_inputs = gen_column_lineage_payload(
+            tenant_id=tenant_id,
+            endpoint=conn.account_id,
+            columns=results,
         )
⋮ (old lines 96-98 removed; content not rendered in the source view)
+
+        req_count = 0
+        for update_column_lineage_input in update_column_lineage_inputs:
+            logger.info(
+                "Generating column lineage. downstream: {db} -> {schema} -> {table} -> {column}".format(
+                    db=update_column_lineage_input.downstream_database_name,
+                    schema=update_column_lineage_input.downstream_schema_name,
+                    table=update_column_lineage_input.downstream_table_name,
+                    column=update_column_lineage_input.downstream_column_name,
+                )
+            )
+            status_code = qdc_client.update_lineage_by_id(
+                global_id=update_column_lineage_input.downstream_global_id,
+                payload=update_column_lineage_input.upstreams.as_dict(),
+            )
+            if status_code == 200:
+                req_count += 1
+        logger.info(f"Generating column lineage is finished. {req_count} lineages are ingested.")
     return
 
 
```
```diff
@@ -104,110 +103,67 @@ def snowflake_table_level_sqllineage(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
⋮ (old lines 107-118 removed; content not rendered in the source view)
-            schema=conn.account_schema,
-        )
-    )
-    update_table_lineage_inputs_list = list()
-    sql_lineage = SQLLineage()
-    for result in results:
-        src_tables, dest_table = sql_lineage.get_table_level_lineage_source(
-            sql=result["QUERY_TEXT"],
-            dialect="snowflake",
-            dest_db=result["DATABASE_NAME"],
-            dest_schema=result["SCHEMA_NAME"],
-        )
-        update_table_lineage_inputs = sql_lineage.gen_lineage_input(
-            tenant_id=tenant_id, endpoint=conn.account_id, src_tables=src_tables, dest_table=dest_table
-        )
-        update_table_lineage_inputs_list.append(update_table_lineage_inputs)
-
-    req_count = 0
-    for update_table_lineage_input in update_table_lineage_inputs_list:
-        logger.info(
-            "Generating table lineage. downstream: {db} -> {schema} -> {table}".format(
-                db=update_table_lineage_input.downstream_database_name,
-                schema=update_table_lineage_input.downstream_schema_name,
-                table=update_table_lineage_input.downstream_table_name,
+    with snowflake.SnowflakeQueryExecutor(conn) as sf_executor:
+        results = sf_executor.get_query_results(
+            query="""
+            SELECT
+                database_name
+                , schema_name
+                , query_text
+            FROM
+                {db}.{schema}.QUOLLIO_SQLLINEAGE_SOURCES
+            """.format(
+                db=conn.account_database,
+                schema=conn.account_schema,
             )
         )
⋮ (old lines 145-151 removed; content not rendered in the source view)
+        update_table_lineage_inputs_list = list()
+        sql_lineage = SQLLineage()
+        for result in results:
+            src_tables, dest_table = sql_lineage.get_table_level_lineage_source(
+                sql=result["QUERY_TEXT"],
+                dialect="snowflake",
+                dest_db=result["DATABASE_NAME"],
+                dest_schema=result["SCHEMA_NAME"],
+            )
+            update_table_lineage_inputs = sql_lineage.gen_lineage_input(
+                tenant_id=tenant_id, endpoint=conn.account_id, src_tables=src_tables, dest_table=dest_table
+            )
+            update_table_lineage_inputs_list.append(update_table_lineage_inputs)
+
+        req_count = 0
+        for update_table_lineage_input in update_table_lineage_inputs_list:
+            logger.info(
+                "Generating table lineage. downstream: {db} -> {schema} -> {table}".format(
+                    db=update_table_lineage_input.downstream_database_name,
+                    schema=update_table_lineage_input.downstream_schema_name,
+                    table=update_table_lineage_input.downstream_table_name,
+                )
+            )
+            status_code = qdc_client.update_lineage_by_id(
+                global_id=update_table_lineage_input.downstream_global_id,
+                payload=update_table_lineage_input.upstreams.as_dict(),
+            )
+            if status_code == 200:
+                req_count += 1
+        logger.info(f"Generating table lineage is finished. {req_count} lineages are ingested.")
     return
 
 
-def _get_target_tables_query(db: str, schema: str) -> str:
-    query = """
-        SELECT
-            DISTINCT
-            TABLE_CATALOG
-            , TABLE_SCHEMA
-            , TABLE_NAME
-        FROM
-            {db}.{schema}.QUOLLIO_STATS_PROFILING_COLUMNS
-        """.format(
-        db=db, schema=schema
-    )
-    return query
-
-
-def _get_stats_tables_query(db: str, schema: str) -> str:
-    query = """
-        SELECT
-            DISTINCT
-            TABLE_CATALOG
-            , TABLE_SCHEMA
-            , TABLE_NAME
-        FROM
-            {db}.INFORMATION_SCHEMA.TABLES
-        WHERE
-            startswith(TABLE_NAME, 'QUOLLIO_STATS_COLUMNS_')
-            AND TABLE_SCHEMA = UPPER('{schema}')
-        """.format(
-        db=db, schema=schema
-    )
-    return query
-
-
 def snowflake_table_stats(
     conn: snowflake.SnowflakeConnectionConfig,
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
⋮ (old lines 193-198 removed; content not rendered in the source view)
-        )
-        target_assets = sf_executor.get_query_results(query=target_query)
-
-        stats_query = _get_stats_tables_query(
-            db=conn.account_database,
-            schema=conn.account_schema,
-        )
-        stats_columns = sf_executor.get_query_results(query=stats_query)
+    with snowflake.SnowflakeQueryExecutor(conn) as sf_executor:
+        stats_query = _gen_get_stats_views_query(
+            db=conn.account_database,
+            schema=conn.account_schema,
+        )
+        stats_views = sf_executor.get_query_results(query=stats_query)
 
⋮ (old lines 208-209 removed; content not rendered in the source view)
-        for stats_column in stats_columns:
+        req_count = 0
+        for stats_view in stats_views:
             stats_query = """
                 SELECT
                     db_name
```
```diff
@@ -224,18 +180,12 @@ def snowflake_table_stats(
                     , stddev_value
                 FROM
                     {db}.{schema}.{table}
-                WHERE
-                    db_name = '{target_db}'
-                    and schema_name = '{target_schema}'
-                    and table_name = '{target_table}'
                 """.format(
-                db=
-                schema=
-                table=
-                target_db=target_asset["TABLE_CATALOG"],
-                target_schema=target_asset["TABLE_SCHEMA"],
-                target_table=target_asset["TABLE_NAME"],
+                db=stats_view["TABLE_CATALOG"],
+                schema=stats_view["TABLE_SCHEMA"],
+                table=stats_view["TABLE_NAME"],
             )
+            logger.debug(f"The following sql will be fetched to retrieve stats values. {stats_query}")
             stats_result = sf_executor.get_query_results(query=stats_query)
             payloads = gen_table_stats_payload(tenant_id=tenant_id, endpoint=conn.account_id, stats=stats_result)
             for payload in payloads:
```
```diff
@@ -253,4 +203,23 @@ def snowflake_table_stats(
                 )
                 if status_code == 200:
                     req_count += 1
-
+    logger.info(f"Generating table stats is finished. {req_count} stats are ingested.")
+    return
+
+
+def _gen_get_stats_views_query(db: str, schema: str) -> str:
+    query = """
+        SELECT
+            DISTINCT
+            TABLE_CATALOG
+            , TABLE_SCHEMA
+            , TABLE_NAME
+        FROM
+            {db}.INFORMATION_SCHEMA.TABLES
+        WHERE
+            startswith(TABLE_NAME, 'QUOLLIO_STATS_COLUMNS_')
+            AND TABLE_SCHEMA = UPPER('{schema}')
+        """.format(
+        db=db, schema=schema
+    )
+    return query
```
quollio_core/profilers/sqllineage.py
CHANGED
```diff
@@ -54,7 +54,6 @@ class SQLLineage:
         dest_db: str = None,
         dest_schema: str = None,
     ) -> Tuple[Set[Table], Table]:
-        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
         try:
             statement: sqlglot.Expression = sqlglot.parse_one(sql=sql, error_level=sqlglot.ErrorLevel.RAISE)
         except ParseError as e:
```
quollio_core/redshift.py
CHANGED
```diff
@@ -4,6 +4,7 @@ import os
 
 from quollio_core.helper.core import setup_dbt_profile
 from quollio_core.helper.env_default import env_default
+from quollio_core.helper.log import set_log_level
 from quollio_core.profilers.redshift import (
     redshift_table_level_lineage,
     redshift_table_level_sqllineage,
```
```diff
@@ -16,12 +17,10 @@ logger = logging.getLogger(__name__)
 
 def build_view(
     conn: redshift.RedshiftConnectionConfig,
⋮ (old line 19 removed; content not rendered in the source view)
+    aggregate_all: bool = False,
     target_tables: str = "",
     log_level: str = "info",
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
-
     logger.info("Build profiler views using dbt")
     # set parameters
     dbt_client = dbt.DBTClient()
```
```diff
@@ -29,9 +28,9 @@ def build_view(
     project_path = f"{current_dir}/dbt_projects/redshift"
     template_path = f"{current_dir}/dbt_projects/redshift/profiles"
     template_name = "profiles_template.yml"
-    options = '{{"query_user": {query_user}, "
+    options = '{{"query_user": {query_user}, "aggregate_all": {aggregate_all}, "target_database": {database}}}'.format(
         query_user=conn.query_user,
⋮ (old line 34 removed; content not rendered in the source view)
+        aggregate_all=aggregate_all,
         database=conn.database,
     )
 
```
```diff
@@ -74,7 +73,6 @@ def load_lineage(
    qdc_client: qdc.QDCExternalAPIClient,
    tenant_id: str,
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
    logger.info("Generate redshift table to table lineage.")
    redshift_table_level_lineage(
        conn=conn,
```
```diff
@@ -101,7 +99,6 @@ def load_stats(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
 
     logger.info("Generate redshift stats.")
     redshift_table_stats(
```
```diff
@@ -119,7 +116,6 @@ def load_sqllineage(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
 
     logger.info("Generate Redshift sqllineage.")
     redshift_table_level_sqllineage(
```
```diff
@@ -210,12 +206,12 @@ if __name__ == "__main__":
         help="Target schema name where the views are built by dbt",
     )
     parser.add_argument(
-        "--
+        "--aggregate_all",
         type=bool,
-        action=env_default("
-        default=
+        action=env_default("REDSHIFT_AGGREGATE_ALL", store_true=True),
+        default=False,
         required=False,
-        help="
+        help="Aggregate all stats values. False by default.",
     )
     parser.add_argument(
         "--target_tables",
```
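The new `--aggregate_all` flag uses `env_default` in a `store_true` mode, which is presumably where most of the +24 lines in `quollio_core/helper/env_default.py` go: the flag can be switched on either on the command line or through the `REDSHIFT_AGGREGATE_ALL` environment variable. The helper's body is not in this diff; one common way to write such an argparse action factory (a sketch under those assumptions):

```python
# Hypothetical sketch of an env-var-aware argparse action factory; only the
# call env_default("REDSHIFT_AGGREGATE_ALL", store_true=True) appears in the diff.
import argparse
import os


def env_default(envvar: str, store_true: bool = False):
    class EnvDefault(argparse.Action):
        def __init__(self, option_strings, dest, default=None, **kwargs):
            if store_true:
                kwargs["nargs"] = 0  # a flag: consumes no command-line value
            if envvar in os.environ:
                # The environment variable overrides the declared default.
                default = True if store_true else os.environ[envvar]
            super().__init__(option_strings, dest, default=default, **kwargs)

        def __call__(self, parser, namespace, values, option_string=None):
            # Passing the flag on the command line sets the destination.
            setattr(namespace, self.dest, True if store_true else values)

    return EnvDefault
```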
```diff
@@ -266,6 +262,8 @@ if __name__ == "__main__":
         help="The client secrete that is created on Quollio console to let clients access Quollio External API",
     )
     args = parser.parse_args()
+    set_log_level(level=args.log_level)
+
     conn = redshift.RedshiftConnectionConfig(
         host=args.host,
         build_user=args.build_user,
```
```diff
@@ -283,7 +281,7 @@ if __name__ == "__main__":
     if "build_view" in args.commands:
         build_view(
             conn=conn,
⋮ (old line 286 removed; content not rendered in the source view)
+            aggregate_all=args.aggregate_all,
             target_tables=args.target_tables,
             log_level=args.log_level,
         )
```
quollio_core/repository/databricks.py
CHANGED
```diff
@@ -5,7 +5,7 @@ from typing import Dict, List, Optional
 from databricks.sdk.core import Config, HeaderFactory, oauth_service_principal
 from databricks.sql.client import Connection, connect
 
⋮ (old line 8 removed; content not rendered in the source view)
+logger = logging.getLogger(__name__)
 
 
 @dataclass
```
```diff
@@ -47,8 +47,8 @@ class DatabricksQueryExecutor:
             cur.execute(query)
             result: List[Dict[str, str]] = cur.fetchall()
         except Exception as e:
⋮ (old lines 50-51 removed; content not rendered in the source view)
+            logger.error(query, exc_info=True)
+            logger.error("databricks get_query_results failed. %s", e)
             raise
 
         for row in result:
```
quollio_core/repository/dbt.py
CHANGED
```diff
@@ -11,7 +11,6 @@ class DBTClient:
         self.dbt = dbtRunner()
 
     def invoke(self, cmd: str, project_dir: str, profile_dir: str, options: List[str] = None) -> dbtRunnerResult:
-        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
         req = [cmd, "--project-dir", project_dir, "--profiles-dir", profile_dir]
         if options is not None:
             req.extend(options)
```
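`DBTClient.invoke` wraps dbt's programmatic runner: it assembles a CLI-style argument vector and hands it to `dbtRunner().invoke`. A minimal usage sketch of that API (dbt-core 1.5+; the paths are placeholders):

```python
# Minimal usage sketch of the dbt programmatic API wrapped by DBTClient.invoke;
# the project and profile paths are placeholders.
from dbt.cli.main import dbtRunner, dbtRunnerResult

dbt = dbtRunner()
req = [
    "run",
    "--project-dir", "/path/to/dbt_projects/redshift",
    "--profiles-dir", "/path/to/profiles",
]
res: dbtRunnerResult = dbt.invoke(req)
if not res.success:
    raise RuntimeError("dbt run failed")
```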
quollio_core/repository/qdc.py
CHANGED
```diff
@@ -25,7 +25,6 @@ class QDCExternalAPIClient:
         Tried to find a package for oauth0 client credentials flow,
         but any of them contains bugs or lacks of features to handle the token refresh when it's expired
         """
-        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
         url = f"{self.base_url}/oauth2/token"
         creds = f"{self.client_id}:{self.client_secret}"
         encoded_creds = base64.b64encode(creds.encode()).decode()
```
```diff
@@ -65,7 +64,6 @@ class QDCExternalAPIClient:
         return session
 
     def update_stats_by_id(self, global_id: str, payload: Dict[str, List[str]]) -> int:
-        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
         self._refresh_token_if_expired()
         headers = {"content-type": "application/json", "authorization": f"Bearer {self.auth_token}"}
         endpoint = f"{self.base_url}/v2/assets/{global_id}/stats"
```
```diff
@@ -85,7 +83,6 @@ class QDCExternalAPIClient:
         return res.status_code
 
     def update_lineage_by_id(self, global_id: str, payload: Dict[str, List[str]]) -> int:
-        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
         self._refresh_token_if_expired()
         headers = {"content-type": "application/json", "authorization": f"Bearer {self.auth_token}"}
         endpoint = f"{self.base_url}/v2/lineage/{global_id}"
```
quollio_core/repository/redshift.py
CHANGED
```diff
@@ -67,7 +67,6 @@ class RedshiftQueryExecutor:
         return conn
 
     def get_query_results(self, query: str) -> Tuple[List[str]]:
-        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
         with self.conn.cursor() as cur:
             try:
                 cur.execute(query)
```