quollio-core 0.4.7__py3-none-any.whl → 0.4.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- quollio_core/__init__.py +1 -1
- quollio_core/bigquery.py +10 -1
- quollio_core/bricks.py +43 -12
- quollio_core/dbt_projects/databricks/models/quollio_lineage_table_level.yml +1 -1
- quollio_core/dbt_projects/redshift/dbt_project.yml +1 -1
- quollio_core/dbt_projects/redshift/macros/materialization/divided_view.sql +68 -29
- quollio_core/dbt_projects/redshift/models/quollio_stats_columns.sql +1 -2
- quollio_core/dbt_projects/snowflake/macros/materialization/divided_view.sql +50 -28
- quollio_core/dbt_projects/snowflake/models/quollio_stats_columns.sql +1 -2
- quollio_core/helper/core.py +4 -0
- quollio_core/helper/env_default.py +24 -1
- quollio_core/helper/log.py +17 -0
- quollio_core/profilers/databricks.py +11 -6
- quollio_core/profilers/redshift.py +41 -74
- quollio_core/profilers/snowflake.py +138 -169
- quollio_core/profilers/sqllineage.py +0 -1
- quollio_core/redshift.py +11 -13
- quollio_core/repository/databricks.py +3 -3
- quollio_core/repository/dbt.py +0 -1
- quollio_core/repository/qdc.py +0 -3
- quollio_core/repository/redshift.py +0 -1
- quollio_core/repository/snowflake.py +6 -1
- quollio_core/snowflake.py +25 -11
- {quollio_core-0.4.7.dist-info → quollio_core-0.4.10.dist-info}/METADATA +2 -2
- {quollio_core-0.4.7.dist-info → quollio_core-0.4.10.dist-info}/RECORD +27 -26
- {quollio_core-0.4.7.dist-info → quollio_core-0.4.10.dist-info}/LICENSE +0 -0
- {quollio_core-0.4.7.dist-info → quollio_core-0.4.10.dist-info}/WHEEL +0 -0
quollio_core/__init__.py
CHANGED
quollio_core/bigquery.py
CHANGED
@@ -3,6 +3,7 @@ import json
|
|
3
3
|
import logging
|
4
4
|
|
5
5
|
from quollio_core.helper.env_default import env_default
|
6
|
+
from quollio_core.helper.log import set_log_level
|
6
7
|
from quollio_core.profilers.bigquery import bigquery_table_lineage
|
7
8
|
from quollio_core.repository import qdc
|
8
9
|
from quollio_core.repository.bigquery import get_credentials, get_org_id
|
@@ -88,14 +89,22 @@ if __name__ == "__main__":
|
|
88
89
|
help="GCP regions where the data is located. Multiple regions can be provided separated by space.",
|
89
90
|
nargs="+",
|
90
91
|
)
|
92
|
+
parser.add_argument(
|
93
|
+
"--log_level",
|
94
|
+
type=str,
|
95
|
+
choices=["debug", "info", "warn", "error", "none"],
|
96
|
+
action=env_default("LOG_LEVEL"),
|
97
|
+
required=False,
|
98
|
+
help="The log level for dbt commands. Default value is info",
|
99
|
+
)
|
91
100
|
|
92
101
|
args = parser.parse_args()
|
102
|
+
set_log_level(level=args.log_level)
|
93
103
|
|
94
104
|
if len(args.commands) == 0:
|
95
105
|
raise ValueError("No command is provided")
|
96
106
|
|
97
107
|
if "load_lineage" in args.commands:
|
98
|
-
|
99
108
|
qdc_client = qdc.QDCExternalAPIClient(
|
100
109
|
base_url=args.api_url, client_id=args.client_id, client_secret=args.client_secret
|
101
110
|
)
|
quollio_core/bricks.py
CHANGED
@@ -2,8 +2,9 @@ import argparse
|
|
2
2
|
import logging
|
3
3
|
import os
|
4
4
|
|
5
|
-
from quollio_core.helper.core import setup_dbt_profile
|
5
|
+
from quollio_core.helper.core import setup_dbt_profile, trim_prefix
|
6
6
|
from quollio_core.helper.env_default import env_default
|
7
|
+
from quollio_core.helper.log import set_log_level
|
7
8
|
from quollio_core.profilers.databricks import (
|
8
9
|
databricks_column_level_lineage,
|
9
10
|
databricks_column_stats,
|
@@ -20,7 +21,6 @@ def build_view(
|
|
20
21
|
target_tables: str = "",
|
21
22
|
log_level: str = "info",
|
22
23
|
) -> None:
|
23
|
-
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
|
24
24
|
|
25
25
|
logger.info("Build profiler views using dbt")
|
26
26
|
# set parameters
|
@@ -59,20 +59,34 @@ def build_view(
|
|
59
59
|
|
60
60
|
def load_lineage(
|
61
61
|
conn: db.DatabricksConnectionConfig,
|
62
|
+
endpoint: str,
|
62
63
|
qdc_client: qdc.QDCExternalAPIClient,
|
63
64
|
tenant_id: str,
|
65
|
+
enable_column_lineage: bool = False,
|
64
66
|
) -> None:
|
65
|
-
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
|
66
67
|
|
67
68
|
logger.info("Generate Databricks table to table lineage.")
|
68
69
|
databricks_table_level_lineage(
|
69
|
-
conn=conn,
|
70
|
+
conn=conn,
|
71
|
+
endpoint=endpoint,
|
72
|
+
qdc_client=qdc_client,
|
73
|
+
tenant_id=tenant_id,
|
74
|
+
dbt_table_name="quollio_lineage_table_level",
|
70
75
|
)
|
71
76
|
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
77
|
+
if enable_column_lineage:
|
78
|
+
logger.info(
|
79
|
+
f"enable_column_lineage is set to {enable_column_lineage}.Generate Databricks column to column lineage."
|
80
|
+
)
|
81
|
+
databricks_column_level_lineage(
|
82
|
+
conn=conn,
|
83
|
+
endpoint=endpoint,
|
84
|
+
qdc_client=qdc_client,
|
85
|
+
tenant_id=tenant_id,
|
86
|
+
dbt_table_name="quollio_lineage_column_level",
|
87
|
+
)
|
88
|
+
else:
|
89
|
+
logger.info("Skip column lineage ingestion. Set enable_column_lineage to True if you ingest column lineage.")
|
76
90
|
|
77
91
|
logger.info("Lineage data is successfully loaded.")
|
78
92
|
return
|
@@ -80,14 +94,15 @@ def load_lineage(
|
|
80
94
|
|
81
95
|
def load_column_stats(
|
82
96
|
conn: db.DatabricksConnectionConfig,
|
97
|
+
endpoint: str,
|
83
98
|
qdc_client: qdc.QDCExternalAPIClient,
|
84
99
|
tenant_id: str,
|
85
100
|
) -> None:
|
86
|
-
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
|
87
101
|
|
88
102
|
logger.info("Generate Databricks column stats.")
|
89
103
|
databricks_column_stats(
|
90
104
|
conn=conn,
|
105
|
+
endpoint=endpoint,
|
91
106
|
qdc_client=qdc_client,
|
92
107
|
tenant_id=tenant_id,
|
93
108
|
)
|
@@ -207,7 +222,6 @@ if __name__ == "__main__":
|
|
207
222
|
Please specify table name with blank delimiter like tableA tableB \
|
208
223
|
if you want to create two or more tables",
|
209
224
|
)
|
210
|
-
|
211
225
|
parser.add_argument(
|
212
226
|
"--monitoring_table_suffix",
|
213
227
|
type=str,
|
@@ -217,11 +231,21 @@ if __name__ == "__main__":
|
|
217
231
|
This is used to identify the monitoring tables created by the databricks monitoring tool. \
|
218
232
|
Default value is _profile_metrics",
|
219
233
|
)
|
234
|
+
parser.add_argument(
|
235
|
+
"--enable_column_lineage",
|
236
|
+
type=bool,
|
237
|
+
action=env_default("ENABLE_COLUMN_LINEAGE", store_true=True),
|
238
|
+
default=False,
|
239
|
+
required=False,
|
240
|
+
help="Whether to ingest column lineage into QDIC or not. Default value is False",
|
241
|
+
)
|
220
242
|
|
221
243
|
args = parser.parse_args()
|
244
|
+
set_log_level(level=args.log_level)
|
222
245
|
|
223
246
|
conn = db.DatabricksConnectionConfig(
|
224
|
-
host
|
247
|
+
# MEMO: Metadata agent allows the string 'https://' as a host name but is not allowed by intelligence agent.
|
248
|
+
host=trim_prefix(args.host, "https://"),
|
225
249
|
http_path=args.http_path,
|
226
250
|
client_id=args.databricks_client_id,
|
227
251
|
client_secret=args.databricks_client_secret,
|
@@ -243,7 +267,13 @@ if __name__ == "__main__":
|
|
243
267
|
qdc_client = qdc.QDCExternalAPIClient(
|
244
268
|
base_url=args.api_url, client_id=args.client_id, client_secret=args.client_secret
|
245
269
|
)
|
246
|
-
load_lineage(
|
270
|
+
load_lineage(
|
271
|
+
conn=conn,
|
272
|
+
endpoint=args.host,
|
273
|
+
qdc_client=qdc_client,
|
274
|
+
tenant_id=args.tenant_id,
|
275
|
+
enable_column_lineage=args.enable_column_lineage,
|
276
|
+
)
|
247
277
|
|
248
278
|
if "load_stats" in args.commands:
|
249
279
|
qdc_client = qdc.QDCExternalAPIClient(
|
@@ -251,6 +281,7 @@ if __name__ == "__main__":
|
|
251
281
|
)
|
252
282
|
databricks_column_stats(
|
253
283
|
conn=conn,
|
284
|
+
endpoint=args.host,
|
254
285
|
qdc_client=qdc_client,
|
255
286
|
tenant_id=args.tenant_id,
|
256
287
|
monitoring_table_suffix=args.monitoring_table_suffix,
|
@@ -1,28 +1,67 @@
|
|
1
1
|
{%- materialization divided_view, default %}
|
2
2
|
{%- set identifier = model['alias'] %}
|
3
3
|
{%- set target_relations = [] %}
|
4
|
-
{%- set chunk = config.get('chunk') %}
|
5
4
|
{%- set grant_config = config.get('grants') %}
|
6
5
|
|
7
6
|
{{ run_hooks(pre_hooks, inside_transaction=False) }}
|
8
7
|
-- `BEGIN` happens here:
|
9
8
|
{{ run_hooks(pre_hooks, inside_transaction=True) }}
|
10
9
|
|
11
|
-
-- fetch
|
12
|
-
{%- set
|
13
|
-
SELECT
|
10
|
+
-- fetch target_tables
|
11
|
+
{%- set query_stats_target_tables -%}
|
12
|
+
SELECT
|
13
|
+
distinct
|
14
|
+
database_name
|
15
|
+
, schema_name
|
16
|
+
, table_name
|
17
|
+
FROM
|
18
|
+
{{ ref('quollio_stats_profiling_columns') }}
|
19
|
+
WHERE
|
20
|
+
table_name not like 'quollio_%%'
|
14
21
|
{%- endset -%}
|
15
|
-
{%- set results = run_query(
|
22
|
+
{%- set results = run_query(query_stats_target_tables) -%}
|
16
23
|
{%- if execute -%}
|
17
|
-
{%- set
|
24
|
+
{%- set stats_target_tables = results.rows -%}
|
18
25
|
{%- else -%}
|
19
|
-
{%- set
|
26
|
+
{%- set stats_target_tables = [] -%}
|
27
|
+
{%- endif -%}
|
28
|
+
|
29
|
+
-- skip creating views if the target profiling columns don't exist.
|
30
|
+
{%- if stats_target_tables | length == 0 -%}
|
31
|
+
{% call statement("main") %}
|
32
|
+
{{ log("No records found. Just execute select stmt for skipping call statement.", info=True) }}
|
33
|
+
select null
|
34
|
+
{% endcall %}
|
35
|
+
{%- set full_refresh_mode = (should_full_refresh()) -%}
|
36
|
+
{%- set should_revoke = should_revoke(target_relation, full_refresh_mode) %}
|
20
37
|
{%- endif -%}
|
21
38
|
|
22
39
|
-- build sql
|
23
|
-
{%- for
|
24
|
-
|
25
|
-
|
40
|
+
{%- for stats_target_table in stats_target_tables -%}
|
41
|
+
-- get columns for statistics.
|
42
|
+
-- LISTAGG function can't be used for sys table, then it's necessary to get column for each table.
|
43
|
+
-- See https://docs.aws.amazon.com/redshift/latest/dg/c_join_PG.html.
|
44
|
+
{%- set stats_target_columns %}
|
45
|
+
SELECT
|
46
|
+
database_name
|
47
|
+
, schema_name
|
48
|
+
, table_name
|
49
|
+
, column_name
|
50
|
+
, is_bool
|
51
|
+
, is_calculable
|
52
|
+
FROM
|
53
|
+
{{ ref('quollio_stats_profiling_columns') }}
|
54
|
+
WHERE
|
55
|
+
database_name = '{{stats_target_table[0]}}'
|
56
|
+
AND schema_name = '{{stats_target_table[1]}}'
|
57
|
+
AND table_name = '{{stats_target_table[2]}}'
|
58
|
+
{%- endset -%}
|
59
|
+
|
60
|
+
{%- set results = run_query(stats_target_columns) -%}
|
61
|
+
{%- set stats_target_columns = results.rows -%}
|
62
|
+
|
63
|
+
{%- set sql_for_column_stats %}
|
64
|
+
{%- for stats_target_column in stats_target_columns -%}
|
26
65
|
{%- if not loop.first -%}UNION{% endif %}
|
27
66
|
SELECT
|
28
67
|
main.db_name
|
@@ -41,33 +80,33 @@ SELECT * FROM {{ ref('quollio_stats_profiling_columns') }} WHERE table_name no
|
|
41
80
|
(
|
42
81
|
SELECT
|
43
82
|
DISTINCT
|
44
|
-
'{{
|
45
|
-
, '{{
|
46
|
-
, '{{
|
47
|
-
, '{{
|
48
|
-
, {% if var("
|
49
|
-
, {% if var("
|
83
|
+
'{{stats_target_column[0]}}'::varchar as db_name
|
84
|
+
, '{{stats_target_column[1]}}'::varchar as schema_name
|
85
|
+
, '{{stats_target_column[2]}}'::varchar as table_name
|
86
|
+
, '{{stats_target_column[3]}}'::varchar as column_name
|
87
|
+
, {% if var("aggregate_all") == True and stats_target_column[5] == True %}cast(max("{{stats_target_column[3]}}") as varchar){% else %}null::varchar{% endif %} AS max_value
|
88
|
+
, {% if var("aggregate_all") == True and stats_target_column[5] == True %}cast(min("{{stats_target_column[3]}}") as varchar){% else %}null::varchar{% endif %} AS min_value
|
50
89
|
-- requires full table scan
|
51
|
-
, {% if var("
|
52
|
-
, APPROXIMATE COUNT(DISTINCT "{{
|
90
|
+
, {% if var("aggregate_all") == True %}cast(SUM(NVL2("{{stats_target_column[3]}}", 0, 1)) as integer){% else %}null::integer{% endif %} AS null_count
|
91
|
+
, APPROXIMATE COUNT(DISTINCT "{{stats_target_column[3]}}") AS cardinality
|
53
92
|
-- requires full table scan
|
54
|
-
, {% if var("
|
55
|
-
, {% if var("
|
93
|
+
, {% if var("aggregate_all") == True and stats_target_column[5] == True %}cast(avg("{{stats_target_column[3]}}")as varchar){% else %}null::varchar{% endif %} AS avg_value
|
94
|
+
, {% if var("aggregate_all") == True and stats_target_column[5] == True %}cast(median("{{stats_target_column[3]}}") as varchar){% else %}null::varchar{% endif %} AS median_value
|
56
95
|
-- requires full table scan
|
57
|
-
, {% if
|
58
|
-
FROM {{
|
96
|
+
, {% if stats_target_column[5] == True %}cast(STDDEV_SAMP("{{stats_target_column[3]}}") as integer){% else %}null::integer{% endif %} AS stddev_value
|
97
|
+
FROM {{ stats_target_column[0] }}.{{ stats_target_column[1] }}.{{ stats_target_column[2] }}
|
59
98
|
) main, (
|
60
|
-
{%- if var("
|
99
|
+
{%- if var("aggregate_all") == True and stats_target_column[4] == false %}
|
61
100
|
SELECT
|
62
|
-
cast("{{
|
101
|
+
cast("{{stats_target_column[3]}}" as varchar) mode_value
|
63
102
|
FROM (
|
64
103
|
SELECT
|
65
104
|
DISTINCT
|
66
|
-
"{{
|
105
|
+
"{{stats_target_column[3]}}"
|
67
106
|
, ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC) AS row_num
|
68
|
-
FROM {{
|
107
|
+
FROM {{ stats_target_column[0] }}.{{ stats_target_column[1] }}.{{ stats_target_column[2] }}
|
69
108
|
GROUP BY
|
70
|
-
"{{
|
109
|
+
"{{stats_target_column[3]}}"
|
71
110
|
)
|
72
111
|
WHERE
|
73
112
|
row_num = 1
|
@@ -77,11 +116,11 @@ SELECT * FROM {{ ref('quollio_stats_profiling_columns') }} WHERE table_name no
|
|
77
116
|
{% endfor -%}
|
78
117
|
{%- endset %}
|
79
118
|
-- create a view with a index as suffix
|
80
|
-
{%- set target_identifier = "%s_%
|
119
|
+
{%- set target_identifier = "%s_%s_%s_%s"|format(model['name'], stats_target_table[0], stats_target_table[1], stats_target_table[2]) %}
|
81
120
|
{%- set target_relation = api.Relation.create(identifier=target_identifier, schema=schema, database=database, type='view') %}
|
82
121
|
-- {{ drop_relation_if_exists(target_relation) }}
|
83
122
|
{% call statement("main") %}
|
84
|
-
{{ get_replace_view_sql(target_relation,
|
123
|
+
{{ get_replace_view_sql(target_relation, sql_for_column_stats) }}
|
85
124
|
{% endcall %}
|
86
125
|
{%- set full_refresh_mode = (should_full_refresh()) -%}
|
87
126
|
{%- set should_revoke = should_revoke(target_relation, full_refresh_mode) %}
|
@@ -1,52 +1,74 @@
|
|
1
1
|
{%- materialization divided_view, default %}
|
2
2
|
{%- set identifier = model['alias'] %}
|
3
3
|
{%- set target_relations = [] %}
|
4
|
-
{%- set chunk = config.get('chunk') %}
|
5
4
|
{%- set grant_config = config.get('grants') %}
|
6
5
|
|
7
6
|
{{ run_hooks(pre_hooks, inside_transaction=False) }}
|
8
7
|
-- `BEGIN` happens here:
|
9
8
|
{{ run_hooks(pre_hooks, inside_transaction=True) }}
|
10
9
|
|
11
|
-
-- fetch
|
12
|
-
{%- set
|
13
|
-
SELECT
|
10
|
+
-- fetch target_tables
|
11
|
+
{%- set query_stats_target_tables -%}
|
12
|
+
SELECT
|
13
|
+
TABLE_CATALOG
|
14
|
+
, TABLE_SCHEMA
|
15
|
+
, TABLE_NAME
|
16
|
+
, OBJECT_AGG(COLUMN_NAME, IS_CALCULABLE) AS COLUMNS_OBJ
|
17
|
+
FROM
|
18
|
+
{{ ref('quollio_stats_profiling_columns') }}
|
19
|
+
WHERE NOT startswith(table_name, 'QUOLLIO_')
|
20
|
+
GROUP BY
|
21
|
+
TABLE_CATALOG
|
22
|
+
, TABLE_SCHEMA
|
23
|
+
, TABLE_NAME
|
14
24
|
{%- endset -%}
|
15
|
-
{%- set results = run_query(
|
25
|
+
{%- set results = run_query(query_stats_target_tables) -%}
|
16
26
|
{%- if execute -%}
|
17
|
-
{%- set
|
27
|
+
{%- set stats_target_tables = results.rows -%}
|
18
28
|
{%- else -%}
|
19
|
-
{%- set
|
29
|
+
{%- set stats_target_tables = [] -%}
|
20
30
|
{%- endif -%}
|
21
31
|
|
22
|
-
--
|
23
|
-
{%-
|
24
|
-
{
|
25
|
-
|
26
|
-
|
32
|
+
-- skip creating views if the target profiling columns don't exist.
|
33
|
+
{%- if stats_target_tables | length == 0 -%}
|
34
|
+
{% call statement("main") %}
|
35
|
+
{{ log("No records found. Just execute select stmt for skipping call statement.", info=True) }}
|
36
|
+
select null
|
37
|
+
{% endcall %}
|
38
|
+
{%- set full_refresh_mode = (should_full_refresh()) -%}
|
39
|
+
{%- set should_revoke = should_revoke(target_relation, full_refresh_mode) %}
|
40
|
+
{%- endif -%}
|
27
41
|
|
42
|
+
-- create view for each table
|
43
|
+
{%- for stats_target_table in stats_target_tables -%}
|
44
|
+
-- build sql for column value aggregation.
|
45
|
+
{%- set sql_for_column_stats %}
|
46
|
+
{% set columns_json = fromjson(stats_target_table[3]) %}
|
47
|
+
{%- for col_name, is_calclable in columns_json.items() -%}
|
48
|
+
{%- if not loop.first %}UNION{% endif %}
|
28
49
|
SELECT
|
29
50
|
DISTINCT
|
30
|
-
'{{
|
31
|
-
, '{{
|
32
|
-
, '{{
|
33
|
-
, '{{
|
34
|
-
, {% if
|
35
|
-
, {% if
|
36
|
-
, COUNT_IF("{{
|
37
|
-
, APPROX_COUNT_DISTINCT("{{
|
38
|
-
, {% if
|
39
|
-
, {% if
|
40
|
-
, {% if
|
41
|
-
, {% if
|
42
|
-
FROM "{{
|
51
|
+
'{{stats_target_table[0]}}' as db_name
|
52
|
+
, '{{stats_target_table[1]}}' as schema_name
|
53
|
+
, '{{stats_target_table[2]}}' as table_name
|
54
|
+
, '{{col_name}}' as column_name
|
55
|
+
, {% if is_calclable == True %}CAST(MAX("{{col_name}}") AS STRING){% else %}NULL{% endif %} AS max_value
|
56
|
+
, {% if is_calclable == True %}CAST(MIN("{{col_name}}") AS STRING){% else %}NULL{% endif %} AS min_value
|
57
|
+
, COUNT_IF("{{col_name}}" IS NULL) AS null_count
|
58
|
+
, APPROX_COUNT_DISTINCT("{{col_name}}") AS cardinality
|
59
|
+
, {% if is_calclable == True %}AVG("{{col_name}}"){% else %}NULL{% endif %} AS avg_value
|
60
|
+
, {% if is_calclable == True %}MEDIAN("{{col_name}}"){% else %}NULL{% endif %} AS median_value
|
61
|
+
, {% if is_calclable == True %}APPROX_TOP_K("{{col_name}}")[0][0]{% else %}NULL{% endif %} AS mode_value
|
62
|
+
, {% if is_calclable == True %}STDDEV("{{col_name}}"){% else %}NULL{% endif %} AS stddev_value
|
63
|
+
FROM "{{stats_target_table[0]}}"."{{stats_target_table[1]}}"."{{stats_target_table[2]}}" {{ var("sample_method") }}
|
43
64
|
{% endfor -%}
|
44
65
|
{%- endset %}
|
66
|
+
|
45
67
|
-- create a view with a index as suffix
|
46
|
-
{%- set
|
47
|
-
{%- set target_relation = api.Relation.create(identifier=
|
68
|
+
{%- set stats_view_identifier = "%s_%s_%s_%s"|format(model['name'], stats_target_table[0], stats_target_table[1], stats_target_table[2]) %}
|
69
|
+
{%- set target_relation = api.Relation.create(identifier=stats_view_identifier, schema=schema, database=database, type='view') %}
|
48
70
|
{% call statement("main") %}
|
49
|
-
{{ get_create_view_as_sql(target_relation,
|
71
|
+
{{ get_create_view_as_sql(target_relation, sql_for_column_stats) }}
|
50
72
|
{% endcall %}
|
51
73
|
{%- set full_refresh_mode = (should_full_refresh()) -%}
|
52
74
|
{%- set should_revoke = should_revoke(target_relation, full_refresh_mode) %}
|
quollio_core/helper/core.py
CHANGED
@@ -31,3 +31,7 @@ def setup_dbt_profile(connections_json: Dict[str, str], template_path: str, temp
|
|
31
31
|
with open(profile_path, "w") as profiles:
|
32
32
|
yaml.dump(yaml.safe_load(profiles_body), profiles, default_flow_style=False, allow_unicode=True)
|
33
33
|
return
|
34
|
+
|
35
|
+
|
36
|
+
def trim_prefix(s: str, prefix: str) -> str:
    """Return ``s`` with a single leading occurrence of ``prefix`` removed.

    ``str.lstrip`` is deliberately not used here: it interprets its argument
    as a *set of characters* and strips any of them from the left, so
    ``"host".lstrip("https://")`` would yield ``"ost"``.

    Args:
        s: The string to trim.
        prefix: The exact leading substring to remove.

    Returns:
        ``s`` without ``prefix`` when ``s`` starts with it, otherwise ``s``
        unchanged. An empty ``prefix`` leaves ``s`` unchanged.
    """
    if prefix and s.startswith(prefix):
        return s[len(prefix):]
    return s
|
@@ -6,6 +6,8 @@ Currently requires explicit naming of env vars to check for
|
|
6
6
|
|
7
7
|
import argparse
|
8
8
|
import os
|
9
|
+
from distutils.util import strtobool
|
10
|
+
from typing import Union
|
9
11
|
|
10
12
|
|
11
13
|
# Courtesy of http://stackoverflow.com/a/10551190 with env-var retrieval fixed
|
@@ -28,9 +30,30 @@ class EnvDefault(argparse.Action):
|
|
28
30
|
setattr(namespace, self.dest, values)
|
29
31
|
|
30
32
|
|
33
|
+
class EnvStoreTrue(argparse._StoreTrueAction):
    """An argparse action class that auto-sets missing default values from env vars for store_true."""

    def __init__(self, envvar, required=True, default=None, **kwargs):
        """Create a store_true action whose default may come from ``envvar``.

        Args:
            envvar: Name of the environment variable consulted for the default.
            required: Whether the option must be given on the command line.
                Relaxed to False when a truthy default is available.
            default: Fallback default used when ``envvar`` is not set.
            **kwargs: Remaining ``add_argument`` keywords. Only the ones that
                ``argparse._StoreTrueAction`` accepts are forwarded.
        """
        # argparse forwards every add_argument() keyword to the action class,
        # but _StoreTrueAction rejects keywords such as ``type``. Forward only
        # the accepted ones; ``help`` is included so the option description
        # still shows up in ``--help`` output.
        action_kwargs = {key: value for key, value in kwargs.items() if key in ("option_strings", "dest", "help")}
        if envvar in os.environ:
            # The environment value takes precedence over the passed default.
            default = _convert_value_to_bool(os.environ[envvar])
        if required and default:
            required = False
        super().__init__(default=default, required=required, **action_kwargs)
|
44
|
+
|
45
|
+
|
31
46
|
# functional sugar for the above
|
32
|
-
def env_default(envvar):
|
47
|
+
def env_default(envvar, store_true=False):
    """Return a factory usable as the ``action=`` argument of ``add_argument``.

    The returned callable receives the keyword arguments argparse passes to an
    action class and builds an ``EnvStoreTrue`` when ``store_true`` is truthy,
    otherwise an ``EnvDefault``, both bound to ``envvar``.
    """

    def wrapper(**kwargs):
        # Pick the action class per call, then construct it with argparse's kwargs.
        action_class = EnvStoreTrue if store_true else EnvDefault
        return action_class(envvar, **kwargs)

    return wrapper
|
54
|
+
|
55
|
+
|
56
|
+
def _convert_value_to_bool(v: Union[str, bool]) -> bool:
    """Convert a truth-value string to ``bool``; pass non-strings through.

    The accepted spellings mirror ``distutils.util.strtobool`` and are matched
    case-insensitively. The conversion is implemented inline because
    ``distutils`` was removed from the standard library in Python 3.12.

    Args:
        v: A string such as ``"true"`` / ``"0"``, or an already-converted value.

    Returns:
        True for ``y, yes, t, true, on, 1``; False for
        ``n, no, f, false, off, 0``. A non-string ``v`` is returned unchanged.

    Raises:
        ValueError: If ``v`` is a string that is not one of the spellings above.
    """
    if not isinstance(v, str):
        return v
    normalized = v.lower()
    if normalized in ("y", "yes", "t", "true", "on", "1"):
        return True
    if normalized in ("n", "no", "f", "false", "off", "0"):
        return False
    raise ValueError("invalid truth value %r" % (normalized,))
|
@@ -0,0 +1,17 @@
|
|
1
|
+
import logging
|
2
|
+
|
3
|
+
|
4
|
+
def set_log_level(level: str = "info") -> None:
    """Configure the root logger via ``logging.basicConfig`` for the given level name.

    Args:
        level: One of ``"debug"``, ``"info"``, ``"warn"``, ``"error"`` or
            ``"critical"``. ``None`` (an option that was not supplied) is
            treated as ``"info"``, matching this function's default. Any other
            value configures ``logging.NOTSET``.

    Note:
        ``logging.basicConfig`` does nothing when the root logger already has
        handlers, so only the first effective call in a process takes effect.
    """
    fmt = "%(asctime)s - %(levelname)s - %(name)s - %(message)s"
    levels = {
        "debug": logging.DEBUG,
        "info": logging.INFO,
        "warn": logging.WARNING,
        "error": logging.ERROR,
        "critical": logging.CRITICAL,
    }
    if level is None:
        # An unset CLI option / env var arrives as None; fall back to the
        # default instead of dropping through to NOTSET.
        level = "info"
    # NOTE(review): unknown names (including "none") map to NOTSET, which on
    # the root logger means every record is processed rather than silenced --
    # confirm that this is the intended meaning of "none".
    logging.basicConfig(level=levels.get(level, logging.NOTSET), format=fmt)
|
@@ -14,11 +14,11 @@ logger = logging.getLogger(__name__)
|
|
14
14
|
|
15
15
|
def databricks_table_level_lineage(
|
16
16
|
conn: databricks.DatabricksConnectionConfig,
|
17
|
+
endpoint: str,
|
17
18
|
qdc_client: qdc.QDCExternalAPIClient,
|
18
19
|
tenant_id: str,
|
19
20
|
dbt_table_name: str = "quollio_lineage_table_level",
|
20
21
|
) -> None:
|
21
|
-
logging.basicConfig(level=logging.info, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
|
22
22
|
with databricks.DatabricksQueryExecutor(config=conn) as databricks_executor:
|
23
23
|
results = databricks_executor.get_query_results(
|
24
24
|
query=f"""
|
@@ -31,7 +31,7 @@ def databricks_table_level_lineage(
|
|
31
31
|
tables = parse_databricks_table_lineage(results)
|
32
32
|
update_table_lineage_inputs = gen_table_lineage_payload(
|
33
33
|
tenant_id=tenant_id,
|
34
|
-
endpoint=
|
34
|
+
endpoint=endpoint,
|
35
35
|
tables=tables,
|
36
36
|
)
|
37
37
|
|
@@ -55,11 +55,11 @@ def databricks_table_level_lineage(
|
|
55
55
|
|
56
56
|
def databricks_column_level_lineage(
|
57
57
|
conn: databricks.DatabricksConnectionConfig,
|
58
|
+
endpoint: str,
|
58
59
|
qdc_client: qdc.QDCExternalAPIClient,
|
59
60
|
tenant_id: str,
|
60
61
|
dbt_table_name: str = "quollio_lineage_column_level",
|
61
62
|
) -> None:
|
62
|
-
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
|
63
63
|
with databricks.DatabricksQueryExecutor(config=conn) as databricks_executor:
|
64
64
|
results = databricks_executor.get_query_results(
|
65
65
|
query=f"""
|
@@ -72,7 +72,7 @@ def databricks_column_level_lineage(
|
|
72
72
|
|
73
73
|
update_column_lineage_inputs = gen_column_lineage_payload(
|
74
74
|
tenant_id=tenant_id,
|
75
|
-
endpoint=
|
75
|
+
endpoint=endpoint,
|
76
76
|
columns=results,
|
77
77
|
)
|
78
78
|
|
@@ -110,7 +110,9 @@ def _get_monitoring_tables(
|
|
110
110
|
CONCAT(table_catalog, '.', table_schema, '.', table_name) AS table_fqdn
|
111
111
|
FROM
|
112
112
|
system.information_schema.tables
|
113
|
-
WHERE
|
113
|
+
WHERE
|
114
|
+
table_name LIKE "%{monitoring_table_suffix}"
|
115
|
+
AND table_name NOT LIKE ('quollio_%')
|
114
116
|
"""
|
115
117
|
with databricks.DatabricksQueryExecutor(config=conn) as databricks_executor:
|
116
118
|
tables = databricks_executor.get_query_results(query)
|
@@ -153,6 +155,8 @@ def _get_column_stats(
|
|
153
155
|
MAX(t.window) AS LATEST
|
154
156
|
FROM
|
155
157
|
{monitoring_table} t
|
158
|
+
WHERE
|
159
|
+
t.column_name not in (':table')
|
156
160
|
GROUP BY
|
157
161
|
t.COLUMN_NAME,
|
158
162
|
t.DATA_TYPE,
|
@@ -176,13 +180,14 @@ def _get_column_stats(
|
|
176
180
|
|
177
181
|
def databricks_column_stats(
|
178
182
|
conn: databricks.DatabricksConnectionConfig,
|
183
|
+
endpoint: str,
|
179
184
|
qdc_client: qdc.QDCExternalAPIClient,
|
180
185
|
tenant_id: str,
|
181
186
|
monitoring_table_suffix: str = "_profile_metrics",
|
182
187
|
) -> None:
|
183
188
|
table_stats = _get_column_stats(conn, monitoring_table_suffix)
|
184
189
|
for table in table_stats:
|
185
|
-
stats = gen_table_stats_payload(tenant_id,
|
190
|
+
stats = gen_table_stats_payload(tenant_id=tenant_id, endpoint=endpoint, stats=table)
|
186
191
|
for stat in stats:
|
187
192
|
status_code = qdc_client.update_stats_by_id(
|
188
193
|
global_id=stat.global_id,
|