quollio-core 0.4.7__py3-none-any.whl → 0.4.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
quollio_core/__init__.py CHANGED
@@ -1,4 +1,4 @@
1
1
  """Quollio Core"""
2
2
 
3
- __version__ = "0.4.7"
3
+ __version__ = "0.4.10"
4
4
  __author__ = "Quollio Technologies, Inc"
quollio_core/bigquery.py CHANGED
@@ -3,6 +3,7 @@ import json
3
3
  import logging
4
4
 
5
5
  from quollio_core.helper.env_default import env_default
6
+ from quollio_core.helper.log import set_log_level
6
7
  from quollio_core.profilers.bigquery import bigquery_table_lineage
7
8
  from quollio_core.repository import qdc
8
9
  from quollio_core.repository.bigquery import get_credentials, get_org_id
@@ -88,14 +89,22 @@ if __name__ == "__main__":
88
89
  help="GCP regions where the data is located. Multiple regions can be provided separated by space.",
89
90
  nargs="+",
90
91
  )
92
+ parser.add_argument(
93
+ "--log_level",
94
+ type=str,
95
+ choices=["debug", "info", "warn", "error", "none"],
96
+ action=env_default("LOG_LEVEL"),
97
+ required=False,
98
+ help="The log level for dbt commands. Default value is info",
99
+ )
91
100
 
92
101
  args = parser.parse_args()
102
+ set_log_level(level=args.log_level)
93
103
 
94
104
  if len(args.commands) == 0:
95
105
  raise ValueError("No command is provided")
96
106
 
97
107
  if "load_lineage" in args.commands:
98
-
99
108
  qdc_client = qdc.QDCExternalAPIClient(
100
109
  base_url=args.api_url, client_id=args.client_id, client_secret=args.client_secret
101
110
  )
quollio_core/bricks.py CHANGED
@@ -2,8 +2,9 @@ import argparse
2
2
  import logging
3
3
  import os
4
4
 
5
- from quollio_core.helper.core import setup_dbt_profile
5
+ from quollio_core.helper.core import setup_dbt_profile, trim_prefix
6
6
  from quollio_core.helper.env_default import env_default
7
+ from quollio_core.helper.log import set_log_level
7
8
  from quollio_core.profilers.databricks import (
8
9
  databricks_column_level_lineage,
9
10
  databricks_column_stats,
@@ -20,7 +21,6 @@ def build_view(
20
21
  target_tables: str = "",
21
22
  log_level: str = "info",
22
23
  ) -> None:
23
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
24
24
 
25
25
  logger.info("Build profiler views using dbt")
26
26
  # set parameters
@@ -59,20 +59,34 @@ def build_view(
59
59
 
60
60
def load_lineage(
    conn: db.DatabricksConnectionConfig,
    endpoint: str,
    qdc_client: qdc.QDCExternalAPIClient,
    tenant_id: str,
    enable_column_lineage: bool = False,
) -> None:
    """Ingest Databricks lineage into QDC.

    Table-level lineage is always ingested; column-level lineage is
    ingested only when ``enable_column_lineage`` is True.

    Args:
        conn: Databricks connection configuration used to run the lineage queries.
        endpoint: Databricks host used as the endpoint in lineage payloads.
        qdc_client: Authenticated QDC external API client.
        tenant_id: QDC tenant identifier.
        enable_column_lineage: Whether to also ingest column-to-column lineage.
            Defaults to False.
    """

    logger.info("Generate Databricks table to table lineage.")
    databricks_table_level_lineage(
        conn=conn,
        endpoint=endpoint,
        qdc_client=qdc_client,
        tenant_id=tenant_id,
        dbt_table_name="quollio_lineage_table_level",
    )

    if enable_column_lineage:
        # Fixed log message: the original was missing the space after the
        # period ("...{enable_column_lineage}.Generate...").
        logger.info(
            f"enable_column_lineage is set to {enable_column_lineage}. Generate Databricks column to column lineage."
        )
        databricks_column_level_lineage(
            conn=conn,
            endpoint=endpoint,
            qdc_client=qdc_client,
            tenant_id=tenant_id,
            dbt_table_name="quollio_lineage_column_level",
        )
    else:
        logger.info("Skip column lineage ingestion. Set enable_column_lineage to True if you ingest column lineage.")

    logger.info("Lineage data is successfully loaded.")
    return
@@ -80,14 +94,15 @@ def load_lineage(
80
94
 
81
95
  def load_column_stats(
82
96
  conn: db.DatabricksConnectionConfig,
97
+ endpoint: str,
83
98
  qdc_client: qdc.QDCExternalAPIClient,
84
99
  tenant_id: str,
85
100
  ) -> None:
86
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
87
101
 
88
102
  logger.info("Generate Databricks column stats.")
89
103
  databricks_column_stats(
90
104
  conn=conn,
105
+ endpoint=endpoint,
91
106
  qdc_client=qdc_client,
92
107
  tenant_id=tenant_id,
93
108
  )
@@ -207,7 +222,6 @@ if __name__ == "__main__":
207
222
  Please specify table name with blank delimiter like tableA tableB \
208
223
  if you want to create two or more tables",
209
224
  )
210
-
211
225
  parser.add_argument(
212
226
  "--monitoring_table_suffix",
213
227
  type=str,
@@ -217,11 +231,21 @@ if __name__ == "__main__":
217
231
  This is used to identify the monitoring tables created by the databricks monitoring tool. \
218
232
  Default value is _profile_metrics",
219
233
  )
234
+ parser.add_argument(
235
+ "--enable_column_lineage",
236
+ type=bool,
237
+ action=env_default("ENABLE_COLUMN_LINEAGE", store_true=True),
238
+ default=False,
239
+ required=False,
240
+ help="Whether to ingest column lineage into QDIC or not. Default value is False",
241
+ )
220
242
 
221
243
  args = parser.parse_args()
244
+ set_log_level(level=args.log_level)
222
245
 
223
246
  conn = db.DatabricksConnectionConfig(
224
- host=args.host,
247
+ # MEMO: Metadata agent allows the string 'https://' as a host name but is not allowed by intelligence agent.
248
+ host=trim_prefix(args.host, "https://"),
225
249
  http_path=args.http_path,
226
250
  client_id=args.databricks_client_id,
227
251
  client_secret=args.databricks_client_secret,
@@ -243,7 +267,13 @@ if __name__ == "__main__":
243
267
  qdc_client = qdc.QDCExternalAPIClient(
244
268
  base_url=args.api_url, client_id=args.client_id, client_secret=args.client_secret
245
269
  )
246
- load_lineage(conn=conn, qdc_client=qdc_client, tenant_id=args.tenant_id)
270
+ load_lineage(
271
+ conn=conn,
272
+ endpoint=args.host,
273
+ qdc_client=qdc_client,
274
+ tenant_id=args.tenant_id,
275
+ enable_column_lineage=args.enable_column_lineage,
276
+ )
247
277
 
248
278
  if "load_stats" in args.commands:
249
279
  qdc_client = qdc.QDCExternalAPIClient(
@@ -251,6 +281,7 @@ if __name__ == "__main__":
251
281
  )
252
282
  databricks_column_stats(
253
283
  conn=conn,
284
+ endpoint=args.host,
254
285
  qdc_client=qdc_client,
255
286
  tenant_id=args.tenant_id,
256
287
  monitoring_table_suffix=args.monitoring_table_suffix,
@@ -1,7 +1,7 @@
1
1
  version: 2
2
2
 
3
3
  model:
4
- - name: quollio_lineage_column_level
4
+ - name: quollio_lineage_table_level
5
5
  columns:
6
6
  - name: UPSTREAM_TABLES
7
7
  description: 'String column with all upstream tables in JSON format'
@@ -18,4 +18,4 @@ clean-targets:
18
18
  models:
19
19
  +dbt-osmosis: "{model}.yml"
20
20
  +grants:
21
- select: ["{{ var('query_user') }}"]
21
+ select: ["\"{{ var('query_user') }}\""]
@@ -1,28 +1,67 @@
1
1
  {%- materialization divided_view, default %}
2
2
  {%- set identifier = model['alias'] %}
3
3
  {%- set target_relations = [] %}
4
- {%- set chunk = config.get('chunk') %}
5
4
  {%- set grant_config = config.get('grants') %}
6
5
 
7
6
  {{ run_hooks(pre_hooks, inside_transaction=False) }}
8
7
  -- `BEGIN` happens here:
9
8
  {{ run_hooks(pre_hooks, inside_transaction=True) }}
10
9
 
11
- -- fetch records
12
- {%- set query_quollio_stats_profiling_columns -%}
13
- SELECT * FROM {{ ref('quollio_stats_profiling_columns') }} WHERE table_name not like 'quollio_%'
10
+ -- fetch target_tables
11
+ {%- set query_stats_target_tables -%}
12
+ SELECT
13
+ distinct
14
+ database_name
15
+ , schema_name
16
+ , table_name
17
+ FROM
18
+ {{ ref('quollio_stats_profiling_columns') }}
19
+ WHERE
20
+ table_name not like 'quollio_%%'
14
21
  {%- endset -%}
15
- {%- set results = run_query(query_quollio_stats_profiling_columns) -%}
22
+ {%- set results = run_query(query_stats_target_tables) -%}
16
23
  {%- if execute -%}
17
- {%- set records = results.rows -%}
24
+ {%- set stats_target_tables = results.rows -%}
18
25
  {%- else -%}
19
- {%- set records = [] -%}
26
+ {%- set stats_target_tables = [] -%}
27
+ {%- endif -%}
28
+
29
+ -- skip creating views if the target profiling columns don't exist.
30
+ {%- if stats_target_tables | length == 0 -%}
31
+ {% call statement("main") %}
32
+ {{ log("No records found. Just execute select stmt for skipping call statement.", info=True) }}
33
+ select null
34
+ {% endcall %}
35
+ {%- set full_refresh_mode = (should_full_refresh()) -%}
36
+ {%- set should_revoke = should_revoke(target_relation, full_refresh_mode) %}
20
37
  {%- endif -%}
21
38
 
22
39
  -- build sql
23
- {%- for i in range(0, records|length, chunk) -%}
24
- {%- set build_sql %}
25
- {%- for record in records[i: i+chunk] -%}
40
+ {%- for stats_target_table in stats_target_tables -%}
41
+ -- get columns for statistics.
42
+ -- LISTAGG function can't be used for sys table, then it's necessary to get column for each table.
43
+ -- See https://docs.aws.amazon.com/redshift/latest/dg/c_join_PG.html.
44
+ {%- set stats_target_columns %}
45
+ SELECT
46
+ database_name
47
+ , schema_name
48
+ , table_name
49
+ , column_name
50
+ , is_bool
51
+ , is_calculable
52
+ FROM
53
+ {{ ref('quollio_stats_profiling_columns') }}
54
+ WHERE
55
+ database_name = '{{stats_target_table[0]}}'
56
+ AND schema_name = '{{stats_target_table[1]}}'
57
+ AND table_name = '{{stats_target_table[2]}}'
58
+ {%- endset -%}
59
+
60
+ {%- set results = run_query(stats_target_columns) -%}
61
+ {%- set stats_target_columns = results.rows -%}
62
+
63
+ {%- set sql_for_column_stats %}
64
+ {%- for stats_target_column in stats_target_columns -%}
26
65
  {%- if not loop.first -%}UNION{% endif %}
27
66
  SELECT
28
67
  main.db_name
@@ -41,33 +80,33 @@ SELECT * FROM {{ ref('quollio_stats_profiling_columns') }} WHERE table_name no
41
80
  (
42
81
  SELECT
43
82
  DISTINCT
44
- '{{record[0]}}'::varchar as db_name
45
- , '{{record[1]}}'::varchar as schema_name
46
- , '{{record[2]}}'::varchar as table_name
47
- , '{{record[3]}}'::varchar as column_name
48
- , {% if var("skip_heavy") == false and record[5] == true %}cast(max("{{record[3]}}") as varchar){% else %}null::varchar{% endif %} AS max_value
49
- , {% if var("skip_heavy") == false and record[5] == true %}cast(min("{{record[3]}}") as varchar){% else %}null::varchar{% endif %} AS min_value
83
+ '{{stats_target_column[0]}}'::varchar as db_name
84
+ , '{{stats_target_column[1]}}'::varchar as schema_name
85
+ , '{{stats_target_column[2]}}'::varchar as table_name
86
+ , '{{stats_target_column[3]}}'::varchar as column_name
87
+ , {% if var("aggregate_all") == True and stats_target_column[5] == True %}cast(max("{{stats_target_column[3]}}") as varchar){% else %}null::varchar{% endif %} AS max_value
88
+ , {% if var("aggregate_all") == True and stats_target_column[5] == True %}cast(min("{{stats_target_column[3]}}") as varchar){% else %}null::varchar{% endif %} AS min_value
50
89
  -- requires full table scan
51
- , {% if var("skip_heavy") == false %}cast(SUM(NVL2("{{record[3]}}", 0, 1)) as integer){% else %}null::integer{% endif %} AS null_count
52
- , APPROXIMATE COUNT(DISTINCT "{{record[3]}}") AS cardinality
90
+ , {% if var("aggregate_all") == True %}cast(SUM(NVL2("{{stats_target_column[3]}}", 0, 1)) as integer){% else %}null::integer{% endif %} AS null_count
91
+ , APPROXIMATE COUNT(DISTINCT "{{stats_target_column[3]}}") AS cardinality
53
92
  -- requires full table scan
54
- , {% if var("skip_heavy") == false and record[5] == true %}cast(avg("{{record[3]}}")as varchar){% else %}null::varchar{% endif %} AS avg_value
55
- , {% if var("skip_heavy") == false and record[5] == true %}cast(median("{{record[3]}}") as varchar){% else %}null::varchar{% endif %} AS median_value
93
+ , {% if var("aggregate_all") == True and stats_target_column[5] == True %}cast(avg("{{stats_target_column[3]}}")as varchar){% else %}null::varchar{% endif %} AS avg_value
94
+ , {% if var("aggregate_all") == True and stats_target_column[5] == True %}cast(median("{{stats_target_column[3]}}") as varchar){% else %}null::varchar{% endif %} AS median_value
56
95
  -- requires full table scan
57
- , {% if record[5] == true %}cast(STDDEV_SAMP("{{record[3]}}") as integer){% else %}null::integer{% endif %} AS stddev_value
58
- FROM {{ record[0] }}.{{ record[1] }}.{{ record[2] }}
96
+ , {% if stats_target_column[5] == True %}cast(STDDEV_SAMP("{{stats_target_column[3]}}") as integer){% else %}null::integer{% endif %} AS stddev_value
97
+ FROM {{ stats_target_column[0] }}.{{ stats_target_column[1] }}.{{ stats_target_column[2] }}
59
98
  ) main, (
60
- {%- if var("skip_heavy") == false and record[4] == false %}
99
+ {%- if var("aggregate_all") == True and stats_target_column[4] == false %}
61
100
  SELECT
62
- cast("{{record[3]}}" as varchar) mode_value
101
+ cast("{{stats_target_column[3]}}" as varchar) mode_value
63
102
  FROM (
64
103
  SELECT
65
104
  DISTINCT
66
- "{{record[3]}}"
105
+ "{{stats_target_column[3]}}"
67
106
  , ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC) AS row_num
68
- FROM {{ record[0] }}.{{ record[1] }}.{{ record[2] }}
107
+ FROM {{ stats_target_column[0] }}.{{ stats_target_column[1] }}.{{ stats_target_column[2] }}
69
108
  GROUP BY
70
- "{{record[3]}}"
109
+ "{{stats_target_column[3]}}"
71
110
  )
72
111
  WHERE
73
112
  row_num = 1
@@ -77,11 +116,11 @@ SELECT * FROM {{ ref('quollio_stats_profiling_columns') }} WHERE table_name no
77
116
  {% endfor -%}
78
117
  {%- endset %}
79
118
  -- create a view with a index as suffix
80
- {%- set target_identifier = "%s_%d"|format(model['name'], loop.index) %}
119
+ {%- set target_identifier = "%s_%s_%s_%s"|format(model['name'], stats_target_table[0], stats_target_table[1], stats_target_table[2]) %}
81
120
  {%- set target_relation = api.Relation.create(identifier=target_identifier, schema=schema, database=database, type='view') %}
82
121
  -- {{ drop_relation_if_exists(target_relation) }}
83
122
  {% call statement("main") %}
84
- {{ get_replace_view_sql(target_relation, build_sql) }}
123
+ {{ get_replace_view_sql(target_relation, sql_for_column_stats) }}
85
124
  {% endcall %}
86
125
  {%- set full_refresh_mode = (should_full_refresh()) -%}
87
126
  {%- set should_revoke = should_revoke(target_relation, full_refresh_mode) %}
@@ -1,7 +1,6 @@
1
1
  {{
2
2
  config(
3
- materialized='divided_view',
4
- chunk=20
3
+ materialized='divided_view'
5
4
  )
6
5
  }}
7
6
  -- depends_on: {{ ref('quollio_stats_profiling_columns') }}
@@ -1,52 +1,74 @@
1
1
  {%- materialization divided_view, default %}
2
2
  {%- set identifier = model['alias'] %}
3
3
  {%- set target_relations = [] %}
4
- {%- set chunk = config.get('chunk') %}
5
4
  {%- set grant_config = config.get('grants') %}
6
5
 
7
6
  {{ run_hooks(pre_hooks, inside_transaction=False) }}
8
7
  -- `BEGIN` happens here:
9
8
  {{ run_hooks(pre_hooks, inside_transaction=True) }}
10
9
 
11
- -- fetch records
12
- {%- set query_quollio_stats_profiling_columns -%}
13
- SELECT * FROM {{ ref('quollio_stats_profiling_columns') }} WHERE NOT startswith(table_name, 'QUOLLIO_')
10
+ -- fetch target_tables
11
+ {%- set query_stats_target_tables -%}
12
+ SELECT
13
+ TABLE_CATALOG
14
+ , TABLE_SCHEMA
15
+ , TABLE_NAME
16
+ , OBJECT_AGG(COLUMN_NAME, IS_CALCULABLE) AS COLUMNS_OBJ
17
+ FROM
18
+ {{ ref('quollio_stats_profiling_columns') }}
19
+ WHERE NOT startswith(table_name, 'QUOLLIO_')
20
+ GROUP BY
21
+ TABLE_CATALOG
22
+ , TABLE_SCHEMA
23
+ , TABLE_NAME
14
24
  {%- endset -%}
15
- {%- set results = run_query(query_quollio_stats_profiling_columns) -%}
25
+ {%- set results = run_query(query_stats_target_tables) -%}
16
26
  {%- if execute -%}
17
- {%- set records = results.rows -%}
27
+ {%- set stats_target_tables = results.rows -%}
18
28
  {%- else -%}
19
- {%- set records = [] -%}
29
+ {%- set stats_target_tables = [] -%}
20
30
  {%- endif -%}
21
31
 
22
- -- build sql
23
- {%- for i in range(0, records|length, chunk) -%}
24
- {%- set build_sql %}
25
- {%- for record in records[i: i+chunk] -%}
26
- {%- if not loop.first %}UNION{% endif %}
32
+ -- skip creating views if the target profiling columns don't exist.
33
+ {%- if stats_target_tables | length == 0 -%}
34
+ {% call statement("main") %}
35
+ {{ log("No records found. Just execute select stmt for skipping call statement.", info=True) }}
36
+ select null
37
+ {% endcall %}
38
+ {%- set full_refresh_mode = (should_full_refresh()) -%}
39
+ {%- set should_revoke = should_revoke(target_relation, full_refresh_mode) %}
40
+ {%- endif -%}
27
41
 
42
+ -- create view for each table
43
+ {%- for stats_target_table in stats_target_tables -%}
44
+ -- build sql for column value aggregation.
45
+ {%- set sql_for_column_stats %}
46
+ {% set columns_json = fromjson(stats_target_table[3]) %}
47
+ {%- for col_name, is_calclable in columns_json.items() -%}
48
+ {%- if not loop.first %}UNION{% endif %}
28
49
  SELECT
29
50
  DISTINCT
30
- '{{record[0]}}' as db_name
31
- , '{{record[1]}}' as schema_name
32
- , '{{record[2]}}' as table_name
33
- , '{{record[3]}}' as column_name
34
- , {% if record[5] == true %}CAST(max("{{record[3]}}") AS STRING){% else %}null{% endif %} AS max_value
35
- , {% if record[5] == true %}CAST(min("{{record[3]}}") AS STRING){% else %}null{% endif %} AS min_value
36
- , COUNT_IF("{{record[3]}}" IS NULL) AS null_count
37
- , APPROX_COUNT_DISTINCT("{{record[3]}}") AS cardinality
38
- , {% if record[5] == true %}avg("{{record[3]}}"){% else %}null{% endif %} AS avg_value
39
- , {% if record[5] == true %}median("{{record[3]}}"){% else %}null{% endif %} AS median_value
40
- , {% if record[5] == true %}approx_top_k("{{record[3]}}")[0][0]{% else %}null{% endif %} AS mode_value
41
- , {% if record[5] == true %}stddev("{{record[3]}}"){% else %}null{% endif %} AS stddev_value
42
- FROM "{{record[0]}}"."{{record[1]}}"."{{record[2]}}" {{ var("sample_method") }}
51
+ '{{stats_target_table[0]}}' as db_name
52
+ , '{{stats_target_table[1]}}' as schema_name
53
+ , '{{stats_target_table[2]}}' as table_name
54
+ , '{{col_name}}' as column_name
55
+ , {% if is_calclable == True %}CAST(MAX("{{col_name}}") AS STRING){% else %}NULL{% endif %} AS max_value
56
+ , {% if is_calclable == True %}CAST(MIN("{{col_name}}") AS STRING){% else %}NULL{% endif %} AS min_value
57
+ , COUNT_IF("{{col_name}}" IS NULL) AS null_count
58
+ , APPROX_COUNT_DISTINCT("{{col_name}}") AS cardinality
59
+ , {% if is_calclable == True %}AVG("{{col_name}}"){% else %}NULL{% endif %} AS avg_value
60
+ , {% if is_calclable == True %}MEDIAN("{{col_name}}"){% else %}NULL{% endif %} AS median_value
61
+ , {% if is_calclable == True %}APPROX_TOP_K("{{col_name}}")[0][0]{% else %}NULL{% endif %} AS mode_value
62
+ , {% if is_calclable == True %}STDDEV("{{col_name}}"){% else %}NULL{% endif %} AS stddev_value
63
+ FROM "{{stats_target_table[0]}}"."{{stats_target_table[1]}}"."{{stats_target_table[2]}}" {{ var("sample_method") }}
43
64
  {% endfor -%}
44
65
  {%- endset %}
66
+
45
67
  -- create a view with a index as suffix
46
- {%- set target_identifier = "%s_%d"|format(model['name'], loop.index) %}
47
- {%- set target_relation = api.Relation.create(identifier=target_identifier, schema=schema, database=database, type='view') %}
68
+ {%- set stats_view_identifier = "%s_%s_%s_%s"|format(model['name'], stats_target_table[0], stats_target_table[1], stats_target_table[2]) %}
69
+ {%- set target_relation = api.Relation.create(identifier=stats_view_identifier, schema=schema, database=database, type='view') %}
48
70
  {% call statement("main") %}
49
- {{ get_create_view_as_sql(target_relation, build_sql) }}
71
+ {{ get_create_view_as_sql(target_relation, sql_for_column_stats) }}
50
72
  {% endcall %}
51
73
  {%- set full_refresh_mode = (should_full_refresh()) -%}
52
74
  {%- set should_revoke = should_revoke(target_relation, full_refresh_mode) %}
@@ -1,7 +1,6 @@
1
1
  {{
2
2
  config(
3
- materialized='divided_view',
4
- chunk=20
3
+ materialized='divided_view'
5
4
  )
6
5
  }}
7
6
  -- depends_on: {{ ref('quollio_stats_profiling_columns') }}
@@ -31,3 +31,7 @@ def setup_dbt_profile(connections_json: Dict[str, str], template_path: str, temp
31
31
  with open(profile_path, "w") as profiles:
32
32
  yaml.dump(yaml.safe_load(profiles_body), profiles, default_flow_style=False, allow_unicode=True)
33
33
  return
34
+
35
+
def trim_prefix(s: str, prefix: str) -> str:
    """Return ``s`` with ``prefix`` removed once from the start.

    Bug fix: the previous implementation used ``str.lstrip(prefix)``, which
    strips any leading run of the *characters* contained in ``prefix``, not
    the prefix string itself.  E.g. ``"https://psts.example.com".lstrip("https://")``
    also eats the leading ``p``/``s``/``t`` characters of the host name.
    If ``s`` does not start with ``prefix`` it is returned unchanged.
    """
    if s.startswith(prefix):
        return s[len(prefix):]
    return s
@@ -6,6 +6,8 @@ Currently requires explicit naming of env vars to check for
6
6
 
7
7
  import argparse
8
8
  import os
9
+ from distutils.util import strtobool
10
+ from typing import Union
9
11
 
10
12
 
11
13
  # Courtesy of http://stackoverflow.com/a/10551190 with env-var retrieval fixed
@@ -28,9 +30,30 @@ class EnvDefault(argparse.Action):
28
30
  setattr(namespace, self.dest, values)
29
31
 
30
32
 
33
+ class EnvStoreTrue(argparse._StoreTrueAction):
34
+ """An argparse action class that auto-sets missing default values from env vars for store_true."""
35
+
36
+ def __init__(self, envvar, required=True, default=None, **kwargs):
37
+ # Only pass the arguments that argparse._StoreTrueAction expects
38
+ action_kwargs = {key: value for key, value in kwargs.items() if key in ("option_strings", "dest")}
39
+ if envvar in os.environ:
40
+ default = _convert_value_to_bool(os.environ[envvar])
41
+ if required and default:
42
+ required = False
43
+ super(EnvStoreTrue, self).__init__(default=default, required=required, **action_kwargs)
44
+
45
+
# functional sugar for the above
def env_default(envvar, store_true=False):
    """Build an argparse ``action=`` factory bound to ``envvar``.

    The returned callable constructs an ``EnvStoreTrue`` when ``store_true``
    is set, otherwise an ``EnvDefault``.
    """

    def wrapper(**kwargs):
        action_cls = EnvStoreTrue if store_true else EnvDefault
        return action_cls(envvar, **kwargs)

    return wrapper
54
+
55
+
def _convert_value_to_bool(v: Union[str, bool]) -> bool:
    """Coerce an environment-variable value to a bool.

    Replaces the previous dependency on ``distutils.util.strtobool``, which
    is deprecated (PEP 632) and removed with distutils in Python 3.12, with
    an equivalent inline parser: "y"/"yes"/"t"/"true"/"on"/"1" -> True and
    "n"/"no"/"f"/"false"/"off"/"0" -> False (case-insensitive).  Any other
    string raises ValueError, matching strtobool.  Non-string values are
    returned unchanged.
    """
    if isinstance(v, str):
        value = v.lower()
        if value in ("y", "yes", "t", "true", "on", "1"):
            return True
        if value in ("n", "no", "f", "false", "off", "0"):
            return False
        raise ValueError(f"invalid truth value {v!r}")
    return v
@@ -0,0 +1,17 @@
import logging

# Accepted level names mapped to logging constants.  "none" (an accepted
# CLI choice) and any other unrecognised value fall through to NOTSET,
# matching the original behaviour.
_LOG_LEVELS = {
    "info": logging.INFO,
    "debug": logging.DEBUG,
    "warn": logging.WARNING,
    "error": logging.ERROR,
    "critical": logging.CRITICAL,
}


def set_log_level(level: str = "info") -> None:
    """Configure root logging for the given level name.

    Accepts "debug"/"info"/"warn"/"error" (and "critical"); "none" or any
    unknown value maps to NOTSET, as before.

    Bug fix: argparse registers ``--log_level`` without a default
    (``required=False``), so ``args.log_level`` is ``None`` when the option
    and the LOG_LEVEL env var are both absent.  ``None`` previously fell
    into the ``else`` branch and configured NOTSET, contradicting the CLI
    help text ("Default value is info").  ``None`` now falls back to "info".
    """
    if level is None:
        level = "info"
    fmt = "%(asctime)s - %(levelname)s - %(name)s - %(message)s"
    logging.basicConfig(level=_LOG_LEVELS.get(level, logging.NOTSET), format=fmt)
@@ -14,11 +14,11 @@ logger = logging.getLogger(__name__)
14
14
 
15
15
  def databricks_table_level_lineage(
16
16
  conn: databricks.DatabricksConnectionConfig,
17
+ endpoint: str,
17
18
  qdc_client: qdc.QDCExternalAPIClient,
18
19
  tenant_id: str,
19
20
  dbt_table_name: str = "quollio_lineage_table_level",
20
21
  ) -> None:
21
- logging.basicConfig(level=logging.info, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
22
22
  with databricks.DatabricksQueryExecutor(config=conn) as databricks_executor:
23
23
  results = databricks_executor.get_query_results(
24
24
  query=f"""
@@ -31,7 +31,7 @@ def databricks_table_level_lineage(
31
31
  tables = parse_databricks_table_lineage(results)
32
32
  update_table_lineage_inputs = gen_table_lineage_payload(
33
33
  tenant_id=tenant_id,
34
- endpoint=conn.host,
34
+ endpoint=endpoint,
35
35
  tables=tables,
36
36
  )
37
37
 
@@ -55,11 +55,11 @@ def databricks_table_level_lineage(
55
55
 
56
56
  def databricks_column_level_lineage(
57
57
  conn: databricks.DatabricksConnectionConfig,
58
+ endpoint: str,
58
59
  qdc_client: qdc.QDCExternalAPIClient,
59
60
  tenant_id: str,
60
61
  dbt_table_name: str = "quollio_lineage_column_level",
61
62
  ) -> None:
62
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
63
63
  with databricks.DatabricksQueryExecutor(config=conn) as databricks_executor:
64
64
  results = databricks_executor.get_query_results(
65
65
  query=f"""
@@ -72,7 +72,7 @@ def databricks_column_level_lineage(
72
72
 
73
73
  update_column_lineage_inputs = gen_column_lineage_payload(
74
74
  tenant_id=tenant_id,
75
- endpoint=conn.host,
75
+ endpoint=endpoint,
76
76
  columns=results,
77
77
  )
78
78
 
@@ -110,7 +110,9 @@ def _get_monitoring_tables(
110
110
  CONCAT(table_catalog, '.', table_schema, '.', table_name) AS table_fqdn
111
111
  FROM
112
112
  system.information_schema.tables
113
- WHERE table_name LIKE "%{monitoring_table_suffix}"
113
+ WHERE
114
+ table_name LIKE "%{monitoring_table_suffix}"
115
+ AND table_name NOT LIKE ('quollio_%')
114
116
  """
115
117
  with databricks.DatabricksQueryExecutor(config=conn) as databricks_executor:
116
118
  tables = databricks_executor.get_query_results(query)
@@ -153,6 +155,8 @@ def _get_column_stats(
153
155
  MAX(t.window) AS LATEST
154
156
  FROM
155
157
  {monitoring_table} t
158
+ WHERE
159
+ t.column_name not in (':table')
156
160
  GROUP BY
157
161
  t.COLUMN_NAME,
158
162
  t.DATA_TYPE,
@@ -176,13 +180,14 @@ def _get_column_stats(
176
180
 
177
181
  def databricks_column_stats(
178
182
  conn: databricks.DatabricksConnectionConfig,
183
+ endpoint: str,
179
184
  qdc_client: qdc.QDCExternalAPIClient,
180
185
  tenant_id: str,
181
186
  monitoring_table_suffix: str = "_profile_metrics",
182
187
  ) -> None:
183
188
  table_stats = _get_column_stats(conn, monitoring_table_suffix)
184
189
  for table in table_stats:
185
- stats = gen_table_stats_payload(tenant_id, conn.host, table)
190
+ stats = gen_table_stats_payload(tenant_id=tenant_id, endpoint=endpoint, stats=table)
186
191
  for stat in stats:
187
192
  status_code = qdc_client.update_stats_by_id(
188
193
  global_id=stat.global_id,