quollio-core 0.4.9__py3-none-any.whl → 0.4.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
quollio_core/__init__.py CHANGED
@@ -1,4 +1,4 @@
 """Quollio Core"""
 
-__version__ = "0.4.9"
+__version__ = "0.4.11"
 __author__ = "Quollio Technologies, Inc"
quollio_core/bigquery.py CHANGED
@@ -3,6 +3,7 @@ import json
 import logging
 
 from quollio_core.helper.env_default import env_default
+from quollio_core.helper.log import set_log_level
 from quollio_core.profilers.bigquery import bigquery_table_lineage
 from quollio_core.repository import qdc
 from quollio_core.repository.bigquery import get_credentials, get_org_id
@@ -88,14 +89,22 @@ if __name__ == "__main__":
         help="GCP regions where the data is located. Multiple regions can be provided separated by space.",
         nargs="+",
     )
+    parser.add_argument(
+        "--log_level",
+        type=str,
+        choices=["debug", "info", "warn", "error", "none"],
+        action=env_default("LOG_LEVEL"),
+        required=False,
+        help="The log level for dbt commands. Default value is info",
+    )
 
     args = parser.parse_args()
+    set_log_level(level=args.log_level)
 
     if len(args.commands) == 0:
         raise ValueError("No command is provided")
 
     if "load_lineage" in args.commands:
-
         qdc_client = qdc.QDCExternalAPIClient(
             base_url=args.api_url, client_id=args.client_id, client_secret=args.client_secret
         )
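For reference, a minimal sketch (not from the package) of how the pieces above fit together; it assumes env_default("LOG_LEVEL") returns an argparse action that falls back to the LOG_LEVEL environment variable, as its usage here implies:

    import argparse
    from quollio_core.helper.env_default import env_default
    from quollio_core.helper.log import set_log_level

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--log_level",
        type=str,
        choices=["debug", "info", "warn", "error", "none"],
        action=env_default("LOG_LEVEL"),  # flag value wins; LOG_LEVEL env var is the fallback
        required=False,
    )
    args = parser.parse_args(["--log_level", "debug"])
    set_log_level(level=args.log_level)  # configures the root logger to DEBUG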
quollio_core/bricks.py CHANGED
@@ -4,6 +4,7 @@ import os
 
 from quollio_core.helper.core import setup_dbt_profile, trim_prefix
 from quollio_core.helper.env_default import env_default
+from quollio_core.helper.log import set_log_level
 from quollio_core.profilers.databricks import (
     databricks_column_level_lineage,
     databricks_column_stats,
@@ -20,7 +21,6 @@ def build_view(
     target_tables: str = "",
     log_level: str = "info",
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
 
     logger.info("Build profiler views using dbt")
     # set parameters
@@ -64,7 +64,6 @@ def load_lineage(
     tenant_id: str,
     enable_column_lineage: bool = False,
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
 
     logger.info("Generate Databricks table to table lineage.")
     databricks_table_level_lineage(
@@ -99,7 +98,6 @@ def load_column_stats(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
 
     logger.info("Generate Databricks column stats.")
     databricks_column_stats(
@@ -243,6 +241,7 @@ if __name__ == "__main__":
     )
 
     args = parser.parse_args()
+    set_log_level(level=args.log_level)
 
     conn = db.DatabricksConnectionConfig(
         # MEMO: Metadata agent allows the string 'https://' as a host name but is not allowed by intelligence agent.
quollio_core/helper/log.py ADDED
@@ -0,0 +1,17 @@
+import logging
+
+
+def set_log_level(level: str = "info") -> None:
+    fmt = "%(asctime)s - %(levelname)s - %(name)s - %(message)s"
+    if level == "info":
+        logging.basicConfig(level=logging.INFO, format=fmt)
+    elif level == "debug":
+        logging.basicConfig(level=logging.DEBUG, format=fmt)
+    elif level == "warn":
+        logging.basicConfig(level=logging.WARNING, format=fmt)
+    elif level == "error":
+        logging.basicConfig(level=logging.ERROR, format=fmt)
+    elif level == "critical":
+        logging.basicConfig(level=logging.CRITICAL, format=fmt)
+    else:
+        logging.basicConfig(level=logging.NOTSET, format=fmt)
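A minimal usage sketch of the new helper (illustrative, not part of the diff): set_log_level calls logging.basicConfig on the root logger, so module loggers obtained with logging.getLogger(__name__) elsewhere in the package inherit the chosen level:

    import logging
    from quollio_core.helper.log import set_log_level

    set_log_level(level="warn")  # "warn" maps to logging.WARNING
    log = logging.getLogger("quollio_core.example")
    log.warning("emitted")       # at or above WARNING: shown
    log.info("suppressed")       # below WARNING: filtered out

Any value outside the handled set (including None, when neither the flag nor LOG_LEVEL is supplied) falls through to logging.NOTSET.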
quollio_core/profilers/databricks.py CHANGED
@@ -19,7 +19,6 @@ def databricks_table_level_lineage(
     tenant_id: str,
     dbt_table_name: str = "quollio_lineage_table_level",
 ) -> None:
-    logging.basicConfig(level=logging.info, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
     with databricks.DatabricksQueryExecutor(config=conn) as databricks_executor:
         results = databricks_executor.get_query_results(
             query=f"""
@@ -61,7 +60,6 @@ def databricks_column_level_lineage(
     tenant_id: str,
     dbt_table_name: str = "quollio_lineage_column_level",
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
     with databricks.DatabricksQueryExecutor(config=conn) as databricks_executor:
         results = databricks_executor.get_query_results(
             query=f"""
@@ -140,42 +138,47 @@ def _get_column_stats(
         raise ValueError(f"Invalid table name: {table['table_fqdn']}")
     with databricks.DatabricksQueryExecutor(config=conn) as databricks_executor:
         query = """
-        SELECT
-            "{monitored_table_catalog}" as DB_NAME,
-            "{monitored_table_schema}" as SCHEMA_NAME,
-            "{monitored_table_name}" as TABLE_NAME,
-            t.COLUMN_NAME,
-            t.DATA_TYPE,
-            t.distinct_count as CARDINALITY,
-            t.MAX as MAX_VALUE,
-            t.MIN as MIN_VALUE,
-            t.AVG as AVG_VALUE,
-            t.MEDIAN as MEDIAN_VALUE,
-            t.STDDEV as STDDEV_VALUE,
-            t.NUM_NULLS as NULL_COUNT,
-            t.frequent_items[0].item AS MODE_VALUE,
-            MAX(t.window) AS LATEST
-        FROM
-            {monitoring_table} t
-        WHERE
-            t.column_name not in (':table')
-        GROUP BY
-            t.COLUMN_NAME,
-            t.DATA_TYPE,
-            t.distinct_count,
-            t.MAX,
-            t.MIN,
-            t.AVG,
-            t.MEDIAN,
-            t.STDDEV,
-            t.NUM_NULLS,
-            t.frequent_items
+        WITH profile_record_history AS (
+            SELECT
+                COLUMN_NAME
+                , distinct_count as CARDINALITY
+                , MAX as MAX_VALUE
+                , MIN as MIN_VALUE
+                , AVG as AVG_VALUE
+                , MEDIAN as MEDIAN_VALUE
+                , STDDEV as STDDEV_VALUE
+                , NUM_NULLS as NULL_COUNT
+                , frequent_items[0].item AS MODE_VALUE
+                , row_number() over(partition by column_name order by window desc) rownum
+            FROM
+                {monitoring_table}
+            WHERE
+                column_name not in (':table')
+        )
+        SELECT
+            "{monitored_table_catalog}" as DB_NAME
+            , "{monitored_table_schema}" as SCHEMA_NAME
+            , "{monitored_table_name}" as TABLE_NAME
+            , COLUMN_NAME
+            , CARDINALITY
+            , MAX_VALUE
+            , MIN_VALUE
+            , AVG_VALUE
+            , MEDIAN_VALUE
+            , STDDEV_VALUE
+            , NULL_COUNT
+            , MODE_VALUE
+        FROM
+            profile_record_history
+        WHERE
+            rownum = 1
         """.format(
             monitoring_table=table["table_fqdn"],
             monitored_table_catalog=monitored_table[0],
             monitored_table_schema=monitored_table[1],
             monitored_table_name=monitored_table[2],
         )
+        logger.debug(f"The following sql will be fetched to retrieve stats values. {query}")
         stats.append(databricks_executor.get_query_results(query))
     return stats
 
quollio_core/profilers/redshift.py CHANGED
@@ -14,7 +14,6 @@ def redshift_table_level_lineage(
     tenant_id: str,
     dbt_table_name: str,
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
     with redshift.RedshiftQueryExecutor(config=conn) as redshift_executor:
         results = redshift_executor.get_query_results(
             query="""
@@ -55,22 +54,7 @@ def redshift_table_level_lineage(
     return
 
 
-def _get_target_tables_query(db: str, schema: str) -> str:
-    query = """
-    SELECT
-        DISTINCT
-        database_name
-        , schema_name
-        , table_name
-    FROM
-        {db}.{schema}.quollio_stats_profiling_columns
-    """.format(
-        db=db, schema=schema
-    )
-    return query
-
-
-def _get_stats_tables_query(db: str, schema: str) -> str:
+def _gen_get_stats_views_query(db: str, schema: str) -> str:
     query = """
     SELECT
         DISTINCT
@@ -93,70 +77,54 @@ def redshift_table_stats(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
 
     with redshift.RedshiftQueryExecutor(config=conn) as redshift_executor:
-        req_count = 0
-        target_query = _get_target_tables_query(
+        stats_query = _gen_get_stats_views_query(
            db=conn.database,
            schema=conn.schema,
        )
-        target_assets = redshift_executor.get_query_results(query=target_query)
+        stats_views = redshift_executor.get_query_results(query=stats_query)
 
-        stats_query = _get_stats_tables_query(
-            db=conn.database,
-            schema=conn.schema,
-        )
-        stats_columns = redshift_executor.get_query_results(query=stats_query)
-        for target_asset in target_assets:
-            for stats_column in stats_columns:
-                stats_query = """
-                SELECT
-                    db_name
-                    , schema_name
-                    , table_name
-                    , column_name
-                    , max_value
-                    , min_value
-                    , null_count
-                    , cardinality
-                    , avg_value
-                    , median_value
-                    , mode_value
-                    , stddev_value
-                FROM
-                    {db}.{schema}.{table}
-                WHERE
-                    db_name = '{target_db}'
-                    and schema_name = '{target_schema}'
-                    and table_name = '{target_table}'
-                """.format(
-                    db=stats_column[0],
-                    schema=stats_column[1],
-                    table=stats_column[2],
-                    target_db=target_asset[0],
-                    target_schema=target_asset[1],
-                    target_table=target_asset[2],
+        req_count = 0
+        for stats_view in stats_views:
+            stats_query = """
+            SELECT
+                db_name
+                , schema_name
+                , table_name
+                , column_name
+                , max_value
+                , min_value
+                , null_count
+                , cardinality
+                , avg_value
+                , median_value
+                , mode_value
+                , stddev_value
+            FROM
+                {db}.{schema}.{table}
+            """.format(
+                db=stats_view[0],
+                schema=stats_view[1],
+                table=stats_view[2],
+            )
+            stats_result = redshift_executor.get_query_results(query=stats_query)
+            payloads = gen_table_stats_payload_from_tuple(tenant_id=tenant_id, endpoint=conn.host, stats=stats_result)
+            for payload in payloads:
+                logger.info(
+                    "Generating table stats. asset: {db} -> {schema} -> {table} -> {column}".format(
+                        db=payload.db,
+                        schema=payload.schema,
+                        table=payload.table,
+                        column=payload.column,
+                    )
                 )
-                stats_result = redshift_executor.get_query_results(query=stats_query)
-                payloads = gen_table_stats_payload_from_tuple(
-                    tenant_id=tenant_id, endpoint=conn.host, stats=stats_result
+                status_code = qdc_client.update_stats_by_id(
+                    global_id=payload.global_id,
+                    payload=payload.body.get_column_stats(),
                 )
-                for payload in payloads:
-                    logger.info(
-                        "Generating table stats. asset: {db} -> {schema} -> {table} -> {column}".format(
-                            db=payload.db,
-                            schema=payload.schema,
-                            table=payload.table,
-                            column=payload.column,
-                        )
-                    )
-                    status_code = qdc_client.update_stats_by_id(
-                        global_id=payload.global_id,
-                        payload=payload.body.get_column_stats(),
-                    )
-                    if status_code == 200:
-                        req_count += 1
+                if status_code == 200:
+                    req_count += 1
     logger.info(f"Generating table stats is finished. {req_count} stats are ingested.")
     return
 
@@ -166,7 +134,6 @@ def redshift_table_level_sqllineage(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
     redshift_connector = redshift.RedshiftQueryExecutor(conn)
     results = redshift_connector.get_query_results(
         query="""
quollio_core/profilers/snowflake.py CHANGED
@@ -17,42 +17,41 @@ def snowflake_table_to_table_lineage(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
-    sf_executor = snowflake.SnowflakeQueryExecutor(conn)
-    results = sf_executor.get_query_results(
-        query="""
-        SELECT
-            *
-        FROM
-            {db}.{schema}.QUOLLIO_LINEAGE_TABLE_LEVEL
-        """.format(
-            db=conn.account_database,
-            schema=conn.account_schema,
-        )
-    )
-    parsed_results = parse_snowflake_results(results=results)
-    update_table_lineage_inputs = gen_table_lineage_payload(
-        tenant_id=tenant_id,
-        endpoint=conn.account_id,
-        tables=parsed_results,
-    )
-
-    req_count = 0
-    for update_table_lineage_input in update_table_lineage_inputs:
-        logger.info(
-            "Generating table lineage. downstream: {db} -> {schema} -> {table}".format(
-                db=update_table_lineage_input.downstream_database_name,
-                schema=update_table_lineage_input.downstream_schema_name,
-                table=update_table_lineage_input.downstream_table_name,
+    with snowflake.SnowflakeQueryExecutor(conn) as sf_executor:
+        results = sf_executor.get_query_results(
+            query="""
+            SELECT
+                *
+            FROM
+                {db}.{schema}.QUOLLIO_LINEAGE_TABLE_LEVEL
+            """.format(
+                db=conn.account_database,
+                schema=conn.account_schema,
             )
         )
-        status_code = qdc_client.update_lineage_by_id(
-            global_id=update_table_lineage_input.downstream_global_id,
-            payload=update_table_lineage_input.upstreams.as_dict(),
+        parsed_results = parse_snowflake_results(results=results)
+        update_table_lineage_inputs = gen_table_lineage_payload(
+            tenant_id=tenant_id,
+            endpoint=conn.account_id,
+            tables=parsed_results,
        )
-        if status_code == 200:
-            req_count += 1
-    logger.info(f"Generating table lineage is finished. {req_count} lineages are ingested.")
+
+        req_count = 0
+        for update_table_lineage_input in update_table_lineage_inputs:
+            logger.info(
+                "Generating table lineage. downstream: {db} -> {schema} -> {table}".format(
+                    db=update_table_lineage_input.downstream_database_name,
+                    schema=update_table_lineage_input.downstream_schema_name,
+                    table=update_table_lineage_input.downstream_table_name,
+                )
+            )
+            status_code = qdc_client.update_lineage_by_id(
+                global_id=update_table_lineage_input.downstream_global_id,
+                payload=update_table_lineage_input.upstreams.as_dict(),
+            )
+            if status_code == 200:
+                req_count += 1
+        logger.info(f"Generating table lineage is finished. {req_count} lineages are ingested.")
     return
 
 
@@ -61,41 +60,41 @@ def snowflake_column_to_column_lineage(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-    sf_executor = snowflake.SnowflakeQueryExecutor(conn)
-    results = sf_executor.get_query_results(
-        query="""
-        SELECT
-            *
-        FROM
-            {db}.{schema}.QUOLLIO_LINEAGE_COLUMN_LEVEL
-        """.format(
-            db=conn.account_database,
-            schema=conn.account_schema,
-        )
-    )
-    update_column_lineage_inputs = gen_column_lineage_payload(
-        tenant_id=tenant_id,
-        endpoint=conn.account_id,
-        columns=results,
-    )
-
-    req_count = 0
-    for update_column_lineage_input in update_column_lineage_inputs:
-        logger.info(
-            "Generating column lineage. downstream: {db} -> {schema} -> {table} -> {column}".format(
-                db=update_column_lineage_input.downstream_database_name,
-                schema=update_column_lineage_input.downstream_schema_name,
-                table=update_column_lineage_input.downstream_table_name,
-                column=update_column_lineage_input.downstream_column_name,
+    with snowflake.SnowflakeQueryExecutor(conn) as sf_executor:
+        results = sf_executor.get_query_results(
+            query="""
+            SELECT
+                *
+            FROM
+                {db}.{schema}.QUOLLIO_LINEAGE_COLUMN_LEVEL
+            """.format(
+                db=conn.account_database,
+                schema=conn.account_schema,
             )
         )
-        status_code = qdc_client.update_lineage_by_id(
-            global_id=update_column_lineage_input.downstream_global_id,
-            payload=update_column_lineage_input.upstreams.as_dict(),
+        update_column_lineage_inputs = gen_column_lineage_payload(
+            tenant_id=tenant_id,
+            endpoint=conn.account_id,
+            columns=results,
        )
-        if status_code == 200:
-            req_count += 1
-    logger.info(f"Generating column lineage is finished. {req_count} lineages are ingested.")
+
+        req_count = 0
+        for update_column_lineage_input in update_column_lineage_inputs:
+            logger.info(
+                "Generating column lineage. downstream: {db} -> {schema} -> {table} -> {column}".format(
+                    db=update_column_lineage_input.downstream_database_name,
+                    schema=update_column_lineage_input.downstream_schema_name,
+                    table=update_column_lineage_input.downstream_table_name,
+                    column=update_column_lineage_input.downstream_column_name,
+                )
+            )
+            status_code = qdc_client.update_lineage_by_id(
+                global_id=update_column_lineage_input.downstream_global_id,
+                payload=update_column_lineage_input.upstreams.as_dict(),
+            )
+            if status_code == 200:
+                req_count += 1
+        logger.info(f"Generating column lineage is finished. {req_count} lineages are ingested.")
     return
 
 
@@ -104,110 +103,67 @@ def snowflake_table_level_sqllineage(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
-    sf_executor = snowflake.SnowflakeQueryExecutor(conn)
-    results = sf_executor.get_query_results(
-        query="""
-        SELECT
-            database_name
-            , schema_name
-            , query_text
-        FROM
-            {db}.{schema}.QUOLLIO_SQLLINEAGE_SOURCES
-        """.format(
-            db=conn.account_database,
-            schema=conn.account_schema,
-        )
-    )
-    update_table_lineage_inputs_list = list()
-    sql_lineage = SQLLineage()
-    for result in results:
-        src_tables, dest_table = sql_lineage.get_table_level_lineage_source(
-            sql=result["QUERY_TEXT"],
-            dialect="snowflake",
-            dest_db=result["DATABASE_NAME"],
-            dest_schema=result["SCHEMA_NAME"],
-        )
-        update_table_lineage_inputs = sql_lineage.gen_lineage_input(
-            tenant_id=tenant_id, endpoint=conn.account_id, src_tables=src_tables, dest_table=dest_table
-        )
-        update_table_lineage_inputs_list.append(update_table_lineage_inputs)
-
-    req_count = 0
-    for update_table_lineage_input in update_table_lineage_inputs_list:
-        logger.info(
-            "Generating table lineage. downstream: {db} -> {schema} -> {table}".format(
-                db=update_table_lineage_input.downstream_database_name,
-                schema=update_table_lineage_input.downstream_schema_name,
-                table=update_table_lineage_input.downstream_table_name,
+    with snowflake.SnowflakeQueryExecutor(conn) as sf_executor:
+        results = sf_executor.get_query_results(
+            query="""
+            SELECT
+                database_name
+                , schema_name
+                , query_text
+            FROM
+                {db}.{schema}.QUOLLIO_SQLLINEAGE_SOURCES
+            """.format(
+                db=conn.account_database,
+                schema=conn.account_schema,
             )
         )
-        status_code = qdc_client.update_lineage_by_id(
-            global_id=update_table_lineage_input.downstream_global_id,
-            payload=update_table_lineage_input.upstreams.as_dict(),
-        )
-        if status_code == 200:
-            req_count += 1
-    logger.info(f"Generating table lineage is finished. {req_count} lineages are ingested.")
+        update_table_lineage_inputs_list = list()
+        sql_lineage = SQLLineage()
+        for result in results:
+            src_tables, dest_table = sql_lineage.get_table_level_lineage_source(
+                sql=result["QUERY_TEXT"],
+                dialect="snowflake",
+                dest_db=result["DATABASE_NAME"],
+                dest_schema=result["SCHEMA_NAME"],
+            )
+            update_table_lineage_inputs = sql_lineage.gen_lineage_input(
+                tenant_id=tenant_id, endpoint=conn.account_id, src_tables=src_tables, dest_table=dest_table
+            )
+            update_table_lineage_inputs_list.append(update_table_lineage_inputs)
+
+        req_count = 0
+        for update_table_lineage_input in update_table_lineage_inputs_list:
+            logger.info(
+                "Generating table lineage. downstream: {db} -> {schema} -> {table}".format(
+                    db=update_table_lineage_input.downstream_database_name,
+                    schema=update_table_lineage_input.downstream_schema_name,
+                    table=update_table_lineage_input.downstream_table_name,
+                )
+            )
+            status_code = qdc_client.update_lineage_by_id(
+                global_id=update_table_lineage_input.downstream_global_id,
+                payload=update_table_lineage_input.upstreams.as_dict(),
+            )
+            if status_code == 200:
+                req_count += 1
+        logger.info(f"Generating table lineage is finished. {req_count} lineages are ingested.")
     return
 
 
-def _get_target_tables_query(db: str, schema: str) -> str:
-    query = """
-    SELECT
-        DISTINCT
-        TABLE_CATALOG
-        , TABLE_SCHEMA
-        , TABLE_NAME
-    FROM
-        {db}.{schema}.QUOLLIO_STATS_PROFILING_COLUMNS
-    """.format(
-        db=db, schema=schema
-    )
-    return query
-
-
-def _get_stats_tables_query(db: str, schema: str) -> str:
-    query = """
-    SELECT
-        DISTINCT
-        TABLE_CATALOG
-        , TABLE_SCHEMA
-        , TABLE_NAME
-    FROM
-        {db}.INFORMATION_SCHEMA.TABLES
-    WHERE
-        startswith(TABLE_NAME, 'QUOLLIO_STATS_COLUMNS_')
-        AND TABLE_SCHEMA = UPPER('{schema}')
-    """.format(
-        db=db, schema=schema
-    )
-    return query
-
-
 def snowflake_table_stats(
     conn: snowflake.SnowflakeConnectionConfig,
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
-    sf_executor = snowflake.SnowflakeQueryExecutor(conn)
-
-    target_query = _get_target_tables_query(
-        db=conn.account_database,
-        schema=conn.account_schema,
-    )
-    target_assets = sf_executor.get_query_results(query=target_query)
-
-    stats_query = _get_stats_tables_query(
-        db=conn.account_database,
-        schema=conn.account_schema,
-    )
-    stats_columns = sf_executor.get_query_results(query=stats_query)
+    with snowflake.SnowflakeQueryExecutor(conn) as sf_executor:
+        stats_query = _gen_get_stats_views_query(
+            db=conn.account_database,
+            schema=conn.account_schema,
+        )
+        stats_views = sf_executor.get_query_results(query=stats_query)
 
-    req_count = 0
-    for target_asset in target_assets:
-        for stats_column in stats_columns:
+        req_count = 0
+        for stats_view in stats_views:
             stats_query = """
             SELECT
                 db_name
@@ -224,18 +180,12 @@ def snowflake_table_stats(
                 , stddev_value
             FROM
                 {db}.{schema}.{table}
-            WHERE
-                db_name = '{target_db}'
-                and schema_name = '{target_schema}'
-                and table_name = '{target_table}'
             """.format(
-                db=stats_column["TABLE_CATALOG"],
-                schema=stats_column["TABLE_SCHEMA"],
-                table=stats_column["TABLE_NAME"],
-                target_db=target_asset["TABLE_CATALOG"],
-                target_schema=target_asset["TABLE_SCHEMA"],
-                target_table=target_asset["TABLE_NAME"],
+                db=stats_view["TABLE_CATALOG"],
+                schema=stats_view["TABLE_SCHEMA"],
+                table=stats_view["TABLE_NAME"],
            )
+            logger.debug(f"The following sql will be fetched to retrieve stats values. {stats_query}")
             stats_result = sf_executor.get_query_results(query=stats_query)
             payloads = gen_table_stats_payload(tenant_id=tenant_id, endpoint=conn.account_id, stats=stats_result)
             for payload in payloads:
@@ -253,4 +203,23 @@ def snowflake_table_stats(
                     )
                 if status_code == 200:
                     req_count += 1
-    logger.info(f"Generating table stats is finished. {req_count} stats are ingested.")
+        logger.info(f"Generating table stats is finished. {req_count} stats are ingested.")
+    return
+
+
+def _gen_get_stats_views_query(db: str, schema: str) -> str:
+    query = """
+    SELECT
+        DISTINCT
+        TABLE_CATALOG
+        , TABLE_SCHEMA
+        , TABLE_NAME
+    FROM
+        {db}.INFORMATION_SCHEMA.TABLES
+    WHERE
+        startswith(TABLE_NAME, 'QUOLLIO_STATS_COLUMNS_')
+        AND TABLE_SCHEMA = UPPER('{schema}')
+    """.format(
+        db=db, schema=schema
+    )
+    return query
quollio_core/profilers/sqllineage.py CHANGED
@@ -54,7 +54,6 @@ class SQLLineage:
         dest_db: str = None,
         dest_schema: str = None,
     ) -> Tuple[Set[Table], Table]:
-        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
         try:
             statement: sqlglot.Expression = sqlglot.parse_one(sql=sql, error_level=sqlglot.ErrorLevel.RAISE)
         except ParseError as e:
quollio_core/redshift.py CHANGED
@@ -4,6 +4,7 @@ import os
 
 from quollio_core.helper.core import setup_dbt_profile
 from quollio_core.helper.env_default import env_default
+from quollio_core.helper.log import set_log_level
 from quollio_core.profilers.redshift import (
     redshift_table_level_lineage,
     redshift_table_level_sqllineage,
@@ -20,8 +21,6 @@ def build_view(
     target_tables: str = "",
     log_level: str = "info",
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
-
     logger.info("Build profiler views using dbt")
     # set parameters
     dbt_client = dbt.DBTClient()
@@ -74,7 +73,6 @@ def load_lineage(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
     logger.info("Generate redshift table to table lineage.")
     redshift_table_level_lineage(
         conn=conn,
@@ -101,7 +99,6 @@ def load_stats(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
 
     logger.info("Generate redshift stats.")
     redshift_table_stats(
@@ -119,7 +116,6 @@ def load_sqllineage(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
 
     logger.info("Generate Redshift sqllineage.")
     redshift_table_level_sqllineage(
@@ -266,6 +262,8 @@ if __name__ == "__main__":
         help="The client secrete that is created on Quollio console to let clients access Quollio External API",
     )
     args = parser.parse_args()
+    set_log_level(level=args.log_level)
+
     conn = redshift.RedshiftConnectionConfig(
         host=args.host,
         build_user=args.build_user,
quollio_core/repository/databricks.py CHANGED
@@ -5,7 +5,7 @@ from typing import Dict, List, Optional
 from databricks.sdk.core import Config, HeaderFactory, oauth_service_principal
 from databricks.sql.client import Connection, connect
 
-logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
+logger = logging.getLogger(__name__)
 
 
 @dataclass
@@ -47,8 +47,8 @@ class DatabricksQueryExecutor:
                 cur.execute(query)
                 result: List[Dict[str, str]] = cur.fetchall()
             except Exception as e:
-                logging.error(query, exc_info=True)
-                logging.error("databricks get_query_results failed. %s", e)
+                logger.error(query, exc_info=True)
+                logger.error("databricks get_query_results failed. %s", e)
                 raise
 
         for row in result:
quollio_core/repository/dbt.py CHANGED
@@ -11,7 +11,6 @@ class DBTClient:
         self.dbt = dbtRunner()
 
     def invoke(self, cmd: str, project_dir: str, profile_dir: str, options: List[str] = None) -> dbtRunnerResult:
-        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
         req = [cmd, "--project-dir", project_dir, "--profiles-dir", profile_dir]
         if options is not None:
             req.extend(options)
quollio_core/repository/qdc.py CHANGED
@@ -25,7 +25,6 @@ class QDCExternalAPIClient:
         Tried to find a package for oauth0 client credentials flow,
         but any of them contains bugs or lacks of features to handle the token refresh when it's expired
         """
-        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
         url = f"{self.base_url}/oauth2/token"
         creds = f"{self.client_id}:{self.client_secret}"
         encoded_creds = base64.b64encode(creds.encode()).decode()
@@ -65,7 +64,6 @@ class QDCExternalAPIClient:
         return session
 
     def update_stats_by_id(self, global_id: str, payload: Dict[str, List[str]]) -> int:
-        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
         self._refresh_token_if_expired()
         headers = {"content-type": "application/json", "authorization": f"Bearer {self.auth_token}"}
         endpoint = f"{self.base_url}/v2/assets/{global_id}/stats"
@@ -85,7 +83,6 @@ class QDCExternalAPIClient:
         return res.status_code
 
     def update_lineage_by_id(self, global_id: str, payload: Dict[str, List[str]]) -> int:
-        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
         self._refresh_token_if_expired()
         headers = {"content-type": "application/json", "authorization": f"Bearer {self.auth_token}"}
         endpoint = f"{self.base_url}/v2/lineage/{global_id}"
quollio_core/repository/redshift.py CHANGED
@@ -67,7 +67,6 @@ class RedshiftQueryExecutor:
         return conn
 
     def get_query_results(self, query: str) -> Tuple[List[str]]:
-        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
         with self.conn.cursor() as cur:
             try:
                 cur.execute(query)
quollio_core/repository/snowflake.py CHANGED
@@ -28,6 +28,12 @@ class SnowflakeQueryExecutor:
     def __init__(self, config: SnowflakeConnectionConfig) -> None:
         self.conn = self.__initialize(config)
 
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.conn.close()
+
     def __initialize(self, config: SnowflakeConnectionConfig) -> SnowflakeConnection:
         conn: SnowflakeConnection = connect(
             user=config.account_user,
@@ -41,7 +47,6 @@ class SnowflakeQueryExecutor:
         return conn
 
     def get_query_results(self, query: str) -> List[Dict[str, str]]:
-        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
         with self.conn.cursor(DictCursor) as cur:
             try:
                 cur.execute(query)
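The new __enter__/__exit__ pair makes SnowflakeQueryExecutor usable as a context manager, which is the pattern the profiler functions above switch to; a minimal sketch (connection values are hypothetical placeholders):

    from quollio_core.repository import snowflake

    conn = snowflake.SnowflakeConnectionConfig(...)  # fill in account_id, account_user, etc.

    with snowflake.SnowflakeQueryExecutor(conn) as sf_executor:
        rows = sf_executor.get_query_results(query="SELECT 1")
    # __exit__ closes the underlying Snowflake connection here, even if an error is raised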
quollio_core/snowflake.py CHANGED
@@ -4,6 +4,7 @@ import os
 
 from quollio_core.helper.core import setup_dbt_profile
 from quollio_core.helper.env_default import env_default
+from quollio_core.helper.log import set_log_level
 from quollio_core.profilers.snowflake import (
     snowflake_column_to_column_lineage,
     snowflake_table_level_sqllineage,
@@ -21,7 +22,6 @@ def build_view(
     target_tables: str = "",
     log_level: str = "info",
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
 
     logger.info("Build profiler views using dbt")
     # set parameters
@@ -74,9 +74,8 @@ def load_lineage(
     tenant_id: str,
     enable_column_lineage: bool = False,
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
-
     logger.info("Generate Snowflake table to table lineage.")
+
     snowflake_table_to_table_lineage(
         conn=conn,
         qdc_client=qdc_client,
@@ -105,7 +104,6 @@ def load_stats(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
 
     logger.info("Generate Snowflake stats.")
     snowflake_table_stats(
@@ -124,7 +122,6 @@ def load_sqllineage(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
 
     logger.info("Generate Snowflake sqllineage.")
     snowflake_table_level_sqllineage(
@@ -279,6 +276,8 @@ if __name__ == "__main__":
         help="Whether to ingest column lineage into QDIC or not. Default value is False",
     )
     args = parser.parse_args()
+    set_log_level(level=args.log_level)
+
     conn = snowflake.SnowflakeConnectionConfig(
         account_id=args.account_id,
         account_user=args.user,
quollio_core-0.4.9.dist-info/METADATA → quollio_core-0.4.11.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: quollio-core
-Version: 0.4.9
+Version: 0.4.11
 Summary: Quollio Core
 Author-email: quollio-dev <qt.dev@quollio.com>
 Maintainer-email: RyoAriyama <ryo.arym@gmail.com>, tharuta <35373297+TakumiHaruta@users.noreply.github.com>
quollio_core-0.4.9.dist-info/RECORD → quollio_core-0.4.11.dist-info/RECORD RENAMED
@@ -1,8 +1,8 @@
-quollio_core/__init__.py,sha256=143doeugqWpXanEUQkIVJmQ5p5zRqMXEa3Ab81BNjFw,83
-quollio_core/bigquery.py,sha256=2DrUMo4evcH4BHiUtnY48IjmsdAsQMoPGtNx8SRoyzQ,3528
-quollio_core/bricks.py,sha256=BVwh9clJMXe_YXbd78ku6Y9470cYYE3CflcuCRzrY3I,9689
-quollio_core/redshift.py,sha256=wVqtNnv1fmZ5QwYq4A2JVLHH3VrCYPEZBx39NoCqRIM,10220
-quollio_core/snowflake.py,sha256=WRtnXxc01AaV4oTa382MTppYvmOn4mOY7jFDNDnzxDA,10922
+quollio_core/__init__.py,sha256=_jIDStu07dkGu5ouTAUr-ImjBx7pmvosgMOUjUT25Pc,84
+quollio_core/bigquery.py,sha256=RguUznaY5YjROzJtXimoS8yCNH9jgGphpzd5v_JgSQM,3884
+quollio_core/bricks.py,sha256=Lehv-qsBSMNNE9BGVvidGOXJsxLSSsbNtmiEZH4lSUg,9458
+quollio_core/redshift.py,sha256=1d-mHnalB1jtiGPgzsGd3lRwLHCxaBJlUMEV2dh4f60,9882
+quollio_core/snowflake.py,sha256=G3tImWbZgMlycYuw1b5WnNBp3zWo3hyrbOX5ARLIs7A,10585
 quollio_core/dbt_projects/databricks/.gitignore,sha256=1jJAyXSzJ3YUm0nx3i7wUSE4RjQMX3ad6F8O88UbtzI,29
 quollio_core/dbt_projects/databricks/README.md,sha256=ZpRQyhFAODAiS8dc1Kb_ndkul4cu4o4udN_EMa49CU4,440
 quollio_core/dbt_projects/databricks/dbt_project.yml,sha256=3sH98RNk7TnphvI3yEdXDstb92kW5BNxr-cT0tXhwzk,480
@@ -64,22 +64,23 @@ quollio_core/dbt_projects/snowflake/snapshots/.gitkeep,sha256=47DEQpj8HBSa-_TImW
 quollio_core/helper/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 quollio_core/helper/core.py,sha256=wbu4FWI7YiFEttXGSuj3tMyAhtPAFlHOjDpWJGNXOHA,1202
 quollio_core/helper/env_default.py,sha256=H6gbSGUPrEDZr4YDrL49hbOpw6RntI4U82kX1q6vUnI,2148
+quollio_core/helper/log.py,sha256=flxyZZ44G79l1TaUp3OT58uCHcnE5z_pCduwoeI6IUs,645
 quollio_core/profilers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 quollio_core/profilers/bigquery.py,sha256=e1Y8cZR-LxI9mSsYb0DurQyy0eCjM_kAKLfvl4IuQLE,3262
-quollio_core/profilers/databricks.py,sha256=aNUIk48GuwSDEnmNzhZ6yv6TqAc1dBJt9e81NiTGBjo,7496
+quollio_core/profilers/databricks.py,sha256=pFWV6ecTefOS8evbnnXTcqR1jmbLsqn1V89ISyM2uqU,7743
 quollio_core/profilers/lineage.py,sha256=4FyxIuPBrUFihqZryqTQBcfB0Z7634lKl_WwkD82vzE,6865
-quollio_core/profilers/redshift.py,sha256=obdHVIsOM1bwHGdvYKalsJcTXwLK02kAKQMSBzSvsDo,7862
-quollio_core/profilers/snowflake.py,sha256=C1LC19ZaUMwNoXjsbnez0xANydJYs8oNRt6tixWKDq8,9090
-quollio_core/profilers/sqllineage.py,sha256=oCyl4tpXL5bkfguXAzTHSB9kZBL3tQK_rfcJ4XQMrLo,5177
+quollio_core/profilers/redshift.py,sha256=6_4amsBL4QW0ZajWhS-TW3f_cjKKa6TpXClMgBC-fZo,6440
+quollio_core/profilers/snowflake.py,sha256=nitlP5pmDm2RhLGO4f_WTzkw41EmOTY2uWN1HZkCHbI,8465
+quollio_core/profilers/sqllineage.py,sha256=XkF7hwDWIGNtyEP5cv2wETBgMfdQxeHolv7qPIkntSQ,5066
 quollio_core/profilers/stats.py,sha256=PG1NbbUSpc1JuEYvBzD66rd24tp0C13_Y5Y7vRjYG1c,4720
 quollio_core/repository/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 quollio_core/repository/bigquery.py,sha256=KMJTeF4OUxtaJt0ymoJ4tkrMKq8yLyMYaMxNvU5yd_Y,2271
-quollio_core/repository/databricks.py,sha256=m68tja5N-QxH3VqEq-mOJKBeR2qldSgj_L9iIxvWwm0,1945
-quollio_core/repository/dbt.py,sha256=HXqW_xa4xYPh9CnKkg4L1gwG3SGjj2BAYoWgzWMFU4U,770
-quollio_core/repository/qdc.py,sha256=VCmzAUvjLemw1os5TaPtfBFkMCOMuPeftjZmUPhFj2Y,4702
-quollio_core/repository/redshift.py,sha256=UVHIpYzDQ2AbBTAGa8DgmEenG0NZsHfYroR1MmEPQGA,2991
-quollio_core/repository/snowflake.py,sha256=1YVMDfb9euJKvikv1pk_IxVF6SVsiemSvZ-WMTSbY7E,1874
-quollio_core-0.4.9.dist-info/LICENSE,sha256=V8j_M8nAz8PvAOZQocyRDX7keai8UJ9skgmnwqETmdY,34520
-quollio_core-0.4.9.dist-info/WHEEL,sha256=EZbGkh7Ie4PoZfRQ8I0ZuP9VklN_TvcZ6DSE5Uar4z4,81
-quollio_core-0.4.9.dist-info/METADATA,sha256=GW_XzITMPnzDHVXIMzgmAkYuzVkW2ZD3uPF89fCxrxA,6803
-quollio_core-0.4.9.dist-info/RECORD,,
+quollio_core/repository/databricks.py,sha256=9Cgdv8qBnVaHqu3RA-IUBieAqb69moQ-KAAMVSf5Ds4,1877
+quollio_core/repository/dbt.py,sha256=cnLwJPywLi8VowVW7zfIBa9jxVwDWO7xzzNRn1vWiuw,659
+quollio_core/repository/qdc.py,sha256=qEpMF6rKdic23dPJoDYmbIcyCKDuSFqbDF2_jqmqoZw,4369
+quollio_core/repository/redshift.py,sha256=p2ouEuYcDCjx1oBhc6H1ekQsvEqHGd3bFu3PW0ngYBc,2880
+quollio_core/repository/snowflake.py,sha256=J9rHshfWdOSnjQWxwGEYPpAU2lY7Tu5UFB_BNakkAX0,1892
+quollio_core-0.4.11.dist-info/LICENSE,sha256=V8j_M8nAz8PvAOZQocyRDX7keai8UJ9skgmnwqETmdY,34520
+quollio_core-0.4.11.dist-info/WHEEL,sha256=EZbGkh7Ie4PoZfRQ8I0ZuP9VklN_TvcZ6DSE5Uar4z4,81
+quollio_core-0.4.11.dist-info/METADATA,sha256=0tv9AKZYWKt9yPa9iaziHZyTzhj708v-Aq22MFEbYAs,6804
+quollio_core-0.4.11.dist-info/RECORD,,