quollio-core 0.4.7__py3-none-any.whl → 0.4.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,7 +14,6 @@ def redshift_table_level_lineage(
     tenant_id: str,
     dbt_table_name: str,
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
     with redshift.RedshiftQueryExecutor(config=conn) as redshift_executor:
         results = redshift_executor.get_query_results(
             query="""
@@ -55,22 +54,7 @@ def redshift_table_level_lineage(
     return


-def _get_target_tables_query(db: str, schema: str) -> str:
-    query = """
-    SELECT
-        DISTINCT
-        database_name
-        , schema_name
-        , table_name
-    FROM
-        {db}.{schema}.quollio_stats_profiling_columns
-    """.format(
-        db=db, schema=schema
-    )
-    return query
-
-
-def _get_stats_tables_query(db: str, schema: str) -> str:
+def _gen_get_stats_views_query(db: str, schema: str) -> str:
     query = """
     SELECT
         DISTINCT
@@ -93,70 +77,54 @@ def redshift_table_stats(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")

     with redshift.RedshiftQueryExecutor(config=conn) as redshift_executor:
-        req_count = 0
-        target_query = _get_target_tables_query(
+        stats_query = _gen_get_stats_views_query(
             db=conn.database,
             schema=conn.schema,
         )
-        target_assets = redshift_executor.get_query_results(query=target_query)
+        stats_views = redshift_executor.get_query_results(query=stats_query)

-        stats_query = _get_stats_tables_query(
-            db=conn.database,
-            schema=conn.schema,
-        )
-        stats_columns = redshift_executor.get_query_results(query=stats_query)
-        for target_asset in target_assets:
-            for stats_column in stats_columns:
-                stats_query = """
-                SELECT
-                    db_name
-                    , schema_name
-                    , table_name
-                    , column_name
-                    , max_value
-                    , min_value
-                    , null_count
-                    , cardinality
-                    , avg_value
-                    , median_value
-                    , mode_value
-                    , stddev_value
-                FROM
-                    {db}.{schema}.{table}
-                WHERE
-                    db_name = '{target_db}'
-                    and schema_name = '{target_schema}'
-                    and table_name = '{target_table}'
-                """.format(
-                    db=stats_column[0],
-                    schema=stats_column[1],
-                    table=stats_column[2],
-                    target_db=target_asset[0],
-                    target_schema=target_asset[1],
-                    target_table=target_asset[2],
+        req_count = 0
+        for stats_view in stats_views:
+            stats_query = """
+            SELECT
+                db_name
+                , schema_name
+                , table_name
+                , column_name
+                , max_value
+                , min_value
+                , null_count
+                , cardinality
+                , avg_value
+                , median_value
+                , mode_value
+                , stddev_value
+            FROM
+                {db}.{schema}.{table}
+            """.format(
+                db=stats_view[0],
+                schema=stats_view[1],
+                table=stats_view[2],
+            )
+            stats_result = redshift_executor.get_query_results(query=stats_query)
+            payloads = gen_table_stats_payload_from_tuple(tenant_id=tenant_id, endpoint=conn.host, stats=stats_result)
+            for payload in payloads:
+                logger.info(
+                    "Generating table stats. asset: {db} -> {schema} -> {table} -> {column}".format(
+                        db=payload.db,
+                        schema=payload.schema,
+                        table=payload.table,
+                        column=payload.column,
+                    )
                 )
-                stats_result = redshift_executor.get_query_results(query=stats_query)
-                payloads = gen_table_stats_payload_from_tuple(
-                    tenant_id=tenant_id, endpoint=conn.host, stats=stats_result
+                status_code = qdc_client.update_stats_by_id(
+                    global_id=payload.global_id,
+                    payload=payload.body.get_column_stats(),
                 )
-                for payload in payloads:
-                    logger.info(
-                        "Generating table stats. asset: {db} -> {schema} -> {table} -> {column}".format(
-                            db=payload.db,
-                            schema=payload.schema,
-                            table=payload.table,
-                            column=payload.column,
-                        )
-                    )
-                    status_code = qdc_client.update_stats_by_id(
-                        global_id=payload.global_id,
-                        payload=payload.body.get_column_stats(),
-                    )
-                    if status_code == 200:
-                        req_count += 1
+                if status_code == 200:
+                    req_count += 1
     logger.info(f"Generating table stats is finished. {req_count} stats are ingested.")
     return
 
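Note: the `redshift_table_stats` rewrite above replaces the nested loop over profiling targets and stats views (one `WHERE`-filtered query per combination) with a single pass over the stats views, ingesting every row each view returns. A rough illustration of the reduction in issued queries (hypothetical counts, not values from this diff):

```python
# Illustrative arithmetic only: example numbers, not taken from the package.
profiled_tables, stats_views = 50, 10
old_queries = profiled_tables * stats_views  # one filtered query per combination: 500
new_queries = stats_views                    # one unfiltered query per view: 10
print(old_queries, new_queries)
```
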
@@ -166,7 +134,6 @@ def redshift_table_level_sqllineage(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
     redshift_connector = redshift.RedshiftQueryExecutor(conn)
     results = redshift_connector.get_query_results(
         query="""
@@ -17,42 +17,41 @@ def snowflake_table_to_table_lineage(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
-    sf_executor = snowflake.SnowflakeQueryExecutor(conn)
-    results = sf_executor.get_query_results(
-        query="""
-        SELECT
-            *
-        FROM
-            {db}.{schema}.QUOLLIO_LINEAGE_TABLE_LEVEL
-        """.format(
-            db=conn.account_database,
-            schema=conn.account_schema,
-        )
-    )
-    parsed_results = parse_snowflake_results(results=results)
-    update_table_lineage_inputs = gen_table_lineage_payload(
-        tenant_id=tenant_id,
-        endpoint=conn.account_id,
-        tables=parsed_results,
-    )
-
-    req_count = 0
-    for update_table_lineage_input in update_table_lineage_inputs:
-        logger.info(
-            "Generating table lineage. downstream: {db} -> {schema} -> {table}".format(
-                db=update_table_lineage_input.downstream_database_name,
-                schema=update_table_lineage_input.downstream_schema_name,
-                table=update_table_lineage_input.downstream_table_name,
+    with snowflake.SnowflakeQueryExecutor(conn) as sf_executor:
+        results = sf_executor.get_query_results(
+            query="""
+            SELECT
+                *
+            FROM
+                {db}.{schema}.QUOLLIO_LINEAGE_TABLE_LEVEL
+            """.format(
+                db=conn.account_database,
+                schema=conn.account_schema,
             )
         )
-        status_code = qdc_client.update_lineage_by_id(
-            global_id=update_table_lineage_input.downstream_global_id,
-            payload=update_table_lineage_input.upstreams.as_dict(),
+        parsed_results = parse_snowflake_results(results=results)
+        update_table_lineage_inputs = gen_table_lineage_payload(
+            tenant_id=tenant_id,
+            endpoint=conn.account_id,
+            tables=parsed_results,
         )
-        if status_code == 200:
-            req_count += 1
-    logger.info(f"Generating table lineage is finished. {req_count} lineages are ingested.")
+
+        req_count = 0
+        for update_table_lineage_input in update_table_lineage_inputs:
+            logger.info(
+                "Generating table lineage. downstream: {db} -> {schema} -> {table}".format(
+                    db=update_table_lineage_input.downstream_database_name,
+                    schema=update_table_lineage_input.downstream_schema_name,
+                    table=update_table_lineage_input.downstream_table_name,
+                )
+            )
+            status_code = qdc_client.update_lineage_by_id(
+                global_id=update_table_lineage_input.downstream_global_id,
+                payload=update_table_lineage_input.upstreams.as_dict(),
+            )
+            if status_code == 200:
+                req_count += 1
+        logger.info(f"Generating table lineage is finished. {req_count} lineages are ingested.")
     return

@@ -61,41 +60,41 @@ def snowflake_column_to_column_lineage(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-    sf_executor = snowflake.SnowflakeQueryExecutor(conn)
-    results = sf_executor.get_query_results(
-        query="""
-        SELECT
-            *
-        FROM
-            {db}.{schema}.QUOLLIO_LINEAGE_COLUMN_LEVEL
-        """.format(
-            db=conn.account_database,
-            schema=conn.account_schema,
-        )
-    )
-    update_column_lineage_inputs = gen_column_lineage_payload(
-        tenant_id=tenant_id,
-        endpoint=conn.account_id,
-        columns=results,
-    )
-
-    req_count = 0
-    for update_column_lineage_input in update_column_lineage_inputs:
-        logger.info(
-            "Generating column lineage. downstream: {db} -> {schema} -> {table} -> {column}".format(
-                db=update_column_lineage_input.downstream_database_name,
-                schema=update_column_lineage_input.downstream_schema_name,
-                table=update_column_lineage_input.downstream_table_name,
-                column=update_column_lineage_input.downstream_column_name,
+    with snowflake.SnowflakeQueryExecutor(conn) as sf_executor:
+        results = sf_executor.get_query_results(
+            query="""
+            SELECT
+                *
+            FROM
+                {db}.{schema}.QUOLLIO_LINEAGE_COLUMN_LEVEL
+            """.format(
+                db=conn.account_database,
+                schema=conn.account_schema,
             )
         )
-        status_code = qdc_client.update_lineage_by_id(
-            global_id=update_column_lineage_input.downstream_global_id,
-            payload=update_column_lineage_input.upstreams.as_dict(),
+        update_column_lineage_inputs = gen_column_lineage_payload(
+            tenant_id=tenant_id,
+            endpoint=conn.account_id,
+            columns=results,
         )
-        if status_code == 200:
-            req_count += 1
-    logger.info(f"Generating column lineage is finished. {req_count} lineages are ingested.")
+
+        req_count = 0
+        for update_column_lineage_input in update_column_lineage_inputs:
+            logger.info(
+                "Generating column lineage. downstream: {db} -> {schema} -> {table} -> {column}".format(
+                    db=update_column_lineage_input.downstream_database_name,
+                    schema=update_column_lineage_input.downstream_schema_name,
+                    table=update_column_lineage_input.downstream_table_name,
+                    column=update_column_lineage_input.downstream_column_name,
+                )
+            )
+            status_code = qdc_client.update_lineage_by_id(
+                global_id=update_column_lineage_input.downstream_global_id,
+                payload=update_column_lineage_input.upstreams.as_dict(),
+            )
+            if status_code == 200:
+                req_count += 1
+        logger.info(f"Generating column lineage is finished. {req_count} lineages are ingested.")
     return

 
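Note: 0.4.10 acquires the Snowflake executor through a `with` block instead of a bare constructor call. A minimal sketch of the context-manager protocol this relies on (an assumption for illustration; the `SnowflakeQueryExecutor` implementation itself is not part of this diff):

```python
import logging

logger = logging.getLogger(__name__)


class QueryExecutor:
    """Hypothetical stand-in for an executor used via `with ... as executor:`."""

    def __init__(self, config):
        self.config = config  # a real executor would open a DB connection here

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Runs on normal exit and when the body raises, so the underlying
        # connection is always released; returning None re-raises exceptions.
        logger.debug("connection released")


with QueryExecutor(config={"account": "example"}) as executor:
    pass  # queries would run here
```
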
@@ -104,110 +103,67 @@ def snowflake_table_level_sqllineage(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
-    sf_executor = snowflake.SnowflakeQueryExecutor(conn)
-    results = sf_executor.get_query_results(
-        query="""
-        SELECT
-            database_name
-            , schema_name
-            , query_text
-        FROM
-            {db}.{schema}.QUOLLIO_SQLLINEAGE_SOURCES
-        """.format(
-            db=conn.account_database,
-            schema=conn.account_schema,
-        )
-    )
-    update_table_lineage_inputs_list = list()
-    sql_lineage = SQLLineage()
-    for result in results:
-        src_tables, dest_table = sql_lineage.get_table_level_lineage_source(
-            sql=result["QUERY_TEXT"],
-            dialect="snowflake",
-            dest_db=result["DATABASE_NAME"],
-            dest_schema=result["SCHEMA_NAME"],
-        )
-        update_table_lineage_inputs = sql_lineage.gen_lineage_input(
-            tenant_id=tenant_id, endpoint=conn.account_id, src_tables=src_tables, dest_table=dest_table
-        )
-        update_table_lineage_inputs_list.append(update_table_lineage_inputs)
-
-    req_count = 0
-    for update_table_lineage_input in update_table_lineage_inputs_list:
-        logger.info(
-            "Generating table lineage. downstream: {db} -> {schema} -> {table}".format(
-                db=update_table_lineage_input.downstream_database_name,
-                schema=update_table_lineage_input.downstream_schema_name,
-                table=update_table_lineage_input.downstream_table_name,
+    with snowflake.SnowflakeQueryExecutor(conn) as sf_executor:
+        results = sf_executor.get_query_results(
+            query="""
+            SELECT
+                database_name
+                , schema_name
+                , query_text
+            FROM
+                {db}.{schema}.QUOLLIO_SQLLINEAGE_SOURCES
+            """.format(
+                db=conn.account_database,
+                schema=conn.account_schema,
             )
         )
-        status_code = qdc_client.update_lineage_by_id(
-            global_id=update_table_lineage_input.downstream_global_id,
-            payload=update_table_lineage_input.upstreams.as_dict(),
-        )
-        if status_code == 200:
-            req_count += 1
-    logger.info(f"Generating table lineage is finished. {req_count} lineages are ingested.")
+        update_table_lineage_inputs_list = list()
+        sql_lineage = SQLLineage()
+        for result in results:
+            src_tables, dest_table = sql_lineage.get_table_level_lineage_source(
+                sql=result["QUERY_TEXT"],
+                dialect="snowflake",
+                dest_db=result["DATABASE_NAME"],
+                dest_schema=result["SCHEMA_NAME"],
+            )
+            update_table_lineage_inputs = sql_lineage.gen_lineage_input(
+                tenant_id=tenant_id, endpoint=conn.account_id, src_tables=src_tables, dest_table=dest_table
+            )
+            update_table_lineage_inputs_list.append(update_table_lineage_inputs)
+
+        req_count = 0
+        for update_table_lineage_input in update_table_lineage_inputs_list:
+            logger.info(
+                "Generating table lineage. downstream: {db} -> {schema} -> {table}".format(
+                    db=update_table_lineage_input.downstream_database_name,
+                    schema=update_table_lineage_input.downstream_schema_name,
+                    table=update_table_lineage_input.downstream_table_name,
+                )
+            )
+            status_code = qdc_client.update_lineage_by_id(
+                global_id=update_table_lineage_input.downstream_global_id,
+                payload=update_table_lineage_input.upstreams.as_dict(),
+            )
+            if status_code == 200:
+                req_count += 1
+        logger.info(f"Generating table lineage is finished. {req_count} lineages are ingested.")
     return


-def _get_target_tables_query(db: str, schema: str) -> str:
-    query = """
-    SELECT
-        DISTINCT
-        TABLE_CATALOG
-        , TABLE_SCHEMA
-        , TABLE_NAME
-    FROM
-        {db}.{schema}.QUOLLIO_STATS_PROFILING_COLUMNS
-    """.format(
-        db=db, schema=schema
-    )
-    return query
-
-
-def _get_stats_tables_query(db: str, schema: str) -> str:
-    query = """
-    SELECT
-        DISTINCT
-        TABLE_CATALOG
-        , TABLE_SCHEMA
-        , TABLE_NAME
-    FROM
-        {db}.INFORMATION_SCHEMA.TABLES
-    WHERE
-        startswith(TABLE_NAME, 'QUOLLIO_STATS_COLUMNS_')
-        AND TABLE_SCHEMA = UPPER('{schema}')
-    """.format(
-        db=db, schema=schema
-    )
-    return query
-
-
 def snowflake_table_stats(
     conn: snowflake.SnowflakeConnectionConfig,
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
-    sf_executor = snowflake.SnowflakeQueryExecutor(conn)
-
-    target_query = _get_target_tables_query(
-        db=conn.account_database,
-        schema=conn.account_schema,
-    )
-    target_assets = sf_executor.get_query_results(query=target_query)
-
-    stats_query = _get_stats_tables_query(
-        db=conn.account_database,
-        schema=conn.account_schema,
-    )
-    stats_columns = sf_executor.get_query_results(query=stats_query)
+    with snowflake.SnowflakeQueryExecutor(conn) as sf_executor:
+        stats_query = _gen_get_stats_views_query(
+            db=conn.account_database,
+            schema=conn.account_schema,
+        )
+        stats_views = sf_executor.get_query_results(query=stats_query)

-    req_count = 0
-    for target_asset in target_assets:
-        for stats_column in stats_columns:
+        req_count = 0
+        for stats_view in stats_views:
             stats_query = """
             SELECT
                 db_name
@@ -224,18 +180,12 @@ def snowflake_table_stats(
                 , stddev_value
             FROM
                 {db}.{schema}.{table}
-            WHERE
-                db_name = '{target_db}'
-                and schema_name = '{target_schema}'
-                and table_name = '{target_table}'
             """.format(
-                db=stats_column["TABLE_CATALOG"],
-                schema=stats_column["TABLE_SCHEMA"],
-                table=stats_column["TABLE_NAME"],
-                target_db=target_asset["TABLE_CATALOG"],
-                target_schema=target_asset["TABLE_SCHEMA"],
-                target_table=target_asset["TABLE_NAME"],
+                db=stats_view["TABLE_CATALOG"],
+                schema=stats_view["TABLE_SCHEMA"],
+                table=stats_view["TABLE_NAME"],
             )
+            logger.debug(f"The following sql will be fetched to retrieve stats values. {stats_query}")
             stats_result = sf_executor.get_query_results(query=stats_query)
             payloads = gen_table_stats_payload(tenant_id=tenant_id, endpoint=conn.account_id, stats=stats_result)
             for payload in payloads:
@@ -253,4 +203,23 @@ def snowflake_table_stats(
                 )
                 if status_code == 200:
                     req_count += 1
-    logger.info(f"Generating table stats is finished. {req_count} stats are ingested.")
+        logger.info(f"Generating table stats is finished. {req_count} stats are ingested.")
+    return
+
+
+def _gen_get_stats_views_query(db: str, schema: str) -> str:
+    query = """
+    SELECT
+        DISTINCT
+        TABLE_CATALOG
+        , TABLE_SCHEMA
+        , TABLE_NAME
+    FROM
+        {db}.INFORMATION_SCHEMA.TABLES
+    WHERE
+        startswith(TABLE_NAME, 'QUOLLIO_STATS_COLUMNS_')
+        AND TABLE_SCHEMA = UPPER('{schema}')
+    """.format(
+        db=db, schema=schema
+    )
+    return query
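
Note: the `_gen_get_stats_views_query` helper added above discovers the per-prefix stats views via `INFORMATION_SCHEMA.TABLES` instead of reading a fixed profiling-columns table. Rendering it with hypothetical inputs (the names below are examples, not values from this diff) shows the SQL it emits:

```python
# "ANALYTICS" and "quollio" are hypothetical example names.
print(_gen_get_stats_views_query(db="ANALYTICS", schema="quollio"))
# Emits (whitespace trimmed for readability):
# SELECT
#     DISTINCT
#     TABLE_CATALOG
#     , TABLE_SCHEMA
#     , TABLE_NAME
# FROM
#     ANALYTICS.INFORMATION_SCHEMA.TABLES
# WHERE
#     startswith(TABLE_NAME, 'QUOLLIO_STATS_COLUMNS_')
#     AND TABLE_SCHEMA = UPPER('quollio')
```
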
@@ -54,7 +54,6 @@ class SQLLineage:
         dest_db: str = None,
         dest_schema: str = None,
     ) -> Tuple[Set[Table], Table]:
-        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
         try:
             statement: sqlglot.Expression = sqlglot.parse_one(sql=sql, error_level=sqlglot.ErrorLevel.RAISE)
         except ParseError as e:
quollio_core/redshift.py CHANGED
@@ -4,6 +4,7 @@ import os

 from quollio_core.helper.core import setup_dbt_profile
 from quollio_core.helper.env_default import env_default
+from quollio_core.helper.log import set_log_level
 from quollio_core.profilers.redshift import (
     redshift_table_level_lineage,
     redshift_table_level_sqllineage,
@@ -16,12 +17,10 @@ logger = logging.getLogger(__name__)

 def build_view(
     conn: redshift.RedshiftConnectionConfig,
-    skip_heavy: bool,
+    aggregate_all: bool = False,
     target_tables: str = "",
     log_level: str = "info",
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
-
     logger.info("Build profiler views using dbt")
     # set parameters
     dbt_client = dbt.DBTClient()
@@ -29,9 +28,9 @@ def build_view(
     project_path = f"{current_dir}/dbt_projects/redshift"
     template_path = f"{current_dir}/dbt_projects/redshift/profiles"
     template_name = "profiles_template.yml"
-    options = '{{"query_user": {query_user}, "skip_heavy": {skip_heavy}, "target_database": {database}}}'.format(
+    options = '{{"query_user": {query_user}, "aggregate_all": {aggregate_all}, "target_database": {database}}}'.format(
         query_user=conn.query_user,
-        skip_heavy=skip_heavy,
+        aggregate_all=aggregate_all,
         database=conn.database,
     )

@@ -74,7 +73,6 @@ def load_lineage(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
     logger.info("Generate redshift table to table lineage.")
     redshift_table_level_lineage(
         conn=conn,
@@ -101,7 +99,6 @@ def load_stats(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")

     logger.info("Generate redshift stats.")
     redshift_table_stats(
@@ -119,7 +116,6 @@ def load_sqllineage(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")

     logger.info("Generate Redshift sqllineage.")
     redshift_table_level_sqllineage(
@@ -210,12 +206,12 @@ if __name__ == "__main__":
         help="Target schema name where the views are built by dbt",
     )
     parser.add_argument(
-        "--skip_heavy",
+        "--aggregate_all",
         type=bool,
-        action=env_default("REDSHIFT_SKIP_HEAVY"),
-        default=True,
+        action=env_default("REDSHIFT_AGGREGATE_ALL", store_true=True),
+        default=False,
         required=False,
-        help="Skip heavy queries when building views by dbt",
+        help="Aggregate all stats values. False by default.",
     )
     parser.add_argument(
         "--target_tables",
@@ -266,6 +262,8 @@ if __name__ == "__main__":
         help="The client secrete that is created on Quollio console to let clients access Quollio External API",
     )
     args = parser.parse_args()
+    set_log_level(level=args.log_level)
+
     conn = redshift.RedshiftConnectionConfig(
         host=args.host,
         build_user=args.build_user,
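
Note: log configuration now happens once at the entry point via `set_log_level(level=args.log_level)`. The helper's body (`quollio_core/helper/log.py`) is not shown in this diff; a plausible sketch, reusing the format string the removed `logging.basicConfig` calls used:

```python
import logging


def set_log_level(level: str) -> None:
    # Map a CLI string such as "info" or "debug" onto the root logger.
    # A sketch only; the actual helper may differ.
    numeric_level = getattr(logging, level.upper(), logging.INFO)
    logging.basicConfig(
        level=numeric_level,
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    )
```
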
@@ -283,7 +281,7 @@ if __name__ == "__main__":
     if "build_view" in args.commands:
         build_view(
             conn=conn,
-            skip_heavy=args.skip_heavy,
+            aggregate_all=args.aggregate_all,
             target_tables=args.target_tables,
             log_level=args.log_level,
         )
@@ -5,7 +5,7 @@ from typing import Dict, List, Optional
 from databricks.sdk.core import Config, HeaderFactory, oauth_service_principal
 from databricks.sql.client import Connection, connect

-logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
+logger = logging.getLogger(__name__)


 @dataclass
@@ -47,8 +47,8 @@ class DatabricksQueryExecutor:
             cur.execute(query)
             result: List[Dict[str, str]] = cur.fetchall()
         except Exception as e:
-            logging.error(query, exc_info=True)
-            logging.error("databricks get_query_results failed. %s", e)
+            logger.error(query, exc_info=True)
+            logger.error("databricks get_query_results failed. %s", e)
             raise

         for row in result:
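
Note: the per-function `logging.basicConfig` calls removed throughout this release were effectively dead code: `basicConfig` configures the root logger only on the first call that finds no handlers and silently does nothing afterwards, and the first such call pinned the level to INFO regardless of the user's choice. The replacement pattern, as in the hunk above, is a module-level named logger:

```python
import logging

logging.basicConfig(level=logging.INFO)
logging.basicConfig(level=logging.DEBUG)  # no effect: the root logger already has a handler

logger = logging.getLogger(__name__)      # named logger, as introduced in this diff
logger.debug("suppressed: the effective level is still INFO")
logger.info("emitted")
```
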
@@ -11,7 +11,6 @@ class DBTClient:
         self.dbt = dbtRunner()

     def invoke(self, cmd: str, project_dir: str, profile_dir: str, options: List[str] = None) -> dbtRunnerResult:
-        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
         req = [cmd, "--project-dir", project_dir, "--profiles-dir", profile_dir]
         if options is not None:
             req.extend(options)
@@ -25,7 +25,6 @@ class QDCExternalAPIClient:
         Tried to find a package for oauth0 client credentials flow,
         but any of them contains bugs or lacks of features to handle the token refresh when it's expired
         """
-        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
         url = f"{self.base_url}/oauth2/token"
         creds = f"{self.client_id}:{self.client_secret}"
         encoded_creds = base64.b64encode(creds.encode()).decode()
@@ -65,7 +64,6 @@ class QDCExternalAPIClient:
         return session

     def update_stats_by_id(self, global_id: str, payload: Dict[str, List[str]]) -> int:
-        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
         self._refresh_token_if_expired()
         headers = {"content-type": "application/json", "authorization": f"Bearer {self.auth_token}"}
         endpoint = f"{self.base_url}/v2/assets/{global_id}/stats"
@@ -85,7 +83,6 @@ class QDCExternalAPIClient:
         return res.status_code

     def update_lineage_by_id(self, global_id: str, payload: Dict[str, List[str]]) -> int:
-        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
         self._refresh_token_if_expired()
         headers = {"content-type": "application/json", "authorization": f"Bearer {self.auth_token}"}
         endpoint = f"{self.base_url}/v2/lineage/{global_id}"
@@ -67,7 +67,6 @@ class RedshiftQueryExecutor:
         return conn

     def get_query_results(self, query: str) -> Tuple[List[str]]:
-        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
         with self.conn.cursor() as cur:
             try:
                 cur.execute(query)