quollio-core 0.4.4__py3-none-any.whl → 0.4.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. quollio_core/__init__.py +1 -1
  2. quollio_core/bigquery.py +123 -0
  3. quollio_core/bricks.py +288 -0
  4. quollio_core/dbt_projects/databricks/.gitignore +4 -0
  5. quollio_core/dbt_projects/databricks/README.md +5 -0
  6. quollio_core/dbt_projects/databricks/analyses/.gitkeep +0 -0
  7. quollio_core/dbt_projects/databricks/dbt_project.yml +21 -0
  8. quollio_core/dbt_projects/databricks/macros/.gitkeep +0 -0
  9. quollio_core/dbt_projects/databricks/models/quollio_lineage_column_level.sql +73 -0
  10. quollio_core/dbt_projects/databricks/models/quollio_lineage_column_level.yml +14 -0
  11. quollio_core/dbt_projects/databricks/models/quollio_lineage_table_level.sql +63 -0
  12. quollio_core/dbt_projects/databricks/models/quollio_lineage_table_level.yml +11 -0
  13. quollio_core/dbt_projects/databricks/models/sources.yml +84 -0
  14. quollio_core/dbt_projects/databricks/package-lock.yml +14 -0
  15. quollio_core/dbt_projects/databricks/packages.yml +13 -0
  16. quollio_core/dbt_projects/databricks/profiles/profiles_template.yml +14 -0
  17. quollio_core/dbt_projects/databricks/seeds/.gitkeep +0 -0
  18. quollio_core/dbt_projects/databricks/snapshots/.gitkeep +0 -0
  19. quollio_core/dbt_projects/redshift/dbt_project.yml +1 -1
  20. quollio_core/dbt_projects/redshift/macros/materialization/divided_view.sql +101 -34
  21. quollio_core/dbt_projects/redshift/models/quollio_stats_columns.sql +1 -2
  22. quollio_core/dbt_projects/redshift/package-lock.yml +1 -1
  23. quollio_core/dbt_projects/seeds/.gitkeep +0 -0
  24. quollio_core/dbt_projects/snowflake/macros/materialization/divided_view.sql +50 -27
  25. quollio_core/dbt_projects/snowflake/models/quollio_stats_columns.sql +1 -2
  26. quollio_core/dbt_projects/snowflake/models/quollio_stats_profiling_columns.sql +57 -20
  27. quollio_core/helper/core.py +4 -0
  28. quollio_core/helper/env_default.py +28 -2
  29. quollio_core/helper/log.py +17 -0
  30. quollio_core/profilers/bigquery.py +81 -0
  31. quollio_core/profilers/databricks.py +198 -0
  32. quollio_core/profilers/lineage.py +26 -0
  33. quollio_core/profilers/redshift.py +41 -74
  34. quollio_core/profilers/snowflake.py +138 -169
  35. quollio_core/profilers/sqllineage.py +0 -1
  36. quollio_core/profilers/stats.py +0 -1
  37. quollio_core/redshift.py +15 -18
  38. quollio_core/repository/bigquery.py +61 -0
  39. quollio_core/repository/databricks.py +62 -0
  40. quollio_core/repository/dbt.py +0 -1
  41. quollio_core/repository/qdc.py +0 -3
  42. quollio_core/repository/redshift.py +0 -1
  43. quollio_core/repository/snowflake.py +6 -1
  44. quollio_core/snowflake.py +29 -16
  45. {quollio_core-0.4.4.dist-info → quollio_core-0.4.10.dist-info}/METADATA +11 -2
  46. {quollio_core-0.4.4.dist-info → quollio_core-0.4.10.dist-info}/RECORD +48 -25
  47. {quollio_core-0.4.4.dist-info → quollio_core-0.4.10.dist-info}/LICENSE +0 -0
  48. {quollio_core-0.4.4.dist-info → quollio_core-0.4.10.dist-info}/WHEEL +0 -0
quollio_core/profilers/snowflake.py CHANGED
@@ -17,42 +17,41 @@ def snowflake_table_to_table_lineage(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
-    sf_executor = snowflake.SnowflakeQueryExecutor(conn)
-    results = sf_executor.get_query_results(
-        query="""
-        SELECT
-            *
-        FROM
-            {db}.{schema}.QUOLLIO_LINEAGE_TABLE_LEVEL
-        """.format(
-            db=conn.account_database,
-            schema=conn.account_schema,
-        )
-    )
-    parsed_results = parse_snowflake_results(results=results)
-    update_table_lineage_inputs = gen_table_lineage_payload(
-        tenant_id=tenant_id,
-        endpoint=conn.account_id,
-        tables=parsed_results,
-    )
-
-    req_count = 0
-    for update_table_lineage_input in update_table_lineage_inputs:
-        logger.info(
-            "Generating table lineage. downstream: {db} -> {schema} -> {table}".format(
-                db=update_table_lineage_input.downstream_database_name,
-                schema=update_table_lineage_input.downstream_schema_name,
-                table=update_table_lineage_input.downstream_table_name,
+    with snowflake.SnowflakeQueryExecutor(conn) as sf_executor:
+        results = sf_executor.get_query_results(
+            query="""
+            SELECT
+                *
+            FROM
+                {db}.{schema}.QUOLLIO_LINEAGE_TABLE_LEVEL
+            """.format(
+                db=conn.account_database,
+                schema=conn.account_schema,
             )
         )
-        status_code = qdc_client.update_lineage_by_id(
-            global_id=update_table_lineage_input.downstream_global_id,
-            payload=update_table_lineage_input.upstreams.as_dict(),
+        parsed_results = parse_snowflake_results(results=results)
+        update_table_lineage_inputs = gen_table_lineage_payload(
+            tenant_id=tenant_id,
+            endpoint=conn.account_id,
+            tables=parsed_results,
         )
-        if status_code == 200:
-            req_count += 1
-    logger.info(f"Generating table lineage is finished. {req_count} lineages are ingested.")
+
+        req_count = 0
+        for update_table_lineage_input in update_table_lineage_inputs:
+            logger.info(
+                "Generating table lineage. downstream: {db} -> {schema} -> {table}".format(
+                    db=update_table_lineage_input.downstream_database_name,
+                    schema=update_table_lineage_input.downstream_schema_name,
+                    table=update_table_lineage_input.downstream_table_name,
+                )
+            )
+            status_code = qdc_client.update_lineage_by_id(
+                global_id=update_table_lineage_input.downstream_global_id,
+                payload=update_table_lineage_input.upstreams.as_dict(),
+            )
+            if status_code == 200:
+                req_count += 1
+        logger.info(f"Generating table lineage is finished. {req_count} lineages are ingested.")
     return


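The change above is the pattern repeated throughout this release: executor construction moves into a `with` block so the Snowflake connection is closed even when a query or a QDC API call raises. A minimal before/after sketch of the calling convention (the `__enter__`/`__exit__` support itself is added in the `quollio_core/repository/snowflake.py` hunk at the end of this diff):

```python
# 0.4.4 shape: if get_query_results or a later call raises, the
# connection opened inside SnowflakeQueryExecutor is never closed.
sf_executor = snowflake.SnowflakeQueryExecutor(conn)
results = sf_executor.get_query_results(query="SELECT 1")

# 0.4.10 shape: __exit__ closes the connection on success and on error.
with snowflake.SnowflakeQueryExecutor(conn) as sf_executor:
    results = sf_executor.get_query_results(query="SELECT 1")
```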
@@ -61,41 +60,41 @@ def snowflake_column_to_column_lineage(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-    sf_executor = snowflake.SnowflakeQueryExecutor(conn)
-    results = sf_executor.get_query_results(
-        query="""
-        SELECT
-            *
-        FROM
-            {db}.{schema}.QUOLLIO_LINEAGE_COLUMN_LEVEL
-        """.format(
-            db=conn.account_database,
-            schema=conn.account_schema,
-        )
-    )
-    update_column_lineage_inputs = gen_column_lineage_payload(
-        tenant_id=tenant_id,
-        endpoint=conn.account_id,
-        columns=results,
-    )
-
-    req_count = 0
-    for update_column_lineage_input in update_column_lineage_inputs:
-        logger.info(
-            "Generating column lineage. downstream: {db} -> {schema} -> {table} -> {column}".format(
-                db=update_column_lineage_input.downstream_database_name,
-                schema=update_column_lineage_input.downstream_schema_name,
-                table=update_column_lineage_input.downstream_table_name,
-                column=update_column_lineage_input.downstream_column_name,
+    with snowflake.SnowflakeQueryExecutor(conn) as sf_executor:
+        results = sf_executor.get_query_results(
+            query="""
+            SELECT
+                *
+            FROM
+                {db}.{schema}.QUOLLIO_LINEAGE_COLUMN_LEVEL
+            """.format(
+                db=conn.account_database,
+                schema=conn.account_schema,
             )
         )
-        status_code = qdc_client.update_lineage_by_id(
-            global_id=update_column_lineage_input.downstream_global_id,
-            payload=update_column_lineage_input.upstreams.as_dict(),
+        update_column_lineage_inputs = gen_column_lineage_payload(
+            tenant_id=tenant_id,
+            endpoint=conn.account_id,
+            columns=results,
         )
-        if status_code == 200:
-            req_count += 1
-    logger.info(f"Generating column lineage is finished. {req_count} lineages are ingested.")
+
+        req_count = 0
+        for update_column_lineage_input in update_column_lineage_inputs:
+            logger.info(
+                "Generating column lineage. downstream: {db} -> {schema} -> {table} -> {column}".format(
+                    db=update_column_lineage_input.downstream_database_name,
+                    schema=update_column_lineage_input.downstream_schema_name,
+                    table=update_column_lineage_input.downstream_table_name,
+                    column=update_column_lineage_input.downstream_column_name,
+                )
+            )
+            status_code = qdc_client.update_lineage_by_id(
+                global_id=update_column_lineage_input.downstream_global_id,
+                payload=update_column_lineage_input.upstreams.as_dict(),
+            )
+            if status_code == 200:
+                req_count += 1
+        logger.info(f"Generating column lineage is finished. {req_count} lineages are ingested.")
     return


@@ -104,110 +103,67 @@ def snowflake_table_level_sqllineage(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
-    sf_executor = snowflake.SnowflakeQueryExecutor(conn)
-    results = sf_executor.get_query_results(
-        query="""
-        SELECT
-            database_name
-            , schema_name
-            , query_text
-        FROM
-            {db}.{schema}.QUOLLIO_SQLLINEAGE_SOURCES
-        """.format(
-            db=conn.account_database,
-            schema=conn.account_schema,
-        )
-    )
-    update_table_lineage_inputs_list = list()
-    sql_lineage = SQLLineage()
-    for result in results:
-        src_tables, dest_table = sql_lineage.get_table_level_lineage_source(
-            sql=result["QUERY_TEXT"],
-            dialect="snowflake",
-            dest_db=result["DATABASE_NAME"],
-            dest_schema=result["SCHEMA_NAME"],
-        )
-        update_table_lineage_inputs = sql_lineage.gen_lineage_input(
-            tenant_id=tenant_id, endpoint=conn.account_id, src_tables=src_tables, dest_table=dest_table
-        )
-        update_table_lineage_inputs_list.append(update_table_lineage_inputs)
-
-    req_count = 0
-    for update_table_lineage_input in update_table_lineage_inputs_list:
-        logger.info(
-            "Generating table lineage. downstream: {db} -> {schema} -> {table}".format(
-                db=update_table_lineage_input.downstream_database_name,
-                schema=update_table_lineage_input.downstream_schema_name,
-                table=update_table_lineage_input.downstream_table_name,
+    with snowflake.SnowflakeQueryExecutor(conn) as sf_executor:
+        results = sf_executor.get_query_results(
+            query="""
+            SELECT
+                database_name
+                , schema_name
+                , query_text
+            FROM
+                {db}.{schema}.QUOLLIO_SQLLINEAGE_SOURCES
+            """.format(
+                db=conn.account_database,
+                schema=conn.account_schema,
             )
         )
-        status_code = qdc_client.update_lineage_by_id(
-            global_id=update_table_lineage_input.downstream_global_id,
-            payload=update_table_lineage_input.upstreams.as_dict(),
-        )
-        if status_code == 200:
-            req_count += 1
-    logger.info(f"Generating table lineage is finished. {req_count} lineages are ingested.")
+        update_table_lineage_inputs_list = list()
+        sql_lineage = SQLLineage()
+        for result in results:
+            src_tables, dest_table = sql_lineage.get_table_level_lineage_source(
+                sql=result["QUERY_TEXT"],
+                dialect="snowflake",
+                dest_db=result["DATABASE_NAME"],
+                dest_schema=result["SCHEMA_NAME"],
+            )
+            update_table_lineage_inputs = sql_lineage.gen_lineage_input(
+                tenant_id=tenant_id, endpoint=conn.account_id, src_tables=src_tables, dest_table=dest_table
+            )
+            update_table_lineage_inputs_list.append(update_table_lineage_inputs)
+
+        req_count = 0
+        for update_table_lineage_input in update_table_lineage_inputs_list:
+            logger.info(
+                "Generating table lineage. downstream: {db} -> {schema} -> {table}".format(
+                    db=update_table_lineage_input.downstream_database_name,
+                    schema=update_table_lineage_input.downstream_schema_name,
+                    table=update_table_lineage_input.downstream_table_name,
+                )
+            )
+            status_code = qdc_client.update_lineage_by_id(
+                global_id=update_table_lineage_input.downstream_global_id,
+                payload=update_table_lineage_input.upstreams.as_dict(),
+            )
+            if status_code == 200:
+                req_count += 1
+        logger.info(f"Generating table lineage is finished. {req_count} lineages are ingested.")
     return
 
 
-def _get_target_tables_query(db: str, schema: str) -> str:
-    query = """
-    SELECT
-        DISTINCT
-        TABLE_CATALOG
-        , TABLE_SCHEMA
-        , TABLE_NAME
-    FROM
-        {db}.{schema}.QUOLLIO_STATS_PROFILING_COLUMNS
-    """.format(
-        db=db, schema=schema
-    )
-    return query
-
-
-def _get_stats_tables_query(db: str, schema: str) -> str:
-    query = """
-    SELECT
-        DISTINCT
-        TABLE_CATALOG
-        , TABLE_SCHEMA
-        , TABLE_NAME
-    FROM
-        {db}.INFORMATION_SCHEMA.TABLES
-    WHERE
-        startswith(TABLE_NAME, 'QUOLLIO_STATS_COLUMNS_')
-        AND TABLE_SCHEMA = UPPER('{schema}')
-    """.format(
-        db=db, schema=schema
-    )
-    return query
-
-
 def snowflake_table_stats(
     conn: snowflake.SnowflakeConnectionConfig,
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
-    sf_executor = snowflake.SnowflakeQueryExecutor(conn)
-
-    target_query = _get_target_tables_query(
-        db=conn.account_database,
-        schema=conn.account_schema,
-    )
-    target_assets = sf_executor.get_query_results(query=target_query)
-
-    stats_query = _get_stats_tables_query(
-        db=conn.account_database,
-        schema=conn.account_schema,
-    )
-    stats_columns = sf_executor.get_query_results(query=stats_query)
+    with snowflake.SnowflakeQueryExecutor(conn) as sf_executor:
+        stats_query = _gen_get_stats_views_query(
+            db=conn.account_database,
+            schema=conn.account_schema,
+        )
+        stats_views = sf_executor.get_query_results(query=stats_query)
 
-    req_count = 0
-    for target_asset in target_assets:
-        for stats_column in stats_columns:
+        req_count = 0
+        for stats_view in stats_views:
             stats_query = """
             SELECT
                 db_name
@@ -224,18 +180,12 @@ def snowflake_table_stats(
                 , stddev_value
             FROM
                 {db}.{schema}.{table}
-            WHERE
-                db_name = '{target_db}'
-                and schema_name = '{target_schema}'
-                and table_name = '{target_table}'
             """.format(
-                db=stats_column["TABLE_CATALOG"],
-                schema=stats_column["TABLE_SCHEMA"],
-                table=stats_column["TABLE_NAME"],
-                target_db=target_asset["TABLE_CATALOG"],
-                target_schema=target_asset["TABLE_SCHEMA"],
-                target_table=target_asset["TABLE_NAME"],
+                db=stats_view["TABLE_CATALOG"],
+                schema=stats_view["TABLE_SCHEMA"],
+                table=stats_view["TABLE_NAME"],
             )
+            logger.debug(f"The following sql will be fetched to retrieve stats values. {stats_query}")
             stats_result = sf_executor.get_query_results(query=stats_query)
             payloads = gen_table_stats_payload(tenant_id=tenant_id, endpoint=conn.account_id, stats=stats_result)
             for payload in payloads:
@@ -253,4 +203,23 @@ def snowflake_table_stats(
                 )
                 if status_code == 200:
                     req_count += 1
-    logger.info(f"Generating table stats is finished. {req_count} stats are ingested.")
+        logger.info(f"Generating table stats is finished. {req_count} stats are ingested.")
+    return
+
+
+def _gen_get_stats_views_query(db: str, schema: str) -> str:
+    query = """
+    SELECT
+        DISTINCT
+        TABLE_CATALOG
+        , TABLE_SCHEMA
+        , TABLE_NAME
+    FROM
+        {db}.INFORMATION_SCHEMA.TABLES
+    WHERE
+        startswith(TABLE_NAME, 'QUOLLIO_STATS_COLUMNS_')
+        AND TABLE_SCHEMA = UPPER('{schema}')
+    """.format(
+        db=db, schema=schema
+    )
+    return query
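For concreteness, this is the SQL that `_gen_get_stats_views_query` renders for hypothetical inputs `db="QUOLLIO_DB"` and `schema="quollio"` (whitespace trimmed); it lists the per-table stats views created under the `QUOLLIO_STATS_COLUMNS_` prefix:

```python
print(_gen_get_stats_views_query(db="QUOLLIO_DB", schema="quollio"))
# SELECT
#     DISTINCT
#     TABLE_CATALOG
#     , TABLE_SCHEMA
#     , TABLE_NAME
# FROM
#     QUOLLIO_DB.INFORMATION_SCHEMA.TABLES
# WHERE
#     startswith(TABLE_NAME, 'QUOLLIO_STATS_COLUMNS_')
#     AND TABLE_SCHEMA = UPPER('quollio')
```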
quollio_core/profilers/sqllineage.py CHANGED
@@ -54,7 +54,6 @@ class SQLLineage:
         dest_db: str = None,
         dest_schema: str = None,
     ) -> Tuple[Set[Table], Table]:
-        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
         try:
             statement: sqlglot.Expression = sqlglot.parse_one(sql=sql, error_level=sqlglot.ErrorLevel.RAISE)
         except ParseError as e:
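`SQLLineage.get_table_level_lineage_source` builds on sqlglot's parser, as the surviving `parse_one` call shows. A self-contained sketch of the underlying technique, extracting every table referenced by a statement; this is an illustration, not the package's implementation:

```python
import sqlglot
from sqlglot import exp

sql = "INSERT INTO analytics.public.daily_sales SELECT * FROM raw.public.orders"
statement = sqlglot.parse_one(sql=sql, read="snowflake", error_level=sqlglot.ErrorLevel.RAISE)

# find_all(exp.Table) yields the INSERT target and the SELECT sources alike;
# splitting them into (src_tables, dest_table) is what the helper above does.
tables = {t.sql(dialect="snowflake") for t in statement.find_all(exp.Table)}
print(tables)  # {'analytics.public.daily_sales', 'raw.public.orders'}
```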
quollio_core/profilers/stats.py CHANGED
@@ -77,7 +77,6 @@ def gen_table_stats_payload(tenant_id: str, endpoint: str, stats: List[Dict[str,
         table_global_id = new_global_id(
             tenant_id=tenant_id, cluster_id=endpoint, data_id=global_id_arg, data_type="column"
         )
-
         stats_request = StatsRequest(
             global_id=table_global_id,
             db=stat["DB_NAME"],
quollio_core/redshift.py CHANGED
@@ -4,6 +4,7 @@ import os
 
 from quollio_core.helper.core import setup_dbt_profile
 from quollio_core.helper.env_default import env_default
+from quollio_core.helper.log import set_log_level
 from quollio_core.profilers.redshift import (
     redshift_table_level_lineage,
     redshift_table_level_sqllineage,
@@ -16,12 +17,10 @@ logger = logging.getLogger(__name__)
 
 def build_view(
     conn: redshift.RedshiftConnectionConfig,
-    skip_heavy: bool,
+    aggregate_all: bool = False,
     target_tables: str = "",
     log_level: str = "info",
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
-
     logger.info("Build profiler views using dbt")
     # set parameters
     dbt_client = dbt.DBTClient()
@@ -29,9 +28,9 @@ def build_view(
     project_path = f"{current_dir}/dbt_projects/redshift"
     template_path = f"{current_dir}/dbt_projects/redshift/profiles"
     template_name = "profiles_template.yml"
-    options = '{{"query_user": {query_user}, "skip_heavy": {skip_heavy}, "target_database": {database}}}'.format(
+    options = '{{"query_user": {query_user}, "aggregate_all": {aggregate_all}, "target_database": {database}}}'.format(
         query_user=conn.query_user,
-        skip_heavy=skip_heavy,
+        aggregate_all=aggregate_all,
         database=conn.database,
     )
 
@@ -50,11 +49,10 @@ def build_view(
         options=["--no-use-colors", "--log-level", log_level, "--vars", options],
     )
     run_options = ["--no-use-colors", "--log-level", log_level, "--vars", options]
-    target_tables_list = target_tables.split()
-    if target_tables_list is not None:
-        if "quollio_stats_columns" in target_tables_list:
-            target_tables_list.append("quollio_stats_profiling_columns")
-        target_tables_str = " ".join(target_tables_list)
+    if target_tables is not None:
+        if "quollio_stats_columns" in target_tables:
+            target_tables.append("quollio_stats_profiling_columns")
+        target_tables_str = " ".join(target_tables)
         run_options.append("--select")
         run_options.append(target_tables_str)
 
@@ -75,7 +73,6 @@ def load_lineage(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
     logger.info("Generate redshift table to table lineage.")
     redshift_table_level_lineage(
         conn=conn,
@@ -102,7 +99,6 @@ def load_stats(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
 
     logger.info("Generate redshift stats.")
     redshift_table_stats(
@@ -120,7 +116,6 @@ def load_sqllineage(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
 
    logger.info("Generate Redshift sqllineage.")
    redshift_table_level_sqllineage(
@@ -211,12 +206,12 @@ if __name__ == "__main__":
         help="Target schema name where the views are built by dbt",
     )
     parser.add_argument(
-        "--skip_heavy",
+        "--aggregate_all",
         type=bool,
-        action=env_default("REDSHIFT_SKIP_HEAVY"),
-        default=True,
+        action=env_default("REDSHIFT_AGGREGATE_ALL", store_true=True),
+        default=False,
         required=False,
-        help="Skip heavy queries when building views by dbt",
+        help="Aggregate all stats values. False by default.",
     )
     parser.add_argument(
         "--target_tables",
@@ -267,6 +262,8 @@ if __name__ == "__main__":
         help="The client secrete that is created on Quollio console to let clients access Quollio External API",
     )
     args = parser.parse_args()
+    set_log_level(level=args.log_level)
+
     conn = redshift.RedshiftConnectionConfig(
         host=args.host,
         build_user=args.build_user,
@@ -284,7 +281,7 @@ if __name__ == "__main__":
     if "build_view" in args.commands:
         build_view(
             conn=conn,
-            skip_heavy=args.skip_heavy,
+            aggregate_all=args.aggregate_all,
             target_tables=args.target_tables,
             log_level=args.log_level,
         )
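The `--skip_heavy` flag becomes `--aggregate_all`, and `env_default` (extended in `quollio_core/helper/env_default.py`, +28 -2 in this release) gains a `store_true` mode so the flag can also be switched on via `REDSHIFT_AGGREGATE_ALL`. The real helper is not shown in this diff; the sketch below is one plausible shape for such an action and is an assumption throughout:

```python
import argparse
import os


def env_default(envvar: str, store_true: bool = False):
    """Hypothetical reconstruction of the helper's behavior; not the package's code."""

    class EnvDefaultAction(argparse.Action):
        def __init__(self, option_strings, dest, default=None, **kwargs):
            # An environment variable, when present, overrides the declared default.
            if envvar in os.environ:
                raw = os.environ[envvar]
                default = raw.lower() in ("1", "true", "yes") if store_true else raw
            if store_true:
                kwargs["nargs"] = 0  # the CLI flag takes no argument
            super().__init__(option_strings, dest, default=default, **kwargs)

        def __call__(self, parser, namespace, values, option_string=None):
            # Passing the flag on the CLI wins over both env var and default.
            setattr(namespace, self.dest, True if store_true else values)

    return EnvDefaultAction
```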
quollio_core/repository/bigquery.py ADDED
@@ -0,0 +1,61 @@
+import logging
+
+from google.cloud.bigquery import Client
+from google.cloud.datacatalog_lineage_v1 import EntityReference, LineageClient, SearchLinksRequest
+from google.oauth2.service_account import Credentials
+from googleapiclient.discovery import build
+
+logger = logging.getLogger(__name__)
+
+
+class BigQueryClient:
+    def __init__(self, credentials: Credentials) -> None:
+        self.client = self.__initialze(credentials=credentials)
+
+    def __initialze(self, credentials: Credentials) -> Client:
+        client = Client(credentials=credentials)
+        return client
+
+    def list_datasets(self, project_id) -> list:
+        datasets = list(self.client.list_datasets(project_id))
+        logger.debug("Found %s datasets in project %s", len(datasets), project_id)
+        return datasets
+
+    def list_tables(self, dataset_id) -> list:
+        tables = list(self.client.list_tables(dataset_id))
+        logger.debug("Found %s tables in dataset %s", len(tables), dataset_id)
+        return list(self.client.list_tables(dataset_id))
+
+
+class GCPLineageClient:
+    def __init__(self, credentials: Credentials) -> None:
+        self.client = self.__initialze(credentials=credentials)
+
+    def __initialze(self, credentials: Credentials) -> LineageClient:
+        client = LineageClient(credentials=credentials)
+        return client
+
+    def get_links(self, request: SearchLinksRequest) -> list:
+        response = self.client.search_links(request)
+        return response.links
+
+
+def get_entitiy_reference() -> EntityReference:
+    return EntityReference()
+
+
+def get_search_request(downstream_table: EntityReference, project_id: str, region: str) -> SearchLinksRequest:
+    return SearchLinksRequest(target=downstream_table, parent=f"projects/{project_id}/locations/{region.lower()}")
+
+
+def get_credentials(credentials_json: dict) -> Credentials:
+    return Credentials.from_service_account_info(credentials_json)
+
+
+def get_org_id(credentials_json: dict) -> str:
+    credentials = get_credentials(credentials_json)
+    crm_service = build("cloudresourcemanager", "v1", credentials=credentials)
+    project_id = credentials_json["project_id"]
+    project = crm_service.projects().get(projectId=project_id).execute()
+    org_id = project["parent"]["id"]
+    return org_id
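A sketch of how these helpers compose to pull lineage links for one table. The `bigquery:` fully-qualified-name convention comes from the Data Catalog Lineage API; the project, region, table, and the credentials environment variable below are placeholders:

```python
import json
import os

# Hypothetical wiring; assumes a service-account key JSON in an env var.
credentials_json = json.loads(os.environ["GCP_CREDENTIALS_JSON"])
creds = get_credentials(credentials_json)
lineage_client = GCPLineageClient(credentials=creds)

downstream = get_entitiy_reference()
downstream.fully_qualified_name = "bigquery:my-project.my_dataset.my_table"

request = get_search_request(downstream_table=downstream, project_id="my-project", region="us")
for link in lineage_client.get_links(request=request):
    print(link.source.fully_qualified_name, "->", link.target.fully_qualified_name)
```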
quollio_core/repository/databricks.py ADDED
@@ -0,0 +1,62 @@
+import logging
+from dataclasses import asdict, dataclass
+from typing import Dict, List, Optional
+
+from databricks.sdk.core import Config, HeaderFactory, oauth_service_principal
+from databricks.sql.client import Connection, connect
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class DatabricksConnectionConfig:
+    host: str
+    http_path: str
+    client_id: str
+    client_secret: str
+    catalog: str
+    schema: str
+
+    def as_dict(self) -> Dict[str, str]:
+        return asdict(self)
+
+
+class DatabricksQueryExecutor:
+    def __init__(self, config: DatabricksConnectionConfig) -> None:
+        self.config = config
+        self.conn = self.__initialize()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.conn.close()
+
+    def __initialize(self) -> Connection:
+        conn = connect(
+            server_hostname=self.config.host,
+            http_path=self.config.http_path,
+            credentials_provider=self.credential_provider,
+        )
+        return conn
+
+    def get_query_results(self, query: str) -> List[Dict[str, str]]:
+        results_asdict: List[Dict[str, str]] = []
+        with self.conn.cursor() as cur:
+            try:
+                cur.execute(query)
+                result: List[Dict[str, str]] = cur.fetchall()
+            except Exception as e:
+                logger.error(query, exc_info=True)
+                logger.error("databricks get_query_results failed. %s", e)
+                raise
+
+        for row in result:
+            results_asdict.append(row.asDict())
+        return results_asdict
+
+    def credential_provider(self) -> Optional[HeaderFactory]:
+        config = Config(
+            host=f"https://{self.config.host}", client_id=self.config.client_id, client_secret=self.config.client_secret
+        )
+        return oauth_service_principal(config)
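How calling code might use the executor, with placeholder workspace values; `oauth_service_principal` implements Databricks OAuth machine-to-machine authentication, so the client id and secret belong to a service principal that can reach the SQL warehouse:

```python
# Placeholder connection values; DatabricksQueryExecutor is from the file above.
config = DatabricksConnectionConfig(
    host="dbc-xxxxxxxx-xxxx.cloud.databricks.com",
    http_path="/sql/1.0/warehouses/xxxxxxxxxxxxxxxx",
    client_id="<service-principal-client-id>",
    client_secret="<service-principal-secret>",
    catalog="main",
    schema="information_schema",
)

with DatabricksQueryExecutor(config) as executor:
    rows = executor.get_query_results("SELECT 1 AS ok")
    print(rows)  # [{'ok': 1}]
```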
quollio_core/repository/dbt.py CHANGED
@@ -11,7 +11,6 @@ class DBTClient:
         self.dbt = dbtRunner()
 
     def invoke(self, cmd: str, project_dir: str, profile_dir: str, options: List[str] = None) -> dbtRunnerResult:
-        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
         req = [cmd, "--project-dir", project_dir, "--profiles-dir", profile_dir]
         if options is not None:
             req.extend(options)
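`DBTClient.invoke` assembles a CLI-style argument list for dbt's programmatic entry point (available since dbt-core 1.5). Independent of the wrapper, that API looks like this; the project paths are placeholders:

```python
from dbt.cli.main import dbtRunner, dbtRunnerResult

# Equivalent to: dbt run --project-dir <dir> --profiles-dir <dir>
res: dbtRunnerResult = dbtRunner().invoke(
    ["run", "--project-dir", "/path/to/dbt_projects/redshift", "--profiles-dir", "/path/to/profiles"]
)
if not res.success:
    raise RuntimeError(res.exception)
```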
quollio_core/repository/qdc.py CHANGED
@@ -25,7 +25,6 @@ class QDCExternalAPIClient:
         Tried to find a package for oauth0 client credentials flow,
         but any of them contains bugs or lacks of features to handle the token refresh when it's expired
         """
-        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
         url = f"{self.base_url}/oauth2/token"
         creds = f"{self.client_id}:{self.client_secret}"
         encoded_creds = base64.b64encode(creds.encode()).decode()
@@ -65,7 +64,6 @@ class QDCExternalAPIClient:
         return session
 
     def update_stats_by_id(self, global_id: str, payload: Dict[str, List[str]]) -> int:
-        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
         self._refresh_token_if_expired()
         headers = {"content-type": "application/json", "authorization": f"Bearer {self.auth_token}"}
         endpoint = f"{self.base_url}/v2/assets/{global_id}/stats"
@@ -85,7 +83,6 @@ class QDCExternalAPIClient:
         return res.status_code
 
     def update_lineage_by_id(self, global_id: str, payload: Dict[str, List[str]]) -> int:
-        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
         self._refresh_token_if_expired()
         headers = {"content-type": "application/json", "authorization": f"Bearer {self.auth_token}"}
         endpoint = f"{self.base_url}/v2/lineage/{global_id}"
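The first method these deletions touch is `QDCExternalAPIClient`'s token fetch: an OAuth client-credentials exchange against `{base_url}/oauth2/token`, with the id/secret pair Basic-encoded as the surviving `creds`/`encoded_creds` lines show. A hedged sketch of that exchange with `requests`; only the endpoint and the encoding come from the diff, the grant parameters and response shape are assumptions:

```python
import base64

import requests

base_url = "https://api.example.quollio.com"  # placeholder
client_id, client_secret = "my-client-id", "my-client-secret"  # placeholders

encoded_creds = base64.b64encode(f"{client_id}:{client_secret}".encode()).decode()
res = requests.post(
    f"{base_url}/oauth2/token",
    headers={
        "content-type": "application/x-www-form-urlencoded",
        "authorization": f"Basic {encoded_creds}",
    },
    data={"grant_type": "client_credentials"},  # assumed grant type
)
res.raise_for_status()
auth_token = res.json()["access_token"]  # assumed response field
```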
quollio_core/repository/redshift.py CHANGED
@@ -67,7 +67,6 @@ class RedshiftQueryExecutor:
         return conn
 
     def get_query_results(self, query: str) -> Tuple[List[str]]:
-        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
         with self.conn.cursor() as cur:
             try:
                 cur.execute(query)
quollio_core/repository/snowflake.py CHANGED
@@ -28,6 +28,12 @@ class SnowflakeQueryExecutor:
     def __init__(self, config: SnowflakeConnectionConfig) -> None:
         self.conn = self.__initialize(config)
 
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.conn.close()
+
     def __initialize(self, config: SnowflakeConnectionConfig) -> SnowflakeConnection:
         conn: SnowflakeConnection = connect(
             user=config.account_user,
@@ -41,7 +47,6 @@ class SnowflakeQueryExecutor:
         return conn
 
     def get_query_results(self, query: str) -> List[Dict[str, str]]:
-        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
         with self.conn.cursor(DictCursor) as cur:
             try:
                 cur.execute(query)
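These six added lines are what every `with snowflake.SnowflakeQueryExecutor(conn) as ...:` block earlier in this diff relies on. Because `__exit__` returns `None`, exceptions raised inside the block still propagate after the connection is closed; roughly, the desugared form is:

```python
# Rough desugaring of the with-statement, for reference.
executor = SnowflakeQueryExecutor(config)
try:
    results = executor.get_query_results(query="SELECT 1")
finally:
    executor.conn.close()  # what __exit__ does; returning None lets any exception propagate
```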