quollio-core 0.4.4__py3-none-any.whl → 0.4.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. quollio_core/__init__.py +1 -1
  2. quollio_core/bigquery.py +123 -0
  3. quollio_core/bricks.py +288 -0
  4. quollio_core/dbt_projects/databricks/.gitignore +4 -0
  5. quollio_core/dbt_projects/databricks/README.md +5 -0
  6. quollio_core/dbt_projects/databricks/analyses/.gitkeep +0 -0
  7. quollio_core/dbt_projects/databricks/dbt_project.yml +21 -0
  8. quollio_core/dbt_projects/databricks/macros/.gitkeep +0 -0
  9. quollio_core/dbt_projects/databricks/models/quollio_lineage_column_level.sql +73 -0
  10. quollio_core/dbt_projects/databricks/models/quollio_lineage_column_level.yml +14 -0
  11. quollio_core/dbt_projects/databricks/models/quollio_lineage_table_level.sql +63 -0
  12. quollio_core/dbt_projects/databricks/models/quollio_lineage_table_level.yml +11 -0
  13. quollio_core/dbt_projects/databricks/models/sources.yml +84 -0
  14. quollio_core/dbt_projects/databricks/package-lock.yml +14 -0
  15. quollio_core/dbt_projects/databricks/packages.yml +13 -0
  16. quollio_core/dbt_projects/databricks/profiles/profiles_template.yml +14 -0
  17. quollio_core/dbt_projects/databricks/seeds/.gitkeep +0 -0
  18. quollio_core/dbt_projects/databricks/snapshots/.gitkeep +0 -0
  19. quollio_core/dbt_projects/redshift/dbt_project.yml +1 -1
  20. quollio_core/dbt_projects/redshift/macros/materialization/divided_view.sql +101 -34
  21. quollio_core/dbt_projects/redshift/models/quollio_stats_columns.sql +1 -2
  22. quollio_core/dbt_projects/redshift/package-lock.yml +1 -1
  23. quollio_core/dbt_projects/seeds/.gitkeep +0 -0
  24. quollio_core/dbt_projects/snowflake/macros/materialization/divided_view.sql +50 -27
  25. quollio_core/dbt_projects/snowflake/models/quollio_stats_columns.sql +1 -2
  26. quollio_core/dbt_projects/snowflake/models/quollio_stats_profiling_columns.sql +57 -20
  27. quollio_core/helper/core.py +4 -0
  28. quollio_core/helper/env_default.py +28 -2
  29. quollio_core/helper/log.py +17 -0
  30. quollio_core/profilers/bigquery.py +81 -0
  31. quollio_core/profilers/databricks.py +198 -0
  32. quollio_core/profilers/lineage.py +26 -0
  33. quollio_core/profilers/redshift.py +41 -74
  34. quollio_core/profilers/snowflake.py +138 -169
  35. quollio_core/profilers/sqllineage.py +0 -1
  36. quollio_core/profilers/stats.py +0 -1
  37. quollio_core/redshift.py +15 -18
  38. quollio_core/repository/bigquery.py +61 -0
  39. quollio_core/repository/databricks.py +62 -0
  40. quollio_core/repository/dbt.py +0 -1
  41. quollio_core/repository/qdc.py +0 -3
  42. quollio_core/repository/redshift.py +0 -1
  43. quollio_core/repository/snowflake.py +6 -1
  44. quollio_core/snowflake.py +29 -16
  45. {quollio_core-0.4.4.dist-info → quollio_core-0.4.10.dist-info}/METADATA +11 -2
  46. {quollio_core-0.4.4.dist-info → quollio_core-0.4.10.dist-info}/RECORD +48 -25
  47. {quollio_core-0.4.4.dist-info → quollio_core-0.4.10.dist-info}/LICENSE +0 -0
  48. {quollio_core-0.4.4.dist-info → quollio_core-0.4.10.dist-info}/WHEEL +0 -0
quollio_core/profilers/bigquery.py
@@ -0,0 +1,81 @@
+ import logging
+ from typing import Any, Dict, List
+
+ from quollio_core.profilers.lineage import gen_table_lineage_payload, parse_bigquery_table_lineage
+ from quollio_core.repository import qdc
+ from quollio_core.repository.bigquery import BigQueryClient, GCPLineageClient, get_entitiy_reference, get_search_request
+
+ logger = logging.getLogger(__name__)
+
+
+ def bigquery_table_lineage(
+     qdc_client: qdc.QDCExternalAPIClient,
+     tenant_id: str,
+     project_id: str,
+     regions: list,
+     org_id: str,
+     credentials: Any,
+ ):
+     lineage_client = GCPLineageClient(credentials)
+     bq_client = BigQueryClient(credentials)
+
+     datasets = bq_client.list_datasets(project_id)
+     all_tables = generate_table_list(datasets, bq_client)
+     lineage_links = generate_lineage_links(all_tables, lineage_client, project_id, regions)
+     lineage_links = parse_bigquery_table_lineage(lineage_links)
+
+     update_table_lineage_inputs = gen_table_lineage_payload(tenant_id=tenant_id, endpoint=org_id, tables=lineage_links)
+
+     req_count = 0
+     for update_table_lineage_input in update_table_lineage_inputs:
+         logger.info(
+             "Generating table lineage. downstream: %s -> %s-> %s",
+             update_table_lineage_input.downstream_database_name,
+             update_table_lineage_input.downstream_schema_name,
+             update_table_lineage_input.downstream_table_name,
+         )
+         status_code = qdc_client.update_lineage_by_id(
+             global_id=update_table_lineage_input.downstream_global_id,
+             payload=update_table_lineage_input.upstreams.as_dict(),
+         )
+         if status_code == 200:
+             req_count += 1
+     logger.info("Generating table lineage is finished. %s lineages are ingested.", req_count)
+
+
+ def generate_table_list(datasets: List[str], bq_client: BigQueryClient) -> List[str]:
+     all_tables = []
+     for dataset in datasets:
+         all_tables.extend(
+             [
+                 table
+                 for table in bq_client.list_tables(dataset.dataset_id)
+                 if table.table_type in ["TABLE", "VIEW", "MATERIALIZED_VIEW"]
+             ]
+         )
+
+     all_table_names = []
+     for table in all_tables:
+         all_table_names.append(f"{table.project}.{table.dataset_id}.{table.table_id}")
+
+     return all_table_names
+
+
+ def generate_lineage_links(
+     all_tables: List[str], lineage_client: GCPLineageClient, project_id: str, regions: List[str]
+ ) -> Dict[str, List[str]]:
+     lineage_links = {}
+     for table in all_tables:
+         downstream = get_entitiy_reference()
+         downstream.fully_qualified_name = f"bigquery:{table}"
+
+         for region in regions:
+             request = get_search_request(downstream_table=downstream, project_id=project_id, region=region)
+             response = lineage_client.get_links(request=request)
+             for lineage in response:
+                 target_table = str(lineage.target.fully_qualified_name).replace("bigquery:", "")
+                 if target_table not in lineage_links:
+                     lineage_links[target_table] = []
+                 lineage_links[target_table].append(str(lineage.source.fully_qualified_name).replace("bigquery:", ""))
+
+     return lineage_links
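
For reference, generate_lineage_links returns a plain mapping from each downstream table FQN to the list of upstream FQNs discovered via the Data Catalog lineage links, and bigquery_table_lineage then normalizes it with parse_bigquery_table_lineage (added in profilers/lineage.py, below) before building QDC payloads. A minimal sketch of that intermediate shape, using hypothetical project, dataset, and table names:

    # Hypothetical output of generate_lineage_links (all names are placeholders).
    lineage_links = {
        "my-project.mart.daily_sales": [
            "my-project.raw.orders",
            "my-project.raw.customers",
        ],
    }
    # parse_bigquery_table_lineage(lineage_links) would then yield:
    # [{"DOWNSTREAM_TABLE_NAME": "my-project.mart.daily_sales",
    #   "UPSTREAM_TABLES": [{"upstream_object_name": "my-project.raw.orders"},
    #                       {"upstream_object_name": "my-project.raw.customers"}]}]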
quollio_core/profilers/databricks.py
@@ -0,0 +1,198 @@
+ import logging
+ from typing import Dict, List
+
+ from quollio_core.profilers.lineage import (
+     gen_column_lineage_payload,
+     gen_table_lineage_payload,
+     parse_databricks_table_lineage,
+ )
+ from quollio_core.profilers.stats import gen_table_stats_payload
+ from quollio_core.repository import databricks, qdc
+
+ logger = logging.getLogger(__name__)
+
+
+ def databricks_table_level_lineage(
+     conn: databricks.DatabricksConnectionConfig,
+     endpoint: str,
+     qdc_client: qdc.QDCExternalAPIClient,
+     tenant_id: str,
+     dbt_table_name: str = "quollio_lineage_table_level",
+ ) -> None:
+     with databricks.DatabricksQueryExecutor(config=conn) as databricks_executor:
+         results = databricks_executor.get_query_results(
+             query=f"""
+             SELECT
+                 DOWNSTREAM_TABLE_NAME,
+                 UPSTREAM_TABLES
+             FROM {conn.catalog}.{conn.schema}.{dbt_table_name}
+             """
+         )
+     tables = parse_databricks_table_lineage(results)
+     update_table_lineage_inputs = gen_table_lineage_payload(
+         tenant_id=tenant_id,
+         endpoint=endpoint,
+         tables=tables,
+     )
+
+     req_count = 0
+     for update_table_lineage_input in update_table_lineage_inputs:
+         logger.info(
+             "Generating table lineage. downstream: %s -> %s-> %s",
+             update_table_lineage_input.downstream_database_name,
+             update_table_lineage_input.downstream_schema_name,
+             update_table_lineage_input.downstream_table_name,
+         )
+         status_code = qdc_client.update_lineage_by_id(
+             global_id=update_table_lineage_input.downstream_global_id,
+             payload=update_table_lineage_input.upstreams.as_dict(),
+         )
+         if status_code == 200:
+             req_count += 1
+     logger.info("Generating table lineage is finished. %s lineages are ingested.", req_count)
+     return
+
+
+ def databricks_column_level_lineage(
+     conn: databricks.DatabricksConnectionConfig,
+     endpoint: str,
+     qdc_client: qdc.QDCExternalAPIClient,
+     tenant_id: str,
+     dbt_table_name: str = "quollio_lineage_column_level",
+ ) -> None:
+     with databricks.DatabricksQueryExecutor(config=conn) as databricks_executor:
+         results = databricks_executor.get_query_results(
+             query=f"""
+             SELECT
+                 *
+             FROM
+                 {conn.catalog}.{conn.schema}.{dbt_table_name}
+             """
+         )
+
+     update_column_lineage_inputs = gen_column_lineage_payload(
+         tenant_id=tenant_id,
+         endpoint=endpoint,
+         columns=results,
+     )
+
+     req_count = 0
+     for update_column_lineage_input in update_column_lineage_inputs:
+         logger.info(
+             "Generating column lineage. downstream: %s -> %s -> %s -> %s",
+             update_column_lineage_input.downstream_database_name,
+             update_column_lineage_input.downstream_schema_name,
+             update_column_lineage_input.downstream_table_name,
+             update_column_lineage_input.downstream_column_name,
+         )
+         status_code = qdc_client.update_lineage_by_id(
+             global_id=update_column_lineage_input.downstream_global_id,
+             payload=update_column_lineage_input.upstreams.as_dict(),
+         )
+         if status_code == 200:
+             req_count += 1
+     logger.info(
+         "Generating column lineage is finished. %s lineages are ingested.",
+         req_count,
+     )
+     return
+
+
+ def _get_monitoring_tables(
+     conn: databricks.DatabricksConnectionConfig, monitoring_table_suffix: str = "_profile_metrics"
+ ) -> List[Dict[str, str]]:
+     tables = []
+     query = f"""
+     SELECT
+         table_catalog,
+         table_schema,
+         table_name,
+         CONCAT(table_catalog, '.', table_schema, '.', table_name) AS table_fqdn
+     FROM
+         system.information_schema.tables
+     WHERE
+         table_name LIKE "%{monitoring_table_suffix}"
+         AND table_name NOT LIKE ('quollio_%')
+     """
+     with databricks.DatabricksQueryExecutor(config=conn) as databricks_executor:
+         tables = databricks_executor.get_query_results(query)
+     if len(tables) > 0:
+         logger.info("Found %s monitoring tables.", len(tables))
+         return tables
+     else:
+         logger.info("No monitoring tables found.")
+         return []
+
+
+ def _get_column_stats(
+     conn: databricks.DatabricksConnectionConfig, monitoring_table_suffix: str = "_profile_metrics"
+ ) -> List[Dict[str, str]]:
+     tables = _get_monitoring_tables(conn, monitoring_table_suffix)
+     if not tables:
+         return []
+     stats = []
+     for table in tables:
+         monitored_table = table["table_fqdn"].removesuffix("_profile_metrics")
+         monitored_table = monitored_table.split(".")
+         if len(monitored_table) != 3:
+             raise ValueError(f"Invalid table name: {table['table_fqdn']}")
+         with databricks.DatabricksQueryExecutor(config=conn) as databricks_executor:
+             query = """
+             SELECT
+                 "{monitored_table_catalog}" as DB_NAME,
+                 "{monitored_table_schema}" as SCHEMA_NAME,
+                 "{monitored_table_name}" as TABLE_NAME,
+                 t.COLUMN_NAME,
+                 t.DATA_TYPE,
+                 t.distinct_count as CARDINALITY,
+                 t.MAX as MAX_VALUE,
+                 t.MIN as MIN_VALUE,
+                 t.AVG as AVG_VALUE,
+                 t.MEDIAN as MEDIAN_VALUE,
+                 t.STDDEV as STDDEV_VALUE,
+                 t.NUM_NULLS as NULL_COUNT,
+                 t.frequent_items[0].item AS MODE_VALUE,
+                 MAX(t.window) AS LATEST
+             FROM
+                 {monitoring_table} t
+             WHERE
+                 t.column_name not in (':table')
+             GROUP BY
+                 t.COLUMN_NAME,
+                 t.DATA_TYPE,
+                 t.distinct_count,
+                 t.MAX,
+                 t.MIN,
+                 t.AVG,
+                 t.MEDIAN,
+                 t.STDDEV,
+                 t.NUM_NULLS,
+                 t.frequent_items
+             """.format(
+                 monitoring_table=table["table_fqdn"],
+                 monitored_table_catalog=monitored_table[0],
+                 monitored_table_schema=monitored_table[1],
+                 monitored_table_name=monitored_table[2],
+             )
+             stats.append(databricks_executor.get_query_results(query))
+     return stats
+
+
+ def databricks_column_stats(
+     conn: databricks.DatabricksConnectionConfig,
+     endpoint: str,
+     qdc_client: qdc.QDCExternalAPIClient,
+     tenant_id: str,
+     monitoring_table_suffix: str = "_profile_metrics",
+ ) -> None:
+     table_stats = _get_column_stats(conn, monitoring_table_suffix)
+     for table in table_stats:
+         stats = gen_table_stats_payload(tenant_id=tenant_id, endpoint=endpoint, stats=table)
+         for stat in stats:
+             status_code = qdc_client.update_stats_by_id(
+                 global_id=stat.global_id,
+                 payload=stat.body.as_dict(),
+             )
+             if status_code == 200:
+                 logger.info("Stats for %s is successfully ingested.", stat.global_id)
+     return
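
The column-stats path keys off a naming convention: for each monitored table it expects a companion table whose name ends in _profile_metrics (the suffix used by Databricks Lakehouse Monitoring profile-metrics tables), and _get_column_stats recovers the monitored table's catalog, schema, and name by stripping that suffix. A minimal sketch of that mapping, with a hypothetical fully qualified name:

    # Hypothetical monitoring table FQN; mirrors the removesuffix/split logic above.
    fqdn = "main.sales.orders_profile_metrics"
    catalog, schema, table = fqdn.removesuffix("_profile_metrics").split(".")
    # catalog == "main", schema == "sales", table == "orders"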
quollio_core/profilers/lineage.py
@@ -141,3 +141,29 @@ def parse_snowflake_results(results: List[Dict[str, str]]):
          payload["UPSTREAM_TABLES"] = json.loads(result["UPSTREAM_TABLES"])
          payloads.append(payload)
      return payloads
+
+
+ def parse_databricks_table_lineage(results: List) -> List[Dict[str, Dict]]:
+     # Parses results from Quollio Databricks lineage table
+     # Returns tuple of downstream_table_name (0) and upstream_tables (1)
+     payloads = list()
+     for result in results:
+         payload = dict()
+         payload["DOWNSTREAM_TABLE_NAME"] = result["DOWNSTREAM_TABLE_NAME"]
+         payload["UPSTREAM_TABLES"] = json.loads(result["UPSTREAM_TABLES"])
+         payloads.append(payload)
+     return payloads
+
+
+ def parse_bigquery_table_lineage(tables: Dict) -> List[Dict[str, Dict]]:
+     payloads = list()
+     for downstream, upstream in tables.items():
+         payload = {
+             "DOWNSTREAM_TABLE_NAME": "",
+             "UPSTREAM_TABLES": [],
+         }
+         payload["DOWNSTREAM_TABLE_NAME"] = downstream
+         for upstream_table in upstream:
+             payload["UPSTREAM_TABLES"].append({"upstream_object_name": upstream_table})
+         payloads.append(payload)
+     return payloads
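
parse_databricks_table_lineage, like its Snowflake counterpart above, expects each row's UPSTREAM_TABLES value to arrive as a JSON-encoded string and simply decodes it. A minimal sketch with hypothetical row values (the inner JSON structure shown here is an assumption for illustration, not taken from the dbt model):

    # Hypothetical query result row; UPSTREAM_TABLES arrives as a JSON string.
    rows = [{
        "DOWNSTREAM_TABLE_NAME": "main.sales.daily_sales",
        "UPSTREAM_TABLES": '[{"upstream_object_name": "main.raw.orders"}]',
    }]
    # parse_databricks_table_lineage(rows) decodes the JSON, yielding:
    # [{"DOWNSTREAM_TABLE_NAME": "main.sales.daily_sales",
    #   "UPSTREAM_TABLES": [{"upstream_object_name": "main.raw.orders"}]}]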
quollio_core/profilers/redshift.py
@@ -14,7 +14,6 @@ def redshift_table_level_lineage(
      tenant_id: str,
      dbt_table_name: str,
  ) -> None:
-     logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
      with redshift.RedshiftQueryExecutor(config=conn) as redshift_executor:
          results = redshift_executor.get_query_results(
              query="""
@@ -55,22 +54,7 @@
      return


- def _get_target_tables_query(db: str, schema: str) -> str:
-     query = """
-     SELECT
-         DISTINCT
-         database_name
-         , schema_name
-         , table_name
-     FROM
-         {db}.{schema}.quollio_stats_profiling_columns
-     """.format(
-         db=db, schema=schema
-     )
-     return query
-
-
- def _get_stats_tables_query(db: str, schema: str) -> str:
+ def _gen_get_stats_views_query(db: str, schema: str) -> str:
      query = """
      SELECT
          DISTINCT
@@ -93,70 +77,54 @@ def redshift_table_stats(
      qdc_client: qdc.QDCExternalAPIClient,
      tenant_id: str,
  ) -> None:
-     logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")

      with redshift.RedshiftQueryExecutor(config=conn) as redshift_executor:
-         req_count = 0
-         target_query = _get_target_tables_query(
+         stats_query = _gen_get_stats_views_query(
              db=conn.database,
              schema=conn.schema,
          )
-         target_assets = redshift_executor.get_query_results(query=target_query)
+         stats_views = redshift_executor.get_query_results(query=stats_query)

-         stats_query = _get_stats_tables_query(
-             db=conn.database,
-             schema=conn.schema,
-         )
-         stats_columns = redshift_executor.get_query_results(query=stats_query)
-         for target_asset in target_assets:
-             for stats_column in stats_columns:
-                 stats_query = """
-                 SELECT
-                     db_name
-                     , schema_name
-                     , table_name
-                     , column_name
-                     , max_value
-                     , min_value
-                     , null_count
-                     , cardinality
-                     , avg_value
-                     , median_value
-                     , mode_value
-                     , stddev_value
-                 FROM
-                     {db}.{schema}.{table}
-                 WHERE
-                     db_name = '{target_db}'
-                     and schema_name = '{target_schema}'
-                     and table_name = '{target_table}'
-                 """.format(
-                     db=stats_column[0],
-                     schema=stats_column[1],
-                     table=stats_column[2],
-                     target_db=target_asset[0],
-                     target_schema=target_asset[1],
-                     target_table=target_asset[2],
+         req_count = 0
+         for stats_view in stats_views:
+             stats_query = """
+             SELECT
+                 db_name
+                 , schema_name
+                 , table_name
+                 , column_name
+                 , max_value
+                 , min_value
+                 , null_count
+                 , cardinality
+                 , avg_value
+                 , median_value
+                 , mode_value
+                 , stddev_value
+             FROM
+                 {db}.{schema}.{table}
+             """.format(
+                 db=stats_view[0],
+                 schema=stats_view[1],
+                 table=stats_view[2],
+             )
+             stats_result = redshift_executor.get_query_results(query=stats_query)
+             payloads = gen_table_stats_payload_from_tuple(tenant_id=tenant_id, endpoint=conn.host, stats=stats_result)
+             for payload in payloads:
+                 logger.info(
+                     "Generating table stats. asset: {db} -> {schema} -> {table} -> {column}".format(
+                         db=payload.db,
+                         schema=payload.schema,
+                         table=payload.table,
+                         column=payload.column,
+                     )
                  )
-                 stats_result = redshift_executor.get_query_results(query=stats_query)
-                 payloads = gen_table_stats_payload_from_tuple(
-                     tenant_id=tenant_id, endpoint=conn.host, stats=stats_result
+                 status_code = qdc_client.update_stats_by_id(
+                     global_id=payload.global_id,
+                     payload=payload.body.get_column_stats(),
                  )
-                 for payload in payloads:
-                     logger.info(
-                         "Generating table stats. asset: {db} -> {schema} -> {table} -> {column}".format(
-                             db=payload.db,
-                             schema=payload.schema,
-                             table=payload.table,
-                             column=payload.column,
-                         )
-                     )
-                     status_code = qdc_client.update_stats_by_id(
-                         global_id=payload.global_id,
-                         payload=payload.body.get_column_stats(),
-                     )
-                     if status_code == 200:
-                         req_count += 1
+                 if status_code == 200:
+                     req_count += 1
      logger.info(f"Generating table stats is finished. {req_count} stats are ingested.")
      return

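
With this hunk, redshift_table_stats iterates over the view list returned by _gen_get_stats_views_query and reads each stats view once, instead of re-querying a stats table per target asset with a WHERE filter. A minimal sketch of the per-view substitution the new loop performs, with placeholder database, schema, and view names:

    # Hypothetical row from the stats-view listing: (database, schema, view name).
    stats_view = ("analytics", "quollio", "quollio_stats_public_orders")
    query = "SELECT db_name, schema_name, table_name, column_name FROM {db}.{schema}.{table}".format(
        db=stats_view[0], schema=stats_view[1], table=stats_view[2]
    )
    # -> "SELECT ... FROM analytics.quollio.quollio_stats_public_orders"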
@@ -166,7 +134,6 @@ def redshift_table_level_sqllineage(
      qdc_client: qdc.QDCExternalAPIClient,
      tenant_id: str,
  ) -> None:
-     logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
      redshift_connector = redshift.RedshiftQueryExecutor(conn)
      results = redshift_connector.get_query_results(
          query="""