quollio-core 0.4.11__py3-none-any.whl → 0.4.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- quollio_core/__init__.py +1 -1
- quollio_core/bigquery.py +106 -46
- quollio_core/bricks.py +15 -3
- quollio_core/helper/log_utils.py +48 -0
- quollio_core/profilers/bigquery.py +81 -17
- quollio_core/profilers/databricks.py +45 -39
- quollio_core/profilers/redshift.py +13 -22
- quollio_core/profilers/snowflake.py +7 -21
- quollio_core/profilers/stats.py +78 -17
- quollio_core/redshift.py +22 -2
- quollio_core/repository/bigquery.py +50 -17
- quollio_core/repository/qdc.py +4 -0
- quollio_core/snowflake.py +22 -3
- {quollio_core-0.4.11.dist-info → quollio_core-0.4.13.dist-info}/METADATA +2 -1
- {quollio_core-0.4.11.dist-info → quollio_core-0.4.13.dist-info}/RECORD +17 -16
- {quollio_core-0.4.11.dist-info → quollio_core-0.4.13.dist-info}/LICENSE +0 -0
- {quollio_core-0.4.11.dist-info → quollio_core-0.4.13.dist-info}/WHEEL +0 -0
quollio_core/__init__.py
CHANGED
quollio_core/bigquery.py
CHANGED
@@ -1,19 +1,37 @@
 import argparse
 import json
-import logging
+
+from google.auth.credentials import Credentials
 
 from quollio_core.helper.env_default import env_default
-from quollio_core.helper.log import set_log_level
-from quollio_core.profilers.bigquery import bigquery_table_lineage
+from quollio_core.helper.log_utils import configure_logging, error_handling_decorator, logger
+from quollio_core.profilers.bigquery import bigquery_table_lineage, bigquery_table_stats
 from quollio_core.repository import qdc
-from quollio_core.repository.bigquery import get_credentials, get_org_id
+from quollio_core.repository.bigquery import BigQueryClient, get_credentials, get_org_id
+
+
+def initialize_credentials(credentials_json: str) -> Credentials:
+    return get_credentials(json.loads(credentials_json))
+
 
-logger = logging.getLogger(__name__)
+def initialize_org_id(credentials_json: str) -> str:
+    return get_org_id(json.loads(credentials_json))
 
 
+def initialize_bq_client(credentials: Credentials, project_id: str) -> BigQueryClient:
+    return BigQueryClient(credentials=credentials, project_id=project_id)
+
+
+@error_handling_decorator
 def load_lineage(
-
-
+    tenant_id: str,
+    project_id: str,
+    regions: list,
+    org_id: str,
+    credentials: Credentials,
+    qdc_client: qdc.QDCExternalAPIClient,
+) -> None:
+    logger.info("Loading lineage data.")
     bigquery_table_lineage(
         qdc_client=qdc_client,
         tenant_id=tenant_id,
@@ -22,29 +40,53 @@ def load_lineage(
         credentials=credentials,
         org_id=org_id,
     )
+    logger.info("Lineage data loaded successfully.")
+
+
+@error_handling_decorator
+def load_stats(
+    conn: BigQueryClient,
+    tenant_id: str,
+    org_id: str,
+    qdc_client: qdc.QDCExternalAPIClient,
+    dataplex_stats_tables: list,
+) -> None:
+    logger.info("Loading statistics data.")
+    bigquery_table_stats(
+        bq_client=conn,
+        qdc_client=qdc_client,
+        tenant_id=tenant_id,
+        org_id=org_id,
+        dataplex_stats_tables=dataplex_stats_tables,
+    )
+    logger.info("Statistics data loaded successfully.")
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
-        prog="Quollio Intelligence Agent for
-        description="
+        prog="Quollio Intelligence Agent for BigQuery",
+        description="Load lineage and stats to Quollio from BigQuery using Dataplex and BigQuery APIs",
         epilog="Copyright (c) 2024 Quollio Technologies, Inc.",
     )
     parser.add_argument(
         "commands",
-        choices=["load_lineage"],
+        choices=["load_lineage", "load_stats"],
         type=str,
         nargs="+",
         help="""
         The command to execute.
-        'load_lineage': Load lineage data from
+        'load_lineage': Load lineage data from created views to Quollio,
+        'load_stats': Load stats from created views to Quollio,
         """,
     )
     parser.add_argument(
-        "--
+        "--log_level",
         type=str,
-
-
+        choices=["debug", "info", "warn", "error", "none"],
+        action=env_default("LOG_LEVEL"),
+        default="info",
+        required=False,
+        help="The log level for dbt commands. Default value is info",
     )
     parser.add_argument(
         "--tenant_id",
@@ -53,6 +95,27 @@ if __name__ == "__main__":
         required=False,
         help="The tenant id (company id) where the lineage and stats are loaded",
     )
+    parser.add_argument(
+        "--project_id",
+        type=str,
+        default=None,
+        required=False,
+        help="Project ID of the BigQuery project to load lineage and stats from (default is loaded from credentials)",
+    )
+    parser.add_argument(
+        "--regions",
+        type=str,
+        action=env_default("GCP_REGIONS"),
+        required=True,
+        help="Comma-separated list of regions BigQuery data is in",
+    )
+    parser.add_argument(
+        "--credentials_json",
+        type=str,
+        action=env_default("GOOGLE_APPLICATION_CREDENTIALS"),
+        required=True,
+        help="Credentials JSON",
+    )
     parser.add_argument(
         "--api_url",
         type=str,
@@ -74,50 +137,47 @@ if __name__ == "__main__":
         required=False,
         help="The client secret that is created on Quollio console to let clients access Quollio External API",
     )
+
     parser.add_argument(
-        "--project_id",
-        type=str,
-        action=env_default("GCP_PROJECT_ID"),
-        required=False,
-        help="GCP Project ID",
-    )
-    parser.add_argument(
-        "--regions",
-        type=str,
-        action=env_default("GCP_REGIONS"),
-        required=False,
-        help="GCP regions where the data is located. Multiple regions can be provided separated by space.",
-        nargs="+",
-    )
-    parser.add_argument(
-        "--log_level",
+        "--dataplex_stats_tables",
         type=str,
-
-        action=env_default("LOG_LEVEL"),
+        action=env_default("DATAPLEX_STATS_TABLES"),
         required=False,
-        help="
+        help="Comma-separated list of dataplex stats tables - <project_id>.<dataset_id>.<table_id>",
     )
 
     args = parser.parse_args()
-    set_log_level(level=args.log_level)
 
-    if
-
+    # Validate that dataplex_stats_tables is provided if load_stats is in commands
+    if "load_stats" in args.commands and not args.dataplex_stats_tables:
+        parser.error("--dataplex_stats_tables is required when 'load_stats' command is used")
 
-
-    qdc_client = qdc.QDCExternalAPIClient(
-        base_url=args.api_url, client_id=args.client_id, client_secret=args.client_secret
-    )
+    configure_logging(args.log_level)
 
-
-
-
+    credentials = initialize_credentials(args.credentials_json)
+    org_id = initialize_org_id(args.credentials_json)
+    qdc_client = qdc.initialize_qdc_client(args.api_url, args.client_id, args.client_secret)
+    bq_client = initialize_bq_client(credentials, args.project_id)
+    if args.project_id is None:
+        args.project_id = json.loads(args.credentials_json)["project_id"]
+    regions = args.regions.split(",")
 
+    if "load_lineage" in args.commands:
         load_lineage(
-            qdc_client=qdc_client,
-            project_id=args.project_id,
-            regions=args.regions,
             tenant_id=args.tenant_id,
+            project_id=args.project_id,
+            regions=regions,
+            org_id=org_id,
             credentials=credentials,
+            qdc_client=qdc_client,
+        )
+
+    if "load_stats" in args.commands:
+        tables = args.dataplex_stats_tables.split(",")
+        load_stats(
+            conn=bq_client,
+            tenant_id=args.tenant_id,
             org_id=org_id,
+            qdc_client=qdc_client,
+            dataplex_stats_tables=tables,
        )
quollio_core/bricks.py
CHANGED
@@ -10,6 +10,7 @@ from quollio_core.profilers.databricks import (
     databricks_column_stats,
     databricks_table_level_lineage,
 )
+from quollio_core.profilers.stats import get_column_stats_items
 from quollio_core.repository import databricks as db
 from quollio_core.repository import dbt, qdc
 
@@ -21,7 +22,6 @@ def build_view(
     target_tables: str = "",
     log_level: str = "info",
 ) -> None:
-
     logger.info("Build profiler views using dbt")
     # set parameters
     dbt_client = dbt.DBTClient()
@@ -64,7 +64,6 @@ def load_lineage(
     tenant_id: str,
     enable_column_lineage: bool = False,
 ) -> None:
-
     logger.info("Generate Databricks table to table lineage.")
     databricks_table_level_lineage(
         conn=conn,
@@ -98,7 +97,6 @@ def load_column_stats(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-
     logger.info("Generate Databricks column stats.")
     databricks_column_stats(
         conn=conn,
@@ -240,6 +238,19 @@ if __name__ == "__main__":
         help="Whether to ingest column lineage into QDIC or not. Default value is False",
     )
 
+    stats_items = get_column_stats_items()
+    parser.add_argument(
+        "--target_stats_items",
+        type=str,
+        nargs="*",
+        choices=stats_items,
+        default=stats_items,
+        action=env_default("DATABRICKS_STATS_ITEMS"),
+        required=False,
+        help="The items for statistic values.\
+            You can choose the items to be aggregated for stats. All items are selected by default.",
+    )
+
     args = parser.parse_args()
     set_log_level(level=args.log_level)
 
@@ -284,5 +295,6 @@ if __name__ == "__main__":
             endpoint=args.host,
             qdc_client=qdc_client,
             tenant_id=args.tenant_id,
+            stats_items=args.target_stats_items,
             monitoring_table_suffix=args.monitoring_table_suffix,
         )
quollio_core/helper/log_utils.py
ADDED
@@ -0,0 +1,48 @@
+import inspect
+import logging
+
+LOG_LEVELS = {
+    "critical": logging.CRITICAL,
+    "error": logging.ERROR,
+    "warning": logging.WARNING,
+    "info": logging.INFO,
+    "debug": logging.DEBUG,
+    "notset": logging.NOTSET,
+}
+
+logger = logging.getLogger(__name__)
+
+
+def configure_logging(level: str = "INFO"):
+    """Configure logging settings."""
+    log_level = LOG_LEVELS.get(level.lower())
+    if log_level is None:
+        raise ValueError(f"Unknown log level: {level}")
+
+    logging.basicConfig(
+        level=log_level,
+        format="%(asctime)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s",
+    )
+    logger.setLevel(log_level)
+    logger.info(f"Logging is configured to {level} level.")
+
+
+def error_handling_decorator(func):
+    """Decorator for consistent error handling in CLI commands."""
+
+    def wrapper(*args, **kwargs):
+        func_name = func.__name__
+        try:
+            logger.debug(f"Starting {func_name}")
+            result = func(*args, **kwargs)
+            logger.debug(f"Completed {func_name} successfully")
+            return result
+        except Exception as e:
+
+            current_frame = inspect.currentframe()
+            error_frame = current_frame.f_back
+            line_number = error_frame.f_lineno
+            logger.error(f"Error in {func_name} at line {line_number}: {str(e)}", exc_info=True)
+            raise
+
+    return wrapper
quollio_core/profilers/bigquery.py
CHANGED
@@ -1,26 +1,28 @@
-import logging
-from typing import Any, Dict, List
+from typing import Dict, List
 
+from google.auth.credentials import Credentials
+
+from quollio_core.helper.log_utils import error_handling_decorator, logger
 from quollio_core.profilers.lineage import gen_table_lineage_payload, parse_bigquery_table_lineage
+from quollio_core.profilers.stats import gen_table_stats_payload
 from quollio_core.repository import qdc
 from quollio_core.repository.bigquery import BigQueryClient, GCPLineageClient, get_entitiy_reference, get_search_request
 
-logger = logging.getLogger(__name__)
-
 
+@error_handling_decorator
 def bigquery_table_lineage(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
     project_id: str,
     regions: list,
     org_id: str,
-    credentials:
-):
+    credentials: Credentials,
+) -> None:
     lineage_client = GCPLineageClient(credentials)
-    bq_client = BigQueryClient(credentials)
+    bq_client = BigQueryClient(credentials, project_id)
 
-    datasets = bq_client.
-    all_tables = generate_table_list(
+    datasets = bq_client.list_dataset_ids()
+    all_tables = generate_table_list(bq_client, datasets)
     lineage_links = generate_lineage_links(all_tables, lineage_client, project_id, regions)
     lineage_links = parse_bigquery_table_lineage(lineage_links)
 
@@ -29,7 +31,7 @@ def bigquery_table_lineage(
     req_count = 0
     for update_table_lineage_input in update_table_lineage_inputs:
         logger.info(
-            "Generating table lineage. downstream: %s -> %s-> %s",
+            "Generating table lineage. downstream: %s -> %s -> %s",
             update_table_lineage_input.downstream_database_name,
             update_table_lineage_input.downstream_schema_name,
             update_table_lineage_input.downstream_table_name,
@@ -43,29 +45,65 @@ def bigquery_table_lineage(
     logger.info("Generating table lineage is finished. %s lineages are ingested.", req_count)
 
 
-def generate_table_list(
+@error_handling_decorator
+def bigquery_table_stats(
+    qdc_client: qdc.QDCExternalAPIClient,
+    bq_client: BigQueryClient,
+    tenant_id: str,
+    org_id: str,
+    dataplex_stats_tables: list,
+) -> None:
+    profiling_results = []
+    for table in dataplex_stats_tables:
+        logger.info("Profiling columns using Dataplex stats table: %s", table)
+        profiling_results.extend(column_stats_from_dataplex(bq_client, table))
+
+    stats = gen_table_stats_payload(tenant_id, org_id, profiling_results)
+
+    for stat in stats:
+        status_code = qdc_client.update_stats_by_id(
+            global_id=stat.global_id,
+            payload=stat.body.as_dict(),
+        )
+        if status_code == 200:
+            logger.info(
+                "Stats for column %s -> %s -> %s -> %s is successfully ingested.",
+                stat.db,
+                stat.schema,
+                stat.table,
+                stat.column,
+            )
+            logger.debug("Stats for column id %s is successfully ingested.", stat.global_id)
+
+
+def generate_table_list(bq_client: BigQueryClient, datasets: List[str]) -> List[str]:
     all_tables = []
     for dataset in datasets:
         all_tables.extend(
             [
                 table
-                for table in bq_client.list_tables(dataset
-                if table
-            ]
+                for table in bq_client.list_tables(dataset)
+                if table["table_type"] in ["TABLE", "VIEW", "MATERIALIZED_VIEW"]
+            ],
        )
 
     all_table_names = []
     for table in all_tables:
-        all_table_names.append(f"{
+        all_table_names.append(f"{bq_client.client.project}.{table['dataset_id']}.{table['table_id']}")
 
     return all_table_names
 
 
 def generate_lineage_links(
-    all_tables: List[str],
+    all_tables: List[str],
+    lineage_client: GCPLineageClient,
+    project_id: str,
+    regions: List[str],
 ) -> Dict[str, List[str]]:
     lineage_links = {}
     for table in all_tables:
+        if "quollio" in table.lower():
+            continue
         downstream = get_entitiy_reference()
         downstream.fully_qualified_name = f"bigquery:{table}"
 
@@ -74,8 +112,34 @@ def generate_lineage_links(
         response = lineage_client.get_links(request=request)
         for lineage in response:
             target_table = str(lineage.target.fully_qualified_name).replace("bigquery:", "")
+            source_table = str(lineage.source.fully_qualified_name).replace("bigquery:", "")
             if target_table not in lineage_links:
                 lineage_links[target_table] = []
-            lineage_links[target_table]
+            if source_table not in lineage_links[target_table]:
+                lineage_links[target_table].append(source_table)
 
     return lineage_links
+
+
+def column_stats_from_dataplex(bq_client: BigQueryClient, profiling_table: str) -> List[Dict]:
+    query = f"""
+    SELECT
+        data_source.table_project_id AS DB_NAME,
+        data_source.dataset_id AS SCHEMA_NAME,
+        data_source.table_id AS TABLE_NAME,
+        column_name AS COLUMN_NAME,
+        min_value AS MIN_VALUE,
+        max_value AS MAX_VALUE,
+        average_value AS AVG_VALUE,
+        quartile_median AS MEDIAN_VALUE,
+        standard_deviation AS STDDEV_VALUE,
+        top_n[0][0] AS MODE_VALUE,
+        CAST((percent_null / 100) * job_rows_scanned AS INT) as NULL_COUNT,
+        CAST((percent_unique / 100) * job_rows_scanned AS INT) as CARDINALITY
+    FROM `{profiling_table}`
+    """
+    logger.debug(f"Executing Query: {query}")
+    results = bq_client.client.query(query).result()
+
+    # Convert RowIterator to a list of dictionaries
+    return [dict(row) for row in results]
quollio_core/profilers/databricks.py
CHANGED
@@ -6,7 +6,7 @@ from quollio_core.profilers.lineage import (
     gen_table_lineage_payload,
     parse_databricks_table_lineage,
 )
-from quollio_core.profilers.stats import gen_table_stats_payload
+from quollio_core.profilers.stats import gen_table_stats_payload, get_is_target_stats_items, render_sql_for_stats
 from quollio_core.repository import databricks, qdc
 
 logger = logging.getLogger(__name__)
@@ -125,59 +125,63 @@ def _get_monitoring_tables(
 
 
 def _get_column_stats(
-    conn: databricks.DatabricksConnectionConfig,
+    conn: databricks.DatabricksConnectionConfig,
+    stats_items: List[str],
+    monitoring_table_suffix: str = "_profile_metrics",
 ) -> List[Dict[str, str]]:
     tables = _get_monitoring_tables(conn, monitoring_table_suffix)
     if not tables:
         return []
     stats = []
+    is_aggregate_items = get_is_target_stats_items(stats_items=stats_items)
     for table in tables:
         monitored_table = table["table_fqdn"].removesuffix("_profile_metrics")
         monitored_table = monitored_table.split(".")
         if len(monitored_table) != 3:
             raise ValueError(f"Invalid table name: {table['table_fqdn']}")
         with databricks.DatabricksQueryExecutor(config=conn) as databricks_executor:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            cte = """
+            WITH profile_record_history AS (
+                SELECT
+                    COLUMN_NAME
+                    , distinct_count as cardinality
+                    , MAX as max_value
+                    , MIN as min_value
+                    , AVG as avg_value
+                    , MEDIAN as median_value
+                    , STDDEV as stddev_value
+                    , NUM_NULLS as null_count
+                    , get(frequent_items, 0).item AS mode_value
+                    , row_number() over(partition by column_name order by window desc) rownum
+                FROM
+                    {monitoring_table}
+                WHERE
+                    column_name not in (':table')
+            ), profile_record AS (
+                SELECT
+                    "{monitored_table_catalog}" as db_name
+                    , "{monitored_table_schema}" as schema_name
+                    , "{monitored_table_name}" as table_name
+                    , column_name
+                    , max_value
+                    , min_value
+                    , null_count
+                    , cardinality
+                    , avg_value
+                    , median_value
+                    , mode_value
+                    , stddev_value
+                FROM
+                    profile_record_history
+                WHERE
+                    rownum = 1
+            )""".format(
                 monitoring_table=table["table_fqdn"],
                 monitored_table_catalog=monitored_table[0],
                 monitored_table_schema=monitored_table[1],
                 monitored_table_name=monitored_table[2],
             )
+            query = render_sql_for_stats(is_aggregate_items=is_aggregate_items, table_fqn="profile_record", cte=cte)
             logger.debug(f"The following sql will be fetched to retrieve stats values. {query}")
             stats.append(databricks_executor.get_query_results(query))
     return stats
@@ -188,10 +192,12 @@ def databricks_column_stats(
     endpoint: str,
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
+    stats_items: List[str],
     monitoring_table_suffix: str = "_profile_metrics",
 ) -> None:
-    table_stats = _get_column_stats(conn, monitoring_table_suffix)
+    table_stats = _get_column_stats(conn, stats_items, monitoring_table_suffix)
     for table in table_stats:
+        logger.debug("Table %s will be aggregated.", table)
         stats = gen_table_stats_payload(tenant_id=tenant_id, endpoint=endpoint, stats=table)
         for stat in stats:
             status_code = qdc_client.update_stats_by_id(
quollio_core/profilers/redshift.py
CHANGED
@@ -1,8 +1,13 @@
 import logging
+from typing import List
 
 from quollio_core.profilers.lineage import gen_table_lineage_payload, gen_table_lineage_payload_inputs
 from quollio_core.profilers.sqllineage import SQLLineage
-from quollio_core.profilers.stats import gen_table_stats_payload_from_tuple
+from quollio_core.profilers.stats import (
+    gen_table_stats_payload_from_tuple,
+    get_is_target_stats_items,
+    render_sql_for_stats,
+)
 from quollio_core.repository import qdc, redshift
 
 logger = logging.getLogger(__name__)
@@ -76,38 +81,24 @@ def redshift_table_stats(
     conn: redshift.RedshiftConnectionConfig,
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
+    stats_items: List[str],
 ) -> None:
-
+    is_aggregate_items = get_is_target_stats_items(stats_items=stats_items)
     with redshift.RedshiftQueryExecutor(config=conn) as redshift_executor:
         stats_query = _gen_get_stats_views_query(
             db=conn.database,
             schema=conn.schema,
         )
         stats_views = redshift_executor.get_query_results(query=stats_query)
+        logger.info("Found %s for table statistics.", len(stats_views))
 
         req_count = 0
         for stats_view in stats_views:
-            stats_query = """
-            SELECT
-                db_name
-                , schema_name
-                , table_name
-                , column_name
-                , max_value
-                , min_value
-                , null_count
-                , cardinality
-                , avg_value
-                , median_value
-                , mode_value
-                , stddev_value
-            FROM
-                {db}.{schema}.{table}
-            """.format(
-                db=stats_view[0],
-                schema=stats_view[1],
-                table=stats_view[2],
+            table_fqn = "{catalog}.{schema}.{table}".format(
+                catalog=stats_view[0], schema=stats_view[1], table=stats_view[2]
             )
+            stats_query = render_sql_for_stats(is_aggregate_items=is_aggregate_items, table_fqn=table_fqn)
+            logger.debug(f"The following sql will be fetched to retrieve stats values. {stats_query}")
             stats_result = redshift_executor.get_query_results(query=stats_query)
             payloads = gen_table_stats_payload_from_tuple(tenant_id=tenant_id, endpoint=conn.host, stats=stats_result)
             for payload in payloads:
quollio_core/profilers/snowflake.py
CHANGED
@@ -1,4 +1,5 @@
 import logging
+from typing import List
 
 from quollio_core.profilers.lineage import (
     gen_column_lineage_payload,
@@ -6,7 +7,7 @@ from quollio_core.profilers.lineage import (
     parse_snowflake_results,
 )
 from quollio_core.profilers.sqllineage import SQLLineage
-from quollio_core.profilers.stats import gen_table_stats_payload
+from quollio_core.profilers.stats import gen_table_stats_payload, get_is_target_stats_items, render_sql_for_stats
 from quollio_core.repository import qdc, snowflake
 
 logger = logging.getLogger(__name__)
@@ -154,6 +155,7 @@ def snowflake_table_stats(
     conn: snowflake.SnowflakeConnectionConfig,
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
+    stats_items: List[str],
 ) -> None:
     with snowflake.SnowflakeQueryExecutor(conn) as sf_executor:
         stats_query = _gen_get_stats_views_query(
@@ -163,28 +165,12 @@ def snowflake_table_stats(
         stats_views = sf_executor.get_query_results(query=stats_query)
 
         req_count = 0
+        is_aggregate_items = get_is_target_stats_items(stats_items=stats_items)
         for stats_view in stats_views:
-            stats_query = """
-            SELECT
-                db_name
-                , schema_name
-                , table_name
-                , column_name
-                , max_value
-                , min_value
-                , null_count
-                , cardinality
-                , avg_value
-                , median_value
-                , mode_value
-                , stddev_value
-            FROM
-                {db}.{schema}.{table}
-            """.format(
-                db=stats_view["TABLE_CATALOG"],
-                schema=stats_view["TABLE_SCHEMA"],
-                table=stats_view["TABLE_NAME"],
+            table_fqn = "{catalog}.{schema}.{table}".format(
+                catalog=stats_view["TABLE_CATALOG"], schema=stats_view["TABLE_SCHEMA"], table=stats_view["TABLE_NAME"]
            )
+            stats_query = render_sql_for_stats(is_aggregate_items=is_aggregate_items, table_fqn=table_fqn)
             logger.debug(f"The following sql will be fetched to retrieve stats values. {stats_query}")
             stats_result = sf_executor.get_query_results(query=stats_query)
             payloads = gen_table_stats_payload(tenant_id=tenant_id, endpoint=conn.account_id, stats=stats_result)
quollio_core/profilers/stats.py
CHANGED
@@ -1,8 +1,10 @@
 import logging
-from dataclasses import asdict, dataclass
+from dataclasses import asdict, dataclass, fields
 from decimal import ROUND_HALF_UP, Decimal
 from typing import Dict, List, Tuple, Union
 
+from jinja2 import Template
+
 from quollio_core.helper.core import new_global_id
 
 logger = logging.getLogger(__name__)
@@ -71,30 +73,35 @@ def convert_value_type(obj, cast_str: bool = False):
 def gen_table_stats_payload(tenant_id: str, endpoint: str, stats: List[Dict[str, str]]) -> List[StatsRequest]:
     payloads = list()
     for stat in stats:
+        db_name = stat.get("DB_NAME", stat.get("db_name"))
+        schema_name = stat.get("SCHEMA_NAME", stat.get("schema_name"))
+        table_name = stat.get("TABLE_NAME", stat.get("table_name"))
+        column_name = stat.get("COLUMN_NAME", stat.get("column_name"))
         global_id_arg = "{db}{schema}{table}{column}".format(
-            db=stat["DB_NAME"], schema=stat["SCHEMA_NAME"], table=stat["TABLE_NAME"], column=stat["COLUMN_NAME"]
+            db=db_name, schema=schema_name, table=table_name, column=column_name
         )
         table_global_id = new_global_id(
             tenant_id=tenant_id, cluster_id=endpoint, data_id=global_id_arg, data_type="column"
         )
+        column_stats_input = ColumnStatsInput(
+            cardinality=convert_value_type(stat.get("CARDINALITY", stat.get("cardinality"))),
+            max=convert_value_type(stat.get("MAX_VALUE", stat.get("max_value")), True),
+            mean=convert_value_type(stat.get("AVG_VALUE", stat.get("avg_value")), True),
+            median=convert_value_type(stat.get("MEDIAN_VALUE", stat.get("median_value")), True),
+            min=convert_value_type(stat.get("MIN_VALUE", stat.get("min_value")), True),
+            mode=convert_value_type(stat.get("MODE_VALUE", stat.get("mode_value")), True),
+            number_of_null=convert_value_type(stat.get("NULL_COUNT", stat.get("null_count"))),
+            number_of_unique=convert_value_type(stat.get("CARDINALITY", stat.get("cardinality"))),
+            stddev=convert_value_type(stat.get("STDDEV_VALUE", stat.get("stddev_value")), True),
+        )
         stats_request = StatsRequest(
             global_id=table_global_id,
-            db=stat["DB_NAME"],
-            schema=stat["SCHEMA_NAME"],
-            table=stat["TABLE_NAME"],
-            column=stat["COLUMN_NAME"],
+            db=db_name,
+            schema=schema_name,
+            table=table_name,
+            column=column_name,
             body=StatsInput(
-                column_stats=ColumnStatsInput(
-                    cardinality=convert_value_type(stat["CARDINALITY"]),
-                    max=convert_value_type(stat["MAX_VALUE"], True),
-                    mean=convert_value_type(stat["AVG_VALUE"], True),
-                    median=convert_value_type(stat["MEDIAN_VALUE"], True),
-                    min=convert_value_type(stat["MIN_VALUE"], True),
-                    mode=convert_value_type(stat["MODE_VALUE"], True),
-                    number_of_null=convert_value_type(stat["NULL_COUNT"]),
-                    number_of_unique=convert_value_type(stat["CARDINALITY"]),
-                    stddev=convert_value_type(stat["STDDEV_VALUE"], True),
-                ),
+                column_stats=column_stats_input,
                 # MEMO: Table stats can be collected with metadata agent.
                 # Then, It's not necessary to update with this system for now.
                 table_stats=TableStatsInput(count=0, size=0.0),
@@ -138,3 +145,57 @@ def gen_table_stats_payload_from_tuple(
     )
     payloads.append(stats_request)
     return payloads
+
+
+def render_sql_for_stats(is_aggregate_items: Dict[str, bool], table_fqn: str, cte: str = "") -> str:
+    sql_template_for_stats = Template(
+        """
+        {% if cte -%}
+        {{ cte }}
+        {% endif -%}
+        SELECT
+            db_name
+            , schema_name
+            , table_name
+            , column_name
+            , {% if agg_max == True -%} max_value {% else -%} null as max_value {% endif %}
+            , {% if agg_min == True -%} min_value {% else -%} null as min_value {% endif %}
+            , {% if agg_null_count == True -%} null_count {% else -%} null as null_count {% endif %}
+            , {% if agg_cardinality == True -%} cardinality {% else -%} null as cardinality {% endif %}
+            , {% if agg_avg == True -%} avg_value {% else -%} null as avg_value {% endif %}
+            , {% if agg_median == True -%} median_value {% else -%} null as median_value {% endif %}
+            , {% if agg_mode == True -%} mode_value {% else -%} null as mode_value {% endif %}
+            , {% if agg_stddev == True -%} stddev_value {% else -%} null as stddev_value {% endif %}
+        FROM
+            {{ table_fqn }}
+        """
+    )
+    query = sql_template_for_stats.render(
+        agg_max=is_aggregate_items["max"],
+        agg_min=is_aggregate_items["min"],
+        agg_null_count=is_aggregate_items["number_of_null"],
+        agg_cardinality=is_aggregate_items["cardinality"],
+        agg_avg=is_aggregate_items["mean"],
+        agg_median=is_aggregate_items["median"],
+        agg_mode=is_aggregate_items["mode"],
+        agg_stddev=is_aggregate_items["stddev"],
+        table_fqn=table_fqn,
+        cte=cte,
+    )
+    return query
+
+
+def get_is_target_stats_items(stats_items: List[str]) -> List[Dict[str, bool]]:
+    target_stats_fields = get_column_stats_items()
+    is_aggregate_items = dict()
+    for target_stats_field in target_stats_fields:
+        is_aggregate_items[target_stats_field] = False
+
+    for stats_item in stats_items:
+        is_aggregate_items[stats_item] = True
+
+    return is_aggregate_items
+
+
+def get_column_stats_items() -> List[str]:
+    return [field.name for field in fields(ColumnStatsInput)]
quollio_core/redshift.py
CHANGED
@@ -10,6 +10,7 @@ from quollio_core.profilers.redshift import (
     redshift_table_level_sqllineage,
     redshift_table_stats,
 )
+from quollio_core.profilers.stats import get_column_stats_items
 from quollio_core.repository import dbt, qdc, redshift
 
 logger = logging.getLogger(__name__)
@@ -98,13 +99,19 @@ def load_stats(
     conn: redshift.RedshiftConnectionConfig,
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
+    stats_items: str,
 ) -> None:
-
     logger.info("Generate redshift stats.")
+
+    if stats_items is None:
+        raise ValueError("No stats items are not selected. Please specify any value to `stats_items` param.")
+
+    logger.info("The following values will be aggregated. {stats_items}".format(stats_items=stats_items))
     redshift_table_stats(
         conn=conn,
         qdc_client=qdc_client,
         tenant_id=tenant_id,
+        stats_items=stats_items,
     )
 
     logger.info("Stats data is successfully loaded.")
@@ -116,7 +123,6 @@ def load_sqllineage(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-
     logger.info("Generate Redshift sqllineage.")
     redshift_table_level_sqllineage(
         conn=conn,
@@ -261,6 +267,19 @@ if __name__ == "__main__":
         required=False,
         help="The client secrete that is created on Quollio console to let clients access Quollio External API",
     )
+
+    stats_items = get_column_stats_items()
+    parser.add_argument(
+        "--target_stats_items",
+        type=str,
+        nargs="*",
+        choices=stats_items,
+        default=stats_items,
+        action=env_default("REDSHIFT_STATS_ITEMS"),
+        required=False,
+        help="The items for stats values. \
+            You can choose the items to be aggregated for stats. All items are selected by default.",
+    )
     args = parser.parse_args()
     set_log_level(level=args.log_level)
 
@@ -306,6 +325,7 @@ if __name__ == "__main__":
             conn=conn,
             qdc_client=qdc_client,
             tenant_id=args.tenant_id,
+            stats_items=args.target_stats_items,
         )
     if "load_sqllineage" in args.commands:
         qdc_client = qdc.QDCExternalAPIClient(
quollio_core/repository/bigquery.py
CHANGED
@@ -1,41 +1,75 @@
-import logging
+from typing import Any, Dict, List
 
 from google.cloud.bigquery import Client
 from google.cloud.datacatalog_lineage_v1 import EntityReference, LineageClient, SearchLinksRequest
 from google.oauth2.service_account import Credentials
 from googleapiclient.discovery import build
 
-logger = logging.getLogger(__name__)
+from quollio_core.helper.log_utils import logger  # Importing the logger from logging_utils
 
 
 class BigQueryClient:
-    def __init__(self, credentials: Credentials) -> None:
-        self.client = self.__initialze(credentials=credentials)
+    """Client to interact with the BigQuery API."""
 
-    def __initialze(self, credentials: Credentials) -> Client:
-        client = Client(credentials=credentials)
-        return client
+    def __init__(self, credentials: Credentials, project_id: str) -> None:
+        """Initialize the BigQuery client with provided credentials."""
+        self.client = self.__initialize(credentials=credentials, project_id=project_id)
 
-    def
-
-        logger.debug("Found %s datasets in project %s", len(datasets), project_id)
-        return datasets
+    def __initialize(self, credentials: Credentials, project_id: str) -> Client:
+        return Client(credentials=credentials, project=project_id)
 
-    def
+    def list_dataset_ids(self) -> List[str]:
+        """List all dataset ids in the project."""
+        datasets = list(self.client.list_datasets())
+        logger.debug("Found %s datasets in project %s", len(datasets), self.client.project)
+        return [dataset.dataset_id for dataset in datasets]
+
+    def list_tables(self, dataset_id: str) -> List[Dict[str, str]]:
+        """List all tables in the dataset."""
         tables = list(self.client.list_tables(dataset_id))
         logger.debug("Found %s tables in dataset %s", len(tables), dataset_id)
-        return
+        return [
+            {
+                "table_id": table.table_id,
+                "table_type": table.table_type,
+                "project": table.project,
+                "dataset_id": table.dataset_id,
+            }
+            for table in tables
+        ]
+
+    def get_columns(self, table_id: str, dataset_id: str) -> List[Dict[str, str]]:
+        """Get the columns of the table."""
+        table = self.client.get_table(f"{self.client.project}.{dataset_id}.{table_id}")
+        return [{"name": field.name, "type": field.field_type} for field in table.schema]
+
+    def get_all_columns(self) -> Dict[str, Dict[str, List[Dict[str, Any]]]]:
+        """Get all columns in the project."""
+        all_columns = {}
+        datasets = self.list_dataset_ids()
+        for dataset_id in datasets:
+            all_columns[dataset_id] = {}
+            tables = self.list_tables(dataset_id)
+            for table_info in tables:
+                table_id = table_info["table_id"]
+                table_type = table_info["table_type"]
+                columns = self.get_columns(table_id, dataset_id)
+                all_columns[dataset_id][table_id] = {"columns": columns, "table_type": table_type}
+        return all_columns
 
 
 class GCPLineageClient:
+    """Client to interact with the GCP Lineage API."""
+
     def __init__(self, credentials: Credentials) -> None:
+        """Initialize the GCP Lineage client with provided credentials."""
         self.client = self.__initialze(credentials=credentials)
 
     def __initialze(self, credentials: Credentials) -> LineageClient:
-        client = LineageClient(credentials=credentials)
-        return client
+        return LineageClient(credentials=credentials)
 
     def get_links(self, request: SearchLinksRequest) -> list:
+        """Search for links between entities (tables)."""
         response = self.client.search_links(request)
         return response.links
 
@@ -57,5 +91,4 @@ def get_org_id(credentials_json: dict) -> str:
     crm_service = build("cloudresourcemanager", "v1", credentials=credentials)
     project_id = credentials_json["project_id"]
     project = crm_service.projects().get(projectId=project_id).execute()
-    org_id = project["parent"]["id"]
-    return org_id
+    return project["parent"]["id"]
quollio_core/repository/qdc.py
CHANGED
@@ -100,3 +100,7 @@ class QDCExternalAPIClient:
             logger.error(f"Error: {re} downstream_global_id: {global_id}.")
         else:
             return res.status_code
+
+
+def initialize_qdc_client(api_url: str, client_id: str, client_secret: str) -> QDCExternalAPIClient:
+    return QDCExternalAPIClient(base_url=api_url, client_id=client_id, client_secret=client_secret)
quollio_core/snowflake.py
CHANGED
@@ -11,6 +11,7 @@ from quollio_core.profilers.snowflake import (
     snowflake_table_stats,
     snowflake_table_to_table_lineage,
 )
+from quollio_core.profilers.stats import get_column_stats_items
 from quollio_core.repository import dbt, qdc, snowflake
 
 logger = logging.getLogger(__name__)
@@ -22,7 +23,6 @@ def build_view(
     target_tables: str = "",
     log_level: str = "info",
 ) -> None:
-
     logger.info("Build profiler views using dbt")
     # set parameters
     dbt_client = dbt.DBTClient()
@@ -103,13 +103,19 @@ def load_stats(
     conn: snowflake.SnowflakeConnectionConfig,
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
+    stats_items: str,
 ) -> None:
-
     logger.info("Generate Snowflake stats.")
+
+    if stats_items is None:
+        raise ValueError("No stats items are not selected. Please specify any value to `stats_items` param.")
+
+    logger.info("The following values will be aggregated. {stats_items}".format(stats_items=stats_items))
     snowflake_table_stats(
         conn=conn,
         qdc_client=qdc_client,
         tenant_id=tenant_id,
+        stats_items=stats_items,
     )
 
     logger.info("Stats data is successfully loaded.")
@@ -122,7 +128,6 @@ def load_sqllineage(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-
     logger.info("Generate Snowflake sqllineage.")
     snowflake_table_level_sqllineage(
         conn=conn,
@@ -275,6 +280,19 @@ if __name__ == "__main__":
         required=False,
         help="Whether to ingest column lineage into QDIC or not. Default value is False",
     )
+
+    stats_items = get_column_stats_items()
+    parser.add_argument(
+        "--target_stats_items",
+        type=str,
+        nargs="*",
+        choices=stats_items,
+        default=stats_items,
+        action=env_default("SNOWFLAKE_STATS_ITEMS"),
+        required=False,
+        help="The items for statistic values.\
+            You can choose the items to be aggregated for stats. All items are selected by default.",
+    )
     args = parser.parse_args()
     set_log_level(level=args.log_level)
 
@@ -321,6 +339,7 @@ if __name__ == "__main__":
             conn=conn,
             qdc_client=qdc_client,
             tenant_id=args.tenant_id,
+            stats_items=args.target_stats_items,
         )
     if "load_sqllineage" in args.commands:
         qdc_client = qdc.QDCExternalAPIClient(
{quollio_core-0.4.11.dist-info → quollio_core-0.4.13.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: quollio-core
-Version: 0.4.11
+Version: 0.4.13
 Summary: Quollio Core
 Author-email: quollio-dev <qt.dev@quollio.com>
 Maintainer-email: RyoAriyama <ryo.arym@gmail.com>, tharuta <35373297+TakumiHaruta@users.noreply.github.com>
@@ -22,6 +22,7 @@ Requires-Dist: dbt-core==1.7.10
 Requires-Dist: dbt-snowflake==1.7.0
 Requires-Dist: dbt-redshift==1.7.1
 Requires-Dist: dbt-databricks==1.7.1
+Requires-Dist: db-dtypes==1.2.0
 Requires-Dist: jinja2==3.1.3
 Requires-Dist: PyYAML==6.0.1
 Requires-Dist: requests==2.31.0
{quollio_core-0.4.11.dist-info → quollio_core-0.4.13.dist-info}/RECORD
CHANGED
@@ -1,8 +1,8 @@
-quollio_core/__init__.py,sha256=
-quollio_core/bigquery.py,sha256=
-quollio_core/bricks.py,sha256=
-quollio_core/redshift.py,sha256=
-quollio_core/snowflake.py,sha256=
+quollio_core/__init__.py,sha256=AUePs5X9J3XSNhx1MlWVacGiCUUUbcMKTZG3Rs0jrNY,84
+quollio_core/bigquery.py,sha256=6Oq4DVGpa3X21Es_nbrsb8pK3vaxwb9Egnvq3huo95k,5894
+quollio_core/bricks.py,sha256=4M0fzxwtFCwAv2Lat9XYdLtoGp27fy-w6a3ty1dExSc,9999
+quollio_core/redshift.py,sha256=x86Fu3QJoJNGKPYbOcqUgQzzj1qNR6I3dd0R9oQClUE,10720
+quollio_core/snowflake.py,sha256=ars0S8sbEcDR74RLrsJX9VWh8fbBGgk2H7G81paCPlk,11426
 quollio_core/dbt_projects/databricks/.gitignore,sha256=1jJAyXSzJ3YUm0nx3i7wUSE4RjQMX3ad6F8O88UbtzI,29
 quollio_core/dbt_projects/databricks/README.md,sha256=ZpRQyhFAODAiS8dc1Kb_ndkul4cu4o4udN_EMa49CU4,440
 quollio_core/dbt_projects/databricks/dbt_project.yml,sha256=3sH98RNk7TnphvI3yEdXDstb92kW5BNxr-cT0tXhwzk,480
@@ -65,22 +65,23 @@ quollio_core/helper/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 quollio_core/helper/core.py,sha256=wbu4FWI7YiFEttXGSuj3tMyAhtPAFlHOjDpWJGNXOHA,1202
 quollio_core/helper/env_default.py,sha256=H6gbSGUPrEDZr4YDrL49hbOpw6RntI4U82kX1q6vUnI,2148
 quollio_core/helper/log.py,sha256=flxyZZ44G79l1TaUp3OT58uCHcnE5z_pCduwoeI6IUs,645
+quollio_core/helper/log_utils.py,sha256=w1El5yafNcKgzpiMmspsAjUm3R32ACm5QNj5lNb3xsk,1392
 quollio_core/profilers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-quollio_core/profilers/bigquery.py,sha256=
-quollio_core/profilers/databricks.py,sha256=
+quollio_core/profilers/bigquery.py,sha256=LQzDPo-fyTHPc4C-LC59Aby5cJ7m-m4THzl9HUurXm0,5641
+quollio_core/profilers/databricks.py,sha256=ik4RiR_GOeU3S7s6C6Y9SGe1D_Y_f98BDWJVlEJXL4U,7868
 quollio_core/profilers/lineage.py,sha256=4FyxIuPBrUFihqZryqTQBcfB0Z7634lKl_WwkD82vzE,6865
-quollio_core/profilers/redshift.py,sha256=
-quollio_core/profilers/snowflake.py,sha256=
+quollio_core/profilers/redshift.py,sha256=p6ONDCkhndZAOcKAwEyQ5fsi-jsQrlwHHb7LTI_m1uk,6473
+quollio_core/profilers/snowflake.py,sha256=YdrV82pjJ1BilWQvPES1pz3EmQoBOJEPc6mVlI4FDRg,8311
 quollio_core/profilers/sqllineage.py,sha256=XkF7hwDWIGNtyEP5cv2wETBgMfdQxeHolv7qPIkntSQ,5066
-quollio_core/profilers/stats.py,sha256=
+quollio_core/profilers/stats.py,sha256=OLQrdrh0y64jo9rmzvGlDdxy_c7gMz_GnlXPJzWkBjM,7343
 quollio_core/repository/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-quollio_core/repository/bigquery.py,sha256=
+quollio_core/repository/bigquery.py,sha256=3AyGcJNYGnUyMweyc6lGm4quwrOzd-ZBS2zNnFwafII,3990
 quollio_core/repository/databricks.py,sha256=9Cgdv8qBnVaHqu3RA-IUBieAqb69moQ-KAAMVSf5Ds4,1877
 quollio_core/repository/dbt.py,sha256=cnLwJPywLi8VowVW7zfIBa9jxVwDWO7xzzNRn1vWiuw,659
-quollio_core/repository/qdc.py,sha256=
+quollio_core/repository/qdc.py,sha256=hw7L7RdX5srv_MUSxAObq3l9b3IYjzN5lopp6CgPXyY,4572
 quollio_core/repository/redshift.py,sha256=p2ouEuYcDCjx1oBhc6H1ekQsvEqHGd3bFu3PW0ngYBc,2880
 quollio_core/repository/snowflake.py,sha256=J9rHshfWdOSnjQWxwGEYPpAU2lY7Tu5UFB_BNakkAX0,1892
-quollio_core-0.4.11.dist-info/LICENSE,sha256=V8j_M8nAz8PvAOZQocyRDX7keai8UJ9skgmnwqETmdY,34520
-quollio_core-0.4.11.dist-info/WHEEL,sha256=EZbGkh7Ie4PoZfRQ8I0ZuP9VklN_TvcZ6DSE5Uar4z4,81
-quollio_core-0.4.11.dist-info/METADATA,sha256=
-quollio_core-0.4.11.dist-info/RECORD,,
+quollio_core-0.4.13.dist-info/LICENSE,sha256=V8j_M8nAz8PvAOZQocyRDX7keai8UJ9skgmnwqETmdY,34520
+quollio_core-0.4.13.dist-info/WHEEL,sha256=EZbGkh7Ie4PoZfRQ8I0ZuP9VklN_TvcZ6DSE5Uar4z4,81
+quollio_core-0.4.13.dist-info/METADATA,sha256=fyVJbVrl739taWh8w9ndVhKI2KWgsVLLZbRmzVF9Yj8,6836
+quollio_core-0.4.13.dist-info/RECORD,,
{quollio_core-0.4.11.dist-info → quollio_core-0.4.13.dist-info}/LICENSE
File without changes
{quollio_core-0.4.11.dist-info → quollio_core-0.4.13.dist-info}/WHEEL
File without changes