quollio-core 0.4.12__tar.gz → 0.4.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {quollio_core-0.4.12 → quollio_core-0.4.14}/PKG-INFO +3 -1
- {quollio_core-0.4.12 → quollio_core-0.4.14}/pyproject.toml +2 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/__init__.py +1 -1
- quollio_core-0.4.14/quollio_core/bigquery.py +183 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/bricks.py +15 -3
- quollio_core-0.4.14/quollio_core/helper/log_utils.py +47 -0
- quollio_core-0.4.14/quollio_core/profilers/bigquery.py +145 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/profilers/databricks.py +44 -39
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/profilers/redshift.py +13 -22
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/profilers/snowflake.py +7 -21
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/profilers/stats.py +78 -17
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/redshift.py +22 -2
- quollio_core-0.4.14/quollio_core/repository/bigquery.py +94 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/repository/qdc.py +8 -4
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/snowflake.py +22 -3
- quollio_core-0.4.12/quollio_core/bigquery.py +0 -123
- quollio_core-0.4.12/quollio_core/profilers/bigquery.py +0 -81
- quollio_core-0.4.12/quollio_core/repository/bigquery.py +0 -61
- {quollio_core-0.4.12 → quollio_core-0.4.14}/LICENSE +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/README.md +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/databricks/.gitignore +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/databricks/README.md +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/databricks/analyses/.gitkeep +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/databricks/dbt_project.yml +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/databricks/macros/.gitkeep +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/databricks/models/quollio_lineage_column_level.sql +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/databricks/models/quollio_lineage_column_level.yml +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/databricks/models/quollio_lineage_table_level.sql +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/databricks/models/quollio_lineage_table_level.yml +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/databricks/models/sources.yml +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/databricks/package-lock.yml +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/databricks/packages.yml +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/databricks/profiles/profiles_template.yml +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/databricks/seeds/.gitkeep +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/databricks/snapshots/.gitkeep +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/redshift/README.md +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/redshift/analyses/.gitkeep +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/redshift/dbt_project.yml +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/redshift/macros/.gitkeep +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/redshift/macros/materialization/divided_view.sql +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/redshift/models/quollio_lineage_table_level.sql +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/redshift/models/quollio_lineage_table_level.yml +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/redshift/models/quollio_lineage_view_level.sql +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/redshift/models/quollio_lineage_view_level.yml +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/redshift/models/quollio_sqllineage_sources.sql +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/redshift/models/quollio_sqllineage_sources.yml +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/redshift/models/quollio_stats_columns.sql +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/redshift/models/quollio_stats_columns.yml +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/redshift/models/quollio_stats_profiling_columns.sql +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/redshift/models/quollio_stats_profiling_columns.yml +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/redshift/models/sources.yml +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/redshift/package-lock.yml +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/redshift/packages.yml +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/redshift/profiles/profiles_template.yml +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/redshift/seeds/.gitkeep +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/redshift/snapshots/.gitkeep +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/seeds/.gitkeep +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/snowflake/README.md +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/snowflake/analyses/.gitkeep +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/snowflake/dbt_project.yml +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/snowflake/macros/.gitkeep +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/snowflake/macros/materialization/divided_view.sql +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/snowflake/models/quollio_lineage_column_level.sql +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/snowflake/models/quollio_lineage_column_level.yml +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/snowflake/models/quollio_lineage_table_level.sql +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/snowflake/models/quollio_lineage_table_level.yml +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/snowflake/models/quollio_sqllineage_sources.sql +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/snowflake/models/quollio_sqllineage_sources.yml +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/snowflake/models/quollio_stats_columns.sql +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/snowflake/models/quollio_stats_columns.yml +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/snowflake/models/quollio_stats_profiling_columns.sql +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/snowflake/models/quollio_stats_profiling_columns.yml +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/snowflake/models/sources.yml +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/snowflake/package-lock.yml +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/snowflake/packages.yml +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/snowflake/profiles/profiles_template.yml +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/snowflake/seeds/.gitkeep +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/dbt_projects/snowflake/snapshots/.gitkeep +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/helper/__init__.py +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/helper/core.py +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/helper/env_default.py +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/helper/log.py +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/profilers/__init__.py +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/profilers/lineage.py +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/profilers/sqllineage.py +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/repository/__init__.py +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/repository/databricks.py +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/repository/dbt.py +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/repository/redshift.py +0 -0
- {quollio_core-0.4.12 → quollio_core-0.4.14}/quollio_core/repository/snowflake.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: quollio-core
|
3
|
-
Version: 0.4.12
|
3
|
+
Version: 0.4.14
|
4
4
|
Summary: Quollio Core
|
5
5
|
Author-email: quollio-dev <qt.dev@quollio.com>
|
6
6
|
Maintainer-email: RyoAriyama <ryo.arym@gmail.com>, tharuta <35373297+TakumiHaruta@users.noreply.github.com>
|
@@ -22,6 +22,7 @@ Requires-Dist: dbt-core==1.7.10
|
|
22
22
|
Requires-Dist: dbt-snowflake==1.7.0
|
23
23
|
Requires-Dist: dbt-redshift==1.7.1
|
24
24
|
Requires-Dist: dbt-databricks==1.7.1
|
25
|
+
Requires-Dist: db-dtypes==1.2.0
|
25
26
|
Requires-Dist: jinja2==3.1.3
|
26
27
|
Requires-Dist: PyYAML==6.0.1
|
27
28
|
Requires-Dist: requests==2.31.0
|
@@ -41,6 +42,7 @@ Requires-Dist: coverage>=7.3.2 ; extra == "test"
|
|
41
42
|
Requires-Dist: isort>=5.10.1 ; extra == "test"
|
42
43
|
Requires-Dist: pyproject-flake8>=0.0.1-alpha.2 ; extra == "test"
|
43
44
|
Requires-Dist: pytest>=5.2 ; extra == "test"
|
45
|
+
Requires-Dist: responses>=0.25.3 ; extra == "test"
|
44
46
|
Requires-Dist: dbt-osmosis==0.12.5 ; extra == "test"
|
45
47
|
Project-URL: Home, https://quollio.com
|
46
48
|
Project-URL: Source, https://github.com/QuollioLabs/quollio-core
|
@@ -34,6 +34,7 @@ dependencies = [
|
|
34
34
|
,"dbt-snowflake==1.7.0"
|
35
35
|
,"dbt-redshift==1.7.1"
|
36
36
|
,"dbt-databricks==1.7.1"
|
37
|
+
,"db-dtypes==1.2.0"
|
37
38
|
,"jinja2==3.1.3"
|
38
39
|
,"PyYAML==6.0.1"
|
39
40
|
,"requests==2.31.0"
|
@@ -62,6 +63,7 @@ test = [
|
|
62
63
|
,"isort>=5.10.1"
|
63
64
|
,"pyproject-flake8>=0.0.1-alpha.2"
|
64
65
|
,"pytest>=5.2"
|
66
|
+
,"responses>=0.25.3"
|
65
67
|
,"dbt-osmosis==0.12.5"
|
66
68
|
]
|
67
69
|
|
@@ -0,0 +1,183 @@
|
|
1
|
+
import argparse
|
2
|
+
import json
|
3
|
+
|
4
|
+
from google.auth.credentials import Credentials
|
5
|
+
|
6
|
+
from quollio_core.helper.env_default import env_default
|
7
|
+
from quollio_core.helper.log_utils import configure_logging, error_handling_decorator, logger
|
8
|
+
from quollio_core.profilers.bigquery import bigquery_table_lineage, bigquery_table_stats
|
9
|
+
from quollio_core.repository import qdc
|
10
|
+
from quollio_core.repository.bigquery import BigQueryClient, get_credentials, get_org_id
|
11
|
+
|
12
|
+
|
13
|
+
def initialize_credentials(credentials_json: str) -> Credentials:
|
14
|
+
return get_credentials(json.loads(credentials_json))
|
15
|
+
|
16
|
+
|
17
|
+
def initialize_org_id(credentials_json: str) -> str:
|
18
|
+
return get_org_id(json.loads(credentials_json))
|
19
|
+
|
20
|
+
|
21
|
+
def initialize_bq_client(credentials: Credentials, project_id: str) -> BigQueryClient:
|
22
|
+
return BigQueryClient(credentials=credentials, project_id=project_id)
|
23
|
+
|
24
|
+
|
25
|
+
@error_handling_decorator
|
26
|
+
def load_lineage(
|
27
|
+
tenant_id: str,
|
28
|
+
project_id: str,
|
29
|
+
regions: list,
|
30
|
+
org_id: str,
|
31
|
+
credentials: Credentials,
|
32
|
+
qdc_client: qdc.QDCExternalAPIClient,
|
33
|
+
) -> None:
|
34
|
+
logger.info("Loading lineage data.")
|
35
|
+
bigquery_table_lineage(
|
36
|
+
qdc_client=qdc_client,
|
37
|
+
tenant_id=tenant_id,
|
38
|
+
project_id=project_id,
|
39
|
+
regions=regions,
|
40
|
+
credentials=credentials,
|
41
|
+
org_id=org_id,
|
42
|
+
)
|
43
|
+
logger.info("Lineage data loaded successfully.")
|
44
|
+
|
45
|
+
|
46
|
+
@error_handling_decorator
|
47
|
+
def load_stats(
|
48
|
+
conn: BigQueryClient,
|
49
|
+
tenant_id: str,
|
50
|
+
org_id: str,
|
51
|
+
qdc_client: qdc.QDCExternalAPIClient,
|
52
|
+
dataplex_stats_tables: list,
|
53
|
+
) -> None:
|
54
|
+
logger.info("Loading statistics data.")
|
55
|
+
bigquery_table_stats(
|
56
|
+
bq_client=conn,
|
57
|
+
qdc_client=qdc_client,
|
58
|
+
tenant_id=tenant_id,
|
59
|
+
org_id=org_id,
|
60
|
+
dataplex_stats_tables=dataplex_stats_tables,
|
61
|
+
)
|
62
|
+
logger.info("Statistics data loaded successfully.")
|
63
|
+
|
64
|
+
|
65
|
+
if __name__ == "__main__":
|
66
|
+
parser = argparse.ArgumentParser(
|
67
|
+
prog="Quollio Intelligence Agent for BigQuery",
|
68
|
+
description="Load lineage and stats to Quollio from BigQuery using Dataplex and BigQuery APIs",
|
69
|
+
epilog="Copyright (c) 2024 Quollio Technologies, Inc.",
|
70
|
+
)
|
71
|
+
parser.add_argument(
|
72
|
+
"commands",
|
73
|
+
choices=["load_lineage", "load_stats"],
|
74
|
+
type=str,
|
75
|
+
nargs="+",
|
76
|
+
help="""
|
77
|
+
The command to execute.
|
78
|
+
'load_lineage': Load lineage data from created views to Quollio,
|
79
|
+
'load_stats': Load stats from created views to Quollio,
|
80
|
+
""",
|
81
|
+
)
|
82
|
+
parser.add_argument(
|
83
|
+
"--log_level",
|
84
|
+
type=str,
|
85
|
+
choices=["debug", "info", "warn", "error", "none"],
|
86
|
+
action=env_default("LOG_LEVEL"),
|
87
|
+
default="info",
|
88
|
+
required=False,
|
89
|
+
help="The log level for dbt commands. Default value is info",
|
90
|
+
)
|
91
|
+
parser.add_argument(
|
92
|
+
"--tenant_id",
|
93
|
+
type=str,
|
94
|
+
action=env_default("TENANT_ID"),
|
95
|
+
required=False,
|
96
|
+
help="The tenant id (company id) where the lineage and stats are loaded",
|
97
|
+
)
|
98
|
+
parser.add_argument(
|
99
|
+
"--project_id",
|
100
|
+
type=str,
|
101
|
+
default=None,
|
102
|
+
required=False,
|
103
|
+
help="Project ID of the BigQuery project to load lineage and stats from (default is loaded from credentials)",
|
104
|
+
)
|
105
|
+
parser.add_argument(
|
106
|
+
"--regions",
|
107
|
+
type=str,
|
108
|
+
action=env_default("GCP_REGIONS"),
|
109
|
+
required=True,
|
110
|
+
help="Comma-separated list of regions BigQuery data is in",
|
111
|
+
)
|
112
|
+
parser.add_argument(
|
113
|
+
"--credentials_json",
|
114
|
+
type=str,
|
115
|
+
action=env_default("GOOGLE_APPLICATION_CREDENTIALS"),
|
116
|
+
required=True,
|
117
|
+
help="Credentials JSON",
|
118
|
+
)
|
119
|
+
parser.add_argument(
|
120
|
+
"--api_url",
|
121
|
+
type=str,
|
122
|
+
action=env_default("QDC_API_URL"),
|
123
|
+
required=False,
|
124
|
+
help="The base URL of Quollio External API",
|
125
|
+
)
|
126
|
+
parser.add_argument(
|
127
|
+
"--client_id",
|
128
|
+
type=str,
|
129
|
+
action=env_default("QDC_CLIENT_ID"),
|
130
|
+
required=False,
|
131
|
+
help="The client id that is created on Quollio console to let clients access Quollio External API",
|
132
|
+
)
|
133
|
+
parser.add_argument(
|
134
|
+
"--client_secret",
|
135
|
+
type=str,
|
136
|
+
action=env_default("QDC_CLIENT_SECRET"),
|
137
|
+
required=False,
|
138
|
+
help="The client secret that is created on Quollio console to let clients access Quollio External API",
|
139
|
+
)
|
140
|
+
|
141
|
+
parser.add_argument(
|
142
|
+
"--dataplex_stats_tables",
|
143
|
+
type=str,
|
144
|
+
action=env_default("DATAPLEX_STATS_TABLES"),
|
145
|
+
required=False,
|
146
|
+
help="Comma-separated list of dataplex stats tables - <project_id>.<dataset_id>.<table_id>",
|
147
|
+
)
|
148
|
+
|
149
|
+
args = parser.parse_args()
|
150
|
+
|
151
|
+
# Validate that dataplex_stats_tables is provided if load_stats is in commands
|
152
|
+
if "load_stats" in args.commands and not args.dataplex_stats_tables:
|
153
|
+
parser.error("--dataplex_stats_tables is required when 'load_stats' command is used")
|
154
|
+
|
155
|
+
configure_logging(args.log_level)
|
156
|
+
|
157
|
+
credentials = initialize_credentials(args.credentials_json)
|
158
|
+
org_id = initialize_org_id(args.credentials_json)
|
159
|
+
qdc_client = qdc.initialize_qdc_client(args.api_url, args.client_id, args.client_secret)
|
160
|
+
bq_client = initialize_bq_client(credentials, args.project_id)
|
161
|
+
if args.project_id is None:
|
162
|
+
args.project_id = json.loads(args.credentials_json)["project_id"]
|
163
|
+
regions = args.regions.split(",")
|
164
|
+
|
165
|
+
if "load_lineage" in args.commands:
|
166
|
+
load_lineage(
|
167
|
+
tenant_id=args.tenant_id,
|
168
|
+
project_id=args.project_id,
|
169
|
+
regions=regions,
|
170
|
+
org_id=org_id,
|
171
|
+
credentials=credentials,
|
172
|
+
qdc_client=qdc_client,
|
173
|
+
)
|
174
|
+
|
175
|
+
if "load_stats" in args.commands:
|
176
|
+
tables = args.dataplex_stats_tables.split(",")
|
177
|
+
load_stats(
|
178
|
+
conn=bq_client,
|
179
|
+
tenant_id=args.tenant_id,
|
180
|
+
org_id=org_id,
|
181
|
+
qdc_client=qdc_client,
|
182
|
+
dataplex_stats_tables=tables,
|
183
|
+
)
|
@@ -10,6 +10,7 @@ from quollio_core.profilers.databricks import (
|
|
10
10
|
databricks_column_stats,
|
11
11
|
databricks_table_level_lineage,
|
12
12
|
)
|
13
|
+
from quollio_core.profilers.stats import get_column_stats_items
|
13
14
|
from quollio_core.repository import databricks as db
|
14
15
|
from quollio_core.repository import dbt, qdc
|
15
16
|
|
@@ -21,7 +22,6 @@ def build_view(
|
|
21
22
|
target_tables: str = "",
|
22
23
|
log_level: str = "info",
|
23
24
|
) -> None:
|
24
|
-
|
25
25
|
logger.info("Build profiler views using dbt")
|
26
26
|
# set parameters
|
27
27
|
dbt_client = dbt.DBTClient()
|
@@ -64,7 +64,6 @@ def load_lineage(
|
|
64
64
|
tenant_id: str,
|
65
65
|
enable_column_lineage: bool = False,
|
66
66
|
) -> None:
|
67
|
-
|
68
67
|
logger.info("Generate Databricks table to table lineage.")
|
69
68
|
databricks_table_level_lineage(
|
70
69
|
conn=conn,
|
@@ -98,7 +97,6 @@ def load_column_stats(
|
|
98
97
|
qdc_client: qdc.QDCExternalAPIClient,
|
99
98
|
tenant_id: str,
|
100
99
|
) -> None:
|
101
|
-
|
102
100
|
logger.info("Generate Databricks column stats.")
|
103
101
|
databricks_column_stats(
|
104
102
|
conn=conn,
|
@@ -240,6 +238,19 @@ if __name__ == "__main__":
|
|
240
238
|
help="Whether to ingest column lineage into QDIC or not. Default value is False",
|
241
239
|
)
|
242
240
|
|
241
|
+
stats_items = get_column_stats_items()
|
242
|
+
parser.add_argument(
|
243
|
+
"--target_stats_items",
|
244
|
+
type=str,
|
245
|
+
nargs="*",
|
246
|
+
choices=stats_items,
|
247
|
+
default=stats_items,
|
248
|
+
action=env_default("DATABRICKS_STATS_ITEMS"),
|
249
|
+
required=False,
|
250
|
+
help="The items for statistic values.\
|
251
|
+
You can choose the items to be aggregated for stats. All items are selected by default.",
|
252
|
+
)
|
253
|
+
|
243
254
|
args = parser.parse_args()
|
244
255
|
set_log_level(level=args.log_level)
|
245
256
|
|
@@ -284,5 +295,6 @@ if __name__ == "__main__":
|
|
284
295
|
endpoint=args.host,
|
285
296
|
qdc_client=qdc_client,
|
286
297
|
tenant_id=args.tenant_id,
|
298
|
+
stats_items=args.target_stats_items,
|
287
299
|
monitoring_table_suffix=args.monitoring_table_suffix,
|
288
300
|
)
|
@@ -0,0 +1,47 @@
|
|
1
|
+
import inspect
|
2
|
+
import logging
|
3
|
+
|
4
|
+
LOG_LEVELS = {
|
5
|
+
"critical": logging.CRITICAL,
|
6
|
+
"error": logging.ERROR,
|
7
|
+
"warning": logging.WARNING,
|
8
|
+
"info": logging.INFO,
|
9
|
+
"debug": logging.DEBUG,
|
10
|
+
"notset": logging.NOTSET,
|
11
|
+
}
|
12
|
+
|
13
|
+
logger = logging.getLogger(__name__)
|
14
|
+
|
15
|
+
|
16
|
+
def configure_logging(level: str = "INFO"):
|
17
|
+
"""Configure logging settings."""
|
18
|
+
log_level = LOG_LEVELS.get(level.lower())
|
19
|
+
if log_level is None:
|
20
|
+
raise ValueError(f"Unknown log level: {level}")
|
21
|
+
|
22
|
+
logging.basicConfig(
|
23
|
+
level=log_level,
|
24
|
+
format="%(asctime)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s",
|
25
|
+
)
|
26
|
+
logger.setLevel(log_level)
|
27
|
+
logger.info(f"Logging is configured to {level} level.")
|
28
|
+
|
29
|
+
|
30
|
+
def error_handling_decorator(func):
|
31
|
+
"""Decorator for consistent error handling in CLI commands."""
|
32
|
+
|
33
|
+
def wrapper(*args, **kwargs):
|
34
|
+
func_name = func.__name__
|
35
|
+
try:
|
36
|
+
logger.debug(f"Starting {func_name}")
|
37
|
+
result = func(*args, **kwargs)
|
38
|
+
logger.debug(f"Completed {func_name} successfully")
|
39
|
+
return result
|
40
|
+
except Exception as e:
|
41
|
+
current_frame = inspect.currentframe()
|
42
|
+
error_frame = current_frame.f_back
|
43
|
+
line_number = error_frame.f_lineno
|
44
|
+
logger.error(f"Error in {func_name} at line {line_number}: {str(e)}", exc_info=True)
|
45
|
+
raise
|
46
|
+
|
47
|
+
return wrapper
|
@@ -0,0 +1,145 @@
|
|
1
|
+
from typing import Dict, List
|
2
|
+
|
3
|
+
from google.auth.credentials import Credentials
|
4
|
+
|
5
|
+
from quollio_core.helper.log_utils import error_handling_decorator, logger
|
6
|
+
from quollio_core.profilers.lineage import gen_table_lineage_payload, parse_bigquery_table_lineage
|
7
|
+
from quollio_core.profilers.stats import gen_table_stats_payload
|
8
|
+
from quollio_core.repository import qdc
|
9
|
+
from quollio_core.repository.bigquery import BigQueryClient, GCPLineageClient, get_entitiy_reference, get_search_request
|
10
|
+
|
11
|
+
|
12
|
+
@error_handling_decorator
|
13
|
+
def bigquery_table_lineage(
|
14
|
+
qdc_client: qdc.QDCExternalAPIClient,
|
15
|
+
tenant_id: str,
|
16
|
+
project_id: str,
|
17
|
+
regions: list,
|
18
|
+
org_id: str,
|
19
|
+
credentials: Credentials,
|
20
|
+
) -> None:
|
21
|
+
lineage_client = GCPLineageClient(credentials)
|
22
|
+
bq_client = BigQueryClient(credentials, project_id)
|
23
|
+
|
24
|
+
datasets = bq_client.list_dataset_ids()
|
25
|
+
all_tables = generate_table_list(bq_client, datasets)
|
26
|
+
lineage_links = generate_lineage_links(all_tables, lineage_client, project_id, regions)
|
27
|
+
lineage_links = parse_bigquery_table_lineage(lineage_links)
|
28
|
+
|
29
|
+
update_table_lineage_inputs = gen_table_lineage_payload(tenant_id=tenant_id, endpoint=org_id, tables=lineage_links)
|
30
|
+
|
31
|
+
req_count = 0
|
32
|
+
for update_table_lineage_input in update_table_lineage_inputs:
|
33
|
+
logger.info(
|
34
|
+
"Generating table lineage. downstream: %s -> %s -> %s",
|
35
|
+
update_table_lineage_input.downstream_database_name,
|
36
|
+
update_table_lineage_input.downstream_schema_name,
|
37
|
+
update_table_lineage_input.downstream_table_name,
|
38
|
+
)
|
39
|
+
status_code = qdc_client.update_lineage_by_id(
|
40
|
+
global_id=update_table_lineage_input.downstream_global_id,
|
41
|
+
payload=update_table_lineage_input.upstreams.as_dict(),
|
42
|
+
)
|
43
|
+
if status_code == 200:
|
44
|
+
req_count += 1
|
45
|
+
logger.info("Generating table lineage is finished. %s lineages are ingested.", req_count)
|
46
|
+
|
47
|
+
|
48
|
+
@error_handling_decorator
|
49
|
+
def bigquery_table_stats(
|
50
|
+
qdc_client: qdc.QDCExternalAPIClient,
|
51
|
+
bq_client: BigQueryClient,
|
52
|
+
tenant_id: str,
|
53
|
+
org_id: str,
|
54
|
+
dataplex_stats_tables: list,
|
55
|
+
) -> None:
|
56
|
+
profiling_results = []
|
57
|
+
for table in dataplex_stats_tables:
|
58
|
+
logger.info("Profiling columns using Dataplex stats table: %s", table)
|
59
|
+
profiling_results.extend(column_stats_from_dataplex(bq_client, table))
|
60
|
+
|
61
|
+
stats = gen_table_stats_payload(tenant_id, org_id, profiling_results)
|
62
|
+
|
63
|
+
for stat in stats:
|
64
|
+
status_code = qdc_client.update_stats_by_id(
|
65
|
+
global_id=stat.global_id,
|
66
|
+
payload=stat.body.as_dict(),
|
67
|
+
)
|
68
|
+
if status_code == 200:
|
69
|
+
logger.info(
|
70
|
+
"Stats for column %s -> %s -> %s -> %s is successfully ingested.",
|
71
|
+
stat.db,
|
72
|
+
stat.schema,
|
73
|
+
stat.table,
|
74
|
+
stat.column,
|
75
|
+
)
|
76
|
+
logger.debug("Stats for column id %s is successfully ingested.", stat.global_id)
|
77
|
+
|
78
|
+
|
79
|
+
def generate_table_list(bq_client: BigQueryClient, datasets: List[str]) -> List[str]:
|
80
|
+
all_tables = []
|
81
|
+
for dataset in datasets:
|
82
|
+
all_tables.extend(
|
83
|
+
[
|
84
|
+
table
|
85
|
+
for table in bq_client.list_tables(dataset)
|
86
|
+
if table["table_type"] in ["TABLE", "VIEW", "MATERIALIZED_VIEW"]
|
87
|
+
],
|
88
|
+
)
|
89
|
+
|
90
|
+
all_table_names = []
|
91
|
+
for table in all_tables:
|
92
|
+
all_table_names.append(f"{bq_client.client.project}.{table['dataset_id']}.{table['table_id']}")
|
93
|
+
|
94
|
+
return all_table_names
|
95
|
+
|
96
|
+
|
97
|
+
def generate_lineage_links(
|
98
|
+
all_tables: List[str],
|
99
|
+
lineage_client: GCPLineageClient,
|
100
|
+
project_id: str,
|
101
|
+
regions: List[str],
|
102
|
+
) -> Dict[str, List[str]]:
|
103
|
+
lineage_links = {}
|
104
|
+
for table in all_tables:
|
105
|
+
if "quollio" in table.lower():
|
106
|
+
continue
|
107
|
+
downstream = get_entitiy_reference()
|
108
|
+
downstream.fully_qualified_name = f"bigquery:{table}"
|
109
|
+
|
110
|
+
for region in regions:
|
111
|
+
request = get_search_request(downstream_table=downstream, project_id=project_id, region=region)
|
112
|
+
response = lineage_client.get_links(request=request)
|
113
|
+
for lineage in response:
|
114
|
+
target_table = str(lineage.target.fully_qualified_name).replace("bigquery:", "")
|
115
|
+
source_table = str(lineage.source.fully_qualified_name).replace("bigquery:", "")
|
116
|
+
if target_table not in lineage_links:
|
117
|
+
lineage_links[target_table] = []
|
118
|
+
if source_table not in lineage_links[target_table]:
|
119
|
+
lineage_links[target_table].append(source_table)
|
120
|
+
|
121
|
+
return lineage_links
|
122
|
+
|
123
|
+
|
124
|
+
def column_stats_from_dataplex(bq_client: BigQueryClient, profiling_table: str) -> List[Dict]:
|
125
|
+
query = f"""
|
126
|
+
SELECT
|
127
|
+
data_source.table_project_id AS DB_NAME,
|
128
|
+
data_source.dataset_id AS SCHEMA_NAME,
|
129
|
+
data_source.table_id AS TABLE_NAME,
|
130
|
+
column_name AS COLUMN_NAME,
|
131
|
+
min_value AS MIN_VALUE,
|
132
|
+
max_value AS MAX_VALUE,
|
133
|
+
average_value AS AVG_VALUE,
|
134
|
+
quartile_median AS MEDIAN_VALUE,
|
135
|
+
standard_deviation AS STDDEV_VALUE,
|
136
|
+
top_n[0][0] AS MODE_VALUE,
|
137
|
+
CAST((percent_null / 100) * job_rows_scanned AS INT) as NULL_COUNT,
|
138
|
+
CAST((percent_unique / 100) * job_rows_scanned AS INT) as CARDINALITY
|
139
|
+
FROM `{profiling_table}`
|
140
|
+
"""
|
141
|
+
logger.debug(f"Executing Query: {query}")
|
142
|
+
results = bq_client.client.query(query).result()
|
143
|
+
|
144
|
+
# Convert RowIterator to a list of dictionaries
|
145
|
+
return [dict(row) for row in results]
|
@@ -6,7 +6,7 @@ from quollio_core.profilers.lineage import (
|
|
6
6
|
gen_table_lineage_payload,
|
7
7
|
parse_databricks_table_lineage,
|
8
8
|
)
|
9
|
-
from quollio_core.profilers.stats import gen_table_stats_payload
|
9
|
+
from quollio_core.profilers.stats import gen_table_stats_payload, get_is_target_stats_items, render_sql_for_stats
|
10
10
|
from quollio_core.repository import databricks, qdc
|
11
11
|
|
12
12
|
logger = logging.getLogger(__name__)
|
@@ -125,59 +125,63 @@ def _get_monitoring_tables(
|
|
125
125
|
|
126
126
|
|
127
127
|
def _get_column_stats(
|
128
|
-
conn: databricks.DatabricksConnectionConfig,
|
128
|
+
conn: databricks.DatabricksConnectionConfig,
|
129
|
+
stats_items: List[str],
|
130
|
+
monitoring_table_suffix: str = "_profile_metrics",
|
129
131
|
) -> List[Dict[str, str]]:
|
130
132
|
tables = _get_monitoring_tables(conn, monitoring_table_suffix)
|
131
133
|
if not tables:
|
132
134
|
return []
|
133
135
|
stats = []
|
136
|
+
is_aggregate_items = get_is_target_stats_items(stats_items=stats_items)
|
134
137
|
for table in tables:
|
135
138
|
monitored_table = table["table_fqdn"].removesuffix("_profile_metrics")
|
136
139
|
monitored_table = monitored_table.split(".")
|
137
140
|
if len(monitored_table) != 3:
|
138
141
|
raise ValueError(f"Invalid table name: {table['table_fqdn']}")
|
139
142
|
with databricks.DatabricksQueryExecutor(config=conn) as databricks_executor:
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
143
|
+
cte = """
|
144
|
+
WITH profile_record_history AS (
|
145
|
+
SELECT
|
146
|
+
COLUMN_NAME
|
147
|
+
, distinct_count as cardinality
|
148
|
+
, MAX as max_value
|
149
|
+
, MIN as min_value
|
150
|
+
, AVG as avg_value
|
151
|
+
, MEDIAN as median_value
|
152
|
+
, STDDEV as stddev_value
|
153
|
+
, NUM_NULLS as null_count
|
154
|
+
, get(frequent_items, 0).item AS mode_value
|
155
|
+
, row_number() over(partition by column_name order by window desc) rownum
|
156
|
+
FROM
|
157
|
+
{monitoring_table}
|
158
|
+
WHERE
|
159
|
+
column_name not in (':table')
|
160
|
+
), profile_record AS (
|
161
|
+
SELECT
|
162
|
+
"{monitored_table_catalog}" as db_name
|
163
|
+
, "{monitored_table_schema}" as schema_name
|
164
|
+
, "{monitored_table_name}" as table_name
|
165
|
+
, column_name
|
166
|
+
, max_value
|
167
|
+
, min_value
|
168
|
+
, null_count
|
169
|
+
, cardinality
|
170
|
+
, avg_value
|
171
|
+
, median_value
|
172
|
+
, mode_value
|
173
|
+
, stddev_value
|
174
|
+
FROM
|
175
|
+
profile_record_history
|
176
|
+
WHERE
|
177
|
+
rownum = 1
|
178
|
+
)""".format(
|
176
179
|
monitoring_table=table["table_fqdn"],
|
177
180
|
monitored_table_catalog=monitored_table[0],
|
178
181
|
monitored_table_schema=monitored_table[1],
|
179
182
|
monitored_table_name=monitored_table[2],
|
180
183
|
)
|
184
|
+
query = render_sql_for_stats(is_aggregate_items=is_aggregate_items, table_fqn="profile_record", cte=cte)
|
181
185
|
logger.debug(f"The following sql will be fetched to retrieve stats values. {query}")
|
182
186
|
stats.append(databricks_executor.get_query_results(query))
|
183
187
|
return stats
|
@@ -188,9 +192,10 @@ def databricks_column_stats(
|
|
188
192
|
endpoint: str,
|
189
193
|
qdc_client: qdc.QDCExternalAPIClient,
|
190
194
|
tenant_id: str,
|
195
|
+
stats_items: List[str],
|
191
196
|
monitoring_table_suffix: str = "_profile_metrics",
|
192
197
|
) -> None:
|
193
|
-
table_stats = _get_column_stats(conn, monitoring_table_suffix)
|
198
|
+
table_stats = _get_column_stats(conn, stats_items, monitoring_table_suffix)
|
194
199
|
for table in table_stats:
|
195
200
|
logger.debug("Table %s will be aggregated.", table)
|
196
201
|
stats = gen_table_stats_payload(tenant_id=tenant_id, endpoint=endpoint, stats=table)
|
@@ -1,8 +1,13 @@
|
|
1
1
|
import logging
|
2
|
+
from typing import List
|
2
3
|
|
3
4
|
from quollio_core.profilers.lineage import gen_table_lineage_payload, gen_table_lineage_payload_inputs
|
4
5
|
from quollio_core.profilers.sqllineage import SQLLineage
|
5
|
-
from quollio_core.profilers.stats import gen_table_stats_payload_from_tuple
|
6
|
+
from quollio_core.profilers.stats import (
|
7
|
+
gen_table_stats_payload_from_tuple,
|
8
|
+
get_is_target_stats_items,
|
9
|
+
render_sql_for_stats,
|
10
|
+
)
|
6
11
|
from quollio_core.repository import qdc, redshift
|
7
12
|
|
8
13
|
logger = logging.getLogger(__name__)
|
@@ -76,38 +81,24 @@ def redshift_table_stats(
|
|
76
81
|
conn: redshift.RedshiftConnectionConfig,
|
77
82
|
qdc_client: qdc.QDCExternalAPIClient,
|
78
83
|
tenant_id: str,
|
84
|
+
stats_items: List[str],
|
79
85
|
) -> None:
|
80
|
-
|
86
|
+
is_aggregate_items = get_is_target_stats_items(stats_items=stats_items)
|
81
87
|
with redshift.RedshiftQueryExecutor(config=conn) as redshift_executor:
|
82
88
|
stats_query = _gen_get_stats_views_query(
|
83
89
|
db=conn.database,
|
84
90
|
schema=conn.schema,
|
85
91
|
)
|
86
92
|
stats_views = redshift_executor.get_query_results(query=stats_query)
|
93
|
+
logger.info("Found %s for table statistics.", len(stats_views))
|
87
94
|
|
88
95
|
req_count = 0
|
89
96
|
for stats_view in stats_views:
|
90
|
-
|
91
|
-
|
92
|
-
db_name
|
93
|
-
, schema_name
|
94
|
-
, table_name
|
95
|
-
, column_name
|
96
|
-
, max_value
|
97
|
-
, min_value
|
98
|
-
, null_count
|
99
|
-
, cardinality
|
100
|
-
, avg_value
|
101
|
-
, median_value
|
102
|
-
, mode_value
|
103
|
-
, stddev_value
|
104
|
-
FROM
|
105
|
-
{db}.{schema}.{table}
|
106
|
-
""".format(
|
107
|
-
db=stats_view[0],
|
108
|
-
schema=stats_view[1],
|
109
|
-
table=stats_view[2],
|
97
|
+
table_fqn = "{catalog}.{schema}.{table}".format(
|
98
|
+
catalog=stats_view[0], schema=stats_view[1], table=stats_view[2]
|
110
99
|
)
|
100
|
+
stats_query = render_sql_for_stats(is_aggregate_items=is_aggregate_items, table_fqn=table_fqn)
|
101
|
+
logger.debug(f"The following sql will be fetched to retrieve stats values. {stats_query}")
|
111
102
|
stats_result = redshift_executor.get_query_results(query=stats_query)
|
112
103
|
payloads = gen_table_stats_payload_from_tuple(tenant_id=tenant_id, endpoint=conn.host, stats=stats_result)
|
113
104
|
for payload in payloads:
|