quollio-core 0.4.17__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {quollio_core-0.4.17 → quollio_core-0.5.0}/PKG-INFO +2 -1
- {quollio_core-0.4.17 → quollio_core-0.5.0}/pyproject.toml +1 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/__init__.py +1 -1
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/models/quollio_stats_profiling_columns.sql +1 -1
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/helper/core.py +7 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/profilers/snowflake.py +75 -7
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/profilers/sqllineage.py +13 -9
- quollio_core-0.5.0/quollio_core/profilers/teradata/lineage.py +172 -0
- quollio_core-0.5.0/quollio_core/profilers/teradata/stats.py +218 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/repository/qdc.py +1 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/repository/snowflake.py +6 -9
- quollio_core-0.5.0/quollio_core/repository/ssm.py +59 -0
- quollio_core-0.5.0/quollio_core/repository/teradata.py +103 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/snowflake.py +29 -8
- quollio_core-0.5.0/quollio_core/teradata.py +254 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/LICENSE +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/README.md +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/bigquery.py +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/bricks.py +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/databricks/.gitignore +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/databricks/README.md +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/databricks/analyses/.gitkeep +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/databricks/dbt_project.yml +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/databricks/macros/.gitkeep +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/databricks/models/quollio_lineage_column_level.sql +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/databricks/models/quollio_lineage_column_level.yml +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/databricks/models/quollio_lineage_table_level.sql +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/databricks/models/quollio_lineage_table_level.yml +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/databricks/models/sources.yml +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/databricks/packages_hub.yml +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/databricks/packages_local.yml +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/databricks/profiles/profiles_template.yml +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/databricks/seeds/.gitkeep +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/databricks/snapshots/.gitkeep +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/README.md +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/analyses/.gitkeep +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/dbt_project.yml +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/macros/.gitkeep +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/macros/materialization/divided_view.sql +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/models/quollio_lineage_table_level.sql +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/models/quollio_lineage_table_level.yml +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/models/quollio_lineage_view_level.sql +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/models/quollio_lineage_view_level.yml +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/models/quollio_sqllineage_sources.sql +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/models/quollio_sqllineage_sources.yml +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/models/quollio_stats_columns.sql +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/models/quollio_stats_columns.yml +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/models/quollio_stats_profiling_columns.sql +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/models/quollio_stats_profiling_columns.yml +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/models/sources.yml +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/packages_hub.yml +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/packages_local.yml +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/profiles/profiles_template.yml +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/seeds/.gitkeep +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/snapshots/.gitkeep +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/seeds/.gitkeep +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/README.md +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/analyses/.gitkeep +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/dbt_project.yml +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/macros/.gitkeep +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/macros/materialization/divided_view.sql +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/models/quollio_lineage_column_level.sql +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/models/quollio_lineage_column_level.yml +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/models/quollio_lineage_table_level.sql +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/models/quollio_lineage_table_level.yml +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/models/quollio_sqllineage_sources.sql +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/models/quollio_sqllineage_sources.yml +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/models/quollio_stats_columns.sql +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/models/quollio_stats_columns.yml +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/models/quollio_stats_profiling_columns.yml +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/models/sources.yml +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/packages_hub.yml +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/packages_local.yml +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/profiles/profiles_template.yml +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/seeds/.gitkeep +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/snapshots/.gitkeep +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/helper/__init__.py +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/helper/env_default.py +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/helper/log.py +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/helper/log_utils.py +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/profilers/__init__.py +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/profilers/bigquery.py +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/profilers/databricks.py +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/profilers/lineage.py +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/profilers/redshift.py +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/profilers/stats.py +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/redshift.py +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/repository/__init__.py +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/repository/bigquery.py +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/repository/databricks.py +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/repository/dbt.py +0 -0
- {quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/repository/redshift.py +0 -0
{quollio_core-0.4.17 → quollio_core-0.5.0}/PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: quollio-core
-Version: 0.4.17
+Version: 0.5.0
 Summary: Quollio Core
 Author-email: quollio-dev <qt.dev@quollio.com>
 Maintainer-email: RyoAriyama <ryo.arym@gmail.com>, tharuta <35373297+TakumiHaruta@users.noreply.github.com>
@@ -37,6 +37,7 @@ Requires-Dist: google-cloud-bigquery==3.22.0
 Requires-Dist: google-cloud-datacatalog==3.19.0
 Requires-Dist: google-cloud-datacatalog-lineage==0.3.6
 Requires-Dist: google-api-python-client==2.131.0
+Requires-Dist: teradatasql==20.0.0.15
 Requires-Dist: black>=22.3.0 ; extra == "test"
 Requires-Dist: coverage>=7.3.2 ; extra == "test"
 Requires-Dist: isort>=5.10.1 ; extra == "test"
```
{quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/helper/core.py

```diff
@@ -35,3 +35,10 @@ def setup_dbt_profile(connections_json: Dict[str, str], template_path: str, temp
 
 def trim_prefix(s: str, prefix: str) -> str:
     return s.lstrip(prefix)
+
+
+def is_valid_domain(domain: str, domain_type: str) -> bool:
+    if domain_type == "VPC_ENDPOINT":
+        return domain.endswith("/api")
+    else:
+        return domain.endswith(".com")
```
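The new `is_valid_domain` helper is small enough to pin down by example. A minimal sketch of its behavior as written; the sample URLs and the `DEFAULT` domain type are hypothetical (any value other than `VPC_ENDPOINT` takes the `.com` branch):

```python
from quollio_core.helper.core import is_valid_domain

# VPC endpoint domains are expected to end with "/api";
# every other domain_type is expected to end with ".com".
assert is_valid_domain("https://example.internal/api", "VPC_ENDPOINT")
assert is_valid_domain("https://tenant.quollio.com", "DEFAULT")
assert not is_valid_domain("https://tenant.quollio.io", "DEFAULT")
```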
{quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/profilers/snowflake.py

```diff
@@ -19,7 +19,7 @@ def snowflake_table_to_table_lineage(
     tenant_id: str,
 ) -> None:
     with snowflake.SnowflakeQueryExecutor(conn) as sf_executor:
-        results = sf_executor.get_query_results(
+        results, err = sf_executor.get_query_results(
             query="""
             SELECT
                 *
@@ -30,6 +30,13 @@ def snowflake_table_to_table_lineage(
                 schema=conn.account_schema,
             )
         )
+        if err is not None:
+            handle_error(err=err)
+        if len(results) == 0:
+            logger.warning(
+                "No lineage data in ACCOUNT_USAGE.SNOWFLAKE. Please check the data in `QUOLLIO_LINEAGE_TABLE_LEVEL`."
+            )
+            return
         parsed_results = parse_snowflake_results(results=results)
         update_table_lineage_inputs = gen_table_lineage_payload(
             tenant_id=tenant_id,
@@ -62,7 +69,7 @@ def snowflake_column_to_column_lineage(
     tenant_id: str,
 ) -> None:
     with snowflake.SnowflakeQueryExecutor(conn) as sf_executor:
-        results = sf_executor.get_query_results(
+        results, err = sf_executor.get_query_results(
             query="""
             SELECT
                 *
@@ -73,6 +80,13 @@ def snowflake_column_to_column_lineage(
                 schema=conn.account_schema,
             )
         )
+        if err is not None:
+            handle_error(err=err)
+        if len(results) == 0:
+            logger.warning(
+                "No lineage data in ACCOUNT_USAGE.SNOWFLAKE. Please check the data in `QUOLLIO_LINEAGE_COLUMN_LEVEL`."
+            )
+            return
         update_column_lineage_inputs = gen_column_lineage_payload(
             tenant_id=tenant_id,
             endpoint=conn.account_id,
@@ -105,7 +119,7 @@ def snowflake_table_level_sqllineage(
     tenant_id: str,
 ) -> None:
     with snowflake.SnowflakeQueryExecutor(conn) as sf_executor:
-        results = sf_executor.get_query_results(
+        results, err = sf_executor.get_query_results(
             query="""
             SELECT
                 database_name
@@ -118,6 +132,13 @@ def snowflake_table_level_sqllineage(
                 schema=conn.account_schema,
             )
         )
+        if err is not None:
+            handle_error(err=err)
+        if len(results) == 0:
+            logger.warning(
+                "No lineage data in ACCOUNT_USAGE.SNOWFLAKE. Please check the data in `QUOLLIO_SQLLINEAGE_SOURCES`."
+            )
+            return
         update_table_lineage_inputs_list = list()
         sql_lineage = SQLLineage()
         for result in results:
@@ -158,12 +179,20 @@ def snowflake_table_stats(
     stats_items: List[str],
 ) -> None:
     with snowflake.SnowflakeQueryExecutor(conn) as sf_executor:
-
+        get_stats_view_query = _gen_get_stats_views_query(
             db=conn.account_database,
             schema=conn.account_schema,
         )
-        stats_views = sf_executor.get_query_results(query=
-
+        stats_views, err = sf_executor.get_query_results(query=get_stats_view_query)
+        if err is not None:
+            handle_error(err=err)
+        if len(stats_views) == 0:
+            logger.warning(
+                f"No target table for stats aggregation. Please see the error message above \
+                and fix it or grant usage permission to both `{conn.account_database}` and `{conn.account_schema}` \
+                and select permissions to views begins with `QUOLLIO_STATS_COLUMNS_`."
+            )
+            return
         req_count = 0
         is_aggregate_items = get_is_target_stats_items(stats_items=stats_items)
         for stats_view in stats_views:
@@ -172,7 +201,15 @@ def snowflake_table_stats(
             )
             stats_query = render_sql_for_stats(is_aggregate_items=is_aggregate_items, table_fqn=table_fqn)
             logger.debug(f"The following sql will be fetched to retrieve stats values. {stats_query}")
-            stats_result = sf_executor.get_query_results(query=stats_query)
+            stats_result, err = sf_executor.get_query_results(query=stats_query)
+            if err is not None:
+                handle_error(err=err, force_skip=True)
+            if len(stats_result) == 0:
+                logger.warning(
+                    f"No stats value. Please query {table_fqn} to check the value exists in it \
+                    or user has select permission to it."
+                )
+                continue
             payloads = gen_table_stats_payload(tenant_id=tenant_id, endpoint=conn.account_id, stats=stats_result)
             for payload in payloads:
                 logger.info(
@@ -209,3 +246,34 @@ def _gen_get_stats_views_query(db: str, schema: str) -> str:
         db=db, schema=schema
     )
     return query
+
+
+def handle_error(err: Exception, force_skip: bool = False):
+    if err.errno == 2037:
+        logger.warning(
+            "snowflake get_query_results failed. The table you query exists but user doesn't have permission to select.\
+            Please check a user has select or ownership permissions. ErrorNo: {0} SQLState: {1} Message: {2} SfqID: {3}".format(
+                err.errno, err.sqlstate, err.msg, err.sfqid
+            )
+        )
+        return
+    elif err.errno == 2003:
+        logger.warning(
+            "snowflake get_query_results failed. User doesn't have select permission to the object \
+            or the object you query doesn't exist.\
+            Please check a user has select or ownership permissions and whether the object exists or not. \
+            ErrorNo: {0} SQLState: {1} Message: {2} SfqID: {3}".format(
+                err.errno, err.sqlstate, err.msg, err.sfqid
+            )
+        )
+        return
+    else:
+        logger.error(
+            "snowflake get_query_results failed.\
+            Please check ErrNo and message. ErrorNo: {0} SQLState: {1} Message: {2} SfqID: {3}".format(
+                err.errno, err.sqlstate, err.msg, err.sfqid
+            )
+        )
+        if not force_skip:
+            raise Exception
+        return
```
{quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/profilers/sqllineage.py

```diff
@@ -67,15 +67,19 @@ class SQLLineage:
         dest_schema = dest_schema.upper() if dest_schema is not None else None
 
         # MEMO: Complement sql with dialect, source database and source schema info.
-        optimized_stmt: sqlglot.Expression = optimizer.qualify.qualify(
-            statement,
-            dialect=dialect,
-            catalog=src_db,
-            db=src_schema,
-            qualify_columns=False,
-            validate_qualify_columns=False,
-            identify=False,
-        )
+        # MEMO: Skipping qualify because it normalizes the table names.
+        if dialect == "teradata":
+            optimized_stmt = statement
+        else:
+            optimized_stmt: sqlglot.Expression = optimizer.qualify.qualify(
+                statement,
+                dialect=dialect,
+                catalog=src_db,
+                db=src_schema,
+                qualify_columns=False,
+                validate_qualify_columns=False,
+                identify=False,
+            )
 
         orig_dest_table = Table(table="")
         dest_table = Table(table="")
```
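For every dialect except Teradata the statement still goes through sqlglot's qualifier, which rewrites bare table names against the supplied catalog and schema; that normalization is exactly what the new Teradata branch avoids. A rough illustration with made-up names (exact output can vary across sqlglot versions):

```python
import sqlglot
from sqlglot.optimizer.qualify import qualify

stmt = sqlglot.parse_one("INSERT INTO dst SELECT id FROM src", dialect="snowflake")
qualified = qualify(
    stmt,
    dialect="snowflake",
    catalog="MY_DB",      # plays the role of src_db in the profiler
    db="MY_SCHEMA",       # plays the role of src_schema in the profiler
    qualify_columns=False,
    validate_qualify_columns=False,
    identify=False,
)
# Bare names are expanded, roughly:
# ... SELECT id FROM MY_DB.MY_SCHEMA.src AS src
print(qualified.sql(dialect="snowflake"))
```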
quollio_core-0.5.0/quollio_core/profilers/teradata/lineage.py (new file)

```diff
@@ -0,0 +1,172 @@
+import os
+from collections import OrderedDict
+from typing import Dict, List, Set, Tuple, Union
+
+from sqlglot import ParseError
+
+from quollio_core.helper.log_utils import error_handling_decorator, logger
+from quollio_core.profilers.sqllineage import SQLLineage, Table
+from quollio_core.repository import qdc
+from quollio_core.repository import teradata as teradata_repo
+
+
+@error_handling_decorator
+def load_lineage(
+    conn_config: teradata_repo.TeradataConfig,
+    endpoint: str = None,
+    tenant_id: str = None,
+    qdc_client: qdc.QDCExternalAPIClient = None,
+    page_size: int = None,
+) -> None:
+    page_size = page_size or int(os.environ.get("TERADATA_PAGE_SIZE", 1000))
+    offset = 0
+    all_lineage_results = []
+
+    with teradata_repo.new_teradata_client(conn_config) as conn:
+        while True:
+            query = f"""
+            SELECT
+                a.QueryID,
+                TRIM(a.SqlTextInfo) AS SqlTextInfo,
+                a.SqlRowNo,
+                TRIM(d.DatabaseName) AS DefaultDatabase
+            FROM DBC.QryLogSQLV a
+            JOIN DBC.QryLogV b
+                ON a.QueryID = b.QueryID
+            JOIN DBC.DatabasesV d
+                ON b.DefaultDatabase = d.DatabaseName
+            WHERE
+                UPPER(TRIM(SqlTextInfo)) LIKE 'CREATE TABLE%'
+                OR UPPER(TRIM(SqlTextInfo)) LIKE 'CREATE VIEW%'
+                OR UPPER(TRIM(SqlTextInfo)) LIKE 'INSERT%'
+                OR UPPER(TRIM(SqlTextInfo)) LIKE 'MERGE%'
+                OR UPPER(TRIM(SqlTextInfo)) LIKE 'UPDATE%'
+            QUALIFY ROW_NUMBER() OVER (ORDER BY a.QueryID, a.SqlRowNo) > {offset}
+                AND ROW_NUMBER() OVER (ORDER BY a.QueryID, a.SqlRowNo) <= {offset + page_size}
+            """
+
+            rows = teradata_repo.execute_query(query, conn)
+            if not rows:
+                break
+
+            logger.info(f"Concatenating split queries for page {offset // page_size + 1}...")
+            concatenated_queries = concatenate_split_queries(rows)
+
+            logger.info("Processing SQL statements and extracting lineage...")
+            lineage_results = process_sql_statements(concatenated_queries)
+            all_lineage_results.extend(lineage_results)
+
+            if len(rows) < page_size:
+                break
+
+            offset += page_size
+
+    logger.info(f"Lineage extraction complete. Found {len(all_lineage_results)} unique entries.")
+    for entry in all_lineage_results:
+        if len(entry) > 1:
+            logger.debug(f"Destination table: {entry[1]}")
+        else:
+            logger.debug("Destination table: Not available (out of bounds)")
+
+        if len(entry) > 0 and isinstance(entry[0], list):
+            logger.debug("Source tables:")
+            for src_table in entry[0]:
+                logger.debug(f" - {src_table}")
+        else:
+            logger.debug("Source tables: Not available (out of bounds or invalid type)")
+
+        logger.debug("---")
+
+    sql_lineage = SQLLineage()
+    update_table_lineage_inputs = [
+        sql_lineage.gen_lineage_input(
+            tenant_id=tenant_id, endpoint=endpoint, src_tables=src_tables, dest_table=dest_table
+        )
+        for src_tables, dest_table in all_lineage_results
+    ]
+
+    table_req_count = 0
+    logger.info(f"Starting to update lineage information for {len(update_table_lineage_inputs)} tables.")
+    for update_table_lineage_input in update_table_lineage_inputs:
+        logger.info(
+            f"Generating table lineage. downstream: {update_table_lineage_input.downstream_database_name}"
+            f" -> {update_table_lineage_input.downstream_table_name}"
+        )
+        try:
+            status_code = qdc_client.update_lineage_by_id(
+                global_id=update_table_lineage_input.downstream_global_id,
+                payload=update_table_lineage_input.upstreams.as_dict(),
+            )
+            if status_code == 200:
+                table_req_count += 1
+            else:
+                logger.error(
+                    f"Failed to update lineage for {update_table_lineage_input.downstream_table_name}.\
+                    Status code: {status_code}"
+                )
+        except Exception as e:
+            logger.error(
+                f"Exception occurred while updating lineage for {update_table_lineage_input.downstream_table_name}: {e}"
+            )
+    logger.info(f"Generating table lineage is finished. {table_req_count} lineages are ingested.")
+
+
+@error_handling_decorator
+def extract_lineage(sql_statement: str, default_database: str = None) -> Tuple[Set[Table], Table]:
+    try:
+        logger.debug(f"Parsing SQL: {sql_statement}")
+        sql_lineage = SQLLineage()
+        source_tables, dest_table = sql_lineage.get_table_level_lineage_source(sql=sql_statement, dialect="teradata")
+
+        source_tables = {Table(db=t.db_schema or default_database, db_schema="", table=t.table) for t in source_tables}
+        dest_table = Table(db=dest_table.db_schema or default_database, db_schema="", table=dest_table.table)
+
+        return source_tables, dest_table
+    except ParseError as e:
+        logger.error(f"Error parsing SQL: {e}")
+        logger.debug(f"Problematic SQL: {sql_statement}")
+    except AttributeError as e:
+        logger.error(f"Attribute error while extracting lineage: {e}")
+        logger.debug(f"Problematic SQL: {sql_statement}")
+    except Exception as e:
+        logger.error(f"Unexpected error while extracting lineage: {e}")
+        logger.debug(f"Problematic SQL: {sql_statement}")
+    return set(), Table(db="", table="")
+
+
+@error_handling_decorator
+def process_sql_statements(queries: List[Union[str, Dict[str, Union[str, int]]]]) -> List[Tuple[Set[Table], Table]]:
+    lineage_dict = OrderedDict()
+    for query in queries:
+        if isinstance(query, str):
+            sql = query
+            default_database = None
+        else:
+            sql = query["SqlTextInfo"]
+            default_database = query.get("DefaultDatabase")
+
+        source_tables, dest_table = extract_lineage(sql, default_database)
+        if dest_table.table and source_tables:
+            if dest_table in lineage_dict:
+                logger.info(f"Merging duplicate entry for {dest_table}")
+                # Merge source tables
+                lineage_dict[dest_table] = lineage_dict[dest_table].union(source_tables)
+            else:
+                lineage_dict[dest_table] = source_tables
+    return [(src_tables, dest_table) for dest_table, src_tables in lineage_dict.items()]
+
+
+def concatenate_split_queries(rows: List[Dict[str, Union[str, int]]]) -> List[Dict[str, Union[str, int]]]:
+    queries = {}
+    for row in rows:
+        query_id = row["QueryID"]
+        sql_text = row["SqlTextInfo"]
+        default_database = row["DefaultDatabase"]
+        if query_id not in queries:
+            queries[query_id] = {"SqlTextInfo": [], "DefaultDatabase": default_database}
+        queries[query_id]["SqlTextInfo"].append(sql_text)
+
+    return [
+        {"SqlTextInfo": "".join(query["SqlTextInfo"]), "DefaultDatabase": query["DefaultDatabase"]}
+        for query in queries.values()
+    ]
```
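Two details of this loader are worth calling out: pagination is emulated with `QUALIFY ROW_NUMBER()` windows over `(QueryID, SqlRowNo)`, since the `DBC.QryLogSQLV` scan has no native offset, and statements longer than one log row are stitched back together per `QueryID` before parsing. A quick check of the stitching helper with made-up rows (assumed already in `SqlRowNo` order, as the loader's query guarantees):

```python
from quollio_core.profilers.teradata.lineage import concatenate_split_queries

rows = [  # sample rows; all values are made up
    {"QueryID": 101, "SqlTextInfo": "CREATE TABLE t AS (SELECT * ", "DefaultDatabase": "sales"},
    {"QueryID": 101, "SqlTextInfo": "FROM s) WITH DATA", "DefaultDatabase": "sales"},
    {"QueryID": 102, "SqlTextInfo": "INSERT INTO t2 SELECT * FROM t", "DefaultDatabase": "sales"},
]
print(concatenate_split_queries(rows))
# [{'SqlTextInfo': 'CREATE TABLE t AS (SELECT * FROM s) WITH DATA', 'DefaultDatabase': 'sales'},
#  {'SqlTextInfo': 'INSERT INTO t2 SELECT * FROM t', 'DefaultDatabase': 'sales'}]
```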
quollio_core-0.5.0/quollio_core/profilers/teradata/stats.py (new file)

```diff
@@ -0,0 +1,218 @@
+from typing import Any, Dict, List, Optional
+
+from quollio_core.helper.log_utils import error_handling_decorator, logger
+from quollio_core.profilers.stats import gen_table_stats_payload
+from quollio_core.repository import qdc
+from quollio_core.repository import teradata as teradata_repo
+
+NUMERIC_TYPES = ["D", "F", "I1", "I2", "I8", "I", "N"]
+
+# I, I1, I2, I8 - INT TYPES INTEGER, BYTEINT, SMALLINT, BIGINT
+# F - Float
+# D - Decimal
+# N - Number
+
+
+def quote_identifier(identifier: str) -> str:
+    return f'"{identifier}"'
+
+
+@error_handling_decorator
+def load_stats(
+    conn_config: teradata_repo.TeradataConfig,
+    sample_percent: Optional[float] = None,
+    endpoint: Optional[str] = None,
+    tenant_id: Optional[str] = None,
+    qdc_client: Optional[qdc.QDCExternalAPIClient] = None,
+    target_databases: Optional[List[str]] = None,
+    target_databases_method: str = "DENYLIST",
+    stats_items: Optional[List[str]] = None,
+) -> None:
+    stats_list = []
+    numerical_columns = 0
+    non_numerical_columns = 0
+    logger.info(
+        f"Starting statistics collection. " f"Sample percent: {sample_percent if sample_percent is not None else 'N/A'}"
+    )
+
+    with teradata_repo.new_teradata_client(conn_config) as conn:
+        try:
+            tables = teradata_repo.get_table_list(conn, target_databases, target_databases_method)
+            for table in tables:
+                logger.debug(f"Processing table: {table}")
+                database_name = table["DataBaseName"]
+                table_name = table["TableName"]
+
+                logger.info(f"Processing table {database_name}.{table_name}")
+                columns = teradata_repo.get_column_list(conn, database_name=database_name, table_name=table_name)
+                logger.debug(f"Columns: {columns}")
+
+                for column in columns:
+                    column_name = column["ColumnName"]
+                    column_type = column["ColumnType"]
+                    if column_type is None:
+                        column_type = ""
+                    else:
+                        column_type = column_type.strip()
+
+                    is_numerical = column_type in NUMERIC_TYPES
+                    if is_numerical:
+                        numerical_columns += 1
+                    else:
+                        non_numerical_columns += 1
+
+                    stats_sql = generate_column_statistics_sql(
+                        database_name,
+                        table_name,
+                        column_name,
+                        column_type,
+                        sample_percent if is_numerical else None,
+                        stats_items,
+                    )
+                    logger.debug(f"Generated SQL for column {column_name}: {stats_sql}")
+
+                    try:
+                        result = teradata_repo.execute_query(stats_sql, conn)
+                        logger.debug(f"Query result for column {column_name}: {result}")
+                        if result:
+                            column_stats = parse_column_statistics_result(
+                                result[0], database_name, table_name, column_name, stats_items, is_numerical
+                            )
+                            stats_list.append(column_stats)
+                    except Exception as e:
+                        logger.error(
+                            f"Failed to collect statistics for {database_name}.{table_name}.{column_name}: {e}"
+                        )
+
+        except Exception as e:
+            logger.error(f"Error during statistics collection: {e}")
+
+    logger.info("Statistics collection completed successfully.")
+
+    logger.debug(f"Stats list: {stats_list}")
+    payloads = gen_table_stats_payload(stats=stats_list, tenant_id=tenant_id, endpoint=endpoint)
+    logger.debug(f"Generated payloads: {payloads}")
+
+    req_count = 0
+    for payload in payloads:
+        logger.info(f"Generating table stats. asset: {payload.db} -> {payload.table} -> {payload.column}")
+        status_code = qdc_client.update_stats_by_id(
+            global_id=payload.global_id,
+            payload=payload.body.get_column_stats(),
+        )
+        if status_code == 200:
+            req_count += 1
+
+    logger.info(
+        f"Loading statistics is finished. {req_count} statistics are ingested. "
+        f"Numerical columns: {numerical_columns}, Non-numerical columns: {non_numerical_columns}"
+    )
+
+
+@error_handling_decorator
+def parse_column_statistics_result(
+    result: Dict[str, Any],
+    database_name: str,
+    table_name: str,
+    column_name: str,
+    stats_items: Optional[List[str]] = None,
+    is_numerical: bool = False,
+) -> Dict[str, Any]:
+    stats_dict = {
+        "DB_NAME": database_name,
+        "SCHEMA_NAME": "",
+        "TABLE_NAME": table_name,
+        "COLUMN_NAME": column_name,
+    }
+
+    if stats_items:
+        for item in stats_items:
+            if item == "cardinality" and "num_uniques" in result:
+                stats_dict["CARDINALITY"] = result["num_uniques"]
+            elif item == "number_of_null" and "num_nulls" in result:
+                stats_dict["NULL_COUNT"] = result["num_nulls"]  # Changed from NUM_NULLS to NULL_COUNT
+
+            if is_numerical:
+                if item == "min" and "min_value" in result:
+                    stats_dict["MIN_VALUE"] = str(result["min_value"])
+                elif item == "max" and "max_value" in result:
+                    stats_dict["MAX_VALUE"] = str(result["max_value"])
+                elif item == "median" and "median_value" in result:
+                    stats_dict["MEDIAN_VALUE"] = str(result["median_value"])
+                elif item == "mean" and "avg_value" in result:
+                    stats_dict["AVG_VALUE"] = str(result["avg_value"])
+                elif item == "stddev" and "stddev_value" in result:
+                    stats_dict["STDDEV_VALUE"] = str(result["stddev_value"])
+                elif item == "mode" and "mode_value" in result and is_numerical:
+                    stats_dict["MODE_VALUE"] = str(result["mode_value"])
+
+    return stats_dict
+
+
+@error_handling_decorator
+def generate_column_statistics_sql(
+    database_name: str,
+    table_name: str,
+    column_name: str,
+    column_type: str,
+    sample_percent: Optional[float] = None,
+    stats_items: Optional[List[str]] = None,
+) -> str:
+    quoted_column = quote_identifier(column_name)
+    quoted_database = quote_identifier(database_name)
+
+    # Handle the case where table_name might include a database
+    if "." in table_name:
+        schema, table = table_name.split(".", 1)
+        quoted_table = f"{quote_identifier(schema)}.{quote_identifier(table)}"
+    else:
+        quoted_table = quote_identifier(table_name)
+
+    stats_clauses = []
+    mode_query = ""
+
+    if stats_items:
+        if "cardinality" in stats_items:
+            stats_clauses.append(f"COUNT(DISTINCT {quoted_column}) AS num_uniques")
+        if "number_of_null" in stats_items:
+            stats_clauses.append(f"SUM(CASE WHEN {quoted_column} IS NULL THEN 1 ELSE 0 END) AS num_nulls")
+
+        if column_type in NUMERIC_TYPES:
+            if "min" in stats_items:
+                stats_clauses.append(f"MIN(CAST({quoted_column} AS FLOAT)) AS min_value")
+            if "max" in stats_items:
+                stats_clauses.append(f"MAX(CAST({quoted_column} AS FLOAT)) AS max_value")
+            if "median" in stats_items:
+                stats_clauses.append(f"MEDIAN(CAST({quoted_column} AS FLOAT)) AS median_value")
+            if "mean" in stats_items:
+                stats_clauses.append(f"AVG(CAST({quoted_column} AS FLOAT)) AS avg_value")
+            if "stddev" in stats_items:
+                stats_clauses.append(f"STDDEV_SAMP(CAST({quoted_column} AS FLOAT)) AS stddev_value")
+            if "mode" in stats_items:
+                mode_query = (
+                    f"WITH MODE_VALUE AS ("
+                    f" SELECT {quoted_column}, COUNT(*) as freq "
+                    f" FROM {quoted_database}.{quoted_table} "
+                )
+
+                if sample_percent is not None and 0 < sample_percent <= 99:
+                    sample_fraction = sample_percent / 100
+                    mode_query += f" SAMPLE {sample_fraction} "
+
+                mode_query += (
+                    f" GROUP BY {quoted_column} " f" QUALIFY ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC) = 1" f") "
+                )
+                stats_clauses.append(f"(SELECT {quoted_column} FROM MODE_VALUE) AS mode_value")
+
+    if not stats_clauses:
+        logger.warning(f"No statistics selected for column {column_name}. Skipping this column.")
+        return ""
+
+    query = f"{mode_query}" f"SELECT {', '.join(stats_clauses)} " f"FROM {quoted_database}.{quoted_table}"
+
+    if sample_percent is not None and 0 < sample_percent <= 99:
+        sample_fraction = sample_percent / 100
+        query += f" SAMPLE {sample_fraction}"
+
+    logger.debug(f"Generated SQL query for {quoted_database}.{quoted_table}.{quoted_column}: {query}")
+    return query
```
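Because the statistics SQL is assembled clause by clause, the easiest way to see its shape is one concrete call. A sketch with made-up identifiers ("I" is Teradata's type code for INTEGER, so the numeric aggregates are included):

```python
from quollio_core.profilers.teradata.stats import generate_column_statistics_sql

sql = generate_column_statistics_sql(
    database_name="sales",   # made-up identifiers
    table_name="orders",
    column_name="amount",
    column_type="I",
    sample_percent=None,
    stats_items=["cardinality", "min"],
)
print(sql)
# Emitted on one line, roughly:
# SELECT COUNT(DISTINCT "amount") AS num_uniques,
#        MIN(CAST("amount" AS FLOAT)) AS min_value FROM "sales"."orders"
```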
{quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/repository/qdc.py

```diff
@@ -25,6 +25,7 @@ class QDCExternalAPIClient:
         Tried to find a package for oauth0 client credentials flow,
         but any of them contains bugs or lacks of features to handle the token refresh when it's expired
         """
+
         url = f"{self.base_url}/oauth2/token"
         creds = f"{self.client_id}:{self.client_secret}"
         encoded_creds = base64.b64encode(creds.encode()).decode()
```
{quollio_core-0.4.17 → quollio_core-0.5.0}/quollio_core/repository/snowflake.py

```diff
@@ -1,6 +1,6 @@
 import logging
 from dataclasses import asdict, dataclass
-from typing import Dict, List
+from typing import Dict, List, Tuple
 
 from snowflake.connector import DictCursor, connect, errors
 from snowflake.connector.connection import SnowflakeConnection
@@ -46,16 +46,13 @@ class SnowflakeQueryExecutor:
         )
         return conn
 
-    def get_query_results(self, query: str) -> List[Dict[str, str]]:
+    def get_query_results(self, query: str) -> Tuple[List[Dict[str, str]], Exception]:
         with self.conn.cursor(DictCursor) as cur:
             try:
                 cur.execute(query)
                 result: List[Dict[str, str]] = cur.fetchall()
-                return result
+                return (result, None)
             except errors.ProgrammingError as e:
-
-
-
-                    e.errno, e.sqlstate, e.msg, e.sfqid
-                )
-            )
+                return ([], e)
+            except Exception as e:
+                return ([], e)
```