quollio-core 0.5.0-py3-none-any.whl → 0.6.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- quollio_core/__init__.py +1 -1
- quollio_core/dbt_projects/snowflake/models/quollio_lineage_column_level.sql +17 -0
- quollio_core/dbt_projects/snowflake/models/quollio_lineage_table_level.sql +17 -0
- quollio_core/dbt_projects/snowflake/models/quollio_sqllineage_sources.sql +18 -0
- quollio_core/dbt_projects/snowflake/models/quollio_stats_profiling_columns.sql +18 -2
- quollio_core/dbt_projects/snowflake/profiles/profiles_template.yml +6 -1
- quollio_core/helper/core.py +1 -0
- quollio_core/models/avroasset.py +23 -0
- quollio_core/models/qdc.py +36 -0
- quollio_core/profilers/lineage.py +96 -0
- quollio_core/profilers/qdc.py +27 -0
- quollio_core/profilers/snowflake.py +113 -60
- quollio_core/profilers/stats.py +36 -0
- quollio_core/profilers/teradata/lineage.py +7 -3
- quollio_core/profilers/teradata/stats.py +9 -3
- quollio_core/repository/qdc.py +91 -0
- quollio_core/repository/snowflake.py +62 -13
- quollio_core/repository/teradata.py +19 -5
- quollio_core/snowflake.py +100 -18
- quollio_core/teradata.py +15 -1
- {quollio_core-0.5.0.dist-info → quollio_core-0.6.0.dist-info}/METADATA +5 -2
- {quollio_core-0.5.0.dist-info → quollio_core-0.6.0.dist-info}/RECORD +24 -21
- {quollio_core-0.5.0.dist-info → quollio_core-0.6.0.dist-info}/WHEEL +1 -1
- {quollio_core-0.5.0.dist-info → quollio_core-0.6.0.dist-info/licenses}/LICENSE +0 -0
quollio_core/__init__.py
CHANGED
quollio_core/dbt_projects/snowflake/models/quollio_lineage_column_level.sql
CHANGED
@@ -95,6 +95,23 @@ UNION
     {{ source('account_usage', 'TABLES') }}
 WHERE
     DELETED IS NULL
+    AND (
+        {% if var('target_databases_method') == 'ALLOWLIST' %}
+            {% if var('target_databases') %}
+                TABLE_CATALOG LIKE ANY ({{ var('target_databases')|join(",") }})
+            {% else %}
+                1=0 -- If no databases specified in allowlist, deny all
+            {% endif %}
+        {% elif var('target_databases_method') == 'DENYLIST' %}
+            {% if var('target_databases') %}
+                NOT (TABLE_CATALOG LIKE ANY ({{ var('target_databases')|join(",") }}))
+            {% else %}
+                1=1 -- If no databases specified in denylist, include all
+            {% endif %}
+        {% else %}
+            1=1 -- Default case: allow all databases
+        {% endif %}
+    )
 ), exists_upstream_column_lineage AS (
     SELECT
         downstream_table_name
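The same ALLOWLIST/DENYLIST guard is added to each of the Snowflake models below. This diff does not show how the new target_databases_method and target_databases dbt vars are supplied, but because the models splice {{ var('target_databases')|join(",") }} directly into LIKE ANY (...), each list element apparently needs to arrive pre-quoted. A hypothetical Python helper (not part of quollio-core) that builds a matching --vars JSON string could look like this:

import json
from typing import List


def build_dbt_vars(method: str, databases: List[str]) -> str:
    # Hypothetical helper: the models render the list verbatim inside
    # LIKE ANY (...), so each pattern is wrapped in single quotes here.
    return json.dumps(
        {
            "target_databases_method": method,  # 'ALLOWLIST', 'DENYLIST', or anything else to allow all
            "target_databases": ["'{p}'".format(p=p) for p in databases],
        }
    )


# build_dbt_vars("ALLOWLIST", ["ANALYTICS", "RAW_%"])
# -> {"target_databases_method": "ALLOWLIST", "target_databases": ["'ANALYTICS'", "'RAW_%'"]}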
quollio_core/dbt_projects/snowflake/models/quollio_lineage_table_level.sql
CHANGED
@@ -49,6 +49,23 @@ WITH table_lineage_history AS (
     {{ source('account_usage', 'TABLES') }}
 WHERE
     DELETED IS NULL
+    AND (
+        {% if var('target_databases_method') == 'ALLOWLIST' %}
+            {% if var('target_databases') %}
+                TABLE_CATALOG LIKE ANY ({{ var('target_databases')|join(",") }})
+            {% else %}
+                1=0 -- If no databases specified in allowlist, deny all
+            {% endif %}
+        {% elif var('target_databases_method') == 'DENYLIST' %}
+            {% if var('target_databases') %}
+                NOT (TABLE_CATALOG LIKE ANY ({{ var('target_databases')|join(",") }}))
+            {% else %}
+                1=1 -- If no databases specified in denylist, include all
+            {% endif %}
+        {% else %}
+            1=1 -- Default case: allow all databases
+        {% endif %}
+    )
 ), upstream_exists_table AS (
     SELECT
         downstream_table_name AS "DOWNSTREAM_TABLE_NAME"
quollio_core/dbt_projects/snowflake/models/quollio_sqllineage_sources.sql
CHANGED
@@ -48,3 +48,21 @@ on
     lst.query_id = qt.query_id
 where
     qt.query_id is not null
+    AND (
+        {% if var('target_databases_method') == 'ALLOWLIST' %}
+            {% if var('target_databases') %}
+                database_name LIKE ANY ({{ var('target_databases')|join(",") }})
+            {% else %}
+                1=0 -- If no databases specified in allowlist, deny all
+            {% endif %}
+        {% elif var('target_databases_method') == 'DENYLIST' %}
+            {% if var('target_databases') %}
+                NOT (database_name LIKE ANY ({{ var('target_databases')|join(",") }}))
+            {% else %}
+                1=1 -- If no databases specified in denylist, include all
+            {% endif %}
+        {% else %}
+            1=1 -- Default case: allow all databases
+        {% endif %}
+    )
+
quollio_core/dbt_projects/snowflake/models/quollio_stats_profiling_columns.sql
CHANGED
@@ -28,7 +28,7 @@ WITH columns AS (
 FROM
     {{ source('account_usage', 'GRANTS_TO_ROLES') }}
 WHERE
-    granted_on in ('TABLE', 'MATERIALIZED VIEW')
+    granted_on in ('TABLE', 'VIEW', 'MATERIALIZED VIEW')
     AND grantee_name = '{{ var("query_role") }}'
     AND privilege in ('SELECT', 'OWNERSHIP')
     AND deleted_on IS NULL
@@ -92,5 +92,21 @@ WITH columns AS (
     else false END AS is_calculable
 FROM
     implicit_columns_removed
-
+WHERE
+    {% if var('target_databases_method') == 'ALLOWLIST' %}
+        {% if var('target_databases') %}
+            TABLE_CATALOG LIKE ANY ({{ var('target_databases')|join(",") }})
+        {% else %}
+            1=0 -- If no databases specified in allowlist, deny all
+        {% endif %}
+    {% elif var('target_databases_method') == 'DENYLIST' %}
+        {% if var('target_databases') %}
+            NOT (TABLE_CATALOG LIKE ANY ({{ var('target_databases')|join(",") }}))
+        {% else %}
+            1=1 -- If no databases specified in denylist, include all
+        {% endif %}
+    {% else %}
+        1=1 -- Default case: allow all databases
+    {% endif %}
+    )
 select * from final
quollio_core/dbt_projects/snowflake/profiles/profiles_template.yml
CHANGED
@@ -8,6 +8,11 @@ quollio_intelligence_snowflake:
       schema: {{ account_schema }}
      type: snowflake
      user: {{ account_user }}
-      password: {{ account_password }}
      warehouse: {{ account_warehouse }}
      threads: {{ threads }}
+      {% if private_key is defined %}
+      private_key: |
+        {{ private_key | indent(8) }}
+      {% else %}
+      password: {{ account_password }}
+      {% endif %}
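The profiles template now prefers key-pair authentication when a private key is supplied and only falls back to a password otherwise. A standalone jinja2 sketch (not the package's own renderer; variable names mirror the template above) shows the branching:

from jinja2 import Template

# Minimal reproduction of the branch added to profiles_template.yml.
PROFILE_SNIPPET = """\
      user: {{ account_user }}
{% if private_key is defined %}
      private_key: |
        {{ private_key | indent(8) }}
{% else %}
      password: {{ account_password }}
{% endif %}
"""

tmpl = Template(PROFILE_SNIPPET)
# Key-pair auth: private_key is defined, so no password line is emitted.
print(tmpl.render(account_user="LOADER", private_key="-----BEGIN PRIVATE KEY-----\n..."))
# Password auth: private_key is undefined, so the password branch is rendered.
print(tmpl.render(account_user="LOADER", account_password="********"))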
quollio_core/helper/core.py
CHANGED
quollio_core/models/avroasset.py
ADDED
@@ -0,0 +1,23 @@
+from dataclasses import dataclass
+from typing import List, Optional
+
+from dataclasses_avroschema import AvroModel
+
+
+@dataclass
+class AvroAsset(AvroModel):
+    "AvroAsset"
+
+    id: str
+    object_type: str
+    parents: List[str]
+    name: str
+    stats_max: Optional[str] = None
+    stats_min: Optional[str] = None
+    stats_mean: Optional[str] = None
+    stats_median: Optional[str] = None
+    stats_mode: Optional[str] = None
+    stats_stddev: Optional[str] = None
+    stats_number_of_null: Optional[str] = None
+    stats_number_of_unique: Optional[str] = None
+    upstream: Optional[List[str]] = None
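AvroAsset is the single record type written to the Avro import file for tables, columns, lineage, and stats. A short usage sketch with illustrative values only:

from quollio_core.models.avroasset import AvroAsset

# One column asset as it would appear in the import file; the id would normally
# come from quollio_core.helper.core.new_global_id().
asset = AvroAsset(
    id="example-column-global-id",
    object_type="column",
    parents=["MY_DB", "MY_SCHEMA", "MY_TABLE"],
    name="MY_COLUMN",
    stats_max="100",
    stats_number_of_null="0",
)

record = asset.to_dict()                    # plain dict, ready for fastavro's writer
schema = AvroAsset.avro_schema_to_python()  # Avro schema derived from the dataclass fields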
quollio_core/models/qdc.py
ADDED
@@ -0,0 +1,36 @@
+from dataclasses import asdict, dataclass
+from typing import Dict
+
+
+@dataclass
+class GetImportURLRequest:
+    service_name: str
+    source_name: str
+    file_name: str
+    override_logical_name: str
+    update_mode: str
+
+    def as_dict(self) -> Dict[str, str]:
+        return asdict(self)
+
+
+@dataclass
+class DataSourceMetadataResponseBody:
+    user_id: str
+    job_key: str
+    service_name: str
+    source_name: str
+    source_type: str
+    override_logical_name: str
+
+    def as_dict(self) -> Dict[str, str]:
+        return asdict(self)
+
+
+@dataclass
+class GetImportURLResponse:
+    location: str
+    datasource_metadata_response_body: DataSourceMetadataResponseBody
+
+    def as_dict(self) -> Dict[str, str]:
+        return asdict(self)
quollio_core/profilers/lineage.py
CHANGED
@@ -3,6 +3,7 @@ from dataclasses import asdict, dataclass
 from typing import Dict, List, Tuple, Union
 
 from quollio_core.helper.core import new_global_id
+from quollio_core.models.avroasset import AvroAsset
 
 
 @dataclass
@@ -23,6 +24,101 @@ class LineageInputs:
     upstreams: LineageInput
 
 
+def gen_table_avro_lineage_payload(
+    tenant_id: str,
+    endpoint: str,
+    tables: List[Dict[str, Union[Dict[str, str], str]]],
+    existing_global_ids: Dict[str, bool],
+) -> List[Dict[str, str]]:
+    payload = list()
+    for table in tables:
+        downstream_table_fqn = table["DOWNSTREAM_TABLE_NAME"].split(".")
+        if len(downstream_table_fqn) != 3:
+            continue
+        else:
+            global_id_arg = "{db}{schema}{table}".format(
+                db=downstream_table_fqn[0], schema=downstream_table_fqn[1], table=downstream_table_fqn[2]
+            )
+            downstream_table_global_id = new_global_id(
+                tenant_id=tenant_id, cluster_id=endpoint, data_id=global_id_arg, data_type="table"
+            )
+        if existing_global_ids.get(downstream_table_global_id) is not True:
+            continue
+        upstreams = list()
+        for upstream_table in table["UPSTREAM_TABLES"]:
+            upstream_table_fqn = upstream_table["upstream_object_name"].split(".")
+            if len(upstream_table_fqn) != 3:
+                continue
+            else:
+                upstream_global_id_arg = "{db}{schema}{table}".format(
+                    db=upstream_table_fqn[0], schema=upstream_table_fqn[1], table=upstream_table_fqn[2]
+                )
+                upstream_table_global_id = new_global_id(
+                    tenant_id=tenant_id, cluster_id=endpoint, data_id=upstream_global_id_arg, data_type="table"
+                )
+                upstreams.append(upstream_table_global_id)
+
+        avro_assets = AvroAsset(
+            id=downstream_table_global_id,
+            object_type="table",
+            parents=[downstream_table_fqn[0], downstream_table_fqn[1]],
+            name=downstream_table_fqn[2],
+            upstream=upstreams,
+        )
+        payload.append(avro_assets.to_dict())
+    return payload
+
+
+def gen_column_avro_lineage_payload(
+    tenant_id: str, endpoint: str, columns: List[Dict[str, str]], existing_global_ids: Dict[str, bool]
+) -> List[Dict[str, str]]:
+    payload = list()
+    for column in columns:
+        downstream_table_fqn = column["DOWNSTREAM_TABLE_NAME"].split(".")
+        if len(downstream_table_fqn) != 3:
+            continue
+        else:
+            global_id_arg = "{db}{schema}{table}{column}".format(
+                db=downstream_table_fqn[0],
+                schema=downstream_table_fqn[1],
+                table=downstream_table_fqn[2],
+                column=column["DOWNSTREAM_COLUMN_NAME"],
+            )
+            downstream_column_global_id = new_global_id(
+                tenant_id=tenant_id, cluster_id=endpoint, data_id=global_id_arg, data_type="column"
+            )
+        if existing_global_ids.get(downstream_column_global_id) is not True:
+            continue
+        upstream_columns: List[Dict[str, str]] = json.loads(column["UPSTREAM_COLUMNS"])
+        upstreams = list()
+        for upstream_column in upstream_columns:
+            upstream_table_fqn = upstream_column["upstream_table_name"].split(".")
+            if len(upstream_table_fqn) != 3:
+                continue
+            elif not upstream_column.get("upstream_column_name"):
+                continue
+            else:
+                upstream_global_id_arg = "{db}{schema}{table}{column}".format(
+                    db=upstream_table_fqn[0],
+                    schema=upstream_table_fqn[1],
+                    table=upstream_table_fqn[2],
+                    column=upstream_column["upstream_column_name"],
+                )
+                upstream_column_global_id = new_global_id(
+                    tenant_id=tenant_id, cluster_id=endpoint, data_id=upstream_global_id_arg, data_type="column"
+                )
+                upstreams.append(upstream_column_global_id)
+        avro_assets = AvroAsset(
+            id=downstream_column_global_id,
+            object_type="column",
+            parents=[downstream_table_fqn[0], downstream_table_fqn[1], downstream_table_fqn[2]],
+            name=column["DOWNSTREAM_COLUMN_NAME"],
+            upstream=upstreams,
+        )
+        payload.append(avro_assets.to_dict())
+    return payload
+
+
 def gen_table_lineage_payload(
     tenant_id: str, endpoint: str, tables: List[Dict[str, Union[Dict[str, str], str]]]
 ) -> List[LineageInputs]:
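Both new generators emit AvroAsset dicts and silently drop any asset whose global id is not present in existing_global_ids (i.e. not yet registered in QDC). A call sketch with illustrative values; the dict keys mirror the parsed Snowflake lineage results:

from quollio_core.helper.core import new_global_id
from quollio_core.profilers.lineage import gen_table_avro_lineage_payload

tables = [
    {
        "DOWNSTREAM_TABLE_NAME": "MY_DB.MY_SCHEMA.ORDERS_SUMMARY",
        "UPSTREAM_TABLES": [{"upstream_object_name": "MY_DB.MY_SCHEMA.ORDERS"}],
    }
]

# The downstream asset is only emitted if its id is marked as existing.
downstream_id = new_global_id(
    tenant_id="tenant", cluster_id="my-account", data_id="MY_DBMY_SCHEMAORDERS_SUMMARY", data_type="table"
)

payload = gen_table_avro_lineage_payload(
    tenant_id="tenant",
    endpoint="my-account",
    tables=tables,
    existing_global_ids={downstream_id: True},
)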
quollio_core/profilers/qdc.py
ADDED
@@ -0,0 +1,27 @@
+import logging
+from io import BytesIO
+from typing import Dict
+
+import fastavro
+
+from quollio_core.helper.core import new_global_id
+from quollio_core.models.avroasset import AvroAsset
+from quollio_core.repository import qdc
+
+logger = logging.getLogger(__name__)
+
+
+def gen_existing_global_id_dict(avro_content: bytes) -> Dict[str, bool]:
+    byte_io = BytesIO(avro_content)
+    avro_schema = AvroAsset.avro_schema_to_python()
+    reader = fastavro.reader(byte_io, avro_schema)
+    records = {record["id"]: True for record in reader}
+    return records
+
+
+def get_avro_file_content(tenant_id: str, account_id: str, qdc_client: qdc.QDCExternalAPIClient) -> bytes:
+    datasource_id = new_global_id(tenant_id=tenant_id, cluster_id=account_id, data_id="", data_type="data_source")
+    logger.debug("Datasource id: {dsrc_id}".format(dsrc_id=datasource_id))
+    res = qdc_client.get_export_url(datasource_id=datasource_id)
+    file_content = qdc_client.download_file(res).content
+    return file_content
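These two helpers are meant to be chained: download the current Avro export for a data source, then index its asset ids so the payload generators above can skip anything QDC does not know about. A hypothetical wrapper (the function name is not part of the package) illustrates the flow:

from typing import Dict

from quollio_core.profilers.qdc import gen_existing_global_id_dict, get_avro_file_content
from quollio_core.repository import qdc


def load_existing_global_ids(tenant_id: str, account_id: str, qdc_client: qdc.QDCExternalAPIClient) -> Dict[str, bool]:
    # Fetch the latest export for this data source and map asset id -> True.
    avro_bytes = get_avro_file_content(tenant_id=tenant_id, account_id=account_id, qdc_client=qdc_client)
    return gen_existing_global_id_dict(avro_content=avro_bytes)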
quollio_core/profilers/snowflake.py
CHANGED
@@ -1,13 +1,20 @@
+import io
 import logging
-
+import os
+from typing import Dict, List
 
+from fastavro import writer
+
+from quollio_core.helper.core import new_global_id
+from quollio_core.models.avroasset import AvroAsset
+from quollio_core.models.qdc import GetImportURLRequest
 from quollio_core.profilers.lineage import (
-
-
+    gen_column_avro_lineage_payload,
+    gen_table_avro_lineage_payload,
     parse_snowflake_results,
 )
 from quollio_core.profilers.sqllineage import SQLLineage
-from quollio_core.profilers.stats import
+from quollio_core.profilers.stats import gen_table_stats_avro_payload, get_is_target_stats_items, render_sql_for_stats
 from quollio_core.repository import qdc, snowflake
 
 logger = logging.getLogger(__name__)
@@ -17,6 +24,7 @@ def snowflake_table_to_table_lineage(
     conn: snowflake.SnowflakeConnectionConfig,
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
+    existing_global_ids: Dict[str, bool],
 ) -> None:
     with snowflake.SnowflakeQueryExecutor(conn) as sf_executor:
         results, err = sf_executor.get_query_results(
@@ -38,28 +46,41 @@ def snowflake_table_to_table_lineage(
             )
             return
         parsed_results = parse_snowflake_results(results=results)
-        update_table_lineage_inputs =
+        update_table_lineage_inputs = gen_table_avro_lineage_payload(
             tenant_id=tenant_id,
             endpoint=conn.account_id,
             tables=parsed_results,
+            existing_global_ids=existing_global_ids,
+        )
+        stack_name = os.getenv("CF_STACK")
+        import_req = GetImportURLRequest(
+            service_name="snowflake",
+            source_name=stack_name,
+            file_name="{name}.avro".format(name=stack_name),
+            override_logical_name="false",
+            update_mode="partial",
+        )
+        datasource_id = new_global_id(
+            tenant_id=tenant_id, cluster_id=conn.account_id, data_id="", data_type="data_source"
         )
+        logger.debug("Datasource id: {dsrc_id}".format(dsrc_id=datasource_id))
+        import_res = qdc_client.get_import_url(datasource_id=datasource_id, payload=import_req)
+        if import_res is None:
+            logger.error("get_import_url failed. Please retry `load_lineage` again")
+            return
+        logger.debug("ImportResponse: {res}".format(res=import_res))
 
-
-
-
-
-
-
-
-
-
-
-
-            payload=update_table_lineage_input.upstreams.as_dict(),
-        )
-        if status_code == 200:
-            req_count += 1
-        logger.info(f"Generating table lineage is finished. {req_count} lineages are ingested.")
+        avro_schema = AvroAsset.avro_schema_to_python()
+
+        buffer = io.BytesIO()
+        writer(buffer, avro_schema, update_table_lineage_inputs)
+        res = qdc_client.upload_file(
+            url=import_res.location,
+            metadata=import_res.datasource_metadata_response_body,
+            buffer=buffer.getbuffer().tobytes(),
+        )
+        if res == 200:
+            logger.info("Upload table lineage is finished.")
         return
 
 
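The same import flow (request an import URL, serialize AvroAsset dicts with fastavro, upload the buffer) repeats below for column lineage and for table stats. A hypothetical consolidation, using only calls visible in this diff, makes the shared shape easier to read:

import io
import os
from typing import Dict, List

from fastavro import writer

from quollio_core.helper.core import new_global_id
from quollio_core.models.avroasset import AvroAsset
from quollio_core.models.qdc import GetImportURLRequest
from quollio_core.repository import qdc


def upload_avro_assets(
    tenant_id: str, account_id: str, qdc_client: qdc.QDCExternalAPIClient, records: List[Dict[str, str]]
) -> None:
    # Hypothetical helper, not part of quollio-core: bundles the three steps the
    # Snowflake profilers now perform after building their AvroAsset payloads.
    stack_name = os.getenv("CF_STACK")
    import_req = GetImportURLRequest(
        service_name="snowflake",
        source_name=stack_name,
        file_name="{name}.avro".format(name=stack_name),
        override_logical_name="false",
        update_mode="partial",
    )
    datasource_id = new_global_id(tenant_id=tenant_id, cluster_id=account_id, data_id="", data_type="data_source")
    import_res = qdc_client.get_import_url(datasource_id=datasource_id, payload=import_req)
    if import_res is None:
        return
    buffer = io.BytesIO()
    writer(buffer, AvroAsset.avro_schema_to_python(), records)
    qdc_client.upload_file(
        url=import_res.location,
        metadata=import_res.datasource_metadata_response_body,
        buffer=buffer.getbuffer().tobytes(),
    )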
@@ -67,6 +88,7 @@ def snowflake_column_to_column_lineage(
     conn: snowflake.SnowflakeConnectionConfig,
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
+    existing_global_ids: Dict[str, bool],
 ) -> None:
     with snowflake.SnowflakeQueryExecutor(conn) as sf_executor:
         results, err = sf_executor.get_query_results(
@@ -87,29 +109,39 @@ def snowflake_column_to_column_lineage(
 "No lineage data in ACCOUNT_USAGE.SNOWFLAKE. Please check the data in `QUOLLIO_LINEAGE_COLUMN_LEVEL`."
             )
             return
-        update_column_lineage_inputs =
-            tenant_id=tenant_id,
-            endpoint=conn.account_id,
-            columns=results,
+        update_column_lineage_inputs = gen_column_avro_lineage_payload(
+            tenant_id=tenant_id, endpoint=conn.account_id, columns=results, existing_global_ids=existing_global_ids
         )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        logger.
+        stack_name = os.getenv("CF_STACK")
+        import_req = GetImportURLRequest(
+            service_name="snowflake",
+            source_name=stack_name,
+            file_name="{name}.avro".format(name=stack_name),
+            override_logical_name="false",
+            update_mode="partial",
+        )
+        datasource_id = new_global_id(
+            tenant_id=tenant_id, cluster_id=conn.account_id, data_id="", data_type="data_source"
+        )
+        logger.debug("Datasource id: {dsrc_id}".format(dsrc_id=datasource_id))
+        import_res = qdc_client.get_import_url(datasource_id=datasource_id, payload=import_req)
+        if import_res is None:
+            logger.error("get_import_url failed. Please retry load_lineage again")
+            return
+        logger.debug("ImportResponse: {res}".format(res=import_res))
+
+        avro_schema = AvroAsset.avro_schema_to_python()
+
+        buffer = io.BytesIO()
+        writer(buffer, avro_schema, update_column_lineage_inputs)
+        res = qdc_client.upload_file(
+            url=import_res.location,
+            metadata=import_res.datasource_metadata_response_body,
+            buffer=buffer.getbuffer().tobytes(),
+        )
+        if res == 200:
+            logger.info("Upload column lineage is finished.")
         return
 
 
@@ -177,6 +209,7 @@ def snowflake_table_stats(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
     stats_items: List[str],
+    existing_global_ids: Dict[str, bool],
 ) -> None:
     with snowflake.SnowflakeQueryExecutor(conn) as sf_executor:
         get_stats_view_query = _gen_get_stats_views_query(
@@ -193,8 +226,8 @@ and fix it or grant usage permission to both `{conn.account_database}` and `{con
 and select permissions to views begins with `QUOLLIO_STATS_COLUMNS_`."
             )
             return
-        req_count = 0
         is_aggregate_items = get_is_target_stats_items(stats_items=stats_items)
+        update_stats_inputs = list()
         for stats_view in stats_views:
             table_fqn = '"{catalog}"."{schema}"."{table}"'.format(
                 catalog=stats_view["TABLE_CATALOG"], schema=stats_view["TABLE_SCHEMA"], table=stats_view["TABLE_NAME"]
@@ -210,23 +243,43 @@ and select permissions to views begins with `QUOLLIO_STATS_COLUMNS_`."
 or user has select permission to it."
                 )
                 continue
-            payloads =
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            payloads = gen_table_stats_avro_payload(
+                tenant_id=tenant_id,
+                endpoint=conn.account_id,
+                stats=stats_result,
+                existing_global_ids=existing_global_ids,
+            )
+            update_stats_inputs += payloads
+
+        stack_name = os.getenv("CF_STACK")
+        import_req = GetImportURLRequest(
+            service_name="snowflake",
+            source_name=stack_name,
+            file_name="{name}.avro".format(name=stack_name),
+            override_logical_name="false",
+            update_mode="partial",
+        )
+        datasource_id = new_global_id(
+            tenant_id=tenant_id, cluster_id=conn.account_id, data_id="", data_type="data_source"
+        )
+        logger.debug("Datasource id: {dsrc_id}".format(dsrc_id=datasource_id))
+        import_res = qdc_client.get_import_url(datasource_id=datasource_id, payload=import_req)
+        if import_res is None:
+            logger.error("get_import_url failed. Please retry load_stats again")
+            return
+        logger.debug("ImportResponse: {res}".format(res=import_res))
+
+        avro_schema = AvroAsset.avro_schema_to_python()
+
+        buffer = io.BytesIO()
+        writer(buffer, avro_schema, update_stats_inputs)
+        res = qdc_client.upload_file(
+            url=import_res.location,
+            metadata=import_res.datasource_metadata_response_body,
+            buffer=buffer.getbuffer().tobytes(),
+        )
+        if res == 200:
+            logger.info("Generating table stats is finished.")
         return
 
 
quollio_core/profilers/stats.py
CHANGED
@@ -6,6 +6,7 @@ from typing import Dict, List, Tuple, Union
 from jinja2 import Template
 
 from quollio_core.helper.core import new_global_id
+from quollio_core.models.avroasset import AvroAsset
 
 logger = logging.getLogger(__name__)
 
@@ -70,6 +71,41 @@ def convert_value_type(obj, cast_str: bool = False):
     return obj
 
 
+def gen_table_stats_avro_payload(
+    tenant_id: str, endpoint: str, stats: List[Dict[str, str]], existing_global_ids: Dict[str, bool]
+) -> List[Dict[str, str]]:
+    payloads = list()
+    for stat in stats:
+        db_name = stat.get("DB_NAME", stat.get("db_name"))
+        schema_name = stat.get("SCHEMA_NAME", stat.get("schema_name"))
+        table_name = stat.get("TABLE_NAME", stat.get("table_name"))
+        column_name = stat.get("COLUMN_NAME", stat.get("column_name"))
+        global_id_arg = "{db}{schema}{table}{column}".format(
+            db=db_name, schema=schema_name, table=table_name, column=column_name
+        )
+        column_global_id = new_global_id(
+            tenant_id=tenant_id, cluster_id=endpoint, data_id=global_id_arg, data_type="column"
+        )
+        if existing_global_ids.get(column_global_id) is not True:
+            continue
+        avro_assets = AvroAsset(
+            id=column_global_id,
+            object_type="column",
+            parents=[db_name, schema_name, table_name],
+            name=column_name,
+            stats_max=convert_value_type(stat.get("MAX_VALUE", stat.get("max_value")), True),
+            stats_min=convert_value_type(stat.get("MIN_VALUE", stat.get("min_value")), True),
+            stats_mean=convert_value_type(stat.get("AVG_VALUE", stat.get("avg_value")), True),
+            stats_median=convert_value_type(stat.get("MEDIAN_VALUE", stat.get("median_value")), True),
+            stats_mode=convert_value_type(stat.get("MODE_VALUE", stat.get("mode_value")), True),
+            stats_stddev=convert_value_type(stat.get("STDDEV_VALUE", stat.get("stddev_value")), True),
+            stats_number_of_null=convert_value_type(stat.get("NULL_COUNT", stat.get("null_count")), True),
+            stats_number_of_unique=convert_value_type(stat.get("CARDINALITY", stat.get("cardinality")), True),
+        )
+        payloads.append(avro_assets.to_dict())
+    return payloads
+
+
 def gen_table_stats_payload(tenant_id: str, endpoint: str, stats: List[Dict[str, str]]) -> List[StatsRequest]:
     payloads = list()
     for stat in stats:
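As with the lineage generators, columns whose global id is missing from existing_global_ids are skipped. The stat records accept either upper- or lower-case keys, which is why every field is read as stat.get("MAX_VALUE", stat.get("max_value")). An illustrative call:

from quollio_core.profilers.stats import gen_table_stats_avro_payload

stats = [
    {
        "DB_NAME": "MY_DB",
        "SCHEMA_NAME": "MY_SCHEMA",
        "TABLE_NAME": "ORDERS",
        "COLUMN_NAME": "AMOUNT",
        "MAX_VALUE": 120.5,
        "MIN_VALUE": 0,
        "NULL_COUNT": 3,
        "CARDINALITY": 42,
    }
]

# With an empty existing_global_ids dict every column is filtered out; in the real
# flow the dict comes from the QDC Avro export (see profilers/qdc.py above).
payloads = gen_table_stats_avro_payload(
    tenant_id="tenant", endpoint="my-account", stats=stats, existing_global_ids={}
)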
quollio_core/profilers/teradata/lineage.py
CHANGED
@@ -17,11 +17,15 @@ def load_lineage(
     tenant_id: str = None,
     qdc_client: qdc.QDCExternalAPIClient = None,
     page_size: int = None,
+    system_database: str = None,
 ) -> None:
     page_size = page_size or int(os.environ.get("TERADATA_PAGE_SIZE", 1000))
     offset = 0
     all_lineage_results = []
 
+    # Use system_database from config if not provided
+    system_database = system_database or conn_config.system_database
+
     with teradata_repo.new_teradata_client(conn_config) as conn:
         while True:
             query = f"""
@@ -30,10 +34,10 @@ def load_lineage(
                 TRIM(a.SqlTextInfo) AS SqlTextInfo,
                 a.SqlRowNo,
                 TRIM(d.DatabaseName) AS DefaultDatabase
-            FROM
-            JOIN
+            FROM {system_database}.QryLogSQLV a
+            JOIN {system_database}.QryLogV b
                 ON a.QueryID = b.QueryID
-            JOIN
+            JOIN {system_database}.DatabasesV d
                 ON b.DefaultDatabase = d.DatabaseName
             WHERE
                 UPPER(TRIM(SqlTextInfo)) LIKE 'CREATE TABLE%'
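A call sketch for the new parameter. The leading conn_config parameter is assumed from the function body (which reads conn_config.system_database); "DBC" is Teradata's conventional system database, and omitting the argument falls back to the connection config:

from quollio_core.profilers.teradata.lineage import load_lineage
from quollio_core.repository import qdc


def run_teradata_lineage(conn_config, qdc_client: qdc.QDCExternalAPIClient, tenant_id: str) -> None:
    # conn_config is whatever connection-config object the profiler already uses;
    # it must expose .system_database for the fallback to work.
    load_lineage(
        conn_config=conn_config,
        tenant_id=tenant_id,
        qdc_client=qdc_client,
        system_database="DBC",  # assumption: standard Teradata system database
    )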
|