quollio-core 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
quollio_core/__init__.py CHANGED
@@ -1,4 +1,4 @@
1
1
  """Quollio Core"""
2
2
 
3
- __version__ = "0.5.0"
3
+ __version__ = "0.6.0"
4
4
  __author__ = "Quollio Technologies, Inc"
@@ -95,6 +95,23 @@ UNION
95
95
  {{ source('account_usage', 'TABLES') }}
96
96
  WHERE
97
97
  DELETED IS NULL
98
+ AND (
99
+ {% if var('target_databases_method') == 'ALLOWLIST' %}
100
+ {% if var('target_databases') %}
101
+ TABLE_CATALOG LIKE ANY ({{ var('target_databases')|join(",") }})
102
+ {% else %}
103
+ 1=0 -- If no databases specified in allowlist, deny all
104
+ {% endif %}
105
+ {% elif var('target_databases_method') == 'DENYLIST' %}
106
+ {% if var('target_databases') %}
107
+ NOT (TABLE_CATALOG LIKE ANY ({{ var('target_databases')|join(",") }}))
108
+ {% else %}
109
+ 1=1 -- If no databases specified in denylist, include all
110
+ {% endif %}
111
+ {% else %}
112
+ 1=1 -- Default case: allow all databases
113
+ {% endif %}
114
+ )
98
115
  ), exists_upstream_column_lineage AS (
99
116
  SELECT
100
117
  downstream_table_name
@@ -49,6 +49,23 @@ WITH table_lineage_history AS (
49
49
  {{ source('account_usage', 'TABLES') }}
50
50
  WHERE
51
51
  DELETED IS NULL
52
+ AND (
53
+ {% if var('target_databases_method') == 'ALLOWLIST' %}
54
+ {% if var('target_databases') %}
55
+ TABLE_CATALOG LIKE ANY ({{ var('target_databases')|join(",") }})
56
+ {% else %}
57
+ 1=0 -- If no databases specified in allowlist, deny all
58
+ {% endif %}
59
+ {% elif var('target_databases_method') == 'DENYLIST' %}
60
+ {% if var('target_databases') %}
61
+ NOT (TABLE_CATALOG LIKE ANY ({{ var('target_databases')|join(",") }}))
62
+ {% else %}
63
+ 1=1 -- If no databases specified in denylist, include all
64
+ {% endif %}
65
+ {% else %}
66
+ 1=1 -- Default case: allow all databases
67
+ {% endif %}
68
+ )
52
69
  ), upstream_exists_table AS (
53
70
  SELECT
54
71
  downstream_table_name AS "DOWNSTREAM_TABLE_NAME"
@@ -48,3 +48,21 @@ on
48
48
  lst.query_id = qt.query_id
49
49
  where
50
50
  qt.query_id is not null
51
+ AND (
52
+ {% if var('target_databases_method') == 'ALLOWLIST' %}
53
+ {% if var('target_databases') %}
54
+ database_name LIKE ANY ({{ var('target_databases')|join(",") }})
55
+ {% else %}
56
+ 1=0 -- If no databases specified in allowlist, deny all
57
+ {% endif %}
58
+ {% elif var('target_databases_method') == 'DENYLIST' %}
59
+ {% if var('target_databases') %}
60
+ NOT (database_name LIKE ANY ({{ var('target_databases')|join(",") }}))
61
+ {% else %}
62
+ 1=1 -- If no databases specified in denylist, include all
63
+ {% endif %}
64
+ {% else %}
65
+ 1=1 -- Default case: allow all databases
66
+ {% endif %}
67
+ )
68
+
@@ -28,7 +28,7 @@ WITH columns AS (
28
28
  FROM
29
29
  {{ source('account_usage', 'GRANTS_TO_ROLES') }}
30
30
  WHERE
31
- granted_on in ('TABLE', 'MATERIALIZED VIEW')
31
+ granted_on in ('TABLE', 'VIEW', 'MATERIALIZED VIEW')
32
32
  AND grantee_name = '{{ var("query_role") }}'
33
33
  AND privilege in ('SELECT', 'OWNERSHIP')
34
34
  AND deleted_on IS NULL
@@ -92,5 +92,21 @@ WITH columns AS (
92
92
  else false END AS is_calculable
93
93
  FROM
94
94
  implicit_columns_removed
95
- )
95
+ WHERE
96
+ {% if var('target_databases_method') == 'ALLOWLIST' %}
97
+ {% if var('target_databases') %}
98
+ TABLE_CATALOG LIKE ANY ({{ var('target_databases')|join(",") }})
99
+ {% else %}
100
+ 1=0 -- If no databases specified in allowlist, deny all
101
+ {% endif %}
102
+ {% elif var('target_databases_method') == 'DENYLIST' %}
103
+ {% if var('target_databases') %}
104
+ NOT (TABLE_CATALOG LIKE ANY ({{ var('target_databases')|join(",") }}))
105
+ {% else %}
106
+ 1=1 -- If no databases specified in denylist, include all
107
+ {% endif %}
108
+ {% else %}
109
+ 1=1 -- Default case: allow all databases
110
+ {% endif %}
111
+ )
96
112
  select * from final
@@ -8,6 +8,11 @@ quollio_intelligence_snowflake:
8
8
  schema: {{ account_schema }}
9
9
  type: snowflake
10
10
  user: {{ account_user }}
11
- password: {{ account_password }}
12
11
  warehouse: {{ account_warehouse }}
13
12
  threads: {{ threads }}
13
+ {% if private_key is defined %}
14
+ private_key: |
15
+ {{ private_key | indent(8) }}
16
+ {% else %}
17
+ password: {{ account_password }}
18
+ {% endif %}
@@ -8,6 +8,7 @@ from jinja2 import Environment, FileSystemLoader
8
8
  def new_global_id(tenant_id: str, cluster_id: str, data_id: str, data_type: str) -> str:
9
9
  prefix = ""
10
10
  data_types = {
11
+ "data_source": "dsrc-",
11
12
  "schema": "schm-",
12
13
  "table": "tbl-",
13
14
  "column": "clmn-",
@@ -0,0 +1,23 @@
1
+ from dataclasses import dataclass
2
+ from typing import List, Optional
3
+
4
+ from dataclasses_avroschema import AvroModel
5
+
6
+
7
@dataclass
class AvroAsset(AvroModel):
    """AvroAsset"""

    # NOTE(review): the docstring text above is kept verbatim on purpose;
    # dataclasses_avroschema appears to use the class docstring as the
    # schema-level "doc" value, so rewording it would change the generated
    # schema -- confirm before editing it.
    #
    # Required fields come first; every stats/lineage field defaults to None
    # so a record can carry only the attributes it actually has.

    # Identifier of the asset this record describes.
    id: str
    # Kind of asset; the payload builders in this package pass "table" or
    # "column".
    object_type: str
    # Names of the enclosing objects, outermost first.
    parents: List[str]
    # Name of the asset itself.
    name: str

    # Column statistics, all carried as optional strings.
    stats_max: Optional[str] = None
    stats_min: Optional[str] = None
    stats_mean: Optional[str] = None
    stats_median: Optional[str] = None
    stats_mode: Optional[str] = None
    stats_stddev: Optional[str] = None
    stats_number_of_null: Optional[str] = None
    stats_number_of_unique: Optional[str] = None

    # Identifiers of upstream assets (lineage), when known.
    upstream: Optional[List[str]] = None
@@ -0,0 +1,36 @@
1
+ from dataclasses import asdict, dataclass
2
+ from typing import Dict
3
+
4
+
5
@dataclass
class GetImportURLRequest:
    """Fields sent when asking for an import URL.

    All five attributes are plain strings and are required.
    """

    service_name: str
    source_name: str
    file_name: str
    override_logical_name: str
    update_mode: str

    def as_dict(self) -> Dict[str, str]:
        """Return the request fields as a plain ``{field name: value}`` dict."""
        fields_by_name = asdict(self)
        return fields_by_name
15
+
16
+
17
@dataclass
class DataSourceMetadataResponseBody:
    """Data-source metadata block carried inside an import-URL response.

    All six attributes are plain strings and are required.
    """

    user_id: str
    job_key: str
    service_name: str
    source_name: str
    source_type: str
    override_logical_name: str

    def as_dict(self) -> Dict[str, str]:
        """Return the metadata fields as a plain ``{field name: value}`` dict."""
        fields_by_name = asdict(self)
        return fields_by_name
28
+
29
+
30
+ @dataclass
31
+ class GetImportURLResponse:
32
+ location: str
33
+ datasource_metadata_response_body: DataSourceMetadataResponseBody
34
+
35
+ def as_dict(self) -> Dict[str, str]:
36
+ return asdict(self)
@@ -3,6 +3,7 @@ from dataclasses import asdict, dataclass
3
3
  from typing import Dict, List, Tuple, Union
4
4
 
5
5
  from quollio_core.helper.core import new_global_id
6
+ from quollio_core.models.avroasset import AvroAsset
6
7
 
7
8
 
8
9
  @dataclass
@@ -23,6 +24,101 @@ class LineageInputs:
23
24
  upstreams: LineageInput
24
25
 
25
26
 
27
def gen_table_avro_lineage_payload(
    tenant_id: str,
    endpoint: str,
    tables: List[Dict[str, Union[Dict[str, str], str]]],
    existing_global_ids: Dict[str, bool],
) -> List[Dict[str, str]]:
    """Build table-level lineage records as ``AvroAsset`` dictionaries.

    Args:
        tenant_id: Passed to ``new_global_id`` as ``tenant_id``.
        endpoint: Passed to ``new_global_id`` as ``cluster_id``.
        tables: Rows with a dot-separated ``DOWNSTREAM_TABLE_NAME`` and an
            iterable ``UPSTREAM_TABLES`` whose items carry a dot-separated
            ``upstream_object_name``.
        existing_global_ids: Lookup of known global ids; a downstream table
            is emitted only when its id maps to ``True``.

    Returns:
        One ``AvroAsset.to_dict()`` result per accepted downstream table.
    """
    records = []
    for row in tables:
        downstream_parts = row["DOWNSTREAM_TABLE_NAME"].split(".")
        # Only names of the form <db>.<schema>.<table> are handled.
        if len(downstream_parts) != 3:
            continue

        downstream_id = new_global_id(
            tenant_id=tenant_id,
            cluster_id=endpoint,
            data_id="".join(downstream_parts),
            data_type="table",
        )
        # Skip tables whose id is not flagged True in the lookup.
        if existing_global_ids.get(downstream_id) is not True:
            continue

        upstream_ids = []
        for upstream in row["UPSTREAM_TABLES"]:
            upstream_parts = upstream["upstream_object_name"].split(".")
            if len(upstream_parts) != 3:
                continue
            upstream_ids.append(
                new_global_id(
                    tenant_id=tenant_id,
                    cluster_id=endpoint,
                    data_id="".join(upstream_parts),
                    data_type="table",
                )
            )

        asset = AvroAsset(
            id=downstream_id,
            object_type="table",
            parents=downstream_parts[:2],
            name=downstream_parts[2],
            upstream=upstream_ids,
        )
        records.append(asset.to_dict())
    return records
70
+
71
+
72
def gen_column_avro_lineage_payload(
    tenant_id: str, endpoint: str, columns: List[Dict[str, str]], existing_global_ids: Dict[str, bool]
) -> List[Dict[str, str]]:
    """Build column-level lineage records as ``AvroAsset`` dictionaries.

    Args:
        tenant_id: Passed to ``new_global_id`` as ``tenant_id``.
        endpoint: Passed to ``new_global_id`` as ``cluster_id``.
        columns: Rows with a dot-separated ``DOWNSTREAM_TABLE_NAME``, a
            ``DOWNSTREAM_COLUMN_NAME`` and a JSON-encoded ``UPSTREAM_COLUMNS``
            list whose items carry ``upstream_table_name`` and
            ``upstream_column_name``.
        existing_global_ids: Lookup of known global ids; a downstream column
            is emitted only when its id maps to ``True``.

    Returns:
        One ``AvroAsset.to_dict()`` result per accepted downstream column.
    """
    records = []
    for row in columns:
        table_parts = row["DOWNSTREAM_TABLE_NAME"].split(".")
        # Only names of the form <db>.<schema>.<table> are handled.
        if len(table_parts) != 3:
            continue

        column_name = row["DOWNSTREAM_COLUMN_NAME"]
        downstream_id = new_global_id(
            tenant_id=tenant_id,
            cluster_id=endpoint,
            data_id="{}{}{}{}".format(table_parts[0], table_parts[1], table_parts[2], column_name),
            data_type="column",
        )
        # Skip columns whose id is not flagged True in the lookup.
        if existing_global_ids.get(downstream_id) is not True:
            continue

        # The upstream list is decoded only for rows that passed the checks.
        upstream_ids = []
        for upstream in json.loads(row["UPSTREAM_COLUMNS"]):
            upstream_parts = upstream["upstream_table_name"].split(".")
            if len(upstream_parts) != 3:
                continue
            # Entries with a missing or empty column name are ignored.
            if not upstream.get("upstream_column_name"):
                continue
            upstream_ids.append(
                new_global_id(
                    tenant_id=tenant_id,
                    cluster_id=endpoint,
                    data_id="{}{}{}{}".format(
                        upstream_parts[0],
                        upstream_parts[1],
                        upstream_parts[2],
                        upstream["upstream_column_name"],
                    ),
                    data_type="column",
                )
            )

        asset = AvroAsset(
            id=downstream_id,
            object_type="column",
            parents=table_parts[:3],
            name=column_name,
            upstream=upstream_ids,
        )
        records.append(asset.to_dict())
    return records
120
+
121
+
26
122
  def gen_table_lineage_payload(
27
123
  tenant_id: str, endpoint: str, tables: List[Dict[str, Union[Dict[str, str], str]]]
28
124
  ) -> List[LineageInputs]:
@@ -0,0 +1,27 @@
1
+ import logging
2
+ from io import BytesIO
3
+ from typing import Dict
4
+
5
+ import fastavro
6
+
7
+ from quollio_core.helper.core import new_global_id
8
+ from quollio_core.models.avroasset import AvroAsset
9
+ from quollio_core.repository import qdc
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
def gen_existing_global_id_dict(avro_content: bytes) -> Dict[str, bool]:
    """Collect the ``id`` of every record in an Avro payload.

    Args:
        avro_content: Raw bytes of an Avro container file, read with the
            schema generated from ``AvroAsset``.

    Returns:
        A dict mapping each record's ``id`` to ``True``.
    """
    stream = BytesIO(avro_content)
    schema = AvroAsset.avro_schema_to_python()
    existing_ids: Dict[str, bool] = {}
    for record in fastavro.reader(stream, schema):
        existing_ids[record["id"]] = True
    return existing_ids
20
+
21
+
22
def get_avro_file_content(tenant_id: str, account_id: str, qdc_client: qdc.QDCExternalAPIClient) -> bytes:
    """Fetch the exported file for the data source of this tenant/account.

    The data-source id is derived with ``new_global_id`` (empty ``data_id``,
    ``data_type="data_source"``), handed to ``qdc_client.get_export_url``, and
    that result is passed to ``qdc_client.download_file``.

    Returns:
        The ``content`` attribute of the download result.
    """
    dsrc_global_id = new_global_id(
        tenant_id=tenant_id,
        cluster_id=account_id,
        data_id="",
        data_type="data_source",
    )
    logger.debug("Datasource id: {dsrc_id}".format(dsrc_id=dsrc_global_id))
    export_res = qdc_client.get_export_url(datasource_id=dsrc_global_id)
    return qdc_client.download_file(export_res).content
@@ -1,13 +1,20 @@
1
+ import io
1
2
  import logging
2
- from typing import List
3
+ import os
4
+ from typing import Dict, List
3
5
 
6
+ from fastavro import writer
7
+
8
+ from quollio_core.helper.core import new_global_id
9
+ from quollio_core.models.avroasset import AvroAsset
10
+ from quollio_core.models.qdc import GetImportURLRequest
4
11
  from quollio_core.profilers.lineage import (
5
- gen_column_lineage_payload,
6
- gen_table_lineage_payload,
12
+ gen_column_avro_lineage_payload,
13
+ gen_table_avro_lineage_payload,
7
14
  parse_snowflake_results,
8
15
  )
9
16
  from quollio_core.profilers.sqllineage import SQLLineage
10
- from quollio_core.profilers.stats import gen_table_stats_payload, get_is_target_stats_items, render_sql_for_stats
17
+ from quollio_core.profilers.stats import gen_table_stats_avro_payload, get_is_target_stats_items, render_sql_for_stats
11
18
  from quollio_core.repository import qdc, snowflake
12
19
 
13
20
  logger = logging.getLogger(__name__)
@@ -17,6 +24,7 @@ def snowflake_table_to_table_lineage(
17
24
  conn: snowflake.SnowflakeConnectionConfig,
18
25
  qdc_client: qdc.QDCExternalAPIClient,
19
26
  tenant_id: str,
27
+ existing_global_ids: Dict[str, bool],
20
28
  ) -> None:
21
29
  with snowflake.SnowflakeQueryExecutor(conn) as sf_executor:
22
30
  results, err = sf_executor.get_query_results(
@@ -38,28 +46,41 @@ def snowflake_table_to_table_lineage(
38
46
  )
39
47
  return
40
48
  parsed_results = parse_snowflake_results(results=results)
41
- update_table_lineage_inputs = gen_table_lineage_payload(
49
+ update_table_lineage_inputs = gen_table_avro_lineage_payload(
42
50
  tenant_id=tenant_id,
43
51
  endpoint=conn.account_id,
44
52
  tables=parsed_results,
53
+ existing_global_ids=existing_global_ids,
54
+ )
55
+ stack_name = os.getenv("CF_STACK")
56
+ import_req = GetImportURLRequest(
57
+ service_name="snowflake",
58
+ source_name=stack_name,
59
+ file_name="{name}.avro".format(name=stack_name),
60
+ override_logical_name="false",
61
+ update_mode="partial",
62
+ )
63
+ datasource_id = new_global_id(
64
+ tenant_id=tenant_id, cluster_id=conn.account_id, data_id="", data_type="data_source"
45
65
  )
66
+ logger.debug("Datasource id: {dsrc_id}".format(dsrc_id=datasource_id))
67
+ import_res = qdc_client.get_import_url(datasource_id=datasource_id, payload=import_req)
68
+ if import_res is None:
69
+ logger.error("get_import_url failed. Please retry `load_lineage` again")
70
+ return
71
+ logger.debug("ImportResponse: {res}".format(res=import_res))
46
72
 
47
- req_count = 0
48
- for update_table_lineage_input in update_table_lineage_inputs:
49
- logger.info(
50
- "Generating table lineage. downstream: {db} -> {schema} -> {table}".format(
51
- db=update_table_lineage_input.downstream_database_name,
52
- schema=update_table_lineage_input.downstream_schema_name,
53
- table=update_table_lineage_input.downstream_table_name,
54
- )
55
- )
56
- status_code = qdc_client.update_lineage_by_id(
57
- global_id=update_table_lineage_input.downstream_global_id,
58
- payload=update_table_lineage_input.upstreams.as_dict(),
59
- )
60
- if status_code == 200:
61
- req_count += 1
62
- logger.info(f"Generating table lineage is finished. {req_count} lineages are ingested.")
73
+ avro_schema = AvroAsset.avro_schema_to_python()
74
+
75
+ buffer = io.BytesIO()
76
+ writer(buffer, avro_schema, update_table_lineage_inputs)
77
+ res = qdc_client.upload_file(
78
+ url=import_res.location,
79
+ metadata=import_res.datasource_metadata_response_body,
80
+ buffer=buffer.getbuffer().tobytes(),
81
+ )
82
+ if res == 200:
83
+ logger.info("Upload table lineage is finished.")
63
84
  return
64
85
 
65
86
 
@@ -67,6 +88,7 @@ def snowflake_column_to_column_lineage(
67
88
  conn: snowflake.SnowflakeConnectionConfig,
68
89
  qdc_client: qdc.QDCExternalAPIClient,
69
90
  tenant_id: str,
91
+ existing_global_ids: Dict[str, bool],
70
92
  ) -> None:
71
93
  with snowflake.SnowflakeQueryExecutor(conn) as sf_executor:
72
94
  results, err = sf_executor.get_query_results(
@@ -87,29 +109,39 @@ def snowflake_column_to_column_lineage(
87
109
  "No lineage data in ACCOUNT_USAGE.SNOWFLAKE. Please check the data in `QUOLLIO_LINEAGE_COLUMN_LEVEL`."
88
110
  )
89
111
  return
90
- update_column_lineage_inputs = gen_column_lineage_payload(
91
- tenant_id=tenant_id,
92
- endpoint=conn.account_id,
93
- columns=results,
112
+ update_column_lineage_inputs = gen_column_avro_lineage_payload(
113
+ tenant_id=tenant_id, endpoint=conn.account_id, columns=results, existing_global_ids=existing_global_ids
94
114
  )
95
115
 
96
- req_count = 0
97
- for update_column_lineage_input in update_column_lineage_inputs:
98
- logger.info(
99
- "Generating column lineage. downstream: {db} -> {schema} -> {table} -> {column}".format(
100
- db=update_column_lineage_input.downstream_database_name,
101
- schema=update_column_lineage_input.downstream_schema_name,
102
- table=update_column_lineage_input.downstream_table_name,
103
- column=update_column_lineage_input.downstream_column_name,
104
- )
105
- )
106
- status_code = qdc_client.update_lineage_by_id(
107
- global_id=update_column_lineage_input.downstream_global_id,
108
- payload=update_column_lineage_input.upstreams.as_dict(),
109
- )
110
- if status_code == 200:
111
- req_count += 1
112
- logger.info(f"Generating column lineage is finished. {req_count} lineages are ingested.")
116
+ stack_name = os.getenv("CF_STACK")
117
+ import_req = GetImportURLRequest(
118
+ service_name="snowflake",
119
+ source_name=stack_name,
120
+ file_name="{name}.avro".format(name=stack_name),
121
+ override_logical_name="false",
122
+ update_mode="partial",
123
+ )
124
+ datasource_id = new_global_id(
125
+ tenant_id=tenant_id, cluster_id=conn.account_id, data_id="", data_type="data_source"
126
+ )
127
+ logger.debug("Datasource id: {dsrc_id}".format(dsrc_id=datasource_id))
128
+ import_res = qdc_client.get_import_url(datasource_id=datasource_id, payload=import_req)
129
+ if import_res is None:
130
+ logger.error("get_import_url failed. Please retry load_lineage again")
131
+ return
132
+ logger.debug("ImportResponse: {res}".format(res=import_res))
133
+
134
+ avro_schema = AvroAsset.avro_schema_to_python()
135
+
136
+ buffer = io.BytesIO()
137
+ writer(buffer, avro_schema, update_column_lineage_inputs)
138
+ res = qdc_client.upload_file(
139
+ url=import_res.location,
140
+ metadata=import_res.datasource_metadata_response_body,
141
+ buffer=buffer.getbuffer().tobytes(),
142
+ )
143
+ if res == 200:
144
+ logger.info("Upload column lineage is finished.")
113
145
  return
114
146
 
115
147
 
@@ -177,6 +209,7 @@ def snowflake_table_stats(
177
209
  qdc_client: qdc.QDCExternalAPIClient,
178
210
  tenant_id: str,
179
211
  stats_items: List[str],
212
+ existing_global_ids: Dict[str, bool],
180
213
  ) -> None:
181
214
  with snowflake.SnowflakeQueryExecutor(conn) as sf_executor:
182
215
  get_stats_view_query = _gen_get_stats_views_query(
@@ -193,8 +226,8 @@ and fix it or grant usage permission to both `{conn.account_database}` and `{con
193
226
  and select permissions to views begins with `QUOLLIO_STATS_COLUMNS_`."
194
227
  )
195
228
  return
196
- req_count = 0
197
229
  is_aggregate_items = get_is_target_stats_items(stats_items=stats_items)
230
+ update_stats_inputs = list()
198
231
  for stats_view in stats_views:
199
232
  table_fqn = '"{catalog}"."{schema}"."{table}"'.format(
200
233
  catalog=stats_view["TABLE_CATALOG"], schema=stats_view["TABLE_SCHEMA"], table=stats_view["TABLE_NAME"]
@@ -210,23 +243,43 @@ and select permissions to views begins with `QUOLLIO_STATS_COLUMNS_`."
210
243
  or user has select permission to it."
211
244
  )
212
245
  continue
213
- payloads = gen_table_stats_payload(tenant_id=tenant_id, endpoint=conn.account_id, stats=stats_result)
214
- for payload in payloads:
215
- logger.info(
216
- "Generating table stats. asset: {db} -> {schema} -> {table} -> {column}".format(
217
- db=payload.db,
218
- schema=payload.schema,
219
- table=payload.table,
220
- column=payload.column,
221
- )
222
- )
223
- status_code = qdc_client.update_stats_by_id(
224
- global_id=payload.global_id,
225
- payload=payload.body.get_column_stats(),
226
- )
227
- if status_code == 200:
228
- req_count += 1
229
- logger.info(f"Generating table stats is finished. {req_count} stats are ingested.")
246
+ payloads = gen_table_stats_avro_payload(
247
+ tenant_id=tenant_id,
248
+ endpoint=conn.account_id,
249
+ stats=stats_result,
250
+ existing_global_ids=existing_global_ids,
251
+ )
252
+ update_stats_inputs += payloads
253
+
254
+ stack_name = os.getenv("CF_STACK")
255
+ import_req = GetImportURLRequest(
256
+ service_name="snowflake",
257
+ source_name=stack_name,
258
+ file_name="{name}.avro".format(name=stack_name),
259
+ override_logical_name="false",
260
+ update_mode="partial",
261
+ )
262
+ datasource_id = new_global_id(
263
+ tenant_id=tenant_id, cluster_id=conn.account_id, data_id="", data_type="data_source"
264
+ )
265
+ logger.debug("Datasource id: {dsrc_id}".format(dsrc_id=datasource_id))
266
+ import_res = qdc_client.get_import_url(datasource_id=datasource_id, payload=import_req)
267
+ if import_res is None:
268
+ logger.error("get_import_url failed. Please retry load_stats again")
269
+ return
270
+ logger.debug("ImportResponse: {res}".format(res=import_res))
271
+
272
+ avro_schema = AvroAsset.avro_schema_to_python()
273
+
274
+ buffer = io.BytesIO()
275
+ writer(buffer, avro_schema, update_stats_inputs)
276
+ res = qdc_client.upload_file(
277
+ url=import_res.location,
278
+ metadata=import_res.datasource_metadata_response_body,
279
+ buffer=buffer.getbuffer().tobytes(),
280
+ )
281
+ if res == 200:
282
+ logger.info("Generating table stats is finished.")
230
283
  return
231
284
 
232
285
 
@@ -6,6 +6,7 @@ from typing import Dict, List, Tuple, Union
6
6
  from jinja2 import Template
7
7
 
8
8
  from quollio_core.helper.core import new_global_id
9
+ from quollio_core.models.avroasset import AvroAsset
9
10
 
10
11
  logger = logging.getLogger(__name__)
11
12
 
@@ -70,6 +71,41 @@ def convert_value_type(obj, cast_str: bool = False):
70
71
  return obj
71
72
 
72
73
 
74
def gen_table_stats_avro_payload(
    tenant_id: str, endpoint: str, stats: List[Dict[str, str]], existing_global_ids: Dict[str, bool]
) -> List[Dict[str, str]]:
    """Build column-statistics records as ``AvroAsset`` dictionaries.

    Args:
        tenant_id: Passed to ``new_global_id`` as ``tenant_id``.
        endpoint: Passed to ``new_global_id`` as ``cluster_id``.
        stats: Rows keyed by upper-case names (``DB_NAME``, ``MAX_VALUE``, ...)
            or their lower-case equivalents.
        existing_global_ids: Lookup of known global ids; a column is emitted
            only when its id maps to ``True``.

    Returns:
        One ``AvroAsset.to_dict()`` result per accepted column.
    """

    def lookup(row, upper_key):
        # Upper-case key wins; the lower-case spelling is the fallback.
        return row.get(upper_key, row.get(upper_key.lower()))

    # AvroAsset field name -> source key in the stats row.
    stat_sources = (
        ("stats_max", "MAX_VALUE"),
        ("stats_min", "MIN_VALUE"),
        ("stats_mean", "AVG_VALUE"),
        ("stats_median", "MEDIAN_VALUE"),
        ("stats_mode", "MODE_VALUE"),
        ("stats_stddev", "STDDEV_VALUE"),
        ("stats_number_of_null", "NULL_COUNT"),
        ("stats_number_of_unique", "CARDINALITY"),
    )

    records = []
    for row in stats:
        db_name = lookup(row, "DB_NAME")
        schema_name = lookup(row, "SCHEMA_NAME")
        table_name = lookup(row, "TABLE_NAME")
        column_name = lookup(row, "COLUMN_NAME")

        column_id = new_global_id(
            tenant_id=tenant_id,
            cluster_id=endpoint,
            data_id="{db}{schema}{table}{column}".format(
                db=db_name, schema=schema_name, table=table_name, column=column_name
            ),
            data_type="column",
        )
        # Skip columns whose id is not flagged True in the lookup.
        if existing_global_ids.get(column_id) is not True:
            continue

        stat_values = {
            field_name: convert_value_type(lookup(row, source_key), True)
            for field_name, source_key in stat_sources
        }
        asset = AvroAsset(
            id=column_id,
            object_type="column",
            parents=[db_name, schema_name, table_name],
            name=column_name,
            **stat_values,
        )
        records.append(asset.to_dict())
    return records
107
+
108
+
73
109
  def gen_table_stats_payload(tenant_id: str, endpoint: str, stats: List[Dict[str, str]]) -> List[StatsRequest]:
74
110
  payloads = list()
75
111
  for stat in stats:
@@ -17,11 +17,15 @@ def load_lineage(
17
17
  tenant_id: str = None,
18
18
  qdc_client: qdc.QDCExternalAPIClient = None,
19
19
  page_size: int = None,
20
+ system_database: str = None,
20
21
  ) -> None:
21
22
  page_size = page_size or int(os.environ.get("TERADATA_PAGE_SIZE", 1000))
22
23
  offset = 0
23
24
  all_lineage_results = []
24
25
 
26
+ # Use system_database from config if not provided
27
+ system_database = system_database or conn_config.system_database
28
+
25
29
  with teradata_repo.new_teradata_client(conn_config) as conn:
26
30
  while True:
27
31
  query = f"""
@@ -30,10 +34,10 @@ def load_lineage(
30
34
  TRIM(a.SqlTextInfo) AS SqlTextInfo,
31
35
  a.SqlRowNo,
32
36
  TRIM(d.DatabaseName) AS DefaultDatabase
33
- FROM DBC.QryLogSQLV a
34
- JOIN DBC.QryLogV b
37
+ FROM {system_database}.QryLogSQLV a
38
+ JOIN {system_database}.QryLogV b
35
39
  ON a.QueryID = b.QueryID
36
- JOIN DBC.DatabasesV d
40
+ JOIN {system_database}.DatabasesV d
37
41
  ON b.DefaultDatabase = d.DatabaseName
38
42
  WHERE
39
43
  UPPER(TRIM(SqlTextInfo)) LIKE 'CREATE TABLE%'