quollio-core 0.6.4__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
quollio_core/__init__.py CHANGED
@@ -1,4 +1,4 @@
 """Quollio Core"""
 
-__version__ = "0.6.4"
+__version__ = "0.7.0"
 __author__ = "Quollio Technologies, Inc"
quollio_core/bigquery.py CHANGED
@@ -6,6 +6,7 @@ from google.auth.credentials import Credentials
 from quollio_core.helper.env_default import env_default
 from quollio_core.helper.log_utils import configure_logging, error_handling_decorator, logger
 from quollio_core.profilers.bigquery import bigquery_table_lineage, bigquery_table_stats
+from quollio_core.profilers.qdc import gen_existing_global_id_dict, get_avro_file_content
 from quollio_core.repository import qdc
 from quollio_core.repository.bigquery import BigQueryClient, get_credentials, get_org_id
 
@@ -30,8 +31,15 @@ def load_lineage(
     org_id: str,
     credentials: Credentials,
     qdc_client: qdc.QDCExternalAPIClient,
+    enable_multi_projects: str,
 ) -> None:
     logger.info("Loading lineage data.")
+    file_content = get_avro_file_content(
+        tenant_id=tenant_id,
+        account_id=org_id,
+        qdc_client=qdc_client,
+    )
+    existing_global_ids = gen_existing_global_id_dict(avro_content=file_content)
     bigquery_table_lineage(
         qdc_client=qdc_client,
         tenant_id=tenant_id,
@@ -39,6 +47,8 @@ def load_lineage(
         regions=regions,
         credentials=credentials,
         org_id=org_id,
+        existing_global_ids=existing_global_ids,
+        enable_multi_projects=enable_multi_projects,
     )
     logger.info("Lineage data loaded successfully.")
 
@@ -146,6 +156,17 @@ if __name__ == "__main__":
         help="Comma-separated list of dataplex stats tables - <project_id>.<dataset_id>.<table_id>",
     )
 
+    parser.add_argument(
+        "--enable_multi_projects",
+        type=str,
+        choices=["ENABLED", "DISABLED"],
+        action=env_default("ENABLE_MULTI_PROJECTS"),
+        default="DISABLED",
+        required=False,
+        help="Whether to enable multi-project support. If set to 'ENABLED', \
+        the script will load lineage and stats from all projects accessible by the credentials. Default is 'DISABLED'.",
+    )
+
     args = parser.parse_args()
 
     # Validate that dataplex_stats_tables is provided if load_stats is in commands
@@ -170,6 +191,7 @@ if __name__ == "__main__":
             org_id=org_id,
             credentials=credentials,
             qdc_client=qdc_client,
+            enable_multi_projects=args.enable_multi_projects,
         )
 
     if "load_stats" in args.commands:
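The new flag reads its value from the ENABLE_MULTI_PROJECTS environment variable when the CLI option is omitted. A minimal sketch of how such an env-backed argparse action behaves; this is an assumed reimplementation of quollio_core.helper.env_default.env_default, not the packaged code:

```python
# Assumed sketch of an env-backed argparse action (the packaged
# env_default helper may differ in detail).
import argparse
import os


def env_default(envvar: str):
    class EnvDefault(argparse.Action):
        def __init__(self, option_strings, dest, default=None, **kwargs):
            # The environment variable, when set, overrides the static default.
            default = os.environ.get(envvar, default)
            super().__init__(option_strings, dest, default=default, **kwargs)

        def __call__(self, parser, namespace, values, option_string=None):
            setattr(namespace, self.dest, values)

    return EnvDefault


parser = argparse.ArgumentParser()
parser.add_argument(
    "--enable_multi_projects",
    type=str,
    choices=["ENABLED", "DISABLED"],
    action=env_default("ENABLE_MULTI_PROJECTS"),
    default="DISABLED",
    required=False,
)
# Resolution order: CLI flag > ENABLE_MULTI_PROJECTS env var > "DISABLED".
print(parser.parse_args([]).enable_multi_projects)
```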
quollio_core/dbt_projects/snowflake/macros/materialization/divided_view.sql CHANGED
@@ -2,9 +2,10 @@
 {%- set identifier = model['alias'] %}
 {%- set target_relations = [] %}
 {%- set grant_config = config.get('grants') %}
+{%- set max_columns_per_view = config.get('max_columns_per_view', 100) %}
 
 {{ run_hooks(pre_hooks, inside_transaction=False) }}
--- `BEGIN` happens here:
+-- BEGIN happens here:
 {{ run_hooks(pre_hooks, inside_transaction=True) }}
 
 -- fetch target_tables
@@ -41,45 +42,60 @@
 
 -- create view for each table
 {%- for stats_target_table in stats_target_tables -%}
-  -- build sql for column value aggregation.
-  {%- set sql_for_column_stats %}
-    {% set columns_json = fromjson(stats_target_table[3]) %}
-    {%- for col_name, attr in columns_json.items() -%}
-    {%- if not loop.first %}UNION{% endif %}
-    SELECT
-      DISTINCT
-      '{{stats_target_table[0]}}' as db_name
-      , '{{stats_target_table[1]}}' as schema_name
-      , '{{stats_target_table[2]}}' as table_name
-      , '{{col_name}}' as column_name
-      , {% if attr["IS_CALCULABLE"] == True %}CAST(MAX("{{col_name}}") AS STRING){% else %}NULL{% endif %} AS max_value
-      , {% if attr["IS_CALCULABLE"] == True %}CAST(MIN("{{col_name}}") AS STRING){% else %}NULL{% endif %} AS min_value
-      , COUNT_IF("{{col_name}}" IS NULL) AS null_count
-      , {% if attr["CAN_APPROX_COUNT"] == True %}APPROX_COUNT_DISTINCT("{{col_name}}"){% else %}NULL{% endif %} AS cardinality
-      , {% if attr["IS_CALCULABLE"] == True %}AVG("{{col_name}}"){% else %}NULL{% endif %} AS avg_value
-      , {% if attr["IS_CALCULABLE"] == True %}MEDIAN("{{col_name}}"){% else %}NULL{% endif %} AS median_value
-      , {% if attr["IS_CALCULABLE"] == True %}APPROX_TOP_K("{{col_name}}")[0][0]{% else %}NULL{% endif %} AS mode_value
-      , {% if attr["IS_CALCULABLE"] == True %}STDDEV("{{col_name}}"){% else %}NULL{% endif %} AS stddev_value
-    FROM "{{stats_target_table[0]}}"."{{stats_target_table[1]}}"."{{stats_target_table[2]}}" {{ var("sample_method") }}
-    {% endfor -%}
-  {%- endset %}
+  {%- set columns_json = fromjson(stats_target_table[3]) %}
+  {%- set column_list = columns_json.keys() | list %}
+  {%- set chunk_count = ((column_list | length) / max_columns_per_view) | round(0, 'ceil') | int %}
+
+  {%- for chunk_index in range(chunk_count) %}
+    {%- set start_idx = chunk_index * max_columns_per_view %}
+    {%- set end_idx = start_idx + max_columns_per_view %}
+    {%- set chunk_columns = column_list[start_idx:end_idx] %}
+
+    -- build sql for column value aggregation.
+    {%- set sql_for_column_stats %}
+      {%- for col_name in chunk_columns -%}
+      {%- set attr = columns_json[col_name] %}
+      {%- if not loop.first %}UNION{% endif %}
+      SELECT
+        DISTINCT
+        '{{stats_target_table[0]}}' as db_name
+        , '{{stats_target_table[1]}}' as schema_name
+        , '{{stats_target_table[2]}}' as table_name
+        , '{{col_name}}' as column_name
+        , {% if attr["IS_CALCULABLE"] == True %}CAST(MAX("{{col_name}}") AS STRING){% else %}NULL{% endif %} AS max_value
+        , {% if attr["IS_CALCULABLE"] == True %}CAST(MIN("{{col_name}}") AS STRING){% else %}NULL{% endif %} AS min_value
+        , COUNT_IF("{{col_name}}" IS NULL) AS null_count
+        , {% if attr["CAN_APPROX_COUNT"] == True %}APPROX_COUNT_DISTINCT("{{col_name}}"){% else %}NULL{% endif %} AS cardinality
+        , {% if attr["IS_CALCULABLE"] == True %}AVG("{{col_name}}"){% else %}NULL{% endif %} AS avg_value
+        , {% if attr["IS_CALCULABLE"] == True %}MEDIAN("{{col_name}}"){% else %}NULL{% endif %} AS median_value
+        , {% if attr["IS_CALCULABLE"] == True %}APPROX_TOP_K("{{col_name}}")[0][0]{% else %}NULL{% endif %} AS mode_value
+        , {% if attr["IS_CALCULABLE"] == True %}STDDEV("{{col_name}}"){% else %}NULL{% endif %} AS stddev_value
+      FROM "{{stats_target_table[0]}}"."{{stats_target_table[1]}}"."{{stats_target_table[2]}}" {{ var("sample_method") }}
+      {% endfor -%}
+    {%- endset %}
 
-  -- create a view with a index as suffix
-  {%- set stats_view_identifier = "\"%s_%s_%s_%s\"" | format(model['name'], stats_target_table[0], stats_target_table[1], stats_target_table[2]) | upper %}
-  {%- set schema_name = "\"%s\""|format(schema) %}
-  {%- set db_name = "\"%s\""|format(database) %}
-  {%- set target_relation = api.Relation.create(identifier=stats_view_identifier, schema=schema_name, database=db_name, type='view') %}
-  {% call statement("main") %}
-    {{ get_create_view_as_sql(target_relation, sql_for_column_stats) }}
-  {% endcall %}
-  {%- set full_refresh_mode = (should_full_refresh()) -%}
-  {%- set should_revoke = should_revoke(target_relation, full_refresh_mode) %}
-  {%- do apply_grants(target_relation, grant_config, should_revoke) %}
-  {%- set target_relations = target_relations.append(target_relation) %}
+    -- create a view with an index as suffix and chunk indicator
+    {%- set chunk_suffix = "" if chunk_count == 1 else "_PART" ~ (chunk_index + 1) %}
+    {%- set stats_view_identifier = "\"%s_%s_%s_%s%s\"" | format(model['name'], stats_target_table[0], stats_target_table[1], stats_target_table[2], chunk_suffix) | upper %}
+    {%- set schema_name = "\"%s\""|format(schema) %}
+    {%- set db_name = "\"%s\""|format(database) %}
+    {%- set target_relation = api.Relation.create(identifier=stats_view_identifier, schema=schema_name, database=db_name, type='view') %}
+
+    {{ log("Creating view " ~ stats_view_identifier ~ " with " ~ chunk_columns | length ~ " columns (chunk " ~ (chunk_index + 1) ~ " of " ~ chunk_count ~ ")", info=True) }}
+
+    {% call statement("main") %}
+      {{ get_create_view_as_sql(target_relation, sql_for_column_stats) }}
+    {% endcall %}
+
+    {%- set full_refresh_mode = (should_full_refresh()) -%}
+    {%- set should_revoke = should_revoke(target_relation, full_refresh_mode) %}
+    {%- do apply_grants(target_relation, grant_config, should_revoke) %}
+    {%- set target_relations = target_relations.append(target_relation) %}
+  {%- endfor %}
 {%- endfor -%}
 
 {{ run_hooks(post_hooks, inside_transaction=True) }}
--- `COMMIT` happens here:
+-- COMMIT happens here:
 {{ adapter.commit() }}
 {{ run_hooks(post_hooks, inside_transaction=False) }}
 
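The chunk arithmetic in the macro (ceil of column count over max_columns_per_view, then slicing) is easier to verify outside Jinja. A small Python sketch of the same rule, with hypothetical column names:

```python
# Python equivalent of the macro's chunking rule (illustrative only).
import math


def chunk_columns(column_list, max_columns_per_view=100):
    """Split columns into ceil(n / max) slices, as the Jinja loop does."""
    chunk_count = math.ceil(len(column_list) / max_columns_per_view)
    return [
        column_list[i * max_columns_per_view:(i + 1) * max_columns_per_view]
        for i in range(chunk_count)
    ]


# 250 hypothetical columns with the default limit yield three views,
# suffixed _PART1.._PART3; a single chunk gets no suffix at all.
chunks = chunk_columns([f"col_{i}" for i in range(250)])
assert [len(c) for c in chunks] == [100, 100, 50]
```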
quollio_core/profilers/bigquery.py CHANGED
@@ -1,15 +1,25 @@
+import io
+import os
 from typing import Dict, List
 
+from fastavro import writer
 from google.auth.credentials import Credentials
 
+from quollio_core.helper.core import new_global_id
 from quollio_core.helper.log_utils import error_handling_decorator, logger
-from quollio_core.profilers.lineage import gen_table_lineage_payload, parse_bigquery_table_lineage
+from quollio_core.models.avroasset import AvroAsset
+from quollio_core.models.qdc import GetImportURLRequest
+from quollio_core.profilers.lineage import (
+    gen_table_avro_lineage_payload,
+    gen_table_lineage_payload,
+    parse_bigquery_table_lineage,
+)
 from quollio_core.profilers.stats import gen_table_stats_payload
 from quollio_core.repository import qdc
 from quollio_core.repository.bigquery import BigQueryClient, GCPLineageClient, get_entitiy_reference, get_search_request
+from quollio_core.repository.cloud_resource_manager import CloudResourceManagerClient
 
 
-@error_handling_decorator
 def bigquery_table_lineage(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
@@ -17,34 +27,84 @@ def bigquery_table_lineage(
     regions: list,
     org_id: str,
     credentials: Credentials,
+    existing_global_ids: Dict[str, bool],
+    enable_multi_projects: str = "DISABLED",
 ) -> None:
     lineage_client = GCPLineageClient(credentials)
-    bq_client = BigQueryClient(credentials, project_id)
-
-    datasets = bq_client.list_dataset_ids()
-    all_tables = generate_table_list(bq_client, datasets)
-    lineage_links = generate_lineage_links(all_tables, lineage_client, project_id, regions)
-    lineage_links = parse_bigquery_table_lineage(lineage_links)
-    logger.debug("The following resources will be ingested. %s", lineage_links)
-
-    update_table_lineage_inputs = gen_table_lineage_payload(tenant_id=tenant_id, endpoint=org_id, tables=lineage_links)
-
-    req_count = 0
-    for update_table_lineage_input in update_table_lineage_inputs:
-        logger.info(
-            "Generating table lineage. downstream: %s -> %s -> %s. upstream: %s",
-            update_table_lineage_input.downstream_database_name,
-            update_table_lineage_input.downstream_schema_name,
-            update_table_lineage_input.downstream_table_name,
-            update_table_lineage_input.upstreams.as_dict(),
+    crm_client = CloudResourceManagerClient(credentials)
+
+    target_project_ids = []
+    if enable_multi_projects == "ENABLED":
+        try:
+            target_projects = crm_client.list_projects()
+        except Exception as e:
+            raise Exception(f"ListProjects by cloud resource manager failed. Err. {str(e)}")
+
+        for target_project in target_projects["projects"]:
+            if target_project is None:
+                logger.warning("projects.Projects returns None. Proceed to loop project value")
+                continue
+
+            target_project_id = target_project.get("projectId", "")
+            if target_project_id == "":
+                logger.warning("projects.Projects is empty string. Proceed to loop project value")
+                continue
+
+            target_project_ids.append(target_project_id)
+    else:
+        target_project_ids.append(project_id)
+
+    update_table_lineage_inputs = []
+    for target_project_id in target_project_ids:
+        bq_client = BigQueryClient(credentials, target_project_id)
+        datasets = bq_client.list_dataset_ids()
+        all_tables = generate_table_list(bq_client, datasets)
+        lineage_links = generate_lineage_links(all_tables, lineage_client, target_project_id, regions)
+        lineage_links = parse_bigquery_table_lineage(lineage_links)
+        logger.debug("The following resources will be ingested. %s", lineage_links)
+
+        update_table_lineage_input = gen_table_lineage_payload(
+            tenant_id=tenant_id, endpoint=org_id, tables=lineage_links
         )
-        status_code = qdc_client.update_lineage_by_id(
-            global_id=update_table_lineage_input.downstream_global_id,
-            payload=update_table_lineage_input.upstreams.as_dict(),
+        update_table_lineage_input = gen_table_avro_lineage_payload(
+            tenant_id=tenant_id,
+            endpoint=org_id,
+            tables=lineage_links,
+            existing_global_ids=existing_global_ids,
        )
-        if status_code == 200:
-            req_count += 1
-    logger.info("Generating table lineage is finished. %s lineages are ingested.", req_count)
+        update_table_lineage_inputs.extend(update_table_lineage_input)
+
+    stack_name = os.getenv("CF_STACK")
+    import_req = GetImportURLRequest(
+        service_name="bigquery",
+        source_name=stack_name,
+        file_name="{name}.avro".format(name=stack_name),
+        override_logical_name="false",
+        update_mode="partial",
+    )
+    datasource_id = new_global_id(tenant_id=tenant_id, cluster_id=org_id, data_id="", data_type="data_source")
+    logger.debug("Datasource id: {dsrc_id}".format(dsrc_id=datasource_id))
+
+    import_res = qdc_client.get_import_url(datasource_id=datasource_id, payload=import_req)
+    if import_res is None:
+        logger.error("get_import_url failed. Please retry `load_lineage` again")
+        return
+    logger.debug("ImportResponse: {res}".format(res=import_res))
+
+    avro_schema = AvroAsset.avro_schema_to_python()
+
+    buffer = io.BytesIO()
+    writer(buffer, avro_schema, update_table_lineage_inputs)
+
+    res = qdc_client.upload_file(
+        url=import_res.location,
+        metadata=import_res.datasource_metadata_response_body,
+        buffer=buffer.getbuffer().tobytes(),
+    )
+
+    if res == 200:
+        logger.info("Upload table lineage is finished.")
+    return
 
 
 @error_handling_decorator
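Both the BigQuery and Redshift paths now serialize their payloads to Avro in memory and push the bytes to a signed import URL, instead of calling update_lineage_by_id per asset. A self-contained sketch of the serialization step, using a stand-in schema rather than the real AvroAsset.avro_schema_to_python() output:

```python
import io

from fastavro import reader, writer

# Stand-in record schema; the package derives the real one from AvroAsset.
avro_schema = {
    "name": "AvroAsset",
    "type": "record",
    "fields": [
        {"name": "id", "type": "string"},
        {"name": "object_type", "type": "string"},
    ],
}
records = [{"id": "abc123", "object_type": "table"}]

# fastavro.writer accepts any file-like object, so an in-memory buffer
# avoids a temp file before the HTTP upload.
buffer = io.BytesIO()
writer(buffer, avro_schema, records)
payload = buffer.getbuffer().tobytes()  # bytes handed to qdc_client.upload_file

# Round-trip check: the serialized bytes decode back to the records.
assert list(reader(io.BytesIO(payload))) == records
```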
quollio_core/profilers/redshift.py CHANGED
@@ -1,10 +1,17 @@
+import io
 import logging
-from typing import List
+import os
+from typing import Dict, List
 
-from quollio_core.profilers.lineage import gen_table_lineage_payload, gen_table_lineage_payload_inputs
+from fastavro import writer
+
+from quollio_core.helper.core import new_global_id
+from quollio_core.models.avroasset import AvroAsset
+from quollio_core.models.qdc import GetImportURLRequest
+from quollio_core.profilers.lineage import gen_table_avro_lineage_payload, gen_table_lineage_payload_inputs
 from quollio_core.profilers.sqllineage import SQLLineage
 from quollio_core.profilers.stats import (
-    gen_table_stats_payload_from_tuple,
+    gen_table_stats_avro_payload_from_tuple,
     get_is_target_stats_items,
     render_sql_for_stats,
 )
@@ -18,6 +25,7 @@ def redshift_table_level_lineage(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
     dbt_table_name: str,
+    existing_global_ids: Dict[str, bool],
 ) -> None:
     with redshift.RedshiftQueryExecutor(config=conn) as redshift_executor:
         results = redshift_executor.get_query_results(
@@ -34,28 +42,39 @@ def redshift_table_level_lineage(
         )
     lineage_payload_inputs = gen_table_lineage_payload_inputs(input_data=results)
 
-    update_table_lineage_inputs = gen_table_lineage_payload(
+    update_table_lineage_inputs = gen_table_avro_lineage_payload(
         tenant_id=tenant_id,
         endpoint=conn.host,
         tables=lineage_payload_inputs,
+        existing_global_ids=existing_global_ids,
+    )
+    stack_name = os.getenv("CF_STACK")
+    import_req = GetImportURLRequest(
+        service_name="redshift",
+        source_name=stack_name,
+        file_name="{name}.avro".format(name=stack_name),
+        override_logical_name="false",
+        update_mode="partial",
     )
+    datasource_id = new_global_id(tenant_id=tenant_id, cluster_id=conn.host, data_id="", data_type="data_source")
+    logger.debug("Datasource id: {dsrc_id}".format(dsrc_id=datasource_id))
+    import_res = qdc_client.get_import_url(datasource_id=datasource_id, payload=import_req)
+    if import_res is None:
+        logger.error("get_import_url failed. Please retry `load_lineage` again")
+        return
+    logger.debug("ImportResponse: {res}".format(res=import_res))
 
-    req_count = 0
-    for update_table_lineage_input in update_table_lineage_inputs:
-        logger.info(
-            "Generating table lineage. downstream: {db} -> {schema} -> {table}".format(
-                db=update_table_lineage_input.downstream_database_name,
-                schema=update_table_lineage_input.downstream_schema_name,
-                table=update_table_lineage_input.downstream_table_name,
-            )
-        )
-        status_code = qdc_client.update_lineage_by_id(
-            global_id=update_table_lineage_input.downstream_global_id,
-            payload=update_table_lineage_input.upstreams.as_dict(),
-        )
-        if status_code == 200:
-            req_count += 1
-    logger.info(f"Generating table lineage is finished. {req_count} lineages are ingested.")
+    avro_schema = AvroAsset.avro_schema_to_python()
+
+    buffer = io.BytesIO()
+    writer(buffer, avro_schema, update_table_lineage_inputs)
+    res = qdc_client.upload_file(
+        url=import_res.location,
+        metadata=import_res.datasource_metadata_response_body,
+        buffer=buffer.getbuffer().tobytes(),
+    )
+    if res == 200:
+        logger.info("Upload table lineage is finished.")
     return
 
 
@@ -82,6 +101,7 @@ def redshift_table_stats(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
     stats_items: List[str],
+    existing_global_ids: Dict[str, bool],
 ) -> None:
     is_aggregate_items = get_is_target_stats_items(stats_items=stats_items)
     with redshift.RedshiftQueryExecutor(config=conn) as redshift_executor:
@@ -92,7 +112,7 @@
     stats_views = redshift_executor.get_query_results(query=stats_query)
     logger.info("Found %s for table statistics.", len(stats_views))
 
-    req_count = 0
+    update_stats_inputs = list()
     for stats_view in stats_views:
         table_fqn = "{catalog}.{schema}.{table}".format(
             catalog=stats_view[0], schema=stats_view[1], table=stats_view[2]
@@ -100,23 +120,39 @@
         stats_query = render_sql_for_stats(is_aggregate_items=is_aggregate_items, table_fqn=table_fqn)
         logger.debug(f"The following sql will be fetched to retrieve stats values. {stats_query}")
         stats_result = redshift_executor.get_query_results(query=stats_query)
-        payloads = gen_table_stats_payload_from_tuple(tenant_id=tenant_id, endpoint=conn.host, stats=stats_result)
-        for payload in payloads:
-            logger.info(
-                "Generating table stats. asset: {db} -> {schema} -> {table} -> {column}".format(
-                    db=payload.db,
-                    schema=payload.schema,
-                    table=payload.table,
-                    column=payload.column,
-                )
-            )
-            status_code = qdc_client.update_stats_by_id(
-                global_id=payload.global_id,
-                payload=payload.body.get_column_stats(),
-            )
-            if status_code == 200:
-                req_count += 1
-    logger.info(f"Generating table stats is finished. {req_count} stats are ingested.")
+        payload = gen_table_stats_avro_payload_from_tuple(
+            tenant_id=tenant_id, endpoint=conn.host, stats=stats_result, existing_global_ids=existing_global_ids
+        )
+        update_stats_inputs += payload
+
+    stack_name = os.getenv("CF_STACK")
+    import_req = GetImportURLRequest(
+        service_name="redshift",
+        source_name=stack_name,
+        file_name="{name}.avro".format(name=stack_name),
+        override_logical_name="false",
+        update_mode="partial",
+    )
+    datasource_id = new_global_id(tenant_id=tenant_id, cluster_id=conn.host, data_id="", data_type="data_source")
+    logger.debug("Datasource id: {dsrc_id}".format(dsrc_id=datasource_id))
+    import_res = qdc_client.get_import_url(datasource_id=datasource_id, payload=import_req)
+    if import_res is None:
+        logger.error("get_import_url failed. Please retry load_stats again")
+        return
+    logger.debug("ImportResponse: {res}".format(res=import_res))
+
+    avro_schema = AvroAsset.avro_schema_to_python()
+
+    buffer = io.BytesIO()
+    writer(buffer, avro_schema, update_stats_inputs)
+    res = qdc_client.upload_file(
+        url=import_res.location,
+        metadata=import_res.datasource_metadata_response_body,
+        buffer=buffer.getbuffer().tobytes(),
+    )
+    if res == 200:
+        logger.info("Generating table stats is finished.")
+
     return
 
 
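Throughout these changes, existing_global_ids acts as a membership gate: the payload generators drop any asset whose global ID is not already present in the catalog export. A hypothetical sketch of the dict that gen_existing_global_id_dict presumably builds (its real implementation in quollio_core/profilers/qdc.py is not part of this diff):

```python
from typing import Dict, List


def gen_existing_global_id_dict_sketch(avro_content: List[dict]) -> Dict[str, bool]:
    """Hypothetical reimplementation: index the downloaded Avro records by id."""
    return {record["id"]: True for record in avro_content if record.get("id")}


# Payload generators then use the dict as an O(1) existence check:
existing_global_ids = gen_existing_global_id_dict_sketch([{"id": "abc123"}])
assert existing_global_ids.get("abc123") is True
assert existing_global_ids.get("unknown") is not True  # asset is skipped
```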
quollio_core/profilers/stats.py CHANGED
@@ -147,6 +147,43 @@ def gen_table_stats_payload(tenant_id: str, endpoint: str, stats: List[Dict[str,
     return payloads
 
 
+def gen_table_stats_avro_payload_from_tuple(
+    tenant_id: str, endpoint: str, stats: Tuple[List[str]], existing_global_ids: Dict[str, bool]
+) -> List[Dict[str, str]]:
+    payloads = list()
+    for stat in stats:
+        db_name, schema_name, table_name, column_name = stat[:4]
+
+        global_id_arg = "{db}{schema}{table}{column}".format(
+            db=db_name, schema=schema_name, table=table_name, column=column_name
+        )
+        table_global_id = new_global_id(
+            tenant_id=tenant_id, cluster_id=endpoint, data_id=global_id_arg, data_type="column"
+        )
+
+        if existing_global_ids.get(table_global_id) is not True:
+            continue
+
+        avro_assets = AvroAsset(
+            id=table_global_id,
+            object_type="column",
+            parents=[db_name, schema_name, table_name],
+            name=column_name,
+            stats_max=convert_value_type(stat[4], True),
+            stats_min=convert_value_type(stat[5], True),
+            stats_mean=convert_value_type(stat[8], True),
+            stats_median=convert_value_type(stat[9], True),
+            stats_mode=convert_value_type(stat[10], True),
+            stats_stddev=convert_value_type(stat[11], True),
+            stats_number_of_null=convert_value_type(stat[6], True),
+            stats_number_of_unique=convert_value_type(stat[7], True),
+        )
+
+        payloads.append(avro_assets.to_dict())
+
+    return payloads
+
+
 def gen_table_stats_payload_from_tuple(
     tenant_id: str, endpoint: str, stats: Tuple[List[str]]
 ) -> List[Dict[str, Union[str, List[Dict[str, str]]]]]:
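gen_table_stats_avro_payload_from_tuple consumes positional rows, so the column order of the stats query is load-bearing. An illustrative row with made-up values, showing which index feeds which stats field:

```python
# Hypothetical stats row in the positional order the function expects:
# indices 0-3 identify the column, 4-11 carry the aggregates.
stat = (
    "demo_db", "public", "orders", "amount",  # 0-3: db, schema, table, column
    "9800",    # 4: max_value     -> stats_max
    "12",      # 5: min_value     -> stats_min
    3,         # 6: null_count    -> stats_number_of_null
    412,       # 7: cardinality   -> stats_number_of_unique
    97.4,      # 8: avg_value     -> stats_mean
    88.0,      # 9: median_value  -> stats_median
    "50",      # 10: mode_value   -> stats_mode
    21.9,      # 11: stddev_value -> stats_stddev
)
db_name, schema_name, table_name, column_name = stat[:4]
```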
quollio_core/redshift.py CHANGED
@@ -3,16 +3,17 @@ import logging
 import os
 import shutil
 
-from quollio_core.helper.core import setup_dbt_profile
+from quollio_core.helper.core import is_valid_domain, setup_dbt_profile
 from quollio_core.helper.env_default import env_default
 from quollio_core.helper.log import set_log_level
+from quollio_core.profilers.qdc import gen_existing_global_id_dict, get_avro_file_content
 from quollio_core.profilers.redshift import (
     redshift_table_level_lineage,
     redshift_table_level_sqllineage,
     redshift_table_stats,
 )
 from quollio_core.profilers.stats import get_column_stats_items
-from quollio_core.repository import dbt, qdc, redshift
+from quollio_core.repository import dbt, qdc, redshift, ssm
 
 logger = logging.getLogger(__name__)
 
@@ -84,11 +85,20 @@ def load_lineage(
     tenant_id: str,
 ) -> None:
     logger.info("Generate redshift table to table lineage.")
+
+    file_content = get_avro_file_content(
+        tenant_id=tenant_id,
+        account_id=conn.host,
+        qdc_client=qdc_client,
+    )
+    existing_global_ids = gen_existing_global_id_dict(avro_content=file_content)
+
     redshift_table_level_lineage(
         conn=conn,
         qdc_client=qdc_client,
         tenant_id=tenant_id,
         dbt_table_name="quollio_lineage_table_level",
+        existing_global_ids=existing_global_ids,
     )
 
     logger.info("Generate redshift view level lineage.")
@@ -97,6 +107,7 @@ def load_lineage(
         qdc_client=qdc_client,
         tenant_id=tenant_id,
         dbt_table_name="quollio_lineage_view_level",
+        existing_global_ids=existing_global_ids,
     )
 
     logger.info("Lineage data is successfully loaded.")
@@ -115,12 +126,20 @@ def load_stats(
     if stats_items is None:
         raise ValueError("No stats items are not selected. Please specify any value to `stats_items` param.")
 
+    file_content = get_avro_file_content(
+        tenant_id=tenant_id,
+        account_id=conn.host,
+        qdc_client=qdc_client,
+    )
+    existing_global_ids = gen_existing_global_id_dict(avro_content=file_content)
+
     logger.info("The following values will be aggregated. {stats_items}".format(stats_items=stats_items))
     redshift_table_stats(
         conn=conn,
         qdc_client=qdc_client,
         tenant_id=tenant_id,
         stats_items=stats_items,
+        existing_global_ids=existing_global_ids,
     )
 
     logger.info("Stats data is successfully loaded.")
@@ -252,6 +271,7 @@ if __name__ == "__main__":
         type=str,
         choices=["debug", "info", "warn", "error", "none"],
         action=env_default("LOG_LEVEL"),
+        default="info",
         required=False,
         help="The log level for dbt commands. Default value is info",
     )
@@ -285,6 +305,16 @@ if __name__ == "__main__":
         required=False,
         help="The client secrete that is created on Quollio console to let clients access Quollio External API",
     )
+    parser.add_argument(
+        "--external_api_access",
+        type=str,
+        choices=["PUBLIC", "VPC_ENDPOINT"],
+        action=env_default("EXTERNAL_API_ACCESS"),
+        default="PUBLIC",
+        required=False,
+        help="Access method to Quollio API. Choose 'VPC_ENDPOINT' \
+        if you use an API Gateway VPC endpoint. Default is 'PUBLIC'.",
+    )
 
     stats_items = get_column_stats_items()
     parser.add_argument(
@@ -323,11 +353,22 @@
         log_level=args.log_level,
         dbt_macro_source=args.dbt_macro_source,
     )
+
+    api_url = args.api_url
+    if args.external_api_access == "VPC_ENDPOINT":
+        api_url, err = ssm.get_parameter_by_assume_role(args.api_url)
+        if err is not None:
+            logger.error("Fail to ssm.get_parameter_by_assume_role. {err}".format(err=err))
+            raise Exception("Fail to ssm.get_parameter_by_assume_role. {err}".format(err=err))
+    is_domain_valid = is_valid_domain(domain=api_url, domain_type=args.external_api_access)
+    if not is_domain_valid:
+        raise ValueError("The format of quollio API URL is invalid. The URL must end with `.com`")
+
     if "load_lineage" in args.commands:
         qdc_client = qdc.QDCExternalAPIClient(
             client_id=args.client_id,
             client_secret=args.client_secret,
-            base_url=args.api_url,
+            base_url=api_url,
         )
         load_lineage(
             conn=conn,
@@ -338,7 +379,7 @@ if __name__ == "__main__":
         qdc_client = qdc.QDCExternalAPIClient(
             client_id=args.client_id,
             client_secret=args.client_secret,
-            base_url=args.api_url,
+            base_url=api_url,
         )
         load_stats(
             conn=conn,
@@ -348,7 +389,7 @@ if __name__ == "__main__":
     )
     if "load_sqllineage" in args.commands:
        qdc_client = qdc.QDCExternalAPIClient(
-            base_url=args.api_url,
+            base_url=api_url,
             client_id=args.client_id,
             client_secret=args.client_secret,
         )
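In VPC_ENDPOINT mode the value passed as --api_url is handed to ssm.get_parameter_by_assume_role, which suggests it names an SSM parameter rather than a literal URL; the helper's internals are outside this diff. A condensed, hedged sketch of the resolve-then-validate flow:

```python
# Condensed sketch of the URL resolution above; `ssm` and
# `is_valid_domain` are the package's helpers, whose internals
# this diff does not show.
def resolve_api_url(args, ssm, is_valid_domain) -> str:
    api_url = args.api_url
    if args.external_api_access == "VPC_ENDPOINT":
        # Here args.api_url is treated as an SSM parameter name (assumption).
        api_url, err = ssm.get_parameter_by_assume_role(args.api_url)
        if err is not None:
            raise Exception(f"Fail to ssm.get_parameter_by_assume_role. {err}")
    if not is_valid_domain(domain=api_url, domain_type=args.external_api_access):
        raise ValueError("The format of quollio API URL is invalid. The URL must end with `.com`")
    return api_url
```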
quollio_core/repository/cloud_resource_manager.py ADDED
@@ -0,0 +1,25 @@
+from google.oauth2.service_account import Credentials
+from googleapiclient.discovery import build
+
+
+class CloudResourceManagerClient:
+    """Client to interact with the Cloud Resource Manager API."""
+
+    def __init__(self, credentials: Credentials) -> None:
+        """Initialize the Cloud Resource Manager client with provided credentials."""
+        self.client = self.__initialize(credentials=credentials)
+
+    def __initialize(self, credentials: Credentials):
+        return build("cloudresourcemanager", "v1", credentials=credentials)
+
+    def list_projects(self):
+        """List all projects accessible with the current credentials."""
+        request = self.client.projects().list()
+        response = request.execute()
+        return response
+
+    def get_project(self, project_id: str):
+        """Get a specific project by project ID."""
+        request = self.client.projects().get(projectId=project_id)
+        response = request.execute()
+        return response
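A minimal usage sketch for the new client; the key-file path is hypothetical, and note that projects().list() can paginate, which this sketch (like the client above) does not handle:

```python
from google.oauth2.service_account import Credentials

from quollio_core.repository.cloud_resource_manager import CloudResourceManagerClient

# Hypothetical key path; any credentials accepted by googleapiclient work.
credentials = Credentials.from_service_account_file("/path/to/service-account.json")
crm_client = CloudResourceManagerClient(credentials)

# v1 projects.list returns a JSON body keyed by "projects", which is what
# bigquery_table_lineage iterates when multi-project mode is ENABLED.
for project in crm_client.list_projects().get("projects", []):
    print(project.get("projectId"))
```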
quollio_core/snowflake.py CHANGED
@@ -312,6 +312,7 @@ if __name__ == "__main__":
         type=str,
         choices=["debug", "info", "warn", "error", "none"],
         action=env_default("LOG_LEVEL"),
+        default="info",
         required=False,
         help="The log level for dbt commands. Default value is info",
     )
quollio_core-0.6.4.dist-info/METADATA → quollio_core-0.7.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: quollio-core
-Version: 0.6.4
+Version: 0.7.0
 Summary: Quollio Core
 Author-email: quollio-dev <qt.dev@quollio.com>
 Maintainer-email: RyoAriyama <ryo.arym@gmail.com>, tharuta <35373297+TakumiHaruta@users.noreply.github.com>
quollio_core-0.6.4.dist-info/RECORD → quollio_core-0.7.0.dist-info/RECORD CHANGED
@@ -1,8 +1,8 @@
-quollio_core/__init__.py,sha256=Cb-B2rl0ckOxRzDpNBNZdhUUNRb6SBPYnZxyLrdrXj0,83
-quollio_core/bigquery.py,sha256=6Oq4DVGpa3X21Es_nbrsb8pK3vaxwb9Egnvq3huo95k,5894
+quollio_core/__init__.py,sha256=ZtRODvd0lC2-vqcXyfKDya8bb6FyyFWBsGNz7TSlO-s,83
+quollio_core/bigquery.py,sha256=nxMWldMr14HzOeyzYM_fRrfHQ7jbm2HyGzc1r46mlco,6821
 quollio_core/bricks.py,sha256=8h3kbI2b6lGH2s-56jE_Q5-R5-nIsQYMfvtRrkFOzoU,10784
-quollio_core/redshift.py,sha256=KcdljY95xYf9JYrsaMOBoP_XxQQ8wFVE5ue_XEMVSFc,11504
-quollio_core/snowflake.py,sha256=fMkZ7OlbimgLg4TxVASTCpXSH2MPeawveAFi0xhKqnw,17060
+quollio_core/redshift.py,sha256=Yi_udcgfen2PoCkDTIhemeCFbDVUU1rYWro9CyjHCZA,13192
+quollio_core/snowflake.py,sha256=aOQ8tLSbHJEx_TUGaZLjix5KWgLiRp3A1tRx5qgUtRI,17084
 quollio_core/teradata.py,sha256=H2VUcJvr8W-M2wvm3710Gf1ENb-BSscrDRKNm8gdHJE,8227
 quollio_core/dbt_projects/databricks/.gitignore,sha256=1jJAyXSzJ3YUm0nx3i7wUSE4RjQMX3ad6F8O88UbtzI,29
 quollio_core/dbt_projects/databricks/README.md,sha256=ZpRQyhFAODAiS8dc1Kb_ndkul4cu4o4udN_EMa49CU4,440
@@ -47,7 +47,7 @@ quollio_core/dbt_projects/snowflake/packages_hub.yml,sha256=p9Bl2C44gdC6iYTUkz_1
 quollio_core/dbt_projects/snowflake/packages_local.yml,sha256=ryyJSXv83gYFu48xmzG5Z1l746jGCUBE6hs7pUNwuXE,43
 quollio_core/dbt_projects/snowflake/analyses/.gitkeep,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 quollio_core/dbt_projects/snowflake/macros/.gitkeep,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-quollio_core/dbt_projects/snowflake/macros/materialization/divided_view.sql,sha256=oLf_rQ0tV_I_nhyEaMZ6rpmzIoLgMHor1bruBDyJaqU,3992
+quollio_core/dbt_projects/snowflake/macros/materialization/divided_view.sql,sha256=WjN_baGb_FuDX831rR5G7Payw5t91VaoQobEkTJR3jQ,4880
 quollio_core/dbt_projects/snowflake/macros/materialization/get_imported_databases.sql,sha256=WAFl9CyM-G07O7vZD2MkA1hncpjV_gSMGc2V8nRJRPk,1435
 quollio_core/dbt_projects/snowflake/models/quollio_lineage_column_level.sql,sha256=Zhj0EXF1K8S-OkFxz3IBHe2olXktYrvly0LwZBOAUXw,5333
 quollio_core/dbt_projects/snowflake/models/quollio_lineage_column_level.yml,sha256=a2uNIAh-xw51eu-GmHVuAnGnTbwK7h8-DjDeQtK3KaQ,711
@@ -71,18 +71,19 @@ quollio_core/helper/log_utils.py,sha256=QontLKETHjSAbQniJ7YqS0RY2AYvFHSjrlPiGr31
 quollio_core/models/avroasset.py,sha256=YZHzOS62N0_sidneXI3IZ2MA8Bz1vFVgF6F9_UilC3s,603
 quollio_core/models/qdc.py,sha256=UObaUpvAQ4vOhI6jfwvNFrJ3--6AX2v9yl9_d3Juy7M,739
 quollio_core/profilers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-quollio_core/profilers/bigquery.py,sha256=mEr7CFQNgBFqWR8XfCOk8WTm5k5qZhLF8ODVWfskPRI,5797
+quollio_core/profilers/bigquery.py,sha256=rJ2eMZlUtEmsNCCkenC9DuL7--btcO2dF_NmIOawGJ4,7947
 quollio_core/profilers/databricks.py,sha256=ik4RiR_GOeU3S7s6C6Y9SGe1D_Y_f98BDWJVlEJXL4U,7868
 quollio_core/profilers/lineage.py,sha256=GMWue6lgiz7wFYnNpaHVFivprA-iqhbCHf63IsVB8Vk,11260
 quollio_core/profilers/qdc.py,sha256=P0STRfe5G4d-UI7RdVbYmAfP_tAn1HbFUjeirxWipz4,995
-quollio_core/profilers/redshift.py,sha256=p6ONDCkhndZAOcKAwEyQ5fsi-jsQrlwHHb7LTI_m1uk,6473
+quollio_core/profilers/redshift.py,sha256=f5F3jnFJP2SbmPtG-PNyVgkt4mtfxuUX45pIS0mrw60,7856
 quollio_core/profilers/snowflake.py,sha256=ewvULWIlcq2h0jOyRzUpedW0NS8QlkSgICS-dZDYl18,13027
 quollio_core/profilers/sqllineage.py,sha256=h0FT6CYb0A20zSc68GELZ7Q8bDbaHLQnZQHsXBEXBug,5261
-quollio_core/profilers/stats.py,sha256=PwMNyr7JwdrVbSSfVd-XsgCZV7sKoRjihaRhhZ88uyM,9328
+quollio_core/profilers/stats.py,sha256=Go1tR8IMpMZnHZJzYtAZTC89ZDkebUmGxljwy8h5KC0,10752
 quollio_core/profilers/teradata/lineage.py,sha256=2wNksBQD8vC6UTQwCglPsF53YMEVIkAb2CWTmpiTHDU,7368
 quollio_core/profilers/teradata/stats.py,sha256=OagvkTRFiWVbiLABwZwR3wQ7y36edwOViDetHsYiyxI,9277
 quollio_core/repository/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 quollio_core/repository/bigquery.py,sha256=3AyGcJNYGnUyMweyc6lGm4quwrOzd-ZBS2zNnFwafII,3990
+quollio_core/repository/cloud_resource_manager.py,sha256=tmHxjV3AmKwv3OJ-f40N-XQA1qmxZTSDBDS3YK69CIo,978
 quollio_core/repository/databricks.py,sha256=9Cgdv8qBnVaHqu3RA-IUBieAqb69moQ-KAAMVSf5Ds4,1877
 quollio_core/repository/dbt.py,sha256=cnLwJPywLi8VowVW7zfIBa9jxVwDWO7xzzNRn1vWiuw,659
 quollio_core/repository/qdc.py,sha256=Ni0rk9CX8ienqM_HYLuWLBKTwycvTENC7x7wGWhzjXs,8978
@@ -90,7 +91,7 @@ quollio_core/repository/redshift.py,sha256=p2ouEuYcDCjx1oBhc6H1ekQsvEqHGd3bFu3PW
 quollio_core/repository/snowflake.py,sha256=yCYXrYf4I5GL_ITNTXoggj0xNbQsdwxPSmsVvZYwUVU,3869
 quollio_core/repository/ssm.py,sha256=xpm1FzbBnIsBptuYPUNnPgkKU2AH3XxI-ZL0bEetvW0,2182
 quollio_core/repository/teradata.py,sha256=1AExxRBTswpSyF4OVyAUkoiZ0yVRfqt4T99FdllkTEI,3763
-quollio_core-0.6.4.dist-info/licenses/LICENSE,sha256=V8j_M8nAz8PvAOZQocyRDX7keai8UJ9skgmnwqETmdY,34520
-quollio_core-0.6.4.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
-quollio_core-0.6.4.dist-info/METADATA,sha256=KSBNAaAXnsuB5w8NomEzIaKHX02N5ue-33qdc2zSDqI,7023
-quollio_core-0.6.4.dist-info/RECORD,,
+quollio_core-0.7.0.dist-info/licenses/LICENSE,sha256=V8j_M8nAz8PvAOZQocyRDX7keai8UJ9skgmnwqETmdY,34520
+quollio_core-0.7.0.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
+quollio_core-0.7.0.dist-info/METADATA,sha256=xrt5HSRtSF0M-2Ef09CNsB2fo6hyMifPOySYhF5U1nA,7023
+quollio_core-0.7.0.dist-info/RECORD,,