quollio-core 0.4.19__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. {quollio_core-0.4.19 → quollio_core-0.5.0}/PKG-INFO +2 -1
  2. {quollio_core-0.4.19 → quollio_core-0.5.0}/pyproject.toml +1 -0
  3. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/__init__.py +1 -1
  4. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/models/quollio_stats_profiling_columns.sql +1 -1
  5. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/helper/core.py +7 -0
  6. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/profilers/sqllineage.py +13 -9
  7. quollio_core-0.5.0/quollio_core/profilers/teradata/lineage.py +172 -0
  8. quollio_core-0.5.0/quollio_core/profilers/teradata/stats.py +218 -0
  9. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/repository/qdc.py +0 -7
  10. quollio_core-0.5.0/quollio_core/repository/ssm.py +59 -0
  11. quollio_core-0.5.0/quollio_core/repository/teradata.py +103 -0
  12. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/snowflake.py +26 -5
  13. quollio_core-0.5.0/quollio_core/teradata.py +254 -0
  14. {quollio_core-0.4.19 → quollio_core-0.5.0}/LICENSE +0 -0
  15. {quollio_core-0.4.19 → quollio_core-0.5.0}/README.md +0 -0
  16. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/bigquery.py +0 -0
  17. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/bricks.py +0 -0
  18. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/databricks/.gitignore +0 -0
  19. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/databricks/README.md +0 -0
  20. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/databricks/analyses/.gitkeep +0 -0
  21. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/databricks/dbt_project.yml +0 -0
  22. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/databricks/macros/.gitkeep +0 -0
  23. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/databricks/models/quollio_lineage_column_level.sql +0 -0
  24. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/databricks/models/quollio_lineage_column_level.yml +0 -0
  25. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/databricks/models/quollio_lineage_table_level.sql +0 -0
  26. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/databricks/models/quollio_lineage_table_level.yml +0 -0
  27. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/databricks/models/sources.yml +0 -0
  28. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/databricks/packages_hub.yml +0 -0
  29. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/databricks/packages_local.yml +0 -0
  30. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/databricks/profiles/profiles_template.yml +0 -0
  31. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/databricks/seeds/.gitkeep +0 -0
  32. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/databricks/snapshots/.gitkeep +0 -0
  33. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/README.md +0 -0
  34. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/analyses/.gitkeep +0 -0
  35. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/dbt_project.yml +0 -0
  36. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/macros/.gitkeep +0 -0
  37. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/macros/materialization/divided_view.sql +0 -0
  38. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/models/quollio_lineage_table_level.sql +0 -0
  39. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/models/quollio_lineage_table_level.yml +0 -0
  40. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/models/quollio_lineage_view_level.sql +0 -0
  41. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/models/quollio_lineage_view_level.yml +0 -0
  42. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/models/quollio_sqllineage_sources.sql +0 -0
  43. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/models/quollio_sqllineage_sources.yml +0 -0
  44. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/models/quollio_stats_columns.sql +0 -0
  45. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/models/quollio_stats_columns.yml +0 -0
  46. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/models/quollio_stats_profiling_columns.sql +0 -0
  47. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/models/quollio_stats_profiling_columns.yml +0 -0
  48. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/models/sources.yml +0 -0
  49. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/packages_hub.yml +0 -0
  50. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/packages_local.yml +0 -0
  51. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/profiles/profiles_template.yml +0 -0
  52. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/seeds/.gitkeep +0 -0
  53. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/redshift/snapshots/.gitkeep +0 -0
  54. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/seeds/.gitkeep +0 -0
  55. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/README.md +0 -0
  56. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/analyses/.gitkeep +0 -0
  57. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/dbt_project.yml +0 -0
  58. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/macros/.gitkeep +0 -0
  59. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/macros/materialization/divided_view.sql +0 -0
  60. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/models/quollio_lineage_column_level.sql +0 -0
  61. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/models/quollio_lineage_column_level.yml +0 -0
  62. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/models/quollio_lineage_table_level.sql +0 -0
  63. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/models/quollio_lineage_table_level.yml +0 -0
  64. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/models/quollio_sqllineage_sources.sql +0 -0
  65. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/models/quollio_sqllineage_sources.yml +0 -0
  66. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/models/quollio_stats_columns.sql +0 -0
  67. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/models/quollio_stats_columns.yml +0 -0
  68. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/models/quollio_stats_profiling_columns.yml +0 -0
  69. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/models/sources.yml +0 -0
  70. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/packages_hub.yml +0 -0
  71. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/packages_local.yml +0 -0
  72. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/profiles/profiles_template.yml +0 -0
  73. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/seeds/.gitkeep +0 -0
  74. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/dbt_projects/snowflake/snapshots/.gitkeep +0 -0
  75. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/helper/__init__.py +0 -0
  76. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/helper/env_default.py +0 -0
  77. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/helper/log.py +0 -0
  78. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/helper/log_utils.py +0 -0
  79. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/profilers/__init__.py +0 -0
  80. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/profilers/bigquery.py +0 -0
  81. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/profilers/databricks.py +0 -0
  82. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/profilers/lineage.py +0 -0
  83. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/profilers/redshift.py +0 -0
  84. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/profilers/snowflake.py +0 -0
  85. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/profilers/stats.py +0 -0
  86. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/redshift.py +0 -0
  87. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/repository/__init__.py +0 -0
  88. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/repository/bigquery.py +0 -0
  89. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/repository/databricks.py +0 -0
  90. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/repository/dbt.py +0 -0
  91. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/repository/redshift.py +0 -0
  92. {quollio_core-0.4.19 → quollio_core-0.5.0}/quollio_core/repository/snowflake.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: quollio-core
3
- Version: 0.4.19
3
+ Version: 0.5.0
4
4
  Summary: Quollio Core
5
5
  Author-email: quollio-dev <qt.dev@quollio.com>
6
6
  Maintainer-email: RyoAriyama <ryo.arym@gmail.com>, tharuta <35373297+TakumiHaruta@users.noreply.github.com>
@@ -37,6 +37,7 @@ Requires-Dist: google-cloud-bigquery==3.22.0
37
37
  Requires-Dist: google-cloud-datacatalog==3.19.0
38
38
  Requires-Dist: google-cloud-datacatalog-lineage==0.3.6
39
39
  Requires-Dist: google-api-python-client==2.131.0
40
+ Requires-Dist: teradatasql==20.0.0.15
40
41
  Requires-Dist: black>=22.3.0 ; extra == "test"
41
42
  Requires-Dist: coverage>=7.3.2 ; extra == "test"
42
43
  Requires-Dist: isort>=5.10.1 ; extra == "test"
@@ -49,6 +49,7 @@ dependencies = [
49
49
  ,"google-cloud-datacatalog==3.19.0"
50
50
  ,"google-cloud-datacatalog-lineage==0.3.6"
51
51
  ,"google-api-python-client==2.131.0"
52
+ ,"teradatasql==20.0.0.15"
52
53
  ]
53
54
  dynamic = ["version", "description"]
54
55
 
@@ -1,4 +1,4 @@
1
1
  """Quollio Core"""
2
2
 
3
- __version__ = "0.4.19"
3
+ __version__ = "0.5.0"
4
4
  __author__ = "Quollio Technologies, Inc"
@@ -87,7 +87,7 @@ WITH columns AS (
87
87
  , data_type
88
88
  , case when data_type in('NUMBER','DECIMAL', 'DEC', 'NUMERIC',
89
89
  'INT', 'INTEGER', 'BIGINT', 'SMALLINT',
90
- 'TINYINT', 'BYTEINT')
90
+ 'TINYINT', 'BYTEINT', 'FLOAT')
91
91
  THEN true
92
92
  else false END AS is_calculable
93
93
  FROM
@@ -35,3 +35,10 @@ def setup_dbt_profile(connections_json: Dict[str, str], template_path: str, temp
35
35
 
36
36
def trim_prefix(s: str, prefix: str) -> str:
    """Return ``s`` with leading characters removed via ``str.lstrip(prefix)``."""
    # NOTE(review): str.lstrip treats `prefix` as a *set of characters*, not a literal prefix,
    # so trim_prefix("aab", "a") returns "b". If a literal prefix removal is intended,
    # confirm against callers before switching to a startswith()/slice approach.
    return s.lstrip(prefix)
38
+
39
+
40
def is_valid_domain(domain: str, domain_type: str) -> bool:
    """Check that ``domain`` ends with the suffix required for ``domain_type``.

    Args:
        domain: URL/domain string to check.
        domain_type: ``"VPC_ENDPOINT"`` requires a trailing ``/api``; any other
            value requires a trailing ``.com``.

    Returns:
        True when ``domain`` has the required suffix, otherwise False.
    """
    required_suffix = "/api" if domain_type == "VPC_ENDPOINT" else ".com"
    return domain.endswith(required_suffix)
@@ -67,15 +67,19 @@ class SQLLineage:
67
67
  dest_schema = dest_schema.upper() if dest_schema is not None else None
68
68
 
69
69
  # MEMO: Complement sql with dialect, source database and source schema info.
70
- optimized_stmt: sqlglot.Expression = optimizer.qualify.qualify(
71
- statement,
72
- dialect=dialect,
73
- catalog=src_db,
74
- db=src_schema,
75
- qualify_columns=False,
76
- validate_qualify_columns=False,
77
- identify=False,
78
- )
70
+ # MEMO: Skipping qualify because it normalizes the table names.
71
+ if dialect == "teradata":
72
+ optimized_stmt = statement
73
+ else:
74
+ optimized_stmt: sqlglot.Expression = optimizer.qualify.qualify(
75
+ statement,
76
+ dialect=dialect,
77
+ catalog=src_db,
78
+ db=src_schema,
79
+ qualify_columns=False,
80
+ validate_qualify_columns=False,
81
+ identify=False,
82
+ )
79
83
 
80
84
  orig_dest_table = Table(table="")
81
85
  dest_table = Table(table="")
@@ -0,0 +1,172 @@
1
+ import os
2
+ from collections import OrderedDict
3
+ from typing import Dict, List, Set, Tuple, Union
4
+
5
+ from sqlglot import ParseError
6
+
7
+ from quollio_core.helper.log_utils import error_handling_decorator, logger
8
+ from quollio_core.profilers.sqllineage import SQLLineage, Table
9
+ from quollio_core.repository import qdc
10
+ from quollio_core.repository import teradata as teradata_repo
11
+
12
+
13
@error_handling_decorator
def load_lineage(
    conn_config: teradata_repo.TeradataConfig,
    endpoint: str = None,
    tenant_id: str = None,
    qdc_client: qdc.QDCExternalAPIClient = None,
    page_size: int = None,
) -> None:
    """Extract table-level lineage from the Teradata query log and send it to QDC.

    Pages through DBC.QryLogSQLV (joined with DBC.QryLogV and DBC.DatabasesV) for
    CREATE TABLE / CREATE VIEW / INSERT / MERGE / UPDATE statements, extracts
    source and destination tables from each statement, and calls
    ``qdc_client.update_lineage_by_id`` once per destination table.

    Args:
        conn_config: Connection settings passed to ``new_teradata_client``.
        endpoint: Forwarded to ``SQLLineage.gen_lineage_input``.
        tenant_id: Forwarded to ``SQLLineage.gen_lineage_input``.
        qdc_client: Client used to push the lineage updates.
        page_size: Rows fetched per page. Falls back to the ``TERADATA_PAGE_SIZE``
            environment variable, then to 1000.
    """
    page_size = page_size or int(os.environ.get("TERADATA_PAGE_SIZE", 1000))
    offset = 0
    # Keyed by destination table so that a destination appearing on several pages
    # ends up with ONE entry holding the union of its sources. Without this, each
    # page would trigger its own update carrying only that page's sources.
    merged_lineage = {}

    with teradata_repo.new_teradata_client(conn_config) as conn:
        while True:
            query = f"""
            SELECT
                a.QueryID,
                TRIM(a.SqlTextInfo) AS SqlTextInfo,
                a.SqlRowNo,
                TRIM(d.DatabaseName) AS DefaultDatabase
            FROM DBC.QryLogSQLV a
            JOIN DBC.QryLogV b
            ON a.QueryID = b.QueryID
            JOIN DBC.DatabasesV d
            ON b.DefaultDatabase = d.DatabaseName
            WHERE
                UPPER(TRIM(SqlTextInfo)) LIKE 'CREATE TABLE%'
                OR UPPER(TRIM(SqlTextInfo)) LIKE 'CREATE VIEW%'
                OR UPPER(TRIM(SqlTextInfo)) LIKE 'INSERT%'
                OR UPPER(TRIM(SqlTextInfo)) LIKE 'MERGE%'
                OR UPPER(TRIM(SqlTextInfo)) LIKE 'UPDATE%'
            QUALIFY ROW_NUMBER() OVER (ORDER BY a.QueryID, a.SqlRowNo) > {offset}
                AND ROW_NUMBER() OVER (ORDER BY a.QueryID, a.SqlRowNo) <= {offset + page_size}
            """

            rows = teradata_repo.execute_query(query, conn)
            if not rows:
                break

            logger.info(f"Concatenating split queries for page {offset // page_size + 1}...")
            concatenated_queries = concatenate_split_queries(rows)

            logger.info("Processing SQL statements and extracting lineage...")
            for src_tables, dest_table in process_sql_statements(concatenated_queries):
                if dest_table in merged_lineage:
                    merged_lineage[dest_table] = merged_lineage[dest_table].union(src_tables)
                else:
                    merged_lineage[dest_table] = src_tables

            # A short page means the log is exhausted.
            if len(rows) < page_size:
                break

            offset += page_size

    all_lineage_results = [(src_tables, dest_table) for dest_table, src_tables in merged_lineage.items()]

    logger.info(f"Lineage extraction complete. Found {len(all_lineage_results)} unique entries.")
    for src_tables, dest_table in all_lineage_results:
        logger.debug(f"Destination table: {dest_table}")
        # Sources are a set (see process_sql_statements); iterate it directly.
        logger.debug("Source tables:")
        for src_table in src_tables:
            logger.debug(f"  - {src_table}")
        logger.debug("---")

    sql_lineage = SQLLineage()
    update_table_lineage_inputs = [
        sql_lineage.gen_lineage_input(
            tenant_id=tenant_id, endpoint=endpoint, src_tables=src_tables, dest_table=dest_table
        )
        for src_tables, dest_table in all_lineage_results
    ]

    table_req_count = 0
    logger.info(f"Starting to update lineage information for {len(update_table_lineage_inputs)} tables.")
    for update_table_lineage_input in update_table_lineage_inputs:
        logger.info(
            f"Generating table lineage. downstream: {update_table_lineage_input.downstream_database_name}"
            f" -> {update_table_lineage_input.downstream_table_name}"
        )
        try:
            status_code = qdc_client.update_lineage_by_id(
                global_id=update_table_lineage_input.downstream_global_id,
                payload=update_table_lineage_input.upstreams.as_dict(),
            )
            if status_code == 200:
                table_req_count += 1
            else:
                logger.error(
                    f"Failed to update lineage for {update_table_lineage_input.downstream_table_name}. "
                    f"Status code: {status_code}"
                )
        except Exception as e:
            # Best effort: one failing table must not stop the remaining updates.
            logger.error(
                f"Exception occurred while updating lineage for {update_table_lineage_input.downstream_table_name}: {e}"
            )
    logger.info(f"Generating table lineage is finished. {table_req_count} lineages are ingested.")
112
+
113
+
114
@error_handling_decorator
def extract_lineage(sql_statement: str, default_database: str = None) -> Tuple[Set[Table], Table]:
    """Parse one SQL statement (Teradata dialect) into source tables and a destination table.

    Each parsed table is rebuilt with ``db`` set to its parsed ``db_schema`` (or
    ``default_database`` when that is empty) and with an empty ``db_schema``.

    Args:
        sql_statement: SQL text to analyse.
        default_database: Database name used when a table has no ``db_schema``.

    Returns:
        ``(source_tables, dest_table)``. When parsing fails the error is logged
        and ``(set(), Table(db="", table=""))`` is returned instead.
    """
    try:
        logger.debug(f"Parsing SQL: {sql_statement}")
        parsed_sources, parsed_dest = SQLLineage().get_table_level_lineage_source(
            sql=sql_statement, dialect="teradata"
        )

        resolved_sources = set()
        for tbl in parsed_sources:
            resolved_sources.add(Table(db=tbl.db_schema or default_database, db_schema="", table=tbl.table))
        resolved_dest = Table(
            db=parsed_dest.db_schema or default_database, db_schema="", table=parsed_dest.table
        )

        return resolved_sources, resolved_dest
    except ParseError as err:
        logger.error(f"Error parsing SQL: {err}")
    except AttributeError as err:
        logger.error(f"Attribute error while extracting lineage: {err}")
    except Exception as err:
        logger.error(f"Unexpected error while extracting lineage: {err}")

    # Only reached after one of the handlers above has logged the failure.
    logger.debug(f"Problematic SQL: {sql_statement}")
    return set(), Table(db="", table="")
135
+
136
+
137
@error_handling_decorator
def process_sql_statements(queries: List[Union[str, Dict[str, Union[str, int]]]]) -> List[Tuple[Set[Table], Table]]:
    """Extract lineage for every query and merge entries that share a destination table.

    Args:
        queries: Either raw SQL strings, or dicts carrying ``"SqlTextInfo"`` and
            optionally ``"DefaultDatabase"``.

    Returns:
        ``(source_tables, dest_table)`` tuples in first-seen destination order.
        Statements without a destination table name or without sources are dropped.
    """
    merged = OrderedDict()
    for entry in queries:
        if isinstance(entry, str):
            statement, fallback_db = entry, None
        else:
            statement, fallback_db = entry["SqlTextInfo"], entry.get("DefaultDatabase")

        sources, target = extract_lineage(statement, fallback_db)
        if not (target.table and sources):
            continue

        if target in merged:
            logger.info(f"Merging duplicate entry for {target}")
            # Same destination seen again: keep the union of all its sources.
            merged[target] = merged[target].union(sources)
        else:
            merged[target] = sources

    return [(sources, target) for target, sources in merged.items()]
157
+
158
+
159
def concatenate_split_queries(rows: List[Dict[str, Union[str, int]]]) -> List[Dict[str, Union[str, int]]]:
    """Reassemble SQL statements that the query log stored as several rows.

    Rows are grouped by ``"QueryID"`` and their ``"SqlTextInfo"`` fragments are
    joined in ascending ``"SqlRowNo"`` order. The fetching query has no ORDER BY,
    so arrival order alone is not relied upon. Rows without a ``"SqlRowNo"``
    keep their arrival order (the sort is stable).

    Args:
        rows: Result rows with ``"QueryID"``, ``"SqlTextInfo"``,
            ``"DefaultDatabase"`` and, normally, ``"SqlRowNo"``.

    Returns:
        One dict per query, in first-seen order, with the joined ``"SqlTextInfo"``
        and the ``"DefaultDatabase"`` of the first row seen for that query.
    """
    queries = {}
    for row in rows:
        entry = queries.setdefault(
            row["QueryID"], {"fragments": [], "DefaultDatabase": row["DefaultDatabase"]}
        )
        # A missing or NULL row number sorts as 0 so it never breaks the comparison.
        entry["fragments"].append((row.get("SqlRowNo") or 0, row["SqlTextInfo"]))

    return [
        {
            "SqlTextInfo": "".join(text for _, text in sorted(entry["fragments"], key=lambda frag: frag[0])),
            "DefaultDatabase": entry["DefaultDatabase"],
        }
        for entry in queries.values()
    ]
@@ -0,0 +1,218 @@
1
+ from typing import Any, Dict, List, Optional
2
+
3
+ from quollio_core.helper.log_utils import error_handling_decorator, logger
4
+ from quollio_core.profilers.stats import gen_table_stats_payload
5
+ from quollio_core.repository import qdc
6
+ from quollio_core.repository import teradata as teradata_repo
7
+
8
NUMERIC_TYPES = ["D", "F", "I1", "I2", "I8", "I", "N"]

# Type codes treated as numeric when deciding which statistics to compute.
# I, I1, I2, I8 - integer types: INTEGER, BYTEINT, SMALLINT, BIGINT
# F - FLOAT
# D - DECIMAL
# N - NUMBER
14
+
15
+
16
def quote_identifier(identifier: str) -> str:
    """Wrap ``identifier`` in double quotes for use as a SQL object name.

    Embedded double quotes are doubled so that a name containing ``"`` cannot
    terminate the quoted identifier early.
    """
    escaped = identifier.replace('"', '""')
    return f'"{escaped}"'
18
+
19
+
20
@error_handling_decorator
def load_stats(
    conn_config: teradata_repo.TeradataConfig,
    sample_percent: Optional[float] = None,
    endpoint: Optional[str] = None,
    tenant_id: Optional[str] = None,
    qdc_client: Optional[qdc.QDCExternalAPIClient] = None,
    target_databases: Optional[List[str]] = None,
    target_databases_method: str = "DENYLIST",
    stats_items: Optional[List[str]] = None,
) -> None:
    """Collect per-column statistics from Teradata and send them to QDC.

    For every table returned by ``teradata_repo.get_table_list`` and every column
    returned by ``teradata_repo.get_column_list``, a statistics query is generated
    and executed. The parsed results are turned into payloads with
    ``gen_table_stats_payload`` and pushed through ``qdc_client.update_stats_by_id``.

    Args:
        conn_config: Connection settings passed to ``new_teradata_client``.
        sample_percent: Sampling percentage; only forwarded for numeric columns.
        endpoint: Forwarded to ``gen_table_stats_payload``.
        tenant_id: Forwarded to ``gen_table_stats_payload``.
        qdc_client: Client used to push the statistics.
        target_databases: Database names forwarded to ``get_table_list``.
        target_databases_method: Forwarded to ``get_table_list`` (default ``"DENYLIST"``).
        stats_items: Names of the statistics to compute.
    """
    stats_list = []
    numerical_columns = 0
    non_numerical_columns = 0
    logger.info(
        f"Starting statistics collection. " f"Sample percent: {sample_percent if sample_percent is not None else 'N/A'}"
    )

    with teradata_repo.new_teradata_client(conn_config) as conn:
        try:
            tables = teradata_repo.get_table_list(conn, target_databases, target_databases_method)
            for table in tables:
                logger.debug(f"Processing table: {table}")
                database_name = table["DataBaseName"]
                table_name = table["TableName"]

                logger.info(f"Processing table {database_name}.{table_name}")
                columns = teradata_repo.get_column_list(conn, database_name=database_name, table_name=table_name)
                logger.debug(f"Columns: {columns}")

                for column in columns:
                    column_name = column["ColumnName"]
                    column_type = column["ColumnType"]
                    # ColumnType may be NULL; normalise to "" so the membership test below is safe.
                    if column_type is None:
                        column_type = ""
                    else:
                        column_type = column_type.strip()

                    is_numerical = column_type in NUMERIC_TYPES
                    if is_numerical:
                        numerical_columns += 1
                    else:
                        non_numerical_columns += 1

                    stats_sql = generate_column_statistics_sql(
                        database_name,
                        table_name,
                        column_name,
                        column_type,
                        sample_percent if is_numerical else None,
                        stats_items,
                    )
                    logger.debug(f"Generated SQL for column {column_name}: {stats_sql}")

                    # No statistics apply to this column: the generator returns an empty
                    # string, and sending that to the database would only produce an error.
                    if not stats_sql:
                        continue

                    try:
                        result = teradata_repo.execute_query(stats_sql, conn)
                        logger.debug(f"Query result for column {column_name}: {result}")
                        if result:
                            column_stats = parse_column_statistics_result(
                                result[0], database_name, table_name, column_name, stats_items, is_numerical
                            )
                            stats_list.append(column_stats)
                    except Exception as e:
                        # Best effort: one failing column must not stop the remaining ones.
                        logger.error(
                            f"Failed to collect statistics for {database_name}.{table_name}.{column_name}: {e}"
                        )

        except Exception as e:
            logger.error(f"Error during statistics collection: {e}")
        else:
            # Only report success when the collection loop was not aborted by an error.
            logger.info("Statistics collection completed successfully.")

    logger.debug(f"Stats list: {stats_list}")
    payloads = gen_table_stats_payload(stats=stats_list, tenant_id=tenant_id, endpoint=endpoint)
    logger.debug(f"Generated payloads: {payloads}")

    req_count = 0
    for payload in payloads:
        logger.info(f"Generating table stats. asset: {payload.db} -> {payload.table} -> {payload.column}")
        status_code = qdc_client.update_stats_by_id(
            global_id=payload.global_id,
            payload=payload.body.get_column_stats(),
        )
        if status_code == 200:
            req_count += 1

    logger.info(
        f"Loading statistics is finished. {req_count} statistics are ingested. "
        f"Numerical columns: {numerical_columns}, Non-numerical columns: {non_numerical_columns}"
    )
110
+
111
+
112
@error_handling_decorator
def parse_column_statistics_result(
    result: Dict[str, Any],
    database_name: str,
    table_name: str,
    column_name: str,
    stats_items: Optional[List[str]] = None,
    is_numerical: bool = False,
) -> Dict[str, Any]:
    """Convert one statistics result row into the upper-case stats record.

    Args:
        result: Result row keyed by the query's column aliases (``num_uniques``,
            ``num_nulls``, ``min_value``, ...).
        database_name: Stored under ``"DB_NAME"``.
        table_name: Stored under ``"TABLE_NAME"``.
        column_name: Stored under ``"COLUMN_NAME"``.
        stats_items: Requested statistic names; only these are copied.
        is_numerical: When True, min/max/median/mean/stddev/mode are also copied.

    Returns:
        Dict with the identifying keys (``"SCHEMA_NAME"`` is always ``""``) plus
        every requested statistic that is present in ``result``.
    """
    # requested item -> (key in the result row, key in the output record)
    general_items = {
        "cardinality": ("num_uniques", "CARDINALITY"),
        "number_of_null": ("num_nulls", "NULL_COUNT"),
    }
    numeric_items = {
        "min": ("min_value", "MIN_VALUE"),
        "max": ("max_value", "MAX_VALUE"),
        "median": ("median_value", "MEDIAN_VALUE"),
        "mean": ("avg_value", "AVG_VALUE"),
        "stddev": ("stddev_value", "STDDEV_VALUE"),
        "mode": ("mode_value", "MODE_VALUE"),
    }

    parsed = {
        "DB_NAME": database_name,
        "SCHEMA_NAME": "",
        "TABLE_NAME": table_name,
        "COLUMN_NAME": column_name,
    }

    for requested in stats_items or []:
        if requested in general_items:
            source_key, target_key = general_items[requested]
            if source_key in result:
                # Counts are stored as returned by the query.
                parsed[target_key] = result[source_key]

        if is_numerical and requested in numeric_items:
            source_key, target_key = numeric_items[requested]
            if source_key in result:
                # Numeric statistics are stored as strings.
                parsed[target_key] = str(result[source_key])

    return parsed
150
+
151
+
152
@error_handling_decorator
def generate_column_statistics_sql(
    database_name: str,
    table_name: str,
    column_name: str,
    column_type: str,
    sample_percent: Optional[float] = None,
    stats_items: Optional[List[str]] = None,
) -> str:
    """Build the SQL statement that computes the requested statistics for one column.

    Args:
        database_name: Database that owns the table.
        table_name: Table name; a value containing ``.`` is split once and both parts are quoted.
        column_name: Column to profile.
        column_type: Column type code; numeric statistics are only added when it is in NUMERIC_TYPES.
        sample_percent: When set and ``0 < sample_percent <= 99``, a ``SAMPLE`` clause with
            ``sample_percent / 100`` is appended.
        stats_items: Names of the statistics to compute.

    Returns:
        The SQL text, or an empty string when no statistic applies to the column.
    """
    quoted_column = quote_identifier(column_name)
    quoted_database = quote_identifier(database_name)

    # Handle the case where table_name might include a database
    if "." in table_name:
        schema, table = table_name.split(".", 1)
        quoted_table = f"{quote_identifier(schema)}.{quote_identifier(table)}"
    else:
        quoted_table = quote_identifier(table_name)

    stats_clauses = []
    # Optional "WITH MODE_VALUE AS (...)" prefix; stays empty unless "mode" is requested.
    mode_query = ""

    if stats_items:
        # Cardinality and null count are available for every column type.
        if "cardinality" in stats_items:
            stats_clauses.append(f"COUNT(DISTINCT {quoted_column}) AS num_uniques")
        if "number_of_null" in stats_items:
            stats_clauses.append(f"SUM(CASE WHEN {quoted_column} IS NULL THEN 1 ELSE 0 END) AS num_nulls")

        # The remaining statistics cast the column to FLOAT, so they are limited to numeric types.
        if column_type in NUMERIC_TYPES:
            if "min" in stats_items:
                stats_clauses.append(f"MIN(CAST({quoted_column} AS FLOAT)) AS min_value")
            if "max" in stats_items:
                stats_clauses.append(f"MAX(CAST({quoted_column} AS FLOAT)) AS max_value")
            if "median" in stats_items:
                stats_clauses.append(f"MEDIAN(CAST({quoted_column} AS FLOAT)) AS median_value")
            if "mean" in stats_items:
                stats_clauses.append(f"AVG(CAST({quoted_column} AS FLOAT)) AS avg_value")
            if "stddev" in stats_items:
                stats_clauses.append(f"STDDEV_SAMP(CAST({quoted_column} AS FLOAT)) AS stddev_value")
            if "mode" in stats_items:
                # The mode is the most frequent value: group by value, keep the top-ranked row.
                mode_query = (
                    f"WITH MODE_VALUE AS ("
                    f" SELECT {quoted_column}, COUNT(*) as freq "
                    f" FROM {quoted_database}.{quoted_table} "
                )

                if sample_percent is not None and 0 < sample_percent <= 99:
                    sample_fraction = sample_percent / 100
                    # NOTE(review): SAMPLE is emitted before GROUP BY here — confirm Teradata accepts this order.
                    mode_query += f" SAMPLE {sample_fraction} "

                mode_query += (
                    f" GROUP BY {quoted_column} " f" QUALIFY ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC) = 1" f") "
                )
                stats_clauses.append(f"(SELECT {quoted_column} FROM MODE_VALUE) AS mode_value")

    if not stats_clauses:
        logger.warning(f"No statistics selected for column {column_name}. Skipping this column.")
        return ""

    query = f"{mode_query}" f"SELECT {', '.join(stats_clauses)} " f"FROM {quoted_database}.{quoted_table}"

    if sample_percent is not None and 0 < sample_percent <= 99:
        sample_fraction = sample_percent / 100
        query += f" SAMPLE {sample_fraction}"

    logger.debug(f"Generated SQL query for {quoted_database}.{quoted_table}.{quoted_column}: {query}")
    return query
@@ -25,9 +25,6 @@ class QDCExternalAPIClient:
25
25
  Tried to find a package for oauth0 client credentials flow,
26
26
  but any of them contains bugs or lacks of features to handle the token refresh when it's expired
27
27
  """
28
- is_domain_valid = is_valid_domain(domain=self.base_url)
29
- if not is_domain_valid:
30
- raise ValueError("The format of quollio API URL is invalid. The URL must end with `.com`")
31
28
 
32
29
  url = f"{self.base_url}/oauth2/token"
33
30
  creds = f"{self.client_id}:{self.client_secret}"
@@ -108,7 +105,3 @@ class QDCExternalAPIClient:
108
105
 
109
106
  def initialize_qdc_client(api_url: str, client_id: str, client_secret: str) -> QDCExternalAPIClient:
110
107
  return QDCExternalAPIClient(base_url=api_url, client_id=client_id, client_secret=client_secret)
111
-
112
-
113
- def is_valid_domain(domain: str) -> bool:
114
- return domain.endswith(".com")
@@ -0,0 +1,59 @@
1
+ import logging
2
+ import os
3
+ from typing import Tuple
4
+
5
+ import boto3
6
+ from botocore.exceptions import ClientError
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
def get_parameter_by_assume_role(key: str, region: str = "ap-northeast-1") -> Tuple[str, Exception]:
    """Read an SSM parameter using credentials obtained by assuming a cross-account role.

    The role ARN is built from the ``QDC_ACCOUNT_ID`` and ``TENANT_ID`` environment
    variables; the STS and SSM endpoints are built from ``QDC_REGION``.

    Args:
        key: Name of the SSM parameter to read (decryption is requested).
        region: Region used for the boto3 session. The STS/SSM endpoint URLs use
            ``QDC_REGION`` instead.

    Returns:
        ``(value, None)`` on success; ``("", exception)`` when an environment
        variable fails validation or ``ssm.get_parameter`` raises ``ClientError``.
    """
    tenant_id = os.getenv("TENANT_ID")
    if not _is_str_valid(tenant_id):
        return ("", Exception("TENANT_ID is not set in get_parameter_by_assume_role."))
    qdc_account_id = os.getenv("QDC_ACCOUNT_ID")
    # The same "is not set" error is returned when the value is present but is not 12 digits.
    if not _is_valid_aws_account_id(qdc_account_id):
        return ("", Exception("QDC_ACCOUNT_ID is not set in get_parameter_by_assume_role."))
    qdc_region = os.getenv("QDC_REGION")
    if not _is_str_valid(qdc_region):
        return ("", Exception("QDC_REGION is not set in get_parameter_by_assume_role."))

    sts_assume_role_arn = "arn:aws:iam::{account_id}:role/qdc-{tenant_id}-cross-account-access".format(
        account_id=qdc_account_id, tenant_id=tenant_id
    )

    session = boto3.Session(region_name=region)
    sts = session.client("sts", endpoint_url="https://sts.{region}.amazonaws.com".format(region=qdc_region))
    # NOTE(review): assume_role runs outside the try block below, so its errors propagate
    # to the caller instead of being returned in the tuple — confirm this is intended.
    assumed_role_object = sts.assume_role(
        RoleArn=sts_assume_role_arn,
        RoleSessionName="AssumeRoleSession",
    )
    credentials = assumed_role_object["Credentials"]

    try:
        ssm = session.client(
            "ssm",
            endpoint_url="https://ssm.{region}.amazonaws.com".format(region=qdc_region),
            aws_access_key_id=credentials["AccessKeyId"],
            aws_secret_access_key=credentials["SecretAccessKey"],
            aws_session_token=credentials["SessionToken"],
        )
        res = ssm.get_parameter(Name=key, WithDecryption=True)
        return (res["Parameter"]["Value"], None)
    except ClientError as e:
        logger.error(
            "Failed to run ssm.get_parameter().\
            Please check the value stored in parameter store is correct. error: {err}".format(
                err=e
            )
        )
        return ("", e)
52
+
53
+
54
+ def _is_valid_aws_account_id(s: str) -> bool:
55
+ return s is not None and len(s) == 12 and s.isdigit()
56
+
57
+
58
+ def _is_str_valid(s: str) -> bool:
59
+ return s is not None and s != ""
@@ -0,0 +1,103 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import Any, Dict, List, Optional
3
+
4
+ import teradatasql
5
+
6
+ from quollio_core.helper.log_utils import error_handling_decorator, logger
7
+
8
+
9
@dataclass
class TeradataConfig:
    """Connection settings for a Teradata system."""

    host: str
    port: int
    username: str
    password: str
    # Database selected on connect.
    database: str = "DBC"
    # Sent to the driver as the lower-cased string "true"/"false".
    encrypt_data: bool = True
    # Extra driver parameters; merged last, so they override the values above.
    additional_params: Dict[str, Any] = field(default_factory=dict)

    @classmethod
    def from_dict(
        cls, credentials: Dict[str, str], host: str, port: str, additional_params: Dict[str, Any] = None
    ) -> "TeradataConfig":
        """Build a config from a credentials dict holding ``"username"`` and ``"password"``.

        ``port`` is converted with ``int()``; a falsy ``additional_params`` becomes ``{}``.
        """
        extra_params = additional_params if additional_params else {}
        return cls(
            host=host,
            port=int(port),
            username=credentials["username"],
            password=credentials["password"],
            additional_params=extra_params,
        )

    def get_connection_params(self) -> Dict[str, Any]:
        """Return the keyword arguments used to open the connection."""
        connection_params = dict(
            host=self.host,
            user=self.username,
            password=self.password,
            database=self.database,
            dbs_port=self.port,
            encryptdata=str(self.encrypt_data).lower(),
        )
        connection_params.update(self.additional_params)
        return connection_params
42
+
43
+
44
@error_handling_decorator
def new_teradata_client(config: TeradataConfig) -> teradatasql.connect:
    """Open a Teradata connection using the parameters from ``config``."""
    return teradatasql.connect(**config.get_connection_params())
48
+
49
+
50
@error_handling_decorator
def get_table_list(
    conn: teradatasql.connect, target_databases: Optional[List[str]] = None, target_databases_method: str = "DENYLIST"
) -> List[Dict[str, str]]:
    """List tables from DBC.TablesV, filtered by database name.

    Args:
        conn: Open Teradata connection.
        target_databases: Database names to exclude or include. ``None`` is
            treated like an empty list.
        target_databases_method: ``"DENYLIST"`` excludes ``target_databases``;
            any other value includes only ``target_databases``.

    Returns:
        Rows with the database name and table name of every matching object of
        kind ``'T'``, ``'O'`` or ``'Q'``. An empty include-list matches nothing
        and returns ``[]`` without querying.
    """
    databases = list(target_databases or [])
    is_denylist = target_databases_method == "DENYLIST"

    if databases:
        operator = "NOT" if is_denylist else ""
        # Double embedded single quotes so a name cannot break out of its SQL literal.
        quoted_names = ",".join("'" + db.replace("'", "''") + "'" for db in databases)
        database_filter = f"AND DatabaseName {operator} IN ({quoted_names})"
    elif is_denylist:
        # Nothing to exclude: `NOT IN ()` is invalid SQL, so drop the filter entirely.
        database_filter = ""
    else:
        # NOTE(review): an empty include-list is interpreted as "no databases selected".
        logger.debug("No target databases given for a non-DENYLIST method; returning no tables.")
        return []

    query_tables = f"""
        SELECT DatabaseName, TableName
        FROM DBC.TablesV
        WHERE TableKind IN ('T', 'O', 'Q')
        {database_filter}
    """
    logger.debug("Executing query to retrieve table names.")
    tables = execute_query(query_tables, conn)
    return tables
68
+
69
+
70
@error_handling_decorator
def get_column_list(conn: teradatasql.connect, database_name: str, table_name: str) -> List[Dict[str, str]]:
    """Return the ``ColumnName`` / ``ColumnType`` rows of one table from DBC.ColumnsV.

    Args:
        conn: Open Teradata connection.
        database_name: Database that owns the table.
        table_name: Table whose columns are listed.

    Returns:
        One dict per column, as produced by ``execute_query``.
    """
    # Double embedded single quotes so an object name containing `'` cannot
    # terminate the SQL string literal it is interpolated into.
    database_literal = database_name.replace("'", "''")
    table_literal = table_name.replace("'", "''")

    query_columns = f"""
        SELECT ColumnName, ColumnType
        FROM DBC.ColumnsV
        WHERE DatabaseName = '{database_literal}'
        AND TableName = '{table_literal}'
    """
    logger.debug(f"Executing query to retrieve columns for {database_name}.{table_name}.")
    columns = execute_query(query_columns, conn)
    logger.debug(f"Retrieved columns: {columns}")
    return columns
82
+
83
+
84
@error_handling_decorator
def execute_query(query: str, con: teradatasql.connect) -> List[Dict[str, Any]]:
    """Run ``query`` on ``con`` and return every row as a dict keyed by column name.

    Args:
        query: SQL text to execute.
        con: Open Teradata connection.

    Returns:
        One dict per fetched row, keyed by the first element of each entry in
        the cursor description.

    Raises:
        Any exception raised while executing or fetching; it is logged and re-raised.
    """
    try:
        with con.cursor() as cursor:
            logger.debug(f"Executing SQL query: {query}")
            cursor.execute(query)
            logger.debug(f"Column descriptions: {cursor.description}")
            names = [meta[0] for meta in cursor.description]
            records = []
            for raw_row in cursor.fetchall():
                records.append(dict(zip(names, raw_row)))
            logger.debug(f"Fetched {len(records)} rows from Teradata.")
            return records
    except teradatasql.OperationalError as err:
        logger.error(f"Teradata Operational Error: {err}")
        raise
    except teradatasql.ProgrammingError as err:
        logger.error(f"Teradata Programming Error: {err}")
        raise
    except Exception as err:
        logger.error(f"Unexpected error fetching data from Teradata: {err}")
        raise