quollio-core 0.4.12__tar.gz → 0.4.13__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. {quollio_core-0.4.12 → quollio_core-0.4.13}/PKG-INFO +2 -1
  2. {quollio_core-0.4.12 → quollio_core-0.4.13}/pyproject.toml +1 -0
  3. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/__init__.py +1 -1
  4. quollio_core-0.4.13/quollio_core/bigquery.py +183 -0
  5. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/bricks.py +15 -3
  6. quollio_core-0.4.13/quollio_core/helper/log_utils.py +48 -0
  7. quollio_core-0.4.13/quollio_core/profilers/bigquery.py +145 -0
  8. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/profilers/databricks.py +44 -39
  9. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/profilers/redshift.py +13 -22
  10. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/profilers/snowflake.py +7 -21
  11. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/profilers/stats.py +78 -17
  12. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/redshift.py +22 -2
  13. quollio_core-0.4.13/quollio_core/repository/bigquery.py +94 -0
  14. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/repository/qdc.py +4 -0
  15. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/snowflake.py +22 -3
  16. quollio_core-0.4.12/quollio_core/bigquery.py +0 -123
  17. quollio_core-0.4.12/quollio_core/profilers/bigquery.py +0 -81
  18. quollio_core-0.4.12/quollio_core/repository/bigquery.py +0 -61
  19. {quollio_core-0.4.12 → quollio_core-0.4.13}/LICENSE +0 -0
  20. {quollio_core-0.4.12 → quollio_core-0.4.13}/README.md +0 -0
  21. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/databricks/.gitignore +0 -0
  22. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/databricks/README.md +0 -0
  23. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/databricks/analyses/.gitkeep +0 -0
  24. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/databricks/dbt_project.yml +0 -0
  25. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/databricks/macros/.gitkeep +0 -0
  26. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/databricks/models/quollio_lineage_column_level.sql +0 -0
  27. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/databricks/models/quollio_lineage_column_level.yml +0 -0
  28. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/databricks/models/quollio_lineage_table_level.sql +0 -0
  29. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/databricks/models/quollio_lineage_table_level.yml +0 -0
  30. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/databricks/models/sources.yml +0 -0
  31. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/databricks/package-lock.yml +0 -0
  32. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/databricks/packages.yml +0 -0
  33. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/databricks/profiles/profiles_template.yml +0 -0
  34. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/databricks/seeds/.gitkeep +0 -0
  35. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/databricks/snapshots/.gitkeep +0 -0
  36. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/redshift/README.md +0 -0
  37. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/redshift/analyses/.gitkeep +0 -0
  38. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/redshift/dbt_project.yml +0 -0
  39. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/redshift/macros/.gitkeep +0 -0
  40. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/redshift/macros/materialization/divided_view.sql +0 -0
  41. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/redshift/models/quollio_lineage_table_level.sql +0 -0
  42. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/redshift/models/quollio_lineage_table_level.yml +0 -0
  43. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/redshift/models/quollio_lineage_view_level.sql +0 -0
  44. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/redshift/models/quollio_lineage_view_level.yml +0 -0
  45. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/redshift/models/quollio_sqllineage_sources.sql +0 -0
  46. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/redshift/models/quollio_sqllineage_sources.yml +0 -0
  47. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/redshift/models/quollio_stats_columns.sql +0 -0
  48. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/redshift/models/quollio_stats_columns.yml +0 -0
  49. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/redshift/models/quollio_stats_profiling_columns.sql +0 -0
  50. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/redshift/models/quollio_stats_profiling_columns.yml +0 -0
  51. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/redshift/models/sources.yml +0 -0
  52. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/redshift/package-lock.yml +0 -0
  53. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/redshift/packages.yml +0 -0
  54. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/redshift/profiles/profiles_template.yml +0 -0
  55. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/redshift/seeds/.gitkeep +0 -0
  56. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/redshift/snapshots/.gitkeep +0 -0
  57. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/seeds/.gitkeep +0 -0
  58. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/snowflake/README.md +0 -0
  59. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/snowflake/analyses/.gitkeep +0 -0
  60. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/snowflake/dbt_project.yml +0 -0
  61. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/snowflake/macros/.gitkeep +0 -0
  62. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/snowflake/macros/materialization/divided_view.sql +0 -0
  63. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/snowflake/models/quollio_lineage_column_level.sql +0 -0
  64. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/snowflake/models/quollio_lineage_column_level.yml +0 -0
  65. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/snowflake/models/quollio_lineage_table_level.sql +0 -0
  66. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/snowflake/models/quollio_lineage_table_level.yml +0 -0
  67. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/snowflake/models/quollio_sqllineage_sources.sql +0 -0
  68. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/snowflake/models/quollio_sqllineage_sources.yml +0 -0
  69. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/snowflake/models/quollio_stats_columns.sql +0 -0
  70. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/snowflake/models/quollio_stats_columns.yml +0 -0
  71. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/snowflake/models/quollio_stats_profiling_columns.sql +0 -0
  72. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/snowflake/models/quollio_stats_profiling_columns.yml +0 -0
  73. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/snowflake/models/sources.yml +0 -0
  74. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/snowflake/package-lock.yml +0 -0
  75. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/snowflake/packages.yml +0 -0
  76. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/snowflake/profiles/profiles_template.yml +0 -0
  77. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/snowflake/seeds/.gitkeep +0 -0
  78. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/dbt_projects/snowflake/snapshots/.gitkeep +0 -0
  79. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/helper/__init__.py +0 -0
  80. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/helper/core.py +0 -0
  81. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/helper/env_default.py +0 -0
  82. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/helper/log.py +0 -0
  83. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/profilers/__init__.py +0 -0
  84. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/profilers/lineage.py +0 -0
  85. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/profilers/sqllineage.py +0 -0
  86. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/repository/__init__.py +0 -0
  87. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/repository/databricks.py +0 -0
  88. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/repository/dbt.py +0 -0
  89. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/repository/redshift.py +0 -0
  90. {quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/repository/snowflake.py +0 -0
{quollio_core-0.4.12 → quollio_core-0.4.13}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: quollio-core
-Version: 0.4.12
+Version: 0.4.13
 Summary: Quollio Core
 Author-email: quollio-dev <qt.dev@quollio.com>
 Maintainer-email: RyoAriyama <ryo.arym@gmail.com>, tharuta <35373297+TakumiHaruta@users.noreply.github.com>
@@ -22,6 +22,7 @@ Requires-Dist: dbt-core==1.7.10
 Requires-Dist: dbt-snowflake==1.7.0
 Requires-Dist: dbt-redshift==1.7.1
 Requires-Dist: dbt-databricks==1.7.1
+Requires-Dist: db-dtypes==1.2.0
 Requires-Dist: jinja2==3.1.3
 Requires-Dist: PyYAML==6.0.1
 Requires-Dist: requests==2.31.0
{quollio_core-0.4.12 → quollio_core-0.4.13}/pyproject.toml
@@ -34,6 +34,7 @@ dependencies = [
     ,"dbt-snowflake==1.7.0"
     ,"dbt-redshift==1.7.1"
     ,"dbt-databricks==1.7.1"
+    ,"db-dtypes==1.2.0"
     ,"jinja2==3.1.3"
     ,"PyYAML==6.0.1"
     ,"requests==2.31.0"
{quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/__init__.py
@@ -1,4 +1,4 @@
 """Quollio Core"""
 
-__version__ = "0.4.12"
+__version__ = "0.4.13"
 __author__ = "Quollio Technologies, Inc"
quollio_core-0.4.13/quollio_core/bigquery.py
@@ -0,0 +1,183 @@
+import argparse
+import json
+
+from google.auth.credentials import Credentials
+
+from quollio_core.helper.env_default import env_default
+from quollio_core.helper.log_utils import configure_logging, error_handling_decorator, logger
+from quollio_core.profilers.bigquery import bigquery_table_lineage, bigquery_table_stats
+from quollio_core.repository import qdc
+from quollio_core.repository.bigquery import BigQueryClient, get_credentials, get_org_id
+
+
+def initialize_credentials(credentials_json: str) -> Credentials:
+    return get_credentials(json.loads(credentials_json))
+
+
+def initialize_org_id(credentials_json: str) -> str:
+    return get_org_id(json.loads(credentials_json))
+
+
+def initialize_bq_client(credentials: Credentials, project_id: str) -> BigQueryClient:
+    return BigQueryClient(credentials=credentials, project_id=project_id)
+
+
+@error_handling_decorator
+def load_lineage(
+    tenant_id: str,
+    project_id: str,
+    regions: list,
+    org_id: str,
+    credentials: Credentials,
+    qdc_client: qdc.QDCExternalAPIClient,
+) -> None:
+    logger.info("Loading lineage data.")
+    bigquery_table_lineage(
+        qdc_client=qdc_client,
+        tenant_id=tenant_id,
+        project_id=project_id,
+        regions=regions,
+        credentials=credentials,
+        org_id=org_id,
+    )
+    logger.info("Lineage data loaded successfully.")
+
+
+@error_handling_decorator
+def load_stats(
+    conn: BigQueryClient,
+    tenant_id: str,
+    org_id: str,
+    qdc_client: qdc.QDCExternalAPIClient,
+    dataplex_stats_tables: list,
+) -> None:
+    logger.info("Loading statistics data.")
+    bigquery_table_stats(
+        bq_client=conn,
+        qdc_client=qdc_client,
+        tenant_id=tenant_id,
+        org_id=org_id,
+        dataplex_stats_tables=dataplex_stats_tables,
+    )
+    logger.info("Statistics data loaded successfully.")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        prog="Quollio Intelligence Agent for BigQuery",
+        description="Load lineage and stats to Quollio from BigQuery using Dataplex and BigQuery APIs",
+        epilog="Copyright (c) 2024 Quollio Technologies, Inc.",
+    )
+    parser.add_argument(
+        "commands",
+        choices=["load_lineage", "load_stats"],
+        type=str,
+        nargs="+",
+        help="""
+        The command to execute.
+        'load_lineage': Load lineage data from created views to Quollio,
+        'load_stats': Load stats from created views to Quollio,
+        """,
+    )
+    parser.add_argument(
+        "--log_level",
+        type=str,
+        choices=["debug", "info", "warn", "error", "none"],
+        action=env_default("LOG_LEVEL"),
+        default="info",
+        required=False,
+        help="The log level for dbt commands. Default value is info",
+    )
+    parser.add_argument(
+        "--tenant_id",
+        type=str,
+        action=env_default("TENANT_ID"),
+        required=False,
+        help="The tenant id (company id) where the lineage and stats are loaded",
+    )
+    parser.add_argument(
+        "--project_id",
+        type=str,
+        default=None,
+        required=False,
+        help="Project ID of the BigQuery project to load lineage and stats from (default is loaded from credentials)",
+    )
+    parser.add_argument(
+        "--regions",
+        type=str,
+        action=env_default("GCP_REGIONS"),
+        required=True,
+        help="Comma-separated list of regions BigQuery data is in",
+    )
+    parser.add_argument(
+        "--credentials_json",
+        type=str,
+        action=env_default("GOOGLE_APPLICATION_CREDENTIALS"),
+        required=True,
+        help="Credentials JSON",
+    )
+    parser.add_argument(
+        "--api_url",
+        type=str,
+        action=env_default("QDC_API_URL"),
+        required=False,
+        help="The base URL of Quollio External API",
+    )
+    parser.add_argument(
+        "--client_id",
+        type=str,
+        action=env_default("QDC_CLIENT_ID"),
+        required=False,
+        help="The client id that is created on Quollio console to let clients access Quollio External API",
+    )
+    parser.add_argument(
+        "--client_secret",
+        type=str,
+        action=env_default("QDC_CLIENT_SECRET"),
+        required=False,
+        help="The client secret that is created on Quollio console to let clients access Quollio External API",
+    )
+
+    parser.add_argument(
+        "--dataplex_stats_tables",
+        type=str,
+        action=env_default("DATAPLEX_STATS_TABLES"),
+        required=False,
+        help="Comma-separated list of dataplex stats tables - <project_id>.<dataset_id>.<table_id>",
+    )
+
+    args = parser.parse_args()
+
+    # Validate that dataplex_stats_tables is provided if load_stats is in commands
+    if "load_stats" in args.commands and not args.dataplex_stats_tables:
+        parser.error("--dataplex_stats_tables is required when 'load_stats' command is used")
+
+    configure_logging(args.log_level)
+
+    credentials = initialize_credentials(args.credentials_json)
+    org_id = initialize_org_id(args.credentials_json)
+    qdc_client = qdc.initialize_qdc_client(args.api_url, args.client_id, args.client_secret)
+    bq_client = initialize_bq_client(credentials, args.project_id)
+    if args.project_id is None:
+        args.project_id = json.loads(args.credentials_json)["project_id"]
+    regions = args.regions.split(",")
+
+    if "load_lineage" in args.commands:
+        load_lineage(
+            tenant_id=args.tenant_id,
+            project_id=args.project_id,
+            regions=regions,
+            org_id=org_id,
+            credentials=credentials,
+            qdc_client=qdc_client,
+        )
+
+    if "load_stats" in args.commands:
+        tables = args.dataplex_stats_tables.split(",")
+        load_stats(
+            conn=bq_client,
+            tenant_id=args.tenant_id,
+            org_id=org_id,
+            qdc_client=qdc_client,
+            dataplex_stats_tables=tables,
+        )
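
Note: the new quollio_core/bigquery.py above is a CLI entry point, but load_lineage and load_stats are plain functions. A minimal sketch of driving the same flow programmatically, assuming a service-account JSON on disk; the file path, API URL, client credentials, tenant id, regions, and table name below are placeholders, not values from the release:

    # Sketch only: placeholder credentials/IDs; all names come from the diff above.
    import json

    from quollio_core.bigquery import initialize_bq_client, load_lineage, load_stats
    from quollio_core.repository import qdc
    from quollio_core.repository.bigquery import get_credentials, get_org_id

    credentials_json = open("service_account.json").read()  # placeholder path
    credentials = get_credentials(json.loads(credentials_json))
    org_id = get_org_id(json.loads(credentials_json))
    project_id = json.loads(credentials_json)["project_id"]

    qdc_client = qdc.initialize_qdc_client("https://qdc.example.com", "my-client-id", "my-client-secret")

    load_lineage(
        tenant_id="my-tenant",              # placeholder
        project_id=project_id,
        regions=["us", "asia-northeast1"],  # placeholder regions
        org_id=org_id,
        credentials=credentials,
        qdc_client=qdc_client,
    )
    load_stats(
        conn=initialize_bq_client(credentials, project_id),
        tenant_id="my-tenant",
        org_id=org_id,
        qdc_client=qdc_client,
        dataplex_stats_tables=["my-project.my_dataset.dataplex_profile"],  # placeholder
    )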
{quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/bricks.py
@@ -10,6 +10,7 @@ from quollio_core.profilers.databricks import (
     databricks_column_stats,
     databricks_table_level_lineage,
 )
+from quollio_core.profilers.stats import get_column_stats_items
 from quollio_core.repository import databricks as db
 from quollio_core.repository import dbt, qdc
 
@@ -21,7 +22,6 @@ def build_view(
     target_tables: str = "",
     log_level: str = "info",
 ) -> None:
-
     logger.info("Build profiler views using dbt")
     # set parameters
     dbt_client = dbt.DBTClient()
@@ -64,7+64,6 @@ def load_lineage(
     tenant_id: str,
     enable_column_lineage: bool = False,
 ) -> None:
-
     logger.info("Generate Databricks table to table lineage.")
     databricks_table_level_lineage(
         conn=conn,
@@ -98,7 +97,6 @@ def load_column_stats(
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
 ) -> None:
-
     logger.info("Generate Databricks column stats.")
     databricks_column_stats(
         conn=conn,
@@ -240,6 +238,19 @@ if __name__ == "__main__":
         help="Whether to ingest column lineage into QDIC or not. Default value is False",
     )
 
+    stats_items = get_column_stats_items()
+    parser.add_argument(
+        "--target_stats_items",
+        type=str,
+        nargs="*",
+        choices=stats_items,
+        default=stats_items,
+        action=env_default("DATABRICKS_STATS_ITEMS"),
+        required=False,
+        help="The items for statistic values.\
+        You can choose the items to be aggregated for stats. All items are selected by default.",
+    )
+
     args = parser.parse_args()
     set_log_level(level=args.log_level)
 
@@ -284,5 +295,6 @@ if __name__ == "__main__":
         endpoint=args.host,
         qdc_client=qdc_client,
         tenant_id=args.tenant_id,
+        stats_items=args.target_stats_items,
         monitoring_table_suffix=args.monitoring_table_suffix,
     )
quollio_core-0.4.13/quollio_core/helper/log_utils.py
@@ -0,0 +1,48 @@
+import inspect
+import logging
+
+LOG_LEVELS = {
+    "critical": logging.CRITICAL,
+    "error": logging.ERROR,
+    "warning": logging.WARNING,
+    "info": logging.INFO,
+    "debug": logging.DEBUG,
+    "notset": logging.NOTSET,
+}
+
+logger = logging.getLogger(__name__)
+
+
+def configure_logging(level: str = "INFO"):
+    """Configure logging settings."""
+    log_level = LOG_LEVELS.get(level.lower())
+    if log_level is None:
+        raise ValueError(f"Unknown log level: {level}")
+
+    logging.basicConfig(
+        level=log_level,
+        format="%(asctime)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s",
+    )
+    logger.setLevel(log_level)
+    logger.info(f"Logging is configured to {level} level.")
+
+
+def error_handling_decorator(func):
+    """Decorator for consistent error handling in CLI commands."""
+
+    def wrapper(*args, **kwargs):
+        func_name = func.__name__
+        try:
+            logger.debug(f"Starting {func_name}")
+            result = func(*args, **kwargs)
+            logger.debug(f"Completed {func_name} successfully")
+            return result
+        except Exception as e:
+
+            current_frame = inspect.currentframe()
+            error_frame = current_frame.f_back
+            line_number = error_frame.f_lineno
+            logger.error(f"Error in {func_name} at line {line_number}: {str(e)}", exc_info=True)
+            raise
+
+    return wrapper
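
For orientation, a short sketch of how the two new helpers in log_utils.py compose; fetch_stats is a made-up function used only for illustration:

    # Sketch only: fetch_stats is hypothetical, not part of the package.
    from quollio_core.helper.log_utils import configure_logging, error_handling_decorator, logger

    @error_handling_decorator
    def fetch_stats(table: str) -> None:
        logger.info("Fetching stats for %s", table)
        raise RuntimeError("boom")  # the decorator logs this with a traceback, then re-raises

    configure_logging("debug")
    try:
        fetch_stats("project.dataset.table")
    except RuntimeError:
        pass  # already logged by the decorator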
quollio_core-0.4.13/quollio_core/profilers/bigquery.py
@@ -0,0 +1,145 @@
+from typing import Dict, List
+
+from google.auth.credentials import Credentials
+
+from quollio_core.helper.log_utils import error_handling_decorator, logger
+from quollio_core.profilers.lineage import gen_table_lineage_payload, parse_bigquery_table_lineage
+from quollio_core.profilers.stats import gen_table_stats_payload
+from quollio_core.repository import qdc
+from quollio_core.repository.bigquery import BigQueryClient, GCPLineageClient, get_entitiy_reference, get_search_request
+
+
+@error_handling_decorator
+def bigquery_table_lineage(
+    qdc_client: qdc.QDCExternalAPIClient,
+    tenant_id: str,
+    project_id: str,
+    regions: list,
+    org_id: str,
+    credentials: Credentials,
+) -> None:
+    lineage_client = GCPLineageClient(credentials)
+    bq_client = BigQueryClient(credentials, project_id)
+
+    datasets = bq_client.list_dataset_ids()
+    all_tables = generate_table_list(bq_client, datasets)
+    lineage_links = generate_lineage_links(all_tables, lineage_client, project_id, regions)
+    lineage_links = parse_bigquery_table_lineage(lineage_links)
+
+    update_table_lineage_inputs = gen_table_lineage_payload(tenant_id=tenant_id, endpoint=org_id, tables=lineage_links)
+
+    req_count = 0
+    for update_table_lineage_input in update_table_lineage_inputs:
+        logger.info(
+            "Generating table lineage. downstream: %s -> %s -> %s",
+            update_table_lineage_input.downstream_database_name,
+            update_table_lineage_input.downstream_schema_name,
+            update_table_lineage_input.downstream_table_name,
+        )
+        status_code = qdc_client.update_lineage_by_id(
+            global_id=update_table_lineage_input.downstream_global_id,
+            payload=update_table_lineage_input.upstreams.as_dict(),
+        )
+        if status_code == 200:
+            req_count += 1
+    logger.info("Generating table lineage is finished. %s lineages are ingested.", req_count)
+
+
+@error_handling_decorator
+def bigquery_table_stats(
+    qdc_client: qdc.QDCExternalAPIClient,
+    bq_client: BigQueryClient,
+    tenant_id: str,
+    org_id: str,
+    dataplex_stats_tables: list,
+) -> None:
+    profiling_results = []
+    for table in dataplex_stats_tables:
+        logger.info("Profiling columns using Dataplex stats table: %s", table)
+        profiling_results.extend(column_stats_from_dataplex(bq_client, table))
+
+    stats = gen_table_stats_payload(tenant_id, org_id, profiling_results)
+
+    for stat in stats:
+        status_code = qdc_client.update_stats_by_id(
+            global_id=stat.global_id,
+            payload=stat.body.as_dict(),
+        )
+        if status_code == 200:
+            logger.info(
+                "Stats for column %s -> %s -> %s -> %s is successfully ingested.",
+                stat.db,
+                stat.schema,
+                stat.table,
+                stat.column,
+            )
+            logger.debug("Stats for column id %s is successfully ingested.", stat.global_id)
+
+
+def generate_table_list(bq_client: BigQueryClient, datasets: List[str]) -> List[str]:
+    all_tables = []
+    for dataset in datasets:
+        all_tables.extend(
+            [
+                table
+                for table in bq_client.list_tables(dataset)
+                if table["table_type"] in ["TABLE", "VIEW", "MATERIALIZED_VIEW"]
+            ],
+        )
+
+    all_table_names = []
+    for table in all_tables:
+        all_table_names.append(f"{bq_client.client.project}.{table['dataset_id']}.{table['table_id']}")
+
+    return all_table_names
+
+
+def generate_lineage_links(
+    all_tables: List[str],
+    lineage_client: GCPLineageClient,
+    project_id: str,
+    regions: List[str],
+) -> Dict[str, List[str]]:
+    lineage_links = {}
+    for table in all_tables:
+        if "quollio" in table.lower():
+            continue
+        downstream = get_entitiy_reference()
+        downstream.fully_qualified_name = f"bigquery:{table}"
+
+        for region in regions:
+            request = get_search_request(downstream_table=downstream, project_id=project_id, region=region)
+            response = lineage_client.get_links(request=request)
+            for lineage in response:
+                target_table = str(lineage.target.fully_qualified_name).replace("bigquery:", "")
+                source_table = str(lineage.source.fully_qualified_name).replace("bigquery:", "")
+                if target_table not in lineage_links:
+                    lineage_links[target_table] = []
+                if source_table not in lineage_links[target_table]:
+                    lineage_links[target_table].append(source_table)
+
+    return lineage_links
+
+
+def column_stats_from_dataplex(bq_client: BigQueryClient, profiling_table: str) -> List[Dict]:
+    query = f"""
+    SELECT
+        data_source.table_project_id AS DB_NAME,
+        data_source.dataset_id AS SCHEMA_NAME,
+        data_source.table_id AS TABLE_NAME,
+        column_name AS COLUMN_NAME,
+        min_value AS MIN_VALUE,
+        max_value AS MAX_VALUE,
+        average_value AS AVG_VALUE,
+        quartile_median AS MEDIAN_VALUE,
+        standard_deviation AS STDDEV_VALUE,
+        top_n[0][0] AS MODE_VALUE,
+        CAST((percent_null / 100) * job_rows_scanned AS INT) as NULL_COUNT,
+        CAST((percent_unique / 100) * job_rows_scanned AS INT) as CARDINALITY
+    FROM `{profiling_table}`
+    """
+    logger.debug(f"Executing Query: {query}")
+    results = bq_client.client.query(query).result()
+
+    # Convert RowIterator to a list of dictionaries
+    return [dict(row) for row in results]
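
For orientation, generate_lineage_links above builds a mapping keyed by downstream table, with each value listing the upstream sources discovered via the Data Lineage API; a sketch of the shape with invented table names:

    # Sketch only: invented table names showing the expected dict shape.
    lineage_links = {
        "my-project.marts.orders_daily": [  # downstream table
            "my-project.staging.orders",    # upstream sources
            "my-project.staging.customers",
        ],
    }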
{quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/profilers/databricks.py
@@ -6,7 +6,7 @@ from quollio_core.profilers.lineage import (
     gen_table_lineage_payload,
     parse_databricks_table_lineage,
 )
-from quollio_core.profilers.stats import gen_table_stats_payload
+from quollio_core.profilers.stats import gen_table_stats_payload, get_is_target_stats_items, render_sql_for_stats
 from quollio_core.repository import databricks, qdc
 
 logger = logging.getLogger(__name__)
@@ -125,59 +125,63 @@ def _get_monitoring_tables(
 
 
 def _get_column_stats(
-    conn: databricks.DatabricksConnectionConfig, monitoring_table_suffix: str = "_profile_metrics"
+    conn: databricks.DatabricksConnectionConfig,
+    stats_items: List[str],
+    monitoring_table_suffix: str = "_profile_metrics",
 ) -> List[Dict[str, str]]:
     tables = _get_monitoring_tables(conn, monitoring_table_suffix)
     if not tables:
         return []
     stats = []
+    is_aggregate_items = get_is_target_stats_items(stats_items=stats_items)
     for table in tables:
         monitored_table = table["table_fqdn"].removesuffix("_profile_metrics")
         monitored_table = monitored_table.split(".")
         if len(monitored_table) != 3:
             raise ValueError(f"Invalid table name: {table['table_fqdn']}")
         with databricks.DatabricksQueryExecutor(config=conn) as databricks_executor:
-            query = """
-            WITH profile_record_history AS (
-                SELECT
-                    COLUMN_NAME
-                    , distinct_count as CARDINALITY
-                    , MAX as MAX_VALUE
-                    , MIN as MIN_VALUE
-                    , AVG as AVG_VALUE
-                    , MEDIAN as MEDIAN_VALUE
-                    , STDDEV as STDDEV_VALUE
-                    , NUM_NULLS as NULL_COUNT
-                    , get(frequent_items, 0).item AS MODE_VALUE
-                    , row_number() over(partition by column_name order by window desc) rownum
-                FROM
-                    {monitoring_table}
-                WHERE
-                    column_name not in (':table')
-            )
-            SELECT
-                "{monitored_table_catalog}" as DB_NAME
-                , "{monitored_table_schema}" as SCHEMA_NAME
-                , "{monitored_table_name}" as TABLE_NAME
-                , COLUMN_NAME
-                , CARDINALITY
-                , MAX_VALUE
-                , MIN_VALUE
-                , AVG_VALUE
-                , MEDIAN_VALUE
-                , STDDEV_VALUE
-                , NULL_COUNT
-                , MODE_VALUE
-            FROM
-                profile_record_history
-            WHERE
-                rownum = 1
-            """.format(
+            cte = """
+            WITH profile_record_history AS (
+                SELECT
+                    COLUMN_NAME
+                    , distinct_count as cardinality
+                    , MAX as max_value
+                    , MIN as min_value
+                    , AVG as avg_value
+                    , MEDIAN as median_value
+                    , STDDEV as stddev_value
+                    , NUM_NULLS as null_count
+                    , get(frequent_items, 0).item AS mode_value
+                    , row_number() over(partition by column_name order by window desc) rownum
+                FROM
+                    {monitoring_table}
+                WHERE
+                    column_name not in (':table')
+            ), profile_record AS (
+                SELECT
+                    "{monitored_table_catalog}" as db_name
+                    , "{monitored_table_schema}" as schema_name
+                    , "{monitored_table_name}" as table_name
+                    , column_name
+                    , max_value
+                    , min_value
+                    , null_count
+                    , cardinality
+                    , avg_value
+                    , median_value
+                    , mode_value
+                    , stddev_value
+                FROM
+                    profile_record_history
+                WHERE
+                    rownum = 1
+            )""".format(
                 monitoring_table=table["table_fqdn"],
                 monitored_table_catalog=monitored_table[0],
                 monitored_table_schema=monitored_table[1],
                 monitored_table_name=monitored_table[2],
             )
+            query = render_sql_for_stats(is_aggregate_items=is_aggregate_items, table_fqn="profile_record", cte=cte)
             logger.debug(f"The following sql will be fetched to retrieve stats values. {query}")
             stats.append(databricks_executor.get_query_results(query))
     return stats
@@ -188,9 +192,10 @@ def databricks_column_stats(
     endpoint: str,
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
+    stats_items: List[str],
     monitoring_table_suffix: str = "_profile_metrics",
 ) -> None:
-    table_stats = _get_column_stats(conn, monitoring_table_suffix)
+    table_stats = _get_column_stats(conn, stats_items, monitoring_table_suffix)
     for table in table_stats:
         logger.debug("Table %s will be aggregated.", table)
         stats = gen_table_stats_payload(tenant_id=tenant_id, endpoint=endpoint, stats=table)
{quollio_core-0.4.12 → quollio_core-0.4.13}/quollio_core/profilers/redshift.py
@@ -1,8 +1,13 @@
 import logging
+from typing import List
 
 from quollio_core.profilers.lineage import gen_table_lineage_payload, gen_table_lineage_payload_inputs
 from quollio_core.profilers.sqllineage import SQLLineage
-from quollio_core.profilers.stats import gen_table_stats_payload_from_tuple
+from quollio_core.profilers.stats import (
+    gen_table_stats_payload_from_tuple,
+    get_is_target_stats_items,
+    render_sql_for_stats,
+)
 from quollio_core.repository import qdc, redshift
 
 logger = logging.getLogger(__name__)
@@ -76,38 +81,24 @@ def redshift_table_stats(
     conn: redshift.RedshiftConnectionConfig,
     qdc_client: qdc.QDCExternalAPIClient,
     tenant_id: str,
+    stats_items: List[str],
 ) -> None:
-
+    is_aggregate_items = get_is_target_stats_items(stats_items=stats_items)
     with redshift.RedshiftQueryExecutor(config=conn) as redshift_executor:
         stats_query = _gen_get_stats_views_query(
             db=conn.database,
             schema=conn.schema,
         )
        stats_views = redshift_executor.get_query_results(query=stats_query)
+        logger.info("Found %s for table statistics.", len(stats_views))
 
         req_count = 0
         for stats_view in stats_views:
-            stats_query = """
-            SELECT
-                db_name
-                , schema_name
-                , table_name
-                , column_name
-                , max_value
-                , min_value
-                , null_count
-                , cardinality
-                , avg_value
-                , median_value
-                , mode_value
-                , stddev_value
-            FROM
-                {db}.{schema}.{table}
-            """.format(
-                db=stats_view[0],
-                schema=stats_view[1],
-                table=stats_view[2],
+            table_fqn = "{catalog}.{schema}.{table}".format(
+                catalog=stats_view[0], schema=stats_view[1], table=stats_view[2]
             )
+            stats_query = render_sql_for_stats(is_aggregate_items=is_aggregate_items, table_fqn=table_fqn)
+            logger.debug(f"The following sql will be fetched to retrieve stats values. {stats_query}")
             stats_result = redshift_executor.get_query_results(query=stats_query)
             payloads = gen_table_stats_payload_from_tuple(tenant_id=tenant_id, endpoint=conn.host, stats=stats_result)
             for payload in payloads:
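
For context, a hedged sketch of combining the shared stats helpers the way the Redshift (and Databricks) profilers now do; the filtered item name and view FQN are assumptions, and the exact SQL render_sql_for_stats emits is internal to the package:

    # Sketch only: "mode_value" and the view name are assumed for illustration.
    from quollio_core.profilers.stats import (
        get_column_stats_items,
        get_is_target_stats_items,
        render_sql_for_stats,
    )

    all_items = get_column_stats_items()                  # selectable stats item names
    subset = [i for i in all_items if i != "mode_value"]  # skip one item (assumed name)

    flags = get_is_target_stats_items(stats_items=subset)
    query = render_sql_for_stats(
        is_aggregate_items=flags,
        table_fqn="mydb.public.quollio_stats_columns_view",  # placeholder view FQN
    )
    print(query)  # SQL selecting only the requested stats columns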