quollio-core 0.4.6__tar.gz → 0.4.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. {quollio_core-0.4.6 → quollio_core-0.4.8}/PKG-INFO +7 -2
  2. {quollio_core-0.4.6 → quollio_core-0.4.8}/README.md +1 -1
  3. {quollio_core-0.4.6 → quollio_core-0.4.8}/pyproject.toml +5 -0
  4. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/__init__.py +1 -1
  5. quollio_core-0.4.8/quollio_core/bigquery.py +114 -0
  6. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/bricks.py +66 -14
  7. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/databricks/models/quollio_lineage_table_level.yml +1 -1
  8. quollio_core-0.4.8/quollio_core/dbt_projects/redshift/macros/materialization/divided_view.sql +136 -0
  9. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/redshift/models/quollio_stats_columns.sql +1 -2
  10. quollio_core-0.4.8/quollio_core/dbt_projects/snowflake/macros/materialization/divided_view.sql +85 -0
  11. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/snowflake/models/quollio_stats_columns.sql +1 -2
  12. quollio_core-0.4.8/quollio_core/dbt_projects/snowflake/models/quollio_stats_profiling_columns.sql +96 -0
  13. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/helper/core.py +4 -0
  14. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/helper/env_default.py +24 -1
  15. quollio_core-0.4.8/quollio_core/profilers/bigquery.py +81 -0
  16. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/profilers/databricks.py +16 -9
  17. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/profilers/lineage.py +14 -0
  18. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/redshift.py +8 -8
  19. quollio_core-0.4.8/quollio_core/repository/bigquery.py +61 -0
  20. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/snowflake.py +21 -6
  21. quollio_core-0.4.6/quollio_core/dbt_projects/redshift/macros/materialization/divided_view.sql +0 -97
  22. quollio_core-0.4.6/quollio_core/dbt_projects/snowflake/macros/materialization/divided_view.sql +0 -62
  23. quollio_core-0.4.6/quollio_core/dbt_projects/snowflake/models/quollio_stats_profiling_columns.sql +0 -59
  24. {quollio_core-0.4.6 → quollio_core-0.4.8}/LICENSE +0 -0
  25. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/databricks/.gitignore +0 -0
  26. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/databricks/README.md +0 -0
  27. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/databricks/analyses/.gitkeep +0 -0
  28. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/databricks/dbt_project.yml +0 -0
  29. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/databricks/macros/.gitkeep +0 -0
  30. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/databricks/models/quollio_lineage_column_level.sql +0 -0
  31. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/databricks/models/quollio_lineage_column_level.yml +0 -0
  32. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/databricks/models/quollio_lineage_table_level.sql +0 -0
  33. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/databricks/models/sources.yml +0 -0
  34. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/databricks/package-lock.yml +0 -0
  35. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/databricks/packages.yml +0 -0
  36. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/databricks/profiles/profiles_template.yml +0 -0
  37. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/databricks/seeds/.gitkeep +0 -0
  38. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/databricks/snapshots/.gitkeep +0 -0
  39. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/redshift/README.md +0 -0
  40. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/redshift/analyses/.gitkeep +0 -0
  41. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/redshift/dbt_project.yml +0 -0
  42. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/redshift/macros/.gitkeep +0 -0
  43. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/redshift/models/quollio_lineage_table_level.sql +0 -0
  44. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/redshift/models/quollio_lineage_table_level.yml +0 -0
  45. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/redshift/models/quollio_lineage_view_level.sql +0 -0
  46. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/redshift/models/quollio_lineage_view_level.yml +0 -0
  47. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/redshift/models/quollio_sqllineage_sources.sql +0 -0
  48. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/redshift/models/quollio_sqllineage_sources.yml +0 -0
  49. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/redshift/models/quollio_stats_columns.yml +0 -0
  50. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/redshift/models/quollio_stats_profiling_columns.sql +0 -0
  51. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/redshift/models/quollio_stats_profiling_columns.yml +0 -0
  52. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/redshift/models/sources.yml +0 -0
  53. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/redshift/package-lock.yml +0 -0
  54. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/redshift/packages.yml +0 -0
  55. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/redshift/profiles/profiles_template.yml +0 -0
  56. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/redshift/seeds/.gitkeep +0 -0
  57. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/redshift/snapshots/.gitkeep +0 -0
  58. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/seeds/.gitkeep +0 -0
  59. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/snowflake/README.md +0 -0
  60. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/snowflake/analyses/.gitkeep +0 -0
  61. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/snowflake/dbt_project.yml +0 -0
  62. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/snowflake/macros/.gitkeep +0 -0
  63. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/snowflake/models/quollio_lineage_column_level.sql +0 -0
  64. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/snowflake/models/quollio_lineage_column_level.yml +0 -0
  65. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/snowflake/models/quollio_lineage_table_level.sql +0 -0
  66. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/snowflake/models/quollio_lineage_table_level.yml +0 -0
  67. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/snowflake/models/quollio_sqllineage_sources.sql +0 -0
  68. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/snowflake/models/quollio_sqllineage_sources.yml +0 -0
  69. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/snowflake/models/quollio_stats_columns.yml +0 -0
  70. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/snowflake/models/quollio_stats_profiling_columns.yml +0 -0
  71. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/snowflake/models/sources.yml +0 -0
  72. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/snowflake/package-lock.yml +0 -0
  73. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/snowflake/packages.yml +0 -0
  74. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/snowflake/profiles/profiles_template.yml +0 -0
  75. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/snowflake/seeds/.gitkeep +0 -0
  76. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/dbt_projects/snowflake/snapshots/.gitkeep +0 -0
  77. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/helper/__init__.py +0 -0
  78. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/profilers/__init__.py +0 -0
  79. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/profilers/redshift.py +0 -0
  80. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/profilers/snowflake.py +0 -0
  81. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/profilers/sqllineage.py +0 -0
  82. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/profilers/stats.py +0 -0
  83. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/repository/__init__.py +0 -0
  84. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/repository/databricks.py +0 -0
  85. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/repository/dbt.py +0 -0
  86. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/repository/qdc.py +0 -0
  87. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/repository/redshift.py +0 -0
  88. {quollio_core-0.4.6 → quollio_core-0.4.8}/quollio_core/repository/snowflake.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: quollio-core
3
- Version: 0.4.6
3
+ Version: 0.4.8
4
4
  Summary: Quollio Core
5
5
  Author-email: quollio-dev <qt.dev@quollio.com>
6
6
  Maintainer-email: RyoAriyama <ryo.arym@gmail.com>, tharuta <35373297+TakumiHaruta@users.noreply.github.com>
@@ -31,6 +31,11 @@ Requires-Dist: snowflake-connector-python==3.5.0
31
31
  Requires-Dist: databricks-sdk==0.17.0
32
32
  Requires-Dist: databricks-sql-connector==2.9.5
33
33
  Requires-Dist: sqlglot==20.8.0
34
+ Requires-Dist: google-cloud==0.34.0
35
+ Requires-Dist: google-cloud-bigquery==3.22.0
36
+ Requires-Dist: google-cloud-datacatalog==3.19.0
37
+ Requires-Dist: google-cloud-datacatalog-lineage==0.3.6
38
+ Requires-Dist: google-api-python-client==2.131.0
34
39
  Requires-Dist: black>=22.3.0 ; extra == "test"
35
40
  Requires-Dist: coverage>=7.3.2 ; extra == "test"
36
41
  Requires-Dist: isort>=5.10.1 ; extra == "test"
@@ -74,7 +79,7 @@ To see available commands and options, please run the following command. (ex: Sn
74
79
  コマンドやオプションの詳細については、下記のコマンドを実行してください。(例: Snowflake)
75
80
 
76
81
  ```
77
- $ python3 -m quollio_core.snowflake -h
82
+ $ python -m quollio_core.snowflake -h
78
83
  ```
79
84
 
80
85
  Then run commands with the options provided.
@@ -31,7 +31,7 @@ To see available commands and options, please run the following command. (ex: Sn
31
31
  コマンドやオプションの詳細については、下記のコマンドを実行してください。(例: Snowflake)
32
32
 
33
33
  ```
34
- $ python3 -m quollio_core.snowflake -h
34
+ $ python -m quollio_core.snowflake -h
35
35
  ```
36
36
 
37
37
  Then run commands with the options provided.
@@ -43,6 +43,11 @@ dependencies = [
43
43
  ,"databricks-sdk==0.17.0"
44
44
  ,"databricks-sql-connector==2.9.5"
45
45
  ,"sqlglot==20.8.0"
46
+ ,"google-cloud==0.34.0"
47
+ ,"google-cloud-bigquery==3.22.0"
48
+ ,"google-cloud-datacatalog==3.19.0"
49
+ ,"google-cloud-datacatalog-lineage==0.3.6"
50
+ ,"google-api-python-client==2.131.0"
46
51
  ]
47
52
  dynamic = ["version", "description"]
48
53
 
@@ -1,4 +1,4 @@
1
1
  """Quollio Core"""
2
2
 
3
- __version__ = "0.4.6"
3
+ __version__ = "0.4.8"
4
4
  __author__ = "Quollio Technologies, Inc"
@@ -0,0 +1,114 @@
1
+ import argparse
2
+ import json
3
+ import logging
4
+
5
+ from quollio_core.helper.env_default import env_default
6
+ from quollio_core.profilers.bigquery import bigquery_table_lineage
7
+ from quollio_core.repository import qdc
8
+ from quollio_core.repository.bigquery import get_credentials, get_org_id
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
+ def load_lineage(
14
+ qdc_client: qdc.QDCExternalAPIClient, project_id: str, regions: list, tenant_id: str, credentials: dict, org_id: str
15
+ ):
16
+ bigquery_table_lineage(
17
+ qdc_client=qdc_client,
18
+ tenant_id=tenant_id,
19
+ project_id=project_id,
20
+ regions=regions,
21
+ credentials=credentials,
22
+ org_id=org_id,
23
+ )
24
+
25
+
26
+ if __name__ == "__main__":
27
+ parser = argparse.ArgumentParser(
28
+ prog="Quollio Intelligence Agent for Google BigQuery",
29
+ description="Collect lineage and stats from Google BigQuery and load to Quollio Data Catalog",
30
+ epilog="Copyright (c) 2024 Quollio Technologies, Inc.",
31
+ )
32
+ parser.add_argument(
33
+ "commands",
34
+ choices=["load_lineage"],
35
+ type=str,
36
+ nargs="+",
37
+ help="""
38
+ The command to execute.
39
+ 'load_lineage': Load lineage data from Google Data Catalog to Quollio,
40
+ """,
41
+ )
42
+ parser.add_argument(
43
+ "--credentials",
44
+ type=str,
45
+ action=env_default("GOOGLE_APPLICATION_CREDENTIALS"),
46
+ help="Crendentials for Google Cloud Platform",
47
+ )
48
+ parser.add_argument(
49
+ "--tenant_id",
50
+ type=str,
51
+ action=env_default("TENANT_ID"),
52
+ required=False,
53
+ help="The tenant id (company id) where the lineage and stats are loaded",
54
+ )
55
+ parser.add_argument(
56
+ "--api_url",
57
+ type=str,
58
+ action=env_default("QDC_API_URL"),
59
+ required=False,
60
+ help="The base URL of Quollio External API",
61
+ )
62
+ parser.add_argument(
63
+ "--client_id",
64
+ type=str,
65
+ action=env_default("QDC_CLIENT_ID"),
66
+ required=False,
67
+ help="The client id that is created on Quollio console to let clients access Quollio External API",
68
+ )
69
+ parser.add_argument(
70
+ "--client_secret",
71
+ type=str,
72
+ action=env_default("QDC_CLIENT_SECRET"),
73
+ required=False,
74
+ help="The client secret that is created on Quollio console to let clients access Quollio External API",
75
+ )
76
+ parser.add_argument(
77
+ "--project_id",
78
+ type=str,
79
+ action=env_default("GCP_PROJECT_ID"),
80
+ required=False,
81
+ help="GCP Project ID",
82
+ )
83
+ parser.add_argument(
84
+ "--regions",
85
+ type=str,
86
+ action=env_default("GCP_REGIONS"),
87
+ required=False,
88
+ help="GCP regions where the data is located. Multiple regions can be provided separated by space.",
89
+ nargs="+",
90
+ )
91
+
92
+ args = parser.parse_args()
93
+
94
+ if len(args.commands) == 0:
95
+ raise ValueError("No command is provided")
96
+
97
+ if "load_lineage" in args.commands:
98
+
99
+ qdc_client = qdc.QDCExternalAPIClient(
100
+ base_url=args.api_url, client_id=args.client_id, client_secret=args.client_secret
101
+ )
102
+
103
+ credentials_json = json.loads(args.credentials)
104
+ credentials = get_credentials(credentials_json=credentials_json)
105
+ org_id = get_org_id(credentials_json=credentials_json)
106
+
107
+ load_lineage(
108
+ qdc_client=qdc_client,
109
+ project_id=args.project_id,
110
+ regions=args.regions,
111
+ tenant_id=args.tenant_id,
112
+ credentials=credentials,
113
+ org_id=org_id,
114
+ )
@@ -2,7 +2,7 @@ import argparse
2
2
  import logging
3
3
  import os
4
4
 
5
- from quollio_core.helper.core import setup_dbt_profile
5
+ from quollio_core.helper.core import setup_dbt_profile, trim_prefix
6
6
  from quollio_core.helper.env_default import env_default
7
7
  from quollio_core.profilers.databricks import (
8
8
  databricks_column_level_lineage,
@@ -17,7 +17,7 @@ logger = logging.getLogger(__name__)
17
17
 
18
18
  def build_view(
19
19
  conn: db.DatabricksConnectionConfig,
20
- target_tables: str,
20
+ target_tables: str = "",
21
21
  log_level: str = "info",
22
22
  ) -> None:
23
23
  logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
@@ -41,7 +41,13 @@ def build_view(
41
41
  options=["--no-use-colors", "--log-level", log_level],
42
42
  )
43
43
 
44
- run_options = ["--no-use-colors", "--log-level", log_level, "--select", target_tables]
44
+ run_options = ["--no-use-colors", "--log-level", log_level]
45
+
46
+ if target_tables is not None:
47
+ target_tables_str = " ".join(target_tables)
48
+ run_options.append("--select")
49
+ run_options.append(target_tables_str)
50
+
45
51
  dbt_client.invoke(
46
52
  cmd="run",
47
53
  project_dir=project_path,
@@ -53,20 +59,35 @@ def build_view(
53
59
 
54
60
  def load_lineage(
55
61
  conn: db.DatabricksConnectionConfig,
62
+ endpoint: str,
56
63
  qdc_client: qdc.QDCExternalAPIClient,
57
64
  tenant_id: str,
65
+ enable_column_lineage: bool = False,
58
66
  ) -> None:
59
67
  logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
60
68
 
61
69
  logger.info("Generate Databricks table to table lineage.")
62
70
  databricks_table_level_lineage(
63
- conn=conn, qdc_client=qdc_client, tenant_id=tenant_id, dbt_table_name="quollio_lineage_table_level"
71
+ conn=conn,
72
+ endpoint=endpoint,
73
+ qdc_client=qdc_client,
74
+ tenant_id=tenant_id,
75
+ dbt_table_name="quollio_lineage_table_level",
64
76
  )
65
77
 
66
- logger.info("Generate Databricks column to column lineage.")
67
- databricks_column_level_lineage(
68
- conn=conn, qdc_client=qdc_client, tenant_id=tenant_id, dbt_table_name="quollio_lineage_column_level"
69
- )
78
+ if enable_column_lineage:
79
+ logger.info(
80
+ f"enable_column_lineage is set to {enable_column_lineage}.Generate Databricks column to column lineage."
81
+ )
82
+ databricks_column_level_lineage(
83
+ conn=conn,
84
+ endpoint=endpoint,
85
+ qdc_client=qdc_client,
86
+ tenant_id=tenant_id,
87
+ dbt_table_name="quollio_lineage_column_level",
88
+ )
89
+ else:
90
+ logger.info("Skip column lineage ingestion. Set enable_column_lineage to True if you ingest column lineage.")
70
91
 
71
92
  logger.info("Lineage data is successfully loaded.")
72
93
  return
@@ -74,6 +95,7 @@ def load_lineage(
74
95
 
75
96
  def load_column_stats(
76
97
  conn: db.DatabricksConnectionConfig,
98
+ endpoint: str,
77
99
  qdc_client: qdc.QDCExternalAPIClient,
78
100
  tenant_id: str,
79
101
  ) -> None:
@@ -82,6 +104,7 @@ def load_column_stats(
82
104
  logger.info("Generate Databricks column stats.")
83
105
  databricks_column_stats(
84
106
  conn=conn,
107
+ endpoint=endpoint,
85
108
  qdc_client=qdc_client,
86
109
  tenant_id=tenant_id,
87
110
  )
@@ -106,7 +129,6 @@ if __name__ == "__main__":
106
129
  'build_view': Build views using dbt,
107
130
  'load_lineage': Load lineage data from created views to Quollio,
108
131
  'load_stats': Load stats from created views to Quollio,
109
- 'load_sqllineage': Load lineage data from sql parse result(alpha),
110
132
  """,
111
133
  )
112
134
  parser.add_argument(
@@ -193,8 +215,8 @@ if __name__ == "__main__":
193
215
  parser.add_argument(
194
216
  "--target_tables",
195
217
  type=str,
196
- nargs="*",
197
- choices=["quollio_lineage_table_level", "quollio_lineage_view_level"],
218
+ nargs="+",
219
+ choices=["quollio_lineage_table_level", "quollio_lineage_column_level"],
198
220
  action=env_default("DATABRICKS_TARGET_TABLES"),
199
221
  required=False,
200
222
  help="Target tables you want to create with dbt module. \
@@ -202,11 +224,29 @@ if __name__ == "__main__":
202
224
  Please specify table name with blank delimiter like tableA tableB \
203
225
  if you want to create two or more tables",
204
226
  )
227
+ parser.add_argument(
228
+ "--monitoring_table_suffix",
229
+ type=str,
230
+ action=env_default("DATABRICKS_MONITORING_TABLE_SUFFIX"),
231
+ required=False,
232
+ help="Sets the monitoring tables suffix for databricks. \
233
+ This is used to identify the monitoring tables created by the databricks monitoring tool. \
234
+ Default value is _profile_metrics",
235
+ )
236
+ parser.add_argument(
237
+ "--enable_column_lineage",
238
+ type=bool,
239
+ action=env_default("ENABLE_COLUMN_LINEAGE", store_true=True),
240
+ default=False,
241
+ required=False,
242
+ help="Whether to ingest column lineage into QDIC or not. Default value is False",
243
+ )
205
244
 
206
245
  args = parser.parse_args()
207
246
 
208
247
  conn = db.DatabricksConnectionConfig(
209
- host=args.host,
248
+ # MEMO: Metadata agent allows the string 'https://' as a host name but is not allowed by intelligence agent.
249
+ host=trim_prefix(args.host, "https://"),
210
250
  http_path=args.http_path,
211
251
  client_id=args.databricks_client_id,
212
252
  client_secret=args.databricks_client_secret,
@@ -228,10 +268,22 @@ if __name__ == "__main__":
228
268
  qdc_client = qdc.QDCExternalAPIClient(
229
269
  base_url=args.api_url, client_id=args.client_id, client_secret=args.client_secret
230
270
  )
231
- load_lineage(conn=conn, qdc_client=qdc_client, tenant_id=args.tenant_id)
271
+ load_lineage(
272
+ conn=conn,
273
+ endpoint=args.host,
274
+ qdc_client=qdc_client,
275
+ tenant_id=args.tenant_id,
276
+ enable_column_lineage=args.enable_column_lineage,
277
+ )
232
278
 
233
279
  if "load_stats" in args.commands:
234
280
  qdc_client = qdc.QDCExternalAPIClient(
235
281
  base_url=args.api_url, client_id=args.client_id, client_secret=args.client_secret
236
282
  )
237
- databricks_column_stats(conn=conn, qdc_client=qdc_client, tenant_id=args.tenant_id)
283
+ databricks_column_stats(
284
+ conn=conn,
285
+ endpoint=args.host,
286
+ qdc_client=qdc_client,
287
+ tenant_id=args.tenant_id,
288
+ monitoring_table_suffix=args.monitoring_table_suffix,
289
+ )
@@ -1,7 +1,7 @@
1
1
  version: 2
2
2
 
3
3
  model:
4
- - name: quollio_lineage_column_level
4
+ - name: quollio_lineage_table_level
5
5
  columns:
6
6
  - name: UPSTREAM_TABLES
7
7
  description: 'String column with all upstream tables in JSON format'
@@ -0,0 +1,136 @@
1
+ {%- materialization divided_view, default %}
2
+ {%- set identifier = model['alias'] %}
3
+ {%- set target_relations = [] %}
4
+ {%- set grant_config = config.get('grants') %}
5
+
6
+ {{ run_hooks(pre_hooks, inside_transaction=False) }}
7
+ -- `BEGIN` happens here:
8
+ {{ run_hooks(pre_hooks, inside_transaction=True) }}
9
+
10
+ -- fetch target_tables
11
+ {%- set query_stats_target_tables -%}
12
+ SELECT
13
+ distinct
14
+ database_name
15
+ , schema_name
16
+ , table_name
17
+ FROM
18
+ {{ ref('quollio_stats_profiling_columns') }}
19
+ WHERE
20
+ table_name not like 'quollio_%%'
21
+ {%- endset -%}
22
+ {%- set results = run_query(query_stats_target_tables) -%}
23
+ {%- if execute -%}
24
+ {%- set stats_target_tables = results.rows -%}
25
+ {%- else -%}
26
+ {%- set stats_target_tables = [] -%}
27
+ {%- endif -%}
28
+
29
+ -- skip creating views if the target profiling columns don't exist.
30
+ {%- if stats_target_tables | length == 0 -%}
31
+ {% call statement("main") %}
32
+ {{ log("No records found. Just execute select stmt for skipping call statement.", info=True) }}
33
+ select null
34
+ {% endcall %}
35
+ {%- set full_refresh_mode = (should_full_refresh()) -%}
36
+ {%- set should_revoke = should_revoke(target_relation, full_refresh_mode) %}
37
+ {%- endif -%}
38
+
39
+ -- build sql
40
+ {%- for stats_target_table in stats_target_tables -%}
41
+ -- get columns for statistics.
42
+ -- LISTAGG function can't be used for sys table, then it's necessary to get column for each table.
43
+ -- See https://docs.aws.amazon.com/redshift/latest/dg/c_join_PG.html.
44
+ {%- set stats_target_columns %}
45
+ SELECT
46
+ database_name
47
+ , schema_name
48
+ , table_name
49
+ , column_name
50
+ , is_bool
51
+ , is_calculable
52
+ FROM
53
+ {{ ref('quollio_stats_profiling_columns') }}
54
+ WHERE
55
+ database_name = '{{stats_target_table[0]}}'
56
+ AND schema_name = '{{stats_target_table[1]}}'
57
+ AND table_name = '{{stats_target_table[2]}}'
58
+ {%- endset -%}
59
+
60
+ {%- set results = run_query(stats_target_columns) -%}
61
+ {%- set stats_target_columns = results.rows -%}
62
+
63
+ {%- set sql_for_column_stats %}
64
+ {%- for stats_target_column in stats_target_columns -%}
65
+ {%- if not loop.first -%}UNION{% endif %}
66
+ SELECT
67
+ main.db_name
68
+ , main.schema_name
69
+ , main.table_name
70
+ , main.column_name
71
+ , main.max_value
72
+ , main.min_value
73
+ , main.null_count
74
+ , main.cardinality
75
+ , main.avg_value
76
+ , main.median_value
77
+ , mode.mode_value
78
+ , main.stddev_value
79
+ FROM
80
+ (
81
+ SELECT
82
+ DISTINCT
83
+ '{{stats_target_column[0]}}'::varchar as db_name
84
+ , '{{stats_target_column[1]}}'::varchar as schema_name
85
+ , '{{stats_target_column[2]}}'::varchar as table_name
86
+ , '{{stats_target_column[3]}}'::varchar as column_name
87
+ , {% if var("aggregate_all") == True and stats_target_column[5] == True %}cast(max("{{stats_target_column[3]}}") as varchar){% else %}null::varchar{% endif %} AS max_value
88
+ , {% if var("aggregate_all") == True and stats_target_column[5] == True %}cast(min("{{stats_target_column[3]}}") as varchar){% else %}null::varchar{% endif %} AS min_value
89
+ -- requires full table scan
90
+ , {% if var("aggregate_all") == True %}cast(SUM(NVL2("{{stats_target_column[3]}}", 0, 1)) as integer){% else %}null::integer{% endif %} AS null_count
91
+ , APPROXIMATE COUNT(DISTINCT "{{stats_target_column[3]}}") AS cardinality
92
+ -- requires full table scan
93
+ , {% if var("aggregate_all") == True and stats_target_column[5] == True %}cast(avg("{{stats_target_column[3]}}")as varchar){% else %}null::varchar{% endif %} AS avg_value
94
+ , {% if var("aggregate_all") == True and stats_target_column[5] == True %}cast(median("{{stats_target_column[3]}}") as varchar){% else %}null::varchar{% endif %} AS median_value
95
+ -- requires full table scan
96
+ , {% if stats_target_column[5] == True %}cast(STDDEV_SAMP("{{stats_target_column[3]}}") as integer){% else %}null::integer{% endif %} AS stddev_value
97
+ FROM {{ stats_target_column[0] }}.{{ stats_target_column[1] }}.{{ stats_target_column[2] }}
98
+ ) main, (
99
+ {%- if var("aggregate_all") == True and stats_target_column[4] == false %}
100
+ SELECT
101
+ cast("{{stats_target_column[3]}}" as varchar) mode_value
102
+ FROM (
103
+ SELECT
104
+ DISTINCT
105
+ "{{stats_target_column[3]}}"
106
+ , ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC) AS row_num
107
+ FROM {{ stats_target_column[0] }}.{{ stats_target_column[1] }}.{{ stats_target_column[2] }}
108
+ GROUP BY
109
+ "{{stats_target_column[3]}}"
110
+ )
111
+ WHERE
112
+ row_num = 1
113
+ {% else %}
114
+ SELECT null as mode_value {%- endif -%}
115
+ ) mode
116
+ {% endfor -%}
117
+ {%- endset %}
118
+ -- create a view with a index as suffix
119
+ {%- set target_identifier = "%s_%s_%s_%s"|format(model['name'], stats_target_table[0], stats_target_table[1], stats_target_table[2]) %}
120
+ {%- set target_relation = api.Relation.create(identifier=target_identifier, schema=schema, database=database, type='view') %}
121
+ -- {{ drop_relation_if_exists(target_relation) }}
122
+ {% call statement("main") %}
123
+ {{ get_replace_view_sql(target_relation, sql_for_column_stats) }}
124
+ {% endcall %}
125
+ {%- set full_refresh_mode = (should_full_refresh()) -%}
126
+ {%- set should_revoke = should_revoke(target_relation, full_refresh_mode) %}
127
+ {%- do apply_grants(target_relation, grant_config, should_revoke) %}
128
+ {%- set target_relations = target_relations.append(target_relation) %}
129
+ {%- endfor -%}
130
+
131
+ {{ run_hooks(post_hooks, inside_transaction=True) }}
132
+ {{ adapter.commit() }}
133
+ {{ run_hooks(post_hooks, inside_transaction=False) }}
134
+
135
+ {{ return({'relations': target_relations}) }}
136
+ {%- endmaterialization -%}
@@ -1,7 +1,6 @@
1
1
  {{
2
2
  config(
3
- materialized='divided_view',
4
- chunk=20
3
+ materialized='divided_view'
5
4
  )
6
5
  }}
7
6
  -- depends_on: {{ ref('quollio_stats_profiling_columns') }}
@@ -0,0 +1,85 @@
1
+ {%- materialization divided_view, default %}
2
+ {%- set identifier = model['alias'] %}
3
+ {%- set target_relations = [] %}
4
+ {%- set grant_config = config.get('grants') %}
5
+
6
+ {{ run_hooks(pre_hooks, inside_transaction=False) }}
7
+ -- `BEGIN` happens here:
8
+ {{ run_hooks(pre_hooks, inside_transaction=True) }}
9
+
10
+ -- fetch target_tables
11
+ {%- set query_stats_target_tables -%}
12
+ SELECT
13
+ TABLE_CATALOG
14
+ , TABLE_SCHEMA
15
+ , TABLE_NAME
16
+ , OBJECT_AGG(COLUMN_NAME, IS_CALCULABLE) AS COLUMNS_OBJ
17
+ FROM
18
+ {{ ref('quollio_stats_profiling_columns') }}
19
+ WHERE NOT startswith(table_name, 'QUOLLIO_')
20
+ GROUP BY
21
+ TABLE_CATALOG
22
+ , TABLE_SCHEMA
23
+ , TABLE_NAME
24
+ {%- endset -%}
25
+ {%- set results = run_query(query_stats_target_tables) -%}
26
+ {%- if execute -%}
27
+ {%- set stats_target_tables = results.rows -%}
28
+ {%- else -%}
29
+ {%- set stats_target_tables = [] -%}
30
+ {%- endif -%}
31
+
32
+ -- skip creating views if the target profiling columns don't exist.
33
+ {%- if stats_target_tables | length == 0 -%}
34
+ {% call statement("main") %}
35
+ {{ log("No records found. Just execute select stmt for skipping call statement.", info=True) }}
36
+ select null
37
+ {% endcall %}
38
+ {%- set full_refresh_mode = (should_full_refresh()) -%}
39
+ {%- set should_revoke = should_revoke(target_relation, full_refresh_mode) %}
40
+ {%- endif -%}
41
+
42
+ -- create view for each table
43
+ {%- for stats_target_table in stats_target_tables -%}
44
+ -- build sql for column value aggregation.
45
+ {%- set sql_for_column_stats %}
46
+ {% set columns_json = fromjson(stats_target_table[3]) %}
47
+ {%- for col_name, is_calclable in columns_json.items() -%}
48
+ {%- if not loop.first %}UNION{% endif %}
49
+ SELECT
50
+ DISTINCT
51
+ '{{stats_target_table[0]}}' as db_name
52
+ , '{{stats_target_table[1]}}' as schema_name
53
+ , '{{stats_target_table[2]}}' as table_name
54
+ , '{{col_name}}' as column_name
55
+ , {% if is_calclable == True %}CAST(MAX("{{col_name}}") AS STRING){% else %}NULL{% endif %} AS max_value
56
+ , {% if is_calclable == True %}CAST(MIN("{{col_name}}") AS STRING){% else %}NULL{% endif %} AS min_value
57
+ , COUNT_IF("{{col_name}}" IS NULL) AS null_count
58
+ , APPROX_COUNT_DISTINCT("{{col_name}}") AS cardinality
59
+ , {% if is_calclable == True %}AVG("{{col_name}}"){% else %}NULL{% endif %} AS avg_value
60
+ , {% if is_calclable == True %}MEDIAN("{{col_name}}"){% else %}NULL{% endif %} AS median_value
61
+ , {% if is_calclable == True %}APPROX_TOP_K("{{col_name}}")[0][0]{% else %}NULL{% endif %} AS mode_value
62
+ , {% if is_calclable == True %}STDDEV("{{col_name}}"){% else %}NULL{% endif %} AS stddev_value
63
+ FROM "{{stats_target_table[0]}}"."{{stats_target_table[1]}}"."{{stats_target_table[2]}}" {{ var("sample_method") }}
64
+ {% endfor -%}
65
+ {%- endset %}
66
+
67
+ -- create a view with a index as suffix
68
+ {%- set stats_view_identifier = "%s_%s_%s_%s"|format(model['name'], stats_target_table[0], stats_target_table[1], stats_target_table[2]) %}
69
+ {%- set target_relation = api.Relation.create(identifier=stats_view_identifier, schema=schema, database=database, type='view') %}
70
+ {% call statement("main") %}
71
+ {{ get_create_view_as_sql(target_relation, sql_for_column_stats) }}
72
+ {% endcall %}
73
+ {%- set full_refresh_mode = (should_full_refresh()) -%}
74
+ {%- set should_revoke = should_revoke(target_relation, full_refresh_mode) %}
75
+ {%- do apply_grants(target_relation, grant_config, should_revoke) %}
76
+ {%- set target_relations = target_relations.append(target_relation) %}
77
+ {%- endfor -%}
78
+
79
+ {{ run_hooks(post_hooks, inside_transaction=True) }}
80
+ -- `COMMIT` happens here:
81
+ {{ adapter.commit() }}
82
+ {{ run_hooks(post_hooks, inside_transaction=False) }}
83
+
84
+ {{ return({'relations': target_relations}) }}
85
+ {%- endmaterialization -%}
@@ -1,7 +1,6 @@
1
1
  {{
2
2
  config(
3
- materialized='divided_view',
4
- chunk=20
3
+ materialized='divided_view'
5
4
  )
6
5
  }}
7
6
  -- depends_on: {{ ref('quollio_stats_profiling_columns') }}
@@ -0,0 +1,96 @@
1
+ WITH columns AS (
2
+ SELECT
3
+ table_catalog
4
+ , table_schema
5
+ , table_name
6
+ , column_name
7
+ , data_type
8
+ FROM
9
+ {{ source('account_usage', 'COLUMNS') }}
10
+ WHERE
11
+ deleted is null
12
+ AND table_name NOT LIKE 'QUOLLIO_%%'
13
+ GROUP BY
14
+ table_catalog
15
+ , table_schema
16
+ , table_name
17
+ , column_name
18
+ , data_type
19
+ ORDER BY
20
+ table_catalog
21
+ , table_schema
22
+ , table_name
23
+ ), accessible_tables AS (
24
+ SELECT
25
+ table_catalog
26
+ , table_schema
27
+ , name
28
+ FROM
29
+ {{ source('account_usage', 'GRANTS_TO_ROLES') }}
30
+ WHERE
31
+ granted_on in ('TABLE', 'MATERIALIZED VIEW')
32
+ AND grantee_name = '{{ var("query_role") }}'
33
+ AND privilege in ('SELECT', 'OWNERSHIP', 'REFERENCES')
34
+ AND deleted_on IS NULL
35
+ GROUP BY
36
+ table_catalog
37
+ , table_schema
38
+ , name
39
+ ), m_view_sys_columns AS (
40
+ SELECT
41
+ cols.table_catalog
42
+ , cols.table_schema
43
+ , cols.table_name
44
+ , cols.column_name
45
+ , cols.data_type
46
+ FROM
47
+ {{ source('account_usage', 'COLUMNS') }} cols
48
+ LEFT OUTER JOIN
49
+ {{ source('account_usage', 'TABLES') }} tbls
50
+ ON
51
+ cols.table_catalog = tbls.table_catalog
52
+ AND cols.table_schema = tbls.table_schema
53
+ AND cols.table_name = tbls.table_name
54
+ WHERE
55
+ tbls.table_type = 'MATERIALIZED VIEW'
56
+ AND cols.column_name = 'SYS_MV_SOURCE_PARTITION'
57
+ ), implicit_columns_removed AS (
58
+ SELECT
59
+ c.table_catalog
60
+ , c.table_schema
61
+ , c.table_name
62
+ , c.column_name
63
+ , c.data_type
64
+ FROM
65
+ columns c
66
+ INNER JOIN
67
+ accessible_tables a
68
+ ON
69
+ c.table_catalog = a.table_catalog
70
+ AND c.table_schema = a.table_schema
71
+ AND c.table_name = a.name
72
+ MINUS
73
+ SELECT
74
+ table_catalog
75
+ , table_schema
76
+ , table_name
77
+ , column_name
78
+ , data_type
79
+ FROM
80
+ m_view_sys_columns
81
+ ), final AS (
82
+ SELECT
83
+ table_catalog
84
+ , table_schema
85
+ , table_name
86
+ , column_name
87
+ , data_type
88
+ , case when data_type in('NUMBER','DECIMAL', 'DEC', 'NUMERIC',
89
+ 'INT', 'INTEGER', 'BIGINT', 'SMALLINT',
90
+ 'TINYINT', 'BYTEINT')
91
+ THEN true
92
+ else false END AS is_calculable
93
+ FROM
94
+ implicit_columns_removed
95
+ )
96
+ select * from final