quollio-core 0.4.6__tar.gz → 0.4.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. {quollio_core-0.4.6 → quollio_core-0.4.7}/PKG-INFO +6 -1
  2. {quollio_core-0.4.6 → quollio_core-0.4.7}/pyproject.toml +5 -0
  3. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/__init__.py +1 -1
  4. quollio_core-0.4.7/quollio_core/bigquery.py +114 -0
  5. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/bricks.py +26 -6
  6. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/snowflake/macros/materialization/divided_view.sql +2 -1
  7. quollio_core-0.4.7/quollio_core/dbt_projects/snowflake/models/quollio_stats_profiling_columns.sql +96 -0
  8. quollio_core-0.4.7/quollio_core/profilers/bigquery.py +81 -0
  9. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/profilers/databricks.py +6 -6
  10. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/profilers/lineage.py +14 -0
  11. quollio_core-0.4.7/quollio_core/repository/bigquery.py +61 -0
  12. quollio_core-0.4.6/quollio_core/dbt_projects/snowflake/models/quollio_stats_profiling_columns.sql +0 -59
  13. {quollio_core-0.4.6 → quollio_core-0.4.7}/LICENSE +0 -0
  14. {quollio_core-0.4.6 → quollio_core-0.4.7}/README.md +0 -0
  15. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/databricks/.gitignore +0 -0
  16. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/databricks/README.md +0 -0
  17. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/databricks/analyses/.gitkeep +0 -0
  18. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/databricks/dbt_project.yml +0 -0
  19. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/databricks/macros/.gitkeep +0 -0
  20. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/databricks/models/quollio_lineage_column_level.sql +0 -0
  21. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/databricks/models/quollio_lineage_column_level.yml +0 -0
  22. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/databricks/models/quollio_lineage_table_level.sql +0 -0
  23. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/databricks/models/quollio_lineage_table_level.yml +0 -0
  24. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/databricks/models/sources.yml +0 -0
  25. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/databricks/package-lock.yml +0 -0
  26. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/databricks/packages.yml +0 -0
  27. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/databricks/profiles/profiles_template.yml +0 -0
  28. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/databricks/seeds/.gitkeep +0 -0
  29. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/databricks/snapshots/.gitkeep +0 -0
  30. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/redshift/README.md +0 -0
  31. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/redshift/analyses/.gitkeep +0 -0
  32. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/redshift/dbt_project.yml +0 -0
  33. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/redshift/macros/.gitkeep +0 -0
  34. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/redshift/macros/materialization/divided_view.sql +0 -0
  35. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/redshift/models/quollio_lineage_table_level.sql +0 -0
  36. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/redshift/models/quollio_lineage_table_level.yml +0 -0
  37. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/redshift/models/quollio_lineage_view_level.sql +0 -0
  38. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/redshift/models/quollio_lineage_view_level.yml +0 -0
  39. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/redshift/models/quollio_sqllineage_sources.sql +0 -0
  40. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/redshift/models/quollio_sqllineage_sources.yml +0 -0
  41. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/redshift/models/quollio_stats_columns.sql +0 -0
  42. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/redshift/models/quollio_stats_columns.yml +0 -0
  43. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/redshift/models/quollio_stats_profiling_columns.sql +0 -0
  44. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/redshift/models/quollio_stats_profiling_columns.yml +0 -0
  45. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/redshift/models/sources.yml +0 -0
  46. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/redshift/package-lock.yml +0 -0
  47. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/redshift/packages.yml +0 -0
  48. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/redshift/profiles/profiles_template.yml +0 -0
  49. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/redshift/seeds/.gitkeep +0 -0
  50. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/redshift/snapshots/.gitkeep +0 -0
  51. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/seeds/.gitkeep +0 -0
  52. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/snowflake/README.md +0 -0
  53. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/snowflake/analyses/.gitkeep +0 -0
  54. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/snowflake/dbt_project.yml +0 -0
  55. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/snowflake/macros/.gitkeep +0 -0
  56. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/snowflake/models/quollio_lineage_column_level.sql +0 -0
  57. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/snowflake/models/quollio_lineage_column_level.yml +0 -0
  58. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/snowflake/models/quollio_lineage_table_level.sql +0 -0
  59. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/snowflake/models/quollio_lineage_table_level.yml +0 -0
  60. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/snowflake/models/quollio_sqllineage_sources.sql +0 -0
  61. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/snowflake/models/quollio_sqllineage_sources.yml +0 -0
  62. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/snowflake/models/quollio_stats_columns.sql +0 -0
  63. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/snowflake/models/quollio_stats_columns.yml +0 -0
  64. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/snowflake/models/quollio_stats_profiling_columns.yml +0 -0
  65. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/snowflake/models/sources.yml +0 -0
  66. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/snowflake/package-lock.yml +0 -0
  67. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/snowflake/packages.yml +0 -0
  68. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/snowflake/profiles/profiles_template.yml +0 -0
  69. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/snowflake/seeds/.gitkeep +0 -0
  70. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/dbt_projects/snowflake/snapshots/.gitkeep +0 -0
  71. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/helper/__init__.py +0 -0
  72. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/helper/core.py +0 -0
  73. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/helper/env_default.py +0 -0
  74. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/profilers/__init__.py +0 -0
  75. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/profilers/redshift.py +0 -0
  76. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/profilers/snowflake.py +0 -0
  77. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/profilers/sqllineage.py +0 -0
  78. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/profilers/stats.py +0 -0
  79. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/redshift.py +0 -0
  80. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/repository/__init__.py +0 -0
  81. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/repository/databricks.py +0 -0
  82. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/repository/dbt.py +0 -0
  83. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/repository/qdc.py +0 -0
  84. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/repository/redshift.py +0 -0
  85. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/repository/snowflake.py +0 -0
  86. {quollio_core-0.4.6 → quollio_core-0.4.7}/quollio_core/snowflake.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: quollio-core
3
- Version: 0.4.6
3
+ Version: 0.4.7
4
4
  Summary: Quollio Core
5
5
  Author-email: quollio-dev <qt.dev@quollio.com>
6
6
  Maintainer-email: RyoAriyama <ryo.arym@gmail.com>, tharuta <35373297+TakumiHaruta@users.noreply.github.com>
@@ -31,6 +31,11 @@ Requires-Dist: snowflake-connector-python==3.5.0
31
31
  Requires-Dist: databricks-sdk==0.17.0
32
32
  Requires-Dist: databricks-sql-connector==2.9.5
33
33
  Requires-Dist: sqlglot==20.8.0
34
+ Requires-Dist: google-cloud==0.34.0
35
+ Requires-Dist: google-cloud-bigquery==3.22.0
36
+ Requires-Dist: google-cloud-datacatalog==3.19.0
37
+ Requires-Dist: google-cloud-datacatalog-lineage==0.3.6
38
+ Requires-Dist: google-api-python-client==2.131.0
34
39
  Requires-Dist: black>=22.3.0 ; extra == "test"
35
40
  Requires-Dist: coverage>=7.3.2 ; extra == "test"
36
41
  Requires-Dist: isort>=5.10.1 ; extra == "test"
@@ -43,6 +43,11 @@ dependencies = [
43
43
  ,"databricks-sdk==0.17.0"
44
44
  ,"databricks-sql-connector==2.9.5"
45
45
  ,"sqlglot==20.8.0"
46
+ ,"google-cloud==0.34.0"
47
+ ,"google-cloud-bigquery==3.22.0"
48
+ ,"google-cloud-datacatalog==3.19.0"
49
+ ,"google-cloud-datacatalog-lineage==0.3.6"
50
+ ,"google-api-python-client==2.131.0"
46
51
  ]
47
52
  dynamic = ["version", "description"]
48
53
 
@@ -1,4 +1,4 @@
1
1
  """Quollio Core"""
2
2
 
3
- __version__ = "0.4.6"
3
+ __version__ = "0.4.7"
4
4
  __author__ = "Quollio Technologies, Inc"
@@ -0,0 +1,114 @@
1
+ import argparse
2
+ import json
3
+ import logging
4
+
5
+ from quollio_core.helper.env_default import env_default
6
+ from quollio_core.profilers.bigquery import bigquery_table_lineage
7
+ from quollio_core.repository import qdc
8
+ from quollio_core.repository.bigquery import get_credentials, get_org_id
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
def load_lineage(
    qdc_client: qdc.QDCExternalAPIClient, project_id: str, regions: list, tenant_id: str, credentials: dict, org_id: str
):
    """Forward the given settings to ``bigquery_table_lineage``.

    Args:
        qdc_client: Client used to send lineage to the Quollio External API.
        project_id: GCP project whose tables are inspected.
        regions: Locations to query for lineage links.
        tenant_id: Quollio tenant (company) id the lineage belongs to.
        credentials: Google credentials handed through unchanged.
        org_id: Identifier passed on as ``org_id`` to the profiler.

    Returns:
        None. The result of the delegated call is not propagated.
    """
    lineage_kwargs = dict(
        qdc_client=qdc_client,
        tenant_id=tenant_id,
        project_id=project_id,
        regions=regions,
        credentials=credentials,
        org_id=org_id,
    )
    bigquery_table_lineage(**lineage_kwargs)
24
+
25
+
26
if __name__ == "__main__":
    # Command-line entry point: parse the options, then run every requested command.
    # Each option is wired through env_default(...), presumably so that it can also be
    # supplied through the named environment variable -- confirm in helper/env_default.py.
    parser = argparse.ArgumentParser(
        prog="Quollio Intelligence Agent for Google BigQuery",
        description="Collect lineage and stats from Google BigQuery and load to Quollio Data Catalog",
        epilog="Copyright (c) 2024 Quollio Technologies, Inc.",
    )
    parser.add_argument(
        "commands",
        choices=["load_lineage"],
        type=str,
        nargs="+",
        help="""
        The command to execute.
        'load_lineage': Load lineage data from Google Data Catalog to Quollio,
        """,
    )
    parser.add_argument(
        "--credentials",
        type=str,
        action=env_default("GOOGLE_APPLICATION_CREDENTIALS"),
        help="Credentials for Google Cloud Platform",
    )
    parser.add_argument(
        "--tenant_id",
        type=str,
        action=env_default("TENANT_ID"),
        required=False,
        help="The tenant id (company id) where the lineage and stats are loaded",
    )
    parser.add_argument(
        "--api_url",
        type=str,
        action=env_default("QDC_API_URL"),
        required=False,
        help="The base URL of Quollio External API",
    )
    parser.add_argument(
        "--client_id",
        type=str,
        action=env_default("QDC_CLIENT_ID"),
        required=False,
        help="The client id that is created on Quollio console to let clients access Quollio External API",
    )
    parser.add_argument(
        "--client_secret",
        type=str,
        action=env_default("QDC_CLIENT_SECRET"),
        required=False,
        help="The client secret that is created on Quollio console to let clients access Quollio External API",
    )
    parser.add_argument(
        "--project_id",
        type=str,
        action=env_default("GCP_PROJECT_ID"),
        required=False,
        help="GCP Project ID",
    )
    parser.add_argument(
        "--regions",
        type=str,
        action=env_default("GCP_REGIONS"),
        required=False,
        help="GCP regions where the data is located. Multiple regions can be provided separated by space.",
        nargs="+",
    )

    # No explicit "no command" check is needed: a positional argument declared with
    # nargs="+" makes argparse itself reject an empty command list.
    args = parser.parse_args()

    if "load_lineage" in args.commands:
        # argparse never splits a *default* value according to nargs, so a value that did not
        # come from the command line can reach this point as one space-separated string.
        # Iterating such a string downstream would query one "region" per character.
        regions = args.regions.split() if isinstance(args.regions, str) else args.regions
        if not regions:
            parser.error("at least one region is required to load lineage (--regions)")

        qdc_client = qdc.QDCExternalAPIClient(
            base_url=args.api_url, client_id=args.client_id, client_secret=args.client_secret
        )

        # The credentials option carries the service-account key as JSON text, not a file path.
        credentials_json = json.loads(args.credentials)
        credentials = get_credentials(credentials_json=credentials_json)
        org_id = get_org_id(credentials_json=credentials_json)

        load_lineage(
            qdc_client=qdc_client,
            project_id=args.project_id,
            regions=regions,
            tenant_id=args.tenant_id,
            credentials=credentials,
            org_id=org_id,
        )
@@ -17,7 +17,7 @@ logger = logging.getLogger(__name__)
17
17
 
18
18
  def build_view(
19
19
  conn: db.DatabricksConnectionConfig,
20
- target_tables: str,
20
+ target_tables: str = "",
21
21
  log_level: str = "info",
22
22
  ) -> None:
23
23
  logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
@@ -41,7 +41,13 @@ def build_view(
41
41
  options=["--no-use-colors", "--log-level", log_level],
42
42
  )
43
43
 
44
- run_options = ["--no-use-colors", "--log-level", log_level, "--select", target_tables]
44
+ run_options = ["--no-use-colors", "--log-level", log_level]
45
+
46
+ if target_tables is not None:
47
+ target_tables_str = " ".join(target_tables)
48
+ run_options.append("--select")
49
+ run_options.append(target_tables_str)
50
+
45
51
  dbt_client.invoke(
46
52
  cmd="run",
47
53
  project_dir=project_path,
@@ -106,7 +112,6 @@ if __name__ == "__main__":
106
112
  'build_view': Build views using dbt,
107
113
  'load_lineage': Load lineage data from created views to Quollio,
108
114
  'load_stats': Load stats from created views to Quollio,
109
- 'load_sqllineage': Load lineage data from sql parse result(alpha),
110
115
  """,
111
116
  )
112
117
  parser.add_argument(
@@ -193,8 +198,8 @@ if __name__ == "__main__":
193
198
  parser.add_argument(
194
199
  "--target_tables",
195
200
  type=str,
196
- nargs="*",
197
- choices=["quollio_lineage_table_level", "quollio_lineage_view_level"],
201
+ nargs="+",
202
+ choices=["quollio_lineage_table_level", "quollio_lineage_column_level"],
198
203
  action=env_default("DATABRICKS_TARGET_TABLES"),
199
204
  required=False,
200
205
  help="Target tables you want to create with dbt module. \
@@ -203,6 +208,16 @@ if __name__ == "__main__":
203
208
  if you want to create two or more tables",
204
209
  )
205
210
 
211
+ parser.add_argument(
212
+ "--monitoring_table_suffix",
213
+ type=str,
214
+ action=env_default("DATABRICKS_MONITORING_TABLE_SUFFIX"),
215
+ required=False,
216
+ help="Sets the monitoring tables suffix for databricks. \
217
+ This is used to identify the monitoring tables created by the databricks monitoring tool. \
218
+ Default value is _profile_metrics",
219
+ )
220
+
206
221
  args = parser.parse_args()
207
222
 
208
223
  conn = db.DatabricksConnectionConfig(
@@ -234,4 +249,9 @@ if __name__ == "__main__":
234
249
  qdc_client = qdc.QDCExternalAPIClient(
235
250
  base_url=args.api_url, client_id=args.client_id, client_secret=args.client_secret
236
251
  )
237
- databricks_column_stats(conn=conn, qdc_client=qdc_client, tenant_id=args.tenant_id)
252
+ databricks_column_stats(
253
+ conn=conn,
254
+ qdc_client=qdc_client,
255
+ tenant_id=args.tenant_id,
256
+ monitoring_table_suffix=args.monitoring_table_suffix,
257
+ )
@@ -24,6 +24,7 @@ SELECT * FROM {{ ref('quollio_stats_profiling_columns') }} WHERE NOT startswit
24
24
  {%- set build_sql %}
25
25
  {%- for record in records[i: i+chunk] -%}
26
26
  {%- if not loop.first %}UNION{% endif %}
27
+
27
28
  SELECT
28
29
  DISTINCT
29
30
  '{{record[0]}}' as db_name
@@ -38,7 +39,7 @@ SELECT * FROM {{ ref('quollio_stats_profiling_columns') }} WHERE NOT startswit
38
39
  , {% if record[5] == true %}median("{{record[3]}}"){% else %}null{% endif %} AS median_value
39
40
  , {% if record[5] == true %}approx_top_k("{{record[3]}}")[0][0]{% else %}null{% endif %} AS mode_value
40
41
  , {% if record[5] == true %}stddev("{{record[3]}}"){% else %}null{% endif %} AS stddev_value
41
- FROM {{ record[0] }}.{{ record[1] }}.{{ record[2] }} {{ var("sample_method") }}
42
+ FROM "{{record[0]}}"."{{record[1]}}"."{{record[2]}}" {{ var("sample_method") }}
42
43
  {% endfor -%}
43
44
  {%- endset %}
44
45
  -- create a view with a index as suffix
@@ -0,0 +1,96 @@
1
-- Columns to profile: every live column of a table the query role can read,
-- minus the system column of materialized views, flagged as numeric or not.
WITH columns AS (
    -- Live columns only; tables matching the QUOLLIO_ prefix pattern are left out.
    -- Grouping by every selected column removes duplicate rows.
    SELECT
        table_catalog
        , table_schema
        , table_name
        , column_name
        , data_type
    FROM
        {{ source('account_usage', 'COLUMNS') }}
    WHERE
        deleted IS NULL
        AND table_name NOT LIKE 'QUOLLIO_%%'
    GROUP BY
        table_catalog
        , table_schema
        , table_name
        , column_name
        , data_type
    ORDER BY
        table_catalog
        , table_schema
        , table_name
), accessible_tables AS (
    -- Tables and materialized views on which the query role holds an active grant.
    SELECT
        table_catalog
        , table_schema
        , name
    FROM
        {{ source('account_usage', 'GRANTS_TO_ROLES') }}
    WHERE
        granted_on IN ('TABLE', 'MATERIALIZED VIEW')
        AND grantee_name = '{{ var("query_role") }}'
        AND privilege IN ('SELECT', 'OWNERSHIP', 'REFERENCES')
        AND deleted_on IS NULL
    GROUP BY
        table_catalog
        , table_schema
        , name
), m_view_sys_columns AS (
    -- The SYS_MV_SOURCE_PARTITION column of materialized views, to be subtracted below.
    SELECT
        cols.table_catalog
        , cols.table_schema
        , cols.table_name
        , cols.column_name
        , cols.data_type
    FROM
        {{ source('account_usage', 'COLUMNS') }} cols
    LEFT OUTER JOIN
        {{ source('account_usage', 'TABLES') }} tbls
    ON
        cols.table_catalog = tbls.table_catalog
        AND cols.table_schema = tbls.table_schema
        AND cols.table_name = tbls.table_name
    WHERE
        tbls.table_type = 'MATERIALIZED VIEW'
        AND cols.column_name = 'SYS_MV_SOURCE_PARTITION'
), implicit_columns_removed AS (
    -- Accessible columns with the materialized-view system columns removed.
    SELECT
        c.table_catalog
        , c.table_schema
        , c.table_name
        , c.column_name
        , c.data_type
    FROM
        columns c
    INNER JOIN
        accessible_tables a
    ON
        c.table_catalog = a.table_catalog
        AND c.table_schema = a.table_schema
        AND c.table_name = a.name
    MINUS
    SELECT
        table_catalog
        , table_schema
        , table_name
        , column_name
        , data_type
    FROM
        m_view_sys_columns
), final AS (
    -- is_calculable is true only for the listed numeric data types.
    SELECT
        table_catalog
        , table_schema
        , table_name
        , column_name
        , data_type
        , CASE WHEN data_type IN ('NUMBER', 'DECIMAL', 'DEC', 'NUMERIC',
                                  'INT', 'INTEGER', 'BIGINT', 'SMALLINT',
                                  'TINYINT', 'BYTEINT')
            THEN true
            ELSE false END AS is_calculable
    FROM
        implicit_columns_removed
)
SELECT * FROM final
@@ -0,0 +1,81 @@
1
+ import logging
2
+ from typing import Any, Dict, List
3
+
4
+ from quollio_core.profilers.lineage import gen_table_lineage_payload, parse_bigquery_table_lineage
5
+ from quollio_core.repository import qdc
6
+ from quollio_core.repository.bigquery import BigQueryClient, GCPLineageClient, get_entitiy_reference, get_search_request
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
def bigquery_table_lineage(
    qdc_client: qdc.QDCExternalAPIClient,
    tenant_id: str,
    project_id: str,
    regions: list,
    org_id: str,
    credentials: Any,
):
    """Collect table-level lineage for a project and send it to Quollio.

    Lists the project's datasets and tables, looks up lineage links for each table in
    every given region, converts them to payloads and calls
    ``qdc_client.update_lineage_by_id`` once per downstream table.

    Args:
        qdc_client: Client used to update lineage in Quollio.
        tenant_id: Tenant id passed to the payload generator.
        project_id: GCP project whose datasets are listed.
        regions: Locations in which lineage links are searched.
        org_id: Passed to the payload generator as ``endpoint``.
        credentials: Google credentials used to build both API clients.
    """
    lineage_client = GCPLineageClient(credentials)
    bq_client = BigQueryClient(credentials)

    table_names = generate_table_list(bq_client.list_datasets(project_id), bq_client)
    raw_links = generate_lineage_links(table_names, lineage_client, project_id, regions)
    parsed_links = parse_bigquery_table_lineage(raw_links)

    lineage_inputs = gen_table_lineage_payload(tenant_id=tenant_id, endpoint=org_id, tables=parsed_links)

    # Only requests answered with HTTP 200 are counted as ingested.
    ingested = 0
    for lineage_input in lineage_inputs:
        logger.info(
            "Generating table lineage. downstream: %s -> %s-> %s",
            lineage_input.downstream_database_name,
            lineage_input.downstream_schema_name,
            lineage_input.downstream_table_name,
        )
        status_code = qdc_client.update_lineage_by_id(
            global_id=lineage_input.downstream_global_id,
            payload=lineage_input.upstreams.as_dict(),
        )
        if status_code == 200:
            ingested += 1
    logger.info("Generating table lineage is finished. %s lineages are ingested.", ingested)
44
+
45
+
46
def generate_table_list(datasets: List[Any], bq_client: BigQueryClient) -> List[str]:
    """Return the fully qualified names of all tables and views in the given datasets.

    Args:
        datasets: Dataset objects exposing ``dataset_id`` (and normally ``project``),
            as returned by ``BigQueryClient.list_datasets``.
        bq_client: Wrapper used to list the tables of each dataset.

    Returns:
        Names in ``project.dataset.table`` form for every entry whose ``table_type`` is
        ``TABLE``, ``VIEW`` or ``MATERIALIZED_VIEW``.
    """
    wanted_types = ("TABLE", "VIEW", "MATERIALIZED_VIEW")

    all_table_names = []
    for dataset in datasets:
        # Qualify the dataset with its own project. A bare dataset id is resolved against the
        # client's default project, which is not necessarily the project that was listed.
        project = getattr(dataset, "project", None)
        dataset_ref = f"{project}.{dataset.dataset_id}" if project else dataset.dataset_id
        all_table_names.extend(
            f"{table.project}.{table.dataset_id}.{table.table_id}"
            for table in bq_client.list_tables(dataset_ref)
            if table.table_type in wanted_types
        )

    return all_table_names
62
+
63
+
64
def generate_lineage_links(
    all_tables: List[str], lineage_client: GCPLineageClient, project_id: str, regions: List[str]
) -> Dict[str, List[str]]:
    """Map each downstream table to the upstream tables found in the lineage links.

    For every table a link search is issued per region with that table as the target.
    The ``bigquery:`` prefix is stripped from both ends of each returned link.

    Returns:
        Dict keyed by downstream table name; each value lists its upstream table names
        in the order the links were returned.
    """
    prefix = "bigquery:"
    links_by_target = {}
    for table in all_tables:
        target_ref = get_entitiy_reference()
        target_ref.fully_qualified_name = f"bigquery:{table}"

        for region in regions:
            search = get_search_request(downstream_table=target_ref, project_id=project_id, region=region)
            for link in lineage_client.get_links(request=search):
                target_name = str(link.target.fully_qualified_name).replace(prefix, "")
                source_name = str(link.source.fully_qualified_name).replace(prefix, "")
                links_by_target.setdefault(target_name, []).append(source_name)

    return links_by_target
@@ -99,7 +99,7 @@ def databricks_column_level_lineage(
99
99
 
100
100
 
101
101
  def _get_monitoring_tables(
102
- conn: databricks.DatabricksConnectionConfig, monitoring_table_id: str = "_profile_metrics"
102
+ conn: databricks.DatabricksConnectionConfig, monitoring_table_suffix: str = "_profile_metrics"
103
103
  ) -> List[Dict[str, str]]:
104
104
  tables = []
105
105
  query = f"""
@@ -110,7 +110,7 @@ def _get_monitoring_tables(
110
110
  CONCAT(table_catalog, '.', table_schema, '.', table_name) AS table_fqdn
111
111
  FROM
112
112
  system.information_schema.tables
113
- WHERE table_name LIKE "%{monitoring_table_id}"
113
+ WHERE table_name LIKE "%{monitoring_table_suffix}"
114
114
  """
115
115
  with databricks.DatabricksQueryExecutor(config=conn) as databricks_executor:
116
116
  tables = databricks_executor.get_query_results(query)
@@ -123,9 +123,9 @@ def _get_monitoring_tables(
123
123
 
124
124
 
125
125
  def _get_column_stats(
126
- conn: databricks.DatabricksConnectionConfig, monitoring_table_id: str = "_profile_metrics"
126
+ conn: databricks.DatabricksConnectionConfig, monitoring_table_suffix: str = "_profile_metrics"
127
127
  ) -> List[Dict[str, str]]:
128
- tables = _get_monitoring_tables(conn, monitoring_table_id)
128
+ tables = _get_monitoring_tables(conn, monitoring_table_suffix)
129
129
  if not tables:
130
130
  return []
131
131
  stats = []
@@ -178,9 +178,9 @@ def databricks_column_stats(
178
178
  conn: databricks.DatabricksConnectionConfig,
179
179
  qdc_client: qdc.QDCExternalAPIClient,
180
180
  tenant_id: str,
181
- monitoring_table_id: str = "_profile_metrics",
181
+ monitoring_table_suffix: str = "_profile_metrics",
182
182
  ) -> None:
183
- table_stats = _get_column_stats(conn, monitoring_table_id)
183
+ table_stats = _get_column_stats(conn, monitoring_table_suffix)
184
184
  for table in table_stats:
185
185
  stats = gen_table_stats_payload(tenant_id, conn.host, table)
186
186
  for stat in stats:
@@ -153,3 +153,17 @@ def parse_databricks_table_lineage(results: List) -> List[Dict[str, Dict]]:
153
153
  payload["UPSTREAM_TABLES"] = json.loads(result["UPSTREAM_TABLES"])
154
154
  payloads.append(payload)
155
155
  return payloads
156
+
157
+
158
def parse_bigquery_table_lineage(tables: Dict) -> List[Dict[str, Dict]]:
    """Convert a downstream-to-upstreams mapping into lineage payload dicts.

    Args:
        tables: Maps a downstream table name to an iterable of upstream table names.

    Returns:
        One dict per downstream table with the keys ``DOWNSTREAM_TABLE_NAME`` and
        ``UPSTREAM_TABLES``; the latter is a list of ``{"upstream_object_name": name}``.
    """
    return [
        {
            "DOWNSTREAM_TABLE_NAME": downstream_name,
            "UPSTREAM_TABLES": [{"upstream_object_name": upstream_name} for upstream_name in upstream_names],
        }
        for downstream_name, upstream_names in tables.items()
    ]
@@ -0,0 +1,61 @@
1
+ import logging
2
+
3
+ from google.cloud.bigquery import Client
4
+ from google.cloud.datacatalog_lineage_v1 import EntityReference, LineageClient, SearchLinksRequest
5
+ from google.oauth2.service_account import Credentials
6
+ from googleapiclient.discovery import build
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
class BigQueryClient:
    """Small wrapper around the BigQuery ``Client`` for listing datasets and tables."""

    def __init__(self, credentials: Credentials) -> None:
        """Create the underlying BigQuery client from the given credentials."""
        self.client = self.__initialze(credentials=credentials)

    def __initialze(self, credentials: Credentials) -> Client:
        """Build the BigQuery client."""
        client = Client(credentials=credentials)
        return client

    def list_datasets(self, project_id) -> list:
        """Return all datasets of ``project_id`` as a list."""
        datasets = list(self.client.list_datasets(project_id))
        logger.debug("Found %s datasets in project %s", len(datasets), project_id)
        return datasets

    def list_tables(self, dataset_id) -> list:
        """Return all tables of ``dataset_id`` as a list."""
        tables = list(self.client.list_tables(dataset_id))
        logger.debug("Found %s tables in dataset %s", len(tables), dataset_id)
        # Return the list that was already fetched instead of issuing the same API request again.
        return tables
28
+
29
+
30
class GCPLineageClient:
    """Small wrapper around the Data Lineage ``LineageClient``."""

    def __init__(self, credentials: Credentials) -> None:
        """Create the underlying lineage client from the given credentials."""
        self.client = self.__initialze(credentials=credentials)

    def __initialze(self, credentials: Credentials) -> LineageClient:
        """Build the lineage client."""
        client = LineageClient(credentials=credentials)
        return client

    def get_links(self, request: SearchLinksRequest) -> list:
        """Return every link matching ``request`` as a list.

        ``search_links`` returns a pager. Reading ``.links`` from it only exposes the
        first page of results, so the pager itself is iterated to fetch all pages.
        """
        response = self.client.search_links(request)
        return list(response)
41
+
42
+
43
def get_entitiy_reference() -> EntityReference:
    """Return a new, empty ``EntityReference`` for the caller to fill in."""
    entity_reference = EntityReference()
    return entity_reference
45
+
46
+
47
def get_search_request(downstream_table: EntityReference, project_id: str, region: str) -> SearchLinksRequest:
    """Build a link search request that targets ``downstream_table``.

    The request's parent is the project/location path, with the region lower-cased.
    """
    parent_path = f"projects/{project_id}/locations/{region.lower()}"
    return SearchLinksRequest(target=downstream_table, parent=parent_path)
49
+
50
+
51
def get_credentials(credentials_json: dict) -> Credentials:
    """Create service-account ``Credentials`` from an already parsed key dict."""
    service_account_credentials = Credentials.from_service_account_info(credentials_json)
    return service_account_credentials
53
+
54
+
55
def get_org_id(credentials_json: dict) -> str:
    """Return the id of the parent of the service account's project.

    Looks up the project named by ``credentials_json["project_id"]`` through the
    Cloud Resource Manager v1 API and returns ``project["parent"]["id"]``.

    Raises:
        KeyError: If ``project_id`` is missing from the key dict, or the project
            response has no ``parent``/``id`` entry.
    """
    crm_service = build("cloudresourcemanager", "v1", credentials=get_credentials(credentials_json))
    project_request = crm_service.projects().get(projectId=credentials_json["project_id"])
    project = project_request.execute()
    # NOTE(review): the parent's type is not checked here -- presumably it can also be a
    # folder, or be absent for a project outside an organization. Confirm against the API.
    return project["parent"]["id"]
@@ -1,59 +0,0 @@
1
- WITH columns AS (
2
- SELECT
3
- table_catalog
4
- , table_schema
5
- , table_name
6
- , column_name
7
- , data_type
8
- FROM
9
- {{ source('account_usage', 'COLUMNS') }}
10
- WHERE
11
- deleted is null
12
- AND table_name NOT LIKE 'QUOLLIO_%%'
13
- GROUP BY
14
- table_catalog
15
- , table_schema
16
- , table_name
17
- , column_name
18
- , data_type
19
- ORDER BY
20
- table_catalog
21
- , table_schema
22
- , table_name
23
- ), accessible_tables AS (
24
- SELECT
25
- table_catalog
26
- , table_schema
27
- , name
28
- FROM
29
- {{ source('account_usage', 'GRANTS_TO_ROLES') }}
30
- WHERE
31
- granted_on in ('TABLE', 'MATERIALIZED VIEW')
32
- AND grantee_name = '{{ var("query_role") }}'
33
- AND privilege in ('SELECT', 'OWNERSHIP', 'REFERENCES')
34
- AND deleted_on IS NULL
35
- GROUP BY
36
- table_catalog
37
- , table_schema
38
- , name
39
- )
40
-
41
- SELECT
42
- c.table_catalog
43
- , c.table_schema
44
- , c.table_name
45
- , c.column_name
46
- , c.data_type
47
- , case when c.data_type in('NUMBER','DECIMAL', 'DEC', 'NUMERIC',
48
- 'INT', 'INTEGER', 'BIGINT', 'SMALLINT',
49
- 'TINYINT', 'BYTEINT')
50
- THEN true
51
- else false END AS is_calculable
52
- FROM
53
- columns c
54
- INNER JOIN
55
- accessible_tables a
56
- ON
57
- c.table_catalog = a.table_catalog
58
- AND c.table_schema = a.table_schema
59
- AND c.table_name = a.name
File without changes
File without changes