quollio-core 0.4.7__py3-none-any.whl → 0.4.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
quollio_core/__init__.py CHANGED
@@ -1,4 +1,4 @@
1
1
  """Quollio Core"""
2
2
 
3
- __version__ = "0.4.7"
3
+ __version__ = "0.4.10"
4
4
  __author__ = "Quollio Technologies, Inc"
quollio_core/bigquery.py CHANGED
@@ -3,6 +3,7 @@ import json
3
3
  import logging
4
4
 
5
5
  from quollio_core.helper.env_default import env_default
6
+ from quollio_core.helper.log import set_log_level
6
7
  from quollio_core.profilers.bigquery import bigquery_table_lineage
7
8
  from quollio_core.repository import qdc
8
9
  from quollio_core.repository.bigquery import get_credentials, get_org_id
@@ -88,14 +89,22 @@ if __name__ == "__main__":
88
89
  help="GCP regions where the data is located. Multiple regions can be provided separated by space.",
89
90
  nargs="+",
90
91
  )
92
+ parser.add_argument(
93
+ "--log_level",
94
+ type=str,
95
+ choices=["debug", "info", "warn", "error", "none"],
96
+ action=env_default("LOG_LEVEL"),
97
+ required=False,
98
+ help="The log level for dbt commands. Default value is info",
99
+ )
91
100
 
92
101
  args = parser.parse_args()
102
+ set_log_level(level=args.log_level)
93
103
 
94
104
  if len(args.commands) == 0:
95
105
  raise ValueError("No command is provided")
96
106
 
97
107
  if "load_lineage" in args.commands:
98
-
99
108
  qdc_client = qdc.QDCExternalAPIClient(
100
109
  base_url=args.api_url, client_id=args.client_id, client_secret=args.client_secret
101
110
  )
quollio_core/bricks.py CHANGED
@@ -2,8 +2,9 @@ import argparse
2
2
  import logging
3
3
  import os
4
4
 
5
- from quollio_core.helper.core import setup_dbt_profile
5
+ from quollio_core.helper.core import setup_dbt_profile, trim_prefix
6
6
  from quollio_core.helper.env_default import env_default
7
+ from quollio_core.helper.log import set_log_level
7
8
  from quollio_core.profilers.databricks import (
8
9
  databricks_column_level_lineage,
9
10
  databricks_column_stats,
@@ -20,7 +21,6 @@ def build_view(
20
21
  target_tables: str = "",
21
22
  log_level: str = "info",
22
23
  ) -> None:
23
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
24
24
 
25
25
  logger.info("Build profiler views using dbt")
26
26
  # set parameters
@@ -59,20 +59,34 @@ def build_view(
59
59
 
60
60
def load_lineage(
    conn: db.DatabricksConnectionConfig,
    endpoint: str,
    qdc_client: qdc.QDCExternalAPIClient,
    tenant_id: str,
    enable_column_lineage: bool = False,
) -> None:
    """Ingest Databricks lineage into QDC.

    Table-level lineage is always ingested; column-level lineage is
    ingested only when ``enable_column_lineage`` is True.

    Args:
        conn: Databricks connection configuration used to run the lineage queries.
        endpoint: Databricks host used as the endpoint in lineage payloads.
        qdc_client: Authenticated QDC external API client.
        tenant_id: QDC tenant identifier.
        enable_column_lineage: Whether to also ingest column-to-column lineage.
            Defaults to False.
    """

    logger.info("Generate Databricks table to table lineage.")
    databricks_table_level_lineage(
        conn=conn,
        endpoint=endpoint,
        qdc_client=qdc_client,
        tenant_id=tenant_id,
        dbt_table_name="quollio_lineage_table_level",
    )

    if enable_column_lineage:
        # Fixed log message: the original was missing the space after the
        # period ("...{enable_column_lineage}.Generate...").
        logger.info(
            f"enable_column_lineage is set to {enable_column_lineage}. Generate Databricks column to column lineage."
        )
        databricks_column_level_lineage(
            conn=conn,
            endpoint=endpoint,
            qdc_client=qdc_client,
            tenant_id=tenant_id,
            dbt_table_name="quollio_lineage_column_level",
        )
    else:
        logger.info("Skip column lineage ingestion. Set enable_column_lineage to True if you ingest column lineage.")

    logger.info("Lineage data is successfully loaded.")
    return
@@ -80,14 +94,15 @@ def load_lineage(
80
94
 
81
95
  def load_column_stats(
82
96
  conn: db.DatabricksConnectionConfig,
97
+ endpoint: str,
83
98
  qdc_client: qdc.QDCExternalAPIClient,
84
99
  tenant_id: str,
85
100
  ) -> None:
86
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
87
101
 
88
102
  logger.info("Generate Databricks column stats.")
89
103
  databricks_column_stats(
90
104
  conn=conn,
105
+ endpoint=endpoint,
91
106
  qdc_client=qdc_client,
92
107
  tenant_id=tenant_id,
93
108
  )
@@ -207,7 +222,6 @@ if __name__ == "__main__":
207
222
  Please specify table name with blank delimiter like tableA tableB \
208
223
  if you want to create two or more tables",
209
224
  )
210
-
211
225
  parser.add_argument(
212
226
  "--monitoring_table_suffix",
213
227
  type=str,
@@ -217,11 +231,21 @@ if __name__ == "__main__":
217
231
  This is used to identify the monitoring tables created by the databricks monitoring tool. \
218
232
  Default value is _profile_metrics",
219
233
  )
234
+ parser.add_argument(
235
+ "--enable_column_lineage",
236
+ type=bool,
237
+ action=env_default("ENABLE_COLUMN_LINEAGE", store_true=True),
238
+ default=False,
239
+ required=False,
240
+ help="Whether to ingest column lineage into QDIC or not. Default value is False",
241
+ )
220
242
 
221
243
  args = parser.parse_args()
244
+ set_log_level(level=args.log_level)
222
245
 
223
246
  conn = db.DatabricksConnectionConfig(
224
- host=args.host,
247
+ # MEMO: Metadata agent allows the string 'https://' as a host name but is not allowed by intelligence agent.
248
+ host=trim_prefix(args.host, "https://"),
225
249
  http_path=args.http_path,
226
250
  client_id=args.databricks_client_id,
227
251
  client_secret=args.databricks_client_secret,
@@ -243,7 +267,13 @@ if __name__ == "__main__":
243
267
  qdc_client = qdc.QDCExternalAPIClient(
244
268
  base_url=args.api_url, client_id=args.client_id, client_secret=args.client_secret
245
269
  )
246
- load_lineage(conn=conn, qdc_client=qdc_client, tenant_id=args.tenant_id)
270
+ load_lineage(
271
+ conn=conn,
272
+ endpoint=args.host,
273
+ qdc_client=qdc_client,
274
+ tenant_id=args.tenant_id,
275
+ enable_column_lineage=args.enable_column_lineage,
276
+ )
247
277
 
248
278
  if "load_stats" in args.commands:
249
279
  qdc_client = qdc.QDCExternalAPIClient(
@@ -251,6 +281,7 @@ if __name__ == "__main__":
251
281
  )
252
282
  databricks_column_stats(
253
283
  conn=conn,
284
+ endpoint=args.host,
254
285
  qdc_client=qdc_client,
255
286
  tenant_id=args.tenant_id,
256
287
  monitoring_table_suffix=args.monitoring_table_suffix,
@@ -1,7 +1,7 @@
1
1
  version: 2
2
2
 
3
3
  model:
4
- - name: quollio_lineage_column_level
4
+ - name: quollio_lineage_table_level
5
5
  columns:
6
6
  - name: UPSTREAM_TABLES
7
7
  description: 'String column with all upstream tables in JSON format'
@@ -18,4 +18,4 @@ clean-targets:
18
18
  models:
19
19
  +dbt-osmosis: "{model}.yml"
20
20
  +grants:
21
- select: ["{{ var('query_user') }}"]
21
+ select: ["\"{{ var('query_user') }}\""]
@@ -1,28 +1,67 @@
1
1
  {%- materialization divided_view, default %}
2
2
  {%- set identifier = model['alias'] %}
3
3
  {%- set target_relations = [] %}
4
- {%- set chunk = config.get('chunk') %}
5
4
  {%- set grant_config = config.get('grants') %}
6
5
 
7
6
  {{ run_hooks(pre_hooks, inside_transaction=False) }}
8
7
  -- `BEGIN` happens here:
9
8
  {{ run_hooks(pre_hooks, inside_transaction=True) }}
10
9
 
11
- -- fetch records
12
- {%- set query_quollio_stats_profiling_columns -%}
13
- SELECT * FROM {{ ref('quollio_stats_profiling_columns') }} WHERE table_name not like 'quollio_%'
10
+ -- fetch target_tables
11
+ {%- set query_stats_target_tables -%}
12
+ SELECT
13
+ distinct
14
+ database_name
15
+ , schema_name
16
+ , table_name
17
+ FROM
18
+ {{ ref('quollio_stats_profiling_columns') }}
19
+ WHERE
20
+ table_name not like 'quollio_%%'
14
21
  {%- endset -%}
15
- {%- set results = run_query(query_quollio_stats_profiling_columns) -%}
22
+ {%- set results = run_query(query_stats_target_tables) -%}
16
23
  {%- if execute -%}
17
- {%- set records = results.rows -%}
24
+ {%- set stats_target_tables = results.rows -%}
18
25
  {%- else -%}
19
- {%- set records = [] -%}
26
+ {%- set stats_target_tables = [] -%}
27
+ {%- endif -%}
28
+
29
+ -- skip creating views if the target profiling columns don't exist.
30
+ {%- if stats_target_tables | length == 0 -%}
31
+ {% call statement("main") %}
32
+ {{ log("No records found. Just execute select stmt for skipping call statement.", info=True) }}
33
+ select null
34
+ {% endcall %}
35
+ {%- set full_refresh_mode = (should_full_refresh()) -%}
36
+ {%- set should_revoke = should_revoke(target_relation, full_refresh_mode) %}
20
37
  {%- endif -%}
21
38
 
22
39
  -- build sql
23
- {%- for i in range(0, records|length, chunk) -%}
24
- {%- set build_sql %}
25
- {%- for record in records[i: i+chunk] -%}
40
+ {%- for stats_target_table in stats_target_tables -%}
41
+ -- get columns for statistics.
42
+ -- LISTAGG function can't be used for sys table, then it's necessary to get column for each table.
43
+ -- See https://docs.aws.amazon.com/redshift/latest/dg/c_join_PG.html.
44
+ {%- set stats_target_columns %}
45
+ SELECT
46
+ database_name
47
+ , schema_name
48
+ , table_name
49
+ , column_name
50
+ , is_bool
51
+ , is_calculable
52
+ FROM
53
+ {{ ref('quollio_stats_profiling_columns') }}
54
+ WHERE
55
+ database_name = '{{stats_target_table[0]}}'
56
+ AND schema_name = '{{stats_target_table[1]}}'
57
+ AND table_name = '{{stats_target_table[2]}}'
58
+ {%- endset -%}
59
+
60
+ {%- set results = run_query(stats_target_columns) -%}
61
+ {%- set stats_target_columns = results.rows -%}
62
+
63
+ {%- set sql_for_column_stats %}
64
+ {%- for stats_target_column in stats_target_columns -%}
26
65
  {%- if not loop.first -%}UNION{% endif %}
27
66
  SELECT
28
67
  main.db_name
@@ -41,33 +80,33 @@ SELECT * FROM {{ ref('quollio_stats_profiling_columns') }} WHERE table_name no
41
80
  (
42
81
  SELECT
43
82
  DISTINCT
44
- '{{record[0]}}'::varchar as db_name
45
- , '{{record[1]}}'::varchar as schema_name
46
- , '{{record[2]}}'::varchar as table_name
47
- , '{{record[3]}}'::varchar as column_name
48
- , {% if var("skip_heavy") == false and record[5] == true %}cast(max("{{record[3]}}") as varchar){% else %}null::varchar{% endif %} AS max_value
49
- , {% if var("skip_heavy") == false and record[5] == true %}cast(min("{{record[3]}}") as varchar){% else %}null::varchar{% endif %} AS min_value
83
+ '{{stats_target_column[0]}}'::varchar as db_name
84
+ , '{{stats_target_column[1]}}'::varchar as schema_name
85
+ , '{{stats_target_column[2]}}'::varchar as table_name
86
+ , '{{stats_target_column[3]}}'::varchar as column_name
87
+ , {% if var("aggregate_all") == True and stats_target_column[5] == True %}cast(max("{{stats_target_column[3]}}") as varchar){% else %}null::varchar{% endif %} AS max_value
88
+ , {% if var("aggregate_all") == True and stats_target_column[5] == True %}cast(min("{{stats_target_column[3]}}") as varchar){% else %}null::varchar{% endif %} AS min_value
50
89
  -- requires full table scan
51
- , {% if var("skip_heavy") == false %}cast(SUM(NVL2("{{record[3]}}", 0, 1)) as integer){% else %}null::integer{% endif %} AS null_count
52
- , APPROXIMATE COUNT(DISTINCT "{{record[3]}}") AS cardinality
90
+ , {% if var("aggregate_all") == True %}cast(SUM(NVL2("{{stats_target_column[3]}}", 0, 1)) as integer){% else %}null::integer{% endif %} AS null_count
91
+ , APPROXIMATE COUNT(DISTINCT "{{stats_target_column[3]}}") AS cardinality
53
92
  -- requires full table scan
54
- , {% if var("skip_heavy") == false and record[5] == true %}cast(avg("{{record[3]}}")as varchar){% else %}null::varchar{% endif %} AS avg_value
55
- , {% if var("skip_heavy") == false and record[5] == true %}cast(median("{{record[3]}}") as varchar){% else %}null::varchar{% endif %} AS median_value
93
+ , {% if var("aggregate_all") == True and stats_target_column[5] == True %}cast(avg("{{stats_target_column[3]}}")as varchar){% else %}null::varchar{% endif %} AS avg_value
94
+ , {% if var("aggregate_all") == True and stats_target_column[5] == True %}cast(median("{{stats_target_column[3]}}") as varchar){% else %}null::varchar{% endif %} AS median_value
56
95
  -- requires full table scan
57
- , {% if record[5] == true %}cast(STDDEV_SAMP("{{record[3]}}") as integer){% else %}null::integer{% endif %} AS stddev_value
58
- FROM {{ record[0] }}.{{ record[1] }}.{{ record[2] }}
96
+ , {% if stats_target_column[5] == True %}cast(STDDEV_SAMP("{{stats_target_column[3]}}") as integer){% else %}null::integer{% endif %} AS stddev_value
97
+ FROM {{ stats_target_column[0] }}.{{ stats_target_column[1] }}.{{ stats_target_column[2] }}
59
98
  ) main, (
60
- {%- if var("skip_heavy") == false and record[4] == false %}
99
+ {%- if var("aggregate_all") == True and stats_target_column[4] == false %}
61
100
  SELECT
62
- cast("{{record[3]}}" as varchar) mode_value
101
+ cast("{{stats_target_column[3]}}" as varchar) mode_value
63
102
  FROM (
64
103
  SELECT
65
104
  DISTINCT
66
- "{{record[3]}}"
105
+ "{{stats_target_column[3]}}"
67
106
  , ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC) AS row_num
68
- FROM {{ record[0] }}.{{ record[1] }}.{{ record[2] }}
107
+ FROM {{ stats_target_column[0] }}.{{ stats_target_column[1] }}.{{ stats_target_column[2] }}
69
108
  GROUP BY
70
- "{{record[3]}}"
109
+ "{{stats_target_column[3]}}"
71
110
  )
72
111
  WHERE
73
112
  row_num = 1
@@ -77,11 +116,11 @@ SELECT * FROM {{ ref('quollio_stats_profiling_columns') }} WHERE table_name no
77
116
  {% endfor -%}
78
117
  {%- endset %}
79
118
  -- create a view with a index as suffix
80
- {%- set target_identifier = "%s_%d"|format(model['name'], loop.index) %}
119
+ {%- set target_identifier = "%s_%s_%s_%s"|format(model['name'], stats_target_table[0], stats_target_table[1], stats_target_table[2]) %}
81
120
  {%- set target_relation = api.Relation.create(identifier=target_identifier, schema=schema, database=database, type='view') %}
82
121
  -- {{ drop_relation_if_exists(target_relation) }}
83
122
  {% call statement("main") %}
84
- {{ get_replace_view_sql(target_relation, build_sql) }}
123
+ {{ get_replace_view_sql(target_relation, sql_for_column_stats) }}
85
124
  {% endcall %}
86
125
  {%- set full_refresh_mode = (should_full_refresh()) -%}
87
126
  {%- set should_revoke = should_revoke(target_relation, full_refresh_mode) %}
@@ -1,7 +1,6 @@
1
1
  {{
2
2
  config(
3
- materialized='divided_view',
4
- chunk=20
3
+ materialized='divided_view'
5
4
  )
6
5
  }}
7
6
  -- depends_on: {{ ref('quollio_stats_profiling_columns') }}
@@ -1,52 +1,74 @@
1
1
  {%- materialization divided_view, default %}
2
2
  {%- set identifier = model['alias'] %}
3
3
  {%- set target_relations = [] %}
4
- {%- set chunk = config.get('chunk') %}
5
4
  {%- set grant_config = config.get('grants') %}
6
5
 
7
6
  {{ run_hooks(pre_hooks, inside_transaction=False) }}
8
7
  -- `BEGIN` happens here:
9
8
  {{ run_hooks(pre_hooks, inside_transaction=True) }}
10
9
 
11
- -- fetch records
12
- {%- set query_quollio_stats_profiling_columns -%}
13
- SELECT * FROM {{ ref('quollio_stats_profiling_columns') }} WHERE NOT startswith(table_name, 'QUOLLIO_')
10
+ -- fetch target_tables
11
+ {%- set query_stats_target_tables -%}
12
+ SELECT
13
+ TABLE_CATALOG
14
+ , TABLE_SCHEMA
15
+ , TABLE_NAME
16
+ , OBJECT_AGG(COLUMN_NAME, IS_CALCULABLE) AS COLUMNS_OBJ
17
+ FROM
18
+ {{ ref('quollio_stats_profiling_columns') }}
19
+ WHERE NOT startswith(table_name, 'QUOLLIO_')
20
+ GROUP BY
21
+ TABLE_CATALOG
22
+ , TABLE_SCHEMA
23
+ , TABLE_NAME
14
24
  {%- endset -%}
15
- {%- set results = run_query(query_quollio_stats_profiling_columns) -%}
25
+ {%- set results = run_query(query_stats_target_tables) -%}
16
26
  {%- if execute -%}
17
- {%- set records = results.rows -%}
27
+ {%- set stats_target_tables = results.rows -%}
18
28
  {%- else -%}
19
- {%- set records = [] -%}
29
+ {%- set stats_target_tables = [] -%}
20
30
  {%- endif -%}
21
31
 
22
- -- build sql
23
- {%- for i in range(0, records|length, chunk) -%}
24
- {%- set build_sql %}
25
- {%- for record in records[i: i+chunk] -%}
26
- {%- if not loop.first %}UNION{% endif %}
32
+ -- skip creating views if the target profiling columns don't exist.
33
+ {%- if stats_target_tables | length == 0 -%}
34
+ {% call statement("main") %}
35
+ {{ log("No records found. Just execute select stmt for skipping call statement.", info=True) }}
36
+ select null
37
+ {% endcall %}
38
+ {%- set full_refresh_mode = (should_full_refresh()) -%}
39
+ {%- set should_revoke = should_revoke(target_relation, full_refresh_mode) %}
40
+ {%- endif -%}
27
41
 
42
+ -- create view for each table
43
+ {%- for stats_target_table in stats_target_tables -%}
44
+ -- build sql for column value aggregation.
45
+ {%- set sql_for_column_stats %}
46
+ {% set columns_json = fromjson(stats_target_table[3]) %}
47
+ {%- for col_name, is_calclable in columns_json.items() -%}
48
+ {%- if not loop.first %}UNION{% endif %}
28
49
  SELECT
29
50
  DISTINCT
30
- '{{record[0]}}' as db_name
31
- , '{{record[1]}}' as schema_name
32
- , '{{record[2]}}' as table_name
33
- , '{{record[3]}}' as column_name
34
- , {% if record[5] == true %}CAST(max("{{record[3]}}") AS STRING){% else %}null{% endif %} AS max_value
35
- , {% if record[5] == true %}CAST(min("{{record[3]}}") AS STRING){% else %}null{% endif %} AS min_value
36
- , COUNT_IF("{{record[3]}}" IS NULL) AS null_count
37
- , APPROX_COUNT_DISTINCT("{{record[3]}}") AS cardinality
38
- , {% if record[5] == true %}avg("{{record[3]}}"){% else %}null{% endif %} AS avg_value
39
- , {% if record[5] == true %}median("{{record[3]}}"){% else %}null{% endif %} AS median_value
40
- , {% if record[5] == true %}approx_top_k("{{record[3]}}")[0][0]{% else %}null{% endif %} AS mode_value
41
- , {% if record[5] == true %}stddev("{{record[3]}}"){% else %}null{% endif %} AS stddev_value
42
- FROM "{{record[0]}}"."{{record[1]}}"."{{record[2]}}" {{ var("sample_method") }}
51
+ '{{stats_target_table[0]}}' as db_name
52
+ , '{{stats_target_table[1]}}' as schema_name
53
+ , '{{stats_target_table[2]}}' as table_name
54
+ , '{{col_name}}' as column_name
55
+ , {% if is_calclable == True %}CAST(MAX("{{col_name}}") AS STRING){% else %}NULL{% endif %} AS max_value
56
+ , {% if is_calclable == True %}CAST(MIN("{{col_name}}") AS STRING){% else %}NULL{% endif %} AS min_value
57
+ , COUNT_IF("{{col_name}}" IS NULL) AS null_count
58
+ , APPROX_COUNT_DISTINCT("{{col_name}}") AS cardinality
59
+ , {% if is_calclable == True %}AVG("{{col_name}}"){% else %}NULL{% endif %} AS avg_value
60
+ , {% if is_calclable == True %}MEDIAN("{{col_name}}"){% else %}NULL{% endif %} AS median_value
61
+ , {% if is_calclable == True %}APPROX_TOP_K("{{col_name}}")[0][0]{% else %}NULL{% endif %} AS mode_value
62
+ , {% if is_calclable == True %}STDDEV("{{col_name}}"){% else %}NULL{% endif %} AS stddev_value
63
+ FROM "{{stats_target_table[0]}}"."{{stats_target_table[1]}}"."{{stats_target_table[2]}}" {{ var("sample_method") }}
43
64
  {% endfor -%}
44
65
  {%- endset %}
66
+
45
67
  -- create a view with a index as suffix
46
- {%- set target_identifier = "%s_%d"|format(model['name'], loop.index) %}
47
- {%- set target_relation = api.Relation.create(identifier=target_identifier, schema=schema, database=database, type='view') %}
68
+ {%- set stats_view_identifier = "%s_%s_%s_%s"|format(model['name'], stats_target_table[0], stats_target_table[1], stats_target_table[2]) %}
69
+ {%- set target_relation = api.Relation.create(identifier=stats_view_identifier, schema=schema, database=database, type='view') %}
48
70
  {% call statement("main") %}
49
- {{ get_create_view_as_sql(target_relation, build_sql) }}
71
+ {{ get_create_view_as_sql(target_relation, sql_for_column_stats) }}
50
72
  {% endcall %}
51
73
  {%- set full_refresh_mode = (should_full_refresh()) -%}
52
74
  {%- set should_revoke = should_revoke(target_relation, full_refresh_mode) %}
@@ -1,7 +1,6 @@
1
1
  {{
2
2
  config(
3
- materialized='divided_view',
4
- chunk=20
3
+ materialized='divided_view'
5
4
  )
6
5
  }}
7
6
  -- depends_on: {{ ref('quollio_stats_profiling_columns') }}
@@ -31,3 +31,7 @@ def setup_dbt_profile(connections_json: Dict[str, str], template_path: str, temp
31
31
  with open(profile_path, "w") as profiles:
32
32
  yaml.dump(yaml.safe_load(profiles_body), profiles, default_flow_style=False, allow_unicode=True)
33
33
  return
34
+
35
+
def trim_prefix(s: str, prefix: str) -> str:
    """Return ``s`` with ``prefix`` removed once from the start.

    Bug fix: the previous implementation used ``str.lstrip(prefix)``, which
    strips any leading run of the *characters* contained in ``prefix``, not
    the prefix string itself.  E.g. ``"https://psts.example.com".lstrip("https://")``
    also eats the leading ``p``/``s``/``t`` characters of the host name.
    If ``s`` does not start with ``prefix`` it is returned unchanged.
    """
    if s.startswith(prefix):
        return s[len(prefix):]
    return s
@@ -6,6 +6,8 @@ Currently requires explicit naming of env vars to check for
6
6
 
7
7
  import argparse
8
8
  import os
9
+ from distutils.util import strtobool
10
+ from typing import Union
9
11
 
10
12
 
11
13
  # Courtesy of http://stackoverflow.com/a/10551190 with env-var retrieval fixed
@@ -28,9 +30,30 @@ class EnvDefault(argparse.Action):
28
30
  setattr(namespace, self.dest, values)
29
31
 
30
32
 
33
+ class EnvStoreTrue(argparse._StoreTrueAction):
34
+ """An argparse action class that auto-sets missing default values from env vars for store_true."""
35
+
36
+ def __init__(self, envvar, required=True, default=None, **kwargs):
37
+ # Only pass the arguments that argparse._StoreTrueAction expects
38
+ action_kwargs = {key: value for key, value in kwargs.items() if key in ("option_strings", "dest")}
39
+ if envvar in os.environ:
40
+ default = _convert_value_to_bool(os.environ[envvar])
41
+ if required and default:
42
+ required = False
43
+ super(EnvStoreTrue, self).__init__(default=default, required=required, **action_kwargs)
44
+
45
+
# functional sugar for the above
def env_default(envvar, store_true=False):
    """Build an argparse ``action=`` factory bound to ``envvar``.

    The returned callable constructs an ``EnvStoreTrue`` when ``store_true``
    is set, otherwise an ``EnvDefault``.
    """

    def wrapper(**kwargs):
        action_cls = EnvStoreTrue if store_true else EnvDefault
        return action_cls(envvar, **kwargs)

    return wrapper
54
+
55
+
def _convert_value_to_bool(v: Union[str, bool]) -> bool:
    """Coerce an environment-variable value to a bool.

    Replaces the previous dependency on ``distutils.util.strtobool``, which
    is deprecated (PEP 632) and removed with distutils in Python 3.12, with
    an equivalent inline parser: "y"/"yes"/"t"/"true"/"on"/"1" -> True and
    "n"/"no"/"f"/"false"/"off"/"0" -> False (case-insensitive).  Any other
    string raises ValueError, matching strtobool.  Non-string values are
    returned unchanged.
    """
    if isinstance(v, str):
        value = v.lower()
        if value in ("y", "yes", "t", "true", "on", "1"):
            return True
        if value in ("n", "no", "f", "false", "off", "0"):
            return False
        raise ValueError(f"invalid truth value {v!r}")
    return v
@@ -0,0 +1,17 @@
import logging

# Accepted level names mapped to logging constants.  "none" (an accepted
# CLI choice) and any other unrecognised value fall through to NOTSET,
# matching the original behaviour.
_LOG_LEVELS = {
    "info": logging.INFO,
    "debug": logging.DEBUG,
    "warn": logging.WARNING,
    "error": logging.ERROR,
    "critical": logging.CRITICAL,
}


def set_log_level(level: str = "info") -> None:
    """Configure root logging for the given level name.

    Accepts "debug"/"info"/"warn"/"error" (and "critical"); "none" or any
    unknown value maps to NOTSET, as before.

    Bug fix: argparse registers ``--log_level`` without a default
    (``required=False``), so ``args.log_level`` is ``None`` when the option
    and the LOG_LEVEL env var are both absent.  ``None`` previously fell
    into the ``else`` branch and configured NOTSET, contradicting the CLI
    help text ("Default value is info").  ``None`` now falls back to "info".
    """
    if level is None:
        level = "info"
    fmt = "%(asctime)s - %(levelname)s - %(name)s - %(message)s"
    logging.basicConfig(level=_LOG_LEVELS.get(level, logging.NOTSET), format=fmt)
@@ -14,11 +14,11 @@ logger = logging.getLogger(__name__)
14
14
 
15
15
  def databricks_table_level_lineage(
16
16
  conn: databricks.DatabricksConnectionConfig,
17
+ endpoint: str,
17
18
  qdc_client: qdc.QDCExternalAPIClient,
18
19
  tenant_id: str,
19
20
  dbt_table_name: str = "quollio_lineage_table_level",
20
21
  ) -> None:
21
- logging.basicConfig(level=logging.info, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
22
22
  with databricks.DatabricksQueryExecutor(config=conn) as databricks_executor:
23
23
  results = databricks_executor.get_query_results(
24
24
  query=f"""
@@ -31,7 +31,7 @@ def databricks_table_level_lineage(
31
31
  tables = parse_databricks_table_lineage(results)
32
32
  update_table_lineage_inputs = gen_table_lineage_payload(
33
33
  tenant_id=tenant_id,
34
- endpoint=conn.host,
34
+ endpoint=endpoint,
35
35
  tables=tables,
36
36
  )
37
37
 
@@ -55,11 +55,11 @@ def databricks_table_level_lineage(
55
55
 
56
56
  def databricks_column_level_lineage(
57
57
  conn: databricks.DatabricksConnectionConfig,
58
+ endpoint: str,
58
59
  qdc_client: qdc.QDCExternalAPIClient,
59
60
  tenant_id: str,
60
61
  dbt_table_name: str = "quollio_lineage_column_level",
61
62
  ) -> None:
62
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
63
63
  with databricks.DatabricksQueryExecutor(config=conn) as databricks_executor:
64
64
  results = databricks_executor.get_query_results(
65
65
  query=f"""
@@ -72,7 +72,7 @@ def databricks_column_level_lineage(
72
72
 
73
73
  update_column_lineage_inputs = gen_column_lineage_payload(
74
74
  tenant_id=tenant_id,
75
- endpoint=conn.host,
75
+ endpoint=endpoint,
76
76
  columns=results,
77
77
  )
78
78
 
@@ -110,7 +110,9 @@ def _get_monitoring_tables(
110
110
  CONCAT(table_catalog, '.', table_schema, '.', table_name) AS table_fqdn
111
111
  FROM
112
112
  system.information_schema.tables
113
- WHERE table_name LIKE "%{monitoring_table_suffix}"
113
+ WHERE
114
+ table_name LIKE "%{monitoring_table_suffix}"
115
+ AND table_name NOT LIKE ('quollio_%')
114
116
  """
115
117
  with databricks.DatabricksQueryExecutor(config=conn) as databricks_executor:
116
118
  tables = databricks_executor.get_query_results(query)
@@ -153,6 +155,8 @@ def _get_column_stats(
153
155
  MAX(t.window) AS LATEST
154
156
  FROM
155
157
  {monitoring_table} t
158
+ WHERE
159
+ t.column_name not in (':table')
156
160
  GROUP BY
157
161
  t.COLUMN_NAME,
158
162
  t.DATA_TYPE,
@@ -176,13 +180,14 @@ def _get_column_stats(
176
180
 
177
181
  def databricks_column_stats(
178
182
  conn: databricks.DatabricksConnectionConfig,
183
+ endpoint: str,
179
184
  qdc_client: qdc.QDCExternalAPIClient,
180
185
  tenant_id: str,
181
186
  monitoring_table_suffix: str = "_profile_metrics",
182
187
  ) -> None:
183
188
  table_stats = _get_column_stats(conn, monitoring_table_suffix)
184
189
  for table in table_stats:
185
- stats = gen_table_stats_payload(tenant_id, conn.host, table)
190
+ stats = gen_table_stats_payload(tenant_id=tenant_id, endpoint=endpoint, stats=table)
186
191
  for stat in stats:
187
192
  status_code = qdc_client.update_stats_by_id(
188
193
  global_id=stat.global_id,