PyPI - recce-nightly - Versions diffs - 1.2.0.20250506__py3-none-any.whl → 1.4.0.20250514__py3-none-any.whl - Mend

recce-nightly 1.2.0.20250506__py3-none-any.whl → 1.4.0.20250514__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of recce-nightly might be problematic. Click here for more details.

Files changed (93) hide show

recce/VERSION +1 -1
recce/__init__.py +22 -22
recce/adapter/base.py +11 -14
recce/adapter/dbt_adapter/__init__.py +355 -316
recce/adapter/dbt_adapter/dbt_version.py +3 -0
recce/adapter/sqlmesh_adapter.py +24 -35
recce/apis/check_api.py +39 -28
recce/apis/check_func.py +33 -27
recce/apis/run_api.py +25 -19
recce/apis/run_func.py +29 -23
recce/artifact.py +44 -49
recce/cli.py +484 -285
recce/config.py +42 -33
recce/core.py +52 -44
recce/data/404.html +1 -1
recce/data/_next/static/chunks/{368-7587b306577df275.js → 778-aef312bffb4c0312.js} +15 -15
recce/data/_next/static/chunks/8d700b6a.ed11a130057c7a47.js +1 -0
recce/data/_next/static/chunks/app/layout-c713a2829d3279e4.js +1 -0
recce/data/_next/static/chunks/app/page-7086764277331fcb.js +1 -0
recce/data/_next/static/chunks/{cd9f8d63-cf0d5a7b0f7a92e8.js → cd9f8d63-e020f408095ed77c.js} +3 -3
recce/data/_next/static/chunks/webpack-b787cb1a4f2293de.js +1 -0
recce/data/_next/static/css/88b8abc134cfd59a.css +3 -0
recce/data/index.html +2 -2
recce/data/index.txt +2 -2
recce/diff.py +6 -12
recce/event/__init__.py +74 -72
recce/event/collector.py +27 -20
recce/event/track.py +39 -27
recce/exceptions.py +1 -1
recce/git.py +7 -7
recce/github.py +57 -53
recce/models/__init__.py +1 -1
recce/models/check.py +6 -7
recce/models/run.py +1 -0
recce/models/types.py +27 -27
recce/pull_request.py +26 -24
recce/run.py +148 -111
recce/server.py +105 -88
recce/state.py +209 -177
recce/summary.py +168 -143
recce/tasks/__init__.py +3 -3
recce/tasks/core.py +11 -13
recce/tasks/dataframe.py +19 -17
recce/tasks/histogram.py +69 -34
recce/tasks/lineage.py +2 -2
recce/tasks/profile.py +152 -86
recce/tasks/query.py +139 -87
recce/tasks/rowcount.py +33 -30
recce/tasks/schema.py +14 -14
recce/tasks/top_k.py +35 -35
recce/tasks/valuediff.py +216 -152
recce/util/breaking.py +77 -84
recce/util/cll.py +55 -51
recce/util/io.py +19 -17
recce/util/logger.py +1 -1
recce/util/recce_cloud.py +70 -72
recce/util/singleton.py +4 -4
recce/yaml/__init__.py +7 -10
{recce_nightly-1.2.0.20250506.dist-info → recce_nightly-1.4.0.20250514.dist-info}/METADATA +5 -2
recce_nightly-1.4.0.20250514.dist-info/RECORD +143 -0
{recce_nightly-1.2.0.20250506.dist-info → recce_nightly-1.4.0.20250514.dist-info}/WHEEL +1 -1
tests/adapter/dbt_adapter/conftest.py +1 -0
tests/adapter/dbt_adapter/dbt_test_helper.py +28 -18
tests/adapter/dbt_adapter/test_dbt_adapter.py +0 -15
tests/adapter/dbt_adapter/test_dbt_cll.py +39 -32
tests/adapter/dbt_adapter/test_selector.py +22 -21
tests/tasks/test_histogram.py +58 -66
tests/tasks/test_lineage.py +36 -23
tests/tasks/test_preset_checks.py +45 -31
tests/tasks/test_profile.py +340 -15
tests/tasks/test_query.py +40 -40
tests/tasks/test_row_count.py +65 -46
tests/tasks/test_schema.py +65 -42
tests/tasks/test_top_k.py +22 -18
tests/tasks/test_valuediff.py +43 -32
tests/test_cli.py +71 -58
tests/test_config.py +7 -9
tests/test_core.py +5 -3
tests/test_dbt.py +7 -7
tests/test_pull_request.py +1 -1
tests/test_server.py +19 -13
tests/test_state.py +40 -27
tests/test_summary.py +18 -14
recce/data/_next/static/chunks/8d700b6a-f0b1f6b9e0d97ce2.js +0 -1
recce/data/_next/static/chunks/app/layout-9102e22cb73f74d6.js +0 -1
recce/data/_next/static/chunks/app/page-cee661090afbd6aa.js +0 -1
recce/data/_next/static/chunks/webpack-567d72f0bc0820d5.js +0 -1
recce_nightly-1.2.0.20250506.dist-info/RECORD +0 -142
/recce/data/_next/static/{Kcbs3GEIyH2LxgLYat0es → E_HPXsXdrqHg2YEHmU3mK}/_buildManifest.js +0 -0
/recce/data/_next/static/{Kcbs3GEIyH2LxgLYat0es → E_HPXsXdrqHg2YEHmU3mK}/_ssgManifest.js +0 -0
{recce_nightly-1.2.0.20250506.dist-info → recce_nightly-1.4.0.20250514.dist-info}/entry_points.txt +0 -0
{recce_nightly-1.2.0.20250506.dist-info → recce_nightly-1.4.0.20250514.dist-info}/licenses/LICENSE +0 -0
{recce_nightly-1.2.0.20250506.dist-info → recce_nightly-1.4.0.20250514.dist-info}/top_level.txt +0 -0

recce/tasks/dataframe.py CHANGED Viewed

@@ -10,14 +10,14 @@ from pydantic import BaseModel, Field
 class DataFrameColumnType(Enum):
-    NUMBER = 'number'
-    INTEGER = 'integer'
-    TEXT = 'text'
-    BOOLEAN = 'boolean'
-    DATE = 'date'
-    DATETIME = 'datetime'
-    TIMEDELTA = 'timedelta'
-    UNKNOWN = 'unknown'
+    NUMBER = "number"
+    INTEGER = "integer"
+    TEXT = "text"
+    BOOLEAN = "boolean"
+    DATE = "date"
+    DATETIME = "datetime"
+    TIMEDELTA = "timedelta"
+    UNKNOWN = "unknown"
 class DataFrameColumn(BaseModel):
@@ -32,19 +32,21 @@ class DataFrame(BaseModel):
     more: t.Optional[bool] = Field(None, description="Whether there are more rows to fetch")
     @staticmethod
-    def from_agate(table: 'agate.Table', limit: t.Optional[int] = None, more: t.Optional[bool] = None):
+    def from_agate(table: "agate.Table", limit: t.Optional[int] = None, more: t.Optional[bool] = None):
         from recce.adapter.dbt_adapter import dbt_version
-        if dbt_version < 'v1.8':
+        if dbt_version < "v1.8":
             import dbt.clients.agate_helper as agate_helper
         else:
             import dbt_common.clients.agate_helper as agate_helper
         import agate
         columns = []
         for col_name, col_type in zip(table.column_names, table.column_types):
-            has_integer = hasattr(agate_helper, 'Integer')
+            has_integer = hasattr(agate_helper, "Integer")
             if isinstance(col_type, agate.Number):
                 col_type = DataFrameColumnType.NUMBER
@@ -78,23 +80,23 @@ class DataFrame(BaseModel):
         return df
     @staticmethod
-    def from_pandas(pandas_df: 'pandas.DataFrame', limit: t.Optional[int] = None, more: t.Optional[bool] = None):
+    def from_pandas(pandas_df: "pandas.DataFrame", limit: t.Optional[int] = None, more: t.Optional[bool] = None):
         columns = []
         for column in pandas_df.columns:
             dtype = pandas_df[column].dtype
-            if dtype == 'int64':
+            if dtype == "int64":
                 col_type = DataFrameColumnType.INTEGER
-            elif dtype == 'float64':
+            elif dtype == "float64":
                 col_type = DataFrameColumnType.NUMBER
-            elif dtype == 'object':
+            elif dtype == "object":
                 col_type = DataFrameColumnType.TEXT
-            elif dtype == 'bool':
+            elif dtype == "bool":
                 col_type = DataFrameColumnType.BOOLEAN
             else:
                 col_type = DataFrameColumnType.UNKNOWN
             columns.append(DataFrameColumn(name=column, type=col_type))
-        s = pandas_df.to_json(orient='values')
+        s = pandas_df.to_json(orient="values")
         data = json.loads(s)
         df = DataFrame(

recce/tasks/histogram.py CHANGED Viewed

@@ -9,34 +9,66 @@ from pydantic import BaseModel
 from recce.core import default_context
 from recce.models import Check
 from recce.tasks import Task
-from recce.tasks.core import TaskResultDiffer, CheckValidator
+from recce.tasks.core import CheckValidator, TaskResultDiffer
 from recce.tasks.query import QueryMixin
 sql_datetime_types = [
-    "DATE", "DATETIME", "TIMESTAMP", "TIME",
+    "DATE",
+    "DATETIME",
+    "TIMESTAMP",
+    "TIME",
     "YEAR",  # Specific to MySQL/MariaDB
-    "DATETIME2", "SMALLDATETIME", "DATETIMEOFFSET",  # Specific to SQL Server
+    "DATETIME2",
+    "SMALLDATETIME",
+    "DATETIMEOFFSET",  # Specific to SQL Server
     "INTERVAL",  # Common in PostgreSQL and Oracle
-    "TIMESTAMPTZ", "TIMETZ",  # Specific to PostgreSQL
-    "TIMESTAMP WITH TIME ZONE", "TIMESTAMP WITH LOCAL TIME ZONE",  # Oracle
-    "TIMESTAMP_LTZ", "TIMESTAMP_NTZ", "TIMESTAMP_TZ",  # Specific to Snowflake
+    "TIMESTAMPTZ",
+    "TIMETZ",  # Specific to PostgreSQL
+    "TIMESTAMP WITH TIME ZONE",
+    "TIMESTAMP WITH LOCAL TIME ZONE",  # Oracle
+    "TIMESTAMP_LTZ",
+    "TIMESTAMP_NTZ",
+    "TIMESTAMP_TZ",  # Specific to Snowflake
 ]
 sql_integer_types = [
-    "TINYINT", "SMALLINT", "MEDIUMINT", "INT", "INTEGER", "BIGINT",  # Common across most databases
-    "INT2", "INT4", "INT8",  # PostgreSQL specific aliases
+    "TINYINT",
+    "SMALLINT",
+    "MEDIUMINT",
+    "INT",
+    "INTEGER",
+    "BIGINT",  # Common across most databases
+    "INT2",
+    "INT4",
+    "INT8",  # PostgreSQL specific aliases
     "UNSIGNED BIG INT",  # SQLite specific
     "NUMBER",  # Oracle, can be used as an integer with precision and scale
     "NUMERIC",  # Generally available in many SQL databases, used with precision and scale
-    "SMALLSERIAL", "SERIAL", "BIGSERIAL",  # PostgreSQL auto-increment types
-    "IDENTITY", "SMALLIDENTITY", "BIGIDENTITY",  # SQL Server specific auto-increment types
+    "SMALLSERIAL",
+    "SERIAL",
+    "BIGSERIAL",  # PostgreSQL auto-increment types
+    "IDENTITY",
+    "SMALLIDENTITY",
+    "BIGIDENTITY",  # SQL Server specific auto-increment types
     "BYTEINT",  # Specific to Snowflake, for storing very small integers
 ]
 sql_not_supported_types = [
-    "CHAR", "VARCHAR", "TINYTEXT", "TEXT", "MEDIUMTEXT", "LONGTEXT",
-    "NCHAR", "NVARCHAR", "VARCHAR2", "NVARCHAR2", "CLOB", "NCLOB",
-    "VARCHAR(MAX)", "XML", "JSON",
+    "CHAR",
+    "VARCHAR",
+    "TINYTEXT",
+    "TEXT",
+    "MEDIUMTEXT",
+    "LONGTEXT",
+    "NCHAR",
+    "NVARCHAR",
+    "VARCHAR2",
+    "NVARCHAR2",
+    "CLOB",
+    "NCLOB",
+    "VARCHAR(MAX)",
+    "XML",
+    "JSON",
     "BOOLEAN",  # PostgreSQL, SQLite, and others with native boolean support
     "TINYINT(1)",  # MySQL/MariaDB uses TINYINT(1) to represent boolean values
     "BIT",  # SQL Server and others use BIT to represent boolean values, where 1 is true and 0 is false
@@ -185,7 +217,7 @@ def query_numeric_histogram(task, node, column, column_type, min_value, max_valu
                 else:
                     counts[num_bins - 1] += count
         base_result = {
-            'counts': counts,
+            "counts": counts,
         }
     if curr is not None:
         counts = [0] * num_bins
@@ -199,7 +231,7 @@ def query_numeric_histogram(task, node, column, column_type, min_value, max_valu
                 else:
                     counts[num_bins - 1] += count
         curr_result = {
-            'counts': counts,
+            "counts": counts,
         }
     return base_result, curr_result, bin_edges, labels
@@ -209,7 +241,7 @@ def query_datetime_histogram(task, node, column, min_value, max_value):
     print(max_value, min_value, days_delta)
     # _type = None
     if days_delta > 365 * 4:
-        _type = 'yearly'
+        _type = "yearly"
         dmin = date(min_value.year, 1, 1)
         if max_value.year < 3000:
             dmax = date(max_value.year, 1, 1) + relativedelta(years=+1)
@@ -237,7 +269,7 @@ def query_datetime_histogram(task, node, column, min_value, max_value):
         else:
             dmax = date(3000, 1, 1)
         period = relativedelta(dmax, dmin)
-        num_buckets = (period.years * 12 + period.months)
+        num_buckets = period.years * 12 + period.months
         bin_edges = [dmin + relativedelta(months=i) for i in range(num_buckets + 1)]
         sql = f"""
         SELECT
@@ -285,18 +317,18 @@ def query_datetime_histogram(task, node, column, min_value, max_value):
     base_counts = [0] * num_buckets
     print(_type)
-    for (d, v) in base.rows:
+    for d, v in base.rows:
         i = bin_edges.index(d.date()) if isinstance(d, datetime) else bin_edges.index(d)
         base_counts[i] = v
     curr_counts = [0] * num_buckets
-    for (d, v) in curr.rows:
+    for d, v in curr.rows:
         i = bin_edges.index(d.date()) if isinstance(d, datetime) else bin_edges.index(d)
         curr_counts[i] = v
     base_result = {
-        'counts': base_counts,
+        "counts": base_counts,
     }
     curr_result = {
-        'counts': curr_counts,
+        "counts": curr_counts,
     }
     return base_result, curr_result, bin_edges
@@ -310,6 +342,7 @@ class HistogramDiffTask(Task, QueryMixin):
     def execute(self):
         from recce.adapter.dbt_adapter import DbtAdapter
         result = {}
         dbt_adapter: DbtAdapter = default_context().adapter
@@ -353,29 +386,31 @@ class HistogramDiffTask(Task, QueryMixin):
             labels = None
             if min_value is None or max_value is None:
                 base_result = {
-                    'counts': [],
+                    "counts": [],
                 }
                 current_result = {
-                    'counts': [],
+                    "counts": [],
                 }
                 bin_edges = []
                 labels = []
             elif column_type.upper() in sql_datetime_types:
                 base_result, current_result, bin_edges = query_datetime_histogram(
-                    self, node, column, min_value, max_value)
+                    self, node, column, min_value, max_value
+                )
             else:
                 base_result, current_result, bin_edges, labels = query_numeric_histogram(
-                    self, node, column, column_type, min_value, max_value, num_bins)
+                    self, node, column, column_type, min_value, max_value, num_bins
+                )
             if base_result:
-                base_result['total'] = base_total
+                base_result["total"] = base_total
             if current_result:
-                current_result['total'] = curr_total
-            result['base'] = base_result
-            result['current'] = current_result
-            result['min'] = min_value
-            result['max'] = max_value
-            result['bin_edges'] = bin_edges
-            result['labels'] = labels
+                current_result["total"] = curr_total
+            result["base"] = base_result
+            result["current"] = current_result
+            result["min"] = min_value
+            result["max"] = max_value
+            result["bin_edges"] = bin_edges
+            result["labels"] = labels
         return result
     def cancel(self):
@@ -386,7 +421,7 @@ class HistogramDiffTask(Task, QueryMixin):
 class HistogramDiffTaskResultDiffer(TaskResultDiffer):
     def _check_result_changed_fn(self, result):
-        return TaskResultDiffer.diff(result['base'], result['current'])
+        return TaskResultDiffer.diff(result["base"], result["current"])
 class HistogramDiffCheckValidator(CheckValidator):

recce/tasks/lineage.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from typing import Optional, Literal
+from typing import Literal, Optional
 from pydantic import BaseModel
@@ -10,7 +10,7 @@ class LineageDiffParams(BaseModel):
     select: Optional[str] = None
     exclude: Optional[str] = None
     packages: Optional[list[str]] = None
-    view_mode: Optional[Literal['all', 'changed_models']] = None
+    view_mode: Optional[Literal["all", "changed_models"]] = None
 class LineageDiffCheckValidator(CheckValidator):

recce/tasks/profile.py CHANGED Viewed

@@ -1,13 +1,141 @@
-import textwrap
 from typing import List
 from pydantic import BaseModel
-from .core import Task, TaskResultDiffer, CheckValidator
-from .dataframe import DataFrame
 from ..core import default_context
 from ..exceptions import RecceException
 from ..models import Check
+from .core import CheckValidator, Task, TaskResultDiffer
+from .dataframe import DataFrame
+PROFILE_COLUMN_JINJA_TEMPLATE = r"""
+{# Conditions -------------------------------------------- #}
+{%- set is_struct = column_type.startswith('struct') -%}
+{%- set is_numeric =
+    column_type.startswith('int') or
+    column_type.startswith('float') or
+    'numeric' in column_type or
+    'number' in column_type or
+    'double' in column_type or
+    'bigint' in column_type
+-%}
+{%- set is_date_or_time =
+    column_type.startswith('date') or
+    column_type.startswith('timestamp')
+-%}
+{%- set is_logical = column_type.startswith('bool') -%}
+{%- if db_type == 'sqlserver' -%}
+    {%- set is_numeric = column_type in [
+        "bigint", "numeric", "smallint", "decimal", "int",
+        "tinyint", "money", "float", "real"
+    ]-%}
+{%- elif db_type == 'athena' -%}
+    {%- set is_numeric =
+        "int" in column_type or
+        "float" in column_type or
+        "decimal" in column_type or
+        "double" in column_type
+    -%}
+{%- endif -%}
+{# General Agg ------------------------------------------- #}
+{%- set agg_row_count = 'cast(count(*) as ' ~ dbt.type_bigint() ~ ')' -%}
+{%- set agg_not_null_proportion =
+        'sum(case when ' ~ adapter.quote(column_name) ~ ' is null '
+        ~ 'then 0 '
+        ~ 'else 1 end) / '
+        ~ 'cast(count(*) as ' ~ dbt.type_numeric() ~ ')'
+-%}
+{%- set agg_distinct_proportion =
+        'count(distinct ' ~ adapter.quote(column_name) ~') / '
+        ~ 'cast(count(*) as ' ~ dbt.type_numeric() ~ ')'
+-%}
+{%- set agg_distinct_count = 'count(distinct ' ~ adapter.quote(column_name) ~ ')' -%}
+{%- set agg_is_unique =      'count(distinct ' ~ adapter.quote(column_name) ~ ') = count(*)' -%}
+{%- set agg_min =            'cast(null as ' ~ dbt.type_string() ~ ')' -%}
+{%- set agg_max =            'cast(null as ' ~ dbt.type_string() ~ ')' -%}
+{%- set agg_avg =            'cast(null as ' ~ dbt.type_numeric() ~ ')' -%}
+{%- set agg_median =         'cast(null as ' ~ dbt.type_numeric() ~ ')' -%}
+{%- if is_struct -%}
+    {%- set agg_distinct_proportion = 'cast(null as ' ~ dbt.type_numeric() ~ ')' -%}
+    {%- set agg_distinct_count = 'cast(null as ' ~ dbt.type_numeric() ~ ')' -%}
+    {%- set agg_is_unique = 'null' -%}
+{%- endif -%}
+{%- if (is_numeric or is_date_or_time) and (not is_struct) -%}
+    {%- set agg_min =
+        'cast(min(' ~ adapter.quote(column_name) ~ ') as ' ~ dbt.type_string() ~ ')'
+    -%}
+    {%- set agg_max =
+        'cast(max(' ~ adapter.quote(column_name) ~ ') as ' ~ dbt.type_string() ~ ')'
+    -%}
+{%- endif -%}
+{%- if is_numeric and not is_struct -%}
+    {%- set agg_avg = 'avg(' ~ adapter.quote(column_name) ~ ')' -%}
+    {%- if db_type == 'bigquery' -%}
+        {%- set agg_median = 'approx_quantiles(' ~ adapter.quote(column_name) ~ ', 100)[offset(50)]' -%}
+    {%- elif db_type == 'postgres' -%}
+        {%- set agg_median = 'percentile_cont(0.5) within group (order by ' ~ adapter.quote(column_name) ~ ')' -%}
+    {%- elif db_type == 'redshift' -%}
+        {%- set agg_median =
+            '(select percentile_cont(0.5) within group (order by '
+            ~ adapter.quote(column_name) ~ ') from ' ~ relation ~ ')' -%}
+    {%- elif db_type == 'athena' -%}
+        {%- set agg_median = 'approx_percentile( ' ~ adapter.quote(column_name) ~ ', 0.5)' -%}
+    {%- elif db_type == 'sqlserver' -%}
+        {%- set agg_median = 'percentile_cont(' ~ adapter.quote(column_name) ~ ', 0.5) over ()' -%}
+    {%- else -%}
+        {%- set agg_median = 'median(' ~ adapter.quote(column_name) ~ ')' -%}
+    {%- endif -%}
+{%- elif is_logical -%}
+    {%- set agg_avg = 'avg(case when ' ~ adapter.quote(column_name) ~ ' then 1 else 0 end)' -%}
+{%- endif -%}
+{# Overwrite Agg ----------------------------------------- #}
+{# DRC-663: Support bigquery array type }
+{%- set is_array = column_type.startswith('array') -%}
+{%- if db_type == 'bigquery' and is_array -%}
+    {%- set agg_distinct_proportion = 'cast(null as ' ~ dbt.type_numeric() ~ ')' -%}
+    {%- set agg_distinct_count = 'cast(null as ' ~ dbt.type_numeric() ~ ')' -%}
+    {%- set agg_is_unique = 'null' -%}
+    {%- set agg_min =
+        'cast(min(array_length(' ~ adapter.quote(column_name) ~ ')) as ' ~ dbt.type_string() ~ ')'
+    -%}
+    {%- set agg_max =
+        'cast(max(array_length(' ~ adapter.quote(column_name) ~ ')) as ' ~ dbt.type_string() ~ ')'
+    -%}
+    {%- set agg_avg = 'avg(array_length(' ~ adapter.quote(column_name) ~ '))' -%}
+    {%- set agg_median =
+        'approx_quantiles(array_length(' ~ adapter.quote(column_name) ~ '), 100)[offset(50)]'
+    -%}
+{%- endif -%}
+{# Main Query -------------------------------------------- #}
+select
+    '{{ column_name }}' as column_name,
+    nullif('{{ column_type }}', '') as data_type,
+    {{ agg_row_count }} as row_count,
+    {{ agg_not_null_proportion }} as not_null_proportion,
+    {{ agg_distinct_proportion }} as distinct_proportion,
+    {{ agg_distinct_count }} as distinct_count,
+    {{ agg_is_unique }} as is_unique,
+    {{ agg_min }} as min,
+    {{ agg_max }} as max,
+    {{ agg_avg }} as avg,
+    {{ agg_median }} as median
+from {{ relation }}
+"""
 class ProfileParams(BaseModel):
@@ -33,14 +161,14 @@ class ProfileDiffTask(Task):
     def execute(self):
         import agate
         from recce.adapter.dbt_adapter import DbtAdapter, merge_tables
         dbt_adapter: DbtAdapter = default_context().adapter
         model: str = self.params.model
         selected_columns: List[str] = self.params.columns
-        self._verify_dbt_profiler(dbt_adapter)
         with dbt_adapter.connection_named("query"):
             self.connection = dbt_adapter.get_thread_connection()
@@ -58,7 +186,7 @@ class ProfileDiffTask(Task):
             tables: List[agate.Table] = []
             for column in base_columns:
-                self.update_progress(message=f'[Base] Profile column: {column.name}', percentage=completed / total)
+                self.update_progress(message=f"[Base] Profile column: {column.name}", percentage=completed / total)
                 relation = dbt_adapter.create_relation(model, base=True)
                 response, table = self._profile_column(dbt_adapter, relation, column)
                 tables.append(table)
@@ -68,7 +196,7 @@ class ProfileDiffTask(Task):
             tables: List[agate.Table] = []
             for column in curr_columns:
-                self.update_progress(message=f'[Current] Profile column: {column.column}', percentage=completed / total)
+                self.update_progress(message=f"[Current] Profile column: {column.column}", percentage=completed / total)
                 relation = dbt_adapter.create_relation(model, base=False)
                 response, table = self._profile_column(dbt_adapter, relation, column)
                 tables.append(table)
@@ -76,87 +204,23 @@ class ProfileDiffTask(Task):
                 self.check_cancel()
             current = DataFrame.from_agate(merge_tables(tables))
-            return ProfileDiffResult(base=base, current=current)
+            if len(base.columns) == 0 and len(current.columns) != 0:
+                base.columns = current.columns
+            elif len(base.columns) != 0 and len(current.columns) == 0:
+                current.columns = base.columns
-    def _verify_dbt_profiler(self, dbt_adapter):
-        for macro_name, macro in dbt_adapter.manifest.macros.items():
-            if macro.package_name == 'dbt_profiler':
-                break
-        else:
-            raise RecceException(
-                r"Package 'dbt_profiler' not found. Please refer to the link to install: https://hub.getdbt.com/data-mie/dbt_profiler/")
+            return ProfileDiffResult(base=base, current=current)
     def _profile_column(self, dbt_adapter, relation, column):
-        sql_template = textwrap.dedent(r"""
-        select
-        '{{column_name}}' as column_name,
-        nullif('{{column_type}}', '') as data_type,
-        {{ dbt_profiler.measure_row_count(column_name, column_type) }} as row_count,
-        {{ dbt_profiler.measure_not_null_proportion(column_name, column_type) }} as not_null_proportion,
-        {{ dbt_profiler.measure_distinct_proportion(column_name, column_type) }} as distinct_proportion,
-        {{ dbt_profiler.measure_distinct_count(column_name, column_type) }} as distinct_count,
-        {{ dbt_profiler.measure_is_unique(column_name, column_type) }} as is_unique,
-        {{ dbt_profiler.measure_min(column_name, column_type) }} as min,
-        {{ dbt_profiler.measure_max(column_name, column_type) }} as max,
-        {{ dbt_profiler.measure_avg(column_name, column_type) }} as avg,
-        {{ dbt_profiler.measure_median(column_name, column_type) }} as median
-        from
-        {{ relation }}
-        """)
         column_name = column.name
         column_type = column.data_type.lower()
-        db_type = dbt_adapter.adapter.type()
-        if db_type == 'bigquery' and column_type.startswith('array'):
-            # DRC-663: Support bigquery array type
-            sql_template = textwrap.dedent(r"""
-            select
-            '{{column_name}}' as column_name,
-            nullif('{{column_type}}', '') as data_type,
-            {{ dbt_profiler.measure_row_count(column_name, column_type) }} as row_count,
-            {{ dbt_profiler.measure_not_null_proportion(column_name, column_type) }} as not_null_proportion,
-            cast(null as {{ dbt.type_numeric() }}) as distinct_proportion,
-            cast(null as {{ dbt.type_numeric() }}) as distinct_count,
-            null as is_unique,
-            cast(min(ARRAY_LENGTH({{ adapter.quote(column_name) }})) as {{ dbt_profiler.type_string() }}) as min,
-            cast(max(ARRAY_LENGTH({{ adapter.quote(column_name) }})) as {{ dbt_profiler.type_string() }}) as max,
-            avg(ARRAY_LENGTH({{ adapter.quote(column_name) }})) as avg,
-            APPROX_QUANTILES(ARRAY_LENGTH({{ adapter.quote(column_name) }}), 100)[OFFSET(50)] as median,
-            from
-            {{ relation }}
-            """)
-        elif db_type == 'redshift':
-            # DRC-1149: Support redshift median calculation
-            # https://github.com/data-mie/dbt-profiler/pull/89
-            #
-            # Since dbt-profiler 0.8.2, there is the third parameter for measure_median
-            # For sake of compatibility, we use the new way to call the macro only for redshift
-            sql_template = textwrap.dedent(r"""
-            with source_data as (
-              select
-                *
-              from {{ relation }}
-            )
-            select
-            '{{column_name}}' as column_name,
-            nullif('{{column_type}}', '') as data_type,
-            {{ dbt_profiler.measure_row_count(column_name, column_type) }} as row_count,
-            {{ dbt_profiler.measure_not_null_proportion(column_name, column_type) }} as not_null_proportion,
-            {{ dbt_profiler.measure_distinct_proportion(column_name, column_type) }} as distinct_proportion,
-            {{ dbt_profiler.measure_distinct_count(column_name, column_type) }} as distinct_count,
-            {{ dbt_profiler.measure_is_unique(column_name, column_type) }} as is_unique,
-            {{ dbt_profiler.measure_min(column_name, column_type) }} as min,
-            {{ dbt_profiler.measure_max(column_name, column_type) }} as max,
-            {{ dbt_profiler.measure_avg(column_name, column_type) }} as avg,
-            ({{ dbt_profiler.measure_median(column_name, column_type, 'source_data') }}) as median
-            from
-            source_data
-            """)
+        db_type = dbt_adapter.adapter.type().lower()
         try:
             sql = dbt_adapter.generate_sql(
-                sql_template,
+                PROFILE_COLUMN_JINJA_TEMPLATE,
                 base=False,  # always false because we use the macro in current manifest
-                context=dict(relation=relation, column_name=column_name, column_type=column_type)
+                context=dict(relation=relation, column_name=column_name, column_type=column_type, db_type=db_type),
             )
         except Exception as e:
             raise RecceException(f"Failed to generate SQL for profiling column: {column_name}") from e
@@ -165,14 +229,15 @@ class ProfileDiffTask(Task):
             return dbt_adapter.execute(sql, fetch=True)
         except Exception as e:
             from recce.adapter.dbt_adapter import dbt_version
-            if dbt_version < 'v1.8':
+            if dbt_version < "v1.8":
                 from dbt.exceptions import DbtDatabaseError
             else:
                 from dbt_common.exceptions import DbtDatabaseError
             if isinstance(e, DbtDatabaseError):
-                if str(e).find('100051') >= 0:
+                if str(e).find("100051") >= 0:
                     # Snowflake error '100051 (22012): Division by zero"'
-                    e = RecceException('No profile diff result due to the model is empty.', False)
+                    e = RecceException("No profile diff result due to the model is empty.", False)
             raise e
     def cancel(self):
@@ -180,6 +245,7 @@ class ProfileDiffTask(Task):
         if self.connection:
             from recce.adapter.dbt_adapter import DbtAdapter
             dbt_adapter: DbtAdapter = default_context().adapter
             with dbt_adapter.connection_named("cancel"):
                 dbt_adapter.cancel(self.connection)
@@ -187,7 +253,7 @@ class ProfileDiffTask(Task):
 class ProfileDiffResultDiffer(TaskResultDiffer):
     def _check_result_changed_fn(self, result):
-        return self.diff(result['base'], result['current'])
+        return self.diff(result["base"], result["current"])
 class ProfileCheckValidator(CheckValidator):
@@ -202,14 +268,14 @@ class ProfileCheckValidator(CheckValidator):
 class ProfileTask(ProfileDiffTask):
     def execute(self):
         import agate
         from recce.adapter.dbt_adapter import DbtAdapter, merge_tables
         dbt_adapter: DbtAdapter = default_context().adapter
         model: str = self.params.model
         selected_columns: List[str] = self.params.columns
-        self._verify_dbt_profiler(dbt_adapter)
         with dbt_adapter.connection_named("query"):
             self.connection = dbt_adapter.get_thread_connection()
             curr_columns = [column for column in dbt_adapter.get_columns(model, base=False)]
@@ -222,7 +288,7 @@ class ProfileTask(ProfileDiffTask):
             tables: List[agate.Table] = []
             for column in curr_columns:
-                self.update_progress(message=f'[Current] Profile column: {column.column}', percentage=completed / total)
+                self.update_progress(message=f"[Current] Profile column: {column.column}", percentage=completed / total)
                 relation = dbt_adapter.create_relation(model, base=False)
                 response, table = self._profile_column(dbt_adapter, relation, column)
                 tables.append(table)

recce-nightly 1.2.0.20250506__py3-none-any.whl → 1.4.0.20250514__py3-none-any.whl

Potentially problematic release.

recce-nightly 1.2.0.20250506__py3-none-any.whl → 1.4.0.20250514__py3-none-any.whl