recce-nightly 1.3.0.20250507__py3-none-any.whl → 1.4.0.20250515__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the registry.

This version of recce-nightly has been flagged as potentially problematic.

Files changed (93)
  1. recce/VERSION +1 -1
  2. recce/__init__.py +22 -22
  3. recce/adapter/base.py +11 -14
  4. recce/adapter/dbt_adapter/__init__.py +355 -316
  5. recce/adapter/dbt_adapter/dbt_version.py +3 -0
  6. recce/adapter/sqlmesh_adapter.py +24 -35
  7. recce/apis/check_api.py +39 -28
  8. recce/apis/check_func.py +33 -27
  9. recce/apis/run_api.py +25 -19
  10. recce/apis/run_func.py +29 -23
  11. recce/artifact.py +44 -49
  12. recce/cli.py +484 -285
  13. recce/config.py +42 -33
  14. recce/core.py +52 -44
  15. recce/data/404.html +1 -1
  16. recce/data/_next/static/chunks/{368-7587b306577df275.js → 778-aef312bffb4c0312.js} +15 -15
  17. recce/data/_next/static/chunks/8d700b6a.ed11a130057c7a47.js +1 -0
  18. recce/data/_next/static/chunks/app/layout-c713a2829d3279e4.js +1 -0
  19. recce/data/_next/static/chunks/app/page-7086764277331fcb.js +1 -0
  20. recce/data/_next/static/chunks/{cd9f8d63-cf0d5a7b0f7a92e8.js → cd9f8d63-e020f408095ed77c.js} +3 -3
  21. recce/data/_next/static/chunks/webpack-b787cb1a4f2293de.js +1 -0
  22. recce/data/_next/static/css/88b8abc134cfd59a.css +3 -0
  23. recce/data/index.html +2 -2
  24. recce/data/index.txt +2 -2
  25. recce/diff.py +6 -12
  26. recce/event/__init__.py +74 -72
  27. recce/event/collector.py +27 -20
  28. recce/event/track.py +39 -27
  29. recce/exceptions.py +1 -1
  30. recce/git.py +7 -7
  31. recce/github.py +57 -53
  32. recce/models/__init__.py +1 -1
  33. recce/models/check.py +6 -7
  34. recce/models/run.py +1 -0
  35. recce/models/types.py +27 -27
  36. recce/pull_request.py +26 -24
  37. recce/run.py +148 -111
  38. recce/server.py +103 -89
  39. recce/state.py +209 -177
  40. recce/summary.py +168 -143
  41. recce/tasks/__init__.py +3 -3
  42. recce/tasks/core.py +11 -13
  43. recce/tasks/dataframe.py +19 -17
  44. recce/tasks/histogram.py +69 -34
  45. recce/tasks/lineage.py +2 -2
  46. recce/tasks/profile.py +147 -86
  47. recce/tasks/query.py +139 -87
  48. recce/tasks/rowcount.py +33 -30
  49. recce/tasks/schema.py +14 -14
  50. recce/tasks/top_k.py +35 -35
  51. recce/tasks/valuediff.py +216 -152
  52. recce/util/breaking.py +77 -84
  53. recce/util/cll.py +55 -51
  54. recce/util/io.py +19 -17
  55. recce/util/logger.py +1 -1
  56. recce/util/recce_cloud.py +70 -72
  57. recce/util/singleton.py +4 -4
  58. recce/yaml/__init__.py +7 -10
  59. {recce_nightly-1.3.0.20250507.dist-info → recce_nightly-1.4.0.20250515.dist-info}/METADATA +5 -2
  60. recce_nightly-1.4.0.20250515.dist-info/RECORD +143 -0
  61. {recce_nightly-1.3.0.20250507.dist-info → recce_nightly-1.4.0.20250515.dist-info}/WHEEL +1 -1
  62. tests/adapter/dbt_adapter/conftest.py +1 -0
  63. tests/adapter/dbt_adapter/dbt_test_helper.py +28 -18
  64. tests/adapter/dbt_adapter/test_dbt_adapter.py +0 -15
  65. tests/adapter/dbt_adapter/test_dbt_cll.py +39 -32
  66. tests/adapter/dbt_adapter/test_selector.py +22 -21
  67. tests/tasks/test_histogram.py +58 -66
  68. tests/tasks/test_lineage.py +36 -23
  69. tests/tasks/test_preset_checks.py +45 -31
  70. tests/tasks/test_profile.py +340 -15
  71. tests/tasks/test_query.py +40 -40
  72. tests/tasks/test_row_count.py +65 -46
  73. tests/tasks/test_schema.py +65 -42
  74. tests/tasks/test_top_k.py +22 -18
  75. tests/tasks/test_valuediff.py +43 -32
  76. tests/test_cli.py +71 -58
  77. tests/test_config.py +7 -9
  78. tests/test_core.py +5 -3
  79. tests/test_dbt.py +7 -7
  80. tests/test_pull_request.py +1 -1
  81. tests/test_server.py +19 -13
  82. tests/test_state.py +40 -27
  83. tests/test_summary.py +18 -14
  84. recce/data/_next/static/chunks/8d700b6a-f0b1f6b9e0d97ce2.js +0 -1
  85. recce/data/_next/static/chunks/app/layout-9102e22cb73f74d6.js +0 -1
  86. recce/data/_next/static/chunks/app/page-92f13c8fad9fae3d.js +0 -1
  87. recce/data/_next/static/chunks/webpack-567d72f0bc0820d5.js +0 -1
  88. recce_nightly-1.3.0.20250507.dist-info/RECORD +0 -142
  89. /recce/data/_next/static/{K5iKlCYhdcpq8Ea6ck9J_ → q0Xsc9Sd6PDuo1lshYpLu}/_buildManifest.js +0 -0
  90. /recce/data/_next/static/{K5iKlCYhdcpq8Ea6ck9J_ → q0Xsc9Sd6PDuo1lshYpLu}/_ssgManifest.js +0 -0
  91. {recce_nightly-1.3.0.20250507.dist-info → recce_nightly-1.4.0.20250515.dist-info}/entry_points.txt +0 -0
  92. {recce_nightly-1.3.0.20250507.dist-info → recce_nightly-1.4.0.20250515.dist-info}/licenses/LICENSE +0 -0
  93. {recce_nightly-1.3.0.20250507.dist-info → recce_nightly-1.4.0.20250515.dist-info}/top_level.txt +0 -0
recce/tasks/dataframe.py CHANGED
@@ -10,14 +10,14 @@ from pydantic import BaseModel, Field
 
 
 class DataFrameColumnType(Enum):
-    NUMBER = 'number'
-    INTEGER = 'integer'
-    TEXT = 'text'
-    BOOLEAN = 'boolean'
-    DATE = 'date'
-    DATETIME = 'datetime'
-    TIMEDELTA = 'timedelta'
-    UNKNOWN = 'unknown'
+    NUMBER = "number"
+    INTEGER = "integer"
+    TEXT = "text"
+    BOOLEAN = "boolean"
+    DATE = "date"
+    DATETIME = "datetime"
+    TIMEDELTA = "timedelta"
+    UNKNOWN = "unknown"
 
 
 class DataFrameColumn(BaseModel):
@@ -32,19 +32,21 @@ class DataFrame(BaseModel):
     more: t.Optional[bool] = Field(None, description="Whether there are more rows to fetch")
 
     @staticmethod
-    def from_agate(table: 'agate.Table', limit: t.Optional[int] = None, more: t.Optional[bool] = None):
+    def from_agate(table: "agate.Table", limit: t.Optional[int] = None, more: t.Optional[bool] = None):
         from recce.adapter.dbt_adapter import dbt_version
-        if dbt_version < 'v1.8':
+
+        if dbt_version < "v1.8":
             import dbt.clients.agate_helper as agate_helper
         else:
            import dbt_common.clients.agate_helper as agate_helper
 
         import agate
+
         columns = []
 
         for col_name, col_type in zip(table.column_names, table.column_types):
 
-            has_integer = hasattr(agate_helper, 'Integer')
+            has_integer = hasattr(agate_helper, "Integer")
 
             if isinstance(col_type, agate.Number):
                 col_type = DataFrameColumnType.NUMBER
@@ -78,23 +80,23 @@ class DataFrame(BaseModel):
         return df
 
     @staticmethod
-    def from_pandas(pandas_df: 'pandas.DataFrame', limit: t.Optional[int] = None, more: t.Optional[bool] = None):
+    def from_pandas(pandas_df: "pandas.DataFrame", limit: t.Optional[int] = None, more: t.Optional[bool] = None):
         columns = []
         for column in pandas_df.columns:
             dtype = pandas_df[column].dtype
-            if dtype == 'int64':
+            if dtype == "int64":
                 col_type = DataFrameColumnType.INTEGER
-            elif dtype == 'float64':
+            elif dtype == "float64":
                 col_type = DataFrameColumnType.NUMBER
-            elif dtype == 'object':
+            elif dtype == "object":
                 col_type = DataFrameColumnType.TEXT
-            elif dtype == 'bool':
+            elif dtype == "bool":
                 col_type = DataFrameColumnType.BOOLEAN
             else:
                 col_type = DataFrameColumnType.UNKNOWN
             columns.append(DataFrameColumn(name=column, type=col_type))
 
-        s = pandas_df.to_json(orient='values')
+        s = pandas_df.to_json(orient="values")
         data = json.loads(s)
 
         df = DataFrame(
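
For context, the `from_pandas` dtype mapping above can be exercised directly. A minimal sketch (the sample frame is hypothetical; assumes recce and pandas are installed):

    import pandas as pd

    from recce.tasks.dataframe import DataFrame

    # Each pandas dtype maps to a DataFrameColumnType as in the branches above.
    pdf = pd.DataFrame(
        {
            "id": [1, 2],             # int64   -> INTEGER
            "score": [0.5, 0.7],      # float64 -> NUMBER
            "name": ["a", "b"],       # object  -> TEXT
            "active": [True, False],  # bool    -> BOOLEAN
        }
    )
    df = DataFrame.from_pandas(pdf)
    print([(c.name, c.type) for c in df.columns])
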
recce/tasks/histogram.py CHANGED
@@ -9,34 +9,66 @@ from pydantic import BaseModel
 from recce.core import default_context
 from recce.models import Check
 from recce.tasks import Task
-from recce.tasks.core import TaskResultDiffer, CheckValidator
+from recce.tasks.core import CheckValidator, TaskResultDiffer
 from recce.tasks.query import QueryMixin
 
 sql_datetime_types = [
-    "DATE", "DATETIME", "TIMESTAMP", "TIME",
+    "DATE",
+    "DATETIME",
+    "TIMESTAMP",
+    "TIME",
     "YEAR",  # Specific to MySQL/MariaDB
-    "DATETIME2", "SMALLDATETIME", "DATETIMEOFFSET",  # Specific to SQL Server
+    "DATETIME2",
+    "SMALLDATETIME",
+    "DATETIMEOFFSET",  # Specific to SQL Server
     "INTERVAL",  # Common in PostgreSQL and Oracle
-    "TIMESTAMPTZ", "TIMETZ",  # Specific to PostgreSQL
-    "TIMESTAMP WITH TIME ZONE", "TIMESTAMP WITH LOCAL TIME ZONE",  # Oracle
-    "TIMESTAMP_LTZ", "TIMESTAMP_NTZ", "TIMESTAMP_TZ",  # Specific to Snowflake
+    "TIMESTAMPTZ",
+    "TIMETZ",  # Specific to PostgreSQL
+    "TIMESTAMP WITH TIME ZONE",
+    "TIMESTAMP WITH LOCAL TIME ZONE",  # Oracle
+    "TIMESTAMP_LTZ",
+    "TIMESTAMP_NTZ",
+    "TIMESTAMP_TZ",  # Specific to Snowflake
 ]
 
 sql_integer_types = [
-    "TINYINT", "SMALLINT", "MEDIUMINT", "INT", "INTEGER", "BIGINT",  # Common across most databases
-    "INT2", "INT4", "INT8",  # PostgreSQL specific aliases
+    "TINYINT",
+    "SMALLINT",
+    "MEDIUMINT",
+    "INT",
+    "INTEGER",
+    "BIGINT",  # Common across most databases
+    "INT2",
+    "INT4",
+    "INT8",  # PostgreSQL specific aliases
     "UNSIGNED BIG INT",  # SQLite specific
     "NUMBER",  # Oracle, can be used as an integer with precision and scale
     "NUMERIC",  # Generally available in many SQL databases, used with precision and scale
-    "SMALLSERIAL", "SERIAL", "BIGSERIAL",  # PostgreSQL auto-increment types
-    "IDENTITY", "SMALLIDENTITY", "BIGIDENTITY",  # SQL Server specific auto-increment types
+    "SMALLSERIAL",
+    "SERIAL",
+    "BIGSERIAL",  # PostgreSQL auto-increment types
+    "IDENTITY",
+    "SMALLIDENTITY",
+    "BIGIDENTITY",  # SQL Server specific auto-increment types
     "BYTEINT",  # Specific to Snowflake, for storing very small integers
 ]
 
 sql_not_supported_types = [
-    "CHAR", "VARCHAR", "TINYTEXT", "TEXT", "MEDIUMTEXT", "LONGTEXT",
-    "NCHAR", "NVARCHAR", "VARCHAR2", "NVARCHAR2", "CLOB", "NCLOB",
-    "VARCHAR(MAX)", "XML", "JSON",
+    "CHAR",
+    "VARCHAR",
+    "TINYTEXT",
+    "TEXT",
+    "MEDIUMTEXT",
+    "LONGTEXT",
+    "NCHAR",
+    "NVARCHAR",
+    "VARCHAR2",
+    "NVARCHAR2",
+    "CLOB",
+    "NCLOB",
+    "VARCHAR(MAX)",
+    "XML",
+    "JSON",
     "BOOLEAN",  # PostgreSQL, SQLite, and others with native boolean support
     "TINYINT(1)",  # MySQL/MariaDB uses TINYINT(1) to represent boolean values
     "BIT",  # SQL Server and others use BIT to represent boolean values, where 1 is true and 0 is false
@@ -185,7 +217,7 @@ def query_numeric_histogram(task, node, column, column_type, min_value, max_valu
         else:
             counts[num_bins - 1] += count
     base_result = {
-        'counts': counts,
+        "counts": counts,
     }
     if curr is not None:
         counts = [0] * num_bins
@@ -199,7 +231,7 @@
         else:
             counts[num_bins - 1] += count
     curr_result = {
-        'counts': counts,
+        "counts": counts,
     }
     return base_result, curr_result, bin_edges, labels
 
@@ -209,7 +241,7 @@ def query_datetime_histogram(task, node, column, min_value, max_value):
     print(max_value, min_value, days_delta)
     # _type = None
     if days_delta > 365 * 4:
-        _type = 'yearly'
+        _type = "yearly"
         dmin = date(min_value.year, 1, 1)
         if max_value.year < 3000:
             dmax = date(max_value.year, 1, 1) + relativedelta(years=+1)
@@ -237,7 +269,7 @@ def query_datetime_histogram(task, node, column, min_value, max_value):
         else:
             dmax = date(3000, 1, 1)
         period = relativedelta(dmax, dmin)
-        num_buckets = (period.years * 12 + period.months)
+        num_buckets = period.years * 12 + period.months
         bin_edges = [dmin + relativedelta(months=i) for i in range(num_buckets + 1)]
         sql = f"""
         SELECT
@@ -285,18 +317,18 @@ def query_datetime_histogram(task, node, column, min_value, max_value):
 
     base_counts = [0] * num_buckets
     print(_type)
-    for (d, v) in base.rows:
+    for d, v in base.rows:
         i = bin_edges.index(d.date()) if isinstance(d, datetime) else bin_edges.index(d)
         base_counts[i] = v
     curr_counts = [0] * num_buckets
-    for (d, v) in curr.rows:
+    for d, v in curr.rows:
         i = bin_edges.index(d.date()) if isinstance(d, datetime) else bin_edges.index(d)
         curr_counts[i] = v
     base_result = {
-        'counts': base_counts,
+        "counts": base_counts,
     }
     curr_result = {
-        'counts': curr_counts,
+        "counts": curr_counts,
     }
 
     return base_result, curr_result, bin_edges
@@ -310,6 +342,7 @@ class HistogramDiffTask(Task, QueryMixin):
 
     def execute(self):
         from recce.adapter.dbt_adapter import DbtAdapter
+
         result = {}
 
         dbt_adapter: DbtAdapter = default_context().adapter
@@ -353,29 +386,31 @@ class HistogramDiffTask(Task, QueryMixin):
             labels = None
             if min_value is None or max_value is None:
                 base_result = {
-                    'counts': [],
+                    "counts": [],
                 }
                 current_result = {
-                    'counts': [],
+                    "counts": [],
                 }
                 bin_edges = []
                 labels = []
             elif column_type.upper() in sql_datetime_types:
                 base_result, current_result, bin_edges = query_datetime_histogram(
-                    self, node, column, min_value, max_value)
+                    self, node, column, min_value, max_value
+                )
             else:
                 base_result, current_result, bin_edges, labels = query_numeric_histogram(
-                    self, node, column, column_type, min_value, max_value, num_bins)
+                    self, node, column, column_type, min_value, max_value, num_bins
+                )
             if base_result:
-                base_result['total'] = base_total
+                base_result["total"] = base_total
             if current_result:
-                current_result['total'] = curr_total
-            result['base'] = base_result
-            result['current'] = current_result
-            result['min'] = min_value
-            result['max'] = max_value
-            result['bin_edges'] = bin_edges
-            result['labels'] = labels
+                current_result["total"] = curr_total
+            result["base"] = base_result
+            result["current"] = current_result
+            result["min"] = min_value
+            result["max"] = max_value
+            result["bin_edges"] = bin_edges
+            result["labels"] = labels
         return result
 
     def cancel(self):
@@ -386,7 +421,7 @@ class HistogramDiffTask(Task, QueryMixin):
 
 class HistogramDiffTaskResultDiffer(TaskResultDiffer):
     def _check_result_changed_fn(self, result):
-        return TaskResultDiffer.diff(result['base'], result['current'])
+        return TaskResultDiffer.diff(result["base"], result["current"])
 
 
 class HistogramDiffCheckValidator(CheckValidator):
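
The `num_buckets` change above is cosmetic (redundant parentheses removed); the monthly-bucket arithmetic is unchanged. A standalone sketch of that arithmetic, with hypothetical dates:

    from datetime import date

    from dateutil.relativedelta import relativedelta

    # Mirrors the monthly branch of query_datetime_histogram above.
    dmin = date(2024, 1, 1)
    dmax = date(2024, 7, 1)
    period = relativedelta(dmax, dmin)
    num_buckets = period.years * 12 + period.months  # 6 monthly buckets
    bin_edges = [dmin + relativedelta(months=i) for i in range(num_buckets + 1)]
    print(num_buckets, bin_edges[0], bin_edges[-1])  # 6 2024-01-01 2024-07-01
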
recce/tasks/lineage.py CHANGED
@@ -1,4 +1,4 @@
-from typing import Optional, Literal
+from typing import Literal, Optional
 
 from pydantic import BaseModel
 
@@ -10,7 +10,7 @@ class LineageDiffParams(BaseModel):
     select: Optional[str] = None
     exclude: Optional[str] = None
     packages: Optional[list[str]] = None
-    view_mode: Optional[Literal['all', 'changed_models']] = None
+    view_mode: Optional[Literal["all", "changed_models"]] = None
 
 
 class LineageDiffCheckValidator(CheckValidator):
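
The `view_mode` field above is enforced by pydantic's Literal validation; values outside the two allowed strings are rejected. A small sketch (the bad value is hypothetical; assumes recce is installed):

    from recce.tasks.lineage import LineageDiffParams

    LineageDiffParams(view_mode="changed_models")  # accepted

    try:
        LineageDiffParams(view_mode="everything")  # not in the Literal, rejected
    except Exception as err:
        print(type(err).__name__)  # ValidationError
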
recce/tasks/profile.py CHANGED
@@ -1,13 +1,141 @@
-import textwrap
 from typing import List
 
 from pydantic import BaseModel
 
-from .core import Task, TaskResultDiffer, CheckValidator
-from .dataframe import DataFrame
 from ..core import default_context
 from ..exceptions import RecceException
 from ..models import Check
+from .core import CheckValidator, Task, TaskResultDiffer
+from .dataframe import DataFrame
+
+PROFILE_COLUMN_JINJA_TEMPLATE = r"""
+{# Conditions -------------------------------------------- #}
+{%- set is_struct = column_type.startswith('struct') -%}
+{%- set is_numeric =
+    column_type.startswith('int') or
+    column_type.startswith('float') or
+    'numeric' in column_type or
+    'number' in column_type or
+    'double' in column_type or
+    'bigint' in column_type
+-%}
+{%- set is_date_or_time =
+    column_type.startswith('date') or
+    column_type.startswith('timestamp')
+-%}
+{%- set is_logical = column_type.startswith('bool') -%}
+
+{%- if db_type == 'sqlserver' -%}
+    {%- set is_numeric = column_type in [
+        "bigint", "numeric", "smallint", "decimal", "int",
+        "tinyint", "money", "float", "real"
+    ]-%}
+{%- elif db_type == 'athena' -%}
+    {%- set is_numeric =
+        "int" in column_type or
+        "float" in column_type or
+        "decimal" in column_type or
+        "double" in column_type
+    -%}
+{%- endif -%}
+
+{# General Agg ------------------------------------------- #}
+{%- set agg_row_count = 'cast(count(*) as ' ~ dbt.type_bigint() ~ ')' -%}
+{%- set agg_not_null_proportion =
+    'sum(case when ' ~ adapter.quote(column_name) ~ ' is null '
+    ~ 'then 0 '
+    ~ 'else 1 end) / '
+    ~ 'cast(count(*) as ' ~ dbt.type_numeric() ~ ')'
+-%}
+{%- set agg_distinct_proportion =
+    'count(distinct ' ~ adapter.quote(column_name) ~ ') / '
+    ~ 'cast(count(*) as ' ~ dbt.type_numeric() ~ ')'
+-%}
+{%- set agg_distinct_count = 'count(distinct ' ~ adapter.quote(column_name) ~ ')' -%}
+{%- set agg_is_unique = 'count(distinct ' ~ adapter.quote(column_name) ~ ') = count(*)' -%}
+{%- set agg_min = 'cast(null as ' ~ dbt.type_string() ~ ')' -%}
+{%- set agg_max = 'cast(null as ' ~ dbt.type_string() ~ ')' -%}
+{%- set agg_avg = 'cast(null as ' ~ dbt.type_numeric() ~ ')' -%}
+{%- set agg_median = 'cast(null as ' ~ dbt.type_numeric() ~ ')' -%}
+
+
+{%- if is_struct -%}
+    {%- set agg_distinct_proportion = 'cast(null as ' ~ dbt.type_numeric() ~ ')' -%}
+    {%- set agg_distinct_count = 'cast(null as ' ~ dbt.type_numeric() ~ ')' -%}
+    {%- set agg_is_unique = 'null' -%}
+{%- endif -%}
+
+
+{%- if (is_numeric or is_date_or_time) and (not is_struct) -%}
+    {%- set agg_min =
+        'cast(min(' ~ adapter.quote(column_name) ~ ') as ' ~ dbt.type_string() ~ ')'
+    -%}
+    {%- set agg_max =
+        'cast(max(' ~ adapter.quote(column_name) ~ ') as ' ~ dbt.type_string() ~ ')'
+    -%}
+{%- endif -%}
+
+
+{%- if is_numeric and not is_struct -%}
+    {%- set agg_avg = 'avg(' ~ adapter.quote(column_name) ~ ')' -%}
+
+    {%- if db_type == 'bigquery' -%}
+        {%- set agg_median = 'approx_quantiles(' ~ adapter.quote(column_name) ~ ', 100)[offset(50)]' -%}
+    {%- elif db_type == 'postgres' -%}
+        {%- set agg_median = 'percentile_cont(0.5) within group (order by ' ~ adapter.quote(column_name) ~ ')' -%}
+    {%- elif db_type == 'redshift' -%}
+        {%- set agg_median =
+            '(select percentile_cont(0.5) within group (order by '
+            ~ adapter.quote(column_name) ~ ') from ' ~ relation ~ ')' -%}
+    {%- elif db_type == 'athena' -%}
+        {%- set agg_median = 'approx_percentile( ' ~ adapter.quote(column_name) ~ ', 0.5)' -%}
+    {%- elif db_type == 'sqlserver' -%}
+        {%- set agg_median = 'percentile_cont(' ~ adapter.quote(column_name) ~ ', 0.5) over ()' -%}
+    {%- else -%}
+        {%- set agg_median = 'median(' ~ adapter.quote(column_name) ~ ')' -%}
+    {%- endif -%}
+{%- elif is_logical -%}
+    {%- set agg_avg = 'avg(case when ' ~ adapter.quote(column_name) ~ ' then 1 else 0 end)' -%}
+{%- endif -%}
+
+
+{# Overwrite Agg ----------------------------------------- #}
+
+{# DRC-663: Support bigquery array type #}
+{%- set is_array = column_type.startswith('array') -%}
+{%- if db_type == 'bigquery' and is_array -%}
+    {%- set agg_distinct_proportion = 'cast(null as ' ~ dbt.type_numeric() ~ ')' -%}
+    {%- set agg_distinct_count = 'cast(null as ' ~ dbt.type_numeric() ~ ')' -%}
+    {%- set agg_is_unique = 'null' -%}
+    {%- set agg_min =
+        'cast(min(array_length(' ~ adapter.quote(column_name) ~ ')) as ' ~ dbt.type_string() ~ ')'
+    -%}
+    {%- set agg_max =
+        'cast(max(array_length(' ~ adapter.quote(column_name) ~ ')) as ' ~ dbt.type_string() ~ ')'
+    -%}
+    {%- set agg_avg = 'avg(array_length(' ~ adapter.quote(column_name) ~ '))' -%}
+    {%- set agg_median =
+        'approx_quantiles(array_length(' ~ adapter.quote(column_name) ~ '), 100)[offset(50)]'
+    -%}
+{%- endif -%}
+
+
+{# Main Query -------------------------------------------- #}
+
+select
+    '{{ column_name }}' as column_name,
+    nullif('{{ column_type }}', '') as data_type,
+    {{ agg_row_count }} as row_count,
+    {{ agg_not_null_proportion }} as not_null_proportion,
+    {{ agg_distinct_proportion }} as distinct_proportion,
+    {{ agg_distinct_count }} as distinct_count,
+    {{ agg_is_unique }} as is_unique,
+    {{ agg_min }} as min,
+    {{ agg_max }} as max,
+    {{ agg_avg }} as avg,
+    {{ agg_median }} as median
+from {{ relation }}
+"""
 
 
 class ProfileParams(BaseModel):
@@ -33,14 +161,14 @@ class ProfileDiffTask(Task):
 
     def execute(self):
         import agate
+
         from recce.adapter.dbt_adapter import DbtAdapter, merge_tables
+
         dbt_adapter: DbtAdapter = default_context().adapter
 
         model: str = self.params.model
         selected_columns: List[str] = self.params.columns
 
-        self._verify_dbt_profiler(dbt_adapter)
-
         with dbt_adapter.connection_named("query"):
             self.connection = dbt_adapter.get_thread_connection()
@@ -58,7 +186,7 @@ class ProfileDiffTask(Task):
             tables: List[agate.Table] = []
 
             for column in base_columns:
-                self.update_progress(message=f'[Base] Profile column: {column.name}', percentage=completed / total)
+                self.update_progress(message=f"[Base] Profile column: {column.name}", percentage=completed / total)
                 relation = dbt_adapter.create_relation(model, base=True)
                 response, table = self._profile_column(dbt_adapter, relation, column)
                 tables.append(table)
@@ -68,7 +196,7 @@ class ProfileDiffTask(Task):
 
             tables: List[agate.Table] = []
             for column in curr_columns:
-                self.update_progress(message=f'[Current] Profile column: {column.column}', percentage=completed / total)
+                self.update_progress(message=f"[Current] Profile column: {column.column}", percentage=completed / total)
                 relation = dbt_adapter.create_relation(model, base=False)
                 response, table = self._profile_column(dbt_adapter, relation, column)
                 tables.append(table)
@@ -83,85 +211,16 @@ class ProfileDiffTask(Task):
 
         return ProfileDiffResult(base=base, current=current)
 
-    def _verify_dbt_profiler(self, dbt_adapter):
-        for macro_name, macro in dbt_adapter.manifest.macros.items():
-            if macro.package_name == 'dbt_profiler':
-                break
-        else:
-            raise RecceException(
-                r"Package 'dbt_profiler' not found. Please refer to the link to install: https://hub.getdbt.com/data-mie/dbt_profiler/")
-
     def _profile_column(self, dbt_adapter, relation, column):
-        sql_template = textwrap.dedent(r"""
-        select
-            '{{column_name}}' as column_name,
-            nullif('{{column_type}}', '') as data_type,
-            {{ dbt_profiler.measure_row_count(column_name, column_type) }} as row_count,
-            {{ dbt_profiler.measure_not_null_proportion(column_name, column_type) }} as not_null_proportion,
-            {{ dbt_profiler.measure_distinct_proportion(column_name, column_type) }} as distinct_proportion,
-            {{ dbt_profiler.measure_distinct_count(column_name, column_type) }} as distinct_count,
-            {{ dbt_profiler.measure_is_unique(column_name, column_type) }} as is_unique,
-            {{ dbt_profiler.measure_min(column_name, column_type) }} as min,
-            {{ dbt_profiler.measure_max(column_name, column_type) }} as max,
-            {{ dbt_profiler.measure_avg(column_name, column_type) }} as avg,
-            {{ dbt_profiler.measure_median(column_name, column_type) }} as median
-        from
-            {{ relation }}
-        """)
         column_name = column.name
         column_type = column.data_type.lower()
-        db_type = dbt_adapter.adapter.type()
-        if db_type == 'bigquery' and column_type.startswith('array'):
-            # DRC-663: Support bigquery array type
-            sql_template = textwrap.dedent(r"""
-            select
-                '{{column_name}}' as column_name,
-                nullif('{{column_type}}', '') as data_type,
-                {{ dbt_profiler.measure_row_count(column_name, column_type) }} as row_count,
-                {{ dbt_profiler.measure_not_null_proportion(column_name, column_type) }} as not_null_proportion,
-                cast(null as {{ dbt.type_numeric() }}) as distinct_proportion,
-                cast(null as {{ dbt.type_numeric() }}) as distinct_count,
-                null as is_unique,
-                cast(min(ARRAY_LENGTH({{ adapter.quote(column_name) }})) as {{ dbt_profiler.type_string() }}) as min,
-                cast(max(ARRAY_LENGTH({{ adapter.quote(column_name) }})) as {{ dbt_profiler.type_string() }}) as max,
-                avg(ARRAY_LENGTH({{ adapter.quote(column_name) }})) as avg,
-                APPROX_QUANTILES(ARRAY_LENGTH({{ adapter.quote(column_name) }}), 100)[OFFSET(50)] as median,
-            from
-                {{ relation }}
-            """)
-        elif db_type == 'redshift':
-            # DRC-1149: Support redshift median calculation
-            # https://github.com/data-mie/dbt-profiler/pull/89
-            #
-            # Since dbt-profiler 0.8.2, there is the third parameter for measure_median
-            # For sake of compatibility, we use the new way to call the macro only for redshift
-            sql_template = textwrap.dedent(r"""
-            with source_data as (
-                select
-                    *
-                from {{ relation }}
-            )
-            select
-                '{{column_name}}' as column_name,
-                nullif('{{column_type}}', '') as data_type,
-                {{ dbt_profiler.measure_row_count(column_name, column_type) }} as row_count,
-                {{ dbt_profiler.measure_not_null_proportion(column_name, column_type) }} as not_null_proportion,
-                {{ dbt_profiler.measure_distinct_proportion(column_name, column_type) }} as distinct_proportion,
-                {{ dbt_profiler.measure_distinct_count(column_name, column_type) }} as distinct_count,
-                {{ dbt_profiler.measure_is_unique(column_name, column_type) }} as is_unique,
-                {{ dbt_profiler.measure_min(column_name, column_type) }} as min,
-                {{ dbt_profiler.measure_max(column_name, column_type) }} as max,
-                {{ dbt_profiler.measure_avg(column_name, column_type) }} as avg,
-                ({{ dbt_profiler.measure_median(column_name, column_type, 'source_data') }}) as median
-            from
-                source_data
-            """)
+        db_type = dbt_adapter.adapter.type().lower()
 
         try:
             sql = dbt_adapter.generate_sql(
-                sql_template,
+                PROFILE_COLUMN_JINJA_TEMPLATE,
                 base=False,  # always false because we use the macro in current manifest
-                context=dict(relation=relation, column_name=column_name, column_type=column_type)
+                context=dict(relation=relation, column_name=column_name, column_type=column_type, db_type=db_type),
             )
         except Exception as e:
             raise RecceException(f"Failed to generate SQL for profiling column: {column_name}") from e
@@ -170,14 +229,15 @@ class ProfileDiffTask(Task):
             return dbt_adapter.execute(sql, fetch=True)
         except Exception as e:
             from recce.adapter.dbt_adapter import dbt_version
-            if dbt_version < 'v1.8':
+
+            if dbt_version < "v1.8":
                 from dbt.exceptions import DbtDatabaseError
             else:
                 from dbt_common.exceptions import DbtDatabaseError
             if isinstance(e, DbtDatabaseError):
-                if str(e).find('100051') >= 0:
+                if str(e).find("100051") >= 0:
                     # Snowflake error '100051 (22012): Division by zero"'
-                    e = RecceException('No profile diff result due to the model is empty.', False)
+                    e = RecceException("No profile diff result due to the model is empty.", False)
             raise e
 
     def cancel(self):
@@ -185,6 +245,7 @@ class ProfileDiffTask(Task):
 
         if self.connection:
             from recce.adapter.dbt_adapter import DbtAdapter
+
             dbt_adapter: DbtAdapter = default_context().adapter
             with dbt_adapter.connection_named("cancel"):
                 dbt_adapter.cancel(self.connection)
@@ -192,7 +253,7 @@
 
 class ProfileDiffResultDiffer(TaskResultDiffer):
     def _check_result_changed_fn(self, result):
-        return self.diff(result['base'], result['current'])
+        return self.diff(result["base"], result["current"])
 
 
 class ProfileCheckValidator(CheckValidator):
@@ -207,14 +268,14 @@ class ProfileCheckValidator(CheckValidator):
 class ProfileTask(ProfileDiffTask):
     def execute(self):
         import agate
+
         from recce.adapter.dbt_adapter import DbtAdapter, merge_tables
+
         dbt_adapter: DbtAdapter = default_context().adapter
 
         model: str = self.params.model
         selected_columns: List[str] = self.params.columns
 
-        self._verify_dbt_profiler(dbt_adapter)
-
         with dbt_adapter.connection_named("query"):
             self.connection = dbt_adapter.get_thread_connection()
             curr_columns = [column for column in dbt_adapter.get_columns(model, base=False)]
@@ -227,7 +288,7 @@ class ProfileTask(ProfileDiffTask):
 
             tables: List[agate.Table] = []
             for column in curr_columns:
-                self.update_progress(message=f'[Current] Profile column: {column.column}', percentage=completed / total)
+                self.update_progress(message=f"[Current] Profile column: {column.column}", percentage=completed / total)
                 relation = dbt_adapter.create_relation(model, base=False)
                 response, table = self._profile_column(dbt_adapter, relation, column)
                 tables.append(table)
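
The net effect of the profile.py change is that column profiling no longer depends on the dbt_profiler package: the aggregations are built by the in-repo PROFILE_COLUMN_JINJA_TEMPLATE, with db_type passed in the context to select dialect-specific median logic. The template can be rendered outside dbt by stubbing the dbt and adapter context objects that generate_sql normally supplies; a sketch under that assumption (the stub classes and sample names are hypothetical):

    from jinja2 import Template

    from recce.tasks.profile import PROFILE_COLUMN_JINJA_TEMPLATE

    class DbtStub:
        # Stand-ins for dbt's cross-database type macros.
        def type_bigint(self):
            return "bigint"

        def type_numeric(self):
            return "numeric"

        def type_string(self):
            return "varchar"

    class AdapterStub:
        # Stand-in for the adapter's identifier quoting.
        def quote(self, name):
            return f'"{name}"'

    sql = Template(PROFILE_COLUMN_JINJA_TEMPLATE).render(
        dbt=DbtStub(),
        adapter=AdapterStub(),
        relation="analytics.orders",  # hypothetical relation name
        column_name="amount",
        column_type="numeric",        # routes to the is_numeric branch
        db_type="duckdb",             # falls through to the generic median()
    )
    print(sql)
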