recce-nightly 1.2.0.20250506__py3-none-any.whl → 1.4.0.20250514__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of recce-nightly might be problematic. Click here for more details.
- recce/VERSION +1 -1
- recce/__init__.py +22 -22
- recce/adapter/base.py +11 -14
- recce/adapter/dbt_adapter/__init__.py +355 -316
- recce/adapter/dbt_adapter/dbt_version.py +3 -0
- recce/adapter/sqlmesh_adapter.py +24 -35
- recce/apis/check_api.py +39 -28
- recce/apis/check_func.py +33 -27
- recce/apis/run_api.py +25 -19
- recce/apis/run_func.py +29 -23
- recce/artifact.py +44 -49
- recce/cli.py +484 -285
- recce/config.py +42 -33
- recce/core.py +52 -44
- recce/data/404.html +1 -1
- recce/data/_next/static/chunks/{368-7587b306577df275.js → 778-aef312bffb4c0312.js} +15 -15
- recce/data/_next/static/chunks/8d700b6a.ed11a130057c7a47.js +1 -0
- recce/data/_next/static/chunks/app/layout-c713a2829d3279e4.js +1 -0
- recce/data/_next/static/chunks/app/page-7086764277331fcb.js +1 -0
- recce/data/_next/static/chunks/{cd9f8d63-cf0d5a7b0f7a92e8.js → cd9f8d63-e020f408095ed77c.js} +3 -3
- recce/data/_next/static/chunks/webpack-b787cb1a4f2293de.js +1 -0
- recce/data/_next/static/css/88b8abc134cfd59a.css +3 -0
- recce/data/index.html +2 -2
- recce/data/index.txt +2 -2
- recce/diff.py +6 -12
- recce/event/__init__.py +74 -72
- recce/event/collector.py +27 -20
- recce/event/track.py +39 -27
- recce/exceptions.py +1 -1
- recce/git.py +7 -7
- recce/github.py +57 -53
- recce/models/__init__.py +1 -1
- recce/models/check.py +6 -7
- recce/models/run.py +1 -0
- recce/models/types.py +27 -27
- recce/pull_request.py +26 -24
- recce/run.py +148 -111
- recce/server.py +105 -88
- recce/state.py +209 -177
- recce/summary.py +168 -143
- recce/tasks/__init__.py +3 -3
- recce/tasks/core.py +11 -13
- recce/tasks/dataframe.py +19 -17
- recce/tasks/histogram.py +69 -34
- recce/tasks/lineage.py +2 -2
- recce/tasks/profile.py +152 -86
- recce/tasks/query.py +139 -87
- recce/tasks/rowcount.py +33 -30
- recce/tasks/schema.py +14 -14
- recce/tasks/top_k.py +35 -35
- recce/tasks/valuediff.py +216 -152
- recce/util/breaking.py +77 -84
- recce/util/cll.py +55 -51
- recce/util/io.py +19 -17
- recce/util/logger.py +1 -1
- recce/util/recce_cloud.py +70 -72
- recce/util/singleton.py +4 -4
- recce/yaml/__init__.py +7 -10
- {recce_nightly-1.2.0.20250506.dist-info → recce_nightly-1.4.0.20250514.dist-info}/METADATA +5 -2
- recce_nightly-1.4.0.20250514.dist-info/RECORD +143 -0
- {recce_nightly-1.2.0.20250506.dist-info → recce_nightly-1.4.0.20250514.dist-info}/WHEEL +1 -1
- tests/adapter/dbt_adapter/conftest.py +1 -0
- tests/adapter/dbt_adapter/dbt_test_helper.py +28 -18
- tests/adapter/dbt_adapter/test_dbt_adapter.py +0 -15
- tests/adapter/dbt_adapter/test_dbt_cll.py +39 -32
- tests/adapter/dbt_adapter/test_selector.py +22 -21
- tests/tasks/test_histogram.py +58 -66
- tests/tasks/test_lineage.py +36 -23
- tests/tasks/test_preset_checks.py +45 -31
- tests/tasks/test_profile.py +340 -15
- tests/tasks/test_query.py +40 -40
- tests/tasks/test_row_count.py +65 -46
- tests/tasks/test_schema.py +65 -42
- tests/tasks/test_top_k.py +22 -18
- tests/tasks/test_valuediff.py +43 -32
- tests/test_cli.py +71 -58
- tests/test_config.py +7 -9
- tests/test_core.py +5 -3
- tests/test_dbt.py +7 -7
- tests/test_pull_request.py +1 -1
- tests/test_server.py +19 -13
- tests/test_state.py +40 -27
- tests/test_summary.py +18 -14
- recce/data/_next/static/chunks/8d700b6a-f0b1f6b9e0d97ce2.js +0 -1
- recce/data/_next/static/chunks/app/layout-9102e22cb73f74d6.js +0 -1
- recce/data/_next/static/chunks/app/page-cee661090afbd6aa.js +0 -1
- recce/data/_next/static/chunks/webpack-567d72f0bc0820d5.js +0 -1
- recce_nightly-1.2.0.20250506.dist-info/RECORD +0 -142
- /recce/data/_next/static/{Kcbs3GEIyH2LxgLYat0es → E_HPXsXdrqHg2YEHmU3mK}/_buildManifest.js +0 -0
- /recce/data/_next/static/{Kcbs3GEIyH2LxgLYat0es → E_HPXsXdrqHg2YEHmU3mK}/_ssgManifest.js +0 -0
- {recce_nightly-1.2.0.20250506.dist-info → recce_nightly-1.4.0.20250514.dist-info}/entry_points.txt +0 -0
- {recce_nightly-1.2.0.20250506.dist-info → recce_nightly-1.4.0.20250514.dist-info}/licenses/LICENSE +0 -0
- {recce_nightly-1.2.0.20250506.dist-info → recce_nightly-1.4.0.20250514.dist-info}/top_level.txt +0 -0
recce/tasks/dataframe.py
CHANGED
|
@@ -10,14 +10,14 @@ from pydantic import BaseModel, Field
|
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
class DataFrameColumnType(Enum):
|
|
13
|
-
NUMBER =
|
|
14
|
-
INTEGER =
|
|
15
|
-
TEXT =
|
|
16
|
-
BOOLEAN =
|
|
17
|
-
DATE =
|
|
18
|
-
DATETIME =
|
|
19
|
-
TIMEDELTA =
|
|
20
|
-
UNKNOWN =
|
|
13
|
+
NUMBER = "number"
|
|
14
|
+
INTEGER = "integer"
|
|
15
|
+
TEXT = "text"
|
|
16
|
+
BOOLEAN = "boolean"
|
|
17
|
+
DATE = "date"
|
|
18
|
+
DATETIME = "datetime"
|
|
19
|
+
TIMEDELTA = "timedelta"
|
|
20
|
+
UNKNOWN = "unknown"
|
|
21
21
|
|
|
22
22
|
|
|
23
23
|
class DataFrameColumn(BaseModel):
|
|
@@ -32,19 +32,21 @@ class DataFrame(BaseModel):
|
|
|
32
32
|
more: t.Optional[bool] = Field(None, description="Whether there are more rows to fetch")
|
|
33
33
|
|
|
34
34
|
@staticmethod
|
|
35
|
-
def from_agate(table:
|
|
35
|
+
def from_agate(table: "agate.Table", limit: t.Optional[int] = None, more: t.Optional[bool] = None):
|
|
36
36
|
from recce.adapter.dbt_adapter import dbt_version
|
|
37
|
-
|
|
37
|
+
|
|
38
|
+
if dbt_version < "v1.8":
|
|
38
39
|
import dbt.clients.agate_helper as agate_helper
|
|
39
40
|
else:
|
|
40
41
|
import dbt_common.clients.agate_helper as agate_helper
|
|
41
42
|
|
|
42
43
|
import agate
|
|
44
|
+
|
|
43
45
|
columns = []
|
|
44
46
|
|
|
45
47
|
for col_name, col_type in zip(table.column_names, table.column_types):
|
|
46
48
|
|
|
47
|
-
has_integer = hasattr(agate_helper,
|
|
49
|
+
has_integer = hasattr(agate_helper, "Integer")
|
|
48
50
|
|
|
49
51
|
if isinstance(col_type, agate.Number):
|
|
50
52
|
col_type = DataFrameColumnType.NUMBER
|
|
@@ -78,23 +80,23 @@ class DataFrame(BaseModel):
|
|
|
78
80
|
return df
|
|
79
81
|
|
|
80
82
|
@staticmethod
|
|
81
|
-
def from_pandas(pandas_df:
|
|
83
|
+
def from_pandas(pandas_df: "pandas.DataFrame", limit: t.Optional[int] = None, more: t.Optional[bool] = None):
|
|
82
84
|
columns = []
|
|
83
85
|
for column in pandas_df.columns:
|
|
84
86
|
dtype = pandas_df[column].dtype
|
|
85
|
-
if dtype ==
|
|
87
|
+
if dtype == "int64":
|
|
86
88
|
col_type = DataFrameColumnType.INTEGER
|
|
87
|
-
elif dtype ==
|
|
89
|
+
elif dtype == "float64":
|
|
88
90
|
col_type = DataFrameColumnType.NUMBER
|
|
89
|
-
elif dtype ==
|
|
91
|
+
elif dtype == "object":
|
|
90
92
|
col_type = DataFrameColumnType.TEXT
|
|
91
|
-
elif dtype ==
|
|
93
|
+
elif dtype == "bool":
|
|
92
94
|
col_type = DataFrameColumnType.BOOLEAN
|
|
93
95
|
else:
|
|
94
96
|
col_type = DataFrameColumnType.UNKNOWN
|
|
95
97
|
columns.append(DataFrameColumn(name=column, type=col_type))
|
|
96
98
|
|
|
97
|
-
s = pandas_df.to_json(orient=
|
|
99
|
+
s = pandas_df.to_json(orient="values")
|
|
98
100
|
data = json.loads(s)
|
|
99
101
|
|
|
100
102
|
df = DataFrame(
|
recce/tasks/histogram.py
CHANGED
|
@@ -9,34 +9,66 @@ from pydantic import BaseModel
|
|
|
9
9
|
from recce.core import default_context
|
|
10
10
|
from recce.models import Check
|
|
11
11
|
from recce.tasks import Task
|
|
12
|
-
from recce.tasks.core import
|
|
12
|
+
from recce.tasks.core import CheckValidator, TaskResultDiffer
|
|
13
13
|
from recce.tasks.query import QueryMixin
|
|
14
14
|
|
|
15
15
|
sql_datetime_types = [
|
|
16
|
-
"DATE",
|
|
16
|
+
"DATE",
|
|
17
|
+
"DATETIME",
|
|
18
|
+
"TIMESTAMP",
|
|
19
|
+
"TIME",
|
|
17
20
|
"YEAR", # Specific to MySQL/MariaDB
|
|
18
|
-
"DATETIME2",
|
|
21
|
+
"DATETIME2",
|
|
22
|
+
"SMALLDATETIME",
|
|
23
|
+
"DATETIMEOFFSET", # Specific to SQL Server
|
|
19
24
|
"INTERVAL", # Common in PostgreSQL and Oracle
|
|
20
|
-
"TIMESTAMPTZ",
|
|
21
|
-
"
|
|
22
|
-
"
|
|
25
|
+
"TIMESTAMPTZ",
|
|
26
|
+
"TIMETZ", # Specific to PostgreSQL
|
|
27
|
+
"TIMESTAMP WITH TIME ZONE",
|
|
28
|
+
"TIMESTAMP WITH LOCAL TIME ZONE", # Oracle
|
|
29
|
+
"TIMESTAMP_LTZ",
|
|
30
|
+
"TIMESTAMP_NTZ",
|
|
31
|
+
"TIMESTAMP_TZ", # Specific to Snowflake
|
|
23
32
|
]
|
|
24
33
|
|
|
25
34
|
sql_integer_types = [
|
|
26
|
-
"TINYINT",
|
|
27
|
-
"
|
|
35
|
+
"TINYINT",
|
|
36
|
+
"SMALLINT",
|
|
37
|
+
"MEDIUMINT",
|
|
38
|
+
"INT",
|
|
39
|
+
"INTEGER",
|
|
40
|
+
"BIGINT", # Common across most databases
|
|
41
|
+
"INT2",
|
|
42
|
+
"INT4",
|
|
43
|
+
"INT8", # PostgreSQL specific aliases
|
|
28
44
|
"UNSIGNED BIG INT", # SQLite specific
|
|
29
45
|
"NUMBER", # Oracle, can be used as an integer with precision and scale
|
|
30
46
|
"NUMERIC", # Generally available in many SQL databases, used with precision and scale
|
|
31
|
-
"SMALLSERIAL",
|
|
32
|
-
"
|
|
47
|
+
"SMALLSERIAL",
|
|
48
|
+
"SERIAL",
|
|
49
|
+
"BIGSERIAL", # PostgreSQL auto-increment types
|
|
50
|
+
"IDENTITY",
|
|
51
|
+
"SMALLIDENTITY",
|
|
52
|
+
"BIGIDENTITY", # SQL Server specific auto-increment types
|
|
33
53
|
"BYTEINT", # Specific to Snowflake, for storing very small integers
|
|
34
54
|
]
|
|
35
55
|
|
|
36
56
|
sql_not_supported_types = [
|
|
37
|
-
"CHAR",
|
|
38
|
-
"
|
|
39
|
-
"
|
|
57
|
+
"CHAR",
|
|
58
|
+
"VARCHAR",
|
|
59
|
+
"TINYTEXT",
|
|
60
|
+
"TEXT",
|
|
61
|
+
"MEDIUMTEXT",
|
|
62
|
+
"LONGTEXT",
|
|
63
|
+
"NCHAR",
|
|
64
|
+
"NVARCHAR",
|
|
65
|
+
"VARCHAR2",
|
|
66
|
+
"NVARCHAR2",
|
|
67
|
+
"CLOB",
|
|
68
|
+
"NCLOB",
|
|
69
|
+
"VARCHAR(MAX)",
|
|
70
|
+
"XML",
|
|
71
|
+
"JSON",
|
|
40
72
|
"BOOLEAN", # PostgreSQL, SQLite, and others with native boolean support
|
|
41
73
|
"TINYINT(1)", # MySQL/MariaDB uses TINYINT(1) to represent boolean values
|
|
42
74
|
"BIT", # SQL Server and others use BIT to represent boolean values, where 1 is true and 0 is false
|
|
@@ -185,7 +217,7 @@ def query_numeric_histogram(task, node, column, column_type, min_value, max_valu
|
|
|
185
217
|
else:
|
|
186
218
|
counts[num_bins - 1] += count
|
|
187
219
|
base_result = {
|
|
188
|
-
|
|
220
|
+
"counts": counts,
|
|
189
221
|
}
|
|
190
222
|
if curr is not None:
|
|
191
223
|
counts = [0] * num_bins
|
|
@@ -199,7 +231,7 @@ def query_numeric_histogram(task, node, column, column_type, min_value, max_valu
|
|
|
199
231
|
else:
|
|
200
232
|
counts[num_bins - 1] += count
|
|
201
233
|
curr_result = {
|
|
202
|
-
|
|
234
|
+
"counts": counts,
|
|
203
235
|
}
|
|
204
236
|
return base_result, curr_result, bin_edges, labels
|
|
205
237
|
|
|
@@ -209,7 +241,7 @@ def query_datetime_histogram(task, node, column, min_value, max_value):
|
|
|
209
241
|
print(max_value, min_value, days_delta)
|
|
210
242
|
# _type = None
|
|
211
243
|
if days_delta > 365 * 4:
|
|
212
|
-
_type =
|
|
244
|
+
_type = "yearly"
|
|
213
245
|
dmin = date(min_value.year, 1, 1)
|
|
214
246
|
if max_value.year < 3000:
|
|
215
247
|
dmax = date(max_value.year, 1, 1) + relativedelta(years=+1)
|
|
@@ -237,7 +269,7 @@ def query_datetime_histogram(task, node, column, min_value, max_value):
|
|
|
237
269
|
else:
|
|
238
270
|
dmax = date(3000, 1, 1)
|
|
239
271
|
period = relativedelta(dmax, dmin)
|
|
240
|
-
num_buckets =
|
|
272
|
+
num_buckets = period.years * 12 + period.months
|
|
241
273
|
bin_edges = [dmin + relativedelta(months=i) for i in range(num_buckets + 1)]
|
|
242
274
|
sql = f"""
|
|
243
275
|
SELECT
|
|
@@ -285,18 +317,18 @@ def query_datetime_histogram(task, node, column, min_value, max_value):
|
|
|
285
317
|
|
|
286
318
|
base_counts = [0] * num_buckets
|
|
287
319
|
print(_type)
|
|
288
|
-
for
|
|
320
|
+
for d, v in base.rows:
|
|
289
321
|
i = bin_edges.index(d.date()) if isinstance(d, datetime) else bin_edges.index(d)
|
|
290
322
|
base_counts[i] = v
|
|
291
323
|
curr_counts = [0] * num_buckets
|
|
292
|
-
for
|
|
324
|
+
for d, v in curr.rows:
|
|
293
325
|
i = bin_edges.index(d.date()) if isinstance(d, datetime) else bin_edges.index(d)
|
|
294
326
|
curr_counts[i] = v
|
|
295
327
|
base_result = {
|
|
296
|
-
|
|
328
|
+
"counts": base_counts,
|
|
297
329
|
}
|
|
298
330
|
curr_result = {
|
|
299
|
-
|
|
331
|
+
"counts": curr_counts,
|
|
300
332
|
}
|
|
301
333
|
|
|
302
334
|
return base_result, curr_result, bin_edges
|
|
@@ -310,6 +342,7 @@ class HistogramDiffTask(Task, QueryMixin):
|
|
|
310
342
|
|
|
311
343
|
def execute(self):
|
|
312
344
|
from recce.adapter.dbt_adapter import DbtAdapter
|
|
345
|
+
|
|
313
346
|
result = {}
|
|
314
347
|
|
|
315
348
|
dbt_adapter: DbtAdapter = default_context().adapter
|
|
@@ -353,29 +386,31 @@ class HistogramDiffTask(Task, QueryMixin):
|
|
|
353
386
|
labels = None
|
|
354
387
|
if min_value is None or max_value is None:
|
|
355
388
|
base_result = {
|
|
356
|
-
|
|
389
|
+
"counts": [],
|
|
357
390
|
}
|
|
358
391
|
current_result = {
|
|
359
|
-
|
|
392
|
+
"counts": [],
|
|
360
393
|
}
|
|
361
394
|
bin_edges = []
|
|
362
395
|
labels = []
|
|
363
396
|
elif column_type.upper() in sql_datetime_types:
|
|
364
397
|
base_result, current_result, bin_edges = query_datetime_histogram(
|
|
365
|
-
self, node, column, min_value, max_value
|
|
398
|
+
self, node, column, min_value, max_value
|
|
399
|
+
)
|
|
366
400
|
else:
|
|
367
401
|
base_result, current_result, bin_edges, labels = query_numeric_histogram(
|
|
368
|
-
self, node, column, column_type, min_value, max_value, num_bins
|
|
402
|
+
self, node, column, column_type, min_value, max_value, num_bins
|
|
403
|
+
)
|
|
369
404
|
if base_result:
|
|
370
|
-
base_result[
|
|
405
|
+
base_result["total"] = base_total
|
|
371
406
|
if current_result:
|
|
372
|
-
current_result[
|
|
373
|
-
result[
|
|
374
|
-
result[
|
|
375
|
-
result[
|
|
376
|
-
result[
|
|
377
|
-
result[
|
|
378
|
-
result[
|
|
407
|
+
current_result["total"] = curr_total
|
|
408
|
+
result["base"] = base_result
|
|
409
|
+
result["current"] = current_result
|
|
410
|
+
result["min"] = min_value
|
|
411
|
+
result["max"] = max_value
|
|
412
|
+
result["bin_edges"] = bin_edges
|
|
413
|
+
result["labels"] = labels
|
|
379
414
|
return result
|
|
380
415
|
|
|
381
416
|
def cancel(self):
|
|
@@ -386,7 +421,7 @@ class HistogramDiffTask(Task, QueryMixin):
|
|
|
386
421
|
|
|
387
422
|
class HistogramDiffTaskResultDiffer(TaskResultDiffer):
|
|
388
423
|
def _check_result_changed_fn(self, result):
|
|
389
|
-
return TaskResultDiffer.diff(result[
|
|
424
|
+
return TaskResultDiffer.diff(result["base"], result["current"])
|
|
390
425
|
|
|
391
426
|
|
|
392
427
|
class HistogramDiffCheckValidator(CheckValidator):
|
recce/tasks/lineage.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import
|
|
1
|
+
from typing import Literal, Optional
|
|
2
2
|
|
|
3
3
|
from pydantic import BaseModel
|
|
4
4
|
|
|
@@ -10,7 +10,7 @@ class LineageDiffParams(BaseModel):
|
|
|
10
10
|
select: Optional[str] = None
|
|
11
11
|
exclude: Optional[str] = None
|
|
12
12
|
packages: Optional[list[str]] = None
|
|
13
|
-
view_mode: Optional[Literal[
|
|
13
|
+
view_mode: Optional[Literal["all", "changed_models"]] = None
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
class LineageDiffCheckValidator(CheckValidator):
|
recce/tasks/profile.py
CHANGED
|
@@ -1,13 +1,141 @@
|
|
|
1
|
-
import textwrap
|
|
2
1
|
from typing import List
|
|
3
2
|
|
|
4
3
|
from pydantic import BaseModel
|
|
5
4
|
|
|
6
|
-
from .core import Task, TaskResultDiffer, CheckValidator
|
|
7
|
-
from .dataframe import DataFrame
|
|
8
5
|
from ..core import default_context
|
|
9
6
|
from ..exceptions import RecceException
|
|
10
7
|
from ..models import Check
|
|
8
|
+
from .core import CheckValidator, Task, TaskResultDiffer
|
|
9
|
+
from .dataframe import DataFrame
|
|
10
|
+
|
|
11
|
+
PROFILE_COLUMN_JINJA_TEMPLATE = r"""
|
|
12
|
+
{# Conditions -------------------------------------------- #}
|
|
13
|
+
{%- set is_struct = column_type.startswith('struct') -%}
|
|
14
|
+
{%- set is_numeric =
|
|
15
|
+
column_type.startswith('int') or
|
|
16
|
+
column_type.startswith('float') or
|
|
17
|
+
'numeric' in column_type or
|
|
18
|
+
'number' in column_type or
|
|
19
|
+
'double' in column_type or
|
|
20
|
+
'bigint' in column_type
|
|
21
|
+
-%}
|
|
22
|
+
{%- set is_date_or_time =
|
|
23
|
+
column_type.startswith('date') or
|
|
24
|
+
column_type.startswith('timestamp')
|
|
25
|
+
-%}
|
|
26
|
+
{%- set is_logical = column_type.startswith('bool') -%}
|
|
27
|
+
|
|
28
|
+
{%- if db_type == 'sqlserver' -%}
|
|
29
|
+
{%- set is_numeric = column_type in [
|
|
30
|
+
"bigint", "numeric", "smallint", "decimal", "int",
|
|
31
|
+
"tinyint", "money", "float", "real"
|
|
32
|
+
]-%}
|
|
33
|
+
{%- elif db_type == 'athena' -%}
|
|
34
|
+
{%- set is_numeric =
|
|
35
|
+
"int" in column_type or
|
|
36
|
+
"float" in column_type or
|
|
37
|
+
"decimal" in column_type or
|
|
38
|
+
"double" in column_type
|
|
39
|
+
-%}
|
|
40
|
+
{%- endif -%}
|
|
41
|
+
|
|
42
|
+
{# General Agg ------------------------------------------- #}
|
|
43
|
+
{%- set agg_row_count = 'cast(count(*) as ' ~ dbt.type_bigint() ~ ')' -%}
|
|
44
|
+
{%- set agg_not_null_proportion =
|
|
45
|
+
'sum(case when ' ~ adapter.quote(column_name) ~ ' is null '
|
|
46
|
+
~ 'then 0 '
|
|
47
|
+
~ 'else 1 end) / '
|
|
48
|
+
~ 'cast(count(*) as ' ~ dbt.type_numeric() ~ ')'
|
|
49
|
+
-%}
|
|
50
|
+
{%- set agg_distinct_proportion =
|
|
51
|
+
'count(distinct ' ~ adapter.quote(column_name) ~') / '
|
|
52
|
+
~ 'cast(count(*) as ' ~ dbt.type_numeric() ~ ')'
|
|
53
|
+
-%}
|
|
54
|
+
{%- set agg_distinct_count = 'count(distinct ' ~ adapter.quote(column_name) ~ ')' -%}
|
|
55
|
+
{%- set agg_is_unique = 'count(distinct ' ~ adapter.quote(column_name) ~ ') = count(*)' -%}
|
|
56
|
+
{%- set agg_min = 'cast(null as ' ~ dbt.type_string() ~ ')' -%}
|
|
57
|
+
{%- set agg_max = 'cast(null as ' ~ dbt.type_string() ~ ')' -%}
|
|
58
|
+
{%- set agg_avg = 'cast(null as ' ~ dbt.type_numeric() ~ ')' -%}
|
|
59
|
+
{%- set agg_median = 'cast(null as ' ~ dbt.type_numeric() ~ ')' -%}
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
{%- if is_struct -%}
|
|
63
|
+
{%- set agg_distinct_proportion = 'cast(null as ' ~ dbt.type_numeric() ~ ')' -%}
|
|
64
|
+
{%- set agg_distinct_count = 'cast(null as ' ~ dbt.type_numeric() ~ ')' -%}
|
|
65
|
+
{%- set agg_is_unique = 'null' -%}
|
|
66
|
+
{%- endif -%}
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
{%- if (is_numeric or is_date_or_time) and (not is_struct) -%}
|
|
70
|
+
{%- set agg_min =
|
|
71
|
+
'cast(min(' ~ adapter.quote(column_name) ~ ') as ' ~ dbt.type_string() ~ ')'
|
|
72
|
+
-%}
|
|
73
|
+
{%- set agg_max =
|
|
74
|
+
'cast(max(' ~ adapter.quote(column_name) ~ ') as ' ~ dbt.type_string() ~ ')'
|
|
75
|
+
-%}
|
|
76
|
+
{%- endif -%}
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
{%- if is_numeric and not is_struct -%}
|
|
80
|
+
{%- set agg_avg = 'avg(' ~ adapter.quote(column_name) ~ ')' -%}
|
|
81
|
+
|
|
82
|
+
{%- if db_type == 'bigquery' -%}
|
|
83
|
+
{%- set agg_median = 'approx_quantiles(' ~ adapter.quote(column_name) ~ ', 100)[offset(50)]' -%}
|
|
84
|
+
{%- elif db_type == 'postgres' -%}
|
|
85
|
+
{%- set agg_median = 'percentile_cont(0.5) within group (order by ' ~ adapter.quote(column_name) ~ ')' -%}
|
|
86
|
+
{%- elif db_type == 'redshift' -%}
|
|
87
|
+
{%- set agg_median =
|
|
88
|
+
'(select percentile_cont(0.5) within group (order by '
|
|
89
|
+
~ adapter.quote(column_name) ~ ') from ' ~ relation ~ ')' -%}
|
|
90
|
+
{%- elif db_type == 'athena' -%}
|
|
91
|
+
{%- set agg_median = 'approx_percentile( ' ~ adapter.quote(column_name) ~ ', 0.5)' -%}
|
|
92
|
+
{%- elif db_type == 'sqlserver' -%}
|
|
93
|
+
{%- set agg_median = 'percentile_cont(' ~ adapter.quote(column_name) ~ ', 0.5) over ()' -%}
|
|
94
|
+
{%- else -%}
|
|
95
|
+
{%- set agg_median = 'median(' ~ adapter.quote(column_name) ~ ')' -%}
|
|
96
|
+
{%- endif -%}
|
|
97
|
+
{%- elif is_logical -%}
|
|
98
|
+
{%- set agg_avg = 'avg(case when ' ~ adapter.quote(column_name) ~ ' then 1 else 0 end)' -%}
|
|
99
|
+
{%- endif -%}
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
{# Overwrite Agg ----------------------------------------- #}
|
|
103
|
+
|
|
104
|
+
{# DRC-663: Support bigquery array type #}
|
|
105
|
+
{%- set is_array = column_type.startswith('array') -%}
|
|
106
|
+
{%- if db_type == 'bigquery' and is_array -%}
|
|
107
|
+
{%- set agg_distinct_proportion = 'cast(null as ' ~ dbt.type_numeric() ~ ')' -%}
|
|
108
|
+
{%- set agg_distinct_count = 'cast(null as ' ~ dbt.type_numeric() ~ ')' -%}
|
|
109
|
+
{%- set agg_is_unique = 'null' -%}
|
|
110
|
+
{%- set agg_min =
|
|
111
|
+
'cast(min(array_length(' ~ adapter.quote(column_name) ~ ')) as ' ~ dbt.type_string() ~ ')'
|
|
112
|
+
-%}
|
|
113
|
+
{%- set agg_max =
|
|
114
|
+
'cast(max(array_length(' ~ adapter.quote(column_name) ~ ')) as ' ~ dbt.type_string() ~ ')'
|
|
115
|
+
-%}
|
|
116
|
+
{%- set agg_avg = 'avg(array_length(' ~ adapter.quote(column_name) ~ '))' -%}
|
|
117
|
+
{%- set agg_median =
|
|
118
|
+
'approx_quantiles(array_length(' ~ adapter.quote(column_name) ~ '), 100)[offset(50)]'
|
|
119
|
+
-%}
|
|
120
|
+
{%- endif -%}
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
{# Main Query -------------------------------------------- #}
|
|
124
|
+
|
|
125
|
+
select
|
|
126
|
+
'{{ column_name }}' as column_name,
|
|
127
|
+
nullif('{{ column_type }}', '') as data_type,
|
|
128
|
+
{{ agg_row_count }} as row_count,
|
|
129
|
+
{{ agg_not_null_proportion }} as not_null_proportion,
|
|
130
|
+
{{ agg_distinct_proportion }} as distinct_proportion,
|
|
131
|
+
{{ agg_distinct_count }} as distinct_count,
|
|
132
|
+
{{ agg_is_unique }} as is_unique,
|
|
133
|
+
{{ agg_min }} as min,
|
|
134
|
+
{{ agg_max }} as max,
|
|
135
|
+
{{ agg_avg }} as avg,
|
|
136
|
+
{{ agg_median }} as median
|
|
137
|
+
from {{ relation }}
|
|
138
|
+
"""
|
|
11
139
|
|
|
12
140
|
|
|
13
141
|
class ProfileParams(BaseModel):
|
|
@@ -33,14 +161,14 @@ class ProfileDiffTask(Task):
|
|
|
33
161
|
|
|
34
162
|
def execute(self):
|
|
35
163
|
import agate
|
|
164
|
+
|
|
36
165
|
from recce.adapter.dbt_adapter import DbtAdapter, merge_tables
|
|
166
|
+
|
|
37
167
|
dbt_adapter: DbtAdapter = default_context().adapter
|
|
38
168
|
|
|
39
169
|
model: str = self.params.model
|
|
40
170
|
selected_columns: List[str] = self.params.columns
|
|
41
171
|
|
|
42
|
-
self._verify_dbt_profiler(dbt_adapter)
|
|
43
|
-
|
|
44
172
|
with dbt_adapter.connection_named("query"):
|
|
45
173
|
self.connection = dbt_adapter.get_thread_connection()
|
|
46
174
|
|
|
@@ -58,7 +186,7 @@ class ProfileDiffTask(Task):
|
|
|
58
186
|
tables: List[agate.Table] = []
|
|
59
187
|
|
|
60
188
|
for column in base_columns:
|
|
61
|
-
self.update_progress(message=f
|
|
189
|
+
self.update_progress(message=f"[Base] Profile column: {column.name}", percentage=completed / total)
|
|
62
190
|
relation = dbt_adapter.create_relation(model, base=True)
|
|
63
191
|
response, table = self._profile_column(dbt_adapter, relation, column)
|
|
64
192
|
tables.append(table)
|
|
@@ -68,7 +196,7 @@ class ProfileDiffTask(Task):
|
|
|
68
196
|
|
|
69
197
|
tables: List[agate.Table] = []
|
|
70
198
|
for column in curr_columns:
|
|
71
|
-
self.update_progress(message=f
|
|
199
|
+
self.update_progress(message=f"[Current] Profile column: {column.column}", percentage=completed / total)
|
|
72
200
|
relation = dbt_adapter.create_relation(model, base=False)
|
|
73
201
|
response, table = self._profile_column(dbt_adapter, relation, column)
|
|
74
202
|
tables.append(table)
|
|
@@ -76,87 +204,23 @@ class ProfileDiffTask(Task):
|
|
|
76
204
|
self.check_cancel()
|
|
77
205
|
current = DataFrame.from_agate(merge_tables(tables))
|
|
78
206
|
|
|
79
|
-
|
|
207
|
+
if len(base.columns) == 0 and len(current.columns) != 0:
|
|
208
|
+
base.columns = current.columns
|
|
209
|
+
elif len(base.columns) != 0 and len(current.columns) == 0:
|
|
210
|
+
current.columns = base.columns
|
|
80
211
|
|
|
81
|
-
|
|
82
|
-
for macro_name, macro in dbt_adapter.manifest.macros.items():
|
|
83
|
-
if macro.package_name == 'dbt_profiler':
|
|
84
|
-
break
|
|
85
|
-
else:
|
|
86
|
-
raise RecceException(
|
|
87
|
-
r"Package 'dbt_profiler' not found. Please refer to the link to install: https://hub.getdbt.com/data-mie/dbt_profiler/")
|
|
212
|
+
return ProfileDiffResult(base=base, current=current)
|
|
88
213
|
|
|
89
214
|
def _profile_column(self, dbt_adapter, relation, column):
|
|
90
|
-
sql_template = textwrap.dedent(r"""
|
|
91
|
-
select
|
|
92
|
-
'{{column_name}}' as column_name,
|
|
93
|
-
nullif('{{column_type}}', '') as data_type,
|
|
94
|
-
{{ dbt_profiler.measure_row_count(column_name, column_type) }} as row_count,
|
|
95
|
-
{{ dbt_profiler.measure_not_null_proportion(column_name, column_type) }} as not_null_proportion,
|
|
96
|
-
{{ dbt_profiler.measure_distinct_proportion(column_name, column_type) }} as distinct_proportion,
|
|
97
|
-
{{ dbt_profiler.measure_distinct_count(column_name, column_type) }} as distinct_count,
|
|
98
|
-
{{ dbt_profiler.measure_is_unique(column_name, column_type) }} as is_unique,
|
|
99
|
-
{{ dbt_profiler.measure_min(column_name, column_type) }} as min,
|
|
100
|
-
{{ dbt_profiler.measure_max(column_name, column_type) }} as max,
|
|
101
|
-
{{ dbt_profiler.measure_avg(column_name, column_type) }} as avg,
|
|
102
|
-
{{ dbt_profiler.measure_median(column_name, column_type) }} as median
|
|
103
|
-
from
|
|
104
|
-
{{ relation }}
|
|
105
|
-
""")
|
|
106
215
|
column_name = column.name
|
|
107
216
|
column_type = column.data_type.lower()
|
|
108
|
-
db_type = dbt_adapter.adapter.type()
|
|
109
|
-
if db_type == 'bigquery' and column_type.startswith('array'):
|
|
110
|
-
# DRC-663: Support bigquery array type
|
|
111
|
-
sql_template = textwrap.dedent(r"""
|
|
112
|
-
select
|
|
113
|
-
'{{column_name}}' as column_name,
|
|
114
|
-
nullif('{{column_type}}', '') as data_type,
|
|
115
|
-
{{ dbt_profiler.measure_row_count(column_name, column_type) }} as row_count,
|
|
116
|
-
{{ dbt_profiler.measure_not_null_proportion(column_name, column_type) }} as not_null_proportion,
|
|
117
|
-
cast(null as {{ dbt.type_numeric() }}) as distinct_proportion,
|
|
118
|
-
cast(null as {{ dbt.type_numeric() }}) as distinct_count,
|
|
119
|
-
null as is_unique,
|
|
120
|
-
cast(min(ARRAY_LENGTH({{ adapter.quote(column_name) }})) as {{ dbt_profiler.type_string() }}) as min,
|
|
121
|
-
cast(max(ARRAY_LENGTH({{ adapter.quote(column_name) }})) as {{ dbt_profiler.type_string() }}) as max,
|
|
122
|
-
avg(ARRAY_LENGTH({{ adapter.quote(column_name) }})) as avg,
|
|
123
|
-
APPROX_QUANTILES(ARRAY_LENGTH({{ adapter.quote(column_name) }}), 100)[OFFSET(50)] as median,
|
|
124
|
-
from
|
|
125
|
-
{{ relation }}
|
|
126
|
-
""")
|
|
127
|
-
elif db_type == 'redshift':
|
|
128
|
-
# DRC-1149: Support redshift median calculation
|
|
129
|
-
# https://github.com/data-mie/dbt-profiler/pull/89
|
|
130
|
-
#
|
|
131
|
-
# Since dbt-profiler 0.8.2, there is the third parameter for measure_median
|
|
132
|
-
# For sake of compatibility, we use the new way to call the macro only for redshift
|
|
133
|
-
sql_template = textwrap.dedent(r"""
|
|
134
|
-
with source_data as (
|
|
135
|
-
select
|
|
136
|
-
*
|
|
137
|
-
from {{ relation }}
|
|
138
|
-
)
|
|
139
|
-
select
|
|
140
|
-
'{{column_name}}' as column_name,
|
|
141
|
-
nullif('{{column_type}}', '') as data_type,
|
|
142
|
-
{{ dbt_profiler.measure_row_count(column_name, column_type) }} as row_count,
|
|
143
|
-
{{ dbt_profiler.measure_not_null_proportion(column_name, column_type) }} as not_null_proportion,
|
|
144
|
-
{{ dbt_profiler.measure_distinct_proportion(column_name, column_type) }} as distinct_proportion,
|
|
145
|
-
{{ dbt_profiler.measure_distinct_count(column_name, column_type) }} as distinct_count,
|
|
146
|
-
{{ dbt_profiler.measure_is_unique(column_name, column_type) }} as is_unique,
|
|
147
|
-
{{ dbt_profiler.measure_min(column_name, column_type) }} as min,
|
|
148
|
-
{{ dbt_profiler.measure_max(column_name, column_type) }} as max,
|
|
149
|
-
{{ dbt_profiler.measure_avg(column_name, column_type) }} as avg,
|
|
150
|
-
({{ dbt_profiler.measure_median(column_name, column_type, 'source_data') }}) as median
|
|
151
|
-
from
|
|
152
|
-
source_data
|
|
153
|
-
""")
|
|
217
|
+
db_type = dbt_adapter.adapter.type().lower()
|
|
154
218
|
|
|
155
219
|
try:
|
|
156
220
|
sql = dbt_adapter.generate_sql(
|
|
157
|
-
|
|
221
|
+
PROFILE_COLUMN_JINJA_TEMPLATE,
|
|
158
222
|
base=False, # always false because we use the macro in current manifest
|
|
159
|
-
context=dict(relation=relation, column_name=column_name, column_type=column_type)
|
|
223
|
+
context=dict(relation=relation, column_name=column_name, column_type=column_type, db_type=db_type),
|
|
160
224
|
)
|
|
161
225
|
except Exception as e:
|
|
162
226
|
raise RecceException(f"Failed to generate SQL for profiling column: {column_name}") from e
|
|
@@ -165,14 +229,15 @@ class ProfileDiffTask(Task):
|
|
|
165
229
|
return dbt_adapter.execute(sql, fetch=True)
|
|
166
230
|
except Exception as e:
|
|
167
231
|
from recce.adapter.dbt_adapter import dbt_version
|
|
168
|
-
|
|
232
|
+
|
|
233
|
+
if dbt_version < "v1.8":
|
|
169
234
|
from dbt.exceptions import DbtDatabaseError
|
|
170
235
|
else:
|
|
171
236
|
from dbt_common.exceptions import DbtDatabaseError
|
|
172
237
|
if isinstance(e, DbtDatabaseError):
|
|
173
|
-
if str(e).find(
|
|
238
|
+
if str(e).find("100051") >= 0:
|
|
174
239
|
# Snowflake error '100051 (22012): Division by zero"'
|
|
175
|
-
e = RecceException(
|
|
240
|
+
e = RecceException("No profile diff result due to the model is empty.", False)
|
|
176
241
|
raise e
|
|
177
242
|
|
|
178
243
|
def cancel(self):
|
|
@@ -180,6 +245,7 @@ class ProfileDiffTask(Task):
|
|
|
180
245
|
|
|
181
246
|
if self.connection:
|
|
182
247
|
from recce.adapter.dbt_adapter import DbtAdapter
|
|
248
|
+
|
|
183
249
|
dbt_adapter: DbtAdapter = default_context().adapter
|
|
184
250
|
with dbt_adapter.connection_named("cancel"):
|
|
185
251
|
dbt_adapter.cancel(self.connection)
|
|
@@ -187,7 +253,7 @@ class ProfileDiffTask(Task):
|
|
|
187
253
|
|
|
188
254
|
class ProfileDiffResultDiffer(TaskResultDiffer):
|
|
189
255
|
def _check_result_changed_fn(self, result):
|
|
190
|
-
return self.diff(result[
|
|
256
|
+
return self.diff(result["base"], result["current"])
|
|
191
257
|
|
|
192
258
|
|
|
193
259
|
class ProfileCheckValidator(CheckValidator):
|
|
@@ -202,14 +268,14 @@ class ProfileCheckValidator(CheckValidator):
|
|
|
202
268
|
class ProfileTask(ProfileDiffTask):
|
|
203
269
|
def execute(self):
|
|
204
270
|
import agate
|
|
271
|
+
|
|
205
272
|
from recce.adapter.dbt_adapter import DbtAdapter, merge_tables
|
|
273
|
+
|
|
206
274
|
dbt_adapter: DbtAdapter = default_context().adapter
|
|
207
275
|
|
|
208
276
|
model: str = self.params.model
|
|
209
277
|
selected_columns: List[str] = self.params.columns
|
|
210
278
|
|
|
211
|
-
self._verify_dbt_profiler(dbt_adapter)
|
|
212
|
-
|
|
213
279
|
with dbt_adapter.connection_named("query"):
|
|
214
280
|
self.connection = dbt_adapter.get_thread_connection()
|
|
215
281
|
curr_columns = [column for column in dbt_adapter.get_columns(model, base=False)]
|
|
@@ -222,7 +288,7 @@ class ProfileTask(ProfileDiffTask):
|
|
|
222
288
|
|
|
223
289
|
tables: List[agate.Table] = []
|
|
224
290
|
for column in curr_columns:
|
|
225
|
-
self.update_progress(message=f
|
|
291
|
+
self.update_progress(message=f"[Current] Profile column: {column.column}", percentage=completed / total)
|
|
226
292
|
relation = dbt_adapter.create_relation(model, base=False)
|
|
227
293
|
response, table = self._profile_column(dbt_adapter, relation, column)
|
|
228
294
|
tables.append(table)
|