recce-nightly 1.2.0.20250506__py3-none-any.whl → 1.26.0.20251124__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of recce-nightly might be problematic. Click here for more details.

Files changed (213)
  1. recce/VERSION +1 -1
  2. recce/__init__.py +27 -22
  3. recce/adapter/base.py +11 -14
  4. recce/adapter/dbt_adapter/__init__.py +810 -480
  5. recce/adapter/dbt_adapter/dbt_version.py +3 -0
  6. recce/adapter/sqlmesh_adapter.py +24 -35
  7. recce/apis/check_api.py +39 -28
  8. recce/apis/check_func.py +33 -27
  9. recce/apis/run_api.py +25 -19
  10. recce/apis/run_func.py +29 -23
  11. recce/artifact.py +119 -51
  12. recce/cli.py +1299 -323
  13. recce/config.py +42 -33
  14. recce/connect_to_cloud.py +138 -0
  15. recce/core.py +55 -47
  16. recce/data/404.html +1 -1
  17. recce/data/__next.__PAGE__.txt +10 -0
  18. recce/data/__next._full.txt +23 -0
  19. recce/data/__next._head.txt +8 -0
  20. recce/data/__next._index.txt +8 -0
  21. recce/data/__next._tree.txt +5 -0
  22. recce/data/_next/static/52aV_JrNUZU6dMFgvTQEO/_buildManifest.js +11 -0
  23. recce/data/_next/static/52aV_JrNUZU6dMFgvTQEO/_clientMiddlewareManifest.json +1 -0
  24. recce/data/_next/static/chunks/02b996c7f6a29a06.js +4 -0
  25. recce/data/_next/static/chunks/19c10d219a6a21ff.js +1 -0
  26. recce/data/_next/static/chunks/2df9ec28a061971d.js +11 -0
  27. recce/data/_next/static/chunks/3098c987393bda15.js +1 -0
  28. recce/data/_next/static/chunks/393dc43e483f717a.css +2 -0
  29. recce/data/_next/static/chunks/399e8d91a7e45073.js +2 -0
  30. recce/data/_next/static/chunks/4d0186f631230245.js +1 -0
  31. recce/data/_next/static/chunks/5794ba9e10a9c060.js +11 -0
  32. recce/data/_next/static/chunks/715761c929a3f28b.js +110 -0
  33. recce/data/_next/static/chunks/71f88fcc615bf282.js +1 -0
  34. recce/data/_next/static/chunks/80d2a95eaf1201ea.js +1 -0
  35. recce/data/_next/static/chunks/9979c6109bbbee35.js +1 -0
  36. recce/data/_next/static/chunks/99d638224186c118.js +1 -0
  37. recce/data/_next/static/chunks/d003eb36240e92f3.js +1 -0
  38. recce/data/_next/static/chunks/d3167cdfec4fc351.js +1 -0
  39. recce/data/_next/static/chunks/e124bccf574a3361.css +1 -0
  40. recce/data/_next/static/chunks/f40141db1bdb46f0.css +6 -0
  41. recce/data/_next/static/chunks/fcc53a88741a52f9.js +1 -0
  42. recce/data/_next/static/chunks/turbopack-b1920d28cfb1f28d.js +3 -0
  43. recce/data/_next/static/media/favicon.a8d38d84.ico +0 -0
  44. recce/data/_next/static/media/montserrat-cyrillic-800-normal.d80d830d.woff2 +0 -0
  45. recce/data/_next/static/media/montserrat-cyrillic-800-normal.f9d58125.woff +0 -0
  46. recce/data/_next/static/media/montserrat-cyrillic-ext-800-normal.076c2a93.woff2 +0 -0
  47. recce/data/_next/static/media/montserrat-cyrillic-ext-800-normal.a4fa76b5.woff +0 -0
  48. recce/data/_next/static/media/montserrat-latin-800-normal.cde454cc.woff2 +0 -0
  49. recce/data/_next/static/media/montserrat-latin-800-normal.d5761935.woff +0 -0
  50. recce/data/_next/static/media/montserrat-latin-ext-800-normal.40ec0659.woff2 +0 -0
  51. recce/data/_next/static/media/montserrat-latin-ext-800-normal.b671449b.woff +0 -0
  52. recce/data/_next/static/media/montserrat-vietnamese-800-normal.9f7b8541.woff +0 -0
  53. recce/data/_next/static/media/montserrat-vietnamese-800-normal.f9eb854e.woff2 +0 -0
  54. recce/data/_next/static/media/reload-image.7aa931c7.svg +4 -0
  55. recce/data/_not-found/__next._full.txt +17 -0
  56. recce/data/_not-found/__next._head.txt +8 -0
  57. recce/data/_not-found/__next._index.txt +8 -0
  58. recce/data/_not-found/__next._not-found.__PAGE__.txt +5 -0
  59. recce/data/_not-found/__next._not-found.txt +4 -0
  60. recce/data/_not-found/__next._tree.txt +3 -0
  61. recce/data/_not-found.html +1 -0
  62. recce/data/_not-found.txt +17 -0
  63. recce/data/auth_callback.html +68 -0
  64. recce/data/imgs/reload-image.svg +4 -0
  65. recce/data/index.html +1 -27
  66. recce/data/index.txt +23 -7
  67. recce/diff.py +6 -12
  68. recce/event/__init__.py +86 -74
  69. recce/event/collector.py +33 -22
  70. recce/event/track.py +49 -27
  71. recce/exceptions.py +1 -1
  72. recce/git.py +7 -7
  73. recce/github.py +57 -53
  74. recce/mcp_server.py +716 -0
  75. recce/models/__init__.py +4 -1
  76. recce/models/check.py +6 -7
  77. recce/models/run.py +1 -0
  78. recce/models/types.py +131 -28
  79. recce/pull_request.py +27 -25
  80. recce/run.py +165 -121
  81. recce/server.py +303 -111
  82. recce/state/__init__.py +31 -0
  83. recce/state/cloud.py +632 -0
  84. recce/state/const.py +26 -0
  85. recce/state/local.py +56 -0
  86. recce/state/state.py +119 -0
  87. recce/state/state_loader.py +174 -0
  88. recce/summary.py +188 -143
  89. recce/tasks/__init__.py +19 -3
  90. recce/tasks/core.py +11 -13
  91. recce/tasks/dataframe.py +82 -18
  92. recce/tasks/histogram.py +69 -34
  93. recce/tasks/lineage.py +2 -2
  94. recce/tasks/profile.py +152 -86
  95. recce/tasks/query.py +139 -87
  96. recce/tasks/rowcount.py +37 -31
  97. recce/tasks/schema.py +18 -15
  98. recce/tasks/top_k.py +35 -35
  99. recce/tasks/valuediff.py +216 -152
  100. recce/util/__init__.py +3 -0
  101. recce/util/api_token.py +80 -0
  102. recce/util/breaking.py +87 -85
  103. recce/util/cll.py +274 -219
  104. recce/util/io.py +22 -17
  105. recce/util/lineage.py +65 -16
  106. recce/util/logger.py +1 -1
  107. recce/util/onboarding_state.py +45 -0
  108. recce/util/perf_tracking.py +85 -0
  109. recce/util/recce_cloud.py +322 -72
  110. recce/util/singleton.py +4 -4
  111. recce/yaml/__init__.py +7 -10
  112. recce_cloud/__init__.py +24 -0
  113. recce_cloud/api/__init__.py +17 -0
  114. recce_cloud/api/base.py +111 -0
  115. recce_cloud/api/client.py +150 -0
  116. recce_cloud/api/exceptions.py +26 -0
  117. recce_cloud/api/factory.py +63 -0
  118. recce_cloud/api/github.py +76 -0
  119. recce_cloud/api/gitlab.py +82 -0
  120. recce_cloud/artifact.py +57 -0
  121. recce_cloud/ci_providers/__init__.py +9 -0
  122. recce_cloud/ci_providers/base.py +82 -0
  123. recce_cloud/ci_providers/detector.py +147 -0
  124. recce_cloud/ci_providers/github_actions.py +136 -0
  125. recce_cloud/ci_providers/gitlab_ci.py +130 -0
  126. recce_cloud/cli.py +245 -0
  127. recce_cloud/upload.py +214 -0
  128. {recce_nightly-1.2.0.20250506.dist-info → recce_nightly-1.26.0.20251124.dist-info}/METADATA +68 -37
  129. recce_nightly-1.26.0.20251124.dist-info/RECORD +180 -0
  130. {recce_nightly-1.2.0.20250506.dist-info → recce_nightly-1.26.0.20251124.dist-info}/WHEEL +1 -1
  131. {recce_nightly-1.2.0.20250506.dist-info → recce_nightly-1.26.0.20251124.dist-info}/top_level.txt +1 -0
  132. tests/adapter/dbt_adapter/conftest.py +9 -5
  133. tests/adapter/dbt_adapter/dbt_test_helper.py +37 -22
  134. tests/adapter/dbt_adapter/test_dbt_adapter.py +0 -15
  135. tests/adapter/dbt_adapter/test_dbt_cll.py +656 -41
  136. tests/adapter/dbt_adapter/test_selector.py +22 -21
  137. tests/recce_cloud/__init__.py +0 -0
  138. tests/recce_cloud/test_ci_providers.py +351 -0
  139. tests/recce_cloud/test_cli.py +372 -0
  140. tests/recce_cloud/test_client.py +273 -0
  141. tests/recce_cloud/test_platform_clients.py +333 -0
  142. tests/tasks/conftest.py +1 -1
  143. tests/tasks/test_histogram.py +58 -66
  144. tests/tasks/test_lineage.py +36 -23
  145. tests/tasks/test_preset_checks.py +45 -31
  146. tests/tasks/test_profile.py +339 -15
  147. tests/tasks/test_query.py +46 -46
  148. tests/tasks/test_row_count.py +65 -46
  149. tests/tasks/test_schema.py +65 -42
  150. tests/tasks/test_top_k.py +22 -18
  151. tests/tasks/test_valuediff.py +43 -32
  152. tests/test_cli.py +174 -60
  153. tests/test_cli_mcp_optional.py +45 -0
  154. tests/test_cloud_listing_cli.py +324 -0
  155. tests/test_config.py +7 -9
  156. tests/test_connect_to_cloud.py +82 -0
  157. tests/test_core.py +151 -4
  158. tests/test_dbt.py +7 -7
  159. tests/test_mcp_server.py +332 -0
  160. tests/test_pull_request.py +1 -1
  161. tests/test_server.py +25 -19
  162. tests/test_summary.py +29 -17
  163. recce/data/_next/static/Kcbs3GEIyH2LxgLYat0es/_buildManifest.js +0 -1
  164. recce/data/_next/static/chunks/1f229bf6-d9fe92e56db8d93b.js +0 -1
  165. recce/data/_next/static/chunks/29e3cc0d-8c150e37dff9631b.js +0 -1
  166. recce/data/_next/static/chunks/368-7587b306577df275.js +0 -65
  167. recce/data/_next/static/chunks/36e1c10d-bb0210cbd6573a8d.js +0 -1
  168. recce/data/_next/static/chunks/3998a672-eaad84bdd88cc73e.js +0 -1
  169. recce/data/_next/static/chunks/3a92ee20-3b5d922d4157af5e.js +0 -1
  170. recce/data/_next/static/chunks/450c323b-1bb5db526e54435a.js +0 -1
  171. recce/data/_next/static/chunks/47d8844f-79a1b53c66a7d7ec.js +0 -1
  172. recce/data/_next/static/chunks/6dc81886-c94b9b91bc2c3caf.js +0 -1
  173. recce/data/_next/static/chunks/6ef81909-694dc38134099299.js +0 -1
  174. recce/data/_next/static/chunks/700-3b65fc3666820d00.js +0 -2
  175. recce/data/_next/static/chunks/7a8a3e83-d7fa409d97b38b2b.js +0 -1
  176. recce/data/_next/static/chunks/7f27ae6c-413f6b869a04183a.js +0 -1
  177. recce/data/_next/static/chunks/8d700b6a-f0b1f6b9e0d97ce2.js +0 -1
  178. recce/data/_next/static/chunks/9746af58-d74bef4d03eea6ab.js +0 -1
  179. recce/data/_next/static/chunks/a30376cd-7d806e1602f2dc3a.js +0 -1
  180. recce/data/_next/static/chunks/app/_not-found/page-8a886fa0855c3105.js +0 -1
  181. recce/data/_next/static/chunks/app/layout-9102e22cb73f74d6.js +0 -1
  182. recce/data/_next/static/chunks/app/page-cee661090afbd6aa.js +0 -1
  183. recce/data/_next/static/chunks/b63b1b3f-7395c74e11a14e95.js +0 -1
  184. recce/data/_next/static/chunks/c132bf7d-8102037f9ccf372a.js +0 -1
  185. recce/data/_next/static/chunks/c1ceaa8b-a1e442154d23515e.js +0 -1
  186. recce/data/_next/static/chunks/cd9f8d63-cf0d5a7b0f7a92e8.js +0 -54
  187. recce/data/_next/static/chunks/ce84277d-f42c2c58049cea2d.js +0 -1
  188. recce/data/_next/static/chunks/e24bf851-0f8cbc99656833e7.js +0 -1
  189. recce/data/_next/static/chunks/fee69bc6-f17d36c080742e74.js +0 -1
  190. recce/data/_next/static/chunks/framework-ded83d71b51ce901.js +0 -1
  191. recce/data/_next/static/chunks/main-a0859f1f36d0aa6c.js +0 -1
  192. recce/data/_next/static/chunks/main-app-0225a2255968e566.js +0 -1
  193. recce/data/_next/static/chunks/pages/_app-d5672bf3d8b6371b.js +0 -1
  194. recce/data/_next/static/chunks/pages/_error-ed75be3f25588548.js +0 -1
  195. recce/data/_next/static/chunks/webpack-567d72f0bc0820d5.js +0 -1
  196. recce/data/_next/static/css/c9ecb46a4b21c126.css +0 -14
  197. recce/data/_next/static/media/montserrat-cyrillic-800-normal.22628180.woff2 +0 -0
  198. recce/data/_next/static/media/montserrat-cyrillic-800-normal.31d693bb.woff +0 -0
  199. recce/data/_next/static/media/montserrat-cyrillic-ext-800-normal.7e2c1e62.woff +0 -0
  200. recce/data/_next/static/media/montserrat-cyrillic-ext-800-normal.94a63aea.woff2 +0 -0
  201. recce/data/_next/static/media/montserrat-latin-800-normal.6f8fa298.woff2 +0 -0
  202. recce/data/_next/static/media/montserrat-latin-800-normal.97e20d5e.woff +0 -0
  203. recce/data/_next/static/media/montserrat-latin-ext-800-normal.013b84f9.woff2 +0 -0
  204. recce/data/_next/static/media/montserrat-latin-ext-800-normal.aff52ab0.woff +0 -0
  205. recce/data/_next/static/media/montserrat-vietnamese-800-normal.5f21869b.woff +0 -0
  206. recce/data/_next/static/media/montserrat-vietnamese-800-normal.c0035377.woff2 +0 -0
  207. recce/state.py +0 -753
  208. recce_nightly-1.2.0.20250506.dist-info/RECORD +0 -142
  209. tests/test_state.py +0 -123
  210. /recce/data/_next/static/{Kcbs3GEIyH2LxgLYat0es → 52aV_JrNUZU6dMFgvTQEO}/_ssgManifest.js +0 -0
  211. /recce/data/_next/static/chunks/{polyfills-42372ed130431b0a.js → a6dad97d9634a72d.js} +0 -0
  212. {recce_nightly-1.2.0.20250506.dist-info → recce_nightly-1.26.0.20251124.dist-info}/entry_points.txt +0 -0
  213. {recce_nightly-1.2.0.20250506.dist-info → recce_nightly-1.26.0.20251124.dist-info}/licenses/LICENSE +0 -0
recce/util/cll.py CHANGED
@@ -1,19 +1,24 @@
1
1
  import time
2
2
  from dataclasses import dataclass
3
- from typing import Dict, List, Literal
3
+ from typing import Dict, List, Optional, Tuple
4
4
 
5
- from sqlglot import parse_one, Dialect
6
- from sqlglot.errors import SqlglotError, OptimizeError
7
- from sqlglot.expressions import Column, Alias, Func, Binary, Paren, Case, Expression, If, Union, Intersect
8
- from sqlglot.optimizer import traverse_scope, Scope
5
+ import sqlglot.expressions as exp
6
+ from sqlglot import Dialect, parse_one
7
+ from sqlglot.errors import OptimizeError, SqlglotError
8
+ from sqlglot.optimizer import Scope, traverse_scope
9
9
  from sqlglot.optimizer.qualify import qualify
10
10
 
11
11
  from recce.exceptions import RecceException
12
- from recce.util import SingletonMeta
12
+ from recce.models.types import CllColumn, CllColumnDep
13
+
14
+ CllResult = Tuple[
15
+ List[CllColumnDep], # Model to column dependencies
16
+ Dict[str, CllColumn], # Column to column dependencies
17
+ ]
13
18
 
14
19
 
15
20
  @dataclass
16
- class CLLPerformanceTracking(metaclass=SingletonMeta):
21
+ class CLLPerformanceTracking:
17
22
  lineage_start = None
18
23
  lineage_elapsed = None
19
24
  column_lineage_start = None
@@ -50,11 +55,11 @@ class CLLPerformanceTracking(metaclass=SingletonMeta):
50
55
 
51
56
  def to_dict(self):
52
57
  return {
53
- 'lineage_elapsed_ms': self.lineage_elapsed,
54
- 'column_lineage_elapsed_ms': self.column_lineage_elapsed,
55
- 'total_nodes': self.total_nodes,
56
- 'sqlglot_error_nodes': self.sqlglot_error_nodes,
57
- 'other_error_nodes': self.other_error_nodes
58
+ "lineage_elapsed_ms": self.lineage_elapsed,
59
+ "column_lineage_elapsed_ms": self.column_lineage_elapsed,
60
+ "total_nodes": self.total_nodes,
61
+ "sqlglot_error_nodes": self.sqlglot_error_nodes,
62
+ "other_error_nodes": self.other_error_nodes,
58
63
  }
59
64
 
60
65
  def reset(self):
@@ -68,233 +73,283 @@ class CLLPerformanceTracking(metaclass=SingletonMeta):
68
73
  self.other_error_nodes = 0
69
74
 
70
75
 
71
- @dataclass
72
- class ColumnLevelDependsOn:
73
- node: str
74
- column: str
75
-
76
-
77
- @dataclass
78
- class ColumnLevelDependencyColumn:
79
- type: Literal['source', 'passthrough', 'renamed', 'derived']
80
- depends_on: List[ColumnLevelDependsOn]
81
-
82
-
83
- def _cll_expression(expression, table_alias_map) -> ColumnLevelDependencyColumn:
76
+ def _cll_column(proj, table_alias_map) -> CllColumn:
84
77
  # given an expression, return the columns depends on
85
78
  # [{node: table, column: column}, ...]
79
+ type = "source"
80
+ depends_on: List[CllColumnDep] = []
81
+
82
+ # instance of Column
83
+ if isinstance(proj, exp.Alias):
84
+ # 'select a as b'
85
+ # 'select CURRENT_TIMESTAMP() as create_at'
86
+ root = proj.this
87
+
88
+ for expression in root.walk(bfs=False):
89
+ if isinstance(expression, exp.Column):
90
+ column = expression
91
+ alias = column.table
92
+
93
+ if alias is None:
94
+ table = next(iter(table_alias_map.values()))
95
+ else:
96
+ table = table_alias_map.get(alias, alias)
97
+ depends_on.append(CllColumnDep(table, column.name))
98
+ if type == "source":
99
+ type = "passthrough"
100
+ elif isinstance(expression, (exp.Paren, exp.Identifier)):
101
+ pass
102
+ else:
103
+ type = "derived"
104
+
105
+ depends_on = _dedeup_depends_on(depends_on)
106
+
107
+ if len(depends_on) == 0:
108
+ type = "source"
109
+
110
+ if isinstance(proj, exp.Alias):
111
+ alias = proj
112
+ if type == "passthrough" and depends_on[0].column != alias.alias_or_name:
113
+ type = "renamed"
114
+
115
+ return CllColumn(type=type, depends_on=depends_on)
116
+
117
+
118
+ def _dedeup_depends_on(depends_on: List[CllColumnDep]) -> List[CllColumnDep]:
119
+ # deduplicate the depends_on list
120
+ dedup_set = set()
121
+ dedup_list = []
122
+ for col_dep in depends_on:
123
+ node_col = col_dep.node + "." + col_dep.column
124
+ if node_col not in dedup_set:
125
+ dedup_list.append(col_dep)
126
+ dedup_set.add(node_col)
127
+ return dedup_list
128
+
129
+
130
+ def _cll_set_scope(scope: Scope, scope_cll_map: dict[Scope, CllResult]) -> CllResult:
131
+ # model-to-column
132
+ m2c: List[CllColumnDep] = []
133
+ # column-to-column
134
+ c2c_map: Dict[str, CllColumn] = {}
135
+
136
+ for union_scope in scope.union_scopes:
137
+ sub_scope_result = scope_cll_map.get(union_scope)
138
+ if sub_scope_result is None:
139
+ raise RecceException(f"Scope {union_scope} not found in scope_cll_map")
140
+ sub_m2c, sub_c2c_map = sub_scope_result
141
+
142
+ for k, v in sub_c2c_map.items():
143
+ if k not in c2c_map:
144
+ c2c_map[k] = v
145
+ else:
146
+ c2c_map[k].depends_on.extend(v.depends_on)
147
+ c2c_map[k].transformation_type = "derived"
148
+
149
+ m2c.extend(sub_m2c)
150
+ return m2c, c2c_map
151
+
152
+
153
+ def _cll_select_scope(scope: Scope, scope_cll_map: dict[Scope, CllResult]) -> CllResult:
154
+ assert scope.expression.key == "select"
155
+
156
+ # model-to-column
157
+ m2c: List[CllColumnDep] = []
158
+ # column-to-column
159
+ c2c_map: Dict[str, CllColumn] = {}
160
+
161
+ table_alias_map = {t.alias_or_name: t.name for t in scope.tables}
162
+ select = scope.expression
163
+
164
+ def source_column_dependency(ref_column: exp.Column) -> Optional[CllColumn]:
165
+ column_name = ref_column.name
166
+ table_name = ref_column.table if ref_column.table != "" else next(iter(table_alias_map.values()))
167
+ source = scope.sources.get(table_name, None) # transformation_type: exp.Table | Scope
168
+ if isinstance(source, Scope):
169
+ ref_cll_result = scope_cll_map.get(source)
170
+ if ref_cll_result is None:
171
+ return None
172
+ _, sub_c2c_map = ref_cll_result
173
+ return sub_c2c_map.get(column_name)
174
+ elif isinstance(source, exp.Table):
175
+ return CllColumn(
176
+ name=column_name,
177
+ transformation_type="passthrough",
178
+ depends_on=[CllColumnDep(node=source.name, column=column_name)],
179
+ )
180
+ else:
181
+ return None
182
+
183
+ def subquery_cll(subquery: exp.Subquery) -> Optional[CllResult]:
184
+ select = subquery.find(exp.Select)
185
+ if select is None:
186
+ return None
187
+
188
+ matched_scope = None
189
+ for sub_scope in scope.subquery_scopes:
190
+ if sub_scope.expression == select:
191
+ matched_scope = sub_scope
192
+ break
193
+ if matched_scope is None:
194
+ return None
195
+
196
+ return scope_cll_map.get(matched_scope)
197
+
198
+ for proj in scope.expression.selects:
199
+ transformation_type = "source"
200
+ column_depends_on: List[CllColumnDep] = []
201
+ root = proj.this if isinstance(proj, exp.Alias) else proj
202
+ for expression in root.walk(bfs=False):
203
+ if isinstance(expression, exp.Column):
204
+ ref_column_dependency = source_column_dependency(expression)
205
+ if ref_column_dependency is not None:
206
+ column_depends_on.extend(ref_column_dependency.depends_on)
207
+ if ref_column_dependency.transformation_type == "derived":
208
+ transformation_type = "derived"
209
+ elif ref_column_dependency.transformation_type == "renamed":
210
+ if transformation_type == "source" or transformation_type == "passthrough":
211
+ transformation_type = "renamed"
212
+ elif ref_column_dependency.transformation_type == "passthrough":
213
+ if transformation_type == "source":
214
+ transformation_type = "passthrough"
215
+ else:
216
+ column_depends_on.append(CllColumnDep(expression.table, expression.name))
217
+ if transformation_type == "source":
218
+ transformation_type = "passthrough"
86
219
 
87
- if isinstance(expression, Column):
88
- column = expression
89
- alias = column.table
220
+ elif isinstance(expression, (exp.Paren, exp.Identifier)):
221
+ pass
222
+ else:
223
+ transformation_type = "derived"
90
224
 
91
- if alias is None:
92
- table = next(iter(table_alias_map.values()))
93
- else:
94
- table = table_alias_map.get(alias, alias)
225
+ column_depends_on = _dedeup_depends_on(column_depends_on)
226
+
227
+ if len(column_depends_on) == 0 and transformation_type != "source":
228
+ transformation_type = "source"
229
+
230
+ if isinstance(proj, exp.Alias):
231
+ alias = proj
232
+ if transformation_type == "passthrough" and column_depends_on[0].column != alias.alias_or_name:
233
+ transformation_type = "renamed"
95
234
 
96
- return ColumnLevelDependencyColumn(
97
- type='passthrough',
98
- depends_on=[ColumnLevelDependsOn(table, column.name)]
235
+ c2c_map[proj.alias_or_name] = CllColumn(
236
+ name=proj.alias_or_name, transformation_type=transformation_type, depends_on=column_depends_on
99
237
  )
100
- elif isinstance(expression, Paren):
101
- return _cll_expression(expression.this, table_alias_map)
102
- elif isinstance(expression, Binary):
103
- depends_on = []
104
- if expression.left:
105
- depends_on_left = _cll_expression(expression.left, table_alias_map).depends_on
106
- depends_on.extend(depends_on_left)
107
- if expression.right:
108
- depends_on_right = _cll_expression(expression.right, table_alias_map).depends_on
109
- depends_on.extend(depends_on_right)
110
- type = 'derived' if depends_on else 'source'
111
- return ColumnLevelDependencyColumn(type=type, depends_on=depends_on)
112
- elif isinstance(expression, Case):
113
- ifs = expression.args['ifs']
114
- default = expression.args['default']
115
- depends_on = []
116
- for expr in ifs:
117
- depends_on_one = _cll_expression(expr, table_alias_map).depends_on
118
- depends_on.extend(depends_on_one)
119
- if default is not None:
120
- depends_on.extend(_cll_expression(default, table_alias_map).depends_on)
121
- type = 'derived' if depends_on else 'source'
122
- return ColumnLevelDependencyColumn(type=type, depends_on=depends_on)
123
- elif isinstance(expression, If):
124
- depends_on = []
125
- if expression.this:
126
- depends_on_one = _cll_expression(expression.this, table_alias_map).depends_on
127
- depends_on.extend(depends_on_one)
128
- if expression.args.get('true'):
129
- depends_on_one = _cll_expression(expression.args.get('true'), table_alias_map).depends_on
130
- depends_on.extend(depends_on_one)
131
- if expression.args.get('false'):
132
- depends_on_one = _cll_expression(expression.args.get('false'), table_alias_map).depends_on
133
- depends_on.extend(depends_on_one)
134
- type = 'derived' if depends_on else 'source'
135
- return ColumnLevelDependencyColumn(type=type, depends_on=depends_on)
136
- elif isinstance(expression, Func):
137
- if expression.expressions:
138
- depends_on = []
139
- for expr in expression.expressions:
140
- depends_on_one = _cll_expression(expr, table_alias_map).depends_on
141
- depends_on.extend(depends_on_one)
142
- type = 'derived' if depends_on else 'source'
143
- return ColumnLevelDependencyColumn(type=type, depends_on=depends_on)
144
- if expression.this:
145
- depends_on = _cll_expression(expression.this, table_alias_map).depends_on
146
- type = 'derived' if depends_on else 'source'
147
- return ColumnLevelDependencyColumn(type=type, depends_on=depends_on)
148
-
149
- return ColumnLevelDependencyColumn(type='source', depends_on=[])
150
- elif expression.this and isinstance(expression.this, Expression):
151
- depends_on = _cll_expression(expression.this, table_alias_map).depends_on
152
- type = 'derived' if depends_on else 'source'
153
- return ColumnLevelDependencyColumn(type=type, depends_on=depends_on)
154
- elif expression.expressions:
155
- depends_on = []
156
- for expr in expression.expressions:
157
- depends_on_one = _cll_expression(expr, table_alias_map).depends_on
158
- depends_on.extend(depends_on_one)
159
- type = 'derived' if depends_on else 'source'
160
- return ColumnLevelDependencyColumn(type=type, depends_on=depends_on)
161
- else:
162
- depends_on = []
163
- return ColumnLevelDependencyColumn(type='source', depends_on=depends_on)
164
-
165
-
166
- def cll(sql, schema=None, dialect=None) -> Dict[str, ColumnLevelDependencyColumn]:
167
- # given a sql, return the columns depends on
238
+
239
+ def selected_column_dependency(ref_column: exp.Column) -> Optional[CllColumn]:
240
+ column_name = ref_column.name
241
+ return c2c_map.get(column_name)
242
+
243
+ # joins clause: Reference the source columns
244
+ if select.args.get("joins"):
245
+ joins = select.args.get("joins")
246
+ for join in joins:
247
+ if isinstance(join, exp.Join):
248
+ for ref_column in join.find_all(exp.Column):
249
+ if source_column_dependency(ref_column) is not None:
250
+ m2c.extend(source_column_dependency(ref_column).depends_on)
251
+
252
+ # where clauses: Reference the source columns
253
+ if select.args.get("where"):
254
+ where = select.args.get("where")
255
+ if isinstance(where, exp.Where):
256
+ for ref_column in where.find_all(exp.Column):
257
+ if source_column_dependency(ref_column) is not None:
258
+ m2c.extend(source_column_dependency(ref_column).depends_on)
259
+ for subquery in where.find_all(exp.Subquery):
260
+ sub_cll = subquery_cll(subquery)
261
+ if sub_cll is not None:
262
+ sub_m2c, sub_c2c_map = sub_cll
263
+ m2c.extend(sub_m2c)
264
+ for sub_c in sub_c2c_map.values():
265
+ m2c.extend(sub_c.depends_on)
266
+
267
+ # group by clause: Reference the source columns, column index
268
+ if select.args.get("group"):
269
+ group = select.args.get("group")
270
+ if isinstance(group, exp.Group):
271
+ for ref_column in group.find_all(exp.Column):
272
+ if source_column_dependency(ref_column) is not None:
273
+ m2c.extend(source_column_dependency(ref_column).depends_on)
274
+
275
+ # having clause: Reference the source columns, selected columns
276
+ if select.args.get("having"):
277
+ having = select.args.get("having")
278
+ if isinstance(having, exp.Having):
279
+ for ref_column in having.find_all(exp.Column):
280
+ if source_column_dependency(ref_column) is not None:
281
+ m2c.extend(source_column_dependency(ref_column).depends_on)
282
+ elif selected_column_dependency(ref_column) is not None:
283
+ m2c.extend(selected_column_dependency(ref_column).depends_on)
284
+ for subquery in having.find_all(exp.Subquery):
285
+ sub_cll = subquery_cll(subquery)
286
+ if sub_cll is not None:
287
+ sub_m2c, sub_c2c_map = sub_cll
288
+ m2c.extend(sub_m2c)
289
+ for sub_c in sub_c2c_map.values():
290
+ m2c.extend(sub_c.depends_on)
291
+
292
+ # order by clause: Reference the source columns, selected columns, column index
293
+ if select.args.get("order"):
294
+ order = select.args.get("order")
295
+ if isinstance(order, exp.Order):
296
+ for ref_column in order.find_all(exp.Column):
297
+ if source_column_dependency(ref_column) is not None:
298
+ m2c.extend(source_column_dependency(ref_column).depends_on)
299
+ elif selected_column_dependency(ref_column) is not None:
300
+ m2c.extend(selected_column_dependency(ref_column).depends_on)
301
+
302
+ for source in scope.sources.values():
303
+ scope_cll_result = scope_cll_map.get(source)
304
+ if scope_cll_result is None:
305
+ continue
306
+ sub_m2c, _ = scope_cll_result
307
+ m2c.extend(sub_m2c)
308
+
309
+ m2c = _dedeup_depends_on(m2c)
310
+
311
+ return m2c, c2c_map
312
+
313
+
314
+ def cll(sql, schema=None, dialect=None) -> CllResult:
315
+ # given a sql, return the cll for the sql
168
316
  # {
169
- # 'column1': {
170
- # 'transformation_type': 'transform' or 'original',
171
- # 'depends_on': {
172
- # {node: model_id, column: column}, ...]
173
- # }
317
+ # 'depends_on': [{'node': 'model_id', 'column': 'column'}],
318
+ # 'columns': {
319
+ # 'column1': {
320
+ # 'type': 'derived',
321
+ # 'depends_on': [{'node': 'model_id', 'column': 'column'}],
322
+ # }
323
+ # }
324
+ # }
174
325
 
175
326
  dialect = Dialect.get(dialect) if dialect is not None else None
176
327
 
177
328
  try:
178
329
  expression = parse_one(sql, dialect=dialect)
179
330
  except SqlglotError as e:
180
- raise RecceException(f'Failed to parse SQL: {str(e)}')
331
+ raise RecceException(f"Failed to parse SQL: {str(e)}")
181
332
 
182
333
  try:
183
334
  expression = qualify(expression, schema=schema, dialect=dialect)
184
335
  except OptimizeError as e:
185
- raise RecceException(f'Failed to optimize SQL: {str(e)}')
336
+ raise RecceException(f"Failed to optimize SQL: {str(e)}")
186
337
  except SqlglotError as e:
187
- raise RecceException(f'Failed to qualify SQL: {str(e)}')
338
+ raise RecceException(f"Failed to qualify SQL: {str(e)}")
188
339
 
189
- result = {}
190
- global_lineage = {}
340
+ result = None
341
+ scope_cll_map = {}
191
342
  for scope in traverse_scope(expression):
192
- scope_lineage = {}
193
-
194
- table_alias_map = {
195
- t.alias_or_name: t.name
196
- for t in scope.tables
197
- }
198
-
199
- if isinstance(scope.expression, Union) or isinstance(scope.expression, Intersect):
200
- for union_scope in scope.union_scopes:
201
- for k, v in global_lineage[union_scope].items():
202
- if k not in scope_lineage:
203
- scope_lineage[k] = v
204
- else:
205
- scope_lineage[k].depends_on.extend(v.depends_on)
206
- scope_lineage[k].type = 'derived'
343
+ scope_type = scope.expression.key
344
+ if scope_type == "union" or scope_type == "intersect" or scope_type == "except":
345
+ result = _cll_set_scope(scope, scope_cll_map)
346
+ elif scope_type == "select":
347
+ result = _cll_select_scope(scope, scope_cll_map)
207
348
  else:
208
- for select in scope.expression.selects:
209
- # instance of Column
210
- if isinstance(select, Column):
211
- # 'select a'
212
- column = select
213
- column_cll = _cll_expression(column, table_alias_map)
214
- elif isinstance(select, Alias):
215
- # 'select a as b'
216
- # 'select CURRENT_TIMESTAMP() as create_at'
217
- alias = select
218
- col_expression = alias.this
219
- column_cll = _cll_expression(col_expression, table_alias_map)
220
- if (
221
- column_cll and
222
- column_cll.type == 'passthrough' and
223
- column_cll.depends_on[0].column != alias.alias_or_name
224
- ):
225
- column_cll.type = 'renamed'
226
- else:
227
- # 'select 1'
228
- column_cll = ColumnLevelDependencyColumn(type='source', depends_on=[])
229
-
230
- cte_type = None
231
- flatten_col_depends_on = []
232
- for col_dep in column_cll.depends_on:
233
- col_dep_node = col_dep.node
234
- col_dep_column = col_dep.column
235
- # cte
236
- cte_scope = scope.cte_sources.get(col_dep_node)
237
- # inline derived table
238
- source_scope = None
239
- if isinstance(scope.sources.get(col_dep_node), Scope):
240
- source_scope = scope.sources.get(col_dep_node)
241
-
242
- if cte_scope is not None:
243
- cte_cll = global_lineage[cte_scope]
244
- if cte_cll is None or cte_cll.get(col_dep_column) is None:
245
- # In dbt-duckdb, the external source is compiled as `read_csv('..') rather than a table.
246
- continue
247
- cte_type = cte_cll.get(col_dep_column).type
248
- flatten_col_depends_on.extend(cte_cll.get(col_dep_column).depends_on)
249
- elif source_scope is not None:
250
- source_cll = global_lineage[source_scope]
251
- if source_cll is None or source_cll.get(col_dep_column) is None:
252
- continue
253
- flatten_col_depends_on.extend(source_cll.get(col_dep_column).depends_on)
254
- else:
255
- flatten_col_depends_on.append(col_dep)
256
-
257
- # deduplicate
258
- dedup_col_depends_on = []
259
- dedup_set = set()
260
- for col_dep in flatten_col_depends_on:
261
- node_col = col_dep.node + '.' + col_dep.column
262
- if node_col not in dedup_set:
263
- dedup_col_depends_on.append(col_dep)
264
- dedup_set.add(node_col)
265
-
266
- # transformation type
267
- type = column_cll.type
268
- if type == 'derived':
269
- if len(dedup_col_depends_on) == 0:
270
- type = 'source'
271
- else:
272
- # keep current scope type
273
- pass
274
- elif cte_type is not None:
275
- if len(dedup_col_depends_on) > 1:
276
- type = 'derived'
277
- elif len(dedup_col_depends_on) == 0:
278
- type = 'source'
279
- else:
280
- if isinstance(select, Column):
281
- type = cte_type
282
- elif isinstance(select, Alias):
283
- alias = select
284
- if column_cll.depends_on[0].column == alias.alias_or_name:
285
- type = cte_type
286
- else:
287
- type = 'renamed' if cte_type == 'passthrough' else cte_type
288
- else:
289
- type = 'source'
290
-
291
- scope_lineage[select.alias_or_name] = ColumnLevelDependencyColumn(
292
- type=type,
293
- depends_on=dedup_col_depends_on
294
- )
295
-
296
- global_lineage[scope] = scope_lineage
297
- if not scope.is_cte:
298
- result = scope_lineage
349
+ continue
350
+
351
+ scope_cll_map[scope] = result
299
352
 
353
+ if result is None:
354
+ raise RecceException("Failed to extract CLL from SQL")
300
355
  return result
recce/util/io.py CHANGED
@@ -1,14 +1,14 @@
1
1
  import gzip
2
2
  import os
3
3
  import tempfile
4
- from abc import ABCMeta, abstractmethod, ABC
4
+ from abc import ABC, ABCMeta, abstractmethod
5
5
  from enum import Enum
6
6
 
7
7
 
8
8
  class SupportedFileTypes(Enum):
9
- FILE = 'file'
10
- GZIP = 'gzip'
11
- ZIP = 'zip'
9
+ FILE = "file"
10
+ GZIP = "gzip"
11
+ ZIP = "zip"
12
12
 
13
13
 
14
14
  def file_io_factory(file_type: SupportedFileTypes):
@@ -19,7 +19,7 @@ def file_io_factory(file_type: SupportedFileTypes):
19
19
  elif file_type == SupportedFileTypes.ZIP:
20
20
  return ZipFileIO
21
21
  else:
22
- raise ValueError(f'Unsupported file type: {file_type}')
22
+ raise ValueError(f"Unsupported file type: {file_type}")
23
23
 
24
24
 
25
25
  class AbstractFileIO(metaclass=ABCMeta):
@@ -37,24 +37,24 @@ class AbstractFileIO(metaclass=ABCMeta):
37
37
  class FileIO(AbstractFileIO, ABC):
38
38
  @staticmethod
39
39
  def write(path: str, data: str, **kwargs):
40
- with open(path, 'w') as f:
40
+ with open(path, "w", encoding="utf-8") as f:
41
41
  f.write(data)
42
42
 
43
43
  @staticmethod
44
44
  def read(path: str, **kwargs) -> str:
45
- with open(path, 'r') as f:
45
+ with open(path, "r", encoding="utf-8") as f:
46
46
  return f.read()
47
47
 
48
48
 
49
49
  class GzipFileIO(AbstractFileIO, ABC):
50
50
  @staticmethod
51
51
  def write(path: str, data: str, **kwargs):
52
- with gzip.open(path, 'wt') as f:
52
+ with gzip.open(path, "wt") as f:
53
53
  f.write(data)
54
54
 
55
55
  @staticmethod
56
56
  def read(path: str, **kwargs) -> str:
57
- with gzip.open(path, 'rt') as f:
57
+ with gzip.open(path, "rt") as f:
58
58
  return f.read()
59
59
 
60
60
  @staticmethod
@@ -68,18 +68,22 @@ class ZipFileIO(AbstractFileIO, ABC):
68
68
  def _is_pyminizip_installed():
69
69
  try:
70
70
  import pyminizip
71
+
72
+ # Use the module to avoid F401
73
+ return pyminizip is not None
71
74
  except ImportError:
72
- raise ImportError('pyminizip is not installed. Please install it using `pip install pyminizip`')
75
+ raise ImportError("pyminizip is not installed. Please install it using `pip install pyminizip`")
73
76
 
74
77
  @staticmethod
75
78
  def read(path: str, **kwargs) -> str:
76
79
  ZipFileIO._is_pyminizip_installed()
77
80
  import pyminizip
81
+
78
82
  cwd = os.getcwd()
79
- password = kwargs.get('password')
80
- zip_dir_name = kwargs.get('zip_dir_name')
83
+ password = kwargs.get("password")
84
+ zip_dir_name = kwargs.get("zip_dir_name")
81
85
  if zip_dir_name is None:
82
- raise ValueError('zip_dir_name is required for zipping')
86
+ raise ValueError("zip_dir_name is required for zipping")
83
87
 
84
88
  try:
85
89
  with tempfile.TemporaryDirectory() as tmp_dir:
@@ -88,7 +92,7 @@ class ZipFileIO(AbstractFileIO, ABC):
88
92
  content = FileIO.read(tmp_file)
89
93
  except Exception as e:
90
94
  error_msg = str(e)
91
- if '-3' in error_msg:
95
+ if "-3" in error_msg:
92
96
  raise Exception("Invalid password to uncompress state file.")
93
97
  raise Exception(f"Failed to uncompress state file: {error_msg}")
94
98
  finally:
@@ -100,11 +104,12 @@ class ZipFileIO(AbstractFileIO, ABC):
100
104
  def write(path: str, data: str, **kwargs):
101
105
  ZipFileIO._is_pyminizip_installed()
102
106
  import pyminizip
107
+
103
108
  cwd = os.getcwd()
104
- password = kwargs.get('password')
105
- zip_dir_name = kwargs.get('zip_dir_name')
109
+ password = kwargs.get("password")
110
+ zip_dir_name = kwargs.get("zip_dir_name")
106
111
  if zip_dir_name is None:
107
- raise ValueError('zip_dir_name is required for zipping')
112
+ raise ValueError("zip_dir_name is required for zipping")
108
113
 
109
114
  try:
110
115
  with tempfile.TemporaryDirectory() as tmp_dir: