recce-nightly 1.9.0.20250623__py3-none-any.whl → 1.25.0.20251112a2066__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169) hide show
  1. recce/VERSION +1 -1
  2. recce/__init__.py +5 -0
  3. recce/adapter/dbt_adapter/__init__.py +318 -240
  4. recce/artifact.py +76 -3
  5. recce/cli.py +703 -71
  6. recce/config.py +3 -3
  7. recce/connect_to_cloud.py +138 -0
  8. recce/core.py +3 -3
  9. recce/data/404.html +1 -22
  10. recce/data/__next.__PAGE__.txt +10 -0
  11. recce/data/__next._full.txt +23 -0
  12. recce/data/__next._index.txt +8 -0
  13. recce/data/__next._tree.txt +12 -0
  14. recce/data/_next/static/6LypcDXgyuSaiSCrsmUub/_buildManifest.js +11 -0
  15. recce/data/_next/static/6LypcDXgyuSaiSCrsmUub/_clientMiddlewareManifest.json +1 -0
  16. recce/data/_next/static/chunks/0a2b2dd4b57049c2.js +1 -0
  17. recce/data/_next/static/chunks/19c10d219a6a21ff.js +1 -0
  18. recce/data/_next/static/chunks/24fd885c7180a612.js +1 -0
  19. recce/data/_next/static/chunks/27e66b2eab4adc32.js +19 -0
  20. recce/data/_next/static/chunks/71f88fcc615bf282.js +1 -0
  21. recce/data/_next/static/chunks/917619ab62a32388.js +1 -0
  22. recce/data/_next/static/chunks/93ba5a62932b704f.js +4 -0
  23. recce/data/_next/static/chunks/a43a2a5e06d5a92b.js +1 -0
  24. recce/data/_next/static/chunks/a6c78b24bd8b84fc.js +1 -0
  25. recce/data/_next/static/chunks/b2610ba997ff8c4f.js +110 -0
  26. recce/data/_next/static/chunks/ba2d87265a68599d.css +2 -0
  27. recce/data/_next/static/chunks/c117fd1c1382dd83.js +11 -0
  28. recce/data/_next/static/chunks/c9425ca46eebdde9.js +1 -0
  29. recce/data/_next/static/chunks/cc8a9eadba012be0.css +6 -0
  30. recce/data/_next/static/chunks/e124bccf574a3361.css +1 -0
  31. recce/data/_next/static/chunks/e392ad92847c3e17.js +1 -0
  32. recce/data/_next/static/chunks/e4ce95efe88dae79.js +11 -0
  33. recce/data/_next/static/chunks/e69c777814fea6ed.js +2 -0
  34. recce/data/_next/static/chunks/turbopack-21cfd73037ff57ab.js +3 -0
  35. recce/data/_next/static/media/favicon.a8d38d84.ico +0 -0
  36. recce/data/_next/static/media/montserrat-cyrillic-800-normal.d80d830d.woff2 +0 -0
  37. recce/data/_next/static/media/{montserrat-cyrillic-800-normal.bd5c9f50.woff → montserrat-cyrillic-800-normal.f9d58125.woff} +0 -0
  38. recce/data/_next/static/media/montserrat-cyrillic-ext-800-normal.076c2a93.woff2 +0 -0
  39. recce/data/_next/static/media/montserrat-latin-800-normal.cde454cc.woff2 +0 -0
  40. recce/data/_next/static/media/{montserrat-latin-800-normal.fc315020.woff → montserrat-latin-800-normal.d5761935.woff} +0 -0
  41. recce/data/_next/static/media/montserrat-latin-ext-800-normal.40ec0659.woff2 +0 -0
  42. recce/data/_next/static/media/{montserrat-latin-ext-800-normal.2e5381b2.woff → montserrat-latin-ext-800-normal.b671449b.woff} +0 -0
  43. recce/data/_next/static/media/{montserrat-vietnamese-800-normal.20c545e6.woff → montserrat-vietnamese-800-normal.9f7b8541.woff} +0 -0
  44. recce/data/_next/static/media/montserrat-vietnamese-800-normal.f9eb854e.woff2 +0 -0
  45. recce/data/_not-found/__next._full.txt +17 -0
  46. recce/data/_not-found/__next._index.txt +8 -0
  47. recce/data/_not-found/__next._not-found.__PAGE__.txt +5 -0
  48. recce/data/_not-found/__next._not-found.txt +4 -0
  49. recce/data/_not-found/__next._tree.txt +10 -0
  50. recce/data/_not-found.html +1 -0
  51. recce/data/_not-found.txt +17 -0
  52. recce/data/auth_callback.html +68 -0
  53. recce/data/index.html +1 -27
  54. recce/data/index.txt +23 -8
  55. recce/event/__init__.py +9 -8
  56. recce/event/collector.py +6 -2
  57. recce/event/track.py +10 -0
  58. recce/github.py +1 -1
  59. recce/mcp_server.py +632 -0
  60. recce/models/types.py +23 -2
  61. recce/pull_request.py +1 -1
  62. recce/run.py +23 -16
  63. recce/server.py +194 -19
  64. recce/state/__init__.py +31 -0
  65. recce/state/cloud.py +632 -0
  66. recce/state/const.py +26 -0
  67. recce/state/local.py +56 -0
  68. recce/state/state.py +119 -0
  69. recce/state/state_loader.py +174 -0
  70. recce/summary.py +2 -1
  71. recce/tasks/dataframe.py +59 -2
  72. recce/tasks/rowcount.py +4 -1
  73. recce/tasks/schema.py +4 -1
  74. recce/tasks/valuediff.py +1 -1
  75. recce/util/api_token.py +11 -2
  76. recce/util/breaking.py +9 -0
  77. recce/util/cll.py +1 -2
  78. recce/util/io.py +2 -2
  79. recce/util/lineage.py +19 -18
  80. recce/util/perf_tracking.py +85 -0
  81. recce/util/recce_cloud.py +229 -5
  82. recce/yaml/__init__.py +2 -2
  83. recce_cloud/__init__.py +15 -0
  84. recce_cloud/api/__init__.py +17 -0
  85. recce_cloud/api/base.py +104 -0
  86. recce_cloud/api/client.py +150 -0
  87. recce_cloud/api/exceptions.py +26 -0
  88. recce_cloud/api/factory.py +63 -0
  89. recce_cloud/api/github.py +72 -0
  90. recce_cloud/api/gitlab.py +78 -0
  91. recce_cloud/artifact.py +57 -0
  92. recce_cloud/ci_providers/__init__.py +9 -0
  93. recce_cloud/ci_providers/base.py +82 -0
  94. recce_cloud/ci_providers/detector.py +147 -0
  95. recce_cloud/ci_providers/github_actions.py +136 -0
  96. recce_cloud/ci_providers/gitlab_ci.py +130 -0
  97. recce_cloud/cli.py +303 -0
  98. recce_cloud/upload.py +213 -0
  99. {recce_nightly-1.9.0.20250623.dist-info → recce_nightly-1.25.0.20251112a2066.dist-info}/METADATA +31 -27
  100. recce_nightly-1.25.0.20251112a2066.dist-info/RECORD +178 -0
  101. {recce_nightly-1.9.0.20250623.dist-info → recce_nightly-1.25.0.20251112a2066.dist-info}/top_level.txt +1 -0
  102. tests/adapter/dbt_adapter/test_dbt_cll.py +412 -79
  103. tests/recce_cloud/__init__.py +0 -0
  104. tests/recce_cloud/test_ci_providers.py +351 -0
  105. tests/recce_cloud/test_cli.py +372 -0
  106. tests/recce_cloud/test_client.py +273 -0
  107. tests/recce_cloud/test_platform_clients.py +279 -0
  108. tests/test_cli.py +106 -3
  109. tests/test_cli_mcp_optional.py +45 -0
  110. tests/test_cloud_listing_cli.py +324 -0
  111. tests/test_connect_to_cloud.py +82 -0
  112. tests/test_core.py +148 -3
  113. tests/test_mcp_server.py +332 -0
  114. tests/test_server.py +6 -6
  115. tests/test_summary.py +14 -6
  116. recce/data/_next/static/WrRUb3nV8BhAZG_R8kVma/_buildManifest.js +0 -1
  117. recce/data/_next/static/chunks/181-acc61ddada3bc0ca.js +0 -43
  118. recce/data/_next/static/chunks/1bff33f1-1ef85cf5e658a751.js +0 -1
  119. recce/data/_next/static/chunks/217-879a84d70f7a907c.js +0 -2
  120. recce/data/_next/static/chunks/29e3cc0d-60045b2e47aa3916.js +0 -1
  121. recce/data/_next/static/chunks/36e1c10d-8e7be4a6c1f6ab2d.js +0 -1
  122. recce/data/_next/static/chunks/3998a672-03adacad07b346ac.js +0 -1
  123. recce/data/_next/static/chunks/3a92ee20-1081c360214f9602.js +0 -1
  124. recce/data/_next/static/chunks/42-cd3c06533f5fd47c.js +0 -9
  125. recce/data/_next/static/chunks/450c323b-fd94e7ffaa4a5efa.js +0 -1
  126. recce/data/_next/static/chunks/47d8844f-929aed9b1c73a905.js +0 -1
  127. recce/data/_next/static/chunks/608-3b079b544e5d5f5e.js +0 -15
  128. recce/data/_next/static/chunks/6dc81886-adbfa45836061d79.js +0 -1
  129. recce/data/_next/static/chunks/7a8a3e83-edf6dc64b5d5f0a5.js +0 -1
  130. recce/data/_next/static/chunks/7f27ae6c-d5f0438edd5c2a5b.js +0 -1
  131. recce/data/_next/static/chunks/86730205-cfb14e3f051bab35.js +0 -1
  132. recce/data/_next/static/chunks/8d700b6a.8bb140898499c512.js +0 -1
  133. recce/data/_next/static/chunks/92-7ab55ae02606193c.js +0 -1
  134. recce/data/_next/static/chunks/9746af58-a42b7d169cacadf0.js +0 -1
  135. recce/data/_next/static/chunks/a30376cd-de84559016d7e133.js +0 -1
  136. recce/data/_next/static/chunks/app/_not-found/page-01ed58b7f971d311.js +0 -1
  137. recce/data/_next/static/chunks/app/layout-177a410a97e0d018.js +0 -1
  138. recce/data/_next/static/chunks/app/page-59241c42b7dd4fcf.js +0 -1
  139. recce/data/_next/static/chunks/b63b1b3f-4282bdcf459e075c.js +0 -1
  140. recce/data/_next/static/chunks/bbda5537-9ec25eb1dd62348a.js +0 -1
  141. recce/data/_next/static/chunks/c132bf7d-08cb668a789d6afd.js +0 -1
  142. recce/data/_next/static/chunks/ce84277d-2e5d1d46910cf052.js +0 -1
  143. recce/data/_next/static/chunks/febdd86e-c6b525341634b860.js +0 -54
  144. recce/data/_next/static/chunks/fee69bc6-2dbccaf9b90474e6.js +0 -1
  145. recce/data/_next/static/chunks/framework-ded83d71b51ce901.js +0 -1
  146. recce/data/_next/static/chunks/main-app-39061b0166c47f55.js +0 -1
  147. recce/data/_next/static/chunks/main-b5b3ae20a1405261.js +0 -1
  148. recce/data/_next/static/chunks/pages/_app-437c455677d62394.js +0 -1
  149. recce/data/_next/static/chunks/pages/_error-e7650df18ca04bde.js +0 -1
  150. recce/data/_next/static/chunks/webpack-7b49d5ba7e3a434d.js +0 -1
  151. recce/data/_next/static/css/17a96168e3a9db13.css +0 -1
  152. recce/data/_next/static/css/1b121dc4d36aeb4d.css +0 -3
  153. recce/data/_next/static/css/35c6679a098e1e34.css +0 -1
  154. recce/data/_next/static/css/951e2e0eea2d4a5b.css +0 -14
  155. recce/data/_next/static/media/montserrat-cyrillic-800-normal.22628180.woff2 +0 -0
  156. recce/data/_next/static/media/montserrat-cyrillic-ext-800-normal.94a63aea.woff2 +0 -0
  157. recce/data/_next/static/media/montserrat-latin-800-normal.6f8fa298.woff2 +0 -0
  158. recce/data/_next/static/media/montserrat-latin-ext-800-normal.013b84f9.woff2 +0 -0
  159. recce/data/_next/static/media/montserrat-vietnamese-800-normal.c0035377.woff2 +0 -0
  160. recce/state.py +0 -785
  161. recce_nightly-1.9.0.20250623.dist-info/RECORD +0 -151
  162. tests/test_state.py +0 -134
  163. /recce/data/_next/static/{WrRUb3nV8BhAZG_R8kVma → 6LypcDXgyuSaiSCrsmUub}/_ssgManifest.js +0 -0
  164. /recce/data/_next/static/chunks/{polyfills-42372ed130431b0a.js → a6dad97d9634a72d.js} +0 -0
  165. /recce/data/_next/static/media/{montserrat-cyrillic-ext-800-normal.e6e0d8d0.woff → montserrat-cyrillic-ext-800-normal.a4fa76b5.woff} +0 -0
  166. /recce/data/_next/static/media/{reload-image.79aabb7d.svg → reload-image.7aa931c7.svg} +0 -0
  167. {recce_nightly-1.9.0.20250623.dist-info → recce_nightly-1.25.0.20251112a2066.dist-info}/WHEEL +0 -0
  168. {recce_nightly-1.9.0.20250623.dist-info → recce_nightly-1.25.0.20251112a2066.dist-info}/entry_points.txt +0 -0
  169. {recce_nightly-1.9.0.20250623.dist-info → recce_nightly-1.25.0.20251112a2066.dist-info}/licenses/LICENSE +0 -0
@@ -3,6 +3,7 @@ import logging
3
3
  import os
4
4
  import uuid
5
5
  from contextlib import contextmanager
6
+ from copy import deepcopy
6
7
  from dataclasses import dataclass, fields
7
8
  from errno import ENOENT
8
9
  from functools import lru_cache
@@ -25,12 +26,12 @@ from recce.event import log_performance
25
26
  from recce.exceptions import RecceException
26
27
  from recce.util.cll import CLLPerformanceTracking, cll
27
28
  from recce.util.lineage import (
29
+ build_column_key,
28
30
  filter_dependency_maps,
29
- filter_lineage_vertices,
30
- find_column_dependencies,
31
31
  find_downstream,
32
32
  find_upstream,
33
33
  )
34
+ from recce.util.perf_tracking import LineagePerfTracker
34
35
 
35
36
  from ...tasks.profile import ProfileTask
36
37
  from ...util.breaking import BreakingPerformanceTracking, parse_change_category
@@ -278,7 +279,7 @@ class DbtArgs:
278
279
  target_path: Optional[str] = (None,)
279
280
  project_only_flags: Optional[Dict[str, Any]] = None
280
281
  which: Optional[str] = None
281
- state_modified_compare_more_unrendered_values: Optional[bool] = False # new flag added since dbt v1.9
282
+ state_modified_compare_more_unrendered_values: Optional[bool] = True # new flag added since dbt v1.9
282
283
 
283
284
 
284
285
  @dataclass
@@ -407,7 +408,7 @@ class DbtAdapter(BaseAdapter):
407
408
 
408
409
  if self.adapter.connections.TYPE == "databricks":
409
410
  # reference: get_columns_in_relation (dbt/adapters/databricks/impl.py)
410
- from dbt.adapters.databricks import DatabricksColumn
411
+ from dbt.adapters.databricks.column import DatabricksColumn
411
412
 
412
413
  rows = columns
413
414
  columns = []
@@ -599,7 +600,15 @@ class DbtAdapter(BaseAdapter):
599
600
  return node.compiled_code
600
601
  else:
601
602
  from dbt.clients import jinja
602
- from dbt.context.providers import generate_runtime_model_context
603
+ from dbt.context.providers import (
604
+ generate_runtime_macro_context,
605
+ generate_runtime_model_context,
606
+ )
607
+
608
+ # Set up macro resolver for dbt >= 1.8
609
+ macro_manifest = MacroManifest(manifest.macros)
610
+ self.adapter.set_macro_resolver(macro_manifest)
611
+ self.adapter.set_macro_context_generator(generate_runtime_macro_context)
603
612
 
604
613
  jinja_ctx = generate_runtime_model_context(node, self.runtime_config, manifest)
605
614
  jinja_ctx.update(context)
@@ -658,8 +667,8 @@ class DbtAdapter(BaseAdapter):
658
667
  @lru_cache(maxsize=2)
659
668
  def get_lineage_cached(self, base: Optional[bool] = False, cache_key=0):
660
669
  if base is False:
661
- cll_tracker = CLLPerformanceTracking()
662
- cll_tracker.start_lineage()
670
+ perf_tracker = LineagePerfTracker()
671
+ perf_tracker.start_lineage()
663
672
 
664
673
  manifest = self.curr_manifest if base is False else self.base_manifest
665
674
  catalog = self.curr_catalog if base is False else self.base_catalog
@@ -736,6 +745,7 @@ class DbtAdapter(BaseAdapter):
736
745
  nodes[unique_id] = {
737
746
  "id": source["unique_id"],
738
747
  "name": source["name"],
748
+ "source_name": source["source_name"],
739
749
  "resource_type": source["resource_type"],
740
750
  "package_name": source["package_name"],
741
751
  "config": source["config"],
@@ -777,10 +787,10 @@ class DbtAdapter(BaseAdapter):
777
787
  parent_map = self.build_parent_map(nodes, base)
778
788
 
779
789
  if base is False:
780
- cll_tracker.end_lineage()
781
- cll_tracker.set_total_nodes(len(nodes))
782
- log_performance("model lineage", cll_tracker.to_dict())
783
- cll_tracker.reset()
790
+ perf_tracker.end_lineage()
791
+ perf_tracker.set_total_nodes(len(nodes))
792
+ log_performance("model lineage", perf_tracker.to_dict())
793
+ perf_tracker.reset()
784
794
 
785
795
  return dict(
786
796
  parent_map=parent_map,
@@ -793,15 +803,43 @@ class DbtAdapter(BaseAdapter):
793
803
  def _get_lineage_diff_cached(self, cache_key) -> LineageDiff:
794
804
  base = self.get_lineage(base=True)
795
805
  current = self.get_lineage(base=False)
796
- keys = {*base.get("nodes", {}).keys(), *current.get("nodes", {}).keys()}
797
806
 
798
- # Start to diff
799
- perf_tracking = BreakingPerformanceTracking()
800
- perf_tracking.start_lineage_diff()
807
+ modified_nodes = self.select_nodes(select="state:modified")
808
+ diff = {}
809
+ for node_id in modified_nodes:
810
+ base_node = base.get("nodes", {}).get(node_id)
811
+ curr_node = current.get("nodes", {}).get(node_id)
812
+ if base_node and curr_node:
813
+ diff[node_id] = NodeDiff(change_status="modified")
814
+ elif base_node:
815
+ diff[node_id] = NodeDiff(change_status="removed")
816
+ elif curr_node:
817
+ diff[node_id] = NodeDiff(change_status="added")
818
+
819
+ return LineageDiff(
820
+ base=base,
821
+ current=current,
822
+ diff=diff,
823
+ )
824
+
825
+ @lru_cache(maxsize=128)
826
+ def get_change_analysis_cached(self, node_id: str):
827
+ breaking_perf_tracker = BreakingPerformanceTracking()
828
+ lineage_diff = self.get_lineage_diff()
829
+ diff = lineage_diff.diff
830
+
831
+ if node_id not in diff or diff[node_id].change_status != "modified":
832
+ return diff.get(node_id)
833
+
834
+ breaking_perf_tracker.increment_modified_nodes()
835
+ breaking_perf_tracker.start_lineage_diff()
836
+
837
+ base = lineage_diff.base
838
+ current = lineage_diff.current
801
839
 
802
840
  base_manifest = as_manifest(self.get_manifest(True))
803
841
  curr_manifest = as_manifest(self.get_manifest(False))
804
- perf_tracking.record_checkpoint("manifest")
842
+ breaking_perf_tracker.record_checkpoint("manifest")
805
843
 
806
844
  def ref_func(*args):
807
845
  if len(args) == 1:
@@ -821,111 +859,106 @@ class DbtAdapter(BaseAdapter):
821
859
  source=source_func,
822
860
  )
823
861
 
824
- # for each node, compare the base and current lineage
825
- diff = {}
826
- for key in keys:
827
- base_node = base.get("nodes", {}).get(key)
828
- curr_node = current.get("nodes", {}).get(key)
829
- if base_node and curr_node:
830
- base_checksum = base_node.get("checksum", {}).get("checksum")
831
- curr_checksum = curr_node.get("checksum", {}).get("checksum")
832
- change = None
833
- if base_checksum is None or curr_checksum is None or base_checksum == curr_checksum:
834
- continue
835
-
836
- if curr_node.get("resource_type") == "model":
837
- try:
838
- perf_tracking.increment_modified_nodes()
839
-
840
- def _get_schema(lineage):
841
- schema = {}
842
- nodes = lineage["nodes"]
843
- parent_list = lineage["parent_map"].get(key, [])
844
- for parent_id in parent_list:
845
- parent_node = nodes.get(parent_id)
846
- if parent_node is None:
847
- continue
848
- columns = parent_node.get("columns") or {}
849
- name = parent_node.get("name")
850
- if parent_node.get("resource_type") == "source":
851
- parts = parent_id.split(".")
852
- source = parts[2]
853
- table = parts[3]
854
- source = source.replace("-", "_")
855
- name = f"__{source}__{table}"
856
- schema[name] = {name: column.get("type") for name, column in columns.items()}
857
- return schema
858
-
859
- base_sql = self.generate_sql(
860
- base_node.get("raw_code"),
861
- context=jinja_context,
862
- provided_manifest=base_manifest,
863
- )
864
- curr_sql = self.generate_sql(
865
- curr_node.get("raw_code"),
866
- context=jinja_context,
867
- provided_manifest=curr_manifest,
868
- )
869
- base_schema = _get_schema(base)
870
- curr_schema = _get_schema(current)
871
- dialect = self.adapter.connections.TYPE
872
- if curr_manifest.metadata.adapter_type is not None:
873
- dialect = curr_manifest.metadata.adapter_type
874
-
875
- change = parse_change_category(
876
- base_sql,
877
- curr_sql,
878
- old_schema=base_schema,
879
- new_schema=curr_schema,
880
- dialect=dialect,
881
- perf_tracking=perf_tracking,
882
- )
883
-
884
- # Make sure that the case of the column names are the same
885
- changed_columns = {
886
- column.lower(): change_status for column, change_status in (change.columns or {}).items()
887
- }
888
- changed_columns_names = set(changed_columns)
889
- changed_columns_final = {}
862
+ base_node = base.get("nodes", {}).get(node_id)
863
+ curr_node = current.get("nodes", {}).get(node_id)
864
+ change = NodeChange(category="unknown")
865
+ if (
866
+ curr_node.get("resource_type") in ["model", "snapshot"]
867
+ and curr_node.get("raw_code") is not None
868
+ and base_node.get("raw_code") is not None
869
+ ):
870
+ try:
871
+
872
+ def _get_schema(lineage):
873
+ schema = {}
874
+ nodes = lineage["nodes"]
875
+ parent_list = lineage["parent_map"].get(node_id, [])
876
+ for parent_id in parent_list:
877
+ parent_node = nodes.get(parent_id)
878
+ if parent_node is None:
879
+ continue
880
+ columns = parent_node.get("columns") or {}
881
+ name = parent_node.get("name")
882
+ if parent_node.get("resource_type") == "source":
883
+ parts = parent_id.split(".")
884
+ source = parts[2]
885
+ table = parts[3]
886
+ source = source.replace("-", "_")
887
+ name = f"__{source}__{table}"
888
+ schema[name] = {name: column.get("type") for name, column in columns.items()}
889
+ return schema
890
+
891
+ base_sql = self.generate_sql(
892
+ base_node.get("raw_code"),
893
+ context=jinja_context,
894
+ provided_manifest=base_manifest,
895
+ )
896
+ curr_sql = self.generate_sql(
897
+ curr_node.get("raw_code"),
898
+ context=jinja_context,
899
+ provided_manifest=curr_manifest,
900
+ )
901
+ base_schema = _get_schema(base)
902
+ curr_schema = _get_schema(current)
903
+ dialect = self.adapter.connections.TYPE
904
+ if curr_manifest.metadata.adapter_type is not None:
905
+ dialect = curr_manifest.metadata.adapter_type
906
+
907
+ change = parse_change_category(
908
+ base_sql,
909
+ curr_sql,
910
+ old_schema=base_schema,
911
+ new_schema=curr_schema,
912
+ dialect=dialect,
913
+ perf_tracking=breaking_perf_tracker,
914
+ )
890
915
 
891
- base_columns = base_node.get("columns") or {}
892
- curr_columns = curr_node.get("columns") or {}
893
- columns_names = set(base_columns) | set(curr_columns)
916
+ # Make sure that the case of the column names are the same
917
+ changed_columns = {
918
+ column.lower(): change_status for column, change_status in (change.columns or {}).items()
919
+ }
920
+ changed_columns_names = set(changed_columns)
921
+ changed_columns_final = {}
894
922
 
895
- for column_name in columns_names:
896
- if column_name.lower() in changed_columns_names:
897
- changed_columns_final[column_name] = changed_columns[column_name.lower()]
923
+ base_columns = base_node.get("columns") or {}
924
+ curr_columns = curr_node.get("columns") or {}
925
+ columns_names = set(base_columns) | set(curr_columns)
898
926
 
899
- change.columns = changed_columns_final
900
- except Exception:
901
- change = NodeChange(category="unknown")
927
+ for column_name in columns_names:
928
+ if column_name.lower() in changed_columns_names:
929
+ changed_columns_final[column_name] = changed_columns[column_name.lower()]
902
930
 
903
- diff[key] = NodeDiff(change_status="modified", change=change)
904
- elif base_node:
905
- diff[key] = NodeDiff(change_status="removed")
906
- elif curr_node:
907
- diff[key] = NodeDiff(change_status="added")
931
+ change.columns = changed_columns_final
932
+ except Exception:
933
+ # TODO: telemetry
934
+ pass
908
935
 
909
- perf_tracking.end_lineage_diff()
910
- log_performance("model lineage diff", perf_tracking.to_dict())
911
-
912
- return LineageDiff(
913
- base=base,
914
- current=current,
915
- diff=diff,
916
- )
936
+ breaking_perf_tracker.end_lineage_diff()
937
+ log_performance("change analysis per node", breaking_perf_tracker.to_dict())
938
+ breaking_perf_tracker.reset()
939
+ node_diff = diff.get(node_id)
940
+ node_diff.change = change
941
+ return node_diff
917
942
 
918
943
  def get_cll(
919
944
  self,
920
945
  node_id: Optional[str] = None,
921
946
  column: Optional[str] = None,
922
947
  change_analysis: Optional[bool] = False,
923
- cll: Optional[bool] = True,
924
- upstream: Optional[bool] = True,
925
- downstream: Optional[bool] = True,
948
+ no_cll: Optional[bool] = False,
949
+ no_upstream: Optional[bool] = False,
950
+ no_downstream: Optional[bool] = False,
926
951
  no_filter: Optional[bool] = False,
927
952
  ) -> CllData:
928
- cll_tracker = CLLPerformanceTracking()
953
+ cll_tracker = LineagePerfTracker()
954
+ cll_tracker.set_params(
955
+ has_node=node_id is not None,
956
+ has_column=column is not None,
957
+ change_analysis=change_analysis,
958
+ no_cll=no_cll,
959
+ no_upstream=no_upstream,
960
+ no_downstream=no_downstream,
961
+ )
929
962
  cll_tracker.start_column_lineage()
930
963
 
931
964
  manifest = self.curr_manifest
@@ -936,47 +969,114 @@ class DbtAdapter(BaseAdapter):
936
969
  cll_node_ids = {node_id}
937
970
  else:
938
971
  lineage_diff = self.get_lineage_diff()
939
- cll_node_ids = lineage_diff.diff.keys()
972
+ cll_node_ids = set(lineage_diff.diff.keys())
973
+
974
+ cll_tracker.set_init_nodes(len(cll_node_ids))
940
975
 
941
976
  nodes = {}
942
977
  columns = {}
943
978
  parent_map = {}
944
979
  child_map = {}
945
980
 
946
- if upstream:
981
+ if not no_upstream:
947
982
  cll_node_ids = cll_node_ids.union(find_upstream(cll_node_ids, manifest_dict.get("parent_map")))
948
- if downstream:
983
+ if not no_downstream:
949
984
  cll_node_ids = cll_node_ids.union(find_downstream(cll_node_ids, manifest_dict.get("child_map")))
950
985
 
951
- if cll:
986
+ if not no_cll:
987
+ allowed_related_nodes = set()
988
+ for key in ["sources", "nodes", "exposures", "metrics"]:
989
+ attr = getattr(manifest, key)
990
+ allowed_related_nodes.update(set(attr.keys()))
991
+ if hasattr(manifest, "semantic_models"):
992
+ attr = getattr(manifest, "semantic_models")
993
+ allowed_related_nodes.update(set(attr.keys()))
952
994
  for cll_node_id in cll_node_ids:
953
- if (
954
- cll_node_id not in manifest.sources
955
- and cll_node_id not in manifest.nodes
956
- and cll_node_id not in manifest.exposures
957
- ):
995
+ if cll_node_id not in allowed_related_nodes:
958
996
  continue
959
- cll_data_one = self.get_cll_cached(cll_node_id, base=False)
997
+ cll_data_one = deepcopy(self.get_cll_cached(cll_node_id, base=False))
998
+ cll_tracker.increment_cll_nodes()
960
999
  if cll_data_one is None:
961
1000
  continue
962
1001
 
963
- node_diff = self.get_lineage_diff().diff.get(cll_node_id) if change_analysis else None
964
- for n_id, n in cll_data_one.nodes.items():
965
- nodes[n_id] = n
966
-
967
- if node_diff is not None:
968
- n.change_status = node_diff.change_status
969
- if node_diff.change is not None:
970
- n.change_category = node_diff.change.category
1002
+ nodes[cll_node_id] = cll_data_one.nodes.get(cll_node_id)
1003
+ node_diff = None
1004
+ if change_analysis:
1005
+ node_diff = self.get_change_analysis_cached(cll_node_id)
1006
+ cll_tracker.increment_change_analysis_nodes()
1007
+ if node_diff is not None:
1008
+ nodes[cll_node_id].change_status = node_diff.change_status
1009
+ if node_diff.change is not None:
1010
+ nodes[cll_node_id].change_category = node_diff.change.category
971
1011
  for c_id, c in cll_data_one.columns.items():
972
1012
  columns[c_id] = c
973
- if node_diff is not None and node_diff.change is not None:
974
- column_diff = node_diff.change.columns.get(c.name)
975
- if column_diff:
976
- c.change_status = column_diff
1013
+ if node_diff is not None:
1014
+ if node_diff.change_status == "added":
1015
+ c.change_status = "added"
1016
+ elif node_diff.change_status == "removed":
1017
+ c.change_status = "removed"
1018
+ elif node_diff.change is not None and node_diff.change.columns is not None:
1019
+ column_diff = node_diff.change.columns.get(c.name)
1020
+ if column_diff:
1021
+ c.change_status = column_diff
977
1022
 
978
1023
  for p_id, parents in cll_data_one.parent_map.items():
979
1024
  parent_map[p_id] = parents
1025
+ else:
1026
+ for cll_node_id in cll_node_ids:
1027
+ cll_node = None
1028
+ cll_node_columns: Dict[str, CllColumn] = {}
1029
+
1030
+ if cll_node_id in manifest.sources:
1031
+ cll_node = CllNode.build_cll_node(manifest, "sources", cll_node_id)
1032
+ if self.curr_catalog and cll_node_id in self.curr_catalog.sources:
1033
+ cll_node_columns = {
1034
+ column.name: CllColumn(
1035
+ id=f"{cll_node_id}_{column.name}",
1036
+ table_id=cll_node_id,
1037
+ name=column.name,
1038
+ type=column.type,
1039
+ )
1040
+ for column in self.curr_catalog.sources[cll_node_id].columns.values()
1041
+ }
1042
+ elif cll_node_id in manifest.nodes:
1043
+ cll_node = CllNode.build_cll_node(manifest, "nodes", cll_node_id)
1044
+ if self.curr_catalog and cll_node_id in self.curr_catalog.nodes:
1045
+ cll_node_columns = {
1046
+ column.name: CllColumn(
1047
+ id=f"{cll_node_id}_{column.name}",
1048
+ table_id=cll_node_id,
1049
+ name=column.name,
1050
+ type=column.type,
1051
+ )
1052
+ for column in self.curr_catalog.nodes[cll_node_id].columns.values()
1053
+ }
1054
+ elif cll_node_id in manifest.exposures:
1055
+ cll_node = CllNode.build_cll_node(manifest, "exposures", cll_node_id)
1056
+ elif hasattr(manifest, "semantic_models") and cll_node_id in manifest.semantic_models:
1057
+ cll_node = CllNode.build_cll_node(manifest, "semantic_models", cll_node_id)
1058
+ elif cll_node_id in manifest.metrics:
1059
+ cll_node = CllNode.build_cll_node(manifest, "metrics", cll_node_id)
1060
+
1061
+ if not cll_node:
1062
+ continue
1063
+ nodes[cll_node_id] = cll_node
1064
+
1065
+ node_diff = None
1066
+ if change_analysis:
1067
+ node_diff = self.get_change_analysis_cached(cll_node_id)
1068
+ cll_tracker.increment_change_analysis_nodes()
1069
+ if node_diff is not None:
1070
+ cll_node.change_status = node_diff.change_status
1071
+ if node_diff.change is not None:
1072
+ cll_node.change_category = node_diff.change.category
1073
+ for c, cll_column in cll_node_columns.items():
1074
+ cll_node.columns[c] = cll_column
1075
+ columns[cll_column.id] = cll_column
1076
+ if node_diff.change.columns and c in node_diff.change.columns:
1077
+ cll_column.change_status = node_diff.change.columns[c]
1078
+
1079
+ parent_map[cll_node_id] = manifest.parent_map.get(cll_node_id, [])
980
1080
 
981
1081
  # build the child map
982
1082
  for parent_id, parents in parent_map.items():
@@ -987,47 +1087,90 @@ class DbtAdapter(BaseAdapter):
987
1087
 
988
1088
  # Find the anchor nodes
989
1089
  anchor_node_ids = set()
1090
+ extra_node_ids = set()
990
1091
  if node_id is None and column is None:
991
1092
  if change_analysis:
992
1093
  # If change analysis is requested, we need to find the nodes that have changes
993
- for node_id, node_diff in self.get_lineage_diff().diff.items():
994
- if node_diff.change.category == "breaking":
995
- anchor_node_ids.add(node_id)
996
- for column_name in node_diff.change.columns:
997
- anchor_node_ids.add(f"{node_id}_{column_name}")
1094
+ lineage_diff = self.get_lineage_diff()
1095
+ for nid, nd in lineage_diff.diff.items():
1096
+ if nd.change_status == "added":
1097
+ anchor_node_ids.add(nid)
1098
+ n = lineage_diff.current["nodes"].get(nid)
1099
+ n_columns = n.get("columns", {})
1100
+ for c in n_columns:
1101
+ anchor_node_ids.add(build_column_key(nid, c))
1102
+ continue
1103
+ if nd.change_status == "removed":
1104
+ extra_node_ids.add(nid)
1105
+ continue
1106
+
1107
+ node_diff = self.get_change_analysis_cached(nid)
1108
+ if node_diff is not None and node_diff.change is not None:
1109
+ extra_node_ids.add(nid)
1110
+ if no_cll:
1111
+ if node_diff.change.category in ["breaking", "partial_breaking", "unknown"]:
1112
+ anchor_node_ids.add(nid)
1113
+ else:
1114
+ if node_diff.change.category in ["breaking", "unknown"]:
1115
+ anchor_node_ids.add(nid)
1116
+ if node_diff.change.columns is not None:
1117
+ for column_name in node_diff.change.columns:
1118
+ anchor_node_ids.add(f"{nid}_{column_name}")
998
1119
  else:
999
1120
  lineage_diff = self.get_lineage_diff()
1000
1121
  anchor_node_ids = lineage_diff.diff.keys()
1001
1122
  elif node_id is not None and column is None:
1002
1123
  if change_analysis:
1003
1124
  # If change analysis is requested, we need to find the nodes that have changes
1004
- node_diff = self.get_lineage_diff().diff.get(node_id)
1005
- if node_diff:
1006
- if node_diff.change.category == "breaking":
1007
- anchor_node_ids.add(node_id)
1008
- for column_name in node_diff.change.columns:
1009
- anchor_node_ids.add(f"{node_id}_{column_name}")
1125
+ node_diff = self.get_change_analysis_cached(node_id)
1126
+ if node_diff is not None and node_diff.change is not None:
1127
+ extra_node_ids.add(node_id)
1128
+ if no_cll:
1129
+ if node_diff.change.category in ["breaking", "partial_breaking", "unknown"]:
1130
+ anchor_node_ids.add(node_id)
1131
+ else:
1132
+ if node_diff.change.category in ["breaking", "unknown"]:
1133
+ anchor_node_ids.add(node_id)
1134
+ if node_diff.change.columns is not None:
1135
+ for column_name in node_diff.change.columns:
1136
+ anchor_node_ids.add(f"{node_id}_{column_name}")
1010
1137
  else:
1011
1138
  anchor_node_ids.add(node_id)
1012
1139
  else:
1013
1140
  anchor_node_ids.add(node_id)
1141
+ if not no_cll:
1142
+ node = nodes.get(node_id)
1143
+ if node:
1144
+ for column_name in node.columns:
1145
+ column_key = build_column_key(node_id, column_name)
1146
+ anchor_node_ids.add(column_key)
1014
1147
  else:
1015
1148
  anchor_node_ids.add(f"{node_id}_{column}")
1016
1149
 
1150
+ cll_tracker.set_anchor_nodes(len(anchor_node_ids))
1017
1151
  result_node_ids = set(anchor_node_ids)
1018
- if upstream:
1152
+ if not no_upstream:
1019
1153
  result_node_ids = result_node_ids.union(find_upstream(anchor_node_ids, parent_map))
1020
- if downstream:
1154
+ if not no_downstream:
1021
1155
  result_node_ids = result_node_ids.union(find_downstream(anchor_node_ids, child_map))
1022
1156
 
1023
1157
  # Filter the nodes and columns based on the anchor nodes
1024
1158
  if not no_filter:
1025
- nodes = {k: v for k, v in nodes.items() if k in result_node_ids}
1026
- columns = {k: v for k, v in columns.items() if k in result_node_ids}
1159
+ nodes = {k: v for k, v in nodes.items() if k in result_node_ids or k in extra_node_ids}
1160
+ columns = {k: v for k, v in columns.items() if k in result_node_ids or k in extra_node_ids}
1161
+
1162
+ for node in nodes.values():
1163
+ node.columns = {
1164
+ k: v for k, v in node.columns.items() if v.id in result_node_ids or v.id in extra_node_ids
1165
+ }
1166
+
1167
+ if change_analysis:
1168
+ node.impacted = node.id in result_node_ids
1169
+
1027
1170
  parent_map, child_map = filter_dependency_maps(parent_map, child_map, result_node_ids)
1028
1171
 
1029
1172
  cll_tracker.end_column_lineage()
1030
- cll_tracker.set_total_nodes(len(nodes))
1173
+ cll_tracker.set_total_nodes(len(nodes) + len(columns))
1031
1174
  log_performance("column level lineage", cll_tracker.to_dict())
1032
1175
  cll_tracker.reset()
1033
1176
 
@@ -1046,6 +1189,9 @@ class DbtAdapter(BaseAdapter):
1046
1189
  if node is None:
1047
1190
  return None
1048
1191
 
1192
+ cll_tracker.set_total_nodes(1)
1193
+ cll_tracker.start_column_lineage()
1194
+
1049
1195
  def _apply_all_columns(node: CllNode, transformation_type):
1050
1196
  cll_data = CllData()
1051
1197
  cll_data.nodes[node.id] = node
@@ -1170,6 +1316,10 @@ class DbtAdapter(BaseAdapter):
1170
1316
  depends_on.add(parent_key)
1171
1317
  column.transformation_type = c2c_map[name].transformation_type
1172
1318
  cll_data.parent_map[column_id] = set(depends_on)
1319
+
1320
+ cll_tracker.end_column_lineage()
1321
+ log_performance("column level lineage per node", cll_tracker.to_dict())
1322
+ cll_tracker.reset()
1173
1323
  return cll_data
1174
1324
 
1175
1325
  def get_cll_node(self, node_id: str, base: Optional[bool] = False) -> Tuple[Optional[CllNode], list[str]]:
@@ -1181,21 +1331,12 @@ class DbtAdapter(BaseAdapter):
1181
1331
  # model, seed, snapshot
1182
1332
  if node_id in manifest.nodes:
1183
1333
  found = manifest.nodes[node_id]
1184
- if found.resource_type not in ["model", "seed", "snapshot"]:
1185
- return None, []
1186
-
1187
1334
  unique_id = found.unique_id
1188
- node = CllNode(
1189
- id=found.unique_id,
1190
- name=found.name,
1191
- package_name=found.package_name,
1192
- resource_type=found.resource_type,
1193
- raw_code=found.raw_code,
1194
- )
1335
+ node = CllNode.build_cll_node(manifest, "nodes", node_id)
1195
1336
  if hasattr(found.depends_on, "nodes"):
1196
1337
  parent_list = found.depends_on.nodes
1197
1338
 
1198
- if catalog is not None and unique_id in catalog.nodes:
1339
+ if catalog is not None and node is not None and unique_id in catalog.nodes:
1199
1340
  columns = {}
1200
1341
  for col_name, col_metadata in catalog.nodes[unique_id].columns.items():
1201
1342
  column_id = f"{unique_id}_{col_name}"
@@ -1207,17 +1348,10 @@ class DbtAdapter(BaseAdapter):
1207
1348
  if node_id in manifest.sources:
1208
1349
  found = manifest.sources[node_id]
1209
1350
  unique_id = found.unique_id
1210
-
1211
- node = CllNode(
1212
- id=found.unique_id,
1213
- name=found.name,
1214
- package_name=found.package_name,
1215
- resource_type=found.resource_type,
1216
- source_name=found.source_name,
1217
- )
1351
+ node = CllNode.build_cll_node(manifest, "sources", node_id)
1218
1352
  parent_list = []
1219
1353
 
1220
- if catalog is not None and unique_id in catalog.sources:
1354
+ if catalog is not None and node is not None and unique_id in catalog.sources:
1221
1355
  columns = {}
1222
1356
  for col_name, col_metadata in catalog.sources[unique_id].columns.items():
1223
1357
  column_id = f"{unique_id}_{col_name}"
@@ -1228,13 +1362,19 @@ class DbtAdapter(BaseAdapter):
1228
1362
  # exposure
1229
1363
  if node_id in manifest.exposures:
1230
1364
  found = manifest.exposures[node_id]
1365
+ node = CllNode.build_cll_node(manifest, "exposures", node_id)
1366
+ if hasattr(found.depends_on, "nodes"):
1367
+ parent_list = found.depends_on.nodes
1231
1368
 
1232
- node = CllNode(
1233
- id=found.unique_id,
1234
- name=found.name,
1235
- package_name=found.package_name,
1236
- resource_type=found.resource_type,
1237
- )
1369
+ if hasattr(manifest, "semantic_models") and node_id in manifest.semantic_models:
1370
+ found = manifest.semantic_models[node_id]
1371
+ node = CllNode.build_cll_node(manifest, "semantic_models", node_id)
1372
+ if hasattr(found.depends_on, "nodes"):
1373
+ parent_list = found.depends_on.nodes
1374
+
1375
+ if node_id in manifest.metrics:
1376
+ found = manifest.metrics[node_id]
1377
+ node = CllNode.build_cll_node(manifest, "metrics", node_id)
1238
1378
  if hasattr(found.depends_on, "nodes"):
1239
1379
  parent_list = found.depends_on.nodes
1240
1380
 
@@ -1250,73 +1390,6 @@ class DbtAdapter(BaseAdapter):
1250
1390
  }
1251
1391
  return None
1252
1392
 
1253
- def get_impact_radius(self, node_id: str) -> CllData:
1254
- impacted_nodes = self.get_impacted_nodes(node_id)
1255
- impacted_cll = self.get_impacted_cll(node_id)
1256
-
1257
- # merge impact radius
1258
- return self._merge_cll_data(impacted_nodes, impacted_cll)
1259
-
1260
- def get_impacted_nodes(self, node_id: str) -> CllData:
1261
- lineage_diff = self.get_lineage_diff()
1262
- diff_info = lineage_diff.diff.get(node_id)
1263
- if diff_info is None:
1264
- return CllData()
1265
- change_category = diff_info.change.category
1266
-
1267
- if change_category == "breaking":
1268
- cll = self.get_cll(node_id, no_filter=True)
1269
- _, downstream = find_column_dependencies(node_id, cll.parent_map, cll.child_map)
1270
- relevant_columns = {node_id}
1271
- relevant_columns.update(downstream)
1272
- nodes, columns = filter_lineage_vertices(cll.nodes, cll.columns, relevant_columns)
1273
- p_map, c_map = filter_dependency_maps(cll.parent_map, cll.child_map, relevant_columns)
1274
-
1275
- return CllData(nodes=nodes, columns=columns, parent_map=p_map, child_map=c_map)
1276
-
1277
- return CllData()
1278
-
1279
- def get_impacted_cll(self, node_id: str) -> CllData:
1280
- lineage_diff = self.get_lineage_diff()
1281
- diff_info = lineage_diff.diff.get(node_id)
1282
- if diff_info is None:
1283
- return CllData()
1284
- change_columns = diff_info.change.columns
1285
-
1286
- cll = self.get_cll(node_id, no_filter=True)
1287
- relevant_columns = set()
1288
- for col, change_status in change_columns.items():
1289
- if change_status == "removed":
1290
- continue
1291
- target_column = f"{node_id}_{col}"
1292
- _, downstream = find_column_dependencies(target_column, cll.parent_map, cll.child_map)
1293
- relevant_columns.add(target_column)
1294
- relevant_columns.update(downstream)
1295
-
1296
- nodes, columns = filter_lineage_vertices(cll.nodes, cll.columns, relevant_columns)
1297
- p_map, c_map = filter_dependency_maps(cll.parent_map, cll.child_map, relevant_columns)
1298
-
1299
- return CllData(nodes=nodes, columns=columns, parent_map=p_map, child_map=c_map)
1300
-
1301
- @staticmethod
1302
- def _merge_cll_data(base: CllData, target: CllData) -> CllData:
1303
- merged_nodes = {**base.nodes, **target.nodes}
1304
- merged_columns = {**base.columns, **target.columns}
1305
-
1306
- merged_parent_map = {}
1307
- merged_keys = set(base.parent_map.keys()).union(set(target.parent_map.keys()))
1308
- for key in merged_keys:
1309
- merged_parent_map[key] = base.parent_map.get(key, set()).union(target.parent_map.get(key, set()))
1310
-
1311
- merged_child_map = {}
1312
- merged_keys = set(base.child_map.keys()).union(set(target.child_map.keys()))
1313
- for key in merged_keys:
1314
- merged_child_map[key] = base.child_map.get(key, set()).union(target.child_map.get(key, set()))
1315
-
1316
- return CllData(
1317
- nodes=merged_nodes, columns=merged_columns, parent_map=merged_parent_map, child_map=merged_child_map
1318
- )
1319
-
1320
1393
  def build_name_to_unique_id_index(self) -> Dict[str, str]:
1321
1394
  name_to_unique_id = {}
1322
1395
  curr_manifest = self.get_manifest(base=False)
@@ -1404,13 +1477,18 @@ class DbtAdapter(BaseAdapter):
1404
1477
  self.curr_manifest = load_manifest(path=refresh_file_path)
1405
1478
  self.manifest = as_manifest(self.curr_manifest)
1406
1479
  self.get_cll_cached.cache_clear()
1480
+ self.get_change_analysis_cached.cache_clear()
1407
1481
  elif refresh_file_path.endswith("catalog.json"):
1408
1482
  self.curr_catalog = load_catalog(path=refresh_file_path)
1483
+ self.get_cll_cached.cache_clear()
1484
+ self.get_change_analysis_cached.cache_clear()
1409
1485
  elif self.base_path and target_type == os.path.basename(self.base_path):
1410
1486
  if refresh_file_path.endswith("manifest.json"):
1411
1487
  self.base_manifest = load_manifest(path=refresh_file_path)
1488
+ self.get_change_analysis_cached.cache_clear()
1412
1489
  elif refresh_file_path.endswith("catalog.json"):
1413
1490
  self.base_catalog = load_catalog(path=refresh_file_path)
1491
+ self.get_change_analysis_cached.cache_clear()
1414
1492
 
1415
1493
  def create_relation(self, model, base=False):
1416
1494
  node = self.find_node_by_name(model, base)
@@ -1522,7 +1600,7 @@ class DbtAdapter(BaseAdapter):
1522
1600
  if not os.path.isfile(path):
1523
1601
  return None
1524
1602
 
1525
- with open(path, "r") as f:
1603
+ with open(path, "r", encoding="utf-8") as f:
1526
1604
  json_content = f.read()
1527
1605
  return json.loads(json_content)
1528
1606