acryl-datahub 1.2.0.7rc4__py3-none-any.whl → 1.2.0.8rc1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release.
Files changed (34)
  1. {acryl_datahub-1.2.0.7rc4.dist-info → acryl_datahub-1.2.0.8rc1.dist-info}/METADATA +2674 -2674
  2. {acryl_datahub-1.2.0.7rc4.dist-info → acryl_datahub-1.2.0.8rc1.dist-info}/RECORD +34 -32
  3. datahub/_version.py +1 -1
  4. datahub/cli/delete_cli.py +1 -0
  5. datahub/ingestion/api/report.py +4 -0
  6. datahub/ingestion/autogenerated/capability_summary.json +1 -1
  7. datahub/ingestion/graph/client.py +8 -1
  8. datahub/ingestion/source/datahub/config.py +4 -0
  9. datahub/ingestion/source/datahub/datahub_database_reader.py +6 -1
  10. datahub/ingestion/source/metadata/lineage.py +8 -8
  11. datahub/ingestion/source/redshift/redshift.py +1 -1
  12. datahub/ingestion/source/sql/athena.py +95 -18
  13. datahub/ingestion/source/sql/athena_properties_extractor.py +43 -25
  14. datahub/ingestion/source/superset.py +3 -2
  15. datahub/ingestion/source/tableau/tableau.py +8 -5
  16. datahub/metadata/_internal_schema_classes.py +207 -12
  17. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  18. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +6 -0
  19. datahub/metadata/schema.avsc +160 -12
  20. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  21. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +9 -1
  22. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +77 -1
  23. datahub/metadata/schemas/DataProductKey.avsc +2 -1
  24. datahub/metadata/schemas/DomainKey.avsc +2 -1
  25. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  26. datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
  27. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  28. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +0 -3
  29. datahub/sql_parsing/sqlglot_lineage.py +121 -28
  30. datahub/sql_parsing/sqlglot_utils.py +12 -1
  31. {acryl_datahub-1.2.0.7rc4.dist-info → acryl_datahub-1.2.0.8rc1.dist-info}/WHEEL +0 -0
  32. {acryl_datahub-1.2.0.7rc4.dist-info → acryl_datahub-1.2.0.8rc1.dist-info}/entry_points.txt +0 -0
  33. {acryl_datahub-1.2.0.7rc4.dist-info → acryl_datahub-1.2.0.8rc1.dist-info}/licenses/LICENSE +0 -0
  34. {acryl_datahub-1.2.0.7rc4.dist-info → acryl_datahub-1.2.0.8rc1.dist-info}/top_level.txt +0 -0
datahub/sql_parsing/sqlglot_lineage.py

@@ -543,16 +543,19 @@ def _select_statement_cll(
     root_scope: sqlglot.optimizer.Scope,
     column_resolver: _ColumnResolver,
     output_table: Optional[_TableName],
+    table_name_schema_mapping: Dict[_TableName, SchemaInfo],
+    default_db: Optional[str] = None,
+    default_schema: Optional[str] = None,
 ) -> List[_ColumnLineageInfo]:
     column_lineage: List[_ColumnLineageInfo] = []

     try:
-        # List output columns.
         output_columns = [
             (select_col.alias_or_name, select_col) for select_col in statement.selects
         ]
         logger.debug("output columns: %s", [col[0] for col in output_columns])
-        for output_col, original_col_expression in output_columns:
+
+        for output_col, _original_col_expression in output_columns:
             if not output_col or output_col == "*":
                 # If schema information is available, the * will be expanded to the actual columns.
                 # Otherwise, we can't process it.
@@ -576,13 +579,14 @@ def _select_statement_cll(
                 trim_selects=False,
                 # We don't need to pass the schema in here, since we've already qualified the columns.
             )
-            # import pathlib
-            # pathlib.Path("sqlglot.html").write_text(
-            #     str(lineage_node.to_html(dialect=dialect))
-            # )

             # Generate SELECT lineage.
-            direct_raw_col_upstreams = _get_direct_raw_col_upstreams(lineage_node)
+            direct_raw_col_upstreams = _get_direct_raw_col_upstreams(
+                lineage_node,
+                dialect,
+                default_db,
+                default_schema,
+            )

             # Fuzzy resolve the output column.
             original_col_expression = lineage_node.expression
@@ -601,7 +605,7 @@ def _select_statement_cll(
             if original_col_expression.type:
                 output_col_type = original_col_expression.type

-            # Fuzzy resolve upstream columns.
+            # Resolve upstream columns - table names should already be qualified from placeholder processing
             direct_resolved_col_upstreams = {
                 _ColumnRef(
                     table=edge.table,
@@ -706,6 +710,9 @@ def _column_level_lineage(
         root_scope=root_scope,
         column_resolver=column_resolver,
         output_table=downstream_table,
+        table_name_schema_mapping=table_name_schema_mapping,
+        default_db=default_db,
+        default_schema=default_schema,
     )

     joins: Optional[List[_JoinInfo]] = None
@@ -726,6 +733,9 @@ def _get_direct_raw_col_upstreams(

 def _get_direct_raw_col_upstreams(
     lineage_node: sqlglot.lineage.Node,
+    dialect: Optional[sqlglot.Dialect] = None,
+    default_db: Optional[str] = None,
+    default_schema: Optional[str] = None,
 ) -> OrderedSet[_ColumnRef]:
     # Using an OrderedSet here to deduplicate upstreams while preserving "discovery" order.
     direct_raw_col_upstreams: OrderedSet[_ColumnRef] = OrderedSet()
@@ -755,6 +765,53 @@ def _get_direct_raw_col_upstreams(
                 direct_raw_col_upstreams.add(
                     _ColumnRef(table=table_ref, column=normalized_col)
                 )
+        elif isinstance(node.expression, sqlglot.exp.Placeholder) and node.name != "*":
+            # Handle placeholder expressions from lateral joins.
+            #
+            # In newer SQLGlot versions, columns from lateral subqueries appear as sqlglot.exp.Placeholder
+            # expressions instead of regular table references. This is critical for lateral join column lineage.
+            #
+            # Example: In "SELECT t2.value FROM t1, LATERAL (SELECT value FROM t2 WHERE t1.id = t2.id) t2"
+            # The "t2.value" column reference creates a placeholder with name like '"my_table2"."value"'
+            # which we need to parse to establish the lineage: output.value <- my_table2.value
+            #
+            # Without this handling, lateral join column lineage would be incomplete/missing.
+            try:
+                parsed = sqlglot.parse_one(node.name, dialect=dialect)
+                if isinstance(parsed, sqlglot.exp.Column) and parsed.table:
+                    table_ref = _TableName.from_sqlglot_table(
+                        sqlglot.parse_one(
+                            parsed.table, into=sqlglot.exp.Table, dialect=dialect
+                        )
+                    )
+
+                    # SQLGlot's qualification process doesn't fully qualify placeholder names from lateral joins.
+                    # Even after statement-level qualification, these placeholders remain unqualified (e.g., "t2.value").
+                    # We need this runtime qualification to ensure proper lineage resolution.
+                    # Only qualify if this appears to be a real table reference (not a temporary construct)
+                    if (
+                        not (table_ref.database or table_ref.db_schema)
+                        and dialect is not None
+                    ):
+                        table_ref = table_ref.qualified(
+                            dialect=dialect,
+                            default_db=default_db,
+                            default_schema=default_schema,
+                        )
+
+                    # Extract column name using proper isinstance check
+                    if isinstance(parsed.this, sqlglot.exp.Identifier):
+                        column_name = parsed.this.name
+                    else:
+                        column_name = str(parsed.this)
+                    direct_raw_col_upstreams.add(
+                        _ColumnRef(table=table_ref, column=column_name)
+                    )
+            except Exception as e:
+                logger.debug(
+                    f"Failed to parse placeholder column expression: {node.name} with dialect {dialect}. The exception was: {e}",
+                    exc_info=True,
+                )
         else:
             # This branch doesn't matter. For example, a count(*) column would go here, and
             # we don't get any column-level lineage for that.
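
For orientation, a minimal standalone sketch of the parsing step this new Placeholder branch relies on; the table and column names are illustrative, and only sqlglot itself is required:

import sqlglot

# A lateral-join placeholder carries the quoted column path as its node name,
# e.g. '"my_table2"."value"'. Parsing that string yields a Column expression.
parsed = sqlglot.parse_one('"my_table2"."value"')
assert isinstance(parsed, sqlglot.exp.Column)
print(parsed.table)      # my_table2
print(parsed.this.name)  # value

# The table portion can be re-parsed into a Table expression, mirroring the
# _TableName.from_sqlglot_table(...) call in the hunk above.
table = sqlglot.parse_one(parsed.table, into=sqlglot.exp.Table)
print(table.name)        # my_table2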
@@ -857,7 +914,7 @@ def _get_raw_col_upstreams_for_expression(
             trim_selects=False,
         )

-        return _get_direct_raw_col_upstreams(node)
+        return _get_direct_raw_col_upstreams(node, dialect, None, None)
     finally:
         scope.expression = original_expression

@@ -872,8 +929,9 @@ def _list_joins(

     scope: sqlglot.optimizer.Scope
     for scope in root_scope.traverse():
+        # PART 1: Handle regular explicit JOINs (updated API)
         join: sqlglot.exp.Join
-        for join in scope.find_all(sqlglot.exp.Join):
+        for join in scope.expression.find_all(sqlglot.exp.Join):
             left_side_tables: OrderedSet[_TableName] = OrderedSet()
             from_clause: sqlglot.exp.From
             for from_clause in scope.find_all(sqlglot.exp.From):
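
A note on the API change above, as I read it: in sqlglot's optimizer a Scope wraps an expression subtree, and Scope.find_all deliberately stops at scope boundaries, while scope.expression.find_all walks the whole wrapped tree. A tiny sketch (the query is invented):

import sqlglot
from sqlglot import exp
from sqlglot.optimizer.scope import build_scope

root = build_scope(sqlglot.parse_one("SELECT * FROM a JOIN b ON a.id = b.id"))
assert root is not None

# root.expression is the Select node this scope wraps; searching it directly
# is what the updated loop does to find the JOINs.
print(len(list(root.expression.find_all(exp.Join))))  # 1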
@@ -949,6 +1007,36 @@ def _list_joins(
                 )
             )

+        # Handle LATERAL constructs
+        for lateral in scope.expression.find_all(sqlglot.exp.Lateral):
+            # Get tables from non-lateral FROM clauses
+            qualified_left: OrderedSet[_TableName] = OrderedSet()
+            for from_clause in scope.find_all(sqlglot.exp.From):
+                if not isinstance(from_clause.this, sqlglot.exp.Lateral):
+                    qualified_left.update(
+                        _get_join_side_tables(from_clause.this, dialect, scope)
+                    )
+
+            # Get tables from lateral subquery
+            qualified_right: OrderedSet[_TableName] = OrderedSet()
+            if lateral.this and isinstance(lateral.this, sqlglot.exp.Subquery):
+                qualified_right.update(
+                    _TableName.from_sqlglot_table(t)
+                    for t in lateral.this.find_all(sqlglot.exp.Table)
+                )
+            qualified_right.update(qualified_left)
+
+            if qualified_left and qualified_right:
+                joins.append(
+                    _JoinInfo(
+                        join_type="LATERAL JOIN",
+                        left_tables=list(qualified_left),
+                        right_tables=list(qualified_right),
+                        on_clause=None,
+                        columns_involved=[],
+                    )
+                )
+
     return joins

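To see what the new LATERAL loop matches, a small illustration (query and table names invented):

import sqlglot
from sqlglot import exp

sql = (
    "SELECT t2.value FROM t1, "
    "LATERAL (SELECT value FROM my_table2 WHERE t1.id = my_table2.id) t2"
)
statement = sqlglot.parse_one(sql)

for lateral in statement.find_all(exp.Lateral):
    # lateral.this is the parenthesized subquery; the tables inside it become
    # the right side of the synthesized "LATERAL JOIN" entry above.
    if isinstance(lateral.this, exp.Subquery):
        print([t.name for t in lateral.this.find_all(exp.Table)])  # ['my_table2']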
@@ -1186,25 +1274,30 @@ def _translate_internal_joins(
 ) -> List[JoinInfo]:
     joins = []
     for raw_join in raw_joins:
-        joins.append(
-            JoinInfo(
-                join_type=raw_join.join_type,
-                left_tables=[
-                    table_name_urn_mapping[table] for table in raw_join.left_tables
-                ],
-                right_tables=[
-                    table_name_urn_mapping[table] for table in raw_join.right_tables
-                ],
-                on_clause=raw_join.on_clause,
-                columns_involved=[
-                    ColumnRef(
-                        table=table_name_urn_mapping[col.table],
-                        column=col.column,
-                    )
-                    for col in raw_join.columns_involved
-                ],
+        try:
+            joins.append(
+                JoinInfo(
+                    join_type=raw_join.join_type,
+                    left_tables=[
+                        table_name_urn_mapping[table] for table in raw_join.left_tables
+                    ],
+                    right_tables=[
+                        table_name_urn_mapping[table] for table in raw_join.right_tables
+                    ],
+                    on_clause=raw_join.on_clause,
+                    columns_involved=[
+                        ColumnRef(
+                            table=table_name_urn_mapping[col.table],
+                            column=col.column,
+                        )
+                        for col in raw_join.columns_involved
+                    ],
+                )
             )
-        )
+        except KeyError as e:
+            # Skip joins that reference tables we can't resolve (e.g., from CTE subqueries)
+            logger.debug(f"Skipping join with unresolvable table: {e}")
+            continue
     return joins

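The motivation for the new try/except, reduced to a self-contained miniature; the mapping and join data below are hypothetical stand-ins for the module's real types:

from typing import Dict, List

# A join referencing a CTE alias that never made it into the urn mapping would
# otherwise raise KeyError and abort translation of every remaining join.
table_name_urn_mapping: Dict[str, str] = {"db.schema.t1": "urn:li:dataset:t1"}
raw_joins: List[List[str]] = [["db.schema.t1"], ["cte_alias"]]  # second is unresolvable

translated: List[List[str]] = []
for tables in raw_joins:
    try:
        translated.append([table_name_urn_mapping[t] for t in tables])
    except KeyError as e:
        # Mirrors the logger.debug call in the hunk above.
        print(f"Skipping join with unresolvable table: {e}")

print(translated)  # [['urn:li:dataset:t1']]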
datahub/sql_parsing/sqlglot_utils.py

@@ -115,6 +115,8 @@ def _expression_to_string(
     return expression.sql(dialect=get_dialect(platform))


+PLACEHOLDER_BACKWARD_FINGERPRINT_NORMALIZATION = re.compile(r"(%s|\$\d|\?)")
+
 _BASIC_NORMALIZATION_RULES = {
     # Remove /* */ comments.
     re.compile(r"/\*.*?\*/", re.DOTALL): "",
@@ -130,7 +132,9 @@ _BASIC_NORMALIZATION_RULES = {
     re.compile(r"'[^']*'"): "?",
     # Replace sequences of IN/VALUES with a single placeholder.
     # The r" ?" makes it more robust to uneven spacing.
-    re.compile(r"\b(IN|VALUES)\s*\( ?\?(?:, ?\?)* ?\)", re.IGNORECASE): r"\1 (?)",
+    re.compile(
+        r"\b(IN|VALUES)\s*\( ?(?:%s|\$\d|\?)(?:, ?(?:%s|\$\d|\?))* ?\)", re.IGNORECASE
+    ): r"\1 (?)",
     # Normalize parenthesis spacing.
     re.compile(r"\( "): "(",
     re.compile(r" \)"): ")",
@@ -139,6 +143,9 @@ _BASIC_NORMALIZATION_RULES = {
     # e.g. "col1,col2" -> "col1, col2"
     re.compile(r"\b ,"): ",",
     re.compile(r"\b,\b"): ", ",
+    # MAKE SURE THAT THIS IS AFTER THE ABOVE REPLACEMENT
+    # Replace all versions of placeholders with generic ? placeholder.
+    PLACEHOLDER_BACKWARD_FINGERPRINT_NORMALIZATION: "?",
 }
 _TABLE_NAME_NORMALIZATION_RULES = {
     # Replace UUID-like strings with a placeholder (both - and _ variants).
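
These rules are applied in insertion order, which is why the comment insists the generic placeholder rule stays last. A self-contained approximation of the two placeholder-related rules from this diff:

import re

IN_VALUES_RULE = re.compile(
    r"\b(IN|VALUES)\s*\( ?(?:%s|\$\d|\?)(?:, ?(?:%s|\$\d|\?))* ?\)", re.IGNORECASE
)
GENERIC_PLACEHOLDER = re.compile(r"(%s|\$\d|\?)")

def normalize(query: str) -> str:
    # Collapse IN/VALUES placeholder lists first, then map every remaining
    # placeholder style (%s, $1, ?) to the generic "?" form.
    query = IN_VALUES_RULE.sub(r"\1 (?)", query)
    return GENERIC_PLACEHOLDER.sub("?", query)

print(normalize("SELECT * FROM t WHERE id IN (%s, %s) AND name = $1"))
# SELECT * FROM t WHERE id IN (?) AND name = ?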
@@ -262,6 +269,10 @@ def get_query_fingerprint_debug(
         if not fast:
             dialect = get_dialect(platform)
             expression_sql = generalize_query(expression, dialect=dialect)
+            # Normalize placeholders for consistent fingerprinting -> this only needs to be backward compatible with earlier sqlglot-generated generalized queries where the placeholders were always ?
+            expression_sql = PLACEHOLDER_BACKWARD_FINGERPRINT_NORMALIZATION.sub(
+                "?", expression_sql
+            )
         else:
             expression_sql = generalize_query_fast(expression, dialect=platform)
     except (ValueError, sqlglot.errors.SqlglotError) as e:
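
The backward-compatibility property that comment describes, reduced to its essence (the query strings are invented): dialect-specific placeholders collapse to the ? form that earlier versions always emitted, so fingerprints of previously ingested queries don't shift.

import re

PLACEHOLDER_BACKWARD_FINGERPRINT_NORMALIZATION = re.compile(r"(%s|\$\d|\?)")

old_generalized = "SELECT a FROM t WHERE b = ?"   # what earlier versions produced
new_generalized = "SELECT a FROM t WHERE b = $1"  # newer sqlglot may preserve $1

normalized = PLACEHOLDER_BACKWARD_FINGERPRINT_NORMALIZATION.sub("?", new_generalized)
assert normalized == old_generalized  # identical text -> identical fingerprint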